Exemplo n.º 1
0
def test_buffer_dam_add_or_update(tmpdir):
    """Check ``buffer_pool.add_or_update``: update, insert, eviction and slot reuse."""
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=6)
    all_docs = list(random_docs(8))
    dam.extend(all_docs[:5])
    pool = dam.buffer_pool

    # Updating a document that is already buffered must not grow the buffer.
    updated = all_docs[0]
    updated.content = 'new'
    pool.add_or_update(updated.id, updated)
    assert dam[0].content == updated.content
    assert len(pool.buffer) == 5

    # A document unknown to the buffer is added and takes the sixth slot.
    sixth = all_docs[5]
    pool.add_or_update(sixth.id, sixth)
    assert len(pool.buffer) == 6

    # The buffer is now full: adding another document evicts the least
    # recently used one, which is all_docs[1] since all_docs[0] was just used.
    seventh = all_docs[6]
    pool.add_or_update(seventh.id, seventh)
    assert seventh.id in pool
    assert all_docs[1].id not in pool

    # Deleting an entry frees its slot (number 4) ...
    del pool[all_docs[4].id]
    assert 4 in pool._empty

    # ... and the next added document is placed into that freed slot.
    eighth = all_docs[7]
    pool.add_or_update(eighth.id, eighth)
    assert pool.doc_map[eighth.id][0] == 4
Exemplo n.º 2
0
class MyIndexer(Executor):
    """Simple indexer that stores documents in a ``DocumentArrayMemmap``."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Documents are kept in the 'indexer' sub-folder of the workspace.
        self._docs = DocumentArrayMemmap(self.workspace + '/indexer')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Add the incoming documents to the stored index.

        :param docs: documents to be indexed
        :param kwargs: other keyword arguments
        """
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', **kwargs):
        """Append the single best match to each document in docs

        Matching uses the cosine metric against the stored documents.

        :param docs: documents that are searched
        :param kwargs: other keyword arguments
        """
        docs.match(
            self._docs,
            metric='cosine',
            normalization=(1, 0),
            limit=1,
        )
Exemplo n.º 3
0
def test_texts_getter_dam(tmpdir):
    """``dam.texts`` must have one entry per document and equal ``get_attributes('text')``."""
    dam = DocumentArrayMemmap(tmpdir)
    hello_docs = [Document(text='hello') for _ in range(100)]
    dam.extend(hello_docs)

    assert len(dam.texts) == 100
    via_property = dam.texts
    via_attributes = dam.get_attributes('text')
    assert via_property == via_attributes
Exemplo n.º 4
0
def test_embeddings_wrong_len(tmpdir):
    """Assigning embeddings whose first dimension differs from ``len(dam)`` raises ``ValueError``."""
    dam = DocumentArrayMemmap(tmpdir)
    empty_docs = [Document() for _ in range(100)]
    dam.extend(empty_docs)

    # Only 2 rows for 100 documents.
    mismatched = np.ones((2, 10, 10))
    with pytest.raises(ValueError, match='the number of rows in the'):
        dam.embeddings = mismatched
Exemplo n.º 5
0
def test_setter_wrong_len(tmpdir):
    """Assigning a tags list of the wrong length raises ``ValueError``."""
    dam = DocumentArrayMemmap(tmpdir)
    empty_docs = [Document() for _ in range(100)]
    dam.extend(empty_docs)

    # A single tag dict for 100 documents.
    too_few_tags = [{'1': 2}]
    with pytest.raises(ValueError, match='the number of tags in the'):
        dam.tags = too_few_tags
Exemplo n.º 6
0
def test_memmap_save_reload(tmpdir):
    """Check what two memmap handles on the same directory see before and after flush/reload.

    ``dam`` holds the documents in its buffer pool; ``dam1`` is a second
    handle opened on the same ``tmpdir``. The documents are modified after
    being added, and the assertions pin which content each handle reports
    before and after ``dam.flush()`` / ``dam1.reload()``.
    """
    docs = list(random_docs(100))
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=100)
    dam.extend(docs)

    # Second handle on the same directory, created before any modification.
    dam1 = DocumentArrayMemmap(tmpdir)

    # Modify the original Document objects after they were added to dam.
    for doc in docs:
        doc.content = 'new'

    for doc in dam:
        # from memory: iteration yields the modified content
        assert doc.content == 'new'
        # from disk: the stored copy still has the initial content
        assert dam._get_doc_by_key(doc.id).content == 'hello world'

    # dam1 from disk (empty memory buffer + dam not persisted)
    for doc in dam1:
        assert doc.content == 'hello world'

    # Persist dam's changes, then refresh dam1.
    dam.flush()
    dam1.reload()

    # dam from disk: the stored copy now carries the new content
    for doc in dam:
        assert dam._get_doc_by_key(doc.id).content == 'new'

    # dam1 up-to-date
    for doc in dam1:
        assert doc.content == 'new'
Exemplo n.º 7
0
def test_texts_wrong_len(tmpdir):
    """Assigning a texts list of the wrong length raises ``ValueError``."""
    dam = DocumentArrayMemmap(tmpdir)
    empty_docs = [Document() for _ in range(100)]
    dam.extend(empty_docs)

    # One text for 100 documents.
    too_few_texts = ['hello']
    with pytest.raises(ValueError):
        dam.texts = too_few_texts
Exemplo n.º 8
0
def test_blobs_wrong_len(tmpdir):
    """Assigning blobs whose first dimension differs from ``len(dam)`` raises ``ValueError``."""
    dam = DocumentArrayMemmap(tmpdir)
    empty_docs = [Document() for _ in range(100)]
    dam.extend(empty_docs)

    # Only 2 blobs for 100 documents.
    mismatched = np.ones((2, 10, 10))
    with pytest.raises(ValueError):
        dam.blobs = mismatched
Exemplo n.º 9
0
class MyIndexer(Executor):
    """
    Executor performing exact search with the cosine metric over a memmap index
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The index lives in the 'indexer' sub-folder of the workspace.
        index_path = self.workspace + '/indexer'
        self._docs = DocumentArrayMemmap(index_path)

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Add the incoming documents to the index

        :param docs: DocumentArray with the Documents to store
        :param kwargs: other keyword arguments
        """
        self._docs.extend(docs)

    @requests(on=['/search', '/eval'])
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
        """Attach the best matches from the index to each document in docs

        :param docs: documents to search with
        :param parameters: request parameters; ``top_k`` gives the number of matches
        :param kwargs: other keyword arguments
        """
        top_k = int(parameters['top_k'])
        docs.match(
            self._docs,
            metric='cosine',
            normalization=(1, 0),
            limit=top_k,
        )
Exemplo n.º 10
0
def test_match_handle_different_limit(get_two_docarray, limit, tmpdir):
    """``match`` against a memmap returns ``limit`` matches, or all when limit is None or -1."""
    queries, candidates = get_two_docarray
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(candidates)

    queries.match(dam, limit=limit)

    # None and -1 both mean "no limit": every candidate is expected as a match.
    if limit in (None, -1):
        expected_length = len(candidates)
    else:
        expected_length = limit
    assert len(queries[0].matches) == expected_length
Exemplo n.º 11
0
def test_scipy_dist(docarrays_for_embedding_distance_computation,
                    normalization, metric, tmpdir, only_id):
    """Matching against a ``DocumentArrayMemmap`` yields the same scores as against the in-memory array."""

    def _match_scores(doc_array):
        # Flatten the score values of all matches of all documents.
        return [m.scores[metric].value for d in doc_array for m in d.matches]

    D1, D2 = docarrays_for_embedding_distance_computation
    # Copies taken before matching, used for the memmap run.
    D1_copy = copy.deepcopy(D1)
    D2_copy = copy.deepcopy(D2)

    shared_kwargs = dict(
        metric=metric,
        limit=3,
        normalization=normalization,
        use_scipy=True,
    )

    # Reference run: in-memory array against in-memory array.
    D1.match(D2, **shared_kwargs)
    values_docarray = _match_scores(D1)

    # Same match, but the candidates are served from a memmap.
    D2memmap = DocumentArrayMemmap(tmpdir)
    D2memmap.extend(D2_copy)
    D1_copy.match(D2memmap, only_id=only_id, **shared_kwargs)
    values_docarraymemmap = _match_scores(D1_copy)

    np.testing.assert_equal(values_docarray, values_docarraymemmap)
Exemplo n.º 12
0
def test_persist(tmpdir):
    """Check that a second memmap on the same directory sees persisted data after ``reload``.

    ``dam`` writes documents (with a score and an evaluation attached);
    ``dam2`` is opened on the same ``tmpdir``. Deletions and ``clear`` on
    ``dam`` are only reflected in ``len(dam2)`` once ``dam2.reload()`` is called.
    """
    dam = DocumentArrayMemmap(tmpdir)
    docs = list(random_docs(100))
    # Attach a score and an evaluation so their round trip can be checked below.
    for doc in docs:
        doc.scores['score'] = 50
        doc.evaluations['eval'] = 100

    dam.extend(docs)

    # A fresh handle on the same directory sees all 100 documents.
    dam2 = DocumentArrayMemmap(tmpdir)
    assert len(dam2) == 100

    assert dam == dam2

    for d1, d2 in zip(dam, dam2):
        assert d1.proto == d2.proto

    # presumably random_docs assigns sequential string ids, so '1' exists -- verify against random_docs
    assert '1' in dam

    # Deleting through dam is not visible in dam2 until dam2 is reloaded.
    del dam['1']
    assert len(dam2) == 100
    dam2.reload()
    assert len(dam2) == 99
    # Scores and evaluations are still present on the documents dam2 yields.
    for doc2 in dam2:
        assert doc2.scores['score'].value == 50
        assert doc2.evaluations['eval'].value == 100

    # Same for clear(): dam2 keeps its old length until reloaded.
    dam.clear()
    assert len(dam2) == 99
    dam2.reload()
    assert len(dam2) == 0
Exemplo n.º 13
0
def test_traverse(tmpdir, mocker):
    """``traverse_flat(['c'])`` on a memmap yields at least one item, each with granularity 1."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(random_docs(100))

    # The mock records that the loop body ran at least once.
    visited = mocker.Mock()
    for chunk in dam.traverse_flat(['c']):
        assert chunk.granularity == 1
        visited()
    visited.assert_called()
Exemplo n.º 14
0
def test_buffer_dam_clear(tmpdir):
    """After ``buffer_pool.clear()`` none of the added documents remain in the buffer pool."""
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=5)
    added = list(random_docs(5))
    dam.extend(added)

    dam.buffer_pool.clear()

    assert all(doc.id not in dam.buffer_pool for doc in added)
Exemplo n.º 15
0
def test_blobs_setter_dam(tmpdir):
    """Blobs assigned through ``dam.blobs`` are readable in bulk and per document."""
    expected = np.random.random((100, 10, 10))
    dam = DocumentArrayMemmap(tmpdir)
    # One empty document per blob.
    dam.extend([Document() for _ in expected])

    dam.blobs = expected

    np.testing.assert_almost_equal(dam.blobs, expected)
    for blob, doc in zip(expected, dam):
        np.testing.assert_almost_equal(blob, doc.blob)
Exemplo n.º 16
0
def test_memmap_update_in_memory(tmpdir):
    """Changing documents after ``extend`` is visible when iterating the memmap."""
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=100)
    originals = list(random_docs(100))
    dam.extend(originals)

    # Mutate the original objects only; nothing is written back explicitly.
    for position, original in enumerate(originals):
        original.content = f'new content {position}'

    for position, stored in enumerate(dam):
        assert stored.content == f'new content {position}'
Exemplo n.º 17
0
def test_sample(tmpdir):
    """``sample(k)`` returns a ``DocumentArray`` of size k and rejects k larger than the array."""
    da = DocumentArrayMemmap(tmpdir)
    source_docs = list(random_docs(100))
    da.extend(source_docs)

    picked = da.sample(5)
    assert len(picked) == 5
    assert isinstance(picked, DocumentArray)

    # Asking for more documents than are stored must fail.
    with pytest.raises(ValueError):
        da.sample(101)
Exemplo n.º 18
0
def test_texts_setter_dam(tmpdir):
    """Texts assigned through ``dam.texts`` are readable in bulk and per document."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])

    texts = ['text'] * 100
    dam.texts = texts
    assert dam.texts == texts

    for expected, doc in zip(texts, dam):
        assert expected == doc.text
Exemplo n.º 19
0
def test_embeddings_setter_dam(tmpdir):
    """Embeddings assigned through ``dam.embeddings`` are readable in bulk and per document."""
    expected = np.random.random((100, 128))
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])

    dam.embeddings = expected
    np.testing.assert_almost_equal(dam.embeddings, expected)

    for row, doc in zip(expected, dam):
        np.testing.assert_almost_equal(row, doc.embedding)
Exemplo n.º 20
0
def test_tags_setter_dam(tmpdir):
    """Tags assigned through ``dam.tags`` are readable in bulk and per document."""
    dam = DocumentArrayMemmap(tmpdir)
    # A separate dict per document, built before the documents are added.
    tags = [{'a': 2, 'c': 'd'} for _ in range(100)]
    empty_docs = [Document() for _ in range(100)]
    dam.extend(empty_docs)

    dam.tags = tags
    assert dam.tags == tags

    for expected, doc in zip(tags, dam):
        assert expected == doc.tags
Exemplo n.º 21
0
def test_shuffle_with_seed(tmpdir):
    """Shuffling with the same seed gives equal results; a different seed gives a different one."""
    da = DocumentArrayMemmap(tmpdir)
    source_docs = list(random_docs(100))
    da.extend(source_docs)

    same_seed_a = da.shuffle(seed=1)
    same_seed_b = da.shuffle(seed=1)
    other_seed = da.shuffle(seed=2)

    # Every shuffle keeps all documents.
    sizes = {len(same_seed_a), len(same_seed_b), len(other_seed), len(da)}
    assert len(sizes) == 1

    assert same_seed_a == same_seed_b
    assert same_seed_a != other_seed
Exemplo n.º 22
0
def test_memmap_update_document(tmpdir):
    """Writing a modified document back with ``dam[idx] = doc`` updates the stored content."""
    dam = DocumentArrayMemmap(tmpdir)
    originals = list(random_docs(100))
    dam.extend(originals)

    # Change each document and store it again at its position.
    for position, original in enumerate(originals):
        original.content = f'new content {position}'
        dam[position] = original

    for position, stored in enumerate(dam):
        assert stored.content == f'new content {position}'
Exemplo n.º 23
0
def test_memmap_buffer_synched(tmpdir):
    """A document written by index is in the buffer pool and later edits to it are visible."""
    all_docs = list(random_docs(100))
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(all_docs[:50])

    # Overwrite the first 50 positions with the second half of the documents.
    replacements = all_docs[50:]
    for position, replacement in enumerate(replacements):
        dam[position] = replacement
        # The buffer pool entry and the indexed entry refer to the same id.
        assert dam._buffer_pool[replacement.id].id == dam[position].id
        # A change made after the assignment is seen when reading by id.
        replacement.content = 'new'
        assert dam[replacement.id].content == 'new'
Exemplo n.º 24
0
def test_prune_save_space(tmpdir):
    """After deleting a document, ``prune`` shrinks both the header and the body file."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(random_docs(100))

    header_path = os.path.join(tmpdir, 'header.bin')
    body_path = os.path.join(tmpdir, 'body.bin')

    # File sizes before the deletion.
    header_before = os.path.getsize(header_path)
    body_before = os.path.getsize(body_path)

    del dam['2']
    dam.prune()

    # File sizes after pruning.
    header_after = os.path.getsize(header_path)
    body_after = os.path.getsize(body_path)

    assert body_after < body_before
    assert header_after < header_before
Exemplo n.º 25
0
def test_shuffle(tmpdir):
    """``shuffle`` returns a ``DocumentArray`` with the same documents in a different order."""
    da = DocumentArrayMemmap(tmpdir)
    source_docs = list(random_docs(100))
    da.extend(source_docs)

    shuffled = da.shuffle()
    assert len(shuffled) == len(da)
    assert isinstance(shuffled, DocumentArray)

    original_order = [doc.id for doc in da]
    shuffled_order = [doc.id for doc in shuffled]
    # The order changed, but the set of ids did not.
    assert original_order != shuffled_order
    assert sorted(original_order) == sorted(shuffled_order)
Exemplo n.º 26
0
def test_convert_dm_to_dam(tmpdir, mocker):
    """Documents copied into a memmap survive clearing the source ``DocumentArray``."""
    dam = DocumentArrayMemmap(tmpdir)
    source = DocumentArray(random_docs(100))
    dam.extend(source)
    source.clear()

    # The mock records that iteration yielded at least one document.
    visited = mocker.Mock()
    for stored in dam:
        assert stored
        visited()
    visited.assert_called()

    assert len(source) == 0
    assert len(dam) == 100
Exemplo n.º 27
0
def test_buffer_dam_delete(tmpdir):
    """Deleting an id missing from the buffer raises ``KeyError``; ``delete_if_exists`` does not."""
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=5)
    # Six documents for a pool of size five.
    six_docs = list(random_docs(6))
    dam.extend(six_docs)

    evicted_id = six_docs[0].id

    # The first document is expected to have been pushed out of the buffer.
    with pytest.raises(KeyError):
        del dam.buffer_pool[evicted_id]

    # The tolerant variant must not raise for the same id.
    dam.buffer_pool.delete_if_exists(evicted_id)
Exemplo n.º 28
0
def test_memmap_append_extend(tmpdir):
    """``append`` and ``extend`` both add documents in order and keep their protos intact."""
    dam = DocumentArrayMemmap(tmpdir)
    all_docs = list(random_docs(100))
    assert len(dam) == 0

    # First 40 documents one at a time.
    appended = all_docs[:40]
    for doc in appended:
        dam.append(doc)
    assert len(dam) == 40
    for expected, stored in zip(appended, dam):
        assert expected.proto == stored.proto

    # Remaining 60 documents in one call.
    dam.extend(all_docs[40:])
    assert len(dam) == 100
    for expected, stored in zip(all_docs, dam):
        assert expected.proto == stored.proto
Exemplo n.º 29
0
def test_buffer_dam_getitem(tmpdir):
    """The buffer pool is indexable by document id only; slices and ints raise ``TypeError``."""
    dam = DocumentArrayMemmap(tmpdir)
    added = list(random_docs(10))
    dam.extend(added)

    # Looking up by id returns the matching document.
    for doc in added:
        assert dam.buffer_pool[doc.id].content_hash == doc.content_hash
        assert dam.buffer_pool[doc.id].id == doc.id

    # Slice access is not supported.
    with pytest.raises(TypeError):
        dam.buffer_pool[1:5]

    # Integer access is not supported either.
    with pytest.raises(TypeError):
        dam.buffer_pool[0]
Exemplo n.º 30
0
def test_buffers_getter_setter(tmpdir):
    """``dam.buffers`` reads and writes per-document bytes and validates length and type."""
    dam = DocumentArrayMemmap(tmpdir)
    initial_docs = [
        Document(buffer=b'aa'),
        Document(buffer=b'bb'),
        Document(buffer=b'cc'),
    ]
    dam.extend(initial_docs)
    assert dam.buffers == [b'aa', b'bb', b'cc']

    # Overwrite all buffers in reversed order.
    dam.buffers = [b'cc', b'bb', b'aa']
    assert dam.buffers == [b'cc', b'bb', b'aa']

    # Four buffers for three documents.
    with pytest.raises(ValueError):
        dam.buffers = [b'cc', b'bb', b'aa', b'dd']

    # str values instead of bytes.
    with pytest.raises(TypeError):
        dam.buffers = ['aa', 'bb', 'cc']