def memmap_with_text_and_embedding(tmpdir):
    """Yield a ``DocumentArrayMemmap`` holding 100 text+embedding documents.

    Each document carries the text ``'random text <idx>'`` and a random
    512-dimensional embedding. The memmap is cleared once the consumer
    resumes the generator after the ``yield``.

    NOTE(review): the generator shape suggests this is meant to be a pytest
    yield-fixture; no decorator is visible in this chunk -- confirm.

    :param tmpdir: directory backing the memmap storage
    """
    memmap = DocumentArrayMemmap(tmpdir)
    for idx in range(100):
        memmap.append(
            Document(text=f'random text {idx}', embedding=np.random.rand(512))
        )

    yield memmap

    # teardown: drop everything that was stored for this consumer
    memmap.clear()
def test_issue_3527_delete_and_match(tmpdir):
    """Regression test for issue 3527: match against a memmap after a delete.

    A document is appended and deleted, then a second document with the same
    embedding is appended. Matching a query array against the memmap must
    return the surviving document ``'c'`` as the first match.
    """
    stored_embedding = np.array([1, 2, 3], dtype=np.float32)
    query_embedding = np.array([5, 6, 7], dtype=np.float32)

    memmap = DocumentArrayMemmap(tmpdir)
    memmap.append(Document(id='a', embedding=stored_embedding))
    del memmap['a']
    memmap.append(
        Document(id='c', embedding=np.array([1, 2, 3], dtype=np.float32))
    )

    queries = DocumentArray([Document(embedding=query_embedding)])
    queries.match(memmap)

    top_match = queries[0].matches[0]
    assert top_match.id == 'c'
def test_batch_iterator_dam(tmpdir):
    """Check that ``batch_iterator`` walks a memmap in insertion order.

    Four documents with ids 0..3 are stored; iterating in batches of two must
    visit them in exactly that order and visit all of them.
    """
    memmap = DocumentArrayMemmap(tmpdir)
    for doc_id in range(4):
        memmap.append(Document(id=doc_id))

    remaining_ids = iter(range(4))
    for batch in batch_iterator(memmap, 2):
        for doc in batch:
            assert int(doc.id) == next(remaining_ids)

    # every expected id must have been consumed by the loop above
    with pytest.raises(StopIteration):
        next(remaining_ids)
def test_memmap_append_extend(tmpdir):
    """Check that ``append`` and ``extend`` both persist documents in order.

    The first 40 of 100 random documents are appended one at a time and the
    remaining 60 are added through ``extend``; after each phase the length
    and the stored protos are compared against the source documents.
    """

    def _assert_same_protos(expected_docs, stored_docs):
        # pairwise comparison in iteration order
        for expected, stored in zip(expected_docs, stored_docs):
            assert expected.proto == stored.proto

    memmap = DocumentArrayMemmap(tmpdir)
    docs = list(random_docs(100))
    assert len(memmap) == 0

    head = docs[:40]
    for doc in head:
        memmap.append(doc)
    assert len(memmap) == 40
    _assert_same_protos(head, memmap)

    memmap.extend(docs[40:])
    assert len(memmap) == 100
    _assert_same_protos(docs, memmap)
def test_memmap_mutate(tmpdir):
    """Check append, reopening from the same directory, and ``clear``.

    Documents appended through one memmap must be readable by index, must be
    visible through a second memmap opened on the same directory, and must be
    gone from the first memmap after ``clear``.
    """
    writer = DocumentArrayMemmap(tmpdir)

    hello = Document(text='hello')
    writer.append(hello)
    assert writer[0] == hello

    world = Document(text='world')
    writer.append(world)
    assert writer[1] == world

    # a second handle on the same directory sees both documents
    reopened = DocumentArrayMemmap(tmpdir)
    assert len(reopened) == 2
    assert reopened[0] == hello
    assert reopened[1] == world

    writer.clear()
    assert len(writer) == 0
def test_buffer_dam_lru(tmpdir):
    """Check the buffer pool evicts the least recently used document.

    With a pool of size 5 filled by five documents, reading index 0 makes
    that document the most recent entry, leaving the second document as the
    oldest. Appending a sixth document must then evict the second document
    while keeping the first and the new one in the pool.
    """
    memmap = DocumentArrayMemmap(tmpdir, buffer_pool_size=5)
    docs = list(random_docs(6))
    memmap.extend(docs[:5])

    # reading the first doc makes it the most recently used entry,
    # which leaves the second doc as the least recently used one
    doc1 = memmap[0]
    most_recent_id = next(reversed(memmap.buffer_pool.doc_map.keys()))
    assert most_recent_id == doc1.id
    least_recent_id = next(iter(memmap.buffer_pool.doc_map.keys()))
    assert least_recent_id == docs[1].id

    doc2 = docs[1]
    assert doc1.id == docs[0].id

    memmap.append(docs[5])

    # doc1 was not the LRU entry, doc2 was: only doc2 gets evicted
    pool = memmap.buffer_pool
    assert doc1.id in pool
    assert doc2.id not in pool
    assert docs[5].id in pool
def memmap_for_split(tmpdir):
    """Build a ``DocumentArrayMemmap`` with five category-tagged documents.

    The documents are appended in the order ``c, c, b, a, a``, each carrying
    its category under the ``'category'`` tag.

    :param tmpdir: directory backing the memmap storage
    :return: the populated memmap
    """
    da = DocumentArrayMemmap(tmpdir)
    for category in ('c', 'c', 'b', 'a', 'a'):
        da.append(Document(tags={'category': category}))
    return da
def test_memmap_physical_size(tmpdir):
    """Check ``physical_size`` is zero when empty and positive after append."""
    memmap = DocumentArrayMemmap(tmpdir)

    size_when_empty = memmap.physical_size
    assert size_when_empty == 0

    memmap.append(Document())
    assert memmap.physical_size > 0