def test_dbms_keyvalue(tmpdir, test_metas):
    """Round-trip add / update / delete on the KeyValue DBMS indexer."""
    original_docs = list(get_documents(chunks=False, nr=10, same_content=True))
    ids, embeddings, serialized = zip(
        *[
            (d.id, d.embedding, _doc_without_embedding(d).SerializeToString())
            for d in original_docs
        ]
    )

    # index the initial batch and remember where it was persisted
    with KeyValueDBMSIndexer(index_filename='dbms', metas=test_metas) as indexer:
        indexer.add(ids, embeddings, serialized)
        assert indexer.size == len(original_docs)
        save_path = indexer.save_abspath

    updated_docs = list(get_documents(chunks=False, nr=10, same_content=False))
    ids, embeddings, serialized = zip(
        *[
            (d.id, d.embedding, _doc_without_embedding(d).SerializeToString())
            for d in updated_docs
        ]
    )

    # updating existing ids must not change the index size
    with BaseDBMSIndexer.load(save_path) as indexer:
        indexer.update(ids, embeddings, serialized)
        assert indexer.size == len(original_docs)

    # deleting every id must empty the index
    with BaseDBMSIndexer.load(save_path) as indexer:
        indexer.delete([d.id for d in original_docs])
        assert indexer.size == 0
def assert_dump_data(dump_path, docs, shards, pea_id):
    """Check the dump at ``dump_path`` for one shard, then reload it.

    Verifies that the vectors and metas dumped for shard ``pea_id`` match
    the expected slice of ``docs``, and that query indexers built from the
    dump report the expected size.
    """
    ids_dump, vectors_dump = import_vectors(
        dump_path,
        str(pea_id),
    )

    # docs are split evenly across shards; the last shard absorbs the
    # remainder of the integer division
    shard_size = len(docs) // shards
    start = pea_id * shard_size
    end = start + shard_size
    if pea_id == shards - 1:
        end += len(docs) % shards
    docs_expected = docs[start:end]
    print(f'### pea {pea_id} has {len(docs_expected)} docs')

    ids_dump = list(ids_dump)
    vectors_dump = list(vectors_dump)
    np.testing.assert_equal(ids_dump, [d.id for d in docs_expected])
    np.testing.assert_allclose(vectors_dump,
                               [d.embedding for d in docs_expected])

    _, metas_dump = import_metas(
        dump_path,
        str(pea_id),
    )
    metas_dump = list(metas_dump)
    np.testing.assert_equal(
        metas_dump,
        [_doc_without_embedding(d).SerializeToString() for d in docs_expected],
    )

    # assert with Indexers
    # TODO currently metas are only passed to the parent Compound, not to the inner components
    with TimeContext(f'### reloading {len(docs_expected)}'):
        # noinspection PyTypeChecker
        cp: CompoundQueryExecutor = BaseQueryIndexer.load_config(
            'indexer_query.yml',
            pea_id=pea_id,
            metas={
                'workspace': os.path.join(dump_path, 'new_ws'),
                'dump_path': dump_path,
            },
        )
        for c in cp.components:
            assert c.size == len(docs_expected)

    # test with the inner indexers separate from the Compound
    for i, indexer_file in enumerate(
            ['basic/query_np.yml', 'basic/query_kv.yml']):
        indexer = BaseQueryIndexer.load_config(
            indexer_file,
            pea_id=pea_id,
            metas={
                'workspace': os.path.realpath(
                    os.path.join(dump_path, f'new_ws-{i}')),
                'dump_path': dump_path,
            },
        )
        assert indexer.size == len(docs_expected)
def _validate_results_nonempty(resp):
    """Assert every query doc in ``resp`` got valid, well-formed matches.

    Relies on ``nr_search``, ``nr_docs`` and ``emb_size`` from the
    enclosing scope (this is a closure used as a response callback).
    """
    assert len(resp.docs) == nr_search
    for d in resp.docs:
        if nr_docs < 10:
            assert len(d.matches) == nr_docs
        else:
            # TODO does it return all of them no matter how many?
            assert len(d.matches) > 0
        for m in d.matches:
            assert m.embedding.shape[0] == emb_size
            assert _doc_without_embedding(m).SerializeToString() is not None
            assert 'hello world' in m.text
            # fixed: was f'tag data' — an f-string with no placeholders
            # (extraneous f-prefix, ruff F541); the runtime value is identical
            assert 'tag data' in m.tags['tag_field']