def check_indexers_size(chunks, nr_docs, field, tmp_path, same_content, shards, post_op):
    cache_indexer_path = tmp_path / 'cache.bin'
    cache_full_size = 0
    with BaseIndexer.load(cache_indexer_path) as cache:
        assert isinstance(cache, DocIDCache)
        cache_full_size = cache.size
        print(f'cache size {cache.size}')

    for indexer_fname in [KV_IDX_FILENAME, VEC_IDX_FILENAME]:
        indexers_full_size = 0
        for i in range(shards):
            from jina.executors.compound import CompoundExecutor

            compound_name = 'inc_docindexer' if KV_IDX_FILENAME in indexer_fname else 'inc_vecindexer'
            workspace_folder = CompoundExecutor.get_component_workspace_from_compound_workspace(
                tmp_path, compound_name, i + 1 if shards > 1 else 0)
            # note: str.rstrip strips a trailing *character set*, not the '.bin'
            # suffix; it happens to produce the right name for these filenames
            indexer_path = os.path.join(
                BaseIndexer.get_shard_workspace(
                    workspace_folder=workspace_folder,
                    workspace_name=indexer_fname.rstrip('.bin'),
                    pea_id=i + 1 if shards > 1 else 0),
                f'{indexer_fname}')

            # in the configuration of content-hash / same_content=True
            # there aren't enough docs to satisfy the batch size, so only 1 shard will have it
            if os.path.exists(indexer_path):
                with BaseIndexer.load(indexer_path) as indexer:
                    if indexer_fname == KV_IDX_FILENAME:
                        assert isinstance(indexer, BinaryPbIndexer)
                    else:
                        assert isinstance(indexer, NumpyIndexer)
                    indexers_full_size += indexer.size

        if post_op == 'delete':
            assert indexers_full_size == 0
            assert cache_full_size == 0
        else:
            if field == 'content_hash' and same_content:
                if chunks > 0:
                    # one content from Doc, one from chunk
                    expected = 2
                    assert indexers_full_size == expected
                    assert cache_full_size == 2
                else:
                    assert indexers_full_size == 1
                    assert cache_full_size == 1
            else:
                # e.g. nr_docs=10, chunks=2: 10 docs + 20 chunks = 30,
                # doubled to 60 when a second indexing pass ('index2') ran
                nr_expected = ((nr_docs + chunks * nr_docs) * 2
                               if post_op == 'index2'
                               else nr_docs + chunks * nr_docs)
                assert indexers_full_size == nr_expected
                assert cache_full_size == nr_expected
def test_annoy_indexer_known_update_delete(metas):
    vectors = np.array([[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]])
    keys = np.array([4, 5, 6, 7]).reshape(-1, 1)
    with AnnoyIndexer(index_filename='annoy.test.gz', metas=metas) as indexer:
        indexer.add(keys, vectors)
        indexer.save()
        assert os.path.exists(indexer.index_abspath)
        save_abspath = indexer.save_abspath

    queries = np.array([[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]])
    with BaseIndexer.load(save_abspath) as indexer:
        assert isinstance(indexer, AnnoyIndexer)
        idx, dist = indexer.query(queries, top_k=2)
        np.testing.assert_equal(idx, np.array([[4, 5], [5, 4], [6, 5], [7, 6]]))
        assert idx.shape == dist.shape
        assert idx.shape == (4, 2)
        np.testing.assert_equal(indexer.query_by_id([7, 4]), vectors[[3, 0]])

    # update
    with BaseIndexer.load(save_abspath) as indexer:
        indexer.update([4], np.array([[200, 200, 200]]))
        indexer.save()
        assert indexer.size == 4

    with BaseIndexer.load(save_abspath) as indexer:
        assert isinstance(indexer, AnnoyIndexer)
        idx, dist = indexer.query(queries, top_k=3)
        np.testing.assert_equal(
            idx, np.array([[5, 6, 4], [5, 6, 4], [6, 5, 4], [7, 4, 6]]))
        assert idx.shape == dist.shape
        assert idx.shape == (4, 3)

    # delete
    with BaseIndexer.load(save_abspath) as indexer:
        indexer.delete([4])
        indexer.save()
        assert indexer.size == 3

    with BaseIndexer.load(save_abspath) as indexer:
        assert isinstance(indexer, AnnoyIndexer)
        idx, dist = indexer.query(queries, top_k=2)
        np.testing.assert_equal(idx, np.array([[5, 6], [5, 6], [6, 5], [7, 6]]))
        assert idx.shape == dist.shape
        assert idx.shape == (4, 2)
def test_scann_indexer_known(metas):
    vectors = np.array(
        [[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]],
        dtype=np.float32)
    keys = np.array(['4', '5', '6', '7']).reshape(-1, 1)
    with ScannIndexer(distance_measure='squared_l2', index_filename='scann.test.gz',
                      metas=metas) as indexer:
        indexer.add(keys, vectors)
        indexer.save()
        assert os.path.exists(indexer.index_abspath)
        save_abspath = indexer.save_abspath

    queries = np.array(
        [[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]],
        dtype=np.float32)
    with BaseIndexer.load(save_abspath) as indexer:
        assert isinstance(indexer, ScannIndexer)
        idx, dist = indexer.query(queries, top_k=2)
        np.testing.assert_equal(
            idx, np.array([['4', '5'], ['5', '4'], ['6', '5'], ['7', '6']]))
        assert idx.shape == dist.shape
        assert idx.shape == (4, 2)
        np.testing.assert_equal(indexer.query_by_key(['7', '4']), vectors[[3, 0]])
def test_numpy_indexer_known(batch_size, compress_level, test_metas):
    vectors = np.array([[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]])
    keys = np.array(['4', '5', '6', '7'], dtype=(np.str_, 16))
    with NumpyIndexer(metric='euclidean', index_filename='np.test.gz',
                      compress_level=compress_level, metas=test_metas) as indexer:
        indexer.batch_size = batch_size
        indexer.add(keys, vectors)
        indexer.save()
        assert os.path.exists(indexer.index_abspath)
        save_abspath = indexer.save_abspath

    queries = np.array([[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]])
    with BaseIndexer.load(save_abspath) as indexer:
        assert isinstance(indexer, NumpyIndexer)
        if compress_level == 0:
            assert isinstance(indexer.query_handler, np.memmap)
        idx, dist = indexer.query(queries, top_k=2)
        np.testing.assert_equal(
            idx, np.array([['4', '5'], ['5', '4'], ['6', '5'], ['7', '6']]))
        assert idx.shape == dist.shape
        assert idx.shape == (4, 2)
        np.testing.assert_equal(indexer.query_by_key(['7', '4']), vectors[[3, 0]])
def test_faiss_indexer_known(metas):
    vectors = np.array([[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]],
                       dtype=np.float32)
    keys = np.array([4, 5, 6, 7]).reshape(-1, 1)

    train_filepath = os.path.join(os.environ['TEST_WORKSPACE'], 'train.tgz')
    train_data = vectors
    with gzip.open(train_filepath, 'wb', compresslevel=1) as f:
        f.write(train_data.tobytes())

    with FaissIndexer(index_filename='faiss.test.gz', index_key='Flat',
                      train_filepath=train_filepath, metas=metas) as indexer:
        indexer.add(keys, vectors)
        indexer.save()
        assert os.path.exists(indexer.index_abspath)
        save_abspath = indexer.save_abspath

    queries = np.array([[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]],
                       dtype=np.float32)
    with BaseIndexer.load(save_abspath) as indexer:
        assert isinstance(indexer, FaissIndexer)
        idx, dist = indexer.query(queries, top_k=2)
        np.testing.assert_equal(idx, np.array([[4, 5], [5, 4], [6, 5], [7, 6]]))
        assert idx.shape == dist.shape
        assert idx.shape == (4, 2)
        np.testing.assert_equal(indexer.query_by_id([7, 4]), vectors[[3, 0]])
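# Side note (illustrative, not part of the original suite): the train file written
# in test_faiss_indexer_known is nothing more than the raw float32 bytes of the
# vectors, gzip-compressed. A minimal sketch of reading such a file back, assuming
# the writer above; the helper name and the `num_dim` parameter are hypothetical:
def _read_faiss_train_file_sketch(train_filepath, num_dim=3):
    import gzip
    import numpy as np
    with gzip.open(train_filepath, 'rb') as f:
        # inverse of `f.write(train_data.tobytes())`: parse the raw bytes and
        # restore the (n, num_dim) shape, which must match the writer's vectors
        return np.frombuffer(f.read(), dtype=np.float32).reshape(-1, num_dim)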
def test_ngt_indexer_known_big(metas):
    """Let's try to have some real test. We will have an index with 10k vectors of
    random values between 5 and 10. We will tweak some specific vectors that we
    expect to be retrieved at query time: the vectors at indices
    [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000], which will also be
    the query vectors. The keys are then assigned shifted by 10000 to test the
    proper usage of `_int2ext_id` and `ext2int_id`.
    """
    vectors = np.random.uniform(low=5.0, high=10.0, size=(10000, 1024)).astype('float32')
    queries = np.empty((10, 1024))
    for idx in range(0, 10000, 1000):
        array = idx * np.ones((1, 1024))
        queries[int(idx / 1000)] = array
        vectors[idx] = array

    keys = np.arange(10000, 20000).reshape(-1, 1).astype(str)

    with NGTIndexer(index_filename='ngt.test.gz', num_threads=4, metas=metas) as indexer:
        indexer.add(keys, vectors)
        indexer.save()
        assert os.path.exists(indexer.index_abspath)
        save_abspath = indexer.save_abspath

    with BaseIndexer.load(save_abspath) as indexer:
        assert isinstance(indexer, NGTIndexer)
        idx, dist = indexer.query(queries, top_k=1)
        np.testing.assert_equal(
            idx,
            np.array([[10000], [11000], [12000], [13000], [14000], [15000],
                      [16000], [17000], [18000], [19000]]).astype(str))
        assert idx.shape == dist.shape
        assert idx.shape == (10, 1)
        np.testing.assert_equal(indexer.query_by_key(['10000', '15000']), vectors[[0, 5000]])
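# Conceptual sketch (hypothetical, not NGTIndexer's actual implementation): the
# test above shifts external keys by 10000 so that confusing internal row numbers
# (0..9999) with external keys (10000..19999) fails loudly. An indexer typically
# keeps two mappings along these lines:
def _id_mapping_sketch():
    import numpy as np
    ext_keys = np.arange(10000, 20000).astype(str)
    int2ext = {i: k for i, k in enumerate(ext_keys)}  # internal row -> external key
    ext2int = {k: i for i, k in int2ext.items()}      # external key -> internal row
    # the planted vector at internal row 0 must come back under key '10000'
    assert int2ext[0] == '10000' and ext2int['15000'] == 5000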
def test_scannindexer():
    # the train file is written here but never passed to ScannIndexer below
    train_filepath = os.path.join(cur_dir, 'train.tgz')
    train_data = np.array(np.random.random([1024, 10]), dtype=np.float32)
    with gzip.open(train_filepath, 'wb', compresslevel=1) as f:
        f.write(train_data.tobytes())

    with ScannIndexer(index_filename='scann.test.gz') as indexer:
        indexer.add(vec_idx, vec)
        indexer.save()
        assert os.path.exists(indexer.index_abspath)
        index_abspath = indexer.index_abspath
        save_abspath = indexer.save_abspath

    with BaseIndexer.load(save_abspath) as indexer:
        idx, dist = indexer.query(query, top_k=4)
        print(idx, dist)
        global retr_idx
        if retr_idx is None:
            retr_idx = idx
        else:
            np.testing.assert_almost_equal(retr_idx, idx)
        assert idx.shape == dist.shape
        assert idx.shape == (10, 4)

    rm_files([index_abspath, save_abspath])
def test_indexer_zeros(metric, dimension, test_metas):
    import math

    query_vec = np.array(np.zeros([1, dimension]), dtype=np.float32)
    add_vec_idx = np.array(np.random.randint(0, high=num_data, size=[num_data]),
                           dtype=(np.str_, 16))
    add_vec = np.random.random([num_data, dimension])
    with NumpyIndexer(metric=metric, index_filename='np.test.gz',
                      metas=test_metas) as indexer:
        indexer.add(add_vec_idx, add_vec)
        indexer.save()
        assert os.path.exists(indexer.index_abspath)
        save_abspath = indexer.save_abspath

    with BaseIndexer.load(save_abspath) as indexer:
        assert isinstance(indexer, NumpyIndexer)
        assert isinstance(indexer.query_handler, np.memmap)
        idx, dist = indexer.query(query_vec, top_k=4)
        assert idx.shape == dist.shape
        assert idx.shape == (1, 4)

    if metric == 'cosine':
        # cosine distance to an all-zero query is undefined (0/0), hence NaN
        assert all(math.isnan(x) for x in dist[0])
    else:
        assert not any(math.isnan(x) for x in dist[0])
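# Worked mini-example (illustrative only) of why the zero-query assertions above
# hold: cosine distance is 1 - (a @ b) / (||a|| * ||b||), so an all-zero query
# makes the denominator zero and the result NaN, while euclidean distance stays
# well defined:
def _zero_query_distance_sketch():
    import numpy as np
    a = np.zeros(3)
    b = np.array([1.0, 2.0, 3.0])
    with np.errstate(invalid='ignore'):  # silence the 0/0 warning
        cosine = 1 - (a @ b) / (np.linalg.norm(a) * np.linalg.norm(b))
    euclidean = np.linalg.norm(a - b)
    assert np.isnan(cosine) and not np.isnan(euclidean)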
def validate_index_size(num_indexed_docs):
    path = Path(os.environ['JINA_TOPK_DIR'])
    index_files = list(path.glob('*.bin'))
    assert len(index_files) > 0
    for index_file in index_files:
        index = BaseIndexer.load(str(index_file))
        assert index.size == num_indexed_docs
def test_indexer_train_from_index_different_compression_levels(metas, compression_level):
    np.random.seed(500)
    num_data = 500
    num_dim = 64
    num_query = 10
    query = np.array(np.random.random([num_query, num_dim]), dtype=np.float32)
    vec_idx = np.random.randint(0, high=num_data, size=[num_data])
    vec = np.random.random([num_data, num_dim])

    train_filepath = os.path.join(metas['workspace'], 'faiss.test.gz')

    with FaissIndexer(index_filename='faiss.test.gz', index_key='IVF10,PQ4',
                      train_filepath=train_filepath,
                      compression_level=compression_level, metas=metas) as indexer:
        indexer.add(vec_idx, vec)
        indexer.save()
        assert os.path.exists(indexer.index_abspath)
        save_abspath = indexer.save_abspath

    with BaseIndexer.load(save_abspath) as indexer:
        assert isinstance(indexer, FaissIndexer)
        idx, dist = indexer.query(query, top_k=4)
        assert idx.shape == dist.shape
        assert idx.shape == (num_query, 4)
def test_binarypb_update_twice(test_metas):
    """two updates in a row do work"""
    with BinaryPbIndexer(metas=test_metas) as idxer:
        idxer.add(['1', '2', '3'], [b'oldvalue', b'same', b'random'])
        idxer.save()
        assert idxer.size == 3
        save_abspath = idxer.save_abspath

    with BaseIndexer.load(save_abspath) as idxer:
        idxer.update(['1'], [b'newvalue'])
        idxer.update(['2'], [b'othernewvalue'])
        idxer.save()

    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query('1') == b'newvalue'
        assert idxer.query('2') == b'othernewvalue'
def run_test(indexer):
    def create_document(doc_id, text, weight, length):
        d = jina_pb2.Document()
        d.id = doc_id
        d.buffer = text.encode('utf8')
        d.weight = weight
        d.length = length
        return d

    with indexer as idx:
        data = {
            'd1': MessageToJson(create_document(1, 'cat', 0.1, 3)),
            'd2': MessageToJson(create_document(2, 'dog', 0.2, 3)),
            'd3': MessageToJson(create_document(3, 'bird', 0.3, 3)),
        }
        idx.add(data)
        idx.touch()
        idx.save()
        save_abspath = idx.save_abspath
        index_abspath = idx.index_abspath
    assert os.path.exists(index_abspath)
    assert os.path.exists(save_abspath)

    with BaseIndexer.load(save_abspath) as searcher:
        doc = searcher.query('d2')
        assert doc.id == 2
        assert doc.length == 3

    rm_files([save_abspath, index_abspath])
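# Illustrative sketch (not part of the original suite): the values stored above are
# JSON strings produced by MessageToJson; its counterpart,
# google.protobuf.json_format.Parse, turns such a string back into a Document,
# which is presumably what the searcher does before returning `doc`. The helper
# name is hypothetical:
def _json_roundtrip_sketch(json_str):
    from google.protobuf.json_format import Parse
    # fills a fresh Document message from the serialized JSON and returns it
    return Parse(json_str, jina_pb2.Document())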
def test_sptag_indexer_known(metas):
    vectors = np.array(
        [[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]],
        dtype=np.float32)
    keys = np.array([4, 5, 6, 7]).reshape(-1, 1).astype(str)
    with SptagIndexer(dist_calc_method='L2', index_filename='sptag.test.gz',
                      metas=metas) as indexer:
        indexer.add(keys, vectors)
        indexer.save()
        assert os.path.exists(indexer.index_abspath)
        save_abspath = indexer.save_abspath

    queries = np.array(
        [[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]],
        dtype=np.float32)
    with BaseIndexer.load(save_abspath) as indexer:
        assert isinstance(indexer, SptagIndexer)
        idx, distances = indexer.query(queries, top_k=2)
        np.testing.assert_equal(
            idx, np.array([[4, 5], [5, 4], [6, 5], [7, 6]]).astype(str))
        for distance in distances:
            assert distance[0] < distance[1]
        assert idx.shape == distances.shape
        assert idx.shape == (4, 2)
        np.testing.assert_equal(indexer.query_by_key(['7', '4']), vectors[[3, 0]])
def validate_index_size(num_indexed_docs, expected_indices):
    path = Path(os.environ['JINA_CORRUPTED_DOCS_TEST_DIR'])
    index_files = list(path.glob('*.bin'))
    assert len(index_files) == expected_indices
    for index_file in index_files:
        index = BaseIndexer.load(str(index_file))
        assert index.size == num_indexed_docs
def test_numpy_indexer_known(batch_size, compress_level, test_metas):
    vectors = np.array([[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]])
    keys = np.array([4, 5, 6, 7]).reshape(-1, 1)
    with NumpyIndexer(metric='euclidean', index_filename='np.test.gz',
                      compress_level=compress_level, metas=test_metas) as indexer:
        indexer.batch_size = batch_size
        indexer.add(keys, vectors)
        indexer.save()
        assert Path(indexer.index_abspath).exists()
        save_abspath = indexer.save_abspath

    queries = np.array([[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]])
    with BaseIndexer.load(save_abspath) as indexer:
        assert isinstance(indexer, NumpyIndexer)
        if compress_level == 0:
            assert isinstance(indexer.raw_ndarray, np.memmap)
        idx, dist = indexer.query(queries, top_k=2)
        np.testing.assert_equal(idx, np.array([[4, 5], [5, 4], [6, 5], [7, 6]]))
        assert idx.shape == dist.shape
        assert idx.shape == (4, 2)
        np.testing.assert_equal(indexer.query_by_id([7, 4]), vectors[[3, 0]])
def test_nmslib_indexer_known(metas):
    vectors = np.array(
        [[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]],
        dtype=np.float32)
    keys = np.array([4, 5, 6, 7]).reshape(-1, 1).astype(str)
    with NmsLibIndexer(space='l2', index_filename='nmslib.test.gz',
                       metas=metas) as indexer:
        indexer.add(keys, vectors)
        indexer.save()
        assert os.path.exists(indexer.index_abspath)
        save_abspath = indexer.save_abspath

    queries = np.array(
        [[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]],
        dtype=np.float32)
    with BaseIndexer.load(save_abspath) as indexer:
        assert isinstance(indexer, NmsLibIndexer)
        idx, dist = indexer.query(queries, top_k=2)
        np.testing.assert_equal(
            idx, np.array([[4, 5], [5, 4], [6, 5], [7, 6]]).astype(str))
        assert idx.shape == dist.shape
        assert idx.shape == (4, 2)
        np.testing.assert_equal(indexer.query_by_key(['7', '4']), vectors[[3, 0]])
def test_faiss_indexer(self):
    train_filepath = os.path.join(cur_dir, 'train.tgz')
    train_data = np.array(np.random.random([1024, 10]), dtype=np.float32)
    with gzip.open(train_filepath, 'wb', compresslevel=1) as f:
        f.write(train_data.tobytes())

    with FaissIndexer(index_filename='faiss.test.gz', index_key='IVF10,PQ2',
                      train_filepath=train_filepath) as a:
        a.add(vec_idx, vec)
        a.save()
        self.assertTrue(os.path.exists(a.index_abspath))
        index_abspath = a.index_abspath
        save_abspath = a.save_abspath

    with BaseIndexer.load(save_abspath) as b:
        idx, dist = b.query(query, top_k=4)
        global retr_idx
        if retr_idx is None:
            retr_idx = idx
        else:
            np.testing.assert_almost_equal(retr_idx, idx)
        self.assertEqual(idx.shape, dist.shape)
        self.assertEqual(idx.shape, (10, 4))

    self.add_tmpfile(index_abspath, save_abspath, train_filepath)
def test_scipy_indexer_known_big(compress_level, test_metas):
    """Let's try to have some real test. We will have an index with 10k vectors of
    random values between 5 and 10. We will tweak some specific vectors that we
    expect to be retrieved at query time: the vectors at indices
    [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000], which will also be
    the query vectors. The keys are then assigned shifted by 10000 to test the
    proper usage of `int2ext_id` and `ext2int_id`.
    """
    vectors = np.random.uniform(low=5.0, high=10.0, size=(10000, 1024))
    queries = np.empty((10, 1024))
    for idx in range(0, 10000, 1000):
        array = idx * np.ones((1, 1024))
        queries[int(idx / 1000)] = array
        vectors[idx] = array

    keys = np.squeeze(np.array(np.arange(10000, 20000).reshape(-1, 1), dtype=(np.str_, 16)))

    with NumpyIndexer(metric='euclidean', index_filename='np.test.gz', backend='scipy',
                      compress_level=compress_level, metas=test_metas) as indexer:
        indexer.add(keys, vectors)
        indexer.save()
        assert os.path.exists(indexer.index_abspath)
        save_abspath = indexer.save_abspath

    with BaseIndexer.load(save_abspath) as indexer:
        assert isinstance(indexer, NumpyIndexer)
        if compress_level == 0:
            assert isinstance(indexer.query_handler, np.memmap)
        idx, dist = indexer.query(queries, top_k=1)
        np.testing.assert_equal(idx, np.array(
            [['10000'], ['11000'], ['12000'], ['13000'], ['14000'], ['15000'],
             ['16000'], ['17000'], ['18000'], ['19000']]))
        assert idx.shape == dist.shape
        assert idx.shape == (10, 1)
        np.testing.assert_equal(indexer.query_by_key(['10000', '15000']), vectors[[0, 5000]])
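# Illustrative sketch (not part of the original suite): the expected nearest
# neighbours above can be verified by brute force. Because every non-planted
# component lies in [5, 10], a constant query vector is strictly closest to the
# vector that was overwritten with that same constant. A smaller-scale check:
def _brute_force_nn_sketch():
    import numpy as np
    from scipy.spatial.distance import cdist
    vectors = np.random.uniform(5.0, 10.0, size=(100, 8))
    vectors[42] = 3.0                       # plant a known nearest neighbour
    query = np.full((1, 8), 3.0)
    nearest = cdist(query, vectors, metric='euclidean').argmin(axis=1)
    assert nearest[0] == 42                 # distance 0 beats every random row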
def test_kvindexer_iterate(test_metas):
    """iterating over a KV indexer yields all stored values"""
    with BinaryPbIndexer(metas=test_metas) as idxer:
        idxer.add(['1', '2', '3'], [b'oldvalue', b'same', b'random'])
        save_abspath = idxer.save_abspath

    with BaseIndexer.load(save_abspath) as idxer:
        assert list(idxer) == [[b'oldvalue'], [b'same'], [b'random']]
def check_indexers_size(chunks, nr_docs, field, tmp_path, same_content, shards, post_op):
    cache_indexer_path = tmp_path / 'cache.bin'
    cache_full_size = 0
    with BaseIndexer.load(cache_indexer_path) as cache:
        assert isinstance(cache, DocIDCache)
        cache_full_size = cache.size
        print(f'cache size {cache.size}')

    for indexer_fname in [KV_IDX_FILENAME, VEC_IDX_FILENAME]:
        indexers_full_size = 0
        for i in range(shards):
            indexer_folder = 'docindexer' if indexer_fname == KV_IDX_FILENAME else 'vecindexer'
            indexer_folder = f'inc_{indexer_folder}-{i + 1}'
            indexer_path = (tmp_path / indexer_folder / indexer_fname
                            if shards > 1 else tmp_path / indexer_fname)

            # in the configuration of content-hash / same_content=True
            # there aren't enough docs to satisfy the batch size, so only 1 shard will have it
            if os.path.exists(indexer_path):
                with BaseIndexer.load(indexer_path) as indexer:
                    if indexer_fname == KV_IDX_FILENAME:
                        assert isinstance(indexer, BinaryPbIndexer)
                    else:
                        assert isinstance(indexer, NumpyIndexer)
                    indexers_full_size += indexer.size

        if post_op == 'delete':
            assert indexers_full_size == 0
            assert cache_full_size == 0
        else:
            if field == 'content_hash' and same_content:
                if chunks > 0:
                    # one content from Doc, one from chunk
                    expected = 2
                    assert indexers_full_size == expected
                    assert cache_full_size == 2
                else:
                    assert indexers_full_size == 1
                    assert cache_full_size == 1
            else:
                nr_expected = ((nr_docs + chunks * nr_docs) * 2
                               if post_op == 'index2'
                               else nr_docs + chunks * nr_docs)
                assert indexers_full_size == nr_expected
                assert cache_full_size == nr_expected
def test_binarypb_delete(test_metas):
    with BinaryPbIndexer(metas=test_metas) as idxer:
        idxer.add([1, 2, 3], [b'oldvalue', b'same', b'random'])
        idxer.save()
        assert idxer.size == 3
        save_abspath = idxer.save_abspath

    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query(1) == b'oldvalue'

    with BaseIndexer.load(save_abspath) as idxer:
        idxer.delete(iter([1, 2]))
        idxer.save()
        assert idxer.size == 1

    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query(1) is None
        assert idxer.query(2) is None
        assert idxer.query(3) == b'random'
def test_binarypb_delete(test_metas):
    with BinaryPbIndexer(metas=test_metas) as idxer:
        idxer.add(['1', '2', '3'], [b'oldvalue', b'same', b'random'])
        idxer.save()
        assert idxer.size == 3
        save_abspath = idxer.save_abspath

    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query('1') == b'oldvalue'

    with BaseIndexer.load(save_abspath) as idxer:
        idxer.delete(iter(['1', '2']))
        idxer.save()
        assert idxer.size == 1

    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query('1') is None
        assert idxer.query('2') is None
        assert idxer.query('3') == b'random'
def test_scannindexer(metas):
    with ScannIndexer(index_filename='scann.test.gz', metas=metas) as indexer:
        indexer.add(vec_idx, vec)
        indexer.save()
        assert os.path.exists(indexer.index_abspath)
        save_abspath = indexer.save_abspath

    with BaseIndexer.load(save_abspath) as indexer:
        idx, dist = indexer.query(query, top_k=4)
        assert idx.shape == dist.shape
        assert idx.shape == (10, 4)
def test_ngt_indexer(metas):
    with NGTIndexer(index_filename='ngt.test.gz', metas=metas) as indexer:
        indexer.add(vec_idx, vec)
        indexer.save()
        assert os.path.exists(indexer.index_abspath)
        save_abspath = indexer.save_abspath

    with BaseIndexer.load(save_abspath) as indexer:
        assert isinstance(indexer, NGTIndexer)
        idx, dist = indexer.query(query, top_k=4)
        assert idx.shape == dist.shape
        assert idx.shape == (10, 4)
def validate_index_size(num_indexed_docs):
    from jina.executors.compound import CompoundExecutor

    path_compound = Path(
        CompoundExecutor.get_component_workspace_from_compound_workspace(
            os.environ['JINA_REST_DIR'], 'chunk_indexer', 0))
    path = Path(os.environ['JINA_REST_DIR'])
    bin_files = list(path_compound.glob('*.bin')) + list(path.glob('*.bin'))
    assert len(bin_files) > 0
    for index_file in bin_files:
        index = BaseIndexer.load(str(index_file))
        assert index.size == num_indexed_docs
def test_sptagindexer(metas):
    with SptagIndexer(index_filename='np.test.gz', metas=metas) as indexer:
        indexer.add(vec_idx, vec)
        indexer.save()
        assert os.path.exists(indexer.index_abspath)
        save_abspath = indexer.save_abspath

    with BaseIndexer.load(save_abspath) as indexer:
        assert isinstance(indexer, SptagIndexer)
        idx, dist = indexer.query(query, top_k=top_k)
        assert idx.shape == dist.shape
        assert idx.shape == (num_queries, top_k)
def validate_index_size(expected_count, index_name):
    path = Path(os.environ['JINA_SHARDING_DIR'])
    # index files may live at the top level or one shard-folder deep
    index_files = list(path.glob(f'{index_name}.bin')) + list(
        path.glob(f'*/{index_name}.bin'))
    assert len(index_files) > 0
    count_sum = 0
    for index_file in index_files:
        index = BaseIndexer.load(str(index_file))
        count_sum += index.size
    assert count_sum == expected_count
def test_binarypb_update1(test_metas, delete_on_dump):
    with BinaryPbIndexer(metas=test_metas, delete_on_dump=delete_on_dump) as idxer:
        idxer.add(['1', '2', '3'], [b'oldvalue', b'same', b'random'])
        idxer.save()
        assert idxer.size == 3
        first_size = os.path.getsize(idxer.index_abspath)
        save_abspath = idxer.save_abspath

    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query(['1']) == [b'oldvalue']

    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query(['1']) == [b'oldvalue']
        # loading and querying must not change the file on disk
        second_size = os.path.getsize(idxer.index_abspath)
        assert second_size == first_size

    with BaseIndexer.load(save_abspath) as idxer:
        # some new value
        idxer.update(['1', '2'], [b'newvalue', b'same'])
        idxer.save()
        third_size = os.path.getsize(idxer.index_abspath)
        if delete_on_dump:
            assert third_size == first_size
        else:
            assert third_size > first_size
        assert idxer.size == 3

    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query(['1']) == [b'newvalue']
        assert idxer.query(['2']) == [b'same']
        assert idxer.query(['3']) == [b'random']
        assert idxer.query(['99']) == [None]

    with BaseIndexer.load(save_abspath) as idxer:
        # partial update when missing keys encountered
        idxer.update(['1', '2', '99'], [b'abcvalue', b'abcd', b'WILL_BE_IGNORED'])
        idxer.save()
        assert idxer.size == 3
        fourth_size = os.path.getsize(idxer.index_abspath)
        if delete_on_dump:
            assert fourth_size == first_size
        else:
            assert fourth_size > first_size
        assert idxer.size == 3

    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query(['1']) == [b'abcvalue']
        assert idxer.query(['2']) == [b'abcd']
        assert idxer.query(['3']) == [b'random']
        assert idxer.query(['99']) == [None]
        assert idxer.query(['1', '2']) == [b'abcvalue', b'abcd']
        assert idxer.query(['1', '2', '3']) == [b'abcvalue', b'abcd', b'random']
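# Conceptual sketch (hypothetical, not BinaryPbIndexer's actual implementation):
# the size assertions above are consistent with an append-only log, where an
# update appends a fresh record and the stale one is only dropped when
# delete_on_dump compacts the file on save. All names here are illustrative:
def _append_only_log_sketch(delete_on_dump):
    log = {}  # key -> list of appended records; "file size" ~ total record count

    def update(key, value):
        log.setdefault(key, []).append(value)  # append, never overwrite in place

    def save():
        if delete_on_dump:
            for key in log:
                log[key] = log[key][-1:]  # compaction: keep only the live record

    update('1', b'oldvalue')
    save()
    update('1', b'newvalue')
    save()
    # 1 record with compaction (size stays == first_size), 2 without (size grows)
    return sum(len(records) for records in log.values())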
def test_binarypb_add_and_update_not_working(test_metas):
    with BinaryPbIndexer(metas=test_metas) as idxer:
        idxer.add(['11', '12'], [b'eleven', b'twelve'])
        idxer.save()
        # FIXME `add` and `update` won't work in the same context,
        # since `.save` calls `.flush` on a closed handler,
        # and the handler needs to have been closed for us
        # to allow querying in the `.update`
        with pytest.raises(AttributeError):
            idxer.update(['12'], [b'twelve-new'])
            idxer.save()
        assert idxer.size == 2
        save_abspath = idxer.save_abspath

    with BaseIndexer.load(save_abspath) as idxer:
        idxer.update(['12'], [b'twelve-new'])
        idxer.save()

    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query('11') == b'eleven'
        assert idxer.query('12') == b'twelve-new'
        assert idxer.size == 2
def test_numpy_indexer_empty_data(batch_size, compress_level, test_metas):
    idx_file_path = os.path.join(test_metas['workspace'], 'np.test.gz')
    with NumpyIndexer(index_filename=str(idx_file_path),
                      compress_level=compress_level, metas=test_metas) as indexer:
        indexer.batch_size = batch_size
        indexer.touch()
        indexer.save()
        assert os.path.exists(indexer.index_abspath)
        save_abspath = indexer.save_abspath

    with BaseIndexer.load(save_abspath) as indexer:
        assert isinstance(indexer, NumpyIndexer)
        idx, dist = indexer.query(query, top_k=4)
        assert len(idx) == 0
        assert len(dist) == 0