예제 #1
0
def check_indexers_size(chunks, nr_docs, field, tmp_path, same_content, shards, post_op):
    """Assert that the cache and the KV / vector indexers hold the expected number of entries.

    The sizes of all shards are summed per indexer type and compared, together
    with the cache size, against the count implied by the scenario.

    :param chunks: number of chunks per document; each document contributes ``chunks`` extra entries
    :param nr_docs: number of documents used to compute the expected count
    :param field: cache field; ``'content_hash'`` combined with ``same_content`` collapses the expectation
    :param tmp_path: workspace root (path-like supporting ``/``) that contains ``cache.bin``
    :param same_content: truthy when the ``content_hash`` expectation should collapse to 1 or 2 entries
    :param shards: number of shards whose indexers are inspected
    :param post_op: ``'delete'`` expects empty indexers, ``'index2'`` doubles the expected count
    """
    from jina.executors.compound import CompoundExecutor

    bin_suffix = '.bin'

    cache_indexer_path = tmp_path / 'cache.bin'
    cache_full_size = 0
    with BaseIndexer.load(cache_indexer_path) as cache:
        assert isinstance(cache, DocIDCache)
        cache_full_size = cache.size
        print(f'cache size {cache.size}')

    for indexer_fname in [KV_IDX_FILENAME, VEC_IDX_FILENAME]:
        compound_name = 'inc_docindexer' if KV_IDX_FILENAME in indexer_fname else 'inc_vecindexer'
        # Strip the literal suffix only. ``str.rstrip('.bin')`` would remove any
        # trailing run of the characters '.', 'b', 'i', 'n' and could eat part of the stem.
        if indexer_fname.endswith(bin_suffix):
            workspace_name = indexer_fname[:-len(bin_suffix)]
        else:
            workspace_name = indexer_fname

        indexers_full_size = 0
        for i in range(shards):
            # Shard ids start at 1 when sharding is enabled; a single shard uses id 0.
            pea_id = i + 1 if shards > 1 else 0
            workspace_folder = CompoundExecutor.get_component_workspace_from_compound_workspace(
                tmp_path, compound_name, pea_id)
            indexer_path = os.path.join(
                BaseIndexer.get_shard_workspace(workspace_folder=workspace_folder,
                                                workspace_name=workspace_name,
                                                pea_id=pea_id),
                f'{indexer_fname}')

            # in the configuration of content-hash / same_content=True
            # there aren't enough docs to satisfy batch size, only 1 shard will have it
            if os.path.exists(indexer_path):
                with BaseIndexer.load(indexer_path) as indexer:
                    if indexer_fname == KV_IDX_FILENAME:
                        assert isinstance(indexer, BinaryPbIndexer)
                    else:
                        assert isinstance(indexer, NumpyIndexer)
                    indexers_full_size += indexer.size

        if post_op == 'delete':
            nr_expected = 0
        elif field == 'content_hash' and same_content:
            # one content from Doc, one from chunk (when there are chunks at all)
            nr_expected = 2 if chunks > 0 else 1
        else:
            nr_expected = nr_docs + chunks * nr_docs
            if post_op == 'index2':
                nr_expected *= 2

        assert indexers_full_size == nr_expected
        assert cache_full_size == nr_expected
예제 #2
0
def test_annoy_indexer_known_update_delete(metas):
    """Exercise add, query, update and delete on an AnnoyIndexer holding four known vectors."""
    points = [[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]]
    vectors = np.array(points)
    keys = np.array([4, 5, 6, 7]).reshape(-1, 1)

    with AnnoyIndexer(index_filename='annoy.test.gz', metas=metas) as writer:
        writer.add(keys, vectors)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    # The query set is identical to the indexed vectors.
    queries = np.array(points)

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, AnnoyIndexer)
        found, distances = reader.query(queries, top_k=2)
        expected = np.array([[4, 5], [5, 4], [6, 5], [7, 6]])
        np.testing.assert_equal(found, expected)
        assert found.shape == distances.shape
        assert found.shape == (4, 2)
        np.testing.assert_equal(reader.query_by_id([7, 4]), vectors[[3, 0]])

    # update: replace the vector stored under key 4; the entry count must stay at 4
    with BaseIndexer.load(save_abspath) as updater:
        replacement = np.array([[200, 200, 200]])
        updater.update([4], replacement)
        updater.save()
        assert updater.size == 4

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, AnnoyIndexer)
        found, distances = reader.query(queries, top_k=3)
        expected = np.array([[5, 6, 4], [5, 6, 4], [6, 5, 4], [7, 4, 6]])
        np.testing.assert_equal(found, expected)
        assert found.shape == distances.shape
        assert found.shape == (4, 3)

    # delete: drop key 4; three entries remain
    with BaseIndexer.load(save_abspath) as remover:
        remover.delete([4])
        remover.save()
        assert remover.size == 3

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, AnnoyIndexer)
        found, distances = reader.query(queries, top_k=2)
        expected = np.array([[5, 6], [5, 6], [6, 5], [7, 6]])
        np.testing.assert_equal(found, expected)
        assert found.shape == distances.shape
        assert found.shape == (4, 2)
예제 #3
0
def test_scann_indexer_known(metas):
    """Index four known float32 vectors with ScannIndexer and verify the results after reload."""
    points = [[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]]
    vectors = np.array(points, dtype=np.float32)
    keys = np.array(['4', '5', '6', '7']).reshape(-1, 1)

    with ScannIndexer(distance_measure='squared_l2',
                      index_filename='scann.test.gz',
                      metas=metas) as writer:
        writer.add(keys, vectors)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    # The query set is identical to the indexed vectors.
    queries = np.array(points, dtype=np.float32)

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, ScannIndexer)
        found, distances = reader.query(queries, top_k=2)
        expected = np.array([['4', '5'], ['5', '4'], ['6', '5'], ['7', '6']])
        np.testing.assert_equal(found, expected)
        assert found.shape == distances.shape
        assert found.shape == (4, 2)
        # Lookup by key returns the stored vectors in the requested order.
        fetched = reader.query_by_key(['7', '4'])
        np.testing.assert_equal(fetched, vectors[[3, 0]])
예제 #4
0
def test_numpy_indexer_known(batch_size, compress_level, test_metas):
    """Index four known vectors with a euclidean NumpyIndexer and verify the results after reload."""
    points = [[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]]
    vectors = np.array(points)
    keys = np.array(['4', '5', '6', '7'], dtype=(np.str_, 16))

    with NumpyIndexer(metric='euclidean',
                      index_filename='np.test.gz',
                      compress_level=compress_level,
                      metas=test_metas) as writer:
        writer.batch_size = batch_size
        writer.add(keys, vectors)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    # The query set is identical to the indexed vectors.
    queries = np.array(points)

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, NumpyIndexer)
        # The memmap check is only made for uncompressed indexes.
        if compress_level == 0:
            assert isinstance(reader.query_handler, np.memmap)
        found, distances = reader.query(queries, top_k=2)
        expected = np.array([['4', '5'], ['5', '4'], ['6', '5'], ['7', '6']])
        np.testing.assert_equal(found, expected)
        assert found.shape == distances.shape
        assert found.shape == (4, 2)
        fetched = reader.query_by_key(['7', '4'])
        np.testing.assert_equal(fetched, vectors[[3, 0]])
예제 #5
0
def test_faiss_indexer_known(metas):
    """Index four known float32 vectors with a flat FaissIndexer and verify the results after reload."""
    points = [[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]]
    vectors = np.array(points, dtype=np.float32)
    keys = np.array([4, 5, 6, 7]).reshape(-1, 1)

    # The training file holds the raw bytes of the very vectors that get indexed.
    train_filepath = os.path.join(os.environ['TEST_WORKSPACE'], 'train.tgz')
    train_data = vectors
    with gzip.open(train_filepath, 'wb', compresslevel=1) as train_file:
        train_file.write(train_data.tobytes())

    with FaissIndexer(index_filename='faiss.test.gz',
                      index_key='Flat',
                      train_filepath=train_filepath,
                      metas=metas) as writer:
        writer.add(keys, vectors)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    # The query set is identical to the indexed vectors.
    queries = np.array(points, dtype=np.float32)

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, FaissIndexer)
        found, distances = reader.query(queries, top_k=2)
        expected = np.array([[4, 5], [5, 4], [6, 5], [7, 6]])
        np.testing.assert_equal(found, expected)
        assert found.shape == distances.shape
        assert found.shape == (4, 2)
        fetched = reader.query_by_id([7, 4])
        np.testing.assert_equal(fetched, vectors[[3, 0]])
예제 #6
0
def test_ngt_indexer_known_big(metas):
    """Query an NGTIndexer holding 10k random vectors for ten planted vectors.

    The index is filled with 10k vectors of random values between 5 and 10.
    The rows at positions [0, 1000, 2000, ..., 9000] are overwritten with
    constant vectors, and those same vectors are used as queries, so each
    query is expected to retrieve its own row. Keys are offset from the row
    positions (10000..19999), so a key never equals its row position.
    """
    dim = 1024
    total = 10000
    step = 1000

    vectors = np.random.uniform(low=5.0, high=10.0,
                                size=(total, dim)).astype('float32')

    queries = np.empty((10, dim))
    for slot, row in enumerate(range(0, total, step)):
        # Plant a constant vector at ``row`` and reuse it as query number ``slot``.
        planted = row * np.ones((1, dim))
        queries[slot] = planted
        vectors[row] = planted

    keys = np.arange(10000, 20000).reshape(-1, 1).astype(str)

    with NGTIndexer(index_filename='ngt.test.gz', num_threads=4,
                    metas=metas) as writer:
        writer.add(keys, vectors)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, NGTIndexer)
        found, distances = reader.query(queries, top_k=1)
        expected = np.arange(10000, 20000, step).reshape(-1, 1).astype(str)
        np.testing.assert_equal(found, expected)
        assert found.shape == distances.shape
        assert found.shape == (10, 1)
        fetched = reader.query_by_key(['10000', '15000'])
        np.testing.assert_equal(fetched, vectors[[0, 5000]])
예제 #7
0
def test_scannindexer():
    """Build a ScannIndexer from the module-level data, reload it and check the query result shapes.

    The first result set seen is remembered in the module-level ``retr_idx``;
    later results are compared against it.
    """
    global retr_idx

    # A random training file is written here; it is not passed to the indexer in this function.
    train_filepath = os.path.join(cur_dir, 'train.tgz')
    train_data = np.array(np.random.random([1024, 10]), dtype=np.float32)
    with gzip.open(train_filepath, 'wb', compresslevel=1) as train_file:
        train_file.write(train_data.tobytes())

    with ScannIndexer(index_filename='scann.test.gz') as writer:
        writer.add(vec_idx, vec)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        index_abspath, save_abspath = writer.index_abspath, writer.save_abspath

    with BaseIndexer.load(save_abspath) as reader:
        idx, dist = reader.query(query, top_k=4)
        print(idx, dist)
        if retr_idx is not None:
            np.testing.assert_almost_equal(retr_idx, idx)
        else:
            retr_idx = idx
        assert idx.shape == dist.shape
        assert idx.shape == (10, 4)

    rm_files([index_abspath, save_abspath])
예제 #8
0
def test_indexer_zeros(metric, dimension, test_metas):
    """Query a NumpyIndexer with an all-zero vector and check how NaN distances show up.

    For the ``'cosine'`` metric every returned distance is expected to be NaN;
    for any other metric none of them may be NaN.
    """
    import math

    zero_query = np.array(np.zeros([1, dimension]), dtype=np.float32)
    random_keys = np.array(np.random.randint(0, high=num_data,
                                             size=[num_data]),
                           dtype=(np.str_, 16))
    random_vectors = np.random.random([num_data, dimension])

    with NumpyIndexer(metric=metric,
                      index_filename='np.test.gz',
                      metas=test_metas) as writer:
        writer.add(random_keys, random_vectors)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, NumpyIndexer)
        assert isinstance(reader.query_handler, np.memmap)
        idx, dist = reader.query(zero_query, top_k=4)

        assert idx.shape == dist.shape
        assert idx.shape == (1, 4)
        nan_flags = [math.isnan(value) for value in dist[0]]
        if metric == 'cosine':
            assert all(nan_flags)
        else:
            assert not any(nan_flags)
예제 #9
0
def validate_index_size(num_indexed_docs):
    """Assert that every ``*.bin`` index under ``$JINA_TOPK_DIR`` holds ``num_indexed_docs`` entries.

    :param num_indexed_docs: size each loaded index is expected to report
    """
    path = Path(os.environ['JINA_TOPK_DIR'])
    index_files = list(path.glob('*.bin'))
    assert len(index_files) > 0
    for index_file in index_files:
        # Load through the context manager so the indexer is closed again
        # instead of being left open after the size check.
        with BaseIndexer.load(str(index_file)) as index:
            assert index.size == num_indexed_docs
예제 #10
0
def test_indexer_train_from_index_different_compression_levels(
        metas, compression_level):
    """Build an ``IVF10,PQ4`` FaissIndexer whose train file path equals its own index file path.

    Data is generated from a fixed seed; only the shapes of the query results are checked.
    """
    np.random.seed(500)
    n_vectors, n_dims, n_queries = 500, 64, 10

    # Draw order matters for reproducibility: queries, then keys, then vectors.
    query = np.array(np.random.random([n_queries, n_dims]), dtype=np.float32)
    vec_idx = np.random.randint(0, high=n_vectors, size=[n_vectors])
    vec = np.random.random([n_vectors, n_dims])

    index_filename = 'faiss.test.gz'
    train_filepath = os.path.join(metas['workspace'], 'faiss.test.gz')

    with FaissIndexer(index_filename=index_filename,
                      index_key='IVF10,PQ4',
                      train_filepath=train_filepath,
                      compression_level=compression_level,
                      metas=metas) as writer:
        writer.add(vec_idx, vec)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, FaissIndexer)
        found, distances = reader.query(query, top_k=4)
        assert found.shape == distances.shape
        assert found.shape == (n_queries, 4)
예제 #11
0
def test_binarypb_update_twice(test_metas):
    """Two consecutive updates within one loaded context are both persisted."""
    initial_keys = ['1', '2', '3']
    initial_values = [b'oldvalue', b'same', b'random']

    with BinaryPbIndexer(metas=test_metas) as writer:
        writer.add(initial_keys, initial_values)
        writer.save()
        assert writer.size == 3
        save_abspath = writer.save_abspath

    # Apply both updates back to back before saving once.
    with BaseIndexer.load(save_abspath) as updater:
        updater.update(['1'], [b'newvalue'])
        updater.update(['2'], [b'othernewvalue'])
        updater.save()

    with BaseIndexer.load(save_abspath) as reader:
        assert reader.query('1') == b'newvalue'
        assert reader.query('2') == b'othernewvalue'
예제 #12
0
def run_test(indexer):
    """Store three JSON-serialised documents in ``indexer``, reload it and query one back.

    :param indexer: indexer instance used as a context manager; it must provide
        ``add``, ``touch``, ``save``, ``save_abspath`` and ``index_abspath``
    """
    def _build_doc(doc_id, text, weight, length):
        # Assemble a protobuf Document with the given id, utf8 buffer, weight and length.
        doc = jina_pb2.Document()
        doc.id = doc_id
        doc.buffer = text.encode('utf8')
        doc.weight = weight
        doc.length = length
        return doc

    specs = (
        ('d1', 1, 'cat', 0.1),
        ('d2', 2, 'dog', 0.2),
        ('d3', 3, 'bird', 0.3),
    )

    with indexer as idx:
        data = {
            key: MessageToJson(_build_doc(doc_id, text, weight, 3))
            for key, doc_id, text, weight in specs
        }
        idx.add(data)
        idx.touch()
        idx.save()
        save_abspath, index_abspath = idx.save_abspath, idx.index_abspath

    assert os.path.exists(index_abspath)
    assert os.path.exists(save_abspath)

    with BaseIndexer.load(save_abspath) as searcher:
        hit = searcher.query('d2')
        assert hit.id == 2
        assert hit.length == 3

    rm_files([save_abspath, index_abspath])
예제 #13
0
def test_sptag_indexer_known(metas):
    """Index four known float32 vectors with an L2 SptagIndexer and verify the results after reload."""
    points = [[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]]
    vectors = np.array(points, dtype=np.float32)
    keys = np.array([4, 5, 6, 7]).reshape(-1, 1).astype(str)

    with SptagIndexer(dist_calc_method='L2',
                      index_filename='sptag.test.gz',
                      metas=metas) as writer:
        writer.add(keys, vectors)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    # The query set is identical to the indexed vectors.
    queries = np.array(points, dtype=np.float32)

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, SptagIndexer)
        found, distances = reader.query(queries, top_k=2)
        expected = np.array([[4, 5], [5, 4], [6, 5], [7, 6]]).astype(str)
        np.testing.assert_equal(found, expected)
        # Within each row the first distance must be strictly smaller than the second.
        for row in distances:
            assert row[0] < row[1]
        assert found.shape == distances.shape
        assert found.shape == (4, 2)
        fetched = reader.query_by_key(['7', '4'])
        np.testing.assert_equal(fetched, vectors[[3, 0]])
예제 #14
0
def validate_index_size(num_indexed_docs, expected_indices):
    """Assert the number of ``*.bin`` indexes under ``$JINA_CORRUPTED_DOCS_TEST_DIR`` and their sizes.

    :param num_indexed_docs: size each loaded index is expected to report
    :param expected_indices: exact number of ``*.bin`` files expected in the directory
    """
    path = Path(os.environ['JINA_CORRUPTED_DOCS_TEST_DIR'])
    index_files = list(path.glob('*.bin'))
    assert len(index_files) == expected_indices
    for index_file in index_files:
        # Load through the context manager so the indexer is closed again
        # instead of being left open after the size check.
        with BaseIndexer.load(str(index_file)) as index:
            assert index.size == num_indexed_docs
예제 #15
0
def test_numpy_indexer_known(batch_size, compress_level, test_metas):
    """Index four known vectors under integer keys with a euclidean NumpyIndexer and verify after reload."""
    points = [[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]]
    vectors = np.array(points)
    keys = np.array([4, 5, 6, 7]).reshape(-1, 1)

    with NumpyIndexer(metric='euclidean',
                      index_filename='np.test.gz',
                      compress_level=compress_level,
                      metas=test_metas) as writer:
        writer.batch_size = batch_size
        writer.add(keys, vectors)
        writer.save()
        assert Path(writer.index_abspath).exists()
        save_abspath = writer.save_abspath

    # The query set is identical to the indexed vectors.
    queries = np.array(points)

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, NumpyIndexer)
        # The memmap check is only made for uncompressed indexes.
        if compress_level == 0:
            assert isinstance(reader.raw_ndarray, np.memmap)
        found, distances = reader.query(queries, top_k=2)
        expected = np.array([[4, 5], [5, 4], [6, 5], [7, 6]])
        np.testing.assert_equal(found, expected)
        assert found.shape == distances.shape
        assert found.shape == (4, 2)
        fetched = reader.query_by_id([7, 4])
        np.testing.assert_equal(fetched, vectors[[3, 0]])
예제 #16
0
def test_nmslib_indexer_known(metas):
    """Index four known float32 vectors with an l2 NmsLibIndexer and verify the results after reload."""
    points = [[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]]
    vectors = np.array(points, dtype=np.float32)
    keys = np.array([4, 5, 6, 7]).reshape(-1, 1).astype(str)

    with NmsLibIndexer(space='l2',
                       index_filename='nmslib.test.gz',
                       metas=metas) as writer:
        writer.add(keys, vectors)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    # The query set is identical to the indexed vectors.
    queries = np.array(points, dtype=np.float32)

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, NmsLibIndexer)
        found, distances = reader.query(queries, top_k=2)
        expected = np.array([[4, 5], [5, 4], [6, 5], [7, 6]]).astype(str)
        np.testing.assert_equal(found, expected)
        assert found.shape == distances.shape
        assert found.shape == (4, 2)
        fetched = reader.query_by_key(['7', '4'])
        np.testing.assert_equal(fetched, vectors[[3, 0]])
예제 #17
0
    def test_faiss_indexer(self):
        """Train and build an ``IVF10,PQ2`` FaissIndexer, reload it and check the query result shapes.

        The first result set seen is remembered in the module-level ``retr_idx``;
        later results are compared against it.
        """
        global retr_idx

        # Write random float32 training data for the index.
        train_filepath = os.path.join(cur_dir, 'train.tgz')
        train_data = np.array(np.random.random([1024, 10]), dtype=np.float32)
        with gzip.open(train_filepath, 'wb', compresslevel=1) as train_file:
            train_file.write(train_data.tobytes())

        with FaissIndexer(index_filename='faiss.test.gz',
                          index_key='IVF10,PQ2',
                          train_filepath=train_filepath) as writer:
            writer.add(vec_idx, vec)
            writer.save()
            self.assertTrue(os.path.exists(writer.index_abspath))
            index_abspath, save_abspath = writer.index_abspath, writer.save_abspath

        with BaseIndexer.load(save_abspath) as reader:
            idx, dist = reader.query(query, top_k=4)
            if retr_idx is not None:
                np.testing.assert_almost_equal(retr_idx, idx)
            else:
                retr_idx = idx
            self.assertEqual(idx.shape, dist.shape)
            self.assertEqual(idx.shape, (10, 4))

        # Register the generated files via the test case's ``add_tmpfile``.
        self.add_tmpfile(index_abspath, save_abspath, train_filepath)
예제 #18
0
def test_scipy_indexer_known_big(compress_level, test_metas):
    """Query a scipy-backed NumpyIndexer holding 10k random vectors for ten planted vectors.

    The index is filled with 10k vectors of random values between 5 and 10.
    The rows at positions [0, 1000, 2000, ..., 9000] are overwritten with
    constant vectors, and those same vectors are used as queries, so each
    query is expected to retrieve its own row. Keys are offset from the row
    positions (10000..19999), so a key never equals its row position.
    """
    dim = 1024
    total = 10000
    step = 1000

    vectors = np.random.uniform(low=5.0, high=10.0, size=(total, dim))

    queries = np.empty((10, dim))
    for slot, row in enumerate(range(0, total, step)):
        # Plant a constant vector at ``row`` and reuse it as query number ``slot``.
        planted = row * np.ones((1, dim))
        queries[slot] = planted
        vectors[row] = planted

    key_column = np.arange(10000, 20000).reshape(-1, 1)
    keys = np.squeeze(np.array(key_column, dtype=(np.str_, 16)))

    with NumpyIndexer(metric='euclidean',
                      index_filename='np.test.gz',
                      backend='scipy',
                      compress_level=compress_level,
                      metas=test_metas) as writer:
        writer.add(keys, vectors)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, NumpyIndexer)
        # The memmap check is only made for uncompressed indexes.
        if compress_level == 0:
            assert isinstance(reader.query_handler, np.memmap)
        found, distances = reader.query(queries, top_k=1)
        expected = np.array([[str(key)] for key in range(10000, 20000, step)])
        np.testing.assert_equal(found, expected)
        assert found.shape == distances.shape
        assert found.shape == (10, 1)
        fetched = reader.query_by_key(['10000', '15000'])
        np.testing.assert_equal(fetched, vectors[[0, 5000]])
예제 #19
0
def test_kvindexer_iterate(test_metas):
    """Iterating a reloaded BinaryPbIndexer yields each stored value wrapped in a one-element list."""
    stored_keys = ['1', '2', '3']
    stored_values = [b'oldvalue', b'same', b'random']

    with BinaryPbIndexer(metas=test_metas) as writer:
        writer.add(stored_keys, stored_values)
        save_abspath = writer.save_abspath

    with BaseIndexer.load(save_abspath) as reader:
        iterated = list(reader)
        assert iterated == [[b'oldvalue'], [b'same'], [b'random']]
예제 #20
0
def check_indexers_size(chunks, nr_docs, field, tmp_path, same_content, shards,
                        post_op):
    """Assert that the cache and the KV / vector indexers under ``tmp_path`` hold the expected entry counts.

    :param chunks: number of chunks per document; each document contributes ``chunks`` extra entries
    :param nr_docs: number of documents used to compute the expected count
    :param field: cache field; ``'content_hash'`` combined with ``same_content`` collapses the expectation
    :param tmp_path: workspace root (path-like supporting ``/``) that contains ``cache.bin``
    :param same_content: truthy when the ``content_hash`` expectation should collapse to 1 or 2 entries
    :param shards: number of shards whose indexers are inspected
    :param post_op: ``'delete'`` expects empty indexers, ``'index2'`` doubles the expected count
    """
    with BaseIndexer.load(tmp_path / 'cache.bin') as cache:
        assert isinstance(cache, DocIDCache)
        cache_full_size = cache.size
        print(f'cache size {cache.size}')

    for indexer_fname in (KV_IDX_FILENAME, VEC_IDX_FILENAME):
        is_kv = indexer_fname == KV_IDX_FILENAME
        folder_stem = 'docindexer' if is_kv else 'vecindexer'

        indexers_full_size = 0
        for shard in range(shards):
            # With several shards each one has its own ``inc_<stem>-<n>`` folder (n starts at 1);
            # otherwise the index file sits directly in ``tmp_path``.
            if shards > 1:
                indexer_path = tmp_path / f'inc_{folder_stem}-{shard + 1}' / indexer_fname
            else:
                indexer_path = tmp_path / indexer_fname

            # in the configuration of content-hash / same_content=True
            # there aren't enough docs to satisfy batch size, only 1 shard will have it
            if not os.path.exists(indexer_path):
                continue

            with BaseIndexer.load(indexer_path) as indexer:
                if is_kv:
                    assert isinstance(indexer, BinaryPbIndexer)
                else:
                    assert isinstance(indexer, NumpyIndexer)
                indexers_full_size += indexer.size

        if post_op == 'delete':
            nr_expected = 0
        elif field == 'content_hash' and same_content:
            # one content from Doc, one from chunk (when there are chunks at all)
            nr_expected = 2 if chunks > 0 else 1
        else:
            per_round = nr_docs + chunks * nr_docs
            nr_expected = per_round * 2 if post_op == 'index2' else per_round

        assert indexers_full_size == nr_expected
        assert cache_full_size == nr_expected
예제 #21
0
def test_binarypb_delete(test_metas):
    """Deleting keys from a reloaded BinaryPbIndexer removes exactly those entries."""
    with BinaryPbIndexer(metas=test_metas) as idxer:
        idxer.add([1, 2, 3], [b'oldvalue', b'same', b'random'])
        idxer.save()
        assert idxer.size == 3
        save_abspath = idxer.save_abspath

    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query(1) == b'oldvalue'

    # The keys to delete are passed as an iterator rather than a list.
    with BaseIndexer.load(save_abspath) as idxer:
        idxer.delete(iter([1, 2]))
        idxer.save()
        assert idxer.size == 1

    with BaseIndexer.load(save_abspath) as idxer:
        # Identity comparison is the correct way to test for the None singleton.
        assert idxer.query(1) is None
        assert idxer.query(2) is None
        assert idxer.query(3) == b'random'
예제 #22
0
def test_binarypb_delete(test_metas):
    """Deleting string keys from a reloaded BinaryPbIndexer removes exactly those entries."""
    initial_keys = ['1', '2', '3']
    initial_values = [b'oldvalue', b'same', b'random']

    with BinaryPbIndexer(metas=test_metas) as writer:
        writer.add(initial_keys, initial_values)
        writer.save()
        assert writer.size == 3
        save_abspath = writer.save_abspath

    with BaseIndexer.load(save_abspath) as reader:
        assert reader.query('1') == b'oldvalue'

    # The keys to delete are passed as an iterator rather than a list.
    with BaseIndexer.load(save_abspath) as remover:
        doomed = iter(['1', '2'])
        remover.delete(doomed)
        remover.save()
        assert remover.size == 1

    with BaseIndexer.load(save_abspath) as reader:
        assert reader.query('1') is None
        assert reader.query('2') is None
        assert reader.query('3') == b'random'
예제 #23
0
def test_scannindexer(metas):
    """Build a ScannIndexer from the module-level data, reload it and check the query result shapes."""
    with ScannIndexer(index_filename='scann.test.gz', metas=metas) as writer:
        writer.add(vec_idx, vec)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    with BaseIndexer.load(save_abspath) as reader:
        found, distances = reader.query(query, top_k=4)
        assert found.shape == distances.shape
        assert found.shape == (10, 4)
예제 #24
0
def test_ngt_indexer(metas):
    """Build an NGTIndexer from the module-level data, reload it and check the query result shapes."""
    with NGTIndexer(index_filename='ngt.test.gz', metas=metas) as writer:
        writer.add(vec_idx, vec)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, NGTIndexer)
        found, distances = reader.query(query, top_k=4)
        assert found.shape == distances.shape
        assert found.shape == (10, 4)
예제 #25
0
def validate_index_size(num_indexed_docs):
    """Assert that every ``*.bin`` index of the REST test workspace holds ``num_indexed_docs`` entries.

    Index files are collected both from the ``chunk_indexer`` component
    workspace and from ``$JINA_REST_DIR`` itself.

    :param num_indexed_docs: size each loaded index is expected to report
    """
    from jina.executors.compound import CompoundExecutor

    path_compound = Path(
        CompoundExecutor.get_component_workspace_from_compound_workspace(
            os.environ['JINA_REST_DIR'], 'chunk_indexer', 0))
    path = Path(os.environ['JINA_REST_DIR'])
    bin_files = list(path_compound.glob('*.bin')) + list(path.glob('*.bin'))
    assert len(bin_files) > 0
    for index_file in bin_files:
        # Load through the context manager so the indexer is closed again
        # instead of being left open after the size check.
        with BaseIndexer.load(str(index_file)) as index:
            assert index.size == num_indexed_docs
예제 #26
0
def test_sptagindexer(metas):
    """Build a SptagIndexer from the module-level data, reload it and check the query result shapes."""
    with SptagIndexer(index_filename='np.test.gz', metas=metas) as writer:
        writer.add(vec_idx, vec)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, SptagIndexer)
        found, distances = reader.query(query, top_k=top_k)
        assert found.shape == distances.shape
        assert found.shape == (num_queries, top_k)
예제 #27
0
def validate_index_size(expected_count, index_name):
    """Assert that all ``<index_name>.bin`` indexes under ``$JINA_SHARDING_DIR`` sum to ``expected_count``.

    Index files are looked up both directly in the directory and one level
    below it (``*/<index_name>.bin``).

    :param expected_count: total number of entries expected across all matching indexes
    :param index_name: file stem of the index files to load
    """
    path = Path(os.environ['JINA_SHARDING_DIR'])
    index_files = list(path.glob(f'{index_name}.bin')) + list(
        path.glob(f'*/{index_name}.bin'))
    assert len(index_files) > 0

    count_sum = 0
    for index_file in index_files:
        # Load through the context manager so the indexer is closed again
        # instead of being left open after its size is read.
        with BaseIndexer.load(str(index_file)) as index:
            count_sum += index.size
    assert count_sum == expected_count
예제 #28
0
def test_binarypb_update1(test_metas, delete_on_dump):
    """Check BinaryPbIndexer updates, including the on-disk index size after each step.

    The index file size is measured after each context has exited and compared
    with the size after the initial add: it is expected to stay equal when
    ``delete_on_dump`` is truthy and to grow otherwise.

    :param test_metas: metas passed to the indexer constructor
    :param delete_on_dump: forwarded to ``BinaryPbIndexer``; selects which size expectation applies
    """
    with BinaryPbIndexer(metas=test_metas,
                         delete_on_dump=delete_on_dump) as idxer:
        idxer.add(['1', '2', '3'], [b'oldvalue', b'same', b'random'])
        idxer.save()
        assert idxer.size == 3

    # Baseline size of the index file, read after the writing context has exited.
    first_size = os.path.getsize(idxer.index_abspath)
    save_abspath = idxer.save_abspath

    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query(['1']) == [b'oldvalue']

    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query(['1']) == [b'oldvalue']

    # Two read-only load/query rounds must not change the file size.
    second_size = os.path.getsize(idxer.index_abspath)
    assert second_size == first_size

    with BaseIndexer.load(save_abspath) as idxer:
        # some new value
        idxer.update(['1', '2'], [b'newvalue', b'same'])
        idxer.save()

    # Size after the first update; ``idxer`` here is the already-exited indexer from the block above.
    third_size = os.path.getsize(idxer.index_abspath)
    if delete_on_dump:
        assert third_size == first_size
    else:
        assert third_size > first_size
    assert idxer.size == 3

    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query(['1']) == [b'newvalue']
        assert idxer.query(['2']) == [b'same']
        assert idxer.query(['3']) == [b'random']
        # A key that was never added yields None in the result list.
        assert idxer.query(['99']) == [None]

    with BaseIndexer.load(save_abspath) as idxer:
        # partial update when missing keys encountered
        idxer.update(['1', '2', '99'],
                     [b'abcvalue', b'abcd', b'WILL_BE_IGNORED'])
        idxer.save()
        assert idxer.size == 3

    # Size after the partial update, compared against the baseline again.
    fourth_size = os.path.getsize(idxer.index_abspath)
    if delete_on_dump:
        assert fourth_size == first_size
    else:
        assert fourth_size > first_size
    assert idxer.size == 3

    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query(['1']) == [b'abcvalue']
        assert idxer.query(['2']) == [b'abcd']
        assert idxer.query(['3']) == [b'random']
        # The unknown key '99' was not inserted by the update.
        assert idxer.query(['99']) == [None]
        # Multi-key queries return the values in the order of the requested keys.
        assert idxer.query(['1', '2']) == [b'abcvalue', b'abcd']
        assert idxer.query(['1', '2',
                            '3']) == [b'abcvalue', b'abcd', b'random']
예제 #29
0
def test_binarypb_add_and_update_not_working(test_metas):
    """``update`` + ``save`` raise AttributeError in the context that did ``add``, but work after a reload."""
    with BinaryPbIndexer(metas=test_metas) as writer:
        writer.add(['11', '12'], [b'eleven', b'twelve'])
        writer.save()
        # FIXME `add` and `update` won't work in the same context
        # since `.save` calls `.flush` on a closed handler
        # and the handler needs to have been
        # closed for us to allow querying in the `.update`
        with pytest.raises(AttributeError):
            writer.update(['12'], [b'twelve-new'])
            writer.save()
        assert writer.size == 2
        save_abspath = writer.save_abspath

    # In a freshly loaded context the same update goes through.
    with BaseIndexer.load(save_abspath) as updater:
        updater.update(['12'], [b'twelve-new'])
        updater.save()

    with BaseIndexer.load(save_abspath) as reader:
        assert reader.query('11') == b'eleven'
        assert reader.query('12') == b'twelve-new'
        assert reader.size == 2
예제 #30
0
def test_numpy_indexer_empty_data(batch_size, compress_level, test_metas):
    """A NumpyIndexer saved without any added data returns empty results when queried."""
    idx_file_path = os.path.join(test_metas['workspace'], 'np.test.gz')

    with NumpyIndexer(index_filename=str(idx_file_path),
                      compress_level=compress_level,
                      metas=test_metas) as writer:
        writer.batch_size = batch_size
        # Nothing is added; the indexer is only touched and then saved.
        writer.touch()
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, NumpyIndexer)
        found, distances = reader.query(query, top_k=4)
        assert len(found) == 0
        assert len(distances) == 0