예제 #1
0
def test_binarypb_add_and_update_not_working(test_metas, delete_on_dump):
    """Check that `update` fails right after `add` but works after a reload.

    Three entries are added and saved through a fresh ``BinaryPbIndexer``.
    An ``update`` issued inside that same context is expected to raise
    ``AttributeError``.  Once the indexer is loaded back from its
    ``save_abspath`` the update goes through and is visible to later queries.
    """
    with BinaryPbIndexer(metas=test_metas,
                         delete_on_dump=delete_on_dump) as writer:
        writer.add(['11', '12', '13'], [b'eleven', b'twelve', b'thirteen'])
        writer.save()
        # FIXME `add` and `update` won't work in the same context
        # since `.save` calls `.flush` on a closed handler
        # and the handler needs to have been
        # closed for us to allow querying in the `.update`
        with pytest.raises(AttributeError):
            writer.update(['12'], [b'twelve-new'])
            writer.save()
        assert writer.size == 3
        save_abspath = writer.save_abspath

    # Same update, this time on an indexer loaded from disk.
    with BaseIndexer.load(save_abspath) as updater:
        updater.update(['12'], [b'twelve-new'])
        updater.save()

    # (keys to query, values expected back), checked in this order.
    expectations = (
        (['11'], [b'eleven']),
        (['12'], [b'twelve-new']),
        (['12', '13'], [b'twelve-new', b'thirteen']),
    )
    with BaseIndexer.load(save_abspath) as reader:
        for queried_keys, expected_values in expectations:
            assert reader.query(queried_keys) == expected_values
        assert reader.size == 3
        assert reader.sample() in (b'eleven', b'twelve-new', b'thirteen')
예제 #2
0
    def assert_bi():
        """Save a ``BaseIndexer(1)`` config to YAML and check ``a == 1`` after reload.

        ``tmpdir`` is taken from the enclosing scope, which is not visible here.
        """
        config_path = os.path.join(tmpdir, 'tmp.yml')
        original = BaseIndexer(1)
        original.save_config(config_path)

        with open(config_path) as fp:
            restored = JAML.load(fp)
            assert restored.a == 1
예제 #3
0
    def test_annoy_wrap_indexer(self):
        """Query the annoy and nmslib wrapper configs built on a saved NumpyIndexer.

        A ``NumpyIndexer`` is filled and saved first.  Each wrapper YAML is then
        loaded and queried, and its result indices are compared with the
        module-level ``retr_idx`` (which is set by whichever query runs first).
        """

        def check_wrapped_query(wrapped):
            # `retr_idx` is a module-level value shared between tests.
            global retr_idx
            idx, dist = wrapped.query(query, top_k=4)
            if retr_idx is not None:
                np.testing.assert_almost_equal(retr_idx, idx)
            else:
                retr_idx = idx
            self.assertEqual(idx.shape, dist.shape)
            self.assertEqual(idx.shape, (10, 4))

        with NumpyIndexer(index_filename='wrap-npidx.gz') as base_indexer:
            base_indexer.name = 'wrap-npidx'
            base_indexer.add(vec_idx, vec)
            base_indexer.save()
            index_abspath = base_indexer.index_abspath
            save_abspath = base_indexer.save_abspath

        annoy_config = os.path.join(cur_dir, 'annoy-wrap.yml')
        with BaseIndexer.load_config(annoy_config) as annoy_wrapped:
            check_wrapped_query(annoy_wrapped)

        nmslib_config = os.path.join(cur_dir, 'nmslib-wrap.yml')
        with BaseIndexer.load_config(nmslib_config) as nmslib_wrapped:
            check_wrapped_query(nmslib_wrapped)
            # Hand the generated files to the test case's temp-file bookkeeping.
            self.add_tmpfile(index_abspath, save_abspath)
예제 #4
0
    def test_annoy_wrap_indexer(self):
        """Query the annoy and nmslib wrapper configs built on a saved NumpyIndexer.

        A ``NumpyIndexer`` is filled and saved first.  Each wrapper YAML is then
        loaded and queried, and its result indices are compared with the
        module-level ``retr_idx`` (which is set by whichever query runs first).

        Every indexer is opened in a ``with`` block so that it is closed even
        when an assertion fails; previously the two loaded wrappers were never
        closed.
        """
        global retr_idx

        with NumpyIndexer(index_filename='wrap-npidx.gz') as a:
            a.name = 'wrap-npidx'
            a.add(vec_idx, vec)
            a.save()
            # Remember the paths for the temp-file registration at the end.
            index_abspath = a.index_abspath
            save_abspath = a.save_abspath

        with BaseIndexer.load_config('annoy-wrap.yml') as b:
            idx, dist = b.query(query, top_k=4)
            print(idx, dist)
            if retr_idx is None:
                retr_idx = idx
            else:
                np.testing.assert_almost_equal(retr_idx, idx)
            self.assertEqual(idx.shape, dist.shape)
            self.assertEqual(idx.shape, (10, 4))

        with BaseIndexer.load_config('nmslib-wrap.yml') as c:
            idx, dist = c.query(query, top_k=4)
            print(idx, dist)
            if retr_idx is None:
                retr_idx = idx
            else:
                np.testing.assert_almost_equal(retr_idx, idx)
            self.assertEqual(idx.shape, dist.shape)
            self.assertEqual(idx.shape, (10, 4))

        self.add_tmpfile(index_abspath, save_abspath)
예제 #5
0
def test_numpy_update_delete(compress_level, test_metas):
    """Exercise add, update and delete on a ``NumpyIndexer`` across reloads.

    Each mutation is saved and then verified on an indexer freshly loaded
    from ``save_abspath``.
    """
    np.random.seed(500)
    num_dim = 3
    vec_idx = np.array(['12', '112', '903'], dtype=(np.str_, 16))
    num_keys = len(vec_idx)
    vec = np.random.random([num_keys, num_dim])

    with NumpyIndexer(metric='euclidean',
                      index_filename='np.test.gz',
                      compress_level=compress_level,
                      metas=test_metas) as writer:
        writer.add(vec_idx, vec)
        writer.save()
        assert writer.num_dim == num_dim
        assert writer.size == num_keys
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, NumpyIndexer)
        stored = reader.query_by_key(vec_idx)
        assert np.array_equal(vec, stored)

    # ---- update ----
    key_to_update = vec_idx[0]
    data_to_update = np.random.random([1, num_dim])
    # a key that was never added
    random_keys = np.array(['999'], dtype=(np.str_, 16))
    random_data = np.random.random([1, num_dim])

    with BaseIndexer.load(save_abspath) as updater:
        assert isinstance(updater, NumpyIndexer)
        # NON-EXISTENT KEYS: this will log warning but not fail
        updater.update(random_keys, random_data)
        updater.update([key_to_update], data_to_update)
        updater.save()

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, NumpyIndexer)
        refreshed = reader.query_by_key([key_to_update])
        assert np.array_equal(data_to_update, refreshed)

    # ---- delete ----
    num_deleted = 1
    vec_idx_to_delete = vec_idx[:num_deleted]
    remaining = num_keys - num_deleted

    with BaseIndexer.load(save_abspath) as deleter:
        assert isinstance(deleter, NumpyIndexer)
        deleter.delete(vec_idx_to_delete)
        deleter.save()
        assert deleter.size == remaining

    # The size is checked once more after the context has exited.
    assert deleter.size == remaining

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, NumpyIndexer)
        assert reader.size == remaining
        # random non-existent key
        assert reader.query_by_key(['123861942']) is None
        survivors = reader.query_by_key(vec_idx[num_deleted:])
        np.testing.assert_allclose(survivors, vec[num_deleted:],
                                   equal_nan=True)
예제 #6
0
def test_faiss_indexer_known_update_delete(metas):
    """Run a flat ``FaissIndexer`` through add, update and delete.

    After every saved mutation the indexer is reloaded and the returned
    neighbour keys for four fixed queries are compared with hard-coded
    expectations.
    """
    vectors = np.array(
        [[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]],
        dtype=np.float32)
    keys = np.array([4, 5, 6, 7]).reshape(-1, 1)

    # The indexed vectors double as training data, written as gzipped bytes.
    train_filepath = os.path.join(os.environ['TEST_WORKSPACE'], 'train.tgz')
    with gzip.open(train_filepath, 'wb', compresslevel=1) as train_file:
        train_file.write(vectors.tobytes())

    with FaissIndexer(index_filename='faiss.test.gz',
                      index_key='Flat',
                      train_filepath=train_filepath,
                      metas=metas) as writer:
        writer.add(keys, vectors)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    queries = np.array(
        [[1, 1, 1], [10, 10, 10], [100, 100, 100], [1000, 1000, 1000]],
        dtype=np.float32)

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, FaissIndexer)
        idx, dist = reader.query(queries, top_k=2)
        expected_initial = np.array([[4, 5], [5, 4], [6, 5], [7, 6]])
        np.testing.assert_equal(idx, expected_initial)
        assert idx.shape == dist.shape
        assert idx.shape == (4, 2)
        np.testing.assert_equal(reader.query_by_id([7, 4]), vectors[[3, 0]])

    # ---- update ----
    with BaseIndexer.load(save_abspath) as updater:
        replacement = np.array([[200, 200, 200]], dtype=np.float32)
        updater.update([4], replacement)
        updater.save()
        assert updater.size == 4

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, FaissIndexer)
        idx, dist = reader.query(queries, top_k=3)
        expected_updated = np.array(
            [[5, 6, 4], [5, 6, 4], [6, 5, 4], [7, 4, 6]])
        np.testing.assert_equal(idx, expected_updated)
        assert idx.shape == dist.shape
        assert idx.shape == (4, 3)

    # ---- delete ----
    with BaseIndexer.load(save_abspath) as deleter:
        deleter.delete([4])
        deleter.save()
        assert deleter.size == 3

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, FaissIndexer)
        idx, dist = reader.query(queries, top_k=2)
        expected_after_delete = np.array([[5, 6], [5, 6], [6, 5], [7, 6]])
        np.testing.assert_equal(idx, expected_after_delete)
        assert idx.shape == dist.shape
        assert idx.shape == (4, 2)
예제 #7
0
def check_indexers_size(chunks, nr_docs, field, tmp_path, same_content, shards,
                        post_op):
    """Assert that the cache and the sharded indexers hold the expected sizes.

    The cache indexer is loaded from ``<tmp_path>/cache-0/cache.bin``.  For
    each of the key-value and vector index filenames, the sizes of all shard
    indexers that exist on disk are summed and compared, together with the
    cache size, against the count implied by the arguments.

    :param chunks: number of chunks per document
    :param nr_docs: number of documents indexed
    :param field: the field used for caching, compared against
        ``'content_hash'``
    :param tmp_path: root workspace folder
    :param same_content: whether all documents share the same content
    :param shards: number of shards to inspect
    :param post_op: the operation run beforehand; ``'delete'`` expects empty
        indexers and ``'index2'`` expects doubled counts
    """
    # Loop-invariant, so imported once instead of once per shard.
    from jina.executors.compound import CompoundExecutor

    cache_indexer_path = os.path.join(tmp_path, 'cache-0', 'cache.bin')
    with BaseIndexer.load(cache_indexer_path) as cache:
        assert isinstance(cache, DocCache)
        cache_full_size = cache.size
        print(f'cache size {cache.size}')

    bin_suffix = '.bin'
    for indexer_fname in [KV_IDX_FILENAME, VEC_IDX_FILENAME]:
        # Remove only a literal '.bin' suffix.  `str.rstrip('.bin')` would
        # strip any trailing run of the characters '.', 'b', 'i' and 'n',
        # which can eat into the name itself (e.g. 'main.bin' -> 'ma').
        if indexer_fname.endswith(bin_suffix):
            workspace_name = indexer_fname[:-len(bin_suffix)]
        else:
            workspace_name = indexer_fname

        compound_name = ('inc_docindexer' if KV_IDX_FILENAME
                         in indexer_fname else 'inc_vecindexer')

        indexers_full_size = 0
        for i in range(shards):
            workspace_folder = (
                CompoundExecutor.
                get_component_workspace_from_compound_workspace(
                    tmp_path, compound_name, i))
            indexer_path = os.path.join(
                BaseIndexer.get_shard_workspace(
                    workspace_folder=workspace_folder,
                    workspace_name=workspace_name,
                    pea_id=i,
                ),
                f'{indexer_fname}',
            )

            # in the configuration of content-hash / same_content=True
            # there aren't enough docs to satisfy batch size, only 1 shard will have it
            if os.path.exists(indexer_path):
                with BaseIndexer.load(indexer_path) as indexer:
                    if indexer_fname == KV_IDX_FILENAME:
                        assert isinstance(indexer, BinaryPbIndexer)
                    else:
                        assert isinstance(indexer, NumpyIndexer)
                    indexers_full_size += indexer.size

        if post_op == 'delete':
            assert indexers_full_size == 0
            assert cache_full_size == 0
        elif field == 'content_hash' and same_content:
            if chunks > 0:
                # one content from Doc, one from chunk
                expected = 2
                assert indexers_full_size == expected
                assert cache_full_size == 2
            else:
                assert indexers_full_size == 1
                assert cache_full_size == 1
        else:
            docs_and_chunks = nr_docs + chunks * nr_docs
            # 'index2' means the data was indexed twice.
            nr_expected = (docs_and_chunks * 2
                           if post_op == 'index2' else docs_and_chunks)
            assert indexers_full_size == nr_expected
            assert cache_full_size == nr_expected
예제 #8
0
def test_numpy_indexer_known_and_delete(batch_size, compress_level, test_metas):
    """Run a ``CRUDNumpyIndexer`` through add, update and delete.

    After every saved mutation the indexer is reloaded and the returned
    neighbour keys for fixed queries are compared with hard-coded
    expectations.
    """
    vectors = np.array([[1, 1, 1], [10, 10, 10], [100, 100, 100]])
    keys = np.array([4, 5, 6])

    with CRUDNumpyIndexer(metric='euclidean',
                          index_filename='np.test.gz',
                          compress_level=compress_level,
                          metas=test_metas) as writer:
        writer.batch_size = batch_size
        writer.add(keys, vectors)
        writer.save()
        assert Path(writer.index_abspath).exists()
        save_abspath = writer.save_abspath

    top_k = 3
    queries = np.array([[1, 1, 1], [10, 10, 10]])
    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, CRUDNumpyIndexer)
        idx, dist = reader.query(queries, top_k=top_k)
        expected_initial = np.array([[4, 5, 6], [5, 4, 6]])
        np.testing.assert_equal(idx, expected_initial)
        assert idx.shape == dist.shape
        assert idx.shape == (len(queries), top_k)
        np.testing.assert_equal(reader.query_by_id([5, 4, 6]),
                                vectors[[1, 0, 2]])

    # ---- update, then query again ----
    key_to_update = np.array([4])
    data_to_update = np.array([[1000, 1000, 1000]])

    with BaseIndexer.load(save_abspath) as updater:
        assert isinstance(updater, CRUDNumpyIndexer)
        updater.update(key_to_update, data_to_update)
        updater.save()

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, CRUDNumpyIndexer)
        idx, dist = reader.query(queries, top_k=top_k)
        expected_updated = np.array([[5, 6, 4], [5, 6, 4]])
        np.testing.assert_equal(idx, expected_updated)
        assert idx.shape == dist.shape
        assert idx.shape == (len(queries), top_k)

    # ---- delete, then query again ----
    with BaseIndexer.load(save_abspath) as deleter:
        assert isinstance(deleter, CRUDNumpyIndexer)
        deleter.delete([4])
        deleter.save()

    top_k = 2
    queries = np.array([[100, 100, 100], [10, 10, 10]])
    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, CRUDNumpyIndexer)
        idx, dist = reader.query(queries, top_k=top_k)
        expected_after_delete = np.array([[6, 5], [5, 6]])
        np.testing.assert_equal(idx, expected_after_delete)
        assert idx.shape == dist.shape
        assert idx.shape == (len(queries), top_k)
        np.testing.assert_equal(reader.query_by_id([6, 5]), vectors[[2, 1]])
예제 #9
0
def test_annoy_indexer_known_update_delete(metas):
    """Run an ``AnnoyIndexer`` through add, update and delete.

    After every saved mutation the indexer is reloaded and the returned
    neighbour keys (strings) for four fixed queries are compared with
    hard-coded expectations.
    """
    vectors = np.array([[1, 1, 1], [10, 10, 10], [100, 100, 100],
                        [1000, 1000, 1000]])
    keys = np.array([4, 5, 6, 7]).reshape(-1, 1).astype(str)

    with AnnoyIndexer(index_filename='annoy.test.gz', metas=metas) as writer:
        writer.add(keys, vectors)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    queries = np.array([[1, 1, 1], [10, 10, 10], [100, 100, 100],
                        [1000, 1000, 1000]])

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, AnnoyIndexer)
        idx, dist = reader.query(queries, top_k=2)
        expected_initial = np.array(
            [[4, 5], [5, 4], [6, 5], [7, 6]]).astype(str)
        np.testing.assert_equal(idx, expected_initial)
        assert idx.shape == dist.shape
        assert idx.shape == (4, 2)
        np.testing.assert_equal(reader.query_by_key(['7', '4']),
                                vectors[[3, 0]])

    # ---- update ----
    with BaseIndexer.load(save_abspath) as updater:
        updater.update(['4'], np.array([[200, 200, 200]]))
        updater.save()
        assert updater.size == 4

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, AnnoyIndexer)
        idx, dist = reader.query(queries, top_k=3)
        expected_updated = np.array(
            [[5, 6, 4], [5, 6, 4], [6, 5, 4], [7, 4, 6]]).astype(str)
        np.testing.assert_equal(idx, expected_updated)
        assert idx.shape == dist.shape
        assert idx.shape == (4, 3)

    # ---- delete ----
    with BaseIndexer.load(save_abspath) as deleter:
        deleter.delete(['4'])
        deleter.save()
        assert deleter.size == 3

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, AnnoyIndexer)
        idx, dist = reader.query(queries, top_k=2)
        expected_after_delete = np.array(
            [[5, 6], [5, 6], [6, 5], [7, 6]]).astype(str)
        np.testing.assert_equal(idx, expected_after_delete)
        assert idx.shape == dist.shape
        assert idx.shape == (4, 2)
예제 #10
0
def validate_index_size(num_indexed_docs):
    """Assert every ``*.bin`` index under ``JINA_TOPK_DIR`` has the given size.

    At least one index file must exist.  Each index is loaded inside a
    ``with`` block so it is closed again after the check, including when the
    assertion fails.

    :param num_indexed_docs: the size every index is expected to report
    """
    path = Path(os.environ['JINA_TOPK_DIR'])
    index_files = list(path.glob('*.bin'))
    assert len(index_files) > 0
    for index_file in index_files:
        with BaseIndexer.load(str(index_file)) as index:
            assert index.size == num_indexed_docs
예제 #11
0
def test_scann_indexer_known(metas):
    """Index four known vectors with ``ScannIndexer`` and check the top-2 keys.

    The indexer is saved, reloaded from ``save_abspath`` and queried with the
    same four vectors; the result keys and a lookup by key are compared with
    hard-coded expectations.
    """
    known_points = [[1, 1, 1], [10, 10, 10], [100, 100, 100],
                    [1000, 1000, 1000]]
    vectors = np.array(known_points, dtype=np.float32)
    keys = np.array(['4', '5', '6', '7']).reshape(-1, 1)

    with ScannIndexer(distance_measure='squared_l2',
                      index_filename='scann.test.gz',
                      metas=metas) as writer:
        writer.add(keys, vectors)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    queries = np.array(known_points, dtype=np.float32)
    expected_idx = np.array(
        [['4', '5'], ['5', '4'], ['6', '5'], ['7', '6']])

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, ScannIndexer)
        idx, dist = reader.query(queries, top_k=2)
        np.testing.assert_equal(idx, expected_idx)
        assert idx.shape == dist.shape
        assert idx.shape == (4, 2)
        np.testing.assert_equal(reader.query_by_key(['7', '4']),
                                vectors[[3, 0]])
예제 #12
0
def test_indexer_train_from_index_different_compression_levels(
        metas, compression_level):
    """Build an ``IVF10,PQ4`` ``FaissIndexer`` whose train path is its own index file.

    Random data is indexed with the given ``compression_level``; the indexer
    is then reloaded and only the shapes of the query result are checked.
    """
    np.random.seed(500)
    num_data, num_dim, num_query = 500, 64, 10
    top_k = 4

    # Keep the draw order: queries, then keys, then vectors.
    query = np.array(np.random.random([num_query, num_dim]), dtype=np.float32)
    vec_idx = np.random.randint(0, high=num_data, size=[num_data])
    vec = np.random.random([num_data, num_dim])

    # The training path is the index file name inside the workspace.
    train_filepath = os.path.join(metas['workspace'], 'faiss.test.gz')

    with FaissIndexer(index_filename='faiss.test.gz',
                      index_key='IVF10,PQ4',
                      train_filepath=train_filepath,
                      compression_level=compression_level,
                      metas=metas) as writer:
        writer.add(vec_idx, vec)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, FaissIndexer)
        idx, dist = reader.query(query, top_k=top_k)
        assert idx.shape == dist.shape
        assert idx.shape == (num_query, top_k)
예제 #13
0
def test_exec_type(tmpdir):
    """Check how re-declaring ``BaseIndexer`` affects YAML config round trips.

    The name ``BaseIndexer`` is local to this function: it is first bound by
    the import below and then rebound twice by the ``class`` statements.
    ``assert_bi`` looks the name up when it is called, so each call uses
    whichever binding is current at that point.
    """
    from jina.executors.indexers import BaseIndexer
    assert 'BaseIndexer' in BaseExecutor._registered_class

    # init from YAML should be okay as well
    BaseExecutor.load_config('BaseIndexer')

    # Round trip of the imported BaseIndexer: dump the config, then parse it back.
    BaseIndexer().save_config(os.path.join(tmpdir, 'tmp.yml'))
    with open(os.path.join(tmpdir, 'tmp.yml')) as fp:
        _ = JAML.load(fp)

    def assert_bi():
        # Dump BaseIndexer(1) to YAML, reload it and require `a == 1` on the result.
        b = BaseIndexer(1)
        b.save_config(os.path.join(tmpdir, 'tmp.yml'))
        with open(os.path.join(tmpdir, 'tmp.yml')) as fp:
            b = JAML.load(fp)
            assert b.a == 1

    # we override BaseIndexer now, without force it shall not store all init values
    class BaseIndexer(BaseExecutor):
        def __init__(self, a=0):
            super().__init__()
            self.a = a

    # With the plain override the round trip is expected to fail the `a == 1` check.
    with pytest.raises(AssertionError):
        assert_bi()

    # Same class again, this time with `force_register = True`.
    class BaseIndexer(BaseExecutor):
        force_register = True

        def __init__(self, a=0):
            super().__init__()
            self.a = a

    # With the forced registration the round trip must pass.
    assert_bi()
예제 #14
0
def run_test(indexer):
    """Index three JSON-serialized documents, reload the indexer and query one.

    :param indexer: the indexer under test; it is used as a context manager
        and must expose ``add``, ``touch``, ``save``, ``save_abspath`` and
        ``index_abspath``
    """

    def build_doc(doc_id, text, weight, length):
        # Fill a protobuf Document field by field.
        doc = jina_pb2.Document()
        doc.id = doc_id
        doc.buffer = text.encode('utf8')
        doc.weight = weight
        doc.length = length
        return doc

    # (key, id, text, weight); every document has length 3.
    doc_specs = (
        ('d1', 1, 'cat', 0.1),
        ('d2', 2, 'dog', 0.2),
        ('d3', 3, 'bird', 0.3),
    )

    with indexer as idx:
        data = {
            key: MessageToJson(build_doc(doc_id, text, weight, 3))
            for key, doc_id, text, weight in doc_specs
        }
        idx.add(data)
        idx.touch()
        idx.save()
        save_abspath = idx.save_abspath
        index_abspath = idx.index_abspath

    for produced_path in (index_abspath, save_abspath):
        assert os.path.exists(produced_path)

    with BaseIndexer.load(save_abspath) as searcher:
        found = searcher.query('d2')
        assert found.id == 2
        assert found.length == 3

    rm_files([save_abspath, index_abspath])
예제 #15
0
def test_sptag_indexer_known(metas):
    """Index four known vectors with ``SptagIndexer`` and check the top-2 keys.

    The indexer is saved, reloaded and queried with the same four vectors.
    Besides the expected keys, each result row must have its first distance
    strictly smaller than its second.
    """
    known_points = [[1, 1, 1], [10, 10, 10], [100, 100, 100],
                    [1000, 1000, 1000]]
    vectors = np.array(known_points, dtype=np.float32)
    keys = np.array([4, 5, 6, 7]).reshape(-1, 1).astype(str)

    with SptagIndexer(dist_calc_method='L2',
                      index_filename='sptag.test.gz',
                      metas=metas) as writer:
        writer.add(keys, vectors)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    queries = np.array(known_points, dtype=np.float32)

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, SptagIndexer)
        idx, distances = reader.query(queries, top_k=2)
        expected_idx = np.array(
            [[4, 5], [5, 4], [6, 5], [7, 6]]).astype(str)
        np.testing.assert_equal(idx, expected_idx)
        for nearest, second_nearest in distances:
            assert nearest < second_nearest
        assert idx.shape == distances.shape
        assert idx.shape == (4, 2)
        np.testing.assert_equal(reader.query_by_key(['7', '4']),
                                vectors[[3, 0]])
예제 #16
0
def validate_index_size(num_indexed_docs, expected_indices):
    """Assert the number and size of ``*.bin`` indexes in the test directory.

    The directory is read from ``JINA_CORRUPTED_DOCS_TEST_DIR``.  Each index
    is loaded inside a ``with`` block so it is closed again after the check,
    including when the assertion fails.

    :param num_indexed_docs: the size every index is expected to report
    :param expected_indices: the number of ``*.bin`` files expected
    """
    path = Path(os.environ['JINA_CORRUPTED_DOCS_TEST_DIR'])
    index_files = list(path.glob('*.bin'))
    assert len(index_files) == expected_indices
    for index_file in index_files:
        with BaseIndexer.load(str(index_file)) as index:
            assert index.size == num_indexed_docs
예제 #17
0
def test_numpy_indexer_known(batch_size, compress_level, test_metas):
    """Index four known vectors with ``NumpyIndexer`` and check the top-2 ids.

    The indexer is saved, reloaded and queried with the same four vectors.
    When ``compress_level`` is 0 the reloaded ``raw_ndarray`` must be an
    ``np.memmap``.
    """
    known_points = [[1, 1, 1], [10, 10, 10], [100, 100, 100],
                    [1000, 1000, 1000]]
    vectors = np.array(known_points)
    keys = np.array([4, 5, 6, 7]).reshape(-1, 1)

    with NumpyIndexer(metric='euclidean',
                      index_filename='np.test.gz',
                      compress_level=compress_level,
                      metas=test_metas) as writer:
        writer.batch_size = batch_size
        writer.add(keys, vectors)
        writer.save()
        assert Path(writer.index_abspath).exists()
        save_abspath = writer.save_abspath

    queries = np.array(known_points)
    expected_idx = np.array([[4, 5], [5, 4], [6, 5], [7, 6]])

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, NumpyIndexer)
        if compress_level == 0:
            assert isinstance(reader.raw_ndarray, np.memmap)
        idx, dist = reader.query(queries, top_k=2)
        np.testing.assert_equal(idx, expected_idx)
        assert idx.shape == dist.shape
        assert idx.shape == (4, 2)
        np.testing.assert_equal(reader.query_by_id([7, 4]), vectors[[3, 0]])
예제 #18
0
def test_scannindexer():
    """Index the module-level vectors with ``ScannIndexer`` and query them back.

    The result indices are compared with the module-level ``retr_idx``, which
    is set by whichever query runs first.  All files this test writes are
    removed at the end, including ``train.tgz``, which was previously left
    behind in ``cur_dir``.
    """
    global retr_idx

    train_filepath = os.path.join(cur_dir, 'train.tgz')
    train_data = np.array(np.random.random([1024, 10]), dtype=np.float32)
    with gzip.open(train_filepath, 'wb', compresslevel=1) as f:
        f.write(train_data.tobytes())

    with ScannIndexer(index_filename='scann.test.gz') as indexer:
        indexer.add(vec_idx, vec)
        indexer.save()
        assert os.path.exists(indexer.index_abspath)
        index_abspath = indexer.index_abspath
        save_abspath = indexer.save_abspath

    with BaseIndexer.load(save_abspath) as indexer:
        idx, dist = indexer.query(query, top_k=4)
        print(idx, dist)
        if retr_idx is None:
            retr_idx = idx
        else:
            np.testing.assert_almost_equal(retr_idx, idx)
        assert idx.shape == dist.shape
        assert idx.shape == (10, 4)

    # Also remove the training file written above.
    rm_files([index_abspath, save_abspath, train_filepath])
예제 #19
0
def test_nmslib_indexer_known(metas):
    """Index four known vectors with ``NmsLibIndexer`` and check the top-2 keys.

    The indexer is saved, reloaded from ``save_abspath`` and queried with the
    same four vectors; the result keys and a lookup by key are compared with
    hard-coded expectations.
    """
    known_points = [[1, 1, 1], [10, 10, 10], [100, 100, 100],
                    [1000, 1000, 1000]]
    vectors = np.array(known_points, dtype=np.float32)
    keys = np.array([4, 5, 6, 7]).reshape(-1, 1).astype(str)

    with NmsLibIndexer(space='l2',
                       index_filename='nmslib.test.gz',
                       metas=metas) as writer:
        writer.add(keys, vectors)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    queries = np.array(known_points, dtype=np.float32)

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, NmsLibIndexer)
        idx, dist = reader.query(queries, top_k=2)
        expected_idx = np.array(
            [[4, 5], [5, 4], [6, 5], [7, 6]]).astype(str)
        np.testing.assert_equal(idx, expected_idx)
        assert idx.shape == dist.shape
        assert idx.shape == (4, 2)
        np.testing.assert_equal(reader.query_by_key(['7', '4']),
                                vectors[[3, 0]])
예제 #20
0
def test_numpy_indexer_known(batch_size, compress_level, test_metas):
    """Index four known vectors under string keys and check the top-2 keys.

    The indexer is saved, reloaded and queried with the same four vectors.
    When ``compress_level`` is 0 the reloaded ``query_handler`` must be an
    ``np.memmap``.
    """
    known_points = [[1, 1, 1], [10, 10, 10], [100, 100, 100],
                    [1000, 1000, 1000]]
    vectors = np.array(known_points)
    keys = np.array(['4', '5', '6', '7'], dtype=(np.str_, 16))

    with NumpyIndexer(metric='euclidean',
                      index_filename='np.test.gz',
                      compress_level=compress_level,
                      metas=test_metas) as writer:
        writer.batch_size = batch_size
        writer.add(keys, vectors)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    queries = np.array(known_points)
    expected_idx = np.array(
        [['4', '5'], ['5', '4'], ['6', '5'], ['7', '6']])

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, NumpyIndexer)
        if compress_level == 0:
            assert isinstance(reader.query_handler, np.memmap)
        idx, dist = reader.query(queries, top_k=2)
        np.testing.assert_equal(idx, expected_idx)
        assert idx.shape == dist.shape
        assert idx.shape == (4, 2)
        np.testing.assert_equal(reader.query_by_key(['7', '4']),
                                vectors[[3, 0]])
예제 #21
0
def test_scipy_indexer_known_big(compress_level, test_metas):
    """Query a 10k-vector index through the scipy backend of ``NumpyIndexer``.

    The index holds 10000 vectors of dimension 1024 with random values between
    5 and 10.  The vectors at positions 0, 1000, ..., 9000 are overwritten
    with constant vectors, which are also used as the ten queries, so each
    query is expected to retrieve its own vector.  Keys are shifted by 10000
    relative to the positions to test the proper usage of `int2ext_id` and
    `ext2int_id`.
    """
    vectors = np.random.uniform(low=5.0, high=10.0, size=(10000, 1024))

    queries = np.empty((10, 1024))
    for row, position in enumerate(range(0, 10000, 1000)):
        # A constant vector whose value equals its position in the index.
        marker = position * np.ones((1, 1024))
        queries[row] = marker
        vectors[position] = marker

    key_column = np.arange(10000, 20000).reshape(-1, 1)
    keys = np.squeeze(np.array(key_column, dtype=(np.str_, 16)))

    with NumpyIndexer(metric='euclidean',
                      index_filename='np.test.gz',
                      backend='scipy',
                      compress_level=compress_level,
                      metas=test_metas) as writer:
        writer.add(keys, vectors)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    expected_idx = np.array(
        [['10000'], ['11000'], ['12000'], ['13000'], ['14000'],
         ['15000'], ['16000'], ['17000'], ['18000'], ['19000']])

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, NumpyIndexer)
        if compress_level == 0:
            assert isinstance(reader.query_handler, np.memmap)
        idx, dist = reader.query(queries, top_k=1)
        np.testing.assert_equal(idx, expected_idx)
        assert idx.shape == dist.shape
        assert idx.shape == (10, 1)
        np.testing.assert_equal(reader.query_by_key(['10000', '15000']),
                                vectors[[0, 5000]])
예제 #22
0
def test_binarypb_update_twice(test_metas):
    """Two consecutive updates on a reloaded ``BinaryPbIndexer`` both take effect."""
    with BinaryPbIndexer(metas=test_metas) as writer:
        writer.add(['1', '2', '3'], [b'oldvalue', b'same', b'random'])
        writer.save()
        assert writer.size == 3
        save_abspath = writer.save_abspath

    # (key, new value), applied one `update` call at a time in this order.
    replacements = (('1', b'newvalue'), ('2', b'othernewvalue'))

    with BaseIndexer.load(save_abspath) as updater:
        for key, new_value in replacements:
            updater.update([key], [new_value])
        updater.save()

    with BaseIndexer.load(save_abspath) as reader:
        for key, new_value in replacements:
            assert reader.query(key) == new_value
예제 #23
0
def test_indexer_zeros(metric, dimension, test_metas):
    """Query a ``NumpyIndexer`` with an all-zero vector and inspect the distances.

    For the ``'cosine'`` metric every returned distance must be NaN; for any
    other metric none of them may be NaN.
    """
    import math

    top_k = 4
    query_vec = np.zeros([1, dimension], dtype=np.float32)
    random_ids = np.random.randint(0, high=num_data, size=[num_data])
    add_vec_idx = np.array(random_ids, dtype=(np.str_, 16))
    add_vec = np.random.random([num_data, dimension])

    with NumpyIndexer(metric=metric,
                      index_filename='np.test.gz',
                      metas=test_metas) as writer:
        writer.add(add_vec_idx, add_vec)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, NumpyIndexer)
        assert isinstance(reader.query_handler, np.memmap)
        idx, dist = reader.query(query_vec, top_k=top_k)

        assert idx.shape == dist.shape
        assert idx.shape == (1, top_k)

        nan_flags = [math.isnan(value) for value in dist[0]]
        if metric == 'cosine':
            assert all(nan_flags)
        else:
            assert not any(nan_flags)
예제 #24
0
    def test_faiss_indexer(self):
        """Train, fill and save an ``IVF10,PQ2`` ``FaissIndexer``, then query it.

        The result indices are compared with the module-level ``retr_idx``,
        which is set by whichever query runs first.  All generated files are
        passed to ``add_tmpfile`` at the end.
        """
        global retr_idx

        # Random training data, stored as gzipped raw float32 bytes.
        train_filepath = os.path.join(cur_dir, 'train.tgz')
        train_data = np.array(np.random.random([1024, 10]), dtype=np.float32)
        with gzip.open(train_filepath, 'wb', compresslevel=1) as train_file:
            train_file.write(train_data.tobytes())

        with FaissIndexer(index_filename='faiss.test.gz',
                          index_key='IVF10,PQ2',
                          train_filepath=train_filepath) as writer:
            writer.add(vec_idx, vec)
            writer.save()
            self.assertTrue(os.path.exists(writer.index_abspath))
            index_abspath = writer.index_abspath
            save_abspath = writer.save_abspath

        with BaseIndexer.load(save_abspath) as reader:
            idx, dist = reader.query(query, top_k=4)
            if retr_idx is not None:
                np.testing.assert_almost_equal(retr_idx, idx)
            else:
                retr_idx = idx
            self.assertEqual(idx.shape, dist.shape)
            self.assertEqual(idx.shape, (10, 4))

        self.add_tmpfile(index_abspath, save_abspath, train_filepath)
예제 #25
0
def test_ngt_indexer_known_big(metas):
    """Query a 10k-vector ``NGTIndexer`` with vectors known to be in the index.

    The index holds 10000 float32 vectors of dimension 1024 with random values
    between 5 and 10.  The vectors at positions 0, 1000, ..., 9000 are
    overwritten with constant vectors, which are also used as the ten queries,
    so each query is expected to retrieve its own vector.  Keys are shifted by
    10000 relative to the positions to test the proper usage of `_int2ext_id`
    and `ext2int_id`.
    """
    vectors = np.random.uniform(low=5.0, high=10.0,
                                size=(10000, 1024)).astype('float32')

    queries = np.empty((10, 1024))
    for row, position in enumerate(range(0, 10000, 1000)):
        # A constant vector whose value equals its position in the index.
        marker = position * np.ones((1, 1024))
        queries[row] = marker
        vectors[position] = marker

    keys = np.arange(10000, 20000).reshape(-1, 1).astype(str)

    with NGTIndexer(index_filename='ngt.test.gz', num_threads=4,
                    metas=metas) as writer:
        writer.add(keys, vectors)
        writer.save()
        assert os.path.exists(writer.index_abspath)
        save_abspath = writer.save_abspath

    expected_idx = np.array(
        [[10000], [11000], [12000], [13000], [14000],
         [15000], [16000], [17000], [18000], [19000]]).astype(str)

    with BaseIndexer.load(save_abspath) as reader:
        assert isinstance(reader, NGTIndexer)
        idx, dist = reader.query(queries, top_k=1)
        np.testing.assert_equal(idx, expected_idx)
        assert idx.shape == dist.shape
        assert idx.shape == (10, 1)
        np.testing.assert_equal(reader.query_by_key(['10000', '15000']),
                                vectors[[0, 5000]])
예제 #26
0
def test_kvindexer_iterate(test_metas):
    """Iterating a reloaded key-value indexer yields each stored value.

    Three entries are added, the indexer is reloaded from its save path, and
    iteration is expected to produce one single-element list per stored value,
    in the order the values were added.
    """
    doc_ids = ['1', '2', '3']
    payloads = [b'oldvalue', b'same', b'random']

    # No explicit `save()` here; presumably leaving the context persists the
    # indexer so that it can be loaded below -- confirm against the indexer.
    with BinaryPbIndexer(metas=test_metas) as idxer:
        idxer.add(doc_ids, payloads)
        save_abspath = idxer.save_abspath

    with BaseIndexer.load(save_abspath) as idxer:
        iterated = list(idxer)
        assert iterated == [[b'oldvalue'], [b'same'], [b'random']]
예제 #27
0
def check_indexers_size(chunks, nr_docs, field, tmp_path, same_content, shards,
                        post_op):
    """Assert that the cache and the KV / vector indexers hold the expected number of entries.

    :param chunks: number of chunks per document
    :param nr_docs: number of documents
    :param field: cache field in use; ``'content_hash'`` combined with
        ``same_content`` collapses the expected counts
    :param tmp_path: workspace directory containing ``cache.bin`` and the indexers
    :param same_content: whether all documents share the same content
    :param shards: number of shards; with more than one shard each indexer is
        looked up in its own ``inc_<name>-<n>`` sub-folder
    :param post_op: operation performed before the check (``'delete'`` and
        ``'index2'`` change the expected counts)
    """
    with BaseIndexer.load(tmp_path / 'cache.bin') as cache:
        assert isinstance(cache, DocIDCache)
        cache_full_size = cache.size
        print(f'cache size {cache.size}')

    # The expected count is the same for the cache and for every indexer kind.
    if post_op == 'delete':
        expected_size = 0
    elif field == 'content_hash' and same_content:
        # one content from Doc, one from chunk (when there are chunks at all)
        expected_size = 2 if chunks > 0 else 1
    else:
        expected_size = nr_docs + chunks * nr_docs
        if post_op == 'index2':
            expected_size = expected_size * 2

    for indexer_fname in [KV_IDX_FILENAME, VEC_IDX_FILENAME]:
        is_kv = indexer_fname == KV_IDX_FILENAME
        folder_stem = 'docindexer' if is_kv else 'vecindexer'

        indexers_full_size = 0
        for shard_no in range(1, shards + 1):
            if shards > 1:
                indexer_path = tmp_path / f'inc_{folder_stem}-{shard_no}' / indexer_fname
            else:
                indexer_path = tmp_path / indexer_fname

            # in the configuration of content-hash / same_content=True
            # there aren't enough docs to satisfy batch size, only 1 shard will have it
            if not os.path.exists(indexer_path):
                continue

            with BaseIndexer.load(indexer_path) as indexer:
                if is_kv:
                    assert isinstance(indexer, BinaryPbIndexer)
                else:
                    assert isinstance(indexer, NumpyIndexer)
                indexers_full_size += indexer.size

        assert indexers_full_size == expected_size
        assert cache_full_size == expected_size
예제 #28
0
def test_binarypb_delete(test_metas):
    """Deleting keys from a reloaded indexer removes them and keeps the rest.

    Three string-keyed entries are stored and saved. After reloading, keys
    ``'1'`` and ``'2'`` are deleted and the indexer is saved again; a final
    reload must return ``None`` for the deleted keys and the original value
    for ``'3'``.
    """
    with BinaryPbIndexer(metas=test_metas) as writer:
        writer.add(['1', '2', '3'], [b'oldvalue', b'same', b'random'])
        writer.save()
        assert writer.size == 3
        save_abspath = writer.save_abspath

    # Sanity check: the value is retrievable before anything is deleted.
    with BaseIndexer.load(save_abspath) as reader:
        assert reader.query('1') == b'oldvalue'

    with BaseIndexer.load(save_abspath) as remover:
        doomed_keys = iter(['1', '2'])
        remover.delete(doomed_keys)
        remover.save()
        assert remover.size == 1

    with BaseIndexer.load(save_abspath) as reader:
        for deleted_key in ('1', '2'):
            assert reader.query(deleted_key) is None
        assert reader.query('3') == b'random'
예제 #29
0
def test_binarypb_delete(test_metas):
    """Deleting integer keys from a reloaded indexer removes them and keeps the rest.

    Three integer-keyed entries are stored and saved. After reloading, keys
    ``1`` and ``2`` are deleted and the indexer is saved again; a final reload
    must return ``None`` for the deleted keys and the original value for ``3``.
    """
    with BinaryPbIndexer(metas=test_metas) as idxer:
        idxer.add([1, 2, 3], [b'oldvalue', b'same', b'random'])
        idxer.save()
        assert idxer.size == 3
        save_abspath = idxer.save_abspath

    # Sanity check: the value is retrievable before anything is deleted.
    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query(1) == b'oldvalue'

    with BaseIndexer.load(save_abspath) as idxer:
        idxer.delete(iter([1, 2]))
        idxer.save()
        assert idxer.size == 1

    with BaseIndexer.load(save_abspath) as idxer:
        # Identity comparison: a missing key must yield the `None` singleton,
        # not merely something that compares equal to it.
        assert idxer.query(1) is None
        assert idxer.query(2) is None
        assert idxer.query(3) == b'random'
예제 #30
0
def test_annoy_wrap_indexer(metas):
    """Build a numpy index, then query it through the indexer loaded from the Annoy wrap config.

    The YAML config at ``yaml/annoy-wrap.yml`` must load as an ``AnnoyIndexer``,
    and a top-4 query must return index and distance arrays of shape (10, 4).
    """
    # presumably the YAML config points at this index file/name -- confirm
    # against yaml/annoy-wrap.yml
    with NumpyIndexer(index_filename='wrap-npidx.gz', metas=metas) as indexer:
        indexer.name = 'wrap-npidx'
        indexer.add(vec_idx, vec)

    wrap_config_path = os.path.join(cur_dir, 'yaml/annoy-wrap.yml')
    with BaseIndexer.load_config(wrap_config_path) as indexer:
        assert isinstance(indexer, AnnoyIndexer)
        top_k = 4
        idx, dist = indexer.query(query, top_k=top_k)
        assert idx.shape == dist.shape
        assert idx.shape == (10, top_k)