Example #1
def test_dbms_keyvalue(tmpdir, test_metas):
    docs = list(get_documents(chunks=False, nr=10, same_content=True))
    ids, vecs, meta = zip(*[(doc.id, doc.embedding,
                             _doc_without_embedding(doc).SerializeToString())
                            for doc in docs])
    save_path = None
    with KeyValueDBMSIndexer(index_filename='dbms',
                             metas=test_metas) as indexer:
        indexer.add(ids, vecs, meta)
        assert indexer.size == len(docs)
        save_path = indexer.save_abspath

    new_docs = list(get_documents(chunks=False, nr=10, same_content=False))
    ids, vecs, meta = zip(*[(doc.id, doc.embedding,
                             _doc_without_embedding(doc).SerializeToString())
                            for doc in new_docs])

    # update replaces the stored contents; the size stays the same
    with BaseDBMSIndexer.load(save_path) as indexer:
        indexer.update(ids, vecs, meta)
        assert indexer.size == len(docs)

    # delete removes every indexed doc
    with BaseDBMSIndexer.load(save_path) as indexer:
        indexer.delete([d.id for d in docs])
        assert indexer.size == 0
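
The _doc_without_embedding helper is not shown in this snippet; Example #3 below calls the same thing as DBMSIndexDriver._doc_without_embedding. A minimal sketch of what it is assumed to do, given how the metadata is serialized above (copy the document, strip the embedding, keep everything else):

from jina import Document


def _doc_without_embedding(doc):
    # work on a copy so the original Document keeps its embedding
    new_doc = Document(doc, copy=True)
    # drop only the embedding; the remaining fields are what gets serialized
    # and stored as the key-value payload next to the vector
    new_doc.ClearField('embedding')
    return new_doc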
Example #2
def test_cache_driver_multiple_fields(test_metas):
    docs1 = list(get_documents(0, same_content=True, same_tag_content=False, index_start=0))
    docs2 = list(get_documents(0, same_content=True, same_tag_content=False, index_start=0))
    filename = 'cache'
    test_metas['name'] = filename
    driver = MockBaseCacheDriver()

    with DocCache(filename, metas=test_metas, fields=(CONTENT_HASH_KEY, 'tags__tag_field')) as executor:
        driver.attach(executor=executor, runtime=None)
        driver._apply_all(docs1)
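        # docs2 is generated with the same arguments as docs1, so it should
        # repeat the same ids, content and tag values; the first cache hit
        # below makes the mock driver raise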
        with pytest.raises(NotImplementedError):
            driver._apply_all(docs2)
        assert executor.size == len(docs1)

    with BaseExecutor.load(executor.save_abspath) as executor:
        driver.attach(executor=executor, runtime=None)
        with pytest.raises(NotImplementedError):
            driver._apply_all(docs1)
        assert executor.size == len(docs1)

    # switching the order of the cached fields doesn't matter
    with DocCache(metas=test_metas, fields=('tags__tag_field', CONTENT_HASH_KEY,)) as executor:
        driver.attach(executor=executor, runtime=None)
        with pytest.raises(NotImplementedError):
            driver._apply_all(docs1)
        with pytest.raises(AssertionError):
            # TODO(cristian): size should be loaded if there is an existing cache?
            assert executor.size == len(docs1)
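
MockBaseCacheDriver is likewise defined elsewhere in the test module. Judging from the pytest.raises assertions, it is assumed to be a thin BaseCacheDriver subclass that turns a cache hit into an error (import path assumed):

from jina.drivers.cache import BaseCacheDriver


class MockBaseCacheDriver(BaseCacheDriver):
    def on_hit(self, req_doc, hit_result) -> None:
        # a real driver would skip or merge the duplicate; the mock raises
        # instead, so the tests can assert that the duplicate was detected
        raise NotImplementedError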
Example #3
def test_dbms_keyvalue(tmpdir, test_metas):
    docs = list(get_documents(chunks=False, nr=10, same_content=True))
    ids, vecs, metas = _get_ids_vecs_meta(docs)

    save_path = None
    with KeyValueDBMSIndexer(index_filename='dbms',
                             metas=test_metas) as indexer:
        indexer.add(ids, vecs, metas)
        assert indexer.size == len(docs)
        save_path = indexer.save_abspath
        indexer.dump(os.path.join(tmpdir, 'dump1'), 2)

        # we can index and dump again in the same context
        docs2 = list(
            get_documents(chunks=False,
                          nr=10,
                          same_content=True,
                          index_start=len(docs)))
        ids, vecs, metas = _get_ids_vecs_meta(docs2)
        indexer.add(ids, vecs, metas)
        assert indexer.size == 2 * len(docs)
        indexer.dump(os.path.join(tmpdir, 'dump2'), 3)

    new_docs = list(get_documents(chunks=False, nr=10, same_content=False))
    ids, vecs, meta = zip(*[(
        doc.id,
        doc.embedding,
        DBMSIndexDriver._doc_without_embedding(doc).SerializeToString(),
    ) for doc in new_docs])

    # update replaces the stored contents; the size stays the same
    with BaseDBMSIndexer.load(save_path) as indexer:
        indexer.update(ids, vecs, meta)
        assert indexer.size == 2 * len(docs)

    # delete the first batch of docs; only the second batch remains
    with BaseDBMSIndexer.load(save_path) as indexer:
        indexer.delete([d.id for d in docs])
        assert indexer.size == len(docs)
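
_get_ids_vecs_meta replaces the inline zip from Example #1. A sketch of the assumed equivalent, reusing the _doc_without_embedding helper sketched after Example #1 (the original calls it as DBMSIndexDriver._doc_without_embedding):

def _get_ids_vecs_meta(docs):
    # same three parallel tuples as in Example #1: ids, embeddings, and the
    # serialized document minus its embedding
    return zip(*[(doc.id,
                  doc.embedding,
                  _doc_without_embedding(doc).SerializeToString())
                 for doc in docs])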
Example #4
def test_cache_crud(tmp_path, mocker, indexers, field, shards, chunks,
                    same_content):
    flow_index = get_index_flow(field=field,
                                tmp_path=tmp_path,
                                shards=shards,
                                indexers=indexers)
    flow_query = get_query_flow(field=field, tmp_path=tmp_path, shards=shards)
    flow_delete = get_delete_flow(field=field,
                                  tmp_path=tmp_path,
                                  shards=shards,
                                  indexers=indexers)

    def validate_result_factory(num_matches):
        def validate_results(resp):
            assert len(resp.docs) == DOCS_TO_SEARCH
            for d in resp.docs:
                matches = list(d.matches)
                # depending on the cache settings fewer docs get stored, so the
                # number of matches can be lower than TOP_K; only the cases
                # with a deterministic count are asserted exactly
                if num_matches != 0:
                    if field == 'content_hash' and same_content:
                        if chunks:
                            assert len(matches) == 2
                        else:
                            assert len(matches) == 1
                else:
                    assert len(matches) == num_matches

        return validate_results

    docs = list(
        get_documents(chunks=chunks,
                      same_content=same_content,
                      nr=DOCS_TO_INDEX))
    # search ids start at 9999, well past anything indexed, so there are no
    # matches in the KV indexer
    search_docs = list(
        get_documents(chunks=0,
                      same_content=False,
                      nr=DOCS_TO_SEARCH,
                      index_start=9999))

    # INDEX
    with flow_index as f:
        f.index(docs, request_size=REQUEST_SIZE)
    check_indexers_size(chunks, len(docs), field, tmp_path, same_content,
                        shards, 'index')

    # INDEX (with new documents)
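    # start the new batch's ids past every doc id and chunk id handed out in
    # the first round, so the second indexing pass cannot collide with it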
    chunks_ids = np.concatenate([d.chunks for d in docs])
    index_start_new_docs = 1 + len(docs) + len(chunks_ids)

    new_docs = list(
        get_documents(chunks=chunks,
                      same_content=same_content,
                      index_start=index_start_new_docs))
    with flow_index as f:
        f.index(new_docs, request_size=REQUEST_SIZE)

    check_indexers_size(chunks, len(docs), field, tmp_path, same_content,
                        shards, 'index2')

    # QUERY
    mock = mocker.Mock()
    with flow_query as f:
        f.search(search_docs, on_done=mock)
    mock.assert_called_once()
    validate_callback(mock, validate_result_factory(TOP_K))

    # UPDATE
    docs.extend(new_docs)
    del new_docs

    # ids stay the same; only the content (and hence the content hash) changes
    for d in docs:
        d_content_hash_before = d.content_hash
        d.content = f'this is some new content for doc {d.id}'
        d.update_content_hash()
        assert d.content_hash != d_content_hash_before
        for chunk in d.chunks:
            c_content_hash_before = chunk.content_hash
            chunk.content = f'this is some new content for chunk {chunk.id}'
            chunk.update_content_hash()
            assert chunk.content_hash != c_content_hash_before

    with flow_index as f:
        f.update(docs)

    check_indexers_size(chunks,
                        len(docs) / 2, field, tmp_path, same_content, shards,
                        'index2')

    # QUERY
    mock = mocker.Mock()
    with flow_query as f:
        f.search(search_docs, on_done=mock)
    mock.assert_called_once()
    validate_callback(mock, validate_result_factory(TOP_K))

    # DELETE
    delete_ids = []
    for d in docs:
        delete_ids.append(d.id)
        for c in d.chunks:
            delete_ids.append(c.id)
    with flow_delete as f:
        f.delete(delete_ids)

    check_indexers_size(chunks, 0, field, tmp_path, same_content, shards,
                        'delete')

    # QUERY
    mock = mocker.Mock()
    with flow_query as f:
        f.search(search_docs, on_done=mock)
    mock.assert_called_once()
    validate_callback(mock, validate_result_factory(0))
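
validate_callback and the DOCS_TO_INDEX / DOCS_TO_SEARCH / TOP_K constants come from the surrounding test module. A minimal sketch of the callback check, assuming it simply replays every response captured by the on_done mock through the validator:

def validate_callback(mock, validate_func):
    # the mock was registered as on_done, so each recorded call carries one
    # response; fail if nothing arrived, otherwise validate every response
    assert mock.call_count > 0
    for args, kwargs in mock.call_args_list:
        validate_func(*args, **kwargs)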