Example #1
def test_multimodal_driver_with_shuffled_order(simple_multimodal_driver, mock_multimodal_encoder_shuffled,
                                               doc_with_multimodal_chunks):
    simple_multimodal_driver.attach(executor=mock_multimodal_encoder_shuffled, pea=None)
    simple_multimodal_driver._apply_all(DocumentSet([doc_with_multimodal_chunks]))
    doc = doc_with_multimodal_chunks
    assert len(doc.chunks) == 3
    visual1 = doc.chunks[2]
    visual2 = doc.chunks[0]
    textual = doc.chunks[1]
    control = np.concatenate([visual2.embedding, textual.embedding,
                              visual1.embedding])
    test = doc.embedding
    np.testing.assert_array_equal(control, test)
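The doc_with_multimodal_chunks fixture shared by this example and the next is not part of the listing. A minimal sketch of what it could look like, assuming a Jina 1.x-style Document API (the modality names and values below are illustrative):

# Hypothetical fixture sketch, not taken from the original tests: one parent
# Document with three chunks of modalities 'visual1', 'visual2' and 'textual',
# each carrying its own 1-D embedding.
@pytest.fixture
def doc_with_multimodal_chunks():
    doc = Document()
    for modality, vec in zip(
        ('visual1', 'visual2', 'textual'),
        (np.array([1.0, 2.0]), np.array([3.0, 4.0]), np.array([5.0, 6.0])),
    ):
        chunk = Document()
        chunk.modality = modality
        chunk.embedding = vec
        doc.chunks.append(chunk)
    return doc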
Example #2
def test_multimodal_driver(simple_multimodal_driver, mock_multimodal_encoder,
                           doc_with_multimodal_chunks):
    simple_multimodal_driver.attach(executor=mock_multimodal_encoder,
                                    runtime=None)
    simple_multimodal_driver._apply_all(
        DocumentSet([doc_with_multimodal_chunks]))
    doc = doc_with_multimodal_chunks
    assert len(doc.chunks) == 3
    visual1 = doc.chunks[0]
    visual2 = doc.chunks[1]
    textual = doc.chunks[2]
    assert doc.embedding.shape[0] == visual1.embedding.shape[0] + \
           visual2.embedding.shape[0] + textual.embedding.shape[0]
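Both multimodal tests attach a mock encoder that simply concatenates the per-modality inputs, which is why doc.embedding ends up being the concatenation of the chunk embeddings asserted above. A sketch under the assumption of Jina 1.x's BaseMultiModalEncoder and its positional_modality list:

# Illustrative mock, not the original fixture: data arrives as one batch per
# modality, in positional_modality order; concatenating along axis=1 yields
# one row per parent document.
class MockMultimodalEncoder(BaseMultiModalEncoder):
    def __init__(self, *args, **kwargs):
        super().__init__(
            *args, positional_modality=['visual1', 'visual2', 'textual'], **kwargs
        )

    def encode(self, *data, **kwargs):
        visual1, visual2, textual = data
        return np.concatenate((visual1, visual2, textual), axis=1)

mock_multimodal_encoder_shuffled in Example #1 would presumably be configured the same way, just with a different positional_modality order.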
Example #3
def test_cache_driver_from_file(tmpdir, test_metas):
    filename = 'cache'
    test_metas['name'] = filename
    folder = os.path.join(test_metas['workspace'], 'cache-0')
    os.makedirs(folder)
    bin_full_path = os.path.join(folder, filename)
    docs = DocumentSet(list(random_docs(10, embedding=False)))
    with open(f'{bin_full_path}.bin.ids', 'wb') as ids_fp:
        pickle.dump(
            {
                doc.id: BaseCacheDriver.hash_doc(doc, ['content_hash'])
                for doc in docs
            },
            ids_fp,
        )
    with open(f'{bin_full_path}.bin.cache', 'wb') as cache_fp:
        pickle.dump(
            {
                BaseCacheDriver.hash_doc(doc, ['content_hash']): doc.id
                for doc in docs
            },
            cache_fp,
        )

    driver = MockCacheDriver()
    with DocCache(metas=test_metas, fields=(CONTENT_HASH_KEY, )) as executor:
        assert not executor.handler_mutex
        driver.attach(executor=executor, runtime=None)

        with pytest.raises(NotImplementedError):
            # duplicate docs
            driver._apply_all(docs)

        # new docs
        docs = DocumentSet(list(random_docs(10, start_id=100)))
        driver._apply_all(docs)

    # check persistence
    assert os.path.exists(executor.save_abspath)
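MockCacheDriver is also assumed rather than shown. A plausible sketch, assuming it follows Jina's BaseCacheDriver hooks, is a driver whose on_hit callback raises, which is what the pytest.raises(NotImplementedError) blocks in the cache examples depend on:

# Assumed helper, for illustration: indexing a Document that is already in the
# cache triggers on_hit, which this mock turns into NotImplementedError.
class MockCacheDriver(BaseCacheDriver):
    @property
    def exec_fn(self):
        return self._exec_fn

    def on_hit(self, req_doc, hit_result) -> None:
        raise NotImplementedError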
Example #4
def test_vectorsearch_driver_mock_indexer_with_matches_on_chunks(document_with_matches_on_chunks):
    driver = SimpleKVSearchDriver(traversal_paths=('cm',))
    executor = MockIndexer()
    driver.attach(executor=executor, runtime=None)

    driver._traverse_apply(DocumentSet([document_with_matches_on_chunks]))

    dcs = list(document_with_matches_on_chunks.chunks)
    assert len(dcs) == 1
    chunk = dcs[0]
    matches = list(chunk.matches)
    assert len(matches) == 3
    for match in matches:
        assert NdArray(match.embedding).value is not None
        embedding_array = NdArray(match.embedding).value
        np.testing.assert_equal(embedding_array, np.array([match.id]))
Example #5
def test_extract_bad_fields_no_strict_args(mocker):
    encode_mock = mocker.Mock()

    class MyExecutor(BaseEncoder):
        def encode(self, hello):
            encode_mock()

    exec = MyExecutor()
    bd = EncodeDriver(strict_method_args=False)

    bd.attach(exec, runtime=None)
    docs = list(random_docs(10))

    ds = DocumentSet(docs)

    bd._apply_all(ds)
    encode_mock.assert_not_called()
Example #6
def test_extract_bad_fields(mocker):
    encode_mock = mocker.Mock()

    class MyExecutor(BaseEncoder):
        def encode(self, data):
            encode_mock()

    exec = MyExecutor()
    bd = EncodeDriver()

    bd.attach(exec, runtime=None)
    docs = list(random_docs(10))

    ds = DocumentSet(docs)

    with pytest.raises(
        AttributeError, match='is now deprecated and not a valid argument'
    ):
        bd._apply_all(ds)
    encode_mock.assert_not_called()

    class MyExecutor(BaseEncoder):
        def encode(self, hello):
            encode_mock()

    exec = MyExecutor()
    bd = EncodeDriver()
    bd.attach(exec, runtime=None)

    with pytest.raises(AttributeError, match='are invalid Document attributes'):
        bd._apply_all(ds)
    encode_mock.assert_not_called()

    class MyExecutor(BaseEncoder):
        def encode(self, mimeType):
            encode_mock()

    exec = MyExecutor()
    bd = EncodeDriver()
    bd.attach(exec, runtime=None)

    with pytest.raises(AttributeError, match='you give them in CamelCase'):
        bd._apply_all(ds)
    encode_mock.assert_not_called()
Example #7
def test_cache_driver_twice(tmp_path):
    filename = tmp_path / 'test-tmp.bin'
    docs = DocumentSet(list(random_docs(10)))
    driver = MockCacheDriver()
    with DocIDCache(filename) as executor:
        assert not executor.handler_mutex
        driver.attach(executor=executor, pea=None)
        driver._traverse_apply(docs)

        with pytest.raises(NotImplementedError):
            # duplicate docs
            driver._traverse_apply(docs)

        # new docs
        docs = list(random_docs(10))
        driver._traverse_apply(docs)

        # check persistence
        assert Path(filename).exists()
Example #8
def test_exec_fn_arbitrary_name(mocker):
    encode_mock = mocker.Mock()

    class MyExecutor(BaseEncoder):
        def foo(self, id):
            assert isinstance(id[0], str)
            assert isinstance(id, list)
            encode_mock()

    exec = MyExecutor()
    bd = EncodeDriver(method='foo')

    bd.attach(exec, runtime=None)
    docs = list(random_docs(10))

    ds = DocumentSet(docs)

    bd._apply_all(ds)
    encode_mock.assert_called()
Example #9
def test_cache_driver_twice(tmpdir, test_metas):
    docs = DocumentSet(list(random_docs(10)))
    driver = MockCacheDriver()
    # FIXME: DocIDCache doesn't use tmpdir; it saves in curdir
    with DocIDCache(tmpdir, metas=test_metas) as executor:
        assert not executor.handler_mutex
        driver.attach(executor=executor, runtime=None)
        driver._traverse_apply(docs)

        with pytest.raises(NotImplementedError):
            # duplicate docs
            driver._traverse_apply(docs)

        # new docs
        docs = list(random_docs(10, start_id=100))
        driver._traverse_apply(docs)
        filename = executor.save_abspath

    # check persistence
    assert os.path.exists(filename)
Example #10
def test_extract_multi_fields(mocker):
    encode_mock = mocker.Mock()

    class MyExecutor(BaseEncoder):
        def encode(self, id, embedding):
            encode_mock()
            assert isinstance(id, list)
            assert isinstance(embedding, list)
            assert isinstance(id[0], str)
            assert isinstance(embedding[0], np.ndarray)

    exec = MyExecutor()
    bd = EncodeDriver()

    bd.attach(exec, runtime=None)
    docs = list(random_docs(10))

    ds = DocumentSet(docs)

    bd._apply_all(ds)
    encode_mock.assert_called()
Example #11
def test_exec_fn_return_doc(mocker):
    encode_mock = mocker.Mock()

    class MyExecutor(BaseEncoder):
        def encode(self, id):
            encode_mock()
            return [Document(mime_type='image/png')] * len(id)

    exec = MyExecutor()
    bd = EncodeDriver()

    bd.attach(exec, runtime=None)
    docs = list(random_docs(10))

    ds = DocumentSet(docs)

    bd._apply_all(ds)
    encode_mock.assert_called()

    for d in ds:
        assert d.mime_type == 'image/png'
Example #12
def test_vectorsearch_driver_mock_indexer_apply_all(document):
    driver = SimpleKVSearchDriver()

    executor = MockIndexer()
    driver.attach(executor=executor, runtime=None)

    dcs = list(document.chunks)
    assert len(dcs) == 5
    for chunk in dcs:
        assert chunk.embedding is None

    driver._apply_all([DocumentSet(document.chunks)])

    dcs = list(document.chunks)

    # the chunk with idx 5 had no match and is removed as a missing idx
    assert len(dcs) == 4
    for chunk in dcs:
        assert chunk.embedding is not None
        embedding_array = chunk.embedding
        np.testing.assert_equal(embedding_array, np.array([chunk.id]))
Example #13
def random_docs_with_chunks(num_docs):
    docs = []
    for j in range(num_docs):
        d = jina_pb2.DocumentProto()
        d.granularity = 0
        d.tags['id'] = j
        d.text = 'hello world'
        d.uri = 'doc://'
        for c in range(10):
            dc = d.chunks.add()
            dc.text = 'chunk to hello world'
            dc.granularity = 1
            dc.uri = 'doc://chunk'
            dc.tags['id'] = c
            for cc in range(10):
                dcc = dc.chunks.add()
                dcc.text = 'nested chunk to chunk'
                dcc.uri = 'doc://chunk/chunk'
                dcc.tags['id'] = cc
                dcc.granularity = 2
        docs.append(d)
    return DocumentSet(docs)
Example #14
def build_docs():
    """ Builds up a complete chunk-match structure, with a depth of 2 in both directions recursively. """
    max_granularity = 2
    max_adjacency = 2

    def iterate_build(document, current_granularity, current_adjacency):
        if current_granularity < max_granularity:
            for i in range(DOCUMENTS_PER_LEVEL):
                chunk = add_chunk(document)
                iterate_build(chunk, chunk.granularity, chunk.adjacency)
        if current_adjacency < max_adjacency:
            for i in range(DOCUMENTS_PER_LEVEL):
                match = add_match(document)
                iterate_build(match, match.granularity, match.adjacency)

    docs = []
    for base_id in range(DOCUMENTS_PER_LEVEL):
        document = Document()
        document.granularity = 0
        document.adjacency = 0
        docs.append(document)
        iterate_build(document, 0, 0)
    return DocumentSet(docs)
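The add_chunk and add_match helpers referenced above are not shown in the listing. One plausible shape, assuming a Jina 1.x DocumentSet.append that returns the appended Document:

# Assumed helpers, for illustration: each call attaches a new Document one
# level deeper in granularity (chunk) or adjacency (match).
def add_chunk(doc):
    chunk = Document()
    chunk.granularity = doc.granularity + 1
    chunk.adjacency = doc.adjacency
    return doc.chunks.append(chunk)


def add_match(doc):
    match = Document()
    match.granularity = doc.granularity
    match.adjacency = doc.adjacency + 1
    return doc.matches.append(match)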
Example #15
def docs_to_encode(num_docs):
    docs = []
    for idx in range(1, num_docs + 1):
        doc = Document(content=np.array([idx]))
        docs.append(doc)
    return DocumentSet(docs)
Example #16
    def docs(self):
        return DocumentSet(list(random_docs(10)))
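random_docs, used by this fixture and by most of the tests above, is likewise assumed. A minimal sketch consistent with the call sites in this listing (num_docs plus optional start_id and embedding arguments):

# Hypothetical helper, for illustration only: yields Documents with
# deterministic ids, some text and, unless disabled, a random embedding.
def random_docs(num_docs, start_id=0, embedding=True):
    for idx in range(start_id, start_id + num_docs):
        doc = Document()
        doc.id = str(idx)
        doc.text = f'hello world {idx}'
        if embedding:
            doc.embedding = np.random.random(7)
        yield doc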
Example #17
def test_invalid_document(craft_driver):
    invalid_document = Document(content='invalid')
    docs = DocumentSet([invalid_document])
    with pytest.raises(AttributeError) as error:
        craft_driver._apply_all(docs)
    assert str(error.value) == "'non_existing_key' is not recognized"