Example No. 1
    def _iter_doc(self, content) -> Generator['Document', None, None]:
        import mimetypes  # used below to guess the content type of string inputs

        from docarray import Document

        self._return_plain = True

        for c in content:
            if isinstance(c, str):
                self._return_plain = True
                _mime = mimetypes.guess_type(c)[0]
                if _mime and _mime.startswith('image'):
                    yield Document(uri=c).load_uri_to_blob()
                else:
                    yield Document(text=c)
            elif isinstance(c, Document):
                if c.content_type in ('text', 'blob'):
                    self._return_plain = False
                    yield c
                elif not c.blob and c.uri:
                    c.load_uri_to_blob()
                    self._return_plain = False
                    yield c
                else:
                    raise TypeError(
                        f'unsupported input type {c!r} {c.content_type}')
            else:
                raise TypeError(f'unsupported input type {c!r}')
Example No. 2
def test_conditions_filtering(tmpdir, flow):
    with flow:
        ret = flow.post(
            on='index',
            inputs=DocumentArray([
                Document(text='type1', tags={'type': 1}),
                Document(text='type2', tags={'type': 2}),
            ]),
        )
        assert len(ret) == 2
        types_set = set()
        for doc in ret:
            if doc.tags['type'] == 1:
                assert doc.text == 'type1 processed by exec1'
            else:
                assert doc.tags['type'] == 2
                assert doc.text == 'type2 processed by exec2'
            types_set.add(doc.tags['type'])

        assert types_set == {1, 2}

    with open(os.path.join(str(tmpdir), 'exec1', '0', 'exec1.txt'),
              'r') as fp:
        assert fp.read() == 'type1'

    with open(os.path.join(str(tmpdir), 'exec2', '0', 'exec2.txt'),
              'r') as fp:
        assert fp.read() == 'type2'
Example No. 3
def random_docs(num_docs):
    for j in range(1, num_docs + 1):
        doc = Document()
        doc.text = f'i\'m dummy doc {j}'
        doc.offset = 1000
        doc.tags['id'] = 1000  # this will be ignored
        yield doc
Example No. 4
File: helper.py  Project: srbhr/jina
def _new_doc_from_data(data, data_type: DataInputType,
                       **kwargs) -> Tuple['Document', 'DataInputType']:
    def _build_doc_from_content():
        return Document(content=data, **kwargs), DataInputType.CONTENT

    if data_type == DataInputType.DICT:
        doc = Document.from_dict(data)
        return doc, DataInputType.DICT
    if data_type == DataInputType.AUTO or data_type == DataInputType.DOCUMENT:
        if isinstance(data, Document):
            # if incoming is already primitive type Document, then all good, best practice!
            return data, DataInputType.DOCUMENT
        elif isinstance(data, dict):
            return Document.from_dict(data), DataInputType.DICT
        try:
            d = Document(data, **kwargs)
            return d, DataInputType.DOCUMENT
        except ValueError:
            # AUTO has a fallback, now reconsider it as content
            if data_type == DataInputType.AUTO:
                return _build_doc_from_content()
            else:
                raise
    elif data_type == DataInputType.CONTENT:
        return _build_doc_from_content()
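
A minimal usage sketch of the helper above, with `_new_doc_from_data` in scope; the import paths for `Document` and `DataInputType` are an assumption, and only the two branches that are unambiguous from the code are exercised:

from jina import Document
from jina.enums import DataInputType

# an existing Document is passed through untouched (the "best practice" branch)
d, t = _new_doc_from_data(Document(text='hello'), DataInputType.AUTO)
assert t == DataInputType.DOCUMENT
assert d.text == 'hello'

# a dict goes through Document.from_dict and is reported as DICT input
d, t = _new_doc_from_data({'text': 'hello'}, DataInputType.AUTO)
assert t == DataInputType.DICT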
Example No. 5
    def segment(self, docs: DocumentArray, **kwargs):
        for doc in docs:
            text = doc.tags['caption']
            uri = f'{os.environ["HW_WORKDIR"]}/people-img/{doc.tags["image"]}'
            chunk_text = Document(text=text, mime_type='text/plain')
            chunk_uri = Document(uri=uri, mime_type='image/jpeg')
            doc.chunks = DocumentArray([chunk_text, chunk_uri])
            doc.uri = uri
            doc.convert_uri_to_datauri()
Example No. 6
def test_data_type_builder_doc(builder, input_data_type, output_data_type):
    a = Document()
    a.id = 'a236cbb0eda62d58'
    a.text = 'text test'
    d, t = _new_doc_from_data(builder(a), input_data_type)
    if input_data_type != DataInputType.CONTENT:
        assert d.id == a.id
    assert d.text == a.text
    assert t == output_data_type
Example No. 7
def test_set_workspace(tmpdir):
    complete_workspace = os.path.abspath(
        os.path.join(tmpdir, 'WorkspaceExec', '0'))
    with Flow().add(uses=WorkspaceExec, workspace=str(tmpdir)) as f:
        resp = f.post(on='/foo', inputs=Document())
    assert resp[0].text == complete_workspace
    with Flow().add(uses=WorkspaceExec, uses_metas={'workspace': str(tmpdir)}) as f:
        resp = f.post(on='/foo', inputs=Document())
    assert resp[0].text == complete_workspace
Example No. 8
def test_flow_default_polling_endpoints(polling):
    f = Flow().add(uses=DynamicPollingExecutorDefaultNames,
                   shards=2,
                   polling=polling)

    with f:
        docs_index = f.post(on='/index', inputs=[Document(text='1')])
        docs_search = f.post(on='/search', inputs=[Document(text='1')])
        docs_custom = f.post(on='/custom', inputs=[Document(text='1')])
    assert len(docs_index) == 2
    assert len(docs_search) == 3
    assert len(docs_custom) == (3 if polling == 'all' else 2)
Example No. 9
def test_reducer_executor(n_shards, n_matches, n_chunks):
    reducer_executor = ReducerExecutor()
    query = DocumentArray([Document() for _ in range(5)])
    docs_matrix = [deepcopy(query) for _ in range(n_shards)]
    for da in docs_matrix:
        for doc in da:
            doc.matches.extend([Document() for _ in range(n_matches)])
            doc.chunks.extend([Document() for _ in range(n_chunks)])

    reduced_da = reducer_executor.reduce(docs_matrix=docs_matrix)
    for doc in reduced_da:
        assert len(doc.matches) == n_shards * n_matches
        assert len(doc.chunks) == n_shards * n_chunks
Example No. 10
def documents(start_index, end_index):
    for i in range(start_index, end_index):
        doc = Document()
        doc.text = 'this is text'
        doc.tags['id'] = 'id in tags'
        doc.tags['inner_dict'] = {'id': 'id in inner_dict'}
        chunk = Document()
        chunk.text = 'text in chunk'
        chunk.tags['id'] = 'id in chunk tags'
        doc.chunks.append(chunk)
        yield doc
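
A quick, illustrative way to consume this generator; everything asserted here follows directly from the fields set above:

docs = list(documents(0, 3))
assert len(docs) == 3
assert docs[0].text == 'this is text'
assert docs[0].chunks[0].text == 'text in chunk'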
Example No. 11
def test_flow_default_custom_polling_endpoints(polling):
    custom_polling_config = {'/custom': 'ALL', '/search': 'ANY', '*': polling}
    f = Flow().add(
        uses=DynamicPollingExecutorDefaultNames,
        shards=2,
        polling=custom_polling_config,
    )

    with f:
        docs_index = f.post(on='/index', inputs=[Document(text='1')])
        docs_search = f.post(on='/search', inputs=[Document(text='1')])
        docs_custom = f.post(on='/custom', inputs=[Document(text='1')])
    assert len(docs_index) == 2
    assert len(docs_search) == 2
    assert len(docs_custom) == 3
Example No. 12
def get_all_docs(client,
                 doc_class='Document',
                 attribute_container='serialized_doc'):

    s_docs = client.query.get(doc_class, [attribute_container]).do()
    sdocs = [s[attribute_container] for s in s_docs['data']['Get'][doc_class]]
    return [Document.from_base64(sdoc) for sdoc in sdocs]
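
This helper and the one in Example No. 22 assume each Document was persisted base64-serialized under a `serialized_doc` property. A hedged sketch of the corresponding write path, assuming the weaviate-client v3 `data_object.create` API and docarray's `Document.to_base64`; the helper name `store_doc` is hypothetical:

def store_doc(client, doc, doc_class='Document',
              attribute_container='serialized_doc'):
    # persist the whole Document as a base64 string so it can later be
    # rebuilt with Document.from_base64; returns the Weaviate object UUID
    return client.data_object.create(
        data_object={attribute_container: doc.to_base64()},
        class_name=doc_class,
    )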
Example No. 13
def test_volumes_in_flow(tmpdir, source, destination, workspace,
                         filewriter_exec_docker_image_built):
    with mock.patch.dict(
            os.environ,
        {'JINA_DEFAULT_WORKSPACE_BASE': str(os.path.join(tmpdir, 'default'))},
    ):
        if source:  # test manually set volume and workspace
            source = os.path.join(tmpdir, source)
            volumes = [str(source) + ':' + destination]
        else:  # test auto volume and workspace
            volumes = None
            source = os.path.join(tmpdir, 'default')

        f = Flow().add(uses='docker://filewriter-exec',
                       volumes=volumes,
                       workspace=workspace)
        with f:
            f.post(inputs=[Document()], on='/foo')

        assert os.path.exists(source)

        found_output_file = False  # workspace has random element, so we search for it
        for cur_path, dirs, files in os.walk(source):
            if 'out.txt' in files:
                with open(os.path.join(cur_path, 'out.txt'), 'r') as out_file:
                    if out_file.read() == 'Filewriter was here':
                        found_output_file = True
        assert found_output_file
Example No. 14
def index_generator(num_docs: int, target: dict):
    """
    Generate the index data.

    :param num_docs: Number of documents to be indexed.
    :param target: Dictionary which stores the data paths
    :yields: index data
    """
    for internal_doc_id in range(num_docs):
        # x_blackwhite.shape is (28,28)
        x_blackwhite = 255 - target['index']['data'][internal_doc_id]
        # x_color.shape is (28,28,3)
        x_color = np.stack((x_blackwhite, ) * 3, axis=-1)
        d = Document(content=x_color)
        d.tags['id'] = internal_doc_id
        yield d
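
A small sketch of driving the generator with a fake in-memory dataset; the real `target` dict is loaded from the MNIST files elsewhere in that project, but the layout assumed here ({'index': {'data': ndarray}}) matches what the function indexes into:

import numpy as np

target = {'index': {'data': np.random.randint(0, 256, size=(4, 28, 28), dtype=np.uint8)}}
docs = list(index_generator(num_docs=4, target=target))
assert len(docs) == 4
assert docs[0].content.shape == (28, 28, 3)  # colour image stacked from the 28x28 sample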
Example No. 15
def test_complex_flow(disable_reduce):
    f = (
        Flow()
        .add(name='first', uses=SimpleAddExecutor, needs=['gateway'])
        .add(name='forth', uses=SimpleAddExecutor, needs=['first'], shards=2)
        .add(
            name='second_shards_needs',
            uses=SimpleAddExecutor,
            needs=['gateway'],
            shards=2,
        )
        .add(
            name='third',
            uses=SimpleAddExecutor,
            shards=3,
            needs=['second_shards_needs'],
        )
        .add(
            name='merger',
            uses=MergeDocsExecutor,
            needs=['forth', 'third'],
            disable_reduce=disable_reduce,
        )
    )

    with f:
        docs = f.post(on='/index', inputs=[Document(text='1')])
    assert len(docs) == (6 if disable_reduce else 5)
Example No. 16
def test_executor_load_from_hub():
    executor = Executor.from_hub('jinahub://DummyHubExecutor',
                                 uses_metas={'name': 'hello123'})
    da = DocumentArray([Document()])
    executor.foo(da)
    assert da.texts == ['hello']
    assert executor.metas.name == 'hello123'
Example No. 17
def test_shards():
    f = Flow().add(uses=SimpleAddExecutor, shards=2)

    with f:
        docs = f.post(on='/index',
                      inputs=[Document(text='1')],
                      return_results=True)
        assert len(docs) == 2
Example No. 18
def test_grpc_compression(compression_client, compression_gateway):
    with Flow(grpc_compression=compression_gateway).add().add() as f:
        ret = f.post(
            on='/',
            inputs=DocumentArray([Document()]),
            grpc_compression=compression_client,
        )
    assert len(ret) == 1
Example No. 19
def test_blob_transmission(decode, protocol):
    f = Flow(protocol=protocol).add(uses=MyExec)
    with f:
        c = Client(port=f.port, protocol=protocol)
        d = c.post('/', Document(blob=b'hello'), parameters={'decode': decode})[0]
    if decode:  # test that the Executor gets the correct data
        assert d.text == 'hello'
    else:  # test that the response contains the correct data
        assert d.blob == b'hello'
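
The `MyExec` used above is not shown in this snippet; a hypothetical sketch of what it could look like, assuming it simply decodes the blob into text when the `decode` parameter is set:

from jina import Executor, requests


class MyExec(Executor):
    """Hypothetical stand-in for the Executor assumed by the test above."""

    @requests
    def foo(self, docs, parameters, **kwargs):
        if parameters.get('decode'):
            for doc in docs:
                # expose the raw bytes as text so the client can read `d.text`
                doc.text = doc.blob.decode()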
Example No. 20
def test_status():
    r = DataRequest()
    r.docs.extend([Document()])
    r.add_exception(ValueError('intentional_error'))
    byte_array = DataRequestProto.SerializeToString(r)

    deserialized_request = DataRequestProto.FromString(byte_array)
    assert not deserialized_request.is_decompressed
    assert deserialized_request.status.code == jina_pb2.StatusProto.ERROR
    assert deserialized_request.is_decompressed
Example No. 21
def test_conditions_filtering_on_joiner(tmpdir):
    flow = (
        Flow()
        .add(name='first')
        .add(
            uses=ConditionDumpExecutor,
            uses_metas={'name': 'joiner_test_exec1'},
            workspace=str(tmpdir),
            name='joiner_test_exec1',
            needs=['first'],
        )
        .add(
            uses=ConditionDumpExecutor,
            workspace=str(tmpdir),
            uses_metas={'name': 'joiner_test_exec2'},
            name='joiner_test_exec2',
            needs='first',
        )
        .needs_all('joiner', when={'tags__type': {'$eq': 3}})
    )
    with flow:
        ret = flow.post(
            on='index',
            inputs=DocumentArray([
                Document(text='type1', tags={'type': 1}),
                Document(text='type2', tags={'type': 2}),
            ]),
        )
        assert len(ret) == 0

    with open(
            os.path.join(str(tmpdir), 'joiner_test_exec1', '0',
                         'joiner_test_exec1.txt'),
            'r',
    ) as fp:
        assert fp.read() == 'type1type2'

    with open(
            os.path.join(str(tmpdir), 'joiner_test_exec2', '0',
                         'joiner_test_exec2.txt'),
            'r',
    ) as fp:
        assert fp.read() == 'type1type2'
Example No. 22
def get_doc_by_id(client, doc_id):
    result_dict = client.data_object.get_by_id(doc_id)
    if result_dict is None:
        return None
    return Document.from_base64(result_dict['properties']['serialized_doc'])
Example No. 23
def test_grpc_ssl_with_flow(cert_pem, key_pem, error_log_level):
    with Flow(
            protocol='grpc',
            ssl_certfile=cert_pem,
            ssl_keyfile=key_pem,
    ) as f:

        with pytest.raises(grpc.aio._call.AioRpcError):
            Client(protocol='grpc', port=f.port, tls=True).index([Document()])
    # the OpenSSL error from above seems to take a moment to fully terminate and may cause the next test to segfault
    time.sleep(1.0)
Example No. 24
def test_uvicorn_ssl_with_flow(cert_pem, key_pem, protocol, capsys, error_log_level):
    with Flow(
        protocol=protocol,
        uvicorn_kwargs=[
            'ssl_keyfile_password: abcd',
        ],
        ssl_certfile=cert_pem,
        ssl_keyfile=key_pem,
    ) as f:

        with pytest.raises(aiohttp.ClientConnectorCertificateError):
            Client(protocol=protocol, port=f.port, tls=True).index([Document()])
Example No. 25
def test_expected_messages_routing(disable_reduce):
    f = (
        Flow()
        .add(name='foo', uses=SimplExecutor)
        .add(
            name='bar',
            uses=MergeExecutor,
            needs=['foo', 'gateway'],
            disable_reduce=disable_reduce,
        )
    )

    with f:
        docs = f.post(on='/index', inputs=[Document(text='1')])
        # the merge Executor does not actually merge, despite its name
        assert len(docs) == (2 if disable_reduce else 1)
        assert docs[0].text == ('merged' if disable_reduce else '1')
Example No. 26
async def test_async_data_request_handler_new_docs(logger):
    args = set_pod_parser().parse_args(['--uses', 'AsyncNewDocsExecutor'])
    handler = DataRequestHandler(args, logger)
    req = list(
        request_generator(
            '/',
            DocumentArray([Document(text='input document')
                           for _ in range(10)])))[0]
    assert len(req.docs) == 10
    response = await handler.handle(requests=[req])

    assert len(response.docs) == 1
    assert response.docs[0].text == 'new document'
Example No. 27
def test_expected_messages_routing():
    f = (
        Flow()
        .add(name='foo', uses=SimplExecutor)
        .add(name='bar', uses=MergeExecutor, needs=['foo', 'gateway'])
    )

    with f:
        docs = f.post(on='/index',
                      inputs=[Document(text='1')],
                      return_results=True)
        # the merge Executor does not actually merge, despite its name
        assert len(docs) == 2
        assert docs[0].text == 'merged'
Example No. 28
def test_chained_conditions(tmpdir, temp_workspace):
    f = (
        Flow()
        .add(name='first')
        .add(
            uses=ConditionDumpExecutor,
            uses_metas={'name': 'exec1'},
            workspace=os.environ['TEMP_WORKSPACE'],
            name='exec1',
            needs=['first'],
            when={'tags__type': {'$gte': 2}},
        )
        .add(
            uses=ConditionDumpExecutor,
            workspace=os.environ['TEMP_WORKSPACE'],
            uses_metas={'name': 'exec2'},
            name='exec2',
            needs='exec1',
            when={'tags__type': {'$lte': 1}},
        )
        .needs_all('joiner')
    )

    with f:
        ret = f.post(
            on='index',
            inputs=DocumentArray([
                Document(text='type1', tags={'type': 1}),
                Document(text='type2', tags={'type': 2}),
                Document(text='type2', tags={'type': 3}),
            ]),
        )
        assert len(ret) == 0
Example No. 29
    def rank(
        self, docs_matrix: List['DocumentArray'], parameters: Dict, **kwargs
    ) -> 'DocumentArray':
        """
        :param docs_matrix: list of :class:`DocumentArray` on multiple requests to
          get bubbled up matches.
        :param parameters: the parameters passed into the ranker, in this case stores :attr`top_k`
          to filter k results based on score.
        :param kwargs: not used (kept to maintain interface)
        """

        result_da = DocumentArray()  # length: 1 as every time there is only one query
        for d_mod1, d_mod2 in zip(*docs_matrix):

            final_matches = {}  # type: Dict[str, Document]
            for m in d_mod1.matches:
                relevance_score = m.scores['cosine'].value * d_mod1.weight
                m.scores['relevance'].value = relevance_score
                final_matches[m.parent_id] = Document(m, copy=True)

            for m in d_mod2.matches:
                if m.parent_id in final_matches:
                    # the same parent was already matched via modality 1: accumulate
                    final_matches[m.parent_id].scores['relevance'].value += (
                        m.scores['cosine'].value * d_mod2.weight
                    )
                else:
                    m.scores['relevance'].value = (
                        m.scores['cosine'].value * d_mod2.weight
                    )
                    final_matches[m.parent_id] = Document(m, copy=True)

            da = DocumentArray(list(final_matches.values()))
            da = sorted(da, key=lambda ma: ma.scores['relevance'].value, reverse=True)
            d = Document(matches=da[: int(parameters['top_k'])])
            result_da.append(d)
        return result_da
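
A rough illustration of the input this method expects, assuming it lives on an Executor-like class (the name `WeightedRanker` below is hypothetical): two DocumentArrays, one per modality, where each query Document carries a modality `weight` and cosine-scored matches with `parent_id` set:

from jina import Document, DocumentArray

q1, q2 = Document(weight=0.6), Document(weight=0.4)
for q, score in ((q1, 0.9), (q2, 0.5)):
    m = Document(parent_id='p1')
    m.scores['cosine'].value = score
    q.matches.append(m)

ranker = WeightedRanker()  # hypothetical class holding the rank() method above
result = ranker.rank(
    docs_matrix=[DocumentArray([q1]), DocumentArray([q2])],
    parameters={'top_k': 5},
)
# one result Document per query, with matches fused across modalities and
# sorted by the combined 'relevance' score
assert result[0].matches[0].parent_id == 'p1'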
Example No. 30
def test_lazy_serialization():
    doc_count = 1000
    r = DataRequest()
    da = r.docs
    da.extend([Document(text='534534534er5yr5y645745675675675345')] *
              doc_count)
    r.data.docs = da
    byte_array = DataRequestProto.SerializeToString(r)

    deserialized_request = DataRequestProto.FromString(byte_array)
    assert not deserialized_request.is_decompressed
    assert len(deserialized_request.docs) == doc_count
    assert deserialized_request.docs == r.docs
    assert deserialized_request.is_decompressed