class MyIndexer(Executor):
    """Indexer that stores its documents in a SQLite-backed ``DocumentArray``."""

    def __init__(self, **kwargs):
        """Create the SQLite-backed document store.

        The database file is ``indexer.db`` inside the executor workspace and
        the table name is kept on ``self.table_name``.

        :param kwargs: keyword arguments forwarded to the parent constructor
        """
        super().__init__(**kwargs)
        self.table_name = 'qabot_docs'
        storage_config = {
            'connection': os.path.join(self.workspace, 'indexer.db'),
            'table_name': self.table_name,
        }
        self._docs = DocumentArray(storage='sqlite', config=storage_config)

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Add the incoming documents to the store.

        :param docs: documents to be indexed
        :param kwargs: other keyword arguments (unused)
        """
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', **kwargs):
        """Attach the single best match from the store to each document in docs.

        Matching uses the cosine metric with ``normalization=(1, 0)``.

        :param docs: documents that are searched
        :param kwargs: other keyword arguments (unused)
        """
        docs.match(self._docs, metric='cosine', normalization=(1, 0), limit=1)
def test_document_processed_total(port_generator, executor):
    """Check that ``jina_document_processed_total`` is counted per endpoint.

    The metrics page of the executor is scraped after each request so that
    every assertion runs against fresh data.

    :param port_generator: fixture returning a free port on each call
    :param executor: fixture providing the executor class under test
    """
    port0 = port_generator()
    port1 = port_generator()

    # Metric names including their label set; the value is appended per check.
    foo_metric = 'jina_document_processed_total{executor="DummyExecutor",executor_endpoint="/foo",runtime_name="executor0/rep-0"}'
    bar_metric = 'jina_document_processed_total{executor="DummyExecutor",executor_endpoint="/bar",runtime_name="executor0/rep-0"}'

    def _scrape_metrics():
        """Fetch the executor monitoring page and return it as text."""
        resp = req.get(f'http://localhost:{port1}/')
        assert resp.status_code == 200
        return str(resp.content)

    with Flow(monitoring=True, port_monitoring=port0).add(
        uses=executor, port_monitoring=port1
    ) as f:
        _scrape_metrics()  # the monitoring endpoint must be reachable up front

        f.post('/foo', inputs=DocumentArray.empty(size=4))  # process 4 documents on foo

        metrics = _scrape_metrics()
        # check that we count 4 documents on foo
        assert f'{foo_metric} 4.0' in metrics
        # check that we do not start counting documents on bar as it has not been called yet
        assert bar_metric not in metrics

        f.post('/bar', inputs=DocumentArray.empty(size=5))  # process 5 documents on bar

        # Re-scrape: the previous response predates the call to /bar, so
        # asserting on it would not verify anything about that call.
        metrics = _scrape_metrics()
        # check that we count 5 documents on bar
        assert f'{bar_metric} 5.0' in metrics
        # check that nothing changed on the foo count
        assert f'{foo_metric} 4.0' in metrics
def foo(self, docs: DocumentArray, **kwargs):
    """Run a helper that sets ``text`` to ``'hello'`` over ``docs`` and return them.

    :param docs: documents passed to ``docs.apply``
    :param kwargs: other keyword arguments (unused)
    :return: the same ``docs`` object that was passed in
    """

    def _say_hello(doc: Document):
        doc.text = 'hello'
        return doc

    docs.apply(_say_hello)
    return docs
def test_requests_size(port_generator, executor):
    """Check that the request-size metrics are exposed and grow with traffic.

    :param port_generator: fixture returning a free port on each call
    :param executor: fixture providing the executor class under test
    """
    port0 = port_generator()
    port1 = port_generator()

    with Flow(monitoring=True, port_monitoring=port0).add(
        uses=executor, port_monitoring=port1
    ) as f:
        f.post('/foo', inputs=DocumentArray.empty(size=1))

        resp = req.get(f'http://localhost:{port1}/')
        assert resp.status_code == 200

        assert (
            f'jina_request_size_bytes_count{{executor="DummyExecutor",executor_endpoint="/foo",runtime_name="executor0/rep-0"}} 1.0'
            in str(resp.content)
        )

        def _get_request_bytes_size():
            """Return the current value of the request-size ``_sum`` sample."""
            resp = req.get(f'http://localhost:{port1}/')

            # ``str(bytes)`` yields the repr, in which newlines appear as the
            # two characters backslash + 'n'; hence the escaped separator.
            resp_lines = str(resp.content).split('\\n')
            byte_line = [
                line
                for line in resp_lines
                if 'jina_request_size_bytes_sum{executor="DummyExecutor"' in line
            ]

            # The sample value is the last space-separated token of the line.
            # Taking a fixed-width suffix would truncate values wider than
            # five characters and make the comparison below unreliable.
            return float(byte_line[0].rsplit(' ', 1)[-1])

        measured_request_bytes_sum_init = _get_request_bytes_size()
        f.post('/foo', inputs=DocumentArray.empty(size=1))
        measured_request_bytes_sum = _get_request_bytes_size()

        assert measured_request_bytes_sum > measured_request_bytes_sum_init
class KeyValueIndexer(Executor):
    """Executor that keeps documents in memory and swaps matches for stored docs."""

    def __init__(self, *args, **kwargs):
        """Load the stored documents from the workspace if a dump exists.

        :param args: positional arguments forwarded to the parent constructor
        :param kwargs: keyword arguments forwarded to the parent constructor
        """
        super().__init__(*args, **kwargs)
        dump_path = self.workspace + '/kv-idx'
        if not os.path.exists(dump_path):
            self._docs = DocumentArray()
        else:
            self._docs = DocumentArray.load(dump_path)

    @requests(on='/index')
    def index(self, docs: DocumentArray, **kwargs):
        """Add the incoming documents to the in-memory store.

        :param docs: documents to be stored
        :param kwargs: other keyword arguments (unused)
        """
        self._docs.extend(docs)

    @requests(on='/search')
    def query(self, docs: DocumentArray, **kwargs):
        """Replace every match by the stored document found under its ``parent_id``.

        The scores of the original match are copied onto the stored document.

        :param docs: query documents whose matches are rewritten in place
        :param kwargs: other keyword arguments (unused)
        """
        for query_doc in docs:
            resolved = DocumentArray()
            for hit in query_doc.matches:
                stored_doc = self._docs[hit.parent_id]
                stored_doc.scores = hit.scores
                resolved.append(stored_doc)
            query_doc.matches = resolved

    def close(self):
        """
        Stores the DocumentArray to disk
        """
        dump_path = self.workspace + '/kv-idx'
        self._docs.save(dump_path)
class MyIndexer(Executor):
    """Indexer that keeps documents in memory and dumps them to the workspace."""

    def __init__(self, **kwargs):
        """Load previously saved documents from the workspace when present.

        :param kwargs: keyword arguments forwarded to the parent constructor
        """
        super().__init__(**kwargs)
        dump_path = self.workspace + '/indexer'
        if not os.path.exists(dump_path):
            self._docs = DocumentArray()
        else:
            self._docs = DocumentArray.load(dump_path)

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Add the incoming documents to the in-memory store.

        :param docs: documents to be indexed
        :param kwargs: other keyword arguments (unused)
        """
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', **kwargs):
        """Attach the single best match from the store to each document in docs.

        Matching uses the cosine metric with ``normalization=(1, 0)``.

        :param docs: documents that are searched
        :param kwargs: other keyword arguments (unused)
        """
        docs.match(self._docs, metric='cosine', normalization=(1, 0), limit=1)

    def close(self):
        """
        Stores the DocumentArray to disk
        """
        dump_path = self.workspace + '/indexer'
        self._docs.save(dump_path)
def __init__(self, index_file_name: str, **kwargs):
    """Load the documents stored under ``index_file_name`` in the workspace.

    When no such file exists an empty ``DocumentArray`` is used instead.

    :param index_file_name: file name, relative to the workspace, of the dump
    :param kwargs: keyword arguments forwarded to the parent constructor
    """
    super().__init__(**kwargs)
    self._index_file_name = index_file_name
    index_path = self.workspace + f'/{index_file_name}'
    if not os.path.exists(index_path):
        self._docs = DocumentArray()
    else:
        self._docs = DocumentArray.load(index_path)
class DocVectorIndexer(Executor):
    """Indexer that keeps documents in memory and persists them under a given file name."""

    def __init__(self, index_file_name: str, **kwargs):
        """Load the documents stored under ``index_file_name`` in the workspace.

        When no such file exists an empty ``DocumentArray`` is used instead.

        :param index_file_name: file name, relative to the workspace, of the dump
        :param kwargs: keyword arguments forwarded to the parent constructor
        """
        super().__init__(**kwargs)
        self._index_file_name = index_file_name
        index_path = self.workspace + f'/{index_file_name}'
        if not os.path.exists(index_path):
            self._docs = DocumentArray()
        else:
            self._docs = DocumentArray.load(index_path)

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Add the incoming documents to the in-memory store.

        :param docs: documents to be indexed
        :param kwargs: other keyword arguments (unused)
        """
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
        """Attach the best ``top_k`` matches from the store to each document.

        Matching uses the cosine metric with ``normalization=(1, 0)``.

        :param docs: documents that are searched
        :param parameters: request parameters; ``parameters['top_k']`` is
            converted to ``int`` and used as the match limit
        :param kwargs: other keyword arguments (unused)
        """
        top_k = int(parameters['top_k'])
        docs.match(self._docs, metric='cosine', normalization=(1, 0), limit=top_k)

    def close(self):
        """
        Stores the DocumentArray to disk
        """
        index_path = self.workspace + f'/{self._index_file_name}'
        self._docs.save(index_path)
async def aencode(self, content, **kwargs):
    """Post ``content`` through the async client and gather all returned documents.

    :param content: the content handed to ``self._get_post_payload``
    :param kwargs: extra options handed to ``self._get_post_payload``
    :return: the embeddings of the gathered documents when
        ``self._return_plain`` is truthy, otherwise the gathered
        ``DocumentArray`` itself
    """
    from docarray import DocumentArray

    gathered = DocumentArray()
    response_stream = self._async_client.post(
        **self._get_post_payload(content, kwargs)
    )
    async for batch in response_stream:
        gathered.extend(batch)

    if self._return_plain:
        return gathered.embeddings
    return gathered
def query(self, docs: DocumentArray, **kwargs):
    """Replace every match by the stored document found under its ``parent_id``.

    The scores of the original match are copied onto the stored document
    before it is put into the new match list.

    :param docs: query documents whose matches are rewritten in place
    :param kwargs: other keyword arguments (unused)
    """
    for query_doc in docs:
        resolved = DocumentArray()
        for hit in query_doc.matches:
            stored_doc = self._docs[hit.parent_id]
            stored_doc.scores = hit.scores
            resolved.append(stored_doc)
        query_doc.matches = resolved
def __init__(self, **kwargs):
    """Create the SQLite-backed document store.

    The database file is ``indexer.db`` inside the executor workspace and
    the table name is kept on ``self.table_name``.

    :param kwargs: keyword arguments forwarded to the parent constructor
    """
    super().__init__(**kwargs)
    self.table_name = 'qabot_docs'
    storage_config = {
        'connection': os.path.join(self.workspace, 'indexer.db'),
        'table_name': self.table_name,
    }
    self._docs = DocumentArray(storage='sqlite', config=storage_config)
def craft(self, docs: DocumentArray, **kwargs):
    """Collect the JPEG documents under ``docs`` and load them as image tensors.

    Documents are taken from ``docs.traverse_flat('c')`` (presumably the
    chunk level -- confirm against docarray) and kept when their mime type
    is ``image/jpeg``. Each one is loaded from its URI, resized to
    224 x 224 and has its channel axis moved from -1 to 0.

    :param docs: documents to traverse
    :param kwargs: other keyword arguments (unused)
    :return: a new ``DocumentArray`` holding only the processed JPEG documents
    """
    target_size = 224
    jpeg_docs = DocumentArray(
        candidate
        for candidate in docs.traverse_flat('c')
        if candidate.mime_type == 'image/jpeg'
    )

    for image_doc in jpeg_docs:
        image_doc.load_uri_to_image_tensor()
        image_doc.set_image_tensor_shape(shape=(target_size, target_size))
        image_doc.set_image_tensor_channel_axis(-1, 0)

    return jpeg_docs
def docs(self) -> 'DocumentArray':
    """Get the :class: `DocumentArray` with sequence `data.docs` as content.

    The array is decoded from ``docs_bytes`` when that is the populated
    ``documents`` field, otherwise from the protobuf ``docs`` field, and
    is stored on ``self._loaded_doc_array``.

    .. # noqa: DAR201"""
    # A truthy stored array is returned as is; a falsy one is decoded again.
    if self._loaded_doc_array:
        return self._loaded_doc_array

    content = self._content
    if content.WhichOneof('documents') == 'docs_bytes':
        self._loaded_doc_array = DocumentArray.from_bytes(content.docs_bytes)
    else:
        self._loaded_doc_array = DocumentArray.from_protobuf(content.docs)

    return self._loaded_doc_array
async def dry_run(self, empty, context) -> jina_pb2.StatusProto:
    """
    Process the call requested by having a dry run call to every Executor in the graph

    An empty ``DocumentArray`` is streamed to the dry-run endpoint; any
    exception raised while doing so is recorded in the returned status.

    :param empty: The service expects an empty protobuf message
    :param context: grpc context
    :returns: the response request
    """
    from docarray import DocumentArray
    from jina.clients.request import request_generator
    from jina.enums import DataInputType
    from jina.serve.executors import __dry_run_endpoint__

    empty_docs = DocumentArray()

    try:
        dry_run_requests = request_generator(
            exec_endpoint=__dry_run_endpoint__,
            data=empty_docs,
            data_type=DataInputType.DOCUMENT,
        )
        # Drain the stream; only completing without an error matters here.
        async for _ in self.streamer.stream(request_iterator=dry_run_requests):
            pass

        success_status = StatusMessage()
        success_status.set_code(jina_pb2.StatusProto.SUCCESS)
        return success_status.proto
    except Exception as ex:
        failure_status = StatusMessage()
        failure_status.set_exception(ex)
        return failure_status.proto
def encode(self, docs: DocumentArray, **kwargs):
    """Compute embeddings for ``docs`` from their tensors.

    The tensors are cast to ``float32``, passed through
    ``self._get_features`` and ``self._get_pooling``, and the result is
    written to ``docs.embeddings``.

    :param docs: documents whose ``tensors`` are encoded in place
    :param kwargs: other keyword arguments (unused)
    """
    with torch.inference_mode():
        batch = torch.from_numpy(docs.tensors.astype('float32'))
        features = self._get_features(batch).detach().numpy()
        docs.embeddings = self._get_pooling(features)
def test_decorator_interface(port_generator):
    """Check that methods wrapped with ``@monitor`` show up in the metrics page.

    One method is monitored under an explicit metric name, the other under
    the default name; each must report a count of one after one request.

    :param port_generator: fixture returning a free port on each call
    """

    class DummyExecutor(Executor):
        @requests(on='/foo')
        def foo(self, docs, **kwargs):
            self._proces(docs)
            self.proces_2(docs)

        @monitor(name='metrics_name', documentation='metrics description')
        def _proces(self, docs):
            pass

        @monitor()
        def proces_2(self, docs):
            pass

    executor_port = port_generator()
    gateway_port = port_generator()
    flow = Flow(monitoring=True, port_monitoring=gateway_port).add(
        uses=DummyExecutor, monitoring=True, port_monitoring=executor_port
    )

    with flow as f:
        f.post('/foo', inputs=DocumentArray.empty(4))

        resp = req.get(f'http://localhost:{executor_port}/')
        metrics_page = str(resp.content)

        assert (
            f'jina_metrics_name_count{{runtime_name="executor0/rep-0"}} 1.0'
            in metrics_page
        )
        assert (
            f'jina_proces_2_seconds_count{{runtime_name="executor0/rep-0"}} 1.0'
            in metrics_page
        )
def test_app_models_acceptance(docs_input):
    """Check that the HTTP ``/index`` endpoint accepts the given payload.

    :param docs_input: JSON payload posted to the endpoint
    """
    flow = Flow(protocol='http').add()
    with flow:
        response = req.post(f'http://localhost:{flow.port}/index', json=docs_input)
        returned_docs = DocumentArray.from_dict(response.json()['data'])
        assert returned_docs[0].text == 'text_input'
def test_conditions_filtering(tmpdir, flow):
    """Check that each document is handled only by the executor for its type.

    The response texts and the files found under each executor's directory
    in ``tmpdir`` are both checked.

    :param tmpdir: temporary directory holding the executors' output files
    :param flow: fixture providing the Flow under test
    """
    request_docs = DocumentArray(
        [
            Document(text='type1', tags={'type': 1}),
            Document(text='type2', tags={'type': 2}),
        ]
    )

    with flow:
        ret = flow.post(on='index', inputs=request_docs)
        assert len(ret) == 2

        seen_types = set()
        for doc in ret:
            doc_type = doc.tags['type']
            if doc_type == 1:
                assert doc.text == 'type1 processed by exec1'
            else:
                assert doc_type == 2
                assert doc.text == 'type2 processed by exec2'
            seen_types.add(doc_type)

        assert seen_types == {1, 2}

    # Each executor's file must contain only the text of its own document.
    for exec_name, expected_text in (('exec1', 'type1'), ('exec2', 'type2')):
        dump_path = os.path.join(str(tmpdir), exec_name, '0', f'{exec_name}.txt')
        with open(dump_path, 'r') as fp:
            assert fp.read() == expected_text
def test_client_on_error_call(protocol, exception):
    """Check that posting with a client on port 12345 raises ``exception``.

    :param protocol: protocol the client is created with
    :param exception: exception type expected to be raised
    """
    with pytest.raises(exception):
        client = Client(host='0.0.0.0', protocol=protocol, port=12345)
        client.post('/blah', inputs=DocumentArray.empty(10))
def test_executor_load_from_hub():
    """Check that a hub executor can be loaded, named via metas, and called."""
    hub_executor = Executor.from_hub(
        'jinahub://DummyHubExecutor', uses_metas={'name': 'hello123'}
    )
    docs = DocumentArray([Document()])

    hub_executor.foo(docs)

    assert docs.texts == ['hello']
    assert hub_executor.metas.name == 'hello123'
def test_client_host_scheme(protocol):
    """Check that a client built from a ``scheme://host:port`` string can post.

    :param protocol: URL scheme; ``'ws'`` is mapped to the ``'websocket'``
        Flow protocol, any other value is used unchanged
    """
    port = random_port()
    flow_protocol = protocol
    if protocol == 'ws':
        flow_protocol = 'websocket'

    flow = Flow(protocol=flow_protocol, port=port).add()
    with flow:
        client = Client(host=f'{protocol}://localhost:{port}')
        client.post('/', inputs=DocumentArray.empty(2))
def test_healthcheck_logs_websocket_with_env(capfd, health_check_env):
    """Check that the health-check access log line is absent from stdout.

    :param capfd: pytest fixture capturing output on file-descriptor level
    :param health_check_env: fixture requested for its environment setup
    """
    flow = Flow(protocol='websocket', port=12345).add()
    with flow:
        flow.post('/', inputs=DocumentArray.empty())
        req.get('http://localhost:12345/')

    captured_out, _ = capfd.readouterr()
    assert '"GET / HTTP/1.1" 200 OK' not in captured_out
def test_grpc_compression(compression_client, compression_gateway):
    """Check that a request succeeds with the given gRPC compression settings.

    :param compression_client: compression passed to the ``post`` call
    :param compression_gateway: compression passed to the Flow
    """
    flow = Flow(grpc_compression=compression_gateway).add().add()
    with flow as f:
        returned = f.post(
            on='/',
            inputs=DocumentArray([Document()]),
            grpc_compression=compression_client,
        )
        assert len(returned) == 1
def docs(self, value: DocumentArray):
    """Override the DocumentArray with the provided one

    The stored ``_loaded_doc_array`` is reset and the protobuf content is
    replaced by the serialized ``value``.

    :param value: a DocumentArray; ``None`` leaves the content untouched
    """
    # Compare against None rather than relying on truthiness: an empty
    # DocumentArray is falsy, and assigning one must still replace the
    # current content instead of being silently ignored.
    if value is not None:
        self._loaded_doc_array = None
        self._content.docs.CopyFrom(value.to_protobuf())
def segment(self, docs: DocumentArray, **kwargs):
    """Split every document into a text chunk and an image chunk.

    The text chunk carries ``tags['caption']``; the image chunk points to
    ``tags['image']`` under ``$HW_WORKDIR/people-img``. The document itself
    gets the same image URI, which is then converted to a data URI.

    :param docs: documents to segment in place
    :param kwargs: other keyword arguments (unused)
    """
    for doc in docs:
        caption = doc.tags['caption']
        image_uri = f'{os.environ["HW_WORKDIR"]}/people-img/{doc.tags["image"]}'

        doc.chunks = DocumentArray(
            [
                Document(text=caption, mime_type='text/plain'),
                Document(uri=image_uri, mime_type='image/jpeg'),
            ]
        )

        doc.uri = image_uri
        doc.convert_uri_to_datauri()
def hello_world(args):
    """
    Execute the chatbot example.

    Downloads the CSV dataset into the working directory, indexes it with
    the Flow returned by ``_get_flow`` and tries to open the demo page in a
    browser. Unless ``args.unblock_query_flow`` is set, the Flow then
    blocks to keep serving.

    :param args: arguments passed from CLI
    """
    Path(args.workdir).mkdir(parents=True, exist_ok=True)

    with ImportExtensions(
        required=True,
        help_text='this demo requires Pytorch and Transformers to be installed, '
        'if you haven\'t, please do `pip install jina[torch,transformers]`',
    ):
        import torch
        import transformers

        assert [torch, transformers]  #: prevent pycharm auto remove the above line

    targets = {
        'covid-csv': {
            'url': args.index_data_url,
            'filename': os.path.join(args.workdir, 'dataset.csv'),
        }
    }

    # download the data
    download_data(targets, args.download_proxy, task_name='download csv data')

    # now comes the real work
    # load index flow from a YAML file
    f = _get_flow(args)

    # index it!
    with f:
        f.index(
            DocumentArray.from_csv(
                targets['covid-csv']['filename'], field_resolver={'question': 'text'}
            ),
            show_progress=True,
        )

        url_html_path = 'file://' + os.path.abspath(
            os.path.join(
                os.path.dirname(os.path.realpath(__file__)), 'static/index.html'
            )
        )
        try:
            webbrowser.open(url_html_path, new=2)
        except Exception:
            # Intentional pass: browser support isn't cross-platform.
            # Only ``Exception`` is swallowed so that KeyboardInterrupt and
            # SystemExit still stop the demo.
            pass
        finally:
            default_logger.info(
                f'You should see a demo page opened in your browser, '
                f'if not, you may open {url_html_path} manually'
            )

        if not args.unblock_query_flow:
            f.block()
def _sort_response_docs(response):
    """Sort the response docs according to their order in the initial request.

    Docs whose id is not in ``request_doc_ids`` (taken from the enclosing
    scope) are placed at the end. The sorted docs replace
    ``response.data.docs``.

    :param response: response whose ``data.docs`` is reordered in place
    """
    # Build the id -> position lookup once instead of scanning the id list
    # for every doc. ``setdefault`` keeps the first position of a repeated
    # id, which is what ``list.index`` would return.
    request_positions = {}
    for position, doc_id in enumerate(request_doc_ids):
        request_positions.setdefault(doc_id, position)

    # put new/unknown docs at the end
    unknown_position = len(request_doc_ids)

    def sort_by_request_order(doc):
        return request_positions.get(doc.id, unknown_position)

    sorted_docs = sorted(response.data.docs, key=sort_by_request_order)
    response.data.docs = DocumentArray(sorted_docs)
def test_empty_arrays(linear_flow):
    """Check that all four check tags are falsy for empty documents.

    :param linear_flow: fixture providing the Flow under test
    """
    check_tags = (
        'listcheck_embedding',
        'listcheck_tensor',
        'nparraycheck_embedding',
        'nparraycheck_tensor',
    )
    empty_docs = DocumentArray.empty(5)

    with linear_flow as f:
        resp = f.post(on='/foo', inputs=empty_docs)
        for doc in resp:
            for tag_name in check_tags:
                assert not doc.tags[tag_name]
def set_docs_convert_arrays(
    self, value: DocumentArray, ndarray_type: Optional[str] = None
):
    """Convert embedding and tensor to given type, then set DocumentArray

    The stored ``_loaded_doc_array`` is reset and the protobuf content is
    replaced by the serialized ``value``. Nothing happens when ``value``
    is ``None``.

    :param value: a DocumentArray
    :param ndarray_type: type embedding and tensor will be converted to
    """
    if value is None:
        return

    self._loaded_doc_array = None
    self._content.docs.CopyFrom(value.to_protobuf(ndarray_type=ndarray_type))
async def test_async_apply():
    """Check that ``docs.apply`` works inside an async executor endpoint."""

    class AsyncExecutor(Executor):
        @requests
        async def foo(self, docs: DocumentArray, **kwargs):
            docs.apply(set_hello)
            return docs

    doc_count = 2
    input_docs = DocumentArray.empty(doc_count)
    async_executor = AsyncExecutor()

    returned_docs = await async_executor.foo(input_docs)

    assert returned_docs.texts == ['hello'] * doc_count