def test_complex_needs(replicas, mocker):
    response_mock = mocker.Mock()
    f = (
        Flow()
        .add(name='r1')
        .add(name='r2', host=CLOUD_HOST)
        .add(name='r3', needs='r1', host=CLOUD_HOST, replicas=replicas)
        .add(name='r4', needs='r2', replicas=replicas)
        .add(name='r5', needs='r3')
        .add(name='r6', needs='r4', host=CLOUD_HOST)
        .add(name='r8', needs='r6', replicas=replicas)
        .add(name='r9', needs='r5', host=CLOUD_HOST, replicas=replicas)
        .add(name='r10', needs=['r9', 'r8'])
    )
    with f:
        f.index(
            inputs=(Document(text='hello') for _ in range(NUM_DOCS)),
            on_done=response_mock,
        )
    response_mock.assert_called()

def index_restful(num_docs):
    f = Flow().load_config('flows/index.yml')
    with f:
        data_path = os.path.join(
            os.path.dirname(__file__), os.environ.get('JINA_DATA_FILE', None)
        )
        f.logger.info(f'Indexing {data_path}')
        url = f'http://0.0.0.0:{f.port_expose}/index'
        input_docs = _input_lines(
            filepath=data_path,
            size=num_docs,
            read_mode='r',
        )
        data_json = {'data': [Document(text=text).dict() for text in input_docs]}
        r = requests.post(url, json=data_json)
        if r.status_code != 200:
            raise Exception(
                f'api request failed, url: {url}, status: {r.status_code}, content: {r.content}'
            )

def test_remote_workspace_value():
    """
    This tests the value set in `self.workspace` in a remote Flow.
    It should always be `/workspace/ExecutorName/...`
    """
    HOST = __default_host__
    client = JinaDClient(host=HOST, port=8000)
    workspace_id = client.workspaces.create(paths=[os.path.join(cur_dir, 'yamls')])
    flow_id = client.flows.create(
        workspace_id=workspace_id, filename='flow_workspace_validate.yml'
    )
    args = client.flows.get(flow_id)['arguments']['object']['arguments']
    response = Client(
        host=HOST, port=args['port_expose'], protocol=args['protocol']
    ).post(on='/', inputs=[Document()], show_progress=True, return_results=True)
    assert response[0].data.docs[0].text.startswith(
        f'{__partial_workspace__}/WorkspaceValidator/0'
    )
    assert client.flows.delete(flow_id)
    assert client.workspaces.delete(workspace_id)

def test_get_content_multiple_fields_arrays(num_rows):
    fields = ['blob', 'embedding']
    batch_size = 10
    embed_size = 20
    kwargs = {field: np.random.random((num_rows, embed_size)) for field in fields}
    docs = DocumentSet([Document(**kwargs) for _ in range(batch_size)])

    contents, pts = docs._extract_docs(*fields)

    assert len(contents) == len(fields)
    assert isinstance(contents, list)
    assert isinstance(contents[0], np.ndarray)
    assert isinstance(contents[1], np.ndarray)
    for content in contents:
        assert len(content) == batch_size
        assert content.shape == (batch_size, num_rows, embed_size)

def test_simple_flow(protocol):
    bytes_gen = (Document() for _ in range(10))

    def bytes_fn():
        for _ in range(100):
            yield Document()

    f = Flow(protocol=protocol).add(name='executor0')

    with f:
        f.index(inputs=bytes_gen)

    with f:
        f.index(inputs=bytes_fn)

    with f:
        f.index(inputs=bytes_fn)
        f.index(inputs=bytes_fn)

        _validate_flow(f)

    assert 'gateway' not in f

def create_document_to_score_same_depth_level():
    # doc: 1
    # |  matches: (id: 2, parent_id: 20, score.value: 30, length: 3),
    # |  matches: (id: 3, parent_id: 20, score.value: 40, length: 4),
    # |  matches: (id: 4, parent_id: 30, score.value: 20, length: 2),
    # |  matches: (id: 5, parent_id: 30, score.value: 10, length: 1),
    doc = jina_pb2.DocumentProto()
    doc.id = str(1) * 16

    match2 = doc.matches.add()
    match2.id = str(2) * 16
    match2.parent_id = str(20) * 8
    match2.length = 3
    match2.score.ref_id = doc.id
    match2.score.value = 30

    match3 = doc.matches.add()
    match3.id = str(3) * 16
    match3.parent_id = str(20) * 8
    match3.length = 4
    match3.score.ref_id = doc.id
    match3.score.value = 40

    match4 = doc.matches.add()
    match4.id = str(4) * 16
    match4.parent_id = str(30) * 8
    match4.length = 2
    match4.score.ref_id = doc.id
    match4.score.value = 20

    match5 = doc.matches.add()
    match5.id = str(5) * 16  # fixed: was str(4), but the diagram above says id: 5
    match5.parent_id = str(30) * 8
    match5.length = 1
    match5.score.ref_id = doc.id
    match5.score.value = 10

    return Document(doc)

def test_crud_http_flow_stateful(protocol):
    # This tests that:
    # - the Index Flow stores data on disk, then terminates
    # - the Query Flow accesses the same data via the Index Flow's workspace on `/search`
    INDEX_FLOW_NAME = f'simpleindexer-{protocol}-index'
    SEARCH_FLOW_NAME = f'simpleindexer-{protocol}-search'
    FLOW_FILE_PATH = os.path.join(cur_dir, 'flows', f'flow-{protocol}-stateful.yml')
    index_docs = [
        Document(text=f'text-{i}', embedding=np.array([i, i + 1, i + 2]))
        for i in range(5)
    ]
    query_doc = index_docs[0]

    with WolfFlow(filepath=FLOW_FILE_PATH, name=INDEX_FLOW_NAME) as index_flow:
        da_index = Client(host=index_flow.gateway).index(inputs=index_docs)
        assert da_index.texts == [f'text-{i}' for i in range(5)]
        for limit in [3, 5]:
            da_search = Client(host=index_flow.gateway).search(
                inputs=query_doc, parameters={'limit': limit}
            )
            assert len(da_search[0].matches.texts) == limit
            assert da_search[0].matches.texts == [f'text-{i}' for i in range(limit)]

    with WolfFlow(
        filepath=FLOW_FILE_PATH,
        name=SEARCH_FLOW_NAME,
        workspace=index_flow.workspace,
    ) as search_flow:
        da_search = Client(host=search_flow.gateway).search(inputs=query_doc)
        assert da_search[0].matches.texts == [f'text-{i}' for i in range(5)]
        for limit in [3, 5]:
            da_search = Client(host=search_flow.gateway).search(
                inputs=query_doc, parameters={'limit': limit}
            )
            assert len(da_search[0].matches.texts) == limit
            assert da_search[0].matches.texts == [f'text-{i}' for i in range(limit)]

def test_reduce_status():
    n_shards = 2
    flow = Flow(port_expose=exposed_port).add(
        uses=ExecutorStatus, name='pod0', shards=n_shards, polling='all'
    )
    with flow:
        da = DocumentArray([Document() for _ in range(5)])
        resp = Client(port=exposed_port).post(
            '/status', parameters={'foo': 'bar'}, inputs=da, return_results=True
        )
        assert resp[0].parameters['foo'] == 'bar'
        assert len(resp[0].parameters['__results__']) == n_shards
        for _, param in resp[0].parameters['__results__'].items():
            assert 'shard_id' in param
            assert 'happy_status' in param
        for doc in resp[0].docs:
            assert doc.text == 'exec-status'

async def run_test(flow, endpoint, num_docs=10, request_size=10):
    from jina.clients import Client

    client_kwargs = dict(
        host='localhost',
        port=flow.port_expose,
        asyncio=True,
    )
    client_kwargs.update(flow._common_kwargs)

    client = Client(**client_kwargs)
    client.show_progress = True
    responses = []
    async for resp in client.post(
        endpoint,
        inputs=[Document() for _ in range(num_docs)],
        return_results=True,
        request_size=request_size,
    ):
        responses.append(resp)

    return responses

def test_indexer_with_ref_indexer_move(random_workspace_move, parallel, index_docs, mocker):
    top_k = 10
    with Flow.load_config('index.yml') as index_flow:
        index_flow.index(input_fn=index_docs, batch_size=10)

    mock = mocker.Mock()

    shutil.copytree(
        os.environ['JINA_TEST_INDEXER_WITH_REF_INDEXER'],
        os.environ['JINA_TEST_INDEXER_WITH_REF_INDEXER_QUERY'],
    )
    shutil.rmtree(os.environ['JINA_TEST_INDEXER_WITH_REF_INDEXER'])

    def validate_response(resp):
        mock()
        assert len(resp.search.docs) == 1
        assert len(resp.search.docs[0].matches) == top_k

    query_document = Document()
    query_document.embedding = np.array([1, 1])

    with Flow.load_config('query.yml') as query_flow:
        query_flow.search(
            input_fn=[query_document], on_done=validate_response, top_k=top_k
        )

    mock.assert_called_once()

def documentset():
    """Build a complete chunk-match structure, with a depth of 2 in both
    directions, recursively."""
    max_granularity = 2
    max_adjacency = 2

    def iterate_build(document, current_granularity, current_adjacency):
        if current_granularity < max_granularity:
            for i in range(DOCUMENTS_PER_LEVEL):
                chunk = add_chunk(document)
                iterate_build(chunk, chunk.granularity, chunk.adjacency)
        if current_adjacency < max_adjacency:
            for i in range(DOCUMENTS_PER_LEVEL):
                match = add_match(document)
                iterate_build(match, match.granularity, match.adjacency)

    docs = []
    for base_id in range(DOCUMENTS_PER_LEVEL):
        with Document() as d:
            d.granularity = 0
            d.adjacency = 0
        docs.append(d)
        iterate_build(d, 0, 0)
    return DocumentSet(docs)

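
# A hedged side-note (not part of the fixture): assuming `add_chunk` increments
# `granularity` by one and `add_match` increments `adjacency` by one, the size
# of the structure the fixture builds per root can be derived recursively; with
# DOCUMENTS_PER_LEVEL == 1 this works out to 19 documents per root, the root
# itself included.
def _expected_tree_size(granularity=0, adjacency=0, max_granularity=2, max_adjacency=2):
    """Count one node plus everything iterate_build would attach below it."""
    total = 1
    if granularity < max_granularity:
        total += DOCUMENTS_PER_LEVEL * _expected_tree_size(granularity + 1, adjacency)
    if adjacency < max_adjacency:
        total += DOCUMENTS_PER_LEVEL * _expected_tree_size(granularity, adjacency + 1)
    return total
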
def test_upload_via_pymodule(replicas):
    from .mwu_encoder import MWUEncoder

    f = (
        Flow(port_expose=exposed_port)
        .add()
        .add(
            uses=MWUEncoder,
            uses_with={'greetings': 'hi'},
            host=CLOUD_HOST,
            replicas=replicas,
            py_modules='mwu_encoder.py',
            upload_files=cur_dir,
        )
        .add()
    )
    with f:
        responses = Client(port=exposed_port).index(
            inputs=(
                Document(tensor=np.random.random([1, 100])) for _ in range(NUM_DOCS)
            ),
            return_results=True,
        )
    assert len(responses) > 0
    assert len(responses[0].docs) > 0
    for doc in responses[0].docs:
        assert doc.tags['greetings'] == 'hi'

def test_override_requests():
    class MyExec(Executor):
        @requests
        def foo(self, docs, **kwargs):
            for d in docs:
                d.text = 'foo'

        def bar(self, docs, **kwargs):
            for d in docs:
                d.text = 'bar'

        @requests(on=['/1', '/2'])
        def foobar(self, docs, **kwargs):
            for d in docs:
                d.text = 'foobar'

    # original binding: `/index` falls back to the default foo()
    f = Flow().add(uses=MyExec)
    with f:
        req = f.post('/index', Document(), return_results=True)
        assert req[0].docs[0].text == 'foo'

    # rebind `/index` to bar()
    f = Flow().add(uses=MyExec, uses_requests={'/index': 'bar'})
    with f:
        req = f.post('/index', Document(), return_results=True)
        assert req[0].docs[0].text == 'bar'

        req = f.post('/1', Document(), return_results=True)
        assert req[0].docs[0].text == 'foobar'

    # rebind `/index` to foobar()
    f = Flow().add(uses=MyExec, uses_requests={'/index': 'foobar'})
    with f:
        req = f.post('/index', Document(), return_results=True)
        assert req[0].docs[0].text == 'foobar'

        req = f.post('/index-blah', Document(), return_results=True)
        assert req[0].docs[0].text == 'foo'

    # rebind the default endpoint to bar()
    f = Flow().add(uses=MyExec, uses_requests={'/default': 'bar'})
    with f:
        req = f.post('/index', Document(), return_results=True)
        assert req[0].docs[0].text == 'bar'

def test_target_peapod_with_overlapped_name(mocker):
    class FailExecutor(Executor):
        @requests
        def fail(self, **kwargs):
            raise RuntimeError

    class PassExecutor(Executor):
        @requests
        def success(self, **kwargs):
            pass

    f = (
        Flow()
        .add(uses=FailExecutor, name='foo_with_what_ever_suffix')
        .add(uses=PassExecutor, name='foo')
    )
    with f:
        # 'foo' also matches 'foo_with_what_ever_suffix', so FailExecutor runs
        # and the request ends up in the error callback
        mock = mocker.Mock()
        f.post(on='/foo', target_peapod='foo', inputs=Document(), on_error=mock)
        mock.assert_called()

def send_requests(
    port_expose,
    start_rolling_update_event: multiprocessing.Event,
    result_queue: multiprocessing.Queue,
    doc_count: int,
    request_count: int,
):
    client = Client(port=port_expose)
    for i in range(request_count):
        responses = client.search(
            [Document(id=f'{idx}', text=f'doc{idx}') for idx in range(doc_count)],
            request_size=10,
            return_results=True,
        )
        for r in responses:
            result_queue.put(r.docs.texts)
        if i == 5:
            start_rolling_update_event.set()
            # give the rolling update some time to kick in
            time.sleep(0.1)

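
# A minimal driver sketch for send_requests above; the process/event wiring is
# an assumption about the calling test, not taken from this file.
def _drive_send_requests(port_expose, doc_count=10, request_count=20):
    event = multiprocessing.Event()
    queue = multiprocessing.Queue()
    p = multiprocessing.Process(
        target=send_requests,
        args=(port_expose, event, queue, doc_count, request_count),
    )
    p.start()
    event.wait()  # set after the sixth request, i.e. mid-traffic
    # ... trigger the rolling update here, then drain the queue ...
    results = []
    while p.is_alive() or not queue.empty():
        try:
            results.append(queue.get(timeout=1))
        except Exception:
            pass
    p.join()
    return results
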
def query():
    f = Flow.load_config('flows/query-multimodal.yml')
    # f.plot()
    with f:
        with TimeContext(f'QPS: query with {3}', logger=f.logger):
            d = Document()
            search_text = (
                'It makes sense to first define what we mean by multimodality '
                'before going into more fancy terms.'
            )  # blog1
            # search_text = 'We all know about CRUD[1]. Every app out there does it.'  # blog2
            # search_text = 'Developing a Jina app often means writing YAML configs.'  # blog3
            d.text = search_text

            # There are three ways to search.
            print('text search:')
            f.search(input_fn=d, on_done=get_pdf)

            print('image search:')
            f.search(
                input_fn=search_generator(data_path='toy_data/photo-1.png'),
                read_mode='r',
                on_done=get_pdf,
            )

            print('pdf search:')
            f.search(
                input_fn=search_generator(data_path='toy_data/blog2-pages-1.pdf'),
                read_mode='r',
                on_done=get_pdf,
            )

def test_indexer_with_ref_indexer_compound(
    random_workspace, parallel, index_docs, mocker, uses_no_docker
):
    top_k = 10
    with Flow.load_config(os.path.join(cur_dir, 'compound-index.yml')) as index_flow:
        index_flow.index(input_fn=index_docs, request_size=10)

    mock = mocker.Mock()

    def validate_response(resp):
        mock()
        assert len(resp.search.docs) == 1
        assert len(resp.search.docs[0].matches) == top_k

    query_document = Document()
    query_document.embedding = np.array([1, 1])

    with Flow.load_config(os.path.join(cur_dir, 'compound-query.yml')) as query_flow:
        query_flow.search(
            input_fn=[query_document], on_done=validate_response, top_k=top_k
        )

    mock.assert_called_once()

def test_batching_mix_multi_flow(crafter, mocker):
    NUM_DOCS = 15

    def validate_response(resp):
        assert len(resp.index.docs) == NUM_DOCS
        for i, doc in enumerate(resp.index.docs):
            assert doc.text == f'text-{i}-crafted'
            np.testing.assert_equal(NdArray(doc.embedding).value, np.array([i] * 5))

    docs = DocumentArray(
        [
            Document(text=f'text-{i}', embedding=np.array([i] * 5))
            for i in range(NUM_DOCS)
        ]
    )

    mock = mocker.Mock()
    with Flow().add(name='crafter', uses=crafter) as f:
        f.index(inputs=docs, on_done=mock)

    mock.assert_called_once()
    validate_callback(mock, validate_response)

def test_flow(docker_compose, tmpdir, mocker, encoder_needs, indexer_needs, indexer_method):
    text = 'cats rules'
    m = mocker.Mock()

    def validate_output(resp):
        m()
        assert len(resp.index.docs) == 1
        assert resp.index.docs[0].text == text

    os.environ['JINA_ENCODER_HOST'] = '172.28.1.1'
    os.environ['JINA_WORKSPACE'] = str(tmpdir)
    os.environ['JINA_ENCODER_NEEDS'] = encoder_needs
    os.environ['JINA_INDEXER_NEEDS'] = indexer_needs
    os.environ['JINA_INDEXER_METHOD'] = indexer_method

    with Document() as doc:
        doc.content = text

    with Flow.load_config(flow_yml) as f:
        f.index([doc], on_done=validate_output)

    m.assert_called_once()

def test_data_request_handler_change_docs_from_partial_requests(logger):
    NUM_PARTIAL_REQUESTS = 5
    args = set_pea_parser().parse_args(['--uses', 'MergeChangeDocsExecutor'])
    handler = DataRequestHandler(args, logger)

    partial_reqs = [
        list(
            request_generator(
                '/',
                DocumentArray([Document(text='input document') for _ in range(10)]),
            )
        )[0]
    ] * NUM_PARTIAL_REQUESTS
    msg = Message(None, partial_reqs[-1], 'test', '123')
    assert len(msg.request.docs) == 10

    handler.handle(
        msg=msg,
        partial_requests=partial_reqs,
        peapod_name='name',
    )

    assert len(msg.request.docs) == 10 * NUM_PARTIAL_REQUESTS
    for doc in msg.request.docs:
        assert doc.text == 'changed document'

def test_segment_driver():
    valid_doc = Document()
    valid_doc.update_id()
    valid_doc.text = 'valid'
    valid_doc.length = 2
    valid_doc.mime_type = 'image/png'

    driver = SimpleSegmentDriver()
    executor = MockSegmenter()
    driver.attach(executor=executor, pea=None)
    driver._apply_all(DocumentSet([valid_doc]))

    assert valid_doc.length == 2

    assert valid_doc.chunks[0].tags['id'] == 3
    assert valid_doc.chunks[0].parent_id == valid_doc.id
    np.testing.assert_equal(valid_doc.chunks[0].blob, np.array([0.0, 0.0, 0.0]))
    assert valid_doc.chunks[0].weight == 0.0
    assert valid_doc.chunks[0].length == 3
    assert valid_doc.chunks[0].mime_type == 'text/plain'

    assert valid_doc.chunks[1].tags['id'] == 4
    assert valid_doc.chunks[1].parent_id == valid_doc.id
    np.testing.assert_equal(valid_doc.chunks[1].blob, np.array([1.0, 1.0, 1.0]))
    assert valid_doc.chunks[1].weight == 1.0
    assert valid_doc.chunks[1].length == 3
    assert valid_doc.chunks[1].mime_type == 'image/png'

    assert valid_doc.chunks[2].tags['id'] == 5
    assert valid_doc.chunks[2].parent_id == valid_doc.id
    np.testing.assert_equal(valid_doc.chunks[2].blob, np.array([2.0, 2.0, 2.0]))
    assert valid_doc.chunks[2].weight == 2.0
    assert valid_doc.chunks[2].length == 3
    assert valid_doc.chunks[2].mime_type == 'image/png'

def send_requests_once(self, requests: List[Request], deployment: str, head: bool, endpoint: str = None) -> asyncio.Task: assert head response_msg = copy.deepcopy(requests[0]) new_docs = DocumentArray() for doc in requests[0].docs: clientid = doc.text[0:7] self.sent_msg[clientid][deployment] = doc.text new_doc = Document(text=doc.text + f'-{clientid}-{deployment}') new_docs.append(new_doc) self.responded_messages[clientid][deployment] = new_doc.text response_msg.data.docs = new_docs async def task_wrapper(): import random await asyncio.sleep(1 / (random.randint(1, 3) * 10)) return response_msg, {} return asyncio.create_task(task_wrapper())
async def run_test(flow, core_client, namespace, endpoint, n_docs=10, request_size=100):
    # start port forwarding
    from jina.clients import Client

    gateway_pod_name = (
        core_client.list_namespaced_pod(
            namespace=namespace, label_selector='app=gateway'
        )
        .items[0]
        .metadata.name
    )
    config_path = os.environ['KUBECONFIG']
    import portforward

    with portforward.forward(
        namespace, gateway_pod_name, flow.port, flow.port, config_path
    ):
        client_kwargs = dict(
            host='localhost',
            port=flow.port,
            return_responses=True,
            asyncio=True,
        )
        client_kwargs.update(flow._common_kwargs)

        client = Client(**client_kwargs)
        client.show_progress = True
        responses = []
        async for resp in client.post(
            endpoint,
            inputs=[Document() for _ in range(n_docs)],
            request_size=request_size,
        ):
            responses.append(resp)

        return responses

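
# Hedged usage sketch: run_test is a coroutine, so a synchronous test body
# would drive it roughly like this (the fixture names are assumptions about
# the surrounding conftest, and asyncio is assumed imported at module level):
def _run_test_sync(flow, core_client, namespace, endpoint, n_docs=50):
    return asyncio.run(
        run_test(flow, core_client, namespace, endpoint, n_docs=n_docs)
    )
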
def query_generator(num_docs: int, target: dict):
    """
    Generate the query data.

    :param num_docs: Number of documents to be queried
    :param target: Dictionary which stores the data paths
    :yields: query data
    """
    for _ in range(num_docs):
        num_data = len(target['query-labels']['data'])
        idx = random.randint(0, num_data - 1)
        # x_blackwhite.shape is (28, 28)
        x_blackwhite = 255 - target['query']['data'][idx]
        # x_color.shape is (28, 28, 3)
        x_color = np.stack((x_blackwhite,) * 3, axis=-1)
        d = Document(
            content=x_color,
            tags={
                'id': -1,
                'query_label': float(target['query-labels']['data'][idx][0]),
            },
        )
        yield d

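
# Quick consumption sketch for query_generator above (assumes a `target` dict
# shaped like {'query': {'data': ...}, 'query-labels': {'data': ...}} is
# already loaded; this helper is illustrative, not part of the original file):
def _peek_queries(target, n=3):
    for d in query_generator(num_docs=n, target=target):
        # each Document carries a (28, 28, 3) color image plus its label in tags
        print(d.tags['query_label'], d.content.shape)
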
def test_flow(
    docker_compose, tmpdir, mocker, encoder_needs, indexer_needs, indexer_method
):
    text = 'cats rules'

    def validate_output(resp):
        assert len(resp.data.docs) == 1
        assert resp.data.docs[0].text == text

    os.environ['JINA_INTERNAL_HOST'] = get_internal_ip()
    os.environ['JINA_ENCODER_HOST'] = '172.28.1.1'
    os.environ['JINA_WORKSPACE'] = str(tmpdir)
    os.environ['JINA_ENCODER_NEEDS'] = encoder_needs
    os.environ['JINA_INDEXER_NEEDS'] = indexer_needs
    os.environ['JINA_INDEXER_METHOD'] = indexer_method

    doc = Document(content=text)

    mock = mocker.Mock()
    with Flow.load_config(flow_yml) as f:
        f.index([doc], on_done=mock)

    mock.assert_called_once()
    validate_callback(mock, validate_output)

def send_requests_once(
    self, requests, deployment: str, head: bool, endpoint: str = None
) -> asyncio.Task:
    assert head
    request = requests[0]
    response_msg = copy.deepcopy(request)
    new_docs = DocumentArray()
    docs = request.docs
    for doc in docs:
        clientid = doc.text[0:7]
        new_doc = Document(text=doc.text + f'-{clientid}-{deployment}')
        new_docs.append(new_doc)

    response_msg.data.docs = new_docs

    async def task_wrapper():
        import random

        await asyncio.sleep(1 / (random.randint(1, 3) * 10))
        return response_msg, {}

    return asyncio.create_task(task_wrapper())

def query_generator(num_docs: int, target: dict, with_groundtruth: bool = True):
    """
    Generate the query data.

    :param num_docs: Number of documents to be queried
    :param target: Dictionary which stores the data paths
    :param with_groundtruth: whether to include labels in the query data
    :yields: query data
    """
    gts = _get_groundtruths(target)
    for _ in range(num_docs):
        num_data = len(target['query-labels']['data'])
        idx = random.randint(0, num_data - 1)
        # x_blackwhite.shape is (28, 28)
        x_blackwhite = 255 - target['query']['data'][idx]
        # x_color.shape is (28, 28, 3)
        x_color = np.stack((x_blackwhite,) * 3, axis=-1)
        d = Document(content=x_color)
        if with_groundtruth:
            gt = gts[target['query-labels']['data'][idx][0]]
            yield d, gt
        else:
            yield d

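
# Note the changed yield type above: with with_groundtruth=True the generator
# yields (Document, groundtruth) pairs instead of bare Documents, so callers
# must unpack accordingly. A small illustrative helper (`target` layout and
# `_get_groundtruths` as assumed above):
def _peek_query_pairs(target, n=3):
    for d, gt in query_generator(num_docs=n, target=target, with_groundtruth=True):
        print(d.content.shape, type(gt))
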
def test_func_joiner(mocker):
    class Joiner(Executor):
        @requests
        def foo(self, docs_matrix, **kwargs):
            for d1, d2 in zip(docs_matrix[0], docs_matrix[1]):
                d1.text = d1.text + d2.text + '!!!'
            return docs_matrix[0]

    class M1(Executor):
        @requests
        def foo(self, docs, **kwargs):
            for idx, d in enumerate(docs):
                d.text = f'hello {idx}'

    class M2(Executor):
        @requests
        def foo(self, docs, **kwargs):
            for idx, d in enumerate(docs):
                d.text = f'world {idx}'

    f = (
        Flow(port=1234)
        .add(uses=M1)
        .add(uses=M2, needs='gateway')
        .add(uses=Joiner, needs=['executor0', 'executor1'])
    )

    with f:
        resp = Client(port=1234).post(
            on='/some_endpoint',
            inputs=[Document() for _ in range(3)],
            parameters={'hello': 'world', 'topk': 10},
            return_responses=True,
        )
        texts = {d.text for r in resp for d in r.docs}
        assert len(texts) == 3

def debug(self, docs_matrix: List[DocumentArray], **kwargs):
    self.logger.debug(
        f'received doc matrix in exec-merger with length {len(docs_matrix)}.'
    )

    result = DocumentArray()
    for docs in zip(*docs_matrix):
        traversed_executors = [doc.tags['traversed-executors'] for doc in docs]
        shard_ids = [doc.tags['shard_id'] for doc in docs]
        shards = [doc.tags['shards'] for doc in docs]
        parallels = [doc.tags['parallel'] for doc in docs]
        traversed_executors = list(chain(*traversed_executors))

        doc = Document()
        doc.tags['traversed-executors'] = traversed_executors
        doc.tags['shard_id'] = shard_ids
        doc.tags['shards'] = shards
        doc.tags['parallel'] = parallels
        doc.tags['merged'] = True
        result.append(doc)

    return result

def _benchmark_dam_extend_qps() -> Dict[str, float]:
    """Benchmark adding 1M documents to a DocumentArrayMemmap.

    Returns:
        A dict with the extend time in seconds and the resulting QPS.
    """
    dlist = []
    dam_size = 1_000_000
    dam = DocumentArrayMemmap(os.path.join(os.getcwd(), 'MyMemMap'))
    for i in range(dam_size):
        dlist.append(Document(text=f'This is the document number: {i}'))

    log.info('Benchmarking DAM extend')
    st = time.perf_counter()
    dam.extend(dlist)
    dam_extend_time = time.perf_counter() - st
    log.info(
        '%.0f qps within %.2f seconds', dam_size / dam_extend_time, dam_extend_time
    )
    return {
        'dam_extend_time': dam_extend_time,
        'dam_extend_qps': dam_size / dam_extend_time,
    }

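
# Hypothetical entry point for running the benchmark above directly; the `log`
# handle is the module-level logger already used in _benchmark_dam_extend_qps.
if __name__ == '__main__':
    stats = _benchmark_dam_extend_qps()
    log.info(
        'DAM extend: %.2f s, %.0f docs/s',
        stats['dam_extend_time'],
        stats['dam_extend_qps'],
    )
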