def test_pdf_flow_mix_buffer():
    """Search a PDF supplied as raw bytes through a PDFExtractorSegmenter flow."""
    pdf_path = os.path.join(cur_dir, 'cats_are_awesome.pdf')
    with open(pdf_path, 'rb') as pdf_file:
        raw_bytes = pdf_file.read()

    flow = Flow().add(uses='PDFExtractorSegmenter', array_in_pb=True)
    with flow:
        # path=None forces the generator to feed the in-memory buffer instead
        flow.search(
            input_fn=search_generator(path=None, buffer=raw_bytes),
            on_done=validate_mix_fn,
        )
def test_shards_insufficient_data(self):
    """THIS IS SUPER IMPORTANT FOR TESTING SHARDS
    IF THIS FAILED, DONT IGNORE IT, DEBUG IT
    """
    num_index_docs = 3
    num_replicas = 4

    def check_response(req):
        # one query doc, and every indexed doc must come back as a match
        self.assertEqual(len(req.docs), 1)
        self.assertEqual(len(req.docs[0].matches), num_index_docs)
        for match in req.docs[0].matches:
            self.assertTrue(hasattr(match.match, 'weight'))
            self.assertIsNotNone(match.match.weight)
            self.assertEqual(match.match.meta_info, b'hello world')

    index_flow = Flow().add(
        name='doc_pb',
        yaml_path=os.path.join(cur_dir, '../yaml/test-docpb.yml'),
        replicas=num_replicas,
        separated_workspace=True,
    )
    with index_flow:
        index_flow.index(input_fn=random_docs(num_index_docs), random_doc_id=False)
        time.sleep(2)
    # re-enter once more to make sure the workspace survives a restart
    with index_flow:
        pass
    time.sleep(2)

    query_flow = Flow().add(
        name='doc_pb',
        yaml_path=os.path.join(cur_dir, '../yaml/test-docpb.yml'),
        replicas=num_replicas,
        separated_workspace=True,
        polling='all',
        reducing_yaml_path='_merge_all',
    )
    with query_flow:
        query_flow.search(
            input_fn=random_queries(1, num_index_docs),
            random_doc_id=False,
            output_fn=check_response,
            callback_on_body=True,
        )
        time.sleep(2)
    self.add_tmpfile('test-docshard-tmp')
def test_normal(docs):
    """Every replica and every shard must receive traffic when searching doc-by-doc."""
    NUM_REPLICAS = 3
    NUM_SHARDS = 2
    routed = collections.OrderedDict()

    def record_route(resp):
        # remember which (replica, shard) served each doc
        for doc in resp.search.docs:
            routed[int(doc.id)] = (doc.tags['replica'], doc.tags['shard'])

    flow = Flow().add(
        name='pod1',
        uses='!DummyMarkExecutor',
        replicas=NUM_REPLICAS,
        parallel=NUM_SHARDS,
    )
    with flow:
        # request_size=1 so each doc can land on a different replica
        flow.search(inputs=docs, request_size=1, on_done=record_route)

    assert len(routed) == len(docs)

    replicas_seen = {replica for replica, _ in routed.values()}
    assert len(replicas_seen) == NUM_REPLICAS

    shards_per_replica = collections.defaultdict(list)
    for replica, shard in routed.values():
        shards_per_replica[replica].append(shard)
    assert len(shards_per_replica) == NUM_REPLICAS
    for seen_shards in shards_per_replica.values():
        assert len(set(seen_shards)) == NUM_SHARDS
def test_chunk_joint_idx(self):
    """Index through a joint indexer, then query it directly and via a wrapper YAML."""
    f = Flow().add(yaml_path=os.path.join(cur_dir, 'yaml/test-joint.yml'))

    def make_docs(num_docs, chunks_per_doc=5, embed_dim=10):
        # build docs with globally unique chunk ids and random embeddings
        chunk_id = 0
        for doc_idx in range(num_docs):
            doc = jina_pb2.Document()
            for _ in range(chunks_per_doc):
                chunk = doc.chunks.add()
                chunk.embedding.CopyFrom(array2pb(np.random.random([embed_dim])))
                chunk.chunk_id = chunk_id
                chunk.doc_id = doc_idx
                chunk_id += 1
            yield doc

    def check(req, indexer_name):
        # the top result must have been scored by the expected indexer
        self.assertTrue(req.status.code < jina_pb2.Status.ERROR)
        self.assertEqual(req.search.docs[0].chunks[0].topk_results[0].score.op_name, indexer_name)

    with f:
        f.index(make_docs(100))

    g = Flow().add(yaml_path=os.path.join(cur_dir, 'yaml/test-joint.yml'))
    with g:
        g.search(make_docs(10), output_fn=lambda x: check(x, 'NumpyIndexer'))

    g = Flow(timeout_ready=-1).add(yaml_path=os.path.join(cur_dir, 'yaml/test-joint-wrap.yml'))
    with g:
        g.search(make_docs(10), output_fn=lambda x: check(x, 'AnnoyIndexer'))
def test_high_order_matches():
    """Index then search through the adjacency YAML and validate the matches."""
    flow = Flow(callback_on_body=True).add(
        uses=os.path.join(cur_dir, 'test-adjacency.yml'))
    with flow:
        flow.index(random_docs(100))
    with flow:
        flow.search(random_docs(1), output_fn=validate)
def test_high_order_matches_integrated():
    """Same check as test_high_order_matches but with the simplified YAML spec."""
    flow = Flow(callback_on_body=True).add(
        uses=os.path.join(cur_dir, 'test-adjacency-integrated.yml'))
    with flow:
        flow.index(random_docs(100))
    with flow:
        flow.search(random_docs(1), output_fn=validate)
def query(top_k):
    """Interactively read sentences and print their top-k semantic matches.

    Loops until the user submits an empty line.
    """
    flow = Flow().load_config(QUERY_FLOW_YAML)
    with flow:
        while True:
            text = input('\n\n\n- Please type a sentence to find semantic similarity in multiple languages : ')
            if not text:
                break

            def print_results(resp, query_text=text):
                # bind the current query text so the callback prints the right header
                print_topk(resp, query_text)

            flow.search(read_query_data(text), callback=print_results, topk=top_k)
def test_high_order_matches():
    """Index then search through the adjacency YAML and validate the matches.

    The on-disk index directory is removed in a ``finally`` block so that a
    failing run does not leak state into subsequent tests, and a missing
    directory does not mask the real failure.
    """
    f = Flow(callback_on_body=True).add(uses=os.path.join(cur_dir, 'test-adjacency.yml'))
    try:
        with f:
            f.index(random_docs(100))
        with f:
            f.search(random_docs(1), output_fn=validate)
    finally:
        # best-effort cleanup: the original rmtree only ran on success and
        # raised if the directory was absent (ignore_errors=False)
        shutil.rmtree('test-index-file', ignore_errors=True)
def test_simple_run(docs):
    """A rolling update issued between two searches must not hang the flow."""
    f = Flow().add(name='pod1', replicas=2, parallel=3)
    with f:
        f.search(docs)
        # trigger the update mid-stream; the flow must stay responsive
        f.rolling_update('pod1', None)
        f.search(docs)
def test_index():
    """Index 1000 docs across 3 parallel peas with separated workspaces, then search."""
    flow = Flow().add(
        uses=os.path.join(cur_dir, 'yaml/test-index.yml'),
        parallel=3,
        separated_workspace=True,
    )
    with flow:
        flow.index(input_fn=random_docs(1000))

    # every pea must have written its own binary index and tmp file
    for j in range(3):
        assert os.path.exists(f'test2-{j + 1}/test2.bin')
        assert os.path.exists(f'test2-{j + 1}/tmp2')

    with flow:
        flow.search(input_fn=random_docs(2), output_fn=get_result, top_k=50)
def test_binarypb_in_flow():
    """Docs round-tripped through a binarypb index keep their embedding bytes."""
    docs = list(random_docs(10))
    flow = Flow(callback_on_body=True).add(uses='binarypb.yml')

    with flow:
        flow.index(docs, override_doc_id=False)

    def check_embeddings(req):
        # retrieved docs must carry byte-identical embedding buffers
        for got, expected in zip(req.docs, docs):
            assert got.embedding.buffer == expected.embedding.buffer

    with flow:
        flow.search(docs, output_fn=check_embeddings, override_doc_id=False)
def test_index(self):
    """Index into 3 separated workspaces, register them for cleanup, then search."""
    flow = Flow().add(
        uses=os.path.join(cur_dir, 'yaml/test-index.yml'),
        parallel=3,
        separated_workspace=True,
    )
    with flow:
        flow.index(input_fn=random_docs(1000))

    for j in range(3):
        # each pea must have produced its own index artifacts
        self.assertTrue(os.path.exists(f'test2-{j + 1}/test2.bin'))
        self.assertTrue(os.path.exists(f'test2-{j + 1}/tmp2'))
        self.add_tmpfile(f'test2-{j + 1}/test2.bin', f'test2-{j + 1}/tmp2', f'test2-{j + 1}')

    time.sleep(3)
    with flow:
        flow.search(input_fn=random_docs(2), output_fn=get_result)
def test_thread_run(docs):
    """A rolling update running concurrently with searches must not deadlock."""
    flow = Flow().add(
        name='pod1',
        replicas=2,
        parallel=2,
        timeout_ready=30000,
    )
    with flow:
        updater = threading.Thread(target=flow.rolling_update, args=('pod1',))
        for round_idx in range(50):
            flow.search(docs)
            if round_idx == 5:
                # kick off the update while searches keep flowing
                updater.start()
        updater.join()
def test_index_depth_0_search_depth_1():
    """Index whole docs (granularity 0) and search by chunk (granularity 1).

    Each granularity-1 query must match back to the granularity-0 parent doc
    it was cut from.
    """
    os.environ['CUR_DIR_GRANULARITY'] = cur_dir
    os.environ['TEST_WORKDIR'] = os.getcwd()

    index_data = [
        'I am chunk 0 of doc 1, I am chunk 1 of doc 1, I am chunk 2 of doc 1',
        'I am chunk 0 of doc 2, I am chunk 1 of doc 2',
        'I am chunk 0 of doc 3, I am chunk 1 of doc 3, I am chunk 2 of doc 3, I am chunk 3 of doc 3',
    ]
    index_flow = Flow().load_config('flow-index.yml')
    with index_flow:
        index_flow.index(index_data)

    # (query chunk text, expected full parent-doc text) per query doc, in order
    expected_pairs = [
        ('I am chunk 1 of doc 1,',
         'I am chunk 0 of doc 1, I am chunk 1 of doc 1, I am chunk 2 of doc 1'),
        ('I am chunk 0 of doc 2,',
         'I am chunk 0 of doc 2, I am chunk 1 of doc 2'),
        ('I am chunk 3 of doc 3',
         'I am chunk 0 of doc 3, I am chunk 1 of doc 3, I am chunk 2 of doc 3, I am chunk 3 of doc 3'),
    ]

    def check_granularity_1(resp):
        assert len(resp.docs) == 3
        for doc in resp.docs:
            assert doc.granularity == 1
            assert len(doc.matches) == 1
            assert doc.matches[0].id == doc.id  # done on purpose
            assert doc.matches[0].granularity == 0
        for doc, (chunk_text, parent_text) in zip(resp.docs, expected_pairs):
            assert doc.text == chunk_text
            assert doc.matches[0].text == parent_text

    search_data = [
        'I am chunk 1 of doc 1,',
        'I am chunk 0 of doc 2,',
        'I am chunk 3 of doc 3',
    ]
    search_flow = Flow().load_config('flow-query.yml')
    with search_flow:
        search_flow.search(
            input_fn=search_data,
            output_fn=check_granularity_1,
            callback_on_body=True,
            granularity=1,
        )

    rm_files([os.path.join(os.getenv('TEST_WORKDIR'), 'test_workspace')])
    del os.environ['CUR_DIR_GRANULARITY']
    del os.environ['TEST_WORKDIR']
def test_binarypb_in_flow(test_metas):
    """Embeddings stripped from the query must be restored from the binarypb index."""
    docs = list(random_docs(10))
    flow = Flow(callback_on='body').add(uses='binarypb.yml')

    with flow:
        flow.index(docs, override_doc_id=False)

    def check_embeddings(req):
        assert len(docs) == len(req.docs)
        for got, expected in zip(req.docs, docs):
            np.testing.assert_almost_equal(
                NdArray(got.embedding).value, NdArray(expected.embedding).value)

    # query with the embeddings cleared: they must come back from the index
    queries = copy.deepcopy(docs)
    for q in queries:
        q.ClearField('embedding')

    with flow:
        flow.search(queries, output_fn=check_embeddings, override_doc_id=False)
def test_shards_insufficient_data(mocker, restful, docpb_workspace):
    """THIS IS SUPER IMPORTANT FOR TESTING SHARDS
    IF THIS FAILED, DONT IGNORE IT, DEBUG IT
    """
    index_docs = 3
    parallel = 4
    mock = mocker.Mock()

    def validate(req):
        # one query doc, every indexed doc returned, each match carrying a weight
        assert len(req.docs) == 1
        assert len(req.docs[0].matches) == index_docs
        for match in req.docs[0].matches:
            assert hasattr(match, 'weight')
            assert match.weight

    index_flow = Flow(restful=restful).add(
        name='doc_pb',
        uses=os.path.join(cur_dir, '../yaml/test-docpb.yml'),
        parallel=parallel,
    )
    with index_flow:
        index_flow.index(inputs=random_docs(index_docs))
        time.sleep(2)
    # re-enter once more to make sure the workspace survives a restart
    with index_flow:
        pass
    time.sleep(2)

    query_flow = Flow(restful=restful).add(
        name='doc_pb',
        uses=os.path.join(cur_dir, '../yaml/test-docpb.yml'),
        parallel=parallel,
        polling='all',
        uses_after='_merge_chunks',
    )
    with query_flow:
        query_flow.search(inputs=random_queries(1, index_docs), on_done=mock)
        time.sleep(2)

    mock.assert_called_once()
    validate_callback(mock, validate)