def test_flow_with_modalities(tmpdir):
    """Index three two-part documents and check each modality is stored apart.

    The flow feeds the output of ``crafter`` (``!MockSegmenter``) into two
    branches, ``encoder1 -> indexer1`` and ``encoder2 -> indexer2``, and joins
    them again. After indexing, the files written under ``tmpdir`` are checked:

    * ``vec1.gz`` must decode to a 3x3 array of zeros and ``vec2.gz`` to a
      3x3 array of ones.
    * ``kvidx1.bin`` and ``kvidx2.bin`` must each contain three entries; the
      first entry is expected to be the ``mode1`` (title) respectively
      ``mode2`` (body) part of doc1.

    :param tmpdir: temporary directory whose path is exported through
        ``JINA_TEST_FLOW_MULTIMODE_WORKSPACE`` (presumably consumed by the
        YAML configs as their workspace -- confirm against the yaml files).
    """
    workspace_var = 'JINA_TEST_FLOW_MULTIMODE_WORKSPACE'
    os.environ[workspace_var] = str(tmpdir)

    # Everything after exporting the variable runs inside try/finally so a
    # failing assertion cannot leak the variable into other tests.
    try:
        def input_fn():
            texts = (
                'title: this is mode1 from doc1, body: this is mode2 from doc1',
                'title: this is mode1 from doc2, body: this is mode2 from doc2',
                'title: this is mode1 from doc3, body: this is mode2 from doc3',
            )
            docs = []
            for text in texts:
                doc = jina_pb2.Document()
                doc.text = text
                # The id is computed after the text has been assigned.
                doc.id = uid.new_doc_id(doc)
                docs.append(doc)
            return docs

        flow = (
            Flow()
            .add(name='crafter', uses='!MockSegmenter')
            .add(name='encoder1', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode1.yml'))
            .add(name='indexer1', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-1.yml'),
                 needs=['encoder1'])
            .add(name='encoder2', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode2.yml'),
                 needs=['crafter'])
            .add(name='indexer2', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-2.yml'))
            .join(['indexer1', 'indexer2'])
        )

        with flow:
            flow.index(input_fn=input_fn, override_doc_id=False)

        # Vector index of the first branch: three 3-dimensional zero vectors.
        with open(tmpdir.join('vec1.gz'), 'rb') as fp:
            result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
            np.testing.assert_equal(
                result,
                np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]),
            )

        # Vector index of the second branch: three 3-dimensional one vectors.
        with open(tmpdir.join('vec2.gz'), 'rb') as fp:
            result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
            np.testing.assert_equal(
                result,
                np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]),
            )

        # Key-value index of the first branch: one chunk per input document.
        chunkIndexer1 = BinaryPbIndexer.load(tmpdir.join('kvidx1.bin'))
        assert chunkIndexer1.size == 3
        d_id = list(chunkIndexer1.query_handler.header.keys())[0]
        query_doc = jina_pb2.Document()
        query_doc.ParseFromString(chunkIndexer1.query(d_id))
        assert query_doc.text == 'title: this is mode1 from doc1'
        assert query_doc.modality == 'mode1'

        # Key-value index of the second branch. The expected text keeps the
        # leading space that follows the comma in the source document.
        chunkIndexer2 = BinaryPbIndexer.load(tmpdir.join('kvidx2.bin'))
        assert chunkIndexer2.size == 3
        d_id = list(chunkIndexer2.query_handler.header.keys())[0]
        query_doc = jina_pb2.Document()
        query_doc.ParseFromString(chunkIndexer2.query(d_id))
        assert query_doc.text == ' body: this is mode2 from doc1'
        assert query_doc.modality == 'mode2'
    finally:
        os.environ.pop(workspace_var, None)
def test_flow_with_modalities():
    """Run the two-branch modality flow and inspect the indexes it writes.

    Three documents with ids 1-3 are indexed through a flow whose ``crafter``
    output goes to ``encoder1 -> indexer1`` and ``encoder2 -> indexer2``
    before being joined. The test then reads ``vec1.gz`` / ``vec2.gz`` and
    ``kvidx1.bin`` / ``kvidx2.bin`` using relative paths and checks:

    * the first vector file is a 3x3 block of zeros, the second of ones;
    * each key-value index reports a size of 6 and its first entry carries
      modality ``mode1`` respectively ``mode2``.
    """
    doc_specs = (
        (1, 'title: this is mode1 from doc1, body: this is mode2 from doc1'),
        (2, 'title: this is mode1 from doc2, body: this is mode2 from doc2'),
        (3, 'title: this is mode1 from doc3, body: this is mode2 from doc3'),
    )

    def input_fn():
        documents = []
        for doc_id, doc_text in doc_specs:
            document = Document()
            document.id = doc_id
            document.text = doc_text
            documents.append(document)
        return documents

    def load_vectors(path):
        # Decode the raw float buffer into rows of three values.
        with open(path, 'rb') as handle:
            return np.frombuffer(handle.read(), dtype='float').reshape([-1, 3])

    encoder1_yaml = os.path.join(cur_dir, 'yaml/mockencoder-mode1.yml')
    indexer1_yaml = os.path.join(cur_dir, 'yaml/numpy-indexer-1.yml')
    encoder2_yaml = os.path.join(cur_dir, 'yaml/mockencoder-mode2.yml')
    indexer2_yaml = os.path.join(cur_dir, 'yaml/numpy-indexer-2.yml')

    flow = (
        Flow()
        .add(name='crafter', uses='!MockSegmenter')
        .add(name='encoder1', uses=encoder1_yaml)
        .add(name='indexer1', uses=indexer1_yaml, needs=['encoder1'])
        .add(name='encoder2', uses=encoder2_yaml, needs=['crafter'])
        .add(name='indexer2', uses=indexer2_yaml)
        .join(['indexer1', 'indexer2'])
    )

    with flow:
        flow.index(input_fn=input_fn)

    np.testing.assert_equal(load_vectors('vec1.gz'), np.zeros((3, 3)))
    np.testing.assert_equal(load_vectors('vec2.gz'), np.ones((3, 3)))

    mode1_store = BinaryPbIndexer.load('kvidx1.bin')
    assert mode1_store.size == 6
    first_key = list(mode1_store.query_handler.header.keys())[0]
    assert mode1_store.query(first_key).modality == 'mode1'

    mode2_store = BinaryPbIndexer.load('kvidx2.bin')
    assert mode2_store.size == 6
    first_key = list(mode2_store.query_handler.header.keys())[0]
    assert mode2_store.query(first_key).modality == 'mode2'
def test_binary_pb():
    """Round-trip documents through ``BinaryPbIndexer`` and read them back.

    One hundred documents from ``random_docs`` are added to an indexer named
    ``'test-shelf'`` and saved. The indexer is then reloaded from the path it
    was saved to, and every position ``0..99`` is queried and compared with
    the document that was originally at that position.
    """
    doc_count = 100
    originals = list(random_docs(doc_count, jitter=50))

    with BinaryPbIndexer('test-shelf') as writer:
        writer.add(originals)
        writer.save()

    # The save path is read from the writer after its context has exited.
    with BinaryPbIndexer.load(writer.save_abspath) as reader:
        assert reader.size == doc_count
        for position in range(doc_count):
            retrieved = reader.query(position)
            assert retrieved == originals[position]
def test_flow_with_modalities(tmpdir, restful):
    """Index three two-part documents and check each modality is stored apart.

    The flow feeds the output of ``segmenter`` (``!MockSegmenter``) into two
    branches, ``encoder1 -> indexer1`` and ``encoder2 -> indexer2``, and joins
    them again. After indexing, the files under ``<tmpdir>/compound`` are
    checked:

    * ``vecidx1-0/vec1.gz`` must decode to a 3x3 array of zeros and
      ``vecidx2-0/vec2.gz`` to a 3x3 array of ones.
    * ``kvidx1-0/kvidx1.bin`` and ``kvidx2-0/kvidx2.bin`` must each contain
      three entries; the first entry is expected to be the ``mode1`` (title)
      respectively ``mode2`` (body) part of doc1.

    :param tmpdir: temporary directory whose path is exported through
        ``JINA_TEST_FLOW_MULTIMODE_WORKSPACE`` (presumably consumed by the
        YAML configs as their workspace -- confirm against the yaml files).
    :param restful: forwarded unchanged to ``Flow(restful=...)``.
    """
    workspace_var = 'JINA_TEST_FLOW_MULTIMODE_WORKSPACE'
    os.environ[workspace_var] = str(tmpdir)

    # Everything after exporting the variable runs inside try/finally so a
    # failing assertion cannot leak the variable into other tests.
    try:
        def input_function():
            specs = (
                ('title: this is mode1 from doc1, body: this is mode2 from doc1', '1'),
                ('title: this is mode1 from doc2, body: this is mode2 from doc2', '2'),
                ('title: this is mode1 from doc3, body: this is mode2 from doc3', '3'),
            )
            docs = []
            for text, doc_id in specs:
                doc = jina_pb2.DocumentProto()
                doc.text = text
                doc.id = doc_id
                docs.append(doc)
            return docs

        flow = (
            Flow(restful=restful)
            .add(name='segmenter', uses='!MockSegmenter')
            .add(name='encoder1', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode1.yml'))
            .add(
                name='indexer1',
                uses=os.path.join(cur_dir, 'yaml/numpy-indexer-1.yml'),
                needs=['encoder1'],
            )
            .add(
                name='encoder2',
                uses=os.path.join(cur_dir, 'yaml/mockencoder-mode2.yml'),
                needs=['segmenter'],
            )
            .add(name='indexer2', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-2.yml'))
            .join(['indexer1', 'indexer2'])
        )

        with flow:
            flow.index(inputs=input_function)

        # Vector index of the first branch: three 3-dimensional zero vectors.
        with open(os.path.join(tmpdir, 'compound', 'vecidx1-0', 'vec1.gz'), 'rb') as fp:
            result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
            np.testing.assert_equal(
                result, np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
            )

        # Vector index of the second branch: three 3-dimensional one vectors.
        with open(os.path.join(tmpdir, 'compound', 'vecidx2-0', 'vec2.gz'), 'rb') as fp:
            result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
            np.testing.assert_equal(
                result, np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]])
            )

        # Key-value index of the first branch: one chunk per input document.
        chunkIndexer1 = BinaryPbIndexer.load(
            os.path.join(tmpdir, 'compound', 'kvidx1-0', 'kvidx1.bin')
        )
        assert chunkIndexer1.size == 3
        d_id = list(chunkIndexer1.query_handler.header.keys())[0]
        query_doc = jina_pb2.DocumentProto()
        # query() is called with a list of keys; the first result is parsed.
        query_doc.ParseFromString(chunkIndexer1.query([d_id])[0])
        assert query_doc.text == 'title: this is mode1 from doc1'
        assert query_doc.modality == 'mode1'

        # Key-value index of the second branch. The expected text keeps the
        # leading space that follows the comma in the source document.
        chunkIndexer2 = BinaryPbIndexer.load(
            os.path.join(tmpdir, 'compound', 'kvidx2-0', 'kvidx2.bin')
        )
        assert chunkIndexer2.size == 3
        d_id = list(chunkIndexer2.query_handler.header.keys())[0]
        query_doc = jina_pb2.DocumentProto()
        query_doc.ParseFromString(chunkIndexer2.query([d_id])[0])
        assert query_doc.text == ' body: this is mode2 from doc1'
        assert query_doc.modality == 'mode2'
    finally:
        os.environ.pop(workspace_var, None)