def test_flow_with_modalities(tmpdir):
    """Index multi-modal docs through a two-branch flow; verify each branch
    stored only its own modality.
    """
    # The indexer YAMLs resolve their workspace from this env var — TODO confirm.
    os.environ['JINA_TEST_FLOW_MULTIMODE_WORKSPACE'] = str(tmpdir)

    def input_fn():
        # Three docs, each with a 'title' part (mode1) and a 'body' part
        # (mode2); presumably MockSegmenter splits them — verify against its YAML.
        doc1 = jina_pb2.Document()
        doc1.text = 'title: this is mode1 from doc1, body: this is mode2 from doc1'
        doc1.id = uid.new_doc_id(doc1)
        doc2 = jina_pb2.Document()
        doc2.text = 'title: this is mode1 from doc2, body: this is mode2 from doc2'
        doc2.id = uid.new_doc_id(doc2)
        doc3 = jina_pb2.Document()
        doc3.text = 'title: this is mode1 from doc3, body: this is mode2 from doc3'
        doc3.id = uid.new_doc_id(doc3)
        return [doc1, doc2, doc3]

    # Segment once, fan out into two encoder->indexer branches, join them back.
    flow = Flow().add(name='crafter', uses='!MockSegmenter'). \
        add(name='encoder1', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode1.yml')). \
        add(name='indexer1', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-1.yml'), needs=['encoder1']). \
        add(name='encoder2', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode2.yml'), needs=['crafter']). \
        add(name='indexer2', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-2.yml')). \
        join(['indexer1', 'indexer2'])

    with flow:
        flow.index(input_fn=input_fn, override_doc_id=False)

    # Branch 1 is expected to have written all-zero 3-d vectors, branch 2
    # all-ones (presumably hard-coded in the mock encoder YAMLs — confirm).
    with open(tmpdir.join('vec1.gz'), 'rb') as fp:
        result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
        np.testing.assert_equal(result, np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]))
    with open(tmpdir.join('vec2.gz'), 'rb') as fp:
        result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
        np.testing.assert_equal(result, np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]))

    # Key-value store of branch 1 must hold only 'mode1' chunks.
    chunkIndexer1 = BinaryPbIndexer.load(tmpdir.join('kvidx1.bin'))
    assert chunkIndexer1.size == 3
    d_id = list(chunkIndexer1.query_handler.header.keys())[0]

    query_doc = jina_pb2.Document()
    query_doc.ParseFromString(chunkIndexer1.query(d_id))
    assert query_doc.text == 'title: this is mode1 from doc1'
    assert query_doc.modality == 'mode1'

    # Key-value store of branch 2 must hold only 'mode2' chunks.
    chunkIndexer2 = BinaryPbIndexer.load(tmpdir.join('kvidx2.bin'))
    assert chunkIndexer2.size == 3
    d_id = list(chunkIndexer2.query_handler.header.keys())[0]

    query_doc = jina_pb2.Document()
    query_doc.ParseFromString(chunkIndexer2.query(d_id))
    assert query_doc.text == ' body: this is mode2 from doc1'
    assert query_doc.modality == 'mode2'

    # Clean up so other tests are not affected by the workspace override.
    del os.environ['JINA_TEST_FLOW_MULTIMODE_WORKSPACE']
def test_flow_with_modalitys(self):
    """Index multi-modal docs through a two-branch flow (unittest style);
    each branch's chunk store must contain only its own modality.

    NOTE(review): function name has a typo ('modalitys'); left unchanged
    because it is the test's discovery name.
    """
    def input_fn():
        # Three docs, each with a 'title' (mode1) and 'body' (mode2) part.
        doc1 = Document()
        doc1.id = 1
        doc1.text = 'title: this is mode1 from doc1, body: this is mode2 from doc1'
        doc2 = Document()
        doc2.id = 2
        doc2.text = 'title: this is mode1 from doc2, body: this is mode2 from doc2'
        doc3 = Document()
        doc3.id = 3
        doc3.text = 'title: this is mode1 from doc3, body: this is mode2 from doc3'
        return [doc1, doc2, doc3]

    # Segment once, fan out into two encoder->indexer branches, join back.
    flow = Flow().add(name='crafter', uses='!MockSegmenter'). \
        add(name='encoder1', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode1.yml')). \
        add(name='indexer1', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-1.yml'), needs=['encoder1']). \
        add(name='encoder2', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode2.yml'), needs=['crafter']). \
        add(name='indexer2', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-2.yml')). \
        join(['indexer1', 'indexer2'])

    # Register every artifact the flow writes for teardown cleanup.
    self.add_tmpfile('vec1.gz')
    self.add_tmpfile('vec2.gz')
    self.add_tmpfile('chunk1.gz')
    self.add_tmpfile('chunk2.gz')
    self.add_tmpfile('vecidx1.bin')
    self.add_tmpfile('vecidx2.bin')
    self.add_tmpfile('kvidx1.bin')
    self.add_tmpfile('kvidx2.bin')

    with flow:
        flow.index(input_fn=input_fn)

    # Branch 1 is expected to produce all-zero 3-d vectors, branch 2
    # all-ones (presumably fixed by the mock encoder YAMLs — confirm).
    with gzip.open('vec1.gz', 'rb') as fp:
        result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
        np.testing.assert_equal(result, np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]))
    with gzip.open('vec2.gz', 'rb') as fp:
        result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
        np.testing.assert_equal(result, np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]))

    # Every chunk stored by branch 1 must be modality 'mode1'.
    chunkIndexer1 = BinaryPbIndexer(index_filename='chunk1.gz')
    self.assertEqual(len(chunkIndexer1.query_handler.items()), 3)
    for key, pb in chunkIndexer1.query_handler.items():
        for chunk in pb.chunks:
            self.assertEqual(chunk.modality, 'mode1')

    # Every chunk stored by branch 2 must be modality 'mode2'.
    chunkIndexer2 = BinaryPbIndexer(index_filename='chunk2.gz')
    self.assertEqual(len(chunkIndexer2.query_handler.items()), 3)
    for key, pb in chunkIndexer2.query_handler.items():
        for chunk in pb.chunks:
            self.assertEqual(chunk.modality, 'mode2')
def test_binary_pb():
    """Round-trip 100 random documents through a BinaryPbIndexer and read
    them back by key after reloading from disk."""
    total = 100
    documents = list(random_docs(total, jitter=50))
    with BinaryPbIndexer('test-shelf') as indexer:
        indexer.add(documents)
        indexer.save()
    # Reload from the dumped state and verify every doc survived intact.
    with BinaryPbIndexer.load(indexer.save_abspath) as loaded:
        assert loaded.size == total
        for key, doc in enumerate(documents):
            assert loaded.query(key) == doc
def test_binarypb_add_and_update_not_working(test_metas, delete_on_dump):
    """`update` directly after `add` in the same open context raises; after
    reloading the dump, the update succeeds."""
    with BinaryPbIndexer(metas=test_metas, delete_on_dump=delete_on_dump) as indexer:
        indexer.add(['11', '12', '13'], [b'eleven', b'twelve', b'thirteen'])
        indexer.save()
        # FIXME `add` and `update` won't work in the same context:
        # `.save` calls `.flush` on a closed handler, and the handler needs
        # to have been closed for us to allow querying in the `.update`.
        with pytest.raises(AttributeError):
            indexer.update(['12'], [b'twelve-new'])
            indexer.save()
        assert indexer.size == 3
        dump_path = indexer.save_abspath
    # After a reload, the very same update goes through.
    with BaseIndexer.load(dump_path) as indexer:
        indexer.update(['12'], [b'twelve-new'])
        indexer.save()
    with BaseIndexer.load(dump_path) as indexer:
        assert indexer.query(['11']) == [b'eleven']
        assert indexer.query(['12']) == [b'twelve-new']
        assert indexer.query(['12', '13']) == [b'twelve-new', b'thirteen']
        assert indexer.size == 3
        assert indexer.sample() in (b'eleven', b'twelve-new', b'thirteen')
def test_binarypb_update1(test_metas, delete_on_dump):
    """Full and partial updates via a reloaded indexer; on-disk size only
    shrinks back to the original when `delete_on_dump` compacts the file."""
    with BinaryPbIndexer(metas=test_metas, delete_on_dump=delete_on_dump) as indexer:
        indexer.add(['1', '2', '3'], [b'oldvalue', b'same', b'random'])
        indexer.save()
        assert indexer.size == 3
        initial_size = os.path.getsize(indexer.index_abspath)
        dump_path = indexer.save_abspath

    with BaseIndexer.load(dump_path) as indexer:
        assert indexer.query(['1']) == [b'oldvalue']

    # A pure reload + query must not grow the file.
    with BaseIndexer.load(dump_path) as indexer:
        assert indexer.query(['1']) == [b'oldvalue']
        size_after_reload = os.path.getsize(indexer.index_abspath)
        assert size_after_reload == initial_size

    with BaseIndexer.load(dump_path) as indexer:
        # Overwrite key '1', re-write key '2' with its current value.
        indexer.update(['1', '2'], [b'newvalue', b'same'])
        indexer.save()

    size_after_update = os.path.getsize(indexer.index_abspath)
    if delete_on_dump:
        # Compaction reclaims the superseded records.
        assert size_after_update == initial_size
    else:
        # Append-only: superseded records stay on disk.
        assert size_after_update > initial_size
    assert indexer.size == 3

    with BaseIndexer.load(dump_path) as indexer:
        assert indexer.query(['1']) == [b'newvalue']
        assert indexer.query(['2']) == [b'same']
        assert indexer.query(['3']) == [b'random']
        assert indexer.query(['99']) == [None]

    with BaseIndexer.load(dump_path) as indexer:
        # Partial update: the missing key '99' is skipped, the rest applied.
        indexer.update(['1', '2', '99'], [b'abcvalue', b'abcd', b'WILL_BE_IGNORED'])
        indexer.save()
        assert indexer.size == 3
        size_after_partial = os.path.getsize(indexer.index_abspath)
        if delete_on_dump:
            assert size_after_partial == initial_size
        else:
            assert size_after_partial > initial_size
        assert indexer.size == 3

    with BaseIndexer.load(dump_path) as indexer:
        assert indexer.query(['1']) == [b'abcvalue']
        assert indexer.query(['2']) == [b'abcd']
        assert indexer.query(['3']) == [b'random']
        assert indexer.query(['99']) == [None]
        assert indexer.query(['1', '2']) == [b'abcvalue', b'abcd']
        assert indexer.query(['1', '2', '3']) == [b'abcvalue', b'abcd', b'random']
def test_kvindexer_iterate(test_metas):
    """Iterating a loaded BinaryPbIndexer yields every stored value, in
    insertion order, each wrapped in a single-element list.

    (Docstring fixed: it previously said "two updates in a row does work",
    copy-pasted from the update-twice test — this test performs no updates.)
    """
    with BinaryPbIndexer(metas=test_metas) as idxer:
        idxer.add(['1', '2', '3'], [b'oldvalue', b'same', b'random'])
        save_abspath = idxer.save_abspath
    with BaseIndexer.load(save_abspath) as idxer:
        assert list(idxer) == [[b'oldvalue'], [b'same'], [b'random']]
def test_flow_with_modalities():
    """Index multi-modal docs through a two-branch flow; each branch's
    key-value store must contain only chunks of its own modality.
    """
    def input_fn():
        # Three docs, each with a 'title' (mode1) and 'body' (mode2) part.
        doc1 = Document()
        doc1.id = 1
        doc1.text = 'title: this is mode1 from doc1, body: this is mode2 from doc1'
        doc2 = Document()
        doc2.id = 2
        doc2.text = 'title: this is mode1 from doc2, body: this is mode2 from doc2'
        doc3 = Document()
        doc3.id = 3
        doc3.text = 'title: this is mode1 from doc3, body: this is mode2 from doc3'
        return [doc1, doc2, doc3]

    # Segment once, fan out into two encoder->indexer branches, join back.
    flow = Flow().add(name='crafter', uses='!MockSegmenter'). \
        add(name='encoder1', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode1.yml')). \
        add(name='indexer1', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-1.yml'), needs=['encoder1']). \
        add(name='encoder2', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode2.yml'), needs=['crafter']). \
        add(name='indexer2', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-2.yml')). \
        join(['indexer1', 'indexer2'])

    with flow:
        flow.index(input_fn=input_fn)

    # Branch 1 is expected to write all-zero 3-d vectors, branch 2 all-ones
    # (presumably fixed by the mock encoder YAMLs — confirm).
    with open('vec1.gz', 'rb') as fp:
        result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
        np.testing.assert_equal(result, np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]))
    with open('vec2.gz', 'rb') as fp:
        result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
        np.testing.assert_equal(result, np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]))

    # Spot-check one entry per store for the expected modality.
    chunkIndexer1 = BinaryPbIndexer.load('kvidx1.bin')
    assert chunkIndexer1.size == 6
    d_id = list(chunkIndexer1.query_handler.header.keys())[0]
    assert chunkIndexer1.query(d_id).modality == 'mode1'

    chunkIndexer2 = BinaryPbIndexer.load('kvidx2.bin')
    assert chunkIndexer2.size == 6
    d_id = list(chunkIndexer2.query_handler.header.keys())[0]
    assert chunkIndexer2.query(d_id).modality == 'mode2'
def test_binarypb_update_twice(test_metas):
    """Two consecutive updates in the same reloaded context both take effect."""
    with BinaryPbIndexer(metas=test_metas) as indexer:
        indexer.add(['1', '2', '3'], [b'oldvalue', b'same', b'random'])
        indexer.save()
        assert indexer.size == 3
        dump_path = indexer.save_abspath
    with BaseIndexer.load(dump_path) as indexer:
        indexer.update(['1'], [b'newvalue'])
        indexer.update(['2'], [b'othernewvalue'])
        indexer.save()
    with BaseIndexer.load(dump_path) as indexer:
        assert indexer.query('1') == b'newvalue'
        assert indexer.query('2') == b'othernewvalue'
def test_binarypb_delete(test_metas):
    """Deleting keys shrinks the index; deleted keys resolve to None,
    surviving keys keep their values."""
    with BinaryPbIndexer(metas=test_metas) as indexer:
        indexer.add(['1', '2', '3'], [b'oldvalue', b'same', b'random'])
        indexer.save()
        assert indexer.size == 3
        dump_path = indexer.save_abspath
    with BaseIndexer.load(dump_path) as indexer:
        assert indexer.query('1') == b'oldvalue'
    with BaseIndexer.load(dump_path) as indexer:
        # `delete` accepts any iterable of keys.
        indexer.delete(iter(['1', '2']))
        indexer.save()
        assert indexer.size == 1
    with BaseIndexer.load(dump_path) as indexer:
        assert indexer.query('1') is None
        assert indexer.query('2') is None
        assert indexer.query('3') == b'random'
def test_binarypb_delete(test_metas):
    """Delete two of three int-keyed entries; deleted keys resolve to None,
    the remaining key keeps its value.

    Fix: `query(...) == None` comparisons replaced with `is None` (PEP 8 —
    singletons are compared by identity).
    """
    with BinaryPbIndexer(metas=test_metas) as idxer:
        idxer.add([1, 2, 3], [b'oldvalue', b'same', b'random'])
        idxer.save()
        assert idxer.size == 3
        save_abspath = idxer.save_abspath
    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query(1) == b'oldvalue'
    with BaseIndexer.load(save_abspath) as idxer:
        # `delete` accepts any iterable of keys.
        idxer.delete(iter([1, 2]))
        idxer.save()
        assert idxer.size == 1
    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query(1) is None
        assert idxer.query(2) is None
        assert idxer.query(3) == b'random'
def test_binarypb_update1(test_metas):
    """Updates via a reloaded indexer (string keys): reload alone does not
    grow the file, updates append, missing keys in a partial update are
    silently skipped."""
    with BinaryPbIndexer(metas=test_metas) as indexer:
        indexer.add(['1', '2', '3'], [b'oldvalue', b'same', b'random'])
        indexer.save()
        assert indexer.size == 3
        initial_size = os.fstat(indexer.write_handler.body.fileno()).st_size
        dump_path = indexer.save_abspath

    with BaseIndexer.load(dump_path) as indexer:
        assert indexer.query('1') == b'oldvalue'

    # A pure reload + query must leave the file size untouched.
    with BaseIndexer.load(dump_path) as indexer:
        assert indexer.query('1') == b'oldvalue'
        size_after_reload = os.fstat(indexer.query_handler._body.fileno()).st_size
        assert size_after_reload == initial_size

    with BaseIndexer.load(dump_path) as indexer:
        # Overwrite '1', re-write '2' with its current value; both append.
        indexer.update(['1', '2'], [b'newvalue', b'same'])
        indexer.save()
        size_after_update = os.fstat(indexer.write_handler.body.fileno()).st_size
        assert size_after_update > initial_size
        assert indexer.size == 3

    with BaseIndexer.load(dump_path) as indexer:
        assert indexer.query('1') == b'newvalue'
        assert indexer.query('2') == b'same'
        assert indexer.query('3') == b'random'
        assert indexer.query('99') is None

    with BaseIndexer.load(dump_path) as indexer:
        # Partial update: unknown key '99' is ignored, the rest applied.
        indexer.update(['1', '2', '99'], [b'newvalue2', b'newvalue3', b'decoy'])
        indexer.save()
        assert indexer.size == 3

    with BaseIndexer.load(dump_path) as indexer:
        assert indexer.query('1') == b'newvalue2'
        assert indexer.query('2') == b'newvalue3'
        assert indexer.query('3') == b'random'
        assert indexer.query('99') is None
def test_binarypb_add_and_update_not_working(test_metas):
    """`update` right after `add` in the same open context raises; after a
    reload the same update succeeds."""
    with BinaryPbIndexer(metas=test_metas) as indexer:
        indexer.add([11, 12], [b'eleven', b'twelve'])
        indexer.save()
        # FIXME `add` and `update` won't work in the same context:
        # `.save` calls `.flush` on a closed handler, and the handler needs
        # to have been closed for us to allow querying in the `.update`.
        with pytest.raises(AttributeError):
            indexer.update([12], [b'twelve-new'])
            indexer.save()
        assert indexer.size == 2
        dump_path = indexer.save_abspath
    with BaseIndexer.load(dump_path) as indexer:
        indexer.update([12], [b'twelve-new'])
        indexer.save()
    with BaseIndexer.load(dump_path) as indexer:
        assert indexer.query(11) == b'eleven'
        assert indexer.query(12) == b'twelve-new'
        assert indexer.size == 2
def test_binarypb_benchmark(test_metas, delete_on_dump):
    """Time `.save` after a bulk update of 10k of 100k entries, with and
    without `delete_on_dump` compaction (timing is printed, not asserted)."""
    entries = 100000
    nr_to_update = 10000
    keys = np.arange(entries)
    values = np.random.randint(0, 10, size=entries).astype(bytes)
    with BinaryPbIndexer(metas=test_metas, delete_on_dump=delete_on_dump) as indexer:
        indexer.add(keys, values)
        indexer.save()
        assert indexer.size == entries
        dump_path = indexer.save_abspath
    new_values = np.random.randint(0, 10, size=nr_to_update).astype(bytes)
    with BaseIndexer.load(dump_path) as indexer:
        indexer.update(keys[:nr_to_update], new_values)
        # Only the dump itself is timed, not the in-memory update.
        time_now = time.time()
        indexer.save()
        time_end = time.time()
        print(
            f'delete_on_dump = {delete_on_dump}, entries={entries}. took {time_end - time_now} seconds'
        )
def test_binarypb_update1(test_metas):
    """Updates via a reloaded indexer (int keys): an update batch containing
    a missing key raises KeyError and applies nothing; a valid batch appends
    and takes effect."""
    with BinaryPbIndexer(metas=test_metas) as indexer:
        indexer.add([1, 2, 3], [b'oldvalue', b'same', b'random'])
        indexer.save()
        assert indexer.size == 3
        initial_size = os.fstat(indexer.write_handler.body.fileno()).st_size
        dump_path = indexer.save_abspath

    with BaseIndexer.load(dump_path) as indexer:
        assert indexer.query(1) == b'oldvalue'

    with BaseIndexer.load(dump_path) as indexer:
        # no update triggered AT ALL when encountering missing key
        # atomic op. at indexer level
        with pytest.raises(KeyError):
            indexer.update([1, 2, 99], [b'newvalue', b'same', b'decoy'])
            indexer.save()

    # The failed batch must not have touched value or file size.
    with BaseIndexer.load(dump_path) as indexer:
        assert indexer.query(1) == b'oldvalue'
        size_after_failed_update = os.fstat(indexer.query_handler._body.fileno()).st_size
        assert size_after_failed_update == initial_size

    with BaseIndexer.load(dump_path) as indexer:
        # Valid batch: overwrite 1, re-write 2; both records append.
        indexer.update([1, 2], [b'newvalue', b'same'])
        indexer.save()
        size_after_update = os.fstat(indexer.write_handler.body.fileno()).st_size
        assert size_after_update > initial_size
        assert indexer.size == 3

    with BaseIndexer.load(dump_path) as indexer:
        assert indexer.query(1) == b'newvalue'
        assert indexer.query(2) == b'same'
        assert indexer.query(3) == b'random'
        assert indexer.query(99) is None
def test_flow_with_modalities(tmpdir, restful):
    """Index multi-modal docs through a two-branch flow (gRPC or REST,
    per the `restful` fixture); verify per-branch vectors and chunk stores.
    """
    # The indexer YAMLs resolve their workspace from this env var — TODO confirm.
    os.environ['JINA_TEST_FLOW_MULTIMODE_WORKSPACE'] = str(tmpdir)

    def input_function():
        # Three docs, each with a 'title' (mode1) and 'body' (mode2) part.
        doc1 = jina_pb2.DocumentProto()
        doc1.text = 'title: this is mode1 from doc1, body: this is mode2 from doc1'
        doc1.id = '1'
        doc2 = jina_pb2.DocumentProto()
        doc2.text = 'title: this is mode1 from doc2, body: this is mode2 from doc2'
        doc2.id = '2'
        doc3 = jina_pb2.DocumentProto()
        doc3.text = 'title: this is mode1 from doc3, body: this is mode2 from doc3'
        doc3.id = '3'
        return [doc1, doc2, doc3]

    # Segment once, fan out into two encoder->indexer branches, join back.
    flow = (
        Flow(restful=restful)
        .add(name='segmenter', uses='!MockSegmenter')
        .add(name='encoder1', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode1.yml'))
        .add(
            name='indexer1',
            uses=os.path.join(cur_dir, 'yaml/numpy-indexer-1.yml'),
            needs=['encoder1'],
        )
        .add(
            name='encoder2',
            uses=os.path.join(cur_dir, 'yaml/mockencoder-mode2.yml'),
            needs=['segmenter'],
        )
        .add(name='indexer2', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-2.yml'))
        .join(['indexer1', 'indexer2'])
    )

    with flow:
        flow.index(inputs=input_function)

    # Artifacts land under <workspace>/compound/<pea>-0/ — branch 1 is
    # expected to write all-zero 3-d vectors, branch 2 all-ones
    # (presumably fixed by the mock encoder YAMLs — confirm).
    with open(os.path.join(tmpdir, 'compound', 'vecidx1-0', 'vec1.gz'), 'rb') as fp:
        result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
        np.testing.assert_equal(
            result, np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
        )
    with open(os.path.join(tmpdir, 'compound', 'vecidx2-0', 'vec2.gz'), 'rb') as fp:
        result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
        np.testing.assert_equal(
            result, np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]])
        )

    # Key-value store of branch 1 must hold only 'mode1' chunks.
    chunkIndexer1 = BinaryPbIndexer.load(
        os.path.join(tmpdir, 'compound', 'kvidx1-0', 'kvidx1.bin')
    )
    assert chunkIndexer1.size == 3
    d_id = list(chunkIndexer1.query_handler.header.keys())[0]

    query_doc = jina_pb2.DocumentProto()
    query_doc.ParseFromString(chunkIndexer1.query([d_id])[0])
    assert query_doc.text == 'title: this is mode1 from doc1'
    assert query_doc.modality == 'mode1'

    # Key-value store of branch 2 must hold only 'mode2' chunks.
    chunkIndexer2 = BinaryPbIndexer.load(
        os.path.join(tmpdir, 'compound', 'kvidx2-0', 'kvidx2.bin')
    )
    assert chunkIndexer2.size == 3
    d_id = list(chunkIndexer2.query_handler.header.keys())[0]

    query_doc = jina_pb2.DocumentProto()
    query_doc.ParseFromString(chunkIndexer2.query([d_id])[0])
    assert query_doc.text == ' body: this is mode2 from doc1'
    assert query_doc.modality == 'mode2'

    # Clean up so other tests are not affected by the workspace override.
    del os.environ['JINA_TEST_FLOW_MULTIMODE_WORKSPACE']