def test_envelope_in_sep_request():
    """Round-trip only the envelope while the request travels as opaque bytes.

    On every pass the envelope alone is serialized and parsed back, and an
    ``'r'`` is appended to its ``request_id``; the already-serialized request
    is carried along untouched. At the end each envelope id must end with
    ``num_reqs`` copies of ``'r'``.
    """
    wrapped = (add_envelope(req, 'test', '123') for req in _generate(random_docs(num_docs)))
    recv = [(message.envelope, message.request.SerializeToString()) for message in wrapped]

    with TimeContext('serialize and deserialize'):
        for _ in range(num_reqs):
            # "send": serialize the envelope only, pass the request bytes through
            sent = [(envelope.SerializeToString(), payload) for envelope, payload in recv]

            # "receive": parse each envelope back and mark the hop on its id
            recv = []
            for envelope_bytes, payload in sent:
                envelope = jina_pb2.EnvelopeProto()
                envelope.ParseFromString(envelope_bytes)
                envelope.request_id += 'r'
                recv.append((envelope, payload))

    for received in recv:
        assert received[0].request_id.endswith('r' * num_reqs)
def test_lazy_msg_access():
    """Assert when a ProtoMessage's request flips to ``is_used``.

    Reading the envelope or the request object itself must leave the request
    unused; reading ``request.index.docs`` must mark it used. ``dump()`` is
    expected to return three parts in every phase.
    """
    messages = [
        ProtoMessage(None, raw.SerializeToString(), 'test', '123',
                     request_id='123', request_type='IndexRequest')
        for raw in _generate(random_docs(10))
    ]

    # phase 1: envelope access only
    for message in messages:
        assert not message.request.is_used
        assert message.envelope
        assert len(message.dump()) == 3
        assert not message.request.is_used

    # phase 2: touching the request object without reading a field
    for message in messages:
        assert not message.request.is_used
        assert message.request
        assert len(message.dump()) == 3
        assert not message.request.is_used

    # phase 3: reading a nested field of the request
    for message in messages:
        assert not message.request.is_used
        assert message.request.index.docs
        assert len(message.dump()) == 3
        assert message.request.is_used
def test_request_generate_numpy_arrays_iterator(self):
    """Feed ``_generate`` a generator of array rows and check two batches of five docs."""
    input_array = np.random.random([10, 10])

    def generator():
        yield from input_array

    req = _generate(data=generator(), batch_size=5)

    # ids are asserted to keep counting across batches: 1..5, then 6..10
    for id_offset in (0, 5):
        request = next(req)
        self.assertEqual(len(request.index.docs), 5)
        for position, doc in enumerate(request.index.docs, 1):
            self.assertEqual(doc.id, id_offset + position)
            self.assertEqual(doc.length, 5)
            self.assertEqual(doc.granularity, 0)
            self.assertEqual(pb2array(doc.blob).shape, (10, ))
            self.assertEqual(doc.blob.shape, [10])
def test_lazy_append_access():
    """Assert that appending to ``docs`` marks a lazily wrapped request as used."""
    lazy_requests = (
        Request(raw.SerializeToString(), EnvelopeProto())
        for raw in _generate(random_docs(10))
    )
    for lazy_req in lazy_requests:
        assert not lazy_req.is_used
        # write access: append a fresh document
        lazy_req.docs.append(jina_pb2.DocumentProto())
        # the write must have flipped the flag
        assert lazy_req.is_used
def test_request_generate(self):
    """Assert that 100 lines with ``batch_size=100`` produce exactly one request."""
    def random_lines(num_lines):
        for line_no in range(num_lines):
            yield "https://github.com 'i\'m dummy doc %d'" % line_no

    req = _generate(data=random_lines(100), batch_size=100)
    batches = list(req)
    assert len(batches) == 1
def test_lazy_nested_clear_access():
    """Assert that clearing ``index.docs`` marks a lazily wrapped request as used."""
    lazy_requests = (
        Request(raw.SerializeToString(), EnvelopeProto())
        for raw in _generate(random_docs(10))
    )
    for lazy_req in lazy_requests:
        assert not lazy_req.is_used
        # write access on a nested message: drop all docs
        lazy_req.index.ClearField('docs')
        # the write must have flipped the flag
        assert lazy_req.is_used
def test_lazy_change_message_type():
    """Assert that setting a control command marks the request used and leaves no index docs."""
    lazy_requests = (
        Request(raw.SerializeToString(), EnvelopeProto())
        for raw in _generate(random_docs(10))
    )
    for lazy_req in lazy_requests:
        assert not lazy_req.is_used
        # write through the ``control`` field instead of ``index``
        lazy_req.control.command = jina_pb2.RequestProto.ControlRequestProto.IDLE
        # the write must have flipped the flag
        assert lazy_req.is_used
        assert len(lazy_req.index.docs) == 0
def test_lazy_nest_access():
    """Assert that writing a doc id via ``docs`` marks the request used and shows up in ``index.docs``."""
    lazy_requests = (
        Request(raw.SerializeToString(), EnvelopeProto())
        for raw in _generate(random_docs(10))
    )
    for lazy_req in lazy_requests:
        assert not lazy_req.is_used
        # write access on a nested element
        lazy_req.docs[0].id = '1'
        # the write must have flipped the flag
        assert lazy_req.is_used
        assert lazy_req.index.docs[0].id == '1'
def test_message_size():
    """Assert that ``size`` starts at 0 and exceeds envelope+request sizes after ``dump()``."""
    messages = [Message(None, raw, 'test', '123') for raw in _generate(random_docs(10))]
    for message in messages:
        assert message.size == 0
        assert sys.getsizeof(message.envelope.SerializeToString())
        assert sys.getsizeof(message.request.SerializeToString())
        assert len(message.dump()) == 3
        # compare against the freshly serialized parts, measured after the dump
        assert message.size > (
            sys.getsizeof(message.envelope.SerializeToString())
            + sys.getsizeof(message.request.SerializeToString())
        )
def test_lazy_access(field):
    """Assert that reading the attribute named by ``field`` marks a lazy request as used.

    :param field: name of the request attribute to read
    """
    lazy_requests = (
        Request(raw.SerializeToString(), EnvelopeProto())
        for raw in _generate(random_docs(10))
    )
    for lazy_req in lazy_requests:
        assert not lazy_req.is_used
        # read access on the requested attribute
        print(getattr(lazy_req, field))
        # the read must have flipped the flag
        assert lazy_req.is_used
def test_multiple_access():
    """Assert that truth-testing a request keeps it unused while reading ``index`` marks it used."""
    wrapped = [
        Request(raw.SerializeToString(), EnvelopeProto())
        for raw in _generate(random_docs(10))
    ]

    # first pass: only evaluate the wrapper's truthiness
    for lazy_req in wrapped:
        assert not lazy_req.is_used
        assert lazy_req
        assert not lazy_req.is_used

    # second pass: read a field
    for lazy_req in wrapped:
        assert not lazy_req.is_used
        assert lazy_req.index
        assert lazy_req.is_used
def test_request_generate_lines_from_list(self):
    """Build a single 100-doc request from a list of strings and check each document."""
    def random_lines(num_lines):
        line_numbers = range(1, num_lines + 1)
        return [f'i\'m dummy doc {j}' for j in line_numbers]

    req = _generate(data=random_lines(100), batch_size=100)
    request = next(req)

    self.assertEqual(len(request.index.docs), 100)
    for position, doc in enumerate(request.index.docs, 1):
        self.assertEqual(doc.length, 100)
        self.assertEqual(doc.mime_type, 'text/plain')
        self.assertEqual(doc.text, f'i\'m dummy doc {position}')
def test_request_generate_lines_from_list():
    """Build a single 100-doc request from a list of strings and check each document."""
    def random_lines(num_lines):
        line_numbers = range(1, num_lines + 1)
        return [f'i\'m dummy doc {j}' for j in line_numbers]

    req = _generate(data=random_lines(100), batch_size=100)
    request = next(req)

    assert len(request.index.docs) == 100
    for position, doc in enumerate(request.index.docs, 1):
        assert doc.length == 100
        assert doc.mime_type == 'text/plain'
        assert doc.text == f'i\'m dummy doc {position}'
def test_request_generate_bytes(self):
    """Feed UTF-8 encoded lines to ``_generate`` and check they land in ``doc.buffer``."""
    def random_lines(num_lines):
        for line_no in range(1, num_lines + 1):
            text = f'i\'m dummy doc {line_no}'
            yield text.encode('utf8')

    req = _generate(data=random_lines(100), batch_size=100)
    request = next(req)

    self.assertEqual(len(request.index.docs), 100)
    for position, doc in enumerate(request.index.docs, 1):
        self.assertEqual(doc.length, 100)
        self.assertEqual(doc.mime_type, 'text/plain')
        self.assertEqual(doc.buffer.decode(), f'i\'m dummy doc {position}')
def test_request_generate_lines_with_fake_url():
    """Assert that lines starting with a URL-like prefix are still indexed as plain text."""
    def random_lines(num_lines):
        for line_no in range(1, num_lines + 1):
            yield f'https://github.com i\'m dummy doc {line_no}'

    req = _generate(data=random_lines(100), batch_size=100)
    request = next(req)

    assert len(request.index.docs) == 100
    for position, doc in enumerate(request.index.docs, 1):
        assert doc.length == 100
        assert doc.mime_type == 'text/plain'
        assert doc.text == f'https://github.com i\'m dummy doc {position}'
def test_request_generate_bytes():
    """Feed UTF-8 encoded lines to ``_generate`` and check they land in ``doc.buffer``."""
    def random_lines(num_lines):
        for line_no in range(1, num_lines + 1):
            text = f'i\'m dummy doc {line_no}'
            yield text.encode('utf8')

    req = _generate(data=random_lines(100), batch_size=100)
    request = next(req)

    assert len(request.index.docs) == 100
    for position, doc in enumerate(request.index.docs, 1):
        assert doc.length == 100
        assert doc.mime_type == 'text/plain'
        assert doc.buffer.decode() == f'i\'m dummy doc {position}'
def test_request_generate_lines_with_fake_url(self):
    """Assert that lines starting with a URL-like prefix are still indexed as plain text."""
    def random_lines(num_lines):
        for line_no in range(1, num_lines + 1):
            yield f'https://github.com i\'m dummy doc {line_no}'

    req = _generate(data=random_lines(100), batch_size=100)
    request = next(req)

    self.assertEqual(len(request.index.docs), 100)
    for position, doc in enumerate(request.index.docs, 1):
        self.assertEqual(doc.length, 100)
        self.assertEqual(doc.mime_type, 'text/plain')
        self.assertEqual(doc.text, f'https://github.com i\'m dummy doc {position}')
def test_request_generate_lines(self):
    """Generate one request from 100 text lines and check id, length, mime type, granularity, text."""
    def random_lines(num_lines):
        for line_no in range(1, num_lines + 1):
            yield f'i\'m dummy doc {line_no}'

    req = _generate(data=random_lines(100), batch_size=100)
    request = next(req)

    self.assertEqual(len(request.index.docs), 100)
    for position, doc in enumerate(request.index.docs, 1):
        # ids are asserted to follow the 1-based position in the batch
        self.assertEqual(doc.id, position)
        self.assertEqual(doc.length, 100)
        self.assertEqual(doc.mime_type, 'text/plain')
        self.assertEqual(doc.granularity, 0)
        self.assertEqual(doc.text, f'i\'m dummy doc {position}')
def test_request_generate_numpy_arrays():
    """Split a 10x10 array into two requests of five docs, each holding a 10-element blob."""
    input_array = np.random.random([10, 10])
    req = _generate(data=input_array, batch_size=5)

    # both batches are checked against the same expectations
    for _ in range(2):
        request = next(req)
        assert len(request.index.docs) == 5
        for doc in request.index.docs:
            assert doc.length == 5
            assert GenericNdArray(doc.blob).value.shape == (10,)
def test_request_generate_docs(self):
    """Pass ready-made Document protos to ``_generate`` and check their fields in the request."""
    def random_docs(num_docs):
        for doc_no in range(1, num_docs + 1):
            document = jina_pb2.Document()
            document.text = f'i\'m dummy doc {doc_no}'
            document.offset = 1000
            # original note says this tag is ignored; nothing below asserts on it
            document.tags['id'] = 1000
            document.mime_type = 'mime_type'
            yield document

    req = _generate(data=random_docs(100), batch_size=100)
    request = next(req)

    self.assertEqual(len(request.index.docs), 100)
    for position, doc in enumerate(request.index.docs, 1):
        self.assertEqual(doc.length, 100)
        self.assertEqual(doc.mime_type, 'mime_type')
        self.assertEqual(doc.text, f'i\'m dummy doc {position}')
        self.assertEqual(doc.offset, 1000)
def test_all_in_one_request():
    """Round-trip whole messages (envelope and request together) ``num_reqs`` times.

    Each pass serializes every message, parses it back into a fresh
    ``MessageProto`` and appends an ``'r'`` to the envelope's ``request_id``.
    At the end each id must end with ``num_reqs`` copies of ``'r'``.
    """
    recv = [add_envelope(req, 'test', '123') for req in _generate(random_docs(num_docs))]

    with TimeContext('serialize and deserialize'):
        for _ in range(num_reqs):
            # "send": serialize the complete message
            sent = [message.SerializeToString() for message in recv]

            # "receive": parse everything back and mark the hop on the id
            recv = []
            for blob in sent:
                message = jina_pb2.MessageProto()
                message.ParseFromString(blob)
                message.envelope.request_id += 'r'
                recv.append(message)

    for message in recv:
        assert message.envelope.request_id.endswith('r' * num_reqs)
def test_request_generate_docs():
    """Pass ready-made Document protos to ``_generate`` and check their fields in the request."""
    def random_docs(num_docs):
        for doc_no in range(1, num_docs + 1):
            document = jina_pb2.Document()
            document.text = f'i\'m dummy doc {doc_no}'
            document.offset = 1000
            # original note says this tag is ignored; nothing below asserts on it
            document.tags['id'] = 1000
            document.mime_type = 'mime_type'
            yield document

    req = _generate(data=random_docs(100), batch_size=100)
    request = next(req)

    assert len(request.index.docs) == 100
    for position, doc in enumerate(request.index.docs, 1):
        assert doc.length == 100
        assert doc.mime_type == 'mime_type'
        assert doc.text == f'i\'m dummy doc {position}'
        assert doc.offset == 1000
def test_request_generate_numpy_arrays(self):
    """Split a 10x10 array into two requests of five docs, each holding a 10-element blob."""
    input_array = np.random.random([10, 10])
    req = _generate(data=input_array, batch_size=5)

    # both batches are checked against the same expectations
    for _ in range(2):
        request = next(req)
        self.assertEqual(len(request.index.docs), 5)
        for doc in request.index.docs:
            self.assertEqual(doc.length, 5)
            self.assertEqual(pb2array(doc.blob).shape, (10, ))
            self.assertEqual(doc.blob.shape, [10])
def test_request_generate_numpy_arrays_iterator():
    """Feed ``_generate`` a generator of array rows and check two batches of five docs."""
    input_array = np.random.random([10, 10])

    def generator():
        yield from input_array

    req = _generate(data=generator(), batch_size=5)

    # both batches are checked against the same expectations
    for _ in range(2):
        request = next(req)
        assert len(request.index.docs) == 5
        for doc in request.index.docs:
            assert doc.length == 5
            assert NdArray(doc.blob).value.shape == (10, )
def test_request_generate_docs_with_different_granularity(self):
    """Assert that ``_generate(granularity=5)`` replaces the id and granularity set on input docs."""
    def random_docs(num_docs):
        for doc_no in range(1, num_docs + 1):
            document = jina_pb2.Document()
            document.text = f'i\'m dummy doc {doc_no}'
            document.offset = 1000
            # the assertions below expect this id to be replaced by the batch position
            document.id = 1000
            # the assertions below expect the ``granularity`` argument (5) to win over this
            document.granularity = 3
            document.mime_type = 'mime_type'
            yield document

    req = _generate(data=random_docs(100), batch_size=100, granularity=5)
    request = next(req)

    self.assertEqual(len(request.index.docs), 100)
    for position, doc in enumerate(request.index.docs, 1):
        self.assertEqual(doc.id, position)
        self.assertEqual(doc.length, 100)
        self.assertEqual(doc.mime_type, 'mime_type')
        self.assertEqual(doc.granularity, 5)
        self.assertEqual(doc.text, f'i\'m dummy doc {position}')
        self.assertEqual(doc.offset, 1000)
def test_request_generate_dict_str():
    """Feed JSON-encoded document dicts to ``_generate`` and check docs and their two chunks."""
    import json

    def random_docs(num_docs):
        for doc_no in range(1, num_docs + 1):
            first_chunk = {'text': f'i\'m chunk 1', 'modality': 'text'}
            second_chunk = {'text': f'i\'m chunk 2', 'modality': 'image'}
            payload = {
                'text': f'i\'m dummy doc {doc_no}',
                'offset': 1000,
                'tags': {'id': 1000},
                'chunks': [first_chunk, second_chunk],
            }
            yield json.dumps(payload)

    req = _generate(data=random_docs(100), batch_size=100)
    request = next(req)

    assert len(request.index.docs) == 100
    for position, doc in enumerate(request.index.docs, 1):
        # top-level fields
        assert doc.text == f'i\'m dummy doc {position}'
        assert doc.offset == 1000
        assert doc.tags['id'] == 1000
        # nested chunks keep their order, modality and text
        assert len(doc.chunks) == 2
        assert doc.chunks[0].modality == 'text'
        assert doc.chunks[0].text == f'i\'m chunk 1'
        assert doc.chunks[1].modality == 'image'
        assert doc.chunks[1].text == f'i\'m chunk 2'
def test_lazy_request_fields():
    """Assert that every lazily wrapped request exposes a non-empty set of descriptor field names."""
    lazy_requests = (
        LazyRequest(raw.SerializeToString(), Envelope())
        for raw in _generate(random_docs(10))
    )
    for lazy_req in lazy_requests:
        field_names = list(lazy_req.DESCRIPTOR.fields_by_name.keys())
        assert field_names