def test_compression(compress_algo, low_bytes, high_ratio):
    """Compare total dumped message size with and without compression.

    The same documents are dumped once with ``CompressAlgo.NONE`` and once
    with ``compress_algo``. When compression is disabled, or ``low_bytes`` /
    ``high_ratio`` is set, the compressed run is asserted to be no smaller
    than the baseline; otherwise it is asserted to be strictly smaller.

    :param compress_algo: algorithm passed as ``compress`` to ``Message``
    :param low_bytes: if truthy, ``compress_min_bytes`` is set to twice the
        baseline size collected so far, else to 0
    :param high_ratio: if truthy, ``compress_min_ratio`` is 10, else 1

    NOTE(review): the arguments are presumably supplied by pytest
    parametrization that is not visible in this block -- confirm.
    """
    docs = list(random_docs(100, embed_dim=100))
    no_comp_sizes = []
    sizes = []

    def _message_kwargs():
        # Evaluated at call time, so the byte threshold reflects whatever
        # baseline sizes have been recorded when the helper is invoked.
        min_bytes = 2 * sum(no_comp_sizes) if low_bytes else 0
        min_ratio = 10 if high_ratio else 1
        return dict(identity='gateway',
                    pod_name='123',
                    compress_min_bytes=min_bytes,
                    compress_min_ratio=min_ratio)

    # ``no_comp_sizes`` is still empty here, so the threshold evaluates to 0.
    baseline_kwargs = _message_kwargs()
    with TimeContext('no compress'):
        for req in _generate(docs):
            msg = Message(None, req, compress=CompressAlgo.NONE, **baseline_kwargs)
            msg.dump()
            no_comp_sizes.append(msg.size)

    # Rebuilt after the baseline run so ``low_bytes`` uses the real total.
    compress_kwargs = _message_kwargs()
    with TimeContext(f'compressing with {str(compress_algo)}') as tc:
        for req in _generate(docs):
            msg = Message(None, req, compress=compress_algo, **compress_kwargs)
            msg.dump()
            sizes.append(msg.size)

    baseline_total = sum(no_comp_sizes)
    compressed_total = sum(sizes)
    compression_expected = not (compress_algo == CompressAlgo.NONE or low_bytes or high_ratio)
    if compression_expected:
        assert compressed_total < baseline_total
    else:
        assert compressed_total >= baseline_total

    print(
        f'{str(compress_algo)}: size {compressed_total / len(sizes)} '
        f'(ratio: {baseline_total / compressed_total:.2f}) with {tc.duration:.2f}s'
    )
def test_request_generate_dict():
    """Check that ``_generate`` builds index documents from plain dicts.

    One hundred dict documents, each with two chunks, are fed in with
    ``request_size=100``; the first request is expected to hold all of them
    with text, offset, tags and chunk fields preserved.
    """
    def dict_docs(count):
        # Yield a freshly built dict per document, numbered from 1.
        for number in range(1, count + 1):
            yield {
                'text': f"i'm dummy doc {number}",
                'offset': 1000,
                'tags': {'id': 1000},
                'chunks': [
                    {'text': "i'm chunk 1", 'modality': 'text'},
                    {'text': "i'm chunk 2", 'modality': 'image'},
                ],
            }

    request = next(_generate(data=dict_docs(100), request_size=100))
    generated = request.index.docs
    assert len(generated) == 100

    for position, doc in enumerate(generated, start=1):
        assert doc.text == f"i'm dummy doc {position}"
        assert doc.offset == 1000
        assert doc.tags['id'] == 1000
        assert len(doc.chunks) == 2
        first_chunk = doc.chunks[0]
        assert first_chunk.modality == 'text'
        assert first_chunk.text == "i'm chunk 1"
        second_chunk = doc.chunks[1]
        assert second_chunk.modality == 'image'
        assert second_chunk.text == "i'm chunk 2"
def test_lazy_msg_access():
    """Check which ``Message`` accesses flip ``request.is_used``.

    Reading the envelope or the request object itself and then dumping is
    asserted to leave ``request.is_used`` falsy; reading
    ``request.index.docs`` and dumping is asserted to make it truthy. Every
    dump is expected to produce three parts.
    """
    messages = [
        Message(None,
                proto.SerializeToString(),
                'test',
                '123',
                request_id='123',
                request_type='IndexRequest')
        for proto in _generate(random_docs(10))
    ]

    # (what to touch on the message, whether the request counts as used afterwards)
    probes = (
        (lambda m: m.envelope, False),
        (lambda m: m.request, False),
        (lambda m: m.request.index.docs, True),
    )

    # Each probe runs over all messages before the next probe starts.
    for touch, used_afterwards in probes:
        for msg in messages:
            assert not msg.request.is_used
            assert touch(msg)
            assert len(msg.dump()) == 3
            assert bool(msg.request.is_used) == used_afterwards
def test_lazy_append_access():
    """Check that appending to ``docs`` marks a ``Request`` as used."""
    for proto in _generate(random_docs(10)):
        request = Request(proto.SerializeToString(), EnvelopeProto())
        assert not request.is_used

        # Write access: append a new document to the request.
        request.docs.append(Document())

        # The request is expected to report itself as used from here on.
        assert request.is_used
def test_lazy_nested_clear_access():
    """Check that clearing the nested ``index.docs`` field marks a ``Request`` as used."""
    for proto in _generate(random_docs(10)):
        request = Request(proto.SerializeToString(), EnvelopeProto())
        assert not request.is_used

        # Write access through the nested index body.
        request.index.ClearField('docs')

        # The request is expected to report itself as used from here on.
        assert request.is_used
def test_lazy_change_message_type():
    """Check that setting a control command marks a ``Request`` as used.

    After ``control.command`` is assigned, ``index.docs`` is asserted to be
    empty.
    """
    idle_command = jina_pb2.RequestProto.ControlRequestProto.IDLE
    for proto in _generate(random_docs(10)):
        request = Request(proto.SerializeToString(), EnvelopeProto())
        assert not request.is_used

        # Write access: set a command on the control body.
        request.control.command = idle_command

        # The request is expected to be used now, with no index docs left.
        assert request.is_used
        assert len(request.index.docs) == 0
def test_lazy_nest_access():
    """Check that writing a nested document field marks a ``Request`` as used.

    The id written through ``docs[0]`` must be readable back through
    ``index.docs[0]``.
    """
    new_id = '1' * 16
    for proto in _generate(random_docs(10)):
        request = Request(proto.SerializeToString(), EnvelopeProto())
        assert not request.is_used

        # Write access on the first document.
        request.docs[0].id = new_id

        # The request is expected to be used and to expose the new id.
        assert request.is_used
        assert request.index.docs[0].id == new_id
def test_lazy_access(field):
    """Check that reading attribute ``field`` marks a ``Request`` as used.

    :param field: name of the request attribute to read

    NOTE(review): ``field`` is presumably supplied by pytest parametrization
    that is not visible in this block -- confirm.
    """
    for proto in _generate(random_docs(10)):
        request = Request(proto.SerializeToString(), EnvelopeProto())
        assert not request.is_used

        # Read access: fetch the attribute and print it.
        value = getattr(request, field)
        print(value)

        # The request is expected to report itself as used from here on.
        assert request.is_used
def test_message_size():
    """Check that ``Message.size`` is 0 before ``dump()`` and grows after it.

    After dumping, ``size`` is asserted to exceed the summed
    ``sys.getsizeof`` of the serialized envelope and the serialized request.
    """
    messages = [Message(None, proto, 'test', '123') for proto in _generate(random_docs(10))]

    for msg in messages:
        # Nothing has been dumped yet.
        assert msg.size == 0
        assert sys.getsizeof(msg.envelope.SerializeToString())
        assert sys.getsizeof(msg.request.SerializeToString())

        assert len(msg.dump()) == 3

        # Read the size first, then serialize, matching the original evaluation order.
        dumped_size = msg.size
        envelope_size = sys.getsizeof(msg.envelope.SerializeToString())
        request_size = sys.getsizeof(msg.request.SerializeToString())
        assert dumped_size > envelope_size + request_size
def test_request_generate_lines_from_list():
    """Check that ``_generate`` builds plain-text documents from a list of strings."""
    lines = [f"i'm dummy doc {number}" for number in range(1, 101)]

    request = next(_generate(data=lines, request_size=100))
    generated = request.index.docs
    assert len(generated) == 100

    for position, doc in enumerate(generated, start=1):
        assert doc.length == 100
        assert doc.mime_type == 'text/plain'
        assert doc.text == f"i'm dummy doc {position}"
def test_request_generate_lines_with_fake_url():
    """Check that lines starting with a URL are still kept as plain text.

    Each line starts with ``https://github.com`` followed by ordinary text;
    the generated documents are asserted to carry the whole line in ``text``
    with mime type ``text/plain``.
    """
    lines = (f"https://github.com i'm dummy doc {number}" for number in range(1, 101))

    request = next(_generate(data=lines, request_size=100))
    generated = request.index.docs
    assert len(generated) == 100

    for position, doc in enumerate(generated, start=1):
        assert doc.length == 100
        assert doc.mime_type == 'text/plain'
        assert doc.text == f"https://github.com i'm dummy doc {position}"
def test_request_generate_bytes():
    """Check that ``_generate`` builds plain-text documents from generated lines.

    The request size is passed as ``request_size``, the keyword most other
    request-generation tests in this file use. The original ``batch_size``
    keyword was inconsistent with them.

    NOTE(review): confirm that ``_generate`` does not treat ``batch_size``
    as a separate, still-supported option that this test meant to cover.
    """
    def random_lines(num_lines):
        # Lazily yield numbered text lines, starting from 1.
        for j in range(1, num_lines + 1):
            yield f"i'm dummy doc {j}"

    req = _generate(data=random_lines(100), request_size=100)

    request = next(req)
    assert len(request.index.docs) == 100
    for index, doc in enumerate(request.index.docs, 1):
        assert doc.length == 100
        assert doc.text == f"i'm dummy doc {index}"
        assert doc.mime_type == 'text/plain'
def test_multiple_access():
    """Check that truth-testing a ``Request`` keeps it unused but reading ``index`` does not.

    First pass: evaluating each request in a boolean context must leave
    ``is_used`` falsy. Second pass: reading ``index`` must make it truthy.
    """
    requests = []
    for proto in _generate(random_docs(10)):
        requests.append(Request(proto.SerializeToString(), EnvelopeProto()))

    # Pass 1: only the truthiness of the request object is evaluated.
    for request in requests:
        assert not request.is_used
        assert request
        assert not request.is_used

    # Pass 2: reading the ``index`` body counts as use.
    for request in requests:
        assert not request.is_used
        assert request.index
        assert request.is_used
def test_request_generate_numpy_arrays():
    """Check that a 10x10 array is split into two requests of five documents.

    Each document's blob is asserted to decode to a vector of shape ``(10,)``.
    """
    input_array = np.random.random([10, 10])
    requests = _generate(data=input_array, request_size=5)

    # Ten rows with request_size=5: the first two requests are checked identically.
    for _ in range(2):
        request = next(requests)
        assert len(request.index.docs) == 5
        for doc in request.index.docs:
            assert doc.length == 5
            assert NdArray(doc.blob).value.shape == (10,)
def test_request_generate_docs():
    """Check that ``_generate`` accepts ready-made ``DocumentProto`` objects.

    Text, offset and the explicitly set mime type are asserted to be kept on
    the generated documents.
    """
    def proto_docs(count):
        # Yield one populated DocumentProto per number, starting from 1.
        for number in range(1, count + 1):
            proto = jina_pb2.DocumentProto()
            proto.text = f"i'm dummy doc {number}"
            proto.offset = 1000
            proto.tags['id'] = 1000  # this will be ignored
            proto.mime_type = 'mime_type'
            yield proto

    request = next(_generate(data=proto_docs(100), request_size=100))
    generated = request.index.docs
    assert len(generated) == 100

    for position, doc in enumerate(generated, start=1):
        assert doc.length == 100
        assert doc.mime_type == 'mime_type'
        assert doc.text == f"i'm dummy doc {position}"
        assert doc.offset == 1000
def test_request_generate_numpy_arrays_iterator():
    """Check that rows yielded lazily are grouped into requests of five.

    This is the generator counterpart of the test that passes the array
    directly. The request size is passed as ``request_size``, the keyword
    that test uses; the original ``batch_size`` keyword was inconsistent
    with it.

    NOTE(review): confirm that ``_generate`` does not treat ``batch_size``
    as a separate, still-supported option that this test meant to cover.
    """
    input_array = np.random.random([10, 10])

    def generator():
        # Yield the array row by row so ``_generate`` sees a plain generator.
        for array in input_array:
            yield array

    req = _generate(data=generator(), request_size=5)

    # Ten rows with a request size of 5: check the first two requests identically.
    for _ in range(2):
        request = next(req)
        assert len(request.index.docs) == 5
        for doc in request.index.docs:
            assert doc.length == 5
            assert NdArray(doc.blob).value.shape == (10,)
def test_lazy_request_fields():
    """Check that a ``Request`` exposes a non-empty set of descriptor field names."""
    for proto in _generate(random_docs(10)):
        request = Request(proto.SerializeToString(), EnvelopeProto())
        field_names = list(request.DESCRIPTOR.fields_by_name.keys())
        assert field_names