# Shared imports for the snippets below (jina 1.x-era API paths -- an
# assumption; adjust to your version). Test-local helpers (get_documents,
# get_client, path_size, concurrent_main, sequential_main, input_fn,
# index_generator, _print_and_append_to_ops, _validate_results_*,
# _assert_order_ops) and internal classes (Pod, CompoundQueryExecutor,
# BaseQueryIndexer, DBMSIndexDriver, import_vectors, import_metas) come from
# the surrounding test modules and are not reproduced here.
import functools
import os
import time
from threading import Thread

import numpy as np

from jina import Document, DocumentArray, Flow
from jina.logging.profile import TimeContext


def test_content_hash():
    d0 = Document(content='a')
    assert d0.content

    empty_doc = Document()
    assert not empty_doc.content
    # warning: a Doc with empty content will have a hash -- it hashes ''
    assert empty_doc.content_hash
    assert empty_doc.content_hash != d0.content_hash

    d1 = Document(content='text')
    init_content_hash = d1.content_hash
    assert init_content_hash
    assert init_content_hash == d1.content_hash

    d2 = Document(content='text')
    assert init_content_hash == d2.content_hash

    d3 = Document(content='text1')
    assert init_content_hash != d3.content_hash

    d4 = Document(id='a')
    d5 = Document(id='b')
    assert d5.content_hash == d4.content_hash

    d6 = Document(d2.proto)
    assert d6.content_hash == d2.content_hash

    d7 = Document(d2)
    assert d6.content_hash == d2.content_hash == d7.content_hash

    # test hash image
    d8 = Document(blob=np.array([1, 3, 5]))
    d9 = Document(blob=np.array([2, 4, 6]))
    d10 = Document(blob=np.array([1, 3, 5]))
    assert d8.content_hash != d9.content_hash
    assert d8.content_hash == d10.content_hash

    # test hash buffer
    d11 = Document(content=b'buffer1')
    d12 = Document(content=b'buffer2')
    d13 = Document(content=b'buffer1')
    assert d11.content_hash != d12.content_hash
    assert d11.content_hash == d13.content_hash

    # document with more fields
    d14 = Document(
        uri='http://test1.com', tags={'key1': 'value1'}, granularity=2, adjacency=2
    )
    d15 = Document(
        uri='http://test2.com', tags={'key1': 'value2'}, granularity=3, adjacency=2
    )
    d16 = Document(
        uri='http://test2.com', tags={'key1': 'value2'}, granularity=3, adjacency=2
    )
    assert d14.content_hash != d15.content_hash
    assert d15.content_hash == d16.content_hash

    nr = 10
    with TimeContext(f'creating {nr} docs without hashing content at init'):
        da = DocumentArray()
        for _ in range(nr):
            d = Document(content='text' * 2)
            da.append(d)

    with TimeContext(f'creating {nr} docs with hashing content at init'):
        da = DocumentArray()
        for _ in range(nr):
            d = Document(content='text' * 2)
            da.append(d)

    with TimeContext('iterating through docs with content hash'):
        for d in da:
            assert d.content_hash

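# Note the semantics exercised above: `content_hash` ignores `id` (d4 and d5
# hash equal) but covers the other fields, e.g. uri/tags/granularity (d14 and
# d15 hash differently). A hypothetical stand-in illustrating that property --
# not jina's actual digest scheme:
import hashlib


def _content_hash_sketch(fields: dict) -> str:
    # hash every field except `id`, in a deterministic order
    hashed = {k: v for k, v in sorted(fields.items()) if k != 'id'}
    return hashlib.blake2b(repr(hashed).encode(), digest_size=16).hexdigest()


assert _content_hash_sketch({'id': 'a'}) == _content_hash_sketch({'id': 'b'})
assert _content_hash_sketch({'uri': 'http://test1.com'}) != _content_hash_sketch(
    {'uri': 'http://test2.com'}
)
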
def test_time_context():
    with TimeContext('dummy') as tc:
        time.sleep(2)

    assert int(tc.duration) == 2
    assert tc.readable_duration == '2 seconds'

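# For reference, a minimal sketch of what a TimeContext-like timer could look
# like -- a hypothetical stand-in (`_TimeContextSketch`), not jina's
# implementation; it only mirrors the `duration` / `readable_duration`
# behavior the test above asserts.
class _TimeContextSketch:
    def __init__(self, msg: str):
        self.msg = msg
        self.duration = 0.0

    def __enter__(self):
        self._start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # duration is only final once the block exits, which is why the
        # asserts in the test above sit outside the `with` block
        self.duration = time.perf_counter() - self._start
        self.readable_duration = f'{int(self.duration)} seconds'
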
def assert_dump_data(dump_path, docs, shards, pea_id):
    size_shard = len(docs) // shards
    size_shard_modulus = len(docs) % shards
    ids_dump, vectors_dump = import_vectors(
        dump_path,
        str(pea_id),
    )
    if pea_id == shards - 1:
        # the last pea absorbs the remainder of the integer division
        docs_expected = docs[
            pea_id * size_shard : (pea_id + 1) * size_shard + size_shard_modulus
        ]
    else:
        docs_expected = docs[pea_id * size_shard : (pea_id + 1) * size_shard]
    print(f'### pea {pea_id} has {len(docs_expected)} docs')
    ids_dump = list(ids_dump)
    vectors_dump = list(vectors_dump)
    np.testing.assert_equal(ids_dump, [d.id for d in docs_expected])
    np.testing.assert_allclose(vectors_dump, [d.embedding for d in docs_expected])

    _, metas_dump = import_metas(
        dump_path,
        str(pea_id),
    )
    metas_dump = list(metas_dump)
    np.testing.assert_equal(
        metas_dump,
        [
            DBMSIndexDriver._doc_without_embedding(d).SerializeToString()
            for d in docs_expected
        ],
    )

    # assert with Indexers
    # TODO currently metas are only passed to the parent Compound, not to the inner components
    with TimeContext(f'### reloading {len(docs_expected)}'):
        # noinspection PyTypeChecker
        cp: CompoundQueryExecutor = BaseQueryIndexer.load_config(
            'indexer_query.yml',
            pea_id=pea_id,
            metas={
                'workspace': os.path.join(dump_path, 'new_ws'),
                'dump_path': dump_path,
            },
        )
    for c in cp.components:
        assert c.size == len(docs_expected)

    # test with the inner indexers separate from the Compound
    for i, indexer_file in enumerate(['basic/query_np.yml', 'basic/query_kv.yml']):
        indexer = BaseQueryIndexer.load_config(
            indexer_file,
            pea_id=pea_id,
            metas={
                'workspace': os.path.realpath(os.path.join(dump_path, f'new_ws-{i}')),
                'dump_path': dump_path,
            },
        )
        assert indexer.size == len(docs_expected)

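# Worked example of the slicing above: 10 docs over 3 shards gives
# size_shard == 3 and size_shard_modulus == 1, so the last pea gets the extra
# doc. A standalone check with the same arithmetic (hypothetical helper):
def _shard_slice(items, shards, pea_id):
    size_shard = len(items) // shards
    end = (pea_id + 1) * size_shard
    if pea_id == shards - 1:
        end += len(items) % shards
    return items[pea_id * size_shard : end]


assert [_shard_slice(list(range(10)), 3, p) for p in range(3)] == [
    [0, 1, 2],
    [3, 4, 5],
    [6, 7, 8, 9],
]
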
def test_flow_slow_executor_inter():
    f = (
        Flow()
        .add(uses='SlowExecutor', parallel=3)
        .add(uses='SlowExecutor', parallel=3)
    )
    with f, TimeContext('start flow') as tc:
        assert tc.now() < 8

async def test_run_async_flow_other_task_concurrent():
    with TimeContext('concurrent await') as t:
        await concurrent_main()

    # some dispatch cost, can't be just 5s, usually at <7s
    assert t.duration < 8

def test_threading_query_while_reloading(tmpdir, nr_docs, emb_size, mocker, reraise):
    global operations

    def update_rolling(flow, pod_name, dump_path):
        with reraise:
            flow.rolling_update(pod_name, dump_path)

    # TODO better way to test async procedure call order
    # patch
    def _rolling_update(self, dump_path):
        _print_and_append_to_ops('### calling patched rolling update')
        for i in range(len(self.replicas)):
            _print_and_append_to_ops(f'### replica {i} -- starting')
            replica = self.replicas[i]
            replica.close()
            _print_and_append_to_ops(f'### replica {i} -- went offline')
            time.sleep(3)  # wait for query to hit system when one replica is offline
            _args = self.replicas_args[i]
            _args.noblock_on_start = False
            _args.dump_path = dump_path
            new_replica = Pod(_args)
            self.enter_context(new_replica)
            _print_and_append_to_ops(f'### replica {i} - new instance online')
            self.replicas[i] = new_replica
            time.sleep(5)

    mocker.patch(
        'jina.peapods.pods.compoundpod.CompoundPod.rolling_update',
        new_callable=lambda: _rolling_update,
    )

    docs = list(get_documents(nr=nr_docs, index_start=0, emb_size=emb_size))
    assert len(docs) == nr_docs
    nr_search = 3

    dump_path = os.path.join(str(tmpdir), 'dump_dir')
    os.environ['DBMS_WORKSPACE'] = os.path.join(str(tmpdir), 'index_ws')
    os.environ['QUERY_WORKSPACE'] = os.path.join(str(tmpdir), 'query_ws')
    os.environ['USES_AFTER'] = '_pass'
    os.environ['QUERY_SHARDS'] = str(1)

    with Flow.load_config('flow_dbms.yml') as flow_dbms:
        with Flow.load_config('flow_query.yml') as flow_query:
            client_dbms = get_client(flow_dbms.port_expose)
            client_query = get_client(flow_query.port_expose)

            with TimeContext(f'### indexing {len(docs)} docs'):
                client_dbms.index(docs)

            with TimeContext(f'### dumping {len(docs)} docs'):
                flow_dbms.dump('indexer_dbms', dump_path=dump_path, shards=1)

            dir_size = path_size(dump_path)
            print(f'### dump path size: {dir_size} MBs')

            # test with query while reloading async.
            t = Thread(
                target=update_rolling, args=(flow_query, 'indexer_query', dump_path)
            )

            # searching on the still empty replica
            t.start()
            time.sleep(1)  # wait a bit for replica 1 to be offline
            _print_and_append_to_ops('### querying -- expecting empty')
            result = client_query.search(docs[:nr_search])
            _validate_results_empty(result[0])

            t.join()

            # done with both -- we should have matches now
            cb = functools.partial(
                _validate_results_nonempty, nr_search, nr_docs, emb_size
            )
            _print_and_append_to_ops('### querying -- expecting data')
            result = client_query.search(docs[:nr_search])
            cb(result[0])

    # collect logs and assert order of operations
    assert _assert_order_ops(
        operations,
        [
            '### replica 0 -- went offline',
            '### querying -- expecting empty',
            '### replica 0 - new instance online',
            '### replica 1 -- went offline',
            '### replica 1 - new instance online',
            '### querying -- expecting data',
        ],
    )
    operations = []

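# `_assert_order_ops` lives in the test module; conceptually it verifies that
# the expected markers appear in `operations` in order (not necessarily
# adjacent). A hypothetical equivalent using a shared iterator:
def _assert_order_ops_sketch(operations, expected):
    it = iter(operations)
    # each `any(...)` consumes the iterator up to and including its match,
    # so later markers must occur after earlier ones
    return all(any(op == exp for op in it) for exp in expected)


assert _assert_order_ops_sketch(['a', 'x', 'b'], ['a', 'b'])
assert not _assert_order_ops_sketch(['b', 'a'], ['a', 'b'])
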
async def test_run_async_flow_other_task_sequential():
    with TimeContext('sequential await') as t:
        await sequential_main()

    assert t.duration >= 10

async def test_run_async_flow_other_task_concurrent(protocol):
    with TimeContext('concurrent await') as t:
        await concurrent_main(protocol)

    # some dispatch cost, can't be just 5s, usually at 7~8s, but must be <10s
    assert t.duration < 10

def test_flow_slow_executor_inter():
    f = (
        Flow()
        .add(uses='SlowExecutor', shards=3)
        .add(uses='SlowExecutor', shards=3)
    )
    with f, TimeContext('start flow') as tc:
        assert tc.now() < 8

def index(num_doc, target: dict):
    f = Flow.load_config('flows/index.yml')
    with f:
        with TimeContext(f'QPS: indexing {num_doc}', logger=f.logger):
            f.index(index_generator(num_doc, target), request_size=2048)

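# `index_generator` is defined elsewhere in this benchmark. A hypothetical
# minimal version, assuming `target` maps integer ids to raw content:
def _index_generator_sketch(num_doc: int, target: dict):
    for i in range(num_doc):
        d = Document(content=target[i])
        d.tags['id'] = i
        yield d
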
def index():
    f = Flow.load_config('flows/index.yml')
    with f:
        input_docs = input_fn()
        with TimeContext(f'QPS: indexing {len(input_docs)}', logger=f.logger):
            f.index(input_docs, request_size=8)