def get_two_docarray():
    d1 = Document(embedding=np.array([0, 0, 0]))
    d1c1 = Document(embedding=np.array([0, 1, 0]))
    d2 = Document(embedding=np.array([1, 0, 0]))
    d2c1 = Document(embedding=np.array([1, 1, 0]))
    d2c2 = Document(embedding=np.array([1, 0, 1]))
    d3 = Document(embedding=np.array([2, 1, 1]))
    d3c1 = Document(embedding=np.array([2, 1, 0]))
    d3c2 = Document(embedding=np.array([2, 0, 1]))
    d3c3 = Document(embedding=np.array([2, 0, 0]))
    d4 = Document(embedding=np.array([3, 1, 1]))
    d4c1 = Document(embedding=np.array([3, 1, 0]))
    d4c2 = Document(embedding=np.array([3, 0, 1]))
    d4c3 = Document(embedding=np.array([3, 0, 0]))
    d4c4 = Document(embedding=np.array([3, 1, 1]))

    d1.chunks.extend([d1c1])
    d2.chunks.extend([d2c1, d2c2])
    d3.chunks.extend([d3c1, d3c2, d3c3])
    d4.chunks.extend([d4c1, d4c2, d4c3, d4c4])

    da1 = DocumentArray([d1, d2])
    da2 = DocumentArray([d3, d4])
    yield da1, da2
async def _compute_response():
    response_msg = copy.deepcopy(request)
    exec_endpoint = request.header.exec_endpoint
    new_docs = DocumentArray()
    await asyncio.sleep(0.1)
    if deployment == 'indexer-executor':
        if exec_endpoint == '/index':
            time.sleep(0.1)
            self._docs.extend(request.docs)
        else:
            docs = response_msg.docs
            docs.clear()
            docs.extend(
                DocumentArray(Document(tags={'ids': self._docs[:, 'id']}))
            )
            response_msg.data.docs = docs
        return response_msg
    else:
        if deployment == 'slow-executor':
            await asyncio.sleep(SLOW_EXECUTOR_SLEEP_TIME)
        for doc in request.docs:
            new_doc = Document(doc, copy=True)
            new_doc.tags['executor'] = time.time()
            print(
                f'in {deployment}, {new_doc.id} => time: {readable_time_from(new_doc.tags["executor"])}, {new_doc.tags["executor"]}',
                flush=True,
            )
            new_docs.append(new_doc)
        docs = response_msg.docs
        docs.clear()
        docs.extend(new_docs)
        response_msg.data.docs = docs
        return response_msg
def test_scale_success(remote_flow_with_runtime: Flow, deployment_params):
    num_replicas, scale_to, shards = deployment_params
    with remote_flow_with_runtime as f:
        ret1 = Client(port=exposed_port).index(
            inputs=DocumentArray([Document() for _ in range(200)]),
            return_results=True,
            request_size=10,
        )
        f.scale(deployment_name='executor', replicas=scale_to)
        ret2 = Client(port=exposed_port).index(
            inputs=DocumentArray([Document() for _ in range(200)]),
            return_results=True,
            request_size=10,
        )

        assert len(ret1) == 20
        replicas = set()
        for r in ret1:
            assert len(r.docs) == 10
            # replicas are identified via their docker id
            for uid in r.docs[:, 'tags__uid']:
                replicas.add(uid)
        assert len(replicas) == num_replicas * shards

        assert len(ret2) == 20
        replicas = set()
        for r in ret2:
            assert len(r.docs) == 10
            for uid in r.docs[:, 'tags__uid']:
                replicas.add(uid)
        assert len(replicas) == scale_to * shards
def test_scale_success(remote_flow_with_runtime: Flow, pod_params):
    num_replicas, scale_to, shards = pod_params
    with remote_flow_with_runtime as f:
        ret1 = f.index(
            inputs=DocumentArray([Document() for _ in range(200)]),
            return_results=True,
            request_size=10,
        )
        f.scale(pod_name='executor', replicas=scale_to)
        ret2 = f.index(
            inputs=DocumentArray([Document() for _ in range(200)]),
            return_results=True,
            request_size=10,
        )

        assert len(ret1) == 20
        replica_ids = set()
        for r in ret1:
            assert len(r.docs) == 10
            for replica_id in r.docs.get_attributes('tags__replica_id'):
                replica_ids.add(replica_id)
        assert replica_ids == set(range(num_replicas))

        assert len(ret2) == 20
        replica_ids = set()
        for r in ret2:
            assert len(r.docs) == 10
            for replica_id in r.docs.get_attributes('tags__replica_id'):
                replica_ids.add(replica_id)
        assert replica_ids == set(range(scale_to))
def test_cache_legacy_field_type(tmp_path, test_metas):
    filename = os.path.join(tmp_path, 'DocCache.bin')
    doc1 = Document(id=1)
    doc1.text = 'blabla'
    doc1.update_content_hash()
    docs1 = DocumentArray([doc1])

    doc2 = Document(id=1)
    doc2.text = 'blabla2'
    doc2.update_content_hash()
    docs2 = DocumentArray([doc2])

    doc3 = Document(id=12312)
    doc3.text = 'blabla'
    doc3.update_content_hash()
    docs3 = DocumentArray([doc3])

    driver = MockBaseCacheDriver()

    with DocCache(filename, metas=test_metas, field=CONTENT_HASH_KEY) as executor:
        driver.attach(executor=executor, runtime=None)
        assert executor.fields == [CONTENT_HASH_KEY]
        driver._apply_all(docs1)
        driver._apply_all(docs2)
        assert executor.size == 2

    with BaseExecutor.load(executor.save_abspath) as executor:
        driver.attach(executor=executor, runtime=None)
        assert executor.fields == [CONTENT_HASH_KEY]
        # doc3 has the same content hash as doc1, so caching it must be rejected
        with pytest.raises(NotImplementedError):
            driver._apply_all(docs3)
def rank(
    self, docs_matrix: List['DocumentArray'], parameters: Dict, **kwargs
) -> 'DocumentArray':
    """Fuse the matches of two modalities into a single ranked match list.

    :param docs_matrix: one DocumentArray per modality; matches are merged
        by parent_id, each score weighted by its modality weight
    :param parameters: runtime parameters, expects the key ``top_k``
    :param kwargs: not used (kept to maintain interface)
    :return: a DocumentArray with one Document per query holding the fused matches
    """
    # length: 1, as every time there is only one query
    result_da = DocumentArray()
    for d_mod1, d_mod2 in zip(*docs_matrix):
        final_matches = {}  # type: Dict[str, Document]

        for m in d_mod1.matches:
            m.score.value *= d_mod1.weight
            final_matches[m.parent_id] = Document(m, copy=True)

        for m in d_mod2.matches:
            if m.parent_id in final_matches:
                final_matches[m.parent_id].score.value += (
                    m.score.value * d_mod2.weight
                )
            else:
                m.score.value *= d_mod2.weight
                final_matches[m.parent_id] = Document(m, copy=True)

        da = DocumentArray(list(final_matches.values()))
        da.sort(key=lambda ma: ma.score.value, reverse=True)
        d = Document(matches=da[: int(parameters['top_k'])])
        result_da.append(d)
    return result_da
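# A plain-Python sketch (no Jina APIs; all names hypothetical) of the weighted
# score fusion rank() performs above: matches from two modalities are merged
# by parent_id, each contribution scaled by its modality weight, then sorted
# by the fused score.
def _weighted_fusion_sketch():
    mod1 = {'doc_a': 0.9, 'doc_b': 0.5}  # parent_id -> match score, modality 1
    mod2 = {'doc_a': 0.4, 'doc_c': 0.8}  # parent_id -> match score, modality 2
    w1, w2 = 0.6, 0.4  # hypothetical modality weights
    fused = {pid: s * w1 for pid, s in mod1.items()}
    for pid, s in mod2.items():
        fused[pid] = fused.get(pid, 0.0) + s * w2
    ranked = sorted(fused.items(), key=lambda kv: kv[1], reverse=True)
    # doc_a appears in both modalities, so it ranks first (0.54 + 0.16 = 0.70)
    assert ranked[0][0] == 'doc_a'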
def test_override_requests_uses_after():
    class FooExecutor(Executor):
        @requests(on='/bar')
        def foo(self, docs, **kwargs):
            for doc in docs:
                doc.text = 'foo called'

    class OtherExecutor(Executor):
        @requests(on='/bar')
        def bar(self, docs, **kwargs):
            for doc in docs:
                doc.text = 'bar called'

    with Flow(port_expose=exposed_port).add(
        uses=FooExecutor,
        uses_requests={'/foo': 'foo'},
        uses_after=OtherExecutor,
        uses_before=OtherExecutor,
    ) as f:
        c = Client(port=exposed_port)
        resp1 = c.post(
            on='/foo', inputs=DocumentArray([Document(text='')]), return_results=True
        )
        resp2 = c.post(
            on='/non_foo',
            inputs=DocumentArray([Document(text='')]),
            return_results=True,
        )
        resp3 = c.post(
            on='/bar', inputs=DocumentArray([Document(text='')]), return_results=True
        )

        assert resp1[0].docs[0].text == 'foo called'
        assert resp2[0].docs[0].text == ''
        assert resp3[0].docs[0].text == 'bar called'
def test_docuset_traverse_over_iterator_CAVEAT():
    # HACKY USAGE CAVEAT: an iterator can not be traversed twice
    ds = DocumentArray(random_docs(num_docs, num_chunks_per_doc)).traverse(['r', 'c'])
    # note that random_docs is a generator and can only be used once,
    # therefore whichever traversal path comes first gets iterated,
    # and after that the generator is empty
    assert len(list(ds)) == 1 + num_docs

    ds = DocumentArray(random_docs(num_docs, num_chunks_per_doc)).traverse(['c', 'r'])
    assert len(list(ds)) == num_docs + 1
def test_docuset_traverse_over_iterator_HACKY():
    # HACKY USAGE, NOT RECOMMENDED: one can also traverse over a
    # "runtime"-DocumentArray backed by a generator
    ds = DocumentArray(random_docs(num_docs, num_chunks_per_doc)).traverse(['r'])
    assert len(list(list(ds)[0])) == num_docs

    ds = DocumentArray(random_docs(num_docs, num_chunks_per_doc)).traverse(['c'])
    ds = list(ds)
    assert len(ds) == num_docs
    assert len(ds[0]) == num_chunks_per_doc
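# A minimal standalone sketch (plain Python, nothing Jina-specific) of the
# generator-exhaustion behaviour the two tests above rely on: a generator
# yields its items exactly once, so whichever consumer reads it first drains
# it and any later consumer sees nothing.
def _generator_exhaustion_sketch():
    gen = (i for i in range(3))
    assert list(gen) == [0, 1, 2]  # the first consumer drains the generator
    assert list(gen) == []  # a second pass over the same generator is empty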
def test_extract_docs():
    d = Document()

    contents, docs_pts = DocumentArray([d]).all_embeddings
    assert contents is None

    vec = np.random.random([2, 2])
    d.embedding = vec
    contents, docs_pts = DocumentArray([d]).all_embeddings
    np.testing.assert_equal(contents[0], vec)
def test_documentarray_filter():
    da = DocumentArray([Document() for _ in range(6)])
    for j in range(6):
        da[j].scores['score'].value = j

    da = [d for d in da if d.scores['score'].value > 2]
    assert len(DocumentArray(da)) == 3
    for d in da:
        assert d.scores['score'].value > 2
def test_doc_iter_method(filter_fn):
    ds = list(random_docs(10))

    for d in DocumentArray(ds):
        assert d.text == 'hello world'

    for d in DocumentArray(ds).traverse_flat('c,r', filter_fn=filter_fn):
        d.text = 'modified'

    for d in DocumentArray(ds):
        assert d.text == 'modified'
def test_doc_iter_method():
    ds = list(random_docs(10))

    for d in DocumentArray(ds):
        assert d.text == 'hello world'

    for d in DocumentArray(ds).traverse_flat(['c', 'r']):
        d.text = 'modified'

    for d in DocumentArray(ds):
        assert d.text == 'modified'
def get_pair_document_array():
    da1 = DocumentArray(
        [
            Document(id='1', embedding=np.array([1, 2])),
            Document(id='2', embedding=np.array([3, 4])),
        ]
    )
    da2 = DocumentArray(
        [
            Document(id='1', embedding=np.array([1, 2])),
            Document(id='2', embedding=np.array([3, 4])),
            Document(id='3', embedding=np.array([4, 5])),
        ]
    )
    yield da1, da2
def test_documentarray_filter():
    da = DocumentArray([Document() for _ in range(6)])
    for j in range(6):
        da[j].score.value = j

    # materialize the filter result: filter() returns a lazy iterator, and
    # the len() check below would otherwise exhaust it, leaving the final
    # for-loop with nothing to iterate
    da = list(filter(lambda d: d.score.value > 2, da))
    assert len(DocumentArray(da)) == 3
    for d in da:
        assert d.score.value > 2
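# Hedged companion sketch for the fix above (plain Python, no Jina APIs,
# helper name hypothetical): filter() returns a lazy iterator, so it must be
# materialized before being read more than once.
def _lazy_filter_sketch():
    lazy = filter(lambda x: x > 2, range(6))
    materialized = list(lazy)
    assert len(materialized) == 3  # [3, 4, 5]
    assert list(lazy) == []  # the lazy iterator is already exhausted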
def test_match_exclude_self(exclude_self, num_matches, only_id):
    da1 = DocumentArray(
        [
            Document(id='1', embedding=np.array([1, 2])),
            Document(id='2', embedding=np.array([3, 4])),
        ]
    )
    da2 = DocumentArray(
        [
            Document(id='1', embedding=np.array([1, 2])),
            Document(id='2', embedding=np.array([3, 4])),
        ]
    )
    da1.match(da2, exclude_self=exclude_self, only_id=only_id)
    for d in da1:
        assert len(d.matches) == num_matches
def doc_lists_to_doc_arrays(
    doc_lists, tmpdir, first_memmap, second_memmap, buffer_pool_size
):
    doc_list1, doc_list2 = doc_lists
    tmpdir1, tmpdir2 = tmpdir / '1', tmpdir / '2'

    D1 = (
        DocumentArray()
        if not first_memmap
        else DocumentArrayMemmap(tmpdir1, buffer_pool_size=buffer_pool_size)
    )
    D1.extend(doc_list1)

    D2 = (
        DocumentArray()
        if not second_memmap
        else DocumentArrayMemmap(tmpdir2, buffer_pool_size=buffer_pool_size)
    )
    D2.extend(doc_list2)
    return D1, D2
async def test_scale_remote_flow_async(
    docker_image_built, async_jinad_client, deployment_params
):
    replicas, scale_to, shards = deployment_params
    workspace_id = await async_jinad_client.workspaces.create(
        paths=[os.path.join(cur_dir, cur_dir)]
    )
    assert workspace_id
    flow_id = await async_jinad_client.flows.create(
        workspace_id=workspace_id,
        filename='flow-scalable.yml',
        envs={'num_shards': str(shards), 'num_replicas': str(replicas)},
    )
    assert flow_id

    ret1 = Client(host=HOST, port=FLOW_PORT, protocol='http', asyncio=True).index(
        inputs=DocumentArray([Document() for _ in range(1000)]),
        return_results=True,
        request_size=10,
    )
    process_ids = set()
    async for r in ret1:
        # identify replicas by the process they run in
        for p_id in r.docs[:, 'tags__process_id']:
            process_ids.add(p_id)
    assert len(process_ids) == replicas * shards

    await async_jinad_client.flows.scale(
        id=flow_id, deployment_name=SCALE_EXECUTOR, replicas=scale_to
    )
    ret2 = Client(host=HOST, port=FLOW_PORT, protocol='http', asyncio=True).index(
        inputs=DocumentArray([Document() for _ in range(1000)]),
        return_results=True,
        request_size=10,
    )
    process_ids = set()
    async for r in ret2:
        for p_id in r.docs[:, 'tags__process_id']:
            process_ids.add(p_id)
    assert len(process_ids) == scale_to * shards

    assert await async_jinad_client.flows.delete(flow_id)
    assert await async_jinad_client.workspaces.delete(workspace_id)
def docarrays_for_embedding_distance_computation_sparse():
    d1 = Document(embedding=sp.csr_matrix([0, 0, 0]))
    d2 = Document(embedding=sp.csr_matrix([3, 0, 0]))
    d3 = Document(embedding=sp.csr_matrix([1, 0, 0]))
    d4 = Document(embedding=sp.csr_matrix([2, 0, 0]))

    d1_m = Document(embedding=sp.csr_matrix([1, 0, 0]))
    d2_m = Document(embedding=sp.csr_matrix([2, 0, 0]))
    d3_m = Document(embedding=sp.csr_matrix([0, 0, 1]))
    d4_m = Document(embedding=sp.csr_matrix([0, 0, 2]))
    d5_m = Document(embedding=sp.csr_matrix([0, 0, 3]))

    D1 = DocumentArray([d1, d2, d3, d4])
    D2 = DocumentArray([d1_m, d2_m, d3_m, d4_m, d5_m])
    return D1, D2
def __init__(self, index_file_name: str, **kwargs):
    super().__init__(**kwargs)
    self.index_file_name = index_file_name
    if os.path.exists(self.save_path):
        self._docs = DocumentArray.load(self.save_path)
    else:
        self._docs = DocumentArray()
async def _test():
    responses = []
    req = request_generator('/', DocumentArray([Document(text='client0-Request')]))
    async for resp in runtime.streamer.Call(request_iterator=req):
        responses.append(resp)
    return responses
def send_requests_once(
    self,
    requests: List[Request],
    deployment: str,
    head: bool,
    endpoint: str = None,
    timeout: float = 1.0,
) -> asyncio.Task:
    assert head
    self.deployments_called.append(deployment)
    response_msg = copy.deepcopy(requests[0])
    new_docs = DocumentArray()
    for doc in requests[0].docs:
        clientid = doc.text[0:7]
        self.sent_msg[clientid][deployment] = doc.text
        new_doc = Document(text=doc.text + f'-{clientid}-{deployment}', tags=doc.tags)
        new_docs.append(new_doc)
        self.responded_messages[clientid][deployment] = new_doc.text

    response_msg.data.docs = new_docs

    async def task_wrapper():
        import random

        await asyncio.sleep(1 / (random.randint(1, 3) * 10))
        return response_msg, {}

    return asyncio.create_task(task_wrapper())
def _create_test_data_message():
    req = list(
        request_generator(
            '/', DocumentArray([Document(text='input document') for _ in range(10)])
        )
    )[0]
    return req
def test_document_save_load(method, tmp_path):
    da1 = DocumentArray(random_docs(1000))
    da2 = DocumentArray()
    for doc in random_docs(10):
        da2.append(doc)

    for da in [da1, da2]:
        tmp_file = os.path.join(tmp_path, 'test')
        with TimeContext(f'w/{method}'):
            da.save(tmp_file, file_format=method)
        with TimeContext(f'r/{method}'):
            da_r = DocumentArray.load(tmp_file, file_format=method)
        assert len(da) == len(da_r)
        for d, d_r in zip(da, da_r):
            assert d.id == d_r.id
            np.testing.assert_equal(d.embedding, d_r.embedding)
            assert d.content == d_r.content
def empty_documents():
    docs = []
    for idx in range(100, 120):
        with Document() as d:
            d.id = f'{idx:0>16}'
        docs.append(d)
    return DocumentArray(docs)
def deleted_documents():
    docs = []
    for idx in range(3):
        with Document() as d:
            d.id = f'{idx:0>16}'
        docs.append(d)
    return DocumentArray(docs)
def index(num_docs: int):
    num_docs = min(
        num_docs, len(glob(os.path.join(os.getcwd(), IMAGE_SRC), recursive=True))
    )
    flow = (
        Flow(workspace="workspace")
        .add(
            uses={
                "jtype": "ImageCrafter",
                "with": {
                    "target_size": 96,
                    "img_mean": [0.485, 0.456, 0.406],
                    "img_std": [0.229, 0.224, 0.225],
                },
            }
        )
        .add(uses=BigTransferEncoder)
        .add(
            uses={
                "jtype": "EmbeddingIndexer",
                "with": {"index_file_name": "image.json"},
                "metas": {"name": "vec_idx"},
            },
            name="vec_idx",
        )
        .add(
            uses={"jtype": "KeyValueIndexer", "metas": {"name": "kv_idx"}},
            name="kv_idx",
            needs="gateway",
        )
        .add(name="join_all", needs=["kv_idx", "vec_idx"], read_only="true")
    )
    with flow:
        document_generator = from_files(IMAGE_SRC, size=num_docs)
        flow.post(
            on='/index',
            inputs=DocumentArray(document_generator),
            request_size=64,
            read_mode='rb',
        )
def test_override_params(mocker):
    f = (
        Flow(port_expose=exposed_port)
        .add(uses={'jtype': 'DummyOverrideParams', 'metas': {'name': 'exec_name'}})
        .add(uses=DummyAssertNotOverrideBetweenPodsParams)
        .add(uses=DummyAssertIfParamsCanBeChangedInsidePods)
    )
    error_mock = mocker.Mock()
    with f:
        resp = Client(port=exposed_port).index(
            inputs=DocumentArray([Document()]),
            parameters={
                'param1': 50,
                'param2': 60,
                'exec_name': {'param1': 'changed'},
            },
            on_error=error_mock,
            return_results=True,
        )
    error_mock.assert_not_called()

    assert len(resp) == 1
    assert len(resp[0].docs) == 1
    for doc in resp[0].docs:
        assert doc.tags == OVERRIDEN_EXECUTOR1_PARAMS
        assert doc.tags['param1'] == 'changed'
        assert doc.tags['param2'] == 60
        assert doc.tags['exec_name']['param1'] == 'changed'
def documents(embedding_cls_type, text_prefix='', num_docs=5):
    docs = []
    for idx in range(num_docs):
        with Document(text=f'{text_prefix}{idx}') as d:
            d.id = f'{idx:0>16}'
            dense_embedding = np.random.random([10])
            if embedding_cls_type == 'dense':
                d.embedding = dense_embedding
            elif embedding_cls_type == 'scipy_csr':
                d.embedding = scipy.sparse.csr_matrix(dense_embedding)
            elif embedding_cls_type == 'scipy_coo':
                d.embedding = scipy.sparse.coo_matrix(dense_embedding)
            elif embedding_cls_type == 'torch':
                sparse_embedding = scipy.sparse.coo_matrix(dense_embedding)
                values = sparse_embedding.data
                indices = np.vstack((sparse_embedding.row, sparse_embedding.col))
                d.embedding = torch.sparse_coo_tensor(
                    indices,
                    values,
                    sparse_embedding.shape,
                )
            elif embedding_cls_type == 'tf':
                sparse_embedding = scipy.sparse.coo_matrix(dense_embedding)
                values = sparse_embedding.data
                indices = [
                    (x, y)
                    for x, y in zip(sparse_embedding.row, sparse_embedding.col)
                ]
                d.embedding = tf.SparseTensor(
                    indices=indices,
                    values=values,
                    dense_shape=[1, 10],
                )
        docs.append(d)
    return DocumentArray(docs)
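# Hedged companion sketch for the fixture above (helper name hypothetical;
# assumes numpy, scipy and torch are importable): verifies that the
# scipy-COO -> torch sparse-tensor conversion used there round-trips back to
# the original dense vector.
def _sparse_roundtrip_sketch():
    import numpy as np
    import scipy.sparse
    import torch

    dense = np.random.random([10])
    coo = scipy.sparse.coo_matrix(dense)  # shape (1, 10)
    indices = np.vstack((coo.row, coo.col))
    t = torch.sparse_coo_tensor(indices, coo.data, coo.shape)
    # to_dense() materializes the sparse tensor; values must match the input
    np.testing.assert_allclose(t.to_dense().numpy()[0], dense)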
def test_da_with_different_inputs():
    docs = [Document() for _ in range(10)]
    da = DocumentArray(
        [docs[i] if (i % 2 == 0) else docs[i].proto for i in range(len(docs))]
    )
    assert len(da) == 10
    for d in da:
        assert isinstance(d, Document)