async def _compute_response():
    """Simulate an executor's reply for the enclosing *request*/*deployment*.

    Deep-copies the incoming request, waits briefly to mimic transport
    latency, then fills the response docs according to which mock
    deployment is being addressed.
    """
    reply = copy.deepcopy(request)
    endpoint = request.header.exec_endpoint
    await asyncio.sleep(0.1)  # simulated transport delay

    if deployment == 'indexer-executor':
        if endpoint == '/index':
            # NOTE(review): blocking sleep inside a coroutine — presumably
            # deliberate here to mimic a busy executor; confirm.
            time.sleep(0.1)
            self._docs.extend(request.docs)
        else:
            # search path: answer with the ids of everything indexed so far
            buffer = reply.docs
            buffer.clear()
            buffer.extend(
                DocumentArray(Document(tags={'ids': self._docs[:, 'id']}))
            )
            reply.data.docs = buffer
        return reply

    if deployment == 'slow-executor':
        await asyncio.sleep(SLOW_EXECUTOR_SLEEP_TIME)

    stamped = DocumentArray()
    for doc in request.docs:
        new_doc = Document(doc, copy=True)
        new_doc.tags['executor'] = time.time()
        print(
            f'in {deployment}, {new_doc.id} => time: {readable_time_from(new_doc.tags["executor"])}, {new_doc.tags["executor"]}',
            flush=True,
        )
        stamped.append(new_doc)
    buffer = reply.docs
    buffer.clear()
    buffer.extend(stamped)
    reply.data.docs = buffer
    return reply
def _yield_data_request():
    """Build a fresh DataRequest carrying one empty Document and a random id."""
    request = DataRequest()
    request.header.request_id = random_identity()
    payload = DocumentArray()
    payload.append(Document())
    request.data.docs = payload
    return request
def send_requests_once(
    self,
    requests: List[Request],
    deployment: str,
    head: bool,
    endpoint: str = None,
    timeout: float = 1.0,
) -> asyncio.Task:
    """Mock a head's request fan-out: record the call and schedule a reply.

    Only head nodes are supported (asserted). The reply echoes each doc's
    text suffixed with its client id and the deployment name; it is
    delivered by a task that sleeps a small random amount so responses
    from different deployments interleave.
    """
    assert head
    self.deployments_called.append(deployment)
    reply = copy.deepcopy(requests[0])
    echoed = DocumentArray()
    for incoming in requests[0].docs:
        client = incoming.text[0:7]
        self.sent_msg[client][deployment] = incoming.text
        outgoing = Document(
            text=incoming.text + f'-{client}-{deployment}', tags=incoming.tags
        )
        echoed.append(outgoing)
        self.responded_messages[client][deployment] = outgoing.text
    reply.data.docs = echoed

    async def _deliver():
        import random

        # random jitter shuffles reply arrival order across deployments
        await asyncio.sleep(1 / (random.randint(1, 3) * 10))
        return reply, {}

    return asyncio.create_task(_deliver())
def debug(self, docs_matrix: List[DocumentArray], **kwargs):
    """Merge per-shard docs: one output doc per position, tags aggregated."""
    self.logger.debug(
        f'Received doc matrix in exec-merger with length {len(docs_matrix)}.'
    )
    merged = DocumentArray()
    for siblings in zip(*docs_matrix):
        out = Document()
        # flatten the executor trails from every shard into a single list
        out.tags['traversed-executors'] = list(
            chain(*(d.tags['traversed-executors'] for d in siblings))
        )
        # the remaining tags are kept as one-value-per-shard lists
        out.tags['shard_id'] = [d.tags['shard_id'] for d in siblings]
        out.tags['pea_id'] = [d.tags['pea_id'] for d in siblings]
        out.tags['shards'] = [d.tags['shards'] for d in siblings]
        out.tags['parallel'] = [d.tags['parallel'] for d in siblings]
        merged.append(out)
    return merged
def rank(
    self, docs_matrix: List['DocumentArray'], parameters: Dict, **kwargs
) -> 'DocumentArray':
    """Fuse matches from two modalities into a single ranked match list.

    :param docs_matrix: two parallel :class:`DocumentArray`s (one per
        modality); matches sharing a ``parent_id`` are merged via a
        weighted sum of their scores.
    :param parameters: must contain ``top_k``, the number of fused
        matches to keep per query.
    :param kwargs: not used (kept to maintain interface)
    :return: a :class:`DocumentArray` with one Document per query whose
        matches are the fused, score-sorted top-k results.
    """
    result_da = DocumentArray(
    )  # length: 1 as every time there is only one query
    for d_mod1, d_mod2 in zip(*docs_matrix):
        final_matches = {}  # type: Dict[str, Document]

        # seed the fusion table with modality 1, weighting its scores
        for m in d_mod1.matches:
            m.score.value *= d_mod1.weight
            final_matches[m.parent_id] = Document(m, copy=True)

        # fold in modality 2: add to an existing entry or start a new one
        for m in d_mod2.matches:
            if m.parent_id in final_matches:
                final_matches[m.parent_id].score.value += (
                    m.score.value * d_mod2.weight
                )
            else:
                m.score.value *= d_mod2.weight
                final_matches[m.parent_id] = Document(m, copy=True)

        da = DocumentArray(list(final_matches.values()))
        da.sort(key=lambda ma: ma.score.value, reverse=True)
        d = Document(matches=da[:int(parameters['top_k'])])
        result_da.append(d)
    return result_da
def send_requests_once(
    self,
    requests,
    deployment: str,
    head: bool,
    endpoint: str = None,
    timeout: float = 1.0,
) -> asyncio.Task:
    """Mock fan-out: reply with each doc's text tagged by client id and deployment."""
    assert head
    incoming = requests[0]
    reply = copy.deepcopy(incoming)
    tagged = DocumentArray()
    for doc in incoming.docs:
        client = doc.text[0:7]
        tagged.append(
            Document(id=doc.id, text=doc.text + f'-{client}-{deployment}')
        )
    reply.data.docs = tagged

    async def _deliver():
        import random

        # small random delay so replies arrive in shuffled order
        await asyncio.sleep(1 / (random.randint(1, 3) * 10))
        return reply, {}

    return asyncio.create_task(_deliver())
def _get_sync_requests_iterator(num_requests):
    """Yield *num_requests* DataRequests, each with a fresh id and one empty doc."""
    for _ in range(num_requests):
        request = DataRequest()
        request.header.request_id = random_identity()
        payload = DocumentArray()
        payload.append(Document())
        request.data.docs = payload
        yield request
def join_reduce(self, docs_matrix: List[DocumentArray], parameters, **kwargs):
    """Concatenate the docs of every non-empty array in *docs_matrix*."""
    joined = DocumentArray()
    for array in docs_matrix:
        if array:
            joined.extend(array)
    return joined
def encoder_doc_array_for_search(encoder_doc_array, tmpdir):
    """Fixture: create a test image, then return the docs with every
    jpeg chunk's URI converted to a data URI."""
    create_test_img(path=str(tmpdir), file_name='1.png')
    converted = DocumentArray()
    for doc in encoder_doc_array:
        for chunk in doc.chunks:
            if chunk.mime_type == 'image/jpeg':
                chunk.convert_uri_to_datauri()
        converted.append(doc)
    return converted
def test_union(docarray, document_factory):
    """Concatenating two arrays preserves both sequences in order."""
    extra = DocumentArray([])
    for idx in range(4, 10):
        extra.append(document_factory.create(idx, f'test {idx}'))
    union = docarray + extra
    # the original three docs come first ...
    for idx in range(0, 3):
        assert union[idx].id == docarray[idx].id
    # ... followed by the six appended ones
    for idx in range(0, 6):
        assert union[idx + 3].id == extra[idx].id
def test_document_save_load(method, tmp_path):
    """Round-trip a bulk-built and an append-built array through save/load."""
    bulk = DocumentArray(random_docs(1000))
    incremental = DocumentArray()
    for doc in random_docs(10):
        incremental.append(doc)
    for da in (bulk, incremental):
        tmp_file = os.path.join(tmp_path, 'test')
        with TimeContext(f'w/{method}'):
            da.save(tmp_file, file_format=method)
        with TimeContext(f'r/{method}'):
            loaded = DocumentArray.load(tmp_file, file_format=method)
        assert len(da) == len(loaded)
        # every document must survive the round trip field-by-field
        for original, restored in zip(da, loaded):
            assert original.id == restored.id
            np.testing.assert_equal(original.embedding, restored.embedding)
            assert original.content == restored.content
def debug(self, docs_matrix: List[DocumentArray], parameters: Dict, **kwargs):
    """Merge shard results: flatten each position's executor trails into one doc."""
    self.logger.debug(
        f'Received doc matrix in exec-merger with length {len(docs_matrix)}.'
    )
    merged = DocumentArray()
    for siblings in zip(*docs_matrix):
        trails = chain(*(d.tags['traversed-executors'] for d in siblings))
        out = Document()
        out.tags['traversed-executors'] = list(trails)
        merged.append(out)
    return merged
def rank(self, docs_matrix: List[DocumentArray], parameters: Dict, **kwargs):
    """Fuse matches across modalities by summing scores per ``root_doc_id``.

    Empty or missing modality arrays are skipped; each query position
    yields one Document whose matches are the score-sorted top-k roots.
    """
    fused = DocumentArray()
    non_empty = [arr for arr in docs_matrix if arr is not None and len(arr) > 0]
    for per_modality in zip(*non_empty):
        by_root = {}
        for doc in per_modality:
            for m in doc.matches:
                root = m.tags['root_doc_id']
                if root in by_root:
                    # same root seen before: accumulate the score
                    by_root[root].score.value += m.score.value
                else:
                    by_root[root] = Document(id=root, score=m.score)
        ranked = DocumentArray(list(by_root.values()))
        ranked.sort(key=lambda match: match.score.value, reverse=True)
        fused.append(Document(matches=ranked[: int(parameters.get('top_k', 3))]))
    return fused
def docarray_for_nest_split():
    """Five docs whose nested tag ``nest.category`` covers c, c, b, a, a."""
    categories = ('c', 'c', 'b', 'a', 'a')
    da = DocumentArray()
    for cat in categories:
        da.append(Document(tags={'nest': {'category': cat}}))
    return da
def docarray_for_split_at_zero():
    """Five docs with float 'category' tags: 0.0, 0.0, 1.0, 2.0, 2.0."""
    da = DocumentArray()
    for value in (0.0, 0.0, 1.0, 2.0, 2.0):
        da.append(Document(tags={'category': value}))
    return da
def rank(
    self, docs_matrix: List['DocumentArray'], parameters: Dict, **kwargs
) -> 'DocumentArray':
    """Fuse two modalities' matches into one 'relevance'-ranked list.

    :param docs_matrix: list of :class:`DocumentArray` on multiple requests to
        get bubbled up matches.
    :param parameters: the parameters passed into the ranker, in this case
        stores :attr`top_k` to filter k results based on score.
    :param kwargs: not used (kept to maintain interface)
    """
    result_da = DocumentArray(
    )  # length: 1 as every time there is only one query
    for d_mod1, d_mod2 in zip(*docs_matrix):
        fused = {}  # type: Dict[str, Document]

        # seed with modality 1, weighting its cosine score
        for m in d_mod1.matches:
            m.scores['relevance'] = m.scores['cosine'].value * d_mod1.weight
            fused[m.parent_id] = Document(m, copy=True)

        # fold in modality 2: add onto an existing entry or start a new one
        for m in d_mod2.matches:
            weighted = m.scores['cosine'].value * d_mod2.weight
            if m.parent_id in fused:
                previous = fused[m.parent_id].scores['relevance'].value
                fused[m.parent_id].scores['relevance'] = previous + weighted
            else:
                m.scores['relevance'] = weighted
                fused[m.parent_id] = Document(m, copy=True)

        candidates = DocumentArray(list(fused.values()))
        candidates.sort(
            key=lambda ma: ma.scores['relevance'].value, reverse=True
        )
        result_da.append(
            Document(matches=candidates[: int(parameters['top_k'])])
        )
    return result_da
def docarray_for_split_at_zero():
    """Five float-'category' docs (0.0, 0.0, 1.0, 2.0, 2.0) plus a memmap copy."""
    da = DocumentArray()
    for value in (0.0, 0.0, 1.0, 2.0, 2.0):
        da.append(Document(tags={'category': value}))
    dam = DocumentArrayMemmap()
    dam.extend(da)
    return da, dam
def docarray_for_split():
    """Five string-'category' docs (c, c, b, a, a) plus a memmap copy."""
    da = DocumentArray()
    for cat in ('c', 'c', 'b', 'a', 'a'):
        da.append(Document(tags={'category': cat}))
    dam = DocumentArrayMemmap()
    dam.extend(da)
    return da, dam
def foo(self, docs: DocumentArray, **kwargs):
    """Append one doc carrying this shard's id, then return the batch."""
    marker = Document(text=str(self.shard_id))
    docs.append(marker)
    return docs
def create_req_from_text(text: str):
    """Wrap *text* in a one-doc DataRequest (tags fixed to {'key': 4})."""
    request = DataRequest()
    payload = DocumentArray()
    payload.append(Document(text=text, tags={'key': 4}))
    request.data.docs = payload
    return request
def index(self, docs: 'DocumentArray', **kwargs) -> DocumentArray:
    """Remember a lightweight (id, embedding) copy of each doc; pass docs through."""
    copies = DocumentArray()
    for doc in docs:
        copies.append(Document(id=doc.id, embedding=doc.embedding))
    self._docs.extend(copies)
    return docs
def test_content_hash():
    """content_hash reflects content (text, blob, buffer, other fields), not id."""
    d0 = Document(content='a')
    assert d0.content
    empty_doc = Document()
    assert not empty_doc.content
    assert empty_doc.content_hash
    # warning: a Doc with empty content will have a hash -- it hashes ''
    assert empty_doc.content_hash != d0.content_hash

    # equal text content -> equal hash; the hash is stable across reads
    d1 = Document(content='text')
    init_content_hash = d1.content_hash
    assert init_content_hash
    assert init_content_hash == d1.content_hash
    d2 = Document(content='text')
    assert init_content_hash == d2.content_hash
    d3 = Document(content='text1')
    assert init_content_hash != d3.content_hash

    # id alone does not influence the hash
    d4 = Document(id='a')
    d5 = Document(id='b')
    assert d5.content_hash == d4.content_hash

    # copies built from proto or from another Document hash identically
    d6 = Document(d2.proto)
    assert d6.content_hash == d2.content_hash
    d7 = Document(d2)
    assert d6.content_hash == d2.content_hash == d7.content_hash

    # test hash image
    d8 = Document(blob=np.array([1, 3, 5]))
    d9 = Document(blob=np.array([2, 4, 6]))
    d10 = Document(blob=np.array([1, 3, 5]))
    assert d8.content_hash != d9.content_hash
    assert d8.content_hash == d10.content_hash

    # test hash buffer
    d11 = Document(content=b'buffer1')
    d12 = Document(content=b'buffer2')
    d13 = Document(content=b'buffer1')
    assert d11.content_hash != d12.content_hash
    assert d11.content_hash == d13.content_hash

    # document with more fields
    d14 = Document(
        uri='http://test1.com', tags={'key1': 'value1'}, granularity=2, adjacency=2
    )
    d15 = Document(
        uri='http://test2.com', tags={'key1': 'value2'}, granularity=3, adjacency=2
    )
    d16 = Document(
        uri='http://test2.com', tags={'key1': 'value2'}, granularity=3, adjacency=2
    )
    assert d14.content_hash != d15.content_hash
    assert d15.content_hash == d16.content_hash

    nr = 10
    with TimeContext(f'creating {nr} docs without hashing content at init'):
        da = DocumentArray()
        for _ in range(nr):
            da.append(Document(content='text' * 2))
    with TimeContext(f'creating {nr} docs with hashing content at init'):
        da = DocumentArray()
        for _ in range(nr):
            da.append(Document(content='text' * 2))
    with TimeContext(f'iterating through docs with content hash'):
        for d in da:
            assert d.content_hash
def test_delete_by_id(docarray: DocumentArray, document_factory):
    """Deleting by id removes exactly the doc that was appended."""
    doc = document_factory.create(4, 'test 4')
    docarray.append(doc)
    del docarray[doc.id]
    assert len(docarray) == 3
    # NOTE(review): comparing the array to itself is tautological — this was
    # presumably meant to compare against a pristine fixture; confirm intent.
    assert docarray == docarray
def no_polling(self, docs: DocumentArray, **kwargs):
    """Endpoint handler: tack an 'added' marker doc onto the batch."""
    docs.append(Document(text='added'))
    return docs
def search(self, docs: DocumentArray, **kwargs):
    """Search endpoint stub: append a marker doc and echo the batch back."""
    marker = Document(text='added')
    docs.append(marker)
    return docs
def foo(self, docs: DocumentArray, **kwargs):
    """Log the batch size, append a doc named after this executor, return docs."""
    self.logger.info(f'doc count {len(docs)}')
    marker = Document(text=self.name)
    docs.append(marker)
    return docs