def create_chunk_chunk_matches_to_score():
    """Build a three-level document fixture whose deepest chunks carry matches.

    Layout, using the values assigned in this function::

        doc: (id: 100, granularity: 0)
        |- chunk: (id: 101, parent_id: 100, granularity: 1)
            |- chunk: (id: 10, parent_id: 1, granularity: 2)
            |   |- match: (id: 12, parent_id: 1, score.value: 2)
            |   |- match: (id: 13, parent_id: 1, score.value: 3)
            |- chunk: (id: 20, parent_id: 2, granularity: 2)
                |- match: (id: 24, parent_id: 2, score.value: 4)
                |- match: (id: 25, parent_id: 2, score.value: 5)

    Every match has ``length`` 4 and a score whose ``ref_id`` is the id of
    the chunk that holds it.

    :return: a new ``Document`` constructed from the assembled root document
    """
    matches_per_chunk = 2

    root = Document()
    root.id = '100'
    root.granularity = 0

    middle = Document()
    middle.id = '101'
    middle.parent_id = root.id
    middle.granularity = root.granularity + 1

    for group in (1, 2):
        group_label = str(group)
        first_score = group * 2

        leaf = Document()
        leaf.id = str(group * 10)
        leaf.parent_id = group_label
        leaf.granularity = middle.granularity + 1

        for score_value in range(first_score, first_score + matches_per_chunk):
            hit = Document()
            hit.parent_id = group_label
            hit.score = NamedScore(value=score_value, ref_id=leaf.id)
            hit.id = str(10 * group + score_value)
            hit.length = 4
            leaf.matches.append(hit)

        # Attach the leaf only after all of its matches have been added.
        middle.chunks.append(leaf)

    # Likewise, the middle chunk is attached once it is fully populated.
    root.chunks.append(middle)
    return Document(root)
def create_document_to_score():
    """Build a document with two chunks, each holding two scored matches.

    Layout, using the values assigned in this function::

        doc: 1
        |- chunk: 2
        |   |- match: (id: 4, parent_id: 40, score.value: 4)
        |   |- match: (id: 5, parent_id: 50, score.value: 5)
        |- chunk: 3
            |- match: (id: 6, parent_id: 60, score.value: 6)
            |- match: (id: 7, parent_id: 70, score.value: 7)

    Each match additionally gets ``length`` equal to its numeric id, a
    ``price`` tag copied from its score value and a ``discount`` tag set to
    ``DISCOUNT_VAL``.

    :return: the assembled root document
    """
    root = Document()
    root.id = '1'

    for chunk_number in (2, 3):
        chunk = Document()
        chunk.id = str(chunk_number)

        for offset in (0, 1):
            numeric_id = 2 * chunk_number + offset

            hit = Document()
            hit.id = str(numeric_id)
            hit.parent_id = str(10 * numeric_id)
            # Original note on these values: "to be used by MaxRanker and
            # MinRanker".
            hit.length = numeric_id
            hit.score = NamedScore(value=numeric_id, ref_id=chunk.id)
            hit.tags['price'] = hit.score.value
            hit.tags['discount'] = DISCOUNT_VAL
            chunk.matches.append(hit)

        # Attach the chunk only after both of its matches have been added.
        root.chunks.append(chunk)

    return root
def search(
    self, docs: 'DocumentArray', parameters: Dict, **kwargs
) -> 'DocumentArray':
    """Attach the ``top_k`` best-ranked indexed documents to each query.

    The embeddings of ``docs`` and of ``self._docs`` are stacked, passed
    through the module helpers ``_norm``, ``_ext_A`` / ``_ext_B`` and
    ``_cosine`` (defined elsewhere in this file), and ranked with
    ``self._get_sorted_top_k``. For every selected index position a copy of
    the indexed document is appended to the query's ``matches`` with
    ``score.value = 1 - distance`` and ``parent_id`` set to that position.

    :param docs: query documents; each must expose an ``embedding``
    :param parameters: request parameters; ``top_k`` (default 5) is the
        number of matches requested per query
    :param kwargs: accepted for interface compatibility and ignored
    :return: the same ``docs`` object, with matches appended in place
    :raises AssertionError: if ``top_k`` is not strictly positive
    """
    top_k = int(parameters.get('top_k', 5))
    if top_k <= 0:
        # Explicit raise instead of ``assert`` so the check still runs under
        # ``python -O``; the exception type is kept for existing callers.
        raise AssertionError(f'top_k must be a positive integer, got {top_k}')

    # ``np.stack`` raises ValueError on an empty sequence, so there is
    # nothing to do when there are no queries or nothing has been indexed.
    # ``len`` is used rather than truthiness to rely only on ``__len__``.
    if docs is None or len(docs) == 0:
        return docs
    if len(self._docs) == 0:
        return docs

    query_embeddings = np.stack(docs.get_attributes('embedding'))
    index_embeddings = np.stack(self._docs.get_attributes('embedding'))
    dists = _cosine(
        _ext_A(_norm(query_embeddings)),
        _ext_B(_norm(index_embeddings)),
    )

    idx, dist = self._get_sorted_top_k(dists, top_k)
    for query, positions, distances in zip(docs, idx, dist):
        for _id, _dist in zip(positions, distances):
            position = int(_id)
            # Copy so that scoring the match never mutates the stored document.
            match = Document(self._docs[position], copy=True)
            match.score.value = 1 - _dist
            match.parent_id = position
            query.matches.append(match)
    return docs
def create_document_to_score_same_depth_level():
    """Build a document whose four matches hang directly off the root.

    Layout, using the values assigned in this function::

        doc: 1
        |- match: (id: 2, parent_id: 20, score.value: 30, weight: 3)
        |- match: (id: 3, parent_id: 20, score.value: 40, weight: 4)
        |- match: (id: 4, parent_id: 30, score.value: 20, weight: 2)
        |- match: (id: 5, parent_id: 30, score.value: 10, weight: 1)

    Every score uses the root document's id as its ``ref_id``.

    :return: the assembled root document
    """
    # One row per match: (id, parent_id, score value, weight).
    match_rows = (
        (2, 20, 30, 3),
        (3, 20, 40, 4),
        (4, 30, 20, 2),
        (5, 30, 10, 1),
    )

    root = Document()
    root.id = 1

    for row_id, row_parent, row_score, row_weight in match_rows:
        hit = Document()
        hit.id = row_id
        hit.parent_id = row_parent
        hit.weight = row_weight
        hit.score = NamedScore(value=row_score, ref_id=root.id)
        root.matches.append(hit)

    return root