def create_chunk_matches_to_score():
    """Build a root document holding two chunks with two scored matches each.

    Resulting structure (values as assigned by the code below):

        doc: (id: '1', granularity: 0)
        |- chunk: (id: '10', granularity: 1)
        |  |- match: (id: '12', parent_id: '1', score.value: 2)
        |  |- match: (id: '13', parent_id: '1', score.value: 3)
        |- chunk: (id: '20', granularity: 1)
           |- match: (id: '24', parent_id: '2', score.value: 4)
           |- match: (id: '25', parent_id: '2', score.value: 5)

    Every match has ``length`` 4, the granularity of its chunk, and a score
    whose ``ref_id`` is the id of the chunk it is attached to.

    :return: the root ``Document``.
    """
    matches_per_chunk = 2

    root = Document()
    root.id = '1'
    root.granularity = 0

    for parent_id in (1, 2):
        chunk = Document()
        chunk.id = str(parent_id * 10)
        chunk.granularity = root.granularity + 1

        # Scores for a chunk start at twice its parent index.
        first_score = parent_id * 2
        for score_value in range(first_score, first_score + matches_per_chunk):
            scored = Document()
            scored.granularity = chunk.granularity
            scored.parent_id = str(parent_id)
            scored.score = NamedScore(value=score_value, ref_id=chunk.id)
            scored.id = str(10 * parent_id + score_value)
            scored.length = 4
            chunk.matches.append(scored)

        # Attach the chunk only after all of its matches have been added.
        root.chunks.append(chunk)

    return root
def request(field_type):
    """Build a typed ``index`` request with 10 docs and 10 groundtruths.

    Each doc and each groundtruth receives exactly one chunk of
    granularity 1. Depending on ``field_type`` the chunk content is filled so
    that the groundtruth chunk is one element longer than the doc chunk:

    * ``'text'``   -- ``'aaa'`` vs ``'aaaa'``
    * ``'buffer'`` -- 3 bytes vs 4 bytes
    * ``'blob'``   -- array of three ones vs array of four ones

    Any other ``field_type`` leaves the chunks without content.

    :param field_type: which content field of the chunks to populate.
    :return: the result of ``Request(...).as_typed_request('index')``.
    """
    total = 10
    proto = jina_pb2.RequestProto()

    for _ in range(total):
        doc_pb = proto.index.docs.add()
        groundtruth_pb = proto.index.groundtruths.add()
        doc_chunk = Document(doc_pb.chunks.add())
        groundtruth_chunk = Document(groundtruth_pb.chunks.add())

        for chunk in (doc_chunk, groundtruth_chunk):
            chunk.granularity = 1

        if field_type == 'text':
            doc_chunk.text = 'aaa'
            groundtruth_chunk.text = 'aaaa'
        elif field_type == 'buffer':
            doc_chunk.buffer = b'\x01\x02\x03'
            groundtruth_chunk.buffer = b'\x01\x02\x03\x04'
        elif field_type == 'blob':
            doc_chunk.blob = np.array([1, 1, 1])
            groundtruth_chunk.blob = np.array([1, 1, 1, 1])

    typed = Request(proto).as_typed_request('index')
    return typed
def build_docs():
    """Create a fully populated chunk/match tree of documents.

    ``DOCUMENTS_PER_LEVEL`` root documents are created at granularity 0 and
    adjacency 0. Every document whose granularity is below 2 receives
    ``DOCUMENTS_PER_LEVEL`` chunks, and every document whose adjacency is
    below 2 receives ``DOCUMENTS_PER_LEVEL`` matches; both rules are applied
    recursively to the newly added chunks and matches.

    :return: a ``DocumentArray`` wrapping the root documents.
    """
    granularity_limit = 2
    adjacency_limit = 2

    def _expand(node, granularity, adjacency):
        # Chunks are expanded first, then matches, for every node.
        if granularity < granularity_limit:
            for _ in range(DOCUMENTS_PER_LEVEL):
                child = add_chunk(node)
                _expand(child, child.granularity, child.adjacency)
        if adjacency < adjacency_limit:
            for _ in range(DOCUMENTS_PER_LEVEL):
                neighbour = add_match(node)
                _expand(neighbour, neighbour.granularity, neighbour.adjacency)

    roots = []
    for _ in range(DOCUMENTS_PER_LEVEL):
        root = Document()
        root.granularity = 0
        root.adjacency = 0
        roots.append(root)
        _expand(root, 0, 0)

    return DocumentArray(roots)
def add_match(doc):
    """Create a new match for ``doc`` and append it to ``doc.matches``.

    The match keeps the granularity of ``doc`` and sits one adjacency level
    higher.

    :param doc: the document that receives the match.
    :return: the newly created match ``Document``.
    """
    new_match = Document()
    new_match.adjacency = doc.adjacency + 1
    new_match.granularity = doc.granularity
    # Append last so the attributes are already set when the match is attached.
    doc.matches.append(new_match)
    return new_match
def add_chunk(doc):
    """Create a new chunk for ``doc`` and append it to ``doc.chunks``.

    The chunk keeps the adjacency of ``doc`` and sits one granularity level
    deeper.

    :param doc: the document that receives the chunk.
    :return: the newly created chunk ``Document``.
    """
    new_chunk = Document()
    new_chunk.adjacency = doc.adjacency
    new_chunk.granularity = doc.granularity + 1
    # Append last so the attributes are already set when the chunk is attached.
    doc.chunks.append(new_chunk)
    return new_chunk
def chunk_4(textual_embedding):
    """Return a ``'textual'`` chunk at granularity 1.

    :param textual_embedding: value assigned to the chunk's ``embedding``.
    :return: the configured ``Document``.
    """
    textual = Document()
    textual.granularity = 1
    textual.modality = 'textual'
    textual.embedding = textual_embedding
    return textual
def chunk_2(textual_embedding):
    """Return a ``'textual'`` chunk at granularity 0.

    The value is assigned through ``content`` rather than ``embedding``.
    NOTE(review): presumably intentional, to exercise the ``content`` path --
    confirm against the tests that consume this chunk.

    :param textual_embedding: value assigned to the chunk's ``content``.
    :return: the configured ``Document``.
    """
    textual = Document()
    textual.granularity = 0
    textual.modality = 'textual'
    textual.content = textual_embedding
    return textual
def chunk_1(visual_embedding):
    """Return a ``'visual'`` chunk at granularity 0.

    :param visual_embedding: value assigned to the chunk's ``embedding``.
    :return: the configured ``Document``.
    """
    visual = Document()
    visual.granularity = 0
    visual.modality = 'visual'
    visual.embedding = visual_embedding
    return visual