def create_chunk_matches_to_score():
    # doc: (id: 1, granularity: 0)
    # |- chunks: (id: 10)
    # |  |- matches: (id: 12, parent_id: 1, score.value: 2),
    # |  |- matches: (id: 13, parent_id: 1, score.value: 3),
    # |- chunks: (id: 20)
    #    |- matches: (id: 24, parent_id: 2, score.value: 4),
    #    |- matches: (id: 25, parent_id: 2, score.value: 5)
    doc = Document()
    doc.id = '1'
    doc.granularity = 0
    num_matches = 2
    for parent_id in range(1, 3):
        chunk = Document()
        chunk_id = parent_id * 10
        chunk.id = str(chunk_id)
        chunk.granularity = doc.granularity + 1
        for score_value in range(parent_id * 2, parent_id * 2 + num_matches):
            match = Document()
            match.granularity = chunk.granularity
            match.parent_id = str(parent_id)
            match.score = NamedScore(value=score_value, ref_id=chunk.id)
            match.id = str(10 * parent_id + score_value)
            match.length = 4
            chunk.matches.append(match)
        doc.chunks.append(chunk)
    return doc
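# A minimal sanity check (hypothetical, not part of the original suite) that the
# fixture above builds the tree drawn in its comments; it assumes only the
# Document and NamedScore API already used in this file.
def test_chunk_matches_tree_shape():
    doc = create_chunk_matches_to_score()
    assert len(doc.chunks) == 2
    for chunk in doc.chunks:
        assert len(chunk.matches) == 2
        for match in chunk.matches:
            assert match.score.ref_id == chunk.id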
def create_document_to_score():
    # doc: 1
    # |- chunk: 2
    # |  |- matches: (id: 4, parent_id: 40, score.value: 4),
    # |  |- matches: (id: 5, parent_id: 50, score.value: 5),
    # |
    # |- chunk: 3
    #    |- matches: (id: 6, parent_id: 60, score.value: 6),
    #    |- matches: (id: 7, parent_id: 70, score.value: 7)
    doc = Document()
    doc.id = '1'
    for c in range(2):
        chunk = Document()
        chunk_id = str(c + 2)
        chunk.id = chunk_id
        for m in range(2):
            match = Document()
            match_id = 2 * int(chunk_id) + m
            match.id = str(match_id)
            parent_id = 10 * match_id
            match.parent_id = str(parent_id)
            # length and score are used by MaxRanker and MinRanker
            match.length = match_id
            match.score = NamedScore(value=match_id, ref_id=chunk.id)
            match.tags['price'] = match.score.value
            match.tags['discount'] = DISCOUNT_VAL
            chunk.matches.append(match)
        doc.chunks.append(chunk)
    return doc
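# Hedged usage sketch (an assumption, not original code): the match ids, parent
# ids and price tags should follow the tree in the comments of
# create_document_to_score() above.
def test_document_to_score_shape():
    doc = create_document_to_score()
    match_ids = sorted(int(m.id) for c in doc.chunks for m in c.matches)
    assert match_ids == [4, 5, 6, 7]
    for chunk in doc.chunks:
        for match in chunk.matches:
            assert match.parent_id == str(10 * int(match.id))
            assert match.tags['price'] == match.score.value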
def random_queries(num_docs, chunks_per_doc=5):
    for j in range(num_docs):
        d = Document()
        d.id = j
        for k in range(chunks_per_doc):
            dd = Document()
            dd.id = num_docs + j * chunks_per_doc + k
            d.chunks.add(dd)
        yield d
def get_docs_to_delete(doc_id_to_chunk_ids):
    for i, (doc_id, chunks) in enumerate(doc_id_to_chunk_ids.items()):
        document = Document()
        document.id = f'{i}' * 16
        for chunk in chunks:
            document.chunks.append(chunk)
        yield document
def create_document(doc_id, text, weight, length):
    d = Document()
    d.id = str(doc_id)
    d.buffer = text.encode('utf8')
    d.weight = weight
    d.length = length
    return d
def random_docs(start, end, embed_dim=10):
    for j in range(start, end):
        d = Document()
        d.id = f'{j:0>16}'
        d.tags['id'] = j
        # text must be a str: a random 10-char lowercase string
        d.text = ''.join(random.choice(string.ascii_lowercase) for _ in range(10))
        d.embedding = np.random.random([embed_dim])
        yield d
def index_docs():
    docs = []
    for idx in range(0, 100):
        doc = Document()
        doc.id = f'{idx:0>16}'
        doc.embedding = np.array([idx, idx])
        docs.append(doc)
    return docs
def test_broken_document(segment_driver, text_segmenter_executor):
    segment_driver.attach(executor=text_segmenter_executor, runtime=None)
    invalid_doc = Document()
    invalid_doc.id = 1
    invalid_doc.text = 'invalid'
    with pytest.raises(AttributeError):
        segment_driver._apply_all([DocumentArray([invalid_doc])])
def random_docs(start, end, embed_dim=10, jitter=1, has_content=True):
    for j in range(start, end):
        d = Document()
        d.id = f'{j}' * 16
        if has_content:
            d.tags['id'] = j
            # text must be a str: a random 10-char lowercase string
            d.text = ''.join(random.choice(string.ascii_lowercase) for _ in range(10))
            d.embedding = np.random.random([embed_dim + np.random.randint(0, jitter)])
        yield d
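# Illustrative check (an assumption about intent, not original code): with the
# default jitter=1, np.random.randint(0, 1) always returns 0, so every
# generated embedding has exactly embed_dim entries.
def test_random_docs_embedding_dim():
    docs = list(random_docs(0, 5, embed_dim=10))
    assert len(docs) == 5
    assert all(d.embedding.shape == (10,) for d in docs)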
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    # four mock entries: ids '1111...' to '4444...', each embedding the int
    # value of its own id
    self.db = {}
    for i in range(1, 5):
        doc = Document()
        doc.id = str(i) * 16
        doc.embedding = np.array([int(doc.id)])
        self.db[id2hash(doc.id)] = doc.SerializeToString()
def test_broken_document():
    driver = SimpleSegmentDriver()
    executor = MockSegmenter()
    driver.attach(executor=executor, runtime=None)
    invalid_doc = Document()
    invalid_doc.id = 1
    invalid_doc.text = 'invalid'
    with pytest.raises(AttributeError):
        driver._apply_all([DocumentSet([invalid_doc])])
def random_docs(start, end):
    documents = []
    for j in range(start, end):
        d = Document()
        d.id = j
        d.tags['id'] = j
        # text must be a str: a random 10-char lowercase string
        d.text = ''.join(random.choice(string.ascii_lowercase) for _ in range(10))
        d.embedding = np.random.random([10 + np.random.randint(0, 1)])
        documents.append(d)
    return documents
def test_broken_document():
    driver = SimpleSegmentDriver()
    executor = MockSegmenter()
    driver.attach(executor=executor, pea=None)
    invalid_doc = Document()
    invalid_doc.id = 1
    invalid_doc.text = 'invalid'
    invalid_doc.length = 2
    assert invalid_doc.length == 2
    with pytest.raises(AttributeError):
        driver._apply_all([invalid_doc])
def create_document_to_search():
    # 1-D embedding
    # doc: 0
    # - chunk: 1
    # - chunk: 2
    # - chunk: 3
    # - chunk: 4
    # - chunk: 5 - will be missing from KV indexer
    doc = Document()
    doc.id = '0' * 16
    for c in range(5):
        chunk = doc.add_chunk()
        chunk.id = str(c + 1) * 16
    return doc
def create_document_to_score_same_depth_level():
    # doc: 1
    # | matches: (id: 2, parent_id: 20, score.value: 30, weight: 3),
    # | matches: (id: 3, parent_id: 20, score.value: 40, weight: 4),
    # | matches: (id: 4, parent_id: 30, score.value: 20, weight: 2),
    # | matches: (id: 5, parent_id: 30, score.value: 10, weight: 1),
    doc = Document()
    doc.id = 1
    for match_id, parent_id, match_score, weight in [
        (2, 20, 30, 3),
        (3, 20, 40, 4),
        (4, 30, 20, 2),
        (5, 30, 10, 1),
    ]:
        match = Document()
        match.id = match_id
        match.parent_id = parent_id
        match.weight = weight
        match.score = NamedScore(value=match_score, ref_id=doc.id)
        doc.matches.append(match)
    return doc
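# Hypothetical sanity check (not part of the original suite) mirroring the
# table in the comments above: four flat matches, all at the same depth, with
# the listed weights.
def test_same_depth_matches():
    doc = create_document_to_score_same_depth_level()
    assert len(doc.matches) == 4
    assert [m.weight for m in doc.matches] == [3, 4, 2, 1]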
def create_document_to_search():
    # 1-D embedding
    # doc
    # - chunk: 0 - embedding(0.0)
    # - chunk: 1 - embedding(1.0)
    # - chunk: 2 - embedding(2.0)
    # ...
    # - chunk: 9 - embedding(9.0)
    doc = Document()
    for c in range(10):
        chunk = Document()
        chunk.id = str(c) * 16
        chunk.embedding = np.array([c])
        doc.chunks.append(chunk)
    return doc
def document():
    # 1-D embedding
    # doc: 0
    # - chunk: 1
    # - chunk: 2
    # - chunk: 3
    # - chunk: 4
    # - chunk: 5 - will be missing from KV indexer
    doc = Document()
    doc.id = '0' * 16
    for c in range(5):
        with Document() as chunk:
            chunk.id = str(c + 1) * 16
        doc.chunks.add(chunk)
    return doc
def create_document_to_search_with_matches_on_chunks():
    # 1-D embedding
    # doc: 0
    # - chunk: 1
    #   - match: 2
    #   - match: 3
    #   - match: 4
    #   - match: 5 - will be missing from KV indexer
    #   - match: 6 - will be missing from KV indexer
    doc = Document()
    doc.id = '0' * 16
    chunk = doc.add_chunk()
    chunk.id = '1' * 16
    for m in range(5):
        chunk.add_match(doc_id=str(m + 2) * 16, score_value=1.0)
    return doc
def create_document_to_score():
    # doc: 1
    # |- matches: (id: 2, parent_id: 1, score.value: 3),
    # |- matches: (id: 3, parent_id: 1, score.value: 6),
    # |- matches: (id: 4, parent_id: 1, score.value: 1),
    # |- matches: (id: 5, parent_id: 1, score.value: 8),
    doc = Document()
    doc.id = '1' * 16
    doc.length = 5
    for match_id, match_score in [(2, 3), (3, 6), (4, 1), (5, 8)]:
        with Document() as match:
            match.id = str(match_id) * 16
            match.length = match_score
            match.score.value = match_score
        doc.matches.append(match)
    return doc
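# Hedged sketch (an assumption, not original code): the score values of the
# four matches built above should follow the tree comments.
def test_match_score_values():
    doc = create_document_to_score()
    assert [m.score.value for m in doc.matches] == [3, 6, 1, 8]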
def create_document_to_search_with_matches_on_chunks():
    # 1-D embedding
    # doc: 0
    # - chunk: 1
    #   - match: 2
    #   - match: 3
    #   - match: 4
    #   - match: 5 - will be missing from KV indexer
    #   - match: 6 - will be missing from KV indexer
    doc = Document()
    doc.id = '0' * 16
    chunk = doc.chunks.append()
    chunk.id = '1' * 16
    for m in range(5):
        d = Document(id=str(m + 2) * 16)
        d.score.value = 1.0
        chunk.matches.append(d)
    return doc
def create_document_to_score():
    # doc: 1
    # |- matches: (id: 2, parent_id: 1, score.value: 3, weight: 16),
    # |- matches: (id: 3, parent_id: 1, score.value: 6, weight: 24),
    # |- matches: (id: 4, parent_id: 1, score.value: 1, weight: 8),
    # |- matches: (id: 5, parent_id: 1, score.value: 8, weight: 16),
    doc = Document()
    doc.id = '1' * 20
    for match_id, match_score, match_weight in [
        (2, 3, 16),
        (3, 6, 24),
        (4, 1, 8),
        (5, 8, 16),
    ]:
        with Document() as match:
            match.id = match_id
            match.score = NamedScore(value=match_score, ref_id=doc.id)
            match.weight = match_weight
        doc.matches.append(match)
    return doc
def evaluate_docs():
    """Evaluate Documents:
        doc: id = 00
             tag__groundtruth = False
             text = aaa
        doc: id = 01
             tag__groundtruth = False
             text = aaa
        doc: id = 02
             tag__groundtruth = False
             text = aaa
        ...
    """
    docs = []
    for idx in range(0, 100):
        doc = Document()
        doc.id = f'{idx:0>16}'
        doc.tags['groundtruth'] = False
        doc.text = 'aaa'
        docs.append(doc)
    return docs
def index_groundtruth():
    """Index Groundtruth:
        doc: id = 00
             tag__groundtruth = True
             text = aa
        doc: id = 01
             tag__groundtruth = True
             text = aa
        doc: id = 02
             tag__groundtruth = True
             text = aa
        ...
        we will not have groundtruth for ids 5, 10 and 50
    """
    docs = []
    for idx in range(0, 100):
        doc = Document()
        doc.id = f'{idx:0>16}'
        doc.tags['groundtruth'] = True
        doc.text = 'aa'
        if idx not in (5, 10, 50):
            docs.append(doc)
    return docs
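# Hypothetical pairing check (not original code): index_groundtruth() skips
# ids 5, 10 and 50, so it yields exactly three fewer documents than
# evaluate_docs().
def test_groundtruth_skips_three_docs():
    assert len(index_groundtruth()) == len(evaluate_docs()) - 3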
def documents(embedding_cls_type):
    doc = Document()
    for c in range(10):
        chunk = Document()
        chunk.id = str(c) * 16
        dense_embedding = np.random.random([10])
        if embedding_cls_type == 'dense':
            chunk.embedding = dense_embedding
        elif embedding_cls_type == 'scipy_csr':
            chunk.embedding = scipy.sparse.csr_matrix(dense_embedding)
        elif embedding_cls_type == 'scipy_coo':
            chunk.embedding = scipy.sparse.coo_matrix(dense_embedding)
        elif embedding_cls_type == 'torch':
            sparse_embedding = scipy.sparse.coo_matrix(dense_embedding)
            values = sparse_embedding.data
            indices = np.vstack((sparse_embedding.row, sparse_embedding.col))
            chunk.embedding = torch.sparse_coo_tensor(
                indices,
                values,
                sparse_embedding.shape,
            )
        elif embedding_cls_type == 'tf':
            sparse_embedding = scipy.sparse.coo_matrix(dense_embedding)
            values = sparse_embedding.data
            indices = [
                (x, y) for x, y in zip(sparse_embedding.row, sparse_embedding.col)
            ]
            chunk.embedding = tf.SparseTensor(
                indices=indices,
                values=values,
                dense_shape=[1, 10],
            )
        doc.chunks.append(chunk)
    return doc
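# Illustrative parametrization (an assumption about how documents() is
# consumed, not original code): every embedding_cls_type wraps a random
# 10-dimensional vector in a different container type, and each variant should
# yield the same tree shape.
@pytest.mark.parametrize(
    'embedding_cls_type', ['dense', 'scipy_csr', 'scipy_coo', 'torch', 'tf']
)
def test_documents_have_ten_chunks(embedding_cls_type):
    doc = documents(embedding_cls_type)
    assert len(doc.chunks) == 10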
def create_document(doc_id, text, weight):
    d = Document()
    d.id = str(doc_id)
    d.buffer = text.encode('utf8')
    d.weight = weight
    return d
def get_docs_to_index(contents):
    for i, content in enumerate(contents):
        document = Document()
        document.id = f'{i}' * 16
        document.text = content
        yield document