def random_docs(num_docs, chunks_per_doc=5, embed_dim=10, jitter=1, start_id=0, embedding=True) -> Iterator['Document']:
    next_chunk_doc_id = start_id + num_docs
    for j in range(num_docs):
        doc_id = start_id + j
        d = Document(id=doc_id)
        d.text = b'hello world'
        d.tags['id'] = doc_id
        if embedding:
            d.embedding = np.random.random([embed_dim + np.random.randint(0, jitter)])
        d.update_content_hash()
        for _ in range(chunks_per_doc):
            chunk_doc_id = next_chunk_doc_id
            c = Document(id=chunk_doc_id)
            c.text = 'i\'m chunk %d from doc %d' % (chunk_doc_id, doc_id)
            if embedding:
                c.embedding = np.random.random([embed_dim + np.random.randint(0, jitter)])
            c.tags['parent_id'] = doc_id
            c.tags['id'] = chunk_doc_id
            c.update_content_hash()
            d.chunks.append(c)
            next_chunk_doc_id += 1
        yield d

def doc_with_multimodal_chunks(embeddings):
    doc = MultimodalDocument()
    chunk1 = Document()
    chunk2 = Document()
    chunk3 = Document()
    chunk1.modality = 'visual1'
    chunk2.modality = 'visual2'
    chunk3.modality = 'textual'
    chunk1.embedding = embeddings[0]
    chunk2.embedding = embeddings[1]
    chunk3.embedding = embeddings[2]
    doc.chunks.append(chunk1)
    doc.chunks.append(chunk2)
    doc.chunks.append(chunk3)
    return doc

def test_indexer_with_ref_indexer_compound_move(random_workspace_move, parallel, index_docs, mocker, uses_no_docker):
    top_k = 10
    with Flow.load_config(os.path.join(cur_dir, 'compound-index.yml')) as index_flow:
        index_flow.index(input_fn=index_docs, request_size=10)

    mock = mocker.Mock()

    # move the index to the query workspace before searching
    shutil.copytree(os.environ['JINA_TEST_INDEXER_WITH_REF_INDEXER'],
                    os.environ['JINA_TEST_INDEXER_WITH_REF_INDEXER_QUERY'])
    shutil.rmtree(os.environ['JINA_TEST_INDEXER_WITH_REF_INDEXER'])

    def validate_response(resp):
        mock()
        assert len(resp.search.docs) == 1
        assert len(resp.search.docs[0].matches) == top_k

    query_document = Document()
    query_document.embedding = np.array([1, 1])
    with Flow.load_config(os.path.join(cur_dir, 'compound-query.yml')) as query_flow:
        query_flow.search(input_fn=[query_document], on_done=validate_response, top_k=top_k)

    mock.assert_called_once()

def random_docs(start, end, embed_dim=10):
    for j in range(start, end):
        d = Document()
        d.id = f'{j:0>16}'
        d.tags['id'] = j
        d.text = ''.join(random.choice(string.ascii_lowercase) for _ in range(10)).encode('utf8')
        d.embedding = np.random.random([embed_dim])
        yield d

def random_docs(
        num_docs,
        chunks_per_doc=5,
        embed_dim=10,
        jitter=1,
        start_id=0,
        embedding=True,
        sparse_embedding=False,
        text='hello world',
) -> Iterator['Document']:
    next_chunk_doc_id = start_id + num_docs
    for j in range(num_docs):
        doc_id = start_id + j
        d = Document(id=doc_id)
        d.text = text
        d.tags['id'] = doc_id
        if embedding:
            if sparse_embedding:
                from scipy.sparse import coo_matrix

                d.embedding = coo_matrix(
                    (np.array([1, 1, 1]), (np.array([0, 1, 2]), np.array([1, 2, 1])))
                )
            else:
                d.embedding = np.random.random([embed_dim + np.random.randint(0, jitter)])
        d.update_content_hash()
        for _ in range(chunks_per_doc):
            chunk_doc_id = next_chunk_doc_id
            c = Document(id=chunk_doc_id)
            c.text = 'i\'m chunk %d from doc %d' % (chunk_doc_id, doc_id)
            if embedding:
                c.embedding = np.random.random([embed_dim + np.random.randint(0, jitter)])
            c.tags['parent_id'] = doc_id
            c.tags['id'] = chunk_doc_id
            c.update_content_hash()
            d.chunks.append(c)
            next_chunk_doc_id += 1
        yield d

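# A minimal usage sketch (not part of the original suite): consume the generator
# above and check the parent/chunk layout. The counts follow from the defaults
# (5 chunks per parent document); names used here come from the function itself.
docs = list(random_docs(num_docs=3, start_id=0))
assert len(docs) == 3
assert all(len(d.chunks) == 5 for d in docs)
assert all(c.tags['parent_id'] == d.tags['id'] for d in docs for c in d.chunks)
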
def random_docs(start, end, embed_dim=10, jitter=1, has_content=True):
    for j in range(start, end):
        d = Document()
        d.id = str(f'{j}' * 16)
        if has_content:
            d.tags['id'] = j
            d.text = ''.join(random.choice(string.ascii_lowercase) for _ in range(10)).encode('utf8')
            d.embedding = np.random.random([embed_dim + np.random.randint(0, jitter)])
        yield d

def test_extract_docs():
    d = Document()
    contents, docs_pts = DocumentSet([d]).all_embeddings
    assert contents is None

    vec = np.random.random([2, 2])
    d.embedding = vec
    contents, docs_pts = DocumentSet([d]).all_embeddings
    np.testing.assert_equal(contents[0], vec)

def doc_with_multimodal_chunks_wrong(embeddings):
    doc = Document()
    chunk1 = Document()
    chunk2 = Document()
    chunk3 = Document()
    chunk1.modality = 'visual'
    chunk2.modality = 'visual'
    chunk3.modality = 'textual'
    chunk1.embedding = embeddings[0]
    chunk2.embedding = embeddings[1]
    chunk3.embedding = embeddings[2]
    chunk1.update_id()
    chunk2.update_id()
    chunk3.update_id()
    doc.update_id()
    doc.chunks.append(chunk1)
    doc.chunks.append(chunk2)
    doc.chunks.append(chunk3)
    return doc

def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    doc1 = Document()
    doc1.id = str(1) * 16
    doc1.embedding = np.array([int(doc1.id)])
    doc2 = Document()
    doc2.id = str(2) * 16
    doc2.embedding = np.array([int(doc2.id)])
    doc3 = Document()
    doc3.id = str(3) * 16
    doc3.embedding = np.array([int(doc3.id)])
    doc4 = Document()
    doc4.id = str(4) * 16
    doc4.embedding = np.array([int(doc4.id)])
    # in-memory "database": hashed id -> serialized Document
    self.db = {
        id2hash(doc1.id): doc1.SerializeToString(),
        id2hash(doc2.id): doc2.SerializeToString(),
        id2hash(doc3.id): doc3.SerializeToString(),
        id2hash(doc4.id): doc4.SerializeToString(),
    }

def test_extract_docs():
    d = Document()
    contents, docs_pts, bad_doc_ids = extract_embedding([d])
    assert len(bad_doc_ids) > 0
    assert contents is None

    vec = np.random.random([2, 2])
    d.embedding = vec
    contents, docs_pts, bad_doc_ids = extract_embedding([d])
    assert len(bad_doc_ids) == 0
    np.testing.assert_equal(contents[0], vec)

def random_docs(start, end):
    documents = []
    for j in range(start, end):
        d = Document()
        d.id = j
        d.tags['id'] = j
        d.text = ''.join(random.choice(string.ascii_lowercase) for _ in range(10)).encode('utf8')
        d.embedding = np.random.random([10 + np.random.randint(0, 1)])
        documents.append(d)
    return documents

def create_document_to_search():
    # one parent doc with ten chunks, each carrying a 1-D embedding:
    # doc - chunk 0 - embedding [0]
    #     - chunk 1 - embedding [1]
    #     - ...
    #     - chunk 9 - embedding [9]
    doc = Document()
    for c in range(10):
        chunk = Document()
        chunk.id = str(c) * 16
        chunk.embedding = np.array([c])
        doc.chunks.append(chunk)
    return doc

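# A minimal sketch (assumed usage, not from the original suite): the chunk
# embeddings built by create_document_to_search stack into a (10, 1) matrix,
# which is the shape a chunk-level search test would typically operate on.
doc = create_document_to_search()
embeddings = np.stack([chunk.embedding for chunk in doc.chunks])
assert embeddings.shape == (10, 1)
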
def documents(embedding_cls_type):
    doc = Document()
    for c in range(10):
        chunk = Document()
        chunk.id = str(c) * 16
        dense_embedding = np.random.random([10])
        if embedding_cls_type == 'dense':
            chunk.embedding = dense_embedding
        elif embedding_cls_type == 'scipy_csr':
            chunk.embedding = scipy.sparse.csr_matrix(dense_embedding)
        elif embedding_cls_type == 'scipy_coo':
            chunk.embedding = scipy.sparse.coo_matrix(dense_embedding)
        elif embedding_cls_type == 'torch':
            sparse_embedding = scipy.sparse.coo_matrix(dense_embedding)
            values = sparse_embedding.data
            indices = np.vstack((sparse_embedding.row, sparse_embedding.col))
            chunk.embedding = torch.sparse_coo_tensor(
                indices,
                values,
                sparse_embedding.shape,
            )
        elif embedding_cls_type == 'tf':
            sparse_embedding = scipy.sparse.coo_matrix(dense_embedding)
            values = sparse_embedding.data
            indices = [(x, y) for x, y in zip(sparse_embedding.row, sparse_embedding.col)]
            chunk.embedding = tf.SparseTensor(
                indices=indices,
                values=values,
                dense_shape=[1, 10],
            )
        doc.chunks.append(chunk)
    return doc

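# A minimal sketch (assumed test name and parametrization, not from the
# original suite): drive the documents builder above across all supported
# embedding classes and check that every chunk received an embedding.
@pytest.mark.parametrize(
    'embedding_cls_type', ['dense', 'scipy_csr', 'scipy_coo', 'torch', 'tf']
)
def test_documents_builder(embedding_cls_type):
    doc = documents(embedding_cls_type)
    assert len(doc.chunks) == 10
    assert all(chunk.embedding is not None for chunk in doc.chunks)
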
def test_indexer_with_ref_indexer(random_workspace, parallel, index_docs, mocker):
    top_k = 10
    with Flow.load_config('index.yml') as index_flow:
        index_flow.index(input_fn=index_docs, batch_size=10)

    mock = mocker.Mock()

    def validate_response(resp):
        mock()
        assert len(resp.search.docs) == 1
        assert len(resp.search.docs[0].matches) == top_k

    query_document = Document()
    query_document.embedding = np.array([1, 1])
    with Flow.load_config('query.yml') as query_flow:
        query_flow.search(input_fn=[query_document], on_done=validate_response, top_k=top_k)

    mock.assert_called_once()

def test_empty_shard(mocker, workdir):
    doc = Document()
    doc.text = 'text'
    doc.embedding = np.array([1, 1, 1])

    def validate_response(resp):
        assert len(resp.docs) == 1
        assert len(resp.docs[0].matches) == 0

    mock = mocker.Mock()
    error_mock = mocker.Mock()

    with Flow.load_config(os.path.join(cur_dir, 'flow.yml')) as f:
        f.search([doc], on_done=mock, on_error=error_mock)

    mock.assert_called_once()
    validate_callback(mock, validate_response)
    error_mock.assert_not_called()

def test_indexer_with_ref_indexer(random_workspace, parallel, index_docs, mocker, uses_no_docker):
    top_k = 10
    with Flow.load_config(os.path.join('index.yml')) as index_flow:
        index_flow.index(inputs=index_docs, request_size=10)

    mock = mocker.Mock()

    def validate_response(resp):
        assert len(resp.search.docs) == 1
        assert len(resp.search.docs[0].matches) == top_k

    query_document = Document()
    query_document.embedding = np.array([1, 1])
    with Flow.load_config(os.path.join('query.yml')) as query_flow:
        query_flow.search(inputs=[query_document], on_done=mock, top_k=top_k)

    mock.assert_called_once()
    validate_callback(mock, validate_response)

def index_docs():
    for i in range(0, 100):
        d = Document()
        d.embedding = np.array([i, i])
        d.tags['filename'] = f' hey here {i}'
        yield d

def chunk_4(textual_embedding):
    chunk = Document()
    chunk.modality = 'textual'
    chunk.embedding = textual_embedding
    chunk.granularity = 1
    return chunk

def chunk_1(visual_embedding):
    chunk = Document()
    chunk.modality = 'visual'
    chunk.embedding = visual_embedding
    chunk.granularity = 0
    return chunk