def test_generic():
    """Exercise ``NdArray`` with a sparse matrix first and a dense array afterwards."""
    from jina.types.ndarray.generic import NdArray
    from scipy.sparse import coo_matrix

    rows = np.array([0, 3, 1, 0])
    cols = np.array([0, 3, 1, 2])
    values = np.array([4, 5, 7, 9])
    sparse_matrix = coo_matrix((values, (rows, cols)), shape=(4, 4))
    expected_dense = sparse_matrix.toarray()

    wrapped = NdArray(sparse_matrix, is_sparse=True)
    assert wrapped.is_sparse
    actual_dense = wrapped.value.toarray()
    # the sparse flag is re-checked after the value has been read
    assert wrapped.is_sparse
    np.testing.assert_equal(actual_dense, expected_dense)

    dense_tensor = np.random.random([10, 3, 4])
    # without change of `is_sparse`, this should raise error
    with pytest.raises(AttributeError):
        wrapped.value = dense_tensor

    # once the flag is switched off the dense assignment is expected to succeed
    wrapped.is_sparse = False
    wrapped.value = dense_tensor
    np.testing.assert_equal(wrapped.value, dense_tensor)
def validate_response(resp):
    """Check that every indexed doc holds a blob and an embedding filled with its position.

    :param resp: response whose ``index.docs`` are inspected
    """
    indexed = resp.index.docs
    assert len(indexed) == NUM_DOCS
    for position, document in enumerate(indexed):
        row = [position] * 5
        # blob is expected to be two identical rows, embedding a single row
        np.testing.assert_equal(NdArray(document.blob).value, np.array([row, row]))
        np.testing.assert_equal(NdArray(document.embedding).value, np.array(row))
def multimodal_all_types_documents():
    """Build ``NUM_DOCS`` documents, each holding one chunk per modality.

    Layout of every generated document (``idx`` is the document index)::

        doc - idx
            |
            | - chunk - embedding [idx, idx] - modality1
            | - chunk - blob [idx, idx, idx] - modality2
            | - chunk - text 'modality3' - modality3
            | - chunk - buffer b'modality4' - modality4

    Note carried over from the original inline description (it describes the
    downstream encoder, which is not visible in this function): modality3 is
    expected to be encoded into ``[3, 3]`` and modality4 into ``[4, 4]``, so
    the resulting doc-level embedding would be
    ``[idx, idx, idx, idx, idx, 3, 3, 4, 4]``.

    :return: list of the generated ``DocumentProto`` objects
    """
    docs = []
    for idx in range(NUM_DOCS):
        doc = jina_pb2.DocumentProto()
        doc.text = f'{idx}'
        for modality in ('modality1', 'modality2', 'modality3', 'modality4'):
            chunk = doc.chunks.add()
            chunk.modality = modality
            # each modality fills a different content field of its chunk
            if modality == 'modality1':
                NdArray(chunk.embedding).value = np.array([idx, idx])
            elif modality == 'modality2':
                NdArray(chunk.blob).value = np.array([idx, idx, idx])
            elif modality == 'modality3':
                chunk.text = 'modality3'
            elif modality == 'modality4':
                chunk.buffer = 'modality4'.encode()
        docs.append(doc)
    return docs
def test_segment_driver():
    """Run ``SimpleSegmentDriver`` with ``MockSegmenter`` and verify the three produced chunks."""
    valid_doc = jina_pb2.DocumentProto()
    # the id is generated before the remaining fields are filled in
    valid_doc.id = uid.new_doc_id(valid_doc)
    valid_doc.text = 'valid'
    valid_doc.length = 2
    valid_doc.mime_type = 'image/png'

    segment_driver = SimpleSegmentDriver()
    segment_driver.attach(executor=MockSegmenter(), pea=None)
    segment_driver._apply_all([valid_doc])

    assert valid_doc.length == 2

    # chunk i is expected to carry tag id i + 3, blob [i, i, i], weight i and length 3
    expected_mime_types = ('text/plain', 'image/png', 'image/png')
    for position, mime_type in enumerate(expected_mime_types):
        chunk = valid_doc.chunks[position]
        assert chunk.tags['id'] == position + 3
        assert chunk.parent_id == valid_doc.id
        np.testing.assert_equal(NdArray(chunk.blob).value,
                                np.array([float(position)] * 3))
        assert chunk.weight == position
        assert chunk.length == 3
        assert chunk.mime_type == mime_type
def index_documents():
    """Create the three documents used for indexing.

    For ``i`` in 0, 1, 2 the document gets::

        tag__id          = str(i)
        tag__dummy_score = -i
        embedding        = [i]

    :return: list with the three documents, in increasing order of ``i``
    """
    indexed = []
    for position in range(3):
        document = jina_pb2.DocumentProto()
        document.tags['id'] = str(position)
        document.tags['dummy_score'] = -position
        NdArray(document.embedding).value = np.array([position])
        indexed.append(document)
    return indexed
def random_docs(num_docs, chunks_per_doc=5, embed_dim=10, jitter=1) -> Iterator['DocumentProto']:
    """Lazily generate fake documents with random embeddings and chunks.

    This is a generator, so the deprecation warning is only emitted once
    iteration starts.

    :param num_docs: number of documents to yield
    :param chunks_per_doc: number of chunks added to every document
    :param embed_dim: base length of each random embedding
    :param jitter: exclusive upper bound of the random amount added to ``embed_dim``
        (passed as ``np.random.randint(0, jitter)``)
    :return: an iterator over the generated ``DocumentProto`` objects
    """
    warnings.warn(
        'since 0.7.11 the introduce of Document primitive type, this '
        'fake-doc generator has been deprecated. Use "random_docs_new_api" instead',
        DeprecationWarning)
    c_id = 3 * num_docs  # avoid collision with docs
    for j in range(num_docs):
        d = jina_pb2.DocumentProto()
        d.tags['id'] = j
        # `text` is a protobuf string field, so assign `str` rather than `bytes`
        d.text = 'hello world'
        NdArray(d.embedding).value = np.random.random(
            [embed_dim + np.random.randint(0, jitter)])
        d.id = uid.new_doc_id(d)
        for _ in range(chunks_per_doc):
            c = d.chunks.add()
            c.text = f"i'm chunk {c_id:d} from doc {j:d}"
            NdArray(c.embedding).value = np.random.random(
                [embed_dim + np.random.randint(0, jitter)])
            c.tags['id'] = c_id
            c.tags['parent_id'] = j
            c_id += 1
            c.parent_id = d.id
            # the chunk id is generated last, after all other chunk fields are set
            c.id = uid.new_doc_id(c)
        yield d
def validate_chunks_and_matches_fn(resp):
    """Verify that the single search result kept its chunks and matches intact.

    Chunks and matches are checked with the same expectations: three children
    with tag ids 10, 20 and 30 that carry text plus embedding, blob, and buffer
    respectively.

    :param resp: response whose ``search.docs`` are inspected
    """
    found = resp.search.docs
    assert len(found) == 1
    doc = found[0]
    assert int(doc.tags['id']) == 1

    for children in (doc.chunks, doc.matches):
        assert len(children) == 3

        with_embedding = children[0]
        assert int(with_embedding.tags['id']) == 10
        assert with_embedding.text == text
        np.testing.assert_almost_equal(random_np_array,
                                       NdArray(with_embedding.embedding).value)

        with_blob = children[1]
        assert int(with_blob.tags['id']) == 20
        np.testing.assert_almost_equal(random_np_array,
                                       NdArray(with_blob.blob).value)

        with_buffer = children[2]
        assert int(with_buffer.tags['id']) == 30
        assert with_buffer.buffer == buffer
def extract_docs(docs: Iterable['jina_pb2.DocumentProto'], embedding: bool) -> Tuple:
    """Walk over protobuf documents and collect one piece of content per document.

    :param docs: an iterable of protobuf documents
    :param embedding: if ``True`` the ``embedding`` of each doc is extracted;
        if ``False`` the first non-empty of ``text``, ``buffer``, ``blob`` is extracted
    :return: A tuple of 3 pieces:
        - the extracted contents stacked with ``np.stack``, or ``None`` when nothing was extracted
        - the docs a content was extracted from, in the same order
        - ``(id, parent_id)`` of every doc whose content was ``None``, useful for debugging
    """
    if embedding:
        def _pick(doc):
            return NdArray(doc.embedding).value
    else:
        def _pick(doc):
            return doc.text or doc.buffer or NdArray(doc.blob).value

    extracted, kept_docs, empty_doc_ids = [], [], []
    for doc in docs:
        payload = _pick(doc)
        if payload is None:
            empty_doc_ids.append((doc.id, doc.parent_id))
            continue
        extracted.append(payload)
        kept_docs.append(doc)

    stacked = np.stack(extracted) if extracted else None
    return stacked, kept_docs, empty_doc_ids
def validate(req):
    """Compare the embeddings of the received docs with the enclosing ``docs``, pairwise.

    :param req: request whose ``docs`` are compared
    """
    mock()
    assert len(docs) == len(req.docs)
    for received, expected in zip(req.docs, docs):
        received_embedding = NdArray(received.embedding).value
        expected_embedding = NdArray(expected.embedding).value
        np.testing.assert_almost_equal(received_embedding, expected_embedding)
def validate(req):
    """Check that both doc embeddings doubled in length and the first equals ``e1`` repeated twice.

    :param req: request whose two ``docs`` are inspected
    """
    assert len(req.docs) == 2

    first_embedding = NdArray(req.docs[0].embedding).value
    assert first_embedding.shape == (e1.shape[0] * 2,)
    second_embedding = NdArray(req.docs[1].embedding).value
    assert second_embedding.shape == (e3.shape[0] * 2,)

    # chunk-level checks are currently disabled:
    # assert NdArray(req.docs[0].chunks[0].embedding).value.shape == (e2.shape[0] * 2,)
    # assert NdArray(req.docs[1].chunks[0].embedding).value.shape == (e4.shape[0] * 2,)

    doubled_e1 = np.concatenate([e1, e1], axis=0)
    np.testing.assert_almost_equal(first_embedding, doubled_e1, decimal=4)
def ground_truth_pairs():
    """Create ten ``DocGroundtruthPair`` objects with fixed embeddings.

    Every doc gets the embedding ``[1, 1]`` and every groundtruth ``[2, 2]``.

    :return: list of the ten pairs
    """

    def _make_pair():
        doc = jina_pb2.DocumentProto()
        groundtruth = jina_pb2.DocumentProto()
        NdArray(doc.embedding).value = np.array([1, 1])
        NdArray(groundtruth.embedding).value = np.array([2, 2])
        return DocGroundtruthPair(doc=doc, groundtruth=groundtruth)

    return [_make_pair() for _ in range(10)]
def test_array2pb():
    """Store ``e4`` in an ``NdArray`` and expect to read (almost) the same values back."""
    # i don't understand why is this set?
    # os env should be available to that process-context only
    quant = os.environ.get('JINA_ARRAY_QUANT')
    if quant is not None:
        print(f'quant is on: {quant}')
        del os.environ['JINA_ARRAY_QUANT']

    array_proto = NdArray()
    array_proto.value = e4
    np.testing.assert_almost_equal(array_proto.value, e4)
def test_vectorsearch_driver_mock_indexer_with_fill():
    """Run ``SimpleVectorSearchDriver`` with ``fill_embedding=True`` and check match embeddings.

    The first and the last match of every chunk must carry an embedding of
    shape ``(7,)``. The ``None`` checks come before the shape checks so that a
    missing embedding fails as an assertion instead of an ``AttributeError``
    on ``.shape``.
    """
    doc = create_document_to_search()
    driver = SimpleVectorSearchDriver(top_k=2, fill_embedding=True)
    executor = MockIndexer()
    driver.attach(executor=executor, pea=None)
    driver._apply_all(doc.chunks)

    for chunk in doc.chunks:
        first_embedding = NdArray(chunk.matches[0].embedding).value
        last_embedding = NdArray(chunk.matches[-1].embedding).value
        assert first_embedding is not None
        assert last_embedding is not None
        assert first_embedding.shape == (7,)
        assert last_embedding.shape == (7,)
def _extract_doc_content(doc: 'jina_pb2.DocumentProto'):
    """Return the document's embedding when it has one, otherwise its content.

    The content fallback order is ``text``, then ``buffer``, then ``blob``.

    :param doc: the protobuf document to read from
    :return: the embedding, or the first non-empty content field
        (the ``blob`` value is returned as-is when text and buffer are empty)
    """
    embedding = NdArray(doc.embedding).value
    if embedding is not None:
        return embedding
    return doc.text or doc.buffer or NdArray(doc.blob).value
def test_multimodal_driver(simple_multimodal_driver, mock_multimodal_encoder, doc_with_multimodal_chunks):
    """The doc embedding length must equal the summed lengths of its three chunk embeddings."""
    simple_multimodal_driver.attach(executor=mock_multimodal_encoder, pea=None)
    simple_multimodal_driver._apply_all([doc_with_multimodal_chunks])

    doc = doc_with_multimodal_chunks
    assert len(doc.chunks) == 3

    visual1, visual2, textual = doc.chunks[0], doc.chunks[1], doc.chunks[2]
    expected_dim = sum(
        NdArray(chunk.embedding).value.shape[0]
        for chunk in (visual1, visual2, textual))
    assert NdArray(doc.embedding).value.shape[0] == expected_dim
def input_fn():
    """Build two documents with one chunk each, embedded with ``e1``..``e4``.

    :return: ``[doc1, doc2]`` where the docs carry ``e1``/``e3`` and their chunks ``e2``/``e4``
    """
    first_doc = DocumentProto()
    NdArray(first_doc.embedding).value = e1
    first_chunk = first_doc.chunks.add()
    NdArray(first_chunk.embedding).value = e2
    first_chunk.id = UniqueId(1)

    second_doc = DocumentProto()
    NdArray(second_doc.embedding).value = e3
    second_chunk = second_doc.chunks.add()
    # unlike the first chunk, the id is assigned before the embedding here
    second_chunk.id = UniqueId(2)
    NdArray(second_chunk.embedding).value = e4

    return [first_doc, second_doc]
def test_index_driver():
    """``SimpleFillDriver`` must give every doc an embedding of shape ``(5,)``."""
    docs = create_documents_to_encode(10)
    fill_driver = SimpleFillDriver()
    fill_driver.attach(executor=MockIndexer(), pea=None)

    # before the driver runs, no document has an embedding
    assert len(docs) == 10
    for document in docs:
        assert NdArray(document.embedding).value is None

    fill_driver._apply_all(docs)

    # the number of documents is unchanged and each one is now filled
    assert len(docs) == 10
    for document in docs:
        assert NdArray(document.embedding).value.shape == (5,)
def input_fn():
    """Build two documents with one chunk each, embedded with ``e1``..``e4``.

    Chunk ids come from ``uid.new_doc_id``; the statement order is kept
    as-is because the id is generated from the chunk at that moment.

    :return: ``[doc1, doc2]`` where the docs carry ``e1``/``e3`` and their chunks ``e2``/``e4``
    """
    first_doc = DocumentProto()
    NdArray(first_doc.embedding).value = e1
    first_chunk = first_doc.chunks.add()
    NdArray(first_chunk.embedding).value = e2
    first_chunk.id = uid.new_doc_id(first_chunk)

    second_doc = DocumentProto()
    NdArray(second_doc.embedding).value = e3
    second_chunk = second_doc.chunks.add()
    # unlike the first chunk, the id is generated before the embedding is set
    second_chunk.id = uid.new_doc_id(second_chunk)
    NdArray(second_chunk.embedding).value = e4

    return [first_doc, second_doc]
def eval_request():
    """Build an index request with ten doc/groundtruth pairs, each holding one chunk.

    Doc chunks are embedded with ``[1, 1]`` and groundtruth chunks with
    ``[2, 2]``; both chunks have granularity 1.

    :return: the populated ``IndexRequestProto``
    """
    request = jina_pb2.RequestProto.IndexRequestProto()
    for _ in range(10):
        doc = request.docs.add()
        groundtruth = request.groundtruths.add()
        doc_chunk = doc.chunks.add()
        groundtruth_chunk = groundtruth.chunks.add()
        doc_chunk.granularity = 1
        groundtruth_chunk.granularity = 1
        NdArray(doc_chunk.embedding).value = np.array([1, 1])
        NdArray(groundtruth_chunk.embedding).value = np.array([2, 2])
    return request
def input_doc_with_matches():
    """Build a document (tag id 1) with three matches tagged 10, 20 and 30.

    The matches carry, in order: text plus embedding, a blob, and a buffer.

    :return: the populated ``DocumentProto``
    """
    doc = jina_pb2.DocumentProto()
    doc.tags['id'] = 1

    embedded_match = doc.matches.add()
    embedded_match.tags['id'] = 10
    embedded_match.text = text
    NdArray(embedded_match.embedding).value = random_np_array

    blob_match = doc.matches.add()
    blob_match.tags['id'] = 20
    NdArray(blob_match.blob).value = random_np_array

    buffer_match = doc.matches.add()
    buffer_match.tags['id'] = 30
    buffer_match.buffer = buffer

    return doc
def input_doc_with_chunks():
    """Build a document (tag id 1) with three chunks tagged 10, 20 and 30.

    The chunks carry, in order: text plus embedding, a blob, and a buffer.

    :return: the populated ``DocumentProto``
    """
    doc = jina_pb2.DocumentProto()
    doc.tags['id'] = 1

    embedded_chunk = doc.chunks.add()
    embedded_chunk.tags['id'] = 10
    embedded_chunk.text = text
    NdArray(embedded_chunk.embedding).value = random_np_array

    blob_chunk = doc.chunks.add()
    blob_chunk.tags['id'] = 20
    NdArray(blob_chunk.blob).value = random_np_array

    buffer_chunk = doc.chunks.add()
    buffer_chunk.tags['id'] = 30
    buffer_chunk.buffer = buffer

    return doc
def get_output(req):
    """Print the mean absolute difference between received and regenerated embeddings.

    The random generator is re-seeded with ``rseed`` and one fresh vector is
    drawn per doc and per chunk, in traversal order, to compare against.

    :param req: request whose docs and chunks are traversed
    """

    def _abs_error(item):
        # read the received value first, then draw the next random vector
        received = NdArray(item.embedding).value
        regenerated = np.random.random([embed_dim])
        return np.sum(np.abs(received - regenerated)) / embed_dim

    np.random.seed(rseed)
    err = 0
    for document in req.docs:
        err += _abs_error(document)
        for chunk in document.chunks:
            err += _abs_error(chunk)
    print(f'reconstruction error: {err / num_docs:.6f}')
def test_request_generate_numpy_arrays():
    """A 10x10 array with ``request_size=5`` yields requests of five docs with ``(10,)`` blobs."""
    input_array = np.random.random([10, 10])
    requests = request_generator(data=input_array, request_size=5)

    # the first two requests are pulled and checked identically
    for _ in range(2):
        request = next(requests)
        assert len(request.index.docs) == 5
        for doc in request.index.docs:
            assert NdArray(doc.blob).value.shape == (10,)
def doc_with_multimodal_chunks(embeddings):
    """Build a document with three chunks of modality ``visual1``, ``visual2`` and ``textual``.

    :param embeddings: indexable holding the embeddings for the three chunks, in that order
    :return: the populated ``DocumentProto``
    """
    doc = jina_pb2.DocumentProto()
    chunks = [doc.chunks.add() for _ in range(3)]

    # the phases are kept separate: modalities, then ids, then embeddings
    for chunk, modality in zip(chunks, ('visual1', 'visual2', 'textual')):
        chunk.modality = modality
    for chunk in chunks:
        chunk.id = uid.new_doc_id(chunk)
    for position, chunk in enumerate(chunks):
        NdArray(chunk.embedding).value = embeddings[position]

    return doc
def test_vectorsearch_driver_mock_indexer_with_matches_on_chunks():
    """Traverse the ``cm`` path and expect each match embedding to equal ``[int(match.id)]``."""
    kv_driver = SimpleKVSearchDriver(traversal_paths=('cm',))
    kv_driver.attach(executor=MockIndexer(), pea=None)

    doc = create_document_to_search_with_matches_on_chunks()
    kv_driver._traverse_apply([doc])

    assert len(doc.chunks) == 1
    only_chunk = doc.chunks[0]
    assert len(only_chunk.matches) == 3
    for match in only_chunk.matches:
        assert NdArray(match.embedding).value is not None
        match_embedding = NdArray(match.embedding).value
        np.testing.assert_equal(match_embedding, np.array([int(match.id)]))
def _apply_all(self, docs: Sequence['jina_pb2.DocumentProto'],
               context_doc: 'jina_pb2.DocumentProto', field: str,
               concatenate: bool = False, *args, **kwargs):
    """Collect the context doc's embedding, or write back the concatenation of collected ones.

    :param docs: not used by this implementation
    :param context_doc: the document whose embedding is read or overwritten
    :param field: not used by this implementation
    :param concatenate: if ``True`` the embeddings stored under the doc's id in
        ``self.doc_pointers`` are concatenated along axis 0 and assigned to the doc;
        otherwise the doc's current embedding is stored under its id
    """
    if concatenate:
        collected = self.doc_pointers[context_doc.id]
        NdArray(context_doc.embedding).value = np.concatenate(collected, axis=0)
        return

    embedding = NdArray(context_doc.embedding).value
    if context_doc.id in self.doc_pointers:
        self.doc_pointers[context_doc.id].append(embedding)
    else:
        self.doc_pointers[context_doc.id] = [embedding]
def validate_response(resp):
    """Flag the callback as called and check each doc's nine-value embedding.

    The expected embedding of doc ``idx`` is five times ``idx`` followed by ``3, 3, 4, 4``.

    :param resp: response whose ``index.docs`` are inspected
    """
    is_callback_called._callback_called = True
    indexed = resp.index.docs
    assert len(indexed) == NUM_DOCS
    for position, document in enumerate(indexed):
        expected = np.array([position] * 5 + [3, 3, 4, 4])
        np.testing.assert_almost_equal(NdArray(document.embedding).value, expected)
def test_vectorsearch_driver_mock_indexer_with_matches_on_chunks(document_with_matches_on_chunks):
    """Traverse the ``cm`` path of a ``DocumentSet`` and expect each match embedding to equal ``[match.id]``."""
    kv_driver = SimpleKVSearchDriver(traversal_paths=('cm',))
    kv_driver.attach(executor=MockIndexer(), runtime=None)
    kv_driver._traverse_apply(DocumentSet([document_with_matches_on_chunks]))

    chunk_list = list(document_with_matches_on_chunks.chunks)
    assert len(chunk_list) == 1
    only_chunk = chunk_list[0]

    match_list = list(only_chunk.matches)
    assert len(match_list) == 3
    for match in match_list:
        assert NdArray(match.embedding).value is not None
        match_embedding = NdArray(match.embedding).value
        np.testing.assert_equal(match_embedding, np.array([match.id]))
def validate_response(resp):
    """Call ``mock`` and check that each doc's embedding is its position repeated five times.

    :param resp: response whose ``index.docs`` are inspected
    """
    mock()
    indexed = resp.index.docs
    assert len(indexed) == NUM_DOCS
    for position, document in enumerate(indexed):
        expected = np.array([position] * 5)
        np.testing.assert_almost_equal(NdArray(document.embedding).value, expected)
def test_as_blob_driver():
    """After ``MockPrediction2DocBlobDriver`` runs, every doc blob must have shape ``(3,)``."""
    documents = DocumentSet(random_docs(2))
    blob_driver = MockPrediction2DocBlobDriver()
    blob_driver._apply_all(documents)

    for document in documents:
        blob = NdArray(document.blob).value
        assert blob.shape == (3,)