def test_non_empty_fields():
    """Check ``non_empty_fields`` for a scored, a default and an empty-id document."""
    scored_doc = Document(score=NamedScore(value=42))
    assert scored_doc.non_empty_fields == ('id', 'score')

    default_doc = Document()
    assert default_doc.non_empty_fields == ('id',)

    # An explicitly empty id leaves no non-empty field at all.
    blank_id_doc = Document(id='')
    assert not blank_id_doc.non_empty_fields
def test_graph_count_invariance(graph, expected_output):
    """Add the same ``doc0 -> doc1`` edge twice and compare ``graph.num_edges``.

    :param graph: graph object the edges are added to
    :param expected_output: edge count expected after both insertions
    """
    source = Document(text='Document0')
    target = Document(text='Document1')

    # Same direction, same endpoints, inserted two times in a row.
    for _ in range(2):
        graph.add_edge(source, target)

    assert graph.num_edges == expected_output
def test_graph_edge_behaviour_creation(graph, expected_output):
    """Add an edge in both directions between two documents and compare ``graph.num_edges``.

    :param graph: graph object the edges are added to
    :param expected_output: edge count expected after both insertions
    """
    first = Document(text='Document0')
    second = Document(text='Document1')

    # Forward edge first, then the reversed one.
    for origin, destination in ((first, second), (second, first)):
        graph.add_edge(origin, destination)

    assert graph.num_edges == expected_output
def test_document_sparse_attributes_pytorch(torch_sparse_matrix):
    """Store a torch sparse matrix as ``embedding`` and ``blob`` and compare dense forms.

    :param torch_sparse_matrix: sparse tensor assigned to both attributes
    """
    doc = Document()
    doc.embedding = torch_sparse_matrix
    doc.blob = torch_sparse_matrix

    # Each attribute, densified, must equal the densified input tensor.
    for attribute in ('embedding', 'blob'):
        stored = getattr(doc, attribute)
        np.testing.assert_array_equal(
            stored.todense(),
            torch_sparse_matrix.to_dense().numpy(),
        )
def build_document(chunk=None):
    """Create a document holding ``chunk`` as its first chunk, with content hashes updated.

    The chunk's hash is updated while excluding ``parent_id``, ``id`` and
    ``content_hash``; its ``parent_id`` is then set to ``0`` and the parent's
    hash is updated including only ``chunks``.

    :param chunk: object appended to the new document's ``chunks``
    :return: the newly built document
    """
    hash_exclusions = ('parent_id', 'id', 'content_hash')

    parent = Document()
    parent.chunks.append(chunk)
    parent.chunks[0].update_content_hash(exclude_fields=hash_exclusions)
    parent.chunks[0].parent_id = 0
    parent.update_content_hash(include_fields=('chunks',), exclude_fields=None)
    return parent
def test_add_single_edge_from_id_strings_non_existing_nodes():
    """Expect ``AssertionError`` when an edge is added by id for documents never added to the graph."""
    graph_doc = GraphDocument()
    source = Document(text='Document0')
    target = Document(text='Document1')
    edge_features = {'text': 'I connect Doc0 and Doc1'}

    with pytest.raises(AssertionError):
        graph_doc.add_single_edge(source.id, target.id, features=edge_features)
def test_uri_get_set():
    """Exercise the ``uri`` setter/getter and its interplay with ``mime_type`` and ``content``."""
    image_url = 'https://abc.com/a.jpg'
    plain_value = 'abcdefg'

    doc = Document()

    doc.uri = image_url
    assert doc.uri == image_url
    assert doc.mime_type == 'image/jpeg'

    doc.uri = plain_value
    assert doc.uri == plain_value

    # After assigning content, the text is set and the uri reads as falsy.
    doc.content = plain_value
    assert doc.text == plain_value
    assert not doc.uri
def test_add_remove_node_deprecated():
    """Add two nodes with ``add_node``, remove them with ``remove_node``, check counts."""
    graph_doc = GraphDocument()
    nodes = [Document(id='1'), Document(id='2')]

    for node in nodes:
        graph_doc.add_node(node)
    assert len(graph_doc.nodes) == 2

    for node in nodes:
        graph_doc.remove_node(node)
    assert len(graph_doc.nodes) == 0
def test_pydatic_document_to_jina_document():
    """Build a ``Document`` from the pydantic model's JSON and dict outputs and check the text."""
    model_cls = PROTO_TO_PYDANTIC_MODELS.DocumentProto

    # From the JSON string produced by the model.
    from_json = Document(model_cls(text='abc').json())
    assert from_json.text == 'abc'
    assert from_json.content == 'abc'

    # From the dict produced by the model.
    from_dict = Document(model_cls(text='abc').dict())
    assert from_dict.text == 'abc'
    assert from_dict.content == 'abc'
def test_manual_update_edges_features(graph, expected_output):
    """Write an edge feature directly into the graph's protobuf body and read it back.

    :param graph: graph object the edge is added to
    :param expected_output: unused in the body of this test
    """
    source = Document(text='Document0')
    target = Document(text='Document1')
    graph.add_single_edge(source, target)

    key = graph._get_edge_key(source.id, target.id)
    new_features = {'number_value': 1234}

    graph._pb_body.graph.edge_features[key] = new_features
    stored_value = graph._pb_body.graph.edge_features[key]['number_value']
    assert stored_value == 1234
def test_doc_content():
    """Check ``content`` after setting ``text`` and ``blob``, then check ``buffer`` round-trips."""
    doc = Document()
    assert doc.content is None

    doc.text = 'abc'
    assert doc.content == 'abc'

    random_blob = np.random.random([10, 10])
    doc.blob = random_blob
    np.testing.assert_equal(doc.content, random_blob)

    doc.buffer = b'123'
    assert doc.buffer == b'123'
def test_document_sparse_attributes_tensorflow(tf_sparse_matrix):
    """Store a TensorFlow sparse matrix as ``embedding`` and ``blob`` and compare dense forms.

    :param tf_sparse_matrix: sparse tensor assigned to both attributes
    """
    import tensorflow as tf

    doc = Document()
    doc.embedding = tf_sparse_matrix
    doc.blob = tf_sparse_matrix

    # Each attribute, densified, must equal the densified input tensor.
    for attribute in ('embedding', 'blob'):
        stored = getattr(doc, attribute)
        np.testing.assert_array_equal(
            stored.todense(),
            tf.sparse.to_dense(tf_sparse_matrix),
        )
def test_added_edges_in_edge_features(graph, expected_output):
    """Add an edge without features and check its key maps to ``None`` in ``edge_features``.

    :param graph: graph object the edge is added to
    :param expected_output: unused in the body of this test
    """
    source = Document(text='Document0')
    target = Document(text='Document1')
    graph.add_edge(source, target)

    key = graph._get_edge_key(source, target)
    assert key in graph.edge_features
    assert graph.edge_features[key] is None
def test_sparse_get_set():
    """Assign COO matrices through ``content`` and ``blob`` and compare what ``content`` returns."""
    doc = Document()
    assert doc.content is None

    first = coo_matrix(np.array([1, 2, 3]))
    doc.content = first
    # ``!=`` on sparse matrices yields a sparse mask; zero stored entries means equal.
    assert (doc.content != first).nnz == 0

    second = coo_matrix(np.array([3, 2, 1]))
    assert (doc.content != second).nnz != 0

    doc.blob = second
    assert (doc.content != second).nnz == 0
def test_graph_add_multiple_nodes():
    """Add four documents in one ``add_nodes`` call; expect four nodes and no edges."""
    graph_doc = GraphDocument()
    nodes = [
        Document(text='Document0'),
        Document(text='Document1'),
        Document(text='Document2'),
        Document(text='Document3'),
    ]

    graph_doc.add_nodes(nodes)

    assert graph_doc.num_nodes == 4
    assert graph_doc.num_edges == 0
def test_doc_update_fields():
    """Set several fields through ``Document.update`` and read each one back."""
    doc = Document()
    embedding = np.random.random([10, 10])
    tags = {'tags': 'string', 'tag-tag': {'tags': 123.45}}
    location = [12, 34, 56]
    modality = 'text-mod'

    doc.update(
        embedding=embedding,
        tags=tags,
        location=location,
        modality=modality,
    )

    np.testing.assert_equal(doc.embedding, embedding)
    assert list(doc.location) == location
    assert doc.modality == modality
    assert MessageToDict(doc.tags) == tags
def test_doc_score():
    """Attach a ``NamedScore`` to a document and check its fields are readable from the document."""
    from jina.types.score import NamedScore

    scored_doc = Document(text='text')
    scored_doc.score = NamedScore(
        op_name='operation',
        value=10.0,
        ref_id=scored_doc.id,
    )

    attached = scored_doc.score
    assert attached.op_name == 'operation'
    assert scored_doc.score.value == 10.0
    assert scored_doc.score.ref_id == scored_doc.id
def test_tags_document():
    """Check that extra keyword arguments of the pydantic model end up in ``tags``.

    Verified both on the pydantic model and on the ``Document`` built from
    its dict, with and without an explicit ``tags`` argument.
    """
    model_cls = PROTO_TO_PYDANTIC_MODELS.DocumentProto

    # Only an unknown keyword argument.
    model = model_cls(hello='world')
    assert model.tags == {'hello': 'world'}
    assert MessageToDict(Document(model.dict()).tags) == {'hello': 'world'}

    # Unknown keyword argument combined with an explicit tags dict.
    model = model_cls(hello='world', tags={'key': 'value'})
    assert model.tags == {'hello': 'world', 'key': 'value'}
    converted_tags = MessageToDict(Document(model.dict()).tags)
    assert converted_tags == {
        'hello': 'world',
        'key': 'value',
    }
def test_doc_plot():
    """Nest documents as chunks and matches, then check ``_mermaid_to_url('svg')`` is truthy."""
    dragon = Document(
        id='🐲',
        embedding=np.array([0, 0]),
        tags={'guardian': 'Azure Dragon', 'position': 'East'},
    )
    bird = Document(
        id='🐦',
        embedding=np.array([1, 0]),
        tags={'guardian': 'Vermilion Bird', 'position': 'South'},
    )
    tortoise = Document(
        id='🐢',
        embedding=np.array([0, 1]),
        tags={'guardian': 'Black Tortoise', 'position': 'North'},
    )
    tiger = Document(
        id='🐯',
        embedding=np.array([1, 1]),
        tags={'guardian': 'White Tiger', 'position': 'West'},
    )

    # dragon -> chunk bird -> chunk tortoise; tiger is a match of dragon.
    dragon.chunks.append(bird)
    dragon.chunks[0].chunks.append(tortoise)
    dragon.matches.append(tiger)

    assert dragon._mermaid_to_url('svg')
def test_query_match_array_sort_scores():
    """Sort a query's matches by their ``euclid`` score value and check the new first match."""

    def euclid_value(match):
        """Return the value of the match's ``euclid`` score."""
        return match.scores['euclid'].value

    query = Document()
    candidates = []
    for i in range(10):
        candidates.append(Document(id=i, copy=True, scores={'euclid': 10 - i}))
    query.matches = candidates

    assert query.matches[0].id == '0'
    assert query.matches[0].scores['euclid'].value == 10

    # sort matches by their values
    query.matches.sort(key=euclid_value)

    assert query.matches[0].id == '9'
    assert query.matches[0].scores['euclid'].value == 1
def get_test_doc():
    """Build two documents, each with content, tags, an embedding and one chunk.

    :return: tuple of the ``'🐲'`` document followed by the ``'🐦'`` document
    """
    first = Document(
        id='🐲',
        content='hello-world',
        tags={'a': 'b'},
        embedding=np.array([1, 2, 3]),
        chunks=[Document(id='🐢')],
    )
    second = Document(
        id='🐦',
        content='goodbye-world',
        tags={'c': 'd'},
        embedding=np.array([4, 5, 6]),
        chunks=[Document(id='🐯')],
    )
    return first, second
def test_add_remove_edge_deprecated():
    """Add one edge with ``add_edge`` and remove it with ``remove_edge``; nodes stay, edge goes."""
    graph_doc = GraphDocument()
    source = Document(text='Document0')
    target = Document(text='Document1')
    edge_features = {'text': 'I connect Doc0 and Doc1'}

    graph_doc.add_edge(source, target, features=edge_features)
    assert graph_doc.num_nodes == 2
    assert graph_doc.num_edges == 1

    # Removing the edge must not remove its endpoints.
    graph_doc.remove_edge(source, target)
    assert graph_doc.num_nodes == 2
    assert graph_doc.num_edges == 0
def eval_request():
    """Build a ``'search'`` request holding five documents.

    Document ids are the two-character strings ``'01'`` .. ``'05'`` repeated
    eight times each.

    :return: the populated request
    """
    request = Request()
    request.request_type = 'search'
    # doc: 1
    # doc: 2
    # doc: 3
    # doc: 4
    # doc: 5 - will be missing from KV indexer
    for idx in range(5):
        doc = Document()
        doc.id = f'0{str(idx + 1)}' * 8
        request.docs.append(doc)
    return request
def test_doc_update_fields():
    """Set several fields through ``Document.set_attributes`` and read each one back."""
    doc = Document()
    embedding = np.random.random([10, 10])
    tags = {'tags': 'string', 'tag-tag': {'tags': 123.45}}
    location = [12, 34, 56]
    modality = 'text-mod'
    weight = 2.0

    doc.set_attributes(
        embedding=embedding,
        tags=tags,
        location=location,
        modality=modality,
        weight=weight,
    )

    np.testing.assert_equal(doc.embedding, embedding)
    assert list(doc.location) == location
    assert doc.modality == modality
    assert doc.tags == tags
    assert doc.weight == weight
def test_set_get_mime():
    """Check that ``'jpg'``, ``'jpeg'`` and ``'.jpg'`` all read back as ``'image/jpeg'``."""
    for short_form in ('jpg', 'jpeg', '.jpg'):
        # A fresh document per spelling, so no state carries over.
        doc = Document()
        doc.mime_type = short_form
        assert doc.mime_type == 'image/jpeg'
def graph():
    """Build a graph of four documents connected by five edges carrying text features.

    Edges, in insertion order: 0->1, 0->2, 2->1, 1->3, 2->3.

    :return: the populated ``GraphDocument``
    """
    graph_doc = GraphDocument()
    doc0 = Document(text='Document0')
    doc1 = Document(text='Document1')
    doc2 = Document(text='Document2')
    doc3 = Document(text='Document3')

    edge_specs = (
        (doc0, doc1, 'I connect Doc0 and Doc1'),
        (doc0, doc2, 'I connect Doc0 and Doc2'),
        (doc2, doc1, 'I connect Doc2 and Doc1'),
        (doc1, doc3, 'I connect Doc1 and Doc3'),
        (doc2, doc3, 'I connect Doc2 and Doc3'),
    )
    for source, target, description in edge_specs:
        graph_doc.add_edge(source, target, features={'text': description})

    return graph_doc
def test_doc_arbitrary_dict(from_str):
    """Build documents from dicts with unknown keys and check those keys land in ``tags``.

    :param from_str: when truthy, the dict is JSON-encoded before being passed to ``Document``
    """
    # Known ``id``, an unknown key, and an explicit ``tags`` entry.
    payload = {'id': '123', 'hello': 'world', 'tags': {'good': 'bye'}}
    doc = Document(json.dumps(payload) if from_str else payload)
    assert doc.id == '123'
    assert doc.tags['hello'] == 'world'
    assert doc.tags['good'] == 'bye'

    # Unknown keys only.
    payload = {'hello': 'world', 'good': 'bye'}
    doc = Document(json.dumps(payload) if from_str else payload)
    assert doc.tags['hello'] == 'world'
    assert doc.tags['good'] == 'bye'
def empty_documents():
    """Build documents with ids for indices 100..119 and no other field set here.

    Each id is the index left-padded with zeros to 16 characters.

    :return: a ``DocumentSet`` wrapping the created documents
    """
    collected = []
    for idx in range(100, 120):
        with Document() as doc:
            doc.id = f'{idx:0>16}'
        collected.append(doc)
    return DocumentSet(collected)
def deleted_documents():
    """Build documents with ids for indices 0..2 and no other field set here.

    Each id is the index left-padded with zeros to 16 characters.

    :return: a ``DocumentSet`` wrapping the created documents
    """
    collected = []
    for idx in range(3):
        with Document() as doc:
            doc.id = f'{idx:0>16}'
        collected.append(doc)
    return DocumentSet(collected)
def documents(embedding_cls_type, text_prefix='', num_docs=5):
    """Build documents whose embedding is a random 10-value vector in the requested container.

    :param embedding_cls_type: one of ``'dense'``, ``'scipy_csr'``,
        ``'scipy_coo'``, ``'torch'`` or ``'tf'``; any other value leaves the
        embedding unassigned
    :param text_prefix: string placed before the index in each document's text
    :param num_docs: number of documents to create
    :return: a ``DocumentArray`` wrapping the created documents
    """
    collected = []
    for idx in range(num_docs):
        with Document(text=f'{text_prefix}{idx}') as doc:
            doc.id = f'{idx:0>16}'
            vector = np.random.random([10])

            if embedding_cls_type == 'dense':
                doc.embedding = vector
            elif embedding_cls_type == 'scipy_csr':
                doc.embedding = scipy.sparse.csr_matrix(vector)
            elif embedding_cls_type == 'scipy_coo':
                doc.embedding = scipy.sparse.coo_matrix(vector)
            elif embedding_cls_type in ('torch', 'tf'):
                # Both frameworks are fed from the same COO decomposition.
                coo = scipy.sparse.coo_matrix(vector)
                values = coo.data
                if embedding_cls_type == 'torch':
                    indices = np.vstack((coo.row, coo.col))
                    doc.embedding = torch.sparse_coo_tensor(
                        indices,
                        values,
                        coo.shape,
                    )
                else:
                    indices = list(zip(coo.row, coo.col))
                    doc.embedding = tf.SparseTensor(
                        indices=indices,
                        values=values,
                        dense_shape=[1, 10],
                    )
        collected.append(doc)
    return DocumentArray(collected)