Пример #1
0
def multimodal_all_types_documents():
    """Build ``NUM_DOCS`` documents, each carrying one chunk per modality.

    Layout of the document created for index ``idx``::

        doc - idx
            |
            | - chunk - embedding [idx, idx] - modality1
            | - chunk - blob [idx, idx, idx] - modality2
            | - chunk - text 'modality3' - modality3 -> inside the multimodal encoder encoded into [3, 3]
            | - chunk - buffer b'modality4' - modality4 -> inside the multimodal encoder encoded into [4, 4]

    Expected result after encoding (as noted by the original author)::

        doc - idx - embedding [idx, idx, idx, idx, idx, 3, 3, 4, 4]

    :return: list with the created documents, in index order
    """

    def _build(idx):
        # One document whose text is the index and whose chunks are added
        # in the fixed order modality1 .. modality4.
        document = jina_pb2.Document()
        document.text = str(idx)

        with_embedding = document.chunks.add()
        with_embedding.modality = 'modality1'
        GenericNdArray(with_embedding.embedding).value = np.array([idx, idx])

        with_blob = document.chunks.add()
        with_blob.modality = 'modality2'
        GenericNdArray(with_blob.blob).value = np.array([idx, idx, idx])

        with_text = document.chunks.add()
        with_text.modality = 'modality3'
        with_text.text = 'modality3'

        with_buffer = document.chunks.add()
        with_buffer.modality = 'modality4'
        with_buffer.buffer = 'modality4'.encode()

        return document

    return [_build(idx) for idx in range(0, NUM_DOCS)]
Пример #2
0
 def index_documents():
     """Create the three documents to be indexed.

     For ``i`` in 0, 1, 2 the i-th document carries::

         tag__id          = str(i)
         tag__dummy_score = -i
         embedding        = [i]

     :return: list with the three documents, ordered by ``i``
     """
     indexed = []
     for position in range(3):
         doc = jina_pb2.Document()
         doc.tags['id'] = str(position)
         # the score decreases as the id grows: 0, -1, -2
         doc.tags['dummy_score'] = -position
         GenericNdArray(doc.embedding).value = np.array([position])
         indexed.append(doc)
     return indexed
Пример #3
0
def test_array2pb():
    """Store ``e4`` in a ``GenericNdArray`` and check it reads back almost equal."""
    # NOTE(review): it is unclear why JINA_ARRAY_QUANT would be set at this
    # point; an environment variable should only be visible to the process
    # context that set it. If present, it is reported and removed.
    quant_setting = os.environ.pop('JINA_ARRAY_QUANT', None)
    if quant_setting is not None:
        print(f'quant is on: {quant_setting}')

    array_proto = GenericNdArray()
    array_proto.value = e4
    np.testing.assert_almost_equal(array_proto.value, e4)
Пример #4
0
def ground_truth_pairs():
    """Return ten document/groundtruth pairs with fixed embeddings.

    In every pair the document embedding is ``[1, 1]`` and the groundtruth
    embedding is ``[2, 2]``.

    :return: list of ``DocGroundtruthPair``
    """

    def _make_pair():
        document = jina_pb2.Document()
        groundtruth = jina_pb2.Document()
        GenericNdArray(document.embedding).value = np.array([1, 1])
        GenericNdArray(groundtruth.embedding).value = np.array([2, 2])
        return DocGroundtruthPair(doc=document, groundtruth=groundtruth)

    pair_count = 10
    return [_make_pair() for _ in range(pair_count)]
Пример #5
0
def test_multimodal_driver(simple_multimodal_driver, mock_multimodal_encoder, doc_with_multimodal_chunks):
    """The document embedding length equals the sum of its three chunk embedding lengths."""
    doc = doc_with_multimodal_chunks
    simple_multimodal_driver.attach(executor=mock_multimodal_encoder, pea=None)
    simple_multimodal_driver._apply_all([doc])

    assert len(doc.chunks) == 3
    doc_dim = GenericNdArray(doc.embedding).value.shape[0]
    # chunks 0 and 1 were named "visual" and chunk 2 "textual" in the original test
    chunk_dims = [GenericNdArray(doc.chunks[position].embedding).value.shape[0] for position in range(3)]
    assert doc_dim == sum(chunk_dims)
Пример #6
0
def test_vectorsearch_driver_mock_indexer_with_fill():
    """With ``fill_embedding=True`` every chunk's first and last match carry a 7-dim embedding.

    The driver is built with ``top_k=2`` and applied to the chunks of the
    document returned by ``create_document_to_search``.
    """
    doc = create_document_to_search()
    driver = SimpleVectorSearchDriver(top_k=2, fill_embedding=True)
    executor = MockIndexer()
    driver.attach(executor=executor, pea=None)
    driver._apply_all(doc.chunks)

    for chunk in doc.chunks:
        first_embedding = GenericNdArray(chunk.matches[0].embedding).value
        last_embedding = GenericNdArray(chunk.matches[-1].embedding).value
        # Check for a missing embedding BEFORE reading ``.shape``; otherwise an
        # unfilled embedding surfaces as an AttributeError and the None check
        # is never reached.
        assert first_embedding is not None
        assert last_embedding is not None
        assert first_embedding.shape == (7, )
        assert last_embedding.shape == (7, )
Пример #7
0
 def validate(req):
     """Check the embeddings of the two documents in ``req``.

     Each document embedding must be twice as long as ``e1`` / ``e3``
     respectively, and the first one must equal ``e1`` concatenated with
     itself (to 4 decimals).
     """
     assert len(req.docs) == 2

     first_embedding = GenericNdArray(req.docs[0].embedding).value
     assert first_embedding.shape == (e1.shape[0] * 2, )

     second_embedding = GenericNdArray(req.docs[1].embedding).value
     assert second_embedding.shape == (e3.shape[0] * 2, )

     # NOTE(review): the chunk-level checks below are disabled in the original;
     # the reason is not visible here.
     # assert GenericNdArray(req.docs[0].chunks[0].embedding).value.shape == (e2.shape[0] * 2,)
     # assert GenericNdArray(req.docs[1].chunks[0].embedding).value.shape == (e4.shape[0] * 2,)

     expected_first = np.concatenate([e1, e1], axis=0)
     np.testing.assert_almost_equal(first_embedding, expected_first, decimal=4)
Пример #8
0
def input_fn():
    """Return two documents, each with an embedding and one embedded chunk.

    Document 1 uses ``e1`` (chunk: ``e2``); document 2 uses ``e3`` (chunk: ``e4``).

    NOTE(review): the statement order inside each chunk is kept exactly as in
    the original (embedding-then-id for the first chunk, id-then-embedding for
    the second); whether ``uid.new_doc_id`` depends on the chunk content is
    not visible here.
    """
    first_doc = Document()
    GenericNdArray(first_doc.embedding).value = e1
    first_chunk = first_doc.chunks.add()
    GenericNdArray(first_chunk.embedding).value = e2
    first_chunk.id = uid.new_doc_id(first_chunk)

    second_doc = Document()
    GenericNdArray(second_doc.embedding).value = e3
    second_chunk = second_doc.chunks.add()
    second_chunk.id = uid.new_doc_id(second_chunk)
    GenericNdArray(second_chunk.embedding).value = e4

    return [first_doc, second_doc]
Пример #9
0
def test_index_driver():
    """``SimpleFillDriver`` fills a 5-dim embedding into each of the ten documents."""
    expected_count = 10
    docs = create_documents_to_encode(expected_count)
    driver = SimpleFillDriver()
    driver.attach(executor=MockIndexer(), pea=None)

    # before the driver runs no document has an embedding
    assert len(docs) == expected_count
    assert all(GenericNdArray(doc.embedding).value is None for doc in docs)

    driver._apply_all(docs)

    # afterwards the count is unchanged and every embedding has shape (5,)
    assert len(docs) == expected_count
    assert all(GenericNdArray(doc.embedding).value.shape == (5, ) for doc in docs)
Пример #10
0
def eval_request():
    """Build an ``IndexRequest`` with ten docs and ten groundtruths.

    Every doc and every groundtruth gets a single chunk with granularity 1;
    doc chunks are embedded as ``[1, 1]`` and groundtruth chunks as ``[2, 2]``.

    :return: the populated request
    """
    request = jina_pb2.Request.IndexRequest()
    for _ in range(10):
        doc_chunk = request.docs.add().chunks.add()
        gt_chunk = request.groundtruths.add().chunks.add()
        doc_chunk.granularity = 1
        gt_chunk.granularity = 1
        GenericNdArray(doc_chunk.embedding).value = np.array([1, 1])
        GenericNdArray(gt_chunk.embedding).value = np.array([2, 2])
    return request
Пример #11
0
def test_multimodal_driver_with_shuffled_order(simple_multimodal_driver, mock_multimodal_encoder_shuffled,
                                               doc_with_multimodal_chunks):
    """The document embedding equals chunk embeddings 0, 1, 2 concatenated in that order."""
    doc = doc_with_multimodal_chunks
    simple_multimodal_driver.attach(executor=mock_multimodal_encoder_shuffled, pea=None)
    simple_multimodal_driver._apply_all([doc])

    assert len(doc.chunks) == 3
    # The original named chunk 2 "visual1", chunk 0 "visual2" and chunk 1
    # "textual", and concatenated visual2 + textual + visual1.
    expected = np.concatenate(
        [GenericNdArray(doc.chunks[position].embedding).value for position in (0, 1, 2)])
    actual = GenericNdArray(doc.embedding).value
    np.testing.assert_array_equal(expected, actual)
Пример #12
0
def get_output(req):
    """Print the mean absolute difference between received and re-drawn embeddings.

    NumPy's RNG is re-seeded with ``rseed``; then, for every doc followed by
    each of its chunks, a fresh random vector of length ``embed_dim`` is drawn
    and compared against the received embedding. The accumulated error is
    divided by ``num_docs`` and printed.

    NOTE(review): presumably the sender generated its embeddings with the same
    seed and in the same order - confirm against the caller.
    """
    np.random.seed(rseed)

    def _error_against_next_draw(array_proto):
        # read the received array first, then draw the next random vector
        received = GenericNdArray(array_proto).value
        expected = np.random.random([embed_dim])
        return np.sum(np.abs(received - expected)) / embed_dim

    total_error = 0
    for doc in req.docs:
        total_error += _error_against_next_draw(doc.embedding)
        for chunk in doc.chunks:
            total_error += _error_against_next_draw(chunk.embedding)

    print(f'reconstruction error: {total_error / num_docs:.6f}')
Пример #13
0
 def input_doc():
     """Return a document (tag id 1) with three matches.

     * match tag id 10: text ``text`` and embedding ``random_np_array``
     * match tag id 20: blob ``random_np_array``
     * match tag id 30: buffer ``buffer``
     """
     root = jina_pb2.Document()
     root.tags['id'] = 1

     text_match = root.matches.add()
     text_match.tags['id'] = 10
     text_match.text = text
     GenericNdArray(text_match.embedding).value = random_np_array

     blob_match = root.matches.add()
     blob_match.tags['id'] = 20
     GenericNdArray(blob_match.blob).value = random_np_array

     buffer_match = root.matches.add()
     buffer_match.tags['id'] = 30
     buffer_match.buffer = buffer

     return root
Пример #14
0
 def input_doc():
     """Return a document (tag id 1) with three chunks.

     * chunk tag id 10: text ``text`` and embedding ``random_np_array``
     * chunk tag id 20: blob ``random_np_array``
     * chunk tag id 30: buffer ``buffer``
     """
     root = jina_pb2.Document()
     root.tags['id'] = 1

     text_chunk = root.chunks.add()
     text_chunk.tags['id'] = 10
     text_chunk.text = text
     GenericNdArray(text_chunk.embedding).value = random_np_array

     blob_chunk = root.chunks.add()
     blob_chunk.tags['id'] = 20
     GenericNdArray(blob_chunk.blob).value = random_np_array

     buffer_chunk = root.chunks.add()
     buffer_chunk.tags['id'] = 30
     buffer_chunk.buffer = buffer

     return root
Пример #15
0
def test_vectorsearch_driver_mock_indexer_with_matches_on_chunks():
    """Traversing path ``'cm'`` fills each chunk match with the embedding ``[int(match.id)]``."""
    driver = SimpleKVSearchDriver(traversal_paths=('cm',))
    driver.attach(executor=MockIndexer(), pea=None)
    doc = create_document_to_search_with_matches_on_chunks()

    driver._traverse_apply([doc])

    assert len(doc.chunks) == 1
    matches = doc.chunks[0].matches
    assert len(matches) == 3
    for match in matches:
        embedding = GenericNdArray(match.embedding).value
        assert embedding is not None
        np.testing.assert_equal(embedding, np.array([int(match.id)]))
Пример #16
0
def doc_with_multimodal_chunks_wrong(embeddings):
    """Return a document with two 'visual' chunks and one 'textual' chunk.

    The phases are kept in the original order: all chunks are added, then the
    modalities are set, then the ids are generated, and only afterwards the
    embeddings are assigned.

    :param embeddings: indexable holding the embeddings for chunks 0, 1 and 2
    :return: the populated document
    """
    doc = jina_pb2.Document()
    chunks = [doc.chunks.add() for _ in range(3)]

    for chunk, modality in zip(chunks, ('visual', 'visual', 'textual')):
        chunk.modality = modality

    # ids are generated before any embedding is attached
    for chunk in chunks:
        chunk.id = uid.new_doc_id(chunk)

    for position, chunk in enumerate(chunks):
        GenericNdArray(chunk.embedding).value = embeddings[position]

    return doc
Пример #17
0
def test_queryset_with_struct(random_workspace):
    """A per-request ``FilterQL`` queryset narrows the documents kept by the flow.

    Four documents are labelled alternately 'label1' / 'label2'. The flow's
    own filter accepts both labels; the queryset passed on the second call
    restricts the lookup to 'label2'.
    """
    total_docs = 4

    def _labelled_doc(doc_id):
        doc = jina_pb2.Document()
        doc.text = f'I am doc{doc_id}'
        GenericNdArray(doc.embedding).value = np.array([doc_id])
        doc.tags['label'] = f'label{doc_id % 2 + 1}'
        return doc

    docs = [_labelled_doc(doc_id) for doc_id in range(total_docs)]

    filter_spec = '- !FilterQL | {lookups: {tags__label__in: [label1, label2]}, traversal_paths: [r]}'
    flow = Flow().add(uses=filter_spec)

    def expect_all_docs(resp):
        assert len(resp.docs) == total_docs

    def expect_label2_docs(resp):
        assert len(resp.docs) == total_docs / 2

    with flow:
        # without a queryset every document is kept
        flow.index(docs, output_fn=expect_all_docs, callback_on_body=True)

        # with the queryset only the documents labelled 'label2' are kept
        label2_only = jina_pb2.QueryLang(name='FilterQL', priority=1)
        label2_only.parameters['lookups'] = {'tags__label': 'label2'}
        label2_only.parameters['traversal_paths'] = ['r']
        flow.index(docs,
                   queryset=label2_only,
                   output_fn=expect_label2_docs,
                   callback_on_body=True)
Пример #18
0
def test_as_blob_driver():
    """After ``MockPrediction2DocBlobDriver`` traverses two docs, each blob has shape (3,)."""
    docs = list(random_docs(2))
    MockPrediction2DocBlobDriver()._traverse_apply(docs)

    blob_shapes = [GenericNdArray(doc.blob).value.shape for doc in docs]
    assert all(shape == (3, ) for shape in blob_shapes)
Пример #19
0
def test_request_generate_numpy_arrays():
    """A 10x10 array with ``batch_size=5`` yields two requests of five docs each.

    Every doc must report ``length == 5`` and hold a blob of shape ``(10,)``.
    """
    input_array = np.random.random([10, 10])
    requests = _generate(data=input_array, batch_size=5)

    # the first and the second batch are checked identically
    for _ in range(2):
        request = next(requests)
        batch_docs = request.index.docs
        assert len(batch_docs) == 5
        for doc in batch_docs:
            assert doc.length == 5
            assert GenericNdArray(doc.blob).value.shape == (10,)
Пример #20
0
def create_documents_to_encode(num_docs):
    """Return ``num_docs`` documents whose blob is the one-element array ``[idx]``.

    :param num_docs: number of documents to create
    :return: list of documents, in index order
    """

    def _doc_with_blob(value):
        doc = jina_pb2.Document()
        GenericNdArray(doc.blob).value = np.array([value])
        return doc

    return [_doc_with_blob(idx) for idx in range(num_docs)]
Пример #21
0
def test_multimodal_driver_assert_one_chunk_per_modality(simple_multimodal_driver, mock_multimodal_encoder,
                                                         doc_with_multimodal_chunks_wrong):
    """A document from the ``doc_with_multimodal_chunks_wrong`` fixture is left without an embedding."""
    doc = doc_with_multimodal_chunks_wrong
    simple_multimodal_driver.attach(executor=mock_multimodal_encoder, pea=None)
    simple_multimodal_driver._apply_all([doc])

    assert len(doc.chunks) == 3
    # the driver considers this document invalid for encoding, so it stays unset
    doc_embedding = GenericNdArray(doc.embedding).value
    assert doc_embedding is None
Пример #22
0
def random_docs(num_docs, embed_dim=10, jitter=1):
    """Yield ``num_docs`` documents with a random embedding each.

    :param num_docs: number of documents to generate
    :param embed_dim: base length of the embedding
    :param jitter: exclusive upper bound of the random extra length added to
        ``embed_dim`` (drawn with ``np.random.randint(0, jitter)``)
    :return: generator of documents tagged with their index and text ``b'hello'``
    """
    for doc_index in range(num_docs):
        doc = jina_pb2.Document()
        doc.tags['id'] = doc_index
        doc.text = b'hello'
        # draw the length first, then the vector itself
        dim = embed_dim + np.random.randint(0, jitter)
        GenericNdArray(doc.embedding).value = np.random.random([dim])
        yield doc
Пример #23
0
 def create(self):
     """Return a groundtruth document populated according to ``field_type``.

     ``'text'`` sets the text, ``'buffer'`` sets four raw bytes and ``'blob'``
     sets a four-element array; any other value leaves the document empty.
     """
     groundtruth = jina_pb2.Document()
     if field_type == 'text':
         groundtruth.text = 'aaaa'
         return groundtruth
     if field_type == 'buffer':
         groundtruth.buffer = b'\x01\x02\x03\04'
         return groundtruth
     if field_type == 'blob':
         GenericNdArray(groundtruth.blob).value = np.array([1, 1, 1, 1])
     return groundtruth
Пример #24
0
    def doc_groundtruth_evaluation_pairs():
        """Return two ``(doc, groundtruth)`` tuples for evaluation.

        Each doc carries a one-element embedding; each groundtruth carries two
        matches whose ``tags['id']`` hold the expected ids (as strings).
        The metric values in the comments below are the original author's
        expectations.
        """

        def _pair(query_value, expected_ids):
            doc = jina_pb2.Document()
            GenericNdArray(doc.embedding).value = np.array([query_value])
            groundtruth = jina_pb2.Document()
            for expected_id in expected_ids:
                groundtruth.matches.add().tags['id'] = expected_id
            return doc, groundtruth

        # Query [0]: it will match 0 and 1.
        # top_k is set to 2 for VectorSearchDriver
        # expects as matches [0, 2] but given [0, 1]
        #   Precision@1 = 100%, Precision@2 = 50%
        #   Recall@1 = 100%, Recall@2 = 50%
        # expects as ranked [0, 2] but given [0, 1]
        #   Precision@1 = 100%, Precision@2 = 50%
        #   Recall@1 = 100%, Recall@2 = 50%
        first_pair = _pair(0, ('0', '2'))

        # Query [2]: it will match 2 and 1.
        # expects as matches [1, 2] but given [2, 1]
        #   Precision@1 = 100%, Precision@2 = 100%
        #   Recall@1 = 100%, Recall@2 = 100%
        # expects as ranked [1, 2] but given [2, 1]
        #   Precision@1 = 100%, Precision@2 = 100%
        #   Recall@1 = 100%, Recall@2 = 100%
        second_pair = _pair(2, ('1', '2'))

        return [first_pair, second_pair]
Пример #25
0
def test_vectorsearch_driver_mock_indexer_traverse_apply():
    """``SimpleKVSearchDriver`` fills chunk embeddings and drops the chunk without a match."""
    doc = create_document_to_search()
    driver = SimpleKVSearchDriver()
    driver.attach(executor=MockIndexer(), pea=None)

    # all five chunks start without an embedding
    assert len(doc.chunks) == 5
    assert all(GenericNdArray(chunk.embedding).value is None for chunk in doc.chunks)

    driver._traverse_apply(doc.chunks)

    # chunk idx 5 had no match and is removed as a missing idx
    assert len(doc.chunks) == 4
    for chunk in doc.chunks:
        embedding = GenericNdArray(chunk.embedding).value
        assert embedding is not None
        np.testing.assert_equal(embedding, np.array([int(chunk.id)]))
Пример #26
0
def random_docs(num_docs, chunks_per_doc=5, embed_dim=10, jitter=1):
    """Yield ``num_docs`` random documents, each with ``chunks_per_doc`` random chunks.

    :param num_docs: number of documents to generate
    :param chunks_per_doc: number of chunks added to every document
    :param embed_dim: base length of every embedding
    :param jitter: exclusive upper bound of the random extra length added to
        ``embed_dim`` (drawn with ``np.random.randint(0, jitter)``)
    :return: generator of documents

    Chunk tag ids start at ``3 * num_docs`` and increase by one per chunk
    across all documents. Ids are generated with ``uid.new_doc_id`` after the
    other fields of the doc / chunk have been set.
    """
    first_chunk_tag = 3 * num_docs  # avoid collision with docs
    for doc_index in range(num_docs):
        doc = jina_pb2.Document()
        doc.tags['id'] = doc_index
        doc.text = b'hello world'
        doc_dim = embed_dim + np.random.randint(0, jitter)
        GenericNdArray(doc.embedding).value = np.random.random([doc_dim])
        doc.id = uid.new_doc_id(doc)

        for chunk_index in range(chunks_per_doc):
            # same value the running counter of the original would hold here
            chunk_tag = first_chunk_tag + doc_index * chunks_per_doc + chunk_index
            chunk = doc.chunks.add()
            chunk.text = "i'm chunk %d from doc %d" % (chunk_tag, doc_index)
            chunk_dim = embed_dim + np.random.randint(0, jitter)
            GenericNdArray(chunk.embedding).value = np.random.random([chunk_dim])
            chunk.tags['id'] = chunk_tag
            chunk.tags['parent_id'] = doc_index
            chunk.parent_id = doc.id
            # the chunk id is generated last, once every other field is set
            chunk.id = uid.new_doc_id(chunk)
        yield doc
Пример #27
0
def get_duplicate_docs(num_docs=10):
    """Create documents where consecutive pairs share the same content.

    The document at index ``idx`` gets the content ``idx // 2``, used both as
    its one-element embedding and inside its text, so indices ``2k`` and
    ``2k + 1`` produce duplicates.

    :param num_docs: number of documents to create
    :return: tuple ``(documents, number_of_distinct_contents)``
    """
    result = []
    unique_contents = set()
    for idx in range(num_docs):
        doc = jina_pb2.Document()
        # integer floor division: no float round trip as with int(idx / 2)
        content = idx // 2
        GenericNdArray(doc.embedding).value = np.array([content])
        doc.text = f'I am doc{content}'
        result.append(doc)
        unique_contents.add(content)
    return result, len(unique_contents)
Пример #28
0
 def request(field_type):
     """Build an ``IndexRequest`` with ten doc/groundtruth chunk pairs.

     Every doc and groundtruth gets one chunk with granularity 1. Depending on
     ``field_type`` the chunks are filled so that the groundtruth holds one
     element more than the doc:

     * ``'text'``: ``'aaa'`` vs ``'aaaa'``
     * ``'buffer'``: three bytes vs four bytes
     * ``'blob'``: three ones vs four ones

     Any other ``field_type`` leaves the chunks without content.

     :param field_type: which chunk field to populate
     :return: the populated request
     """
     index_request = jina_pb2.Request.IndexRequest()
     for _ in range(10):
         doc_chunk = index_request.docs.add().chunks.add()
         gt_chunk = index_request.groundtruths.add().chunks.add()
         doc_chunk.granularity = 1
         gt_chunk.granularity = 1

         if field_type == 'text':
             doc_chunk.text = 'aaa'
             gt_chunk.text = 'aaaa'
         elif field_type == 'buffer':
             doc_chunk.buffer = b'\x01\x02\x03'
             gt_chunk.buffer = b'\x01\x02\x03\x04'
         elif field_type == 'blob':
             GenericNdArray(doc_chunk.blob).value = np.array([1, 1, 1])
             GenericNdArray(gt_chunk.blob).value = np.array([1, 1, 1, 1])
     return index_request
Пример #29
0
    def validate_fn(resp):
        """Check that the single returned search doc has the three expected chunks.

        The doc must have tag id 1; its chunks must have tag ids 10, 20 and 30
        and carry, respectively, ``text`` plus the ``random_np_array``
        embedding, the ``random_np_array`` blob, and ``buffer``.
        """
        returned_docs = resp.search.docs
        assert len(returned_docs) == 1
        doc = returned_docs[0]
        assert int(doc.tags['id']) == 1
        assert len(doc.chunks) == 3

        text_chunk = doc.chunks[0]
        assert int(text_chunk.tags['id']) == 10
        assert text_chunk.text == text
        text_chunk_embedding = GenericNdArray(text_chunk.embedding).value
        np.testing.assert_almost_equal(random_np_array, text_chunk_embedding)

        blob_chunk = doc.chunks[1]
        assert int(blob_chunk.tags['id']) == 20
        blob_chunk_array = GenericNdArray(blob_chunk.blob).value
        np.testing.assert_almost_equal(random_np_array, blob_chunk_array)

        buffer_chunk = doc.chunks[2]
        assert int(buffer_chunk.tags['id']) == 30
        assert buffer_chunk.buffer == buffer
Пример #30
0
 def __init__(self, *args, **kwargs):
     """Initialise the parent and fill ``self.db`` with four serialized documents.

     For each key 1..4 a document with ``id == str(key)`` and the embedding
     ``[key]`` is created and stored under the integer key in serialized form.
     """
     super().__init__(*args, **kwargs)
     serialized_docs = {}
     for key in (1, 2, 3, 4):
         doc = jina_pb2.Document()
         doc.id = str(key)
         GenericNdArray(doc.embedding).value = np.array([int(doc.id)])
         serialized_docs[key] = doc.SerializeToString()
     self.db = serialized_docs