示例#1
0
def random_docs(num_docs,
                chunks_per_doc=5,
                embed_dim=10,
                jitter=1,
                start_id=0,
                embedding=True) -> Iterator['Document']:
    """Lazily yield ``num_docs`` root documents, each holding chunk documents.

    Root documents receive the ids ``start_id, start_id + 1, ...``. Chunk ids
    start at ``start_id + num_docs`` and grow by one for every chunk created,
    across all roots.

    :param num_docs: number of root documents to generate
    :param chunks_per_doc: number of chunks appended to every root document
    :param embed_dim: base length of each random embedding
    :param jitter: exclusive upper bound passed to ``np.random.randint(0, jitter)``;
        the drawn value is added to ``embed_dim``
    :param start_id: id of the first root document
    :param embedding: when falsy, no embedding is set on roots or chunks
    :return: a generator of root documents
    """

    def _random_vector():
        # Draw the length first, then the values, so the order of RNG calls is fixed.
        length = embed_dim + np.random.randint(0, jitter)
        return np.random.random([length])

    chunk_id = start_id + num_docs
    for offset in range(num_docs):
        root_id = start_id + offset

        root = Document(id=root_id)
        root.text = b'hello world'
        root.tags['id'] = root_id
        if embedding:
            root.embedding = _random_vector()
        root.update_content_hash()

        for _ in range(chunks_per_doc):
            chunk = Document(id=chunk_id)
            chunk.text = 'i\'m chunk %d from doc %d' % (chunk_id, root_id)
            if embedding:
                chunk.embedding = _random_vector()
            chunk.tags['parent_id'] = root_id
            chunk.tags['id'] = chunk_id
            chunk.update_content_hash()
            root.chunks.append(chunk)
            chunk_id += 1

        yield root
示例#2
0
def doc_with_multimodal_chunks(embeddings):
    """Build a ``MultimodalDocument`` holding three chunks of distinct modality.

    The chunks are labelled ``'visual1'``, ``'visual2'`` and ``'textual'`` and
    receive ``embeddings[0]``, ``embeddings[1]`` and ``embeddings[2]`` in that
    order.

    :param embeddings: indexable container providing at least three embeddings
    :return: the document with the three chunks appended
    """
    root = MultimodalDocument()
    modalities = ('visual1', 'visual2', 'textual')
    parts = [Document() for _ in modalities]

    # Phase 1: tag every chunk with its modality.
    for part, modality in zip(parts, modalities):
        part.modality = modality

    # Phase 2: attach embeddings by position.
    for position, part in enumerate(parts):
        part.embedding = embeddings[position]

    # Phase 3: hang the chunks under the root document.
    for part in parts:
        root.chunks.append(part)
    return root
示例#3
0
def test_indexer_with_ref_indexer_compound_move(random_workspace_move,
                                                parallel, index_docs, mocker,
                                                uses_no_docker):
    """Index with the compound flow, move the workspace, then query it.

    After indexing, the directory named by ``JINA_TEST_INDEXER_WITH_REF_INDEXER``
    is copied to the one named by ``JINA_TEST_INDEXER_WITH_REF_INDEXER_QUERY``
    and the original is removed. The query flow must return one document with
    ``top_k`` matches, and the callback must fire exactly once.
    """
    top_k = 10
    index_yaml = os.path.join(cur_dir, 'compound-index.yml')
    with Flow.load_config(index_yaml) as index_flow:
        index_flow.index(input_fn=index_docs, request_size=10)

    call_tracker = mocker.Mock()

    # Relocate the indexed workspace: copy it to the query location, drop the source.
    source_dir = os.environ['JINA_TEST_INDEXER_WITH_REF_INDEXER']
    target_dir = os.environ['JINA_TEST_INDEXER_WITH_REF_INDEXER_QUERY']
    shutil.copytree(source_dir, target_dir)
    shutil.rmtree(source_dir)

    def validate_response(resp):
        call_tracker()
        found = resp.search.docs
        assert len(found) == 1
        assert len(found[0].matches) == top_k

    query = Document()
    query.embedding = np.array([1, 1])
    query_yaml = os.path.join(cur_dir, 'compound-query.yml')
    with Flow.load_config(query_yaml) as query_flow:
        query_flow.search(input_fn=[query],
                          on_done=validate_response,
                          top_k=top_k)

    call_tracker.assert_called_once()
示例#4
0
def random_docs(start, end, embed_dim=10):
    """Yield one document per integer in ``range(start, end)``.

    Each document gets an id left-padded with ``'0'`` to 16 characters, a
    ``tags['id']`` equal to the integer, ten random lowercase ASCII letters
    (UTF-8 encoded) as text, and a random embedding of length ``embed_dim``.
    """
    letters = string.ascii_lowercase
    for index in range(start, end):
        doc = Document()
        doc.id = format(index, '0>16')
        doc.tags['id'] = index
        word = ''.join(random.choice(letters) for _ in range(10))
        doc.text = word.encode('utf8')
        doc.embedding = np.random.random([embed_dim])
        yield doc
示例#5
0
def random_docs(
    num_docs,
    chunks_per_doc=5,
    embed_dim=10,
    jitter=1,
    start_id=0,
    embedding=True,
    sparse_embedding=False,
    text='hello world',
) -> Iterator['Document']:
    """Lazily yield ``num_docs`` root documents, each holding chunk documents.

    Root ids run from ``start_id`` upwards; chunk ids start at
    ``start_id + num_docs`` and increase by one per chunk across all roots.

    :param num_docs: number of root documents to generate
    :param chunks_per_doc: number of chunks appended to every root document
    :param embed_dim: base length of each random dense embedding
    :param jitter: exclusive upper bound passed to ``np.random.randint(0, jitter)``;
        the drawn value is added to ``embed_dim``
    :param start_id: id of the first root document
    :param embedding: when falsy, no embedding is set on roots or chunks
    :param sparse_embedding: when truthy (and ``embedding`` is truthy), root
        documents get a fixed ``scipy.sparse.coo_matrix``; chunks always get a
        dense random vector
    :param text: text assigned to every root document
    :return: a generator of root documents
    """

    def _dense_vector():
        # Draw the length first, then the values, so the order of RNG calls is fixed.
        length = embed_dim + np.random.randint(0, jitter)
        return np.random.random([length])

    chunk_id = start_id + num_docs
    for offset in range(num_docs):
        root_id = start_id + offset

        root = Document(id=root_id)
        root.text = text
        root.tags['id'] = root_id
        if embedding and sparse_embedding:
            from scipy.sparse import coo_matrix

            values = np.array([1, 1, 1])
            rows = np.array([0, 1, 2])
            cols = np.array([1, 2, 1])
            root.embedding = coo_matrix((values, (rows, cols)))
        elif embedding:
            root.embedding = _dense_vector()
        root.update_content_hash()

        for _ in range(chunks_per_doc):
            chunk = Document(id=chunk_id)
            chunk.text = 'i\'m chunk %d from doc %d' % (chunk_id, root_id)
            if embedding:
                chunk.embedding = _dense_vector()
            chunk.tags['parent_id'] = root_id
            chunk.tags['id'] = chunk_id
            chunk.update_content_hash()
            root.chunks.append(chunk)
            chunk_id += 1

        yield root
示例#6
0
def random_docs(start, end, embed_dim=10, jitter=1, has_content=True):
    """Yield one document per integer in ``range(start, end)``.

    The id is the decimal representation of the integer repeated 16 times.
    When ``has_content`` is truthy the document also gets ``tags['id']``, ten
    random lowercase ASCII letters (UTF-8 encoded) as text, and a random
    embedding of length ``embed_dim + np.random.randint(0, jitter)``.
    """
    letters = string.ascii_lowercase
    for index in range(start, end):
        doc = Document()
        doc.id = str(index) * 16
        if has_content:
            doc.tags['id'] = index
            word = ''.join(random.choice(letters) for _ in range(10))
            doc.text = word.encode('utf8')
            length = embed_dim + np.random.randint(0, jitter)
            doc.embedding = np.random.random([length])
        yield doc
示例#7
0
def test_extract_docs():
    """``DocumentSet.all_embeddings`` is ``None`` without an embedding, else returns it."""
    doc = Document()

    # No embedding has been set yet, so nothing can be extracted.
    embeddings, _ = DocumentSet([doc]).all_embeddings
    assert embeddings is None

    matrix = np.random.random([2, 2])
    doc.embedding = matrix
    embeddings, _ = DocumentSet([doc]).all_embeddings
    np.testing.assert_equal(embeddings[0], matrix)
示例#8
0
def doc_with_multimodal_chunks_wrong(embeddings):
    """Build a plain ``Document`` whose chunks repeat the ``'visual'`` modality.

    Three chunks are created with modalities ``'visual'``, ``'visual'`` and
    ``'textual'`` and embeddings ``embeddings[0..2]``. ``update_id()`` is
    called on each chunk and then on the root before the chunks are appended.

    :param embeddings: indexable container providing at least three embeddings
    :return: the root document with the three chunks appended
    """
    root = Document()
    modalities = ('visual', 'visual', 'textual')
    parts = [Document() for _ in modalities]

    for part, modality in zip(parts, modalities):
        part.modality = modality
    for position, part in enumerate(parts):
        part.embedding = embeddings[position]

    # Ids are refreshed for the chunks first, then for the root document.
    for part in parts:
        part.update_id()
    root.update_id()

    for part in parts:
        root.chunks.append(part)
    return root
示例#9
0
 def __init__(self, *args, **kwargs):
     """Initialise the parent and fill ``self.db`` with four serialized documents.

     Documents are created for the digits 1 to 4. Each id is the digit
     repeated 16 times, and each embedding is a one-element array holding
     that id parsed as an integer. ``self.db`` maps ``id2hash(doc.id)`` to
     the document's serialized bytes.
     """
     super().__init__(*args, **kwargs)
     docs = []
     for digit in range(1, 5):
         doc = Document()
         doc.id = str(digit) * 16
         doc.embedding = np.array([int(doc.id)])
         docs.append(doc)
     self.db = {id2hash(doc.id): doc.SerializeToString() for doc in docs}
示例#10
0
def test_extract_docs():
    """``extract_embedding`` reports a bad id without an embedding, none once set."""
    doc = Document()

    # No embedding yet: the document is reported as bad and nothing is extracted.
    embeddings, _, rejected_ids = extract_embedding([doc])
    assert len(rejected_ids) > 0
    assert embeddings is None

    matrix = np.random.random([2, 2])
    doc.embedding = matrix
    embeddings, _, rejected_ids = extract_embedding([doc])
    assert len(rejected_ids) == 0
    np.testing.assert_equal(embeddings[0], matrix)
示例#11
0
def random_docs(start, end):
    """Return a list with one document per integer in ``range(start, end)``.

    Each document uses the integer as its id and as ``tags['id']``, carries
    ten random lowercase ASCII letters (UTF-8 encoded) as text, and has a
    random embedding of length ``10 + np.random.randint(0, 1)``.
    """

    def _build(index):
        doc = Document()
        doc.id = index
        doc.tags['id'] = index
        word = ''.join(
            random.choice(string.ascii_lowercase) for _ in range(10))
        doc.text = word.encode('utf8')
        length = 10 + np.random.randint(0, 1)
        doc.embedding = np.random.random([length])
        return doc

    return [_build(index) for index in range(start, end)]
示例#12
0
def create_document_to_search():
    """Build a query document with ten chunks carrying 1-D embeddings.

    For every ``n`` in ``0..9`` a chunk is appended whose id is the digit
    ``n`` repeated 16 times and whose embedding is ``np.array([n])``::

        doc - chunk '0000000000000000' - embedding [0]
            - chunk '1111111111111111' - embedding [1]
            - ...
            - chunk '9999999999999999' - embedding [9]
    """
    root = Document()
    for digit in range(10):
        part = Document()
        part.id = str(digit) * 16
        part.embedding = np.array([digit])
        root.chunks.append(part)
    return root
示例#13
0
 def documents(embedding_cls_type):
     """Build a document with ten chunks whose embedding type is selectable.

     Every chunk gets an id made of one digit repeated 16 times and a random
     vector of length 10, stored according to ``embedding_cls_type``:

     - ``'dense'``: the numpy array itself
     - ``'scipy_csr'`` / ``'scipy_coo'``: the matching scipy sparse matrix
     - ``'torch'``: a ``torch.sparse_coo_tensor`` built from the COO form
     - ``'tf'``: a ``tf.SparseTensor`` built from the COO form

     Any other value leaves the chunk without an embedding.
     """
     root = Document()
     for digit in range(10):
         part = Document()
         part.id = str(digit) * 16
         vector = np.random.random([10])
         if embedding_cls_type == 'dense':
             part.embedding = vector
         elif embedding_cls_type == 'scipy_csr':
             part.embedding = scipy.sparse.csr_matrix(vector)
         elif embedding_cls_type == 'scipy_coo':
             part.embedding = scipy.sparse.coo_matrix(vector)
         elif embedding_cls_type == 'torch':
             coo = scipy.sparse.coo_matrix(vector)
             # torch expects the indices as a 2 x nnz array: rows stacked over columns.
             part.embedding = torch.sparse_coo_tensor(
                 np.vstack((coo.row, coo.col)),
                 coo.data,
                 coo.shape,
             )
         elif embedding_cls_type == 'tf':
             coo = scipy.sparse.coo_matrix(vector)
             # tf takes one (row, col) pair per stored value.
             part.embedding = tf.SparseTensor(
                 indices=list(zip(coo.row, coo.col)),
                 values=coo.data,
                 dense_shape=[1, 10],
             )
         root.chunks.append(part)
     return root
def test_indexer_with_ref_indexer(random_workspace, parallel, index_docs, mocker):
    """Index with ``index.yml``, query with ``query.yml``, expect ``top_k`` matches.

    The search callback must fire exactly once and see a single document
    carrying ``top_k`` matches.
    """
    top_k = 10
    with Flow.load_config('index.yml') as index_flow:
        index_flow.index(input_fn=index_docs, batch_size=10)

    call_tracker = mocker.Mock()

    def validate_response(resp):
        call_tracker()
        found = resp.search.docs
        assert len(found) == 1
        assert len(found[0].matches) == top_k

    query = Document()
    query.embedding = np.array([1, 1])
    with Flow.load_config('query.yml') as query_flow:
        query_flow.search(
            input_fn=[query],
            on_done=validate_response,
            top_k=top_k,
        )

    call_tracker.assert_called_once()
示例#15
0
def test_empty_shard(mocker, workdir):
    """Search the flow from ``flow.yml`` and expect one doc with zero matches.

    The done-callback must be called exactly once with a response that
    passes ``validate_response``; the error callback must never be called.
    """
    query = Document()
    query.text = 'text'
    query.embedding = np.array([1, 1, 1])

    done_mock = mocker.Mock()
    error_mock = mocker.Mock()

    def validate_response(resp):
        found = resp.docs
        assert len(found) == 1
        assert len(found[0].matches) == 0

    flow_yaml = os.path.join(cur_dir, 'flow.yml')
    with Flow.load_config(flow_yaml) as flow:
        flow.search([query], on_done=done_mock, on_error=error_mock)

    done_mock.assert_called_once()
    validate_callback(done_mock, validate_response)

    error_mock.assert_not_called()
示例#16
0
def test_indexer_with_ref_indexer(random_workspace, parallel, index_docs,
                                  mocker, uses_no_docker):
    """Index with ``index.yml``, query with ``query.yml``, expect ``top_k`` matches.

    The done-callback is a mock. After the search it must have been called
    exactly once, and its recorded response is checked through
    ``validate_callback``.
    """
    top_k = 10
    index_yaml = os.path.join('index.yml')
    with Flow.load_config(index_yaml) as index_flow:
        index_flow.index(inputs=index_docs, request_size=10)

    done_mock = mocker.Mock()

    def validate_response(resp):
        found = resp.search.docs
        assert len(found) == 1
        assert len(found[0].matches) == top_k

    query = Document()
    query.embedding = np.array([1, 1])
    query_yaml = os.path.join('query.yml')
    with Flow.load_config(query_yaml) as query_flow:
        query_flow.search(inputs=[query], on_done=done_mock, top_k=top_k)

    done_mock.assert_called_once()
    validate_callback(done_mock, validate_response)
示例#17
0
def index_docs():
    """Yield 100 documents whose 2-D embedding is ``[i, i]`` for ``i`` in 0..99.

    Each document also carries a ``'filename'`` tag derived from ``i``.
    """
    for number in range(100):
        doc = Document()
        doc.embedding = np.array([number, number])
        doc.tags['filename'] = f' hey here {number}'
        yield doc
示例#18
0
def chunk_4(textual_embedding):
    """Return a ``'textual'`` chunk at granularity 1 with the given embedding."""
    textual_chunk = Document()
    textual_chunk.modality = 'textual'
    textual_chunk.embedding = textual_embedding
    textual_chunk.granularity = 1
    return textual_chunk
示例#19
0
def chunk_1(visual_embedding):
    """Return a ``'visual'`` chunk at granularity 0 with the given embedding."""
    visual_chunk = Document()
    visual_chunk.modality = 'visual'
    visual_chunk.embedding = visual_embedding
    visual_chunk.granularity = 0
    return visual_chunk