示例#1
0
def test_pdf_flow_mix_buffer():
    """Feed raw PDF bytes (no path) through the PDF segmenter flow and validate."""
    pdf_path = os.path.join(cur_dir, 'cats_are_awesome.pdf')
    with open(pdf_path, 'rb') as fp:
        raw_bytes = fp.read()
    flow = Flow().add(uses='PDFExtractorSegmenter', array_in_pb=True)
    with flow:
        flow.search(input_fn=search_generator(path=None, buffer=raw_bytes), on_done=validate_mix_fn)
示例#2
0
    def test_shards_insufficient_data(self):
        """THIS IS SUPER IMPORTANT FOR TESTING SHARDS

        IF THIS FAILED, DONT IGNORE IT, DEBUG IT
        """
        num_indexed = 3
        num_replicas = 4

        def check_result(req):
            # a single query doc must come back carrying every indexed doc as a match
            self.assertEqual(len(req.docs), 1)
            self.assertEqual(len(req.docs[0].matches), num_indexed)

            for match in req.docs[0].matches:
                self.assertTrue(hasattr(match.match, 'weight'))
                self.assertIsNotNone(match.match.weight)
                self.assertEqual(match.match.meta_info, b'hello world')

        yaml_path = os.path.join(cur_dir, '../yaml/test-docpb.yml')
        f = Flow().add(name='doc_pb', yaml_path=yaml_path, replicas=num_replicas,
                       separated_workspace=True)
        with f:
            f.index(input_fn=random_docs(num_indexed), random_doc_id=False)

        time.sleep(2)
        # re-open the flow once without traffic to make sure it restarts cleanly
        with f:
            pass
        time.sleep(2)
        f = Flow().add(name='doc_pb', yaml_path=yaml_path, replicas=num_replicas,
                       separated_workspace=True, polling='all', reducing_yaml_path='_merge_all')
        with f:
            f.search(input_fn=random_queries(1, num_indexed), random_doc_id=False,
                     output_fn=check_result, callback_on_body=True)
        time.sleep(2)
        self.add_tmpfile('test-docshard-tmp')
示例#3
0
def test_normal(docs):
    """Every replica and, within each replica, every shard must serve traffic."""
    n_replicas = 3
    n_shards = 2
    routing = collections.OrderedDict()

    def record_routing(resp):
        # remember which (replica, shard) pair served each returned doc
        for doc in resp.search.docs:
            routing[int(doc.id)] = (doc.tags['replica'], doc.tags['shard'])

    flow = Flow().add(
        name='pod1',
        uses='!DummyMarkExecutor',
        replicas=n_replicas,
        parallel=n_shards,
    )
    with flow:
        flow.search(inputs=docs, request_size=1, on_done=record_routing)

    # every input doc produced exactly one routing record
    assert len(routing.keys()) == len(docs)

    # all replicas were hit at least once
    assert len({pair[0] for pair in routing.values()}) == n_replicas

    per_replica = collections.defaultdict(list)
    for replica, shard in routing.values():
        per_replica[replica].append(shard)

    assert len(per_replica.keys()) == n_replicas

    # within each replica, every shard was hit
    for shard_hits in per_replica.values():
        assert len(set(shard_hits)) == n_shards
示例#4
0
    def test_chunk_joint_idx(self):
        """Index via the joint YAML spec, then query it plain and wrapped."""

        def make_docs(num_docs, chunks_per_doc=5, embed_dim=10):
            # documents whose chunks carry random embeddings and globally unique chunk ids
            chunk_id = 0
            for doc_id in range(num_docs):
                doc = jina_pb2.Document()
                for _ in range(chunks_per_doc):
                    chunk = doc.chunks.add()
                    chunk.embedding.CopyFrom(array2pb(np.random.random([embed_dim])))
                    chunk.chunk_id = chunk_id
                    chunk.doc_id = doc_id
                    chunk_id += 1
                yield doc

        def check(req, indexer_name):
            # request succeeded and the top result was scored by the expected indexer
            self.assertTrue(req.status.code < jina_pb2.Status.ERROR)
            self.assertEqual(req.search.docs[0].chunks[0].topk_results[0].score.op_name, indexer_name)

        f = Flow().add(yaml_path=os.path.join(cur_dir, 'yaml/test-joint.yml'))
        with f:
            f.index(make_docs(100))

        g = Flow().add(yaml_path=os.path.join(cur_dir, 'yaml/test-joint.yml'))
        with g:
            g.search(make_docs(10), output_fn=lambda x: check(x, 'NumpyIndexer'))

        g = Flow(timeout_ready=-1).add(yaml_path=os.path.join(cur_dir, 'yaml/test-joint-wrap.yml'))
        with g:
            g.search(make_docs(10), output_fn=lambda x: check(x, 'AnnoyIndexer'))
示例#5
0
def test_high_order_matches():
    """Index then search through the adjacency YAML flow and validate the result."""
    flow = Flow(callback_on_body=True).add(uses=os.path.join(cur_dir, 'test-adjacency.yml'))

    with flow:
        flow.index(random_docs(100))

    with flow:
        flow.search(random_docs(1), output_fn=validate)
示例#6
0
def test_high_order_matches_integrated():
    """Same contract as the adjacency test, but using the simplified YAML spec."""
    flow = Flow(callback_on_body=True).add(uses=os.path.join(cur_dir, 'test-adjacency-integrated.yml'))

    with flow:
        flow.index(random_docs(100))

    with flow:
        flow.search(random_docs(1), output_fn=validate)
示例#7
0
def query(top_k):
    """Interactive query loop: read sentences from stdin and print the top-k
    semantically similar results until an empty line is entered.

    :param top_k: number of results to retrieve per query
    """
    f = Flow().load_config(QUERY_FLOW_YAML)
    with f:
        while True:
            text = input('\n\n\n- Please type a sentence to find semantic similarity in multiple languages : ')
            if not text:
                break

            # PEP 8 (E731): use a def, not a named lambda; bind the current
            # text as a default so the callback cannot late-bind a later value
            def print_results(resp, query_text=text):
                print_topk(resp, query_text)

            f.search(read_query_data(text), callback=print_results, topk=top_k)
示例#8
0
def test_high_order_matches():
    """Index/search through the adjacency flow, then clean up the index files."""
    flow = Flow(callback_on_body=True).add(uses=os.path.join(cur_dir, 'test-adjacency.yml'))

    with flow:
        flow.index(random_docs(100))

    with flow:
        flow.search(random_docs(1), output_fn=validate)

    shutil.rmtree('test-index-file', ignore_errors=False, onerror=None)
示例#9
0
def test_simple_run(docs):
    """A rolling update between two searches must complete without hanging."""
    f = Flow().add(
        name='pod1',
        replicas=2,
        parallel=3,
    )
    with f:
        f.search(docs)
        # the update itself is the assertion: it must return, not deadlock
        f.rolling_update('pod1', None)
        f.search(docs)
示例#10
0
def test_index():
    """Index with 3 parallel peas into separated workspaces, then search."""
    flow = Flow().add(uses=os.path.join(cur_dir, 'yaml/test-index.yml'), parallel=3, separated_workspace=True)
    with flow:
        flow.index(input_fn=random_docs(1000))

    # each pea must have written its own workspace files
    for pea_id in range(1, 4):
        assert os.path.exists(f'test2-{pea_id}/test2.bin')
        assert os.path.exists(f'test2-{pea_id}/tmp2')

    with flow:
        flow.search(input_fn=random_docs(2), output_fn=get_result, top_k=50)
示例#11
0
def test_binarypb_in_flow():
    """Docs stored via binarypb must come back with identical embedding bytes."""
    docs = list(random_docs(10))

    def check_embeddings(req):
        # searching with the same docs must return the stored embeddings unchanged
        for returned, original in zip(req.docs, docs):
            assert returned.embedding.buffer == original.embedding.buffer

    f = Flow(callback_on_body=True).add(uses='binarypb.yml')

    with f:
        f.index(docs, override_doc_id=False)

    with f:
        f.search(docs, output_fn=check_embeddings, override_doc_id=False)
示例#12
0
    def test_index(self):
        """Index into 3 separated workspaces, verify each pea's files, then search."""
        f = Flow().add(uses=os.path.join(cur_dir, 'yaml/test-index.yml'), parallel=3, separated_workspace=True)
        with f:
            f.index(input_fn=random_docs(1000))

        for pea_id in range(1, 4):
            workspace = f'test2-{pea_id}'
            self.assertTrue(os.path.exists(f'{workspace}/test2.bin'))
            self.assertTrue(os.path.exists(f'{workspace}/tmp2'))
            # register for cleanup after the test
            self.add_tmpfile(f'{workspace}/test2.bin', f'{workspace}/tmp2', workspace)

        time.sleep(3)
        with f:
            f.search(input_fn=random_docs(2), output_fn=get_result)
示例#13
0
def test_thread_run(docs):
    """Searching must keep working while a rolling update runs in a background thread."""
    f = Flow().add(
        name='pod1',
        replicas=2,
        parallel=2,
        timeout_ready=30000,
    )
    with f:
        updater = threading.Thread(target=f.rolling_update, args=('pod1',))
        for round_no in range(50):
            f.search(docs)
            # kick off the update after a few warm-up searches
            if round_no == 5:
                updater.start()
        updater.join()
示例#14
0
def test_index_depth_0_search_depth_1():
    """Index whole docs (granularity 0) and search at chunk level (granularity 1)."""
    os.environ['CUR_DIR_GRANULARITY'] = cur_dir
    os.environ['TEST_WORKDIR'] = os.getcwd()
    index_data = [
        'I am chunk 0 of doc 1, I am chunk 1 of doc 1, I am chunk 2 of doc 1',
        'I am chunk 0 of doc 2, I am chunk 1 of doc 2',
        'I am chunk 0 of doc 3, I am chunk 1 of doc 3, I am chunk 2 of doc 3, I am chunk 3 of doc 3',
    ]
    # search_data[i] is a chunk taken verbatim from index_data[i]
    search_data = [
        'I am chunk 1 of doc 1,',
        'I am chunk 0 of doc 2,',
        'I am chunk 3 of doc 3',
    ]

    def validate_granularity_1(resp):
        assert len(resp.docs) == 3
        for doc, query_text, match_text in zip(resp.docs, search_data, index_data):
            assert doc.granularity == 1
            assert len(doc.matches) == 1
            assert doc.matches[0].id == doc.id  # done on purpose
            assert doc.matches[0].granularity == 0
            assert doc.text == query_text
            # each chunk query matches the full parent document it came from
            assert doc.matches[0].text == match_text

    index_flow = Flow().load_config('flow-index.yml')
    with index_flow:
        index_flow.index(index_data)

    search_flow = Flow().load_config('flow-query.yml')
    with search_flow:
        search_flow.search(input_fn=search_data,
                           output_fn=validate_granularity_1,
                           callback_on_body=True,
                           granularity=1)

    rm_files([os.path.join(os.getenv('TEST_WORKDIR'), 'test_workspace')])
    del os.environ['CUR_DIR_GRANULARITY']
    del os.environ['TEST_WORKDIR']
示例#15
0
def test_binarypb_in_flow(test_metas):
    """Embeddings survive a binarypb round-trip even when queries carry none."""
    docs = list(random_docs(10))
    f = Flow(callback_on='body').add(uses='binarypb.yml')

    with f:
        f.index(docs, override_doc_id=False)

    def check_embeddings(req):
        assert len(req.docs) == len(docs)
        for returned, original in zip(req.docs, docs):
            np.testing.assert_almost_equal(
                NdArray(returned.embedding).value,
                NdArray(original.embedding).value)

    # query with the embeddings stripped; the index must fill them back in
    queries = copy.deepcopy(docs)
    for query_doc in queries:
        query_doc.ClearField('embedding')
    with f:
        f.search(queries, output_fn=check_embeddings, override_doc_id=False)
示例#16
0
def test_shards_insufficient_data(mocker, restful, docpb_workspace):
    """THIS IS SUPER IMPORTANT FOR TESTING SHARDS

    IF THIS FAILED, DONT IGNORE IT, DEBUG IT
    """
    num_indexed = 3
    num_shards = 4

    mock = mocker.Mock()

    def check_result(req):
        # the single query doc gets all indexed docs back as weighted matches
        assert len(req.docs) == 1
        assert len(req.docs[0].matches) == num_indexed

        for match in req.docs[0].matches:
            assert hasattr(match, 'weight')
            assert match.weight

    yaml_path = os.path.join(cur_dir, '../yaml/test-docpb.yml')
    f = Flow(restful=restful).add(
        name='doc_pb',
        uses=yaml_path,
        parallel=num_shards,
    )
    with f:
        f.index(inputs=random_docs(num_indexed))

    time.sleep(2)
    # restart the flow once without traffic to confirm the workspace reloads
    with f:
        pass
    time.sleep(2)
    f = Flow(restful=restful).add(
        name='doc_pb',
        uses=yaml_path,
        parallel=num_shards,
        polling='all',
        uses_after='_merge_chunks',
    )
    with f:
        f.search(inputs=random_queries(1, num_indexed), on_done=mock)
    time.sleep(2)
    mock.assert_called_once()
    validate_callback(mock, check_result)