Example #1
def get_two_docarray():
    d1 = Document(embedding=np.array([0, 0, 0]))
    d1c1 = Document(embedding=np.array([0, 1, 0]))

    d2 = Document(embedding=np.array([1, 0, 0]))
    d2c1 = Document(embedding=np.array([1, 1, 0]))
    d2c2 = Document(embedding=np.array([1, 0, 1]))

    d3 = Document(embedding=np.array([2, 1, 1]))
    d3c1 = Document(embedding=np.array([2, 1, 0]))
    d3c2 = Document(embedding=np.array([2, 0, 1]))
    d3c3 = Document(embedding=np.array([2, 0, 0]))

    d4 = Document(embedding=np.array([3, 1, 1]))
    d4c1 = Document(embedding=np.array([3, 1, 0]))
    d4c2 = Document(embedding=np.array([3, 0, 1]))
    d4c3 = Document(embedding=np.array([3, 0, 0]))
    d4c4 = Document(embedding=np.array([3, 1, 1]))

    d1.chunks.extend([d1c1])
    d2.chunks.extend([d2c1, d2c2])
    d3.chunks.extend([d3c1, d3c2, d3c3])
    d4.chunks.extend([d4c1, d4c2, d4c3, d4c4])

    da1 = DocumentArray([d1, d2])
    da2 = DocumentArray([d3, d4])
    yield da1, da2
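
A minimal consumption sketch for the fixture above; calling the generator directly is an assumption (under pytest the fixture would be injected), and match/limit follow the public DocumentArray API:

# sketch: nearest-neighbour matching between the two arrays by embedding
da1, da2 = next(get_two_docarray())
da1.match(da2, metric='euclidean', limit=2)
for d in da1:
    print(d.id, [m.id for m in d.matches])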
Example #2
        async def _compute_response():
            response_msg = copy.deepcopy(request)
            exec_endpoint = request.header.exec_endpoint
            new_docs = DocumentArray()
            await asyncio.sleep(0.1)
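            # mocked indexer: store the request docs on '/index'; for any
            # other endpoint, reply with the ids of everything indexed so far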
            if deployment == 'indexer-executor':
                if exec_endpoint == '/index':
                    time.sleep(0.1)
                    self._docs.extend(request.docs)
                else:
                    docs = response_msg.docs
                    docs.clear()
                    docs.extend(
                        DocumentArray(
                            Document(tags={'ids': self._docs[:, 'id']})))
                    response_msg.data.docs = docs
                return response_msg
            else:
                if deployment == 'slow-executor':
                    await asyncio.sleep(SLOW_EXECUTOR_SLEEP_TIME)
                for doc in request.docs:
                    new_doc = Document(doc, copy=True)
                    new_doc.tags['executor'] = time.time()
                    print(
                        f'in {deployment}, {new_doc.id} => time: {readable_time_from(new_doc.tags["executor"])}, {new_doc.tags["executor"]}',
                        flush=True,
                    )
                    new_docs.append(new_doc)

                docs = response_msg.docs
                docs.clear()
                docs.extend(new_docs)
                response_msg.data.docs = docs
                return response_msg
Example #3
def test_scale_success(remote_flow_with_runtime: Flow, deployment_params):
    num_replicas, scale_to, shards = deployment_params
    with remote_flow_with_runtime as f:
        ret1 = Client(port=exposed_port).index(
            inputs=DocumentArray([Document() for _ in range(200)]),
            return_results=True,
            request_size=10,
        )
        f.scale(deployment_name='executor', replicas=scale_to)
        ret2 = Client(port=exposed_port).index(
            inputs=DocumentArray([Document() for _ in range(200)]),
            return_results=True,
            request_size=10,
        )

        assert len(ret1) == 20
        replicas = set()
        for r in ret1:
            assert len(r.docs) == 10
            # replicas are identified via their docker id
            for uid in r.docs[:, 'tags__uid']:
                replicas.add(uid)

        assert len(replicas) == num_replicas * shards

        assert len(ret2) == 20
        replicas = set()
        for r in ret2:
            assert len(r.docs) == 10
            for uid in r.docs[:, 'tags__uid']:
                replicas.add(uid)

        assert len(replicas) == scale_to * shards
Example #4
def test_scale_success(remote_flow_with_runtime: Flow, pod_params):
    num_replicas, scale_to, shards = pod_params
    with remote_flow_with_runtime as f:
        ret1 = f.index(
            inputs=DocumentArray([Document() for _ in range(200)]),
            return_results=True,
            request_size=10,
        )
        f.scale(pod_name='executor', replicas=scale_to)
        ret2 = f.index(
            inputs=DocumentArray([Document() for _ in range(200)]),
            return_results=True,
            request_size=10,
        )

        assert len(ret1) == 20
        replica_ids = set()
        for r in ret1:
            assert len(r.docs) == 10
            for replica_id in r.docs.get_attributes('tags__replica_id'):
                replica_ids.add(replica_id)

        assert replica_ids == set(range(num_replicas))

        assert len(ret2) == 20
        replica_ids = set()
        for r in ret2:
            assert len(r.docs) == 10
            for replica_id in r.docs.get_attributes('tags__replica_id'):
                replica_ids.add(replica_id)

        assert replica_ids == set(range(scale_to))
Example #5
def test_cache_legacy_field_type(tmp_path, test_metas):
    filename = os.path.join(tmp_path, 'DocCache.bin')
    doc1 = Document(id=1)
    doc1.text = 'blabla'
    doc1.update_content_hash()
    docs1 = DocumentArray([doc1])

    doc2 = Document(id=1)
    doc2.text = 'blabla2'
    doc2.update_content_hash()
    docs2 = DocumentArray([doc2])

    doc3 = Document(id=12312)
    doc3.text = 'blabla'
    doc3.update_content_hash()
    docs3 = DocumentArray([doc3])

    driver = MockBaseCacheDriver()

    with DocCache(filename, metas=test_metas,
                  field=CONTENT_HASH_KEY) as executor:
        driver.attach(executor=executor, runtime=None)
        assert executor.fields == [CONTENT_HASH_KEY]
        driver._apply_all(docs1)
        driver._apply_all(docs2)
        assert executor.size == 2

    with BaseExecutor.load(executor.save_abspath) as executor:
        driver.attach(executor=executor, runtime=None)
        assert executor.fields == [CONTENT_HASH_KEY]
        with pytest.raises(NotImplementedError):
            driver._apply_all(docs3)
Example #6
    def rank(self, docs_matrix: List['DocumentArray'], parameters: Dict,
             **kwargs) -> 'DocumentArray':
        """
        :param docs: the doc which gets bubbled up matches
        :param kwargs: not used (kept to maintain interface)
        """

        result_da = DocumentArray(
        )  # length: 1 as every time there is only one query
        for d_mod1, d_mod2 in zip(*docs_matrix):

            final_matches = {}  # type: Dict[str, Document]

            for m in d_mod1.matches:
                m.score.value *= d_mod1.weight
                final_matches[m.parent_id] = Document(m, copy=True)

            for m in d_mod2.matches:
                if m.parent_id in final_matches:
                    final_matches[m.parent_id].score.value += (m.score.value *
                                                               d_mod2.weight)
                else:
                    m.score.value *= d_mod2.weight
                    final_matches[m.parent_id] = Document(m, copy=True)

            da = DocumentArray(list(final_matches.values()))
            da.sort(key=lambda ma: ma.score.value, reverse=True)
            d = Document(matches=da[:int(parameters['top_k'])])
            result_da.append(d)
        return result_da
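
A hypothetical invocation of the ranker above (the instance and array names are assumptions, not part of the original):

# hypothetical call: one DocumentArray per modality for the same query,
# each carrying weighted chunk-level matches
merged = ranker.rank(
    docs_matrix=[da_text, da_image],
    parameters={'top_k': 10},
)
assert len(merged) == 1  # one result document per query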
Example #7
def test_override_requests_uses_after():
    class FooExecutor(Executor):
        @requests(on='/bar')
        def foo(self, docs, **kwargs):
            for doc in docs:
                doc.text = 'foo called'

    class OtherExecutor(Executor):
        @requests(on='/bar')
        def bar(self, docs, **kwargs):
            for doc in docs:
                doc.text = 'bar called'

    with Flow(port_expose=exposed_port).add(
            uses=FooExecutor,
            uses_requests={'/foo': 'foo'},
            uses_after=OtherExecutor,
            uses_before=OtherExecutor,
    ) as f:
        c = Client(port=exposed_port)
        resp1 = c.post(on='/foo',
                       inputs=DocumentArray([Document(text='')]),
                       return_results=True)
        resp2 = c.post(
            on='/non_foo',
            inputs=DocumentArray([Document(text='')]),
            return_results=True,
        )
        resp3 = c.post(on='/bar',
                       inputs=DocumentArray([Document(text='')]),
                       return_results=True)

    assert resp1[0].docs[0].text == 'foo called'
    assert resp2[0].docs[0].text == ''
    assert resp3[0].docs[0].text == 'bar called'
Example #8
def test_docuset_traverse_over_iterator_CAVEAT():
    # CAVEAT of the hacky usage: an iterator cannot be traversed twice
    ds = DocumentArray(random_docs(num_docs, num_chunks_per_doc)).traverse(['r', 'c'])
    # note that random_docs is a generator and can only be consumed once;
    # whichever traversal path comes first gets the documents, after which
    # the generator is empty
    assert len(list(ds)) == 1 + num_docs

    ds = DocumentArray(random_docs(num_docs, num_chunks_per_doc)).traverse(['c', 'r'])
    assert len(list(ds)) == num_docs + 1
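
A sketch of the safe pattern the comments above point to: materialise the generator once, then any traversal order sees every document (num_docs and num_chunks_per_doc are the same assumed globals):

docs = list(random_docs(num_docs, num_chunks_per_doc))
ds = DocumentArray(docs).traverse(['r', 'c'])
# one DocumentArray for the root level plus one per document's chunks
assert len(list(ds)) == 1 + num_docs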
Example #9
def test_docuset_traverse_over_iterator_HACKY():
    # HACKY USAGE, NOT RECOMMENDED: one can also traverse over a "runtime" DocumentArray
    ds = DocumentArray(random_docs(num_docs, num_chunks_per_doc)).traverse(['r'])
    assert len(list(list(ds)[0])) == num_docs

    ds = DocumentArray(random_docs(num_docs, num_chunks_per_doc)).traverse(['c'])
    ds = list(ds)
    assert len(ds) == num_docs
    assert len(ds[0]) == num_chunks_per_doc
Example #10
def test_extract_docs():
    d = Document()

    contents, docs_pts = DocumentArray([d]).all_embeddings
    assert contents is None

    vec = np.random.random([2, 2])
    d.embedding = vec
    contents, docs_pts = DocumentArray([d]).all_embeddings
    np.testing.assert_equal(contents[0], vec)
Example #11
def test_documentarray_filter():
    da = DocumentArray([Document() for _ in range(6)])

    for j in range(6):
        da[j].scores['score'].value = j

    da = [d for d in da if d.scores['score'].value > 2]
    assert len(DocumentArray(da)) == 3

    for d in da:
        assert d.scores['score'].value > 2
Example #12
def test_doc_iter_method(filter_fn):
    ds = list(random_docs(10))

    for d in DocumentArray(ds):
        assert d.text == 'hello world'

    for d in DocumentArray(ds).traverse_flat('c,r', filter_fn=filter_fn):
        d.text = 'modified'

    for d in DocumentArray(ds):
        assert d.text == 'modified'
Example #13
def test_doc_iter_method():
    ds = list(random_docs(10))

    for d in DocumentArray(ds):
        assert d.text == 'hello world'

    for d in DocumentArray(ds).traverse_flat(['c', 'r']):
        d.text = 'modified'

    for d in DocumentArray(ds):
        assert d.text == 'modified'
Example #14
def get_pair_document_array():
    da1 = DocumentArray([
        Document(id='1', embedding=np.array([1, 2])),
        Document(id='2', embedding=np.array([3, 4])),
    ])
    da2 = DocumentArray([
        Document(id='1', embedding=np.array([1, 2])),
        Document(id='2', embedding=np.array([3, 4])),
        Document(id='3', embedding=np.array([4, 5])),
    ])
    yield da1, da2
Example #15
def test_documentarray_filter():
    da = DocumentArray([Document() for _ in range(6)])

    for j in range(6):
        da[j].score.value = j

    # `filter` returns a one-shot iterator, so materialise it into a list
    # before it is consumed more than once below
    da = list(filter(lambda d: d.score.value > 2, da))

    assert len(DocumentArray(da)) == 3

    for d in da:
        assert d.score.value > 2
Example #16
def test_match_exclude_self(exclude_self, num_matches, only_id):
    da1 = DocumentArray([
        Document(id='1', embedding=np.array([1, 2])),
        Document(id='2', embedding=np.array([3, 4])),
    ])
    da2 = DocumentArray([
        Document(id='1', embedding=np.array([1, 2])),
        Document(id='2', embedding=np.array([3, 4])),
    ])
    da1.match(da2, exclude_self=exclude_self, only_id=only_id)
    for d in da1:
        assert len(d.matches) == num_matches
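
The parametrisation driving this test sits outside the snippet; a plausible version (values are assumptions) follows from the two arrays sharing ids, so excluding self leaves exactly one candidate:

# hypothetical decorator for the test above
@pytest.mark.parametrize(
    'exclude_self, num_matches, only_id',
    [(True, 1, True), (True, 1, False), (False, 2, True), (False, 2, False)],
)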
Example #17
def doc_lists_to_doc_arrays(doc_lists, tmpdir, first_memmap, second_memmap,
                            buffer_pool_size):
    doc_list1, doc_list2 = doc_lists

    tmpdir1, tmpdir2 = tmpdir / '1', tmpdir / '2'

    D1 = (DocumentArray() if not first_memmap else DocumentArrayMemmap(
        tmpdir1, buffer_pool_size=buffer_pool_size))
    D1.extend(doc_list1)
    D2 = (DocumentArray() if not second_memmap else DocumentArrayMemmap(
        tmpdir2, buffer_pool_size=buffer_pool_size))
    D2.extend(doc_list2)
    return D1, D2
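
A usage sketch for the helper above (the document lists and pytest's tmpdir fixture are assumptions):

# sketch: one in-memory array and one memmapped array from two plain lists
D1, D2 = doc_lists_to_doc_arrays(
    ([Document() for _ in range(3)], [Document() for _ in range(3)]),
    tmpdir, first_memmap=False, second_memmap=True, buffer_pool_size=1000)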
Example #18
async def test_scale_remote_flow_async(docker_image_built, async_jinad_client,
                                       deployment_params):
    replicas, scale_to, shards = deployment_params
    workspace_id = await async_jinad_client.workspaces.create(
        paths=[cur_dir])
    assert workspace_id
    flow_id = await async_jinad_client.flows.create(
        workspace_id=workspace_id,
        filename='flow-scalable.yml',
        envs={
            'num_shards': str(shards),
            'num_replicas': str(replicas)
        },
    )
    assert flow_id

    ret1 = Client(host=HOST, port=FLOW_PORT, protocol='http',
                  asyncio=True).index(
                      inputs=DocumentArray([Document() for _ in range(1000)]),
                      return_results=True,
                      request_size=10,
                  )

    # identify replicas by the process they run in
    process_ids = set()
    async for r in ret1:
        for p_id in r.docs[:, 'tags__process_id']:
            process_ids.add(p_id)

    assert len(process_ids) == replicas * shards

    await async_jinad_client.flows.scale(id=flow_id,
                                         deployment_name=SCALE_EXECUTOR,
                                         replicas=scale_to)

    ret2 = Client(host=HOST, port=FLOW_PORT, protocol='http',
                  asyncio=True).index(
                      inputs=DocumentArray([Document() for _ in range(1000)]),
                      return_results=True,
                      request_size=10,
                  )

    process_ids = set()
    async for r in ret2:
        for p_id in r.docs[:, 'tags__process_id']:
            process_ids.add(p_id)

    assert len(process_ids) == scale_to * shards
    assert await async_jinad_client.flows.delete(flow_id)
    assert await async_jinad_client.workspaces.delete(workspace_id)
Example #19
def docarrays_for_embedding_distance_computation_sparse():
    d1 = Document(embedding=sp.csr_matrix([0, 0, 0]))
    d2 = Document(embedding=sp.csr_matrix([3, 0, 0]))
    d3 = Document(embedding=sp.csr_matrix([1, 0, 0]))
    d4 = Document(embedding=sp.csr_matrix([2, 0, 0]))

    d1_m = Document(embedding=sp.csr_matrix([1, 0, 0]))
    d2_m = Document(embedding=sp.csr_matrix([2, 0, 0]))
    d3_m = Document(embedding=sp.csr_matrix([0, 0, 1]))
    d4_m = Document(embedding=sp.csr_matrix([0, 0, 2]))
    d5_m = Document(embedding=sp.csr_matrix([0, 0, 3]))

    D1 = DocumentArray([d1, d2, d3, d4])
    D2 = DocumentArray([d1_m, d2_m, d3_m, d4_m, d5_m])
    return D1, D2
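
A sketch consumer for the fixture above, assuming the installed version accepts scipy sparse embeddings in match (which the fixture name suggests):

D1, D2 = docarrays_for_embedding_distance_computation_sparse()
D1.match(D2, metric='euclidean', limit=3)
for d in D1:
    assert len(d.matches) == 3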
Example #20
    def __init__(self, index_file_name: str, **kwargs):
        super().__init__(**kwargs)
        self.index_file_name = index_file_name
        # restore previously indexed documents if a saved copy exists
        if os.path.exists(self.save_path):
            self._docs = DocumentArray.load(self.save_path)
        else:
            self._docs = DocumentArray()
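
The assumed counterpart to the loading logic above (not shown in the original) would persist the documents when the executor shuts down:

    def close(self):
        # mirror of __init__: save the indexed docs for the next start-up
        self._docs.save(self.save_path)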
Example #21
    async def _test():
        responses = []
        req = request_generator(
            '/', DocumentArray([Document(text='client0-Request')]))
        async for resp in runtime.streamer.Call(request_iterator=req):
            responses.append(resp)
        return responses
Example #22
    def send_requests_once(
        self,
        requests: List[Request],
        deployment: str,
        head: bool,
        endpoint: str = None,
        timeout: float = 1.0,
    ) -> asyncio.Task:
        assert head
        self.deployments_called.append(deployment)
        response_msg = copy.deepcopy(requests[0])
        new_docs = DocumentArray()
        for doc in requests[0].docs:
            clientid = doc.text[0:7]
            self.sent_msg[clientid][deployment] = doc.text
            new_doc = Document(
                text=doc.text + f'-{clientid}-{deployment}', tags=doc.tags
            )
            new_docs.append(new_doc)
            self.responded_messages[clientid][deployment] = new_doc.text

        response_msg.data.docs = new_docs

        async def task_wrapper():
            import random

            await asyncio.sleep(1 / (random.randint(1, 3) * 10))
            return response_msg, {}

        return asyncio.create_task(task_wrapper())
Example #23
def _create_test_data_message():
    req = list(
        request_generator(
            '/',
            DocumentArray([Document(text='input document')
                           for _ in range(10)])))[0]
    return req
Example #24
def test_document_save_load(method, tmp_path):
    da1 = DocumentArray(random_docs(1000))
    da2 = DocumentArray()
    for doc in random_docs(10):
        da2.append(doc)
    for da in [da1, da2]:
        tmp_file = os.path.join(tmp_path, 'test')
        with TimeContext(f'w/{method}'):
            da.save(tmp_file, file_format=method)
        with TimeContext(f'r/{method}'):
            da_r = DocumentArray.load(tmp_file, file_format=method)
        assert len(da) == len(da_r)
        for d, d_r in zip(da, da_r):
            assert d.id == d_r.id
            np.testing.assert_equal(d.embedding, d_r.embedding)
            assert d.content == d_r.content
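
The method argument arrives via parameterisation outside the snippet; plausible values (an assumption, matching the public file_format options of DocumentArray.save/load) are:

@pytest.mark.parametrize('method', ['json', 'binary'])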
Example #25
def empty_documents():
    docs = []
    for idx in range(100, 120):
        with Document() as d:
            d.id = f'{idx:0>16}'
            docs.append(d)
    return DocumentArray(docs)
Example #26
def deleted_documents():
    docs = []
    for idx in range(3):
        with Document() as d:
            d.id = f'{idx:0>16}'
            docs.append(d)
    return DocumentArray(docs)
Example #27
def index(num_docs: int):
    num_docs = min(
        num_docs,
        len(glob(os.path.join(os.getcwd(), IMAGE_SRC), recursive=True)))

    flow = Flow(workspace="workspace")\
        .add(uses={"jtype": "ImageCrafter",
                   "with": {"target_size": 96,
                            "img_mean": [0.485, 0.456, 0.406],
                            "img_std": [0.229, 0.224, 0.225]}}) \
        .add(uses=BigTransferEncoder) \
        .add(uses={"jtype": "EmbeddingIndexer",
                   "with": {"index_file_name": "image.json"},
                   "metas": {"name": "vec_idx"}},
             name="vec_idx") \
        .add(uses={"jtype": "KeyValueIndexer",
                   "metas": {"name": "kv_idx"}},
             name="kv_idx",
             needs="gateway") \
        .add(name="join_all",
             needs=["kv_idx", "vec_idx"],
             read_only="true")

    with flow:
        document_generator = from_files(IMAGE_SRC, size=num_docs)
        flow.post(on='/index',
                  inputs=DocumentArray(document_generator),
                  request_size=64,
                  read_mode='rb')
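
Note the topology: kv_idx branches directly off the gateway (needs="gateway"), so the key-value and vector index branches run in parallel until join_all merges their results.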
Example #28
def test_override_params(mocker):
    f = (
        Flow(port_expose=exposed_port)
        .add(uses={'jtype': 'DummyOverrideParams', 'metas': {'name': 'exec_name'}})
        .add(uses=DummyAssertNotOverrideBetweenPodsParams)
        .add(uses=DummyAssertIfParamsCanBeChangedInsidePods)
    )

    error_mock = mocker.Mock()

    with f:
        resp = Client(port=exposed_port).index(
            inputs=DocumentArray([Document()]),
            parameters={
                'param1': 50,
                'param2': 60,
                'exec_name': {
                    'param1': 'changed'
                }
            },
            on_error=error_mock,
            return_results=True,
        )
    error_mock.assert_not_called()

    assert len(resp) == 1
    assert len(resp[0].docs) == 1
    for doc in resp[0].docs:
        assert doc.tags == OVERRIDEN_EXECUTOR1_PARAMS
        assert doc.tags['param1'] == 'changed'
        assert doc.tags['param2'] == 60
        assert doc.tags['exec_name']['param1'] == 'changed'
Example #29
def documents(embedding_cls_type, text_prefix='', num_docs=5):
    docs = []
    for idx in range(num_docs):
        with Document(text=f'{text_prefix}{idx}') as d:
            d.id = f'{idx:0>16}'
            dense_embedding = np.random.random([10])
            if embedding_cls_type == 'dense':
                d.embedding = dense_embedding
            elif embedding_cls_type == 'scipy_csr':
                d.embedding = scipy.sparse.csr_matrix(dense_embedding)
            elif embedding_cls_type == 'scipy_coo':
                d.embedding = scipy.sparse.coo_matrix(dense_embedding)
            elif embedding_cls_type == 'torch':
                sparse_embedding = scipy.sparse.coo_matrix(dense_embedding)
                values = sparse_embedding.data
                indices = np.vstack(
                    (sparse_embedding.row, sparse_embedding.col))
                d.embedding = torch.sparse_coo_tensor(
                    indices,
                    values,
                    sparse_embedding.shape,
                )
            elif embedding_cls_type == 'tf':
                sparse_embedding = scipy.sparse.coo_matrix(dense_embedding)
                values = sparse_embedding.data
                indices = [(x, y) for x, y in zip(sparse_embedding.row,
                                                  sparse_embedding.col)]
                d.embedding = tf.SparseTensor(
                    indices=indices,
                    values=values,
                    dense_shape=[1, 10],
                )
        docs.append(d)
    return DocumentArray(docs)
Example #30
def test_da_with_different_inputs():
    docs = [Document() for _ in range(10)]
    da = DocumentArray(
        [docs[i] if (i % 2 == 0) else docs[i].proto for i in range(len(docs))])
    assert len(da) == 10
    for d in da:
        assert isinstance(d, Document)