# --- Example 1 ---
class KeyValueIndexer(Executor):
    """Key-value indexer that persists its DocumentArray in the workspace."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Resume from a previous dump when one exists, otherwise start empty.
        self._docs = (
            DocumentArray.load(self.save_path)
            if os.path.exists(self.save_path)
            else DocumentArray()
        )

    @property
    def save_path(self):
        """Path of the dump file; ensures the workspace directory exists."""
        if not os.path.exists(self.workspace):
            os.makedirs(self.workspace)
        return os.path.join(self.workspace, 'kv.json')

    def close(self):
        """Persist all indexed docs on shutdown."""
        self._docs.save(self.save_path)

    @requests(on='/index')
    def index(self, docs: DocumentArray, **kwargs):
        """Append the incoming docs to the in-memory store."""
        self._docs.extend(docs)

    @requests(on='/search')
    def query(self, docs: DocumentArray, **kwargs):
        """Enrich every match with the stored doc addressed by its parent_id."""
        for query_doc in docs:
            for m in query_doc.matches:
                m.MergeFrom(self._docs[m.parent_id])
# --- Example 2 ---
class EmbeddingIndexer(Executor):
    """Stores (id, embedding) copies of docs and serves cosine top-k search."""

    def __init__(self, index_file_name: str, **kwargs):
        super().__init__(**kwargs)
        self.index_file_name = index_file_name
        # Resume from a previous dump when one exists, otherwise start empty.
        self._docs = (
            DocumentArray.load(self.save_path)
            if os.path.exists(self.save_path)
            else DocumentArray()
        )

    @property
    def save_path(self):
        """Dump file inside the workspace; creates the directory on demand."""
        if not os.path.exists(self.workspace):
            os.makedirs(self.workspace)
        return os.path.join(self.workspace, self.index_file_name)

    def close(self):
        """Persist the stored embeddings on shutdown."""
        self._docs.save(self.save_path)

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs) -> DocumentArray:
        """Keep a lightweight (id, embedding) copy of every incoming doc."""
        slim_copies = [Document(id=d.id, embedding=d.embedding) for d in docs]
        self._docs.extend(slim_copies)
        return docs

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs) \
            -> DocumentArray:
        """Fill each query doc's matches with its top-k cosine neighbours."""
        query_emb = _ext_A(_norm(np.stack(docs.get_attributes('embedding'))))
        stored_emb = _ext_B(_norm(np.stack(self._docs.get_attributes('embedding'))))
        dists = _cosine(query_emb, stored_emb)
        top_k = int(parameters.get('top_k', 5))
        assert top_k > 0
        indices, distances = self._get_sorted_top_k(dists, top_k)
        for query, row_idx, row_dist in zip(docs, indices, distances):
            for pos, dval in zip(row_idx, row_dist):
                match = Document(self._docs[int(pos)], copy=True)
                match.score.value = 1 - dval
                match.parent_id = int(pos)
                query.matches.append(match)
        return docs

    @staticmethod
    def _get_sorted_top_k(dist: 'np.array',
                          top_k: int) -> Tuple['np.ndarray', 'np.ndarray']:
        """Per row, return (indices, values) of the top_k smallest distances,
        sorted ascending."""
        if top_k >= dist.shape[1]:
            # Fewer candidates than top_k: full sort of every row.
            order = dist.argsort(axis=1)[:, :top_k]
            return order, np.take_along_axis(dist, order, axis=1)
        # Partial selection first, then sort only the k winners per row.
        part = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
        part_vals = np.take_along_axis(dist, part, axis=1)
        inner = part_vals.argsort(axis=1)
        order = np.take_along_axis(part, inner, axis=1)
        return order, np.take_along_axis(part_vals, inner, axis=1)
# --- Example 3 ---
def da_and_dam():
    """Build a DocumentArray and a DocumentArrayMemmap, each filled with an
    independent batch of 100 random docs, and return them as a pair."""
    da = DocumentArray()
    da.extend(random_docs(100))
    dam = DocumentArrayMemmap()
    dam.extend(random_docs(100))
    return da, dam
# --- Example 4 ---
def on_done(response, final_da: DocumentArray):
    """Stamp every response doc with an 'on_done' timestamp, log it, and
    collect the docs into final_da."""
    received = response.docs
    for doc in received:
        doc.tags['on_done'] = time.time()
        print(
            f'in on_done {doc.id}, time: {readable_time_from(doc.tags["on_done"])}',
            flush=True,
        )
    final_da.extend(received)
# --- Example 5 ---
def on_done(response, final_da: DocumentArray):
    """Log the response id, timestamp every doc, and collect into final_da."""
    print(f' receiving response {response._pb_body.request_id}')
    incoming = response.docs
    for doc in incoming:
        doc.tags['on_done'] = time.time()
        print(
            f'in on_done {doc.id}, time: {readable_time_from(doc.tags["on_done"])}, {doc.tags["on_done"]}',
            flush=True,
        )
    final_da.extend(incoming)
# --- Example 6 ---
class DummyMockConnectionPool:
    """Test double for a connection pool: answers requests locally with
    canned per-deployment behaviour instead of forwarding over the network."""

    def send_requests_once(
        self,
        requests,
        deployment: str,
        head: bool,
        endpoint: str = None,
        timeout: float = 1.0,
        retries: int = -1,
    ) -> asyncio.Task:
        """Return an asyncio.Task resolving to a (response, metadata) tuple
        computed from the first request, emulating the named deployment."""
        # Only head-node requests are expected by the tests using this mock.
        assert head
        request = requests[0]

        # Lazily create the shared store on first use.
        if not hasattr(self, '_docs'):
            self._docs = DocumentArray()

        async def _compute_response():
            # Mutate a deep copy so the incoming request stays untouched.
            response_msg = copy.deepcopy(request)
            exec_endpoint = request.header.exec_endpoint
            new_docs = DocumentArray()
            await asyncio.sleep(0.1)
            if deployment == 'indexer-executor':
                if exec_endpoint == '/index':
                    # NOTE(review): blocking sleep inside a coroutine —
                    # presumably deliberate to simulate synchronous work.
                    time.sleep(0.1)
                    self._docs.extend(request.docs)
                else:
                    # Non-index endpoint: reply with a single doc whose tags
                    # carry every stored doc id.
                    docs = response_msg.docs
                    docs.clear()
                    docs.extend(
                        DocumentArray(
                            Document(tags={'ids': self._docs[:, 'id']})))
                    response_msg.data.docs = docs
                return response_msg
            else:
                if deployment == 'slow-executor':
                    await asyncio.sleep(SLOW_EXECUTOR_SLEEP_TIME)
                # Copy each request doc and stamp it with the processing time.
                for doc in request.docs:
                    new_doc = Document(doc, copy=True)
                    new_doc.tags['executor'] = time.time()
                    print(
                        f'in {deployment}, {new_doc.id} => time: {readable_time_from(new_doc.tags["executor"])}, {new_doc.tags["executor"]}',
                        flush=True,
                    )
                    new_docs.append(new_doc)

                docs = response_msg.docs
                docs.clear()
                docs.extend(new_docs)
                response_msg.data.docs = docs
                return response_msg

        async def task_wrapper():
            # Callers expect a (response, metadata-dict) pair.
            response_msg = await _compute_response()
            return response_msg, {}

        return asyncio.create_task(task_wrapper())
# --- Example 7 ---
class DBMSExecutor(Executor):
    """Minimal DBMS-style executor: indexes docs in memory, dumps on demand."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._docs = DocumentArray()
        self.logger = JinaLogger('IndexExecutor')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', *args, **kwargs):
        """Append the incoming docs to the in-memory store."""
        self._docs.extend(docs)

    @requests(on='/dump')
    def dump(self, parameters, *args, **kwargs):
        """Save every indexed doc to the path given in parameters['dump_path']."""
        self._docs.save(parameters['dump_path'])
# --- Example 8 ---
class DummyCSRSparseIndexEncoder(Executor):
    """Toy encoder/indexer: turns doc content into sparse COO embeddings and
    answers searches with the first top_k stored docs."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.docs = DocumentArray()

    @requests(on='/index')
    def encode(self, docs: 'DocumentArray', *args, **kwargs) -> Any:
        """Set a sparse COO embedding on each incoming doc, then store them."""
        for doc in docs:
            doc.embedding = sparse.coo_matrix(doc.content)
        self.docs.extend(docs)

    @requests(on='/search')
    def query(self, docs: 'DocumentArray', parameters, *args, **kwargs):
        """Assign the first top_k stored docs as matches for every query doc."""
        limit = int(parameters['top_k'])
        for doc in docs:
            doc.matches = self.docs[:limit]
# --- Example 9 ---
class KeyValueDBMSIndexer(Executor):
    """In-memory DBMS indexer: accumulates docs and dumps them on request."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._docs = DocumentArray()
        self.logger = JinaLogger('KeyValueDBMSIndexer')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', *args, **kwargs):
        """Accumulate the incoming docs in memory."""
        self._docs.extend(docs)

    # TODO endpoint in tests.distributed.test_remote_flow_dump_rolling_update.test_dump_dbms_remote.test_dump_dbms_remote
    # ends up being http://0.0.0.0:9000/post/dump
    @requests(on='/dump')
    def dump(self, parameters, *args, **kwargs):
        """Persist every collected doc to parameters['dump_path']."""
        # TODO: maybe put some logic for shards here
        self._docs.save(parameters['dump_path'])
# --- Example 10 ---
class DummyCSRSparseIndexEncoder(Executor):
    """Test double that stores docs with CSR sparse embeddings and answers
    queries with the first top_k stored docs plus dummy distances."""

    # Declares which sparse embedding class this executor produces.
    embedding_cls_type = 'scipy_csr'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.docs = DocumentArray()
        # Maps doc id -> single-row CSR vector extracted from its embedding.
        self.vectors = {}

    @requests(on='index')
    def encode(self, docs: 'DocumentArray', *args, **kwargs) -> Any:
        """Store the new docs, then (re)compute embeddings for ALL stored docs.

        NOTE(review): the loop runs over the whole growing store, not just the
        new batch, and `getrow(i)` indexes each doc's own embedding by the
        doc's position in the store — this looks fragile once i exceeds the
        embedding's row count; confirm this is the intended dummy behaviour.
        """
        self.docs.extend(docs)
        for i, doc in enumerate(self.docs):
            doc.embedding = sparse.csr_matrix(doc.content)
            self.vectors[doc.id] = doc.embedding.getrow(i)

    @requests(on='search')
    def query(self, parameters, *args, **kwargs):
        """Return ([first top_k stored docs], dummy distances 0..k-1)."""
        top_k = parameters['top_k']
        doc = parameters['doc']  # NOTE(review): read but otherwise unused here
        distances = [item for item in range(0, min(top_k, len(self.docs)))]
        return [self.docs[:top_k]], np.array([distances])
# --- Example 11 ---
class MyIndexer(Executor):
    """Simple exact-search indexer using cosine distance."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._docs = DocumentArray()

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Add all incoming docs to the in-memory store."""
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', **kwargs):
        """Attach the single nearest stored doc (by cosine) to each query."""
        query_emb = _ext_A(_norm(np.stack(docs.get_attributes('embedding'))))
        stored_emb = _ext_B(_norm(np.stack(self._docs.get_attributes('embedding'))))
        dists = _cosine(query_emb, stored_emb)
        indices, distances = self._get_sorted_top_k(dists, 1)
        for query, row_idx, row_dist in zip(docs, indices, distances):
            for pos, dval in zip(row_idx, row_dist):
                match = Document(self._docs[int(pos)], copy=True)
                match.score.value = 1 - dval
                query.matches.append(match)

    @staticmethod
    def _get_sorted_top_k(dist: 'np.array',
                          top_k: int) -> Tuple['np.ndarray', 'np.ndarray']:
        """Per row, return (indices, values) of the top_k smallest distances,
        sorted ascending."""
        if top_k >= dist.shape[1]:
            # Fewer candidates than top_k: full sort of every row.
            order = dist.argsort(axis=1)[:, :top_k]
            return order, np.take_along_axis(dist, order, axis=1)
        # Partial selection first, then sort only the k winners per row.
        part = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
        part_vals = np.take_along_axis(dist, part, axis=1)
        inner = part_vals.argsort(axis=1)
        order = np.take_along_axis(part, inner, axis=1)
        return order, np.take_along_axis(part_vals, inner, axis=1)
# --- Example 12 ---
class CrudIndexer(Executor):
    """Simple CRUD indexer: keeps docs in memory, persists them to the
    workspace on close, and serves index/update/delete/search endpoints."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.logger = JinaLogger('CrudIndexer')
        self._docs = DocumentArray()
        self._dump_location = os.path.join(self.metas.workspace, 'docs')
        # Resume from a previous dump when one exists.
        if os.path.exists(self._dump_location):
            self._docs = DocumentArray.load(self._dump_location)
            self.logger.info(
                f'Loaded {len(self._docs)} from {self._dump_location}')
        else:
            self.logger.info(f'No data found at {self._dump_location}')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Append the incoming docs to the in-memory store."""
        self._docs.extend(docs)

    @requests(on='/update')
    def update(self, docs: 'DocumentArray', **kwargs):
        """Update = delete any stored docs with matching ids, then re-index."""
        self.delete(docs)
        self.index(docs)

    def close(self) -> None:
        """Persist the store to the workspace on shutdown."""
        self.logger.info(f'Dumping {len(self._docs)} to {self._dump_location}')
        self._docs.save(self._dump_location)

    @requests(on='/delete')
    def delete(self, docs: 'DocumentArray', **kwargs):
        """Remove every stored doc whose id appears in `docs`.

        # TODO we can do del _docs[d.id] once
        # tests.unit.types.arrays.test_documentarray.test_delete_by_id is fixed
        """
        # Set membership is O(1) per stored doc; the original list made the
        # scan O(len(docs)) per stored doc.
        ids_to_delete = {d.id for d in docs}
        idx_to_delete = [
            i for i, doc in enumerate(self._docs) if doc.id in ids_to_delete
        ]
        # Delete from the back so earlier indices remain valid.
        for i in sorted(idx_to_delete, reverse=True):
            del self._docs[i]

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
        """Attach the top-k cosine matches from the store to each query doc."""
        top_k = int(parameters.get('top_k', 1))
        a = np.stack(docs.get_attributes('embedding'))
        b = np.stack(self._docs.get_attributes('embedding'))
        q_emb = _ext_A(_norm(a))
        d_emb = _ext_B(_norm(b))
        dists = _cosine(q_emb, d_emb)
        idx, dist = self._get_sorted_top_k(dists, top_k)
        for _q, _ids, _dists in zip(docs, idx, dist):
            for _id, _dist in zip(_ids, _dists):
                d = Document(self._docs[int(_id)], copy=True)
                d.scores['cosine'] = 1 - _dist
                _q.matches.append(d)

    @staticmethod
    def _get_sorted_top_k(dist: 'np.array',
                          top_k: int) -> Tuple['np.ndarray', 'np.ndarray']:
        """Per row, return (indices, values) of the top_k smallest distances,
        sorted ascending."""
        if top_k >= dist.shape[1]:
            # Fewer candidates than top_k: a full sort is needed anyway.
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            # Partial selection first, then sort only the k winners per row.
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)

        return idx, dist
# --- Example 13 ---
 def merge(self, docs_matrix: DocumentArray, **kwargs):
     """Flatten a matrix of DocumentArrays into one DocumentArray."""
     flattened = DocumentArray()
     for shard_docs in docs_matrix:
         flattened.extend(shard_docs)
     return flattened
# --- Example 14 ---
 def search(self, docs: DocumentArray, **kwargs):
     """Replace the request docs in place with everything in the store."""
     stored = self._docs
     docs.clear()
     docs.extend(stored)
# --- Example 15 ---
def docarray_for_cache():
    """Return a DocumentArray holding two docs, one with an int id (1) and
    one with a string id ('2')."""
    int_id_doc = Document(id=1)
    str_id_doc = Document(id='2')
    return DocumentArray([int_id_doc, str_id_doc])
# --- Example 16 ---
def test_none_extend():
    """Extending a DocumentArray with None must be a no-op."""
    seed_docs = [Document() for _ in range(100)]
    da = DocumentArray(seed_docs)
    da.extend(None)
    assert len(da) == 100