Example #1
    def encode(self, docs: DocumentArray, **kwargs):
        # stack the image blobs of all documents into one batch
        content = np.stack(docs.get_attributes('blob'))
        _input = torch.from_numpy(content.astype('float32'))
        # run the backbone and detach from the autograd graph
        _features = self._get_features(_input).detach()
        _features = _features.numpy()
        # pool the feature maps into one vector per document
        _features = self._get_pooling(_features)
        docs.embeddings = _features
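This encode method relies on _get_features and _get_pooling, which the snippet does not show. A minimal sketch of the surrounding encoder class, assuming a pretrained torchvision backbone and mean-pooling; the class name, model choice and helper bodies are illustrative assumptions, not part of the original source:

import torch
import torchvision.models as models
from jina import Executor


class ImageEncoder(Executor):  # hypothetical owner of the encode() above
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # assumption: a pretrained CNN provides the feature maps
        self.model = models.mobilenet_v2(pretrained=True).features
        self.model.eval()

    def _get_features(self, content):
        return self.model(content)

    def _get_pooling(self, feature_map):
        # mean-pool the spatial dimensions into one vector per document
        return feature_map.mean(axis=(2, 3))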
Example #2
class EmbeddingIndexer(Executor):
    def __init__(self, index_file_name: str, **kwargs):
        super().__init__(**kwargs)
        self.index_file_name = index_file_name
        if os.path.exists(self.save_path):
            self._docs = DocumentArray.load(self.save_path)
        else:
            self._docs = DocumentArray()

    @property
    def save_path(self):
        if not os.path.exists(self.workspace):
            os.makedirs(self.workspace)
        return os.path.join(self.workspace, self.index_file_name)

    def close(self):
        self._docs.save(self.save_path)

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs) -> DocumentArray:
        embedding_docs = DocumentArray()
        for doc in docs:
            embedding_docs.append(Document(id=doc.id, embedding=doc.embedding))
        self._docs.extend(embedding_docs)
        return docs

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs) \
            -> DocumentArray:
        a = np.stack(docs.get_attributes('embedding'))
        b = np.stack(self._docs.get_attributes('embedding'))
        q_emb = _ext_A(_norm(a))
        d_emb = _ext_B(_norm(b))
        dists = _cosine(q_emb, d_emb)
        top_k = int(parameters.get('top_k', 5))
        assert top_k > 0
        idx, dist = self._get_sorted_top_k(dists, top_k)
        for _q, _ids, _dists in zip(docs, idx, dist):
            for _id, _dist in zip(_ids, _dists):
                doc = Document(self._docs[int(_id)], copy=True)
                doc.score.value = 1 - _dist
                doc.parent_id = int(_id)
                _q.matches.append(doc)
        return docs

    @staticmethod
    def _get_sorted_top_k(dist: 'np.ndarray',
                          top_k: int) -> Tuple['np.ndarray', 'np.ndarray']:
        if top_k >= dist.shape[1]:
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)

        return idx, dist
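Several of these indexers call _norm, _ext_A, _ext_B and _cosine without defining them, and all snippets assume the usual imports (numpy as np, torch, os, and jina's Executor, requests, Document, DocumentArray). Below is a minimal sketch of helpers with matching behaviour, using the extended-matrix trick: for L2-normalised rows, the product of the two extended matrices equals the squared Euclidean distance, and halving it gives the cosine distance. These exact definitions are an assumption, not taken from the snippets above:

import numpy as np

def _norm(A):
    # L2-normalise each row
    return A / np.linalg.norm(A, ord=2, axis=1, keepdims=True)

def _ext_A(A):
    # extend query rows to [1, A, A**2] so one matmul yields squared distances
    nA, dim = A.shape
    A_ext = np.ones((nA, dim * 3))
    A_ext[:, dim:2 * dim] = A
    A_ext[:, 2 * dim:] = A ** 2
    return A_ext

def _ext_B(B):
    # extend index rows to [B**2, -2B, 1] (transposed) to pair with _ext_A
    nB, dim = B.shape
    B_ext = np.ones((dim * 3, nB))
    B_ext[:dim] = (B ** 2).T
    B_ext[dim:2 * dim] = -2.0 * B.T
    return B_ext

def _cosine(A_norm_ext, B_norm_ext):
    # for unit-length rows, ||a - b||**2 == 2 * (1 - cos(a, b))
    return A_norm_ext.dot(B_norm_ext).clip(min=0) / 2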
Example #3
    def encode(self, docs: DocumentArray, **kwargs):
        if docs is None:
            return

        images = np.stack(docs.get_attributes('blob'))
        images = self._maybe_move_channel_axis(images)

        _input = torch.from_numpy(images)
        features = self._get_features(_input).detach()
        features = self._get_pooling(features.numpy())

        for doc, embed in zip(docs, features):
            doc.embedding = embed

        return docs
Example #4
    def encode(self, docs: 'DocumentArray', *args, **kwargs):

        chunks = DocumentArray(
            list(
                filter(lambda d: d.mime_type == 'text/plain',
                       docs.traverse_flat(['c']))))

        texts = chunks.get_attributes('text')

        with torch.no_grad():

            if not self.tokenizer.pad_token:
                self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
                self.model.resize_token_embeddings(len(self.tokenizer.vocab))

            input_tokens = self.tokenizer(
                texts,
                max_length=self.max_length,
                padding='longest',
                truncation=True,
                return_tensors='pt',
            )
            input_tokens = {
                k: v.to(torch.device('cpu'))
                for k, v in input_tokens.items()
            }

            outputs = getattr(self.model,
                              self.embedding_fn_name)(**input_tokens)
            if isinstance(outputs, torch.Tensor):
                return outputs.cpu().numpy()
            hidden_states = outputs.hidden_states

            embeds = self._compute_embedding(hidden_states, input_tokens)
            for doc, embed in zip(chunks, embeds):
                doc.embedding = embed

        return chunks
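The tokenizer, model, embedding_fn_name and _compute_embedding used above come from the encoder's constructor, which is not included. A hedged sketch of how such an encoder is commonly set up with Hugging Face transformers; the model name, the choice of embedding_fn_name and the mean-pooling inside _compute_embedding are illustrative assumptions:

from transformers import AutoModel, AutoTokenizer


class TransformersTextEncoder:  # illustrative constructor and pooling helper only
    def __init__(self, pretrained_model_name: str = 'distilbert-base-uncased',
                 max_length: int = 128):
        self.max_length = max_length
        self.embedding_fn_name = '__call__'
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
        # hidden states must be returned for _compute_embedding below
        self.model = AutoModel.from_pretrained(pretrained_model_name,
                                               output_hidden_states=True)
        self.model.eval()

    def _compute_embedding(self, hidden_states, input_tokens):
        # assumption: mean-pool the last hidden layer over non-padding tokens
        last_hidden = hidden_states[-1]
        mask = input_tokens['attention_mask'].unsqueeze(-1).float()
        summed = (last_hidden * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-9)
        return (summed / counts).cpu().numpy()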
Example #5
class CompoundQueryExecutor(Executor):
    def __init__(self, dump_path: Optional[str] = None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.logger = JinaLogger('CompoundQueryExecutor')
        self._dump_path = dump_path
        if self._dump_path is not None and os.path.exists(self._dump_path):
            self._docs = DocumentArray.load(self._dump_path)
        else:
            self._docs = DocumentArray()

    @staticmethod
    def _get_sorted_top_k(dist: 'np.ndarray', top_k: int):
        if top_k >= dist.shape[1]:
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)

        return idx, dist

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters, **kwargs):
        if len(self._docs) > 0:
            a = np.stack(docs.get_attributes('embedding'))
            b = np.stack(self._docs.get_attributes('embedding'))
            q_emb = _ext_A(_norm(a))
            d_emb = _ext_B(_norm(b))
            dists = _cosine(q_emb, d_emb)
            idx, dist = self._get_sorted_top_k(dists, int(parameters['top_k']))
            for _q, _ids, _dists in zip(docs, idx, dist):
                for _id, _dist in zip(_ids, _dists):
                    d = Document(self._docs[int(_id)], copy=True)
                    d.scores['cosine'] = 1 - _dist
                    _q.matches.append(d)
Example #6
class MyIndexer(Executor):
    """Simple indexer class """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._docs = DocumentArray()

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', **kwargs):
        a = np.stack(docs.get_attributes('embedding'))
        b = np.stack(self._docs.get_attributes('embedding'))
        q_emb = _ext_A(_norm(a))
        d_emb = _ext_B(_norm(b))
        dists = _cosine(q_emb, d_emb)
        idx, dist = self._get_sorted_top_k(dists, 1)
        for _q, _ids, _dists in zip(docs, idx, dist):
            for _id, _dist in zip(_ids, _dists):
                d = Document(self._docs[int(_id)], copy=True)
                d.score.value = 1 - _dist
                _q.matches.append(d)

    @staticmethod
    def _get_sorted_top_k(dist: 'np.ndarray',
                          top_k: int) -> Tuple['np.ndarray', 'np.ndarray']:
        if top_k >= dist.shape[1]:
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)

        return idx, dist
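A small usage sketch for MyIndexer, assuming an upstream encoder Executor fills in the embedding attribute before indexing; the Flow layout and the MyEncoder name are assumptions for illustration:

from jina import Flow, Document

f = Flow().add(uses=MyEncoder).add(uses=MyIndexer)

with f:
    # index a few documents, then search with a single query
    f.post(on='/index', inputs=(Document(text=t) for t in ('hello', 'world')))
    resp = f.post(on='/search', inputs=Document(text='hello'), return_results=True)
    print(resp[0].docs[0].matches)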
Example #7
class CrudIndexer(Executor):
    """Simple indexer class"""
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.logger = JinaLogger('CrudIndexer')
        self._docs = DocumentArray()
        self._dump_location = os.path.join(self.metas.workspace, 'docs')
        if os.path.exists(self._dump_location):
            self._docs = DocumentArray.load(self._dump_location)
            self.logger.info(
                f'Loaded {len(self._docs)} from {self._dump_location}')
        else:
            self.logger.info(f'No data found at {self._dump_location}')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        self._docs.extend(docs)

    @requests(on='/update')
    def update(self, docs: 'DocumentArray', **kwargs):
        self.delete(docs)
        self.index(docs)

    def close(self) -> None:
        self.logger.info(f'Dumping {len(self._docs)} to {self._dump_location}')
        self._docs.save(self._dump_location)

    @requests(on='/delete')
    def delete(self, docs: 'DocumentArray', **kwargs):
        # TODO we can do del _docs[d.id] once
        # tests.unit.types.arrays.test_documentarray.test_delete_by_id is fixed
        ids_to_delete = [d.id for d in docs]
        idx_to_delete = []
        for i, doc in enumerate(self._docs):
            if doc.id in ids_to_delete:
                idx_to_delete.append(i)
        for i in sorted(idx_to_delete, reverse=True):
            del self._docs[i]

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
        top_k = int(parameters.get('top_k', 1))
        a = np.stack(docs.get_attributes('embedding'))
        b = np.stack(self._docs.get_attributes('embedding'))
        q_emb = _ext_A(_norm(a))
        d_emb = _ext_B(_norm(b))
        dists = _cosine(q_emb, d_emb)
        idx, dist = self._get_sorted_top_k(dists, top_k)
        for _q, _ids, _dists in zip(docs, idx, dist):
            for _id, _dist in zip(_ids, _dists):
                d = Document(self._docs[int(_id)], copy=True)
                d.scores['cosine'] = 1 - _dist
                _q.matches.append(d)

    @staticmethod
    def _get_sorted_top_k(dist: 'np.ndarray',
                          top_k: int) -> Tuple['np.ndarray', 'np.ndarray']:
        if top_k >= dist.shape[1]:
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)

        return idx, dist
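The CRUD endpoints of CrudIndexer map directly onto Flow calls; a brief sketch of the round trip, again assuming the embeddings are already set on the documents:

import numpy as np
from jina import Flow, Document

with Flow().add(uses=CrudIndexer) as f:
    doc = Document(id='doc1', embedding=np.random.random(10))
    f.post(on='/index', inputs=doc)
    # replace the stored copy: delete by id, then re-index
    doc.embedding = np.random.random(10)
    f.post(on='/update', inputs=doc)
    # remove it entirely
    f.post(on='/delete', inputs=Document(id='doc1'))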
Example #8
def test_texts_getter_da():
    da = DocumentArray([Document(text='hello') for _ in range(100)])
    assert len(da.texts) == 100
    assert da.texts == da.get_attributes('text')
Example #9
def test_tags_getter_da():
    da = DocumentArray([Document(tags={'a': 2, 'c': 'd'}) for _ in range(100)])
    assert len(da.tags) == 100
    assert da.tags == da.get_attributes('tags')
Example #10
def test_blobs_getter_da():
    blobs = np.random.random((100, 10, 10))
    da = DocumentArray([Document(blob=blob) for blob in blobs])
    assert len(da) == 100
    np.testing.assert_almost_equal(da.get_attributes('blob'), da.blobs)
Example #11
def test_embeddings_getter_da():
    embeddings = np.random.random((100, 10))
    da = DocumentArray([Document(embedding=emb) for emb in embeddings])
    assert len(da) == 100
    np.testing.assert_almost_equal(da.get_attributes('embedding'),
                                   da.embeddings)
Example #12
def test_da_get_embeddings_slice():
    da = DocumentArray(random_docs(100))
    np.testing.assert_almost_equal(
        da.get_attributes('embedding')[10:20],
        da._get_embeddings(slice(10, 20)))
Example #13
def test_da_get_embeddings():
    da = DocumentArray(random_docs(100))
    np.testing.assert_almost_equal(da.get_attributes('embedding'),
                                   da.embeddings)
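random_docs is not defined in these test snippets; below is a minimal stand-in consistent with how it is used above (the function body and the embedding size are assumptions):

import numpy as np
from jina import Document


def random_docs(num_docs: int, embed_dim: int = 10):
    # documents whose only payload is a random embedding
    for _ in range(num_docs):
        yield Document(embedding=np.random.random(embed_dim))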
Example #14
    def index(self, docs: DocumentArray, **kwargs):
        # persist only the tag dicts of the incoming documents
        self.db.insert_multiple(docs.get_attributes('tags'))
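The db attribute is not shown; insert_multiple matches TinyDB's API, so a plausible backing setup looks like the following (the TinyDB choice, class name and file name are assumptions):

import os
from tinydb import TinyDB
from jina import Executor


class TagsIndexer(Executor):  # hypothetical owner of the index() above
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # one JSON record per document's tags
        self.db = TinyDB(os.path.join(self.workspace, 'tags.json'))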