Example #1
    def encode(self, docs: DocumentArray, **kwargs):
        # stack the image blobs of all documents into one batch
        content = np.stack(docs.get_attributes('blob'))
        _input = torch.from_numpy(content.astype('float32'))
        # run the backbone and detach from the autograd graph
        _features = self._get_features(_input).detach()
        _features = _features.numpy()
        # pool the feature maps into one vector per document
        _features = self._get_pooling(_features)
        docs.embeddings = _features
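This encode method relies on _get_features and _get_pooling, which the snippet does not show. A minimal sketch of the surrounding encoder class, assuming a pretrained torchvision backbone and mean-pooling; the class name, model choice and helper bodies are illustrative assumptions, not part of the original source:

import torch
import torchvision.models as models
from jina import Executor


class ImageEncoder(Executor):  # hypothetical owner of the encode() above
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # assumption: a pretrained CNN provides the feature maps
        self.model = models.mobilenet_v2(pretrained=True).features
        self.model.eval()

    def _get_features(self, content):
        return self.model(content)

    def _get_pooling(self, feature_map):
        # mean-pool the spatial dimensions into one vector per document
        return feature_map.mean(axis=(2, 3))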
Example #2
class EmbeddingIndexer(Executor):
    def __init__(self, index_file_name: str, **kwargs):
        super().__init__(**kwargs)
        self.index_file_name = index_file_name
        if os.path.exists(self.save_path):
            self._docs = DocumentArray.load(self.save_path)
        else:
            self._docs = DocumentArray()

    @property
    def save_path(self):
        if not os.path.exists(self.workspace):
            os.makedirs(self.workspace)
        return os.path.join(self.workspace, self.index_file_name)

    def close(self):
        self._docs.save(self.save_path)

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs) -> DocumentArray:
        embedding_docs = DocumentArray()
        for doc in docs:
            embedding_docs.append(Document(id=doc.id, embedding=doc.embedding))
        self._docs.extend(embedding_docs)
        return docs

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs) \
            -> DocumentArray:
        a = np.stack(docs.get_attributes('embedding'))
        b = np.stack(self._docs.get_attributes('embedding'))
        q_emb = _ext_A(_norm(a))
        d_emb = _ext_B(_norm(b))
        dists = _cosine(q_emb, d_emb)
        top_k = int(parameters.get('top_k', 5))
        assert top_k > 0
        idx, dist = self._get_sorted_top_k(dists, top_k)
        for _q, _ids, _dists in zip(docs, idx, dist):
            for _id, _dist in zip(_ids, _dists):
                doc = Document(self._docs[int(_id)], copy=True)
                doc.score.value = 1 - _dist
                doc.parent_id = int(_id)
                _q.matches.append(doc)
        return docs

    @staticmethod
    def _get_sorted_top_k(dist: 'np.ndarray',
                          top_k: int) -> Tuple['np.ndarray', 'np.ndarray']:
        if top_k >= dist.shape[1]:
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)

        return idx, dist
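Several of these indexers call _norm, _ext_A, _ext_B and _cosine without defining them, and all snippets assume the usual imports (numpy as np, torch, os, and jina's Executor, requests, Document, DocumentArray). Below is a minimal sketch of helpers with matching behaviour, using the extended-matrix trick: for L2-normalised rows, the product of the two extended matrices equals the squared Euclidean distance, and halving it gives the cosine distance. These exact definitions are an assumption, not taken from the snippets above:

import numpy as np

def _norm(A):
    # L2-normalise each row
    return A / np.linalg.norm(A, ord=2, axis=1, keepdims=True)

def _ext_A(A):
    # extend query rows to [1, A, A**2] so one matmul yields squared distances
    nA, dim = A.shape
    A_ext = np.ones((nA, dim * 3))
    A_ext[:, dim:2 * dim] = A
    A_ext[:, 2 * dim:] = A ** 2
    return A_ext

def _ext_B(B):
    # extend index rows to [B**2, -2B, 1] (transposed) to pair with _ext_A
    nB, dim = B.shape
    B_ext = np.ones((dim * 3, nB))
    B_ext[:dim] = (B ** 2).T
    B_ext[dim:2 * dim] = -2.0 * B.T
    return B_ext

def _cosine(A_norm_ext, B_norm_ext):
    # for unit-length rows, ||a - b||**2 == 2 * (1 - cos(a, b))
    return A_norm_ext.dot(B_norm_ext).clip(min=0) / 2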
Example #3
    def encode(self, docs: DocumentArray, **kwargs):
        if docs is None:
            return

        images = np.stack(docs.get_attributes('blob'))
        images = self._maybe_move_channel_axis(images)

        _input = torch.from_numpy(images)
        features = self._get_features(_input).detach()
        features = self._get_pooling(features.numpy())

        for doc, embed in zip(docs, features):
            doc.embedding = embed

        return docs
Example #4
    def encode(self, docs: 'DocumentArray', *args, **kwargs):

        chunks = DocumentArray(
            list(
                filter(lambda d: d.mime_type == 'text/plain',
                       docs.traverse_flat(['c']))))

        texts = chunks.get_attributes('text')

        with torch.no_grad():

            if not self.tokenizer.pad_token:
                self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
                self.model.resize_token_embeddings(len(self.tokenizer.vocab))

            input_tokens = self.tokenizer(
                texts,
                max_length=self.max_length,
                padding='longest',
                truncation=True,
                return_tensors='pt',
            )
            input_tokens = {
                k: v.to(torch.device('cpu'))
                for k, v in input_tokens.items()
            }

            outputs = getattr(self.model,
                              self.embedding_fn_name)(**input_tokens)
            if isinstance(outputs, torch.Tensor):
                return outputs.cpu().numpy()
            hidden_states = outputs.hidden_states

            embeds = self._compute_embedding(hidden_states, input_tokens)
            for doc, embed in zip(chunks, embeds):
                doc.embedding = embed

        return chunks
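The tokenizer, model, embedding_fn_name and _compute_embedding used above come from the encoder's constructor, which is not included. A hedged sketch of how such an encoder is commonly set up with Hugging Face transformers; the model name, the choice of embedding_fn_name and the mean-pooling inside _compute_embedding are illustrative assumptions:

from transformers import AutoModel, AutoTokenizer


class TransformersTextEncoder:  # illustrative constructor and pooling helper only
    def __init__(self, pretrained_model_name: str = 'distilbert-base-uncased',
                 max_length: int = 128):
        self.max_length = max_length
        self.embedding_fn_name = '__call__'
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
        # hidden states must be returned for _compute_embedding below
        self.model = AutoModel.from_pretrained(pretrained_model_name,
                                               output_hidden_states=True)
        self.model.eval()

    def _compute_embedding(self, hidden_states, input_tokens):
        # assumption: mean-pool the last hidden layer over non-padding tokens
        last_hidden = hidden_states[-1]
        mask = input_tokens['attention_mask'].unsqueeze(-1).float()
        summed = (last_hidden * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-9)
        return (summed / counts).cpu().numpy()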
Example #5
class CompoundQueryExecutor(Executor):
    def __init__(self, dump_path: Optional[str] = None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.logger = JinaLogger('CompoundQueryExecutor')
        self._dump_path = dump_path
        if self._dump_path is not None and os.path.exists(self._dump_path):
            self._docs = DocumentArray.load(self._dump_path)
        else:
            self._docs = DocumentArray()

    @staticmethod
    def _get_sorted_top_k(dist: 'np.ndarray', top_k: int):
        if top_k >= dist.shape[1]:
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)

        return idx, dist

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters, **kwargs):
        if len(self._docs) > 0:
            a = np.stack(docs.get_attributes('embedding'))
            b = np.stack(self._docs.get_attributes('embedding'))
            q_emb = _ext_A(_norm(a))
            d_emb = _ext_B(_norm(b))
            dists = _cosine(q_emb, d_emb)
            idx, dist = self._get_sorted_top_k(dists, int(parameters['top_k']))
            for _q, _ids, _dists in zip(docs, idx, dist):
                for _id, _dist in zip(_ids, _dists):
                    d = Document(self._docs[int(_id)], copy=True)
                    d.scores['cosine'] = 1 - _dist
                    _q.matches.append(d)
Example #6
class MyIndexer(Executor):
    """Simple indexer class """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._docs = DocumentArray()

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', **kwargs):
        a = np.stack(docs.get_attributes('embedding'))
        b = np.stack(self._docs.get_attributes('embedding'))
        q_emb = _ext_A(_norm(a))
        d_emb = _ext_B(_norm(b))
        dists = _cosine(q_emb, d_emb)
        idx, dist = self._get_sorted_top_k(dists, 1)
        for _q, _ids, _dists in zip(docs, idx, dist):
            for _id, _dist in zip(_ids, _dists):
                d = Document(self._docs[int(_id)], copy=True)
                d.score.value = 1 - _dist
                _q.matches.append(d)

    @staticmethod
    def _get_sorted_top_k(dist: 'np.ndarray',
                          top_k: int) -> Tuple['np.ndarray', 'np.ndarray']:
        if top_k >= dist.shape[1]:
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)

        return idx, dist
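A small usage sketch for MyIndexer, assuming an upstream encoder Executor fills in the embedding attribute before indexing; the Flow layout and the MyEncoder name are assumptions for illustration:

from jina import Flow, Document

f = Flow().add(uses=MyEncoder).add(uses=MyIndexer)

with f:
    # index a few documents, then search with a single query
    f.post(on='/index', inputs=(Document(text=t) for t in ('hello', 'world')))
    resp = f.post(on='/search', inputs=Document(text='hello'), return_results=True)
    print(resp[0].docs[0].matches)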
Example #7
class CrudIndexer(Executor):
    """Simple indexer class"""
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.logger = JinaLogger('CrudIndexer')
        self._docs = DocumentArray()
        self._dump_location = os.path.join(self.metas.workspace, 'docs')
        if os.path.exists(self._dump_location):
            self._docs = DocumentArray.load(self._dump_location)
            self.logger.info(
                f'Loaded {len(self._docs)} from {self._dump_location}')
        else:
            self.logger.info(f'No data found at {self._dump_location}')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        self._docs.extend(docs)

    @requests(on='/update')
    def update(self, docs: 'DocumentArray', **kwargs):
        self.delete(docs)
        self.index(docs)

    def close(self) -> None:
        self.logger.info(f'Dumping {len(self._docs)} to {self._dump_location}')
        self._docs.save(self._dump_location)

    @requests(on='/delete')
    def delete(self, docs: 'DocumentArray', **kwargs):
        # TODO we can do del _docs[d.id] once
        # tests.unit.types.arrays.test_documentarray.test_delete_by_id is fixed
        ids_to_delete = [d.id for d in docs]
        idx_to_delete = []
        for i, doc in enumerate(self._docs):
            if doc.id in ids_to_delete:
                idx_to_delete.append(i)
        for i in sorted(idx_to_delete, reverse=True):
            del self._docs[i]

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
        top_k = int(parameters.get('top_k', 1))
        a = np.stack(docs.get_attributes('embedding'))
        b = np.stack(self._docs.get_attributes('embedding'))
        q_emb = _ext_A(_norm(a))
        d_emb = _ext_B(_norm(b))
        dists = _cosine(q_emb, d_emb)
        idx, dist = self._get_sorted_top_k(dists, top_k)
        for _q, _ids, _dists in zip(docs, idx, dist):
            for _id, _dist in zip(_ids, _dists):
                d = Document(self._docs[int(_id)], copy=True)
                d.scores['cosine'] = 1 - _dist
                _q.matches.append(d)

    @staticmethod
    def _get_sorted_top_k(dist: 'np.ndarray',
                          top_k: int) -> Tuple['np.ndarray', 'np.ndarray']:
        if top_k >= dist.shape[1]:
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)

        return idx, dist
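The CRUD endpoints of CrudIndexer map directly onto Flow calls; a brief sketch of the round trip, again assuming the embeddings are already set on the documents:

import numpy as np
from jina import Flow, Document

with Flow().add(uses=CrudIndexer) as f:
    doc = Document(id='doc1', embedding=np.random.random(10))
    f.post(on='/index', inputs=doc)
    # replace the stored copy: delete by id, then re-index
    doc.embedding = np.random.random(10)
    f.post(on='/update', inputs=doc)
    # remove it entirely
    f.post(on='/delete', inputs=Document(id='doc1'))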
Example #8
def test_texts_getter_da():
    da = DocumentArray([Document(text='hello') for _ in range(100)])
    assert len(da.texts) == 100
    assert da.texts == da.get_attributes('text')
Example #9
def test_tags_getter_da():
    da = DocumentArray([Document(tags={'a': 2, 'c': 'd'}) for _ in range(100)])
    assert len(da.tags) == 100
    assert da.tags == da.get_attributes('tags')
Example #10
def test_blobs_getter_da():
    blobs = np.random.random((100, 10, 10))
    da = DocumentArray([Document(blob=blob) for blob in blobs])
    assert len(da) == 100
    np.testing.assert_almost_equal(da.get_attributes('blob'), da.blobs)
Example #11
def test_embeddings_getter_da():
    embeddings = np.random.random((100, 10))
    da = DocumentArray([Document(embedding=emb) for emb in embeddings])
    assert len(da) == 100
    np.testing.assert_almost_equal(da.get_attributes('embedding'),
                                   da.embeddings)
Example #12
def test_da_get_embeddings_slice():
    da = DocumentArray(random_docs(100))
    np.testing.assert_almost_equal(
        da.get_attributes('embedding')[10:20],
        da._get_embeddings(slice(10, 20)))
Example #13
def test_da_get_embeddings():
    da = DocumentArray(random_docs(100))
    np.testing.assert_almost_equal(da.get_attributes('embedding'),
                                   da.embeddings)
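random_docs is not defined in these test snippets; below is a minimal stand-in consistent with how it is used above (the function body and the embedding size are assumptions):

import numpy as np
from jina import Document


def random_docs(num_docs: int, embed_dim: int = 10):
    # documents whose only payload is a random embedding
    for _ in range(num_docs):
        yield Document(embedding=np.random.random(embed_dim))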
Example #14
    def index(self, docs: DocumentArray, **kwargs):
        # persist only the tag dicts of the incoming documents
        self.db.insert_multiple(docs.get_attributes('tags'))
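The db attribute is not shown; insert_multiple matches TinyDB's API, so a plausible backing setup looks like the following (the TinyDB choice, class name and file name are assumptions):

import os
from tinydb import TinyDB
from jina import Executor


class TagsIndexer(Executor):  # hypothetical owner of the index() above
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # one JSON record per document's tags
        self.db = TinyDB(os.path.join(self.workspace, 'tags.json'))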