Example #1
File: sql.py Project: linhdb-2149/topdup
    def _convert_sql_row_to_document(self, row) -> Document:
        # Map a SQL result row (and its joined meta rows) onto a Document object.
        document = Document(
            id=row.id, text=row.text, meta={meta.name: meta.value for meta in row.meta}
        )
        if row.vector_id:
            document.vector_id = row.vector_id
        return document
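A minimal usage sketch of the row-to-Document mapping. The `SimpleNamespace` stand-ins for the ORM row and its meta rows, and the `store` variable, are assumptions for illustration, not part of the project:

from types import SimpleNamespace

# Hypothetical stand-ins for the ORM result row and its meta rows (illustration only).
meta_rows = [SimpleNamespace(name="author", value="somebody")]
row = SimpleNamespace(id="doc-1", text="some text", vector_id=7, meta=meta_rows)

doc = store._convert_sql_row_to_document(row)  # store: an initialized instance of this SQL document store
print(doc.id, doc.meta)   # doc-1 {'author': 'somebody'}
print(doc.vector_id)      # 7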
Example #2
    def train_index(
        self,
        documents: Optional[Union[List[dict], List[Document]]] = None,
        embeddings: Optional[np.ndarray] = None,
    ):
        """
        Some FAISS indices (e.g. IVF) require initial "training" on a sample of vectors before you can add your final vectors.
        The train vectors should come from the same distribution as your final ones.
        You can pass either documents (incl. embeddings) or just the plain embeddings that the index shall be trained on.

        :param documents: Documents (incl. the embeddings)
        :param embeddings: Plain embeddings
        :return: None
        """

        if embeddings is not None and documents is not None:
            raise ValueError(
                "Either pass `documents` or `embeddings`. You passed both.")
        if documents:
            document_objects = [
                Document.from_dict(d) if isinstance(d, dict) else d
                for d in documents
            ]
            embeddings = [doc.embedding for doc in document_objects]
            embeddings = np.array(embeddings, dtype="float32")
        self.faiss_index.train(embeddings)
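A usage sketch of the training step, assuming `store` is a document store whose `faiss_index` is an untrained IVF index (e.g. built from an "IVF256,Flat" factory string) over 768-dimensional vectors; the variable names are hypothetical:

import numpy as np

# Sample vectors drawn from the same distribution as the final vectors.
train_vectors = np.random.rand(10000, 768).astype("float32")
store.train_index(embeddings=train_vectors)

# faiss indices expose an is_trained flag; IVF indices flip it after training.
assert store.faiss_index.is_trained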
Example #3
    def write_documents(self,
                        documents: Union[List[dict], List[Document]],
                        index: Optional[str] = None):
        """
          Indexes documents for later queries.

        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                            For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                            Optionally: Include meta data via {"text": "<the-actual-text>",
                            "meta":{"name": "<some-document-name>, "author": "somebody", ...}}
                            It can be used for filtering and is accessible in the responses of the Finder.
          :param index: add an optional index attribute to documents. It can be later used for filtering. For instance,
                        documents for evaluation can be indexed in a separate index than the documents for search.

          :return: None
        """

        index = index or self.index
        if len(documents) == 0:
            return
        # Make sure we comply with the Document class format
        document_objects = [
            Document.from_dict(d) if isinstance(d, dict) else d
            for d in documents
        ]

        for i in range(0, len(document_objects), self.batch_size):
            for doc in document_objects[i:i + self.batch_size]:
                meta_fields = doc.meta or {}
                vector_id = meta_fields.get("vector_id")
                meta_orms = [
                    MetaORM(name=key, value=value)
                    for key, value in meta_fields.items()
                ]
                doc_orm = DocumentORM(
                    id=doc.id,
                    text=doc.text,
                    vector_id=vector_id,
                    meta=meta_orms,
                    index=index,
                )
                if self.update_existing_documents:
                    # First old meta data cleaning is required
                    self.session.query(MetaORM).filter_by(
                        document_id=doc.id).delete()
                    self.session.merge(doc_orm)
                else:
                    self.session.add(doc_orm)
            try:
                self.session.commit()
            except Exception as ex:
                logger.error(f"Transaction rollback: {ex.__cause__}")
                # Rollback is important here otherwise self.session will be in inconsistent state and next call will fail
                self.session.rollback()
                raise ex
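A brief usage sketch, assuming `store` is an initialized instance of this document store (the variable name is hypothetical):

docs = [
    {"text": "Berlin is the capital of Germany.",
     "meta": {"name": "geo-facts.txt", "author": "somebody"}},
    {"text": "FAISS is a library for similarity search."},
]
store.write_documents(docs)  # or: store.write_documents(docs, index="eval_docs")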
Example #4
    def get_all_documents(
            self,
            index: Optional[str] = None,
            filters: Optional[Dict[str, List[str]]] = None,
            return_embedding: Optional[bool] = None
    ) -> List[Document]:
        """
        Get documents from the document store.

        :param index: Name of the index to get the documents from. If None, the
                      DocumentStore's default index (self.index) will be used.
        :param filters: Optional filters to narrow down the documents to return.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :param return_embedding: Whether to return the document embeddings.
        """

        index = index or self.index
        # Keeping ORM objects in memory generally causes performance issues,
        # so querying the columns directly improves both memory usage and performance.
        # Refer to https://stackoverflow.com/questions/23185319/why-is-loading-sqlalchemy-objects-via-the-orm-5-8x-slower-than-rows-via-a-raw-my
        documents_query = self.session.query(
            DocumentORM.id,
            DocumentORM.text,
            DocumentORM.vector_id
        ).filter_by(index=index)

        if filters:
            from sqlalchemy.orm import aliased  # local import; module-level in the real file
            for key, values in filters.items():
                # Each filter key needs its own join; reusing a single join would
                # AND together contradictory conditions on MetaORM.name.
                meta_alias = aliased(MetaORM)
                documents_query = documents_query.join(
                    meta_alias, DocumentORM.id == meta_alias.document_id
                ).filter(
                    meta_alias.name == key,
                    meta_alias.value.in_(values)
                )

        documents_map = {}
        for row in documents_query.all():
            documents_map[row.id] = Document(
                id=row.id,
                text=row.text,
                meta=None if row.vector_id is None else {"vector_id": row.vector_id} # type: ignore
            )

        for doc_ids in self.chunked_iterable(documents_map.keys(), size=self.batch_size):
            meta_query = self.session.query(
                MetaORM.document_id,
                MetaORM.name,
                MetaORM.value
            ).filter(MetaORM.document_id.in_(doc_ids))

            for row in meta_query.all():
                if documents_map[row.document_id].meta is None:
                    documents_map[row.document_id].meta = {}
                documents_map[row.document_id].meta[row.name] = row.value # type: ignore

        return list(documents_map.values())
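A usage sketch for filtered retrieval, with the same hypothetical `store` instance; filter keys match meta field names and values are lists of allowed values:

# Fetch all documents from the default index whose "author" meta field is "somebody".
docs = store.get_all_documents(filters={"author": ["somebody"]})
for doc in docs:
    print(doc.id, doc.meta)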
Example #5
    def write_documents(
        self, documents: Union[List[dict], List[Document]], index: Optional[str] = None
    ):
        """
        Add new documents to the DocumentStore.

        :param documents: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index
                          them right away in FAISS. If not, you can later call update_embeddings() to create & index them.
        :param index: (SQL) index name for storing the docs and metadata
        :return: None
        """
        # vector index
        if not self.faiss_index:
            raise ValueError(
                "Couldn't find a FAISS index. Try to init the FAISSDocumentStore() again ..."
            )

        # doc + metadata index
        index = index or self.index
        field_map = self._create_document_field_map()
        document_objects = [
            Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d
            for d in documents
        ]

        add_vectors = document_objects[0].embedding is not None

        if self.update_existing_documents and add_vectors:
            logger.warning(
                "You have enabled the `update_existing_documents` feature, but "
                "`FAISSDocumentStore` does not support updates to an existing `faiss_index`.\n"
                "Please call the `update_embeddings` method to repopulate `faiss_index`."
            )

        for i in range(0, len(document_objects), self.index_buffer_size):
            vector_id = self.faiss_index.ntotal
            if add_vectors:
                embeddings = [
                    doc.embedding
                    for doc in document_objects[i : i + self.index_buffer_size]
                ]
                embeddings = np.array(embeddings, dtype="float32")
                self.faiss_index.add(embeddings)

            docs_to_write_in_sql = []
            for doc in document_objects[i : i + self.index_buffer_size]:
                if add_vectors:
                    # Assign sequential FAISS vector ids so the SQL rows can be
                    # mapped back to their vectors in the index.
                    doc.vector_id = vector_id
                    vector_id += 1
                docs_to_write_in_sql.append(doc)

            super(FAISSDocumentStore, self).write_documents(
                docs_to_write_in_sql, index=index
            )
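A usage sketch, assuming `store` is a FAISSDocumentStore initialized for 768-dimensional vectors (hypothetical variable name). As the docstring above notes, dict documents that already carry an embedding are indexed in FAISS right away:

import numpy as np

docs = [
    {"text": "Berlin is the capital of Germany.",
     "embedding": np.random.rand(768).astype("float32")},
]
store.write_documents(docs)
# Documents written without embeddings can be indexed later, e.g. via
# store.update_embeddings(...), as the docstring above suggests.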
Example #6
File: sql.py Project: linhdb-2149/topdup
    def get_all_documents(
        self,
        index: Optional[str] = None,
        filters: Optional[Dict[str, List[str]]] = None,
        return_embedding: Optional[bool] = None,
    ) -> List[Document]:
        """Gets all documents from the DocumentStore.

        Args:
            index (str, optional): Name of the index to get the documents from. If None,
                DocumentStore's default index (self.index) will be used.
                Defaults to None.
            filters (Dict[str, List[str]], optional): Optional filters to narrow down
                the documents to return.
                Example: {"name": ["some", "more"], "category": ["only_one"]}.
                Defaults to None.
            return_embedding (bool, optional): Whether to return the document embeddings.
                Defaults to None.

        Returns:
            List[Document]
        """

        index = index or self.index
        # Keeping ORM objects in memory generally causes performance issues,
        # so querying the columns directly improves both memory usage and performance.
        documents_query = self.session.query(
            DocumentORM.id, DocumentORM.text, DocumentORM.vector_id
        ).filter_by(index=index)

        if filters:
            from sqlalchemy.orm import aliased  # local import; module-level in the real file
            for key, values in filters.items():
                # Each filter key needs its own join; reusing a single join would
                # AND together contradictory conditions on MetaORM.name.
                meta_alias = aliased(MetaORM)
                documents_query = documents_query.join(
                    meta_alias, DocumentORM.id == meta_alias.document_id
                ).filter(
                    meta_alias.name == key,
                    meta_alias.value.in_(values),
                )

        documents_map = {}
        for row in documents_query.all():
            documents_map[row.id] = Document(
                id=row.id,
                text=row.text,
                meta=None
                if row.vector_id is None
                else {"vector_id": row.vector_id},  # type: ignore
            )

        for doc_ids in self.chunked_iterable(
            documents_map.keys(), size=self.batch_size
        ):
            meta_query = self.session.query(
                MetaORM.document_id, MetaORM.name, MetaORM.value
            ).filter(MetaORM.document_id.in_(doc_ids))

            for row in meta_query.all():
                if documents_map[row.document_id].meta is None:
                    documents_map[row.document_id].meta = {}
                documents_map[row.document_id].meta[
                    row.name
                ] = row.value  # type: ignore

        return list(documents_map.values())