def _convert_sql_row_to_document(self, row) -> Document:
    document = Document(
        id=row.id,
        text=row.text,
        meta={meta.name: meta.value for meta in row.meta}
    )
    if row.vector_id is not None:
        # A plain truthiness check would drop a valid vector_id of 0
        # (FAISS ids start at 0), so compare against None explicitly.
        document.vector_id = row.vector_id
    return document
def train_index(
    self,
    documents: Optional[Union[List[dict], List[Document]]],
    embeddings: Optional[np.ndarray] = None,
):
    """
    Some FAISS indices (e.g. IVF) require initial "training" on a sample of vectors
    before you can add your final vectors. The train vectors should come from the
    same distribution as your final ones.

    You can pass either documents (incl. embeddings) or just the plain embeddings
    that the index shall be trained on.

    :param documents: Documents (incl. the embeddings)
    :param embeddings: Plain embeddings
    :return: None
    """
    # `embeddings` may be a numpy array, whose truth value is ambiguous,
    # so compare against None explicitly.
    if embeddings is not None and documents is not None:
        raise ValueError("Either pass `documents` or `embeddings`. You passed both.")
    if documents:
        document_objects = [Document.from_dict(d) if isinstance(d, dict) else d for d in documents]
        embeddings = np.array([doc.embedding for doc in document_objects], dtype="float32")
    if embeddings is None:
        raise ValueError("You must pass either `documents` or `embeddings` to train the index.")
    self.faiss_index.train(embeddings)
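# A minimal usage sketch (hedged): training an IVF index before adding vectors.
# The `faiss_index_factory_str` argument, the 768-dim size, and the random
# sample below are assumptions for illustration; real training vectors should
# come from the same distribution as the vectors you will eventually index.
#
#   import numpy as np
#   document_store = FAISSDocumentStore(faiss_index_factory_str="IVF256,Flat")
#   sample_embeddings = np.random.rand(10000, 768).astype("float32")
#   document_store.train_index(documents=None, embeddings=sample_embeddings)
#   # Only a trained IVF index accepts vectors via write_documents() / update_embeddings().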
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
    """
    Indexes documents for later queries.

    :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                      For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                      Optionally: Include metadata via
                      {"text": "<the-actual-text>", "meta": {"name": "<some-document-name>", "author": "somebody", ...}}
                      It can be used for filtering and is accessible in the responses of the Finder.
    :param index: add an optional index attribute to documents. It can be later used for filtering.
                  For instance, documents for evaluation can be indexed in a separate index
                  than the documents for search.
    :return: None
    """
    index = index or self.index
    if len(documents) == 0:
        return
    # Make sure we comply with the Document class format
    if isinstance(documents[0], dict):
        document_objects = [Document.from_dict(d) if isinstance(d, dict) else d for d in documents]
    else:
        document_objects = documents

    for i in range(0, len(document_objects), self.batch_size):
        for doc in document_objects[i:i + self.batch_size]:
            meta_fields = doc.meta or {}
            vector_id = meta_fields.get("vector_id")
            meta_orms = [MetaORM(name=key, value=value) for key, value in meta_fields.items()]
            doc_orm = DocumentORM(
                id=doc.id,
                text=doc.text,
                vector_id=vector_id,
                meta=meta_orms,
                index=index,
            )
            if self.update_existing_documents:
                # The old metadata must be deleted first, as session.merge()
                # does not clean up the existing MetaORM rows on its own.
                self.session.query(MetaORM).filter_by(document_id=doc.id).delete()
                self.session.merge(doc_orm)
            else:
                self.session.add(doc_orm)
        try:
            self.session.commit()
        except Exception as ex:
            logger.error(f"Transaction rollback: {ex.__cause__}")
            # The rollback is important here: otherwise self.session is left in
            # an inconsistent state and the next call will fail.
            self.session.rollback()
            raise ex
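# A minimal usage sketch (hedged): writing dictionaries with metadata that can
# later be used for filtering. The SQLDocumentStore construction and the
# sqlite URL below are assumptions for illustration.
#
#   document_store = SQLDocumentStore(url="sqlite:///qa.db")
#   document_store.write_documents([
#       {"text": "FAISS stores dense vectors.", "meta": {"name": "faiss.md", "author": "somebody"}},
#       {"text": "IVF indices require training.", "meta": {"name": "ivf.md"}},
#   ])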
def get_all_documents(
    self,
    index: Optional[str] = None,
    filters: Optional[Dict[str, List[str]]] = None,
    return_embedding: Optional[bool] = None
) -> List[Document]:
    """
    Get documents from the document store.

    :param index: Name of the index to get the documents from. If None, the
                  DocumentStore's default index (self.index) will be used.
    :param filters: Optional filters to narrow down the documents to return.
                    Example: {"name": ["some", "more"], "category": ["only_one"]}
    :param return_embedding: Whether to return the document embeddings.
    """
    index = index or self.index
    # Keeping full ORM objects in memory is a known performance issue, so we
    # query the column attributes directly; this improves both memory use and speed.
    # Refer https://stackoverflow.com/questions/23185319/why-is-loading-sqlalchemy-objects-via-the-orm-5-8x-slower-than-rows-via-a-raw-my
    documents_query = self.session.query(
        DocumentORM.id,
        DocumentORM.text,
        DocumentORM.vector_id
    ).filter_by(index=index)

    if filters:
        documents_query = documents_query.join(MetaORM)
        for key, values in filters.items():
            documents_query = documents_query.filter(
                MetaORM.name == key,
                MetaORM.value.in_(values),
                DocumentORM.id == MetaORM.document_id
            )

    documents_map = {}
    for row in documents_query.all():
        documents_map[row.id] = Document(
            id=row.id,
            text=row.text,
            meta=None if row.vector_id is None else {"vector_id": row.vector_id}  # type: ignore
        )

    # Fetch metadata in batches to keep the IN(...) clause within limits.
    for doc_ids in self.chunked_iterable(documents_map.keys(), size=self.batch_size):
        meta_query = self.session.query(
            MetaORM.document_id,
            MetaORM.name,
            MetaORM.value
        ).filter(MetaORM.document_id.in_(doc_ids))
        for row in meta_query.all():
            if documents_map[row.document_id].meta is None:
                documents_map[row.document_id].meta = {}  # type: ignore
            documents_map[row.document_id].meta[row.name] = row.value  # type: ignore

    return list(documents_map.values())
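# A minimal usage sketch (hedged): fetching documents narrowed by metadata.
# Assumes a populated store; each filter key matches MetaORM.name and the
# listed values are combined via SQL IN(...).
#
#   docs = document_store.get_all_documents(filters={"author": ["somebody"]})
#   for doc in docs:
#       print(doc.id, doc.meta)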
def write_documents(
    self, documents: Union[List[dict], List[Document]], index: Optional[str] = None
):
    """
    Add new documents to the DocumentStore.

    :param documents: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index
                      them right away in FAISS. If not, you can later call update_embeddings() to create & index them.
    :param index: (SQL) index name for storing the docs and metadata
    :return: None
    """
    # vector index
    if not self.faiss_index:
        raise ValueError("Couldn't find a FAISS index. Try to init the FAISSDocumentStore() again ...")

    # doc + metadata index
    index = index or self.index
    field_map = self._create_document_field_map()
    document_objects = [
        Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d
        for d in documents
    ]
    if len(document_objects) == 0:
        return

    add_vectors = document_objects[0].embedding is not None
    if self.update_existing_documents and add_vectors:
        logger.warning(
            "You have enabled the `update_existing_documents` feature, but "
            "`FAISSDocumentStore` does not support updating vectors in an existing `faiss_index`.\n"
            "Please call the `update_embeddings` method to repopulate `faiss_index`."
        )

    for i in range(0, len(document_objects), self.index_buffer_size):
        vector_id = self.faiss_index.ntotal
        if add_vectors:
            embeddings = [doc.embedding for doc in document_objects[i:i + self.index_buffer_size]]
            self.faiss_index.add(np.array(embeddings, dtype="float32"))

        docs_to_write_in_sql = []
        for doc in document_objects[i:i + self.index_buffer_size]:
            if add_vectors:
                # Assign sequential FAISS vector ids so the SQL rows can be
                # mapped back to their vectors.
                doc.vector_id = vector_id
                vector_id += 1
            docs_to_write_in_sql.append(doc)

        super(FAISSDocumentStore, self).write_documents(docs_to_write_in_sql, index=index)
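# A minimal usage sketch (hedged): writing documents that already carry
# embeddings, so vectors are added to FAISS inline while text and metadata go
# to SQL. The 768-dim random vectors are placeholders; the dimensionality must
# match the one the FAISS index was built with.
#
#   import numpy as np
#   docs = [
#       {"text": "hello world", "embedding": np.random.rand(768).astype("float32")},
#       {"text": "goodbye world", "embedding": np.random.rand(768).astype("float32")},
#   ]
#   document_store.write_documents(docs)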