示例#1
0
    def write_documents(self,
                        documents: Union[List[dict], List[Document]],
                        index: Optional[str] = None):
        """
        Store documents through the SQL session so that they can be queried later.

        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          Dictionaries are converted with Document.from_dict; the minimal format is
                          {"text": "<the-actual-text>"}.
                          Optionally: Include meta data via {"text": "<the-actual-text>",
                          "meta": {"name": "<some-document-name>", "author": "somebody", ...}}
                          Every meta key/value pair is stored as its own MetaORM row.
        :param index: optional index attribute stored with every document. When it is not supplied
                      (or falsy), self.index is used. For instance, documents for evaluation can be
                      indexed in a separate index than the documents for search.

        :return: None
        """

        # Convert everything up front so that all entries follow the Document class format
        document_objects = []
        for entry in documents:
            if isinstance(entry, dict):
                entry = Document.from_dict(entry)
            document_objects.append(entry)

        target_index = index or self.index

        for document in document_objects:
            # A document without meta data simply gets no meta rows
            meta_rows = [
                MetaORM(name=name, value=value)
                for name, value in (document.meta or {}).items()
            ]
            self.session.add(
                DocumentORM(id=document.id,
                            text=document.text,
                            meta=meta_rows,
                            index=target_index)
            )

        # One commit for the whole batch
        self.session.commit()
示例#2
0
    def write_documents(self,
                        documents: Union[List[dict], List[Document]],
                        index: Optional[str] = None):
        """
        Keep documents in memory, keyed by their id, for later queries.

        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          Dictionaries are converted with Document.from_dict; the minimal format is
                          {"text": "<the-actual-text>"}.
                          Optionally: Include meta data via {"text": "<the-actual-text>",
                          "meta": {"name": "<some-document-name>", "author": "somebody", ...}}
        :param index: write documents to a custom namespace. When it is not supplied (or falsy),
                      self.index is used. For instance, documents for evaluation can be indexed in a
                      separate index than the documents for search.
        :return: None
        """
        target_index = index or self.index

        # Convert everything first so that a failing conversion leaves the store untouched
        converted = []
        for entry in documents:
            converted.append(Document.from_dict(entry) if isinstance(entry, dict) else entry)

        # A document written with an already known id replaces the stored one
        for doc in converted:
            self.indexes[target_index][doc.id] = doc
示例#3
0
    def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
        """
        Write documents to a newly created FAISS index (if they carry embeddings) and to the parent store.

        The documents are processed in batches of self.index_buffer_size. When embeddings are added,
        each document gets the position of its vector stored in its meta data under "vector_id".

        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          Dictionaries are converted with Document.from_dict. An empty list is a no-op.
        :param index: index passed on to the parent class' write_documents. When it is not supplied
                      (or falsy), self.index is used.
        :raises Exception: if self.faiss_index is already set, as adding to an existing index is not supported.
        :return: None
        """
        if self.faiss_index is not None:
            raise Exception("Addition of more data in an existing index is not supported.")

        document_objects = [Document.from_dict(d) if isinstance(d, dict) else d for d in documents]
        if not document_objects:
            # Nothing to write. self.faiss_index stays unset so that a later call can still add documents.
            return

        faiss_index = self._create_new_index(vector_size=self.vector_size)
        index = index or self.index

        # The first document decides for the whole call whether embeddings are added
        add_vectors = document_objects[0].embedding is not None

        if add_vectors:
            phi = self._get_phi(document_objects)

        for i in range(0, len(document_objects), self.index_buffer_size):
            batch = document_objects[i: i + self.index_buffer_size]

            if add_vectors:
                embeddings = [doc.embedding for doc in batch]
                hnsw_vectors = self._get_hnsw_vectors(embeddings=embeddings, phi=phi)
                faiss_index.add(hnsw_vectors)

                # Count from the batch offset: the vectors of all batches end up in the same index, so
                # restarting at 0 for every batch would give documents of different batches the same id.
                # NOTE(review): assumes the new index is empty and numbers added vectors sequentially.
                for vector_id, doc in enumerate(batch, start=i):
                    if doc.meta is None:
                        # Documents may come without meta data
                        doc.meta = {}
                    doc.meta["vector_id"] = vector_id

            super(FAISSDocumentStore, self).write_documents(batch, index=index)

        # Only expose the index once all batches were written
        self.faiss_index = faiss_index
示例#4
0
    def write_documents(self,
                        documents: Union[List[dict], List[Document]],
                        index: Optional[str] = None):
        """
        Store documents through the SQL session so that they can be queried later.

        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          Dictionaries are converted with Document.from_dict; the minimal format is
                          {"text": "<the-actual-text>"}.
                          Optionally, you can also supply "tags": ["one-tag", "another-one"]
                          or additional meta data via
                          "meta": {"name": "<some-document-name>", "author": "someone", "url": "some-url", ...}
        :param index: optional index attribute stored with every document. When it is not supplied
                      (or falsy), self.index is used. For instance, documents for evaluation can be
                      indexed in a separate index than the documents for search.

        :return: None
        """

        # Convert everything up front so that all entries follow the Document class format
        document_objects = []
        for entry in documents:
            if isinstance(entry, dict):
                entry = Document.from_dict(entry)
            document_objects.append(entry)

        target_index = index or self.index

        for document in document_objects:
            # The meta data is handed over as is and ends up in the meta_data column
            self.session.add(
                DocumentORM(id=document.id,
                            text=document.text,
                            meta_data=document.meta,
                            index=target_index)  # type: ignore
            )

        # One commit for the whole batch
        self.session.commit()
示例#5
0
    def write_documents(self,
                        documents: Union[List[dict], List[Document]],
                        index: Optional[str] = None):
        """
        Indexes documents for later queries in Elasticsearch.

        Documents are sent in one bulk request. The bulk operation is "index" when
        self.update_existing_documents is set and "create" otherwise.

        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          Dictionaries are converted with Document.from_dict; the minimal format is
                          {"text": "<the-actual-text>"}.
                          Optionally: Include meta data via {"text": "<the-actual-text>",
                          "meta": {"name": "<some-document-name>", "author": "somebody", ...}}
                          The entries of "meta" are stored as top-level fields of the document.
                          Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
                          should be changed to what you have set for self.text_field and self.name_field.
        :param index: Elasticsearch index where the documents should be indexed. It is created if it does not
                      exist yet. If not supplied, self.index will be used.
        :return: None
        """

        if index is None:
            index = self.index
        elif index and not self.client.indices.exists(index=index):
            # Only an explicitly requested index is created on the fly
            self._create_document_index(index)

        # Make sure we comply to Document class format
        documents_objects = []
        for entry in documents:
            documents_objects.append(Document.from_dict(entry) if isinstance(entry, dict) else entry)

        actions = []
        for document in documents_objects:
            operation = "index" if self.update_existing_documents else "create"
            action: Dict[str, Any] = {"_op_type": operation, "_index": index, **document.to_dict()}

            # Elasticsearch expects the identifier under "_id"
            action["_id"] = str(action.pop("id"))

            # Neither the query score nor empty fields are indexed
            action = {
                key: value
                for key, value in action.items()
                if key != "query_score" and value is not None
            }

            # In order to have a flat structure in elastic + similar behaviour to the other DocumentStores,
            # all values within "meta" are moved to the top level
            if "meta" in action:
                action.update(action["meta"])
                del action["meta"]

            actions.append(action)

        bulk(self.client, actions, request_timeout=300, refresh="wait_for")
示例#6
0
def no_answer_prediction(no_answer_reader, test_docs_xs):
    """Return the reader's top 5 prediction for a fixed question over the given test documents.

    Entries of test_docs_xs that are dictionaries are converted with Document.from_dict,
    all other entries are passed to the reader unchanged.
    """
    docs = []
    for entry in test_docs_xs:
        if isinstance(entry, dict):
            entry = Document.from_dict(entry)
        docs.append(entry)

    return no_answer_reader.predict(question="What is the meaning of life?",
                                    documents=docs,
                                    top_k=5)
示例#7
0
def prediction(reader, test_docs_xs):
    """Return the reader's top 5 prediction for a fixed question over the given test documents.

    Entries of test_docs_xs that are dictionaries are converted with Document.from_dict,
    all other entries are passed to the reader unchanged.
    """
    docs = []
    for entry in test_docs_xs:
        if isinstance(entry, dict):
            entry = Document.from_dict(entry)
        docs.append(entry)

    return reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5)
示例#8
0
def test_top_k(test_docs_xs):
    """Check that a FARMReader returns exactly top_k answers for several values of top_k."""
    # TODO parametrize top_k and farm/transformers reader using pytest
    # TODO transformers reader was crashing when tested on this

    docs = []
    for entry in test_docs_xs:
        docs.append(Document.from_dict(entry) if isinstance(entry, dict) else entry)

    # The same reader is reused for all values of top_k
    farm_reader = FARMReader(
        model_name_or_path="distilbert-base-uncased-distilled-squad",
        num_processes=0,
        use_gpu=False,
        top_k_per_sample=4,
        no_ans_boost=None,
        top_k_per_candidate=4,
    )

    for top_k in (2, 5, 10):
        answers = farm_reader.predict(question="Who lives in Berlin?", documents=docs, top_k=top_k)["answers"]
        assert len(answers) == top_k
示例#9
0
def test_context_window_size(test_docs_xs):
    """Check the length of the answer contexts of a FARMReader for several context window sizes."""
    # TODO parametrize window_size and farm/transformers reader using pytest
    docs = []
    for entry in test_docs_xs:
        docs.append(Document.from_dict(entry) if isinstance(entry, dict) else entry)

    for window_size in (10, 15, 20):
        # A new reader is needed per size, as the window size is set on construction
        farm_reader = FARMReader(
            model_name_or_path="distilbert-base-uncased-distilled-squad",
            num_processes=0,
            use_gpu=False,
            top_k_per_sample=5,
            no_ans_boost=None,
            context_window_size=window_size,
        )
        answers = farm_reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5)["answers"]

        for answer in answers:
            answer_length = len(answer["answer"])
            context_length = len(answer["context"])

            # TODO Currently the behaviour of context_window_size in FARMReader and TransformerReader is different
            if answer_length > window_size:
                # If the extracted answer is larger than the context window, the context window is expanded.
                assert answer_length == context_length
            else:
                # If the extracted answer is odd in length, the resulting context window is one less than
                # context_window_size due to rounding (See FARM's QACandidate)
                assert context_length in [window_size, window_size - 1]
示例#10
0
    def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
        """
        Keep documents in memory, keyed by their id, and register their tags.

        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          Dictionaries are converted with Document.from_dict; the minimal format is
                          {"text": "<the-actual-text>"}.
                          Optionally, you can also supply "tags": ["one-tag", "another-one"]
                          or additional meta data via
                          "meta": {"name": "<some-document-name>", "author": "someone", "url": "some-url", ...}
        :param index: write documents to a custom namespace. When it is not supplied (or falsy),
                      self.index is used. For instance, documents for evaluation can be indexed in a
                      separate index than the documents for search.
        :return: None
        """
        target_index = index or self.index

        # Convert everything first so that a failing conversion leaves the store untouched
        converted = []
        for entry in documents:
            converted.append(Document.from_dict(entry) if isinstance(entry, dict) else entry)

        for doc in converted:
            # A document written with an already known id replaces the stored one
            self.indexes[target_index][doc.id] = doc

            # TODO fix tags after id refactoring
            self._map_tags_to_ids(doc.id, doc.tags)