Exemplo n.º 1
0
def get_retriever(retriever_type, document_store):
    """Build a retriever fixture of the requested type on *document_store*.

    Raises a generic ``Exception`` for an unrecognised ``retriever_type``.
    """
    if retriever_type == "dpr":
        return DensePassageRetriever(
            document_store=document_store,
            query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
            passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
            use_gpu=False,
            embed_title=True,
        )
    if retriever_type == "tfidf":
        tfidf = TfidfRetriever(document_store=document_store)
        # TF-IDF needs an explicit fit over the stored documents before use.
        tfidf.fit()
        return tfidf
    if retriever_type == "embedding":
        return EmbeddingRetriever(
            document_store=document_store,
            embedding_model="deepset/sentence_bert",
            use_gpu=False,
        )
    if retriever_type == "retribert":
        return EmbeddingRetriever(
            document_store=document_store,
            embedding_model="yjernite/retribert-base-uncased",
            model_format="retribert",
            use_gpu=False,
        )
    if retriever_type == "elasticsearch":
        return ElasticsearchRetriever(document_store=document_store)
    if retriever_type == "es_filter_only":
        return ElasticsearchFilterOnlyRetriever(document_store=document_store)
    raise Exception(f"No retriever fixture for '{retriever_type}'")
Exemplo n.º 2
0
def test_faq_retriever_in_memory_store():
    """FAQ-style retrieval over an in-memory store: for a question similar
    to the single relevant FAQ entry, exactly one answer comes back."""
    from haystack.database.memory import InMemoryDocumentStore
    from haystack.retriever.dense import EmbeddingRetriever

    document_store = InMemoryDocumentStore(embedding_field="embedding")

    # One relevant FAQ entry followed by ten identical distractors.
    faq_questions = ['How to test this library?'] + ['blah blah blah'] * 10
    documents = [
        {'text': 'By running tox in the command line!',
         'meta': {'name': q, 'question': q}}
        for q in faq_questions
    ]

    retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", gpu=False)

    # Attach a question embedding to each document before indexing.
    for doc in documents:
        doc['embedding'] = retriever.embed([doc['meta']['question']])[0]

    document_store.write_documents(documents)

    finder = Finder(reader=None, retriever=retriever)
    prediction = finder.get_answers_via_similar_questions(question="How to test this?", top_k_retriever=1)

    assert len(prediction.get('answers', [])) == 1
Exemplo n.º 3
0
def main():
    """End-to-end FAQ-style QA demo: index German text documents into
    Elasticsearch with per-text embeddings and query them via a Finder."""
    POPULATE_DOCUMENT_STORE = True

    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="",
                                                index="document",
                                                text_field="text",
                                                embedding_field="question_emb",
                                                embedding_dim="768",
                                                excluded_meta_data=["question_emb"])

    # NOTE(review): Windows-style path separators — this script apparently
    # assumes it runs on Windows; confirm before porting.
    retriever = EmbeddingRetriever(
        document_store=document_store, embedding_model=os.getcwd() +
        "\\kbQA\\bert-german-model",
        gpu=True, model_format="transformers")

    if POPULATE_DOCUMENT_STORE:
        doc_dir = os.getcwd() + "\\kbQA\\data\\Skripte\\Securplus\\txt"
        dicts = convert_files_to_dicts(
            dir_path=doc_dir, clean_func=clean_text, split_paragraphs=True)

        # Dump the cleaned text of every document for manual inspection.
        with open("Output.txt", "w") as text_file:
            text = ""
            for doc in dicts:
                text = text + "\n" + doc["text"]
            text_file.write(text)
        df = pd.DataFrame.from_dict(dicts)

        # Careful here! At this point we do not create embeddings for the
        # questions but for the texts, i.e. the answers — so the variable
        # names are somewhat misleading.
        # dummy_questions is simply an increasing number starting at one; it
        # is needed because otherwise exceptions are thrown during search.
        # The tutorial assumes an FAQ setting where question and answer are
        # predefined, so embeddings are built for the predefined questions
        # and the k best candidates are returned based on them. Here, in
        # contrast, we create embeddings for each individual text.
        # todo: Since we embed whole texts, we may need sentence
        #       segmentation: the longer a text gets, the less precise its
        #       embedding — per-sentence embeddings are considerably more
        #       exact.
        questions = list(df["text"].values)
        df["question_emb"] = retriever.create_embedding(texts=questions)
        dummy_questions = [f"{no}" for no, x in enumerate(questions, start=1)]
        df["question"] = dummy_questions
        print(df.head())

        docs_to_index = df.to_dict(orient="records")
        document_store.write_documents(docs_to_index)

    # question = "Wie viele haben Angst um ihren Job?"
    question = "welche leistungen sind ausgeschlossen?"
    # As elsewhere in this project: lower-casing the query is mandatory.
    question = question.lower()

    # We currently cannot use a Reader, since readers apparently require QA
    # fine-tuning. The retriever fetches the best hits via the embeddings;
    # get_answers() is not usable without a reader.
    finder = Finder(reader=None, retriever=retriever)
    prediction = finder.get_answers_via_similar_questions(
        question, top_k_retriever=5)
    print_answers(prediction, details="all")
Exemplo n.º 4
0
def test_faiss_retrieving(document_store):
    """All indexed documents should be retrievable for a query, and each
    result must be a Document instance."""
    document_store.write_documents(DOCUMENTS)

    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model="deepset/sentence_bert",
                                   use_gpu=False)
    result = retriever.retrieve(query="How to test this?")
    assert len(result) == len(DOCUMENTS)
    # Fix: isinstance instead of exact type comparison — subclasses of
    # Document are equally valid results (and `type(x) == T` is unidiomatic).
    assert isinstance(result[0], Document)
Exemplo n.º 5
0
def test_faiss_retrieving(index_factory):
    """Exercise FAISS retrieval for a given faiss index-factory string.

    IVF-style indexes require training before documents can be written.
    """
    document_store = FAISSDocumentStore(
        sql_url="sqlite:///haystack_test_faiss.db",
        faiss_index_factory_str=index_factory)
    document_store.delete_all_documents(index="document")
    if "ivf" in index_factory.lower():
        document_store.train_index(DOCUMENTS)
    document_store.write_documents(DOCUMENTS)
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model="deepset/sentence_bert",
                                   use_gpu=False)
    result = retriever.retrieve(query="How to test this?")
    assert len(result) == len(DOCUMENTS)
    # Fix: isinstance instead of exact type comparison — subclasses of
    # Document are equally valid results.
    assert isinstance(result[0], Document)
Exemplo n.º 6
0
def get_retriever(retriever_type, document_store):
    """Build a retriever fixture of the requested type on *document_store*.

    Raises:
        Exception: if *retriever_type* is not a known fixture name.
    """
    if retriever_type == "dpr":
        retriever = DensePassageRetriever(
            document_store=document_store,
            query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
            passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
            use_gpu=False,
            embed_title=True,
            remove_sep_tok_from_untitled_passages=True)
    elif retriever_type == "tfidf":
        return TfidfRetriever(document_store=document_store)
    elif retriever_type == "embedding":
        retriever = EmbeddingRetriever(document_store=document_store,
                                       embedding_model="deepset/sentence_bert",
                                       use_gpu=False)
    elif retriever_type in ("elasticsearch", "elsticsearch"):
        # Bug fix: this branch previously matched only the misspelling
        # "elsticsearch", so the correctly spelled type name raised. Accept
        # both spellings for backward compatibility with existing callers.
        retriever = ElasticsearchRetriever(document_store=document_store)
    elif retriever_type == "es_filter_only":
        retriever = ElasticsearchFilterOnlyRetriever(
            document_store=document_store)
    else:
        raise Exception(f"No retriever fixture for '{retriever_type}'")

    return retriever
Exemplo n.º 7
0
    def load(self):
        """Lazily build both QA stacks on first call.

        Stack 2 (FAQ): FAISS store + EmbeddingRetriever, wrapped in a
        reader-less Finder (answers via similar questions only).
        Stack 1 (extractive): SQL store + TfidfRetriever + FARMReader.
        Every component is created only if it does not exist yet, so
        repeated calls are cheap no-ops.
        """
        # Fast path: everything is already wired up.
        if(self.finder and self.finder2):
            return
        if(not self.document_store2):
            # The FAISS index file must have been saved during preprocessing.
            self.document_store2 = FAISSDocumentStore.load(
                sql_url=sqlUrlFAQ, faiss_file_path='faiss2')  # save before load in preprocess
            self.initSql(url=sqlUrlFAQ, document_store=self.document_store2)
        # else:  # reset session
        #     # self.document_store2.session.close()
        #     super(
        #         FAISSDocumentStore, self.document_store2).__init__(url=sqlUrlFAQ)
        if(not self.retriever2):
            # Embedding model loaded from a local directory ("sentence_bert-saved").
            self.retriever2 = EmbeddingRetriever(document_store=self.document_store2,
                                                 embedding_model="sentence_bert-saved", use_gpu=False)
        if(not self.finder2):
            # FAQ finder: retriever only, no extractive reader.
            self.finder2 = Finder(reader=None, retriever=self.retriever2)

        if(not self.document_store):
            self.document_store = SQLDocumentStore(url=sqlUrl)
            #FAISSDocumentStore.load(faiss_file_path='faiss1', sql_url=sqlUrl)

            self.initSql(url=sqlUrl, document_store=self.document_store)
        # else:  # reset session
        #     # self.document_store.session.close()
        #     super(
        #         FAISSDocumentStore, self.document_store).__init__(url=sqlUrl)
        # self.retriever = EmbeddingRetriever( #reduce load by sharing the same retriever and set store on the fly??
        #     document_store=self.document_store, embedding_model="sentence_bert-saved", use_gpu=False) if not self.retriever else self.retriever
        if(not self.retriever):
            self.retriever = TfidfRetriever(document_store=self.document_store)
        # no_ans_boost=0 — presumably neutralises the "no answer" bias;
        # confirm against the FARMReader documentation.
        self.reader = FARMReader(model_name_or_path=modelDir,
                                 use_gpu=False, no_ans_boost=0) if not self.reader else self.reader
        # reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)
        self.finder = Finder(
            self.reader, self.retriever) if not self.finder else self.finder
Exemplo n.º 8
0
    def _setup_retriever(self, use_gpu, quantize_model):
        """Create an EmbeddingRetriever on this instance's document store.

        When running on CPU and quantisation is requested, the retriever's
        model is swapped for its quantised variant before being returned.
        """
        embedding_retriever = EmbeddingRetriever(
            document_store=self.document_store,
            embedding_model=self.model_name,
            use_gpu=use_gpu)

        wants_quantized_cpu_model = quantize_model and not use_gpu
        if wants_quantized_cpu_model:
            self.set_quantized_model(embedding_retriever)

        return embedding_retriever
Exemplo n.º 9
0
    def setup(self):
        """Build the full question-answering pipeline.

        Wires an Elasticsearch store (sparse/BM25) and a FAISS store (dense)
        behind three retrievers (ES, DPR, embedding), merges their results,
        then routes queries either to a question generator or a table
        retriever via a query classifier. Must run before queries are served.
        """
        print("SETTING UP PIPELINE")
        self.document_store = ElasticsearchDocumentStore(
            similarity="dot_product", host="elasticsearch", username="", password="", index="document")
        # Dense store; Postgres credentials come from config() lookups.
        self.document_store_faiss = FAISSDocumentStore(
            index="document",
            faiss_index_factory_str="Flat",
            return_embedding=True,
            sql_url=f"postgresql://{config('POSTGRES_USER')}:{config('POSTGRES_PASSWORD')}@{config('POSTGRES_HOST')}:{config('POSTGRES_PORT')}/faiss"
        )
        # Ingest documents and tables before the retrievers are built.
        processor, converter = self.write_as4_docs()
        table_data = self.write_table_docs(converter, processor)

        es_retriever = ElasticsearchRetriever(
            document_store=self.document_store)
        print("SETTING UP DPR")
        dpr_retriever = DPRTrainingManager.get_current_retriever(
            self.document_store_faiss)
        print("SETTING UP EMBEDDINGS")
        embedding_retriever = EmbeddingRetriever(
            document_store=self.document_store_faiss,
            embedding_model="deepset/sentence_bert"
        )
        query_classifier = QueryClassifier()
        print("SETTING UP TABLE")
        table_retriever = TableRetriever(table_data)
        print("SETUP RETRIEVERS")
        self.question_generator = FurtherQuestionGenerator()
        print("UPDATING EMBEDDINGS")
        # Embeddings must exist in the FAISS store before dense retrieval works.
        self.document_store_faiss.update_embeddings(dpr_retriever)
        print("UPDATED EMBEDDINGS")
        self.dpr_node = ContinualDPRNode(
            dpr_retriever, self.document_store_faiss)
        result = Result()
        self.trainer = DPRTrainingManager(
            self.document_store_faiss, self.dpr_node)
        print("SETUP COMPONENTS")
        pipeline = Pipeline()
        # The three retrievers all consume the raw query in parallel ...
        pipeline.add_node(component=es_retriever,
                          name="ESRetriever", inputs=["Query"])
        pipeline.add_node(component=self.dpr_node,
                          name="DPRRetriever", inputs=["Query"])
        pipeline.add_node(component=embedding_retriever,
                          name="EmbeddingRetriever", inputs=["Query"])
        # ... and their document lists are merged into a single result set.
        pipeline.add_node(component=JoinDocuments(join_mode="merge"), name="JoinResults", inputs=[
                          "DPRRetriever", "EmbeddingRetriever", "ESRetriever"])
        # Classifier fans out: output_1 -> question generation,
        # output_2 -> table retrieval.
        pipeline.add_node(component=query_classifier,
                          name="QueryClassifier", inputs=["JoinResults"])
        pipeline.add_node(component=self.question_generator,
                          name="QnGenerator", inputs=["QueryClassifier.output_1"])
        pipeline.add_node(component=table_retriever, name="TableRetriever", inputs=[
                          "QueryClassifier.output_2"])
        pipeline.add_node(component=result, name="Result", inputs=[
                          "QnGenerator", "TableRetriever"])
        self.pipeline = pipeline
        print("SETUP PIPELINE")
Exemplo n.º 10
0
def test_faiss_finding(document_store):
    """An FAQ-style query through the Finder should yield exactly one answer."""
    document_store.write_documents(DOCUMENTS)

    embedding_retriever = EmbeddingRetriever(document_store=document_store,
                                             embedding_model="deepset/sentence_bert",
                                             use_gpu=False)
    faq_finder = Finder(reader=None, retriever=embedding_retriever)

    prediction = faq_finder.get_answers_via_similar_questions(
        question="How to test this?", top_k_retriever=1)

    answers = prediction.get('answers', [])
    assert len(answers) == 1
Exemplo n.º 11
0
def get_retriever(retriever_name, doc_store):
    """Build a retriever of the requested kind on *doc_store*.

    Raises:
        ValueError: for an unknown *retriever_name*. (The original silently
        fell through and returned None, which only failed later at the call
        site; sibling helpers in this file raise instead.)
    """
    if retriever_name == "elastic":
        return ElasticsearchRetriever(doc_store)
    if retriever_name == "tfidf":
        return TfidfRetriever(doc_store)
    if retriever_name == "dpr":
        return DensePassageRetriever(
            document_store=doc_store,
            query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
            passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
            use_gpu=True,
            use_fast_tokenizers=False)
    if retriever_name == "sentence_transformers":
        return EmbeddingRetriever(document_store=doc_store,
                                  embedding_model="nq-distilbert-base-v1",
                                  use_gpu=True,
                                  model_format="sentence_transformers")
    raise ValueError(f"Unknown retriever '{retriever_name}'")
Exemplo n.º 12
0
    def __init__(self, id, add_sample_data=False):
        """FAQ-QA model: an Elasticsearch store indexed under this model's
        id, plus an embedding retriever wrapped in a reader-less Finder.

        Optionally seeds the index with sample FAQ data.
        """
        Model.__init__(self, id)

        store = ElasticsearchDocumentStore(
            host=DB_HOST,
            port=DB_PORT,
            index=self.id,
            embedding_field="question_emb",
            embedding_dim=768,
            excluded_meta_data=["question_emb"])
        question_retriever = EmbeddingRetriever(
            document_store=store,
            embedding_model="deepset/sentence_bert",
            use_gpu=False)

        # FAQ-style QA: retrieval by question similarity, no reader needed.
        self.finder = Finder(reader=None, retriever=question_retriever)

        if add_sample_data:
            add_sample_data_faq_qa(self)
Exemplo n.º 13
0
    # Retry until Elasticsearch is reachable; the container may still be
    # starting up when this code first runs.
    while True:
        try:
            # 512 dimensions because that is what the sentence transformer returns
            document_store = ElasticsearchDocumentStore(
                host="elasticsearch",
                username="",
                password="",
                index="document",
                embedding_dim=512,
                embedding_field="embedding")
            break
        except:
            # NOTE(review): bare except also swallows SystemExit and
            # KeyboardInterrupt — consider narrowing to Exception.
            time.sleep(15)

    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model=retriever_model_name_full,
                                   model_format=retriever_model_type,
                                   gpu=False)
    # Index documents only once: skip when the store is already populated.
    if document_store.get_document_count() < 1:
        dicts = convert_files_to_dicts(dir_path=data_path,
                                       clean_func=clean_text,
                                       split_paragraphs=True)

        logging.info("files to dicts done.")
        # write dicts containing the texts to the database
        document_store.write_documents(dicts)
        logging.info("documents to store written.")
        # generate embeddings for each text and add it to the database entry
        document_store.update_embeddings(retriever)
        logging.info("embeddings to documents in store written.")

    finder = Finder(retriever=retriever, reader=None)
 document_store = ElasticsearchDocumentStore(
     host="localhost",
     username="",
     password="",
     index="document",
     text_field="text",
     embedding_field="question_emb",
     embedding_dim="768",
     excluded_meta_data=["question_emb"])
 # ## Initalize Retriever, Reader,  & Finder
 # ### Retriever
 # gpu= True to speed up processing
 # BERT-Model is trained to use the second extraction layer
 retriever = EmbeddingRetriever(document_store=document_store,
                                embedding_model=os.getcwd() +
                                "/kbQA/bert-german-model",
                                gpu=True,
                                model_format="transformers",
                                emb_extraction_layer=-2)
 if POPULATE_DOCUMENT_STORE:
     # set path to directory containing the embeddings
     doc_dir = os.getcwd() + "/kbQA/data/tesla_embs.json"
     # initialize dataframe with column names
     df = pd.DataFrame(columns=['name', 'text', 'question_emb',
                                'question'], )
     # open the file
     with open(doc_dir, encoding="utf-8") as file:
         # initialize question indexing
         i = 0
         # each line has multiple paragraphs and embeddings, read file line
         # by line
         for cnt, line in enumerate(file):
Exemplo n.º 15
0
def tutorial12_lfqa():
    """
    Long-form question answering (LFQA) tutorial.

    Document Store:
    FAISS is a library for efficient similarity search on a cluster of dense vectors.
    The `FAISSDocumentStore` uses a SQL(SQLite in-memory by default) database under-the-hood
    to store the document text and other meta data. The vector embeddings of the text are
    indexed on a FAISS Index that later is queried for searching answers.
    The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for
    faster search at the expense of some accuracy. Just set the faiss_index_factory_str argument in the constructor.
    For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
    """

    from haystack.document_store.faiss import FAISSDocumentStore

    # vector_dim must match the retriever model's embedding size (128 here).
    document_store = FAISSDocumentStore(vector_dim=128,
                                        faiss_index_factory_str="Flat")
    """
    Cleaning & indexing documents:
    Similarly to the previous tutorials, we download, convert and index some Game of Thrones articles to our DocumentStore
    """

    # Let's first get some files that we want to use
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Convert files to dicts
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)

    # Now, let's write the dicts containing documents to our DB.
    document_store.write_documents(dicts)
    """
    Initalize Retriever and Reader/Generator:
    We use a `RetribertRetriever` and we invoke `update_embeddings` to index the embeddings of documents in the `FAISSDocumentStore`
    """

    from haystack.retriever.dense import EmbeddingRetriever

    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model="yjernite/retribert-base-uncased",
        model_format="retribert")

    document_store.update_embeddings(retriever)
    """Before we blindly use the `RetribertRetriever` let's empirically test it to make sure a simple search indeed finds the relevant documents."""

    from haystack.utils import print_answers, print_documents
    from haystack.pipeline import DocumentSearchPipeline

    p_retrieval = DocumentSearchPipeline(retriever)
    res = p_retrieval.run(query="Tell me something about Arya Stark?",
                          top_k_retriever=5)
    print_documents(res, max_text_len=512)
    """
    Similar to previous Tutorials we now initalize our reader/generator.
    Here we use a `Seq2SeqGenerator` with the *yjernite/bart_eli5* model (see: https://huggingface.co/yjernite/bart_eli5)
    """

    generator = Seq2SeqGenerator(model_name_or_path="yjernite/bart_eli5")
    """
    Pipeline:
    With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline.
    Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
    To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `GenerativeQAPipeline` that combines a retriever and a reader/generator to answer our questions.
    You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    """

    from haystack.pipeline import GenerativeQAPipeline
    pipe = GenerativeQAPipeline(generator, retriever)
    """Voilà! Ask a question!"""

    query_1 = "Why did Arya Stark's character get portrayed in a television adaptation?"
    result_1 = pipe.run(query=query_1, top_k_retriever=1)
    print(f"Query: {query_1}")
    print(f"Answer: {result_1['answers'][0]}")
    print()

    query_2 = "What kind of character does Arya Stark play?"
    result_2 = pipe.run(query=query_2, top_k_retriever=1)
    print(f"Query: {query_2}")
    print(f"Answer: {result_2['answers'][0]}")
    print()
    # NOTE(review): this repeats the previous query and discards the result —
    # looks like a leftover from editing; confirm it is intentional.
    pipe.run(query=query_2, top_k_retriever=1)
Exemplo n.º 16
0
#                                   max_seq_len_query=64,
#                                   max_seq_len_passage=256,
#                                   batch_size=16,
#                                   use_gpu=True,
#                                   embed_title=True,
#                                   use_fast_tokenizers=True)


# Get dataframe with columns "question", "answer" and some custom metadata
df = pd.read_csv("faq.csv")
# Minimal cleaning
df.fillna(value="", inplace=True)
df["question"] = df["question"].apply(lambda x: x.strip())
print(df.head())

# Get embeddings for our questions from the FAQs
# questions = list(df["question"].values)
# df["question_emb"] = retriever2.embed_queries(texts=questions)
# text is the field to be converted to embeddings
df = df.rename(columns={"question": "text"})

# Convert Dataframe to list of dicts and index them in our DocumentStore
docs_to_index = df.to_dict(orient="records")
# NOTE(review): this wipes the whole index before re-indexing the FAQ data.
document_store.delete_all_documents()
document_store.write_documents(docs_to_index)


# Embedding retriever over a locally saved sentence-BERT model; embeddings
# are (re)computed for all stored documents, then the FAISS index is saved.
retriever2 = EmbeddingRetriever(
    document_store=document_store, embedding_model="sentence_bert-saved", use_gpu=False)
document_store.update_embeddings(retriever2)
document_store.save('faiss2')
# Alternative retriever - double BERT neural networks for question and doc embedding
from haystack.retriever.dense import DensePassageRetriever
dpr_retriever = DensePassageRetriever(
    document_store=document_store_FAISS,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True,
    embed_title=True,
)

document_store_FAISS.update_embeddings(
    retriever=dpr_retriever)  # possible training of dpr model

# Alternative retriever - single BERT to embed both question and doc, may be better for similar documents (our case)
from haystack.retriever.dense import EmbeddingRetriever
embedding_retriever = EmbeddingRetriever(document_store=document_store_FAISS,
                                         embedding_model="deepset/sentence_bert")

# Reader to further scan with Hugging Face models
# reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)
# reader = FARMReader(model_name_or_path="deepset/bert-large-uncased-whole-word-masking-squad2", use_gpu=True)

# Decide whether the answer should be retrieved from the tables or the general texts
class QueryClassifier:
    outgoing_edges = 2

    def run(self, **kwargs):
        #print("Running Query Classifier")
        # print(len(kwargs["documents"]))
        # print(kwargs["documents"][0].meta)
def tutorial4_faq_style_qa():
    """FAQ-style QA: answer user questions by matching them against existing
    FAQ question/answer pairs via question-embedding similarity.

    Pros: very fast inference, reuses curated FAQ data, good control over
    answers. Con: only questions similar to existing FAQ entries can be
    answered. A combination with extractive QA can be interesting.
    Optionally launches a local Elasticsearch container first.
    """
    LAUNCH_ELASTICSEARCH = False

    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        # NOTE(review): a single shell string inside a list with shell=True
        # works on POSIX but is fragile; prefer a plain string, or a list of
        # arguments with shell=False.
        status = subprocess.run([
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'
        ],
                                shell=True)
        if status.returncode:
            # Fix: the implicit string concatenation was missing a space
            # ("...instancethen set...").
            raise Exception(
                "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance "
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(30)

    # Store configuration specific to FAQ-style QA:
    # * `embedding_field`/`embedding_dim` hold the FAQ-question embeddings
    # * `excluded_meta_data` keeps the bulky vectors out of search results
    document_store = ElasticsearchDocumentStore(
        host="localhost",
        username="",
        password="",
        index="document",
        embedding_field="question_emb",
        embedding_dim=768,
        excluded_meta_data=["question_emb"],
        similarity="cosine")

    # Retrieval is by vector similarity of questions (user question vs. FAQ
    # questions) rather than plain BM25.
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model="deepset/sentence_bert",
                                   use_gpu=True)

    # Download a csv containing some FAQ data
    # Here: Some question-answer pairs related to COVID-19
    temp = requests.get(
        "https://raw.githubusercontent.com/deepset-ai/COVID-QA/master/data/faqs/faq_covidbert.csv"
    )
    # Fix: close the file deterministically — `open(...).write(...)` leaks
    # the handle until garbage collection.
    with open('small_faq_covid.csv', 'wb') as csv_file:
        csv_file.write(temp.content)

    # Get dataframe with columns "question", "answer" and some custom metadata
    df = pd.read_csv("small_faq_covid.csv")
    # Minimal cleaning
    df.fillna(value="", inplace=True)
    df["question"] = df["question"].apply(lambda x: x.strip())
    print(df.head())

    # Embed the FAQ questions; "text" is the field the store indexes, so
    # rename "question" accordingly before writing the documents.
    questions = list(df["question"].values)
    df["question_emb"] = retriever.embed_queries(texts=questions)
    df = df.rename(columns={"question": "text"})

    # Convert Dataframe to list of dicts and index them in our DocumentStore
    docs_to_index = df.to_dict(orient="records")
    document_store.write_documents(docs_to_index)

    # Initialize a Pipeline (this time without a reader) and ask questions.
    from haystack.pipeline import FAQPipeline
    pipe = FAQPipeline(retriever=retriever)

    prediction = pipe.run(query="How is the virus spreading?",
                          top_k_retriever=10)
    print_answers(prediction, details="all")
Exemplo n.º 19
0
# FAQ-style QA setup: store FAQ-question embeddings alongside the documents
# and keep the bulky vectors out of search results.
document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="document",
    embedding_field="question_emb",
    embedding_dim=768,
    excluded_meta_data=["question_emb"])

### Create a Retriever using embeddings
# Instead of retrieving via Elasticsearch's plain BM25, we want to use vector similarity of the questions (user question vs. FAQ ones).
# We can use the `EmbeddingRetriever` for this purpose and specify a model that we use for the embeddings.
#
retriever = EmbeddingRetriever(document_store=document_store,
                               embedding_model="deepset/sentence_bert",
                               use_gpu=True)

# Download a csv containing some FAQ data
# Here: Some question-answer pairs related to COVID-19
temp = requests.get(
    "https://raw.githubusercontent.com/deepset-ai/COVID-QA/master/data/faqs/faq_covidbert.csv"
)
# Fix: close the file deterministically — `open(...).write(...)` leaks the
# handle until garbage collection (and indefinitely on non-CPython runtimes).
with open('small_faq_covid.csv', 'wb') as csv_file:
    csv_file.write(temp.content)

# Get dataframe with columns "question", "answer" and some custom metadata
df = pd.read_csv("small_faq_covid.csv")
# Minimal cleaning
df.fillna(value="", inplace=True)
df["question"] = df["question"].apply(lambda x: x.strip())
print(df.head())
Exemplo n.º 20
0
    password=DB_PW,
    index=DB_INDEX,
    scheme=ES_CONN_SCHEME,
    ca_certs=False,
    verify_certs=False,
    text_field=TEXT_FIELD_NAME,
    search_fields=SEARCH_FIELD_NAME,
    embedding_dim=EMBEDDING_DIM,
    embedding_field=EMBEDDING_FIELD_NAME,
    excluded_meta_data=EXCLUDE_META_DATA_FIELDS,  # type: ignore
    faq_question_field=FAQ_QUESTION_FIELD_NAME,
)

# Select the retriever implementation from the RETRIEVER_TYPE config value.
if RETRIEVER_TYPE == "EmbeddingRetriever":
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model=EMBEDDING_MODEL_PATH,
                                   model_format=EMBEDDING_MODEL_FORMAT,
                                   use_gpu=USE_GPU)  # type: BaseRetriever
elif RETRIEVER_TYPE == "ElasticsearchRetriever":
    retriever = ElasticsearchRetriever(document_store=document_store)
elif RETRIEVER_TYPE is None or RETRIEVER_TYPE == "ElasticsearchFilterOnlyRetriever":
    # Filter-only retrieval is also the default when no type is configured.
    retriever = ElasticsearchFilterOnlyRetriever(document_store=document_store)
else:
    # Fix: the original implicit string concatenation was missing a
    # separator, producing "...NoneOR modify..." in the error message.
    raise ValueError(
        f"Could not load Retriever of type '{RETRIEVER_TYPE}'. "
        f"Please adjust RETRIEVER_TYPE to one of: "
        f"'EmbeddingRetriever', 'ElasticsearchRetriever', 'ElasticsearchFilterOnlyRetriever', None "
        f"OR modify rest_api/search.py to support your retriever")

if READER_MODEL_PATH:  # for extractive doc-qa
    if READER_TYPE == "TransformersReader":
        use_gpu = -1 if not USE_GPU else GPU_NUMBER
Exemplo n.º 21
0
def embedding_retriever(faiss_document_store):
    """Return a CPU-only Sentence-BERT EmbeddingRetriever backed by the given FAISS store."""
    faq_retriever = EmbeddingRetriever(
        use_gpu=False,
        embedding_model="deepset/sentence_bert",
        document_store=faiss_document_store,
    )
    return faq_retriever
def test_embedding_retriever(document_store):
    """Index FAQ-style documents (question in meta, answer in text), embed the
    questions, and check that similar-question search returns exactly one answer.
    """
    answer_text = 'By running tox in the command line!'

    def make_doc(question):
        # Every fixture document shares the same answer text; the question is
        # stored both as display name and as the field that gets embedded.
        return {
            'text': answer_text,
            'meta': {'name': question, 'question': question},
        }

    # One relevant document plus ten identical distractors.
    documents = [make_doc('How to test this library?')]
    documents.extend(make_doc('blah blah blah') for _ in range(10))

    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model="deepset/sentence_bert",
                                   use_gpu=False)

    # Attach a question embedding to each document before writing to the store.
    embedded = []
    for doc in documents:
        doc['embedding'] = retriever.embed([doc['meta']['question']])[0]
        embedded.append(doc)

    document_store.write_documents(embedded)

    finder = Finder(reader=None, retriever=retriever)
    prediction = finder.get_answers_via_similar_questions(
        question="How to test this?", top_k_retriever=1)

    assert len(prediction.get('answers', [])) == 1
Exemplo n.º 23
0
def main():
    """Run the knowledge-base QA retrieval demo.

    Optionally launches Elasticsearch in Docker, (re)populates the document
    store with embedded paragraphs, then either benchmarks retrieval against
    ./kbQA/Test.json (TEST mode) or serves an interactive console query loop.
    """
    # start Elasticsearch in Docker if requested
    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.call(
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" --name "MLQA2" elasticsearch:7.6.2',
            shell=True
        )
        # Fix: subprocess.call returns the exit code as a plain int; the
        # original read `status.returncode`, which would raise AttributeError.
        if status:
            raise Exception("Failed to launch Elasticsearch. If you want to "
                            "connect to an existing Elasticsearch instance"
                            "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)  # give the container time to come up

    # 512 dimensions because that is what the sentence transformer returns
    document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                                password="", index="document",
                                                embedding_dim=512,
                                                embedding_field="embedding")

    # One retriever instance serves both indexing-time embedding updates and
    # query-time retrieval (the original constructed an identical one twice).
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model=retriever_model_name_full,
                                   model_format=retriever_model_type,
                                   gpu=False)  # NOTE(review): other examples pass `use_gpu=` — confirm the kwarg name for this haystack version

    # load docs in database
    if LAUNCH_ELASTICSEARCH or POPULATE_DOCUMENT_STORE:
        dicts = convert_files_to_dicts(
            dir_path=data_path, clean_func=clean_text, split_paragraphs=True)
        logging.info("files to dicts done.")
        # write dicts containing the texts to the database
        document_store.write_documents(dicts)
        logging.info("documents to store written.")
        # generate embeddings for each text and add it to the database entry
        document_store.update_embeddings(retriever)
        logging.info("embeddings to documents in store written.")

    # reader won't be used in retrieval: results take longer and quality is worse
    finder = Finder(retriever=retriever, reader=None)

    if TEST:
        try:
            with open("./kbQA/Test.json", encoding="utf-8") as file:
                times = []    # per-query retrieval durations
                results = []  # one True entry per correctly answered question
                failed = []   # diagnostics for questions the retriever missed
                # one JSON object per line, with "question" and "answer" keys
                for line in file:
                    data = json.loads(line)
                    q = data["question"]
                    # fetch results from db, timing only the retrieval call
                    start_time = time.process_time()
                    candidate_docs = finder.retriever.retrieve(
                        query=q, filters=None, top_k=5)
                    end_time = time.process_time()
                    times.append(end_time - start_time)
                    # a question counts as answered when the gold answer string
                    # appears verbatim in any retrieved document
                    if any(data["answer"] in doc.text for doc in candidate_docs):
                        results.append(True)
                    else:
                        failed.append({
                            "q": q,
                            "correct": data["answer"],
                            "a": [doc.text for doc in candidate_docs],
                        })
                # guard: the original divided by zero on an empty test file
                if times:
                    logging.info("Average time per request: %f",
                                 sum(times) / len(times))
                    logging.info("Questions answered correctly: %d/%d (%f)",
                                 len(results), len(times),
                                 len(results) / len(times))
                logging.info("Failed questions:")
                for fail in failed:
                    logging.info("Question: %s", fail["q"])
                    logging.info("Correct Answer: %s", fail["correct"])
                    for answer in fail["a"]:
                        logging.info(answer)

        except Exception as e:
            traceback.print_exc()
            logging.error(f"exception: {e}")
    else:
        # loop until Keyboard-Interrupt event ctrl+c or "!q" input
        while True:
            try:
                # read query from console input
                q = input("Enter:").strip()
                # input "!q" stops execution
                if q == "!q":
                    exit(0)
                # fetch results from db
                candidate_docs = finder.retriever.retrieve(
                    query=q, filters=None, top_k=5)
                for doc in candidate_docs:
                    logging.info("doc id: %s", doc.id)
                    logging.info("doc meta name: %s", doc.meta["name"])
                    logging.info("doc text: %s", doc.text)
                    logging.info("doc query score: %s", doc.query_score)
                    logging.info("")
            except Exception as e:
                traceback.print_exc()
                logging.error(f"exception: {e}")