# Example 1
    def __init__(self):
        self.s3_url_train = 'https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-train.json.gz'
        self.s3_url_dev = 'https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-dev.json.gz'
        fetch_archive_from_http(self.s3_url_dev, output_dir='corpus/dev')
        fetch_archive_from_http(self.s3_url_train, output_dir='corpus/train')

        self.data_dir = 'corpus'
        self.train_filename = 'train/biencoder-nq-train.json'
        self.dev_filename = 'dev/biencoder-nq-dev.json'
def test_graph_retrieval(graphdb_fixture):
    # TODO rename doc_dir
    graph_dir = "../data/tutorial10_knowledge_graph/"
    s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip"
    fetch_archive_from_http(url=s3_url, output_dir=graph_dir)

    # Fetch a pre-trained BART model that translates natural language questions to SPARQL queries
    model_dir = "../saved_models/tutorial10_knowledge_graph/"
    s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip"
    fetch_archive_from_http(url=s3_url, output_dir=model_dir)

    kg = GraphDBKnowledgeGraph(index="tutorial_10_index")
    kg.delete_index()
    kg.create_index(config_path=Path(graph_dir+"repo-config.ttl"))
    kg.import_from_ttl_file(index="tutorial_10_index",
                            path=Path(graph_dir+"triples.ttl"))
    triple = {
        'p': {'type': 'uri', 'value': 'https://deepset.ai/harry_potter/_paternalgrandfather'},
        's': {'type': 'uri', 'value': 'https://deepset.ai/harry_potter/Melody_fawley'},
        'o': {'type': 'uri', 'value': 'https://deepset.ai/harry_potter/Marshall_fawley'},
    }
    triples = kg.get_all_triples()
    assert len(triples) > 0
    assert triple in triples

    # Define prefixes for names of resources so that we can use shorter resource names in queries
    prefixes = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX hp: <https://deepset.ai/harry_potter/>
    """
    kg.prefixes = prefixes

    kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg,
                                          model_name_or_path=model_dir+"hp_v3.4")

    result = kgqa_retriever.retrieve(query="In which house is Harry Potter?")
    assert result[0] == {'answer': ['https://deepset.ai/harry_potter/Gryffindor'], 'prediction_meta': {'model': 'Text2SparqlRetriever', 'sparql_query': 'select ?a { hp:Harry_potter hp:house ?a . }'}}

    result = kgqa_retriever._query_kg(sparql_query="select distinct ?sbj where { ?sbj hp:job hp:Keeper_of_keys_and_grounds . }")
    assert result[0][0] == "https://deepset.ai/harry_potter/Rubeus_hagrid"

    result = kgqa_retriever._query_kg(
        sparql_query="select distinct ?obj where { <https://deepset.ai/harry_potter/Hermione_granger> <https://deepset.ai/harry_potter/patronus> ?obj . }")
    assert result[0][0] == "https://deepset.ai/harry_potter/Otter"
def tutorial9_dpr_training():
    # Training Your Own "Dense Passage Retrieval" Model

    # Here are some imports that we'll need

    from haystack.retriever.dense import DensePassageRetriever
    from haystack.preprocessor.utils import fetch_archive_from_http
    from haystack.document_store.memory import InMemoryDocumentStore

    # Download original DPR data
    # WARNING: the train set is 7.4GB and the dev set is 800MB

    doc_dir = "data/dpr_training/"

    s3_url_train = "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-train.json.gz"
    s3_url_dev = "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-dev.json.gz"

    fetch_archive_from_http(s3_url_train, output_dir=doc_dir + "train/")
    fetch_archive_from_http(s3_url_dev, output_dir=doc_dir + "dev/")

    ## Option 1: Training DPR from Scratch

    # Here are the variables that specify our training data, the models that we use to initialize DPR,
    # and the directory where we'll be saving the model

    doc_dir = "data/dpr_training/"

    train_filename = "train/biencoder-nq-train.json"
    dev_filename = "dev/biencoder-nq-dev.json"

    query_model = "bert-base-uncased"
    passage_model = "bert-base-uncased"

    save_dir = "../saved_models/dpr"

    # ## Option 2: Finetuning DPR
    #
    # # Here are the variables you might want to use instead of the set above
    # # in order to fine-tune an existing DPR model
    #
    # doc_dir = "PATH_TO_YOUR_DATA_DIR"
    # train_filename = "TRAIN_FILENAME"
    # dev_filename = "DEV_FILENAME"
    #
    # query_model = "facebook/dpr-question_encoder-single-nq-base"
    # passage_model = "facebook/dpr-ctx_encoder-single-nq-base"
    #
    # save_dir = "../saved_models/dpr"

    ## Initialize DPR model

    retriever = DensePassageRetriever(
        document_store=InMemoryDocumentStore(),
        query_embedding_model=query_model,
        passage_embedding_model=passage_model,
        max_seq_len_query=64,
        max_seq_len_passage=256
    )

    # Start training our model and save it when it is finished

    retriever.train(
        data_dir=doc_dir,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=dev_filename,
        n_epochs=1,
        batch_size=4,
        grad_acc_steps=4,
        save_dir=save_dir,
        evaluate_every=3000,
        embed_title=True,
        num_positives=1,
        num_hard_negatives=1
    )

    ## Loading

    reloaded_retriever = DensePassageRetriever.load(load_dir=save_dir, document_store=None)
# ## Preprocessing of documents
#
# Haystack provides a customizable pipeline for:
# - converting files into texts
# - cleaning texts
# - splitting texts
# - writing them to a Document Store

# In this tutorial, we download Wikipedia articles about Game of Thrones, apply a basic cleaning function, and index
# them in Elasticsearch.

# Let's first fetch some documents that we want to query
# Here: 517 Wikipedia articles for Game of Thrones
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# convert files to dicts containing documents that can be indexed to our datastore
dicts = convert_files_to_dicts(dir_path=doc_dir,
                               clean_func=clean_wiki_text,
                               split_paragraphs=True)
# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
# It must take a str as input, and return a str.
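# A minimal sketch of such a cleaning function (the footer marker below is purely illustrative;
# adapt it to whatever boilerplate your own files contain):
#
# def remove_custom_footer(text: str) -> str:
#     # Keep only the part of the text before a known footer marker and trim whitespace
#     return text.split("== References ==")[0].strip()
#
# dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=remove_custom_footer, split_paragraphs=True)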

# Now, let's write the docs to our DB.
if LAUNCH_ELASTICSEARCH:
    document_store.write_documents(dicts)
else:
    logger.warning(
        "Since we already have a running ES instance, we should not index the same documents again.\n"
        "If you still want to do this, call document_store.write_documents(dicts) manually."
    )
# Example 5
def tutorial3_basic_qa_pipeline_without_elasticsearch():
    # In-Memory Document Store
    document_store = InMemoryDocumentStore()

    # or, alternatively, SQLite Document Store
    # document_store = SQLDocumentStore(url="sqlite:///qa.db")

    # ## Preprocessing of documents
    #
    # Haystack provides a customizable pipeline for:
    # - converting files into texts
    # - cleaning texts
    # - splitting texts
    # - writing them to a Document Store

    # In this tutorial, we download Wikipedia articles on Game of Thrones, apply a basic cleaning function, and index
    # them in our in-memory DocumentStore.
    # Let's first get some documents that we want to query
    # Here: 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
    # It must take a str as input, and return a str.

    # Now, let's write the docs to our DB.
    document_store.write_documents(dicts)

    # ## Initialize Retriever, Reader & Pipeline
    #
    # ### Retriever
    #
    # Retrievers help narrow down the scope for the Reader to smaller units of text where
    # a given question could be answered.
    #
    # With InMemoryDocumentStore or SQLDocumentStore, you can use the TfidfRetriever. For more
    # retrievers, please refer to Tutorial 1.

    # An in-memory TfidfRetriever based on Pandas dataframes
    retriever = TfidfRetriever(document_store=document_store)

    # ### Reader
    #
    # A Reader scans the texts returned by retrievers in detail and extracts the k best answers. Readers are based
    # on powerful, but slower, deep learning models.
    #
    # Haystack currently supports Readers based on the frameworks FARM and Transformers.
    # With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).

    # **Here:**                   a medium sized RoBERTa QA model using a Reader based on
    #                             FARM (https://huggingface.co/deepset/roberta-base-squad2)
    # **Alternatives (Reader):**  TransformersReader (leveraging the `pipeline` of the Transformers package)
    # **Alternatives (Models):**  e.g. "distilbert-base-uncased-distilled-squad" (fast) or
    #                             "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)
    # **Hint:**                   You can adjust the model to return "no answer possible" with the no_ans_boost.
    #                             Higher values mean the model prefers "no answer possible".

    # #### FARMReader
    #
    # Load a local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)
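    # For example, if you want the model to return "no answer possible" more often, you could pass a
    # higher no_ans_boost when initializing the reader (a sketch; the value 0.5 is illustrative and
    # should be tuned on your own data):
    # reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True, no_ans_boost=0.5)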

    # #### TransformersReader
    # Alternative:
    # reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

    # ### Pipeline
    #
    # With a Haystack `Pipeline` you can stick your building blocks together into a search pipeline.
    # Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
    # To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
    # You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    ## Voilà! Ask a question!
    prediction = pipe.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10,
                          top_k_reader=5)

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(prediction, details="minimal")
# Example 6
def tutorial1_basic_qa_pipeline():
    logger = logging.getLogger(__name__)

    LAUNCH_ELASTICSEARCH = True

    # ## Document Store
    #
    # Haystack finds answers to queries within the documents stored in a `DocumentStore`. The current implementations of
    # `DocumentStore` include `ElasticsearchDocumentStore`, `FAISSDocumentStore`, `SQLDocumentStore`, and `InMemoryDocumentStore`.
    #
    # **Here:** We recommend Elasticsearch as it comes preloaded with features like full-text queries, BM25 retrieval,
    # and vector storage for text embeddings.
    # **Alternatives:** If you are unable to set up an Elasticsearch instance, then follow Tutorial 3
    # for using SQL/InMemory document stores.
    # **Hint**:
    # This tutorial creates a new document store instance with Wikipedia articles on Game of Thrones. However, you can
    # configure Haystack to work with your existing document stores.
    #
    # Start an Elasticsearch server
    # You can start Elasticsearch on your local machine using Docker. If Docker is not readily available in
    # your environment (e.g., in Colab notebooks), you can manually download and run Elasticsearch from source.

    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run([
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'
        ],
                                shell=True)
        if status.returncode:
            raise Exception(
                "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)

    # Connect to Elasticsearch
    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document")

    # ## Preprocessing of documents
    #
    # Haystack provides a customizable pipeline for:
    # - converting files into texts
    # - cleaning texts
    # - splitting texts
    # - writing them to a Document Store

    # In this tutorial, we download Wikipedia articles about Game of Thrones, apply a basic cleaning function, and index
    # them in Elasticsearch.

    # Let's first fetch some documents that we want to query
    # Here: 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
    # It must take a str as input, and return a str.

    # Now, let's write the docs to our DB.
    if LAUNCH_ELASTICSEARCH:
        document_store.write_documents(dicts)
    else:
        logger.warning(
            "Since we already have a running ES instance, we should not index the same documents again.\n"
            "If you still want to do this, call document_store.write_documents(dicts) manually."
        )

    # ## Initialize Retriever, Reader & Pipeline
    #
    # ### Retriever
    #
    # Retrievers help narrow down the scope for the Reader to smaller units of text where a given question
    # could be answered.
    #
    # They use simple but fast algorithms.
    # **Here:** We use Elasticsearch's default BM25 algorithm
    # **Alternatives:**
    # - Customize the `ElasticsearchRetriever` with custom queries (e.g. boosting) and filters
    # - Use `EmbeddingRetriever` to find candidate documents based on the similarity of
    #   embeddings (e.g. created via Sentence-BERT)
    # - Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging

    retriever = ElasticsearchRetriever(document_store=document_store)

    # Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick prototypes
    # with a SQLite document store.
    #
    # from haystack.retriever.tfidf import TfidfRetriever
    # retriever = TfidfRetriever(document_store=document_store)

    # ### Reader
    #
    # A Reader scans the texts returned by retrievers in detail and extracts the k best answers. Readers are based
    # on powerful, but slower, deep learning models.
    #
    # Haystack currently supports Readers based on the frameworks FARM and Transformers.
    # With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).
    # **Here:** a medium sized RoBERTa QA model using a Reader based on
    #           FARM (https://huggingface.co/deepset/roberta-base-squad2)
    # **Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package)
    # **Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or
    #                            "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)
    # **Hint:** You can adjust the model to return "no answer possible" with the no_ans_boost. Higher values mean
    #           the model prefers "no answer possible"
    #
    # #### FARMReader

    # Load a local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)

    # #### TransformersReader

    # Alternative:
    # reader = TransformersReader(
    #    model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

    # ### Pipeline
    #
    # With a Haystack `Pipeline` you can stick your building blocks together into a search pipeline.
    # Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
    # To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
    # You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    ## Voilà! Ask a question!
    prediction = pipe.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10,
                          top_k_reader=5)

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(prediction, details="minimal")
def tutorial6_better_retrieval_via_dpr():
    # OPTION 1: FAISS is a library for efficient similarity search on a cluster of dense vectors.
    # The FAISSDocumentStore uses a SQL (SQLite in-memory by default) document store under the hood
    # to store the document text and other metadata. The vector embeddings of the text are
    # indexed on a FAISS index that is later queried to search for answers.
    # The default flavour of FAISSDocumentStore is "Flat", but it can also be set to "HNSW" for
    # faster search at the expense of some accuracy. Just set the faiss_index_factory_str argument in the constructor.
    # For more info on which index suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
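    # A sketch of the "HNSW" flavour mentioned above, trading a bit of accuracy for faster search:
    # document_store = FAISSDocumentStore(faiss_index_factory_str="HNSW")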

    # OPTION 2: Milvus is an open source database library that is also optimized for vector similarity searches like FAISS.
    # Like FAISS, it has both a "Flat" and an "HNSW" mode, but it outperforms FAISS when it comes to dynamic data management.
    # It does require a little more setup, however, as it is run through Docker and requires some config files.
    # See https://milvus.io/docs/v1.0.0/milvus_docker-cpu.md
    # launch_milvus()
    # document_store = MilvusDocumentStore()

    # ## Preprocessing of documents
    # Let's first get some documents that we want to query
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)

    # Now, let's write the docs to our DB.
    document_store.write_documents(dicts)

    ### Retriever
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=2,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True)

    # Important:
    # Now that we have the DPR retriever initialized, we need to call update_embeddings() to iterate over all
    # previously indexed documents and update their embedding representation.
    # While this can be a time consuming operation (depending on corpus size), it only needs to be done once.
    # At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.
    document_store.update_embeddings(retriever)

    ### Reader
    # Load a local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)

    ### Pipeline
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    ## Voilà! Ask a question!
    prediction = pipe.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10,
                          top_k_reader=5)

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(prediction, details="minimal")
def tutorial8_preprocessing():
    # This fetches some sample files to work with

    doc_dir = "data/preprocessing_tutorial"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
    """
    ## Converters
    
    Haystack's converter classes are designed to help you turn files on your computer into documents
    that can be processed by the Haystack pipeline.
    There are file converters for txt, pdf, and docx files, as well as a converter that is powered by Apache Tika.
    """

    # Here are some examples of how you would use file converters

    converter = TextConverter(remove_numeric_tables=True,
                              valid_languages=["en"])
    doc_txt = converter.convert(
        file_path="data/preprocessing_tutorial/classics.txt", meta=None)

    converter = PDFToTextConverter(remove_numeric_tables=True,
                                   valid_languages=["en"])
    doc_pdf = converter.convert(
        file_path="data/preprocessing_tutorial/bert.pdf", meta=None)

    converter = DocxToTextConverter(remove_numeric_tables=True,
                                    valid_languages=["en"])
    doc_docx = converter.convert(
        file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)

    # Haystack also has a convenience function that will automatically apply the right converter to each file in a directory.

    all_docs = convert_files_to_dicts(dir_path="data/preprocessing_tutorial")
    """
    
    ## PreProcessor
    
    The PreProcessor class is designed to help you clean text and split text into sensible units.
    File splitting can have a very significant impact on the system's performance.
    Have a look at the [Preprocessing](https://haystack.deepset.ai/docs/latest/preprocessingmd)
    and [Optimization](https://haystack.deepset.ai/docs/latest/optimizationmd) pages on our website for more details.
    """

    # This is a default usage of the PreProcessor.
    # Here, it performs cleaning of consecutive whitespaces
    # and splits a single large document into smaller documents.
    # Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences
    # Note how the single document passed into the PreProcessor gets split into 5 smaller documents

    preprocessor = PreProcessor(clean_empty_lines=True,
                                clean_whitespace=True,
                                clean_header_footer=False,
                                split_by="word",
                                split_length=1000,
                                split_respect_sentence_boundary=True)
    docs_default = preprocessor.process(doc_txt)
    print(f"n_docs_input: 1\nn_docs_output: {len(docs_default)}")
    """
    ## Cleaning
    
    - `clean_empty_lines` will normalize 3 or more consecutive empty lines to just two empty lines
    - `clean_whitespace` will remove any whitespace at the beginning or end of each line in the text
    - `clean_header_footer` will remove any long header or footer texts that are repeated on each page
    
    ## Splitting
    By default, the PreProcessor will respect sentence boundaries, meaning that documents will not start or end
    midway through a sentence.
    This will help reduce the possibility of answer phrases being split between two documents.
    This feature can be turned off by setting `split_respect_sentence_boundary=False`.
    """

    # Not respecting sentence boundary vs respecting sentence boundary

    preprocessor_nrsb = PreProcessor(split_respect_sentence_boundary=False)
    docs_nrsb = preprocessor_nrsb.process(doc_txt)

    print("RESPECTING SENTENCE BOUNDARY")
    end_text = docs_default[0]["text"][-50:]
    print("End of document: \"..." + end_text + "\"")
    print()
    print("NOT RESPECTING SENTENCE BOUNDARY")
    end_text_nrsb = docs_nrsb[0]["text"][-50:]
    print("End of document: \"..." + end_text_nrsb + "\"")
    """
    A commonly used strategy to split long documents, especially in the field of Question Answering,
    is the sliding window approach. If `split_length=10` and `split_overlap=3`, your documents will look like this:
    
    - doc1 = words[0:10]
    - doc2 = words[7:17]
    - doc3 = words[14:24]
    - ...
    
    You can use this strategy by following the code below.
    """

    # Sliding window approach

    preprocessor_sliding_window = PreProcessor(
        split_overlap=3,
        split_length=10,
        split_respect_sentence_boundary=False)
    docs_sliding_window = preprocessor_sliding_window.process(doc_txt)

    doc1 = docs_sliding_window[0]["text"][:200]
    doc2 = docs_sliding_window[1]["text"][:100]
    doc3 = docs_sliding_window[2]["text"][:100]

    print("Document 1: \"" + doc1 + "...\"")
    print("Document 2: \"" + doc2 + "...\"")
    print("Document 3: \"" + doc3 + "...\"")
def main():

    launch_es()

    document_store = ElasticsearchDocumentStore()
    es_retriever = ElasticsearchRetriever(document_store=document_store)
    eval_retriever = EvalRetriever(open_domain=open_domain)
    reader = FARMReader("deepset/roberta-base-squad2",
                        top_k_per_candidate=4,
                        num_processes=1,
                        return_no_answer=True)
    eval_reader = EvalReader(debug=True, open_domain=open_domain)

    # Download evaluation data, which is a subset of Natural Questions development set containing 50 documents
    doc_dir = "../data/nq"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Add evaluation data to Elasticsearch document store
    # We first delete the custom tutorial indices to not have duplicate elements
    preprocessor = PreProcessor(split_length=500,
                                split_overlap=0,
                                split_respect_sentence_boundary=False,
                                clean_empty_lines=False,
                                clean_whitespace=False)
    document_store.delete_all_documents(index=doc_index)
    document_store.delete_all_documents(index=label_index)
    document_store.add_eval_data(filename="../data/nq/nq_dev_subset_v2.json",
                                 doc_index=doc_index,
                                 label_index=label_index,
                                 preprocessor=preprocessor)
    labels = document_store.get_all_labels_aggregated(index=label_index)
    q_to_l_dict = {l.question: {"retriever": l, "reader": l} for l in labels}

    # Here is the pipeline definition
    p = Pipeline()
    p.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
    p.add_node(component=eval_retriever,
               name="EvalRetriever",
               inputs=["ESRetriever"])
    p.add_node(component=reader, name="QAReader", inputs=["EvalRetriever"])
    p.add_node(component=eval_reader, name="EvalReader", inputs=["QAReader"])

    results = []
    for i, (q, l) in enumerate(q_to_l_dict.items()):
        res = p.run(
            query=q,
            top_k_retriever=top_k_retriever,
            labels=l,
            top_k_reader=10,
            index=doc_index,
            # skip_incorrect_retrieval=True
        )
        results.append(res)

    eval_retriever.print()
    print()
    es_retriever.print_time()
    print()
    eval_reader.print(mode="reader")
    print()
    reader.print_time()
    print()
    eval_reader.print(mode="pipeline")
# Example 10
def tutorial5_evaluation():

    ##############################################
    # Settings
    ##############################################
    # Choose the evaluation style from ['retriever_closed', 'reader_closed', 'retriever_reader_open']
    # 'retriever_closed' - evaluates only the retriever, based on whether the gold_label document is retrieved.
    # 'reader_closed' - evaluates only the reader in a closed domain fashion i.e. the reader is given one query
    #     and one document and metrics are calculated on whether the right position in this text is selected by
    #     the model as the answer span (i.e. SQuAD style)
    # 'retriever_reader_open' - evaluates retriever and reader in open domain fashion i.e. a document is considered
    #     correctly retrieved if it contains the answer string within it. The reader is evaluated based purely on the
    #     predicted string, regardless of which document this came from and the position of the extracted span.
    style = "retriever_reader_open"

    # make sure these indices do not collide with existing ones; they will be wiped clean before data is inserted
    doc_index = "tutorial5_docs"
    label_index = "tutorial5_labels"

    ##############################################
    # Code
    ##############################################
    launch_es()
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # Download evaluation data, which is a subset of Natural Questions development set containing 50 documents
    doc_dir = "../data/nq"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Connect to Elasticsearch
    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document",
                                                create_index=False,
                                                embedding_field="emb",
                                                embedding_dim=768,
                                                excluded_meta_data=["emb"])

    # Add evaluation data to Elasticsearch document store
    # We first delete the custom tutorial indices to not have duplicate elements
    # and also split our documents into shorter passages using the PreProcessor
    preprocessor = PreProcessor(split_by="word",
                                split_length=500,
                                split_overlap=0,
                                split_respect_sentence_boundary=False,
                                clean_empty_lines=False,
                                clean_whitespace=False)
    document_store.delete_all_documents(index=doc_index)
    document_store.delete_all_documents(index=label_index)
    document_store.add_eval_data(filename="../data/nq/nq_dev_subset_v2.json",
                                 doc_index=doc_index,
                                 label_index=label_index,
                                 preprocessor=preprocessor)

    # Let's prepare the labels that we need for the retriever and the reader
    labels = document_store.get_all_labels_aggregated(index=label_index)

    # Initialize Retriever
    retriever = ElasticsearchRetriever(document_store=document_store)

    # Alternative: Evaluate DensePassageRetriever
    # Note that DPR works best when you index short passages (< 512 tokens), as only those tokens will be used for the embedding.
    # Here, for nq_dev_subset_v2.json, the avg. number of tokens per document is 5220(!).
    # DPR still outperforms Elasticsearch's BM25 by a small margin here.
    # retriever = DensePassageRetriever(document_store=document_store,
    #                                   query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    #                                   passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    #                                   use_gpu=True,
    #                                   embed_title=True,
    #                                   remove_sep_tok_from_untitled_passages=True)
    # document_store.update_embeddings(retriever, index=doc_index)

    # Initialize Reader
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        top_k=4,
                        return_no_answer=True)

    # Here we initialize the nodes that perform evaluation
    eval_retriever = EvalDocuments()
    eval_reader = EvalAnswers(
        sas_model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    )

    ## Evaluate Retriever on its own in closed domain fashion
    if style == "retriever_closed":
        retriever_eval_results = retriever.eval(top_k=10,
                                                label_index=label_index,
                                                doc_index=doc_index)
        ## Retriever Recall is the proportion of questions for which the correct document containing the answer is
        ## among the retrieved documents
        print("Retriever Recall:", retriever_eval_results["recall"])
        ## Retriever Mean Avg Precision rewards retrievers that give relevant documents a higher rank
        print("Retriever Mean Avg Precision:", retriever_eval_results["map"])

    # Evaluate Reader on its own in closed domain fashion (i.e. SQuAD style)
    elif style == "reader_closed":
        reader_eval_results = reader.eval(document_store=document_store,
                                          device=device,
                                          label_index=label_index,
                                          doc_index=doc_index)
        # Evaluation of Reader can also be done directly on a SQuAD-formatted file without passing the data to Elasticsearch
        #reader_eval_results = reader.eval_on_file("../data/nq", "nq_dev_subset_v2.json", device=device)

        ## Reader Top-N-Accuracy is the proportion of predicted answers that match with their corresponding correct answer
        print("Reader Top-N-Accuracy:", reader_eval_results["top_n_accuracy"])
        ## Reader Exact Match is the proportion of questions where the predicted answer is exactly the same as the correct answer
        print("Reader Exact Match:", reader_eval_results["EM"])
        ## Reader F1-Score is the average overlap between the predicted answers and the correct answers
        print("Reader F1-Score:", reader_eval_results["f1"])

    # Evaluate combination of Reader and Retriever in open domain fashion
    elif style == "retriever_reader_open":

        # Here is the pipeline definition
        p = Pipeline()
        p.add_node(component=retriever, name="ESRetriever", inputs=["Query"])
        p.add_node(component=eval_retriever,
                   name="EvalDocuments",
                   inputs=["ESRetriever"])
        p.add_node(component=reader, name="QAReader", inputs=["EvalDocuments"])
        p.add_node(component=eval_reader,
                   name="EvalAnswers",
                   inputs=["QAReader"])
        results = []

        for l in labels:
            res = p.run(
                query=l.question,
                top_k_retriever=10,
                labels=l,
                top_k_reader=10,
                index=doc_index,
            )
            results.append(res)

        eval_retriever.print()
        print()
        retriever.print_time()
        print()
        eval_reader.print(mode="reader")
        print()
        reader.print_time()
        print()
        eval_reader.print(mode="pipeline")
    else:
        raise ValueError(
            f'style={style} is not a valid option. Choose from retriever_closed, reader_closed, retriever_reader_open'
        )
def tutorial10_knowledge_graph():
    # Let's first fetch some triples that we want to store in our knowledge graph
    # Here: exemplary triples from the wizarding world
    graph_dir = "../data/tutorial10_knowledge_graph/"
    s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip"
    fetch_archive_from_http(url=s3_url, output_dir=graph_dir)

    # Fetch a pre-trained BART model that translates text queries to SPARQL queries
    model_dir = "../saved_models/tutorial10_knowledge_graph/"
    s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip"
    fetch_archive_from_http(url=s3_url, output_dir=model_dir)

    LAUNCH_GRAPHDB = True

    # Start a GraphDB server
    if LAUNCH_GRAPHDB:
        logging.info("Starting GraphDB ...")
        status = subprocess.run([
            'docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11'
        ],
                                shell=True)
        if status.returncode:
            status = subprocess.run(['docker start graphdb-instance-tutorial'],
                                    shell=True)
            if status.returncode:
                raise Exception(
                    "Failed to launch GraphDB. If you want to connect to an already running GraphDB instance"
                    "then set LAUNCH_GRAPHDB in the script to False.")
        time.sleep(5)

    # Initialize a knowledge graph connected to GraphDB and use "tutorial_10_index" as the name of the index
    kg = GraphDBKnowledgeGraph(index="tutorial_10_index")

    # Delete the index as it might have been already created in previous runs
    kg.delete_index()

    # Create the index based on a configuration file
    kg.create_index(config_path=Path(graph_dir + "repo-config.ttl"))

    # Import triples of subject, predicate, and object statements from a ttl file
    kg.import_from_ttl_file(index="tutorial_10_index",
                            path=Path(graph_dir + "triples.ttl"))
    logging.info(
        f"The last triple stored in the knowledge graph is: {kg.get_all_triples()[-1]}"
    )
    logging.info(
        f"There are {len(kg.get_all_triples())} triples stored in the knowledge graph."
    )

    # Define prefixes for names of resources so that we can use shorter resource names in queries
    prefixes = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX hp: <https://deepset.ai/harry_potter/>
    """
    kg.prefixes = prefixes

    # Load a pre-trained model that translates text queries to SPARQL queries
    kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg,
                                          model_name_or_path=model_dir +
                                          "hp_v3.4")

    # We can now ask questions that will be answered by our knowledge graph!
    # One limitation though: our pre-trained model can only answer questions about resources it has seen during training.
    # Otherwise, it cannot translate the name of the resource to the identifier used in the knowledge graph.
    # E.g. "Harry" -> "hp:Harry_potter"

    query = "In which house is Harry Potter?"
    logging.info(
        f"Translating the text query \"{query}\" to a SPARQL query and executing it on the knowledge graph..."
    )
    result = kgqa_retriever.retrieve(query=query)
    logging.info(result)
    # Correct SPARQL query: select ?a { hp:Harry_potter hp:house ?a . }
    # Correct answer: Gryffindor

    logging.info(
        "Executing a SPARQL query with prefixed names of resources...")
    result = kgqa_retriever._query_kg(
        sparql_query=
        "select distinct ?sbj where { ?sbj hp:job hp:Keeper_of_keys_and_grounds . }"
    )
    logging.info(result)
    # Paraphrased question: Who is the keeper of keys and grounds?
    # Correct answer: Rubeus Hagrid

    logging.info("Executing a SPARQL query with full names of resources...")
    result = kgqa_retriever._query_kg(
        sparql_query=
        "select distinct ?obj where { <https://deepset.ai/harry_potter/Hermione_granger> <https://deepset.ai/harry_potter/patronus> ?obj . }"
    )
    logging.info(result)
# Example 12
    def __init__(self):
        s3_url_dev = 'https://dpr-nlp.s3.amazonaws.com/startqa_corpus_formatted_for_documentstore.zip'
        fetch_archive_from_http(s3_url_dev, output_dir='corpus/stratCorpus')
def tutorial14_query_classifier():

    # Download and prepare data - 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    got_dicts = convert_files_to_dicts(
        dir_path=doc_dir,
        clean_func=clean_wiki_text,
        split_paragraphs=True
    )

    # Initialize DocumentStore and index documents
    launch_es()
    document_store = ElasticsearchDocumentStore()
    document_store.delete_all_documents()
    document_store.write_documents(got_dicts)

    # Initialize Sparse retriever
    es_retriever = ElasticsearchRetriever(document_store=document_store)

    # Initialize dense retriever
    dpr_retriever = DensePassageRetriever(document_store)
    document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")


    # Here we build the pipeline
    sklearn_keyword_classifier = Pipeline()
    sklearn_keyword_classifier.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"])
    sklearn_keyword_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"])
    sklearn_keyword_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"])
    sklearn_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
    sklearn_keyword_classifier.draw("pipeline_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = sklearn_keyword_classifier.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_1)

    # Run only the sparse retriever on a keyword based query
    res_2 = sklearn_keyword_classifier.run(
        query="arya stark father",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_2)

    # Run only the dense retriever on the full sentence query
    res_3 = sklearn_keyword_classifier.run(
        query="which country was jon snow filmed ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_3)

    # Run only the sparse retriever on a keyword based query
    res_4 = sklearn_keyword_classifier.run(
        query="jon snow country",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_4)

    # Run only the dense retriever on the full sentence query
    res_5 = sklearn_keyword_classifier.run(
        query="who are the younger brothers of arya stark ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_5)

    # Run only the sparse retriever on a keyword based query
    res_6 = sklearn_keyword_classifier.run(
        query="arya stark younger brothers",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_6)

    # Here we build the pipeline
    transformer_keyword_classifier = Pipeline()
    transformer_keyword_classifier.add_node(component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"])
    transformer_keyword_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"])
    transformer_keyword_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"])
    transformer_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
    transformer_keyword_classifier.draw("pipeline_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = transformer_keyword_classifier.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_1)

    # Run only the sparse retriever on a keyword based query
    res_2 = transformer_keyword_classifier.run(
        query="arya stark father",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_2)

    # Run only the dense retriever on the full sentence query
    res_3 = transformer_keyword_classifier.run(
        query="which country was jon snow filmed ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_3)

    # Run only the sparse retriever on a keyword based query
    res_4 = transformer_keyword_classifier.run(
        query="jon snow country",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_4)

    # Run only the dense retriever on the full sentence query
    res_5 = transformer_keyword_classifier.run(
        query="who are the younger brothers of arya stark ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_5)

    # Run only the sparse retriever on a keyword based query
    res_6 = transformer_keyword_classifier.run(
        query="arya stark younger brothers",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_6)

    # Here we build the pipeline
    transformer_question_classifier = Pipeline()
    transformer_question_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"])
    transformer_question_classifier.add_node(component=TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier"), name="QueryClassifier", inputs=["DPRRetriever"])
    transformer_question_classifier.add_node(component=reader, name="QAReader", inputs=["QueryClassifier.output_1"])
    transformer_question_classifier.draw("question_classifier.png")

    # Run only the QA reader on the question query
    res_1 = transformer_question_classifier.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_1)

    # Show only DPR results
    res_2 = transformer_question_classifier.run(
        query="Arya Stark was the daughter of a Lord.",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    res_2

    # Here we create the keyword vs question/statement query classifier

    queries = ["arya stark father","jon snow country",
               "who is the father of arya stark","which country was jon snow filmed?"]

    keyword_classifier = TransformersQueryClassifier()

    for query in queries:
        result = keyword_classifier.run(query=query)
        if result[1] == "output_1":
            category = "question/statement"
        else:
            category = "keyword"

        print(f"Query: {query}, raw_output: {result}, class: {category}")

    # Here we create the question vs statement query classifier

    queries = ["Lord Eddard was the father of Arya Stark.","Jon Snow was filmed in United Kingdom.",
               "who is the father of arya stark?","Which country was jon snow filmed in?"]

    question_classifier = TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier")

    for query in queries:
        result = question_classifier.run(query=query)
        if result[1] == "output_1":
            category = "question"
        else:
            category = "statement"

        print(f"Query: {query}, raw_output: {result}, class: {category}")
# Example 14
def tutorial12_lfqa():
    """
    Document Store:
    FAISS is a library for efficient similarity search on a cluster of dense vectors.
    The `FAISSDocumentStore` uses a SQL (SQLite in-memory by default) database under the hood
    to store the document text and other metadata. The vector embeddings of the text are
    indexed on a FAISS index that is later queried to search for answers.
    The default flavour of FAISSDocumentStore is "Flat", but it can also be set to "HNSW" for
    faster search at the expense of some accuracy. Just set the faiss_index_factory_str argument in the constructor.
    For more info on which index suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
    """

    from haystack.document_store.faiss import FAISSDocumentStore

    document_store = FAISSDocumentStore(vector_dim=128,
                                        faiss_index_factory_str="Flat")
    """
    Cleaning & indexing documents:
    Similarly to the previous tutorials, we download, convert and index some Game of Thrones articles to our DocumentStore
    """

    # Let's first get some files that we want to use
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Convert files to dicts
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)

    # Now, let's write the dicts containing documents to our DB.
    document_store.write_documents(dicts)
    """
    Initialize Retriever and Reader/Generator:
    We use an `EmbeddingRetriever` (with a Retribert model) and invoke `update_embeddings` to index the embeddings of documents in the `FAISSDocumentStore`
    """

    from haystack.retriever.dense import EmbeddingRetriever

    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model="yjernite/retribert-base-uncased",
        model_format="retribert")

    document_store.update_embeddings(retriever)
    """Before we blindly use the `RetribertRetriever` let's empirically test it to make sure a simple search indeed finds the relevant documents."""

    from haystack.utils import print_answers, print_documents
    from haystack.pipeline import DocumentSearchPipeline

    p_retrieval = DocumentSearchPipeline(retriever)
    res = p_retrieval.run(query="Tell me something about Arya Stark?",
                          top_k_retriever=5)
    print_documents(res, max_text_len=512)
    """
    Similar to previous tutorials, we now initialize our reader/generator.
    Here we use a `Seq2SeqGenerator` with the *yjernite/bart_eli5* model (see: https://huggingface.co/yjernite/bart_eli5)
    """

    generator = Seq2SeqGenerator(model_name_or_path="yjernite/bart_eli5")
    """
    Pipeline:
    With a Haystack `Pipeline` you can stick your building blocks together into a search pipeline.
    Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
    To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `GenerativeQAPipeline` that combines a retriever and a reader/generator to answer our questions.
    You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    """

    from haystack.pipeline import GenerativeQAPipeline
    pipe = GenerativeQAPipeline(generator, retriever)
    """Voilà! Ask a question!"""

    query_1 = "Why did Arya Stark's character get portrayed in a television adaptation?"
    result_1 = pipe.run(query=query_1, top_k_retriever=1)
    print(f"Query: {query_1}")
    print(f"Answer: {result_1['answers'][0]}")
    print()

    query_2 = "What kind of character does Arya Stark play?"
    result_2 = pipe.run(query=query_2, top_k_retriever=1)
    print(f"Query: {query_2}")
    print(f"Answer: {result_2['answers'][0]}")
    print()
# Example 15
def tutorial11_pipelines():
    # Download and prepare data - 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    got_dicts = convert_files_to_dicts(dir_path=doc_dir,
                                       clean_func=clean_wiki_text,
                                       split_paragraphs=True)

    # Initialize DocumentStore and index documents
    launch_es()
    document_store = ElasticsearchDocumentStore()
    document_store.delete_all_documents()
    document_store.write_documents(got_dicts)

    # Initialize Sparse retriever
    es_retriever = ElasticsearchRetriever(document_store=document_store)

    # Initialize dense retriever
    dpr_retriever = DensePassageRetriever(document_store)
    document_store.update_embeddings(dpr_retriever,
                                     update_existing_embeddings=False)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

    ######################
    # Prebuilt Pipelines #
    ######################

    # Extractive QA Pipeline
    ########################

    p_extractive_premade = ExtractiveQAPipeline(reader=reader,
                                                retriever=es_retriever)
    res = p_extractive_premade.run(query="Who is the father of Arya Stark?",
                                   top_k_retriever=10,
                                   top_k_reader=5)
    print_answers(res, details="minimal")

    # Document Search Pipeline
    ##########################

    p_retrieval = DocumentSearchPipeline(es_retriever)
    res = p_retrieval.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10)
    print_documents(res, max_text_len=200)

    # Generator Pipeline
    ##########################

    # We set this to True so that the document store returns document embeddings
    # with each document; the Generator needs these embeddings
    document_store.return_embedding = True

    # Initialize generator
    rag_generator = RAGenerator()

    # Generative QA
    p_generator = GenerativeQAPipeline(generator=rag_generator,
                                       retriever=dpr_retriever)
    res = p_generator.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10)
    print_answers(res, details="minimal")

    # We are setting this to False so that in later pipelines,
    # we get a cleaner printout
    document_store.return_embedding = False

    ##############################
    # Creating Pipeline Diagrams #
    ##############################

    p_extractive_premade.draw("pipeline_extractive_premade.png")
    p_retrieval.draw("pipeline_retrieval.png")
    p_generator.draw("pipeline_generator.png")

    ####################
    # Custom Pipelines #
    ####################

    # Extractive QA Pipeline
    ########################

    # Custom built extractive QA pipeline
    p_extractive = Pipeline()
    p_extractive.add_node(component=es_retriever,
                          name="Retriever",
                          inputs=["Query"])
    p_extractive.add_node(component=reader,
                          name="Reader",
                          inputs=["Retriever"])

    # Now we can run it
    res = p_extractive.run(query="Who is the father of Arya Stark?",
                           top_k_retriever=10,
                           top_k_reader=5)
    print_answers(res, details="minimal")
    p_extractive.draw("pipeline_extractive.png")

    # Ensembled Retriever Pipeline
    ##############################

    # Create ensembled pipeline
    p_ensemble = Pipeline()
    p_ensemble.add_node(component=es_retriever,
                        name="ESRetriever",
                        inputs=["Query"])
    p_ensemble.add_node(component=dpr_retriever,
                        name="DPRRetriever",
                        inputs=["Query"])
    p_ensemble.add_node(component=JoinDocuments(join_mode="concatenate"),
                        name="JoinResults",
                        inputs=["ESRetriever", "DPRRetriever"])
    p_ensemble.add_node(component=reader,
                        name="Reader",
                        inputs=["JoinResults"])
    p_ensemble.draw("pipeline_ensemble.png")

    # Run pipeline
    res = p_ensemble.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=5  #This is top_k per retriever
    )
    print_answers(res, details="minimal")

    # Query Classification Pipeline
    ###############################

    # Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run.
    # Though this looks very similar to the ensembled pipeline shown above,
    # the key difference is that only one of the retrievers is run for each request.
    # By contrast, both retrievers are always run in the ensembled approach.

    class QueryClassifier():
        outgoing_edges = 2

        def run(self, **kwargs):
            if "?" in kwargs["query"]:
                return (kwargs, "output_2")
            else:
                return (kwargs, "output_1")

    # Here we build the pipeline
    p_classifier = Pipeline()
    p_classifier.add_node(component=QueryClassifier(),
                          name="QueryClassifier",
                          inputs=["Query"])
    p_classifier.add_node(component=es_retriever,
                          name="ESRetriever",
                          inputs=["QueryClassifier.output_1"])
    p_classifier.add_node(component=dpr_retriever,
                          name="DPRRetriever",
                          inputs=["QueryClassifier.output_2"])
    p_classifier.add_node(component=reader,
                          name="QAReader",
                          inputs=["ESRetriever", "DPRRetriever"])
    p_classifier.draw("pipeline_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = p_classifier.run(query="Who is the father of Arya Stark?",
                             top_k_retriever=10)
    print("DPR Results" + "\n" + "=" * 15)
    print_answers(res_1)

    # Run only the sparse retriever on a keyword based query
    res_2 = p_classifier.run(query="Arya Stark father", top_k_retriever=10)
    print("ES Results" + "\n" + "=" * 15)
    print_answers(res_2)