def transformer_search():
    """Flask view for the transformer-based search page.

    GET:  render the page with an empty, Elasticsearch-shaped result set.
    POST: run the submitted question through the Finder and render the answers.
    """
    if request.method == 'GET':
        # Empty result skeleton so the template can render on first load.
        res = {'hits': {'total': 0, 'hits': []}}
        return render_template("index_dev.html", res=res)
    elif request.method == 'POST':
        # (The original re-checked request.method == 'POST' here — redundant.)
        print("-----------------Calling search Result----------")
        search_term = request.form["input_transformer"]
        print("Search Term:", search_term)
        res_transformer = finder.get_answers(question=search_term,
                                             top_k_retriever=10,
                                             top_k_reader=5)
        print(res_transformer)
        # Flatten the reader output into plain dicts for the Jinja template.
        # TODO: res need not be built here — the template could iterate
        # res_transformer['answers'] directly.
        res = [{'answer': answer['answer'],
                'context': answer['context'],
                'score': answer['score'],
                'probability': answer['probability']}
               for answer in res_transformer['answers']]
        print_answers(res_transformer, details="medium")
        # Number of answers shown to the user (was a -1 TODO placeholder).
        count_hits = len(res)
        return render_template('index_dev.html', res=res,
                               search_term=search_term,
                               count_hits=count_hits)
def main():
    """Index the SecurPlus script corpus into Elasticsearch (with text
    embeddings) and run one sample similarity query against it."""
    POPULATE_DOCUMENT_STORE = True
    document_store = ElasticsearchDocumentStore(
        host="localhost",
        username="",
        password="",
        index="document",
        text_field="text",
        embedding_field="question_emb",
        embedding_dim=768,  # was the string "768"; the dimension is numeric
        excluded_meta_data=["question_emb"])
    retriever = EmbeddingRetriever(
        document_store=document_store,
        # os.path.join instead of hard-coded "\\" so the path is portable
        embedding_model=os.path.join(os.getcwd(), "kbQA", "bert-german-model"),
        gpu=True,
        model_format="transformers")
    if POPULATE_DOCUMENT_STORE:
        doc_dir = os.path.join(os.getcwd(), "kbQA", "data", "Skripte",
                               "Securplus", "txt")
        dicts = convert_files_to_dicts(
            dir_path=doc_dir, clean_func=clean_text, split_paragraphs=True)
        # Dump the cleaned corpus to a file for manual inspection.
        with open("Output.txt", "w") as text_file:
            text_file.write("".join("\n" + doc["text"] for doc in dicts))
        df = pd.DataFrame.from_dict(dicts)
        # Careful: despite the column names we embed the *texts* (i.e. the
        # answers), not questions, so the variable names are a bit misleading.
        # "question" is filled with dummy running numbers (starting at 1)
        # because the search throws exceptions when the field is missing.
        # The tutorial this is based on assumed FAQ-style data with predefined
        # question/answer pairs and embedded the questions; we instead embed
        # every individual text.
        # TODO: since we embed whole texts we may need sentence segmentation —
        # the longer the text, the less precise its embedding; per-sentence
        # embeddings are much more exact.
        questions = list(df["text"].values)
        df["question_emb"] = retriever.create_embedding(texts=questions)
        dummy_questions = [f"{no}" for no, x in enumerate(questions, start=1)]
        df["question"] = dummy_questions
        print(df.head())
        docs_to_index = df.to_dict(orient="records")
        document_store.write_documents(docs_to_index)
    # question = "Wie viele haben Angst um ihren Job?"
    question = "welche leistungen sind ausgeschlossen?"
    # lower-casing is mandatory here as well
    question = question.lower()
    # We currently cannot use a reader — the available models apparently lack
    # QA fine-tuning — so the retriever alone ranks the best hits by embedding
    # similarity.  get_answers() is not usable without a reader.
    finder = Finder(reader=None, retriever=retriever)
    prediction = finder.get_answers_via_similar_questions(
        question, top_k_retriever=5)
    print_answers(prediction, details="all")
def get_results(txt_files_location, use_gpu, questions_list, results_location):
    """For every txt file tree under txt_files_location: rebuild the ES index,
    ask every question in questions_list, and append the answers to a
    per-file "<name>_results.txt" in results_location.

    NOTE(review): the outer os.walk variables (dirpath, dirnames, files) are
    shadowed by the inner os.walk loop, and sys.stdout is redirected and
    closed but never restored to sys.__stdout__ — both look like latent
    bugs; confirm intent before reusing this function.
    """
    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document")
    for dirpath, dirnames, files in os.walk(txt_files_location):
        for dirname in dirnames:
            # Walk each sub-directory separately (shadows the outer loop vars).
            for dirpath, dirname, files in os.walk(
                    os.path.join(txt_files_location, dirname)):
                for file_name in files:
                    # Re-create the index from scratch for every file;
                    # ignore "already deleted"/"missing" (400/404) responses.
                    document_store.client.indices.delete(index='document',
                                                         ignore=[400, 404])
                    doc_dir = dirpath
                    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                                   clean_func=clean_wiki_text,
                                                   split_paragraphs=True)
                    document_store.write_documents(dicts)
                    retriever = ElasticsearchRetriever(
                        document_store=document_store)
                    reader = FARMReader(
                        model_name_or_path=
                        "elgeish/cs224n-squad2.0-albert-xxlarge-v1",
                        use_gpu=use_gpu)
                    finder = Finder(reader, retriever)
                    # Redirect all prints below into the per-file results file
                    # (append mode, so repeated runs accumulate).
                    sys.stdout = open(
                        os.path.join(results_location,
                                     file_name[:-4] + "_results.txt"), "a+")
                    for i, question in enumerate(questions_list):
                        prediction = finder.get_answers(question=question,
                                                        top_k_retriever=10,
                                                        top_k_reader=1)
                        print("\n\n\nQuestion " + str(i + 1) + ":\n")
                        print(question + "\n")
                        print_answers(prediction, details="minimal")
                    sys.stdout.close()
    document_store.client.transport.close()
# Reader chosen after experimentation.  Other candidates tried (all via
# FARMReader unless noted): deepset/roberta-base-squad2 (gpu and cpu),
# distilbert-base-uncased-distilled-squad (TransformersReader, use_gpu=-1),
# roberta-large, csarron/mobilebert-uncased-squad-v2,
# deepset/xlm-roberta-large-squad2, ktrapeznikov/albert-xlarge-v2-squad-v2,
# plus an EmbeddingRetriever with deepset/sentence_bert (model_format="farm").
reader = FARMReader(model_name_or_path="twmkn9/albert-base-v2-squad2",
                    use_gpu=True)

finder = Finder(reader, retriever)
prediction = finder.get_answers(question="Who is the father of Arya Stark?",
                                top_k_retriever=40,
                                top_k_reader=5)
print_answers(prediction, details="minimal")
# Build documents from the text files (clean_wiki_text is an optional
# str -> str cleaning hook, e.g. to remove footers).
dicts = convert_files_to_dicts(dir_path=doc_dir,
                               clean_func=clean_wiki_text,
                               split_paragraphs=True)
df = pd.DataFrame.from_dict(dicts)

# Embed the "text" column (FAQ-style retrieval matches these embeddings).
questions = list(df["text"].values)
df["question_emb"] = retriever.create_embedding(texts=questions)

# Convert the dataframe to a list of dicts and write them to the store.
docs_to_index = df.to_dict(orient="records")
document_store.write_documents(docs_to_index)

# Init reader & use a Finder to get an answer (same as in Tutorial 1).
reader = TransformersReader(
    model="distilbert-base-uncased-distilled-squad",
    tokenizer="distilbert-base-uncased",
    use_gpu=-1)
finder = Finder(reader=reader, retriever=retriever)
prediction = finder.get_answers(question="Who is the father of Arya?",
                                top_k_reader=3,
                                top_k_retriever=5)
print_answers(prediction, details="all")
def tutorial3_basic_qa_pipeline_without_elasticsearch():
    """Extractive QA without an Elasticsearch backend: in-memory document
    store + TF-IDF retriever + FARM reader over Game of Thrones articles."""
    # Keep documents in RAM; a SQLite-backed store would also work:
    # document_store = SQLDocumentStore(url="sqlite:///qa.db")
    document_store = InMemoryDocumentStore()

    # Fetch the corpus: 517 Wikipedia articles about Game of Thrones.
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Convert the files into indexable dicts; clean_wiki_text is an optional
    # str -> str cleaning hook applied to every document, then index them.
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    document_store.write_documents(dicts)

    # Retriever: narrows the scope for the reader.  With the in-memory or
    # SQL stores the TfidfRetriever (Pandas-based) is the option; see
    # tutorial 1 for the others.
    retriever = TfidfRetriever(document_store=document_store)

    # Reader: scans the retrieved texts and extracts the k best answers.
    # Here a medium-sized RoBERTa QA model via FARM; alternatives include
    # TransformersReader or models like
    # "distilbert-base-uncased-distilled-squad" (fast) and
    # "deepset/bert-large-uncased-whole-word-masking-squad2" (accurate).
    # The no_ans_boost parameter biases towards "no answer possible".
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)

    # Stick both together into the predefined ExtractiveQAPipeline (pipelines
    # are DAGs; see https://haystack.deepset.ai/docs/latest/pipelinesmd).
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    # Voilà! Ask a question.  Other examples:
    #   pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    #   pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)
    prediction = pipe.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10,
                          top_k_reader=5)
    print_answers(prediction, details="minimal")
def tutorial1_basic_qa_pipeline():
    """Extractive QA over Game of Thrones articles backed by Elasticsearch."""
    logger = logging.getLogger(__name__)
    LAUNCH_ELASTICSEARCH = True

    # The DocumentStore holds the searchable documents.  Implementations:
    # ElasticsearchDocumentStore, FAISSDocumentStore, SQLDocumentStore,
    # InMemoryDocumentStore.  Elasticsearch is recommended (full-text
    # queries, BM25, vector storage); see tutorial 3 for SQL/in-memory
    # alternatives.  An existing store can be reused instead of indexing.
    if LAUNCH_ELASTICSEARCH:
        # Start a single-node Elasticsearch via Docker.  Where Docker is not
        # available (e.g. Colab), download and run Elasticsearch manually.
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run([
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'
        ], shell=True)
        if status.returncode:
            raise Exception(
                "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)

    # Connect to Elasticsearch
    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document")

    # Fetch the corpus: 517 Wikipedia articles for Game of Thrones.
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Convert files to indexable dicts; clean_wiki_text is an optional
    # str -> str cleaning hook applied per document.
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    if LAUNCH_ELASTICSEARCH:
        document_store.write_documents(dicts)
    else:
        logger.warning(
            "Since we already have a running ES instance we should not index the same documents again. \n"
            "If you still want to do this call: document_store.write_documents(dicts) manually "
        )

    # Retriever: Elasticsearch's default BM25.  Alternatives:
    #  - customize the ElasticsearchRetriever with queries/filters (boosting)
    #  - EmbeddingRetriever for similarity of embeddings (e.g. Sentence-BERT)
    #  - TfidfRetriever with a SQL/InMemory store for quick prototyping:
    #      from haystack.retriever.tfidf import TfidfRetriever
    #      retriever = TfidfRetriever(document_store=document_store)
    retriever = ElasticsearchRetriever(document_store=document_store)

    # Reader: medium-sized RoBERTa QA model via FARM (local model or any QA
    # model on the Hugging Face hub).  Alternatives: TransformersReader, or
    # models like "distilbert-base-uncased-distilled-squad" (fast) and
    # "deepset/bert-large-uncased-whole-word-masking-squad2" (accurate).
    # no_ans_boost biases towards "no answer possible".
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)

    # Combine both into the predefined ExtractiveQAPipeline (pipelines are
    # DAGs; see https://haystack.deepset.ai/docs/latest/pipelinesmd).
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    # Voilà! Ask a question.  Other examples:
    #   pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    #   pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)
    prediction = pipe.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10,
                          top_k_reader=5)
    print_answers(prediction, details="minimal")
def tutorial6_better_retrieval_via_dpr():
    """Dense Passage Retrieval demo: FAISS store + DPR retriever + reader."""
    # OPTION 1 — FAISS: efficient similarity search over dense vectors; a
    # SQL store (SQLite in-memory by default) underneath holds text and meta
    # data while the embeddings live in a FAISS index.  Flavour "Flat"
    # (default) or "HNSW" (faster, some accuracy loss) via the
    # faiss_index_factory_str argument; see
    # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

    # OPTION 2 — Milvus: also "Flat"/"HNSW", outperforms FAISS for dynamic
    # data, but needs Docker and config files
    # (https://milvus.io/docs/v1.0.0/milvus_docker-cpu.md):
    # launch_milvus()
    # document_store = MilvusDocumentStore()

    # Fetch and convert the corpus.
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    document_store.write_documents(dicts)

    # DPR uses two encoders: one for the query, one for the passages.
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=2,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True)

    # Important: with DPR initialized, update_embeddings() must run once over
    # all previously indexed documents.  This can be slow for large corpora,
    # but afterwards only the query needs embedding at search time.
    document_store.update_embeddings(retriever)

    # Reader: local model or any QA model on the Hugging Face hub.
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)

    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    # Voilà! Ask a question.  Other examples:
    #   pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    #   pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)
    prediction = pipe.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10,
                          top_k_reader=5)
    print_answers(prediction, details="minimal")
# Connect to the locally running Elasticsearch instance.  (A custom search
# field can be configured, e.g. search_fields='body'.)
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore(host="localhost",
                                            username="",
                                            password="",
                                            index="ahrq")

# Sparse BM25 retriever.  Alternative for quick prototyping with a SQLite
# store: TfidfRetriever from the same module.
from haystack.retriever.sparse import ElasticsearchRetriever
retriever = ElasticsearchRetriever(document_store=document_store)

# Reader: local model or any QA model on the Hugging Face hub.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                    num_processes=0,
                    use_gpu=False)

finder = Finder(reader, retriever)
question = "What department is AHRQ a part of?"
prediction = finder.get_answers(question, top_k_retriever=10, top_k_reader=5)
print_answers(prediction, details="medium")
def tutorial4_faq_style_qa():
    """FAQ-style QA: answer user questions via embedding similarity against
    existing FAQ entries instead of extractive QA over raw text.

    Pros: very fast at inference time, reuses existing FAQ data, quite good
    control over answers.  Cons: generalizability — only questions similar
    to existing FAQ ones can be answered.  Combining extractive QA with
    FAQ-style can also be an interesting option.
    """
    LAUNCH_ELASTICSEARCH = False
    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run([
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'
        ], shell=True)
        if status.returncode:
            raise Exception(
                "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(30)

    # In contrast to extractive QA (tutorial 1) we:
    #  * return `text_field` directly as the answer,
    #  * store the question embedding in `embedding_field` for the later
    #    similarity computation against the incoming user question,
    #  * exclude the huge embedding vectors from the search results.
    document_store = ElasticsearchDocumentStore(
        host="localhost",
        username="",
        password="",
        index="document",
        embedding_field="question_emb",
        embedding_dim=768,
        excluded_meta_data=["question_emb"],
        similarity="cosine")

    # Retrieve via vector similarity of the questions (user question vs. FAQ
    # ones) rather than plain BM25.
    # BUGFIX: this construction was commented out in the original, leaving
    # `retriever` undefined and crashing at embed_queries() below.
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model="deepset/sentence_bert",
                                   use_gpu=True)

    # Download a csv with question-answer pairs related to COVID-19 plus some
    # custom metadata.
    temp = requests.get(
        "https://raw.githubusercontent.com/deepset-ai/COVID-QA/master/data/faqs/faq_covidbert.csv"
    )
    open('small_faq_covid.csv', 'wb').write(temp.content)
    df = pd.read_csv("small_faq_covid.csv")

    # Minimal cleaning.
    df.fillna(value="", inplace=True)
    df["question"] = df["question"].apply(lambda x: x.strip())
    print(df.head())

    # Embed the FAQ questions and index the records in the DocumentStore.
    questions = list(df["question"].values)
    df["question_emb"] = retriever.embed_queries(texts=questions)
    df = df.rename(columns={"question": "text"})
    docs_to_index = df.to_dict(orient="records")
    document_store.write_documents(docs_to_index)

    # Initialize a Pipeline (this time without a reader) and ask questions.
    from haystack.pipeline import FAQPipeline
    pipe = FAQPipeline(retriever=retriever)
    prediction = pipe.run(query="How is the virus spreading?",
                          top_k_retriever=10)
    print_answers(prediction, details="all")
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

# --- Prebuilt pipelines ---

# Extractive QA: retriever feeds the reader.
p_extractive_premade = ExtractiveQAPipeline(reader=reader,
                                            retriever=es_retriever)
res = p_extractive_premade.run(query="Who is the father of Arya Stark?",
                               top_k_retriever=10,
                               top_k_reader=5)
print_answers(res, details="minimal")

# Document search: retrieval only, no reader.
p_retrieval = DocumentSearchPipeline(es_retriever)
res = p_retrieval.run(query="Who is the father of Arya Stark?",
                      top_k_retriever=10)
print_documents(res, max_text_len=200)

# Generator pipeline: the Generator needs the document embeddings, so ask
# the store to return them alongside each document.
document_store.return_embedding = True
def _run_and_print(pipeline, query, label):
    """Run `query` through `pipeline` (top_k_retriever=10) and print the
    answers under a `label` header."""
    res = pipeline.run(query=query, top_k_retriever=10)
    print(label + "\n" + "="*15)
    print_answers(res)


def tutorial14_query_classifier():
    """Demonstrate query classification in Haystack pipelines.

    Three set-ups are shown:
      1. SklearnQueryClassifier routing keyword queries to a sparse (ES)
         retriever and full-sentence queries to a dense (DPR) retriever.
      2. The same routing with a TransformersQueryClassifier.
      3. A question-vs-statement classifier that only forwards questions
         to the reader.
    Finally both classifier models are exercised stand-alone.
    """
    # Download and prepare data - 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Convert files to dicts containing documents that can be indexed
    got_dicts = convert_files_to_dicts(
        dir_path=doc_dir,
        clean_func=clean_wiki_text,
        split_paragraphs=True
    )

    # Initialize DocumentStore and index documents
    launch_es()
    document_store = ElasticsearchDocumentStore()
    document_store.delete_all_documents()
    document_store.write_documents(got_dicts)

    # Sparse and dense retrievers plus the reader shared by all pipelines
    es_retriever = ElasticsearchRetriever(document_store=document_store)
    dpr_retriever = DensePassageRetriever(document_store)
    document_store.update_embeddings(dpr_retriever,
                                     update_existing_embeddings=False)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

    # The six demo queries: full sentences should route to DPR (output_1),
    # keyword queries to Elasticsearch (output_2).
    routed_queries = [
        ("Who is the father of Arya Stark?", "DPR Results"),
        ("arya stark father", "ES Results"),
        ("which country was jon snow filmed ?", "DPR Results"),
        ("jon snow country", "ES Results"),
        ("who are the younger brothers of arya stark ?", "DPR Results"),
        ("arya stark younger brothers", "ES Results"),
    ]

    # Pipeline 1: sklearn keyword-vs-question classifier in front of the
    # retrievers; both branches feed the same reader.
    sklearn_keyword_classifier = Pipeline()
    sklearn_keyword_classifier.add_node(component=SklearnQueryClassifier(),
                                        name="QueryClassifier",
                                        inputs=["Query"])
    sklearn_keyword_classifier.add_node(component=dpr_retriever,
                                        name="DPRRetriever",
                                        inputs=["QueryClassifier.output_1"])
    sklearn_keyword_classifier.add_node(component=es_retriever,
                                        name="ESRetriever",
                                        inputs=["QueryClassifier.output_2"])
    sklearn_keyword_classifier.add_node(component=reader,
                                        name="QAReader",
                                        inputs=["ESRetriever", "DPRRetriever"])
    sklearn_keyword_classifier.draw("pipeline_classifier.png")
    for query, label in routed_queries:
        _run_and_print(sklearn_keyword_classifier, query, label)

    # Pipeline 2: same routing, transformer-based classifier.
    transformer_keyword_classifier = Pipeline()
    transformer_keyword_classifier.add_node(
        component=TransformersQueryClassifier(),
        name="QueryClassifier",
        inputs=["Query"])
    transformer_keyword_classifier.add_node(
        component=dpr_retriever,
        name="DPRRetriever",
        inputs=["QueryClassifier.output_1"])
    transformer_keyword_classifier.add_node(
        component=es_retriever,
        name="ESRetriever",
        inputs=["QueryClassifier.output_2"])
    transformer_keyword_classifier.add_node(
        component=reader,
        name="QAReader",
        inputs=["ESRetriever", "DPRRetriever"])
    transformer_keyword_classifier.draw("pipeline_classifier.png")
    for query, label in routed_queries:
        _run_and_print(transformer_keyword_classifier, query, label)

    # Pipeline 3: retrieve first, then let a question-vs-statement classifier
    # decide whether the reader runs (output_1 = question).
    transformer_question_classifier = Pipeline()
    transformer_question_classifier.add_node(component=dpr_retriever,
                                             name="DPRRetriever",
                                             inputs=["Query"])
    transformer_question_classifier.add_node(
        component=TransformersQueryClassifier(
            model_name_or_path="shahrukhx01/question-vs-statement-classifier"),
        name="QueryClassifier",
        inputs=["DPRRetriever"])
    transformer_question_classifier.add_node(
        component=reader,
        name="QAReader",
        inputs=["QueryClassifier.output_1"])
    transformer_question_classifier.draw("question_classifier.png")

    # A question reaches the reader ...
    res_1 = transformer_question_classifier.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_1)
    # ... a statement stops after retrieval, so only retriever results return.
    res_2 = transformer_question_classifier.run(
        query="Arya Stark was the daughter of a Lord.",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    # BUGFIX: the original had a bare `res_2` expression here (a leftover
    # notebook cell display), which prints nothing when run as a script.
    print(res_2)

    # Stand-alone keyword vs. question/statement classifier
    queries = ["arya stark father", "jon snow country",
               "who is the father of arya stark",
               "which country was jon snow filmed?"]
    keyword_classifier = TransformersQueryClassifier()
    for query in queries:
        result = keyword_classifier.run(query=query)
        if result[1] == "output_1":
            category = "question/statement"
        else:
            category = "keyword"
        print(f"Query: {query}, raw_output: {result}, class: {category}")

    # Stand-alone question vs. statement classifier
    queries = ["Lord Eddard was the father of Arya Stark.",
               "Jon Snow was filmed in United Kingdom.",
               "who is the father of arya stark?",
               "Which country was jon snow filmed in?"]
    question_classifier = TransformersQueryClassifier(
        model_name_or_path="shahrukhx01/question-vs-statement-classifier")
    for query in queries:
        result = question_classifier.run(query=query)
        if result[1] == "output_1":
            category = "question"
        else:
            category = "statement"
        print(f"Query: {query}, raw_output: {result}, class: {category}")
def tutorial11_pipelines():
    """Demonstrate Haystack's prebuilt and custom pipelines on the Game of
    Thrones Wikipedia corpus.

    Side effects: downloads the corpus archive, (re)indexes a local
    Elasticsearch instance, downloads several transformer models, prints
    answers/documents to stdout, and writes pipeline diagrams (*.png) to the
    working directory. Requires a reachable Elasticsearch (started via
    ``launch_es()``).
    """
    # Download and prepare data - 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    got_dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

    # Initialize DocumentStore and index documents
    launch_es()
    document_store = ElasticsearchDocumentStore()
    document_store.delete_all_documents()  # start from an empty index
    document_store.write_documents(got_dicts)

    # Initialize Sparse retriever
    es_retriever = ElasticsearchRetriever(document_store=document_store)

    # Initialize dense retriever
    dpr_retriever = DensePassageRetriever(document_store)
    # Only embed documents that do not already have an embedding.
    document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

    ######################
    # Prebuilt Pipelines #
    ######################

    # Extractive QA Pipeline
    ########################

    p_extractive_premade = ExtractiveQAPipeline(reader=reader, retriever=es_retriever)
    res = p_extractive_premade.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
    print_answers(res, details="minimal")

    # Document Search Pipeline
    ##########################

    p_retrieval = DocumentSearchPipeline(es_retriever)
    res = p_retrieval.run(query="Who is the father of Arya Stark?", top_k_retriever=10)
    print_documents(res, max_text_len=200)

    # Generator Pipeline
    ##########################

    # We set this to True so that the document store returns document embeddings
    # with each document, this is needed by the Generator
    document_store.return_embedding = True

    # Initialize generator
    rag_generator = RAGenerator()

    # Generative QA
    p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=dpr_retriever)
    res = p_generator.run(query="Who is the father of Arya Stark?", top_k_retriever=10)
    print_answers(res, details="minimal")

    # We are setting this to False so that in later pipelines,
    # we get a cleaner printout
    document_store.return_embedding = False

    ##############################
    # Creating Pipeline Diagrams #
    ##############################

    p_extractive_premade.draw("pipeline_extractive_premade.png")
    p_retrieval.draw("pipeline_retrieval.png")
    p_generator.draw("pipeline_generator.png")

    ####################
    # Custom Pipelines #
    ####################

    # Extractive QA Pipeline
    ########################

    # Custom built extractive QA pipeline
    p_extractive = Pipeline()
    p_extractive.add_node(component=es_retriever, name="Retriever", inputs=["Query"])
    p_extractive.add_node(component=reader, name="Reader", inputs=["Retriever"])

    # Now we can run it
    res = p_extractive.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
    print_answers(res, details="minimal")
    p_extractive.draw("pipeline_extractive.png")

    # Ensembled Retriever Pipeline
    ##############################

    # Create ensembled pipeline: both retrievers always run; their candidate
    # documents are concatenated before being passed to the single reader.
    p_ensemble = Pipeline()
    p_ensemble.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
    p_ensemble.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"])
    p_ensemble.add_node(component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "DPRRetriever"])
    p_ensemble.add_node(component=reader, name="Reader", inputs=["JoinResults"])
    p_ensemble.draw("pipeline_ensemble.png")

    # Run pipeline
    res = p_ensemble.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=5  # This is top_k per retriever
    )
    print_answers(res, details="minimal")

    # Query Classification Pipeline
    ###############################

    # Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run.
    # Though this looks very similar to the ensembled pipeline shown above,
    # the key difference is that only one of the retrievers is run for each request.
    # By contrast both retrievers are always run in the ensembled approach.
    class QueryClassifier():
        # Two outgoing edges allow the pipeline to branch on run()'s result.
        outgoing_edges = 2

        def run(self, **kwargs):
            # Crude heuristic: a "?" marks a natural-language question
            # (-> output_2 / dense retriever), otherwise treat the query
            # as keywords (-> output_1 / sparse retriever).
            if "?" in kwargs["query"]:
                return (kwargs, "output_2")
            else:
                return (kwargs, "output_1")

    # Here we build the pipeline
    p_classifier = Pipeline()
    p_classifier.add_node(component=QueryClassifier(), name="QueryClassifier", inputs=["Query"])
    p_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"])
    p_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_2"])
    p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
    p_classifier.draw("pipeline_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = p_classifier.run(query="Who is the father of Arya Stark?", top_k_retriever=10)
    print("DPR Results" + "\n" + "=" * 15)
    print_answers(res_1)

    # Run only the sparse retriever on a keyword based query
    res_2 = p_classifier.run(query="Arya Stark father", top_k_retriever=10)
    print("ES Results" + "\n" + "=" * 15)
    print_answers(res_2)
import os

# data = pd.read_csv('test.txt', sep='\t')

# Interactive QA console: connect to a local Elasticsearch index that already
# holds the documents, then answer questions typed on stdin in a loop.
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")

# Sparse retriever over the Elasticsearch index.
retriever = ElasticsearchRetriever(document_store=document_store)

# Transformers-based extractive reader.
# NOTE(review): use_gpu=-1 presumably selects CPU (transformers device-index
# convention) — confirm against the installed haystack version.
reader = TransformersReader(model_name_or_path='deepset/roberta-base-squad2',
                            tokenizer='deepset/roberta-base-squad2',
                            context_window_size=500,
                            use_gpu=-1)
# reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True, context_window_size=500)

# Finder ties retriever and reader together into a single QA interface.
finder = Finder(reader, retriever)

if __name__ == '__main__':
    # questions = ["What do we know about Bourin and Uchiyama?"]
    '''
    prediction = finder.get_answers(question="What do we know about symbiotic stars?",
                                    top_k_retriever=10, top_k_reader=3)
    print_answers(prediction, details='minimal')
    '''
    # Read-answer loop: retrieve the top 5 documents and extract the top 5
    # answers for each question until the user quits.
    while True:
        try:
            qes = input('Question: ')
        except (EOFError, KeyboardInterrupt):
            # Exit cleanly on Ctrl-D / Ctrl-C instead of dumping a traceback
            # (the bare `while True: input()` crashed on closed stdin).
            break
        # print(qes)
        prediction = finder.get_answers(question=qes, top_k_retriever=5, top_k_reader=5)
        print_answers(prediction, details='minimal')