def get_retriever(retriever_type, document_store):
    """Build a retriever of the requested type bound to *document_store*.

    Supported types: "dpr", "tfidf", "embedding", "retribert",
    "elasticsearch", "es_filter_only". Any other value raises.
    """
    if retriever_type == "dpr":
        return DensePassageRetriever(
            document_store=document_store,
            query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
            passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
            use_gpu=False,
            embed_title=True)
    if retriever_type == "tfidf":
        # TF-IDF needs an explicit fit over the store's documents before use.
        tfidf = TfidfRetriever(document_store=document_store)
        tfidf.fit()
        return tfidf
    if retriever_type == "embedding":
        return EmbeddingRetriever(document_store=document_store,
                                  embedding_model="deepset/sentence_bert",
                                  use_gpu=False)
    if retriever_type == "retribert":
        return EmbeddingRetriever(
            document_store=document_store,
            embedding_model="yjernite/retribert-base-uncased",
            model_format="retribert",
            use_gpu=False)
    if retriever_type == "elasticsearch":
        return ElasticsearchRetriever(document_store=document_store)
    if retriever_type == "es_filter_only":
        return ElasticsearchFilterOnlyRetriever(document_store=document_store)
    raise Exception(f"No retriever fixture for '{retriever_type}'")
def test_faq_retriever_in_memory_store():
    """FAQ-style retrieval smoke test against the in-memory document store."""
    from haystack.database.memory import InMemoryDocumentStore
    from haystack.retriever.dense import EmbeddingRetriever

    document_store = InMemoryDocumentStore(embedding_field="embedding")
    # One matching FAQ entry plus ten identical distractor entries.
    documents = [{'text': 'By running tox in the command line!',
                  'meta': {'name': 'How to test this library?',
                           'question': 'How to test this library?'}}]
    documents += [{'text': 'By running tox in the command line!',
                   'meta': {'name': 'blah blah blah',
                            'question': 'blah blah blah'}}
                  for _ in range(10)]

    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model="deepset/sentence_bert",
                                   gpu=False)
    # Embed each FAQ question and store the vector on the document itself.
    for doc in documents:
        doc['embedding'] = retriever.embed([doc['meta']['question']])[0]
    document_store.write_documents(documents)

    finder = Finder(reader=None, retriever=retriever)
    prediction = finder.get_answers_via_similar_questions(
        question="How to test this?", top_k_retriever=1)
    assert len(prediction.get('answers', [])) == 1
def main():
    """Populate an Elasticsearch store with text embeddings and run one
    similarity query via a reader-less Finder.
    """
    POPULATE_DOCUMENT_STORE = True
    # NOTE(review): embedding_dim is passed as the string "768" here; other
    # snippets pass the int 768 — confirm the store accepts a string.
    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="",
                                                index="document",
                                                text_field="text",
                                                embedding_field="question_emb",
                                                embedding_dim="768",
                                                excluded_meta_data=["question_emb"])
    # Windows-style path to a locally stored German BERT model.
    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model=os.getcwd() + "\\kbQA\\bert-german-model",
        gpu=True, model_format="transformers")
    if POPULATE_DOCUMENT_STORE:
        doc_dir = os.getcwd() + "\\kbQA\\data\\Skripte\\Securplus\\txt"
        dicts = convert_files_to_dicts(
            dir_path=doc_dir, clean_func=clean_text, split_paragraphs=True)
        # Dump all converted texts into one file for manual inspection.
        with open("Output.txt", "w") as text_file:
            text = ""
            for doc in dicts:
                text = text + "\n" + doc["text"]
            text_file.write(text)
        df = pd.DataFrame.from_dict(dicts)
        # Careful here! At this point we create embeddings not for the
        # questions but for the texts, i.e. the answers. The variable names
        # are therefore somewhat confusingly chosen.
        # dummy_questions is just an increasing number starting at one. It is
        # needed because otherwise exceptions are thrown during search.
        # The tutorial appears to assume an FAQ setup where question and
        # answer are predefined, so embeddings are created for the predefined
        # question and the k best candidates are returned based on those. We,
        # in contrast, create embeddings for every single text.
        # todo: Since we create embeddings per text we may need sentence
        # segmentation, because the longer the texts get, the less precise
        # the embeddings become. Per-sentence embeddings are considerably
        # more exact.
        questions = list(df["text"].values)
        df["question_emb"] = retriever.create_embedding(texts=questions)
        dummy_questions = [f"{no}" for no, x in enumerate(questions, start=1)]
        df["question"] = dummy_questions
        print(df.head())
        docs_to_index = df.to_dict(orient="records")
        document_store.write_documents(docs_to_index)
    # question = "Wie viele haben Angst um ihren Job?"
    question = "welche leistungen sind ausgeschlossen?"
    # here as well: lowercasing is strictly required!
    question = question.lower()
    # We currently cannot use a reader, since these apparently require QA
    # fine-tuning. The retriever fetches the best hits via the embeddings.
    # get_answers() is not usable without a reader.
    finder = Finder(reader=None, retriever=retriever)
    prediction = finder.get_answers_via_similar_questions(
        question, top_k_retriever=5)
    print_answers(prediction, details="all")
def test_faiss_retrieving(document_store):
    """Indexing DOCUMENTS and querying should return every stored document."""
    document_store.write_documents(DOCUMENTS)
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model="deepset/sentence_bert",
                                   use_gpu=False)
    hits = retriever.retrieve(query="How to test this?")
    assert len(hits) == len(DOCUMENTS)
    assert type(hits[0]) == Document
def test_faiss_retrieving(index_factory):
    """Round-trip DOCUMENTS through a fresh FAISS store built with *index_factory*."""
    document_store = FAISSDocumentStore(
        sql_url="sqlite:///haystack_test_faiss.db",
        faiss_index_factory_str=index_factory)
    # Start from a clean index so earlier runs cannot leak documents in.
    document_store.delete_all_documents(index="document")
    # IVF-style indices must be trained on sample vectors before writing.
    if "ivf" in index_factory.lower():
        document_store.train_index(DOCUMENTS)
    document_store.write_documents(DOCUMENTS)
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model="deepset/sentence_bert",
                                   use_gpu=False)
    retrieved = retriever.retrieve(query="How to test this?")
    assert len(retrieved) == len(DOCUMENTS)
    assert type(retrieved[0]) == Document
def get_retriever(retriever_type, document_store):
    """Build a retriever of the requested type bound to *document_store*.

    Supported types: "dpr", "tfidf", "embedding", "elasticsearch",
    "es_filter_only". Any other value raises.
    """
    if retriever_type == "dpr":
        retriever = DensePassageRetriever(
            document_store=document_store,
            query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
            passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
            use_gpu=False, embed_title=True,
            remove_sep_tok_from_untitled_passages=True)
    elif retriever_type == "tfidf":
        return TfidfRetriever(document_store=document_store)
    elif retriever_type == "embedding":
        retriever = EmbeddingRetriever(document_store=document_store,
                                       embedding_model="deepset/sentence_bert",
                                       use_gpu=False)
    # BUG FIX: was misspelled "elsticsearch", which made this branch
    # unreachable and made the correct spelling fall through to the raise.
    elif retriever_type == "elasticsearch":
        retriever = ElasticsearchRetriever(document_store=document_store)
    elif retriever_type == "es_filter_only":
        retriever = ElasticsearchFilterOnlyRetriever(
            document_store=document_store)
    else:
        raise Exception(f"No retriever fixture for '{retriever_type}'")
    return retriever
def load(self):
    """Lazily initialise both QA stacks: a FAISS-backed FAQ finder
    (store2/retriever2/finder2) and a SQL-backed extractive finder
    (store/retriever/reader/finder). Idempotent: returns early once both
    finders exist; each component is only built if still unset.
    """
    # Fast path: everything already constructed.
    if(self.finder and self.finder2):
        return
    if(not self.document_store2):
        # FAQ store is a FAISS index persisted to 'faiss2' plus its SQL DB.
        self.document_store2 = FAISSDocumentStore.load(
            sql_url=sqlUrlFAQ, faiss_file_path='faiss2')
        # save before load in preprocess
        self.initSql(url=sqlUrlFAQ, document_store=self.document_store2)
    # else:  # reset session
    #     # self.document_store2.session.close()
    #     super(
    #         FAISSDocumentStore, self.document_store2).__init__(url=sqlUrlFAQ)
    if(not self.retriever2):
        # Locally saved sentence-BERT model; CPU inference.
        self.retriever2 = EmbeddingRetriever(document_store=self.document_store2,
                                             embedding_model="sentence_bert-saved",
                                             use_gpu=False)
    if(not self.finder2):
        # FAQ-style finder: retriever only, no reader.
        self.finder2 = Finder(reader=None, retriever=self.retriever2)
    if(not self.document_store):
        self.document_store = SQLDocumentStore(url=sqlUrl)
        # FAISSDocumentStore.load(faiss_file_path='faiss1', sql_url=sqlUrl)
        self.initSql(url=sqlUrl, document_store=self.document_store)
    # else:  # reset session
    #     # self.document_store.session.close()
    #     super(
    #         FAISSDocumentStore, self.document_store).__init__(url=sqlUrl)
    # self.retriever = EmbeddingRetriever(  # reduce load by sharing the same retriever and set store on fly??
    #     document_store=self.document_store, embedding_model="sentence_bert-saved", use_gpu=False) if not self.retriever else self.retriever
    if(not self.retriever):
        self.retriever = TfidfRetriever(document_store=self.document_store)
    # Extractive reader from a local FARM model dir; no-answer boost disabled.
    self.reader = FARMReader(model_name_or_path=modelDir, use_gpu=False,
                             no_ans_boost=0) if not self.reader else self.reader
    # reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)
    self.finder = Finder(
        self.reader, self.retriever) if not self.finder else self.finder
def _setup_retriever(self, use_gpu, quantize_model):
    """Create the embedding retriever; quantize it when running on CPU.

    Quantization is only applied for CPU inference (use_gpu is False) and
    only when requested via *quantize_model*.
    """
    retriever = EmbeddingRetriever(
        document_store=self.document_store,
        embedding_model=self.model_name,
        use_gpu=use_gpu)
    if quantize_model and not use_gpu:
        self.set_quantized_model(retriever)
    return retriever
def setup(self):
    """Wire up the full QA pipeline: two document stores (Elasticsearch and
    FAISS-on-Postgres), four retrievers (ES/BM25, DPR, embedding, table),
    a query classifier, a question generator, and a DPR training manager,
    then assemble them into a single Haystack Pipeline stored on self.
    """
    print("SETTING UP PIPELINE")
    self.document_store = ElasticsearchDocumentStore(
        similarity="dot_product", host="elasticsearch", username="",
        password="", index="document")
    # FAISS vectors with document metadata persisted in Postgres.
    self.document_store_faiss = FAISSDocumentStore(
        index="document",
        faiss_index_factory_str="Flat",
        return_embedding=True,
        sql_url=f"postgresql://{config('POSTGRES_USER')}:{config('POSTGRES_PASSWORD')}@{config('POSTGRES_HOST')}:{config('POSTGRES_PORT')}/faiss"
    )
    # Ingest the documents; presumably writes into both stores — confirm in
    # write_as4_docs / write_table_docs.
    processor, converter = self.write_as4_docs()
    table_data = self.write_table_docs(converter, processor)
    es_retriever = ElasticsearchRetriever(
        document_store=self.document_store)
    print("SETTING UP DPR")
    # Reuse the most recently trained DPR model for the FAISS store.
    dpr_retriever = DPRTrainingManager.get_current_retriever(
        self.document_store_faiss)
    print("SETTING UP EMBEDDINGS")
    embedding_retriever = EmbeddingRetriever(
        document_store=self.document_store_faiss,
        embedding_model="deepset/sentence_bert"
    )
    query_classifier = QueryClassifier()
    print("SETTING UP TABLE")
    table_retriever = TableRetriever(table_data)
    print("SETUP RETRIEVERS")
    self.question_generator = FurtherQuestionGenerator()
    print("UPDATING EMBEDDINGS")
    # NOTE(review): embeddings are refreshed with the DPR retriever only;
    # embedding_retriever queries the same vectors — confirm that is intended.
    self.document_store_faiss.update_embeddings(dpr_retriever)
    print("UPDATED EMBEDDINGS")
    self.dpr_node = ContinualDPRNode(
        dpr_retriever, self.document_store_faiss)
    result = Result()
    self.trainer = DPRTrainingManager(
        self.document_store_faiss, self.dpr_node)
    print("SETUP COMPONENTS")
    # Graph: three retrievers fan out from Query, results are merged, then
    # routed by the classifier to either the question generator (output_1)
    # or the table retriever (output_2), and joined in Result.
    pipeline = Pipeline()
    pipeline.add_node(component=es_retriever,
                      name="ESRetriever", inputs=["Query"])
    pipeline.add_node(component=self.dpr_node,
                      name="DPRRetriever", inputs=["Query"])
    pipeline.add_node(component=embedding_retriever,
                      name="EmbeddingRetriever", inputs=["Query"])
    pipeline.add_node(component=JoinDocuments(join_mode="merge"),
                      name="JoinResults",
                      inputs=["DPRRetriever", "EmbeddingRetriever", "ESRetriever"])
    pipeline.add_node(component=query_classifier,
                      name="QueryClassifier", inputs=["JoinResults"])
    pipeline.add_node(component=self.question_generator, name="QnGenerator",
                      inputs=["QueryClassifier.output_1"])
    pipeline.add_node(component=table_retriever, name="TableRetriever",
                      inputs=["QueryClassifier.output_2"])
    pipeline.add_node(component=result, name="Result",
                      inputs=["QnGenerator", "TableRetriever"])
    self.pipeline = pipeline
    print("SETUP PIPELINE")
def test_faiss_finding(document_store):
    """A top-1 similar-question lookup should yield exactly one answer."""
    document_store.write_documents(DOCUMENTS)
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model="deepset/sentence_bert",
                                   use_gpu=False)
    finder = Finder(reader=None, retriever=retriever)
    answers = finder.get_answers_via_similar_questions(
        question="How to test this?", top_k_retriever=1).get('answers', [])
    assert len(answers) == 1
def get_retriever(retriever_name, doc_store):
    """Build a benchmark retriever of the requested kind bound to *doc_store*.

    Supported names: "elastic", "tfidf", "dpr", "sentence_transformers".

    Raises:
        ValueError: for any unknown *retriever_name*. (BUG FIX: previously
        the function silently fell through and returned None, which only
        surfaced later as an AttributeError at the call site.)
    """
    if retriever_name == "elastic":
        return ElasticsearchRetriever(doc_store)
    if retriever_name == "tfidf":
        return TfidfRetriever(doc_store)
    if retriever_name == "dpr":
        return DensePassageRetriever(
            document_store=doc_store,
            query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
            passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
            use_gpu=True, use_fast_tokenizers=False)
    if retriever_name == "sentence_transformers":
        return EmbeddingRetriever(document_store=doc_store,
                                  embedding_model="nq-distilbert-base-v1",
                                  use_gpu=True,
                                  model_format="sentence_transformers")
    raise ValueError(f"Unknown retriever name '{retriever_name}'")
def __init__(self, id, add_sample_data=False):
    """FAQ-QA model: Elasticsearch-backed store plus a reader-less Finder.

    When *add_sample_data* is set, seed the index with sample FAQ entries.
    """
    Model.__init__(self, id)
    store = ElasticsearchDocumentStore(
        host=DB_HOST,
        port=DB_PORT,
        index=self.id,
        embedding_field="question_emb",
        embedding_dim=768,
        excluded_meta_data=["question_emb"])
    embedder = EmbeddingRetriever(document_store=store,
                                  embedding_model="deepset/sentence_bert",
                                  use_gpu=False)
    self.finder = Finder(reader=None, retriever=embedder)
    if add_sample_data:
        add_sample_data_faq_qa(self)
# Retry until Elasticsearch is reachable (the container may still be booting).
while True:
    try:
        # 512 dimensions because that is what the sentence transformer returns
        document_store = ElasticsearchDocumentStore(
            host="elasticsearch", username="", password="", index="document",
            embedding_dim=512, embedding_field="embedding")
        break
    # BUG FIX: was a bare `except:`, which also swallows KeyboardInterrupt
    # and SystemExit, making the retry loop impossible to interrupt.
    except Exception:
        time.sleep(15)

retriever = EmbeddingRetriever(document_store=document_store,
                               embedding_model=retriever_model_name_full,
                               model_format=retriever_model_type,
                               gpu=False)

# Only (re-)index when the store is empty.
if document_store.get_document_count() < 1:
    dicts = convert_files_to_dicts(dir_path=data_path,
                                   clean_func=clean_text,
                                   split_paragraphs=True)
    logging.info("files to dicts done.")
    # write dicts containing the texts to the database
    document_store.write_documents(dicts)
    logging.info("documents to store written.")
    # generate embeddings for each text and add it to the database entry
    document_store.update_embeddings(retriever)
    logging.info("embeddings to documents in store written.")

finder = Finder(retriever=retriever, reader=None)
document_store = ElasticsearchDocumentStore( host="localhost", username="", password="", index="document", text_field="text", embedding_field="question_emb", embedding_dim="768", excluded_meta_data=["question_emb"]) # ## Initalize Retriever, Reader, & Finder # ### Retriever # gpu= True to speed up processing # BERT-Model is trained to use the second extraction layer retriever = EmbeddingRetriever(document_store=document_store, embedding_model=os.getcwd() + "/kbQA/bert-german-model", gpu=True, model_format="transformers", emb_extraction_layer=-2) if POPULATE_DOCUMENT_STORE: # set path to directory containing the embeddings doc_dir = os.getcwd() + "/kbQA/data/tesla_embs.json" # initialize dataframe with column names df = pd.DataFrame(columns=['name', 'text', 'question_emb', 'question'], ) # open the file with open(doc_dir, encoding="utf-8") as file: # initialize question indexing i = 0 # each line has multiple paragraphs and embeddings, read file line # by line for cnt, line in enumerate(file):
def tutorial12_lfqa():
    """
    Document Store:
    FAISS is a library for efficient similarity search on a cluster of dense vectors.
    The `FAISSDocumentStore` uses a SQL(SQLite in-memory be default) database under-the-hood
    to store the document text and other meta data. The vector embeddings of the text are
    indexed on a FAISS Index that later is queried for searching answers.
    The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for
    faster search at the expense of some accuracy. Just set the faiss_index_factor_str argument in the constructor.
    For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
    """
    from haystack.document_store.faiss import FAISSDocumentStore

    # 128-dim vectors to match the retribert embedding size used below.
    document_store = FAISSDocumentStore(vector_dim=128, faiss_index_factory_str="Flat")

    """
    Cleaning & indexing documents:
    Similarly to the previous tutorials, we download, convert and index some Game of Thrones articles to our DocumentStore
    """
    # Let's first get some files that we want to use
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
    # Convert files to dicts
    dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
    # Now, let's write the dicts containing documents to our DB.
    document_store.write_documents(dicts)

    """
    Initalize Retriever and Reader/Generator:
    We use a `RetribertRetriever` and we invoke `update_embeddings` to index the embeddings of documents in the `FAISSDocumentStore`
    """
    from haystack.retriever.dense import EmbeddingRetriever

    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model="yjernite/retribert-base-uncased",
        model_format="retribert")
    document_store.update_embeddings(retriever)

    """Before we blindly use the `RetribertRetriever` let's empirically test it to make sure a simple search indeed finds the relevant documents."""
    from haystack.utils import print_answers, print_documents
    from haystack.pipeline import DocumentSearchPipeline

    p_retrieval = DocumentSearchPipeline(retriever)
    res = p_retrieval.run(query="Tell me something about Arya Stark?", top_k_retriever=5)
    print_documents(res, max_text_len=512)

    """
    Similar to previous Tutorials we now initalize our reader/generator.
    Here we use a `Seq2SeqGenerator` with the *yjernite/bart_eli5* model (see: https://huggingface.co/yjernite/bart_eli5)
    """
    generator = Seq2SeqGenerator(model_name_or_path="yjernite/bart_eli5")

    """
    Pipeline:
    With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline.
    Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
    To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `GenerativeQAPipeline` that combines a retriever and a reader/generator to answer our questions.
    You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    """
    from haystack.pipeline import GenerativeQAPipeline

    pipe = GenerativeQAPipeline(generator, retriever)

    """Voilà! Ask a question!"""
    query_1 = "Why did Arya Stark's character get portrayed in a television adaptation?"
    result_1 = pipe.run(query=query_1, top_k_retriever=1)
    print(f"Query: {query_1}")
    print(f"Answer: {result_1['answers'][0]}")
    print()
    query_2 = "What kind of character does Arya Stark play?"
    result_2 = pipe.run(query=query_2, top_k_retriever=1)
    print(f"Query: {query_2}")
    print(f"Answer: {result_2['answers'][0]}")
    print()
    # NOTE(review): this final run repeats query_2 and discards the result —
    # looks redundant; confirm whether it can be removed.
    pipe.run(query=query_2, top_k_retriever=1)
# max_seq_len_query=64, # max_seq_len_passage=256, # batch_size=16, # use_gpu=True, # embed_title=True, # use_fast_tokenizers=True) # Get dataframe with columns "question", "answer" and some custom metadata df = pd.read_csv("faq.csv") # Minimal cleaning df.fillna(value="", inplace=True) df["question"] = df["question"].apply(lambda x: x.strip()) print(df.head()) # Get embeddings for our questions from the FAQs # questions = list(df["question"].values) # df["question_emb"] = retriever2.embed_queries(texts=questions) # text is the field to be converted to embeddings df = df.rename(columns={"question": "text"}) # Convert Dataframe to list of dicts and index them in our DocumentStore docs_to_index = df.to_dict(orient="records") document_store.delete_all_documents() document_store.write_documents(docs_to_index) retriever2 = EmbeddingRetriever( document_store=document_store, embedding_model="sentence_bert-saved", use_gpu=False) document_store.update_embeddings(retriever2) document_store.save('faiss2')
# Alternative retriever - double BERT neural networks for question and doc embedding from haystack.retriever.dense import DensePassageRetriever dpr_retriever = DensePassageRetriever( document_store=document_store_FAISS, query_embedding_model="facebook/dpr-question_encoder-single-nq-base", passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base", use_gpu=True, embed_title=True, ) document_store_FAISS.update_embeddings( retriever=dpr_retriever) # possible training of dpr model # Alternative retriever - single BERT to embed both question and doc, may be better for similar documents (our case) from haystack.retriever.dense import EmbeddingRetriever embedding_retriever = EmbeddingRetriever(document_store=document_store_FAISS, embedding_model="deepset/sentence_bert") # Reader to further scan with Hugging Face models # reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1) # reader = FARMReader(model_name_or_path="deepset/bert-large-uncased-whole-word-masking-squad2", use_gpu=True) # Decide whether the answer should be retrieved from the tables or the general texts class QueryClassifier: outgoing_edges = 2 def run(self, **kwargs): #print("Running Query Classifier") # print(len(kwargs["documents"])) # print(kwargs["documents"][0].meta)
def tutorial4_faq_style_qa():
    """FAQ-Style QA: utilize existing FAQs for Question Answering.

    While *extractive QA* works on pure texts and is therefore more
    generalizable, FAQ-style QA matches the user question against existing
    FAQ questions via embedding similarity.

    Pros: very fast at inference time; utilizes existing FAQ data; good
    control over answers.
    Cons: can only answer questions similar to existing FAQ entries.
    """
    LAUNCH_ELASTICSEARCH = False
    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run([
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'
        ], shell=True)
        if status.returncode:
            raise Exception(
                "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(30)

    ### Init the DocumentStore
    # * `text_field` / `embedding_field` name what is returned as the answer
    #   and where the question embedding is stored
    # * `excluded_meta_data=["question_emb"]` keeps the huge vectors out of
    #   search results
    document_store = ElasticsearchDocumentStore(
        host="localhost", username="", password="", index="document",
        embedding_field="question_emb", embedding_dim=768,
        excluded_meta_data=["question_emb"], similarity="cosine")

    ### Create a Retriever using embeddings
    # BUG FIX: this constructor was commented out, but `retriever` is used
    # below (embed_queries, FAQPipeline), which raised a NameError.
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model="deepset/sentence_bert",
                                   use_gpu=True)

    # Download a csv containing some FAQ data
    # Here: Some question-answer pairs related to COVID-19
    temp = requests.get(
        "https://raw.githubusercontent.com/deepset-ai/COVID-QA/master/data/faqs/faq_covidbert.csv"
    )
    # BUG FIX: use a context manager so the file handle is always closed.
    with open('small_faq_covid.csv', 'wb') as csv_file:
        csv_file.write(temp.content)

    # Get dataframe with columns "question", "answer" and some custom metadata
    df = pd.read_csv("small_faq_covid.csv")
    # Minimal cleaning
    df.fillna(value="", inplace=True)
    df["question"] = df["question"].apply(lambda x: x.strip())
    print(df.head())

    # Get embeddings for our questions from the FAQs
    questions = list(df["question"].values)
    df["question_emb"] = retriever.embed_queries(texts=questions)
    df = df.rename(columns={"question": "text"})

    # Convert Dataframe to list of dicts and index them in our DocumentStore
    docs_to_index = df.to_dict(orient="records")
    document_store.write_documents(docs_to_index)

    # Initialize a Pipeline (this time without a reader) and ask questions
    from haystack.pipeline import FAQPipeline

    pipe = FAQPipeline(retriever=retriever)
    prediction = pipe.run(query="How is the virus spreading?", top_k_retriever=10)
    print_answers(prediction, details="all")
document_store = ElasticsearchDocumentStore( host="localhost", username="", password="", index="document", embedding_field="question_emb", embedding_dim=768, excluded_meta_data=["question_emb"]) ### Create a Retriever using embeddings # Instead of retrieving via Elasticsearch's plain BM25, we want to use vector similarity of the questions (user question vs. FAQ ones). # We can use the `EmbeddingRetriever` for this purpose and specify a model that we use for the embeddings. # retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", use_gpu=True) # Download a csv containing some FAQ data # Here: Some question-answer pairs related to COVID-19 temp = requests.get( "https://raw.githubusercontent.com/deepset-ai/COVID-QA/master/data/faqs/faq_covidbert.csv" ) open('small_faq_covid.csv', 'wb').write(temp.content) # Get dataframe with columns "question", "answer" and some custom metadata df = pd.read_csv("small_faq_covid.csv") # Minimal cleaning df.fillna(value="", inplace=True) df["question"] = df["question"].apply(lambda x: x.strip()) print(df.head())
password=DB_PW, index=DB_INDEX, scheme=ES_CONN_SCHEME, ca_certs=False, verify_certs=False, text_field=TEXT_FIELD_NAME, search_fields=SEARCH_FIELD_NAME, embedding_dim=EMBEDDING_DIM, embedding_field=EMBEDDING_FIELD_NAME, excluded_meta_data=EXCLUDE_META_DATA_FIELDS, # type: ignore faq_question_field=FAQ_QUESTION_FIELD_NAME, ) if RETRIEVER_TYPE == "EmbeddingRetriever": retriever = EmbeddingRetriever(document_store=document_store, embedding_model=EMBEDDING_MODEL_PATH, model_format=EMBEDDING_MODEL_FORMAT, use_gpu=USE_GPU) # type: BaseRetriever elif RETRIEVER_TYPE == "ElasticsearchRetriever": retriever = ElasticsearchRetriever(document_store=document_store) elif RETRIEVER_TYPE is None or RETRIEVER_TYPE == "ElasticsearchFilterOnlyRetriever": retriever = ElasticsearchFilterOnlyRetriever(document_store=document_store) else: raise ValueError( f"Could not load Retriever of type '{RETRIEVER_TYPE}'. " f"Please adjust RETRIEVER_TYPE to one of: " f"'EmbeddingRetriever', 'ElasticsearchRetriever', 'ElasticsearchFilterOnlyRetriever', None" f"OR modify rest_api/search.py to support your retriever") if READER_MODEL_PATH: # for extractive doc-qa if READER_TYPE == "TransformersReader": use_gpu = -1 if not USE_GPU else GPU_NUMBER
def embedding_retriever(faiss_document_store):
    """Sentence-BERT EmbeddingRetriever bound to the FAISS store (CPU)."""
    return EmbeddingRetriever(
        document_store=faiss_document_store,
        embedding_model="deepset/sentence_bert",
        use_gpu=False,
    )
def test_embedding_retriever(document_store):
    """FAQ-style retrieval: the one matching entry must win over distractors."""
    # One matching FAQ entry plus ten identical distractor entries.
    documents = [{
        'text': 'By running tox in the command line!',
        'meta': {
            'name': 'How to test this library?',
            'question': 'How to test this library?'
        }
    }]
    documents += [{
        'text': 'By running tox in the command line!',
        'meta': {
            'name': 'blah blah blah',
            'question': 'blah blah blah'
        }
    } for _ in range(10)]

    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model="deepset/sentence_bert",
                                   use_gpu=False)
    # Embed each FAQ question and store the vector on the document itself.
    for doc in documents:
        doc['embedding'] = retriever.embed([doc['meta']['question']])[0]
    document_store.write_documents(documents)

    finder = Finder(reader=None, retriever=retriever)
    prediction = finder.get_answers_via_similar_questions(
        question="How to test this?", top_k_retriever=1)
    assert len(prediction.get('answers', [])) == 1
def main():
    """Index documents into Elasticsearch with sentence-transformer embeddings,
    then either run the scripted retrieval benchmark (TEST) or an interactive
    query loop.
    """
    # fetch model files if not present. not hosted in git repo
    # model_exists = os.path.isfile(
    #     './kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin')
    # if not model_exists:
    #     logging.info("Starting model download (about 700MB) ...")
    #     urllib.request.urlretrieve(
    #         "https://cdn.huggingface.co/mrm8488/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin",
    #         "./kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin")
    #     logging.info("model successfully downloaded")

    # start Elasticsearch
    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        # BUG FIX: subprocess.call() returns an int, which has no
        # `.returncode` attribute; subprocess.run() returns a CompletedProcess.
        status = subprocess.run(
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" --name "MLQA2" elasticsearch:7.6.2',
            shell=True
        )
        if status.returncode:
            raise Exception("Failed to launch Elasticsearch. If you want to "
                            "connect to an existing Elasticsearch instance"
                            "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)

    # 512 dimensions because that is what the sentence transformer returns
    document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                                password="", index="document",
                                                embedding_dim=512,
                                                embedding_field="embedding")
    # Single retriever instance reused for indexing and querying
    # (was constructed twice with identical arguments).
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model=retriever_model_name_full,
                                   model_format=retriever_model_type,
                                   gpu=False)
    # load docs in database
    if LAUNCH_ELASTICSEARCH or POPULATE_DOCUMENT_STORE:
        dicts = convert_files_to_dicts(
            dir_path=data_path, clean_func=clean_text, split_paragraphs=True)
        logging.info("files to dicts done.")
        # write dicts containing the texts to the database
        document_store.write_documents(dicts)
        logging.info("documents to store written.")
        # generate embeddings for each text and add it to the database entry
        document_store.update_embeddings(retriever)
        logging.info("embeddings to documents in store written.")

    # reader won't be used in the retrieval because results take longer and
    # the quality is worse; still has to be initialized
    # reader = TransformersReader(model="./kbQA/" + reader_model_name,
    #                             tokenizer="./kbQA/" + reader_model_name,
    #                             use_gpu=-1)
    finder = Finder(retriever=retriever, reader=None)

    if TEST:
        try:
            with open("./kbQA/Test.json", encoding="utf-8") as file:
                times = []
                results = []
                failed = []
                # one JSON object per line: {"question": ..., "answer": ...}
                for line in file:
                    data = json.loads(line)
                    q = data["question"]
                    # fetch results from db and time the lookup
                    start_time = time.process_time()
                    candidate_docs = finder.retriever.retrieve(
                        query=q, filters=None, top_k=5)
                    end_time = time.process_time()
                    times.append(end_time - start_time)
                    # a question counts as answered if the expected answer
                    # appears verbatim in any retrieved document
                    answered = any(data["answer"] in doc.text
                                   for doc in candidate_docs)
                    if answered:
                        results.append(True)
                    else:
                        failed.append({"q": q,
                                       "correct": data["answer"],
                                       "a": [doc.text for doc in candidate_docs]})
                total = sum(times)
                logging.info("Average time per request: %f", total / len(times))
                logging.info("Questions answered correctly: %d/%d (%f)",
                             len(results), len(times), len(results) / len(times))
                logging.info("Failed questions:")
                for fail in failed:
                    logging.info("Question: %s", fail["q"])
                    logging.info("Correct Answer: %s", fail["correct"])
                    for answer in fail["a"]:
                        logging.info(answer)
        except Exception as e:
            traceback.print_exc()
            logging.error(f"exception: {e}")
    else:
        # loop until Keyboard-Interrupt event ctrl+c or "!q" input
        while True:
            try:
                # read input from console input
                q = input("Enter:").strip()
                # input "!q" to stop execution
                if q == "!q":
                    exit(0)
                # fetch results from db
                candidate_docs = finder.retriever.retrieve(
                    query=q, filters=None, top_k=5)
                for doc in candidate_docs:
                    logging.info("doc id: %s", doc.id)
                    logging.info("doc meta name: %s", doc.meta["name"])
                    logging.info("doc text: %s", doc.text)
                    logging.info("doc query score: %s", doc.query_score)
                    logging.info("")
                # not used
                # prediction = finder.get_answers(
                #     question=q, top_k_retriever=10, top_k_reader=5)
                # print_answers(prediction, details="medium")
            except Exception as e:
                traceback.print_exc()
                logging.error(f"exception: {e}")