def test_finder_get_answers():
    """Smoke test: SQLite-backed store + TF-IDF retriever + Transformers reader yields a prediction."""
    test_docs = [
        {"name": f"testing the finder {i}",
         "text": f"testing the finder with pyhton unit test {i}",
         "meta": {"test": "test"}}
        for i in (1, 2, 3)
    ]
    document_store = SQLDocumentStore(url="sqlite:///qa_test.db")
    document_store.write_documents(test_docs)
    retriever = TfidfRetriever(document_store=document_store)
    reader = TransformersReader(
        model="distilbert-base-uncased-distilled-squad",
        tokenizer="distilbert-base-uncased",
        use_gpu=-1)
    finder = Finder(reader, retriever)
    assert finder.get_answers(question="testing finder",
                              top_k_retriever=10,
                              top_k_reader=5) is not None
def test_faq_retriever_in_memory_store():
    """FAQ-style retrieval: the single matching question must be the one answer returned."""
    from haystack.database.memory import InMemoryDocumentStore
    from haystack.retriever.dense import EmbeddingRetriever

    document_store = InMemoryDocumentStore(embedding_field="embedding")
    # One relevant FAQ entry plus ten identical distractors.
    docs = [{'text': 'By running tox in the command line!',
             'meta': {'name': 'How to test this library?',
                      'question': 'How to test this library?'}}]
    docs += [{'text': 'By running tox in the command line!',
              'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}}
             for _ in range(10)]

    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model="deepset/sentence_bert",
                                   gpu=False)
    # Embed each entry's question before writing it to the store.
    for doc in docs:
        doc['embedding'] = retriever.embed([doc['meta']['question']])[0]
    document_store.write_documents(docs)

    finder = Finder(reader=None, retriever=retriever)
    prediction = finder.get_answers_via_similar_questions(question="How to test this?",
                                                          top_k_retriever=1)
    assert len(prediction.get('answers', [])) == 1
def load(self):
    """
    Lazily build both QA pipelines on first use:
      * finder2 — FAQ matching (FAISS store + embedding retriever, no reader)
      * finder  — extractive QA (SQL store + TF-IDF retriever + FARM reader)

    Returns early once both finders exist; otherwise fills in only the
    missing components, so repeated calls are cheap.
    """
    if(self.finder and self.finder2):
        return
    if(not self.document_store2):
        # FAISS index is persisted under 'faiss2'; the SQL side lives at sqlUrlFAQ.
        self.document_store2 = FAISSDocumentStore.load(
            sql_url=sqlUrlFAQ, faiss_file_path='faiss2')
        # save before load in preprocess
        self.initSql(url=sqlUrlFAQ, document_store=self.document_store2)
    # else:  # reset session
    #     # self.document_store2.session.close()
    #     super(
    #         FAISSDocumentStore, self.document_store2).__init__(url=sqlUrlFAQ)
    if(not self.retriever2):
        # Locally saved sentence-BERT model; CPU only.
        self.retriever2 = EmbeddingRetriever(document_store=self.document_store2,
                                             embedding_model="sentence_bert-saved", use_gpu=False)
    if(not self.finder2):
        # FAQ pipeline answers via similar questions, hence no reader.
        self.finder2 = Finder(reader=None, retriever=self.retriever2)
    if(not self.document_store):
        self.document_store = SQLDocumentStore(url=sqlUrl)  # FAISSDocumentStore.load(faiss_file_path='faiss1', sql_url=sqlUrl)
        self.initSql(url=sqlUrl, document_store=self.document_store)
    # else:  # reset session
    #     # self.document_store.session.close()
    #     super(
    #         FAISSDocumentStore, self.document_store).__init__(url=sqlUrl)
    # self.retriever = EmbeddingRetriever(  # reduce load by sharing the same retriever and set store on fly??
    #     document_store=self.document_store, embedding_model="sentence_bert-saved", use_gpu=False) if not self.retriever else self.retriever
    if(not self.retriever):
        self.retriever = TfidfRetriever(document_store=self.document_store)
    # no_ans_boost=0 keeps the reader neutral about "no answer" predictions.
    self.reader = FARMReader(model_name_or_path=modelDir,
                             use_gpu=False, no_ans_boost=0) if not self.reader else self.reader
    # reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)
    self.finder = Finder(
        self.reader, self.retriever) if not self.finder else self.finder
def test_finder_get_answers_single_result(reader, retriever_with_docs, document_store_with_docs):
    """With top_k == 1 for both stages, exactly one answer comes back."""
    finder = Finder(reader, retriever_with_docs)
    result = finder.get_answers(question="testing finder", top_k_retriever=1, top_k_reader=1)
    assert result is not None
    assert len(result["answers"]) == 1
def test_finder_get_answers_with_in_memory_store():
    """Smoke test for the full QA pipeline backed by the in-memory document store."""
    test_docs = [
        {"name": f"testing the finder {i}",
         "text": f"testing the finder with pyhton unit test {i}",
         'meta': {'url': 'url'}}
        for i in (1, 2, 3)
    ]
    from haystack.database.memory import InMemoryDocumentStore
    document_store = InMemoryDocumentStore()
    document_store.write_documents(test_docs)
    retriever = TfidfRetriever(document_store=document_store)
    reader = TransformersReader(
        model="distilbert-base-uncased-distilled-squad",
        tokenizer="distilbert-base-uncased",
        use_gpu=-1)
    finder = Finder(reader, retriever)
    assert finder.get_answers(question="testing finder",
                              top_k_retriever=10,
                              top_k_reader=5) is not None
def __init__(self):
    """Assemble the Finder: Transformers reader + ES retriever over a non-refreshing store."""
    # Reader is built first to keep the original construction order.
    reader = Reader(model_name_or_path=MODEL_PATH, tokenizer=MODEL_PATH, use_gpu=0)
    document_store = ElasticsearchDocumentStore(refresh_type='false')
    retriever = ElasticsearchRetriever(document_store=document_store)
    self.finder = Finder(reader=reader, retriever=retriever)
def main():
    """Index the Securplus transcripts into Elasticsearch and run a sample dense-retrieval query."""
    POPULATE_DOCUMENT_STORE = True

    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="", password="",
                                                index="document",
                                                text_field="text",
                                                embedding_field="question_emb",
                                                embedding_dim="768",
                                                excluded_meta_data=["question_emb"])
    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model=os.getcwd() + "\\kbQA\\bert-german-model",
        gpu=True, model_format="transformers")

    if POPULATE_DOCUMENT_STORE:
        doc_dir = os.getcwd() + "\\kbQA\\data\\Skripte\\Securplus\\txt"
        dicts = convert_files_to_dicts(
            dir_path=doc_dir, clean_func=clean_text, split_paragraphs=True)

        # Dump all raw paragraph texts for inspection.
        with open("Output.txt", "w") as text_file:
            text_file.write("".join("\n" + doc["text"] for doc in dicts))

        df = pd.DataFrame.from_dict(dicts)
        # NOTE: despite the column names, we embed the document *texts* (the
        # answers) rather than real questions — the tutorial this is based on
        # assumed a FAQ with predefined questions. "question" is filled with
        # dummy running numbers only because the FAQ-style search raises
        # exceptions when that field is missing.
        # TODO: consider sentence segmentation — the longer the text, the less
        # precise its embedding; per-sentence embeddings are much more exact.
        texts = list(df["text"].values)
        df["question_emb"] = retriever.create_embedding(texts=texts)
        df["question"] = [f"{no}" for no in range(1, len(texts) + 1)]
        print(df.head())
        document_store.write_documents(df.to_dict(orient="records"))

    # question = "Wie viele haben Angst um ihren Job?"
    # Lower-casing is mandatory for this model setup.
    question = "welche leistungen sind ausgeschlossen?".lower()
    # No reader here: it would require QA fine-tuning, and get_answers() is
    # unusable without one — the retriever alone ranks the best hits.
    finder = Finder(reader=None, retriever=retriever)
    prediction = finder.get_answers_via_similar_questions(
        question, top_k_retriever=5)
    print_answers(prediction, details="all")
def test_finder_get_answers_single_result(reader, document_store_with_docs):
    """TF-IDF variant: top_k == 1 everywhere must produce exactly one answer."""
    finder = Finder(reader, TfidfRetriever(document_store=document_store_with_docs))
    result = finder.get_answers(question="testing finder", top_k_retriever=1, top_k_reader=1)
    assert result is not None
    assert len(result["answers"]) == 1
def test_finding(document_store, retriever):
    """Similar-question lookup over freshly written FAQ documents returns one answer."""
    document_store.write_documents(DOCUMENTS)
    result = Finder(reader=None, retriever=retriever).get_answers_via_similar_questions(
        question="How to test this?", top_k_retriever=1)
    assert len(result.get('answers', [])) == 1
def test_finder_offsets(reader, retriever_with_docs, document_store_with_docs):
    """Answer offsets must point at the answer span inside its context."""
    finder = Finder(reader, retriever_with_docs)
    prediction = finder.get_answers(question="Who lives in Berlin?",
                                    top_k_retriever=10, top_k_reader=5)
    top = prediction["answers"][0]
    assert top["offset_start"] == 11
    assert top["offset_end"] == 16
    # Slicing the context with the reported offsets must reproduce the answer.
    assert top["context"][top["offset_start"]:top["offset_end"]] == top["answer"]
def __init__(self):
    """Wire up the ES-backed QA pipeline: store -> retriever -> reader -> finder."""
    store = ElasticsearchDocumentStore(host="localhost", username="",
                                       password="", index="document")
    self.document_store = store
    self.retriever = ElasticsearchRetriever(document_store=store)
    self.reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                             use_gpu=False)
    self.finder = Finder(self.reader, self.retriever)
    print('Ready')
def test_finder_offsets(reader, document_store_with_docs):
    """Check the start offset of the top answer (end-offset check currently disabled)."""
    finder = Finder(reader, TfidfRetriever(document_store=document_store_with_docs))
    prediction = finder.get_answers(question="Who lives in Berlin?",
                                    top_k_retriever=10, top_k_reader=5)
    best = prediction["answers"][0]
    assert best["offset_start"] == 11
    # TODO enable again when FARM is upgraded incl. the new offset calc
    # assert best["offset_end"] == 16
    start, end = best["offset_start"], best["offset_end"]
def test_faiss_finding(document_store):
    """FAQ lookup through a FAISS-backed store returns a single best match."""
    document_store.write_documents(DOCUMENTS)
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model="deepset/sentence_bert",
                                   use_gpu=False)
    result = Finder(reader=None, retriever=retriever).get_answers_via_similar_questions(
        question="How to test this?", top_k_retriever=1)
    assert len(result.get('answers', [])) == 1
def test_finder_get_answers(reader, retriever_with_docs, document_store_with_docs):
    """End-to-end check of answer content, probability bounds, metadata and count."""
    finder = Finder(reader, retriever_with_docs)
    prediction = finder.get_answers(question="Who lives in Berlin?",
                                    top_k_retriever=10, top_k_reader=3)
    assert prediction is not None
    assert prediction["question"] == "Who lives in Berlin?"
    top = prediction["answers"][0]
    assert top["answer"] == "Carla"
    assert 0 <= top["probability"] <= 1
    assert top["meta"]["meta_field"] == "test1"
    assert top["context"] == "My name is Carla and I live in Berlin"
    assert len(prediction["answers"]) == 3
def get_results(txt_files_location, use_gpu, questions_list, results_location):
    """
    For each txt file found under ``txt_files_location``, rebuild the
    'document' Elasticsearch index from that file's directory, run every
    question in ``questions_list`` through a retriever+reader pipeline and
    append the answers to ``<file>_results.txt`` in ``results_location``.

    Parameters
    ----------
    txt_files_location : str
        Root directory containing subdirectories of txt files.
    use_gpu : bool
        Forwarded to FARMReader.
    questions_list : list[str]
        Questions to ask against each file's corpus.
    results_location : str
        Directory where the per-file result files are written.
    """
    from contextlib import redirect_stdout

    document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                                password="", index="document")
    for dirpath, dirnames, files in os.walk(txt_files_location):
        for dirname in dirnames:
            for dirpath, dirname, files in os.walk(
                    os.path.join(txt_files_location, dirname)):
                for file_name in files:
                    # Start from a clean index for each directory's documents.
                    document_store.client.indices.delete(index='document',
                                                         ignore=[400, 404])
                    dicts = convert_files_to_dicts(dir_path=dirpath,
                                                   clean_func=clean_wiki_text,
                                                   split_paragraphs=True)
                    document_store.write_documents(dicts)
                    retriever = ElasticsearchRetriever(document_store=document_store)
                    reader = FARMReader(
                        model_name_or_path="elgeish/cs224n-squad2.0-albert-xxlarge-v1",
                        use_gpu=use_gpu)
                    finder = Finder(reader, retriever)
                    out_path = os.path.join(results_location,
                                            file_name[:-4] + "_results.txt")
                    # BUGFIX: the original reassigned sys.stdout and then closed
                    # it, leaving the process without a usable stdout after the
                    # first file. redirect_stdout restores stdout on exit and the
                    # with-block closes the file deterministically.
                    with open(out_path, "a+") as out_file, redirect_stdout(out_file):
                        for i, question in enumerate(questions_list):
                            prediction = finder.get_answers(question=question,
                                                            top_k_retriever=10,
                                                            top_k_reader=1)
                            print("\n\n\nQuestion " + str(i + 1) + ":\n")
                            print(question + "\n")
                            print_answers(prediction, details="minimal")
    document_store.client.transport.close()
def init():
    ### Model values for Reader and Document Store
    """Initialise the module-level QA components: document store, retriever,
    COVID-tuned RoBERTa reader and the Finder tying them together."""
    global document_store, retriever, reader, finder
    # Local Elasticsearch, default credentials, 'document' index.
    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
    retriever = ElasticsearchRetriever(document_store=document_store)
    # CPU-only reader using the COVID fine-tuned squad2 model.
    reader = FARMReader(model_name_or_path='deepset/roberta-base-squad2-covid', use_gpu=False)
    finder = Finder(reader, retriever)
def setPrediction(self, reader, retriever, job):
    """
    Run every question in ``job.task_params['questions']`` through a Finder
    built from ``reader`` and ``retriever`` and return the list of predictions.

    Side effect: fills in default ``top_k_retriever``/``top_k_reader`` values
    in ``job.task_params`` when the caller did not provide them.
    """
    finder = Finder(reader, retriever)
    # Idiomatic defaulting (same observable effect as the membership checks).
    job.task_params.setdefault('top_k_retriever', 10)
    job.task_params.setdefault('top_k_reader', 5)

    # NOTE(review): this filter construction is dead code — es_query_body is
    # immediately overwritten and get_answers() is called with filters=None.
    # buildFilter() is still invoked in case it has side effects; confirm and
    # remove the whole section if it has none.
    es_query_body = self.buildFilter(job)
    del es_query_body['from']
    del es_query_body['size']
    es_query_body = {
        'external_source_id': ['ea13ebc0-18bf-4dfe-8750-61641fdbb00b']
    }

    results = []
    for question in job.task_params['questions']:
        prediction = finder.get_answers(
            question=question,
            top_k_retriever=job.task_params['top_k_retriever'],
            top_k_reader=job.task_params['top_k_reader'],
            filters=None)  # es_query_body['query']['bool']
        results.append(prediction)
    print('INFO:', results, flush=True)
    return results
class QAPipeline:
    """End-to-end extractive QA: Elasticsearch storage/retrieval + FARM reader."""

    def __init__(self):
        """Connect to local Elasticsearch and build the retriever/reader/finder stack."""
        self.document_store = ElasticsearchDocumentStore(host="localhost",
                                                         username="",
                                                         password="",
                                                         index="document")
        self.retriever = ElasticsearchRetriever(document_store=self.document_store)
        self.reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                                 use_gpu=False)
        self.finder = Finder(self.reader, self.retriever)
        print('Ready')

    def add_to_datastore_from_remote(self, data_url):
        """Remote ingestion is not supported yet."""
        return {'status': 'Not Implemented'}

    def add_to_datastore_local(self, data_path):
        """Read a local JSON file, convert it to ES records and index them."""
        records = create_data_dicts(read_json_data(data_path))
        self.document_store.write_documents(records)
        return {'status': 'Added To Datastore'}

    def answer(self, question, top_k_options=10, top_k_answers=3):
        """Answer ``question`` and return the post-processed prediction info."""
        raw = self.finder.get_answers(question=question,
                                      top_k_retriever=top_k_options,
                                      top_k_reader=top_k_answers)
        return extract_info_from_predictions(raw)
def initFinder():
    """
    Function to initiate retriever, reader and finder

    Returns
    -------
    finder (object): Haystack finder
    """
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=False,
        embed_title=True,
        max_seq_len=256,
        batch_size=16,
        remove_sep_tok_from_untitled_passages=True)
    # Refresh the stored document embeddings so they match the freshly
    # initialised DPR encoders. This can be slow for a large corpus but only
    # needs to run once; at query time only the question itself is embedded,
    # which is fast.
    document_store.update_embeddings(retriever)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=False)
    return Finder(reader, retriever)
def set_finder(user_id_key):
    """Build a Finder for the given user from their stored model and GPU settings."""
    # Paths of the models hosted in Hugging Face, keyed by model family;
    # anything unrecognised falls back to the French CamemBERT model.
    known_models = {
        "roberta": "deepset/roberta-base-squad2",
        "bert": "deepset/bert-large-uncased-whole-word-masking-squad2",
        "distilbert": "distilbert-base-uncased-distilled-squad",
    }
    model_path = known_models.get(user_settings[user_id_key]["model"],
                                  "illuin/camembert-base-fquad")

    retriever = ElasticsearchRetriever(document_store=user_doc_store[user_id_key])

    reader = None
    if user_settings[user_id_key]["gpu"] == "on":
        try:
            reader = TransformersReader(
                model_name_or_path=model_path, tokenizer=model_path, use_gpu=0
            )
        except Exception as e:
            print(e)
            print("GPU not available. Inferencing on CPU")
    if reader is None:
        # CPU path: either GPU was off or GPU initialisation failed above.
        reader = TransformersReader(
            model_name_or_path=model_path, tokenizer=model_path, use_gpu=-1
        )
    return Finder(reader, retriever)
def feed_documents_to_model(model_name="deepset/roberta-base-squad2-covid"):
    """Feeds documents to model and returns a model ready to make predictions

    Parameters
    ----------
    model_name : str
        HuggingFace model path; defaults to roBERTa pretrained on squad2
        and covid articles.

    Returns
    -------
    finder
        the model to use for predictions
    """
    # In-memory store: no external service required.
    document_store = InMemoryDocumentStore()
    # Load the article corpus and convert it into Haystack document dicts.
    articles = ret.get_data(MANIFEST, ARTICLES_FOLDER, [])
    document_store.write_documents(process_documents(articles))
    # TfidfRetriever narrows the candidate set; it is the fastest option for
    # development purposes.
    retriever = TfidfRetriever(document_store=document_store)
    # Reader wraps the pre-trained transformer model.
    reader = FARMReader(model_name_or_path=model_name, use_gpu=False)
    return Finder(reader, retriever)
def qna():
    """Return the n answers."""
    question = request.form['question']
    index = request.form['index']   # target document index for the query
    mode = request.form['mode']     # 'trained' selects the fine-tuned model

    document_store = ElasticsearchDocumentStore(
        host=app.config["host"],
        username=app.config["username"],
        password=app.config["password"],
        index=index)

    if mode == 'trained':
        model_path = app.config["train_model"]
    else:
        model_path = "distilbert-base-uncased-distilled-squad"
    reader = FARMReader(model_name_or_path=model_path, use_gpu=False)

    retriever = ElasticsearchRetriever(document_store=document_store)
    # The Finder glues reader and retriever into one question-answering pipeline.
    finder = Finder(reader, retriever)

    n = int(request.form['n'])
    prediction = finder.get_answers(question=question,
                                    top_k_retriever=10,
                                    top_k_reader=n)
    answer = [res['answer'] for res in prediction['answers']]
    return json.dumps({
        'status': 'success',
        'message': 'Process succesfully',
        'result': answer
    })
def qa_with_dense_retrieval(question: str) -> List[dict]:
    """Answer ``question`` with the dense retriever + reader; keep only
    positive-score answers, each enriched with its source paragraph text."""
    finder = Finder(reader, dense_retriever)
    prediction = finder.get_answers(question=question,
                                    top_k_retriever=10, top_k_reader=5)
    # Map paragraph id -> full text so each answer can carry its paragraph.
    paras = {para.id: para.text for para in dense_retriever.retrieve(question)}
    return [
        {'answer': r['answer'],
         'context': r['context'],
         'para': paras[r['document_id']]}
        for r in prediction['answers']
        if r['score'] > 0
    ]
def qa(self, question, text_field):
    """Run a one-off extractive QA query against the instance's Elasticsearch index."""
    store = ElasticsearchDocumentStore(host=ES_HOST,
                                       username=ES_USERNAME,
                                       password=ES_PASSWORD,
                                       index=self.ELASTIC_INDEX,
                                       text_field=text_field)
    retriever = TfidfRetriever(document_store=store)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=False)
    finder = Finder(reader, retriever)
    return finder.get_answers(question=question,
                              top_k_retriever=1, top_k_reader=5)
def get_test_client_and_override_dependencies(reader, document_store_with_docs):
    """Return a TestClient whose search module is rewired onto the test fixtures."""
    from rest_api.application import app
    from rest_api.controller import search

    search.document_store = document_store_with_docs
    search.retriever = ElasticsearchRetriever(document_store=document_store_with_docs)
    # Single finder registered under id 1, built from the overridden retriever.
    search.FINDERS = {1: Finder(reader=reader, retriever=search.retriever)}
    return TestClient(app)
class QAPipeLine:
    """Per-paper extractive QA: an ES-backed Finder queried with a name filter."""

    def __init__(self):
        """Build the Finder from the configured reader model and a non-refreshing ES store."""
        # Reader is built first to keep the original construction order.
        reader = Reader(model_name_or_path=MODEL_PATH,
                        tokenizer=MODEL_PATH,
                        use_gpu=0)
        document_store = ElasticsearchDocumentStore(refresh_type='false')
        retriever = ElasticsearchRetriever(document_store=document_store)
        self.finder = Finder(reader=reader, retriever=retriever)

    def __call__(self, paper_id: str, question: str):
        """Answer ``question`` using only the paper identified by ``paper_id``."""
        # The 'name' filter restricts retrieval to the single paper.
        return self.finder.get_answers(question=question,
                                       top_k_reader=5,
                                       filters={'name': [paper_id]})
def __init__(self, id, add_sample_data=False):
    """FAQ-style model: embedding retriever over the ES index named after ``id``."""
    Model.__init__(self, id)
    doc_store = ElasticsearchDocumentStore(
        host=DB_HOST, port=DB_PORT, index=self.id,
        embedding_field="question_emb", embedding_dim=768,
        excluded_meta_data=["question_emb"])
    retriever = EmbeddingRetriever(document_store=doc_store,
                                   embedding_model="deepset/sentence_bert",
                                   use_gpu=False)
    # FAQ matching needs no reader — answers come from similar stored questions.
    self.finder = Finder(reader=None, retriever=retriever)
    if add_sample_data:
        add_sample_data_faq_qa(self)
def __init__(self, id, add_sample_data=False):
    """Extractive-QA model: ES retriever + FARM reader over the index named after ``id``."""
    Model.__init__(self, id)
    doc_store = ElasticsearchDocumentStore(host=DB_HOST, port=DB_PORT,
                                           index=self.id)
    retriever = ElasticsearchRetriever(document_store=doc_store)
    reader = FARMReader(
        model_name_or_path=READER_MODEL_PATH,
        batch_size=BATCHSIZE,
        use_gpu=False,
        num_processes=MAX_PROCESSES,
    )
    self.finder = Finder(reader, retriever)
    if add_sample_data:
        add_sample_data_doc_qa(self)
    # Persist the (possibly freshly downloaded) reader model locally.
    reader.save(directory=READER_MODEL_PATH)
    print("saved")
def create_app(config_name):
    """
    Flask application factory: configure the app, build the QA Finder and
    attach it as ``app.finder``, then register the main blueprint.

    Parameters
    ----------
    config_name : str
        Key into the module-level ``config`` mapping.

    Returns
    -------
    Flask
        The configured application.
    """
    app = Flask(__name__)
    cfg = config[config_name]
    app.config.from_object(cfg)

    # NOTE(review): cfg.ELASTIC_PORT was read by the original code but never
    # forwarded to the document store (the client connects on its default
    # port). Confirm whether the port should be passed here; the unused local
    # has been removed.
    doc_store = ElasticsearchDocumentStore(host=cfg.ELASTIC_URL,
                                           username='',
                                           password='',
                                           index=cfg.ELASTIC_INDEX)
    retriever = ElasticsearchRetriever(document_store=doc_store)
    # num_processes=0 keeps FARM inference in-process (no multiprocessing pool).
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        num_processes=0, use_gpu=False)
    app.finder = Finder(reader, retriever)

    from app.main import main as main_blueprint
    app.register_blueprint(main_blueprint)
    return app
model_name_or_path=str(READER_MODEL_PATH), batch_size=BATCHSIZE, use_gpu=USE_GPU, context_window_size=CONTEXT_WINDOW_SIZE, top_k_per_candidate=TOP_K_PER_CANDIDATE, no_ans_boost=NO_ANS_BOOST, max_processes=MAX_PROCESSES, max_seq_len=MAX_SEQ_LEN, doc_stride=DOC_STRIDE, ) else: # don't need one for pure FAQ matching reader = None FINDERS = { 1: Finder(reader=reader, retriever=retriever), 2: Finder(reader=reader, retriever=english_retriever) } ############################################# # Basic data schema for request & response ############################################# class Query(BaseModel): questions: List[str] filters: Dict[str, Optional[str]] = None top_k_reader: int = DEFAULT_TOP_K_READER top_k_retriever: int = DEFAULT_TOP_K_RETRIEVER class Answer(BaseModel):