def main():
    POPULATE_DOCUMENT_STORE = True
    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="",
                                                index="document",
                                                text_field="text",
                                                embedding_field="question_emb",
                                                embedding_dim=768,
                                                excluded_meta_data=["question_emb"])
    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model=os.getcwd() + "\\kbQA\\bert-german-model",
        gpu=True,
        model_format="transformers")
    if POPULATE_DOCUMENT_STORE:
        doc_dir = os.getcwd() + "\\kbQA\\data\\Skripte\\Securplus\\txt"
        dicts = convert_files_to_dicts(
            dir_path=doc_dir, clean_func=clean_text, split_paragraphs=True)
        with open("Output.txt", "w") as text_file:
            text = ""
            for doc in dicts:
                text = text + "\n" + doc["text"]
            text_file.write(text)
        df = pd.DataFrame.from_dict(dicts)
        # Careful here! At this point we do not create embeddings for the questions but for
        # the texts, i.e. the answers. The variable names are therefore somewhat confusing.
        # dummy_questions is just an increasing number starting at one. It is needed because
        # otherwise exceptions are thrown during search.
        # The tutorial seems to assume an FAQ setting in which question and answer are both
        # predefined, so embeddings can be created for the predefined questions and the k best
        # candidates are returned based on those alone. We, in contrast, create embeddings for
        # every single text.
        # TODO: Since we create embeddings for each text, we may have to perform sentence
        # segmentation, because the longer the texts get, the less precise the embeddings
        # become. Per-sentence embeddings are considerably more exact.
        questions = list(df["text"].values)
        df["question_emb"] = retriever.create_embedding(texts=questions)
        dummy_questions = [f"{no}" for no, _ in enumerate(questions, start=1)]
        df["question"] = dummy_questions
        print(df.head())
        docs_to_index = df.to_dict(orient="records")
        document_store.write_documents(docs_to_index)

    # question = "Wie viele haben Angst um ihren Job?"
    question = "welche leistungen sind ausgeschlossen?"
    # here as well: lowercasing is strictly required!
    question = question.lower()
    # We currently cannot use a reader, since readers apparently require QA fine-tuning.
    # The retriever fetches the best hits based on the embeddings.
    # get_answers() is not usable without a reader.
    finder = Finder(reader=None, retriever=retriever)
    prediction = finder.get_answers_via_similar_questions(
        question, top_k_retriever=5)
    print_answers(prediction, details="all")
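The TODO above suggests embedding individual sentences rather than whole paragraphs. Below is a minimal sketch of that segmentation step, assuming NLTK and its German "punkt" data are installed; split_into_sentence_docs is a hypothetical helper, not part of the original script.

from nltk.tokenize import sent_tokenize

def split_into_sentence_docs(dicts):
    # Split each converted document into sentences so that create_embedding()
    # works on short, precise units instead of whole paragraphs (see TODO above).
    sentence_docs = []
    for doc in dicts:
        for sentence in sent_tokenize(doc["text"], language="german"):
            # carry the meta data along so the source file can still be traced
            sentence_docs.append({"text": sentence, "meta": doc.get("meta", {})})
    return sentence_docs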
def test_sql_write_read():
    sql_document_store = SQLDocumentStore()
    documents = convert_files_to_dicts(dir_path="samples/docs")
    sql_document_store.write_documents(documents)
    documents = sql_document_store.get_all_documents()
    assert len(documents) == 2
    doc = sql_document_store.get_document_by_id("1")
    assert doc.id
    assert doc.text
def test_elasticsearch_write_read(elasticsearch_fixture):
    document_store = ElasticsearchDocumentStore()
    documents = convert_files_to_dicts(dir_path="samples/docs")
    document_store.write_documents(documents)
    sleep(2)  # wait for documents to be available for query
    documents = document_store.get_all_documents()
    assert len(documents) == 2
    assert documents[0].id
    assert documents[0].text
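The elasticsearch_fixture parameter is supplied elsewhere in the test suite. A minimal sketch of how such a pytest fixture could look, assuming Docker is available; the container name, image tag, and wait time are assumptions mirroring the launch code later in this section, not the original fixture.

import subprocess
import time

import pytest

@pytest.fixture()
def elasticsearch_fixture():
    # Launch a single-node Elasticsearch container for the test and give it time to boot.
    subprocess.run(
        'docker run -d -p 9200:9200 -e "discovery.type=single-node" '
        '--name "test_elasticsearch" elasticsearch:7.6.2',
        shell=True, check=False)
    time.sleep(15)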
def get_results(txt_files_location, use_gpu, questions_list, results_location):
    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="",
                                                index="document")
    for dirpath, dirnames, files in os.walk(txt_files_location):
        for dirname in dirnames:
            for dirpath, dirname, files in os.walk(
                    os.path.join(txt_files_location, dirname)):
                for file_name in files:
                    # drop any previous index so each file is evaluated in isolation
                    document_store.client.indices.delete(index='document', ignore=[400, 404])
                    doc_dir = dirpath
                    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                                   clean_func=clean_wiki_text,
                                                   split_paragraphs=True)
                    document_store.write_documents(dicts)
                    retriever = ElasticsearchRetriever(document_store=document_store)
                    reader = FARMReader(
                        model_name_or_path="elgeish/cs224n-squad2.0-albert-xxlarge-v1",
                        use_gpu=use_gpu)
                    finder = Finder(reader, retriever)
                    # redirect stdout to the per-file results file
                    sys.stdout = open(
                        os.path.join(results_location, file_name[:-4] + "_results.txt"),
                        "a+")
                    for i, question in enumerate(questions_list):
                        prediction = finder.get_answers(question=question,
                                                        top_k_retriever=10,
                                                        top_k_reader=1)
                        print("\n\n\nQuestion " + str(i + 1) + ":\n")
                        print(question + "\n")
                        print_answers(prediction, details="minimal")
                    sys.stdout.close()
    document_store.client.transport.close()
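A possible invocation of get_results; the paths and questions below are placeholders for illustration, not values from the original code.

if __name__ == "__main__":
    # placeholder question list and directories
    example_questions = [
        "Who is the father of Arya Stark?",
        "Who created the Dothraki vocabulary?",
    ]
    get_results(txt_files_location="data/txt",
                use_gpu=False,
                questions_list=example_questions,
                results_location="results")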
def fetch_data_from_repo(
        doc_dir="resources/data/website_data/",
        s3_url="https://github.com/Thabo-5/Chatbot-scraper/raw/master/txt_files.zip",
        doc_store=FAISSDocumentStore()):
    """
    Function to download data from an S3 bucket / GitHub.

    Parameters
    ----------
    doc_dir (str): path to destination folder
    s3_url (str): path to download zipped data
    doc_store (class): Haystack document store

    Returns
    -------
    document_store (object): Haystack document store object
    """
    document_store = doc_store
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
    import os
    # Re-read and rewrite each file to normalize its encoding to UTF-8.
    for filename in os.listdir(path=doc_dir):
        with open(os.path.join(doc_dir, filename), 'r', encoding='utf-8',
                  errors='replace') as file:
            text = file.read()
        with open(os.path.join(doc_dir, filename), 'w', encoding='utf-8',
                  errors='replace') as file:
            file.write(text)
    # Convert files to dicts
    dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    # Now, let's write the dicts containing documents to our DB.
    document_store.write_documents(dicts)
    return document_store
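A brief usage sketch relying only on the defaults above; get_document_count() is a standard Haystack document store method.

# Download, convert, and index the scraped text files, then report how many documents landed in the store.
document_store = fetch_data_from_repo()
print(f"Indexed {document_store.get_document_count()} documents")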
    excluded_meta_data=["question_emb"])

# Create a Retriever using embeddings
# Instead of retrieving via Elasticsearch's plain BM25, we want to use vector similarity of
# the questions (user question vs. FAQ ones).
# We can use the `EmbeddingRetriever` for this purpose and specify a model that we use for
# the embeddings.
# retriever = EmbeddingRetriever(document_store=document_store,
#                                embedding_model="deepset/sentence_bert", gpu=True)
if POPULATE_DOCUMENT_STORE:
    # set path to the directory containing the text files
    doc_dir = os.getcwd() + "\\kbQA\\data\\article_txt_got"
    # convert files to dicts containing documents that can be indexed into our datastore
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    df = pd.DataFrame.from_dict(dicts)
    # Get embeddings for our questions from the FAQs
    questions = list(df["text"].values)
    df["question_emb"] = retriever.create_embedding(texts=questions)
    # Convert the DataFrame to a list of dicts and index them in our DocumentStore
    docs_to_index = df.to_dict(orient="records")
    # You can optionally supply a cleaning function that is applied to each doc
    # (e.g. to remove footers). It must take a str as input, and return a str.
    # Now, let's write the docs to our DB.
    document_store.write_documents(docs_to_index)
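This snippet stops after indexing. A possible follow-up query step, mirroring the first snippet in this section; it assumes the commented-out EmbeddingRetriever above has been created and bound to retriever, and that Finder and print_answers are imported as in the other snippets.

# Query via embedding similarity; the question string is just an example for the
# Game of Thrones articles indexed above.
finder = Finder(reader=None, retriever=retriever)
prediction = finder.get_answers_via_similar_questions(
    "who is the father of arya stark?", top_k_retriever=5)
print_answers(prediction, details="minimal")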
host="elasticsearch", username="", password="", index="document", embedding_dim=512, embedding_field="embedding") break except: time.sleep(15) retriever = EmbeddingRetriever(document_store=document_store, embedding_model=retriever_model_name_full, model_format=retriever_model_type, gpu=False) if document_store.get_document_count() < 1: dicts = convert_files_to_dicts(dir_path=data_path, clean_func=clean_text, split_paragraphs=True) logging.info("files to dicts done.") # write dicts containing the texts to the database document_store.write_documents(dicts) logging.info("documents to store written.") # generate embeddings for each text and add it to the databse entry document_store.update_embeddings(retriever) logging.info("embeddings to documents in store written.") finder = Finder(retriever=retriever, reader=None) api.run(host='0.0.0.0', port=8000, debug=True)
def main():
    # fetch model files if not present. not hosted in the git repo
    # model_exists = os.path.isfile(
    #     './kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin')
    # if not model_exists:
    #     logging.info("Starting model download (about 700MB) ...")
    #     urllib.request.urlretrieve(
    #         "https://cdn.huggingface.co/mrm8488/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin",
    #         "./kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin")
    #     logging.info("model successfully downloaded")

    # start Elasticsearch
    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run(
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" --name "MLQA2" elasticsearch:7.6.2',
            shell=True)
        if status.returncode:
            raise Exception("Failed to launch Elasticsearch. If you want to "
                            "connect to an existing Elasticsearch instance "
                            "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)

    # 512 dimensions because that is what the sentence transformer returns
    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="",
                                                index="document",
                                                embedding_dim=512,
                                                embedding_field="embedding")
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model=retriever_model_name_full,
                                   model_format=retriever_model_type,
                                   gpu=False)

    # load docs into the database
    if LAUNCH_ELASTICSEARCH or POPULATE_DOCUMENT_STORE:
        dicts = convert_files_to_dicts(
            dir_path=data_path, clean_func=clean_text, split_paragraphs=True)
        logging.info("files to dicts done.")
        # write dicts containing the texts to the database
        document_store.write_documents(dicts)
        logging.info("documents to store written.")
        # generate embeddings for each text and add them to the database entries
        document_store.update_embeddings(retriever)
        logging.info("embeddings to documents in store written.")

    # The reader is not used for retrieval because results take longer and the quality
    # is worse; it would still have to be initialized:
    # reader = TransformersReader(model="./kbQA/" + reader_model_name,
    #                             tokenizer="./kbQA/" + reader_model_name,
    #                             use_gpu=-1)
    finder = Finder(retriever=retriever, reader=None)

    if TEST:
        try:
            # each line has multiple paragraphs and embeddings; read the file line by line
            # (the expected per-line format is sketched after this function)
            with open("./kbQA/Test.json", encoding="utf-8") as file:
                times = []
                results = []
                failed = []
                for line in file:
                    # load the JSON string of the current line as a Python object
                    data = json.loads(line)
                    q = data["question"]
                    # fetch results from the db and measure the retrieval time
                    start_time = time.process_time()
                    candidate_docs = finder.retriever.retrieve(
                        query=q, filters=None, top_k=5)
                    end_time = time.process_time()
                    times.append(end_time - start_time)
                    answered = False
                    for doc in candidate_docs:
                        if data["answer"] in doc.text:
                            answered = True
                            results.append(True)
                            break
                    if not answered:
                        answers = []
                        for doc in candidate_docs:
                            answers.append(doc.text)
                        failed.append(
                            {"q": q, "correct": data["answer"], "a": answers})
                total = 0
                for zeit in times:
                    total = total + zeit
                logging.info("Average time per request: %f", total / len(times))
                logging.info("Questions answered correctly: %d/%d (%f)",
                             len(results), len(times), len(results) / len(times))
                logging.info("Failed questions:")
                for fail in failed:
                    logging.info("Question: %s", fail["q"])
                    logging.info("Correct Answer: %s", fail["correct"])
                    for answer in fail["a"]:
                        logging.info(answer)
        except Exception as e:
            traceback.print_exc()
            logging.error(f"exception: {e}")
    else:
        # loop until a KeyboardInterrupt (Ctrl+C) or "!q" input
        while True:
            try:
                # read input from the console
                q = input("Enter:").strip()
                # input "!q" to stop execution
                if q == "!q":
                    exit(0)
                # fetch results from the db
                candidate_docs = finder.retriever.retrieve(
                    query=q, filters=None, top_k=5)
                for doc in candidate_docs:
                    logging.info("doc id: %s", doc.id)
                    logging.info("doc meta name: %s", doc.meta["name"])
                    logging.info("doc text: %s", doc.text)
                    logging.info("doc query score: %s", doc.query_score)
                    logging.info("")
                # not used
                # prediction = finder.get_answers(
                #     question=q, top_k_retriever=10, top_k_reader=5)
                # print_answers(prediction, details="medium")
            except Exception as e:
                traceback.print_exc()
                logging.error(f"exception: {e}")
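As noted in the TEST branch above, ./kbQA/Test.json is read line by line, with one JSON object per line containing "question" and "answer" keys. A sketch of that format follows; the values are invented placeholders.

import json

# One line of ./kbQA/Test.json as the evaluation loop expects it; the answer text
# here is a made-up placeholder.
example_line = json.dumps(
    {"question": "welche leistungen sind ausgeschlossen?",
     "answer": "Leistungen sind ausgeschlossen, wenn ..."},
    ensure_ascii=False)
print(example_line)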
def write_storage():
    dicts = convert_files_to_dicts(dir_path=doc_dir, split_paragraphs=True)
    document_store.write_documents(dicts)