class QAPipeline:
    def __init__(self):
        self.document_store = ElasticsearchDocumentStore(
            host="localhost", username="", password="", index="document")
        self.retriever = ElasticsearchRetriever(document_store=self.document_store)
        self.reader = FARMReader(
            model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
        self.finder = Finder(self.reader, self.retriever)
        print('Ready')

    def add_to_datastore_from_remote(self, data_url):
        return {'status': 'Not Implemented'}

    def add_to_datastore_local(self, data_path):
        json_data = read_json_data(data_path)
        es_data = create_data_dicts(json_data)
        self.document_store.write_documents(es_data)
        return {'status': 'Added To Datastore'}

    def answer(self, question, top_k_options=10, top_k_answers=3):
        prediction = self.finder.get_answers(question=question,
                                             top_k_retriever=top_k_options,
                                             top_k_reader=top_k_answers)
        results = extract_info_from_predictions(prediction)
        return results
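# A minimal usage sketch for the QAPipeline class above (not part of the original
# code). The data path "data/faq.json" is a hypothetical placeholder; it must match
# whatever layout read_json_data() and create_data_dicts() expect.
pipeline = QAPipeline()
pipeline.add_to_datastore_local("data/faq.json")
results = pipeline.answer("What is covered by the warranty?",
                          top_k_options=10, top_k_answers=3)
print(results)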
def main():
    POPULATE_DOCUMENT_STORE = True
    document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                                password="", index="document",
                                                text_field="text",
                                                embedding_field="question_emb",
                                                embedding_dim=768,
                                                excluded_meta_data=["question_emb"])
    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model=os.getcwd() + "\\kbQA\\bert-german-model",
        gpu=True,
        model_format="transformers")

    if POPULATE_DOCUMENT_STORE:
        doc_dir = os.getcwd() + "\\kbQA\\data\\Skripte\\Securplus\\txt"
        dicts = convert_files_to_dicts(
            dir_path=doc_dir, clean_func=clean_text, split_paragraphs=True)
        with open("Output.txt", "w") as text_file:
            text = ""
            for doc in dicts:
                text = text + "\n" + doc["text"]
            text_file.write(text)
        df = pd.DataFrame.from_dict(dicts)
        # Careful here! At this point we do not create embeddings for the questions
        # but for the texts, i.e. the answers, so the variable names are somewhat
        # misleading.
        # dummy_questions is simply an increasing number starting at one; it is
        # required because the search otherwise throws exceptions.
        # The tutorial apparently assumes an FAQ setting where question and answer
        # are both predefined, so embeddings can be created for the predefined
        # questions and the k best candidates are returned based on those alone.
        # We, in contrast, create embeddings for every single text.
        # TODO: since we create embeddings for whole texts, we may need to run
        # sentence segmentation first; the longer the texts get, the less accurate
        # the embeddings become. Per-sentence embeddings are considerably more precise.
        questions = list(df["text"].values)
        df["question_emb"] = retriever.create_embedding(texts=questions)
        dummy_questions = [f"{no}" for no, x in enumerate(questions, start=1)]
        df["question"] = dummy_questions
        print(df.head())
        docs_to_index = df.to_dict(orient="records")
        document_store.write_documents(docs_to_index)

    # question = "Wie viele haben Angst um ihren Job?"
    question = "welche leistungen sind ausgeschlossen?"
    # again: lowercasing is strictly required here!
    question = question.lower()

    # We currently cannot use a Reader, because readers apparently require QA
    # fine-tuning. The retriever fetches the best hits based on the embeddings.
    # get_answers() cannot be used without a reader.
    finder = Finder(reader=None, retriever=retriever)
    prediction = finder.get_answers_via_similar_questions(
        question, top_k_retriever=5)
    print_answers(prediction, details="all")
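# Sketch for the TODO above: split each text into sentences before embedding, so that
# one (shorter, more precise) embedding is created per sentence. This helper is an
# assumption, not part of the original script; it relies on nltk's sent_tokenize,
# which requires the "punkt" model to be downloaded.
import nltk

def split_dicts_into_sentences(dicts, language="german"):
    # One output dict per sentence, keeping all other fields of the source dict.
    sentence_dicts = []
    for doc in dicts:
        for sentence in nltk.sent_tokenize(doc["text"], language=language):
            sentence_dicts.append({**doc, "text": sentence})
    return sentence_dicts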
def test_elasticsearch_write_read(elasticsearch_fixture):
    document_store = ElasticsearchDocumentStore()
    documents = convert_files_to_dicts(dir_path="samples/docs")
    document_store.write_documents(documents)
    sleep(2)  # wait for documents to be available for query
    documents = document_store.get_all_documents()
    assert len(documents) == 2
    assert documents[0].id
    assert documents[0].text
def get_results(txt_files_location, use_gpu, questions_list, results_location):
    document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                                password="", index="document")
    for dirpath, dirnames, files in os.walk(txt_files_location):
        for dirname in dirnames:
            for dirpath, dirname, files in os.walk(
                    os.path.join(txt_files_location, dirname)):
                for file_name in files:
                    document_store.client.indices.delete(index='document',
                                                         ignore=[400, 404])
                    doc_dir = dirpath
                    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                                   clean_func=clean_wiki_text,
                                                   split_paragraphs=True)
                    document_store.write_documents(dicts)
                    retriever = ElasticsearchRetriever(document_store=document_store)
                    reader = FARMReader(
                        model_name_or_path="elgeish/cs224n-squad2.0-albert-xxlarge-v1",
                        use_gpu=use_gpu)
                    finder = Finder(reader, retriever)
                    # redirect stdout to a per-file results file
                    sys.stdout = open(
                        os.path.join(results_location,
                                     file_name[:-4] + "_results.txt"), "a+")
                    for i, question in enumerate(questions_list):
                        prediction = finder.get_answers(question=question,
                                                        top_k_retriever=10,
                                                        top_k_reader=1)
                        print("\n\n\nQuestion " + str(i + 1) + ":\n")
                        print(question + "\n")
                        print_answers(prediction, details="minimal")
                    sys.stdout.close()
    document_store.client.transport.close()
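# Hypothetical invocation of get_results() above; the directory names and questions
# are placeholders, not taken from the original code.
questions = ["Who wrote the report?", "When was the system deployed?"]
get_results(txt_files_location="data/txt_files", use_gpu=False,
            questions_list=questions, results_location="results")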
def test_elasticsearch_custom_fields(elasticsearch_fixture):
    client = Elasticsearch()
    client.indices.delete(index='haystack_test_custom', ignore=[404])
    document_store = ElasticsearchDocumentStore(
        index="haystack_test_custom",
        text_field="custom_text_field",
        embedding_field="custom_embedding_field")
    doc_to_write = {
        "custom_text_field": "test",
        "custom_embedding_field": np.random.rand(768).astype(np.float32)
    }
    document_store.write_documents([doc_to_write])
    documents = document_store.get_all_documents()
    assert len(documents) == 1
    assert documents[0].text == "test"
    np.testing.assert_array_equal(doc_to_write["custom_embedding_field"],
                                  documents[0].embedding)
dicts = convert_files_to_dicts(dir_path=doc_dir,
                               clean_func=clean_wiki_text,
                               split_paragraphs=True)
df = pd.DataFrame.from_dict(dicts)

# Get embeddings for our questions from the FAQs
questions = list(df["text"].values)
df["question_emb"] = retriever.create_embedding(texts=questions)

# Convert the DataFrame to a list of dicts and index them in our DocumentStore
docs_to_index = df.to_dict(orient="records")

# You can optionally supply a cleaning function that is applied to each doc
# (e.g. to remove footers). It must take a str as input and return a str.
# Now, let's write the docs to our DB.
document_store.write_documents(docs_to_index)

# Init reader & use Finder to get answers (same as in Tutorial 1)
reader = TransformersReader(model="distilbert-base-uncased-distilled-squad",
                            tokenizer="distilbert-base-uncased",
                            use_gpu=-1)
finder = Finder(reader=reader, retriever=retriever)
prediction = finder.get_answers(question="Who is the father of Arya?",
                                top_k_reader=3, top_k_retriever=5)
print_answers(prediction, details="all")
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="",
                                            index="document",
                                            embedding_dim=768,
                                            embedding_field="embedding")

# ## Cleaning & indexing documents
# Let's first get some documents that we want to query
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# convert files to dicts containing documents that can be indexed into our datastore
dicts = convert_files_to_dicts(dir_path=doc_dir,
                               clean_func=clean_wiki_text,
                               split_paragraphs=True)

# Now, let's write the docs to our DB.
document_store.write_documents(dicts[:16])

# ### Retriever
retriever = DensePassageRetriever(document_store=document_store,
                                  embedding_model="dpr-bert-base-nq",
                                  do_lower_case=True,
                                  use_gpu=True)
# Important:
# Now that we have DPR initialized, we need to call update_embeddings() to iterate
# over all previously indexed documents and update their embedding representation.
# While this can be a time-consuming operation (depending on corpus size), it only
# needs to be done once.
# At query time, we only need to embed the query and compare it to the existing doc
# embeddings, which is very fast.
document_store.update_embeddings(retriever)

# ### Reader
# Load a local model or any of the QA models on
index="document") # ## Cleaning & indexing documents # Initialize Elasticsearch with docs if POPULATE_DOCUMENT_STORE: # set path to directory containing the text files doc_dir = os.getcwd() + "\\kbQA\\data\\tesla" # convert files to dicts containing documents that can be indexed to our # datastore dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True) # write the docs to the elasticsearch database document_store.write_documents(dicts) # ## Initalize Retriever, Reader, & Finder # ### Retriever # Retrievers help narrowing down the scope for the Reader to smaller units # of text where a given question # could be answered. # We use Elasticsearch's default BM25 algorithm retriever = ElasticsearchRetriever(document_store=document_store) # ### Reader # A Reader scans the texts returned by retrievers in detail and extracts # the k best answers. It is based on a powerful, but slower deep learning model. reader = TransformersReader(model="dbmdz/bert-base-german-uncased", tokenizer="dbmdz/bert-base-german-uncased", use_gpu=-1) # ### Finder
def main():
    HOST = 'localhost'
    PORT = 9200
    INDEX_NAME = 'wikipedia_en'

    from haystack import Finder
    from haystack.indexing.cleaning import clean_wiki_text
    from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
    from haystack.reader.farm import FARMReader
    from haystack.reader.transformers import TransformersReader
    from haystack.utils import print_answers
    from haystack.database.elasticsearch import ElasticsearchDocumentStore

    document_store = ElasticsearchDocumentStore(host=HOST, port=PORT, username="",
                                                password="", index=INDEX_NAME)

    # clear existing index (optional)
    # if document_store.client.indices.exists(index=document_store.index):
    #     print('clear existing index')
    #     document_store.client.indices.delete(index=document_store.index)

    # Get all dirs in wikipedia folder
    from os import listdir
    from os.path import isfile, join
    import json
    from tqdm import tqdm

    wikidata_path = "wikipedia"
    onlydirs = [
        f for f in listdir(wikidata_path) if not isfile(join(wikidata_path, f))
    ]

    dicts = []
    bulk_size = 5000
    pbar = tqdm(onlydirs)
    for directory in pbar:
        # article files inside this folder
        subdirs = [
            f for f in listdir(join(wikidata_path, directory))
            if isfile(join(wikidata_path, directory, f))
        ]
        pbar.set_description(f"Processing wikipedia folder {directory}")

        for file in subdirs:
            f = open(join(wikidata_path, directory, file), "r")

            # Each text file contains json structures separated by EOL
            articles = f.read().split("\n")

            for article in articles:
                if len(article) == 0:
                    continue

                # Article in json format
                json_formatted_article = json.loads(article)

                # Rename keys
                document = {
                    "id": json_formatted_article["id"],
                    "name": json_formatted_article["title"],
                    "url": json_formatted_article["url"],
                    "text": json_formatted_article["text"]
                }

                # Add document to bulk
                dicts.append(document)

                if len(dicts) >= bulk_size:
                    # Index bulk
                    try:
                        document_store.write_documents(dicts)
                        dicts.clear()
                    except Exception:
                        print("Bulk not indexed")

    if len(dicts) > 0:
        print('final round')
        document_store.write_documents(dicts)

    print('finished')
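# For reference, a made-up example of the one-JSON-object-per-line format assumed by
# the "Rename keys" step above (WikiExtractor-style output); only the keys read there
# ("id", "title", "url", "text") are required.
example_line = ('{"id": "12", "url": "https://en.wikipedia.org/wiki?curid=12", '
                '"title": "Anarchism", "text": "Anarchism is a political philosophy ..."}')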
import re

enable_elastic_search()

document_store_dense = ElasticsearchDocumentStore(host="localhost", username="",
                                                  password="", index="document",
                                                  embedding_field="embedding",
                                                  embedding_dim=768)
document_store_sparse = ElasticsearchDocumentStore(host="localhost", username="",
                                                   password="", index="document")

document_store_dense.write_documents(base_corpus())

dense_retriever = DensePassageRetriever(document_store=document_store_dense,
                                        embedding_model="dpr-bert-base-nq",
                                        do_lower_case=True,
                                        use_gpu=True)
document_store_dense.update_embeddings(dense_retriever)

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

# Regex building blocks for rule-based sentence splitting
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"
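# A minimal sketch of how the regex fragments above are typically combined into a
# rule-based sentence splitter (this helper is an assumption, not part of the original
# code): periods that do not end a sentence are masked as "<prd>", real sentence
# boundaries are marked with "<stop>", then the text is split on "<stop>".
def split_into_sentences(text):
    text = " " + text + "  "
    text = re.sub(prefixes, "\\1<prd>", text)                        # "Dr." -> "Dr<prd>"
    text = re.sub(websites, "<prd>\\1", text)                        # ".com" -> "<prd>com"
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)         # " Inc." -> " Inc<prd>"
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)  # "U.S. He" -> break
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    for mark in (".", "?", "!"):
        text = text.replace(mark, mark + "<stop>")
    text = text.replace("<prd>", ".")
    return [s.strip() for s in text.split("<stop>") if s.strip()]

# Example: yields two sentences despite the periods in "Dr.", "Inc." and "U.S.".
print(split_into_sentences("Dr. Smith founded Acme Inc. in the U.S. He retired in 1999."))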
def main():
    # fetch model files if not present. not hosted in git repo
    # model_exists = os.path.isfile(
    #     './kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin')
    # if not model_exists:
    #     logging.info("Starting model download (about 700MB) ...")
    #     urllib.request.urlretrieve(
    #         "https://cdn.huggingface.co/mrm8488/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin",
    #         "./kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin")
    #     logging.info("model successfully downloaded")

    # start Elasticsearch
    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run(
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" --name "MLQA2" elasticsearch:7.6.2',
            shell=True
        )
        if status.returncode:
            raise Exception("Failed to launch Elasticsearch. If you want to "
                            "connect to an existing Elasticsearch instance, "
                            "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)

    # 512 dimensions because that is what the sentence transformer returns
    document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                                password="", index="document",
                                                embedding_dim=512,
                                                embedding_field="embedding")

    # load docs into the database
    if LAUNCH_ELASTICSEARCH or POPULATE_DOCUMENT_STORE:
        dicts = convert_files_to_dicts(
            dir_path=data_path, clean_func=clean_text, split_paragraphs=True)
        logging.info("files to dicts done.")
        # write dicts containing the texts to the database
        document_store.write_documents(dicts)
        logging.info("documents to store written.")

        retriever = EmbeddingRetriever(document_store=document_store,
                                       embedding_model=retriever_model_name_full,
                                       model_format=retriever_model_type,
                                       gpu=False)
        # generate embeddings for each text and add them to the database entries
        document_store.update_embeddings(retriever)
        logging.info("embeddings to documents in store written.")

    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model=retriever_model_name_full,
                                   model_format=retriever_model_type,
                                   gpu=False)

    # reader won't be used in the retrieval because results take longer and the
    # quality is worse; it would still have to be initialized like this:
    # reader = TransformersReader(model="./kbQA/" + reader_model_name,
    #                             tokenizer="./kbQA/" + reader_model_name,
    #                             use_gpu=-1)
    finder = Finder(retriever=retriever, reader=None)

    if TEST:
        try:
            with open("./kbQA/Test.json", encoding="utf-8") as file:
                times = []
                results = []
                failed = []
                # each line has multiple paragraphs and embeddings; read the file
                # line by line
                for line in file:
                    # load the JSON string of the current line as a Python object
                    data = json.loads(line)
                    q = data["question"]
                    # fetch results from the db
                    start_time = time.process_time()
                    candidate_docs = finder.retriever.retrieve(
                        query=q, filters=None, top_k=5)
                    end_time = time.process_time()
                    times.append(end_time - start_time)
                    answered = False
                    for doc in candidate_docs:
                        if data["answer"] in doc.text:
                            answered = True
                            results.append(True)
                            break
                    if not answered:
                        answers = []
                        for doc in candidate_docs:
                            answers.append(doc.text)
                        failed.append(
                            {"q": q, "correct": data["answer"], "a": answers})
                total = 0
                for zeit in times:
                    total = total + zeit
                logging.info("Average time per request: %f", total / len(times))
                logging.info("Questions answered correctly: %d/%d (%f)",
                             len(results), len(times), len(results) / len(times))
                logging.info("Failed questions:")
                for fail in failed:
                    logging.info("Question: %s", fail["q"])
                    logging.info("Correct Answer: %s", fail["correct"])
                    for answer in fail["a"]:
                        logging.info(answer)
        except Exception as e:
            traceback.print_exc()
            logging.error(f"exception: {e}")
    else:
        # loop until a KeyboardInterrupt (Ctrl+C) or "!q" is entered
        while True:
            try:
                # read input from the console
                q = input("Enter:").strip()
                # input "!q" to stop execution
                if q == "!q":
                    exit(0)
                # fetch results from the db
                candidate_docs = finder.retriever.retrieve(
                    query=q, filters=None, top_k=5)
                for doc in candidate_docs:
                    logging.info("doc id: %s", doc.id)
                    logging.info("doc meta name: %s", doc.meta["name"])
                    logging.info("doc text: %s", doc.text)
                    logging.info("doc query score: %s", doc.query_score)
                    logging.info("")
                # not used
                # prediction = finder.get_answers(
                #     question=q, top_k_retriever=10, top_k_reader=5)
                # print_answers(prediction, details="medium")
            except Exception as e:
                traceback.print_exc()
                logging.error(f"exception: {e}")
def main(data_dir, bulk_size, paragraph=False):
    document_store = ElasticsearchDocumentStore(
        host="localhost",
        username="",
        password="",
        index="wikipedia" + ("_paragraph" if paragraph else ""))

    if document_store.client.indices.exists(index=document_store.index):
        logger.info(f'{"wikipedia" + ("_paragraph" if paragraph else "")}')
        logger.warning(
            f"Index {document_store.index} already exists, deleting the index.")
        document_store.client.indices.delete(index=document_store.index)

    # Get all dirs in wikipedia folder
    only_dirs = [f for f in listdir(data_dir) if not isfile(join(data_dir, f))]

    dicts = []
    counts = dict(documents=0, paragraphs=0)
    progress_bar = tqdm(only_dirs)
    for directory in progress_bar:
        # article files inside this folder
        sub_dirs = [
            f for f in listdir(join(data_dir, directory))
            if isfile(join(data_dir, directory, f))
        ]
        progress_bar.set_description(
            f"Processing wikipedia folder {directory}")

        for file in sub_dirs:
            f = open(join(data_dir, directory, file), "r")

            # Each text file contains json structures separated by EOL
            articles = f.read().split("\n")

            for article in articles:
                if len(article) == 0:
                    continue

                # Article in json format
                json_formatted_article = json.loads(article)

                base_document = {
                    "id": json_formatted_article["id"],
                    "name": json_formatted_article["title"],
                    "url": json_formatted_article["url"],
                }
                counts["documents"] += 1

                if paragraph:
                    """
                    - Paragraphs are separated by two new-line characters.
                    - The first paragraph is always the title --> remove!
                    - Some paragraphs only contain whitespace --> ignore
                    """
                    paragraphs = [
                        p.strip() for pid, p in enumerate(
                            json_formatted_article["text"].split("\n\n"))
                        if pid > 0 and p.strip()
                    ]
                    counts["paragraphs"] += len(paragraphs)
                    for pid, p in enumerate(paragraphs):
                        document = {
                            **base_document,
                            "paragraph_id": pid,
                            "text": p
                        }
                        # Add document to bulk
                        dicts.append(document)
                else:
                    # Rename keys
                    document = {
                        **base_document,
                        "text": json_formatted_article["text"]
                    }
                    # Add document to bulk
                    dicts.append(document)

                if len(dicts) >= bulk_size:
                    # Index bulk
                    try:
                        document_store.write_documents(dicts)
                    except Exception:
                        logger.warning("Bulk not indexed")
                    # Empty bulk
                    dicts = []

    # index the last partial batch
    if dicts:
        try:
            document_store.write_documents(dicts)
        except Exception:
            logger.warning("Bulk not indexed")

    logger.info("==" * 100)
    logger.info("Indexing done.")
    logger.info(f"# documents: {counts['documents']}")
    if paragraph and counts['documents']:
        logger.info(
            f"# paragraphs: {counts['paragraphs']}, "
            f"{counts['paragraphs'] / counts['documents']:.2f} per document")
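# Tiny illustration of the paragraph-splitting rule documented in the docstring above,
# using a made-up article text (not taken from the original data): the title paragraph
# and whitespace-only paragraphs are dropped.
sample_text = "Anarchism\n\nAnarchism is a political philosophy.\n\n   \n\nIt questions authority."
sample_paragraphs = [
    p.strip() for pid, p in enumerate(sample_text.split("\n\n")) if pid > 0 and p.strip()
]
assert sample_paragraphs == ["Anarchism is a political philosophy.", "It questions authority."]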