class QAPipeline:
    def __init__(self):
        self.document_store = ElasticsearchDocumentStore(host="localhost",
                                                         username="",
                                                         password="",
                                                         index="document")
        self.retriever = ElasticsearchRetriever(
            document_store=self.document_store)
        self.reader = FARMReader(
            model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
        self.finder = Finder(self.reader, self.retriever)
        print('Ready')

    def add_to_datastore_from_remote(self, data_url):
        return {'status': 'Not Implemented'}

    def add_to_datastore_local(self, data_path):
        json_data = read_json_data(data_path)
        es_data = create_data_dicts(json_data)
        self.document_store.write_documents(es_data)
        return {'status': 'Added To Datastore'}

    def answer(self, question, top_k_options=10, top_k_answers=3):
        prediction = self.finder.get_answers(question=question,
                                             top_k_retriever=top_k_options,
                                             top_k_reader=top_k_answers)
        results = extract_info_from_predictions(prediction)
        return results
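# A minimal usage sketch for QAPipeline above (not part of the original
# snippet): it assumes a running Elasticsearch instance on localhost:9200 and
# a hypothetical JSON file "data/faq.json" in whatever format read_json_data()
# expects.
pipeline = QAPipeline()
pipeline.add_to_datastore_local("data/faq.json")  # hypothetical path
print(pipeline.answer("Which services are excluded?", top_k_answers=3))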
def main():
    POPULATE_DOCUMENT_STORE = True
    document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                                password="", index="document",
                                                text_field="text",
                                                embedding_field="question_emb",
                                                embedding_dim=768,
                                                excluded_meta_data=["question_emb"])
    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model=os.getcwd() + "\\kbQA\\bert-german-model",
        gpu=True, model_format="transformers")
    if POPULATE_DOCUMENT_STORE:
        doc_dir = os.getcwd() + "\\kbQA\\data\\Skripte\\Securplus\\txt"
        dicts = convert_files_to_dicts(
            dir_path=doc_dir, clean_func=clean_text, split_paragraphs=True)
        with open("Output.txt", "w") as text_file:
            text = ""
            for doc in dicts:
                text = text + "\n" + doc["text"]
            text_file.write(text)
        df = pd.DataFrame.from_dict(dicts)
        # Careful here! At this point we create embeddings not for the
        # questions but for the texts, i.e. the answers, so the variable
        # names are somewhat misleading.
        # dummy_questions is just an increasing number starting at one. It is
        # needed because the search throws exceptions otherwise.
        # The tutorial seems to assume an FAQ setting where question and
        # answer are predefined, so embeddings can be created for the
        # predefined questions, and the k best candidates are returned based
        # on those alone. We, in contrast, create embeddings for every
        # single text.
        # TODO: Since we create embeddings per text, we may need to perform
        # sentence segmentation: the longer the texts get, the less precise
        # the embeddings become. Per-sentence embeddings are considerably
        # more exact.
        questions = list(df["text"].values)
        df["question_emb"] = retriever.create_embedding(texts=questions)
        dummy_questions = [f"{no}" for no, x in enumerate(questions, start=1)]
        df["question"] = dummy_questions
        print(df.head())
        docs_to_index = df.to_dict(orient="records")
        document_store.write_documents(docs_to_index)

    # question = "Wie viele haben Angst um ihren Job?"
    question = "welche leistungen sind ausgeschlossen?"
    # again: lowercasing is strictly required!
    question = question.lower()
    # We currently cannot use a Reader, since readers apparently require QA
    # fine-tuning. The retriever fetches the best hits based on the
    # embeddings; get_answers() is not usable without a reader.
    finder = Finder(reader=None, retriever=retriever)
    prediction = finder.get_answers_via_similar_questions(
        question, top_k_retriever=5)
    print_answers(prediction, details="all")
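# A minimal sketch of the sentence segmentation mentioned in the TODO above
# (illustration only; the naive regex splitter is an assumption, and a proper
# German sentence segmenter would do better):
import re

def split_into_sentences(text):
    # split on sentence-final punctuation followed by whitespace
    return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]

# e.g. embed each sentence separately instead of the whole paragraph:
# sentences = [s for d in dicts for s in split_into_sentences(d["text"])]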
def test_elasticsearch_write_read(elasticsearch_fixture):
    document_store = ElasticsearchDocumentStore()
    documents = convert_files_to_dicts(dir_path="samples/docs")
    document_store.write_documents(documents)
    sleep(2)  # wait for documents to be available for query
    documents = document_store.get_all_documents()
    assert len(documents) == 2
    assert documents[0].id
    assert documents[0].text
def test_elasticsearch_write_read(elasticsearch_fixture):
    document_store = ElasticsearchDocumentStore()
    write_documents_to_db(document_store=document_store,
                          document_dir="samples/docs")
    sleep(2)  # wait for documents to be available for query
    documents = document_store.get_all_documents()
    print(documents)
    assert len(documents) == 2
    assert documents[0].id
    assert documents[0].text
def __init__(self):
    self.document_store = ElasticsearchDocumentStore(host="localhost",
                                                     username="",
                                                     password="",
                                                     index="document")
    self.retriever = ElasticsearchRetriever(
        document_store=self.document_store)
    self.reader = FARMReader(
        model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
    self.finder = Finder(self.reader, self.retriever)
    print('Ready')
def get_results(txt_files_location, use_gpu, questions_list, results_location):
    document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                                password="", index="document")
    for dirpath, dirnames, files in os.walk(txt_files_location):
        for dirname in dirnames:
            for dirpath, dirname, files in os.walk(
                    os.path.join(txt_files_location, dirname)):
                for file_name in files:
                    # start from a fresh index for every file
                    document_store.client.indices.delete(index='document',
                                                         ignore=[400, 404])
                    doc_dir = dirpath
                    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                                   clean_func=clean_wiki_text,
                                                   split_paragraphs=True)
                    document_store.write_documents(dicts)
                    retriever = ElasticsearchRetriever(
                        document_store=document_store)
                    reader = FARMReader(
                        model_name_or_path="elgeish/cs224n-squad2.0-albert-xxlarge-v1",
                        use_gpu=use_gpu)
                    finder = Finder(reader, retriever)
                    # redirect stdout to a per-file results file
                    sys.stdout = open(
                        os.path.join(results_location,
                                     file_name[:-4] + "_results.txt"), "a+")
                    for i, question in enumerate(questions_list):
                        prediction = finder.get_answers(question=question,
                                                        top_k_retriever=10,
                                                        top_k_reader=1)
                        print("\n\n\nQuestion " + str(i + 1) + ":\n")
                        print(question + "\n")
                        print_answers(prediction, details="minimal")
                    sys.stdout.close()
    document_store.client.transport.close()
def init():
    ### Model values for Reader and Document Store
    global document_store, retriever, reader, finder
    document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                                password="", index="document")
    retriever = ElasticsearchRetriever(document_store=document_store)
    reader = FARMReader(model_name_or_path='deepset/roberta-base-squad2-covid',
                        use_gpu=False)
    finder = Finder(reader, retriever)
def test_elasticsearch_custom_fields(elasticsearch_fixture):
    client = Elasticsearch()
    client.indices.delete(index='haystack_test_custom', ignore=[404])
    document_store = ElasticsearchDocumentStore(
        index="haystack_test_custom", text_field="custom_text_field",
        embedding_field="custom_embedding_field")
    doc_to_write = {
        "custom_text_field": "test",
        "custom_embedding_field": np.random.rand(768).astype(np.float32)
    }
    document_store.write_documents([doc_to_write])
    documents = document_store.get_all_documents()
    assert len(documents) == 1
    assert documents[0].text == "test"
    np.testing.assert_array_equal(doc_to_write["custom_embedding_field"],
                                  documents[0].embedding)
def get_elastic_search_document_store(es_host='localhost', es_port=9200,
                                      es_index_name='wikipedia',
                                      search_fields=['text']):
    return ElasticsearchDocumentStore(host=es_host, port=es_port, username="",
                                      password="", index=es_index_name,
                                      search_fields=search_fields)
def document_store(request, test_docs_xs, elasticsearch_fixture):
    if request.param == "sql":
        if os.path.exists("qa_test.db"):
            os.remove("qa_test.db")
        document_store = SQLDocumentStore(url="sqlite:///qa_test.db")
    if request.param == "memory":
        document_store = InMemoryDocumentStore()
    if request.param == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="haystack_test")
    return document_store
def qa(self, question, text_field):
    document_store = ElasticsearchDocumentStore(host=ES_HOST,
                                                username=ES_USERNAME,
                                                password=ES_PASSWORD,
                                                index=self.ELASTIC_INDEX,
                                                text_field=text_field)
    retriever = TfidfRetriever(document_store=document_store)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=False)
    finder = Finder(reader, retriever)
    prediction = finder.get_answers(question=question, top_k_retriever=1,
                                    top_k_reader=5)
    return prediction
def get_document_store(document_store_type):
    if document_store_type == "sql":
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="haystack_test")
    elif document_store_type == "faiss":
        if os.path.exists("haystack_test_faiss.db"):
            os.remove("haystack_test_faiss.db")
        document_store = FAISSDocumentStore(
            sql_url="sqlite:///haystack_test_faiss.db")
    else:
        raise Exception(f"No document store fixture for '{document_store_type}'")
    return document_store
def eval(self, document_store: ElasticsearchDocumentStore, device: str,
         label_index: str = "feedback", doc_index: str = "eval_document",
         label_origin: str = "gold_label"):
    """
    Performs evaluation on evaluation documents in the Elasticsearch
    DocumentStore.

    Returns a dict containing the following metrics:
    - "EM": Proportion of exact matches of predicted answers with their
      corresponding correct answers
    - "f1": Average overlap between predicted answers and their
      corresponding correct answers
    - "top_n_recall": Proportion of predicted answers that overlap with
      the correct answer

    :param document_store: The ElasticsearchDocumentStore containing the
        evaluation documents
    :type document_store: ElasticsearchDocumentStore
    :param device: The device on which the tensors should be processed.
        Choose from "cpu" and "cuda".
    :type device: str
    :param label_index: Elasticsearch index where labeled questions are stored
    :type label_index: str
    :param doc_index: Elasticsearch index where documents that are used for
        evaluation are stored
    :type doc_index: str
    :param label_origin: Field value that marks gold labels in the label index
    :type label_origin: str
    """
    # extract all questions for evaluation
    filters = {"origin": label_origin}
    questions = document_store.get_all_documents_in_index(index=label_index,
                                                          filters=filters)

    # mapping from doc_id to questions
    doc_questions_dict = {}
    question_id = 0
    for question in questions:
        doc_id = question["_source"]["doc_id"]
        doc_questions_dict.setdefault(doc_id, []).append({
            "id": question_id,
            "question": question["_source"]["question"],
            "answers": question["_source"]["answers"],
            "is_impossible": not question["_source"]["answers"]
        })
        question_id += 1

    # extract eval documents and convert data back to SQuAD-like format
    documents = document_store.get_all_documents_in_index(index=doc_index)
    dicts = []
    for document in documents:
        doc_id = document["_source"]["doc_id"]
        text = document["_source"]["text"]
        questions = doc_questions_dict[doc_id]
        dicts.append({"qas": questions, "context": text})

    # Create a DataLoader that can be passed to the Evaluator
    indices = range(len(dicts))
    dataset, tensor_names = self.inferencer.processor.dataset_from_dicts(
        dicts, indices=indices)
    data_loader = NamedDataLoader(dataset=dataset,
                                  batch_size=self.inferencer.batch_size,
                                  tensor_names=tensor_names)

    evaluator = Evaluator(data_loader=data_loader,
                          tasks=self.inferencer.processor.tasks,
                          device=device)

    eval_results = evaluator.eval(self.inferencer.model)
    results = {
        "EM": eval_results[0]["EM"],
        "f1": eval_results[0]["f1"],
        "top_n_recall": eval_results[0]["top_n_recall"]
    }
    return results
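# A hedged usage sketch for eval() above: judging by self.inferencer, this
# appears to be a FARMReader method, and the index names mirror the defaults
# in the signature. The Elasticsearch host and reader model are assumptions,
# not taken from the snippet.
document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                            password="", index="document")
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                    use_gpu=False)
metrics = reader.eval(document_store=document_store, device="cpu",
                      label_index="feedback", doc_index="eval_document")
print(metrics)  # {"EM": ..., "f1": ..., "top_n_recall": ...}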
def main():
    HOST = 'localhost'
    PORT = 9200
    INDEX_NAME = 'wikipedia_en'

    from haystack import Finder
    from haystack.indexing.cleaning import clean_wiki_text
    from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
    from haystack.reader.farm import FARMReader
    from haystack.reader.transformers import TransformersReader
    from haystack.utils import print_answers
    from haystack.database.elasticsearch import ElasticsearchDocumentStore

    document_store = ElasticsearchDocumentStore(host=HOST, port=PORT,
                                                username="", password="",
                                                index=INDEX_NAME)

    # clear existing index (optional)
    # if document_store.client.indices.exists(index=document_store.index):
    #     print('clear existing index')
    #     document_store.client.indices.delete(index=document_store.index)

    # Get all dirs in wikipedia folder
    from os import listdir
    from os.path import isfile, join
    import json
    from tqdm import tqdm

    wikidata_path = "wikipedia"
    onlydirs = [
        f for f in listdir(wikidata_path)
        if not isfile(join(wikidata_path, f))
    ]

    dicts = []
    bulk_size = 5000

    pbar = tqdm(onlydirs)
    for directory in pbar:
        # wiki extractor layout: each folder contains plain text files
        subdirs = [
            f for f in listdir(join(wikidata_path, directory))
            if isfile(join(wikidata_path, directory, f))
        ]
        pbar.set_description(f"Processing wikipedia folder {directory}")

        for file in subdirs:
            with open(join(wikidata_path, directory, file), "r") as f:
                # Each text file contains json structures separated by EOL
                articles = f.read().split("\n")

            for article in articles:
                if len(article) == 0:
                    continue

                # Article in json format
                json_formatted_article = json.loads(article)

                # Rename keys
                document = {
                    "id": json_formatted_article["id"],
                    "name": json_formatted_article["title"],
                    "url": json_formatted_article["url"],
                    "text": json_formatted_article["text"]
                }

                # Add document to bulk
                dicts.append(document)

                if len(dicts) >= bulk_size:
                    # Index bulk
                    try:
                        document_store.write_documents(dicts)
                        dicts.clear()
                    except Exception:
                        print("Bulk not indexed")

    if len(dicts) > 0:
        print('final round')
        document_store.write_documents(dicts)

    print('finished')
def main(data_dir, bulk_size, paragraph=False):
    index_name = "wikipedia" + ("_paragraph" if paragraph else "")
    document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                                password="", index=index_name)
    if document_store.client.indices.exists(index=document_store.index):
        logger.info(index_name)
        logger.warning(
            f"Index {document_store.index} already exists, deleting the index.")
        document_store.client.indices.delete(index=document_store.index)

    # Get all dirs in wikipedia folder
    only_dirs = [f for f in listdir(data_dir) if not isfile(join(data_dir, f))]

    dicts = []
    counts = dict(documents=0, paragraphs=0)
    progress_bar = tqdm(only_dirs)
    for directory in progress_bar:
        # wiki extractor layout: each folder contains plain text files
        sub_dirs = [
            f for f in listdir(join(data_dir, directory))
            if isfile(join(data_dir, directory, f))
        ]
        progress_bar.set_description(
            f"Processing wikipedia folder {directory}")

        for file in sub_dirs:
            with open(join(data_dir, directory, file), "r") as f:
                # Each text file contains json structures separated by EOL
                articles = f.read().split("\n")

            for article in articles:
                if len(article) == 0:
                    continue

                # Article in json format
                json_formatted_article = json.loads(article)

                base_document = {
                    "id": json_formatted_article["id"],
                    "name": json_formatted_article["title"],
                    "url": json_formatted_article["url"],
                }
                counts["documents"] += 1

                if paragraph:
                    # - Paragraphs are separated by two new-line characters.
                    # - The first paragraph is always the title --> remove!
                    # - Some paragraphs only contain whitespace --> ignore
                    paragraphs = [
                        p.strip()
                        for pid, p in enumerate(
                            json_formatted_article["text"].split("\n\n"))
                        if pid > 0 and p.strip()
                    ]
                    counts["paragraphs"] += len(paragraphs)
                    for pid, p in enumerate(paragraphs):
                        # Add document to bulk
                        dicts.append({**base_document,
                                      "paragraph_id": pid,
                                      "text": p})
                else:
                    # Rename keys and add document to bulk
                    dicts.append({**base_document,
                                  "text": json_formatted_article["text"]})

                if len(dicts) >= bulk_size:
                    # Index bulk
                    try:
                        document_store.write_documents(dicts)
                    except Exception:
                        logger.warning("Bulk not indexed")
                    # Empty bulk
                    dicts = []

    # index the last partial batch
    if dicts:
        try:
            document_store.write_documents(dicts)
        except Exception:
            logger.warning("Bulk not indexed")

    logger.info("==" * 100)
    logger.info("Indexing done.")
    logger.info(f"# documents: {counts['documents']}")
    if paragraph and counts['documents']:
        logger.info(
            f"# paragraphs: {counts['paragraphs']}, "
            f"{counts['paragraphs'] / counts['documents']:.2f} per document")
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers

# doc_dir = "data/article_txt_got"
# s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
# fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
# dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
# print(dicts[:1])
# print(len(dicts))

from haystack.database.elasticsearch import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                            password="", index="document")
# document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document",
#                                             embedding_field="embedding", embedding_dim=768)
# document_store.write_documents(dicts)

from haystack.retriever.sparse import ElasticsearchRetriever
retriever = ElasticsearchRetriever(document_store=document_store)

# from haystack.retriever.dense import DensePassageRetriever
# retriever = DensePassageRetriever(document_store=document_store,
#                                   embedding_model="dpr-bert-base-nq",
#                                   do_lower_case=True, use_gpu=True)
# document_store.update_embeddings(retriever)
    raise Exception(
        "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance "
        "then set LAUNCH_ELASTICSEARCH in the script to False.")
time.sleep(15)

# Init the DocumentStore
#
# * specify the name of our `text_field` in Elasticsearch that we want to return as an answer
# * specify the name of our `embedding_field` in Elasticsearch where we'll store the embedding of our question and that is used later for calculating our similarity to the incoming user question
# * set `excluded_meta_data=["question_emb"]` so that we don't return the huge embedding vectors in our search results
document_store = ElasticsearchDocumentStore(
    host="localhost", username="", password="", index="document",
    text_field="text", embedding_field="question_emb", embedding_dim=768,
    excluded_meta_data=["question_emb"])

# Create a Retriever using embeddings
# Instead of retrieving via Elasticsearch's plain BM25, we want to use vector
# similarity of the questions (user question vs. FAQ ones).
# We can use the `EmbeddingRetriever` for this purpose and specify a model
# that we use for the embeddings.
# retriever = EmbeddingRetriever(document_store=document_store,
#                                embedding_model="deepset/sentence_bert",
#                                gpu=True)

if POPULATE_DOCUMENT_STORE:
    # set path to directory containing the text files
from config import DB_HOST, DB_USER, DB_PW, DB_INDEX
from config import READER_MODEL_PATH
from fastapi import FastAPI
from pydantic import BaseModel
import logging

logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s %(name)s %(threadName)s : %(message)s')

app = FastAPI()

### Model values for Reader and Document Store
global document_store, retriever, reader, finder
document_store = ElasticsearchDocumentStore(host=DB_HOST, username=DB_USER,
                                            password=DB_PW, index=DB_INDEX)
retriever = ElasticsearchRetriever(document_store=document_store)
reader = FARMReader(model_name_or_path=READER_MODEL_PATH, use_gpu=False)
finder = Finder(reader, retriever)

## API
class Item(BaseModel):
    query: str

@app.get("/greet")
async def greet():
    return {"message": "Hi there!!! I am working"}
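# A hedged sketch of the query endpoint the Item model above is presumably
# meant for (the route name, top_k values, and response shape are
# assumptions; finder.get_answers is used as in the other snippets here):
@app.post("/query")
async def query(item: Item):
    prediction = finder.get_answers(question=item.query,
                                    top_k_retriever=10, top_k_reader=5)
    return prediction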
def main():
    # fetch model files if not present. not hosted in git repo
    # model_exists = os.path.isfile(
    #     './kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin')
    # if not model_exists:
    #     logging.info("Starting model download (about 700MB) ...")
    #     urllib.request.urlretrieve(
    #         "https://cdn.huggingface.co/mrm8488/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin",
    #         "./kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin")
    #     logging.info("model successfully downloaded")

    # start Elasticsearch
    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run(
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" --name "MLQA2" elasticsearch:7.6.2',
            shell=True
        )
        if status.returncode:
            raise Exception("Failed to launch Elasticsearch. If you want to "
                            "connect to an existing Elasticsearch instance "
                            "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)

    # 512 dimensions because that is what the sentence transformer returns
    document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                                password="", index="document",
                                                embedding_dim=512,
                                                embedding_field="embedding")
    # load docs in database
    if LAUNCH_ELASTICSEARCH or POPULATE_DOCUMENT_STORE:
        dicts = convert_files_to_dicts(
            dir_path=data_path, clean_func=clean_text, split_paragraphs=True)
        logging.info("files to dicts done.")
        # write dicts containing the texts to the database
        document_store.write_documents(dicts)
        logging.info("documents to store written.")
        retriever = EmbeddingRetriever(document_store=document_store,
                                       embedding_model=retriever_model_name_full,
                                       model_format=retriever_model_type,
                                       gpu=False)
        # generate embeddings for each text and add them to the database entry
        document_store.update_embeddings(retriever)
        logging.info("embeddings to documents in store written.")

    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model=retriever_model_name_full,
                                   model_format=retriever_model_type,
                                   gpu=False)
    # The reader won't be used for retrieval because results take longer and
    # the quality is worse; it would still have to be initialized:
    # reader = TransformersReader(model="./kbQA/" + reader_model_name,
    #                             tokenizer="./kbQA/" + reader_model_name,
    #                             use_gpu=-1)
    finder = Finder(retriever=retriever, reader=None)

    if TEST:
        try:
            with open("./kbQA/Test.json", encoding="utf-8") as file:
                times = []
                results = []
                failed = []
                # each line holds one question with its paragraphs and
                # embeddings; read the file line by line
                for line in file:
                    # load the json string of the current line as a python object
                    data = json.loads(line)
                    q = data["question"]
                    # fetch results from db
                    start_time = time.process_time()
                    candidate_docs = finder.retriever.retrieve(
                        query=q, filters=None, top_k=5)
                    end_time = time.process_time()
                    times.append(end_time - start_time)
                    answered = False
                    for doc in candidate_docs:
                        if data["answer"] in doc.text:
                            answered = True
                            results.append(True)
                            break
                    if not answered:
                        answers = []
                        for doc in candidate_docs:
                            answers.append(doc.text)
                        failed.append(
                            {"q": q, "correct": data["answer"], "a": answers})
                total = sum(times)
                logging.info("Average time per request: %f", total / len(times))
                logging.info("Questions answered correctly: %d/%d (%f)",
                             len(results), len(times),
                             len(results) / len(times))
                logging.info("Failed questions:")
                for fail in failed:
                    logging.info("Question: %s", fail["q"])
                    logging.info("Correct Answer: %s", fail["correct"])
                    for answer in fail["a"]:
                        logging.info(answer)
        except Exception as e:
            traceback.print_exc()
            logging.error(f"exception: {e}")
    else:
        # loop until KeyboardInterrupt (ctrl+c) or "!q" input
        while True:
            try:
                # read input from the console
                q = input("Enter:").strip()
                # input "!q" to stop execution
                if q == "!q":
                    exit(0)
                # fetch results from db
                candidate_docs = finder.retriever.retrieve(
                    query=q, filters=None, top_k=5)
                for doc in candidate_docs:
                    logging.info("doc id: %s", doc.id)
                    logging.info("doc meta name: %s", doc.meta["name"])
                    logging.info("doc text: %s", doc.text)
                    logging.info("doc query score: %s", doc.query_score)
                    logging.info("")
                # not used
                # prediction = finder.get_answers(
                #     question=q, top_k_retriever=10, top_k_reader=5)
                # print_answers(prediction, details="medium")
            except Exception as e:
                traceback.print_exc()
                logging.error(f"exception: {e}")
status = subprocess.run(
    'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2',
    shell=True
)
if status.returncode:
    raise Exception(
        "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance "
        "then set LAUNCH_ELASTICSEARCH in the script to False.")
time.sleep(30)

# Download evaluation data, which is a subset of the Natural Questions
# development set containing 50 documents
doc_dir = "../data/nq"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Connect to Elasticsearch
document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                            password="", index="document",
                                            create_index=False,
                                            embedding_field="emb",
                                            embedding_dim=768,
                                            excluded_meta_data=["emb"])

# Add evaluation data to Elasticsearch database
# We first delete the custom tutorial indices to avoid duplicate elements
document_store.delete_all_documents(index=doc_index)
document_store.delete_all_documents(index=label_index)
document_store.add_eval_data(filename="../data/nq/nq_dev_subset_v2.json",
                             doc_index=doc_index, label_index=label_index)

# Initialize Retriever
retriever = ElasticsearchRetriever(document_store=document_store)

# Alternative: Evaluate DensePassageRetriever
# Note that DPR works best when you index short passages < 512 tokens, as
# only those tokens will be used for the embedding.
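# A hedged sketch of the DensePassageRetriever alternative mentioned above,
# kept commented out like the similar alternatives elsewhere in this
# collection (model name and flags are copied from those snippets; GPU
# availability is an assumption):
# from haystack.retriever.dense import DensePassageRetriever
# retriever = DensePassageRetriever(document_store=document_store,
#                                   embedding_model="dpr-bert-base-nq",
#                                   do_lower_case=True, use_gpu=True)
# document_store.update_embeddings(retriever)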
from pprint import pprint

from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever

if __name__ == '__main__':
    document_store = ElasticsearchDocumentStore(
        host="192.168.8.106",
        username="",
        password="",
        index="drqa_wiki",
    )
    retriever = ElasticsearchRetriever(document_store=document_store)
    while True:
        q = input("utter question: ")
        documents = retriever.retrieve(q, top_k=3)
        pprint([d.text for d in documents])
if LAUNCH_ELASTICSEARCH:
    logging.info("Starting Elasticsearch ...")
    status = subprocess.run(
        'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2',
        shell=True)
    if status.returncode:
        raise Exception(
            "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance "
            "then set LAUNCH_ELASTICSEARCH in the script to False.")
    time.sleep(15)

# Connect to Elasticsearch
document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                            password="", index="document",
                                            embedding_dim=768,
                                            embedding_field="embedding")

# ## Cleaning & indexing documents
# Let's first get some documents that we want to query
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# convert files to dicts containing documents that can be indexed to our datastore
dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text,
                               split_paragraphs=True)

# Now, let's write the docs to our DB.
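# (the write step announced above, following the same pattern as the other
# indexing snippets in this collection)
document_store.write_documents(dicts)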
from haystack import Finder
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.reader.farm import FARMReader
from haystack.retriever.sparse import ElasticsearchRetriever
from haystack.utils import print_answers

document_store = ElasticsearchDocumentStore(
    host="192.168.8.106",
    username="",
    password="",
    index="drqa_wiki",
    # embedding_dim=768,
    # embedding_field="embedding",
)
retriever = ElasticsearchRetriever(document_store=document_store)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                    use_gpu=False)
finder = Finder(reader, retriever)

prediction = finder.get_answers(question="What is the capital of Germany?",
                                top_k_retriever=10, top_k_reader=5)
# prediction = finder.get_answers(question="Who created the Dothraki vocabulary?", top_k_reader=5)
# prediction = finder.get_answers(question="Who is the sister of Sansa?", top_k_reader=5)
print_answers(prediction, details="minimal")
logging.info("Starting Elasticsearch ...") status = subprocess.run( ['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], shell=True ) if status.returncode: raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance" "then set LAUNCH_ELASTICSEARCH in the script to False.") time.sleep(30) # Download evaluation data, which is a subset of Natural Questions development set containing 50 documents doc_dir = "../data/nq" s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset.json.zip" fetch_archive_from_http(url=s3_url, output_dir=doc_dir) # Connect to Elasticsearch document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document", create_index=False) # Add evaluation data to Elasticsearch database if LAUNCH_ELASTICSEARCH: document_store.add_eval_data("../data/nq/nq_dev_subset.json") else: logger.warning("Since we already have a running ES instance we should not index the same documents again." "If you still want to do this call: 'document_store.add_eval_data('../data/nq/nq_dev_subset.json')' manually ") # Initialize Retriever retriever = ElasticsearchRetriever(document_store=document_store) # Initialize Reader reader = FARMReader("deepset/roberta-base-squad2") # Initialize Finder which sticks together Reader and Retriever finder = Finder(reader, retriever)
from setup import enable_elastic_search, base_corpus
from haystack.reader.farm import FARMReader
from haystack.retriever.dense import DensePassageRetriever
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack import Finder
from typing import Dict, List
import re

enable_elastic_search()

document_store_dense = ElasticsearchDocumentStore(host="localhost",
                                                  username="", password="",
                                                  index="document",
                                                  embedding_field="embedding",
                                                  embedding_dim=768)
document_store_sparse = ElasticsearchDocumentStore(host="localhost",
                                                   username="", password="",
                                                   index="document")

document_store_dense.write_documents(base_corpus())

dense_retriever = DensePassageRetriever(document_store=document_store_dense,
                                        embedding_model="dpr-bert-base-nq",
                                        do_lower_case=True, use_gpu=True)
document_store_dense.update_embeddings(dense_retriever)

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
    DEFAULT_TOP_K_READER,
    DEFAULT_TOP_K_RETRIEVER,
)
from backend.controller.autocomplete import addQuestionToAutocomplete

logger = logging.getLogger(__name__)
router = APIRouter()

document_store = ElasticsearchDocumentStore(
    host=DB_HOST,
    username=DB_USER,
    password=DB_PW,
    index=DB_INDEX,
    scheme=ES_CONN_SCHEME,
    ca_certs=False,
    verify_certs=False,
    text_field=TEXT_FIELD_NAME,
    search_fields=SEARCH_FIELD_NAME,
    embedding_dim=EMBEDDING_DIM,
    embedding_field=EMBEDDING_FIELD_NAME,
    excluded_meta_data=EXCLUDE_META_DATA_FIELDS,
)

# multilingual baseline retriever (=BM25)
retriever = ElasticsearchRetriever(document_store=document_store,
                                   embedding_model=None, gpu=USE_GPU)

# english retriever
english_retriever = ElasticsearchRetriever(
    document_store=document_store,
        # of the sentence and bridge anaphora resolution
        sent_segs = [f"{topic}: {sent}" for sent in sent_segs]
        out.extend(sent_segs)
    text = "\n".join(out)
    return text


if __name__ == '__main__':
    while True:
        try:
            # 512 dimensions because that is what the sentence transformer returns
            document_store = ElasticsearchDocumentStore(
                host="elasticsearch", username="", password="",
                index="document", embedding_dim=512,
                embedding_field="embedding")
            break
        except Exception:
            # Elasticsearch may still be starting up; retry after a pause
            time.sleep(15)
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model=retriever_model_name_full,
                                   model_format=retriever_model_type,
                                   gpu=False)
    if document_store.get_document_count() < 1:
        dicts = convert_files_to_dicts(dir_path=data_path,
                                       clean_func=clean_text,
                                       split_paragraphs=True)
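        # Continuation sketch (an assumption based on the identical indexing
        # flow in the other snippets here): write the converted dicts and let
        # the retriever generate the embeddings.
        document_store.write_documents(dicts)
        document_store.update_embeddings(retriever)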
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from flask import Flask, jsonify, request

from haystack import Finder
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever

document_store = ElasticsearchDocumentStore(host="elasticsearch", username="",
                                            password="", index="arxiv-qa")


def filter_answers(results: dict, details: str = "all"):
    answers = results["answers"]
    if details != "all":
        if details == "minimal":
            keys_to_keep = set(["answer", "context"])
        elif details == "medium":
            keys_to_keep = set(["answer", "context", "score"])
        else:
            keys_to_keep = answers.keys()

        # filter the results
        filtered_answers = []
        question_embedding = retriever.create_embedding(
            r["_source"]["question"])
        body = {"doc": {"question_emb": question_embedding}}
        document_store.client.update(index=document_store.index,
                                     id=r["_id"], body=body)


if __name__ == "__main__":
    document_store = ElasticsearchDocumentStore(
        host="localhost",
        username="",
        password="",
        index="document",
        text_field="answer",
        embedding_field="question_emb",
        embedding_dim=768,
        excluded_meta_data=["question_emb"],
    )

    MODEL = "deepset/sentence_bert"
    GPU = False
    retriever = ElasticsearchRetriever(document_store=document_store,
                                       embedding_model=MODEL,
                                       gpu=GPU,
                                       emb_extraction_layer=-2,
                                       pooling_strategy="reduce_mean")

    # index new docs
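    # A hedged sketch of the "index new docs" step above (the question/answer
    # strings are invented for illustration; the field names follow the store
    # configuration, and create_embedding is called the same way as at the
    # top of this snippet):
    new_doc = {
        "question": "How do I reset my password?",
        "answer": "Click 'Forgot password' on the login page.",
        "question_emb": retriever.create_embedding(
            "How do I reset my password?"),
    }
    document_store.write_documents([new_doc])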