Example No. 1
class QAPipeline:
    def __init__(self):
        self.document_store = ElasticsearchDocumentStore(host="localhost",
                                                         username="",
                                                         password="",
                                                         index="document")
        self.retriever = ElasticsearchRetriever(
            document_store=self.document_store)
        self.reader = FARMReader(
            model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
        self.finder = Finder(self.reader, self.retriever)
        print('Ready')

    def add_to_datastore_from_remote(self, data_url):
        return {'status': 'Not Implemented'}

    def add_to_datastore_local(self, data_path):
        json_data = read_json_data(data_path)
        es_data = create_data_dicts(json_data)
        self.document_store.write_documents(es_data)
        return {'status': 'Added To Datastore'}

    def answer(self, question, top_k_options=10, top_k_answers=3):
        prediction = self.finder.get_answers(question=question,
                                             top_k_retriever=top_k_options,
                                             top_k_reader=top_k_answers)
        results = extract_info_from_predictions(prediction)
        return results
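
Example No. 1 relies on three helpers that are not shown: read_json_data, create_data_dicts and extract_info_from_predictions. A minimal sketch of what they might look like, assuming the JSON file holds a list of objects with a "text" field and that the prediction follows Haystack's usual {"answers": [...]} dict layout:

import json

def read_json_data(data_path):
    # load the raw JSON file from disk
    with open(data_path, "r", encoding="utf-8") as f:
        return json.load(f)

def create_data_dicts(json_data):
    # map each entry onto the {"text": ..., "meta": ...} dicts expected by write_documents()
    return [{"text": entry["text"], "meta": entry.get("meta", {})} for entry in json_data]

def extract_info_from_predictions(prediction):
    # keep only the answer string, score and context of each candidate
    return [{"answer": a.get("answer"), "score": a.get("score"), "context": a.get("context")}
            for a in prediction.get("answers", [])]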
Example No. 2
def main():
    POPULATE_DOCUMENT_STORE = True

    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="",
                                                index="document",
                                                text_field="text",
                                                embedding_field="question_emb",
                                                embedding_dim=768,
                                                excluded_meta_data=["question_emb"])

    retriever = EmbeddingRetriever(
        document_store=document_store, embedding_model=os.getcwd() +
        "\\kbQA\\bert-german-model",
        gpu=True, model_format="transformers")

    if POPULATE_DOCUMENT_STORE:
        doc_dir = os.getcwd() + "\\kbQA\\data\\Skripte\\Securplus\\txt"
        dicts = convert_files_to_dicts(
            dir_path=doc_dir, clean_func=clean_text, split_paragraphs=True)

        with open("Output.txt", "w") as text_file:
            text = ""
            for doc in dicts:
                text = text + "\n" + doc["text"]
            text_file.write(text)
        df = pd.DataFrame.from_dict(dicts)

        # Careful here! At this point we do not create embeddings for the questions but for
        # the texts, i.e. the answers. That is why the variable names are somewhat confusingly chosen.
        # dummy_questions is simply an increasing number starting at one. It is needed because otherwise
        # exceptions are thrown during the search.
        # The tutorial seems to assume an FAQ setting in which question and answer are predefined,
        # so embeddings can be created for the predefined question and the k best candidates are
        # returned based on those alone. We, in contrast, create embeddings for every single text.
        # todo: Since we create embeddings for each text, we may have to perform sentence segmentation,
        #       because the longer the texts get, the less precise the embeddings become. Per-sentence
        #       embeddings are considerably more accurate.
        questions = list(df["text"].values)
        df["question_emb"] = retriever.create_embedding(texts=questions)
        dummy_questions = [f"{no}" for no, x in enumerate(questions, start=1)]
        df["question"] = dummy_questions
        print(df.head())

        docs_to_index = df.to_dict(orient="records")
        document_store.write_documents(docs_to_index)

    # question = "Wie viele haben Angst um ihren Job?"
    question = "welche leistungen sind ausgeschlossen?"
    # again: lowercasing is strictly required here!
    question = question.lower()

    # We currently cannot use a Reader, since those apparently require QA fine-tuning.
    # The retriever fetches the best matches based on the embeddings.
    # get_answers() cannot be used without a reader.
    finder = Finder(reader=None, retriever=retriever)
    prediction = finder.get_answers_via_similar_questions(
        question, top_k_retriever=5)
    print_answers(prediction, details="all")
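
The todo above suggests per-sentence embeddings for long texts. A minimal sketch of such a segmentation step, assuming a naive regex splitter is acceptable for the German source texts:

import re

def split_into_sentences(text):
    # naive splitter: break after ., ! or ? when followed by whitespace and an uppercase letter
    parts = re.split(r"(?<=[.!?])\s+(?=[A-ZÄÖÜ])", text.strip())
    return [p for p in parts if p]

def to_sentence_docs(dicts):
    # one document per sentence, so the retriever embeds short, more precise units
    return [{"text": sentence}
            for doc in dicts
            for sentence in split_into_sentences(doc["text"])]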
Example No. 3
def test_elasticsearch_write_read(elasticsearch_fixture):
    document_store = ElasticsearchDocumentStore()
    documents = convert_files_to_dicts(dir_path="samples/docs")
    document_store.write_documents(documents)
    sleep(2)  # wait for documents to be available for query
    documents = document_store.get_all_documents()
    assert len(documents) == 2
    assert documents[0].id
    assert documents[0].text
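
The elasticsearch_fixture used by these tests is not part of this listing. A plausible pytest fixture that starts a throwaway Elasticsearch container (container name, image tag and wait time are assumptions):

import subprocess
import time

import pytest

@pytest.fixture(scope="session")
def elasticsearch_fixture():
    # start a single-node Elasticsearch container for the test session
    subprocess.run(
        'docker run -d -p 9200:9200 -e "discovery.type=single-node" --name haystack_test elasticsearch:7.6.2',
        shell=True, check=False)
    time.sleep(15)  # give Elasticsearch time to come up
    yield
    # remove the container after the test session
    subprocess.run("docker rm -f haystack_test", shell=True, check=False)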
Example No. 4
def get_results(txt_files_location, use_gpu, questions_list, results_location):

    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document")
    for dirpath, dirnames, files in os.walk(txt_files_location):
        for dirname in dirnames:
            for dirpath, dirname, files in os.walk(
                    os.path.join(txt_files_location, dirname)):
                for file_name in files:
                    document_store.client.indices.delete(index='document',
                                                         ignore=[400, 404])

                    doc_dir = dirpath

                    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                                   clean_func=clean_wiki_text,
                                                   split_paragraphs=True)

                    document_store.write_documents(dicts)

                    retriever = ElasticsearchRetriever(
                        document_store=document_store)

                    reader = FARMReader(
                        model_name_or_path=
                        "elgeish/cs224n-squad2.0-albert-xxlarge-v1",
                        use_gpu=use_gpu)

                    finder = Finder(reader, retriever)

                    sys.stdout = open(
                        os.path.join(results_location,
                                     file_name[:-4] + "_results.txt"), "a+")

                    for i, question in enumerate(questions_list):

                        prediction = finder.get_answers(question=question,
                                                        top_k_retriever=10,
                                                        top_k_reader=1)

                        print("\n\n\nQuestion " + str(i + 1) + ":\n")
                        print(question + "\n")
                        print_answers(prediction, details="minimal")

                    sys.stdout.close()
                    # restore stdout so later output does not go to a closed file
                    sys.stdout = sys.__stdout__

    document_store.client.transport.close()
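
Reassigning sys.stdout as in Example No. 4 is easy to get wrong; a sketch of the same per-file result writing with contextlib.redirect_stdout, which restores stdout automatically (write_results is a hypothetical helper and reuses print_answers from the example's imports):

import os
from contextlib import redirect_stdout

def write_results(results_location, file_name, questions_list, finder):
    # append each question and its best answer to <file>_results.txt
    out_path = os.path.join(results_location, file_name[:-4] + "_results.txt")
    with open(out_path, "a+") as out, redirect_stdout(out):
        for i, question in enumerate(questions_list):
            prediction = finder.get_answers(question=question,
                                            top_k_retriever=10,
                                            top_k_reader=1)
            print("\n\n\nQuestion " + str(i + 1) + ":\n")
            print(question + "\n")
            print_answers(prediction, details="minimal")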
Example No. 5
def test_elasticsearch_custom_fields(elasticsearch_fixture):
    client = Elasticsearch()
    client.indices.delete(index='haystack_test_custom', ignore=[404])
    document_store = ElasticsearchDocumentStore(
        index="haystack_test_custom",
        text_field="custom_text_field",
        embedding_field="custom_embedding_field")

    doc_to_write = {
        "custom_text_field": "test",
        "custom_embedding_field": np.random.rand(768).astype(np.float32)
    }
    document_store.write_documents([doc_to_write])
    documents = document_store.get_all_documents()
    assert len(documents) == 1
    assert documents[0].text == "test"
    np.testing.assert_array_equal(doc_to_write["custom_embedding_field"],
                                  documents[0].embedding)
Example No. 6
        dicts = convert_files_to_dicts(dir_path=doc_dir,
                                       clean_func=clean_wiki_text,
                                       split_paragraphs=True)

        df = pd.DataFrame.from_dict(dicts)
        # Create embeddings for the document texts (stored in the "question_emb" field)
        questions = list(df["text"].values)
        df["question_emb"] = retriever.create_embedding(texts=questions)

        # Convert Dataframe to list of dicts and index them in our DocumentStore
        docs_to_index = df.to_dict(orient="records")
        # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
        # It must take a str as input, and return a str.

        # Now, let's write the docs to our DB.
        document_store.write_documents(docs_to_index)

    reader = TransformersReader(
        model="distilbert-base-uncased-distilled-squad",
        tokenizer="distilbert-base-uncased",
        use_gpu=-1)

    # Init reader & use Finder to get answer (same as in Tutorial 1)
    finder = Finder(reader=reader, retriever=retriever)

    prediction = finder.get_answers(question="Who is the father of Arya?",
                                    top_k_reader=3,
                                    top_k_retriever=5)

    print_answers(prediction, details="all")
Example No. 7

# NOTE: the opening of this ElasticsearchDocumentStore call is missing from the source; the
# host/username/password/index values below are assumed to match the other examples.
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="",
                                            index="document",
                                            embedding_dim=768,
                                            embedding_field="embedding")

# ## Cleaning & indexing documents
# Let's first get some documents that we want to query
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# convert files to dicts containing documents that can be indexed to our datastore
dicts = convert_files_to_dicts(dir_path=doc_dir,
                               clean_func=clean_wiki_text,
                               split_paragraphs=True)

# Now, let's write the docs to our DB.
document_store.write_documents(dicts[:16])

### Retriever
retriever = DensePassageRetriever(document_store=document_store,
                                  embedding_model="dpr-bert-base-nq",
                                  do_lower_case=True,
                                  use_gpu=True)
# Important:
# Now that we have the DPR retriever initialized, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation.
# While this can be a time-consuming operation (depending on corpus size), it only needs to be done once.
# At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.
document_store.update_embeddings(retriever)

### Reader
# Load a local model or any of the QA models on the Hugging Face model hub
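
The fragment above breaks off before the Reader is created. A hedged completion that follows the pattern of the other examples (the model choice and top_k values are assumptions):

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

finder = Finder(reader, retriever)
prediction = finder.get_answers(question="Who is the father of Arya?",
                                top_k_retriever=10, top_k_reader=5)
print_answers(prediction, details="minimal")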
Example No. 8
    # NOTE: the opening of this ElasticsearchDocumentStore call is missing from the source; the
    # host/username/password values below are assumed to match the other examples.
    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="",
                                                index="document")

    # ## Cleaning & indexing documents

    # Initialize Elasticsearch with docs
    if POPULATE_DOCUMENT_STORE:
        # set path to directory containing the text files
        doc_dir = os.getcwd() + "\\kbQA\\data\\tesla"
        # convert files to dicts containing documents that can be indexed to our
        # datastore
        dicts = convert_files_to_dicts(dir_path=doc_dir,
                                       clean_func=clean_wiki_text,
                                       split_paragraphs=True)

        # write the docs to the elasticsearch database
        document_store.write_documents(dicts)

    # ## Initialize Retriever, Reader & Finder
    # ### Retriever
    # Retrievers help narrow down the scope for the Reader to smaller units of text
    # where a given question could be answered.
    # We use Elasticsearch's default BM25 algorithm
    retriever = ElasticsearchRetriever(document_store=document_store)
    # ### Reader
    # A Reader scans the texts returned by retrievers in detail and extracts
    # the k best answers. It is based on a powerful, but slower deep learning model.
    reader = TransformersReader(model="dbmdz/bert-base-german-uncased",
                                tokenizer="dbmdz/bert-base-german-uncased",
                                use_gpu=-1)
    # ### Finder
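
Example No. 8 is cut off before the Finder is built. A hedged completion following the other examples; the query string is only a placeholder:

    finder = Finder(reader, retriever)
    # placeholder question against the Tesla documents indexed above
    prediction = finder.get_answers(question="was kostet das model 3?",
                                    top_k_retriever=10, top_k_reader=3)
    print_answers(prediction, details="minimal")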
Example No. 9
def main():
    HOST = 'localhost'
    PORT = 9200
    INDEX_NAME = 'wikipedia_en'

    from haystack import Finder
    from haystack.indexing.cleaning import clean_wiki_text
    from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
    from haystack.reader.farm import FARMReader
    from haystack.reader.transformers import TransformersReader
    from haystack.utils import print_answers
    from haystack.database.elasticsearch import ElasticsearchDocumentStore
    document_store = ElasticsearchDocumentStore(host=HOST,
                                                port=PORT,
                                                username="",
                                                password="",
                                                index=INDEX_NAME)

    # clear existing index (optional)
    # if document_store.client.indices.exists(index=document_store.index):
    #     print('clear existing index')
    #     document_store.client.indices.delete(index=document_store.index)

    # Get all dirs in wikipedia folder
    from os import listdir
    from os.path import isfile, join
    import json
    from tqdm import tqdm

    wikidata_path = "wikipedia"
    onlydirs = [
        f for f in listdir(wikidata_path) if not isfile(join(wikidata_path, f))
    ]

    dicts = []
    bulk_size = 5000

    pbar = tqdm(onlydirs)
    for directory in pbar:
        # the entries inside each folder are the actual text files
        files = [
            f for f in listdir(join(wikidata_path, directory))
            if isfile(join(wikidata_path, directory, f))
        ]
        pbar.set_description(f"Processing wikipedia folder {directory}")

        for file in files:
            with open(join(wikidata_path, directory, file), "r") as f:
                # Each text file contains json structures separated by EOL
                articles = f.read().split("\n")

            for article in articles:
                if len(article) == 0: continue

                # Article in json format
                json_formatted_article = json.loads(article)

                # Rename keys
                document = {
                    "id": json_formatted_article["id"],
                    "name": json_formatted_article["title"],
                    "url": json_formatted_article["url"],
                    "text": json_formatted_article["text"]
                }

                # Add document to bulk
                dicts.append(document)

                if len(dicts) >= bulk_size:
                    # Index bulk
                    try:
                        document_store.write_documents(dicts)
                    except Exception:
                        print("Bulk not indexed")
                    # empty the bulk either way so a failed batch is not retried on every pass
                    dicts.clear()

    if len(dicts) > 0:
        print('final round')
        document_store.write_documents(dicts)

    print('finished')
Example No. 10
import re

enable_elastic_search()

document_store_dense = ElasticsearchDocumentStore(host="localhost",
                                                  username="",
                                                  password="",
                                                  index="document",
                                                  embedding_field="embedding",
                                                  embedding_dim=768)
document_store_sparse = ElasticsearchDocumentStore(host="localhost",
                                                   username="",
                                                   password="",
                                                   index="document")

document_store_dense.write_documents(base_corpus())
dense_retriever = DensePassageRetriever(document_store=document_store_dense,
                                        embedding_model="dpr-bert-base-nq",
                                        do_lower_case=True,
                                        use_gpu=True)
document_store_dense.update_embeddings(dense_retriever)

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                    use_gpu=True)

alphabets = "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"
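
Example No. 10 stops after defining these regex fragments; they match the building blocks of a common rule-based sentence splitter. A sketch of how they could be combined, reusing the import re and the constants above (the substitution order and the omitted digit handling are assumptions):

def split_into_sentences(text):
    # rough rule-based splitter built on the regex fragments above
    text = " " + text + "  "
    text = text.replace("\n", " ")
    text = re.sub(prefixes, "\\1<prd>", text)   # "Dr." -> "Dr<prd>"
    text = re.sub(websites, "<prd>\\1", text)   # ".com" -> "<prd>com"
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = text.replace(".", ".<stop>").replace("?", "?<stop>").replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    return [s.strip() for s in text.split("<stop>") if s.strip()]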
Example No. 11
def main():
    # fetch model files if not present. not hosted in git repo
    # model_exists = os.path.isfile(
    #     './kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin')
    # if not model_exists:
    #     logging.info("Starting model download (about 700MB) ...")
    #     urllib.request.urlretrieve(
    #         "https://cdn.huggingface.co/mrm8488/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin",
    #         "./kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin")
    #     logging.info("model successfully downloaded")
    # start Elasticsearch
    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run(
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" --name "MLQA2" elasticsearch:7.6.2',
            shell=True
        )
        if status.returncode:
            raise Exception("Failed to launch Elasticsearch. If you want to "
                            "connect to an existing Elasticsearch instance "
                            "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)

    # 512 dimensions because that is what the sentence transformer returns
    document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                                password="", index="document",
                                                embedding_dim=512,
                                                embedding_field="embedding")

    # load docs in database
    if LAUNCH_ELASTICSEARCH or POPULATE_DOCUMENT_STORE:
        dicts = convert_files_to_dicts(
            dir_path=data_path, clean_func=clean_text, split_paragraphs=True)

        logging.info("files to dicts done.")
        # write dicts containing the texts to the database
        document_store.write_documents(dicts)
        logging.info("documents to store written.")

        retriever = EmbeddingRetriever(document_store=document_store,
                                       embedding_model=retriever_model_name_full,
                                       model_format=retriever_model_type,
                                       gpu=False)
        # generate embeddings for each text and add it to the databse entry
        document_store.update_embeddings(retriever)
        logging.info("embeddings to documents in store written.")

    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model=retriever_model_name_full,
                                   model_format=retriever_model_type,
                                   gpu=False)

    # The reader is not used for retrieval here because answers take longer and the quality is worse,
    # so it is left commented out.
    # reader = TransformersReader(model="./kbQA/" + reader_model_name,
    #                             tokenizer="./kbQA/" + reader_model_name,
    #                             use_gpu=-1)
    finder = Finder(retriever=retriever, reader=None)

    if TEST:
        try:
            with open("./kbQA/Test.json", encoding="utf-8") as file:
                times = []
                results = []
                failed = []
                # each line holds one JSON object; read the file line by line
                for line in file:
                    # load the JSON string of the current line as a Python object
                    data = json.loads(line)
                    q = data["question"]
                    # fetch results from db
                    start_time = time.process_time()
                    candidate_docs = finder.retriever.retrieve(
                        query=q, filters=None, top_k=5)
                    end_time = time.process_time()
                    times.append(end_time-start_time)
                    answered = False
                    for doc in candidate_docs:
                        if data["answer"] in doc.text:
                            answered = True
                            results.append(True)
                            break
                    if not answered:
                        answers = []
                        for doc in candidate_docs:
                            answers.append(doc.text)
                        failed.append(
                            {"q": q, "correct": data["answer"], "a": answers})
                total = sum(times)
                logging.info("Average time per request: %f",
                             total / len(times))
                logging.info("Questions answered correctly: %d/%d (%f)",
                             len(results), len(times), len(results)/len(times))
                logging.info("Failed questions:")
                for fail in failed:
                    logging.info("Question: %s", fail["q"])
                    logging.info("Correct Answer: %s", fail["correct"])
                    for answer in fail["a"]:
                        logging.info(answer)

        except Exception as e:
            traceback.print_exc()
            logging.error(f"exception: {e}")
    else:
        # loop until a KeyboardInterrupt (ctrl+c) or "!q" is entered
        while True:
            try:
                # read a query from console input
                q = input("Enter:").strip()
                # input "!q" to stop execution
                if q == "!q":
                    exit(0)
                # fetch results from db
                candidate_docs = finder.retriever.retrieve(
                    query=q, filters=None, top_k=5)
                for doc in candidate_docs:
                    logging.info("doc id: %s", doc.id)
                    logging.info("doc meta name: %s", doc.meta["name"])
                    logging.info("doc text: %s", doc.text)
                    logging.info("doc query score: %s", doc.query_score)
                    logging.info("")
                # not used
                # prediction = finder.get_answers(
                #     question=q, top_k_retriever=10, top_k_reader=5)
                # print_answers(prediction, details="medium")
            except Exception as e:
                traceback.print_exc()
                logging.error(f"exception: {e}")
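
The TEST branch above expects ./kbQA/Test.json to contain one JSON object per line with "question" and "answer" keys. A minimal sketch of producing such a file; the sample content is made up:

import json

samples = [
    {"question": "welche leistungen sind ausgeschlossen?", "answer": "Leistungsausschluss"},
]
with open("./kbQA/Test.json", "w", encoding="utf-8") as f:
    for sample in samples:
        # one JSON object per line, as expected by the test loop
        f.write(json.dumps(sample, ensure_ascii=False) + "\n")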
Example No. 12
def main(data_dir, bulk_size, paragraph=False):
    document_store = ElasticsearchDocumentStore(
        host="localhost",
        username="",
        password="",
        index="wikipedia" + ("_paragraph" if paragraph else ""))
    if document_store.client.indices.exists(index=document_store.index):
        logger.info(document_store.index)
        logger.warning(
            f"Index {document_store.index} already exists, deleting the index."
        )
        document_store.client.indices.delete(index=document_store.index)

    # Get all dirs in wikipedia folder
    only_dirs = [f for f in listdir(data_dir) if not isfile(join(data_dir, f))]

    dicts = []
    counts = dict(documents=0, paragraphs=0)
    progress_bar = tqdm(only_dirs)
    for directory in progress_bar:
        # the entries inside each folder are the actual text files
        files = [
            f for f in listdir(join(data_dir, directory))
            if isfile(join(data_dir, directory, f))
        ]
        progress_bar.set_description(
            f"Processing wikipedia folder {directory}")

        for file in files:
            with open(join(data_dir, directory, file), "r") as f:
                # Each text file contains json structures separated by EOL
                articles = f.read().split("\n")

            for article in articles:
                if len(article) == 0:
                    continue

                # Article in json format
                json_formatted_article = json.loads(article)

                base_document = {
                    "id": json_formatted_article["id"],
                    "name": json_formatted_article["title"],
                    "url": json_formatted_article["url"],
                }
                counts["documents"] += 1
                if paragraph:
                    """
                    - Paragraphs are separated by two new-line characters.
                    - The first paragraph is always the title --> remove!
                    - Some paragraphs only contain whitespace --> ignore
                    """
                    paragraphs = [
                        p.strip() for pid, p in enumerate(
                            json_formatted_article["text"].split("\n\n"))
                        if pid > 0 and p.strip()
                    ]
                    counts["paragraphs"] += len(paragraphs)
                    for pid, p in enumerate(paragraphs):
                        document = {
                            **base_document, "paragraph_id": pid,
                            "text": p
                        }

                        # Add document to bulk
                        dicts.append(document)

                else:
                    # Rename keys
                    document = {
                        **base_document, "text": json_formatted_article["text"]
                    }

                    # Add document to bulk
                    dicts.append(document)

                if len(dicts) >= bulk_size:
                    # Index bulk
                    try:
                        document_store.write_documents(dicts)
                    except Exception:
                        logger.warning("Bulk not indexed")

                    # Empty bulk
                    dicts = []

    # index the last partial batch
    if dicts:
        try:
            document_store.write_documents(dicts)
        except Exception:
            logger.warning("Bulk not indexed")

    logger.info("==" * 100)
    logger.info("Indexing done.")
    logger.info(f"# documents: {counts['documents']}")
    if paragraph and counts['documents']:
        logger.info(
            f"# paragraphs: {counts['paragraphs']}, "
            f"{counts['paragraphs'] / counts['documents']:.2f} per document")
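
A possible way to invoke the indexing script in Example No. 12; the argument values are placeholders taken from the similar wikipedia example above:

if __name__ == "__main__":
    # index one document per paragraph, writing batches of 5000 documents at a time
    main(data_dir="wikipedia", bulk_size=5000, paragraph=True)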