def main():
    POPULATE_DOCUMENT_STORE = True
    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="",
                                                index="document",
                                                text_field="text",
                                                embedding_field="question_emb",
                                                embedding_dim=768,
                                                excluded_meta_data=["question_emb"])
    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model=os.getcwd() + "\\kbQA\\bert-german-model",
        gpu=True,
        model_format="transformers")
    if POPULATE_DOCUMENT_STORE:
        doc_dir = os.getcwd() + "\\kbQA\\data\\Skripte\\Securplus\\txt"
        dicts = convert_files_to_dicts(
            dir_path=doc_dir, clean_func=clean_text, split_paragraphs=True)
        with open("Output.txt", "w") as text_file:
            text = ""
            for doc in dicts:
                text = text + "\n" + doc["text"]
            text_file.write(text)
        df = pd.DataFrame.from_dict(dicts)
        # Careful here! At this point we do not create embeddings for the questions but for
        # the texts, i.e. the answers. The variable names are therefore somewhat confusing.
        # dummy_questions is just an increasing number starting at one. It is needed because
        # otherwise exceptions are thrown during search.
        # The tutorial seems to assume an FAQ setting in which question and answer are both
        # predefined, so embeddings can be created for the predefined questions and the k best
        # candidates are returned based on those alone. We, in contrast, create embeddings for
        # every single text.
        # TODO: Since we create embeddings for each text, we may have to perform sentence
        # segmentation, because the longer the texts get, the less precise the embeddings
        # become. Per-sentence embeddings are considerably more exact.
        questions = list(df["text"].values)
        df["question_emb"] = retriever.create_embedding(texts=questions)
        dummy_questions = [f"{no}" for no, _ in enumerate(questions, start=1)]
        df["question"] = dummy_questions
        print(df.head())
        docs_to_index = df.to_dict(orient="records")
        document_store.write_documents(docs_to_index)

    # question = "Wie viele haben Angst um ihren Job?"
    question = "welche leistungen sind ausgeschlossen?"
    # here as well: lowercasing is strictly required!
    question = question.lower()
    # We currently cannot use a reader, since readers apparently require QA fine-tuning.
    # The retriever fetches the best hits based on the embeddings.
    # get_answers() is not usable without a reader.
    finder = Finder(reader=None, retriever=retriever)
    prediction = finder.get_answers_via_similar_questions(
        question, top_k_retriever=5)
    print_answers(prediction, details="all")
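The TODO above suggests embedding individual sentences rather than whole paragraphs. Below is a minimal sketch of that segmentation step, assuming NLTK and its German "punkt" data are installed; split_into_sentence_docs is a hypothetical helper, not part of the original script.

from nltk.tokenize import sent_tokenize

def split_into_sentence_docs(dicts):
    # Split each converted document into sentences so that create_embedding()
    # works on short, precise units instead of whole paragraphs (see TODO above).
    sentence_docs = []
    for doc in dicts:
        for sentence in sent_tokenize(doc["text"], language="german"):
            # carry the meta data along so the source file can still be traced
            sentence_docs.append({"text": sentence, "meta": doc.get("meta", {})})
    return sentence_docs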
def test_sql_write_read():
    sql_document_store = SQLDocumentStore()
    documents = convert_files_to_dicts(dir_path="samples/docs")
    sql_document_store.write_documents(documents)
    documents = sql_document_store.get_all_documents()
    assert len(documents) == 2
    doc = sql_document_store.get_document_by_id("1")
    assert doc.id
    assert doc.text
def test_elasticsearch_write_read(elasticsearch_fixture):
    document_store = ElasticsearchDocumentStore()
    documents = convert_files_to_dicts(dir_path="samples/docs")
    document_store.write_documents(documents)
    sleep(2)  # wait for documents to be available for query
    documents = document_store.get_all_documents()
    assert len(documents) == 2
    assert documents[0].id
    assert documents[0].text
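The elasticsearch_fixture parameter is supplied elsewhere in the test suite. A minimal sketch of how such a pytest fixture could look, assuming Docker is available; the container name, image tag, and wait time are assumptions mirroring the launch code later in this section, not the original fixture.

import subprocess
import time

import pytest

@pytest.fixture()
def elasticsearch_fixture():
    # Launch a single-node Elasticsearch container for the test and give it time to boot.
    subprocess.run(
        'docker run -d -p 9200:9200 -e "discovery.type=single-node" '
        '--name "test_elasticsearch" elasticsearch:7.6.2',
        shell=True, check=False)
    time.sleep(15)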
def get_results(txt_files_location, use_gpu, questions_list, results_location):
    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="",
                                                index="document")
    for dirpath, dirnames, files in os.walk(txt_files_location):
        for dirname in dirnames:
            for dirpath, dirname, files in os.walk(
                    os.path.join(txt_files_location, dirname)):
                for file_name in files:
                    # drop any previous index so each file is evaluated in isolation
                    document_store.client.indices.delete(index='document', ignore=[400, 404])
                    doc_dir = dirpath
                    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                                   clean_func=clean_wiki_text,
                                                   split_paragraphs=True)
                    document_store.write_documents(dicts)
                    retriever = ElasticsearchRetriever(document_store=document_store)
                    reader = FARMReader(
                        model_name_or_path="elgeish/cs224n-squad2.0-albert-xxlarge-v1",
                        use_gpu=use_gpu)
                    finder = Finder(reader, retriever)
                    # redirect stdout to the per-file results file
                    sys.stdout = open(
                        os.path.join(results_location, file_name[:-4] + "_results.txt"),
                        "a+")
                    for i, question in enumerate(questions_list):
                        prediction = finder.get_answers(question=question,
                                                        top_k_retriever=10,
                                                        top_k_reader=1)
                        print("\n\n\nQuestion " + str(i + 1) + ":\n")
                        print(question + "\n")
                        print_answers(prediction, details="minimal")
                    sys.stdout.close()
    document_store.client.transport.close()
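A possible invocation of get_results; the paths and questions below are placeholders for illustration, not values from the original code.

if __name__ == "__main__":
    # placeholder question list and directories
    example_questions = [
        "Who is the father of Arya Stark?",
        "Who created the Dothraki vocabulary?",
    ]
    get_results(txt_files_location="data/txt",
                use_gpu=False,
                questions_list=example_questions,
                results_location="results")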
def fetch_data_from_repo(
        doc_dir="resources/data/website_data/",
        s3_url="https://github.com/Thabo-5/Chatbot-scraper/raw/master/txt_files.zip",
        doc_store=FAISSDocumentStore()):
    """
    Function to download data from an S3 bucket / GitHub.

    Parameters
    ----------
    doc_dir (str): path to destination folder
    s3_url (str): path to download zipped data
    doc_store (class): Haystack document store

    Returns
    -------
    document_store (object): Haystack document store object
    """
    document_store = doc_store
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
    import os
    # Re-read and rewrite each file to normalize its encoding to UTF-8.
    for filename in os.listdir(path=doc_dir):
        with open(os.path.join(doc_dir, filename), 'r', encoding='utf-8',
                  errors='replace') as file:
            text = file.read()
        with open(os.path.join(doc_dir, filename), 'w', encoding='utf-8',
                  errors='replace') as file:
            file.write(text)
    # Convert files to dicts
    dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    # Now, let's write the dicts containing documents to our DB.
    document_store.write_documents(dicts)
    return document_store
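A brief usage sketch relying only on the defaults above; get_document_count() is a standard Haystack document store method.

# Download, convert, and index the scraped text files, then report how many documents landed in the store.
document_store = fetch_data_from_repo()
print(f"Indexed {document_store.get_document_count()} documents")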
    excluded_meta_data=["question_emb"])

# Create a Retriever using embeddings
# Instead of retrieving via Elasticsearch's plain BM25, we want to use vector similarity of
# the questions (user question vs. FAQ ones).
# We can use the `EmbeddingRetriever` for this purpose and specify a model that we use for
# the embeddings.
# retriever = EmbeddingRetriever(document_store=document_store,
#                                embedding_model="deepset/sentence_bert", gpu=True)
if POPULATE_DOCUMENT_STORE:
    # set path to the directory containing the text files
    doc_dir = os.getcwd() + "\\kbQA\\data\\article_txt_got"
    # convert files to dicts containing documents that can be indexed into our datastore
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    df = pd.DataFrame.from_dict(dicts)
    # Get embeddings for our questions from the FAQs
    questions = list(df["text"].values)
    df["question_emb"] = retriever.create_embedding(texts=questions)
    # Convert the DataFrame to a list of dicts and index them in our DocumentStore
    docs_to_index = df.to_dict(orient="records")
    # You can optionally supply a cleaning function that is applied to each doc
    # (e.g. to remove footers). It must take a str as input, and return a str.
    # Now, let's write the docs to our DB.
    document_store.write_documents(docs_to_index)
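This snippet stops after indexing. A possible follow-up query step, mirroring the first snippet in this section; it assumes the commented-out EmbeddingRetriever above has been created and bound to retriever, and that Finder and print_answers are imported as in the other snippets.

# Query via embedding similarity; the question string is just an example for the
# Game of Thrones articles indexed above.
finder = Finder(reader=None, retriever=retriever)
prediction = finder.get_answers_via_similar_questions(
    "who is the father of arya stark?", top_k_retriever=5)
print_answers(prediction, details="minimal")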
host="elasticsearch", username="", password="", index="document", embedding_dim=512, embedding_field="embedding") break except: time.sleep(15) retriever = EmbeddingRetriever(document_store=document_store, embedding_model=retriever_model_name_full, model_format=retriever_model_type, gpu=False) if document_store.get_document_count() < 1: dicts = convert_files_to_dicts(dir_path=data_path, clean_func=clean_text, split_paragraphs=True) logging.info("files to dicts done.") # write dicts containing the texts to the database document_store.write_documents(dicts) logging.info("documents to store written.") # generate embeddings for each text and add it to the databse entry document_store.update_embeddings(retriever) logging.info("embeddings to documents in store written.") finder = Finder(retriever=retriever, reader=None) api.run(host='0.0.0.0', port=8000, debug=True)
def main():
    # fetch model files if not present. not hosted in the git repo
    # model_exists = os.path.isfile(
    #     './kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin')
    # if not model_exists:
    #     logging.info("Starting model download (about 700MB) ...")
    #     urllib.request.urlretrieve(
    #         "https://cdn.huggingface.co/mrm8488/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin",
    #         "./kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin")
    #     logging.info("model successfully downloaded")

    # start Elasticsearch
    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run(
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" --name "MLQA2" elasticsearch:7.6.2',
            shell=True)
        if status.returncode:
            raise Exception("Failed to launch Elasticsearch. If you want to "
                            "connect to an existing Elasticsearch instance "
                            "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)

    # 512 dimensions because that is what the sentence transformer returns
    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="",
                                                index="document",
                                                embedding_dim=512,
                                                embedding_field="embedding")
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model=retriever_model_name_full,
                                   model_format=retriever_model_type,
                                   gpu=False)

    # load docs into the database
    if LAUNCH_ELASTICSEARCH or POPULATE_DOCUMENT_STORE:
        dicts = convert_files_to_dicts(
            dir_path=data_path, clean_func=clean_text, split_paragraphs=True)
        logging.info("files to dicts done.")
        # write dicts containing the texts to the database
        document_store.write_documents(dicts)
        logging.info("documents to store written.")
        # generate embeddings for each text and add them to the database entries
        document_store.update_embeddings(retriever)
        logging.info("embeddings to documents in store written.")

    # The reader is not used for retrieval because results take longer and the quality
    # is worse; it would still have to be initialized:
    # reader = TransformersReader(model="./kbQA/" + reader_model_name,
    #                             tokenizer="./kbQA/" + reader_model_name,
    #                             use_gpu=-1)
    finder = Finder(retriever=retriever, reader=None)

    if TEST:
        try:
            # each line has multiple paragraphs and embeddings; read the file line by line
            # (the expected per-line format is sketched after this function)
            with open("./kbQA/Test.json", encoding="utf-8") as file:
                times = []
                results = []
                failed = []
                for line in file:
                    # load the JSON string of the current line as a Python object
                    data = json.loads(line)
                    q = data["question"]
                    # fetch results from the db and measure the retrieval time
                    start_time = time.process_time()
                    candidate_docs = finder.retriever.retrieve(
                        query=q, filters=None, top_k=5)
                    end_time = time.process_time()
                    times.append(end_time - start_time)
                    answered = False
                    for doc in candidate_docs:
                        if data["answer"] in doc.text:
                            answered = True
                            results.append(True)
                            break
                    if not answered:
                        answers = []
                        for doc in candidate_docs:
                            answers.append(doc.text)
                        failed.append(
                            {"q": q, "correct": data["answer"], "a": answers})
                total = 0
                for zeit in times:
                    total = total + zeit
                logging.info("Average time per request: %f", total / len(times))
                logging.info("Questions answered correctly: %d/%d (%f)",
                             len(results), len(times), len(results) / len(times))
                logging.info("Failed questions:")
                for fail in failed:
                    logging.info("Question: %s", fail["q"])
                    logging.info("Correct Answer: %s", fail["correct"])
                    for answer in fail["a"]:
                        logging.info(answer)
        except Exception as e:
            traceback.print_exc()
            logging.error(f"exception: {e}")
    else:
        # loop until a KeyboardInterrupt (Ctrl+C) or "!q" input
        while True:
            try:
                # read input from the console
                q = input("Enter:").strip()
                # input "!q" to stop execution
                if q == "!q":
                    exit(0)
                # fetch results from the db
                candidate_docs = finder.retriever.retrieve(
                    query=q, filters=None, top_k=5)
                for doc in candidate_docs:
                    logging.info("doc id: %s", doc.id)
                    logging.info("doc meta name: %s", doc.meta["name"])
                    logging.info("doc text: %s", doc.text)
                    logging.info("doc query score: %s", doc.query_score)
                    logging.info("")
                # not used
                # prediction = finder.get_answers(
                #     question=q, top_k_retriever=10, top_k_reader=5)
                # print_answers(prediction, details="medium")
            except Exception as e:
                traceback.print_exc()
                logging.error(f"exception: {e}")
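As noted in the TEST branch above, ./kbQA/Test.json is read line by line, with one JSON object per line containing "question" and "answer" keys. A sketch of that format follows; the values are invented placeholders.

import json

# One line of ./kbQA/Test.json as the evaluation loop expects it; the answer text
# here is a made-up placeholder.
example_line = json.dumps(
    {"question": "welche leistungen sind ausgeschlossen?",
     "answer": "Leistungen sind ausgeschlossen, wenn ..."},
    ensure_ascii=False)
print(example_line)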
def write_storage():
    dicts = convert_files_to_dicts(dir_path=doc_dir, split_paragraphs=True)
    document_store.write_documents(dicts)