Exemplo n.º 1
0
def evaluate_testset(
        input_file, model_file,
        output_file="D:\\Bachelor\\eric\\output\\testrestults_survey_8_to_11.txt"):
    """Run the NLP model against a test set and write the analysis to a file.

    Args:
        input_file: file name(s) accepted by merge_input_files(); yields the
            (gold, sentence, ...) tuples to evaluate.
        model_file: path of the fasttext model to load into Eric_nlp.
        output_file: destination for the result lines. Defaults to the
            previously hard-coded path, so existing callers are unaffected.
    """
    sentences = merge_input_files(input_file)
    eric = eric_nlp.Eric_nlp()
    eric.load_model(model_file)
    # English stanza pipeline for dependency parsing
    sp = depparse.init_stanza("en")

    results = analyze_sentences(sentences, eric, sp)

    list_to_file(results, output_file)
Exemplo n.º 2
0
    def __init__(self, mlObject):
        """Wire up the CLIPS rule engine and the NLP components for this object.

        Args:
            mlObject: the machine-learning wrapper this instance explains.
        """
        # Rule-engine side: fresh CLIPS environment plus the fact/type names
        # used when asserting facts into it.
        self.env = clips.Environment()
        self.fact_type = "input ui"
        self.clips_type = "symbol"

        # ML wrapper and the function dictionary (module-level `dictionary`
        # — presumably the NLP function templates; confirm against module).
        self.mlObject = mlObject
        self.set_of_functions = dictionary

        # NLP side: classifier model plus English dependency parsing.
        self.nlp = eric_nlp.Eric_nlp()
        self.nlp.load_model(self.nlp.nlp_model_file)
        self.nlp.init_depparsing("en")
Exemplo n.º 3
0
def compare_similarity_models(models, model_path, sentences):
    """Compare several similarity models on the same sentence set.

    For every model, counts how often the top similarity result matches the
    gold label, writes the misclassified sentences to a per-model file and
    prints a per-model accuracy summary at the end.

    Args:
        models: iterable of model file names.
        model_path: directory prefix prepended to each model name.
        sentences: iterable of (gold, sentence, origin) tuples.
    """
    disable_print_function()
    sent_count = len(sentences)
    eric = eric_nlp.Eric_nlp()
    out = []
    for model in models:
        # First element is a literal column-format template (not an f-string
        # by design) so readers of the output file know the line layout.
        wrong_out = [
            "{sim_result[0]}|{result}|{gold}: '{sent}' => {origin} ({sim_result[1][0]})"
        ]
        # Three empty strings act as visual separators in the output file.
        wrong_out_result_none = ["", "", ""]
        wrong_out_gold_none = ["", "", ""]
        eric.load_model(f"{model_path}{model}")
        correct_count = 0
        for gold, sent, origin in sentences:
            result, sim_result, dep_result, time_elapsed = eric.map_to_eric_function(
                sent, analytics=True)
            if sim_result[0] == gold:
                correct_count += 1
            else:
                print(f"'{sim_result[0]}' != '{gold}'")
                if result == gold:
                    print(f"     but {result} == {gold}")
                else:
                    # Hoisted: this identical line was previously rebuilt in
                    # each of the three branches below.
                    detail_line = f"{sim_result[0]}|{result}|{gold}: '{sent}' => {origin} ({sim_result[1][0]})"
                    if gold == "none":
                        wrong_out_gold_none.append(detail_line)
                    elif result == "none":
                        wrong_out_result_none.append(detail_line)
                    else:
                        wrong_out.append(detail_line)

        # Guard against an empty sentence list (previously ZeroDivisionError).
        correct_percent = correct_count * 100.0 / sent_count if sent_count else 0.0
        # Built once; was previously formatted three separate times.
        summary = f"Model '{model}': {correct_count}/{sent_count} correct ({correct_percent}%)"
        out.append(summary)
        print(summary)
        wrong_out.insert(0, summary)
        wrong_out.extend(wrong_out_result_none)
        wrong_out.extend(wrong_out_gold_none)
        list_to_file(wrong_out,
                     f"D:\\Bachelor\\eric\\output\\dev_results_{model}.txt")

    for o in out:
        print(o)
Exemplo n.º 4
0
def test_preprocessing():
    """Interactive console loop to try out Eric_nlp.preprocessing().

    Reads sentences from stdin, prints the preprocessed form and the current
    placeholders; any of the usual exit commands ends the loop.
    """
    eric = eric_nlp.Eric_nlp()
    exit_commands = ("exit", "exit()", "quit", "quit()", "end", "end()")
    while True:
        print("Write a sentence to test preprocessing")
        usr_in = input()
        if usr_in.lower() in exit_commands:
            break
        preprocessed = eric.preprocessing(usr_in, "usr_input")
        print(f"preprocessed: '{preprocessed}'")
        print(f"placeholders: {eric.placeholders}")

    print("Goodbye.")
Exemplo n.º 5
0
def debug_depparsed_sentences_to_console():
    """Interactive loop: dependency-parse console input and print the tree.

    For each sentence the preprocessed form, the detected root word and the
    parse output are printed; on exit, all entered sentences are echoed back.
    """
    # NOTE(review): German pipeline ("de") while preprocessing uses the
    # English-style "usr_input" mode — confirm this mismatch is intended.
    pipeline = init_stanza("de")

    eric = eric_nlp.Eric_nlp()
    sentence_list = ["Used sentences:"]
    print("Please provide input:")
    while True:
        usr_in = input()

        if not usr_in:
            print("no input given")
            continue
        elif usr_in.lower() in ["exit", "exit()", "quit", "quit()", "end", "end()"]:
            break

        sentence_list.append(usr_in)
        preprocessed = eric.preprocessing(usr_in, "usr_input")
        print(f"preprocessed: {preprocessed}")

        out, _ = depparse([preprocessed], pipeline)

        # Extract the root word: the line containing "id: 0" describes the
        # root and holds "word: <root> lemma: ...". Last matching line wins,
        # as in the original behavior.
        root = ""
        for o in out:
            if "id: 0" in o:
                finder = "word: "
                ender = "lemma: "
                index = o.find(finder)
                index_end = o.find(ender)
                # Guard: only slice when both markers are actually present
                # (str.find returns -1 otherwise and would corrupt the slice).
                if index != -1 and index_end != -1:
                    root = o[index + len(finder):index_end].strip()

        if not root:
            root = "root not found"

        print(f"Root: {root}")
        # Skip the first three header lines of the depparse output.
        for o in out[3:]:
            print(o)
    print("Goodbye")
    for sent in sentence_list:
        print(sent)
Exemplo n.º 6
0
def main():
    """Entry point: currently runs the interactive depparse debugger.

    Everything after quit() is an older root-analysis pipeline kept around
    as (deliberately unreachable) dead code.
    """
    # BUG FIX: the original line was `debug_depparsed_sentences_to_console`
    # without parentheses — a no-op expression statement that never ran the
    # debugger before quitting.
    debug_depparsed_sentences_to_console()
    quit()

    # --- unreachable legacy pipeline below (left intact on purpose) ---
    input_language = "en"
    stanza_pipeline = init_stanza(input_language)
    eric = eric_nlp.Eric_nlp()
    input_path = "data\\"
    input_files = [f"{input_path}umfrage_input_{x}_cleaned.txt" for x in range(1,5)]
    input_files.append(f"{input_path}manually_added.txt")
    output_path = "output\\depparse\\data_analysis\\"
    roots_out_file = f"{output_path}roots.csv"

    # Deduplicate the merged (fct_id, sentence) tuples.
    input_accumulated = test_stuff.merge_input_files(input_files)
    input_accumulated = list(set(input_accumulated))
    # Group sentences by function id: {fct_id: [sentence, ...]}.
    input_accumulated_as_dict = {}
    for fct_id, sentence in input_accumulated:
        input_accumulated_as_dict.setdefault(fct_id, []).append(sentence)

    # Keys are root words; values are dicts keyed by function_id.
    all_roots = dict()
    for fct_id, unpreprocessed_sentences in input_accumulated_as_dict.items():
        preprocessed_sentences = [eric.preprocessing(x, "usr_input") for x in unpreprocessed_sentences]

        dep_output, roots = depparse(preprocessed_sentences, stanza_pipeline)

        preface = [f"{v}: {k}" for k, v in roots.items()]

        # Merge this function's roots into the global root table.
        all_roots = extend_roots(all_roots, roots, fct_id)

        all_output = ["Used Input:"] + input_files + ["\n"] + preface + dep_output
        for o in all_output:
            print(o)

    create_roots_matrix(all_roots, roots_out_file, empty_cell="")
    print(all_roots)
Exemplo n.º 7
0
def evaluate_fasttext_models(model_files, model_path, output_path, sentences):
    """Evaluate a list of fasttext models and write per-model reports.

    For each model: loads it, analyzes all sentences, writes the analysis and
    a confusion matrix (with per-function precision/recall/F-measure) to
    files under output_path, and finally prints an accuracy comparison.

    Args:
        model_files: iterable of model file names.
        model_path: directory prefix prepended to each model name.
        output_path: directory prefix for the report files.
        sentences: the test sentences handed to analyze_sentences().
    """
    eric = eric_nlp.Eric_nlp()
    sp = depparse.init_stanza("en")
    # Header row doubles as the first entry of the comparison table.
    comparison_dict = {"model": "accuracy"}

    for model in model_files:
        print("=" * 20)
        print(f"loading model: {model_path}{model}")

        time_start = time.time()
        eric.load_model(f"{model_path}{model}")
        time_end = time.time()

        model_load_string = f" Model Load Time: {time_end - time_start}s"

        # Silence the heavy per-sentence output during analysis.
        disable_print_function()
        output, sent_count, correct_count, correct_ratio, confusion = analyze_sentences(
            sentences, eric, sp, sendback_accuracy=True)
        enable_print_function()

        # Slot the load time into the report header produced by
        # analyze_sentences (position 8 by its known layout).
        output.insert(8, model_load_string)

        confusion_output = []
        tab = "\t"

        # confusion[k][k2] counts how often k got predicted when k2 was gold,
        # so confusion[k][k] holds the true positives for function k.
        column_names = "\t\t"
        for k, v in confusion.items():
            out_str = f"{k}{tab}"
            column_names += f"{k}{tab}"
            for k2, v2 in v.items():
                out_str += f"{v2}{tab}"
            confusion_output.append(out_str)
        confusion_output.insert(0, column_names)
        confusion_output.append("")
        confusion_output.append("PRECISION AND RECALL PER FUNCTION")

        for k, v in confusion.items():
            confusion_output.append(f"[{k}]")

            tp = confusion[k][k]  #true positive
            # tp_fp: all the times k was predicted (row sum) = TP + FP.
            tp_fp = 0
            for k_predicted in v.values():
                tp_fp += k_predicted
            # tp_fn: all the times k was gold (column sum) = TP + FN.
            tp_fn = 0
            for row in confusion.values():
                tp_fn += row[k]

            # CALCULATE — "n/A" marks undefined metrics.
            precision = tp / tp_fp if tp_fp > 0 else "n/A"
            recall = tp / tp_fn if tp_fn > 0 else "n/A"
            # BUG FIX: when tp == 0 but both denominators were > 0,
            # precision + recall was 0 and the original raised
            # ZeroDivisionError; guard the harmonic-mean denominator.
            if precision != "n/A" and recall != "n/A" and (precision + recall) > 0:
                f_measure = 2 * (precision * recall) / (precision + recall)
            else:
                f_measure = "n/A"

            confusion_output.append(f"Precision: {precision}")
            confusion_output.append(f"   Recall: {recall}")
            confusion_output.append(f"F-Measure: {f_measure}")
            confusion_output.append("")

        comparison_dict[model] = correct_ratio
        list_to_file(output, f"{output_path}{model}.txt")
        list_to_file(confusion_output, f"{output_path}{model}_confusion.txt")
        print(f"{correct_count}/{sent_count} correct. {correct_ratio}%")
        print()

    print("COMPARISON RESULTS:")
    for key, val in comparison_dict.items():
        print(f"{key}\t\t{val}")
Exemplo n.º 8
0
def similarity_depparse_combination(eric,
                                    model,
                                    method,
                                    output_path,
                                    output_name,
                                    in_files,
                                    merge_in_files,
                                    stanza_pipeline="",
                                    lang="en"):
    """Classify test sentences via similarity first, falling back to
    dependency parsing when the similarity score is below the threshold,
    and write a detailed per-sentence report per test set.

    Args:
        eric: NOTE(review) — this parameter is immediately overwritten by a
            fresh Eric_nlp() below, so the caller's instance is ignored;
            confirm whether that is intended.
        model: path of the model file to load.
        method: similarity method name, assigned to eric.method.
        output_path / output_name: where and under which base name the
            report files are written.
        in_files: input file names to read test tuples from.
        merge_in_files: if truthy, all files are merged into one test set;
            otherwise each file becomes its own test set.
        stanza_pipeline: optional pre-initialized pipeline; a new one for
            `lang` is created when empty.
        lang: language code for stanza initialization (default "en").
    """
    sp = depparse.init_stanza(lang) if not stanza_pipeline else stanza_pipeline
    # NOTE(review): shadows the `eric` parameter — see docstring.
    eric = eric_nlp.Eric_nlp()
    eric.method = method
    print(f"eric model '{model}' loading")
    eric.load_model(model)
    print(f"eric model '{model}' loaded")

    # Build {test_name: [(gold, sentence), ...]} — one merged set or one per file.
    if merge_in_files:
        test_inputs = {f"{output_name}_merged": merge_input_files(in_files)}
    else:
        test_inputs = {
            f"{output_name}_{x+1}": read_input_from_file(f)
            for x, f in enumerate(in_files)
        }
    print("done reading files")

    print("\n")
    # NOTE(review): these counters and all_out accumulate ACROSS test sets,
    # but the per-set statistics below divide by the current set's length
    # only — with multiple unmerged files the percentages can exceed 100%
    # and each report file repeats earlier sets' lines. Confirm intent.
    all_out = []
    correct_count = 0
    correct_count_no_depparse = 0
    wrong_but_denied_count = 0
    for test_name, test_tuples in test_inputs.items():
        print(f"{test_name}:")
        for gold, sentence in test_tuples:
            out = []
            depparse_necessary = False
            preprocessed_sentence = eric.preprocessing(sentence, "usr_input")
            similarity_ranking = eric.get_similarity_result(
                preprocessed_sentence, limit=5)
            #tuple (fct_id, matching similarity)
            similarity_result = (similarity_ranking[0][0],
                                 similarity_ranking[0][1])
            result = similarity_result[0]
            # Fall back to dependency parsing when similarity is not confident.
            if similarity_result[1] < eric.depparse_threshold:
                depparse_necessary = True
                # NOTE(review): re-preprocesses the raw sentence with the
                # same arguments as above — presumably redundant; confirm.
                preprocessed_depparse_sentence = eric.preprocessing(
                    sentence, "usr_input")
                tree = sp(preprocessed_depparse_sentence).sentences[0]
                depparse_ranking = depparse.get_matching_dictionary_trees(
                    tree, eric)
                # NOTE(review): bare except aborts the whole run via quit()
                # on ANY error here — consider narrowing the exception type.
                try:
                    if len(depparse_ranking) == 0:
                        depparse_result = ("none", [])
                    else:
                        #tuple (fct_id, matching tree template)
                        depparse_result = (depparse_ranking[0][0],
                                           depparse_ranking[0][1])
                        result = depparse_result[0]
                except:
                    print("could not work with depparse ranking:")
                    print(depparse_ranking)
                    quit()
            else:
                depparse_result = ("none", [
                    f"No depparsing necessary; similarity >= depparse_threshold ({similarity_result[1]} >= {eric.depparse_threshold})"
                ])

            # Deny the prediction entirely when both signals are weak.
            if depparse_result[0] == "none" and similarity_result[
                    1] < eric.deny_threshold:
                result = "none"

            if result == gold:
                correctness = True
                correctness_phrase = "CORRECT"
                correct_count += 1
            else:
                correctness_phrase = "WRONG"
                correctness = False
                # A wrong "none" counts as a (harmless) denial, not a misfire.
                if result == "none":
                    wrong_but_denied_count += 1

            out.extend([
                "", f"{'/'*40}", f"       sentence: {sentence}",
                f"         result: {result}", f"           gold: {gold}",
                f"    correctness: {correctness_phrase}",
                f" did depparsing: {depparse_necessary}",
                f"   simil-result: {similarity_result}",
                f"depparse-result: {depparse_result}"
            ])
            if depparse_necessary:
                results_were_equal = "True" if depparse_result[
                    0] == similarity_result[0] else "False"
                out.extend([f"  results equal: {results_were_equal}"])
            else:
                if correctness:
                    correct_count_no_depparse += 1

            all_out.extend(out)
            for o in out:
                print(o)

        # Per-set statistics (but cumulative counters — see NOTE above).
        sentence_count = len(test_tuples)
        correct_percent = correct_count * 100.0 / sentence_count
        wrong_count = sentence_count - correct_count
        wrong_percent = wrong_count * 100.0 / sentence_count

        preface = [
            f"                   CORRECT: {correct_count} / {sentence_count} ({correct_percent}%)",
            f"                     WRONG: {wrong_count} / {sentence_count} ({wrong_percent}%)",
            f"Correct without depparsing: {correct_count_no_depparse}",
            f"          Wrong but denied: {wrong_but_denied_count}", "\n"
        ]
        preface.extend(all_out)
        list_to_file(preface, f"{output_path}{test_name}.txt")
Exemplo n.º 9
0
    test_input_file_name = "D:\\Bachelor\\bachelor_thesis_eric_nlp\\survey\\results_formatted.txt"
    dev_input_path = "D:\\Bachelor\\eric\\data\\survey\\"
    dev_input_file_names = [
        "umfrage_input_1_cleaned.txt", "umfrage_input_2_cleaned.txt",
        "umfrage_input_3_cleaned.txt", "umfrage_input_4_cleaned.txt",
        "umfrage_input_5_cleaned.txt", "umfrage_input_6_cleaned.txt",
        "umfrage_input_7_cleaned.txt"
    ]
    manually_added_file_name = "manually_added.txt"
    dev_input_file_names_pathed = [
        f"{dev_input_path}{x}" for x in dev_input_file_names
    ]
    output_file_name = "D:\\Bachelor\\eric\\output\\dev_results.txt"

    eric = eric_nlp.Eric_nlp()
    # eric.load_model(f"{model_path}{model_file}")
    # sp = depparse.init_stanza("en")
    sentences = merge_input_files(dev_input_file_names_pathed)
    dev_sentence_count = len(sentences)
    manually_added_sentences = merge_input_files(
        [f"{dev_input_path}{manually_added_file_name}"])
    sentences.extend(manually_added_sentences)
    manually_added_count = len(sentences) - dev_sentence_count
    # evaluate_threshold(sentences, eric, sp)
    # quit()

    model_list = [
        # "ag_news.ftz",
        # "ag_news.bin",
        # "amazon_review_full.bin"
Exemplo n.º 10
0
def update_depparse_output(input_files, output_file_overwrite, passed_fct_id, output_file_new_sentences="data\\manually_added.txt", sp=""):
    """Sync manually edited depparse output back into the sentence files.

    Sentences found in output_file_overwrite that are not already present in
    the original inputs are appended to output_file_new_sentences; then the
    depparse output file for passed_fct_id is regenerated from the union of
    both sources.

    Args:
        input_files: original input files (tuples of fct_id and sentence).
        output_file_overwrite: the (possibly hand-edited) depparse output
            file to read new sentences from and finally overwrite.
        passed_fct_id: function id the output file belongs to.
        output_file_new_sentences: file collecting manually added sentences.
        sp: optional pre-initialized stanza pipeline; created lazily if empty.
    """
    #1 get all three sources as dictionaries {fct_id: list of sentences}
    #1.1 originally used input
    lines = test_stuff.merge_input_files(input_files)
    lines = list(set(lines))
    input_accumulated = convert_input_tuples_to_dict(lines)
    #1.2 modified output
    lines = read_sentences_from_output(output_file_overwrite)
    output_accumulated = {passed_fct_id: lines}
    #1.3 existing manually added sentences
    lines = test_stuff.merge_input_files([output_file_new_sentences])
    lines = list(set(lines))
    manual_accumulated = convert_input_tuples_to_dict(lines)

    #2 find sentences in output_accumulated that are not in input_accumulated
    #  and append them to manual_accumulated if not already there
    eric = eric_nlp.Eric_nlp()
    for fct_id, sentences in output_accumulated.items():
        if fct_id in input_accumulated.keys():
            preprocessed_inputs = [eric.preprocessing(x, "usr_input") for x in input_accumulated[fct_id]]
            for sent in sentences:
                sentence = eric.preprocessing(sent, "usr_input")
                if sentence not in preprocessed_inputs:
                    if fct_id in manual_accumulated.keys():
                        if sentence not in manual_accumulated[fct_id]:
                            manual_accumulated[fct_id].append(sentence)
                    else:
                        manual_accumulated[fct_id] = [sentence]
        else:
            # All sentences for this fct_id are new.
            # BUG FIX: the original read `sentence` here without assigning it
            # in this branch (UnboundLocalError on first iteration, or a
            # stale value afterwards) and handled only one sentence; now
            # every sentence is preprocessed and added like in the branch
            # above.
            for sent in sentences:
                sentence = eric.preprocessing(sent, "usr_input")
                if fct_id in manual_accumulated.keys():
                    if sentence not in manual_accumulated[fct_id]:
                        manual_accumulated[fct_id].append(sentence)
                else:
                    manual_accumulated[fct_id] = [sentence]

    #4 write manual_accumulated back to output_file_new_sentences
    out = []
    for fct_id, sentences in manual_accumulated.items():
        out.append(f"[{fct_id}]")
        out.extend(sentences)
        out.append("")
    test_stuff.list_to_file(out, output_file_new_sentences)

    #5 update the output file
    #5.1 gather all sentences for passed_fct_id from both sources
    if not sp:
        sp = init_stanza("en")
    all_sentences = []
    if passed_fct_id in manual_accumulated.keys():
        all_sentences.extend(manual_accumulated[passed_fct_id])
    if passed_fct_id in input_accumulated.keys():
        all_sentences.extend(input_accumulated[passed_fct_id])

    all_sentences = [eric.preprocessing(x, "usr_input") for x in all_sentences]
    out, roots = depparse(all_sentences, sp)
    preface = [f"{v}: {k}" for k, v in roots.items()]

    all_out = preface + out
    test_stuff.list_to_file(all_out, output_file_overwrite)