def get_results_from_file(filename, type_split="none"):
    """Score stored translation predictions against the cross-validation folds.

    Loads per-fold predictions from *filename* (via
    ``get_combined_predict_from_file``) and compares each prediction to the
    ground truth in the module-level ``test_train_split``.

    Returns three parallel lists, one entry per fold:
    total minimum edit distance, total bag-of-words score, and the
    mean per-prediction normalized edit distance.
    """
    data = test_train_split
    predict_folds = get_combined_predict_from_file(filename, data, type_split)
    results_med = []
    results_bow = []
    # Debug spot-check of one specific example (fold 8, item 4):
    # prediction, then the transcript/truth pair it corresponds to.
    print(" ".join(predict_folds[8][4]))
    print(" ".join(data[8][0][4][0]))
    print(" ".join(data[8][0][4][1]))
    results_norm_med = []
    for fold in range(len(predict_folds)):
        test, _ = data[fold]
        predicts = predict_folds[fold]
        med, bow, norm_med = 0, 0, 0
        assert (len(predicts) == len(test))
        print()
        print("FOLD NUM: ", fold)
        for pred, test_pair in zip(predicts, test):
            truth = test_pair[1]
            # Fix: compute the edit distance once per pair — the original
            # computed it twice (once for the debug print, once for the sum).
            pair_med = minimum_edit_distance(pred, truth)
            print(pair_med)
            med += pair_med
            bow += bag_of_words_test(pred, truth)
            norm_med += minimum_edit_distance_per_token(pred, truth)
        results_med.append(med)
        results_bow.append(bow)
        results_norm_med.append(norm_med / len(test))
    return results_med, results_bow, results_norm_med
def get_med_bow_norm(translator, test_set):
    """Run *translator* over every (transcript, truth) pair in *test_set*.

    Each prediction is logged via ``write_to_log``. Returns a 3-tuple:
    total minimum edit distance, total bag-of-words score, and the
    mean per-token edit distance across the test set.
    """
    total_med = 0
    total_bow = 0
    total_norm = 0
    for transcript, truth in test_set:
        prediction = translator(transcript)
        write_to_log("predict: " + " ".join(prediction) + "\n")
        total_med += minimum_edit_distance(prediction, truth)
        total_bow += bag_of_words_test(prediction, truth)
        total_norm += minimum_edit_distance_per_token(prediction, truth)
    return total_med, total_bow, total_norm / len(test_set)
# Пример #3 ("Example #3" — marker left over from the source aggregator; kept as a comment so the file parses)
# 0
def baseline(stem_flag):
    """Grid-search the n-gram reordering baseline.

    Sweeps n in 1..6 and threshold t in 0..14, rebuilding each transcript's
    token ordering with the (deprecated) old reordering routine, scoring it
    by minimum edit distance against the simplified pseudocode truth, and
    finally rendering the per-setting totals as a heatmap.
    """
    pseudocode_tokens = get_pseudocode_tokens()
    transcripts_simplified = get_data.get_data_from_directory(
        "/transcripts_var_replaced/")
    truth = get_data.get_data_from_directory("/pseudocode_simplified/")

    def split_tokens(text):
        # Space-split, discarding empty tokens.
        return [tok for tok in text.split(" ") if tok != ""]

    best_score = math.inf
    scores = []
    for n in range(1, 7):
        cur_scores = []
        for t in range(15):
            distances = []
            errors = []
            for idx, transcript in enumerate(transcripts_simplified):
                # Transcript side is newline-stripped before tokenizing;
                # the truth side deliberately is not (matches prior behavior).
                candidate_tokens = transcript_to_code_tokens(
                    split_tokens(transcript.strip("\n")),
                    pseudocode_tokens, stem_flag)
                attempt = get_n_gram_reordering.get_most_likely_ordering_old_DO_NOT_USE(
                    candidate_tokens, n, t)
                reordered = split_tokens(attempt)
                actual = split_tokens(truth[idx])
                dist = minimum_edit_distance(reordered, actual)
                distances.append(dist)
                errors.append(dist / len(actual))
            total_distance = sum(distances)
            print(errors)
            print("n = " + str(n) + " t = " + str(t) + " score ==     " +
                  str(total_distance))
            print(distances)
            best_score = min(best_score, total_distance)
            cur_scores.append(total_distance)
        scores.append(cur_scores)
    print(best_score)

    show_heatmap(scores)
def get_results_from_file_with_split(filename, type_split="none"):
    """Score stored translations per fold, with each test file re-split.

    Reads predictions from *filename* and, for each fold in the
    module-level ``test_train_split``, splits every test file with
    ``get_splits(..., type_split)`` so predictions line up one-to-one
    with split segments. Accumulates minimum edit distance per fold
    (and prints the per-file subtotal as a side effect).

    Returns a list with one total edit distance per fold.
    """
    predicts = get_translations_results(filename)
    results = []
    for fold, pair in enumerate(test_train_split):
        test, train = pair
        preds = predicts[fold]
        tot_med = 0
        # How many split segments each original test file expands into —
        # used below to detect file boundaries in the flattened stream.
        num_split_per_file = [
            len(get_splits([pair], type_split)) for pair in test
        ]
        test = get_splits(test, type_split)
        # print(num_split_per_file)
        print()
        print("FOLD NUM: ", fold)
        file_index = 0
        file_med = 0
        split_index = 0
        # Sanity check: a length mismatch means predictions and splits
        # no longer align; report it rather than fail silently.
        if not len(test) == len(preds):
            print("test != preds")
            print(len(test))
            print(len(preds))
        for pred, pair in zip(preds, test):
            trans, truth = pair
            med = minimum_edit_distance(pred.split(" "), truth)
            file_med += med
            tot_med += med
            split_index += 1
            # Hard-coded debug probe of one known example (fold 8, file 4).
            if fold == 8 and file_index == 4:
                print(pred)
                print(" ".join(truth))
                print(" ".join(trans))
                print()
            # When we've consumed all segments of the current file,
            # emit its subtotal and advance to the next file.
            if split_index == num_split_per_file[file_index]:
                split_index = 0
                file_index += 1
                print(file_med)
                file_med = 0
        results.append(tot_med)
    return results
    preds = tr_train + tr_test1 + tr_test2
    preds = correct_order_to_shuffle_order(preds)

    folds = form_folds(preds, truth, 10, True)
    folds = folds[:-1]
    print(len(flatten(folds)))
    eds = []
    bows = []
    norm_eds = []
    for fold in folds:
        tot_ed = 0
        tot_bow = 0
        tot_norm = 0
        print(len(fold))
        for pred, truth in fold:
            ed = minimum_edit_distance(pred, truth)
            bow = bag_of_words_test(pred, truth)
            norm_ed = minimum_edit_distance_per_token(pred, truth)
            tot_ed += ed
            tot_bow += bow
            tot_norm += norm_ed
        eds.append(tot_ed)
        bows.append(tot_bow)
        norm_eds.append(tot_norm / len(fold))
    print(eds)
    print(sum(eds))
    print(bows)
    print(sum(bows))
    print(norm_eds)

    print(get_timings())
# Пример #6 ("Example #6" — marker left over from the source aggregator; kept as a comment so the file parses)
# 0
    # Traditional dataset results
    i = 0
    med_results = []
    bow_results = []
    meds_list = []
    for test,train in test_train_split:
        translator = get_translator(True,train)
        med = 0
        bow = 0
        count = 0
        for ts,ps in test:
            if RANDOM_SHUFFLE_ORDER[i] >= 49:
                count += 1
                predict = translator(ts)
                med += minimum_edit_distance(predict,ps)
                meds_list.append(minimum_edit_distance(predict, ps))
                bow += bag_of_words_test(predict,ps)
            i += 1
        med_results.append(med)
        bow_results.append(bow)
        print(count)
    print(i, med_results)
    print(bow_results)
    print(sum(bow_results))
    print(meds_list)
    # stem_flag = False
    # med_results = []
    # bow_results = []
    # norm_med_results = []
    # for train,test in train_test_split: