def get_results_from_file(filename, type_split="none"):
    """Score the stored predictions in *filename* against every test fold.

    Returns three parallel lists, one entry per fold:
    total minimum edit distance, total bag-of-words score, and
    mean per-example normalised (per-token) edit distance.
    """
    data = test_train_split
    predict_folds = get_combined_predict_from_file(filename, data, type_split)
    # Debug peek at one specific example (fold 8, item 4):
    # prediction, then the transcript and gold pseudocode it was scored against.
    print(" ".join(predict_folds[8][4]))
    print(" ".join(data[8][0][4][0]))
    print(" ".join(data[8][0][4][1]))
    results_med = []
    results_bow = []
    results_norm_med = []
    for fold_idx, fold_preds in enumerate(predict_folds):
        test_pairs, _ = data[fold_idx]
        assert len(fold_preds) == len(test_pairs)
        print()
        print("FOLD NUM: ", fold_idx)
        med_total = 0
        bow_total = 0
        norm_total = 0
        for prediction, test_pair in zip(fold_preds, test_pairs):
            gold = test_pair[1]
            distance = minimum_edit_distance(prediction, gold)
            print(distance)
            med_total += distance
            bow_total += bag_of_words_test(prediction, gold)
            norm_total += minimum_edit_distance_per_token(prediction, gold)
        results_med.append(med_total)
        results_bow.append(bow_total)
        # Normalised distance is averaged over the fold's test examples.
        results_norm_med.append(norm_total / len(test_pairs))
    return results_med, results_bow, results_norm_med
def get_med_bow_norm(translator, test_set):
    """Run *translator* over *test_set* and accumulate three metrics.

    Returns a tuple of (total minimum edit distance, total bag-of-words
    score, mean per-token edit distance over the test set).
    """
    total_med = 0
    total_bow = 0
    total_norm = 0
    for transcript, truth in test_set:
        predict = translator(transcript)
        # Log each prediction so long runs can be inspected afterwards.
        write_to_log("predict: " + " ".join(predict) + "\n")
        total_med += minimum_edit_distance(predict, truth)
        total_bow += bag_of_words_test(predict, truth)
        total_norm += minimum_edit_distance_per_token(predict, truth)
    return total_med, total_bow, total_norm / len(test_set)
def baseline(stem_flag):
    """Grid-search the (deprecated) n-gram reordering baseline.

    Tries every n in [1, 6] and threshold t in [0, 14], prints the
    per-setting errors/distances, tracks the best total edit distance
    seen, and finally renders the score grid as a heatmap.
    """
    pseudocode_tokens = get_pseudocode_tokens()
    transcripts_simplified = get_data.get_data_from_directory(
        "/transcripts_var_replaced/")
    truth = get_data.get_data_from_directory("/pseudocode_simplified/")
    best_score = math.inf
    scores = []
    for n in range(1, 7):
        cur_scores = []
        for t in range(0, 15):
            distances = []
            errors = []
            for idx, transcript in enumerate(transcripts_simplified):
                tokens = [tok for tok in transcript.strip("\n").split(" ")
                          if tok != ""]
                # Restrict the transcript to tokens that can appear in code.
                candidates = transcript_to_code_tokens(
                    tokens, pseudocode_tokens, stem_flag)
                attempt = get_n_gram_reordering.get_most_likely_ordering_old_DO_NOT_USE(
                    candidates, n, t)
                reordered = [tok for tok in attempt.split(" ") if tok != ""]
                actual = [tok for tok in truth[idx].split(" ") if tok != ""]
                dist = minimum_edit_distance(reordered, actual)
                distances.append(dist)
                # Error is normalised by the gold sequence length.
                errors.append(dist / len(actual))
            total_distance = sum(distances)
            print(errors)
            print("n = " + str(n) + " t = " + str(t) + " score == " +
                  str(total_distance))
            print(distances)
            best_score = min(best_score, total_distance)
            cur_scores.append(total_distance)
        scores.append(cur_scores)
    print(best_score)
    show_heatmap(scores)
def get_results_from_file_with_split(filename, type_split="none"):
    """Score per-fold translations from *filename* against split test data.

    Each test file is divided into units via ``get_splits(..., type_split)``
    and scored unit-by-unit; returns the total minimum edit distance of
    each fold. Per-file subtotals are printed as each file completes.
    """
    predicts = get_translations_results(filename)
    results = []
    for fold, (test, train) in enumerate(test_train_split):
        preds = predicts[fold]
        # How many split units each original test file contributes, so we
        # can tell when a file's worth of predictions has been consumed.
        splits_per_file = [len(get_splits([item], type_split)) for item in test]
        test = get_splits(test, type_split)
        print()
        print("FOLD NUM: ", fold)
        if len(test) != len(preds):
            # Mismatch is reported but scoring proceeds over the overlap.
            print("test != preds")
            print(len(test))
            print(len(preds))
        fold_total = 0
        file_idx = 0
        file_total = 0
        splits_seen = 0
        for pred, (trans, gold) in zip(preds, test):
            distance = minimum_edit_distance(pred.split(" "), gold)
            file_total += distance
            fold_total += distance
            splits_seen += 1
            if fold == 8 and file_idx == 4:
                # Debug dump for one specific file of interest.
                print(pred)
                print(" ".join(gold))
                print(" ".join(trans))
                print()
            if splits_seen == splits_per_file[file_idx]:
                # Finished this file: emit its subtotal and move on.
                splits_seen = 0
                file_idx += 1
                print(file_total)
                file_total = 0
        results.append(fold_total)
    return results
# Evaluate the combined predictions (train + both test splits) fold by fold.
# NOTE(review): tr_train/tr_test1/tr_test2/truth and the helpers are defined
# elsewhere in this file — presumably predictions in "correct" order that are
# mapped back to the shuffled dataset order before folding; confirm upstream.
preds = tr_train + tr_test1 + tr_test2
preds = correct_order_to_shuffle_order(preds)
folds = form_folds(preds, truth, 10, True)
# Drop the final fold (presumably a partial remainder fold — verify form_folds).
folds = folds[:-1]
print(len(flatten(folds)))
eds = []        # per-fold total edit distance
bows = []       # per-fold total bag-of-words score
norm_eds = []   # per-fold mean per-token edit distance
for fold in folds:
    tot_ed = 0
    tot_bow = 0
    tot_norm = 0
    print(len(fold))
    # NOTE(review): this loop rebinds the module-level name `truth`;
    # any later code relying on the original `truth` would see the
    # last pair of the last fold instead.
    for pred, truth in fold:
        ed = minimum_edit_distance(pred, truth)
        bow = bag_of_words_test(pred, truth)
        norm_ed = minimum_edit_distance_per_token(pred, truth)
        tot_ed += ed
        tot_bow += bow
        tot_norm += norm_ed
    eds.append(tot_ed)
    bows.append(tot_bow)
    # Normalised distance is averaged over the fold size.
    norm_eds.append(tot_norm / len(fold))
print(eds)
print(sum(eds))
print(bows)
print(sum(bows))
print(norm_eds)
print(get_timings())
# Traditional dataset results i = 0 med_results = [] bow_results = [] meds_list = [] for test,train in test_train_split: translator = get_translator(True,train) med = 0 bow = 0 count = 0 for ts,ps in test: if RANDOM_SHUFFLE_ORDER[i] >= 49: count += 1 predict = translator(ts) med += minimum_edit_distance(predict,ps) meds_list.append(minimum_edit_distance(predict, ps)) bow += bag_of_words_test(predict,ps) i += 1 med_results.append(med) bow_results.append(bow) print(count) print(i, med_results) print(bow_results) print(sum(bow_results)) print(meds_list) # stem_flag = False # med_results = [] # bow_results = [] # norm_med_results = [] # for train,test in train_test_split: