def get_results(omega, model_num=2, split_type="none", fm_flag=False):
    """Train and score one translator per entry of ``test_train_split``.

    For every pair in the module-level ``test_train_split`` the second
    element is used as training data and the first as test data.  Both are
    passed through ``get_splits`` with ``split_type``, a translator is built
    with ``get_translator`` and scored with
    ``useful_functions.get_med_bow_norm``.  The edit distance of each pair
    is also written to the log.

    Args:
        omega: Forwarded to ``get_translator``.
        model_num: Forwarded to ``get_translator``.
        split_type: Forwarded to ``get_splits`` for both data sets.
        fm_flag: Forwarded to ``get_translator``.

    Returns:
        A tuple ``(med_results, bow_results, norm_med_results)`` of three
        lists holding one score per pair, in iteration order.
    """
    med_results, bow_results, norm_med_results = [], [], []

    for pair in test_train_split:
        held_out, training = pair[0], pair[1]

        # The training data is split first, then the test data.
        train_split = get_splits(training, split_type)
        test_split = get_splits(held_out, split_type)

        translator = get_translator(train_split, omega, fm_flag, model_num)
        scores = useful_functions.get_med_bow_norm(translator, test_split)
        med, bow, norm_med = scores

        write_to_log("Edit distance = {}\n".format(med))

        med_results.append(med)
        bow_results.append(bow)
        norm_med_results.append(norm_med)

    return med_results, bow_results, norm_med_results
def get_alignment_accuracies(split_type="none", fm_flag=False, model_num=2):
    """Score predicted alignments for the first entry of ``test_train_split``.

    The first (test, train) pair of the module-level ``test_train_split`` is
    split with ``get_splits``; alignments for the split test data are
    predicted with ``get_pred_align_test_2`` or ``get_pred_align_test_1``
    depending on ``model_num``, recombined with ``combine_split_aligns`` and
    then scored.

    Args:
        split_type: Forwarded to ``get_splits`` for both data sets.
        fm_flag: Forwarded to the alignment prediction function.
        model_num: Selects the prediction function; must be ``1`` or ``2``.

    Returns:
        A tuple ``(recall, precision)`` as produced by
        ``score_alignment_recall`` and ``score_alignment_precision``.

    Raises:
        ValueError: If ``model_num`` is neither ``1`` nor ``2``.
    """
    # Validate first so an unsupported model fails before any data is split.
    # (Raising a bare string, as was done previously, is itself a TypeError.)
    if model_num not in (1, 2):
        raise ValueError("Model number not known")

    test, train = test_train_split[0]
    split_train = get_splits(train, split_type)
    split_test = get_splits(test, split_type)

    if model_num == 2:
        pred_aligns = get_pred_align_test_2(split_train, split_test, fm_flag)
    else:
        pred_aligns = get_pred_align_test_1(split_train, split_test, fm_flag)

    alignments = combine_split_aligns(pred_aligns, test, split_test)
    # print_i = 2
    # print_alignment_compare(alignments[print_i],print_i,(test[print_i]))
    recall = score_alignment_recall(alignments)
    precision = score_alignment_precision(alignments)
    return recall, precision
def get_time_for_single(translator, pair, split_type):
    """Return the wall-clock seconds needed to translate one pair.

    The measured interval covers both the ``get_splits`` call on ``[pair]``
    and the ``translator`` call on the first element of every resulting
    split.  The translator's return values are discarded.

    Args:
        translator: Callable invoked once per split.
        pair: A single data pair; wrapped in a list for ``get_splits``.
        split_type: Forwarded to ``get_splits``.

    Returns:
        Elapsed time as a difference of two ``time.time()`` readings.
    """
    started = time.time()
    for source, _ in get_splits([pair], split_type):
        translator(source)
    return time.time() - started
def validate_ibmmodel(
    omega_range, model_num=2, split_type="none", fm_flag=False
):
    """Evaluate a translator on the validation set for each omega value.

    The module-level ``train_test_data`` and ``validation_set`` are each
    split once with ``get_splits``.  For every value in ``omega_range`` a
    translator is built from the split training data and scored on the split
    validation data; progress is printed and written to the log.

    Args:
        omega_range: Iterable of omega values to try.
        model_num: Forwarded to ``get_translator``.
        split_type: Forwarded to ``get_splits``.
        fm_flag: Forwarded to ``get_translator``.

    Returns:
        A list of ``(omega, med)`` tuples in iteration order, where ``med``
        is the first value returned by ``useful_functions.get_med_bow_norm``.
    """
    print("Validate ibmmodel{}".format(model_num))

    train_split = get_splits(train_test_data, split_type)
    valid_split = get_splits(validation_set, split_type)

    results = []
    for omega in omega_range:
        write_to_log("omega {}\n".format(omega))
        print("omega", omega)

        translator = get_translator(train_split, omega, fm_flag, model_num)
        scores = useful_functions.get_med_bow_norm(translator, valid_split)
        med, _, _ = scores  # only the edit distance is kept

        print(med)
        write_to_log("Edit distance = {}\n".format(med))
        results.append((omega, med))

    return results
def get_results_from_file_with_split(filename, type_split="none"):
    """Sum edit distances of stored predictions against each fold's test data.

    Predictions are loaded with ``get_translations_results``.  For every fold
    of the module-level ``test_train_split`` the test data is split with
    ``get_splits`` and each prediction is compared, position by position, to
    the second element of the corresponding split using
    ``minimum_edit_distance``.  A per-file subtotal is printed each time the
    number of splits belonging to the current file has been consumed.

    Args:
        filename: Passed to ``get_translations_results``.
        type_split: Forwarded to ``get_splits``.

    Returns:
        A list with one total edit distance per fold.
    """
    predicts = get_translations_results(filename)
    results = []

    for fold, (test, _train) in enumerate(test_train_split):
        preds = predicts[fold]

        # How many splits each original test item expands into; used to
        # detect where one file's predictions end and the next begin.
        splits_per_file = [
            len(get_splits([item], type_split)) for item in test
        ]
        split_test = get_splits(test, type_split)

        # print(splits_per_file)
        print()
        print("FOLD NUM: ", fold)

        if len(split_test) != len(preds):
            print("test != preds")
            print(len(split_test))
            print(len(preds))

        fold_total = 0
        file_total = 0
        file_index = 0
        seen_in_file = 0

        for pred, (trans, truth) in zip(preds, split_test):
            med = minimum_edit_distance(pred.split(" "), truth)
            file_total += med
            fold_total += med
            seen_in_file += 1

            # Hard-coded debug dump for one specific fold/file combination.
            if fold == 8 and file_index == 4:
                print(pred)
                print(" ".join(truth))
                print(" ".join(trans))
                print()

            # Last split of the current file: report and reset the subtotal.
            if seen_in_file == splits_per_file[file_index]:
                print(file_total)
                file_index += 1
                seen_in_file = 0
                file_total = 0

        results.append(fold_total)

    return results
def get_combined_predict_from_file(filename, train_test_folds, type_split):
    """Merge per-split predictions back into one prediction per test item.

    Predictions are loaded with ``get_translations_results`` and each one is
    tokenised on single spaces.  For every test item of a fold, as many
    consecutive predictions as the item has splits (per ``get_splits``) are
    concatenated into one token list.  If a fold runs out of predictions,
    ``"not enough preds"`` is printed once per missing prediction and the
    affected token list is left shorter (possibly empty).

    Args:
        filename: Passed to ``get_translations_results``.
        train_test_folds: Indexable by fold number; each entry unpacks into
            ``(test, train)``.
        type_split: Forwarded to ``get_splits`` as ``type_split``.

    Returns:
        A list with one entry per fold, each a list of token lists (one per
        test item).
    """
    predicts = get_translations_results(filename)
    combined_predicts = []

    for fold in range(len(predicts)):
        tokenised = [line.split(" ") for line in predicts[fold]]
        cursor = 0  # index of the next unused prediction in this fold
        test, _train = train_test_folds[fold]

        fold_predicts = []
        for pair in test:
            number_splits = len(get_splits([pair], type_split=type_split))

            # Take this item's share of predictions; the slice is simply
            # shorter than requested when the predictions run out.
            available = tokenised[cursor:cursor + number_splits]
            prediction = []
            for tokens in available:
                prediction.extend(tokens)
            for _ in range(number_splits - len(available)):
                print("not enough preds")

            cursor += number_splits
            fold_predicts.append(prediction)

        combined_predicts.append(fold_predicts)

    return combined_predicts
def get_timings(omega, split_type="none", fm_flag=False, model_num=2):
    """Time a translator on the test data of the first fold.

    A translator is built from the split training data of the first entry of
    the module-level ``test_train_split`` and handed to
    ``get_time_for_files`` together with the test data and ``split_type``.

    Args:
        omega: Forwarded to ``get_translator``.
        split_type: Forwarded to ``get_splits`` and ``get_time_for_files``.
        fm_flag: Forwarded to ``get_translator``.
        model_num: Forwarded to ``get_translator``.

    Returns:
        Whatever ``get_time_for_files`` returns.
    """
    first_fold = test_train_split[0]
    test, train = first_fold

    # Only the training data is split here; the test data is passed on
    # unsplit along with ``split_type``.
    split_train = get_splits(train, split_type)
    translator = get_translator(split_train, omega, fm_flag, model_num)

    return get_time_for_files(translator, test, split_type)
# number_aligns = len(v2_alignment[0]) # print(sum(v2_alignment[0])/number_aligns, sum(v2_alignment[1])/number_aligns) # print(sum(split_v1_alignment[0])/number_aligns, sum(split_v1_alignment[1])/number_aligns) # print(sum(split_v2_alignment[0])/number_aligns, sum(split_v2_alignment[1])/number_aligns) # NOT WORKING # print("enhanced") # print(get_alignment_accuracies("enhanced",False,2)) elif test_num == 7: print(get_alignment_accuracies("split", False, 2)) elif test_num == 8: pass # print(get_results_for_traditional_files("logs/results_split_v2.txt","split")) elif test_num == 9: alignments = [] test, train = test_train_split[0] split_train = get_splits(train, "enhanced") split_test = get_splits(test, "enhanced") aligns = get_pred_align_test_2(split_train, split_train, True) alignments.extend(aligns) for i in range(10, 20): print(" ".join(split_train[i][0])) print(" ".join(split_train[i][1])) print_alignment(alignments[i], split_train[i]) elif test_num == 10: test, train = test_train_split[0] splits_test_norm = get_splits(test, "enhanced") splits_train_norm = get_splits(test, "enhanced") indexed = [] for trans, pseud in test: trans = [(x, i) for i, x in enumerate(trans)]