def __init__(self, path_train, path_dev, path_test, aug_num,
             distance_train, distance_dev, distance_test):
    self.path_train = path_train
    self.path_dev = path_dev
    self.path_test = path_test
    self.aug_num = aug_num
    # Raw dataset inputs.
    self.old_train = load_dataset_input(self.path_train)
    self.old_dev = load_dataset_input(self.path_dev)
    self.old_test = load_dataset_input(self.path_test)
    # Precomputed sentence-distance pickles against the training set.
    self.train_train = load_pickle(distance_train)
    self.dev_train = load_pickle(distance_dev)
    self.test_train = load_pickle(distance_test)
    tprint('loading done, converting data to the new format')
    self._load_new()
    tprint('dataset load done')
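# Hypothetical construction call, for orientation only. The class name
# `AugmentedDataset`, the `aug_num` value, and the dev/test distance paths
# are placeholders (only __init__ is shown here); the train distance pickle
# is the one produced by the neighbour-retrieval script at the end of this
# section.
dataset = AugmentedDataset(
    path_train='./data/train.txt',
    path_dev='./data/dev.txt',
    path_test='./data/test.txt',
    aug_num=3,  # number of retrieved neighbours to attach per sentence (placeholder)
    distance_train='./temp/train_train.bin',
    distance_dev='./temp/dev_train.bin',    # placeholder path
    distance_test='./temp/test_train.bin',  # placeholder path
)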
def pruning_sem_f1_score(target, predict, predicate_correct, predicate_sum,
                         out_of_pruning, argument2idx):
    # Labeled semantic F1 under argument pruning: gold arguments dropped by
    # the pruning stage (`out_of_pruning`) still count against recall.
    predict_args = 0
    golden_args = 0
    correct_args = 0
    num_correct = 0
    total = 0
    for i in range(len(target)):
        pred_i = predict[i]
        golden_i = target[i]
        if golden_i == argument2idx[_PAD_]:
            continue  # padded positions are ignored entirely
        total += 1
        # Map unknown labels to the null label '_'.
        if pred_i == argument2idx[_UNK_]:
            pred_i = argument2idx['_']
        if golden_i == argument2idx[_UNK_]:
            golden_i = argument2idx['_']
        if pred_i != argument2idx['_']:
            predict_args += 1
        if golden_i != argument2idx['_']:
            golden_args += 1
        if golden_i != argument2idx['_'] and pred_i == golden_i:
            correct_args += 1
        if pred_i == golden_i:
            num_correct += 1
    # P/R/F1 fold in predicate disambiguation counts (CoNLL-2009 style);
    # NP/NR/NF1 score arguments only.
    P = (correct_args + predicate_correct) / (predict_args + predicate_sum + 1e-13)
    R = (correct_args + predicate_correct) / (golden_args + out_of_pruning + predicate_sum + 1e-13)
    NP = correct_args / (predict_args + 1e-13)
    NR = correct_args / (golden_args + 1e-13)
    F1 = 2 * P * R / (P + R + 1e-13)
    NF1 = 2 * NP * NR / (NP + NR + 1e-13)
    tprint('\teval accuracy:{:.2f} predict:{} golden:{} correct:{} P:{:.2f} R:{:.2f} '
           'F1:{:.2f} NP:{:.2f} NR:{:.2f} NF1:{:.2f}'.format(
               num_correct / total * 100, predict_args,
               golden_args + out_of_pruning, correct_args,
               P * 100, R * 100, F1 * 100, NP * 100, NR * 100, NF1 * 100))
    return P, R, F1, NP, NR, NF1
def sem_f1_score(target, predict, predicate_correct, predicate_sum,
                 argument2idx, output_to_file=None):
    # Labeled semantic F1 without pruning; otherwise identical bookkeeping
    # to pruning_sem_f1_score above.
    predict_args = 0
    golden_args = 0
    correct_args = 0
    num_correct = 0
    total = 0
    for i in range(len(target)):
        pred_i = predict[i]
        golden_i = target[i]
        if golden_i == argument2idx[_PAD_]:
            continue  # padded positions are ignored entirely
        total += 1
        # Map unknown labels to the null label '_'.
        if pred_i == argument2idx[_UNK_]:
            pred_i = argument2idx['_']
        if golden_i == argument2idx[_UNK_]:
            golden_i = argument2idx['_']
        if pred_i != argument2idx['_']:
            predict_args += 1
        if golden_i != argument2idx['_']:
            golden_args += 1
        if golden_i != argument2idx['_'] and pred_i == golden_i:
            correct_args += 1
        if pred_i == golden_i:
            num_correct += 1
    P = (correct_args + predicate_correct) / (predict_args + predicate_sum + 1e-13)
    R = (correct_args + predicate_correct) / (golden_args + predicate_sum + 1e-13)
    NP = correct_args / (predict_args + 1e-13)
    NR = correct_args / (golden_args + 1e-13)
    F1 = 2 * P * R / (P + R + 1e-13)
    NF1 = 2 * NP * NR / (NP + NR + 1e-13)
    # Build the message once so the same string can be printed and logged.
    outs = ('eval accuracy:{:.2f} predict:{} golden:{} correct:{} P:{:.2f} R:{:.2f} '
            'F1:{:.2f} NP:{:.2f} NR:{:.2f} NF1:{:.2f}'.format(
                num_correct / total * 100, predict_args, golden_args, correct_args,
                P * 100, R * 100, F1 * 100, NP * 100, NR * 100, NF1 * 100))
    tprint(outs)
    if output_to_file is not None:
        with open(output_to_file, 'a') as files:
            files.write(outs + '\n')
    return P, R, F1, NP, NR, NF1
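# Toy walk-through of the bookkeeping in sem_f1_score (and, with an extra
# out_of_pruning term in the recall denominator, pruning_sem_f1_score).
# Not part of the project API; the label vocabulary below is made up.
# '_' is the null label, <pad> positions are skipped, <unk> maps to '_'.
argument2idx = {'<pad>': 0, '<unk>': 1, '_': 2, 'A0': 3, 'A1': 4}
PAD, UNK, NULL = 0, 1, 2
gold = [3, 2, 4, 0, 2]  # A0  _  A1  <pad>  _
pred = [3, 2, 2, 0, 4]  # A0  _  _   <pad>  A1

golden_args = sum(1 for g in gold if g not in (PAD, UNK, NULL))          # 2 (A0, A1)
predict_args = sum(1 for p, g in zip(pred, gold)
                   if g != PAD and p not in (UNK, NULL))                 # 2
correct_args = sum(1 for p, g in zip(pred, gold)
                   if g != PAD and g not in (UNK, NULL) and p == g)      # 1 (A0)

NP = correct_args / predict_args   # 0.50
NR = correct_args / golden_args    # 0.50
NF1 = 2 * NP * NR / (NP + NR)      # 0.50 (argument-only F1)

# With predicate disambiguation folded in, e.g. one predicate that was
# correctly disambiguated (predicate_correct=1, predicate_sum=1):
P = (correct_args + 1) / (predict_args + 1)  # 2/3
R = (correct_args + 1) / (golden_args + 1)   # 2/3
F1 = 2 * P * R / (P + R)                     # 2/3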
def eval_train_batch(epoch, batch_i, loss, golden_batch, predict_batch,
                     argument2idx, output_to_file=None):
    # Per-batch argument P/R/F for monitoring training progress.
    predict_args = 0
    golden_args = 0
    correct_args = 0
    num_correct = 0
    batch_total = 0
    for i in range(len(golden_batch)):
        pred_i = predict_batch[i]
        golden_i = golden_batch[i]
        if golden_i == argument2idx[_PAD_]:
            continue  # padded positions are ignored entirely
        batch_total += 1
        # Map unknown labels to the null label '_'.
        if pred_i == argument2idx[_UNK_]:
            pred_i = argument2idx['_']
        if golden_i == argument2idx[_UNK_]:
            golden_i = argument2idx['_']
        if pred_i != argument2idx['_']:
            predict_args += 1
        if golden_i != argument2idx['_']:
            golden_args += 1
        if golden_i != argument2idx['_'] and pred_i == golden_i:
            correct_args += 1
        if pred_i == golden_i:
            num_correct += 1
    recall = correct_args / (golden_args + 1e-13)
    precision = correct_args / (predict_args + 1e-13)
    F = 2 * recall * precision / (recall + precision + 1e-13)
    # Build the message once so the same string can be printed and logged.
    outs = ('epoch {} batch {} loss:{:.4f} accuracy:{:.2f} precision:{:.4f} '
            'recall:{:.4f} F:{:.4f}'.format(
                epoch, batch_i, loss, num_correct / batch_total * 100,
                precision, recall, F))
    tprint(outs)
    if output_to_file is not None:
        with open(output_to_file, 'a') as out_file:
            out_file.write(outs + '\n')
    return correct_args, golden_args, predict_args
        # Tail of the analysis routine: `ele` is one target sentence's
        # distance row over all training sentences.
        res_tmp = []
        for i, e in enumerate(ele):
            res_tmp.append([i, e])
        # Sort by distance and keep the indices of the 300 nearest
        # training sentences.
        res_tmp = sorted(res_tmp, key=lambda x: x[1])[:300]
        train_tmp.append([pair[0] for pair in res_tmp])
    pickle_save(train_tmp, result_data_path)


if __name__ == '__main__':
    target_name = sys.argv[1]
    if target_name == 'train':
        # The training file is large, so it is processed in 18 chunks of
        # 10,000 sentences (179,014 sentences in total).
        train = get_sentences('./data/train.txt')
        cur_idx = 0
        while cur_idx < 179014:
            tprint('begin processing %d/18' % (cur_idx // 10000 + 1))
            target = train[cur_idx:cur_idx + 10000]
            result = get_max_similar(target, train)
            pickle_save(result, './temp/train-train-%d-result.bin' % cur_idx)
            analyse_train_file('./temp/train-train-%d-result.bin' % cur_idx,
                               './temp/train-train-%d-part.bin' % cur_idx)
            cur_idx += 10000
        total = []
        for i in range(18):
            # Part files are named by their starting sentence index
            # (0, 10000, ...), so reconstruct that index here.
            total.extend(pickle_load('./temp/train-train-%d-part.bin' % (i * 10000)))
        pickle_save(total, './temp/train_train.bin')
    elif target_name == 'dev':
        target = get_sentences('./data/dev.txt')
        train = get_sentences('./data/train.txt')
        result = get_max_similar(target, train)
        pickle_save(result, './temp/dev-train-result.bin')
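# Shape assumption behind the pruning above (a sketch, not project code):
# get_max_similar(target, train) is taken to return one distance row per
# target sentence, i.e. result[i][j] = distance(target[i], train[j]); the
# analysis step then keeps the indices of the K nearest training sentences
# per row (K=300 in the real script, K=2 here for readability).
result = [
    [0.9, 0.1, 0.5],  # target sentence 0 vs. 3 training sentences
    [0.2, 0.8, 0.3],  # target sentence 1
]
K = 2
nearest = [[idx for idx, _ in sorted(enumerate(row), key=lambda p: p[1])[:K]]
           for row in result]
print(nearest)  # [[1, 2], [0, 2]]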