def confirm_threshold(raw_data_file_path, pred_dir, pred_file_prefix):
    """Sweep 100 evenly spaced confidence thresholds in [0, 1] and pick the best.

    For each candidate threshold, predictions are re-derived via
    ``_pred_ans_by_thresh`` and scored against the reference answers.
    Returns the ``(threshold, F1, EM)`` triple with the highest F1
    (ties broken by EM, then by the later threshold in sweep order,
    since the sort is stable and the last entry wins).
    """
    ref_ans = read_mrc_dataset(raw_data_file_path, tag=None)
    candidate_thresholds = np.linspace(0, 1, 100)

    ret_metrics = []
    for ind, candi_thresh in enumerate(candidate_thresholds):
        pred_ans = _pred_ans_by_thresh(pred_dir, pred_file_prefix, candi_thresh)
        F1, EM, _, _, _ = evaluate(ref_ans, pred_ans)
        ret_metrics.append((candi_thresh, F1, EM))
        # Periodic progress report every 20 candidates.
        if (ind + 1) % 20 == 0:
            print(
                f"now {ind + 1}/{len(candidate_thresholds)}, F1 is {F1}, EM is {EM}"
            )

    # Rank ascending by (F1, EM); m[1:] is exactly the (F1, EM) pair.
    ret_metrics = sorted(ret_metrics, key=lambda m: m[1:])
    best = ret_metrics[-1]
    print("the best metrics&threshold is ", best)
    return best
def _evaluate(raw_data_path, pred_data_path, tag=None):
    """Score model predictions against the reference dataset and print metrics.

    Args:
        raw_data_path: Path to the raw reference dataset file.
        pred_data_path: Path to the model prediction file.
        tag: Optional tag used to filter reference samples; ``None`` keeps all.

    Raises:
        ValueError: If no reference sample matches ``tag``.
    """
    ref_ans = read_mrc_dataset(raw_data_path, tag=tag)
    # An `assert` here is stripped under `python -O`, silently disabling the
    # guard — raise explicitly so the empty-reference check always runs.
    if not ref_ans:
        raise ValueError('Find no sample with tag - {}'.format(tag))
    pred_ans = read_model_prediction(pred_data_path)
    F1, EM, ans_score, TOTAL, SKIP = evaluate(ref_ans, pred_ans, verbose=False)
    print_metrics(F1, EM, ans_score, TOTAL, SKIP, tag)