def main(_):
  nq_gold_dict = util.read_annotation(
      FLAGS.gold_path, n_threads=FLAGS.num_threads)
  nq_pred_dict = util.read_prediction_json(FLAGS.predictions_path)

  long_answer_stats, short_answer_stats = score_answers(
      nq_gold_dict,
      nq_pred_dict,
      score_thres_long=FLAGS.score_thres_long,
      score_thres_short=FLAGS.score_thres_short)

  # Report results.
  print('*' * 20)
  scores = compute_final_f1(long_answer_stats, short_answer_stats)
  print('*' * 20)
  print('SCORES (n={}):'.format(scores['long-answer-n']))
  print('              F1     /  P      /  R')
  print('Long answer  {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
      scores['long-answer-f1'], scores['long-answer-precision'],
      scores['long-answer-recall']))
  print('Short answer {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
      scores['short-answer-f1'], scores['short-answer-precision'],
      scores['short-answer-recall']))
  print('All answers  {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
      scores['all-answer-f1'], scores['all-answer-precision'],
      scores['all-answer-recall']))
def get_metrics_as_dict(gold_path, prediction_path, num_threads=10):
  """Library version of the end-to-end evaluation.

  Arguments:
    gold_path: Path to the gzip JSON data. For multiple files, should be a
      glob pattern (e.g. "/path/to/files-*").
    prediction_path: Path to the JSON prediction data.
    num_threads (10): Number of threads to use when parsing multiple files.

  Returns:
    metrics: A dictionary mapping string names to metric scores.
  """
  nq_gold_dict = util.read_annotation(gold_path, n_threads=num_threads)
  nq_pred_dict = util.read_prediction_json(prediction_path)
  long_answer_stats, short_answer_stats = score_answers(nq_gold_dict,
                                                        nq_pred_dict)
  return get_metrics_with_answer_stats(long_answer_stats, short_answer_stats)
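# Example library usage (a minimal sketch; the file paths below are
# placeholders, and `get_metrics_as_dict` is the function defined above):
#
#   metrics = get_metrics_as_dict(
#       gold_path='/path/to/files-*',
#       prediction_path='/path/to/predictions.json',
#       num_threads=4)
#   print(json.dumps(metrics, indent=2))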
def main(_):
  cache_path = os.path.join(os.path.dirname(FLAGS.gold_path), "cache")
  if FLAGS.cache_gold_data and os.path.exists(cache_path):
    logging.info("Reading from cache: %s", cache_path)
    # Pickle caches must be read and written in binary mode.
    nq_gold_dict = pickle.load(open(cache_path, "rb"))
  else:
    nq_gold_dict = util.read_annotation(
        FLAGS.gold_path, n_threads=FLAGS.num_threads)
    if FLAGS.cache_gold_data:
      logging.info("Caching gold data for next time to: %s", cache_path)
      pickle.dump(nq_gold_dict, open(cache_path, "wb"))

  nq_pred_dict = util.read_prediction_json(FLAGS.predictions_path)

  long_answer_stats, short_answer_stats = score_answers(nq_gold_dict,
                                                        nq_pred_dict)

  if FLAGS.pretty_print:
    print("*" * 20)
    print("LONG ANSWER R@P TABLE:")
    print_r_at_p_table(long_answer_stats)
    print("*" * 20)
    print("SHORT ANSWER R@P TABLE:")
    print_r_at_p_table(short_answer_stats)

    scores = compute_final_f1(long_answer_stats, short_answer_stats)
    print("*" * 20)
    print("METRICS IGNORING SCORES (n={}):".format(scores["long-answer-n"]))
    print("              F1     /  P      /  R")
    print("Long answer  {: >7.2%} / {: >7.2%} / {: >7.2%}".format(
        scores["long-answer-f1"],
        scores["long-answer-precision"],
        scores["long-answer-recall"],
    ))
    print("Short answer {: >7.2%} / {: >7.2%} / {: >7.2%}".format(
        scores["short-answer-f1"],
        scores["short-answer-precision"],
        scores["short-answer-recall"],
    ))
  else:
    metrics = get_metrics_with_answer_stats(long_answer_stats,
                                            short_answer_stats)
    print(json.dumps(metrics))
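# Example command-line usage (a sketch only; the script name and file paths
# are assumptions, while the flags are the ones referenced in `main` above):
#
#   python nq_eval.py \
#     --gold_path=/path/to/gold-files-* \
#     --predictions_path=/path/to/predictions.json \
#     --num_threads=10 \
#     --cache_gold_data \
#     --pretty_print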
def get_metrics_as_dict(gold_path, prediction_path):
  """Library version of the end-to-end evaluation.

  Arguments:
    gold_path: Path to a single JSONL data file; it may or may not be gzipped.
    prediction_path: Path to the JSONL file of prediction data.

  Returns:
    metrics: A dictionary mapping string names to metric scores.
  """
  tydi_gold_dict = eval_utils.read_annotation(gold_path)
  tydi_pred_dict = eval_utils.read_prediction_jsonl(prediction_path)
  passage_answer_stats, minimal_answer_stats = score_answers(
      tydi_gold_dict, tydi_pred_dict)
  return get_metrics_with_answer_stats(passage_answer_stats,
                                       minimal_answer_stats)
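# Example library usage (a minimal sketch; the file paths are placeholders):
#
#   metrics = get_metrics_as_dict(
#       gold_path='/path/to/dev.jsonl.gz',
#       prediction_path='/path/to/predictions.jsonl')
#   print(json.dumps(metrics, indent=2))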
def main(_):
  cache_path = os.path.join(os.path.dirname(FLAGS.gold_path), 'cache')
  if FLAGS.cache_gold_data and os.path.exists(cache_path):
    logging.info('Reading from cache: %s', cache_path)
    # Pickle caches must be read and written in binary mode.
    nq_gold_dict = pickle.load(open(cache_path, 'rb'))
  else:
    nq_gold_dict = util.read_annotation(
        FLAGS.gold_path, n_threads=FLAGS.num_threads)
    if FLAGS.cache_gold_data:
      logging.info('Caching gold data for next time to: %s', cache_path)
      pickle.dump(nq_gold_dict, open(cache_path, 'wb'))

  nq_pred_dict = util.read_prediction_json(FLAGS.predictions_path)

  long_answer_stats, short_answer_stats = score_answers(nq_gold_dict,
                                                        nq_pred_dict)

  if FLAGS.pretty_print:
    print('*' * 20)
    print('LONG ANSWER R@P TABLE:')
    print_r_at_p_table(long_answer_stats)
    print('*' * 20)
    print('SHORT ANSWER R@P TABLE:')
    print_r_at_p_table(short_answer_stats)

    scores = compute_final_f1(long_answer_stats, short_answer_stats)
    print('*' * 20)
    print('METRICS IGNORING SCORES (n={}):'.format(scores['long-answer-n']))
    print('              F1     /  P      /  R')
    print('Long answer  {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
        scores['long-answer-f1'], scores['long-answer-precision'],
        scores['long-answer-recall']))
    print('Short answer {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
        scores['short-answer-f1'], scores['short-answer-precision'],
        scores['short-answer-recall']))
  else:
    metrics = get_metrics_with_answer_stats(long_answer_stats,
                                            short_answer_stats)
    print(json.dumps(metrics))
def main(_):
  cache_path = os.path.join(os.path.dirname(FLAGS.gold_path), 'cache')
  if FLAGS.cache_gold_data and os.path.exists(cache_path):
    logging.info('Reading from cache: %s', cache_path)
    # Pickle caches must be read and written in binary mode.
    nq_gold_dict = pickle.load(open(cache_path, 'rb'))
  else:
    nq_gold_dict = util.read_annotation(
        FLAGS.gold_path, n_threads=FLAGS.num_threads)
    if FLAGS.cache_gold_data:
      logging.info('Caching gold data for next time to: %s', cache_path)
      pickle.dump(nq_gold_dict, open(cache_path, 'wb'))

  nq_pred_dict = util.read_prediction_json(FLAGS.predictions_path)

  # Input: nq_gold_dict, nq_pred_dict.
  # Output: long and short answer scores (with optional optimal threshold).
  print('final_f1, final_p, final_r:', get_f1(nq_gold_dict, nq_pred_dict))
def main(_):
  cache_path = os.path.join(os.path.dirname(FLAGS.gold_path), 'cache')
  if FLAGS.cache_gold_data and os.path.exists(cache_path):
    logging.info('Reading from cache: %s', cache_path)
    # Pickle caches must be read and written in binary mode.
    tydi_gold_dict = pickle.load(open(cache_path, 'rb'))
  else:
    tydi_gold_dict = eval_utils.read_annotation(FLAGS.gold_path)
    if FLAGS.cache_gold_data:
      logging.info('Caching gold data for future to: %s', cache_path)
      pickle.dump(tydi_gold_dict, open(cache_path, 'wb'))

  total_ans_count = 0
  count = 0
  for ans in tydi_gold_dict.values():
    count += 1
    gold_has_answer = eval_utils.gold_has_minimal_answer(
        ans, FLAGS.minimal_non_null_threshold)
    total_ans_count += gold_has_answer
  logging.info('%d examples have minimal answers', total_ans_count)
  logging.info('*' * 40)

  tydi_pred_dict = eval_utils.read_prediction_jsonl(FLAGS.predictions_path)

  # Group gold annotations and predictions by language.
  per_lang_gold = {}
  per_lang_pred = {}
  for ex_id, ex in tydi_gold_dict.items():
    if ex[0].language in per_lang_gold:
      per_lang_gold[ex[0].language][ex_id] = ex
    else:
      per_lang_gold[ex[0].language] = {ex_id: ex}
  for ex_id, ex in tydi_pred_dict.items():
    if ex.language in per_lang_pred:
      per_lang_pred[ex.language][ex_id] = ex
    else:
      per_lang_pred[ex.language] = {ex_id: ex}

  macro_avg_passage_scores = ([], [], [])
  macro_avg_minimal_scores = ([], [], [])
  language_list = [
      'english', 'arabic', 'bengali', 'finnish', 'indonesian', 'japanese',
      'swahili', 'korean', 'russian', 'telugu', 'thai'
  ]
  for lang in language_list:
    if lang in per_lang_pred:
      passage_answer_stats, minimal_answer_stats = score_answers(
          per_lang_gold.get(lang, {}), per_lang_pred[lang])

      # Passage selection task.
      opt_result, _ = compute_pr_curves(passage_answer_stats, targets=[0.5])
      f1, precision, recall, _ = opt_result
      if lang != 'english':
        macro_avg_passage_scores[0].append(f1)
        macro_avg_passage_scores[1].append(precision)
        macro_avg_passage_scores[2].append(recall)
      print('Passage & ' + lang + ' & ' + get_latex_str(f1, precision, recall))

      # Minimal answer span task.
      opt_result, _ = compute_pr_curves(minimal_answer_stats, targets=[0.5])
      f1, precision, recall, _ = opt_result
      if lang != 'english':
        macro_avg_minimal_scores[0].append(f1)
        macro_avg_minimal_scores[1].append(precision)
        macro_avg_minimal_scores[2].append(recall)
      print('Minimal Answer & ' + lang + ' & ' +
            get_latex_str(f1, precision, recall))

      if FLAGS.pretty_print:
        print('*' * 20)
        print(lang)
        print('Language: %s (%d)' % (lang, len(per_lang_gold.get(lang, {}))))
        print('*' * 20)
        print('PASSAGE ANSWER R@P TABLE:')
        print_r_at_p_table(passage_answer_stats)
        print('*' * 20)
        print('MINIMAL ANSWER R@P TABLE:')
        print_r_at_p_table(minimal_answer_stats)
      else:
        metrics = get_metrics_with_answer_stats(passage_answer_stats,
                                                minimal_answer_stats)
        print(json.dumps(metrics))

  print('Total # examples in gold: %d, # ex. in pred: %d (including english)' %
        (len(tydi_gold_dict), len(tydi_pred_dict)))

  f1_list, precision_list, recall_list = macro_avg_passage_scores
  print('*** Macro Over %d Languages, excluding English ***' % len(f1_list))
  avg_passage_f1 = eval_utils.safe_average(f1_list)
  avg_passage_recall = eval_utils.safe_average(recall_list)
  avg_passage_precision = eval_utils.safe_average(precision_list)
  print('Passage F1:%.3f P:%.3f R:%.3f' %
        (avg_passage_f1, avg_passage_precision, avg_passage_recall))
  print(get_latex_str(avg_passage_f1, avg_passage_precision,
                      avg_passage_recall))

  f1_list, precision_list, recall_list = macro_avg_minimal_scores
  avg_minimal_f1 = eval_utils.safe_average(f1_list)
  avg_minimal_recall = eval_utils.safe_average(recall_list)
  avg_minimal_precision = eval_utils.safe_average(precision_list)
  print('Minimal F1:%.3f P:%.3f R:%.3f' %
        (avg_minimal_f1, avg_minimal_precision, avg_minimal_recall))
  print(get_latex_str(avg_minimal_f1, avg_minimal_precision,
                      avg_minimal_recall))
  print('*** / Aggregate Scores ****')

  aggregate_metrics = {
      'avg_passage_f1': avg_passage_f1,
      'avg_passage_recall': avg_passage_recall,
      'avg_passage_precision': avg_passage_precision,
      'avg_minimal_f1': avg_minimal_f1,
      'avg_minimal_recall': avg_minimal_recall,
      'avg_minimal_precision': avg_minimal_precision
  }
  print(json.dumps(aggregate_metrics))
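# Example command-line usage (a sketch only; the script name, file paths, and
# the threshold value shown are assumptions, while the flags match the ones
# referenced in `main` above):
#
#   python tydi_eval.py \
#     --gold_path=/path/to/dev.jsonl.gz \
#     --predictions_path=/path/to/predictions.jsonl \
#     --minimal_non_null_threshold=2 \
#     --pretty_print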
def main(_):
  nq_gold_dict = util.read_annotation(
      FLAGS.gold_path, n_threads=FLAGS.num_threads)

  def label_to_pred(labels):
    """Converts a list of gold human annotations to a perfect prediction."""
    gold_has_short_answer = util.gold_has_short_answer(labels)
    gold_has_long_answer = util.gold_has_long_answer(labels)

    # We do not set `long_answer` or `yes_no_answer` here; they are treated
    # as null when loading from the input.
    pred = {
        'example_id': labels[0].example_id,
        'short_answers': [],
        'short_answers_score': random.random(),
        'long_answer_score': random.random()
    }

    keep_answer = random.random() <= FLAGS.desired_recall

    for label in labels:
      if gold_has_short_answer and keep_answer:
        pred['short_answers_score'] *= 2

        if not util.is_null_span_list(label.short_answer_span_list):
          pred['short_answers'] = [{
              'start_token': span.start_token_idx,
              'end_token': span.end_token_idx,
              'start_byte': span.start_byte,
              'end_byte': span.end_byte
          } for span in label.short_answer_span_list]
          pred['yes_no_answer'] = 'none'
        elif label.yes_no_answer != 'none':
          pred['short_answers'] = []
          pred['yes_no_answer'] = label.yes_no_answer

      if (gold_has_long_answer and not label.long_answer_span.is_null_span()
          and keep_answer):
        pred['long_answer'] = {
            'start_token': label.long_answer_span.start_token_idx,
            'end_token': label.long_answer_span.end_token_idx,
            'start_byte': label.long_answer_span.start_byte,
            'end_byte': label.long_answer_span.end_byte
        }
        pred['long_answer_score'] *= 2

    if FLAGS.generate_false_positives:
      if not gold_has_short_answer:
        pred['short_answers'] = [{
            'start_token': 0,
            'end_token': 1,
            'start_byte': -1,
            'end_byte': -1
        }]
      if not gold_has_long_answer:
        pred['long_answer_start_token'] = 0
        pred['long_answer_end_token'] = 1

    return pred

  predictions = []
  for _, labels in nq_gold_dict.items():
    predictions.append(label_to_pred(labels))

  with open(FLAGS.output_path, 'w') as f:
    json.dump({'predictions': predictions}, f)
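# For reference, each entry written to the output file has the shape built up
# by `label_to_pred` above; the example ID, offsets, and scores shown here are
# illustrative values only:
#
#   {
#     'example_id': 1234567890,
#     'long_answer': {'start_token': 5, 'end_token': 70,
#                     'start_byte': -1, 'end_byte': -1},
#     'long_answer_score': 0.87,
#     'short_answers': [{'start_token': 17, 'end_token': 21,
#                        'start_byte': -1, 'end_byte': -1}],
#     'short_answers_score': 0.95,
#     'yes_no_answer': 'none'
#   }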