def main(_):
    nq_gold_dict = util.read_annotation(FLAGS.gold_path,
                                        n_threads=FLAGS.num_threads)

    nq_pred_dict = util.read_prediction_json(FLAGS.predictions_path)

    long_answer_stats, short_answer_stats = score_answers(
        nq_gold_dict,
        nq_pred_dict,
        score_thres_long=FLAGS.score_thres_long,
        score_thres_short=FLAGS.score_thres_short)

    # reporting results
    print('*' * 20)

    scores = compute_final_f1(long_answer_stats, short_answer_stats)
    print('*' * 20)
    print('SCORES (n={}):'.format(scores['long-answer-n']))
    print('              F1     /  P      /  R')
    print('Long answer  {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
        scores['long-answer-f1'], scores['long-answer-precision'],
        scores['long-answer-recall']))
    print('Short answer {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
        scores['short-answer-f1'], scores['short-answer-precision'],
        scores['short-answer-recall']))
    print('All answers  {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
        scores['all-answer-f1'], scores['all-answer-precision'],
        scores['all-answer-recall']))
Example #2
def get_metrics_as_dict(gold_path, prediction_path, num_threads=10):
    """Library version of the end-to-end evaluation.
    Arguments:
      gold_path: Path to the gzip JSON data. For multiple files, should be a
        glob pattern (e.g. "/path/to/files-*").
      prediction_path: Path to the JSON prediction data.
      num_threads: Number of threads to use when parsing multiple files
        (default: 10).

    Returns:
      metrics: A dictionary mapping string names to metric scores.
    """

    nq_gold_dict = util.read_annotation(gold_path, n_threads=num_threads)
    nq_pred_dict = util.read_prediction_json(prediction_path)
    long_answer_stats, short_answer_stats = score_answers(
        nq_gold_dict, nq_pred_dict)

    return get_metrics_with_answer_stats(long_answer_stats, short_answer_stats)
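
A minimal usage sketch for `get_metrics_as_dict` above, assuming this module and its `util` dependency are importable; the paths are hypothetical placeholders (the gold path may be a glob over gzipped shards, per the docstring).

# Hypothetical usage sketch; paths are placeholders, not real files.
metrics = get_metrics_as_dict(
    gold_path='/path/to/gold-files-*',
    prediction_path='/path/to/predictions.json',
    num_threads=4)
for name, value in sorted(metrics.items()):
    print('{}: {}'.format(name, value))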
Example #3
def main(_):
    cache_path = os.path.join(os.path.dirname(FLAGS.gold_path), "cache")
    if FLAGS.cache_gold_data and os.path.exists(cache_path):
        logging.info("Reading from cache: %s", format(cache_path))
        nq_gold_dict = pickle.load(open(cache_path, "r"))
    else:
        nq_gold_dict = util.read_annotation(FLAGS.gold_path,
                                            n_threads=FLAGS.num_threads)
        if FLAGS.cache_gold_data:
            logging.info("Caching gold data for next time to: %s",
                         format(cache_path))
            pickle.dump(nq_gold_dict, open(cache_path, "w"))

    nq_pred_dict = util.read_prediction_json(FLAGS.predictions_path)

    long_answer_stats, short_answer_stats = score_answers(
        nq_gold_dict, nq_pred_dict)

    if FLAGS.pretty_print:
        print("*" * 20)
        print("LONG ANSWER R@P TABLE:")
        print_r_at_p_table(long_answer_stats)
        print("*" * 20)
        print("SHORT ANSWER R@P TABLE:")
        print_r_at_p_table(short_answer_stats)

        scores = compute_final_f1(long_answer_stats, short_answer_stats)
        print("*" * 20)
        print("METRICS IGNORING SCORES (n={}):".format(
            scores["long-answer-n"]))
        print("              F1     /  P      /  R")
        print("Long answer  {: >7.2%} / {: >7.2%} / {: >7.2%}".format(
            scores["long-answer-f1"],
            scores["long-answer-precision"],
            scores["long-answer-recall"],
        ))
        print("Short answer {: >7.2%} / {: >7.2%} / {: >7.2%}".format(
            scores["short-answer-f1"],
            scores["short-answer-precision"],
            scores["short-answer-recall"],
        ))
    else:
        metrics = get_metrics_with_answer_stats(long_answer_stats,
                                                short_answer_stats)
        print(json.dumps(metrics))
Example #4
File: tydi_eval.py  Project: wgc20/tydiqa
def get_metrics_as_dict(gold_path, prediction_path):
    """Library version of the end-to-end evaluation.

    Arguments:
      gold_path: Path to a single JSONL file of gold data; may be gzipped.
      prediction_path: Path to the JSONL file of prediction data.

    Returns:
      metrics: A dictionary mapping string names to metric scores.
    """

    tydi_gold_dict = eval_utils.read_annotation(gold_path)
    tydi_pred_dict = eval_utils.read_prediction_jsonl(prediction_path)

    passage_answer_stats, minimal_answer_stats = score_answers(
        tydi_gold_dict, tydi_pred_dict)

    return get_metrics_with_answer_stats(passage_answer_stats,
                                         minimal_answer_stats)
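
A comparable sketch for the TyDi variant; the file names are hypothetical, and the result is dumped as JSON in the same way as the non-pretty-print branch of the `main` functions (assumes `json` is imported at module level, as in the surrounding snippets).

# Hypothetical usage sketch for the TyDi evaluation; file names are placeholders.
metrics = get_metrics_as_dict(
    gold_path='/path/to/tydiqa-dev.jsonl.gz',
    prediction_path='/path/to/predictions.jsonl')
print(json.dumps(metrics, indent=2))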
Example #5
def main(_):
    cache_path = os.path.join(os.path.dirname(FLAGS.gold_path), 'cache')
    if FLAGS.cache_gold_data and os.path.exists(cache_path):
        logging.info('Reading from cache: %s', cache_path)
        nq_gold_dict = pickle.load(open(cache_path, 'rb'))
    else:
        nq_gold_dict = util.read_annotation(FLAGS.gold_path,
                                            n_threads=FLAGS.num_threads)
        if FLAGS.cache_gold_data:
            logging.info('Caching gold data for next time to: %s', cache_path)
            pickle.dump(nq_gold_dict, open(cache_path, 'wb'))

    nq_pred_dict = util.read_prediction_json(FLAGS.predictions_path)

    # print("nq_gold_dict", nq_gold_dict)
    # print("nq_pred_dict", nq_pred_dict)
    long_answer_stats, short_answer_stats = score_answers(
        nq_gold_dict, nq_pred_dict)

    if FLAGS.pretty_print:
        print('*' * 20)
        print('LONG ANSWER R@P TABLE:')
        print_r_at_p_table(long_answer_stats)
        print('*' * 20)
        print('SHORT ANSWER R@P TABLE:')
        print_r_at_p_table(short_answer_stats)

        scores = compute_final_f1(long_answer_stats, short_answer_stats)
        print('*' * 20)
        print('METRICS IGNORING SCORES (n={}):'.format(
            scores['long-answer-n']))
        print('              F1     /  P      /  R')
        print('Long answer  {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
            scores['long-answer-f1'], scores['long-answer-precision'],
            scores['long-answer-recall']))
        print('Short answer {: >7.2%} / {: >7.2%} / {: >7.2%}'.format(
            scores['short-answer-f1'], scores['short-answer-precision'],
            scores['short-answer-recall']))
    else:
        metrics = get_metrics_with_answer_stats(long_answer_stats,
                                                short_answer_stats)
        print(json.dumps(metrics))
Example #6
def main(_):
    cache_path = os.path.join(os.path.dirname(FLAGS.gold_path), 'cache')
    if FLAGS.cache_gold_data and os.path.exists(cache_path):
        logging.info('Reading from cache: %s', cache_path)
        nq_gold_dict = pickle.load(open(cache_path, 'rb'))
    else:
        nq_gold_dict = util.read_annotation(FLAGS.gold_path,
                                            n_threads=FLAGS.num_threads)
        if FLAGS.cache_gold_data:
            logging.info('Caching gold data for next time to: %s', cache_path)
            pickle.dump(nq_gold_dict, open(cache_path, 'wb'))

    nq_pred_dict = util.read_prediction_json(FLAGS.predictions_path)

    ## input: nq_gold_dict, nq_pred_dict
    ## output: long, short score (with optional optimal threshold)

    print('final f1, final_p, final_r', get_f1(nq_gold_dict, nq_pred_dict))
Example #7
File: tydi_eval.py  Project: wgc20/tydiqa
def main(_):
    cache_path = os.path.join(os.path.dirname(FLAGS.gold_path), 'cache')
    if FLAGS.cache_gold_data and os.path.exists(cache_path):
        logging.info('Reading from cache: %s', cache_path)
        tydi_gold_dict = pickle.load(open(cache_path, 'rb'))
    else:
        tydi_gold_dict = eval_utils.read_annotation(FLAGS.gold_path)
        if FLAGS.cache_gold_data:
            logging.info('Caching gold data for future to: %s', cache_path)
            pickle.dump(tydi_gold_dict, open(cache_path, 'wb'))
    total_ans_count = 0
    count = 0

    for ans in tydi_gold_dict.values():
        count += 1
        gold_has_answer = eval_utils.gold_has_minimal_answer(
            ans, FLAGS.minimal_non_null_threshold)
        total_ans_count += gold_has_answer

    logging.info('%d examples have minimal answers', total_ans_count)
    logging.info('*' * 40)
    tydi_pred_dict = eval_utils.read_prediction_jsonl(FLAGS.predictions_path)

    per_lang_gold = {}
    per_lang_pred = {}

    for ex_id, ex in tydi_gold_dict.items():
        if ex[0].language in per_lang_gold:
            per_lang_gold[ex[0].language][ex_id] = ex
        else:
            per_lang_gold[ex[0].language] = {ex_id: ex}
    for ex_id, ex in tydi_pred_dict.items():
        if ex.language in per_lang_pred:
            per_lang_pred[ex.language][ex_id] = ex
        else:
            per_lang_pred[ex.language] = {ex_id: ex}

    macro_avg_passage_scores = ([], [], [])
    macro_avg_minimal_scores = ([], [], [])

    language_list = [
        'english', 'arabic', 'bengali', 'finnish', 'indonesian', 'japanese',
        'swahili', 'korean', 'russian', 'telugu', 'thai'
    ]
    for lang in language_list:
        if lang in per_lang_pred:
            passage_answer_stats, minimal_answer_stats = score_answers(
                per_lang_gold.get(lang, {}), per_lang_pred[lang])

            # Passage selection task
            opt_result, _ = compute_pr_curves(passage_answer_stats,
                                              targets=[0.5])
            f1, precision, recall, _ = opt_result
            if lang != 'english':
                macro_avg_passage_scores[0].append(f1)
                macro_avg_passage_scores[1].append(precision)
                macro_avg_passage_scores[2].append(recall)
            print('Passage & ' + lang + ' & ' +
                  get_latex_str(f1, precision, recall))

            # Minimal answer span task
            opt_result, _ = compute_pr_curves(minimal_answer_stats,
                                              targets=[0.5])
            f1, precision, recall, _ = opt_result
            if lang != 'english':
                macro_avg_minimal_scores[0].append(f1)
                macro_avg_minimal_scores[1].append(precision)
                macro_avg_minimal_scores[2].append(recall)
            print('Minimal Answer & ' + lang + ' & ' +
                  get_latex_str(f1, precision, recall))

            if FLAGS.pretty_print:
                print('*' * 20)
                print(lang)
                print('Language: %s (%d)' %
                      (lang, len(per_lang_gold.get(lang, {}))))
                print('*' * 20)
                print('PASSAGE ANSWER R@P TABLE:')
                print_r_at_p_table(passage_answer_stats)
                print('*' * 20)
                print('MINIMAL ANSWER R@P TABLE:')
                print_r_at_p_table(minimal_answer_stats)
            else:
                metrics = get_metrics_with_answer_stats(
                    passage_answer_stats, minimal_answer_stats)
                print(json.dumps(metrics))

    print(
        'Total # examples in gold: %d, # ex. in pred: %d (including english)' %
        (len(tydi_gold_dict), len(tydi_pred_dict)))

    f1_list, precision_list, recall_list = macro_avg_passage_scores
    print('*** Macro Over %d Languages, excluding English ***' % len(f1_list))
    avg_passage_f1 = eval_utils.safe_average(f1_list)
    avg_passage_recall = eval_utils.safe_average(recall_list)
    avg_passage_precision = eval_utils.safe_average(precision_list)
    print('Passage F1:%.3f P:%.3f R:%.3f' %
          (avg_passage_f1, avg_passage_precision, avg_passage_recall))
    print(
        get_latex_str(avg_passage_f1, avg_passage_precision,
                      avg_passage_recall))

    f1_list, precision_list, recall_list = macro_avg_minimal_scores

    avg_minimal_f1 = eval_utils.safe_average(f1_list)
    avg_minimal_recall = eval_utils.safe_average(recall_list)
    avg_minimal_precision = eval_utils.safe_average(precision_list)
    print('Minimal F1:%.3f P:%.3f R:%.3f' %
          (avg_minimal_f1, avg_minimal_precision, avg_minimal_recall))
    print(
        get_latex_str(avg_minimal_f1, avg_minimal_precision,
                      avg_minimal_recall))
    print('*** / Aggregate Scores ****')

    aggregate_metrics = {
        'avg_passage_f1': avg_passage_f1,
        'avg_passage_recall': avg_passage_recall,
        'avg_passage_precision': avg_passage_precision,
        'avg_minimal_f1': avg_minimal_f1,
        'avg_minimal_recall': avg_minimal_recall,
        'avg_minimal_precision': avg_minimal_precision
    }
    print(json.dumps(aggregate_metrics))
Example #8
def main(_):
    nq_gold_dict = util.read_annotation(FLAGS.gold_path,
                                        n_threads=FLAGS.num_threads)

    def label_to_pred(labels):
        """Convert a list of gold human annotations to a perfect prediction."""
        gold_has_short_answer = util.gold_has_short_answer(labels)

        gold_has_long_answer = util.gold_has_long_answer(labels)

        # `long_answer` and `yes_no_answer` are not set here; they are treated
        # as null when the prediction is loaded.

        pred = {
            'example_id': labels[0].example_id,
            'short_answers': [],
            'short_answers_score': random.random(),
            'long_answer_score': random.random()
        }

        keep_answer = random.random() <= FLAGS.desired_recall
        for label in labels:
            if gold_has_short_answer and keep_answer:
                pred['short_answers_score'] *= 2
                if not util.is_null_span_list(label.short_answer_span_list):
                    pred['short_answers'] = ([{
                        'start_token': span.start_token_idx,
                        'end_token': span.end_token_idx,
                        'start_byte': span.start_byte,
                        'end_byte': span.end_byte
                    } for span in label.short_answer_span_list])
                    pred['yes_no_answer'] = 'none'
                elif label.yes_no_answer != 'none':
                    pred['short_answers'] = []
                    pred['yes_no_answer'] = label.yes_no_answer

            if (gold_has_long_answer
                    and not label.long_answer_span.is_null_span()
                    and keep_answer):
                pred['long_answer'] = {
                    'start_token': label.long_answer_span.start_token_idx,
                    'end_token': label.long_answer_span.end_token_idx,
                    'start_byte': label.long_answer_span.start_byte,
                    'end_byte': label.long_answer_span.end_byte
                }
                pred['long_answer_score'] *= 2

        if FLAGS.generate_false_positives:
            if not gold_has_short_answer:
                pred['short_answers'] = [{
                    'start_token': 0,
                    'end_token': 1,
                    'start_byte': -1,
                    'end_byte': -1
                }]

            if not gold_has_long_answer:
                pred['long_answer_start_token'] = 0
                pred['long_answer_end_token'] = 1

        return pred

    predictions = []
    for labels in nq_gold_dict.values():
        predictions.append(label_to_pred(labels))

    with open(FLAGS.output_path, 'w') as f:
        json.dump({'predictions': predictions}, f)
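
For reference, the prediction file written above takes roughly the following shape; the concrete values are illustrative only, and the optional keys (`long_answer`, `yes_no_answer`) appear only when the corresponding branches in `label_to_pred` fire.

# Illustrative shape of the JSON written to FLAGS.output_path (values made up).
sample_output = {
    'predictions': [{
        'example_id': 1234567890,
        'long_answer': {'start_token': 5, 'end_token': 42,
                        'start_byte': 100, 'end_byte': 780},
        'long_answer_score': 0.87,
        'short_answers': [{'start_token': 10, 'end_token': 12,
                           'start_byte': 210, 'end_byte': 245}],
        'short_answers_score': 0.91,
        'yes_no_answer': 'none',
    }]
}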