def test_glue_average(self):
    # Build a two-row DataFrame with one column per GLUE/SuperGLUE metric
    # (plus two extra columns): row 0 holds 0..N-1, row 1 the same values doubled.
    mets = eval_utils.METRIC_NAMES.items()
    glue_metric_names = [
        v.name for k, v in mets
        if k.startswith("glue") and "average" not in k
    ]
    super_glue_metric_names = [
        v.name for k, v in mets
        if k.startswith("super") and "average" not in k
    ]
    extra_metric_names = ["Fake metric", "Average GLUE Score"]
    columns = glue_metric_names + super_glue_metric_names + extra_metric_names
    n_total_metrics = len(columns)
    df = pd.DataFrame(
        [np.arange(n_total_metrics), 2 * np.arange(n_total_metrics)],
        columns=columns,
    )
    df = eval_utils.compute_avg_glue(df)
    # Tasks with two entries (MRPC, STS-B, QQP, MNLI matched/mismatched) are
    # averaged within the task before the mean over the eight GLUE tasks.
    expected_glue = (0 + 1 + (2 + 3) / 2. + (4 + 5) / 2. + (6 + 7) / 2. +
                     (8 + 9) / 2. + 10 + 11) / 8.
    self.assertSequenceAlmostEqual(df["Average GLUE Score"],
                                   [expected_glue, 2 * expected_glue])
    expected_super = (12 + (13 + 14) / 2. + 15 + (16 + 17) / 2. +
                      (18 + 19) / 2. + 20 + 21 + 22) / 8.
    self.assertSequenceAlmostEqual(df["Average SuperGLUE Score"],
                                   [expected_super, 2 * expected_super])
    # With a constituent metric missing, the average should not be recomputed.
    del df["CoLA"]
    del df["Average GLUE Score"]
    df = eval_utils.compute_avg_glue(df)
    self.assertNoCommonElements(df.columns, ["Average GLUE Score"])
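For context, the test above only relies on the general shape of eval_utils.METRIC_NAMES: an ordered mapping from a TensorBoard task/metric key to an object exposing a human-readable .name. A minimal illustrative stand-in follows; the entries and display names here are assumptions, not the real table.

import collections

Metric = collections.namedtuple("Metric", ["name"])

# Illustrative stand-in only; the real table in eval_utils covers every
# GLUE/SuperGLUE metric and supplies the column names used in the test.
FAKE_METRIC_NAMES = collections.OrderedDict([
    ("glue_cola_v002/matthews_corrcoef", Metric("CoLA")),
    ("glue_mrpc_v002/f1", Metric("MRPC (F1)")),
    ("glue_mrpc_v002/accuracy", Metric("MRPC (accuracy)")),
    ("super_glue_boolq_v102/accuracy", Metric("BoolQ")),
    ("glue_average", Metric("Average GLUE Score")),
])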
Example #2
def main(_):
    if FLAGS.seqio_summaries:
        subdirs = tf.io.gfile.listdir(FLAGS.summary_dir)
        summary_dirs = [os.path.join(FLAGS.summary_dir, d) for d in subdirs]
    else:
        summary_dirs = [FLAGS.summary_dir]

    scores = None
    for d in summary_dirs:
        events = eval_utils.parse_events_files(d, FLAGS.seqio_summaries)
        if FLAGS.perplexity_eval:
            task_metrics = events
        else:
            task_metrics = eval_utils.get_eval_metric_values(
                events,
                task_name=os.path.basename(d)
                if FLAGS.seqio_summaries else None)
        if scores:
            scores.update(task_metrics)
        else:
            scores = task_metrics

    if not scores:
        logging.info("No evaluation events found in %s", FLAGS.summary_dir)
        return
    df = eval_utils.scores_to_df(scores)
    df = eval_utils.compute_avg_glue(df)
    df = eval_utils.sort_columns(df)
    eval_utils.log_csv(df, output_file=FLAGS.out_file)
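The main() above (and the similar entry points in the later examples) assumes absl flags and imports defined at module level. Below is a minimal sketch of that setup, inferred from the usage here; the defaults and help strings are guesses rather than the script's actual definitions.

import os

from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
# eval_utils comes from the surrounding package; its exact import is omitted.

FLAGS = flags.FLAGS

flags.DEFINE_string("summary_dir", None,
                    "Directory containing TensorBoard event files.")
flags.DEFINE_string("out_file", None,
                    "Output path for the CSV of scores.")
flags.DEFINE_bool("seqio_summaries", False,
                  "Whether summaries were written by SeqIO, with one "
                  "subdirectory per task.")
flags.DEFINE_bool("perplexity_eval", False,
                  "Whether the events contain perplexities rather than eval "
                  "metrics.")


if __name__ == "__main__":
    app.run(main)

Invocation would then look like python <script>.py --summary_dir=/path/to/model/dir --out_file=/tmp/scores.csv, with --seqio_summaries and --perplexity_eval toggling the two branches above.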
Example #3
def test_glue_average(self):
    score_names = [
        "glue_cola_v002/matthews_corrcoef",
        "glue_sst2_v002/accuracy",
        "glue_mrpc_v002/f1",
        "glue_mrpc_v002/accuracy",
        "glue_stsb_v002/pearson_corrcoef",
        "glue_stsb_v002/spearman_corrcoef",
        "glue_qqp_v002/f1",
        "glue_qqp_v002/accuracy",
        "glue_mnli_matched_v002/accuracy",
        "glue_mnli_mismatched_v002/accuracy",
        "glue_qnli_v002/accuracy",
        "glue_rte_v002/accuracy",
        "super_glue_boolq_v102/accuracy",
        "super_glue_cb_v102/mean_3class_f1",
        "super_glue_cb_v102/accuracy",
        "super_glue_copa_v102/accuracy",
        "super_glue_multirc_v102/f1",
        "super_glue_multirc_v102/exact_match",
        "super_glue_record_v102/f1",
        "super_glue_record_v102/em",
        "super_glue_rte_v102/accuracy",
        "super_glue_wic_v102/accuracy",
        "super_glue_wsc_v102_simple_eval/accuracy",
        "super_glue_average",
        "random/accuracy",
        "glue_average",
    ]
    scores = {k: [(20, n), (30, n*2)] for n, k in enumerate(score_names)}
    scores = eval_utils.compute_avg_glue(scores)
    expected_glue = (
        0 + 1 + (2 + 3)/2. + (4 + 5)/2. + (6 + 7)/2. + (8 + 9)/2. + 10 + 11
    )/8.
    expected_glue_average = [(20, expected_glue), (30, expected_glue * 2)]
    self.assertEqual(scores["glue_average"], expected_glue_average)
    expected_super = (
        12 + (13 + 14)/2. + 15 + (16 + 17)/2. + (18 + 19)/2. + 20 + 21 + 22
    )/8.
    expected_super_average = [(20, expected_super), (30, expected_super * 2)]
    self.assertEqual(scores["super_glue_average"], expected_super_average)
    # Test that keys don't get added when GLUE scores are not computed
    scores = {k: [(20, n), (30, n*2)] for n, k in enumerate(score_names)}
    del scores["glue_cola_v002/matthews_corrcoef"]
    del scores["glue_average"]
    scores = eval_utils.compute_avg_glue(scores)
    self.assertNoCommonElements(scores.keys(), ["glue_average"])
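For reference, the dict-based variant above stores each metric as a list of (step, value) tuples, and the expected values encode the convention that tasks reporting two numbers (MRPC, STS-B, QQP, CB, MultiRC, ReCoRD, and the MNLI matched/mismatched pair) are averaged within the task before the across-task mean. Here is a rough sketch of that arithmetic for the GLUE side, not the library's implementation, with a made-up helper name:

def sketch_glue_average(scores, step_index=0):
    # Group metric values by task, folding MNLI matched/mismatched into one
    # task and skipping the precomputed "glue_average" and non-GLUE entries.
    per_task = {}
    for name, step_values in scores.items():
        if not name.startswith("glue_") or name == "glue_average":
            continue
        task = name.split("/")[0]
        task = task.replace("_matched", "").replace("_mismatched", "")
        per_task.setdefault(task, []).append(step_values[step_index][1])
    # Average within each task first, then across the eight GLUE tasks.
    task_means = [sum(vals) / len(vals) for vals in per_task.values()]
    return sum(task_means) / len(task_means)

With the scores dict built in the test, sketch_glue_average(scores) returns 5.5 at step 20, matching expected_glue.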
Example #4
def main(_):
    events = eval_utils.parse_events_files(FLAGS.summary_dir)
    scores = eval_utils.get_eval_metric_values(events)
    if not scores:
        logging.info("No evaluation events found in %s", FLAGS.summary_dir)
        return
    scores = eval_utils.compute_avg_glue(scores)
    eval_utils.log_csv(scores, output_file=FLAGS.out_file)
Example #5
def main(_):
    events = eval_utils.parse_events_files(FLAGS.summary_dir)
    if FLAGS.perplexity_eval:
        scores = events
    else:
        scores = eval_utils.get_eval_metric_values(events)
    if not scores:
        logging.info("No evaluation events found in %s", FLAGS.summary_dir)
        return
    df = eval_utils.scores_to_df(scores)
    df = eval_utils.compute_avg_glue(df)
    df = eval_utils.sort_columns(df)
    eval_utils.log_csv(df, output_file=FLAGS.out_file)