def test_glue_average(self):
  mets = eval_utils.METRIC_NAMES.items()
  glue_metric_names = [
      v.name for k, v in mets if k.startswith("glue") and "average" not in k
  ]
  super_glue_metric_names = [
      v.name for k, v in mets if k.startswith("super") and "average" not in k
  ]
  extra_metric_names = ["Fake metric", "Average GLUE Score"]
  columns = glue_metric_names + super_glue_metric_names + extra_metric_names
  n_total_metrics = len(columns)
  df = pd.DataFrame(
      [np.arange(n_total_metrics), 2 * np.arange(n_total_metrics)],
      columns=columns,
  )
  df = eval_utils.compute_avg_glue(df)
  # Tasks that report two metrics are averaged internally before taking the
  # mean over the eight GLUE (or SuperGLUE) tasks.
  expected_glue = (
      0 + 1 + (2 + 3) / 2. + (4 + 5) / 2. + (6 + 7) / 2. + (8 + 9) / 2. +
      10 + 11) / 8.
  self.assertSequenceAlmostEqual(df["Average GLUE Score"],
                                 [expected_glue, 2 * expected_glue])
  expected_super = (
      12 + (13 + 14) / 2. + 15 + (16 + 17) / 2. + (18 + 19) / 2. +
      20 + 21 + 22) / 8.
  self.assertSequenceAlmostEqual(df["Average SuperGLUE Score"],
                                 [expected_super, 2 * expected_super])
  # Dropping a required column should prevent the average from being computed.
  del df["CoLA"]
  del df["Average GLUE Score"]
  df = eval_utils.compute_avg_glue(df)
  self.assertNoCommonElements(df.columns, ["Average GLUE Score"])
def main(_):
  # With SeqIO summaries, each task writes events to its own subdirectory.
  if FLAGS.seqio_summaries:
    subdirs = tf.io.gfile.listdir(FLAGS.summary_dir)
    summary_dirs = [os.path.join(FLAGS.summary_dir, d) for d in subdirs]
  else:
    summary_dirs = [FLAGS.summary_dir]

  # Merge the metrics from each summary directory into a single dict.
  scores = None
  for d in summary_dirs:
    events = eval_utils.parse_events_files(d, FLAGS.seqio_summaries)
    if FLAGS.perplexity_eval:
      task_metrics = events
    else:
      task_metrics = eval_utils.get_eval_metric_values(
          events,
          task_name=os.path.basename(d) if FLAGS.seqio_summaries else None)
    if scores:
      scores.update(task_metrics)
    else:
      scores = task_metrics

  if not scores:
    logging.info("No evaluation events found in %s", FLAGS.summary_dir)
    return

  df = eval_utils.scores_to_df(scores)
  df = eval_utils.compute_avg_glue(df)
  df = eval_utils.sort_columns(df)
  eval_utils.log_csv(df, output_file=FLAGS.out_file)
def test_glue_average(self):
  score_names = [
      "glue_cola_v002/matthews_corrcoef",
      "glue_sst2_v002/accuracy",
      "glue_mrpc_v002/f1",
      "glue_mrpc_v002/accuracy",
      "glue_stsb_v002/pearson_corrcoef",
      "glue_stsb_v002/spearman_corrcoef",
      "glue_qqp_v002/f1",
      "glue_qqp_v002/accuracy",
      "glue_mnli_matched_v002/accuracy",
      "glue_mnli_mismatched_v002/accuracy",
      "glue_qnli_v002/accuracy",
      "glue_rte_v002/accuracy",
      "super_glue_boolq_v102/accuracy",
      "super_glue_cb_v102/mean_3class_f1",
      "super_glue_cb_v102/accuracy",
      "super_glue_copa_v102/accuracy",
      "super_glue_multirc_v102/f1",
      "super_glue_multirc_v102/exact_match",
      "super_glue_record_v102/f1",
      "super_glue_record_v102/em",
      "super_glue_rte_v102/accuracy",
      "super_glue_wic_v102/accuracy",
      "super_glue_wsc_v102_simple_eval/accuracy",
      "super_glue_average",
      "random/accuracy",
      "glue_average",
  ]
  # Each entry maps a metric name to a list of (step, value) tuples.
  scores = {k: [(20, n), (30, n*2)] for n, k in enumerate(score_names)}
  scores = eval_utils.compute_avg_glue(scores)
  # Tasks that report two metrics are averaged internally before taking the
  # mean over the eight GLUE (or SuperGLUE) tasks.
  expected_glue = (
      0 + 1 + (2 + 3)/2. + (4 + 5)/2. + (6 + 7)/2. + (8 + 9)/2. + 10 + 11
  )/8.
  expected_glue_average = [(20, expected_glue), (30, expected_glue * 2)]
  self.assertEqual(scores["glue_average"], expected_glue_average)
  expected_super = (
      12 + (13 + 14)/2. + 15 + (16 + 17)/2. + (18 + 19)/2. + 20 + 21 + 22
  )/8.
  expected_super_average = [(20, expected_super), (30, expected_super * 2)]
  self.assertEqual(scores["super_glue_average"], expected_super_average)

  # Test that keys don't get added when GLUE scores are not computed.
  scores = {k: [(20, n), (30, n*2)] for n, k in enumerate(score_names)}
  del scores["glue_cola_v002/matthews_corrcoef"]
  del scores["glue_average"]
  scores = eval_utils.compute_avg_glue(scores)
  self.assertNoCommonElements(scores.keys(), ["glue_average"])
def main(_):
  events = eval_utils.parse_events_files(FLAGS.summary_dir)
  scores = eval_utils.get_eval_metric_values(events)
  if not scores:
    logging.info("No evaluation events found in %s", FLAGS.summary_dir)
    return
  scores = eval_utils.compute_avg_glue(scores)
  eval_utils.log_csv(scores, output_file=FLAGS.out_file)
def main(_):
  events = eval_utils.parse_events_files(FLAGS.summary_dir)
  if FLAGS.perplexity_eval:
    scores = events
  else:
    scores = eval_utils.get_eval_metric_values(events)
  if not scores:
    logging.info("No evaluation events found in %s", FLAGS.summary_dir)
    return
  df = eval_utils.scores_to_df(scores)
  df = eval_utils.compute_avg_glue(df)
  df = eval_utils.sort_columns(df)
  eval_utils.log_csv(df, output_file=FLAGS.out_file)