def main(_):
  # With --seqio_summaries, each subdirectory of summary_dir holds the
  # summaries for one task; otherwise summary_dir is parsed directly.
  if FLAGS.seqio_summaries:
    subdirs = tf.io.gfile.listdir(FLAGS.summary_dir)
    summary_dirs = [os.path.join(FLAGS.summary_dir, d) for d in subdirs]
  else:
    summary_dirs = [FLAGS.summary_dir]

  # Merge the metrics from all summary directories into a single dict.
  scores = None
  for d in summary_dirs:
    events = eval_utils.parse_events_files(d, FLAGS.seqio_summaries)
    if FLAGS.perplexity_eval:
      task_metrics = events
    else:
      task_metrics = eval_utils.get_eval_metric_values(
          events,
          task_name=os.path.basename(d) if FLAGS.seqio_summaries else None)
    if scores:
      scores.update(task_metrics)
    else:
      scores = task_metrics

  if not scores:
    logging.info("No evaluation events found in %s", FLAGS.summary_dir)
    return

  # Convert to a DataFrame, add the average GLUE score, and write out a CSV.
  df = eval_utils.scores_to_df(scores)
  df = eval_utils.compute_avg_glue(df)
  df = eval_utils.sort_columns(df)
  eval_utils.log_csv(df, output_file=FLAGS.out_file)
def main(_):
  events = eval_utils.parse_events_files(FLAGS.summary_dir)
  scores = eval_utils.get_eval_metric_values(events)
  if not scores:
    logging.info("No evaluation events found in %s", FLAGS.summary_dir)
    return
  scores = eval_utils.compute_avg_glue(scores)
  eval_utils.log_csv(scores, output_file=FLAGS.out_file)
def test_get_eval_metric_values(self):
  events = {
      "eval/foo_task/accuracy": [(20, 1.), (30, 2.)],
      "eval/bar_task/sequence_accuracy": [(10, 3.)],
      "loss": [(40, 3.)],
  }
  eval_values = eval_utils.get_eval_metric_values(events)
  self.assertDictEqual(
      eval_values, {
          "foo_task/accuracy": [(20, 1.), (30, 2.)],
          "bar_task/sequence_accuracy": [(10, 3.)],
      })
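# A minimal sketch of a harness for running the test method above, assuming it
# is a method of an absltest.TestCase; the class name and the eval_utils
# import path are illustrative assumptions, not taken from the source.
from absl.testing import absltest


class EvalUtilsTest(absltest.TestCase):

  def test_get_eval_metric_values(self):
    ...  # body as in the test method above


if __name__ == "__main__":
  absltest.main()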
def main(_):
  events = eval_utils.parse_events_files(FLAGS.summary_dir)
  # In perplexity mode, report the raw event values; otherwise extract only
  # the eval metrics.
  if FLAGS.perplexity_eval:
    scores = events
  else:
    scores = eval_utils.get_eval_metric_values(events)
  if not scores:
    logging.info("No evaluation events found in %s", FLAGS.summary_dir)
    return
  df = eval_utils.scores_to_df(scores)
  df = eval_utils.compute_avg_glue(df)
  df = eval_utils.sort_columns(df)
  eval_utils.log_csv(df, output_file=FLAGS.out_file)
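# A minimal sketch of the absl wiring around a main() like the ones above,
# assuming the flag names read by main (summary_dir, out_file,
# perplexity_eval); the help strings and defaults here are illustrative only.
from absl import app
from absl import flags

FLAGS = flags.FLAGS

flags.DEFINE_string("summary_dir", None,
                    "Directory containing TensorBoard event files.")
flags.DEFINE_string("out_file", None,
                    "Path of the CSV file to write scores to.")
flags.DEFINE_bool("perplexity_eval", False,
                  "If true, report raw event values instead of eval metrics.")


def main(_):
  ...  # body as in one of the main() variants above


if __name__ == "__main__":
  flags.mark_flag_as_required("summary_dir")
  app.run(main)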