def test_uneven_macro_aggrevation(self):
    report1 = {'avg': AverageMetric(1, 1)}
    report2 = {'avg': AverageMetric(0, 1)}
    report3 = {'avg': AverageMetric(0, 1)}
    agg1 = aggregate_named_reports(
        {'a': report1, 'b': report2}, micro_average=False
    )
    agg2 = aggregate_named_reports({'a': {}, 'c': report3}, micro_average=False)

    agg = aggregate_unnamed_reports([agg1, agg2])
    assert agg1['avg'] == 0.5
    assert agg2['avg'] == 0.0
    assert agg['a/avg'] == 1.0
    assert agg['b/avg'] == 0.0
    assert agg['c/avg'] == 0.0
    assert agg['avg'] == 1.0 / 3
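# Not part of the original test file: a minimal, self-contained sketch of why the
# combined macro 'avg' above comes out to 1.0 / 3. The reading assumed here is that a
# macro-averaged report remembers how many tasks contributed, so aggregating the two
# reports averages the per-task ratios over all three tasks (a, b, c). The helper
# name below is hypothetical.
def _illustrate_uneven_macro_average():
    per_task_ratios = [1 / 1, 0 / 1, 0 / 1]  # tasks a, b, c
    combined = sum(per_task_ratios) / len(per_task_ratios)
    assert abs(combined - 1.0 / 3) < 1e-12
    return combined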
def test_micro_aggregation(self):
    report1 = {
        'avg': AverageMetric(3, 4),
        'sum': SumMetric(3),
        'fixed': FixedMetric(4),
        'global_avg': GlobalAverageMetric(3, 4),
    }
    report2 = {
        'avg': AverageMetric(1, 3),
        'sum': SumMetric(4),
        'fixed': FixedMetric(4),
        'global_avg': GlobalAverageMetric(1, 3),
    }
    agg = aggregate_named_reports({'a': report1, 'b': report2}, micro_average=True)
    assert agg['avg'] == 4.0 / 7
    assert agg['sum'] == 7
    assert agg['fixed'] == 4
    assert agg['global_avg'] in (report1['global_avg'], report2['global_avg'])

    # task level metrics
    assert agg['a/avg'] == 3.0 / 4
    assert agg['a/sum'] == 3
    assert agg['a/fixed'] == 4
    assert 'a/global_avg' not in agg

    assert agg['b/avg'] == 1.0 / 3
    assert agg['b/sum'] == 4
    assert agg['b/fixed'] == 4
    assert 'b/global_avg' not in agg
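# Not part of the original test file: a minimal, self-contained sketch contrasting the
# micro- and macro-averaged 'avg' values asserted above, assuming micro-averaging pools
# the raw numerators/denominators across tasks while macro-averaging averages the
# per-task ratios. The helper name below is hypothetical.
def _illustrate_micro_vs_macro_average():
    counts = {'a': (3, 4), 'b': (1, 3)}  # task -> (numerator, denominator)
    micro = sum(n for n, _ in counts.values()) / sum(d for _, d in counts.values())
    macro = sum(n / d for n, d in counts.values()) / len(counts)
    assert abs(micro - 4.0 / 7) < 1e-12  # matches agg['avg'] with micro_average=True
    assert abs(macro - (3 / 4 + 1 / 3) / 2) < 1e-12  # what macro-averaging would give
    return micro, macro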
def _run_eval(
    self,
    valid_worlds,
    opt,
    datatype,
    max_exs=-1,
    write_log=False,
    extra_log_suffix="",
):
    """
    Eval on validation/test data.

    :param valid_worlds: list of the pre-created validation worlds.
    :param opt: the options that specify the task, eval_task, etc.
    :param datatype: the datatype to use, such as "valid" or "test"
    :param bool write_log: specifies to write metrics to file if the model_file is set
    :param int max_exs: limits the number of examples if max_exs > 0
    """
    logging.info(f'running eval: {datatype}')
    timer = Timer()
    reports = []

    max_exs_per_worker = max_exs / (len(valid_worlds) * num_workers())
    for v_world in valid_worlds:
        task_report = self._run_single_eval(opt, v_world, max_exs_per_worker)
        reports.append(task_report)

    tasks = [world.getID() for world in valid_worlds]
    named_reports = dict(zip(tasks, reports))
    report = aggregate_named_reports(
        named_reports, micro_average=self.opt.get('aggregate_micro', False)
    )
    # get the results from all workers
    report = self._sync_metrics(report)

    metrics = f'{datatype}:\n{nice_report(report)}\n'
    logging.info(f'eval completed in {timer.time():.2f}s')
    logging.report(metrics)

    # write to file
    if write_log and opt.get('model_file') and is_primary_worker():
        # Write out metrics
        with PathManager.open(
            opt['model_file'] + extra_log_suffix + '.' + datatype, 'a'
        ) as f:
            f.write(f'{metrics}\n')

    return report
def report(self):
    """
    Report aggregate metrics across all subworlds.
    """
    metrics = aggregate_named_reports(
        {w.getID(): w.report() for w in self.worlds},
        micro_average=self.opt.get('aggregate_micro', False),
    )
    if 'exs' in metrics:
        self.total_exs += metrics['exs'].value()
    return metrics
def _run_eval(self, valid_worlds, opt, datatype, max_exs=-1, write_log=False):
    """
    Eval on validation/test data.

    :param valid_worlds: list of the pre-created validation worlds.
    :param opt: the options that specify the task, eval_task, etc.
    :param datatype: the datatype to use, such as "valid" or "test"
    :param bool write_log: specifies to write metrics to file if the model_file is set
    :param int max_exs: limits the number of examples if max_exs > 0
    """
    print('[ running eval: ' + datatype + ' ]')
    timer = Timer()
    reports = []

    max_exs_per_worker = max_exs / (len(valid_worlds) * num_workers())
    for v_world in valid_worlds:
        task_report = self._run_single_eval(opt, v_world, max_exs_per_worker)
        reports.append(task_report)

    tasks = [world.getID() for world in valid_worlds]
    named_reports = dict(zip(tasks, reports))
    report = aggregate_named_reports(named_reports)
    # get the results from all workers
    report = self._sync_metrics(report)

    metrics = f'{datatype}:{nice_report(report)}'
    print(f'[ eval completed in {timer.time():.2f}s ]')
    print(metrics)

    # write to file
    if write_log and opt.get('model_file') and is_primary_worker():
        # Write out metrics
        with open(opt['model_file'] + '.' + datatype, 'a+') as f:
            f.write(f'{metrics}\n')

    return report
def eval_model(opt, print_parser=None):
    """
    Evaluates a model.

    :param opt: tells the evaluation function how to run
    :param bool print_parser: if provided, prints the options that are set within the
        model after loading the model
    :return: the final result of calling report()
    """
    random.seed(42)
    if 'train' in opt['datatype'] and 'evalmode' not in opt['datatype']:
        raise ValueError(
            'You should use --datatype train:evalmode if you want to evaluate on '
            'the training set.'
        )
    if opt['save_world_logs'] and not opt['report_filename']:
        raise RuntimeError(
            'In order to save model replies, please specify the save path '
            'with --report-filename'
        )

    # load model and possibly print opt
    agent = create_agent(opt, requireModelExists=True)
    if print_parser:
        # show args after loading model
        print_parser.opt = agent.opt
        print_parser.print_args()

    tasks = opt['task'].split(',')
    reports = []
    for task in tasks:
        task_report = _eval_single_world(opt, agent, task)
        reports.append(task_report)

    report = aggregate_named_reports(
        dict(zip(tasks, reports)), micro_average=opt.get('aggregate_micro', False)
    )

    # print announcements and report
    print_announcements(opt)
    print(
        '[ Finished evaluating tasks {} using datatype {} ]'.format(
            tasks, opt.get('datatype', 'N/A')
        )
    )
    print(nice_report(report))
    _save_eval_stats(opt, report)
    return report
def run_eval(valid_worlds, opt, datatype, max_exs=-1, write_log=False):
    """
    Eval on validation/test data.

    :param valid_worlds: list of the pre-created validation worlds.
    :param opt: the options that specify the task, eval_task, etc.
    :param datatype: the datatype to use, such as "valid" or "test"
    :param bool write_log: specifies to write metrics to file if the model_file is set
    :param int max_exs: limits the number of examples if max_exs > 0
    """
    if valid_worlds is None:
        # This isn't the primary worker, so we can just skip evaluation
        return sync_object(None)

    print('[ running eval: ' + datatype + ' ]')
    timer = Timer()
    reports = []
    for v_world in valid_worlds:
        task_report = _run_single_eval(opt, v_world, max_exs / len(valid_worlds))
        reports.append(task_report)

    tasks = [world.getID() for world in valid_worlds]
    named_reports = dict(zip(tasks, reports))
    report = aggregate_named_reports(named_reports)

    metrics = f'{datatype}:{nice_report(report)}'
    print(f'[ eval completed in {timer.time():.2f}s ]')
    print(metrics)

    # write to file
    if write_log and opt.get('model_file'):
        # Write out metrics
        with open(opt['model_file'] + '.' + datatype, 'a+') as f:
            f.write(f'{metrics}\n')

    return sync_object(report)
def eval_model(opt):
    """
    Evaluates a model.

    :param opt: tells the evaluation function how to run
    :return: the final result of calling report()
    """
    random.seed(42)
    if 'train' in opt['datatype'] and 'evalmode' not in opt['datatype']:
        raise ValueError(
            'You should use --datatype train:evalmode if you want to evaluate on '
            'the training set.'
        )
    if opt['save_world_logs'] and not opt['report_filename']:
        raise RuntimeError(
            'In order to save model replies, please specify the save path '
            'with --report-filename'
        )

    # load model and possibly print opt
    agent = create_agent(opt, requireModelExists=True)
    agent.opt.log()

    tasks = opt['task'].split(',')
    reports = []
    for task in tasks:
        task_report = _eval_single_world(opt, agent, task)
        reports.append(task_report)

    report = aggregate_named_reports(
        dict(zip(tasks, reports)), micro_average=opt.get('aggregate_micro', False)
    )

    # print announcements and report
    print_announcements(opt)
    logging.info(
        f'Finished evaluating tasks {tasks} using datatype {opt.get("datatype")}'
    )
    print(nice_report(report))
    _save_eval_stats(opt, report)
    return report
def test_classifier_metrics(self):
    # We assume a batch of 16 samples, binary classification case, from 2 tasks.
    # task 1
    # confusion matrix expected, for class ok,
    # TP = 2, TN = 2, FP = 2, FN = 2
    report1 = {}
    report2 = {}
    task1_f1s = {}
    task2_f1s = {}
    classes = ['class_ok', 'class_notok']
    task1_predictions = [
        'class_ok', 'class_ok', 'class_ok', 'class_ok',
        'class_notok', 'class_notok', 'class_notok', 'class_notok',
    ]
    task1_gold_labels = [
        'class_ok', 'class_ok', 'class_notok', 'class_notok',
        'class_ok', 'class_ok', 'class_notok', 'class_notok',
    ]
    for each in classes:
        precisions, recalls, f1s = ConfusionMatrixMetric.compute_metrics(
            task1_predictions, task1_gold_labels, each
        )
        report1.update(
            {
                f'{each}_precision': sum(precisions, None),
                f'{each}_recall': sum(recalls, None),
                f'{each}_f1': sum(f1s, None),
            }
        )
        task1_f1s[each] = f1s
    report1['weighted_f1'] = sum(WeightedF1Metric.compute_many(task1_f1s), None)

    # task 2, for class ok
    # TP = 3, TN = 2, FP = 2, FN = 1
    # for class not ok
    # TP = 2, TN = 3, FP = 1, FN = 2
    task2_predictions = [
        'class_ok', 'class_ok', 'class_ok', 'class_ok',
        'class_ok', 'class_notok', 'class_notok', 'class_notok',
    ]
    task2_gold_labels = [
        'class_ok', 'class_ok', 'class_notok', 'class_notok',
        'class_ok', 'class_ok', 'class_notok', 'class_notok',
    ]
    for each in classes:
        precisions, recalls, f1s = ConfusionMatrixMetric.compute_metrics(
            task2_predictions, task2_gold_labels, each
        )
        report2.update(
            {
                f'{each}_precision': sum(precisions, None),
                f'{each}_recall': sum(recalls, None),
                f'{each}_f1': sum(f1s, None),
            }
        )
        task2_f1s[each] = f1s
    report2['weighted_f1'] = sum(WeightedF1Metric.compute_many(task2_f1s), None)

    agg = aggregate_named_reports(
        {'task1': report1, 'task2': report2}, micro_average=False
    )
    # task1
    assert agg['task1/class_ok_precision'] == 0.5
    assert agg['task1/class_ok_recall'] == 0.5
    assert agg['task1/class_ok_f1'] == 0.5
    # task2
    assert agg['task2/class_ok_precision'] == 3 / 5
    assert agg['task2/class_ok_recall'] == 3 / 4
    assert agg['task2/class_ok_f1'] == 2 / 3
    # task2 not ok
    assert agg['task2/class_notok_precision'] == 2 / 3
    assert agg['task2/class_notok_recall'] == 0.5
    assert agg['task2/class_notok_f1'] == 4 / 7
    # weighted f1
    assert agg['task1/weighted_f1'] == 0.5
    assert agg['task2/weighted_f1'] == (2 / 3) * 0.5 + (4 / 7) * 0.5
    # all
    assert agg['weighted_f1'] == (0.5 + (2 / 3) * 0.5 + (4 / 7) * 0.5) / 2
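# Not part of the original test file: a minimal, self-contained sketch of the
# confusion-matrix arithmetic behind the asserts above, assuming the standard
# definitions precision = TP / (TP + FP), recall = TP / (TP + FN), and
# F1 = 2 * P * R / (P + R). The helper name below is hypothetical.
def _prf1_from_counts(tp, fp, fn):
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1

# task2, class_ok    (TP=3, FP=2, FN=1) -> precision 3/5, recall 3/4, F1 2/3
# task2, class_notok (TP=2, FP=1, FN=2) -> precision 2/3, recall 1/2, F1 4/7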