Example No. 1
    def test_uneven_macro_aggrevation(self):
        report1 = {
            'avg': AverageMetric(1, 1),
        }
        report2 = {
            'avg': AverageMetric(0, 1),
        }
        report3 = {
            'avg': AverageMetric(0, 1),
        }
        agg1 = aggregate_named_reports({
            'a': report1,
            'b': report2
        },
                                       micro_average=False)
        agg2 = aggregate_named_reports({
            'a': {},
            'c': report3
        },
                                       micro_average=False)

        agg = aggregate_unnamed_reports([agg1, agg2])
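        # with micro_average=False each named report is macro-averaged: every
        # task counts equally, so agg1['avg'] = (1.0 + 0.0) / 2 and agg2['avg']
        # only sees task 'c'. Merging agg1 and agg2 pools tasks a, b, and c,
        # giving (1.0 + 0.0 + 0.0) / 3.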
        assert agg1['avg'] == 0.5
        assert agg2['avg'] == 0.0
        assert agg['a/avg'] == 1.0
        assert agg['b/avg'] == 0.0
        assert agg['c/avg'] == 0.0
        assert agg['avg'] == 1.0 / 3
Example No. 2
 def test_micro_aggregation(self):
     report1 = {
         'avg': AverageMetric(3, 4),
         'sum': SumMetric(3),
         'fixed': FixedMetric(4),
         'global_avg': GlobalAverageMetric(3, 4),
     }
     report2 = {
         'avg': AverageMetric(1, 3),
         'sum': SumMetric(4),
         'fixed': FixedMetric(4),
         'global_avg': GlobalAverageMetric(1, 3),
     }
     agg = aggregate_named_reports({'a': report1, 'b': report2}, micro_average=True)
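     # micro_average=True pools the underlying counts rather than averaging the
     # per-task values: avg = (3 + 1) / (4 + 3), sum = 3 + 4, and FixedMetric
     # keeps its fixed value. GlobalAverageMetric is a global metric, so it is
     # not broken out under the per-task 'a/' and 'b/' keys.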
     assert agg['avg'] == 4.0 / 7
     assert agg['sum'] == 7
     assert agg['fixed'] == 4
     assert agg['global_avg'] in (report1['global_avg'], report2['global_avg'])
     # task level metrics
     assert agg['a/avg'] == 3.0 / 4
     assert agg['a/sum'] == 3
     assert agg['a/fixed'] == 4
     assert 'a/global_avg' not in agg
     assert agg['b/avg'] == 1.0 / 3
     assert agg['b/sum'] == 4
     assert agg['b/fixed'] == 4
     assert 'b/global_avg' not in agg
Example No. 3
    def _run_eval(
        self,
        valid_worlds,
        opt,
        datatype,
        max_exs=-1,
        write_log=False,
        extra_log_suffix="",
    ):
        """
        Eval on validation/test data.

        :param valid_worlds:
            list of the pre-created validation worlds.
        :param opt:
            the options that specify the task, eval_task, etc.
        :param datatype:
            the datatype to use, such as "valid" or "test"
        :param bool write_log:
            specifies to write metrics to file if the model_file is set
        :param int max_exs:
            limits the number of examples if max_exs > 0
        :param str extra_log_suffix:
            appended to the metrics file name, before the datatype extension
        """

        logging.info(f'running eval: {datatype}')
        timer = Timer()
        reports = []

        max_exs_per_worker = max_exs / (len(valid_worlds) * num_workers())
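        # max_exs_per_worker spreads the overall example budget evenly across
        # validation worlds and distributed workers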
        for v_world in valid_worlds:
            task_report = self._run_single_eval(opt, v_world,
                                                max_exs_per_worker)
            reports.append(task_report)

        tasks = [world.getID() for world in valid_worlds]
        named_reports = dict(zip(tasks, reports))
        report = aggregate_named_reports(named_reports,
                                         micro_average=self.opt.get(
                                             'aggregate_micro', False))
        # get the results from all workers
        report = self._sync_metrics(report)

        metrics = f'{datatype}:\n{nice_report(report)}\n'
        logging.info(f'eval completed in {timer.time():.2f}s')
        logging.report(metrics)

        # write to file
        if write_log and opt.get('model_file') and is_primary_worker():
            # Write out metrics
            with PathManager.open(
                    opt['model_file'] + extra_log_suffix + '.' + datatype,
                    'a') as f:
                f.write(f'{metrics}\n')

        return report
Example No. 4
 def report(self):
     """
     Report aggregate metrics across all subworlds.
     """
     metrics = aggregate_named_reports(
         {w.getID(): w.report() for w in self.worlds},
         micro_average=self.opt.get('aggregate_micro', False),
     )
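     # 'exs' counts examples evaluated; keep a running total across calls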
     if 'exs' in metrics:
         self.total_exs += metrics['exs'].value()
     return metrics
Example No. 5
    def _run_eval(self,
                  valid_worlds,
                  opt,
                  datatype,
                  max_exs=-1,
                  write_log=False):
        """
        Eval on validation/test data.

        :param valid_worlds:
            list of the pre-created validation worlds.
        :param opt:
            the options that specify the task, eval_task, etc.
        :param datatype:
            the datatype to use, such as "valid" or "test"
        :param bool write_log:
            specifies to write metrics to file if the model_file is set
        :param int max_exs:
            limits the number of examples if max_exs > 0
        """

        print('[ running eval: ' + datatype + ' ]')
        timer = Timer()
        reports = []

        max_exs_per_worker = max_exs / (len(valid_worlds) * num_workers())
        for v_world in valid_worlds:
            task_report = self._run_single_eval(opt, v_world,
                                                max_exs_per_worker)
            reports.append(task_report)

        tasks = [world.getID() for world in valid_worlds]
        named_reports = dict(zip(tasks, reports))
        report = aggregate_named_reports(named_reports)
        # get the results from all workers
        report = self._sync_metrics(report)

        metrics = f'{datatype}:{nice_report(report)}'
        print(f'[ eval completed in {timer.time():.2f}s ]')
        print(metrics)

        # write to file
        if write_log and opt.get('model_file') and is_primary_worker():
            # Write out metrics
            f = open(opt['model_file'] + '.' + datatype, 'a+')
            f.write(f'{metrics}\n')
            f.close()

        return report
Example No. 6
def eval_model(opt, print_parser=None):
    """
    Evaluates a model.

    :param opt: tells the evaluation function how to run
    :param bool print_parser: if provided, prints the options that are set within the
        model after loading the model
    :return: the final result of calling report()
    """
    random.seed(42)
    if 'train' in opt['datatype'] and 'evalmode' not in opt['datatype']:
        raise ValueError(
            'You should use --datatype train:evalmode if you want to evaluate on '
            'the training set.'
        )

    if opt['save_world_logs'] and not opt['report_filename']:
        raise RuntimeError(
            'In order to save model replies, please specify the save path '
            'with --report-filename'
        )

    # load model and possibly print opt
    agent = create_agent(opt, requireModelExists=True)
    if print_parser:
        # show args after loading model
        print_parser.opt = agent.opt
        print_parser.print_args()

    tasks = opt['task'].split(',')
    reports = []
    for task in tasks:
        task_report = _eval_single_world(opt, agent, task)
        reports.append(task_report)

    report = aggregate_named_reports(
        dict(zip(tasks, reports)), micro_average=opt.get('aggregate_micro', False)
    )
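    # merge the per-task reports into one: each task's metrics appear under a
    # 'task/' prefix, and top-level values are averaged across tasks (micro
    # averaging pools example counts instead of weighting tasks equally)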

    # print announcements and report
    print_announcements(opt)
    print(
        '[ Finished evaluating tasks {} using datatype {} ]'.format(
            tasks, opt.get('datatype', 'N/A')
        )
    )
    print(nice_report(report))
    _save_eval_stats(opt, report)
    return report
Example No. 7
def run_eval(valid_worlds, opt, datatype, max_exs=-1, write_log=False):
    """
    Eval on validation/test data.

    :param valid_worlds:
        list of the pre-created validation worlds.
    :param opt:
        the options that specify the task, eval_task, etc.
    :param datatype:
        the datatype to use, such as "valid" or "test"
    :param bool write_log:
        specifies to write metrics to file if the model_file is set
    :param int max_exs:
        limits the number of examples if max_exs > 0
    """
    if valid_worlds is None:
        # This isn't the primary worker, so we can just skip evaluation
        return sync_object(None)

    print('[ running eval: ' + datatype + ' ]')
    timer = Timer()
    reports = []
    for v_world in valid_worlds:
        task_report = _run_single_eval(opt, v_world,
                                       max_exs / len(valid_worlds))
        reports.append(task_report)

    tasks = [world.getID() for world in valid_worlds]
    named_reports = dict(zip(tasks, reports))
    report = aggregate_named_reports(named_reports)

    metrics = f'{datatype}:{nice_report(report)}'
    print(f'[ eval completed in {timer.time():.2f}s ]')
    print(metrics)

    # write to file
    if write_log and opt.get('model_file'):
        # Write out metrics
        f = open(opt['model_file'] + '.' + datatype, 'a+')
        f.write(f'{metrics}\n')
        f.close()

    return sync_object(report)
Example No. 8
def eval_model(opt):
    """
    Evaluates a model.

    :param opt: tells the evaluation function how to run
    :return: the final result of calling report()
    """
    random.seed(42)
    if 'train' in opt['datatype'] and 'evalmode' not in opt['datatype']:
        raise ValueError(
            'You should use --datatype train:evalmode if you want to evaluate on '
            'the training set.')

    if opt['save_world_logs'] and not opt['report_filename']:
        raise RuntimeError(
            'In order to save model replies, please specify the save path '
            'with --report-filename')

    # load model and possibly print opt
    agent = create_agent(opt, requireModelExists=True)
    agent.opt.log()

    tasks = opt['task'].split(',')
    reports = []
    for task in tasks:
        task_report = _eval_single_world(opt, agent, task)
        reports.append(task_report)

    report = aggregate_named_reports(dict(zip(tasks, reports)),
                                     micro_average=opt.get(
                                         'aggregate_micro', False))

    # print announcements and report
    print_announcements(opt)
    logging.info(
        f'Finished evaluating tasks {tasks} using datatype {opt.get("datatype")}'
    )

    print(nice_report(report))
    _save_eval_stats(opt, report)
    return report
Example No. 9
    def test_classifier_metrics(self):
        # We assume a batch of 16 samples, binary classification case, from 2 tasks.
        # task 1
        # confusion matrix expected, for class ok,
        # TP = 2, TN = 2, FP = 2, FN = 2
        report1 = {}
        report2 = {}
        task1_f1s = {}
        task2_f1s = {}
        classes = ['class_ok', 'class_notok']
        task1_predictions = [
            'class_ok',
            'class_ok',
            'class_ok',
            'class_ok',
            'class_notok',
            'class_notok',
            'class_notok',
            'class_notok',
        ]
        task1_gold_labels = [
            'class_ok',
            'class_ok',
            'class_notok',
            'class_notok',
            'class_ok',
            'class_ok',
            'class_notok',
            'class_notok',
        ]
        for each in classes:
            precisions, recalls, f1s = ConfusionMatrixMetric.compute_metrics(
                task1_predictions, task1_gold_labels, each)
            report1.update({
                f'{each}_precision': sum(precisions, None),
                f'{each}_recall': sum(recalls, None),
                f'{each}_f1': sum(f1s, None),
            })
            task1_f1s[each] = f1s
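        # WeightedF1Metric weights each class's F1 by its share of the gold
        # labels; both classes appear four times here, so each contributes with
        # weight 0.5 (see the weighted_f1 asserts below)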
        report1['weighted_f1'] = sum(WeightedF1Metric.compute_many(task1_f1s),
                                     None)
        # task 2, for class ok
        # TP = 3, TN = 2, FP = 2, FN = 1
        # for class not ok
        # TP = 2, TN = 3, FP = 1, FN = 2
        task2_predictions = [
            'class_ok',
            'class_ok',
            'class_ok',
            'class_ok',
            'class_ok',
            'class_notok',
            'class_notok',
            'class_notok',
        ]
        task2_gold_labels = [
            'class_ok',
            'class_ok',
            'class_notok',
            'class_notok',
            'class_ok',
            'class_ok',
            'class_notok',
            'class_notok',
        ]
        for each in classes:
            precisions, recalls, f1s = ConfusionMatrixMetric.compute_metrics(
                task2_predictions, task2_gold_labels, each)
            report2.update({
                f'{each}_precision': sum(precisions, None),
                f'{each}_recall': sum(recalls, None),
                f'{each}_f1': sum(f1s, None),
            })
            task2_f1s[each] = f1s
        report2['weighted_f1'] = sum(WeightedF1Metric.compute_many(task2_f1s),
                                     None)

        agg = aggregate_named_reports({
            'task1': report1,
            'task2': report2
        },
                                      micro_average=False)
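        # macro aggregation exposes each task's metrics under a 'task/' prefix;
        # the top-level weighted_f1 is the equal-weight mean of the two tasks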
        # task1
        assert agg['task1/class_ok_precision'] == 0.5
        assert agg['task1/class_ok_recall'] == 0.5
        assert agg['task1/class_ok_f1'] == 0.5
        # task2
        assert agg['task2/class_ok_precision'] == 3 / 5
        assert agg['task2/class_ok_recall'] == 3 / 4
        assert agg['task2/class_ok_f1'] == 2 / 3
        # task2 not ok
        assert agg['task2/class_notok_precision'] == 2 / 3
        assert agg['task2/class_notok_recall'] == 0.5
        assert agg['task2/class_notok_f1'] == 4 / 7
        # weighted f1
        assert agg['task1/weighted_f1'] == 0.5
        assert agg['task2/weighted_f1'] == (2 / 3) * 0.5 + (4 / 7) * 0.5
        # all
        assert agg['weighted_f1'] == (0.5 + (2 / 3) * 0.5 + (4 / 7) * 0.5) / 2