import os
from collections import defaultdict
from typing import List, Tuple

import jsons

# Project-specific names used below (EvalInstance, Metric, Metrics, MetricsDict,
# Params, DatasetReader, Token, JsonlWriter, get_initial_micro_list, and
# load_metrics) are assumed to be imported from elsewhere in the repository.


def evaluate_instances(instances: List[EvalInstance],
                       metrics: List[Metric]) -> Tuple[MetricsDict, List[Metrics]]:
    macro = MetricsDict()
    micro_list = get_initial_micro_list(instances)

    for metric in metrics:
        # Prepare the input arguments
        summary_args = []
        for field in metric.required_summary_fields:
            summary_args.append([instance.fields[field].to_input() for instance in instances])

        context_args = []
        for field in metric.required_context_fields:
            context_args.append([instance.fields[field].to_input() for instance in instances])

        # Score all the summaries
        this_macro, this_micro_list = metric.evaluate(*summary_args, *context_args)

        # Update the global metrics dictionaries
        macro.update(this_macro)
        for micro, this_micro in zip(micro_list, this_micro_list):
            micro.metrics.update(this_micro)

    return macro, micro_list
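
# Usage sketch (not from the original source; `instances` and `metrics` are
# placeholders for objects produced by a dataset reader and a metric loader,
# as in run() below):
#
#   macro, micro_list = evaluate_instances(instances, metrics)
#   macro                   # one MetricsDict aggregated over all instances
#   micro_list[i].metrics   # per-instance MetricsDict for instances[i]
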
def _combine_metrics(self, recall_metrics: MetricsDict,
                     precision_metrics: MetricsDict) -> MetricsDict:
    combined = MetricsDict()
    combined.update(recall_metrics)
    combined.update(precision_metrics)
    for key in combined.keys():
        # Compute an F1 score wherever both a precision and a recall are present
        if 'precision' in combined[key] and 'recall' in combined[key]:
            precision = combined[key]['precision']
            recall = combined[key]['recall']
            f1 = 0.0
            if precision + recall != 0.0:
                f1 = 2 * (precision * recall) / (precision + recall)
            combined[key]['f1'] = f1
    return combined
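
# A minimal sketch of the F1 combination above, with hypothetical inputs.
# `scorer` stands for an instance of whatever class defines _combine_metrics,
# and 'rouge-1' is an arbitrary metric name; the nested merge relies on
# MetricsDict.update behaving as shown in test_update below.
#
#   recall_metrics = MetricsDict({'rouge-1': {'recall': 50.0}})
#   precision_metrics = MetricsDict({'rouge-1': {'precision': 25.0}})
#   combined = scorer._combine_metrics(recall_metrics, precision_metrics)
#   combined['rouge-1']['f1']  # 2 * (25.0 * 50.0) / (25.0 + 50.0) = 33.33...
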
def run(self, args):
    params = Params.from_file(args.config, args.overrides)
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))
    metrics = load_metrics(params)

    instances = dataset_reader.read()
    summaries = [instance.summary for instance in instances]

    macro = MetricsDict()
    micro_list = get_initial_micro_list(instances)

    for metric in metrics:
        # Prepare the extra input arguments
        eval_args = []
        for field in metric.required_fields:
            eval_args.append([instance.fields[field] for instance in instances])

        # Score all the summaries
        this_macro, this_micro_list = metric.evaluate(summaries, *eval_args)

        # Update the global metrics dictionaries
        macro.update(this_macro)
        for micro, this_micro in zip(micro_list, this_micro_list):
            micro.metrics.update(this_micro)

    dirname = os.path.dirname(args.macro_output_json)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    serialized_macro = jsons.dumps({'metrics': macro}, jdkwargs={'indent': 2})
    with open(args.macro_output_json, 'w') as out:
        out.write(serialized_macro)
    if not args.silent:
        print(serialized_macro)

    with JsonlWriter(args.micro_output_jsonl) as out:
        for metrics_dict in micro_list:
            out.write(metrics_dict)
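
# Sketch of the two artifacts run() writes (structure read off the code above;
# the metric name and values are hypothetical):
#
#   args.macro_output_json:   {"metrics": {"rouge-1": {"recall": 50.0, ...}}}
#   args.micro_output_jsonl:  one JSON-serialized Metrics object per line,
#                             one line per input instance
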
def _run_metric(self,
                summary_tokens: List[Token],
                reference_tokens_list: List[List[Token]],
                matches_list: List[List[Tuple[int, int, float]]],
                token_weights_list: List[List[float]],
                metric: str):
    total_weight = 0
    total_normalization_weight = 0
    total_matches = 0
    content_type_to_total_matches = defaultdict(float)
    matcher_metrics = [[] for _ in self.matchers]

    for reference_tokens, matches, weights in zip(reference_tokens_list, matches_list, token_weights_list):
        total_weight += self.backend.get_total_weight(matches)
        total_normalization_weight += sum(weights)

        all_matches = []
        content_type_to_matches = defaultdict(list)
        for i, matcher in enumerate(self.matchers):
            category_matches, metrics = matcher.select_matches(
                summary_tokens, reference_tokens, matches, weights, metric, self.backend)
            content_type_to_matches[matcher.content_type].extend(category_matches)
            all_matches.extend(category_matches)
            matcher_metrics[i].append(metrics)

        total_matches += self.backend.get_total_weight(all_matches)
        for content_type, content_matches in content_type_to_matches.items():
            content_type_to_total_matches[content_type] += self.backend.get_total_weight(content_matches)

    # Compute the aggregated metrics for each matcher
    metrics = MetricsDict()
    for matcher, metrics_list in zip(self.matchers, matcher_metrics):
        metrics.update(matcher.finalize(metrics_list, total_weight, metric))

    # Add the standard rouge score
    measure = 0.0
    if total_normalization_weight != 0.0:
        measure = total_weight / total_normalization_weight * 100
    metrics[self.name] = {
        f'{metric}_total_weight': total_weight,
        f'{metric}_total_norm_weight': total_normalization_weight,
        metric: measure
    }

    # Compute the metric for just the edges that the categories selected
    measure = 0.0
    if total_normalization_weight != 0.0:
        measure = total_matches / total_normalization_weight * 100
    coverage = 0.0
    if total_weight != 0.0:
        coverage = total_matches / total_weight * 100

    # Compute the coverage for each content type
    for content_type, content_total_matches in content_type_to_total_matches.items():
        content_coverage = 0.0
        if total_weight != 0.0:
            content_coverage = content_total_matches / total_weight * 100
        metrics[f'{metric}_content-coverages'][content_type] = content_coverage

    metrics[f'interpretable-{self.name}'] = {
        f'{metric}_total_weight': total_matches,
        f'{metric}_total_norm_weight': total_normalization_weight,
        metric: measure,
        f'{metric}_coverage': coverage
    }
    return metrics
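
# Worked example of the arithmetic above, with hypothetical numbers: if the
# matches carry total_weight = 8.0, the matchers select total_matches = 6.0 of
# that weight, and total_normalization_weight = 20.0, then
#
#   standard score:       8.0 / 20.0 * 100 = 40.0
#   interpretable score:  6.0 / 20.0 * 100 = 30.0
#   coverage:             6.0 /  8.0 * 100 = 75.0
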
def test_update(self):
    m1 = MetricsDict({'k1': 1, 'k2': {'k3': [1, 2, 3]}})
    m2 = MetricsDict({'k4': 4, 'k2': {'k3': 5, 'k5': 8}})
    m1.update(m2)
    assert m1 == {'k1': 1, 'k2': {'k3': 5, 'k5': 8}, 'k4': 4}