def test_convert_to_matrices(self):
    # Metrics(instance_id, summarizer_id, summarizer_type, values). The
    # resulting matrices have one row per summarizer and one column per
    # instance; missing (summarizer, instance) entries are filled with NaN.
    metrics_list = [
        Metrics('1', 'A', 'peer', {'m1': 1, 'm2': 2, 'm3': 3}),
        Metrics('2', 'A', 'peer', {'m1': 4, 'm2': 5}),
        Metrics('1', 'B', 'peer', {'m1': 6, 'm2': 7, 'm3': 8}),
        Metrics('2', 'B', 'peer', {'m1': 9, 'm2': 10, 'm3': 11}),
    ]

    m1 = convert_to_matrices(metrics_list, 'm1')
    np.testing.assert_array_equal(m1, [[1, 4], [6, 9]])

    m1, m2 = convert_to_matrices(metrics_list, 'm1', 'm2')
    np.testing.assert_array_equal(m1, [[1, 4], [6, 9]])
    np.testing.assert_array_equal(m2, [[2, 5], [7, 10]])

    # 'm3' was not scored for summarizer 'A' on instance '2', so that entry is NaN
    m3 = convert_to_matrices(metrics_list, 'm3')
    np.testing.assert_array_equal(m3, [[3, np.nan], [8, 11]])

    # Summarizers do not need to be scored on identical instance sets; the
    # union of instances defines the columns
    metrics_list = [
        Metrics('1', 'A', 'peer', {'m1': 1, 'm2': 2}),
        Metrics('2', 'A', 'peer', {'m1': 4, 'm2': 5}),
        Metrics('1', 'B', 'peer', {'m1': 6, 'm2': 7}),
        Metrics('3', 'B', 'peer', {'m1': 2, 'm2': 9}),
    ]

    m1 = convert_to_matrices(metrics_list, 'm1')
    np.testing.assert_array_equal(m1, [[1, 4, np.nan], [6, np.nan, 2]])
def _run_simulation(summaries_file: str,
                    metrics_file: str,
                    corr_func,
                    proportion: float,
                    method: str,
                    ground_truth: str,
                    rouge_variant: str,
                    alpha: float,
                    random_seed: int) -> int:
    random.seed(random_seed)

    # Load the summaries and their pre-computed metrics, then add scores from
    # the ablated ROUGE variant and merge everything into one list
    with open(summaries_file, 'r') as f:
        instances = json.load(f)
    metrics_list = JsonlReader(metrics_file, Metrics).read()
    metrics_list.extend(
        score_instances_with_ablated_rouge(instances, proportion, rouge_variant))
    metrics_list = merge_metrics(metrics_list)

    # Compare ROUGE-1 and the ablated ROUGE against the ground-truth metric;
    # return 1 if the null hypothesis is rejected at level alpha, otherwise 0
    X, Y, Z = convert_to_matrices(metrics_list, 'ROUGE-1', 'ablated_rouge', ground_truth)
    pvalue = corr_diff_test(corr_func, X, Y, Z, method, False)
    if pvalue <= alpha:
        return 1
    return 0
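# Usage sketch (not from the original source): one simulation trial. The file
# paths, ablation proportion, ground-truth metric name, and ROUGE variant
# below are illustrative assumptions; corr_func pairs a correlation level
# (e.g. system_level_corr) with a coefficient (e.g. pearsonr), mirroring how
# it is built in the driver script.
def _example_run_simulation():
    corr_func = functools.partial(system_level_corr, pearsonr)
    rejected = _run_simulation(
        summaries_file='summaries.json',   # assumed path
        metrics_file='metrics.jsonl',      # assumed path
        corr_func=corr_func,
        proportion=0.5,                    # value passed to the ablated ROUGE scorer (assumed)
        method='bootstrap-both',
        ground_truth='responsiveness',     # assumed ground-truth metric name
        rouge_variant='ROUGE-1',           # assumed
        alpha=0.05,
        random_seed=4)
    return rejected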
def main(args):
    # Load, merge, and filter the metrics for peer summarizers, keep only the
    # metrics being compared, and average any list-valued scores
    metrics_list = load_metrics(args.metrics_jsonls)
    metrics_list = merge_metrics(metrics_list)

    for metrics in metrics_list:
        metrics.flatten_keys()

    metrics_list = filter_metrics(metrics_list, 'peer', *args.metrics)
    for metrics in metrics_list:
        metrics.select_metrics(args.metrics)
        metrics.average_values()

    X, Y = convert_to_matrices(metrics_list, *args.metrics)

    num_iterations = 1000
    alpha = 0.05
    seed = 10

    # For every correlation coefficient, correlation level, and test method,
    # run the simulation num_iterations times in parallel and record the
    # proportion of each outcome
    results_dict = defaultdict(lambda: defaultdict(dict))
    for coef_name, coef_func in zip(['pearson', 'spearman', 'kendall'],
                                    [pearsonr, spearmanr, kendalltau]):
        for level_name, level in zip(['system_level', 'summary_level'],
                                     [system_level_corr, summary_level_corr]):
            corr_func = functools.partial(level, coef_func)
            for method in ['bootstrap-system', 'bootstrap-input', 'bootstrap-both', 'fisher']:
                results = Parallel(n_jobs=args.num_processes)(
                    delayed(_run_simulation)(X, Y, corr_func, method, alpha, seed + i)
                    for i in range(num_iterations))
                counts = Counter(results)
                proportions = {key: value / len(results) for key, value in counts.items()}
                results_dict[level_name][coef_name][method] = proportions
                print(level_name, coef_name, method, proportions)

    os.makedirs(os.path.dirname(args.output_json), exist_ok=True)
    with open(args.output_json, 'w') as out:
        out.write(json.dumps(results_dict, indent=2))
def run_hypothesis_tests(metrics_jsonl_files_or_metrics_list: Union[str, List[str], List[Metrics]],
                         dependent_metric: str,
                         metric_A: str,
                         metric_B: str,
                         summarizer_type: str,
                         test_method: str = 'permutation-both',
                         alpha: float = 0.05,
                         two_tailed: bool = True,
                         skip_summary_level: bool = False,
                         skip_system_level: bool = False,
                         skip_global: bool = False) -> Dict:
    if isinstance(metrics_jsonl_files_or_metrics_list, str):
        # A single file
        metrics_list = load_metrics([metrics_jsonl_files_or_metrics_list])
    elif isinstance(metrics_jsonl_files_or_metrics_list, list) and all(
            isinstance(item, str) for item in metrics_jsonl_files_or_metrics_list):
        # A list of files
        metrics_list = load_metrics(metrics_jsonl_files_or_metrics_list)
    else:
        # A list of metrics
        assert isinstance(metrics_jsonl_files_or_metrics_list, list) and all(
            isinstance(item, Metrics) for item in metrics_jsonl_files_or_metrics_list)
        metrics_list = metrics_jsonl_files_or_metrics_list

    # Merge duplicate metrics objects into one
    metrics_list = merge_metrics(metrics_list)

    for metrics in metrics_list:
        metrics.flatten_keys()

    metrics_list = filter_metrics(metrics_list, summarizer_type, dependent_metric, metric_A, metric_B)
    for metrics in metrics_list:
        metrics.select_metrics([dependent_metric, metric_A, metric_B])
        metrics.average_values()

    # Follow the math in the paper: the dependent metric is Z
    X, Y, Z = convert_to_matrices(metrics_list, metric_A, metric_B, dependent_metric)

    H0, H1 = _get_hypotheses(two_tailed, dependent_metric, metric_A, metric_B)
    results = {
        'dependent_metric': dependent_metric,
        'metric_A': metric_A,
        'metric_B': metric_B,
        'summarizer_type': summarizer_type,
        'test_method': test_method,
        'alpha': alpha,
        'two_tailed': two_tailed,
        'H0': H0,
        'H1': H1
    }
    if not skip_summary_level:
        results['summary_level'] = _run_test(summary_level_corr, X, Y, Z, test_method, alpha, two_tailed)
    if not skip_system_level:
        results['system_level'] = _run_test(system_level_corr, X, Y, Z, test_method, alpha, two_tailed)
    if not skip_global:
        results['global'] = _run_test(global_corr, X, Y, Z, test_method, alpha, two_tailed)
    return results
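# Usage sketch (not from the original source): a hypothetical call to
# run_hypothesis_tests. The file path and the metric names 'responsiveness',
# 'ROUGE-1', and 'ROUGE-2' are illustrative assumptions; any metrics present
# in the input can be used.
def _example_run_hypothesis_tests():
    results = run_hypothesis_tests(
        'metrics.jsonl',                 # one file, a list of files, or a list of Metrics objects
        dependent_metric='responsiveness',
        metric_A='ROUGE-1',
        metric_B='ROUGE-2',
        summarizer_type='peer',
        test_method='permutation-both',
        alpha=0.05,
        two_tailed=True)
    # Each non-skipped level holds the output of the corresponding test
    print(results['H0'], results['H1'])
    print(results['summary_level'])
    print(results['system_level'])
    print(results['global'])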
def compute_correlation(metrics_jsonl_files_or_metrics_list: Union[str, List[str], List[Metrics]],
                        metric1: str,
                        metric2: str,
                        summarizer_type: str,
                        skip_summary_level: bool = False,
                        skip_system_level: bool = False,
                        skip_global: bool = False,
                        system_level_output_plot: str = None,
                        global_output_plot: str = None,
                        ci_method: str = None,
                        alpha: float = 0.05,
                        two_tailed: bool = True,
                        ci_kwargs: Dict = None):
    if system_level_output_plot is not None:
        assert not skip_system_level, 'If `system_level_output_plot` is not `None`, system-level correlations must be calculated'
    if global_output_plot is not None:
        assert not skip_global, 'If `global_output_plot` is not `None`, global correlations must be calculated'

    ci_kwargs = ci_kwargs or {}
    summary_kwargs, system_kwargs, global_kwargs = _split_level_kwargs(ci_kwargs)

    if isinstance(metrics_jsonl_files_or_metrics_list, str):
        # A single file
        metrics_list = load_metrics([metrics_jsonl_files_or_metrics_list])
    elif isinstance(metrics_jsonl_files_or_metrics_list, list) and all(
            isinstance(item, str) for item in metrics_jsonl_files_or_metrics_list):
        # A list of files
        metrics_list = load_metrics(metrics_jsonl_files_or_metrics_list)
    else:
        # A list of metrics
        assert isinstance(metrics_jsonl_files_or_metrics_list, list) and all(
            isinstance(item, Metrics) for item in metrics_jsonl_files_or_metrics_list)
        metrics_list = metrics_jsonl_files_or_metrics_list

    # Merge duplicate metrics objects into one
    metrics_list = merge_metrics(metrics_list)

    for metrics in metrics_list:
        metrics.flatten_keys()

    metrics_list = filter_metrics(metrics_list, summarizer_type, metric1, metric2)
    for metrics in metrics_list:
        metrics.select_metrics([metric1, metric2])
        metrics.average_values()

    X, Y = convert_to_matrices(metrics_list, metric1, metric2)

    results = {
        'metric1': metric1,
        'metric2': metric2,
        'summarizer_type': summarizer_type
    }
    if not skip_summary_level:
        results['summary_level'] = compute_summary_level_correlations(
            X, Y, ci_method=ci_method, alpha=alpha, two_tailed=two_tailed, ci_kwargs=summary_kwargs)
    if not skip_system_level:
        results['system_level'] = compute_system_level_correlations(
            X, Y, ci_method=ci_method, alpha=alpha, two_tailed=two_tailed, ci_kwargs=system_kwargs)
        if system_level_output_plot is not None:
            _plot_system_level_metrics(metrics_list, metric1, metric2, system_level_output_plot)
    if not skip_global:
        results['global'] = compute_global_correlations(
            X, Y, ci_method=ci_method, alpha=alpha, two_tailed=two_tailed, ci_kwargs=global_kwargs)
        if global_output_plot is not None:
            _plot_global_metrics(metrics_list, metric1, metric2, global_output_plot)
    return results
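# Usage sketch (not from the original source): a hypothetical call to
# compute_correlation. The file paths, metric names, and output plot location
# are illustrative assumptions.
def _example_compute_correlation():
    results = compute_correlation(
        ['metrics1.jsonl', 'metrics2.jsonl'],   # one file, a list of files, or a list of Metrics objects
        metric1='ROUGE-1',
        metric2='ROUGE-2',
        summarizer_type='peer',
        system_level_output_plot='plots/system_level.pdf',  # assumed path; requires system-level correlations
        ci_method=None)                                     # pass a CI method name to also compute confidence intervals
    print(results['summary_level'])
    print(results['system_level'])
    print(results['global'])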