Example #1
    # Snippet from a unit test; `Metrics`, `convert_to_matrices`, and
    # `numpy as np` are assumed to be imported by the enclosing test module.
    def test_convert_to_matrices(self):
        metrics_list = [
            Metrics('1', 'A', 'peer', {
                'm1': 1,
                'm2': 2,
                'm3': 3
            }),
            Metrics('2', 'A', 'peer', {
                'm1': 4,
                'm2': 5
            }),
            Metrics('1', 'B', 'peer', {
                'm1': 6,
                'm2': 7,
                'm3': 8
            }),
            Metrics('2', 'B', 'peer', {
                'm1': 9,
                'm2': 10,
                'm3': 11
            }),
        ]
        # A single metric name returns one matrix: rows are systems ('A', 'B'),
        # columns are input instances ('1', '2')
        m1 = convert_to_matrices(metrics_list, 'm1')
        np.testing.assert_array_equal(m1, [[1, 4], [6, 9]])

        m1, m2 = convert_to_matrices(metrics_list, 'm1', 'm2')
        np.testing.assert_array_equal(m1, [[1, 4], [6, 9]])
        np.testing.assert_array_equal(m2, [[2, 5], [7, 10]])

        # A (system, instance) pair without a score is filled with NaN
        m3 = convert_to_matrices(metrics_list, 'm3')
        np.testing.assert_array_equal(m3, [[3, np.nan], [8, 11]])

        # Systems scored on different instance sets: the columns are the
        # union of all instance ids
        metrics_list = [
            Metrics('1', 'A', 'peer', {
                'm1': 1,
                'm2': 2
            }),
            Metrics('2', 'A', 'peer', {
                'm1': 4,
                'm2': 5
            }),
            Metrics('1', 'B', 'peer', {
                'm1': 6,
                'm2': 7
            }),
            Metrics('3', 'B', 'peer', {
                'm1': 2,
                'm2': 9
            }),
        ]
        m1 = convert_to_matrices(metrics_list, 'm1')
        np.testing.assert_array_equal(m1, [[1, 4, np.nan], [6, np.nan, 2]])
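The assertions above pin down the contract of convert_to_matrices: each row is one system, each column is one input instance, and a (system, instance) pair without a score becomes np.nan. A minimal sketch of a function with this behavior follows; the instance_id, summarizer_id, and metrics attribute names are assumptions about the Metrics class, not taken from the original project.

import numpy as np

def convert_to_matrices_sketch(metrics_list, *metric_names):
    # Rows are systems, columns are inputs; order follows first appearance
    systems = list(dict.fromkeys(m.summarizer_id for m in metrics_list))
    instances = list(dict.fromkeys(m.instance_id for m in metrics_list))
    matrices = []
    for name in metric_names:
        matrix = np.full((len(systems), len(instances)), np.nan)
        for m in metrics_list:
            if name in m.metrics:  # missing scores stay NaN
                row = systems.index(m.summarizer_id)
                col = instances.index(m.instance_id)
                matrix[row, col] = m.metrics[name]
        matrices.append(matrix)
    # One name returns a single matrix; several names return a tuple
    return matrices[0] if len(matrices) == 1 else tuple(matrices)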
Example #2
def _run_simulation(summaries_file: str, metrics_file: str, corr_func,
                    proportion: float, method: str, ground_truth: str,
                    rouge_variant: str, alpha: float, random_seed: int) -> int:
    random.seed(random_seed)
    with open(summaries_file, 'r') as f:
        instances = json.load(f)
    metrics_list = JsonlReader(metrics_file, Metrics).read()

    # Score the instances with an ablated ROUGE variant, then merge
    # duplicate entries into single Metrics objects
    metrics_list.extend(
        score_instances_with_ablated_rouge(instances, proportion,
                                           rouge_variant))
    metrics_list = merge_metrics(metrics_list)

    # Matrices for the two automatic metrics and the ground-truth judgments
    X, Y, Z = convert_to_matrices(metrics_list, 'ROUGE-1', 'ablated_rouge',
                                  ground_truth)

    # Return 1 if the difference in correlations is significant at level alpha
    pvalue = corr_diff_test(corr_func, X, Y, Z, method, False)
    return int(pvalue <= alpha)
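A hypothetical driver for this helper; the file names, parameter values, and corr_func below are placeholders rather than values from the original project. Averaging the returned 0/1 flags over many seeds estimates how often the test rejects at level alpha.

rejections = [
    _run_simulation('summaries.json', 'metrics.jsonl', corr_func,
                    proportion=0.5, method='fisher',
                    ground_truth='responsiveness', rouge_variant='rouge-1',
                    alpha=0.05, random_seed=seed)
    for seed in range(100)
]
print('rejection rate:', sum(rejections) / len(rejections))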
Example #3
def main(args):
    # Load every metrics file and merge duplicate (instance, system) entries
    metrics_list = load_metrics(args.metrics_jsonls)
    metrics_list = merge_metrics(metrics_list)

    for metrics in metrics_list:
        metrics.flatten_keys()

    # Restrict to 'peer' systems and the requested metrics
    metrics_list = filter_metrics(metrics_list, 'peer', *args.metrics)
    for metrics in metrics_list:
        metrics.select_metrics(args.metrics)
        metrics.average_values()
    X, Y = convert_to_matrices(metrics_list, *args.metrics)

    num_iterations = 1000
    alpha = 0.05
    seed = 10

    results_dict = defaultdict(lambda: defaultdict(dict))
    for coef_name, coef_func in zip(['pearson', 'spearman', 'kendall'],
                                    [pearsonr, spearmanr, kendalltau]):
        for level_name, level in zip(['system_level', 'summary_level'],
                                     [system_level_corr, summary_level_corr]):
            corr_func = functools.partial(level, coef_func)
            for method in [
                    'bootstrap-system', 'bootstrap-input', 'bootstrap-both',
                    'fisher'
            ]:
                results = Parallel(n_jobs=args.num_processes)(
                    delayed(_run_simulation)(X, Y, corr_func, method, alpha,
                                             seed + i)
                    for i in range(num_iterations))
                counts = Counter(results)
                proportions = {
                    key: value / len(results)
                    for key, value in counts.items()
                }
                results_dict[level_name][coef_name][method] = proportions
                print(level_name, coef_name, method, proportions)

    os.makedirs(os.path.dirname(args.output_json), exist_ok=True)
    with open(args.output_json, 'w') as out:
        out.write(json.dumps(results_dict, indent=2))
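main() depends on a _run_simulation helper that is not part of this snippet, and its call signature differs from the one in Example #2. The stand-in below is only a sketch consistent with the call site: an inline percentile bootstrap that returns 1 when the (1 - alpha) interval for corr_func(X, Y) excludes zero (the 'fisher' method from the loop above is analytic and is omitted here).

import numpy as np

def _run_simulation(X, Y, corr_func, method, alpha, seed, n_resamples=1000):
    rng = np.random.RandomState(seed)
    N, M = X.shape
    samples = []
    for _ in range(n_resamples):
        # `method` decides whether systems (rows), inputs (columns), or
        # both axes are resampled with replacement
        rows = rng.choice(N, N) if method in ('bootstrap-system', 'bootstrap-both') else np.arange(N)
        cols = rng.choice(M, M) if method in ('bootstrap-input', 'bootstrap-both') else np.arange(M)
        samples.append(corr_func(X[rows][:, cols], Y[rows][:, cols]))
    lower = np.percentile(samples, 100 * alpha / 2)
    upper = np.percentile(samples, 100 * (1 - alpha / 2))
    # Reject (return 1) when zero lies outside the interval
    return int(lower > 0 or upper < 0)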
Example #4
def run_hypothesis_tests(
        metrics_jsonl_files_or_metrics_list: Union[str, List[str],
                                                   List[Metrics]],
        dependent_metric: str,
        metric_A: str,
        metric_B: str,
        summarizer_type: str,
        test_method: str = 'permutation-both',
        alpha: float = 0.05,
        two_tailed: bool = True,
        skip_summary_level: bool = False,
        skip_system_level: bool = False,
        skip_global: bool = False) -> Dict:
    if isinstance(metrics_jsonl_files_or_metrics_list, str):
        # A single file
        metrics_list = load_metrics([metrics_jsonl_files_or_metrics_list])
    elif isinstance(metrics_jsonl_files_or_metrics_list, list) and all(
            isinstance(item, str)
            for item in metrics_jsonl_files_or_metrics_list):
        # A list of files
        metrics_list = load_metrics(metrics_jsonl_files_or_metrics_list)
    else:
        # A list of metrics
        assert isinstance(metrics_jsonl_files_or_metrics_list, list) and all(
            isinstance(item, Metrics)
            for item in metrics_jsonl_files_or_metrics_list)
        metrics_list = metrics_jsonl_files_or_metrics_list

    # Merge duplicate metrics objects into one
    metrics_list = merge_metrics(metrics_list)

    for metrics in metrics_list:
        metrics.flatten_keys()

    metrics_list = filter_metrics(metrics_list, summarizer_type,
                                  dependent_metric, metric_A, metric_B)
    for metrics in metrics_list:
        metrics.select_metrics([dependent_metric, metric_A, metric_B])
        metrics.average_values()

    # Follow the math in the paper: the dependent metric is Z
    X, Y, Z = convert_to_matrices(metrics_list, metric_A, metric_B,
                                  dependent_metric)

    H0, H1 = _get_hypotheses(two_tailed, dependent_metric, metric_A, metric_B)
    results = {
        'dependent_metric': dependent_metric,
        'metric_A': metric_A,
        'metric_B': metric_B,
        'summarizer_type': summarizer_type,
        'test_method': test_method,
        'alpha': alpha,
        'two_tailed': two_tailed,
        'H0': H0,
        'H1': H1
    }
    if not skip_summary_level:
        results['summary_level'] = _run_test(summary_level_corr, X, Y, Z,
                                             test_method, alpha, two_tailed)

    if not skip_system_level:
        results['system_level'] = _run_test(system_level_corr, X, Y, Z,
                                            test_method, alpha, two_tailed)

    if not skip_global:
        results['global'] = _run_test(global_corr, X, Y, Z, test_method, alpha,
                                      two_tailed)

    return results
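A hypothetical invocation; the file path and metric names are placeholders:

results = run_hypothesis_tests('metrics.jsonl',
                               dependent_metric='responsiveness',
                               metric_A='rouge-1_f1',
                               metric_B='rouge-2_f1',
                               summarizer_type='peer')
print(results['H0'], results['H1'])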
Example #5
def compute_correlation(
        metrics_jsonl_files_or_metrics_list: Union[str, List[str],
                                                   List[Metrics]],
        metric1: str,
        metric2: str,
        summarizer_type: str,
        skip_summary_level: bool = False,
        skip_system_level: bool = False,
        skip_global: bool = False,
        system_level_output_plot: str = None,
        global_output_plot: str = None,
        ci_method: str = None,
        alpha: float = 0.05,
        two_tailed: bool = True,
        ci_kwargs: Dict = None):
    if system_level_output_plot is not None:
        assert not skip_system_level, 'If `system_level_output_plot` is not `None`, system-level correlations must be calculated'
    if global_output_plot is not None:
        assert not skip_global, 'If `global_output_plot` is not `None`, global correlations must be calculated'

    ci_kwargs = ci_kwargs or {}
    summary_kwargs, system_kwargs, global_kwargs = _split_level_kwargs(
        ci_kwargs)

    if isinstance(metrics_jsonl_files_or_metrics_list, str):
        # A single file
        metrics_list = load_metrics([metrics_jsonl_files_or_metrics_list])
    elif isinstance(metrics_jsonl_files_or_metrics_list, list) and all(
            isinstance(item, str)
            for item in metrics_jsonl_files_or_metrics_list):
        # A list of files
        metrics_list = load_metrics(metrics_jsonl_files_or_metrics_list)
    else:
        # A list of metrics
        assert isinstance(metrics_jsonl_files_or_metrics_list, list) and all(
            isinstance(item, Metrics)
            for item in metrics_jsonl_files_or_metrics_list)
        metrics_list = metrics_jsonl_files_or_metrics_list

    # Merge duplicate metrics objects into one
    metrics_list = merge_metrics(metrics_list)

    for metrics in metrics_list:
        metrics.flatten_keys()

    metrics_list = filter_metrics(metrics_list, summarizer_type, metric1,
                                  metric2)
    for metrics in metrics_list:
        metrics.select_metrics([metric1, metric2])
        metrics.average_values()

    X, Y = convert_to_matrices(metrics_list, metric1, metric2)

    results = {
        'metric1': metric1,
        'metric2': metric2,
        'summarizer_type': summarizer_type
    }
    if not skip_summary_level:
        results['summary_level'] = compute_summary_level_correlations(
            X,
            Y,
            ci_method=ci_method,
            alpha=alpha,
            two_tailed=two_tailed,
            ci_kwargs=summary_kwargs)

    if not skip_system_level:
        results['system_level'] = compute_system_level_correlations(
            X,
            Y,
            ci_method=ci_method,
            alpha=alpha,
            two_tailed=two_tailed,
            ci_kwargs=system_kwargs)
        if system_level_output_plot is not None:
            _plot_system_level_metrics(metrics_list, metric1, metric2,
                                       system_level_output_plot)

    if not skip_global:
        results['global'] = compute_global_correlations(
            X,
            Y,
            ci_method=ci_method,
            alpha=alpha,
            two_tailed=two_tailed,
            ci_kwargs=global_kwargs)
        if global_output_plot is not None:
            _plot_global_metrics(metrics_list, metric1, metric2,
                                 global_output_plot)

    return results
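A hypothetical call, again with a placeholder path and metric names; the 'bootstrap-both' value for ci_method mirrors the resampling method names seen in Example #3, though the accepted values are not shown in this snippet.

results = compute_correlation('metrics.jsonl', 'rouge-1_f1',
                              'responsiveness', 'peer',
                              ci_method='bootstrap-both', alpha=0.05)
print(results['summary_level'])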