Example #1
    def test_compute_violation_ratio_exact(self):
        """
        Test the value of the violation ratio given some exact CDFs.
        """
        test_dists = [
            (
                np.random.normal,
                norm.ppf,
                {"loc": 0.275, "scale": 1.5},
                {"loc": 0.25, "scale": 1},
            ),
            (
                np.random.laplace,
                laplace.ppf,
                {"loc": 0.275, "scale": 1.5},
                {"loc": 0.25, "scale": 1},
            ),
            (np.random.rayleigh, rayleigh.ppf, {"scale": 1.05}, {"scale": 1}),
        ]

        for sample_func, ppf, params_a, params_b in test_dists:
            quantile_func_a = lambda x: ppf(x, **params_a)
            quantile_func_b = lambda x: ppf(x, **params_b)
            violation_ratio_ab_exact = compute_violation_ratio(
                quantile_func_a=quantile_func_a, quantile_func_b=quantile_func_b
            )
            violation_ratio_ba_exact = compute_violation_ratio(
                quantile_func_a=quantile_func_b, quantile_func_b=quantile_func_a
            )

            samples_a = sample_func(size=self.num_samples, **params_a)
            samples_b = sample_func(size=self.num_samples, **params_b)
            violation_ratio_ab_sampled = compute_violation_ratio(
                scores_a=samples_a, scores_b=samples_b
            )
            violation_ratio_ba_sampled = compute_violation_ratio(
                scores_a=samples_b, scores_b=samples_a
            )

            # Check symmetries
            self.assertAlmostEqual(
                violation_ratio_ab_exact, 1 - violation_ratio_ba_exact, delta=0.05
            )
            self.assertAlmostEqual(
                violation_ratio_ab_sampled, 1 - violation_ratio_ba_sampled, delta=0.05
            )

            # Check closeness to exact value
            self.assertAlmostEqual(
                violation_ratio_ab_exact, violation_ratio_ab_sampled, delta=0.05
            )
            self.assertAlmostEqual(
                violation_ratio_ba_exact, violation_ratio_ba_sampled, delta=0.05
            )
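The symmetry this test relies on, violation_ratio(A, B) = 1 - violation_ratio(B, A), can also be checked in isolation with exact quantile functions. A minimal sketch, assuming deepsig's compute_violation_ratio accepts the quantile_func_a / quantile_func_b keywords exactly as in the test above:

from scipy.stats import norm
from deepsig.aso import compute_violation_ratio

# Exact quantile functions of two slightly shifted normal distributions
quantile_func_a = lambda p: norm.ppf(p, loc=0.275, scale=1.5)
quantile_func_b = lambda p: norm.ppf(p, loc=0.25, scale=1.0)

eps_ab = compute_violation_ratio(quantile_func_a=quantile_func_a, quantile_func_b=quantile_func_b)
eps_ba = compute_violation_ratio(quantile_func_a=quantile_func_b, quantile_func_b=quantile_func_a)
print(eps_ab + eps_ba)  # should be close to 1.0 (up to the integration step dt)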
Example #2
    def _bootstrap_iter(seed: Optional[int] = None):
        """
        One bootstrap iteration. Wrapped in a function so it can be handed to joblib.Parallel.
        """
        # When running multiple jobs, these modules have to be re-imported for some reason to avoid an error
        # Use dir() to check whether module is available in local scope:
        # https://stackoverflow.com/questions/30483246/how-to-check-if-a-module-has-been-imported
        if "numpy" not in dir() or "deepsig" not in dir():
            import numpy as np
            from deepsig.aso import compute_violation_ratio

        if seed is not None:
            np.random.seed(seed)

        sampled_scores_a = quantile_func_a(
            np.random.uniform(0, 1, len(scores_a)))
        sampled_scores_b = quantile_func_b(
            np.random.uniform(0, 1, len(scores_b)))
        sample = compute_violation_ratio(
            scores_a=sampled_scores_a,
            scores_b=sampled_scores_b,
            dt=dt,
        )

        return sample
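Each bootstrap iteration above resamples scores by inverse transform sampling: uniform draws on [0, 1] are pushed through each algorithm's quantile function. A self-contained sketch of that idea using a plain empirical quantile function (a stand-in for illustration, not deepsig's get_quantile_function):

import numpy as np

def empirical_quantile(scores):
    """Return a simple empirical quantile function for the given scores."""
    sorted_scores = np.sort(scores)
    return lambda p: np.quantile(sorted_scores, p)

rng = np.random.default_rng(0)
scores = rng.normal(loc=0.3, scale=1.0, size=50)  # toy scores of one algorithm
quantile_func = empirical_quantile(scores)

# One bootstrap draw: uniform samples mapped through the quantile function
resampled_scores = quantile_func(rng.uniform(0, 1, size=len(scores)))
print(resampled_scores.mean(), scores.mean())  # means should be roughly comparable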
Example #3
    def test_compute_violation_ratio_correlation(self):
        """
        Test whether violation ratio is being computed correctly.
        """
        samples_normal2 = np.random.normal(
            scale=2, size=self.num_samples
        )  # Scores for algorithm B
        violation_ratios = []
        inv_sqw_dists = []

        # Shift the distribution of A away (algorithm A becomes better and better)
        for loc in np.arange(0, 1, 0.05):
            samples_normal1 = np.random.normal(loc=loc, size=self.num_samples)
            violation_ratio = compute_violation_ratio(
                samples_normal1, samples_normal2, dt=0.05
            )
            w_dist = wasserstein_distance(samples_normal1, samples_normal2)
            violation_ratios.append(violation_ratio)
            inv_sqw_dists.append(1 / w_dist ** 2)

        # I didn't find a closed-form solution for the violation ratio for two gaussians - so instead I am checking
        # whether it is positively correlated with the inverse squared 1-Wasserstein distance computed via scipy
        rho, _ = pearsonr(violation_ratios, inv_sqw_dists)
        self.assertGreaterEqual(rho, 0.85)
Example #4
    def test_argument_combos(self):
        """
        Try different combinations of input arguments for compute_violation_ratio().
        """
        scores_a = np.random.normal(size=5)
        scores_b = np.random.normal(size=5)
        quantile_func_a = norm.ppf
        quantile_func_b = norm.ppf

        # All of these should work
        for kwarg1, kwarg2 in product(
            [{"scores_a": scores_a}, {"quantile_func_a": quantile_func_a}],
            [{"scores_b": scores_b}, {"quantile_func_b": quantile_func_b}],
        ):
            compute_violation_ratio(**{**kwarg1, **kwarg2})

        # These should create errors
        with self.assertRaises(AssertionError):
            compute_violation_ratio(scores_a=scores_a, quantile_func_a=quantile_func_a)

        with self.assertRaises(AssertionError):
            compute_violation_ratio(scores_b=scores_b, quantile_func_b=quantile_func_b)
Example #5
def aso(
    scores_a: ArrayLike,
    scores_b: ArrayLike,
    confidence_level: float = 0.95,
    num_comparisons: int = 1,
    num_samples: int = 1000,
    num_bootstrap_iterations: int = 1000,
    dt: float = 0.005,
    num_jobs: int = 1,
    show_progress: bool = True,
    seed: Optional[int] = None,
    _progress_bar: Optional[tqdm] = None,
) -> float:
    """
    Performs the Almost Stochastic Order test by Dror et al. (2019). The function takes two lists of scores as input
    (they do not have to be of the same length) and returns an upper bound to the violation ratio - the minimum epsilon
    threshold. `scores_a` should contain scores of the algorithm which we suspect to be better (in this setup,
    higher = better).

    The null hypothesis (which we would like to reject) is that the algorithm that generated `scores_a` is
    *not* better than the one `scores_b` originated from. If the violation ratio is below 0.5, the null hypothesis can
    be rejected safely (and the algorithm behind scores_a is deemed better than the one behind scores_b). Intuitively,
    the violation ratio denotes the degree to which total stochastic order (algorithm A is *always* better than B) is
    being violated. The more scores and the higher num_samples / num_bootstrap_iterations, the more reliable the result.

    Parameters
    ----------
    scores_a: List[float]
        Scores of algorithm A.
    scores_b: List[float]
        Scores of algorithm B.
    confidence_level: float
        Desired confidence level of test. Set to 0.95 by default.
    num_comparisons: int
        Number of comparisons that the test is being used for. Is used to perform a Bonferroni correction.
    num_samples: int
        DEPRECATED: Number of samples from the score distributions during every bootstrap iteration when estimating
        sigma. Currently ignored, and will be deprecated in the next major release.
    num_bootstrap_iterations: int
        Number of bootstrap iterations when estimating sigma.
    dt: float
        Differential for t during integral calculation.
    num_jobs: int
        Number of threads that bootstrap iterations are divided among.
    show_progress: bool
        Show progress bar. Default is True.
    seed: Optional[int]
        Set seed for reproducibility purposes. Default is None (meaning no seed is used).
    _progress_bar: Optional[tqdm]
        Hands over a progress bar object when called by multi_aso(). Only for internal use.

    Returns
    -------
    float
        Return an upper bound to the violation ratio. If it falls below 0.5, the null hypothesis can be rejected.
    """
    assert (len(scores_a) > 0
            and len(scores_b) > 0), "Both lists of scores must be non-empty."
    assert (num_bootstrap_iterations >
            0), "num_bootstrap_iterations must be positive, {} found.".format(
                num_bootstrap_iterations)
    assert (
        num_jobs > 0 or num_jobs == -1
    ), "Number of jobs has to be at least 1 or -1, {} found.".format(num_jobs)
    assert (num_comparisons >
            0), "Number of comparisons has to be at least 1, {} found.".format(
                num_comparisons)

    # Determine the maximum number of jobs possible
    if num_jobs == -1:
        num_jobs = psutil.cpu_count(logical=True)

        if num_jobs is None:
            warn(
                "Number of available CPUs could not be determined, setting num_jobs=1."
            )
            num_jobs = 1

    # TODO: Remove in future version
    if num_samples != 1000:
        warn(
            "'num_samples' argument is being ignored in the current version and will be deprecated in version 1.3!",
            DeprecationWarning,
        )

    # TODO: Remove in future version
    if confidence_level < 0.95:
        warn(
            "'confidence_level' was refactored in version 1.2.4 to be more intuitive and usually should be in the .95 -"
            f".99 range, but {confidence_level} was found. If you tried to adjust the confidence level for multiple "
            f"comparisons, try the new num_comparisons argument instead.",
            UserWarning,
        )

    if num_comparisons > 1:
        confidence_level += (1 - confidence_level) / num_comparisons

    violation_ratio = compute_violation_ratio(scores_a=scores_a,
                                              scores_b=scores_b,
                                              dt=dt)
    # Based on the actual number of samples
    quantile_func_a = get_quantile_function(scores_a)
    quantile_func_b = get_quantile_function(scores_b)

    samples = get_bootstrapped_violation_ratios(
        scores_a,
        scores_b,
        quantile_func_a,
        quantile_func_b,
        num_bootstrap_iterations,
        dt,
        num_jobs,
        show_progress,
        seed,
        _progress_bar,
    )
    samples = np.array(samples)

    const = np.sqrt(
        len(scores_a) * len(scores_b) / (len(scores_a) + len(scores_b)))
    sigma_hat = np.std(const * (samples - violation_ratio))

    # Compute eps_min and make sure it stays in [0, 1]
    min_epsilon = np.clip(
        violation_ratio -
        (1 / const) * sigma_hat * normal.ppf(1 - confidence_level),
        0,
        1,
    )

    return min_epsilon
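The docstring above describes the intended call pattern; below is a minimal usage sketch. The score values are synthetic, and the package-level import `from deepsig import aso` is assumed to be available:

import numpy as np
from deepsig import aso

rng = np.random.default_rng(42)
scores_algo_a = rng.normal(loc=0.9, scale=0.8, size=30)  # algorithm suspected to be better
scores_algo_b = rng.normal(loc=0.0, scale=1.0, size=30)

eps_min = aso(scores_algo_a, scores_algo_b, seed=42)
if eps_min < 0.5:
    print(f"A is almost stochastically dominant over B (eps_min = {eps_min:.3f})")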
Example #6
def multi_aso(
    scores: ScoreCollection,
    confidence_level: float = 0.95,
    use_bonferroni: bool = True,
    use_symmetry: bool = True,
    num_samples: int = 1000,
    num_bootstrap_iterations: int = 1000,
    dt: float = 0.005,
    num_jobs: int = 1,
    return_df: bool = False,
    show_progress: bool = True,
    seed: Optional[int] = None,
) -> Union[np.array, pd.DataFrame]:
    """
    Provides an easy way to compare the scores of multiple models at once. Scores can be supplied in various forms
    (dictionary, nested list, 2D arrays or tensors). Returns a matrix (or pandas.DataFrame) with results. Applies the
    Bonferroni correction to the confidence level by default, but this can be disabled by setting use_bonferroni=False.

    Parameters
    ----------
    scores: ScoreCollection
        Collection of model scores. Should be either dictionary of model name to model scores, nested Python list,
        2D numpy or Jax array, or 2D Tensorflow or PyTorch tensor.
    confidence_level: float
        Desired confidence level of test. Set to 0.95 by default.
    use_bonferroni: bool
        Indicate whether Bonferroni correction should be applied to confidence level in order to adjust for the number
        of comparisons. Default is True.
    use_symmetry: bool
        DEPRECATED: Use the fact that ASO(A, B, alpha) = 1 - ASO(B, A, alpha)
        (`del Barrio et al. (2018) <https://arxiv.org/pdf/1705.01788.pdf>`_) to save half of the computations. Default is
        True. Currently ignored, and will be deprecated in the next major release.
    num_samples: int
        DEPRECATED: Number of samples from the score distributions during every bootstrap iteration when estimating
        sigma. Currently ignored, and will be deprecated in the next major release.
    num_bootstrap_iterations: int
        Number of bootstrap iterations when estimating sigma.
    dt: float
        Differential for t during integral calculation.
    num_jobs: int
        Number of threads that bootstrap iterations are divided among.
    return_df: bool
        Indicate whether result should be returned as pandas DataFrame. Only possible if scores is a dictionary of
        model names to model scores. Otherwise, 2D numpy array with eps_min scores is returned. Default is False.
    show_progress: bool
        Show progress bar. Default is True.
    seed: Optional[int]
        Set seed for reproducibility purposes. Default is None (meaning no seed is used).

    Returns
    -------
    Union[np.array, pd.DataFrame]
        2D numpy array or pandas DataFrame (if scores is a dictionary and return_df=True) with the results of ASO.
    """
    assert (
        num_jobs > 0 or num_jobs == -1
    ), "Number of jobs has to be at least 1 or -1, {} found.".format(num_jobs)

    # Determine the maximum number of jobs possible
    if num_jobs == -1:
        num_jobs = psutil.cpu_count(logical=True)

        if num_jobs is None:
            warn(
                "Number of available CPUs could not be determined, setting num_jobs=1."
            )
            num_jobs = 1

    # TODO: Remove in future version
    if num_samples != 1000:
        warn(
            "'num_samples' argument is being ignored in the current version and will be deprecated in version 1.3!",
            DeprecationWarning,
        )

    # TODO: Remove in future version
    if not use_symmetry:
        warn(
            "'use_symmetry' argument is being ignored in the current version and will be deprecated in version 1.3!",
            DeprecationWarning,
        )

    # TODO: Remove in future version
    if confidence_level < 0.95:
        warn(
            "'confidence_level' was refactored in version 1.2.4 to be more intuitive and usually should be in the .95 -"
            f".99 range, but {confidence_level} was found.",
            UserWarning,
        )

    num_models = _get_num_models(scores)
    num_comparisons = num_models * (num_models - 1) / 2
    eps_min = np.eye(num_models)  # Initialize score matrix

    if use_bonferroni:
        # Increase the confidence level in order to mitigate the multiple comparisons problem
        confidence_level += (1 - confidence_level) / num_comparisons

    # Iterate over simple indices or dictionary keys depending on type of scores argument
    indices = list(range(num_models)) if type(scores) != dict else list(
        scores.keys())

    # Add progressbar if applicable
    progress_bar = None
    if show_progress:
        progress_bar = tqdm(
            range(int(num_comparisons *
                      num_bootstrap_iterations)) if use_symmetry else range(
                          int(num_comparisons * num_bootstrap_iterations * 2)),
            desc="Model comparisons",
        )

    for i, key_i in enumerate(indices):
        for j, key_j in enumerate(indices[(i + 1):], start=i + 1):
            scores_a, scores_b = scores[key_i], scores[key_j]
            quantile_func_a = get_quantile_function(scores_a)
            quantile_func_b = get_quantile_function(scores_b)
            const = np.sqrt(
                len(scores_a) * len(scores_b) /
                (len(scores_a) + len(scores_b)))

            violation_ratio_ab = compute_violation_ratio(
                dt=dt,
                quantile_func_a=quantile_func_a,
                quantile_func_b=quantile_func_b,
            )
            violation_ratio_ba = 1 - violation_ratio_ab  # Exploit the symmetry of the violation ratio here
            samples_ab = get_bootstrapped_violation_ratios(
                scores_a,
                scores_b,
                quantile_func_a,
                quantile_func_b,
                num_bootstrap_iterations,
                dt,
                num_jobs,
                show_progress,
                seed,
                progress_bar,
            )
            samples_ab = np.array(samples_ab)

            # This quantity is the same for both, so we only have to compute it once, see
            # (samples_ab - violation_ratio_ab)
            # = (1 - samples_ba - 1 + violation_ratio_ba)
            # = (samples_ba - violation_ratio_ba)
            sigma_hat = np.std(const * (samples_ab - violation_ratio_ab))

            # Compute eps_min and make sure it stays in [0, 1]
            min_epsilon_ab = np.clip(
                violation_ratio_ab -
                (1 / const) * sigma_hat * normal.ppf(1 - confidence_level),
                0,
                1,
            )
            min_epsilon_ba = np.clip(
                violation_ratio_ba -
                (1 / const) * sigma_hat * normal.ppf(1 - confidence_level),
                0,
                1,
            )

            # Set values
            eps_min[i, j] = min_epsilon_ab
            eps_min[j, i] = min_epsilon_ba

    if type(scores) == dict and return_df:
        eps_min = pd.DataFrame(data=eps_min, index=list(scores.keys()))
        eps_min = eps_min.rename(dict(enumerate(scores.keys())), axis=1)

    return eps_min
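A short usage sketch for the dictionary input form described in the docstring; the model names and scores are invented for illustration, and the package-level import `from deepsig import multi_aso` is assumed:

import numpy as np
from deepsig import multi_aso

rng = np.random.default_rng(0)
scores = {
    "model_a": rng.normal(loc=0.3, size=25),
    "model_b": rng.normal(loc=0.0, size=25),
    "model_c": rng.normal(loc=-0.3, size=25),
}

# Entry (i, j) is the eps_min obtained when model i plays the role of algorithm A
# and model j the role of algorithm B
eps_min_df = multi_aso(scores, confidence_level=0.95, return_df=True, seed=0)
print(eps_min_df)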
Example #7
def aso_bootstrap_comparisons(
    scores_a: ArrayLike,
    scores_b: ArrayLike,
    confidence_level: float = 0.05,
    num_samples: int = 1000,
    num_bootstrap_iterations: int = 1000,
    dt: float = 0.005,
    num_jobs: int = 2,
    show_progress: bool = False,
    seed: Optional[int] = None,
    _progress_bar: Optional[tqdm] = None,
) -> Dict[str, float]:
    """
    Like the package ASO function, but compares different choices of bootstrap estimator.

    Parameters
    ----------
    scores_a: List[float]
        Scores of algorithm A.
    scores_b: List[float]
        Scores of algorithm B.
    confidence_level: float
        Desired confidence level of test. Set to 0.05 by default.
    num_samples: int
        Number of samples from the score distributions during every bootstrap iteration when estimating sigma.
    num_bootstrap_iterations: int
        Number of bootstrap iterations when estimating sigma.
    dt: float
        Differential for t during integral calculation.
    num_jobs: int
        Number of threads that bootstrap iterations are divided among.
    show_progress: bool
        Show progress bar. Default is False.
    seed: Optional[int]
        Set seed for reproducibility purposes. Default is None (meaning no seed is used).
    _progress_bar: Optional[tqdm]
        Hands over a progress bar object when called by multi_aso(). Only for internal use.

    Returns
    -------
    Dict[str, float]
        Dictionary mapping the name of each bootstrap estimator variant to its eps_min value.
    """
    assert (len(scores_a) > 0
            and len(scores_b) > 0), "Both lists of scores must be non-empty."
    assert num_samples > 0, "num_samples must be positive, {} found.".format(
        num_samples)
    assert (num_bootstrap_iterations >
            0), "num_bootstrap_iterations must be positive, {} found.".format(
                num_bootstrap_iterations)
    assert num_jobs > 0, "Number of jobs has to be at least 1, {} found.".format(
        num_jobs)

    violation_ratio = compute_violation_ratio(scores_a, scores_b, dt)

    # Based on the actual number of samples
    const1 = np.sqrt(
        len(scores_a) * len(scores_b) / (len(scores_a) + len(scores_b)))
    quantile_func_a = get_quantile_function(scores_a)
    quantile_func_b = get_quantile_function(scores_b)

    def _progress_iter(high: int, progress_bar: tqdm):
        """
        This function is used when a shared progress bar is passed from multi_aso() - every time the iterator yields an
        element, the progress bar is updated by one. It essentially behaves like a simplified range() function.

        Parameters
        ----------
        high: int
            Number of elements in iterator.
        progress_bar: tqdm
            Shared progress bar.
        """
        current = 0

        while current < high:
            yield current
            current += 1
            progress_bar.update(1)

    # Add progress bar if applicable
    if show_progress and _progress_bar is None:
        iters = tqdm(range(num_bootstrap_iterations),
                     desc="Bootstrap iterations")

    # Shared progress bar when called from multi_aso()
    elif _progress_bar is not None:
        iters = _progress_iter(num_bootstrap_iterations, _progress_bar)

    else:
        iters = range(num_bootstrap_iterations)

    # Set seeds for the individual bootstrap iterations if applicable
    # "Sub-seeds" are just the seed argument plus the iteration index
    seeds = ([None] * num_bootstrap_iterations if seed is None else
             [seed + offset for offset in range(1, num_bootstrap_iterations + 1)])

    def _bootstrap_iter(seed: Optional[int] = None):
        """
        One bootstrap iteration. Wrapped in a function so it can be handed to joblib.Parallel.
        """
        # When running multiple jobs, these modules have to be re-imported for some reason to avoid an error
        # Use dir() to check whether module is available in local scope:
        # https://stackoverflow.com/questions/30483246/how-to-check-if-a-module-has-been-imported
        if "np" not in dir() or "deepsig" not in dir():
            import numpy as np
            from deepsig.aso import compute_violation_ratio

        if seed is not None:
            np.random.seed(seed)

        sampled_scores_a = quantile_func_a(
            np.random.uniform(0, 1, len(scores_a)))
        sampled_scores_b = quantile_func_b(
            np.random.uniform(0, 1, len(scores_b)))
        sample = compute_violation_ratio(
            sampled_scores_a,
            sampled_scores_b,
            dt,
        )

        return sample

    # Initialize worker pool and start iterations
    parallel = Parallel(n_jobs=num_jobs)
    samples = parallel(
        delayed(_bootstrap_iter)(seed) for seed, _ in zip(seeds, iters))

    # Compute the different variants of the bootstrap estimator

    # 1. Classic bootstrap estimator
    sigma_hat1 = np.std(1 / (num_bootstrap_iterations - 1) *
                        (samples - np.mean(samples)))
    min_epsilon1 = np.clip(
        violation_ratio -
        (1 / const1) * sigma_hat1 * normal.ppf(confidence_level),
        0,
        1,
    )

    # 2. ASO as implemented by Dror et al. (2019)
    sigma_hat2 = np.std(const1 * (samples - violation_ratio))
    min_epsilon2 = np.clip(
        violation_ratio -
        (1 / const1) * sigma_hat2 * normal.ppf(confidence_level),
        0,
        1,
    )

    # 3. Like 2., but using the expected violation ratio for sigma
    sigma_hat3 = np.std(const1 * (samples - np.mean(samples)))
    min_epsilon3 = np.clip(
        violation_ratio -
        (1 / const1) * sigma_hat3 * normal.ppf(confidence_level),
        0,
        1,
    )

    # 4. Like 3, but with the classic bootstrap bias correction
    corrected_bootstrap_violation_ratio = np.clip(
        2 * violation_ratio - np.mean(samples), 0, 1)
    min_epsilon4 = np.clip(
        corrected_bootstrap_violation_ratio -
        (1 / const1) * sigma_hat3 * normal.ppf(confidence_level),
        0,
        1,
    )

    # 5. Like 4., but with conditionally corrected bootstrap estimate
    bias = np.mean(samples) - violation_ratio
    sigma_hat_corr = np.std(1 / (len(samples) - 1) *
                            (samples - np.mean(samples)))
    min_epsilon5 = np.clip(
        (corrected_bootstrap_violation_ratio
         if bias >= sigma_hat_corr else violation_ratio) -
        (1 / const1) * sigma_hat3 * normal.ppf(confidence_level),
        0,
        1,
    )

    # 6. Like 5., but the conditional correction is based on the sigma hat that is used below (sigma_hat3)
    min_epsilon6 = np.clip(
        (corrected_bootstrap_violation_ratio
         if bias >= sigma_hat3 else violation_ratio) -
        (1 / const1) * sigma_hat3 * normal.ppf(confidence_level),
        0,
        1,
    )

    return {
        "Classic Bootstrap": min_epsilon1,
        "Dror et al. (2019)": min_epsilon2,
        r"Bootstrap $\varepsilon_{\mathcal{W}_2}$ mean": min_epsilon3,
        "Bootstrap correction": min_epsilon4,
        "Cond. Bootstrap corr.": min_epsilon5,
        "Cond. Bootstrap corr. 2": min_epsilon6,
    }
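Since this experimental helper returns one eps_min per bootstrap-estimator variant, the results can be inspected side by side as in the sketch below (the scores are synthetic, and aso_bootstrap_comparisons refers to the function defined above):

import numpy as np

rng = np.random.default_rng(1)
scores_a = rng.normal(loc=0.2, scale=0.5, size=40)
scores_b = rng.normal(loc=0.0, scale=1.0, size=40)

results = aso_bootstrap_comparisons(scores_a, scores_b,
                                    num_bootstrap_iterations=500, seed=1)
for variant, eps_min in results.items():
    print(f"{variant:<45} {eps_min:.3f}")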
Example #8
def test_type2_error(
    sample_size: int,
    colors: Dict[str, str],
    name: str,
    num_simulations: int = 200,
    thresholds: List[float] = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
    dist_func: Callable = np.random.normal,
    inv_cdf_func: Callable = scipy.stats.norm.ppf,
    dist_params: Dict[str, Any] = {
        "loc": 0,
        "scale": 0.5
    },
    dist_params2: Dict[str, Any] = {
        "loc": -0.25,
        "scale": 1.5
    },
    save_dir: Optional[str] = None,
):
    """
    Test the rate of type II error (false negative) under different sample sizes.

    Parameters
    ----------
    sample_size: int
        Sample size used in experiments.
    colors: Dict[str, str]
        Colors corresponding to each test for plotting.
    name: str
        Name of the experiment.
    num_simulations: int
        Number of simulations conducted.
    thresholds: List[float]
        Thresholds on eps_min used when computing the rates that are written out at the end.
    dist_func: Callable
        Distribution function that is used for sampling.
    inv_cdf_func: Callable
        Inverse cumulative distribution function used to compute the exact violation ratio.
    dist_params: Dict[str, Any]
        Parameters of the distribution function.
    dist_params2: Dict[str, Any]
        Parameters of the comparison distribution function.
    save_dir: Optional[str]
        Directory that plots should be saved to.
    """
    simulation_results = defaultdict(list)

    with tqdm(total=len(colors) * num_simulations) as progress_bar:
        for _ in range(num_simulations):

            # Sample scores for this round
            scores_a = dist_func(**dist_params, size=sample_size)
            scores_b = dist_func(**dist_params2, size=sample_size)

            results = aso_bootstrap_comparisons(scores_a, scores_b)

            for variant, res in results.items():
                simulation_results[variant].append(res)

            progress_bar.update(len(colors))

    # with open(f"{save_dir}/type1_pg_rates.pkl", "wb") as out_file:
    #    pickle.dump(simulation_results, out_file)

    # Plot Type I error rates as line plot
    plt.figure(figsize=(8, 6))
    plt.rcParams.update({
        "font.size": 18,
        "text.usetex": True,
        "legend.loc": "upper right"
    })

    # Create datastructure for boxplots
    data = [
        simulation_results[test_name]
        for test_name in simulation_results.keys()
    ]

    box_plot = plt.boxplot(
        data,
        widths=0.45,
        patch_artist=True,
    )

    for variant_name, patch, color in zip(simulation_results.keys(),
                                          box_plot["boxes"], colors.values()):
        patch.set_edgecolor(color)
        patch.set_facecolor("white")

        plt.plot([], color=color, label=variant_name)

    real_violation_ratio = compute_violation_ratio(
        [],
        [],
        dt=0.05,
        quantile_func_a=lambda p: inv_cdf_func(p, **dist_params),
        quantile_func_b=lambda p: inv_cdf_func(p, **dist_params2),
    )

    plt.xticks(
        range(1,
              len(colors) + 1),
        [
            f"{(np.array(simulation_results[variant_name]) > thresholds[0]).astype(float).mean():.2f}"
            for variant_name in simulation_results.keys()
        ],
    )

    ax = plt.gca()
    ax.set_ylim(0, 1)
    x = np.arange(ax.get_xlim()[0], ax.get_xlim()[1] + 1)
    plt.plot(
        x,
        np.ones(len(x)) * real_violation_ratio,
        alpha=0.8,
        linestyle="--",
        color="black",
    )
    ax.yaxis.grid()
    plt.xlabel("Bootstrap variants")
    plt.ylabel(r"$\varepsilon_\mathrm{min}$")
    plt.legend()

    if save_dir is not None:
        plt.tight_layout()
        plt.savefig(f"{save_dir}/type2_bootstrap_dists_{name}.png")
    else:
        plt.show()

    plt.close()

    with open(f"{save_dir}/type2_bootstrap_rates_{name}.txt", "w") as out_file:
        rates_df = pd.DataFrame(index=thresholds,
                                columns=simulation_results.keys())

        for threshold in thresholds:
            for variant_name, data in simulation_results.items():
                rates_df.at[threshold,
                            variant_name] = ((np.array(data) >
                                              threshold).astype(float).mean())

        out_file.write(rates_df.to_latex())
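The rates written out at the end are simply the fraction of simulations whose eps_min exceeds a given threshold, i.e. the runs in which the test would not be considered significant at that threshold. As a standalone arithmetic check with invented values:

import numpy as np

eps_min_samples = np.array([0.12, 0.34, 0.03, 0.56, 0.20])  # eps_min from five hypothetical runs
threshold = 0.3
rate = (eps_min_samples > threshold).astype(float).mean()
print(rate)  # 0.4 -> two of the five runs exceed the threshold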