Example #1
    def eval_policy(save_dir: Optional[str],
                    env: Union[RealEnv, SimEnv, MetaDomainRandWrapper],
                    policy: Policy,
                    mc_estimator: bool,
                    prefix: str,
                    num_rollouts: int,
                    num_parallel_envs: int = 1) -> to.Tensor:
        """
        Evaluate a policy on the target system (real-world platform).
        This method is static to facilitate evaluation of specific policies in hindsight.

        :param save_dir: directory to save the snapshots, i.e. the results, in; if `None` nothing is saved
        :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance
        :param policy: policy to evaluate
        :param mc_estimator: estimate the return with a sample average (`True`) or a lower confidence
                             bound (`False`) obtained from bootstrapping
        :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
        :param num_rollouts: number of rollouts to collect on the target system
        :param num_parallel_envs: number of environments for the parallel sampler (only used for `SimEnv`)
        :return: estimated return in the target domain
        """
        if save_dir is not None:
            print_cbt(f'Executing {prefix}_policy ...', 'c', bright=True)

        rets_real = to.zeros(num_rollouts)
        if isinstance(inner_env(env), RealEnv):
            # Evaluate sequentially when conducting a sim-to-real experiment
            for i in range(num_rollouts):
                rets_real[i] = rollout(env, policy, eval=True).undiscounted_return()
                # If a return of -1 is given, abort the remaining evaluations and set all returns to zero
                if rets_real[i] == -1:
                    print_cbt('Set all returns for this policy to zero.', color='c')
                    rets_real = to.zeros(num_rollouts)
                    break
        elif isinstance(inner_env(env), SimEnv):
            # Create a parallel sampler when conducting a sim-to-sim experiment
            sampler = ParallelRolloutSampler(env, policy, num_workers=num_parallel_envs, min_rollouts=num_rollouts)
            ros = sampler.sample()
            for i in range(num_rollouts):
                rets_real[i] = ros[i].undiscounted_return()
        else:
            raise pyrado.TypeErr(given=inner_env(env), expected_type=[RealEnv, SimEnv])

        if save_dir is not None:
            # Save the evaluation results
            to.save(rets_real, osp.join(save_dir, f'{prefix}_returns_real.pt'))

            print_cbt('Target domain performance', bright=True)
            print(tabulate([['mean return', to.mean(rets_real).item()],
                            ['std return', to.std(rets_real).item()],
                            ['min return', to.min(rets_real).item()],
                            ['max return', to.max(rets_real).item()]]))

        if mc_estimator:
            return to.mean(rets_real)
        else:
            return to.from_numpy(bootstrap_ci(rets_real.numpy(), np.mean,
                                              num_reps=1000, alpha=0.05, ci_sides=1, studentized=False)[1])
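A note on the `mc_estimator` switch: with `True` the function returns the plain sample average of the collected returns, with `False` a conservative lower confidence bound on the mean return. The following numpy-only sketch (made-up returns, independent of pyrado) illustrates the difference using the empirical bootstrap that the tests below describe.

# Minimal sketch of the two estimators; the returns are made up and stand in
# for real rollout results
import numpy as np

rng = np.random.default_rng(0)
rets_real = rng.normal(500.0, 50.0, 10)  # placeholder for real-world returns

ret_mc = rets_real.mean()  # mc_estimator=True: plain sample average

# mc_estimator=False: one-sided 95% lower bound via the empirical bootstrap
m = rets_real.mean()
boot_means = rng.choice(rets_real, size=(1000, rets_real.size), replace=True).mean(axis=1)
ret_lcb = m - np.percentile(boot_means - m, 95)  # lower confidence bound on the mean

print(f'sample average: {ret_mc:.2f}, lower confidence bound: {ret_lcb:.2f}')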
Example #2
def test_bootstrapping():
    # Why you should operate on the deltas and not directly on the statistic from the resampled data
    sample = np.array([30, 37, 36, 43, 42, 43, 43, 46, 41, 42])
    mean = np.mean(sample)
    print(mean)
    m, ci = bootstrap_ci(sample, np.mean, num_reps=20, alpha=0.1, ci_sides=2, seed=123)
    print(m, ci)

    np.random.seed(123)
    resampled = np.random.choice(sample, (sample.shape[0], 20), replace=True)
    means = np.apply_along_axis(np.mean, 0, resampled)
    print(np.sort(means))
    ci_lo, ci_up = np.percentile(means, [100*0.05, 100*0.95])
    print(ci_lo, ci_up)

    x = np.random.normal(10, 1, 40)
    # x = np.random.uniform(5, 15, 20)
    # x = np.random.poisson(5, 30)
    np.random.seed(1)
    # print(bs.bootstrap(x, stat_func=bs_stats.mean))

    np.random.seed(1)
    m, ci = bootstrap_ci(x, np.mean, num_reps=1000, alpha=0.05, ci_sides=2, studentized=False, bias_correction=False)
    print('[studentized=False] mean: ', m)
    print('[studentized=False] CI: ', ci)

    np.random.seed(1)
    m, ci = bootstrap_ci(x, np.mean, num_reps=1000, alpha=0.05, ci_sides=2, studentized=False, bias_correction=True)
    print('[bias_correction=True] mean: ', m)

    m, ci = bootstrap_ci(x, np.mean, num_reps=2*384, alpha=0.05, ci_sides=1, studentized=False)
    print('[studentized=False] mean: ', m)
    print('[studentized=False] CI: ', ci)

    m, ci = bootstrap_ci(x, np.mean, num_reps=2*384, alpha=0.05, ci_sides=1, studentized=True)
    print('[studentized=True] mean: ', m)
    print('[studentized=True] CI: ', ci)

    print('Matlab example:')
    # https://de.mathworks.com/help/stats/bootci.html
    x_matlab = np.random.normal(1, 1, 40)

    m, ci = bootstrap_ci(x_matlab, np.mean, num_reps=2000, alpha=0.05, ci_sides=2, studentized=False)
    print('[studentized=False] mean: ', m)
    print('[studentized=False] CI: ', ci)

    m, ci = bootstrap_ci(x_matlab, np.mean, num_reps=2000, alpha=0.05, ci_sides=2, studentized=True)
    print('[studentized=True] mean: ', m)
    print('[studentized=True] CI: ', ci)
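Spelled out, the point of the first half of this test: the empirical bootstrap builds the interval from the deltas between the resampled statistic and the sample statistic and mirrors their quantiles around the sample estimate, whereas the percentile bootstrap reads the quantiles of the resampled statistic directly. A self-contained numpy sketch of both constructions (not part of the repository):

# Both bootstrap CI constructions side by side, numpy only
import numpy as np

rng = np.random.default_rng(123)
sample = np.array([30, 37, 36, 43, 42, 43, 43, 46, 41, 42])
m = np.mean(sample)

resampled = rng.choice(sample, size=(1000, sample.shape[0]), replace=True)
means = resampled.mean(axis=1)

# Percentile bootstrap: quantiles of the resampled means, used as-is
perc_lo, perc_up = np.percentile(means, [5, 95])

# Empirical bootstrap: quantiles of the deltas, mirrored around the sample mean
d_lo, d_up = np.percentile(means - m, [5, 95])
emp_lo, emp_up = m - d_up, m - d_lo

print(f'percentile CI: [{perc_lo:.2f}, {perc_up:.2f}]')
print(f'empirical CI:  [{emp_lo:.2f}, {emp_up:.2f}]')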
Example #3
def test_bootstrapping(data, num_reps, seed):
    # Fully-fledged example
    bootstrap_ci(data,
                 np.mean,
                 num_reps,
                 alpha=0.05,
                 ci_sides=2,
                 studentized=True,
                 bias_correction=True,
                 seed=seed)

    m, ci_lo, ci_up = bootstrap_ci(data,
                                   np.mean,
                                   num_reps,
                                   alpha=0.05,
                                   ci_sides=2,
                                   studentized=False,
                                   bias_correction=False,
                                   seed=seed)
    assert np.all(m >= ci_lo)
    assert np.all(m <= ci_up)

    m_bc, ci_lo, ci_up = bootstrap_ci(data,
                                      np.mean,
                                      num_reps,
                                      alpha=0.05,
                                      ci_sides=2,
                                      studentized=False,
                                      bias_correction=True,
                                      seed=seed)
    assert np.all(m_bc != m)

    m, ci_lo, ci_up = bootstrap_ci(data,
                                   np.mean,
                                   num_reps,
                                   alpha=0.05,
                                   ci_sides=1,
                                   studentized=False,
                                   seed=seed)
    m_t, ci_lo_t, ci_up_t = bootstrap_ci(data,
                                         np.mean,
                                         num_reps,
                                         alpha=0.05,
                                         ci_sides=1,
                                         studentized=True,
                                         seed=seed)
    assert m == pytest.approx(m_t)
    assert np.all(m_t >= ci_lo_t)
    assert np.all(m_t <= ci_up_t)
    # Bounds are different (not generally wider) when assuming a t-distribution
    assert np.all(ci_lo != ci_lo_t)
    assert np.all(ci_up != ci_up_t)
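For reference, a numpy-only sketch of what `studentized=True` changes (an illustration of the general bootstrap-t technique, not the library's internals): every resample yields a pivot t* = (mean* - mean) / se*, and the interval is built from the quantiles of these pivots scaled by the standard error of the original sample. This is also why the studentized bounds differ from, but are not generally wider than, the non-studentized ones.

# Studentized (bootstrap-t) confidence interval for the mean, numpy only
import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(10, 1, 40)
n = data.shape[0]
m = data.mean()
se = data.std(ddof=1) / np.sqrt(n)

resampled = rng.choice(data, size=(1000, n), replace=True)
means = resampled.mean(axis=1)
ses = resampled.std(ddof=1, axis=1) / np.sqrt(n)
t_star = (means - m) / ses  # one pivot per resample

q_lo, q_up = np.percentile(t_star, [2.5, 97.5])
ci_lo, ci_up = m - se * q_up, m - se * q_lo
print(f'studentized bootstrap CI: [{ci_lo:.3f}, {ci_up:.3f}]')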
Example #4
def test_bootstrap_methods(sample, seed):
    # Empirical bootstrap
    m_bs, ci_bs_lo, ci_bs_up = bootstrap_ci(sample,
                                            np.mean,
                                            num_reps=20,
                                            alpha=0.1,
                                            ci_sides=2,
                                            seed=seed)

    # Percentile bootstrap
    # Add one to the seed because, with the MD5-based seed calculation, the lower quantiles would otherwise
    # be equal by chance. This seems to be a one-in-a-million case.
    pyrado.set_seed(seed + 1)
    resampled = np.random.choice(sample, (sample.shape[0], 20), replace=True)
    means = np.apply_along_axis(np.mean, 0, resampled)
    ci_lo, ci_up = np.percentile(means, [5, 95])

    # You should operate on the deltas (empirical bootstrap) and not directly on the statistic from the resampled
    # data (percentile bootstrap)
    assert ci_lo != ci_bs_lo
    assert ci_up != ci_bs_up
Example #5
    def load_snapshot(self, load_dir: Optional[str] = None, meta_info: Optional[dict] = None):
        # Get the directory to load from
        ld = load_dir if load_dir is not None else self._save_dir
        if not osp.isdir(ld):
            raise pyrado.ValueErr(msg='Given path is not a directory!')

        if meta_info is None:
            # This algorithm instance is not a subroutine of a meta-algorithm
            self._env_sim = joblib.load(osp.join(ld, 'env_sim.pkl'))
            self._env_real = joblib.load(osp.join(ld, 'env_real.pkl'))

            # Crawl through the given directory and check how many policies and candidates there are
            found_policies, found_cands = None, None
            for root, dirs, files in os.walk(ld):
                found_policies = [
                    p for p in files if p.endswith('_policy.pt')
                ]  # 'policy.pt' file should not be found
                found_cands = [c for c in files if c.endswith('_candidate.pt')]

            # Copy to the current experiment's directory. Not necessary if we are continuing in that directory.
            if ld != self._save_dir:
                for p in found_policies:
                    copyfile(osp.join(ld, p), osp.join(self._save_dir, p))
                for c in found_cands:
                    copyfile(osp.join(ld, c), osp.join(self._save_dir, c))

            if len(found_policies) > 0:
                # Load all found candidates to save them into a single tensor
                found_cands.sort(
                )  # the order is important since it determines the rows of the tensor
                self.cands = to.stack(
                    [to.load(osp.join(ld, c)) for c in found_cands])
                to.save(self.cands, osp.join(self._save_dir, 'candidates.pt'))

                # Catch the case that the algorithm stopped before evaluating a sampled candidate
                if len(found_policies) != len(found_cands):
                    print_cbt(
                        f'Found {len(found_policies)} policies, but {len(found_cands)} candidates!',
                        'r')
                    n = len(found_cands) - len(found_policies)
                    delete = input(
                        'Delete the superfluous candidates? [y / any other]'
                    ).lower() == 'y'
                    if n > 0 and delete:
                        # Delete the superfluous candidates
                        print_cbt(f'Candidates before:\n{self.cands.numpy()}',
                                  'w')
                        self.cands = self.cands[:-n, :]
                        found_cands = found_cands[:-n]
                        to.save(self.cands,
                                osp.join(self._save_dir, 'candidates.pt'))
                        print_cbt(f'Candidates after:\n{self.cands.numpy()}',
                                  'c')
                    else:
                        raise pyrado.ShapeErr(
                            msg=f'Found {len(found_policies)} policies,'
                            f'but {len(found_cands)} candidates!')

            else:
                # Assuming that not even the training of the initial policies has been finished. Redo it all.
                print_cbt(
                    'No policies have been found. Basically starting from scratch.',
                    'c')
                self.train_init_policies()
                self.eval_init_policies()
                self.initialized = True

            try:
                # Crawl through the load_dir and copy all done evaluations.
                # Not necessary if we are continuing in that directory.
                if ld != self._save_dir:
                    for root, dirs, files in os.walk(ld):
                        [
                            copyfile(osp.join(ld, c),
                                     osp.join(self._save_dir, c))
                            for c in files if c.endswith('_returns_real.pt')
                        ]

                # Get all previously done evaluations. If we don't find any, the exception is caught.
                found_evals = None
                for root, dirs, files in os.walk(ld):
                    found_evals = [
                        v for v in files if v.endswith('_returns_real.pt')
                    ]
                found_evals.sort(
                )  # the order is important since it determines the rows of the tensor

                # Reconstruct candidates_values.pt
                self.cands_values = to.empty(self.cands.shape[0])
                for i, fe in enumerate(found_evals):
                    # Get the return estimate from the raw evaluations as in eval_policy()
                    if self.montecarlo_estimator:
                        self.cands_values[i] = to.mean(
                            to.load(osp.join(ld, fe)))
                    else:
                        self.cands_values[i] = to.from_numpy(
                            bootstrap_ci(to.load(osp.join(ld, fe)).numpy(),
                                         np.mean,
                                         num_reps=1000,
                                         alpha=0.05,
                                         ci_sides=1,
                                         studentized=False)[1])

                if len(found_evals) < len(found_cands):
                    print_cbt(
                        f'Found {len(found_evals)} real-world evaluation files but {len(found_cands)} candidates.'
                        f' Now evaluating the remaining ones.',
                        'c',
                        bright=True)
                for i in range(len(found_cands) - len(found_evals)):
                    # Evaluate the current policy on the target domain
                    if len(found_evals) < self.num_init_cand:
                        prefix = f'init_{i + len(found_evals)}'
                    else:
                        prefix = f'iter_{i + len(found_evals) - self.num_init_cand}'
                    policy = to.load(
                        osp.join(self._save_dir, f'{prefix}_policy.pt'))
                    self.cands_values[i + len(found_evals)] = self.eval_policy(
                        self._save_dir, self._env_real, policy,
                        self.montecarlo_estimator, prefix,
                        self.num_eval_rollouts_real)
                to.save(self.cands_values,
                        osp.join(self._save_dir, 'candidates_values.pt'))

                if len(found_cands) < self.num_init_cand:
                    print_cbt(
                        'Found less candidates than the number of initial candidates.',
                        'y')
                else:
                    self.initialized = True

            except (FileNotFoundError, RuntimeError):
            # If there are no *_returns_real.pt files but len(found_policies) > 0 (was checked earlier),
            # then the initial policies have not been evaluated yet
                self.eval_init_policies()
                self.initialized = True

            # Get current iteration count
            found_iter_policies = None
            for root, dirs, files in os.walk(ld):
                found_iter_policies = [
                    p for p in files
                    if p.startswith('iter_') and p.endswith('_policy.pt')
                ]

            if not found_iter_policies:
                self._curr_iter = 0
                # We don't need to init the subroutine since it will be reset for iteration 0 anyway
            else:
                self._curr_iter = len(
                    found_iter_policies)  # continue with next

                # Initialize subroutine with previous iteration
                self._subroutine.load_snapshot(
                    ld, meta_info=dict(prefix=f'iter_{self._curr_iter - 1}'))

                # Evaluate and save the latest candidate on the target system.
                # This is the case if we found iter_i_candidate.pt but not iter_i_returns_real.pt
                if self.cands.shape[0] == self.cands_values.shape[0] + 1:
                    curr_cand_value = self.eval_policy(
                        self._save_dir,
                        self._env_real,
                        self._subroutine.policy,
                        self.montecarlo_estimator,
                        prefix=f'iter_{self._curr_iter - 1}',
                        num_rollouts=self.num_eval_rollouts_real)
                    self.cands_values = to.cat(
                        [self.cands_values,
                         curr_cand_value.view(1)], dim=0)
                    to.save(self.cands_values,
                            osp.join(self._save_dir, 'candidates_values.pt'))

                    if isinstance(self._env_real, RealEnv):
                        input(
                            'Evaluated in the target domain. Hit any key to continue.'
                        )

        else:
            raise pyrado.ValueErr(
                msg=f'{self.name} is not supposed to be run as a subroutine!')
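The directory crawling above reduces to a small pattern: collect the per-iteration files, sort them so that the file order fixes the row order, and stack them into a single tensor. A hedged sketch of just that step; the directory name is a placeholder and dummy files are created only so the snippet runs:

# Reassemble one candidates tensor from per-iteration files on disk (sketch)
import os
import os.path as osp
import torch as to

ld = 'path/to/experiment'  # placeholder directory
os.makedirs(ld, exist_ok=True)
for i in range(3):  # dummy candidate files, only for the sake of the sketch
    to.save(to.rand(4), osp.join(ld, f'iter_{i}_candidate.pt'))

found_cands = sorted(f for f in os.listdir(ld) if f.endswith('_candidate.pt'))
# The sorting matters: the file order determines the rows of the stacked tensor
cands = to.stack([to.load(osp.join(ld, f)) for f in found_cands])
to.save(cands, osp.join(ld, 'candidates.pt'))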
Example #6
    def _estimate_ucbog(self, nr: int):
        """
        Collect the returns with synchronized random seeds and estimate the pessimistic and optimistic bound.
        
        :param nr: number of domains used for training the reference solutions
        :return: upper confidence bound on the optimality gap (UCBOG)
        """
        # Init containers
        cand_rets = np.zeros((self.nG, nr))
        refs_rets = np.zeros((self.nG, nr))

        # Loop over all reference solutions
        for k in range(self.nG):
            print(f'Estimating the UCBOG | Reference {k + 1} of {self.nG}')
            # Load the domain parameters corresponding to the k-th reference solution
            env_params_ref = joblib.load(
                osp.join(self._save_dir,
                         f'iter_{self._curr_iter}_env_params_ref_{k}.pkl'))
            self._env_dr.buffer = env_params_ref

            # Load the policies (makes a difference for snapshot_mode = best). They are set to eval mode by rollout()
            self._subrtn_cand.policy.load_state_dict(
                to.load(
                    osp.join(self._save_dir,
                             f'iter_{self._curr_iter}_policy_cand.pt')).
                state_dict())
            self._subrtn_refs.policy.load_state_dict(
                to.load(
                    osp.join(self._save_dir,
                             f'iter_{self._curr_iter}_policy_ref_{k}.pt')).
                state_dict())

            # Loop over all domain realizations of the reference solutions
            for i in tqdm(range(nr),
                          total=nr,
                          desc=f'Reference {k + 1}',
                          unit='domains',
                          file=sys.stdout,
                          leave=False):
                # Evaluate solutions
                cand_rets[k, i], refs_rets[
                    k, i] = self._eval_cand_and_ref_one_domain(i)

                # Process negative optimality samples
                refs_rets = self._handle_neg_samples(cand_rets, refs_rets, k,
                                                     i)

        # --------------
        # Optimality Gap
        # --------------

        # This is similar to the difference of the means that is used to calculate the optimality gap in eq. (9) in [2]
        self.Gn_diffs = np.subtract(
            refs_rets,
            cand_rets)  # optimistic bound - pessimistic bound; dim = nG x nr
        Gn_samples = np.mean(self.Gn_diffs, axis=1)  # dim = nG
        Gn_est = np.mean(
            Gn_samples
        )  # sample mean of the original (non-bootstrapped) samples

        ratio_neg_diffs = 1 - np.count_nonzero(
            self.Gn_diffs
        ) / self.Gn_diffs.size  # assuming zeros come from clipping

        print_cbt(f'diffs (optimistic - pessimistic bound):\n{self.Gn_diffs}',
                  'y')
        print_cbt(
            f'\n{100 * ratio_neg_diffs}% of the diffs would have been negative and were set to 0\n',
            'r',
            bright=True)

        if ratio_neg_diffs == 1:
            # All diffs were negative and have been clipped to zero
            ci_bs = [
                0, float('inf')
            ]  # such that the UCBOG comparison in stopping_criterion_met() does not break
            log_dict = {
                'Gn_est': np.NaN,
                'UCBOG': np.NaN,
                'ratio_neg_diffs': np.NaN
            }
        else:
            # Apply bootstrapping
            m_bs, ci_bs = bootstrap_ci(np.ravel(self.Gn_diffs), np.mean,
                                       self.num_bs_reps, self.alpha, 1,
                                       self.studentized_ci)
            print(f'm_bs: {m_bs}, ci_bs: {ci_bs}')
            print_cbt(f'\nOG (point estimate): {Gn_est} \nUCBOG: {ci_bs[1]}\n',
                      'y',
                      bright=True)
            log_dict = {
                'Gn_est': Gn_est,
                'UCBOG': ci_bs[1],
                'ratio_neg_diffs': ratio_neg_diffs
            }

        # Log the optimality gap data
        mode = 'w' if self.curr_iter == 0 else 'a'
        with open(osp.join(self._save_dir, 'OG_log.csv'), mode,
                  newline='') as csvfile:
            fieldnames = list(log_dict.keys())
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            if self.curr_iter == 0:
                writer.writeheader()
            writer.writerow(log_dict)

        # Store the current UCBOG estimated from all samples
        self.ucbog = ci_bs[1]
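The UCBOG itself is just the upper end of a one-sided bootstrap confidence interval on the mean of the clipped gap samples. A numpy-only sketch with made-up gap samples (not the repository's code):

# One-sided empirical bootstrap upper bound on the mean optimality gap
import numpy as np

rng = np.random.default_rng(0)
gn_diffs = np.clip(rng.normal(0.5, 1.0, (5, 10)), 0.0, None)  # placeholder, dim = nG x nr

data = gn_diffs.ravel()
m = data.mean()  # point estimate of the optimality gap
boot_means = rng.choice(data, size=(1000, data.size), replace=True).mean(axis=1)
ucbog = m - np.percentile(boot_means - m, 5)  # upper bound for alpha = 0.05, ci_sides = 1
print(f'OG (point estimate): {m:.3f}, UCBOG: {ucbog:.3f}')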
Example #7
    for i in range(len(column_labels)):
        table.append([
            column_labels[i], metric_arr.shape[0], mean_metric[i],
            min_metric[i], std_metric[i]
        ])

    print("\nAll metrics:")
    print(tabulate(metric_arr, column_labels))
    print("\nStandard Deviation:\n", tabulate(table, headers))

    # compute confidence intervals
    conf_headers = ["metric", "experiments", "mean", "ci low", "ci high"]

    for i in range(len(column_labels)):
        total_mean, total_ci_lo, total_ci_hi = bootstrap_ci(
            np.array([experiment_means[exp][i] for exp in best_experiments]),
            np.mean,
            num_reps=1000,
            alpha=0.05,
            ci_sides=2,
        )

        conf_table.append([
            column_labels[i],
            len(best_experiments), total_mean, total_ci_lo, total_ci_hi
        ])
    print("\nConfidence Interval:\n", tabulate(conf_table, conf_headers))

    info = "Best Experiments:\n"
    for exp in best_experiments:
        info += f"\t\t{exp}:\n"

    # Save the table in a latex file if requested
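A self-contained sketch of the pattern this fragment implements: one bootstrap confidence interval per metric over the per-experiment means, rendered with `tabulate`. All names and values below are made up for illustration:

# Per-metric bootstrap CIs over experiment means, numpy and tabulate only
import numpy as np
from tabulate import tabulate

rng = np.random.default_rng(0)
column_labels = ["return", "success_rate"]
experiment_means = rng.normal(1.0, 0.2, size=(8, len(column_labels)))  # 8 experiments

conf_table = []
for i, label in enumerate(column_labels):
    data = experiment_means[:, i]
    m = data.mean()
    boot_means = rng.choice(data, size=(1000, data.size), replace=True).mean(axis=1)
    d_lo, d_up = np.percentile(boot_means - m, [2.5, 97.5])
    conf_table.append([label, data.size, m, m - d_up, m - d_lo])  # empirical bootstrap CI

print(tabulate(conf_table, ["metric", "experiments", "mean", "ci low", "ci high"]))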
Example #8
File: curve.py Project: fdamken/SimuRLacra
def draw_curve_from_data(
    plot_type: str,
    ax: plt.Axes,
    data: Union[list, np.ndarray, to.Tensor, pd.DataFrame],
    x_grid: Union[list, np.ndarray, to.Tensor],
    ax_calc: int,
    x_label: Optional[Union[str, Sequence[str]]] = None,
    y_label: Optional[str] = None,
    curve_label: Optional[str] = None,
    area_label: Optional[str] = "",
    vline_level: Optional[float] = None,
    vline_label: str = "approx. solved",
    title: Optional[str] = None,
    show_legend: bool = True,
    cmp_kwargs: Optional[dict] = None,
    plot_kwargs: Optional[dict] = None,
    legend_kwargs: Optional[dict] = None,
) -> plt.Figure:
    """
    Create a curve plot, e.g. the mean with its standard deviation, min/mean/max, or a confidence
    interval on the mean, for a list of data arrays or a pandas DataFrame.
    The plot is neither shown nor saved.

    .. note::
        If you want to have a tight layout, it is best to pass axes of a figure with `tight_layout=True` or
        `constrained_layout=True`.

        If you want to order the 4th element to the 2nd position in terms of colors use
        .. code-block:: python

            palette.insert(1, palette.pop(3))

    :param plot_type: type of 1-dim plot: `mean_std`, `min_mean_max`, or `ci_on_mean`
    :param ax: axis of the figure to plot on
    :param data: data to plot, e.g. a time series
    :param x_grid: values to plot the data over, e.g. time
    :param ax_calc: axis of the data array to calculate the mean, min and max, or std over
    :param x_label: labels for the categories on the x-axis, if `data` is not given as a `DataFrame`
    :param y_label: label for the y-axis, pass `None` to set no label
    :param curve_label: label of the (1-dim) curve, pass `None` for no label
    :param area_label: label of the (transparent) area, pass `None` for no label and "" for the default label
    :param vline_level: if not `None` (default) add a vertical line at the given level
    :param vline_label: label for the vertical line
    :param title: title displayed above the figure, set to `None` to suppress the title
    :param show_legend: if `True` the legend is shown, useful when handling multiple subplots
    :param cmp_kwargs: keyword arguments forwarded to functions computing the statistics of interest
    :param plot_kwargs: keyword arguments forwarded to the plotting functions
    :param legend_kwargs: keyword arguments forwarded to pyplot's `legend()` function, e.g. `loc='best'`
    :return: handle to the resulting figure
    """
    plot_type = plot_type.lower()
    if plot_type not in ["mean_std", "min_mean_max", "ci_on_mean"]:
        raise pyrado.ValueErr(
            given=plot_type,
            eq_constraint="mean_std, min_mean_max, ci_on_mean")
    if not isinstance(data, (list, to.Tensor, np.ndarray, pd.DataFrame)):
        raise pyrado.TypeErr(
            given=data,
            expected_type=[list, to.Tensor, np.ndarray, pd.DataFrame])

    # Set defaults which can be overwritten by passing plot_kwargs
    cmp_kwargs = merge_dicts([
        dict(num_reps=1000,
             confidence_level=0.9,
             bias_correction=False,
             studentized=False), cmp_kwargs
    ])

    if isinstance(data, pd.DataFrame):
        data = data.to_numpy()
    elif isinstance(data, list):
        data = np.array(data)
    elif isinstance(data, to.Tensor):
        data = data.detach().cpu().numpy()

    # Extract features from data
    data_mean = np.mean(data, axis=ax_calc)
    df = pd.DataFrame()
    df = df.assign(mean=data_mean)
    if plot_type == "mean_std":
        data_std = np.std(data, axis=ax_calc)
        df = df.assign(std=data_std)

    elif plot_type == "min_mean_max":
        data_min = np.min(data, axis=ax_calc)
        data_max = np.max(data, axis=ax_calc)
        df = df.assign(min=data_min)
        df = df.assign(max=data_max)

    elif plot_type == "ci_on_mean":
        _, data_lo, data_up = bootstrap_ci(
            data.T if ax_calc == 1 else data,
            stat_fcn=np.mean,
            num_reps=cmp_kwargs["num_reps"],
            alpha=cmp_kwargs["confidence_level"],
            ci_sides=2,
            bias_correction=cmp_kwargs["bias_correction"],
            studentized=cmp_kwargs["studentized"],
            seed=0,
        )
        df = df.assign(ci_lo=data_lo)
        df = df.assign(ci_up=data_up)

    # Forward the actual plotting
    return draw_curve(
        plot_type,
        ax,
        df,
        x_grid,
        x_label,
        y_label,
        curve_label,
        area_label,
        vline_level,
        vline_label,
        title,
        show_legend,
        plot_kwargs,
        legend_kwargs,
    )
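A hedged usage sketch based only on the signature above; the import path `pyrado.plotting.curve` is inferred from the file name and is an assumption:

# Usage sketch; the import path is an assumption based on 'curve.py'
import matplotlib.pyplot as plt
import numpy as np
from pyrado.plotting.curve import draw_curve_from_data

data = np.random.randn(20, 50).cumsum(axis=1)  # 20 runs over 50 steps
fig, ax = plt.subplots(1, tight_layout=True)
draw_curve_from_data(
    "ci_on_mean",
    ax,
    data,
    x_grid=np.arange(50),
    ax_calc=0,  # compute the statistics over the 20 runs
    x_label="step",
    y_label="return",
    curve_label="mean",
    cmp_kwargs=dict(num_reps=500, confidence_level=0.9),
)
plt.show()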
Example #9
File: spota.py Project: fdamken/SimuRLacra
    def _estimate_ucbog(self, nr: int):
        """
        Collect the returns with synchronized random seeds and estimate the pessimistic and optimistic bound.

        :param nr: number of domains used for training the reference solutions
        :return: upper confidence bound on the optimality gap (UCBOG)
        """
        # Init containers
        cand_rets = np.zeros((self.nG, nr))
        refs_rets = np.zeros((self.nG, nr))

        # Loop over all reference solutions
        for k in range(self.nG):
            print_cbt(
                f"Estimating the UCBOG | Reference {k + 1} of {self.nG} ...",
                "c")
            # Load the domain parameters corresponding to the k-th reference solution
            env_params_ref = joblib.load(
                osp.join(self.save_dir,
                         f"iter_{self._curr_iter}_env_params_ref_{k}.pkl"))
            self.env_dr.buffer = env_params_ref

            # Load the policies (makes a difference for snapshot_mode = best)
            self._subrtn_cand._policy = pyrado.load(
                "policy.pt",
                self.save_dir,
                prefix=f"iter_{self._curr_iter}",
                suffix="cand",
                obj=self._subrtn_cand._policy,
            )
            self._subrtn_refs._policy = pyrado.load(
                "policy.pt",
                self.save_dir,
                prefix=f"iter_{self._curr_iter}",
                suffix=f"ref_{k}",
                obj=self._subrtn_refs._policy,
            )

            # Loop over all domain realizations of the reference solutions
            for i in tqdm(range(nr),
                          total=nr,
                          desc=f"Reference {k + 1}",
                          unit="domains",
                          file=sys.stdout,
                          leave=False):
                # Evaluate solutions
                cand_rets[k, i], refs_rets[
                    k, i] = self._eval_cand_and_ref_one_domain(i)

                # Process negative optimality samples
                refs_rets = self._handle_neg_samples(cand_rets, refs_rets, k,
                                                     i)

        # --------------
        # Optimality Gap
        # --------------

        # This is similar to the difference of the means that is used to calculate the optimality gap in eq. (9) in [2]
        self.Gn_diffs = np.subtract(
            refs_rets,
            cand_rets)  # optimistic bound - pessimistic bound; dim = nG x nr
        Gn_samples = np.mean(self.Gn_diffs, axis=1)  # dim = nG
        Gn_est = np.mean(
            Gn_samples
        )  # sample mean of the original (non-bootstrapped) samples

        ratio_neg_diffs = 1 - np.count_nonzero(
            self.Gn_diffs
        ) / self.Gn_diffs.size  # assuming zeros come from clipping

        print_cbt(f"diffs (optimistic - pessimistic bound):\n{self.Gn_diffs}",
                  "y")
        print_cbt(
            f"\n{100 * ratio_neg_diffs}% of the diffs would have been negative and were set to 0\n",
            "r",
            bright=True)

        if ratio_neg_diffs == 1:
            # All diffs were negative and have been clipped to zero
            ci_bs_lo, ci_bs_up = np.zeros(1), np.array(
                [pyrado.inf]
            )  # such that the UCBOG comparison in stopping_criterion_met() does not break
            log_dict = {
                "Gn_est": np.NaN,
                "UCBOG": np.NaN,
                "ratio_neg_diffs": np.NaN
            }
        else:
            # Apply bootstrapping
            m_bs, ci_bs_lo, ci_bs_up = bootstrap_ci(np.ravel(self.Gn_diffs),
                                                    np.mean, self.num_bs_reps,
                                                    self.alpha, 1,
                                                    self.studentized_ci)
            print(f"m_bs: {m_bs}, ci_bs: {ci_bs_lo, ci_bs_up}")
            print_cbt(f"\nOG (point estimate): {Gn_est} \nUCBOG: {ci_bs_up}\n",
                      "y",
                      bright=True)
            log_dict = {
                "Gn_est": Gn_est,
                "UCBOG": ci_bs_up,
                "ratio_neg_diffs": ratio_neg_diffs
            }

        # Log the optimality gap data
        mode = "w" if self.curr_iter == 0 else "a"
        with open(osp.join(self.save_dir, "OG_log.csv"), mode,
                  newline="") as csvfile:
            fieldnames = list(log_dict.keys())
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            if self.curr_iter == 0:
                writer.writeheader()
            writer.writerow(log_dict)

        # Store the current UCBOG estimated from all samples
        self.ucbog = ci_bs_up
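One detail worth isolating from the logging block: the CSV is opened in write mode with a header only in the first iteration and appended to afterwards, so the file accumulates one row per iteration. A standalone sketch with made-up values:

# Incremental CSV logging pattern used by _estimate_ucbog (values are made up)
import csv

for curr_iter in range(3):
    log_dict = {"Gn_est": 0.4 * curr_iter, "UCBOG": 0.6 * curr_iter, "ratio_neg_diffs": 0.1}
    mode = "w" if curr_iter == 0 else "a"
    with open("OG_log.csv", mode, newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(log_dict.keys()))
        if curr_iter == 0:
            writer.writeheader()
        writer.writerow(log_dict)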