Example No. 1
def compute_exact_odds(acore, ):

    clf_probs = train_clf(sample_size=acore.b,
                          clf_model=acore.classifier_or,
                          gen_function=acore.model.generate_sample,
                          d=acore.model.d,
                          clf_name=acore.classifier_or_name)
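The example above is truncated in the source, so the full body of compute_exact_odds is not shown. For orientation only, a minimal hypothetical sketch (not from the original repository) of how class probabilities from a fitted odds classifier are usually turned into odds, assuming a scikit-learn-style predict_proba:

import numpy as np

def odds_from_classifier(clf, theta_x_mat):
    # Hypothetical helper: theta_x_mat stacks (theta, x) rows in the same layout
    # used at training time; the odds are p / (1 - p) for the class-1 probability.
    p = clf.predict_proba(theta_x_mat)[:, 1]
    return p / (1.0 - p)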
Example No. 2
def main(b,
         alpha,
         classifier,
         sample_size_obs,
         run,
         n_eval_grid=101,
         debug=False,
         seed=7,
         sample_size_check=1000,
         size_reference=1000):

    # Setup the variables, also to account for debug runs
    np.random.seed(seed)
    b = b if not debug else 100
    sample_size_obs = sample_size_obs if not debug else 5
    classifier_cde_dict = classifier_cde_dict_full if not debug else classifier_cde_dict_small

    # Create the loader object, which drives most of the simulation
    print('----- Loading Simulations In')
    model_obj = model_dict[run]()

    # Also, calculate the reference distribution
    model_obj.set_reference_g(size_reference=size_reference)

    # Get the correct functions
    msnh_sampling_func = model_obj.sample_msnh_algo5
    grid_param = model_obj.grid
    clf_model = classifier_dict[classifier]
    gen_sample_func = model_obj.generate_sample
    classifier = classifier.replace('\n', '').replace(' ', '-')

    # First, generate the thetas used for checking coverage
    theta_vec, x_vec = model_obj.sample_sim_check(
        sample_size=sample_size_check, n=sample_size_obs)

    # Compute Odds via classifier
    print('----- Calculating Odds')
    clf = train_clf(sample_size=b,
                    clf_model=clf_model,
                    gen_function=gen_sample_func,
                    d=model_obj.d,
                    clf_name=classifier)
    tau_obs = np.array([
        compute_statistics_single_t0(clf=clf,
                                     obs_sample=x_vec[kk, :, :].reshape(
                                         -1, model_obj.d_obs),
                                     d=model_obj.d,
                                     d_obs=model_obj.d_obs,
                                     t0=theta_0,
                                     grid_param_t1=grid_param)
        for kk, theta_0 in enumerate(theta_vec)
    ])
    print('----- %s Trained' % classifier)

    # Loop over B'
    b_prime_vec = model_obj.b_prime_vec if not debug else [500, 1000]
    out_val = []
    out_cols = [
        'b_prime', 'classifier', 'class_cde', 'run', 'n_eval_grid',
        'sample_check', 'sample_reference', 'percent_correct_coverage',
        'average_coverage', 'percent_correct_coverage_lr',
        'average_coverage_lr', 'percent_correct_coverage_1std',
        'average_coverage_1std', 'percent_correct_coverage_2std',
        'average_coverage_2std'
    ]
    for b_prime in np.array(b_prime_vec).astype(int):
        # First generate the samples to train b_prime algorithm
        np.random.seed(seed)
        theta_mat, sample_mat = msnh_sampling_func(b_prime=b_prime,
                                                   sample_size=sample_size_obs)
        stats_mat = np.array([
            compute_statistics_single_t0(clf=clf,
                                         d=model_obj.d,
                                         d_obs=model_obj.d_obs,
                                         grid_param_t1=grid_param,
                                         t0=theta_0,
                                         obs_sample=sample_mat[kk, :, :])
            for kk, theta_0 in enumerate(theta_mat)
        ])

        pbar = tqdm(total=len(classifier_cde_dict.keys()),
                    desc=r'Working on QR classifiers, b=%s' % b_prime)
        for clf_name_qr, clf_params in sorted(classifier_cde_dict.items(),
                                              key=lambda x: x[0]):

            if b_prime > 10000 and 'RF' in clf_name_qr:
                continue

            t0_pred_vec = train_qr_algo(
                model_obj=model_obj,
                theta_mat=theta_mat,
                stats_mat=stats_mat,
                algo_name=clf_params[0],
                learner_kwargs=clf_params[1],
                pytorch_kwargs=clf_params[2] if len(clf_params) > 2 else None,
                alpha=alpha,
                prediction_grid=theta_vec)

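            # Indicator of whether the observed statistic exceeds the estimated alpha-quantile
            # cutoff, i.e. whether this theta is retained in the confidence set (coverage indicator).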
            in_vec = np.array([
                int(tau_obs[jj] > t0_pred_vec[jj])
                for jj in range(theta_vec.shape[0])
            ])

            # Calculate the mean
            model = XGBClassifier(max_depth=3, n_estimators=100)
            model.fit(theta_vec.reshape(-1, model_obj.d), in_vec.reshape(-1, ))
            pred_grid = model_obj.pred_grid
            pred_cov_mean = model.predict_proba(pred_grid)[:, 1]
            percent_correct_coverage = np.average((pred_cov_mean >
                                                   (1.0 - alpha)).astype(int))
            average_coverage = np.average(pred_cov_mean)

            # Calculate the upper limit
            x = theta_vec.reshape(-1, 2)
            y = in_vec.reshape(-1, )
            # estimate the model
            X = sm.add_constant(x)

            with Suppressor():
                model = sm.Logit(y, X).fit(full_output=False)
            proba = model.predict(X)

            percent_correct_coverage_lr = np.average(
                (proba > (1.0 - alpha)).astype(int))
            average_coverage_lr = np.average(proba)

            # estimate confidence interval for predicted probabilities
            cov = model.cov_params()
            gradient = (proba * (1 - proba) *
                        X.T).T  # matrix of gradients for each observation
            std_errors = np.array(
                [np.sqrt(np.dot(np.dot(g, cov), g)) for g in gradient])
            c = 1  # multiplier for confidence interval
            upper = np.maximum(0, np.minimum(1, proba + std_errors * c))
            percent_correct_coverage_upper = np.average(
                (upper > (1.0 - alpha)).astype(int))
            average_coverage_upper = np.average(upper)

            upper_2std = np.maximum(0, np.minimum(1,
                                                  proba + std_errors * 1.96))
            percent_correct_coverage_upper_2std = np.average(
                (upper_2std > (1.0 - alpha)).astype(int))
            average_coverage_upper_2std = np.average(upper_2std)

            out_val.append([
                b_prime, classifier, clf_name_qr, run, n_eval_grid,
                sample_size_check, size_reference, percent_correct_coverage,
                average_coverage, percent_correct_coverage_lr,
                average_coverage_lr, percent_correct_coverage_upper,
                average_coverage_upper, percent_correct_coverage_upper_2std,
                average_coverage_upper_2std
            ])

            pbar.update(1)

    # Saving the results
    out_df = pd.DataFrame.from_records(data=out_val,
                                       index=range(len(out_val)),
                                       columns=out_cols)
    out_dir = 'sims/%s' % model_obj.out_directory
    out_filename = 'b_prime_analysis_%s_%s_alpha%s_ngrid%s_sizecheck%s_bprimemax%s_logregint_%s.csv' % (
        classifier, run, str(alpha).replace(
            '.', '-'), n_eval_grid, sample_size_check, np.max(b_prime_vec),
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    out_df.to_csv(out_dir + out_filename)
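The upper-limit block in the example above uses the delta method to attach standard errors to the logistic-regression predicted probabilities. A self-contained sketch of that calculation on hypothetical toy data (sm is statsmodels.api, as in the example):

import numpy as np
import statsmodels.api as sm

# Hypothetical toy data: binary coverage indicators y against 2-d parameters x.
rng = np.random.default_rng(0)
x = rng.uniform(size=(200, 2))
y = (rng.uniform(size=200) < 0.9).astype(int)

X = sm.add_constant(x)
fit = sm.Logit(y, X).fit(disp=0)
proba = fit.predict(X)

# Delta method: for p = sigmoid(X beta), dp/dbeta = p * (1 - p) * x, so
# Var(p_i) is approximately g_i' Cov(beta) g_i with g_i the gradient row of observation i.
cov = np.asarray(fit.cov_params())
gradient = (proba * (1 - proba) * X.T).T
std_errors = np.sqrt(np.einsum('ij,jk,ik->i', gradient, cov, gradient))
upper_1std = np.clip(proba + std_errors, 0, 1)
upper_2std = np.clip(proba + 1.96 * std_errors, 0, 1)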
Example No. 3
def main(b,
         b_prime,
         alpha,
         classifier,
         class_cde,
         sample_size_obs,
         run,
         t_star,
         c_star,
         debug=False,
         seed=7,
         n_sampled=500,
         size_reference=1000):

    # Setup the variables, also to account for debug runs
    np.random.seed(seed)
    b = b if not debug else 100
    b_prime = b_prime if not debug else 100
    sample_size_obs = sample_size_obs if not debug else 1
    n_sampled = n_sampled if not debug else 10

    # Create the loader object, which drives most of the simulation
    print('----- Loading Simulations In')
    model_obj = model_dict[run]() if not debug else model_dict[run](
        num_grid=21)
    t0_val = model_obj.true_t0

    # Also, calculate the reference distribution
    model_obj.set_reference_g(size_reference=size_reference)

    # Get the correct functions
    msnh_sampling_func = model_obj.sample_msnh_algo5
    grid_param = model_obj.grid
    clf_model = classifier_dict[classifier]
    gen_sample_func = model_obj.generate_sample
    t0_grid = model_obj.grid
    gen_obs_func = model_obj.sample_sim
    classifier = classifier.replace('\n', '').replace(' ', '-')

    # Create a sample of observed data that are going to be used later
    # and compute statistics tau value for each t0
    x_obs = gen_obs_func(sample_size=sample_size_obs, true_param=t0_val)

    start_time = datetime.now()
    # Calculate Odds
    print('----- Calculating Odds')
    if t_star:
        train_time = datetime.now()
        pbar = tqdm(total=t0_grid.shape[0], desc=r'Calculating True $\tau$')
        tau_obs = []
        for t0 in t0_grid:
            tau_obs.append(
                model_obj.compute_exact_tau(x_obs=x_obs,
                                            t0_val=t0,
                                            meshgrid=grid_param))
            pbar.update(1)
        tau_obs = np.array(tau_obs)
        pred_time = datetime.now()
    else:
        # Compute Odds via classifier
        clf = train_clf(sample_size=b,
                        clf_model=clf_model,
                        gen_function=gen_sample_func,
                        d=model_obj.d,
                        clf_name=classifier)
        train_time = datetime.now()
        print('----- %s Trained' % classifier)

        pbar = tqdm(total=len(t0_grid), desc='Calculate Odds')
        tau_obs = []
        for theta_0 in t0_grid:
            tau_obs.append(
                compute_statistics_single_t0(clf=clf,
                                             obs_sample=x_obs,
                                             t0=theta_0,
                                             d=model_obj.d,
                                             d_obs=model_obj.d_obs,
                                             grid_param_t1=grid_param))
            pbar.update(1)
        tau_obs = np.array(tau_obs)
        pred_time = datetime.now()

    # Train Quantile Regression
    if c_star:
        pbar = tqdm(total=t0_grid.shape[0],
                    desc=r'Calculating Distribution True $\tau$')
        tau_distr = []
        for t0 in t0_grid:
            tau_distr.append(
                model_obj.compute_exact_tau_distr(
                    t0_val=t0,
                    meshgrid=grid_param,
                    n_sampled=n_sampled,
                    sample_size_obs=sample_size_obs))
            pbar.update(1)
        bprime_time = datetime.now()

        tau_distr = np.array(tau_distr)
        np.save(
            file='sims/%stau_distr_t0_%s_%s_%ssampled_%ssamplesizeobs.npy' %
            (model_obj.out_directory, b, b_prime, n_sampled, sample_size_obs),
            arr=tau_distr)
        t0_pred_vec = np.quantile(a=tau_distr, q=alpha, axis=1)
        cutoff_time = datetime.now()

    else:
        print('----- Training Quantile Regression Algorithm')
        theta_mat, sample_mat = msnh_sampling_func(b_prime=b_prime,
                                                   sample_size=sample_size_obs)

        # Compute the tau values for QR training
        if t_star:
            stats_mat = np.array([
                model_obj.compute_exact_tau(x_obs=sample_mat[kk, :, :],
                                            t0_val=theta_0,
                                            meshgrid=grid_param)
                for kk, theta_0 in enumerate(theta_mat)
            ])
        else:
            stats_mat = np.array([
                compute_statistics_single_t0(clf=clf,
                                             d=model_obj.d,
                                             d_obs=model_obj.d_obs,
                                             grid_param_t1=grid_param,
                                             t0=theta_0,
                                             obs_sample=sample_mat[kk, :, :])
                for kk, theta_0 in enumerate(theta_mat)
            ])
        bprime_time = datetime.now()
        clf_params = classifier_cde_dict[class_cde]

        t0_pred_vec = train_qr_algo(
            model_obj=model_obj,
            alpha=alpha,
            theta_mat=theta_mat,
            stats_mat=stats_mat,
            algo_name=clf_params[0],
            learner_kwargs=clf_params[1],
            pytorch_kwargs=clf_params[2] if len(clf_params) > 2 else None,
            prediction_grid=t0_grid)
        cutoff_time = datetime.now()

    # Confidence Region
    print('----- Creating Confidence Region')
    simultaneous_nh_decision = []
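    # For each theta on the grid, record the cutoff, the observed statistic, and whether
    # the null is rejected there (statistic falling below the cutoff).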
    for jj, t0_pred in enumerate(t0_pred_vec):
        simultaneous_nh_decision.append(
            [t0_pred, tau_obs[jj],
             int(tau_obs[jj] < t0_pred)])

    time_vec = [(train_time - start_time).total_seconds(),
                (pred_time - train_time).total_seconds(),
                (bprime_time - pred_time).total_seconds(),
                (cutoff_time - bprime_time).total_seconds()]
    time_vec.append(sum(time_vec))
    print(time_vec)

    # Saving data
    print('----- Saving Data')
    save_dict = {
        'background': t0_grid[:, 0],
        'signal': t0_grid[:, 1],
        'tau_statistics': tau_obs,
        'simul_nh_cutoff': [el[0] for el in simultaneous_nh_decision],
        'simul_nh_decision': [el[2] for el in simultaneous_nh_decision],
        'b': b,
        'b_prime': b_prime,
        'seed': seed,
        'sample_size_obs': sample_size_obs,
        'classifier': classifier,
        't_star': t_star,
        'time_vec': time_vec
    }
    outfile_name = '2d_confint_%s_data_b_%s_bprime_%s_%s_%s_n%s_%s_%s_%s_%s%s_%s.pkl' % (
        run, b, b_prime, t0_val[0], t0_val[1], sample_size_obs, classifier,
        class_cde, n_sampled, '' if not t_star else '_taustar',
        '' if not c_star else '_cstar',
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    outdir = 'sims/%s' % model_obj.out_directory
    pickle.dump(obj=save_dict, file=open(outdir + outfile_name, 'wb'))

    # Visualization
    plot_df = pd.DataFrame.from_dict({
        'background': t0_grid[:, 0],
        'signal': t0_grid[:, 1],
        'tau_statistics': tau_obs,
        'simul_nh_cutoff': [el[0] for el in simultaneous_nh_decision],
        'simul_nh_decision': [el[2] for el in simultaneous_nh_decision]
    })

    col_vec = ['blue']
    alpha_vec = [0.75, 0.1]
    theta_0_plot = plot_df['background'].values
    theta_1_plot = plot_df['signal'].values

    plt.figure(figsize=(12, 8))
    for ii, col in enumerate(['simul_nh_decision']):
        value_temp = plot_df[col].values
        marker = np.array(["x" if el else "o" for el in value_temp])
        unique_markers = set(marker)

        for j, um in enumerate(unique_markers):
            mask = marker == um
            plt.scatter(x=theta_0_plot[mask],
                        y=theta_1_plot[mask],
                        marker=um,
                        color=col_vec[ii],
                        alpha=alpha_vec[j])

        plt.scatter(x=t0_val[0], y=t0_val[1], color='r', marker='*', s=500)
        plt.xlabel('Background', fontsize=25)
        plt.ylabel('Signal', fontsize=25)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        plt.title("2D Confidence Interval, %s Example, B=%s, B'=%s, n=%s%s%s" %
                  (run.title(), b, b_prime, sample_size_obs, '' if not t_star
                   else '\n tau_star', '' if not c_star else ', c_star'),
                  fontsize=25)

    plt.tight_layout()
    image_name = '2d_confint_%s_b_%s_bprime_%s_%s_%s_%s_n%s%s%s_%s.pdf' % (
        run, b, b_prime, t0_val[0], t0_val[1], sample_size_obs, classifier,
        '' if not t_star else '_taustar', '' if not c_star else '_cstar',
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    plt.savefig('images/%s/' % model_obj.out_directory + image_name)
Example No. 4
def main(run,
         rep,
         b,
         b_prime,
         alpha,
         t0_val,
         sample_size_obs,
         classifier_cde,
         or_loss_samples=1000,
         debug=False,
         seed=7,
         size_check=1000,
         verbose=False,
         marginal=False,
         size_marginal=1000):

    # Changing values if debugging
    b = b if not debug else 100
    b_prime = b_prime if not debug else 100
    size_check = size_check if not debug else 100
    rep = rep if not debug else 2
    model_obj = model_dict[run](marginal=marginal, size_marginal=size_marginal)

    # Get the correct functions
    msnh_sampling_func = model_obj.sample_msnh_algo5
    grid_param = model_obj.grid
    gen_obs_func = model_obj.sample_sim
    gen_sample_func = model_obj.generate_sample
    t0_grid = model_obj.pred_grid
    tp_func = model_obj.compute_exact_prob
    or_loss_sample_func = model_obj.create_samples_for_or_loss

    # Creating sample to check entropy about
    np.random.seed(seed)
    sample_check = gen_sample_func(sample_size=size_check, marginal=marginal)
    theta_vec = sample_check[:, :model_obj.d]
    x_vec = sample_check[:, (model_obj.d + 1):]
    bern_vec = sample_check[:, model_obj.d]

    true_prob_vec = tp_func(theta_vec=theta_vec, x_vec=x_vec)
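    # Cross-entropy of the labels under the true probabilities: the Bayes-optimal log-loss,
    # used as a benchmark for the classifiers trained below.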
    entropy_est = -np.average([
        np.log(true_prob_vec[kk]) if el == 1 else np.log(1 - true_prob_vec[kk])
        for kk, el in enumerate(bern_vec)
    ])

    # Creating sample to calculate the OR loss
    first_term_sample, second_term_sample = or_loss_sample_func(
        or_loss_samples=or_loss_samples)

    # Loop over repetitions and classifiers
    # Each time we train the different classifiers, we build the intervals and we record
    # whether the point is in or not.
    out_val = []
    out_cols = [
        'b_prime', 'b', 'classifier', 'classifier_cde', 'run', 'rep',
        'sample_size_obs', 'cross_entropy_loss', 't0_true_val',
        'theta_0_current', 'on_true_t0', 'estimated_tau', 'estimated_cutoff',
        'in_confint', 'out_confint', 'size_CI', 'true_entropy', 'or_loss'
    ]
    pbar = tqdm(total=rep,
                desc='Toy Example for Simulations, n=%s, b=%s' %
                (sample_size_obs, b))
    for jj in range(rep):

        # Generate samples for each t0 value, so as to check both coverage and power
        x_obs = gen_obs_func(sample_size=sample_size_obs, true_param=t0_val)

        # Train the classifier for the odds
        clf_odds_fitted = {}
        clf_cde_fitted = {}
        for clf_name, clf_model in sorted(classifier_dict.items(),
                                          key=lambda x: x[0]):
            clf_odds = train_clf(sample_size=b,
                                 clf_model=clf_model,
                                 gen_function=gen_sample_func,
                                 clf_name=clf_name,
                                 marginal=marginal,
                                 nn_square_root=True)
            if verbose:
                print('----- %s Trained' % clf_name)
            tau_obs = np.array([
                compute_statistics_single_t0(clf=clf_odds,
                                             obs_sample=x_obs,
                                             t0=theta_0,
                                             grid_param_t1=grid_param,
                                             d=model_obj.d,
                                             d_obs=model_obj.d_obs)
                for theta_0 in t0_grid
            ])

            # Calculating cross-entropy
            est_prob_vec = clf_prob_value(clf=clf_odds,
                                          x_vec=x_vec,
                                          theta_vec=theta_vec,
                                          d=model_obj.d,
                                          d_obs=model_obj.d_obs)
            loss_value = log_loss(y_true=bern_vec, y_pred=est_prob_vec)

            # Calculating or loss
            or_loss_value = or_loss(clf=clf_odds,
                                    first_sample=first_term_sample,
                                    second_sample=second_term_sample)

            clf_odds_fitted[clf_name] = (tau_obs, loss_value, or_loss_value)

            # Train the quantile regression algorithm for confidence levels
            theta_mat, sample_mat = msnh_sampling_func(
                b_prime=b_prime, sample_size=sample_size_obs)
            full_mat = np.hstack((theta_mat, sample_mat))
            stats_mat = np.apply_along_axis(
                arr=full_mat,
                axis=1,
                func1d=lambda row: compute_statistics_single_t0(
                    clf=clf_odds,
                    obs_sample=row[model_obj.d:],
                    t0=row[:model_obj.d],
                    grid_param_t1=grid_param,
                    d=model_obj.d,
                    d_obs=model_obj.d_obs))
            clf_cde_fitted[clf_name] = {}
            # for clf_name_qr, clf_params in sorted(classifier_cde_dict.items(), key=lambda x: x[0]):
            clf_name_qr = classifier_cde
            clf_params = classifier_cde_dict[classifier_cde]
            t0_pred_vec = train_qr_algo(
                model_obj=model_obj,
                theta_mat=theta_mat,
                stats_mat=stats_mat,
                algo_name=clf_params[0],
                learner_kwargs=clf_params[1],
                pytorch_kwargs=clf_params[2] if len(clf_params) > 2 else None,
                alpha=alpha,
                prediction_grid=t0_grid)
            clf_cde_fitted[clf_name][clf_name_qr] = t0_pred_vec

        # At this point, all that's left is to record the results
        for clf_name, (tau_obs_val, cross_ent_loss,
                       or_loss_value) in clf_odds_fitted.items():
            for clf_name_qr, cutoff_val in clf_cde_fitted[clf_name].items():
                size_temp = np.sum(
                    (tau_obs_val >= cutoff_val).astype(int)) / t0_grid.shape[0]
                for kk, theta_0_current in enumerate(t0_grid):
                    out_val.append([
                        b_prime, b, clf_name, clf_name_qr, run, jj,
                        sample_size_obs, cross_ent_loss, t0_val,
                        theta_0_current,
                        int(t0_val == theta_0_current), tau_obs_val[kk],
                        cutoff_val[kk],
                        int(tau_obs_val[kk] > cutoff_val[kk]),
                        int(tau_obs_val[kk] <= cutoff_val[kk]), size_temp,
                        entropy_est, or_loss_value
                    ])
        pbar.update(1)

    # Saving the results
    out_df = pd.DataFrame.from_records(data=out_val,
                                       index=range(len(out_val)),
                                       columns=out_cols)
    out_dir = 'sims/classifier_cov_pow_toy/'
    out_filename = 'classifier_reps_cov_pow_toy_%sB_%sBprime_%s_%srep_alpha%s_sampleobs%s_t0val%s_%s_%s.csv' % (
        b, b_prime, run, rep, str(alpha).replace('.', '-'), sample_size_obs,
        str(t0_val).replace('.', '-'), classifier_cde,
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    out_df.to_csv(out_dir + out_filename)

    # Print results
    cov_df = out_df[out_df['on_true_t0'] == 1][[
        'classifier', 'classifier_cde', 'in_confint', 'cross_entropy_loss',
        'size_CI'
    ]]
    print(
        cov_df.groupby(['classifier', 'classifier_cde']).agg({
            'in_confint': [np.average],
            'size_CI': [np.average, np.std],
            'cross_entropy_loss': [np.average, np.std]
        }))

    # Power plots
    out_df['class_combo'] = out_df[['classifier', 'classifier_cde']].apply(
        lambda x: x[0] + '---' + x[1], axis=1)
    plot_df = out_df[['class_combo', 'theta_0_current', 'out_confint']].groupby(
        ['class_combo', 'theta_0_current']).mean().reset_index()
    fig = plt.figure(figsize=(20, 10))
    sns.lineplot(x='theta_0_current',
                 y='out_confint',
                 hue='class_combo',
                 data=plot_df,
                 palette='cubehelix')
    plt.legend(loc='best', fontsize=25)
    plt.xlabel(r'$\theta$', fontsize=25)
    plt.ylabel('Power', fontsize=25)
    plt.title("Power of Hypothesis Test, B=%s, B'=%s, n=%s, %s" %
              (b, b_prime, sample_size_obs, run.title()),
              fontsize=25)
    out_dir = 'images/classifier_cov_pow_toy/'
    outfile_name = 'power_classifier_reps_%sB_%sBprime_%s_%srep_alpha%s_sampleobs%s_t0val%s_%s_%s.pdf' % (
        b, b_prime, run, rep, str(alpha).replace('.', '-'), sample_size_obs,
        str(t0_val).replace('.', '-'), classifier_cde,
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    plt.tight_layout()
    plt.savefig(out_dir + outfile_name)
    plt.close()
Example No. 5
def main(run, rep, b, b_prime, alpha, sample_size_obs, classifier_cde, sample_type='MC', cutoff='qr',
         debug=False, seed=7, size_check=1000, verbose=False, marginal=False, size_marginal=1000):

    # Changing values if debugging
    b = b if not debug else 100
    b_prime = b_prime if not debug else 100
    size_check = size_check if not debug else 100
    rep = rep if not debug else 2
    model_obj = model_dict[run](marginal=marginal, size_marginal=size_marginal)

    # Get the correct functions
    msnh_sampling_func = model_obj.sample_msnh_algo5
    grid_param = model_obj.grid
    gen_obs_func = model_obj.sample_sim
    gen_sample_func = model_obj.generate_sample
    t0_grid = model_obj.pred_grid
    t0_val = model_obj.true_param
    lik_func = model_obj.compute_exact_likelihood
    np.random.seed(seed)

    # Adding Gaussian Process as an option in the classifier toy example
    anchor_points_vec = [5, 10, 25]
    for anchor_points in anchor_points_vec:
        classifier_dict['gaussian_process_' + str(anchor_points)] = anchor_points

    # Loop over repetitions and classifiers
    # Each time we train the different classifiers, we build the intervals and we record
    # whether the point is in or not.
    out_val = []
    out_cols = ['b_prime', 'b', 'classifier', 'classifier_cde', 'run', 'rep', 'sample_size_obs',
                't0_true_val', 'theta_0_current', 'on_true_t0',
                'estimated_tau', 'estimated_cutoff', 'in_confint', 'out_confint', 'size_CI', 'mse_loss',
                'training_time', 'pred_time', 'bprime_time', 'cutoff_time', 'total_time', 'cutoff_type']
    pbar = tqdm(total=rep, desc='Toy Example for Simulations, n=%s, b=%s' % (sample_size_obs, b))
    for jj in range(rep):

        # Generate samples for each t0 value, so as to check both coverage and power
        x_obs = gen_obs_func(sample_size=sample_size_obs, true_param=t0_val)

        # Calculate the true likelihood ratio
        lik_theta0 = np.array([np.sum(np.log(lik_func(x_obs=x_obs, true_param=theta_0))) for theta_0 in t0_grid])
        max_across_grid = np.max(np.array([np.sum(np.log(lik_func(x_obs=x_obs, true_param=t1))) for t1 in grid_param]))
        true_tau_obs = lik_theta0.reshape(-1, ) - max_across_grid.reshape(1)
        # print('TRUE', true_tau_obs)

        # Train the classifier for the odds
        clf_odds_fitted = {}
        clf_cde_fitted = {}
        for clf_name, clf_model in sorted(classifier_dict.items(), key=lambda x: x[0]):
            start_time = datetime.now()

            if 'gaussian_process' in clf_name:

                # Train Gaussian Process
                gp_model = train_gp(sample_size=b, n_anchor_points=clf_model, model_obj=model_obj, t0_grid=t0_grid,
                                    sample_type=sample_type)
                training_time = datetime.now()

                # Calculate LR given a Gaussian Process
                tau_obs = np.array([
                    compute_statistics_single_t0_gp(
                        gp_model=gp_model, obs_sample=x_obs, t0=theta_0, grid_param_t1=grid_param,
                        d=model_obj.d, d_obs=model_obj.d_obs) for theta_0 in t0_grid])
                clf_odds_fitted[clf_name] = (tau_obs, np.mean((tau_obs - true_tau_obs)**2))
                # print(clf_name, clf_odds_fitted[clf_name])
                pred_time = datetime.now()

                # Calculate the LR statistics given a sample
                if cutoff == 'qr':
                    theta_mat, sample_mat = msnh_sampling_func(b_prime=b_prime, sample_size=sample_size_obs)
                    full_mat = np.hstack((theta_mat, sample_mat))
                    stats_mat = np.apply_along_axis(arr=full_mat, axis=1,
                                                    func1d=lambda row: compute_statistics_single_t0_gp(
                                                        gp_model=gp_model,
                                                        obs_sample=row[model_obj.d:],
                                                        t0=row[:model_obj.d],
                                                        grid_param_t1=grid_param,
                                                        d=model_obj.d,
                                                        d_obs=model_obj.d_obs
                                                    ))
                    bprime_time = datetime.now()

                    clf_cde_fitted[clf_name] = {}
                    # for clf_name_qr, clf_params in sorted(classifier_cde_dict.items(), key=lambda x: x[0]):
                    clf_name_qr = classifier_cde
                    clf_params = classifier_cde_dict[classifier_cde]
                    t0_pred_vec = train_qr_algo(model_obj=model_obj, theta_mat=theta_mat, stats_mat=stats_mat,
                                                algo_name=clf_params[0], learner_kwargs=clf_params[1],
                                                pytorch_kwargs=clf_params[2] if len(clf_params) > 2 else None,
                                                alpha=alpha, prediction_grid=t0_grid)
                elif cutoff == 'chisquare':
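                    # By Wilks' theorem, -2 * log-likelihood-ratio is asymptotically chi-square
                    # (df=1 here), so the acceptance cutoff on the log-LR statistic is -0.5 * chi2_{1-alpha}.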
                    chisquare_cutoff = chi2.ppf(q=1.0-alpha, df=1)
                    t0_pred_vec = np.array([-0.5 * chisquare_cutoff] * tau_obs.shape[0])

                    bprime_time = datetime.now()
                    clf_name_qr = classifier_cde
                    clf_cde_fitted[clf_name] = {}
                else:
                    raise ValueError('Cutoff %s not recognized. Either "qr" or "chisquare" are accepted' % cutoff)

            else:
                clf_odds = train_clf(sample_size=b, clf_model=clf_model, gen_function=gen_sample_func,
                                     clf_name=clf_name, marginal=marginal, nn_square_root=True)
                training_time = datetime.now()

                if verbose:
                    print('----- %s Trained' % clf_name)
                tau_obs = np.array([
                    compute_statistics_single_t0(
                        clf=clf_odds, obs_sample=x_obs, t0=theta_0, grid_param_t1=grid_param,
                        d=model_obj.d, d_obs=model_obj.d_obs) for theta_0 in t0_grid])
                clf_odds_fitted[clf_name] = (tau_obs, np.mean((tau_obs - true_tau_obs)**2))
                # print(clf_name, clf_odds_fitted[clf_name])
                pred_time = datetime.now()

                # Train the quantile regression algorithm for confidence levels
                theta_mat, sample_mat = msnh_sampling_func(b_prime=b_prime, sample_size=sample_size_obs)
                full_mat = np.hstack((theta_mat, sample_mat))
                stats_mat = np.apply_along_axis(arr=full_mat, axis=1,
                                                func1d=lambda row: compute_statistics_single_t0(
                                                    clf=clf_odds,
                                                    obs_sample=row[model_obj.d:],
                                                    t0=row[:model_obj.d],
                                                    grid_param_t1=grid_param,
                                                    d=model_obj.d,
                                                    d_obs=model_obj.d_obs
                                                ))
                bprime_time = datetime.now()

                clf_cde_fitted[clf_name] = {}
                # for clf_name_qr, clf_params in sorted(classifier_cde_dict.items(), key=lambda x: x[0]):
                clf_name_qr = classifier_cde
                clf_params = classifier_cde_dict[classifier_cde]
                t0_pred_vec = train_qr_algo(model_obj=model_obj, theta_mat=theta_mat, stats_mat=stats_mat,
                                            algo_name=clf_params[0], learner_kwargs=clf_params[1],
                                            pytorch_kwargs=clf_params[2] if len(clf_params) > 2 else None,
                                            alpha=alpha, prediction_grid=t0_grid)

            cutoff_time = datetime.now()
            clf_cde_fitted[clf_name][clf_name_qr] = (
                t0_pred_vec, ((training_time - start_time).total_seconds() * 100,
                              (pred_time - training_time).total_seconds() * 100,
                              (bprime_time - pred_time).total_seconds() * 100,
                              (cutoff_time - bprime_time).total_seconds() * 100))

        # At this point, all that's left is to record the results
        for clf_name, (tau_obs_val, mse_val) in clf_odds_fitted.items():
            for clf_name_qr, (cutoff_val, time_vec) in clf_cde_fitted[clf_name].items():
                size_temp = np.sum((tau_obs_val >= cutoff_val).astype(int))/t0_grid.shape[0]
                for kk, theta_0_current in enumerate(t0_grid):
                    out_val.append([
                        b_prime, b, clf_name, clf_name_qr, run, jj, sample_size_obs,
                        t0_val, theta_0_current, int(t0_val == theta_0_current),
                        tau_obs_val[kk], cutoff_val[kk], int(tau_obs_val[kk] > cutoff_val[kk]),
                        int(tau_obs_val[kk] <= cutoff_val[kk]), size_temp, mse_val,
                        time_vec[0], time_vec[1], time_vec[2], time_vec[3], sum(time_vec), cutoff
                    ])
        pbar.update(1)

    # Saving the results
    out_df = pd.DataFrame.from_records(data=out_val, index=range(len(out_val)), columns=out_cols)
    out_dir = 'sims/gp_mc_comparison/'
    out_filename = 'classifier_reps_gp_mc_comparison_%sB_%sBprime_%s_%srep_alpha%s_sampleobs%s_t0val%s_%s_%s.csv' % (
        b, b_prime, run, rep, str(alpha).replace('.', '-'), sample_size_obs,
        str(t0_val).replace('.', '-'), classifier_cde,
        datetime.strftime(datetime.today(), '%Y-%m-%d')
    )
    out_df.to_csv(out_dir + out_filename)

    # Print results
    cov_df = out_df[out_df['on_true_t0'] == 1][['classifier', 'classifier_cde',
                                                'in_confint', 'mse_loss', 'size_CI',
                                                'training_time', 'pred_time', 'bprime_time', 'cutoff_time',
                                                'total_time']]
    print(cov_df.groupby(['classifier', 'classifier_cde']).agg({'in_confint': [np.average],
                                                                'size_CI': [np.average, np.std],
                                                                'mse_loss': [np.average, np.std],
                                                                'training_time': [np.average, np.std],
                                                                'pred_time': [np.average, np.std],
                                                                'bprime_time': [np.average, np.std],
                                                                'cutoff_time': [np.average, np.std],
                                                                'total_time': [np.average, np.std]}))

    # Power plots
    out_df['class_combo'] = out_df[['classifier', 'classifier_cde']].apply(
        lambda x: x[0] + '---' + x[1], axis=1)
    plot_df = out_df[['class_combo', 'theta_0_current', 'out_confint']].groupby(
        ['class_combo', 'theta_0_current']).mean().reset_index()
    fig = plt.figure(figsize=(20, 10))
    sns.lineplot(x='theta_0_current', y='out_confint', hue='class_combo', data=plot_df, palette='cubehelix')
    plt.legend(loc='best', fontsize=25)
    plt.xlabel(r'$\theta$', fontsize=25)
    plt.ylabel('Power', fontsize=25)
    plt.title("Power of Hypothesis Test, B=%s, B'=%s, n=%s, %s" % (
        b, b_prime, sample_size_obs, run.title()), fontsize=25)
    out_dir = 'images/gp_mc_comparison/'
    outfile_name = 'power_gp_mc_comparison_reps_%sB_%sBprime_%s_%srep_alpha%s_sampleobs%s_t0val%s_%s_%s.pdf' % (
        b, b_prime, run, rep, str(alpha).replace('.', '-'), sample_size_obs,
        str(t0_val).replace('.', '-'), classifier_cde,
        datetime.strftime(datetime.today(), '%Y-%m-%d')
    )
    plt.tight_layout()
    plt.savefig(out_dir + outfile_name)
    plt.close()
Example No. 6
def main(alpha,
         run,
         debug=False,
         seed=7,
         size_check=1000,
         size_reference=10000):

    # Setup the variables, also to account for debug runs
    np.random.seed(seed)
    classifier_dict = classifier_dict_full if not debug else classifier_dict_small

    # Create the loader object, which drives most of the simulation
    print('----- Loading Simulations In')
    model_obj = model_dict[run]()

    # Also, get the mean and std of the reference distribution
    model_obj.set_reference_g(size_reference=size_reference)
    mean_instrumental = model_obj.mean_instrumental
    cov_instrumental = model_obj.cov_instrumental

    # Get the correct functions
    gen_sample_func = model_obj.generate_sample

    np.random.seed(seed)
    # Loop to check different values of B
    b_vec = model_obj.b_sample_vec if not debug else [100, 1000]
    out_val = []
    out_cols = [
        'b', 'classifier', 'entropy_loss', 'alpha', 'run', 'size_check',
        'size_marginal'
    ]
    for b_val in np.array(b_vec).astype(int):

        if b_val > 100000 and model_obj.regen_flag:
            model_obj = model_dict[run]()
            gen_sample_func = model_obj.generate_sample
            model_obj.set_reference_g_no_sample(
                mean_instrumental=mean_instrumental,
                cov_instrumental=cov_instrumental)

        np.random.seed(seed)
        sample_check = gen_sample_func(sample_size=size_check, marginal=False)
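        # Generated rows stack (theta, label, x): the first d columns are theta, column d is
        # the Bernoulli label, and the remaining columns are the observation x.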
        theta_vec = sample_check[:, :model_obj.d]
        x_vec = sample_check[:, (model_obj.d + 1):]
        bern_vec = sample_check[:, model_obj.d]

        pbar = tqdm(total=len(classifier_dict.keys()),
                    desc=r'Working on classifiers, b=%s' % b_val)

        for clf_name in sorted(classifier_dict.keys()):
            if (b_val > 5000 and 'Gauss' in clf_name) or (
                    b_val > 50000 and ('NN' in clf_name or 'Log. Regr.' in clf_name)):
                continue
            if b_val == 1e6 and 'MLP' not in clf_name:
                continue

            clf_model = classifier_dict[clf_name]
            clf = train_clf(sample_size=b_val,
                            clf_model=clf_model,
                            gen_function=gen_sample_func,
                            d=model_obj.d,
                            clf_name=clf_name,
                            marginal=False,
                            nn_square_root=True)

            est_prob_vec = clf_prob_value(clf=clf,
                                          x_vec=x_vec,
                                          theta_vec=theta_vec,
                                          d=model_obj.d,
                                          d_obs=model_obj.d_obs)
            loss_value = log_loss(y_true=bern_vec, y_pred=est_prob_vec)
            out_val.append([
                b_val,
                clf_name.replace('\n', '').replace(' ', '-'), loss_value,
                alpha, run, size_check, size_reference
            ])

            if debug:
                print(
                    '---------- %s: %s' %
                    (clf_name.replace('\n', '').replace(' ', '-'), loss_value))

            pbar.update(1)

    # Saving the results
    out_df = pd.DataFrame.from_records(data=out_val,
                                       index=range(len(out_val)),
                                       columns=out_cols)
    out_dir = 'sims/%s' % model_obj.out_directory
    out_filename = 'b_analysis_%s_alpha%s_sizecheck%s_bmax%s_%s.csv' % (
        run, str(alpha).replace('.', '-'), size_check, np.max(b_vec),
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    out_df.to_csv(out_dir + out_filename)
Example No. 7
def main(run,
         rep,
         b,
         b_prime,
         alpha,
         sample_size_obs,
         classifier_cde,
         sample_type='MC',
         cutoff='qr',
         debug=False,
         seed=7,
         size_check=1000,
         verbose=False,
         marginal=False,
         size_marginal=1000):

    # Changing values if debugging
    b = b if not debug else 100
    b_prime = b_prime if not debug else 100
    size_check = size_check if not debug else 100
    rep = rep if not debug else 2
    model_obj = model_dict[run](marginal=marginal, size_marginal=size_marginal)

    # Get the correct functions
    msnh_sampling_func = model_obj.sample_msnh_algo5
    grid_param = model_obj.grid
    gen_obs_func = model_obj.sample_sim
    gen_sample_func = model_obj.generate_sample
    t0_grid = model_obj.pred_grid
    t0_val = model_obj.true_param
    lik_func = model_obj.compute_exact_likelihood
    np.random.seed(seed)

    # Adding CARL (with different hidden-layer sizes) as an option in the classifier toy example
    num_hidden_vec = [(100, ), (20, 20), (50, 20)]
    for num_hidden in num_hidden_vec:
        classifier_dict['carl_' + str(num_hidden)] = num_hidden

    # # MadMiner output
    # logging.basicConfig(
    #     format='%(asctime)-5.5s %(name)-20.20s %(levelname)-7.7s %(message)s',
    #     datefmt='%H:%M',
    #     level=logging.INFO
    # )
    # # Output of all other modules (e.g. matplotlib)
    # for key in logging.Logger.manager.loggerDict:
    #     if "madminer" not in key:
    #         logging.getLogger(key).setLevel(logging.WARNING)

    # Loop over repetitions and classifiers
    # Each time we train the different classifiers, we build the intervals and we record
    # whether the point is in or not.
    out_val = []
    out_cols = [
        'b_prime', 'b', 'classifier', 'classifier_cde', 'run', 'rep',
        'sample_size_obs', 't0_true_val', 'theta_0_current', 'on_true_t0',
        'estimated_tau', 'estimated_cutoff', 'in_confint', 'out_confint',
        'size_CI', 'mse_loss', 'training_time', 'pred_time', 'bprime_time',
        'cutoff_time', 'total_time', 'cutoff_type'
    ]
    pbar = tqdm(total=rep,
                desc='Toy Example for Simulations, n=%s, b=%s' %
                (sample_size_obs, b))
    for jj in range(rep):

        # Generate samples for each t0 value, so as to check both coverage and power
        x_obs = gen_obs_func(sample_size=sample_size_obs, true_param=t0_val)

        # Calculate the true likelihood ratio
        lik_theta0 = np.array([
            np.sum(np.log(lik_func(x_obs=x_obs, true_param=theta_0)))
            for theta_0 in t0_grid
        ])
        max_across_grid = np.max(
            np.array([
                np.sum(np.log(lik_func(x_obs=x_obs, true_param=t1)))
                for t1 in grid_param
            ]))
        true_tau_obs = lik_theta0.reshape(-1, ) - max_across_grid.reshape(1)
        # print('TRUE', true_tau_obs)

        # Train the classifier for the odds
        clf_odds_fitted = {}
        clf_cde_fitted = {}
        for clf_name, clf_model in sorted(classifier_dict.items(),
                                          key=lambda x: x[0]):

            start_time = datetime.now()

            if 'carl_' in clf_name:

                # Create CARL
                carl = DoubleParameterizedRatioEstimator(n_hidden=clf_model)

                # Generate data for CARL
                if sample_type == 'MC':
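                    # Build all pairwise (theta0, theta1) combinations from a regular grid:
                    # repeat varies theta0 slowly while tile cycles theta1 over the grid.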
                    n_pairs = int(np.sqrt(b // 2))
                    theta0_base = np.linspace(start=model_obj.low_int,
                                              stop=model_obj.high_int,
                                              num=n_pairs)
                    theta1_base = np.linspace(start=model_obj.low_int,
                                              stop=model_obj.high_int,
                                              num=n_pairs)
                    theta0 = np.repeat(theta0_base.reshape(-1, 1),
                                       int(n_pairs))
                    theta1 = np.tile(theta1_base.reshape(-1, 1),
                                     (int(n_pairs), 1))
                elif sample_type == 'uniform':
                    n_pairs = int(b // 2)
                    theta0 = np.random.uniform(low=model_obj.low_int,
                                               high=model_obj.high_int,
                                               size=n_pairs)
                    theta1 = np.random.uniform(low=model_obj.low_int,
                                               high=model_obj.high_int,
                                               size=n_pairs)
                else:
                    raise NotImplementedError

                sample_t0 = np.array([
                    model_obj.sample_sim(sample_size=sample_size_obs,
                                         true_param=t0) for t0 in theta0
                ])
                sample_t1 = np.array([
                    model_obj.sample_sim(sample_size=sample_size_obs,
                                         true_param=t1) for t1 in theta1
                ])

                theta_mat = np.vstack(
                    (np.hstack((theta0.reshape(-1, model_obj.d),
                                theta1.reshape(-1, model_obj.d))),
                     np.hstack((theta0.reshape(-1, model_obj.d),
                                theta1.reshape(-1, model_obj.d)))))
                x_mat = np.vstack((sample_t0.reshape(-1, sample_size_obs),
                                   sample_t1.reshape(-1, sample_size_obs)))
                y_mat = np.vstack((np.zeros(b // 2).reshape(-1, 1),
                                   np.ones(b // 2).reshape(-1, 1)))

                carl.train(method='carl',
                           x=x_mat,
                           y=y_mat,
                           theta0=theta_mat[:, :model_obj.d],
                           theta1=theta_mat[:, model_obj.d:],
                           n_epochs=25,
                           initial_lr=1e-4,
                           final_lr=1e-4)

                training_time = datetime.now()

                theta0_pred = np.repeat(t0_grid, grid_param.shape[0]).reshape(
                    -1, model_obj.d)
                theta1_pred = np.tile(grid_param,
                                      (t0_grid.shape[0], 1)).reshape(
                                          -1, model_obj.d)
                log_r_hat, _, _ = carl.evaluate(theta0=theta0_pred,
                                                theta1=theta1_pred,
                                                x=x_obs,
                                                evaluate_score=False)

                tau_obs = np.min(np.sum(log_r_hat.reshape(
                    t0_grid.shape[0], grid_param.shape[0], sample_size_obs),
                                        axis=2),
                                 axis=1)
                clf_odds_fitted[clf_name] = (tau_obs,
                                             np.mean(
                                                 (tau_obs - true_tau_obs)**2))
                pred_time = datetime.now()

                if cutoff == 'qr':
                    # Calculate the LR statistics given a sample
                    theta_mat, sample_mat = msnh_sampling_func(
                        b_prime=b_prime, sample_size=sample_size_obs)
                    full_mat = np.hstack((theta_mat, sample_mat))
                    stats_mat = np.apply_along_axis(
                        arr=full_mat,
                        axis=1,
                        func1d=lambda row: compute_statistics_single_t0_carl(
                            model=carl,
                            obs_sample=row[model_obj.d:],
                            t0=row[:model_obj.d],
                            grid_param_t1=grid_param,
                            param_d=model_obj.d))
                    bprime_time = datetime.now()

                    clf_cde_fitted[clf_name] = {}
                    # for clf_name_qr, clf_params in sorted(classifier_cde_dict.items(), key=lambda x: x[0]):
                    clf_name_qr = classifier_cde
                    clf_params = classifier_cde_dict[classifier_cde]
                    model = lgb.LGBMRegressor(objective='quantile',
                                              alpha=alpha,
                                              **clf_params[1])
                    model.fit(theta_mat.reshape(-1, model_obj.d),
                              stats_mat.reshape(-1, ))
                    t0_pred_vec = model.predict(
                        t0_grid.reshape(-1, model_obj.d))
                elif cutoff == 'chisquare':
                    chisquare_cutoff = chi2.ppf(q=1.0 - alpha, df=1)
                    t0_pred_vec = np.array([-0.5 * chisquare_cutoff] *
                                           tau_obs.shape[0])

                    bprime_time = datetime.now()
                    clf_name_qr = classifier_cde
                    clf_cde_fitted[clf_name] = {}
                else:
                    raise ValueError(
                        'Cutoff %s not recognized. Either "qr" or "chisquare" are accepted'
                        % cutoff)

            else:

                clf_odds = train_clf(sample_size=b,
                                     clf_model=clf_model,
                                     gen_function=gen_sample_func,
                                     clf_name=clf_name,
                                     marginal=marginal,
                                     nn_square_root=True)
                training_time = datetime.now()

                if verbose:
                    print('----- %s Trained' % clf_name)
                tau_obs = np.array([
                    compute_statistics_single_t0(clf=clf_odds,
                                                 obs_sample=x_obs,
                                                 t0=theta_0,
                                                 grid_param_t1=grid_param,
                                                 d=model_obj.d,
                                                 d_obs=model_obj.d_obs)
                    for theta_0 in t0_grid
                ])
                clf_odds_fitted[clf_name] = (tau_obs,
                                             np.mean(
                                                 (tau_obs - true_tau_obs)**2))
                #print(clf_name, np.mean((tau_obs - true_tau_obs)**2))
                pred_time = datetime.now()

                # Train the quantile regression algorithm for confidence levels
                theta_mat, sample_mat = msnh_sampling_func(
                    b_prime=b_prime, sample_size=sample_size_obs)
                full_mat = np.hstack((theta_mat, sample_mat))
                stats_mat = np.apply_along_axis(
                    arr=full_mat,
                    axis=1,
                    func1d=lambda row: compute_statistics_single_t0(
                        clf=clf_odds,
                        obs_sample=row[model_obj.d:],
                        t0=row[:model_obj.d],
                        grid_param_t1=grid_param,
                        d=model_obj.d,
                        d_obs=model_obj.d_obs))
                bprime_time = datetime.now()

                clf_cde_fitted[clf_name] = {}
                # for clf_name_qr, clf_params in sorted(classifier_cde_dict.items(), key=lambda x: x[0]):
                clf_name_qr = classifier_cde
                clf_params = classifier_cde_dict[classifier_cde]
                model = lgb.LGBMRegressor(objective='quantile',
                                          alpha=alpha,
                                          **clf_params[1])
                model.fit(theta_mat.reshape(-1, model_obj.d),
                          stats_mat.reshape(-1, ))
                t0_pred_vec = model.predict(t0_grid.reshape(-1, model_obj.d))

            cutoff_time = datetime.now()
            clf_cde_fitted[clf_name][clf_name_qr] = (t0_pred_vec, (
                (training_time - start_time).total_seconds() * 100,
                (pred_time - training_time).total_seconds() * 100,
                (bprime_time - pred_time).total_seconds() * 100,
                (cutoff_time - bprime_time).total_seconds() * 100))

        # At this point, all that's left is to record the results
        for clf_name, (tau_obs_val, mse_val) in clf_odds_fitted.items():
            for clf_name_qr, (cutoff_val,
                              time_vec) in clf_cde_fitted[clf_name].items():
                size_temp = np.sum(
                    (tau_obs_val >= cutoff_val).astype(int)) / t0_grid.shape[0]
                for kk, theta_0_current in enumerate(t0_grid):
                    out_val.append([
                        b_prime, b, clf_name, clf_name_qr, run, jj,
                        sample_size_obs, t0_val, theta_0_current,
                        int(t0_val == theta_0_current), tau_obs_val[kk],
                        cutoff_val[kk],
                        int(tau_obs_val[kk] > cutoff_val[kk]),
                        int(tau_obs_val[kk] <= cutoff_val[kk]), size_temp,
                        mse_val, time_vec[0], time_vec[1], time_vec[2],
                        time_vec[3],
                        sum(time_vec), cutoff
                    ])
        pbar.update(1)

    # Saving the results
    out_df = pd.DataFrame.from_records(data=out_val,
                                       index=range(len(out_val)),
                                       columns=out_cols)
    out_dir = 'sims/gp_mc_comparison/'
    out_filename = 'classifier_reps_carl_%s_comparison_%sB_%sBprime_%s_%srep_alpha%s_sampleobs%s_t0val%s_%s_%s.csv' % (
        sample_type, b, b_prime, run, rep, str(alpha).replace(
            '.', '-'), sample_size_obs, str(t0_val).replace('.', '-'),
        classifier_cde, datetime.strftime(datetime.today(), '%Y-%m-%d'))
    out_df.to_csv(out_dir + out_filename)

    # Print results
    cov_df = out_df[out_df['on_true_t0'] == 1][[
        'classifier', 'classifier_cde', 'in_confint', 'mse_loss', 'size_CI',
        'training_time', 'pred_time', 'bprime_time', 'cutoff_time',
        'total_time'
    ]]
    print(
        cov_df.groupby(['classifier', 'classifier_cde']).agg({
            'in_confint': [np.average],
            'size_CI': [np.average, np.std],
            'mse_loss': [np.average, np.std],
            'training_time': [np.average, np.std],
            'pred_time': [np.average, np.std],
            'bprime_time': [np.average, np.std],
            'cutoff_time': [np.average, np.std],
            'total_time': [np.average, np.std]
        }))
Example No. 8
def main(b,
         b_prime,
         alpha,
         classifier,
         sample_size_obs,
         run,
         rep,
         debug=False,
         seed=7,
         verbose=False,
         size_reference=1000):

    # Setup the variables, also to account for debug runs
    np.random.seed(seed)
    b = b if not debug else 100
    b_prime = b_prime if not debug else 100
    sample_size_obs = sample_size_obs if not debug else 1
    rep = rep if not debug else 1

    # Create the loader object, which drives most of the simulation
    print('----- Loading Simulations In')
    model_obj = model_dict[run]()

    # Also, calculate the reference distribution
    model_obj.set_reference_g(size_reference=size_reference)

    # Get the correct functions
    msnh_sampling_func = model_obj.sample_msnh_algo5
    grid_param = model_obj.grid
    clf_model = classifier_dict[classifier]
    gen_sample_func = model_obj.generate_sample
    t0_grid = model_obj.grid
    gen_obs_func = model_obj.sample_sim
    classifier = classifier.replace('\n', '').replace(' ', '-')

    # Start the loop
    out_val = []
    out_cols = [
        'b_prime', 'b', 'classifier', 'classifier_cde', 'run', 'rep',
        'sample_size_obs', 't0_true_ax0', 't0_true_ax1', 'theta_0_current_ax0',
        'theta_0_current_ax1', 'on_true_theta', 'estimated_tau',
        'estimated_cutoff', 'in_confint', 'out_confint', 'size_CI'
    ]
    pbar = tqdm(total=rep,
                desc='Toy Example for Simulations, n=%s' % sample_size_obs)
    for jj in range(rep):

        # Calculate Odds
        if verbose:
            print('----- Calculating Odds')
        # Compute Odds via classifier
        clf = train_clf(sample_size=b,
                        clf_model=clf_model,
                        gen_function=gen_sample_func,
                        d=model_obj.d,
                        clf_name=classifier)
        if verbose:
            print('----- %s Trained' % classifier)

        # Train Quantile Regression
        if verbose:
            print('----- Training Quantile Regression Algorithm')
        theta_mat, sample_mat = msnh_sampling_func(b_prime=b_prime,
                                                   sample_size=sample_size_obs)

        # Compute the tau values for QR training
        stats_mat = np.array([
            compute_statistics_single_t0(clf=clf,
                                         d=model_obj.d,
                                         d_obs=model_obj.d_obs,
                                         grid_param_t1=grid_param,
                                         t0=theta_0,
                                         obs_sample=sample_mat[kk, :, :])
            for kk, theta_0 in enumerate(theta_mat)
        ])
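        # stats_mat[k] is the statistic evaluated at theta_mat[k] on the sample
        # generated from that same theta, so (theta_mat, stats_mat) forms the
        # training set for the quantile-regression cutoff estimator.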

        # Fit the QR model
        model = GradientBoostingRegressor(loss='quantile',
                                          alpha=alpha,
                                          max_depth=5,
                                          n_estimators=1000)
        model.fit(theta_mat.reshape(-1, 2), stats_mat.reshape(-1, ))
        t0_pred_vec = model.predict(t0_grid.reshape(-1, 2))
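        # t0_pred_vec holds the estimated alpha-level cutoffs, one per grid
        # point, against which the observed statistics are compared below.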
        if verbose:
            print('----- Quantile Regression Algorithm Trained')
            pbar2 = tqdm(total=len(t0_grid),
                         desc='Toy Example for Simulations, n=%s' %
                         sample_size_obs)

        for t0_val in t0_grid:

            # Create a sample of observed data that are going to be used later
            # and compute statistics tau value for each t0
            x_obs = gen_obs_func(sample_size=sample_size_obs,
                                 true_param=t0_val)
            tau_obs = np.array([
                compute_statistics_single_t0(clf=clf,
                                             obs_sample=x_obs,
                                             t0=theta_0,
                                             d=model_obj.d,
                                             d_obs=model_obj.d_obs,
                                             grid_param_t1=grid_param)
                for theta_0 in t0_grid
            ])

            size_temp = np.sum(
                (tau_obs > t0_pred_vec).astype(int)) / tau_obs.shape[0]
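            # size_temp is the fraction of grid points retained in the
            # confidence set, i.e. the relative size of the confidence region.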

            # At this point all that is left is to record the results
            for kk, theta_0_current in enumerate(t0_grid):
                out_val.append([
                    b_prime, b, classifier, 'XGBoost -- (d5, n1000)', run, jj,
                    sample_size_obs, t0_val[0], t0_val[1], theta_0_current[0],
                    theta_0_current[1], 1 if np.sum(
                        (t0_val == theta_0_current).astype(int)) == 2 else 0,
                    tau_obs[kk], t0_pred_vec[kk],
                    int(tau_obs[kk] > t0_pred_vec[kk]),
                    int(tau_obs[kk] <= t0_pred_vec[kk]), size_temp
                ])
            if verbose:
                pbar2.update(1)
        pbar.update(1)

    # Saving the results
    out_df = pd.DataFrame.from_records(data=out_val,
                                       index=range(len(out_val)),
                                       columns=out_cols)
    out_dir = 'sims/sen_poisson_2d/'
    out_filename = '2d_sen_poisson_heatmap_%sB_%sBprime_%s_%srep_alpha%s_sampleobs%s_std15_%s.csv' % (
        b, b_prime, run, rep, str(alpha).replace('.', '-'), sample_size_obs,
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    out_df.to_csv(out_dir + out_filename)

    # Generating Heatmap -- Observed Values
    plot_df = out_df[out_df['on_true_theta'] == 1][[
        't0_true_ax0', 't0_true_ax1', 'in_confint'
    ]]
    plot_df = plot_df.groupby(['t0_true_ax0',
                               't0_true_ax1']).mean().reset_index()

    plt.figure(figsize=(15, 7.5))
    plot_df_heatmap = plot_df.pivot(index='t0_true_ax1',
                                    columns='t0_true_ax0',
                                    values='in_confint')
    ax = sns.heatmap(plot_df_heatmap,
                     cmap='RdYlGn',
                     vmax=plot_df['in_confint'].max(),
                     vmin=plot_df['in_confint'].min())
    ax.invert_yaxis()
    plt.title(
        "Observed Coverage Across %sD %s Param Space, B=%s, B'=%s, n=%s" %
        (model_obj.d, run.title(), b, b_prime, sample_size_obs),
        fontsize=25)
    plt.xlabel('Background', fontsize=25)
    plt.ylabel('Signal', fontsize=25)
    plt.tight_layout()
    image_name = 'heatmap_observed_coverage_%sD_%s_b_%s_bprime_%s_n%s_%s.pdf' % (
        model_obj.d, run, b, b_prime, sample_size_obs,
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    plt.savefig('images/%s%s' % (model_obj.out_directory, image_name))

    # Generating Heatmap -- Estimated Coverage
    print('----- Estimating Coverage')
    X_cov = out_df[out_df['on_true_theta'] == 1][[
        't0_true_ax0', 't0_true_ax1'
    ]].values
    y_cov = out_df[out_df['on_true_theta'] == 1]['in_confint'].values

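    # An unpenalized logistic regression of the coverage indicator on the true
    # theta values smooths the empirical coverage over the parameter space,
    # producing the estimated-coverage surface plotted below.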
    model = LogisticRegression(penalty=None, solver='saga', max_iter=10000)
    model.fit(X_cov, y_cov)
    pred_grid = model_obj.make_grid_over_param_space(50)
    pred_cov = model.predict_proba(pred_grid)

    plot_df_cov = pd.DataFrame.from_dict({
        't0_true_ax0': np.round(pred_grid[:, 0], 1),
        't0_true_ax1': np.round(pred_grid[:, 1], 1),
        'in_confint': pred_cov[:, 1]
    })
    plot_df_heatmap = plot_df_cov.pivot(index='t0_true_ax1',
                                        columns='t0_true_ax0',
                                        values='in_confint')

    plt.figure(figsize=(15, 7.5))
    ax = sns.heatmap(plot_df_heatmap,
                     cmap='RdYlGn',
                     vmax=plot_df_cov['in_confint'].max(),
                     vmin=plot_df_cov['in_confint'].min())
    ax.invert_yaxis()
    plt.title("Estimated Coverage Across %sD %s Space, B=%s, B'=%s, n=%s" %
              (model_obj.d, run.title(), b, b_prime, sample_size_obs),
              fontsize=25)
    plt.xlabel('Background', fontsize=25)
    plt.ylabel('Signal', fontsize=25)
    plt.tight_layout()
    image_name = 'heatmap_estimated_coverage_%sD_%s_b_%s_bprime_%s_n%s_%s.pdf' % (
        model_obj.d, run, b, b_prime, sample_size_obs,
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    plt.savefig('images/%s%s' % (model_obj.out_directory, image_name))
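
A minimal sketch of how this example could be invoked directly; the argument
values and the 'poisson' / 'MLP' keys below are illustrative assumptions, since
the actual entries of model_dict and classifier_dict are defined elsewhere in
the repository.

if __name__ == '__main__':
    # Hypothetical invocation -- replace the keys and values with the ones
    # actually present in model_dict / classifier_dict.
    main(b=5000,
         b_prime=1000,
         alpha=0.1,
         classifier='MLP',
         sample_size_obs=10,
         run='poisson',
         rep=10,
         verbose=True)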
Example No. 9
def main(run,
         rep,
         marginal,
         b,
         b_prime,
         alpha,
         sample_size_obs,
         size_marginal=1000,
         debug=False,
         seed=7,
         size_check=1000,
         size_t0_sampled=250,
         verbose=False):

    # Setup variables
    b = b if not debug else 10
    b_prime = b_prime if not debug else 10
    size_check = size_check if not debug else 100
    rep = rep if not debug else 1
    model_obj = model_dict[run](marginal=marginal, size_marginal=size_marginal)

    # Get the correct functions
    msnh_sampling_func = model_obj.sample_msnh_algo5
    grid_param = model_obj.grid
    gen_obs_func = model_obj.sample_sim
    gen_sample_func = model_obj.generate_sample
    t0_val = model_obj.true_param

    np.random.seed(seed)
    t0_grid = np.random.uniform(low=model_obj.low_int,
                                high=model_obj.high_int,
                                size=size_t0_sampled)
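    # t0_grid here is a random sample of evaluation points drawn uniformly over
    # the model's parameter interval, rather than a fixed deterministic grid.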

    # Loop over repetitions and classifiers
    # Each time we train the different classifiers, we build the intervals and
    # record whether each point falls inside them or not.
    np.random.seed(seed)
    out_val = []
    out_cols = [
        'b_prime', 'b', 'classifier', 'classifier_cde', 'run', 'rep',
        'sample_size_obs', 'pinball_loss', 'theta_0_current', 'estimated_tau',
        'estimated_cutoff', 'in_confint', 'out_confint'
    ]
    pbar = tqdm(total=rep,
                desc='Toy Example for Simulations, n=%s' % sample_size_obs)
    for jj in range(rep):
        # Train the classifier for the odds
        clf_odds_fitted = {}
        clf_cde_fitted = {}
        for clf_name, clf_model in sorted(classifier_dict[run].items(),
                                          key=lambda x: x[0]):
            clf_odds = train_clf(sample_size=b,
                                 clf_model=clf_model,
                                 gen_function=gen_sample_func,
                                 d=model_obj.d,
                                 clf_name=clf_name)
            if verbose:
                print('----- %s Trained' % clf_name)

            # Create a validation set for validating the pinball loss
            np.random.seed(seed)
            theta_mat_valid, sample_mat_valid = msnh_sampling_func(
                b_prime=size_check, sample_size=sample_size_obs)
            full_mat_valid = np.hstack((theta_mat_valid, sample_mat_valid))
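            # Each row of full_mat_valid stacks the sampled theta (first
            # model_obj.d entries) with its simulated sample, so the lambdas
            # below can slice the two apart row by row.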

            stats_mat_valid = np.apply_along_axis(
                arr=full_mat_valid,
                axis=1,
                func1d=lambda row: compute_statistics_single_t0(
                    obs_sample=row[model_obj.d:],
                    t0=t0_val,
                    grid_param_t1=grid_param,
                    clf=clf_odds))
            X_val = theta_mat_valid
            y_val = stats_mat_valid

            # Train the quantile regression algorithm for confidence levels
            theta_mat, sample_mat = msnh_sampling_func(
                b_prime=b_prime, sample_size=sample_size_obs)
            full_mat = np.hstack((theta_mat, sample_mat))
            stats_mat = np.apply_along_axis(
                arr=full_mat,
                axis=1,
                func1d=lambda row: compute_statistics_single_t0(
                    clf=clf_odds,
                    obs_sample=row[model_obj.d:],
                    t0=row[:model_obj.d],
                    grid_param_t1=grid_param))
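            # As in the validation step, stats_mat pairs each sampled theta
            # with the statistic computed at that theta: this is the training
            # set for the quantile-regression cutoff estimators below.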
            clf_cde_fitted[clf_name] = {}
            for clf_name_qr, clf_params in sorted(classifier_cde_dict.items(),
                                                  key=lambda x: x[0]):
                # Train the regression quantiles algorithms
                if clf_params[0] == 'xgb':
                    model = GradientBoostingRegressor(loss='quantile',
                                                      alpha=alpha,
                                                      **clf_params[1])
                    model.fit(theta_mat.reshape(-1, model_obj.d),
                              stats_mat.reshape(-1, ))
                    t0_pred_vec = model.predict(
                        t0_grid.reshape(-1, model_obj.d))
                    val_pred_vec = model.predict(X_val)
                elif clf_params[0] == 'rf':
                    model = RandomForestQuantileRegressor(**clf_params[1])
                    model.fit(theta_mat.reshape(-1, model_obj.d),
                              stats_mat.reshape(-1, ))
                    # RandomForestQuantileRegressor expects the quantile on a
                    # 0-100 percent scale, as in the X_val call below.
                    t0_pred_vec = model.predict(
                        t0_grid.reshape(-1, model_obj.d), quantile=alpha * 100)
                    val_pred_vec = model.predict(X_val, quantile=alpha * 100)
                elif clf_params[0] == 'lgb':
                    model = lgb.LGBMRegressor(objective='quantile',
                                              alpha=alpha,
                                              **clf_params[1])
                    model.fit(theta_mat.reshape(-1, model_obj.d),
                              stats_mat.reshape(-1, ))
                    t0_pred_vec = model.predict(
                        t0_grid.reshape(-1, model_obj.d))
                    val_pred_vec = model.predict(X_val)
                elif clf_params[0] == 'linear':
                    # statsmodels QuantReg takes (endog, exog): the statistics
                    # are the response and the theta values the covariates.
                    qr_fit = QuantReg(
                        stats_mat.reshape(-1, ),
                        theta_mat.reshape(-1, model_obj.d)).fit(q=alpha)
                    t0_pred_vec = qr_fit.predict(
                        t0_grid.reshape(-1, model_obj.d))
                    val_pred_vec = qr_fit.predict(
                        X_val.reshape(-1, model_obj.d))
                else:
                    raise ValueError('CDE Classifier not defined in the file.')

                # Evaluate the fitted quantile regressor on the held-out set:
                # y_val holds the observed statistics, val_pred_vec the
                # predicted alpha-quantiles.
                loss_value = pinball_loss(y_true=y_val,
                                          y_pred=val_pred_vec,
                                          alpha=alpha)
                clf_cde_fitted[clf_name][clf_name_qr] = (t0_pred_vec,
                                                         loss_value)

            # Generate a sample for each t0 value
            # Then calculate tau at each t0, using the sample generated at that t0
            # In other words, we should expect these samples to be included in
            # the confidence intervals every time
            t0_obs_sampled = {
                t0: gen_obs_func(sample_size=sample_size_obs, true_param=t0)
                for t0 in t0_grid
            }
            tau_obs = np.array([
                compute_statistics_single_t0(
                    clf=clf_odds,
                    obs_sample=t0_obs_sampled[theta_0],
                    t0=theta_0,
                    grid_param_t1=grid_param) for theta_0 in t0_grid
            ])
            clf_odds_fitted[clf_name] = tau_obs

        # At this point all that is left is to record the results
        for clf_name, tau_obs_val in clf_odds_fitted.items():
            for clf_name_qr, (cutoff_val,
                              loss_value) in clf_cde_fitted[clf_name].items():
                for kk, theta_0_current in enumerate(t0_grid):
                    out_val.append([
                        b_prime, b, clf_name, clf_name_qr, run, jj,
                        sample_size_obs, loss_value, theta_0_current,
                        tau_obs_val[kk], cutoff_val[kk],
                        int(tau_obs_val[kk] > cutoff_val[kk]),
                        int(tau_obs_val[kk] <= cutoff_val[kk])
                    ])
        pbar.update(1)

    # Saving the results
    out_df = pd.DataFrame.from_records(data=out_val,
                                       index=range(len(out_val)),
                                       columns=out_cols)
    out_dir = 'sims/classifier_coverage_toy/'
    out_filename = 'classifier_coverage_toy_%sB_%sBprime_%s_%srep_alpha%s_sampleobs%s_%s.csv' % (
        b, b_prime, run, rep, str(alpha).replace('.', '-'), sample_size_obs,
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    out_df.to_csv(out_dir + out_filename)