def main(b, b_prime, alpha, classifier, class_cde, sample_size_obs, run, t_star, c_star,
         debug=False, seed=7, n_sampled=500, size_reference=1000):
    """Build, save and plot a 2D confidence region for one simulated observation.

    The function (1) computes the test statistic tau on every point of
    ``model_obj.grid``, (2) computes a cutoff for every grid point, (3) flags
    the grid points whose statistic falls below its cutoff, and (4) writes a
    pickle with the results and a PDF scatter plot of the decisions.

    Args:
        b: sample size handed to ``train_clf`` (forced to 100 when ``debug``).
        b_prime: number of samples requested from ``sample_msnh_algo5`` to
            train the quantile regression (forced to 100 when ``debug``).
        alpha: quantile level passed to ``np.quantile`` / ``train_qr_algo``.
        classifier: key into ``classifier_dict``; newlines are removed and
            spaces replaced by ``-`` before it is used in names.
        class_cde: key into ``classifier_cde_dict``.
        sample_size_obs: size of the observed sample (1 when ``debug``).
        run: key into ``model_dict`` selecting the simulation model.
        t_star: if truthy, use ``model_obj.compute_exact_tau`` instead of a
            trained classifier for the statistic.
        c_star: if truthy, take cutoffs as the ``alpha`` quantile of
            ``model_obj.compute_exact_tau_distr`` instead of quantile
            regression.
        debug: shrink all sizes for a quick run.
        seed: seed passed to ``np.random.seed``.
        n_sampled: passed to ``compute_exact_tau_distr`` (10 when ``debug``).
        size_reference: passed to ``model_obj.set_reference_g``.
    """
    # Setup the variables, also to account for debug runs
    np.random.seed(seed)
    b = b if not debug else 100
    b_prime = b_prime if not debug else 100
    sample_size_obs = sample_size_obs if not debug else 1
    n_sampled = n_sampled if not debug else 10

    # Create the loader object, which drives most
    print('----- Loading Simulations In')
    model_obj = model_dict[run]() if not debug else model_dict[run](num_grid=21)
    t0_val = model_obj.true_t0

    # Also, calculate the reference distribution
    model_obj.set_reference_g(size_reference=size_reference)

    # Get the correct functions
    msnh_sampling_func = model_obj.sample_msnh_algo5
    grid_param = model_obj.grid
    clf_model = classifier_dict[classifier]
    gen_sample_func = model_obj.generate_sample
    t0_grid = model_obj.grid
    gen_obs_func = model_obj.sample_sim
    classifier = classifier.replace('\n', '').replace(' ', '-')

    # Create a sample of observed data that are going to be used later
    # and compute statistics tau value for each t0
    x_obs = gen_obs_func(sample_size=sample_size_obs, true_param=t0_val)

    start_time = datetime.now()
    # Calculate Odds
    print('----- Calculating Odds')
    if t_star:
        # No training happens on this path, so the training time is ~0.
        train_time = datetime.now()
        pbar = tqdm(total=t0_grid.shape[0], desc=r'Calculating True $\tau$')
        tau_obs = []
        for t0 in t0_grid:
            tau_obs.append(
                model_obj.compute_exact_tau(x_obs=x_obs, t0_val=t0, meshgrid=grid_param))
            pbar.update(1)
        pbar.close()
        tau_obs = np.array(tau_obs)
        pred_time = datetime.now()
    else:
        # Compute Odds via classifier
        clf = train_clf(sample_size=b, clf_model=clf_model, gen_function=gen_sample_func,
                        d=model_obj.d, clf_name=classifier)
        train_time = datetime.now()
        print('----- %s Trained' % classifier)

        pbar = tqdm(total=len(t0_grid), desc='Calculate Odds')
        tau_obs = []
        for theta_0 in t0_grid:
            tau_obs.append(
                compute_statistics_single_t0(clf=clf, obs_sample=x_obs, t0=theta_0,
                                             d=model_obj.d, d_obs=model_obj.d_obs,
                                             grid_param_t1=grid_param))
            pbar.update(1)
        pbar.close()
        tau_obs = np.array(tau_obs)
        pred_time = datetime.now()

    # Train Quantile Regression
    if c_star:
        pbar = tqdm(total=t0_grid.shape[0], desc=r'Calculating Distribution True $\tau$')
        tau_distr = []
        for t0 in t0_grid:
            tau_distr.append(
                model_obj.compute_exact_tau_distr(
                    t0_val=t0, meshgrid=grid_param, n_sampled=n_sampled,
                    sample_size_obs=sample_size_obs))
            pbar.update(1)
        pbar.close()
        bprime_time = datetime.now()

        tau_distr = np.array(tau_distr)
        np.save(
            file='sims/%stau_distr_t0_%s_%s_%ssampled_%ssamplesizeobs.npy' % (
                model_obj.out_directory, b, b_prime, n_sampled, sample_size_obs),
            arr=tau_distr)
        t0_pred_vec = np.quantile(a=tau_distr, q=alpha, axis=1)
        cutoff_time = datetime.now()
    else:
        print('----- Training Quantile Regression Algorithm')
        theta_mat, sample_mat = msnh_sampling_func(b_prime=b_prime, sample_size=sample_size_obs)

        # Compute the tau values for QR training
        if t_star:
            stats_mat = np.array([
                model_obj.compute_exact_tau(x_obs=sample_mat[kk, :, :], t0_val=theta_0,
                                            meshgrid=grid_param)
                for kk, theta_0 in enumerate(theta_mat)
            ])
        else:
            stats_mat = np.array([
                compute_statistics_single_t0(clf=clf, d=model_obj.d, d_obs=model_obj.d_obs,
                                             grid_param_t1=grid_param, t0=theta_0,
                                             obs_sample=sample_mat[kk, :, :])
                for kk, theta_0 in enumerate(theta_mat)
            ])
        bprime_time = datetime.now()

        clf_params = classifier_cde_dict[class_cde]
        t0_pred_vec = train_qr_algo(
            model_obj=model_obj, alpha=alpha, theta_mat=theta_mat, stats_mat=stats_mat,
            algo_name=clf_params[0], learner_kwargs=clf_params[1],
            pytorch_kwargs=clf_params[2] if len(clf_params) > 2 else None,
            prediction_grid=t0_grid)
        cutoff_time = datetime.now()

    # Confidence Region
    print('----- Creating Confidence Region')
    # Each entry is [cutoff, statistic, 1 if statistic < cutoff else 0]
    simultaneous_nh_decision = [
        [t0_pred, tau_obs[jj], int(tau_obs[jj] < t0_pred)]
        for jj, t0_pred in enumerate(t0_pred_vec)
    ]

    time_vec = [(train_time - start_time).total_seconds(),
                (pred_time - train_time).total_seconds(),
                (bprime_time - pred_time).total_seconds(),
                (cutoff_time - bprime_time).total_seconds()]
    time_vec.append(sum(time_vec))
    print(time_vec)

    # Saving data
    print('----- Saving Data')
    cutoff_col = [el[0] for el in simultaneous_nh_decision]
    decision_col = [el[2] for el in simultaneous_nh_decision]
    save_dict = {
        'background': t0_grid[:, 0],
        'signal': t0_grid[:, 1],
        'tau_statistics': tau_obs,
        'simul_nh_cutoff': cutoff_col,
        'simul_nh_decision': decision_col,
        'b': b,
        'b_prime': b_prime,
        'seed': seed,
        'sample_size_obs': sample_size_obs,
        'classifier': classifier,
        't_star': t_star,
        'time_vec': time_vec
    }
    outfile_name = '2d_confint_%s_data_b_%s_bprime_%s_%s_%s_n%s_%s_%s_%s_%s%s_%s.pkl' % (
        run, b, b_prime, t0_val[0], t0_val[1], sample_size_obs, classifier, class_cde,
        n_sampled, '' if not t_star else '_taustar', '' if not c_star else '_cstar',
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    outdir = 'sims/%s' % model_obj.out_directory
    # Use a context manager so the handle is flushed and closed deterministically.
    with open(outdir + outfile_name, 'wb') as out_file:
        pickle.dump(obj=save_dict, file=out_file)

    # Visualization
    plot_df = pd.DataFrame.from_dict({
        'background': t0_grid[:, 0],
        'signal': t0_grid[:, 1],
        'tau_statistics': tau_obs,
        'simul_nh_cutoff': cutoff_col,
        'simul_nh_decision': decision_col
    })

    col_vec = ['blue']
    alpha_vec = [0.75, 0.1]
    theta_0_plot = plot_df['background'].values
    theta_1_plot = plot_df['signal'].values

    plt.figure(figsize=(12, 8))
    for ii, col in enumerate(['simul_nh_decision']):
        value_temp = plot_df[col].values
        marker = np.array(["x" if el else "o" for el in value_temp])
        # Sort the distinct markers: iterating a bare set made the
        # marker -> transparency pairing depend on hash order, so the same
        # data could be drawn differently from one run to the next.
        for j, um in enumerate(sorted(set(marker))):
            mask = marker == um
            plt.scatter(x=theta_0_plot[mask], y=theta_1_plot[mask],
                        marker=um, color=col_vec[ii], alpha=alpha_vec[j])

    # Mark the true parameter
    plt.scatter(x=t0_val[0], y=t0_val[1], color='r', marker='*', s=500)
    plt.xlabel('Background', fontsize=25)
    plt.ylabel('Signal', fontsize=25)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.title("2D Confidence Interval, %s Example, B=%s, B'=%s, n=%s%s%s" % (
        run.title(), b, b_prime, sample_size_obs,
        '' if not t_star else '\n tau_star',
        '' if not c_star else ', c_star'), fontsize=25)

    plt.tight_layout()
    # NOTE(review): unlike the pickle name above, the 'n' prefix here lands in
    # front of the classifier rather than sample_size_obs. Left untouched in
    # case downstream scripts rely on the existing file names.
    image_name = '2d_confint_%s_b_%s_bprime_%s_%s_%s_%s_n%s%s%s_%s.pdf' % (
        run, b, b_prime, t0_val[0], t0_val[1], sample_size_obs, classifier,
        '' if not t_star else '_taustar', '' if not c_star else '_cstar',
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    plt.savefig('images/%s/' % model_obj.out_directory + image_name)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close()
def main(b, alpha, classifier, sample_size_obs, run, n_eval_grid=101,
         debug=False, seed=7, sample_size_check=1000, size_reference=1000):
    """Estimate coverage of the confidence sets as a function of B'.

    One odds classifier is trained once. Then, for every ``b_prime`` in
    ``model_obj.b_prime_vec`` and every quantile-regression learner in the
    selected ``classifier_cde_dict``, the cutoffs are fitted and the indicator
    "statistic above cutoff" is regressed on theta with (a) an XGBoost
    classifier and (b) a logistic regression with 1-std and 1.96-std upper
    bounds. Summary statistics are written to a CSV file.

    Args:
        b: sample size handed to ``train_clf`` (100 when ``debug``).
        alpha: level passed to ``train_qr_algo``; coverage is compared
            against ``1 - alpha``.
        classifier: key into ``classifier_dict``; newlines are removed and
            spaces replaced by ``-`` before it is used in names.
        sample_size_obs: observed sample size (5 when ``debug``).
        run: key into ``model_dict``.
        n_eval_grid: only recorded in the output table and file name.
        debug: use the small learner dictionary and B' in ``[500, 1000]``.
        seed: seed for ``np.random.seed``; re-applied before each B' draw.
        sample_size_check: number of (theta, x) pairs used to check coverage.
        size_reference: passed to ``model_obj.set_reference_g``.
    """
    # Setup the variables, also to account for debug runs
    np.random.seed(seed)
    b = b if not debug else 100
    sample_size_obs = sample_size_obs if not debug else 5
    classifier_cde_dict = classifier_cde_dict_full if not debug else classifier_cde_dict_small

    # Create the loader object, which drives most
    print('----- Loading Simulations In')
    model_obj = model_dict[run]()

    # Also, calculate the reference distribution
    model_obj.set_reference_g(size_reference=size_reference)

    # Get the correct functions
    msnh_sampling_func = model_obj.sample_msnh_algo5
    grid_param = model_obj.grid
    clf_model = classifier_dict[classifier]
    gen_sample_func = model_obj.generate_sample
    classifier = classifier.replace('\n', '').replace(' ', '-')

    # Then generate first the thetas used for checking coverage
    theta_vec, x_vec = model_obj.sample_sim_check(
        sample_size=sample_size_check, n=sample_size_obs)

    # Compute Odds via classifier
    print('----- Calculating Odds')
    clf = train_clf(sample_size=b, clf_model=clf_model, gen_function=gen_sample_func,
                    d=model_obj.d, clf_name=classifier)
    tau_obs = np.array([
        compute_statistics_single_t0(
            clf=clf,
            obs_sample=x_vec[kk, :, :].reshape(-1, model_obj.d_obs),
            d=model_obj.d, d_obs=model_obj.d_obs,
            t0=theta_0, grid_param_t1=grid_param)
        for kk, theta_0 in enumerate(theta_vec)
    ])
    print('----- %s Trained' % classifier)

    # Loop over B'
    b_prime_vec = model_obj.b_prime_vec if not debug else [500, 1000]
    out_val = []
    out_cols = [
        'b_prime', 'classifier', 'class_cde', 'run', 'n_eval_grid', 'sample_check',
        'sample_reference', 'percent_correct_coverage', 'average_coverage',
        'percent_correct_coverage_lr', 'average_coverage_lr',
        'percent_correct_coverage_1std', 'average_coverage_1std',
        'percent_correct_coverage_2std', 'average_coverage_2std'
    ]
    for b_prime in np.array(b_prime_vec).astype(int):
        # First generate the samples to train b_prime algorithm.
        # Re-seeding here makes the draws for each B' start from the same state.
        np.random.seed(seed)
        theta_mat, sample_mat = msnh_sampling_func(b_prime=b_prime, sample_size=sample_size_obs)
        stats_mat = np.array([
            compute_statistics_single_t0(clf=clf, d=model_obj.d, d_obs=model_obj.d_obs,
                                         grid_param_t1=grid_param, t0=theta_0,
                                         obs_sample=sample_mat[kk, :, :])
            for kk, theta_0 in enumerate(theta_mat)
        ])

        pbar = tqdm(total=len(classifier_cde_dict),
                    desc=r'Working on QR classifiers, b=%s' % b_prime)
        for clf_name_qr, clf_params in sorted(classifier_cde_dict.items(), key=lambda x: x[0]):

            # Learners with 'RF' in their name are skipped for large B'
            if b_prime > 10000 and 'RF' in clf_name_qr:
                pbar.update(1)  # keep the bar in sync with skipped learners
                continue

            t0_pred_vec = train_qr_algo(
                model_obj=model_obj, theta_mat=theta_mat, stats_mat=stats_mat,
                algo_name=clf_params[0], learner_kwargs=clf_params[1],
                pytorch_kwargs=clf_params[2] if len(clf_params) > 2 else None,
                alpha=alpha, prediction_grid=theta_vec)

            # 1 when the observed statistic exceeds its cutoff
            in_vec = np.array([
                int(tau_obs[jj] > t0_pred_vec[jj]) for jj in range(theta_vec.shape[0])
            ])

            # Calculate the mean
            # `max_depth` is the XGBoost name for the tree-depth limit; the
            # previous `depth=3` is not a recognised parameter and was ignored.
            cov_model = XGBClassifier(max_depth=3, n_estimators=100)
            cov_model.fit(theta_vec.reshape(-1, model_obj.d), in_vec.reshape(-1, ))
            pred_grid = model_obj.pred_grid
            pred_cov_mean = cov_model.predict_proba(pred_grid)[:, 1]
            percent_correct_coverage = np.average(
                (pred_cov_mean > (1.0 - alpha)).astype(int))
            average_coverage = np.average(pred_cov_mean)

            # Calculate the upper limit
            # Use the model dimension rather than a hard-coded 2 so this
            # matches the reshape used for the XGBoost fit above.
            x = theta_vec.reshape(-1, model_obj.d)
            y = in_vec.reshape(-1, )
            # estimate the model
            X = sm.add_constant(x)

            with Suppressor():
                logit_res = sm.Logit(y, X).fit(full_output=False)
            proba = logit_res.predict(X)

            percent_correct_coverage_lr = np.average(
                (proba > (1.0 - alpha)).astype(int))
            average_coverage_lr = np.average(proba)

            # estimate confidence interval for predicted probabilities
            cov = logit_res.cov_params()
            # matrix of gradients for each observation
            gradient = (proba * (1 - proba) * X.T).T
            std_errors = np.array(
                [np.sqrt(np.dot(np.dot(g, cov), g)) for g in gradient])
            c = 1  # multiplier for confidence interval
            upper = np.maximum(0, np.minimum(1, proba + std_errors * c))
            percent_correct_coverage_upper = np.average(
                (upper > (1.0 - alpha)).astype(int))
            average_coverage_upper = np.average(upper)

            upper_2std = np.maximum(0, np.minimum(1, proba + std_errors * 1.96))
            percent_correct_coverage_upper_2std = np.average(
                (upper_2std > (1.0 - alpha)).astype(int))
            average_coverage_upper_2std = np.average(upper_2std)

            out_val.append([
                b_prime, classifier, clf_name_qr, run, n_eval_grid, sample_size_check,
                size_reference, percent_correct_coverage, average_coverage,
                percent_correct_coverage_lr, average_coverage_lr,
                percent_correct_coverage_upper, average_coverage_upper,
                percent_correct_coverage_upper_2std, average_coverage_upper_2std
            ])

            pbar.update(1)
        pbar.close()

    # Saving the results
    out_df = pd.DataFrame.from_records(data=out_val, index=range(len(out_val)),
                                       columns=out_cols)
    out_dir = 'sims/%s' % model_obj.out_directory
    out_filename = 'b_prime_analysis_%s_%s_alpha%s_ngrid%s_sizecheck%s_bprimemax%s_logregint_%s.csv' % (
        classifier, run, str(alpha).replace('.', '-'), n_eval_grid, sample_size_check,
        np.max(b_prime_vec), datetime.strftime(datetime.today(), '%Y-%m-%d'))
    out_df.to_csv(out_dir + out_filename)
def main(run, rep, b, b_prime, alpha, t0_val, sample_size_obs, classifier_cde,
         or_loss_samples=1000, debug=False, seed=7, size_check=1000, verbose=False,
         marginal=False, size_marginal=1000):
    """Compare odds classifiers on coverage, power and loss over repeated runs.

    For each of ``rep`` repetitions a fresh observed sample is drawn at
    ``t0_val``; every classifier in ``classifier_dict`` is trained, its
    statistic is evaluated on ``model_obj.pred_grid``, cutoffs are fitted with
    the ``classifier_cde`` quantile-regression learner, and one row per grid
    point is recorded. Results go to a CSV file, a coverage summary is
    printed, and a power plot is saved as PDF.

    Args:
        run: key into ``model_dict``.
        rep: number of repetitions (2 when ``debug``).
        b: sample size for ``train_clf`` (100 when ``debug``).
        b_prime: sample count for ``sample_msnh_algo5`` (100 when ``debug``).
        alpha: level passed to ``train_qr_algo``.
        t0_val: true parameter used to generate the observed sample.
        sample_size_obs: observed sample size.
        classifier_cde: key into ``classifier_cde_dict``.
        or_loss_samples: passed to ``model_obj.create_samples_for_or_loss``.
        debug: shrink sizes for a quick run.
        seed: seed for ``np.random.seed``.
        size_check: size of the sample used for the cross-entropy check
            (100 when ``debug``).
        verbose: print a line after each classifier is trained.
        marginal: forwarded to the model constructor, ``generate_sample`` and
            ``train_clf``.
        size_marginal: forwarded to the model constructor.
    """
    # Changing values if debugging
    b = b if not debug else 100
    b_prime = b_prime if not debug else 100
    size_check = size_check if not debug else 100
    rep = rep if not debug else 2
    model_obj = model_dict[run](marginal=marginal, size_marginal=size_marginal)

    # Get the correct functions
    msnh_sampling_func = model_obj.sample_msnh_algo5
    grid_param = model_obj.grid
    gen_obs_func = model_obj.sample_sim
    gen_sample_func = model_obj.generate_sample
    t0_grid = model_obj.pred_grid
    tp_func = model_obj.compute_exact_prob
    or_loss_sample_func = model_obj.create_samples_for_or_loss

    # Creating sample to check entropy about.
    # Column layout used below: first d columns -> theta, column d -> the
    # 0/1 label, remaining columns -> x.
    np.random.seed(seed)
    sample_check = gen_sample_func(sample_size=size_check, marginal=marginal)
    theta_vec = sample_check[:, :model_obj.d]
    x_vec = sample_check[:, (model_obj.d + 1):]
    bern_vec = sample_check[:, model_obj.d]

    true_prob_vec = tp_func(theta_vec=theta_vec, x_vec=x_vec)
    entropy_est = -np.average([
        np.log(true_prob_vec[kk]) if el == 1 else np.log(1 - true_prob_vec[kk])
        for kk, el in enumerate(bern_vec)
    ])

    # Creating sample to calculate the OR loss
    first_term_sample, second_term_sample = or_loss_sample_func(
        or_loss_samples=or_loss_samples)

    # Loop over repetitions and classifiers
    # Each time we train the different classifiers, we build the intervals and we record
    # whether the point is in or not.
    out_val = []
    out_cols = [
        'b_prime', 'b', 'classifier', 'classifier_cde', 'run', 'rep', 'sample_size_obs',
        'cross_entropy_loss', 't0_true_val', 'theta_0_current', 'on_true_t0',
        'estimated_tau', 'estimated_cutoff', 'in_confint', 'out_confint', 'size_CI',
        'true_entropy', 'or_loss'
    ]
    pbar = tqdm(total=rep,
                desc='Toy Example for Simulations, n=%s, b=%s' % (sample_size_obs, b))
    for jj in range(rep):

        # Generates samples for each t0 values, so to be able to check both coverage and power
        x_obs = gen_obs_func(sample_size=sample_size_obs, true_param=t0_val)

        # Train the classifier for the odds
        clf_odds_fitted = {}
        clf_cde_fitted = {}
        for clf_name, clf_model in sorted(classifier_dict.items(), key=lambda x: x[0]):
            clf_odds = train_clf(sample_size=b, clf_model=clf_model,
                                 gen_function=gen_sample_func, clf_name=clf_name,
                                 marginal=marginal, nn_square_root=True)
            if verbose:
                print('----- %s Trained' % clf_name)

            tau_obs = np.array([
                compute_statistics_single_t0(clf=clf_odds, obs_sample=x_obs, t0=theta_0,
                                             grid_param_t1=grid_param,
                                             d=model_obj.d, d_obs=model_obj.d_obs)
                for theta_0 in t0_grid
            ])

            # Calculating cross-entropy
            est_prob_vec = clf_prob_value(clf=clf_odds, x_vec=x_vec, theta_vec=theta_vec,
                                          d=model_obj.d, d_obs=model_obj.d_obs)
            loss_value = log_loss(y_true=bern_vec, y_pred=est_prob_vec)

            # Calculating or loss
            or_loss_value = or_loss(clf=clf_odds, first_sample=first_term_sample,
                                    second_sample=second_term_sample)
            clf_odds_fitted[clf_name] = (tau_obs, loss_value, or_loss_value)

            # Train the quantile regression algorithm for confidence levels
            theta_mat, sample_mat = msnh_sampling_func(
                b_prime=b_prime, sample_size=sample_size_obs)
            # Each row is [theta | sample]; the lambda splits it back at column d
            full_mat = np.hstack((theta_mat, sample_mat))
            stats_mat = np.apply_along_axis(
                arr=full_mat, axis=1,
                func1d=lambda row: compute_statistics_single_t0(
                    clf=clf_odds,
                    obs_sample=row[model_obj.d:],
                    t0=row[:model_obj.d],
                    grid_param_t1=grid_param,
                    d=model_obj.d,
                    d_obs=model_obj.d_obs))

            # Only the requested quantile-regression learner is fitted
            clf_name_qr = classifier_cde
            clf_params = classifier_cde_dict[classifier_cde]
            t0_pred_vec = train_qr_algo(
                model_obj=model_obj, theta_mat=theta_mat, stats_mat=stats_mat,
                algo_name=clf_params[0], learner_kwargs=clf_params[1],
                pytorch_kwargs=clf_params[2] if len(clf_params) > 2 else None,
                alpha=alpha, prediction_grid=t0_grid)
            clf_cde_fitted[clf_name] = {clf_name_qr: t0_pred_vec}

        # At this point all it's left is to record
        for clf_name, (tau_obs_val, cross_ent_loss, or_loss_value) in clf_odds_fitted.items():
            for clf_name_qr, cutoff_val in clf_cde_fitted[clf_name].items():
                # Fraction of grid points whose statistic is >= its cutoff
                size_temp = np.sum(
                    (tau_obs_val >= cutoff_val).astype(int)) / t0_grid.shape[0]
                for kk, theta_0_current in enumerate(t0_grid):
                    # NOTE(review): on_true_t0 relies on exact equality between
                    # t0_val and a grid value - confirm t0_val lies on the grid.
                    out_val.append([
                        b_prime, b, clf_name, clf_name_qr, run, jj, sample_size_obs,
                        cross_ent_loss, t0_val, theta_0_current,
                        int(t0_val == theta_0_current),
                        tau_obs_val[kk], cutoff_val[kk],
                        int(tau_obs_val[kk] > cutoff_val[kk]),
                        int(tau_obs_val[kk] <= cutoff_val[kk]),
                        size_temp, entropy_est, or_loss_value
                    ])
        pbar.update(1)
    pbar.close()

    # Saving the results
    out_df = pd.DataFrame.from_records(data=out_val, index=range(len(out_val)),
                                       columns=out_cols)
    out_dir = 'sims/classifier_cov_pow_toy/'
    out_filename = 'classifier_reps_cov_pow_toy_%sB_%sBprime_%s_%srep_alpha%s_sampleobs%s_t0val%s_%s_%s.csv' % (
        b, b_prime, run, rep, str(alpha).replace('.', '-'), sample_size_obs,
        str(t0_val).replace('.', '-'), classifier_cde,
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    out_df.to_csv(out_dir + out_filename)

    # Print results
    cov_df = out_df[out_df['on_true_t0'] == 1][[
        'classifier', 'classifier_cde', 'in_confint', 'cross_entropy_loss', 'size_CI'
    ]]
    print(
        cov_df.groupby(['classifier', 'classifier_cde']).agg({
            'in_confint': [np.average],
            'size_CI': [np.average, np.std],
            'cross_entropy_loss': [np.average, np.std]
        }))

    # Power plots
    # Column-wise concatenation replaces a row-wise apply that indexed each
    # row Series by integer position (x[0], x[1]), which pandas deprecates
    # for label-indexed Series.
    out_df['class_combo'] = out_df['classifier'] + '---' + out_df['classifier_cde']
    plot_df = out_df[['class_combo', 'theta_0_current', 'out_confint']].groupby(
        ['class_combo', 'theta_0_current']).mean().reset_index()
    plt.figure(figsize=(20, 10))
    sns.lineplot(x='theta_0_current', y='out_confint', hue='class_combo',
                 data=plot_df, palette='cubehelix')
    plt.legend(loc='best', fontsize=25)
    plt.xlabel(r'$\theta$', fontsize=25)
    plt.ylabel('Power', fontsize=25)
    plt.title("Power of Hypothesis Test, B=%s, B'=%s, n=%s, %s" % (
        b, b_prime, sample_size_obs, run.title()), fontsize=25)
    out_dir = 'images/classifier_cov_pow_toy/'
    outfile_name = 'power_classifier_reps_%sB_%sBprime_%s_%srep_alpha%s_sampleobs%s_t0val%s_%s_%s.pdf' % (
        b, b_prime, run, rep, str(alpha).replace('.', '-'), sample_size_obs,
        str(t0_val).replace('.', '-'), classifier_cde,
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    plt.tight_layout()
    plt.savefig(out_dir + outfile_name)
    plt.close()
def main(run, rep, b, b_prime, alpha, sample_size_obs, classifier_cde, sample_type='MC',
         cutoff='qr', debug=False, seed=7, size_check=1000, verbose=False,
         marginal=False, size_marginal=1000):
    """Compare Gaussian-process and classifier-based statistics over repeated runs.

    Three Gaussian-process entries (5, 10 and 25 anchor points) are added to
    the module-level ``classifier_dict``. For each repetition every entry is
    trained, its statistic on ``model_obj.pred_grid`` is compared (MSE) with
    the exact log-likelihood ratio, cutoffs are computed, timings are
    recorded, and one row per grid point is stored. Results go to a CSV file,
    a summary is printed, and a power plot is saved as PDF.

    Args:
        run: key into ``model_dict``.
        rep: number of repetitions (2 when ``debug``).
        b: training sample size (100 when ``debug``).
        b_prime: sample count for ``sample_msnh_algo5`` (100 when ``debug``).
        alpha: level for the cutoff.
        sample_size_obs: observed sample size.
        classifier_cde: key into ``classifier_cde_dict``.
        sample_type: forwarded to ``train_gp``.
        cutoff: ``'qr'`` or ``'chisquare'``. Only the Gaussian-process entries
            honour ``'chisquare'``; the other classifiers always use quantile
            regression, although ``cutoff`` is recorded on every row.
        debug: shrink sizes for a quick run.
        seed: seed for ``np.random.seed``.
        size_check: only adjusted for ``debug``; not otherwise used here.
        verbose: print a line after each non-GP classifier is trained.
        marginal: forwarded to the model constructor and ``train_clf``.
        size_marginal: forwarded to the model constructor.

    Raises:
        ValueError: if ``cutoff`` is neither ``'qr'`` nor ``'chisquare'``.
    """
    # Fail fast: this used to surface only once the first Gaussian-process
    # entry was reached, after other classifiers had already been trained.
    if cutoff not in ('qr', 'chisquare'):
        raise ValueError(
            'Cutoff %s not recognized. Either "qr" or "chisquare" are accepted' % cutoff)

    # Changing values if debugging
    b = b if not debug else 100
    b_prime = b_prime if not debug else 100
    size_check = size_check if not debug else 100
    rep = rep if not debug else 2
    model_obj = model_dict[run](marginal=marginal, size_marginal=size_marginal)

    # Get the correct functions
    msnh_sampling_func = model_obj.sample_msnh_algo5
    grid_param = model_obj.grid
    gen_obs_func = model_obj.sample_sim
    gen_sample_func = model_obj.generate_sample
    t0_grid = model_obj.pred_grid
    t0_val = model_obj.true_param
    lik_func = model_obj.compute_exact_likelihood
    np.random.seed(seed)

    # Adding Gaussian Process as an option in the classifier toy example.
    # This mutates the module-level classifier_dict in place; the stored
    # value is the number of anchor points, not a model object.
    anchor_points_vec = [5, 10, 25]
    for anchor_points in anchor_points_vec:
        classifier_dict['gaussian_process_' + str(anchor_points)] = anchor_points

    # Loop over repetitions and classifiers
    # Each time we train the different classifiers, we build the intervals and we record
    # whether the point is in or not.
    out_val = []
    out_cols = ['b_prime', 'b', 'classifier', 'classifier_cde', 'run', 'rep', 'sample_size_obs',
                't0_true_val', 'theta_0_current', 'on_true_t0',
                'estimated_tau', 'estimated_cutoff', 'in_confint', 'out_confint', 'size_CI', 'mse_loss',
                'training_time', 'pred_time', 'bprime_time', 'cutoff_time', 'total_time', 'cutoff_type']
    pbar = tqdm(total=rep,
                desc='Toy Example for Simulations, n=%s, b=%s' % (sample_size_obs, b))
    for jj in range(rep):

        # Generates samples for each t0 values, so to be able to check both coverage and power
        x_obs = gen_obs_func(sample_size=sample_size_obs, true_param=t0_val)

        # Calculate the true likelihood ratio
        lik_theta0 = np.array([
            np.sum(np.log(lik_func(x_obs=x_obs, true_param=theta_0)))
            for theta_0 in t0_grid])
        max_across_grid = np.max(np.array([
            np.sum(np.log(lik_func(x_obs=x_obs, true_param=t1)))
            for t1 in grid_param]))
        true_tau_obs = lik_theta0.reshape(-1, ) - max_across_grid.reshape(1)

        # Train the classifier for the odds
        clf_odds_fitted = {}
        clf_cde_fitted = {}
        for clf_name, clf_model in sorted(classifier_dict.items(), key=lambda x: x[0]):

            start_time = datetime.now()

            if 'gaussian_process' in clf_name:

                # Train Gaussian Process (clf_model holds the anchor-point count)
                gp_model = train_gp(sample_size=b, n_anchor_points=clf_model,
                                    model_obj=model_obj, t0_grid=t0_grid,
                                    sample_type=sample_type)
                training_time = datetime.now()

                # Calculate LR given a Gaussian Process
                tau_obs = np.array([
                    compute_statistics_single_t0_gp(
                        gp_model=gp_model, obs_sample=x_obs, t0=theta_0,
                        grid_param_t1=grid_param,
                        d=model_obj.d, d_obs=model_obj.d_obs)
                    for theta_0 in t0_grid])
                clf_odds_fitted[clf_name] = (tau_obs, np.mean((tau_obs - true_tau_obs)**2))
                pred_time = datetime.now()

                # Calculate the LR statistics given a sample
                if cutoff == 'qr':
                    theta_mat, sample_mat = msnh_sampling_func(
                        b_prime=b_prime, sample_size=sample_size_obs)
                    # Each row is [theta | sample]; split again at column d
                    full_mat = np.hstack((theta_mat, sample_mat))
                    stats_mat = np.apply_along_axis(
                        arr=full_mat, axis=1,
                        func1d=lambda row: compute_statistics_single_t0_gp(
                            gp_model=gp_model,
                            obs_sample=row[model_obj.d:],
                            t0=row[:model_obj.d],
                            grid_param_t1=grid_param,
                            d=model_obj.d,
                            d_obs=model_obj.d_obs
                        ))
                    bprime_time = datetime.now()

                    clf_cde_fitted[clf_name] = {}
                    clf_name_qr = classifier_cde
                    clf_params = classifier_cde_dict[classifier_cde]
                    t0_pred_vec = train_qr_algo(
                        model_obj=model_obj, theta_mat=theta_mat, stats_mat=stats_mat,
                        algo_name=clf_params[0], learner_kwargs=clf_params[1],
                        pytorch_kwargs=clf_params[2] if len(clf_params) > 2 else None,
                        alpha=alpha, prediction_grid=t0_grid)
                else:
                    # cutoff == 'chisquare' (validated at the top): one constant
                    # cutoff, -0.5 * chi2(1) quantile, for every grid point.
                    chisquare_cutoff = chi2.ppf(q=1.0 - alpha, df=1)
                    t0_pred_vec = np.array([-0.5 * chisquare_cutoff] * tau_obs.shape[0])

                    bprime_time = datetime.now()
                    clf_name_qr = classifier_cde
                    clf_cde_fitted[clf_name] = {}

            else:
                clf_odds = train_clf(sample_size=b, clf_model=clf_model,
                                     gen_function=gen_sample_func, clf_name=clf_name,
                                     marginal=marginal, nn_square_root=True)
                training_time = datetime.now()

                if verbose:
                    print('----- %s Trained' % clf_name)
                tau_obs = np.array([
                    compute_statistics_single_t0(
                        clf=clf_odds, obs_sample=x_obs, t0=theta_0,
                        grid_param_t1=grid_param,
                        d=model_obj.d, d_obs=model_obj.d_obs)
                    for theta_0 in t0_grid])
                clf_odds_fitted[clf_name] = (tau_obs, np.mean((tau_obs - true_tau_obs)**2))
                pred_time = datetime.now()

                # Train the quantile regression algorithm for confidence levels
                # (this branch does not consult `cutoff`)
                theta_mat, sample_mat = msnh_sampling_func(
                    b_prime=b_prime, sample_size=sample_size_obs)
                full_mat = np.hstack((theta_mat, sample_mat))
                stats_mat = np.apply_along_axis(
                    arr=full_mat, axis=1,
                    func1d=lambda row: compute_statistics_single_t0(
                        clf=clf_odds,
                        obs_sample=row[model_obj.d:],
                        t0=row[:model_obj.d],
                        grid_param_t1=grid_param,
                        d=model_obj.d,
                        d_obs=model_obj.d_obs
                    ))
                bprime_time = datetime.now()

                clf_cde_fitted[clf_name] = {}
                clf_name_qr = classifier_cde
                clf_params = classifier_cde_dict[classifier_cde]
                t0_pred_vec = train_qr_algo(
                    model_obj=model_obj, theta_mat=theta_mat, stats_mat=stats_mat,
                    algo_name=clf_params[0], learner_kwargs=clf_params[1],
                    pytorch_kwargs=clf_params[2] if len(clf_params) > 2 else None,
                    alpha=alpha, prediction_grid=t0_grid)

            cutoff_time = datetime.now()
            # Stage durations are stored as seconds * 100, as in the original.
            clf_cde_fitted[clf_name][clf_name_qr] = (
                t0_pred_vec,
                ((training_time - start_time).total_seconds() * 100,
                 (pred_time - training_time).total_seconds() * 100,
                 (bprime_time - pred_time).total_seconds() * 100,
                 (cutoff_time - bprime_time).total_seconds() * 100))

        # At this point all it's left is to record
        for clf_name, (tau_obs_val, mse_val) in clf_odds_fitted.items():
            for clf_name_qr, (cutoff_val, time_vec) in clf_cde_fitted[clf_name].items():
                # Fraction of grid points whose statistic is >= its cutoff
                size_temp = np.sum((tau_obs_val >= cutoff_val).astype(int)) / t0_grid.shape[0]
                for kk, theta_0_current in enumerate(t0_grid):
                    out_val.append([
                        b_prime, b, clf_name, clf_name_qr, run, jj, sample_size_obs,
                        t0_val, theta_0_current, int(t0_val == theta_0_current),
                        tau_obs_val[kk], cutoff_val[kk],
                        int(tau_obs_val[kk] > cutoff_val[kk]),
                        int(tau_obs_val[kk] <= cutoff_val[kk]),
                        size_temp, mse_val,
                        time_vec[0], time_vec[1], time_vec[2], time_vec[3],
                        sum(time_vec), cutoff
                    ])
        pbar.update(1)
    pbar.close()

    # Saving the results
    out_df = pd.DataFrame.from_records(data=out_val, index=range(len(out_val)),
                                       columns=out_cols)
    out_dir = 'sims/gp_mc_comparison/'
    out_filename = 'classifier_reps_gp_mc_comparison_%sB_%sBprime_%s_%srep_alpha%s_sampleobs%s_t0val%s_%s_%s.csv' % (
        b, b_prime, run, rep, str(alpha).replace('.', '-'), sample_size_obs,
        str(t0_val).replace('.', '-'), classifier_cde,
        datetime.strftime(datetime.today(), '%Y-%m-%d')
    )
    out_df.to_csv(out_dir + out_filename)

    # Print results
    cov_df = out_df[out_df['on_true_t0'] == 1][['classifier', 'classifier_cde',
                                                'in_confint', 'mse_loss', 'size_CI',
                                                'training_time', 'pred_time', 'bprime_time',
                                                'cutoff_time', 'total_time']]
    print(cov_df.groupby(['classifier', 'classifier_cde']).agg({
        'in_confint': [np.average],
        'size_CI': [np.average, np.std],
        'mse_loss': [np.average, np.std],
        'training_time': [np.average, np.std],
        'pred_time': [np.average, np.std],
        'bprime_time': [np.average, np.std],
        'cutoff_time': [np.average, np.std],
        'total_time': [np.average, np.std]}))

    # Power plots
    # Column-wise concatenation replaces a row-wise apply that indexed each
    # row Series by integer position (x[0], x[1]), which pandas deprecates
    # for label-indexed Series.
    out_df['class_combo'] = out_df['classifier'] + '---' + out_df['classifier_cde']
    plot_df = out_df[['class_combo', 'theta_0_current', 'out_confint']].groupby(
        ['class_combo', 'theta_0_current']).mean().reset_index()
    plt.figure(figsize=(20, 10))
    sns.lineplot(x='theta_0_current', y='out_confint', hue='class_combo',
                 data=plot_df, palette='cubehelix')
    plt.legend(loc='best', fontsize=25)
    plt.xlabel(r'$\theta$', fontsize=25)
    plt.ylabel('Power', fontsize=25)
    plt.title("Power of Hypothesis Test, B=%s, B'=%s, n=%s, %s" % (
        b, b_prime, sample_size_obs, run.title()), fontsize=25)
    out_dir = 'images/gp_mc_comparison/'
    outfile_name = 'power_gp_mc_comparison_reps_%sB_%sBprime_%s_%srep_alpha%s_sampleobs%s_t0val%s_%s_%s.pdf' % (
        b, b_prime, run, rep, str(alpha).replace('.', '-'), sample_size_obs,
        str(t0_val).replace('.', '-'), classifier_cde,
        datetime.strftime(datetime.today(), '%Y-%m-%d')
    )
    plt.tight_layout()
    plt.savefig(out_dir + outfile_name)
    plt.close()
def main(run, rep, b, b_prime, alpha, sample_size_obs, classifier_cde, sample_type='MC',
         cutoff='qr', debug=False, seed=7, size_check=1000, verbose=False,
         marginal=False, size_marginal=1000):
    """Compare CARL ratio estimators and classifier-based statistics over repeated runs.

    Three CARL entries (hidden layouts ``(100,)``, ``(20, 20)``, ``(50, 20)``)
    are added to the module-level ``classifier_dict``. For each repetition
    every entry is trained, its statistic on ``model_obj.pred_grid`` is
    compared (MSE) with the exact log-likelihood ratio, cutoffs are computed,
    timings are recorded, and one row per grid point is stored. Results go to
    a CSV file and a summary is printed.

    Args:
        run: key into ``model_dict``.
        rep: number of repetitions (2 when ``debug``).
        b: training sample size (100 when ``debug``).
        b_prime: sample count for ``sample_msnh_algo5`` (100 when ``debug``).
        alpha: level for the cutoff.
        sample_size_obs: observed sample size.
        classifier_cde: key into ``classifier_cde_dict``; element ``[1]`` of
            the looked-up value is expanded as ``LGBMRegressor`` keyword
            arguments.
        sample_type: ``'MC'`` (regular grid of parameter pairs) or
            ``'uniform'`` (uniformly drawn pairs) for the CARL training set.
        cutoff: ``'qr'`` or ``'chisquare'``. Only the CARL entries honour
            ``'chisquare'``; the other classifiers always use quantile
            regression, although ``cutoff`` is recorded on every row.
        debug: shrink sizes for a quick run.
        seed: seed for ``np.random.seed``.
        size_check: only adjusted for ``debug``; not otherwise used here.
        verbose: print a line after each non-CARL classifier is trained.
        marginal: forwarded to the model constructor and ``train_clf``.
        size_marginal: forwarded to the model constructor.

    Raises:
        ValueError: if ``cutoff`` is neither ``'qr'`` nor ``'chisquare'``.
        NotImplementedError: if ``sample_type`` is neither ``'MC'`` nor
            ``'uniform'`` (raised when the first CARL entry is trained).
    """
    # Fail fast: this used to surface only once the first CARL entry was
    # reached, after that estimator had already been trained.
    if cutoff not in ('qr', 'chisquare'):
        raise ValueError(
            'Cutoff %s not recognized. Either "qr" or "chisquare" are accepted' % cutoff)

    # Changing values if debugging
    b = b if not debug else 100
    b_prime = b_prime if not debug else 100
    size_check = size_check if not debug else 100
    rep = rep if not debug else 2
    model_obj = model_dict[run](marginal=marginal, size_marginal=size_marginal)

    # Get the correct functions
    msnh_sampling_func = model_obj.sample_msnh_algo5
    grid_param = model_obj.grid
    gen_obs_func = model_obj.sample_sim
    gen_sample_func = model_obj.generate_sample
    t0_grid = model_obj.pred_grid
    t0_val = model_obj.true_param
    lik_func = model_obj.compute_exact_likelihood
    np.random.seed(seed)

    # Adding CARL as an option in the classifier toy example.
    # This mutates the module-level classifier_dict in place; the stored
    # value is the hidden-layer layout, not a model object.
    num_hidden_vec = [(100, ), (20, 20), (50, 20)]
    for num_hidden in num_hidden_vec:
        classifier_dict['carl_' + str(num_hidden)] = num_hidden

    # Loop over repetitions and classifiers
    # Each time we train the different classifiers, we build the intervals and we record
    # whether the point is in or not.
    out_val = []
    out_cols = [
        'b_prime', 'b', 'classifier', 'classifier_cde', 'run', 'rep', 'sample_size_obs',
        't0_true_val', 'theta_0_current', 'on_true_t0', 'estimated_tau',
        'estimated_cutoff', 'in_confint', 'out_confint', 'size_CI', 'mse_loss',
        'training_time', 'pred_time', 'bprime_time', 'cutoff_time', 'total_time',
        'cutoff_type'
    ]
    pbar = tqdm(total=rep,
                desc='Toy Example for Simulations, n=%s, b=%s' % (sample_size_obs, b))
    for jj in range(rep):

        # Generates samples for each t0 values, so to be able to check both coverage and power
        x_obs = gen_obs_func(sample_size=sample_size_obs, true_param=t0_val)

        # Calculate the true likelihood ratio
        lik_theta0 = np.array([
            np.sum(np.log(lik_func(x_obs=x_obs, true_param=theta_0)))
            for theta_0 in t0_grid
        ])
        max_across_grid = np.max(
            np.array([
                np.sum(np.log(lik_func(x_obs=x_obs, true_param=t1)))
                for t1 in grid_param
            ]))
        true_tau_obs = lik_theta0.reshape(-1, ) - max_across_grid.reshape(1)

        # Train the classifier for the odds
        clf_odds_fitted = {}
        clf_cde_fitted = {}
        for clf_name, clf_model in sorted(classifier_dict.items(), key=lambda x: x[0]):

            start_time = datetime.now()

            if 'carl_' in clf_name:
                # Create CARL (clf_model holds the hidden-layer layout)
                carl = DoubleParameterizedRatioEstimator(n_hidden=clf_model)

                # Generate data for CARL
                if sample_type == 'MC':
                    # All pairs on an n_pairs x n_pairs regular grid
                    n_pairs = int(np.sqrt(b // 2))
                    theta0_base = np.linspace(start=model_obj.low_int,
                                              stop=model_obj.high_int, num=n_pairs)
                    theta1_base = np.linspace(start=model_obj.low_int,
                                              stop=model_obj.high_int, num=n_pairs)
                    theta0 = np.repeat(theta0_base.reshape(-1, 1), int(n_pairs))
                    theta1 = np.tile(theta1_base.reshape(-1, 1), (int(n_pairs), 1))
                elif sample_type == 'uniform':
                    n_pairs = int(b // 2)
                    theta0 = np.random.uniform(low=model_obj.low_int,
                                               high=model_obj.high_int, size=n_pairs)
                    theta1 = np.random.uniform(low=model_obj.low_int,
                                               high=model_obj.high_int, size=n_pairs)
                else:
                    raise NotImplementedError

                sample_t0 = np.array([
                    model_obj.sample_sim(sample_size=sample_size_obs, true_param=t0)
                    for t0 in theta0
                ])
                sample_t1 = np.array([
                    model_obj.sample_sim(sample_size=sample_size_obs, true_param=t1)
                    for t1 in theta1
                ])
                pair_mat = np.hstack((theta0.reshape(-1, model_obj.d),
                                      theta1.reshape(-1, model_obj.d)))
                carl_theta_mat = np.vstack((pair_mat, pair_mat))
                x_t0 = sample_t0.reshape(-1, sample_size_obs)
                x_t1 = sample_t1.reshape(-1, sample_size_obs)
                x_mat = np.vstack((x_t0, x_t1))
                # Size the labels from the samples actually generated. The
                # previous `b // 2` rows per class only matches x_mat when
                # b // 2 is a perfect square in the 'MC' case, because the
                # grid yields int(sqrt(b // 2)) ** 2 pairs.
                y_mat = np.vstack((np.zeros((x_t0.shape[0], 1)),
                                   np.ones((x_t1.shape[0], 1))))

                carl.train(method='carl',
                           x=x_mat, y=y_mat,
                           theta0=carl_theta_mat[:, :model_obj.d],
                           theta1=carl_theta_mat[:, model_obj.d:],
                           n_epochs=25, initial_lr=1e-4, final_lr=1e-4)
                training_time = datetime.now()

                # Every (t0, t1) combination of prediction grid x parameter grid
                theta0_pred = np.repeat(t0_grid, grid_param.shape[0]).reshape(
                    -1, model_obj.d)
                theta1_pred = np.tile(grid_param, (t0_grid.shape[0], 1)).reshape(
                    -1, model_obj.d)
                log_r_hat, _, _ = carl.evaluate(theta0=theta0_pred, theta1=theta1_pred,
                                                x=x_obs, evaluate_score=False)
                # Sum over observations, then minimum over the t1 grid
                tau_obs = np.min(
                    np.sum(log_r_hat.reshape(t0_grid.shape[0], grid_param.shape[0],
                                             sample_size_obs), axis=2),
                    axis=1)
                clf_odds_fitted[clf_name] = (tau_obs, np.mean((tau_obs - true_tau_obs)**2))
                pred_time = datetime.now()

                if cutoff == 'qr':
                    # Calculate the LR statistics given a sample
                    theta_mat, sample_mat = msnh_sampling_func(
                        b_prime=b_prime, sample_size=sample_size_obs)
                    # Each row is [theta | sample]; split again at column d
                    full_mat = np.hstack((theta_mat, sample_mat))
                    stats_mat = np.apply_along_axis(
                        arr=full_mat, axis=1,
                        func1d=lambda row: compute_statistics_single_t0_carl(
                            model=carl,
                            obs_sample=row[model_obj.d:],
                            t0=row[:model_obj.d],
                            grid_param_t1=grid_param,
                            param_d=model_obj.d))
                    bprime_time = datetime.now()

                    clf_cde_fitted[clf_name] = {}
                    clf_name_qr = classifier_cde
                    clf_params = classifier_cde_dict[classifier_cde]
                    qr_model = lgb.LGBMRegressor(objective='quantile', alpha=alpha,
                                                 **clf_params[1])
                    qr_model.fit(theta_mat.reshape(-1, model_obj.d),
                                 stats_mat.reshape(-1, ))
                    t0_pred_vec = qr_model.predict(t0_grid.reshape(-1, model_obj.d))
                else:
                    # cutoff == 'chisquare' (validated at the top): one constant
                    # cutoff, -0.5 * chi2(1) quantile, for every grid point.
                    chisquare_cutoff = chi2.ppf(q=1.0 - alpha, df=1)
                    t0_pred_vec = np.array([-0.5 * chisquare_cutoff] * tau_obs.shape[0])

                    bprime_time = datetime.now()
                    clf_name_qr = classifier_cde
                    clf_cde_fitted[clf_name] = {}

            else:
                clf_odds = train_clf(sample_size=b, clf_model=clf_model,
                                     gen_function=gen_sample_func, clf_name=clf_name,
                                     marginal=marginal, nn_square_root=True)
                training_time = datetime.now()

                if verbose:
                    print('----- %s Trained' % clf_name)
                tau_obs = np.array([
                    compute_statistics_single_t0(clf=clf_odds, obs_sample=x_obs,
                                                 t0=theta_0, grid_param_t1=grid_param,
                                                 d=model_obj.d, d_obs=model_obj.d_obs)
                    for theta_0 in t0_grid
                ])
                clf_odds_fitted[clf_name] = (tau_obs, np.mean((tau_obs - true_tau_obs)**2))
                pred_time = datetime.now()

                # Train the quantile regression algorithm for confidence levels
                # (this branch does not consult `cutoff`)
                theta_mat, sample_mat = msnh_sampling_func(
                    b_prime=b_prime, sample_size=sample_size_obs)
                full_mat = np.hstack((theta_mat, sample_mat))
                stats_mat = np.apply_along_axis(
                    arr=full_mat, axis=1,
                    func1d=lambda row: compute_statistics_single_t0(
                        clf=clf_odds,
                        obs_sample=row[model_obj.d:],
                        t0=row[:model_obj.d],
                        grid_param_t1=grid_param,
                        d=model_obj.d,
                        d_obs=model_obj.d_obs))
                bprime_time = datetime.now()

                clf_cde_fitted[clf_name] = {}
                clf_name_qr = classifier_cde
                clf_params = classifier_cde_dict[classifier_cde]
                qr_model = lgb.LGBMRegressor(objective='quantile', alpha=alpha,
                                             **clf_params[1])
                qr_model.fit(theta_mat.reshape(-1, model_obj.d), stats_mat.reshape(-1, ))
                t0_pred_vec = qr_model.predict(t0_grid.reshape(-1, model_obj.d))

            cutoff_time = datetime.now()
            # Stage durations are stored as seconds * 100, as in the original.
            clf_cde_fitted[clf_name][clf_name_qr] = (
                t0_pred_vec,
                ((training_time - start_time).total_seconds() * 100,
                 (pred_time - training_time).total_seconds() * 100,
                 (bprime_time - pred_time).total_seconds() * 100,
                 (cutoff_time - bprime_time).total_seconds() * 100))

        # At this point all it's left is to record
        for clf_name, (tau_obs_val, mse_val) in clf_odds_fitted.items():
            for clf_name_qr, (cutoff_val, time_vec) in clf_cde_fitted[clf_name].items():
                # Fraction of grid points whose statistic is >= its cutoff
                size_temp = np.sum(
                    (tau_obs_val >= cutoff_val).astype(int)) / t0_grid.shape[0]
                for kk, theta_0_current in enumerate(t0_grid):
                    out_val.append([
                        b_prime, b, clf_name, clf_name_qr, run, jj, sample_size_obs,
                        t0_val, theta_0_current, int(t0_val == theta_0_current),
                        tau_obs_val[kk], cutoff_val[kk],
                        int(tau_obs_val[kk] > cutoff_val[kk]),
                        int(tau_obs_val[kk] <= cutoff_val[kk]),
                        size_temp, mse_val,
                        time_vec[0], time_vec[1], time_vec[2], time_vec[3],
                        sum(time_vec), cutoff
                    ])
        pbar.update(1)
    pbar.close()

    # Saving the results
    out_df = pd.DataFrame.from_records(data=out_val, index=range(len(out_val)),
                                       columns=out_cols)
    out_dir = 'sims/gp_mc_comparison/'
    out_filename = 'classifier_reps_carl_%s_comparison_%sB_%sBprime_%s_%srep_alpha%s_sampleobs%s_t0val%s_%s_%s.csv' % (
        sample_type, b, b_prime, run, rep, str(alpha).replace('.', '-'),
        sample_size_obs, str(t0_val).replace('.', '-'), classifier_cde,
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    out_df.to_csv(out_dir + out_filename)

    # Print results
    cov_df = out_df[out_df['on_true_t0'] == 1][[
        'classifier', 'classifier_cde', 'in_confint', 'mse_loss', 'size_CI',
        'training_time', 'pred_time', 'bprime_time', 'cutoff_time', 'total_time'
    ]]
    print(
        cov_df.groupby(['classifier', 'classifier_cde']).agg({
            'in_confint': [np.average],
            'size_CI': [np.average, np.std],
            'mse_loss': [np.average, np.std],
            'training_time': [np.average, np.std],
            'pred_time': [np.average, np.std],
            'bprime_time': [np.average, np.std],
            'cutoff_time': [np.average, np.std],
            'total_time': [np.average, np.std]
        }))
def main(b, b_prime, alpha, classifier, sample_size_obs, run, rep,
         debug=False, seed=7, verbose=False, size_reference=1000):
    """Estimate confidence-set coverage over a 2D parameter grid and plot it.

    For each repetition a classifier is trained to estimate the odds, a
    quantile regressor is fitted on the resulting test statistics to obtain
    cutoffs over the parameter grid, and then, for every grid point taken as
    the true parameter, an observed sample is drawn and the statistic is
    compared against the cutoff at every grid point.  All records are saved
    to a CSV file under ``sims/sen_poisson_2d/`` and two heatmaps (observed
    and logistic-regression-smoothed coverage) are saved as PDFs under
    ``images/<model_obj.out_directory>``.

    Parameters
    ----------
    b : int
        Sample size passed to ``train_clf`` (forced to 100 when ``debug``).
    b_prime : int
        Sample size passed to ``sample_msnh_algo5`` to build the quantile
        regression training set (forced to 100 when ``debug``).
    alpha : float
        Quantile level passed to the quantile regressor.
    classifier : str
        Key into ``classifier_dict``; newlines are removed and spaces are
        replaced by dashes before it is used as a label.
    sample_size_obs : int
        Size of each observed sample (forced to 1 when ``debug``).
    run : str
        Key into ``model_dict`` selecting the simulation model.
    rep : int
        Number of repetitions (forced to 1 when ``debug``).
    debug : bool
        If True, shrink the sizes above for a quick run.
    seed : int
        Seed passed to ``np.random.seed``.
    verbose : bool
        If True, print progress messages and show the inner progress bar.
    size_reference : int
        Passed to ``model_obj.set_reference_g``.

    Returns
    -------
    None
        Results are written to disk.

    Notes
    -----
    The function indexes parameters with ``[0]`` and ``[1]`` and reshapes to
    two columns, so it only supports models with a 2D parameter.
    """
    # Setup the variables, also to account for debug runs
    np.random.seed(seed)
    b = b if not debug else 100
    b_prime = b_prime if not debug else 100
    sample_size_obs = sample_size_obs if not debug else 1
    rep = rep if not debug else 1

    # Create the loader object, which drives most
    print('----- Loading Simulations In')
    model_obj = model_dict[run]()

    # Also, calculate the reference distribution
    model_obj.set_reference_g(size_reference=size_reference)

    # Get the correct functions
    msnh_sampling_func = model_obj.sample_msnh_algo5
    grid_param = model_obj.grid
    clf_model = classifier_dict[classifier]
    gen_sample_func = model_obj.generate_sample
    t0_grid = model_obj.grid
    gen_obs_func = model_obj.sample_sim
    classifier = classifier.replace('\n', '').replace(' ', '-')

    # Start the loop
    out_val = []
    out_cols = [
        'b_prime', 'b', 'classifier', 'classifier_cde', 'run', 'rep',
        'sample_size_obs', 't0_true_ax0', 't0_true_ax1',
        'theta_0_current_ax0', 'theta_0_current_ax1', 'on_true_theta',
        'estimated_tau', 'estimated_cutoff', 'in_confint', 'out_confint',
        'size_CI'
    ]
    pbar = tqdm(total=rep,
                desc='Toy Example for Simulations, n=%s' % sample_size_obs)
    for jj in range(rep):

        # Calculate Odds
        if verbose:
            print('----- Calculating Odds')

        # Compute Odds via classifier
        clf = train_clf(sample_size=b, clf_model=clf_model,
                        gen_function=gen_sample_func, d=model_obj.d,
                        clf_name=classifier)
        if verbose:
            print('----- %s Trained' % classifier)

        # Train Quantile Regression
        if verbose:
            print('----- Training Quantile Regression Algorithm')

        theta_mat, sample_mat = msnh_sampling_func(
            b_prime=b_prime, sample_size=sample_size_obs)

        # Compute the tau values for QR training
        stats_mat = np.array([
            compute_statistics_single_t0(clf=clf, d=model_obj.d,
                                         d_obs=model_obj.d_obs,
                                         grid_param_t1=grid_param,
                                         t0=theta_0,
                                         obs_sample=sample_mat[kk, :, :])
            for kk, theta_0 in enumerate(theta_mat)
        ])

        # Fit the QR model
        model = GradientBoostingRegressor(loss='quantile', alpha=alpha,
                                          max_depth=5, n_estimators=1000)
        model.fit(theta_mat.reshape(-1, 2), stats_mat.reshape(-1, ))
        t0_pred_vec = model.predict(t0_grid.reshape(-1, 2))
        if verbose:
            print('----- Quantile Regression Algorithm Trained')

        # The inner bar is only advanced in verbose mode, so it is disabled
        # otherwise instead of being left on screen frozen at 0%.
        pbar2 = tqdm(total=len(t0_grid),
                     desc='Toy Example for Simulations, n=%s' % sample_size_obs,
                     disable=not verbose)
        for t0_val in t0_grid:
            # Create a sample of observed data that are going to be used later
            # and compute statistics tau value for each t0
            x_obs = gen_obs_func(sample_size=sample_size_obs,
                                 true_param=t0_val)
            tau_obs = np.array([
                compute_statistics_single_t0(clf=clf, obs_sample=x_obs,
                                             t0=theta_0, d=model_obj.d,
                                             d_obs=model_obj.d_obs,
                                             grid_param_t1=grid_param)
                for theta_0 in t0_grid
            ])

            # Fraction of grid points retained in the confidence set
            size_temp = np.sum(
                (tau_obs > t0_pred_vec).astype(int)) / tau_obs.shape[0]

            # At this point all it's left is to record
            for kk, theta_0_current in enumerate(t0_grid):
                # Both coordinates must match for the grid point to be the
                # true parameter.
                on_true_theta = int(
                    np.sum((t0_val == theta_0_current).astype(int)) == 2)
                out_val.append([
                    b_prime, b, classifier, 'XGBoost -- (d5, n1000)', run, jj,
                    sample_size_obs, t0_val[0], t0_val[1],
                    theta_0_current[0], theta_0_current[1], on_true_theta,
                    tau_obs[kk], t0_pred_vec[kk],
                    int(tau_obs[kk] > t0_pred_vec[kk]),
                    int(tau_obs[kk] <= t0_pred_vec[kk]), size_temp
                ])
            pbar2.update(1)
        pbar.update(1)

    # Saving the results
    out_df = pd.DataFrame.from_records(data=out_val,
                                       index=range(len(out_val)),
                                       columns=out_cols)
    out_dir = 'sims/sen_poisson_2d/'
    out_filename = '2d_sen_poisson_heatmap_%sB_%sBprime_%s_%srep_alpha%s_sampleobs%s_std15_%s.csv' % (
        b, b_prime, run, rep, str(alpha).replace('.', '-'), sample_size_obs,
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    out_df.to_csv(out_dir + out_filename)

    # Generating Heatmap -- Observed Values
    plot_df = out_df[out_df['on_true_theta'] == 1][[
        't0_true_ax0', 't0_true_ax1', 'in_confint'
    ]]
    plot_df = plot_df.groupby(['t0_true_ax0',
                               't0_true_ax1']).mean().reset_index()

    plt.figure(figsize=(15, 7.5))
    # Keyword arguments: positional arguments to ``pivot`` are rejected by
    # pandas >= 2.0, while keywords work on old and new versions alike.
    plot_df_heatmap = plot_df.pivot(index='t0_true_ax1',
                                    columns='t0_true_ax0',
                                    values='in_confint')
    ax = sns.heatmap(plot_df_heatmap, cmap='RdYlGn',
                     vmax=plot_df['in_confint'].max(),
                     vmin=plot_df['in_confint'].min())
    ax.invert_yaxis()
    plt.title(
        "Observed Coverage Across %sD %s Param Space, B=%s, B'=%s, n=%s" %
        (model_obj.d, run.title(), b, b_prime, sample_size_obs),
        fontsize=25)
    plt.xlabel('Background', fontsize=25)
    plt.ylabel('Signal', fontsize=25)
    plt.tight_layout()
    image_name = 'heatmap_observed_coverage_%sD_%s_b_%s_bprime_%s_n%s_%s.pdf' % (
        model_obj.d, run, b, b_prime, sample_size_obs,
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    plt.savefig('images/%s' % model_obj.out_directory + image_name)

    # Generating Heatmap -- Estimated Coverage
    print('----- Estimating Coverage')
    on_truth = out_df[out_df['on_true_theta'] == 1]
    X_cov = on_truth[['t0_true_ax0', 't0_true_ax1']].values
    y_cov = on_truth['in_confint'].values

    # NOTE(review): newer scikit-learn releases appear to expect
    # ``penalty=None`` rather than the string 'none'; left unchanged to stay
    # compatible with the version this script was written for -- confirm.
    model = LogisticRegression(penalty='none', solver='saga', max_iter=10000)
    model.fit(X_cov, y_cov)

    pred_grid = model_obj.make_grid_over_param_space(50)
    pred_cov = model.predict_proba(pred_grid)

    # NOTE(review): coordinates are rounded to one decimal before pivoting;
    # if two grid points collapse onto the same rounded pair, ``pivot`` will
    # raise on the duplicated index -- verify against the grid spacing.
    plot_df_cov = pd.DataFrame.from_dict({
        't0_true_ax0': np.round(pred_grid[:, 0], 1),
        't0_true_ax1': np.round(pred_grid[:, 1], 1),
        'in_confint': pred_cov[:, 1]
    })
    plot_df_heatmap = plot_df_cov.pivot(index='t0_true_ax1',
                                        columns='t0_true_ax0',
                                        values='in_confint')

    plt.figure(figsize=(15, 7.5))
    ax = sns.heatmap(plot_df_heatmap, cmap='RdYlGn',
                     vmax=plot_df_cov['in_confint'].max(),
                     vmin=plot_df_cov['in_confint'].min())
    ax.invert_yaxis()
    plt.title("Estimated Coverage Across %sD %s Space, B=%s, B'=%s, n=%s" %
              (model_obj.d, run.title(), b, b_prime, sample_size_obs),
              fontsize=25)
    plt.xlabel('Background', fontsize=25)
    plt.ylabel('Signal', fontsize=25)
    plt.tight_layout()
    image_name = 'heatmap_estimated_coverage_%sD_%s_b_%s_bprime_%s_n%s_%s.pdf' % (
        model_obj.d, run, b, b_prime, sample_size_obs,
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    plt.savefig('images/%s' % model_obj.out_directory + image_name)
def main(run, rep, marginal, b, b_prime, alpha, sample_size_obs,
         size_marginal=1000, debug=False, seed=7, size_check=1000,
         size_t0_sampled=250, verbose=False):
    """Compare odds classifiers and quantile regressors on a toy example.

    For every repetition and every classifier in ``classifier_dict[run]``,
    the classifier is trained, each quantile regression algorithm in
    ``classifier_cde_dict`` is fitted on the resulting test statistics, its
    pinball loss is computed on a validation set, and the cutoff is compared
    with the statistic computed at randomly drawn parameter values using a
    sample generated at that same value.  The records are written to a CSV
    file under ``sims/classifier_coverage_toy/``.

    Parameters
    ----------
    run : str
        Key into ``model_dict`` and ``classifier_dict``.
    rep : int
        Number of repetitions (forced to 1 when ``debug``).
    marginal : object
        Forwarded to the model constructor as ``marginal``.
    b : int
        Sample size passed to ``train_clf`` (forced to 10 when ``debug``).
    b_prime : int
        Sample size of the quantile regression training set (forced to 10
        when ``debug``).
    alpha : float
        Quantile level for the quantile regressors and the pinball loss.
    sample_size_obs : int
        Size of each observed sample.
    size_marginal : int
        Forwarded to the model constructor as ``size_marginal``.
    debug : bool
        If True, shrink the sizes above for a quick run.
    seed : int
        Seed passed to ``np.random.seed``.
    size_check : int
        Size of the validation set (forced to 100 when ``debug``).
    size_t0_sampled : int
        Number of parameter values drawn uniformly between
        ``model_obj.low_int`` and ``model_obj.high_int``.
    verbose : bool
        If True, print a message after each classifier is trained.

    Returns
    -------
    None
        Results are written to disk.

    Raises
    ------
    ValueError
        If a ``classifier_cde_dict`` entry names an unknown algorithm.
    """
    # Setup variables
    b = b if not debug else 10
    b_prime = b_prime if not debug else 10
    size_check = size_check if not debug else 100
    rep = rep if not debug else 1
    model_obj = model_dict[run](marginal=marginal,
                                size_marginal=size_marginal)

    # Get the correct functions
    msnh_sampling_func = model_obj.sample_msnh_algo5
    grid_param = model_obj.grid
    gen_obs_func = model_obj.sample_sim
    gen_sample_func = model_obj.generate_sample
    t0_val = model_obj.true_param

    np.random.seed(seed)
    t0_grid = np.random.uniform(low=model_obj.low_int,
                                high=model_obj.high_int,
                                size=size_t0_sampled)

    # Loop over repetitions and classifiers
    # Each time we train the different classifiers, we build the intervals and we record
    # whether the point is in or not.
    np.random.seed(seed)
    out_val = []
    out_cols = [
        'b_prime', 'b', 'classifier', 'classifier_cde', 'run', 'rep',
        'sample_size_obs', 'pinball_loss', 'theta_0_current',
        'estimated_tau', 'estimated_cutoff', 'in_confint', 'out_confint'
    ]
    pbar = tqdm(total=rep,
                desc='Toy Example for Simulations, n=%s' % sample_size_obs)
    for jj in range(rep):

        # Train the classifier for the odds
        clf_odds_fitted = {}
        clf_cde_fitted = {}
        for clf_name, clf_model in sorted(classifier_dict[run].items(),
                                          key=lambda x: x[0]):
            clf_odds = train_clf(sample_size=b, clf_model=clf_model,
                                 gen_function=gen_sample_func,
                                 clf_name=clf_name)
            if verbose:
                print('----- %s Trained' % clf_name)

            # Create a validation set for validating the pinball loss
            # NOTE(review): re-seeding here also fixes every random draw that
            # follows in this iteration (QR training set, observed samples),
            # so those are identical across repetitions -- confirm intended.
            np.random.seed(seed)
            theta_mat_valid, sample_mat_valid = msnh_sampling_func(
                b_prime=size_check, sample_size=sample_size_obs)
            full_mat_valid = np.hstack((theta_mat_valid, sample_mat_valid))
            # NOTE(review): the validation statistic is evaluated at the fixed
            # true parameter ``t0_val`` whereas the training statistic below
            # uses each row's own theta -- verify this asymmetry is intended.
            stats_mat_valid = np.apply_along_axis(
                arr=full_mat_valid, axis=1,
                func1d=lambda row: compute_statistics_single_t0(
                    obs_sample=row[model_obj.d:], t0=t0_val,
                    grid_param_t1=grid_param, clf=clf_odds))
            X_val = theta_mat_valid
            y_val = stats_mat_valid

            # Train the quantile regression algorithm for confidence levels
            theta_mat, sample_mat = msnh_sampling_func(
                b_prime=b_prime, sample_size=sample_size_obs)
            full_mat = np.hstack((theta_mat, sample_mat))
            stats_mat = np.apply_along_axis(
                arr=full_mat, axis=1,
                func1d=lambda row: compute_statistics_single_t0(
                    clf=clf_odds, obs_sample=row[model_obj.d:],
                    t0=row[:model_obj.d], grid_param_t1=grid_param))

            # Shared design matrices for every quantile regressor
            theta_train = theta_mat.reshape(-1, model_obj.d)
            stats_train = stats_mat.reshape(-1, )
            theta_grid = t0_grid.reshape(-1, model_obj.d)

            clf_cde_fitted[clf_name] = {}
            for clf_name_qr, clf_params in sorted(classifier_cde_dict.items(),
                                                  key=lambda x: x[0]):

                # Train the regression quantiles algorithms
                if clf_params[0] == 'xgb':
                    model = GradientBoostingRegressor(loss='quantile',
                                                      alpha=alpha,
                                                      **clf_params[1])
                    model.fit(theta_train, stats_train)
                    t0_pred_vec = model.predict(theta_grid)
                    val_pred_vec = model.predict(X_val)
                elif clf_params[0] == 'rf':
                    model = RandomForestQuantileRegressor(**clf_params[1])
                    model.fit(theta_train, stats_train)
                    # The forest's ``quantile`` is a percentile on a 0-100
                    # scale, so both predictions use ``alpha * 100``; the grid
                    # prediction previously passed the raw ``alpha``.
                    t0_pred_vec = model.predict(theta_grid,
                                                quantile=alpha * 100)
                    val_pred_vec = model.predict(X_val, quantile=alpha * 100)
                elif clf_params[0] == 'lgb':
                    model = lgb.LGBMRegressor(objective='quantile',
                                              alpha=alpha, **clf_params[1])
                    model.fit(theta_train, stats_train)
                    t0_pred_vec = model.predict(theta_grid)
                    val_pred_vec = model.predict(X_val)
                elif clf_params[0] == 'linear':
                    # QuantReg takes (endog, exog): the statistic is the
                    # response and theta the regressor.  The model is fitted
                    # once and reused for both predictions.
                    # NOTE(review): no intercept column is added to theta --
                    # confirm a regression through the origin is intended.
                    linear_fit = QuantReg(stats_train,
                                          theta_train).fit(q=alpha)
                    t0_pred_vec = linear_fit.predict(theta_grid)
                    val_pred_vec = linear_fit.predict(
                        X_val.reshape(-1, model_obj.d))
                else:
                    raise ValueError('CDE Classifier not defined in the file.')

                # The pinball loss is asymmetric: the observed statistics are
                # the truth and the fitted quantiles are the prediction.
                loss_value = pinball_loss(y_true=y_val, y_pred=val_pred_vec,
                                          alpha=alpha)
                clf_cde_fitted[clf_name][clf_name_qr] = (t0_pred_vec,
                                                         loss_value)

            # Generates samples for each t0 values
            # Then calculates tau at each t0, but using the sample generated at that t0
            # In other words, we should expect the samples to be included in the confidence intervals
            # everytime
            t0_obs_sampled = {
                t0: gen_obs_func(sample_size=sample_size_obs, true_param=t0)
                for t0 in t0_grid
            }
            tau_obs = np.array([
                compute_statistics_single_t0(
                    clf=clf_odds, obs_sample=t0_obs_sampled[theta_0],
                    t0=theta_0, grid_param_t1=grid_param)
                for theta_0 in t0_grid
            ])
            clf_odds_fitted[clf_name] = tau_obs

        # At this point all it's left is to record
        for clf_name, tau_obs_val in clf_odds_fitted.items():
            for clf_name_qr, (cutoff_val, loss_value) in \
                    clf_cde_fitted[clf_name].items():
                for kk, theta_0_current in enumerate(t0_grid):
                    # Value order must follow ``out_cols`` (b_prime before b).
                    out_val.append([
                        b_prime, b, clf_name, clf_name_qr, run, jj,
                        sample_size_obs, loss_value, theta_0_current,
                        tau_obs_val[kk], cutoff_val[kk],
                        int(tau_obs_val[kk] > cutoff_val[kk]),
                        int(tau_obs_val[kk] <= cutoff_val[kk])
                    ])
        pbar.update(1)

    # Saving the results
    out_df = pd.DataFrame.from_records(data=out_val,
                                       index=range(len(out_val)),
                                       columns=out_cols)
    out_dir = 'sims/classifier_coverage_toy/'
    out_filename = 'classifier_coverage_toy_%sB_%sBprime_%s_%srep_alpha%s_sampleobs%s_%s.csv' % (
        b, b_prime, run, rep, str(alpha).replace('.', '-'), sample_size_obs,
        datetime.strftime(datetime.today(), '%Y-%m-%d'))
    out_df.to_csv(out_dir + out_filename)