def _coverage_summary(probabilities, alpha):
    """Return (fraction of entries strictly above ``1 - alpha``, mean of entries)."""
    above_nominal = (probabilities > (1.0 - alpha)).astype(int)
    return np.average(above_nominal), np.average(probabilities)


def main(b, alpha, classifier, sample_size_obs, run, n_eval_grid=101, debug=False, seed=7,
         sample_size_check=1000, size_reference=1000):
    """Run the B' analysis for one classifier/run pair and write a CSV summary.

    The function trains an odds classifier, computes the test statistic on a
    set of check samples, and then, for every value of B' and every
    quantile-regression learner, fits the cutoffs and summarises how often the
    observed statistic exceeds the predicted cutoff. The indicator is
    summarised with an XGBoost classifier and with a logistic regression
    (point estimate, +1 standard error and +1.96 standard errors).

    Args:
        b: Sample size passed to ``train_clf``; replaced by 100 when ``debug``.
        alpha: Level passed to ``train_qr_algo``; predicted probabilities are
            compared against ``1 - alpha``.
        classifier: Key into ``classifier_dict``. Newlines are stripped and
            spaces replaced by ``-`` before it is used as a label in the
            output rows and file name.
        sample_size_obs: Observed sample size passed to the model's sampling
            functions; replaced by 5 when ``debug``.
        run: Key into ``model_dict`` selecting the model object to build.
        n_eval_grid: Only recorded in the output rows and file name.
        debug: When true, use small sample sizes, the small learner
            dictionary and ``[500, 1000]`` as the B' values.
        seed: Seed for ``np.random.seed``; re-applied before each B' draw.
        sample_size_check: Number of check samples requested from
            ``model_obj.sample_sim_check``.
        size_reference: Passed to ``model_obj.set_reference_g``.

    Returns:
        None. The results are written to a CSV file under
        ``sims/<model_obj.out_directory>``.
    """
    import os

    # Setup the variables, also to account for debug runs
    np.random.seed(seed)
    b = b if not debug else 100
    sample_size_obs = sample_size_obs if not debug else 5
    classifier_cde_dict = classifier_cde_dict_full if not debug else classifier_cde_dict_small

    # Create the model object that provides the sampling functions and grids
    print('----- Loading Simulations In')
    model_obj = model_dict[run]()

    # Also, calculate the reference distribution
    model_obj.set_reference_g(size_reference=size_reference)

    # Get the correct functions
    msnh_sampling_func = model_obj.sample_msnh_algo5
    grid_param = model_obj.grid
    clf_model = classifier_dict[classifier]
    gen_sample_func = model_obj.generate_sample
    # The lookup above needs the raw key; from here on only the label is used.
    classifier = classifier.replace('\n', '').replace(' ', '-')

    # Then generate first the thetas used for checking coverage
    theta_vec, x_vec = model_obj.sample_sim_check(
        sample_size=sample_size_check, n=sample_size_obs)

    # Compute Odds via classifier
    print('----- Calculating Odds')
    clf = train_clf(sample_size=b, clf_model=clf_model, gen_function=gen_sample_func,
                    d=model_obj.d, clf_name=classifier)
    tau_obs = np.array([
        compute_statistics_single_t0(
            clf=clf,
            obs_sample=x_vec[kk, :, :].reshape(-1, model_obj.d_obs),
            d=model_obj.d,
            d_obs=model_obj.d_obs,
            t0=theta_0,
            grid_param_t1=grid_param)
        for kk, theta_0 in enumerate(theta_vec)
    ])
    print('----- %s Trained' % classifier)

    # Loop over B'
    b_prime_vec = model_obj.b_prime_vec if not debug else [500, 1000]
    out_val = []
    out_cols = [
        'b_prime', 'classifier', 'class_cde', 'run', 'n_eval_grid', 'sample_check',
        'sample_reference', 'percent_correct_coverage', 'average_coverage',
        'percent_correct_coverage_lr', 'average_coverage_lr',
        'percent_correct_coverage_1std', 'average_coverage_1std',
        'percent_correct_coverage_2std', 'average_coverage_2std'
    ]
    n_check = theta_vec.shape[0]

    for b_prime in np.array(b_prime_vec).astype(int):
        # First generate the samples to train b_prime algorithm.
        # Re-seeding makes the draw for each B' reproducible on its own.
        np.random.seed(seed)
        theta_mat, sample_mat = msnh_sampling_func(b_prime=b_prime, sample_size=sample_size_obs)
        stats_mat = np.array([
            compute_statistics_single_t0(
                clf=clf,
                d=model_obj.d,
                d_obs=model_obj.d_obs,
                grid_param_t1=grid_param,
                t0=theta_0,
                obs_sample=sample_mat[kk, :, :])
            for kk, theta_0 in enumerate(theta_mat)
        ])

        # The context manager guarantees the bar is closed after each B'.
        with tqdm(total=len(classifier_cde_dict),
                  desc=r'Working on QR classifiers, b=%s' % b_prime) as pbar:
            for clf_name_qr, clf_params in sorted(classifier_cde_dict.items(),
                                                  key=lambda x: x[0]):

                # RF learners are skipped for large B'; still advance the bar
                # so it reaches its total.
                if b_prime > 10000 and 'RF' in clf_name_qr:
                    pbar.update(1)
                    continue

                t0_pred_vec = train_qr_algo(
                    model_obj=model_obj, theta_mat=theta_mat, stats_mat=stats_mat,
                    algo_name=clf_params[0], learner_kwargs=clf_params[1],
                    pytorch_kwargs=clf_params[2] if len(clf_params) > 2 else None,
                    alpha=alpha, prediction_grid=theta_vec)

                # 1 when the observed statistic exceeds the predicted cutoff.
                in_vec = np.array([
                    int(tau_obs[jj] > t0_pred_vec[jj]) for jj in range(n_check)
                ])

                # Calculate the mean.
                # `max_depth` is the XGBoost parameter name; `depth` is not a
                # recognised parameter and would leave the default depth in use.
                xgb_model = XGBClassifier(max_depth=3, n_estimators=100)
                xgb_model.fit(theta_vec.reshape(-1, model_obj.d), in_vec.reshape(-1, ))
                pred_grid = model_obj.pred_grid
                pred_cov_mean = xgb_model.predict_proba(pred_grid)[:, 1]
                percent_correct_coverage, average_coverage = _coverage_summary(
                    pred_cov_mean, alpha)

                # Calculate the upper limit
                # Use the model's parameter dimension, as in the XGBoost fit above.
                x = theta_vec.reshape(-1, model_obj.d)
                y = in_vec.reshape(-1, )

                # estimate the model
                X = sm.add_constant(x)
                with Suppressor():
                    logit_model = sm.Logit(y, X).fit(full_output=False)
                proba = logit_model.predict(X)
                percent_correct_coverage_lr, average_coverage_lr = _coverage_summary(
                    proba, alpha)

                # estimate confidence interval for predicted probabilities:
                # each row g = p * (1 - p) * x is the gradient of the predicted
                # probability w.r.t. the coefficients, and sqrt(g' cov g) is
                # the resulting standard error.
                cov = logit_model.cov_params()
                gradient = (proba * (1 - proba) * X.T).T
                std_errors = np.array(
                    [np.sqrt(np.dot(np.dot(g, cov), g)) for g in gradient])

                c = 1  # multiplier for confidence interval
                upper = np.maximum(0, np.minimum(1, proba + std_errors * c))
                percent_correct_coverage_upper, average_coverage_upper = _coverage_summary(
                    upper, alpha)

                upper_2std = np.maximum(0, np.minimum(1, proba + std_errors * 1.96))
                percent_correct_coverage_upper_2std, average_coverage_upper_2std = \
                    _coverage_summary(upper_2std, alpha)

                out_val.append([
                    b_prime, classifier, clf_name_qr, run, n_eval_grid, sample_size_check,
                    size_reference, percent_correct_coverage, average_coverage,
                    percent_correct_coverage_lr, average_coverage_lr,
                    percent_correct_coverage_upper, average_coverage_upper,
                    percent_correct_coverage_upper_2std, average_coverage_upper_2std
                ])

                pbar.update(1)

    # Saving the results
    out_df = pd.DataFrame.from_records(data=out_val, index=range(len(out_val)),
                                       columns=out_cols)
    out_dir = 'sims/%s' % model_obj.out_directory
    out_filename = (
        'b_prime_analysis_%s_%s_alpha%s_ngrid%s_sizecheck%s_bprimemax%s_logregint_%s.csv' % (
            classifier, run, str(alpha).replace('.', '-'), n_eval_grid,
            sample_size_check, np.max(b_prime_vec),
            datetime.today().strftime('%Y-%m-%d')))
    # Join with a separator-aware helper so the path is valid whether or not
    # `out_directory` ends with a slash, and make sure the directory exists so
    # the results are not lost at the very end of the run.
    os.makedirs(out_dir, exist_ok=True)
    out_df.to_csv(os.path.join(out_dir, out_filename))