def pixelSize_tests(pixelSize_tests_dir='../pixelSize_tests'):
    """Evaluate three datasets that differ in the file ``constants.DATA_NPY`` points to.

    Each dataset is loaded through ``data.get_train_test()`` after pointing
    ``constants.DATA_NPY`` at the matching file, passed through
    ``preprocess_tests.logAnd1Norm`` and handed to
    ``preprocess_tests.printdata`` under the tags ``size100``, ``size50`` and
    ``size25``.  The ``size25`` set is read from ``constants.NROTATED_NPY``.

    Side effect: ``constants.DATA_NPY`` is left at ``constants.SIZE100_NPY``.

    Args:
        pixelSize_tests_dir: directory argument forwarded to ``printdata``.
    """
    # Load in the order 50 -> 25 -> 100 so the global DATA_NPY ends up on
    # the 100 file, exactly as a straight-line version would leave it.
    load_plan = (
        ('size50', 'SIZE50_NPY'),
        ('size25', 'NROTATED_NPY'),
        ('size100', 'SIZE100_NPY'),
    )
    loaded = {}
    for tag, constant_name in load_plan:
        constants.DATA_NPY = getattr(constants, constant_name)
        X_tr, X_te, y_tr, y_te, w_tr, _ = data.get_train_test()
        loaded[tag] = [X_tr, X_te, y_tr, y_te, w_tr]

    # Preprocessing and reporting both run largest-first.
    report_order = ('size100', 'size50', 'size25')
    for tag in report_order:
        entry = loaded[tag]
        entry[0], entry[1] = preprocess_tests.logAnd1Norm(entry[0], entry[1])

    for tag in report_order:
        X_tr, X_te, y_tr, y_te, w_tr = loaded[tag]
        preprocess_tests.printdata(tag, X_tr, X_te, y_tr, y_te, w_tr,
                                   pixelSize_tests_dir)
def preprocess_tests(preprocess_tests_dir='../preprocess_tests'):
    """Compare input preprocessing schemes by passing each one to ``printdata``.

    Two stages:

    1. The rotated and non-rotated datasets (selected through
       ``constants.DATA_NPY``) are reported as ``rotated`` / ``n_rotated``.
    2. The non-rotated data is then rescaled in several ways -- L1 and L2
       norm, standardisation and min-max, each on the raw and on the
       ``safeLog`` inputs -- plus the plain log transform.  Every scaling
       statistic is computed from the training split and applied to both
       splits.

    Side effect: ``constants.DATA_NPY`` is left at ``constants.NROTATED_NPY``.

    Args:
        preprocess_tests_dir: directory argument forwarded to ``printdata``.
    """
    constants.DATA_NPY = constants.ROTATED_NPY
    rot_X_tr, rot_X_te, rot_y_tr, rot_y_te, rot_w, _ = data.get_train_test()
    constants.DATA_NPY = constants.NROTATED_NPY
    X_train, X_test, y_train, y_test, weights_train, _ = data.get_train_test()

    printdata('rotated', rot_X_tr, rot_X_te, rot_y_tr, rot_y_te, rot_w,
              preprocess_tests_dir)
    printdata('n_rotated', X_train, X_test, y_train, y_test, weights_train,
              preprocess_tests_dir)

    # The remaining tests use the non-rotated data (winner of previous test).
    log_train = safeLog(X_train)
    log_test = safeLog(X_test)

    # (label suffix, split the statistics are fitted on, held-out split)
    sources = (
        ('_log', log_train, log_test),
        ('', X_train, X_test),
    )

    variants = []

    # p-norm scaling for p = 1, 2
    for order in (1, 2):
        for suffix, fit, held_out in sources:
            fit_scaled = fit / safeNorm(fit, order)
            held_scaled = held_out / safeNorm(fit, order)
            variants.append(
                ('norm' + str(order) + suffix, fit_scaled, held_scaled))

    # standardisation
    for suffix, fit, held_out in sources:
        fit_scaled = (fit - safeMean(fit)) / safeStd(fit)
        held_scaled = (held_out - safeMean(fit)) / safeStd(fit)
        variants.append(('std' + suffix, fit_scaled, held_scaled))

    # min-max scaling
    for suffix, fit, held_out in sources:
        fit_scaled = (fit - np.min(fit, axis=0)) / minMax(fit) - 1
        held_scaled = (held_out - np.min(fit, axis=0)) / minMax(fit) - 1
        variants.append(('mm' + suffix, fit_scaled, held_scaled))

    # Everything is computed before anything is reported.
    for label, train_variant, test_variant in variants:
        printdata(label, train_variant, test_variant, y_train, y_test,
                  weights_train, preprocess_tests_dir)
    printdata('log', log_train, log_test, y_train, y_test, weights_train,
              preprocess_tests_dir)
def main():
    """Command-line entry point: load data, save the test split, train a model.

    ``--run_dir`` selects the output directory; when it is empty a new one
    is obtained from ``utils.make_run_dir()``.  The test samples are written
    to ``<run_dir>/<constants.TEST_DIR>`` as ``X_test.npy`` / ``y_test.npy``
    and ``train_model`` receives ``<run_dir>/<constants.WEIGHTS_DIR>``.
    """
    import argparse

    def ensure_dir(path):
        # Best effort: an OSError (e.g. the directory already exists) is
        # printed and otherwise ignored.
        try:
            os.makedirs(path)
        except OSError as err:
            print(err)

    cli = argparse.ArgumentParser(
        description='Compile, train and save a model.')
    cli.add_argument(
        '--run_dir',
        default=None,
        help='The directory in which weights and test samples should be saved.'
    )
    options = cli.parse_args()
    if not options.run_dir:
        options.run_dir = utils.make_run_dir()
        print('[test] New run directory created at {}'.format(options.run_dir))

    X_train, X_test, y_train, y_test, weights_train, _, _, _ = \
        data.get_train_test()

    test_dir = os.path.join(options.run_dir, constants.TEST_DIR)
    ensure_dir(test_dir)
    X_test_path = os.path.join(test_dir, 'X_test.npy')
    y_test_path = os.path.join(test_dir, 'y_test.npy')

    weights_dir = os.path.join(options.run_dir, constants.WEIGHTS_DIR)
    ensure_dir(weights_dir)

    np.save(X_test_path, X_test)
    np.save(y_test_path, y_test)
    train_model(X_train, X_test, y_train, y_test, weights_train, weights_dir)
def lcurve(lcurve_model_dir,
           total_data_size,
           step_size=10,
           min_size=10,
           max_size=100,
           recalc=False):
    """Train (or reload) a model per sample size and plot a learning curve.

    For every percentage ``p`` in ``range(min_size, max_size, step_size)`` a
    dataset of ``p`` percent of ``total_data_size`` samples is requested from
    ``data.get_train_test``.  The model for that size is loaded from
    ``<lcurve_model_dir>/learning<sample_size>.h5`` when the file exists and
    ``recalc`` is false; otherwise it is trained for 50 epochs and saved
    there.  The value of ``metrics.fixed_efficiency`` on the test split is
    plotted against the sample size and the figure is written to
    ``<lcurve_model_dir>/lcurve.png``.

    Args:
        lcurve_model_dir: directory holding the per-size models and the plot.
        total_data_size: number of samples that corresponds to 100 percent.
        step_size: percentage increment between consecutive points.
        min_size: first percentage evaluated.
        max_size: upper percentage bound; exclusive, as in ``range``.
        recalc: retrain even when a saved model is found.
    """
    percentages = range(min_size, max_size, step_size)
    # Size the result arrays from the range itself: a division-based count
    # is a float under Python 3 (rejected by np.zeros) and disagrees with
    # the number of iterations when the span is not a multiple of step_size.
    x = np.zeros(len(percentages))
    y = np.zeros(len(percentages))

    for index, percent in enumerate(percentages):
        # Floor division keeps the sample count (and therefore the model
        # file name) an integer on both Python 2 and Python 3.
        sample_size = int((percent * total_data_size) // 100)
        X_train, X_test, y_train, y_test, weights_train, _ = \
            data.get_train_test(n=sample_size)

        modelFileName = lcurve_model_dir + '/learning' + str(
            sample_size) + '.h5'
        if os.path.isfile(modelFileName) and not recalc:
            model = load_model(modelFileName)
        else:
            model = train.train_model(X_train,
                                      X_test,
                                      y_train,
                                      y_test,
                                      weights_train,
                                      lcurve_model_dir,
                                      epochs=50)
            model.save(modelFileName)

        y[index] = metrics.fixed_efficiency(X_test, y_test, model)
        x[index] = sample_size

    plt.plot(x, y)
    plt.xlabel('Samples Used', fontsize=15)
    plt.ylabel('fpr with tpr=0.5', fontsize=15)
    plt.title('Learning Curve', fontsize=19)
    plt.savefig(lcurve_model_dir + '/lcurve.png')
def obtain_linear_reg(
    model_type: int = 0,
    pca_dimensions: int = 3,
    pca_threshold: float = 0.95,
    from_year: int = 2010,
    to_year: int = 2018,
) -> LINEARREG:
    """
    Build a linear regression model from the NBA dataset.

    The statistics are fetched for the requested year range, missing values
    are replaced by 0, and the preprocessing named by
    ``MODELTYPES[model_type]`` is applied: optional scaling (standard or
    min-max) followed by optional PCA.  An unknown ``model_type`` leaves the
    data untouched.

    Args:
        model_type -> key into MODELTYPES selecting the preprocessing
        pca_dimensions -> the number of dimensions to apply to pca (if 0,
            auto-detect the dimensions)
        pca_threshold -> the threshold for information preserved by pca
        from_year -> the year we want our nba data to be selected from
        to_year -> the year we want our nba data up to

    Returns:
        LINEARREG holding the regression model, the (preprocessed) player
        stats, the win shares, and the features / target used for fitting
    """
    logging.debug("----OBTAINING NEW REGRESSION MODEL----")

    raw_df = get_nba_df(from_year=from_year, to_year=to_year)
    nba_stats, nba_ws = filter_cols(raw_df)
    nba_stats = nba_stats.fillna(0)

    scaling = MODELTYPES.get(model_type, "no scaling")
    logging.debug(f"Applying {scaling} to our data")

    # Step 1: scaling, for the plain scaled variants and the scaled+pca ones.
    if scaling in ("stdscaled", "stdpca"):
        nba_stats = apply_scaling(nba_stats)
    elif scaling in ("mmscaled", "mmpca"):
        nba_stats = apply_scaling(nba_stats, scale_type="MinMax")

    # Step 2: PCA on whatever step 1 produced.
    if scaling in ("pca", "stdpca", "mmpca"):
        nba_stats = apply_pca(nba_stats, pca_dimensions, pca_threshold)

    features, target = get_train_test(nba_stats, nba_ws)
    logging.debug(
        f"Creating linear regression model comprised of {len(nba_stats.columns)} features"
    )
    reg_model = create_linear_regression(features, target)
    logging.debug("----FINISHED OBTAINING REGRESSION MODEL----\n")

    return LINEARREG(reg_model, nba_stats, nba_ws, features, target)
def main() -> None:
    """
    Main functionality of our linear regression.

    Prepares the raw, PCA, scaled and PCA-then-scaled variants of the NBA
    stats (from the year 2000 on) and splits each of them with
    ``get_train_test``.  The per-variant ``create_linear_regression`` calls
    are currently disabled, so those splits are not used further; the models
    are obtained through ``obtain_linear_reg`` instead.
    """
    stats, win_shares = filter_cols(get_nba_df(from_year=2000))

    # Feature variants.  Each consumer gets its own fillna(0) copy.
    pca_frame = apply_pca(stats.fillna(0), dimensions=5)
    std_frame = apply_scaling(stats.fillna(0))
    mm_frame = apply_scaling(stats.fillna(0), scale_type="MinMax")
    std_of_pca = apply_scaling(pca_frame)
    mm_of_pca = apply_scaling(pca_frame, scale_type="MinMax")

    # Feature / target split for every variant, in the order:
    # raw, pca, std, min-max, std(pca), min-max(pca).
    candidate_frames = [
        stats.fillna(0),
        pca_frame,
        std_frame,
        mm_frame,
        std_of_pca,
        mm_of_pca,
    ]
    for frame in candidate_frames:
        _features, _target = get_train_test(frame, win_shares)

    # Default model.
    obtain_linear_reg()

    # Find number of dimensions that preserves 95% of the information from
    # our original model.
    obtain_linear_reg(model_type=4, pca_dimensions=0, pca_threshold=0.95)
def comp_all(i, datasets=datasets_s, n=150000):
    """Plot SIC and ROC curves of dataset ``i`` against every other dataset.

    ``datasets[i]`` is used as signal and each of the other first six
    entries as background.  For every pair the data is loaded through
    ``get_train_test`` (after pointing ``constants.SIG_H5`` / ``BG_H5`` at
    the matching files) and the saved model is read from
    ``../best_model/<a>_vs_<b>_model``, trying signal-first naming and
    falling back to background-first.

    Args:
        i: index of the signal dataset.
        datasets: sequence of dataset names (file names without ``.h5``).
        n: sample count forwarded to ``get_train_test``.
    """
    sig = datasets[i]
    name = 'all_' + sig + '_comps'

    X_tests, y_yests, models, labels = [], [], [], []

    # NOTE: the number of datasets considered is fixed at six.
    opponents = [idx for idx in range(6) if idx != i]
    for j in opponents:
        bg = datasets[j]
        constants.SIG_H5 = os.path.join(constants.DATA_DIR, sig + '.h5')
        constants.BG_H5 = os.path.join(constants.DATA_DIR, bg + '.h5')
        X_train, X_test, y_train, y_test, \
            _, _, sig_metadata, \
            bg_metadata, _ = get_train_test(n=n)

        model_name = sig + '_vs_' + bg
        if not os.path.isfile('../best_model/' + model_name + '_model'):
            # No signal-first model on disk; use the background-first name.
            model_name = bg + '_vs_' + sig
        model = load_model('../best_model/' + model_name + '_model')

        X_tests.append(X_test)
        y_yests.append(y_test)
        models.append(model)
        labels.append(model_name)

    # One True flag per loaded model.
    model_types = [True] * len(models)

    for prefix, flag in (('final_curves/sic_', True),
                         ('final_curves/roc_', False)):
        plot_n_roc_sic(name,
                       prefix + name,
                       X_tests,
                       y_yests,
                       models,
                       model_types,
                       labels,
                       flag,
                       fontfac=0.5)
def main():
    """Train one model for each of four fixed signal/background comparisons.

    For every comparison the signal and background files are selected via
    ``constants.SIG_H5`` / ``constants.BG_H5``, ``constants.MODEL_NAME`` is
    set to ``<sample>_model``, 150000 samples are loaded and ``train`` is
    called.  For the first two comparisons (qq vs gg) the mean image of each
    class in the training set is also rendered with ``makeImage``.
    """
    # (signal file, background file, sample name, comparison title)
    comparisons = (
        ('h_qq_rot_charged.h5', 'h_gg_rot_charged.h5', 'charged',
         ' qq vs gg'),
        ('h_qq_rot_standard.h5', 'h_gg_rot_standard.h5', 'standard',
         ' qq vs gg'),
        ('h_gg_rot_standard.h5', 'h_gg_rot_charged.h5', 'gluons',
         ' charged v standard'),
        ('h_qq_rot_standard.h5', 'h_qq_rot_charged.h5', 'quarks',
         ' charged v standard'),
    )

    for run_idx, (sig_file, bg_file, sample, cmps) in enumerate(comparisons):
        constants.SIG_H5 = os.path.join(constants.DATA_DIR, sig_file)
        constants.BG_H5 = os.path.join(constants.DATA_DIR, bg_file)
        constants.MODEL_NAME = sample + '_model'

        X_train, X_test, y_train, y_test, \
            weights_train, weights_test, sig_metadata, \
            bg_metadata, _ = get_train_test(n=150000)

        train(X_train, X_test, y_train, y_test, weights_train, sample, cmps)

        if run_idx >= 2:
            continue
        # Only the qq-vs-gg runs get average images: label 1.0 is saved as
        # "quark", label 0.0 as "gluon".
        makeImage(np.mean(X_train[y_train == 1.0], axis=0),
                  'Average_' + sample + '_quark')
        makeImage(np.mean(X_train[y_train == 0.0], axis=0),
                  'Average_' + sample + '_gluon')
def main():
    """Command-line entry point: cluster the test data and plot the clusters.

    The test samples are read from ``<run_dir>/test_data_x.npy`` and
    ``<run_dir>/test_data_y.npy`` when both files exist.  Otherwise they are
    taken from ``data.get_train_test()``, optionally regrouped so that all
    samples with label 1 come first (labels rewritten as 1.0 / 0.0), and
    cached to those two files.  The result is passed to ``plot_clusters``.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Plot clusters on given data.')
    parser.add_argument(
        '--run_dir',
        default='../clusters',
        help='The directory in which cluster plots should be saved.')
    # type=int: without it a value given on the command line would reach
    # plot_clusters as a string, while the default is an int.
    parser.add_argument('--n_clusters',
                        '-n',
                        type=int,
                        default=20,
                        help='The number of clusters to use.')
    # NOTE(review): store_true with default=True means this option is True
    # whether or not the flag is given -- confirm whether that is intended.
    parser.add_argument(
        '--separate',
        '-s',
        default=True,
        action='store_true',
        help='If set, separate octet and singlet data for clustering.')
    args = parser.parse_args()

    if not args.run_dir:
        args.run_dir = utils.make_run_dir()
        print('[clustering] New run directory created at {}'.format(
            args.run_dir))

    x_cache = args.run_dir + '/test_data_x.npy'
    y_cache = args.run_dir + '/test_data_y.npy'

    if os.path.isfile(x_cache) and os.path.isfile(y_cache):
        X = np.load(x_cache)
        y = np.load(y_cache)
    else:
        _, X, _, y, _, _ = data.get_train_test()
        if args.separate:
            # Put every label-1 sample first, followed by all the others.
            mask = (y == 1)
            X_sig = X[mask]
            y_sig = np.ones(X_sig.shape[0])
            X_bg = X[np.logical_not(mask)]
            y_bg = np.zeros(X_bg.shape[0])
            X = np.concatenate((X_sig, X_bg), axis=0)
            y = np.concatenate((y_sig, y_bg), axis=0)
        np.save(x_cache, X)
        np.save(y_cache, y)

    plot_clusters(args.run_dir, reshape(X), y, args.n_clusters, args.separate)
def main():
    """Command-line entry point: generate a learning curve.

    Parses ``--save``, ``--step_size`` and ``--min_size``, determines the
    total number of available samples (train plus test rows returned by
    ``data.get_train_test()``) and hands everything to ``lcurve``.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Generate a learning curve.')
    # Required: the value is passed to lcurve as the model directory, and a
    # missing value would only surface after the data has been loaded.
    parser.add_argument(
        '--save',
        default=None,
        required=True,
        help='The directory in which models and the curve will be saved.')
    # argparse %-formats help strings, so a literal percent sign must be
    # written as '%%'; a bare '%' breaks the rendering of --help.
    parser.add_argument(
        '--step_size',
        type=int,
        default=10,
        help=
        'The step size, as a percentage (i.e. step_size = 5 means 5%% of total data).'
    )
    parser.add_argument('--min_size',
                        type=int,
                        default=10,
                        help='The min size of data to use, as a percentage.')
    args = parser.parse_args()

    X_train, X_test, _, _, _, _ = data.get_train_test()
    total_size = X_train.shape[0] + X_test.shape[0]
    lcurve(args.save, total_size, args.step_size, args.min_size)
def sen_stud(datasets, ischarged):
    """Evaluate saved pairwise models on alternative samples and plot the curves.

    For every pair ``(i, j)`` with ``j < i`` among the first four datasets
    the model ``../best_model/<model_name>_model`` is loaded and evaluated
    on the nominal sample (labelled ``pp 14``) and on the files under
    ``sensitivity_study/`` tagged ``col_1``, ``col_2``, ``pp_21``, ``pp_25``
    and ``pp_26``.  ``col_1`` is skipped when the model name contains
    ``qx_qg``.  True labels and model predictions for each variant are saved
    under ``final_curves/sensitivity_study/yvals/`` and combined SIC / ROC
    plots are produced with ``plot_n_roc_sic``.

    Side effects: overwrites ``constants.SIG_H5``, ``constants.BG_H5`` and
    ``constants.MODEL_NAME``.

    Args:
        datasets: sequence of dataset base names (at least four entries).
        ischarged: selects the ``_rot_charged`` input files and the
            ``charged`` variant files; otherwise the plain / ``standard``
            ones are used.
    """
    # (file tag, plot label) of the variant samples, in plotting order.
    variant_labels = (
        ('col_1', 'Color 1'),
        ('col_2', 'Color 2'),
        ('pp_21', 'pp 21'),
        ('pp_25', 'pp 25'),
        ('pp_26', 'pp 26'),
    )

    for i in range(4):
        for j in range(i):
            sig = datasets[i]
            bg = datasets[j]

            # NOTE(review): the charged branch reads '<name>_rot_charged.h5'
            # but loads the model named without that suffix, and the other
            # branch does the reverse -- confirm this pairing is intended.
            if ischarged:
                file_suffix = '_rot_charged.h5'
                charge = 'charged'
                model_name = sig + '_vs_' + bg
            else:
                file_suffix = '.h5'
                charge = 'standard'
                model_name = sig + '_rot_charged_vs_' + bg + '_rot_charged'

            constants.SIG_H5 = os.path.join(constants.DATA_DIR,
                                            sig + file_suffix)
            constants.BG_H5 = os.path.join(constants.DATA_DIR,
                                           bg + file_suffix)
            constants.MODEL_NAME = model_name + '_model'
            model = load_model('../best_model/' + model_name + '_model')

            # Nominal sample.
            _, X_test_14, _, y_test_14, \
                _, _, _, _ = get_train_test(n=150000)

            skip_col_1 = "qx_qg" in model_name
            curves = []
            for tag, label in variant_labels:
                if tag == 'col_1' and skip_col_1:
                    continue
                sample = '_' + tag + '_' + charge
                constants.SIG_H5 = os.path.join(
                    constants.DATA_DIR,
                    'sensitivity_study/' + sig + sample + '.h5')
                constants.BG_H5 = os.path.join(
                    constants.DATA_DIR,
                    'sensitivity_study/' + bg + sample + '.h5')
                _, X_variant, _, y_variant, \
                    _, _, _, _ = get_train_test(n=30000, train_size=0)

                pair = sig + '_vs_' + bg + sample
                np.save('final_curves/sensitivity_study/yvals/true_' + pair,
                        y_variant)
                np.save('final_curves/sensitivity_study/yvals/hat_' + pair,
                        model.predict(X_variant))
                curves.append((label, X_variant, y_variant))

            # The nominal sample is listed after the colour variants and
            # before the three pp variants.
            curves.insert(len(curves) - 3, ('pp 14', X_test_14, y_test_14))

            labels = [entry[0] for entry in curves]
            X_tests = [entry[1] for entry in curves]
            y_tests = [entry[2] for entry in curves]
            models = [model] * len(curves)
            model_types = [True] * len(curves)

            plot_n_roc_sic(
                model_name,
                'final_curves/sensitivity_study/sic_sens_' + model_name,
                X_tests, y_tests, models, model_types, labels, True)
            plot_n_roc_sic(
                model_name,
                'final_curves/sensitivity_study/roc_sens_' + model_name,
                X_tests, y_tests, models, model_types, labels, False)
def _pixel_label_pcc(X_flat, labels):
    """Return the Pearson correlation of every column of X_flat with labels."""
    # A single label vector is enough: building a full (samples x pixels)
    # matrix of repeated labels only to slice its columns wastes memory.
    label_vec = np.asarray(labels, dtype=float).reshape(X_flat.shape[0])
    pcc = np.zeros(X_flat.shape[1])
    for col in range(X_flat.shape[1]):
        pcc[col] = np.corrcoef(X_flat[:, col], label_vec)[0, 1]
    return pcc


def _draw_pcc_image(image, title):
    """Draw a PCC map on the current figure with the shared axis styling."""
    plt.imshow(image,
               interpolation="none",
               cmap='seismic',
               vmin=-0.2,
               vmax=0.2)
    plt.xlabel('Proportional to Translated Pseudorapidity', fontsize=10)
    plt.ylabel('Proportional to Translated Azimuthal Angle', fontsize=10)
    plt.title(title, fontsize=15)
    plt.colorbar()


def plot_pearson(run_dir,
                 save_dir,
                 name,
                 show=False,
                 only_true=False,
                 show_obs=False,
                 provide_data=False,
                 X_test=None,
                 y_test=None,
                 model=None):
    """Plot per-pixel Pearson correlation (PCC) maps for a test set.

    Up to three images are produced, each as ``.png`` and ``.pdf``:

    * ``<save_dir>truths/<name>_pearson_truth`` -- PCC between pixel
      intensity and the true label;
    * ``<save_dir>NNs/<name>_pearson_nn`` -- PCC between pixel intensity and
      the thresholded (> 0.5) model prediction;
    * ``<save_dir>diffs/<name>_pearson_diff`` -- the second map minus the
      first.

    The first two maps are also stored with ``np.save`` under
    ``<save_dir>truths/``; because the given names end in ``.png``, numpy
    writes them as ``..._pearson_truth.png.npy`` and
    ``..._pearson_net.png.npy``.

    Args:
        run_dir: run directory passed to ``utils.get_model_test`` when
            neither ``only_true`` nor ``provide_data`` is set.
        save_dir: output prefix; concatenated as-is, so it should end with a
            path separator.
        name: base name of the output files.
        show: call ``plt.show()`` after each image.
        only_true: load the test data from ``data.get_train_test()`` and
            stop after the truth map (no model is used).
        show_obs: overlay four fixed green outlines on the truth map.
        provide_data: use the ``X_test`` / ``y_test`` / ``model`` arguments
            instead of loading them.
        X_test: test images; reshaped to ``(samples, size * size)``.
        y_test: true labels, one per sample.
        model: object with a ``predict`` method.
    """
    if only_true:
        _, X_test, _, y_test, _, _, _, _, _ = data.get_train_test()
    elif not provide_data:
        model, X_test, y_test = utils.get_model_test(run_dir)

    X_test_re = X_test.reshape(X_test.shape[0], size * size)

    # --- correlation with the true labels ---
    X_true = _pixel_label_pcc(X_test_re, y_test)
    X_image = X_true.reshape(size, size)
    plt.clf()
    fig, ax = plt.subplots(1)
    _draw_pcc_image(X_image, 'PCC for pixel intensity and truthful output')
    if show_obs:
        outline = dict(linewidth=1, edgecolor='g', facecolor='none')
        ax.add_patch(patches.Circle((32, 32), 1, **outline))
        ax.add_patch(patches.Circle((32, 32), 6, **outline))
        ax.add_patch(patches.Circle((32, 43), 5, **outline))
        ax.add_patch(patches.Ellipse((32, 53), 5, 12, **outline))
    plt.savefig(save_dir + 'truths/' + name + '_pearson_truth.png')
    plt.savefig(save_dir + 'truths/' + name + '_pearson_truth.pdf')
    np.save(save_dir + 'truths/' + name + '_pearson_truth.png', X_image)
    if show:
        plt.show()
    if only_true:
        return

    # --- correlation with the thresholded network output ---
    y_hat = model.predict(X_test) > 0.5
    X_net = _pixel_label_pcc(X_test_re, y_hat)
    X_image = X_net.reshape(size, size)
    plt.clf()
    _draw_pcc_image(X_image, 'PCC for pixel intensity and network output')
    plt.savefig(save_dir + 'NNs/' + name + '_pearson_nn.png')
    plt.savefig(save_dir + 'NNs/' + name + '_pearson_nn.pdf')
    # The array copy of the network map is stored next to the truth map.
    np.save(save_dir + 'truths/' + name + '_pearson_net.png', X_image)
    if show:
        plt.show()

    # --- difference between the two maps ---
    X_image = (X_net - X_true).reshape(size, size)
    plt.clf()
    _draw_pcc_image(X_image, 'Difference between net and true PCCs')
    plt.savefig(save_dir + 'diffs/' + name + '_pearson_diff.png')
    plt.savefig(save_dir + 'diffs/' + name + '_pearson_diff.pdf')
    if show:
        plt.show()
    print('[Pearson] Done!')
def get_model_test(run_dir):
    """Load the saved model of a run together with the test split.

    The model is read from
    ``<run_dir>/<constants.WEIGHTS_DIR>/<constants.MODEL_NAME>``; the test
    data comes from ``data.get_train_test()``.

    Args:
        run_dir: directory of the run whose model should be loaded.

    Returns:
        A ``(model, X_test, y_test)`` tuple.
    """
    model_path = os.path.join(run_dir, constants.WEIGHTS_DIR,
                              constants.MODEL_NAME)
    model = load_model(model_path)

    # Only the second and fourth of the nine returned values are needed.
    _, X_test, _, y_test, _, _, _, _, _ = data.get_train_test()
    return (model, X_test, y_test)
sys.path.append("../utilities") sys.path.append("../../SMAC3") from smac.configspace import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter, \ UniformFloatHyperparameter, UniformIntegerHyperparameter from smac.tae.execute_func import ExecuteTAFuncDict from smac.scenario.scenario import Scenario from smac.facade.smac_facade import SMAC import data n = 500000 X_train, X_test, y_train, y_test, weights_train, _ = data.get_train_test(n=n) X_val = X_train[:int(n * 0.2)] y_val = y_train[:int(n * 0.2)] def print_incumb(cfg): print('Best model saved in: ' + '../../models/' \ + str(cfg['first_kernel_size']) + '_' \ + str(cfg['conv_filters']) + '_' \ + str(cfg['n_conv']) + '_' \ + str(cfg['dropout']) + '_' \ + cfg['activation'] + '_' \ + str(cfg['dense_width']) + '_' \ + str(cfg['dense_length']) + '_' \ + cfg['optimizer'] + '_' \ + str(cfg['optimizer_lr']) + '_' \
def pipeline(datasets, ischarged, usePrev=True, skip=False, n=150000):
    """Run the full training / evaluation chain for every dataset pair.

    For each pair ``(i, j)`` with ``j < i`` (``datasets[i]`` as signal,
    ``datasets[j]`` as background) this:

    * trains (or reuses, see ``usePrev``) the image model via ``train``;
    * renders the average signal / background image and the Pearson maps;
    * histograms the two pull columns of the metadata and the observables
      returned by ``calcObs``, saving the first ten observables under
      ``final_curves/tjets/``;
    * fits ``adaboost`` classifiers on the observables and on the pulls;
    * plots SIC / ROC curves comparing the three classifiers;
    * fills ``n_hyp_tbl[i, j]`` / ``n_hyp_tbl[j, i]`` from ``n_pass_hyp``
      on the first 1000 test samples with ``flip=0`` / ``flip=1``;
    * saves true labels and predictions of all three classifiers under
      ``y_vals/``.

    The table (initialised to -1) is printed at the end.

    Args:
        datasets: sequence of dataset names.
        ischarged: not used inside this function.
        usePrev: forwarded to ``train``.
        skip: skip a pair whose output files already exist.
        n: sample count forwarded to ``get_train_test``.
    """
    n_hyp_tbl = np.zeros((len(datasets), len(datasets))) - 1

    for i in range(len(datasets)):
        for j in range(i):
            sig = datasets[i]
            bg = datasets[j]
            model_name = setConstants(sig, bg)

            # A pair counts as done when its last y_vals file, the Pearson
            # truth array and the saved model are all present.
            artefacts = (
                'y_vals/y_pull_hat_' + model_name + '.npy',
                'final_curves/pearsons/truths/' + model_name +
                '_pearson_truth.png.npy',
                '../best_model/' + sig + '_vs_' + bg + '_model',
            )
            if skip and all(os.path.exists(path) for path in artefacts):
                print('Skipped ' + sig + ' vs ' + bg + '!')
                continue

            X_train, X_test, y_train, y_test, \
                weights_train, weights_test, sig_metadata, \
                bg_metadata = get_train_test(n=n)

            model = train(X_train, X_test, y_train,
                          y_test, weights_train, model_name, usePrev=usePrev)

            # Average images and Pearson maps.
            makeImage(np.mean(X_train[y_train == 1.0], axis=0),
                      'Average_' + sig)
            makeImage(np.mean(X_train[y_train == 0.0], axis=0),
                      'Average_' + bg)
            plot_pearson('../best_model/', 'final_curves/pearsons/',
                         model_name, show_obs=True, provide_data=True,
                         X_test=X_test, y_test=y_test, model=model)

            # Observables, split by class.
            obs_train = calcObs(X_train)
            sig_obs = obs_train[y_train == 1]
            bg_obs = obs_train[y_train == 0]

            name = model_name + '_'
            hist([sig_metadata.iloc[:, 0], bg_metadata.iloc[:, 0]],
                 name + 'pull1')
            hist([sig_metadata.iloc[:, 1], bg_metadata.iloc[:, 1]],
                 name + 'pull2')
            for k in range(10):
                obs_tag = 'obs' + str(k + 1)
                hist([sig_obs[:, k], bg_obs[:, k]], name + obs_tag)
                np.save('final_curves/tjets/' + sig + '_' + obs_tag,
                        sig_obs[:, k])
                np.save('final_curves/tjets/' + bg + '_' + obs_tag,
                        bg_obs[:, k])
            # Observables 2-4 are drawn once more; 3 and 4 with log=True.
            hist([sig_obs[:, 1], bg_obs[:, 1]], name + 'obs2')
            for col in (2, 3):
                hist([sig_obs[:, col], bg_obs[:, col]],
                     name + 'obs' + str(col + 1), log=True)

            obs_test = calcObs(X_test)
            obs_model = adaboost(obs_train, y_train)

            # Pull classifier: two feature columns, signal rows first.
            pull1 = np.concatenate((sig_metadata.iloc[:, 0],
                                    bg_metadata.iloc[:, 0]))
            pull2 = np.concatenate((sig_metadata.iloc[:, 1],
                                    bg_metadata.iloc[:, 1]))
            pull_X = np.column_stack((pull1, pull2))
            pull_y = np.concatenate(
                (np.ones(len(sig_metadata.iloc[:, 0])),
                 np.zeros(len(bg_metadata.iloc[:, 0]))))
            pull_train, pull_test, y_train_pull, y_test_pull = \
                train_test_split(pull_X, pull_y, train_size=0.8)
            pull_model = adaboost(pull_train, y_train_pull)

            # Combined curves for the three classifiers.
            X_tests = [X_test, obs_test, pull_test]
            y_tests = [y_test, y_test, y_test_pull]
            models = [model, obs_model, pull_model]
            model_types = [True, False, False]
            labels = ['CNN', 'OBS', 'Pull']
            for prefix, flag in (('final_curves/sic_', True),
                                 ('final_curves/roc_', False)):
                plot_n_roc_sic(model_name, prefix + model_name,
                               X_tests, y_tests, models, model_types,
                               labels, flag)

            n_hyp_tbl[i, j] = n_pass_hyp(X_test[:1000, ...], y_test[:1000],
                                         model, flip=0)
            n_hyp_tbl[j, i] = n_pass_hyp(X_test[:1000, ...], y_test[:1000],
                                         model, flip=1)

            # Save all y's.
            np.save('y_vals/y_nn_test_' + model_name, y_test)
            y_hat = model.predict(X_test)
            np.save('y_vals/y_nn_hat_' + model_name, y_hat)

            np.save('y_vals/y_obs_test_' + model_name, y_test)
            obs_hat = obs_model.predict_proba(obs_test)
            np.save('y_vals/y_obs_hat_' + model_name, obs_hat[:, 1])

            np.save('y_vals/y_pull_test_' + model_name, y_test_pull)
            pull_hat = pull_model.predict_proba(pull_test)
            np.save('y_vals/y_pull_hat_' + model_name, pull_hat[:, 1])

    print(n_hyp_tbl)