def main():
    print('Using Keras version: ', tf.keras.__version__)
    usage = 'usage: %prog [options]'
    parser = argparse.ArgumentParser(usage)
    parser.add_argument('-t', '--train_model', dest='train_model',
                        help='Option to train model or simply make diagnostic plots (0=False, 1=True)',
                        default=0, type=int)
    parser.add_argument('-w', '--classweights', dest='classweights',
                        help='Option to choose class weights', default='InverseSRYields', type=str)
    parser.add_argument('-s', '--sel', dest='selection',
                        help='Option to choose selection', default='tH', type=str)
    args = parser.parse_args()
    do_model_fit = args.train_model
    classweights_name = args.classweights
    selection = args.selection

    # Number of classes to use
    number_of_classes = 4

    # Create instance of output directory where all results are saved.
    output_directory = './2017samples_%s_%s/' % (selection, classweights_name)
    check_dir(output_directory)
    # Create plots subdirectory
    plots_dir = os.path.join(output_directory, 'plots/')

    input_var_jsonFile = open('input_vars_SigRegion_wFwdJet.json', 'r')
    if selection == 'tH':
        selection_criteria = '(is_tH_like_and_not_ttH_like==0 || is_tH_like_and_not_ttH_like==1)'  # && n_presel_jet>=3

    # Load variables from .json
    variable_list = list(json.load(input_var_jsonFile, encoding="utf-8").items())

    # Create list of headers for dataset .csv
    column_headers = []
    for key, var in variable_list:
        column_headers.append(key)
    column_headers.append('EventWeight')
    column_headers.append('xsec_rwgt')
    column_headers.append('nEvent')

    # Create instance of the input files directory
    inputs_file_path = '/hpcfs/bes/mlgpu/kapoor/samples'

    # Load ttree into .csv including all variables listed in column_headers
    print('<train-DNN> Input file path: ', inputs_file_path)
    outputdataframe_name = '%s/output_dataframe_%s.csv' % (output_directory, selection)
    if os.path.isfile(outputdataframe_name):
        data = pandas.read_csv(outputdataframe_name)
        print('<train-DNN> Loading data .csv from: %s . . . . ' % (outputdataframe_name))
    else:
        print('<train-DNN> Creating new data .csv @: %s . . . . ' % (inputs_file_path))
        data = load_data(inputs_file_path, column_headers, selection_criteria)
        data.to_csv(outputdataframe_name, index=False)
        data = pandas.read_csv(outputdataframe_name)

    # Make instance of plotter tool
    Plotter = plotter()

    # Create statistically independent train/test datasets (used to train/evaluate the network)
    traindataset, valdataset = train_test_split(data, test_size=0.2)
    #valdataset.to_csv('valid_dataset.csv', index=False)
    #print('<train-DNN> Training dataset shape: ', traindataset.shape)
    #print('<train-DNN> Validation dataset shape: ', valdataset.shape)

    # Remove last three columns (EventWeight, xsec_rwgt and nEvent) from column headers
    training_columns = column_headers[:-3]
    print('<train-DNN> Training features: ', training_columns)

    # Select data from columns under the remaining column headers in traindataset
    X_train = traindataset[training_columns].values
    # Select data from 'target' as target for MVA
    Y_train = traindataset.target.astype(int)
    X_test = valdataset[training_columns].values
    Y_test = valdataset.target.astype(int)

    num_variables = len(training_columns)

    # Create dataframe containing input features only (for correlation matrix)
    train_df = data.iloc[:traindataset.shape[0]]
    train_df.drop(['EventWeight'], axis=1, inplace=True)
    train_df.drop(['xsec_rwgt'], axis=1, inplace=True)

    ## Input variable correlation plot
    correlation_plot_file_name = 'correlation_plot.png'
    #Plotter.correlation_matrix(train_df)
    #Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name)

    #sampleweights = traindataset.loc[:,'sampleWeight']*traindataset.loc[:,'EventWeight']
    sampleweights = traindataset.loc[:, 'sampleWeight']
    sampleweights = np.array(sampleweights)

    # Dictionaries of class weights to combat class imbalance
    if classweights_name == 'balanced':
        tuned_weighted = class_weight.compute_class_weight('balanced', np.unique([0, 1, 2, 3]), Y_train)
    if classweights_name == 'tunedweights':
        tuned_weighted = {0: 7.67, 1: 1.0, 2: 4.62, 3: 7.67}

    # Per-event weights so we can correctly apply event weights to diagnostic plots
    train_weights = traindataset['EventWeight'].values * traindataset['xsec_rwgt'].values
    test_weights = valdataset['EventWeight'].values * valdataset['xsec_rwgt'].values

    # Fit label encoder to Y_train
    newencoder = LabelEncoder()
    newencoder.fit(Y_train)
    # Transform to encoded array
    encoded_Y = newencoder.transform(Y_train)
    encoded_Y_test = newencoder.transform(Y_test)
    # Transform to one-hot encoded arrays
    # Y_train = np_utils.to_categorical(encoded_Y)
    # Y_test = np_utils.to_categorical(encoded_Y_test)
    Y_train = to_categorical(encoded_Y)
    Y_test = to_categorical(encoded_Y_test)

    optimizer = 'Adam'  # 'Nadam'
    if do_model_fit == 1:
        histories = []
        labels = []
        # Define model and early stopping
        early_stopping_monitor = EarlyStopping(patience=100, monitor='val_loss', verbose=1)
        model3 = baseline_model(num_variables, optimizer, number_of_classes)
        # Fit the model
        # Batch size = examples before updating weights (larger = faster training)
        # Epoch = one pass over the data (useful for periodic logging and evaluation)
        #history3 = model3.fit(X_train,Y_train,validation_split=0.2,epochs=500,batch_size=1000,verbose=1,shuffle=True,class_weight=tuned_weighted,callbacks=[early_stopping_monitor])
        history3 = model3.fit(X_train, Y_train, validation_split=0.2, epochs=300, batch_size=1500,
                              verbose=1, shuffle=True, sample_weight=sampleweights,
                              callbacks=[early_stopping_monitor])
        histories.append(history3)
        labels.append(optimizer)
        # Make plot of loss function evolution
        Plotter.plot_training_progress_acc(histories, labels)
        acc_progress_filename = 'DNN_acc_wrt_epoch.png'
        Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename)
        # Which model do you want the rest of the plots for?
        model = model3
    else:
        # Which model do you want to load?
        model_name = os.path.join(output_directory, 'model.h5')
        print('<train-DNN> Loaded Model: %s' % (model_name))
        model = load_trained_model(model_name, num_variables, optimizer, number_of_classes)

    # Node probabilities for training sample events
    result_probs = model.predict(np.array(X_train))
    result_classes = model.predict_classes(np.array(X_train))
    # Node probabilities for testing sample events
    result_probs_test = model.predict(np.array(X_test))
    result_classes_test = model.predict_classes(np.array(X_test))

    # Store model in file
    model_output_name = os.path.join(output_directory, 'model.h5')
    model.save(model_output_name)
    weights_output_name = os.path.join(output_directory, 'model_weights.h5')
    model.save_weights(weights_output_name)
    model_json = model.to_json()
    model_json_name = os.path.join(output_directory, 'model_serialised.json')
    with open(model_json_name, 'w') as json_file:
        json_file.write(model_json)
    model.summary()
    model_schematic_name = os.path.join(output_directory, 'model_schematic.png')
    plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True)

    # Initialise output directory.
    Plotter.plots_directory = plots_dir
    Plotter.output_directory = output_directory

    # Make overfitting plots of output nodes
    Plotter.overfitting(model, Y_train, Y_test, result_probs, result_probs_test, plots_dir, train_weights, test_weights)

    # Get true process values for testing dataset
    original_encoded_test_Y = []
    for i in range(len(result_probs_test)):
        if Y_test[i][0] == 1:
            original_encoded_test_Y.append(0)
        if Y_test[i][1] == 1:
            original_encoded_test_Y.append(1)
        if Y_test[i][2] == 1:
            original_encoded_test_Y.append(2)
        if Y_test[i][3] == 1:
            original_encoded_test_Y.append(3)

    # Get true process integers for training dataset
    original_encoded_train_Y = []
    for i in range(len(result_probs)):
        if Y_train[i][0] == 1:
            original_encoded_train_Y.append(0)
        if Y_train[i][1] == 1:
            original_encoded_train_Y.append(1)
        if Y_train[i][2] == 1:
            original_encoded_train_Y.append(2)
        if Y_train[i][3] == 1:
            original_encoded_train_Y.append(3)

    # Get true class values for testing dataset
    result_classes_test = newencoder.inverse_transform(result_classes_test)
    result_classes_train = newencoder.inverse_transform(result_classes)

    # Create confusion matrices for training and testing performance
    Plotter.conf_matrix(original_encoded_train_Y, result_classes_train, train_weights, 'index')
    Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TRAIN.png')
    Plotter.conf_matrix(original_encoded_test_Y, result_classes_test, test_weights, 'index')
    Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TEST.png')
    Plotter.conf_matrix(original_encoded_train_Y, result_classes_train, train_weights, '')
    Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TRAIN.png')
    Plotter.conf_matrix(original_encoded_test_Y, result_classes_test, test_weights, '')
    Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TEST.png')

    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs, original_encoded_test_Y, result_probs_test, 0, 'ttHnode')
    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs, original_encoded_test_Y, result_probs_test, 1, 'Other')
    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs, original_encoded_test_Y, result_probs_test, 2, 'ttWnode')
    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs, original_encoded_test_Y, result_probs_test, 3, 'tHQnode')
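
# --- Illustrative sketch (not the repository's actual implementation) ---
# baseline_model() is called above but defined elsewhere in this repo. A minimal
# multiclass Keras builder matching the call signature
# baseline_model(num_variables, optimizer, number_of_classes) could look like the
# following; the layer sizes and activations here are assumptions.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def baseline_model_sketch(num_variables, optimizer, number_of_classes):
    # Simple fully connected network: input -> two hidden layers -> softmax output.
    model = Sequential()
    model.add(Dense(64, input_dim=num_variables, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(number_of_classes, activation='softmax'))
    # Categorical cross-entropy matches the one-hot encoded Y_train used above;
    # passing the optimizer name as a string ('Adam'/'Nadam') is valid Keras usage.
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model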
def main():
    print('Using Keras version: ', keras.__version__)
    usage = 'usage: %prog [options]'
    parser = argparse.ArgumentParser(usage)
    parser.add_argument('-t', '--train_model', dest='train_model',
                        help='Option to train model or simply make diagnostic plots (0=False, 1=True)',
                        default=0, type=int)
    parser.add_argument('-r', '--region', dest='region',
                        help='Option to choose SigRegion or CtrlRegion', default='SigRegion', type=str)
    parser.add_argument('-w', '--classweights', dest='classweights',
                        help='Option to choose class weights (InverseNEventsTR, InverseSRYields or BalancedWeights)',
                        default='InverseNEventsTR', type=str)
    parser.add_argument('-s', '--sel', dest='selection',
                        help='Option to choose selection', default='geq4j', type=str)
    args = parser.parse_args()
    do_model_fit = args.train_model
    region = args.region
    classweights_name = args.classweights
    selection = args.selection
    number_of_classes = 4

    # Create instance of output directory where all results are saved.
    output_directory = '2019-06-18_CNN_LLFOnly_FunkAPI_particleinput_%s_%s_%s/' % (selection, classweights_name, region)
    check_dir(output_directory)
    # Create plots subdirectory
    plots_dir = os.path.join(output_directory, 'plots/')

    lowlevel_invar_jsonFile = open('LOWLEVEL_invars_conv1DNN.json', 'r')
    highlevel_invar_jsonFile = open('HIGHLEVEL_invars_conv1DNN.json', 'r')

    if selection == 'geq4j':
        selection_criteria = 'Jet_numLoose>=4'
    if selection == 'geq3j':
        selection_criteria = 'Jet_numLoose>=3'
    if selection == 'eeq3j':
        selection_criteria = 'Jet_numLoose==3'

    # WARNING !!!!
    #variable_list = json.load(input_var_jsonFile,encoding="utf-8").items()
    lowlevel_invar_list = json.load(lowlevel_invar_jsonFile, encoding="utf-8").items()
    highlevel_invar_list = json.load(highlevel_invar_jsonFile, encoding="utf-8").items()

    lowlevel_column_headers = []
    for key, var in lowlevel_invar_list:
        lowlevel_column_headers.append(key)
    lowlevel_column_headers.append('EventWeight')
    lowlevel_column_headers.append('xsec_rwgt')

    highlevel_column_headers = []
    for key, var in highlevel_invar_list:
        if 'hadTop_BDT' in key:
            key = 'hadTop_BDT'
        if 'Hj1_BDT' in key:
            key = 'Hj1_BDT'
        if 'Hj_tagger_hadTop' in key:
            key = 'Hj_tagger_hadTop'
        highlevel_column_headers.append(key)
    highlevel_column_headers.append('EventWeight')
    highlevel_column_headers.append('xsec_rwgt')

    # Create instance of the input files directory
    inputs_file_path = '/afs/cern.ch/work/j/jthomasw/private/IHEP/ttHML/github/ttH_multilepton/keras-DNN/samples/Training_samples_looselepsel/'
    print('Getting files from:', inputs_file_path)

    lowlevel_features_DF_name = '%s/lowlevel_features_DF_%s_%s.csv' % (output_directory, region, selection)
    highlevel_features_DF_name = '%s/highlevel_features_DF_%s_%s.csv' % (output_directory, region, selection)

    if os.path.isfile(lowlevel_features_DF_name):
        lowlevel_features_data = pandas.read_csv(lowlevel_features_DF_name)
        print('Loading %s . . . . ' % (lowlevel_features_DF_name))
    else:
        print('Creating and loading new data file in %s . . . . ' % (inputs_file_path))
        lowlevel_features_data = load_data(inputs_file_path, lowlevel_column_headers, selection_criteria)
        lowlevel_features_data.to_csv(lowlevel_features_DF_name, index=False)
        lowlevel_features_data = pandas.read_csv(lowlevel_features_DF_name)

    if os.path.isfile(highlevel_features_DF_name):
        highlevel_features_data = pandas.read_csv(highlevel_features_DF_name)
        print('Loading %s . . . . ' % (highlevel_features_DF_name))
    else:
        print('Creating and loading new data file in %s . . . . ' % (inputs_file_path))
        highlevel_features_data = load_data(inputs_file_path, highlevel_column_headers, selection_criteria)
        highlevel_features_data.to_csv(highlevel_features_DF_name, index=False)
        highlevel_features_data = pandas.read_csv(highlevel_features_DF_name)

    Plotter = plotter()

    # Split pandas dataframes into train/test
    lowlevel_traindataset, lowlevel_valdataset = train_test_split(lowlevel_features_data, test_size=0.2)
    highlevel_traindataset, highlevel_valdataset = train_test_split(highlevel_features_data, test_size=0.2)
    print('LOWLEVEL train dataset shape [ Nexamples: %s , Nfeatures: %s ]' % (lowlevel_traindataset.shape[0], lowlevel_traindataset.shape[1]))
    print('LOWLEVEL validation dataset shape [ Nexamples: %s , Nfeatures: %s ]' % (lowlevel_valdataset.shape[0], lowlevel_valdataset.shape[1]))
    print('HIGHLEVEL train dataset shape [ Nexamples: %s , Nfeatures: %s ]' % (highlevel_traindataset.shape[0], highlevel_traindataset.shape[1]))
    print('HIGHLEVEL validation dataset shape [ Nexamples: %s , Nfeatures: %s ]' % (highlevel_valdataset.shape[0], highlevel_valdataset.shape[1]))

    #feature_corr_df = lowlevel_traindataset + highlevel_traindataset
    lowlevel_train_df = lowlevel_features_data.iloc[:lowlevel_traindataset.shape[0]]
    lowlevel_train_df.drop(['EventWeight'], axis=1, inplace=True)
    lowlevel_train_df.drop(['xsec_rwgt'], axis=1, inplace=True)
    highlevel_train_df = highlevel_features_data.iloc[:highlevel_traindataset.shape[0]]
    highlevel_train_df.drop(['EventWeight'], axis=1, inplace=True)
    highlevel_train_df.drop(['xsec_rwgt'], axis=1, inplace=True)

    # Calculate event weights
    train_weights = lowlevel_traindataset['EventWeight'].values * lowlevel_traindataset['xsec_rwgt'].values
    test_weights = lowlevel_valdataset['EventWeight'].values * lowlevel_valdataset['xsec_rwgt'].values

    # Remove unwanted variables from the column list used for training
    lowlevel_training_columns = lowlevel_column_headers[:-2]
    highlevel_training_columns = highlevel_column_headers[:-2]

    # Collect just the values of the variables used in the training and testing datasets.
    lowlevel_X_train = lowlevel_traindataset[lowlevel_training_columns].values
    lowlevel_X_test = lowlevel_valdataset[lowlevel_training_columns].values
    reshaped_3D_data = reshape_for_particle_rep(lowlevel_traindataset, lowlevel_training_columns)
    reshaped_3D_data_test = reshape_for_particle_rep(lowlevel_valdataset, lowlevel_training_columns)
    highlevel_X_train = highlevel_traindataset[highlevel_training_columns].values
    highlevel_X_test = highlevel_valdataset[highlevel_training_columns].values
    Y_train = lowlevel_traindataset.target.astype(int)
    Y_test = lowlevel_valdataset.target.astype(int)

    # Need to reshape data to have a spatial dimension for conv1d
    lowlevel_X_train = np.expand_dims(lowlevel_X_train, axis=-1)
    lowlevel_X_test = np.expand_dims(lowlevel_X_test, axis=-1)
    #print('Reshaped lowlevel_data to include spatial dimension for conv1d. New shape = ', lowlevel_X_train.shape)

    lowlevel_num_variables = len(lowlevel_training_columns)
    highlevel_num_variables = len(highlevel_training_columns)

    ## Input variable correlations
    lowlevel_correlation_plot_file_name = 'lowlevel_correlation_plot.png'
    #Plotter.correlation_matrix(lowlevel_train_df)
    #Plotter.save_plots(dir=plots_dir, filename=lowlevel_correlation_plot_file_name)
    highlevel_correlation_plot_file_name = 'highlevel_correlation_plot.png'
    #Plotter.correlation_matrix(highlevel_train_df)
    #Plotter.save_plots(dir=plots_dir, filename=highlevel_correlation_plot_file_name)

    # =============== Weights ==================
    # WARNING! 'sample_weight' will override 'class_weight'
    # ==========================================
    # Sample                    |    ttH    |   tt+jets   |    ttW     |    ttZ     |
    # --- >= 4 jets, loose lepton TR selection ---
    # XS                        |  0.2118   |    831.     |   0.2043   |   0.2529   |
    # N events in TR            |  221554   |   1168897   |   321674   |   204998   |
    # Sum of weights            | 94.379784 | 7372.112793 | 206.978439 | 122.834419 |
    # Yields 2LSS SR HIG-18-019 |   60.08   | 140.25+22.79+17.25 = 180.29 | 151.03 | 87.05 |
    # --- Control region (== 3 jets), loose lepton TR selection ---
    # N events in TR            |   39418   |   568724    |   111809   |   58507    |
    # Sum of weights            | 24.269867 | 3807.655762 | 102.885391 |  58.825554 |
    # Yields 2LSS ttWctrl (AN2018-098-v18) | 14.36 | 120.54 + 9.55 | 75.97 | 38.64 |
    # Yields 2LSS SR HIG-18-019 |   60.08   | 140.25+22.79+17.25 | 151.03 |  87.05  |
    # Yields 2LSS ttWctrl       |   14.36   | 120.54 + 9.55 |  75.97   |   38.64    |
    # Yields 2LSS >= 3 jets     |   74.44   |    310.38   |   227.00   |   125.69   |
    if classweights_name == 'InverseSRYields':
        if selection == 'geq4j':
            tuned_weighted = {0: 0.0166445, 1: 0.00554662, 2: 0.00662120, 3: 0.0114877}
        if selection == 'geq3j':
            tuned_weighted = {0: 0.01343363782, 1: 0.00322185707, 2: 0.00440528634, 3: 0.00795608242}
    elif classweights_name == 'InverseNEventsTR':
        tuned_weighted = {0: 0.00000451357, 1: 0.000000855507, 2: 0.00000310874, 3: 0.00000487810}
    elif classweights_name == 'InverseSumWeightsTR':
        tuned_weighted = {0: 0.01059548939, 1: 0.00013564632, 2: 0.00483142111, 3: 0.00814104066}
    print('class weights : ', classweights_name)

    # Fit label encoder to Y_train
    newencoder = LabelEncoder()
    newencoder.fit(Y_train)
    # Transform to encoded array
    encoded_Y = newencoder.transform(Y_train)
    encoded_Y_test = newencoder.transform(Y_test)
    # Transform to one-hot encoded arrays
    Y_train = np_utils.to_categorical(encoded_Y)
    Y_test = np_utils.to_categorical(encoded_Y_test)

    #print('num_variables = ', num_variables)
    optimizer = 'Adam'
    if do_model_fit == 1:
        # Training new model
        histories = []
        labels = []
        early_stopping_monitor = EarlyStopping(patience=50, monitor='val_loss', verbose=1)
        # Lists for HP scan
        #optimizers = ['Adamax','Adam','Nadam']
        #batchessize = np.array([100,200,500,1000])
        # Define a model
        #model3 = baseline_model(lowlevel_num_variables, optimizer)
        print('Low-level data shape:')
        print(reshaped_3D_data.shape)
        model4 = functional_CNN_model(reshaped_3D_data.shape[1], optimizer, highlevel_num_variables)
        # Fit the model using training data.
        # Batch size = number of examples before updating weights (larger = faster training)
        #history4 = model4.fit([reshaped_3D_data,highlevel_X_train],Y_train,validation_split=0.2,epochs=200,batch_size=1000,verbose=1,shuffle=True,class_weight=tuned_weighted,callbacks=[early_stopping_monitor])
        history4 = model4.fit(reshaped_3D_data, Y_train, validation_split=0.2, epochs=200, batch_size=1000,
                              verbose=1, shuffle=True, class_weight=tuned_weighted,
                              callbacks=[early_stopping_monitor])
        # Store history for the performance-by-epoch plot.
        histories.append(history4)
        labels.append(optimizer)
        Plotter.plot_training_progress_acc(histories, labels)
        acc_progress_filename = 'DNN_acc_wrt_epoch.png'
        Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename)
        # Which model do you want the rest of the plots for?
        #model = model3
        model = model4
    else:
        # Which model do you want to load?
        model_name = os.path.join(output_directory, 'model.h5')
        print('Loading %s' % (model_name))
        #model = load_trained_model(model_name, num_variables, optimizer)
        model = load_trained_CNN_model(model_name, reshaped_3D_data.shape[1], optimizer, highlevel_num_variables)

    # Node probabilities for training sample events
    # Is this the same as in the DNN case?
    result_probs_train = model.predict([reshaped_3D_data, highlevel_X_train])
    # Get maximum probability
    result_classes_train = result_probs_train.argmax(axis=-1)
    # Node probabilities for testing sample events (use the test-set high-level features here)
    result_probs_test = model.predict([reshaped_3D_data_test, highlevel_X_test])
    result_classes_test = result_probs_test.argmax(axis=-1)

    # Store model in hdf5 format
    model_output_name = os.path.join(output_directory, 'model.h5')
    model.save(model_output_name)
    # Save model weights separately as well in hdf5 format
    weights_output_name = os.path.join(output_directory, 'model_weights.h5')
    model.save_weights(weights_output_name)
    # Make sure to save model in json format as well
    model_json = model.to_json()
    model_json_name = os.path.join(output_directory, 'model_serialised.json')
    with open(model_json_name, 'w') as json_file:
        json_file.write(model_json)
    model.summary()
    model_schematic_name = os.path.join(output_directory, 'model_schematic.png')
    #plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True)

    # Initialise output directory where plotter results will be saved.
    Plotter.output_directory = output_directory

    # Make overfitting plots
    #Plotter.overfitting(model, Y_train, Y_test, result_probs_train, result_probs_test, plots_dir, train_weights, test_weights)

    # Make list of true labels, e.g. (0,1,2,3), for the testing dataset
    original_encoded_test_Y = []
    for i in range(len(result_probs_test)):
        if Y_test[i][0] == 1:
            original_encoded_test_Y.append(0)
        if Y_test[i][1] == 1:
            original_encoded_test_Y.append(1)
        if Y_test[i][2] == 1:
            original_encoded_test_Y.append(2)
        if Y_test[i][3] == 1:
            original_encoded_test_Y.append(3)

    original_encoded_train_Y = []
    for i in range(len(result_probs_train)):
        if Y_train[i][0] == 1:
            original_encoded_train_Y.append(0)
        if Y_train[i][1] == 1:
            original_encoded_train_Y.append(1)
        if Y_train[i][2] == 1:
            original_encoded_train_Y.append(2)
        if Y_train[i][3] == 1:
            original_encoded_train_Y.append(3)

    # Invert LabelEncoder transform back to original truth labels
    result_classes_train = newencoder.inverse_transform(result_classes_train)
    result_classes_test = newencoder.inverse_transform(result_classes_test)

    Plotter.plots_directory = plots_dir

    # Create confusion matrices for training and testing performance
    #Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'index')
    #Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TRAIN.png')
    #Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'index')
    #Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TEST.png')

    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs_train, original_encoded_test_Y, result_probs_test, 0, 'ttHnode')
    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs_train, original_encoded_test_Y, result_probs_test, 1, 'ttJnode')
    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs_train, original_encoded_test_Y, result_probs_test, 2, 'ttWnode')
    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs_train, original_encoded_test_Y, result_probs_test, 3, 'ttZnode')
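
# --- Illustrative sketch (assumption, not the repository's code) ---
# reshape_for_particle_rep() is used above to turn the flat low-level feature
# table into a 3D array (events, objects, features-per-object) for the Conv1D
# branch. A minimal version, assuming the columns are ordered object by object
# and each object carries the same number of features (n_features_per_object is
# a hypothetical argument):
import numpy as np

def reshape_for_particle_rep_sketch(dataframe, training_columns, n_features_per_object=4):
    flat = dataframe[training_columns].values
    n_objects = len(training_columns) // n_features_per_object
    # Keep only complete objects, then reshape to (events, objects, features).
    flat = flat[:, :n_objects * n_features_per_object]
    return flat.reshape(len(flat), n_objects, n_features_per_object)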
def main():
    print('Using Keras version: ', keras.__version__)
    usage = 'usage: %prog [options]'
    parser = argparse.ArgumentParser(usage)
    parser.add_argument('-t', '--train_model', dest='train_model',
                        help='Option to train model or simply make diagnostic plots (0=False, 1=True)',
                        default=0, type=int)
    parser.add_argument('-s', '--sel', dest='selection',
                        help='Option to choose selection', default='tH', type=str)
    args = parser.parse_args()
    do_model_fit = args.train_model
    selection = args.selection

    # Number of classes to use
    number_of_classes = 4

    # Create instance of output directory where all results are saved.
    #output_directory = '2017tautag2p1samples_EVw8s_oldvars_%s_selection/' % (selection)
    output_directory = '2017samples_xmasupdates_%s_selection/' % (selection)
    check_dir(output_directory)
    # Create plots subdirectory
    plots_dir = os.path.join(output_directory, 'plots/')

    input_var_jsonFile = open('input_vars_SigRegion_wFwdJet.json', 'r')
    #input_var_jsonFile = open('input_features_new.json','r')
    if selection == 'tH':
        selection_criteria = '(is_tH_like_and_not_ttH_like==0 || is_tH_like_and_not_ttH_like==1)'

    # Load variables from .json
    variable_list = json.load(input_var_jsonFile, encoding="utf-8").items()

    # Create list of headers for dataset .csv
    column_headers = []
    for key, var in variable_list:
        column_headers.append(key)
    column_headers.append('EventWeight')
    column_headers.append('xsec_rwgt')
    column_headers.append('nEvent')

    # Create instance of the input files directory
    inputs_file_path = '/afs/cern.ch/work/j/jthomasw/private/IHEP/ttHML/github/ttH_multilepton/keras-DNN/samples/rootplas_LegacyMVA_update_mbb_20191229/DiLepRegion/ttH2017TrainDNN2L/'

    # Load ttree into .csv including all variables listed in column_headers
    print('<train-DNN> Input file path: ', inputs_file_path)
    outputdataframe_name = '%s/output_dataframe_%s.csv' % (output_directory, selection)
    if os.path.isfile(outputdataframe_name):
        data = pandas.read_csv(outputdataframe_name)
        print('<train-DNN> Loading data .csv from: %s . . . . ' % (outputdataframe_name))
    else:
        print('<train-DNN> Creating new data .csv @: %s . . . . ' % (inputs_file_path))
        data = load_data(inputs_file_path, column_headers, selection_criteria)
        data.to_csv(outputdataframe_name, index=False)
        data = pandas.read_csv(outputdataframe_name)

    # Make instance of plotter tool
    Plotter = plotter()

    # Create statistically independent train/test datasets (used to train/evaluate the network)
    traindataset, valdataset = train_test_split(data, test_size=0.2)
    valdataset.to_csv((output_directory + 'valid_dataset.csv'), index=False)
    #print('<train-DNN> Training dataset shape: ', traindataset.shape)
    #print('<train-DNN> Validation dataset shape: ', valdataset.shape)

    # Remove last three columns (EventWeight, xsec_rwgt and nEvent) from column headers
    training_columns = column_headers[:-3]
    print('<train-DNN> Training features: ', training_columns)

    # Select data from columns under the remaining column headers in traindataset
    X_train = traindataset[training_columns].values
    # Select data from 'target' as target for MVA
    Y_train = traindataset.target.astype(int)
    X_test = valdataset[training_columns].values
    Y_test = valdataset.target.astype(int)

    num_variables = len(training_columns)

    # Create dataframe containing input features only (for correlation matrix)
    train_df = data.iloc[:traindataset.shape[0]]
    train_df.drop(['EventWeight'], axis=1, inplace=True)
    train_df.drop(['xsec_rwgt'], axis=1, inplace=True)

    ## Input variable correlation plot
    correlation_plot_file_name = 'correlation_plot.png'
    Plotter.correlation_matrix(train_df)
    Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name)

    ####################################################################################
    # Weights applied during training. You will also need to update the class weights if
    # you are going to change the event weights applied. Introduce class weights and any
    # event weight you want to use here.
    #sampleweights = traindataset.loc[:,'sampleWeight']*traindataset.loc[:,'EventWeight']
    sampleweights = traindataset.loc[:, 'sampleWeight'] * traindataset.loc[:, 'xsec_rwgt']
    sampleweights = np.array(sampleweights)

    # Event weights calculation so we can correctly apply event weights to diagnostic plots.
    # Use a separate list because we don't want to apply class weights in plots.
    #train_weights = traindataset['EventWeight'].values
    #test_weights = valdataset['EventWeight'].values
    train_weights = traindataset['xsec_rwgt'].values
    test_weights = valdataset['xsec_rwgt'].values

    # Fit label encoder to Y_train
    newencoder = LabelEncoder()
    newencoder.fit(Y_train)
    # Transform to encoded array
    encoded_Y = newencoder.transform(Y_train)
    encoded_Y_test = newencoder.transform(Y_test)
    # Transform to one-hot encoded arrays
    Y_train = np_utils.to_categorical(encoded_Y)
    Y_test = np_utils.to_categorical(encoded_Y_test)

    optimizer = 'Adam'  # 'Nadam'
    if do_model_fit == 1:
        histories = []
        labels = []
        # Define model and early stopping
        early_stopping_monitor = EarlyStopping(patience=150, monitor='val_loss', verbose=1)
        model3 = baseline_model(num_variables, optimizer, number_of_classes)
        # Fit the model
        # Batch size = examples before updating weights (larger = faster training)
        # Epoch = one pass over the data (useful for periodic logging and evaluation)
        history3 = model3.fit(X_train, Y_train, validation_split=0.2, epochs=300, batch_size=1500,
                              verbose=1, shuffle=True, sample_weight=sampleweights,
                              callbacks=[early_stopping_monitor])
        histories.append(history3)
        labels.append(optimizer)
        # Make plot of loss function evolution
        Plotter.plot_training_progress_acc(histories, labels)
        acc_progress_filename = 'DNN_acc_wrt_epoch.png'
        Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename)
        # Which model do you want the rest of the plots for?
        model = model3
    else:
        # Which model do you want to load?
        model_name = os.path.join(output_directory, 'model.h5')
        print('<train-DNN> Loaded Model: %s' % (model_name))
        model = load_trained_model(model_name, num_variables, optimizer, number_of_classes)
        '''continuetraining=1
        if continuetraining == 1:
            new_model = load_model(model_name)
            assert_allclose(new_model.predict(X_train), new_model.predict(X_train), 1e-5)
            checkpoint = ModelCheckpoint(model_name, monitor='loss', verbose=1, save_best_only=True, mode='min')
            callbacks_list = [checkpoint]
            history3 = new_model.fit(X_train, Y_train, validation_split=0.2, epochs=50, batch_size=1500, verbose=1, shuffle=True, sample_weight=sampleweights, callbacks=callbacks_list)'''

    # Node probabilities for training sample events
    result_probs = model.predict(np.array(X_train))
    result_classes = model.predict_classes(np.array(X_train))
    # Node probabilities for testing sample events
    result_probs_test = model.predict(np.array(X_test))
    result_classes_test = model.predict_classes(np.array(X_test))

    # Store model in file
    model_output_name = os.path.join(output_directory, 'model.h5')
    model.save(model_output_name)
    weights_output_name = os.path.join(output_directory, 'model_weights.h5')
    model.save_weights(weights_output_name)
    model_json = model.to_json()
    model_json_name = os.path.join(output_directory, 'model_serialised.json')
    with open(model_json_name, 'w') as json_file:
        json_file.write(model_json)
    model.summary()
    model_schematic_name = os.path.join(output_directory, 'model_schematic.png')
    plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True)

    # Initialise output directory.
    Plotter.plots_directory = plots_dir
    Plotter.output_directory = output_directory

    # Make overfitting plots of output nodes
    #Plotter.overfitting(model, Y_train, Y_test, result_probs, result_probs_test, plots_dir, train_weights, test_weights)

    # Get true process values for testing dataset
    original_encoded_test_Y = []
    for i in range(len(result_probs_test)):
        if Y_test[i][0] == 1:
            original_encoded_test_Y.append(0)
        if Y_test[i][1] == 1:
            original_encoded_test_Y.append(1)
        if Y_test[i][2] == 1:
            original_encoded_test_Y.append(2)
        if Y_test[i][3] == 1:
            original_encoded_test_Y.append(3)

    # Get true process integers for training dataset
    original_encoded_train_Y = []
    for i in range(len(result_probs)):
        if Y_train[i][0] == 1:
            original_encoded_train_Y.append(0)
        if Y_train[i][1] == 1:
            original_encoded_train_Y.append(1)
        if Y_train[i][2] == 1:
            original_encoded_train_Y.append(2)
        if Y_train[i][3] == 1:
            original_encoded_train_Y.append(3)

    # Get true class values for testing dataset
    result_classes_test = newencoder.inverse_transform(result_classes_test)
    result_classes_train = newencoder.inverse_transform(result_classes)

    # Create confusion matrices for training and testing performance
    '''Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'index')
    Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TRAIN.png')
    Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'index')
    Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TEST.png')
    Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'columns')
    Plotter.save_plots(dir=plots_dir, filename='yields_norm_columns_confusion_matrix_TRAIN.png')
    Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'columns')
    Plotter.save_plots(dir=plots_dir, filename='yields_norm_columns_confusion_matrix_TEST.png')
    '''
    '''Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'')
    Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TRAIN.png')
    Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'')
    Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TEST.png')'''
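
# --- Illustrative sketch (assumption, not the repository's code) ---
# The 'sampleWeight' column multiplied into sampleweights above is created
# upstream (in load_data). One way to build such a per-class factor so that,
# once multiplied by the per-event weight at training time, each class
# contributes equally to the loss, is to use the inverse of the summed class
# yield; the column and argument names here are illustrative.
def add_sample_weight_sketch(df, target_column='target', event_weight_column='xsec_rwgt'):
    # Total weighted yield per class.
    class_yields = df.groupby(target_column)[event_weight_column].sum()
    # Inverse-yield factor per event, keyed on the event's class.
    df['sampleWeight'] = df[target_column].map(lambda c: 1.0 / class_yields[c])
    return df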
def main(): print '' DNN_applier = apply_DNN() usage = 'usage: %prog [options]' parser = argparse.ArgumentParser(usage) parser.add_argument( '-p', '--processName', dest='processName', help= 'Process name. List of options in keys of process_filename dictionary', default=[], type=str, nargs='+') parser.add_argument('-r', '--region', dest='region', help='Option to choose e.g. DiLepRegion', default='DiLepRegion', type=str) parser.add_argument( '-j', '--JES', dest='JES', help= 'Option to choose whether to run on JES Syst samples (0=Nominal, 1=JESUp, 2=JESDown)', default=0, type=int) parser.add_argument('-s', '--sel', dest='selection', help='Option to choose selection', default='tH', type=str) parser.add_argument('-y', '--year', dest='year', help='Option to choose year settings', default='2017', type=str) args = parser.parse_args() processes = args.processName region = args.region JES_flag = args.JES selection = args.selection nClasses = 4 annum = args.year print '<unit_test_evaluation> Succesfully parsed arguments: processName= [%s], region= %s, JES_flag= %s , selection= %s' % ( processes, region, JES_flag, selection) #outputname = '2017samples_tH_tunedweights_%s' % (selection) outputname = 'debug_%s' % (selection) input_var_jsonFile = '' if JES_flag == 1: outputname = outputname + '_JESUp' if JES_flag == 2: outputname = outputname + '_JESDown' # Open and load input variable .json input_var_jsonFile = open('../input_vars_SigRegion_wFwdJet.json', 'r') variable_list = json.load(input_var_jsonFile, encoding="utf-8").items() # Append variables to a list of column headers for .csv file later column_headers = [] for key, var in variable_list: column_headers.append(key) column_headers.append('EventWeight') column_headers.append('nEvent') if JES_flag == 0: JESname = '' elif JES_flag == 1: JESname = 'JESUp' elif JES_flag == 2: JESname = 'JESDown' # Dictionary of filenames to be run over along with their keys. process_filename = { 'ttH_HWW': ('TTH_hww_' + JESname + region), 'ttH_Hmm': ('TTH_hmm_' + JESname + region), 'ttH_Htautau': ('TTH_htt_' + JESname + region), 'ttH_HZZ': ('TTH_hzz_' + JESname + region), 'ttH_HZG': ('TTH_hzg_' + JESname + region), 'tHq_HWW': ('THQ_hww_' + JESname + region), 'tHq_Htautau': ('THQ_htt_' + JESname + region), 'tHq_HZZ': ('THQ_hzz_' + JESname + region), 'tHq_HMM': ('THQ_hmm_' + JESname + region), 'tHq_HZG': ('THQ_hzg_' + JESname + region), 'tHW_HWW': ('THW_hww_' + JESname + region), 'tHW_Htautau': ('THW_htt_' + JESname + region), 'tHW_HZZ': ('THW_hzz_' + JESname + region), 'tHW_HMM': ('THW_hmm_' + JESname + region), 'tHW_HZG': ('THW_hzg_' + JESname + region), 'ttWW': ('TTWW_' + JESname + region), 'ttW': ('TTW_' + JESname + region), 'ttZ': ('TTZ_' + JESname + region), 'Conv': ('Convs_' + JESname + region), 'EWK': ('EWK_' + JESname + region), 'Fakes': ('Fakes_' + JESname + region), 'Flips': ('Flips_' + JESname + region), 'Rares': ('Rares_' + JESname + region), 'FakeSub': ('FakeSub_' + JESname + region), 'ttbar_closure': ('TT_Clos' + JESname + region), 'Data': ('Data_' + JESname + region) } # Remove 'nEvent' from columns that will be used during in training training_columns = column_headers[:-2] num_variables = len(training_columns) # Name of directory that contains trained MVA model to apply. 
input_models_path = '' if selection == 'tH': input_models_path = ['2017samples_tH_tunedweights'] # Load trained model optimizer = 'Adam' model_name_1 = os.path.join('../', input_models_path[0], 'model.h5') model_1 = DNN_applier.load_trained_model(model_name_1, num_variables, optimizer, nClasses) # Make instance of plotter class Plotter = plotter() # Lists for all events in all files. Used to make diagnostic plots of networks performance over all samples. true_process = [] model1_probs_ = [] model1_pred_process = [] EventWeights_ = [] # Now loop over all samples for process in processes: print '<unit_test_evaluation> Process: ', process current_sample_name = process_filename.get(process) # Use JES flag to decide if we are running on a JES varied sample or not. if JES_flag == 1: inputs_file_path = 'b/binghuan/Rootplas/Legacy/rootplas_LegacyAll_1110/%s%s/' % ( 'JESUp', region) elif JES_flag == 2: inputs_file_path = 'b/binghuan/Rootplas/Legacy/rootplas_LegacyAll_1110/%s%s/' % ( 'JESDown', region) else: inputs_file_path = 'b/binghuan/Rootplas/Legacy/rootplas_LegacyAll_1110/%s/%s/%s/' % ( region, annum, region) print '<unit_test_evaluation> Input file directory: ', inputs_file_path # Make final output directory samples_dir_w_appended_DNN = 'samples_w_DNN' if not os.path.exists(samples_dir_w_appended_DNN): os.makedirs(samples_dir_w_appended_DNN) samples_final_path_dir = os.path.join(samples_dir_w_appended_DNN, outputname) if not os.path.exists(samples_final_path_dir): os.makedirs(samples_final_path_dir) if JES_flag == 1: JES_label = 'JESUp' elif JES_flag == 2: JES_label = 'JESDown' else: JES_label = 'nominal' dataframe_name = '%s/%s_dataframe_%s_%s.csv' % ( samples_final_path_dir, process, region, JES_label) if os.path.isfile(dataframe_name): print '<unit_test_evaluation> Loading %s . . . . ' % dataframe_name data = pandas.read_csv(dataframe_name) else: print '<unit_test_evaluation> Making *new* data file from %s . . . . ' % ( inputs_file_path) print '<unit_test_evaluation> Applying selection ', selection selection_criteria = '' if selection == 'tH': selection_criteria = '(is_tH_like_and_not_ttH_like==0 || is_tH_like_and_not_ttH_like==1)' #&& n_presel_jet>=3' data = DNN_applier.load_data(inputs_file_path, column_headers, selection_criteria, process, process_filename.get(process)) if len(data) == 0: print '<unit_test_evaluation> No data! Next file.' continue print 'Saving new data .csv file at %s . . . . ' % (dataframe_name) data.to_csv(dataframe_name, index=False) nEvent = data['nEvent'] # Using dataset like this instead of for loop over entries for predictions (necessary for keras). print '<unit_test_evaluation> Input features: ', training_columns X_test = data.iloc[:, 0:num_variables] X_test = X_test.values result_probs_ = model_1.predict(np.array(X_test)) # Create dictionary where the value is the array of probabilities for the four categories and the key is the event number. 
eventnum_resultsprob_dict = {} for index in range(result_probs_.shape[0]): eventnum_resultsprob_dict[nEvent[index]] = result_probs_[index] model1_probs_.append(result_probs_[index]) inputlist = DNN_applier.getEOSlslist(directory=inputs_file_path + current_sample_name + ".root") current_file = str(inputlist[0]) print '<unit_test_evaluation> Input file: ', current_file # Open files and load ttrees data_file = TFile.Open(current_file) data_tree = data_file.Get("syncTree") # Check if input file is zombie if data_file.IsZombie(): raise IOError('missing file') output_file_name = '%s/%s.root' % (samples_final_path_dir, process_filename.get(process)) print '<unit_test_evaluation> Creating new output .root file' output_file = TFile.Open(output_file_name, 'RECREATE') # CloneTree(nentries) - here copying none of the actually entries output_tree = data_tree.CloneTree(0) output_tree.SetName("output_tree") # Turn off all branches except ones you need if you want to speed up run time? output_tree.SetBranchStatus('*', 1) # Append DNN Branches to new TTree # Add branches for values from highest output node and sentinel values for other nodes i.e. 'categorised' eval_ttHnode_cat = array('f', [0.]) eval_Othernode_cat = array('f', [0.]) eval_ttWnode_cat = array('f', [0.]) eval_tHQnode_cat = array('f', [0.]) ttH_branch_cat = output_tree.Branch('DNN_ttHnode_cat', eval_ttHnode_cat, 'DNN_ttHnode_cat/F') Other_branch_cat = output_tree.Branch('DNN_Othernode_cat', eval_Othernode_cat, 'DNN_Othernode_cat/F') ttW_branch_cat = output_tree.Branch('DNN_ttWnode_cat', eval_ttWnode_cat, 'DNN_ttWnode_cat/F') tHQ_branch_cat = output_tree.Branch('DNN_tHQnode_cat', eval_tHQnode_cat, 'DNN_tHQnode_cat/F') # un-categorised DNN variables eval_ttHnode_all = array('f', [0.]) eval_Othernode_all = array('f', [0.]) eval_ttWnode_all = array('f', [0.]) eval_tHQnode_all = array('f', [0.]) ttH_branch_all = output_tree.Branch('DNN_ttHnode_all', eval_ttHnode_all, 'DNN_ttHnode_all/F') Other_branch_all = output_tree.Branch('DNN_othernode_all', eval_Othernode_all, 'DNN_Othernode_all/F') ttW_branch_all = output_tree.Branch('DNN_ttWnode_all', eval_ttWnode_all, 'DNN_ttWnode_all/F') tHQ_branch_all = output_tree.Branch('DNN_tHQnode_all', eval_tHQnode_all, 'DNN_tHQnode_all/F') # Now add branches conatining the max value for each event and the category for each event eval_maxval = array('f', [0.]) DNNCat = array('f', [0.]) DNNmaxval_branch = output_tree.Branch('DNN_maxval', eval_maxval, 'DNN_maxval/F') DNNCat_branch = output_tree.Branch('DNNCat', DNNCat, 'DNNCat/F') sample_name = process histoname_type = 'Category' histo_ttHclassified_events_title = 'ttH %s Events: %s Sample' % ( histoname_type, sample_name) histo_ttHclassified_events_name = 'histo_ttH%s_events_%s' % ( histoname_type, sample_name) histo_ttHclassified_events = ROOT.TH1D( histo_ttHclassified_events_name, histo_ttHclassified_events_title, 200, 0, 1.) histo_Otherclassified_events_title = 'Other %s Events: %s Sample' % ( histoname_type, sample_name) histo_Otherclassified_events_name = 'histo_Other%s_events_%s' % ( histoname_type, sample_name) histo_Otherclassified_events = ROOT.TH1D( histo_Otherclassified_events_name, histo_Otherclassified_events_title, 200, 0, 1.) histo_ttWclassified_events_title = 'ttW %s Events: %s Sample' % ( histoname_type, sample_name) histo_ttWclassified_events_name = 'histo_ttW%s_events_%s' % ( histoname_type, sample_name) histo_ttWclassified_events = ROOT.TH1D( histo_ttWclassified_events_name, histo_ttWclassified_events_title, 200, 0, 1.) 
histo_tHQclassified_events_title = 'tHQ %s Events: %s Sample' % ( histoname_type, sample_name) histo_tHQclassified_events_name = 'histo_tHQ%s_events_%s' % ( histoname_type, sample_name) histo_tHQclassified_events = ROOT.TH1D( histo_tHQclassified_events_name, histo_tHQclassified_events_title, 200, 0, 1.) temp_percentage_done = 0 uniqueEventID = [] ######## Loop over ttree ######### print '<unit_test_evaluation> data_tree # Entries: ', data_tree.GetEntries( ) if output_tree.GetEntries() != 0: print '<unit_test_evaluation> output_tree # Entries: ', output_tree.GetEntries( ) print 'This tree should be empty at this point!!!!! check cloning correctly' for i in range(data_tree.GetEntries()): eval_ttHnode_cat[0] = -1. eval_Othernode_cat[0] = -1. eval_ttWnode_cat[0] = -1. eval_tHQnode_cat[0] = -1. eval_ttHnode_all[0] = -1. eval_Othernode_all[0] = -1. eval_ttWnode_all[0] = -1. eval_tHQnode_all[0] = -1. eval_maxval[0] = -1. DNNCat[0] = -1. percentage_done = int(100 * float(i) / float(data_tree.GetEntries())) if percentage_done % 10 == 0: if percentage_done != temp_percentage_done: print percentage_done temp_percentage_done = percentage_done data_tree.GetEntry(i) Eventnum_ = array('d', [0]) Eventnum_ = data_tree.nEvent EventWeight_ = array('d', [0]) EventWeight_ = data_tree.EventWeight xsec_rwgt_ = array('d', [0]) xsec_rwgt_ = data_tree.xsec_rwgt n_presel_jet = array('d', [0]) n_presel_jet = data_tree.n_presel_jet is_tH_like_and_not_ttH_like = array('d', [0]) is_tH_like_and_not_ttH_like = output_tree.is_tH_like_and_not_ttH_like if (is_tH_like_and_not_ttH_like == 0 or is_tH_like_and_not_ttH_like == 1): #and n_presel_jet>=3: pass_selection = 1 else: pass_selection = 0 if selection == 'tH': if pass_selection == 0: continue else: print 'NO selection applied!' '''if Eventnum_ in uniqueEventID: print 'Eventnum_ : %s already exists ' % Eventnum_ continue else: uniqueEventID.append(Eventnum_) ''' if 'ttH_' in process: true_process.append(0) elif 'Fakes' in process or 'Flips' in process: true_process.append(1) elif 'ttW' in process: true_process.append(2) elif 'tHq' in process: true_process.append(3) else: true_process.append(4) EventWeights_.append(EventWeight_) evaluated_node_values = [] #print 'Eventnum_: ', Eventnum_ #for key,var in variable_list: # print 'key: %s, value: %s' % (key , data_tree.GetLeaf(key).GetValue()) #print 'True process: ', true_process # Get the value for event on each of the DNN nodes evaluated_node_values = DNN_applier.evaluate_model( eventnum_resultsprob_dict, Eventnum_) #print 'evaluated_node_values: ', evaluated_node_values # Get the maximum output value maxval = max(evaluated_node_values) # Find the max value in and return its position (i.e. node classification) event_classification = evaluated_node_values.index(maxval) #print 'event_classification: ', event_classification # Append classification value to list of predictions model1_pred_process.append(event_classification) #print 'model1_pred_process: ', model1_pred_process eval_ttHnode_all[0] = evaluated_node_values[0] eval_Othernode_all[0] = evaluated_node_values[1] eval_ttWnode_all[0] = evaluated_node_values[2] eval_tHQnode_all[0] = evaluated_node_values[3] DNNCat[0] = float(event_classification) eval_maxval[0] = evaluated_node_values[event_classification] if event_classification == 0: histo_ttHclassified_events.Fill(evaluated_node_values[0], EventWeight_) eval_ttHnode_cat[0] = evaluated_node_values[0] eval_Othernode_cat[0] = -1. eval_ttWnode_cat[0] = -1. eval_tHQnode_cat[0] = -1. 
elif event_classification == 1: histo_Otherclassified_events.Fill(evaluated_node_values[1], EventWeight_) eval_ttHnode_cat[0] = -1. eval_Othernode_cat[0] = evaluated_node_values[1] eval_ttWnode_cat[0] = -1. eval_tHQnode_cat[0] = -1. elif event_classification == 2: histo_ttWclassified_events.Fill(evaluated_node_values[2], EventWeight_) eval_ttHnode_cat[0] = -1. eval_Othernode_cat[0] = -1. eval_ttWnode_cat[0] = evaluated_node_values[2] eval_tHQnode_cat[0] = -1. elif event_classification == 3: histo_tHQclassified_events.Fill(evaluated_node_values[3], EventWeight_) eval_ttHnode_cat[0] = -1. eval_Othernode_cat[0] = -1. eval_ttWnode_cat[0] = -1. eval_tHQnode_cat[0] = evaluated_node_values[3] else: histo_ttHclassified_events.Fill(-1., EventWeight_) histo_Otherclassified_events.Fill(-1., EventWeight_) histo_ttWclassified_events.Fill(-1., EventWeight_) histo_tHQclassified_events.Fill(-1., EventWeight_) eval_ttHnode_cat[0] = -1. eval_Othernode_cat[0] = -1. eval_ttWnode_cat[0] = -1. eval_tHQnode_cat[0] = -1. print '<unit_test_evaluation> NO classification for event!?' continue output_tree.Fill() print '<unit_test_evaluation> Clear # event - DNN result dictionary' eventnum_resultsprob_dict.clear() print '<unit_test_evaluation> Write output file : %s ' % ( output_file_name) output_file.Write() print '<unit_test_evaluation> Close output file' output_file.Close() print '<unit_test_evaluation> Close input file' data_file.Close() plots_dir = os.path.join(samples_final_path_dir, 'plots/') Plotter.plots_directory = plots_dir Plotter.conf_matrix(true_process, model1_pred_process, EventWeights_, '') Plotter.save_plots(dir=plots_dir, filename='yields_non_norm_confusion_matrix_APPL.png') Plotter.conf_matrix(true_process, model1_pred_process, EventWeights_, 'index') Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_APPL.png') model1_probs_ = np.array(model1_probs_) Plotter.ROC_sklearn(true_process, model1_probs_, true_process, model1_probs_, 0, 'ttHnode') Plotter.ROC_sklearn(true_process, model1_probs_, true_process, model1_probs_, 1, 'Othernode') Plotter.ROC_sklearn(true_process, model1_probs_, true_process, model1_probs_, 2, 'ttWnode') Plotter.ROC_sklearn(true_process, model1_probs_, true_process, model1_probs_, 3, 'tHQnode') exit(0)
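
# --- Illustrative sketch (assumption, not the apply_DNN class's actual code) ---
# evaluate_model() above returns the list of DNN node probabilities for a given
# event number, using the event-number -> probabilities dictionary filled from
# model_1.predict(). A minimal lookup consistent with that usage:
def evaluate_model_sketch(eventnum_resultsprob_dict, event_number):
    probs = eventnum_resultsprob_dict.get(event_number)
    if probs is None:
        # Sentinel values when the event is not in the dictionary
        # (e.g. it failed the selection used to build it).
        return [-1., -1., -1., -1.]
    return [float(p) for p in probs]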
def main(): print('Using Keras version: ', keras.__version__) usage = 'usage: %prog [options]' parser = argparse.ArgumentParser(usage) parser.add_argument( '-t', '--train_model', dest='train_model', help= 'Option to train model or simply make diagnostic plots (0=False, 1=True)', default=1, type=int) parser.add_argument('-s', '--suff', dest='suffix', help='Option to choose suffix for training', default='', type=str) parser.add_argument('-p', '--para', dest='hyp_param_scan', help='Option to run hyper-parameter scan', default=0, type=int) parser.add_argument( '-i', '--inputs_file_path', dest='inputs_file_path', help= 'Path to directory containing directories \'Bkgs\' and \'Signal\' which contain background and signal ntuples respectively.', default='', type=str) args = parser.parse_args() do_model_fit = args.train_model suffix = args.suffix # Create instance of the input files directory #inputs_file_path = 'HHWWgg_DataSignalMCnTuples/2017/' inputs_file_path = '/eos/user/b/bmarzocc/HHWWgg/January_2021_Production/2017/' hyp_param_scan = args.hyp_param_scan # Set model hyper-parameters weights = 'BalanceYields' # 'BalanceYields' or 'BalanceNonWeighted' optimizer = 'Nadam' validation_split = 0.1 # hyper-parameter scan results if weights == 'BalanceNonWeighted': learn_rate = 0.0005 epochs = 200 batch_size = 200 if weights == 'BalanceYields': learn_rate = 0.0001 epochs = 200 batch_size = 32 #epochs = 10 #batch_size=200 # Create instance of output directory where all results are saved. output_directory = 'HHWWyyDNN_binary_%s_%s/' % (suffix, weights) check_dir(output_directory) hyperparam_file = os.path.join(output_directory, 'additional_model_hyper_params.txt') additional_hyperparams = open(hyperparam_file, 'w') additional_hyperparams.write("optimizer: " + optimizer + "\n") additional_hyperparams.write("learn_rate: " + str(learn_rate) + "\n") additional_hyperparams.write("epochs: " + str(epochs) + "\n") additional_hyperparams.write("validation_split: " + str(validation_split) + "\n") additional_hyperparams.write("weights: " + weights + "\n") # Create plots subdirectory plots_dir = os.path.join(output_directory, 'plots/') input_var_jsonFile = open('input_variables.json', 'r') selection_criteria = '( (Leading_Photon_pt/CMS_hgg_mass) > 1/3 && (Subleading_Photon_pt/CMS_hgg_mass) > 1/4 )' # Load Variables from .json variable_list = json.load(input_var_jsonFile, encoding="utf-8").items() # Create list of headers for dataset .csv column_headers = [] for key, var in variable_list: column_headers.append(key) column_headers.append('weight') column_headers.append('unweighted') column_headers.append('target') column_headers.append('key') column_headers.append('classweight') column_headers.append('process_ID') # Load ttree into .csv including all variables listed in column_headers print('<train-DNN> Input file path: ', inputs_file_path) outputdataframe_name = '%s/output_dataframe.csv' % (output_directory) if os.path.isfile(outputdataframe_name): data = pandas.read_csv(outputdataframe_name) print('<train-DNN> Loading data .csv from: %s . . . . ' % (outputdataframe_name)) else: print('<train-DNN> Creating new data .csv @: %s . . . . ' % (inputs_file_path)) data = load_data(inputs_file_path, column_headers, selection_criteria) # Change sentinal value to speed up training. data = data.mask(data < -25., -9.) 
#data = data.replace(to_replace=-99.,value=-9.0) data.to_csv(outputdataframe_name, index=False) data = pandas.read_csv(outputdataframe_name) print('<main> data columns: ', (data.columns.values.tolist())) n = len(data) nHH = len(data.iloc[data.target.values == 1]) nbckg = len(data.iloc[data.target.values == 0]) print("Total (train+validation) length of HH = %i, bckg = %i" % (nHH, nbckg)) # Make instance of plotter tool Plotter = plotter() # Create statistically independant training/testing data traindataset, valdataset = train_test_split(data, test_size=0.1) valdataset.to_csv((output_directory + 'valid_dataset.csv'), index=False) print('<train-DNN> Training dataset shape: ', traindataset.shape) print('<train-DNN> Validation dataset shape: ', valdataset.shape) # Event weights weights_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH', 'weight'] weights_for_Hgg = traindataset.loc[traindataset['process_ID'] == 'Hgg', 'weight'] weights_for_DiPhoton = traindataset.loc[traindataset['process_ID'] == 'DiPhoton', 'weight'] weights_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet', 'weight'] weights_for_QCD = traindataset.loc[traindataset['process_ID'] == 'QCD', 'weight'] weights_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY', 'weight'] weights_for_TTGsJets = traindataset.loc[traindataset['process_ID'] == 'TTGsJets', 'weight'] weights_for_WGsJets = traindataset.loc[traindataset['process_ID'] == 'WGsJets', 'weight'] weights_for_WW = traindataset.loc[traindataset['process_ID'] == 'WW', 'weight'] HHsum_weighted = sum(weights_for_HH) Hggsum_weighted = sum(weights_for_Hgg) DiPhotonsum_weighted = sum(weights_for_DiPhoton) GJetsum_weighted = sum(weights_for_GJet) QCDsum_weighted = sum(weights_for_QCD) DYsum_weighted = sum(weights_for_DY) TTGsJetssum_weighted = sum(weights_for_TTGsJets) WGsJetssum_weighted = sum(weights_for_WGsJets) WWsum_weighted = sum(weights_for_WW) bckgsum_weighted = Hggsum_weighted + DiPhotonsum_weighted + GJetsum_weighted + QCDsum_weighted + DYsum_weighted + TTGsJetssum_weighted + WGsJetssum_weighted + WWsum_weighted #bckgsum_weighted = DiPhotonsum_weighted + GJetsum_weighted + QCDsum_weighted + DYsum_weighted + TTGsJetssum_weighted + WGsJetssum_weighted + WWsum_weighted nevents_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH', 'unweighted'] nevents_for_Hgg = traindataset.loc[traindataset['process_ID'] == 'Hgg', 'unweighted'] nevents_for_DiPhoton = traindataset.loc[traindataset['process_ID'] == 'DiPhoton', 'unweighted'] nevents_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet', 'unweighted'] nevents_for_QCD = traindataset.loc[traindataset['process_ID'] == 'QCD', 'unweighted'] nevents_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY', 'unweighted'] nevents_for_TTGsJets = traindataset.loc[traindataset['process_ID'] == 'TTGsJets', 'unweighted'] nevents_for_WGsJets = traindataset.loc[traindataset['process_ID'] == 'WGsJets', 'unweighted'] nevents_for_WW = traindataset.loc[traindataset['process_ID'] == 'WW', 'unweighted'] HHsum_unweighted = sum(nevents_for_HH) Hggsum_unweighted = sum(nevents_for_Hgg) DiPhotonsum_unweighted = sum(nevents_for_DiPhoton) GJetsum_unweighted = sum(nevents_for_GJet) QCDsum_unweighted = sum(nevents_for_QCD) DYsum_unweighted = sum(nevents_for_DY) TTGsJetssum_unweighted = sum(nevents_for_TTGsJets) WGsJetssum_unweighted = sum(nevents_for_WGsJets) WWsum_unweighted = sum(nevents_for_WW) bckgsum_unweighted = Hggsum_unweighted + DiPhotonsum_unweighted + GJetsum_unweighted + QCDsum_unweighted + 
DYsum_unweighted + TTGsJetssum_unweighted + WGsJetssum_unweighted + WWsum_unweighted #bckgsum_unweighted = DiPhotonsum_unweighted + GJetsum_unweighted + QCDsum_unweighted + DYsum_unweighted + TTGsJetssum_unweighted + WGsJetssum_unweighted + WWsum_unweighted HHsum_weighted = 2 * HHsum_weighted HHsum_unweighted = 2 * HHsum_unweighted if weights == 'BalanceYields': print('HHsum_weighted= ', HHsum_weighted) print('Hggsum_weighted= ', Hggsum_weighted) print('DiPhotonsum_weighted= ', DiPhotonsum_weighted) print('GJetsum_weighted= ', GJetsum_weighted) print('QCDsum_weighted= ', QCDsum_weighted) print('DYsum_weighted= ', DYsum_weighted) print('TTGsJetssum_weighted= ', TTGsJetssum_weighted) print('WGsJetssum_weighted= ', WGsJetssum_weighted) print('WWsum_weighted= ', WWsum_weighted) print('bckgsum_weighted= ', bckgsum_weighted) traindataset.loc[traindataset['process_ID'] == 'HH', ['classweight']] = HHsum_unweighted / HHsum_weighted traindataset.loc[traindataset['process_ID'] == 'Hgg', ['classweight']] = (HHsum_unweighted / bckgsum_weighted) traindataset.loc[traindataset['process_ID'] == 'DiPhoton', ['classweight']] = (HHsum_unweighted / bckgsum_weighted) traindataset.loc[traindataset['process_ID'] == 'GJet', ['classweight']] = (HHsum_unweighted / bckgsum_weighted) traindataset.loc[traindataset['process_ID'] == 'QCD', ['classweight']] = (HHsum_unweighted / bckgsum_weighted) traindataset.loc[traindataset['process_ID'] == 'DY', ['classweight']] = (HHsum_unweighted / bckgsum_weighted) traindataset.loc[traindataset['process_ID'] == 'TTGsJets', ['classweight']] = (HHsum_unweighted / bckgsum_weighted) traindataset.loc[traindataset['process_ID'] == 'WGsJets', ['classweight']] = (HHsum_unweighted / bckgsum_weighted) traindataset.loc[traindataset['process_ID'] == 'WW', ['classweight']] = (HHsum_unweighted / bckgsum_weighted) if weights == 'BalanceNonWeighted': print('HHsum_unweighted= ', HHsum_unweighted) print('Hggsum_unweighted= ', Hggsum_unweighted) print('DiPhotonsum_unweighted= ', DiPhotonsum_unweighted) print('GJetsum_unweighted= ', GJetsum_unweighted) print('QCDsum_unweighted= ', QCDsum_unweighted) print('DYsum_unweighted= ', DYsum_unweighted) print('TTGsJetssum_unweighted= ', TTGsJetssum_unweighted) print('WGsJetssum_unweighted= ', WGsJetssum_unweighted) print('WWsum_unweighted= ', WWsum_unweighted) print('bckgsum_unweighted= ', bckgsum_unweighted) traindataset.loc[traindataset['process_ID'] == 'HH', ['classweight']] = 1. 
traindataset.loc[traindataset['process_ID'] == 'Hgg', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted) traindataset.loc[traindataset['process_ID'] == 'DiPhoton', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted) traindataset.loc[traindataset['process_ID'] == 'GJet', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted) traindataset.loc[traindataset['process_ID'] == 'QCD', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted) traindataset.loc[traindataset['process_ID'] == 'DY', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted) traindataset.loc[traindataset['process_ID'] == 'TTGsJets', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted) traindataset.loc[traindataset['process_ID'] == 'WGsJets', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted) traindataset.loc[traindataset['process_ID'] == 'WW', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted) # Remove column headers that aren't input variables training_columns = column_headers[:-6] print('<train-DNN> Training features: ', training_columns) column_order_txt = '%s/column_order.txt' % (output_directory) column_order_file = open(column_order_txt, "wb") for tc_i in training_columns: line = tc_i + "\n" pickle.dump(str(line), column_order_file) num_variables = len(training_columns) # Extract training and testing data X_train = traindataset[training_columns].values X_test = valdataset[training_columns].values # Extract labels data Y_train = traindataset['target'].values Y_test = valdataset['target'].values # Create dataframe containing input features only (for correlation matrix) train_df = data.iloc[:traindataset.shape[0]] # Event weights if wanted train_weights = traindataset['weight'].values test_weights = valdataset['weight'].values # Weights applied during training. if weights == 'BalanceYields': trainingweights = traindataset.loc[:, 'classweight'] * traindataset.loc[:, 'weight'] if weights == 'BalanceNonWeighted': trainingweights = traindataset.loc[:, 'classweight'] trainingweights = np.array(trainingweights) ## Input Variable Correlation plot correlation_plot_file_name = 'correlation_plot' Plotter.correlation_matrix(train_df) Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name + '.png') Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name + '.pdf') # Fit label encoder to Y_train newencoder = LabelEncoder() newencoder.fit(Y_train) # Transform to encoded array encoded_Y = newencoder.transform(Y_train) encoded_Y_test = newencoder.transform(Y_test) if do_model_fit == 1: print('<train-BinaryDNN> Training new model . . . . 
') histories = [] labels = [] if hyp_param_scan == 1: print('Begin at local time: ', time.localtime()) hyp_param_scan_name = 'hyp_param_scan_results.txt' hyp_param_scan_results = open(hyp_param_scan_name, 'a') time_str = str(time.localtime()) + '\n' hyp_param_scan_results.write(time_str) hyp_param_scan_results.write(weights) learn_rates = [0.00001, 0.0001] epochs = [150, 200] batch_size = [400, 500] param_grid = dict(learn_rate=learn_rates, epochs=epochs, batch_size=batch_size) model = KerasClassifier(build_fn=gscv_model, verbose=0) grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1) grid_result = grid.fit(X_train, Y_train, shuffle=True, sample_weight=trainingweights) print("Best score: %f , best params: %s" % (grid_result.best_score_, grid_result.best_params_)) hyp_param_scan_results.write( "Best score: %f , best params: %s\n" % (grid_result.best_score_, grid_result.best_params_)) means = grid_result.cv_results_['mean_test_score'] stds = grid_result.cv_results_['std_test_score'] params = grid_result.cv_results_['params'] for mean, stdev, param in zip(means, stds, params): print("Mean (stdev) test score: %f (%f) with parameters: %r" % (mean, stdev, param)) hyp_param_scan_results.write( "Mean (stdev) test score: %f (%f) with parameters: %r\n" % (mean, stdev, param)) exit() else: # Define model for analysis early_stopping_monitor = EarlyStopping(patience=100, monitor='val_loss', min_delta=0.01, verbose=1) #model = baseline_model(num_variables, learn_rate=learn_rate) model = new_model(num_variables, learn_rate=learn_rate) # Fit the model # Batch size = examples before updating weights (larger = faster training) # Epoch = One pass over data (useful for periodic logging and evaluation) #class_weights = np.array(class_weight.compute_class_weight('balanced',np.unique(Y_train),Y_train)) history = model.fit(X_train, Y_train, validation_split=validation_split, epochs=epochs, batch_size=batch_size, verbose=1, shuffle=True, sample_weight=trainingweights, callbacks=[early_stopping_monitor]) histories.append(history) labels.append(optimizer) # Make plot of loss function evolution Plotter.plot_training_progress_acc(histories, labels) acc_progress_filename = 'DNN_acc_wrt_epoch' Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename + '.png') Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename + '.pdf') Plotter.history_plot(history, label='loss') Plotter.save_plots(dir=plots_dir, filename='history_loss.png') Plotter.save_plots(dir=plots_dir, filename='history_loss.pdf') else: model_name = os.path.join(output_directory, 'model.h5') model = load_trained_model(model_name) # Node probabilities for training sample events result_probs = model.predict(np.array(X_train)) result_classes = model.predict_classes(np.array(X_train)) # Node probabilities for testing sample events result_probs_test = model.predict(np.array(X_test)) result_classes_test = model.predict_classes(np.array(X_test)) # Store model in file model_output_name = os.path.join(output_directory, 'model.h5') model.save(model_output_name) weights_output_name = os.path.join(output_directory, 'model_weights.h5') model.save_weights(weights_output_name) model_json = model.to_json() model_json_name = os.path.join(output_directory, 'model_serialised.json') with open(model_json_name, 'w') as json_file: json_file.write(model_json) model.summary() model_schematic_name = os.path.join(output_directory, 'model_schematic.png') #plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True) 
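# --- Illustrative sketch (not part of the original script) ---------------------------
# The fit above passes trainingweights (classweight * per-event 'weight') to Keras via
# sample_weight, so that the re-weighted background and signal contribute comparable
# totals to the loss. A minimal, hypothetical helper capturing the same balancing idea
# (the helper name is an assumption; 'process_ID' and 'weight' follow the dataframe
# columns built above, and the signal classweight is simply fixed to 1 here):
def sketch_balance_yields(df, signal_id='HH'):
    # Scale every background event so the summed background weight matches the
    # summed signal weight; the signal keeps classweight = 1.
    sig_yield = df.loc[df['process_ID'] == signal_id, 'weight'].sum()
    bkg_yield = df.loc[df['process_ID'] != signal_id, 'weight'].sum()
    df['classweight'] = 1.0
    df.loc[df['process_ID'] != signal_id, 'classweight'] = sig_yield / bkg_yield
    # Per-event weights to hand to model.fit(..., sample_weight=...)
    return (df['classweight'] * df['weight']).values
# --------------------------------------------------------------------------------------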
print('================') print('Training event labels: ', len(Y_train)) print('Training event probs', len(result_probs)) print('Training event weights: ', len(train_weights)) print('Testing events: ', len(Y_test)) print('Testing event probs', len(result_probs_test)) print('Testing event weights: ', len(test_weights)) print('================') # Initialise output directory. Plotter.plots_directory = plots_dir Plotter.output_directory = output_directory Plotter.ROC(model, X_test, Y_test, X_train, Y_train) Plotter.save_plots(dir=plots_dir, filename='ROC.png') Plotter.save_plots(dir=plots_dir, filename='ROC.pdf')
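# --- Illustrative sketch (not part of the original script) ---------------------------
# Plotter.ROC above is project-specific. A stripped-down, weighted ROC built directly
# with scikit-learn might look like the following (the helper name is an assumption,
# and it presumes a single sigmoid output node so the score is the first column of
# model.predict):
def sketch_weighted_roc(y_true, y_score, weights=None):
    # Return the ROC curve points and the area under the curve, optionally using
    # per-event weights.
    from sklearn.metrics import roc_curve, auc
    fpr, tpr, _ = roc_curve(y_true, y_score, sample_weight=weights)
    return fpr, tpr, auc(fpr, tpr)
# Example (hypothetical): fpr, tpr, roc_auc = sketch_weighted_roc(Y_test, model.predict(X_test)[:, 0], test_weights)
# --------------------------------------------------------------------------------------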
def main(): print('Using Keras version: ', keras.__version__) usage = 'usage: %prog [options]' parser = argparse.ArgumentParser(usage) parser.add_argument('-t', '--train_model', dest='train_model', help='Option to train model or simply make diagnostic plots (0=False, 1=True)', default=1, type=int) parser.add_argument('-s', '--suff', dest='suffix', help='Option to choose suffix for training', default='', type=str) parser.add_argument('-p', '--para', dest='hyp_param_scan', help='Option to run hyper-parameter scan', default=0, type=int) parser.add_argument('-i', '--inputs_file_path', dest='inputs_file_path', help='Path to directory containing directories \'Bkgs\' and \'Signal\' which contain background and signal ntuples respectively.', default='', type=str) args = parser.parse_args() do_model_fit = args.train_model suffix = args.suffix # Create instance of the input files directory inputs_file_path = 'HHWWgg_DataSignalMCnTuples/2017/' hyp_param_scan=args.hyp_param_scan # Set model hyper-parameters weights='BalanceYields'# 'BalanceYields' or 'BalanceNonWeighted' optimizer = 'Nadam' validation_split=0.1 # hyper-parameter scan results if weights == 'BalanceNonWeighted': learn_rate = 0.0005 epochs = 200 batch_size=200 if weights == 'BalanceYields': learn_rate = 0.0001 epochs = 200 batch_size=400 # Create instance of output directory where all results are saved. output_directory = 'HHWWyyDNN_binary_%s_%s/' % (suffix,weights) check_dir(output_directory) hyperparam_file = os.path.join(output_directory,'additional_model_hyper_params.txt') additional_hyperparams = open(hyperparam_file,'w') additional_hyperparams.write("optimizer: "+optimizer+"\n") additional_hyperparams.write("learn_rate: "+str(learn_rate)+"\n") additional_hyperparams.write("epochs: "+str(epochs)+"\n") additional_hyperparams.write("validation_split: "+str(validation_split)+"\n") additional_hyperparams.write("weights: "+weights+"\n") # Create plots subdirectory plots_dir = os.path.join(output_directory,'plots/') input_var_jsonFile = open('input_variables.json','r') selection_criteria = '( ((Leading_Photon_pt/CMS_hgg_mass) > 0.35) && ((Subleading_Photon_pt/CMS_hgg_mass) > 0.25) && passbVeto==1 && ExOneLep==1 && N_goodJets>=1)' # selection_criteria = '(AtLeast4GoodJets0Lep==1)' # selection_criteria = '(passPhotonSels==1 && passbVeto==1 && ExOneLep==1 && goodJets==1)' #selection_criteria = '( ((Leading_Photon_pt/CMS_hgg_mass) > 0.35) && ((Subleading_Photon_pt/CMS_hgg_mass) > 0.25) && passbVeto==1 && ExOneLep==1 && N_goodJets>=1)' # Load Variables from .json variable_list = json.load(input_var_jsonFile,encoding="utf-8").items() # Create list of headers for dataset .csv column_headers = [] for key,var in variable_list: column_headers.append(key) column_headers.append('weight') column_headers.append('unweighted') column_headers.append('target') column_headers.append('key') column_headers.append('classweight') column_headers.append('process_ID') # Create instance of the input files directory #inputs_file_path = '/afs/cern.ch/work/a/atishelm/public/ForJosh/2017_DataMC_ntuples_moreVars' inputs_file_path = '/eos/user/r/rasharma/post_doc_ihep/double-higgs/ntuples/September29/MVANtuples' #inputs_file_path = '/eos/user/a/atishelm/ntuples/HHWWgg_DataSignalMCnTuples/PromptPromptApplied/' #inputs_file_path = 'PromptPromptApplied/' # Load ttree into .csv including all variables listed in column_headers print('<train-DNN> Input file path: ', inputs_file_path) outputdataframe_name = '%s/output_dataframe.csv' %(output_directory) if 
os.path.isfile(outputdataframe_name): data = pandas.read_csv(outputdataframe_name) print('<train-DNN> Loading data .csv from: %s . . . . ' % (outputdataframe_name)) else: print('<train-DNN> Creating new data .csv @: %s . . . . ' % (inputs_file_path)) data = load_data(inputs_file_path,column_headers,selection_criteria) # Change sentinal value to speed up training. data = data.replace(to_replace=-999.000000,value=-9.0) data.to_csv(outputdataframe_name, index=False) data = pandas.read_csv(outputdataframe_name) print('<main> data columns: ', (data.columns.values.tolist())) n = len(data) nHH = len(data.iloc[data.target.values == 1]) nbckg = len(data.iloc[data.target.values == 0]) print("Total (train+validation) length of HH = %i, bckg = %i" % (nHH, nbckg)) # Make instance of plotter tool Plotter = plotter() # Create statistically independant training/testing data traindataset, valdataset = train_test_split(data, test_size=0.1) valdataset.to_csv((output_directory+'valid_dataset.csv'), index=False) print('<train-DNN> Training dataset shape: ', traindataset.shape) print('<train-DNN> Validation dataset shape: ', valdataset.shape) # Event weights weights_for_HH = traindataset.loc[traindataset['process_ID']=='HH', 'weight'] weights_for_DiPhoton = traindataset.loc[traindataset['process_ID']=='DiPhoton', 'weight'] weights_for_GJet = traindataset.loc[traindataset['process_ID']=='GJet', 'weight'] weights_for_DY = traindataset.loc[traindataset['process_ID']=='DY', 'weight'] weights_for_TTGG = traindataset.loc[traindataset['process_ID']=='TTGG', 'weight'] weights_for_TTGJets = traindataset.loc[traindataset['process_ID']=='TTGJets', 'weight'] weights_for_TTJets = traindataset.loc[traindataset['process_ID']=='TTJets', 'weight'] weights_for_WJets = traindataset.loc[traindataset['process_ID']=='WJets', 'weight'] weights_for_ttH = traindataset.loc[traindataset['process_ID']=='ttH', 'weight'] HHsum_weighted= sum(weights_for_HH) GJetsum_weighted= sum(weights_for_GJet) DiPhotonsum_weighted= sum(weights_for_DiPhoton) TTGGsum_weighted= sum(weights_for_TTGG) TTGJetssum_weighted= sum(weights_for_TTGJets) TTJetssum_weighted= sum(weights_for_TTJets) WJetssum_weighted= sum(weights_for_WJets) ttHsum_weighted= sum(weights_for_ttH) DYsum_weighted= sum(weights_for_DY) #bckgsum_weighted = DiPhotonsum_weighted+WJetssum_weighted+ttHsum_weighted bckgsum_weighted = DiPhotonsum_weighted+WJetssum_weighted nevents_for_HH = traindataset.loc[traindataset['process_ID']=='HH', 'unweighted'] nevents_for_DiPhoton = traindataset.loc[traindataset['process_ID']=='DiPhoton', 'unweighted'] nevents_for_GJet = traindataset.loc[traindataset['process_ID']=='GJet', 'unweighted'] nevents_for_DY = traindataset.loc[traindataset['process_ID']=='DY', 'unweighted'] nevents_for_TTGG = traindataset.loc[traindataset['process_ID']=='TTGG', 'unweighted'] nevents_for_TTGJets = traindataset.loc[traindataset['process_ID']=='TTGJets', 'unweighted'] nevents_for_TTJets = traindataset.loc[traindataset['process_ID']=='TTJets', 'unweighted'] nevents_for_WJets = traindataset.loc[traindataset['process_ID']=='WJets', 'unweighted'] nevents_for_ttH = traindataset.loc[traindataset['process_ID']=='ttH', 'unweighted'] HHsum_unweighted= sum(nevents_for_HH) GJetsum_unweighted= sum(nevents_for_GJet) DiPhotonsum_unweighted= sum(nevents_for_DiPhoton) TTGGsum_unweighted= sum(nevents_for_TTGG) TTGJetssum_unweighted= sum(nevents_for_TTGJets) TTJetssum_unweighted= sum(nevents_for_TTJets) WJetssum_unweighted= sum(nevents_for_WJets) ttHsum_unweighted= sum(nevents_for_ttH) 
DYsum_unweighted= sum(nevents_for_DY) #bckgsum_unweighted = DiPhotonsum_unweighted+WJetssum_unweighted+ttHsum_unweighted bckgsum_unweighted = DiPhotonsum_unweighted+WJetssum_unweighted if weights=='BalanceYields': print('HHsum_weighted= ' , HHsum_weighted) print('ttHsum_weighted= ' , ttHsum_weighted) print('DiPhotonsum_weighted= ', DiPhotonsum_weighted) print('WJetssum_weighted= ', WJetssum_weighted) print('DYsum_weighted= ', DYsum_weighted) print('GJetsum_weighted= ', GJetsum_weighted) print('bckgsum_weighted= ', bckgsum_weighted) traindataset.loc[traindataset['process_ID']=='HH', ['classweight']] = 1. traindataset.loc[traindataset['process_ID']=='GJet', ['classweight']] = (HHsum_weighted/bckgsum_weighted) traindataset.loc[traindataset['process_ID']=='DY', ['classweight']] = (HHsum_weighted/bckgsum_weighted) traindataset.loc[traindataset['process_ID']=='DiPhoton', ['classweight']] = (HHsum_weighted/bckgsum_weighted) traindataset.loc[traindataset['process_ID']=='WJets', ['classweight']] = (HHsum_weighted/bckgsum_weighted) traindataset.loc[traindataset['process_ID']=='TTGG', ['classweight']] = (HHsum_weighted/bckgsum_weighted) traindataset.loc[traindataset['process_ID']=='TTGJets', ['classweight']] = (HHsum_weighted/bckgsum_weighted) traindataset.loc[traindataset['process_ID']=='TTJets', ['classweight']] = (HHsum_weighted/bckgsum_weighted) traindataset.loc[traindataset['process_ID']=='ttH', ['classweight']] = (HHsum_weighted/bckgsum_weighted) if weights=='BalanceNonWeighted': print('HHsum_unweighted= ' , HHsum_unweighted) print('ttHsum_unweighted= ' , ttHsum_unweighted) print('DiPhotonsum_unweighted= ', DiPhotonsum_unweighted) print('WJetssum_unweighted= ', WJetssum_unweighted) print('DYsum_unweighted= ', DYsum_unweighted) print('GJetsum_unweighted= ', GJetsum_unweighted) print('bckgsum_unweighted= ', bckgsum_unweighted) traindataset.loc[traindataset['process_ID']=='HH', ['classweight']] = 1. 
traindataset.loc[traindataset['process_ID']=='GJet', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted) traindataset.loc[traindataset['process_ID']=='DY', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted) traindataset.loc[traindataset['process_ID']=='DiPhoton', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted) traindataset.loc[traindataset['process_ID']=='WJets', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted) traindataset.loc[traindataset['process_ID']=='TTGG', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted) traindataset.loc[traindataset['process_ID']=='TTGJets', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted) traindataset.loc[traindataset['process_ID']=='TTJets', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted) traindataset.loc[traindataset['process_ID']=='ttH', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted) # Remove column headers that aren't input variables training_columns = column_headers[:-6] print('<train-DNN> Training features: ', training_columns) column_order_txt = '%s/column_order.txt' %(output_directory) column_order_file = open(column_order_txt, "wb") for tc_i in training_columns: line = tc_i+"\n" pickle.dump(str(line), column_order_file) num_variables = len(training_columns) # Extract training and testing data X_train = traindataset[training_columns].values X_test = valdataset[training_columns].values # Extract labels data Y_train = traindataset['target'].values Y_test = valdataset['target'].values # Create dataframe containing input features only (for correlation matrix) train_df = data.iloc[:traindataset.shape[0]] ## Input Variable Correlation plot correlation_plot_file_name = 'correlation_plot.png' Plotter.correlation_matrix(train_df) Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name) #################################################################################### # Weights applied during training. You will also need to update the class weights if # you are going to change the event weights applied. Introduce class weights and any # event weight you want to use here. #trainingweights = traindataset.loc[:,'classbalance']#*traindataset.loc[:,'weight'] #trainingweights = np.array(trainingweights) # Temp hack to be able to change class weights without remaking dataframe #for inde in xrange(len(trainingweights)): # newweight = 13243.0/6306.0 # trainingweights[inde]= newweight #print 'training event weight = ', trainingweights[0] # Event weights calculation so we can correctly apply event weights to diagnostic plots. # use seperate list because we don't want to apply class weights in plots. # Event weights if wanted train_weights = traindataset['weight'].values test_weights = valdataset['weight'].values # Weights applied during training. if weights=='BalanceYields': trainingweights = traindataset.loc[:,'classweight']*traindataset.loc[:,'weight'] if weights=='BalanceNonWeighted': trainingweights = traindataset.loc[:,'classweight'] trainingweights = np.array(trainingweights) ## Input Variable Correlation plot correlation_plot_file_name = 'correlation_plot.pdf' Plotter.correlation_matrix(train_df) Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name) # Fit label encoder to Y_train newencoder = LabelEncoder() newencoder.fit(Y_train) # Transform to encoded array encoded_Y = newencoder.transform(Y_train) encoded_Y_test = newencoder.transform(Y_test) if do_model_fit == 1: print('<train-BinaryDNN> Training new model . . . . 
') histories = [] labels = [] if hyp_param_scan == 1: print('Begin at local time: ', time.localtime()) hyp_param_scan_name = 'hyp_param_scan_results.txt' hyp_param_scan_results = open(hyp_param_scan_name,'a') time_str = str(time.localtime())+'\n' hyp_param_scan_results.write(time_str) hyp_param_scan_results.write(weights) learn_rates=[0.00001, 0.0001] epochs = [150,200] batch_size = [400,500] param_grid = dict(learn_rate=learn_rates,epochs=epochs,batch_size=batch_size) model = KerasClassifier(build_fn=gscv_model,verbose=0) grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1) grid_result = grid.fit(X_train,Y_train,shuffle=True,sample_weight=trainingweights) print("Best score: %f , best params: %s" % (grid_result.best_score_,grid_result.best_params_)) hyp_param_scan_results.write("Best score: %f , best params: %s\n" %(grid_result.best_score_,grid_result.best_params_)) means = grid_result.cv_results_['mean_test_score'] stds = grid_result.cv_results_['std_test_score'] params = grid_result.cv_results_['params'] for mean, stdev, param in zip(means, stds, params): print("Mean (stdev) test score: %f (%f) with parameters: %r" % (mean,stdev,param)) hyp_param_scan_results.write("Mean (stdev) test score: %f (%f) with parameters: %r\n" % (mean,stdev,param)) exit() else: # Define model for analysis early_stopping_monitor = EarlyStopping(patience=30, monitor='val_loss', verbose=1) model = baseline_model(num_variables, learn_rate=learn_rate) # Fit the model # Batch size = examples before updating weights (larger = faster training) # Epoch = One pass over data (useful for periodic logging and evaluation) #class_weights = np.array(class_weight.compute_class_weight('balanced',np.unique(Y_train),Y_train)) history = model.fit(X_train,Y_train,validation_split=validation_split,epochs=epochs,batch_size=batch_size,verbose=1,shuffle=True,sample_weight=trainingweights,callbacks=[early_stopping_monitor]) histories.append(history) labels.append(optimizer) # Make plot of loss function evolution Plotter.plot_training_progress_acc(histories, labels) acc_progress_filename = 'DNN_acc_wrt_epoch.png' Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename) else: model_name = os.path.join(output_directory,'model.h5') model = load_trained_model(model_name) # Node probabilities for training sample events result_probs = model.predict(np.array(X_train)) result_classes = model.predict_classes(np.array(X_train)) # Node probabilities for testing sample events result_probs_test = model.predict(np.array(X_test)) result_classes_test = model.predict_classes(np.array(X_test)) # Store model in file model_output_name = os.path.join(output_directory,'model.h5') model.save(model_output_name) weights_output_name = os.path.join(output_directory,'model_weights.h5') model.save_weights(weights_output_name) model_json = model.to_json() model_json_name = os.path.join(output_directory,'model_serialised.json') with open(model_json_name,'w') as json_file: json_file.write(model_json) model.summary() model_schematic_name = os.path.join(output_directory,'model_schematic.eps') print("DEBUG: ", model_schematic_name) plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True) # plot_model(model, to_file='model_schematic.eps', show_shapes=True, show_layer_names=True) # Initialise output directory.
Plotter.plots_directory = plots_dir Plotter.output_directory = output_directory ''' print('================') print('Training event labels: ', len(Y_train)) print('Training event probs', len(result_probs)) print('Training event weights: ', len(train_weights)) print('Testing events: ', len(Y_test)) print('Testing event probs', len(result_probs_test)) print('Testing event weights: ', len(test_weights)) print('================') ''' # Make overfitting plots of output nodes Plotter.binary_overfitting(model, Y_train, Y_test, result_probs, result_probs_test, plots_dir, train_weights, test_weights) print("DEBUG: Y_train shape: ", Y_train.shape) # # Get true process integers for training dataset # original_encoded_train_Y = [] # for i in xrange(len(result_probs)): # if Y_train[i][0] == 1: # original_encoded_train_Y.append(0) # if Y_train[i][1] == 1: # original_encoded_train_Y.append(1) # if Y_train[i][2] == 1: # original_encoded_train_Y.append(2) # if Y_train[i][3] == 1: # original_encoded_train_Y.append(3) # Get true class values for testing dataset # result_classes_test = newencoder.inverse_transform(result_classes_test) # result_classes_train = newencoder.inverse_transform(result_classes) e = shap.DeepExplainer(model, X_train[:400, ]) shap_values = e.shap_values(X_test[:400, ]) Plotter.plot_dot(title="DeepExplainer_sigmoid_y0", x=X_test[:400, ], shap_values=shap_values, column_headers=column_headers) Plotter.plot_dot_bar(title="DeepExplainer_Bar_sigmoid_y0", x=X_test[:400,], shap_values=shap_values, column_headers=column_headers) #e = shap.GradientExplainer(model, X_train[:100, ]) #shap_values = e.shap_values(X_test[:100, ]) #Plotter.plot_dot(title="GradientExplainer_sigmoid_y0", x=X_test[:100, ], shap_values=shap_values, column_headers=column_headers) #e = shap.KernelExplainer(model.predict, X_train[:100, ]) #shap_values = e.shap_values(X_test[:100, ]) #Plotter.plot_dot(title="KernelExplainer_sigmoid_y0", x=X_test[:100, ],shap_values=shap_values, column_headers=column_headers) #Plotter.plot_dot_bar(title="KernelExplainer_Bar_sigmoid_y0", x=X_test[:100,], shap_values=shap_values, column_headers=column_headers) #Plotter.plot_dot_bar_all(title="KernelExplainer_bar_All_Var_sigmoid_y0", x=X_test[:100,], shap_values=shap_values, column_headers=column_headers) # Create confusion matrices for training and testing performance # Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'index') # Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TRAIN.png') # Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'index') # Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TEST.png') # Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'columns') # Plotter.save_plots(dir=plots_dir, filename='yields_norm_columns_confusion_matrix_TRAIN.png') # Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'columns') # Plotter.save_plots(dir=plots_dir, filename='yields_norm_columns_confusion_matrix_TEST.png') # Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'') # Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TRAIN.png') # Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'') # Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TEST.png') Plotter.ROC_sklearn(Y_train, result_probs, Y_test, result_probs_test, 1 , 'BinaryClassifierROC',train_weights, test_weights)
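# --- Illustrative sketch (not part of the original script) ---------------------------
# Plotter.plot_dot / plot_dot_bar above wrap SHAP's summary plots. A stripped-down
# equivalent using the shap package directly could look like this (the helper name and
# output file name are assumptions; it presumes a single-output Keras model):
def sketch_shap_summary(model, X_background, X_eval, feature_names, out_png='shap_summary.png'):
    # Explain the trained model with DeepExplainer and draw the beeswarm summary plot.
    import shap
    import matplotlib.pyplot as plt
    explainer = shap.DeepExplainer(model, X_background)
    shap_values = explainer.shap_values(X_eval)
    # Older shap versions return one array per output node; take the first output.
    vals = shap_values[0] if isinstance(shap_values, list) else shap_values
    shap.summary_plot(vals, X_eval, feature_names=feature_names, show=False)
    plt.savefig(out_png)
    plt.close()
# Example (hypothetical): sketch_shap_summary(model, X_train[:400], X_test[:400], training_columns)
# --------------------------------------------------------------------------------------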
def main(): print('') DNN_applier = apply_DNN() usage = 'usage: %prog [options]' parser = argparse.ArgumentParser(usage) parser.add_argument( '-p', '--processName', dest='processName', help= 'Process name. List of options in keys of process_filename dictionary', default=[], type=str, nargs='+') parser.add_argument( '-d', '--modeldir', dest='modeldir', help='Option to choose directory containing trained model') args = parser.parse_args() processes = args.processName nClasses = 1 modeldir = args.modeldir print( '<run_network_evaluation> Succesfully parsed arguments: processName= [%s], model directory= %s' % (processes, modeldir)) input_var_jsonFile = '' # Open and load input variable .json input_var_jsonFile = open('../input_variables.json', 'r') variable_list = json.load(input_var_jsonFile, encoding="utf-8").items() # Append variables to a list of column headers for .csv file later column_headers = [] for key, var in variable_list: column_headers.append(key) column_headers.append('event') column_headers.append('weight') # Dictionary of filenames to be run over along with their keys. process_filename = { 'HHWWgg': ('HHWWgg-SL-SM-NLO-2017'), 'DiPhoton': ('DiPhotonJetsBox_MGG-80toInf_13TeV-Sherpa_Hadded'), 'GJet_Pt-20toInf': ('GJet_Pt-20toInf_DoubleEMEnriched_MGG-40to80_TuneCP5_13TeV_Pythia8_Hadded' ), 'GJet_Pt-20to40': ('GJet_Pt-20to40_DoubleEMEnriched_MGG-80toInf_TuneCP5_13TeV_Pythia8_Hadded' ), 'GJet_Pt-40toInf': ('GJet_Pt-40toInf_DoubleEMEnriched_MGG-80toInf_TuneCP5_13TeV_Pythia8_Hadded' ), 'DYJetsToLL_M-50': ('DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8_Hadded'), 'TTGJets': ('TTGJets_TuneCP5_13TeV-amcatnloFXFX-madspin-pythia8_Hadded'), 'TTGG': ('TTGG_0Jets_TuneCP5_13TeV_amcatnlo_madspin_pythia8_Hadded'), 'TTJets_HT-600to800': ('TTJets_HT-600to800_TuneCP5_13TeV-madgraphMLM-pythia8_Hadded'), 'TTJets_HT-800to1200': ('TTJets_HT-800to1200_TuneCP5_13TeV-madgraphMLM-pythia8_Hadded'), 'TTJets_HT-1200to2500': ('TTJets_HT-1200to2500_TuneCP5_13TeV-madgraphMLM-pythia8_Hadded'), 'TTJets_HT-2500toInf': ('TTJets_HT-2500toInf_TuneCP5_13TeV-madgraphMLM-pythia8_Hadded'), 'W1JetsToLNu_LHEWpT_0-50': ('W1JetsToLNu_LHEWpT_0-50_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'), 'W1JetsToLNu_LHEWpT_50-150': ('W1JetsToLNu_LHEWpT_50-150_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'), 'W1JetsToLNu_LHEWpT_150-250': ('W1JetsToLNu_LHEWpT_150-250_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'), 'W1JetsToLNu_LHEWpT_250-400': ('W1JetsToLNu_LHEWpT_250-400_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'), 'W1JetsToLNu_LHEWpT_400-inf': ('W1JetsToLNu_LHEWpT_400-inf_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'), 'W2JetsToLNu_LHEWpT_0-50': ('W2JetsToLNu_LHEWpT_0-50_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'), 'W2JetsToLNu_LHEWpT_50-150': ('W2JetsToLNu_LHEWpT_50-150_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'), 'W2JetsToLNu_LHEWpT_150-250': ('W2JetsToLNu_LHEWpT_150-250_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'), 'W2JetsToLNu_LHEWpT_250-400': ('W2JetsToLNu_LHEWpT_250-400_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'), 'W2JetsToLNu_LHEWpT_400-inf': ('W2JetsToLNu_LHEWpT_400-inf_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'), 'W3JetsToLNu': ('W3JetsToLNu_TuneCP5_13TeV-madgraphMLM-pythia8_Hadded'), 'W4JetsToLNu': ('W4JetsToLNu_TuneCP5_13TeV-madgraphMLM-pythia8_Hadded'), 'ttHJetToGG': ('ttHJetToGG_M125_13TeV_amcatnloFXFX_madspin_pythia8_Hadded') #'Data' : ('Data_'+JESname+region) } training_columns = column_headers[:-2] num_variables = len(training_columns) # Load trained model model_name_1 = os.path.join('../', modeldir, 'model.h5') print('<run_network_evaluation> Using 
Model: ', model_name_1) model_1 = load_model(model_name_1, compile=False) # Make instance of plotter class Plotter = plotter() # Lists for all events in all files. Used to make diagnostic plots of networks performance over all samples. true_process = [] model1_probs_ = [] EventWeights_ = [] succesfully_run_files = open("succesfully_run_files.txt", "w+") # Now loop over all samples for process in processes: print('<run_network_evaluation> Process: ', process) current_sample_name = process_filename.get(process) inputs_file_path = '/Users/joshuhathomas-wilsker/Documents/work/lxplus_remote/work/private/IHEP/HH/HHWWyy/HHWWgg_DataSignalMCnTuples/2017/' if 'HHWWgg' in process: inputs_file_path += 'Signal/' else: inputs_file_path += 'Bkgs/' print('<run_network_evaluation> Input file directory: ', inputs_file_path) # Make final output directory samples_dir_w_appended_DNN = 'samples_w_DNN' if not os.path.exists(samples_dir_w_appended_DNN): os.makedirs(samples_dir_w_appended_DNN) samples_final_path_dir = os.path.join(samples_dir_w_appended_DNN, modeldir) if not os.path.exists(samples_final_path_dir): os.makedirs(samples_final_path_dir) dataframe_name = '%s/%s_dataframe.csv' % (samples_final_path_dir, process) if os.path.isfile(dataframe_name): print('<run_network_evaluation> Loading %s . . . . ' % dataframe_name) data = pandas.read_csv(dataframe_name) else: print( '<run_network_evaluation> Making *new* data file from %s . . . . ' % (inputs_file_path)) selection_criteria = '( ( (Leading_Photon_pt/CMS_hgg_mass) > 0.35 ) && ( (Subleading_Photon_pt/CMS_hgg_mass) > 0.25 ) && passbVeto==1 && ExOneLep==1 && N_goodJets>=1 )' data = DNN_applier.load_data(inputs_file_path, column_headers, selection_criteria, current_sample_name) if len(data) == 0: print('<run_network_evaluation> No data! Next file.') continue print( '<run_network_evaluation> Saving new data .csv file at %s . . . . ' % (dataframe_name)) print( '<run_network_evaluation> Found events passing selection. Process name will be stored in succesfully_run_files.txt' ) succesfully_run_files.write(process) data = data.replace(to_replace=-999.000000, value=-9.0) data.to_csv(dataframe_name, index=False) nHH = len(data.iloc[data.target.values == 1]) nbckg = len(data.iloc[data.target.values == 0]) print("<run_network_evaluation> Total length of HH = %i, bckg = %i" % (nHH, nbckg)) # Create dataset from dataframe to evaluate DNN X_test = data[training_columns].values result_probs_ = model_1.predict(np.array(X_test)) nEvent = data['event'] if len(result_probs_) < 1.: print('<run_network_evaluation> Warning: only %s test values.' 
% (len(result_probs_))) print('<run_network_evaluation> Probabilities: ', result_probs_) print('<run_network_evaluation> Exiting now.') exit(0) # Dictionary: # key = event number : value = DNN output eventnum_resultsprob_dict = {} for index in range(len(nEvent)): #print('nEvent= %s , prob = %s' % (nEvent[index], result_probs_[index]) eventnum_resultsprob_dict[nEvent[index]] = result_probs_[index] model1_probs_.append(result_probs_[index]) print(current_sample_name) infile = inputs_file_path + current_sample_name + ".root" print('<run_network_evaluation> Input file: ', infile) # Open file and load ttrees data_file = TFile.Open(infile) if 'HHWWgg' in current_sample_name: treename = [ 'GluGluToHHTo2G2Qlnu_node_cHHH1_TuneCP5_PSWeights_13TeV_powheg_pythia8alesauva_2017_1_10_6_4_v0_RunIIFall17MiniAODv2_PU2017_12Apr2018_94X_mc2017_realistic_v14_v1_1c4bfc6d0b8215cc31448570160b99fdUSER' ] elif 'DiPhotonJetsBox_MGG' in current_sample_name: treename = ['DiPhotonJetsBox_MGG_80toInf_13TeV_Sherpa'] elif 'GJet_Pt-20toInf' in current_sample_name: treename = [ 'GJet_Pt_20toInf_DoubleEMEnriched_MGG_40to80_TuneCP5_13TeV_Pythia8' ] elif 'GJet_Pt-20to40' in current_sample_name: treename = [ 'GJet_Pt_20to40_DoubleEMEnriched_MGG_80toInf_TuneCP5_13TeV_Pythia8' ] elif 'GJet_Pt-40toInf' in current_sample_name: treename = [ 'GJet_Pt_40toInf_DoubleEMEnriched_MGG_80toInf_TuneCP5_13TeV_Pythia8' ] elif 'DYJetsToLL_M-50_TuneCP5' in current_sample_name: treename = ['DYJetsToLL_M_50_TuneCP5_13TeV_amcatnloFXFX_pythia8'] elif 'TTGG' in current_sample_name: treename = ['TTGG_0Jets_TuneCP5_13TeV_amcatnlo_madspin_pythia8'] elif 'TTGJets' in current_sample_name: treename = ['TTGJets_TuneCP5_13TeV_amcatnloFXFX_madspin_pythia8'] elif 'TTJets_HT-600to800' in current_sample_name: treename = ['TTJets_HT_600to800_TuneCP5_13TeV_madgraphMLM_pythia8'] elif 'TTJets_HT-800to1200' in current_sample_name: treename = [ 'TTJets_HT_800to1200_TuneCP5_13TeV_madgraphMLM_pythia8' ] elif 'TTJets_HT-1200to2500' in current_sample_name: treename = [ 'TTJets_HT_1200to2500_TuneCP5_13TeV_madgraphMLM_pythia8' ] elif 'TTJets_HT-2500toInf' in current_sample_name: treename = [ 'TTJets_HT_2500toInf_TuneCP5_13TeV_madgraphMLM_pythia8' ] elif 'W1JetsToLNu_LHEWpT_0-50' in current_sample_name: treename = [ 'W1JetsToLNu_LHEWpT_0_50_TuneCP5_13TeV_amcnloFXFX_pythia8' ] elif 'W1JetsToLNu_LHEWpT_50-150' in current_sample_name: treename = [ 'W1JetsToLNu_LHEWpT_50_150_TuneCP5_13TeV_amcnloFXFX_pythia8' ] elif 'W1JetsToLNu_LHEWpT_150-250' in current_sample_name: treename = [ 'W1JetsToLNu_LHEWpT_150_250_TuneCP5_13TeV_amcnloFXFX_pythia8' ] elif 'W1JetsToLNu_LHEWpT_250-400' in current_sample_name: treename = [ 'W1JetsToLNu_LHEWpT_250_400_TuneCP5_13TeV_amcnloFXFX_pythia8' ] elif 'W1JetsToLNu_LHEWpT_400-inf' in current_sample_name: treename = [ 'W1JetsToLNu_LHEWpT_400_inf_TuneCP5_13TeV_amcnloFXFX_pythia8' ] elif 'W2JetsToLNu_LHEWpT_0-50' in current_sample_name: treename = [ 'W2JetsToLNu_LHEWpT_0_50_TuneCP5_13TeV_amcnloFXFX_pythia8' ] elif 'W2JetsToLNu_LHEWpT_50-150' in current_sample_name: treename = [ 'W2JetsToLNu_LHEWpT_50_150_TuneCP5_13TeV_amcnloFXFX_pythia8' ] elif 'W2JetsToLNu_LHEWpT_150-250' in current_sample_name: treename = [ 'W2JetsToLNu_LHEWpT_150_250_TuneCP5_13TeV_amcnloFXFX_pythia8' ] elif 'W2JetsToLNu_LHEWpT_250-400' in current_sample_name: treename = [ 'W2JetsToLNu_LHEWpT_250_400_TuneCP5_13TeV_amcnloFXFX_pythia8' ] elif 'W2JetsToLNu_LHEWpT_400-inf' in current_sample_name: treename = [ 'W2JetsToLNu_LHEWpT_400_inf_TuneCP5_13TeV_amcnloFXFX_pythia8' ] elif 
'W3JetsToLNu' in current_sample_name: treename = ['W3JetsToLNu_TuneCP5_13TeV_madgraphMLM_pythia8'] elif 'W4JetsToLNu' in current_sample_name: treename = ['W4JetsToLNu_TuneCP5_13TeV_madgraphMLM_pythia8'] elif 'ttHJetToGG' in current_sample_name: treename = ['ttHJetToGG_M125_13TeV_amcatnloFXFX_madspin_pythia8'] else: print( '<run_network_evaluation> Warning: Process name not recognised. Exiting.' ) exit(0) # Open each TTree in file and loop over events. # Append evaluated DNN score to DNN branch for each event. # Score assigned to event according to event number. for tname in treename: print('<run_network_evaluation> TTree: ', tname) data_tree = data_file.Get(tname) # Check if input file is zombie if data_file.IsZombie(): raise IOError('missing file') exit(0) output_file_name = '%s/%s.root' % (samples_final_path_dir, process_filename.get(process)) print('<run_network_evaluation> Creating new output .root file') output_file = TFile.Open(output_file_name, 'RECREATE') # Clone empty tree output_tree = data_tree.CloneTree(0) output_tree.SetName("output_tree") # All branches on. # Turn off all branches except those needed to speed up run-time output_tree.SetBranchStatus('*', 1) # Append DNN Branches to new TTree DNN_evaluation = array('f', [0.]) DNN_evaluation_branch = output_tree.Branch('DNN_evaluation', DNN_evaluation, 'DNN_evaluation/F') sample_name = process histo_DNN_values_title = 'DNN values: %s Sample' % (sample_name) histo_DNN_values_name = 'histo_DNN_values_%s_sample' % ( sample_name) histo_DNN_values = ROOT.TH1D(histo_DNN_values_name, histo_DNN_values_title, 200, 0, 1.) temp_percentage_done = 0 ######## Loop over ttree ######### print('<run_network_evaluation> data_tree # Entries: ', data_tree.GetEntries()) if output_tree.GetEntries() != 0: print('<run_network_evaluation> output_tree # Entries: ', output_tree.GetEntries()) print( 'This tree should be empty at this point!!!!! check cloning correctly' ) for i in range(data_tree.GetEntries()): DNN_evaluation[0] = -1. 
percentage_done = int(100 * float(i) / float(data_tree.GetEntries())) if percentage_done % 10 == 0: if percentage_done != temp_percentage_done: print(percentage_done) temp_percentage_done = percentage_done data_tree.GetEntry(i) Eventnum_ = array('d', [0]) Eventnum_ = data_tree.event EventWeight_ = array('d', [0]) EventWeight_ = data_tree.weight passbVeto = array('d', [0]) passbVeto = data_tree.passbVeto ExOneLep = array('d', [0]) ExOneLep = data_tree.ExOneLep Leading_Photon_pt = array('d', [0]) Leading_Photon_pt = data_tree.Leading_Photon_pt Subleading_Photon_pt = array('d', [0]) Subleading_Photon_pt = data_tree.Subleading_Photon_pt CMS_hgg_mass = array('d', [0]) CMS_hgg_mass = data_tree.CMS_hgg_mass N_goodJets = array('d', [0]) N_goodJets = data_tree.N_goodJets if ((Leading_Photon_pt / CMS_hgg_mass) > 0.35 and (Subleading_Photon_pt / CMS_hgg_mass) > 0.25 and passbVeto == 1 and ExOneLep == 1 and N_goodJets >= 1): pass_selection = 1 else: pass_selection = 0 if pass_selection == 0: continue if 'HHWWgg' in process: true_process.append(1) else: true_process.append(0) EventWeights_.append(EventWeight_) histo_DNN_values.Fill( eventnum_resultsprob_dict.get(Eventnum_)[0], EventWeight_) DNN_evaluation[0] = eventnum_resultsprob_dict.get(Eventnum_)[0] output_tree.Fill() eventnum_resultsprob_dict.clear() output_file.Write() output_file.Close() data_file.Close() #plots_dir = os.path.join(samples_final_path_dir,'plots/') #Plotter.plots_directory = plots_dir #model1_probs_ = np.array(model1_probs_) #Plotter.ROC_sklearn(true_process, model1_probs_, true_process, model1_probs_, 0 , 'ttHnode') exit(0)
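# --- Illustrative sketch (not part of the original script) ---------------------------
# The loop above clones each input TTree, adds a 'DNN_evaluation' branch and fills it
# with the score looked up by event number. The same clone-and-append pattern as a
# compact, hypothetical helper (names are assumptions; requires PyROOT; here every
# entry is written, with -1. marking events that were not evaluated, whereas the loop
# above skips events failing the selection):
def sketch_append_dnn_branch(input_tree, scores_by_event, out_file_name):
    from array import array
    import ROOT
    out_file = ROOT.TFile.Open(out_file_name, 'RECREATE')
    out_tree = input_tree.CloneTree(0)  # empty clone, attached to out_file
    dnn_score = array('f', [0.])
    out_tree.Branch('DNN_evaluation', dnn_score, 'DNN_evaluation/F')
    for i in range(input_tree.GetEntries()):
        input_tree.GetEntry(i)
        # Default to -1. when no score was computed for this event number.
        dnn_score[0] = scores_by_event.get(input_tree.event, [-1.])[0]
        out_tree.Fill()
    out_file.Write()
    out_file.Close()
# --------------------------------------------------------------------------------------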