Example No. 1
def main():
    print('Using Keras version: ', tf.keras.__version__)

    usage = 'usage: %prog [options]'
    parser = argparse.ArgumentParser(usage)
    parser.add_argument(
        '-t',
        '--train_model',
        dest='train_model',
        help=
        'Option to train model or simply make diagnostic plots (0=False, 1=True)',
        default=0,
        type=int)
    parser.add_argument('-w',
                        '--classweights',
                        dest='classweights',
                        help='Option to choose class weights',
                        default='InverseSRYields',
                        type=str)
    parser.add_argument('-s',
                        '--sel',
                        dest='selection',
                        help='Option to choose selection',
                        default='tH',
                        type=str)
    args = parser.parse_args()
    do_model_fit = args.train_model
    classweights_name = args.classweights
    selection = args.selection

    # Number of classes to use
    number_of_classes = 4

    # Create instance of output directory where all results are saved.
    output_directory = './2017samples_%s_%s/' % (selection, classweights_name)

    check_dir(output_directory)

    # Create plots subdirectory
    plots_dir = os.path.join(output_directory, 'plots/')

    input_var_jsonFile = open('input_vars_SigRegion_wFwdJet.json', 'r')

    if selection == 'tH':
        selection_criteria = '(is_tH_like_and_not_ttH_like==0 || is_tH_like_and_not_ttH_like==1)'  #&& n_presel_jet>=3'
    else:
        raise ValueError('Unknown selection: %s' % selection)

    # Load variables from the .json
    variable_list = list(json.load(input_var_jsonFile).items())

    # Create list of headers for dataset .csv
    column_headers = []
    for key, var in variable_list:
        column_headers.append(key)
    column_headers.append('EventWeight')
    column_headers.append('xsec_rwgt')
    column_headers.append('nEvent')

    # Create instance of the input files directory
    inputs_file_path = '/hpcfs/bes/mlgpu/kapoor/samples'

    # Load ttree into .csv including all variables listed in column_headers
    print('<train-DNN> Input file path: ', inputs_file_path)
    outputdataframe_name = '%s/output_dataframe_%s.csv' % (output_directory,
                                                           selection)
    if os.path.isfile(outputdataframe_name):
        data = pandas.read_csv(outputdataframe_name)
        print('<train-DNN> Loading data .csv from: %s . . . . ' %
              (outputdataframe_name))
    else:
        print('<train-DNN> Creating new data .csv @: %s . . . . ' %
              (inputs_file_path))
        data = load_data(inputs_file_path, column_headers, selection_criteria)
        data.to_csv(outputdataframe_name, index=False)
        data = pandas.read_csv(outputdataframe_name)

    # Make instance of plotter tool
    Plotter = plotter()

    # Create statistically independent train/test datasets (used to train/evaluate the network)
    traindataset, valdataset = train_test_split(data, test_size=0.2)
    #valdataset.to_csv('valid_dataset.csv', index=False)

    #print '<train-DNN> Training dataset shape: ', traindataset.shape
    #print '<train-DNN> Validation dataset shape: ', valdataset.shape

    # Remove the last three columns (EventWeight, xsec_rwgt, nEvent) from the training features
    training_columns = column_headers[:-3]
    print('<train-DNN> Training features: ', training_columns)

    # Select data from columns under the remaining column headers in traindataset
    X_train = traindataset[training_columns].values

    # Select data from 'target' as target for MVA
    Y_train = traindataset.target.astype(int)
    X_test = valdataset[training_columns].values
    Y_test = valdataset.target.astype(int)

    num_variables = len(training_columns)

    # Create dataframe containing input features only (for correlation matrix)
    train_df = data.iloc[:traindataset.shape[0]].copy()
    train_df = train_df.drop(['EventWeight', 'xsec_rwgt'], axis=1)

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot.png'
    #Plotter.correlation_matrix(train_df)
    #Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name)

    #sampleweights = traindataset.loc[:,'sampleWeight']*traindataset.loc[:,'EventWeight']
    sampleweights = traindataset.loc[:, 'sampleWeight']
    sampleweights = np.array(sampleweights)

    # Dictionaries of class weights to combat class imbalance
    if classweights_name == 'balanced':
        tuned_weighted = class_weight.compute_class_weight(
            class_weight='balanced', classes=np.unique(Y_train), y=Y_train)
    if classweights_name == 'tunedweights':
        tuned_weighted = {0: 7.67, 1: 1.0, 2: 4.62, 3: 7.67}

    # Per-event weights so event weights can be applied correctly in the diagnostic plots
    train_weights = traindataset['EventWeight'].values * traindataset[
        'xsec_rwgt'].values
    test_weights = valdataset['EventWeight'].values * valdataset[
        'xsec_rwgt'].values

    # Fit label encoder to Y_train
    newencoder = LabelEncoder()
    newencoder.fit(Y_train)
    # Transform to encoded array
    encoded_Y = newencoder.transform(Y_train)
    encoded_Y_test = newencoder.transform(Y_test)
    # Transform to one hot encoded arrays
    # Y_train = np_utils.to_categorical(encoded_Y)
    # Y_test = np_utils.to_categorical(encoded_Y_test)
    Y_train = to_categorical(encoded_Y)
    Y_test = to_categorical(encoded_Y_test)

    optimizer = 'Adam'  #'Nadam'
    if do_model_fit == 1:
        histories = []
        labels = []
        # Define model and early stopping
        early_stopping_monitor = EarlyStopping(patience=100,
                                               monitor='val_loss',
                                               verbose=1)
        model3 = baseline_model(num_variables, optimizer, number_of_classes)

        # Fit the model
        # Batch size = number of examples processed before the weights are updated (larger batches train faster per epoch)
        # Epoch = one full pass over the training data (useful for periodic logging and evaluation)
        #history3 = model3.fit(X_train,Y_train,validation_split=0.2,epochs=500,batch_size=1000,verbose=1,shuffle=True,class_weight=tuned_weighted,callbacks=[early_stopping_monitor])
        history3 = model3.fit(X_train,
                              Y_train,
                              validation_split=0.2,
                              epochs=300,
                              batch_size=1500,
                              verbose=1,
                              shuffle=True,
                              sample_weight=sampleweights,
                              callbacks=[early_stopping_monitor])
        histories.append(history3)
        labels.append(optimizer)

        # Make plot of loss function evolution
        Plotter.plot_training_progress_acc(histories, labels)
        acc_progress_filename = 'DNN_acc_wrt_epoch.png'
        Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename)
        # Which model do you want the rest of the plots for?
        model = model3
    else:
        # Which model do you want to load?
        model_name = os.path.join(output_directory, 'model.h5')
        print('<train-DNN> Loaded Model: %s' % (model_name))
        model = load_trained_model(model_name, num_variables, optimizer,
                                   number_of_classes)

    # Node probabilities for training sample events
    result_probs = model.predict(np.array(X_train))
    # predict_classes was removed from recent tf.keras; take the argmax of the softmax outputs instead
    result_classes = np.argmax(result_probs, axis=-1)

    # Node probabilities for testing sample events
    result_probs_test = model.predict(np.array(X_test))
    result_classes_test = np.argmax(result_probs_test, axis=-1)

    # Store model in file
    model_output_name = os.path.join(output_directory, 'model.h5')
    model.save(model_output_name)
    weights_output_name = os.path.join(output_directory, 'model_weights.h5')
    model.save_weights(weights_output_name)
    model_json = model.to_json()
    model_json_name = os.path.join(output_directory, 'model_serialised.json')
    with open(model_json_name, 'w') as json_file:
        json_file.write(model_json)
    model.summary()
    model_schematic_name = os.path.join(output_directory,
                                        'model_schematic.png')
    plot_model(model,
               to_file=model_schematic_name,
               show_shapes=True,
               show_layer_names=True)

    # Initialise output directory.
    Plotter.plots_directory = plots_dir
    Plotter.output_directory = output_directory

    # Make overfitting plots of output nodes
    Plotter.overfitting(model, Y_train, Y_test, result_probs,
                        result_probs_test, plots_dir, train_weights,
                        test_weights)

    # Get true process values for testing dataset
    original_encoded_test_Y = []
    for i in range(len(result_probs_test)):
        if Y_test[i][0] == 1:
            original_encoded_test_Y.append(0)
        if Y_test[i][1] == 1:
            original_encoded_test_Y.append(1)
        if Y_test[i][2] == 1:
            original_encoded_test_Y.append(2)
        if Y_test[i][3] == 1:
            original_encoded_test_Y.append(3)

    # Get true process integers for training dataset
    original_encoded_train_Y = []
    for i in range(len(result_probs)):
        if Y_train[i][0] == 1:
            original_encoded_train_Y.append(0)
        if Y_train[i][1] == 1:
            original_encoded_train_Y.append(1)
        if Y_train[i][2] == 1:
            original_encoded_train_Y.append(2)
        if Y_train[i][3] == 1:
            original_encoded_train_Y.append(3)

    # Map predicted class indices back to the original labels
    result_classes_test = newencoder.inverse_transform(result_classes_test)
    result_classes_train = newencoder.inverse_transform(result_classes)

    # Create confusion matrices for training and testing performance
    Plotter.conf_matrix(original_encoded_train_Y, result_classes_train,
                        train_weights, 'index')
    Plotter.save_plots(dir=plots_dir,
                       filename='yields_norm_confusion_matrix_TRAIN.png')
    Plotter.conf_matrix(original_encoded_test_Y, result_classes_test,
                        test_weights, 'index')
    Plotter.save_plots(dir=plots_dir,
                       filename='yields_norm_confusion_matrix_TEST.png')

    Plotter.conf_matrix(original_encoded_train_Y, result_classes_train,
                        train_weights, '')
    Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TRAIN.png')
    Plotter.conf_matrix(original_encoded_test_Y, result_classes_test,
                        test_weights, '')
    Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TEST.png')

    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs,
                        original_encoded_test_Y, result_probs_test, 0,
                        'ttHnode')
    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs,
                        original_encoded_test_Y, result_probs_test, 1, 'Other')
    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs,
                        original_encoded_test_Y, result_probs_test, 2,
                        'ttWnode')
    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs,
                        original_encoded_test_Y, result_probs_test, 3,
                        'tHQnode')
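
The example above builds its network through baseline_model(num_variables, optimizer, number_of_classes), which is defined elsewhere in the author's repository and is not shown on this page. A minimal sketch of what such a four-class Keras classifier could look like, assuming a simple fully connected architecture with a softmax output (the layer sizes, activations and loss below are illustrative assumptions, not the original configuration):

# Hypothetical sketch of the baseline_model helper used above.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def baseline_model(num_variables, optimizer, number_of_classes):
    # Simple fully connected classifier with one softmax output node per class.
    model = Sequential()
    model.add(Dense(64, input_dim=num_variables, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(number_of_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

A model of this shape is compatible with the calls made above: it accepts the one-hot encoded targets produced by to_categorical and returns per-class probabilities from predict.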
Example No. 2
def main():
    print 'Using Keras version: ', keras.__version__

    usage = 'usage: %prog [options]'
    parser = argparse.ArgumentParser(usage)
    parser.add_argument(
        '-t',
        '--train_model',
        dest='train_model',
        help=
        'Option to train model or simply make diagnostic plots (0=False, 1=True)',
        default=0,
        type=int)
    parser.add_argument('-r',
                        '--region',
                        dest='region',
                        help='Option to choose SigRegion or CtrlRegion',
                        default='SigRegion',
                        type=str)
    parser.add_argument(
        '-w',
        '--classweights',
        dest='classweights',
        help=
        'Option to choose class weights (InverseNEventsTR, InverseSRYields or BalancedWeights)',
        default='InverseNEventsTR',
        type=str)
    parser.add_argument('-s',
                        '--sel',
                        dest='selection',
                        help='Option to choose selection',
                        default='geq4j',
                        type=str)
    args = parser.parse_args()
    do_model_fit = args.train_model
    region = args.region
    classweights_name = args.classweights
    selection = args.selection
    number_of_classes = 4

    # Create instance of output directory where all results are saved.
    output_directory = '2019-06-18_CNN_LLFOnly_FunkAPI_particleinput_%s_%s_%s/' % (
        selection, classweights_name, region)

    check_dir(output_directory)

    # Create plots subdirectory
    plots_dir = os.path.join(output_directory, 'plots/')

    lowlevel_invar_jsonFile = open('LOWLEVEL_invars_conv1DNN.json', 'r')
    highlevel_invar_jsonFile = open('HIGHLEVEL_invars_conv1DNN.json', 'r')

    if selection == 'geq4j':
        selection_criteria = 'Jet_numLoose>=4'
    elif selection == 'geq3j':
        selection_criteria = 'Jet_numLoose>=3'
    elif selection == 'eeq3j':
        selection_criteria = 'Jet_numLoose==3'
    else:
        raise ValueError('Unknown selection: %s' % selection)

    # WARNING !!!!
    #variable_list = json.load(input_var_jsonFile,encoding="utf-8").items()
    lowlevel_invar_list = json.load(lowlevel_invar_jsonFile,
                                    encoding="utf-8").items()
    highlevel_invar_list = json.load(highlevel_invar_jsonFile,
                                     encoding="utf-8").items()

    lowlevel_column_headers = []
    for key, var in lowlevel_invar_list:
        lowlevel_column_headers.append(key)
    lowlevel_column_headers.append('EventWeight')
    lowlevel_column_headers.append('xsec_rwgt')

    highlevel_column_headers = []
    for key, var in highlevel_invar_list:
        if 'hadTop_BDT' in key:
            key = 'hadTop_BDT'
        if 'Hj1_BDT' in key:
            key = 'Hj1_BDT'
        if 'Hj_tagger_hadTop' in key:
            key = 'Hj_tagger_hadTop'
        highlevel_column_headers.append(key)
    highlevel_column_headers.append('EventWeight')
    highlevel_column_headers.append('xsec_rwgt')

    # Create instance of the input files directory
    inputs_file_path = '/afs/cern.ch/work/j/jthomasw/private/IHEP/ttHML/github/ttH_multilepton/keras-DNN/samples/Training_samples_looselepsel/'

    print 'Getting files from:', inputs_file_path
    lowlevel_features_DF_name = '%s/lowlevel_features_DF_%s_%s.csv' % (
        output_directory, region, selection)
    highlevel_features_DF_name = '%s/highlevel_features_DF_%s_%s.csv' % (
        output_directory, region, selection)

    if os.path.isfile(lowlevel_features_DF_name):
        lowlevel_features_data = pandas.read_csv(lowlevel_features_DF_name)
        print 'Loading %s . . . . ' % (lowlevel_features_DF_name)
    else:
        print 'Creating and loading new data file in %s . . . . ' % (
            inputs_file_path)
        lowlevel_features_data = load_data(inputs_file_path,
                                           lowlevel_column_headers,
                                           selection_criteria)
        lowlevel_features_data.to_csv(lowlevel_features_DF_name, index=False)
        lowlevel_features_data = pandas.read_csv(lowlevel_features_DF_name)

    if os.path.isfile(highlevel_features_DF_name):
        highlevel_features_data = pandas.read_csv(highlevel_features_DF_name)
        print 'Loading %s . . . . ' % (highlevel_features_DF_name)
    else:
        print 'Creating and loading new data file in %s . . . . ' % (
            inputs_file_path)
        highlevel_features_data = load_data(inputs_file_path,
                                            highlevel_column_headers,
                                            selection_criteria)
        highlevel_features_data.to_csv(highlevel_features_DF_name, index=False)
        highlevel_features_data = pandas.read_csv(highlevel_features_DF_name)

    Plotter = plotter()

    # Split pandas dataframe into train/test
    lowlevel_traindataset, lowlevel_valdataset = train_test_split(
        lowlevel_features_data, test_size=0.2)
    highlevel_traindataset, highlevel_valdataset = train_test_split(
        highlevel_features_data, test_size=0.2)

    print 'LOWLEVEL train dataset shape [ Nexamples: %s , Nfeatures: %s ]' % (
        lowlevel_traindataset.shape[0], lowlevel_traindataset.shape[1])
    print 'LOWLEVEL validation dataset shape [ Nexamples: %s , Nfeatures: %s ]' % (
        lowlevel_valdataset.shape[0], lowlevel_valdataset.shape[1])
    print 'HIGHLEVEL train dataset shape [ Nexamples: %s , Nfeatures: %s ]' % (
        highlevel_traindataset.shape[0], highlevel_traindataset.shape[1])
    print 'HIGHLEVEL validation dataset shape [ Nexamples: %s , Nfeatures: %s ]' % (
        highlevel_valdataset.shape[0], highlevel_valdataset.shape[1])

    #feature_corr_df = lowlevel_traindataset + highlevel_traindataset
    lowlevel_train_df = lowlevel_features_data.iloc[:lowlevel_traindataset.
                                                    shape[0]]
    lowlevel_train_df.drop(['EventWeight'], axis=1, inplace=True)
    lowlevel_train_df.drop(['xsec_rwgt'], axis=1, inplace=True)
    highlevel_train_df = highlevel_features_data.iloc[:highlevel_traindataset.
                                                      shape[0]]
    highlevel_train_df.drop(['EventWeight'], axis=1, inplace=True)
    highlevel_train_df.drop(['xsec_rwgt'], axis=1, inplace=True)

    # calculate event weights
    train_weights = lowlevel_traindataset[
        'EventWeight'].values * lowlevel_traindataset['xsec_rwgt'].values
    test_weights = lowlevel_valdataset[
        'EventWeight'].values * lowlevel_valdataset['xsec_rwgt'].values

    # Remove the last two columns (EventWeight, xsec_rwgt) from the column lists used for training
    lowlevel_training_columns = lowlevel_column_headers[:-2]
    highlevel_training_columns = highlevel_column_headers[:-2]

    # Collect just the values for the variables used in the training and testing data sets.
    lowlevel_X_train = lowlevel_traindataset[lowlevel_training_columns].values
    lowlevel_X_test = lowlevel_valdataset[lowlevel_training_columns].values

    reshaped_3D_data = reshape_for_particle_rep(lowlevel_traindataset,
                                                lowlevel_training_columns)

    reshaped_3D_data_test = reshape_for_particle_rep(
        lowlevel_valdataset, lowlevel_training_columns)

    highlevel_X_train = highlevel_traindataset[
        highlevel_training_columns].values
    highlevel_X_test = highlevel_valdataset[highlevel_training_columns].values

    Y_train = lowlevel_traindataset.target.astype(int)
    Y_test = lowlevel_valdataset.target.astype(int)

    # Need to reshape data to have spatial dimension for conv1d
    lowlevel_X_train = np.expand_dims(lowlevel_X_train, axis=-1)
    lowlevel_X_test = np.expand_dims(lowlevel_X_test, axis=-1)
    #print 'Reshaped lowlevel_data to include spatial dimension for conv1d. New shape = ', lowlevel_X_train.shape

    lowlevel_num_variables = len(lowlevel_training_columns)
    highlevel_num_variables = len(highlevel_training_columns)

    ## Input Variable Correlations
    lowlevel_correlation_plot_file_name = 'lowlevel_correlation_plot.png'
    #Plotter.correlation_matrix(lowlevel_train_df)
    #Plotter.save_plots(dir=plots_dir, filename=lowlevel_correlation_plot_file_name)

    highlevel_correlation_plot_file_name = 'highlevel_correlation_plot.png'
    #Plotter.correlation_matrix(highlevel_train_df)
    #Plotter.save_plots(dir=plots_dir, filename=highlevel_correlation_plot_file_name)

    # =============== Weights ==================
    # WARNING! 'sample_weight' will override 'class_weight'
    # ==========================================
    # Sample                    |       ttH       |      tt+jets       |       ttW        |       ttZ        |
    ############################
    #  ======= geq 4 Jets ======
    ############################
    # Loose lepton TR selection
    ############################
    # XS                              0.2118              831.                0.2043            0.2529
    # # events in TR            |     221554      |      1168897       |      321674      |      204998      |
    # Sum of weights:           |    94.379784    |   7372.112793      |    206.978439    |    122.834419    |
    # Yields 2LSS SR HIG 18-019 |      60.08      | 140.25+22.79+17.25 |      151.03      |      87.05       |
    #                                                    =180.29
    ############################
    #= Control Region (== 3 Jets)
    ############################
    # Loose lepton TR selection
    ############################
    # # events in TR            |    39418        |      568724        |      111809      |      58507       |
    # Sum of weights            |   24.269867     |    3807.655762     |    102.885391    |     58.825554    |
    # Yields 2LSS ttWctrl       |    14.36        |    120.54 + 9.55   |       75.97      |      38.64       |
    #   AN2018-098-v18

    # Yields 2LSS SR HIG 18-019 |       60.08        | 140.25+22.79+17.25 |       151.03       |      87.05       |
    # Yields 2LSS ttWctrl       |       14.36        |    120.54 + 9.55   |        75.97       |      38.64       |
    # Yields 2LSS >= 3 jets     |       74.44        |        310.38      |       227.00       |     125.69       |

    if classweights_name == 'InverseSRYields':
        if selection == 'geq4j':
            tuned_weighted = {
                0: 0.0166445,
                1: 0.00554662,
                2: 0.00662120,
                3: 0.0114877
            }
        if selection == 'geq3j':
            tuned_weighted = {
                0: 0.01343363782,
                1: 0.00322185707,
                2: 0.00440528634,
                3: 0.00795608242
            }
    elif classweights_name == 'InverseNEventsTR':
        tuned_weighted = {
            0: 0.00000451357,
            1: 0.000000855507,
            2: 0.00000310874,
            3: 0.00000487810
        }
    elif classweights_name == 'InverseSumWeightsTR':
        tuned_weighted = {
            0: 0.01059548939,
            1: 0.00013564632,
            2: 0.00483142111,
            3: 0.00814104066
        }

    print 'class weights : ', classweights_name

    # Fit label encoder to Y_train
    newencoder = LabelEncoder()
    newencoder.fit(Y_train)
    # Transform to encoded array
    encoded_Y = newencoder.transform(Y_train)
    encoded_Y_test = newencoder.transform(Y_test)
    # Transform to one hot encoded arrays
    Y_train = np_utils.to_categorical(encoded_Y)
    Y_test = np_utils.to_categorical(encoded_Y_test)

    #print 'num_variables = ',num_variables
    optimizer = 'Adam'
    if do_model_fit == 1:

        # Training new model
        histories = []
        labels = []
        early_stopping_monitor = EarlyStopping(patience=50,
                                               monitor='val_loss',
                                               verbose=1)

        # Lists for HP scan
        #optimizers = ['Adamax','Adam','Nadam']
        #batchessize = np.array([100,200,500,1000])

        # Define a model
        #model3 = baseline_model(lowlevel_num_variables, optimizer)
        print 'Low-level data shape:'
        print reshaped_3D_data.shape
        model4 = functional_CNN_model(reshaped_3D_data.shape[1], optimizer,
                                      highlevel_num_variables)

        # Fit the model using training data.
        # Batch size = number of examples before updating weights (larger = faster training)
        #history4 = model4.fit([reshaped_3D_data,highlevel_X_train],Y_train,validation_split=0.2,epochs=200,batch_size=1000,verbose=1,shuffle=True,class_weight=tuned_weighted,callbacks=[early_stopping_monitor])
        history4 = model4.fit(reshaped_3D_data,
                              Y_train,
                              validation_split=0.2,
                              epochs=200,
                              batch_size=1000,
                              verbose=1,
                              shuffle=True,
                              class_weight=tuned_weighted,
                              callbacks=[early_stopping_monitor])

        # Store history for performance by epoch plot.
        histories.append(history4)
        labels.append(optimizer)
        Plotter.plot_training_progress_acc(histories, labels)
        acc_progress_filename = 'DNN_acc_wrt_epoch.png'
        Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename)

        # Which model do you want the rest of the plots for?
        #model = model3
        model = model4
    else:
        # Which model do you want to load?
        model_name = os.path.join(output_directory, 'model.h5')
        print 'Loading  %s' % (model_name)
        #model = load_trained_model(model_name, num_variables, optimizer)
        model = load_trained_CNN_model(model_name, reshaped_3D_data.shape[1],
                                       optimizer, highlevel_num_variables)

    # Node probabilities for training sample events
    # Is this the same as in the DNN case?
    result_probs_train = model.predict([reshaped_3D_data, highlevel_X_train])
    # Get maximum probability
    result_classes_train = result_probs_train.argmax(axis=-1)

    # Node probabilities for testing sample events
    result_probs_test = model.predict(
        [reshaped_3D_data_test, highlevel_X_test])
    result_classes_test = result_probs_test.argmax(axis=-1)

    # Store model in hdf5 format
    model_output_name = os.path.join(output_directory, 'model.h5')
    model.save(model_output_name)

    # Save the model weights separately as well, in hdf5 format
    weights_output_name = os.path.join(output_directory, 'model_weights.h5')
    model.save_weights(weights_output_name)

    # Make sure to save model in json format as well
    model_json = model.to_json()
    model_json_name = os.path.join(output_directory, 'model_serialised.json')
    with open(model_json_name, 'w') as json_file:
        json_file.write(model_json)

    model.summary()

    model_schematic_name = os.path.join(output_directory,
                                        'model_schematic.png')
    #plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True)

    # Initialise output directory where plotter results will be saved.
    Plotter.output_directory = output_directory

    # Make overfitting plots
    #Plotter.overfitting(model, Y_train, Y_test, result_probs_train, result_probs_test, plots_dir, train_weights, test_weights)

    # Make list of true labels e.g. (0,1,2,3)
    original_encoded_test_Y = []
    for i in xrange(len(result_probs_test)):
        if Y_test[i][0] == 1:
            original_encoded_test_Y.append(0)
        if Y_test[i][1] == 1:
            original_encoded_test_Y.append(1)
        if Y_test[i][2] == 1:
            original_encoded_test_Y.append(2)
        if Y_test[i][3] == 1:
            original_encoded_test_Y.append(3)

    original_encoded_train_Y = []
    for i in xrange(len(result_probs_train)):
        if Y_train[i][0] == 1:
            original_encoded_train_Y.append(0)
        if Y_train[i][1] == 1:
            original_encoded_train_Y.append(1)
        if Y_train[i][2] == 1:
            original_encoded_train_Y.append(2)
        if Y_train[i][3] == 1:
            original_encoded_train_Y.append(3)

    # Invert LabelEncoder transform back to original truth labels
    result_classes_train = newencoder.inverse_transform(result_classes_train)
    result_classes_test = newencoder.inverse_transform(result_classes_test)

    Plotter.plots_directory = plots_dir

    # Create confusion matrices for training and testing performance
    #Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'index')
    #Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TRAIN.png')
    #Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'index')
    #Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TEST.png')

    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs_train,
                        original_encoded_test_Y, result_probs_test, 0,
                        'ttHnode')
    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs_train,
                        original_encoded_test_Y, result_probs_test, 1,
                        'ttJnode')
    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs_train,
                        original_encoded_test_Y, result_probs_test, 2,
                        'ttWnode')
    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs_train,
                        original_encoded_test_Y, result_probs_test, 3,
                        'ttZnode')
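
Example No. 2 feeds its Conv1D branch through reshape_for_particle_rep, which turns the flat per-event list of low-level features into a 3D array of shape (events, particles, features per particle). The helper itself is not shown here; the sketch below is a plausible reconstruction, assuming the low-level columns are ordered particle by particle with a fixed number of features each (the default of 4 features per particle is an assumption):

import numpy as np

def reshape_for_particle_rep(dataset, training_columns, features_per_particle=4):
    # Hypothetical helper: group the flat per-event feature vector into a
    # (n_events, n_particles, features_per_particle) array for Conv1D input.
    flat = dataset[training_columns].values
    n_events = flat.shape[0]
    n_particles = flat.shape[1] // features_per_particle
    return flat[:, :n_particles * features_per_particle].reshape(
        n_events, n_particles, features_per_particle)

With this representation, reshaped_3D_data.shape[1] (used above when building the model) is the number of particles per event, and the last axis carries the per-particle features.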
Example No. 3
def main():
    print 'Using Keras version: ', keras.__version__

    usage = 'usage: %prog [options]'
    parser = argparse.ArgumentParser(usage)
    parser.add_argument(
        '-t',
        '--train_model',
        dest='train_model',
        help=
        'Option to train model or simply make diagnostic plots (0=False, 1=True)',
        default=0,
        type=int)
    parser.add_argument('-s',
                        '--sel',
                        dest='selection',
                        help='Option to choose selection',
                        default='tH',
                        type=str)
    args = parser.parse_args()
    do_model_fit = args.train_model
    selection = args.selection

    # Number of classes to use
    number_of_classes = 4

    # Create instance of output directory where all results are saved.
    #output_directory = '2017tautag2p1samples_EVw8s_oldvars_%s_selection/' % (selection)
    output_directory = '2017samples_xmasupdates_%s_selection/' % (selection)

    check_dir(output_directory)

    # Create plots subdirectory
    plots_dir = os.path.join(output_directory, 'plots/')

    input_var_jsonFile = open('input_vars_SigRegion_wFwdJet.json', 'r')
    #input_var_jsonFile = open('input_features_new.json','r')

    if selection == 'tH':
        selection_criteria = '(is_tH_like_and_not_ttH_like==0 || is_tH_like_and_not_ttH_like==1)'
    else:
        raise ValueError('Unknown selection: %s' % selection)

    # Load Variables from .json
    variable_list = json.load(input_var_jsonFile, encoding="utf-8").items()

    # Create list of headers for dataset .csv
    column_headers = []
    for key, var in variable_list:
        column_headers.append(key)
    column_headers.append('EventWeight')
    column_headers.append('xsec_rwgt')
    column_headers.append('nEvent')

    # Create instance of the input files directory
    inputs_file_path = '/afs/cern.ch/work/j/jthomasw/private/IHEP/ttHML/github/ttH_multilepton/keras-DNN/samples/rootplas_LegacyMVA_update_mbb_20191229/DiLepRegion/ttH2017TrainDNN2L/'

    # Load ttree into .csv including all variables listed in column_headers
    print '<train-DNN> Input file path: ', inputs_file_path
    outputdataframe_name = '%s/output_dataframe_%s.csv' % (output_directory,
                                                           selection)
    if os.path.isfile(outputdataframe_name):
        data = pandas.read_csv(outputdataframe_name)
        print '<train-DNN> Loading data .csv from: %s . . . . ' % (
            outputdataframe_name)
    else:
        print '<train-DNN> Creating new data .csv @: %s . . . . ' % (
            inputs_file_path)
        data = load_data(inputs_file_path, column_headers, selection_criteria)
        data.to_csv(outputdataframe_name, index=False)
        data = pandas.read_csv(outputdataframe_name)

    # Make instance of plotter tool
    Plotter = plotter()

    # Create statistically independent train/test datasets (used to train/evaluate the network)
    traindataset, valdataset = train_test_split(data, test_size=0.2)
    valdataset.to_csv((output_directory + 'valid_dataset.csv'), index=False)

    #print '<train-DNN> Training dataset shape: ', traindataset.shape
    #print '<train-DNN> Validation dataset shape: ', valdataset.shape

    # Remove the last three columns (EventWeight, xsec_rwgt, nEvent) from the training features
    training_columns = column_headers[:-3]
    print '<train-DNN> Training features: ', training_columns

    # Select data from columns under the remaining column headers in traindataset
    X_train = traindataset[training_columns].values

    # Select data from 'target' as target for MVA
    Y_train = traindataset.target.astype(int)
    X_test = valdataset[training_columns].values
    Y_test = valdataset.target.astype(int)

    num_variables = len(training_columns)

    # Create dataframe containing input features only (for correlation matrix)
    train_df = data.iloc[:traindataset.shape[0]]
    train_df.drop(['EventWeight'], axis=1, inplace=True)
    train_df.drop(['xsec_rwgt'], axis=1, inplace=True)

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot.png'
    Plotter.correlation_matrix(train_df)
    Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name)

    ####################################################################################
    # Weights applied during training. You will also need to update the class weights if
    # you are going to change the event weights applied. Introduce class weights and any
    # event weight you want to use here.
    #sampleweights = traindataset.loc[:,'sampleWeight']*traindataset.loc[:,'EventWeight']
    sampleweights = traindataset.loc[:,
                                     'sampleWeight'] * traindataset.loc[:,
                                                                        'xsec_rwgt']
    sampleweights = np.array(sampleweights)

    # Per-event weights so event weights can be applied correctly in the diagnostic plots.
    # Use a separate list because class weights should not enter the plots.
    #train_weights = traindataset['EventWeight'].values
    #test_weights = valdataset['EventWeight'].values
    train_weights = traindataset['xsec_rwgt'].values
    test_weights = valdataset['xsec_rwgt'].values

    # Fit label encoder to Y_train
    newencoder = LabelEncoder()
    newencoder.fit(Y_train)
    # Transform to encoded array
    encoded_Y = newencoder.transform(Y_train)
    encoded_Y_test = newencoder.transform(Y_test)
    # Transform to one hot encoded arrays
    Y_train = np_utils.to_categorical(encoded_Y)
    Y_test = np_utils.to_categorical(encoded_Y_test)

    optimizer = 'Adam'  #'Nadam'
    if do_model_fit == 1:
        histories = []
        labels = []
        # Define model and early stopping
        early_stopping_monitor = EarlyStopping(patience=150,
                                               monitor='val_loss',
                                               verbose=1)
        model3 = baseline_model(num_variables, optimizer, number_of_classes)

        # Fit the model
        # Batch size = number of examples processed before the weights are updated (larger batches train faster per epoch)
        # Epoch = one full pass over the training data (useful for periodic logging and evaluation)
        history3 = model3.fit(X_train,
                              Y_train,
                              validation_split=0.2,
                              epochs=300,
                              batch_size=1500,
                              verbose=1,
                              shuffle=True,
                              sample_weight=sampleweights,
                              callbacks=[early_stopping_monitor])
        histories.append(history3)
        labels.append(optimizer)

        # Make plot of loss function evolution
        Plotter.plot_training_progress_acc(histories, labels)
        acc_progress_filename = 'DNN_acc_wrt_epoch.png'
        Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename)
        # Which model do you want the rest of the plots for?
        model = model3
    else:
        # Which model do you want to load?
        model_name = os.path.join(output_directory, 'model.h5')
        print '<train-DNN> Loaded Model: %s' % (model_name)
        model = load_trained_model(model_name, num_variables, optimizer,
                                   number_of_classes)
        '''continuetraining=1
        if continuetraining == 1:
            new_model = load_model(model_name)
            assert_allclose(new_model.predict(X_train),new_model.predict(X_train),1e-5)
            checkpoint = ModelCheckpoint(model_name, monitor='loss', verbose=1, save_best_only=True, mode='min')
            callbacks_list = [checkpoint]
            history3 = new_model.fit(X_train,Y_train,validation_split=0.2,epochs=50,batch_size=1500,verbose=1,shuffle=True,sample_weight=sampleweights,callbacks=callbacks_list)'''

    # Node probabilities for training sample events
    result_probs = model.predict(np.array(X_train))
    result_classes = model.predict_classes(np.array(X_train))

    # Node probabilities for testing sample events
    result_probs_test = model.predict(np.array(X_test))
    result_classes_test = model.predict_classes(np.array(X_test))

    # Store model in file
    model_output_name = os.path.join(output_directory, 'model.h5')
    model.save(model_output_name)
    weights_output_name = os.path.join(output_directory, 'model_weights.h5')
    model.save_weights(weights_output_name)
    model_json = model.to_json()
    model_json_name = os.path.join(output_directory, 'model_serialised.json')
    with open(model_json_name, 'w') as json_file:
        json_file.write(model_json)
    model.summary()
    model_schematic_name = os.path.join(output_directory,
                                        'model_schematic.png')
    plot_model(model,
               to_file=model_schematic_name,
               show_shapes=True,
               show_layer_names=True)

    # Initialise output directory.
    Plotter.plots_directory = plots_dir
    Plotter.output_directory = output_directory

    # Make overfitting plots of output nodes
    #Plotter.overfitting(model, Y_train, Y_test, result_probs, result_probs_test, plots_dir, train_weights, test_weights)

    # Get true process values for testing dataset
    original_encoded_test_Y = []
    for i in xrange(len(result_probs_test)):
        if Y_test[i][0] == 1:
            original_encoded_test_Y.append(0)
        if Y_test[i][1] == 1:
            original_encoded_test_Y.append(1)
        if Y_test[i][2] == 1:
            original_encoded_test_Y.append(2)
        if Y_test[i][3] == 1:
            original_encoded_test_Y.append(3)

    # Get true process integers for training dataset
    original_encoded_train_Y = []
    for i in xrange(len(result_probs)):
        if Y_train[i][0] == 1:
            original_encoded_train_Y.append(0)
        if Y_train[i][1] == 1:
            original_encoded_train_Y.append(1)
        if Y_train[i][2] == 1:
            original_encoded_train_Y.append(2)
        if Y_train[i][3] == 1:
            original_encoded_train_Y.append(3)

    # Map predicted class indices back to the original labels
    result_classes_test = newencoder.inverse_transform(result_classes_test)
    result_classes_train = newencoder.inverse_transform(result_classes)

    # Create confusion matrices for training and testing performance
    '''Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'index')
    Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TRAIN.png')
    Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'index')
    Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TEST.png')

    Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'columns')
    Plotter.save_plots(dir=plots_dir, filename='yields_norm_columns_confusion_matrix_TRAIN.png')
    Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'columns')
    Plotter.save_plots(dir=plots_dir, filename='yields_norm_columns_confusion_matrix_TEST.png')
    '''
    '''Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'')
    Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TRAIN.png')
    Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'')
    Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TEST.png')'''
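
Examples No. 1 and No. 3 reload a previously trained network through load_trained_model(model_name, num_variables, optimizer, number_of_classes) when the -t 0 option is used. That helper is defined in the author's repository and is not reproduced here; a minimal sketch under the assumption that it simply restores the full model saved earlier with model.save():

# Hypothetical sketch of the load_trained_model helper used above.
from tensorflow.keras.models import load_model

def load_trained_model(model_path, num_variables, optimizer, number_of_classes):
    # Simplest interpretation: restore the full model written by model.save().
    # The extra arguments would only matter if the architecture were rebuilt
    # by hand (e.g. via baseline_model) before calling load_weights() instead.
    return load_model(model_path)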
Example No. 4
def main():
    print ''
    DNN_applier = apply_DNN()

    usage = 'usage: %prog [options]'
    parser = argparse.ArgumentParser(usage)

    parser.add_argument(
        '-p',
        '--processName',
        dest='processName',
        help=
        'Process name. List of options in keys of process_filename dictionary',
        default=[],
        type=str,
        nargs='+')
    parser.add_argument('-r',
                        '--region',
                        dest='region',
                        help='Option to choose e.g. DiLepRegion',
                        default='DiLepRegion',
                        type=str)
    parser.add_argument(
        '-j',
        '--JES',
        dest='JES',
        help=
        'Option to choose whether to run on JES Syst samples (0=Nominal, 1=JESUp, 2=JESDown)',
        default=0,
        type=int)
    parser.add_argument('-s',
                        '--sel',
                        dest='selection',
                        help='Option to choose selection',
                        default='tH',
                        type=str)
    parser.add_argument('-y',
                        '--year',
                        dest='year',
                        help='Option to choose year settings',
                        default='2017',
                        type=str)

    args = parser.parse_args()
    processes = args.processName
    region = args.region
    JES_flag = args.JES
    selection = args.selection
    nClasses = 4
    annum = args.year

    print '<unit_test_evaluation> Successfully parsed arguments: processName= [%s], region= %s, JES_flag= %s , selection= %s' % (
        processes, region, JES_flag, selection)

    #outputname = '2017samples_tH_tunedweights_%s' % (selection)
    outputname = 'debug_%s' % (selection)

    input_var_jsonFile = ''

    if JES_flag == 1:
        outputname = outputname + '_JESUp'
    if JES_flag == 2:
        outputname = outputname + '_JESDown'

    # Open and load input variable .json

    input_var_jsonFile = open('../input_vars_SigRegion_wFwdJet.json', 'r')
    variable_list = json.load(input_var_jsonFile, encoding="utf-8").items()

    # Append variables to a list of column headers for .csv file later
    column_headers = []
    for key, var in variable_list:
        column_headers.append(key)
    column_headers.append('EventWeight')
    column_headers.append('nEvent')

    if JES_flag == 0:
        JESname = ''
    elif JES_flag == 1:
        JESname = 'JESUp'
    elif JES_flag == 2:
        JESname = 'JESDown'

    # Dictionary of filenames to be run over along with their keys.
    process_filename = {
        'ttH_HWW': ('TTH_hww_' + JESname + region),
        'ttH_Hmm': ('TTH_hmm_' + JESname + region),
        'ttH_Htautau': ('TTH_htt_' + JESname + region),
        'ttH_HZZ': ('TTH_hzz_' + JESname + region),
        'ttH_HZG': ('TTH_hzg_' + JESname + region),
        'tHq_HWW': ('THQ_hww_' + JESname + region),
        'tHq_Htautau': ('THQ_htt_' + JESname + region),
        'tHq_HZZ': ('THQ_hzz_' + JESname + region),
        'tHq_HMM': ('THQ_hmm_' + JESname + region),
        'tHq_HZG': ('THQ_hzg_' + JESname + region),
        'tHW_HWW': ('THW_hww_' + JESname + region),
        'tHW_Htautau': ('THW_htt_' + JESname + region),
        'tHW_HZZ': ('THW_hzz_' + JESname + region),
        'tHW_HMM': ('THW_hmm_' + JESname + region),
        'tHW_HZG': ('THW_hzg_' + JESname + region),
        'ttWW': ('TTWW_' + JESname + region),
        'ttW': ('TTW_' + JESname + region),
        'ttZ': ('TTZ_' + JESname + region),
        'Conv': ('Convs_' + JESname + region),
        'EWK': ('EWK_' + JESname + region),
        'Fakes': ('Fakes_' + JESname + region),
        'Flips': ('Flips_' + JESname + region),
        'Rares': ('Rares_' + JESname + region),
        'FakeSub': ('FakeSub_' + JESname + region),
        'ttbar_closure': ('TT_Clos' + JESname + region),
        'Data': ('Data_' + JESname + region)
    }

    # Remove 'EventWeight' and 'nEvent' from the columns used in training
    training_columns = column_headers[:-2]
    num_variables = len(training_columns)

    # Name of directory that contains trained MVA model to apply.
    input_models_path = ''

    if selection == 'tH':
        input_models_path = ['2017samples_tH_tunedweights']

    # Load trained model
    optimizer = 'Adam'
    model_name_1 = os.path.join('../', input_models_path[0], 'model.h5')
    model_1 = DNN_applier.load_trained_model(model_name_1, num_variables,
                                             optimizer, nClasses)

    # Make instance of plotter class
    Plotter = plotter()

    # Lists for all events in all files. Used to make diagnostic plots of the network's performance over all samples.
    true_process = []
    model1_probs_ = []
    model1_pred_process = []
    EventWeights_ = []

    # Now loop over all samples
    for process in processes:
        print '<unit_test_evaluation> Process: ', process
        current_sample_name = process_filename.get(process)

        # Use JES flag to decide if we are running on a JES varied sample or not.
        if JES_flag == 1:
            inputs_file_path = 'b/binghuan/Rootplas/Legacy/rootplas_LegacyAll_1110/%s%s/' % (
                'JESUp', region)
        elif JES_flag == 2:
            inputs_file_path = 'b/binghuan/Rootplas/Legacy/rootplas_LegacyAll_1110/%s%s/' % (
                'JESDown', region)
        else:
            inputs_file_path = 'b/binghuan/Rootplas/Legacy/rootplas_LegacyAll_1110/%s/%s/%s/' % (
                region, annum, region)

        print '<unit_test_evaluation> Input file directory: ', inputs_file_path

        # Make final output directory
        samples_dir_w_appended_DNN = 'samples_w_DNN'
        if not os.path.exists(samples_dir_w_appended_DNN):
            os.makedirs(samples_dir_w_appended_DNN)
        samples_final_path_dir = os.path.join(samples_dir_w_appended_DNN,
                                              outputname)
        if not os.path.exists(samples_final_path_dir):
            os.makedirs(samples_final_path_dir)

        if JES_flag == 1:
            JES_label = 'JESUp'
        elif JES_flag == 2:
            JES_label = 'JESDown'
        else:
            JES_label = 'nominal'

        dataframe_name = '%s/%s_dataframe_%s_%s.csv' % (
            samples_final_path_dir, process, region, JES_label)
        if os.path.isfile(dataframe_name):
            print '<unit_test_evaluation> Loading %s . . . . ' % dataframe_name
            data = pandas.read_csv(dataframe_name)
        else:
            print '<unit_test_evaluation> Making *new* data file from %s . . . . ' % (
                inputs_file_path)
            print '<unit_test_evaluation> Applying selection ', selection
            selection_criteria = ''
            if selection == 'tH':
                selection_criteria = '(is_tH_like_and_not_ttH_like==0 || is_tH_like_and_not_ttH_like==1)'  #&& n_presel_jet>=3'
            data = DNN_applier.load_data(inputs_file_path, column_headers,
                                         selection_criteria, process,
                                         process_filename.get(process))
            if len(data) == 0:
                print '<unit_test_evaluation> No data! Next file.'
                continue
            print 'Saving new data .csv file at %s . . . . ' % (dataframe_name)
            data.to_csv(dataframe_name, index=False)

        nEvent = data['nEvent']

        # Predict on the full dataset at once instead of looping over entries (Keras expects arrays).
        print '<unit_test_evaluation> Input features: ', training_columns
        X_test = data.iloc[:, 0:num_variables]
        X_test = X_test.values
        result_probs_ = model_1.predict(np.array(X_test))

        # Create dictionary where the value is the array of probabilities for the four categories and the key is the event number.
        eventnum_resultsprob_dict = {}
        for index in range(result_probs_.shape[0]):
            eventnum_resultsprob_dict[nEvent[index]] = result_probs_[index]
            model1_probs_.append(result_probs_[index])

        inputlist = DNN_applier.getEOSlslist(directory=inputs_file_path +
                                             current_sample_name + ".root")
        current_file = str(inputlist[0])
        print '<unit_test_evaluation> Input file: ', current_file

        # Open files and load ttrees
        data_file = TFile.Open(current_file)
        data_tree = data_file.Get("syncTree")

        # Check if input file is zombie
        if data_file.IsZombie():
            raise IOError('missing file')

        output_file_name = '%s/%s.root' % (samples_final_path_dir,
                                           process_filename.get(process))
        print '<unit_test_evaluation> Creating new output .root file'
        output_file = TFile.Open(output_file_name, 'RECREATE')

        # CloneTree(0) - copy the tree structure but none of the actual entries
        output_tree = data_tree.CloneTree(0)
        output_tree.SetName("output_tree")

        # Keep all branches active (disabling unneeded branches here would speed up the loop)
        output_tree.SetBranchStatus('*', 1)

        # Append DNN Branches to new TTree
        # Add branches for values from highest output node and sentinel values for other nodes i.e. 'categorised'
        eval_ttHnode_cat = array('f', [0.])
        eval_Othernode_cat = array('f', [0.])
        eval_ttWnode_cat = array('f', [0.])
        eval_tHQnode_cat = array('f', [0.])
        ttH_branch_cat = output_tree.Branch('DNN_ttHnode_cat',
                                            eval_ttHnode_cat,
                                            'DNN_ttHnode_cat/F')
        Other_branch_cat = output_tree.Branch('DNN_Othernode_cat',
                                              eval_Othernode_cat,
                                              'DNN_Othernode_cat/F')
        ttW_branch_cat = output_tree.Branch('DNN_ttWnode_cat',
                                            eval_ttWnode_cat,
                                            'DNN_ttWnode_cat/F')
        tHQ_branch_cat = output_tree.Branch('DNN_tHQnode_cat',
                                            eval_tHQnode_cat,
                                            'DNN_tHQnode_cat/F')

        # un-categorised DNN variables
        eval_ttHnode_all = array('f', [0.])
        eval_Othernode_all = array('f', [0.])
        eval_ttWnode_all = array('f', [0.])
        eval_tHQnode_all = array('f', [0.])
        ttH_branch_all = output_tree.Branch('DNN_ttHnode_all',
                                            eval_ttHnode_all,
                                            'DNN_ttHnode_all/F')
        Other_branch_all = output_tree.Branch('DNN_othernode_all',
                                              eval_Othernode_all,
                                              'DNN_Othernode_all/F')
        ttW_branch_all = output_tree.Branch('DNN_ttWnode_all',
                                            eval_ttWnode_all,
                                            'DNN_ttWnode_all/F')
        tHQ_branch_all = output_tree.Branch('DNN_tHQnode_all',
                                            eval_tHQnode_all,
                                            'DNN_tHQnode_all/F')

        # Now add branches containing the max value for each event and the category for each event
        eval_maxval = array('f', [0.])
        DNNCat = array('f', [0.])
        DNNmaxval_branch = output_tree.Branch('DNN_maxval', eval_maxval,
                                              'DNN_maxval/F')
        DNNCat_branch = output_tree.Branch('DNNCat', DNNCat, 'DNNCat/F')

        sample_name = process
        histoname_type = 'Category'

        histo_ttHclassified_events_title = 'ttH %s Events: %s Sample' % (
            histoname_type, sample_name)
        histo_ttHclassified_events_name = 'histo_ttH%s_events_%s' % (
            histoname_type, sample_name)
        histo_ttHclassified_events = ROOT.TH1D(
            histo_ttHclassified_events_name, histo_ttHclassified_events_title,
            200, 0, 1.)
        histo_Otherclassified_events_title = 'Other %s Events: %s Sample' % (
            histoname_type, sample_name)
        histo_Otherclassified_events_name = 'histo_Other%s_events_%s' % (
            histoname_type, sample_name)
        histo_Otherclassified_events = ROOT.TH1D(
            histo_Otherclassified_events_name,
            histo_Otherclassified_events_title, 200, 0, 1.)
        histo_ttWclassified_events_title = 'ttW %s Events: %s Sample' % (
            histoname_type, sample_name)
        histo_ttWclassified_events_name = 'histo_ttW%s_events_%s' % (
            histoname_type, sample_name)
        histo_ttWclassified_events = ROOT.TH1D(
            histo_ttWclassified_events_name, histo_ttWclassified_events_title,
            200, 0, 1.)
        histo_tHQclassified_events_title = 'tHQ %s Events: %s Sample' % (
            histoname_type, sample_name)
        histo_tHQclassified_events_name = 'histo_tHQ%s_events_%s' % (
            histoname_type, sample_name)
        histo_tHQclassified_events = ROOT.TH1D(
            histo_tHQclassified_events_name, histo_tHQclassified_events_title,
            200, 0, 1.)

        temp_percentage_done = 0
        uniqueEventID = []

        ######## Loop over ttree #########

        print '<unit_test_evaluation> data_tree # Entries: ', data_tree.GetEntries(
        )
        if output_tree.GetEntries() != 0:
            print '<unit_test_evaluation> output_tree # Entries: ', output_tree.GetEntries(
            )
            print 'This tree should be empty at this point!!!!! check cloning correctly'

        for i in range(data_tree.GetEntries()):
            eval_ttHnode_cat[0] = -1.
            eval_Othernode_cat[0] = -1.
            eval_ttWnode_cat[0] = -1.
            eval_tHQnode_cat[0] = -1.
            eval_ttHnode_all[0] = -1.
            eval_Othernode_all[0] = -1.
            eval_ttWnode_all[0] = -1.
            eval_tHQnode_all[0] = -1.
            eval_maxval[0] = -1.
            DNNCat[0] = -1.

            percentage_done = int(100 * float(i) /
                                  float(data_tree.GetEntries()))
            if percentage_done % 10 == 0:
                if percentage_done != temp_percentage_done:
                    print percentage_done
                    temp_percentage_done = percentage_done
            data_tree.GetEntry(i)

            # Read the per-event quantities for this entry directly from the input tree
            Eventnum_ = data_tree.nEvent
            EventWeight_ = data_tree.EventWeight
            xsec_rwgt_ = data_tree.xsec_rwgt
            n_presel_jet = data_tree.n_presel_jet
            is_tH_like_and_not_ttH_like = data_tree.is_tH_like_and_not_ttH_like

            if (is_tH_like_and_not_ttH_like == 0 or is_tH_like_and_not_ttH_like
                    == 1):  #and n_presel_jet>=3:
                pass_selection = 1
            else:
                pass_selection = 0

            if selection == 'tH':
                if pass_selection == 0:
                    continue
            else:
                print 'NO selection applied!'
            '''if Eventnum_ in uniqueEventID:
                print 'Eventnum_ : %s already exists ' % Eventnum_
                continue
            else:
                uniqueEventID.append(Eventnum_)
            '''

            if 'ttH_' in process:
                true_process.append(0)
            elif 'Fakes' in process or 'Flips' in process:
                true_process.append(1)
            elif 'ttW' in process:
                true_process.append(2)
            elif 'tHq' in process:
                true_process.append(3)
            else:
                true_process.append(4)

            EventWeights_.append(EventWeight_)

            evaluated_node_values = []
            #print 'Eventnum_: ', Eventnum_
            #for key,var in variable_list:
            #    print 'key: %s, value: %s' % (key , data_tree.GetLeaf(key).GetValue())
            #print 'True process: ', true_process
            # Get the value for event on each of the DNN nodes
            evaluated_node_values = DNN_applier.evaluate_model(
                eventnum_resultsprob_dict, Eventnum_)
            #print 'evaluated_node_values: ', evaluated_node_values
            # Get the maximum output value
            maxval = max(evaluated_node_values)
            # Find the index of the maximum value (i.e. the node classification)
            event_classification = evaluated_node_values.index(maxval)
            #print 'event_classification: ', event_classification
            # Append classification value to list of predictions
            model1_pred_process.append(event_classification)
            #print 'model1_pred_process: ', model1_pred_process

            eval_ttHnode_all[0] = evaluated_node_values[0]
            eval_Othernode_all[0] = evaluated_node_values[1]
            eval_ttWnode_all[0] = evaluated_node_values[2]
            eval_tHQnode_all[0] = evaluated_node_values[3]

            DNNCat[0] = float(event_classification)
            eval_maxval[0] = evaluated_node_values[event_classification]
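            # Fill only the histogram and per-category branch of the winning node;
            # the other per-category branches keep the sentinel value -1.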

            if event_classification == 0:
                histo_ttHclassified_events.Fill(evaluated_node_values[0],
                                                EventWeight_)
                eval_ttHnode_cat[0] = evaluated_node_values[0]
                eval_Othernode_cat[0] = -1.
                eval_ttWnode_cat[0] = -1.
                eval_tHQnode_cat[0] = -1.
            elif event_classification == 1:
                histo_Otherclassified_events.Fill(evaluated_node_values[1],
                                                  EventWeight_)
                eval_ttHnode_cat[0] = -1.
                eval_Othernode_cat[0] = evaluated_node_values[1]
                eval_ttWnode_cat[0] = -1.
                eval_tHQnode_cat[0] = -1.
            elif event_classification == 2:
                histo_ttWclassified_events.Fill(evaluated_node_values[2],
                                                EventWeight_)
                eval_ttHnode_cat[0] = -1.
                eval_Othernode_cat[0] = -1.
                eval_ttWnode_cat[0] = evaluated_node_values[2]
                eval_tHQnode_cat[0] = -1.
            elif event_classification == 3:
                histo_tHQclassified_events.Fill(evaluated_node_values[3],
                                                EventWeight_)
                eval_ttHnode_cat[0] = -1.
                eval_Othernode_cat[0] = -1.
                eval_ttWnode_cat[0] = -1.
                eval_tHQnode_cat[0] = evaluated_node_values[3]
            else:
                histo_ttHclassified_events.Fill(-1., EventWeight_)
                histo_Otherclassified_events.Fill(-1., EventWeight_)
                histo_ttWclassified_events.Fill(-1., EventWeight_)
                histo_tHQclassified_events.Fill(-1., EventWeight_)
                eval_ttHnode_cat[0] = -1.
                eval_Othernode_cat[0] = -1.
                eval_ttWnode_cat[0] = -1.
                eval_tHQnode_cat[0] = -1.
                print '<unit_test_evaluation> NO classification for event!?'
                continue

            output_tree.Fill()

        print '<unit_test_evaluation> Clear # event - DNN result dictionary'
        eventnum_resultsprob_dict.clear()
        print '<unit_test_evaluation> Write output file : %s ' % (
            output_file_name)
        output_file.Write()
        print '<unit_test_evaluation> Close output file'
        output_file.Close()
        print '<unit_test_evaluation> Close input file'
        data_file.Close()

    plots_dir = os.path.join(samples_final_path_dir, 'plots/')
    Plotter.plots_directory = plots_dir

    Plotter.conf_matrix(true_process, model1_pred_process, EventWeights_, '')
    Plotter.save_plots(dir=plots_dir,
                       filename='yields_non_norm_confusion_matrix_APPL.png')
    Plotter.conf_matrix(true_process, model1_pred_process, EventWeights_,
                        'index')
    Plotter.save_plots(dir=plots_dir,
                       filename='yields_norm_confusion_matrix_APPL.png')

    model1_probs_ = np.array(model1_probs_)
    Plotter.ROC_sklearn(true_process, model1_probs_, true_process,
                        model1_probs_, 0, 'ttHnode')
    Plotter.ROC_sklearn(true_process, model1_probs_, true_process,
                        model1_probs_, 1, 'Othernode')
    Plotter.ROC_sklearn(true_process, model1_probs_, true_process,
                        model1_probs_, 2, 'ttWnode')
    Plotter.ROC_sklearn(true_process, model1_probs_, true_process,
                        model1_probs_, 3, 'tHQnode')

    exit(0)
Exemplo n.º 5
0
def main():
    print('Using Keras version: ', keras.__version__)

    usage = 'usage: %prog [options]'
    parser = argparse.ArgumentParser(usage)
    parser.add_argument(
        '-t',
        '--train_model',
        dest='train_model',
        help=
        'Option to train model or simply make diagnostic plots (0=False, 1=True)',
        default=1,
        type=int)
    parser.add_argument('-s',
                        '--suff',
                        dest='suffix',
                        help='Option to choose suffix for training',
                        default='',
                        type=str)
    parser.add_argument('-p',
                        '--para',
                        dest='hyp_param_scan',
                        help='Option to run hyper-parameter scan',
                        default=0,
                        type=int)
    parser.add_argument(
        '-i',
        '--inputs_file_path',
        dest='inputs_file_path',
        help=
        'Path to directory containing directories \'Bkgs\' and \'Signal\' which contain background and signal ntuples respectively.',
        default='',
        type=str)
    args = parser.parse_args()
    do_model_fit = args.train_model
    suffix = args.suffix

    # Create instance of the input files directory
    #inputs_file_path = 'HHWWgg_DataSignalMCnTuples/2017/'
    inputs_file_path = '/eos/user/b/bmarzocc/HHWWgg/January_2021_Production/2017/'

    hyp_param_scan = args.hyp_param_scan
    # Set model hyper-parameters
    weights = 'BalanceYields'  # 'BalanceYields' or 'BalanceNonWeighted'
    optimizer = 'Nadam'
    validation_split = 0.1
    # hyper-parameter scan results
    if weights == 'BalanceNonWeighted':
        learn_rate = 0.0005
        epochs = 200
        batch_size = 200
    if weights == 'BalanceYields':
        learn_rate = 0.0001
        epochs = 200
        batch_size = 32
        #epochs = 10
        #batch_size=200

    # Create instance of output directory where all results are saved.
    output_directory = 'HHWWyyDNN_binary_%s_%s/' % (suffix, weights)
    check_dir(output_directory)
    hyperparam_file = os.path.join(output_directory,
                                   'additional_model_hyper_params.txt')
    additional_hyperparams = open(hyperparam_file, 'w')
    additional_hyperparams.write("optimizer: " + optimizer + "\n")
    additional_hyperparams.write("learn_rate: " + str(learn_rate) + "\n")
    additional_hyperparams.write("epochs: " + str(epochs) + "\n")
    additional_hyperparams.write("validation_split: " + str(validation_split) +
                                 "\n")
    additional_hyperparams.write("weights: " + weights + "\n")
    # Create plots subdirectory
    plots_dir = os.path.join(output_directory, 'plots/')
    input_var_jsonFile = open('input_variables.json', 'r')
    selection_criteria = '( (Leading_Photon_pt/CMS_hgg_mass) > 1/3 && (Subleading_Photon_pt/CMS_hgg_mass) > 1/4 )'

    # Load Variables from .json
    variable_list = json.load(input_var_jsonFile, encoding="utf-8").items()

    # Create list of headers for dataset .csv
    column_headers = []
    for key, var in variable_list:
        column_headers.append(key)
    column_headers.append('weight')
    column_headers.append('unweighted')
    column_headers.append('target')
    column_headers.append('key')
    column_headers.append('classweight')
    column_headers.append('process_ID')

    # Load ttree into .csv including all variables listed in column_headers
    print('<train-DNN> Input file path: ', inputs_file_path)
    outputdataframe_name = '%s/output_dataframe.csv' % (output_directory)
    if os.path.isfile(outputdataframe_name):
        data = pandas.read_csv(outputdataframe_name)
        print('<train-DNN> Loading data .csv from: %s . . . . ' %
              (outputdataframe_name))
    else:
        print('<train-DNN> Creating new data .csv @: %s . . . . ' %
              (inputs_file_path))
        data = load_data(inputs_file_path, column_headers, selection_criteria)
        # Change sentinel value to speed up training.
        data = data.mask(data < -25., -9.)
        #data = data.replace(to_replace=-99.,value=-9.0)
        data.to_csv(outputdataframe_name, index=False)
        data = pandas.read_csv(outputdataframe_name)

    print('<main> data columns: ', (data.columns.values.tolist()))
    n = len(data)
    nHH = len(data.iloc[data.target.values == 1])
    nbckg = len(data.iloc[data.target.values == 0])
    print("Total (train+validation) length of HH = %i, bckg = %i" %
          (nHH, nbckg))

    # Make instance of plotter tool
    Plotter = plotter()
    # Create statistically independent training/testing data
    traindataset, valdataset = train_test_split(data, test_size=0.1)
    valdataset.to_csv((output_directory + 'valid_dataset.csv'), index=False)

    print('<train-DNN> Training dataset shape: ', traindataset.shape)
    print('<train-DNN> Validation dataset shape: ', valdataset.shape)

    # Event weights
    weights_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH',
                                      'weight']
    weights_for_Hgg = traindataset.loc[traindataset['process_ID'] == 'Hgg',
                                       'weight']
    weights_for_DiPhoton = traindataset.loc[traindataset['process_ID'] ==
                                            'DiPhoton', 'weight']
    weights_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet',
                                        'weight']
    weights_for_QCD = traindataset.loc[traindataset['process_ID'] == 'QCD',
                                       'weight']
    weights_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY',
                                      'weight']
    weights_for_TTGsJets = traindataset.loc[traindataset['process_ID'] ==
                                            'TTGsJets', 'weight']
    weights_for_WGsJets = traindataset.loc[traindataset['process_ID'] ==
                                           'WGsJets', 'weight']
    weights_for_WW = traindataset.loc[traindataset['process_ID'] == 'WW',
                                      'weight']

    HHsum_weighted = sum(weights_for_HH)
    Hggsum_weighted = sum(weights_for_Hgg)
    DiPhotonsum_weighted = sum(weights_for_DiPhoton)
    GJetsum_weighted = sum(weights_for_GJet)
    QCDsum_weighted = sum(weights_for_QCD)
    DYsum_weighted = sum(weights_for_DY)
    TTGsJetssum_weighted = sum(weights_for_TTGsJets)
    WGsJetssum_weighted = sum(weights_for_WGsJets)
    WWsum_weighted = sum(weights_for_WW)
    bckgsum_weighted = Hggsum_weighted + DiPhotonsum_weighted + GJetsum_weighted + QCDsum_weighted + DYsum_weighted + TTGsJetssum_weighted + WGsJetssum_weighted + WWsum_weighted
    #bckgsum_weighted = DiPhotonsum_weighted + GJetsum_weighted + QCDsum_weighted + DYsum_weighted + TTGsJetssum_weighted + WGsJetssum_weighted + WWsum_weighted

    nevents_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH',
                                      'unweighted']
    nevents_for_Hgg = traindataset.loc[traindataset['process_ID'] == 'Hgg',
                                       'unweighted']
    nevents_for_DiPhoton = traindataset.loc[traindataset['process_ID'] ==
                                            'DiPhoton', 'unweighted']
    nevents_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet',
                                        'unweighted']
    nevents_for_QCD = traindataset.loc[traindataset['process_ID'] == 'QCD',
                                       'unweighted']
    nevents_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY',
                                      'unweighted']
    nevents_for_TTGsJets = traindataset.loc[traindataset['process_ID'] ==
                                            'TTGsJets', 'unweighted']
    nevents_for_WGsJets = traindataset.loc[traindataset['process_ID'] ==
                                           'WGsJets', 'unweighted']
    nevents_for_WW = traindataset.loc[traindataset['process_ID'] == 'WW',
                                      'unweighted']

    HHsum_unweighted = sum(nevents_for_HH)
    Hggsum_unweighted = sum(nevents_for_Hgg)
    DiPhotonsum_unweighted = sum(nevents_for_DiPhoton)
    GJetsum_unweighted = sum(nevents_for_GJet)
    QCDsum_unweighted = sum(nevents_for_QCD)
    DYsum_unweighted = sum(nevents_for_DY)
    TTGsJetssum_unweighted = sum(nevents_for_TTGsJets)
    WGsJetssum_unweighted = sum(nevents_for_WGsJets)
    WWsum_unweighted = sum(nevents_for_WW)
    bckgsum_unweighted = Hggsum_unweighted + DiPhotonsum_unweighted + GJetsum_unweighted + QCDsum_unweighted + DYsum_unweighted + TTGsJetssum_unweighted + WGsJetssum_unweighted + WWsum_unweighted
    #bckgsum_unweighted = DiPhotonsum_unweighted + GJetsum_unweighted + QCDsum_unweighted + DYsum_unweighted + TTGsJetssum_unweighted + WGsJetssum_unweighted + WWsum_unweighted

    HHsum_weighted = 2 * HHsum_weighted
    HHsum_unweighted = 2 * HHsum_unweighted

    if weights == 'BalanceYields':
        print('HHsum_weighted= ', HHsum_weighted)
        print('Hggsum_weighted= ', Hggsum_weighted)
        print('DiPhotonsum_weighted= ', DiPhotonsum_weighted)
        print('GJetsum_weighted= ', GJetsum_weighted)
        print('QCDsum_weighted= ', QCDsum_weighted)
        print('DYsum_weighted= ', DYsum_weighted)
        print('TTGsJetssum_weighted= ', TTGsJetssum_weighted)
        print('WGsJetssum_weighted= ', WGsJetssum_weighted)
        print('WWsum_weighted= ', WWsum_weighted)
        print('bckgsum_weighted= ', bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'HH',
                         ['classweight']] = HHsum_unweighted / HHsum_weighted
        traindataset.loc[traindataset['process_ID'] == 'Hgg',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'DiPhoton',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'GJet',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'QCD',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'DY',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGsJets',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'WGsJets',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'WW',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
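        # 'BalanceYields': the per-event training weight later becomes classweight * weight,
        # so signal and the combined backgrounds enter the loss with comparable total weight
        # rather than their raw cross-section-weighted yields.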

    if weights == 'BalanceNonWeighted':
        print('HHsum_unweighted= ', HHsum_unweighted)
        print('Hggsum_unweighted= ', Hggsum_unweighted)
        print('DiPhotonsum_unweighted= ', DiPhotonsum_unweighted)
        print('GJetsum_unweighted= ', GJetsum_unweighted)
        print('QCDsum_unweighted= ', QCDsum_unweighted)
        print('DYsum_unweighted= ', DYsum_unweighted)
        print('TTGsJetssum_unweighted= ', TTGsJetssum_unweighted)
        print('WGsJetssum_unweighted= ', WGsJetssum_unweighted)
        print('WWsum_unweighted= ', WWsum_unweighted)
        print('bckgsum_unweighted= ', bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'HH',
                         ['classweight']] = 1.
        traindataset.loc[traindataset['process_ID'] == 'Hgg',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'DiPhoton',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'GJet',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'QCD',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'DY',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGsJets',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'WGsJets',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'WW',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
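        # 'BalanceNonWeighted': physics event weights are ignored; backgrounds are scaled by
        # the ratio of unweighted signal to background counts so the two classes contribute
        # comparable numbers of effective events.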

    # Remove column headers that aren't input variables
    training_columns = column_headers[:-6]
    print('<train-DNN> Training features: ', training_columns)

    column_order_txt = '%s/column_order.txt' % (output_directory)
    column_order_file = open(column_order_txt, "wb")
    for tc_i in training_columns:
        line = tc_i + "\n"
        pickle.dump(str(line), column_order_file)
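    # NB: despite the .txt extension the column order is written as a stream of pickled
    # strings, and the file handle is never explicitly closed here. A minimal sketch of how
    # it could be read back (assuming the same one-pickle-per-column layout):
    #   with open(column_order_txt, 'rb') as f_in:
    #       cols = []
    #       while True:
    #           try:
    #               cols.append(pickle.load(f_in).strip())
    #           except EOFError:
    #               break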

    num_variables = len(training_columns)

    # Extract training and testing data
    X_train = traindataset[training_columns].values
    X_test = valdataset[training_columns].values

    # Extract labels data
    Y_train = traindataset['target'].values
    Y_test = valdataset['target'].values

    # Create dataframe containing input features only (for correlation matrix)
    train_df = data.iloc[:traindataset.shape[0]]

    # Event weights if wanted
    train_weights = traindataset['weight'].values
    test_weights = valdataset['weight'].values

    # Weights applied during training.
    if weights == 'BalanceYields':
        trainingweights = traindataset.loc[:,
                                           'classweight'] * traindataset.loc[:,
                                                                             'weight']
    if weights == 'BalanceNonWeighted':
        trainingweights = traindataset.loc[:, 'classweight']
    trainingweights = np.array(trainingweights)

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot'
    Plotter.correlation_matrix(train_df)
    Plotter.save_plots(dir=plots_dir,
                       filename=correlation_plot_file_name + '.png')
    Plotter.save_plots(dir=plots_dir,
                       filename=correlation_plot_file_name + '.pdf')

    # Fit label encoder to Y_train
    newencoder = LabelEncoder()
    newencoder.fit(Y_train)
    # Transform to encoded array
    encoded_Y = newencoder.transform(Y_train)
    encoded_Y_test = newencoder.transform(Y_test)

    if do_model_fit == 1:
        print('<train-BinaryDNN> Training new model . . . . ')
        histories = []
        labels = []

        if hyp_param_scan == 1:
            print('Begin at local time: ', time.localtime())
            hyp_param_scan_name = 'hyp_param_scan_results.txt'
            hyp_param_scan_results = open(hyp_param_scan_name, 'a')
            time_str = str(time.localtime()) + '\n'
            hyp_param_scan_results.write(time_str)
            hyp_param_scan_results.write(weights)
            learn_rates = [0.00001, 0.0001]
            epochs = [150, 200]
            batch_size = [400, 500]
            param_grid = dict(learn_rate=learn_rates,
                              epochs=epochs,
                              batch_size=batch_size)
            model = KerasClassifier(build_fn=gscv_model, verbose=0)
            grid = GridSearchCV(estimator=model,
                                param_grid=param_grid,
                                n_jobs=-1)
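            # Exhaustive scan over learn_rate / epochs / batch_size: GridSearchCV
            # cross-validates every combination and n_jobs=-1 parallelises the fits
            # over all available cores.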
            grid_result = grid.fit(X_train,
                                   Y_train,
                                   shuffle=True,
                                   sample_weight=trainingweights)
            print("Best score: %f , best params: %s" %
                  (grid_result.best_score_, grid_result.best_params_))
            hyp_param_scan_results.write(
                "Best score: %f , best params: %s\n" %
                (grid_result.best_score_, grid_result.best_params_))
            means = grid_result.cv_results_['mean_test_score']
            stds = grid_result.cv_results_['std_test_score']
            params = grid_result.cv_results_['params']
            for mean, stdev, param in zip(means, stds, params):
                print("Mean (stdev) test score: %f (%f) with parameters: %r" %
                      (mean, stdev, param))
                hyp_param_scan_results.write(
                    "Mean (stdev) test score: %f (%f) with parameters: %r\n" %
                    (mean, stdev, param))
            exit()
        else:
            # Define model for analysis
            early_stopping_monitor = EarlyStopping(patience=100,
                                                   monitor='val_loss',
                                                   min_delta=0.01,
                                                   verbose=1)
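            # Training halts once val_loss has failed to improve by at least
            # min_delta (0.01) for 100 consecutive epochs.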
            #model = baseline_model(num_variables, learn_rate=learn_rate)
            model = new_model(num_variables, learn_rate=learn_rate)

            # Fit the model
            # Batch size = number of examples processed before each weight update (larger batches mean fewer updates per epoch)
            # Epoch = one full pass over the training data (a natural point for logging and evaluation)
            #class_weights = np.array(class_weight.compute_class_weight('balanced',np.unique(Y_train),Y_train))
            history = model.fit(X_train,
                                Y_train,
                                validation_split=validation_split,
                                epochs=epochs,
                                batch_size=batch_size,
                                verbose=1,
                                shuffle=True,
                                sample_weight=trainingweights,
                                callbacks=[early_stopping_monitor])
            histories.append(history)
            labels.append(optimizer)
            # Make plot of loss function evolution
            Plotter.plot_training_progress_acc(histories, labels)
            acc_progress_filename = 'DNN_acc_wrt_epoch'
            Plotter.save_plots(dir=plots_dir,
                               filename=acc_progress_filename + '.png')
            Plotter.save_plots(dir=plots_dir,
                               filename=acc_progress_filename + '.pdf')

            Plotter.history_plot(history, label='loss')
            Plotter.save_plots(dir=plots_dir, filename='history_loss.png')
            Plotter.save_plots(dir=plots_dir, filename='history_loss.pdf')
    else:
        model_name = os.path.join(output_directory, 'model.h5')
        model = load_trained_model(model_name)

    # Node probabilities for training sample events
    result_probs = model.predict(np.array(X_train))
    result_classes = model.predict_classes(np.array(X_train))

    # Node probabilities for testing sample events
    result_probs_test = model.predict(np.array(X_test))
    result_classes_test = model.predict_classes(np.array(X_test))

    # Store model in file
    model_output_name = os.path.join(output_directory, 'model.h5')
    model.save(model_output_name)
    weights_output_name = os.path.join(output_directory, 'model_weights.h5')
    model.save_weights(weights_output_name)
    model_json = model.to_json()
    model_json_name = os.path.join(output_directory, 'model_serialised.json')
    with open(model_json_name, 'w') as json_file:
        json_file.write(model_json)
    model.summary()
    model_schematic_name = os.path.join(output_directory,
                                        'model_schematic.png')
    #plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True)

    print('================')
    print('Training event labels: ', len(Y_train))
    print('Training event probs', len(result_probs))
    print('Training event weights: ', len(train_weights))
    print('Testing events: ', len(Y_test))
    print('Testing event probs', len(result_probs_test))
    print('Testing event weights: ', len(test_weights))
    print('================')

    # Initialise output directory.
    Plotter.plots_directory = plots_dir
    Plotter.output_directory = output_directory

    Plotter.ROC(model, X_test, Y_test, X_train, Y_train)
    Plotter.save_plots(dir=plots_dir, filename='ROC.png')
    Plotter.save_plots(dir=plots_dir, filename='ROC.pdf')
Exemplo n.º 6
0
def main():
    print('Using Keras version: ', keras.__version__)

    usage = 'usage: %prog [options]'
    parser = argparse.ArgumentParser(usage)
    parser.add_argument('-t', '--train_model', dest='train_model', help='Option to train model or simply make diagnostic plots (0=False, 1=True)', default=1, type=int)
    parser.add_argument('-s', '--suff', dest='suffix', help='Option to choose suffix for training', default='', type=str)
    parser.add_argument('-p', '--para', dest='hyp_param_scan', help='Option to run hyper-parameter scan', default=0, type=int)
    parser.add_argument('-i', '--inputs_file_path', dest='inputs_file_path', help='Path to directory containing directories \'Bkgs\' and \'Signal\' which contain background and signal ntuples respectively.', default='', type=str)
    args = parser.parse_args()
    do_model_fit = args.train_model
    suffix = args.suffix

    # Create instance of the input files directory
    inputs_file_path = 'HHWWgg_DataSignalMCnTuples/2017/'

    hyp_param_scan=args.hyp_param_scan
    # Set model hyper-parameters
    weights='BalanceYields'# 'BalanceYields' or 'BalanceNonWeighted'
    optimizer = 'Nadam'
    validation_split=0.1
    # hyper-parameter scan results
    if weights == 'BalanceNonWeighted':
        learn_rate = 0.0005
        epochs = 200
        batch_size=200
    if weights == 'BalanceYields':
        learn_rate = 0.0001
        epochs = 200
        batch_size=400

    # Create instance of output directory where all results are saved.
    output_directory = 'HHWWyyDNN_binary_%s_%s/' % (suffix,weights)
    check_dir(output_directory)
    hyperparam_file = os.path.join(output_directory,'additional_model_hyper_params.txt')
    additional_hyperparams = open(hyperparam_file,'w')
    additional_hyperparams.write("optimizer: "+optimizer+"\n")
    additional_hyperparams.write("learn_rate: "+str(learn_rate)+"\n")
    additional_hyperparams.write("epochs: "+str(epochs)+"\n")
    additional_hyperparams.write("validation_split: "+str(validation_split)+"\n")
    additional_hyperparams.write("weights: "+weights+"\n")
    # Create plots subdirectory
    plots_dir = os.path.join(output_directory,'plots/')
    input_var_jsonFile = open('input_variables.json','r')
    selection_criteria = '( ((Leading_Photon_pt/CMS_hgg_mass) > 0.35) && ((Subleading_Photon_pt/CMS_hgg_mass) > 0.25) && passbVeto==1 && ExOneLep==1 && N_goodJets>=1)'
    # selection_criteria = '(AtLeast4GoodJets0Lep==1)'
    # selection_criteria = '(passPhotonSels==1 && passbVeto==1 && ExOneLep==1 && goodJets==1)'
    #selection_criteria = '( ((Leading_Photon_pt/CMS_hgg_mass) > 0.35) && ((Subleading_Photon_pt/CMS_hgg_mass) > 0.25) && passbVeto==1 && ExOneLep==1 && N_goodJets>=1)'

    # Load Variables from .json
    variable_list = json.load(input_var_jsonFile,encoding="utf-8").items()

    # Create list of headers for dataset .csv
    column_headers = []
    for key,var in variable_list:
        column_headers.append(key)
    column_headers.append('weight')
    column_headers.append('unweighted')
    column_headers.append('target')
    column_headers.append('key')
    column_headers.append('classweight')
    column_headers.append('process_ID')

    # Create instance of the input files directory
    #inputs_file_path = '/afs/cern.ch/work/a/atishelm/public/ForJosh/2017_DataMC_ntuples_moreVars'
    inputs_file_path = '/eos/user/r/rasharma/post_doc_ihep/double-higgs/ntuples/September29/MVANtuples'
    #inputs_file_path = '/eos/user/a/atishelm/ntuples/HHWWgg_DataSignalMCnTuples/PromptPromptApplied/'
    #inputs_file_path = 'PromptPromptApplied/'

    # Load ttree into .csv including all variables listed in column_headers
    print('<train-DNN> Input file path: ', inputs_file_path)
    outputdataframe_name = '%s/output_dataframe.csv' %(output_directory)
    if os.path.isfile(outputdataframe_name):
        data = pandas.read_csv(outputdataframe_name)
        print('<train-DNN> Loading data .csv from: %s . . . . ' % (outputdataframe_name))
    else:
        print('<train-DNN> Creating new data .csv @: %s . . . . ' % (inputs_file_path))
        data = load_data(inputs_file_path,column_headers,selection_criteria)
        # Change sentinel value to speed up training.
        data = data.replace(to_replace=-999.000000,value=-9.0)
        data.to_csv(outputdataframe_name, index=False)
        data = pandas.read_csv(outputdataframe_name)

    print('<main> data columns: ', (data.columns.values.tolist()))
    n = len(data)
    nHH = len(data.iloc[data.target.values == 1])
    nbckg = len(data.iloc[data.target.values == 0])
    print("Total (train+validation) length of HH = %i, bckg = %i" % (nHH, nbckg))

    # Make instance of plotter tool
    Plotter = plotter()
    # Create statistically independent training/testing data
    traindataset, valdataset = train_test_split(data, test_size=0.1)
    valdataset.to_csv((output_directory+'valid_dataset.csv'), index=False)

    print('<train-DNN> Training dataset shape: ', traindataset.shape)
    print('<train-DNN> Validation dataset shape: ', valdataset.shape)


    # Event weights
    weights_for_HH = traindataset.loc[traindataset['process_ID']=='HH', 'weight']
    weights_for_DiPhoton = traindataset.loc[traindataset['process_ID']=='DiPhoton', 'weight']
    weights_for_GJet = traindataset.loc[traindataset['process_ID']=='GJet', 'weight']
    weights_for_DY = traindataset.loc[traindataset['process_ID']=='DY', 'weight']
    weights_for_TTGG = traindataset.loc[traindataset['process_ID']=='TTGG', 'weight']
    weights_for_TTGJets = traindataset.loc[traindataset['process_ID']=='TTGJets', 'weight']
    weights_for_TTJets = traindataset.loc[traindataset['process_ID']=='TTJets', 'weight']
    weights_for_WJets = traindataset.loc[traindataset['process_ID']=='WJets', 'weight']
    weights_for_ttH = traindataset.loc[traindataset['process_ID']=='ttH', 'weight']

    HHsum_weighted= sum(weights_for_HH)
    GJetsum_weighted= sum(weights_for_GJet)
    DiPhotonsum_weighted= sum(weights_for_DiPhoton)
    TTGGsum_weighted= sum(weights_for_TTGG)
    TTGJetssum_weighted= sum(weights_for_TTGJets)
    TTJetssum_weighted= sum(weights_for_TTJets)
    WJetssum_weighted= sum(weights_for_WJets)
    ttHsum_weighted= sum(weights_for_ttH)
    DYsum_weighted= sum(weights_for_DY)
    #bckgsum_weighted = DiPhotonsum_weighted+WJetssum_weighted+ttHsum_weighted
    bckgsum_weighted = DiPhotonsum_weighted+WJetssum_weighted

    nevents_for_HH = traindataset.loc[traindataset['process_ID']=='HH', 'unweighted']
    nevents_for_DiPhoton = traindataset.loc[traindataset['process_ID']=='DiPhoton', 'unweighted']
    nevents_for_GJet = traindataset.loc[traindataset['process_ID']=='GJet', 'unweighted']
    nevents_for_DY = traindataset.loc[traindataset['process_ID']=='DY', 'unweighted']
    nevents_for_TTGG = traindataset.loc[traindataset['process_ID']=='TTGG', 'unweighted']
    nevents_for_TTGJets = traindataset.loc[traindataset['process_ID']=='TTGJets', 'unweighted']
    nevents_for_TTJets = traindataset.loc[traindataset['process_ID']=='TTJets', 'unweighted']
    nevents_for_WJets = traindataset.loc[traindataset['process_ID']=='WJets', 'unweighted']
    nevents_for_ttH = traindataset.loc[traindataset['process_ID']=='ttH', 'unweighted']

    HHsum_unweighted= sum(nevents_for_HH)
    GJetsum_unweighted= sum(nevents_for_GJet)
    DiPhotonsum_unweighted= sum(nevents_for_DiPhoton)
    TTGGsum_unweighted= sum(nevents_for_TTGG)
    TTGJetssum_unweighted= sum(nevents_for_TTGJets)
    TTJetssum_unweighted= sum(nevents_for_TTJets)
    WJetssum_unweighted= sum(nevents_for_WJets)
    ttHsum_unweighted= sum(nevents_for_ttH)
    DYsum_unweighted= sum(nevents_for_DY)

    #bckgsum_unweighted = DiPhotonsum_unweighted+WJetssum_unweighted+ttHsum_unweighted
    bckgsum_unweighted = DiPhotonsum_unweighted+WJetssum_unweighted


    if weights=='BalanceYields':
        print('HHsum_weighted= ' , HHsum_weighted)
        print('ttHsum_weighted= ' , ttHsum_weighted)
        print('DiPhotonsum_weighted= ', DiPhotonsum_weighted)
        print('WJetssum_weighted= ', WJetssum_weighted)
        print('DYsum_weighted= ', DYsum_weighted)
        print('GJetsum_weighted= ', GJetsum_weighted)
        print('bckgsum_weighted= ', bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='HH', ['classweight']] = 1.
        traindataset.loc[traindataset['process_ID']=='GJet', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='DY', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='DiPhoton', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='WJets', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='TTGG', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='TTGJets', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='TTJets', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='ttH', ['classweight']] = (HHsum_weighted/bckgsum_weighted)

    if weights=='BalanceNonWeighted':
        print('HHsum_unweighted= ' , HHsum_unweighted)
        print('ttHsum_unweighted= ' , ttHsum_unweighted)
        print('DiPhotonsum_unweighted= ', DiPhotonsum_unweighted)
        print('WJetssum_unweighted= ', WJetssum_unweighted)
        print('DYsum_unweighted= ', DYsum_unweighted)
        print('GJetsum_unweighted= ', GJetsum_unweighted)
        print('bckgsum_unweighted= ', bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='HH', ['classweight']] = 1.
        traindataset.loc[traindataset['process_ID']=='GJet', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='DY', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='DiPhoton', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='WJets', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='TTGG', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='TTGJets', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='TTJets', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='ttH', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)

    # Remove column headers that aren't input variables
    training_columns = column_headers[:-6]
    print('<train-DNN> Training features: ', training_columns)

    column_order_txt = '%s/column_order.txt' %(output_directory)
    column_order_file = open(column_order_txt, "wb")
    for tc_i in training_columns:
        line = tc_i+"\n"
        pickle.dump(str(line), column_order_file)

    num_variables = len(training_columns)

    # Extract training and testing data
    X_train = traindataset[training_columns].values
    X_test = valdataset[training_columns].values

    # Extract labels data
    Y_train = traindataset['target'].values
    Y_test = valdataset['target'].values

    # Create dataframe containing input features only (for correlation matrix)
    train_df = data.iloc[:traindataset.shape[0]]

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot.png'
    Plotter.correlation_matrix(train_df)
    Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name)

    ####################################################################################
    # Weights applied during training. You will also need to update the class weights if
    # you are going to change the event weights applied. Introduce class weights and any
    # event weight you want to use here.
    #trainingweights = traindataset.loc[:,'classbalance']#*traindataset.loc[:,'weight']
    #trainingweights = np.array(trainingweights)

    # Temp hack to be able to change class weights without remaking dataframe
    #for inde in xrange(len(trainingweights)):
    #    newweight = 13243.0/6306.0
    #    trainingweights[inde]= newweight
    #print 'training event weight = ', trainingweights[0]

    # Event weight calculation so event weights can be applied correctly to diagnostic plots.
    # Use a separate list because class weights should not be applied in the plots.
    # Event weights if wanted
    train_weights = traindataset['weight'].values
    test_weights = valdataset['weight'].values

    # Weights applied during training.
    if weights=='BalanceYields':
        trainingweights = traindataset.loc[:,'classweight']*traindataset.loc[:,'weight']
    if weights=='BalanceNonWeighted':
        trainingweights = traindataset.loc[:,'classweight']
    trainingweights = np.array(trainingweights)

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot.pdf'
    Plotter.correlation_matrix(train_df)
    Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name)

    # Fit label encoder to Y_train
    newencoder = LabelEncoder()
    newencoder.fit(Y_train)
    # Transform to encoded array
    encoded_Y = newencoder.transform(Y_train)
    encoded_Y_test = newencoder.transform(Y_test)

    if do_model_fit == 1:
        print('<train-BinaryDNN> Training new model . . . . ')
        histories = []
        labels = []

        if hyp_param_scan == 1:
            print('Begin at local time: ', time.localtime())
            hyp_param_scan_name = 'hyp_param_scan_results.txt'
            hyp_param_scan_results = open(hyp_param_scan_name,'a')
            time_str = str(time.localtime())+'\n'
            hyp_param_scan_results.write(time_str)
            hyp_param_scan_results.write(weights)
            learn_rates=[0.00001, 0.0001]
            epochs = [150,200]
            batch_size = [400,500]
            param_grid = dict(learn_rate=learn_rates,epochs=epochs,batch_size=batch_size)
            model = KerasClassifier(build_fn=gscv_model,verbose=0)
            grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
            grid_result = grid.fit(X_train,Y_train,shuffle=True,sample_weight=trainingweights)
            print("Best score: %f , best params: %s" % (grid_result.best_score_,grid_result.best_params_))
            hyp_param_scan_results.write("Best score: %f , best params: %s\n" %(grid_result.best_score_,grid_result.best_params_))
            means = grid_result.cv_results_['mean_test_score']
            stds = grid_result.cv_results_['std_test_score']
            params = grid_result.cv_results_['params']
            for mean, stdev, param in zip(means, stds, params):
                print("Mean (stdev) test score: %f (%f) with parameters: %r" % (mean,stdev,param))
                hyp_param_scan_results.write("Mean (stdev) test score: %f (%f) with parameters: %r\n" % (mean,stdev,param))
            exit()
        else:
            # Define model for analysis
            early_stopping_monitor = EarlyStopping(patience=30, monitor='val_loss', verbose=1)
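            # Halt training once val_loss has not improved for 30 consecutive epochs.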
            model = baseline_model(num_variables, learn_rate=learn_rate)

            # Fit the model
            # Batch size = number of examples processed before each weight update (larger batches mean fewer updates per epoch)
            # Epoch = one full pass over the training data (a natural point for logging and evaluation)
            #class_weights = np.array(class_weight.compute_class_weight('balanced',np.unique(Y_train),Y_train))
            history = model.fit(X_train,Y_train,validation_split=validation_split,epochs=epochs,batch_size=batch_size,verbose=1,shuffle=True,sample_weight=trainingweights,callbacks=[early_stopping_monitor])
            histories.append(history)
            labels.append(optimizer)
            # Make plot of loss function evolution
            Plotter.plot_training_progress_acc(histories, labels)
            acc_progress_filename = 'DNN_acc_wrt_epoch.png'
            Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename)
    else:
        model_name = os.path.join(output_directory,'model.h5')
        model = load_trained_model(model_name)

    # Node probabilities for training sample events
    result_probs = model.predict(np.array(X_train))
    result_classes = model.predict_classes(np.array(X_train))

    # Node probabilities for testing sample events
    result_probs_test = model.predict(np.array(X_test))
    result_classes_test = model.predict_classes(np.array(X_test))

    # Store model in file
    model_output_name = os.path.join(output_directory,'model.h5')
    model.save(model_output_name)
    weights_output_name = os.path.join(output_directory,'model_weights.h5')
    model.save_weights(weights_output_name)
    model_json = model.to_json()
    model_json_name = os.path.join(output_directory,'model_serialised.json')
    with open(model_json_name,'w') as json_file:
        json_file.write(model_json)
    model.summary()
    model_schematic_name = os.path.join(output_directory,'model_schematic.eps')
    print "DEBUG: ",model_schematic_name
    plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True)
    # plot_model(model, to_file='model_schematic.eps', show_shapes=True, show_layer_names=True)

    # Initialise output directory.
    Plotter.plots_directory = plots_dir
    Plotter.output_directory = output_directory

    '''
    print('================')
    print('Training event labels: ', len(Y_train))
    print('Training event probs', len(result_probs))
    print('Training event weights: ', len(train_weights))
    print('Testing events: ', len(Y_test))
    print('Testing event probs', len(result_probs_test))
    print('Testing event weights: ', len(test_weights))
    print('================')
    '''

    # Make overfitting plots of output nodes
    Plotter.binary_overfitting(model, Y_train, Y_test, result_probs, result_probs_test, plots_dir, train_weights, test_weights)
    print "DEBUG: Y_train shape: ",Y_train.shape

    # # Get true process integers for training dataset
    # original_encoded_train_Y = []
    # for i in xrange(len(result_probs)):
    #     if Y_train[i][0] == 1:
    #         original_encoded_train_Y.append(0)
    #     if Y_train[i][1] == 1:
    #         original_encoded_train_Y.append(1)
    #     if Y_train[i][2] == 1:
    #         original_encoded_train_Y.append(2)
    #     if Y_train[i][3] == 1:
    #         original_encoded_train_Y.append(3)

    # Get true class values for testing dataset
    # result_classes_test = newencoder.inverse_transform(result_classes_test)
    # result_classes_train = newencoder.inverse_transform(result_classes)
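    # SHAP DeepExplainer: use the first 400 training events as the background/reference
    # sample and attribute the DNN output of 400 test events to the individual input features.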
    e = shap.DeepExplainer(model, X_train[:400, ])
    shap_values = e.shap_values(X_test[:400, ])
    Plotter.plot_dot(title="DeepExplainer_sigmoid_y0", x=X_test[:400, ], shap_values=shap_values, column_headers=column_headers)
    Plotter.plot_dot_bar(title="DeepExplainer_Bar_sigmoid_y0", x=X_test[:400,], shap_values=shap_values, column_headers=column_headers)
    #e = shap.GradientExplainer(model, X_train[:100, ])
    #shap_values = e.shap_values(X_test[:100, ])
    #Plotter.plot_dot(title="GradientExplainer_sigmoid_y0", x=X_test[:100, ], shap_values=shap_values, column_headers=column_headers)
    #e = shap.KernelExplainer(model.predict, X_train[:100, ])
    #shap_values = e.shap_values(X_test[:100, ])
    #Plotter.plot_dot(title="KernelExplainer_sigmoid_y0", x=X_test[:100, ],shap_values=shap_values, column_headers=column_headers)
    #Plotter.plot_dot_bar(title="KernelExplainer_Bar_sigmoid_y0", x=X_test[:100,], shap_values=shap_values, column_headers=column_headers)
    #Plotter.plot_dot_bar_all(title="KernelExplainer_bar_All_Var_sigmoid_y0", x=X_test[:100,], shap_values=shap_values, column_headers=column_headers)

    # Create confusion matrices for training and testing performance
    # Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'index')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TRAIN.png')
    # Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'index')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TEST.png')

    # Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'columns')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_columns_confusion_matrix_TRAIN.png')
    # Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'columns')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_columns_confusion_matrix_TEST.png')

    # Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'')
    # Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TRAIN.png')
    # Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'')
    # Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TEST.png')

    Plotter.ROC_sklearn(Y_train, result_probs, Y_test, result_probs_test, 1 , 'BinaryClassifierROC',train_weights, test_weights)
Exemplo n.º 7
0
def main():
    print('')
    DNN_applier = apply_DNN()

    usage = 'usage: %prog [options]'
    parser = argparse.ArgumentParser(usage)

    parser.add_argument(
        '-p',
        '--processName',
        dest='processName',
        help=
        'Process name. List of options in keys of process_filename dictionary',
        default=[],
        type=str,
        nargs='+')
    parser.add_argument(
        '-d',
        '--modeldir',
        dest='modeldir',
        help='Option to choose directory containing trained model')

    args = parser.parse_args()
    processes = args.processName
    nClasses = 1
    modeldir = args.modeldir
    print(
        '<run_network_evaluation> Successfully parsed arguments: processName= [%s], model directory= %s'
        % (processes, modeldir))

    input_var_jsonFile = ''

    # Open and load input variable .json
    input_var_jsonFile = open('../input_variables.json', 'r')
    variable_list = json.load(input_var_jsonFile, encoding="utf-8").items()

    # Append variables to a list of column headers for .csv file later
    column_headers = []
    for key, var in variable_list:
        column_headers.append(key)
    column_headers.append('event')
    column_headers.append('weight')

    # Dictionary of filenames to be run over along with their keys.
    process_filename = {
        'HHWWgg': ('HHWWgg-SL-SM-NLO-2017'),
        'DiPhoton': ('DiPhotonJetsBox_MGG-80toInf_13TeV-Sherpa_Hadded'),
        'GJet_Pt-20toInf':
        ('GJet_Pt-20toInf_DoubleEMEnriched_MGG-40to80_TuneCP5_13TeV_Pythia8_Hadded'
         ),
        'GJet_Pt-20to40':
        ('GJet_Pt-20to40_DoubleEMEnriched_MGG-80toInf_TuneCP5_13TeV_Pythia8_Hadded'
         ),
        'GJet_Pt-40toInf':
        ('GJet_Pt-40toInf_DoubleEMEnriched_MGG-80toInf_TuneCP5_13TeV_Pythia8_Hadded'
         ),
        'DYJetsToLL_M-50':
        ('DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8_Hadded'),
        'TTGJets':
        ('TTGJets_TuneCP5_13TeV-amcatnloFXFX-madspin-pythia8_Hadded'),
        'TTGG': ('TTGG_0Jets_TuneCP5_13TeV_amcatnlo_madspin_pythia8_Hadded'),
        'TTJets_HT-600to800':
        ('TTJets_HT-600to800_TuneCP5_13TeV-madgraphMLM-pythia8_Hadded'),
        'TTJets_HT-800to1200':
        ('TTJets_HT-800to1200_TuneCP5_13TeV-madgraphMLM-pythia8_Hadded'),
        'TTJets_HT-1200to2500':
        ('TTJets_HT-1200to2500_TuneCP5_13TeV-madgraphMLM-pythia8_Hadded'),
        'TTJets_HT-2500toInf':
        ('TTJets_HT-2500toInf_TuneCP5_13TeV-madgraphMLM-pythia8_Hadded'),
        'W1JetsToLNu_LHEWpT_0-50':
        ('W1JetsToLNu_LHEWpT_0-50_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'),
        'W1JetsToLNu_LHEWpT_50-150':
        ('W1JetsToLNu_LHEWpT_50-150_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'),
        'W1JetsToLNu_LHEWpT_150-250':
        ('W1JetsToLNu_LHEWpT_150-250_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'),
        'W1JetsToLNu_LHEWpT_250-400':
        ('W1JetsToLNu_LHEWpT_250-400_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'),
        'W1JetsToLNu_LHEWpT_400-inf':
        ('W1JetsToLNu_LHEWpT_400-inf_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'),
        'W2JetsToLNu_LHEWpT_0-50':
        ('W2JetsToLNu_LHEWpT_0-50_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'),
        'W2JetsToLNu_LHEWpT_50-150':
        ('W2JetsToLNu_LHEWpT_50-150_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'),
        'W2JetsToLNu_LHEWpT_150-250':
        ('W2JetsToLNu_LHEWpT_150-250_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'),
        'W2JetsToLNu_LHEWpT_250-400':
        ('W2JetsToLNu_LHEWpT_250-400_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'),
        'W2JetsToLNu_LHEWpT_400-inf':
        ('W2JetsToLNu_LHEWpT_400-inf_TuneCP5_13TeV-amcnloFXFX-pythia8_Hadded'),
        'W3JetsToLNu':
        ('W3JetsToLNu_TuneCP5_13TeV-madgraphMLM-pythia8_Hadded'),
        'W4JetsToLNu':
        ('W4JetsToLNu_TuneCP5_13TeV-madgraphMLM-pythia8_Hadded'),
        'ttHJetToGG':
        ('ttHJetToGG_M125_13TeV_amcatnloFXFX_madspin_pythia8_Hadded')
        #'Data' : ('Data_'+JESname+region)
    }

    training_columns = column_headers[:-2]
    num_variables = len(training_columns)

    # Load trained model
    model_name_1 = os.path.join('../', modeldir, 'model.h5')
    print('<run_network_evaluation> Using Model: ', model_name_1)
    model_1 = load_model(model_name_1, compile=False)
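    # compile=False: the model is only evaluated here, so the optimizer/loss
    # configuration does not need to be restored.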
    # Make instance of plotter class
    Plotter = plotter()

    # Lists for all events in all files. Used to make diagnostic plots of networks performance over all samples.
    true_process = []
    model1_probs_ = []
    EventWeights_ = []

    succesfully_run_files = open("succesfully_run_files.txt", "w+")
    # Now loop over all samples
    for process in processes:
        print('<run_network_evaluation> Process: ', process)
        current_sample_name = process_filename.get(process)
        inputs_file_path = '/Users/joshuhathomas-wilsker/Documents/work/lxplus_remote/work/private/IHEP/HH/HHWWyy/HHWWgg_DataSignalMCnTuples/2017/'
        if 'HHWWgg' in process:
            inputs_file_path += 'Signal/'
        else:
            inputs_file_path += 'Bkgs/'

        print('<run_network_evaluation> Input file directory: ',
              inputs_file_path)

        # Make final output directory
        samples_dir_w_appended_DNN = 'samples_w_DNN'
        if not os.path.exists(samples_dir_w_appended_DNN):
            os.makedirs(samples_dir_w_appended_DNN)
        samples_final_path_dir = os.path.join(samples_dir_w_appended_DNN,
                                              modeldir)
        if not os.path.exists(samples_final_path_dir):
            os.makedirs(samples_final_path_dir)

        dataframe_name = '%s/%s_dataframe.csv' % (samples_final_path_dir,
                                                  process)
        if os.path.isfile(dataframe_name):
            print('<run_network_evaluation> Loading %s . . . . ' %
                  dataframe_name)
            data = pandas.read_csv(dataframe_name)
        else:
            print(
                '<run_network_evaluation> Making *new* data file from %s . . . . '
                % (inputs_file_path))
            selection_criteria = '( ( (Leading_Photon_pt/CMS_hgg_mass) > 0.35 ) && ( (Subleading_Photon_pt/CMS_hgg_mass) > 0.25 ) && passbVeto==1 && ExOneLep==1 && N_goodJets>=1 )'
            data = DNN_applier.load_data(inputs_file_path, column_headers,
                                         selection_criteria,
                                         current_sample_name)
            if len(data) == 0:
                print('<run_network_evaluation> No data! Next file.')
                continue
            print(
                '<run_network_evaluation> Saving new data .csv file at %s . . . . '
                % (dataframe_name))
            print(
                '<run_network_evaluation> Found events passing selection. Process name will be stored in succesfully_run_files.txt'
            )
            succesfully_run_files.write(process)

            data = data.replace(to_replace=-999.000000, value=-9.0)
            data.to_csv(dataframe_name, index=False)

        nHH = len(data.iloc[data.target.values == 1])
        nbckg = len(data.iloc[data.target.values == 0])

        print("<run_network_evaluation> Total length of HH = %i, bckg = %i" %
              (nHH, nbckg))

        # Create dataset from dataframe to evaluate DNN
        X_test = data[training_columns].values
        result_probs_ = model_1.predict(np.array(X_test))
        nEvent = data['event']

        if len(result_probs_) < 1.:
            print('<run_network_evaluation> Warning: only %s test values.' %
                  (len(result_probs_)))
            print('<run_network_evaluation> Probabilities: ', result_probs_)
            print('<run_network_evaluation> Exiting now.')
            exit(0)

        # Dictionary:
        # key = event number : value = DNN output
        eventnum_resultsprob_dict = {}
        for index in range(len(nEvent)):
            #print('nEvent= %s , prob = %s' % (nEvent[index], result_probs_[index]))
            eventnum_resultsprob_dict[nEvent[index]] = result_probs_[index]
            model1_probs_.append(result_probs_[index])

        print(current_sample_name)
        infile = inputs_file_path + current_sample_name + ".root"
        print('<run_network_evaluation> Input file: ', infile)

        # Open file and load ttrees
        data_file = TFile.Open(infile)
        if 'HHWWgg' in current_sample_name:
            treename = [
                'GluGluToHHTo2G2Qlnu_node_cHHH1_TuneCP5_PSWeights_13TeV_powheg_pythia8alesauva_2017_1_10_6_4_v0_RunIIFall17MiniAODv2_PU2017_12Apr2018_94X_mc2017_realistic_v14_v1_1c4bfc6d0b8215cc31448570160b99fdUSER'
            ]
        elif 'DiPhotonJetsBox_MGG' in current_sample_name:
            treename = ['DiPhotonJetsBox_MGG_80toInf_13TeV_Sherpa']
        elif 'GJet_Pt-20toInf' in current_sample_name:
            treename = [
                'GJet_Pt_20toInf_DoubleEMEnriched_MGG_40to80_TuneCP5_13TeV_Pythia8'
            ]
        elif 'GJet_Pt-20to40' in current_sample_name:
            treename = [
                'GJet_Pt_20to40_DoubleEMEnriched_MGG_80toInf_TuneCP5_13TeV_Pythia8'
            ]
        elif 'GJet_Pt-40toInf' in current_sample_name:
            treename = [
                'GJet_Pt_40toInf_DoubleEMEnriched_MGG_80toInf_TuneCP5_13TeV_Pythia8'
            ]
        elif 'DYJetsToLL_M-50_TuneCP5' in current_sample_name:
            treename = ['DYJetsToLL_M_50_TuneCP5_13TeV_amcatnloFXFX_pythia8']
        elif 'TTGG' in current_sample_name:
            treename = ['TTGG_0Jets_TuneCP5_13TeV_amcatnlo_madspin_pythia8']
        elif 'TTGJets' in current_sample_name:
            treename = ['TTGJets_TuneCP5_13TeV_amcatnloFXFX_madspin_pythia8']
        elif 'TTJets_HT-600to800' in current_sample_name:
            treename = ['TTJets_HT_600to800_TuneCP5_13TeV_madgraphMLM_pythia8']
        elif 'TTJets_HT-800to1200' in current_sample_name:
            treename = [
                'TTJets_HT_800to1200_TuneCP5_13TeV_madgraphMLM_pythia8'
            ]
        elif 'TTJets_HT-1200to2500' in current_sample_name:
            treename = [
                'TTJets_HT_1200to2500_TuneCP5_13TeV_madgraphMLM_pythia8'
            ]
        elif 'TTJets_HT-2500toInf' in current_sample_name:
            treename = [
                'TTJets_HT_2500toInf_TuneCP5_13TeV_madgraphMLM_pythia8'
            ]
        elif 'W1JetsToLNu_LHEWpT_0-50' in current_sample_name:
            treename = [
                'W1JetsToLNu_LHEWpT_0_50_TuneCP5_13TeV_amcnloFXFX_pythia8'
            ]
        elif 'W1JetsToLNu_LHEWpT_50-150' in current_sample_name:
            treename = [
                'W1JetsToLNu_LHEWpT_50_150_TuneCP5_13TeV_amcnloFXFX_pythia8'
            ]
        elif 'W1JetsToLNu_LHEWpT_150-250' in current_sample_name:
            treename = [
                'W1JetsToLNu_LHEWpT_150_250_TuneCP5_13TeV_amcnloFXFX_pythia8'
            ]
        elif 'W1JetsToLNu_LHEWpT_250-400' in current_sample_name:
            treename = [
                'W1JetsToLNu_LHEWpT_250_400_TuneCP5_13TeV_amcnloFXFX_pythia8'
            ]
        elif 'W1JetsToLNu_LHEWpT_400-inf' in current_sample_name:
            treename = [
                'W1JetsToLNu_LHEWpT_400_inf_TuneCP5_13TeV_amcnloFXFX_pythia8'
            ]
        elif 'W2JetsToLNu_LHEWpT_0-50' in current_sample_name:
            treename = [
                'W2JetsToLNu_LHEWpT_0_50_TuneCP5_13TeV_amcnloFXFX_pythia8'
            ]
        elif 'W2JetsToLNu_LHEWpT_50-150' in current_sample_name:
            treename = [
                'W2JetsToLNu_LHEWpT_50_150_TuneCP5_13TeV_amcnloFXFX_pythia8'
            ]
        elif 'W2JetsToLNu_LHEWpT_150-250' in current_sample_name:
            treename = [
                'W2JetsToLNu_LHEWpT_150_250_TuneCP5_13TeV_amcnloFXFX_pythia8'
            ]
        elif 'W2JetsToLNu_LHEWpT_250-400' in current_sample_name:
            treename = [
                'W2JetsToLNu_LHEWpT_250_400_TuneCP5_13TeV_amcnloFXFX_pythia8'
            ]
        elif 'W2JetsToLNu_LHEWpT_400-inf' in current_sample_name:
            treename = [
                'W2JetsToLNu_LHEWpT_400_inf_TuneCP5_13TeV_amcnloFXFX_pythia8'
            ]
        elif 'W3JetsToLNu' in current_sample_name:
            treename = ['W3JetsToLNu_TuneCP5_13TeV_madgraphMLM_pythia8']
        elif 'W4JetsToLNu' in current_sample_name:
            treename = ['W4JetsToLNu_TuneCP5_13TeV_madgraphMLM_pythia8']
        elif 'ttHJetToGG' in current_sample_name:
            treename = ['ttHJetToGG_M125_13TeV_amcatnloFXFX_madspin_pythia8']
        else:
            print(
                '<run_network_evaluation> Warning: Process name not recognised. Exiting.'
            )
            exit(0)
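        # A minimal sketch (not the original implementation): the elif chain above
        # could be written as an ordered substring -> tree-name mapping, e.g.
        # (only two illustrative entries shown, the rest follow the same pattern):
        #
        #   tree_lookup = {
        #       'DiPhotonJetsBox_MGG': ['DiPhotonJetsBox_MGG_80toInf_13TeV_Sherpa'],
        #       'TTGG': ['TTGG_0Jets_TuneCP5_13TeV_amcatnlo_madspin_pythia8'],
        #   }
        #   treename = next((names for key, names in tree_lookup.items()
        #                    if key in current_sample_name), None)
        #   if treename is None:
        #       print('<run_network_evaluation> Warning: Process name not recognised. Exiting.')
        #       exit(0)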

        # Open each TTree in file and loop over events.
        # Append evaluated DNN score to DNN branch for each event.
        # Score assigned to event according to event number.
        for tname in treename:
            print('<run_network_evaluation> TTree: ', tname)
            data_tree = data_file.Get(tname)
            # Guard against a corrupt input file or a missing TTree
            if data_file.IsZombie() or not data_tree:
                raise IOError('Could not read TTree %s from %s' % (tname, infile))

            output_file_name = '%s/%s.root' % (samples_final_path_dir,
                                               process_filename.get(process))
            print('<run_network_evaluation> Creating new output .root file')
            output_file = TFile.Open(output_file_name, 'RECREATE')

            # Clone empty tree
            output_tree = data_tree.CloneTree(0)
            output_tree.SetName("output_tree")
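            # CloneTree(0) copies the branch structure only; entries that pass the
            # selection are filled one by one in the event loop below.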

            # Keep all branches active in the output tree
            # (branches could be selectively disabled here to reduce run-time).
            output_tree.SetBranchStatus('*', 1)

            # Append DNN Branches to new TTree
            DNN_evaluation = array('f', [0.])
            DNN_evaluation_branch = output_tree.Branch('DNN_evaluation',
                                                       DNN_evaluation,
                                                       'DNN_evaluation/F')
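            # The array('f', [0.]) buffer gives ROOT a fixed memory address to read
            # each time output_tree.Fill() is called; updating DNN_evaluation[0]
            # per event updates the value written to the DNN_evaluation branch.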

            sample_name = process

            histo_DNN_values_title = 'DNN values: %s Sample' % (sample_name)
            histo_DNN_values_name = 'histo_DNN_values_%s_sample' % (
                sample_name)
            histo_DNN_values = ROOT.TH1D(histo_DNN_values_name,
                                         histo_DNN_values_title, 200, 0, 1.)

            temp_percentage_done = 0

            ######## Loop over ttree #########
            print('<run_network_evaluation> data_tree # Entries: ',
                  data_tree.GetEntries())
            if output_tree.GetEntries() != 0:
                print('<run_network_evaluation> output_tree # Entries: ',
                      output_tree.GetEntries())
                print(
                    '<run_network_evaluation> Warning: the cloned tree should be empty here; check that CloneTree(0) produced an empty clone.'
                )

            for i in range(data_tree.GetEntries()):
                DNN_evaluation[0] = -1.

                percentage_done = int(100 * float(i) /
                                      float(data_tree.GetEntries()))
                if percentage_done % 10 == 0:
                    if percentage_done != temp_percentage_done:
                        print('<run_network_evaluation> %d%% of entries processed'
                              % percentage_done)
                        temp_percentage_done = percentage_done
                data_tree.GetEntry(i)

                # Read the branch values needed to re-apply the event selection
                Eventnum_ = data_tree.event
                EventWeight_ = data_tree.weight
                passbVeto = data_tree.passbVeto
                ExOneLep = data_tree.ExOneLep
                Leading_Photon_pt = data_tree.Leading_Photon_pt
                Subleading_Photon_pt = data_tree.Subleading_Photon_pt
                CMS_hgg_mass = data_tree.CMS_hgg_mass
                N_goodJets = data_tree.N_goodJets

                # Re-apply the selection that was used when building the .csv
                pass_selection = ((Leading_Photon_pt / CMS_hgg_mass) > 0.35
                                  and (Subleading_Photon_pt / CMS_hgg_mass) > 0.25
                                  and passbVeto == 1 and ExOneLep == 1
                                  and N_goodJets >= 1)

                if not pass_selection:
                    continue

                if 'HHWWgg' in process:
                    true_process.append(1)
                else:
                    true_process.append(0)

                EventWeights_.append(EventWeight_)
                # Look up the DNN score for this event once and reuse it
                DNN_score = eventnum_resultsprob_dict.get(Eventnum_)[0]
                histo_DNN_values.Fill(DNN_score, EventWeight_)
                DNN_evaluation[0] = DNN_score
                output_tree.Fill()

        eventnum_resultsprob_dict.clear()
        output_file.Write()
        output_file.Close()
        data_file.Close()

    #plots_dir = os.path.join(samples_final_path_dir,'plots/')
    #Plotter.plots_directory = plots_dir
    #model1_probs_ = np.array(model1_probs_)
    #Plotter.ROC_sklearn(true_process, model1_probs_, true_process, model1_probs_, 0 , 'ttHnode')
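    # A minimal sketch of the ROC check commented out above, using sklearn directly
    # rather than the Plotter helper; it assumes matched, non-empty label/score
    # arrays (true_process, model1_probs_) of equal length:
    #
    #   from sklearn.metrics import roc_auc_score
    #   y_score = np.array([p[0] for p in model1_probs_])
    #   if len(true_process) == len(y_score) and len(y_score) > 0:
    #       print('<run_network_evaluation> ROC AUC: ',
    #             roc_auc_score(true_process, y_score))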

    exit(0)