def main(opts):
    # Save start time (epoch seconds)
    tStart = time.time()
    Verbose("Started @ " + str(tStart), True)

    # Do not display canvases & disable screen output info
    ROOT.gROOT.SetBatch(ROOT.kTRUE)
    ROOT.gROOT.ProcessLine("gErrorIgnoreLevel = 1001;")

    # Setup the style
    style = tdrstyle.TDRStyle()
    style.setOptStat(False)
    style.setGridX(opts.gridX)
    style.setGridY(opts.gridY)

    # Open the ROOT file and fetch the BDTG efficiency histograms
    f = ROOT.TFile.Open(opts.rootFileName)
    directory = f.Get("Method_BDT/BDTG")
    #hBDT_signal = directory.Get("MVA_BDTG_S_high")
    #hBDT_bkg    = directory.Get("MVA_BDTG_B_high")
    hBDT_signal = directory.Get("MVA_BDTG_effS")
    hBDT_bkg    = directory.Get("MVA_BDTG_effB")
    graphBDT    = func.GetROC(hBDT_signal, hBDT_bkg)

    # Save the efficiency graphs to a JSON file for future use
    jsonWr = JsonWriter(saveDir=opts.saveDir, verbose=opts.verbose)
    #jsonWr.addGraph("ROC", graphBDT)
    jsonWr.addGraph("EfficiencySig", func.convertHistoToGaph(hBDT_signal))
    jsonWr.addGraph("EfficiencyBkg", func.convertHistoToGaph(hBDT_bkg))
    jsonWr.write(opts.resultsJSON)
    return
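# The ROC construction above relies on func.GetROC, which is defined elsewhere in
# this package. As a hedged sketch only (not the package implementation), building
# a ROC TGraph from the signal- and background-efficiency histograms read above
# could look like the helper below; it assumes both histograms share the same
# binning over the cut axis, and the helper name is illustrative.
def _sketchROCFromEfficiencies(hEffS, hEffB):
    nBins = hEffS.GetNbinsX()
    graph = ROOT.TGraph(nBins)
    for i in range(1, nBins + 1):
        effSig = hEffS.GetBinContent(i)        # signal efficiency at this cut value
        rejBkg = 1.0 - hEffB.GetBinContent(i)  # background rejection at the same cut
        graph.SetPoint(i - 1, effSig, rejBkg)
    return graph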
def main():
    ROOT.gStyle.SetOptStat(0)
    style = tdrstyle.TDRStyle()
    style.setOptStat(False)
    style.setGridX(True)
    style.setGridY(True)

    # Definitions
    filename = "histograms-TT_19var.root"
    debug    = 1
    nprint   = 100
    tfile    = ROOT.TFile.Open(filename)

    # Signal and background branches
    signal     = uproot.open(filename)["treeS"]
    background = uproot.open(filename)["treeB"]

    # Input list
    inputList = []
    inputList.append("TrijetPtDR")
    inputList.append("TrijetDijetPtDR")
    inputList.append("TrijetBjetMass")
    inputList.append("TrijetLdgJetBDisc")
    inputList.append("TrijetSubldgJetBDisc")
    inputList.append("TrijetBJetLdgJetMass")
    inputList.append("TrijetBJetSubldgJetMass")
    inputList.append("TrijetMass")
    inputList.append("TrijetDijetMass")
    inputList.append("TrijetBJetBDisc")
    inputList.append("TrijetSoftDrop_n2")
    inputList.append("TrijetLdgJetCvsL")
    inputList.append("TrijetSubldgJetCvsL")
    inputList.append("TrijetLdgJetPtD")
    inputList.append("TrijetSubldgJetPtD")
    inputList.append("TrijetLdgJetAxis2")
    inputList.append("TrijetSubldgJetAxis2")
    inputList.append("TrijetLdgJetMult")
    inputList.append("TrijetSubldgJetMult")
    nInputs = len(inputList)

    # Signal and background dataframes
    df_signal     = signal.pandas.df(inputList)
    df_background = background.pandas.df(inputList)
    nEvts = len(df_signal.index)
    print "=== Number of signal events: ", nEvts

    # Signal and background datasets
    dset_signal     = df_signal.values
    dset_background = df_background.values

    # Concat signal, background datasets
    df_list = [df_signal, df_background]
    df_all  = pandas.concat(df_list)
    dataset = df_all.values
    dataset_target_all = pandas.concat([
        signal.pandas.df(["TrijetMass"]),
        background.pandas.df(["TrijetMass"])
        ]).values
    dataset_target_bkg    = background.pandas.df(["TrijetMass"]).values
    dataset_target_signal = signal.pandas.df(["TrijetMass"]).values

    X_signal     = dset_signal[:nEvts, 0:nInputs]
    X_background = dset_background[:nEvts, 0:nInputs]
    #nEvts = nEvts
    X = dset_background[:nEvts, 0:nInputs]

    colors = [ROOT.kBlue, ROOT.kMagenta, ROOT.kRed, ROOT.kOrange, ROOT.kYellow, ROOT.kGreen,
              ROOT.kCyan, ROOT.kViolet + 5, ROOT.kPink + 5, ROOT.kOrange + 5, ROOT.kSpring + 5, ROOT.kTeal + 5]
    lines     = [1, 2, 9, 3, 6, 7, 10]
    graphList = []
    nameList  = []

    # Load keras model
    model = load_model("Model_relu_relu_relu_sigmoid.h5")
    model.compile(loss='binary_crossentropy', optimizer='adam')
    Y_signal = model.predict(X_signal, verbose=0)
    Y_bkg    = model.predict(X_background, verbose=0)
    htrain_s, htest_s, htrain_b, htest_b = func.PlotOvertrainingTest(Y_signal, Y_signal, Y_bkg, Y_bkg, "plotROC", "model", ["pdf"])
    graph = func.GetROC(htest_s, htest_b)
    graphList.append(graph.Clone("NN"))
    nameList.append("Neural Network")

    # Read BDTG results
    f = ROOT.TFile.Open("TopRecoTree_191009_083540_multFloat.root")
    directory   = f.Get("Method_BDT/BDTG")
    hBDT_signal = directory.Get("MVA_BDTG_S_high")
    hBDT_bkg    = directory.Get("MVA_BDTG_B_high")
    func.PlotEfficiency(hBDT_signal, hBDT_bkg, "TestROC", "EfficBDT", ["pdf"])
    graphBDT = func.GetROC(hBDT_signal, hBDT_bkg)
    graphList.append(graphBDT.Clone("BDT"))
    nameList.append("BDTG")

    # Plot the NN and BDTG ROC curves on the same canvas
    graph_roc = {"graph": graphList, "name": nameList}
    func.PlotROC(graph_roc, "TestROC", "NN_vs_BDTG", ["pdf", "C"])
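# The NN score histograms used above come out of func.PlotOvertrainingTest.
# A minimal, hedged sketch of filling such a score histogram directly from the
# model.predict() array (assuming a sigmoid output bounded in [0, 1]) might look
# like the helper below; its name and binning are illustrative, not part of the package.
def _sketchScoreHisto(name, scores, nBins=50):
    h = ROOT.TH1F(name, name, nBins, 0.0, 1.0)
    for s in scores:
        h.Fill(float(s))  # model.predict() returns an (N, 1) array of scores
    return h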
def main(opts):
    # Save start time (epoch seconds)
    tStart = time.time()
    Verbose("Started @ " + str(tStart), True)

    # Do not display canvases & disable screen output info
    ROOT.gROOT.SetBatch(ROOT.kTRUE)
    ROOT.gROOT.ProcessLine("gErrorIgnoreLevel = 1001;")

    # Setup the style
    style = tdrstyle.TDRStyle()
    style.setOptStat(False)
    style.setGridX(opts.gridX)
    style.setGridY(opts.gridY)

    # Open the ROOT file
    ROOT.TFile.Open(opts.rootFileName)

    # Set the seed for numpy-generated random numbers
    numpy.random.seed(opts.rndSeed)

    # Set the seed for python random numbers
    rn.seed(opts.rndSeed)

    # Set the tensorflow random seed
    tf.set_random_seed(opts.rndSeed)

    # Open the signal and background TTrees with uproot (uproot allows one to read ROOT data, in python, without using ROOT)
    Print("Opening the signal and background TTrees with uproot using ROOT file %s" % (ts + opts.rootFileName + ns), True)
    signal     = uproot.open(opts.rootFileName)["treeS"]
    background = uproot.open(opts.rootFileName)["treeB"]

    # Input list of discriminating variables (TBranches)
    inputList = []
    inputList.append("TrijetPtDR")
    inputList.append("TrijetDijetPtDR")
    inputList.append("TrijetBjetMass")
    inputList.append("TrijetLdgJetBDisc")
    inputList.append("TrijetSubldgJetBDisc")
    inputList.append("TrijetBJetLdgJetMass")
    inputList.append("TrijetBJetSubldgJetMass")
    #inputList.append("TrijetMass")
    inputList.append("TrijetDijetMass")
    inputList.append("TrijetBJetBDisc")
    inputList.append("TrijetSoftDrop_n2")
    inputList.append("TrijetLdgJetCvsL")
    inputList.append("TrijetSubldgJetCvsL")
    inputList.append("TrijetLdgJetPtD")
    inputList.append("TrijetSubldgJetPtD")
    inputList.append("TrijetLdgJetAxis2")
    inputList.append("TrijetSubldgJetAxis2")
    inputList.append("TrijetLdgJetMult")
    inputList.append("TrijetSubldgJetMult")
    nInputs = len(inputList)

    # Construct signal and background dataframes using a list of TBranches (a DataFrame is a two-dimensional structure representing data in python)
    Print("Constructing dataframes for signal and background with %d input variables:\n\t%s%s%s" % (nInputs, ss, "\n\t".join(inputList), ns), True)
    df_signal = signal.pandas.df(inputList)     # call an array-fetching method to fill a Pandas DataFrame
    df_bkg    = background.pandas.df(inputList)

    # Get the index (row labels) of the DataFrames
    nsignal = len(df_signal.index)
    nbkg    = len(df_bkg.index)
    Verbose("Signal has %s%d%s row labels. Background has %s%d%s row labels" % (ss, nsignal, ns, es, nbkg, ns), True)

    # Apply rule-of-thumb to prevent over-fitting
    checkNeuronsPerLayer(nsignal, opts)

    # Sanity check
    columns = list(df_signal.columns.values)
    Verbose("The signal columns are :\n\t%s%s%s" % (ss, "\n\t".join(columns), ns), True)

    # Get a Numpy representation of the DataFrames for signal and background datasets
    Verbose("Getting a numpy representation of the DataFrames for signal and background datasets", True)
    dset_signal = df_signal.values
    dset_bkg    = df_bkg.values
    printDataset(dset_signal)
    printDataset(dset_bkg)

    # Construct pandas DataFrames (2D size-mutable tabular data structures with labeled axes, i.e. rows and columns)
    Verbose("Constructing pandas DataFrames for signal and background", True)
    ds_signal = pandas.DataFrame(data=dset_signal, columns=inputList)
    ds_bkg    = pandas.DataFrame(data=dset_bkg   , columns=inputList)

    # Assign the target labels (signal = 1, background = 0)
    Verbose("Constructing pandas DataFrames", True)
    df_signal = df_signal.assign(signal=1)
    df_bkg    = df_bkg.assign(signal=0)
    Verbose("Printing tabular data for signal:\n%s%s%s" % (ss, ds_signal, ns), True)
    Verbose("Printing tabular data for background:\n%s%s%s" % (ss, ds_bkg, ns), True)

    # Create dataframe lists
    df_list = [df_signal, df_bkg]
    df_all  = pandas.concat(df_list)

    # Get a Numpy representation of the DataFrames for signal and background datasets (again, and AFTER assigning signal and background)
    dset_signal = df_signal.values
    dset_bkg    = df_bkg.values
    dset_all    = df_all.values

    # Define the Keras model as a linear stack of layers. Add layers one at a time until we are happy with our network architecture.
    Print("Creating the sequential Keras model", True)
    model = Sequential()

    # The best network structure is found through trial and error. Generally, you need a network large enough to capture the structure of the problem.
    # The Dense function defines each layer: how many neurons and which activation function to use.
    for iLayer, n in enumerate(opts.neurons, 0):
        layer = "layer #%d" % (int(iLayer)+1)
        if iLayer == 0:
            layer += " (input layer)"   # Takes the input variables, sometimes called the visible layer
        elif iLayer == len(opts.neurons)-1:
            layer += " (output layer)"  # The layer of nodes that produces the output variables
        else:
            layer += " (hidden layer)"  # A layer of nodes between the input and output layers. There may be one or more of these layers.
        Print("Adding %s, with %s%d neurons%s and activation function %s" % (ts + layer + ns, ls, n, ns, ls + opts.activation[iLayer] + ns), iLayer==0)

        if iLayer == 0:
            if opts.neurons[iLayer] != nInputs:
                msg = "The number of neurons must equal the number of features (columns) in your data. Some NN configurations add one additional node for a bias term"
                Print(msg, True)
            # Only the first layer requires input_dim. For the rest it is implied.
            model.add( Dense(opts.neurons[iLayer], input_dim=nInputs) ) #, weights = [np.zeros([692, 50]), np.zeros(50)] OR bias_initializer='zeros', or bias_initializer=initializers.Constant(0.1)
            model.add( Activation(opts.activation[iLayer]) )
            #model.add( Dense(opts.neurons[iLayer], input_dim=nInputs, activation=opts.activation[iLayer]) ) # her majesty Soti requested to break this into 2 lines
        else:
            if 0: #opts.neurons[iLayer] < nInputs:
                msg = "The number of neurons (=%d) is less than the number of input variables (=%d). Please set the number of neurons to be at least the number of inputs!" % (opts.neurons[iLayer], nInputs)
                raise Exception(es + msg + ns)
            model.add( Dense(opts.neurons[iLayer]) )
            model.add( Activation(opts.activation[iLayer]) )
            #model.add( Dense(opts.neurons[iLayer], activation=opts.activation[iLayer]) )

    # Print a summary representation of the model
    Print("Printing model summary:", True)
    model.summary()

    # Get a dictionary containing the configuration of the model. The model can be reinstantiated from its config via: model = Model.from_config(config)
    if 0:
        config = model.get_config()

    # Split data into input (X) and output (Y). (Note: the dataset includes both signal and background, sequentially)
    # Use only 2*nsignal rows => the first nsignal rows are the signal, the next nsignal rows are the background,
    # so signal and background have exactly the same number of rows
    X = dset_all[:2*nsignal, 0:nInputs]  # rows: 0 -> 2*nsignal, columns: 0 -> 19
    Y = dset_all[:2*nsignal, nInputs:]   # rows: 0 -> 2*nsignal, column: 19 (isSignal = 0 or 1)
    X_signal     = dset_signal[:nsignal, 0:nInputs]
    X_background = dset_bkg[:nsignal, 0:nInputs]
    Print("Signal dataset has %s%d%s rows. Background dataset has %s%d%s rows" % (ss, len(X_signal), ns, es, len(X_background), ns), True)

    # Split the datasets (X = 19 inputs, Y = output variable). Test size 0.5 means half for training, half for testing
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=opts.rndSeed, shuffle=True)
    opts.testSample  = len(X_test)
    opts.trainSample = len(X_train)

    # Early stop? Stop training when a monitored quantity has stopped improving.
    # Wait "patience" epochs with a change in the loss function smaller than "min_delta" before stopping
    # https://stackoverflow.com/questions/43906048/keras-early-stopping
    earlystop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0, patience=10, verbose=1, mode='auto')
    # earlystop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=50)
    callbacks = [earlystop]

    # The loss function is used to understand how well the network is working (compare the predicted label with the actual label via some function)
    # The optimizer function is used to optimise the weights
    Print("Compiling the model with the loss function %s and optimizer %s " % (ls + opts.lossFunction + ns, ts + opts.optimizer + ns), True)
    model.compile(loss=opts.lossFunction, optimizer=opts.optimizer, metrics=['accuracy'])
    #model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    # Customise the optimiser settings?
    if 0: #opts.optimizer == "adam": # does not work
        # Default parameters follow those provided in the original paper.
        # learning_rate: float >= 0. Learning rate.
        # beta_1: float, 0 < beta < 1. Generally close to 1.
        # beta_2: float, 0 < beta < 1. Generally close to 1.
        # amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm from the paper "On the Convergence of Adam and Beyond".
        keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)

    # Batch size equal to half the training sample size (see https://machinelearningmastery.com/use-different-batch-sizes-training-predicting-python-keras/)
    if opts.batchSize == None:
        opts.batchSize = len(X_train)/2

    # Fit the model with our data
    # (An "epoch" is an arbitrary cutoff, generally defined as "one iteration of training on the whole dataset",
    # used to separate training into distinct phases, which is useful for logging and periodic evaluation.)
    try:
        hist = model.fit(X_train,
                         Y_train,
                         validation_data=(X_test, Y_test),
                         epochs     = opts.epochs,    # a full pass over all of your training data
                         batch_size = opts.batchSize, # a set of N samples (https://stats.stackexchange.com/questions/153531/what-is-batch-size-in-neural-network)
                         shuffle    = False,
                         verbose    = 1,              # 0=silent, 1=progress bar, 2=one line per epoch
                         callbacks  = callbacks
                         )
    except KeyboardInterrupt: #(KeyboardInterrupt, SystemExit):
        msg = "Manually interrupted the training (keyboard interrupt)!"
        Print(es + msg + ns, True)

    # Write the model
    modelName = "Model_%s_trained.h5" % (opts.rootFileName.replace(".root",""))
    model.save(modelName)

    # Serialize the model to JSON (contains the architecture of the model)
    model_json = model.to_json()
    with open("model_architecture.json", "w") as json_file:
        json_file.write(model_json)

    # Serialize the weights to HDF5
    model.save_weights('model_weights.h5', overwrite=True)
    model.save(modelName)

    # Write weights and architecture to a txt file
    func.WriteModel(model, model_json, inputList, os.path.join(opts.saveDir, "model.txt"))

    # Produce the method score (i.e. predict the output value for a given input dataset). Computation is done in batches.
    # https://stackoverflow.com/questions/49288199/batch-size-in-model-fit-and-model-predict
    Print("Generating output predictions for the input samples", True) # (e.g. Numpy array)
    pred_train  = model.predict(X_train     , batch_size=None, verbose=1, steps=None)
    pred_test   = model.predict(X_test      , batch_size=None, verbose=1, steps=None)
    pred_signal = model.predict(X_signal    , batch_size=None, verbose=1, steps=None)
    pred_bkg    = model.predict(X_background, batch_size=None, verbose=1, steps=None)
    # pred_train = model.predict(x, batch_size=None, verbose=0, steps=None, callbacks=None, max_queue_size=10, workers=1, use_multiprocessing=False) # Keras version 2.2.5 or later (https://keras.io/models/model/)

    # Join a sequence of arrays (X and Y) along an existing axis (1). In other words, add the output variable (Y) to the input variables (X)
    XY_train = numpy.concatenate((X_train, Y_train), axis=1)
    XY_test  = numpy.concatenate((X_test , Y_test ), axis=1)

    # Pick events with output = 1
    Print("Select events/samples which have an output variable Y (last column) equal to 1 (i.e. true label is signal)", True)
    x_train_S = XY_train[XY_train[:,nInputs] == 1]; x_train_S = x_train_S[:,0:nInputs]
    x_test_S  = XY_test[XY_test[:,nInputs] == 1];   x_test_S  = x_test_S[:,0:nInputs]

    Print("Select events/samples which have an output variable Y (last column) equal to 0 (i.e. true label is background)", False)
    x_train_B = XY_train[XY_train[:,nInputs] == 0]; x_train_B = x_train_B[:,0:nInputs]
    x_test_B  = XY_test[XY_test[:,nInputs] == 0];   x_test_B  = x_test_B[:,0:nInputs]

    # Produce the method score for signal (training and test) and background (training and test)
    pred_train_S = model.predict(x_train_S, batch_size=None, verbose=1, steps=None)
    pred_train_B = model.predict(x_train_B, batch_size=None, verbose=1, steps=None)
    pred_test_S  = model.predict(x_test_S , batch_size=None, verbose=1, steps=None)
    pred_test_B  = model.predict(x_test_B , batch_size=None, verbose=1, steps=None)

    # Inform the user of an early stop
    stopEpoch = earlystop.stopped_epoch
    if stopEpoch != 0 and stopEpoch < opts.epochs:
        msg = "Early stop occurred after %d epochs!" % (stopEpoch)
        opts.epochs = stopEpoch
        Print(cs + msg + ns, True)

    # Create json files
    writeCfgFile(opts)
    writeGitFile(opts)
    jsonWr = JsonWriter(saveDir=opts.saveDir, verbose=opts.verbose)

    # Plot selected outputs and save them to a JSON file for future use
    func.PlotAndWriteJSON(pred_signal , pred_bkg    , opts.saveDir, "Output"     , jsonWr, opts.saveFormats)
    func.PlotAndWriteJSON(pred_train  , pred_test   , opts.saveDir, "OutputPred" , jsonWr, opts.saveFormats)
    func.PlotAndWriteJSON(pred_train_S, pred_train_B, opts.saveDir, "OutputTrain", jsonWr, opts.saveFormats)
    func.PlotAndWriteJSON(pred_test_S , pred_test_B , opts.saveDir, "OutputTest" , jsonWr, opts.saveFormats)

    # Plot overtraining test
    htrain_s, htest_s, htrain_b, htest_b = func.PlotOvertrainingTest(pred_train_S, pred_test_S, pred_train_B, pred_test_B, opts.saveDir, "OvertrainingTest", opts.saveFormats)

    # Plot summary plot (efficiency & significance)
    func.PlotEfficiency(htest_s, htest_b, opts.saveDir, "Summary", opts.saveFormats)

    # Write efficiencies (signal and bkg)
    xVals_S, xErrs_S, effVals_S, effErrs_S = func.GetEfficiency(htest_s)
    xVals_B, xErrs_B, effVals_B, effErrs_B = func.GetEfficiency(htest_b)
    func.PlotTGraph(xVals_S, xErrs_S, effVals_S, effErrs_S, opts.saveDir, "EfficiencySig", jsonWr, opts.saveFormats)
    func.PlotTGraph(xVals_B, xErrs_B, effVals_B, effErrs_B, opts.saveDir, "EfficiencyBkg", jsonWr, opts.saveFormats)

    # Write significances (default and alternative definitions)
    xVals, xErrs, sig_def, sig_alt = func.GetSignificance(htest_s, htest_b)
    func.PlotTGraph(xVals, xErrs, sig_def, effErrs_B, opts.saveDir, "SignificanceA", jsonWr, opts.saveFormats)
    func.PlotTGraph(xVals, xErrs, sig_alt, effErrs_B, opts.saveDir, "SignificanceB", jsonWr, opts.saveFormats)

    # Plot ROC curve
    gSig = func.GetROC(htest_s, htest_b)
    if 0:
        gBkg  = func.GetROC(htest_b, htest_s)
        gDict = {"graph": [gSig, gBkg], "name": ["signal", "bkg"]}
    else:
        gDict = {"graph": [gSig], "name": [os.path.basename(opts.saveDir)]}
    style.setLogY(True)
    func.PlotROC(gDict, opts.saveDir, "ROC", opts.saveFormats)

    # Write the resultsJSON file!
    jsonWr.write(opts.resultsJSON)

    # Print total time elapsed
    days, hours, mins, secs = GetTime(tStart)
    Print("Total elapsed time is %s days, %s hours, %s mins, %s secs" % (days[0], hours[0], mins[0], secs), True)
    return
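# The training function above writes model_architecture.json and model_weights.h5.
# A hedged sketch (standard Keras calls only) of reloading those files later for
# inference; "features" stands in for whatever input array one wants to evaluate,
# and the helper name is illustrative.
def _sketchReloadModel(features):
    from keras.models import model_from_json
    with open("model_architecture.json", "r") as json_file:
        reloaded = model_from_json(json_file.read())  # rebuild the architecture
    reloaded.load_weights("model_weights.h5")         # restore the trained weights
    reloaded.compile(loss="binary_crossentropy", optimizer="adam")
    return reloaded.predict(features, verbose=0)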
x_train_S = XY_train[XY_train[:,nInputs] == 1]; x_train_S = x_train_S[:,0:nInputs]
x_train_B = XY_train[XY_train[:,nInputs] == 0]; x_train_B = x_train_B[:,0:nInputs]
x_test_S  = XY_test[XY_test[:,nInputs] == 1];   x_test_S  = x_test_S[:,0:nInputs]
x_test_B  = XY_test[XY_test[:,nInputs] == 0];   x_test_B  = x_test_B[:,0:nInputs]

# Calculate the output of the model for the four datasets (training signal, training bkg, testing signal, testing bkg)
pred_train_S = model.predict(x_train_S)
pred_train_B = model.predict(x_train_B)
pred_test_S  = model.predict(x_test_S)
pred_test_B  = model.predict(x_test_B)

# Output directory
dirName = plot.getDirName("TopTag")

# X_signal     = dset_signal[:nsignal, 0:nInputs]
# X_background = dset_background[:nsignal, 0:nInputs]
# func.PlotOutput(pred_signal, pred_background, dirName, "Output_SB", 1, ["pdf"])
# func.PlotOutput(pred_train, pred_test, dirName, "Output_pred", 0, ["pdf"])
# func.PlotOutput(pred_train_S, pred_train_B, dirName, "Output_SB_train", 1, ["pdf"])
# func.PlotOutput(pred_test_S, pred_test_B, dirName, "Output_SB_test", 1, ["pdf"])

# Plot the output of the four datasets
htrain_s, htest_s, htrain_b, htest_b = func.PlotOvertrainingTest(pred_train_S, pred_test_S, pred_train_B, pred_test_B, dirName, "OvertrainingTest", ["pdf"])

# Calculate efficiency
func.PlotEfficiency(htest_s, htest_b, dirName, "Efficiency.pdf", ["pdf"])

# Plot ROC curve
graph1    = func.GetROC(htest_s, htest_b)
graph_roc = {"graph": [graph1], "name": ["graph 1"]}
func.PlotROC(graph_roc, dirName, "ROC", ["pdf"])
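# func.GetEfficiency and func.PlotEfficiency are package helpers. As an illustrative
# sketch only (not the package implementation): the cut efficiency at each threshold
# is the integral of the score histogram above that bin divided by the total integral.
def _sketchCutEfficiencies(h):
    total = h.Integral(1, h.GetNbinsX())
    effs  = []
    for i in range(1, h.GetNbinsX() + 1):
        passing = h.Integral(i, h.GetNbinsX())  # events with a score above this cut
        effs.append(passing / total if total > 0 else 0.0)
    return effs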