예제 #1
0
def main(opts):

    # Save start time (epoch seconds)
    tStart = time.time()
    Verbose("Started @ " + str(tStart), True)

    # Do not display canvases & disable screen output info
    ROOT.gROOT.SetBatch(ROOT.kTRUE)
    ROOT.gROOT.ProcessLine("gErrorIgnoreLevel = 1001;")

    # Setup the style
    style = tdrstyle.TDRStyle()
    style.setOptStat(False)
    style.setGridX(opts.gridX)
    style.setGridY(opts.gridY)

    # Open the ROOT file
    f = ROOT.TFile.Open(opts.rootFileName)
    directory = f.Get("Method_BDT/BDTG")
    #hBDT_signal = directory.Get("MVA_BDTG_S_high")
    #hBDT_bkg    = directory.Get("MVA_BDTG_B_high")
    hBDT_signal = directory.Get("MVA_BDTG_effS")
    hBDT_bkg = directory.Get("MVA_BDTG_effB")
    graphBDT = func.GetROC(hBDT_signal, hBDT_bkg)
    jsonWr = JsonWriter(saveDir=opts.saveDir, verbose=opts.verbose)
    #jsonWr.addGraph("ROC", graphBDT)
    jsonWr.addGraph("EfficiencySig", func.convertHistoToGaph(hBDT_signal))
    jsonWr.addGraph("EfficiencyBkg", func.convertHistoToGaph(hBDT_bkg))
    jsonWr.write(opts.resultsJSON)

    return
예제 #2
0
def main():
    ROOT.gStyle.SetOptStat(0)

    style = tdrstyle.TDRStyle()
    style.setOptStat(False)
    style.setGridX(True)
    style.setGridY(True)

    # Definitions
    filename = "histograms-TT_19var.root"
    debug = 1
    nprint = 100
    tfile = ROOT.TFile.Open(filename)

    #Signal and background branches
    signal = uproot.open(filename)["treeS"]
    background = uproot.open(filename)["treeB"]

    # Input list
    inputList = []
    inputList.append("TrijetPtDR")
    inputList.append("TrijetDijetPtDR")
    inputList.append("TrijetBjetMass")
    inputList.append("TrijetLdgJetBDisc")
    inputList.append("TrijetSubldgJetBDisc")
    inputList.append("TrijetBJetLdgJetMass")
    inputList.append("TrijetBJetSubldgJetMass")
    inputList.append("TrijetMass")
    inputList.append("TrijetDijetMass")
    inputList.append("TrijetBJetBDisc")
    inputList.append("TrijetSoftDrop_n2")
    inputList.append("TrijetLdgJetCvsL")
    inputList.append("TrijetSubldgJetCvsL")
    inputList.append("TrijetLdgJetPtD")
    inputList.append("TrijetSubldgJetPtD")
    inputList.append("TrijetLdgJetAxis2")
    inputList.append("TrijetSubldgJetAxis2")
    inputList.append("TrijetLdgJetMult")
    inputList.append("TrijetSubldgJetMult")

    nInputs = len(inputList)

    #Signal and background dataframes
    df_signal = signal.pandas.df(inputList)
    df_background = background.pandas.df(inputList)

    nEvts = len(df_signal.index)
    print "=== Number of signal events: ", nEvts

    #Signal and background datasets
    dset_signal = df_signal.values
    dset_background = df_background.values

    # Concat signal, background datasets
    df_list = [df_signal, df_background]
    df_all = pandas.concat(df_list)
    dataset = df_all.values
    dataset_target_all = pandas.concat([
        signal.pandas.df(["TrijetMass"]),
        background.pandas.df(["TrijetMass"])
    ]).values
    dataset_target_bkg = background.pandas.df(["TrijetMass"]).values
    dataset_target_signal = signal.pandas.df(["TrijetMass"]).values

    X_signal = dset_signal[:nEvts, 0:nInputs]
    X_background = dset_background[:nEvts, 0:nInputs]

    #nEvts = nEvts
    X = dset_background[:nEvts, 0:nInputs]

    colors = [
        ROOT.kBlue, ROOT.kMagenta, ROOT.kRed, ROOT.kOrange, ROOT.kYellow,
        ROOT.kGreen, ROOT.kCyan, ROOT.kViolet + 5, ROOT.kPink + 5,
        ROOT.kOrange + 5, ROOT.kSpring + 5, ROOT.kTeal + 5
    ]
    lines = [1, 2, 9, 3, 6, 7, 10]

    graphList = []
    nameList = []

    # Load keras model
    model = load_model("Model_relu_relu_relu_sigmoid.h5")
    model.compile(loss='binary_crossentropy', optimizer='adam')
    Y_signal = model.predict(X_signal, verbose=0)
    Y_bkg = model.predict(X_background, verbose=0)

    htrain_s, htest_s, htrain_b, htest_b = func.PlotOvertrainingTest(
        Y_signal, Y_signal, Y_bkg, Y_bkg, "plotROC", "model", ["pdf"])
    graph = func.GetROC(htest_s, htest_b)
    graphList.append(graph.Clone("NN"))
    nameList.append("Neural Network")

    # Read BDTG results
    f = ROOT.TFile.Open("TopRecoTree_191009_083540_multFloat.root")
    directory = f.Get("Method_BDT/BDTG")
    hBDT_signal = directory.Get("MVA_BDTG_S_high")
    hBDT_bkg = directory.Get("MVA_BDTG_B_high")
    func.PlotEfficiency(hBDT_signal, hBDT_bkg, "TestROC", "EfficBDT", ["pdf"])
    graphBDT = func.GetROC(hBDT_signal, hBDT_bkg)
    graphList.append(graphBDT.Clone("BDT"))
    nameList.append("BDTG")

    graph_roc = {"graph": graphList, "name": nameList}
    func.PlotROC(graph_roc, "TestROC", "NN_vs_BDTG", ["pdf", "C"])
예제 #3
0
def main(opts): 

    # Save start time (epoch seconds)
    tStart = time.time()
    Verbose("Started @ " + str(tStart), True)

    # Do not display canvases & disable screen output info
    ROOT.gROOT.SetBatch(ROOT.kTRUE)
    ROOT.gROOT.ProcessLine( "gErrorIgnoreLevel = 1001;")

    # Setup the style
    style = tdrstyle.TDRStyle() 
    style.setOptStat(False) 
    style.setGridX(opts.gridX)
    style.setGridY(opts.gridY)

    # Open the ROOT file
    ROOT.TFile.Open(opts.rootFileName)

    # Setting the seed for numpy-generated random numbers
    numpy.random.seed(opts.rndSeed)

    # Setting the seed for python random numbers
    rn.seed(opts.rndSeed)

    # Setting tensorflow random seed
    tf.set_random_seed(opts.rndSeed)

    # Open the signal and background TTrees with uproot (uproot allows one to read ROOT data, in python, without using ROOT)
    Print("Opening the signal and background TTrees with uproot using ROOT file %s" % (ts + opts.rootFileName + ns), True)
    signal     = uproot.open(opts.rootFileName)["treeS"]
    background = uproot.open(opts.rootFileName)["treeB"]

    # Input list of discriminatin variables (TBranches)
    inputList = []
    inputList.append("TrijetPtDR")
    inputList.append("TrijetDijetPtDR")
    inputList.append("TrijetBjetMass")
    inputList.append("TrijetLdgJetBDisc")
    inputList.append("TrijetSubldgJetBDisc")
    inputList.append("TrijetBJetLdgJetMass")
    inputList.append("TrijetBJetSubldgJetMass")
    #inputList.append("TrijetMass")
    inputList.append("TrijetDijetMass")
    inputList.append("TrijetBJetBDisc")
    inputList.append("TrijetSoftDrop_n2")
    inputList.append("TrijetLdgJetCvsL")
    inputList.append("TrijetSubldgJetCvsL")
    inputList.append("TrijetLdgJetPtD")
    inputList.append("TrijetSubldgJetPtD")
    inputList.append("TrijetLdgJetAxis2")
    inputList.append("TrijetSubldgJetAxis2")
    inputList.append("TrijetLdgJetMult")
    inputList.append("TrijetSubldgJetMult")
    nInputs = len(inputList)

    # Construct signal and background dataframes using a list of TBranches (a Dataframe is a two dimensional structure representing data in python)
    Print("Constucting dataframes for signal and background with %d input variables:\n\t%s%s%s" % (nInputs, ss, "\n\t".join(inputList), ns), True)
    df_signal = signal.pandas.df(inputList) # call an array-fetching method to fill a Pandas DataFrame.
    df_bkg    = background.pandas.df(inputList)

    # Get the index (row labels) of the DataFrame.
    nsignal = len(df_signal.index)
    nbkg    = len(df_bkg.index)
    Verbose("Signal has %s%d%s row labels. Background has %s%d%s row labels" % (ss, nsignal, ns, es, nbkg, ns), True)

    # Apply rule-of-thumb to prevent over-fitting
    checkNeuronsPerLayer(nsignal, opts)
    
    # Sanity check
    columns = list(df_signal.columns.values)
    Verbose("The signal columns are :\n\t%s%s%s" % (ss, "\n\t".join(columns), ns), True)    

    # Get a Numpy representation of the DataFrames for signal and background datasets
    Verbose("Getting a numpy representation of the DataFrames for signal and background datasets", True)
    dset_signal = df_signal.values
    dset_bkg    = df_bkg.values
    printDataset(dset_signal)
    printDataset(dset_bkg)

    # Construct the pandas DataFrames (2D size-mutable tabular data structure with labeled axes i.e. rows and columns)
    Verbose("Constructing pandas DataFrames for signal and background", True)
    ds_signal = pandas.DataFrame(data=dset_signal,columns=inputList)
    ds_bkg    = pandas.DataFrame(data=dset_bkg,columns=inputList)

    # Construct pandas DataFrames (2D size-mutable tabular data structure with labeled axes i.e. rows and columns)
    Verbose("Constructing pandas DataFrames", True)
    df_signal = df_signal.assign(signal=1)
    df_bkg    = df_bkg.assign(signal=0)
    Verbose("Printing tabular data for signal:\n%s%s%s" % (ss, ds_signal,ns), True)
    Verbose("Printing tabular data for background:\n%s%s%s" % (ss, ds_bkg,ns), True)
    
    # Create dataframe lists
    df_list = [df_signal, df_bkg]
    df_all  = pandas.concat(df_list)

    # Get a Numpy representation of the DataFrames for signal and background datasets (again, and AFTER assigning signal and background)
    dset_signal = df_signal.values
    dset_bkg    = df_bkg.values
    dset_all    = df_all.values

    # Define keras model as a linear stack of layers. Add layers one at a time until we are happy with our network architecture.
    Print("Creating the sequential Keras model", True)
    model = Sequential()

    # The best network structure is found through a process of trial and error experimentation. Generally, you need a network large enough to capture the structure of the problem.
    # The Dense function defines each layer - how many neurons and mathematical function to use.
    for iLayer, n in enumerate(opts.neurons, 0):
        layer = "%d layer#" % (int(iLayer)+1)
        if iLayer == 0:
            layer += " (input Layer)" # Input variables, sometimes called the visible layer.
        elif iLayer == len(opts.neurons)-1:
            layer += " (output Layer)" # Layers of nodes between the input and output layers. There may be one or more of these layers.
        else:            
            layer += " (hidden layer)" # A layer of nodes that produce the output variables.
            
        Print("Adding %s, with %s%d neurons%s and activation function %s" % (ts + layer + ns, ls, n, ns, ls + opts.activation[iLayer] + ns), i==0)
        if iLayer == 0:
            if opts.neurons[iLayer] != nInputs > 1:
                msg = "The number of neurons must equal the number of features (columns) in your data. Some NN configuration add one additional node for a bias term"
                Print(msg, True)
            # Only first layer demands input_dim. For the rest it is implied.
            model.add( Dense(opts.neurons[iLayer], input_dim = nInputs) ) #, weights = [np.zeros([692, 50]), np.zeros(50)] OR bias_initializer='zeros',  or bias_initializer=initializers.Constant(0.1)
            model.add(Activation(opts.activation[iLayer]))
            #model.add( Dense(opts.neurons[iLayer], input_dim = nInputs, activation = opts.activation[iLayer]) ) # her majerty Soti requested to break this into 2 lines
        else:
            if 0: #opts.neurons[iLayer] < nInputs:
                msg = "The number of  neurons (=%d) is less than the number of input variables (=%d). Please set the number of neurons to be at least the number of inputs!" % (opts.neurons[iLayer], nInputs)
                raise Exception(es + msg + ns)  
            model.add( Dense(opts.neurons[iLayer]))
            model.add(Activation(opts.activation[iLayer]))
            #model.add( Dense(opts.neurons[iLayer], activation = opts.activation[iLayer]) ) 

    # Print a summary representation of your model. 
    Print("Printing model summary:", True)
    model.summary()

    # Get a dictionary containing the configuration of the model. The model can be reinstantiated from its config via: model = Model.from_config(config)
    if 0:
        config = model.get_config()
    
    # Split data into input (X) and output (Y). (Note: dataset includes both signal and background sequentially)
    # Use only 2*nsignal rows => First nSignal rows is the signal. Another (equal) nSignal rows for the bkg. 
    # Therefore signal and bkg have exactly the same number
    X = dset_all[:2*nsignal,0:nInputs] # rows: 0 -> 2*signal, columns: 0 -> 19
    Y = dset_all[:2*nsignal,nInputs:]  # rows: 0 -> 2*signal, columns: 19 (isSignal = 0 or 1)
    X_signal     = dset_signal[:nsignal, 0:nInputs]
    X_background = dset_bkg[:nsignal, 0:nInputs]
    Print("Signal dataset has %s%d%s rows. Background dataset has %s%d%s rows" % (ss, len(X_signal), ns, es, len(X_background), ns), True)

    # Split the datasets (X= 19 inputs, Y=output variable). Test size 0.5 means half for training half for testing
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=opts.rndSeed, shuffle=True)
    opts.testSample  = len(X_test)
    opts.trainSample = len(X_train)

    # Early stop? Stop training when a monitored quantity has stopped improving.
    # Show patience of "50" epochs with a change in the loss function smaller than "min_delta" before stopping procedure
    # https://stackoverflow.com/questions/43906048/keras-early-stopping
    earlystop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0, patience=10, verbose=1, mode='auto')
    # earlystop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=50)
    callbacks = [earlystop]

    # [Loss function is used to understand how well the network is working (compare predicted label with actual label via some function)]
    # Optimizer function is related to a function used to optimise the weights
    Print("Compiling the model with the loss function %s and optimizer %s " % (ls + opts.lossFunction + ns, ts + opts.optimizer + ns), True)
    model.compile(loss=opts.lossFunction, optimizer=opts.optimizer, metrics=['accuracy'])
    #model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    
    # Customise the optimiser settings?
    if 0: #opts.optimizer == "adam":  # does not work
        # Default parameters follow those provided in the original paper.
        # learning_rate: float >= 0. Learning rate.
        # beta_1: float, 0 < beta < 1. Generally close to 1.
        # beta_2: float, 0 < beta < 1. Generally close to 1.
        # amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm from the paper "On the Convergence of Adam and Beyond".
        keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)

    # batch size equal to the training batch size (See https://machinelearningmastery.com/use-different-batch-sizes-training-predicting-python-keras/)
    if opts.batchSize == None:
        opts.batchSize = len(X_train)/2

    # Fit the model with our data
    # (An "epoch" is an arbitrary cutoff, generally defined as "one iteration of training on the whole dataset", 
    # used to separate training into distinct phases, which is useful for logging and periodic evaluation.)
    try:
        hist = model.fit(X_train,
                         Y_train,
                         validation_data=(X_test, Y_test),
                         epochs     = opts.epochs,    # a full pass over all of your training data
                         batch_size = opts.batchSize, # a set of N samples (https://stats.stackexchange.com/questions/153531/what-is-batch-size-in-neural-network)
                         shuffle    = False,
                         verbose    = 1, # 0=silent, 1=progress, 2=mention the number of epoch
                         callbacks  = callbacks
                     )
    except KeyboardInterrupt: #(KeyboardInterrupt, SystemExit):
        msg = "Manually interrupted the training (keyboard interrupt)!"
        Print(es + msg + ns, True)

    # Write the model
    modelName = "Model_%s_trained.h5" % (opts.rootFileName.replace(".root",""))
    model.save(modelName)
        
    # serialize model to JSON (contains arcitecture of model)
    model_json = model.to_json()
    with open("model_architecture.json", "w") as json_file:
        json_file.write(model_json)
        
    # serialize weights to HDF5
    model.save_weights('model_weights.h5', overwrite=True)
    model.save(modelName)
        
    # write weights and architecture in txt file
    func.WriteModel(model, model_json, inputList, os.path.join(opts.saveDir, "model.txt") )

    # Produce method score (i.e. predict output value for given input dataset). Computation is done in batches.
    # https://stackoverflow.com/questions/49288199/batch-size-in-model-fit-and-model-predict
    Print("Generating output predictions for the input samples", True) # (e.g. Numpy array)
    pred_train  = model.predict(X_train     , batch_size=None, verbose=1, steps=None)
    pred_test   = model.predict(X_test      , batch_size=None, verbose=1, steps=None)
    pred_signal = model.predict(X_signal    , batch_size=None, verbose=1, steps=None)
    pred_bkg    = model.predict(X_background, batch_size=None, verbose=1, steps=None)    
    # pred_train = model.predict(x, batch_size=None, verbose=0, steps=None, callbacks=None, max_queue_size=10, workers=1, use_multiprocessing=False) # Keras version 2.2.5 or later (https://keras.io/models/model/)

    # Join a sequence of arrays (X and Y) along an existing axis (1). In other words, add the ouput variable (Y) to the input variables (X)
    XY_train = numpy.concatenate((X_train, Y_train), axis=1)
    XY_test  = numpy.concatenate((X_test , Y_test ), axis=1)

    # Pick events with output = 1
    Print("Select events/samples which have an output variable Y (last column) equal to 1 (i.e. prediction is combatible with signal)", True)
    x_train_S = XY_train[XY_train[:,nInputs] == 1]; x_train_S = x_train_S[:,0:nInputs]
    x_test_S  = XY_test[XY_test[:,nInputs] == 1];   x_test_S  = x_test_S[:,0:nInputs]

    Print("Select events/samples which have an output variable Y (last column) equal to 0 (i.e. prediction is NOT combatible with signal)", False)
    x_train_B = XY_train[XY_train[:,nInputs] == 0]; x_train_B = x_train_B[:,0:nInputs]
    x_test_B  = XY_test[XY_test[:,nInputs] == 0];   x_test_B  = x_test_B[:,0:nInputs]
    
    # Produce method score for signal (training and test) and background (training and test)
    pred_train_S =  model.predict(x_train_S, batch_size=None, verbose=1, steps=None)
    pred_train_B =  model.predict(x_train_B, batch_size=None, verbose=1, steps=None)
    pred_test_S  =  model.predict(x_test_S , batch_size=None, verbose=1, steps=None)
    pred_test_B  =  model.predict(x_test_B , batch_size=None, verbose=1, steps=None)

    # Inform user of early stop
    stopEpoch = earlystop.stopped_epoch
    if stopEpoch != 0 and stopEpoch < opts.epochs:
        msg = "Early stop occured after %d epochs!" % (stopEpoch)
        opts.epochs = stopEpoch
        Print(cs + msg + ns, True)

    # Create json files
    writeCfgFile(opts)
    writeGitFile(opts)
    jsonWr  = JsonWriter(saveDir=opts.saveDir, verbose=opts.verbose)

    # Plot selected output and save to JSON file for future use
    func.PlotAndWriteJSON(pred_signal , pred_bkg    , opts.saveDir, "Output"     , jsonWr, opts.saveFormats)
    func.PlotAndWriteJSON(pred_train  , pred_test   , opts.saveDir, "OutputPred" , jsonWr, opts.saveFormats)
    func.PlotAndWriteJSON(pred_train_S, pred_train_B, opts.saveDir, "OutputTrain", jsonWr, opts.saveFormats)
    func.PlotAndWriteJSON(pred_test_S , pred_test_B , opts.saveDir, "OutputTest" , jsonWr, opts.saveFormats)

    # Plot overtraining test
    htrain_s, htest_s, htrain_b, htest_b = func.PlotOvertrainingTest(pred_train_S, pred_test_S, pred_train_B, pred_test_B, opts.saveDir, "OvertrainingTest", opts.saveFormats)

    # Plot summary plot (efficiency & singificance)
    func.PlotEfficiency(htest_s, htest_b, opts.saveDir, "Summary", opts.saveFormats)

    # Write efficiencies (signal and bkg)
    xVals_S, xErrs_S, effVals_S, effErrs_S  = func.GetEfficiency(htest_s)
    xVals_B, xErrs_B, effVals_B, effErrs_B  = func.GetEfficiency(htest_b)
    func.PlotTGraph(xVals_S, xErrs_S, effVals_S, effErrs_S, opts.saveDir, "EfficiencySig", jsonWr, opts.saveFormats)
    func.PlotTGraph(xVals_B, xErrs_B, effVals_B, effErrs_B, opts.saveDir, "EfficiencyBkg", jsonWr, opts.saveFormats)

    xVals, xErrs, sig_def, sig_alt = func.GetSignificance(htest_s, htest_b)
    func.PlotTGraph(xVals, xErrs, sig_def, effErrs_B, opts.saveDir, "SignificanceA", jsonWr, opts.saveFormats)
    func.PlotTGraph(xVals, xErrs, sig_alt, effErrs_B, opts.saveDir, "SignificanceB", jsonWr, opts.saveFormats)

    # Plot ROC curve
    gSig = func.GetROC(htest_s, htest_b)
    if 0:
        gBkg = func.GetROC(htest_b, htest_s)
        gDict = {"graph" : [gSig, gBkg], "name" : ["signal", "bkg"]}
    else:
        gDict = {"graph" : [gSig], "name" : [os.path.basename(opts.saveDir)]}
    style.setLogY(True)
    func.PlotROC(gDict, opts.saveDir, "ROC", opts.saveFormats)

    # Write the  resultsJSON file!
    jsonWr.write(opts.resultsJSON)
    
    # Print total time elapsed
    days, hours, mins, secs = GetTime(tStart)
    Print("Total elapsed time is %s days, %s hours, %s mins, %s secs" % (days[0], hours[0], mins[0], secs), True)
    
    return 
예제 #4
0
x_train_B = XY_train[XY_train[:,nInputs] == 0]; x_train_B = x_train_B[:,0:nInputs]
x_test_S  = XY_test[XY_test[:,nInputs] == 1];   x_test_S  = x_test_S[:,0:nInputs]
x_test_B  = XY_test[XY_test[:,nInputs] == 0];   x_test_B  = x_test_B[:,0:nInputs]

# Calculate the output of the model for the four datasets (training signal, training bkg, testing signal, testing bkg)
pred_train_S =  model.predict(x_train_S)
pred_train_B =  model.predict(x_train_B)
pred_test_S  =  model.predict(x_test_S)
pred_test_B  =  model.predict(x_test_B)

# Output directory
dirName = plot.getDirName("TopTag")

# X_signal     = dset_signal[:nsignal, 0:nInputs]
# X_background = dset_background[:nsignal, 0:nInputs]

# func.PlotOutput(pred_signal, pred_background, dirName, "Output_SB", 1, ["pdf"])
# func.PlotOutput(pred_train, pred_test, dirName, "Output_pred", 0, ["pdf"])
# func.PlotOutput(pred_train_S, pred_train_B, dirName, "Output_SB_train", 1, ["pdf"])
# func.PlotOutput(pred_test_S, pred_test_B, dirName, "Output_SB_test", 1, ["pdf"])

# Plot the output of the four datasets
htrain_s, htest_s, htrain_b, htest_b = func.PlotOvertrainingTest(pred_train_S, pred_test_S, pred_train_B, pred_test_B, dirName, "OvertrainingTest", ["pdf"])
# Calculate efficiency
func.PlotEfficiency(htest_s, htest_b, dirName, "Efficiency.pdf", ["pdf"])

# Plot ROC curve
graph1 = func.GetROC(htest_s, htest_b)
graph_roc = {"graph" : [graph1], "name" : ["graph 1"]}
func.PlotROC(graph_roc, dirName, "ROC", ["pdf"])