# Module-level imports used below. The analysis helpers (Reader, trainScaler,
# applyScaler, sandbox, Collector, Plotter) are defined elsewhere in this
# repository and are assumed to be imported at the top of the file.
import os
import cPickle
from glob import glob


def run(samples, channel, era, use, train, short, datacard=False, add_nominal=False):
    # Import the requested model backend.
    if use == "xgb":
        from XGBModel import XGBObject as modelObject
        parameters = "conf/parameters_xgb.json"
    if use == "keras":
        from KerasModel import KerasObject as modelObject
        parameters = "conf/parameters_keras.json"

    # Create an instance of Reader.
    read = Reader(channel=channel, config_file=samples, folds=2, era=era)
    target_names = read.config["target_names"]
    variables = read.config["variables"]

    models_folder = era + "/models"
    if not os.path.exists(models_folder):
        os.makedirs(models_folder)
    modelname = "{0}/{1}.{2}".format(models_folder, channel, use)

    scaler = None
    if train:
        print "Training new model"
        print "Loading training set"
        # Load the samples for training. This returns the input files merged
        # (ggH+qqH+...), randomized and split into two folds.
        trainSet = read.getSamplesForTraining()

        # Fit a StandardScaler from sklearn on the training set, pickle it
        # for later use and finally apply it to the training set.
        print "Fit Scaler to training set...",
        scaler = trainScaler(trainSet, variables)
        print " done. Dumping for later."
        with open("{0}/StandardScaler.{1}.pkl".format(models_folder, channel), "wb") as FSO:
            cPickle.dump(scaler, FSO, 2)
        scaler = [scaler, scaler]  # Hotfix, since KIT uses 2 scalers (one per fold).
        trainSet = applyScaler(scaler, trainSet, variables)

        # Load the model and start the training.
        model = modelObject(parameter_file=parameters, variables=variables,
                            target_names=target_names)
        model.train(trainSet)  # done in KerasModel.py
        model.save(modelname)

    # Prediction starts here: load the scaler and the trained model.
    elif not datacard:
        # TODO: Maybe not needed to check. Just load what is there.
        if os.path.exists("{0}/StandardScaler.{1}.pkl".format(models_folder, channel)):
            print "Loading Scaler"
            scaler = []
            if glob("{0}/{1}_*_keras_preprocessing.pickle".format(models_folder, channel)):
                # Per-fold preprocessing pickles written by the Keras training.
                with open("{0}/{1}_fold0_keras_preprocessing.pickle".format(models_folder, channel), "rb") as FSO:
                    scaler.append(cPickle.load(FSO))
                with open("{0}/{1}_fold1_keras_preprocessing.pickle".format(models_folder, channel), "rb") as FSO:
                    scaler.append(cPickle.load(FSO))
            else:
                # Fall back to the single StandardScaler, used for both folds.
                with open("{0}/StandardScaler.{1}.pkl".format(models_folder, channel), "rb") as FSO:
                    tmp = cPickle.load(FSO)
                scaler = [tmp, tmp]

        print "Loading model and predicting."
        model = modelObject(filename=modelname)
        read.variables = model.variables
        variables = model.variables

    if not datacard:
        outpath = read.config["outpath"] + "/predictions_" + era
        predictions = {}
        print "Predicting samples"

        # Only used by DESY.
        if add_nominal:
            print "Predicting Nominal"
            for sample, sampleConfig in read.get(what="nominal", for_prediction=True):
                sandbox(channel, model, scaler, sample, variables,
                        "nom_" + sampleConfig["histname"], outpath,
                        sampleConfig, read.modifyDF)

        # Iterate over the nominal samples.
        for sample, sampleConfig in read.get(what="full", add_jec=not short, for_prediction=True):
            if "data" in sampleConfig["histname"]:
                sandbox(channel, model, scaler, sample, variables,
                        "NOMINAL_ntuple_Data", outpath,
                        sampleConfig, read.modifyDF)
            elif "full" in sampleConfig["histname"]:
                sandbox(channel, model, scaler, sample, variables,
                        "NOMINAL_ntuple_" + sampleConfig["histname"].split("_")[0],
                        outpath, sampleConfig, read.modifyDF)
            else:
                splName = sampleConfig["histname"].split("_")
                sandbox(channel, model, scaler, sample, variables,
                        "_".join(splName[1:]) + "_ntuple_" + splName[0],
                        outpath, sampleConfig, read.modifyDF)

        # Shape files.
        if not short:
            print "Predicting shapes"
            for sample, sampleConfig in read.get(what="tes", for_prediction=True):
                sandbox(channel, model, scaler, sample, variables,
                        sampleConfig["histname"], outpath,
                        sampleConfig, read.modifyDF)

    if "hephy.at" in os.environ["HOME"]:
        from Tools.Datacard.produce import Datacard, makePlot
        from Tools.CutObject.CutObject import Cut
        from Tools.FakeFactor.FakeFactor import FakeFactor

        Datacard.use_config = era + "/datacard_conf"
        D = Datacard(channel=channel,
                     variable="predicted_prob",
                     era=era,
                     real_est="mc",
                     add_systematics=not short,
                     debug=True,
                     use_cutfile="conf/cuts_{0}.json".format(era))
        FakeFactor.fractions = "{0}/datacard_conf/fractions/htt_ff_fractions_{0}.root".format(era)
        D.create(era + "/" + use)
        makePlot(channel, "ML", era + "/" + use, era, era + "/plots")
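
# -----------------------------------------------------------------------------
# trainScaler() and applyScaler() are called above but defined elsewhere in the
# repository. The sketch below only illustrates the assumed behaviour -- an
# sklearn StandardScaler fitted on the input variables of per-fold pandas
# DataFrames -- and is not the actual implementation; the names
# _trainScalerSketch/_applyScalerSketch are hypothetical.
import pandas as pd
from sklearn.preprocessing import StandardScaler


def _trainScalerSketch(trainSet, variables):
    # Fit a single scaler on the training variables of all folds merged; the
    # calling code above then duplicates it into a two-entry list
    # ("hotfix since KIT uses 2 scalers").
    merged = pd.concat([fold[variables] for fold in trainSet])
    scaler = StandardScaler()
    scaler.fit(merged)
    return scaler


def _applyScalerSketch(scalers, dataset, variables):
    # Transform the input variables of each fold with the scaler assigned to
    # that fold and return the modified folds.
    for fold, scaler in zip(dataset, scalers):
        fold[variables] = scaler.transform(fold[variables])
    return dataset
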
def run(samples, channel, use, train, short, preprocess_chain=[]):
    # Import the requested model backend.
    if use == "xgb":
        from XGBModel import XGBObject as modelObject
        parameters = "conf/parameters_xgb.json"
    if use == "keras":
        from KerasModel import KerasObject as modelObject
        parameters = "conf/parameters_keras.json"

    read = Reader(channel=channel, config_file=samples, folds=2)
    target_names = read.config["target_names"]
    variables = read.config["variables"]

    if not os.path.exists("models"):
        os.mkdir("models")
    modelname = "models/{0}.{1}".format(channel, use)

    scaler = None
    if train:
        print "Training new model"
        print "Loading training set"
        trainSet = read.getSamplesForTraining()

        # Fit the scaler on the training set, pickle it for later use and
        # apply it before training.
        print "Fit Scaler to training set...",
        scaler = trainScaler(trainSet, variables)
        print " done. Dumping for later."
        with open("models/StandardScaler.{0}.pkl".format(channel), "wb") as FSO:
            cPickle.dump(scaler, FSO, 2)
        trainSet = applyScaler(scaler, trainSet, variables)

        model = modelObject(parameter_file=parameters, variables=variables,
                            target_names=target_names)
        model.train(trainSet)
        model.save(modelname)
    else:
        if os.path.exists("models/StandardScaler.{0}.pkl".format(channel)):
            print "Loading Scaler"
            with open("models/StandardScaler.{0}.pkl".format(channel), "rb") as FSO:
                scaler = cPickle.load(FSO)
        print "Loading model and predicting."
        model = modelObject(filename=modelname)

    # Collect the predictions of all sample groups and write them out as
    # datacard input.
    where = ""
    coll = Collector(channel=channel, var_name="pred_prob",
                     target_names=target_names, path=use,
                     recreate=True, rebin=False)

    print "Predicting simulation"
    for sample, sampleName in read.get(what="nominal"):
        pred = model.predict(applyScaler(scaler, sample, variables), where)
        coll.addPrediction(pred, sample, sampleName)

    print "Adding looser samples to predictions"
    for sample, sampleName in read.get(what="more"):
        pred = model.predict(applyScaler(scaler, sample, variables), where)
        coll.addPrediction(pred, sample, sampleName)

    print "Predicting data"
    for sample, sampleName in read.get(what="data"):
        pred = model.predict(applyScaler(scaler, sample, variables), where)
        coll.addPrediction(pred, sample, sampleName)

    if not short:
        print "Predicting TES shapes"
        for sample, sampleName in read.get(what="tes"):
            pred = model.predict(applyScaler(scaler, sample, variables), where)
            coll.addPrediction(pred, sample, sampleName)

        print "Predicting JES shapes"
        for sample, sampleName in read.get(what="jec"):
            pred = model.predict(applyScaler(scaler, sample, variables), where)
            coll.addPrediction(pred, sample, sampleName)

    coll.createDC(writeAll=True)

    plot = Plotter(channel=channel, naming=read.processes, path=use)
    plot.makePlots()
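
# -----------------------------------------------------------------------------
# Hypothetical usage sketch: the entry point below does not appear in this
# section, and the argument names and defaults are assumptions chosen to match
# the signature of the second run() variant above.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Train the classifier or predict with it")
    parser.add_argument("--config", default="conf/samples.json",
                        help="sample configuration file (assumed path)")
    parser.add_argument("--channel", default="mt",
                        help="analysis channel, e.g. mt/et/tt")
    parser.add_argument("--model", dest="use", default="keras",
                        choices=["keras", "xgb"], help="model backend")
    parser.add_argument("--train", action="store_true",
                        help="train a new model instead of loading one")
    parser.add_argument("--short", action="store_true",
                        help="skip the systematic-shape predictions")
    args = parser.parse_args()

    run(args.config, args.channel, args.use, args.train, args.short)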