def mainXGB(options):
    """Train an xgboost binary classifier on the ttbar training tuples.

    Loads the configured variable list, imports the training data for all
    four pt divisions, trains a gradient-boosted model, and saves it to
    options.directory + "/TrainingModel.xgb".
    """
    import xgboost as xgb
    from taggerOptions import StandardVariables, getJetVarNames

    print("PROCESSING TRAINING DATA")

    # Get variables for this training configuration.
    globalVars, jetVars = StandardVariables(options.variables)
    allVars = globalVars + getJetVarNames(jetVars)

    # Import data.
    # BUG FIX: the glob patterns previously matched only the bare directory
    # ("dataFilePath + '/'") four times, so no training files were ever
    # collected; restore the per-division file patterns used by the other
    # training entry points in this file.
    dg = DataGetter(allVars)
    dataFiles = []
    for division in ("0", "20", "40", "60"):
        dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_" + division + "_division_0_TTbarSingleLepT*_training_[01234].h5")
    trainData = dg.importData(samplesToRun=tuple(dataFiles), prescale=True, ptReweight=options.ptReweight)

    print("TRAINING XGB")

    # Pack the data into xgboost's DMatrix; column 0 of the labels array is
    # the signal flag.  (Per-event weights are intentionally unused here.)
    xgData = xgb.DMatrix(trainData["data"], label=trainData["labels"][:, 0])

    param = {'max_depth': 6,
             'eta': 0.03,
             'objective': 'binary:logistic',
             'eval_metric': ['error', 'auc', 'logloss'],
             'nthread': 28}
    gbm = xgb.train(param, xgData, num_boost_round=2000)

    # Dump output from training.
    gbm.save_model(options.directory + "/" + 'TrainingModel.xgb')
def mainXGB(options):
    """Fit an xgboost binary classifier on the ttbar training tuples and save it."""
    import xgboost as xgb
    from taggerOptions import StandardVariables, getJetVarNames

    print("PROCESSING TRAINING DATA")

    # Resolve the configured variable set into a flat list of column names.
    globalVars, jetVars = StandardVariables(options.variables)
    allVars = globalVars + getJetVarNames(jetVars)

    # Gather the training files for all four pt divisions.
    getter = DataGetter(allVars)
    patterns = (
        "/trainingTuple_TTbarSingleLepT*_0_division_0_TTbarSingleLepT*_training_[01234].h5",
        "/trainingTuple_TTbarSingleLepT*_20_division_0_TTbarSingleLepT*_training_[01234].h5",
        "/trainingTuple_TTbarSingleLepT*_40_division_0_TTbarSingleLepT*_training_[01234].h5",
        "/trainingTuple_TTbarSingleLepT*_60_division_0_TTbarSingleLepT*_training_[01234].h5",
    )
    sampleFiles = []
    for pattern in patterns:
        sampleFiles.extend(glob(options.dataFilePath + pattern))
    trainData = getter.importData(samplesToRun=tuple(sampleFiles), prescale=True, ptReweight=options.ptReweight)

    print("TRAINING XGB")

    # Column 0 of the labels array is the signal flag; per-event weights are
    # deliberately not passed to the DMatrix.
    dmatrix = xgb.DMatrix(trainData["data"], label=trainData["labels"][:, 0])

    trainParams = {
        'max_depth': 6,
        'eta': 0.03,
        'objective': 'binary:logistic',
        'eval_metric': ['error', 'auc', 'logloss'],
        'nthread': 28,
    }
    booster = xgb.train(trainParams, dmatrix, num_boost_round=2000)

    # Write the trained model to the output directory.
    booster.save_model(options.directory + "/" + 'TrainingModel.xgb')
def mainSKL(options):
    """Fit a random-forest tagger on the division-0 ttbar training tuple and pickle it."""
    from sklearn.ensemble import RandomForestClassifier
    import pickle

    print("PROCESSING TRAINING DATA")

    from taggerOptions import StandardVariables, getJetVarNames

    # Resolve the configured variable set into a flat list of column names.
    globalVars, jetVars = StandardVariables(options.variables)
    allVars = globalVars + getJetVarNames(jetVars)

    # Load the training sample.
    getter = DataGetter(allVars)
    samplePattern = options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_0_division_0_TTbarSingleLepT*_training_0.h5"
    trainData = getter.importData(samplesToRun=tuple(glob(samplePattern)), prescale=True, ptReweight=options.ptReweight)

    # Build the classifier.
    forest = RandomForestClassifier(n_estimators=500, max_depth=10, n_jobs=4, verbose=True)

    print("TRAINING RF")

    # Fit using the per-event weights from the tuple.
    forest = forest.fit(trainData["data"], trainData["labels"][:, 0], sample_weight=trainData["weights"][:, 0])

    # Persist the trained model.
    outFile = open(options.directory + "/" + "TrainingOutput.pkl", 'wb')
    pickle.dump(forest, outFile)
    outFile.close()
def mainSKL(options):
    """Train a random forest on the ttbar single-lepton training tuples.

    Imports all four pt-division samples, fits a 1000-tree forest using the
    per-event weights, and pickles the trained classifier to
    options.directory + "/TrainingOutput.pkl".
    """
    from sklearn.ensemble import RandomForestClassifier
    import pickle

    print("PROCESSING TRAINING DATA")

    from taggerOptions import StandardVariables, getJetVarNames

    # Get variables for this training configuration.
    globalVars, jetVars = StandardVariables(options.variables)
    allVars = globalVars + getJetVarNames(jetVars)
    print(allVars)

    # Import data: all four pt-division samples for lepton+jets ttbar.
    # (A large obsolete block of commented-out validation-sample setup was
    # removed here; see version control history if it is ever needed again.)
    dg = DataGetter(allVars)
    dataFiles = []
    dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_0_division_0_TTbarSingleLepT*_training_[01234].h5")
    dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_20_division_0_TTbarSingleLepT*_training_[01234].h5")
    dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_40_division_0_TTbarSingleLepT*_training_[01234].h5")
    dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_60_division_0_TTbarSingleLepT*_training_[01234].h5")
    print(dataFiles)
    trainData = dg.importData(samplesToRun=tuple(dataFiles), prescale=True, ptReweight=options.ptReweight)

    # Create random forest.
    clf = RandomForestClassifier(n_estimators=1000, max_depth=10, n_jobs=28, verbose=True)

    print("TRAINING RF")

    # Train random forest using the per-event weights.
    clf = clf.fit(trainData["data"], trainData["labels"][:, 0], sample_weight=trainData["weights"][:, 0])

    # Dump output from training.  "with" guarantees the file handle closes
    # even if pickling raises (the original leaked it on error and bound
    # pickle.dump's None return to an unused name).
    with open(options.directory + "/" + "TrainingOutput.pkl", 'wb') as fileObject:
        pickle.dump(clf, fileObject)
def mainSKL(options):
    """Train an sklearn MLP classifier on RPV-stop signal vs ttbar background.

    Signal and background samples are imported with dedicated getters,
    truncated to equal length so the classes are balanced, stacked, shuffled,
    and used to fit an MLP.  The fitted classifier is pickled to
    options.directory + "/TrainingOutput.pkl".
    """
    from sklearn.neural_network import MLPClassifier
    import pickle

    print("PROCESSING TRAINING DATA")

    from taggerOptions import StandardVariables, getJetVarNames

    # Get variables for this training configuration.
    globalVars, jetVars = StandardVariables(options.variables)
    allVars = globalVars + getJetVarNames(jetVars)
    print(allVars)

    # Import signal and background data with dedicated getters.
    dgSig = DataGetter.DefinedVariables(allVars, signal=True, background=False)
    dgBg = DataGetter.DefinedVariables(allVars, signal=False, background=True)

    # BUG FIX: the original referenced an undefined name "dataPath"
    # (NameError at runtime); use options.dataFilePath like every other
    # training entry point in this file.
    dataFiles = []
    dataFiles += glob(options.dataFilePath + "/trainingTuple_*_division_*_rpv_stop_*_training_0.h5")
    dataFiles2 = glob(options.dataFilePath + "/trainingTuple_*_division_0_TT_training_0.h5")
    dataSig = dgSig.importData(samplesToRun=tuple(dataFiles), prescale=True, ptReweight=False)
    dataBg = dgBg.importData(samplesToRun=tuple(dataFiles2), prescale=True, ptReweight=False)

    # Balance the classes: truncate both samples to the smaller length and
    # stack them, then shuffle so ordering does not bias training.
    minLen = min(len(dataSig["data"]), len(dataBg["data"]))
    trainData = {}
    for data in (dataSig, dataBg):
        for key in data:
            if key in trainData:
                trainData[key] = numpy.vstack([trainData[key], data[key][:minLen]])
            else:
                trainData[key] = data[key][:minLen]

    perms = numpy.random.permutation(trainData["data"].shape[0])
    for key in trainData:
        trainData[key] = trainData[key][perms]

    # BUG FIX: hidden_layer_sizes=(20) was just the int 20 (no trailing
    # comma); the intended value is a one-element tuple -> one hidden layer
    # of 20 units.
    clf = MLPClassifier(hidden_layer_sizes=(20,))

    print("TRAINING RF")

    # Train the classifier (per-event weights intentionally unused here).
    clf = clf.fit(trainData["data"], trainData["labels"][:, 0])

    # Dump output from training; "with" closes the handle even on error.
    with open(options.directory + "/" + "TrainingOutput.pkl", 'wb') as fileObject:
        pickle.dump(clf, fileObject)
def mainSKL(options):
    """Train a weighted random forest on the ttbar single-lepton tuples and pickle it.

    All four pt-division samples are globbed from options.dataFilePath and
    imported; the fitted forest is written to
    options.directory + "/TrainingOutput.pkl".
    """
    from sklearn.ensemble import RandomForestClassifier
    import pickle

    print("PROCESSING TRAINING DATA")

    from taggerOptions import StandardVariables, getJetVarNames

    # Get variables for this training configuration.
    globalVars, jetVars = StandardVariables(options.variables)
    allVars = globalVars + getJetVarNames(jetVars)
    print(allVars)

    # Import data for all four pt divisions.  (Removed a large stale block
    # of commented-out validation-sample code that referenced hard-coded
    # /cms/data paths; it remains in version control if ever needed.)
    dg = DataGetter(allVars)
    dataFiles = []
    dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_0_division_0_TTbarSingleLepT*_training_[01234].h5")
    dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_20_division_0_TTbarSingleLepT*_training_[01234].h5")
    dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_40_division_0_TTbarSingleLepT*_training_[01234].h5")
    dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_60_division_0_TTbarSingleLepT*_training_[01234].h5")
    print(dataFiles)
    trainData = dg.importData(samplesToRun=tuple(dataFiles), prescale=True, ptReweight=options.ptReweight)

    # Create random forest.
    clf = RandomForestClassifier(n_estimators=1000, max_depth=10, n_jobs=28, verbose=True)

    print("TRAINING RF")

    # Train random forest with the per-event weights.
    clf = clf.fit(trainData["data"], trainData["labels"][:, 0], sample_weight=trainData["weights"][:, 0])

    # Dump output from training.  A context manager closes the handle even
    # if pickling raises; the original leaked it on error and uselessly
    # bound pickle.dump's None return value.
    with open(options.directory + "/" + "TrainingOutput.pkl", 'wb') as fileObject:
        pickle.dump(clf, fileObject)