def simple_test_xgboost():
    """Train a small binary XGBoost classifier and compare TMVA vs. native output.

    Uses the first 1000 rows and first 5 columns of the module-level ``data``
    (defined outside this function), labels rows by ``data_y > 5``, fits a
    10-tree ``XGBClassifier``, exports it to ``test.xml`` through
    ``BDTxgboost`` and evaluates every row with both ``eval_tmva`` and
    ``eval``.

    Returns:
        The sum over all rows of ``|(predA1 - predB1) / predA1|``, where
        ``predA1`` is the TMVA evaluation and ``predB1`` the first element of
        the ``BDTxgboost.eval`` result.
    """
    data_x, data_y = data[0][:1000, :5], data[1][:1000]
    data_y_binary = (data_y > 5).astype(np.int32)

    print("Binary classification")
    print("training model")
    model = xgboost.XGBClassifier(n_estimators=10)
    model.fit(data_x, data_y_binary)
    # NOTE(review): newer xgboost releases expose this accessor as
    # get_booster() -- confirm against the xgboost version in use.
    for tree in model.booster().get_dump():
        print(tree)

    features = ["f{0}".format(i) for i in range(data_x.shape[1])]
    target_names = [
        "cls{0}".format(i) for i in range(len(np.unique(data_y_binary)))
    ]

    # Write the TMVA weight file, then load it back for evaluation.
    bdt = BDTxgboost(model, features, target_names)
    bdt.to_tmva("test.xml")
    bdt.setup_tmva("test.xml")

    d1 = 0.0
    for irow in range(data_x.shape[0]):
        predA1 = bdt.eval_tmva(data_x[irow, :])
        predB1 = bdt.eval(data_x[irow, :])[0]
        # Compare the magnitude of the difference against the threshold.
        # The previous form, np.abs(predA1 - predB1 > 0.1), took the absolute
        # value of a boolean and so never flagged negative deviations.
        if np.abs(predA1 - predB1) > 0.1:
            print(
                "large deviance for row",
                irow,
                predA1,
                predB1,
                [data_x[irow, i] for i in range(5)],
            )
        d1 += np.abs((predA1 - predB1) / predA1)
    return d1
def test_classify_binary(self):
    """Check that the TMVA export of a binary XGBoost model matches ``eval``.

    Fits a 10-tree ``XGBClassifier`` on ``self.data_x`` /
    ``self.data_y_binary``, writes it to ``xgb_binary.xml`` via
    ``BDTxgboost`` and requires, for every row, a relative deviation between
    ``eval_tmva`` and ``eval`` below 0.05, and a summed deviation below 0.01.
    """
    print("TestBDTxgboost test_classify_binary")

    classifier = xgboost.XGBClassifier(n_estimators=10)
    classifier.fit(self.data_x, self.data_y_binary)

    xml_path = "xgb_binary.xml"
    converter = BDTxgboost(classifier, self.features, ["cls0", "cls1"])
    converter.to_tmva(xml_path)
    converter.setup_tmva(xml_path)

    n_rows = self.data_x.shape[0]
    total_dev = 0.0
    for row_index in range(n_rows):
        row = self.data_x[row_index, :]
        tmva_pred = converter.eval_tmva(row)
        native_pred = converter.eval(row)
        # Relative deviation, normalised to the TMVA prediction.
        row_dev = np.abs((tmva_pred - native_pred) / tmva_pred)
        self.assertTrue(row_dev < 0.05)
        total_dev += row_dev

    self.assertTrue(total_dev < 0.01)
def simple_test_xgboost():
    """Train a small binary XGBoost classifier and compare TMVA vs. native output.

    Uses the first 1000 rows and first 5 columns of the module-level ``data``
    (defined outside this function), labels rows by ``data_y > 5``, fits a
    10-tree ``XGBClassifier``, exports it to ``test.xml`` through
    ``BDTxgboost`` and accumulates the relative deviation between
    ``eval_tmva`` and ``eval`` over all rows.
    """
    data_x, data_y = data[0][:1000, :5], data[1][:1000]
    data_y_binary = (data_y > 5).astype(np.int32)

    # Parenthesised single-argument print works on both Python 2 and 3;
    # the bare print statement is a SyntaxError on Python 3.
    print("Binary classification")
    print("training model")
    model = xgboost.XGBClassifier(n_estimators=10)
    model.fit(data_x, data_y_binary)

    features = ["f{0}".format(i) for i in range(data_x.shape[1])]
    target_names = [
        "cls{0}".format(i) for i in range(len(np.unique(data_y_binary)))
    ]

    # Write the TMVA weight file, then load it back for evaluation.
    bdt = BDTxgboost(model, features, target_names)
    bdt.to_tmva("test.xml")
    bdt.setup_tmva("test.xml")

    # NOTE(review): d1 is accumulated but not returned or printed in the
    # visible code -- confirm whether the caller is expected to receive it.
    d1 = 0.0
    for irow in range(data_x.shape[0]):
        predA1 = bdt.eval_tmva(data_x[irow, :])
        predB1 = bdt.eval(data_x[irow, :])
        d1 += np.abs((predA1 - predB1) / predA1)
# Convert a trained egID xgboost model to a TMVA xml weight file.
# `opt`, `modelAlgo`, `os` and `xg` are defined outside this span.
geometry = opt.geometry
eta_region = opt.etaRegion
bdt_name = "%s_vs_%s_%s" % (opt.signal, opt.background, opt.bdtConfig)

# set up global variables
modelDir = os.environ['CMSSW_BASE'] + "/src/L1Trigger/egid_analysis/HGCal_L1T_egammaID/output/models/%s" % geometry

# define variables used in model
egID_var_dict = {
    'electron_vs_neutrino_baseline': [
        'cl3d_coreshowerlength',
        'cl3d_firstlayer',
        'cl3d_maxlayer',
        'cl3d_srrmean',
    ],
    'electron_vs_neutrino_full': [
        'cl3d_coreshowerlength',
        'cl3d_showerlength',
        'cl3d_firstlayer',
        'cl3d_maxlayer',
        'cl3d_szz',
        'cl3d_srrmean',
        'cl3d_srrtot',
        'cl3d_seetot',
        'cl3d_spptot',
    ],
}
# Raises KeyError if the signal/background/config combination is unknown.
egID_vars = egID_var_dict[bdt_name]

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Load models
egID_model = xg.Booster()
# The input model and the output xml share one path stem, so build it once.
# Only the 'low' and 'high' eta regions carry a "_<region>eta" suffix.
if eta_region in ['low', 'high']:
    _model_stem = "%s/egID_%s_%s_%seta" % (modelDir, modelAlgo, bdt_name, eta_region)
else:
    _model_stem = "%s/egID_%s_%s" % (modelDir, modelAlgo, bdt_name)
modelStr = _model_stem + ".model"
egID_model.load_model(modelStr)
# Parenthesised form prints identically on Python 2 and is valid on Python 3.
print(" --> Loaded model: %s" % modelStr)

# Define name of xml file to save
f_xml = _model_stem + ".xml"

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Convert to xml
from mlglue.tree import tree_to_tmva, BDTxgboost, BDTsklearn
target_names = ['background', 'signal']
# NOTE(review): max_depth and learning_rate are passed explicitly to the
# converter; presumably they must match the values used in training -- confirm.
bdt = BDTxgboost(egID_model, egID_vars, target_names, kind='binary', max_depth=6, learning_rate=0.3)
bdt.to_tmva(f_xml)
print(" --> Converted to xml")
# Convert the trained altDiphoModel xgboost booster to a TMVA xml weight file.
# `xg`, `path`, `system` and `diphoVars` are defined outside this span.

# load models
altDiphoModel = xg.Booster()
# Alternative inputs; exactly one modelName assignment should be active.
#modelName = '/vols/cms/es811/Stage1categorisation/2016/models/altDiphoModel.model'
#modelName = '/vols/cms/es811/Stage1categorisation/2017/models/altDiphoModel.model'
#modelName = '/vols/cms/es811/Stage1categorisation/Pass1/2016/models/altDiphoModel.model'
modelName = '/vols/cms/es811/Stage1categorisation/Pass1/2017/models/altDiphoModel.model'
_modelFile = modelName.split('/')[-1]
xmlName = _modelFile.replace('.model', '.xml')
altDiphoModel.load_model(modelName)
# Parenthesised form prints identically on Python 2 and is valid on Python 3.
print('Loaded model called %s' % _modelFile)

# The output directory is chosen by the year found in the model path.
weightDir = 'WeightFiles/'
if '2016' in modelName:
    weightDir += '2016'
elif '2017' in modelName:
    weightDir += '2017'
else:
    # Same effect as exit(msg), without relying on the site-provided helper.
    raise SystemExit('expected year 2016 or 2017 in path')
if not path.isdir(weightDir):
    system('mkdir -p %s' % weightDir)

# convert!
from mlglue.tree import tree_to_tmva, BDTxgboost, BDTsklearn
target_names = ['bkg', 'sig']
# NOTE(review): max_depth and learning_rate are passed explicitly to the
# converter; presumably they must match the values used in training -- confirm.
bdt = BDTxgboost(altDiphoModel, diphoVars, target_names, kind='binary', max_depth=6, learning_rate=0.3)
bdt.to_tmva('%s/%s' % (weightDir, xmlName))
print('Created xml called %s' % xmlName)
verbose=TrainVerbose) if len(clstst) > 0: clstst[ind].fit(eval_sets[ind][0][0], eval_sets[ind][0][1], eval_sets[ind][0][2], eval_set=eval_sets[ind], early_stopping_rounds=50, eval_metric=["error"], verbose=TrainVerbose) printlog("Training step " + str(ind + 1) + " of " + str(len(clses)) + " done.") if saveTraining and ind == saveSetindex: printlog("Saving training with features: " + str(saveSetnames)) #features.append(["f{0}".format(i) for i in range(len(vset[ind]))]) bdt = BDTxgboost(clses[ind], saveSetf, target_names) bdt.to_tmva(savefilename + ".xml") bdt.setup_tmva(savefilename + ".xml") printlog("Exported training step " + str(ind + 1) + " of " + str(len(clses)) + " to \"" + savefilename + ".xml\"") def insertcMVAdefault(dd): for ind in range(len(clses)): cond = (dd["Jet_JP"] == 0.) & (dd["Jet_JBP"] == 0.) for feat in [ "Jet_CSV", "Jet_CSVIVF", "Jet_DeepCSVBDisc", "Jet_SoftMu", "Jet_SoftEl", "Jet_SoftMuasEta", "Jet_SoftMuSuppressed", "Jet_SoftMuSupPTEta", "Jet_SoftElasEta" ]: if feat in vset[ind]: