#data["mva1"] = 0.0
#data["mva2"] = 0.0

# Hyper-parameters of the gradient-boosted classifier, collected in one place.
bdt_options = dict(
    max_depth=3,
    learning_rate=0.01,
    n_estimators=100,
    verbose=True,
    min_samples_leaf=10,
    min_samples_split=10,
)
cls = GradientBoostingClassifier(**bdt_options)

# Hold out 20% of the events (features, targets and weights are split
# consistently); the fixed seed keeps the split reproducible.
d_train, d_test, t_train, t_test, w_train, w_test = train_test_split(
    data_train,
    targets,
    data_weights,
    test_size=0.2,
    random_state=7,
)

#cls.fit(data_train, targets)
# Weighted fit on the training part only.
cls.fit(d_train, t_train, sample_weight=w_train)

# Write the trained classifier out as an XML file via sklearn_to_tmva.
sklearn_to_tmva.gbr_to_tmva(
    cls,
    data[trainVars()],
    "TMVABDT_3l_1tau_maxDepth3_8Var_frWt.xml",
    coef=2,
)

import ROOT
import array
from ROOT import TFile, TH1F, TGraph, TCanvas, TLegend

# Output ROOT file; "RECREATE" overwrites an existing file of the same name.
fout = TFile("3l_1tau_performance_maxDepth3_8Var_frWt.root", "RECREATE")

# Canvas cosmetics: fill colour, border, margins and a logarithmic y axis.
c1 = TCanvas()
c1.SetFillColor(10)
c1.SetBorderSize(2)
for set_margin, width in (
    (c1.SetLeftMargin, 0.12),
    (c1.SetBottomMargin, 0.12),
    (c1.SetRightMargin, 0.05),
):
    set_margin(width)
c1.SetLogy()

# Empty 100-bin histogram on [0, 1].
histogram_base = TH1F("histogram_base", "", 100, 0., 1.)
learning_rate=0.01, n_estimators=100, verbose=True, min_samples_leaf=10, min_samples_split=10) #d_train, d_test, t_train, t_test, w_train, w_test = train_test_split(data_train, targets, data_weights, test_size=0.2, random_state=7) d_train, d_test, t_train, t_test = train_test_split(data_train, targets, test_size=0.2, random_state=12345) #cls.fit(data_train, targets) #cls.fit(d_train, t_train, w_train) cls.fit(d_train, t_train) sklearn_to_tmva.gbr_to_tmva(cls, data[trainVars()], "TMVABDT_hadTopTagger_maxDepth3_9Var_ps75.xml", coef=2) import ROOT, array from ROOT import TFile, TH1F, TGraph, TCanvas, TLegend fout = TFile("hadTopTagger_performance_maxDepth3_9Var_ps75.root", "RECREATE") c1 = TCanvas() c1.SetFillColor(10) c1.SetBorderSize(2) c1.SetLeftMargin(0.12) c1.SetBottomMargin(0.12) c1.SetRightMargin(0.05) c1.SetLogy() histogram_base = TH1F("histogram_base", "", 100, 0., 1.)
xgboost2tmva.convert_model(model, trainVars(), "TMVABDT_2lss_1tau_XGB_wMEMallVars.xml") # xmllint --format TMVABDT_2lss_1tau_XGB_wMEMallVars.xml #skTMVA.convert_bdt_sklearn_tmva(cls, trainVars(), "TMVABDT_2lss_1tau_XGB_wMEMallVars.xml") #sklearn_to_tmva.xgbr_to_tmva(cls,evals_result,data[trainVars()],trainVars(),"TMVABDT_2lss_1tau_XGB_wMEMallVars.xml",coef=2) # run cross validation print("XGBoost trained") proba = cls.predict_proba(traindataset[trainVars()].values) fpr, tpr, thresholds = roc_curve(traindataset["target"], proba[:, 1]) train_auc = auc(fpr, tpr, reorder=True) print("XGBoost train set auc - {}".format(train_auc)) proba = cls.predict_proba(valdataset[trainVars()].values) fprt, tprt, thresholds = roc_curve(valdataset["target"], proba[:, 1]) test_auct = auc(fprt, tprt, reorder=True) print("XGBoost test set auc - {}".format(test_auct)) """ sklearn_to_tmva.gbr_to_tmva( cls, data[trainVars()], trainVars(), "TMVABDT_2lss_1tau_XGB_wMEMallVars.xml", coef=2 ) #""" ################################################## clc = catboost.CatBoostClassifier() clc.fit( traindataset[trainVars()].values, traindataset.target.astype(np.bool), #sample_weight= np.absolute((traindataset[weights].astype(np.float64))), #eval_set=[(traindataset[trainVars()].values, traindataset.target.astype(np.bool),traindataset[weights].astype(np.float64)),
# joblib.dump(clf, training_file,protocol = HIGHEST_PROTOCOL)

#######################################
## Converting to TMVA readable xml file
#######################################
import sklearn_to_tmva as convert

trainingWeights_TMVA = "TMVAClassification_BDTG.weights.xml"
log.info("Dumping training file in: " + trainingWeights_TMVA)
# *** Sklearn(python)-type training file (.pkl) ***
# joblib.dump(clf, trainingWeights_TMVA, compress=True)
# *** TMVA-style training file (.xml) ***
out_ext = (trainingWeights_TMVA).split(".")[-1]
convert.gbr_to_tmva(clf, X, trainingWeights_TMVA, mva_name="BDTG", coef=10, var_names=variables)

#################################
#                               #
#          Validation           #
#                               #
#################################
# input_files = [i.strip() for i in open('data_trees/inputs/ttjets.list')]
# Read the list of input files, one path per line. The file is opened in a
# `with` block so the handle is closed deterministically, and blank lines are
# skipped so a stray empty line in the .list file cannot yield an empty path.
with open("../data_trees/inputs/ttjets.list") as list_file:
    input_files = [
        entry for entry in (line.strip() for line in list_file) if entry
    ]

# Bin edges used for the validation.
pt_bins = [15, 40, 60, 90, 150, 400, 600]
eta_bins = [1.2, 2.1]
# flavors = ['C', 'B', 'DUSG']
# sv_categories = ["NoVertex", "PseudoVertex", "RecoVertex"]
start = time.time()
# Third positional argument of fit() -- presumably per-sample weights; confirm
# against the classifier's signature.
clf.fit(X, y, weights)
end = time.time()
elapsed_minutes = (end - start) / 60
log.info('training completed --> Elapsed time: %.1f minutes' % elapsed_minutes)

## num_nodes = [nnodes(i.tree_) for i in clf.estimators_]
##
## tot_nodes = sum(num_nodes)
## mean = float(tot_nodes)/len(num_nodes)
## print tot_nodes, mean

# Persist the training only when an output path was given; the text after the
# last dot in the path selects the format.
if args.out:
    log.info('Dumping training file in: ' + args.out)
    out_ext = args.out.rsplit('.', 1)[-1]
    if out_ext != 'xml':
        # Any other extension: compressed joblib dump of the classifier.
        joblib.dump(clf, args.out, compress=True)
    else:
        # 'xml': export through the gbr_to_tmva converter.
        convert.gbr_to_tmva(
            clf,
            X,
            args.out,
            mva_name="BDTG",
            coef=10,
            var_names=variables,
        )

#################################
#                               #
#          Validation           #
#                               #
#################################
# you can reload the training if needed (or if you only want to do a validation on an existing training)
# but it is much faster to use the still existing classifier from the training
'''
print 'Loading training file from: ' + training_file
clf_val = joblib.load(training_file)
'''
max_depth=4, learning_rate=0.01, n_estimators=100, verbose=True, min_samples_leaf=10, min_samples_split=10 ) cls.fit( np.array(data[["Jet_CSV", "Jet_CSVIVF", "Jet_pt"]]), np.array(np.abs(data[["Jet_flavour"]]) == 5).ravel(len(data), ) ) sklearn_to_tmva.gbr_to_tmva( cls, data[["Jet_CSV", "Jet_CSVIVF", "Jet_pt"]], "test.xml", coef=2 ) import ROOT, array from ROOT import TMVA reader = TMVA.Reader("!V") vardict = {} for fn in ["Jet_CSV", "Jet_CSVIVF", "Jet_pt"]: vardict[fn] = array.array("f", [0]) reader.AddVariable(fn, vardict[fn]) reader.BookMVA("testmva", "test.xml") def mva1(x,y,z): ret = 0
# Hyper-parameters of the gradient-boosted classifier, collected in one place.
bdt_options = dict(
    max_depth=3,
    learning_rate=0.01,
    n_estimators=100,
    verbose=True,
    min_samples_leaf=10,
    min_samples_split=10,
)
cls = GradientBoostingClassifier(**bdt_options)

# Hold out 20% of the events (features, targets and weights are split
# consistently); the fixed seed keeps the split reproducible.
d_train, d_test, t_train, t_test, w_train, w_test = train_test_split(
    data_train,
    targets,
    data_weights,
    test_size=0.2,
    random_state=7,
)

#cls.fit(data_train, targets)
# Weighted fit on the training part only.
cls.fit(d_train, t_train, sample_weight=w_train)

# Write the trained classifier out as an XML file via sklearn_to_tmva.
sklearn_to_tmva.gbr_to_tmva(
    cls,
    data[trainVars()],
    "TMVABDT_2lss_1tau_ttV_maxDepth3_10Var_frWt_wMEMall.xml",
    coef=2,
)

import ROOT
import array
from ROOT import TFile, TH1F, TGraph, TCanvas, TLegend

# Output ROOT file; "RECREATE" overwrites an existing file of the same name.
fout = TFile("2lss_1tau_ttV_performance_maxDepth3_10Var_frWt_wMEMall.root", "RECREATE")

# Canvas cosmetics: fill colour, border, margins and a logarithmic y axis.
c1 = TCanvas()
c1.SetFillColor(10)
c1.SetBorderSize(2)
for set_margin, width in (
    (c1.SetLeftMargin, 0.12),
    (c1.SetBottomMargin, 0.12),
    (c1.SetRightMargin, 0.05),
):
    set_margin(width)
c1.SetLogy()