# Flip the label of every event after the signal block to -1.
# presumably y was initialised to +1 for all events earlier in the script -- verify.
y[signal.shape[0]:] *= -1

# Shuffle features and labels with one shared permutation so they stay aligned.
# NOTE(review): w is not permuted and is never passed to fit() below, so the
# event weights currently have no effect on the training -- confirm intent.
permute = RNG.permutation(y.shape[0])
X = X[permute]
y = y[permute]

# Use all dataset for training
X_train, y_train, w_train = X, y, w

# Declare BDT - we are going to use AdaBoost Decision Tree.
# min_samples_leaf must be an integer event count here: scikit-learn reads a
# float as a *fraction* of the samples, so the raw product 0.05 * len(X_train)
# (a float well above 1) is not a valid value. Require 5% of the events per
# leaf, and at least one event.
min_leaf_events = max(1, int(0.05 * len(X_train)))
dt = DecisionTreeClassifier(max_depth=3, min_samples_leaf=min_leaf_events)
bdt = AdaBoostClassifier(dt, algorithm='SAMME', n_estimators=800, learning_rate=0.5)

# Train BDT
bdt.fit(X_train, y_train)

# Save BDT to pickle file
with open('bdt_sklearn_to_tmva_example.pkl', 'wb') as fid:
    cPickle.dump(bdt, fid)

# Save BDT to TMVA xml file
# Note:
#    - declare input variable names and their type
#    - variable order is important for TMVA
convert_bdt_sklearn_tmva(bdt, [('var1', 'F'), ('var2', 'F')], 'bdt_sklearn_to_tmva_example.xml')
import cPickle
from skTMVA import convert_bdt_sklearn_tmva

# Where the pickled scikit-learn BDT lives, and where the TMVA weights go.
bdt_path = '/Users/musthero/Documents/Yura/Applications/tmva_local/electrons_v5_VeryTightLH_20per.pkl'
tmva_outfile_xml = 'SKLearn_BDT_electons.weights.xml'

# Input variables as (name, type) pairs, passed to the converter in this order.
var_list = [
    ('m_el_pt', 'F'),
    ('m_el_eta', 'F'),
    ('m_el_sigd0PV', 'F'),
    ('m_el_z0SinTheta', 'F'),
    ('m_el_etcone20Dpt', 'F'),
    ('m_el_ptcone20Dpt', 'F'),
]

# Restore the trained classifier from disk.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(bdt_path, 'rb') as fid:
    bdt = cPickle.load(fid)

# Write the scikit-learn BDT out as a TMVA xml weights file.
convert_bdt_sklearn_tmva(bdt, var_list, tmva_outfile_xml)
def train(self, train_data, classification_variables, variable_dict, sample_name, grid_search):
    """
    Definition:
    -----------
        Training method for sklBDT; it pickles the trained model to
        <output_directory>/<sample_name>/<name>/classifier/skl_BDT_clf.pkl
        and, when skTMVA can be imported, also exports it as a TMVA XML
        weights file in the same folder.

    Args:
    -----
        train_data = dictionary, containing "X", "y", "w" for the training set, where:
            X = ndarray of dim (# training examples, # features)
            y = array of dim (# training examples) with target values
            w = array of dim (# training examples) with event weights
        classification_variables = list of names of variables used for classification
        variable_dict = ordered dict, mapping all the branches from the TTree to their type
        sample_name = string that specifies the file name of the sample being trained on
        grid_search = truthy to tune the hyper-parameters with a cross-validated grid
            search and keep the best estimator; falsy to train with fixed settings
    """
    logger = logging.getLogger("skl_BDT")

    # -- Train:
    logger.info("Training...")

    if grid_search:
        # Thoughts:
        # -- min_samples_leaf is supposedly faster to train than min_samples_split
        # -- could first tune optimum number of trees
        #    then tune max_depth and min_samples to save on combinatorics
        parameters = {
            "n_estimators": [100, 150, 200, 250, 300],
            "max_depth": [2, 4, 6, 8, 10],
            "min_samples_leaf": [20, 30, 40, 50, 60]
        }
        fit_params = {"sample_weight": train_data["w"]}

        # Run grid search over provided ranges
        logger.info("Running grid search parameter optimisation...")
        # Kept under its own name so the `grid_search` argument is not shadowed.
        # NOTE(review): the `fit_params` and `iid` constructor arguments only
        # exist in older scikit-learn releases -- confirm the pinned version.
        searcher = GridSearchCV(
            estimator=GradientBoostingClassifier(
                learning_rate=0.2,
                min_samples_leaf=50,
                max_features="sqrt",
                subsample=0.8,
                random_state=10
            ),
            param_grid=parameters,
            fit_params=fit_params,
            scoring="roc_auc",
            n_jobs=-1,
            iid=False,
            cv=3,
            verbose=1
        )
        searcher.fit(train_data["X"], train_data["y"])

        best_parameters = searcher.best_params_
        for param_name in parameters:
            scanned_values = parameters[param_name]
            # An optimum sitting on the edge of the scan suggests widening the range.
            if best_parameters[param_name] in [scanned_values[0], scanned_values[-1]]:
                logger.info("Best value of {} is at limit of considered range!".format(param_name))
        for param_name in parameters:
            logger.info("... {}: {}".format(param_name, best_parameters[param_name]))

        # The search refits the best configuration on the full training set by
        # default; use that model. Without this assignment `classifier` was
        # undefined on this branch and the code below raised a NameError.
        classifier = searcher.best_estimator_
    else:
        classifier = GradientBoostingClassifier(
            n_estimators=300,     # was n_estimators=300
            max_depth=10,         # was max_depth=15
            min_samples_leaf=40,  # was min_samples_split=0.5 * len(train_data["y"])
            verbose=1
        )
        classifier.fit(train_data["X"], train_data["y"], sample_weight=train_data["w"])

    # -- Dump output to pickle
    classifier_directory = os.path.join(self.output_directory, sample_name, self.name, "classifier")
    ensure_directory(classifier_directory)
    joblib.dump(
        classifier,
        os.path.join(classifier_directory, "skl_BDT_clf.pkl"),
        protocol=cPickle.HIGHEST_PROTOCOL
    )

    # Save BDT to TMVA xml file
    # -- variable order is important for TMVA
    # -- can't yet reproduce scikit-learn output in TMVA(!)
    # Only the optional import is guarded, so an ImportError raised while
    # converting is not mistaken for "skTMVA is not installed".
    try:
        from skTMVA import convert_bdt_sklearn_tmva
    except ImportError:
        logger.info("Could not import skTMVA. Skipping export to TMVA output.")
    else:
        logger.info("Exporting output to TMVA XML file")
        variables = [(v, variable_dict[v]) for v in classification_variables]
        convert_bdt_sklearn_tmva(
            classifier,
            variables,
            os.path.join(classifier_directory, "skl_BDT_TMVA.weights.xml")
        )
# Start with every event labelled +1, then flip the events that follow the
# signal block to -1.
y = np.ones(X.shape[0])
# Random integer event weights drawn from [1, 10), one per event.
# NOTE(review): w is not permuted with X/y and is never passed to fit() below,
# so it currently has no effect on the training -- confirm intent.
w = RNG.randint(1, 10, n_events * 2)
y[signal.shape[0]:] *= -1

# Shuffle features and labels with one shared permutation so they stay aligned.
permute = RNG.permutation(y.shape[0])
X = X[permute]
y = y[permute]

# Use all dataset for training
X_train, y_train, w_train = X, y, w

# Declare BDT - we are going to use a gradient-boosted decision tree ensemble.
# min_samples_leaf must be an integer event count here: scikit-learn reads a
# float as a *fraction* of the samples, so the raw product 0.05 * len(X_train)
# (a float well above 1) is not a valid value. Require 5% of the events per
# leaf, and at least one event.
min_leaf_events = max(1, int(0.05 * len(X_train)))
bdt = GradientBoostingClassifier(n_estimators=200,
                                 learning_rate=0.5,
                                 min_samples_leaf=min_leaf_events,
                                 max_depth=3,
                                 random_state=0)

# Train BDT
bdt.fit(X_train, y_train)

# Save BDT to pickle file
with open('bdt_sklearn_to_tmva_example.pkl', 'wb') as fid:
    cPickle.dump(bdt, fid)

# Save BDT to TMVA xml file
# Note:
#    - declare input variable names and their type
#    - variable order is important for TMVA
convert_bdt_sklearn_tmva(bdt, [('var1', 'F'), ('var2', 'F')], 'bdt_sklearn_to_tmva_example.xml')
bins=100, weights=w_test[(y_test < 0.5)], range=[-1, 1],
histtype='stepfilled', label='B (test)', color='sandybrown', normed=1)
# (The lines above close a call that starts before this excerpt -- presumably
# a histogram of the classifier output for test events with y_test < 0.5.)
# NOTE(review): newer matplotlib releases replaced `normed` with `density` --
# confirm against the installed version.

# Label the axes, fix the y-range and write the figure to disk.
plt.xlabel("predict proba")
plt.ylabel("Arbitrary units")
plt.ylim(0.0, 6.0)
plt.legend(loc='best')
plt.savefig('BDT_score_12042019_endcap.png')

##################################################################################################################
##################################################################################################################
#####################################################################################################
# convert sklearn model to TMVA weights
#################################################################################################
from skTMVA import convert_bdt_sklearn_tmva

# Export `model` to a TMVA xml file; each input is declared as a (name, type)
# pair and the list is passed to the converter in this order.
convert_bdt_sklearn_tmva(model, [('SCRawE', 'F'), ('r9', 'F'), ('sigmaIetaIeta', 'F'), ('etaWidth', 'F'), ('phiWidth', 'F'), ('covIEtaIPhi', 'F'), ('s4', 'F'), ('phoIso03', 'F'), ('chgIsoWrtChosenVtx', 'F'), ('chgIsoWrtWorstVtx', 'F'), ('scEta', 'F'), ('rho', 'F'), ('esEffSigmaRR', 'F'), ('esEnergyOverRawE', 'F')], 'bdt_sklearn_to_tmva_nTree100.xml')
import cPickle
from skTMVA import convert_bdt_sklearn_tmva

# Read the trained scikit-learn BDT back from its pickle file.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
bdt_path = '/Users/musthero/Documents/Yura/Applications/tmva_local/electrons_v5_VeryTightLH_20per.pkl'
with open(bdt_path, 'rb') as fid:
    bdt = cPickle.load(fid)

# Input branches, in the order handed to the converter; every one is
# declared with type 'F'.
electron_inputs = (
    'm_el_pt',
    'm_el_eta',
    'm_el_sigd0PV',
    'm_el_z0SinTheta',
    'm_el_etcone20Dpt',
    'm_el_ptcone20Dpt',
)
var_list = [(branch, 'F') for branch in electron_inputs]

# Destination TMVA xml-file
tmva_outfile_xml = 'SKLearn_BDT_electons.weights.xml'

# Export the classifier to the TMVA xml-file
convert_bdt_sklearn_tmva(bdt, var_list, tmva_outfile_xml)
# Keep the last 20% of the rows (no shuffling) as a test sample and store it.
data, test = train_test_split(data, shuffle=False, test_size=0.2)
test.to_root('test_sample.root')

#scaler1 = MinMaxScaler().fit(data.drop(['trk_isTrue','trk_mva'],axis=1))
#joblib.dump(scaler1, "scaler.pkl")

# Features are all columns except the label and 'TransMass'; the label is 'isSignal'.
train_x = data.drop(['isSignal', 'TransMass'], axis=1)
#train_x=pd.DataFrame(data=scaler1.transform(train_x),columns=train_x.columns.values)
train_y = data['isSignal']

#clf = RandomForestClassifier(n_estimators=100,verbose=1,n_jobs=-1,class_weight='balanced')
from sklearn.ensemble import GradientBoostingClassifier

# Train a 500-tree gradient-boosted classifier on the training sample.
clf = GradientBoostingClassifier(verbose=1, n_estimators=500)
clf.fit(train_x, train_y)

# Quick sanity check: predictions for the first five training rows.
preds = clf.predict(train_x[:5])
print(preds)

# Persist the trained model as a pickle file.
import cPickle
with open('BDT_500.pkl', 'wb') as f:
    cPickle.dump(clf, f)

# Export the model to a TMVA xml file; all inputs are declared with type 'F'.
# NOTE(review): the list ends with 'Transmass' while a column spelled
# 'TransMass' is dropped from the features above -- confirm that these names
# and their order match the columns of train_x.
from skTMVA import convert_bdt_sklearn_tmva
tmva_input_names = ['Tau_pt', 'Bjet_pt', 'MET', 'DPhi_tau_miss', 'DPhi_bjet_miss',
                    'Dist_tau_bjet', 'Upsilon', 'Transmass']
convert_bdt_sklearn_tmva(clf, [(input_name, 'F') for input_name in tmva_input_names], 'BDT_500.xml')
# Names of the classifier inputs, in the order they are registered with the
# TMVA reader and handed to the converter below.
variable_names = [ "abs_eta_j", "abs_eta_jb", "Delta_eta_jb", "idx_by_mH", "idx_by_pT", "idx_by_pT_jb", "m_jb", "pT_j", "pT_jb" ]
# One single-element float array per variable; these are the buffers given to
# the TMVA reader via AddVariable.
variables = OrderedDict( (k,array.array("f",[0])) for k in variable_names )

# Load scikit-learn from pickle
skl_bdt = joblib.load("output/merged_inputs/skl_BDT/classifier/skl_BDT_clf.pkl")

# # Set up ROOT TMVAs
# reader_ROOT = ROOT.TMVA.Reader()
# for variable_name, variable in variables.items():
#     reader_ROOT.AddVariable(variable_name, variable)
# reader_ROOT.BookMVA("scikit-learn", "output/merged_inputs/root_tmva/weights/TMVAClassification_BDT.weights.xml")

# Convert the scikit-learn BDT into a TMVA XML weights file.
from skTMVA import convert_bdt_sklearn_tmva
logging.getLogger("skl_BDT").info("Exporting output to TMVA XML file")
# NOTE(review): every variable is declared with type "D" here, while the
# reader buffers above are array.array("f") -- confirm the two are meant to differ.
tree_variables = [ (v,"D") for v in variable_names ]
convert_bdt_sklearn_tmva(skl_bdt, tree_variables, "converted_skl_BDT_TMVA.weights.xml")

# Book the converted weights file in a fresh TMVA reader under the name "converted".
reader_skl = ROOT.TMVA.Reader()
for variable_name, variable in variables.items():
    reader_skl.AddVariable(variable_name, variable)
reader_skl.BookMVA("converted", "converted_skl_BDT_TMVA.weights.xml")

# Only the first file matching the pattern is processed.
for input_filename in [ glob.glob("inputs/*X275*root")[0] ]:
    logger.info("Now considering {}".format(input_filename))
    input_file = ROOT.TFile.Open(input_filename, "READ")
    event_tree = input_file.Get("events_1tag")
    for idx_evt, event in enumerate(event_tree):
        # Stop after the first six events (indices 0-5).
        if idx_evt > 5 : break
        # Number of entries in this event's isCorrect branch.
        n_pairs = len([ x for x in event.isCorrect ])
        # One list of scores per method name.
        # NOTE(review): "ROOT" is still listed although the ROOT reader setup
        # above is commented out -- confirm it is filled further down.
        scores = dict( (k,[]) for k in ["ROOT", "scikit-learn", "converted"] )
        for idx_pair in range(n_pairs):