def test_param_conversion():
    """
    Test the model parameter type conversion functionality

    The handler is initialised with integer values for 'max_delta_step',
    'max_depth', 'min_child_weight' and 'n_estimators'; the test feeds float
    values for them and expects them back rounded to integers, while every
    other parameter must come back unchanged.
    """
    objective_sp = 'multi:softprob'  # defined once to avoid repeating the literal

    # entries that are identical in the initial, input and expected dictionaries
    shared = {'base_score': 0.5, 'booster': 'gbtree', 'missing': np.nan,
              'n_jobs': 1, 'objective': objective_sp, 'random_state': 0,
              'verbosity': 1}

    # float hyper-parameters: all set to 1. when the handler is created
    float_keys = ('colsample_bylevel', 'colsample_bynode', 'colsample_bytree',
                  'gamma', 'learning_rate', 'reg_alpha', 'reg_lambda',
                  'scale_pos_weight', 'subsample')
    init_dict = {**shared, **dict.fromkeys(float_keys, 1.),
                 'max_delta_step': 0, 'max_depth': 3,
                 'min_child_weight': 1, 'n_estimators': 100}

    # float values that the conversion must leave untouched
    float_values = {'colsample_bylevel': 0.72, 'colsample_bynode': 0.81,
                    'colsample_bytree': 0.94, 'gamma': 5.5,
                    'learning_rate': 0.012, 'reg_alpha': 0.4,
                    'reg_lambda': 2.2, 'scale_pos_weight': 11.4,
                    'subsample': 0.91}

    # integer parameters given as floats ...
    orig_dict = {**shared, **float_values,
                 'max_delta_step': 0.3, 'max_depth': 6.87,
                 'min_child_weight': 6.22, 'n_estimators': 1127.9}
    # ... and the integers expected after the conversion
    right_dict = {**shared, **float_values,
                  'max_delta_step': 0, 'max_depth': 7,
                  'min_child_weight': 6, 'n_estimators': 1128}

    model = ModelHandler(xgb.XGBClassifier(), None, init_dict)
    # the conversion method is private: reach it through its mangled name
    converted_dict = model._ModelHandler__cast_model_params(orig_dict)  # pylint: disable=protected-access

    assert converted_dict == right_dict, 'Wrong conversion of model parameters!'
def load_hipe4mlmodel(self):
    """
    Create the hipe4ml ModelHandler stored in self.p_hipe4ml_model.

    The training variables (self.v_train) are the variable names of the signal
    handler with 'inv_mass' and 'pt_cand' removed; the handler wraps a default
    XGBClassifier.
    """
    self.logger.info("Loading hipe4ml model")

    self.v_train = self.signalhandler.get_var_names()
    # these two variables must not enter the training
    for excluded_var in ('inv_mass', 'pt_cand'):
        self.v_train.remove(excluded_var)

    self.p_hipe4ml_model = ModelHandler(xgb.XGBClassifier(), self.v_train)
def load_ML_analysis(self, cent_class, pt_range, ct_range, split=''):
    """
    Load the efficiency-vs-score array and the trained model handler of one bin.

    The files are looked up under the directories given by the environment
    variables HYPERML_MODELS_<mode> (sub-directory 'handlers') and
    HYPERML_EFFICIENCIES_<mode>, where <mode> is self.mode. The file names are
    built from the edges of the centrality, pt and ct ranges plus `split`.

    Returns the tuple (eff_score_array, model_handler).
    """
    bin_tag = f'_{cent_class[0]}{cent_class[1]}_{pt_range[0]}{pt_range[1]}_{ct_range[0]}{ct_range[1]}{split}'

    models_dir = os.environ[f'HYPERML_MODELS_{self.mode}']
    efficiencies_dir = os.environ[f'HYPERML_EFFICIENCIES_{self.mode}']

    eff_score_array = np.load(f'{efficiencies_dir}/Eff_Score{bin_tag}.npy')

    model_handler = ModelHandler()
    model_handler.load_model_handler(f'{models_dir}/handlers/model_handler{bin_tag}.pkl')

    return eff_score_array, model_handler
def train_test(inputCfg, PtBin, OutPutDirPt, TrainTestData, iBin): #pylint: disable=too-many-statements, too-many-branches
    '''
    function for model training and testing

    An XGBoost classifier is wrapped in a hipe4ml ModelHandler, optionally
    optimised with a Bayesian scan of the hyper-parameters, trained and tested.
    The model handler, the bare model and the performance plots are written
    to OutPutDirPt.

    Parameters
    -----------------
    - inputCfg: configuration dictionary (sections 'ml', 'output' and 'plots' are read)
    - PtBin: pT interval, PtBin[0] and PtBin[1] are used in the output file names
    - OutPutDirPt: directory where all outputs are saved
    - TrainTestData: indexable container; from its usage below the elements are
      [train set, train labels, test set, test labels]
    - iBin: index of the pT bin, selects the entry of inputCfg['ml']['hyper_par']

    Returns
    -----------------
    - ModelHandl: the trained ModelHandler

    The function terminates the program with sys.exit() if the configuration is not valid.
    '''
    n_classes = len(np.unique(TrainTestData[3]))
    modelClf = xgb.XGBClassifier(use_label_encoder=False)
    TrainCols = inputCfg['ml']['training_columns']
    HyperPars = inputCfg['ml']['hyper_par'][iBin]
    if not isinstance(TrainCols, list):
        print('\033[91mERROR: training columns must be defined!\033[0m')
        sys.exit()
    if not isinstance(HyperPars, dict):
        print('\033[91mERROR: hyper-parameters must be defined or be an empty dict!\033[0m')
        sys.exit()
    ModelHandl = ModelHandler(modelClf, TrainCols, HyperPars)

    # hyperparams optimization
    if inputCfg['ml']['hyper_par_opt']['do_hyp_opt']:
        print('Perform bayesian optimization')

        BayesOptConfig = inputCfg['ml']['hyper_par_opt']['bayes_opt_config']
        if not isinstance(BayesOptConfig, dict):
            print('\033[91mERROR: bayes_opt_config must be defined!\033[0m')
            sys.exit()

        # choice of the scoring metric: for multi-class the name encodes approach and averaging
        if n_classes > 2:
            average_method = inputCfg['ml']['roc_auc_average']
            roc_method = inputCfg['ml']['roc_auc_approach']
            if not (average_method in ['macro', 'weighted'] and roc_method in ['ovo', 'ovr']):
                print('\033[91mERROR: selected ROC configuration is not valid!\033[0m')
                sys.exit()

            if average_method == 'weighted':
                metric = f'roc_auc_{roc_method}_{average_method}'
            else:
                metric = f'roc_auc_{roc_method}'
        else:
            metric = 'roc_auc'

        print('Performing hyper-parameters optimisation: ...', end='\r')
        # the printout of the optimisation is redirected to a text file; the
        # with/try-finally pair guarantees that the file is closed and stdout
        # is restored even if the optimisation raises
        with open(f'{OutPutDirPt}/HyperParOpt_pT_{PtBin[0]}_{PtBin[1]}.txt', 'wt') as OutFileHypPars:
            sys.stdout = OutFileHypPars
            try:
                ModelHandl.optimize_params_bayes(
                    TrainTestData, BayesOptConfig, metric,
                    nfold=inputCfg['ml']['hyper_par_opt']['nfolds'],
                    init_points=inputCfg['ml']['hyper_par_opt']['initpoints'],
                    n_iter=inputCfg['ml']['hyper_par_opt']['niter'],
                    njobs=inputCfg['ml']['hyper_par_opt']['njobs'])
            finally:
                sys.stdout = sys.__stdout__
        print('Performing hyper-parameters optimisation: Done!')
        print(f'Output saved in {OutPutDirPt}/HyperParOpt_pT_{PtBin[0]}_{PtBin[1]}.txt')
        print(f'Best hyper-parameters:\n{ModelHandl.get_model_params()}')
    else:
        ModelHandl.set_model_params(HyperPars)

    # train and test the model with the updated hyper-parameters
    yPredTest = ModelHandl.train_test_model(
        TrainTestData, True, output_margin=inputCfg['ml']['raw_output'],
        average=inputCfg['ml']['roc_auc_average'],
        multi_class_opt=inputCfg['ml']['roc_auc_approach'])
    yPredTrain = ModelHandl.predict(TrainTestData[0], inputCfg['ml']['raw_output'])

    # save model handler in pickle
    ModelHandl.dump_model_handler(f'{OutPutDirPt}/ModelHandler_pT_{PtBin[0]}_{PtBin[1]}.pickle')
    ModelHandl.dump_original_model(f'{OutPutDirPt}/XGBoostModel_pT_{PtBin[0]}_{PtBin[1]}.model', True)

    #plots
    # the FD label is optional: it is appended only when defined in the config
    LegLabels = [inputCfg['output']['leg_labels']['Bkg'],
                 inputCfg['output']['leg_labels']['Prompt']]
    if inputCfg['output']['leg_labels']['FD'] is not None:
        LegLabels.append(inputCfg['output']['leg_labels']['FD'])
    OutputLabels = [inputCfg['output']['out_labels']['Bkg'],
                    inputCfg['output']['out_labels']['Prompt']]
    if inputCfg['output']['out_labels']['FD'] is not None:
        OutputLabels.append(inputCfg['output']['out_labels']['FD'])
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (10, 7)
    MLOutputFig = plot_utils.plot_output_train_test(
        ModelHandl, TrainTestData, 80, inputCfg['ml']['raw_output'],
        LegLabels, inputCfg['plots']['train_test_log'], density=True)
    # with more than two classes one figure per class is saved, otherwise a single one
    if n_classes > 2:
        for Fig, Lab in zip(MLOutputFig, OutputLabels):
            Fig.savefig(f'{OutPutDirPt}/MLOutputDistr{Lab}_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    else:
        MLOutputFig.savefig(f'{OutPutDirPt}/MLOutputDistr_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (10, 9)
    ROCCurveFig = plot_utils.plot_roc(TrainTestData[3], yPredTest, None, LegLabels,
                                      inputCfg['ml']['roc_auc_average'],
                                      inputCfg['ml']['roc_auc_approach'])
    ROCCurveFig.savefig(f'{OutPutDirPt}/ROCCurveAll_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    # context manager: the pickle file is flushed and closed right after the dump
    with open(f'{OutPutDirPt}/ROCCurveAll_pT_{PtBin[0]}_{PtBin[1]}.pkl', 'wb') as ROCCurvePickle:
        pickle.dump(ROCCurveFig, ROCCurvePickle)
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (10, 9)
    ROCCurveTTFig = plot_utils.plot_roc_train_test(
        TrainTestData[3], yPredTest, TrainTestData[1], yPredTrain, None,
        LegLabels, inputCfg['ml']['roc_auc_average'],
        inputCfg['ml']['roc_auc_approach'])
    ROCCurveTTFig.savefig(f'{OutPutDirPt}/ROCCurveTrainTest_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    #_____________________________________________
    PrecisionRecallFig = plot_utils.plot_precision_recall(TrainTestData[3], yPredTest, LegLabels)
    PrecisionRecallFig.savefig(f'{OutPutDirPt}/PrecisionRecallAll_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (12, 7)
    FeaturesImportanceFig = plot_utils.plot_feature_imp(
        TrainTestData[2][TrainCols], TrainTestData[3], ModelHandl, LegLabels)
    # the first n_plot figures are saved per class (no label in the binary case),
    # any further figure is saved as the 'All' plot
    n_plot = n_classes if n_classes > 2 else 1
    for iFig, Fig in enumerate(FeaturesImportanceFig):
        if iFig < n_plot:
            label = OutputLabels[iFig] if n_classes > 2 else ''
            Fig.savefig(f'{OutPutDirPt}/FeatureImportance{label}_pT_{PtBin[0]}_{PtBin[1]}.pdf')
        else:
            Fig.savefig(f'{OutPutDirPt}/FeatureImportanceAll_pT_{PtBin[0]}_{PtBin[1]}.pdf')

    return ModelHandl
plt.xlabel('Iteration') plt.ylabel('ROC AUC') plt.legend() plt.savefig('../opt_comp.png', dpi = 100, facecolor = 'white') plt.close() ################################################################################## # BEST HYPERPARAMETERS FOR EACH METHOD names = ['Opt_test_OPTUNA', 'Opt_test_BAYES', 'Opt_test_DEFAULT', 'Opt_test_PbPb'] if False: for name in names: model_hdl = ModelHandler() model_hdl.load_model_handler('../analysis_results/' + name + '/model/model_hdl') print(name) print(model_hdl.get_model_params()) print('\n---------------\n') ################################################################################## # PLOT SUPERIMPOSED ROC ''' plt.close() objects = [] for n in names: with (open('../analysis_results/' + n + '/images/training/ROC_AUC_train_test.pickle', "rb")) as openfile:
from hipe4ml.model_handler import ModelHandler

# --------------------------------------------
# shared fixture, part 1: two-class digits sample restricted to the first ten features
DIGITS_DATA = datasets.load_digits(n_class=2)
DIGITS = pd.DataFrame(DIGITS_DATA.data[:, :10])  # pylint: disable=E1101
Y_DIGITS = DIGITS_DATA.target  # pylint: disable=E1101
SIG_DF = DIGITS[Y_DIGITS == 1]
BKG_DF = DIGITS[Y_DIGITS == 0]
TRAIN_SET, TEST_SET, Y_TRAIN, Y_TEST = train_test_split(DIGITS,
                                                        Y_DIGITS,
                                                        test_size=0.5,
                                                        random_state=42)
DATA = [TRAIN_SET, Y_TRAIN, TEST_SET, Y_TEST]

# --------------------------------------------
# shared fixture, part 2: model trained once at import time, with its predictions
# on the test (DATA[2]) and training (DATA[0]) samples
INPUT_MODEL = xgb.XGBClassifier()
MODEL = ModelHandler(INPUT_MODEL)
MODEL.train_test_model(DATA)
Y_PRED = MODEL.predict(DATA[2])
Y_PRED_TRAIN = MODEL.predict(DATA[0])
EFFICIENCY, THRESHOLD = analysis_utils.bdt_efficiency_array(DATA[3],
                                                            Y_PRED,
                                                            n_points=10)
# --------------------------------------------


def test_plot_distr():
    """
    Check that the feature distribution plot is returned as a numpy array
    """
    distr_plot = plot_utils.plot_distr([SIG_DF, BKG_DF], SIG_DF.columns)
    assert isinstance(distr_plot, np.ndarray)
def train_test(inputCfg, PtMin, PtMax, OutPutDirPt, TrainTestData):
    '''
    Train and test the model for one pT interval and save the performance plots

    The training columns and hyper-parameters are read from inputCfg['ml'];
    the program is terminated with sys.exit() if they are not a list and a
    dict respectively. The model handler and all figures are written in
    OutPutDirPt with the pT edges in the file names.
    Returns the trained ModelHandler.
    '''
    MlCfg = inputCfg['ml']
    TrainCols = MlCfg['training_columns']
    HyperPars = MlCfg['hyper_par']

    # configuration sanity checks
    if not isinstance(TrainCols, list):
        print('ERROR: training columns must be defined!')
        sys.exit()
    if not isinstance(HyperPars, dict):
        print('ERROR: hyper-parameters must be defined or be an empty dict!')
        sys.exit()

    ModelHandl = ModelHandler(xgb.XGBClassifier(), TrainCols, HyperPars)

    # no hyper-parameter optimisation here: it is not working with
    # multi-class classification at the moment

    # training, then prediction on the test sample (TrainTestData[2])
    ModelHandl.train_test_model(TrainTestData)
    yPredTest = ModelHandl.predict(TrainTestData[2], MlCfg['raw_output'], True)

    # save model handler in pickle
    ModelHandl.dump_model_handler(f'{OutPutDirPt}/ModelHandler_pT_{PtMin}_{PtMax}.pickle')

    # labels used in the legends and in the output file names
    LegLabels = inputCfg['output']['leg_labels']
    OutputLabels = inputCfg['output']['out_labels']

    # model output distributions: one figure per output label
    plt.rcParams["figure.figsize"] = (10, 7)
    MLOutputFig = plot_utils.plot_output_train_test(ModelHandl,
                                                    TrainTestData,
                                                    80,
                                                    MlCfg['raw_output'],
                                                    LegLabels,
                                                    True,
                                                    inputCfg['plots']['train_test_log'],
                                                    density=True)
    for Fig, Lab in zip(MLOutputFig, OutputLabels):
        Fig.savefig(f'{OutPutDirPt}/MLOutputDistr{Lab}_pT_{PtMin}_{PtMax}.pdf')

    # ROC curves
    plt.rcParams["figure.figsize"] = (8, 7)
    ROCCurveFig = plot_utils.plot_roc(TrainTestData[3], yPredTest, LegLabels)
    ROCCurveFig.savefig(f'{OutPutDirPt}/ROCCurveAll_pT_{PtMin}_{PtMax}.pdf')

    # precision-recall curves
    PrecisionRecallFig = plot_utils.plot_precision_recall(TrainTestData[3], yPredTest, LegLabels)
    PrecisionRecallFig.savefig(f'{OutPutDirPt}/PrecisionRecallAll_pT_{PtMin}_{PtMax}.pdf')

    # feature importance: the first three figures are named after the output
    # labels, any further one is saved as 'All'
    plt.rcParams["figure.figsize"] = (12, 7)
    FeaturesImportanceFig = plot_utils.plot_feature_imp(TrainTestData[2][TrainCols],
                                                        TrainTestData[3],
                                                        ModelHandl)
    for iFig, Fig in enumerate(FeaturesImportanceFig):
        FigLabel = OutputLabels[iFig] if iFig < 3 else 'All'
        Fig.savefig(f'{OutPutDirPt}/FeatureImportance{FigLabel}_pT_{PtMin}_{PtMax}.pdf')

    return ModelHandl
def main():
    '''
    Entry point: runs training/testing and/or application of the ML model in each pT interval

    Command-line arguments:
    - cfgFileName: yaml configuration file
    - --train: perform only training and testing
    - --apply: perform only application, loading the models listed in
      inputCfg['ml']['saved_models'] instead of training them
    '''
    # read config file
    parser = argparse.ArgumentParser(description='Arguments to pass')
    parser.add_argument('cfgFileName', metavar='text', default='cfgFileNameML.yml',
                        help='config file name for ml')
    parser.add_argument("--train", help="perform only training and testing", action="store_true")
    parser.add_argument("--apply", help="perform only application", action="store_true")
    args = parser.parse_args()

    print('Loading analysis configuration: ...', end='\r')
    with open(args.cfgFileName, 'r') as ymlCfgFile:
        inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
    print('Loading analysis configuration: Done!')

    print('Loading data files: ...', end='\r')
    PromptDf = LoadDfFromRootOrParquet(inputCfg['input']['prompt'])
    FDDf = LoadDfFromRootOrParquet(inputCfg['input']['FD'])
    DataDf = LoadDfFromRootOrParquet(inputCfg['input']['data'])
    print('Loading data files: Done!')

    for iBin, (PtMin, PtMax) in enumerate(zip(inputCfg['pt_ranges']['min'],
                                              inputCfg['pt_ranges']['max'])):

        print(f'\n\033[94mStarting ML analysis --- {PtMin} < pT < {PtMax} GeV/c\033[0m')

        OutPutDirPt = os.path.join(inputCfg['output']['dir'], f'pt{PtMin}_{PtMax}')
        if os.path.isdir(OutPutDirPt):
            print('Output directory already exists, overwrites possibly ongoing!')
        else:
            # makedirs also creates the missing parent directories,
            # os.mkdir would fail if inputCfg['output']['dir'] does not exist yet
            os.makedirs(OutPutDirPt)

        # data preparation
        #_____________________________________________
        TrainTestData, DataDfPtSel, PromptDfPtSelForEff, FDDfPtSelForEff = data_prep(
            inputCfg, iBin, PtMin, PtMax, OutPutDirPt, DataDf, PromptDf, FDDf)

        # training, testing
        #_____________________________________________
        if not args.apply:
            ModelHandl = train_test(inputCfg, PtMin, PtMax, OutPutDirPt, TrainTestData)
        else:
            ModelList = inputCfg['ml']['saved_models']
            ModelPath = ModelList[iBin]
            if not isinstance(ModelPath, str):
                print('ERROR: path to model not correctly defined!')
                sys.exit()
            print(f'Loaded saved model: {ModelPath}')
            ModelHandl = ModelHandler()
            ModelHandl.load_model_handler(ModelPath)

        # model application
        #_____________________________________________
        if not args.train:
            appl(inputCfg, PtMin, PtMax, OutPutDirPt, ModelHandl,
                 DataDfPtSel, PromptDfPtSelForEff, FDDfPtSelForEff)

        # delete dataframes to release memory
        # (deleting a loop variable bound to each element would leave the
        # container and its content alive: the container itself is dropped)
        del TrainTestData
        del DataDfPtSel, PromptDfPtSelForEff, FDDfPtSelForEff
split_ineq_sign = '< 0.5' for i_cent_bins in range(len(CENTRALITY_LIST)): cent_bins = CENTRALITY_LIST[i_cent_bins] bin = f'{split}_{cent_bins[0]}_{cent_bins[1]}_{ct_bins[0]}_{ct_bins[1]}' ############################################################## # TRAINING AND TEST SET PREPARATION ############################################################## # features plot leg_labels = ['background', 'signal'] model_clf = xgb.XGBClassifier(use_label_encoder=False, n_jobs=10) model_hdl = ModelHandler(model_clf, TRAINING_COLUMNS_LIST) model_hdl.set_model_params(HYPERPARAMS) # hyperparameters optimization and model training if not os.path.isdir('models'): os.mkdir('models') bin_model = bin if MERGE_CENTRALITY: bin_model = f'all_0_90_{ct_bins[0]}_{ct_bins[1]}' if OPTIMIZE and TRAIN: model_hdl.optimize_params_bayes(train_test_data, HYPERPARAMS_RANGES, 'roc_auc', nfold=5, init_points=10,
''' import os import sys import argparse from hipe4ml.model_handler import ModelHandler parser = argparse.ArgumentParser(description='Arguments to pass') parser.add_argument('inFilePkl', metavar='text', default='model.pkl', help='input pickle file to be converted') args = parser.parse_args() ModelPath = os.path.expanduser(args.inFilePkl) print(f'Loaded saved model: {ModelPath}') ModelHandl = ModelHandler() ModelHandl.load_model_handler(ModelPath) if '.pickle' in ModelPath: outFileName = ModelPath.replace('.pickle', '.model') elif '.pkl' in ModelPath: outFileName = ModelPath.replace('.pkl', '.model') else: print(f'ERROR: invalid input file {ModelHandl}, please check it! Exit') sys.exit() ModelHandl.dump_original_model(outFileName, True) print(f'Saved model: {outFileName}')
if split == 'antimatter': split_ineq_sign = '< 0.5' for i_cent_bins in range(len(CENTRALITY_LIST)): cent_bins = CENTRALITY_LIST[i_cent_bins] bin = f'{split}_{cent_bins[0]}_{cent_bins[1]}_{ct_bins[0]}_{ct_bins[1]}' ############################################################## # TRAINING AND TEST SET PREPARATION ############################################################## # features plot leg_labels = ['background', 'non_prompt', 'prompt'] model_clf = xgb.XGBClassifier(use_label_encoder=False, n_jobs=4) model_hdl = ModelHandler(model_clf, TRAINING_COLUMNS_LIST) model_hdl.set_model_params(HYPERPARAMS) # hyperparameters optimization and model training if not os.path.isdir('models'): os.mkdir('models') bin_model = bin if MERGE_CENTRALITY: bin_model = f'all_0_90_{ct_bins[0]}_{ct_bins[1]}' if OPTIMIZE and TRAIN: model_hdl.optimize_params_optuna(train_test_data, HYPERPARAMS_RANGES, 'roc_auc_ovr', nfold=5, timeout=30) isModelTrained = os.path.isfile(f'models/{bin_model}_trained') print(f'isModelTrained {bin_model}: {isModelTrained}')
ml_application = ModelApplication(N_BODY, data_path, analysis_res_path, CENT_CLASSES, split) shift_bin = 1 for cclass in CENT_CLASSES: for ptbin in zip(PT_BINS[:-1], PT_BINS[1:]): for ctbin in zip(CT_BINS[:-1], CT_BINS[1:]): # data[0]=train_set, data[1]=y_train, data[2]=test_set, data[3]=y_test data = ml_analysis.prepare_dataframe(COLUMNS, cent_class=cclass, ct_range=ctbin, pt_range=ptbin) input_model = xgb.XGBClassifier() model_handler = ModelHandler(input_model) info_string = f'_{cclass[0]}{cclass[1]}_{ptbin[0]}{ptbin[1]}_{ctbin[0]}{ctbin[1]}{split}' filename_handler = handlers_path + '/model_handler' + info_string + '.pkl' model_handler.load_model_handler(filename_handler) y_pred = model_handler.predict(data[2]) test_set = pd.concat([data[2], data[3]], axis=1, sort=False) test_set.insert(0, 'score', y_pred) test_set.query('y>0', inplace=True) mass_bins = 40 if ctbin[1] < 16 else 36 eff_score_array, model_handler = ml_application.load_ML_analysis( cclass, ptbin, ctbin, split)
# split data into training and test set train_test_data = train_test_generator( [signal_tree_handler, background_tree_handler], [1, 0], test_size=0.5, random_state=RANDOM_STATE) print( f'Number of candidates ({split}) for training in {cent_bins[0]}-{cent_bins[1]}%, {ct_bins[0]}<=ct<{ct_bins[1]} cm: {len(train_test_data[0])}' ) print( f'signal candidates: {np.count_nonzero(train_test_data[1] == 1)}; background candidates: {np.count_nonzero(train_test_data[1] == 0)}; n_cand_bkg / n_cand_signal = {np.count_nonzero(train_test_data[1] == 0) / np.count_nonzero(train_test_data[1] == 1)}' ) print('') model_clf = xgb.XGBClassifier(use_label_encoder=False) model_hdl = ModelHandler(model_clf, TRAINING_COLUMNS_LIST) model_hdl.set_model_params(HYPERPARAMS) # hyperparameters optimization and model training if not os.path.isdir('models'): os.mkdir('models') if OPTIMIZE and TRAIN: model_hdl.optimize_params_bayes(train_test_data, HYPERPARAMS_RANGES, 'roc_auc', nfold=5, init_points=10, n_iter=10, njobs=-1) if TRAIN: model_hdl.train_test_model(train_test_data)
# -------------------------------------------- SKLEARN_DATA = datasets.load_digits(n_class=2) DIGITS_DATASET = pd.DataFrame(SKLEARN_DATA.data) # pylint: disable=E1101 Y_DIGITS = SKLEARN_DATA.target # pylint: disable=E1101 SIG_DF = DIGITS_DATASET[Y_DIGITS == 1] BKG_DF = DIGITS_DATASET[Y_DIGITS == 0] TRAIN_SET, TEST_SET, Y_TRAIN, Y_TEST = train_test_split( DIGITS_DATASET, Y_DIGITS, test_size=0.5, random_state=42) DATA = [TRAIN_SET, Y_TRAIN, TEST_SET, Y_TEST] # -------------------------------------------- # TRAINING AND TESTING # -------------------------------------------- INPUT_MODEL = xgb.XGBClassifier() MODEL = ModelHandler(INPUT_MODEL) # hyperparams optimization HYP_RANGES = { # # defines the maximum depth of a single tree (regularization) 'max_depth': (5, 15), # 'learning_rate': (0.01, 0.3), # learning rate 'n_estimators': (5, 10), # number of boosting trees } MODEL.optimize_params_bayes(DATA, HYP_RANGES, 'roc_auc') # train and test the model with the updated hyperparameters MODEL.train_test_model(DATA) Y_PRED = MODEL.predict(DATA[2]) # Calculate the BDT efficiency as a function of the BDT score
for ptbin in zip(PT_BINS[:-1], PT_BINS[1:]): for ctbin in zip(CT_BINS[:-1], CT_BINS[1:]): print( '\n==================================================') print('centrality:', cclass, ' ct:', ctbin, ' pT:', ptbin, split) part_time = time.time() # data[0]=train_set, data[1]=y_train, data[2]=test_set, data[3]=y_test data = ml_analysis.prepare_dataframe(COLUMNS, cent_class=cclass, ct_range=ctbin, pt_range=ptbin) input_model = xgb.XGBClassifier() model_handler = ModelHandler(input_model) model_handler.set_model_params(MODEL_PARAMS) model_handler.set_model_params(HYPERPARAMS) model_handler.set_training_columns(COLUMNS) if OPTIMIZE: model_handler.optimize_params_bayes(data, HYPERPARAMS_RANGE, 'roc_auc', init_points=10, n_iter=10) model_handler.train_test_model(data) print("train test model") print(
def get_skimmed_large_data(data_path, cent_classes, pt_bins, ct_bins, training_columns, application_columns, mode, split=''):
    """
    Apply the trained models to a large data sample read chunk by chunk and
    return only the candidates passing the score selection.

    For each (centrality, pt, ct) bin the model handler and the
    efficiency-vs-score array are read from the directories given by the
    environment variables HYPERML_MODELS_<mode> (sub-directory 'handlers') and
    HYPERML_EFFICIENCIES_<mode>. Candidates of the bin are kept if their score
    is larger than the last element of the second row of the efficiency array.

    Parameters
    ----------
    data_path: path(s) of the input file(s), read from the 'DataTable' tree
    cent_classes: iterable of centrality classes, each indexable as [min, max]
    pt_bins, ct_bins: sequences of bin edges; consecutive edges define the bins
    training_columns: columns given to the model to compute the score
    application_columns: columns kept in the returned dataframe
    mode: suffix of the environment variables (2 and 3 in the original code)
    split: optional suffix of the model and efficiency file names

    Returns
    -------
    pandas.DataFrame with the selected candidates (empty if nothing was read)

    Raises
    ------
    KeyError if the environment variables for `mode` are not defined
    """
    print('\n++++++++++++++++++++++++++++++++++++++++++++++++++')
    print('\nStarting BDT appplication on large data')

    # same lookup for every mode: an unsupported mode fails here with a clear
    # KeyError instead of leaving the paths undefined
    handlers_path = os.environ[f'HYPERML_MODELS_{mode}'] + '/handlers'
    efficiencies_path = os.environ[f'HYPERML_EFFICIENCIES_{mode}']

    # the bins do not depend on the data chunk: build names and selections once
    analysis_bins = []
    for cclass in cent_classes:
        for ptbin in zip(pt_bins[:-1], pt_bins[1:]):
            for ctbin in zip(ct_bins[:-1], ct_bins[1:]):
                info_string = '_{}{}_{}{}_{}{}'.format(
                    cclass[0], cclass[1], ptbin[0], ptbin[1], ctbin[0], ctbin[1])
                data_range = f'{ctbin[0]}<ct<{ctbin[1]} and {ptbin[0]}<pt<{ptbin[1]} and {cclass[0]}<=centrality<{cclass[1]}'
                analysis_bins.append((info_string, data_range))

    loaded_bins = {}      # info_string -> (model handler, score threshold), filled on first use
    selected_chunks = []  # selected candidates of each chunk and bin, concatenated at the end

    # the context manager shuts the thread pool down once the reading is over
    with ThreadPoolExecutor() as executor:
        iterator = uproot.pandas.iterate(data_path, 'DataTable', executor=executor, reportfile=True)

        for current_file, data in iterator:
            rename_df_columns(data)

            print('current file: {}'.format(current_file))
            print('start entry chunk: {}, stop entry chunk: {}'.format(data.index[0], data.index[-1]))

            for info_string, data_range in analysis_bins:
                if info_string not in loaded_bins:
                    filename_handler = handlers_path + '/model_handler' + info_string + split + '.pkl'
                    filename_efficiencies = efficiencies_path + '/Eff_Score' + info_string + split + '.npy'

                    model_handler = ModelHandler()
                    model_handler.load_model_handler(filename_handler)

                    eff_score_array = np.load(filename_efficiencies)
                    loaded_bins[info_string] = (model_handler, eff_score_array[1][-1])

                # 'tsd' must be a local variable: it is referenced as @tsd in the query below
                model_handler, tsd = loaded_bins[info_string]

                df_tmp = data.query(data_range)
                df_tmp.insert(0, 'score', model_handler.predict(df_tmp[training_columns]))

                df_tmp = df_tmp.query('score>@tsd')
                selected_chunks.append(df_tmp.loc[:, application_columns])

    # a single concatenation instead of growing the dataframe at every step
    # (DataFrame.append copies the whole frame each time and was removed in pandas 2)
    if selected_chunks:
        df_applied = pd.concat(selected_chunks, ignore_index=True, sort=False)
    else:
        df_applied = pd.DataFrame()

    # info() prints the summary itself and returns None
    df_applied.info(memory_usage='deep')
    return df_applied
filename_dict['analysis_path'] + '/' + filename_dict['analysis_name']) ########################################################################## print('\nHypertriton 3-body - pp @ 13 TeV\n') if flag_dict['train_model']: print('Starting model training & application\n') train.train_model(filename_dict, presel_dict, flag_dict, eff_array, train_vars, params, params_range) print('Model training & application complete\n') #print('BENCHMARKING') #utils.benchmark_hyperparam_optimizers(filename_dict, params, params_range, flag_dict, presel_dict, train_vars) model_hdl = ModelHandler() model_hdl.load_model_handler(filename_dict['analysis_path'] + '/model/model_hdl') print('Model loaded\n') eff_array, scores = train.load_eff_scores(filename_dict['analysis_path'] + 'output_data/') data = train.load_data_with_scores(filename_dict['analysis_path'] + 'output_data/data_scores.parquet.gzip' ) #pd dataframe already processed print('Data loaded\n') #data.query('model_output > -5', inplace = True) ## PARAM!!!!! #print('Query on data applied\n') background_ls = train.load_data_with_scores(
def main(): #pylint: disable=too-many-statements
    '''
    Entry point: runs training/testing and/or application of the ML model in each pT interval

    Command-line arguments:
    - cfgFileName: yaml configuration file
    - --train: perform only training and testing
    - --apply: perform only application, loading the models listed in
      inputCfg['ml']['saved_models'] instead of training them
    '''
    # read config file
    parser = argparse.ArgumentParser(description='Arguments to pass')
    parser.add_argument('cfgFileName', metavar='text', default='cfgFileNameML.yml',
                        help='config file name for ml')
    parser.add_argument("--train", help="perform only training and testing", action="store_true")
    parser.add_argument("--apply", help="perform only application", action="store_true")
    args = parser.parse_args()

    print('Loading analysis configuration: ...', end='\r')
    with open(args.cfgFileName, 'r') as ymlCfgFile:
        inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
    print('Loading analysis configuration: Done!')

    print('Loading and preparing data files: ...', end='\r')
    PromptHandler = TreeHandler(inputCfg['input']['prompt'], inputCfg['input']['treename'])
    # the FD sample is optional
    FDHandler = None if inputCfg['input']['FD'] is None else TreeHandler(
        inputCfg['input']['FD'], inputCfg['input']['treename'])
    DataHandler = TreeHandler(inputCfg['input']['data'], inputCfg['input']['treename'])

    # background: subset of the data selected with 'filt_bkg_mass' if defined, all data otherwise
    if inputCfg['data_prep']['filt_bkg_mass']:
        BkgHandler = DataHandler.get_subset(inputCfg['data_prep']['filt_bkg_mass'], frac=1.,
                                            rndm_state=inputCfg['data_prep']['seed_split'])
    else:
        BkgHandler = DataHandler

    PtBins = [[a, b] for a, b in zip(inputCfg['pt_ranges']['min'], inputCfg['pt_ranges']['max'])]
    PromptHandler.slice_data_frame('pt_cand', PtBins, True)
    if FDHandler is not None:
        FDHandler.slice_data_frame('pt_cand', PtBins, True)
    DataHandler.slice_data_frame('pt_cand', PtBins, True)
    # NOTE(review): without mass filter BkgHandler is DataHandler itself, hence the
    # same object is sliced twice -- confirm that slice_data_frame can be repeated safely
    BkgHandler.slice_data_frame('pt_cand', PtBins, True)
    print('Loading and preparing data files: Done!')

    for iBin, PtBin in enumerate(PtBins):
        print(f'\n\033[94mStarting ML analysis --- {PtBin[0]} < pT < {PtBin[1]} GeV/c\033[0m')

        OutPutDirPt = os.path.join(os.path.expanduser(inputCfg['output']['dir']),
                                   f'pt{PtBin[0]}_{PtBin[1]}')
        if os.path.isdir(OutPutDirPt):
            print((f'\033[93mWARNING: Output directory \'{OutPutDirPt}\' already exists,'
                   ' overwrites possibly ongoing!\033[0m'))
        else:
            os.makedirs(OutPutDirPt)

        # data preparation
        #_____________________________________________
        # an empty dataframe stands for the FD sample when it is not provided
        FDDfPt = pd.DataFrame() if FDHandler is None else FDHandler.get_slice(iBin)
        TrainTestData, PromptDfSelForEff, FDDfSelForEff = data_prep(
            inputCfg, iBin, PtBin, OutPutDirPt,
            PromptHandler.get_slice(iBin), FDDfPt, BkgHandler.get_slice(iBin))
        if args.apply and inputCfg['data_prep']['test_fraction'] < 1.:
            print('\033[93mWARNING: Using only a fraction of the MC for the application! Are you sure?\033[0m')

        # training, testing
        #_____________________________________________
        if not args.apply:
            ModelHandl = train_test(inputCfg, PtBin, OutPutDirPt, TrainTestData, iBin)
        else:
            ModelList = inputCfg['ml']['saved_models']
            ModelPath = ModelList[iBin]
            if not isinstance(ModelPath, str):
                print('\033[91mERROR: path to model not correctly defined!\033[0m')
                sys.exit()
            ModelPath = os.path.expanduser(ModelPath)
            print(f'Loaded saved model: {ModelPath}')
            ModelHandl = ModelHandler()
            ModelHandl.load_model_handler(ModelPath)

        # model application
        #_____________________________________________
        if not args.train:
            appl(inputCfg, PtBin, OutPutDirPt, ModelHandl, DataHandler.get_slice(iBin),
                 PromptDfSelForEff, FDDfSelForEff)

        # delete dataframes to release memory
        # (deleting a loop variable bound to each element would leave the
        # container and its content alive: the container itself is dropped)
        del TrainTestData
        del PromptDfSelForEff, FDDfSelForEff
def train_xgboost_model(signal, background, filename_dict, params, params_range, flag_dict, training_variables='', testsize=0.5):
    '''
    Trains an XGBOOST model using hipe4ml and plot output distribution and feature importance

    Parameters
    ----------
    signal, background: samples given to train_test_generator with labels 1 and 0
    filename_dict: dictionary of paths, only 'analysis_path' is read here
    params: model hyper-parameters, set unless flag_dict['use_default_param'] is true
    params_range: ranges for the hyper-parameter optimisation; before an optuna
        optimisation the string-valued entries of params are copied into it (in place)
    flag_dict: steering flags; read here: 'use_default_param', 'benchmark_opt',
        'optimize_bayes', 'optimize_optuna', 'plot_optim', 'timeout', 'n_jobs'
    training_variables: list of training variables, all the columns of the
        training set if left to ''
    testsize: fraction of the sample used as test set

    Returns
    -------
    train_test_data, y_pred_test, model_hdl

    NOTE(review): the block structure was reconstructed from a flattened source;
    the saving of the optuna plots and of the study is assumed to belong to the
    'optimize_optuna' branch, the only place where the study is defined -- confirm.
    '''
    import time  # used by the benchmark and by the Bayesian optimisation

    print('Training XGBOOST model')

    training_fig_path = filename_dict['analysis_path'] + "/images/training"

    train_test_data = train_test_generator([signal, background], [1, 0], test_size=testsize)

    if training_variables == '':
        training_variables = train_test_data[0].columns.tolist()

    model_clf = xgb.XGBClassifier()
    model_hdl = ModelHandler(model_clf, training_variables)
    if not flag_dict['use_default_param']:
        model_hdl.set_model_params(params)

    if flag_dict['benchmark_opt']:
        print('Benchamarking optimizers\n')
        from sklearn.metrics import roc_auc_score

        times_sk = []
        roc_sk = []

        for _ in range(1):
            start = time.time()

            model_hdl.optimize_params_bayes(train_test_data, params_range, 'roc_auc', njobs=-1)
            model_hdl.train_test_model(train_test_data, )

            y_pred_test = model_hdl.predict(train_test_data[2], True)  #used to evaluate model performance

            roc_sk.append(roc_auc_score(train_test_data[3], y_pred_test))
            times_sk.append(time.time() - start)

        print('\nBAYES OPTIMIZATION WITH SKLEARN')
        print('Mean time : ' + str(np.mean(times_sk)))
        print('Mean ROC : ' + str(np.mean(roc_sk)))
        print('--------------\n')
        print('OPTUNA')

        # the list of timings must not be called 'time': that name is the module
        # used above, and the elapsed time has to be recorded to have a mean to print
        times_optuna = []
        roc = []

        for _ in range(1):
            start = time.time()

            # parameters with a string value are passed to optuna as they are
            for key in params:
                if isinstance(params[key], str):
                    params_range[key] = params[key]

            model_hdl.optimize_params_optuna(train_test_data, params_range, 'roc_auc',
                                             timeout=flag_dict['timeout'],
                                             n_jobs=flag_dict['n_jobs'])
            model_hdl.train_test_model(train_test_data, )

            y_pred_test = model_hdl.predict(train_test_data[2], True)  #used to evaluate model performance

            roc.append(roc_auc_score(train_test_data[3], y_pred_test))
            times_optuna.append(time.time() - start)

        print('\nBAYES OPTIMIZATION WITH SKLEARN')
        print('Mean time : ' + str(np.mean(times_sk)))
        print('Mean ROC : ' + str(np.mean(roc_sk)))
        print('--------------\n')
        print('OPTUNA')
        print('Fixed time : ' + str(np.mean(times_optuna)))
        print('Mean ROC : ' + str(np.mean(roc)))
        print('--------------\n')

    if flag_dict['optimize_bayes']:
        print('Doing Bayes optimization of hyperparameters\n')
        start = time.time()
        model_hdl.optimize_params_bayes(train_test_data, params_range, 'roc_auc',
                                        n_iter=700, njobs=flag_dict['n_jobs'])
        print('Elapsed time: ' + str(time.time() - start))

    if flag_dict['optimize_optuna']:
        print('Doing Optuna optimization of hyperparameters\n')
        # parameters with a string value are passed to optuna as they are
        for key in params:
            if isinstance(params[key], str):
                params_range[key] = params[key]

        study = model_hdl.optimize_params_optuna(train_test_data, params_range,
                                                 scoring='roc_auc',
                                                 timeout=flag_dict['timeout'],
                                                 n_jobs=flag_dict['n_jobs'],
                                                 n_trials=None)

        print('Parameters optimization done!\n')

        if flag_dict['plot_optim']:
            print('Saving optimization plots')
            fig = optuna.visualization.plot_slice(study)
            fig.write_image(training_fig_path + '/optuna_slice.png')

            fig = optuna.visualization.plot_optimization_history(study)
            fig.write_image(training_fig_path + '/optuna_history.png')

            # parameter-importance and contour plots are currently disabled
            print('Done\n')

        import joblib

        joblib.dump(study, filename_dict['analysis_path'] + "model/study.pkl")

    model_hdl.train_test_model(train_test_data, )
    print(model_hdl.get_model_params())

    print('Predicting values on training and test datas')
    y_pred_train = model_hdl.predict(train_test_data[0], True)
    y_pred_test = model_hdl.predict(train_test_data[2], True)  #used to evaluate model performance
    print('Prediction done\n')

    plt.rcParams["figure.figsize"] = (10, 7)
    leg_labels = ['background', 'signal']

    # each plt.savefig below saves the current figure
    print('Saving Output comparison plot')
    plt.figure()
    plot_utils.plot_output_train_test(model_hdl, train_test_data, 100, True,
                                      leg_labels, True, density=False)
    plt.savefig(training_fig_path + '/output_train_test.png', dpi=300, facecolor='white')
    plt.close()
    print('Done\n')

    print('Saving ROC AUC plot')
    plt.figure()
    roc_train_test_fig = plot_utils.plot_roc_train_test(train_test_data[3], y_pred_test,
                                                        train_test_data[1], y_pred_train,
                                                        None, leg_labels)  #ROC AUC plot
    plt.savefig(training_fig_path + '/ROC_AUC_train_test.png', dpi=300, facecolor='white')

    # the figure object is pickled as well, next to the image
    import pickle
    with open(training_fig_path + '/ROC_AUC_train_test.pickle', 'wb') as f:
        pickle.dump(roc_train_test_fig, f)
    plt.close()
    print('Done\n')

    print('Saving feature importance plots')
    plt.figure()
    feat_imp_1, feat_imp_2 = plot_utils.plot_feature_imp(train_test_data[2],
                                                         train_test_data[3],
                                                         model_hdl,
                                                         approximate=True)
    feat_imp_1.savefig(training_fig_path + '/feature_importance_HIPE4ML_violin.png',
                       dpi=300, facecolor='white')
    feat_imp_2.savefig(training_fig_path + '/feature_importance_HIPE4ML_bar.png',
                       dpi=300, facecolor='white')
    plt.close()
    print('Done\n')

    efficiency_score_conversion(train_test_data, y_pred_test, filename_dict)

    return train_test_data, y_pred_test, model_hdl
def main(): #pylint: disable=too-many-statements, too-many-branches
    """
    Apply saved ML models to input files and store the scored candidates.

    The YAML configuration named on the command line is read, one saved
    ModelHandler per pT bin is loaded, and every input file is sliced in
    'pt_cand' and scored bin by bin. For each (file, pT bin) pair the selected
    columns plus the model output are written to a parquet file inside
    '<output_dir>/pt<min>_<max>'.

    Configuration entries read here: 'pt_ranges' ('min', 'max'),
    'output' -> 'out_labels' ('Bkg', 'Prompt', 'FD'),
    'appl' -> 'column_to_save_list', 'ml' ('saved_models', 'raw_output') and
    'standalone_appl' ('inputs', 'output_names', 'output_dir').

    The process exits with status 1 if a model path is not a string or if
    'column_to_save_list' is not a list.
    """
    # read config file
    parser = argparse.ArgumentParser(description='Arguments to pass')
    # NOTE(review): a positional argument without nargs='?' is required, so the
    # default below is never used - confirm whether it was meant to be optional
    parser.add_argument('cfgFileName', metavar='text', default='cfgFileNameML.yml',
                        help='config file name for ml')
    args = parser.parse_args()

    print('Loading analysis configuration: ...', end='\r')
    with open(args.cfgFileName, 'r') as ymlCfgFile:
        inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
    print('Loading analysis configuration: Done!')

    PtBins = [[a, b] for a, b in zip(inputCfg['pt_ranges']['min'],
                                     inputCfg['pt_ranges']['max'])]
    OutputLabels = [
        inputCfg['output']['out_labels']['Bkg'],
        inputCfg['output']['out_labels']['Prompt']
    ]
    if inputCfg['output']['out_labels']['FD'] is not None:
        OutputLabels.append(inputCfg['output']['out_labels']['FD'])
    ColumnsToSave = inputCfg['appl']['column_to_save_list']
    ModelList = inputCfg['ml']['saved_models']

    # one model handler per pT bin, in the same order as PtBins
    ModelHandls = []
    for iBin in range(len(PtBins)):
        ModelPath = ModelList[iBin]
        if not isinstance(ModelPath, str):
            print('\033[91mERROR: path to model not correctly defined!\033[0m')
            sys.exit(1)  # non-zero status so that callers can detect the failure
        ModelPath = os.path.expanduser(ModelPath)
        print(f'Loaded saved model: {ModelPath}')
        ModelHandl = ModelHandler()
        ModelHandl.load_model_handler(ModelPath)
        ModelHandls.append(ModelHandl)

    for inputFile, outName in zip(inputCfg['standalone_appl']['inputs'],
                                  inputCfg['standalone_appl']['output_names']):
        print(f'Loading and preparing data file {inputFile}: ...', end='\r')
        DataHandler = TreeHandler(inputFile)
        DataHandler.slice_data_frame('pt_cand', PtBins, True)
        print(f'Loading and preparing data files {inputFile}: Done!')

        print('Applying ML model to dataframes: ...', end='\r')
        for iBin, PtBin in enumerate(PtBins):
            OutPutDirPt = os.path.join(
                os.path.expanduser(inputCfg['standalone_appl']['output_dir']),
                f'pt{PtBin[0]}_{PtBin[1]}')
            if os.path.isdir(OutPutDirPt):
                print((
                    f'\033[93mWARNING: Output directory \'{OutPutDirPt}\' already exists,'
                    ' overwrites possibly ongoing!\033[0m'))
            else:
                os.makedirs(OutPutDirPt)

            DataDfPtSel = DataHandler.get_slice(iBin)
            yPred = ModelHandls[iBin].predict(DataDfPtSel, inputCfg['ml']['raw_output'])

            if not isinstance(ColumnsToSave, list):
                print(
                    '\033[91mERROR: column_to_save_list must be defined!\033[0m'
                )
                sys.exit(1)  # non-zero status so that callers can detect the failure
            # Work on a copy: dropping 'pt_B' for a slice that lacks it must not
            # remove it from the configured list used for the following bins and
            # input files (e.g. an MC file processed after a data file).
            ColumnsToSaveFinal = list(ColumnsToSave)
            if 'inv_mass' not in ColumnsToSaveFinal:
                print(
                    '\033[93mWARNING: inv_mass is not going to be saved in the output dataframe!\033[0m'
                )
            if 'pt_cand' not in ColumnsToSaveFinal:
                print(
                    '\033[93mWARNING: pt_cand is not going to be saved in the output dataframe!\033[0m'
                )
            if 'pt_B' in ColumnsToSaveFinal and 'pt_B' not in DataDfPtSel.columns:
                ColumnsToSaveFinal.remove('pt_B')  # only in MC

            DataDfPtSel = DataDfPtSel.loc[:, ColumnsToSaveFinal]
            if ModelHandls[iBin].get_n_classes() < 3:
                # fewer than three classes: a single output column
                DataDfPtSel['ML_output'] = yPred
            else:
                # otherwise one output column per label, taken from the columns of yPred
                for Pred, Lab in enumerate(OutputLabels):
                    DataDfPtSel[f'ML_output_{Lab}'] = yPred[:, Pred]
            DataDfPtSel.to_parquet(
                f'{OutPutDirPt}/{outName}_pT_{PtBin[0]}_{PtBin[1]}_ModelApplied.parquet.gzip'
            )
            del DataDfPtSel
        print('Applying ML model to dataframes: Done!')
def benchmark_hyperparam_optimizers(filename_dict, params, params_range, flag_dict, presel_dict,
                                    training_variables='', testsize=0.75):
    """
    Compare the Bayesian and the Optuna hyperparameter optimisations.

    The MC signal and the training background are loaded, the background is
    preselected and limited to at most four times the number of signal
    candidates, and a train/test sample is generated. The Bayesian optimisation
    is run first and timed (optimisation + training + prediction); Optuna is
    then run with the mean Bayesian time as timeout. For each optimiser the
    ROC AUC on the test set is computed and a summary is printed.

    Parameters
    ----------
    filename_dict: dict
        Must provide 'data_path', 'MC_signal_filename', 'MC_signal_table',
        'train_bckg_filename' and 'train_bckg_table'
    params:
        Not used by this function, kept for interface compatibility
    params_range:
        Hyperparameter ranges forwarded to both optimisers
    flag_dict:
        Not used by this function, kept for interface compatibility
    presel_dict: dict
        Must provide 'train_bckg_presel', applied to the background sample
    training_variables: list or ''
        Training columns; if '' all the columns of the training set are used
    testsize: float
        Forwarded to train_test_generator as test_size
    """
    import time
    from sklearn.metrics import roc_auc_score

    N_run = 1
    data_path = filename_dict['data_path']

    print('Loading MC signal')
    mc_signal = TreeHandler()
    mc_signal.get_handler_from_large_file(
        file_name=data_path + filename_dict['MC_signal_filename'],
        tree_name=filename_dict['MC_signal_table'])
    print('MC signal loaded\n')

    print('Loading background data for training')
    background_ls = TreeHandler()
    background_ls.get_handler_from_large_file(
        file_name=data_path + filename_dict['train_bckg_filename'],
        tree_name=filename_dict['train_bckg_table'])
    background_ls.apply_preselections(presel_dict['train_bckg_presel'])
    background_ls.shuffle_data_frame(size=min(background_ls.get_n_cand(),
                                              mc_signal.get_n_cand() * 4))
    print('Done\n')

    train_test_data = train_test_generator([mc_signal, background_ls], [1, 0],
                                           test_size=testsize)
    if training_variables == '':
        training_variables = train_test_data[0].columns.tolist()

    model_clf = xgb.XGBClassifier()
    model_hdl = ModelHandler(model_clf, training_variables)

    times = []
    # separate score lists, so that the Optuna average is not mixed with the Bayes one
    roc_bayes = []
    roc_optuna = []

    for _ in range(N_run):
        start = time.time()
        model_hdl.optimize_params_bayes(train_test_data, params_range, 'roc_auc', njobs=-1)
        model_hdl.train_test_model(train_test_data, )
        y_pred_test = model_hdl.predict(train_test_data[2], True)  # used to evaluate model performance
        roc_bayes.append(roc_auc_score(train_test_data[3], y_pred_test))
        times.append(time.time() - start)

    print('BAYES OPTIMIZATION WITH SKLEARN')
    # average the measured durations ('times'), not the 'time' module
    print('Mean time : ' + str(np.mean(times)))
    print('Mean ROC : ' + str(np.mean(roc_bayes)))
    print('--------------\n')

    for _ in range(N_run):
        # Optuna gets the same time budget that the Bayesian optimisation needed on average
        model_hdl.optimize_params_optuna(train_test_data, params_range, 'roc_auc',
                                         timeout=np.mean(times), n_jobs=-1)
        model_hdl.train_test_model(train_test_data, )
        y_pred_test = model_hdl.predict(train_test_data[2], True)  # used to evaluate model performance
        roc_optuna.append(roc_auc_score(train_test_data[3], y_pred_test))

    print('OPTUNA')
    print('Fixed time : ' + str(np.mean(times)))
    print('Mean ROC : ' + str(np.mean(roc_optuna)))
    print('--------------\n')
corr[0].savefig(results_ml_path + "/correlations.png", bbox_inches='tight') print("---------------------------------------------") print("Data loaded. Training and testing ....") params_range = { "max_depth": (8, 18), "learning_rate": (0.07, 0.15), "n_estimators": (150, 250), "gamma": (0.3, 0.5), "min_child_weight": (3, 8), "subsample": (0.5, 1), "colsample_bytree": (0.3, 1), } model_hdl = ModelHandler(xgb.XGBClassifier(), training_columns) model_hdl.set_model_params(MODEL_PARAMS) model_hdl.set_model_params(HYPERPARAMS) if optmize: model_hdl.optimize_params_bayes(train_test_data, params_range, 'roc_auc', njobs=-1, init_points=10, n_iter=20) y_pred_test = model_hdl.train_test_model(train_test_data, True, True) bdt_out_plot = pu.plot_output_train_test(model_hdl, train_test_data, 100,
class Optimiserhipe4mltree:
    """
    Train, optimise and plot a hipe4ml XGBoost model for one bin.

    The signal and data trees are read from the paths stored in
    'inputtreemc' and 'inputtreedata', a background sample is selected from
    the data, and a ModelHandler wrapping an xgb.XGBClassifier is prepared.
    The do_* methods then perform the hyperparameter optimisation, the
    training/testing and the production of the control plots.
    """
    # Class Attribute
    species = "optimiser_hipe4mltree"

    def __init__(self, data_param, binmin, binmax, training_var, bkg_sel, hyper_pars):
        """
        Read the configuration, prepare the samples and create the model handler.

        Parameters
        ----------
        data_param: dict
            Configuration; the 'ml' and 'hipe4ml' entries are read
        binmin, binmax:
            Bin limits, only used to build the names of the output files
        training_var:
            Not used here: the training variables are taken from the signal
            handler in load_hipe4mlmodel
        bkg_sel: str
            Selection applied to the data to obtain the background sample
        hyper_pars:
            Hyperparameters applied by set_hipe4ml_modelpar
        """
        self.logger = get_logger()
        # directory
        #self.do_mlprefilter = datap.get("doml_asprefilter", None)
        self.dirmlout = data_param["ml"]["mlout"]
        self.dirmlplot = data_param["ml"]["mlplot"]
        #if self.do_mlprefilter is True:
        #    self.dirmodel = self.dirmodel + "/prefilter"
        #    self.dirmlplot = self.dirmlplot + "/prefilter"
        #if self.do_mlprefilter is False:
        #    self.dirmodel = self.dirmodel + "/analysis"
        #    self.dirmlplot = self.dirmlplot + "/analysis"

        self.inputtreedata = "/Users/lvermunt/cernbox/Analyses/ML/input/hipe4mlTTree/data.root"
        self.inputtreemc = "/Users/lvermunt/cernbox/Analyses/ML/input/hipe4mlTTree/prompt.root"
        self.v_train = None
        self.p_binmin = binmin
        self.p_binmax = binmax

        self.s_selsigml = ""
        self.s_selbkgml = bkg_sel #"inv_mass < 1.82 or 1.92 < inv_mass < 2.00"
        self.v_bkgoversigfrac = 3
        self.v_sig = 1
        self.v_bkg = 0
        self.rnd_splt = data_param["ml"]["rnd_splt"]
        self.test_frac = data_param["ml"]["test_frac"]

        self.prompthandler = None
        # declared here for clarity, filled by preparesample()
        self.signalhandler = None
        self.datahandler = None
        self.bkghandler = None
        self.traintestdata = None
        self.ypredtrain_hipe4ml = None
        self.ypredtest_hipe4ml = None

        self.preparesample()

        self.p_hipe4ml_model = None
        self.v_hipe4ml_pars = hyper_pars
        self.load_hipe4mlmodel()

        self.bayesoptconfig_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"][
            "bayes_opt_config"]
        self.average_method_hipe4ml = data_param["hipe4ml"]["roc_auc_average"]
        self.nfold_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["nfolds"]
        self.init_points = data_param["hipe4ml"]["hyper_par_opt"]["initpoints"]
        self.n_iter_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["niter"]
        self.njobs_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["njobs"]
        self.roc_method_hipe4ml = data_param["hipe4ml"]["roc_auc_approach"]
        self.raw_output_hipe4ml = data_param["hipe4ml"]["raw_output"]
        self.train_test_log_hipe4ml = data_param["hipe4ml"]["train_test_log"]

        self.multiclass_labels = data_param["ml"].get("multiclass_labels", None)

        self.logger.info("Using the following training variables: %s",
                         self.v_train)

    def preparesample(self):
        """
        Load the signal and data trees and build the train/test sample.

        The background is the subset of the data passing 's_selbkgml', with a
        size of 'v_bkgoversigfrac' times the number of signal candidates.
        """
        self.logger.info("Prepare Sample for hipe4ml")

        self.signalhandler = TreeHandler(self.inputtreemc, 'treeMLDplus')
        nsigcand = self.signalhandler.get_n_cand()
        self.datahandler = TreeHandler(self.inputtreedata, 'treeMLDplus')
        self.bkghandler = self.datahandler.get_subset(self.s_selbkgml,
                                                      size=nsigcand * self.v_bkgoversigfrac)
        self.traintestdata = train_test_generator(
            [self.signalhandler, self.bkghandler], [self.v_sig, self.v_bkg],
            test_size=self.test_frac, random_state=self.rnd_splt)

    def load_hipe4mlmodel(self):
        """
        Create the ModelHandler around a default xgb.XGBClassifier.

        The training variables are all the variables of the signal handler
        except 'inv_mass' and 'pt_cand'.
        """
        self.logger.info("Loading hipe4ml model")
        self.v_train = self.signalhandler.get_var_names()
        self.v_train.remove('inv_mass')
        self.v_train.remove('pt_cand')

        model_xgboost = xgb.XGBClassifier()
        self.p_hipe4ml_model = ModelHandler(model_xgboost, self.v_train)

    def set_hipe4ml_modelpar(self):
        """Apply the hyperparameters received at construction to the model."""
        self.logger.info("Setting hipe4ml hyperparameters")
        self.p_hipe4ml_model.set_model_params(self.v_hipe4ml_pars)

    def do_hipe4mlhyperparopti(self):
        """
        Run the Bayesian hyperparameter optimisation.

        Everything printed during the optimisation is written to
        '<dirmlout>/HyperParOpt_pT_<binmin>_<binmax>.txt'. The output stream is
        restored and the file is closed also when the optimisation raises.
        """
        self.logger.info("Optimising hipe4ml hyperparameters (Bayesian)")

        if not (self.average_method_hipe4ml in ['macro', 'weighted'] and
                self.roc_method_hipe4ml in ['ovo', 'ovr']):
            self.logger.fatal("Selected ROC configuration is not valid!")

        if self.average_method_hipe4ml == 'weighted':
            metric = f'roc_auc_{self.roc_method_hipe4ml}_{self.average_method_hipe4ml}'
        else:
            metric = f'roc_auc_{self.roc_method_hipe4ml}'

        hypparsfile = f'{self.dirmlout}/HyperParOpt_pT_{self.p_binmin}_{self.p_binmax}.txt'
        with open(hypparsfile, 'wt') as outfilehyppars:
            # remember the active stream, so that a stream installed by the
            # caller is put back instead of being replaced by sys.__stdout__
            previous_stdout = sys.stdout
            sys.stdout = outfilehyppars
            try:
                self.p_hipe4ml_model.optimize_params_bayes(self.traintestdata,
                                                           self.bayesoptconfig_hipe4ml,
                                                           metric, self.nfold_hipe4ml,
                                                           self.init_points,
                                                           self.n_iter_hipe4ml,
                                                           self.njobs_hipe4ml)
            finally:
                sys.stdout = previous_stdout
        self.logger.info("Performing hyper-parameters optimisation: Done!")

    def do_hipe4mltrain(self):
        """
        Train and test the model, store the predictions and dump the model.

        The model handler is saved as
        '<dirmlout>/ModelHandler_pT_<binmin>_<binmax>.pkl' and the original
        model as '<dirmlout>/ModelHandler_pT_<binmin>_<binmax>.model'.
        """
        self.logger.info("Training + testing hipe4ml model")
        t0 = time.time()

        self.p_hipe4ml_model.train_test_model(self.traintestdata,
                                              self.average_method_hipe4ml,
                                              self.roc_method_hipe4ml)
        self.ypredtrain_hipe4ml = self.p_hipe4ml_model.predict(
            self.traintestdata[0], self.raw_output_hipe4ml)
        self.ypredtest_hipe4ml = self.p_hipe4ml_model.predict(
            self.traintestdata[2], self.raw_output_hipe4ml)

        modelhandlerfile = f'{self.dirmlout}/ModelHandler_pT_{self.p_binmin}_{self.p_binmax}.pkl'
        self.p_hipe4ml_model.dump_model_handler(modelhandlerfile)
        modelfile = f'{self.dirmlout}/ModelHandler_pT_{self.p_binmin}_{self.p_binmax}.model'
        self.p_hipe4ml_model.dump_original_model(modelfile)

        self.logger.info("Training + testing hipe4ml: Done!")
        self.logger.info("Time elapsed = %.3f", time.time() - t0)

    def do_hipe4mlplot(self):
        """
        Produce the control plots and save them as PDF files in 'dirmlplot'.

        Saved figures: variable distributions, correlation matrices, model
        output distributions, ROC curves (test only and train vs test),
        precision-recall curve and feature importances. The predictions
        stored by do_hipe4mltrain are used for the ROC and precision-recall
        plots.
        """
        self.logger.info("Plotting hipe4ml model")

        leglabels = ["Background", "Prompt signal"]
        outputlabels = ["Bkg", "SigPrompt"]

        # _____________________________________________
        plot_utils.plot_distr([self.bkghandler, self.signalhandler],
                              self.v_train, 100, leglabels)
        plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96,
                            hspace=0.55, wspace=0.55)
        figname = f'{self.dirmlplot}/DistributionsAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        plt.savefig(figname)
        plt.close('all')
        # _____________________________________________
        corrmatrixfig = plot_utils.plot_corr(
            [self.bkghandler, self.signalhandler], self.v_train, leglabels)
        for figg, labb in zip(corrmatrixfig, outputlabels):
            plt.figure(figg.number)
            plt.subplots_adjust(left=0.2, bottom=0.25, right=0.95, top=0.9)
            figname = f'{self.dirmlplot}/CorrMatrix{labb}_pT_{self.p_binmin}_{self.p_binmax}.pdf'
            figg.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 7)
        mloutputfig = plot_utils.plot_output_train_test(
            self.p_hipe4ml_model, self.traintestdata, 80, self.raw_output_hipe4ml,
            leglabels, self.train_test_log_hipe4ml, density=True)
        figname = f'{self.dirmlplot}/MLOutputDistr_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        mloutputfig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 9)
        roccurvefig = plot_utils.plot_roc(self.traintestdata[3], self.ypredtest_hipe4ml,
                                          None, leglabels, self.average_method_hipe4ml,
                                          self.roc_method_hipe4ml)
        figname = f'{self.dirmlplot}/ROCCurveAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        roccurvefig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 9)
        roccurvettfig = plot_utils.plot_roc_train_test(
            self.traintestdata[3], self.ypredtest_hipe4ml,
            self.traintestdata[1], self.ypredtrain_hipe4ml, None, leglabels,
            self.average_method_hipe4ml, self.roc_method_hipe4ml)
        figname = f'{self.dirmlplot}/ROCCurveTrainTest_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        roccurvettfig.savefig(figname)
        # _____________________________________________
        precisionrecallfig = plot_utils.plot_precision_recall(
            self.traintestdata[3], self.ypredtest_hipe4ml, leglabels)
        figname = f'{self.dirmlplot}/PrecisionRecallAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        precisionrecallfig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (12, 7)
        featuresimportancefig = plot_utils.plot_feature_imp(
            self.traintestdata[2][self.v_train], self.traintestdata[3],
            self.p_hipe4ml_model, leglabels)
        for i, featfig in enumerate(featuresimportancefig):
            figname = (f'{self.dirmlplot}/FeatureImportanceOpt{i}_'
                       f'pT_{self.p_binmin}_{self.p_binmax}.pdf')
            featfig.savefig(figname)