Example #1
def test_param_conversion():
    """
    Test the model parameter type conversion functionality
    """
    objective_sp = 'multi:softprob' # let's make sonarcloud happy
    init_dict = {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1., 'colsample_bynode': 1.,
                 'colsample_bytree': 1., 'gamma': 1., 'learning_rate': 1., 'max_delta_step': 0, 'max_depth': 3,
                 'min_child_weight': 1, 'missing': np.nan, 'n_estimators': 100, 'n_jobs': 1,
                 'objective': objective_sp, 'random_state': 0, 'reg_alpha': 1., 'reg_lambda': 1.,
                 'scale_pos_weight': 1., 'subsample': 1., 'verbosity': 1}

    orig_dict = {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 0.72, 'colsample_bynode': 0.81,
                 'colsample_bytree': 0.94, 'gamma': 5.5, 'learning_rate': 0.012, 'max_delta_step': 0.3,
                 'max_depth': 6.87, 'min_child_weight':  6.22, 'missing': np.nan, 'n_estimators': 1127.9, 'n_jobs': 1,
                 'objective': objective_sp, 'random_state': 0, 'reg_alpha': 0.4, 'reg_lambda': 2.2,
                 'scale_pos_weight': 11.4, 'subsample': 0.91, 'verbosity': 1}

    right_dict = {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 0.72, 'colsample_bynode': 0.81,
                  'colsample_bytree': 0.94, 'gamma': 5.5, 'learning_rate': 0.012, 'max_delta_step': 0,
                  'max_depth': 7, 'min_child_weight': 6, 'missing': np.nan, 'n_estimators': 1128, 'n_jobs': 1,
                  'objective': objective_sp, 'random_state': 0, 'reg_alpha': 0.4, 'reg_lambda': 2.2,
                  'scale_pos_weight': 11.4, 'subsample': 0.91, 'verbosity': 1}

    model = ModelHandler(xgb.XGBClassifier(), None, init_dict)
    converted_dict = model._ModelHandler__cast_model_params(orig_dict)  # pylint: disable=protected-access
    assert converted_dict == right_dict, 'Wrong conversion of model parameters!'
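The assertions above pin down the casting rule: float values assigned to integer-typed XGBoost parameters get rounded, everything else passes through unchanged. A minimal sketch of that rule (not the hipe4ml source; the set of integer parameters is inferred from the test itself):

# Sketch of the casting rule encoded by the test above (not the hipe4ml source)
INT_PARAMS = {'max_delta_step', 'max_depth', 'min_child_weight', 'n_estimators'}  # inferred from the test

def cast_params_sketch(params):
    # Round floats assigned to integer-typed parameters, leave the rest untouched
    return {key: round(val) if key in INT_PARAMS and isinstance(val, float) else val
            for key, val in params.items()}

assert cast_params_sketch({'max_depth': 6.87, 'subsample': 0.91}) == {'max_depth': 7, 'subsample': 0.91}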
Example #2
    def load_hipe4mlmodel(self):
        self.logger.info("Loading hipe4ml model")
        self.v_train = self.signalhandler.get_var_names()
        self.v_train.remove('inv_mass')
        self.v_train.remove('pt_cand')

        model_xgboost = xgb.XGBClassifier()
        self.p_hipe4ml_model = ModelHandler(model_xgboost, self.v_train)
Example #3
    def load_ML_analysis(self, cent_class, pt_range, ct_range, split=''):

        info_string = f'_{cent_class[0]}{cent_class[1]}_{pt_range[0]}{pt_range[1]}_{ct_range[0]}{ct_range[1]}{split}'

        handlers_path = os.environ['HYPERML_MODELS_{}'.format(
            self.mode)] + '/handlers'
        efficiencies_path = os.environ['HYPERML_EFFICIENCIES_{}'.format(
            self.mode)]

        filename_handler = handlers_path + '/model_handler' + info_string + '.pkl'
        filename_efficiencies = efficiencies_path + '/Eff_Score' + info_string + '.npy'

        eff_score_array = np.load(filename_efficiencies)

        model_handler = ModelHandler()
        model_handler.load_model_handler(filename_handler)

        return eff_score_array, model_handler
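The Eff_Score file loaded here is a two-row array: Example #16 below reads eff_score_array[1][-1] as a score threshold, so row 0 holds BDT efficiencies and row 1 the matching score cuts. A sketch of producing such a file with the same hipe4ml helper used in Example #6 (toy labels, hypothetical file name):

import numpy as np
from hipe4ml import analysis_utils

# Toy labels and scores, only to make the sketch self-contained
y_true = np.random.randint(0, 2, 1000)
y_score = np.random.rand(1000)

efficiency, threshold = analysis_utils.bdt_efficiency_array(y_true, y_score, n_points=10)
np.save('Eff_Score_example.npy', np.array([efficiency, threshold]))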
Example #4
def train_test(inputCfg, PtBin, OutPutDirPt, TrainTestData, iBin):  #pylint: disable=too-many-statements, too-many-branches
    '''
    function for model training and testing
    '''
    n_classes = len(np.unique(TrainTestData[3]))
    modelClf = xgb.XGBClassifier(use_label_encoder=False)
    TrainCols = inputCfg['ml']['training_columns']
    HyperPars = inputCfg['ml']['hyper_par'][iBin]
    if not isinstance(TrainCols, list):
        print('\033[91mERROR: training columns must be defined!\033[0m')
        sys.exit()
    if not isinstance(HyperPars, dict):
        print(
            '\033[91mERROR: hyper-parameters must be defined or be an empty dict!\033[0m'
        )
        sys.exit()
    ModelHandl = ModelHandler(modelClf, TrainCols, HyperPars)

    # hyperparams optimization
    if inputCfg['ml']['hyper_par_opt']['do_hyp_opt']:
        print('Perform bayesian optimization')

        BayesOptConfig = inputCfg['ml']['hyper_par_opt']['bayes_opt_config']
        if not isinstance(BayesOptConfig, dict):
            print('\033[91mERROR: bayes_opt_config must be defined!\033[0m')
            sys.exit()

        if n_classes > 2:
            average_method = inputCfg['ml']['roc_auc_average']
            roc_method = inputCfg['ml']['roc_auc_approach']
            if not (average_method in ['macro', 'weighted']
                    and roc_method in ['ovo', 'ovr']):
                print(
                    '\033[91mERROR: selected ROC configuration is not valid!\033[0m'
                )
                sys.exit()

            if average_method == 'weighted':
                metric = f'roc_auc_{roc_method}_{average_method}'
            else:
                metric = f'roc_auc_{roc_method}'
        else:
            metric = 'roc_auc'

        print('Performing hyper-parameters optimisation: ...', end='\r')
        OutFileHypPars = open(
            f'{OutPutDirPt}/HyperParOpt_pT_{PtBin[0]}_{PtBin[1]}.txt', 'wt')
        sys.stdout = OutFileHypPars
        ModelHandl.optimize_params_bayes(
            TrainTestData,
            BayesOptConfig,
            metric,
            nfold=inputCfg['ml']['hyper_par_opt']['nfolds'],
            init_points=inputCfg['ml']['hyper_par_opt']['initpoints'],
            n_iter=inputCfg['ml']['hyper_par_opt']['niter'],
            njobs=inputCfg['ml']['hyper_par_opt']['njobs'])
        OutFileHypPars.close()
        sys.stdout = sys.__stdout__
        print('Performing hyper-parameters optimisation: Done!')
        print(
            f'Output saved in {OutPutDirPt}/HyperParOpt_pT_{PtBin[0]}_{PtBin[1]}.txt'
        )
        print(f'Best hyper-parameters:\n{ModelHandl.get_model_params()}')
    else:
        ModelHandl.set_model_params(HyperPars)

    # train and test the model with the updated hyper-parameters
    yPredTest = ModelHandl.train_test_model(
        TrainTestData,
        True,
        output_margin=inputCfg['ml']['raw_output'],
        average=inputCfg['ml']['roc_auc_average'],
        multi_class_opt=inputCfg['ml']['roc_auc_approach'])
    yPredTrain = ModelHandl.predict(TrainTestData[0],
                                    inputCfg['ml']['raw_output'])

    # save model handler in pickle
    ModelHandl.dump_model_handler(
        f'{OutPutDirPt}/ModelHandler_pT_{PtBin[0]}_{PtBin[1]}.pickle')
    ModelHandl.dump_original_model(
        f'{OutPutDirPt}/XGBoostModel_pT_{PtBin[0]}_{PtBin[1]}.model', True)

    #plots
    LegLabels = [
        inputCfg['output']['leg_labels']['Bkg'],
        inputCfg['output']['leg_labels']['Prompt']
    ]
    if inputCfg['output']['leg_labels']['FD'] is not None:
        LegLabels.append(inputCfg['output']['leg_labels']['FD'])
    OutputLabels = [
        inputCfg['output']['out_labels']['Bkg'],
        inputCfg['output']['out_labels']['Prompt']
    ]
    if inputCfg['output']['out_labels']['FD'] is not None:
        OutputLabels.append(inputCfg['output']['out_labels']['FD'])
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (10, 7)
    MLOutputFig = plot_utils.plot_output_train_test(
        ModelHandl,
        TrainTestData,
        80,
        inputCfg['ml']['raw_output'],
        LegLabels,
        inputCfg['plots']['train_test_log'],
        density=True)
    if n_classes > 2:
        for Fig, Lab in zip(MLOutputFig, OutputLabels):
            Fig.savefig(
                f'{OutPutDirPt}/MLOutputDistr{Lab}_pT_{PtBin[0]}_{PtBin[1]}.pdf'
            )
    else:
        MLOutputFig.savefig(
            f'{OutPutDirPt}/MLOutputDistr_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (10, 9)
    ROCCurveFig = plot_utils.plot_roc(TrainTestData[3], yPredTest, None,
                                      LegLabels,
                                      inputCfg['ml']['roc_auc_average'],
                                      inputCfg['ml']['roc_auc_approach'])
    ROCCurveFig.savefig(
        f'{OutPutDirPt}/ROCCurveAll_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    pickle.dump(
        ROCCurveFig,
        open(f'{OutPutDirPt}/ROCCurveAll_pT_{PtBin[0]}_{PtBin[1]}.pkl', 'wb'))
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (10, 9)
    ROCCurveTTFig = plot_utils.plot_roc_train_test(
        TrainTestData[3], yPredTest, TrainTestData[1], yPredTrain, None,
        LegLabels, inputCfg['ml']['roc_auc_average'],
        inputCfg['ml']['roc_auc_approach'])
    ROCCurveTTFig.savefig(
        f'{OutPutDirPt}/ROCCurveTrainTest_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    #_____________________________________________
    PrecisionRecallFig = plot_utils.plot_precision_recall(
        TrainTestData[3], yPredTest, LegLabels)
    PrecisionRecallFig.savefig(
        f'{OutPutDirPt}/PrecisionRecallAll_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (12, 7)
    FeaturesImportanceFig = plot_utils.plot_feature_imp(
        TrainTestData[2][TrainCols], TrainTestData[3], ModelHandl, LegLabels)
    n_plot = n_classes if n_classes > 2 else 1
    for iFig, Fig in enumerate(FeaturesImportanceFig):
        if iFig < n_plot:
            label = OutputLabels[iFig] if n_classes > 2 else ''
            Fig.savefig(
                f'{OutPutDirPt}/FeatureImportance{label}_pT_{PtBin[0]}_{PtBin[1]}.pdf'
            )
        else:
            Fig.savefig(
                f'{OutPutDirPt}/FeatureImportanceAll_pT_{PtBin[0]}_{PtBin[1]}.pdf'
            )

    return ModelHandl
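The inputCfg reads above imply a configuration of roughly this shape. The following is a sketch reconstructed from those accesses, with placeholder values, not the actual analysis YAML:

# Sketch of the config structure train_test() expects (placeholder values)
inputCfg_sketch = {
    'ml': {
        'training_columns': ['d_len', 'cos_p'],   # placeholder feature names
        'hyper_par': [{'max_depth': 3}],          # one dict per pT bin, indexed by iBin
        'hyper_par_opt': {'do_hyp_opt': False, 'bayes_opt_config': {'max_depth': (2, 6)},
                          'nfolds': 5, 'initpoints': 10, 'niter': 10, 'njobs': -1},
        'roc_auc_average': 'macro',               # 'macro' or 'weighted'
        'roc_auc_approach': 'ovo',                # 'ovo' or 'ovr'
        'raw_output': False,
    },
    'output': {'leg_labels': {'Bkg': 'Background', 'Prompt': 'Prompt', 'FD': None},
               'out_labels': {'Bkg': 'Bkg', 'Prompt': 'Prompt', 'FD': None}},
    'plots': {'train_test_log': True},
}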
Example #5
plt.xlabel('Iteration')
plt.ylabel('ROC AUC')
plt.legend()

plt.savefig('../opt_comp.png', dpi = 100, facecolor = 'white')
plt.close()

##################################################################################

# BEST HYPERPARAMETERS FOR EACH METHOD

names = ['Opt_test_OPTUNA', 'Opt_test_BAYES', 'Opt_test_DEFAULT', 'Opt_test_PbPb']

if False:
    for name in names:
        model_hdl = ModelHandler()
        model_hdl.load_model_handler('../analysis_results/' + name + '/model/model_hdl')

        print(name)
        print(model_hdl.get_model_params())
        print('\n---------------\n')

##################################################################################

# PLOT SUPERIMPOSED ROC
'''
plt.close()
objects = []

for n in names:
    with (open('../analysis_results/' + n + '/images/training/ROC_AUC_train_test.pickle', "rb")) as openfile:
Example #6
from hipe4ml.model_handler import ModelHandler

# data preparation
DIGITS_DATA = datasets.load_digits(n_class=2)
DIGITS = pd.DataFrame(DIGITS_DATA.data[:, 0:10])     # pylint: disable=E1101
Y_DIGITS = DIGITS_DATA.target       # pylint: disable=E1101
SIG_DF = DIGITS[Y_DIGITS == 1]
BKG_DF = DIGITS[Y_DIGITS == 0]
TRAIN_SET, TEST_SET, Y_TRAIN, Y_TEST = train_test_split(
    DIGITS, Y_DIGITS, test_size=0.5, random_state=42)
DATA = [TRAIN_SET, Y_TRAIN, TEST_SET, Y_TEST]
# --------------------------------------------

# training and testing
INPUT_MODEL = xgb.XGBClassifier()
MODEL = ModelHandler(INPUT_MODEL)
MODEL.train_test_model(DATA)
Y_PRED = MODEL.predict(DATA[2])
Y_PRED_TRAIN = MODEL.predict(DATA[0])
EFFICIENCY, THRESHOLD = analysis_utils.bdt_efficiency_array(DATA[3], Y_PRED, n_points=10)
# --------------------------------------------


def test_plot_distr():
    """
    Test the feature distribution plot
    """
    assert isinstance(plot_utils.plot_distr(
        [SIG_DF, BKG_DF], SIG_DF.columns), np.ndarray)
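EFFICIENCY and THRESHOLD computed above map each BDT score cut to a signal efficiency; a quick way to visualise the working-point scan (a sketch, output name hypothetical):

import matplotlib.pyplot as plt

# Efficiency vs. score threshold from the arrays computed above
plt.plot(THRESHOLD, EFFICIENCY)
plt.xlabel('BDT score threshold')
plt.ylabel('Signal efficiency')
plt.savefig('bdt_efficiency_scan.png')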

Example #7
def train_test(inputCfg, PtMin, PtMax, OutPutDirPt, TrainTestData):
    '''
    function for model training and testing
    '''
    modelClf = xgb.XGBClassifier()
    TrainCols = inputCfg['ml']['training_columns']
    HyperPars = inputCfg['ml']['hyper_par']
    if not isinstance(TrainCols, list):
        print('ERROR: training columns must be defined!')
        sys.exit()
    if not isinstance(HyperPars, dict):
        print('ERROR: hyper-parameters must be defined or be an empty dict!')
        sys.exit()
    ModelHandl = ModelHandler(modelClf, TrainCols, HyperPars)

    # hyperparams optimization --> not working with multi-class classification at the moment
    #HypRanges = {
    #    # # defines the maximum depth of a single tree (regularization)
    #    'max_depth': (1, 30),
    #    'learning_rate': (0.01, 0.3),  # learning rate
    #    'n_estimators': (50, 1000)  # number of boosting trees
    #}
    #ModelHandl.optimize_params_bayes(TrainTestData, HypRanges, None)

    # train and test the model with the updated hyperparameters
    ModelHandl.train_test_model(TrainTestData)
    yPredTest = ModelHandl.predict(TrainTestData[2],
                                   inputCfg['ml']['raw_output'], True)

    # save model handler in pickle
    ModelHandl.dump_model_handler(
        f'{OutPutDirPt}/ModelHandler_pT_{PtMin}_{PtMax}.pickle')

    #plots
    LegLabels = inputCfg['output']['leg_labels']
    OutputLabels = inputCfg['output']['out_labels']
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (10, 7)
    MLOutputFig = plot_utils.plot_output_train_test(
        ModelHandl,
        TrainTestData,
        80,
        inputCfg['ml']['raw_output'],
        LegLabels,
        True,
        inputCfg['plots']['train_test_log'],
        density=True)
    for Fig, Lab in zip(MLOutputFig, OutputLabels):
        Fig.savefig(f'{OutPutDirPt}/MLOutputDistr{Lab}_pT_{PtMin}_{PtMax}.pdf')
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (8, 7)
    ROCCurveFig = plot_utils.plot_roc(TrainTestData[3], yPredTest, LegLabels)
    ROCCurveFig.savefig(f'{OutPutDirPt}/ROCCurveAll_pT_{PtMin}_{PtMax}.pdf')
    #_____________________________________________
    PrecisionRecallFig = plot_utils.plot_precision_recall(
        TrainTestData[3], yPredTest, LegLabels)
    PrecisionRecallFig.savefig(
        f'{OutPutDirPt}/PrecisionRecallAll_pT_{PtMin}_{PtMax}.pdf')
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (12, 7)
    FeaturesImportanceFig = plot_utils.plot_feature_imp(
        TrainTestData[2][TrainCols], TrainTestData[3], ModelHandl)
    for iFig, Fig in enumerate(FeaturesImportanceFig):
        if iFig < 3:
            Fig.savefig(
                f'{OutPutDirPt}/FeatureImportance{OutputLabels[iFig]}_pT_{PtMin}_{PtMax}.pdf'
            )
        else:
            Fig.savefig(
                f'{OutPutDirPt}/FeatureImportanceAll_pT_{PtMin}_{PtMax}.pdf')

    return ModelHandl
Example #8
def main():
    # read config file
    parser = argparse.ArgumentParser(description='Arguments to pass')
    parser.add_argument('cfgFileName',
                        metavar='text',
                        default='cfgFileNameML.yml',
                        help='config file name for ml')
    parser.add_argument("--train",
                        help="perform only training and testing",
                        action="store_true")
    parser.add_argument("--apply",
                        help="perform only application",
                        action="store_true")
    args = parser.parse_args()

    print('Loading analysis configuration: ...', end='\r')
    with open(args.cfgFileName, 'r') as ymlCfgFile:
        inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
    print('Loading analysis configuration: Done!')

    print('Loading data files: ...', end='\r')
    PromptDf = LoadDfFromRootOrParquet(inputCfg['input']['prompt'])
    FDDf = LoadDfFromRootOrParquet(inputCfg['input']['FD'])
    DataDf = LoadDfFromRootOrParquet(inputCfg['input']['data'])
    print('Loading data files: Done!')

    for iBin, (PtMin, PtMax) in enumerate(
            zip(inputCfg['pt_ranges']['min'], inputCfg['pt_ranges']['max'])):

        print(
            f'\n\033[94mStarting ML analysis --- {PtMin} < pT < {PtMax} GeV/c\033[0m'
        )

        OutPutDirPt = os.path.join(inputCfg['output']['dir'],
                                   f'pt{PtMin}_{PtMax}')
        if os.path.isdir(OutPutDirPt):
            print(
                'Output directory already exists, overwrites possibly ongoing!'
            )
        else:
            os.mkdir(OutPutDirPt)

        # data preparation
        #_____________________________________________
        TrainTestData, DataDfPtSel, PromptDfPtSelForEff, FDDfPtSelForEff = data_prep( \
            inputCfg, iBin, PtMin, PtMax, OutPutDirPt, DataDf, PromptDf, FDDf)

        # training, testing
        #_____________________________________________
        if not args.apply:
            ModelHandl = train_test(inputCfg, PtMin, PtMax, OutPutDirPt,
                                    TrainTestData)
        else:
            ModelList = inputCfg['ml']['saved_models']
            ModelPath = ModelList[iBin]
            if not isinstance(ModelPath, str):
                print('ERROR: path to model not correctly defined!')
                sys.exit()
            print(f'Loaded saved model: {ModelPath}')
            ModelHandl = ModelHandler()
            ModelHandl.load_model_handler(ModelPath)

        # model application
        #_____________________________________________
        if not args.train:
            appl(inputCfg, PtMin, PtMax, OutPutDirPt, ModelHandl, DataDfPtSel,
                 PromptDfPtSelForEff, FDDfPtSelForEff)

        # delete dataframes to release memory
        for data in TrainTestData:
            del data
        del DataDfPtSel, PromptDfPtSelForEff, FDDfPtSelForEff
Example #9
                    split_ineq_sign = '< 0.5'

            for i_cent_bins in range(len(CENTRALITY_LIST)):
                cent_bins = CENTRALITY_LIST[i_cent_bins]

                bin = f'{split}_{cent_bins[0]}_{cent_bins[1]}_{ct_bins[0]}_{ct_bins[1]}'
                ##############################################################
                # TRAINING AND TEST SET PREPARATION
                ##############################################################

                # features plot
                leg_labels = ['background', 'signal']

                model_clf = xgb.XGBClassifier(use_label_encoder=False,
                                              n_jobs=10)
                model_hdl = ModelHandler(model_clf, TRAINING_COLUMNS_LIST)
                model_hdl.set_model_params(HYPERPARAMS)

                # hyperparameters optimization and model training
                if not os.path.isdir('models'):
                    os.mkdir('models')
                bin_model = bin
                if MERGE_CENTRALITY:
                    bin_model = f'all_0_90_{ct_bins[0]}_{ct_bins[1]}'

                if OPTIMIZE and TRAIN:
                    model_hdl.optimize_params_bayes(train_test_data,
                                                    HYPERPARAMS_RANGES,
                                                    'roc_auc',
                                                    nfold=5,
                                                    init_points=10,
Example #10
'''

import os
import sys
import argparse

from hipe4ml.model_handler import ModelHandler

parser = argparse.ArgumentParser(description='Arguments to pass')
parser.add_argument('inFilePkl',
                    metavar='text',
                    default='model.pkl',
                    help='input pickle file to be converted')
args = parser.parse_args()

ModelPath = os.path.expanduser(args.inFilePkl)
print(f'Loaded saved model: {ModelPath}')
ModelHandl = ModelHandler()
ModelHandl.load_model_handler(ModelPath)

if '.pickle' in ModelPath:
    outFileName = ModelPath.replace('.pickle', '.model')
elif '.pkl' in ModelPath:
    outFileName = ModelPath.replace('.pkl', '.model')
else:
    print(f'ERROR: invalid input file {ModelPath}, please check it! Exit')
    sys.exit()

ModelHandl.dump_original_model(outFileName, True)
print(f'Saved model: {outFileName}')
Example #11
                if split == 'antimatter':
                    split_ineq_sign = '< 0.5'

            for i_cent_bins in range(len(CENTRALITY_LIST)):
                cent_bins = CENTRALITY_LIST[i_cent_bins]
                
                bin = f'{split}_{cent_bins[0]}_{cent_bins[1]}_{ct_bins[0]}_{ct_bins[1]}'
                ##############################################################
                # TRAINING AND TEST SET PREPARATION
                ##############################################################

                # features plot
                leg_labels = ['background', 'non_prompt', 'prompt']
                
                model_clf = xgb.XGBClassifier(use_label_encoder=False, n_jobs=4)
                model_hdl = ModelHandler(model_clf, TRAINING_COLUMNS_LIST)
                model_hdl.set_model_params(HYPERPARAMS)

                # hyperparameters optimization and model training
                if not os.path.isdir('models'):
                    os.mkdir('models')
                bin_model = bin
                if MERGE_CENTRALITY:
                    bin_model = f'all_0_90_{ct_bins[0]}_{ct_bins[1]}'

                if OPTIMIZE and TRAIN:
                    model_hdl.optimize_params_optuna(train_test_data, HYPERPARAMS_RANGES,
                                                    'roc_auc_ovr', nfold=5, timeout=30)

                isModelTrained = os.path.isfile(f'models/{bin_model}_trained')
                print(f'isModelTrained {bin_model}: {isModelTrained}')
Example #12
    ml_application = ModelApplication(N_BODY, data_path, analysis_res_path,
                                      CENT_CLASSES, split)

    shift_bin = 1

    for cclass in CENT_CLASSES:
        for ptbin in zip(PT_BINS[:-1], PT_BINS[1:]):
            for ctbin in zip(CT_BINS[:-1], CT_BINS[1:]):
                # data[0]=train_set, data[1]=y_train, data[2]=test_set, data[3]=y_test
                data = ml_analysis.prepare_dataframe(COLUMNS,
                                                     cent_class=cclass,
                                                     ct_range=ctbin,
                                                     pt_range=ptbin)

                input_model = xgb.XGBClassifier()
                model_handler = ModelHandler(input_model)

                info_string = f'_{cclass[0]}{cclass[1]}_{ptbin[0]}{ptbin[1]}_{ctbin[0]}{ctbin[1]}{split}'
                filename_handler = handlers_path + '/model_handler' + info_string + '.pkl'
                model_handler.load_model_handler(filename_handler)

                y_pred = model_handler.predict(data[2])
                test_set = pd.concat([data[2], data[3]], axis=1, sort=False)
                test_set.insert(0, 'score', y_pred)
                test_set.query('y>0', inplace=True)

                mass_bins = 40 if ctbin[1] < 16 else 36

                eff_score_array, model_handler = ml_application.load_ML_analysis(
                    cclass, ptbin, ctbin, split)
Example #13
                # split data into training and test set
                train_test_data = train_test_generator(
                    [signal_tree_handler, background_tree_handler], [1, 0],
                    test_size=0.5,
                    random_state=RANDOM_STATE)
                print(
                    f'Number of candidates ({split}) for training in {cent_bins[0]}-{cent_bins[1]}%, {ct_bins[0]}<=ct<{ct_bins[1]} cm: {len(train_test_data[0])}'
                )
                print(
                    f'signal candidates: {np.count_nonzero(train_test_data[1] == 1)}; background candidates: {np.count_nonzero(train_test_data[1] == 0)}; n_cand_bkg / n_cand_signal = {np.count_nonzero(train_test_data[1] == 0) / np.count_nonzero(train_test_data[1] == 1)}'
                )
                print('')

                model_clf = xgb.XGBClassifier(use_label_encoder=False)
                model_hdl = ModelHandler(model_clf, TRAINING_COLUMNS_LIST)
                model_hdl.set_model_params(HYPERPARAMS)

                # hyperparameters optimization and model training
                if not os.path.isdir('models'):
                    os.mkdir('models')
                if OPTIMIZE and TRAIN:
                    model_hdl.optimize_params_bayes(train_test_data,
                                                    HYPERPARAMS_RANGES,
                                                    'roc_auc',
                                                    nfold=5,
                                                    init_points=10,
                                                    n_iter=10,
                                                    njobs=-1)
                if TRAIN:
                    model_hdl.train_test_model(train_test_data)
Example #14
# --------------------------------------------
SKLEARN_DATA = datasets.load_digits(n_class=2)
DIGITS_DATASET = pd.DataFrame(SKLEARN_DATA.data)     # pylint: disable=E1101
Y_DIGITS = SKLEARN_DATA.target       # pylint: disable=E1101
SIG_DF = DIGITS_DATASET[Y_DIGITS == 1]
BKG_DF = DIGITS_DATASET[Y_DIGITS == 0]
TRAIN_SET, TEST_SET, Y_TRAIN, Y_TEST = train_test_split(
    DIGITS_DATASET, Y_DIGITS, test_size=0.5, random_state=42)
DATA = [TRAIN_SET, Y_TRAIN, TEST_SET, Y_TEST]
# --------------------------------------------


# TRAINING AND TESTING
# --------------------------------------------
INPUT_MODEL = xgb.XGBClassifier()
MODEL = ModelHandler(INPUT_MODEL)

# hyperparams optimization
HYP_RANGES = {
    # # defines the maximum depth of a single tree (regularization)
    'max_depth': (5, 15),
    # 'learning_rate': (0.01, 0.3),  # learning rate
    'n_estimators': (5, 10),  # number of boosting trees
}
MODEL.optimize_params_bayes(DATA, HYP_RANGES, 'roc_auc')

# train and test the model with the updated hyperparameters
MODEL.train_test_model(DATA)
Y_PRED = MODEL.predict(DATA[2])

# Calculate the BDT efficiency as a function of the BDT score
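Example #6 runs the same pipeline, so the step announced by the last comment would plausibly be the following (an assumption mirroring Example #6, not the original continuation):

from hipe4ml import analysis_utils

# Plausible continuation, mirroring Example #6 (assumption)
EFFICIENCY, THRESHOLD = analysis_utils.bdt_efficiency_array(DATA[3], Y_PRED, n_points=10)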
Example #15
            for ptbin in zip(PT_BINS[:-1], PT_BINS[1:]):
                for ctbin in zip(CT_BINS[:-1], CT_BINS[1:]):
                    print(
                        '\n==================================================')
                    print('centrality:', cclass, ' ct:', ctbin, ' pT:', ptbin,
                          split)

                    part_time = time.time()

                    # data[0]=train_set, data[1]=y_train, data[2]=test_set, data[3]=y_test
                    data = ml_analysis.prepare_dataframe(COLUMNS,
                                                         cent_class=cclass,
                                                         ct_range=ctbin,
                                                         pt_range=ptbin)
                    input_model = xgb.XGBClassifier()
                    model_handler = ModelHandler(input_model)

                    model_handler.set_model_params(MODEL_PARAMS)
                    model_handler.set_model_params(HYPERPARAMS)
                    model_handler.set_training_columns(COLUMNS)

                    if OPTIMIZE:
                        model_handler.optimize_params_bayes(data,
                                                            HYPERPARAMS_RANGE,
                                                            'roc_auc',
                                                            init_points=10,
                                                            n_iter=10)

                    model_handler.train_test_model(data)
                    print("train test model")
                    print(
Example #16
def get_skimmed_large_data(data_path,
                           cent_classes,
                           pt_bins,
                           ct_bins,
                           training_columns,
                           application_columns,
                           mode,
                           split=''):
    print('\n++++++++++++++++++++++++++++++++++++++++++++++++++')
    print('\nStarting BDT application on large data')

    if mode == 3:
        handlers_path = os.environ['HYPERML_MODELS_3'] + '/handlers'
        efficiencies_path = os.environ['HYPERML_EFFICIENCIES_3']

    if mode == 2:
        handlers_path = os.environ['HYPERML_MODELS_2'] + '/handlers'
        efficiencies_path = os.environ['HYPERML_EFFICIENCIES_2']

    executor = ThreadPoolExecutor()
    iterator = uproot.pandas.iterate(data_path,
                                     'DataTable',
                                     executor=executor,
                                     reportfile=True)

    df_applied = pd.DataFrame()

    for current_file, data in iterator:
        rename_df_columns(data)

        print('current file: {}'.format(current_file))
        print('start entry chunk: {}, stop entry chunk: {}'.format(
            data.index[0], data.index[-1]))

        for cclass in cent_classes:
            for ptbin in zip(pt_bins[:-1], pt_bins[1:]):
                for ctbin in zip(ct_bins[:-1], ct_bins[1:]):
                    info_string = '_{}{}_{}{}_{}{}'.format(
                        cclass[0], cclass[1], ptbin[0], ptbin[1], ctbin[0],
                        ctbin[1])

                    filename_handler = handlers_path + '/model_handler' + info_string + split + '.pkl'
                    filename_efficiencies = efficiencies_path + '/Eff_Score' + info_string + split + '.npy'

                    model_handler = ModelHandler()
                    model_handler.load_model_handler(filename_handler)

                    eff_score_array = np.load(filename_efficiencies)
                    tsd = eff_score_array[1][-1]

                    data_range = f'{ctbin[0]}<ct<{ctbin[1]} and {ptbin[0]}<pt<{ptbin[1]} and {cclass[0]}<=centrality<{cclass[1]}'

                    df_tmp = data.query(data_range)
                    df_tmp.insert(
                        0, 'score',
                        model_handler.predict(df_tmp[training_columns]))

                    df_tmp = df_tmp.query('score>@tsd')
                    df_tmp = df_tmp.loc[:, application_columns]

                    df_applied = df_applied.append(df_tmp,
                                                   ignore_index=True,
                                                   sort=False)

    print(df_applied.info(memory_usage='deep'))
    return df_applied
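One caveat for newer environments: DataFrame.append, used above to accumulate df_applied, was deprecated in pandas 1.4 and removed in 2.0. The equivalent pattern collects the per-bin selections and concatenates once (a sketch):

import pandas as pd

# pandas >= 2.0 replacement for the df_applied.append(...) accumulation:
# collect the chunks in a list, concatenate once at the end
chunks = []
# inside the centrality/pt/ct loops: chunks.append(df_tmp)
df_applied = pd.concat(chunks, ignore_index=True, sort=False) if chunks else pd.DataFrame()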
Example #17
        filename_dict['analysis_path'] + '/' + filename_dict['analysis_name'])

    ##########################################################################

    print('\nHypertriton 3-body - pp @ 13 TeV\n')

    if flag_dict['train_model']:
        print('Starting model training & application\n')
        train.train_model(filename_dict, presel_dict, flag_dict, eff_array,
                          train_vars, params, params_range)
        print('Model training & application complete\n')

    #print('BENCHMARKING')
    #utils.benchmark_hyperparam_optimizers(filename_dict, params, params_range, flag_dict, presel_dict, train_vars)

    model_hdl = ModelHandler()
    model_hdl.load_model_handler(filename_dict['analysis_path'] +
                                 '/model/model_hdl')

    print('Model loaded\n')

    eff_array, scores = train.load_eff_scores(filename_dict['analysis_path'] +
                                              'output_data/')

    data = train.load_data_with_scores(filename_dict['analysis_path'] +
                                       'output_data/data_scores.parquet.gzip'
                                       )  #pd dataframe already processed
    print('Data loaded\n')
    #data.query('model_output > -5', inplace = True)         ## PARAM!!!!!
    #print('Query on data applied\n')
    background_ls = train.load_data_with_scores(
Example #18
def main():  #pylint: disable=too-many-statements
    # read config file
    parser = argparse.ArgumentParser(description='Arguments to pass')
    parser.add_argument('cfgFileName',
                        metavar='text',
                        default='cfgFileNameML.yml',
                        help='config file name for ml')
    parser.add_argument("--train",
                        help="perform only training and testing",
                        action="store_true")
    parser.add_argument("--apply",
                        help="perform only application",
                        action="store_true")
    args = parser.parse_args()

    print('Loading analysis configuration: ...', end='\r')
    with open(args.cfgFileName, 'r') as ymlCfgFile:
        inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
    print('Loading analysis configuration: Done!')

    print('Loading and preparing data files: ...', end='\r')
    PromptHandler = TreeHandler(inputCfg['input']['prompt'],
                                inputCfg['input']['treename'])
    FDHandler = None if inputCfg['input']['FD'] is None else TreeHandler(
        inputCfg['input']['FD'], inputCfg['input']['treename'])
    DataHandler = TreeHandler(inputCfg['input']['data'],
                              inputCfg['input']['treename'])

    if inputCfg['data_prep']['filt_bkg_mass']:
        BkgHandler = DataHandler.get_subset(
            inputCfg['data_prep']['filt_bkg_mass'],
            frac=1.,
            rndm_state=inputCfg['data_prep']['seed_split'])
    else:
        BkgHandler = DataHandler

    PtBins = [[a, b] for a, b in zip(inputCfg['pt_ranges']['min'],
                                     inputCfg['pt_ranges']['max'])]
    PromptHandler.slice_data_frame('pt_cand', PtBins, True)
    if FDHandler is not None:
        FDHandler.slice_data_frame('pt_cand', PtBins, True)
    DataHandler.slice_data_frame('pt_cand', PtBins, True)
    BkgHandler.slice_data_frame('pt_cand', PtBins, True)
    print('Loading and preparing data files: Done!')

    for iBin, PtBin in enumerate(PtBins):
        print(
            f'\n\033[94mStarting ML analysis --- {PtBin[0]} < pT < {PtBin[1]} GeV/c\033[0m'
        )

        OutPutDirPt = os.path.join(
            os.path.expanduser(inputCfg['output']['dir']),
            f'pt{PtBin[0]}_{PtBin[1]}')
        if os.path.isdir(OutPutDirPt):
            print((
                f'\033[93mWARNING: Output directory \'{OutPutDirPt}\' already exists,'
                ' overwrites possibly ongoing!\033[0m'))
        else:
            os.makedirs(OutPutDirPt)

        # data preparation
        #_____________________________________________
        FDDfPt = pd.DataFrame() if FDHandler is None else FDHandler.get_slice(
            iBin)
        TrainTestData, PromptDfSelForEff, FDDfSelForEff = data_prep(
            inputCfg, iBin, PtBin, OutPutDirPt, PromptHandler.get_slice(iBin),
            FDDfPt, BkgHandler.get_slice(iBin))
        if args.apply and inputCfg['data_prep']['test_fraction'] < 1.:
            print(
                '\033[93mWARNING: Using only a fraction of the MC for the application! Are you sure?\033[0m'
            )

        # training, testing
        #_____________________________________________
        if not args.apply:
            ModelHandl = train_test(inputCfg, PtBin, OutPutDirPt,
                                    TrainTestData, iBin)
        else:
            ModelList = inputCfg['ml']['saved_models']
            ModelPath = ModelList[iBin]
            if not isinstance(ModelPath, str):
                print(
                    '\033[91mERROR: path to model not correctly defined!\033[0m'
                )
                sys.exit()
            ModelPath = os.path.expanduser(ModelPath)
            print(f'Loaded saved model: {ModelPath}')
            ModelHandl = ModelHandler()
            ModelHandl.load_model_handler(ModelPath)

        # model application
        #_____________________________________________
        if not args.train:
            appl(inputCfg, PtBin, OutPutDirPt, ModelHandl,
                 DataHandler.get_slice(iBin), PromptDfSelForEff, FDDfSelForEff)

        # delete dataframes to release memory
        for data in TrainTestData:
            del data
        del PromptDfSelForEff, FDDfSelForEff
Example #19
def train_xgboost_model(signal,
                        background,
                        filename_dict,
                        params,
                        params_range,
                        flag_dict,
                        training_variables='',
                        testsize=0.5):
    '''
    Trains an XGBOOST model using hipe4ml and plot output distribution and feature importance
    '''

    print('Training XGBOOST model')

    training_fig_path = filename_dict['analysis_path'] + "/images/training"

    train_test_data = train_test_generator([signal, background], [1, 0],
                                           test_size=testsize)

    if training_variables == '':
        training_variables = train_test_data[0].columns.tolist()

    model_clf = xgb.XGBClassifier()
    model_hdl = ModelHandler(model_clf, training_variables)
    if not flag_dict['use_default_param']:
        model_hdl.set_model_params(params)

    if flag_dict['benchmark_opt']:

        print('Benchmarking optimizers\n')
        import time
        from sklearn.metrics import roc_auc_score
        times_sk = []
        roc_sk = []

        for i in range(1):
            start = time.time()

            model_hdl.optimize_params_bayes(train_test_data,
                                            params_range,
                                            'roc_auc',
                                            njobs=-1)
            model_hdl.train_test_model(train_test_data, )

            y_pred_test = model_hdl.predict(
                train_test_data[2], True)  #used to evaluate model performance

            roc_sk.append(roc_auc_score(train_test_data[3], y_pred_test))

            times_sk.append(time.time() - start)

        print('\nBAYES OPTIMIZATION WITH SKLEARN')
        print('Mean time : ' + str(np.mean(times_sk)))
        print('Mean ROC : ' + str(np.mean(roc_sk)))
        print('--------------\n')
        print('OPTUNA')

        roc = []

        for i in range(1):

            for key in params:
                if isinstance(params[key], str):
                    params_range[key] = params[key]

            model_hdl.optimize_params_optuna(train_test_data,
                                             params_range,
                                             'roc_auc',
                                             timeout=flag_dict['timeout'],
                                             n_jobs=flag_dict['n_jobs'])
            model_hdl.train_test_model(train_test_data, )

            y_pred_test = model_hdl.predict(
                train_test_data[2], True)  #used to evaluate model performance

            roc.append(roc_auc_score(train_test_data[3], y_pred_test))

        print('\nBAYES OPTIMIZATION WITH SKLEARN')
        print('Mean time : ' + str(np.mean(times_sk)))
        print('Mean ROC : ' + str(np.mean(roc_sk)))
        print('--------------\n')
        print('OPTUNA')
        print('Fixed time : ' + str(flag_dict['timeout']))
        print('Mean ROC : ' + str(np.mean(roc)))
        print('--------------\n')

    if flag_dict['optimize_bayes']:
        import time
        print('Doing Bayes optimization of hyperparameters\n')
        start = time.time()
        model_hdl.optimize_params_bayes(train_test_data,
                                        params_range,
                                        'roc_auc',
                                        n_iter=700,
                                        njobs=flag_dict['n_jobs'])
        print('Elapsed time: ' + str(time.time() - start))

    if flag_dict['optimize_optuna']:
        print('Doing Optuna optimization of hyperparameters\n')
        for key in params:
            if isinstance(params[key], str):
                params_range[key] = params[key]
        study = model_hdl.optimize_params_optuna(train_test_data,
                                                 params_range,
                                                 scoring='roc_auc',
                                                 timeout=flag_dict['timeout'],
                                                 n_jobs=flag_dict['n_jobs'],
                                                 n_trials=None)

        print('Parameters optimization done!\n')

        if flag_dict['plot_optim']:
            print('Saving optimization plots')
            fig = optuna.visualization.plot_slice(study)
            fig.write_image(training_fig_path + '/optuna_slice.png')
            fig = optuna.visualization.plot_optimization_history(study)
            fig.write_image(training_fig_path + '/optuna_history.png')
            '''fig = optuna.visualization.plot_param_importances(study)
            fig.write_image(training_fig_path + '/optuna_param_importance.png')
            fig = optuna.visualization.plot_contour(study)
            fig.write_image(training_fig_path + '/optuna_contour.png')'''
            print('Done\n')

        import joblib

        joblib.dump(study, filename_dict['analysis_path'] + "model/study.pkl")

    model_hdl.train_test_model(train_test_data, )
    print(model_hdl.get_model_params())

    print('Predicting values on training and test data')
    y_pred_train = model_hdl.predict(train_test_data[0], True)
    y_pred_test = model_hdl.predict(train_test_data[2],
                                    True)  #used to evaluate model performance
    print('Prediction done\n')

    plt.rcParams["figure.figsize"] = (10, 7)
    leg_labels = ['background', 'signal']

    print('Saving Output comparison plot')
    plt.figure()
    ml_out_fig = plot_utils.plot_output_train_test(model_hdl,
                                                   train_test_data,
                                                   100,
                                                   True,
                                                   leg_labels,
                                                   True,
                                                   density=False)
    plt.savefig(training_fig_path + '/output_train_test.png',
                dpi=300,
                facecolor='white')
    plt.close()
    print('Done\n')

    print('Saving ROC AUC plot')
    plt.figure()
    roc_train_test_fig = plot_utils.plot_roc_train_test(
        train_test_data[3], y_pred_test, train_test_data[1], y_pred_train,
        None, leg_labels)  #ROC AUC plot
    plt.savefig(training_fig_path + '/ROC_AUC_train_test.png',
                dpi=300,
                facecolor='white')

    import pickle
    with open(training_fig_path + '/ROC_AUC_train_test.pickle', 'wb') as f:
        pickle.dump(roc_train_test_fig, f)
    plt.close()

    print('Done\n')

    print('Saving feature importance plots')
    plt.figure()
    feat_imp_1, feat_imp_2 = plot_utils.plot_feature_imp(train_test_data[2],
                                                         train_test_data[3],
                                                         model_hdl,
                                                         approximate=True)
    feat_imp_1.savefig(training_fig_path +
                       '/feature_importance_HIPE4ML_violin.png',
                       dpi=300,
                       facecolor='white')
    feat_imp_2.savefig(training_fig_path +
                       '/feature_importance_HIPE4ML_bar.png',
                       dpi=300,
                       facecolor='white')
    plt.close()
    print('Done\n')

    efficiency_score_conversion(train_test_data, y_pred_test, filename_dict)

    return train_test_data, y_pred_test, model_hdl
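Since the Optuna study is dumped with joblib above, it can be reloaded later to inspect the search (a sketch reusing the same path):

import joblib

# Reload the study dumped above and inspect the best trial
study = joblib.load(filename_dict['analysis_path'] + "model/study.pkl")
print(study.best_params)   # best trial's hyperparameters
print(study.best_value)    # best trial's objective value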
Example #20
def main():  #pylint: disable=too-many-statements, too-many-branches
    # read config file
    parser = argparse.ArgumentParser(description='Arguments to pass')
    parser.add_argument('cfgFileName',
                        metavar='text',
                        default='cfgFileNameML.yml',
                        help='config file name for ml')
    args = parser.parse_args()

    print('Loading analysis configuration: ...', end='\r')
    with open(args.cfgFileName, 'r') as ymlCfgFile:
        inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
    print('Loading analysis configuration: Done!')

    PtBins = [[a, b] for a, b in zip(inputCfg['pt_ranges']['min'],
                                     inputCfg['pt_ranges']['max'])]
    OutputLabels = [
        inputCfg['output']['out_labels']['Bkg'],
        inputCfg['output']['out_labels']['Prompt']
    ]
    if inputCfg['output']['out_labels']['FD'] is not None:
        OutputLabels.append(inputCfg['output']['out_labels']['FD'])
    ColumnsToSave = inputCfg['appl']['column_to_save_list']
    ModelList = inputCfg['ml']['saved_models']
    ModelHandls = []
    for iBin in range(len(PtBins)):
        ModelPath = ModelList[iBin]
        if not isinstance(ModelPath, str):
            print('\033[91mERROR: path to model not correctly defined!\033[0m')
            sys.exit()
        ModelPath = os.path.expanduser(ModelPath)
        print(f'Loaded saved model: {ModelPath}')
        ModelHandl = ModelHandler()
        ModelHandl.load_model_handler(ModelPath)
        ModelHandls.append(ModelHandl)

    for inputFile, outName in zip(inputCfg['standalone_appl']['inputs'],
                                  inputCfg['standalone_appl']['output_names']):
        print(f'Loading and preparing data file {inputFile}: ...', end='\r')
        DataHandler = TreeHandler(inputFile)
        DataHandler.slice_data_frame('pt_cand', PtBins, True)
        print(f'Loading and preparing data files {inputFile}: Done!')

        print('Applying ML model to dataframes: ...', end='\r')
        for iBin, PtBin in enumerate(PtBins):
            OutPutDirPt = os.path.join(
                os.path.expanduser(inputCfg['standalone_appl']['output_dir']),
                f'pt{PtBin[0]}_{PtBin[1]}')
            if os.path.isdir(OutPutDirPt):
                print((
                    f'\033[93mWARNING: Output directory \'{OutPutDirPt}\' already exists,'
                    ' overwrites possibly ongoing!\033[0m'))
            else:
                os.makedirs(OutPutDirPt)
            DataDfPtSel = DataHandler.get_slice(iBin)
            yPred = ModelHandls[iBin].predict(DataDfPtSel,
                                              inputCfg['ml']['raw_output'])
            ColumnsToSaveFinal = list(ColumnsToSave)  # copy: the pt_B removal below must not mutate the config list
            if not isinstance(ColumnsToSaveFinal, list):
                print(
                    '\033[91mERROR: column_to_save_list must be defined!\033[0m'
                )
                sys.exit()
            if 'inv_mass' not in ColumnsToSaveFinal:
                print(
                    '\033[93mWARNING: inv_mass is not going to be saved in the output dataframe!\033[0m'
                )
            if 'pt_cand' not in ColumnsToSaveFinal:
                print(
                    '\033[93mWARNING: pt_cand is not going to be saved in the output dataframe!\033[0m'
                )
            if 'pt_B' in ColumnsToSaveFinal and 'pt_B' not in DataDfPtSel.columns:
                ColumnsToSaveFinal.remove('pt_B')  # only in MC
            DataDfPtSel = DataDfPtSel.loc[:, ColumnsToSaveFinal]
            if ModelHandls[iBin].get_n_classes() < 3:
                DataDfPtSel['ML_output'] = yPred
            else:
                for Pred, Lab in enumerate(OutputLabels):
                    DataDfPtSel[f'ML_output_{Lab}'] = yPred[:, Pred]
            DataDfPtSel.to_parquet(
                f'{OutPutDirPt}/{outName}_pT_{PtBin[0]}_{PtBin[1]}_ModelApplied.parquet.gzip'
            )
            del DataDfPtSel
        print('Applying ML model to dataframes: Done!')
Example #21
def benchmark_hyperparam_optimizers(filename_dict,
                                    params,
                                    params_range,
                                    flag_dict,
                                    presel_dict,
                                    training_variables='',
                                    testsize=0.75):

    import time
    from sklearn.metrics import roc_auc_score

    N_run = 1

    data_path = filename_dict['data_path']
    analysis_path = filename_dict['analysis_path']

    print('Loading MC signal')
    mc_signal = TreeHandler()
    mc_signal.get_handler_from_large_file(
        file_name=data_path + filename_dict['MC_signal_filename'],
        tree_name=filename_dict['MC_signal_table'])
    print('MC signal loaded\n')

    print('Loading background data for training')
    background_ls = TreeHandler()
    background_ls.get_handler_from_large_file(
        file_name=data_path + filename_dict['train_bckg_filename'],
        tree_name=filename_dict['train_bckg_table'])
    background_ls.apply_preselections(presel_dict['train_bckg_presel'])
    background_ls.shuffle_data_frame(size=min(background_ls.get_n_cand(),
                                              mc_signal.get_n_cand() * 4))
    print('Done\n')

    train_test_data = train_test_generator([mc_signal, background_ls], [1, 0],
                                           test_size=testsize)

    if training_variables == '':
        training_variables = train_test_data[0].columns.tolist()

    model_clf = xgb.XGBClassifier()
    model_hdl = ModelHandler(model_clf, training_variables)

    times = []
    roc = []

    for i in range(N_run):
        start = time.time()

        model_hdl.optimize_params_bayes(train_test_data,
                                        params_range,
                                        'roc_auc',
                                        njobs=-1)
        model_hdl.train_test_model(train_test_data, )

        y_pred_test = model_hdl.predict(
            train_test_data[2], True)  #used to evaluate model performance

        roc.append(roc_auc_score(train_test_data[3], y_pred_test))

        times.append(time.time() - start)

    print('BAYES OPTIMIZATION WITH SKLEARN')
    print('Mean time : ' + str(np.mean(times)))
    print('Mean ROC : ' + str(np.mean(roc)))
    print('--------------\n')

    roc_optuna = []

    for i in range(N_run):
        model_hdl.optimize_params_optuna(train_test_data,
                                         params_range,
                                         'roc_auc',
                                         timeout=np.mean(times),
                                         n_jobs=-1)
        model_hdl.train_test_model(train_test_data, )

        y_pred_test = model_hdl.predict(
            train_test_data[2], True)  #used to evaluate model performance

        roc_optuna.append(roc_auc_score(train_test_data[3], y_pred_test))

    print('OPTUNA')
    print('Fixed time : ' + str(np.mean(times)))
    print('Mean ROC : ' + str(np.mean(roc_optuna)))
    print('--------------\n')
Example #22
    corr[0].savefig(results_ml_path + "/correlations.png", bbox_inches='tight')

    print("---------------------------------------------")
    print("Data loaded. Training and testing ....")

    params_range = {
        "max_depth": (8, 18),
        "learning_rate": (0.07, 0.15),
        "n_estimators": (150, 250),
        "gamma": (0.3, 0.5),
        "min_child_weight": (3, 8),
        "subsample": (0.5, 1),
        "colsample_bytree": (0.3, 1),
    }

    model_hdl = ModelHandler(xgb.XGBClassifier(), training_columns)
    model_hdl.set_model_params(MODEL_PARAMS)
    model_hdl.set_model_params(HYPERPARAMS)
    if optmize:
        model_hdl.optimize_params_bayes(train_test_data,
                                        params_range,
                                        'roc_auc',
                                        njobs=-1,
                                        init_points=10,
                                        n_iter=20)

    y_pred_test = model_hdl.train_test_model(train_test_data, True, True)

    bdt_out_plot = pu.plot_output_train_test(model_hdl,
                                             train_test_data,
                                             100,
Example #23
class Optimiserhipe4mltree:
    # Class Attribute
    species = "optimiser_hipe4mltree"

    def __init__(self, data_param, binmin, binmax, training_var, bkg_sel,
                 hyper_pars):

        self.logger = get_logger()

        # directory
        #self.do_mlprefilter = datap.get("doml_asprefilter", None)
        self.dirmlout = data_param["ml"]["mlout"]
        self.dirmlplot = data_param["ml"]["mlplot"]
        #if self.do_mlprefilter is True:
        #    self.dirmodel = self.dirmodel + "/prefilter"
        #    self.dirmlplot = self.dirmlplot + "/prefilter"
        #if self.do_mlprefilter is False:
        #    self.dirmodel = self.dirmodel + "/analysis"
        #    self.dirmlplot = self.dirmlplot + "/analysis"

        self.inputtreedata = "/Users/lvermunt/cernbox/Analyses/ML/input/hipe4mlTTree/data.root"
        self.inputtreemc = "/Users/lvermunt/cernbox/Analyses/ML/input/hipe4mlTTree/prompt.root"
        self.v_train = None
        self.p_binmin = binmin
        self.p_binmax = binmax

        self.s_selsigml = ""
        self.s_selbkgml = bkg_sel  #"inv_mass < 1.82 or 1.92 < inv_mass < 2.00"
        self.v_bkgoversigfrac = 3
        self.v_sig = 1
        self.v_bkg = 0
        self.rnd_splt = data_param["ml"]["rnd_splt"]
        self.test_frac = data_param["ml"]["test_frac"]

        self.prompthandler = None
        self.datahandler = None
        self.bkghandler = None
        self.traintestdata = None
        self.ypredtrain_hipe4ml = None
        self.ypredtest_hipe4ml = None

        self.preparesample()

        self.p_hipe4ml_model = None
        self.v_hipe4ml_pars = hyper_pars
        self.load_hipe4mlmodel()

        self.bayesoptconfig_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"][
            "bayes_opt_config"]
        self.average_method_hipe4ml = data_param["hipe4ml"]["roc_auc_average"]
        self.nfold_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["nfolds"]
        self.init_points = data_param["hipe4ml"]["hyper_par_opt"]["initpoints"]
        self.n_iter_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["niter"]
        self.njobs_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["njobs"]
        self.roc_method_hipe4ml = data_param["hipe4ml"]["roc_auc_approach"]
        self.raw_output_hipe4ml = data_param["hipe4ml"]["raw_output"]
        self.train_test_log_hipe4ml = data_param["hipe4ml"]["train_test_log"]

        self.multiclass_labels = data_param["ml"].get("multiclass_labels",
                                                      None)

        self.logger.info("Using the following training variables: %s",
                         self.v_train)

    def preparesample(self):
        self.logger.info("Prepare Sample for hipe4ml")

        self.signalhandler = TreeHandler(self.inputtreemc, 'treeMLDplus')
        nsigcand = self.signalhandler.get_n_cand()
        self.datahandler = TreeHandler(self.inputtreedata, 'treeMLDplus')
        self.bkghandler = self.datahandler.get_subset(self.s_selbkgml,
                                                      size=nsigcand *
                                                      self.v_bkgoversigfrac)
        self.traintestdata = train_test_generator(
            [self.signalhandler, self.bkghandler], [self.v_sig, self.v_bkg],
            test_size=self.test_frac,
            random_state=self.rnd_splt)

    def load_hipe4mlmodel(self):
        self.logger.info("Loading hipe4ml model")
        self.v_train = self.signalhandler.get_var_names()
        self.v_train.remove('inv_mass')
        self.v_train.remove('pt_cand')

        model_xgboost = xgb.XGBClassifier()
        self.p_hipe4ml_model = ModelHandler(model_xgboost, self.v_train)

    def set_hipe4ml_modelpar(self):
        self.logger.info("Setting hipe4ml hyperparameters")
        self.p_hipe4ml_model.set_model_params(self.v_hipe4ml_pars)

    def do_hipe4mlhyperparopti(self):
        self.logger.info("Optimising hipe4ml hyperparameters (Bayesian)")

        if not (self.average_method_hipe4ml in ['macro', 'weighted']
                and self.roc_method_hipe4ml in ['ovo', 'ovr']):
            self.logger.fatal("Selected ROC configuration is not valid!")

        if self.average_method_hipe4ml == 'weighted':
            metric = f'roc_auc_{self.roc_method_hipe4ml}_{self.average_method_hipe4ml}'
        else:
            metric = f'roc_auc_{self.roc_method_hipe4ml}'

        hypparsfile = f'{self.dirmlout}/HyperParOpt_pT_{self.p_binmin}_{self.p_binmax}.txt'
        outfilehyppars = open(hypparsfile, 'wt')
        sys.stdout = outfilehyppars
        self.p_hipe4ml_model.optimize_params_bayes(self.traintestdata,
                                                   self.bayesoptconfig_hipe4ml,
                                                   metric, self.nfold_hipe4ml,
                                                   self.init_points,
                                                   self.n_iter_hipe4ml,
                                                   self.njobs_hipe4ml)
        outfilehyppars.close()
        sys.stdout = sys.__stdout__
        self.logger.info("Performing hyper-parameters optimisation: Done!")

    def do_hipe4mltrain(self):
        self.logger.info("Training + testing hipe4ml model")
        t0 = time.time()

        self.p_hipe4ml_model.train_test_model(self.traintestdata,
                                              self.average_method_hipe4ml,
                                              self.roc_method_hipe4ml)
        self.ypredtrain_hipe4ml = self.p_hipe4ml_model.predict(
            self.traintestdata[0], self.raw_output_hipe4ml)
        self.ypredtest_hipe4ml = self.p_hipe4ml_model.predict(
            self.traintestdata[2], self.raw_output_hipe4ml)

        modelhandlerfile = f'{self.dirmlout}/ModelHandler_pT_{self.p_binmin}_{self.p_binmax}.pkl'
        self.p_hipe4ml_model.dump_model_handler(modelhandlerfile)
        modelfile = f'{self.dirmlout}/ModelHandler_pT_{self.p_binmin}_{self.p_binmax}.model'
        self.p_hipe4ml_model.dump_original_model(modelfile)

        self.logger.info("Training + testing hipe4ml: Done!")
        self.logger.info("Time elapsed = %.3f", time.time() - t0)

    def do_hipe4mlplot(self):
        self.logger.info("Plotting hipe4ml model")

        leglabels = ["Background", "Prompt signal"]
        outputlabels = ["Bkg", "SigPrompt"]

        # _____________________________________________
        plot_utils.plot_distr([self.bkghandler, self.signalhandler],
                              self.v_train, 100, leglabels)
        plt.subplots_adjust(left=0.06,
                            bottom=0.06,
                            right=0.99,
                            top=0.96,
                            hspace=0.55,
                            wspace=0.55)
        figname = f'{self.dirmlplot}/DistributionsAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        plt.savefig(figname)
        plt.close('all')
        # _____________________________________________
        corrmatrixfig = plot_utils.plot_corr(
            [self.bkghandler, self.signalhandler], self.v_train, leglabels)
        for figg, labb in zip(corrmatrixfig, outputlabels):
            plt.figure(figg.number)
            plt.subplots_adjust(left=0.2, bottom=0.25, right=0.95, top=0.9)
            figname = f'{self.dirmlplot}/CorrMatrix{labb}_pT_{self.p_binmin}_{self.p_binmax}.pdf'
            figg.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 7)
        mloutputfig = plot_utils.plot_output_train_test(
            self.p_hipe4ml_model,
            self.traintestdata,
            80,
            self.raw_output_hipe4ml,
            leglabels,
            self.train_test_log_hipe4ml,
            density=True)
        figname = f'{self.dirmlplot}/MLOutputDistr_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        mloutputfig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 9)
        roccurvefig = plot_utils.plot_roc(self.traintestdata[3],
                                          self.ypredtest_hipe4ml, None,
                                          leglabels,
                                          self.average_method_hipe4ml,
                                          self.roc_method_hipe4ml)
        figname = f'{self.dirmlplot}/ROCCurveAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        roccurvefig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 9)
        roccurvettfig = plot_utils.plot_roc_train_test(
            self.traintestdata[3], self.ypredtest_hipe4ml,
            self.traintestdata[1], self.ypredtrain_hipe4ml, None, leglabels,
            self.average_method_hipe4ml, self.roc_method_hipe4ml)
        figname = f'{self.dirmlplot}/ROCCurveTrainTest_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        roccurvettfig.savefig(figname)
        # _____________________________________________
        precisionrecallfig = plot_utils.plot_precision_recall(
            self.traintestdata[3], self.ypredtest_hipe4ml, leglabels)
        figname = f'{self.dirmlplot}/PrecisionRecallAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        precisionrecallfig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (12, 7)
        featuresimportancefig = plot_utils.plot_feature_imp(
            self.traintestdata[2][self.v_train], self.traintestdata[3],
            self.p_hipe4ml_model, leglabels)
        for i, fig in enumerate(featuresimportancefig):
            figname = (f'{self.dirmlplot}/FeatureImportanceOpt{i}_'
                       f'pT_{self.p_binmin}_{self.p_binmax}.pdf')
            fig.savefig(figname)
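A sketch of driving this class end to end, assuming the input trees configured in __init__ are available; data_param mirrors the keys read in the constructor and all values are placeholders, not the analysis defaults:

data_param_sketch = {
    'ml': {'mlout': './mlout', 'mlplot': './mlplot', 'rnd_splt': 42,
           'test_frac': 0.2, 'multiclass_labels': None},
    'hipe4ml': {'hyper_par_opt': {'bayes_opt_config': {'max_depth': (2, 6)},
                                  'nfolds': 5, 'initpoints': 10, 'niter': 10, 'njobs': -1},
                'roc_auc_average': 'macro', 'roc_auc_approach': 'ovo',
                'raw_output': False, 'train_test_log': True},
}

opt = Optimiserhipe4mltree(data_param_sketch, 2, 4, None,
                           'inv_mass < 1.82 or 1.92 < inv_mass < 2.00', {'max_depth': 3})
opt.set_hipe4ml_modelpar()
opt.do_hipe4mltrain()
opt.do_hipe4mlplot()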