bins=40, labels=leg_labels, log=True, density=True, figsize=(12, 12), alpha=0.5, grid=False) plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96, hspace=0.50, wspace=0.50) plt.tight_layout() plt.savefig(f'{PLOT_DIR}/features/FeaturePlots.pdf') bkg_corr = plot_utils.plot_corr([background_tree_handler], TRAINING_COLUMNS_LIST, ['Background']) bkg_corr.set_size_inches(6, 6) plt.subplots_adjust(left=0.1, bottom=0.06, right=0.99, top=0.96, hspace=0.55, wspace=0.55) plt.tight_layout() plt.savefig(f'{PLOT_DIR}/features/BackgroundCorrelationMatrix.pdf') np_corr = plot_utils.plot_corr([non_prompt_tree_handler], TRAINING_COLUMNS_LIST, ['Non-prompt']) np_corr.set_size_inches(6, 6) plt.tight_layout() plt.savefig(f'{PLOT_DIR}/features/NonPromptCorrelationMatrix.pdf') p_corr = plot_utils.plot_corr([prompt_tree_handler],
def data_prep(inputCfg, iBin, PtBin, OutPutDirPt, PromptDf, FDDf, BkgDf): #pylint: disable=too-many-statements, too-many-branches
    '''
    Build the train/test samples for one pT bin and draw the input-variable plots.

    Parameters
    ----------
    inputCfg: nested mapping with the configuration. Keys read here:
        ['data_prep']['dataset_opt'] ('equal' or 'max_signal'),
        ['data_prep']['seed_split'], ['data_prep']['test_fraction'],
        ['data_prep']['bkg_mult'][iBin] (only for 'max_signal'),
        ['plots']['plotting_columns'],
        ['output']['leg_labels'] and ['output']['out_labels'] (entries 'Bkg', 'Prompt', 'FD')
    iBin: index of the pT bin, used to pick the background multiplier
    PtBin: pair (min, max) of the pT bin, used in messages and file names
    OutPutDirPt: directory in which the PDF plots are written
    PromptDf, FDDf, BkgDf: pandas DataFrames with prompt, feed-down and background
        candidates; FDDf may be empty, in which case only two classes are used

    Returns
    -------
    TrainTestData: list [TrainSet, yTrain, TestSet, yTest], labels are
        0 (background), 1 (prompt) and 2 (feed-down)
    PromptDfSelForEff: prompt candidates not used for the training
    FDDfSelForEff: feed-down candidates not used for the training (empty DataFrame if FDDf is empty)

    Raises
    ------
    SystemExit (status 1) if dataset_opt is not a valid option
    '''
    nPrompt = len(PromptDf)
    nFD = len(FDDf)
    nBkg = len(BkgDf)
    if FDDf.empty:
        out = f'\n Signal: {nPrompt}\n Bkg: {nBkg}'
    else:
        out = f'\n Prompt: {nPrompt}\n FD: {nFD}\n Bkg: {nBkg}'
    print(f'Number of available candidates in {PtBin[0]} < pT < {PtBin[1]} GeV/c:{out}')

    dataset_opt = inputCfg['data_prep']['dataset_opt']
    seed_split = inputCfg['data_prep']['seed_split']
    test_f = inputCfg['data_prep']['test_fraction']

    if dataset_opt == 'equal':
        # same number of candidates for each class, driven by the least populated one
        if FDDf.empty:
            nCandToKeep = min([nPrompt, nBkg])
            out = 'signal'
            out2 = 'signal'
        else:
            nCandToKeep = min([nPrompt, nFD, nBkg])
            out = 'prompt, FD'
            out2 = 'prompt'

        print((f'Keep same number of {out} and background (minimum) for training and '
               f'testing ({1 - test_f}-{test_f}): {nCandToKeep}'))
        print(f'Fraction of real data candidates used for ML: {nCandToKeep/nBkg:.5f}')

        if nPrompt > nCandToKeep:
            # trailing space added: the two literals are concatenated into a single message
            print((f'Remaining {out2} candidates ({nPrompt - nCandToKeep}) '
                   'will be used for the efficiency together with test set'))
        if nFD > nCandToKeep:
            print((f'Remaining FD candidates ({nFD - nCandToKeep}) will be used for the '
                   'efficiency together with test set'))

        # order of the concatenation must match the order of the labels below
        TotDf = pd.concat([
            BkgDf.iloc[:nCandToKeep],
            PromptDf.iloc[:nCandToKeep],
            FDDf.iloc[:nCandToKeep]
        ], sort=True)
        if FDDf.empty:
            LabelsArray = np.array([0] * nCandToKeep + [1] * nCandToKeep)
        else:
            LabelsArray = np.array([0] * nCandToKeep + [1] * nCandToKeep + [2] * nCandToKeep)

        if test_f < 1:
            TrainSet, TestSet, yTrain, yTest = train_test_split(
                TotDf, LabelsArray, test_size=test_f, random_state=seed_split)
        else:
            # no training requested: everything goes into the test set
            TrainSet = pd.DataFrame()
            TestSet = TotDf.copy()
            yTrain = pd.Series()
            yTest = LabelsArray.copy()

        TrainTestData = [TrainSet, yTrain, TestSet, yTest]
        # candidates exceeding nCandToKeep are recovered for the efficiency
        PromptDfSelForEff = pd.concat([
            PromptDf.iloc[nCandToKeep:],
            TestSet[pd.Series(yTest).array == 1]
        ], sort=False)
        if FDDf.empty:
            FDDfSelForEff = pd.DataFrame()
        else:
            FDDfSelForEff = pd.concat([
                FDDf.iloc[nCandToKeep:],
                TestSet[pd.Series(yTest).array == 2]
            ], sort=False)
        del TotDf

    elif dataset_opt == 'max_signal':
        # keep all the signal and scale the background with the configured multiplier
        nCandBkg = round(inputCfg['data_prep']['bkg_mult'][iBin] * (nPrompt + nFD))
        out = 'signal' if FDDf.empty else 'prompt and FD'
        print((f'Keep all {out} and use {nCandBkg} bkg candidates for training and '
               f'testing ({1 - test_f}-{test_f})'))
        if nCandBkg >= nBkg:
            nCandBkg = nBkg
            print('\033[93mWARNING: using all bkg available, not good!\033[0m')
        print(f'Fraction of real data candidates used for ML: {nCandBkg/nBkg:.5f}')

        # order of the concatenation must match the order of the labels below
        TotDf = pd.concat([BkgDf.iloc[:nCandBkg], PromptDf, FDDf], sort=True)
        if FDDf.empty:
            LabelsArray = np.array([0] * nCandBkg + [1] * nPrompt)
        else:
            LabelsArray = np.array([0] * nCandBkg + [1] * nPrompt + [2] * nFD)

        if test_f < 1:
            TrainSet, TestSet, yTrain, yTest = train_test_split(
                TotDf, LabelsArray, test_size=test_f, random_state=seed_split)
        else:
            # no training requested: everything goes into the test set
            TrainSet = pd.DataFrame()
            TestSet = TotDf.copy()
            yTrain = pd.Series()
            yTest = LabelsArray.copy()

        TrainTestData = [TrainSet, yTrain, TestSet, yTest]
        PromptDfSelForEff = TestSet[pd.Series(yTest).array == 1]
        FDDfSelForEff = pd.DataFrame() if FDDf.empty else TestSet[pd.Series(yTest).array == 2]
        del TotDf

    else:
        print(f'\033[91mERROR: {dataset_opt} is not a valid option!\033[0m')
        # non-zero status, otherwise the failure is reported as a success to the shell
        sys.exit(1)

    # plots
    VarsToDraw = inputCfg['plots']['plotting_columns']
    LegLabels = [
        inputCfg['output']['leg_labels']['Bkg'],
        inputCfg['output']['leg_labels']['Prompt']
    ]
    if inputCfg['output']['leg_labels']['FD'] is not None:
        LegLabels.append(inputCfg['output']['leg_labels']['FD'])
    OutputLabels = [
        inputCfg['output']['out_labels']['Bkg'],
        inputCfg['output']['out_labels']['Prompt']
    ]
    if inputCfg['output']['out_labels']['FD'] is not None:
        OutputLabels.append(inputCfg['output']['out_labels']['FD'])
    ListDf = [BkgDf, PromptDf] if FDDf.empty else [BkgDf, PromptDf, FDDf]
    #_____________________________________________
    plot_utils.plot_distr(ListDf, VarsToDraw, 100, LegLabels, figsize=(12, 7),
                          alpha=0.3, log=True, grid=False, density=True)
    plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96, hspace=0.55, wspace=0.55)
    plt.savefig(f'{OutPutDirPt}/DistributionsAll_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    plt.close('all')
    #_____________________________________________
    CorrMatrixFig = plot_utils.plot_corr(ListDf, VarsToDraw, LegLabels)
    for Fig, Lab in zip(CorrMatrixFig, OutputLabels):
        plt.figure(Fig.number)
        plt.subplots_adjust(left=0.2, bottom=0.25, right=0.95, top=0.9)
        Fig.savefig(f'{OutPutDirPt}/CorrMatrix{Lab}_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    # the figures are not returned: close them so that they do not pile up bin after bin
    plt.close('all')

    return TrainTestData, PromptDfSelForEff, FDDfSelForEff
def data_prep(inputCfg, iBin, PtMin, PtMax, OutPutDirPt, DataDf, PromptDf, FDDf): #pylint: disable=too-many-statements
    '''
    Select the candidates of one pT bin, build the train/test samples and draw
    the input-variable plots.

    Parameters
    ----------
    inputCfg: nested mapping with the configuration. Keys read here:
        ['data_prep']['filt_bkg_mass'] (query string defining the background),
        ['data_prep']['dataset_opt'] ('equal' or 'max_signal'),
        ['data_prep']['seed_split'], ['data_prep']['test_fraction'],
        ['ml']['bkg_mult'][iBin] (only for 'max_signal'), ['ml']['plotting_columns'],
        ['output']['leg_labels'] and ['output']['out_labels']
    iBin: index of the pT bin, used to pick the background multiplier
    PtMin, PtMax: limits of the pT bin (exclusive), applied on the 'pt_cand' column
    OutPutDirPt: directory in which the PDF plots are written
    DataDf, PromptDf, FDDf: pandas DataFrames with data, prompt and feed-down candidates

    Returns
    -------
    TrainTestData: list [TrainSet, yTrain, TestSet, yTest], labels are
        0 (background), 1 (prompt) and 2 (feed-down)
    DataDfPtSel: data candidates in the pT bin (before the background selection)
    PromptDfPtSelForEff: prompt candidates not used for the training
    FDDfPtSelForEff: feed-down candidates not used for the training

    Raises
    ------
    SystemExit (status 1) if dataset_opt is not a valid option
    '''
    DataDfPtSel = DataDf.query(f'{PtMin} < pt_cand < {PtMax}')
    BkgDfPtSel = DataDfPtSel.query(inputCfg['data_prep']['filt_bkg_mass'])
    PromptDfPtSel = PromptDf.query(f'{PtMin} < pt_cand < {PtMax}')
    FDDfPtSel = FDDf.query(f'{PtMin} < pt_cand < {PtMax}')
    nPrompt = len(PromptDfPtSel)
    nFD = len(FDDfPtSel)
    nBkg = len(BkgDfPtSel)
    print((f'Number of available candidates in {PtMin} < pT < {PtMax} GeV/c:\n Prompt: {nPrompt}'
           f'\n FD: {nFD}\n Bkg: {nBkg}'))

    dataset_opt = inputCfg['data_prep']['dataset_opt']
    seed_split = inputCfg['data_prep']['seed_split']
    test_f = inputCfg['data_prep']['test_fraction']

    if dataset_opt == 'equal':
        # same number of candidates for each class, driven by the least populated one
        nCandToKeep = min([nPrompt, nFD, nBkg])
        print(('Keep same number of prompt, FD, and background (minimum) for training and '
               f'testing ({1 - test_f}-{test_f}): {nCandToKeep}'))
        print(f'Fraction of real data candidates used for ML: {nCandToKeep/nBkg:.5f}')

        if nPrompt > nCandToKeep:
            # trailing space added: the two literals are concatenated into a single message
            print((f'Remaining prompt candidates ({nPrompt - nCandToKeep}) '
                   'will be used for the efficiency together with test set'))
        if nFD > nCandToKeep:
            print((f'Remaining FD candidates ({nFD - nCandToKeep}) will be used for the '
                   'efficiency together with test set'))

        # order of the concatenation must match the order of the labels below
        TotDfPtSel = pd.concat([
            BkgDfPtSel.iloc[:nCandToKeep],
            PromptDfPtSel.iloc[:nCandToKeep],
            FDDfPtSel.iloc[:nCandToKeep]
        ], sort=True)
        LabelsArray = [0] * nCandToKeep + [1] * nCandToKeep + [2] * nCandToKeep
        TrainSet, TestSet, yTrain, yTest = train_test_split(
            TotDfPtSel, LabelsArray, test_size=test_f, random_state=seed_split)
        TrainTestData = [TrainSet, yTrain, TestSet, yTest]
        CandTypeFlags = pd.Series(yTest)
        # candidates exceeding nCandToKeep are recovered for the efficiency
        PromptDfPtSelForEff = pd.concat([
            PromptDfPtSel.iloc[nCandToKeep:],
            TestSet[CandTypeFlags.values == 1]
        ], sort=False)
        FDDfPtSelForEff = pd.concat([
            FDDfPtSel.iloc[nCandToKeep:],
            TestSet[CandTypeFlags.values == 2]
        ], sort=False)
        del TotDfPtSel

    elif dataset_opt == 'max_signal':
        # keep all the signal and scale the background with the configured multiplier
        nCandBkg = round(inputCfg['ml']['bkg_mult'][iBin] * (nPrompt + nFD))
        print((f'Keep all prompt and FD and use {nCandBkg} bkg candidates for training and '
               f'testing ({1 - test_f}-{test_f})'))
        if nCandBkg >= nBkg:
            nCandBkg = nBkg
            print('WARNING: using all bkg available, not good!')
        print(f'Fraction of real data candidates used for ML: {nCandBkg/nBkg:.5f}')

        # order of the concatenation must match the order of the labels below
        TotDfPtSel = pd.concat(
            [BkgDfPtSel.iloc[:nCandBkg], PromptDfPtSel, FDDfPtSel], sort=True)
        LabelsArray = [0] * nCandBkg + [1] * nPrompt + [2] * nFD
        TrainSet, TestSet, yTrain, yTest = train_test_split(
            TotDfPtSel, LabelsArray, test_size=test_f, random_state=seed_split)
        TrainTestData = [TrainSet, yTrain, TestSet, yTest]
        CandTypeFlags = pd.Series(yTest)
        PromptDfPtSelForEff = TestSet[CandTypeFlags.values == 1]
        FDDfPtSelForEff = TestSet[CandTypeFlags.values == 2]
        del TotDfPtSel

    else:
        print(f'ERROR: {dataset_opt} is not a valid option!')
        # non-zero status, otherwise the failure is reported as a success to the shell
        sys.exit(1)

    # plots
    VarsToDraw = inputCfg['ml']['plotting_columns']
    LegLabels = inputCfg['output']['leg_labels']
    OutputLabels = inputCfg['output']['out_labels']
    #_____________________________________________
    plot_utils.plot_distr([BkgDfPtSel, PromptDfPtSel, FDDfPtSel], VarsToDraw,
                          (12, 7), 100, True, LegLabels, 0.3)
    plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96, hspace=0.55, wspace=0.55)
    plt.savefig(f'{OutPutDirPt}/DistributionsAll_pT_{PtMin}_{PtMax}.pdf')
    plt.close('all')
    #_____________________________________________
    CorrMatrixFig = plot_utils.plot_corr(
        [BkgDfPtSel, PromptDfPtSel, FDDfPtSel], VarsToDraw, LegLabels)
    for Fig, Lab in zip(CorrMatrixFig, OutputLabels):
        plt.figure(Fig.number)
        plt.subplots_adjust(left=0.2, bottom=0.25, right=0.95, top=0.9)
        Fig.savefig(f'{OutPutDirPt}/CorrMatrix{Lab}_pT_{PtMin}_{PtMax}.pdf')
    # the figures are not returned: close them so that they do not pile up bin after bin
    plt.close('all')

    del BkgDfPtSel, PromptDfPtSel, FDDfPtSel

    return TrainTestData, DataDfPtSel, PromptDfPtSelForEff, FDDfPtSelForEff
def test_plot_corr():
    """
    Check that the correlation-matrix plot of the signal and background
    samples is returned as a list
    """
    samples = [SIG_DF, BKG_DF]
    corr_figures = plot_utils.plot_corr(samples, SIG_DF.columns)
    assert isinstance(corr_figures, list)
# Ranges scanned by the Bayesian optimisation; a 'learning_rate' range of
# (0.01, 0.3) is currently disabled and therefore not part of the scan
HYP_RANGES = dict(
    max_depth=(5, 15),  # maximum depth of a single tree (regularization)
    n_estimators=(5, 10),  # number of boosting trees
)
MODEL.optimize_params_bayes(DATA, HYP_RANGES, 'roc_auc')

# train and test the model with the hyperparameters set by the optimisation
MODEL.train_test_model(DATA)
Y_PRED = MODEL.predict(DATA[2])

# BDT efficiency as a function of the threshold on the BDT score
EFFICIENCY, THRESHOLD = analysis_utils.bdt_efficiency_array(
    DATA[3],
    Y_PRED,
    n_points=10,
)

# --------------------------------------------
# PLOTTING
# --------------------------------------------
FEATURES_DISTRIBUTIONS_PLOT = plot_utils.plot_distr(
    [SIG_DF, BKG_DF],
    SIG_DF.columns,
)
CORRELATION_MATRIX_PLOT = plot_utils.plot_corr(
    [SIG_DF, BKG_DF],
    SIG_DF.columns,
)
BDT_OUTPUT_PLOT = plot_utils.plot_output_train_test(MODEL, DATA)
ROC_CURVE_PLOT = plot_utils.plot_roc(DATA[3], Y_PRED)
PRECISION_RECALL_PLOT = plot_utils.plot_precision_recall(DATA[3], Y_PRED)
BDT_EFFICIENCY_PLOT = plot_utils.plot_bdt_eff(THRESHOLD, EFFICIENCY)
FEATURES_IMPORTANCE = plot_utils.plot_feature_imp(TEST_SET, Y_TEST, MODEL)
plt.show()
# ---------------------------------------------
# Wrap the signal and background data frames into tree handlers; the original
# frames are deleted afterwards since only the handlers are used below
signal_tree_handler = TreeHandler()
background_tree_handler = TreeHandler()
signal_tree_handler.set_data_frame(df_signal_ct)
background_tree_handler.set_data_frame(df_background_ct)
del df_signal_ct, df_background_ct

# exist_ok avoids the check-then-create race, and missing parent directories
# (PLOT_DIR itself) are created instead of making the script fail
os.makedirs(f'{PLOT_DIR}/features', exist_ok=True)

# distributions of the training variables, background and signal overlaid
leg_labels = ['background', 'signal']
plot_distr = plot_utils.plot_distr(
    [background_tree_handler, signal_tree_handler],
    TRAINING_COLUMNS_LIST,
    bins=40,
    labels=leg_labels,
    log=True,
    density=True,
    figsize=(10, 12),
    alpha=0.5,
    grid=False)
plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96, hspace=0.50, wspace=0.50)
plt.tight_layout()
plt.savefig(f'{PLOT_DIR}/features/FeaturePlots.pdf')

# correlation matrix of the background sample
bkg_corr = plot_utils.plot_corr([background_tree_handler], TRAINING_COLUMNS_LIST, ['Background'])
bkg_corr.set_size_inches(6, 6)
plt.subplots_adjust(left=0.1, bottom=0.06, right=0.99, top=0.96, hspace=0.55, wspace=0.55)
plt.tight_layout()
plt.savefig(f'{PLOT_DIR}/features/BackgroundCorrelationMatrix.pdf')

# correlation matrix of the signal sample
sig_corr = plot_utils.plot_corr([signal_tree_handler], TRAINING_COLUMNS_LIST, ['Signal'])
sig_corr.set_size_inches(6, 6)
plt.tight_layout()
plt.savefig(f'{PLOT_DIR}/features/SignalCorrelationMatrix.pdf')

# everything has been written to file: release the figures
plt.close('all')
###########################################################
TRAINING_COLUMNS_LIST, bins=50, labels=leg_labels, log=True, density=True, figsize=(12, 7), alpha=0.3, grid=False) plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96, hspace=0.55, wspace=0.55) plt.savefig(f'{PLOT_DIR}/features/FeaturePlots_{bin}') plot_utils.plot_corr([background_tree_handler], TRAINING_COLUMNS_LIST, ['background']) plt.savefig( f'{PLOT_DIR}/features/BackgroundCorrelationMatrix_{bin}' ) plot_utils.plot_corr([signal_tree_handler], TRAINING_COLUMNS_LIST, ['signal']) plt.savefig( f'{PLOT_DIR}/features/SignalCorrelationMatrix_{bin}') plt.close('all') # split data into training and test set train_test_data = train_test_generator( [signal_tree_handler, background_tree_handler], [1, 0], test_size=0.5, random_state=RANDOM_STATE) print(
def do_hipe4mlplot(self):
    """
    Produce the hipe4ml control plots and save them as PDF files in
    self.dirmlplot: distributions of the training variables, correlation
    matrices, model output for train and test sets, ROC curves (test only
    and train vs test), precision-recall curve and feature importance.
    File names carry the pT bin limits self.p_binmin and self.p_binmax.
    """
    self.logger.info("Plotting hipe4ml model")

    leglabels = ["Background", "Prompt signal"]
    outputlabels = ["Bkg", "SigPrompt"]

    # --- distributions of the training variables
    plot_utils.plot_distr(
        [self.bkghandler, self.signalhandler], self.v_train, 100, leglabels)
    plt.subplots_adjust(
        left=0.06, bottom=0.06, right=0.99, top=0.96, hspace=0.55, wspace=0.55)
    plt.savefig(f'{self.dirmlplot}/DistributionsAll_pT_{self.p_binmin}_{self.p_binmax}.pdf')
    plt.close('all')

    # --- correlation matrices, one figure per sample
    corr_figures = plot_utils.plot_corr(
        [self.bkghandler, self.signalhandler], self.v_train, leglabels)
    for corr_fig, labb in zip(corr_figures, outputlabels):
        # make the figure current so that subplots_adjust acts on it
        plt.figure(corr_fig.number)
        plt.subplots_adjust(left=0.2, bottom=0.25, right=0.95, top=0.9)
        corr_fig.savefig(
            f'{self.dirmlplot}/CorrMatrix{labb}_pT_{self.p_binmin}_{self.p_binmax}.pdf')

    # --- model output for train and test sets
    plt.rcParams["figure.figsize"] = (10, 7)
    output_fig = plot_utils.plot_output_train_test(
        self.p_hipe4ml_model,
        self.traintestdata,
        80,
        self.raw_output_hipe4ml,
        leglabels,
        self.train_test_log_hipe4ml,
        density=True)
    output_fig.savefig(
        f'{self.dirmlplot}/MLOutputDistr_pT_{self.p_binmin}_{self.p_binmax}.pdf')

    # --- ROC curve on the test set
    plt.rcParams["figure.figsize"] = (10, 9)
    roc_fig = plot_utils.plot_roc(
        self.traintestdata[3],
        self.ypredtest_hipe4ml,
        None,
        leglabels,
        self.average_method_hipe4ml,
        self.roc_method_hipe4ml)
    roc_fig.savefig(
        f'{self.dirmlplot}/ROCCurveAll_pT_{self.p_binmin}_{self.p_binmax}.pdf')

    # --- ROC curves, train vs test
    plt.rcParams["figure.figsize"] = (10, 9)
    roc_train_test_fig = plot_utils.plot_roc_train_test(
        self.traintestdata[3],
        self.ypredtest_hipe4ml,
        self.traintestdata[1],
        self.ypredtrain_hipe4ml,
        None,
        leglabels,
        self.average_method_hipe4ml,
        self.roc_method_hipe4ml)
    roc_train_test_fig.savefig(
        f'{self.dirmlplot}/ROCCurveTrainTest_pT_{self.p_binmin}_{self.p_binmax}.pdf')

    # --- precision-recall curve on the test set
    precision_recall_fig = plot_utils.plot_precision_recall(
        self.traintestdata[3], self.ypredtest_hipe4ml, leglabels)
    precision_recall_fig.savefig(
        f'{self.dirmlplot}/PrecisionRecallAll_pT_{self.p_binmin}_{self.p_binmax}.pdf')

    # --- feature importance, one file per returned figure
    plt.rcParams["figure.figsize"] = (12, 7)
    importance_figures = plot_utils.plot_feature_imp(
        self.traintestdata[2][self.v_train],
        self.traintestdata[3],
        self.p_hipe4ml_model,
        leglabels)
    for i, importance_fig in enumerate(importance_figures):
        importance_fig.savefig(
            f'{self.dirmlplot}/FeatureImportanceOpt{i}_'
            f'pT_{self.p_binmin}_{self.p_binmax}.pdf')
labels=['Signal', "Background"], colors=["blue", "red"], log=True, density=True, figsize=(18, 13), alpha=0.3, grid=False) plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96, hspace=0.55, wspace=0.55) plt.savefig(results_ml_path + "/features_distributions.png", bbox_inches='tight') corr = pu.plot_corr([signalH, bkgH], training_columns + ["m"], ['Signal', "Background"]) corr[0].savefig(results_ml_path + "/correlations.png", bbox_inches='tight') print("---------------------------------------------") print("Data loaded. Training and testing ....") params_range = { "max_depth": (8, 18), "learning_rate": (0.07, 0.15), "n_estimators": (150, 250), "gamma": (0.3, 0.5), "min_child_weight": (3, 8), "subsample": (0.5, 1), "colsample_bytree": (0.3, 1), }