# Pick the `mylistvariables` columns and the `myvariablesy` column out of
# the training and test samples.
X_train, y_train = train_set[mylistvariables], train_set[myvariablesy]
X_test, y_test = test_set[mylistvariables], test_set[myvariablesy]

# Starts empty; nothing in this fragment appends to it.
trainedmodels = []

# NOTE(review): the source arrived with its indentation collapsed, so the
# extent of each `if` body below is inferred from the statements' data
# dependencies -- confirm against the upstream file.

# Optional diagnostic plots on a pt-restricted slice of the training sample.
if docorrelation == 1:
    ptmin, ptmax = 5, 7
    var_pt, var_signal = "pt_cand_ML", "signal_ML"

    # Restrict to the [ptmin, ptmax] selection, then split using the
    # `var_signal` column (presumably a signal/background flag -- verify
    # against splitdataframe_sigbkg).
    train_set_ptsel = filterdataframe_pt(train_set, var_pt, ptmin, ptmax)
    train_set_ptsel_sig, train_set_ptsel_bkg = splitdataframe_sigbkg(
        train_set_ptsel,
        var_signal,
    )

    # Every plotting helper is handed the same "plots" output argument.
    vardistplot(
        train_set_ptsel_sig,
        train_set_ptsel_bkg,
        mylistvariablesall,
        "plots",
    )
    scatterplot(
        train_set_ptsel_sig,
        train_set_ptsel_bkg,
        mylistvariablesx,
        mylistvariablesy,
        "plots",
    )
    correlationmatrix(train_set_ptsel_sig, "plots", "signal")
    correlationmatrix(train_set_ptsel_bkg, "plots", "background")

# Optional standardisation; only X_train is transformed in this fragment.
if doStandard == 1:
    X_train = GetDataFrameStandardised(X_train)

# Optional PCA; X_train is replaced by the helper's first return value.
if doPCA == 1:
    n_pca = 9
    X_train, pca = GetPCADataFrameAndPC(X_train, n_pca)
    plotvariancePCA(pca, "plots")
# Write both samples to pickle files under `dataframe`, then read them
# straight back so the rest of the script works on the reloaded frames.
# `%` binds tighter than `+`, so only the file-name part is formatted.
train_pkl_path = dataframe + "/dataframetrainsampleN%s.pkl" % (suffix)
test_pkl_path = dataframe + "/dataframetestsampleN%s.pkl" % (suffix)

train_set.to_pickle(train_pkl_path)
test_set.to_pickle(test_pkl_path)
train_set = pd.read_pickle(train_pkl_path)
test_set = pd.read_pickle(test_pkl_path)

print("dimension of the dataset", len(train_set))

# Pick the `mylistvariables` columns and the `myvariablesy` column out of
# the reloaded training sample.
X_train, y_train = train_set[mylistvariables], train_set[myvariablesy]

# Starts empty; nothing in this fragment appends to it.
trainedmodels = []

# NOTE(review): the source arrived with its indentation collapsed, so the
# extent of each `if` body below is inferred from the statements' data
# dependencies -- confirm against the upstream file.

# Optional diagnostic plots on the whole training sample, split using the
# `myvariablesy` column.
if docorrelation == 1:
    train_set_ptsel_sig, train_set_ptsel_bkg = splitdataframe_sigbkg(
        train_set,
        myvariablesy,
    )

    # Every plotting helper is handed the same `plotdir` argument.
    vardistplot(
        train_set_ptsel_sig,
        train_set_ptsel_bkg,
        mylistvariablesall,
        plotdir,
    )
    scatterplot(
        train_set_ptsel_sig,
        train_set_ptsel_bkg,
        mylistvariablesx,
        mylistvariablesy,
        plotdir,
    )
    correlationmatrix(train_set_ptsel_sig, plotdir, "signal")
    correlationmatrix(train_set_ptsel_bkg, plotdir, "background")

# Optional standardisation of the training features.
if doStandard == 1:
    X_train = GetDataFrameStandardised(X_train)

# Optional PCA; X_train is replaced by the helper's first return value.
if doPCA == 1:
    n_pca = 5
    X_train, pca = GetPCADataFrameAndPC(X_train, n_pca)
    plotvariancePCA(pca, plotdir)