# Column selections indexed by mylistvariables (the X_* frames).
X_train = train_set[mylistvariables]
X_test = test_set[mylistvariables]

# Column selections indexed by myvariablesy (the y_* frames); presumably the
# label/target -- confirm against where myvariablesy is defined.
y_train = train_set[myvariablesy]
y_test = test_set[myvariablesy]

# Starts out empty; nothing in this section appends to it.
trainedmodels = []

# Optional diagnostic plots, produced only when docorrelation == 1, on the
# subset of train_set returned by filterdataframe_pt.
if docorrelation == 1:
    # Settings handed to filterdataframe_pt and splitdataframe_sigbkg below.
    var_pt, var_signal = "pt_cand_ML", "signal_ML"
    ptmin, ptmax = 5, 7

    # Presumably keeps rows with ptmin <= var_pt < ptmax -- exact boundary
    # handling lives in filterdataframe_pt; confirm there.
    train_set_ptsel = filterdataframe_pt(train_set, var_pt, ptmin, ptmax)
    train_set_ptsel_sig, train_set_ptsel_bkg = splitdataframe_sigbkg(
        train_set_ptsel, var_signal
    )

    # Every plotting helper here receives the literal "plots" (presumably the
    # output directory -- confirm in the helpers).
    vardistplot(
        train_set_ptsel_sig, train_set_ptsel_bkg, mylistvariablesall, "plots"
    )
    scatterplot(
        train_set_ptsel_sig,
        train_set_ptsel_bkg,
        mylistvariablesx,
        mylistvariablesy,
        "plots",
    )
    correlationmatrix(train_set_ptsel_sig, "plots", "signal")
    correlationmatrix(train_set_ptsel_bkg, "plots", "background")

# When doStandard == 1, X_train is replaced by the output of
# GetDataFrameStandardised. X_test is not touched here.
if doStandard == 1:
    X_train = GetDataFrameStandardised(X_train)

# When doPCA == 1, X_train is replaced by the first return value of
# GetPCADataFrameAndPC and the second one is passed to plotvariancePCA.
if doPCA == 1:
    n_pca = 9  # presumably the number of principal components -- confirm
    X_train, pca = GetPCADataFrameAndPC(X_train, n_pca)
    plotvariancePCA(pca, "plots")

    # NOTE(review): these dumps run only when doPCA == 1, and they write
    # train_set / test_set rather than the transformed X_train, while the
    # read_pickle calls that follow are unconditional -- confirm intended.
    # "%" binds tighter than "+", so only the file name part is formatted.
    train_set.to_pickle(dataframe + ("/dataframetrainsampleN%s.pkl" % suffix))
    test_set.to_pickle(dataframe + ("/dataframetestsampleN%s.pkl" % suffix))

# Reload the train/test frames from the pickles under the `dataframe` path.
# "%" binds tighter than "+", so only the file name part is formatted.
# pandas.read_pickle unpickles arbitrary objects: only load trusted files.
train_set = pd.read_pickle(
    dataframe + ("/dataframetrainsampleN%s.pkl" % suffix)
)
test_set = pd.read_pickle(
    dataframe + ("/dataframetestsampleN%s.pkl" % suffix)
)

# Number of rows in the reloaded training frame.
print("dimension of the dataset", len(train_set))

# Re-derive the training selections from the reloaded frame; this overwrites
# any earlier X_train / y_train. X_test / y_test are not re-derived here.
X_train = train_set[mylistvariables]
y_train = train_set[myvariablesy]

# Reset to an empty list.
trainedmodels = []

# Optional diagnostic plots, produced only when docorrelation == 1.
if docorrelation == 1:
    # The split is done on the whole train_set using myvariablesy; the
    # "_ptsel" part of the names is kept even though no filterdataframe_pt
    # call happens in this branch.
    train_set_ptsel_sig, train_set_ptsel_bkg = splitdataframe_sigbkg(
        train_set, myvariablesy
    )

    # All plotting helpers here receive plotdir.
    vardistplot(
        train_set_ptsel_sig, train_set_ptsel_bkg, mylistvariablesall, plotdir
    )
    scatterplot(
        train_set_ptsel_sig,
        train_set_ptsel_bkg,
        mylistvariablesx,
        mylistvariablesy,
        plotdir,
    )
    correlationmatrix(train_set_ptsel_sig, plotdir, "signal")
    correlationmatrix(train_set_ptsel_bkg, plotdir, "background")

# When doStandard == 1, X_train is replaced by the output of
# GetDataFrameStandardised.
if doStandard == 1:
    X_train = GetDataFrameStandardised(X_train)

# When doPCA == 1, X_train is replaced by the first return value of
# GetPCADataFrameAndPC and the second one is passed to plotvariancePCA
# together with plotdir.
if doPCA == 1:
    n_pca = 5  # presumably the number of principal components -- confirm
    X_train, pca = GetPCADataFrameAndPC(X_train, n_pca)
    plotvariancePCA(pca, plotdir)