Example #1
    def _fit_resample(self, X, y):
        n_samples = X.shape[0]

        # convert y to z-scores
        y_z = (y - y.mean()) / y.std()

        # bucket sample indices into pseudo-classes by z-score thresholds
        index0 = np.arange(n_samples)
        index_negative = index0[y_z > self.negative_thres]
        index_positive = index0[y_z <= self.positive_thres]
        index_unclassified = [x for x in index0
                              if x not in index_negative
                              and x not in index_positive]

        y_z[index_negative] = 0
        y_z[index_positive] = 1
        y_z[index_unclassified] = -1

        ros = RandomOverSampler(
            sampling_strategy=self.sampling_strategy,
            random_state=self.random_state)
        # the deprecated ``ratio`` alias of ``sampling_strategy`` (removed in
        # imbalanced-learn 0.6) is not passed here
        _, _ = ros.fit_resample(X, y_z)
        sample_indices = ros.sample_indices_

        print("Before sampler: %s. Total after: %s"
              % (Counter(y_z), sample_indices.shape))

        self.sample_indices_ = np.array(sample_indices)

        if self.return_indices:
            return (safe_indexing(X, sample_indices),
                    safe_indexing(y, sample_indices),
                    sample_indices)
        return (safe_indexing(X, sample_indices),
                safe_indexing(y, sample_indices))
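
The bucketing above is easiest to see in isolation. Below is a standalone sketch of the same z-score thresholding with illustrative data; negative_thres and positive_thres are attributes of the surrounding class, and the values used here are assumptions rather than anything from the original:

import numpy as np

y = np.array([3.0, 4.0, 5.0, 6.0, 20.0])
y_z = (y - y.mean()) / y.std()

negative_thres, positive_thres = 1.0, -0.5  # illustrative threshold values
labels = np.full(y_z.shape, -1.0)           # -1 marks unclassified samples
labels[y_z > negative_thres] = 0            # same comparisons as in _fit_resample
labels[y_z <= positive_thres] = 1
print(labels)                               # [ 1.  1. -1. -1.  0.]
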
def test_multiclass_fit_resample():
    y = Y.copy()
    y[5] = 2
    y[6] = 2
    ros = RandomOverSampler(random_state=RND_SEED)
    X_resampled, y_resampled = ros.fit_resample(X, y)
    count_y_res = Counter(y_resampled)
    assert count_y_res[0] == 5
    assert count_y_res[1] == 5
    assert count_y_res[2] == 5
def test_random_over_sampling_heterogeneous_data():
    X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
                        dtype=object)
    y = np.array([0, 0, 1])
    ros = RandomOverSampler(random_state=RND_SEED)
    X_res, y_res = ros.fit_resample(X_hetero, y)

    assert X_res.shape[0] == 4
    assert y_res.shape[0] == 4
    assert X_res.dtype == object
    assert X_res[-1, 0] in X_hetero[:, 0]
def test_ros_fit_resample_half():
    sampling_strategy = {0: 3, 1: 7}
    ros = RandomOverSampler(
        sampling_strategy=sampling_strategy, random_state=RND_SEED)
    X_resampled, y_resampled = ros.fit_resample(X, Y)
    X_gt = np.array([[0.04352327, -0.20515826],
                     [0.92923648, 0.76103773],
                     [0.20792588, 1.49407907],
                     [0.47104475, 0.44386323],
                     [0.22950086, 0.33367433],
                     [0.15490546, 0.3130677],
                     [0.09125309, -0.85409574],
                     [0.12372842, 0.6536186],
                     [0.13347175, 0.12167502],
                     [0.094035, -2.55298982]])
    y_gt = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1])
    assert_allclose(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_random_over_sampling_return_indices():
    ros = RandomOverSampler(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, sample_indices = ros.fit_resample(X, Y)
    X_gt = np.array([[0.04352327, -0.20515826],
                     [0.92923648, 0.76103773],
                     [0.20792588, 1.49407907],
                     [0.47104475, 0.44386323],
                     [0.22950086, 0.33367433],
                     [0.15490546, 0.3130677],
                     [0.09125309, -0.85409574],
                     [0.12372842, 0.6536186],
                     [0.13347175, 0.12167502],
                     [0.094035, -2.55298982],
                     [0.92923648, 0.76103773],
                     [0.47104475, 0.44386323],
                     [0.92923648, 0.76103773],
                     [0.47104475, 0.44386323]])
    y_gt = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])
    assert_allclose(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(np.sort(np.unique(sample_indices)), np.arange(len(X)))
Example #6
SEED=0xDEADBEEF

y_col = 'add'
X_cols = ['pct_contrib','turnover','VWAP','vol','VWMC','SPTSXComp']
all_cols = X_cols + [y_col]
X = filtered[X_cols]
y = filtered[y_col]

X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X.values, y.values, test_size=0.2, random_state=SEED)

filtered[all_cols].to_sql('model_inputs', conn, if_exists='replace', index=False)

#oversampler = SMOTE(random_state=SEED)
oversampler = RandomOverSampler(random_state=SEED)

X_train_resample, y_train_resample = oversampler.fit_resample(X_train, y_train)

print(len(X_train), len(X_test))

#log_clf = LogisticRegression()
#log_clf = RandomForestClassifier()
#log_clf = xgb.XGBClassifier(max_depth=4, min_child_weight=50, learning_rate=0.01, n_estimators=50, gamma=1)
log_clf = svm.LinearSVC()
log_clf.fit(X_train_resample, y_train_resample)

print(log_clf.score(X_train, y_train))

y_pred = log_clf.predict(X_test)
try:
    y_pred_prob = log_clf.predict_proba(X_test)
except AttributeError:
    # LinearSVC exposes no predict_proba; fall back to decision_function scores
    y_pred_prob = log_clf.decision_function(X_test)

X_res, y_res = rus.fit_resample(binary_X, binary_y)
print('Information of the iris data set after making it '
      'balanced using a float and an under-sampling method: \n '
      'sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# For **over-sampling methods**, it corresponds to the ratio
# :math:`\alpha_{os}` defined by :math:`N_{rm} = \alpha_{os} \times N_{M}`
# where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the
# minority class after resampling and the number of samples in the majority
# class, respectively.

ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(binary_X, binary_y)
print('Information of the iris data set after making it '
      'balanced using a float and an over-sampling method: \n '
      'sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# ``sampling_strategy`` as a ``str``
# ..................................
#
# ``sampling_strategy`` can be given as a string which specifies the class
# targeted by the resampling. With under- and over-sampling, the number of
# samples will be equalized.
#
# Note that we are using multiple classes from now on.
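
###############################################################################
# A minimal sketch of the string option (the toy multiclass data built below
# with ``make_classification`` is illustrative, not part of the original
# script): ``'minority'`` resamples only the least-represented class,
# ``'not majority'`` every class except the largest, and ``'all'`` every
# class.

from collections import Counter

from sklearn.datasets import make_classification

from imblearn.over_sampling import RandomOverSampler

X_multi, y_multi = make_classification(n_samples=150, n_classes=3,
                                       weights=[0.1, 0.3, 0.6],
                                       n_informative=4, random_state=0)

for strategy in ('minority', 'not majority', 'all'):
    # each string names the set of classes the over-sampler will grow
    X_res, y_res = RandomOverSampler(
        sampling_strategy=strategy).fit_resample(X_multi, y_multi)
    print('sampling_strategy={!r} -> y: {}'.format(strategy, Counter(y_res)))
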
Example #8
y_pred = logreg.predict(X_test)

print('Accuracy : {0:0.5f}'.format(accuracy_score(y_test, y_pred)))
print('AUC : {0:0.5f}'.format(roc_auc_score(y_test, y_pred)))
print('Precision : {0:0.5f}'.format(precision_score(y_test, y_pred)))
print('Recall : {0:0.5f}'.format(recall_score(y_test, y_pred)))
print('F1 : {0:0.5f}'.format(f1_score(y_test, y_pred)))

"""# オーバーサンプリング"""

from imblearn.over_sampling import RandomOverSampler

print('Original dataset shape %s' % Counter(y))

ros = RandomOverSampler(random_state=my_random_state)
X_res2, y_res2 = ros.fit_resample(X, y)

print('Resampled dataset shape %s' % Counter(y_res2))

X_train, X_test, y_train, y_test = train_test_split(X_res2, y_res2, test_size=0.3, shuffle=True, random_state=my_random_state)

# Oversampling with Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print('Accuracy : {0:0.5f}'.format(accuracy_score(y_test, y_pred)))
print('AUC : {0:0.5f}'.format(roc_auc_score(y_test, y_pred)))
print('Precision : {0:0.5f}'.format(precision_score(y_test, y_pred)))
print('Recall : {0:0.5f}'.format(recall_score(y_test, y_pred)))

def switch_val(val):
    # condition elided in the source snippet; maps approval status to 1/0
    if ...:
        return 1
    else:
        return 0

df['Approval'] = df['Approval'].apply(switch_val)

## TRAIN/TEST SPLIT

X = dtm.loc[:, dtm.columns != 'Approval']
y = dtm['Approval']
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=100)

## RESAMPLE DATASET

ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(train_X, train_y)

## SGDClassifier GridSearch

grid = {
    #'alpha': [1e-4, 1e-3, 1e-2, 1e-1],
    'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    #'l1_ratio': [0.15, 0.30, 0.45, 0.60, 0.75, 0.90]
    #'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    #'class_weight': ['balanced']
}

paramGrid = ParameterGrid(grid)

bestModel, bestScore, allModels, allScores = pf.bestFit(SGDClassifier, paramGrid, X_resampled, y_resampled, test_X, test_y, metric=roc_auc_score, scoreLabel="AUC", n_jobs=1)
Example #10
#scaling
sdscaler = StandardScaler()
sdscaler.fit(data)
sdscaler_data = sdscaler.transform(data)
sdscaler_pd = pd.DataFrame(sdscaler_data, columns=data.columns)

# hold out a test set for performance comparison
X_train, X_test, Y_train, Y_test = train_test_split(sdscaler_pd,
                                                    label,
                                                    test_size=0.1,
                                                    shuffle=True,
                                                    random_state=5)
ros = RandomOverSampler(random_state=2019)
rus = RandomUnderSampler(random_state=2019)
oversampled_data, oversampled_label = ros.fit_resample(X_train, Y_train)
undersampled_data, undersampled_label = rus.fit_resample(X_train, Y_train)
oversampled_data = pd.DataFrame(oversampled_data, columns=data.columns)
undersampled_data = pd.DataFrame(undersampled_data, columns=data.columns)

print('Class distribution of the original data\n{}'.format(pd.get_dummies(Y_train).sum()))
print('\nClass distribution of oversampled_data\n{}'.format(
    pd.get_dummies(oversampled_label).sum()))
print('\nClass distribution of undersampled_data\n{}'.format(
    pd.get_dummies(undersampled_label).sum()))


# performance comparison
def train_and_test(model, X_train, Y_train, X_test, Y_test):
    model.fit(X_train, Y_train)
    pred = model.predict(X_test)
Example #11
#######
X_ = np.array(feature)[:, 1:]

X = scale(X_)

y = np.array(label.values.ravel())

# choose the method
option = sys.argv[1]
# the input sequence
file = sys.argv[2]

if (option == "1"):
    #Random over sampling method
    ros = RandomOverSampler(random_state=0)
    X_resampled, y_resampled = ros.fit_resample(X, y)
    csv_X = pd.DataFrame(data=X_resampled)
    csv_y = pd.DataFrame(data=y_resampled)
    csv_X.to_csv('ros_feature.csv', header=False, index=False)
    csv_y.to_csv('ros_label.csv', header=False, index=False)

if (option == "2"):
    #SMOTE method
    X_resampled, y_resampled = SMOTE().fit_resample(X, y)
    csv_X = pd.DataFrame(data=X_resampled)
    csv_y = pd.DataFrame(data=y_resampled)
    csv_X.to_csv('ros_feature.csv', header=False, index=False)
    csv_y.to_csv('ros_label.csv', header=False, index=False)

if (option == "3"):
    #ADASYN method
Example #12
base_col_names = col_names[0:13]  # for the baseline model: bank data plus morning/noon/evening data
df_fillna = df.fillna(0)  # fill NA with 0: no consumption is counted as 0
X = df_fillna[col_names]
y = df_fillna.default_geq_1  # Target variable
X_base = df_fillna[base_col_names]
y_base = df_fillna.default_geq_1  # Target variable
random_state = 1234
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    random_state=random_state)
# If random_state = None (the default), a random seed is chosen, so every run
# produces a different split. Fixing random_state to the same value lets anyone
# rerunning the code reproduce exactly the same split.
X_base_train, X_base_test, y_base_train, y_base_test = train_test_split(
    X_base, y_base, test_size=0.30)
ros = RandomOverSampler(random_state=0)
X_train, y_train = ros.fit_resample(X_train, y_train)
X_base_train, y_base_train = ros.fit_resample(X_base_train, y_base_train)
min_max_scaler = MinMaxScaler()
#X_train = min_max_scaler.fit_transform(X_train)
#X_test = min_max_scaler.fit_transform(X_test)
#sc = StandardScaler()
#X_train = sc.fit_transform(X_train)
#X_test = sc.fit_transform(X_test)

#numerical_columns=['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle', 'homeOwnership']

#Specifying the parameter
n_estimators = 100
learning_rate = 0.1
max_depth = 6
num_leaves = 16
pyplot.savefig("Plot_PRT.jpeg", dpi = 800, facecolor = "white")


##### Rebalance data with over-sampling -------------------------------------------------------------------

# Fit to training data and evaluate performance on test data (no rebalancing)
clfLrgC = LogisticRegression(solver = "lbfgs", class_weight = {0:0.01, 1:0.99})
model_cv_bal(clfLrgC, 1000)

# Oversample minority class at 1:1 ratio
# This block of code does not influence models, and just shows how over-sampling works
os = RandomOverSampler(sampling_strategy = "minority")
os_x, os_y = os.fit_resample(train_x, train_y)
print(Counter(train_y))
print(Counter(os_y))

# Fit to training data and evaluate performance on test data (1:1 rebalancing)
pipeline = Pipeline([("samp", RandomOverSampler(sampling_strategy = "minority")),
                     ("model", LogisticRegression(solver = "lbfgs", class_weight = {0:0.01, 1:0.99}))])
model_cv_bal(pipeline, 1000)

# Oversample minority class at 1:4 ratio
# This block of code does not influence models, and just shows how over-sampling works
os = RandomOverSampler(sampling_strategy = 0.25)
os_x, os_y = os.fit_resample(train_x, train_y)
print(Counter(train_y))
print(Counter(os_y))
Example #14
def oversampling(x, lbl):  # random over-sampling via imblearn, then shuffle
    from imblearn.over_sampling import RandomOverSampler
    from sklearn.utils import shuffle
    ros = RandomOverSampler(random_state=87)
    x_res, y_res = ros.fit_resample(x, lbl)
    return shuffle(x_res, y_res, random_state=87)
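
A quick usage sketch with toy arrays (the data below is illustrative): the helper balances both classes and returns a shuffled copy.

import numpy as np

x = np.array([[0.0], [1.0], [2.0], [3.0]])
lbl = np.array([0, 0, 0, 1])
x_bal, lbl_bal = oversampling(x, lbl)
print(len(x_bal), np.bincount(lbl_bal))  # 6 samples, classes balanced 3/3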
Example #15
    cln = [
        lem.lemmatize(word) for word in cln
        if word not in stopwords.words('english')
    ]
    cln = ' '.join(cln)
    corpus.append(cln)

from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()
X = vec.fit_transform(corpus).toarray()

Y = pd.get_dummies(Y, drop_first=True)

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()  # named ros to avoid shadowing the os module
X, Y = ros.fit_resample(X, Y)

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)

from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train, Y_train)
pred = mnb.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(Y_test, pred))
print("\n")
print(classification_report(Y_test, pred))

filename = 'spam-classifier.pkl'
Example #16
def main(n_comp, n_samples_per_cat, Threshold_prob):
    seconds = time.time()

    file_test = "bridged_10k.csv"
    file_train = "train.csv"
    file_valid = "valid.csv"

    load_data(file_test, file_train, file_valid, n_samples_per_cat)

    with open('Simplified_smuto.pkl', 'rb') as fl:
        tt = pickle.load(fl)
    X_train, Y_train, X_test, Y_test, X_valid, Y_valid, Training_map, Validation_map = tt

    X_train, Y_train, X_valid, Y_valid, Training_map, Validation_map = cleaning_data_from_topics(
        X_train, Y_train, X_valid, Y_valid, Training_map, Validation_map,
        n_samples_per_cat)

    # Validation = Part of training !!!
    # Validation_map = Training_map
    # X_train, X_valid, Y_train, Y_valid = train_test_split(X_train, Y_train, random_state=42)

    # Vectorization
    model = load_Glove_Model()
    X_train_vect, X_train, Y_train = vect(X_train, Y_train, model)
    X_test_vect, X_test, Y_test = vect(X_test, Y_test, model)
    X_valid_vect, X_valid, Y_valid = vect(X_valid, Y_valid, model)

    # Making vectors positive
    min_val = 3.1
    X_train_vect = [x + min_val for x in X_train_vect]
    X_test_vect = [x + min_val for x in X_test_vect]
    X_valid_vect = [x + min_val for x in X_valid_vect]
    print('Vectorization is done')

    # # SMOTE
    # sm = SMOTE(random_state=42)
    # X_train_vect, Y_train = sm.fit_resample(X_train_vect, Y_train)

    ros = RandomOverSampler(random_state=42)
    X_train_vect, Y_train = ros.fit_resample(X_train_vect, Y_train)

    unique, counts = np.unique(Y_train, return_counts=True)
    print('Train data distribution after oversampling:{}'.format(
        dict(zip(np.array(unique), np.array(counts)))))

    unique, counts = np.unique(Y_valid, return_counts=True)
    print('Valid data distribution:{}'.format(
        dict(zip(np.array(unique), np.array(counts)))))

    # Dimensionality reduction using NMF
    X_train_vect, X_test_vect, X_valid_vect = dimensionality_reduction(
        X_train_vect, X_test_vect, X_valid_vect, Y_train, n_comp)
    print('Dimensionality reduction is done')

    # Classification using SVC
    Y_pred, Y_train_pred, Y_valid_pred, Y_pred_prob, Y_train_pred_prob, Y_valid_pred_prob = classification(
        X_train_vect, X_test_vect, X_valid_vect, Y_train, Threshold_prob)

    data_total = [
        X_train, Y_train, X_test, Y_test, X_valid, Y_valid, Y_pred,
        Y_train_pred, Y_valid_pred, Y_pred_prob, Y_train_pred_prob,
        Y_valid_pred_prob, Training_map, Validation_map
    ]

    with open('Estimated.pkl', 'wb') as fs:
        pickle.dump(data_total, fs)

    with open('Estimated.pkl', 'rb') as fl:
        tt = pickle.load(fl)
    X_train, Y_train, X_test, Y_test, X_valid, Y_valid, Y_pred, Y_train_pred, Y_valid_pred,\
    Y_pred_prob, Y_train_pred_prob, Y_valid_pred_prob, Training_map, Validation_map = tt

    # print(np.unique(Y_train))
    # Diction = keys_output(Y_test, Y_pred, Y_pred_prob, Threshold=0.026)
    # final_excel(Diction)
    acc_valid, acc_train = efficiency_estimation(Y_valid_pred, Y_train_pred,
                                                 X_valid, X_train,
                                                 Training_map, Validation_map)

    unique, counts = np.unique(Y_valid, return_counts=True)
    print('Validation data distribution:{}'.format(
        dict(zip(np.array(unique), np.array(counts)))))

    X = "i" * len(Y_valid_pred)
    X, Y_valid_pred = flattening(X, Y_valid_pred)
    unique, counts = np.unique(Y_valid_pred, return_counts=True)
    print('Predicted Validation data distribution:{}'.format(
        dict(zip(np.array(unique), np.array(counts)))))

    print('Predicted categories amount: ', len(Y_valid_pred))
    print('True categories amount: ', len(Y_valid))
    mlb = MultiLabelBinarizer()
    mlb = mlb.fit(Y_valid)
    Y_valid = mlb.transform(Y_valid)
    Y_valid_pred = mlb.transform(Y_valid_pred)

    # matrix = confusion_matrix(Y_valid.argmax(axis =1), Y_valid_pred.argmax(axis =1))
    # print('Validation confusion matrix:'.format(matrix))

    return acc_valid, acc_train, seconds
Example #17
# # 11.7 Over-sampling and under-sampling
# 11.7.1 Over-sampling
import pandas as pd
data = pd.read_excel("信用卡数据.xlsx")
data.head()

X = data.drop(columns='分类')
y = data['分类']

from collections import Counter
Counter(y)

# (1) Random over-sampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_oversampled, y_oversampled = ros.fit_resample(X, y)

print(Counter(y_oversampled))

print(X_oversampled.shape)

# (2) SMOTE over-sampling
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=0)
X_smotesampled, y_smotesampled = smote.fit_resample(X, y)

print(Counter(y_smotesampled))

# 11.7.2 Under-sampling
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X_undersampled, y_undersampled = rus.fit_resample(X, y)

print(Counter(y_undersampled))

print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply the random over-sampling
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                 alpha=0.5)
c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                 alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5)


class DataProcessor:
    """Refactor of processing.py to reuse scaler from train data when scaling test data
    """
    def __init__(self, train_data, **kwargs):
        """Constructor, used as a base for using the builder methods
        
        Arguments:
            train_data  -- Dataframe with full training data without any preprocessing done
            labels -- Data labels in kwargs for convenience when we have data and labels separate
        """

        self.target = 'target'
        self.id_column = "ID_code"

        self.scaler = None
        self.sampler = None

        self.train_data = self.__remove_columns(train_data, self.id_column)
        if "labels" in kwargs:
            self.train_data[self.target] = kwargs["labels"]

    def with_scaling(self):
        """Builder method to add data scaling to processor. Can be used in method chaining.
        
        Returns:
            Instance of the processor it is called on
        """

        self.scaler = StandardScaler()
        X, _ = self.__xy_split(self.train_data)
        self.scaler.fit(X)
        return self

    def with_undersampling(self, seed=0):
        """Builder method to add undersampling. Overwrites oversampling. Can be used in method chaining.
        
        Returns:
            Instance of the processor it is called on
        """
        self.sampler = RandomUnderSampler(random_state=seed)
        return self

    def with_oversampling(self, seed=0):
        """Builder method to add oversampling. Overwrites undersampling. Can be used in method chaining.
        
        Returns:
            Instance of the processor it is called on
        """
        self.sampler = RandomOverSampler(random_state=seed)
        return self

    def process_train(self):
        """Performs enabled data processing tasks for the train dataset.
        
        Returns:
            DataFrame with processed training data + array with training labels 
                + input size (number of columns of processed training data without target or ID columns)
        """

        X, y = self.__xy_split(self.train_data)
        columns = X.columns

        if self.sampler is not None:
            X, y = self.sampler.fit_resample(X, y)
            X = pd.DataFrame(X, columns=columns)
            y = np.array(y, dtype=np.float64)

        if self.scaler is not None:
            X = self.scaler.transform(X)
            X = pd.DataFrame(X, columns=columns)

        return X, y, len(columns)

    def process_data(self, data, **kwargs):
        """Performs enabled data processing tasks for a given dataset. For example, after
        splitting the training data into train/val datasets, this method can be used to
        process the val dataset.

        Arguments:
            data -- Dataframe with full data without any preprocessing done. Should also have labels.
            labels -- Data labels in kwargs for convenience when we have data and labels separate
        
        Returns:
            DataFrame with processed data + array with labels + input size (number of columns of processed data without target or ID columns)
        """
        data = self.__remove_columns(data, self.id_column)
        if "labels" in kwargs:
            data[self.target] = kwargs["labels"]
        X, y = self.__xy_split(data)
        columns = X.columns

        if self.scaler is not None:
            X = self.scaler.transform(X)
            X = pd.DataFrame(X, columns=columns)

        return X, y, len(columns)

    def process_test(self, data):
        """Performs enabled data processing tasks for a given test dataset. This should be used for the test set from kaggle.

        Arguments:
            data -- Dataframe with full data without any preprocessing done. It should NOT have any labels.
        
        Returns:
            DataFrame with processed data + None (representing the labels, is kept for compatibility reasons) 
                + input size (number of columns of processed data without target or ID columns)
        """
        data = self.__remove_columns(data, self.id_column)
        X = data
        columns = X.columns

        if self.scaler is not None:
            X = self.scaler.transform(X)
            X = pd.DataFrame(X, columns=columns)

        return X, None, len(columns)

    def __remove_columns(self, data, columns):
        return data.drop(columns, axis=1, inplace=False)

    def __xy_split(self, data):
        X = self.__remove_columns(data, self.target)
        y = np.array(data[self.target], dtype=np.float64)
        return X, y
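
A short usage sketch of the builder chaining described in the docstrings above. The tiny frame below is illustrative; any DataFrame with an "ID_code" column and a "target" column fits the processor's expectations.

import numpy as np
import pandas as pd

train_df = pd.DataFrame({
    'ID_code': ['a', 'b', 'c', 'd'],
    'var_0': [0.1, 0.2, 0.3, 0.4],
    'var_1': [1.0, 2.0, 3.0, 4.0],
    'target': [0, 0, 0, 1],
})

# chain scaling and oversampling, then process the training data
processor = DataProcessor(train_df).with_scaling().with_oversampling(seed=42)
X_train, y_train, input_size = processor.process_train()
print(X_train.shape, np.bincount(y_train.astype(int)), input_size)  # (6, 2) [3 3] 2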