Пример #1
0
def smote_data(data_train, target_train):
    y = np.bincount(target_train)
    ratio = 1.5 # float(y[2] + y[1]) / float(y[0])
  #  smote = SMOTE(ratio=ratio, verbose=True, kind='regular')
  #  smox, smoy = smote.fit_transform(data_train, target_train)
    OS = OverSampler(ratio=ratio, verbose=True)
    osx, osy = OS.fit_transform(data_train, target_train)
    return osx, osy
def oversample_data(X_t, y_t, ratio):
    x_columns = X_t.columns

    X_t = X_t.reset_index(drop=True).as_matrix()
    y_t = y_t.reset_index(drop=True).as_matrix()

    smote = OverSampler(ratio=ratio, verbose=False)
    smox, smoy = smote.fit_transform(X_t, y_t)
    X_t = pd.DataFrame(smox, columns=x_columns)
    y_t = pd.Series(smoy)
    return X_t, y_t
 def split(self, x_data, y_data):
     Xt, Yt, Xv, Yv = super(OverUnderSplitter, self).split(x_data, y_data)
     Xt_smote, Yt_smote = OverSampler(
         ratio=self._over_sample).fit_transform(Xt.as_matrix(),
                                                Yt.as_matrix())
     Xt_smote, Yt_smote = UnderSampler(
         ratio=self._under_sample).fit_transform(Xt_smote, Yt_smote)
     return Xt_smote, Yt_smote, Xv, Yv
Пример #4
0
def test_rest(x, y):

    print('Random under-sampling')
    US = UnderSampler(verbose=verbose)
    usx, usy = US.fit_transform(x, y)

    print('Tomek links')
    TL = TomekLinks(verbose=verbose)
    tlx, tly = TL.fit_transform(x, y)

    print('Clustering centroids')
    CC = ClusterCentroids(verbose=verbose)
    ccx, ccy = CC.fit_transform(x, y)

    print('NearMiss-1')
    NM1 = NearMiss(version=1, verbose=verbose)
    nm1x, nm1y = NM1.fit_transform(x, y)

    print('NearMiss-2')
    NM2 = NearMiss(version=2, verbose=verbose)
    nm2x, nm2y = NM2.fit_transform(x, y)

    print('NearMiss-3')
    NM3 = NearMiss(version=3, verbose=verbose)
    nm3x, nm3y = NM3.fit_transform(x, y)

    print('Neighboorhood Cleaning Rule')
    NCR = NeighbourhoodCleaningRule(verbose=verbose)
    ncrx, ncry = NCR.fit_transform(x, y)

    print('Random over-sampling')
    OS = OverSampler(verbose=verbose)
    ox, oy = OS.fit_transform(x, y)

    print('SMOTE Tomek links')
    STK = SMOTETomek(verbose=verbose)
    stkx, stky = STK.fit_transform(x, y)

    print('SMOTE ENN')
    SENN = SMOTEENN(verbose=verbose)
    sennx, senny = SENN.fit_transform(x, y)

    print('EasyEnsemble')
    EE = EasyEnsemble(verbose=verbose)
    eex, eey = EE.fit_transform(x, y)
Пример #5
0
def sampling():
    verbose = False
    y = np.bincount(target_train1)
    print y
    ratio = float(y[2]) / float(y[1])
    # 'Random over-sampling'
    OS = OverSampler(ratio=ratio, verbose=verbose)
    osx, osy = OS.fit_transform(data_train1, target_train1)
    random_methods(osx,osy)
    # 'SMOTE'
    smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular')
    smox, smoy = smote.fit_transform(data_train1, target_train1)
    random_methods(smox,smoy)
    # 'SMOTE bordeline 1'
    bsmote1 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline1')
    bs1x, bs1y = bsmote1.fit_transform(data_train, target_train)
    random_methods(bs1x,bs1y)
    # 'SMOTE bordeline 2'
    bsmote2 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline2')
    bs2x, bs2y = bsmote2.fit_transform(data_train1, target_train1)
    random_methods(bs2x,bs2y)
    # 'SMOTE SVM'
    svm_args={'class_weight' : 'auto'}
    svmsmote = SMOTE(ratio=ratio, verbose=verbose, kind='svm', **svm_args)
    svsx, svsy = svmsmote.fit_transform(data_train1, target_train1)
    random_methods(svsx,svsy)
    # 'SMOTE Tomek links'
    STK = SMOTETomek(ratio=ratio, verbose=verbose)
    stkx, stky = STK.fit_transform(data_train1, target_train1)
    random_methods(stkx,stky)
    # 'SMOTE ENN'
    SENN = SMOTEENN(ratio=ratio, verbose=verbose)
    ennx, enny = SENN.fit_transform(data_train1, target_train1)
    random_methods(ennx,enny)
    # 'EasyEnsemble'
    EE = EasyEnsemble(verbose=verbose)
    eex, eey = EE.fit_transform(data_train1, target_train1)
    random_methods(eex,eey)
    # 'BalanceCascade'
    BS = BalanceCascade(verbose=verbose)
    bsx, bsy = BS.fit_transform(data_train1, target_train1)
    random_methods(bsx,bsy)
Пример #6
0
def _parallel_build_trees(tree, forest, X, y):
    if forest.sampling is None:
        sampler = BootstrapSampler(random_state=tree.random_state)
    elif forest.sampling == 'up':
        sampler = OverSampler(random_state=tree.random_state, verbose=False)
    elif forest.sampling == 'down':
        sampler = UnderSampler(random_state=tree.random_state, verbose=False)

    X_sample, y_sample = sampler.fit_transform(X, y)
    tree.fit(X_sample, y_sample, check_input=False)
    return tree
Пример #7
0
def _sample_values(X, y, method=None, ratio=1, verbose=False):
    """Perform any kind of sampling(over and under).

    Parameters
    ----------
    X : array, shape = [n_samples, n_features]
        Data.
    y : array, shape = [n_samples]
        Target.
    method : str, optional default: None
        Over or under smapling method.
    ratio: float
        Unbalanced class ratio.

    Returns
    -------
    X, y : tuple
        Sampled X and y.
    """
    if method == 'SMOTE':
        sampler = SMOTE(ratio=ratio, verbose=verbose)

    elif method == 'SMOTEENN':
        ratio = ratio * 0.3
        sampler = SMOTEENN(ratio=ratio, verbose=verbose)

    elif method == 'random_over_sample':
        sampler = OverSampler(ratio=ratio, verbose=verbose)

    elif method == 'random_under_sample':
        sampler = UnderSampler(verbose=verbose)

    elif method == 'TomekLinks':
        sampler = TomekLinks(verbose=verbose)

    return sampler.fit_transform(X, y)
Пример #8
0
 def random_over_sampling(self):
     OS = OverSampler(ratio=self._ratio, verbose=self.verbose)
     osx, osy = OS.fit_transform(self.x, self.y)
     return osx, osy
Пример #9
0
def apply_sampling(X_data, Y_data, sampling, n_states, maxlen):
    ratio = float(np.count_nonzero(Y_data == 1)) / \
        float(np.count_nonzero(Y_data == 0))
    X_data = np.reshape(X_data, (len(X_data), n_states * maxlen))
    # 'Random over-sampling'
    if sampling == 'OverSampler':
        OS = OverSampler(ratio=ratio, verbose=True)
    # 'Random under-sampling'
    elif sampling == 'UnderSampler':
        OS = UnderSampler(verbose=True)
    # 'Tomek under-sampling'
    elif sampling == 'TomekLinks':
        OS = TomekLinks(verbose=True)
    # Oversampling
    elif sampling == 'SMOTE':
        OS = SMOTE(ratio=1, verbose=True, kind='regular')
    # Oversampling - Undersampling
    elif sampling == 'SMOTETomek':
        OS = SMOTETomek(ratio=ratio, verbose=True)
    # Undersampling
    elif sampling == 'OneSidedSelection':
        OS = OneSidedSelection(verbose=True)
    # Undersampling
    elif sampling == 'CondensedNearestNeighbour':
        OS = CondensedNearestNeighbour(verbose=True)
    # Undersampling
    elif sampling == 'NearMiss':
        OS = NearMiss(version=1, verbose=True)
    # Undersampling
    elif sampling == 'NeighbourhoodCleaningRule':
        OS = NeighbourhoodCleaningRule(verbose=True)
    # ERROR: WRONG SAMPLER, TERMINATE
    else:
        print('Wrong sampling variable you have set... Exiting...')
        sys.exit()
    # print('shape ' + str(X.shape))
    X_data, Y_data = OS.fit_transform(X_data, Y_data)
    return X_data, Y_data
Пример #10
0
del df['ID']
id = df_test['ID']
del df_test['ID']

from src.transfomations import remove_correlated
_, to_remove = remove_correlated(df, 0.99)

df_test.drop(to_remove, axis=1, inplace=True)

variance_threshold = VarianceThreshold(threshold=0.001)
df = variance_threshold.fit_transform(df)

ratio = float(sum(target)) / float(len(target) - sum(target))
print(ratio)

smote = OverSampler(ratio=ratio * 2, verbose=True)
print(type(target))
# df, target = smote.fit_transform(df.as_matrix(), target.as_matrix())

df_test = variance_threshold.fit(df_test)

gbc = GradientBoostingClassifier()
gbc.fit(df, target)

# best = randomizedSearch.best_estimator_
# print(randomizedSearch.best_params_)
scores = cross_validation.cross_val_score(gbc,
                                          df,
                                          target,
                                          cv=5,
                                          scoring='roc_auc',
Пример #11
0
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
axes[2, 1].set_title('One-sided selection', fontsize=fs)
# Neighboorhood cleaning rule
axes[2, 2].scatter(ncrx_vis[ncry==0, 0], ncrx_vis[ncry==0, 1], label="Class #0", alpha=0.5, 
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
axes[2, 2].scatter(ncrx_vis[ncry==1, 0], ncrx_vis[ncry==1, 1], label="Class #1", alpha=0.5, 
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
axes[2, 2].set_title('Neighboorhood cleaning rule', fontsize=fs)

plt.show()

# Generate the new dataset using under-sampling method
verbose = False
ratio = float(np.count_nonzero(y==1)) / float(np.count_nonzero(y==0))
# 'Random over-sampling'
OS = OverSampler(ratio=ratio, verbose=verbose)
osx, osy = OS.fit_transform(x, y)
# 'SMOTE'
smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular')
smox, smoy = smote.fit_transform(x, y)
# 'SMOTE bordeline 1'
bsmote1 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline1')
bs1x, bs1y = bsmote1.fit_transform(x, y)
# 'SMOTE bordeline 2'
bsmote2 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline2')
bs2x, bs2y = bsmote2.fit_transform(x, y)
# 'SMOTE SVM'
svm_args={'class_weight' : 'auto'}
svmsmote = SMOTE(ratio=ratio, verbose=verbose, kind='svm', **svm_args)
svsx, svsy = svmsmote.fit_transform(x, y)
# 'SMOTE Tomek links'
Пример #12
0
y = pd.read_csv(tain_path,
                header=None,
                index_col=False,
                names=colnames,
                skiprows=[0],
                usecols=[8])
y = y['violation'].values
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.333, random_state=0)
main_x = X.values
main_y = y

verbose = False

ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))
# 'Random over-sampling'
OS = OverSampler(ratio=ratio, verbose=verbose)
x, y = OS.fit_transform(main_x, main_y)

X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=.333,
                                                    random_state=0)

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score

clf = RandomForestClassifier(n_estimators=10)
scores = cross_val_score(clf, X_test, y_test)

y_pred = clf.fit(X_train, y_train).predict(X_test)
y_score = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1]