예제 #1
0
def under_sampling(df, title):
    """Undersample *df* with the Neighbourhood Cleaning Rule.

    Splits the frame into features/labels, resamples, and returns a new
    DataFrame with the resampled features and labels re-joined column-wise.
    Note: ``title`` is accepted for interface compatibility but unused here.
    """
    X, y = split_data(df)
    resampler = NeighbourhoodCleaningRule()
    X_res, y_res = resampler.fit_resample(X, y)
    resampled_parts = [
        pd.DataFrame(X_res, columns=X.columns),
        pd.DataFrame(y_res, columns=y.columns),
    ]
    return pd.concat(resampled_parts, axis=1)
def test_ncr_fit_resample_mode():
    """NCR with kind_sel='mode' keeps exactly the expected samples."""
    sampler = NeighbourhoodCleaningRule(kind_sel='mode')
    X_out, y_out = sampler.fit_resample(X, Y)

    expected_X = np.array([
        [0.34096173, 0.50947647],
        [-0.91735824, 0.93110278],
        [-0.20413357, 0.64628718],
        [0.35967591, 2.61186964],
        [0.90701028, -0.57636928],
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [1.39272351, -0.51631728],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ])
    expected_y = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def test_ncr_fit_resample_mode():
    """NCR with kind_sel='mode' keeps exactly the expected samples.

    NOTE(review): this redefines the function of the same name above and
    silently shadows it; one of the two copies should be removed.
    """
    sampler = NeighbourhoodCleaningRule(kind_sel='mode')
    resampled_X, resampled_y = sampler.fit_resample(X, Y)

    gt_rows = [
        [0.34096173, 0.50947647],
        [-0.91735824, 0.93110278],
        [-0.20413357, 0.64628718],
        [0.35967591, 2.61186964],
        [0.90701028, -0.57636928],
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [1.39272351, -0.51631728],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ]
    assert_array_equal(resampled_X, np.array(gt_rows))
    assert_array_equal(resampled_y, np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2]))
예제 #4
0
def ncrReSample():
    """Resample the image training split with NCR.

    Returns:
        (trainset, testset): feature matrices with the label column appended.
    """
    raw_train, raw_test = splitTrainTest(datapath)
    train_X, train_y = getFullImgFeature(raw_train)
    print('Original dataset shape %s' % Counter(train_y))

    sampler = NeighbourhoodCleaningRule()
    X_res, y_res = sampler.fit_resample(train_X, train_y)
    print('Resampled dataset shape %s' % Counter(y_res))
    # Append the labels as the last column of the training matrix.
    trainset = np.append(X_res, y_res, axis=1)

    # The test split is kept as-is (no resampling).
    test_X, test_y = getFullImgFeature(raw_test)
    testset = np.append(test_X, test_y, axis=1)

    return trainset, testset
def test_ncr_fit_resample_with_indices():
    """With return_indices=True, NCR also returns the kept sample indices."""
    sampler = NeighbourhoodCleaningRule(return_indices=True)
    X_out, y_out, kept_idx = sampler.fit_resample(X, Y)

    expected_X = np.array([
        [0.34096173, 0.50947647],
        [-0.91735824, 0.93110278],
        [-0.20413357, 0.64628718],
        [0.35967591, 2.61186964],
        [0.90701028, -0.57636928],
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [1.39272351, -0.51631728],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ])
    expected_y = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    expected_idx = np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14])
    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
    assert_array_equal(kept_idx, expected_idx)
def test_ncr_fit_resample_with_indices():
    """With return_indices=True, NCR also returns the kept sample indices.

    NOTE(review): duplicate of the identically named test above; it shadows
    the earlier definition and one copy should be dropped.
    """
    sampler = NeighbourhoodCleaningRule(return_indices=True)
    resampled_X, resampled_y, under_idx = sampler.fit_resample(X, Y)

    gt_rows = [
        [0.34096173, 0.50947647],
        [-0.91735824, 0.93110278],
        [-0.20413357, 0.64628718],
        [0.35967591, 2.61186964],
        [0.90701028, -0.57636928],
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [1.39272351, -0.51631728],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ]
    assert_array_equal(resampled_X, np.array(gt_rows))
    assert_array_equal(resampled_y, np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2]))
    assert_array_equal(under_idx, np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14]))
예제 #7
0
# Feature matrix: columns 1..639 of the dataset; target is the 'Class' column.
x = dataset.iloc[:, 1:640]
y = dataset['Class']

# Rank features with a random forest and keep only the important ones.
clf = RandomForestClassifier(n_estimators=300, max_depth=9,
                             random_state=0)
clf.fit(x, y)
model = SelectFromModel(clf, prefit=True)
x = model.transform(x)

# Undersample the majority class with the Neighbourhood Cleaning Rule.
ncr = NeighbourhoodCleaningRule()
x_resampled, y_resampled = ncr.fit_resample(x, y)

# Optimization algorithm
opt = keras.optimizers.RMSprop(lr=0.00014, rho=0.9, epsilon=None, decay=0.0)

# Multi Layer Perceptron model.
# NOTE(review): input_dim=639 is the width BEFORE SelectFromModel reduced x;
# confirm which matrix is fed to fit() or the first layer will reject it.
model = Sequential()
model.add(Dense(128, input_dim=639, activation='relu'))
# BUG FIX: AlphaDropout layers were instantiated as bare expressions and
# discarded, so no dropout was ever applied; they must be added to the model.
model.add(keras.layers.AlphaDropout(0.3, noise_shape=None, seed=None))

model.add(Dense(128, activation='relu'))
model.add(keras.layers.AlphaDropout(0.3, noise_shape=None, seed=None))

model.add(Dense(128, activation='relu'))
model.add(keras.layers.AlphaDropout(0.3, noise_shape=None, seed=None))
def test_deprecation_random_state():
    """Passing random_state must emit the 0.4 deprecation warning."""
    sampler = NeighbourhoodCleaningRule(random_state=0)
    expected_msg = "'random_state' is deprecated from 0.4"
    with warns(DeprecationWarning, match=expected_msg):
        sampler.fit_resample(X, Y)
    arr = arr[:, mask]

    input_arr = np.swapaxes(arr, 0, 1)

    X_train, X_test, y_train, y_test = train_test_split(
        input_arr, ground_truth, test_size=0.5, random_state=rng, shuffle=True
    )
    X_remove, X_test, y_remove, y_test = train_test_split(
        X_test, y_test, test_size=0.00402228873, random_state=rng, shuffle=True
    )
    X_remove, y_remove = np.nan, np.nan

    rus = NeighbourhoodCleaningRule(
        n_jobs=7, n_neighbors=8, threshold_cleaning=0.2, sampling_strategy="all"
    )
    X_train, y_train = rus.fit_resample(X_train, y_train)

    dict = get_class_count_value(y_train, 1470588)
    print(year, " pre resample ", dict)

    rus = RandomUnderSampler(random_state=rng, sampling_strategy=dict)
    X_train, y_train = rus.fit_resample(X_train, y_train)

    X_train_list.append(X_train)
    X_test_list.append(X_test)
    y_train_list.append(y_train)
    y_test_list.append(y_test)

print("finished years for loop")

# Seed the aggregate with the first collected training matrix; presumably the
# remaining entries of X_train_list are merged below — TODO confirm.
X_train_array = X_train_list[0]
예제 #10
0
def undersample(args):
    """Undersample the elliptic dataset with the Neighbourhood Cleaning Rule.

    Depending on ``args.stratify_timestep`` the cleaning is applied either to
    the whole training split at once, or independently per "ts" (timestep)
    group. The kept training samples plus the untouched test split are then
    written to ``args.output_file`` as a headerless CSV.

    Raises:
        NotImplementedError: for any ``args.data`` other than "elliptic".
    """
    ncr = NeighbourhoodCleaningRule(n_neighbors=3, threshold_cleaning=0.5)
    if args.data == "elliptic":
        # NOTE(review): argparse.Namespace normally takes keyword arguments;
        # a single positional argument here looks suspicious — confirm what
        # Namespace refers to in this module.
        elliptic_args = Namespace(args.elliptic_args)

        elliptic_data = cdr.get_data(
            source="elliptic",
            config_file=args.data_config_file,
            encode_classes=elliptic_args.encode_classes)

        # Split with the AF_NE feature set, keeping metadata columns.
        dataset = elliptic_data.train_test_split(
            train_size=elliptic_args.train_size,
            feat_set="AF_NE",
            inc_meta=True,
            inc_unknown=elliptic_args.inc_unknown)

        train_X = dataset.train_X
        train_y = dataset.train_y
        test_X = dataset.test_X

        counter = Counter(train_y)
        print("Train set counter [Label]: {}".format(counter))

        if args.stratify_timestep == False:
            # Resample the whole training split in one pass; only the
            # resampled labels are needed, the kept rows come from indices.
            _, y = ncr.fit_resample(train_X[elliptic_data.feature_cols_AF_NE_],
                                    train_y)
            counter = Counter(y)
            print("Train set counter after NCL [Label]: {}".format(counter))

            # Recover the surviving rows (with metadata) from the original
            # frame via the sampler's kept indices.
            indices = ncr.sample_indices_
            samples_kept = train_X.iloc[indices]
            # NOTE(review): DataFrame.append is deprecated in modern pandas;
            # pd.concat is the replacement.
            undersampled_set = samples_kept.append(test_X, ignore_index=True)
            undersampled_set.drop(elliptic_data.feature_cols_NE_,
                                  inplace=True,
                                  axis=1)
            undersampled_set.to_csv(args.output_file,
                                    index=False,
                                    header=False)
        # stratify on time version
        else:
            # Attach labels so the groupby carries them along per timestep.
            tmp_data = train_X.copy()
            tmp_data["label"] = train_y.copy()
            ts_data = tmp_data.groupby("ts")

            removed = 0
            total_pre = tmp_data.shape[0]
            undersampled_set = pd.DataFrame()
            for ts, group in ts_data:

                # Last column is the label we just appended.
                grouped_X = group.iloc[:, :-1]
                ts_X = grouped_X[elliptic_data.feature_cols_AF_NE_]
                ts_y = group["label"]
                counter = Counter(ts_y)
                print("Train set (ts:{}) counter Label: {}".format(
                    ts, counter))

                # Clean this timestep independently of the others.
                X, y = ncr.fit_resample(ts_X, ts_y)
                indices = ncr.sample_indices_

                counter = Counter(y)
                print("Train set (ts:{}) counter after NCR Label: {}".format(
                    ts, counter))

                total_removed = ts_X.shape[0] - X.shape[0]
                print("Total removed (ts:{}): {}".format(ts, total_removed))
                removed += total_removed

                # Indices are local to this group's positional order.
                samples_kept = grouped_X.iloc[indices]
                print("Total samples kept (ts:{}): {}".format(
                    ts, samples_kept.shape[0]))

                undersampled_set = undersampled_set.append(samples_kept,
                                                           ignore_index=True)

            print("-------------------------------------")
            print("Total samples removed: {} from {}".format(
                removed, total_pre))
            # Re-attach the untouched test split before export.
            undersampled_set = undersampled_set.append(test_X,
                                                       ignore_index=True)
            undersampled_set.drop(elliptic_data.feature_cols_NE_,
                                  inplace=True,
                                  axis=1)
            undersampled_set.to_csv(args.output_file,
                                    index=False,
                                    header=False)

    else:
        raise NotImplementedError("'{}' dataset not yet implemented".format(
            args.data))
def test_ncr_wrong_nn_obj():
    """An invalid n_neighbors object must raise a ValueError."""
    bad_nn = 'rnd'
    sampler = NeighbourhoodCleaningRule(return_indices=True, n_neighbors=bad_nn)
    with raises(ValueError, match="has to be one of"):
        sampler.fit_resample(X, Y)
def test_ncr_error(ncr_params, err_msg):
    """Invalid constructor params raise ValueError matching *err_msg*."""
    sampler = NeighbourhoodCleaningRule(**ncr_params)
    with pytest.raises(ValueError, match=err_msg):
        sampler.fit_resample(X, Y)
예제 #13
0
# or try cost-sensitive down-weighting

# CondensedNearestNeighbour technique for undersampling
from imblearn.under_sampling import CondensedNearestNeighbour
cnn = CondensedNearestNeighbour(n_neighbors=5, n_seeds_S=55)
cnn_X, cnn_y = cnn.fit_resample(X, y)
plotData2D(cnn_X, cnn_y)

# CondensedNearestNeighbour removes too many instances; we will not use it

# NeighbourhoodCleaningRule technique for undersampling
from imblearn.under_sampling import NeighbourhoodCleaningRule
ncr = NeighbourhoodCleaningRule(sampling_strategy='majority',
                                n_neighbors=5,
                                kind_sel='mode')
ncr_X, ncr_y = ncr.fit_resample(X, y)
plotData2D(ncr_X, ncr_y)

# NeighbourhoodCleaningRule also doesn't work for this dataset because it removes many data points
# in just one region and that will mess up our decision boundary and make our predictions worse.

# to clarify more, all UNN methods won't work perfectly for this dataset because most of data points of
# the majority class are condensed in one region while this region is where data points should be removed
# from, but UNN methods would remove data points from weaker regions until it clears them out before even
# starting to remove from the strong region, and this would change the distribution of data points of
# the majority class, and that's a thing we don't want to happen

# this leaves us to use random undersampling

# RandomUnderSampler technique for undersampling
from imblearn.under_sampling import RandomUnderSampler
예제 #14
0
    img_set = torch.stack(img_set, 0)
    return img_set, dataset[:, -1]


def ncrReSample():
    """NCR-resample the image training split; return (trainset, testset).

    NOTE(review): duplicate of an identically named function earlier in this
    file; the later definition wins at import time.
    """
    raw_train, raw_test = splitTrainTest(datapath)
    features, labels = getFullImgFeature(raw_train)
    print('Original dataset shape %s' % Counter(labels))

    sampler = NeighbourhoodCleaningRule()
    X_res, y_res = sampler.fit_resample(features, labels)
    print('Resampled dataset shape %s' % Counter(y_res))
    # Labels are appended as the final column of each output matrix.
    trainset = np.append(X_res, y_res, axis=1)

    test_features, test_labels = getFullImgFeature(raw_test)
    testset = np.append(test_features, test_labels, axis=1)

    return trainset, testset


if __name__ == "__main__":
    # Paths/constants for the memotion dataset run.
    imgpath = '../data/memotion_analysis_training_data/data_7000/'
    datapath = '../data/data_7000_new.csv'
    batchsize = 4

    raw_train, raw_test = splitTrainTest(datapath)
    img_data, y = getFullImgFeature(raw_train)
    print('Original dataset shape %s' % Counter(y))
    ncr = NeighbourhoodCleaningRule()
    # reshape(-1, 1): fit_resample requires a 2-D feature matrix, so the
    # flat feature vector is treated as a single-column matrix here.
    X_res, y_res = ncr.fit_resample(img_data.reshape(-1, 1), y)
    print('Resampled dataset shape %s' % Counter(y_res))
    print(X_res)
def test_deprecation_random_state():
    """random_state on NCR must trigger the 0.4 deprecation warning.

    NOTE(review): duplicate of an earlier test with the same name.
    """
    sampler = NeighbourhoodCleaningRule(random_state=0)
    with warns(DeprecationWarning,
               match="'random_state' is deprecated from 0.4"):
        sampler.fit_resample(X, Y)
def neighbourhood_cleaning(x, y):
    """Resample (x, y) with the Neighbourhood Cleaning Rule and return it."""
    print("----Neighbourhood Cleaning Rule----")
    cleaned_X, cleaned_y = NeighbourhoodCleaningRule().fit_resample(x, y)
    return cleaned_X, cleaned_y
예제 #17
0
def all_imblearn(xx, yy):
    """Run a battery of imblearn resampling techniques on (xx, yy).

    Returns a list of ``[X_resampled, y_resampled, technique_name]`` triples
    covering over-sampling, under-sampling, and combined methods.
    """
    results = []

    def run(make_sampler, label):
        # The sampler is built lazily inside the loop body so construction
        # and fit_resample happen in the same interleaved order as a
        # straight-line script would produce.
        X_res, y_res = make_sampler().fit_resample(xx, yy)
        results.append([X_res, y_res, label])

    # --- over sampling ---
    run(lambda: RandomOverSampler(random_state=0), 'random over sampler')
    run(lambda: SMOTE(), 'smote')
    run(lambda: SMOTE(kind='borderline1'), 'smote borderline1')
    run(lambda: SMOTE(kind='borderline2'), 'smote borderline2')
    run(lambda: SMOTE(kind='svm'), 'smote svm')
    run(lambda: SMOTENC(categorical_features=[0, 2], random_state=0), 'smotenc')

    # ADASYN intentionally omitted (it was commented out in the original).

    # --- under sampling ---
    run(lambda: ClusterCentroids(random_state=0), 'cluster centroids')
    run(lambda: RandomUnderSampler(random_state=0), 'random under sampler')
    run(lambda: NearMiss(version=1), 'near miss 1')
    run(lambda: NearMiss(version=2), 'near miss 2')
    run(lambda: NearMiss(version=3), 'near miss 3')
    run(lambda: EditedNearestNeighbours(), 'edited nearest neighbours')
    run(lambda: RepeatedEditedNearestNeighbours(),
        'repeated edited nearest neighbours')
    run(lambda: AllKNN(), 'allKNN')
    run(lambda: CondensedNearestNeighbour(random_state=0),
        'Condensed Nearest Neighbour')
    run(lambda: OneSidedSelection(random_state=0), 'One Sided Selection')
    run(lambda: NeighbourhoodCleaningRule(), 'Neighbourhood Cleaning Rule')

    # --- combined over and under sampling ---
    run(lambda: SMOTEENN(random_state=0), 'SMOTEENN')
    run(lambda: SMOTETomek(random_state=0), 'SMOTETomek')

    return results
    
    
    
    
    
    
    
예제 #18
0
# undersample and plot imbalanced dataset with the neighborhood cleaning rule
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import NeighbourhoodCleaningRule
from plotDataset import plot_dataset

if __name__ == '__main__':
    # Build a heavily imbalanced 2-feature dataset (99% majority class).
    X, y = make_classification(n_samples=10000,
                               n_features=2,
                               n_redundant=0,
                               n_clusters_per_class=1,
                               weights=[0.99],
                               flip_y=0,
                               random_state=1)
    counter = Counter(y)
    print(counter)
    # Visualise the class distribution before cleaning.
    plot_dataset(X, y, counter)

    undersample = NeighbourhoodCleaningRule(n_neighbors=3,
                                            threshold_cleaning=0.5)
    X, y = undersample.fit_resample(X, y)

    # Visualise again after undersampling for comparison.
    counter = Counter(y)
    print(counter)
    plot_dataset(X, y, counter)
예제 #19
0

# Recode the answer columns with the mapping function (columns 0-1 are ids).
df.iloc[:, 2:] = df.iloc[:, 2:].applymap(mapping)

#df[['QFANSHIPr1','QSHOW_ELEMENTS_r13', 'QSHOW_ELEMENTS_r14']].groupby(['QFANSHIPr1']).agg(['mean', 'count'])
''' model '''
# get mtx
X = df.iloc[:, 2:].values
y = df.iloc[:, 1].values
''' ALLKNN '''
from collections import Counter
from imblearn.under_sampling import AllKNN, NeighbourhoodCleaningRule
# define undersampling strategy
# NOTE(review): despite the variable name, this is NeighbourhoodCleaningRule,
# not AllKNN — confirm which sampler was intended.
under_allknn = NeighbourhoodCleaningRule()
# fit and apply the transform
X, y = under_allknn.fit_resample(X, y)
# summarize class distribution
print(Counter(y))
''' 5: Decision Tree'''
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
clf_dt = DecisionTreeClassifier(random_state=1337)
clf_dt.fit(X, y)

# Cross-validated F1 on the resampled data.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1337)
acc = cross_val_score(estimator=clf_dt, X=X, y=y, cv=cv, scoring='f1')
acc.mean(), acc.std()
parameters = {
    'criterion': ['gini', 'entropy'],
print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply neighbourhood cleaning rule
ncl = NeighbourhoodCleaningRule(return_indices=True)
X_resampled, y_resampled, idx_resampled = ncl.fit_resample(X, y)
# Project the resampled points with the SAME fitted PCA for comparison.
X_res_vis = pca.transform(X_resampled)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Samples NOT in idx_resampled are the ones NCR removed.
idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                   idx_resampled)

# Scatter the two kept classes plus the removed samples.
idx_class_0 = y_resampled == 0
plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1],
            alpha=.8, label='Class #0')
plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1],
            alpha=.8, label='Class #1')
plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1],
            alpha=.8, label='Removed samples')
예제 #21
0
def resampling_assigner(imb_technique, AA_ova_X_train, AA_ova_y_train,
                        AI_ova_X_train, AI_ova_y_train, AW_ova_X_train,
                        AW_ova_y_train, CC_ova_X_train, CC_ova_y_train,
                        QA_ova_X_train, QA_ova_y_train):
    print(imb_technique)
    if imb_technique == "ADASYN":
        AA_ada, AI_ada, AW_ada, CC_ada, QA_ada = ADASYN(), ADASYN(), ADASYN(
        ), ADASYN(), ADASYN()
        AA_X_res, AA_y_res = AA_ada.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_ada.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_ada.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_ada.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_ada.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "ALLKNN":
        AA_allknn, AI_allknn, AW_allknn, CC_allknn, QA_allknn = AllKNN(
        ), AllKNN(), AllKNN(), AllKNN(), AllKNN()
        AA_X_res, AA_y_res = AA_allknn.fit_resample(AA_ova_X_train,
                                                    AA_ova_y_train)
        AI_X_res, AI_y_res = AI_allknn.fit_resample(AI_ova_X_train,
                                                    AI_ova_y_train)
        AW_X_res, AW_y_res = AW_allknn.fit_resample(AW_ova_X_train,
                                                    AW_ova_y_train)
        CC_X_res, CC_y_res = CC_allknn.fit_resample(CC_ova_X_train,
                                                    CC_ova_y_train)
        QA_X_res, QA_y_res = QA_allknn.fit_resample(QA_ova_X_train,
                                                    QA_ova_y_train)
    elif imb_technique == "CNN":
        AA_cnn, AI_cnn, AW_cnn, CC_cnn, QA_cnn = CondensedNearestNeighbour(
        ), CondensedNearestNeighbour(), CondensedNearestNeighbour(
        ), CondensedNearestNeighbour(), CondensedNearestNeighbour()
        AA_X_res, AA_y_res = AA_cnn.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_cnn.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_cnn.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_cnn.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_cnn.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "ENN":
        AA_enn, AI_enn, AW_enn, CC_enn, QA_enn = EditedNearestNeighbours(
        ), EditedNearestNeighbours(), EditedNearestNeighbours(
        ), EditedNearestNeighbours(), EditedNearestNeighbours()
        AA_X_res, AA_y_res = AA_enn.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_enn.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_enn.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_enn.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_enn.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "IHT":
        AA_iht, AI_iht, AW_iht, CC_iht, QA_iht = InstanceHardnessThreshold(
        ), InstanceHardnessThreshold(), InstanceHardnessThreshold(
        ), InstanceHardnessThreshold(), InstanceHardnessThreshold()
        AA_X_res, AA_y_res = AA_iht.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_iht.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_iht.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_iht.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_iht.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "NCR":
        AA_ncr, AI_ncr, AW_ncr, CC_ncr, QA_ncr = NeighbourhoodCleaningRule(
        ), NeighbourhoodCleaningRule(), NeighbourhoodCleaningRule(
        ), NeighbourhoodCleaningRule(), NeighbourhoodCleaningRule()
        AA_ova_y_train = [
            0 if i == "Accepted/Assigned" else 1 for i in AA_ova_y_train
        ]
        AA_X_res, AA_y_res = AA_ncr.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_ova_y_train = [
            0 if i == "Accepted/In Progress" else 1 for i in AI_ova_y_train
        ]
        AI_X_res, AI_y_res = AI_ncr.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_ova_y_train = [
            0 if i == "Accepted/Wait" else 1 for i in AW_ova_y_train
        ]
        AW_X_res, AW_y_res = AW_ncr.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_ova_y_train = [
            0 if i == "Completed/Closed" else 1 for i in CC_ova_y_train
        ]
        CC_X_res, CC_y_res = CC_ncr.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_ova_y_train = [
            0 if i == "Queued/Awaiting Assignment" else 1
            for i in QA_ova_y_train
        ]
        QA_X_res, QA_y_res = QA_ncr.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "NM":
        AA_nm, AI_nm, AW_nm, CC_nm, QA_nm = NearMiss(), NearMiss(), NearMiss(
        ), NearMiss(), NearMiss()
        AA_X_res, AA_y_res = AA_nm.fit_resample(AA_ova_X_train, AA_ova_y_train)
        AI_X_res, AI_y_res = AI_nm.fit_resample(AI_ova_X_train, AI_ova_y_train)
        AW_X_res, AW_y_res = AW_nm.fit_resample(AW_ova_X_train, AW_ova_y_train)
        CC_X_res, CC_y_res = CC_nm.fit_resample(CC_ova_X_train, CC_ova_y_train)
        QA_X_res, QA_y_res = QA_nm.fit_resample(QA_ova_X_train, QA_ova_y_train)
    elif imb_technique == "OSS":
        AA_oss, AI_oss, AW_oss, CC_oss, QA_oss = OneSidedSelection(
        ), OneSidedSelection(), OneSidedSelection(), OneSidedSelection(
        ), OneSidedSelection()
        AA_X_res, AA_y_res = AA_oss.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_oss.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_oss.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_oss.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_oss.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "RENN":
        AA_renn, AI_renn, AW_renn, CC_renn, QA_renn = RepeatedEditedNearestNeighbours(
        ), RepeatedEditedNearestNeighbours(), RepeatedEditedNearestNeighbours(
        ), RepeatedEditedNearestNeighbours(), RepeatedEditedNearestNeighbours(
        )
        AA_X_res, AA_y_res = AA_renn.fit_resample(AA_ova_X_train,
                                                  AA_ova_y_train)
        AI_X_res, AI_y_res = AI_renn.fit_resample(AI_ova_X_train,
                                                  AI_ova_y_train)
        AW_X_res, AW_y_res = AW_renn.fit_resample(AW_ova_X_train,
                                                  AW_ova_y_train)
        CC_X_res, CC_y_res = CC_renn.fit_resample(CC_ova_X_train,
                                                  CC_ova_y_train)
        QA_X_res, QA_y_res = QA_renn.fit_resample(QA_ova_X_train,
                                                  QA_ova_y_train)
    elif imb_technique == "SMOTE":
        AA_sm, AI_sm, AW_sm, CC_sm, QA_sm = SMOTE(), SMOTE(), SMOTE(), SMOTE(
        ), SMOTE()
        AA_X_res, AA_y_res = AA_sm.fit_resample(AA_ova_X_train, AA_ova_y_train)
        AI_X_res, AI_y_res = AI_sm.fit_resample(AI_ova_X_train, AI_ova_y_train)
        AW_X_res, AW_y_res = AW_sm.fit_resample(AW_ova_X_train, AW_ova_y_train)
        CC_X_res, CC_y_res = CC_sm.fit_resample(CC_ova_X_train, CC_ova_y_train)
        QA_X_res, QA_y_res = QA_sm.fit_resample(QA_ova_X_train, QA_ova_y_train)
    elif imb_technique == "BSMOTE":
        AA_bsm, AI_bsm, AW_bsm, CC_bsm, QA_bsm = BorderlineSMOTE(
        ), BorderlineSMOTE(), BorderlineSMOTE(), BorderlineSMOTE(
        ), BorderlineSMOTE()
        AA_X_res, AA_y_res = AA_bsm.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_bsm.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_bsm.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_bsm.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_bsm.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "SMOTEENN":
        AA_smenn, AI_smenn, AW_smenn, CC_smenn, QA_smenn = SMOTEENN(
        ), SMOTEENN(), SMOTEENN(), SMOTEENN(), SMOTEENN()
        AA_X_res, AA_y_res = AA_smenn.fit_resample(AA_ova_X_train,
                                                   AA_ova_y_train)
        AI_X_res, AI_y_res = AI_smenn.fit_resample(AI_ova_X_train,
                                                   AI_ova_y_train)
        AW_X_res, AW_y_res = AW_smenn.fit_resample(AW_ova_X_train,
                                                   AW_ova_y_train)
        CC_X_res, CC_y_res = CC_smenn.fit_resample(CC_ova_X_train,
                                                   CC_ova_y_train)
        QA_X_res, QA_y_res = QA_smenn.fit_resample(QA_ova_X_train,
                                                   QA_ova_y_train)
    elif imb_technique == "SMOTETOMEK":
        AA_smtm, AI_smtm, AW_smtm, CC_smtm, QA_smtm = SMOTETomek(), SMOTETomek(
        ), SMOTETomek(), SMOTETomek(), SMOTETomek()
        AA_X_res, AA_y_res = AA_smtm.fit_resample(AA_ova_X_train,
                                                  AA_ova_y_train)
        AI_X_res, AI_y_res = AI_smtm.fit_resample(AI_ova_X_train,
                                                  AI_ova_y_train)
        AW_X_res, AW_y_res = AW_smtm.fit_resample(AW_ova_X_train,
                                                  AW_ova_y_train)
        CC_X_res, CC_y_res = CC_smtm.fit_resample(CC_ova_X_train,
                                                  CC_ova_y_train)
        QA_X_res, QA_y_res = QA_smtm.fit_resample(QA_ova_X_train,
                                                  QA_ova_y_train)
    elif imb_technique == "TOMEK":
        AA_tm, AI_tm, AW_tm, CC_tm, QA_tm = TomekLinks(), TomekLinks(
        ), TomekLinks(), TomekLinks(), TomekLinks()
        AA_X_res, AA_y_res = AA_tm.fit_resample(AA_ova_X_train, AA_ova_y_train)
        AI_X_res, AI_y_res = AI_tm.fit_resample(AI_ova_X_train, AI_ova_y_train)
        AW_X_res, AW_y_res = AW_tm.fit_resample(AW_ova_X_train, AW_ova_y_train)
        CC_X_res, CC_y_res = CC_tm.fit_resample(CC_ova_X_train, CC_ova_y_train)
        QA_X_res, QA_y_res = QA_tm.fit_resample(QA_ova_X_train, QA_ova_y_train)
    elif imb_technique == "ROS":
        AA_ros, AI_ros, AW_ros, CC_ros, QA_ros = RandomOverSampler(
        ), RandomOverSampler(), RandomOverSampler(), RandomOverSampler(
        ), RandomOverSampler()
        AA_X_res, AA_y_res = AA_ros.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_ros.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_ros.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_ros.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_ros.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "RUS":
        AA_rus, AI_rus, AW_rus, CC_rus, QA_rus = RandomUnderSampler(
        ), RandomUnderSampler(), RandomUnderSampler(), RandomUnderSampler(
        ), RandomUnderSampler()
        AA_X_res, AA_y_res = AA_rus.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_rus.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_rus.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_rus.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_rus.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    return AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res, CC_X_res, CC_y_res, QA_X_res, QA_y_res
예제 #22
0
# Undersample with One-Sided Selection (Tomek Links + Condensed Nearest Neighbor)
print("Undersampling...")
# n_seeds_S is the number of majority class to be added to set C, which is then used as a reference for a kNN on the remaining majority samples not in set C
# NOTE(review): `counter[1]` assumes `counter` already holds the class counts of
# y_train_full and that label 1 is the minority class — confirm upstream.
undersample_oss = OneSidedSelection(n_neighbors=1,
                                    n_seeds_S=counter[1],
                                    n_jobs=-1,
                                    random_state=seed)
# First cleaning pass: OSS removes redundant/borderline majority samples.
X_train_full_fs, y_train_full = undersample_oss.fit_resample(
    X_train_full_fs, y_train_full)
counter = Counter(y_train_full)
print("After OSS undersampling, the class distribution is:")
print(counter)
# Second cleaning pass: NCR drops majority samples that a 3-NN vote
# misclassifies; threshold_cleaning restricts which classes are considered
# for cleaning (see imbalanced-learn docs for the exact ratio semantics).
undersample_ncr = NeighbourhoodCleaningRule(n_neighbors=3,
                                            threshold_cleaning=0.5,
                                            n_jobs=-1)
X_train_full_fs, y_train_full = undersample_ncr.fit_resample(
    X_train_full_fs, y_train_full)
counter = Counter(y_train_full)
print("After NCR undersampling, the class distribution is:")
print(counter)

# Saving to Local
# NOTE(review): .to_csv implies these are pandas objects; imbalanced-learn
# returns pandas output only when given pandas input — verify upstream types.
print("Saving to Local in csv...")
X_train_full_fs.to_csv("./data/X_train.csv", index=False)
X_validation_full_fs.to_csv("./data/X_validation.csv", index=False)
y_train_full.to_csv("./data/Y_train.csv", index=False)
y_validation_full.to_csv("./data/Y_validation.csv", index=False)

# Read from Local
# Round-trips the resampled training data through disk; validation files are
# re-read here as well, but the y_* files are not (presumably read later).
print("Reading from local...")
X_train_full_fs = pd.read_csv("./data/X_train.csv")
X_validation_full_fs = pd.read_csv("./data/X_validation.csv")
def test_ncr_error(ncr_params, err_msg):
    """Invalid constructor parameters must surface as ValueError on fit_resample."""
    sampler = NeighbourhoodCleaningRule(**ncr_params)
    with pytest.raises(ValueError, match=err_msg):
        sampler.fit_resample(X, Y)
예제 #24
0
#Splitting data into train data and labels
# Features: every column except the target and the row identifier.
training_df_fl = training_df_final[[
    col for col in training_df_final if col not in ['outcome', 'bidder_id']
]]
# Label: only the 'outcome' column (kept as a one-column DataFrame here).
training_df_flabel = training_df_final[[
    col for col in training_df_final if col in ['outcome']
]]

# In[ ]:

#Applying neighborhood cleaning rule and preparing 1st phase model data
# NOTE(review): `ratio` and `random_state` were deprecated and later removed
# for NeighbourhoodCleaningRule in newer imbalanced-learn releases (`ratio`
# became `sampling_strategy`; NCR is deterministic so `random_state` was
# dropped) — confirm the pinned imblearn version before upgrading.
ncr = NeighbourhoodCleaningRule(n_neighbors=15,
                                random_state=32,
                                ratio={0: 0.5})
# reshape(1, -1)[0] flattens the (n, 1) label array to 1-D (same as .ravel()).
training_df_X, training_df_y = ncr.fit_resample(
    training_df_fl,
    training_df_flabel.values.reshape(1, -1)[0])
#Creating class for containing different model executions
class ClassifierContainer:
    """Bundle a model with its training data and evaluation metric.

    Parameters
    ----------
    model : estimator
        The classifier to be trained/evaluated.
    training_X, training_y : array-like
        Training features and labels.
    measuring_parameter : str, default='auc'
        Name of the metric used to score the model.
    """

    def __init__(self,
                 model,
                 training_X,
                 training_y,
                 measuring_parameter='auc'):
        self.model = model
        self.training_X = training_X
        self.training_y = training_y
        # Fix: this argument was previously accepted but silently discarded,
        # so the caller's metric choice was lost.
        self.measuring_parameter = measuring_parameter
예제 #25
0
import numpy as np

from common.import_data import ImportData
from collections import Counter
from imblearn.under_sampling import NeighbourhoodCleaningRule

if __name__ == "__main__":
    # Load the feature matrix and the 'Class' target column, then clean the
    # dataset with Neighbourhood Cleaning Rule undersampling.
    importer = ImportData()
    features: np.ndarray = importer.import_all_data()
    labels: np.ndarray = importer.import_columns(np.array(['Class'])).ravel()
    print('Original dataset shape %s' % Counter(labels))
    cleaner = NeighbourhoodCleaningRule()
    features_res, labels_res = cleaner.fit_resample(features, labels)
    print('Reduced dataset shape %s' % Counter(labels_res))
예제 #26
0
    csv_X = pd.DataFrame(data=X_resampled)
    csv_y = pd.DataFrame(data=y_resampled)
    csv_X.to_csv('ros_feature.csv', header=False, index=False)
    csv_y.to_csv('ros_label.csv', header=False, index=False)

# NOTE(review): every branch below writes to the same 'ros_feature.csv' /
# 'ros_label.csv' pair, so the 'ros' prefix is misleading for the non-ROS
# methods and successive runs overwrite each other's output — consider
# per-method filenames.
if (option == "3"):
    #ADASYN method
    # Oversampling: synthesize minority-class samples (density-adaptive
    # SMOTE variant).
    X_resampled, y_resampled = ADASYN().fit_resample(X, y)
    csv_X = pd.DataFrame(data=X_resampled)
    csv_y = pd.DataFrame(data=y_resampled)
    csv_X.to_csv('ros_feature.csv', header=False, index=False)
    csv_y.to_csv('ros_label.csv', header=False, index=False)

if (option == "4"):
    #Random under sampling method
    # Undersampling: randomly drop majority samples; seeded for reproducibility.
    rus = RandomUnderSampler(random_state=0)
    X_resampled, y_resampled = rus.fit_resample(X, y)
    csv_X = pd.DataFrame(data=X_resampled)
    csv_y = pd.DataFrame(data=y_resampled)
    csv_X.to_csv('ros_feature.csv', header=False, index=False)
    csv_y.to_csv('ros_label.csv', header=False, index=False)

if (option == "5"):
    #Neighbourhood cleaning rule method
    # Undersampling: NCR removes noisy/ambiguous majority samples via
    # kNN-based cleaning.
    ncr = NeighbourhoodCleaningRule()
    X_resampled, y_resampled = ncr.fit_resample(X, y)
    csv_X = pd.DataFrame(data=X_resampled)
    csv_y = pd.DataFrame(data=y_resampled)
    csv_X.to_csv('ros_feature.csv', header=False, index=False)
    csv_y.to_csv('ros_label.csv', header=False, index=False)