def test_oss_with_object():
    """OneSidedSelection should accept either an estimator instance or a
    plain int for ``n_neighbors`` and produce identical resampling output."""
    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [0.20246714, -0.34727125],
    ])
    expected_y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    # Both spellings of n_neighbors must give the same result.
    for neighbors in (KNeighborsClassifier(n_neighbors=1), 1):
        sampler = OneSidedSelection(random_state=RND_SEED,
                                    n_neighbors=neighbors)
        X_res, y_res = sampler.fit_resample(X, Y)
        assert_array_equal(X_res, expected_X)
        assert_array_equal(y_res, expected_y)
def test_oss_with_object():
    """OneSidedSelection must give identical output whether ``n_neighbors``
    is supplied as an estimator object or as the equivalent integer."""
    X_gt = np.array([
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [0.20246714, -0.34727125],
    ])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

    # First with a KNN estimator instance ...
    estimator = KNeighborsClassifier(n_neighbors=1)
    sampler = OneSidedSelection(random_state=RND_SEED, n_neighbors=estimator)
    X_out, y_out = sampler.fit_resample(X, Y)
    assert_array_equal(X_out, X_gt)
    assert_array_equal(y_out, y_gt)

    # ... then with the equivalent plain integer.
    sampler = OneSidedSelection(random_state=RND_SEED, n_neighbors=1)
    X_out, y_out = sampler.fit_resample(X, Y)
    assert_array_equal(X_out, X_gt)
    assert_array_equal(y_out, y_gt)
def one_sided_selection(X, y, visualize=False, pca2d=True, pca3d=True,
                        tsne=True, pie_evr=True):
    """Undersample the majority class with One-Sided Selection.

    Parameters
    ----------
    X, y
        Feature matrix and labels to resample.
    visualize : bool
        When true, plot a class histogram and PCA projections of the result.
    pca2d, pca3d, pie_evr : bool
        Forwarded to ``pca_general`` when visualizing.
    tsne : bool
        Unused by this function; kept only so existing callers that pass it
        keep working.

    Returns
    -------
    X_res, y_res
        The resampled feature matrix and labels.
    """
    oss = OneSidedSelection(random_state=42)
    X_res, y_res = oss.fit_resample(X, y)
    # Idiomatic truthiness check instead of ``== True``.
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
def test_oss_fit_resample_with_indices():
    """With ``return_indices=True`` fit_resample also yields the indices of
    the samples kept from the original arrays."""
    sampler = OneSidedSelection(return_indices=True, random_state=RND_SEED)
    X_res, y_res, kept_idx = sampler.fit_resample(X, Y)
    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [0.20246714, -0.34727125],
    ])
    expected_y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    expected_idx = np.array([0, 3, 9, 12, 13, 14, 1, 2, 5, 6, 8, 11])
    assert_array_equal(X_res, expected_X)
    assert_array_equal(y_res, expected_y)
    assert_array_equal(kept_idx, expected_idx)
def test_oss_fit_resample_with_indices():
    """Check resampled data, labels and retained-sample indices returned by
    OneSidedSelection when ``return_indices=True``."""
    oss = OneSidedSelection(return_indices=True, random_state=RND_SEED)
    result_X, result_y, result_idx = oss.fit_resample(X, Y)

    ground_truth_X = np.array([
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [0.20246714, -0.34727125],
    ])
    ground_truth_y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    ground_truth_idx = np.array([0, 3, 9, 12, 13, 14, 1, 2, 5, 6, 8, 11])

    assert_array_equal(result_X, ground_truth_X)
    assert_array_equal(result_y, ground_truth_y)
    assert_array_equal(result_idx, ground_truth_idx)
def under_sampling(self, data, label, n_neighbors=5, method=None):
    """Undersample ``data``/``label`` with the requested technique.

    Parameters
    ----------
    data : 2D array, shape (im_height*im_width, n_bands)
        One row per sample.
    label : 1D array
        Integer class label (0, 1, 2, ...) per sample.
    n_neighbors : int
        Number of neighbours used by One-Sided Selection.
    method : str or None
        Undersampling method name; must be listed in ``self.under_method``
        to take effect ('OSS' / 'OneSidedSelection' supported).

    Returns
    -------
    The (possibly resampled) data and label arrays.
    """
    if method in self.under_method:
        print("Before sampling label proportion: ", Counter(label))
        if method in ('OSS', 'OneSidedSelection'):
            sampler = OneSidedSelection(n_neighbors=n_neighbors,
                                        n_seeds_S=200)
            data, label = sampler.fit_resample(data, label)
            print("After sampling label proportion: ", Counter(label))
    return data, label
# Undersample and plot an imbalanced dataset with One-Sided Selection.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import OneSidedSelection
from plotDataset import plot_dataset

if __name__ == '__main__':
    # Synthetic 2-feature dataset with a 99/1 class imbalance and no label
    # noise (flip_y=0).
    X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
                               n_clusters_per_class=1, weights=[0.99],
                               flip_y=0, random_state=1)
    counter = Counter(y)
    print(counter)
    plot_dataset(X, y, counter)

    # n_seeds_S controls how many majority samples seed set S.
    undersample = OneSidedSelection(n_neighbors=1, n_seeds_S=200)
    X, y = undersample.fit_resample(X, y)

    counter = Counter(y)
    print(counter)
    plot_dataset(X, y, counter)
def resampling_assigner(imb_technique, AA_ova_X_train, AA_ova_y_train,
                        AI_ova_X_train, AI_ova_y_train, AW_ova_X_train,
                        AW_ova_y_train, CC_ova_X_train, CC_ova_y_train,
                        QA_ova_X_train, QA_ova_y_train):
    """Resample five one-vs-all training splits with the chosen technique.

    Parameters
    ----------
    imb_technique : str
        One of "ADASYN", "ALLKNN", "CNN", "ENN", "IHT", "NCR", "NM", "OSS",
        "RENN", "SMOTE", "BSMOTE", "SMOTEENN", "SMOTETOMEK", "TOMEK",
        "ROS", "RUS".
    *_ova_X_train, *_ova_y_train
        Feature matrix and labels of each one-vs-all split
        (AA, AI, AW, CC, QA).

    Returns
    -------
    tuple
        (AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res,
         CC_X_res, CC_y_res, QA_X_res, QA_y_res).

    Raises
    ------
    ValueError
        If ``imb_technique`` is unrecognised (the original fell through the
        elif chain and died with an UnboundLocalError at the return).
    """
    print(imb_technique)

    # Technique name -> sampler class.  Replaces a 16-branch elif chain that
    # duplicated the same five fit_resample calls in every branch.
    sampler_classes = {
        "ADASYN": ADASYN,
        "ALLKNN": AllKNN,
        "CNN": CondensedNearestNeighbour,
        "ENN": EditedNearestNeighbours,
        "IHT": InstanceHardnessThreshold,
        "NCR": NeighbourhoodCleaningRule,
        "NM": NearMiss,
        "OSS": OneSidedSelection,
        "RENN": RepeatedEditedNearestNeighbours,
        "SMOTE": SMOTE,
        "BSMOTE": BorderlineSMOTE,
        "SMOTEENN": SMOTEENN,
        "SMOTETOMEK": SMOTETomek,
        "TOMEK": TomekLinks,
        "ROS": RandomOverSampler,
        "RUS": RandomUnderSampler,
    }
    if imb_technique not in sampler_classes:
        raise ValueError("Unknown imbalance technique: %s" % imb_technique)
    sampler_cls = sampler_classes[imb_technique]

    # Positive-class label of each split; only the NCR branch binarised the
    # labels (0 = positive class, 1 = everything else) before resampling,
    # exactly as the original code did.
    positive_label = {
        "AA": "Accepted/Assigned",
        "AI": "Accepted/In Progress",
        "AW": "Accepted/Wait",
        "CC": "Completed/Closed",
        "QA": "Queued/Awaiting Assignment",
    }
    splits = {
        "AA": (AA_ova_X_train, AA_ova_y_train),
        "AI": (AI_ova_X_train, AI_ova_y_train),
        "AW": (AW_ova_X_train, AW_ova_y_train),
        "CC": (CC_ova_X_train, CC_ova_y_train),
        "QA": (QA_ova_X_train, QA_ova_y_train),
    }

    resampled = {}
    for prefix, (X_train, y_train) in splits.items():
        if imb_technique == "NCR":
            y_train = [0 if lbl == positive_label[prefix] else 1
                       for lbl in y_train]
        # A fresh sampler instance per split, matching the original.
        resampled[prefix] = sampler_cls().fit_resample(X_train, y_train)

    AA_X_res, AA_y_res = resampled["AA"]
    AI_X_res, AI_y_res = resampled["AI"]
    AW_X_res, AW_y_res = resampled["AW"]
    CC_X_res, CC_y_res = resampled["CC"]
    QA_X_res, QA_y_res = resampled["QA"]
    return (AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res,
            CC_X_res, CC_y_res, QA_X_res, QA_y_res)
print(__doc__) # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=200, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply One-Sided Selection oss = OneSidedSelection(return_indices=True) X_resampled, y_resampled, idx_resampled = oss.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) fig = plt.figure() ax = fig.add_subplot(1, 1, 1) idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled) idx_class_0 = y_resampled == 0 plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1], alpha=.8, label='Class #0') plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1], alpha=.8, label='Class #1') plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1], alpha=.8, label='Removed samples')
#scaling scaler = StandardScaler() scaled_X_train = scaler.fit_transform(train_set.drop('plan', axis=1)) scaled_X_test = scaler.transform(test_set.drop('plan', axis=1)) scaled_test_users = scaler.transform(the_users_test) scaled_test_users = pd.DataFrame(scaled_test_users, columns=the_users_test.columns) #UNDERSAMPLING and/or OVERSAMPLING #undersampling the train set under = OneSidedSelection() X_train_res, y_train_res = under.fit_resample(scaled_X_train, y_train) #oversampling the train set sm = SMOTE() X_train_res, y_train_res = sm.fit_resample(X_train_res, y_train_res) X_train_res = pd.DataFrame(X_train_res, columns=train_set.drop('plan', axis=1).columns) #creating the final train and test set for modeling train_set = pd.concat([X_train_res, y_train_res], axis=1) scaled_X_test = pd.DataFrame(scaled_X_test, columns=test_set.drop('plan', axis=1).columns) test_set = pd.concat([scaled_X_test, test_set['plan'].reset_index(drop=True)],
def one_sided_selection(X, y):
    """Undersample (X, y) with One-Sided Selection (random_state fixed at 42)
    and return the resampled feature matrix and labels."""
    sampler = OneSidedSelection(random_state=42)
    return sampler.fit_resample(X, y)
def resampling_assigner(imb_technique, AP_ova_X_train, AP_ova_y_train,
                        PM_ova_X_train, PM_ova_y_train, SC_ova_X_train,
                        SC_ova_y_train):
    """Resample three one-vs-all training splits with the chosen technique.

    Fixes a crash in the original "NCR" branch, which instantiated the
    samplers as ``AP_iht``/``PM_iht``/``SC_iht`` but then called
    ``AP_ncr``/``PM_ncr``/``SC_ncr`` — a guaranteed NameError.

    Parameters
    ----------
    imb_technique : str
        One of "ADASYN", "ALLKNN", "CNN", "ENN", "IHT", "NCR", "NM", "OSS",
        "RENN", "SMOTE", "BSMOTE", "SMOTEENN", "SMOTETOMEK", "TOMEK",
        "ROS", "RUS".
    *_ova_X_train, *_ova_y_train
        Feature matrix and labels of each one-vs-all split (AP, PM, SC).

    Returns
    -------
    tuple
        (AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res).

    Raises
    ------
    ValueError
        If ``imb_technique`` is unrecognised (the original fell through the
        elif chain and died with an UnboundLocalError at the return).
    """
    print(imb_technique)

    # Technique name -> sampler class; collapses the duplicated elif chain.
    sampler_classes = {
        "ADASYN": ADASYN,
        "ALLKNN": AllKNN,
        "CNN": CondensedNearestNeighbour,
        "ENN": EditedNearestNeighbours,
        "IHT": InstanceHardnessThreshold,
        "NCR": NeighbourhoodCleaningRule,
        "NM": NearMiss,
        "OSS": OneSidedSelection,
        "RENN": RepeatedEditedNearestNeighbours,
        "SMOTE": SMOTE,
        "BSMOTE": BorderlineSMOTE,
        "SMOTEENN": SMOTEENN,
        "SMOTETOMEK": SMOTETomek,
        "TOMEK": TomekLinks,
        "ROS": RandomOverSampler,
        "RUS": RandomUnderSampler,
    }
    if imb_technique not in sampler_classes:
        raise ValueError("Unknown imbalance technique: %s" % imb_technique)
    sampler_cls = sampler_classes[imb_technique]

    # Positive-class label of each split; only the NCR branch binarised the
    # labels (0 = positive class, 1 = everything else) before resampling.
    positive_label = {
        "AP": "Add penalty",
        "PM": "Payment",
        "SC": "Send for Credit Collection",
    }
    splits = {
        "AP": (AP_ova_X_train, AP_ova_y_train),
        "PM": (PM_ova_X_train, PM_ova_y_train),
        "SC": (SC_ova_X_train, SC_ova_y_train),
    }

    resampled = {}
    for prefix, (X_train, y_train) in splits.items():
        if imb_technique == "NCR":
            y_train = [0 if lbl == positive_label[prefix] else 1
                       for lbl in y_train]
        # A fresh sampler instance per split, matching the original.
        resampled[prefix] = sampler_cls().fit_resample(X_train, y_train)

    AP_X_res, AP_y_res = resampled["AP"]
    PM_X_res, PM_y_res = resampled["PM"]
    SC_X_res, SC_y_res = resampled["SC"]
    return AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res
# Select columns 6 onward, then pick two feature columns and the label column.
# NOTE(review): assumes ``d`` is a DataFrame whose column 16 (index 10 of the
# slice) holds binary labels — confirm against the loading code.
df = d.iloc[:, 6:]
X_all = df.iloc[:, [0, 2]].values
y_all = df.iloc[:, 10].values

# Undersample and plot the imbalanced dataset with One-Sided Selection.
from collections import Counter
from imblearn.under_sampling import OneSidedSelection
from numpy import where

# Summarize the class distribution before resampling.
counter = Counter(y_all)
print(counter)

# Define the undersampling method (large n_seeds_S for a very large dataset).
undersample = OneSidedSelection(n_neighbors=1, n_seeds_S=600000, n_jobs=-1,
                                random_state=42)
# Transform the dataset.
X, y = undersample.fit_resample(X_all, y_all)

# Summarize the new class distribution.
counter = Counter(y)
print(counter)
# Old: Counter({0.0: 7319966, 1.0: 972038})
# New: Counter({0.0: 3857552, 1.0: 972038})

# Scatter plot of examples coloured by class label.
plt.figure(figsize=[13, 8])
plt.ylim([0, 7])
for label, _ in counter.items():
    row_ix = where(y == label)[0]
    plt.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label), s=16,
                marker='.')
plt.legend()
plt.show()
def one_sided_selection(x, y):
    """Apply One-Sided Selection undersampling to (x, y) and return the
    resampled feature matrix and labels."""
    print("----One Sided Selection----")
    resampler = OneSidedSelection()
    x_res, y_res = resampler.fit_resample(x, y)
    return x_res, y_res
def all_imblearn(xx, yy):
    """Run a battery of imbalanced-learn resamplers on (xx, yy).

    Returns a list of ``[X_resampled, y_resampled, name]`` triples, one per
    technique, covering over-sampling, under-sampling and combined methods.
    """
    imblearnlist = []
    """OVER SAMPLING"""
    """Random Over Sampler"""
    ros = RandomOverSampler(random_state=0)
    X_resampled, y_resampled = ros.fit_resample(xx, yy)
    randomOverSampler = [X_resampled, y_resampled, 'random over sampler']
    imblearnlist.append(randomOverSampler)
    """SMOTE"""
    X_resampled, y_resampled = SMOTE().fit_resample(xx, yy)
    smote = [X_resampled, y_resampled, 'smote']
    imblearnlist.append(smote)
    """SMOTE borderline1"""
    # NOTE(review): the ``kind`` argument was removed in imbalanced-learn
    # >= 0.4 (replaced by BorderlineSMOTE / SVMSMOTE) — confirm the pinned
    # imblearn version still supports these three calls.
    sm = SMOTE(kind='borderline1')
    X_resampled, y_resampled = sm.fit_resample(xx, yy)
    smote = [X_resampled, y_resampled, 'smote borderline1']
    imblearnlist.append(smote)
    """SMOTE borderline2"""
    sm = SMOTE(kind='borderline2')
    X_resampled, y_resampled = sm.fit_resample(xx, yy)
    smote = [X_resampled, y_resampled, 'smote borderline2']
    imblearnlist.append(smote)
    """SMOTE svm"""
    sm = SMOTE(kind='svm')
    X_resampled, y_resampled = sm.fit_resample(xx, yy)
    smote = [X_resampled, y_resampled, 'smote svm']
    imblearnlist.append(smote)
    """SMOTENC"""
    # Assumes columns 0 and 2 of ``xx`` are categorical — TODO confirm.
    smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
    X_resampled, y_resampled = smote_nc.fit_resample(xx, yy)
    smote = [X_resampled, y_resampled, 'smotenc']
    imblearnlist.append(smote)
    # """ADASYN"""
    # X_resampled, y_resampled = ADASYN.fit_resample(xx, yy)
    # adasyn = [X_resampled, y_resampled, 'adasyn']
    # imblearnlist.append(adasyn)
    # """UNDER SAMPLING"""
    """Cluster Centroids"""
    cc = ClusterCentroids(random_state=0)
    X_resampled, y_resampled = cc.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'cluster centroids']
    imblearnlist.append(reSampled)
    """Random Over Sampler"""
    # NOTE(review): heading above is mislabelled — this is the random
    # UNDER-sampler, as the triple's name string says.
    rus = RandomUnderSampler(random_state=0)
    X_resampled, y_resampled = rus.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'random under sampler']
    imblearnlist.append(reSampled)
    """Near Miss 1"""
    nm1 = NearMiss(version=1)
    X_resampled, y_resampled = nm1.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'near miss 1']
    imblearnlist.append(reSampled)
    """Near Miss 2"""
    nm2 = NearMiss(version=2)
    X_resampled, y_resampled = nm2.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'near miss 2']
    imblearnlist.append(reSampled)
    """Near Miss 3"""
    nm3 = NearMiss(version=3)
    X_resampled, y_resampled = nm3.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'near miss 3']
    imblearnlist.append(reSampled)
    """Edited Nearest Neighbours"""
    enn = EditedNearestNeighbours()
    X_resampled, y_resampled = enn.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'edited nearest neighbours']
    imblearnlist.append(reSampled)
    """Repeated Edited Nearest Neighbours"""
    renn = RepeatedEditedNearestNeighbours()
    X_resampled, y_resampled = renn.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'repeated edited nearest neighbours']
    imblearnlist.append(reSampled)
    """All KNN"""
    allknn = AllKNN()
    X_resampled, y_resampled = allknn.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'allKNN']
    imblearnlist.append(reSampled)
    """Condensed Nearest Neighbour"""
    cnn = CondensedNearestNeighbour(random_state=0)
    X_resampled, y_resampled = cnn.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'Condensed Nearest Neighbour']
    imblearnlist.append(reSampled)
    """One Sided Selection"""
    oss = OneSidedSelection(random_state=0)
    X_resampled, y_resampled = oss.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'One Sided Selection']
    imblearnlist.append(reSampled)
    """Neighbourhood Cleaning Rule"""
    ncr = NeighbourhoodCleaningRule()
    X_resampled, y_resampled = ncr.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'Neighbourhood Cleaning Rule']
    imblearnlist.append(reSampled)
    """OVER AND UNDER SAMPLING"""
    """SMOTEENN"""
    smote_enn = SMOTEENN(random_state=0)
    X_resampled, y_resampled = smote_enn.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'SMOTEENN']
    imblearnlist.append(reSampled)
    """SMOTETomek"""
    smote_tomek = SMOTETomek(random_state=0)
    X_resampled, y_resampled = smote_tomek.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'SMOTETomek']
    imblearnlist.append(reSampled)
    return imblearnlist
# Resample with the NearMiss sampler already defined upstream.
# Fix: the result was bound to ``y_resample`` but every later statement reads
# ``y_resampled`` — that was a NameError.
X_resampled, y_resampled = nm.fit_resample(X, y)
np.bincount(y_resampled)
plt.scatter(X_resampled[:, 0], X_resampled[:, 1], c=y_resampled)

# Samples removed by the resampling = all indices minus the kept ones.
# NOTE(review): assumes ``ind`` (kept indices) was produced upstream.
deleted_ind = np.setdiff1d(np.arange(len(X)), ind)
# Fix: this call (and its twin below) was missing its closing parenthesis.
plt.scatter(X[deleted_ind, 0], X[deleted_ind, 1], c=y[deleted_ind],
            marker='x', alpha=0.2)
plt.scatter(X_resampled[:, 0], X_resampled[:, 1], c=y_resampled)

from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection(random_state=0, n_neighbors=1, n_seeds_S=1)
X_resampled, y_resampled = oss.fit_resample(X, y)
np.bincount(y_resampled)

deleted_ind = np.setdiff1d(np.arange(len(X)), ind)
plt.scatter(X[deleted_ind, 0], X[deleted_ind, 1], c=y[deleted_ind],
            marker='x', alpha=0.2)
plt.scatter(X_resampled[:, 0], X_resampled[:, 1], c=y_resampled)

plt.scatter(X_resampled[:, 0], X_resampled[:, 1], c="gray", alpha=0.2)
plt.scatter(X[deleted_ind, 0], X[deleted_ind, 1], c=y[deleted_ind],
            marker='x')

# Fix: the colormap is ``viridis`` — ``plt.cm.virdis`` raised AttributeError.
colors = plt.cm.viridis(y[deleted_ind] / 2)
plt.scatter(X_resampled[:, 0], X_resampled[:, 1], c="gray", alpha=0.2)
plt.scatter(X[deleted_ind, 0], X[deleted_ind, 1], c=colors, marker='x')
np.set_printoptions(formatter={'float': '{:.1f}'.format})
print(tol/10)
print("\n")
print("Number of cases in each class")
print(Counter(y))

################### 4. Under-sampling: Selected Class (Compare Each Technique) ###################
########## one sided selection ##########
np.set_printoptions(formatter={'float': '{:.2f}'.format})
# Split the PCA dataframe into features (all but last column) and labels.
X = np.array(principal_4_Df.iloc[:,:-1])
y = np.array(principal_4_Df.iloc[:,-1])
model = KNeighborsClassifier(n_neighbors=5)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
us = OneSidedSelection(random_state=0, n_neighbors=8)
# Pipeline so resampling happens inside each CV fold, not before splitting.
pipeline = make_pipeline(us, model)
X_res , y_res = us.fit_resample(X, y)
overall = []
recall = np.zeros((1,13))
spe = np.zeros((1,13))
tol = np.zeros((13,13))
trial = 0
for train_index, test_index in skf.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(X_train, y_train)
    # NOTE(review): running cross_val_predict / cross_val_score on the TEST
    # fold of an outer split (with the same ``skf``) is unusual — confirm
    # this nested evaluation is intentional.
    y_pred = cross_val_predict(pipeline, X_test, y_test, cv=skf)
    score = cross_val_score(pipeline, X_test, y_test, cv=skf).mean()
    overall.append(score)
    trial+=1
print("KNN k=5 4 dim PCA One Sided Selection")
def test_oss_with_wrong_object():
    """Passing a non-estimator, non-int ``n_neighbors`` must raise
    ValueError during fit_resample."""
    bad_neighbors = 'rnd'
    sampler = OneSidedSelection(random_state=RND_SEED,
                                n_neighbors=bad_neighbors)
    with raises(ValueError, match="has to be a int"):
        sampler.fit_resample(X, Y)
def test_oss_with_wrong_object():
    """An invalid ``n_neighbors`` value (a string) is rejected with a
    ValueError when fitting."""
    oss = OneSidedSelection(random_state=RND_SEED, n_neighbors='rnd')
    with raises(ValueError, match="has to be a int"):
        oss.fit_resample(X, Y)
# NOTE(review): the first line continues a SMOTEN constructor call begun in a
# part of the file before this chunk.
random_state=seed, n_jobs=-1)
X_train_full_fs, y_train_full = oversample_smoten.fit_resample(
    X_train_full_fs, y_train_full)
counter = Counter(y_train_full)
print("After oversampling, the class distribution is:")
print(counter)

# Undersample with One-Sided Selection (Tomek Links + Condensed Nearest
# Neighbor).
print("Undersampling...")
# n_seeds_S is the number of majority-class samples added to set C, which is
# then the reference for a 1-NN over the remaining majority samples not in C.
# Here it is set to the minority-class count from the Counter above.
undersample_oss = OneSidedSelection(n_neighbors=1, n_seeds_S=counter[1],
                                    n_jobs=-1, random_state=seed)
X_train_full_fs, y_train_full = undersample_oss.fit_resample(
    X_train_full_fs, y_train_full)
counter = Counter(y_train_full)
print("After OSS undersampling, the class distribution is:")
print(counter)

# Further clean the boundary with the Neighbourhood Cleaning Rule.
undersample_ncr = NeighbourhoodCleaningRule(n_neighbors=3,
                                            threshold_cleaning=0.5,
                                            n_jobs=-1)
X_train_full_fs, y_train_full = undersample_ncr.fit_resample(
    X_train_full_fs, y_train_full)
counter = Counter(y_train_full)
print("After NCR undersampling, the class distribution is:")
print(counter)

# Saving to local disk; assumes X_train_full_fs is a DataFrame (has to_csv).
print("Saving to Local in csv...")
X_train_full_fs.to_csv("./data/X_train.csv", index=False)