def under_sampling(df, title):
    """Clean *df* with the Neighbourhood Cleaning Rule and return the result.

    The frame is split into features and labels via ``split_data``, cleaned,
    and reassembled into a single DataFrame with the original column names.
    ``title`` is accepted for interface compatibility but is not used here.
    """
    X, y = split_data(df)
    cleaner = NeighbourhoodCleaningRule()
    X_res, y_res = cleaner.fit_resample(X, y)
    # Rebuild labelled frames so the concatenated result keeps column names.
    # NOTE: `y.columns` implies split_data returns the labels as a DataFrame.
    resampled_features = pd.DataFrame(X_res, columns=X.columns)
    resampled_labels = pd.DataFrame(y_res, columns=y.columns)
    return pd.concat([resampled_features, resampled_labels], axis=1)
def test_ncr_fit_resample_mode():
    """NCR with ``kind_sel='mode'`` keeps exactly the expected samples."""
    sampler = NeighbourhoodCleaningRule(kind_sel='mode')
    X_res, y_res = sampler.fit_resample(X, Y)
    expected_X = np.array([
        [0.34096173, 0.50947647],
        [-0.91735824, 0.93110278],
        [-0.20413357, 0.64628718],
        [0.35967591, 2.61186964],
        [0.90701028, -0.57636928],
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [1.39272351, -0.51631728],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ])
    expected_y = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    assert_array_equal(X_res, expected_X)
    assert_array_equal(y_res, expected_y)
def test_ncr_fit_resample_mode():
    """Resampling with ``kind_sel='mode'`` must match the known ground truth."""
    ncr = NeighbourhoodCleaningRule(kind_sel='mode')
    resampled_X, resampled_y = ncr.fit_resample(X, Y)
    gt_rows = [
        (0.34096173, 0.50947647),
        (-0.91735824, 0.93110278),
        (-0.20413357, 0.64628718),
        (0.35967591, 2.61186964),
        (0.90701028, -0.57636928),
        (-1.20809175, -1.49917302),
        (-0.60497017, -0.66630228),
        (1.39272351, -0.51631728),
        (-1.55581933, 1.09609604),
        (1.55157493, -1.6981518),
    ]
    assert_array_equal(resampled_X, np.array(gt_rows))
    assert_array_equal(resampled_y, np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2]))
def ncrReSample():
    """Build NCR-undersampled train and untouched test sets as 2-D arrays.

    Returns
    -------
    (trainset, testset)
        Each is a matrix whose last column is the label vector.

    Bug fixed: ``imblearn``'s ``fit_resample`` returns a 1-D label array, so
    the original ``np.append(X_res, y_res, axis=1)`` raised ValueError
    (arrays must have the same number of dimensions for axis=1).
    ``np.column_stack`` promotes 1-D inputs to columns, handling both cases.
    """
    raw_train, raw_test = splitTrainTest(datapath)
    img_data, y = getFullImgFeature(raw_train)
    print('Original dataset shape %s' % Counter(y))
    # Clean class overlap in the training split only.
    ncr = NeighbourhoodCleaningRule()
    X_res, y_res = ncr.fit_resample(img_data, y)
    print('Resampled dataset shape %s' % Counter(y_res))
    trainset = np.column_stack((X_res, y_res))
    # Test split is left unresampled on purpose.
    textX, texty = getFullImgFeature(raw_test)
    testset = np.column_stack((textX, texty))
    return trainset, testset
def test_ncr_fit_resample_with_indices():
    """With ``return_indices=True``, NCR also returns the kept sample indices."""
    sampler = NeighbourhoodCleaningRule(return_indices=True)
    X_res, y_res, idx_res = sampler.fit_resample(X, Y)
    expected_X = np.array([
        [0.34096173, 0.50947647],
        [-0.91735824, 0.93110278],
        [-0.20413357, 0.64628718],
        [0.35967591, 2.61186964],
        [0.90701028, -0.57636928],
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [1.39272351, -0.51631728],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ])
    expected_y = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    expected_idx = np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14])
    assert_array_equal(X_res, expected_X)
    assert_array_equal(y_res, expected_y)
    assert_array_equal(idx_res, expected_idx)
def test_ncr_fit_resample_with_indices():
    """NCR with ``return_indices=True`` yields (X, y, indices) ground truth."""
    ncr = NeighbourhoodCleaningRule(return_indices=True)
    X_out, y_out, kept = ncr.fit_resample(X, Y)
    rows = [
        (0.34096173, 0.50947647),
        (-0.91735824, 0.93110278),
        (-0.20413357, 0.64628718),
        (0.35967591, 2.61186964),
        (0.90701028, -0.57636928),
        (-1.20809175, -1.49917302),
        (-0.60497017, -0.66630228),
        (1.39272351, -0.51631728),
        (-1.55581933, 1.09609604),
        (1.55157493, -1.6981518),
    ]
    assert_array_equal(X_out, np.array(rows))
    assert_array_equal(y_out, np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2]))
    assert_array_equal(kept, np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14]))
x=dataset.iloc[: ,1:640] #print(x) y=dataset['Class'] clf = RandomForestClassifier(n_estimators=300, max_depth=9, random_state=0) clf.fit(x, y) clf.feature_importances_ model = SelectFromModel(clf, prefit=True) x = model.transform(x) x.shape ncr = NeighbourhoodCleaningRule() x_resampled, y_resampled = ncr.fit_resample(x, y) #Optimization Algorithm opt=keras.optimizers.RMSprop(lr=0.00014, rho=0.9, epsilon=None, decay=0.0) #Multi Layer Perceptron Model model = Sequential() model.add(Dense(128, input_dim=639, activation='relu')) keras.layers.AlphaDropout(0.3, noise_shape=None, seed=None) model.add(Dense(128, activation='relu')) keras.layers.AlphaDropout(0.3, noise_shape=None, seed=None) model.add(Dense(128, activation='relu')) keras.layers.AlphaDropout(0.3, noise_shape=None, seed=None)
def test_deprecation_random_state():
    """Passing ``random_state`` must emit the 0.4 deprecation warning."""
    sampler = NeighbourhoodCleaningRule(random_state=0)
    expected = "'random_state' is deprecated from 0.4"
    with warns(DeprecationWarning, match=expected):
        sampler.fit_resample(X, Y)
# --- Per-year split + two-stage undersampling (script fragment; enclosing loop not shown) ---
arr = arr[:, mask]  # keep only the masked columns
input_arr = np.swapaxes(arr, 0, 1)  # transpose so samples are rows
X_train, X_test, y_train, y_test = train_test_split(
    input_arr, ground_truth, test_size=0.5, random_state=rng, shuffle=True
)
# Shrink the test split to a tiny fraction; the magic ratio presumably targets
# a fixed absolute test-set size -- TODO confirm against the caller.
X_remove, X_test, y_remove, y_test = train_test_split(
    X_test, y_test, test_size=0.00402228873, random_state=rng, shuffle=True
)
X_remove, y_remove = np.nan, np.nan  # drop references to the discarded portion
# First pass: Neighbourhood Cleaning Rule over all classes.
rus = NeighbourhoodCleaningRule(
    n_jobs=7, n_neighbors=8, threshold_cleaning=0.2, sampling_strategy="all"
)
X_train, y_train = rus.fit_resample(X_train, y_train)
# NOTE(review): `dict` shadows the builtin -- rename if this code is touched again.
dict = get_class_count_value(y_train, 1470588)
print(year, " pre resample ", dict)
# Second pass: random undersampling down to the per-class counts computed above.
rus = RandomUnderSampler(random_state=rng, sampling_strategy=dict)
X_train, y_train = rus.fit_resample(X_train, y_train)
X_train_list.append(X_train)
X_test_list.append(X_test)
y_train_list.append(y_train)
y_test_list.append(y_test)
print("finished years for loop")
X_train_array = X_train_list[0]
def undersample(args):
    """Undersample the elliptic training set with NCR and write it to CSV.

    Two modes, selected by ``args.stratify_timestep``:
    - False: one NCR pass over the whole training set.
    - True: an independent NCR pass per timestep group (column "ts").
    In both modes the (unresampled) test set is appended to the output and
    the NE feature columns are dropped before writing ``args.output_file``.

    Raises
    ------
    NotImplementedError
        For any ``args.data`` other than "elliptic".
    """
    ncr = NeighbourhoodCleaningRule(n_neighbors=3, threshold_cleaning=0.5)
    if args.data == "elliptic":
        # NOTE(review): Namespace normally takes keyword arguments; passing
        # args.elliptic_args positionally looks suspicious -- verify.
        elliptic_args = Namespace(args.elliptic_args)
        elliptic_data = cdr.get_data(
            source="elliptic",
            config_file=args.data_config_file,
            encode_classes=elliptic_args.encode_classes)
        dataset = elliptic_data.train_test_split(
            train_size=elliptic_args.train_size,
            feat_set="AF_NE",
            inc_meta=True,
            inc_unknown=elliptic_args.inc_unknown)
        train_X = dataset.train_X
        train_y = dataset.train_y
        test_X = dataset.test_X
        counter = Counter(train_y)
        print("Train set counter [Label]: {}".format(counter))
        if args.stratify_timestep == False:
            # Resample on the AF_NE feature columns only; the resampled X is
            # discarded -- sample_indices_ is used to slice the full frame.
            _, y = ncr.fit_resample(train_X[elliptic_data.feature_cols_AF_NE_],
                                    train_y)
            counter = Counter(y)
            print("Train set counter after NCL [Label]: {}".format(counter))
            indices = ncr.sample_indices_
            samples_kept = train_X.iloc[indices]
            # NOTE(review): DataFrame.append is deprecated and removed in
            # pandas 2.x -- presumably should migrate to pd.concat.
            undersampled_set = samples_kept.append(test_X, ignore_index=True)
            undersampled_set.drop(elliptic_data.feature_cols_NE_,
                                  inplace=True, axis=1)
            undersampled_set.to_csv(args.output_file, index=False, header=False)
        # stratify on time version
        else:
            tmp_data = train_X.copy()
            tmp_data["label"] = train_y.copy()
            ts_data = tmp_data.groupby("ts")
            removed = 0
            total_pre = tmp_data.shape[0]
            undersampled_set = pd.DataFrame()
            # Run NCR independently inside each timestep group.
            for ts, group in ts_data:
                grouped_X = group.iloc[:, :-1]  # all columns except "label"
                ts_X = grouped_X[elliptic_data.feature_cols_AF_NE_]
                ts_y = group["label"]
                counter = Counter(ts_y)
                print("Train set (ts:{}) counter Label: {}".format(
                    ts, counter))
                X, y = ncr.fit_resample(ts_X, ts_y)
                indices = ncr.sample_indices_
                counter = Counter(y)
                print("Train set (ts:{}) counter after NCR Label: {}".format(
                    ts, counter))
                total_removed = ts_X.shape[0] - X.shape[0]
                print("Total removed (ts:{}): {}".format(ts, total_removed))
                removed += total_removed
                # Indices are positions within this group, so slice grouped_X.
                samples_kept = grouped_X.iloc[indices]
                print("Total samples kept (ts:{}): {}".format(
                    ts, samples_kept.shape[0]))
                undersampled_set = undersampled_set.append(samples_kept,
                                                           ignore_index=True)
            print("-------------------------------------")
            print("Total samples removed: {} from {}".format(
                removed, total_pre))
            undersampled_set = undersampled_set.append(test_X,
                                                       ignore_index=True)
            undersampled_set.drop(elliptic_data.feature_cols_NE_,
                                  inplace=True, axis=1)
            undersampled_set.to_csv(args.output_file, index=False, header=False)
    else:
        raise NotImplementedError("'{}' dataset not yet implemented".format(
            args.data))
def test_ncr_wrong_nn_obj():
    """An unrecognised ``n_neighbors`` argument must raise ValueError."""
    bad_nn = 'rnd'
    sampler = NeighbourhoodCleaningRule(return_indices=True, n_neighbors=bad_nn)
    with raises(ValueError, match="has to be one of"):
        sampler.fit_resample(X, Y)
def test_ncr_error(ncr_params, err_msg):
    """Invalid constructor params surface as ValueError at fit_resample time."""
    sampler = NeighbourhoodCleaningRule(**ncr_params)
    with pytest.raises(ValueError, match=err_msg):
        sampler.fit_resample(X, Y)
# or try cost-sensitive down-weighting # CondensedNearestNeighbour technique for undersampling from imblearn.under_sampling import CondensedNearestNeighbour cnn = CondensedNearestNeighbour(n_neighbors=5, n_seeds_S=55) cnn_X, cnn_y = cnn.fit_resample(X, y) plotData2D(cnn_X, cnn_y) # CondensedNearestNeighbour removes too many instances we will not use it # NeighbourhoodCleaningRule technique for undersampling from imblearn.under_sampling import NeighbourhoodCleaningRule ncr = NeighbourhoodCleaningRule(sampling_strategy='majority', n_neighbors=5, kind_sel='mode') ncr_X, ncr_y = ncr.fit_resample(X, y) plotData2D(ncr_X, ncr_y) # NeighbourhoodCleaningRule also doesn't work for this dataset because it removes many data points # in just one region and that will mess up our decision boundary and make our predictions worse. # to clarify more, all UNN methods won't work perfectly for this dataset because most of data points of # the majority class are condensed in one region while this region is where data points should be removed # from, but UNN methods would remove data points from weeker regions until it clears them out before even # starting to remove from the strong region, and this would change the distribution of data points of # the majority class, and that's a thing we don't want to happen # this leave us to use random undersampling # RandomUnderSampler technique for undersampling from imblearn.under_sampling import RandomUnderSampler
    # Tail of an enclosing function whose definition is above this view:
    # stacks the collected image tensors and returns them with the last
    # column of `dataset` as labels.
    img_set = torch.stack(img_set, 0)
    return img_set, dataset[:, -1]
def ncrReSample():
    """Return (trainset, testset) matrices with labels as the last column.

    The training split is undersampled with the Neighbourhood Cleaning Rule;
    the test split is left untouched.
    """
    raw_train, raw_test = splitTrainTest(datapath)
    img_data, y = getFullImgFeature(raw_train)
    print('Original dataset shape %s' % Counter(y))
    ncr = NeighbourhoodCleaningRule()
    X_res, y_res = ncr.fit_resample(img_data, y)
    print('Resampled dataset shape %s' % Counter(y_res))
    # NOTE(review): fit_resample returns a 1-D y; np.append with axis=1
    # requires matching ndim, so this likely raises ValueError -- verify
    # (np.column_stack would handle both shapes).
    trainset = np.append(X_res, y_res, axis=1)
    textX, texty = getFullImgFeature(raw_test)
    testset = np.append(textX, texty, axis=1)
    return trainset, testset
if __name__ == "__main__":
    # Smoke-test of the resampling pipeline on the local dataset.
    imgpath = '../data/memotion_analysis_training_data/data_7000/'
    datapath = '../data/data_7000_new.csv'
    batchsize = 4
    raw_train, raw_test = splitTrainTest(datapath)
    img_data, y = getFullImgFeature(raw_train)
    print('Original dataset shape %s' % Counter(y))
    ncr = NeighbourhoodCleaningRule()
    # Reshaped to a single feature column here, unlike ncrReSample above.
    X_res, y_res = ncr.fit_resample(img_data.reshape(-1, 1), y)
    print('Resampled dataset shape %s' % Counter(y_res))
    print(X_res)
def test_deprecation_random_state():
    """``random_state`` is deprecated since 0.4 and should trigger a warning."""
    deprecated_sampler = NeighbourhoodCleaningRule(random_state=0)
    msg = "'random_state' is deprecated from 0.4"
    with warns(DeprecationWarning, match=msg):
        deprecated_sampler.fit_resample(X, Y)
def neighbourhood_cleaning(x, y):
    """Undersample (x, y) with the Neighbourhood Cleaning Rule.

    Returns the resampled feature matrix and label vector.
    """
    print("----Neighbourhood Cleaning Rule----")
    cleaner = NeighbourhoodCleaningRule()
    x_res, y_res = cleaner.fit_resample(x, y)
    return x_res, y_res
def all_imblearn(xx, yy):
    """Resample (xx, yy) with a battery of imbalanced-learn samplers.

    Parameters
    ----------
    xx : array-like
        Feature matrix.
    yy : array-like
        Target vector.

    Returns
    -------
    list
        One ``[X_resampled, y_resampled, label]`` triple per technique, in
        the original order: over-sampling, under-sampling, then combined.

    The original body repeated the same construct/fit/append pattern ~19
    times and mislabelled the RandomUnderSampler section as "Random Over
    Sampler"; it is collapsed into a recipe table here.
    """
    # Zero-argument factories so each sampler is built immediately before it
    # is fitted, preserving the original construction/failure order.
    recipes = [
        # --- over sampling ---
        (lambda: RandomOverSampler(random_state=0), 'random over sampler'),
        (lambda: SMOTE(), 'smote'),
        # NOTE(review): SMOTE(kind=...) was removed in imblearn 0.4+ in favour
        # of BorderlineSMOTE / SVMSMOTE -- these three fail on recent versions.
        (lambda: SMOTE(kind='borderline1'), 'smote borderline1'),
        (lambda: SMOTE(kind='borderline2'), 'smote borderline2'),
        (lambda: SMOTE(kind='svm'), 'smote svm'),
        # Assumes columns 0 and 2 of xx are categorical -- TODO confirm.
        (lambda: SMOTENC(categorical_features=[0, 2], random_state=0),
         'smotenc'),
        # ADASYN was commented out in the original code and stays disabled.
        # --- under sampling ---
        (lambda: ClusterCentroids(random_state=0), 'cluster centroids'),
        (lambda: RandomUnderSampler(random_state=0), 'random under sampler'),
        (lambda: NearMiss(version=1), 'near miss 1'),
        (lambda: NearMiss(version=2), 'near miss 2'),
        (lambda: NearMiss(version=3), 'near miss 3'),
        (lambda: EditedNearestNeighbours(), 'edited nearest neighbours'),
        (lambda: RepeatedEditedNearestNeighbours(),
         'repeated edited nearest neighbours'),
        (lambda: AllKNN(), 'allKNN'),
        (lambda: CondensedNearestNeighbour(random_state=0),
         'Condensed Nearest Neighbour'),
        (lambda: OneSidedSelection(random_state=0), 'One Sided Selection'),
        (lambda: NeighbourhoodCleaningRule(), 'Neighbourhood Cleaning Rule'),
        # --- over and under sampling combined ---
        (lambda: SMOTEENN(random_state=0), 'SMOTEENN'),
        (lambda: SMOTETomek(random_state=0), 'SMOTETomek'),
    ]
    imblearnlist = []
    for make_sampler, label in recipes:
        X_resampled, y_resampled = make_sampler().fit_resample(xx, yy)
        imblearnlist.append([X_resampled, y_resampled, label])
    return imblearnlist
# Undersample an imbalanced synthetic dataset with the Neighbourhood
# Cleaning Rule and plot the class distribution before and after.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import NeighbourhoodCleaningRule
from plotDataset import plot_dataset

if __name__ == '__main__':
    # Two-feature dataset with a 99%/1% class imbalance.
    X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
                               n_clusters_per_class=1, weights=[0.99],
                               flip_y=0, random_state=1)
    class_counts = Counter(y)
    print(class_counts)
    plot_dataset(X, y, class_counts)
    # Clean borderline/noisy majority samples.
    cleaner = NeighbourhoodCleaningRule(n_neighbors=3, threshold_cleaning=0.5)
    X, y = cleaner.fit_resample(X, y)
    class_counts = Counter(y)
    print(class_counts)
    plot_dataset(X, y, class_counts)
df.iloc[:, 2:] = df.iloc[:, 2:].applymap(mapping) #df[['QFANSHIPr1','QSHOW_ELEMENTS_r13', 'QSHOW_ELEMENTS_r14']].groupby(['QFANSHIPr1']).agg(['mean', 'count']) ''' model ''' # get mtx X = df.iloc[:, 2:].values y = df.iloc[:, 1].values ''' ALLKNN ''' from collections import Counter from imblearn.under_sampling import AllKNN, NeighbourhoodCleaningRule # define undersampling strategy under_allknn = NeighbourhoodCleaningRule() # fit and apply the transform X, y = under_allknn.fit_resample(X, y) # summarize class distribution print(Counter(y)) ''' 5: Decision Tree''' from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV from sklearn.tree import DecisionTreeClassifier clf_dt = DecisionTreeClassifier(random_state=1337) clf_dt.fit(X, y) cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1337) acc = cross_val_score(estimator=clf_dt, X=X, y=y, cv=cv, scoring='f1') acc.mean(), acc.std() parameters = { 'criterion': ['gini', 'entropy'],
print(__doc__) # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=200, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply neighbourhood cleaning rule ncl = NeighbourhoodCleaningRule(return_indices=True) X_resampled, y_resampled, idx_resampled = ncl.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) fig = plt.figure() ax = fig.add_subplot(1, 1, 1) idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled) idx_class_0 = y_resampled == 0 plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1], alpha=.8, label='Class #0') plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1], alpha=.8, label='Class #1') plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1], alpha=.8, label='Removed samples')
def resampling_assigner(imb_technique, AA_ova_X_train, AA_ova_y_train,
                        AI_ova_X_train, AI_ova_y_train, AW_ova_X_train,
                        AW_ova_y_train, CC_ova_X_train, CC_ova_y_train,
                        QA_ova_X_train, QA_ova_y_train):
    """Resample the five one-vs-all training sets with the chosen technique.

    Parameters
    ----------
    imb_technique : str
        One of "ADASYN", "ALLKNN", "CNN", "ENN", "IHT", "NCR", "NM", "OSS",
        "RENN", "SMOTE", "BSMOTE", "SMOTEENN", "SMOTETOMEK", "TOMEK",
        "ROS", "RUS".
    *_ova_X_train, *_ova_y_train
        Feature/label pairs for the AA, AI, AW, CC and QA one-vs-all splits.

    Returns
    -------
    tuple
        (AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res,
         CC_X_res, CC_y_res, QA_X_res, QA_y_res).

    The original 16-branch elif chain duplicated the same five fit_resample
    calls per technique; it is replaced by a factory dispatch table. An
    unknown technique previously crashed with an UnboundLocalError at the
    return statement -- it now raises ValueError immediately.
    """
    print(imb_technique)
    # One sampler class per technique name; a fresh instance is created for
    # each dataset, matching the original per-dataset sampler objects.
    factories = {
        "ADASYN": ADASYN,
        "ALLKNN": AllKNN,
        "CNN": CondensedNearestNeighbour,
        "ENN": EditedNearestNeighbours,
        "IHT": InstanceHardnessThreshold,
        "NCR": NeighbourhoodCleaningRule,
        "NM": NearMiss,
        "OSS": OneSidedSelection,
        "RENN": RepeatedEditedNearestNeighbours,
        "SMOTE": SMOTE,
        "BSMOTE": BorderlineSMOTE,
        "SMOTEENN": SMOTEENN,
        "SMOTETOMEK": SMOTETomek,
        "TOMEK": TomekLinks,
        "ROS": RandomOverSampler,
        "RUS": RandomUnderSampler,
    }
    if imb_technique not in factories:
        raise ValueError(
            "Unknown imbalance technique: {}".format(imb_technique))
    factory = factories[imb_technique]
    # The original NCR branch binarised labels first: 0 for the dataset's own
    # class name, 1 for everything else. That behaviour is preserved.
    positive_labels = {
        "AA": "Accepted/Assigned",
        "AI": "Accepted/In Progress",
        "AW": "Accepted/Wait",
        "CC": "Completed/Closed",
        "QA": "Queued/Awaiting Assignment",
    }
    datasets = [
        ("AA", AA_ova_X_train, AA_ova_y_train),
        ("AI", AI_ova_X_train, AI_ova_y_train),
        ("AW", AW_ova_X_train, AW_ova_y_train),
        ("CC", CC_ova_X_train, CC_ova_y_train),
        ("QA", QA_ova_X_train, QA_ova_y_train),
    ]
    results = []
    for key, X_train, y_train in datasets:
        if imb_technique == "NCR":
            y_train = [0 if label == positive_labels[key] else 1
                       for label in y_train]
        X_res, y_res = factory().fit_resample(X_train, y_train)
        results.extend((X_res, y_res))
    return tuple(results)
# Undersample with One-Sided Selection (Tomek Links + Condensed Nearest Neighbor) print("Undersampling...") # n_seeds_S is the number of majority class to be added to set C, which is then used as a reference for a kNN on the remaining majority samples not in set C undersample_oss = OneSidedSelection(n_neighbors=1, n_seeds_S=counter[1], n_jobs=-1, random_state=seed) X_train_full_fs, y_train_full = undersample_oss.fit_resample( X_train_full_fs, y_train_full) counter = Counter(y_train_full) print("After OSS undersampling, the class distribution is:") print(counter) undersample_ncr = NeighbourhoodCleaningRule(n_neighbors=3, threshold_cleaning=0.5, n_jobs=-1) X_train_full_fs, y_train_full = undersample_ncr.fit_resample( X_train_full_fs, y_train_full) counter = Counter(y_train_full) print("After NCR undersampling, the class distribution is:") print(counter) # Saving to Local print("Saving to Local in csv...") X_train_full_fs.to_csv("./data/X_train.csv", index=False) X_validation_full_fs.to_csv("./data/X_validation.csv", index=False) y_train_full.to_csv("./data/Y_train.csv", index=False) y_validation_full.to_csv("./data/Y_validation.csv", index=False) # Read from Local print("Reading from local...") X_train_full_fs = pd.read_csv("./data/X_train.csv") X_validation_full_fs = pd.read_csv("./data/X_validation.csv")
#Splitting data into train data and labels training_df_fl = training_df_final[[ col for col in training_df_final if col not in ['outcome', 'bidder_id'] ]] training_df_flabel = training_df_final[[ col for col in training_df_final if col in ['outcome'] ]] # In[ ]: #Applying neighborhood cleaning rule and preparing 1st phase model data ncr = NeighbourhoodCleaningRule(n_neighbors=15, random_state=32, ratio={0: 0.5}) training_df_X, training_df_y = ncr.fit_resample( training_df_fl, training_df_flabel.values.reshape(1, -1)[0]) # In[ ]: #Creating class for containing different model executions class ClassifierContainer: def __init__(self, model, training_X, training_y, measuring_parameter='auc'): self.model = model self.training_X = training_X self.training_y = training_y
import numpy as np
from common.import_data import ImportData
from collections import Counter
from imblearn.under_sampling import NeighbourhoodCleaningRule

# Shrink the majority class with the Neighbourhood Cleaning Rule and report
# the class distribution before and after.
if __name__ == "__main__":
    importer = ImportData()
    features: np.ndarray = importer.import_all_data()
    labels: np.ndarray = importer.import_columns(np.array(['Class'])).ravel()
    print('Original dataset shape %s' % Counter(labels))
    cleaner = NeighbourhoodCleaningRule()
    features_res, labels_res = cleaner.fit_resample(features, labels)
    print('Reduced dataset shape %s' % Counter(labels_res))
# --- Option-driven resampling, each result written to CSV (script fragment) ---
# NOTE(review): every branch writes the same ros_feature.csv / ros_label.csv
# pair, so later options silently overwrite earlier output -- confirm whether
# per-method filenames were intended.
# These first four lines finish a branch whose `if` is above this view.
csv_X = pd.DataFrame(data=X_resampled)
csv_y = pd.DataFrame(data=y_resampled)
csv_X.to_csv('ros_feature.csv', header=False, index=False)
csv_y.to_csv('ros_label.csv', header=False, index=False)
if (option == "3"):
    #ADASYN method
    X_resampled, y_resampled = ADASYN().fit_resample(X, y)
    csv_X = pd.DataFrame(data=X_resampled)
    csv_y = pd.DataFrame(data=y_resampled)
    csv_X.to_csv('ros_feature.csv', header=False, index=False)
    csv_y.to_csv('ros_label.csv', header=False, index=False)
if (option == "4"):
    #Random under sampling method
    rus = RandomUnderSampler(random_state=0)
    X_resampled, y_resampled = rus.fit_resample(X, y)
    csv_X = pd.DataFrame(data=X_resampled)
    csv_y = pd.DataFrame(data=y_resampled)
    csv_X.to_csv('ros_feature.csv', header=False, index=False)
    csv_y.to_csv('ros_label.csv', header=False, index=False)
if (option == "5"):
    #Neighbourhood cleaning rule method
    ncr = NeighbourhoodCleaningRule()
    X_resampled, y_resampled = ncr.fit_resample(X, y)
    csv_X = pd.DataFrame(data=X_resampled)
    csv_y = pd.DataFrame(data=y_resampled)
    csv_X.to_csv('ros_feature.csv', header=False, index=False)
    csv_y.to_csv('ros_label.csv', header=False, index=False)