def neighbourhood_cleaning_rule(feature_list_of_all_instances, class_list_of_all_instances, neighbours):
    """Clean the data with NeighbourhoodCleaningRule and print class tallies.

    Before cleaning, the labels are tallied as class 1, class 0 and "other".
    After cleaning, the number of removed points and the remaining class 1 /
    class 0 counts are printed.

    :param feature_list_of_all_instances: features handed to ``fit_sample``.
    :param class_list_of_all_instances: labels handed to ``fit_sample``.
    :param neighbours: forwarded to the sampler as ``n_neighbors``.
    :return: the ``(X_resampled, y_resampled)`` pair produced by ``fit_sample``.
    """
    ones_before = zeros_before = others_before = 0
    for label in class_list_of_all_instances:
        if label == 1:
            ones_before += 1
        elif label == 0:
            zeros_before += 1
        else:
            others_before += 1
    print(" Data of class 1 ", ones_before, " ,Data of cls 0 ", zeros_before,
          ",Other class ", others_before)

    cleaner = NeighbourhoodCleaningRule(n_neighbors=neighbours, n_jobs=4)
    X_resampled, y_resampled = cleaner.fit_sample(
        feature_list_of_all_instances, class_list_of_all_instances)

    removed = len(feature_list_of_all_instances) - len(X_resampled)
    # end='' keeps the summary below on the same output line.
    print(" Cleaned ", removed, " points", end='')

    ones_after = sum(1 for label in y_resampled if label == 1)
    zeros_after = sum(1 for label in y_resampled if label == 0)
    print(" and data of class 1 ", ones_after, "data of cls 0 ", zeros_after,
          "for ", neighbours, "neighbours ")
    return X_resampled, y_resampled
def test_ncr_wrong_nn_obj():
    """Expect a ValueError when ``n_neighbors`` is the string ``'rnd'``."""
    sampler = NeighbourhoodCleaningRule(
        n_neighbors='rnd',
        random_state=RND_SEED,
        return_indices=True,
    )
    with raises(ValueError, match="has to be one of"):
        sampler.fit_sample(X, Y)
def test_ncr_sample_wrong_X():
    """Expect ``sample`` to raise RuntimeError for data other than what was fitted."""
    sampler = NeighbourhoodCleaningRule(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Fresh random data with 40 columns and a 50/50 label vector.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
def under_sampling(df, title):
    """Under-sample ``df`` with NeighbourhoodCleaningRule and rebuild one frame.

    ``df`` is split by ``split_data`` into features and output label, both are
    resampled, and the results are concatenated column-wise using the original
    column names.

    :param df: input handed to ``split_data``.
    :param title: not used by this function.
    :return: DataFrame holding the resampled features followed by the labels.
    """
    features, output_label = split_data(df)
    X_undersampled, y_undersampled = NeighbourhoodCleaningRule().fit_resample(
        features, output_label)

    feature_frame = pd.DataFrame(X_undersampled, columns=features.columns)
    label_frame = pd.DataFrame(y_undersampled, columns=output_label.columns)
    return pd.concat([feature_frame, label_frame], axis=1)
def test_ncr_fit_sample():
    """Compare the ``fit_sample`` output with hard-coded expected arrays."""
    expected_X = np.array([
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [-0.91735824, 0.93110278],
        [0.35967591, 2.61186964],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ])
    expected_y = np.array([0, 0, 1, 2, 1, 2])

    sampler = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_ncr_fit_sample():
    """Compare the ``fit_sample`` output with the stored ``.npy`` fixtures."""
    sampler = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Fixtures live in the ``data`` directory next to this test module.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'ncr_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'ncr_y.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_ncr_fit_sample_mode():
    """Compare ``fit_sample`` with ``kind_sel='mode'`` against expected arrays."""
    expected_X = np.array([
        [0.34096173, 0.50947647],
        [-0.91735824, 0.93110278],
        [-0.20413357, 0.64628718],
        [0.35967591, 2.61186964],
        [0.90701028, -0.57636928],
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [1.39272351, -0.51631728],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ])
    expected_y = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])

    sampler = NeighbourhoodCleaningRule(kind_sel='mode', random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_ncr_fit():
    """Check the class statistics stored on the sampler after ``fit``."""
    sampler = NeighbourhoodCleaningRule(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Class labels recorded as minority and majority by the fit.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)

    # Per-class sample counts recorded by the fit.
    for label, expected_count in ((0, 500), (1, 4500)):
        assert_equal(sampler.stats_c_[label], expected_count)
def test_ncr_fit_resample_mode():
    """Compare ``fit_resample`` with ``kind_sel='mode'`` against expected arrays."""
    expected_X = np.array([
        [0.34096173, 0.50947647],
        [-0.91735824, 0.93110278],
        [-0.20413357, 0.64628718],
        [0.35967591, 2.61186964],
        [0.90701028, -0.57636928],
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [1.39272351, -0.51631728],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ])
    expected_y = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])

    sampler = NeighbourhoodCleaningRule(kind_sel='mode')
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_ncr_error():
    """Expect a ValueError for ``threshold_cleaning`` values of -10 and 10."""
    expected_msg = "'threshold_cleaning' is a value between 0 and 1."
    for bad_threshold in (-10, 10):
        sampler = NeighbourhoodCleaningRule(threshold_cleaning=bad_threshold)
        assert_raises_regex(ValueError, expected_msg, sampler.fit_sample, X, Y)
def ncrReSample():
    """Build train/test matrices, under-sampling the training split with NCR.

    The training split is resampled with ``NeighbourhoodCleaningRule``; the
    test split is left untouched. In both matrices the labels are appended as
    the last column. The class distribution before and after resampling is
    printed.

    :return: ``(trainset, testset)`` arrays of features plus a label column.
    """
    raw_train, raw_test = splitTrainTest(datapath)

    img_data, y = getFullImgFeature(raw_train)
    print('Original dataset shape %s' % Counter(y))

    ncr = NeighbourhoodCleaningRule()
    X_res, y_res = ncr.fit_resample(img_data, y)
    print('Resampled dataset shape %s' % Counter(y_res))

    # np.append(..., axis=1) requires both inputs to have the same number of
    # dimensions and therefore fails for a 1-D label vector. column_stack
    # accepts 1-D labels (turning them into a column) as well as (n, 1) labels.
    trainset = np.column_stack((X_res, y_res))

    testX, testy = getFullImgFeature(raw_test)
    testset = np.column_stack((testX, testy))
    return trainset, testset
def test_ncr_error():
    """Expect a ValueError for ``threshold_cleaning`` values of -10 and 10."""
    expected_msg = "'threshold_cleaning' is a value between 0 and 1"
    for bad_threshold in (-10, 10):
        sampler = NeighbourhoodCleaningRule(threshold_cleaning=bad_threshold)
        with raises(ValueError, match=expected_msg):
            sampler.fit_sample(X, Y)
def test_ncr_fit_sample_with_indices():
    """Compare ``fit_sample`` output, including indices, with ``.npy`` fixtures."""
    sampler = NeighbourhoodCleaningRule(random_state=RND_SEED,
                                        return_indices=True)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Fixtures live in the ``data`` directory next to this test module.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'ncr_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'ncr_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'ncr_idx.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_ncr_fit_sample_with_indices():
    """Compare ``fit_sample`` output, including indices, with expected arrays."""
    expected_X = np.array([
        [0.34096173, 0.50947647],
        [-0.91735824, 0.93110278],
        [-0.20413357, 0.64628718],
        [0.35967591, 2.61186964],
        [0.90701028, -0.57636928],
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [1.39272351, -0.51631728],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ])
    expected_y = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    expected_idx = np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14])

    sampler = NeighbourhoodCleaningRule(random_state=RND_SEED,
                                        return_indices=True)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_ncr_fit_sample():
    """Compare the ``fit_sample`` output with hard-coded expected arrays."""
    expected_X = np.array([
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [-0.91735824, 0.93110278],
        [-0.20413357, 0.64628718],
        [0.35967591, 2.61186964],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ])
    expected_y = np.array([0, 0, 1, 1, 2, 1, 2])

    sampler = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_ncr_fit_resample_with_indices():
    """Compare ``fit_resample`` output, including indices, with expected arrays."""
    expected_X = np.array([
        [0.34096173, 0.50947647],
        [-0.91735824, 0.93110278],
        [-0.20413357, 0.64628718],
        [0.35967591, 2.61186964],
        [0.90701028, -0.57636928],
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [1.39272351, -0.51631728],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ])
    expected_y = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    expected_idx = np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14])

    sampler = NeighbourhoodCleaningRule(return_indices=True)
    X_resampled, y_resampled, idx_under = sampler.fit_resample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def __init__(self, estimators, estimators_bag, estimators_ada, n_jobs=1,
             function_compare='precision_tp_fp', n_folds=3, n_estimators=100):
    """Store the constructor arguments and initialise the working state.

    :param estimators: stored as given; also converted with ``dict`` into
        ``named_estimators``.
    :param estimators_bag: stored as given.
    :param estimators_ada: stored as given.
    :param n_jobs: stored as given.
    :param function_compare: stored as given.
    :param n_folds: stored as given.
    :param n_estimators: stored as given.
    """
    # Constructor arguments.
    self.estimators = estimators
    self.named_estimators = dict(estimators)
    self.estimators_bag = estimators_bag
    self.estimators_ada = estimators_ada
    self.n_jobs = n_jobs
    self.function_compare = function_compare
    self.n_folds = n_folds
    self.n_estimators = n_estimators

    # Containers and placeholder scores, filled in elsewhere.
    self.groups = []
    self.clfs = []
    self.clfs_ensemble = []
    self.ensemble_ = []
    self.g_mean = [-1] * 2
    self.max_g = [-1] * 3
    self.clf_id = [-1] * 3

    self.meta_clf_ = MLPClassifier(solver='lbfgs', random_state=1)

    # Resampling methods; both share the same fixed seed.
    self.random_st = 5
    self.methods = [
        SMOTE(k_neighbors=3, random_state=self.random_st),
        NeighbourhoodCleaningRule(n_neighbors=3, random_state=self.random_st),
    ]
    self.methoda = [0, 1]
    # NOTE(review): the first label says "ADASYN" although the first entry of
    # ``methods`` is a SMOTE instance - confirm which one is intended.
    self.name_met = ["ADASYN", "NCR"]
def under_sample(X, y, sampler="RandomUnderSampler"):
    """Under-sample ``(X, y)`` with the sampler selected by name.

    Only the requested sampler is instantiated, with its default parameters.
    The list of supported names and the sorted class counts before and after
    resampling are printed.

    :param X: features passed to ``fit_resample``.
    :param y: labels passed to ``fit_resample``.
    :param sampler: name of the sampler to use; defaults to random
        under-sampling.
    :return: the ``(X_resampled, y_resampled)`` pair from ``fit_resample``.
    :raises KeyError: if ``sampler`` is not one of the supported names.
    """
    # Map names to classes (not instances) so that only the selected sampler
    # is constructed. Insertion order defines the printed list below.
    sampler_classes = {
        "RandomUnderSampler": RandomUnderSampler,
        "ClusterCentroids": ClusterCentroids,
        "NearMiss": NearMiss,
        "InstanceHardnessThreshold": InstanceHardnessThreshold,
        "CondensedNearestNeighbour": CondensedNearestNeighbour,
        "EditedNearestNeighbours": EditedNearestNeighbours,
        "RepeatedEditedNearestNeighbours": RepeatedEditedNearestNeighbours,
        "AllKNN": AllKNN,
        "NeighbourhoodCleaningRule": NeighbourhoodCleaningRule,
        "OneSidedSelection": OneSidedSelection,
    }

    # List of all samplers, in case you want to iterate all of them.
    samplers_list = list(sampler_classes)
    print(samplers_list)

    if sampler not in sampler_classes:
        raise KeyError(
            "Unknown sampler {!r}; expected one of {}".format(
                sampler, samplers_list))
    chosen = sampler_classes[sampler]()

    # Show the class counts before and after resampling.
    print("before", sorted(Counter(y).items()))
    X_resampled, y_resampled = chosen.fit_resample(X, y)
    print("after", sorted(Counter(y_resampled).items()))
    print('===' * 4, 'under_sample finished')
    return X_resampled, y_resampled
class ResamplingAlgorithms(Enum):
    """Resampling algorithms; each value is ``(display name, sampler instance)``."""

    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', InstanceHardnessThreshold(random_state=1))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        """Return the member whose display name equals ``name``.

        Falls back to ``RO`` when no member matches.
        """
        for algorithm in cls:
            if algorithm.value[0] == name:
                return algorithm
        return cls.RO
def under_sampling(X, y, method):
    """Under-sample ``(X, y)`` with the sampler class named by ``method``.

    The sampler is created with its default parameters.

    :param X: features passed to ``fit_resample``.
    :param y: labels passed to ``fit_resample``.
    :param method: one of 'ClusterCentroids', 'RandomUnderSampler',
        'NearMiss', 'EditedNearestNeighbours',
        'RepeatedEditedNearestNeighbours', 'AllKNN',
        'NeighbourhoodCleaningRule' or 'OneSidedSelection'.
    :return: the ``(X_resampled, y_resampled)`` pair from ``fit_resample``.
    :raises ValueError: if ``method`` is not one of the supported names.
    """
    if method == 'ClusterCentroids':
        sampler_cls = ClusterCentroids
    elif method == 'RandomUnderSampler':
        sampler_cls = RandomUnderSampler
    elif method == 'NearMiss':
        sampler_cls = NearMiss
    elif method == 'EditedNearestNeighbours':
        sampler_cls = EditedNearestNeighbours
    elif method == 'RepeatedEditedNearestNeighbours':
        sampler_cls = RepeatedEditedNearestNeighbours
    elif method == 'AllKNN':
        sampler_cls = AllKNN
    elif method == 'NeighbourhoodCleaningRule':
        sampler_cls = NeighbourhoodCleaningRule
    elif method == 'OneSidedSelection':
        sampler_cls = OneSidedSelection
    else:
        # Without this branch an unknown name would only surface later as an
        # unbound-variable error at the return statement.
        raise ValueError(
            'Unknown under-sampling method {!r}'.format(method))

    X_resampled, y_resampled = sampler_cls().fit_resample(X, y)
    return X_resampled, y_resampled
def test_ncr_init():
    """Check the attribute values of a sampler built with only ``random_state``."""
    sampler = NeighbourhoodCleaningRule(random_state=RND_SEED)
    expected_attributes = (
        ('n_neighbors', 3),
        ('n_jobs', 1),
        ('random_state', RND_SEED),
    )
    for attribute, expected in expected_attributes:
        assert_equal(getattr(sampler, attribute), expected)
def test_ncr_fit_sample_with_indices():
    """Compare ``fit_sample`` output, including indices, with expected arrays."""
    expected_X = np.array([
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [-0.91735824, 0.93110278],
        [-0.20413357, 0.64628718],
        [0.35967591, 2.61186964],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ])
    expected_y = np.array([0, 0, 1, 1, 2, 1, 2])
    expected_idx = np.array([10, 11, 3, 5, 7, 13, 14])

    sampler = NeighbourhoodCleaningRule(random_state=RND_SEED,
                                        return_indices=True)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_ncr_fit_sample_nn_obj():
    """Compare ``fit_sample`` output when ``n_neighbors`` is a NearestNeighbors object."""
    expected_X = np.array([
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [-0.91735824, 0.93110278],
        [0.35967591, 2.61186964],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ])
    expected_y = np.array([0, 0, 1, 2, 1, 2])
    expected_idx = np.array([10, 11, 3, 7, 13, 14])

    neighbours = NearestNeighbors(n_neighbors=3)
    sampler = NeighbourhoodCleaningRule(n_neighbors=neighbours,
                                        random_state=RND_SEED,
                                        return_indices=True)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_ncr_wrong_nn_obj():
    """Expect a ValueError when ``n_neighbors`` is the string ``'rnd'``."""
    sampler = NeighbourhoodCleaningRule(
        n_neighbors='rnd',
        random_state=RND_SEED,
        return_indices=True,
    )
    assert_raises_regex(ValueError, "has to be one of",
                        sampler.fit_sample, X, Y)
def test_multiclass_fit_sample():
    """Check the per-class counts after ``fit_sample`` on a three-class target."""
    # Relabel the first 1000 samples as a third class on a copy of Y.
    multiclass_y = Y.copy()
    multiclass_y[:1000] = 2

    sampler = NeighbourhoodCleaningRule(random_state=RND_SEED)
    _, y_resampled = sampler.fit_sample(X, multiclass_y)

    class_counts = Counter(y_resampled)
    for label, expected_count in ((0, 400), (1, 2268), (2, 42)):
        assert_equal(class_counts[label], expected_count)
def test_ncr_sample_wt_fit():
    """Expect ``sample`` to raise RuntimeError on a sampler that was never fitted."""
    unfitted_sampler = NeighbourhoodCleaningRule(random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted_sampler.sample, X, Y)
def make_clf(usx, usy, clf, clf_name, sampling, normalize=False):
    '''
    Train and test the classifier ``clf`` with shuffled, stratified 10-fold
    cross-validation and print the confusion-matrix totals.

    For every fold the training part is optionally resampled according to
    ``sampling`` ('SMOTE', 'ADASYN', 'ENN', 'Tomek', 'SMOTETomek', 'SMOTEENN',
    'NCR' or 'OSS'; any other value leaves the training data untouched).
    If ``normalize`` is true, a StandardScaler fitted on the training part is
    applied to both the training and the test part.
    '''
    print('----------{} with {}----------'.format(clf_name, sampling))

    # Factories rather than instances: a fresh resampler is built per fold.
    resampler_factories = {
        'SMOTE': lambda: SMOTE(sampling_strategy=0.3),
        'ADASYN': lambda: ADASYN(sampling_strategy=0.3),
        'ENN': lambda: EditedNearestNeighbours(),
        'Tomek': lambda: TomekLinks(),
        'SMOTETomek': lambda: SMOTETomek(sampling_strategy=0.3),
        'SMOTEENN': lambda: SMOTEENN(sampling_strategy=0.3),
        'NCR': lambda: NeighbourhoodCleaningRule(),
        'OSS': lambda: OneSidedSelection(),
    }

    tp_total = fp_total = fn_total = tn_total = 0

    # plot_ind and fold_no only serve the optional decision-tree plot, which
    # is currently disabled; the randint call is kept because it draws a
    # random number.
    plot_ind = randint(0, 9)
    fold_no = 0

    folds = StratifiedKFold(n_splits=10, shuffle=True)
    for train_index, test_index in folds.split(usx, usy):
        x_train, y_train = usx[train_index], usy[train_index]
        x_test, y_test = usx[test_index], usy[test_index]

        make_resampler = resampler_factories.get(sampling)
        if make_resampler is not None:
            x_train, y_train = make_resampler().fit_resample(x_train, y_train)

        if normalize:
            scaler = StandardScaler().fit(x_train)
            x_train = scaler.transform(x_train)
            x_test = scaler.transform(x_test)

        clf.fit(x_train, y_train)
        y_predict = clf.predict(x_test)

        # Labels are interpreted through their truthiness.
        for idx, predicted in enumerate(y_predict):
            actual = y_test[idx]
            if actual and predicted:
                tp_total += 1
            elif predicted:
                fp_total += 1
            elif actual:
                fn_total += 1
            else:
                tn_total += 1
        fold_no += 1

    print('TOTAL TP: ' + str(tp_total))
    print('TOTAL FP: ' + str(fp_total))
    print('TOTAL FN: ' + str(fn_total))
    print('TOTAL TN: ' + str(tn_total))
def test_ncr_wrong_nn_obj():
    """Expect a ValueError when ``n_neighbors`` is the string ``'rnd'``."""
    sampler = NeighbourhoodCleaningRule(
        n_neighbors='rnd',
        random_state=RND_SEED,
        return_indices=True,
    )
    assert_raises(ValueError, sampler.fit_sample, X, Y)
def test_continuous_error():
    """Expect ``fit`` to emit a UserWarning for a continuous target."""
    # 15 evenly spaced values between 0 and 1 act as the continuous target.
    continuous_y = np.linspace(0, 1, 15)
    sampler = NeighbourhoodCleaningRule(random_state=RND_SEED)
    assert_warns(UserWarning, sampler.fit, X, continuous_y)
def Resampling(train_x, train_y, resampling_method):
    """Label-encode ``train_y.data`` and resample the training set in place.

    The sampler is chosen by name, the labels in ``train_y.data`` are encoded
    with ``LabelEncoder``, and ``train_x.data`` / ``train_y.data`` are
    replaced by the output of the sampler's ``fit_resample``.

    :param train_x: object whose ``data`` attribute holds the features.
    :param train_y: object whose ``data`` attribute holds the labels.
    :param resampling_method: name of the sampler class to use (see the
        branches below for the supported names).
    :raises ValueError: if ``resampling_method`` is not a supported name; the
        inputs are left unmodified in that case.
    """
    # The sampler is selected before anything is mutated so that an unknown
    # name fails fast and leaves train_y untouched.

    # ---- UNDER-SAMPLING ------ #
    if resampling_method == "ClusterCentroids":
        resample = ClusterCentroids(voting='hard', random_state=42)
    elif resampling_method == "CondensedNearestNeighbour":
        resample = CondensedNearestNeighbour(n_neighbors=7, random_state=42)
    elif resampling_method == "EditedNearestNeighbours":
        resample = EditedNearestNeighbours(n_neighbors=7, kind_sel='mode', n_jobs=-1)
    elif resampling_method == "RepeatedEditedNearestNeighbours":
        resample = RepeatedEditedNearestNeighbours(n_neighbors=7, kind_sel='mode', n_jobs=-1)
    elif resampling_method == "AllKNN":
        resample = AllKNN(n_neighbors=7, kind_sel='mode', allow_minority=True, n_jobs=-1)
    elif resampling_method == "NearMiss":
        resample = NearMiss(n_neighbors=7, n_jobs=-1)
    elif resampling_method == "NeighbourhoodCleaningRule":
        resample = NeighbourhoodCleaningRule(n_neighbors=7, kind_sel='all')
    elif resampling_method == "RandomUnderSampler":
        resample = RandomUnderSampler(random_state=42)
    elif resampling_method == "TomekLinks":
        resample = TomekLinks(n_jobs=-1)
    # ---- OVER-SAMPLING ------ #
    elif resampling_method == "BorderlineSMOTE":
        resample = BorderlineSMOTE(random_state=42, n_jobs=-1)
    elif resampling_method == "KMeansSMOTE":
        resample = KMeansSMOTE(random_state=42)
    elif resampling_method == "RandomOverSampler":
        # This branch used to test for "RandomUnderSampler", which silently
        # replaced the under-sampler chosen above with an over-sampler.
        resample = RandomOverSampler(random_state=42)
    elif resampling_method == "SMOTE":
        resample = SMOTE(random_state=42, n_jobs=-1)
    else:
        raise ValueError(
            "Unknown resampling method {!r}".format(resampling_method))

    train_y.data = LabelEncoder().fit_transform(train_y.data)

    # Uncomment the line below to display a pie chart of the class
    # distribution before resampling.
    # plotGraphics.piePlot(train_y, "Before Resampling")

    # transform the dataset
    train_x.data, train_y.data = resample.fit_resample(train_x.data, train_y.data)
def test_ncr_init():
    """Check the attribute values of a sampler built with only ``random_state``."""
    sampler = NeighbourhoodCleaningRule(random_state=RND_SEED)
    expected_attributes = (
        ('n_neighbors', 3),
        ('n_jobs', 1),
        ('random_state', RND_SEED),
    )
    for attribute, expected in expected_attributes:
        assert_equal(getattr(sampler, attribute), expected)
def test_ncr_fit_single_class():
    """Expect ``fit`` to emit a UserWarning when the target holds a single class."""
    sampler = NeighbourhoodCleaningRule(random_state=RND_SEED)

    # An all-zero target with one entry per sample of X.
    n_samples = X.shape[0]
    single_class_y = np.zeros((n_samples,))
    assert_warns(UserWarning, sampler.fit, X, single_class_y)
def resample(self, X, y, by, random_state=None, visualize=False):
    '''
    Re-sample ``(X, y)`` with the method named by ``by``.

    by: String
        The method used to perform re-sampling
        currently support: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS',
        'NM', 'CC', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN', 'SMOTETomek',
        'ORG']
        'ORG' returns the inputs unchanged.
    random_state:
        Passed as ``random_state`` to the selected sampler.
    visualize: Boolean
        When true, scatter-plot columns 0 and 1 of the result, coloured by label.

    Returns the ``(X_train, y_train)`` pair.
    '''
    if by == 'ORG':
        X_train, y_train = X, y
    else:
        sampler_classes = {
            'RUS': RandomUnderSampler,
            'CNN': CondensedNearestNeighbour,
            'ENN': EditedNearestNeighbours,
            'NCR': NeighbourhoodCleaningRule,
            'Tomek': TomekLinks,
            'ALLKNN': AllKNN,
            'OSS': OneSidedSelection,
            'NM': NearMiss,
            'CC': ClusterCentroids,
            'SMOTE': SMOTE,
            'ADASYN': ADASYN,
            'BorderSMOTE': BorderlineSMOTE,
            'SMOTEENN': SMOTEENN,
            'SMOTETomek': SMOTETomek,
        }
        if by not in sampler_classes:
            # NOTE(review): ``Error`` is not defined in the visible code -
            # confirm it is importable where this method lives.
            raise Error('Unexpected \'by\' type {}'.format(by))
        sampler = sampler_classes[by](random_state=random_state)
        X_train, y_train = sampler.fit_resample(X, y)

    if visualize:
        frame = pd.DataFrame(X_train)
        frame['label'] = y_train
        frame.plot.scatter(x=0, y=1, c='label', s=3, colormap='coolwarm',
                           title='{} training set'.format(by))
    return X_train, y_train
print(__doc__) # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=200, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply neighbourhood cleaning rule ncl = NeighbourhoodCleaningRule(return_indices=True) X_resampled, y_resampled, idx_resampled = ncl.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) fig = plt.figure() ax = fig.add_subplot(1, 1, 1) idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled) idx_class_0 = y_resampled == 0 plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1], alpha=.8, label='Class #0') plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1], alpha=.8, label='Class #1') plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1],
def test_ncr_error(ncr_params, err_msg):
    """Expect ``fit_resample`` to raise a ValueError matching ``err_msg``.

    ``ncr_params`` is expanded into the sampler's keyword arguments.
    """
    sampler = NeighbourhoodCleaningRule(**ncr_params)
    with pytest.raises(ValueError, match=err_msg):
        sampler.fit_resample(X, Y)
def test_ncr_wrong_nn_obj():
    """Expect a ValueError when ``n_neighbors`` is the string ``'rnd'``."""
    sampler = NeighbourhoodCleaningRule(n_neighbors='rnd',
                                        return_indices=True)
    with raises(ValueError, match="has to be one of"):
        sampler.fit_resample(X, Y)
def test_deprecation_random_state():
    """Expect a DeprecationWarning from ``fit_resample`` when ``random_state`` is set."""
    expected_msg = "'random_state' is deprecated from 0.4"
    sampler = NeighbourhoodCleaningRule(random_state=0)
    with warns(DeprecationWarning, match=expected_msg):
        sampler.fit_resample(X, Y)
from imblearn.under_sampling import NeighbourhoodCleaningRule # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply neighbourhood cleaning rule ncl = NeighbourhoodCleaningRule() X_resampled, y_resampled = ncl.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)