def test_ncr_sample_wrong_X():
    """Check that ``sample`` raises ``RuntimeError`` when it receives data
    other than what was passed to ``fit``."""
    cleaner = NeighbourhoodCleaningRule(random_state=RND_SEED)
    cleaner.fit(X, Y)

    # Freshly generated data that was never seen by ``fit``.
    unseen_X = np.random.random((100, 40))
    unseen_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, cleaner.sample, unseen_X, unseen_y)
def test_ncr_init():
    """Verify the attribute values stored by the constructor."""
    cleaner = NeighbourhoodCleaningRule(random_state=RND_SEED)

    # (attribute name, expected value) pairs, checked in order.
    expected_attributes = (
        ('n_neighbors', 3),
        ('n_jobs', 1),
        ('random_state', RND_SEED),
    )
    for attribute, expected in expected_attributes:
        assert_equal(getattr(cleaner, attribute), expected)
def under_sampling(df, title):
    """Undersample ``df`` with ``NeighbourhoodCleaningRule``.

    The frame is split with ``split_data`` into features and label, both
    are resampled, and the results are joined column-wise into a single
    DataFrame that reuses the original column names.

    ``title`` is accepted but not used by this function.
    """
    features, output_label = split_data(df)
    X_res, y_res = NeighbourhoodCleaningRule().fit_resample(features, output_label)

    resampled_features = pd.DataFrame(X_res, columns=features.columns)
    resampled_label = pd.DataFrame(y_res, columns=output_label.columns)
    return pd.concat([resampled_features, resampled_label], axis=1)
def resample(self, X, y, by, random_state=None, visualize=False):
    """Resample ``(X, y)`` with the method named by ``by``.

    Parameters
    ----------
    X, y :
        Training data and labels, handed unchanged to the selected
        sampler's ``fit_resample``.
    by : str
        The re-sampling method. Currently supported:
        ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS', 'NM', 'CC',
        'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN', 'SMOTETomek', 'ORG'].
        'ORG' returns the data untouched.
    random_state :
        Seed forwarded to the samplers that accept one.
    visualize : bool
        When true, scatter-plot columns 0 and 1 of the result, coloured
        by label.

    Returns
    -------
    (X_train, y_train) : the resampled (or original, for 'ORG') data.

    Raises
    ------
    Error
        If ``by`` is not one of the supported names.
    """
    if by == 'ORG':
        # 'ORG' keeps the original data; no sampler is built.
        X_train, y_train = X, y
    else:
        # Samplers that take a ``random_state``.
        seeded_samplers = {
            'RUS': RandomUnderSampler,
            'CNN': CondensedNearestNeighbour,
            'OSS': OneSidedSelection,
            'CC': ClusterCentroids,
            'SMOTE': SMOTE,
            'ADASYN': ADASYN,
            'BorderSMOTE': BorderlineSMOTE,
            'SMOTEENN': SMOTEENN,
            'SMOTETomek': SMOTETomek,
        }
        # Cleaning / nearest-neighbour samplers: imbalanced-learn deprecated
        # their ``random_state`` argument in 0.4 and later releases reject
        # it, so they are built without it.
        unseeded_samplers = {
            'ENN': EditedNearestNeighbours,
            'NCR': NeighbourhoodCleaningRule,
            'Tomek': TomekLinks,
            'ALLKNN': AllKNN,
            'NM': NearMiss,
        }
        if by in seeded_samplers:
            sampler = seeded_samplers[by](random_state=random_state)
        elif by in unseeded_samplers:
            sampler = unseeded_samplers[by]()
        else:
            # NOTE(review): ``Error`` is not a builtin; confirm it is defined
            # or imported in this module.
            raise Error('Unexpected \'by\' type {}'.format(by))
        X_train, y_train = sampler.fit_resample(X, y)

    if visualize:
        df = pd.DataFrame(X_train)
        df['label'] = y_train
        df.plot.scatter(x=0, y=1, c='label', s=3, colormap='coolwarm',
                        title='{} training set'.format(by))
    return X_train, y_train
def test_ncr_fit_sample():
    """Compare the ``fit_sample`` output on (X, Y) with hard-coded
    expected arrays."""
    X_resampled, y_resampled = NeighbourhoodCleaningRule(
        random_state=RND_SEED).fit_sample(X, Y)

    expected_X = np.array([
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [-0.91735824, 0.93110278],
        [0.35967591, 2.61186964],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ])
    expected_y = np.array([0, 0, 1, 2, 1, 2])

    for actual, expected in ((X_resampled, expected_X),
                             (y_resampled, expected_y)):
        assert_array_equal(actual, expected)
def test_ncr_fit_sample_mode():
    """Compare the ``fit_sample`` output with ``kind_sel='mode'`` against
    hard-coded expected arrays."""
    cleaner = NeighbourhoodCleaningRule(random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = cleaner.fit_sample(X, Y)

    expected_X = np.array([
        [0.34096173, 0.50947647],
        [-0.91735824, 0.93110278],
        [-0.20413357, 0.64628718],
        [0.35967591, 2.61186964],
        [0.90701028, -0.57636928],
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [1.39272351, -0.51631728],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ])
    expected_y = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])

    for actual, expected in ((X_resampled, expected_X),
                             (y_resampled, expected_y)):
        assert_array_equal(actual, expected)
def test_ncr_fit_sample():
    """Compare the ``fit_sample`` output with the arrays stored in the
    ``data`` directory next to this file."""
    X_resampled, y_resampled = NeighbourhoodCleaningRule(
        random_state=RND_SEED).fit_sample(X, Y)

    # Ground-truth fixtures live in ``<this file's directory>/data``.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'ncr_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'ncr_y.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_ncr_fit():
    """Check the class statistics exposed after calling ``fit``."""
    cleaner = NeighbourhoodCleaningRule(random_state=RND_SEED)
    cleaner.fit(X, Y)

    # Minority / majority class labels recorded by ``fit``.
    assert_equal(cleaner.min_c_, 0)
    assert_equal(cleaner.maj_c_, 1)

    # Per-class sample counts recorded by ``fit``.
    for label, expected_count in ((0, 500), (1, 4500)):
        assert_equal(cleaner.stats_c_[label], expected_count)
def ncrReSample():
    """Build train/test matrices, undersampling the training part with
    ``NeighbourhoodCleaningRule``.

    The data at the module-level ``datapath`` is split with
    ``splitTrainTest`` and features/labels are extracted with
    ``getFullImgFeature``. Only the training split is resampled.

    Returns
    -------
    (trainset, testset) : two arrays whose last column holds the labels.
    """
    raw_train, raw_test = splitTrainTest(datapath)

    img_data, y = getFullImgFeature(raw_train)
    print('Original dataset shape %s' % Counter(y))

    X_res, y_res = NeighbourhoodCleaningRule().fit_resample(img_data, y)
    print('Resampled dataset shape %s' % Counter(y_res))

    # ``np.append(..., axis=1)`` requires both operands to have the same
    # number of dimensions and therefore fails for a 1-D label vector.
    # ``column_stack`` turns 1-D input into a column and leaves 2-D input
    # as it is.
    trainset = np.column_stack((X_res, y_res))

    test_X, test_y = getFullImgFeature(raw_test)
    testset = np.column_stack((test_X, test_y))
    return trainset, testset
def get_under_sample_models():
    """Return two parallel lists: undersampler instances and their names."""
    # (sampler, display name) pairs, in the order they are returned.
    registry = [
        (TomekLinks(), 'TomesLinks'),
        (EditedNearestNeighbours(), 'EditedNearestNeighbors'),
        (RepeatedEditedNearestNeighbours(), 'RENN'),
        (OneSidedSelection(), 'OneSidedSelection'),
        (NeighbourhoodCleaningRule(), 'NCR'),
    ]
    models = [sampler for sampler, _ in registry]
    names = [label for _, label in registry]
    return models, names
def test_ncr_fit_sample_with_indices():
    """Compare the ``fit_sample`` output, including the returned indices,
    with the arrays stored in the ``data`` directory next to this file."""
    cleaner = NeighbourhoodCleaningRule(return_indices=True,
                                        random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = cleaner.fit_sample(X, Y)

    # Ground-truth fixtures live in ``<this file's directory>/data``.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'ncr_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'ncr_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'ncr_idx.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_ncr_fit_sample_with_indices():
    """Compare the ``fit_sample`` output, including the returned indices,
    with hard-coded expected arrays."""
    X_resampled, y_resampled, idx_under = NeighbourhoodCleaningRule(
        return_indices=True, random_state=RND_SEED).fit_sample(X, Y)

    expected_X = np.array([
        [0.34096173, 0.50947647],
        [-0.91735824, 0.93110278],
        [-0.20413357, 0.64628718],
        [0.35967591, 2.61186964],
        [0.90701028, -0.57636928],
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [1.39272351, -0.51631728],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ])
    expected_y = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    expected_idx = np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14])

    comparisons = (
        (X_resampled, expected_X),
        (y_resampled, expected_y),
        (idx_under, expected_idx),
    )
    for actual, expected in comparisons:
        assert_array_equal(actual, expected)
def test_ncr_init():
    """Verify the attribute values present right after construction,
    including the not-yet-fitted statistics."""
    verbose = True
    cleaner = NeighbourhoodCleaningRule(random_state=RND_SEED, verbose=verbose)

    # (attribute name, expected value) pairs, checked in order.
    expected_attributes = (
        ('size_ngh', 3),
        ('n_jobs', -1),
        ('random_state', RND_SEED),
        ('verbose', verbose),
        ('min_c_', None),
        ('maj_c_', None),
        ('stats_c_', {}),
    )
    for attribute, expected in expected_attributes:
        assert_equal(getattr(cleaner, attribute), expected)
def resample(self, X, y, by, random_state=None):
    """Resample ``(X, y)`` with the method named by ``by``.

    Parameters
    ----------
    X, y :
        Training data and labels, handed unchanged to the selected
        sampler's ``fit_resample``.
    by : str
        The re-sampling method. Currently supported:
        ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS', 'NM', 'CC',
        'ROS', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN', 'SMOTETomek',
        'ORG']. 'ORG' returns the data untouched.
    random_state :
        Seed forwarded to the samplers that accept one.

    Returns
    -------
    (X_train, y_train) : the resampled (or original, for 'ORG') data.

    Raises
    ------
    Error
        If ``by`` is not one of the supported names.
    """
    if by == 'ORG':
        # 'ORG' keeps the original data; no sampler is built.
        return X, y

    # Samplers that take a ``random_state``.
    seeded_samplers = {
        'RUS': RandomUnderSampler,
        'CNN': CondensedNearestNeighbour,
        'OSS': OneSidedSelection,
        'CC': ClusterCentroids,
        'ROS': RandomOverSampler,
        'SMOTE': SMOTE,
        'ADASYN': ADASYN,
        'BorderSMOTE': BorderlineSMOTE,
        'SMOTEENN': SMOTEENN,
        'SMOTETomek': SMOTETomek,
    }
    # Cleaning / nearest-neighbour samplers: imbalanced-learn deprecated
    # their ``random_state`` argument in 0.4 and later releases reject it,
    # so they are built without it.
    unseeded_samplers = {
        'ENN': EditedNearestNeighbours,
        'NCR': NeighbourhoodCleaningRule,
        'Tomek': TomekLinks,
        'ALLKNN': AllKNN,
        'NM': NearMiss,
    }
    if by in seeded_samplers:
        sampler = seeded_samplers[by](random_state=random_state)
    elif by in unseeded_samplers:
        sampler = unseeded_samplers[by]()
    else:
        # NOTE(review): ``Error`` is not a builtin; confirm it is defined or
        # imported in this module.
        raise Error('Unexpected \'by\' type {}'.format(by))

    X_train, y_train = sampler.fit_resample(X, y)
    return X_train, y_train
def test_ncr_fit_sample_with_indices():
    """Compare the ``fit_sample`` output, including the returned indices,
    with hard-coded expected arrays."""
    X_resampled, y_resampled, idx_under = NeighbourhoodCleaningRule(
        return_indices=True, random_state=RND_SEED).fit_sample(X, Y)

    expected_X = np.array([
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [-0.91735824, 0.93110278],
        [-0.20413357, 0.64628718],
        [0.35967591, 2.61186964],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ])
    expected_y = np.array([0, 0, 1, 1, 2, 1, 2])
    expected_idx = np.array([10, 11, 3, 5, 7, 13, 14])

    comparisons = (
        (X_resampled, expected_X),
        (y_resampled, expected_y),
        (idx_under, expected_idx),
    )
    for actual, expected in comparisons:
        assert_array_equal(actual, expected)
def test_multiclass_fit_sample():
    """Check the per-class counts after resampling a three-class target."""
    # Turn the target into a three-class one by relabelling the first
    # 1000 entries of a copy as class 2.
    multiclass_y = Y.copy()
    multiclass_y[:1000] = 2

    cleaner = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = cleaner.fit_sample(X, multiclass_y)

    class_counts = Counter(y_resampled)
    for label, expected_count in ((0, 400), (1, 2268), (2, 42)):
        assert_equal(class_counts[label], expected_count)
def test_ncr_fit_sample_nn_obj():
    """Compare the ``fit_sample`` output when ``n_neighbors`` is given as a
    ``NearestNeighbors`` object with hard-coded expected arrays."""
    neighbours = NearestNeighbors(n_neighbors=3)
    cleaner = NeighbourhoodCleaningRule(
        return_indices=True, random_state=RND_SEED, n_neighbors=neighbours)
    X_resampled, y_resampled, idx_under = cleaner.fit_sample(X, Y)

    expected_X = np.array([
        [-1.20809175, -1.49917302],
        [-0.60497017, -0.66630228],
        [-0.91735824, 0.93110278],
        [0.35967591, 2.61186964],
        [-1.55581933, 1.09609604],
        [1.55157493, -1.6981518],
    ])
    expected_y = np.array([0, 0, 1, 2, 1, 2])
    expected_idx = np.array([10, 11, 3, 7, 13, 14])

    comparisons = (
        (X_resampled, expected_X),
        (y_resampled, expected_y),
        (idx_under, expected_idx),
    )
    for actual, expected in comparisons:
        assert_array_equal(actual, expected)
def UnderSample(X, Y, method='Random', random_state=42):
    """Undersample ``(X, Y)`` with the sampler selected by ``method``.

    Parameters
    ----------
    X :
        Feature array; a 1-D array is reshaped to a single column.
    Y :
        Target labels.
    method : str
        One of 'Cluster', 'Random', 'NearMiss_1', 'NearMiss_2',
        'NearMiss_3', 'TomekLinks', 'ENN', 'RENN', 'All_KNN', 'CNN',
        'One_SS', 'NCR', 'IHT'.
    random_state :
        Seed passed to every sampler.

    Returns
    -------
    (X_resampled, Y_resampled) : output of the sampler's ``fit_sample``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # As many elements as rows means X is 1-D: promote it to one column.
    if X.size == len(X):
        X = X.reshape(-1, 1)

    # Strings are compared with ``==``; ``is`` tests object identity and
    # only matches by accident of interning.
    if method == 'Cluster':
        # estimator=None selects the default KMeans estimator.
        sampler = ClusterCentroids(ratio='auto', random_state=random_state,
                                   estimator=None)
    elif method == 'Random':
        sampler = RandomUnderSampler(ratio='auto', random_state=random_state,
                                     replacement=False)
    elif method == 'NearMiss_1':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=1)
    elif method == 'NearMiss_2':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=2)
    elif method == 'NearMiss_3':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=3)
    elif method == 'TomekLinks':
        sampler = TomekLinks(ratio='auto', random_state=random_state)
    elif method == 'ENN':
        # kind_sel may be 'all' or 'mode'.
        sampler = EditedNearestNeighbours(ratio='auto',
                                          random_state=random_state,
                                          kind_sel='all')
    elif method == 'RENN':
        # kind_sel may be 'all' or 'mode'.
        sampler = RepeatedEditedNearestNeighbours(ratio='auto',
                                                  random_state=random_state,
                                                  kind_sel='all')
    elif method == 'All_KNN':
        sampler = AllKNN(ratio='auto', random_state=random_state,
                         kind_sel='all')
    elif method == 'CNN':
        sampler = CondensedNearestNeighbour(ratio='auto',
                                            random_state=random_state)
    elif method == 'One_SS':
        sampler = OneSidedSelection(ratio='auto', random_state=random_state)
    elif method == 'NCR':
        sampler = NeighbourhoodCleaningRule(ratio='auto',
                                            random_state=random_state,
                                            kind_sel='all',
                                            threshold_cleaning=0.5)
    elif method == 'IHT':
        sampler = InstanceHardnessThreshold(estimator=None, ratio='auto',
                                            random_state=random_state)
    else:
        # Previously an unknown name fell through and failed later with an
        # unbound ``sampler``.
        raise ValueError('Unsupported under-sampling method: {!r}'.format(method))

    X_resampled, Y_resampled = sampler.fit_sample(X, Y)
    return X_resampled, Y_resampled
def equalize_training_dataset_with_NClearningRule(x_train, y_train):
    """Undersample a multi-dimensional training set with
    ``NeighbourhoodCleaningRule``.

    Each sample is flattened to one row for resampling, and the result is
    reshaped back so that only the first (sample) dimension changes.

    Returns
    -------
    (x_resampled, y_resampled)
    """
    from imblearn.under_sampling import NeighbourhoodCleaningRule

    original_shape = list(x_train.shape)

    # The sampler works on 2-D input: one flattened row per sample.
    flat_x = np.reshape(x_train, (x_train.shape[0], -1))

    # NOTE(review): imbalanced-learn documents a dict ``sampling_strategy``
    # as unsupported for cleaning samplers - confirm this call works with
    # the installed version.
    per_class_target = {label: 180 for label in range(43)}
    cleaner = NeighbourhoodCleaningRule(
        sampling_strategy=per_class_target, n_neighbors=5, n_jobs=8)
    x_resampled, y_resampled = cleaner.fit_resample(flat_x, y_train)
    print(sorted(Counter(y_resampled).items()))

    # Restore the per-sample shape with the new number of samples.
    restored_shape = tuple([x_resampled.shape[0]] + original_shape[1:])
    x_resampled = np.reshape(x_resampled, restored_shape)
    return x_resampled, y_resampled
def UnderSample(df, _class, method = 'cc', strategy = 'auto', n_jobs = 1, ratio = None, transform = None, offline = None):
    """Undersample ``df`` on the label column ``_class`` and plot the result.

    Supported ``method`` values (case-insensitive):

    * 'nearmiss'  - NearMiss: select values which are closest to the
      minority class.
    * 'tomelinks' - TomekLinks: uses connected sets between class borders
      which are closest. If there are no other points closer, it assumes
      they are noise or borderline and removes them.
    * 'ncl'       - NeighbourhoodCleaningRule: uses ENN (Edited Nearest
      Neighbours, which removes majority instances near the borderline) to
      remove majority samples. Finds nearest neighbours and, if all are
      correctly labelled, keeps them.
    * 'cc'        - ClusterCentroids: finds clusters of majority samples
      with K-means, then keeps the cluster centroids as the new majority
      sample.

    Reference:
    https://towardsdatascience.com/sampling-techniques-for-extremely-imbalanced-data-part-i-under-sampling-a8dbc3d8d6d8

    Parameters
    ----------
    df : DataFrame holding the features and the label column.
    _class : name of the label column.
    method : which sampler to use (see above).
    strategy : value passed to the sampler as ``sampling_strategy``.
    n_jobs : value passed to the sampler as ``n_jobs``.
    ratio : legacy alias of ``strategy``; when not ``None`` it takes
        precedence.
    transform : when truthy the resampled DataFrame is returned, otherwise
        the function returns ``None``.
    offline : accepted but not used by this function.

    Raises
    ------
    Exception
        If ``method`` is not one of the supported names.
    """
    Y = df[_class]
    X = df.drop(_class, axis = 1)

    method_key = method.lower()
    if method_key == 'nearmiss':
        sampler_cls = NearMiss
    elif method_key == 'tomelinks':
        sampler_cls = TomekLinks
    elif method_key == 'ncl':
        sampler_cls = NeighbourhoodCleaningRule
    elif method_key == 'cc':
        sampler_cls = ClusterCentroids
    else:
        raise Exception("{} is not a valid method for UserSampling".format(method))

    # The keyword was misspelled ``stratey`` (a TypeError on every call).
    # ``ratio`` is the pre-0.4 name of ``sampling_strategy`` and is no longer
    # a constructor argument, so it is mapped onto ``sampling_strategy``.
    sampling_strategy = strategy if ratio is None else ratio
    # NOTE(review): ``n_jobs`` is deprecated for ClusterCentroids in recent
    # imbalanced-learn releases - confirm against the installed version.
    sampler = sampler_cls(sampling_strategy = sampling_strategy, n_jobs = n_jobs)
    x, y = sampler.fit_resample(X, Y)

    # Rebuild one frame with the feature columns followed by the label.
    # ``df.columns`` already contains ``_class``, so the feature names come
    # from ``X``.
    df = pd.concat(
        [
            pd.DataFrame(x, columns = X.columns).reset_index(drop = True),
            pd.Series(y, name = _class).reset_index(drop = True),
        ],
        axis = 1,
    )

    # Scatter-plot matrix of every column, coloured by the label.
    fig = go.Figure()
    fig.add_trace(
        go.Splom(
            dimensions = [
                dict(label = column, values = df[column])
                for column in df.columns
            ],
            marker = dict(color = df[_class]),
        )
    )
    fig.show()

    if transform:
        return df
    return
def under_sampling_algs():
    """Return the list of ``(sampler, short name)`` pairs to evaluate.

    The first entry is a pair of plain strings standing for the
    no-resampling case rather than a sampler object.
    """
    return [
        ("No Rs Undersampling case", "No Re-sampling"),
        (RandomUnderSampler(random_state=1), 'RU'),
        (ClusterCentroids(random_state=1), 'CC'),
        (TomekLinks(), 'TL'),
        (NearMiss(version=1), 'NM1'),
        (NearMiss(version=2), 'NM2'),
        (NearMiss(version=3), 'NM3'),
        (CondensedNearestNeighbour(random_state=1), 'CNN'),
        (OneSidedSelection(random_state=1), 'OSS'),
        (EditedNearestNeighbours(), 'ENN'),
        (NeighbourhoodCleaningRule(), 'NCL'),
        (InstanceHardnessThreshold(random_state=1), 'IHT'),
        (RepeatedEditedNearestNeighbours(), 'RENN'),
        (AllKNN(), 'AllKNN'),
    ]
def get_models():
    """Return two parallel lists: undersampler instances and their
    abbreviations."""
    # (sampler, abbreviation) pairs, in the order they are returned.
    registry = (
        (TomekLinks(), 'TL'),
        (EditedNearestNeighbours(), 'ENN'),
        (RepeatedEditedNearestNeighbours(), 'RENN'),
        (OneSidedSelection(), 'OSS'),
        (NeighbourhoodCleaningRule(), 'NCR'),
    )
    models, names = map(list, zip(*registry))
    return models, names
def get_samplers():
    """Return a dict mapping short names to default-configured samplers.

    ``EditedNearestNeighbours`` ('ENN') and ``KMeansSMOTE`` ('SMOTEKMeans')
    are not registered; their entries were commented out in the original.
    """
    samplers = {}

    # Under-samplers
    samplers.update({
        'RandomUn': RandomUnderSampler(),
        'TL': TomekLinks(),
        'RENN': RepeatedEditedNearestNeighbours(),
        'OSS': OneSidedSelection(),
        'NCR': NeighbourhoodCleaningRule(),
        'IHT': InstanceHardnessThreshold(),
    })

    # Over-samplers
    samplers.update({
        'RandomOv': RandomOverSampler(),
        'SMOTE': SMOTE(),
        'SMOTESVM': SVMSMOTE(),
        'ADASYN': ADASYN(),
    })

    # Combined under- and over-samplers; the cleaning step is restricted
    # to the majority class.
    majority_enn = EditedNearestNeighbours(sampling_strategy='majority')
    samplers['SMOTEENN'] = SMOTEENN(enn=majority_enn)
    majority_tomek = TomekLinks(sampling_strategy='majority')
    samplers['SMOTETomek'] = SMOTETomek(tomek=majority_tomek)

    return samplers
def resample_data(predictors, target, df_data, method):
    """Resample a training dataset prior to training models.

    Parameters
    ----------
    predictors : list of feature column names in ``df_data``.
    target : name of the target column in ``df_data``.
    df_data : DataFrame holding both predictors and target.
    method : str
        One of 'adasyn', 'random-over-sampler', 'smote', 'smote-tomek',
        'smote-enn', 'edited-nn', 'repeated-edited-nn', 'all-knn',
        'one-sided-selection', 'cluster-centroids', 'random-under-sampler',
        'neighbourhood-cleaning-rule', 'condensed-nearest-neighbour',
        'near-miss', 'instance-hardness-threshold'.

    Returns
    -------
    (x_resampled, y_resampled) : DataFrames with the original column names.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == 'adasyn':
        util = ADASYN()
    elif method == 'random-over-sampler':
        util = RandomOverSampler()
    elif method == 'smote':
        util = SMOTE(kind='borderline2')
    elif method == 'smote-tomek':
        util = SMOTETomek()
    elif method == 'smote-enn':
        util = SMOTEENN()
    elif method == 'edited-nn':
        util = EditedNearestNeighbours()
    elif method == 'repeated-edited-nn':
        util = RepeatedEditedNearestNeighbours()
    elif method == 'all-knn':
        util = AllKNN()
    elif method == 'one-sided-selection':
        util = OneSidedSelection()
    elif method == 'cluster-centroids':
        util = ClusterCentroids()
    elif method == 'random-under-sampler':
        util = RandomUnderSampler()
    elif method == 'neighbourhood-cleaning-rule':
        util = NeighbourhoodCleaningRule()
    elif method == 'condensed-nearest-neighbour':
        util = CondensedNearestNeighbour()
    elif method == 'near-miss':
        util = NearMiss(version=1)
    elif method == 'instance-hardness-threshold':
        util = InstanceHardnessThreshold()
    else:
        # Previously an unknown name fell through and failed below with an
        # unbound ``util``.
        raise ValueError('Unsupported resampling method: {!r}'.format(method))

    x_resampled, y_resampled = util.fit_sample(df_data[predictors],
                                               df_data[target])
    x_resampled = pd.DataFrame(x_resampled, columns=predictors)
    y_resampled = pd.DataFrame(y_resampled, columns=[target])
    return x_resampled, y_resampled
def create_sampler(sampler_name, random_state=None):
    """Build the sampler identified by ``sampler_name`` (case-insensitive).

    Parameters
    ----------
    sampler_name : str or None
        'RandomUnderSampler', 'TomekLinks', 'ENN', 'NCL',
        'RandomOverSampler', 'SMOTE', 'SMOTETomek' or 'SMOTEENN'.
        ``None`` or the string 'None' means "no sampler".
    random_state :
        Seed forwarded to the samplers that accept one.

    Returns
    -------
    The sampler instance, or ``None`` when no sampler was requested.

    Raises
    ------
    ValueError
        If ``sampler_name`` is not supported.
    """
    if sampler_name is None or sampler_name == 'None':
        return None

    key = sampler_name.lower()

    # TomekLinks, ENN and NCL are built without ``random_state``:
    # imbalanced-learn deprecated that argument for them in 0.4 and later
    # releases reject it.
    if key == 'randomundersampler':
        return RandomUnderSampler(random_state=random_state)
    if key == 'tomeklinks':
        return TomekLinks()
    if key == 'enn':
        return EditedNearestNeighbours()
    if key == 'ncl':
        return NeighbourhoodCleaningRule()
    if key == 'randomoversampler':
        return RandomOverSampler(random_state=random_state)
    if key == 'smote':
        return SMOTE(random_state=random_state)
    if key == 'smotetomek':
        return SMOTETomek(random_state=random_state)
    if key == 'smoteenn':
        return SMOTEENN(random_state=random_state)
    raise ValueError('Unsupported value \'%s\' for sampler' % sampler_name)
def __init__(self):
    """Populate three name -> estimator registries.

    ``self.oversamplers``, ``self.undersamplers`` and
    ``self.ensemblesamplers`` map each class name to a default-constructed
    instance. imbalanced-learn is imported here, so the dependency is only
    required when an instance is created.
    """
    from imblearn.over_sampling import (
        SMOTE, ADASYN, SVMSMOTE, BorderlineSMOTE, RandomOverSampler)
    from imblearn.under_sampling import (
        ClusterCentroids, RandomUnderSampler, InstanceHardnessThreshold,
        NearMiss, TomekLinks, EditedNearestNeighbours,
        RepeatedEditedNearestNeighbours, AllKNN, OneSidedSelection,
        CondensedNearestNeighbour, NeighbourhoodCleaningRule)
    # NOTE(review): EasyEnsemble and BalanceCascade are not available in
    # recent imbalanced-learn releases - confirm the pinned version.
    from imblearn.ensemble import (
        EasyEnsemble, EasyEnsembleClassifier, BalancedBaggingClassifier,
        BalancedRandomForestClassifier, BalanceCascade, RUSBoostClassifier)

    self.oversamplers = {
        'ADASYN': ADASYN(),
        'RandomOverSampler': RandomOverSampler(),
        'SMOTE': SMOTE(),
        'BorderlineSMOTE': BorderlineSMOTE(),
        'SVMSMOTE': SVMSMOTE(),
    }
    self.undersamplers = {
        'ClusterCentroids': ClusterCentroids(),
        'RandomUnderSampler': RandomUnderSampler(),
        'InstanceHardnessThreshold': InstanceHardnessThreshold(),
        'NearMiss': NearMiss(),
        'TomekLinks': TomekLinks(),
        'EditedNearestNeighbours': EditedNearestNeighbours(),
        'RepeatedEditedNearestNeighbours': RepeatedEditedNearestNeighbours(),
        'AllKNN': AllKNN(),
        'OneSidedSelection': OneSidedSelection(),
        'CondensedNearestNeighbour': CondensedNearestNeighbour(),
        'NeighbourhoodCleaningRule': NeighbourhoodCleaningRule(),
    }
    self.ensemblesamplers = {
        'EasyEnsemble': EasyEnsemble(),
        'EasyEnsembleClassifier': EasyEnsembleClassifier(),
        'BalancedBaggingClassifier': BalancedBaggingClassifier(),
        'BalanceCascade': BalanceCascade(),
        # Instantiated like every other entry; this slot previously held
        # the class object itself.
        'BalancedRandomForestClassifier': BalancedRandomForestClassifier(),
        'RUSBoostClassifier': RUSBoostClassifier(),
    }
# remove the samples considered noisy. The ``NeighbourhoodCleaningRule`` uses
# an ``EditedNearestNeighbours`` to remove some samples. Additionally, it uses
# a 3 nearest-neighbors rule to remove samples which do not agree with this
# rule.

fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(15, 25))
X, y = create_dataset(n_samples=500, weights=(0.2, 0.3, 0.5), class_sep=0.8)

# One row of axes per sampler: decision function on the left, resampled
# data on the right.
ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6))
cleaning_samplers = (
    CondensedNearestNeighbour(random_state=0),
    OneSidedSelection(random_state=0),
    NeighbourhoodCleaningRule(),
)
for ax, sampler in zip(ax_arr, cleaning_samplers):
    sampler_name = sampler.__class__.__name__

    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)

    plot_decision_function(X, y, clf, ax[0])
    ax[0].set_title(f"Decision function for {sampler_name}")

    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title(f"Resampling using {sampler_name}")
fig.tight_layout()

###############################################################################
# ``InstanceHardnessThreshold`` uses the prediction of a classifier to exclude
# samples. All samples which are classified with a low probability will be
# removed.
# Condensed nearest neighbour: resample and print the per-class counts.
cnn = CondensedNearestNeighbour(random_state=0)
X_resampled, y_resampled = cnn.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

# Clearly, CondensedNearestNeighbour is sensitive to noisy data and tends to
# add noisy samples to the set C.
# Therefore, OneSidedSelection uses TomekLinks to remove the noisy data
# (majority-class samples).
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection(random_state=0)
X_resampled, y_resampled = oss.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

# The bare string below is used as a comment. In English: the
# NeighbourhoodCleaningRule algorithm focuses on cleaning the data rather
# than condensing it; it therefore uses the union of the samples rejected by
# EditedNearestNeighbours and by a 3-NN classifier.
''' NeighbourhoodCleaningRule 算法主要关注如何清洗数据而不是筛选(considering)他们. 因此,该算法将使用 EditedNearestNeighbours和 3-NN分类器结果拒绝的样本之间的并集. '''
from imblearn.under_sampling import NeighbourhoodCleaningRule
# NOTE(review): ``random_state`` is deprecated for NeighbourhoodCleaningRule
# from imbalanced-learn 0.4 - confirm against the installed version.
ncr = NeighbourhoodCleaningRule(random_state=0)
X_resampled, y_resampled = ncr.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

# InstanceHardnessThreshold is a rather special method: it applies a
# classifier to the data and then removes the samples whose probability is
# below a threshold.
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import InstanceHardnessThreshold
iht = InstanceHardnessThreshold(random_state=0, estimator=LogisticRegression())
X_resampled, y_resampled = iht.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
# Output recorded by the original author:
#[(0, 64), (1, 64), (2, 64)]

# The string literal opened below (in English: "combining over-sampling and
# under-sampling") continues beyond this excerpt and is left untouched.
''' 过采样与下采样的结合
def test_ncr_error(ncr_params, err_msg):
    """Expect ``fit_resample`` to raise a ``ValueError`` matching ``err_msg``
    for a sampler built from ``ncr_params``."""
    # Built outside the ``raises`` context, so only errors from
    # ``fit_resample`` can satisfy the expectation.
    cleaner = NeighbourhoodCleaningRule(**ncr_params)
    expected_failure = pytest.raises(ValueError, match=err_msg)
    with expected_failure:
        cleaner.fit_resample(X, Y)
def test_deprecation_random_state():
    """Expect a ``DeprecationWarning`` about ``random_state`` when
    resampling with a sampler constructed with that argument."""
    expected_message = "'random_state' is deprecated from 0.4"
    cleaner = NeighbourhoodCleaningRule(random_state=0)
    with warns(DeprecationWarning, match=expected_message):
        cleaner.fit_resample(X, Y)