def unbalance_helper(X_train, X_test, y_train, y_test, imbalance_method='under_sampling'):
    """Resample the train and test splits to counter class imbalance.

    Args:
        X_train: training features.
        X_test: test features.
        y_train: training labels.
        y_test: test labels.
        imbalance_method (str, optional): 'over_sampling' (SMOTETomek) or
            'under_sampling' (ClusterCentroids). Any other value leaves the
            data untouched. Defaults to 'under_sampling'.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``. Note that this order
        differs from the order of the parameters.
    """
    # Select the imbalance treatment: over-sampling or under-sampling.
    if imbalance_method == 'over_sampling':
        print("Use SMOTETomek deal with unbalance data ")
        # Over-sampling creates new samples by interpolation.
        X_train, y_train = SMOTETomek().fit_resample(X_train, y_train)
        # The test split is resampled from the test data itself. It used to
        # be rebuilt from the training data, which leaked the training set
        # into the evaluation set.
        X_test, y_test = SMOTETomek().fit_resample(X_test, y_test)
    elif imbalance_method == 'under_sampling':
        print("Use ClusterCentroids deal with unbalance data ")
        X_train, y_train = ClusterCentroids(random_state=0).fit_resample(
            X_train, y_train)
        X_test, y_test = ClusterCentroids(random_state=0).fit_resample(
            X_test, y_test)

    return X_train, y_train, X_test, y_test
def test_fit_resample_check_voting():
    """Check the `voting_` attribute set by fitting on dense and sparse input.

    The dense array is expected to give 'soft' and its CSR counterpart 'hard'.
    """
    # Inputs are built lazily so each one is created right before its fit.
    cases = (
        (lambda: X, 'soft'),
        (lambda: sparse.csr_matrix(X), 'hard'),
    )
    for make_input, expected_voting in cases:
        sampler = ClusterCentroids(random_state=RND_SEED)
        sampler.fit_resample(make_input(), Y)
        assert sampler.voting_ == expected_voting
def test_fit_sample_error():
    """Check that an invalid `estimator` or `voting` raises ValueError."""
    shared_kwargs = dict(ratio='auto', random_state=RND_SEED)

    # A string is not an acceptable clustering estimator.
    bad_estimator = ClusterCentroids(estimator='rnd', **shared_kwargs)
    with raises(ValueError, match="has to be a KMeans clustering"):
        bad_estimator.fit_sample(X, Y)

    # An unrecognised voting mode is rejected as well.
    bad_voting = ClusterCentroids(voting='unknown', **shared_kwargs)
    with raises(ValueError, match="needs to be one of"):
        bad_voting.fit_sample(X, Y)
def test_cluster_centroids_error_estimator():
    """Check the errors raised when `estimator` lacks the clustering API."""
    # Case 1: a classifier is passed instead of a clusterer -> ValueError.
    not_a_clusterer_msg = (
        "`estimator` should be a clustering estimator exposing a parameter "
        "`n_clusters` and a fitted parameter `cluster_centers_`.")
    with pytest.raises(ValueError, match=not_a_clusterer_msg):
        sampler = ClusterCentroids(estimator=LogisticRegression())
        sampler.fit_resample(X, Y)

    # Case 2: the custom clusterer is expected to fail with RuntimeError.
    no_centers_msg = (
        "`estimator` should be a clustering estimator exposing a fitted parameter "
        "`cluster_centers_`.")
    with pytest.raises(RuntimeError, match=no_centers_msg):
        sampler = ClusterCentroids(estimator=_CustomClusterer())
        sampler.fit_resample(X, Y)
def test_cluster_centroids_hard_target_class():
    """Check that hard voting selects samples of the targeted class only.

    Non-regression test for:
    https://github.com/scikit-learn-contrib/imbalanced-learn/issues/738
    """
    X, y = make_classification(
        n_samples=1000,
        n_features=2,
        n_informative=1,
        n_redundant=0,
        n_repeated=0,
        n_clusters_per_class=1,
        weights=[0.3, 0.7],
        class_sep=0.01,
        random_state=0,
    )

    sampler = ClusterCentroids(voting="hard", random_state=0)
    X_res, y_res = sampler.fit_resample(X, y)

    # Original samples of class 0 and resampled samples labelled class 1.
    X_minority_class = X[np.flatnonzero(y == 0)]
    X_res_majority = X_res[np.flatnonzero(y_res == 1)]

    # Count every (resampled class-1, original class-0) pair that coincides;
    # no class-0 sample may have been selected to represent class 1.
    n_leaked = 0
    for selected_sample in X_res_majority:
        for minority_sample in X_minority_class:
            if np.all(np.isclose(selected_sample, minority_sample)):
                n_leaked += 1
    assert n_leaked == 0
def under_sample(X, y, sampler="RandomUnderSampler"):
    """Under-sample ``X, y`` with the sampler selected by name.

    Prints the supported sampler names and the class counts before and
    after resampling.

    Args:
        X: feature matrix.
        y: class labels.
        sampler (str): key of the sampler to use; defaults to
            "RandomUnderSampler". An unknown key raises ``KeyError``.

    Returns:
        tuple: ``(X_resampled, y_resampled)``.
    """
    # Names of all supported samplers, handy for iterating over them.
    sampler_names = [
        'RandomUnderSampler',
        'ClusterCentroids',
        'NearMiss',
        'InstanceHardnessThreshold',
        'CondensedNearestNeighbour',
        'EditedNearestNeighbours',
        'RepeatedEditedNearestNeighbours',
        'AllKNN',
        'NeighbourhoodCleaningRule',
        'OneSidedSelection',
    ]
    print(sampler_names)

    # Every sampler is built with its default parameters, in the same order
    # as the names above, then indexed by name.
    sampler_instances = (
        RandomUnderSampler(),
        ClusterCentroids(),
        NearMiss(),
        InstanceHardnessThreshold(),
        CondensedNearestNeighbour(),
        EditedNearestNeighbours(),
        RepeatedEditedNearestNeighbours(),
        AllKNN(),
        NeighbourhoodCleaningRule(),
        OneSidedSelection(),
    )
    registry = dict(zip(sampler_names, sampler_instances))
    chosen = registry[sampler]

    # Report the class distribution before and after resampling.
    print("before", sorted(Counter(y).items()))
    X_resampled, y_resampled = chosen.fit_resample(X, y)
    print("after", sorted(Counter(y_resampled).items()))

    print('============', 'under_sample finished')
    return X_resampled, y_resampled
def perform_balancing(x, y, strategy=None):
    """
    Balance the x, y dataset by under- or over-sampling.

    :param x: feature values; its ``columns`` are reused for the output
    :param y: labels
    :param strategy: one of 'random', 'oversampling', 'cluster_centroids'
        or 'nearmiss'; when None, BALANCE_DATASET_STRATEGY is used
    :return: a balanced x (DataFrame carrying x's column names), y
    :raises ValueError: when the strategy is not one of the names above
    """
    chosen = BALANCE_DATASET_STRATEGY if strategy is None else strategy

    # more info:
    # https://imbalanced-learn.readthedocs.io/en/stable/under_sampling.html
    if chosen == 'random':
        sampler = RandomUnderSampler(random_state=SEED)
    elif chosen == 'oversampling':
        sampler = RandomOverSampler(random_state=SEED)
    elif chosen == 'cluster_centroids':
        sampler = ClusterCentroids(random_state=SEED, n_jobs=CORE_COUNT)
    elif chosen == 'nearmiss':
        sampler = NearMiss(version=1, n_jobs=CORE_COUNT)
    else:
        raise ValueError("algorithm not found")

    balanced_x, balanced_y = sampler.fit_resample(x, y)
    # Re-wrap the features so the original column names are kept.
    balanced_x = pd.DataFrame(balanced_x, columns=x.columns)
    return balanced_x, balanced_y
def test_fit_resample_half():
    """Check output shapes when per-class target counts are given as a dict."""
    target_counts = {0: 3, 1: 6}
    sampler = ClusterCentroids(
        sampling_strategy=target_counts, random_state=RND_SEED)

    X_res, y_res = sampler.fit_resample(X, Y)

    # 3 + 6 requested samples, two features each.
    assert X_res.shape == (9, 2)
    assert y_res.shape == (9,)
def test_fit_resample_auto():
    """Check output shapes with the "auto" sampling strategy."""
    sampler = ClusterCentroids(sampling_strategy="auto", random_state=RND_SEED)

    X_res, y_res = sampler.fit_resample(X, Y)

    assert X_res.shape == (6, 2)
    assert y_res.shape == (6,)
class ResamplingAlgorithms(Enum):
    """Catalogue of resampling algorithms.

    Each member's value is a ``(display name, sampler instance)`` pair.
    """

    # Over-sampling and combined methods
    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())

    # Under-sampling methods
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', InstanceHardnessThreshold(random_state=1))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        """Return the member whose display name equals ``name``.

        Falls back to ``RO`` (random over-sampling) when nothing matches.
        """
        for algorithm in cls:
            display_name = algorithm.value[0]
            if display_name == name:
                return algorithm
        return cls.RO
def test_cluster_centroids_n_jobs():
    """Check that passing `n_jobs` emits exactly one deprecation warning."""
    sampler = ClusterCentroids(n_jobs=1)

    with pytest.warns(FutureWarning) as caught:
        sampler.fit_resample(X, Y)

    assert len(caught) == 1
    warning_text = caught[0].message.args[0]
    assert "'n_jobs' was deprecated" in warning_text
def under_sampling(X, y, method):
    """Under-sample ``X, y`` with the imbalanced-learn sampler named ``method``.

    Args:
        X: feature matrix.
        y: class labels.
        method (str): one of 'ClusterCentroids', 'RandomUnderSampler',
            'NearMiss', 'EditedNearestNeighbours',
            'RepeatedEditedNearestNeighbours', 'AllKNN',
            'NeighbourhoodCleaningRule' or 'OneSidedSelection'.

    Returns:
        tuple: ``(X_resampled, y_resampled)`` produced by the sampler's
        ``fit_resample`` with default parameters.

    Raises:
        ValueError: if ``method`` is not a supported sampler name. An
            unknown name used to fall through every branch and fail with
            an ``UnboundLocalError`` at the return statement.
    """
    supported_methods = (
        'ClusterCentroids',
        'RandomUnderSampler',
        'NearMiss',
        'EditedNearestNeighbours',
        'RepeatedEditedNearestNeighbours',
        'AllKNN',
        'NeighbourhoodCleaningRule',
        'OneSidedSelection',
    )
    # Validate first so a bad name fails fast with a clear message.
    if method not in supported_methods:
        raise ValueError(
            "Unknown under-sampling method {!r}; expected one of {}".format(
                method, ', '.join(supported_methods)))

    sampler_classes = {
        'ClusterCentroids': ClusterCentroids,
        'RandomUnderSampler': RandomUnderSampler,
        'NearMiss': NearMiss,
        'EditedNearestNeighbours': EditedNearestNeighbours,
        'RepeatedEditedNearestNeighbours': RepeatedEditedNearestNeighbours,
        'AllKNN': AllKNN,
        'NeighbourhoodCleaningRule': NeighbourhoodCleaningRule,
        'OneSidedSelection': OneSidedSelection,
    }
    model = sampler_classes[method]()
    X_resampled, y_resampled = model.fit_resample(X, y)
    return X_resampled, y_resampled
def test_fit_sample_wrong_object():
    """Check that a non-estimator `estimator` value raises ValueError."""
    sampler_kwargs = {
        'ratio': 'auto',
        'random_state': RND_SEED,
        'estimator': 'rnd',  # a plain string, not a clustering object
    }
    sampler = ClusterCentroids(**sampler_kwargs)
    assert_raises_regex(ValueError, "has to be a KMeans clustering",
                        sampler.fit_sample, X, Y)
def test_balanced_batch_generator_function_no_return_indices():
    """Check that a ClusterCentroids sampler is rejected with ValueError."""
    with pytest.raises(ValueError, match='needs to return the indices'):
        sampler = ClusterCentroids()
        balanced_batch_generator(
            X, y, sampler=sampler, batch_size=10, random_state=42)
def buildModel(clf, X, y, cv_nums=10, is_random=False):
    """Print the mean cross-validated F1 of ``clf`` under several resamplings.

    Scores are reported for the original data, random over-sampling,
    ClusterCentroids under-sampling and SMOTE. An EasyEnsemble resampling is
    computed last but not scored. Nothing is returned.

    Args:
        clf: estimator passed to ``cross_val_score``.
        X: feature matrix.
        y: labels.
        cv_nums: ``cv`` argument of ``cross_val_score``. Defaults to 10.
        is_random: when True each sampler gets a random seed in [0, 1000);
            when False every seed is 0.
    """
    def mean_f1(features, labels):
        # Mean F1 over the cross-validation folds.
        return np.mean(
            cross_val_score(clf, features, labels, scoring='f1', cv=cv_nums))

    # Whether to randomise the sampler seeds. The `==` comparisons are kept
    # as written: a value equal to neither True nor False binds no seeds.
    if is_random == True:  # noqa: E712
        seeds = list(np.random.randint(0, 1000, 4))
    elif is_random == False:  # noqa: E712
        seeds = [0] * 4

    print(f'----------各种类别不平衡处理方法结果, 为{cv_nums!s}折交叉验证的f1均值----------')

    # Baseline: predict on the original, untouched dataset.
    print('原始数据集: ', mean_f1(X, y))

    # Random over-sampling.
    over_sampler = RandomOverSampler(random_state=seeds[0])
    X_oversampled, y_oversampled = over_sampler.fit_sample(X, y)
    print('过采样: ', mean_f1(X_oversampled, y_oversampled))

    # Under-sampling with cluster centroids.
    under_sampler = ClusterCentroids(random_state=seeds[1])
    X_undersampled, y_undersampled = under_sampler.fit_sample(X, y)
    print('欠采样: ', mean_f1(X_undersampled, y_undersampled))

    # SMOTE over-sampling.
    smote = SMOTE(random_state=seeds[2])
    X_smote, y_smote = smote.fit_sample(X, y)
    print('SMOTE: ', mean_f1(X_smote, y_smote))

    # Split the majority class into several subsets, one per learner. Each
    # learner then sees an under-sampled set while no information is lost
    # overall. For example, with the negative class split into 10 parts and a
    # single positive part, 10 learners are trained, each on one negative
    # part plus the shared positive part.
    easy_ensemble = EasyEnsemble(random_state=seeds[3], n_subsets=10)
    X_ee, y_ee = easy_ensemble.fit_sample(X, y)
def test_continuous_error():
    """Check that fitting on a continuous target emits a UserWarning."""
    # Ten evenly spaced values in [0, 1]: a continuous, non-class target.
    continuous_target = np.linspace(0, 1, 10)
    sampler = ClusterCentroids(random_state=RND_SEED)
    assert_warns(UserWarning, sampler.fit, X, continuous_target)
def test_init():
    """Check that the constructor stores the `ratio` argument unchanged."""
    expected_ratio = 1.
    sampler = ClusterCentroids(ratio=expected_ratio, random_state=RND_SEED)
    assert_equal(sampler.ratio, expected_ratio)
def fix_imbalance(X, y):
    """Balance features ``X`` and labels ``y`` with ClusterCentroids.

    Balancing matters because an over-represented label makes it easy to
    score high by always guessing that label.

    Returns whatever ``ClusterCentroids().fit_resample(X, y)`` returns.
    """
    return ClusterCentroids().fit_resample(X, y)
def Resampling(train_x, train_y, resampling_method):
    """Resample a training set in place with the requested sampler.

    ``train_y.data`` is label-encoded first; then ``train_x.data`` and
    ``train_y.data`` are both replaced by the sampler's output. Nothing is
    returned.

    Args:
        train_x: object whose ``data`` attribute holds the features.
        train_y: object whose ``data`` attribute holds the labels.
        resampling_method (str): name of the sampler. Under-sampling:
            "ClusterCentroids", "CondensedNearestNeighbour",
            "EditedNearestNeighbours", "RepeatedEditedNearestNeighbours",
            "AllKNN", "NearMiss", "NeighbourhoodCleaningRule",
            "RandomUnderSampler", "TomekLinks". Over-sampling:
            "BorderlineSMOTE", "KMeansSMOTE", "RandomOverSampler", "SMOTE".

    Raises:
        ValueError: if ``resampling_method`` is not a supported name. It is
            raised before any input is modified.
    """
    # Zero-argument factories keyed by name: only the requested sampler is
    # ever instantiated, and each name maps to exactly one sampler.
    sampler_factories = {
        # ---- UNDER-SAMPLING ------ #
        "ClusterCentroids": lambda: ClusterCentroids(voting='hard', random_state=42),
        "CondensedNearestNeighbour": lambda: CondensedNearestNeighbour(n_neighbors=7, random_state=42),
        "EditedNearestNeighbours": lambda: EditedNearestNeighbours(n_neighbors=7, kind_sel='mode', n_jobs=-1),
        "RepeatedEditedNearestNeighbours": lambda: RepeatedEditedNearestNeighbours(n_neighbors=7, kind_sel='mode', n_jobs=-1),
        "AllKNN": lambda: AllKNN(n_neighbors=7, kind_sel='mode', allow_minority=True, n_jobs=-1),
        "NearMiss": lambda: NearMiss(n_neighbors=7, n_jobs=-1),
        "NeighbourhoodCleaningRule": lambda: NeighbourhoodCleaningRule(n_neighbors=7, kind_sel='all'),
        "RandomUnderSampler": lambda: RandomUnderSampler(random_state=42),
        "TomekLinks": lambda: TomekLinks(n_jobs=-1),
        # ---- OVER-SAMPLING ------ #
        "BorderlineSMOTE": lambda: BorderlineSMOTE(random_state=42, n_jobs=-1),
        "KMeansSMOTE": lambda: KMeansSMOTE(random_state=42),
        # This entry used to be tested under the name "RandomUnderSampler",
        # which silently replaced the under-sampler with an over-sampler and
        # made "RandomOverSampler" unreachable.
        "RandomOverSampler": lambda: RandomOverSampler(random_state=42),
        "SMOTE": lambda: SMOTE(random_state=42, n_jobs=-1),
    }

    # Fail fast, before the labels are mutated below.
    if resampling_method not in sampler_factories:
        raise ValueError(
            "Unknown resampling method {!r}; expected one of {}".format(
                resampling_method, ', '.join(sampler_factories)))

    train_y.data = LabelEncoder().fit_transform(train_y.data)

    # Summarize distribution: uncomment the next line to display the pie
    # chart of the class distribution before resampling.
    # plotGraphics.piePlot(train_y, "Before Resampling")

    # Transform the dataset.
    resample = sampler_factories[resampling_method]()
    train_x.data, train_y.data = resample.fit_resample(train_x.data, train_y.data)
def test_cc_fit_invalid_ratio():
    """Check that fitting raises RuntimeError when the requested balancing
    ratio is smaller than the one of the data."""
    too_small_ratio = 1. / 10000.
    sampler = ClusterCentroids(ratio=too_small_ratio, random_state=RND_SEED)
    # Fitting is the step expected to fail.
    assert_raises(RuntimeError, sampler.fit, X, Y)
def perform_Under_ClusterCentroids(self):
    """Under-sample ``self.X`` / ``self.y`` with ClusterCentroids (seed 0).

    Returns:
        tuple: ``(X_resampled, y_resampled)``.
    """
    print('Under sampling with ClusterCentroids, preserves imformation')
    sampler = ClusterCentroids(random_state=0)
    resampled_X, resampled_y = sampler.fit_resample(self.X, self.y)
    return resampled_X, resampled_y
def use_parameters(self, X_train, selected_features):
    """Build the default hyper-parameter grid for the model search.

    Args:
        X_train: training features. It must expose ``isna()`` and is used
            here only to detect missing values.
        selected_features: candidate values for the ``feat__cols`` entry.

    Returns:
        list[dict]: a one-element list holding the parameter grid. When
        ``X_train`` contains missing values the grid also carries an
        ``imputer__strategy`` entry.
    """
    test_scaler = [
        StandardScaler(),
        RobustScaler(),
        QuantileTransformer(),
        Normalizer()
    ]

    test_sampling = [
        modelutil.Nosampler(),
        ClusterCentroids(),
        RandomUnderSampler(),
        # NearMiss(version=1),
        # EditedNearestNeighbours(),
        # AllKNN(),
        # CondensedNearestNeighbour(random_state=0),
        # InstanceHardnessThreshold(random_state=0,
        #     estimator=LogisticRegression(solver='lbfgs', multi_class='auto')),
        RandomOverSampler(random_state=0),
        SMOTE(),
        BorderlineSMOTE(),
        SMOTEENN(),
        SMOTETomek(),
        ADASYN()
    ]

    test_C_linear = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]

    parameters = [{
        'scaler': test_scaler,
        'sampling': test_sampling,
        'feat__cols': selected_features,
        'model__C': test_C_linear,  # default C=1
        'model__kernel': ['linear']
    }]

    # If no missing values, only one imputer strategy shall be used
    if X_train.isna().sum().sum() > 0:
        # `parameters` is a list of grids, so the imputer options go into
        # each grid dict. Indexing the list with a string key raised
        # TypeError.
        for grid in parameters:
            grid['imputer__strategy'] = [
                'mean', 'median', 'most_frequent'
            ]
        print("Missing values used. Test different imputer strategies")
    else:
        print("No missing values. No imputer necessary")

    print("Selected Parameters: ", parameters)
    # NOTE(review): only the `else:` in front of the next print is commented
    # out in the original, so the print is treated as unconditional; confirm.
    # else:
    print("Parameters defined in the input: ", parameters)

    return parameters
def test_sample_wrong_X():
    """Check that sampling raises RuntimeError when X differs from the X
    used at fitting time."""
    sampler = ClusterCentroids(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Unrelated data: 100 random samples with 40 features, half per class.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
def unbanlance_helper(self, imbalance_method='under_sampling', search_method='grid'):
    """Resample the data, fit ``self.model``, log its scores and save it.

    Steps: build the features via ``self.feature_engineer()``, resample both
    the train and test splits according to ``imbalance_method``, set fixed
    model parameters, fit, score with ``get_score`` and call ``self.save``.

    Args:
        imbalance_method (str): 'over_sampling' (SMOTE) or 'under_sampling'
            (ClusterCentroids, seed 11). With any other value the data is
            left as is and the model is saved under the name ``None``.
        search_method (str): accepted but not used by this method.
    """
    logger.info('get all feature ... ')
    self.x_train, self.x_test, self.y_train, self.y_test = self.feature_engineer()

    model_name = None
    if imbalance_method == 'over_sampling':
        logger.info('Use SMOTE deal with unbalance data ... ')
        resampled_train = SMOTE().fit_resample(self.x_train, self.y_train)
        self.x_train, self.y_train = resampled_train
        resampled_test = SMOTE().fit_resample(self.x_test, self.y_test)
        self.x_test, self.y_test = resampled_test
        model_name = 'lgb_over_sampling'
    elif imbalance_method == 'under_sampling':
        logger.info('User ClusterCentroids deal with unbalance data ... ')
        resampled_train = ClusterCentroids(random_state=11).fit_resample(
            self.x_train, self.y_train)
        self.x_train, self.y_train = resampled_train
        resampled_test = ClusterCentroids(random_state=11).fit_resample(
            self.x_test, self.y_test)
        self.x_test, self.y_test = resampled_test
        model_name = 'lgb_under_sampling'

    logger.info('search best param ... ')
    # No search is run here: the two parameters below are fixed.
    self.model = self.model.set_params(num_leaves=3, max_depth=5)

    logger.info('fit model ... ')
    self.model.fit(self.x_train, self.y_train)

    test_predict_label = self.model.predict(self.x_test)
    train_predict_label = self.model.predict(self.x_train)
    per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                     train_predict_label, test_predict_label)

    # Report, in order: train accuracy, test accuracy, recall and F1-score.
    score_lines = (
        ('Train accuracy %s', per),
        ('test accuracy %s', acc),
        ('test recall %s', recall),
        ('test F1_score %s', f1),
    )
    for template, score in score_lines:
        logger.info(template % score)

    self.save(model_name)
def test_sample_wt_fit():
    """Check that calling `sample` before fitting raises RuntimeError."""
    # The sampler is created but deliberately never fitted.
    unfitted_sampler = ClusterCentroids(ratio='auto', random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted_sampler.sample, X, Y)
def test_multiclass_fit_sample():
    """Check that a three-class target is resampled to two samples per class."""
    # Relabel two samples to introduce a third class.
    multiclass_y = Y.copy()
    for index in (5, 6):
        multiclass_y[index] = 2

    sampler = ClusterCentroids(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, multiclass_y)

    class_counts = Counter(y_resampled)
    for label in (0, 1, 2):
        assert class_counts[label] == 2
def test_fit_sample_auto():
    """Check the resampled values obtained with the 'auto' ratio."""
    sampler = ClusterCentroids(ratio='auto', random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Expected output: three samples for each of the two classes.
    expected_X = np.array([
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.13347175, 0.12167502],
        [0.06738818, -0.529627],
        [0.17901516, 0.69860992],
        [0.094035, -2.55298982],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def undersample(X, y):
    """Under-sample ``X, y`` with ClusterCentroids (seed 12).

    The outputs are re-wrapped in pandas containers matching the inputs: a
    DataFrame with X's column names when X is a DataFrame, a Series when X
    (respectively y) is a Series. Other input types are returned as produced
    by the sampler.

    Returns:
        tuple: ``(resampled_X, resampled_y)``.
    """
    sampler = ClusterCentroids(random_state=12)
    resampled_X, resampled_y = sampler.fit_resample(X, y)

    x_is_frame = isinstance(X, pd.DataFrame)
    if x_is_frame:
        resampled_X = pd.DataFrame(data=resampled_X, columns=X.columns)
    if not x_is_frame and isinstance(X, pd.Series):
        resampled_X = pd.Series(data=resampled_X)

    if isinstance(y, pd.Series):
        resampled_y = pd.Series(data=resampled_y)

    return resampled_X, resampled_y
def _under_sampling(table, label_col, sampling_strategy='not majority', seed=None, estimator='KMeans', n_clusters=8, voting='auto', n_jobs=1):
    """Under-sample ``table`` with ClusterCentroids and return the new table.

    Object-typed feature columns and an object-typed label are label-encoded
    before resampling and decoded back afterwards.

    Args:
        table: input DataFrame holding the features and the label column.
        label_col: name of the label column.
        sampling_strategy: forwarded to ``ClusterCentroids``.
        seed: random state for the sampler and for the KMeans estimator.
        estimator: 'KMeans' (case-insensitive) to cluster with an explicit
            ``KMeans``; any other value lets ClusterCentroids use its
            default estimator.
        n_clusters: ``n_clusters`` of the explicit KMeans estimator.
        voting: forwarded to ``ClusterCentroids``.
        n_jobs: forwarded to ``ClusterCentroids``.

    Returns:
        dict: ``{'out_table': DataFrame}`` with the resampled features
        joined with the resampled label column.
    """
    # Separate features and label
    features = table.drop([label_col], axis=1)
    y = table[label_col]

    if sklearn_utils.multiclass.type_of_target(y) == 'continuous':
        raise_error('0718', 'label_col')

    # Encode every categorical feature with its OWN encoder. A single shared
    # encoder only remembers the classes of the last column it was fitted
    # on, so decoding the other columns with it returns wrong values.
    categorical_cols = [col for col in features.columns if features[col].dtypes == 'object']
    feature_encoders = {}
    for cate_col in categorical_cols:
        col_encoder = preprocessing.LabelEncoder()
        features[cate_col] = col_encoder.fit_transform(features[cate_col])
        feature_encoders[cate_col] = col_encoder

    # Encode the label column when it has object type, again with a
    # dedicated encoder.
    label_is_object = y.dtypes == 'object'
    if label_is_object:
        label_encoder = preprocessing.LabelEncoder()
        y_encoder = label_encoder.fit_transform(y)
    else:
        y_encoder = y

    # The comparison used to be against 'Kmeans', which never matched the
    # default 'KMeans'; compare case-insensitively. The estimator is seeded
    # so that runs with a fixed `seed` stay reproducible.
    if isinstance(estimator, str) and estimator.lower() == 'kmeans':
        estimator_model = KMeans(n_clusters=n_clusters, random_state=seed)
    else:
        estimator_model = None

    # Process under sampling
    sm = ClusterCentroids(sampling_strategy=sampling_strategy, random_state=seed, estimator=estimator_model, voting=voting, n_jobs=n_jobs)
    X_res, y_res = sm.fit_resample(features, y_encoder)

    # Invert to original data
    if label_is_object:
        y_decoder = label_encoder.inverse_transform(y_res)
    else:
        y_decoder = y_res

    df = pd.DataFrame(data=X_res, columns=features.columns)
    for cate_col in categorical_cols:
        # Resampled values may be floats; cast back to integer codes first.
        codes = df[cate_col].astype('int32')
        df[cate_col] = feature_encoders[cate_col].inverse_transform(codes)
    df1 = pd.DataFrame(data=y_decoder, columns=[label_col])

    # Output result
    out_table = df.join(df1)
    return {'out_table': out_table}
def resample(self, X, y, by, random_state=None, visualize=False):
    '''
    Re-sample X, y with the method selected by `by`.

    by: String
        The method used to perform re-sampling
        currently support: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN',
        'OSS', 'NM', 'CC', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN',
        'SMOTETomek', 'ORG']
        'ORG' returns the data unchanged.
    random_state:
        Seed forwarded to the samplers that accept one. It is not passed
        to 'ENN', 'NCR', 'Tomek', 'ALLKNN' and 'NM'.
    visualize: Boolean
        When True, scatter-plot the first two columns of the result,
        coloured by label.

    Returns the tuple (X_train, y_train).
    '''
    # Zero-argument factories: only the requested sampler is instantiated.
    # `random_state` is given only to samplers whose constructor takes it;
    # passing it to the others makes the constructor fail.
    sampler_factories = {
        'RUS': lambda: RandomUnderSampler(random_state=random_state),
        'CNN': lambda: CondensedNearestNeighbour(random_state=random_state),
        'ENN': lambda: EditedNearestNeighbours(),
        'NCR': lambda: NeighbourhoodCleaningRule(),
        'Tomek': lambda: TomekLinks(),
        'ALLKNN': lambda: AllKNN(),
        'OSS': lambda: OneSidedSelection(random_state=random_state),
        'NM': lambda: NearMiss(),
        'CC': lambda: ClusterCentroids(random_state=random_state),
        'SMOTE': lambda: SMOTE(random_state=random_state),
        'ADASYN': lambda: ADASYN(random_state=random_state),
        'BorderSMOTE': lambda: BorderlineSMOTE(random_state=random_state),
        'SMOTEENN': lambda: SMOTEENN(random_state=random_state),
        'SMOTETomek': lambda: SMOTETomek(random_state=random_state),
    }

    if by == 'ORG':
        # Original data requested: no resampling at all.
        X_train, y_train = X, y
    elif by in sampler_factories:
        sampler = sampler_factories[by]()
        X_train, y_train = sampler.fit_resample(X, y)
    else:
        # NOTE(review): `Error` is not defined in the visible code;
        # presumably a project exception. Confirm it is in scope here.
        raise Error('Unexpected \'by\' type {}'.format(by))

    if visualize:
        df = pd.DataFrame(X_train)
        df['label'] = y_train
        df.plot.scatter(x=0, y=1, c='label', s=3, colormap='coolwarm',
                        title='{} training set'.format(by))

    return X_train, y_train