def test_cluster_centroids_n_jobs():
    # check that we deprecate the `n_jobs` parameter.
    cc = ClusterCentroids(n_jobs=1)
    with pytest.warns(FutureWarning) as record:
        cc.fit_resample(X, Y)
    assert len(record) == 1
    assert "'n_jobs' was deprecated" in record[0].message.args[0]
def test_fit_resample_check_voting():
    cc = ClusterCentroids(random_state=RND_SEED)
    cc.fit_resample(X, Y)
    assert cc.voting_ == 'soft'
    cc = ClusterCentroids(random_state=RND_SEED)
    cc.fit_resample(sparse.csr_matrix(X), Y)
    assert cc.voting_ == 'hard'
def under_sampling(X, y, method):
    if method == 'ClusterCentroids':
        model = ClusterCentroids()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'RandomUnderSampler':
        model = RandomUnderSampler()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'NearMiss':
        model = NearMiss()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'EditedNearestNeighbours':
        model = EditedNearestNeighbours()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'RepeatedEditedNearestNeighbours':
        model = RepeatedEditedNearestNeighbours()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'AllKNN':
        model = AllKNN()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'NeighbourhoodCleaningRule':
        model = NeighbourhoodCleaningRule()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'OneSidedSelection':
        model = OneSidedSelection()
        X_resampled, y_resampled = model.fit_resample(X, y)
    return X_resampled, y_resampled
def test_fit_resample_error():
    sampling_strategy = 'auto'
    cluster = 'rnd'
    cc = ClusterCentroids(sampling_strategy=sampling_strategy,
                          random_state=RND_SEED,
                          estimator=cluster)
    with raises(ValueError, match="has to be a KMeans clustering"):
        cc.fit_resample(X, Y)

    voting = 'unknown'
    cc = ClusterCentroids(sampling_strategy=sampling_strategy,
                          voting=voting,
                          random_state=RND_SEED)
    with raises(ValueError, match="needs to be one of"):
        cc.fit_resample(X, Y)
def test_fit_resample_half():
    sampling_strategy = {0: 3, 1: 6}
    cc = ClusterCentroids(sampling_strategy=sampling_strategy,
                          random_state=RND_SEED)
    X_resampled, y_resampled = cc.fit_resample(X, Y)
    assert X_resampled.shape == (9, 2)
    assert y_resampled.shape == (9, )
def test_fit_resample_auto():
    sampling_strategy = "auto"
    cc = ClusterCentroids(sampling_strategy=sampling_strategy,
                          random_state=RND_SEED)
    X_resampled, y_resampled = cc.fit_resample(X, Y)
    assert X_resampled.shape == (6, 2)
    assert y_resampled.shape == (6, )
def test_cluster_centroids_hard_target_class():
    # check that the samples selected by hard voting correspond to the
    # targeted class
    # non-regression test for:
    # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/738
    X, y = make_classification(
        n_samples=1000,
        n_features=2,
        n_informative=1,
        n_redundant=0,
        n_repeated=0,
        n_clusters_per_class=1,
        weights=[0.3, 0.7],
        class_sep=0.01,
        random_state=0,
    )
    cc = ClusterCentroids(voting="hard", random_state=0)
    X_res, y_res = cc.fit_resample(X, y)
    minority_class_indices = np.flatnonzero(y == 0)
    X_minority_class = X[minority_class_indices]
    resampled_majority_class_indices = np.flatnonzero(y_res == 1)
    X_res_majority = X_res[resampled_majority_class_indices]
    sample_from_minority_in_majority = [
        np.all(np.isclose(selected_sample, minority_sample))
        for selected_sample in X_res_majority
        for minority_sample in X_minority_class
    ]
    assert sum(sample_from_minority_in_majority) == 0
def fix_imbalance(X, y):
    """Fix imbalanced data in features X with labels y.

    This is an important step because an over-representation of a label
    means that it is easy to score high by always guessing that label.
    """
    cluster_centroids = ClusterCentroids()
    return cluster_centroids.fit_resample(X, y)
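# A minimal usage sketch for fix_imbalance above (illustrative, not from the
# original source): the synthetic dataset, its 9:1 class weights, and the
# names X_demo/y_demo are assumptions used only for demonstration.
from collections import Counter

from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(
    n_samples=1000, n_features=4, weights=[0.9, 0.1], random_state=0)
print(Counter(y_demo))   # imbalanced, roughly 9:1

X_bal, y_bal = fix_imbalance(X_demo, y_demo)
print(Counter(y_bal))    # both classes reduced to the minority class count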
def Resampling(train_x, train_y, resampling_method):
    train_y.data = LabelEncoder().fit_transform(train_y.data)

    # summarize distribution
    # uncomment the line below to display a pie chart of the class
    # distribution before resampling
    # plotGraphics.piePlot(train_y, "Before Resampling")

    # ---- UNDER-SAMPLING ------ #
    if resampling_method == "ClusterCentroids":
        resample = ClusterCentroids(voting='hard', random_state=42)
    if resampling_method == "CondensedNearestNeighbour":
        resample = CondensedNearestNeighbour(n_neighbors=7, random_state=42)
    if resampling_method == "EditedNearestNeighbours":
        resample = EditedNearestNeighbours(n_neighbors=7, kind_sel='mode', n_jobs=-1)
    if resampling_method == "RepeatedEditedNearestNeighbours":
        resample = RepeatedEditedNearestNeighbours(n_neighbors=7, kind_sel='mode', n_jobs=-1)
    if resampling_method == "AllKNN":
        resample = AllKNN(n_neighbors=7, kind_sel='mode', allow_minority=True, n_jobs=-1)
    if resampling_method == "NearMiss":
        resample = NearMiss(n_neighbors=7, n_jobs=-1)
    if resampling_method == "NeighbourhoodCleaningRule":
        resample = NeighbourhoodCleaningRule(n_neighbors=7, kind_sel='all')
    if resampling_method == "RandomUnderSampler":
        resample = RandomUnderSampler(random_state=42)
    if resampling_method == "TomekLinks":
        resample = TomekLinks(n_jobs=-1)

    # ---- OVER-SAMPLING ------ #
    if resampling_method == "BorderlineSMOTE":
        resample = BorderlineSMOTE(random_state=42, n_jobs=-1)
    if resampling_method == "KMeansSMOTE":
        resample = KMeansSMOTE(random_state=42)
    if resampling_method == "RandomOverSampler":
        resample = RandomOverSampler(random_state=42)
    if resampling_method == "SMOTE":
        resample = SMOTE(random_state=42, n_jobs=-1)

    # transform the dataset
    train_x.data, train_y.data = resample.fit_resample(train_x.data, train_y.data)
def perform_Under_ClusterCentroids(self):
    print('Under sampling with ClusterCentroids, preserves information')
    cc = ClusterCentroids(random_state=0)
    X_resampled, y_resampled = cc.fit_resample(self.X, self.y)
    return X_resampled, y_resampled
def test_multiclass_fit_resample():
    y = Y.copy()
    y[5] = 2
    y[6] = 2
    cc = ClusterCentroids(random_state=RND_SEED)
    X_resampled, y_resampled = cc.fit_resample(X, y)
    count_y_res = Counter(y_resampled)
    assert count_y_res[0] == 2
    assert count_y_res[1] == 2
    assert count_y_res[2] == 2
def undersample(X, y):
    cc = ClusterCentroids(random_state=12)
    rX, rY = cc.fit_resample(X, y)
    if isinstance(X, pd.DataFrame):
        rX = pd.DataFrame(data=rX, columns=X.columns)
    elif isinstance(X, pd.Series):
        rX = pd.Series(data=rX)
    if isinstance(y, pd.Series):
        rY = pd.Series(data=rY)
    return rX, rY
def test_multiclass_fit_resample():
    y = Y.copy()
    y[5] = 2
    y[6] = 2
    cc = ClusterCentroids(random_state=RND_SEED)
    _, y_resampled = cc.fit_resample(X, y)
    count_y_res = Counter(y_resampled)
    assert count_y_res[0] == 2
    assert count_y_res[1] == 2
    assert count_y_res[2] == 2
def _under_sampling(table, label_col, sampling_strategy='not majority', seed=None,
                    estimator='KMeans', n_clusters=8, voting='auto', n_jobs=1):
    # Separate features and label
    features = table.drop([label_col], axis=1)
    y = table[label_col]

    if sklearn_utils.multiclass.type_of_target(y) == 'continuous':
        raise_error('0718', 'label_col')

    # Initialize the label encoder
    lab_encoder = preprocessing.LabelEncoder()

    # Filter out categorical columns in features
    categorical_cols = [col for col in features.columns
                        if features[col].dtypes == 'object']

    # Transform categorical columns and add to the original features
    for cate_col in categorical_cols:
        features_encoder = lab_encoder.fit_transform(features[cate_col])
        features[cate_col] = features_encoder

    # Transform label column with object type
    if y.dtypes == 'object':
        y_encoder = lab_encoder.fit_transform(y)
    else:
        y_encoder = y

    if estimator == 'KMeans':
        estimator_model = KMeans(n_clusters=n_clusters)
    else:
        estimator_model = None

    # Process under-sampling
    sm = ClusterCentroids(sampling_strategy=sampling_strategy, random_state=seed,
                          estimator=estimator_model, voting=voting, n_jobs=n_jobs)
    X_res, y_res = sm.fit_resample(features, y_encoder)

    # Invert to original data
    if y.dtypes == 'object':
        y_decoder = lab_encoder.inverse_transform(y_res)
    else:
        y_decoder = y_res

    df = pd.DataFrame(data=X_res, columns=features.columns)
    for cate_col in categorical_cols:
        df[cate_col] = lab_encoder.inverse_transform(df[cate_col].astype('int32'))
    df1 = pd.DataFrame(data=y_decoder, columns=[label_col])

    # Output result
    out_table = df.join(df1)
    return {'out_table': out_table}
def test_fit_resample_auto():
    sampling_strategy = 'auto'
    cc = ClusterCentroids(sampling_strategy=sampling_strategy,
                          random_state=RND_SEED)
    X_resampled, y_resampled = cc.fit_resample(X, Y)
    X_gt = np.array([[0.92923648, 0.76103773],
                     [0.47104475, 0.44386323],
                     [0.13347175, 0.12167502],
                     [0.06738818, -0.529627],
                     [0.17901516, 0.69860992],
                     [0.094035, -2.55298982]])
    y_gt = np.array([0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def test_fit_resample_object():
    sampling_strategy = "auto"
    cluster = KMeans(random_state=RND_SEED)
    cc = ClusterCentroids(
        sampling_strategy=sampling_strategy,
        random_state=RND_SEED,
        estimator=cluster,
    )
    X_resampled, y_resampled = cc.fit_resample(X, Y)
    assert X_resampled.shape == (6, 2)
    assert y_resampled.shape == (6, )
def cluster_centroids(X, y, visualize=False, pca2d=True, pca3d=True,
                      tsne=True, pie_evr=True):
    cc = ClusterCentroids(random_state=42)
    X_res, y_res = cc.fit_resample(X, y)
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
def test_fit_resample_half():
    sampling_strategy = {0: 3, 1: 6}
    cc = ClusterCentroids(sampling_strategy=sampling_strategy,
                          random_state=RND_SEED)
    X_resampled, y_resampled = cc.fit_resample(X, Y)
    X_gt = np.array([[0.92923648, 0.76103773],
                     [0.13347175, 0.12167502],
                     [0.47104475, 0.44386323],
                     [0.09125309, -0.85409574],
                     [0.19220316, 0.32337101],
                     [0.094035, -2.55298982],
                     [0.20792588, 1.49407907],
                     [0.04352327, -0.20515826],
                     [0.12372842, 0.6536186]])
    y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])
    print(X_resampled)
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def clusterCentroidsUnderSample(x, y, label='class'):
    print('Balancing with ClusterCentroids')
    print('Current x state: ', x.shape)
    x_columns = x.columns.values
    sampler = ClusterCentroids(random_state=0)
    x, y = sampler.fit_resample(x, y)
    print('Resampled dataset shape %s' % Counter(y))
    x_bal = pd.DataFrame(x, columns=x_columns)
    y_bal = pd.DataFrame(y, columns=[label])
    return x_bal, y_bal
def resample(X, Y, resampling):
    X_resampled, y_resampled = X, Y
    if resampling == 'oversampling':
        from imblearn.over_sampling import RandomOverSampler
        ros = RandomOverSampler(random_state=0)
        X_resampled, y_resampled = ros.fit_resample(X, Y)
    if resampling == 'undersampling':
        from imblearn.under_sampling import ClusterCentroids
        cc = ClusterCentroids(random_state=0)
        X_resampled, y_resampled = cc.fit_resample(X, Y)
    if resampling == 'smote':
        from imblearn.over_sampling import BorderlineSMOTE
        # from imblearn.over_sampling import SMOTE
        X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, Y)
    return X_resampled.fillna(0), y_resampled.fillna(0)
def test_fit_hard_voting():
    sampling_strategy = "auto"
    voting = "hard"
    cluster = KMeans(random_state=RND_SEED)
    cc = ClusterCentroids(
        sampling_strategy=sampling_strategy,
        random_state=RND_SEED,
        estimator=cluster,
        voting=voting,
    )
    X_resampled, y_resampled = cc.fit_resample(X, Y)
    assert X_resampled.shape == (6, 2)
    assert y_resampled.shape == (6, )
    for x in X_resampled:
        assert np.any(np.all(x == X, axis=1))
def CENTROID_us(X_train, Y_train, seed, sampling_strategy):
    if not isinstance(sampling_strategy, str):
        sampling_strategy = compute_sampling_strategy(sampling_strategy,
                                                      Y_train, 'undersampling')
    cc = ClusterCentroids(random_state=seed, n_jobs=-1,
                          sampling_strategy=sampling_strategy)
    print('Before Cluster Centroid undersampling : ', sorted(Counter(Y_train).items()))
    X_train_resampled, Y_train_resampled = cc.fit_resample(X_train, Y_train)
    print('After Cluster Centroid undersampling : ', sorted(Counter(Y_train_resampled).items()))
    X_train_resampled, Y_train_resampled = shuffle_dataset(
        X_train_resampled, Y_train_resampled, seed)
    return X_train_resampled, Y_train_resampled
def main():
    X, y = make_classification(n_samples=5000, n_features=2, n_informative=2,
                               n_redundant=0, n_repeated=0, n_classes=3,
                               n_clusters_per_class=1,
                               weights=[0.01, 0.04, 0.95],
                               class_sep=0.8, random_state=42)
    print(sorted(Counter(y).items()))
    cc = ClusterCentroids(random_state=42)
    X_resampled, y_resampled = cc.fit_resample(X, y)
    print(sorted(Counter(y_resampled).items()))
    print('DONE')
def test_fit_hard_voting():
    sampling_strategy = 'auto'
    voting = 'hard'
    cluster = KMeans(random_state=RND_SEED)
    cc = ClusterCentroids(sampling_strategy=sampling_strategy,
                          random_state=RND_SEED,
                          estimator=cluster,
                          voting=voting)
    X_resampled, y_resampled = cc.fit_resample(X, Y)
    X_gt = np.array([[0.92923648, 0.76103773],
                     [0.47104475, 0.44386323],
                     [0.13347175, 0.12167502],
                     [0.09125309, -0.85409574],
                     [0.12372842, 0.6536186],
                     [0.094035, -2.55298982]])
    y_gt = np.array([0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
    for x in X_resampled:
        assert np.any(np.all(x == X, axis=1))
def sampling(X_train, y_train, smpl):
    if smpl == 'ROS':
        ros = RandomOverSampler(random_state=0)
        X_train, y_train = ros.fit_resample(X_train, y_train)
    elif smpl == 'SMOTE':
        X_train, y_train = SMOTE().fit_resample(X_train, y_train)
    elif smpl == 'ADASYN':
        X_train, y_train = ADASYN().fit_resample(X_train, y_train)
    elif smpl == 'CC':
        cc = ClusterCentroids(random_state=0)
        X_train, y_train = cc.fit_resample(X_train, y_train)
    elif smpl == 'RUS':
        rus = RandomUnderSampler(random_state=0)
        X_train, y_train = rus.fit_resample(X_train, y_train)
    return X_train, y_train
def cluster(df, drop, target):
    # split the table into features and outcomes
    x_cols = [i for i in df.columns if i not in drop]
    X = df[x_cols]
    y = df[target]
    # split features and outcomes into train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    cc = ClusterCentroids(random_state=1)
    X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
    model = LogisticRegression(solver='lbfgs', random_state=1)
    model.fit(X_resampled, y_resampled)
    y_predictions = model.predict(X_test)
    # Calculate the balanced accuracy score.
    acc_score = balanced_accuracy_score(y_test, y_predictions)
    return acc_score * 100
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i, v))

# plot feature importance
plt.bar([x for x in range(len(importance))], importance)
plt.show()

# LOGISTIC REGRESSION WITH ClusterCentroids
from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(random_state=1)
X_train_cc, Y_train_cc = cc.fit_resample(X_train, Y_train)
Counter(Y_train_cc)

# Train the Logistic Regression model using the resampled data
cluster_model = LogisticRegression(solver='saga', random_state=1, max_iter=1000)
cluster_model.fit(X_train_cc, Y_train_cc)

Y_pred_LR_cc = cluster_model.predict(X_test)
print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=50, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Cluster Centroids
cc = ClusterCentroids()
X_resampled, y_resampled = cc.fit_resample(X, y)
X_res_vis_soft = pca.transform(X_resampled)

# Use hard voting instead of soft voting
cc = ClusterCentroids(voting='hard')
X_resampled, y_resampled = cc.fit_resample(X, y)
X_res_vis_hard = pca.transform(X_resampled)

# Three subplots, unpack the axes array immediately
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                 alpha=0.5)
c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                 alpha=0.5)
ax1.set_title('Original set')
def test_fit_resample_check_voting(X, expected_voting):
    cc = ClusterCentroids(random_state=RND_SEED)
    cc.fit_resample(X, Y)
    assert cc.voting_ == expected_voting
def test_fit_resample_error(cluster_centroids_params, err_msg):
    cc = ClusterCentroids(**cluster_centroids_params)
    with pytest.raises(ValueError, match=err_msg):
        cc.fit_resample(X, Y)
def undersample(self, X, y):
    cc = ClusterCentroids(random_state=12)
    return cc.fit_resample(X, y)