def test_fit_resample_half():
    """Check EasyEnsemble output for a dict sampling strategy.

    The strategy requests 2, 3 and 3 samples for classes 0, 1 and 2, and
    three subsets are drawn; the resampled arrays are compared exactly
    against stored expectations.
    """
    sampler = EasyEnsemble(
        sampling_strategy={0: 2, 1: 3, 2: 3},
        random_state=RND_SEED,
        n_subsets=3,
    )
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    # One (8, 2) block of expected samples per subset.
    expected_X = np.array([
        [
            [-0.58539673, 0.62515052],
            [0.85117925, 1.0185556],
            [1.35269503, 0.44812421],
            [-1.23195149, 0.15427291],
            [0.5220963, 0.11349303],
            [1.10915364, 0.05718352],
            [0.59091459, 0.40692742],
            [0.22039505, 0.26469445],
        ],
        [
            [0.85117925, 1.0185556],
            [-0.58539673, 0.62515052],
            [1.35269503, 0.44812421],
            [-2.10724436, 0.70263997],
            [-1.23195149, 0.15427291],
            [0.59091459, 0.40692742],
            [0.22039505, 0.26469445],
            [1.10915364, 0.05718352],
        ],
        [
            [0.85117925, 1.0185556],
            [-0.58539673, 0.62515052],
            [-1.23195149, 0.15427291],
            [0.5220963, 0.11349303],
            [1.35269503, 0.44812421],
            [1.10915364, 0.05718352],
            [0.59091459, 0.40692742],
            [0.22039505, 0.26469445],
        ],
    ])
    # Every subset carries the same label layout.
    expected_y = np.array([[0, 0, 1, 1, 1, 2, 2, 2]] * 3)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_fit_resample_half():
    """Verify the subsets produced for an explicit per-class sample count.

    Classes 0, 1 and 2 are asked for 2, 3 and 3 samples respectively and
    three subsets are generated; both returned arrays must match the stored
    reference values exactly.
    """
    n_subsets = 3
    strategy = {0: 2, 1: 3, 2: 3}
    ensemble = EasyEnsemble(
        sampling_strategy=strategy, random_state=RND_SEED, n_subsets=n_subsets
    )
    X_resampled, y_resampled = ensemble.fit_resample(X, Y)

    # Reference samples, one list per generated subset.
    subset_0 = [
        [-0.58539673, 0.62515052],
        [0.85117925, 1.0185556],
        [1.35269503, 0.44812421],
        [-1.23195149, 0.15427291],
        [0.5220963, 0.11349303],
        [1.10915364, 0.05718352],
        [0.59091459, 0.40692742],
        [0.22039505, 0.26469445],
    ]
    subset_1 = [
        [0.85117925, 1.0185556],
        [-0.58539673, 0.62515052],
        [1.35269503, 0.44812421],
        [-2.10724436, 0.70263997],
        [-1.23195149, 0.15427291],
        [0.59091459, 0.40692742],
        [0.22039505, 0.26469445],
        [1.10915364, 0.05718352],
    ]
    subset_2 = [
        [0.85117925, 1.0185556],
        [-0.58539673, 0.62515052],
        [-1.23195149, 0.15427291],
        [0.5220963, 0.11349303],
        [1.35269503, 0.44812421],
        [1.10915364, 0.05718352],
        [0.59091459, 0.40692742],
        [0.22039505, 0.26469445],
    ]
    X_gt = np.array([subset_0, subset_1, subset_2])
    # The label row is identical for each subset.
    y_gt = np.array([[0, 0, 1, 1, 1, 2, 2, 2] for _ in range(n_subsets)])

    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_random_state_none():
    """Smoke test: resampling must run when ``random_state`` is ``None``."""
    sampler = EasyEnsemble(sampling_strategy='auto', random_state=None)

    # No expected values are asserted; the test only requires that the call
    # completes and yields an (X, y) pair.
    X_resampled, y_resampled = sampler.fit_resample(X, Y)
def test_random_state_none():
    """Run EasyEnsemble without a fixed seed and make sure it does not fail."""
    strategy = 'auto'
    ensemble = EasyEnsemble(random_state=None, sampling_strategy=strategy)

    # The unpacking is the only check: fit_resample has to return two values.
    X_resampled, y_resampled = ensemble.fit_resample(X, Y)
def ensemble_sample(self, method="BalanceCascade", sampling_strategy="majority", random_state=42, replacement=True):
    """
    Under-sample ``self._df`` with an ensemble sampler.

    Every column of ``self._df`` except ``"id"`` and the target column is
    used as a feature. Ensemble samplers can return one resampled set per
    subset (X with shape ``(n_subsets, n_samples, n_features)`` and y with
    shape ``(n_subsets, n_samples)``); in that case the subsets are
    concatenated row-wise so the result is a single flat table.

    :param method: str, option: 'EasyEnsemble', 'BalanceCascade'
    :param sampling_strategy: sampling strategy, str or dict, e.g.
        'majority', 'not minority', 'not majority', 'all', 'auto'
    :param random_state: int
    :param replacement: bool, only forwarded to EasyEnsemble
    :return: new DataFrame holding the feature columns followed by the
        target column; ``self._df`` itself when ``method`` is not supported
    """
    feature_name = self._df.columns.difference(["id", self._target]).tolist()
    X = self._df[feature_name].values
    y = self._df[self._target].values
    print("Original label shape {}".format(Counter(y)))

    if method == "EasyEnsemble":
        enS = EasyEnsemble(sampling_strategy=sampling_strategy,
                           random_state=random_state,
                           replacement=replacement)
    elif method == "BalanceCascade":
        enS = BalanceCascade(sampling_strategy=sampling_strategy,
                             random_state=random_state)
    else:
        # Best effort: report the unsupported method and hand back the
        # untouched data instead of raising.
        print("不支持{}该抽样方法".format(method))
        return self._df

    X_res, y_res = enS.fit_resample(X, y)
    X_res = np.asarray(X_res)
    y_res = np.asarray(y_res)
    if X_res.ndim != 2:
        # One block per subset: merge them into a single (rows, features)
        # matrix and a flat label vector. Without this, Counter() fails on
        # the unhashable label rows and the reshape below cannot succeed.
        X_res = np.concatenate([np.asarray(subset) for subset in X_res], axis=0)
        y_res = np.concatenate([np.asarray(subset) for subset in y_res], axis=0)
    print("enSample label shape {}".format(Counter(y_res)))

    # Append the labels as the last column so they line up with the target name.
    _data = np.concatenate([X_res, y_res.reshape(len(X_res), 1)], axis=1)
    df_new = pd.DataFrame(data=_data, columns=feature_name + [self._target])
    return df_new
n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=100, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply Easy Ensemble ee = EasyEnsemble(n_subsets=3) X_resampled, y_resampled = ee.fit_resample(X, y) X_res_vis = [] for X_res in X_resampled: X_res_vis.append(pca.transform(X_res)) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5) ax1.set_title('Original set') ax2.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5) for iy, e in enumerate(X_res_vis): ax2.scatter(e[y_resampled[iy] == 1, 0], e[y_resampled[iy] == 1, 1],
print(__doc__)

# Build a two-class, imbalanced toy data set (30% / 70%).
X, y = make_classification(
    n_classes=2,
    class_sep=2,
    weights=[0.3, 0.7],
    n_informative=3,
    n_redundant=1,
    flip_y=0,
    n_features=20,
    n_clusters_per_class=1,
    n_samples=100,
    random_state=10,
)

# Project onto two principal components purely so the data can be drawn in 2D.
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Apply Easy Ensemble, then project each returned subset with the fitted PCA.
ee = EasyEnsemble(n_subsets=3)
X_resampled, y_resampled = ee.fit_resample(X, y)
X_res_vis = [pca.transform(X_res) for X_res in X_resampled]

# Left axis: original data; right axis: class 0 plus the resampled class 1 sets.
f, (ax1, ax2) = plt.subplots(1, 2)

is_class_0 = y == 0
is_class_1 = y == 1

ax1.scatter(X_vis[is_class_0, 0], X_vis[is_class_0, 1],
            label="Class #0", alpha=0.5)
ax1.scatter(X_vis[is_class_1, 0], X_vis[is_class_1, 1],
            label="Class #1", alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_vis[is_class_0, 0], X_vis[is_class_0, 1],
            label="Class #0", alpha=0.5)
for iy, (e, subset_labels) in enumerate(zip(X_res_vis, y_resampled)):
    # Draw only the class 1 points of this subset.
    picked = subset_labels == 1
    ax2.scatter(e[picked, 0], e[picked, 1],
                label="Class #1 - set #{}".format(iy), alpha=0.5)