예제 #1
0
def test_fit_resample_half():
    """Check EasyEnsemble output for a per-class sample-count strategy.

    Three subsets are drawn with a dict ``sampling_strategy`` and a fixed
    seed; the resampled features and labels must equal the hard-coded
    ground truth below exactly.
    """
    # Dict strategy keyed by class label; the expected labels below contain
    # 2, 3 and 3 samples of classes 0, 1 and 2 respectively
    per_class_counts = {0: 2, 1: 3, 2: 3}

    sampler = EasyEnsemble(
        sampling_strategy=per_class_counts,
        random_state=RND_SEED,
        n_subsets=3,
    )
    X_out, y_out = sampler.fit_resample(X, Y)

    # Expected feature rows, one list per generated subset
    subset_0 = [
        [-0.58539673, 0.62515052],
        [0.85117925, 1.0185556],
        [1.35269503, 0.44812421],
        [-1.23195149, 0.15427291],
        [0.5220963, 0.11349303],
        [1.10915364, 0.05718352],
        [0.59091459, 0.40692742],
        [0.22039505, 0.26469445],
    ]
    subset_1 = [
        [0.85117925, 1.0185556],
        [-0.58539673, 0.62515052],
        [1.35269503, 0.44812421],
        [-2.10724436, 0.70263997],
        [-1.23195149, 0.15427291],
        [0.59091459, 0.40692742],
        [0.22039505, 0.26469445],
        [1.10915364, 0.05718352],
    ]
    subset_2 = [
        [0.85117925, 1.0185556],
        [-0.58539673, 0.62515052],
        [-1.23195149, 0.15427291],
        [0.5220963, 0.11349303],
        [1.35269503, 0.44812421],
        [1.10915364, 0.05718352],
        [0.59091459, 0.40692742],
        [0.22039505, 0.26469445],
    ]
    X_expected = np.array([subset_0, subset_1, subset_2])

    # All three subsets share the same label layout
    y_expected = np.array([[0, 0, 1, 1, 1, 2, 2, 2]] * 3)

    assert_array_equal(X_out, X_expected)
    assert_array_equal(y_out, y_expected)
예제 #2
0
def test_fit_resample_half():
    """Verify the exact subsets EasyEnsemble returns for a dict strategy.

    The sampler is seeded with ``RND_SEED`` and asked for three subsets;
    both the feature stack and the label stack are compared element-wise
    against fixed reference arrays.
    """
    sampler_kwargs = {
        # Dict strategy keyed by class label
        'sampling_strategy': {0: 2, 1: 3, 2: 3},
        'random_state': RND_SEED,
        'n_subsets': 3,
    }
    easy_ensemble = EasyEnsemble(**sampler_kwargs)

    resampled_X, resampled_y = easy_ensemble.fit_resample(X, Y)

    # Reference features: 3 subsets x 8 samples x 2 features
    reference_X = np.array([
        [
            [-0.58539673, 0.62515052],
            [0.85117925, 1.0185556],
            [1.35269503, 0.44812421],
            [-1.23195149, 0.15427291],
            [0.5220963, 0.11349303],
            [1.10915364, 0.05718352],
            [0.59091459, 0.40692742],
            [0.22039505, 0.26469445],
        ],
        [
            [0.85117925, 1.0185556],
            [-0.58539673, 0.62515052],
            [1.35269503, 0.44812421],
            [-2.10724436, 0.70263997],
            [-1.23195149, 0.15427291],
            [0.59091459, 0.40692742],
            [0.22039505, 0.26469445],
            [1.10915364, 0.05718352],
        ],
        [
            [0.85117925, 1.0185556],
            [-0.58539673, 0.62515052],
            [-1.23195149, 0.15427291],
            [0.5220963, 0.11349303],
            [1.35269503, 0.44812421],
            [1.10915364, 0.05718352],
            [0.59091459, 0.40692742],
            [0.22039505, 0.26469445],
        ],
    ])
    # Reference labels: the same row of eight labels repeated per subset
    reference_y = np.tile([0, 0, 1, 1, 1, 2, 2, 2], (3, 1))

    assert_array_equal(resampled_X, reference_X)
    assert_array_equal(resampled_y, reference_y)
예제 #3
0
def test_random_state_none():
    """Smoke-test EasyEnsemble with ``random_state=None``.

    Nothing is asserted about the resampled values; the test passes as
    long as construction and ``fit_resample`` complete and the result
    unpacks into two objects.
    """
    strategy = 'auto'
    sampler = EasyEnsemble(random_state=None, sampling_strategy=strategy)

    # Unpacking doubles as a check that exactly two objects are returned
    X_out, y_out = sampler.fit_resample(X, Y)
예제 #4
0
def test_random_state_none():
    """Run EasyEnsemble without a fixed seed and make sure it does not fail.

    The resampled output is not compared against anything: this is only a
    check that ``random_state=None`` is accepted and that ``fit_resample``
    yields a pair of results.
    """
    easy_ensemble = EasyEnsemble(
        sampling_strategy='auto',
        random_state=None,
    )

    # The two-name unpacking fails if anything other than a pair comes back
    resampled_X, resampled_y = easy_ensemble.fit_resample(X, Y)
예제 #5
0
    def ensemble_sample(self,
                        method="BalanceCascade",
                        sampling_strategy="majority",
                        random_state=42,
                        replacement=True):
        """
        Under-sample ``self._df`` with an ensemble sampler.

        Every column except ``"id"`` and the target column
        (``self._target``) is used as a feature, in the order returned by
        ``Index.difference``. The label distribution is printed before and
        after resampling.

        The ensemble samplers return a stack of subsets (features with
        shape ``(n_subsets, n_samples, n_features)`` and labels with shape
        ``(n_subsets, n_samples)``). Such a stack is flattened so that all
        subsets end up as rows of one table; samples that occur in several
        subsets therefore occur several times in the result.

        :param method: str, option:'EasyEnsemble','BalanceCascade'
        :param sampling_strategy: sampling strategy forwarded to the
            sampler, str or dict, e.g. 'majority','not minority',
            'not majority','all','auto'
        :param random_state: int, seed forwarded to the sampler
        :param replacement: bool, forwarded to EasyEnsemble only; it is
            not passed to BalanceCascade
        :return: a new DataFrame holding the feature columns followed by
            the target column; if ``method`` is not supported a message is
            printed and ``self._df`` itself is returned unchanged
        """
        feature_name = self._df.columns.difference(["id",
                                                    self._target]).tolist()
        X = self._df[feature_name].values
        y = self._df[self._target].values

        print("Original label shape {}".format(Counter(y)))

        if method == "EasyEnsemble":
            enS = EasyEnsemble(sampling_strategy=sampling_strategy,
                               random_state=random_state,
                               replacement=replacement)
        elif method == "BalanceCascade":
            enS = BalanceCascade(sampling_strategy=sampling_strategy,
                                 random_state=random_state)
        else:
            # Best-effort fallback: report the problem and hand back the
            # untouched frame instead of raising.
            print("不支持{}该抽样方法".format(method))
            return self._df

        X_res, y_res = enS.fit_resample(X, y)
        X_res = np.asarray(X_res)
        y_res = np.asarray(y_res)

        # A stack of subsets cannot be counted (its rows are unhashable
        # arrays) or placed in a 2-D table, so merge the subset axis into
        # the sample axis first. Already-flat output is left as it is.
        if X_res.ndim == 3:
            X_res = X_res.reshape(-1, X_res.shape[-1])
            y_res = y_res.reshape(-1)

        print("enSample label shape {}".format(Counter(y_res)))
        _data = np.concatenate([X_res, y_res.reshape(len(X_res), 1)], axis=1)
        df_new = pd.DataFrame(data=_data,
                              columns=feature_name + [self._target])
        return df_new
                           n_informative=3,
                           n_redundant=1,
                           flip_y=0,
                           n_features=20,
                           n_clusters_per_class=1,
                           n_samples=100,
                           random_state=10)

# Reduce the features to two principal components so that the samples can
# be drawn in a 2D scatter plot
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Draw three subsets with Easy Ensemble and project each of them with the
# PCA fitted on the original data
ee = EasyEnsemble(n_subsets=3)
X_resampled, y_resampled = ee.fit_resample(X, y)
X_res_vis = [pca.transform(subset) for subset in X_resampled]

# One row, two columns of axes: original data left, resampled data right
f, (ax1, ax2) = plt.subplots(1, 2)

# Row selectors for the two classes of the original labels
is_class0 = y == 0
is_class1 = y == 1

ax1.scatter(X_vis[is_class0, 0], X_vis[is_class0, 1],
            label="Class #0", alpha=0.5)
ax1.scatter(X_vis[is_class1, 0], X_vis[is_class1, 1],
            label="Class #1", alpha=0.5)
ax1.set_title('Original set')

# Class #0 of the original data is drawn on the right-hand axes as well
ax2.scatter(X_vis[is_class0, 0], X_vis[is_class0, 1],
            label="Class #0", alpha=0.5)
for iy, e in enumerate(X_res_vis):
    ax2.scatter(e[y_resampled[iy] == 1, 0],
                e[y_resampled[iy] == 1, 1],
print(__doc__)

# Build a two-class toy dataset with 100 samples and 20 features
X, y = make_classification(
    n_classes=2,
    class_sep=2,
    weights=[0.3, 0.7],
    n_informative=3,
    n_redundant=1,
    flip_y=0,
    n_features=20,
    n_clusters_per_class=1,
    n_samples=100,
    random_state=10,
)

# Reduce the features to two principal components so that the samples can
# be drawn in a 2D scatter plot
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Draw three subsets with Easy Ensemble and project each of them with the
# PCA fitted on the original data
ee = EasyEnsemble(n_subsets=3)
X_resampled, y_resampled = ee.fit_resample(X, y)
X_res_vis = [pca.transform(subset) for subset in X_resampled]

# One row, two columns of axes: original data left, resampled data right
f, (ax1, ax2) = plt.subplots(1, 2)

# Row selectors for the two classes of the original labels
is_class0 = y == 0
is_class1 = y == 1

ax1.scatter(X_vis[is_class0, 0], X_vis[is_class0, 1],
            label="Class #0", alpha=0.5)
ax1.scatter(X_vis[is_class1, 0], X_vis[is_class1, 1],
            label="Class #1", alpha=0.5)
ax1.set_title('Original set')

# Class #0 of the original data is drawn on the right-hand axes as well
ax2.scatter(X_vis[is_class0, 0], X_vis[is_class0, 1],
            label="Class #0", alpha=0.5)

# Overlay the class #1 samples of every resampled subset, one series each
for subset_no, projected in enumerate(X_res_vis):
    class1_rows = y_resampled[subset_no] == 1
    ax2.scatter(projected[class1_rows, 0], projected[class1_rows, 1],
                label="Class #1 - set #{}".format(subset_no), alpha=0.5)