def test_fit_transform_auto_early_stop():
    """Test the fit and transform routine with auto ratio and a fixed
    number of subsets."""

    # Define the ratio parameter
    ratio = 'auto'
    n_subset = 4

    # Create the sampling object
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
                        return_indices=True, n_max_subset=n_subset)

    # Get the different subsets
    X_resampled, y_resampled, idx_under = bc.fit_transform(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'bc_x_n_sub.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'bc_y_n_sub.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'bc_idx_n_sub.npy'))

    # Check each array
    for idx in range(X_gt.size):
        assert_array_equal(X_resampled[idx], X_gt[idx])
        assert_array_equal(y_resampled[idx], y_gt[idx])
        assert_array_equal(idx_under[idx], idx_gt[idx])
def test_fit_transform_auto_gradient_boosting():
    """Test the fit and transform routine with auto ratio and a
    gradient-boosting classifier."""

    # Define the ratio parameter
    ratio = 'auto'
    classifier = 'gradient-boosting'

    # Create the sampling object
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
                        return_indices=True, classifier=classifier)

    # Get the different subsets
    X_resampled, y_resampled, idx_under = bc.fit_transform(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'bc_x_gb.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'bc_y_gb.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'bc_idx_gb.npy'))

    # Check each array
    for idx in range(X_gt.size):
        assert_array_equal(X_resampled[idx], X_gt[idx])
        assert_array_equal(y_resampled[idx], y_gt[idx])
        assert_array_equal(idx_under[idx], idx_gt[idx])
def test_bc_fit():
    """Test the fitting method."""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED)
    # Fit the data
    bc.fit(X, Y)

    # Check that the class statistics have been computed
    assert_equal(bc.min_c_, 0)
    assert_equal(bc.maj_c_, 1)
    assert_equal(bc.stats_c_[0], 500)
    assert_equal(bc.stats_c_[1], 4500)
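# The assertions above rely on module-level fixtures (RND_SEED, X, Y) that are
# defined outside this excerpt.  A minimal sketch of what they would have to
# look like, given the 500/4500 class counts checked in test_bc_fit, is shown
# below; the exact make_classification parameters are assumptions borrowed
# from the plotting example further down.
from sklearn.datasets import make_classification

RND_SEED = 0
X, Y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)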
def test_bc_fit_invalid_ratio():
    """Test that an error is raised when the requested balancing ratio is
    smaller than the ratio of the data."""

    # Create the object
    ratio = 1. / 10000.
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED)
    # Fit the data
    assert_raises(RuntimeError, bc.fit, X, Y)
def test_fit_transform_half():
    """Test the fit and transform routine with 0.5 ratio."""

    # Define the ratio parameter
    ratio = 0.5

    # Create the sampling object
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED)

    # Get the different subsets
    X_resampled, y_resampled = bc.fit_transform(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'bc_x_05.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'bc_y_05.npy'))

    # Check each array
    for idx in range(X_gt.size):
        assert_array_equal(X_resampled[idx], X_gt[idx])
        assert_array_equal(y_resampled[idx], y_gt[idx])
def test_transform_wt_fit():
    """Test that an error is raised when transform is called before
    fitting."""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED)
    assert_raises(RuntimeError, bc.transform, X, Y)
def test_bc_fit_single_class():
    """Test that an error is raised when there is a single class."""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED)

    # Create a wrong y with a single class and check that fit raises
    y_single_class = np.zeros((X.shape[0], ))
    assert_raises(RuntimeError, bc.fit, X, y_single_class)
def test_fit_transform_auto():
    """Test the fit and transform routine with auto ratio."""

    # Define the ratio parameter
    ratio = 'auto'

    # Create the sampling object
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
                        return_indices=True)

    # Get the different subsets
    X_resampled, y_resampled, idx_under = bc.fit_transform(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'bc_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'bc_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'bc_idx.npy'))

    # Check each array
    for idx in range(X_gt.size):
        assert_array_equal(X_resampled[idx], X_gt[idx])
        assert_array_equal(y_resampled[idx], y_gt[idx])
        assert_array_equal(idx_under[idx], idx_gt[idx])
def test_bc_init():
    """Test the initialisation of the object."""

    # Define a ratio
    ratio = 1.
    verbose = True
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, verbose=verbose)

    assert_equal(bc.ratio_, ratio)
    assert_equal(bc.bootstrap, True)
    assert_equal(bc.n_max_subset, None)
    assert_equal(bc.rs_, RND_SEED)
    assert_equal(bc.verbose, verbose)
    assert_equal(bc.min_c_, None)
    assert_equal(bc.maj_c_, None)
    assert_equal(bc.stats_c_, {})
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import make_classification
from sklearn.decomposition import PCA

from unbalanced_dataset.ensemble import BalanceCascade

# Plotting colours; these definitions are assumed, since the snippet uses
# `almost_black` and `palette` without defining them.
almost_black = '#262626'
palette = sns.color_palette()

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Balance Cascade under-sampling
bc = BalanceCascade()
X_resampled, y_resampled = bc.fit_transform(X, y)
X_res_vis = []
for X_res in X_resampled:
    X_res_vis.append(pca.transform(X_res))

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
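# The script above stops mid-way through drawing the second panel.  A minimal
# sketch of how the resampled subsets could be added to `ax2` is given below;
# the loop structure, the per-subset labels, the panel title and the
# palette-cycling facecolor are assumptions, not part of the original example.
for iy, X_res in enumerate(X_res_vis):
    ax2.scatter(X_res[y_resampled[iy] == 1, 0],
                X_res[y_resampled[iy] == 1, 1],
                label="Class #1 - set #{}".format(iy), alpha=0.5,
                edgecolor=almost_black,
                facecolor=palette[(iy + 1) % len(palette)], linewidth=0.15)
ax2.set_title('Balance cascade')

plt.legend()
plt.show()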