def test_random_over_sampler_shrinkage_behaviour(data):
    """Check that a larger shrinkage factor spreads the resampled data more.

    The data are resampled twice with the same sampler, once with
    ``shrinkage=1`` and once with ``shrinkage=5``. The dispersion of the
    class-0 samples, measured as the determinant of their covariance matrix,
    is expected to be strictly larger with the larger shrinkage factor.
    """
    X, y = data

    def _class_zero_dispersion(X_res, y_res):
        # Determinant of the covariance matrix of the samples labelled 0.
        return np.linalg.det(np.cov(X_res[y_res == 0].T))

    sampler = RandomOverSampler(shrinkage=1, random_state=0)
    X_res_small, y_res_small = sampler.fit_resample(X, y)

    # Re-use the same sampler so that only the shrinkage factor differs.
    sampler.set_params(shrinkage=5)
    X_res_large, y_res_large = sampler.fit_resample(X, y)

    dispersion_small = _class_zero_dispersion(X_res_small, y_res_small)
    dispersion_large = _class_zero_dispersion(X_res_large, y_res_large)

    assert dispersion_small < dispersion_large
plot_decision_function(
    X, y, model, axs[1], f"Using {model[0].__class__.__name__}"
)
fig.suptitle(f"Decision function of {clf.__class__.__name__}")
fig.tight_layout()

# %% [markdown]
# By default, random over-sampling generates a bootstrap. Setting the
# `shrinkage` parameter adds a small perturbation to the generated data,
# which yields a smoothed bootstrap instead. The plot below shows the
# difference between the two data generation strategies.

# %%
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))

# One (shrinkage, title) pair per subplot, in left-to-right order.
bootstrap_variants = [
    (None, "Normal bootstrap"),
    (0.3, "Smoothed bootstrap"),
]
for ax, (shrinkage, title) in zip(axs, bootstrap_variants):
    sampler.set_params(shrinkage=shrinkage)
    plot_resampling(X, y, sampler, ax=ax, title=title)

fig.suptitle(f"Resampling with {sampler.__class__.__name__}")
fig.tight_layout()

# %% [markdown]
# It looks like more samples are generated with the smoothed bootstrap. This
# is because the generated samples are no longer superimposed on the original
# samples.
#
# More advanced over-sampling using ADASYN and SMOTE
# --------------------------------------------------