def test_rest(x, y, c=0, ratio='auto', indices_support=True, verbose=False):
    """Resample (x, y) with the method selected by ``c``.

    Note: ``indices_support`` and ``verbose`` were undefined in the original
    snippet; they are exposed here as parameters so the function is
    self-contained.
    """
    if c == 0:
        print('Random under-sampling')
        US = UnderSampler(indices_support=indices_support, verbose=verbose,
                          ratio=ratio)
        x, y, idx_tmp = US.fit_transform(x, y)
        print('Indices selected')
        print(idx_tmp)
    elif c == 1:
        print('Tomek links')
        TL = TomekLinks(verbose=verbose, ratio=ratio)
        x, y = TL.fit_transform(x, y)
    elif c == 2:
        print('Clustering centroids')
        CC = ClusterCentroids(verbose=verbose, ratio=ratio)
        x, y = CC.fit_transform(x, y)
    elif c == 3:
        print('NearMiss-1')
        NM1 = NearMiss(version=1, indices_support=indices_support,
                       verbose=verbose, ratio=ratio)
        x, y, idx_tmp = NM1.fit_transform(x, y)
        print('Indices selected')
        print(idx_tmp)
    elif c == 4:
        print('NearMiss-2')
        NM2 = NearMiss(version=2, indices_support=indices_support,
                       verbose=verbose, ratio=ratio)
        x, y, idx_tmp = NM2.fit_transform(x, y)
        print('Indices selected')
        print(idx_tmp)
    elif c == 5:
        print('NearMiss-3')
        NM3 = NearMiss(version=3, indices_support=indices_support,
                       verbose=verbose, ratio=ratio)
        x, y, idx_tmp = NM3.fit_transform(x, y)
        print('Indices selected')
        print(idx_tmp)
    elif c == 6:
        print('Neighbourhood Cleaning Rule')
        NCR = NeighbourhoodCleaningRule(indices_support=indices_support,
                                        verbose=verbose, ratio=ratio)
        x, y, idx_tmp = NCR.fit_transform(x, y)
        print('Indices selected')
        print(idx_tmp)
    elif c == 7:
        print('Random over-sampling')
        OS = OverSampler(verbose=verbose, ratio=ratio)
        x, y = OS.fit_transform(x, y)
    elif c == 8:
        print('SMOTE Tomek links')
        STK = SMOTETomek(verbose=verbose, ratio=ratio)
        x, y = STK.fit_transform(x, y)
    elif c == 9:
        print('SMOTE ENN')
        SENN = SMOTEENN(verbose=verbose, ratio=ratio)
        x, y = SENN.fit_transform(x, y)
    else:
        print('EasyEnsemble')
        EE = EasyEnsemble(verbose=verbose, ratio=ratio)
        x, y = EE.fit_transform(x, y)
    return x, y
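# Hedged usage sketch (not part of the original snippet): drive test_rest over
# every method with a synthetic imbalanced data set. The sampler classes are
# assumed to be imported at module level, and make_classification's arguments
# here are purely illustrative.
from sklearn.datasets import make_classification

def run_all_methods():
    x, y = make_classification(n_classes=2, weights=[0.1, 0.9],
                               n_samples=5000, random_state=10)
    for c in range(11):
        x_res, y_res = test_rest(x, y, c=c, ratio='auto')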
def test_rest(x, y, indices_support=True, verbose=False):
    """Run every resampling method once on (x, y).

    Note: ``indices_support`` and ``verbose`` were undefined in the original
    snippet; they are exposed here as parameters so the function is
    self-contained.
    """
    print('Random under-sampling')
    US = UnderSampler(indices_support=indices_support, verbose=verbose)
    usx, usy, idx_tmp = US.fit_transform(x, y)
    print('Indices selected')
    print(idx_tmp)

    print('Tomek links')
    TL = TomekLinks(verbose=verbose)
    tlx, tly = TL.fit_transform(x, y)

    print('Clustering centroids')
    CC = ClusterCentroids(verbose=verbose)
    ccx, ccy = CC.fit_transform(x, y)

    print('NearMiss-1')
    NM1 = NearMiss(version=1, indices_support=indices_support, verbose=verbose)
    nm1x, nm1y, idx_tmp = NM1.fit_transform(x, y)
    print('Indices selected')
    print(idx_tmp)

    print('NearMiss-2')
    NM2 = NearMiss(version=2, indices_support=indices_support, verbose=verbose)
    nm2x, nm2y, idx_tmp = NM2.fit_transform(x, y)
    print('Indices selected')
    print(idx_tmp)

    print('NearMiss-3')
    NM3 = NearMiss(version=3, indices_support=indices_support, verbose=verbose)
    nm3x, nm3y, idx_tmp = NM3.fit_transform(x, y)
    print('Indices selected')
    print(idx_tmp)

    print('Neighbourhood Cleaning Rule')
    NCR = NeighbourhoodCleaningRule(indices_support=indices_support,
                                    verbose=verbose)
    ncrx, ncry, idx_tmp = NCR.fit_transform(x, y)
    print('Indices selected')
    print(idx_tmp)

    print('Random over-sampling')
    OS = OverSampler(verbose=verbose)
    ox, oy = OS.fit_transform(x, y)

    print('SMOTE Tomek links')
    STK = SMOTETomek(verbose=verbose)
    stkx, stky = STK.fit_transform(x, y)

    print('SMOTE ENN')
    SENN = SMOTEENN(verbose=verbose)
    sennx, senny = SENN.fit_transform(x, y)

    print('EasyEnsemble')
    EE = EasyEnsemble(verbose=verbose)
    eex, eey = EE.fit_transform(x, y)
def test_ncr_fit_transform():
    """Test the fit transform routine"""
    # Resample the data
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = ncr.fit_transform(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'ncr_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'ncr_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_ncr_fit():
    """Test the fitting method"""
    # Create the object
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    # Fit the data
    ncr.fit(X, Y)

    # Check that the data information has been computed
    assert_equal(ncr.min_c_, 0)
    assert_equal(ncr.maj_c_, 1)
    assert_equal(ncr.stats_c_[0], 500)
    assert_equal(ncr.stats_c_[1], 4500)
def test_ncr_fit_transform_with_indices():
    """Test the fit transform routine with indices support"""
    # Resample the data
    ncr = NeighbourhoodCleaningRule(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = ncr.fit_transform(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'ncr_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'ncr_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'ncr_idx.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def ncl_undersampling(X, y):
    """Perform the Neighbourhood Cleaning Rule undersampling.

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    """
    # ``verbose`` is expected to be defined at module level.
    if verbose:
        print('\nUndersampling with the Neighbourhood Cleaning Rule ...')

    undersampler = NeighbourhoodCleaningRule(verbose=verbose)
    X_undersampled, y_undersampled = undersampler.fit_transform(X, y)

    return X_undersampled, y_undersampled
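# Hedged usage sketch (not part of the original script): ``verbose`` is read
# as a module-level flag inside ncl_undersampling, and the toy data below is
# purely illustrative.
import numpy as np

verbose = True
X_demo = np.random.rand(100, 5)
y_demo = np.array([0] * 90 + [1] * 10)
X_res, y_res = ncl_undersampling(X_demo, y_demo)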
def test_ncr_transform_wt_fit():
    """Test that an error is raised when transform is called before fitting"""
    # Create the object
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    assert_raises(RuntimeError, ncr.transform, X, Y)
def test_ncr_fit_single_class():
    """Test that an error is raised when there is a single class"""
    # Create the object
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    # Create a wrong y with a single class and try to fit
    y_single_class = np.zeros((X.shape[0], ))
    assert_raises(RuntimeError, ncr.fit, X, y_single_class)
def test_ncr_init():
    """Test the initialisation of the object"""
    # Define the verbosity flag
    verbose = True
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED, verbose=verbose)

    assert_equal(ncr.size_ngh, 3)
    assert_equal(ncr.n_jobs, -1)
    assert_equal(ncr.rs_, RND_SEED)
    assert_equal(ncr.verbose, verbose)
    assert_equal(ncr.min_c_, None)
    assert_equal(ncr.maj_c_, None)
    assert_equal(ncr.stats_c_, {})
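# Hedged sketch of the module-level fixtures that the NCR tests above rely on
# (their original definitions are not shown in this excerpt): a fixed seed and
# a 5000-sample data set with a 500/4500 class split, matching the assertions
# in test_ncr_fit. The exact generation call is an assumption.
import os

import numpy as np
from numpy.testing import assert_array_equal, assert_equal, assert_raises
from sklearn.datasets import make_classification

RND_SEED = 0
X, Y = make_classification(n_classes=2, weights=[0.1, 0.9], flip_y=0,
                           n_samples=5000, random_state=RND_SEED)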
                           weights=[0.1, 0.9], n_informative=3, n_redundant=1,
                           flip_y=0, n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply the Neighbourhood Cleaning Rule under-sampling
ncl = NeighbourhoodCleaningRule()
X_resampled, y_resampled = ncl.fit_transform(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1],