def test_rest(x, y, c=0, ratio='auto'):
    """Resample ``(x, y)`` with the sampling method selected by ``c``.

    Parameters
    ----------
    x, y : array-like
        Feature matrix and target vector to resample.
    c : int, optional (default=0)
        Method selector: 0 random under-sampling, 1 Tomek links,
        2 cluster centroids, 3-5 NearMiss versions 1-3,
        6 neighbourhood cleaning rule, 7 random over-sampling,
        8 SMOTE+Tomek, 9 SMOTE+ENN, anything else EasyEnsemble.
    ratio : str or float, optional (default='auto')
        Ratio forwarded to the sampler.

    Returns
    -------
    x, y : the resampled data.

    Notes
    -----
    Relies on the module-level ``indices_support`` and ``verbose``
    globals, matching the other helpers in this file.
    """
    # Each entry: (label, factory, returns_indices).  Factories defer
    # construction so any verbose output from the sampler constructor
    # still appears after the label is printed, as before.
    methods = {
        0: ('Random under-sampling',
            lambda: UnderSampler(indices_support=indices_support,
                                 verbose=verbose, ratio=ratio),
            True),
        1: ('Tomek links',
            lambda: TomekLinks(verbose=verbose, ratio=ratio),
            False),
        2: ('Clustering centroids',
            lambda: ClusterCentroids(verbose=verbose, ratio=ratio),
            False),
        3: ('NearMiss-1',
            lambda: NearMiss(version=1, indices_support=indices_support,
                             verbose=verbose, ratio=ratio),
            True),
        4: ('NearMiss-2',
            lambda: NearMiss(version=2, indices_support=indices_support,
                             verbose=verbose, ratio=ratio),
            True),
        5: ('NearMiss-3',
            lambda: NearMiss(version=3, indices_support=indices_support,
                             verbose=verbose, ratio=ratio),
            True),
        6: ('Neighboorhood Cleaning Rule',
            lambda: NeighbourhoodCleaningRule(indices_support=indices_support,
                                              verbose=verbose, ratio=ratio),
            True),
        7: ('Random over-sampling',
            lambda: OverSampler(verbose=verbose, ratio=ratio),
            False),
        8: ('SMOTE Tomek links',
            lambda: SMOTETomek(verbose=verbose, ratio=ratio),
            False),
        9: ('SMOTE ENN',
            lambda: SMOTEENN(verbose=verbose, ratio=ratio),
            False),
    }
    # Any unknown selector falls back to EasyEnsemble, like the original
    # trailing ``else`` branch.  (The original also had a no-op ``c = c``
    # statement, removed here.)
    label, make_sampler, with_indices = methods.get(
        c, ('EasyEnsemble',
            lambda: EasyEnsemble(verbose=verbose, ratio=ratio),
            False))
    print(label)
    if with_indices:
        x, y, idx_tmp = make_sampler().fit_transform(x, y)
        print('Indices selected')
        print(idx_tmp)
    else:
        x, y = make_sampler().fit_transform(x, y)
    return x, y
def test_rest(x, y):
    """Run every available resampling method once on ``(x, y)``.

    Purely exercises the samplers for their side effects (verbose
    output); nothing is returned.  Uses the module-level
    ``indices_support`` and ``verbose`` globals.
    """
    # (label, factory, returns_indices) — factories keep construction
    # lazy so the label is printed before any sampler output.
    runs = (
        ('Random under-sampling',
         lambda: UnderSampler(indices_support=indices_support,
                              verbose=verbose), True),
        ('Tomek links',
         lambda: TomekLinks(verbose=verbose), False),
        ('Clustering centroids',
         lambda: ClusterCentroids(verbose=verbose), False),
        ('NearMiss-1',
         lambda: NearMiss(version=1, indices_support=indices_support,
                          verbose=verbose), True),
        ('NearMiss-2',
         lambda: NearMiss(version=2, indices_support=indices_support,
                          verbose=verbose), True),
        ('NearMiss-3',
         lambda: NearMiss(version=3, indices_support=indices_support,
                          verbose=verbose), True),
        ('Neighboorhood Cleaning Rule',
         lambda: NeighbourhoodCleaningRule(indices_support=indices_support,
                                           verbose=verbose), True),
        ('Random over-sampling',
         lambda: OverSampler(verbose=verbose), False),
        ('SMOTE Tomek links',
         lambda: SMOTETomek(verbose=verbose), False),
        ('SMOTE ENN',
         lambda: SMOTEENN(verbose=verbose), False),
        ('EasyEnsemble',
         lambda: EasyEnsemble(verbose=verbose), False),
    )
    for label, make_sampler, with_indices in runs:
        print(label)
        if with_indices:
            _, _, idx_tmp = make_sampler().fit_transform(x, y)
            print('Indices selected')
            print(idx_tmp)
        else:
            make_sampler().fit_transform(x, y)
def nearmiss_undersampling(X, y, version):
    """Perform NearMiss undersampling.

    Parameters
    ----------
    X -- The feature vectors
    y -- The target classes
    version -- NearMiss variant to apply (1, 2 or 3)

    Returns the undersampled ``(X, y)`` pair.  Reads the module-level
    ``verbose`` flag.
    """
    if verbose:
        # Parenthesized single argument: valid as both the Python 2
        # print statement and the Python 3 print() function, matching
        # the print(...) style used elsewhere in this file.
        print('\nUndersampling with NearMiss-' + str(version) + ' ...')
    undersampler = NearMiss(verbose=verbose, version=version)
    X_undersampled, y_undersampled = undersampler.fit_transform(X, y)
    return X_undersampled, y_undersampled
def nearmiss_undersampling(X, y, version):
    """Perform NearMiss undersampling.

    Parameters
    ----------
    X -- The feature vectors
    y -- The target classes
    version -- NearMiss variant to apply (1, 2 or 3)

    Returns the undersampled ``(X, y)`` pair.  Reads the module-level
    ``verbose`` flag.
    """
    if verbose:
        # Parenthesized so it runs under both Python 2 (print statement)
        # and Python 3 (print function); original used Py2-only syntax.
        print('\nUndersampling with NearMiss-' + str(version) + ' ...')
    undersampler = NearMiss(verbose=verbose, version=version)
    X_undersampled, y_undersampled = undersampler.fit_transform(X, y)
    return X_undersampled, y_undersampled
def test_nm1_fit_transform_half():
    """Check NearMiss-1 fit_transform with a 0.5 ratio against stored data."""
    # Build and apply the under-sampler under test.
    under_sampler = NearMiss(ratio=.5, random_state=RND_SEED,
                             version=VERSION_NEARMISS)
    x_res, y_res = under_sampler.fit_transform(X, Y)

    # Ground-truth arrays live next to this test module.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    expected_x = np.load(os.path.join(data_dir, 'nm1_x_05.npy'))
    expected_y = np.load(os.path.join(data_dir, 'nm1_y_05.npy'))

    assert_array_equal(x_res, expected_x)
    assert_array_equal(y_res, expected_y)
def test_nm1_fit_transform_auto_indices():
    """Check NearMiss-1 with ratio='auto' and index return against stored data."""
    # Build the under-sampler, asking for the selected sample indices too.
    under_sampler = NearMiss(ratio='auto', random_state=RND_SEED,
                             version=VERSION_NEARMISS, return_indices=True)
    x_res, y_res, selected_idx = under_sampler.fit_transform(X, Y)

    # Ground-truth arrays live next to this test module.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    expected_x = np.load(os.path.join(data_dir, 'nm1_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'nm1_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'nm1_idx.npy'))

    assert_array_equal(x_res, expected_x)
    assert_array_equal(y_res, expected_y)
    assert_array_equal(selected_idx, expected_idx)
from unbalanced_dataset.under_sampling import NearMiss # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply the random under-sampling nm3 = NearMiss(version=1) X_resampled, y_resampled = nm3.fit_transform(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply the random under-sampling nm3 = NearMiss(version=1) X_resampled, y_resampled = nm3.fit_transform(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",