def fit(self, c_data, x_data, y_data):
    """Train ``self.clf``, optionally resampling the training data first.

    The number of training samples seen at each preparation stage is stored
    in ``self.samplesize``, which is reset on every call.

    Parameters
    ----------
    c_data
        Forwarded to ``self.mask_cal`` when ``self.reject_by_calendar`` is set.
    x_data
        Training features; must support ``len()``.
    y_data
        Training targets.

    Notes
    -----
    If ``self.use_weights`` is set and ``self.clf.fit`` raises ``TypeError``
    when given the weights, ``self.use_weights`` is switched to ``False`` and
    the classifier is trained without weights.
    """
    # Track the evolution of the size of the training sample.
    self.samplesize = [len(x_data)]

    if self.reject_by_calendar:
        # The calendar mask is still computed, but rows are deliberately not
        # filtered: keeping them might improve the second classifier training.
        # NOTE(review): the result is unused; the call is kept in case
        # mask_cal has side effects -- confirm and drop if it has none.
        self.mask_cal(c_data, y_data)
        self.samplesize.append(len(x_data))

    if self.use_resampling:
        # Under-sample first ...
        x_data, y_data = AllKNN().fit_sample(x_data, y_data)
        self.samplesize.append(len(x_data))
        # ... then over-sample.
        x_data, y_data = SMOTEENN().fit_sample(x_data, y_data)
        self.samplesize.append(len(x_data))

    # Train the classifier only with the (filtered and) resampled data.
    if not self.use_weights:
        self.clf.fit(x_data, y_data)
        return

    try:
        self.clf.fit(x_data, y_data, self.get_weights(y_data))
    except TypeError:
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print("The classifier selected does not admit weights for training samples")
        print("Switching to no weights")
        self.use_weights = False
        self.clf.fit(x_data, y_data)
def test_allknn_fit_sample_with_indices():
    """AllKNN with ``return_indices=True`` returns samples, labels and indices."""
    sampler = AllKNN(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Reference samples expected to be returned, grouped by class.
    expected_X = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [-1.10146139, 0.91782682],
        [0.73489726, 0.43915195],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    # 4 samples of class 0, 9 of class 1 and 14 of class 2.
    expected_y = np.repeat([0, 1, 2], [4, 9, 14])
    expected_idx = np.array([
        6, 13, 32, 39,
        4, 5, 14, 16, 22, 23, 24, 30, 37,
        2, 11, 12, 17, 20, 21, 25, 26, 28, 31, 33, 34, 35, 36,
    ])

    for actual, expected in ((X_resampled, expected_X),
                             (y_resampled, expected_y),
                             (idx_under, expected_idx)):
        assert_allclose(actual, expected, rtol=R_TOL)
def test_allknn_fit_sample():
    """Compare the AllKNN ``fit_sample`` output with inline reference arrays."""
    # Resample the data
    sampler = AllKNN(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [-1.10146139, 0.91782682],
        [0.73489726, 0.43915195],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    # 4 samples of class 0, 9 of class 1 and 14 of class 2.
    expected_y = np.repeat([0, 1, 2], [4, 9, 14])

    assert_array_almost_equal(X_resampled, expected_X)
    assert_array_almost_equal(y_resampled, expected_y)
def test_allknn_fit_sample_with_nn_object():
    """Check ``fit_sample`` when ``n_neighbors`` is a NearestNeighbors object."""
    # Resample the data
    neighbours = NearestNeighbors(n_neighbors=4)
    sampler = AllKNN(n_neighbors=neighbours, random_state=RND_SEED,
                     kind_sel='mode')
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [-0.12840393, 0.66446571],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [-0.35946678, 0.72510189],
        [-1.10146139, 0.91782682],
        [0.73489726, 0.43915195],
        [-0.28479268, 0.70459548],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [1.32319756, -0.13181616],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    # 4 samples of class 0, 12 of class 1 and 15 of class 2.
    expected_y = np.repeat([0, 1, 2], [4, 12, 15])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_allknn_fit_sample():
    """Compare AllKNN ``fit_sample`` with reference arrays within ``R_TOL``."""
    # Resample the data
    sampler = AllKNN(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [-1.10146139, 0.91782682],
        [0.73489726, 0.43915195],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    # 4 samples of class 0, 9 of class 1 and 14 of class 2.
    expected_y = np.repeat([0, 1, 2], [4, 9, 14])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_allclose(y_resampled, expected_y, rtol=R_TOL)
def test_all_knn_allow_minority():
    """``allow_minority=True`` must leave fewer samples than the default."""
    X, y = make_classification(
        n_samples=10000,
        n_features=2,
        n_informative=2,
        n_redundant=0,
        n_repeated=0,
        n_classes=3,
        n_clusters_per_class=1,
        weights=[0.2, 0.3, 0.5],
        class_sep=0.4,
        random_state=0,
    )

    _, y_allow_minority = AllKNN(allow_minority=True).fit_sample(X, y)
    _, y_default = AllKNN().fit_sample(X, y)

    assert len(y_allow_minority) < len(y_default)
def test_allknn_fit_sample():
    """Compare AllKNN ``fit_sample`` with the reference arrays stored on disk."""
    # Resample the data
    sampler = AllKNN(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'allknn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'allknn_y.npy'))

    assert_array_almost_equal(X_resampled, expected_X)
    assert_array_almost_equal(y_resampled, expected_y)
def test_allknn_fit_sample_mode():
    """Compare AllKNN ``fit_sample`` (``kind_sel='mode'``) with stored arrays."""
    # Resample the data
    sampler = AllKNN(random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'allknn_x_mode.npy'))
    expected_y = np.load(os.path.join(data_dir, 'allknn_y_mode.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_almost_equal(y_resampled, expected_y)
def test_allknn_fit_sample():
    """Compare AllKNN ``fit_sample`` with the reference arrays stored on disk."""
    # Resample the data
    sampler = AllKNN(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'allknn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'allknn_y.npy'))

    assert_array_almost_equal(X_resampled, expected_X)
    assert_array_almost_equal(y_resampled, expected_y)
def test_allknn_fit_sample_mode():
    """Compare AllKNN ``fit_sample`` (``kind_sel='mode'``) with stored arrays."""
    # Resample the data
    sampler = AllKNN(random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'allknn_x_mode.npy'))
    expected_y = np.load(os.path.join(data_dir, 'allknn_y_mode.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_almost_equal(y_resampled, expected_y)
def test_allknn_fit_sample_with_indices():
    """Compare samples, labels and indices from AllKNN with stored arrays."""
    # Resample the data
    sampler = AllKNN(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'allknn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'allknn_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'allknn_idx.npy'))

    assert_array_almost_equal(X_resampled, expected_X)
    assert_array_almost_equal(y_resampled, expected_y)
    assert_array_almost_equal(idx_under, expected_idx)
def test_allknn_fit_sample_with_indices():
    """Compare samples, labels and indices from AllKNN with stored arrays."""
    # Resample the data
    sampler = AllKNN(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'allknn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'allknn_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'allknn_idx.npy'))

    assert_array_almost_equal(X_resampled, expected_X)
    assert_array_almost_equal(y_resampled, expected_y)
    assert_array_almost_equal(idx_under, expected_idx)
def test_multiclass_fit_sample():
    """Check the per-class sample counts after AllKNN on a three-class target."""
    # Turn the target into a multiclass one by relabelling the first 1000 rows.
    y = Y.copy()
    y[0:1000] = 2

    # Resample the data
    sampler = AllKNN(random_state=RND_SEED)
    _, y_resampled = sampler.fit_sample(X, y)

    # Check the number of samples kept for each class, in label order.
    class_counts = Counter(y_resampled)
    for label, expected_count in ((0, 341), (1, 2485), (2, 212)):
        assert_equal(class_counts[label], expected_count)
def test_multiclass_fit_sample():
    """Check the per-class sample counts after AllKNN on a three-class target."""
    # Turn the target into a multiclass one by relabelling the first 1000 rows.
    y = Y.copy()
    y[0:1000] = 2

    # Resample the data
    sampler = AllKNN(random_state=RND_SEED)
    _, y_resampled = sampler.fit_sample(X, y)

    # Check the number of samples kept for each class, in label order.
    class_counts = Counter(y_resampled)
    for label, expected_count in ((0, 341), (1, 2485), (2, 212)):
        assert_equal(class_counts[label], expected_count)
def under_sampling(xTrain, yTrain, neighbors=200):
    """
    Reduce the sample size of the majority class using the AllKNN under-sampler.
    It must only ever be applied to the training set.

    :param xTrain: X training set (a DataFrame; its column names are restored
        on the returned frame).
    :param yTrain: Y training set (a DataFrame; its column names are restored
        on the returned frame).
    :param neighbors: neighbourhood size forwarded to AllKNN as ``n_neighbors``
    :return: xTrain and yTrain undersampled, as DataFrames
    """
    xTrainNames = xTrain.columns.values.tolist()
    yTrainNames = yTrain.columns.values.tolist()

    model = AllKNN(random_state=42, ratio='majority', n_neighbors=neighbors)
    xResampled, yResampled = model.fit_sample(xTrain, yTrain)

    # Pass the name lists as they are: wrapping them in another list makes
    # pandas build MultiIndex columns instead of the original flat labels.
    xTrain = pd.DataFrame(xResampled, columns=xTrainNames)
    yTrain = pd.DataFrame(yResampled, columns=yTrainNames)

    return xTrain, yTrain
# NOTE(review): presumably the class counts printed by earlier resampling
# steps that lie outside this chunk -- confirm against the preceding code.
''' [(0, 64), (1, 262), (2, 4674)] [(0, 64), (1, 213), (2, 4568)] '''

# Translation of the note below: building on the previous method, the
# RepeatedEditedNearestNeighbours algorithm repeats the basic
# EditedNearestNeighbours algorithm several times.
''' 在此基础上, 延伸出了RepeatedEditedNearestNeighbours算法, 重复基础的EditedNearestNeighbours算法多次 '''
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
renn = RepeatedEditedNearestNeighbours(random_state=0)
X_resampled, y_resampled = renn.fit_sample(X, y)
# Print the (class, count) pairs after resampling, sorted by class label.
print(sorted(Counter(y_resampled).items()))
# Recorded output: [(0, 64), (1, 208), (2, 4551)]

# Unlike RepeatedEditedNearestNeighbours, the AllKNN algorithm increases the
# number of nearest neighbours at every iteration.
from imblearn.under_sampling import AllKNN
allknn = AllKNN(random_state=0)
X_resampled, y_resampled = allknn.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
# Recorded output: [(0, 64), (1, 220), (2, 4601)]

# Condensed nearest neighbors and derived algorithms
# Translation of the note below: CondensedNearestNeighbour iterates with a
# 1-nearest-neighbour rule to decide whether a sample is kept or discarded:
#   1. set C holds all minority-class samples;
#   2. one majority-class sample (the class to under-sample) is added to C and
#      the remaining samples of that class go to set S;
#   3. a 1-NN classifier is trained "using set S" and classifies the samples
#      of S (NOTE(review): the usual formulation trains on C -- confirm);
#   4. the misclassified samples of S are added to C;
#   5. the process repeats until no more samples are added to C.
''' CondensedNearestNeighbour使用1近邻的方法来进行迭代,来判断一个样本是应该保留还是剔除,具体的实现步骤如下: 集合C:所有的少数类样本; 选择一个多数类样本(需要下采样)加入集合C,其他的这类样本放入集合S; 使用集合S训练一个1-NN的分类器,对集合S中的样本进行分类; 将集合S中错分的样本加入集合C; 重复上述过程, 直到没有样本再加入到集合C. '''
from imblearn.under_sampling import CondensedNearestNeighbour
cnn = CondensedNearestNeighbour(random_state=0)
X_resampled, y_resampled = cnn.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
def _tidy_axes(ax, xlim, ylim):
    """Hide the top/right spines, offset the other two and fix the axis limits."""
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)


def _plot_removed_samples(X_vis, X_res_vis, y_resampled, idx_resampled, title):
    """Scatter the kept samples per class together with the removed samples."""
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    # Every original row whose index was not kept by the sampler.
    idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                       idx_resampled)
    idx_class_0 = y_resampled == 0

    plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1],
                alpha=.8, label='Class #0')
    plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1],
                alpha=.8, label='Class #1')
    plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1],
                alpha=.8, label='Removed samples')

    _tidy_axes(ax, [-6, 6], [-6, 6])
    plt.title(title)
    plt.legend()
    plt.tight_layout()
    plt.show()


def _print_reduction(n_before, n_after):
    """Print the percentage of samples removed by a sampler."""
    print('Reduced {:.2f}%'.format(100 * (1 - float(n_after) / n_before)))


def sampling(algorithm, x_train, y_train):
    """Scale or resample a training set with the strategy named by ``algorithm``.

    Supported values of ``algorithm``:

    * ``'standard'``      -- no resampling; features are standard-scaled.
    * ``'undersampling'`` -- RandomUnderSampler, with a 2D PCA plot.
    * ``'smote'``         -- SMOTE; the kind is picked by the integer index in
      ``sys.argv[2]`` (defaults to ``'regular'``), with a 2D PCA plot.
    * ``'neighbourhood'`` -- NeighbourhoodCleaningRule, with a 2D PCA plot.
    * ``'ENN'`` / ``'RENN'`` / ``'AllKNN'`` -- the matching cleaning sampler;
      the reduction percentage is printed.
    * ``'centroids'``     -- ClusterCentroids.
    * ``'centroidshard'`` -- ClusterCentroids with hard voting (returned),
      plotted next to the soft-voting result.

    Any other value returns ``x_train`` and ``y_train`` untouched.

    :param algorithm: name of the strategy, see above.
    :param x_train: training features.
    :param y_train: training targets (the plots assume classes 0 and 1).
    :return: tuple ``(X_resampled, y_resampled)``.
    """
    if algorithm == 'standard':
        print('\nUsing Standard Scaler.\n')
        scaler = StandardScaler().fit(x_train)
        X_resampled = scaler.transform(x_train)
        y_resampled = y_train

    elif algorithm == 'undersampling':
        # Fit a PCA so the data can be visualised inside a 2D feature space.
        pca = PCA(n_components=2)
        X_vis = pca.fit_transform(x_train)

        print('\nUsing Random Under Sampling.\n')
        rus = RandomUnderSampler(return_indices=True)
        X_resampled, y_resampled, idx_resampled = rus.fit_sample(x_train,
                                                                 y_train)
        _plot_removed_samples(X_vis, pca.transform(X_resampled), y_resampled,
                              idx_resampled,
                              'Under-sampling using random under-sampling')

    elif algorithm == 'smote':
        print('\nUsing SMOTE.\n')
        # Fit a PCA so the data can be visualised inside a 2D feature space.
        pca = PCA(n_components=2)
        X_vis = pca.fit_transform(x_train)

        kinds = ['regular', 'borderline1', 'borderline2', 'svm']
        # The optional third command-line argument is an index into ``kinds``.
        # Without it fall back to index 0 ('regular'); the previous default
        # went through int('regular') and raised ValueError.
        kind_index = int(sys.argv[2]) if len(sys.argv) >= 3 else 0
        kind = [kinds[kind_index]]
        print(kind)

        # NOTE(review): unlike every other branch, this one returns LISTS of
        # arrays (one entry per SMOTE kind). Kept for backward compatibility.
        X_resampled = []
        y_resampled = []
        X_res_vis = []
        for method in [SMOTE(kind=k) for k in kind]:
            X_res, y_res = method.fit_sample(x_train, y_train)
            X_resampled.append(X_res)
            y_resampled.append(y_res)
            X_res_vis.append(pca.transform(X_res))

        f, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2)
        ax2.axis('off')
        ax_res = [ax3, ax4, ax5, ax6]

        c0, c1 = plot_resampling(ax1, X_vis, y_train, 'Original set')
        for ax, vis, y_res, kind_name in zip(ax_res, X_res_vis, y_resampled,
                                             kind):
            plot_resampling(ax, vis, y_res, 'SMOTE {}'.format(kind_name))

        ax2.legend((c0, c1), ('Class #0', 'Class #1'), loc='center',
                   ncol=1, labelspacing=0.)
        plt.tight_layout()
        plt.show()

    elif algorithm == 'neighbourhood':
        print('\nUsing Neighbourhood Cleaning Rule.\n')
        pca = PCA(n_components=2)
        X_vis = pca.fit_transform(x_train)

        ncl = NeighbourhoodCleaningRule(return_indices=True)
        X_resampled, y_resampled, idx_resampled = ncl.fit_sample(x_train,
                                                                 y_train)
        _plot_removed_samples(X_vis, pca.transform(X_resampled), y_resampled,
                              idx_resampled,
                              'Under-sampling using neighbourhood cleaning rule')

    elif algorithm == 'ENN':
        print('\nUsing ENN.\n')
        enn = EditedNearestNeighbours(return_indices=True)
        X_resampled, y_resampled, _ = enn.fit_sample(x_train, y_train)
        _print_reduction(len(x_train), len(X_resampled))

    elif algorithm == 'RENN':
        print('\nUsing RENN.\n')
        renn = RepeatedEditedNearestNeighbours(return_indices=True)
        X_resampled, y_resampled, _ = renn.fit_sample(x_train, y_train)
        _print_reduction(len(x_train), len(X_resampled))

    elif algorithm == 'AllKNN':
        print('\nUsing AllKNN.\n')
        allknn = AllKNN(return_indices=True)
        X_resampled, y_resampled, _ = allknn.fit_sample(x_train, y_train)
        _print_reduction(len(x_train), len(X_resampled))

    elif algorithm == 'centroids':
        print('\nUsing Cluster Centroids.\n')
        cc = ClusterCentroids()
        X_resampled, y_resampled = cc.fit_sample(x_train, y_train)

    elif algorithm == 'centroidshard':
        print('\nUsing Cluster Centroids with Hard Voting.\n')
        pca = PCA(n_components=2)
        X_vis = pca.fit_transform(x_train)

        # Soft voting (default): used for the comparison plot only. Its labels
        # are kept separately so the soft plot is not masked with the labels
        # produced by the hard-voting run below.
        cc = ClusterCentroids()
        X_soft, y_soft = cc.fit_sample(x_train, y_train)
        X_res_vis_soft = pca.transform(X_soft)

        # Hard voting: this is the result that gets returned.
        cc = ClusterCentroids(voting='hard')
        X_resampled, y_resampled = cc.fit_sample(x_train, y_train)
        X_res_vis_hard = pca.transform(X_resampled)

        f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

        c0 = ax1.scatter(X_vis[y_train == 0, 0], X_vis[y_train == 0, 1],
                         label="Class #0", alpha=0.5)
        c1 = ax1.scatter(X_vis[y_train == 1, 0], X_vis[y_train == 1, 1],
                         label="Class #1", alpha=0.5)
        ax1.set_title('Original set')

        ax2.scatter(X_res_vis_soft[y_soft == 0, 0],
                    X_res_vis_soft[y_soft == 0, 1],
                    label="Class #0", alpha=.5)
        ax2.scatter(X_res_vis_soft[y_soft == 1, 0],
                    X_res_vis_soft[y_soft == 1, 1],
                    label="Class #1", alpha=.5)
        # Keep the handle so the figure legend has one handle per label.
        c2 = ax2.scatter(X_vis[y_train == 1, 0], X_vis[y_train == 1, 1],
                         label="Original #1", alpha=0.2)
        ax2.set_title('Cluster centroids with soft voting')

        ax3.scatter(X_res_vis_hard[y_resampled == 0, 0],
                    X_res_vis_hard[y_resampled == 0, 1],
                    label="Class #0", alpha=.5)
        ax3.scatter(X_res_vis_hard[y_resampled == 1, 0],
                    X_res_vis_hard[y_resampled == 1, 1],
                    label="Class #1", alpha=.5)
        ax3.scatter(X_vis[y_train == 1, 0], X_vis[y_train == 1, 1],
                    alpha=0.2)
        ax3.set_title('Cluster centroids with hard voting')

        for ax in (ax1, ax2, ax3):
            _tidy_axes(ax, [-6, 8], [-6, 6])

        plt.figlegend((c0, c1, c2),
                      ('Class #0', 'Class #1', 'Original Class #1'),
                      loc='lower center', ncol=3, labelspacing=0.)
        plt.tight_layout(pad=3)
        plt.show()

    else:
        # Unknown strategy: hand the data back untouched. The PCA projection
        # that used to be computed here was never used.
        return x_train, y_train

    return X_resampled, y_resampled
try: X_resampled, Y_resampled = enn.fit_sample(X, Y) except Exception, e: print str(e) X_resampled, Y_resampled = X, Y elif index == 6: renn = RepeatedEditedNearestNeighbours(random_state=0) try: X_resampled, Y_resampled = renn.fit_sample(X, Y) except Exception, e: print str(e) X_resampled, Y_resampled = X, Y elif index == 7: allknn = AllKNN(random_state=0) try: X_resampled, Y_resampled = allknn.fit_sample(X, Y) except Exception, e: print str(e) X_resampled, Y_resampled = X, Y return X_resampled, Y_resampled algo_list = ['dt', 'GaNB', 'linear_svc', 'logistic', 'nn', 'rf', 'svc'] X_list = [] Y_list = [] for algo in algo_list: username_val, X, Y = read_file(algo) X_list.append(X)
def test_alknn_not_good_object():
    """``fit_sample`` raises ValueError when ``n_neighbors`` is the string 'rnd'."""
    sampler = AllKNN(n_neighbors='rnd', kind_sel='mode')
    with raises(ValueError):
        sampler.fit_sample(X, Y)
# Apply the RepeatedEditedNearestNeighbours
print('RENN')
renn = RepeatedEditedNearestNeighbours()
X_resampled, y_resampled = renn.fit_sample(X, y)
# Project the kept samples with the PCA object fitted earlier in the script.
X_res_vis = pca.transform(X_resampled)
# '\%' was an invalid escape sequence and printed a stray backslash.
print('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) / len(X))))

ax3.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax3.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax3.set_title('Repeated Edited nearest neighbours')

# Apply the AllKNN
print('AllKNN')
allknn = AllKNN()
X_resampled, y_resampled = allknn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)
print('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) / len(X))))

ax4.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax4.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax4.set_title('AllKNN')

plt.show()
def fit_sample(self, X, y):
    """Resample ``X`` and ``y`` with a default-configured AllKNN sampler."""
    return AllKNN().fit_sample(X, y)
def test_deprecation_random_state():
    """``fit_sample`` warns that ``random_state`` is deprecated since 0.4."""
    expected_message = "'random_state' is deprecated from 0.4"
    sampler = AllKNN(random_state=0)
    with warns(DeprecationWarning, match=expected_message):
        sampler.fit_sample(X, Y)