def test_ncr_wrong_nn_obj():
    nn = 'rnd'
    ncr = NeighbourhoodCleaningRule(return_indices=True,
                                    random_state=RND_SEED,
                                    n_neighbors=nn)
    with raises(ValueError, match="has to be one of"):
        ncr.fit_sample(X, Y)
def neighbourhood_cleaning_rule(feature_list_of_all_instances,
                                class_list_of_all_instances, neighbours):
    # Apply the neighbourhood cleaning rule and report class counts
    # before and after cleaning
    c1 = 0
    c2 = 0
    count = 0
    for i in class_list_of_all_instances:
        if i == 1:
            c1 += 1
        if i == 0:
            c2 += 1
        if i != 1 and i != 0:
            count += 1
    print(" Data of class 1 ", c1, " ,Data of class 0 ", c2,
          ",Other class ", count)

    ncl = NeighbourhoodCleaningRule(n_neighbors=neighbours, n_jobs=4)
    X_resampled, y_resampled = ncl.fit_sample(feature_list_of_all_instances,
                                              class_list_of_all_instances)
    print(" Cleaned ",
          len(feature_list_of_all_instances) - len(X_resampled),
          " points", end='')

    c1 = 0
    c2 = 0
    for ii in y_resampled:
        if ii == 1:
            c1 += 1
        if ii == 0:
            c2 += 1
    print(" and data of class 1 ", c1, "data of class 0 ", c2,
          "for ", neighbours, "neighbours ")
    return X_resampled, y_resampled
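# A minimal usage sketch for the helper above. The arrays `features` and
# `classes` are hypothetical stand-ins for the caller's data, and the
# choice of 5 neighbours is arbitrary; the helper itself does the
# resampling and the before/after reporting.
import numpy as np
from imblearn.under_sampling import NeighbourhoodCleaningRule

features = np.random.RandomState(0).randn(200, 4)
classes = np.r_[np.zeros(150, dtype=int), np.ones(50, dtype=int)]
X_clean, y_clean = neighbourhood_cleaning_rule(features, classes, 5)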
def test_ncr_fit_sample():
    # Resample the data
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = ncr.fit_sample(X, Y)

    X_gt = np.array([[-1.20809175, -1.49917302],
                     [-0.60497017, -0.66630228],
                     [-0.91735824, 0.93110278],
                     [0.35967591, 2.61186964],
                     [-1.55581933, 1.09609604],
                     [1.55157493, -1.6981518]])
    y_gt = np.array([0, 0, 1, 2, 1, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_ncr_fit_sample():
    """Test the fit sample routine"""
    # Resample the data
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = ncr.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'ncr_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'ncr_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_ncr_fit_sample_mode():
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = ncr.fit_sample(X, Y)

    X_gt = np.array([[0.34096173, 0.50947647],
                     [-0.91735824, 0.93110278],
                     [-0.20413357, 0.64628718],
                     [0.35967591, 2.61186964],
                     [0.90701028, -0.57636928],
                     [-1.20809175, -1.49917302],
                     [-0.60497017, -0.66630228],
                     [1.39272351, -0.51631728],
                     [-1.55581933, 1.09609604],
                     [1.55157493, -1.6981518]])
    y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_ncr_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""
    # Resample the data
    ncr = NeighbourhoodCleaningRule(return_indices=True,
                                    random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'ncr_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'ncr_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'ncr_idx.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_ncr_fit_sample_with_indices():
    ncr = NeighbourhoodCleaningRule(return_indices=True,
                                    random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y)

    X_gt = np.array([[0.34096173, 0.50947647],
                     [-0.91735824, 0.93110278],
                     [-0.20413357, 0.64628718],
                     [0.35967591, 2.61186964],
                     [0.90701028, -0.57636928],
                     [-1.20809175, -1.49917302],
                     [-0.60497017, -0.66630228],
                     [1.39272351, -0.51631728],
                     [-1.55581933, 1.09609604],
                     [1.55157493, -1.6981518]])
    y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    idx_gt = np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_ncr_fit_sample():
    """Test the fit sample routine"""
    # Resample the data
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = ncr.fit_sample(X, Y)

    X_gt = np.array([[-1.20809175, -1.49917302],
                     [-0.60497017, -0.66630228],
                     [-0.91735824, 0.93110278],
                     [-0.20413357, 0.64628718],
                     [0.35967591, 2.61186964],
                     [-1.55581933, 1.09609604],
                     [1.55157493, -1.6981518]])
    y_gt = np.array([0, 0, 1, 1, 2, 1, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_ncr_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""
    # Resample the data
    ncr = NeighbourhoodCleaningRule(return_indices=True,
                                    random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y)

    X_gt = np.array([[-1.20809175, -1.49917302],
                     [-0.60497017, -0.66630228],
                     [-0.91735824, 0.93110278],
                     [-0.20413357, 0.64628718],
                     [0.35967591, 2.61186964],
                     [-1.55581933, 1.09609604],
                     [1.55157493, -1.6981518]])
    y_gt = np.array([0, 0, 1, 1, 2, 1, 2])
    idx_gt = np.array([10, 11, 3, 5, 7, 13, 14])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_multiclass_fit_sample():
    """Test fit sample method with multiclass target"""
    # Make y multiclass
    y = Y.copy()
    y[0:1000] = 2

    # Resample the data
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = ncr.fit_sample(X, y)

    # Check the size of y
    count_y_res = Counter(y_resampled)
    assert_equal(count_y_res[0], 400)
    assert_equal(count_y_res[1], 2268)
    assert_equal(count_y_res[2], 42)
def test_ncr_fit_sample_nn_obj():
    # Resample the data
    nn = NearestNeighbors(n_neighbors=3)
    ncr = NeighbourhoodCleaningRule(return_indices=True,
                                    random_state=RND_SEED,
                                    n_neighbors=nn)
    X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y)

    X_gt = np.array([[-1.20809175, -1.49917302],
                     [-0.60497017, -0.66630228],
                     [-0.91735824, 0.93110278],
                     [0.35967591, 2.61186964],
                     [-1.55581933, 1.09609604],
                     [1.55157493, -1.6981518]])
    y_gt = np.array([0, 0, 1, 2, 1, 2])
    idx_gt = np.array([10, 11, 3, 7, 13, 14])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_ncr_fit_sample_mode():
    ncr = NeighbourhoodCleaningRule(kind_sel='mode')
    X_resampled, y_resampled = ncr.fit_sample(X, Y)

    X_gt = np.array([[0.34096173, 0.50947647],
                     [-0.91735824, 0.93110278],
                     [-0.20413357, 0.64628718],
                     [0.35967591, 2.61186964],
                     [0.90701028, -0.57636928],
                     [-1.20809175, -1.49917302],
                     [-0.60497017, -0.66630228],
                     [1.39272351, -0.51631728],
                     [-1.55581933, 1.09609604],
                     [1.55157493, -1.6981518]])
    y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_ncr_fit_sample_with_indices():
    ncr = NeighbourhoodCleaningRule(return_indices=True)
    X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y)

    X_gt = np.array([[0.34096173, 0.50947647],
                     [-0.91735824, 0.93110278],
                     [-0.20413357, 0.64628718],
                     [0.35967591, 2.61186964],
                     [0.90701028, -0.57636928],
                     [-1.20809175, -1.49917302],
                     [-0.60497017, -0.66630228],
                     [1.39272351, -0.51631728],
                     [-1.55581933, 1.09609604],
                     [1.55157493, -1.6981518]])
    y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    idx_gt = np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
rus = NeighbourhoodCleaningRule(sampling_strategy='all')
# rus = TomekLinks(sampling_strategy='all')
# rus = RandomUnderSampler(sampling_strategy="not minority")
# rus = OneSidedSelection(sampling_strategy='all', n_seeds_S=1000)

start_train = time.time()
X_rus, y_rus = rus.fit_sample(data_array_combined_flatten,
                              crops_only_flatten)
end_train = time.time()
print("Balancing time: ", end_train - start_train)
file.write("Balancing time: " + str(end_train - start_train) + "\n\n")

for i in unique_labels:
    print("Before Class " + str(i) + " number of samples: ",
          len(crops_only_flatten[crops_only_flatten == i]))
    file.write("Before Class " + str(i) + " number of samples: " +
               str(len(crops_only_flatten[crops_only_flatten == i])) + "\n")
    n_informative=3, n_redundant=1, flip_y=0, n_features=20,
    n_clusters_per_class=1, n_samples=200, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply the neighbourhood cleaning rule
ncl = NeighbourhoodCleaningRule(return_indices=True)
X_resampled, y_resampled, idx_resampled = ncl.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                   idx_resampled)

idx_class_0 = y_resampled == 0
plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1],
            alpha=.8, label='Class #0')
plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1],
            alpha=.8, label='Class #1')
def undersampling(X, Y):
    rus = NeighbourhoodCleaningRule(ratio='majority')
    x_new, y_new = rus.fit_sample(X, Y)
    return x_new, y_new
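# Note: `ratio` and `fit_sample` belong to the pre-0.4 imbalanced-learn
# API. A minimal equivalent sketch for imblearn >= 0.4, where `ratio`
# was renamed `sampling_strategy` and `fit_sample` became `fit_resample`;
# the function name `undersampling_new_api` is a hypothetical label.
from imblearn.under_sampling import NeighbourhoodCleaningRule

def undersampling_new_api(X, Y):
    ncr = NeighbourhoodCleaningRule(sampling_strategy='majority')
    return ncr.fit_resample(X, Y)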
from imblearn.under_sampling import NeighbourhoodCleaningRule

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply the neighbourhood cleaning rule
ncl = NeighbourhoodCleaningRule()
X_resampled, y_resampled = ncl.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
            alpha=0.5, edgecolor=almost_black, facecolor=palette[0],
            linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
            alpha=0.5, edgecolor=almost_black, facecolor=palette[2],
            linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
def test_deprecation_random_state():
    ncr = NeighbourhoodCleaningRule(random_state=0)
    with warns(DeprecationWarning,
               match="'random_state' is deprecated from 0.4"):
        ncr.fit_sample(X, Y)
def sampling(algorithm, x_train, y_train):
    if algorithm == 'standard':
        print('\nUsing Standard Scaler.\n')
        scaler = StandardScaler().fit(x_train)
        X_resampled = scaler.transform(x_train)
        y_resampled = y_train
    elif algorithm == 'undersampling':
        # Instantiate a PCA object for the sake of easy visualisation
        pca = PCA(n_components=2)
        # Fit and transform x to visualise inside a 2D feature space
        X_vis = pca.fit_transform(x_train)

        print('\nUsing Random Under Sampling.\n')
        rus = RandomUnderSampler(return_indices=True)
        X_resampled, y_resampled, idx_resampled = rus.fit_sample(x_train,
                                                                 y_train)
        X_res_vis = pca.transform(X_resampled)

        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)

        idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                           idx_resampled)
        idx_class_0 = y_resampled == 0
        plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1],
                    alpha=.8, label='Class #0')
        plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1],
                    alpha=.8, label='Class #1')
        plt.scatter(X_vis[idx_samples_removed, 0],
                    X_vis[idx_samples_removed, 1],
                    alpha=.8, label='Removed samples')

        # Make nice plotting
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.spines['left'].set_position(('outward', 10))
        ax.spines['bottom'].set_position(('outward', 10))
        ax.set_xlim([-6, 6])
        ax.set_ylim([-6, 6])

        plt.title('Under-sampling using random under-sampling')
        plt.legend()
        plt.tight_layout()
        plt.show()
    elif algorithm == 'smote':
        print('\nUsing SMOTE.\n')
        # Instantiate a PCA object for the sake of easy visualisation
        pca = PCA(n_components=2)
        # Fit and transform x to visualise inside a 2D feature space
        X_vis = pca.fit_transform(x_train)

        kinds = ['regular', 'borderline1', 'borderline2', 'svm']
        # Pick the SMOTE variant from the command line, defaulting to
        # 'regular' when no index is given
        kind = [kinds[int(sys.argv[2])] if len(sys.argv) >= 3
                else 'regular']
        print(kind)
        sm = [SMOTE(kind=k) for k in kind]
        X_resampled = []
        y_resampled = []
        X_res_vis = []
        for method in sm:
            X_res, y_res = method.fit_sample(x_train, y_train)
            X_resampled.append(X_res)
            y_resampled.append(y_res)
            X_res_vis.append(pca.transform(X_res))

        f, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2)
        ax2.axis('off')
        ax_res = [ax3, ax4, ax5, ax6]

        c0, c1 = plot_resampling(ax1, X_vis, y_train, 'Original set')
        for i in range(len(kind)):
            plot_resampling(ax_res[i], X_res_vis[i], y_resampled[i],
                            'SMOTE {}'.format(kind[i]))

        ax2.legend((c0, c1), ('Class #0', 'Class #1'), loc='center',
                   ncol=1, labelspacing=0.)
        plt.tight_layout()
        plt.show()
    elif algorithm == 'neighbourhood':
        print('\nUsing Neighbourhood Cleaning Rule.\n')
        pca = PCA(n_components=2)
        X_vis = pca.fit_transform(x_train)

        ncl = NeighbourhoodCleaningRule(return_indices=True)
        X_resampled, y_resampled, idx_resampled = ncl.fit_sample(x_train,
                                                                 y_train)
        X_res_vis = pca.transform(X_resampled)

        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)

        idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                           idx_resampled)
        idx_class_0 = y_resampled == 0
        plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1],
                    alpha=.8, label='Class #0')
        plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1],
                    alpha=.8, label='Class #1')
        plt.scatter(X_vis[idx_samples_removed, 0],
                    X_vis[idx_samples_removed, 1],
                    alpha=.8, label='Removed samples')

        # Make nice plotting
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.spines['left'].set_position(('outward', 10))
        ax.spines['bottom'].set_position(('outward', 10))
        ax.set_xlim([-6, 6])
        ax.set_ylim([-6, 6])

        plt.title('Under-sampling using neighbourhood cleaning rule')
        plt.legend()
        plt.tight_layout()
        plt.show()
    elif algorithm == 'ENN':
        print('\nUsing ENN.\n')
        enn = EditedNearestNeighbours(return_indices=True)
        X_resampled, y_resampled, idx_resampled = enn.fit_sample(x_train,
                                                                 y_train)
        reduction_str = ('Reduced {:.2f}%'.format(
            100 * (1 - float(len(X_resampled)) / len(x_train))))
        print(reduction_str)
    elif algorithm == 'RENN':
        print('\nUsing RENN.\n')
        renn = RepeatedEditedNearestNeighbours(return_indices=True)
        X_resampled, y_resampled, idx_resampled = renn.fit_sample(x_train,
                                                                  y_train)
        reduction_str = ('Reduced {:.2f}%'.format(
            100 * (1 - float(len(X_resampled)) / len(x_train))))
        print(reduction_str)
    elif algorithm == 'AllKNN':
        print('\nUsing AllKNN.\n')
        allknn = AllKNN(return_indices=True)
        X_resampled, y_resampled, idx_resampled = allknn.fit_sample(x_train,
                                                                    y_train)
        reduction_str = ('Reduced {:.2f}%'.format(
            100 * (1 - float(len(X_resampled)) / len(x_train))))
        print(reduction_str)
    elif algorithm == 'centroids':
        print('\nUsing Cluster Centroids.\n')
        # Apply Cluster Centroids
        cc = ClusterCentroids()
        X_resampled, y_resampled = cc.fit_sample(x_train, y_train)
    elif algorithm == 'centroidshard':
        print('\nUsing Cluster Centroids with Hard Voting.\n')
        pca = PCA(n_components=2)
        X_vis = pca.fit_transform(x_train)

        # Apply Cluster Centroids
        cc = ClusterCentroids()
        X_resampled, y_resampled = cc.fit_sample(x_train, y_train)
        X_res_vis_soft = pca.transform(X_resampled)

        # Use hard voting instead of soft voting
        cc = ClusterCentroids(voting='hard')
        X_resampled, y_resampled = cc.fit_sample(x_train, y_train)
        X_res_vis_hard = pca.transform(X_resampled)

        # Two subplots, unpack the axes array immediately
        f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

        c0 = ax1.scatter(X_vis[y_train == 0, 0], X_vis[y_train == 0, 1],
                         label="Class #0", alpha=0.5)
        c1 = ax1.scatter(X_vis[y_train == 1, 0], X_vis[y_train == 1, 1],
                         label="Class #1", alpha=0.5)
        ax1.set_title('Original set')

        ax2.scatter(X_res_vis_soft[y_resampled == 0, 0],
                    X_res_vis_soft[y_resampled == 0, 1],
                    label="Class #0", alpha=.5)
        ax2.scatter(X_res_vis_soft[y_resampled == 1, 0],
                    X_res_vis_soft[y_resampled == 1, 1],
                    label="Class #1", alpha=.5)
        ax2.scatter(X_vis[y_train == 1, 0], X_vis[y_train == 1, 1],
                    label="Original #1", alpha=0.2)
        ax2.set_title('Cluster centroids with soft voting')

        ax3.scatter(X_res_vis_hard[y_resampled == 0, 0],
                    X_res_vis_hard[y_resampled == 0, 1],
                    label="Class #0", alpha=.5)
        ax3.scatter(X_res_vis_hard[y_resampled == 1, 0],
                    X_res_vis_hard[y_resampled == 1, 1],
                    label="Class #1", alpha=.5)
        ax3.scatter(X_vis[y_train == 1, 0], X_vis[y_train == 1, 1],
                    alpha=0.2)
        ax3.set_title('Cluster centroids with hard voting')

        # Make nice plotting
        for ax in (ax1, ax2, ax3):
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)
            ax.get_xaxis().tick_bottom()
            ax.get_yaxis().tick_left()
            ax.spines['left'].set_position(('outward', 10))
            ax.spines['bottom'].set_position(('outward', 10))
            ax.set_xlim([-6, 8])
            ax.set_ylim([-6, 6])

        plt.figlegend((c0, c1),
                      ('Class #0', 'Class #1', 'Original Class #1'),
                      loc='lower center', ncol=3, labelspacing=0.)
        plt.tight_layout(pad=3)
        plt.show()
    else:
        # Instantiate a PCA object for the sake of easy visualisation
        pca = PCA(n_components=2)
        # Fit and transform x to visualise inside a 2D feature space
        X_vis = pca.fit_transform(x_train)
        return x_train, y_train

    return X_resampled, y_resampled
X_resampled, y_resampled = cnn.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

# CondensedNearestNeighbour is clearly sensitive to noise and easily
# adds noisy samples to the set C. For this reason, OneSidedSelection
# uses the TomekLinks method to remove the noisy (majority-class) samples.
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection(random_state=0)
X_resampled, y_resampled = oss.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

# NeighbourhoodCleaningRule focuses on cleaning the data rather than
# condensing it. It therefore removes the union of the samples rejected
# by EditedNearestNeighbours and those misclassified by a 3-NN classifier.
from imblearn.under_sampling import NeighbourhoodCleaningRule
ncr = NeighbourhoodCleaningRule(random_state=0)
X_resampled, y_resampled = ncr.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

# InstanceHardnessThreshold is a rather particular method: it fits a
# classifier on the data and removes the samples whose predicted
# probability falls below a threshold.
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import InstanceHardnessThreshold
iht = InstanceHardnessThreshold(random_state=0,
                                estimator=LogisticRegression())
X_resampled, y_resampled = iht.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
# [(0, 64), (1, 64), (2, 64)]

# Combining over-sampling with under-sampling:
# with SMOTE, interpolating between boundary samples and other samples
# during over-sampling can easily generate noisy data.
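# Following on from the remark above, a minimal sketch of cleaning up
# SMOTE's interpolation noise with a combined method: SMOTEENN from
# imblearn.combine applies SMOTE and then EditedNearestNeighbours.
# The same X, y as in the snippets above are assumed.
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))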
def fit(self, X, y):
    if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
        raise NotImplementedError('Multilabel and multi-output'
                                  ' classification is not supported.')
    if self.estimators is None or len(self.estimators) == 0:
        raise AttributeError('Invalid `estimators` attribute, `estimators`'
                             ' should be a list of (string, estimator)'
                             ' tuples')

    cv_predictions = []
    targets = []

    # Clone the base estimators
    self.estimators_ = [
        clone(estimator) for _, estimator in self.estimators
    ]

    # Add the AdaBoost classifiers
    for clf in self.estimators_ada:
        self.clfs.append(
            AdaBoostClassifier(clone(clf), n_estimators=self.n_estimators))

    # Add the Bagging classifiers
    for clf in self.estimators_bag:
        self.clfs.append(
            BaggingClassifier(clone(clf), n_estimators=100,
                              max_samples=0.9))

    self.clfs.append(
        StackingClassifier(classifiers=self.estimators_,
                           meta_classifier=LogisticRegression()))
    self.clfs.append(clf_expert(self.estimators))

    # Evaluate the classifiers
    for clf in self.clfs:
        testpredict, testtarget = cross_val_pred2ict(clf, X, y,
                                                     cv=self.n_folds,
                                                     n_jobs=1)
        cv_predictions.append(testpredict)
        targets.append(testtarget)

    skf = StratifiedKFold(n_splits=2, random_state=self.random_st)

    # Train and evaluate the classifiers on the SMOTE- and NCR-resampled
    # training folds
    for clf in self.clfs:
        for method, name in zip(self.methoda, self.name_met):
            metodaa = SMOTE(k_neighbors=3, random_state=self.random_st)
            metodaj = NeighbourhoodCleaningRule(
                n_neighbors=3, random_state=self.random_st)
            predict_re = []
            targets_re = []
            for train_index, test_index in skf.split(X, y):
                if method == 0:
                    data_re, tar_re = metodaa.fit_sample(
                        np.asarray(X[train_index]),
                        np.asarray(y[train_index]))
                else:
                    data_re, tar_re = metodaj.fit_sample(
                        np.asarray(X[train_index]),
                        np.asarray(y[train_index]))
                clf_ = clone(clf)
                # Training
                clf_.fit(data_re, tar_re)
                # Testing
                predict_re.append(clf_.predict(X[test_index]))
                targets_re.append(y[test_index])
            cv_predictions.append(predict_re)
            targets.append(targets_re)

    # Select the best-performing experts
    for idx, (prediction, target) in enumerate(zip(cv_predictions,
                                                   targets)):
        matrixes1 = []
        matrixes2 = []
        for pred, tar in zip(prediction, target):
            matrixes1.append(simplefunctions.confusion_matrix(tar, pred))
        for matrix in matrixes1:
            matrixes2.append(
                np.array([[matrix[1, 1], matrix[1, 0]],
                          [matrix[0, 1], matrix[0, 0]]]))
        fun_cmp = getattr(simplefunctions,
                          self.function_compare)(matrixes1)
        if fun_cmp > self.max_g[0]:
            self.clf_id[1] = self.clf_id[0]
            self.clf_id[0] = idx
            self.max_g[1] = self.max_g[0]
            self.max_g[0] = fun_cmp
        elif fun_cmp > self.max_g[1]:
            self.clf_id[2] = self.clf_id[1]
            self.clf_id[1] = idx
            self.max_g[2] = self.max_g[1]
            self.max_g[1] = fun_cmp
        elif fun_cmp > self.max_g[2]:
            self.clf_id[2] = idx
            self.max_g[2] = fun_cmp

    for clf_id in self.clf_id:
        if clf_id > len(self.estimators_ada) + len(self.estimators_bag):
            # Even ids map to the first resampling method, odd ids to the
            # second; integer division recovers the base classifier index
            met = self.methods[0] if clf_id % 2 == 0 else self.methods[1]
            data_re, tar_re = met.fit_sample(X, y)
            clf_ = clone(self.clfs[(clf_id - 7) // 2])
            self.ensemble_.append(clf_.fit(data_re, tar_re))
        else:
            clf_ = clone(self.clfs[clf_id])
            self.ensemble_.append(clf_.fit(X, y))

    meta_features = self._predict_meta_features(X)
    self.meta_clf_.fit(meta_features, y)
# CREATE TRAIN TEST DATASETS FOR ML CLASSIFIERS
train_test_ratio = 0.2

if clean_dataset == 'yes':
    file.write("UnderSampling: " + str(clean_dataset) + "\n\n")
    total_elements = len(labels)
    background_elements = len(labels[labels != 0])
    resample_dict = {0: int(background_elements)}

    rus = NeighbourhoodCleaningRule(sampling_strategy='all')
    # rus = TomekLinks(sampling_strategy='all')
    # rus = RandomUnderSampler(sampling_strategy="not minority")
    # rus = OneSidedSelection(sampling_strategy='all', n_seeds_S=1000)

    start_train = time.time()
    X_rus, y_rus = rus.fit_sample(data, labels)
    end_train = time.time()
    print("Balancing time: ", end_train - start_train)
    file.write("Balancing time: " + str(end_train - start_train) + "\n\n")

    for i in labels_nums:
        print("Before Class " + str(i) + " number of samples: ",
              len(labels[labels == i]))
        file.write("Before Class " + str(i) + " number of samples: " +
                   str(len(labels[labels == i])) + "\n")
    file.write("\n")

    for i in labels_nums:
        print("After Class " + str(i) + " number of samples: ",
              len(y_rus[y_rus == i]))
        file.write("After Class " + str(i) + " number of samples: " +
                   str(len(y_rus[y_rus == i])) + "\n")
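# The per-class counts above can also be gathered in one pass with
# collections.Counter; a minimal sketch, assuming the same `labels` and
# `y_rus` arrays as in the snippet above.
from collections import Counter

before_counts = Counter(labels)
after_counts = Counter(y_rus)
for i in sorted(before_counts):
    print("Class {}: {} -> {}".format(i, before_counts[i],
                                      after_counts.get(i, 0)))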
def test_ncr_wrong_nn_obj():
    nn = 'rnd'
    ncr = NeighbourhoodCleaningRule(return_indices=True, n_neighbors=nn)
    with raises(ValueError, match="has to be one of"):
        ncr.fit_sample(X, Y)
        rows[i].append(score)
    print("----------")
    print(str(clf))
    print_scores(testpredict, testtarget)

    # NCR
    ncr_ = NeighbourhoodCleaningRule(random_state=random_st, n_neighbors=3)
    scores = []
    # Repeat several times and compute the average
    for iteration in range(iterations):
        predict_re = []
        targets_re = []
        for train_index, test_index in skf.split(db.data, db.target):
            data_re, tar_re = ncr_.fit_sample(db.data[train_index],
                                              db.target[train_index])
            clf_ = clone(clf)
            # Training
            clf_.fit(data_re, tar_re)
            # Testing
            predict_re.append(clf_.predict(db.data[test_index]))
            targets_re.append(db.target[test_index])
        # Compute the cross-validation score
        scores.append(accsespf1g(predict_re, targets_re))
    print("NCR")
    print(str(clf))
    print_scores(predict_re, targets_re)
    avgscores = avgaccsespf1g(scores)