def test_renn_fit_sample_mode():
    """Check RENN resampling when neighbours vote with ``kind_sel='mode'``.

    A 4-neighbour ``NearestNeighbors`` estimator is handed to the sampler
    and the resampled data is compared against hard-coded expectations.
    """
    neighbours_estimator = NearestNeighbors(n_neighbors=4)
    sampler = RepeatedEditedNearestNeighbours(
        n_neighbors=neighbours_estimator,
        random_state=RND_SEED,
        kind_sel='mode')
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Expected samples and labels kept by the sampler.
    expected_X = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [-0.12840393, 0.66446571],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [-0.35946678, 0.72510189],
        [2.94290565, -0.13986434],
        [-1.10146139, 0.91782682],
        [0.73489726, 0.43915195],
        [-0.28479268, 0.70459548],
        [1.84864913, 0.14729596],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [1.67314371, 0.19231498],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [1.32319756, -0.13181616],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    expected_y = np.array([
        0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    ])

    for actual, expected in ((X_resampled, expected_X),
                             (y_resampled, expected_y)):
        assert_array_equal(actual, expected)
def test_renn_fit_sample():
    """Check that seeded RENN resampling returns the expected samples."""
    sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Expected samples and labels kept by the sampler.
    expected_X = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [0.73489726, 0.43915195],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    expected_y = np.array([
        0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    ])

    for actual, expected in ((X_resampled, expected_X),
                             (y_resampled, expected_y)):
        assert_array_equal(actual, expected)
def test_renn_fit_sample_mode():
    """Resample with ``kind_sel='mode'`` and compare with fixed expectations.

    The sampler receives a ``NearestNeighbors(n_neighbors=4)`` estimator
    rather than an integer neighbour count.
    """
    knn = NearestNeighbors(n_neighbors=4)
    sampler = RepeatedEditedNearestNeighbours(
        n_neighbors=knn, random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Samples expected to survive the repeated editing.
    X_expected = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [-0.12840393, 0.66446571],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [-0.35946678, 0.72510189],
        [2.94290565, -0.13986434],
        [-1.10146139, 0.91782682],
        [0.73489726, 0.43915195],
        [-0.28479268, 0.70459548],
        [1.84864913, 0.14729596],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [1.67314371, 0.19231498],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [1.32319756, -0.13181616],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    y_expected = np.array([
        0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    ])

    assert_array_equal(X_resampled, X_expected)
    assert_array_equal(y_resampled, y_expected)
def test_renn_fit_sample_with_indices():
    """Check samples, labels and kept indices when ``return_indices=True``."""
    sampler = RepeatedEditedNearestNeighbours(
        return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Expected samples, labels and indices reported by the sampler.
    expected_X = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [0.73489726, 0.43915195],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    expected_y = np.array([
        0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    ])
    expected_idx = np.array([
        6, 13, 32, 39,
        4, 5, 16, 22, 23, 24, 30, 37,
        2, 11, 12, 17, 20, 21, 25, 26, 28, 31, 33, 34, 35, 36,
    ])

    for actual, expected in ((X_resampled, expected_X),
                             (y_resampled, expected_y),
                             (idx_under, expected_idx)):
        assert_array_equal(actual, expected)
def rnn_undersampling(
        self, x: pandas.DataFrame, y: numpy.ndarray,
        neighbors: int) -> typing.Tuple[numpy.ndarray, numpy.ndarray]:
    """
    Undersample with Repeated Edited Nearest Neighbours (RENN).

    Args:
        x: X training covariates for the ML model. They are first passed
            through ``self.check_id``.
        y: y training binary outcomes of the ML model.
        neighbors: Forwarded to the sampler as ``n_neighbors``.

    Returns:
        The ``(X_resampled, y_resampled)`` pair produced by the sampler's
        ``fit_sample``; the indices it also returns are discarded.
    """
    x = self.check_id(x)
    rnn_undersampler = RepeatedEditedNearestNeighbours(
        random_state=82,
        n_neighbors=neighbors,
        return_indices=True,
        kind_sel="mode",
        max_iter=400,
        ratio="majority",
    )
    # The sampler is given deep copies rather than the caller's own objects.
    X_resampled, y_resampled, _ = rnn_undersampler.fit_sample(
        copy.deepcopy(x), copy.deepcopy(y))

    # Lazy %-style arguments: the message is only built if INFO is enabled.
    LOGGER.info(X_resampled)
    LOGGER.info(
        "RNN undersampling yielded %d number of X_resampled observations",
        len(X_resampled))
    LOGGER.info(y_resampled)
    LOGGER.info(
        "RNN undersampling yielded %d number of y_resampled observations",
        len(y_resampled))
    return X_resampled, y_resampled
def test_renn_fit_sample_with_indices():
    """Verify RENN output, including selected indices, for a fixed seed."""
    sampler = RepeatedEditedNearestNeighbours(
        return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Ground truth for the kept samples, their labels and their indices.
    X_expected = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [0.73489726, 0.43915195],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    y_expected = np.array([
        0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    ])
    idx_expected = np.array([
        6, 13, 32, 39,
        4, 5, 16, 22, 23, 24, 30, 37,
        2, 11, 12, 17, 20, 21, 25, 26, 28, 31, 33, 34, 35, 36,
    ])

    assert_array_equal(X_resampled, X_expected)
    assert_array_equal(y_resampled, y_expected)
    assert_array_equal(idx_under, idx_expected)
def test_renn_fit_sample():
    """Compare seeded RENN output with the ``.npy`` fixtures in ``data/``."""
    sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Fixtures are stored in the ``data`` directory next to this file.
    data_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'renn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'renn_y.npy'))

    for actual, expected in ((X_resampled, expected_X),
                             (y_resampled, expected_y)):
        assert_array_equal(actual, expected)
def test_renn_fit_sample_with_indices():
    """Compare RENN output and indices with the ``.npy`` fixtures."""
    sampler = RepeatedEditedNearestNeighbours(
        return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Fixtures are stored in the ``data`` directory next to this file.
    data_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'renn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'renn_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'renn_idx.npy'))

    for actual, expected in ((X_resampled, expected_X),
                             (y_resampled, expected_y),
                             (idx_under, expected_idx)):
        assert_array_equal(actual, expected)
def test_multiclass_fit_sample():
    """Check per-class sample counts after RENN on a three-class target."""
    # Relabel the first 1000 targets as class 2 on a copy of Y.
    multiclass_y = Y.copy()
    multiclass_y[:1000] = 2

    sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    _, y_resampled = sampler.fit_sample(X, multiclass_y)

    class_counts = Counter(y_resampled)
    for label, expected_count in ((0, 378), (1, 1828), (2, 5)):
        assert_equal(class_counts[label], expected_count)
def test_renn_fit_sample():
    """Check that a default-constructed RENN returns the expected samples."""
    sampler = RepeatedEditedNearestNeighbours()
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Expected samples and labels kept by the sampler.
    expected_X = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [0.73489726, 0.43915195],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    expected_y = np.array([
        0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    ])

    for actual, expected in ((X_resampled, expected_X),
                             (y_resampled, expected_y)):
        assert_array_equal(actual, expected)
def test_renn_iter_wrong():
    """A negative ``max_iter`` must make ``fit_sample`` raise ValueError."""
    # Built outside the ``raises`` block so only fit_sample is under test.
    sampler = RepeatedEditedNearestNeighbours(max_iter=-1)
    with raises(ValueError):
        sampler.fit_sample(X, Y)
continue b_ref_dict[B] = [c_id, c_score, ref_dict_B] ref_by_b = selectTopKref(b_ref_dict) ref_overlap_A = overlapWithA(ref_by_b, ref_dict_A) return ref_by_b, ref_overlap_A, ref_dict_A # 14608(0.7,yes) 14895(0.047,no) dictionary = gensim.corpora.Dictionary.load("dictionary") tf_idf = gensim.models.TfidfModel.load("tf_idf") X_no, y_no = getXY("features_no.csv") X_yes, y_yes = getXY("features_yes.csv") training_testing_variation = 0.3 wiki_As, train_yes, train_no = getTrainingTestingData("features9.csv", len(X_yes), scale=1) print "No. yes", len(train_yes) print "No. no", len(train_no) print "Test set of A's", len(wiki_As) X_train = train_yes + train_no y_train = ['Y'] * len(train_yes) + ['N'] * len(train_no) renn = RepeatedEditedNearestNeighbours(random_state=0) X_resampled, y_resampled = renn.fit_sample(X_train, y_train) save_pickle(X_resampled, "X_resampled_renn_9_1234567") save_pickle(y_resampled, "y_resampled_renn_9_1234567")
# Edited nearest neighbours panel.
# NOTE(review): `enn`, `pca`, `X`, `y`, the axes, `palette` and
# `almost_black` are defined before this chunk — confirm against the full
# script.
X_resampled, y_resampled = enn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)
# Raw string: prints the same backslash-percent text as before without
# relying on the invalid '\%' escape sequence.
print(r'Reduced {:.2f}\%'.format(
    100 * (1 - float(len(X_resampled)) / len(X))))

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax2.set_title('Edited nearest neighbours')

# Apply the RENN
print('RENN')
renn = RepeatedEditedNearestNeighbours()
X_resampled, y_resampled = renn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)
print(r'Reduced {:.2f}\%'.format(
    100 * (1 - float(len(X_resampled)) / len(X))))

ax3.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax3.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax3.set_title('Repeated Edited nearest neighbours')

# Apply the AllKNN
print('AllKNN')
allknn = AllKNN()
X_resampled, y_resampled = allknn.fit_sample(X, y)
def _smote_kind_from_argv(argv, kinds):
    """Return the SMOTE variant selected by the optional third CLI argument.

    With no third argument the first entry of ``kinds`` is used. The
    argument may be a variant name or an integer index into ``kinds``.
    """
    if len(argv) < 3:
        return kinds[0]
    choice = argv[2]
    if choice in kinds:
        return choice
    return kinds[int(choice)]


def _style_axes(ax, xlim, ylim):
    """Hide the top/right spines, offset the others and set axis limits."""
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)


def _plot_removed_samples(X_vis, X_res_vis, y_resampled, idx_resampled,
                          title):
    """Plot kept samples per class plus the removed ones in 2D PCA space."""
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                       idx_resampled)
    idx_class_0 = y_resampled == 0
    plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1],
                alpha=.8, label='Class #0')
    plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1],
                alpha=.8, label='Class #1')
    plt.scatter(X_vis[idx_samples_removed, 0],
                X_vis[idx_samples_removed, 1],
                alpha=.8, label='Removed samples')

    # make nice plotting
    _style_axes(ax, [-6, 6], [-6, 6])

    plt.title(title)
    plt.legend()
    plt.tight_layout()
    plt.show()


def _print_reduction(X_resampled, x_train):
    """Print the percentage of samples removed by an under-sampler."""
    reduction_str = ('Reduced {:.2f}%'.format(
        100 * (1 - float(len(X_resampled)) / len(x_train))))
    print(reduction_str)


def sampling(algorithm, x_train, y_train):
    """Scale or resample the training data with the requested algorithm.

    Args:
        algorithm: One of 'standard', 'undersampling', 'smote',
            'neighbourhood', 'ENN', 'RENN', 'AllKNN', 'centroids' or
            'centroidshard'. Any other value returns the inputs unchanged.
        x_train: Training features.
        y_train: Training labels.

    Returns:
        ``(X_resampled, y_resampled)``. For 'smote' both elements are
        one-element lists holding the result of the selected SMOTE variant.
        For 'centroidshard' the hard-voting result is returned.

    Several branches also open matplotlib figures as a side effect.
    """
    if algorithm == 'standard':
        print('\nUsing Standard Scaler.\n')
        scaler = StandardScaler().fit(x_train)
        X_resampled = scaler.transform(x_train)
        y_resampled = y_train

    elif algorithm == 'undersampling':
        # Project onto two PCA components for easy visualisation.
        pca = PCA(n_components=2)
        X_vis = pca.fit_transform(x_train)

        print('\nUsing Random Under Sampling.\n')
        rus = RandomUnderSampler(return_indices=True)
        X_resampled, y_resampled, idx_resampled = rus.fit_sample(x_train,
                                                                 y_train)
        X_res_vis = pca.transform(X_resampled)
        _plot_removed_samples(X_vis, X_res_vis, y_resampled, idx_resampled,
                              'Under-sampling using random under-sampling')

    elif algorithm == 'smote':
        print('\nUsing SMOTE.\n')
        # Project onto two PCA components for easy visualisation.
        pca = PCA(n_components=2)
        X_vis = pca.fit_transform(x_train)

        kinds = ['regular', 'borderline1', 'borderline2', 'svm']
        # Previously a missing third CLI argument evaluated int('regular')
        # and raised ValueError; it now falls back to the first variant.
        kind = [_smote_kind_from_argv(sys.argv, kinds)]
        print(kind)
        sm = [SMOTE(kind=k) for k in kind]

        X_resampled = []
        y_resampled = []
        X_res_vis = []
        for method in sm:
            X_res, y_res = method.fit_sample(x_train, y_train)
            X_resampled.append(X_res)
            y_resampled.append(y_res)
            X_res_vis.append(pca.transform(X_res))

        f, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2)
        # ax2 only hosts the legend.
        ax2.axis('off')
        ax_res = [ax3, ax4, ax5, ax6]

        c0, c1 = plot_resampling(ax1, X_vis, y_train, 'Original set')
        for i, kind_name in enumerate(kind):
            plot_resampling(ax_res[i], X_res_vis[i], y_resampled[i],
                            'SMOTE {}'.format(kind_name))

        ax2.legend((c0, c1), ('Class #0', 'Class #1'), loc='center',
                   ncol=1, labelspacing=0.)
        plt.tight_layout()
        plt.show()

    elif algorithm == 'neighbourhood':
        print('\nUsing Neighbourhood Cleaning Rule.\n')
        pca = PCA(n_components=2)
        X_vis = pca.fit_transform(x_train)

        ncl = NeighbourhoodCleaningRule(return_indices=True)
        X_resampled, y_resampled, idx_resampled = ncl.fit_sample(x_train,
                                                                 y_train)
        X_res_vis = pca.transform(X_resampled)
        _plot_removed_samples(
            X_vis, X_res_vis, y_resampled, idx_resampled,
            'Under-sampling using neighbourhood cleaning rule')

    elif algorithm == 'ENN':
        print('\nUsing ENN.\n')
        enn = EditedNearestNeighbours(return_indices=True)
        X_resampled, y_resampled, idx_resampled = enn.fit_sample(x_train,
                                                                 y_train)
        _print_reduction(X_resampled, x_train)

    elif algorithm == 'RENN':
        print('\nUsing RENN.\n')
        renn = RepeatedEditedNearestNeighbours(return_indices=True)
        X_resampled, y_resampled, idx_resampled = renn.fit_sample(x_train,
                                                                  y_train)
        _print_reduction(X_resampled, x_train)

    elif algorithm == 'AllKNN':
        print('\nUsing AllKNN.\n')
        allknn = AllKNN(return_indices=True)
        X_resampled, y_resampled, idx_resampled = allknn.fit_sample(x_train,
                                                                    y_train)
        _print_reduction(X_resampled, x_train)

    elif algorithm == 'centroids':
        print('\nUsing Cluster Centroids.\n')
        # Apply Cluster Centroids
        cc = ClusterCentroids()
        X_resampled, y_resampled = cc.fit_sample(x_train, y_train)

    elif algorithm == 'centroidshard':
        print('\nUsing Cluster Centroids with Hard Voting.\n')
        pca = PCA(n_components=2)
        X_vis = pca.fit_transform(x_train)

        # Apply Cluster Centroids
        cc = ClusterCentroids()
        X_resampled, y_resampled = cc.fit_sample(x_train, y_train)
        X_res_vis_soft = pca.transform(X_resampled)

        # Use hard voting instead of soft voting
        cc = ClusterCentroids(voting='hard')
        X_resampled, y_resampled = cc.fit_sample(x_train, y_train)
        X_res_vis_hard = pca.transform(X_resampled)

        # Three panels: original, soft voting, hard voting.
        f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

        c0 = ax1.scatter(X_vis[y_train == 0, 0], X_vis[y_train == 0, 1],
                         label="Class #0", alpha=0.5)
        c1 = ax1.scatter(X_vis[y_train == 1, 0], X_vis[y_train == 1, 1],
                         label="Class #1", alpha=0.5)
        ax1.set_title('Original set')

        # NOTE(review): the soft-voting points are masked with the labels
        # from the hard-voting run — confirm both runs give the same labels.
        ax2.scatter(X_res_vis_soft[y_resampled == 0, 0],
                    X_res_vis_soft[y_resampled == 0, 1],
                    label="Class #0", alpha=.5)
        ax2.scatter(X_res_vis_soft[y_resampled == 1, 0],
                    X_res_vis_soft[y_resampled == 1, 1],
                    label="Class #1", alpha=.5)
        ax2.scatter(X_vis[y_train == 1, 0], X_vis[y_train == 1, 1],
                    label="Original #1", alpha=0.2)
        ax2.set_title('Cluster centroids with soft voting')

        ax3.scatter(X_res_vis_hard[y_resampled == 0, 0],
                    X_res_vis_hard[y_resampled == 0, 1],
                    label="Class #0", alpha=.5)
        ax3.scatter(X_res_vis_hard[y_resampled == 1, 0],
                    X_res_vis_hard[y_resampled == 1, 1],
                    label="Class #1", alpha=.5)
        ax3.scatter(X_vis[y_train == 1, 0], X_vis[y_train == 1, 1],
                    alpha=0.2)
        ax3.set_title('Cluster centroids with hard voting')

        # make nice plotting
        for ax in (ax1, ax2, ax3):
            _style_axes(ax, [-6, 8], [-6, 6])

        # NOTE(review): two handles are supplied for three labels — confirm
        # whether 'Original Class #1' should have its own handle.
        plt.figlegend((c0, c1),
                      ('Class #0', 'Class #1', 'Original Class #1'),
                      loc='lower center', ncol=3, labelspacing=0.)
        plt.tight_layout(pad=3)
        plt.show()

    else:
        # Unknown algorithm: hand the data back untouched. The PCA that
        # used to be fitted here was never used, so it has been dropped.
        return x_train, y_train

    return X_resampled, y_resampled
def pre_process(train_index, test_index):
    """Prepare one train/test split: class balancing, then feature selection.

    Reads ``X_train_all``, ``y_train``, ``X_df``, ``p_val_thresh``,
    ``class_balance_method`` and ``feature_selection_method`` from the
    enclosing scope.

    Args:
        train_index: Index used to select the training rows.
        test_index: Index used to select the test rows.

    Returns:
        ``(train_x, train_y, test_x, test_y, selected_features,
        feature_scores)``. ``feature_scores`` stays ``'N/A'`` unless a
        relief-based method (reliefF / multisurf) is used.

    Raises:
        ValueError: If ``feature_selection_method`` is not recognised.
            Previously this surfaced as an UnboundLocalError at the return.
    """
    train_x, test_x = X_train_all[train_index], X_train_all[test_index]
    train_y, test_y = y_train[train_index], y_train[test_index]

    # Class balance on the training split only; the test split is untouched.
    # An unrecognised class_balance_method leaves the split as it is.
    if class_balance_method == 'rand_under':
        rus = RandomUnderSampler(sampling_strategy='majority',
                                 random_state=0)
        train_x, train_y = rus.fit_sample(train_x, train_y)
    else:
        # Tomek links run first in the combined 'tomek_*' methods.
        if class_balance_method in ('tomek', 'tomek_enn', 'tomek_renn'):
            tl = TomekLinks(random_state=0)
            train_x, train_y = tl.fit_sample(train_x, train_y)

        if class_balance_method in ('enn', 'tomek_enn'):
            enn = EditedNearestNeighbours(n_neighbors=5, random_state=0,
                                          n_jobs=1)
            train_x, train_y = enn.fit_sample(train_x, train_y)
        elif class_balance_method in ('renn', 'tomek_renn'):
            renn = RepeatedEditedNearestNeighbours(n_neighbors=5,
                                                   random_state=0, n_jobs=1)
            train_x, train_y = renn.fit_sample(train_x, train_y)

    # Feature selection on the training split.
    # Scores are only produced by the relief-based methods.
    feature_scores = 'N/A'
    if feature_selection_method == 'no':
        selected_features = X_df.columns
    elif feature_selection_method == 'chi2':
        selected_features, X_train_df, train_x, test_x = chi2_fs(
            X_df, train_x, test_x, train_y, p_val_thresh)
    elif feature_selection_method == 'anovaF':
        selected_features, X_train_df, train_x, test_x = anova_fs(
            X_df, train_x, test_x, train_y, p_val_thresh)
    elif feature_selection_method == 'reliefF':
        selected_features, feature_scores, train_x, test_x = relieff_fs(
            X_df, train_x, test_x, train_y)
    elif feature_selection_method == 'multisurf':
        selected_features, feature_scores, train_x, test_x = multisurf_fs(
            X_df, train_x, test_x, train_y)
    elif feature_selection_method in ('chi2_reliefF', 'chi2_multisurf',
                                      'anova_reliefF', 'anova_multisurf'):
        # Two stages: a statistical filter, then a relief-based ranker run
        # on the filtered data. Conditional expressions keep the helper
        # lookup lazy, so only the two helpers actually used are resolved.
        filter_fs = (chi2_fs if feature_selection_method.startswith('chi2')
                     else anova_fs)
        ranker_fs = (relieff_fs
                     if feature_selection_method.endswith('reliefF')
                     else multisurf_fs)
        _, X_train_df, filtered_train, filtered_test = filter_fs(
            X_df, train_x, test_x, train_y, p_val_thresh)
        selected_features, feature_scores, train_x, test_x = ranker_fs(
            X_train_df, filtered_train, filtered_test, train_y)
    else:
        raise ValueError(
            "Unknown feature_selection_method: {!r}".format(
                feature_selection_method))

    return train_x, train_y, test_x, test_y, selected_features, feature_scores
def test_deprecation_random_state():
    """Passing ``random_state`` must emit a DeprecationWarning on fit."""
    sampler = RepeatedEditedNearestNeighbours(random_state=0)
    expected_message = "'random_state' is deprecated from 0.4"
    with warns(DeprecationWarning, match=expected_message):
        sampler.fit_sample(X, Y)
# Build the feature matrix from the UserData frame.
# NOTE(review): this first selection (with wk6/wk8) is overwritten by the
# next assignment and never used — confirm which feature set is intended.
x = np.array(
    UserData.
    loc[:,
        ['wk2', 'wk4', 'wk6', 'wk8', 'Nchans', 'Nusers', 'chanScore']])
x = np.array(UserData.loc[:,
                          ['wk2', 'wk4', 'Nchans', 'Nusers', 'chanScore']])
# Replace NaN/inf values with finite numbers.
# NOTE(review): `y` is defined before this chunk — confirm against the full
# script.
x = np.nan_to_num(x)
y = np.nan_to_num(y)

# ~~~~~~~~~~~~~~ resampling to deal with class imbalance ~~~~~~~~~~~~~~~~~~~
sm = SMOTE(kind='svm')
tm = TomekLinks()
renn = RepeatedEditedNearestNeighbours()
# NOTE(review): every sampler is fit on the original (x, y) and writes to the
# same names, so only the RENN result survives in x_res/y_res — confirm
# whether the samplers were meant to be chained or are alternatives.
x_res, y_res = sm.fit_sample(x, y)
x_res, y_res = tm.fit_sample(x, y)
x_res, y_res = renn.fit_sample(x, y)

# NOTE(review): the split of the original data is overwritten by the split of
# the resampled data, so the test set is also drawn from the resampled data —
# confirm this is intended.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2)
train_x, test_x, train_y, test_y = train_test_split(x_res,
                                                    y_res,
                                                    test_size=0.2)
# Correlation matrix between feature columns; x is transposed so that each
# feature becomes a row.
C = np.corrcoef(x.T)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ random forest
RF = RandomForestClassifier(min_samples_leaf=5)
RF = RF.fit(train_x, train_y)
y_pred = RF.predict(test_x)
y_score = RF.predict_proba(test_x)
# Score of the fitted model on the held-out split.
print(RF.score(test_x, test_y))
'sentiment'] = X_test.loc[:, 'body_polarity'] + X_test.loc[:, 'title_polarity'] + X_test.loc[:, 'body_subjectivity'] + X_test.loc[:, 'title_subjectivity'] X_train.drop(dropcols, axis=1, inplace=True) X_test.drop(dropcols, axis=1, inplace=True) print(X_train.head) from imblearn.under_sampling import (AllKNN, EditedNearestNeighbours, RepeatedEditedNearestNeighbours) print('RENN') enn = RepeatedEditedNearestNeighbours(return_indices=True) X_res, Y_res, idx_res = enn.fit_sample(X_train, Y_train) reduction_str = ('Reduced {:.2f}%'.format( 100 * (1 - float(len(X_res)) / len(X_train)))) print(reduction_str) print(X_res.shape, Y_res.shape) print(Y_res.sum(), Y_train.sum()) from xgboost import XGBClassifier from sklearn.model_selection import train_test_split import xgboost as xgb X_rs_trn, X_rs_val, Y_res_trn, Y_res_val = train_test_split(X_res, Y_res, test_size=0.075, shuffle=True,
try: X_resampled, Y_resampled = nm1.fit_sample(X, Y) except Exception, e: print str(e) X_resampled, Y_resampled = X, Y elif index == 5: enn = EditedNearestNeighbours(random_state=0) try: X_resampled, Y_resampled = enn.fit_sample(X, Y) except Exception, e: print str(e) X_resampled, Y_resampled = X, Y elif index == 6: renn = RepeatedEditedNearestNeighbours(random_state=0) try: X_resampled, Y_resampled = renn.fit_sample(X, Y) except Exception, e: print str(e) X_resampled, Y_resampled = X, Y elif index == 7: allknn = AllKNN(random_state=0) try: X_resampled, Y_resampled = allknn.fit_sample(X, Y) except Exception, e: print str(e) X_resampled, Y_resampled = X, Y return X_resampled, Y_resampled algo_list = ['dt', 'GaNB', 'linear_svc', 'logistic', 'nn', 'rf', 'svc']
] return classifier_list, classifier_name_list def print_evaluation_metrics(trained_model, trained_model_name, X_test, y_test): print '--------- For Model : ', trained_model_name, ' ---------------\n' predicted_values = trained_model.predict(X_test) print metrics.classification_report(y_test, predicted_values) print "Accuracy Score : ", metrics.accuracy_score(y_test, predicted_values) print "---------------------------------------\n" filename = 'creditcard.csv' credit_frame = pd.read_csv(filename) class_labels = list(credit_frame['Class'].values) del credit_frame['Class'] random_under = RandomUnderSampler(random_state=42) rnn = RepeatedEditedNearestNeighbours(random_state=42) tomek = TomekLinks(random_state=42) X, y = rnn.fit_sample(credit_frame.values, class_labels) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) classifier_list, classifier_name_list = get_ensemble_models() for classifier, classifier_name in zip(classifier_list, classifier_name_list): classifier.fit(X_train, y_train) print_evaluation_metrics(classifier, classifier_name, X_test, y_test)
def test_renn_not_good_object():
    """A string passed as ``n_neighbors`` must make fitting raise ValueError."""
    # Built outside the ``raises`` block so only fit_sample is under test.
    sampler = RepeatedEditedNearestNeighbours(
        n_neighbors='rnd', kind_sel='mode')
    with raises(ValueError):
        sampler.fit_sample(X, Y)
''' print(sorted(Counter(y).items())) from imblearn.under_sampling import EditedNearestNeighbours enn = EditedNearestNeighbours(random_state=0) X_resampled, y_resampled = enn.fit_sample(X, y) print(sorted(Counter(y_resampled).items())) ''' [(0, 64), (1, 262), (2, 4674)] [(0, 64), (1, 213), (2, 4568)] ''' ''' 在此基础上, 延伸出了RepeatedEditedNearestNeighbours算法, 重复基础的EditedNearestNeighbours算法多次 ''' from imblearn.under_sampling import RepeatedEditedNearestNeighbours renn = RepeatedEditedNearestNeighbours(random_state=0) X_resampled, y_resampled = renn.fit_sample(X, y) print(sorted(Counter(y_resampled).items())) #[(0, 64), (1, 208), (2, 4551)] #与RepeatedEditedNearestNeighbours算法不同的是, ALLKNN算法在进行每次迭代的时候, 最近邻的数量都在增加. from imblearn.under_sampling import AllKNN allknn = AllKNN(random_state=0) X_resampled, y_resampled = allknn.fit_sample(X, y) print(sorted(Counter(y_resampled).items())) #[(0, 64), (1, 220), (2, 4601)] #Condensed nearest neighbors and derived algorithms ''' CondensedNearestNeighbour使用1近邻的方法来进行迭代,来判断一个样本是应该保留还是剔除,具体的实现步骤如下: 集合C:所有的少数类样本; 选择一个多数类样本(需要下采样)加入集合C,其他的这类样本放入集合S; 使用集合S训练一个1-NN的分类器,对集合S中的样本进行分类; 将集合S中错分的样本加入集合C;
def test_renn_not_good_object():
    """Fitting with the string 'rnd' as ``n_neighbors`` raises ValueError."""
    invalid_neighbours = 'rnd'
    # Built outside the ``raises`` block so only fit_sample is under test.
    sampler = RepeatedEditedNearestNeighbours(
        kind_sel='mode', n_neighbors=invalid_neighbours)
    with raises(ValueError):
        sampler.fit_sample(X, Y)