def test_validate_estimator_init():
    """SMOTEENN given explicit SMOTE and ENN estimators reproduces the reference output."""
    combined = SMOTEENN(
        smote=SMOTE(random_state=RND_SEED),
        enn=EditedNearestNeighbours(random_state=RND_SEED,
                                    sampling_strategy='all'),
        random_state=RND_SEED,
    )
    X_resampled, y_resampled = combined.fit_sample(X, Y)

    # Reference values the resampled data is compared against.
    expected_X = np.array([
        [1.52091956, -0.49283504],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571667],
        [0.66052536, -0.28246518],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 0, 0, 0, 1, 1, 1])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def ENN(df, debug=True):
    """Resample a DataFrame with EditedNearestNeighbours.

    The last column of ``df`` is used as the label (cast to ``int``) and
    every other column as a feature.

    Parameters
    ----------
    df : DataFrame whose final column holds the class label.
    debug : when true, print the class counts before and after resampling.

    Returns
    -------
    A new DataFrame with the resampled features under the original feature
    column names and the resampled labels appended as the last column.
    """
    values = df.values
    features, labels = values[:, :-1], values[:, -1].astype(int)
    if debug:
        print('ENN: Original dataset shape %s' % Counter(labels))

    sampler = EditedNearestNeighbours(sampling_strategy="auto")
    features_res, labels_res = sampler.fit_resample(features, labels)

    feature_names, label_name = df.columns[:-1], df.columns[-1]
    df_resampled = pd.DataFrame(features_res, columns=feature_names)
    # Append the label as the final column, mirroring the input layout.
    df_resampled.insert(len(df_resampled.columns), label_name, labels_res)

    if debug:
        print('ENN: Resampled dataset shape %s' % Counter(labels_res))
    return df_resampled
def resample(x, y, sampling_type=None):
    """Resample ``(x, y)`` with the requested strategy and print class counts.

    Parameters
    ----------
    x, y : features and labels.
    sampling_type : ``"smoteenn"`` uses SMOTEENN, ``"enn"`` uses
        EditedNearestNeighbours; any other value returns the inputs as-is.

    Returns
    -------
    Tuple ``(x_out, y_out)``.
    """
    if sampling_type == "smoteenn":
        x_out, y_out = SMOTEENN(random_state=1).fit_sample(x, y)
    elif sampling_type == "enn":
        x_out, y_out = EditedNearestNeighbours(random_state=1).fit_sample(x, y)
    else:
        # Unknown or missing strategy: pass the data through untouched.
        x_out, y_out = x, y

    before = sorted(Counter(y).items())
    after = sorted(Counter(y_out).items())
    print("Before resampling:", before)
    print("After resampling:", after)
    return x_out, y_out
def test_enn_fit_sample_with_indices():
    """ENN with ``return_indices=True`` returns the expected samples, labels and indices."""
    sampler = EditedNearestNeighbours(return_indices=True,
                                      random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [2.59928271, 0.93323465],
        [1.92365863, 0.82718767],
        [0.25738379, 0.95564169],
        [0.78318102, 2.59153329],
        [0.52726792, -0.38735648],
    ])
    expected_y = np.array([0, 0, 1, 1, 2, 2, 2])
    expected_idx = np.array([4, 11, 0, 3, 1, 8, 15])

    comparisons = (
        (X_resampled, expected_X),
        (y_resampled, expected_y),
        (idx_under, expected_idx),
    )
    for actual, expected in comparisons:
        assert_array_equal(actual, expected)
def edited_nearest_neighbour(X, y, visualize=False, pca2d=True, pca3d=True,
                             tsne=True, pie_evr=True):
    """Undersample ``(X, y)`` with EditedNearestNeighbours, optionally plotting.

    Parameters
    ----------
    X, y : features and labels to resample.
    visualize : when equal to ``True``, call ``hist_over_and_undersampling``
        and ``pca_general`` on the resampled data.
    pca2d, pca3d, pie_evr : forwarded to ``pca_general`` as ``d2``, ``d3``
        and ``pie_evr``.
    tsne : accepted but not used by this function.

    Returns
    -------
    Tuple ``(X_res, y_res)`` of resampled features and labels.
    """
    X_res, y_res = EditedNearestNeighbours().fit_resample(X, y)

    # Equality with True (rather than truthiness) is the existing contract.
    if visualize == True:  # noqa: E712
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)

    return X_res, y_res
def get_under_sample_models():
    """Return parallel lists of under-sampling estimators and their display names."""
    labelled = [
        (TomekLinks(), 'TomesLinks'),
        (EditedNearestNeighbours(), 'EditedNearestNeighbors'),
        (RepeatedEditedNearestNeighbours(), 'RENN'),
        (OneSidedSelection(), 'OneSidedSelection'),
        (NeighbourhoodCleaningRule(), 'NCR'),
    ]
    models = [model for model, _ in labelled]
    names = [name for _, name in labelled]
    return models, names
def smote_en_resampling(data_X, data_y, k_neighbors=5):
    """Combine SMOTE over-sampling with ENN under-sampling via SMOTEENN.

    Parameters
    ----------
    data_X, data_y : features and labels to resample.
    k_neighbors : neighbour count passed to SMOTE (``k_neighbors``) and to
        EditedNearestNeighbours (``n_neighbors``).

    Returns
    -------
    Tuple ``(resamp_X, resamp_y)``.
    """
    n_jobs = 8  # same worker count for every estimator involved
    combined = SMOTEENN(
        sampling_strategy="minority",
        smote=SMOTE(sampling_strategy='minority', k_neighbors=k_neighbors,
                    n_jobs=n_jobs),
        enn=EditedNearestNeighbours(n_neighbors=k_neighbors, n_jobs=n_jobs),
        n_jobs=n_jobs,
    )
    resamp_X, resamp_y = combined.fit_sample(data_X, data_y)
    return resamp_X, resamp_y
def test_enn_fit():
    """Fitting records the minority/majority labels and the per-class counts."""
    sampler = EditedNearestNeighbours(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Class bookkeeping computed by fit().
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)
    for label, expected_count in ((0, 500), (1, 4500)):
        assert_equal(sampler.stats_c_[label], expected_count)
def test_enn_fit_sample_with_indices():
    """ENN with indices matches the ground truth stored next to this file."""
    sampler = EditedNearestNeighbours(return_indices=True,
                                      random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Ground truth is stored as .npy files in the sibling "data" directory.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    X_gt, y_gt, idx_gt = [
        np.load(os.path.join(data_dir, file_name))
        for file_name in ('enn_x.npy', 'enn_y.npy', 'enn_idx.npy')
    ]

    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def readFile(path, y_label, method, encode_features=[], skew_exempted=[], training_ratio=0.7, shuffle=True, needSkew=False, fea_eng=True):
    """Load a CSV, preprocess it, split it and optionally rebalance the training part.

    Parameters
    ----------
    path : CSV file read with ``pd.read_csv``.
    y_label : name of the target column.
    method : ``'OverSample'`` rebalances the training split with ADASYN,
        ``'UnderSample'`` with EditedNearestNeighbours; any other value
        leaves the training split as produced by ``split``.
    encode_features : columns handed to ``pd.get_dummies``.
    skew_exempted : non-object columns excluded from the skew correction.
    training_ratio : forwarded to ``split`` (defined elsewhere).
    shuffle : when true, shuffle the rows before any other processing.
    needSkew : when true, apply ``log1p`` to non-object columns whose
        skewness exceeds 0.75.
    fea_eng : accepted but not used by this function.

    Returns
    -------
    ``(X_train, X_test, y_train, y_test)``.
    """
    raw = pd.read_csv(path)

    if shuffle:
        raw = raw.sample(frac=1).reset_index(drop=True)

    if needSkew:
        numeric_cols = raw.dtypes[raw.dtypes != "object"].index
        candidates = numeric_cols.drop(skew_exempted)
        skewness = raw[candidates].apply(lambda col: skew(col.dropna()))
        skewed = skewness[skewness > 0.75].index
        raw[skewed] = np.log1p(raw[skewed])  # reduce skewness

    # Encode categorical features, then fill gaps with the column mean.
    raw = pd.get_dummies(raw, columns=encode_features)
    raw = raw.fillna(raw.mean())

    X = raw.drop(y_label, axis=1)
    y = raw[y_label]
    X_train, X_test, y_train, y_test = split(X, y, training_ratio)

    # Only the training portion is rebalanced.
    sampler = None
    if method == 'OverSample':
        sampler = ADASYN(random_state=42)
    elif method == 'UnderSample':
        sampler = EditedNearestNeighbours(random_state=42)
    if sampler is not None:
        X_train, y_train = sampler.fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test
def test_enn_fit_sample():
    """Default ENN resampling reproduces the reference samples and labels."""
    sampler = EditedNearestNeighbours(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [2.59928271, 0.93323465],
        [1.92365863, 0.82718767],
        [0.25738379, 0.95564169],
        [0.78318102, 2.59153329],
        [0.52726792, -0.38735648],
    ])
    expected_y = np.array([0, 0, 1, 1, 2, 2, 2])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_sample_regular_pass_smote_enn():
    """SMOTEENN with user-supplied SMOTE and ENN objects matches the reference output."""
    over_sampler = SMOTE(ratio='auto', random_state=RND_SEED)
    cleaner = EditedNearestNeighbours(ratio='all', random_state=RND_SEED)
    combined = SMOTEENN(smote=over_sampler, enn=cleaner,
                        random_state=RND_SEED)
    X_resampled, y_resampled = combined.fit_sample(X, Y)

    expected_X = np.array([
        [1.52091956, -0.49283504],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571667],
        [0.66052536, -0.28246518],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 0, 0, 0, 1, 1, 1])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def enn(self, data):
    '''
    Apply Edited Nearest Neighbours to ``data`` and return the result.

    Parameters
    ----------
    data : DataFrame containing the columns named in ``self.features``
        plus a ``'label'`` column.

    Returns
    -------
    A new DataFrame with the resampled feature columns (named after
    ``self.features``) and a ``'label'`` column with the resampled labels.
    '''
    # ``DataFrame.as_matrix`` was removed in pandas 1.0; selecting the
    # columns and reading ``.values`` yields the same arrays on both old
    # and new pandas releases.
    X = data[self.features].values
    y = np.ravel(data[['label']].values)

    sampler = EditedNearestNeighbours(ratio='all', kind_sel='mode',
                                      n_neighbors=5, random_state=42,
                                      n_jobs=4)
    X_res, y_res = sampler.fit_sample(X, y)

    df_enn = pd.DataFrame(X_res, columns=self.features)
    df_enn['label'] = y_res
    return df_enn
def samplingMethod(X_train, y_train, sampling="None"):
    """Optionally resample a sparse training set.

    Parameters
    ----------
    X_train : matrix exposing ``toarray()``; it is densified for resampling.
    y_train : training labels.
    sampling : ``"SMOTE"``, ``"ENN"`` or ``"SMOTEENN"``; any other value
        returns the inputs unchanged.

    Returns
    -------
    ``(X_train, y_train)``; when resampling ran, ``X_train`` is the result
    wrapped in ``csr_matrix``.
    """
    if sampling == "SMOTE":
        sampler = SMOTE(random_state=42, n_jobs=-1)
    elif sampling == "ENN":
        sampler = EditedNearestNeighbours(random_state=42, n_jobs=-1)
    elif sampling == "SMOTEENN":
        sampler = SMOTEENN(random_state=42, n_jobs=-1)
    else:
        # No (or unrecognised) strategy: nothing to do.
        return X_train, y_train

    dense_resampled, y_train = sampler.fit_sample(X_train.toarray(), y_train)
    return csr_matrix(dense_resampled), y_train
def test_enn_fit_sample_mode():
    """ENN with ``kind_sel='mode'`` reproduces the reference samples and labels."""
    sampler = EditedNearestNeighbours(random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [2.59928271, 0.93323465],
        [1.42772181, 0.526027],
        [1.92365863, 0.82718767],
        [0.25738379, 0.95564169],
        [-0.284881, -0.62730973],
        [0.57062627, 1.19528323],
        [0.78318102, 2.59153329],
        [0.35831463, 1.33483198],
        [-0.14313184, -1.0412815],
        [-0.09816301, -0.74662486],
        [0.52726792, -0.38735648],
        [0.2821046, -0.07862747],
    ])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def resample(self, X, y, by, random_state=None):
    '''
    Re-sample ``(X, y)`` with the method named by ``by``.

    by: String
        The method used to perform re-sampling
        currently support: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN',
        'OSS', 'NM', 'CC', 'ROS', 'SMOTE', 'ADASYN', 'BorderSMOTE',
        'SMOTEENN', 'SMOTETomek', 'ORG']
        'ORG' returns the data unchanged.
    random_state:
        Forwarded to the selected sampler's constructor.

    Returns the tuple (X_train, y_train).
    Raises ``Error`` when ``by`` is not one of the supported names.
    '''
    if by == 'ORG':
        # Original data requested: no sampler involved.
        return X, y

    # Factories are lambdas so only the selected sampler class is touched.
    factories = {
        'RUS': lambda: RandomUnderSampler(random_state=random_state),
        'CNN': lambda: CondensedNearestNeighbour(random_state=random_state),
        'ENN': lambda: EditedNearestNeighbours(random_state=random_state),
        'NCR': lambda: NeighbourhoodCleaningRule(random_state=random_state),
        'Tomek': lambda: TomekLinks(random_state=random_state),
        'ALLKNN': lambda: AllKNN(random_state=random_state),
        'OSS': lambda: OneSidedSelection(random_state=random_state),
        'NM': lambda: NearMiss(random_state=random_state),
        'CC': lambda: ClusterCentroids(random_state=random_state),
        'ROS': lambda: RandomOverSampler(random_state=random_state),
        'SMOTE': lambda: SMOTE(random_state=random_state),
        'ADASYN': lambda: ADASYN(random_state=random_state),
        'BorderSMOTE': lambda: BorderlineSMOTE(random_state=random_state),
        'SMOTEENN': lambda: SMOTEENN(random_state=random_state),
        'SMOTETomek': lambda: SMOTETomek(random_state=random_state),
    }
    if by not in factories:
        raise Error("Unexpected 'by' type {}".format(by))

    sampler = factories[by]()
    X_train, y_train = sampler.fit_resample(X, y)
    return X_train, y_train
def plot_data(X, Y):
    """Show scatter plots of the original data next to several resampled variants.

    Panels, left to right: original data, SMOTE, Borderline-SMOTE
    (``borderline-1``), ENN followed by SMOTE, ADASYN, and
    ``dbscan_based.MultiDbscanBasedOverSample``.

    Every panel gets its own subplot slot. Previously the ENN+SMOTE and
    ADASYN panels were both placed at index 4 of a 1x5 grid, so they
    collided in a single slot.

    Parameters
    ----------
    X : 2-D array-like indexed as ``X[:, 0]`` / ``X[:, 1]``; only the first
        two columns are plotted.
    Y : labels, used as the scatter colours.
    """
    plt.rcParams['figure.figsize'] = (27.0, 5.0)
    fig = plt.figure()

    def _enn_then_smote():
        # Clean with ENN first, then over-sample the cleaned data.
        X_clean, Y_clean = EditedNearestNeighbours().fit_sample(X, Y)
        return SMOTE(k_neighbors=5).fit_sample(X_clean, Y_clean)

    # One factory per panel, evaluated lazily in display order.
    panels = [
        lambda: (X, Y),                                            # original dataset
        lambda: SMOTE().fit_sample(X, Y),                          # SMOTE
        lambda: BorderlineSMOTE(kind='borderline-1').fit_sample(X, Y),  # Borderline-SMOTE
        _enn_then_smote,                                           # ENN + SMOTE
        lambda: ADASYN(n_neighbors=3).fit_sample(X, Y),            # ADASYN
        lambda: dbscan_based.MultiDbscanBasedOverSample(
            eps=0.3, min_pts=5).fit_sample(X, Y),                  # MC-ODG
    ]

    n_panels = len(panels)
    for position, make_samples in enumerate(panels, start=1):
        features, labels = make_samples()
        ax = fig.add_subplot(1, n_panels, position)
        ax.scatter(features[:, 0], features[:, 1], c=labels)
        # Hide the frame and ticks so only the point cloud is visible.
        ax.axis('off')
        ax.set_xticks([])
        ax.set_yticks([])

    plt.show()
def test_enn_init():
    """A freshly constructed ENN exposes the expected defaults and empty fit state."""
    verbose = True
    sampler = EditedNearestNeighbours(random_state=RND_SEED, verbose=verbose)

    # (attribute, expected value) pairs, checked in order.
    expectations = (
        ('size_ngh', 3),
        ('kind_sel', 'all'),
        ('n_jobs', -1),
        ('random_state', RND_SEED),
        ('verbose', verbose),
        ('min_c_', None),
        ('maj_c_', None),
        ('stats_c_', {}),
    )
    for attribute, expected in expectations:
        assert_equal(getattr(sampler, attribute), expected)
def under_sample_data(matrix, y_train):
    """Under-sample with TomekLinks followed by EditedNearestNeighbours.

    The class distribution is written via ``add_to_log`` before resampling
    and after each of the two stages.

    Returns
    -------
    ``(X_res, y_res)`` after both stages.
    """
    add_to_log('Under Sampling')
    add_to_log('Sample distribution %s' % Counter(y_train))

    # Stage 1: clean proximity samples using TomekLinks.
    tomek = TomekLinks(random_state=11, sampling_strategy='majority',
                       n_jobs=-1)
    X_stage, y_stage = tomek.fit_resample(matrix, y_train)
    add_to_log('TomekLinks distribution %s' % Counter(y_stage))

    # Stage 2: apply ENN to the TomekLinks output.
    cleaner = EditedNearestNeighbours(random_state=7,
                                      sampling_strategy='majority',
                                      n_jobs=-1)
    X_stage, y_stage = cleaner.fit_resample(X_stage, y_stage)
    add_to_log('EditedNearestNeighbours distribution %s' % Counter(y_stage))

    return X_stage, y_stage
def equalize_training_dataset_with_EditedNN(x_train, y_train):
    """Under-sample with ENN while preserving the per-sample shape of ``x_train``.

    ``x_train`` is flattened to ``(n_samples, -1)`` for resampling and the
    result is reshaped back to the original trailing dimensions. The sorted
    class counts of the resampled labels are printed.

    Returns
    -------
    ``(x_resampled, y_resampled)``.
    """
    from imblearn.under_sampling import EditedNearestNeighbours

    # Remember every dimension after the sample axis so it can be restored.
    trailing_dims = tuple(x_train.shape[1:])
    flat = np.reshape(x_train, (x_train.shape[0], -1))

    sampler = EditedNearestNeighbours(sampling_strategy='not minority',
                                      n_neighbors=5, n_jobs=8)
    flat_resampled, y_resampled = sampler.fit_resample(flat, y_train)
    print(sorted(Counter(y_resampled).items()))

    restored_shape = (flat_resampled.shape[0],) + trailing_dims
    x_resampled = np.reshape(flat_resampled, restored_shape)
    return x_resampled, y_resampled
def final_model(X, y):
    """Fit a SMOTEENN + logistic-regression pipeline and print spot-check predictions.

    The pipeline is fitted on ``(X, y)`` and then used to predict three
    hard-coded rows labelled as non-spill (expected class 0) and three
    labelled as spill (expected class 1). Each prediction is printed.
    Nothing is returned.
    """
    # Define and fit the model.
    resampler = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='majority'))
    classifier = LogisticRegression(solver='liblinear')
    pipeline = imb_pipe(steps=[('e', resampler), ('m', classifier)])
    pipeline.fit(X, y)

    # Known class 0 rows.
    non_spill_rows = [
        [329, 1627.54, 1409.43, 51, 822500, 35, 6.1, 4610, 0.17, 178.4, 0.2,
         0.24, 0.39, 0.12, 0.27, 138.32, 34.81, 2.02, 0.14, 0.19, 75.26,
         0.47, 351.67, 0.18, 9.24, 0.38, 2.57, -2.96, -0.28, 1.93, 0, 1.93,
         34, 1710, 0, 25.84, 78, 55, 1460.31, 710.63, 451.78, 150.85, 3.23,
         0, 4530.75, 66.25, 7.85],
        [3234, 1091.56, 1357.96, 32, 8085000, 40.08, 8.98, 25450, 0.22,
         317.7, 0.18, 0.2, 0.49, 0.09, 0.41, 114.69, 41.87, 2.31, 0.15, 0.18,
         75.26, 0.53, 351.67, 0.18, 9.24, 0.24, 3.56, -3.09, -0.31, 2.17, 0,
         2.17, 281, 14490, 0, 80.11, 78, 55, 4287.77, 3095.56, 1937.42,
         773.69, 2.21, 0, 4927.51, 66.15, 7.24],
        [2339, 1537.68, 1633.02, 45, 5847500, 38.13, 9.29, 22110, 0.24,
         264.5, 0.21, 0.26, 0.79, 0.08, 0.71, 89.49, 32.23, 2.2, 0.17, 0.22,
         75.26, 0.51, 351.67, 0.18, 9.24, 0.27, 4.21, -2.84, -0.29, 2.16, 0,
         2.16, 228, 12150, 0, 83.6, 78, 55, 3959.8, 2404.16, 1530.38, 659.67,
         2.59, 0, 4732.04, 66.34, 7.67],
    ]
    # Known class 1 rows.
    spill_rows = [
        [2971, 1020.91, 630.8, 59, 7427500, 32.76, 10.48, 17380, 0.32, 427.4,
         0.22, 0.29, 0.5, 0.08, 0.42, 149.87, 50.99, 1.89, 0.14, 0.18, 75.26,
         0.44, 351.67, 0.18, 9.24, 2.5, 10.63, -3.07, -0.28, 2.18, 0, 2.18,
         164, 8730, 0, 40.67, 78, 55, 5650.88, 1749.29, 1245.07, 348.7, 4.54,
         0, 25579.34, 65.78, 7.41],
        [3155, 1118.08, 469.39, 11, 7887500, 30.41, 7.99, 15880, 0.26, 496.7,
         0.2, 0.26, 0.69, 0.11, 0.58, 118.11, 43.96, 1.76, 0.15, 0.18, 75.26,
         0.4, 351.67, 0.18, 9.24, 0.78, 8.68, -3.19, -0.33, 2.19, 0, 2.19,
         150, 8100, 0, 31.97, 78, 55, 3471.31, 3059.41, 2043.9, 477.23, 1.7,
         0, 28172.07, 65.72, 7.58],
        [115, 1449.85, 608.43, 88, 287500, 40.42, 7.34, 3340, 0.18, 86.1,
         0.21, 0.32, 0.5, 0.17, 0.34, 71.2, 16.73, 1.82, 0.19, 0.29, 87.65,
         0.46, 132.78, -0.01, 3.78, 0.7, 4.79, -3.36, -0.23, 1.95, 0, 1.95,
         29, 1530, 0.01, 38.8, 89, 69, 1400, 250, 150, 45.13, 9.33, 1,
         31692.84, 65.81, 7.84],
    ]

    checks = (
        ('Non-Spill Cases:', '>Predicted=%d (expected 0)', non_spill_rows),
        ('Spill Cases:', '>Predicted=%d (expected 1)', spill_rows),
    )
    for heading, template, rows in checks:
        print(heading)
        for row in rows:
            # Predict one row at a time and report its label.
            label = pipeline.predict([row])[0]
            print(template % (label))
def test_multiclass_fit_sample():
    """ENN handles a three-class target and yields the expected class counts."""
    # Turn the target into a multiclass one by relabelling the first 1000 samples.
    y = Y.copy()
    y[0:1000] = 2

    sampler = EditedNearestNeighbours(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, y)

    # Check the per-class sizes of the resampled target.
    counts = Counter(y_resampled)
    for label, expected in ((0, 400), (1, 1836), (2, 5)):
        assert_equal(counts[label], expected)
def test_enn_fit_resample():
    """Default ``fit_resample`` reproduces the reference samples and labels."""
    X_resampled, y_resampled = EditedNearestNeighbours().fit_resample(X, Y)

    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [2.59928271, 0.93323465],
        [1.92365863, 0.82718767],
        [0.25738379, 0.95564169],
        [0.78318102, 2.59153329],
        [0.52726792, -0.38735648],
    ])
    expected_y = np.array([0, 0, 1, 1, 2, 2, 2])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def sampling(X, Y, sample_type="over"):
    """
    Resample the data with the selected technique(s).

    :param X: input data
    :param Y: classification data; the resampled labels must support
        ``reshape``
    :param sample_type: str or list naming the techniques to run, checked
        with ``in``: "over" (SMOTE), "under" (EditedNearestNeighbours),
        "combine" (SMOTEENN). Default is over-sampling.
    :return: dict with keys "under", "over" and "combine"; each value is
        the resampled X with Y appended as a last column, or an empty list
        when that technique was not requested
    """
    # Phase 1: run every requested sampler (over, under, combine order).
    resampled = {}
    if "over" in sample_type:
        resampled["over"] = SMOTE(sampling_strategy="minority",
                                  random_state=42).fit_resample(X, Y)
    if "under" in sample_type:
        # ENN is used here; the original note says centroids had memory issues.
        resampled["under"] = EditedNearestNeighbours(
            random_state=42).fit_resample(X, Y)
    if "combine" in sample_type:
        resampled["combine"] = SMOTEENN(random_state=42).fit_resample(X, Y)

    # Phase 2: glue the labels back on as the last column.
    X_Y = {}
    for key in ("under", "over", "combine"):
        if key in resampled:
            features, labels = resampled[key]
            X_Y[key] = np.append(features,
                                 labels.reshape(len(labels), 1), axis=1)
        else:
            X_Y[key] = []
    return X_Y
def UnderSample(X, Y, method='Random', random_state=42):
    """Under-sample ``(X, Y)`` with the algorithm named by ``method``.

    Parameters
    ----------
    X : array exposing ``size`` and ``reshape``; a one-dimensional input is
        reshaped to a single feature column.
    Y : labels.
    method : one of 'Cluster', 'Random', 'NearMiss_1', 'NearMiss_2',
        'NearMiss_3', 'TomekLinks', 'ENN', 'RENN', 'All_KNN', 'CNN',
        'One_SS', 'NCR', 'IHT'.
    random_state : forwarded to the selected sampler.

    Returns
    -------
    ``(X_resampled, Y_resampled)``.

    Raises
    ------
    ValueError
        If ``method`` is not a supported name. Previously this surfaced as
        an ``UnboundLocalError``.
    """
    if X.size == len(X):
        X = X.reshape(-1, 1)

    # Names are matched by equality via dict lookup. The previous ``is``
    # comparisons relied on string interning, which is not guaranteed.
    # Lambdas keep construction lazy so only the chosen sampler is built.
    factories = {
        # estimator=None selects the default k-means estimator
        'Cluster': lambda: ClusterCentroids(
            ratio='auto', random_state=random_state, estimator=None),
        'Random': lambda: RandomUnderSampler(
            ratio='auto', random_state=random_state, replacement=False),
        'NearMiss_1': lambda: NearMiss(
            ratio='auto', random_state=random_state, version=1),
        'NearMiss_2': lambda: NearMiss(
            ratio='auto', random_state=random_state, version=2),
        'NearMiss_3': lambda: NearMiss(
            ratio='auto', random_state=random_state, version=3),
        'TomekLinks': lambda: TomekLinks(
            ratio='auto', random_state=random_state),
        # kind_sel may be 'all' or 'mode'
        'ENN': lambda: EditedNearestNeighbours(
            ratio='auto', random_state=random_state, kind_sel='all'),
        # kind_sel may be 'all' or 'mode'
        'RENN': lambda: RepeatedEditedNearestNeighbours(
            ratio='auto', random_state=random_state, kind_sel='all'),
        'All_KNN': lambda: AllKNN(
            ratio='auto', random_state=random_state, kind_sel='all'),
        'CNN': lambda: CondensedNearestNeighbour(
            ratio='auto', random_state=random_state),
        'One_SS': lambda: OneSidedSelection(
            ratio='auto', random_state=random_state),
        'NCR': lambda: NeighbourhoodCleaningRule(
            ratio='auto', random_state=random_state, kind_sel='all',
            threshold_cleaning=0.5),
        'IHT': lambda: InstanceHardnessThreshold(
            estimator=None, ratio='auto', random_state=random_state),
    }
    make_sampler = factories.get(method)
    if make_sampler is None:
        raise ValueError(
            "Unknown under-sampling method {!r}; expected one of {}".format(
                method, sorted(factories)))

    sampler = make_sampler()
    X_resampled, Y_resampled = sampler.fit_sample(X, Y)
    return X_resampled, Y_resampled
def __init__(self):
    """Create the time stamp, classifier, vectorizer and sampler list for the model."""
    stamp = datetime.datetime.now().strftime("%Y_%b_%d_%H_%M")
    self.time_stamp = stamp
    print('Model Stamp:' + stamp)

    forest_options = dict(
        class_weight='balanced',
        n_jobs=-1,
        criterion='gini',
        n_estimators=30,
        warm_start=True,
    )
    self.clf = RandomForestClassifier(**forest_options)

    vectorizer_options = dict(
        n_features=2 ** 22,
        alternate_sign=False,
        analyzer='word',
        decode_error='ignore',
        token_pattern=r'\b\w{1,}[^\d\W]+\b',
        ngram_range=(2, 2),
    )
    self.vector = HashingVectorizer(**vectorizer_options)

    # Samplers are not needed during testing
    tomek = TomekLinks(random_state=11, sampling_strategy='majority',
                       n_jobs=-1)
    edited_nn = EditedNearestNeighbours(random_state=7,
                                        sampling_strategy='majority',
                                        n_jobs=-1)
    self.samplers = [tomek, edited_nn]
def balancing_data(X, y, method):
    """Balance ``(X, y)`` with the resampler named by ``method``.

    Parameters
    ----------
    X, y : features and labels.
    method : one of "RandomOverSampler", "TomekLinks", "SMOTEENN",
        "SMOTETomek", "EditedNearestNeighbours".

    Returns
    -------
    ``(X_resampled, y_resampled)``.

    Raises
    ------
    ValueError
        If ``method`` is not a supported name. Previously this surfaced as
        an ``UnboundLocalError``.
    """
    if method == "RandomOverSampler":
        b_method = RandomOverSampler(random_state=0)
    elif method == "TomekLinks":
        b_method = TomekLinks(random_state=0)
    elif method == "SMOTEENN":
        b_method = SMOTEENN(random_state=0)
    elif method == "SMOTETomek":
        b_method = SMOTETomek(random_state=0)
    elif method == "EditedNearestNeighbours":
        b_method = EditedNearestNeighbours(random_state=0)
    else:
        raise ValueError(
            "Unknown balancing method {!r}; expected one of "
            "'RandomOverSampler', 'TomekLinks', 'SMOTEENN', 'SMOTETomek', "
            "'EditedNearestNeighbours'".format(method))

    # Balance and return the balanced data.
    X_resampled, y_resampled = b_method.fit_sample(X, y)
    return (X_resampled, y_resampled)
def under_sampling_algs():
    """Return ``(sampler, short_name)`` pairs for the under-sampling experiments.

    The first entry is a pair of strings standing for the no-resampling case.
    """
    return [
        ("No Rs Undersampling case", "No Re-sampling"),
        (RandomUnderSampler(random_state=1), 'RU'),
        (ClusterCentroids(random_state=1), 'CC'),
        (TomekLinks(), 'TL'),
        (NearMiss(version=1), 'NM1'),
        (NearMiss(version=2), 'NM2'),
        (NearMiss(version=3), 'NM3'),
        (CondensedNearestNeighbour(random_state=1), 'CNN'),
        (OneSidedSelection(random_state=1), 'OSS'),
        (EditedNearestNeighbours(), 'ENN'),
        (NeighbourhoodCleaningRule(), 'NCL'),
        (InstanceHardnessThreshold(random_state=1), 'IHT'),
        (RepeatedEditedNearestNeighbours(), 'RENN'),
        (AllKNN(), 'AllKNN'),
    ]
def get_models():
    """Return parallel lists of under-sampling estimators and their short names."""
    labelled = [
        (TomekLinks(), 'TL'),
        (EditedNearestNeighbours(), 'ENN'),
        (RepeatedEditedNearestNeighbours(), 'RENN'),
        (OneSidedSelection(), 'OSS'),
        (NeighbourhoodCleaningRule(), 'NCR'),
    ]
    models = [model for model, _ in labelled]
    names = [name for _, name in labelled]
    return models, names
def balance_data(X_content, ratings):
    """
    Balance the training data: over-sample with SMOTE, then clean with ENN.

    The data is processed in 20 row chunks to avoid memory errors. Each
    chunk is densified, passed through SMOTE four times and then through
    EditedNearestNeighbours.

    input arguments:
        X_content: The full feature matrix (not yet in TF-IDF format);
                   must support ``shape``, 2-D slicing and ``toarray()``
        ratings:   The corresponding ratings; must support slicing
    output arguments:
        return_csr:     The balanced X_content as a CSR matrix
        return_ratings: The balanced, corresponding ratings (list)
    """
    sm = SMOTE()
    enn = EditedNearestNeighbours()

    nr_revs = X_content.shape[0]
    nr_chuncks = 20
    # Floor division keeps the slice bounds integral on Python 2 and 3.
    chunck = nr_revs // nr_chuncks

    # Start from an empty block so the result always has the right width.
    blocks = [csr_matrix((0, X_content.shape[1]))]
    return_ratings = []

    for x in range(nr_chuncks):
        start = x * chunck
        # The last chunk absorbs the rows left over by the floor division.
        stop = (x + 1) * chunck if x < nr_chuncks - 1 else nr_revs
        X_now = X_content[start:stop, :].toarray()
        ratings_now = ratings[start:stop]

        # Apply SMOTE repeatedly (once per minority class in the original design).
        for _ in range(4):
            X_now, ratings_now = sm.fit_sample(X_now, ratings_now)

        # Apply ENN for cleaning.
        X_now, ratings_now = enn.fit_sample(X_now, ratings_now)

        # Keep the chunk. The previous code called vstack() but discarded
        # its result, so the returned matrix was always empty.
        blocks.append(csr_matrix(X_now))
        return_ratings.extend(ratings_now)

    return_csr = vstack(blocks, format='csr')
    print("balanced")
    return return_csr, return_ratings