def test_enn_not_good_object():
    """Check that an arbitrary string given as ``n_neighbors`` is rejected.

    The sampler is built with ``n_neighbors='rnd'`` and resampling is
    expected to raise ``ValueError`` with a "has to be one of" message.
    """
    invalid_neighbors = 'rnd'
    sampler = EditedNearestNeighbours(
        n_neighbors=invalid_neighbors,
        random_state=RND_SEED,
        kind_sel='mode',
    )
    with raises(ValueError, match="has to be one of"):
        sampler.fit_sample(X, Y)
def test_enn_fit_sample_with_nn_object():
    """Test the fit sample routine using a NN object"""
    # Hand a pre-built NearestNeighbors estimator to the sampler instead of
    # an integer neighbour count.
    neighbors = NearestNeighbors(n_neighbors=4)
    sampler = EditedNearestNeighbours(
        n_neighbors=neighbors, random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Expected samples kept by the 'mode' selection, grouped by class.
    expected_X = np.array([
        (-0.10903849, -0.12085181),
        (0.01936241, 0.17799828),
        (2.59928271, 0.93323465),
        (1.42772181, 0.526027),
        (1.92365863, 0.82718767),
        (0.25738379, 0.95564169),
        (-0.284881, -0.62730973),
        (0.57062627, 1.19528323),
        (0.78318102, 2.59153329),
        (0.35831463, 1.33483198),
        (-0.14313184, -1.0412815),
        (-0.09816301, -0.74662486),
        (0.52726792, -0.38735648),
        (0.2821046, -0.07862747),
    ])
    expected_y = np.array([0] * 2 + [1] * 3 + [2] * 9)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def model_preprocess(data):
    '''
    Split ``data`` into train/test parts and clean the training part with
    Edited Nearest Neighbours (imblearn) to get a more balanced training set.

    Arguments:
        data: dataframe with features and a ``Label`` column

    Returns:
        train: the resampled training features, as a dataframe
        trainlab: the labels for the resampled training set
        test: stratified 20% test split, still containing ``Label``
        test_nlab: the test split without the ``Label`` column
        testlab: the labels for the test set
    '''
    started = time.time()

    # Stratified 80/20 split so both parts keep the label proportions.
    train_part, test = train_test_split(data, test_size=0.2,
                                        stratify=data.Label)

    # Training side: separate labels from features and remember the column
    # names so the resampled array can be turned back into a dataframe.
    trainlab = train_part.Label
    train_features = train_part.drop('Label', axis=1)
    feature_names = list(train_features)

    # Testing side.
    testlab = test.Label
    test_nlab = test.drop('Label', axis=1)

    # Imbalance technique: Edited Nearest Neighbours.
    resampled, trainlab = EditedNearestNeighbours().fit_sample(
        train_features, trainlab)
    train = pd.DataFrame(resampled, columns=feature_names)

    print('Preprocessing Completed in %.3f seconds.' % (time.time() - started))
    return train, trainlab, test, test_nlab, testlab
def predict_defects(self, train: pd.DataFrame, test: pd.DataFrame,
                    oversample: bool = True,
                    binarize: bool = True) -> Tuple[list, list]:
    """
    Predict for Defects

    The last column of ``train``/``test`` is treated as the target, all
    other columns as features.

    Parameters
    ----------
    train: numpy.ndarray or pandas.core.frame.DataFrame
        Training dataset as a pandas dataframe
    test: pandas.core.frame.DataFrame
        Test dataset as a pandas dataframe
    oversample: Bool
        Resample the training data before fitting. Despite the name, the
        current implementation cleans the data with EditedNearestNeighbours
        (an under-sampler); the SMOTE call it replaced is no longer used.
    binarize: Bool
        A boolean variable to run ``self._binarize`` on both datasets
        before training.

    Return
    ------
    actual: numpy.ndarray
        Actual defect counts
    predicted: numpy.ndarray
        Predicted defect counts
    """
    if binarize:
        train = self._binarize(train)
        test = self._binarize(test)

    x_train = train[train.columns[:-1]].values
    y_train = train[train.columns[-1]].values

    if oversample:
        sampler = EditedNearestNeighbours()
        x_train, y_train = sampler.fit_sample(x_train, y_train)

    # Fit a throw-away copy of the classifier purely to rank features, then
    # keep only the features it selects.
    selector_clf = clone(self.clf, safe=True)
    selector_clf.fit(x_train, y_train)
    model = SelectFromModel(selector_clf, prefit=True)
    x_train = model.transform(x_train)

    self.clf.fit(x_train, y_train)

    actual = test[test.columns[-1]].values.astype(int)
    # Use the raw ndarray, like the training data, so the selector sees the
    # same input type it was fitted on.
    x_test = model.transform(test[test.columns[:-1]].values)
    predicted = self.clf.predict(x_test).astype(int)
    return actual, predicted
def test_enn_fit_sample():
    """Resample with a default ENN and compare with hard-coded expectations."""
    sampler = EditedNearestNeighbours()
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        (-0.10903849, -0.12085181),
        (0.01936241, 0.17799828),
        (2.59928271, 0.93323465),
        (1.92365863, 0.82718767),
        (0.25738379, 0.95564169),
        (0.78318102, 2.59153329),
        (0.52726792, -0.38735648),
    ])
    expected_y = np.array([0] * 2 + [1] * 2 + [2] * 3)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_enn_fit_sample():
    """Resample with a seeded ENN and compare with hard-coded expectations."""
    sampler = EditedNearestNeighbours(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        (-0.10903849, -0.12085181),
        (0.01936241, 0.17799828),
        (2.59928271, 0.93323465),
        (1.92365863, 0.82718767),
        (0.25738379, 0.95564169),
        (0.78318102, 2.59153329),
        (0.52726792, -0.38735648),
    ])
    expected_y = np.array([0] * 2 + [1] * 2 + [2] * 3)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_enn_fit_sample():
    """Test the fit sample routine against the reference arrays on disk"""
    sampler = EditedNearestNeighbours(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'enn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'enn_y.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def renn_sampling(X, Y):
    """Clean a 3-D sample array with the ``ENN`` sampler.

    ``X`` is unpacked as ``(nsamples, nx, ny)``; each sample is flattened to
    a row of length ``nx * ny`` for resampling, then the kept rows are
    restored to ``(kept, nx, ny)``. The shape is printed before and after.

    NOTE(review): the function is named "renn" but calls ``ENN`` once —
    confirm which sampler ``ENN`` is bound to in this module.

    Parameters
    ----------
    X : array with a 3-element ``shape``
        Samples to clean.
    Y : array
        Labels, one per sample.

    Returns
    -------
    X : array, shape (kept, nx, ny)
    Y : array, shape (kept, 1)
    """
    enn = ENN(return_indices=True)
    nsamples, nx, ny = X.shape
    print(X.shape)
    X = X.reshape((nsamples, nx * ny))
    # return_indices=True makes fit_sample return a third element; it is
    # not needed here.
    X, Y, _ = enn.fit_sample(X, Y)
    nsamples = X.shape[0]
    print(X.shape)
    # Restore the original per-sample layout. The previous code computed the
    # last dimension as ``(nx * ny) / nx``, which is a float on Python 3 and
    # makes reshape fail; the original ``ny`` is used directly instead.
    X = X.reshape((nsamples, nx, ny))
    Y = Y.reshape((nsamples, 1))
    return X, Y
def resample(x, y, sampling_type=None):
    """Optionally resample ``(x, y)`` and print class counts before/after.

    ``sampling_type`` selects the sampler: ``"smoteenn"`` uses ``SMOTEENN``,
    ``"enn"`` uses ``EditedNearestNeighbours`` (both with ``random_state=1``);
    any other value returns the inputs unchanged.
    """
    if sampling_type == "smoteenn":
        x_out, y_out = SMOTEENN(random_state=1).fit_sample(x, y)
    elif sampling_type == "enn":
        x_out, y_out = EditedNearestNeighbours(random_state=1).fit_sample(x, y)
    else:
        x_out, y_out = x, y

    print("Before resampling:", sorted(Counter(y).items()))
    print("After resampling:", sorted(Counter(y_out).items()))
    return x_out, y_out
def test_enn_fit_sample_with_indices():
    """Resample with ``return_indices=True`` and check the kept indices too."""
    sampler = EditedNearestNeighbours(return_indices=True)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    expected_X = np.array([
        (-0.10903849, -0.12085181),
        (0.01936241, 0.17799828),
        (2.59928271, 0.93323465),
        (1.92365863, 0.82718767),
        (0.25738379, 0.95564169),
        (0.78318102, 2.59153329),
        (0.52726792, -0.38735648),
    ])
    expected_y = np.array([0] * 2 + [1] * 2 + [2] * 3)
    expected_idx = np.array([4, 11, 0, 3, 1, 8, 15])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_enn_fit_sample_mode():
    """Resample with ``kind_sel='mode'`` and compare with expectations."""
    sampler = EditedNearestNeighbours(kind_sel='mode')
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        (-0.10903849, -0.12085181),
        (0.01936241, 0.17799828),
        (2.59928271, 0.93323465),
        (1.42772181, 0.526027),
        (1.92365863, 0.82718767),
        (0.25738379, 0.95564169),
        (-0.284881, -0.62730973),
        (0.57062627, 1.19528323),
        (0.78318102, 2.59153329),
        (0.35831463, 1.33483198),
        (-0.14313184, -1.0412815),
        (-0.09816301, -0.74662486),
        (0.52726792, -0.38735648),
        (0.2821046, -0.07862747),
    ])
    expected_y = np.array([0] * 2 + [1] * 3 + [2] * 9)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_enn_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""
    sampler = EditedNearestNeighbours(return_indices=True,
                                      random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'enn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'enn_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'enn_idx.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def samplingMethod(X_train, y_train, sampling="None"):
    """Resample a sparse training set with the sampler named by ``sampling``.

    Recognised names are ``"SMOTE"``, ``"ENN"`` and ``"SMOTEENN"``; each is
    built with ``random_state=42, n_jobs=-1``. The matrix is densified with
    ``toarray()`` for resampling and wrapped back into a ``csr_matrix``.
    Any other value returns the inputs untouched.
    """
    if sampling == "SMOTE":
        sampler = SMOTE(random_state=42, n_jobs=-1)
    elif sampling == "ENN":
        sampler = EditedNearestNeighbours(random_state=42, n_jobs=-1)
    elif sampling == "SMOTEENN":
        sampler = SMOTEENN(random_state=42, n_jobs=-1)
    else:
        return X_train, y_train

    dense, y_train = sampler.fit_sample(X_train.toarray(), y_train)
    return csr_matrix(dense), y_train
def enn(self, data):
    '''
    Applies edited nearest neighbours to remove samples whose neighbours
    mostly belong to other classes.

    Arguments:
        data: dataframe holding the ``self.features`` columns and a
            ``label`` column

    Returns:
        A new dataframe with the ``self.features`` columns of the kept
        samples plus their ``label`` column.
    '''
    # ``DataFrame.as_matrix`` was removed in pandas 1.0; ``.values`` gives
    # the same ndarray on both old and new pandas versions.
    X = data[list(self.features)].values
    y = np.ravel(data[['label']].values)

    # Named ``sampler`` so the local does not shadow this method's name.
    sampler = EditedNearestNeighbours(ratio='all', kind_sel='mode',
                                      n_neighbors=5, random_state=42,
                                      n_jobs=4)
    X_res, y_res = sampler.fit_sample(X, y)

    df_enn = pd.DataFrame(X_res, columns=self.features)
    df_enn['label'] = y_res
    return df_enn
def plot_data(X, Y):
    """Scatter-plot the original data next to several resampled versions.

    Panels, left to right: the original data, SMOTE, Borderline-SMOTE
    (``borderline-1``), ENN cleaning followed by SMOTE, ADASYN, and
    ``dbscan_based.MultiDbscanBasedOverSample``. Only the first two feature
    columns are drawn, coloured by label; axes and ticks are hidden.

    The earlier version placed both the ENN+SMOTE panel and the ADASYN panel
    at grid position 4 of a 1x5 grid, so one was drawn over the other. The
    grid is now sized from the number of panels so each gets its own slot.
    """
    plt.rcParams['figure.figsize'] = (27.0, 5.0)
    fig = plt.figure()

    # Build every dataset first, in the same order the samplers used to run.
    panels = [(X, Y)]
    panels.append(SMOTE().fit_sample(X, Y))
    panels.append(BorderlineSMOTE(kind='borderline-1').fit_sample(X, Y))

    X_clean, Y_clean = EditedNearestNeighbours().fit_sample(X, Y)
    panels.append(SMOTE(k_neighbors=5).fit_sample(X_clean, Y_clean))

    panels.append(ADASYN(n_neighbors=3).fit_sample(X, Y))
    panels.append(
        dbscan_based.MultiDbscanBasedOverSample(eps=0.3, min_pts=5)
        .fit_sample(X, Y))

    n_panels = len(panels)
    for position, (X_panel, Y_panel) in enumerate(panels, start=1):
        ax = fig.add_subplot(1, n_panels, position)
        ax.scatter(X_panel[:, 0], X_panel[:, 1], c=Y_panel)
        # These pyplot calls act on the axes that was just added.
        plt.axis('off')
        plt.xticks([])
        plt.yticks([])

    plt.show()
def test_multiclass_fit_sample():
    """Test fit sample method with multiclass target"""
    # Turn the target into a three-class problem by relabelling the first
    # 1000 samples as class 2.
    multiclass_y = Y.copy()
    multiclass_y[:1000] = 2

    sampler = EditedNearestNeighbours(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, multiclass_y)

    # Check the per-class sizes after resampling.
    class_counts = Counter(y_resampled)
    for label, expected in ((0, 400), (1, 1836), (2, 5)):
        assert_equal(class_counts[label], expected)
def test_enn_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""
    sampler = EditedNearestNeighbours(return_indices=True,
                                      random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    expected_X = np.array([
        (-0.10903849, -0.12085181),
        (0.01936241, 0.17799828),
        (2.59928271, 0.93323465),
        (1.92365863, 0.82718767),
        (0.25738379, 0.95564169),
        (0.78318102, 2.59153329),
        (0.52726792, -0.38735648),
    ])
    expected_y = np.array([0] * 2 + [1] * 2 + [2] * 3)
    expected_idx = np.array([4, 11, 0, 3, 1, 8, 15])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def balance_data(X_content, ratings):
    """
    Balance the training data: first apply oversampling (SMOTE), afterwards
    clean the data / undersample (ENN). The data is handled in 20 chunks to
    avoid memory errors when densifying the sparse matrix.

    input arguments:
        X_content: The full feature matrix, not yet transformed to TFIDF format
        ratings: The corresponding ratings

    output arguments:
        return_csr: The balanced X_content, as a csr_matrix
        return_ratings: The balanced, corresponding ratings (list)
    """
    # SMOTE object for oversampling and ENN object for cleaning the
    # oversampled data.
    sm = SMOTE()
    enn = EditedNearestNeighbours()

    nr_revs = X_content.shape[0]
    nr_chunks = 20
    # Floor division keeps the chunk size an integer so it can be used as a
    # slice bound on every Python version.
    chunk = nr_revs // nr_chunks

    balanced_blocks = []
    return_ratings = []
    for x in range(nr_chunks):
        start = x * chunk
        # The last chunk also takes whatever remains after the even split.
        stop = (x + 1) * chunk if x < nr_chunks - 1 else nr_revs
        X_now = X_content[start:stop, :].toarray()
        ratings_now = ratings[start:stop]

        # Apply SMOTE once per minority class.
        for _ in range(4):
            X_now, ratings_now = sm.fit_sample(X_now, ratings_now)

        # Apply ENN for cleaning.
        X_now, ratings_now = enn.fit_sample(X_now, ratings_now)

        balanced_blocks.append(csr_matrix(X_now))
        return_ratings.extend(ratings_now)

    # The earlier code called vstack() inside the loop but threw the result
    # away, so an empty matrix was returned. Stack the collected blocks once.
    return_csr = csr_matrix(vstack(balanced_blocks))

    print("balanced")
    return return_csr, return_ratings
def test_enn_fit_sample_with_nn_object():
    """Test the fit sample routine using a NN object"""
    # Hand a pre-built NearestNeighbors estimator to the sampler instead of
    # an integer neighbour count.
    neighbors = NearestNeighbors(n_neighbors=4)
    sampler = EditedNearestNeighbours(n_neighbors=neighbors,
                                      random_state=RND_SEED,
                                      kind_sel='mode')
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        (-0.10903849, -0.12085181),
        (0.01936241, 0.17799828),
        (2.59928271, 0.93323465),
        (1.42772181, 0.526027),
        (1.92365863, 0.82718767),
        (0.25738379, 0.95564169),
        (-0.284881, -0.62730973),
        (0.57062627, 1.19528323),
        (0.78318102, 2.59153329),
        (0.35831463, 1.33483198),
        (-0.14313184, -1.0412815),
        (-0.09816301, -0.74662486),
        (0.52726792, -0.38735648),
        (0.2821046, -0.07862747),
    ])
    expected_y = np.array([0] * 2 + [1] * 3 + [2] * 9)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def run(X=None, Y=None, random_state=42, smote_ratio="minority",
        smote_kind="regular", enn_ratio="all", enn_kind_sel="all",
        enn_n_neighbors=3, save_dist=False, file=None):
    """Oversample with SMOTE, then clean the result with ENN.

    Parameters
    ----------
    X, Y : features and labels to resample.
    random_state : seed passed to both samplers.
    smote_ratio, smote_kind : forwarded to ``SMOTE`` as ``ratio`` / ``kind``;
        when ``smote_kind == "svm"`` an ``SVC()`` is supplied as
        ``svm_estimator``.
    enn_ratio, enn_kind_sel, enn_n_neighbors : forwarded to
        ``EditedNearestNeighbours``.
    save_dist : when true, append the class distribution after each step to
        ``file``.
    file : path opened in append mode when ``save_dist`` is true.

    Returns
    -------
    (X_st, Y_st) : the data after SMOTE followed by ENN.
    """
    if smote_kind == "svm":
        sm = SMOTE(random_state=random_state, ratio=smote_ratio,
                   kind=smote_kind, svm_estimator=SVC())
    else:
        sm = SMOTE(random_state=random_state, ratio=smote_ratio,
                   kind=smote_kind)
    enn = EditedNearestNeighbours(random_state=random_state, ratio=enn_ratio,
                                  kind_sel=enn_kind_sel,
                                  n_neighbors=enn_n_neighbors)

    X_resampled, Y_resampled = sm.fit_sample(X, Y)
    if save_dist:
        with open(file, "a") as arch:
            arch.write("SMOTE: " + str(Counter(Y_resampled)) + " ")

    X_st, Y_st = enn.fit_sample(X_resampled, Y_resampled)
    if save_dist:
        with open(file, "a") as arch:
            # Was ``Counter(y_st)``: an undefined name that raised NameError
            # whenever save_dist was enabled.
            arch.write("ENN:" + str(Counter(Y_st)) + "\n")

    return X_st, Y_st
def compare_different_oversample_method(model, sample_method, X, Y):
    """Cross-validate ``model`` with a chosen resampling of each training fold.

    Runs a shuffled 5-fold stratified split. In each fold the training part
    is resampled according to ``sample_method`` and the model is fitted and
    scored on the untouched test part with ``cal_multi_class_matrics``.

    Parameters
    ----------
    model : estimator with ``fit``, ``predict`` and ``predict_proba``.
    sample_method : ``'SMOTE_ENN'``, ``'smote'``, ``'borderline_smote'``,
        ``'adasyn'``, a sampler object exposing ``fit_sample``, or a falsy
        value for no resampling.
    X, Y : indexable feature and label arrays.

    Returns
    -------
    The four accumulated metrics averaged over the folds.
    """
    n_split = 5
    skf = StratifiedKFold(n_splits=n_split, shuffle=True)
    res_list = np.zeros(4)

    for cnt, (train_indices, test_indices) in enumerate(skf.split(X, Y), start=1):
        print('正在进行第{}次交叉验证'.format(cnt))
        train_X, train_Y = X[train_indices], Y[train_indices]
        test_X, test_Y = X[test_indices], Y[test_indices]

        # Largest neighbour count the rarest class can support. The earlier
        # code took min() over the Counter itself, i.e. over the class
        # *labels*, which gave -1 for a label of 0 and silently disabled
        # every oversampler.
        min_k_kearest = min(Counter(train_Y).values()) - 1
        can_oversample = min_k_kearest > 0

        if sample_method == 'SMOTE_ENN':
            train_X, train_Y = EditedNearestNeighbours().fit_sample(train_X, train_Y)
            if can_oversample:
                smo = SMOTE(k_neighbors=min(3, min_k_kearest))
                train_X, train_Y = smo.fit_sample(train_X, train_Y)
        elif sample_method == 'smote':
            if can_oversample:
                smo = SMOTE(k_neighbors=min(3, min_k_kearest))
                train_X, train_Y = smo.fit_sample(train_X, train_Y)
        elif sample_method == 'borderline_smote':
            if can_oversample:
                smo = BorderlineSMOTE(kind='borderline-1',
                                      k_neighbors=min(3, min_k_kearest))
                train_X, train_Y = smo.fit_sample(train_X, train_Y)
        elif sample_method == 'adasyn':
            if can_oversample:
                ada = ADASYN(n_neighbors=min(2, min_k_kearest))
                train_X, train_Y = ada.fit_sample(train_X, train_Y)
        elif sample_method:
            # Anything else truthy is treated as a ready-made sampler.
            train_X, train_Y = sample_method.fit_sample(train_X, train_Y)

        model.fit(train_X, train_Y)
        y_score = model.predict(test_X)
        y_score_prob = model.predict_proba(test_X)[:, 1]
        res_list += cal_multi_class_matrics(test_Y, y_score, y_score_prob)

    return res_list / n_split
# One step of a parameter sweep: `i` (ratio) and `j` (neighbour count) come
# from enclosing code that is not visible here. Two consecutive rows of
# `results` are filled per step — row `b` for class 0 and row `a` for class 1.
print("ratio", i)
results['ratio'][a] = i
print("neighbors", j)
results['neighbors'][a] = j
# `b` keeps the first row index; `a` advances to the second row.
b = a
a = a + 1
results['Class'][b] = 0
results['Class'][a] = 1
results['Datasize'][b] = datasize[0]
results['Datasize'][a] = datasize[1]
results['Training Datasize'][b] = trainingdatasize[0]
results['Training Datasize'][a] = trainingdatasize[1]
results['Testing Datasize'][b] = testingdatasize[0]
results['Testing Datasize'][a] = testingdatasize[1]
# Clean the already-sampled training data (`X_train_sampled1`) with ENN
# using the neighbour count under test.
enn = EditedNearestNeighbours(random_state=5, n_neighbors=j)
X_train_sampled, y_train_sampled = enn.fit_sample(
    X_train_sampled1, y_train_sampled1)
samplingdatasize = collections.Counter(y_train_sampled)
print("sampled training data size", samplingdatasize)
results['After sampling'][b] = samplingdatasize[0]
results['After sampling'][a] = samplingdatasize[1]
# Random forest fitted on the cleaned data; the out-of-bag score is reported.
clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0, oob_score=True)
clf.fit(X_train_sampled, y_train_sampled)
y_pred = clf.predict(X_test)
y_test_arr = np.array(y_test['Outcome'])
oobscore = clf.oob_score_
print("oob score", oobscore)
def hyperParamSearch(X_train, y_train, X_test, y_test, clf="logistic",
                     scoring='accuracy', preprocess='MaxMin',
                     sampling="None"):
    """Grid-search a preprocessing + classifier pipeline and report results.

    For every metric in ``scoring`` a 3-fold ``GridSearchCV`` is fitted on
    the (optionally resampled) training data, dumped with ``joblib`` to a
    ``final_*.pkl`` file in the working directory, and a report including
    the confusion matrix on the test set is printed.

    Parameters
    ----------
    X_train, y_train : training data; ``X_train`` must expose ``toarray()``
        when a sampling method is selected.
    X_test, y_test : evaluation data.
    clf : ``"logistic"``, ``"randomForest"`` or ``"KNN"``.
    scoring : a metric name or an iterable of metric names.
    preprocess : ``'MaxMin'`` (uses ``MaxAbsScaler``) or ``'Binarization'``.
    sampling : ``"SMOTE"``, ``"ENN"``, ``"SMOTEENN"``; anything else means
        no resampling.

    Raises
    ------
    ValueError
        If ``preprocess`` or ``clf`` is not one of the supported names.
    """
    # A bare string such as the default 'accuracy' would otherwise be
    # iterated character by character in the loop below.
    if isinstance(scoring, str):
        scoring = [scoring]

    # sampling
    if sampling == "SMOTE":
        sampler = SMOTE(random_state=42, n_jobs=-1)
    elif sampling == "ENN":
        sampler = EditedNearestNeighbours(random_state=42, n_jobs=-1)
    elif sampling == "SMOTEENN":
        sampler = SMOTEENN(random_state=42, n_jobs=-1)
    else:
        sampler = None
    if sampler is not None:
        X, y_train = sampler.fit_sample(X_train.toarray(), y_train)
        X_train = csr_matrix(X)

    # preprocessing
    if preprocess == 'MaxMin':
        preprocessing = ('MaxMin', MaxAbsScaler())
    elif preprocess == 'Binarization':
        preprocessing = ('Bin', Binarizer())
    else:
        raise ValueError("unsupported preprocess option: %r" % (preprocess,))

    # Parameters of pipelines can be set using '__' separated parameter names.
    if clf == "logistic":
        tuned_parameters = [{
            'logistic__penalty': ['l2'],
            'logistic__C': [0.001, 0.1, 1, 10, 100],
            'logistic__class_weight': [None]
        }]
        pipe = Pipeline(
            steps=[preprocessing, ('logistic', LogisticRegression(n_jobs=-1))])
    elif clf == "randomForest":
        tuned_parameters = [{
            'randomForest__n_estimators': [100, 500],
            'randomForest__min_samples_leaf': [1, 10, 25],
            'randomForest__class_weight': [None, 'balanced']
        }]
        pipe = Pipeline(steps=[
            preprocessing, ('randomForest', RandomForestClassifier(n_jobs=-1))
        ])
    elif clf == "KNN":
        tuned_parameters = [{
            'KNN__n_neighbors': [5, 10, 20, 40],
            'KNN__weights': ['distance', 'uniform'],
            'KNN__metric': ['euclidean', 'manhattan']
        }]
        pipe = Pipeline(
            steps=[preprocessing, ('KNN', KNeighborsClassifier(n_jobs=-1))])
    else:
        raise ValueError("unsupported clf option: %r" % (clf,))

    for score in scoring:
        estimator = GridSearchCV(pipe, tuned_parameters, cv=3, scoring=score,
                                 error_score=-1, n_jobs=-1)
        estimator.fit(X_train, y_train)

        save_name = "final_%s(%s based_%s preprocessed_%s sampling).pkl" % (
            clf, score, preprocess, sampling)
        joblib.dump(estimator, save_name, compress=True)

        # print information
        print("************************* GENERAL INFO ***********************")
        print(" - classifier : %s" % (clf))
        print(" - sampling : %s" % (sampling))
        print(" - preprocessing : %s" % (preprocess))
        print(" - hyperParam based on : %s" % (score))
        print("**************************************************************")
        print("Best parameters set found on development set:")
        print(estimator.best_params_)
        print("%s scores on development set:" % (score))
        means = estimator.cv_results_['mean_test_score']
        stds = estimator.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds,
                                     estimator.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print("Detailed classification report:")
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        y_true, y_pred = y_test, estimator.predict(X_test)
        confus = confusion_matrix(y_true, y_pred)
        # Function-call form prints the same text on Python 2 and 3.
        print('*****CV python****')
        print(confus)
def test_enn_not_good_object():
    """Check that an arbitrary string given as ``n_neighbors`` is rejected.

    The sampler is built with ``n_neighbors='rnd'`` and resampling is
    expected to raise ``ValueError`` with a "has to be one of" message.
    """
    invalid_neighbors = 'rnd'
    sampler = EditedNearestNeighbours(kind_sel='mode',
                                      n_neighbors=invalid_neighbors)
    with raises(ValueError, match="has to be one of"):
        sampler.fit_sample(X, Y)
# Optional diagnostic: show the feature correlation heatmap and stop.
if corr:
    df_corr = df.corr()
    plt.figure(figsize=(15,10))
    seaborn.heatmap(df_corr, cmap="YlGnBu") # Displaying the Heatmap
    # NOTE(review): seaborn.set() is called after the heatmap is drawn, so it
    # presumably does not affect this figure — confirm the intended order.
    seaborn.set(font_scale=2,style='white')
    plt.title('Heatmap correlation')
    plt.show()
    exit()
# NOTE(review): DataFrame.as_matrix was removed in pandas 1.0; this code
# needs an older pandas (or df[cols].values) to run.
X_train = df.as_matrix(columns = ['gaze0_x','gaze0_y','gaze0_z','gaze1_x','gaze1_y','gaze1_z','poser_x','poser_y','poser_z','au23','au05','au12']) ## Features with High Correlation and Importance Values
train_label1 = df.as_matrix(columns = ['label'])
y_train = np.ravel(train_label1)
# Despite the name `rus`, the sampler is Edited Nearest Neighbours.
rus = EditedNearestNeighbours(random_state=42)
X_resampled, y_resampled = rus.fit_sample(X_train, y_train)
# The test set is read from test.csv with the same feature columns and is
# not resampled.
df1 = pd.read_csv('test.csv')
test_data = df1.as_matrix(columns = ['gaze0_x','gaze0_y','gaze0_z','gaze1_x','gaze1_y','gaze1_z','poser_x','poser_y','poser_z','au23','au05','au12'])
test_label1 = df1.as_matrix(columns = ['label'])
test_label = np.ravel(test_label1)
# Optional SVC grid search over C in 10^-2..10^2 and gamma in 10^-3..10^1,
# then stop.
if gridsearch:
    C_range = 10. ** np.arange(-2, 3)
    gamma_range = 10. ** np.arange(-3, 2)
    param_grid = dict(gamma=gamma_range, C=C_range)
    # NOTE(review): StratifiedKFold(y=..., n_folds=...) is the signature of
    # the old sklearn.cross_validation module — confirm which one is imported.
    grid = GridSearchCV(SVC(), param_grid=param_grid, cv=StratifiedKFold(y=y_resampled, n_folds=5))
    grid.fit(X_resampled, y_resampled)
    print("The best classifier is: ", grid.best_estimator_)
    exit()
clf_smote.fit(X_smote, y_smote)
preditions_smote = clf_smote.predict(X_test)
# Learning curve for the SMOTE-trained model
train_sizes, train_scores, test_scores = learning_curve(
    estimator=clf_smote, X=X_smote, y=y_smote,
    train_sizes=np.linspace(0.05, 1, 10), cv=10, n_jobs=1, random_state=0)
train_mean_smote = np.mean(train_scores, axis=1)
test_mean_smote = np.mean(test_scores, axis=1)
train_std_smote = np.std(train_scores, axis=1)
# Fixed: the validation spread was previously computed from train_scores.
test_std_smote = np.std(test_scores, axis=1)
###################################################################
## ENN
from imblearn.under_sampling import EditedNearestNeighbours
ENN = EditedNearestNeighbours(random_state=42)
X_enn, y_enn = ENN.fit_sample(X_train, y_train)
## Build the model
clf_enn = RandomForestClassifier(oob_score=True)
clf_enn.fit(X_enn, y_enn)
preditions_enn = clf_enn.predict(X_test)
# Learning curve for the ENN-trained model
train_sizes, train_scores, test_scores = learning_curve(
    estimator=clf_enn, X=X_enn, y=y_enn,
    train_sizes=np.linspace(0.05, 1, 10), cv=10, n_jobs=1, random_state=0)
train_mean_enn = np.mean(train_scores, axis=1)
test_mean_enn = np.mean(test_scores, axis=1)
train_std_enn = np.std(train_scores, axis=1)
# Fixed: the validation spread was previously computed from train_scores.
test_std_enn = np.std(test_scores, axis=1)
###################################################################
## SMOTE+ENN
def pre_process(train_index, test_index):
    """Prepare one train/test split: class balancing, then feature selection.

    The split is taken from ``X_train_all`` / ``y_train`` using the given
    index arrays. ``class_balance_method`` picks the under-sampler(s) applied
    to the training part only, and ``feature_selection_method`` picks the
    feature-selection helper(s) applied to both parts. Both settings, along
    with ``X_df`` and ``p_val_thresh``, are read from the enclosing scope.

    Returns ``(train_x, train_y, test_x, test_y, selected_features,
    feature_scores)``; ``feature_scores`` stays ``'N/A'`` unless a
    relief-based method ran.
    """
    train_x, test_x = X_train_all[train_index], X_train_all[test_index]
    train_y, test_y = y_train[train_index], y_train[test_index]

    # --- Class balance on the training split -----------------------------
    # Collect the samplers to run, in order; combined methods run Tomek
    # links first and the ENN variant second.
    balance = class_balance_method
    samplers = []
    if balance == 'rand_under':
        samplers.append(
            RandomUnderSampler(sampling_strategy='majority', random_state=0))
    if balance in ('tomek', 'tomek_enn', 'tomek_renn'):
        samplers.append(TomekLinks(random_state=0))
    if balance in ('enn', 'tomek_enn'):
        samplers.append(
            EditedNearestNeighbours(n_neighbors=5, random_state=0, n_jobs=1))
    if balance in ('renn', 'tomek_renn'):
        samplers.append(
            RepeatedEditedNearestNeighbours(n_neighbors=5, random_state=0,
                                            n_jobs=1))
    for sampler in samplers:
        train_x, train_y = sampler.fit_sample(train_x, train_y)

    # --- Feature selection on the training split -------------------------
    # Scores are only produced by the relief-based selectors.
    feature_scores = 'N/A'
    selection = feature_selection_method
    if selection == 'no':
        selected_features = X_df.columns
    elif selection in ('chi2', 'anovaF'):
        filter_fs = chi2_fs if selection == 'chi2' else anova_fs
        selected_features, X_train_df, train_x, test_x = filter_fs(
            X_df, train_x, test_x, train_y, p_val_thresh)
    elif selection in ('reliefF', 'multisurf'):
        relief_fs = relieff_fs if selection == 'reliefF' else multisurf_fs
        selected_features, feature_scores, train_x, test_x = relief_fs(
            X_df, train_x, test_x, train_y)
    elif selection in ('chi2_reliefF', 'chi2_multisurf',
                       'anova_reliefF', 'anova_multisurf'):
        # Two-stage selection: a statistical filter first, then a
        # relief-based selector on the filtered features.
        filter_fs = chi2_fs if selection.startswith('chi2') else anova_fs
        relief_fs = relieff_fs if selection.endswith('reliefF') else multisurf_fs
        _, filtered_df, filtered_train, filtered_test = filter_fs(
            X_df, train_x, test_x, train_y, p_val_thresh)
        selected_features, feature_scores, train_x, test_x = relief_fs(
            filtered_df, filtered_train, filtered_test, train_y)

    return train_x, train_y, test_x, test_y, selected_features, feature_scores
for i in r2l: y_3[y_3 == i] = 'R2L' # r2l for i in dos: y_3[y_3 == i] = 'DOS' # dos for i in probe: y_3[y_3 == i] = 'Probing' # probe y_3[y_3 == "normal."] = 'Normal' # normal y_3 = np.array(y_3) # 变成array格式,一维 classes=['Normal','Probing','DOS','U2R','R2L'] colors=['blue','red','y','m','g'] #欠采样 ENN from sklearn.manifold import TSNE from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split from imblearn.under_sampling import EditedNearestNeighbours params=[3,6,9,12,15,18] for i in params: oversampler = EditedNearestNeighbours(random_state=42, n_neighbors=i) X_enn, y_e = oversampler.fit_sample(X_3, y_3) #标准化 scaler=StandardScaler().fit(X_enn) X_e=scaler.transform(X_enn) #可视化 X_train_2,X_test_2,y_train_2,y_test_2=train_test_split(X_e,y_e,test_size=0.2,random_state=0) #切分样本 X_embedded = TSNE(n_components=2).fit_transform(X_test_2) plt.figure() plt.title("ENN") for index,label,color in zip(range(len(classes)),classes,colors): plt.scatter(X_embedded[y_test_2==label,0],X_embedded[y_test_2==label,1],label=classes[index],c=color) plt.legend(loc='best') plt.show()
# # print('重新取样数据集的形状 - Resampled dataset shape {}'.format(Counter(sm_target))) # # End: 过采样使用SMOTE - oversampling using smote classfication(sm_data, sm_target, "Data after oversampling using SMOTE") # # Start: 欠采样使用tomekLink - undersampling using tomekLink tlink = TomekLinks(random_state=42, ratio='auto') tl_data, tl_target = tlink.fit_sample(ada_data, ada_target) print('Resampled dataset shape {}'.format(Counter(tl_target))) # # # End: 欠采样使用tomekLink - undersampling using tomekLink classfication(tl_data, tl_target, "ADASYN Data after cleaning using TomekLink") # # Start: 欠采样使用CondensedNearesNeighbors - undersampling using CondensedNearesNeighbors enn = EditedNearestNeighbours(random_state=42, n_neighbors=1, ratio='auto') enn_data, enn_target = enn.fit_sample(X_data, target) # # print('重新取样数据集的形状 - Resampled dataset shape {}'.format(Counter(enn_target))) # # End: 欠采样使用CondensedNearesNeighbors - undersampling using CondensedNearesNeighbors classfication( enn_data, enn_target, "使用随机采样器进行欠采样后的数据 - Data after under sampling using Edited Nearest Neighbors" ) # Start:欠采样使用RandomUnderSampler - undersampling using RandomUnderSampler rus = RandomUnderSampler(random_state=42) rus_data, rus_target = rus.fit_sample(X_data, target) print('Resampled dataset shape {}'.format(Counter(rus_target))) # End : 欠采样使用RandomUnderSampler - undersampling using RandomUnderSampler classfication(
cv=10, scoring=('roc_auc', 'average_precision')) scores['test_roc_auc'].mean(), scores['test_average_precision'].mean() # (0.9518183780276207, 0.6767076447148238) ######### Edited Nearest Neighbor ######### # removes all samples that are misclassified by KNN from the training data (`mode`) # Or if have any point from other class as neighbor (`all`) # So basically, what you're doing here is you clean up outliers and boundaries. from imblearn.under_sampling import EditedNearestNeighbours enn = EditedNearestNeighbours(n_neighbors=5) X_train_enn, y_train_enn = enn.fit_sample(X_train, y_train) enn_mode = EditedNearestNeighbours(kind_sel="mode", n_neighbors=5) X_train_enn_mode, y_train_enn_mode = enn_mode.fit_sample(X_train, y_train) print(X_train_enn_mode.shape) print(np.bincount(y_train_enn_mode)) ### Pipeline method enn_pipe = make_imb_pipeline(EditedNearestNeighbours(n_neighbors=5), LogisticRegression()) scores = cross_validate(enn_pipe, X_train, y_train, cv=10, scoring=('roc_auc', 'average_precision'))
# Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Three subplots, unpack the axes array immediately f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') # Apply the ENN print('ENN') enn = EditedNearestNeighbours() X_resampled, y_resampled = enn.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) print('Reduced {:.2f}\%'.format(100 * (1 - float(len(X_resampled))/ len(X)))) ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1], label="Class #1", alpha=.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax2.set_title('Edited nearest neighbours') # Apply the RENN print('RENN') renn = RepeatedEditedNearestNeighbours() X_resampled, y_resampled = renn.fit_sample(X, y)
# # In[135]: # # # smotenc+enn X_smote = np.array(df_smotenc[[ 'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21' ]]) Y_smote = list(df_smotenc['click']) # from imblearn.under_sampling import EditedNearestNeighbours enn = EditedNearestNeighbours() X_resampled, y_resampled = enn.fit_sample(X_smotenc, y_smotenc) # # # In[52]: # # # df_smotenc = pd.DataFrame(X_smotenc, # columns=column1) # df_smotenc = pd.concat([df_smotenc, pd.DataFrame(y_smotenc, columns=['click'])], axis=1) # for i in column1: # df_smotenc[i] = df_smotenc[i].astype(int) # # # In[53]: # # # df_smX_resampledotenc.head() #
def test_deprecation_random_state(): enn = EditedNearestNeighbours(random_state=0) with warns(DeprecationWarning, match="'random_state' is deprecated from 0.4"): enn.fit_sample(X, Y)
random_state=np.random.randint(100), kind='regular', n_jobs=-1) os_X_train, os_y_train = oversampler.fit_sample(X_train.fillna(0), y_train) ##ADASYN 运行起来很慢### X_resampled_adasyn, y_resampled_adasyn = ADASYN( sampling_strategy=0.2, n_jobs=-1).fit_sample(train.loc[:, feature].fillna(0).values, train["y"].values.astype('int')) ###删除边界的一些噪声点### from imblearn.under_sampling import EditedNearestNeighbours enn = EditedNearestNeighbours(random_state=0) X_resampled, y_resampled = enn.fit_sample(X, y) dtrain = xgb.DMatrix(data=train.loc[:, feature].astype('float'), label=train['y'].astype('int')) dval = xgb.DMatrix(data=val.loc[:, feature].astype('float'), label=val['y'].astype('int')) train.loc[:, feature].info(null_counts=True) params = { 'booster': 'gbtree', 'objective': 'binary:logistic', 'eval_metric': 'auc', 'max_depth': 6, 'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 2,
class SMOTEENN(SMOTEENN):
    """Variant of ``imblearn.combine.SMOTEENN`` that cleans before oversampling.

    ``_sample`` runs the ENN step first and SMOTE second, and the default
    estimators differ from the parent's (see ``_validate_estimator``).
    """

    def __init__(self, ratio='auto', random_state=None, smote=None, enn=None):
        """
        Creates the sampler; all arguments are forwarded to the parent class.

        :param ratio: str, dict, or callable, optional (default='auto')
            Ratio to use for resampling the data set.
            - If "str", has to be one of: (i) 'minority', (ii) 'majority',
              (iii) 'not minority', (iv) 'all', or (v) 'auto', which
              corresponds to 'all' for over-sampling methods and
              'not_minority' for under-sampling methods.
            - If "dict", the keys are the targeted classes and the values
              the desired number of samples.
            - If callable, a function taking "y" and returning such a dict.
        :param random_state: int, RandomState instance or None, optional
            (default=None) Seed or generator used for random numbers; None
            uses the RandomState instance of 'np.random'.
        :param smote: object, optional
            The imblearn.over_sampling.SMOTE object to use. If None, one is
            created in ``_validate_estimator``.
        :param enn: object, optional
            The imblearn.under_sampling.EditedNearestNeighbours object to
            use. If None, one is created in ``_validate_estimator``.
        """
        super(SMOTEENN, self).__init__(ratio=ratio,
                                       random_state=random_state,
                                       smote=smote,
                                       enn=enn)

    def _validate_estimator(self):
        """
        Private function to validate SMOTE and ENN objects.

        Sets ``self.smote_`` and ``self.enn_`` from the user-supplied
        objects, or builds defaults when they are None.

        :return:
        """
        # SMOTE step: default uses this sampler's ratio with 3 neighbours.
        if self.smote is None:
            self.smote_ = SMOTE(ratio=self.ratio, k_neighbors=3,
                                random_state=self.random_state)
        elif isinstance(self.smote, SMOTE):
            self.smote_ = self.smote
        else:
            raise ValueError(
                'smote needs to be a SMOTE object.Got {} instead.'.format(
                    type(self.smote)))

        # ENN step: default cleans every class using the 'mode' criterion.
        if self.enn is None:
            self.enn_ = EditedNearestNeighbours(
                ratio="all", kind_sel="mode", random_state=self.random_state)
        elif isinstance(self.enn, EditedNearestNeighbours):
            self.enn_ = self.enn
        else:
            raise ValueError(
                'enn needs to be an EditedNearestNeighbours. Got {} instead.'
                .format(type(self.enn)))

    def fit(self, X, y):
        """
        Find the classes statistics before to perform sampling.

        Delegates directly to the parent implementation.

        :param X: {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.
        :param y: array-like, shape (n_samples,)
            Corresponding label for each sample in X.
        :return: object; Return self
        """
        return super(SMOTEENN, self).fit(X, y)

    def _sample(self, X, y):
        """
        Apply ENN first to remove problematic samples, then apply SMOTE.

        :param X: {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.
        :param y: array-like, shape (n_samples,)
            Corresponding label for each sample in X.
        :return: X_resampled, y_resampled
        """
        self._validate_estimator()
        X_cleaned, y_cleaned = self.enn_.fit_sample(X, y)
        return self.smote_.fit_sample(X_cleaned, y_cleaned)