def undersampling(X, y, sampling_strategy='auto', n_neighbors=1, n_jobs=36):
    """Under-sample ``(X, y)`` with One-Sided Selection.

    Parameters
    ----------
    X, y
        Samples and labels, passed unchanged to the sampler.
    sampling_strategy
        Forwarded to ``OneSidedSelection`` (default ``'auto'``).
    n_neighbors
        Forwarded to ``OneSidedSelection`` (default ``1``).
    n_jobs
        Forwarded to ``OneSidedSelection``. Defaults to 36, the value that
        used to be hard-coded here, so existing callers are unaffected.

    Returns
    -------
    tuple
        ``(X_us, y_us)``: copies (via ``.copy()``) of the resampled data.
    """
    sampler = OneSidedSelection(
        n_jobs=n_jobs,
        sampling_strategy=sampling_strategy,
        n_neighbors=n_neighbors,
    )
    # ``fit_sample`` was removed in imbalanced-learn 0.8. ``fit_resample``
    # exists in every release that accepts ``sampling_strategy`` (>= 0.4).
    X_us, y_us = sampler.fit_resample(X, y)
    return X_us.copy(), y_us.copy()
def test_oss_with_object():
    """Check OSS output for a KNN estimator and for the integer ``1``.

    Both spellings of ``n_neighbors`` must yield the same resampled set.
    """
    expected_X = np.array([[-0.3879569, 0.6894251],
                           [0.91542919, -0.65453327],
                           [-0.65571327, 0.42412021],
                           [1.06446472, -1.09279772],
                           [0.30543283, -0.02589502],
                           [-0.00717161, 0.00318087],
                           [-0.09322739, 1.28177189],
                           [-0.77740357, 0.74097941],
                           [-0.43877303, 1.07366684],
                           [-0.85795321, 0.82980738],
                           [-0.30126957, -0.66268378],
                           [0.20246714, -0.34727125]])
    expected_y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

    # First the estimator object, then the plain integer.
    for neighbors in (KNeighborsClassifier(n_neighbors=1), 1):
        sampler = OneSidedSelection(random_state=RND_SEED,
                                    n_neighbors=neighbors)
        got_X, got_y = sampler.fit_sample(X, Y)
        assert_array_equal(got_X, expected_X)
        assert_array_equal(got_y, expected_y)
def test_oss_fit_sample():
    """Compare ``fit_sample`` output with the stored ``oss_*.npy`` arrays."""
    # Ground-truth arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    expected_X = np.load(os.path.join(data_dir, 'oss_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'oss_y.npy'))

    sampler = OneSidedSelection(random_state=RND_SEED)
    got_X, got_y = sampler.fit_sample(X, Y)

    assert_array_equal(got_X, expected_X)
    assert_array_equal(got_y, expected_y)
def test_oss_fit_sample():
    """Resample with a fixed seed and check against the saved reference."""
    got_X, got_y = OneSidedSelection(random_state=RND_SEED).fit_sample(X, Y)

    # Reference arrays are stored under ``data/`` beside this test module.
    here = os.path.dirname(os.path.abspath(__file__))
    reference = {
        name: np.load(os.path.join(here, 'data', name))
        for name in ('oss_x.npy', 'oss_y.npy')
    }

    assert_array_equal(got_X, reference['oss_x.npy'])
    assert_array_equal(got_y, reference['oss_y.npy'])
def test_oss_with_object():
    """Test the fit sample routine with a knn object and with an int."""
    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [0.20246714, -0.34727125],
    ])
    expected_y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

    def check(n_neighbors):
        # Resample the data with the given neighbours spec and compare.
        sampler = OneSidedSelection(random_state=RND_SEED,
                                    n_neighbors=n_neighbors)
        res_X, res_y = sampler.fit_sample(X, Y)
        assert_array_equal(res_X, expected_X)
        assert_array_equal(res_y, expected_y)

    check(KNeighborsClassifier(n_neighbors=1))
    check(1)
def use_OSSSMOTEENN(self):
    """Rebalance the dataset at ``self.path`` with OSS followed by SMOTE+ENN.

    Steps: load ``(X, y)`` through ``preparation(self.path)``, under-sample
    the majority class with One-Sided Selection, apply SMOTEENN to the
    result, write the resampled data to a CSV next to the input, and plot a
    2-component PCA projection. A bar chart of label counts is shown after
    each stage.

    Returns
    -------
    str
        Path of the written CSV: ``self.path`` with ``'.csv'`` replaced by
        ``'_OSSSMOTEENN_Final_Test.csv'``.
    """
    def _show_label_counts(labels):
        # Bar chart of how many samples each label has.
        pd.DataFrame(labels).value_counts().plot(kind='bar',
                                                 title='Count(label)')
        plt.show()

    # Compute the output path once; it is both written to and returned.
    out_path = self.path.replace('.csv', '_OSSSMOTEENN_Final_Test.csv')

    X, y = preparation(self.path)
    _show_label_counts(y)

    # ``fit_sample`` was removed in imbalanced-learn 0.8. ``fit_resample``
    # exists in every release that accepts ``sampling_strategy`` (>= 0.4).
    oss = OneSidedSelection(random_state=42, n_jobs=-1,
                            sampling_strategy="majority")
    X_res, y_res = oss.fit_resample(X, y)
    _show_label_counts(y_res)

    sme = SMOTEENN(random_state=42, n_jobs=-1)
    X_sme, y_sme = sme.fit_resample(X_res, y_res)
    _show_label_counts(y_sme)

    # Generate the CSV: features followed by the label column.
    # No index and no header row are written. The original remark here was
    # "the first line of data will be delete".
    # NOTE(review): presumably the downstream reader drops the first row;
    # confirm against the consumer of this file.
    df = pd.concat([X_sme, pd.DataFrame(y_sme)], axis=1)
    df.to_csv(out_path, index=False, header=False, float_format='%.4f')

    # Project to 2-D for plotting. A separate name is used so ``X_sme`` keeps
    # referring to the full-dimensional resampled data.
    pca = PCA(n_components=2)
    X_2d = pca.fit_transform(X_sme)
    plot_2d_space(X_2d, y_sme, 'SMOTE + ENN')

    return out_path

# if __name__ == '__main__':
#     path ="++Final_Test++_pre.csv"
#     #draw_bar(path)
#     mhi = My_handle_imbalance(path)
#     mhi.use_OSSSMOTEENN()
#
#     #use_SMOTETomek(path)
#     #draw_origin(path)
def test_oss_fit_sample_with_indices():
    """Check resampled data and selected indices against stored arrays."""
    # Ground-truth arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    expected_X = np.load(os.path.join(data_dir, 'oss_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'oss_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'oss_idx.npy'))

    sampler = OneSidedSelection(return_indices=True, random_state=RND_SEED)
    got_X, got_y, got_idx = sampler.fit_sample(X, Y)

    assert_array_equal(got_X, expected_X)
    assert_array_equal(got_y, expected_y)
    assert_array_equal(got_idx, expected_idx)
def test_oss_fit_sample_with_indices():
    """Resample with ``return_indices=True`` and compare to the reference."""
    results = OneSidedSelection(return_indices=True,
                                random_state=RND_SEED).fit_sample(X, Y)
    got_X, got_y, got_idx = results

    # Reference arrays are stored under ``data/`` beside this test module.
    here = os.path.dirname(os.path.abspath(__file__))

    def load(filename):
        return np.load(os.path.join(here, 'data', filename))

    ref_X = load('oss_x.npy')
    ref_y = load('oss_y.npy')
    ref_idx = load('oss_idx.npy')

    assert_array_equal(got_X, ref_X)
    assert_array_equal(got_y, ref_y)
    assert_array_equal(got_idx, ref_idx)
def test_oss_fit_sample_with_indices():
    """Check samples, labels and kept indices for a seeded OSS run."""
    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [0.20246714, -0.34727125],
    ])
    expected_y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    expected_idx = np.array([0, 3, 9, 12, 13, 14, 1, 2, 5, 6, 8, 11])

    sampler = OneSidedSelection(return_indices=True, random_state=RND_SEED)
    got_X, got_y, got_idx = sampler.fit_sample(X, Y)

    assert_array_equal(got_X, expected_X)
    assert_array_equal(got_y, expected_y)
    assert_array_equal(got_idx, expected_idx)
def test_oss_fit_sample():
    """Seeded OSS run must reproduce the hard-coded samples and labels."""
    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [0.20246714, -0.34727125],
    ])
    expected_y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

    # Resample the data.
    sampler = OneSidedSelection(random_state=RND_SEED)
    got_X, got_y = sampler.fit_sample(X, Y)

    assert_array_equal(got_X, expected_X)
    assert_array_equal(got_y, expected_y)
def test_oss_fit_sample_with_indices():
    """Seeded OSS run with indices must match the hard-coded reference."""
    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [0.20246714, -0.34727125],
    ])
    expected_y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    expected_idx = np.array([0, 3, 9, 12, 13, 14, 1, 2, 5, 6, 7, 10])

    # Resample the data, asking for the kept indices as a third output.
    results = OneSidedSelection(return_indices=True,
                                random_state=RND_SEED).fit_sample(X, Y)
    got_X, got_y, got_idx = results

    assert_array_equal(got_X, expected_X)
    assert_array_equal(got_y, expected_y)
    assert_array_equal(got_idx, expected_idx)
# Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply One-Sided Selection oss = OneSidedSelection() X_resampled, y_resampled = oss.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
# ROS와 SMOTE data unique성 비교 import pandas as pd X_sampled1 = pd.DataFrame(X_sampled1) len(X_sampled1.drop_duplicates()) # unique가 많지않다 X_sampled2 = pd.DataFrame(X_sampled2) len(X_sampled2.drop_duplicates()) # new data가 있기 때문에 unique가 상대적으로 많긴 하다. # Tomek Link from imblearn.under_sampling import TomekLinks tl = TomekLinks(return_indices=True) X_resampled, y_resampled, inds = tl.fit_sample(X, y) # One-sided selection # remove every data point, 근데 그 전에 k-nn을 적용해야한다. from imblearn.under_sampling import OneSidedSelection oss = OneSidedSelection(n_neighbors=1, n_seeds_S=1) X_resampled, y_resampled = oss.fit_sample(X, y) #Cost-sensitive Learning svc = SVC(kernel='linear', class_weight={1: 10}) svc.fit(X, y) y_pred = svc.predict(X) recall_score(y, y_pred)
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import NearMiss
from notify import notify

# Load the stored EEG features and flatten the labels to 1-D.
eeg_feat = readStoredData('eeg_pat22_feats.p')
X = eeg_feat['feats']
y = np.ravel(eeg_feat['labels'])
nsamples,nfeats = X.shape
# Only referenced by the commented-out ``ratio=classRatio`` calls below.
# NOTE(review): 130.0e3 is a hard-coded divisor, presumably a sample count
# for this recording; confirm, or derive it from the data.
classRatio = np.sum(y)/130.0e3
#%%
# Python 2 print statement: this script targets Python 2.
print 'starting OSS'
# NOTE(review): ``size_ngh`` looks like the legacy spelling of ``n_neighbors``
# in early imbalanced-learn releases; confirm against the installed version.
OSS = OneSidedSelection(size_ngh=51, n_seeds_S=51, n_jobs=-1)
ossx, ossy = OSS.fit_sample(X, y)
#%%
#print 'starting CC_cr'
#CC = ClusterCentroids(n_jobs=-1,ratio=classRatio)
#ccx_cr,ccy_cr = CC.fit_sample(X,y)
#%%
#print 'starting CC_a'
#CC = ClusterCentroids(n_jobs=-1,ratio='auto')
#ccx_a,ccy_a = CC.fit_sample(X,y)
#%%
#print 'starting NM3_cr'
#NM3 = NearMiss(version=3,n_jobs=-1,ratio=classRatio)
#nm3x_cr, nm3y_cr = NM3.fit_sample(X, y)
# Python 2 print statements: this script targets Python 2.
# Show the relative frequency of each target class.
print data['target'].value_counts(normalize=True)
print "d"
# Drop the label and id columns so ``data`` holds features only.
# NOTE(review): ``y`` used below is not defined in this fragment; presumably
# it was taken from data['target'] before this point. Confirm.
del data['target']
del data['connection_id']
from sklearn.model_selection import train_test_split
testdata = pd.read_csv('test_data.csv')
# Keep the test connection ids aside before removing the column.
d = testdata['connection_id']
del testdata['connection_id']
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(data)
# Apply One-Sided Selection under-sampling (despite the ``rus`` name, the
# sampler is OneSidedSelection, not a random under-sampler).
rus = OneSidedSelection(return_indices=True)
X_resampled, y_resampled, idx_resampled = rus.fit_sample(data, y)
# Base estimator for Boruta; it is only constructed here, the fitting happens
# inside ``feat_selector.fit`` below.
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)
# find all relevant features on the resampled data
# (original note: "5 features should be selected"; not verifiable from here)
feat_selector.fit(X_resampled, y_resampled)
print feat_selector.support_
# check ranking of features
print feat_selector.ranking_
# The second algorithm uses an undersampling technique called One Sided
# Selection. Because a majority of our data fits into only 2 of our 7 loan
# status categories, we must be careful that our algorithm doesn't simply
# decide to put all loans into those two categories and call it a day. For
# this run, we use the undersampled data to train the algorithm and the
# entire dataset to test accuracy.

# In[33]:

y = df_ml['loan_status']
X = df_ml.drop(['loan_status'], axis=1)

# In[34]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
oss = OneSidedSelection()
X_resampled, y_resampled = oss.fit_sample(X, y)

# In[35]:

# Each run is (report title, training data, evaluation data): first the
# plain train/test split, then training on the under-sampled data and
# evaluating on the entire dataset.
for report_title, (fit_X, fit_y), (eval_X, eval_y) in (
    ("Decision tree - Normal data set",
     (X_train, y_train), (X_test, y_test)),
    ("Decision tree - Imbalanced lean -One Sided Selection",
     (X_resampled, y_resampled), (X, y)),
):
    dt = DecisionTreeClassifier()
    dt.fit(fit_X, fit_y)
    print(report_title)
    eval_pred = dt.predict(eval_X)
    print(metrics.accuracy_score(eval_y, eval_pred))
    print(metrics.confusion_matrix(eval_y, eval_pred))
# Under Sampling from imblearn.under_sampling import RandomUnderSampler rus = RandomUnderSampler(return_indices = True) under_X, under_y, inds = rus.fit_sample(X,y) logistic_(pd.DataFrame(under_X), pd.DataFrame(under_y)) svc_(pd.DataFrame(under_X),pd.DataFrame(under_y)) # SMOTE from imblearn.over_sampling import SMOTE smote = SMOTE(k_neighbors = 10) smote_X, smote_y = smote.fit_sample(X,y) logistic_(pd.DataFrame(smote_X), pd.DataFrame(smote_y)) svc_(pd.DataFrame(smote_X), pd.DataFrame(smote_y)) # time spend # Tomek Link from imblearn.under_sampling import TomekLinks tl = TomekLinks(return_indices = True) tomek_X , tomek_y, inds = tl.fit_sample(X,y) logistic_(pd.DataFrame(tomek_X), pd.DataFrame(tomek_y)) svc_(pd.DataFrame(tomek_X), pd.DataFrame(tomek_y)) # One-sided selection from imblearn.under_sampling import OneSidedSelection oss = OneSidedSelection(n_neighbors=1, n_seeds_S=1) os_X, os_y = oss.fit_sample(X,y) logistic_(pd.DataFrame(os_X), pd.DataFrame(os_y)) svc_(pd.DataFrame(os_X), pd.DataFrame(os_y))
def test_oss_with_wrong_object():
    """A string ``n_neighbors`` must make ``fit_sample`` raise ValueError."""
    sampler = OneSidedSelection(random_state=RND_SEED, n_neighbors='rnd')
    # Construction succeeds; the error is expected from the fit call.
    with raises(ValueError, match="has to be a int"):
        sampler.fit_sample(X, Y)
############################################################################### ### Condensed Nearest Neighbor cnn = CondensedNearestNeighbour(return_indices=True) X_resampled, y_resampled, idx_resampled = cnn.fit_sample(X_train, y_train) plot_(X_resampled, y_resampled, remove=False) plot_(X_resampled, y_resampled, remove=True) cnn_tree = tree.fit(X_resampled, y_resampled) cnn_ = confusion_matrix(y_test, cnn_tree.predict(X_test)) ############################################################################### ### One-side selection oss = OneSidedSelection(return_indices=True) X_resampled, y_resampled, idx_resampled = oss.fit_sample(X_train, y_train) plot_(X_resampled, y_resampled, remove=False) plot_(X_resampled, y_resampled, remove=True) oss_tree = tree.fit(X_resampled, y_resampled) oss_ = confusion_matrix(y_test, oss_tree.predict(X_test)) ############################################################################### ### Random oversampling ros = RandomOverSampler() X_resampled, y_resampled = ros.fit_sample(X_train, y_train) plot_(X_resampled, y_resampled, remove=False) ros_tree = tree.fit(X_resampled, y_resampled)
# Picking the indices of the normal classes normal_indices = data[data.Class == 0].index # Out of the indices we picked, randomly select "x" number (number_records_fraud) random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False) random_normal_indices = np.array(random_normal_indices) print("go") print(datetime.datetime.now().time()) oss = OneSidedSelection(return_indices=True) cnn = CondensedNearestNeighbour(return_indices=True) rus = RandomUnderSampler(return_indices=True) nm = NearMiss(version=3, return_indices=True) X_resampled, y_resampled, idx_resampled = oss.fit_sample(X, Y.values.ravel()) # X_resampled, y_resampled, idx_resampled = cnn.fit_sample(X, Y.values.ravel()) # X_resampled, y_resampled, idx_resampled = rus.fit_sample(X, Y.values.ravel()) # X_resampled, y_resampled, idx_resampled = nm.fit_sample(X, Y.values.ravel()) print("stop") print(datetime.datetime.now().time()) # print(X_resampled) # print(y_resampled) # print(idx_resampled) idx_resampled = np.array(idx_resampled) # print(len(idx_resampled)) # Appending the 2 indices under_sample_indices = np.concatenate([fraud_indices, idx_resampled])