from imblearn.under_sampling import OneSidedSelection


def undersampling(X, y, sampling_strategy='auto', n_neighbors=1):
    """Undersample the majority class with One-Sided Selection."""
    sampler = OneSidedSelection(n_jobs=36,  # hard-coded worker count
                                sampling_strategy=sampling_strategy,
                                n_neighbors=n_neighbors)
    X_us, y_us = sampler.fit_sample(X, y)

    return X_us.copy(), y_us.copy()
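A minimal usage sketch for the helper above, assuming the older imbalanced-learn API in which samplers exposed fit_sample (renamed fit_resample in 0.4); the dataset here is a synthetic stand-in:

# Hypothetical usage; make_classification builds a 1:9 imbalanced toy set.
from collections import Counter

from sklearn.datasets import make_classification

X, y = make_classification(weights=[0.1, 0.9], n_samples=1000,
                           random_state=0)
X_us, y_us = undersampling(X, y)
print(Counter(y), Counter(y_us))  # the majority class should shrink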
Example #2
def test_oss_with_object():
    knn = KNeighborsClassifier(n_neighbors=1)
    oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn)
    X_resampled, y_resampled = oss.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327],
                     [-0.65571327, 0.42412021], [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502], [-0.00717161, 0.00318087],
                     [-0.09322739, 1.28177189], [-0.77740357, 0.74097941],
                     [-0.43877303, 1.07366684], [-0.85795321, 0.82980738],
                     [-0.30126957, -0.66268378], [0.20246714, -0.34727125]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    knn = 1
    oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn)
    X_resampled, y_resampled = oss.fit_sample(X, Y)
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_oss_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    oss = OneSidedSelection(random_state=RND_SEED)
    X_resampled, y_resampled = oss.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'oss_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'oss_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #4
def test_oss_with_object():
    """Test the fit sample routine with an knn object"""

    # Resample the data
    knn = KNeighborsClassifier(n_neighbors=1)
    oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn)
    X_resampled, y_resampled = oss.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327],
                     [-0.65571327, 0.42412021], [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502], [-0.00717161, 0.00318087],
                     [-0.09322739, 1.28177189], [-0.77740357, 0.74097941],
                     [-0.43877303, 1.07366684], [-0.85795321, 0.82980738],
                     [-0.30126957, -0.66268378], [0.20246714, -0.34727125]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    # Resample the data
    knn = 1
    oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn)
    X_resampled, y_resampled = oss.fit_sample(X, Y)
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #6
    def use_OSSSMOTEENN(self):
        X, y = preparation(self.path)

        # Plot the original label distribution
        dy = pd.DataFrame(y)
        dy.value_counts().plot(kind='bar', title='Count(label)')
        plt.show()

        # Undersample the majority class with One-Sided Selection
        oss = OneSidedSelection(random_state=42, n_jobs=-1,
                                sampling_strategy="majority")
        X_res, y_res = oss.fit_sample(X, y)

        # Plot the label distribution after undersampling
        dy_res = pd.DataFrame(y_res)
        dy_res.value_counts().plot(kind='bar', title='Count(label)')
        plt.show()

        # Oversample and clean with SMOTE + ENN
        sme = SMOTEENN(random_state=42, n_jobs=-1)
        X_sme, y_sme = sme.fit_sample(X_res, y_res)

        # Draw bar chart of the final label distribution
        dy_sme = pd.DataFrame(y_sme)
        dy_sme.value_counts().plot(kind='bar', title='Count(label)')
        plt.show()

        # Generate CSV; fit_sample returns ndarrays, so wrap X_sme in a
        # DataFrame before concatenating
        df = pd.concat([pd.DataFrame(X_sme), pd.DataFrame(y_sme)], axis=1)

        df.to_csv(self.path.replace('.csv', '_OSSSMOTEENN_Final_Test.csv'),
                  index=None, header=None, float_format='%.4f')

        # NOTE: the first line of data will be dropped


        # Draw the PCA projection of the resampled data
        pca = PCA(n_components=2)
        X_sme = pca.fit_transform(X_sme)
        plot_2d_space(X_sme, y_sme, 'SMOTE + ENN')

        return self.path.replace('.csv','_OSSSMOTEENN_Final_Test.csv')




# if __name__ == '__main__':
#     path ="++Final_Test++_pre.csv"
#     #draw_bar(path)
#     mhi = My_handle_imbalance(path)
#     mhi.use_OSSSMOTEENN()
#
#     #use_SMOTETomek(path)
#     #draw_origin(path)
Example #7
def test_oss_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    oss = OneSidedSelection(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = oss.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'oss_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'oss_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'oss_idx.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #9
def test_oss_fit_sample_with_indices():
    oss = OneSidedSelection(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = oss.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327],
                     [-0.65571327, 0.42412021], [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502], [-0.00717161, 0.00318087],
                     [-0.09322739, 1.28177189], [-0.77740357, 0.74097941],
                     [-0.43877303, 1.07366684], [-0.85795321, 0.82980738],
                     [-0.30126957, -0.66268378], [0.20246714, -0.34727125]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    idx_gt = np.array([0, 3, 9, 12, 13, 14, 1, 2, 5, 6, 8, 11])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_oss_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    oss = OneSidedSelection(random_state=RND_SEED)
    X_resampled, y_resampled = oss.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327],
                     [-0.65571327, 0.42412021], [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502], [-0.00717161, 0.00318087],
                     [-0.09322739, 1.28177189], [-0.77740357, 0.74097941],
                     [-0.43877303, 1.07366684], [-0.85795321, 0.82980738],
                     [-0.30126957, -0.66268378], [0.20246714, -0.34727125]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_oss_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    oss = OneSidedSelection(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = oss.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327],
                     [-0.65571327, 0.42412021], [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502], [-0.00717161, 0.00318087],
                     [-0.09322739, 1.28177189], [-0.77740357, 0.74097941],
                     [-0.43877303, 1.07366684], [-0.85795321, 0.82980738],
                     [-0.30126957, -0.66268378], [0.20246714, -0.34727125]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    idx_gt = np.array([0, 3, 9, 12, 13, 14, 1, 2, 5, 6, 7, 10])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #12

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply One-Sided Selection
oss = OneSidedSelection()
X_resampled, y_resampled = oss.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax2.set_title('One-sided selection')

plt.show()
Example #13
# Compare how many unique rows ROS and SMOTE produce
import pandas as pd

X_sampled1 = pd.DataFrame(X_sampled1)
len(X_sampled1.drop_duplicates())  # few unique rows: ROS merely duplicates existing samples

X_sampled2 = pd.DataFrame(X_sampled2)
len(X_sampled2.drop_duplicates())  # relatively more unique rows, since SMOTE synthesizes new data

# Tomek Link
from imblearn.under_sampling import TomekLinks

tl = TomekLinks(return_indices=True)
X_resampled, y_resampled, inds = tl.fit_sample(X, y)

# One-sided selection
# removes redundant majority samples, but a k-NN step has to be applied first

from imblearn.under_sampling import OneSidedSelection

oss = OneSidedSelection(n_neighbors=1, n_seeds_S=1)
X_resampled, y_resampled = oss.fit_sample(X, y)

# Cost-sensitive learning

svc = SVC(kernel='linear', class_weight={1: 10})
svc.fit(X, y)

y_pred = svc.predict(X)
recall_score(y, y_pred)
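A hedged variant that lets scikit-learn derive the weights instead of hand-picking them; class_weight='balanced' is a documented SVC option:

# 'balanced' weighs classes inversely to their frequency in y,
# an alternative to a hand-tuned mapping like {1: 10}.
svc_bal = SVC(kernel='linear', class_weight='balanced')
svc_bal.fit(X, y)
recall_score(y, svc_bal.predict(X))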
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import NearMiss
from notify import notify

eeg_feat = readStoredData('eeg_pat22_feats.p')
X = eeg_feat['feats']
y = np.ravel(eeg_feat['labels'])

nsamples,nfeats = X.shape

classRatio = np.sum(y)/130.0e3

#%%
print('starting OSS')
OSS = OneSidedSelection(size_ngh=51, n_seeds_S=51, n_jobs=-1)
ossx, ossy = OSS.fit_sample(X, y)

#%%
# print('starting CC_cr')
# CC = ClusterCentroids(n_jobs=-1, ratio=classRatio)
# ccx_cr, ccy_cr = CC.fit_sample(X, y)

#%%
# print('starting CC_a')
# CC = ClusterCentroids(n_jobs=-1, ratio='auto')
# ccx_a, ccy_a = CC.fit_sample(X, y)

#%%
# print('starting NM3_cr')
# NM3 = NearMiss(version=3, n_jobs=-1, ratio=classRatio)
# nm3x_cr, nm3y_cr = NM3.fit_sample(X, y)
Example #15
print(data['target'].value_counts(normalize=True))

# Keep the target before dropping it from the feature matrix
y = data['target']
del data['target']
del data['connection_id']
from sklearn.model_selection import train_test_split
testdata = pd.read_csv('test_data.csv')

d = testdata['connection_id']
del testdata['connection_id']
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(data)

# Apply one-sided selection under-sampling
oss = OneSidedSelection(return_indices=True)
X_resampled, y_resampled, idx_resampled = oss.fit_sample(data, y)
# fit model on all training data

rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)

# find all relevant features - 5 features should be selected
feat_selector.fit(X_resampled, y_resampled)

print(feat_selector.support_)

# check ranking of features
print(feat_selector.ranking_)
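A plausible follow-up, assuming BorutaPy's documented transform method, which drops the rejected columns:

# Hypothetical next step: reduce X_resampled to the confirmed features.
X_filtered = feat_selector.transform(X_resampled)
print(X_filtered.shape)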
Example #16
# The second algorithm uses an undersampling technique called One-Sided
# Selection. Because a majority of our data fits into only 2 of our 7 loan
# status categories, we must be careful that our algorithm doesn't simply
# put all loans into those two categories and call it a day. For this run,
# we use the undersampled data to train the algorithm and the entire
# dataset to test accuracy.

# In[33]:

y = df_ml['loan_status']
X = df_ml.drop(['loan_status'], axis=1)

# In[34]:

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

oss = OneSidedSelection()
X_resampled, y_resampled = oss.fit_sample(X, y)

# In[35]:

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

print("Decision tree - Normal data set")
print(metrics.accuracy_score(y_test, dt.predict(X_test)))
print(metrics.confusion_matrix(y_test, dt.predict(X_test)))

dt = DecisionTreeClassifier()
dt.fit(X_resampled, y_resampled)
print("Decision tree - Imbalanced lean -One Sided Selection")
print(metrics.accuracy_score(y, dt.predict(X)))
print(metrics.confusion_matrix(y, dt.predict(X)))
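Given the caution above about classifiers collapsing everything into the majority categories, a hedged sketch of per-class metrics for the same predictions (reusing dt, X, and y from this run):

# Per-class recall exposes what plain accuracy can hide: a model that
# only ever predicts the two majority loan statuses.
from sklearn.metrics import balanced_accuracy_score, classification_report

y_pred = dt.predict(X)
print(balanced_accuracy_score(y, y_pred))  # mean of per-class recalls
print(classification_report(y, y_pred))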
Example #17
# Under Sampling
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(return_indices=True)
under_X, under_y, inds = rus.fit_sample(X, y)
logistic_(pd.DataFrame(under_X), pd.DataFrame(under_y))
svc_(pd.DataFrame(under_X), pd.DataFrame(under_y))

# SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(k_neighbors=10)
smote_X, smote_y = smote.fit_sample(X, y)
logistic_(pd.DataFrame(smote_X), pd.DataFrame(smote_y))
svc_(pd.DataFrame(smote_X), pd.DataFrame(smote_y))  # time-consuming

# Tomek Link
from imblearn.under_sampling import TomekLinks
tl = TomekLinks(return_indices=True)
tomek_X, tomek_y, inds = tl.fit_sample(X, y)
logistic_(pd.DataFrame(tomek_X), pd.DataFrame(tomek_y))
svc_(pd.DataFrame(tomek_X), pd.DataFrame(tomek_y))

# One-sided selection
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection(n_neighbors=1, n_seeds_S=1)
os_X, os_y = oss.fit_sample(X, y)
logistic_(pd.DataFrame(os_X), pd.DataFrame(os_y))
svc_(pd.DataFrame(os_X), pd.DataFrame(os_y))



Example #18
def test_oss_with_wrong_object():
    knn = 'rnd'
    oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn)
    with raises(ValueError, match="has to be a int"):
        oss.fit_sample(X, Y)
Example #19
###############################################################################
### Condensed Nearest Neighbor
cnn = CondensedNearestNeighbour(return_indices=True)
X_resampled, y_resampled, idx_resampled = cnn.fit_sample(X_train, y_train)

plot_(X_resampled, y_resampled, remove=False)
plot_(X_resampled, y_resampled, remove=True)

cnn_tree = tree.fit(X_resampled, y_resampled)
cnn_ = confusion_matrix(y_test, cnn_tree.predict(X_test))

###############################################################################
### One-sided selection
oss = OneSidedSelection(return_indices=True)
X_resampled, y_resampled, idx_resampled = oss.fit_sample(X_train, y_train)

plot_(X_resampled, y_resampled, remove=False)
plot_(X_resampled, y_resampled, remove=True)

oss_tree = tree.fit(X_resampled, y_resampled)
oss_ = confusion_matrix(y_test, oss_tree.predict(X_test))

###############################################################################
### Random oversampling
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_sample(X_train, y_train)

plot_(X_resampled, y_resampled, remove=False)

ros_tree = tree.fit(X_resampled, y_resampled)
Example #20
# Picking the indices of the normal classes
normal_indices = data[data.Class == 0].index

# From those indices, randomly select number_records_fraud of them
random_normal_indices = np.random.choice(normal_indices,
                                         number_records_fraud,
                                         replace=False)
random_normal_indices = np.array(random_normal_indices)

print("go")
print(datetime.datetime.now().time())
oss = OneSidedSelection(return_indices=True)
cnn = CondensedNearestNeighbour(return_indices=True)
rus = RandomUnderSampler(return_indices=True)
nm = NearMiss(version=3, return_indices=True)
X_resampled, y_resampled, idx_resampled = oss.fit_sample(X, Y.values.ravel())
# X_resampled, y_resampled, idx_resampled = cnn.fit_sample(X, Y.values.ravel())
# X_resampled, y_resampled, idx_resampled = rus.fit_sample(X, Y.values.ravel())
# X_resampled, y_resampled, idx_resampled = nm.fit_sample(X, Y.values.ravel())
print("stop")
print(datetime.datetime.now().time())

# print(X_resampled)
# print(y_resampled)
# print(idx_resampled)
idx_resampled = np.array(idx_resampled)
# print(len(idx_resampled))

# Appending the 2 indices
under_sample_indices = np.concatenate([fraud_indices, idx_resampled])
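A hypothetical continuation, assuming data keeps its default RangeIndex so the sampler's positional indices line up with .loc labels:

# Materialize the under-sampled dataset from the combined indices.
under_sample_data = data.loc[under_sample_indices]
X_undersample = under_sample_data.drop('Class', axis=1)
y_undersample = under_sample_data['Class']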