예제 #1
0
def get_binary_Tomek_Links_cleaned_data(id_df, X_df, y_df):
    """Drop Tomek-link samples and return the row-aligned, cleaned frames.

    The sampler is fitted on ``X_df`` against the first column of ``y_df``;
    the positions it reports in ``sample_indices_`` are then used to slice
    all three inputs identically.

    Args:
        id_df: Frame of identifiers, sliced by position like ``X_df``.
        X_df: Feature frame.
        y_df: Label frame; only its first column is used as the target.

    Returns:
        Tuple ``(id_df_cleaned, X_df_cleaned, y_df_cleaned)``.
    """
    target = y_df.iloc[:, 0]
    tomek = TomekLinks()
    # ``fit_sample`` was removed from imbalanced-learn; ``fit_resample`` is
    # the supported spelling in releases that expose ``sample_indices_``.
    tomek.fit_resample(X_df, target)
    kept = tomek.sample_indices_
    return id_df.iloc[kept], X_df.iloc[kept], y_df.iloc[kept]
예제 #2
0
def getData(splitData=True, useImbalancer=False, useStratify=False):
    """Load ``DataSource/binary.csv`` and return feature/label arrays.

    Features are every column between the first and the last one, followed
    by dummy columns for ``rank`` (first level dropped). The first column is
    the label.

    Args:
        splitData: When true, hold out 20% of the rows as a test set.
        useImbalancer: When true (and ``splitData`` is true), remove Tomek
            links from the majority class of the training split.
        useStratify: When true, stratify the split on the label.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` when ``splitData`` is true,
        otherwise ``(X, y)``. Label arrays are flattened to 1-D.
    """
    data = pd.read_csv(filepath_or_buffer="DataSource/binary.csv")
    rank_dummy = pd.get_dummies(data['rank'], drop_first=True).to_numpy()
    X = np.concatenate((data.values[:, 1:-1], rank_dummy), axis=1)
    y = data.values[:, 0].reshape(-1, 1)

    if not splitData:
        return X, y.ravel()

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=101,
        shuffle=True,
        stratify=y if useStratify else None)

    if useImbalancer:
        tl = TomekLinks(sampling_strategy='majority')
        # ``sampling_strategy`` implies imbalanced-learn >= 0.4, where
        # ``fit_resample`` exists; ``fit_sample`` has since been removed.
        X_train, y_train = tl.fit_resample(X=X_train, y=y_train)

    return X_train, X_test, y_train.ravel(), y_test.ravel()
예제 #3
0
def undersample(X, y, bal_strategy):
    """Under-sample ``(X, y)`` according to ``bal_strategy``.

    Args:
        X: Feature array; must expose ``.shape``.
        y: Label array; must expose ``.shape``.
        bal_strategy: ``"RANDOM"`` or ``"ALL"`` for random under-sampling,
            ``"TOMEK"`` for Tomek-link cleaning, ``"NONE"`` to return the
            inputs untouched.

    Returns:
        Tuple ``(X_sampled, y_sampled)``.

    Exits the process with status 1 for any other ``bal_strategy``.
    """
    # Single-argument print calls with pre-formatted text produce the same
    # output on Python 2 and 3; the former print statements only parsed on 2.
    print('Shape of X:  %s' % (X.shape,))
    print('Shape of y_Train:  %s' % (y.shape,))

    if bal_strategy in ("RANDOM", "ALL"):
        # NOTE(review): "ALL" has only ever reached this branch; the Tomek
        # branch below tested for it too but could never match. Behaviour is
        # kept as-is -- confirm whether "ALL" should chain both samplers.
        X_sampled, y_sampled = RandomUnderSampler().fit_sample(X, y)
    elif bal_strategy == "TOMEK":
        X_sampled, y_sampled = TomekLinks().fit_sample(X, y)
    elif bal_strategy == 'NONE':
        X_sampled, y_sampled = X, y
    else:
        print('bal_stragegy not in ALL, RANDOM, TOMEK, NONE')
        sys.exit(1)

    print('Shape of X_sampled:  %s' % (X_sampled.shape,))
    print('Shape of y_sampled:  %s' % (y_sampled.shape,))

    return (X_sampled, y_sampled)
예제 #4
0
def undersample_tomek_link(X,
                           y,
                           label='Tomek links under-sampling',
                           plot=False):
    """Remove Tomek links (``ratio='all'``) and optionally plot the result.

    Args:
        X: Feature DataFrame; its ``columns`` are reused for the output.
        y: Label Series; its ``name`` is reused for the output.
        label: Title of the optional scatter plot.
        plot: When truthy, project the resampled data onto two principal
            components and show a per-class scatter plot.

    Returns:
        ``(X_tl, y_tl, tl, id_tl)``: resampled features, resampled labels,
        the fitted sampler and the index array it returned.
    """
    tl = TomekLinks(return_indices=True, ratio='all')
    X_tl, y_tl, id_tl = tl.fit_sample(X, y)
    X_tl = pd.DataFrame(X_tl, columns=X.columns)
    y_tl = pd.Series(y_tl, name=y.name)
    if plot:
        # NOTE(review): id_tl is presumably the indices of the samples that
        # were kept, not removed -- confirm before logging it as "removed".
        # Plot the first two principal components, one colour/marker per class.
        pca = PCA(n_components=2)
        X_pca = pd.DataFrame(pca.fit_transform(X_tl))
        colors = ['#1F77B4', '#FF7F0E']
        markers = ['o', 's']
        for l, c, m in zip(np.unique(y_tl), colors, markers):
            plt.scatter(
                X_pca.loc[y_tl == l, 0],  # pc 1
                X_pca.loc[y_tl == l, 1],  # pc 2
                c=c,
                label=l,
                marker=m)
        plt.title(label)
        plt.legend(loc='upper right')
        plt.show()
    return X_tl, y_tl, tl, id_tl
예제 #5
0
def test_tl_fit_sample():
    """Tomek-link resampling must reproduce the hard-coded ground truth."""
    sampler = TomekLinks(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.31230513, 0.1216318], [0.68481731, 0.51935141],
        [1.34192108, -0.13367336], [0.62366841, -0.21312976],
        [1.61091956, -0.40283504], [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342], [0.2184254, 0.24299982],
        [0.61472253, -0.82309052], [0.19893132, -0.47761769],
        [0.97407872, 0.44454207], [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695], [-0.23374509, 0.18370049],
        [-0.32635887, -0.29299653], [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
    ])
    expected_y = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
예제 #6
0
def test_tl_fit_sample():
    """Compare the sampler output with the expected features and labels."""
    resampled = TomekLinks(random_state=RND_SEED)
    X_resampled, y_resampled = resampled.fit_sample(X, Y)

    wanted_rows = [
        [0.31230513, 0.1216318], [0.68481731, 0.51935141],
        [1.34192108, -0.13367336], [0.62366841, -0.21312976],
        [1.61091956, -0.40283504], [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342], [0.2184254, 0.24299982],
        [0.61472253, -0.82309052], [0.19893132, -0.47761769],
        [0.97407872, 0.44454207], [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695], [-0.23374509, 0.18370049],
        [-0.32635887, -0.29299653], [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
    ]
    wanted_labels = [1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0]
    X_gt = np.array(wanted_rows)
    y_gt = np.array(wanted_labels)

    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    def _tomek_data(self):
        """Replace the training split with its Tomek-link-cleaned version.

        Does nothing except print a notice when ``self.cols_nominal`` is
        non-empty.
        """
        has_nominal = self.cols_nominal.size > 0
        if not has_nominal:
            sampler = TomekLinks()
            self.X_train, self.y_train = sampler.fit_sample(self.X_train,
                                                            self.y_train)
            return
        print("Skipping Tomek Links. Cannot perform with raw categorical data. Create dummies to use.")
예제 #8
0
def get_tomeklinks_under_sampled_dataset():
    """Under-sample the module-level ``X_train``/``y_train`` with Tomek links.

    Returns:
        ``(X_tl, y_tl)`` where ``y_tl`` is ``X_tl[target]``; the labels
        returned by the sampler itself are discarded.
    """
    sampler = TomekLinks(return_indices=True, ratio='majority')
    X_tl, _, id_tl = sampler.fit_sample(X_train, y_train)

    print('Removed indexes:', id_tl)
    # The return value of shuffle() is ignored, so this only has an effect
    # if the shuffle in scope works in place -- TODO confirm which one it is.
    shuffle(X_tl)

    return X_tl, X_tl[target]
def test_tl_fit_sample():
    """Resampled arrays must equal the fixtures stored under ``data/``."""
    sampler = TomekLinks(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected = [np.load(os.path.join(data_dir, name))
                for name in ('tl_x.npy', 'tl_y.npy')]

    for actual, wanted in zip((X_resampled, y_resampled), expected):
        assert_array_equal(actual, wanted)
예제 #10
0
def imbalanced_resampling(method, x, y):
    """Resample ``(x, y)`` with the sampler selected by ``method``.

    Args:
        method: ``"under"`` (Tomek links), ``"over"`` (SMOTE) or
            ``"combined"`` (SMOTETomek). Any other value returns the inputs
            unchanged.
        x: Features.
        y: Labels.

    Returns:
        Tuple ``(X, Y)`` of resampled (or original) features and labels.
    """
    if method == "under":
        sampling = TomekLinks(sampling_strategy="auto")
    elif method == "over":
        # Use the same keyword as the TomekLinks branch: ``ratio`` is the
        # pre-0.4 spelling and no longer accepted by current releases.
        sampling = SMOTE(sampling_strategy='auto')
    elif method == "combined":
        sampling = SMOTETomek()
    else:
        return x, y

    # ``fit_resample`` replaces the removed ``fit_sample`` alias.
    X, Y = sampling.fit_resample(x, y)
    return X, Y
예제 #11
0
def test_tl_fit_sample():
    """Check the resampled data against the ``.npy`` ground-truth files."""
    fixtures = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

    cleaner = TomekLinks(random_state=RND_SEED)
    X_resampled, y_resampled = cleaner.fit_sample(X, Y)

    expected_X = np.load(os.path.join(fixtures, 'tl_x.npy'))
    expected_y = np.load(os.path.join(fixtures, 'tl_y.npy'))
    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
예제 #12
0
    def oversample(self, train, labels):
        """
            Over-sample with SMOTE, then clean the result with Tomek links.

            Returns the ``(train_res, labels_res)`` pair produced by the
            second (Tomek links) pass.
        """
        smote = SMOTE(random_state=2)
        tomek = TomekLinks(random_state=42)

        # Step 1: synthesise samples with SMOTE.
        balanced_x, balanced_y = smote.fit_sample(train, labels)

        # Step 2: drop the noisy points that over-sampling introduced.
        train_res, labels_res = tomek.fit_sample(balanced_x, balanced_y)
        return train_res, labels_res
예제 #13
0
def trainModelWithResults(model, X, y,rd_state=None,autoscale=1,usetomeklinks=1):
    """Fit ``model`` on a stratified 75/25 split and print test-set metrics.

    Args:
        model: Estimator whose ``fit`` returns the fitted estimator.
        X: Features.
        y: Labels; also used to stratify the split.
        rd_state: ``random_state`` forwarded to ``train_test_split``.
        autoscale: ``1`` to standardise features with a scaler fitted on the
            training split only.
        usetomeklinks: ``1`` to remove Tomek links from the training split.

    Prints the confusion matrix and the classification report; returns None.
    """
    # Stratify the split because the target is unbalanced.
    split = train_test_split(X, y, test_size=0.25, stratify=y,
                             random_state=rd_state)
    X_train, X_test, y_train, y_test = split

    if autoscale == 1:
        scaler = StandardScaler().fit(X_train)
        X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

    if usetomeklinks == 1:
        cleaner = TomekLinks(return_indices=False)
        X_train, y_train = cleaner.fit_sample(X_train, y_train)

    predictions = model.fit(X_train, y_train).predict(X_test)
    for metric in (confusion_matrix, classification_report):
        print(metric(y_test, predictions))
def test_tl_fit_sample_with_indices():
    """Resampled arrays and the index array must match the stored fixtures."""
    sampler = TomekLinks(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected = [np.load(os.path.join(data_dir, name))
                for name in ('tl_x.npy', 'tl_y.npy', 'tl_idx.npy')]

    for actual, wanted in zip((X_resampled, y_resampled, idx_under), expected):
        assert_array_equal(actual, wanted)
예제 #15
0
def test_tl_fit_sample_with_indices():
    """Check features, labels and indices against the ``.npy`` fixtures."""
    fixtures = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

    cleaner = TomekLinks(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = cleaner.fit_sample(X, Y)

    expected_X = np.load(os.path.join(fixtures, 'tl_x.npy'))
    expected_y = np.load(os.path.join(fixtures, 'tl_y.npy'))
    expected_idx = np.load(os.path.join(fixtures, 'tl_idx.npy'))
    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
예제 #16
0
def dataset_sampling(X,y):
    """Under-sample ``(X, y)`` by removing majority-class Tomek-link members.

    Only the Tomek-links sampler is applied; the other sampler instances that
    used to be constructed here were never used and have been dropped.

    Returns:
        ``(X_res, y_res)``; the index array returned by the sampler is
        discarded.
    """
    tl = TomekLinks(return_indices=True, ratio='majority')
    X_res, y_res, _ = tl.fit_sample(X, y)
    return X_res, y_res
def undersampling(df):
    """Return the rows of ``df`` selected by Tomek-links (``ratio='all'``).

    ``add2list`` fills the feature and label lists from ``df`` (presumably
    one entry per row, in row order -- verify against its definition). The
    indices returned by the sampler are turned into a boolean row mask, so
    the surviving rows keep their original order; the index is then reset.
    """
    sampler = TomekLinks(ratio='all', n_jobs=16, return_indices=True)
    features, labels = [], []
    add2list(df, features, labels)

    _, _, kept = sampler.fit_sample(features, labels)

    keep_mask = [False] * len(df)
    for position in kept:
        keep_mask[position] = True

    return df[keep_mask].reset_index(drop=True)
예제 #18
0
def resample(X, y, method):
    """Resample ``(X, y)`` with the technique named by ``method``.

    Args:
        X: Features.
        y: Labels.
        method: ``'smote'``, ``'tomek'``, ``'smote-tomek'`` or ``'none'``
            (return the inputs unchanged).

    Returns:
        Tuple ``(X, y)``.

    Raises:
        ValueError: If ``method`` is not one of the values above.
    """
    if method == 'none':
        return X, y

    # One consistent API generation: ``sampling_strategy`` (already used for
    # Tomek links) instead of the removed ``ratio`` keyword, and
    # ``fit_resample`` instead of the removed ``fit_sample`` alias.
    if method == 'smote':
        sampler = SMOTE(random_state=2777, sampling_strategy=1.0)
    elif method == 'tomek':
        # TomekLinks no longer accepts ``random_state``; it was already
        # deprecated when ``sampling_strategy`` was introduced.
        sampler = TomekLinks(sampling_strategy='majority')
    elif method == 'smote-tomek':
        sampler = SMOTETomek(random_state=2777, sampling_strategy='auto')
    else:
        raise ValueError('Resampling method not recognized.')

    X, y = sampler.fit_resample(X, y)
    return X, y
예제 #19
0
def test_tl_fit_sample_with_indices():
    """Features, labels and indices must match the hard-coded ground truth."""
    expected_X = np.array([
        [0.31230513, 0.1216318],
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.2184254, 0.24299982],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [0.97407872, 0.44454207],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.32635887, -0.29299653],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
    ])
    expected_y = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0])
    expected_idx = np.array(
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16, 17, 18, 19])

    sampler = TomekLinks(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
예제 #20
0
def test_tl_fit_sample_with_indices():
    """Default-configured sampler must reproduce the expected arrays."""
    cleaner = TomekLinks(return_indices=True)
    X_resampled, y_resampled, idx_under = cleaner.fit_sample(X, Y)

    wanted_rows = [
        [0.31230513, 0.1216318],
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.2184254, 0.24299982],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [0.97407872, 0.44454207],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.32635887, -0.29299653],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
    ]
    wanted_labels = [1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0]
    wanted_idx = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16, 17, 18, 19]

    X_gt = np.array(wanted_rows)
    y_gt = np.array(wanted_labels)
    idx_gt = np.array(wanted_idx)

    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def undersampling_pair(df1, df2):
    """Jointly under-sample two frames with Tomek links (``ratio='auto'``).

    Both frames are fed through ``add2list`` into one feature/label list
    (``df1`` first), resampled together, and the returned indices are mapped
    back to each frame: indices below ``len(df1)`` select rows of ``df1``,
    the rest select rows of ``df2`` after subtracting ``len(df1)``.

    Returns:
        ``(newdf1, newdf2)`` with reset indexes.
    """
    sampler = TomekLinks(ratio='auto', n_jobs=16, return_indices=True)
    features, labels = [], []
    for frame in (df1, df2):
        add2list(frame, features, labels)
    _, _, kept = sampler.fit_sample(features, labels)

    first_len = len(df1)
    mask1 = [False] * first_len
    mask2 = [False] * len(df2)
    for position in kept:
        if position >= first_len:
            mask2[position - first_len] = True
        else:
            mask1[position] = True

    return (df1[mask1].reset_index(drop=True),
            df2[mask2].reset_index(drop=True))
예제 #22
0
    def under_sampling(self, x, y, method='random'):
        """Under-sample ``(x, y)`` with the sampler selected by ``method``.

        Args:
            x: Features.
            y: Labels; a DataFrame is flattened to shape ``(num_sample,)``.
            method: Any value whose lower-cased string contains ``'tomek'``
                selects Tomek links; ``'cluster'`` is reserved and not yet
                implemented; everything else uses random under-sampling.

        Returns:
            ``(x_res, y_res, id_res)`` when ``self.return_indices`` is
            truthy, otherwise ``(x_res, y_res)``.

        Raises:
            NotImplementedError: For the ``'cluster'`` method, which used to
                fail later with an unbound-variable error.
        """
        # DataFrame has no ``ravel`` method; flatten the underlying ndarray.
        if isinstance(y, pd.DataFrame):
            y_ = y.values.ravel()
        else:
            y_ = y

        method_name = str(method).lower()
        if 'tomek' in method_name:
            sampler = TomekLinks(return_indices=True, ratio='majority')
        elif 'cluster' in method_name:
            raise NotImplementedError(
                "cluster-based under-sampling is not implemented")
        else:
            # Random under-sampling is the default option.
            sampler = RandomUnderSampler(return_indices=True)
        x_res, y_res, id_res = sampler.fit_sample(x, y_)

        if self.return_indices:
            return x_res, y_res, id_res
        return x_res, y_res
예제 #23
0
def sampling(X_train, y_train):
    """Resample the training data with five samplers and print class counts.

    Random over-sampling, random under-sampling, Tomek links, SMOTE and
    Edited Nearest Neighbours are each applied to the original
    ``(X_train, y_train)``. Class counts are printed for the original data
    and for every variant except the ENN one.

    Returns:
        A 10-tuple of ``(X, y)`` pairs in the order: over-sampled,
        under-sampled, Tomek links, SMOTE, ENN.
    """
    ran_over = RandomOverSampler(random_state=42)
    X_train_oversample, y_train_oversample = ran_over.fit_resample(X_train, y_train)
    ran_under = RandomUnderSampler(random_state=42)
    X_train_undersample, y_train_undersample = ran_under.fit_resample(X_train, y_train)
    # Every sampler now goes through ``fit_resample``; two of them still
    # used the removed ``fit_sample`` alias.
    tl = TomekLinks(n_jobs=6)
    X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)
    sm = SMOTE(random_state=42, n_jobs=5)
    X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
    enn = EditedNearestNeighbours()
    X_train_enn, y_train_enn = enn.fit_resample(X_train, y_train)

    print(np.unique(y_train, return_counts=True))
    print("after sampling")
    print("randomg over sampling")
    print(np.unique(y_train_oversample, return_counts=True))
    print("SMOTE sampling")
    print(np.unique(y_train_sm, return_counts=True))
    print("random under sampling")
    print(np.unique(y_train_undersample, return_counts=True))
    print("TomekLinks under sampling")
    print(np.unique(y_train_tl, return_counts=True))
    return (X_train_oversample, y_train_oversample, X_train_undersample, y_train_undersample,
     X_train_tl, y_train_tl, X_train_sm, y_train_sm, X_train_enn, y_train_enn)
예제 #24
0
#
# - Removing majority-class data points that lie very close to minority-class points widens the gap between the two classes, which makes classification easier.
#
#
# - Below Figure best explains the Tomek Link Technique.

# ![alt text](https://raw.githubusercontent.com/rafjaa/machine_learning_fecib/master/src/static/img/tomek.png?v=2)

# In[23]:

# Apply Tomek links to the training data.

from imblearn.under_sampling import TomekLinks

# ratio='not minority' is passed to the sampler; random_state is passed too.
tl = TomekLinks(random_state=50, ratio='not minority')
x_tl_res, y_tl_res = tl.fit_sample(xtrain, ytrain)

# Class counts before resampling, then (as a notebook cell value) after.
print(ytrain.value_counts(), '\n')
np.bincount(y_tl_res)

# In[24]:

# Fit a logistic regression on the resampled training data.

est = LogisticRegression(solver='lbfgs')
est.fit(x_tl_res, y_tl_res)

# Predict on the untouched test data.
pred = est.predict(xtest)
print('Prediction : ', pred, '\n')

# est.score() on the test data, scaled to a percentage.
print('Validation Score : ', est.score(xtest, ytest) * 100)
                               ,predictions)

# Report the results of the preceding experiment (recall, report, t0 and
# LR_model are defined above this excerpt).
print(f'Recall Logistic Regression {recall: .2f}')
print(report)
print(balanced_accuracy_score(Y_test, predictions))

# Elapsed time since t0, formatted as minutes and seconds.
t1 = pl.time.time() - t0
print("Time taken: {:.0f} min {:.0f} secs".format(*divmod(t1, 60)))
print("best parameters",LR_model.best_params_)
plot_confusion_matrix(confusion_matrix(Y_test,predictions),['Dolphin','Non-Dolphin'])

# Fourth test: logistic regression with Tomek-links under-sampling.

from imblearn.under_sampling import TomekLinks
# Under-sample the training split only; X_test is used untouched below.
undersample = TomekLinks()
Xtrain_tomek, Ytrain_tomek = undersample.fit_sample(X_train, Y_train)
t0 = pl.time.time()
LR = LogisticRegression(max_iter=4000,
                            random_state=49,
                            n_jobs=1, class_weight='balanced') # for liblinear n_jobs is +1.

# Grid of penalty / C / solver combinations to search.
parameters = {"penalty": ['l1', 'l2'],'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], "solver":['liblinear','sag','saga']}

# 3-fold grid search scored on precision.
LR_model = GridSearchCV(LR, parameters, scoring="precision", cv=3)

# Fit the classifier on the under-sampled training data.
LR_model.fit(Xtrain_tomek,Ytrain_tomek.values.ravel())

# Predict on the test set.
predictions = LR_model.predict(X_test)
예제 #26
0
# samples. If ``ratio='auto'`` only the sample from the majority class will be
# removed. If ``ratio='all'`` both samples will be removed.

# NOTE(review): this default-configured sampler is never used; the loop below
# rebinds ``sampler`` on its first iteration.
sampler = TomekLinks()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# One subplot per sampler: ratio='auto' on the left, ratio='all' on the right.
ax_arr = (ax1, ax2)
title_arr = ('Removing only majority samples',
             'Removing all samples')
for ax, title, sampler in zip(ax_arr,
                              title_arr,
                              [TomekLinks(ratio='auto', random_state=0),
                               TomekLinks(ratio='all', random_state=0)]):
    # Stack minority rows (label 0) above majority rows (label 1) and resample.
    X_res, y_res = sampler.fit_sample(np.vstack((X_minority, X_majority)),
                                      np.array([0] * X_minority.shape[0] +
                                               [1] * X_majority.shape[0]))
    ax.scatter(X_res[y_res == 0][:, 0], X_res[y_res == 0][:, 1],
               label='Minority class', s=200, marker='_')
    ax.scatter(X_res[y_res == 1][:, 0], X_res[y_res == 1][:, 1],
               label='Majority class', s=200, marker='+')

    # highlight the samples of interest: the last minority point and the
    # second majority point, taken from the original (unresampled) arrays
    ax.scatter([X_minority[-1, 0], X_majority[1, 0]],
               [X_minority[-1, 1], X_majority[1, 1]],
               label='Tomek link', s=200, alpha=0.3)

    ax.set_title(title)
    make_plot_despine(ax)
fig.tight_layout()
예제 #27
0
# SMOTE over-sampling (``smote`` is created above this excerpt).
X_sampled2, y_sampled2 = smote.fit_sample(X, y)

# Compare how many unique rows the ROS and SMOTE outputs contain.
import pandas as pd

X_sampled1 = pd.DataFrame(X_sampled1)
len(X_sampled1.drop_duplicates())  # few unique rows

X_sampled2 = pd.DataFrame(X_sampled2)
len(X_sampled2.drop_duplicates())  # relatively more unique rows, because new data was synthesised

# Tomek Link
from imblearn.under_sampling import TomekLinks

# return_indices=True makes fit_sample return a third value (bound to inds).
tl = TomekLinks(return_indices=True)
X_resampled, y_resampled, inds = tl.fit_sample(X, y)

# One-sided selection
# Removes every data point, but k-NN has to be applied first.

from imblearn.under_sampling import OneSidedSelection

# NOTE: this overwrites the Tomek-links X_resampled / y_resampled from above.
oss = OneSidedSelection(n_neighbors=1, n_seeds_S=1)
X_resampled, y_resampled = oss.fit_sample(X, y)

# Cost-sensitive learning: class 1 is given a weight of 10.

svc = SVC(kernel='linear', class_weight={1: 10})
svc.fit(X, y)

# Predictions are made on the same data the model was fitted on.
y_pred = svc.predict(X)
예제 #28
0
from imblearn.under_sampling import TomekLinks
# Over-sample with SMOTENC, treating columns 4-12 as categorical.
smote_nc = SMOTENC(categorical_features=[4, 5, 6, 7, 8, 9, 10, 11, 12],
                   random_state=0)
X_res, y_res = smote_nc.fit_resample(X_train, y_train)

# Keep only rows where columns 6-8 sum to exactly 1
# (presumably a one-hot geography group -- TODO confirm column meaning).
geo_bool_mask = (X_res[:, 6] + X_res[:, 7] + X_res[:, 8]) == 1
X_res = X_res[geo_bool_mask]
y_res = y_res[geo_bool_mask]

# Same filter for columns 9-10
# (presumably a one-hot gender group -- TODO confirm column meaning).
gender_bool_mask = (X_res[:, 9] + X_res[:, 10]) == 1
X_resampled1 = X_res[gender_bool_mask]
y_resampled1 = y_res[gender_bool_mask]
print(sorted(Counter(y_resampled1).items()))

# Clean the filtered, over-sampled data with Tomek links.
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_sample(X_resampled1, y_resampled1)

# Hyper-parameter grid for a polynomial-kernel SVC.
svm_svc_poly_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': [1e-6, 1e-5, 1e-4, 1e-3],
    'kernel': ['poly'],
    'degree': [2, 3, 4],
    'class_weight': ['balanced', None],
    'probability': [True, False],
    'tol': [1e-5, 1e-4, 1e-3, 1e-2]
}
svm_svc_poly = SVC()
SVM_poly_grid_search = GridSearchCV(svm_svc_poly,
                                    svm_svc_poly_param_grid,
                                    cv=5,
                                    refit=True,
    plot_confusion_matrix(y_test, predict_mine)


interact(update, var=FloatSlider(min=0.001, max=0.04, step=0.001))

# Logistic regression with balanced class weights to address the class
# imbalance; trained on the original training data.
lr_b = LogisticRegression(max_iter=1000, class_weight='balanced')
lr_b.fit(X_train, y_train)
y_pred_b = lr_b.predict(X_test)
plot_confusion_matrix(y_test, y_pred_b)

# Tomek links: under-sample the majority class of the training data, then
# fit the same (class-weighted) model on the result.
from imblearn.under_sampling import TomekLinks

tl = TomekLinks(sampling_strategy='majority')
X_train_tl, y_train_tl = tl.fit_sample(X_train, y_train)
lr_tl = LogisticRegression(max_iter=1000, class_weight='balanced')
lr_tl.fit(X_train_tl, y_train_tl)
y_pred_tl = lr_tl.predict(X_test)
plot_confusion_matrix(y_test, y_pred_tl)

# SMOTE (Synthetic Minority Over-sampling Technique): over-sample the
# minority class, then fit a model without class weights.
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority')
X_train_sm, y_train_sm = smote.fit_sample(X_train, y_train)
lr_sm = LogisticRegression(max_iter=1000)
lr_sm.fit(X_train_sm, y_train_sm)
y_pred_sm = lr_sm.predict(X_test)
plot_confusion_matrix(y_test, y_pred_sm)
예제 #30
0
#
# manifold.manifold(imgs_origin, labels_origin, manifold_args, scores_predict,
#                  index_to_class=index_to_class, showLabels=False, showImages=False,
#                  imageZoom=0.15, imageDist=8e-3)


# %%

from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from collections import Counter

# Class counts before resampling.
print(sorted(Counter(labels_origin).items()))

# NOTE(review): despite its name, ``smote_enn`` holds a TomekLinks sampler.
smote_enn = TomekLinks(random_state=0)
scores_predict2, labels_origin2 = smote_enn.fit_sample(
    scores_predict, labels_origin)
# Class counts after resampling.
print(sorted(Counter(labels_origin2).items()))

# %%

import manifold

# Options collected for the project-local ``manifold`` module
# (their meaning is defined there, not in this excerpt).
manifold_args = dict(
    RandomTrees=True,
    PCA=True,
    LinearDiscriminant=True,
    Spectral=True,
    TSNE=True,
    n_neighbors=30)

# Inverse of class_to_index, built by the project's ``io.reverse_dict``.
index_to_class = io.reverse_dict(class_to_index)
예제 #31
0
def test_deprecation_random_state():
    """Passing ``random_state`` must raise the 0.4 deprecation warning."""
    sampler = TomekLinks(random_state=0)
    expected_message = "'random_state' is deprecated from 0.4"
    with warns(DeprecationWarning, match=expected_message):
        sampler.fit_sample(X, Y)
예제 #32
0
# NearMiss (version 1) under-sampling; print the resulting class counts.
nm1 = NearMiss(random_state=0, version=1)
X_resampled_nm1, y_resampled = nm1.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

# English summary of the note below -- Cleaning under-sampling: Tomek's links.
# Two samples x and y of different classes form a Tomek link when no other
# sample z satisfies d(x, z) < d(x, y) or d(y, z) < d(x, y), i.e. they are
# each other's nearest neighbours. One of them is then likely noise, or both
# lie near the class boundary. The ``ratio`` parameter controls which samples
# of a link are removed: 'auto' (default) removes the majority-class sample,
# 'all' removes both.
'''
Cleaning under-sampling techniques
omek’s links
TomekLinks:样本x与样本y来自于不同的类别,满足以下条件,它们之间被称之为TomekLinks;
不存在另外一个样本z,使得d(x,z)<d(x,y)或者 d(y,z)<d(x,y)成立.其中d(.)表示两个样本之间的距离,也就是说两个样本之间互为近邻关系.
这个时候,样本x或样本y很有可能是噪声数据,或者两个样本在边界的位置附近.
TomekLinks函数中的auto参数控制Tomek’s links中的哪些样本被剔除.
默认的ratio='auto'移除多数类的样本,当ratio='all'时,两个样本均被移除.
'''
from imblearn.under_sampling import TomekLinks
tl =TomekLinks(random_state=0,ratio='all')
X_resampled, y_resampled = tl.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
# The string below records class counts, presumably the output of the print
# above from an earlier run.
'''
[(0, 55), (1, 249), (2, 4654)]
'''
# English summary of the note below -- Edited Nearest Neighbours edits the
# data set with a nearest-neighbour rule: samples that disagree with their
# neighbours are removed. A sample is kept when most (kind_sel='mode') or all
# (kind_sel='all') of its neighbours belong to the same class.
'''
Edited data set using nearest neighbours
EditedNearestNeighbours这种方法应用最近邻算法来编辑(edit)数据集,
找出那些与邻居不太友好的样本然后移除.对于每一个要进行下采样的样本,那些不满足一些准则的样本将会被移除;
他们的绝大多数(kind_sel='mode')或者全部(kind_sel='all')的近邻样本都属于同一个类,这些样本会被保留在数据集中.
'''
# Class counts before and after ENN.
print(sorted(Counter(y).items()))
from imblearn.under_sampling import EditedNearestNeighbours
enn = EditedNearestNeighbours(random_state=0)
X_resampled, y_resampled = enn.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
예제 #33
0
# Random over-sampling (``ros`` is created above this excerpt).
X_ros, y_ros = ros.fit_sample(X, y)



# Number of rows added by over-sampling.
print(X_ros.shape[0] - X.shape[0], 'new random picked points')



plot_2d_space(X_ros, y_ros, 'Random over-sampling')
from imblearn.under_sampling import TomekLinks



# Tomek links, restricted to the majority class.
tl = TomekLinks(return_indices=True, ratio='majority')

X_tl, y_tl, id_tl = tl.fit_sample(X, y)



# NOTE(review): id_tl is presumably the indices of the samples that were
# kept, so the "Removed indexes" label may be misleading -- confirm.
print('Removed indexes:', id_tl)



plot_2d_space(X_tl, y_tl, 'Tomek links under-sampling')
from imblearn.under_sampling import ClusterCentroids



# Cluster-centroid under-sampling with a target of 10 samples for class 0.
cc = ClusterCentroids(ratio={0: 10})

X_cc, y_cc = cc.fit_sample(X, y)
예제 #34
0
# In[36]:

# 80/20 train/test split with a fixed seed.
X_imb_train, X_imb_test, y_imb_train, y_imb_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0)

# NOTE(review): the training labels are compared against themselves here,
# so this report cannot show any error -- confirm that this is intended.
print(get_report(y_imb_train, y_imb_train))

# ### Under-Sampling with TomekLinks

# In[39]:

from imblearn.under_sampling import TomekLinks

# Remove Tomek links from the majority class of the training split.
tl = TomekLinks(return_indices=False, ratio='majority')

X_tl, y_tl = tl.fit_sample(X_imb_train, y_imb_train)

plot_2d_space(X_tl, y_tl, 'Tomek links under-sampling')

# In[40]:

from sklearn.ensemble import RandomForestClassifier
# Weight samples labelled 0 with 1 and every other sample with 5.
sample_weights = np.array([1 if i == 0 else 5 for i in y_tl])

rfc_weighted_balanced = RandomForestClassifier(n_jobs=-1, warm_start=True)
rfc_weighted_balanced.fit(X_tl, y_tl, sample_weight=sample_weights)

# Evaluate on the untouched test split.
print(get_report(y_imb_test, rfc_weighted_balanced.predict(X_imb_test)))

# Not good. Let's try something else...
예제 #35
0
# from imblearn.under_sampling import NearMiss
# nm1 = NearMiss(random_state=0, version=1)
# X_resampled_nm1, y_resampled = nm1.fit_sample(X, y)
# print(sorted(Counter(y_resampled).items()))
# Cleaning under-sampling techniques: Tomek's links.
# Two samples x and y of different classes form a Tomek link when no other
# sample z satisfies d(x, z) < d(x, y) or d(y, z) < d(x, y), i.e. they are
# each other's nearest neighbours. One of them is then likely noise, or both
# lie near the class boundary. The ``ratio`` parameter controls which samples
# of a link are removed: 'auto' (default) removes the majority-class sample,
# 'all' removes both.
from imblearn.under_sampling import TomekLinks
tl = TomekLinks(random_state=0, ratio='all')
X_resampled_tl, y_resampled_tl = tl.fit_sample(train_set_1_1, label)
# Report the class counts of the Tomek-links output (this used to print the
# counts of a different, ``_cc``-suffixed resampling result).
print('TomekLinks ;', sorted(Counter(y_resampled_tl).items()))
x_train_tl, x_test_tl, y_train_tl, y_test_tl = train_test_split(X_resampled_tl,
                                                                y_resampled_tl,
                                                                random_state=1)
svm_clf.fit(x_train_tl, y_train_tl)
joblib.dump(svm_clf, '../model/tl_sample_model.pkl')

# Tomek-links evaluation.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(svm_clf, x_test_tl, y_test_tl, cv=5)
print('tl_score:', scores)
# Predict on, and score against, the Tomek-links test split. The previous
# code predicted on ``x_test_cc`` and scored ``pred3`` against ``y_test_cc``,
# i.e. it reported numbers belonging to another experiment.
pred6 = svm_clf.predict(x_test_tl)
print('tl_accuracy_score:', metrics.accuracy_score(y_test_tl, pred6))
print('tl_f1_score:', metrics.f1_score(y_test_tl, pred6, average="micro"))
# Cohen's kappa is a model-evaluation metric computed from the confusion matrix.
from sklearn.metrics import cohen_kappa_score
예제 #36
0
print(__doc__)

# Synthetic two-class data: 500 points for class 0 and 50 points for class 1
# (the latter shifted by [2, 2] and drawn with a smaller spread).
rng = np.random.RandomState(0)
n_samples_1 = 500
n_samples_2 = 50
X_syn = np.r_[1.5 * rng.randn(n_samples_1, 2),
              0.5 * rng.randn(n_samples_2, 2) + [2, 2]]
y_syn = np.array([0] * (n_samples_1) + [1] * (n_samples_2))
X_syn, y_syn = shuffle(X_syn, y_syn)
# NOTE(review): the split below is not used in the visible part of the
# script; the sampler is fitted on the full X_syn / y_syn.
X_syn_train, X_syn_test, y_syn_train, y_syn_test = train_test_split(X_syn,
                                                                    y_syn)

# remove Tomek links
tl = TomekLinks(return_indices=True)
X_resampled, y_resampled, idx_resampled = tl.fit_sample(X_syn, y_syn)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Removed samples are all positions that are absent from idx_resampled.
idx_samples_removed = np.setdiff1d(np.arange(X_syn.shape[0]),
                                   idx_resampled)
# Plot the kept samples per class, then the removed samples on top.
idx_class_0 = y_resampled == 0
plt.scatter(X_resampled[idx_class_0, 0], X_resampled[idx_class_0, 1],
            alpha=.8, label='Class #0')
plt.scatter(X_resampled[~idx_class_0, 0], X_resampled[~idx_class_0, 1],
            alpha=.8, label='Class #1')
plt.scatter(X_syn[idx_samples_removed, 0], X_syn[idx_samples_removed, 1],
            alpha=.8, label='Removed samples')

# make nice plotting
예제 #37
0
from imblearn.under_sampling import TomekLinks

# Generate the dataset
# Imbalanced two-class data set: 5000 samples, 20 features, class weights
# 0.1 / 0.9, fixed seed.
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Tomek Links cleaning
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_sample(X, y)
# Project the resampled data with the PCA fitted on the original data.
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

# Left subplot: the original data, one scatter call per class.
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],