Example #1
def test_cnn_fit_sample_with_object():
    """Test the fit sample routine with a knn object"""

    # Resample the data
    knn = KNeighborsClassifier(n_neighbors=1)
    cnn = CondensedNearestNeighbour(random_state=RND_SEED,
                                    n_neighbors=knn)
    X_resampled, y_resampled = cnn.fit_sample(X, Y)

    X_gt = np.array([[-0.10903849, -0.12085181],
                     [0.01936241, 0.17799828],
                     [0.05230552, 0.09043907],
                     [-1.25020462, -0.40402054],
                     [0.70524765, 0.39816382],
                     [0.35831463, 1.33483198],
                     [-0.284881, -0.62730973],
                     [0.03394306, 0.03986753],
                     [-0.01252787, 0.34102657],
                     [0.15198585, 0.12512646]])
    y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)

    cnn = CondensedNearestNeighbour(random_state=RND_SEED,
                                    n_neighbors=1)
    X_resampled, y_resampled = cnn.fit_sample(X, Y)
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
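
Example #1 (and most snippets below) use the pre-0.4 imbalanced-learn API. As a hedged aside, the same resampling with a recent release (0.4 or later, where fit_sample became fit_resample and return_indices gave way to the sample_indices_ attribute) would look like this sketch on toy data:

import numpy as np
from imblearn.under_sampling import CondensedNearestNeighbour

rng = np.random.RandomState(0)
X_demo = rng.randn(40, 2)                        # toy two-class data
y_demo = np.array([0] * 10 + [1] * 30)           # class 1 is the majority

cnn = CondensedNearestNeighbour(random_state=0)
X_res, y_res = cnn.fit_resample(X_demo, y_demo)  # formerly fit_sample
idx_under = cnn.sample_indices_                  # formerly return_indices=True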
Example #2
    def fit(self, X, y=None):
        # Under-sample with Condensed Nearest Neighbour before fitting.
        # (Despite the variable name, this is CNN, not SMOTE.)
        smote = CondensedNearestNeighbour(size_ngh=51, n_seeds_S=51)
        # Accuracy: 0.939693267481
        # Precision: 0.238095238095
        # Recall: 0.897435897436

        # Accuracy: 0.962568234988
        # Precision: 0.324468085106
        # Recall: 0.782051282051
        # SMOTE(ratio=ratio, kind='borderline1')
        # Accuracy: 0.971146347803
        # Precision: 0.372093023256
        # Recall: 0.615384615385
        # SMOTE(ratio=ratio, kind='borderline2')
        # Accuracy: 0.965427605927
        # Precision: 0.333333333333
        # Recall: 0.705128205128
        # svm_args = {'class_weight': 'auto'}
        # svmsmote = SMOTE(ratio=ratio, kind='svm', **svm_args)
        # Accuracy: 0.972186119054
        # Precision: 0.395683453237
        # Recall: 0.705128205128

        # smote = SMOTE(ratio='auto', kind='regular')
        X, y = smote.fit_sample(X.toarray(), y)
        # weight positive samples inversely to the positive-class frequency
        weights = np.array([1 / y.mean() if i == 1 else 1 for i in y])
        return super(RandomForestClassifier, self).fit(X, y, sample_weight=weights)
Example #3
def random_instance_selection(dfZ, x, blackbox, dataset):
    dfZ1, Z = random_neighborhood(dfZ, x, blackbox, dataset)
    y = blackbox.predict(Z)

    cnn = CondensedNearestNeighbour(return_indices=True)
    Z, _, _ = cnn.fit_sample(Z, y)
    dfZ = build_df2explain(blackbox, Z, dataset)
    return dfZ, Z
Example #4
def test_cnn_fit_sample_with_object():
    knn = KNeighborsClassifier(n_neighbors=1)
    cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=knn)
    X_resampled, y_resampled = cnn.fit_sample(X, Y)

    X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828],
                     [0.05230552, 0.09043907], [-1.25020462, -0.40402054],
                     [0.70524765, 0.39816382], [0.35831463, 1.33483198],
                     [-0.284881, -0.62730973], [0.03394306, 0.03986753],
                     [-0.01252787, 0.34102657], [0.15198585, 0.12512646]])
    y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)

    cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=1)
    X_resampled, y_resampled = cnn.fit_sample(X, Y)
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #5
def test_cnn_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    cnn = CondensedNearestNeighbour(random_state=RND_SEED)
    X_resampled, y_resampled = cnn.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'cnn_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'cnn_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #6
def test_cnn_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    cnn = CondensedNearestNeighbour(random_state=RND_SEED)
    X_resampled, y_resampled = cnn.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'cnn_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'cnn_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #7
def test_cnn_fit_sample():
    cnn = CondensedNearestNeighbour(random_state=RND_SEED)
    X_resampled, y_resampled = cnn.fit_sample(X, Y)

    X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828],
                     [0.05230552, 0.09043907], [-1.25020462, -0.40402054],
                     [0.70524765, 0.39816382], [0.35831463, 1.33483198],
                     [-0.284881, -0.62730973], [0.03394306, 0.03986753],
                     [-0.01252787, 0.34102657], [0.15198585, 0.12512646]])
    y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #8
def test_cnn_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    cnn = CondensedNearestNeighbour(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = cnn.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'cnn_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'cnn_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'cnn_idx.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #9
def test_cnn_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    cnn = CondensedNearestNeighbour(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = cnn.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'cnn_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'cnn_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'cnn_idx.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #10
def test_cnn_fit_sample_with_indices():
    cnn = CondensedNearestNeighbour(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = cnn.fit_sample(X, Y)

    X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828],
                     [0.05230552, 0.09043907], [-1.25020462, -0.40402054],
                     [0.70524765, 0.39816382], [0.35831463, 1.33483198],
                     [-0.284881, -0.62730973], [0.03394306, 0.03986753],
                     [-0.01252787, 0.34102657], [0.15198585, 0.12512646]])
    y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    idx_gt = np.array([4, 11, 17, 12, 19, 9, 5, 7, 14, 18])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #11
def test_cnn_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    cnn = CondensedNearestNeighbour(random_state=RND_SEED)
    X_resampled, y_resampled = cnn.fit_sample(X, Y)

    X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828],
                     [0.05230552, 0.09043907], [-1.25020462, -0.40402054],
                     [0.70524765, 0.39816382], [0.35831463, 1.33483198],
                     [-0.284881, -0.62730973], [0.03394306, 0.03986753],
                     [-0.01252787, 0.34102657], [0.15198585, 0.12512646]])
    y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #12
def train_stage(df_path, cb_path):

    print('Load Train Data.')
    df = pd.read_csv(df_path)
    print('\nShape of Train Data: {}'.format(df.shape))

    y_df = np.array(df['target'])
    df_ids = np.array(df.index)
    df.drop(['ID_code', 'target'], axis=1, inplace=True)

    cb_cv_result = np.zeros(df.shape[0])

    skf = StratifiedKFold(n_splits=15, shuffle=False)  # random_state has no effect (and newer scikit-learn rejects it) when shuffle=False
    skf.get_n_splits(df_ids, y_df)

    #sm = TomekLinks(random_state=42)
    sm = CondensedNearestNeighbour(random_state=42, n_jobs=3)

    print('\nModel Fitting...')
    for counter, ids in enumerate(skf.split(df_ids, y_df)):
        print('\nFold {}'.format(counter + 1))
        X_fit, y_fit = df.values[ids[0]], y_df[ids[0]]
        X_val, y_val = df.values[ids[1]], y_df[ids[1]]

        X_fit, y_fit = sm.fit_sample(X_fit, y_fit)

        print('CatBoost')
        cb_cv_result[ids[1]] += fit_cb(X_fit,
                                       y_fit,
                                       X_val,
                                       y_val,
                                       counter,
                                       cb_path,
                                       name='cb')

        del X_fit, X_val, y_fit, y_val
        gc.collect()

    auc_cb = round(roc_auc_score(y_df, cb_cv_result), 4)
    print('Catboost VAL AUC: {}'.format(auc_cb))

    return 0
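
As a hedged aside, the per-fold fit_sample calls in train_stage can also be expressed with an imblearn Pipeline, which resamples only the training split of each fold; the data and classifier below are toy placeholders, not names from this example:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import CondensedNearestNeighbour

X_toy, y_toy = make_classification(n_samples=300, weights=[0.9, 0.1],
                                   random_state=42)
pipe = make_pipeline(CondensedNearestNeighbour(random_state=42),
                     LogisticRegression(max_iter=1000))
# the sampler runs on each training fold; validation folds stay untouched
print(cross_val_score(pipe, X_toy, y_toy, cv=5, scoring='roc_auc'))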
Example #13
def test_cnn_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    cnn = CondensedNearestNeighbour(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = cnn.fit_sample(X, Y)

    X_gt = np.array([[-0.10903849, -0.12085181],
                     [0.01936241, 0.17799828],
                     [0.05230552, 0.09043907],
                     [-1.25020462, -0.40402054],
                     [0.70524765, 0.39816382],
                     [0.35831463, 1.33483198],
                     [-0.284881, -0.62730973],
                     [0.03394306, 0.03986753],
                     [-0.01252787, 0.34102657],
                     [0.15198585, 0.12512646]])
    y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    idx_gt = np.array([4, 11, 17, 12, 19, 9, 5, 7, 14, 18])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #14
scores = cross_validate(enn_pipe_rf,
                        X_train,
                        y_train,
                        cv=10,
                        scoring=('roc_auc', 'average_precision'))
scores['test_roc_auc'].mean(), scores['test_average_precision'].mean()
# (0.9248526844001812, 0.6883592815252976)

######### Condensed Nearest Neighbor #########

from imblearn.under_sampling import CondensedNearestNeighbour

# opposite of ENN: starts from a small subset and iteratively adds points that a 1-NN fit on the subset misclassifies

cnn = CondensedNearestNeighbour()
X_train_cnn, y_train_cnn = cnn.fit_sample(X_train, y_train)
print(X_train_cnn.shape)
print(np.bincount(y_train_cnn))
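
# A hedged sketch of the condensing loop described above (Hart's CNN, one
# pass), included only as an illustration of the idea -- not imblearn's
# exact implementation:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

def condense(X, y, minority_label, seed=0):
    rng = np.random.RandomState(seed)
    keep = list(np.flatnonzero(y == minority_label))   # keep every minority point
    rest = list(np.flatnonzero(y != minority_label))
    keep.append(rest.pop(rng.randint(len(rest))))      # seed with one majority point
    nn1 = KNeighborsClassifier(n_neighbors=1)
    for i in rng.permutation(rest):                    # absorb misclassified points
        nn1.fit(X[keep], y[keep])
        if nn1.predict(X[i:i + 1])[0] != y[i]:
            keep.append(i)
    return np.sort(keep)                               # indices of the condensed set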

### Pipeline method

cnn_pipe = make_imb_pipeline(CondensedNearestNeighbour(), LogisticRegression())

scores = cross_validate(cnn_pipe,
                        X_train,
                        y_train,
                        cv=10,
                        scoring=('roc_auc', 'average_precision'))
pd.DataFrame(scores)[['test_roc_auc', 'test_average_precision']].mean()

cnn_pipe_rf = make_imb_pipeline(CondensedNearestNeighbour(),
Example #15
def balance_cnn(input):
    input_x, input_y = input
    cnn = CondensedNearestNeighbour(random_state=42)
    X_res, y_res = cnn.fit_sample(input_x, input_y)
    return X_res, y_res
Example #16
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5

# initial look at the data
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1, edgecolor='k')
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()
# normalize the four features (sepal length/width, petal length/width);
# the iris species labels come already integer-encoded in iris.target
x = iris.data[:, 0:4]
y = iris.target
X_normalized = normalize(x, axis=0)
x_train, x_test, y_train, y_test = train_test_split(X_normalized,
                                                    y,
                                                    test_size=0.20)
cnn = CondensedNearestNeighbour(return_indices=True)
X_resampled, y_resampled, idx_resampled = cnn.fit_sample(X_normalized, y)
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_resampled, y_resampled)

y_pred = clf.predict(x_test)
print(confusion_matrix(y_test, y_pred))
target_names = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
print(classification_report(y_test, y_pred, target_names=target_names))
Example #17
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
from itertools import product
from sklearn.neighbors import DistanceMetric
from imblearn.under_sampling import CondensedNearestNeighbour

iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target

cnn = CondensedNearestNeighbour()
X_cnn, y_cnn = cnn.fit_sample(X, y)

# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

metrics = ['euclidean', 'mahalanobis']

n_neighbors = [1, 3]

datasets = [{
    "X": X,
    "y": y,
    "cnn": False
}, {
    "X": X_cnn,
    "y": y_cnn,
    "cnn": True
Example #18
# Method3:(optional) -->
income2 = pd.get_dummies(income_raw)['>50K']

# Print the number of features after one-hot encoding
encoded = list(features_final.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))

# Uncomment the following line to see the encoded feature names
print(encoded)

#-----------------
# @Raafat: Some techniques to deal with imbalanced data:
# --> under sampling
from imblearn.under_sampling import CondensedNearestNeighbour
cnn = CondensedNearestNeighbour(random_state=42)
X_res, y_res = cnn.fit_sample(features_final[0:300], income[0:300])
print('not Resampled dataset shape {}'.format(income[0:300].value_counts()))
print('cnn Resampled dataset shape {}'.format(pd.Series(y_res).value_counts()))

from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_sample(features_final[0:300], income[0:300])
print('rus Resampled dataset shape {}'.format(pd.Series(y_res).value_counts()))

from imblearn.under_sampling import TomekLinks
tl = TomekLinks(random_state=42)
X_res, y_res = tl.fit_sample(features_final[0:300], income[0:300])
print('tl Resampled dataset shape {}'.format(pd.Series(y_res).value_counts()))

# --> over sampling
from imblearn.over_sampling import SMOTE
Example #19

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Condensed Nearest Neighbours
cnn = CondensedNearestNeighbour()
X_resampled, y_resampled = cnn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
Example #20
from matplotlib.colors import ListedColormap
from sklearn import neighbors
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF', '#FFFFE0'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF', '#9B870C'])

for yind, data in enumerate(X):  # list.index() on arrays is unreliable; enumerate is safe
    yt = y[yind]

    X_train, X_test, y_train, y_test = train_test_split(
        data, yt, test_size=0.2, random_state=1, stratify=yt)

    cnn = CondensedNearestNeighbour()
    Xc, yc = cnn.fit_sample(data, yt)
    # split the condensed set (the original split data/yt again here and
    # never used the resampled Xc/yc)
    X_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = train_test_split(
        Xc, yc, test_size=0.2, random_state=1, stratify=yc)

    clf1 = neighbors.KNeighborsClassifier(n_neighbors=1)
    clf1.fit(X_train, y_train)
    pred1 = clf1.predict(X_test)
    pred_cnn1 = clf1.predict(X_test_cnn)

    x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1
    y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))

    x_minc, x_maxc = Xc[:, 0].min() - 1, Xc[:, 0].max() + 1
    y_minc, y_maxc = Xc[:, 1].min() - 1, Xc[:, 1].max() + 1
    xxc, yyc = np.meshgrid(np.arange(x_minc, x_maxc, 0.02),
                           np.arange(y_minc, y_maxc, 0.02))
Example #21
from imblearn.under_sampling import CondensedNearestNeighbour

## Generate the dataset
#X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
#                           n_informative=3, n_redundant=1, flip_y=0,
#                           n_features=20, n_clusters_per_class=1,
#                           n_samples=200, random_state=10)

# Apply Condensed Nearest Neighbours
cnn = CondensedNearestNeighbour(return_indices=True)
#Xtrain_new, Ytrain_new, idx_resampled = cnn.fit_sample(Xtrain[selectList].iloc[:100,:], Ytrain.iloc[:100])

from imblearn.combine import SMOTETomek
smote_tomek = SMOTETomek(random_state=0)

Xtrain_new, Ytrain_new, _ = cnn.fit_sample(Xtrain[selectList].iloc[:400, :],
                                           Ytrain.iloc[:400])
Xtrain_new = pd.DataFrame(Xtrain_new)
Ytrain_new = pd.DataFrame(Ytrain_new)

for i in range(800, Xtrain.shape[0], 400):  #Xtrain.shape[0]  36279
    X_resampled, y_resampled, _ = cnn.fit_sample(
        Xtrain[selectList].iloc[(i - 400):i, :], Ytrain.iloc[(i - 400):i])
    Xtrain_new = pd.concat([
        Xtrain_new.reset_index(drop=True),
        pd.DataFrame(X_resampled).reset_index(drop=True)
    ],
                           axis=0)
    Ytrain_new = pd.concat([
        Ytrain_new.reset_index(drop=True),
        pd.DataFrame(y_resampled).reset_index(drop=True)
    ],
Example #22
import matplotlib.pyplot as plt
from pylab import subplot, title
from matplotlib.colors import ListedColormap
from numpy import concatenate
from sklearn.datasets import make_blobs, make_gaussian_quantiles
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import CondensedNearestNeighbour

X1, y1 = make_blobs(n_samples=150, centers=4, n_features=2, random_state=21)
X2, y2 = make_gaussian_quantiles(mean=(2, 2), cov=3., n_samples=150,
                                 n_features=2, n_classes=3, random_state=9)
X3, y3 = make_gaussian_quantiles(mean=(5, 5), cov=5., n_samples=150,
                                 n_features=2, n_classes=2, random_state=15)

X = concatenate([X1, X2, X3])
y = concatenate([y1, y2, y3])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=35)

cnn = CondensedNearestNeighbour(random_state=0)  # random_state gives the same result on every run
X_res1, y_res1 = cnn.fit_sample(X, y)

# split the CNN-resampled data so its accuracy can be measured separately
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_res1, y_res1,
                                                        test_size=0.25,
                                                        random_state=35)

h = .02

cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF', '#8B008B'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF', '#8B008B'])

clf1 = KNeighborsClassifier(n_neighbors=1, weights='uniform')
clf2 = KNeighborsClassifier(n_neighbors=1, weights='uniform')
clf1.fit(X_train, y_train)
clf2.fit(X_train1, y_train1)
pred1 = clf1.predict(X_test)

pred2 = clf2.predict(X_test1)
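
A hedged follow-up for this truncated snippet (assuming accuracy_score is imported from sklearn.metrics; pred1 and pred2 come from the code above): compare 1-NN accuracy on the original and on the CNN-condensed split.

from sklearn.metrics import accuracy_score
print('1-NN on the full training set:', accuracy_score(y_test, pred1))
print('1-NN on the CNN-condensed set:', accuracy_score(y_test1, pred2))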
Example #23
    def balance_cnn(self):
        cnn = CondensedNearestNeighbour(random_state=42)
        X_res, y_res = cnn.fit_sample(self.vec, self.target)
        Classification_JCL.split_data(self, X_res, y_res)
Example #24
import collections

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.under_sampling import CondensedNearestNeighbour

data = pd.read_csv("creditcard.csv")
d1 = np.array(data['Amount'])
data['normAmount'] = StandardScaler().fit_transform(d1.reshape(-1, 1))
data = data.drop(['Time', 'Amount'], axis=1)
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# sampling

cnn = CondensedNearestNeighbour(random_state=1)
X_sampled, y_sampled = cnn.fit_sample(X, y.values.ravel())
print("sampled data size", collections.Counter(y_sampled))
X_train, X_test, y_train, y_test = train_test_split(X_sampled,
                                                    y_sampled,
                                                    test_size=0.3,
                                                    random_state=0)
X_train_sampled_df = pd.DataFrame(X_train)
y_train_sampled_df = pd.DataFrame(y_train)
X_test_sampled_df = pd.DataFrame(X_test)
y_test_sampled_df = pd.DataFrame(y_test)

#random forest
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf.fit(X_train_sampled_df, y_train_sampled_df.values.ravel())
y_pred = clf.predict(X_test_sampled_df)
print("predicted")
Example #25
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 18 16:03:54 2018

@author: Student
"""

import matplotlib.pyplot as plt
import pandas as pd
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.under_sampling import CondensedNearestNeighbour

dataset = pd.read_csv('car_datacat.csv')
x = dataset.iloc[:, 0:6].values
y = dataset.iloc[:, 6].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
cnn = CondensedNearestNeighbour(return_indices=True)
X_resampled, y_resampled, idx_resampled = cnn.fit_sample(x, y)
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_resampled, y_resampled)
y_pred = clf.predict(x_test)
print(confusion_matrix(y_test, y_pred))
Example #26
            except Exception as e:
                print('an issue with {}, rate: {}, variant: {}'.format(
                    dataset, rate, variant))
                print(e)

        if CNN_FLAG:
            variant = 'CNN'
            print('>> {}, rate: {}, variant: {}'.format(
                dataset, rate, variant))
            try:
                dataset_size = X_train.shape[0]
                coreset_size = max(int(dataset_size * rate / 100), 1)
                startTime = datetime.now()
                cnn = CondensedNearestNeighbour(random_state=SEED)
                observers, total_labels = cnn.fit_sample(X_train, y_train)
                observers, total_labels = fix_rate(X_train, y_train, observers,
                                                   total_labels, coreset_size)
                interm = (datetime.now() - startTime).total_seconds()
                startTime = datetime.now()
                try:
                    neigh = KNeighborsClassifier(n_neighbors=5)
                except:
                    neigh = KNeighborsClassifier(n_neighbors=1)
                neigh.fit(observers, total_labels)
                y_pred = neigh.predict(X_test)
                final = (datetime.now() - startTime).total_seconds()
                inds = get_indices_(y_test, y_pred)
                CNN_RES = fill_table(CNN_RES, inds, j, interm, final)

            except Exception as e:
Example #27
##############################################################################
### Tomek Links
tl = TomekLinks(return_indices=True)
X_resampled, y_resampled, idx_resampled = tl.fit_sample(X_train, y_train)

plot_(X_resampled, y_resampled, remove=False)
plot_(X_resampled, y_resampled, remove=True)

tl_tree = tree.fit(X_resampled, y_resampled)
tl_ = confusion_matrix(y_test, tl_tree.predict(X_test))

###############################################################################
### Condensed Nearest Neighbor
cnn = CondensedNearestNeighbour(return_indices=True)
X_resampled, y_resampled, idx_resampled = cnn.fit_sample(X_train, y_train)

plot_(X_resampled, y_resampled, remove=False)
plot_(X_resampled, y_resampled, remove=True)

cnn_tree = tree.fit(X_resampled, y_resampled)
cnn_ = confusion_matrix(y_test, cnn_tree.predict(X_test))

###############################################################################
### One-side selection
oss = OneSidedSelection(return_indices=True)
X_resampled, y_resampled, idx_resampled = oss.fit_sample(X_train, y_train)

plot_(X_resampled, y_resampled, remove=False)
plot_(X_resampled, y_resampled, remove=True)
Example #28
def draw_cnn(k, metric):
    names = ['x', 'y', 'color']

    df = pd.DataFrame(mapped_colors, columns=names)
    # print(df.head())

    originalX = np.array(df.iloc[:, 0:2])  # .ix was removed from pandas; iloc is the positional equivalent
    originaly = np.array(df['color'])

    cnn = CondensedNearestNeighbour(n_neighbors=k, return_indices=True)
    X_resampled, y_resampled, idx_resampled = cnn.fit_sample(
        originalX, originaly)

    X = X_resampled
    y = y_resampled

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)

    if metric == 'mahalanobis':
        knn = KNeighborsClassifier(
            n_neighbors=k,
            metric=metric,
            metric_params={'V': np.cov(np.transpose(X))})
    else:
        knn = KNeighborsClassifier(n_neighbors=k, metric=metric)

    knn.fit(X_train, y_train)

    pred = knn.predict(X_test)

    err = 1 - accuracy_score(y_test, pred)
    print('\nThe error is ' + str(err * 100))
    print('\nPercentage points left after CNN: ' +
          str(len(idx_resampled) / total_points * 100))

    h = .02

    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
    cmap_black = ListedColormap(['#FFFFFF', '#FFFFFF', '#FFFFFF'])

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])

    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    plt.scatter(originalX[:, 0],
                originalX[:, 1],
                c=originaly,
                cmap=cmap_black,
                edgecolor='k',
                s=20)

    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i)" % k)
Example #29
def test_cnn_fit_sample_with_wrong_object():
    knn = 'rnd'
    cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=knn)
    with raises(ValueError, match="has to be a int or an "):
        cnn.fit_sample(X, Y)
Example #30

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Condensed Nearest Neighbours
cnn = CondensedNearestNeighbour()
X_resampled, y_resampled = cnn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
Example #31
        score = clf.predict_proba(X_test)
        evaluate(y_test, score)

    # Tomek's links

    # Edited data set using nearest neighbours
    print("######################## ENN ########################")
    enn = EditedNearestNeighbours(random_state=0)
    X_res, y_res = enn.fit_sample(X_train, y_train)
    print(X_train.shape)
    print(X_res.shape)
    print(np.sum(y_res))
    clf = SVC(probability=True)
    clf.fit(X_res, y_res)
    score = clf.predict_proba(X_test)
    evaluate(y_test, score)

    # Condensed nearest neighbors and derived algorithms
    print("######################## CNN ########################")
    cnn = CondensedNearestNeighbour(random_state=0)
    X_res, y_res = cnn.fit_sample(X_train, y_train)
    print(X_train.shape)
    print(X_res.shape)
    print(np.sum(y_res))
    clf = SVC(probability=True)
    clf.fit(X_res, y_res)
    score = clf.predict_proba(X_test)
    evaluate(y_test, score)

    pass