def fit(self, c_data, x_data, y_data):
        # track how the size of the training set evolves at each preprocessing step
        self.samplesize = []
        self.samplesize.append(len(x_data))

        if self.reject_by_calendar:
            mask = self.mask_cal(c_data, y_data)
            # filtering of the rows rejected by this calendar criterion is disabled:
            # keeping them might improve the training of the second classifier
            #x_data = normalize(x_data[mask])
            #y_data = y_data[mask]
            self.samplesize.append(len(x_data))

        if self.use_resampling:
            # undersample
            resampler = AllKNN()
            x_data, y_data = resampler.fit_sample(x_data, y_data)
            self.samplesize.append(len(x_data))

            # oversample with SMOTE, then clean noisy samples with ENN
            # (SMOTEENN is a combined over- and under-sampling method)
            resampler = SMOTEENN()
            x_data, y_data = resampler.fit_sample(x_data, y_data)
            self.samplesize.append(len(x_data))

        # train clf only with filtered and resampled data
        if self.use_weights:
            try:
                self.clf.fit(x_data, y_data, self.get_weights(y_data))
            except TypeError:
                print "The classifier selected does not admit weights for training samples"
                print "Switching to no weights"
                self.use_weights = False
                self.clf.fit(x_data, y_data)
        else:
            self.clf.fit(x_data, y_data)
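The get_weights helper used above is not part of this snippet. A minimal sketch of what it might look like, assuming the usual class-frequency balancing heuristic from scikit-learn (this implementation is an assumption, not the original):

from sklearn.utils.class_weight import compute_sample_weight

def get_weights(self, y_data):
    # hypothetical sketch: weight samples inversely proportionally to their
    # class frequency, so the minority class counts more during training
    return compute_sample_weight(class_weight='balanced', y=y_data)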
Example #2
def test_allknn_fit_sample_with_indices():
    allknn = AllKNN(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = allknn.fit_sample(X, Y)

    X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
                     [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
                     [1.02956816, 0.36061601], [1.12202806, 0.33811558],
                     [-1.10146139, 0.91782682], [0.73489726, 0.43915195],
                     [0.50307437, 0.498805], [0.84929742, 0.41042894],
                     [0.62649535, 0.46600596], [0.98382284, 0.37184502],
                     [0.69804044, 0.44810796], [0.04296502, -0.37981873],
                     [0.28294738, -1.00125525], [0.34218094, -0.58781961],
                     [0.2096964, -0.61814058], [1.59068979, -0.96622933],
                     [0.73418199, -0.02222847], [0.79270821, -0.41386668],
                     [1.16606871, -0.25641059], [1.0304995, -0.16955962],
                     [0.48921682, -1.38504507], [-0.03918551, -0.68540745],
                     [0.24991051, -1.00864997], [0.80541964, -0.34465185],
                     [0.1732627, -1.61323172]])
    y_gt = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2
    ])
    idx_gt = np.array([
        6, 13, 32, 39, 4, 5, 14, 16, 22, 23, 24, 30, 37, 2, 11, 12, 17, 20, 21,
        25, 26, 28, 31, 33, 34, 35, 36
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_allclose(y_resampled, y_gt, rtol=R_TOL)
    assert_allclose(idx_under, idx_gt, rtol=R_TOL)
Example #3
def test_allknn_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    allknn = AllKNN(random_state=RND_SEED)
    X_resampled, y_resampled = allknn.fit_sample(X, Y)

    X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
                     [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
                     [1.02956816, 0.36061601], [1.12202806, 0.33811558],
                     [-1.10146139, 0.91782682], [0.73489726, 0.43915195],
                     [0.50307437, 0.498805], [0.84929742, 0.41042894],
                     [0.62649535, 0.46600596], [0.98382284, 0.37184502],
                     [0.69804044, 0.44810796], [0.04296502, -0.37981873],
                     [0.28294738, -1.00125525], [0.34218094, -0.58781961],
                     [0.2096964, -0.61814058], [1.59068979, -0.96622933],
                     [0.73418199, -0.02222847], [0.79270821, -0.41386668],
                     [1.16606871, -0.25641059], [1.0304995, -0.16955962],
                     [0.48921682, -1.38504507], [-0.03918551, -0.68540745],
                     [0.24991051, -1.00864997], [0.80541964, -0.34465185],
                     [0.1732627, -1.61323172]])
    y_gt = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2
    ])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_almost_equal(y_resampled, y_gt)
Example #4
def test_allknn_fit_sample_with_nn_object():
    """Test the fit sample routine using a NN object"""

    # Resample the data
    nn = NearestNeighbors(n_neighbors=4)
    allknn = AllKNN(n_neighbors=nn, random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = allknn.fit_sample(X, Y)

    X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
                     [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
                     [-0.12840393, 0.66446571], [1.02956816, 0.36061601],
                     [1.12202806, 0.33811558], [-0.35946678, 0.72510189],
                     [-1.10146139, 0.91782682], [0.73489726, 0.43915195],
                     [-0.28479268, 0.70459548], [0.50307437, 0.498805],
                     [0.84929742, 0.41042894], [0.62649535, 0.46600596],
                     [0.98382284, 0.37184502], [0.69804044, 0.44810796],
                     [1.32319756, -0.13181616], [0.04296502, -0.37981873],
                     [0.28294738, -1.00125525], [0.34218094, -0.58781961],
                     [0.2096964, -0.61814058], [1.59068979, -0.96622933],
                     [0.73418199, -0.02222847], [0.79270821, -0.41386668],
                     [1.16606871, -0.25641059], [1.0304995, -0.16955962],
                     [0.48921682, -1.38504507], [-0.03918551, -0.68540745],
                     [0.24991051, -1.00864997], [0.80541964, -0.34465185],
                     [0.1732627, -1.61323172]])
    y_gt = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2
    ])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #5
def test_allknn_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    allknn = AllKNN(random_state=RND_SEED)
    X_resampled, y_resampled = allknn.fit_sample(X, Y)

    X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
                     [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
                     [1.02956816, 0.36061601], [1.12202806, 0.33811558],
                     [-1.10146139, 0.91782682], [0.73489726, 0.43915195],
                     [0.50307437, 0.498805], [0.84929742, 0.41042894],
                     [0.62649535, 0.46600596], [0.98382284, 0.37184502],
                     [0.69804044, 0.44810796], [0.04296502, -0.37981873],
                     [0.28294738, -1.00125525], [0.34218094, -0.58781961],
                     [0.2096964, -0.61814058], [1.59068979, -0.96622933],
                     [0.73418199, -0.02222847], [0.79270821, -0.41386668],
                     [1.16606871, -0.25641059], [1.0304995, -0.16955962],
                     [0.48921682, -1.38504507], [-0.03918551, -0.68540745],
                     [0.24991051, -1.00864997], [0.80541964, -0.34465185],
                     [0.1732627, -1.61323172]])
    y_gt = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_allclose(y_resampled, y_gt, rtol=R_TOL)
Example #6
def test_all_knn_allow_minority():
    X, y = make_classification(n_samples=10000,
                               n_features=2,
                               n_informative=2,
                               n_redundant=0,
                               n_repeated=0,
                               n_classes=3,
                               n_clusters_per_class=1,
                               weights=[0.2, 0.3, 0.5],
                               class_sep=0.4,
                               random_state=0)

    allknn = AllKNN(allow_minority=True)
    X_res_1, y_res_1 = allknn.fit_sample(X, y)
    allknn = AllKNN()
    X_res_2, y_res_2 = allknn.fit_sample(X, y)
    assert len(y_res_1) < len(y_res_2)
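With allow_minority=True, AllKNN keeps editing even when a majority class would shrink below the size of the minority class, so it can remove strictly more samples than the default setting; that is exactly what the final assertion checks.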
Example #7
def test_allknn_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    allknn = AllKNN(random_state=RND_SEED)
    X_resampled, y_resampled = allknn.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'allknn_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'allknn_y.npy'))
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_almost_equal(y_resampled, y_gt)
Example #8
def test_allknn_fit_sample_mode():
    """Test the fit sample routine using the mode as selection"""

    # Resample the data
    allknn = AllKNN(random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = allknn.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'allknn_x_mode.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'allknn_y_mode.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_almost_equal(y_resampled, y_gt)
Example #12
def test_allknn_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    allknn = AllKNN(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = allknn.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'allknn_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'allknn_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'allknn_idx.npy'))
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_almost_equal(y_resampled, y_gt)
    assert_array_almost_equal(idx_under, idx_gt)
Example #14
def test_multiclass_fit_sample():
    """Test fit sample method with multiclass target"""

    # Make y to be multiclass
    y = Y.copy()
    y[0:1000] = 2

    # Resample the data
    ann = AllKNN(random_state=RND_SEED)
    X_resampled, y_resampled = ann.fit_sample(X, y)

    # Check the size of y
    count_y_res = Counter(y_resampled)
    assert_equal(count_y_res[0], 341)
    assert_equal(count_y_res[1], 2485)
    assert_equal(count_y_res[2], 212)
Example #15
def under_sampling(xTrain, yTrain, neighbors=200):
    """
    Reduces the sample size of the majority class using the specified model.
    It must always be applied to the training set only.
    :param xTrain: X training set.
    :param yTrain: y training set.
    :param neighbors: size of the neighbourhood considered by the AllKNN
        editing rule when deciding whether a majority sample is kept
    :return: undersampled xTrain and yTrain
    """

    xTrainNames = xTrain.columns.values.tolist()
    yTrainNames = yTrain.columns.values.tolist()

    model = AllKNN(random_state=42, ratio='majority', n_neighbors=neighbors)

    xTrain, yTrain = model.fit_sample(xTrain, yTrain)

    xTrain = pd.DataFrame(xTrain, columns=xTrainNames)
    yTrain = pd.DataFrame(yTrain, columns=yTrainNames)

    return xTrain, yTrain
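A minimal usage sketch for under_sampling, assuming pandas DataFrame inputs; the toy data and the neighbors value are illustrative only (the default of 200 assumes a much larger training set):

import pandas as pd

xTrain = pd.DataFrame({'f1': [0, 1, 2, 3, 4, 5, 6, 7],
                       'f2': [1, 0, 1, 0, 1, 0, 1, 0]})
yTrain = pd.DataFrame({'label': [0, 0, 0, 0, 0, 0, 1, 1]})
xRes, yRes = under_sampling(xTrain, yTrain, neighbors=3)
print(len(xRes), len(yRes))  # majority rows may be edited out by AllKNN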
Example #16
'''
[(0, 64), (1, 262), (2, 4674)]
[(0, 64), (1, 213), (2, 4568)]
'''
'''
Building on this, the RepeatedEditedNearestNeighbours algorithm repeats the
basic EditedNearestNeighbours algorithm several times.
'''
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
renn = RepeatedEditedNearestNeighbours(random_state=0)
X_resampled, y_resampled = renn.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
#[(0, 64), (1, 208), (2, 4551)]
# Unlike RepeatedEditedNearestNeighbours, the AllKNN algorithm increases the number of nearest neighbours on each iteration.
from imblearn.under_sampling import AllKNN
allknn = AllKNN(random_state=0)
X_resampled, y_resampled = allknn.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
#[(0, 64), (1, 220), (2, 4601)]
# Condensed nearest neighbors and derived algorithms
'''
CondensedNearestNeighbour iterates with a 1-nearest-neighbour rule to decide
whether a sample should be kept or removed. The steps are:
set C: all the minority-class samples;
add one sample from the majority class (the class to be under-sampled) to C,
and put all the other samples of that class in set S;
train a 1-NN classifier on set C and use it to classify the samples in set S;
move the misclassified samples of S into C;
repeat until no more samples are added to C.
'''
from imblearn.under_sampling import CondensedNearestNeighbour
cnn = CondensedNearestNeighbour(random_state=0)
X_resampled, y_resampled = cnn.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
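# The exact counts depend on the data, but CondensedNearestNeighbour typically
# keeps far fewer majority samples than the cleaning rules shown above.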
Example #17
def sampling(algorithm, x_train, y_train):

    if (algorithm == 'standard'):

        print('\nUsing Standard Scaler.\n')

        scaler = StandardScaler().fit(x_train)
        X_resampled = scaler.transform(x_train)
        y_resampled = y_train

    elif(algorithm == 'undersampling'):
        # Instantiate a PCA object for the sake of easy visualisation
        pca = PCA(n_components=2)
        # Fit and transform x to visualise inside a 2D feature space
        X_vis = pca.fit_transform(x_train)

        print('\nUsing Random Under Sampling.\n')

        rus = RandomUnderSampler(return_indices=True)
        X_resampled, y_resampled, idx_resampled = rus.fit_sample(x_train, y_train)
        X_res_vis = pca.transform(X_resampled)

        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)

        idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                        idx_resampled)

        idx_class_0 = y_resampled == 0
        plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1],
                    alpha=.8, label='Class #0')
        plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1],
                    alpha=.8, label='Class #1')
        plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1],
                    alpha=.8, label='Removed samples')
        
        # make nice plotting
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.spines['left'].set_position(('outward', 10))
        ax.spines['bottom'].set_position(('outward', 10))
        ax.set_xlim([-6, 6])
        ax.set_ylim([-6, 6])

        plt.title('Under-sampling using random under-sampling')
        plt.legend()
        plt.tight_layout()
        plt.show()

    elif(algorithm == 'smote'):

        print('\nUsing SMOTE.\n')

        # Instantiate a PCA object for the sake of easy visualisation
        pca = PCA(n_components=2)
        # Fit and transform x to visualise inside a 2D feature space
        X_vis = pca.fit_transform(x_train)

        kinds = ['regular', 'borderline1', 'borderline2', 'svm']
        kind = [kinds[int(sys.argv[2])] if len(sys.argv) >= 3 else 'regular']
        print(kind)
        sm = [SMOTE(kind=k) for k in kind]
        X_resampled = []
        y_resampled = []
        X_res_vis = []
        for method in sm:
            X_res, y_res = method.fit_sample(x_train, y_train)
            X_resampled.append(X_res)
            y_resampled.append(y_res)
            X_res_vis.append(pca.transform(X_res))

        f, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2)
        ax2.axis('off')
        ax_res = [ax3, ax4, ax5, ax6]

        c0, c1 = plot_resampling(ax1, X_vis, y_train, 'Original set')
        for i in range(len(kind)):
            plot_resampling(ax_res[i], X_res_vis[i], y_resampled[i],
                            'SMOTE {}'.format(kind[i]))

        ax2.legend((c0, c1), ('Class #0', 'Class #1'), loc='center',
                ncol=1, labelspacing=0.)
        plt.tight_layout()
        plt.show()

    elif(algorithm=='neighbourhood'):

        print('\nUsing Neighbourhood Cleaning Rule.\n')

        pca = PCA(n_components=2)
        X_vis = pca.fit_transform(x_train)

        ncl = NeighbourhoodCleaningRule(return_indices=True)
        X_resampled, y_resampled, idx_resampled = ncl.fit_sample(x_train, y_train)
        X_res_vis = pca.transform(X_resampled)

        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)

        idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                        idx_resampled)

        idx_class_0 = y_resampled == 0
        plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1],
                    alpha=.8, label='Class #0')
        plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1],
                    alpha=.8, label='Class #1')
        plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1],
                    alpha=.8, label='Removed samples')

        # make nice plotting
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.spines['left'].set_position(('outward', 10))
        ax.spines['bottom'].set_position(('outward', 10))
        ax.set_xlim([-6, 6])
        ax.set_ylim([-6, 6])

        plt.title('Under-sampling using neighbourhood cleaning rule')
        plt.legend()
        plt.tight_layout()
        plt.show()

    elif(algorithm == 'ENN'):

        print('\nUsing ENN.\n')

        enn = EditedNearestNeighbours(return_indices=True)
        X_resampled, y_resampled, idx_resampled = enn.fit_sample(x_train, y_train)
        reduction_str = ('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) /
                                                        len(x_train))))
        print(reduction_str)

    elif(algorithm == 'RENN'):

        print('\nUsing RENN.\n')

        renn = RepeatedEditedNearestNeighbours(return_indices=True)
        X_resampled, y_resampled, idx_resampled = renn.fit_sample(x_train, y_train)
        reduction_str = ('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) /
                                                        len(x_train))))
        print(reduction_str)

    elif(algorithm == 'AllKNN'):

        print('\nUsing AllKNN.\n')

        allknn = AllKNN(return_indices=True)
        X_resampled, y_resampled, idx_resampled = allknn.fit_sample(x_train, y_train)
        reduction_str = ('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) /
                                                        len(x_train))))
        print(reduction_str)

    elif(algorithm == 'centroids'):

        print('\nUsing Cluster Centroids.\n')

        # Apply Cluster Centroids
        cc = ClusterCentroids()
        X_resampled, y_resampled = cc.fit_sample(x_train, y_train)

    elif(algorithm == 'centroidshard'):

        print('\nUsing Cluster Centroids with Hard Voting.\n')

        pca = PCA(n_components=2)
        X_vis = pca.fit_transform(x_train)
        # Apply Cluster Centroids
        cc = ClusterCentroids()
        X_resampled, y_resampled = cc.fit_sample(x_train, y_train)
        X_res_vis_soft = pca.transform(X_resampled)

        # Use hard voting instead of soft voting
        cc = ClusterCentroids(voting='hard')
        X_resampled, y_resampled = cc.fit_sample(x_train, y_train)
        X_res_vis_hard = pca.transform(X_resampled)

        # Two subplots, unpack the axes array immediately
        f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

        c0 = ax1.scatter(X_vis[y_train == 0, 0], X_vis[y_train == 0, 1], label="Class #0",
                        alpha=0.5)
        c1 = ax1.scatter(X_vis[y_train == 1, 0], X_vis[y_train == 1, 1], label="Class #1",
                        alpha=0.5)
        ax1.set_title('Original set')

        ax2.scatter(X_res_vis_soft[y_resampled == 0, 0],
                    X_res_vis_soft[y_resampled == 0, 1],
                    label="Class #0", alpha=.5)
        ax2.scatter(X_res_vis_soft[y_resampled == 1, 0],
                    X_res_vis_soft[y_resampled == 1, 1],
                    label="Class #1", alpha=.5)
        c2 = ax2.scatter(X_vis[y_train == 1, 0],
                         X_vis[y_train == 1, 1], label="Original #1",
                         alpha=0.2)
        ax2.set_title('Cluster centroids with soft voting')

        ax3.scatter(X_res_vis_hard[y_resampled == 0, 0],
                    X_res_vis_hard[y_resampled == 0, 1],
                    label="Class #0", alpha=.5)
        ax3.scatter(X_res_vis_hard[y_resampled == 1, 0],
                    X_res_vis_hard[y_resampled == 1, 1],
                    label="Class #1", alpha=.5)
        ax3.scatter(X_vis[y_train == 1, 0],
                    X_vis[y_train == 1, 1],
                    alpha=0.2)
        ax3.set_title('Cluster centroids with hard voting')

        # make nice plotting
        for ax in (ax1, ax2, ax3):
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)
            ax.get_xaxis().tick_bottom()
            ax.get_yaxis().tick_left()
            ax.spines['left'].set_position(('outward', 10))
            ax.spines['bottom'].set_position(('outward', 10))
            ax.set_xlim([-6, 8])
            ax.set_ylim([-6, 6])

        plt.figlegend((c0, c1, c2), ('Class #0', 'Class #1', 'Original Class #1'),
                      loc='lower center',
                      ncol=3, labelspacing=0.)
        plt.tight_layout(pad=3)
        plt.show()

    else:
        
        # Instantiate a PCA object for the sake of easy visualisation
        pca = PCA(n_components=2)
        # Fit and transform x to visualise inside a 2D feature space
        X_vis = pca.fit_transform(x_train)

        return x_train, y_train

    return X_resampled, y_resampled
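A usage sketch, assuming x_train and y_train are already loaded; the algorithm names are the literal strings matched in the branches above:

X_res, y_res = sampling('AllKNN', x_train, y_train)    # prints the achieved reduction
X_res, y_res = sampling('standard', x_train, y_train)  # scaling only, no resampling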
Example #18
        try:
            X_resampled, Y_resampled = enn.fit_sample(X, Y)
        except Exception as e:
            print(str(e))
            X_resampled, Y_resampled = X, Y
    elif index == 6:
        renn = RepeatedEditedNearestNeighbours(random_state=0)
        try:
            X_resampled, Y_resampled = renn.fit_sample(X, Y)
        except Exception as e:
            print(str(e))
            X_resampled, Y_resampled = X, Y
    elif index == 7:
        allknn = AllKNN(random_state=0)
        try:
            X_resampled, Y_resampled = allknn.fit_sample(X, Y)
        except Exception as e:
            print(str(e))
            X_resampled, Y_resampled = X, Y

    return X_resampled, Y_resampled


algo_list = ['dt', 'GaNB', 'linear_svc', 'logistic', 'nn', 'rf', 'svc']

X_list = []
Y_list = []

for algo in algo_list:
    username_val, X, Y = read_file(algo)
    X_list.append(X)
Example #19
def test_alknn_not_good_object():
    nn = 'rnd'
    allknn = AllKNN(n_neighbors=nn, kind_sel='mode')
    with raises(ValueError):
        allknn.fit_sample(X, Y)
Example #20
print('RENN')
renn = RepeatedEditedNearestNeighbours()
X_resampled, y_resampled = renn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)
print('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) / len(X))))

ax3.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax3.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax3.set_title('Repeated Edited nearest neighbours')

# Apply the AllKNN
print('AllKNN')
allknn = AllKNN()
X_resampled, y_resampled = allknn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)
print('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) / len(X))))

ax4.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax4.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax4.set_title('AllKNN')

plt.show()
Example #21
    def fit_sample(self, X, y):
        allknn = AllKNN()
        return allknn.fit_sample(X, y)
Example #22
def test_deprecation_random_state():
    allknn = AllKNN(random_state=0)
    with warns(DeprecationWarning,
               match="'random_state' is deprecated from 0.4"):
        allknn.fit_sample(X, Y)
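These nearest-neighbour cleaning rules are deterministic, which is why imbalanced-learn 0.4 deprecated their random_state parameter; passing it changes nothing except emitting the warning asserted above.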