Пример #1
0
def test_allknn_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    allknn = AllKNN(random_state=RND_SEED)
    X_resampled, y_resampled = allknn.fit_sample(X, Y)

    X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
                     [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
                     [1.02956816, 0.36061601], [1.12202806, 0.33811558],
                     [-1.10146139, 0.91782682], [0.73489726, 0.43915195],
                     [0.50307437, 0.498805], [0.84929742, 0.41042894],
                     [0.62649535, 0.46600596], [0.98382284, 0.37184502],
                     [0.69804044, 0.44810796], [0.04296502, -0.37981873],
                     [0.28294738, -1.00125525], [0.34218094, -0.58781961],
                     [0.2096964, -0.61814058], [1.59068979, -0.96622933],
                     [0.73418199, -0.02222847], [0.79270821, -0.41386668],
                     [1.16606871, -0.25641059], [1.0304995, -0.16955962],
                     [0.48921682, -1.38504507], [-0.03918551, -0.68540745],
                     [0.24991051, -1.00864997], [0.80541964, -0.34465185],
                     [0.1732627, -1.61323172]])
    y_gt = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_allclose(y_resampled, y_gt, rtol=R_TOL)
Пример #2
0
def test_allknn_fit_resample_with_nn_object():
    nn = NearestNeighbors(n_neighbors=4)
    allknn = AllKNN(n_neighbors=nn, kind_sel='mode')
    X_resampled, y_resampled = allknn.fit_resample(X, Y)

    X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
                     [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
                     [-0.12840393, 0.66446571], [1.02956816, 0.36061601],
                     [1.12202806, 0.33811558], [-0.35946678, 0.72510189],
                     [-1.10146139, 0.91782682], [0.73489726, 0.43915195],
                     [-0.28479268, 0.70459548], [0.50307437, 0.498805],
                     [0.84929742, 0.41042894], [0.62649535, 0.46600596],
                     [0.98382284, 0.37184502], [0.69804044, 0.44810796],
                     [1.32319756, -0.13181616], [0.04296502, -0.37981873],
                     [0.28294738, -1.00125525], [0.34218094, -0.58781961],
                     [0.2096964, -0.61814058], [1.59068979, -0.96622933],
                     [0.73418199, -0.02222847], [0.79270821, -0.41386668],
                     [1.16606871, -0.25641059], [1.0304995, -0.16955962],
                     [0.48921682, -1.38504507], [-0.03918551, -0.68540745],
                     [0.24991051, -1.00864997], [0.80541964, -0.34465185],
                     [0.1732627, -1.61323172]])
    y_gt = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2
    ])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    def fit(self, c_data, x_data, y_data):
        # this is to track evolution of the size of the training samples
        self.samplesize = []
        self.samplesize.append(len(x_data))

        if self.reject_by_calendar:
            mask = self.mask_cal(c_data, y_data)
            # filter rows rejected by this calendar criteria
            # not filtering them might improve second classifier training
            #x_data = normalize(x_data[mask])
            #y_data = y_data[mask]
            self.samplesize.append(len(x_data))

        if self.use_resampling:
            # undersample
            resampler = AllKNN()
            x_data, y_data = resampler.fit_sample(x_data, y_data)
            self.samplesize.append(len(x_data))

            # oversample
            resampler = SMOTEENN()
            x_data, y_data = resampler.fit_sample(x_data, y_data)
            self.samplesize.append(len(x_data))

        # train clf only with filtered and resampled data
        if self.use_weights:
            try:
                self.clf.fit(x_data, y_data, self.get_weights(y_data))
            except TypeError:
                print "The classifier selected does not admit weights for training samples"
                print "Switching to no weights"
                self.use_weights = False
                self.clf.fit(x_data, y_data)
        else:
            self.clf.fit(x_data, y_data)
Пример #4
0
def test_allknn_fit_resample_with_nn_object():
    nn = NearestNeighbors(n_neighbors=4)
    allknn = AllKNN(n_neighbors=nn, kind_sel='mode')
    X_resampled, y_resampled = allknn.fit_resample(X, Y)

    X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [
        -0.46226554, -0.50481004
    ], [-0.34474418, 0.21969797], [-0.12840393, 0.66446571], [
        1.02956816, 0.36061601
    ], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], [
        -1.10146139, 0.91782682
    ], [0.73489726, 0.43915195], [-0.28479268, 0.70459548], [
        0.50307437, 0.498805
    ], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [
        0.98382284, 0.37184502
    ], [0.69804044, 0.44810796], [1.32319756, -0.13181616], [
        0.04296502, -0.37981873
    ], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [
        0.2096964, -0.61814058
    ], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [
        0.79270821, -0.41386668
    ], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [
        0.48921682, -1.38504507
    ], [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
                     [0.80541964, -0.34465185], [0.1732627, -1.61323172]])
    y_gt = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2
    ])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Пример #5
0
def test_allknn_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    allknn = AllKNN(random_state=RND_SEED)
    X_resampled, y_resampled = allknn.fit_sample(X, Y)

    X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
                     [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
                     [1.02956816, 0.36061601], [1.12202806, 0.33811558],
                     [-1.10146139, 0.91782682], [0.73489726, 0.43915195],
                     [0.50307437, 0.498805], [0.84929742, 0.41042894],
                     [0.62649535, 0.46600596], [0.98382284, 0.37184502],
                     [0.69804044, 0.44810796], [0.04296502, -0.37981873],
                     [0.28294738, -1.00125525], [0.34218094, -0.58781961],
                     [0.2096964, -0.61814058], [1.59068979, -0.96622933],
                     [0.73418199, -0.02222847], [0.79270821, -0.41386668],
                     [1.16606871, -0.25641059], [1.0304995, -0.16955962],
                     [0.48921682, -1.38504507], [-0.03918551, -0.68540745],
                     [0.24991051, -1.00864997], [0.80541964, -0.34465185],
                     [0.1732627, -1.61323172]])
    y_gt = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2
    ])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_almost_equal(y_resampled, y_gt)
Пример #6
0
def test_allknn_fit_resample_with_indices():
    allknn = AllKNN(return_indices=True)
    X_resampled, y_resampled, idx_under = allknn.fit_resample(X, Y)

    X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [
        -0.46226554, -0.50481004
    ], [-0.34474418, 0.21969797], [1.02956816, 0.36061601], [
        1.12202806, 0.33811558
    ], [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [
        0.50307437, 0.498805
    ], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [
        0.98382284, 0.37184502
    ], [0.69804044, 0.44810796], [0.04296502, -0.37981873], [
        0.28294738, -1.00125525
    ], [0.34218094, -0.58781961], [0.2096964, -0.61814058], [
        1.59068979, -0.96622933
    ], [0.73418199, -0.02222847], [0.79270821, -0.41386668], [
        1.16606871, -0.25641059
    ], [1.0304995, -0.16955962], [0.48921682, -1.38504507],
                     [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
                     [0.80541964, -0.34465185], [0.1732627, -1.61323172]])
    y_gt = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2
    ])
    idx_gt = np.array([
        6, 13, 32, 39, 4, 5, 14, 16, 22, 23, 24, 30, 37, 2, 11, 12, 17, 20, 21,
        25, 26, 28, 31, 33, 34, 35, 36
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_allclose(y_resampled, y_gt, rtol=R_TOL)
    assert_allclose(idx_under, idx_gt, rtol=R_TOL)
Пример #7
0
def test_allknn_fit_resample_with_indices():
    allknn = AllKNN(return_indices=True)
    X_resampled, y_resampled, idx_under = allknn.fit_resample(X, Y)

    X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
                     [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
                     [1.02956816, 0.36061601], [1.12202806, 0.33811558],
                     [-1.10146139, 0.91782682], [0.73489726, 0.43915195],
                     [0.50307437, 0.498805], [0.84929742, 0.41042894],
                     [0.62649535, 0.46600596], [0.98382284, 0.37184502],
                     [0.69804044, 0.44810796], [0.04296502, -0.37981873],
                     [0.28294738, -1.00125525], [0.34218094, -0.58781961],
                     [0.2096964, -0.61814058], [1.59068979, -0.96622933],
                     [0.73418199, -0.02222847], [0.79270821, -0.41386668],
                     [1.16606871, -0.25641059], [1.0304995, -0.16955962],
                     [0.48921682, -1.38504507], [-0.03918551, -0.68540745],
                     [0.24991051, -1.00864997], [0.80541964, -0.34465185],
                     [0.1732627, -1.61323172]])
    y_gt = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2
    ])
    idx_gt = np.array([
        6, 13, 32, 39, 4, 5, 14, 16, 22, 23, 24, 30, 37, 2, 11, 12, 17, 20, 21,
        25, 26, 28, 31, 33, 34, 35, 36
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_allclose(y_resampled, y_gt, rtol=R_TOL)
    assert_allclose(idx_under, idx_gt, rtol=R_TOL)
Пример #8
0
def test_allknn_sample_wrong_X():
    """Test either if an error is raised when X is different at fitting
    and sampling"""

    # Create the object
    allknn = AllKNN(random_state=RND_SEED)
    allknn.fit(X, Y)
    assert_raises(RuntimeError, allknn.sample,
                  np.random.random((100, 40)), np.array([0] * 50 + [1] * 50))
Пример #9
0
def test_allknn_sample_wrong_X():
    """Test either if an error is raised when X is different at fitting
    and sampling"""

    # Create the object
    allknn = AllKNN(random_state=RND_SEED)
    allknn.fit(X, Y)
    assert_raises(RuntimeError, allknn.sample, np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
Пример #10
0
 def resample(self):
     """
     Resampling data usinf AllKNN and SMOTE
     """
     print("Sampling data...")
     # Under Sampling
     allknn = AllKNN(sampling_strategy={28: 565})
     self.X, self.y = allknn.fit_resample(self.X, self.y)
     #Over Sampling
     smote = SMOTE(ratio="all")
     self.X, self.y = smote.fit_resample(self.X, self.y)
Пример #11
0
def all_KNN(X, Y):
    from imblearn.under_sampling import AllKNN
    allknn = AllKNN()
    allknn.fit_resample(X, Y)
    indexes = allknn.sample_indices_
    nobj = len(Y)
    mask = np.zeros(nobj, dtype=int)
    for i in range(nobj):
        if i in indexes:
            mask[i] = 1
    return True, mask
Пример #12
0
def test_allknn_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    allknn = AllKNN(random_state=RND_SEED)
    X_resampled, y_resampled = allknn.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'allknn_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'allknn_y.npy'))
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_almost_equal(y_resampled, y_gt)
def test_allknn_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    allknn = AllKNN(random_state=RND_SEED)
    X_resampled, y_resampled = allknn.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'allknn_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'allknn_y.npy'))
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_almost_equal(y_resampled, y_gt)
def test_allknn_fit_sample_mode():
    """Test the fit sample routine using the mode as selection"""

    # Resample the data
    allknn = AllKNN(random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = allknn.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'allknn_x_mode.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'allknn_y_mode.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_almost_equal(y_resampled, y_gt)
Пример #15
0
def all_KNN(X, Y):
    from imblearn.under_sampling import AllKNN
    allknn = AllKNN()
    allknn.fit_resample(X, Y)
    indexes = allknn.sample_indices_
    mask = []
    for i in range(len(X)):
        if i in indexes:
            mask.append(1)
        else:
            mask.append(0)
    return True, np.asarray(mask)
Пример #16
0
def test_allknn_fit_sample_mode():
    """Test the fit sample routine using the mode as selection"""

    # Resample the data
    allknn = AllKNN(random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = allknn.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'allknn_x_mode.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'allknn_y_mode.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_almost_equal(y_resampled, y_gt)
def test_allknn_fit():
    """Test the fitting method"""

    # Create the object
    allknn = AllKNN(random_state=RND_SEED)
    # Fit the data
    allknn.fit(X, Y)

    # Check if the data information have been computed
    assert_equal(allknn.min_c_, 0)
    assert_equal(allknn.maj_c_, 1)
    assert_equal(allknn.stats_c_[0], 500)
    assert_equal(allknn.stats_c_[1], 4500)
Пример #18
0
def aiiknn(X,
           y,
           visualize=False,
           pca2d=True,
           pca3d=True,
           tsne=True,
           pie_evr=True):
    allknn = AllKNN()
    X_res, y_res = allknn.fit_resample(X, y)
    if visualize == True:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
Пример #19
0
def test_allknn_fit():
    """Test the fitting method"""

    # Create the object
    allknn = AllKNN(random_state=RND_SEED)
    # Fit the data
    allknn.fit(X, Y)

    # Check if the data information have been computed
    assert_equal(allknn.min_c_, 0)
    assert_equal(allknn.maj_c_, 1)
    assert_equal(allknn.stats_c_[0], 500)
    assert_equal(allknn.stats_c_[1], 4500)
def test_allknn_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    allknn = AllKNN(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = allknn.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'allknn_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'allknn_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'allknn_idx.npy'))
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_almost_equal(y_resampled, y_gt)
    assert_array_almost_equal(idx_under, idx_gt)
Пример #21
0
def test_allknn_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    allknn = AllKNN(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = allknn.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'allknn_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'allknn_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'allknn_idx.npy'))
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_almost_equal(y_resampled, y_gt)
    assert_array_almost_equal(idx_under, idx_gt)
    def fit(self, X, y):
        # Preparação dos argumentos para os métodos da biblioteca ``scikit-learn``
        #Xlinha = X[self.columns]
        #ylinha = y
        allknn = AllKNN()

        #allknn = AllKNN()
        X_resampled, y_resampled = allknn.fit_resample(X, y)
        X_train, X_test, y_train, y_test = train_test_split(X_resampled,
                                                            y_resampled,
                                                            test_size=0.2,
                                                            random_state=1)
        model = XGBClassifier()
        return model
def test_multiclass_fit_sample():
    """Test fit sample method with multiclass target"""

    # Make y to be multiclass
    y = Y.copy()
    y[0:1000] = 2

    # Resample the data
    ann = AllKNN(random_state=RND_SEED)
    X_resampled, y_resampled = ann.fit_sample(X, y)

    # Check the size of y
    count_y_res = Counter(y_resampled)
    assert_equal(count_y_res[0], 341)
    assert_equal(count_y_res[1], 2485)
    assert_equal(count_y_res[2], 212)
Пример #24
0
 def getsampler(self, type):
     if type == 'none':
         sampler = NoSampler()
     elif type == 'randomunder':
         sampler = RandomUnderSampler()
     elif type == 'nearmiss':
         sampler = NearMiss()
     elif type == 'allknn':
         sampler = AllKNN()
     elif type == 'condensednn':
         sampler = CondensedNearestNeighbour()
     elif type == 'editednn':
         sampler = EditedNearestNeighbours()
     elif type == 'repeatededitednn':
         sampler = RepeatedEditedNearestNeighbours()
     elif type == 'tomeklinks':
         sampler = TomekLinks()
     elif type == 'randomover':
         sampler = RandomOverSampler()
     elif type == 'smote':
         sampler = SMOTE()
     elif type == 'adasyn':
         sampler = ADASYN()
     elif type == 'smotenc':
         sampler = SMOTENC()
     elif type == 'quality':  # and self.quality_model_selection_type == 'extended':
         sampler = QualitySampler(self.n_init)
     else:
         print("Unsupported sampler %s" % type)
         exit(1)
     if type != 'none' and type != 'quality' and 'random_state' in sampler.get_params(
     ).keys():
         sampler.set_params(random_state=self.random_state)
     return sampler
Пример #25
0
def test_multiclass_fit_sample():
    """Test fit sample method with multiclass target"""

    # Make y to be multiclass
    y = Y.copy()
    y[0:1000] = 2

    # Resample the data
    ann = AllKNN(random_state=RND_SEED)
    X_resampled, y_resampled = ann.fit_sample(X, y)

    # Check the size of y
    count_y_res = Counter(y_resampled)
    assert_equal(count_y_res[0], 341)
    assert_equal(count_y_res[1], 2485)
    assert_equal(count_y_res[2], 212)
Пример #26
0
def under_sample(X, y, sampler="RandomUnderSampler"):
    # list of all samplers, in case you want to iterate all of them
    samplers_list = ['RandomUnderSampler', 'ClusterCentroids', 'NearMiss', 'InstanceHardnessThreshold',
                     'CondensedNearestNeighbour', 'EditedNearestNeighbours', 'RepeatedEditedNearestNeighbours',
                     'AllKNN', 'NeighbourhoodCleaningRule', 'OneSidedSelection']
    print(samplers_list)

    # currently there is no parameters sampler
    # this dict is used to choose a resampler by user. default is random
    samplers = {
        "RandomUnderSampler": RandomUnderSampler(),
        "ClusterCentroids": ClusterCentroids(),
        "NearMiss": NearMiss(),
        "InstanceHardnessThreshold": InstanceHardnessThreshold(),
        "CondensedNearestNeighbour": CondensedNearestNeighbour(),
        "EditedNearestNeighbours": EditedNearestNeighbours(),
        "RepeatedEditedNearestNeighbours": RepeatedEditedNearestNeighbours(),
        "AllKNN": AllKNN(),
        "NeighbourhoodCleaningRule": NeighbourhoodCleaningRule(),
        "OneSidedSelection": OneSidedSelection(),
    }
    sampler = samplers[sampler]

    # plot y class count before and after resample
    print("before", sorted(Counter(y).items()))

    # to resample simply call fit_resample method of sampler
    X_resampled, y_resampled = sampler.fit_resample(X, y)

    print("after", sorted(Counter(y_resampled).items()))

    print('===' * 4, 'under_sample finished')

    return X_resampled, y_resampled
Пример #27
0
class ResamplingAlgorithms(Enum):
    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', (InstanceHardnessThreshold(random_state=1)))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        filtered_algos = filter(lambda ra: ra.value[0] == name,
                                ResamplingAlgorithms)
        return next(filtered_algos, ResamplingAlgorithms.RO)
Пример #28
0
def under_sampling(X, y, method):
    if method == 'ClusterCentroids':
        model = ClusterCentroids()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'RandomUnderSampler':
        model = RandomUnderSampler()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'NearMiss':
        model = NearMiss()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'EditedNearestNeighbours':
        model = EditedNearestNeighbours()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'RepeatedEditedNearestNeighbours':
        model = RepeatedEditedNearestNeighbours()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'AllKNN':
        model = AllKNN()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'NeighbourhoodCleaningRule':
        model = NeighbourhoodCleaningRule()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'OneSidedSelection':
        model = OneSidedSelection()
        X_resampled, y_resampled = model.fit_resample(X, y)
    return X_resampled, y_resampled
Пример #29
0
def sampler(name, ratio, random_state=0, return_indices=True, **kwargs):
    if name == "rus":
        sampler = RandomUnderSampler(
            ratio=ratio,
            return_indices=return_indices,
            random_state=random_state,
            **kwargs,
        )
    elif name == "nm":
        sampler = NearMiss(
            ratio=ratio,
            return_indices=return_indices,
            random_state=random_state,
            **kwargs,
        )
    elif name == "enn":
        sampler = EditedNearestNeighbours(return_indices=return_indices,
                                          random_state=random_state,
                                          **kwargs)
    elif name == "renn":
        sampler = RepeatedEditedNearestNeighbours(
            return_indices=return_indices, random_state=random_state, **kwargs)
    elif name == "allknn":
        sampler = AllKNN(return_indices=return_indices,
                         random_state=random_state,
                         **kwargs)
    elif name == "tl":
        sampler = TomekLinks(return_indices=return_indices,
                             random_state=random_state,
                             **kwargs)
    else:
        raise ValueError
    return sampler
Пример #30
0
def test_allknn_sample_wt_fit():
    """Test either if an error is raised when sample is called before
    fitting"""

    # Create the object
    allknn = AllKNN(random_state=RND_SEED)
    assert_raises(RuntimeError, allknn.sample, X, Y)
Пример #31
0
def test_allknn_init():
    # Define a ratio
    allknn = AllKNN(random_state=RND_SEED)

    assert_equal(allknn.n_neighbors, 3)
    assert_equal(allknn.kind_sel, 'all')
    assert_equal(allknn.n_jobs, -1)
    assert_equal(allknn.random_state, RND_SEED)
Пример #32
0
def test_continuous_error():
    """Test either if an error is raised when the target are continuous
    type"""

    # continuous case
    y = np.linspace(0, 1, 40)
    ann = AllKNN(random_state=RND_SEED)
    assert_warns(UserWarning, ann.fit, X, y)
Пример #33
0
def Resampling(train_x, train_y, resampling_method):
    train_y.data = LabelEncoder().fit_transform(train_y.data)
    # summarize distribution

    # scommentare la riga di seguito se si vuole visualizzare il grafico a torta della distribuzione delle classi prima di resampling
    #plotGraphics.piePlot(train_y, "Before Resampling")

    # ---- UNDER-SAMPLING ------ #
    if resampling_method == "ClusterCentroids":
        resample = ClusterCentroids(voting='hard', random_state=42)

    if resampling_method == "CondensedNearestNeighbour":
        resample = CondensedNearestNeighbour(n_neighbors=7, random_state=42)

    if resampling_method == "EditedNearestNeighbours":
        resample = EditedNearestNeighbours(n_neighbors=7,
                                           kind_sel='mode',
                                           n_jobs=-1)

    if resampling_method == "RepeatedEditedNearestNeighbours":
        resample = RepeatedEditedNearestNeighbours(n_neighbors=7,
                                                   kind_sel='mode',
                                                   n_jobs=-1)

    if resampling_method == "AllKNN":
        resample = AllKNN(n_neighbors=7,
                          kind_sel='mode',
                          allow_minority=True,
                          n_jobs=-1)

    if resampling_method == "NearMiss":
        resample = NearMiss(n_neighbors=7, n_jobs=-1)

    if resampling_method == "NeighbourhoodCleaningRule":
        resample = NeighbourhoodCleaningRule(n_neighbors=7, kind_sel='all')

    if resampling_method == "RandomUnderSampler":
        resample = RandomUnderSampler(random_state=42)

    if resampling_method == "TomekLinks":
        resample = TomekLinks(n_jobs=-1)

    # ---- OVER-SAMPLING ------ #
    if resampling_method == "BorderlineSMOTE":
        resample = BorderlineSMOTE(random_state=42, n_jobs=-1)

    if resampling_method == "KMeansSMOTE":
        resample = KMeansSMOTE(random_state=42)

    if resampling_method == "RandomUnderSampler":
        resample = RandomOverSampler(random_state=42)

    if resampling_method == "SMOTE":
        resample = SMOTE(random_state=42, n_jobs=-1)

    # transform the dataset
    train_x.data, train_y.data = resample.fit_resample(train_x.data,
                                                       train_y.data)
Пример #34
0
def test_allknn_fit_single_class():
    """Test either if an error when there is a single class"""

    # Create the object
    allknn = AllKNN(random_state=RND_SEED)
    # Resample the data
    # Create a wrong y
    y_single_class = np.zeros((X.shape[0], ))
    assert_warns(UserWarning, allknn.fit, X, y_single_class)
Пример #35
0
def test_all_knn_allow_minority():
    X, y = make_classification(
        n_samples=10000,
        n_features=2,
        n_informative=2,
        n_redundant=0,
        n_repeated=0,
        n_classes=3,
        n_clusters_per_class=1,
        weights=[0.2, 0.3, 0.5],
        class_sep=0.4,
        random_state=0)

    allknn = AllKNN(allow_minority=True)
    X_res_1, y_res_1 = allknn.fit_resample(X, y)
    allknn = AllKNN()
    X_res_2, y_res_2 = allknn.fit_resample(X, y)
    assert len(y_res_1) < len(y_res_2)
Пример #36
0
def test_allknn_init():
    """Test the initialisation of the object"""

    # Define a ratio
    allknn = AllKNN(random_state=RND_SEED)

    assert_equal(allknn.n_neighbors, 3)
    assert_equal(allknn.kind_sel, 'all')
    assert_equal(allknn.n_jobs, -1)
    assert_equal(allknn.random_state, RND_SEED)
Пример #37
0
    def resample(self, X, y, by, random_state=None, visualize=False):
        '''
        by: String
            The method used to perform re-sampling
            currently support: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS',
                'NM', 'CC', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN', 'SMOTETomek',
                'ORG']
        '''
        if by == 'RUS':
            sampler = RandomUnderSampler(random_state=random_state)
        elif by == 'CNN':
            sampler = CondensedNearestNeighbour(random_state=random_state)
        elif by == 'ENN':
            sampler = EditedNearestNeighbours(random_state=random_state)
        elif by == 'NCR':
            sampler = NeighbourhoodCleaningRule(random_state=random_state)
        elif by == 'Tomek':
            sampler = TomekLinks(random_state=random_state)
        elif by == 'ALLKNN':
            sampler = AllKNN(random_state=random_state)
        elif by == 'OSS':
            sampler = OneSidedSelection(random_state=random_state)
        elif by == 'NM':
            sampler = NearMiss(random_state=random_state)
        elif by == 'CC':
            sampler = ClusterCentroids(random_state=random_state)
        elif by == 'SMOTE':
            sampler = SMOTE(random_state=random_state)
        elif by == 'ADASYN':
            sampler = ADASYN(random_state=random_state)
        elif by == 'BorderSMOTE':
            sampler = BorderlineSMOTE(random_state=random_state)
        elif by == 'SMOTEENN':
            sampler = SMOTEENN(random_state=random_state)
        elif by == 'SMOTETomek':
            sampler = SMOTETomek(random_state=random_state)
        elif by == 'ORG':
            sampler = None
        else:
            raise Error('Unexpected \'by\' type {}'.format(by))

        if by != 'ORG':
            X_train, y_train = sampler.fit_resample(X, y)
        else:
            X_train, y_train = X, y
        if visualize:
            df = pd.DataFrame(X_train)
            df['label'] = y_train
            df.plot.scatter(x=0,
                            y=1,
                            c='label',
                            s=3,
                            colormap='coolwarm',
                            title='{} training set'.format(by))
        return X_train, y_train
Пример #38
0
    def __init__(self, name):
        self.strategie = None
        self.name = name

        if name == "enn":
            self.strategie = EditedNearestNeighbours(sampling_strategy='auto',
                                                     n_neighbors=3,
                                                     kind_sel='all',
                                                     n_jobs=-1)
        elif name == "allknn":
            self.strategie = AllKNN(sampling_strategy='auto',
                                    n_neighbors=3,
                                    kind_sel='all',
                                    allow_minority=False,
                                    n_jobs=-1)
        elif name == "renn":
            self.strategie = RepeatedEditedNearestNeighbours(
                sampling_strategy='auto',
                n_neighbors=3,
                max_iter=100,
                kind_sel='all',
                n_jobs=-1)

        elif name == "tomek":
            self.strategie = TomekLinks(sampling_strategy='auto', n_jobs=-1)

        elif name == "smote":
            self.strategie = SMOTE(sampling_strategy='auto',
                                   k_neighbors=5,
                                   n_jobs=-1,
                                   random_state=42)

        elif name == "bdsmote":
            self.strategie = BorderlineSMOTE(random_state=42, n_jobs=-1)

        elif name == "adasyn":
            self.strategie = ADASYN(sampling_strategy='auto',
                                    n_neighbors=5,
                                    n_jobs=-1,
                                    random_state=42)

        elif name == "smoteenn":
            self.strategie = SMOTEENN(sampling_strategy='auto',
                                      smote=None,
                                      enn=None,
                                      n_jobs=-1,
                                      random_state=42)

        elif name == "smotetomek":
            self.strategie = SMOTETomek(sampling_strategy='auto',
                                        smote=None,
                                        tomek=None,
                                        n_jobs=-1,
                                        random_state=42)
def Allknn_us(X_train,
              Y_train,
              seed,
              sampling_strategy,
              n_neighbors=3,
              kind_sel='all'):
    knn = AllKNN(random_state=seed,
                 n_jobs=-1,
                 n_neighbors=n_neighbors,
                 kind_sel=kind_sel,
                 sampling_strategy=sampling_strategy)
    print('Before Allknn undersampling : ', sorted(Counter(Y_train).items()))
    X_train_resampled, Y_train_resampled = knn.fit_resample(X_train, Y_train)
    print('After Allknn undersampling : ',
          sorted(Counter(Y_train_resampled).items()))

    X_train_resampled, Y_train_resampled = shuffle_dataset(
        X_train_resampled, Y_train_resampled, seed)

    return X_train_resampled, Y_train_resampled
Пример #40
0
def undersampling(type):
    if type == 'random':
        und = RandomUnderSampler(ratio='majority', random_state=42)
    elif type == 'knn':
        und = AllKNN(ratio='majority', random_state=42, n_jobs=4)
    elif type == 'centroids':
        und = ClusterCentroids(ratio='majority', n_jobs=-1)
    x, y = und.fit_sample(train, label)
    x = pd.DataFrame(x, columns=train.columns.values)
    y = pd.DataFrame(y, columns=['is_attributed'])

    return x, y
Пример #41
0
def test_alknn_not_good_object():
    nn = 'rnd'
    allknn = AllKNN(n_neighbors=nn, kind_sel='mode')
    with raises(ValueError):
        allknn.fit_resample(X, Y)
Пример #42
0
def test_deprecation_random_state():
    allknn = AllKNN(random_state=0)
    with warns(
            DeprecationWarning, match="'random_state' is deprecated from 0.4"):
        allknn.fit_resample(X, Y)
Пример #43
0
print('RENN')
renn = RepeatedEditedNearestNeighbours()
X_resampled, y_resampled = renn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)
print('Reduced {:.2f}\%'.format(100 * (1 - float(len(X_resampled))/ len(X))))

ax3.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax3.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax3.set_title('Repeated Edited nearest neighbours')

# Apply the AllKNN
print('AllKNN')
allknn = AllKNN()
X_resampled, y_resampled = allknn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)
print('Reduced {:.2f}\%'.format(100 * (1 - float(len(X_resampled))/ len(X))))

ax4.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax4.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax4.set_title('AllKNN')

plt.show()