def test_ncr_sample_wrong_X():
    """Test either if an error is raised when X is different at fitting
    and sampling"""

    # Create the object
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    ncr.fit(X, Y)
    assert_raises(RuntimeError, ncr.sample,
                  np.random.random((100, 40)), np.array([0] * 50 + [1] * 50))
def test_ncr_init():
    """Test the initialisation of the object"""

    # Create the object with default parameters
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)

    assert_equal(ncr.n_neighbors, 3)
    assert_equal(ncr.n_jobs, 1)
    assert_equal(ncr.random_state, RND_SEED)
Example #3
def under_sampling(df, title):
    features, output_label = split_data(df)
    ncr = NeighbourhoodCleaningRule()
    X_undersampled, y_undersampled = ncr.fit_resample(features, output_label)
    df_full = pd.concat([
        pd.DataFrame(X_undersampled, columns=features.columns),
        pd.DataFrame(y_undersampled, columns=output_label.columns)
    ], axis=1)
    return df_full
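
# A hedged usage sketch for the helper above (not part of the original example).
# split_data is not shown in the snippet, so a hypothetical stand-in is defined here;
# the toy DataFrame with a 'target' column is purely illustrative, and the
# pd.DataFrame wrapping in the helper assumes a release whose fit_resample returns
# plain NumPy arrays.
import pandas as pd
from sklearn.datasets import make_classification

def split_data(df):
    # Hypothetical stand-in: features = all columns except 'target',
    # labels = the 'target' column as a one-column DataFrame.
    return df.drop(columns=['target']), df[['target']]

X_arr, y_arr = make_classification(n_samples=300, weights=[0.9, 0.1], random_state=0)
df_toy = pd.DataFrame(X_arr, columns=['f{}'.format(i) for i in range(X_arr.shape[1])])
df_toy['target'] = y_arr
df_balanced = under_sampling(df_toy, title='toy example')  # title is unused by the helper
print(df_balanced['target'].value_counts())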
Example #4
    def resample(self, X, y, by, random_state=None, visualize=False):
        '''
        by: String
            The method used to perform re-sampling
            currently supported: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS',
                'NM', 'CC', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN', 'SMOTETomek',
                'ORG']
        '''
        if by == 'RUS':
            sampler = RandomUnderSampler(random_state=random_state)
        elif by == 'CNN':
            sampler = CondensedNearestNeighbour(random_state=random_state)
        elif by == 'ENN':
            sampler = EditedNearestNeighbours(random_state=random_state)
        elif by == 'NCR':
            sampler = NeighbourhoodCleaningRule(random_state=random_state)
        elif by == 'Tomek':
            sampler = TomekLinks(random_state=random_state)
        elif by == 'ALLKNN':
            sampler = AllKNN(random_state=random_state)
        elif by == 'OSS':
            sampler = OneSidedSelection(random_state=random_state)
        elif by == 'NM':
            sampler = NearMiss(random_state=random_state)
        elif by == 'CC':
            sampler = ClusterCentroids(random_state=random_state)
        elif by == 'SMOTE':
            sampler = SMOTE(random_state=random_state)
        elif by == 'ADASYN':
            sampler = ADASYN(random_state=random_state)
        elif by == 'BorderSMOTE':
            sampler = BorderlineSMOTE(random_state=random_state)
        elif by == 'SMOTEENN':
            sampler = SMOTEENN(random_state=random_state)
        elif by == 'SMOTETomek':
            sampler = SMOTETomek(random_state=random_state)
        elif by == 'ORG':
            sampler = None
        else:
            raise ValueError("Unexpected 'by' type {}".format(by))

        if by != 'ORG':
            X_train, y_train = sampler.fit_resample(X, y)
        else:
            X_train, y_train = X, y
        if visualize:
            df = pd.DataFrame(X_train)
            df['label'] = y_train
            df.plot.scatter(x=0,
                            y=1,
                            c='label',
                            s=3,
                            colormap='coolwarm',
                            title='{} training set'.format(by))
        return X_train, y_train
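
# For context, a self-contained sketch (not from the original class) of what the
# 'NCR' branch above amounts to. NeighbourhoodCleaningRule is deterministic, so the
# random_state argument has no effect on its output; the dataset is illustrative.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import NeighbourhoodCleaningRule

X_demo, y_demo = make_classification(n_samples=500, n_features=2, n_informative=2,
                                     n_redundant=0, weights=[0.9, 0.1], random_state=42)
X_bal, y_bal = NeighbourhoodCleaningRule().fit_resample(X_demo, y_demo)
print('before:', Counter(y_demo), 'after:', Counter(y_bal))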
Example #5
def test_ncr_fit_sample():
    # Resample the data
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = ncr.fit_sample(X, Y)

    X_gt = np.array([[-1.20809175, -1.49917302], [-0.60497017, -0.66630228],
                     [-0.91735824, 0.93110278], [0.35967591, 2.61186964],
                     [-1.55581933, 1.09609604], [1.55157493, -1.6981518]])
    y_gt = np.array([0, 0, 1, 2, 1, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #6
def test_ncr_fit_sample_mode():
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = ncr.fit_sample(X, Y)

    X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278],
                     [-0.20413357, 0.64628718], [0.35967591, 2.61186964],
                     [0.90701028, -0.57636928], [-1.20809175, -1.49917302],
                     [-0.60497017, -0.66630228], [1.39272351, -0.51631728],
                     [-1.55581933, 1.09609604], [1.55157493, -1.6981518]])
    y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_ncr_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = ncr.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'ncr_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'ncr_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_ncr_fit():
    """Test the fitting method"""

    # Create the object
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    # Fit the data
    ncr.fit(X, Y)

    # Check if the data information have been computed
    assert_equal(ncr.min_c_, 0)
    assert_equal(ncr.maj_c_, 1)
    assert_equal(ncr.stats_c_[0], 500)
    assert_equal(ncr.stats_c_[1], 4500)
Example #9
def ncrReSample():
    raw_train, raw_test = splitTrainTest(datapath)
    img_data, y = getFullImgFeature(raw_train)
    print('Original dataset shape %s' % Counter(y))
    ncr = NeighbourhoodCleaningRule()
    X_res, y_res = ncr.fit_resample(img_data, y)
    print('Resampled dataset shape %s' % Counter(y_res))
    trainset = np.append(X_res, np.reshape(y_res, (-1, 1)), axis=1)

    testX, testy = getFullImgFeature(raw_test)
    testset = np.append(testX, np.reshape(testy, (-1, 1)), axis=1)

    return trainset, testset
Example #10
def get_under_sample_models():
    models, names = list(), list()
    models.append(TomekLinks())
    names.append('TomekLinks')
    models.append(EditedNearestNeighbours())
    names.append('EditedNearestNeighbors')
    models.append(RepeatedEditedNearestNeighbours())
    names.append('RENN')
    models.append(OneSidedSelection())
    names.append('OneSidedSelection')
    models.append(NeighbourhoodCleaningRule())
    names.append('NCR')
    return models, names
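
# A hedged usage sketch (not in the original snippet): evaluate each under-sampler
# in front of a simple classifier on an illustrative synthetic dataset. It assumes
# the sampler imports used by get_under_sample_models() are available.
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline

X_syn, y_syn = make_classification(n_samples=1000, weights=[0.95, 0.05], random_state=1)
models, names = get_under_sample_models()
for model, name in zip(models, names):
    pipeline = Pipeline(steps=[('s', model), ('m', DecisionTreeClassifier())])
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X_syn, y_syn, scoring='f1', cv=cv, n_jobs=-1)
    print('%s: mean F1 = %.3f' % (name, mean(scores)))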
def test_ncr_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    ncr = NeighbourhoodCleaningRule(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'ncr_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'ncr_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'ncr_idx.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #12
def test_ncr_fit_sample_with_indices():
    ncr = NeighbourhoodCleaningRule(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y)

    X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278],
                     [-0.20413357, 0.64628718], [0.35967591, 2.61186964],
                     [0.90701028, -0.57636928], [-1.20809175, -1.49917302],
                     [-0.60497017, -0.66630228], [1.39272351, -0.51631728],
                     [-1.55581933, 1.09609604], [1.55157493, -1.6981518]])
    y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    idx_gt = np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_ncr_init():
    """Test the initialisation of the object"""

    # Create the object
    verbose = True
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED, verbose=verbose)

    assert_equal(ncr.size_ngh, 3)
    assert_equal(ncr.n_jobs, -1)
    assert_equal(ncr.random_state, RND_SEED)
    assert_equal(ncr.verbose, verbose)
    assert_equal(ncr.min_c_, None)
    assert_equal(ncr.maj_c_, None)
    assert_equal(ncr.stats_c_, {})
Example #14
    def resample(self, X, y, by, random_state=None):
        '''
        by: String
            The method used to perform re-sampling
            currently supported: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS',
                'NM', 'CC', 'ROS', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN',
                'SMOTETomek', 'ORG']
        '''
        if by == 'RUS':
            sampler = RandomUnderSampler(random_state=random_state)
        elif by == 'CNN':
            sampler = CondensedNearestNeighbour(random_state=random_state)
        elif by == 'ENN':
            sampler = EditedNearestNeighbours(random_state=random_state)
        elif by == 'NCR':
            sampler = NeighbourhoodCleaningRule(random_state=random_state)
        elif by == 'Tomek':
            sampler = TomekLinks(random_state=random_state)
        elif by == 'ALLKNN':
            sampler = AllKNN(random_state=random_state)
        elif by == 'OSS':
            sampler = OneSidedSelection(random_state=random_state)
        elif by == 'NM':
            sampler = NearMiss(random_state=random_state)
        elif by == 'CC':
            sampler = ClusterCentroids(random_state=random_state)
        elif by == 'ROS':
            sampler = RandomOverSampler(random_state=random_state)
        elif by == 'SMOTE':
            sampler = SMOTE(random_state=random_state)
        elif by == 'ADASYN':
            sampler = ADASYN(random_state=random_state)
        elif by == 'BorderSMOTE':
            sampler = BorderlineSMOTE(random_state=random_state)
        elif by == 'SMOTEENN':
            sampler = SMOTEENN(random_state=random_state)
        elif by == 'SMOTETomek':
            sampler = SMOTETomek(random_state=random_state)
        elif by == 'ORG':
            sampler = None
        else:
            raise ValueError("Unexpected 'by' type {}".format(by))

        if by != 'ORG':
            X_train, y_train = sampler.fit_resample(X, y)
        else:
            X_train, y_train = X, y

        return X_train, y_train
Example #15
def test_ncr_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    ncr = NeighbourhoodCleaningRule(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y)

    X_gt = np.array([[-1.20809175, -1.49917302], [-0.60497017, -0.66630228],
                     [-0.91735824, 0.93110278], [-0.20413357, 0.64628718],
                     [0.35967591, 2.61186964], [-1.55581933, 1.09609604],
                     [1.55157493, -1.6981518]])
    y_gt = np.array([0, 0, 1, 1, 2, 1, 2])
    idx_gt = np.array([10, 11, 3, 5, 7, 13, 14])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #16
def test_multiclass_fit_sample():
    """Test fit sample method with multiclass target"""

    # Make y to be multiclass
    y = Y.copy()
    y[0:1000] = 2

    # Resample the data
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = ncr.fit_sample(X, y)

    # Check the size of y
    count_y_res = Counter(y_resampled)
    assert_equal(count_y_res[0], 400)
    assert_equal(count_y_res[1], 2268)
    assert_equal(count_y_res[2], 42)
Example #17
def test_ncr_fit_sample_nn_obj():
    # Resample the data
    nn = NearestNeighbors(n_neighbors=3)
    ncr = NeighbourhoodCleaningRule(return_indices=True,
                                    random_state=RND_SEED,
                                    n_neighbors=nn)
    X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y)

    X_gt = np.array([[-1.20809175, -1.49917302], [-0.60497017, -0.66630228],
                     [-0.91735824, 0.93110278], [0.35967591, 2.61186964],
                     [-1.55581933, 1.09609604], [1.55157493, -1.6981518]])
    y_gt = np.array([0, 0, 1, 2, 1, 2])
    idx_gt = np.array([10, 11, 3, 7, 13, 14])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #18
def UnderSample(X, Y, method='Random', random_state=42):
    if X.size == len(X):
        X = X.reshape(-1, 1)
    if method == 'Cluster':  # defaults to a k-means estimator
        sampler = ClusterCentroids(ratio='auto',
                                   random_state=random_state,
                                   estimator=None)
    elif method == 'Random':
        sampler = RandomUnderSampler(ratio='auto',
                                     random_state=random_state,
                                     replacement=False)
    elif method == 'NearMiss_1':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=1)
    elif method == 'NearMiss_2':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=2)
    elif method == 'NearMiss_3':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=3)
    elif method == 'TomekLinks':
        sampler = TomekLinks(ratio='auto', random_state=random_state)
    elif method == 'ENN':  # kind_sel may be 'all' or 'mode'
        sampler = EditedNearestNeighbours(ratio='auto',
                                          random_state=random_state,
                                          kind_sel='all')
    elif method == 'RENN':  # kind_sel may be 'all' or 'mode'
        sampler = RepeatedEditedNearestNeighbours(ratio='auto',
                                                  random_state=random_state,
                                                  kind_sel='all')
    elif method == 'All_KNN':
        sampler = AllKNN(ratio='auto',
                         random_state=random_state,
                         kind_sel='all')
    elif method == 'CNN':
        sampler = CondensedNearestNeighbour(ratio='auto',
                                            random_state=random_state)
    elif method == 'One_SS':
        sampler = OneSidedSelection(ratio='auto', random_state=random_state)
    elif method == 'NCR':
        sampler = NeighbourhoodCleaningRule(ratio='auto',
                                            random_state=random_state,
                                            kind_sel='all',
                                            threshold_cleaning=0.5)
    elif method == 'IHT':
        sampler = InstanceHardnessThreshold(estimator=None,
                                            ratio='auto',
                                            random_state=random_state)
    else:
        raise ValueError('Unknown under-sampling method: {}'.format(method))
    X_resampled, Y_resampled = sampler.fit_sample(X, Y)
    return X_resampled, Y_resampled
def equalize_training_dataset_with_NClearningRule(x_train, y_train):
    from imblearn.under_sampling import NeighbourhoodCleaningRule

    old_shape = list(x_train.shape)
    # reshape before applying the over/under-sampling method
    x_tmp = np.reshape(x_train, (x_train.shape[0], -1))
    # NeighbourhoodCleaningRule is a cleaning method: it only removes samples, so it
    # takes the list of classes to clean rather than per-class target counts
    x_resampled, y_resampled = NeighbourhoodCleaningRule(
        sampling_strategy=list(range(43)),
        n_neighbors=5,
        n_jobs=8).fit_resample(x_tmp, y_train)
    print(sorted(Counter(y_resampled).items()))
    # reshape back after applying the over/under-sampling method
    old_shape[0] = x_resampled.shape[0]
    x_resampled = np.reshape(x_resampled, tuple(old_shape))

    return x_resampled, y_resampled
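
# A hedged usage sketch (not in the original snippet): the helper above assumes 43
# classes, so the random stand-in "images" below are purely illustrative.
import numpy as np
from collections import Counter

x_demo = np.random.rand(43 * 60, 8, 8, 3).astype(np.float32)
y_demo = np.repeat(np.arange(43), 60)
x_bal, y_bal = equalize_training_dataset_with_NClearningRule(x_demo, y_demo)
print(x_bal.shape, len(Counter(y_bal)))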
Example #20
def UnderSample(df, _class, method = 'cc', strategy = 'auto', n_jobs = 1, ratio = None, transform = None, offline = None):
    """
       NearMiss - Select values which are closest to minority class.
       TomeLinks - uses connected sets between class borders which are closest. If there are no other points closer, it assumes they are noise or borderline and remove them.
       ENN - Edited Nearest Neighbors, remove instances from majorit which are near bordeline
       NCL - NeighborhoodCleaningRule - Uses ENN to remove majority samples. Finds Nearest neighbors and if all are correctly label it keeps them.
       CC - Cluster Centroids - Finds Clusters of Majority Samples with K-means, then keeps cluster centroids of the clusters as the new majority sample.   
    """
    #https://towardsdatascience.com/sampling-techniques-for-extremely-imbalanced-data-part-i-under-sampling-a8dbc3d8d6d8

    Y = df[_class]
    X = df.drop(_class, axis = 1)

    if method.lower() == 'nearmiss':
        x, y = NearMiss(sampling_strategy = strategy, n_jobs = n_jobs).fit_resample(X, Y)
    elif method.lower() == 'tomeklinks':
        x, y = TomekLinks(sampling_strategy = strategy, n_jobs = n_jobs).fit_resample(X, Y)
    elif method.lower() == 'ncl':
        x, y = NeighbourhoodCleaningRule(sampling_strategy = strategy, n_jobs = n_jobs).fit_resample(X, Y)
    elif method.lower() == 'cc':
        x, y = ClusterCentroids(sampling_strategy = strategy).fit_resample(X, Y)
    else:
        raise Exception("{} is not a valid method for UnderSample".format(method))

    df = pd.DataFrame(x, columns = X.columns).reset_index(drop = True)
    df[_class] = list(y)

    fig = go.Figure()

    fig.add_trace(
    
        go.Splom(
            dimensions = [
                dict(label = column, values = df[column]) for column in df.columns
            ], 
            marker = dict(
                color = df[_class]
            )
        )
    )

    fig.show()
    
    if transform:
        return df
    
    return
Example #21
def under_sampling_algs():
    algs = list()
    algs.append(("No Rs Undersampling case", "No Re-sampling"))
    algs.append((RandomUnderSampler(random_state=1), 'RU'))
    algs.append((ClusterCentroids(random_state=1), 'CC'))
    algs.append((TomekLinks(), 'TL'))
    algs.append((NearMiss(version=1), 'NM1'))
    algs.append((NearMiss(version=2), 'NM2'))
    algs.append((NearMiss(version=3), 'NM3'))
    algs.append((CondensedNearestNeighbour(random_state=1), 'CNN'))
    algs.append((OneSidedSelection(random_state=1), 'OSS'))
    algs.append((EditedNearestNeighbours(), 'ENN'))
    algs.append((NeighbourhoodCleaningRule(), 'NCL'))
    algs.append((InstanceHardnessThreshold(random_state=1), 'IHT'))
    algs.append((RepeatedEditedNearestNeighbours(), 'RENN'))
    algs.append((AllKNN(), 'AllKNN'))
    return algs
def get_models():
    models, names = list(), list()
    # TL
    models.append(TomekLinks())
    names.append('TL')
    # ENN
    models.append(EditedNearestNeighbours())
    names.append('ENN')
    # RENN
    models.append(RepeatedEditedNearestNeighbours())
    names.append('RENN')
    # OSS
    models.append(OneSidedSelection())
    names.append('OSS')
    # NCR
    models.append(NeighbourhoodCleaningRule())
    names.append('NCR')
    return models, names
def get_samplers():
    samplers = {
        # Under-samplers
        'RandomUn': RandomUnderSampler(),
        'TL': TomekLinks(),
        # 'ENN': EditedNearestNeighbours(),
        'RENN': RepeatedEditedNearestNeighbours(),
        'OSS': OneSidedSelection(),
        'NCR': NeighbourhoodCleaningRule(),
        'IHT': InstanceHardnessThreshold(),
        # Over-Samplers
        'RandomOv': RandomOverSampler(),
        'SMOTE': SMOTE(),
        'SMOTESVM': SVMSMOTE(),
        # 'SMOTEKMeans': KMeansSMOTE(),
        'ADASYN': ADASYN(),
        # Combined Under and Over Samplers
        'SMOTEENN': SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='majority')),
        'SMOTETomek': SMOTETomek(tomek=TomekLinks(sampling_strategy='majority')),
    }
    return samplers
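
# A hedged usage sketch (not in the original snippet): plug one of the samplers into
# an imblearn pipeline so that resampling is only applied to the training folds.
# The dataset and classifier are illustrative.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import make_pipeline

X_syn, y_syn = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
samplers = get_samplers()
pipe = make_pipeline(samplers['NCR'], LogisticRegression(max_iter=1000))
print(cross_val_score(pipe, X_syn, y_syn, scoring='balanced_accuracy', cv=5).mean())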
Example #24
def resample_data(predictors, target, df_data, method):
    """
    This function resamples training datasets prior to training models.
    """
    if method=='adasyn':
        util = ADASYN()
    elif method=='random-over-sampler':
        util = RandomOverSampler()
    elif method=='smote':
        util = SMOTE(kind='borderline2')
    elif method=='smote-tomek':
        util = SMOTETomek()
    elif method=='smote-enn':
        util = SMOTEENN()
    elif method=='edited-nn':
        util = EditedNearestNeighbours()
    elif method=='repeated-edited-nn':
        util = RepeatedEditedNearestNeighbours()
    elif method=='all-knn':
        util = AllKNN()
    elif method=='one-sided-selection':
        util = OneSidedSelection()
    elif method=='cluster-centroids':
        util = ClusterCentroids()
    elif method=='random-under-sampler':
        util = RandomUnderSampler()
    elif method=='neighbourhood-cleaning-rule':
        util = NeighbourhoodCleaningRule()
    elif method=='condensed-nearest-neighbour':
        util = CondensedNearestNeighbour()
    elif method=='near-miss':
        util = NearMiss(version=1)
    elif method=='instance-hardness-threshold':
        util = InstanceHardnessThreshold()
    else:
        raise ValueError('Unsupported resampling method: {}'.format(method))

    x_resampled, y_resampled = util.fit_sample(df_data[predictors], df_data[target])
    x_resampled = pd.DataFrame(x_resampled, columns=predictors)
    y_resampled = pd.DataFrame(y_resampled, columns=[target])
    return x_resampled, y_resampled
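
# A hedged usage sketch (not in the original snippet): resample_data relies on the
# legacy fit_sample API, so this assumes an older imbalanced-learn release. The
# DataFrame and its 'label' column are illustrative.
import pandas as pd
from sklearn.datasets import make_classification

X_arr, y_arr = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
predictors = ['x{}'.format(i) for i in range(X_arr.shape[1])]
df_demo = pd.DataFrame(X_arr, columns=predictors)
df_demo['label'] = y_arr
x_res, y_res = resample_data(predictors, 'label', df_demo, method='neighbourhood-cleaning-rule')
print(y_res['label'].value_counts())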
Example #25
def create_sampler(sampler_name, random_state=None):

    if sampler_name is None or sampler_name == 'None':
        return None
    if sampler_name.lower() == 'randomundersampler':
        return RandomUnderSampler(random_state=random_state)
    if sampler_name.lower() == 'tomeklinks':
        return TomekLinks(random_state=random_state)
    if sampler_name.lower() == 'enn':
        return EditedNearestNeighbours(random_state=random_state)
    if sampler_name.lower() == 'ncl':
        return NeighbourhoodCleaningRule(random_state=random_state)
    if sampler_name.lower() == 'randomoversampler':
        return RandomOverSampler(random_state=random_state)
    if sampler_name.lower() == 'smote':
        return SMOTE(random_state=random_state)
    if sampler_name.lower() == 'smotetomek':
        return SMOTETomek(random_state=random_state)
    if sampler_name.lower() == 'smoteenn':
        return SMOTEENN(random_state=random_state)
    else:
        raise ValueError('Unsupported value \'%s\' for sampler' % sampler_name)
Example #26
    def __init__(self):
        from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE, BorderlineSMOTE, RandomOverSampler
        from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler, InstanceHardnessThreshold, NearMiss, \
            TomekLinks, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, OneSidedSelection, \
            CondensedNearestNeighbour, NeighbourhoodCleaningRule
        from imblearn.ensemble import EasyEnsemble, EasyEnsembleClassifier, BalancedBaggingClassifier, \
            BalancedRandomForestClassifier, BalanceCascade, RUSBoostClassifier

        self.oversamplers = {
            'ADASYN': ADASYN(),
            'RandomOverSampler': RandomOverSampler(),
            'SMOTE': SMOTE(),
            'BorderlineSMOTE': BorderlineSMOTE(),
            'SVMSMOTE': SVMSMOTE()
        }
        self.undersamplers = {
            'ClusterCentroids': ClusterCentroids(),
            'RandomUnderSampler': RandomUnderSampler(),
            'InstanceHardnessThreshold': InstanceHardnessThreshold(),
            'NearMiss': NearMiss(),
            'TomekLinks': TomekLinks(),
            'EditedNearestNeighbours': EditedNearestNeighbours(),
            'RepeatedEditedNearestNeighbours':
            RepeatedEditedNearestNeighbours(),
            'AllKNN': AllKNN(),
            'OneSidedSelection': OneSidedSelection(),
            'CondensedNearestNeighbour': CondensedNearestNeighbour(),
            'NeighbourhoodCleaningRule': NeighbourhoodCleaningRule()
        }
        self.ensemblesamplers = {
            'EasyEnsemble': EasyEnsemble(),
            'EasyEnsembleClassifier': EasyEnsembleClassifier(),
            'BalancedBaggingClassifier': BalancedBaggingClassifier(),
            'BalanceCascade': BalanceCascade(),
            'BalancedRandomForestClassifier': BalancedRandomForestClassifier(),
            'RUSBoostClassifier': RUSBoostClassifier()
        }
Example #27
# remove the samples considered noisy. The ``NeighbourhoodCleaningRule`` uses an
# ``EditedNearestNeighbours`` to remove some samples. In addition, it uses a
# 3 nearest-neighbors rule to remove samples which do not agree with this rule.

fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3,
                                                         2,
                                                         figsize=(15, 25))
X, y = create_dataset(n_samples=500, weights=(0.2, 0.3, 0.5), class_sep=0.8)

ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6))
for ax, sampler in zip(
        ax_arr,
    (
        CondensedNearestNeighbour(random_state=0),
        OneSidedSelection(random_state=0),
        NeighbourhoodCleaningRule(),
    ),
):
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_decision_function(X, y, clf, ax[0])
    ax[0].set_title(f"Decision function for {sampler.__class__.__name__}")
    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title(f"Resampling using {sampler.__class__.__name__}")
fig.tight_layout()

###############################################################################
# ``InstanceHardnessThreshold`` uses the predictions of a classifier to exclude
# samples. All samples which are classified with a low probability will be
# removed.
Example #28
cnn = CondensedNearestNeighbour(random_state=0)
X_resampled, y_resampled = cnn.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
# Clearly, the CondensedNearestNeighbour method is sensitive to noise and tends to add noisy samples to the set C.
# Therefore, OneSidedSelection uses the TomekLinks method to remove the noisy (majority-class) samples.
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection(random_state=0)
X_resampled, y_resampled = oss.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

'''
The NeighbourhoodCleaningRule algorithm focuses on cleaning the data rather than
condensing it. It therefore removes the union of the samples rejected by
EditedNearestNeighbours and by a 3-NN classifier.
'''
from imblearn.under_sampling import NeighbourhoodCleaningRule
ncr = NeighbourhoodCleaningRule(random_state=0)
X_resampled, y_resampled = ncr.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

# InstanceHardnessThreshold is a rather special method: it applies a classifier to the data and removes the samples whose predicted probability falls below a threshold.
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import InstanceHardnessThreshold
iht = InstanceHardnessThreshold(random_state=0,
                                estimator=LogisticRegression())
X_resampled, y_resampled = iht.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
#[(0, 64), (1, 64), (2, 64)]


'''
Combining over-sampling and under-sampling
'''
def test_ncr_error(ncr_params, err_msg):
    ncr = NeighbourhoodCleaningRule(**ncr_params)
    with pytest.raises(ValueError, match=err_msg):
        ncr.fit_resample(X, Y)
def test_deprecation_random_state():
    ncr = NeighbourhoodCleaningRule(random_state=0)
    with warns(DeprecationWarning,
               match="'random_state' is deprecated from 0.4"):
        ncr.fit_resample(X, Y)