def neighbourhood_cleaning_rule(feature_list_of_all_instances, class_list_of_all_instances, neighbours):
    # Apply the neighbourhood cleaning rule
    c1 = 0
    c2 = 0
    count = 0
    for label in class_list_of_all_instances:
        if label == 1:
            c1 += 1
        elif label == 0:
            c2 += 1
        else:
            count += 1

    print("     Data of class 1:", c1, ", data of class 0:", c2, ", other classes:", count)

    ncl = NeighbourhoodCleaningRule(n_neighbors=neighbours, n_jobs=4)
    X_resampled, y_resampled = ncl.fit_sample(feature_list_of_all_instances, class_list_of_all_instances)
    print("     Cleaned", len(feature_list_of_all_instances) - len(X_resampled), "points", end='')

    c1 = 0
    c2 = 0
    for label in y_resampled:
        if label == 1:
            c1 += 1
        elif label == 0:
            c2 += 1

    print(" and data of class 1:", c1, ", data of class 0:", c2, "for", neighbours, "neighbours")

    return X_resampled, y_resampled  # typical call: features, labels = neighbourhood_cleaning_rule(features, labels, neighbours)
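A minimal usage sketch for the helper above, on synthetic data; the toy dataset and the 5-neighbour setting are illustrative assumptions, and `fit_sample` inside the helper implies an older imbalanced-learn release (newer releases use `fit_resample`):

from collections import Counter

from imblearn.under_sampling import NeighbourhoodCleaningRule
from sklearn.datasets import make_classification

# Illustrative imbalanced toy data (not from the original project).
X_toy, y_toy = make_classification(n_classes=2, weights=[0.1, 0.9],
                                   n_samples=500, random_state=0)
print("before:", Counter(y_toy))

# Clean noisy majority-class samples using 5 neighbours.
X_clean, y_clean = neighbourhood_cleaning_rule(X_toy, y_toy, neighbours=5)
print("after:", Counter(y_clean))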
Example #2
def test_ncr_wrong_nn_obj():
    nn = 'rnd'
    ncr = NeighbourhoodCleaningRule(return_indices=True,
                                    random_state=RND_SEED,
                                    n_neighbors=nn)
    with raises(ValueError, match="has to be one of"):
        ncr.fit_sample(X, Y)
Example #3
def test_ncr_sample_wrong_X():
    """Test either if an error is raised when X is different at fitting
    and sampling"""

    # Create the object
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    ncr.fit(X, Y)
    assert_raises(RuntimeError, ncr.sample,
                  np.random.random((100, 40)), np.array([0] * 50 + [1] * 50))
Example #4
def under_sampling(df, title):
    features, output_label = split_data(df)
    ncr = NeighbourhoodCleaningRule()
    X_undersampled, y_undersampled = ncr.fit_resample(features, output_label)
    df_full = pd.concat(
        [
            pd.DataFrame(X_undersampled, columns=features.columns),
            pd.DataFrame(y_undersampled, columns=output_label.columns),
        ],
        axis=1,
    )
    return df_full
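A rough usage sketch for the DataFrame helper above; `split_data` belongs to the original project, so it is assumed here to split a frame into a feature DataFrame and a one-column label DataFrame, and the column names below are hypothetical:

import pandas as pd
from sklearn.datasets import make_classification

# Build an illustrative frame with a binary "label" column.
X_arr, y_arr = make_classification(n_classes=2, weights=[0.2, 0.8],
                                   n_samples=300, random_state=0)
df = pd.DataFrame(X_arr, columns=[f"f{i}" for i in range(X_arr.shape[1])])
df["label"] = y_arr

balanced_df = under_sampling(df, title="demo")
print(balanced_df["label"].value_counts())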
Example #5
def test_ncr_fit_sample():
    # Resample the data
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = ncr.fit_sample(X, Y)

    X_gt = np.array([[-1.20809175, -1.49917302], [-0.60497017, -0.66630228],
                     [-0.91735824, 0.93110278], [0.35967591, 2.61186964],
                     [-1.55581933, 1.09609604], [1.55157493, -1.6981518]])
    y_gt = np.array([0, 0, 1, 2, 1, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_ncr_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = ncr.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'ncr_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'ncr_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #8
def test_ncr_fit_sample_mode():
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = ncr.fit_sample(X, Y)

    X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278],
                     [-0.20413357, 0.64628718], [0.35967591, 2.61186964],
                     [0.90701028, -0.57636928], [-1.20809175, -1.49917302],
                     [-0.60497017, -0.66630228], [1.39272351, -0.51631728],
                     [-1.55581933, 1.09609604], [1.55157493, -1.6981518]])
    y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_ncr_fit():
    """Test the fitting method"""

    # Create the object
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    # Fit the data
    ncr.fit(X, Y)

    # Check if the data information have been computed
    assert_equal(ncr.min_c_, 0)
    assert_equal(ncr.maj_c_, 1)
    assert_equal(ncr.stats_c_[0], 500)
    assert_equal(ncr.stats_c_[1], 4500)
def test_ncr_fit_resample_mode():
    ncr = NeighbourhoodCleaningRule(kind_sel='mode')
    X_resampled, y_resampled = ncr.fit_resample(X, Y)

    X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278],
                     [-0.20413357, 0.64628718], [0.35967591, 2.61186964],
                     [0.90701028, -0.57636928], [-1.20809175, -1.49917302],
                     [-0.60497017, -0.66630228], [1.39272351, -0.51631728],
                     [-1.55581933, 1.09609604], [1.55157493, -1.6981518]])
    y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_ncr_error():
    threshold_cleaning = -10
    assert_raises_regex(
        ValueError, "'threshold_cleaning' is a value between"
        " 0 and 1.",
        NeighbourhoodCleaningRule(
            threshold_cleaning=threshold_cleaning).fit_sample, X, Y)
    threshold_cleaning = 10
    assert_raises_regex(
        ValueError, "'threshold_cleaning' is a value between"
        " 0 and 1.",
        NeighbourhoodCleaningRule(
            threshold_cleaning=threshold_cleaning).fit_sample, X, Y)
Example #13
def ncrReSample():
    raw_train, raw_test = splitTrainTest(datapath)
    img_data, y = getFullImgFeature(raw_train)
    print('Original dataset shape %s' % Counter(y))
    ncr = NeighbourhoodCleaningRule()
    X_res, y_res = ncr.fit_resample(img_data, y)
    print('Resampled dataset shape %s' % Counter(y_res))
    # append the labels as a final column (reshape to 2-D so np.append can concatenate along axis 1)
    trainset = np.append(X_res, np.asarray(y_res).reshape(-1, 1), axis=1)

    testX, testy = getFullImgFeature(raw_test)
    testset = np.append(testX, np.asarray(testy).reshape(-1, 1), axis=1)

    return trainset, testset
Example #14
def test_ncr_error():
    threshold_cleaning = -10
    with raises(ValueError,
                match=("'threshold_cleaning' is a value between"
                       " 0 and 1")):
        NeighbourhoodCleaningRule(
            threshold_cleaning=threshold_cleaning).fit_sample(X, Y)
    threshold_cleaning = 10
    with raises(ValueError,
                match=("'threshold_cleaning' is a value between"
                       " 0 and 1")):
        NeighbourhoodCleaningRule(
            threshold_cleaning=threshold_cleaning).fit_sample(X, Y)
def test_ncr_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    ncr = NeighbourhoodCleaningRule(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'ncr_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'ncr_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'ncr_idx.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #16
def test_ncr_fit_sample_with_indices():
    ncr = NeighbourhoodCleaningRule(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y)

    X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278],
                     [-0.20413357, 0.64628718], [0.35967591, 2.61186964],
                     [0.90701028, -0.57636928], [-1.20809175, -1.49917302],
                     [-0.60497017, -0.66630228], [1.39272351, -0.51631728],
                     [-1.55581933, 1.09609604], [1.55157493, -1.6981518]])
    y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    idx_gt = np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_ncr_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = ncr.fit_sample(X, Y)

    X_gt = np.array([[-1.20809175, -1.49917302], [-0.60497017, -0.66630228],
                     [-0.91735824, 0.93110278], [-0.20413357, 0.64628718],
                     [0.35967591, 2.61186964], [-1.55581933, 1.09609604],
                     [1.55157493, -1.6981518]])
    y_gt = np.array([0, 0, 1, 1, 2, 1, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_ncr_fit_resample_with_indices():
    ncr = NeighbourhoodCleaningRule(return_indices=True)
    X_resampled, y_resampled, idx_under = ncr.fit_resample(X, Y)

    X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278],
                     [-0.20413357, 0.64628718], [0.35967591, 2.61186964],
                     [0.90701028, -0.57636928], [-1.20809175, -1.49917302],
                     [-0.60497017, -0.66630228], [1.39272351, -0.51631728],
                     [-1.55581933, 1.09609604], [1.55157493, -1.6981518]])
    y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    idx_gt = np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #20
def __init__(self,
             estimators,
             estimators_bag,
             estimators_ada,
             n_jobs=1,
             function_compare='precision_tp_fp',
             n_folds=3,
             n_estimators=100):
    self.estimators = estimators
    self.named_estimators = dict(estimators)
    self.n_jobs = n_jobs
    self.groups = []
    self.g_mean = [-1, -1]
    self.function_compare = function_compare
    self.clfs = []
    self.n_folds = n_folds
    self.max_g = [-1, -1, -1]
    self.clf_id = [-1, -1, -1]
    self.n_estimators = n_estimators
    self.meta_clf_ = MLPClassifier(solver='lbfgs', random_state=1)
    self.clfs_ensemble = []
    self.estimators_bag = estimators_bag
    self.estimators_ada = estimators_ada
    self.random_st = 5
    self.methods = [
        SMOTE(k_neighbors=3, random_state=self.random_st),
        NeighbourhoodCleaningRule(n_neighbors=3,
                                  random_state=self.random_st)
    ]
    self.methoda = [0, 1]
    self.name_met = ["ADASYN", "NCR"]
    self.ensemble_ = []
Example #21
def under_sample(X, y, sampler="RandomUnderSampler"):
    # list of all samplers, in case you want to iterate all of them
    samplers_list = ['RandomUnderSampler', 'ClusterCentroids', 'NearMiss', 'InstanceHardnessThreshold',
                     'CondensedNearestNeighbour', 'EditedNearestNeighbours', 'RepeatedEditedNearestNeighbours',
                     'AllKNN', 'NeighbourhoodCleaningRule', 'OneSidedSelection']
    print(samplers_list)

    # the samplers are currently instantiated with their default parameters
    # this dict lets the caller choose a resampler by name; the default is RandomUnderSampler
    samplers = {
        "RandomUnderSampler": RandomUnderSampler(),
        "ClusterCentroids": ClusterCentroids(),
        "NearMiss": NearMiss(),
        "InstanceHardnessThreshold": InstanceHardnessThreshold(),
        "CondensedNearestNeighbour": CondensedNearestNeighbour(),
        "EditedNearestNeighbours": EditedNearestNeighbours(),
        "RepeatedEditedNearestNeighbours": RepeatedEditedNearestNeighbours(),
        "AllKNN": AllKNN(),
        "NeighbourhoodCleaningRule": NeighbourhoodCleaningRule(),
        "OneSidedSelection": OneSidedSelection(),
    }
    sampler = samplers[sampler]

    # plot y class count before and after resample
    print("before", sorted(Counter(y).items()))

    # to resample simply call fit_resample method of sampler
    X_resampled, y_resampled = sampler.fit_resample(X, y)

    print("after", sorted(Counter(y_resampled).items()))

    print('===' * 4, 'under_sample finished')

    return X_resampled, y_resampled
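For reference, a minimal sketch of driving the helper above on synthetic data; the dataset and the chosen sampler names are illustrative assumptions, not part of the original project:

from sklearn.datasets import make_classification

# Illustrative imbalanced data (10% minority class).
X_demo, y_demo = make_classification(n_classes=2, weights=[0.1, 0.9],
                                     n_samples=1000, random_state=42)

# Default random under-sampling, then the neighbourhood cleaning rule.
X_rus, y_rus = under_sample(X_demo, y_demo)
X_ncr, y_ncr = under_sample(X_demo, y_demo, sampler="NeighbourhoodCleaningRule")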
Example #22
class ResamplingAlgorithms(Enum):
    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', (InstanceHardnessThreshold(random_state=1)))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        filtered_algos = filter(lambda ra: ra.value[0] == name,
                                ResamplingAlgorithms)
        return next(filtered_algos, ResamplingAlgorithms.RO)
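A brief lookup sketch for the enum above; `get_algorithm_by_name` matches on the display name stored in `value[0]` and falls back to random over-sampling when nothing matches:

# Look up the neighbourhood cleaning rule by its display name.
name, sampler = ResamplingAlgorithms.get_algorithm_by_name("NCL").value
print(name)           # "NCL"
# sampler is a ready-to-use NeighbourhoodCleaningRule instance, e.g.:
# X_res, y_res = sampler.fit_resample(X, y)

# An unknown name falls back to ResamplingAlgorithms.RO.
fallback = ResamplingAlgorithms.get_algorithm_by_name("does-not-exist")
print(fallback.name)  # "RO"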
Example #23
def under_sampling(X, y, method):
    if method == 'ClusterCentroids':
        model = ClusterCentroids()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'RandomUnderSampler':
        model = RandomUnderSampler()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'NearMiss':
        model = NearMiss()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'EditedNearestNeighbours':
        model = EditedNearestNeighbours()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'RepeatedEditedNearestNeighbours':
        model = RepeatedEditedNearestNeighbours()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'AllKNN':
        model = AllKNN()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'NeighbourhoodCleaningRule':
        model = NeighbourhoodCleaningRule()
        X_resampled, y_resampled = model.fit_resample(X, y)
    elif method == 'OneSidedSelection':
        model = OneSidedSelection()
        X_resampled, y_resampled = model.fit_resample(X, y)
    return X_resampled, y_resampled
Example #24
def test_ncr_init():
    # Define a ratio
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)

    assert_equal(ncr.n_neighbors, 3)
    assert_equal(ncr.n_jobs, 1)
    assert_equal(ncr.random_state, RND_SEED)
Example #25
def test_ncr_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    ncr = NeighbourhoodCleaningRule(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y)

    X_gt = np.array([[-1.20809175, -1.49917302], [-0.60497017, -0.66630228],
                     [-0.91735824, 0.93110278], [-0.20413357, 0.64628718],
                     [0.35967591, 2.61186964], [-1.55581933, 1.09609604],
                     [1.55157493, -1.6981518]])
    y_gt = np.array([0, 0, 1, 1, 2, 1, 2])
    idx_gt = np.array([10, 11, 3, 5, 7, 13, 14])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #26
def test_ncr_fit_sample_nn_obj():
    # Resample the data
    nn = NearestNeighbors(n_neighbors=3)
    ncr = NeighbourhoodCleaningRule(return_indices=True,
                                    random_state=RND_SEED,
                                    n_neighbors=nn)
    X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y)

    X_gt = np.array([[-1.20809175, -1.49917302], [-0.60497017, -0.66630228],
                     [-0.91735824, 0.93110278], [0.35967591, 2.61186964],
                     [-1.55581933, 1.09609604], [1.55157493, -1.6981518]])
    y_gt = np.array([0, 0, 1, 2, 1, 2])
    idx_gt = np.array([10, 11, 3, 7, 13, 14])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #27
def test_ncr_wrong_nn_obj():
    # Resample the data
    nn = 'rnd'
    ncr = NeighbourhoodCleaningRule(return_indices=True,
                                    random_state=RND_SEED,
                                    n_neighbors=nn)
    assert_raises_regex(ValueError, "has to be one of", ncr.fit_sample, X, Y)
Example #28
def test_multiclass_fit_sample():
    """Test fit sample method with multiclass target"""

    # Make y to be multiclass
    y = Y.copy()
    y[0:1000] = 2

    # Resample the data
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = ncr.fit_sample(X, y)

    # Check the size of y
    count_y_res = Counter(y_resampled)
    assert_equal(count_y_res[0], 400)
    assert_equal(count_y_res[1], 2268)
    assert_equal(count_y_res[2], 42)
Example #29
def test_ncr_sample_wt_fit():
    """Test either if an error is raised when sample is called before
    fitting"""

    # Create the object
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    assert_raises(RuntimeError, ncr.sample, X, Y)
Example #30
def make_clf(usx, usy, clf, clf_name, sampling, normalize=False):
    '''
    Trains and tests the classifier clf using 10-fold cross-validation.
    If the normalize flag is True, the data are normalised first.
    The sampling parameter selects the type of resampling applied to each training fold.
    '''
    print('----------{} with {}----------'.format(clf_name, sampling))
    totalTP, totalFP, totalFN, totalTN = 0, 0, 0, 0
    plot_ind = randint(0, 9)
    j = 0
    skf = StratifiedKFold(n_splits=10, shuffle=True)
    for train_index, test_index in skf.split(usx, usy):
        x_train, x_test = usx[train_index], usx[test_index]
        y_train, y_test = usy[train_index], usy[test_index]

        if sampling == 'SMOTE':
            x_train, y_train = SMOTE(sampling_strategy=0.3).fit_resample(x_train, y_train)
        elif sampling == 'ADASYN':
            x_train, y_train = ADASYN(sampling_strategy=0.3).fit_resample(x_train, y_train)
        elif sampling == 'ENN':
            x_train, y_train = EditedNearestNeighbours().fit_resample(x_train, y_train)
        elif sampling == 'Tomek':
            x_train, y_train = TomekLinks().fit_resample(x_train, y_train)
        elif sampling == 'SMOTETomek':
            x_train, y_train = SMOTETomek(sampling_strategy=0.3).fit_resample(x_train, y_train)
        elif sampling == 'SMOTEENN':
            x_train, y_train = SMOTEENN(sampling_strategy=0.3).fit_resample(x_train, y_train)
        elif sampling == 'NCR':
            x_train, y_train = NeighbourhoodCleaningRule().fit_resample(x_train, y_train)
        elif sampling == 'OSS':
            x_train, y_train = OneSidedSelection().fit_resample(x_train, y_train)

        if normalize:
            scaler = StandardScaler().fit(x_train)
            x_train = scaler.transform(x_train)
            x_test = scaler.transform(x_test)

        clf.fit(x_train, y_train)

        # if plot_ind == j and clf_name == 'DecisionTreeClassifier':
        #     plot_decision_tree(clf)

        y_predict = clf.predict(x_test)

        for i in range(len(y_predict)):
            if y_test[i] and y_predict[i]:
                totalTP += 1
            if not y_test[i] and y_predict[i]:
                totalFP += 1
            if y_test[i] and not y_predict[i]:
                totalFN += 1
            if not y_test[i] and not y_predict[i]:
                totalTN += 1
        j += 1

    print('TOTAL TP: ' + str(totalTP))
    print('TOTAL FP: ' + str(totalFP))
    print('TOTAL FN: ' + str(totalFN))
    print('TOTAL TN: ' + str(totalTN))
Example #31
def test_ncr_wrong_nn_obj():
    """Test either if an error is raised with wrong NN object"""

    # Resample the data
    nn = 'rnd'
    ncr = NeighbourhoodCleaningRule(
        return_indices=True, random_state=RND_SEED, n_neighbors=nn)
    assert_raises(ValueError, ncr.fit_sample, X, Y)
Example #32
def test_continuous_error():
    """Test either if an error is raised when the target are continuous
    type"""

    # continuous case
    y = np.linspace(0, 1, 15)
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    assert_warns(UserWarning, ncr.fit, X, y)
Example #33
def Resampling(train_x, train_y, resampling_method):
    train_y.data = LabelEncoder().fit_transform(train_y.data)
    # summarize distribution

    # uncomment the following line to display a pie chart of the class distribution before resampling
    # plotGraphics.piePlot(train_y, "Before Resampling")

    # ---- UNDER-SAMPLING ------ #
    if resampling_method == "ClusterCentroids":
        resample = ClusterCentroids(voting='hard', random_state=42)

    if resampling_method == "CondensedNearestNeighbour":
        resample = CondensedNearestNeighbour(n_neighbors=7, random_state=42)

    if resampling_method == "EditedNearestNeighbours":
        resample = EditedNearestNeighbours(n_neighbors=7,
                                           kind_sel='mode',
                                           n_jobs=-1)

    if resampling_method == "RepeatedEditedNearestNeighbours":
        resample = RepeatedEditedNearestNeighbours(n_neighbors=7,
                                                   kind_sel='mode',
                                                   n_jobs=-1)

    if resampling_method == "AllKNN":
        resample = AllKNN(n_neighbors=7,
                          kind_sel='mode',
                          allow_minority=True,
                          n_jobs=-1)

    if resampling_method == "NearMiss":
        resample = NearMiss(n_neighbors=7, n_jobs=-1)

    if resampling_method == "NeighbourhoodCleaningRule":
        resample = NeighbourhoodCleaningRule(n_neighbors=7, kind_sel='all')

    if resampling_method == "RandomUnderSampler":
        resample = RandomUnderSampler(random_state=42)

    if resampling_method == "TomekLinks":
        resample = TomekLinks(n_jobs=-1)

    # ---- OVER-SAMPLING ------ #
    if resampling_method == "BorderlineSMOTE":
        resample = BorderlineSMOTE(random_state=42, n_jobs=-1)

    if resampling_method == "KMeansSMOTE":
        resample = KMeansSMOTE(random_state=42)

    if resampling_method == "RandomUnderSampler":
        resample = RandomOverSampler(random_state=42)

    if resampling_method == "SMOTE":
        resample = SMOTE(random_state=42, n_jobs=-1)

    # transform the dataset
    train_x.data, train_y.data = resample.fit_resample(train_x.data,
                                                       train_y.data)
Example #34
def test_ncr_init():
    """Test the initialisation of the object"""

    # Define a ratio
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)

    assert_equal(ncr.n_neighbors, 3)
    assert_equal(ncr.n_jobs, 1)
    assert_equal(ncr.random_state, RND_SEED)
Example #35
def test_ncr_fit_single_class():
    """Test either if an error when there is a single class"""

    # Create the object
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    # Resample the data
    # Create a wrong y
    y_single_class = np.zeros((X.shape[0], ))
    assert_warns(UserWarning, ncr.fit, X, y_single_class)
Example #36
    def resample(self, X, y, by, random_state=None, visualize=False):
        '''
        by: String
            The method used to perform re-sampling
            currently supported: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS',
                'NM', 'CC', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN', 'SMOTETomek',
                'ORG']
        '''
        if by == 'RUS':
            sampler = RandomUnderSampler(random_state=random_state)
        elif by == 'CNN':
            sampler = CondensedNearestNeighbour(random_state=random_state)
        elif by == 'ENN':
            sampler = EditedNearestNeighbours(random_state=random_state)
        elif by == 'NCR':
            sampler = NeighbourhoodCleaningRule(random_state=random_state)
        elif by == 'Tomek':
            sampler = TomekLinks(random_state=random_state)
        elif by == 'ALLKNN':
            sampler = AllKNN(random_state=random_state)
        elif by == 'OSS':
            sampler = OneSidedSelection(random_state=random_state)
        elif by == 'NM':
            sampler = NearMiss(random_state=random_state)
        elif by == 'CC':
            sampler = ClusterCentroids(random_state=random_state)
        elif by == 'SMOTE':
            sampler = SMOTE(random_state=random_state)
        elif by == 'ADASYN':
            sampler = ADASYN(random_state=random_state)
        elif by == 'BorderSMOTE':
            sampler = BorderlineSMOTE(random_state=random_state)
        elif by == 'SMOTEENN':
            sampler = SMOTEENN(random_state=random_state)
        elif by == 'SMOTETomek':
            sampler = SMOTETomek(random_state=random_state)
        elif by == 'ORG':
            sampler = None
        else:
            raise ValueError("Unexpected 'by' type {}".format(by))

        if by != 'ORG':
            X_train, y_train = sampler.fit_resample(X, y)
        else:
            X_train, y_train = X, y
        if visualize:
            df = pd.DataFrame(X_train)
            df['label'] = y_train
            df.plot.scatter(x=0,
                            y=1,
                            c='label',
                            s=3,
                            colormap='coolwarm',
                            title='{} training set'.format(by))
        return X_train, y_train
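A hedged sketch of calling the resample method above; the enclosing class is not shown in this snippet, so `Resampler` is a hypothetical stand-in for whatever class defines it, and passing random_state to NeighbourhoodCleaningRule assumes the older imbalanced-learn API the snippet targets:

from sklearn.datasets import make_classification

# Illustrative imbalanced data.
X_demo, y_demo = make_classification(n_classes=2, weights=[0.1, 0.9],
                                     n_samples=1000, random_state=0)

sampler = Resampler()  # hypothetical owner class of the resample() method
X_ncr, y_ncr = sampler.resample(X_demo, y_demo, by='NCR', random_state=0)
X_org, y_org = sampler.resample(X_demo, y_demo, by='ORG')  # no resampling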
print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply neighbourhood cleaning rule
ncl = NeighbourhoodCleaningRule(return_indices=True)
X_resampled, y_resampled, idx_resampled = ncl.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                   idx_resampled)

idx_class_0 = y_resampled == 0
plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1],
            alpha=.8, label='Class #0')
plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1],
            alpha=.8, label='Class #1')
plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1],
            alpha=.8, label='Removed samples')
def test_ncr_error(ncr_params, err_msg):
    ncr = NeighbourhoodCleaningRule(**ncr_params)
    with pytest.raises(ValueError, match=err_msg):
        ncr.fit_resample(X, Y)
def test_ncr_wrong_nn_obj():
    nn = 'rnd'
    ncr = NeighbourhoodCleaningRule(return_indices=True, n_neighbors=nn)
    with raises(ValueError, match="has to be one of"):
        ncr.fit_resample(X, Y)
def test_deprecation_random_state():
    ncr = NeighbourhoodCleaningRule(random_state=0)
    with warns(
            DeprecationWarning, match="'random_state' is deprecated from 0.4"):
        ncr.fit_resample(X, Y)
from imblearn.under_sampling import NeighbourhoodCleaningRule

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply neighbourhood cleaning rule
ncl = NeighbourhoodCleaningRule()
X_resampled, y_resampled = ncl.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)