def test_enn_sample_wrong_X():
    """Test either if an error is raised when X is different at fitting
    and sampling"""

    # Create the object
    enn = EditedNearestNeighbours(random_state=RND_SEED)
    enn.fit(X, Y)
    assert_raises(RuntimeError, enn.sample, np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
def test_enn_fit_sample():
    enn = EditedNearestNeighbours()
    X_resampled, y_resampled = enn.fit_sample(X, Y)

    X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828],
                     [2.59928271, 0.93323465], [1.92365863, 0.82718767],
                     [0.25738379, 0.95564169], [0.78318102, 2.59153329],
                     [0.52726792, -0.38735648]])
    y_gt = np.array([0, 0, 1, 1, 2, 2, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_enn_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    enn = EditedNearestNeighbours(random_state=RND_SEED)
    X_resampled, y_resampled = enn.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'enn_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'enn_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_enn_fit():
    """Test the fitting method"""

    # Create the object
    enn = EditedNearestNeighbours(random_state=RND_SEED)
    # Fit the data
    enn.fit(X, Y)

    # Check if the data information have been computed
    assert_equal(enn.min_c_, 0)
    assert_equal(enn.maj_c_, 1)
    assert_equal(enn.stats_c_[0], 500)
    assert_equal(enn.stats_c_[1], 4500)
def test_enn_fit_sample_mode():
    enn = EditedNearestNeighbours(kind_sel='mode')
    X_resampled, y_resampled = enn.fit_sample(X, Y)

    X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828],
                     [2.59928271, 0.93323465], [1.42772181, 0.526027],
                     [1.92365863, 0.82718767], [0.25738379, 0.95564169],
                     [-0.284881, -0.62730973], [0.57062627, 1.19528323],
                     [0.78318102, 2.59153329], [0.35831463, 1.33483198],
                     [-0.14313184, -1.0412815], [-0.09816301, -0.74662486],
                     [0.52726792, -0.38735648], [0.2821046, -0.07862747]])
    y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_enn_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    enn = EditedNearestNeighbours(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = enn.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'enn_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'enn_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'enn_idx.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_multiclass_fit_sample():
    """Test fit sample method with multiclass target"""

    # Make y to be multiclass
    y = Y.copy()
    y[0:1000] = 2

    # Resample the data
    enn = EditedNearestNeighbours(random_state=RND_SEED)
    X_resampled, y_resampled = enn.fit_sample(X, y)

    # Check the size of y
    count_y_res = Counter(y_resampled)
    assert_equal(count_y_res[0], 400)
    assert_equal(count_y_res[1], 1836)
    assert_equal(count_y_res[2], 5)
def test_enn_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    enn = EditedNearestNeighbours(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = enn.fit_sample(X, Y)

    X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828],
                     [2.59928271, 0.93323465], [1.92365863, 0.82718767],
                     [0.25738379, 0.95564169], [0.78318102, 2.59153329],
                     [0.52726792, -0.38735648]])
    y_gt = np.array([0, 0, 1, 1, 2, 2, 2])
    idx_gt = np.array([4, 11, 0, 3, 1, 8, 15])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_enn_fit_sample_with_nn_object():
    """Test the fit sample routine using a NN object"""

    # Resample the data
    nn = NearestNeighbors(n_neighbors=4)
    enn = EditedNearestNeighbours(
        n_neighbors=nn, random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = enn.fit_sample(X, Y)

    X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828],
                     [2.59928271, 0.93323465], [1.42772181, 0.526027],
                     [1.92365863, 0.82718767], [0.25738379, 0.95564169],
                     [-0.284881, -0.62730973], [0.57062627, 1.19528323],
                     [0.78318102, 2.59153329], [0.35831463, 1.33483198],
                     [-0.14313184, -1.0412815], [-0.09816301, -0.74662486],
                     [0.52726792, -0.38735648], [0.2821046, -0.07862747]])
    y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #10
    def fit(self, X, y, by, random_state=None, visualize=False):
        '''
        by : str
            The method used to perform resampling.
            Supported values: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN',
                'OSS', 'NM', 'CC', 'SMOTE', 'ADASYN', 'BorderSMOTE',
                'SMOTEENN', 'SMOTETomek', 'ORG']
        '''
        if by == 'RUS':
            sampler = RandomUnderSampler(random_state=random_state)
        elif by == 'CNN':
            sampler = CondensedNearestNeighbour(random_state=random_state)
        elif by == 'ENN':
            sampler = EditedNearestNeighbours()
        elif by == 'NCR':
            sampler = NeighbourhoodCleaningRule()
        elif by == 'Tomek':
            sampler = TomekLinks()
        elif by == 'ALLKNN':
            sampler = AllKNN()
        elif by == 'OSS':
            sampler = OneSidedSelection(random_state=random_state)
        elif by == 'NM':
            sampler = NearMiss()
        elif by == 'CC':
            sampler = ClusterCentroids(random_state=random_state)
        elif by == 'SMOTE':
            sampler = SMOTE(random_state=random_state)
        elif by == 'ADASYN':
            sampler = ADASYN(random_state=random_state)
        elif by == 'BorderSMOTE':
            sampler = BorderlineSMOTE(random_state=random_state)
        elif by == 'SMOTEENN':
            sampler = SMOTEENN(random_state=random_state)
        elif by == 'SMOTETomek':
            sampler = SMOTETomek(random_state=random_state)
        elif by == 'ORG':
            sampler = None
        else:
            raise ValueError("Unexpected 'by' type {}".format(by))

        if by != 'ORG':
            X_train, y_train = sampler.fit_resample(X, y)
        else:
            X_train, y_train = X, y
        self.base_estimator.fit(X_train, y_train)
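
# A hedged standalone sketch of what fit(by='ENN') does above: resample first,
# then fit a base estimator. The dataset and estimator below are illustrative
# assumptions, not part of the original class.
from collections import Counter
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import EditedNearestNeighbours

X_demo, y_demo = make_classification(n_samples=500, weights=[0.9, 0.1],
                                     random_state=0)
sampler = EditedNearestNeighbours()          # the by == 'ENN' branch
X_rs, y_rs = sampler.fit_resample(X_demo, y_demo)
print(Counter(y_demo), '->', Counter(y_rs))
DecisionTreeClassifier().fit(X_rs, y_rs)     # stands in for base_estimator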
Example #11
def under_sampling_algs():
    algs = list()
    algs.append(("No Rs Undersampling case", "No Re-sampling"))
    algs.append((RandomUnderSampler(random_state=1), 'RU'))
    algs.append((ClusterCentroids(random_state=1), 'CC'))
    algs.append((TomekLinks(), 'TL'))
    algs.append((NearMiss(version=1), 'NM1'))
    algs.append((NearMiss(version=2), 'NM2'))
    algs.append((NearMiss(version=3), 'NM3'))
    algs.append((CondensedNearestNeighbour(random_state=1), 'CNN'))
    algs.append((OneSidedSelection(random_state=1), 'OSS'))
    algs.append((EditedNearestNeighbours(), 'ENN'))
    algs.append((NeighbourhoodCleaningRule(), 'NCL'))
    algs.append((InstanceHardnessThreshold(random_state=1), 'IHT'))
    algs.append((RepeatedEditedNearestNeighbours(), 'RENN'))
    algs.append((AllKNN(), 'AllKNN'))
    return algs
def get_models():
    models, names = list(), list()
    # TL
    models.append(TomekLinks())
    names.append('TL')
    # ENN
    models.append(EditedNearestNeighbours())
    names.append('ENN')
    # RENN
    models.append(RepeatedEditedNearestNeighbours())
    names.append('RENN')
    # OSS
    models.append(OneSidedSelection())
    names.append('OSS')
    # NCR
    models.append(NeighbourhoodCleaningRule())
    names.append('NCR')
    return models, names
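
# A hedged evaluation sketch for get_models(): chain each undersampler with a
# classifier in an imblearn Pipeline under repeated stratified CV. The
# synthetic dataset and ROC AUC scoring are illustrative assumptions.
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline

X, y = make_classification(n_samples=1000, weights=[0.99], flip_y=0,
                           random_state=1)
models, names = get_models()
for model, name in zip(models, names):
    pipeline = Pipeline(steps=[('s', model), ('m', DecisionTreeClassifier())])
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    print('%s %.3f (%.3f)' % (name, mean(scores), scores.std()))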
Example #13
def test_validate_estimator_init():
    # Create a SMOTE and Tomek object
    smote = SMOTE(random_state=RND_SEED)
    enn = EditedNearestNeighbours(random_state=RND_SEED)

    smt = SMOTEENN(smote=smote, enn=enn, random_state=RND_SEED)

    X_resampled, y_resampled = smt.fit_sample(X, Y)

    X_gt = np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                     [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                     [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                     [0.29307743, -0.14670439], [0.84976473, -0.15570176],
                     [0.61319159, -0.11571668], [0.66052536, -0.28246517],
                     [-0.28162401, -2.10400981], [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example #14
def resample(X, Y, rate=0.9, strategy='hybrid'):
    """ Sampling based methods to balance dataset

    Args:
        X (pd.DataFrame): Main dataset with the variables
        Y (pd.Series): Target variable
        rate (float): Ratio of the number of samples in the minority class over
            the number of samples in the majority class after resampling
        strategy ('hybrid' | 'over_sampling' | 'under_sampling'): Strategy to
            balance the dataset
    """
    strategies = {
        'hybrid': SMOTEENN(sampling_strategy=rate),
        'over_sampling': SMOTE(sampling_strategy=rate),
        'under_sampling': EditedNearestNeighbours(),
    }
    resampling = strategies[strategy]
    cols = X.columns
    X_r, Y_r = resampling.fit_resample(X, Y)
    return pd.DataFrame(data=X_r, columns=cols), Y_r
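
# Minimal usage sketch for resample() above; the toy DataFrame and Series are
# illustrative assumptions.
import pandas as pd
from sklearn.datasets import make_classification

X_arr, y_arr = make_classification(n_samples=200, weights=[0.9, 0.1],
                                   random_state=0)
X_df = pd.DataFrame(X_arr, columns=['f%d' % i for i in range(X_arr.shape[1])])
y_sr = pd.Series(y_arr, name='target')
X_bal, y_bal = resample(X_df, y_sr, rate=0.8, strategy='over_sampling')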
Example #15
def test_sample_regular_pass_smote_enn():
    smote = SMOTEENN(
        smote=SMOTE(sampling_strategy="auto", random_state=RND_SEED),
        enn=EditedNearestNeighbours(sampling_strategy="all"),
        random_state=RND_SEED,
    )
    X_resampled, y_resampled = smote.fit_resample(X, Y)

    X_gt = np.array([
        [1.52091956, -0.49283504],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571667],
        [0.66052536, -0.28246518],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    y_gt = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def get_samplers():
    samplers = {
        # Under-samplers
        'RandomUn': RandomUnderSampler(),
        'TL': TomekLinks(),
        # 'ENN': EditedNearestNeighbours(),
        'RENN': RepeatedEditedNearestNeighbours(),
        'OSS': OneSidedSelection(),
        'NCR': NeighbourhoodCleaningRule(),
        'IHT': InstanceHardnessThreshold(),
        # Over-Samplers
        'RandomOv': RandomOverSampler(),
        'SMOTE': SMOTE(),
        'SMOTESVM': SVMSMOTE(),
        # 'SMOTEKMeans': KMeansSMOTE(),
        'ADASYN': ADASYN(),
        # Combined Under and Over Samplers
        'SMOTEENN': SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='majority')),
        'SMOTETomek': SMOTETomek(tomek=TomekLinks(sampling_strategy='majority')),
    }
    return samplers
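
# Illustrative loop over get_samplers(): apply each sampler and report the
# resulting class balance. The synthetic dataset is an assumption; note that
# InstanceHardnessThreshold and SVMSMOTE can be slow on larger data.
from collections import Counter
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=1000, weights=[0.9, 0.1],
                                     random_state=0)
for name, sampler in get_samplers().items():
    X_res, y_res = sampler.fit_resample(X_demo, y_demo)
    print(name, Counter(y_res))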
Example #17
def create_sampler(sampler_name, random_state=None):

    if sampler_name is None or sampler_name == 'None':
        return None
    if sampler_name.lower() == 'randomundersampler':
        return RandomUnderSampler(random_state=random_state)
    if sampler_name.lower() == 'tomeklinks':
        return TomekLinks(random_state=random_state)
    if sampler_name.lower() == 'enn':
        return EditedNearestNeighbours(random_state=random_state)
    if sampler_name.lower() == 'ncl':
        return NeighbourhoodCleaningRule(random_state=random_state)
    if sampler_name.lower() == 'randomoversampler':
        return RandomOverSampler(random_state=random_state)
    if sampler_name.lower() == 'smote':
        return SMOTE(random_state=random_state)
    if sampler_name.lower() == 'smotetomek':
        return SMOTETomek(random_state=random_state)
    if sampler_name.lower() == 'smoteenn':
        return SMOTEENN(random_state=random_state)
    else:
        raise ValueError('Unsupported value \'%s\' for sampler' % sampler_name)
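
# Usage sketch for create_sampler(): names are matched case-insensitively and
# X_train / y_train are assumed to exist. This snippet targets the older
# imblearn API used above (where samplers accepted random_state).
sampler = create_sampler('smoteenn', random_state=42)
if sampler is not None:
    X_res, y_res = sampler.fit_resample(X_train, y_train)  # fit_sample on older imblearn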
Example #18
def resample_data(predictors, target, df_data, method):
    """
    This function resamples training datasets prior to training models.
    """
    if method=='adasyn':
        util = ADASYN()
    elif method=='random-over-sampler':
        util = RandomOverSampler()
    elif method=='smote':
        util = SMOTE(kind='borderline2')
    elif method=='smote-tomek':
        util = SMOTETomek()
    elif method=='smote-enn':
        util = SMOTEENN()
    elif method=='edited-nn':
        util = EditedNearestNeighbours()
    elif method=='repeated-edited-nn':
        util = RepeatedEditedNearestNeighbours()
    elif method=='all-knn':
        util = AllKNN()
    elif method=='one-sided-selection':
        util = OneSidedSelection()
    elif method=='cluster-centroids':
        util = ClusterCentroids()
    elif method=='random-under-sampler':
        util = RandomUnderSampler()
    elif method=='neighbourhood-cleaning-rule':
        util = NeighbourhoodCleaningRule()
    elif method=='condensed-nearest-neighbour':
        util = CondensedNearestNeighbour()
    elif method=='near-miss':
        util = NearMiss(version=1)
    elif method=='instance-hardness-threshold':
        util = InstanceHardnessThreshold()
    else:
        raise ValueError('Unsupported resampling method: %s' % method)

    x_resampled, y_resampled = util.fit_sample(df_data[predictors], df_data[target])
    x_resampled = pd.DataFrame(x_resampled, columns=predictors)
    y_resampled = pd.DataFrame(y_resampled, columns=[target])
    return x_resampled, y_resampled
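
# Usage sketch for resample_data(): assumes df_data is a pandas DataFrame that
# contains both the predictor columns and the target column; the column names
# here are illustrative.
x_bal, y_bal = resample_data(predictors=['f0', 'f1'], target='label',
                             df_data=df_data, method='edited-nn')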
Example #19
def build_loaders(titles, labels, batch_size,
                  under_sample=False, over_sample=False):
    train_titles, test_titles, train_labels, test_labels = \
        train_test_split(titles, labels, test_size=0.1)
    val_titles, test_titles, val_labels, test_labels = \
        train_test_split(test_titles, test_labels, test_size=0.01)

    steps = []
    if under_sample:
        steps.append(("Under", EditedNearestNeighbours(n_neighbors=2)))
    if over_sample:
        steps.append(("Over", SMOTE(sampling_strategy=1)))
    if under_sample or over_sample:
        pipeline = Pipeline(steps=steps)
        train_titles, train_labels = pipeline.fit_resample(train_titles,
                                                           train_labels)
    print("Train:")
    calc_ratio(train_labels)
    print("Validation:")
    calc_ratio(val_labels)
    print("Test:")
    calc_ratio(test_labels)

    train = TensorDataset(torch.from_numpy(train_titles),
                          torch.from_numpy(train_labels))
    val = TensorDataset(torch.from_numpy(val_titles),
                        torch.from_numpy(val_labels))
    test = TensorDataset(torch.from_numpy(test_titles),
                         torch.from_numpy(test_labels))

    train_loader = DataLoader(train, shuffle=True, batch_size=batch_size,
                              drop_last=True)
    test_loader = DataLoader(test, shuffle=True, batch_size=batch_size,
                             drop_last=True)
    val_loader = DataLoader(val, shuffle=True, batch_size=batch_size,
                            drop_last=True)

    return train_loader, test_loader, val_loader
Example #20
    def __init__(self):
        from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE, BorderlineSMOTE, RandomOverSampler
        from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler, InstanceHardnessThreshold, NearMiss, \
            TomekLinks, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, OneSidedSelection, \
            CondensedNearestNeighbour, NeighbourhoodCleaningRule
        from imblearn.ensemble import EasyEnsemble, EasyEnsembleClassifier, BalancedBaggingClassifier, \
            BalancedRandomForestClassifier, BalanceCascade, RUSBoostClassifier

        self.oversamplers = {
            'ADASYN': ADASYN(),
            'RandomOverSampler': RandomOverSampler(),
            'SMOTE': SMOTE(),
            'BorderlineSMOTE': BorderlineSMOTE(),
            'SVMSMOTE': SVMSMOTE()
        }
        self.undersamplers = {
            'ClusterCentroids': ClusterCentroids(),
            'RandomUnderSampler': RandomUnderSampler(),
            'InstanceHardnessThreshold': InstanceHardnessThreshold(),
            'NearMiss': NearMiss(),
            'TomekLinks': TomekLinks(),
            'EditedNearestNeighbours': EditedNearestNeighbours(),
            'RepeatedEditedNearestNeighbours':
            RepeatedEditedNearestNeighbours(),
            'AllKNN': AllKNN(),
            'OneSidedSelection': OneSidedSelection(),
            'CondensedNearestNeighbour': CondensedNearestNeighbour(),
            'NeighbourhoodCleaningRule': NeighbourhoodCleaningRule()
        }
        self.ensemblesamplers = {
            'EasyEnsemble': EasyEnsemble(),
            'EasyEnsembleClassifier': EasyEnsembleClassifier(),
            'BalancedBaggingClassifier': BalancedBaggingClassifier(),
            'BalanceCascade': BalanceCascade(),
            'BalancedRandomForestClassifier': BalancedRandomForestClassifier(),
            'RUSBoostClassifier': RUSBoostClassifier()
        }
Example #21
def test_enn_init():
    enn = EditedNearestNeighbours()

    assert enn.n_neighbors == 3
    assert enn.kind_sel == 'all'
    assert enn.n_jobs == 1
def test_enn_not_good_object():
    nn = 'rnd'
    enn = EditedNearestNeighbours(
        n_neighbors=nn, kind_sel='mode')
    with raises(ValueError, match="has to be one of"):
        enn.fit_sample(X, Y)
Example #23
oversampler = SMOTE(ratio=0.2,
                    random_state=np.random.randint(100),
                    kind='regular',
                    n_jobs=-1)
os_X_train, os_y_train = oversampler.fit_sample(X_train.fillna(0), y_train)

### ADASYN is very slow to run ###
X_resampled_adasyn, y_resampled_adasyn = ADASYN(
    sampling_strategy=0.2,
    n_jobs=-1).fit_sample(train.loc[:, feature].fillna(0).values,
                          train["y"].values.astype('int'))

### Remove some noisy points on the class boundary ###
from imblearn.under_sampling import EditedNearestNeighbours

enn = EditedNearestNeighbours(random_state=0)
X_resampled, y_resampled = enn.fit_sample(X, y)

dtrain = xgb.DMatrix(data=train.loc[:, feature].astype('float'),
                     label=train['y'].astype('int'))
dval = xgb.DMatrix(data=val.loc[:, feature].astype('float'),
                   label=val['y'].astype('int'))
train.loc[:, feature].info(null_counts=True)

params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
Example #24
import numpy as np
from sklearn.svm import SVC
from hyperopt import hp
from sklearn.decomposition import PCA
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import EditedNearestNeighbours

from config import random_seed
from utils.python_utils import quniform_int

steps = [('undersampler', EditedNearestNeighbours(random_state=random_seed)),
         ('SVC',
          SVC(C=1, kernel='linear', random_state=random_seed,
              probability=True))]
model = Pipeline(steps=steps)

params_space = {'SVC__C': hp.quniform('C', 1, 100, 5)}
Example #25
# SMOTENC + ENN
X_smote = np.array(df_smotenc[[
    'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id',
    'app_domain', 'app_category', 'device_id', 'device_ip', 'device_model',
    'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18',
    'C19', 'C20', 'C21'
]])
Y_smote = list(df_smotenc['click'])
from imblearn.under_sampling import EditedNearestNeighbours

enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_sample(X_smote, Y_smote)
# df_smotenc = pd.DataFrame(X_smotenc,
#                           columns=column1)
# df_smotenc = pd.concat([df_smotenc, pd.DataFrame(y_smotenc, columns=['click'])], axis=1)
# for i in column1:
#     df_smotenc[i] = df_smotenc[i].astype(int)
# df_smotenc.head()
Example #26
print(spe/10)
print("\n")
print("Overall Confusion Matrix: ")
np.set_printoptions(formatter={'float': '{:.1f}'.format})
print(tol/10)
print("\n")
print("Number of cases in each class")
print(Counter(y_res))

########## Edited Nearest Neighbour ##########
np.set_printoptions(formatter={'float': '{:.2f}'.format})
X = np.array(principal_4_Df.iloc[:,:-1])
y = np.array(principal_4_Df.iloc[:,-1])
model = KNeighborsClassifier(n_neighbors=8)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
us = EditedNearestNeighbours(random_state=0)
pipeline = make_pipeline(us, model)
X_res , y_res = us.fit_resample(X, y)

overall = []
recall = np.zeros((1,13))
spe = np.zeros((1,13))
tol = np.zeros((13,13))
trial = 0
for train_index, test_index in skf.split(X,y):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  model.fit(X_train, y_train)
  y_pred = cross_val_predict(pipeline, X_test, y_test, cv=skf)
  score = cross_val_score(pipeline, X_test, y_test, cv=skf).mean() 
  overall.append(score)
Example #27
        print("---------------")
        print("ratio", i)
        results['ratio'][a] = i
        print("neighbors", j)
        results['neighbors'][a] = j
        b = a
        a = a + 1
        results['Class'][b] = 0
        results['Class'][a] = 1
        results['Datasize'][b] = datasize[0]
        results['Datasize'][a] = datasize[1]
        results['Training Datasize'][b] = trainingdatasize[0]
        results['Training Datasize'][a] = trainingdatasize[1]
        results['Testing Datasize'][b] = testingdatasize[0]
        results['Testing Datasize'][a] = testingdatasize[1]
        enn = EditedNearestNeighbours(random_state=5, n_neighbors=j)
        X_train_sampled, y_train_sampled = enn.fit_sample(
            X_train_sampled1, y_train_sampled1)
        samplingdatasize = collections.Counter(y_train_sampled)
        print("sampled training data size", samplingdatasize)
        results['After sampling'][b] = samplingdatasize[0]
        results['After sampling'][a] = samplingdatasize[1]

        #random forest
        clf = RandomForestClassifier(n_estimators=100,
                                     max_depth=5,
                                     random_state=0,
                                     oob_score=True)
        clf.fit(X_train_sampled, y_train_sampled)
        y_pred = clf.predict(X_test)
        y_test_arr = np.array(y_test['Outcome'])
Example #28
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Three subplots, unpack the axes array immediately
f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

# Apply the ENN
print('ENN')
enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)
print('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) / len(X))))

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax2.set_title('Edited nearest neighbours')

# Apply the RENN
print('RENN')
renn = RepeatedEditedNearestNeighbours()
def test_deprecation_random_state():
    enn = EditedNearestNeighbours(random_state=0)
    with warns(DeprecationWarning,
               match="'random_state' is deprecated from 0.4"):
        enn.fit_sample(X, Y)
Example #30
def test_enn_not_good_object():
    nn = 'rnd'
    enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel='mode')
    with raises(ValueError, match="has to be one of"):
        enn.fit_resample(X, Y)
                           n_samples=100,
                           random_state=10)

# Instanciate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Three subplots, unpack the axes array immediately
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)

c0, c1 = plot_resampling(ax1, X_vis, y, 'Original set')

# Apply the ENN
print('ENN')
enn = EditedNearestNeighbours(return_indices=True)
X_resampled, y_resampled, idx_resampled = enn.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)
idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled)
reduction_str = ('Reduced {:.2f}%'.format(
    100 * (1 - float(len(X_resampled)) / len(X))))
print(reduction_str)
c3 = ax2.scatter(X_vis[idx_samples_removed, 0],
                 X_vis[idx_samples_removed, 1],
                 alpha=.2,
                 label='Removed samples',
                 c='g')
plot_resampling(ax2, X_res_vis, y_resampled, 'ENN - ' + reduction_str)

# Apply the RENN
print('RENN')
Example #32
import numpy as np
import xgboost as xgb
from hyperopt import hp
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import EditedNearestNeighbours

from config import random_seed
from utils.python_utils import quniform_int

steps = [('undersampler',
          EditedNearestNeighbours(random_state=random_seed, n_neighbors=3)),
         ('xgb',
          xgb.XGBClassifier(n_estimators=6450,
                            colsample_bytree=0.9,
                            learning_rate=0.0271311414499,
                            min_child_weight=4,
                            subsample=0.917109565217,
                            max_depth=25,
                            gamma=0.0100121777578,
                            silent=True,
                            nthread=3,
                            seed=random_seed))]

model = Pipeline(steps=steps)

params_space = {
    'undersampler__n_neighbors':
    quniform_int('n_neighbors', 2, 10, 1),
    'xgb__max_depth':
    quniform_int('max_depth', 10, 30, 1),
    'xgb__min_child_weight':
# model eval
recall = recall_score(Y_test,predictions)
report = classification_report(Y_test, predictions)

print(f'Recall Logistic Regression {recall: .2f}')
print(report)
print(balanced_accuracy_score(Y_test, predictions))
t1 = pl.time.time() - t0
print("Time taken: {:.0f} min {:.0f} secs".format(*divmod(t1, 60)))
print("best parameters",LR_model.best_params_)
plot_confusion_matrix(confusion_matrix(Y_test,predictions),['Dolphin','Non-Dolphin'])
# Seventh test ---------------------- Logistic Regression -------------------------------- undersampling with ENN
from imblearn.under_sampling import EditedNearestNeighbours
Xtrain_enn, Ytrain_enn = EditedNearestNeighbours().fit_sample(X_train, Y_train)
t0 = pl.time.time()
LR = LogisticRegression(max_iter=4000,
                            random_state=49,
                            n_jobs=1, class_weight='balanced') # for liblinear n_jobs is +1.

parameters = {"penalty": ['l1', 'l2'],'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], "solver":['liblinear','sag','saga']}

LR_model = GridSearchCV(LR, parameters, scoring="precision", cv=3)

# fit the classifier
LR_model.fit(Xtrain_enn, Ytrain_enn.values.ravel())

# get the prediction
predictions = LR_model.predict(X_test)
Example #34
                        X_train,
                        y_train,
                        cv=10,
                        scoring=('roc_auc', 'average_precision'))
scores['test_roc_auc'].mean(), scores['test_average_precision'].mean()
# (0.9518183780276207, 0.6767076447148238)

######### Edited Nearest Neighbor #########

# `mode`: removes samples that are misclassified by a majority vote of their KNN.
# `all`: removes samples that have any neighbor from another class.
# In effect, this cleans up outliers and noisy class boundaries.

from imblearn.under_sampling import EditedNearestNeighbours

enn = EditedNearestNeighbours(n_neighbors=5)

X_train_enn, y_train_enn = enn.fit_sample(X_train, y_train)
enn_mode = EditedNearestNeighbours(kind_sel="mode", n_neighbors=5)
X_train_enn_mode, y_train_enn_mode = enn_mode.fit_sample(X_train, y_train)
print(X_train_enn_mode.shape)
print(np.bincount(y_train_enn_mode))

### Pipeline method

enn_pipe = make_imb_pipeline(EditedNearestNeighbours(n_neighbors=5),
                             LogisticRegression())

scores = cross_validate(enn_pipe,
                        X_train,
                        y_train,
Example #35
            alpha=.5,
            edgecolor=almost_black,
            facecolor=palette[0],
            linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0],
            X_vis[y == 1, 1],
            label="Class #1",
            alpha=.5,
            edgecolor=almost_black,
            facecolor=palette[2],
            linewidth=0.15)
ax1.set_title('Original set')

# Apply the ENN
print('ENN')
enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

ax2.scatter(X_res_vis[y_resampled == 0, 0],
            X_res_vis[y_resampled == 0, 1],
            label="Class #0",
            alpha=.5,
            edgecolor=almost_black,
            facecolor=palette[0],
            linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0],
            X_res_vis[y_resampled == 1, 1],
            label="Class #1",
            alpha=.5,
            edgecolor=almost_black,
Example #36
def model_resampling_pipeline(X_train,
                              X_test,
                              y_train,
                              y_test,
                              model,
                              b=0.5,
                              name='',
                              eval_show=True,
                              columns=None):

    if not hasattr(model, 'predict_proba'):
        model = CalibratedClassifierCV(model, cv=3)

    results = {
        'ordinary': {},
        'class_weight': {},
        'oversample': {},
        'undersample': {}
    }

    # ------ No balancing ------
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    probas = [x[1] for x in model.predict_proba(X_test)]
    scores = metrics.classification_report(
        y_test,
        predictions,
        target_names=['negative', 'positive', 'mixed'],
        output_dict=True)

    w_precision = scores['macro avg']['precision']
    w_recall = scores['macro avg']['recall']
    w_fscore = scores['macro avg']['f1-score']

    results['ordinary'] = {
        'w_precision': w_precision,
        'w_recall': w_recall,
        'w_fscore': w_fscore,
        'predictions': np.array(predictions),
        'probas': probas
    }

    # ------ Class weight ------
    if 'class_weight' in model.get_params().keys():
        model.set_params(class_weight='balanced')
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        probas = [x[1] for x in model.predict_proba(X_test)]
        scores = metrics.classification_report(
            y_test,
            predictions,
            target_names=['negative', 'positive', 'mixed'],
            output_dict=True)

        w_precision = scores['macro avg']['precision']
        w_recall = scores['macro avg']['recall']
        w_fscore = scores['macro avg']['f1-score']

        results['class_weight'] = {
            'w_precision': w_precision,
            'w_recall': w_recall,
            'w_fscore': w_fscore,
            'predictions': np.array(predictions),
            'probas': probas
        }

    # ------------ OVERSAMPLING TECHNIQUES ------------
    techniques = [RandomOverSampler(), SMOTE(), ADASYN()]

    for sampler in techniques:
        technique = sampler.__class__.__name__
        X_resampled, y_resampled = sampler.fit_sample(X_train, y_train)

        X_resampled = pd.DataFrame(X_resampled)
        if columns:
            X_resampled.columns = columns
        else:
            X_resampled.columns = X_train.columns

        model.fit(X_resampled, y_resampled)
        predictions = model.predict(X_test)
        probas = [x[1] for x in model.predict_proba(X_test)]
        scores = metrics.classification_report(
            y_test,
            predictions,
            target_names=['negative', 'positive', 'mixed'],
            output_dict=True)

        w_precision = scores['macro avg']['precision']
        w_recall = scores['macro avg']['recall']
        w_fscore = scores['macro avg']['f1-score']

        results['oversample'][technique] = {
            'w_precision': w_precision,
            'w_recall': w_recall,
            'w_fscore': w_fscore,
            'predictions': np.array(predictions),
            'probas': probas
        }

    # ------------ UNDERSAMPLING TECHNIQUES ------------
    techniques = [
        RandomUnderSampler(),
        NearMiss(version=1),
        NearMiss(version=2),
        TomekLinks(),
        EditedNearestNeighbours()
    ]

    for sampler in techniques:
        technique = sampler.__class__.__name__
        if technique == 'NearMiss': technique += str(sampler.version)
        X_resampled, y_resampled = sampler.fit_sample(X_train, y_train)

        X_resampled = pd.DataFrame(X_resampled)
        if columns:
            X_resampled.columns = columns
        else:
            X_resampled.columns = X_train.columns

        model.fit(X_resampled, y_resampled)
        predictions = model.predict(X_test)
        probas = [x[1] for x in model.predict_proba(X_test)]
        scores = metrics.classification_report(
            y_test,
            predictions,
            target_names=['negative', 'positive', 'mixed'],
            output_dict=True)

        w_precision = scores['macro avg']['precision']
        w_recall = scores['macro avg']['recall']
        w_fscore = scores['macro avg']['f1-score']

        results['undersample'][technique] = {
            'w_precision': w_precision,
            'w_recall': w_recall,
            'w_fscore': w_fscore,
            'predictions': np.array(predictions),
            'probas': probas
        }

    if eval_show:
        evaluate_method(results,
                        y_test,
                        'undersample',
                        title=name + '\nUndersampled')
        evaluate_method(results, y_test, 'oversample', title='Oversampled')

    return results
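
# Illustrative call for model_resampling_pipeline(); the estimator and the
# pre-split data are assumptions. The returned dict is keyed by balancing
# strategy ('ordinary', 'class_weight', 'oversample', 'undersample').
from sklearn.linear_model import LogisticRegression

results = model_resampling_pipeline(X_train, X_test, y_train, y_test,
                                    model=LogisticRegression(max_iter=1000),
                                    name='LogReg', eval_show=False)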
def test_enn_not_good_object():
    nn = 'rnd'
    enn = EditedNearestNeighbours(n_neighbors=nn,
                                  random_state=RND_SEED,
                                  kind_sel='mode')
    assert_raises_regex(ValueError, "has to be one of", enn.fit_sample, X, Y)
# dados_completo.drop('index', axis=1, inplace=True)

mostrar_quantidade_por_classe(dados_completo, 'dirtiness')
mostrar_quantidade_por_classe(dados_completo, 'white_bgd')
mostrar_quantidade_por_classe(dados_completo, 'viable')
mostrar_quantidade_por_classe(dados_completo, 'not_viable')
# mostrar_quantidade_por_classe(dados_completo, 5)

print(dados_completo.shape)
print(dados_completo.describe(include=['number']))

n_jobs = 5
# classes_balancear = list([2, 3])
# balanceador = EditedNearestNeighbours(n_jobs=n_jobs, n_neighbors=5)
# balanceador = SMOTE(n_jobs=n_jobs, random_state=random_state)
balanceador = SMOTEENN(enn=EditedNearestNeighbours(n_jobs=n_jobs,
                                                   n_neighbors=n_jobs),
                       smote=SMOTE(n_jobs=n_jobs),
                       random_state=random_state)

X_treino, Y_treino = balanceador.fit_resample(
    dados_completo.drop('classe', axis=1), dados_completo['classe'])
X_treino = pd.DataFrame(data=X_treino,
                        columns=dados_completo.drop(['classe'],
                                                    axis=1).columns)
Y_treino = pd.DataFrame(data=Y_treino, columns=['classe'])
# X_treino.to_csv('../input/DadosCompletoTransformadoMLBalanceadoX.csv', encoding='utf-8', sep='\t')
# Y_treino.to_csv('../input/DadosCompletoTransformadoMLBalanceadoY.csv', encoding='utf-8', sep='\t')
# # exit()
# X_treino = pd.read_csv('../input/DadosCompletoTransformadoMLBalanceadoX.csv', encoding='utf-8', delimiter='\t')
# X_treino.drop(X_treino.columns[0], axis=1, inplace=True)
# Y_treino = pd.read_csv('../input/DadosCompletoTransformadoMLBalanceadoY.csv', encoding='utf-8', delimiter='\t')
Example #39
# their class differs from that of their nearest neighbors. This sieve can be
# repeated, which is the principle of ``RepeatedEditedNearestNeighbours``.
# ``AllKNN`` differs slightly from ``RepeatedEditedNearestNeighbours`` in that
# it increases the :math:`k` parameter of the internal nearest-neighbors
# algorithm at each iteration.

fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3,
                                                         2,
                                                         figsize=(15, 25))
X, y = create_dataset(n_samples=500, weights=(0.2, 0.3, 0.5), class_sep=0.8)

ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6))
for ax, sampler in zip(
        ax_arr,
    (
        EditedNearestNeighbours(),
        RepeatedEditedNearestNeighbours(),
        AllKNN(allow_minority=True),
    ),
):
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_decision_function(X, y, clf, ax[0])
    ax[0].set_title(f"Decision function for {sampler.__class__.__name__}")
    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title(f"Resampling using {sampler.__class__.__name__}")
fig.tight_layout()

###############################################################################
# ``CondensedNearestNeighbour`` makes use of a 1-NN to iteratively decide if a
# sample should be kept in a dataset or not. The issue is that
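
# Hedged illustration of the ``CondensedNearestNeighbour`` idea described in
# the comment above; the dataset is an assumption for demonstration.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import CondensedNearestNeighbour

X_c, y_c = make_classification(n_samples=500, weights=[0.9, 0.1],
                               random_state=0)
cnn = CondensedNearestNeighbour(random_state=0)
X_cr, y_cr = cnn.fit_resample(X_c, y_c)
print(Counter(y_c), '->', Counter(y_cr))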
Example #40
def validateFitModel(X_train,
                     y_train,
                     X_test=None,
                     y_test=None,
                     cv=False,
                     target=None):
    rs = RobustScaler(quantile_range=(0.1, 0.90))
    mms = MinMaxScaler()
    X_train_mms = mms.fit_transform(rs.fit_transform(X_train))
    ncr = EditedNearestNeighbours(n_neighbors=1,
                                  sampling_strategy=[7, 10],
                                  random_state=42,
                                  return_indices=True)
    _, _, indexes = ncr.fit_resample(X_train_mms, y_train)
    resampling_index = random.sample(range(len(indexes)), len(indexes))
    sampled_indexes = indexes[resampling_index]
    with open(os.path.join(MODELS_PATH, 'sampled_dfs_%s.bin' % target),
              'wb') as f:
        pickle.dump(sampled_indexes, f)
        f.close()

    model = XGBClassifier(verbosity=2,
                          n_estimators=100,
                          objective='multi:softprob',
                          learning_rate=0.125,
                          min_child_weight=1,
                          max_depth=13,
                          gamma=0.6,
                          max_delta_step=0,
                          subsample=1,
                          colsample_bytree=0.9,
                          reg_lambda=2,
                          scale_pos_weight=0.05)
    if cv:
        param_grid = {
            'n_estimators': [10],
            'objective': ['multi:softprob'],
            'learning_rate': [0.125],
            'min_child_weigth': [1],
            'max_depth': [13],
            'gamma': [0.6],
            'max_delta_step': [0],
            'subsample': [1],
            'colsample_bytree': [0.9],
            'reg_lambda': [2],
            'scale_pos_weight': [0.05]
        }
        validate(X_train[sampled_indexes],
                 y_train[sampled_indexes],
                 X_test,
                 y_test,
                 target=target,
                 model=model,
                 parameters=param_grid,
                 model_name='XGB')
    else:
        model.fit(X_train[sampled_indexes], y_train[sampled_indexes])
        with open(
                os.path.join(MODELS_PATH, '%s_fitted_classifier.bin' % target),
                'wb') as f:
            pickle.dump(model, f)
            f.close()

    return
Example #41
     'MLPClassifier(hidden_layer_sizes=(5), solver="lbfgs", max_iter=1000, random_state=42)'
     ),
    ("KerasNN_3neurons", 'KerasNN_not_fitted(n_neurons=3, init="he_normal")'),
    ("KerasNN_12neurons", 'KerasNN_not_fitted(n_neurons=12,init="he_normal")')
]

scalers = [("StandardScaler", StandardScaler()),
           ("RobustScaler", RobustScaler()), ("MinMaxScaler", MinMaxScaler()),
           ("Normalizer", Normalizer()), ("None", None)]

samplers = [
    ("RandomOverSampler_0.2", RandomOverSampler(random_state=42, ratio=0.2)),
    ("RandomOverSampler_0.5", RandomOverSampler(random_state=42, ratio=0.5)),
    ("RandomOverSampler_0.5", RandomOverSampler(random_state=42, ratio=0.35)),
    ("TomekLinks", TomekLinks(random_state=42)),
    ("EditedNN", EditedNearestNeighbours(random_state=42, n_neighbors=3)),
    ("SMOTE", SMOTE(random_state=42, ratio=0.5)),
    ("SMOTETomek", SMOTETomek(random_state=42, ratio=0.8)), ("None", None)
]

pre_processing_pipelines = [
    ("Joris_Pipeline", preprocessing.joris_preprocessing_pipeline),
    ("Morten_Pipeline", preprocessing.morten_preprocessing_pipeline),
    ("Bin it!", preprocessing.bin_it_preprocessing_pipeline),
    ("simple_pipeline", preprocessing.simple_pipeline),
    ("chop_off", preprocessing.chop_off),
    ("pca_chopoff", preprocessing.pca_chopoff),
    ("box_cox_pipeline", preprocessing.box_cox_pipeline),
    ("feature_engineered", preprocessing.feature_engineered)
]
seed = [1]
Example #42
def test_deprecation_random_state():
    enn = EditedNearestNeighbours(random_state=0)
    with warns(DeprecationWarning,
               match="'random_state' is deprecated from 0.4"):
        enn.fit_resample(X, Y)