Example #1
def undersample(classifier):
    print('*** UNDERSAMPLE ***')

    pipe = pipeline.Pipeline([
        ('scaler', preprocessing.StandardScaler()),
        ('resample', under_sampling.RandomUnderSampler()),
        classifier,
    ])

    X, y = prepare_data()

    y_pred = model_selection.cross_val_predict(pipe, X, y, cv=cv, n_jobs=-1)

    # Class counts after an equivalent under-sampling pass.
    c = Counter(under_sampling.RandomUnderSampler().fit_resample(X, y)[1])

    return y, y_pred, c
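
A minimal usage sketch for the function above (not from the original source), assuming `prepare_data` and `cv` are defined in the surrounding module; note that `classifier` must be a `(name, estimator)` pipeline step:

from sklearn import linear_model

# Hypothetical call; the step name 'clf' is arbitrary.
y_true, y_pred, resampled_counts = undersample(
    ('clf', linear_model.LogisticRegression(max_iter=1000)))
print(resampled_counts)  # class counts after under-sampling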
Example #2
def f_TomekRUS(X_train, y_train, seed):
    """
    Use: X_train, y_train, seed
    returns X_train, y_train
    """
    tomek = us.TomekLinks()
    X_tomek, Y_tomek = tomek.fit_sample(X_train, y_train)
    rus = us.RandomUnderSampler()
    X_train, y_train = rus.fit_sample(X_tomek, Y_tomek)
    return (X_train, y_train)
Example #3
def f_RUSSmote(X_train, y_train, seed):
    """
    Use: X_train, y_train, seed
    returns X_train, y_train
    """
    rus = us.RandomUnderSampler()
    X_rus, Y_rus = rus.fit_sample(X_train, y_train)
    smote = os.SMOTE(k_neighbors=5)
    X_train, y_train = smote.fit_sample(X_rus, Y_rus)
    return (X_train, y_train)
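
A hedged usage sketch for the two helpers above, assuming `us` and `os` alias the imblearn sampling modules and that `X_train`/`y_train` already exist:

from imblearn import under_sampling as us
from imblearn import over_sampling as os

X_bal, y_bal = f_TomekRUS(X_train, y_train, seed=42)  # clean with Tomek links, then shrink
X_bal, y_bal = f_RUSSmote(X_train, y_train, seed=42)  # shrink first, then synthesize with SMOTE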
Example #4
def under_sample_train(x_train, y_train, random=False, seed=666):
    if random:
        model_under_sample = under_sampling.RandomUnderSampler(
            random_state=seed)
    else:
        # NearMiss is deterministic, so it takes no random_state.
        model_under_sample = under_sampling.NearMiss(version=2, n_jobs=-1)
    x_train, y_train = model_under_sample.fit_resample(x_train, y_train)
    return x_train, y_train
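
A quick illustrative call for the helper above (assumed data `x_train`, `y_train`):

# Random under-sampling with a fixed seed.
x_rus, y_rus = under_sample_train(x_train, y_train, random=True, seed=42)
# Deterministic NearMiss-2 under-sampling.
x_nm, y_nm = under_sample_train(x_train, y_train, random=False)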
Example #5
    def ReSampling(self, data, labels, over_s=True):

        label_status = Counter(labels)
        print(self.tasktype, "data " + self.tasktype, label_status)

        featurelen = len(data[0])
        # Insert a dummy sample if one of the two classes is missing entirely.
        if 1 not in label_status.keys():
            x, y = np.zeros(shape=featurelen, dtype=int), 1
        elif 0 not in label_status.keys():
            x, y = np.zeros(shape=featurelen, dtype=int), 0
        else:
            x, y = None, None
        if x is not None:
            data = np.insert(data, 0, x, 0)
            labels = np.insert(labels, 0, y, 0)

        if len(label_status) < 2:
            print(self.tasktype, "no need to resample")
            return data, labels
        if 0.2 < label_status[1] / label_status[0] < 5.0:
            print("data are not biased too much")
            return data, labels

        maxSamples = label_status[0]
        if label_status[1] > label_status[0]:
            maxSamples = label_status[1]
            resampling = over_sampling.ADASYN(sampling_strategy={
                1: maxSamples,
                0: int(0.4 * maxSamples)
            })
        else:
            resampling = over_sampling.ADASYN(sampling_strategy={
                0: maxSamples,
                1: int(0.4 * maxSamples)
            })

        try:
            data, labels = resampling.fit_resample(data, labels)
        except Exception:
            print(self.tasktype, "resampling using random method")
            if over_s:
                resampling = over_sampling.RandomOverSampler()
            else:
                resampling = under_sampling.RandomUnderSampler()

            data, labels = resampling.fit_resample(data, labels)

        label_status = Counter(labels)
        print(self.tasktype, "sampling status=", label_status)

        return data, labels
Example #6
def resampling():
    """
    Function to do oversampling and undersampling
    Retuns:
        sampling_pipeline a samapling piepleine with steps as oversampling and undersampling
    """
    oversampling = over_sampling.SMOTE(random_state=42)
    undersampling = under_sampling.RandomUnderSampler(random_state=42)


    sampling_pipeline = imbPipeline([
    ('oversample', oversampling),
    ('undersample', undersampling)
    ])
    return sampling_pipeline
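
A sketch of one way to use the returned pipeline, assuming `imbPipeline` is `imblearn.pipeline.Pipeline`; the classifier and data names are illustrative only:

from sklearn.linear_model import LogisticRegression

pipe = resampling()
# Samplers only run during fit, so appending a classifier yields an
# estimator that resamples training data but never the data it scores.
pipe.steps.append(('clf', LogisticRegression(max_iter=1000)))
pipe.fit(X_train, y_train)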
Example #7
File: UMCE.py Project: w4k2/weles
def us_os_bac(base_clf, X_train, y_train, X_test, y_test):
    us = under_sampling.RandomUnderSampler()
    os = over_sampling.RandomOverSampler()
    X_us, y_us = us.fit_resample(X_train, y_train)
    X_os, y_os = os.fit_resample(X_train, y_train)
    us_clf = base.clone(base_clf)
    os_clf = base.clone(base_clf)
    us_clf.fit(X_us, y_us)
    os_clf.fit(X_os, y_os)
    us_pred = us_clf.predict(X_test)
    os_pred = os_clf.predict(X_test)
    return (
        metrics.balanced_accuracy_score(y_test, us_pred),
        metrics.balanced_accuracy_score(y_test, os_pred),
    )
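
An illustrative invocation of the comparison helper above, assuming train/test splits are available; the decision tree is an arbitrary choice of base classifier:

from sklearn import tree

bac_us, bac_os = us_os_bac(tree.DecisionTreeClassifier(random_state=0),
                           X_train, y_train, X_test, y_test)
print('balanced accuracy: under={:.3f}, over={:.3f}'.format(bac_us, bac_os))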
Example #8
    def fit(self, X, y):
        """
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The training input samples.
        y : array-like, shape (n_samples,)
            The target values. An array of int.

        Returns
        -------
        self : object
            Returns self.
        """
        # Check that X and y have correct shape
        X, y = check_X_y(X, y, multi_output=True)

        # Store the classes seen during fit
        self.classes_ = unique_labels(y)

        self.X_ = X
        self.y_ = y

        n_rows, n_features = X.shape
        _, n_targets = y.shape

        # Set up the Undersampler
        under_sampler = under_sampling.RandomUnderSampler(random_state=2)
        # Dict to store our trained classifiers
        self.classifiers_ = {}
        for i in range(n_targets):
            classifier = clone(self.base_classifier)
            X_to_fit = X
            y_to_fit = y[:, i]  # Select the i-th label
            if self.balance_classes:
                # Under-sample the training features and labels
                X_to_fit, y_to_fit = under_sampler.fit_resample(X, y[:, i])
            self.classifiers_[i] = classifier.fit(X_to_fit, y_to_fit)

        return self
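
This `fit` trains one classifier per target column (binary relevance), under-sampling each label independently. The matching `predict` is not shown in the listing; a hypothetical counterpart might look like:

import numpy as np

def predict(self, X):
    # Stack one column of predictions per per-label classifier
    # (hypothetical; mirrors the fit shown above).
    return np.column_stack(
        [self.classifiers_[i].predict(X) for i in sorted(self.classifiers_)])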
Example #9
def imbalance_set(X, y, operation):
    
    methods = {
        'smoteen': imb.SMOTEENN(),
        'smotetom': imb.SMOTETomek(),
        'adasyn': imbov.ADASYN(),
        'randomunder': imbun.RandomUnderSampler(),
        'condensed': imbun.CondensedNearestNeighbour(n_jobs=-1),
    }

    sm = methods[str(operation)]

    X_resampl, y_resampl = sm.fit_resample(X, y)
    
    return X_resampl, y_resampl
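
A usage sketch, assuming the module aliases the function references (`imb`, `imbov`, `imbun`) and an existing dataset `X`, `y`:

from imblearn import combine as imb
from imblearn import over_sampling as imbov
from imblearn import under_sampling as imbun

X_res, y_res = imbalance_set(X, y, 'randomunder')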
Example #10
def random_undersampling(features, labels):
    rus = under_sampling.RandomUnderSampler(random_state=0)
    return rus.fit_resample(X=features, y=labels)
Example #11
#
# Logistic Regression with Random Undersample
#
# Omar Trejo
# January, 2017
#

from imblearn import under_sampling
from collections import Counter
from sklearn import linear_model
from sklearn import model_selection

import functions
import setup

random_undersampler = under_sampling.RandomUnderSampler(
    random_state=setup.SEED, sampling_strategy='auto')

X_train_resampled, y_train_resampled = random_undersampler.fit_resample(
    setup.X_TRAIN, setup.Y_TRAIN)

print("-" * setup.LINE_LENGTH)
print("Before resample: {}".format(Counter(setup.Y_TRAIN)))
print("After resample:  {}".format(Counter(y_train_resampled)))
print("-" * setup.LINE_LENGTH)

model = linear_model.LogisticRegression(class_weight='balanced')

cross_validation = model_selection.GridSearchCV(
    param_grid=setup.CROSS_VALIDATION_GRID,
    cv=setup.STRATIFIED_K_FOLD,
    estimator=model,
)
Example #12
# Create features
features_df = pd.read_csv("CleanedData.csv")
del features_df['went_on_backorder=Yes']

# Create outcomes
df = pd.read_csv("CleanedData.csv")
outcomes_df = df['went_on_backorder=Yes']

# Create X and y arrays
X = features_df.to_numpy()
y = outcomes_df.to_numpy()
print('Original dataset shape {}'.format(Counter(y)))

# Apply the random under-sampling
pipeline = pl.make_pipeline(
    us.RandomUnderSampler(),
    linear_model.LogisticRegression(penalty='l2',
                                    C=1,
                                    solver='liblinear',
                                    random_state=0))

n_folds = 5
store_Roc_Auc = np.zeros(n_folds)
print(store_Roc_Auc)

# Split to test and train set
results = []
skf = StratifiedKFold(n_splits=n_folds)
for train_index, test_index in skf.split(X, y):
    probas = pipeline.fit(X[train_index],
                          y[train_index]).predict_proba(X[test_index])
Example #13
        print('Shape of dataset: ', data.shape)
        print('Count labels: ', data.groupby(['loan_status']).size())

        # Resample
        X = pd.concat((data.iloc[:, 0], data.iloc[:, 2:len(kept_cols)]),
                      axis=1)
        y = data.iloc[:, 1]
        model = None
        if algorithm == 'enn':
            model = us.EditedNearestNeighbours()
        elif algorithm == 'renn':
            model = us.RepeatedEditedNearestNeighbours()
        elif algorithm == 'all-knn':
            model = us.AllKNN()
        elif algorithm == 'random':
            model = us.RandomUnderSampler(sampling_strategy=1.0)
        if model is not None:
            X_res, y_res = model.fit_resample(X, y)
            data_reduced = np.concatenate(
                (X_res[:, 0][:, None], y_res[:, None],
                 X_res[:, 1:(len(kept_cols) - 1)]),
                axis=1)
            df_reduced = pd.DataFrame(data=data_reduced, columns=kept_cols)
        else:
            df_reduced = data

        # Split into train set and test set
        train, test = ms.train_test_split(df_reduced, test_size=0.2)
        print('Shape of train set:', train.shape)
        print('Count labels: ', train.groupby(['loan_status']).size())
        print('Shape of test set:', test.shape)
Example #14
#################### Decision Tree Classifier ####################
#dtree = DecisionTreeClassifier(criterion='entropy', random_state=0)
# We want to prune the tree above; that's not supported in sklearn, so we
# set a max depth instead. A max depth of 10 achieved the best accuracy.
dtree = DecisionTreeClassifier(criterion='entropy',
                               random_state=0,
                               max_depth=10)
dtree.fit(censusTrain, censusLabels)
error = metrics.mean_absolute_error(testLabels, dtree.predict(testFeatures))
print('For Decision Tree Classifier, the mean absolute error is: {0}'.format(
    error))  # 0.13911921872120878
print('and the accuracy score is thus: {0}\n'.format(1 - error))

#################### Naive Bayes Classifier ####################
under = un.RandomUnderSampler(random_state=0)
sampledTestTrain, sampledTestLabels = under.fit_resample(testFeatures,
                                                         testLabels)
nbayes = BernoulliNB()
nbayes.fit(censusTrain, censusLabels)
error = metrics.mean_absolute_error(sampledTestLabels,
                                    nbayes.predict(sampledTestTrain))
print('For Naive Bayes Classifier, the mean absolute error is: {0}'.format(
    error))  # 0.24167966718668746 <- This seems a bit high?
print('and the accuracy score is thus: {0}\n'.format(1 - error))

############################## K Nearest Neighbors Classifiers ##############################
# Tried several values and recorded the accuracy for comparison and discussion in the report.
#################### K Nearest Neighbors Classifier (KNN = 1) ####################
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(censusTrain, censusLabels)
Example #15
print('Precision ', metrics.precision_score(y_test, y_pred))
print('F1 ', metrics.f1_score(y_test, y_pred))

metrics.plot_confusion_matrix(cls, X_test, y_test, normalize='true')

# ## Random Over- and Under-Sampling Approach

# In[12]:

from imblearn import over_sampling as over
from imblearn import under_sampling as under

# Over-sample the minority up to 10% of the majority, then under-sample the
# majority down to twice the minority (sampling_strategy = minority/majority).
# Instance names avoid shadowing the `over`/`under` module aliases.
ros = over.RandomOverSampler(sampling_strategy=0.1)
X_over, y_over = ros.fit_resample(X, y)
print(y_over.value_counts())
rus = under.RandomUnderSampler(sampling_strategy=0.5)
X_ou, y_ou = rus.fit_resample(X_over, y_over)
print(y_ou.value_counts())

# ### 2D Distribution
#
# *Blue data points reduced; orange data points still overlapping*

# In[13]:

from sklearn.decomposition import TruncatedSVD
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np

X2 = TruncatedSVD(n_components=2).fit_transform(X_ou)
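
The listing cuts off here; a hypothetical continuation (not from the original source) that scatters the two classes in the SVD plane might be:

# Hypothetical plotting continuation.
for label, color in zip(np.unique(y_ou), ('tab:blue', 'tab:orange')):
    mask = np.asarray(y_ou == label)
    plt.scatter(X2[mask, 0], X2[mask, 1], s=5, c=color, label=str(label))
plt.legend()
plt.show()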
Example #16
# Define the path of the ground for the prostate
path_gt = ['GT_inv/prostate', 'GT_inv/pz', 'GT_inv/cg', 'GT_inv/cap']
# Define the label of the ground-truth which will be provided
label_gt = ['prostate', 'pz', 'cg', 'cap']
# Define the path where to store the data
path_store = '/data/prostate/balanced/mp-mri-prostate/exp-3'

N_JOBS = -1
# Create the list of under-samplers and over-samplers to use.
# Note: newer imblearn versions replace SMOTE's `kind` parameter with the
# BorderlineSMOTE class, and InstanceHardnessThreshold now requires an
# estimator object (assumed: from sklearn.ensemble import RandomForestClassifier)
# instead of a string name.
samplers = [
    under_sampling.InstanceHardnessThreshold(
        n_jobs=N_JOBS, estimator=RandomForestClassifier(n_jobs=N_JOBS)),
    under_sampling.NearMiss(version=1, n_jobs=N_JOBS),
    under_sampling.NearMiss(version=2, n_jobs=N_JOBS),
    under_sampling.NearMiss(version=3, n_jobs=N_JOBS),
    under_sampling.RandomUnderSampler(),
    over_sampling.SMOTE(n_jobs=N_JOBS),
    over_sampling.BorderlineSMOTE(kind='borderline-1', n_jobs=N_JOBS),
    over_sampling.BorderlineSMOTE(kind='borderline-2', n_jobs=N_JOBS),
    over_sampling.RandomOverSampler()
]
# Define the sub-folder to use
sub_folder = [
    'iht', 'nm1', 'nm2', 'nm3', 'rus', 'smote', 'smote-b1', 'smote-b2', 'ros'
]

# Generate the different path to be later treated
path_patients_list_gt = []
# Create the generator
id_patient_list = [
    name for name in os.listdir(path_patients)
Example #17
from imblearn import pipeline as pl

# create features
features_df = pd.read_csv("CleanedData.csv")
del features_df['went_on_backorder=Yes']

# Create outcomes
df = pd.read_csv("CleanedData.csv")
outcomes_df = df['went_on_backorder=Yes']

# Create X and y arrays
X = features_df.to_numpy()
y = outcomes_df.to_numpy()

# Apply the random under-sampling
pipeline = pl.make_pipeline(us.RandomUnderSampler(),
                            svm.SVC(kernel='rbf', C=1.0, probability=True))


# Split to test and train set
results = []
skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(X, y):
    probas = pipeline.fit(X[train_index], y[train_index]).predict_proba(X[test_index])
    preds = probas[:, 1]
    fpr, tpr, threshold = roc_curve(y[test_index], preds)

    print("Show fpr  ----------------------------")
    print(fpr)
    print("Show tpr -----------------------------")
Example #18
def resample_classes(X,
                     Y,
                     how='und1',
                     random_state=None,
                     test_size=0.3,
                     n_jobs=2,
                     split=True,
                     verbose=True):
    """

    """
    if how == 'und1':
        if verbose:
            msg = 'Under-sampling the majority class(es) by randomly picking '
            msg += 'samples without replacement'
            print(msg)
        samp = imbus.RandomUnderSampler(random_state=random_state,
                                        replacement=False)
        X_res, y_res = samp.fit_resample(X, Y)
    elif how == 'und2':
        if verbose:
            msg = 'Under-sampling by generating centroids based on clustering '
            msg += 'methods'
            print(msg)
        samp = imbus.ClusterCentroids(sampling_strategy='auto',
                                      random_state=random_state,
                                      estimator=None)
        X_res, y_res = samp.fit_resample(X, Y)
    elif how == 'und3':
        if verbose:
            print('Under-sampling based on NearMiss methods')
        samp = imbus.NearMiss(sampling_strategy='auto',
                              version=1,
                              n_neighbors=3,
                              n_neighbors_ver3=3,
                              n_jobs=n_jobs)
        X_res, y_res = samp.fit_resample(X, Y)
    elif how == 'over1':
        if verbose:
            msg = 'Over-sampling the minority class(es) by picking samples at '
            msg += 'random with replacement'
            print(msg)
        samp = imbov.RandomOverSampler(random_state=random_state)
        X_res, y_res = samp.fit_resample(X, Y)
    elif how == 'over2':
        if verbose:
            msg = 'Over-sampling using SMOTE - Synthetic Minority Over-sampling '
            msg += 'Technique'
            print(msg)
        X_res, y_res = X, Y
        for i in range(3):
            # Note: a float sampling_strategy assumes a binary problem.
            samp = imbov.SMOTE(random_state=random_state,
                               sampling_strategy=.99,
                               k_neighbors=5,
                               n_jobs=n_jobs)
            X_res, y_res = samp.fit_resample(X_res, y_res)
    elif how == 'over3':
        if verbose:
            msg = 'Over-sampling using ADASYN - Adaptive Synthetic Sampling '
            msg += 'Approach for Imbalanced Learning'
            print(msg)
        X_res, y_res = X, Y
        for i in range(3):
            samp = imbov.ADASYN(sampling_strategy=.93,
                                random_state=random_state,
                                n_neighbors=5,
                                n_jobs=n_jobs)
            X_res, y_res = samp.fit_resample(X_res, y_res)
    elif how == 'comb1':
        if verbose:
            print('Combine over- and under-sampling using SMOTE and Tomek links.')
        X_res, y_res = X, Y
        for i in range(3):
            samp = imbcom.SMOTETomek(sampling_strategy=.99,
                                     random_state=random_state,
                                     smote=None,
                                     tomek=None,
                                     n_jobs=n_jobs)
            X_res, y_res = samp.fit_resample(X_res, y_res)
    else:
        print('Sampling approach not recognized')
        return

    if verbose:
        print('\t\t\t1\t2\t3\t4')
        val_y = pd.Series(Y).value_counts(sort=False).values
        msg = 'Counts in y_init:\t{}\t{}\t{}\t{} '
        print(msg.format(val_y[0], val_y[1], val_y[2], val_y[3]))
        val_yres = pd.Series(y_res).value_counts(sort=False).values
        msg = 'Counts in y_resamp:\t{}\t{}\t{}\t{} '
        print(msg.format(val_yres[0], val_yres[1], val_yres[2], val_yres[3]))

    if split:
        X_train, X_test, y_train, y_test = train_test_split(
            X_res, y_res, test_size=test_size, random_state=random_state)
        if verbose:
            val_ytr = pd.Series(y_train).value_counts(sort=False).values
            msg = 'Counts in y_train:\t{}\t{}\t{}\t{} '
            print(msg.format(val_ytr[0], val_ytr[1], val_ytr[2], val_ytr[3]))

            val_yte = pd.Series(y_test).value_counts(sort=False).values
            msg = 'Counts in y_test:\t{}\t{}\t{}\t{} '
            print(msg.format(val_yte[0], val_yte[1], val_yte[2], val_yte[3]))

            print('X_train:', X_train.shape, ', X_test:', X_test.shape)

        return X_train, X_test, y_train, y_test
    else:
        return X_res, y_res
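
A minimal sketch of calling `resample_classes`, assuming the module aliases it references and a four-class labelled dataset `X`, `Y`:

from imblearn import under_sampling as imbus
from imblearn import over_sampling as imbov
from imblearn import combine as imbcom
from sklearn.model_selection import train_test_split
import pandas as pd

X_train, X_test, y_train, y_test = resample_classes(
    X, Y, how='und1', random_state=0, test_size=0.3)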