from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Load the iris dataset
iris = load_iris()
# Make the dataset imbalanced: drop the first 25 samples
# (half of the first class) and the last sample.
iris.data = iris.data[25:-1, :]
iris.target = iris.target[25:-1]

X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    random_state=RANDOM_STATE)

# Create a pipeline that undersamples with NearMiss-2 before fitting the SVM.
# BUG FIX: NearMiss is deterministic; its random_state parameter was
# deprecated in imbalanced-learn 0.4 and removed in 0.6, so passing it
# raises TypeError on current versions.
pipeline = make_pipeline(NearMiss(version=2),
                         LinearSVC(random_state=RANDOM_STATE))
pipeline.fit(X_train, y_train)

# Classify and report the results
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))
Пример #2
0
                X_batch, y_batch = next(training_generator)
                sess.run(
                    [train_op, loss],
                    feed_dict={
                        data: X_batch,
                        targets: y_batch
                    },
                )

            # For each epoch, run accuracy on train and test
            predicts_train = sess.run(predict, feed_dict={data: X})
            print("epoch: {} train accuracy: {:.3f}".format(
                e, accuracy(y, predicts_train)))


@pytest.mark.parametrize("sampler", [None, NearMiss(), RandomOverSampler()])
def test_balanced_batch_generator(data, sampler):
    """Dispatch to the TF-1.x or TF-2.x-compat checker based on the
    installed tensorflow version."""
    checker = (
        check_balanced_batch_generator_tf_1_X_X
        if LooseVersion(tf.__version__) < '2'
        else check_balanced_batch_generator_tf_2_X_X_compat_1_X_X
    )
    checker(data, sampler)


@pytest.mark.parametrize("keep_sparse", [True, False])
def test_balanced_batch_generator_function_sparse(data, keep_sparse):
    X, y = data

    training_generator, steps_per_epoch = balanced_batch_generator(
        sparse.csr_matrix(X),
        y,
        keep_sparse=keep_sparse,
Пример #3
0
# heuristic rules in order to select samples. NearMiss-1 selects samples from
# the majority class for which the average distance to the :math:`k` nearest
# samples of the minority class is the smallest. NearMiss-2 selects the samples
# from the majority class for which the average distance to the farthest
# samples of the negative class is the smallest. NearMiss-3 is a 2-step
# algorithm: first, for each minority sample, their :math:`m`
# nearest-neighbors will be kept; then, the majority samples selected are the
# ones for which the average distance to the :math:`k` nearest neighbors is
# the largest.

# %%
from imblearn.under_sampling import NearMiss

X, y = create_dataset(n_samples=1000, weights=(0.05, 0.15, 0.8), class_sep=1.5)

samplers = [NearMiss(version=1), NearMiss(version=2), NearMiss(version=3)]

fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(15, 25))
for ax, sampler in zip(axs, samplers):
    model = make_pipeline(sampler, clf).fit(X, y)
    plot_decision_function(
        X,
        y,
        model,
        ax[0],
        title=f"Decision function for {sampler.__class__.__name__}-{sampler.version}",
    )
    plot_resampling(
        X,
        y,
        sampler,
print('Accuracy : ', accuracy_score(y, predictions), end='\n')

print('Classification Report : ', end='\n')
print(classification_report(y, predictions))

# # Class Imbalance - UnderSampling

# In[96]:

####################### Class Imbalance - Undersampling ######################
# Features are every column except the label; labels come from renewal_status.
X_train = df_train.iloc[:, df_train.columns != 'renewal_status'].values
y_train = df_train['renewal_status'].values

from imblearn.under_sampling import NearMiss
nr = NearMiss()
# BUG FIX: fit_sample() was deprecated in imbalanced-learn 0.4 and removed in
# 0.8 -- fit_resample() is the supported API and returns the same (X, y) pair.
X_train, y_train = nr.fit_resample(X_train, y_train)

# In[97]:

# Class counts after undersampling (should now be balanced).
np.bincount(y_train)

# In[98]:

# import the ML algorithm
from xgboost import XGBClassifier

# Instantiate the classifier
xgbClassifier = XGBClassifier(random_state=1, learning_rate=0.01)

# Train classifier
Пример #5
0
# Parse a delimited data file into a feature matrix and a label vector, then
# undersample with NearMiss-2 and dump the balanced rows to a CSV file.
labels = []
index = 0
for line in contain:  # read the data file line by line
    line = line.strip()  # strip leading/trailing whitespace
    listFormLine = re.split(r'[ ,;:\t]+', line)  # split on spaces, ',', ';', ':' or tabs
    '''将listFormLine中的前len(len(listFormLine)-1)列加入到矩阵中去'''
    # NOTE(review): this assigns the row to the slice features[index:] (from
    # index to the end); features[index, :] looks intended -- confirm.
    features[index:] = listFormLine[0:len(listFormLine) - 1]
    labels.append(listFormLine[-1])  # the last column is the class label
    index += 1
    '''返回的features为特征矩阵,labels为类别列表'''
labels=np.array([int(x) for x in labels])
file.close()
X=features
y=labels
# Apply Nearmiss
nm=NearMiss(version=2)
X_resampled = []
y_resampled = []
# NOTE(review): fit_sample() was removed in imbalanced-learn 0.8; current
# versions require fit_resample() instead.
X_res,y_res=nm.fit_sample(X,y)
X_resampled.append(X_res)
y_resampled.append(y_res)
y_resampled=y_resampled[0]
X_resampled=X_resampled[0]
# Add a trailing axis so the label column can be hstacked with the features.
y_resampled=y_resampled[:,np.newaxis]

resampled=np.hstack((X_resampled,y_resampled)).tolist()
f=open("re_NearMiss2.csv",'w')
# Write comma-separated values; only the non-final columns get a separator
# here -- the last column's write is presumably below this excerpt.
for i in range(len(resampled)):
    for j in range(len(resampled[i])):
        if j<len(resampled[i])-1:
            f.write(str(resampled[i][j])+',')
Пример #6
0
    def sample_data(self,
                    sampling_method: str,
                    X_train,
                    Y_train,
                    base_file_name,
                    target_column="star_rating"):
        """
        Creates a sampler based on sampling_method and returns the resulting X and y.

        This method will also save the final distribution to a CSV file based
        on base_file_name.

        :param sampling_method: one of "smote", "adasyn",
            "random_over_sampling", "random_under_sampling" or "nearmiss2"
        :param X_train: Original features
        :param Y_train: Original labels
        :param base_file_name: base file name to save the final distribution csv
        :param target_column: name of the label column in Y_train
        :return: tuple (X_train, Y_train) of resampled features and labels
        :raises Exception: if sampling_method is not supported
        """
        log.debug(f'Y_train {Y_train.shape}')
        log.debug(f'Y_train {Y_train.head()}')

        # Class distribution before resampling, for logging only.
        grouped_df = Y_train.reset_index().groupby(target_column).count()

        log.info(
            f'Distribution before sampling with {sampling_method}\n{grouped_df}'
        )
        log.debug(f'grouped type: {type(grouped_df)}')
        log.debug(f'grouped: {grouped_df.head()}')
        log.debug(f'grouped: {grouped_df.shape}')

        if sampling_method == "smote":
            sampler = SMOTE(random_state=RSTATE,
                            sampling_strategy='not majority',
                            n_jobs=self.n_jobs)
        elif sampling_method == "adasyn":
            sampler = ADASYN(random_state=RSTATE,
                             sampling_strategy='not majority',
                             n_jobs=self.n_jobs)
        elif sampling_method == "random_over_sampling":
            sampler = RandomOverSampler(random_state=RSTATE,
                                        sampling_strategy='not majority')
        elif sampling_method == "random_under_sampling":
            sampler = RandomUnderSampler(random_state=RSTATE, replacement=True)
        elif sampling_method == "nearmiss2":
            # BUG FIX: NearMiss is deterministic; its random_state parameter
            # was deprecated in imbalanced-learn 0.4 and removed in 0.6, so
            # passing it raises TypeError on current versions.
            sampler = NearMiss(sampling_strategy='not minority',
                               version=2,
                               n_jobs=self.n_jobs)
        else:
            raise Exception(
                f"Sampling method not supported: {sampling_method}")

        X_train_res, Y_train_res = sampler.fit_resample(
            X_train, Y_train.ravel())

        # Rebuild DataFrames so downstream code keeps the original columns.
        X_train = pd.DataFrame(X_train_res, columns=X_train.columns)
        Y_train = pd.DataFrame(Y_train_res, columns=[target_column])

        # Class distribution after resampling.
        dist = Y_train.reset_index().groupby(target_column).count()

        log.info(f'Distribution after sampling with {sampling_method}\n{dist}')

        log.debug(dist.head())
        dist.to_csv(
            f'{REPORT_DIR}/{base_file_name}-histogram-{sampling_method}.csv')
        return X_train, Y_train
Пример #7
0
def crossvalidate(directory_name,
                  splits,
                  data,
                  X,
                  y,
                  baseline=-1,
                  model_num=None,
                  resample=0,
                  feature_set=None,
                  feature_importance=0,
                  average_method='macro',
                  path=None):
    """
    Run stratified k-fold cross-validation and store the results in files.

    Arguments:
    directory_name (str): the directory under which the files should be stored
    splits (int): number of folds
    data (dataframe): the whole dataset
    X (dataframe): examples
    y (dataframe): target/label
    baseline (int): -1 for no baseline, 1 for all predictions as 1, 0 for all
        predictions as 0
    model_num (int): classification model selector passed to get_model()
    resample (int): -1 for undersampling (NearMiss-3), 1 for oversampling
        (SMOTE) and 0 for no resampling
    feature_set (list): list of features to be considered
    feature_importance (int): 0 for absent, 1 for present
    average_method: macro by default
    path: the path to the directory where the recordings should be stored
    """
    # Per-fold output directories: <path>/<directory_name>/<fold>/
    dir_name = path + directory_name + '/'
    os.mkdir(dir_name)
    for fold in range(1, splits + 1):
        os.mkdir(dir_name + str(fold))
        print(dir_name + str(fold))
    # Config and metrics files stay open across the whole run and are
    # written/closed at the end (preserves the original create-early behavior).
    config_file = open(dir_name + 'config.json', 'w')
    metrics_file = open(dir_name + 'metrics.json', 'w')

    # BUG FIX: the original code rebound data_dict twice in a row
    # (data_dict = {'model_num': ...} followed by data_dict = {'baseline': ...}),
    # silently dropping 'model_num' from the saved configuration.
    data_dict = {'model_num': model_num, 'baseline': baseline}
    data_dict.update({'resample': resample})
    data_dict.update({'feature_set': feature_set})
    # NOTE(review): n_features is neither a parameter nor a local --
    # presumably a module-level global; confirm.
    data_dict.update({'n_features': n_features})
    data_dict.update({'feature_importance': feature_importance})

    # One list of per-fold scores per metric; averaged after the CV loop.
    metrics_dict = {
        key: list()
        for key in ('f1_macro', 'tpr', 'tnr', 'fpr', 'precision', 'recall',
                    'accuracy', 'f1')
    }

    model = get_model(model_num)
    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)

    plot_lc(model=model, cv=kfold, X=X, y=y, resample=resample)
    # Linearity diagnostic on the full dataset.
    test_for_linearity(X, y)

    i = 0
    for train_index, test_index in kfold.split(X, y):
        # Create the train/test splits for this fold.
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]
        # Resample only the training split (if applicable).
        if resample == -1:
            # Undersample with NearMiss-3: a 2-step algorithm -- first, for
            # each minority sample, their m nearest-neighbors are kept; then
            # the majority samples selected are the ones for which the average
            # distance to the k nearest neighbors is the largest.
            nm = NearMiss(version=3)
            print(str(sorted(Counter(y_train).items())))
            X_train, y_train = nm.fit_resample(X_train, y_train)
            print(sorted(Counter(y_train).items()))
        elif resample == 1:
            # Oversample the minority class with SMOTE.
            X_train, y_train = SMOTE().fit_resample(X_train, y_train)
            print(sorted(Counter(y_train).items()))
        # Record the training-set class distribution for this fold.
        with open(dir_name + str(i + 1) + '/train_val_dist.csv',
                  'a') as dist_file:
            dist_file.write(str(sorted(Counter(y_train).items())))
            dist_file.write('\n')

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # NOTE(review): model.predict typically returns a numpy array, which
        # has no .replace method -- these baseline branches look like they
        # expect a pandas Series; confirm before relying on them.
        if baseline == 0:
            y_pred = y_pred.replace(1, 0)
        elif baseline == 1:
            y_pred = y_pred.replace(0, 1)

        metrics = get_metrics(y_test, y_pred)
        for key, value in metrics.items():
            metrics_dict[key].append(value)

        # Homoscedasticity diagnostic on this fold's split.
        test_for_homoscedasticity(X_train, y_train, X_test, y_test)

        # Correlation diagnostic on the whole dataset.
        correlation(data)

        if feature_importance == 1:
            if model_num == 1:
                feat_importances = pd.Series(model.feature_importances_,
                                             index=X.columns)
            elif model_num == 3:
                # NOTE(review): 'svm' is not defined in this function --
                # presumably a module-level estimator; model.coef_ may have
                # been intended. Confirm.
                feat_importances = pd.Series(abs(svm.coef_[0]),
                                             index=X.columns)
            if model_num != 2:
                print('Feat. Imp.: ', feat_importances)
                feat_importances.nlargest(20).plot(kind='barh')
                plt.show()

                # Write the per-feature importance values to the file.
                with open(dir_name + str(i + 1) + '/feature_importances.csv',
                          'a') as fi_file:
                    for ind in range(0, len(feature_set)):
                        fi_file.write(feature_set[ind] + ',' +
                                      str(feat_importances[ind]) + '\n')

            perm = PermutationImportance(model,
                                         random_state=1).fit(X_train, y_train)
            print('PERM: ', perm.feature_importances_)
            display(
                eli5.show_weights(perm,
                                  feature_names=X_train.columns.tolist()))

            # Write the permutation-importance (decrease in error) values.
            with open(
                    dir_name + str(i + 1) +
                    '/permutation_feature_importances.csv', 'a') as pfi_file:
                for ind in range(0, len(feature_set)):
                    pfi_file.write(feature_set[ind] + ',' +
                                   str(perm.feature_importances_[ind]) + '\n')
                pfi_file.write('\n')

        i += 1
    # Average each metric across the folds.
    for key, values in metrics_dict.items():
        metrics_dict[key] = sum(values) / len(values)

    # Write the scores to the file.
    json.dump(metrics_dict, metrics_file)
    metrics_file.close()

    # Write the configuration values to the file.
    json.dump(data_dict, config_file)
    config_file.close()
Пример #8
0
    def undersample_data(self, X, y):
        """Balance (X, y) by undersampling the majority class with NearMiss.

        :param X: feature array (one signal per row)
        :param y: label array
        :return: tuple (heart_signal_res, labels_res) where the features are
            flattened to shape (n_samples,)
        """
        # BUG FIX: current imbalanced-learn makes sampler parameters
        # keyword-only, and fit_sample() was removed in 0.8 -- use the
        # sampling_strategy keyword and fit_resample() instead.
        under_sampler = NearMiss(sampling_strategy='majority', n_jobs=2)
        heart_signal_res, labels_res = under_sampler.fit_resample(X, y)

        heart_signal_res = np.reshape(heart_signal_res, (heart_signal_res.shape[0],))
        return heart_signal_res, labels_res
#importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#reading data
data = pd.read_csv("creditcard.csv")

#splitting data into features (first 30 columns) and target (last column)
read = data.columns.tolist()
x = data.iloc[:, 0:30]
y = data.iloc[:, -1]

#analyzing data
print(x.shape)
print(y.shape)
print(data.isnull().values.any())  #looking for null values

# Per-class transaction counts.
# BUG FIX: pd.value_counts() is deprecated -- call value_counts() on the
# Series instead (identical result).
dif = count_classes = data["Class"].value_counts(sort=True)
print(dif)

#plotting the imbalanced data
dif.plot(kind="bar", rot=0)
plt.title("Fraud vs Normal transactions")
plt.xlabel("Class")
plt.ylabel("Frequency")
plt.show()

#undersample the majority class with NearMiss
from imblearn.under_sampling import NearMiss
nm = NearMiss()
# BUG FIX: fit_sample() was deprecated in imbalanced-learn 0.4 and removed in
# 0.8 -- fit_resample() is the supported API.
x_new, y_new = nm.fit_resample(x, y)
print(x_new.shape)
print(y_new.shape)

from collections import Counter
print("Original Data", format(Counter(y)))
print("resample Data", format(Counter(y_new)))
Пример #10
0
from imblearn.under_sampling import NearMiss

## Build the fertility dataframe (the file has no header row).
## BUG FIX: header=-1 was removed from pandas.read_csv -- header=None is the
## documented way to say "no header row".
fertility_df = pandas.read_csv('fertility_Diagnosis.txt', header=None)
fertility_df.columns = ['Season','Age','Childish diseases','Accident or serious trauma','Surgical intervention',
          'High fevers in the last year','Frequency of alcohol consumption','Smoking habit',
          'Number of hours spent sitting per day ene-16','Output']
## Map the output column to numeric values
fertility_df['Output'] = fertility_df['Output'].map({'N': 0, 'O': 1}).astype(int)

## Split the labels out of the samples
fertility_df_output = fertility_df['Output']
del fertility_df['Output']

## Balance the data based on the imbalanced outputs of the dataset.
## BUG FIX: NearMiss is deterministic; random_state was removed in
## imbalanced-learn 0.6, and fit_sample() was removed in 0.8.
nm = NearMiss()
fertility_df_balanced, fertility_output_balanced = nm.fit_resample(fertility_df, fertility_df_output)

## Split the data: 70% train / 30% test
training_data, test_data, training_output, test_output = train_test_split(fertility_df_balanced, fertility_output_balanced, test_size=0.3, random_state=42)

## Train the MLP
quantidade_features = training_data.shape[1]

mlp = MultiLayerPerceptron(
    numero_de_entradas=quantidade_features, neuronios_por_camada=[quantidade_features, 1],
    taxa_aprendizagem=0.5, epocas=5000, precisao=0, debug_training=False, plot=False
)
    # Evaluate each random under/over-sampler: resample the training data,
    # fit an SVM on the balanced set, and score it on the untouched test set.
    for key in random_samplers:
        print("######################## %s ########################" % (key))
        rus = random_samplers.get(key)
        model = logistic_regression.Module(X_train.shape[1], 2)
        # fit_resample() replaces the fit_sample() alias removed in
        # imbalanced-learn 0.8.
        X_res, y_res = rus.fit_resample(X_train, y_train)
        print(X_train.shape)
        print(X_res.shape, y_res.shape)
        print(np.sum(y_res))
        clf = SVC(probability=True)
        clf.fit(X_res, y_res)
        score = clf.predict_proba(X_test)
        evaluate(y_test, score)

    # NearMiss versions 1-3. NearMiss is deterministic, so the random_state
    # parameter (removed in imbalanced-learn 0.6) is not passed.
    near_miss_models = {
        'near miss1': NearMiss(version=1),
        'near miss2': NearMiss(version=2),
        'near miss3': NearMiss(version=3)
    }
    for key in near_miss_models:
        print("######################## %s ########################" % (key))
        nm = near_miss_models.get(key)
        model = logistic_regression.Module(X_train.shape[1], 2)
        # BUG FIX: the original resampled with `rus` (the sampler left over
        # from the previous loop) instead of the NearMiss sampler `nm`
        # selected here, so the three NearMiss variants were never evaluated.
        X_res, y_res = nm.fit_resample(X_train, y_train)
        print(X_train.shape)
        print(X_res.shape, y_res.shape)
        print(np.sum(y_res))
        clf = SVC(probability=True)
        clf.fit(X_res, y_res)
        score = clf.predict_proba(X_test)
        evaluate(y_test, score)
def near_miss(X, y):
    """Undersample (X, y) with the default NearMiss strategy and return
    the resampled (X_res, y_res) pair."""
    sampler = NearMiss()
    return sampler.fit_resample(X, y)
Пример #13
0
def test_nearmiss_wrong_version():
    """Check that an unknown NearMiss version raises a ValueError.

    Modernized for consistency with test_nearmiss_error below:
    assert_raises (removed from sklearn.utils.testing), the fit_sample alias
    (removed in imbalanced-learn 0.8) and NearMiss's random_state parameter
    (removed in 0.6) are replaced by pytest.raises and fit_resample.
    """
    version = 1000
    nm1 = NearMiss(version=version)
    with pytest.raises(ValueError):
        nm1.fit_resample(X, Y)
def plot_lc(model, X, y, cv, resample=0):
    """Plot a learning curve (macro-F1 vs. training-set size) for *model*.

    model: estimator passed to sklearn's learning_curve
    X, y: full dataset (examples / labels)
    cv: cross-validation splitter
    resample (int): -1 to undersample (NearMiss-3), 1 to oversample (SMOTE),
        0 for no resampling

    NOTE(review): the resampled X_train/y_train computed below are never
    used -- learning_curve is called on the full X and y, so the resample
    branches currently have no effect on the plot. Confirm intent.
    """
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)
    #resample the training set (if applicable)
    if resample == -1:
        #undersample
        '''NearMiss 3 . NearMiss-3 is a 2-step algorithm: first, for each minority sample, 
        their :m nearest-neighbors will be kept; then, the majority samples selected are the 
        on for which the average distance to the k nearest neighbors is the largest.'''
        nm = NearMiss(version=3)
        #print(str(sorted(Counter(y_train).items())))
        X_resampled, y_resampled = nm.fit_resample(X_train, y_train)
        X_train = X_resampled
        y_train = y_resampled
        #print(sorted(Counter(y_train).items()))
    elif resample == 1:
        #oversample
        X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)
        X_train = X_resampled
        y_train = y_resampled
        print(sorted(Counter(y_resampled).items()))
    # Score the model over 50 training-set sizes (1% to 100% of the data).
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=model,
        X=X,
        y=y,
        train_sizes=np.linspace(0.01, 1.0, 50),
        cv=cv,
        scoring='f1_macro')

    # Create means and standard deviations of training set scores
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)

    # Create means and standard deviations of test set scores
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Draw lines
    plt.plot(train_sizes,
             train_mean,
             '--',
             color="#111111",
             label="Training score")
    plt.plot(train_sizes,
             test_mean,
             color="#111111",
             label="Cross-validation score")

    # Draw bands (+/- one standard deviation around each mean)
    plt.fill_between(train_sizes,
                     train_mean - train_std,
                     train_mean + train_std,
                     color="#DDDDDD")
    plt.fill_between(train_sizes,
                     test_mean - test_std,
                     test_mean + test_std,
                     color="#DDDDDD")

    # Create plot
    plt.title("Learning Curve")
    plt.xlabel("Training Set Size"), plt.ylabel("Macro-F1 Score"), plt.legend(
        loc="best")
    plt.tight_layout()
    plt.show()
Пример #15
0
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GroupKFold
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

train_data = pd.read_csv(root_dir + "train.csv")

X = train_data.loc[:, 'mean_x':].values
y = train_data.loc[:, 'activity_id'].values
groups = train_data.loc[:, 'user_id'].values

#%%-------------------------------------------------------------------------
start_time = time.time()

nm = NearMiss(random_state=31416, ratio='auto', n_jobs=-1)
sm1 = SMOTE(random_state=31416, ratio='auto', k_neighbors=5, n_jobs=-1)
sm3 = SMOTE(random_state=31416, ratio='auto', k_neighbors=5, n_jobs=-1)
ros = RandomOverSampler(random_state=31416)

gkf = GroupKFold(n_splits=4)

scores_test = []
scores_train = []
for train_index, test_index in gkf.split(X, y, groups):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    '''
    activity_filter = (y_train==1) | (y_train==5)
    X_res = X_train[activity_filter]
Пример #16
0
def test_nearmiss_error(nearmiss_params, err_msg):
    """NearMiss built with invalid parameters must raise a ValueError whose
    message matches err_msg."""
    sampler = NearMiss(**nearmiss_params)
    with pytest.raises(ValueError, match=err_msg):
        sampler.fit_resample(X, Y)
Пример #17
0
def underSampling(X, Y):
    """Return (X, Y) undersampled with NearMiss version 1."""
    sampler = NearMiss(version=1)
    return sampler.fit_resample(X, Y)
Пример #18
0
                           weights=[0.1, 0.9],
                           n_informative=3,
                           n_redundant=1,
                           flip_y=0,
                           n_features=20,
                           n_clusters_per_class=1,
                           n_samples=5000,
                           random_state=10)

# Instanciate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply NearMiss-3
nm3 = NearMiss(version=3)
# BUG FIX: fit_sample() was removed in imbalanced-learn 0.8 --
# fit_resample() is the supported API.
X_resampled, y_resampled = nm3.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0],
            X_vis[y == 0, 1],
            label="Class #0",
            alpha=0.5,
            edgecolor=almost_black,
            facecolor=palette[0],
            linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0],
            X_vis[y == 1, 1],
def main():
    """Train models over every combination of undersampler and outlier
    detector: standardize, optionally remove outliers, optionally resample,
    then run a baseline fit plus two rounds of permutation-based pruning."""
    # Candidate undersamplers (None = keep the data and use sample weights).
    # NOTE(review): NearMiss's random_state parameter was deprecated in
    # imbalanced-learn 0.4 and removed in 0.6; on current versions these
    # constructors raise TypeError.
    samplers = [
        None,
        InstanceHardnessThreshold(sampling_strategy='majority',
                                  random_state=123,
                                  n_jobs=-1),
        NearMiss(version=1,
                 sampling_strategy='majority',
                 random_state=123,
                 n_jobs=-1),
        NearMiss(version=3,
                 sampling_strategy='majority',
                 random_state=123,
                 n_jobs=-1),
        RandomUnderSampler(sampling_strategy='majority', random_state=123)
    ]

    # Candidate outlier removers (None = keep all samples).
    # NOTE(review): IsolationForest's behaviour parameter was removed in
    # scikit-learn 0.24; this call only works on older versions.
    outliers = [
        None,
        IsolationForest(random_state=123, behaviour='new', contamination=0.1),
        LocalOutlierFactor(n_neighbors=27, contamination=0.1)
    ]

    for sampler in samplers:
        for out in outliers:

            # Globals used elsewhere (presumably by Model/analyze_results)
            # to tag the recorded results.
            global sampler_str, out_str, perm_str
            sampler_str = sampler.__class__.__name__
            out_str = out.__class__.__name__

            print(f"\nsampler={sampler_str}, outlier={out_str}")

            X, y, X_valid, y_valid = Dataset.read_all()
            X, y, X_valid, y_valid = Modification.apply_standartization(
                X, y, X_valid, y_valid)

            print(X.shape)

            if out is not None:
                X, y = Modification.apply_outliers(X, y, out)
                print(X.shape)

            if sampler is None:
                # No resampling: compensate for imbalance with sample weights.
                weights, weight_valid = Modification.make_weights_column(
                    X, y, X_valid, y_valid)
            else:
                weights, weight_valid = None, None
                X, y = Modification.apply_samplers(X, y, sampler)
                # InstanceHardnessThreshold gets a follow-up random
                # undersampling pass.
                if "Instance" in sampler_str:
                    X, y = Modification.apply_samplers(
                        X, y,
                        RandomUnderSampler(sampling_strategy='majority',
                                           random_state=123))

            print("0st perm:")
            perm_str = "0st"
            est = Model.train(X, y, X_valid, y_valid, weights, weight_valid)

            print("1st perm:")
            perm_str = "1st"

            # NOTE(review): when sampler is None this passes "NoneType" as
            # the sampler name -- confirm apply_permutation expects that.
            X, y, X_valid, y_valid = Modification.apply_permutation(
                X, y, X_valid, y_valid, est, sampler.__class__.__name__,
                weight_valid)
            est = Model.train(X, y, X_valid, y_valid, weights, weight_valid)

            print("2nd perm:")
            perm_str = "2nd"
            X, y, X_valid, y_valid = Modification.apply_permutation(
                X, y, X_valid, y_valid, est, sampler.__class__.__name__,
                weight_valid)
            Model.train(X, y, X_valid, y_valid, weights, weight_valid)

    print(results)
    analyze_results()
print('Total time - Without Undersampling: ', end - start, ' seconds\n')
print(metrics.classification_report(y_validation, validation_result))
print()
print('Without Undersampling -  Pipeline Score {}'.format(multiC.fit(X_train, y_train).score(X_validation, y_validation)))
print()
print_results("Without Undersampling - Validation set: ", true_validation, validation_result)

print('===============================Without Undersampling Ends===============================\n')

print('================================With Undersampling Starts===============================\n')

start = time.time()

# Build the model with undersampling.
# BUG FIX: NearMiss is deterministic; its random_state parameter was removed
# in imbalanced-learn 0.6, so it is no longer passed here.
nearmiss_pipeline = make_pipeline_imb(NearMiss(), multiC)
nearmiss_model = nearmiss_pipeline.fit(X_train, y_train)
nearmiss_prediction = nearmiss_model.predict(X_validation)

# Print the distribution of labels about both models
print()
print("Without Undersampling - data distribution: {}".format(Counter(y_train)))
# fit_resample() replaces the fit_sample() alias removed in
# imbalanced-learn 0.8.
X_nearmiss, y_nearmiss = NearMiss().fit_resample(X_train, y_train)
print("With Undersampling - data distribution: {}".format(Counter(y_nearmiss)))
print()

end = time.time()

# Here comes the result with Undersampling
print('Total time - With Undersampling: ', end - start, ' seconds\n')
print(classification_report_imbalanced(y_validation, nearmiss_prediction))
Пример #21
0
plt.title('Data Distribution')
plt.xlabel('Label')
plt.ylabel('Count')

label_0 = data[data['SepsisLabel'] == 0]
label_1 = data[data['SepsisLabel'] == 1]
print(label_0.shape, label_1.shape)

X = data.drop('SepsisLabel', axis=1).values
y = data['SepsisLabel'].values

X = X[:, 1:]
print(X)

from imblearn.under_sampling import NearMiss
nm = NearMiss()
# BUG FIX: fit_sample() was removed in imbalanced-learn 0.8 --
# fit_resample() is the supported API.
X_res, y_res = nm.fit_resample(X, y)

print(X_res.shape, y_res.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_res,
                                                    y_res,
                                                    test_size=0.2,
                                                    random_state=0)

import pickle
pickle.dump(X_test, open('X_test.pkl', 'wb'))

from xgboost import XGBClassifier
model = XGBClassifier(min_child_weight=3,
Пример #22
0
                           n_informative=3,
                           n_redundant=1,
                           flip_y=0,
                           n_features=20,
                           n_clusters_per_class=1,
                           n_samples=200,
                           random_state=10)

# Instanciate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply NearMiss versions 1-3.
# BUG FIX: return_indices was removed in imbalanced-learn 0.6 and
# fit_sample() in 0.8; the indices of the retained samples are now exposed
# through the fitted sampler's sample_indices_ attribute.
version = [1, 2, 3]
nm = [NearMiss(version=v) for v in version]

X_resampled = []
y_resampled = []
X_res_vis = []
idx_samples_removed = []
for method in nm:
    X_res, y_res = method.fit_resample(X, y)
    idx_res = method.sample_indices_
    X_resampled.append(X_res)
    y_resampled.append(y_res)
    X_res_vis.append(pca.transform(X_res))
    # Overwritten each iteration: only the last version's removed indices
    # survive the loop (matches the original behaviour).
    idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_res)

# Two subplots, unpack the axes array immediately
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
ax_res = [ax2, ax3, ax4]
Пример #23
0
ClassifierTesting('Случайный лес',RandomForestClassifier(), rf_prop, NearMiss(version=1,sampling_strategy='majority',n_jobs=-1))
ClassifierTesting('K-ближайших соседей',KNeighborsClassifier(), knn_prop, NearMiss(version=1,sampling_strategy='majority',n_jobs=-1))
ClassifierTesting('Логистическая регрессия',LogisticRegression(), lr_prop, NearMiss(version=1,sampling_strategy='majority',n_jobs=-1))

# Алгоритм SMOTE.
ClassifierTesting('Метод опорных векторов',SVC(), svc_prop, SMOTE(sampling_strategy='minority',n_jobs=-1))
ClassifierTesting('Случайный лес',RandomForestClassifier(), rf_prop, SMOTE(sampling_strategy='minority',n_jobs=-1))
ClassifierTesting('K-ближайших соседей',KNeighborsClassifier(), knn_prop, SMOTE(sampling_strategy='minority',n_jobs=-1))
ClassifierTesting('Логистическая регрессия',LogisticRegression(), lr_prop, SMOTE(sampling_strategy='minority',n_jobs=-1))

# Алгоритм ADASYN.
ClassifierTesting('Метод опорных векторов',SVC(), svc_prop, ADASYN(sampling_strategy='minority',n_jobs=-1))
ClassifierTesting('Случайный лес',RandomForestClassifier(), rf_prop, ADASYN(sampling_strategy='minority',n_jobs=-1))
ClassifierTesting('K-ближайших соседей',KNeighborsClassifier(), knn_prop, ADASYN(sampling_strategy='minority',n_jobs=-1))
ClassifierTesting('Логистическая регрессия',LogisticRegression(), lr_prop, ADASYN(sampling_strategy='minority',n_jobs=-1))
"""

# Test the neural network with each resampling strategy.
NEURO('RandomUnderSampler',
      RandomUnderSampler(sampling_strategy='majority', random_state=36))
# BUG FIX: NearMiss is deterministic; its random_state parameter was removed
# in imbalanced-learn 0.6, so it is no longer passed.
NEURO(
    'NearMiss',
    NearMiss(sampling_strategy='majority',
             version=1,
             n_jobs=-1))
NEURO('SMOTE', SMOTE(sampling_strategy='minority', random_state=36, n_jobs=-1))
NEURO('ADASYN', ADASYN(sampling_strategy='minority',
                       random_state=36,
                       n_jobs=-1))
Пример #24
0
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import GridSearchCV

# Evaluate the already-trained classifier on the test split.
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

#Under Sampling
from collections import Counter
Counter(y_train)

from imblearn.under_sampling import NearMiss
# sampling_strategy=0.8: undersample until the minority/majority ratio is 0.8.
# BUG FIX: current imbalanced-learn makes sampler parameters keyword-only and
# removed the fit_sample() alias in 0.8 -- use the keyword and fit_resample().
ns = NearMiss(sampling_strategy=0.8)
X_train_ns, y_train_ns = ns.fit_resample(X_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

#Over Sampling

from imblearn.over_sampling import RandomOverSampler

# Renamed from `os` so the sampler does not shadow the stdlib os module.
ros = RandomOverSampler(sampling_strategy=0.75)
X_train_ns, y_train_ns = ros.fit_resample(X_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()