Example #1
def _oversample(X, y, method='SMOTE', strat='not majority'):
    # Size of the smallest class.
    min_samples = min(y.tolist().count(label) for label in set(y))
    # SMOTE-style samplers need more minority samples than their k_neighbors
    # (5 by default), so fall back to random over-sampling for tiny classes.
    if min_samples <= 5:
        method = 'RNDM'

    if method == 'ADASYN':
        ios = imbover.ADASYN(sampling_strategy=strat, random_state=42)
    elif method == 'SMOTE':
        ios = imbover.SMOTE(sampling_strategy=strat, random_state=42)
    elif method == 'SMOTENC':
        # Note: SMOTENC additionally requires a categorical_features argument.
        ios = imbover.SMOTENC(sampling_strategy=strat, random_state=42)
    elif method == 'BORDERSMOTE':
        ios = imbover.BorderlineSMOTE(sampling_strategy=strat, random_state=42)
    elif method == 'SVMSMOTE':
        ios = imbover.SVMSMOTE(sampling_strategy=strat, random_state=42)
    elif method == 'KMEANSSMOTE':
        ios = imbover.KMeansSMOTE(sampling_strategy=strat, random_state=42)
    elif method == 'RNDM':
        ios = imbover.RandomOverSampler(sampling_strategy=strat,
                                        random_state=42)
    else:
        raise ValueError('Unknown oversampling method: {0}'.format(method))

    X_resampled, y_resampled = ios.fit_resample(X, y)
    return X_resampled, y_resampled
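A minimal usage sketch for the helper above, assuming imblearn.over_sampling is imported as imbover and using a synthetic imbalanced dataset for illustration only:

import numpy as np
from collections import Counter
from sklearn.datasets import make_classification
import imblearn.over_sampling as imbover  # alias assumed by _oversample

# Toy imbalanced dataset: roughly 90% class 0, 10% class 1.
X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=42)
X_res, y_res = _oversample(X, y, method='SMOTE')
print(Counter(y), '->', Counter(y_res))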
Example #2
    def __init__(self,
                 inputs,
                 targets,
                 batch_size=100,
                 max_num_batches=-1,
                 shuffle_order=True,
                 rng=None,
                 oversample=None):
        """Create a new recognition data provider object.

        Args:
            inputs (ndarray): Array of data input features of shape
                (num_data, input_dim).
            targets (ndarray): Array of data output targets of shape
                (num_data, output_dim) or (num_data,) if output_dim == 1.
            batch_size (int): Number of data points to include in each batch.
            max_num_batches (int): Maximum number of batches to iterate over
                in an epoch. If `max_num_batches * batch_size > num_data` then
                only as many batches as the data can be split into will be
                used. If set to -1 all of the data will be used.
            shuffle_order (bool): Whether to randomly permute the order of
                the data before each epoch.
            rng (RandomState): A seeded random number generator.
            oversample (str or None): Name of an over-sampling method to apply
                to the data before batching (e.g. 'smote', 'adasyn'); if None,
                no over-sampling is performed.
        """
        if oversample is not None:
            oversample = oversample.lower()
            self.initialize_seed(rng)

            if oversample == "smote":
                oversampler = imbl.SMOTE(random_state=self.rng)
            elif oversample == "smote-cat":
                # Needs a way to specify the categorical attributes, e.g.
                # imbl.SMOTENC(random_state=self.rng,
                #              categorical_features=range(4200, 4348))
                raise NotImplementedError(
                    "smote-cat requires specifying categorical features")
            elif oversample == "smote-svm":
                oversampler = imbl.SVMSMOTE(random_state=self.rng)
            elif oversample == "smote-borderline-1":
                oversampler = imbl.BorderlineSMOTE(random_state=self.rng,
                                                   kind='borderline-1')
            elif oversample == "smote-borderline-2":
                oversampler = imbl.BorderlineSMOTE(random_state=self.rng,
                                                   kind='borderline-2')
            elif oversample == "adasyn":
                oversampler = imbl.ADASYN(random_state=self.rng)
            else:
                raise ValueError(
                    "Unrecognized oversampling method: {0}".format(oversample))

            inputs, targets = oversampler.fit_resample(inputs, targets)

        self.num_classes = 3
        inputs = inputs.astype(np.float32)

        # pass the loaded data to the parent class __init__
        super(RecognitionDataProvider,
              self).__init__(inputs, targets, batch_size, max_num_batches,
                             shuffle_order, rng)
Example #3
where_are_nan = np.isnan(feature_src)
where_are_inf = np.isinf(feature_src)
feature_src[where_are_nan] = 0
feature_src[where_are_inf] = 0
# mami: a pre-constructed scaler shared by both domains (assumed to be
# something like preprocessing.MinMaxScaler()).
feature_src = mami.fit_transform(feature_src)

where_are_nan = np.isnan(feature_tar)
where_are_inf = np.isinf(feature_tar)
feature_tar[where_are_nan] = 0
feature_tar[where_are_inf] = 0
feature_tar = mami.fit_transform(feature_tar)

train_label = train_label.values
src_label = train_label[:, 0]
test_label = test_label.values
tar_label = test_label[:, 0]

print('Dataset shape before resampling %s' % Counter(src_label))
sm = over_sampling.SVMSMOTE(sampling_strategy={1: 860, 0: 600})
# feature_src, src_label = sm.fit_resample(feature_src, src_label)
# print('Resampled dataset shape %s' % Counter(src_label))

from sklearn.linear_model import Lasso, LassoCV
"""       Lasso           400 features 0.000005   """

lasso = Lasso(alpha=0.0005)
lasso.fit(feature_src, src_label)
coef = lasso.coef_.astype('float64')
# Indices of features whose Lasso coefficient is effectively non-zero.
pos = np.where(np.abs(coef) > 0.000001)[0]
print(len(pos))
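A plausible continuation (not shown in the original snippet; the names below are hypothetical) is to keep only the Lasso-selected columns:

# Hypothetical follow-up: subset both domains to the selected feature indices.
feature_src_fs = feature_src[:, pos]
feature_tar_fs = feature_tar[:, pos]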
Example #4
def svmsmote(features, labels):
    # Over-sample the minority class(es) with SVM-SMOTE and return X, y resampled.
    smote = over_sampling.SVMSMOTE(random_state=0)
    return smote.fit_resample(X=features, y=labels)
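A hedged usage sketch for svmsmote above, assuming over_sampling refers to imblearn.over_sampling and using synthetic data for illustration:

from collections import Counter
from sklearn.datasets import make_classification
from imblearn import over_sampling  # module name assumed by svmsmote

# Imbalanced toy data: the minority class is about 10% of the samples.
features, labels = make_classification(n_samples=400, weights=[0.9, 0.1],
                                        random_state=0)
X_res, y_res = svmsmote(features, labels)
print(Counter(labels), '->', Counter(y_res))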
Example #5
    y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]
    features = X_train.columns.to_list()
    # imputation
    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    imp.fit(X_train)
    X_train = imp.transform(X_train)
    X_test = imp.transform(X_test)

    # scaling
    scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # over-sampling
    # print('before', y_train.groupby(['Label']).size())
    sm = over_sampling.SVMSMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)

    # feature selection
    model = ExtraTreesClassifier(n_estimators=250, random_state=42)
    model.fit(X_train, y_train.values.ravel())
    importances = model.feature_importances_
    no_zero_importance = importances[np.where(importances > 0)]
    cutoff = np.std(no_zero_importance) + np.min(no_zero_importance)
    indices = np.where(importances > cutoff)[0]
    fs_elements = np.concatenate((fs_elements, np.array(features)[indices]),
                                 axis=0)
    # for i in indices:
    #     print(features[i], importances[i])
    X_train_fs = X_train[:, indices]
    X_test_fs = X_test[:, indices]
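The snippet above appears to run inside a cross-validation loop; below is a minimal sketch of that scaffolding under stated assumptions, with hypothetical names (X_data, y_data, fs_elements) and toy data standing in for the real dataset:

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold

# Toy data standing in for the real feature matrix and 'Label' column.
X_arr, y_arr = make_classification(n_samples=300, n_features=20,
                                   weights=[0.85, 0.15], random_state=42)
X_data = pd.DataFrame(X_arr, columns=['f%d' % i for i in range(20)])
y_data = pd.DataFrame({'Label': y_arr})

fs_elements = np.array([], dtype=object)  # collects selected feature names
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in skf.split(X_data, y_data['Label']):
    X_train, X_test = X_data.iloc[train_index], X_data.iloc[test_index]
    # ... imputation, scaling, SVMSMOTE and feature selection as above ...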
Example #6
def main(X=None, y=None):  # X: feature ndarray, y: label ndarray

    fig, axes = plt.subplots(4, 2, figsize=(10, 10))
    X = pd.DataFrame(X)
    print('\nOriginal dataset shape %s' % Counter(y), '\n')
    font2 = {
        'family': 'Times New Roman',
        'size': 20,
    }
    '''weight-smote'''
    sm = all_smote_v2.SMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X, y)
    print('weight smote %s' % Counter(y_res), '\n\n\n')
    y_res = pd.DataFrame(y_res)
    X_new = X_res.iloc[len(X):, :]
    y_new = y_res.iloc[len(y):, :]
    for i in range(len(X)):
        if y[i] == 1:
            # print(X.iloc[i][0])
            axes[0][0].scatter(X.iloc[i][0], X.iloc[i][1], c='tan', s=25)
    for i in range(len(X)):
        if y[i] == 0:
            axes[0][0].scatter(X.iloc[i][0], X.iloc[i][1], c='darkcyan', s=25)
    axes[0][0].scatter(X_new[0], X_new[1], c='red', marker='+', s=50)
    axes[0][0].set_title('(a) weight-SMOTE ', font2)
    '''smote'''
    sm = over_sampling.SMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X, y)
    print('smote %s' % Counter(y_res), '\n\n\n')
    y_res = pd.DataFrame(y_res)
    X_new = X_res.iloc[len(X):, :]
    y_new = y_res.iloc[len(y):, :]
    for i in range(len(X)):
        if y[i] == 1:
            # print(X.iloc[i][0])
            axes[0][1].scatter(X.iloc[i][0], X.iloc[i][1], c='tan', s=25)
    for i in range(len(X)):
        if y[i] == 0:
            axes[0][1].scatter(X.iloc[i][0], X.iloc[i][1], c='darkcyan', s=25)
    axes[0][1].scatter(X_new[0], X_new[1], c='red', marker='+', s=50)
    axes[0][1].set_title('(b) SMOTE ', font2)
    '''borderline-1'''
    sm_1 = over_sampling.BorderlineSMOTE(random_state=42, kind="borderline-1")
    X_res, y_res = sm_1.fit_resample(X, y)
    print('borderline-1 shape %s' % Counter(y_res), '\n\n\n\n')
    y_res = pd.DataFrame(y_res)
    X_new = X_res.iloc[len(X):, :]
    y_new = y_res.iloc[len(y):, :]
    for i in range(len(X)):
        if y[i] == 1:
            # print(X.iloc[i][0])
            axes[1][1].scatter(X.iloc[i][0], X.iloc[i][1], c='tan', s=25)
    for i in range(len(X)):
        if y[i] == 0:
            axes[1][1].scatter(X.iloc[i][0], X.iloc[i][1], c='darkcyan', s=25)
            # axes[0][0].scatter(X.iloc[i:,0:], X[i:,1:], c='cyan',)
    axes[1][1].scatter(X_new[0], X_new[1], c='red', marker='+', s=50)
    axes[1][1].set_title('(d) borderline-SMOTE1', font2)
    '''weight-borderline'''
    sm_zhou = all_smote_v2.BorderlineSMOTE(
        random_state=42,
        kind="weight-borderline-smote",
    )
    X_res, y_res = sm_zhou.fit_resample(X, y)
    print('weight-borderline shape %s' % Counter(y_res), '\n\n\n')
    y_res = pd.DataFrame(y_res)
    X_new = X_res.iloc[len(X):, :]
    y_new = y_res.iloc[len(y):, :]
    for i in range(len(X)):
        if y[i] == 1:
            # print(X.iloc[i][0])
            axes[1][0].scatter(X.iloc[i][0], X.iloc[i][1], c='tan', s=25)
    for i in range(len(X)):
        if y[i] == 0:
            axes[1][0].scatter(X.iloc[i][0], X.iloc[i][1], c='darkcyan', s=25)
            # axes[0][0].scatter(X.iloc[i:,0:], X[i:,1:], c='cyan',)
    axes[1][0].scatter(X_new[0], X_new[1], c='red', marker='+', s=50)
    axes[1][0].set_title('(c) weight-borderline', font2)
    '''weight-kmeans-smote'''
    sm_3 = all_smote_v2.KMeansSMOTE(random_state=42, kind='kmeans-borderline')
    X_res, y_res = sm_3.fit_resample(X, y)
    print('weight-kmeans-smote:\t', Counter(y_res), '\n\n\n\n')
    y_res = pd.DataFrame(y_res)
    X_new = X_res.iloc[len(X):, :]
    y_new = y_res.iloc[len(y):, :]
    axes[2][0].scatter(X[0], X[1], c=y, alpha=0.5)
    axes[2][0].scatter(X_new[0], X_new[1], c='red', alpha=0.2)
    axes[2][0].set_title('weight_kmeans_smote')
    '''kmeans-smote'''
    sm_4 = over_sampling.KMeansSMOTE(random_state=42, )
    X_res, y_res = sm_4.fit_resample(X, y)
    print('kmeans-smote:\t', Counter(y_res), '\n\n\n\n')
    y_res = pd.DataFrame(y_res)
    X_new = X_res.iloc[len(X):, :]
    y_new = y_res.iloc[len(y):, :]
    axes[2][1].scatter(X[0], X[1], c=y, alpha=0.5)
    axes[2][1].scatter(X_new[0], X_new[1], c='red', alpha=0.2)
    axes[2][1].set_title('kmeans_smote')
    '''weight-svm-smote'''
    sm_7 = all_smote_v2.SVMSMOTE(random_state=42, )
    X_res, y_res = sm_7.fit_resample(X, y)
    print('weight-SVM_smote:\t', Counter(y_res), '\n\n\n\n')
    y_res = pd.DataFrame(y_res)
    X_new = X_res.iloc[len(X):, :]
    y_new = y_res.iloc[len(y):, :]
    axes[3][0].scatter(X[0], X[1], c=y, alpha=0.5)
    axes[3][0].scatter(X_new[0], X_new[1], c='red', alpha=0.2)
    axes[3][0].set_title('weight_svm_smote')
    '''SVM_SMOTE'''
    sm_6 = over_sampling.SVMSMOTE(random_state=42, )
    X_res, y_res = sm_6.fit_resample(X, y)
    print('SVM_smote:\t', Counter(y_res), '\n\n\n\n')
    y_res = pd.DataFrame(y_res)
    X_new = X_res.iloc[len(X):, :]
    y_new = y_res.iloc[len(y):, :]
    axes[3][1].scatter(X[0], X[1], c=y, alpha=0.5)
    axes[3][1].scatter(X_new[0], X_new[1], c='red', alpha=0.2)
    axes[3][1].set_title('svm_smote')
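A minimal driver sketch for main above. It assumes the custom all_smote_v2 module used inside main is importable and that the usual imports (pandas, matplotlib, imblearn, Counter) are already in scope; the two-feature toy dataset is generated only so each panel can be drawn in 2-D:

from sklearn.datasets import make_classification
import matplotlib.pyplot as plt

# Two informative features so the scatter plots are meaningful in 2-D.
X_toy, y_toy = make_classification(n_samples=300, n_features=2, n_informative=2,
                                   n_redundant=0, n_clusters_per_class=1,
                                   weights=[0.9, 0.1], random_state=42)
# Note: KMeansSMOTE can fail on small or degenerate data; adjust its k-means
# settings if it raises an error.
main(X_toy, y_toy)
plt.tight_layout()
plt.show()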