def _oversample(X, y, method='SMOTE', strat='not majority'): # compute minimum number of samples per class min_samples = len(y) for l in set(y): if y.tolist().count(l) < min_samples: min_samples = y.tolist().count(l) if min_samples <= 5: method = 'RNDM' if method == 'ADASYN': ios = imbover.ADASYN(sampling_strategy=strat, random_state=42) elif method == 'SMOTE': ios = imbover.SMOTE(sampling_strategy=strat, random_state=42) elif method == 'SMOTENC': ios = imbover.SMOTENC(sampling_strategy=strat, random_state=42) elif method == 'BORDERSMOTE': ios = imbover.BorderlineSMOTE(sampling_strategy=strat, random_state=42) elif method == 'SVMSMOTE': ios = imbover.SVMSMOTE(sampling_strategy=strat, random_state=42) elif method == 'KMEANSSMOTE': ios = imbover.KMeansSMOTE(sampling_strategy=strat, random_state=42) elif method == 'RNDM': ios = imbover.RandomOverSampler(sampling_strategy=strat, random_state=42) X_resampled, y_resampled = ios.fit_resample(X, y) return X_resampled, y_resampled
def __init__(self, inputs, targets, batch_size=100, max_num_batches=-1,
             shuffle_order=True, rng=None, oversample=None):
    """Create a new recognition data provider object.

    Args:
        inputs (ndarray): Array of data input features of shape
            (num_data, input_dim).
        targets (ndarray): Array of data output targets of shape
            (num_data, output_dim) or (num_data,) if output_dim == 1.
        batch_size (int): Number of data points to include in each batch.
        max_num_batches (int): Maximum number of batches to iterate over
            in an epoch. If `max_num_batches * batch_size > num_data` then
            only as many batches as the data can be split into will be
            used. If set to -1 all of the data will be used.
        shuffle_order (bool): Whether to randomly permute the order of
            the data before each epoch.
        rng (RandomState): A seeded random number generator.
        oversample (str or None): Case-insensitive name of the
            over-sampling method applied to ``inputs``/``targets`` before
            they are handed to the parent class. Supported values are
            "smote", "smote-svm", "smote-borderline-1",
            "smote-borderline-2" and "adasyn". ``None`` (the default)
            skips over-sampling.

    Raises:
        NotImplementedError: If ``oversample`` is "smote-cat".
        ValueError: If ``oversample`` is any other unrecognized name.
    """
    if oversample is not None:
        oversample = oversample.lower()
        # The samplers below are seeded with self.rng, so it has to be
        # initialised before any of them is constructed.
        self.initialize_seed(rng)
        if oversample == "smote":
            oversampler = imbl.SMOTE(random_state=self.rng)
        elif oversample == "smote-cat":
            # Need method for specifying categorical attributes, e.g.,
            # imbl.SMOTENC(random_state=self.rng,
            #              categorical_features=range(4200, 4348))
            raise NotImplementedError(
                "smote-cat needs a way to specify the categorical features")
        elif oversample == "smote-svm":
            oversampler = imbl.SVMSMOTE(random_state=self.rng)
        elif oversample == "smote-borderline-1":
            oversampler = imbl.BorderlineSMOTE(random_state=self.rng,
                                               kind='borderline-1')
        elif oversample == "smote-borderline-2":
            oversampler = imbl.BorderlineSMOTE(random_state=self.rng,
                                               kind='borderline-2')
        elif oversample == "adasyn":
            oversampler = imbl.ADASYN(random_state=self.rng)
        else:
            # ValueError is still caught by callers handling Exception.
            raise ValueError(
                "Unrecognized oversampling method: {0}".format(oversample))
        inputs, targets = oversampler.fit_resample(inputs, targets)

    # NOTE(review): the class count is hard-coded rather than derived from
    # ``targets`` - confirm it matches the data being loaded.
    self.num_classes = 3
    inputs = inputs.astype(np.float32)
    # pass the loaded data to the parent class __init__
    super(RecognitionDataProvider, self).__init__(
        inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
# Source-domain cleanup: zero the entries selected by ``where_are_inf``
# (computed before this chunk), then rescale with ``mami``.
# NOTE(review): ``mami`` is defined outside this view; presumably a min-max
# scaler - confirm.
feature_src[where_are_inf] = 0
feature_src = mami.fit_transform(feature_src)

# Target-domain cleanup: NaN and +/-inf entries are replaced by 0.
where_are_nan = np.isnan(feature_tar)
where_are_inf = np.isinf(feature_tar)
feature_tar[where_are_nan] = 0
feature_tar[where_are_inf] = 0
# NOTE(review): ``fit_transform`` re-fits ``mami`` on the target features
# instead of reusing the fit from the source features - confirm intended.
feature_tar = mami.fit_transform(feature_tar)

# Labels: take the underlying arrays and keep only their first column.
train_label = train_label.values
src_label = train_label[:, 0]
test_label = test_label.values
tar_label = test_label[:, 0]
print('Before resampled dataset shape %s' % Counter(src_label))

# The sampler is built with fixed per-class target counts, but the
# resampling call itself is commented out below, so ``sm`` is unused within
# this fragment.
sm = over_sampling.SVMSMOTE(sampling_strategy={1: 860, 0: 600})
# feature_src, src_label = sm.fit_resample(feature_src, src_label)
# print('Resampled dataset shape %s' % Counter(src_label))
from sklearn.linear_model import Lasso, LassoCV
""" Lasso 400 features 0.000005 """
# Fit a Lasso model on the source data and look at its coefficients.
lasso = Lasso(alpha=0.0005)
lasso.fit(feature_src, src_label)
coef = lasso.coef_
# NOTE(review): ``astype`` returns a new array; the result is discarded
# here, so this statement has no effect on ``coef``.
coef.astype('float64')
# Indices of the coefficients whose magnitude exceeds 1e-6: ``np.where``
# returns a tuple, which is wrapped into a 2-D array and its first row taken.
pos = np.where(abs(coef) > 0.000001)
pos = np.array(pos)
pos = pos[0, :]
# Number of coefficients above the threshold.
print(len(pos))
def svmsmote(features, labels):
    """Resample ``features``/``labels`` with SVM-SMOTE using a fixed seed.

    Args:
        features: Feature matrix, passed as ``X`` to ``fit_resample``.
        labels: Class labels, passed as ``y`` to ``fit_resample``.

    Returns:
        Whatever ``SVMSMOTE.fit_resample`` returns for the given data.
    """
    # random_state=0 keeps the sampler seeded identically on every call.
    sampler = over_sampling.SVMSMOTE(random_state=0)
    resampled = sampler.fit_resample(X=features, y=labels)
    return resampled
# Select the label rows for this split.
# NOTE(review): ``train_index``/``test_index``, ``X_train``/``X_test`` and
# ``fs_elements`` come from code outside this view; presumably this is the
# body of a cross-validation loop - confirm.
y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]
# Remember the column names now: the steps below rebind X_train, and it is
# indexed as a plain array (``X_train[:, indices]``) further down.
features = X_train.columns.to_list()

# imputation: median values are learned on the training split only and then
# applied to both splits.
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp.fit(X_train)
X_train = imp.transform(X_train)
X_test = imp.transform(X_test)

# scaling: the scaler is fitted on the training split and reused for the
# test split.
scaler = preprocessing.MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# over-sampling: only the training split is resampled.
# print('before', y_train.groupby(['Label']).size())
sm = over_sampling.SVMSMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

# feature selection: rank features by importance from an extra-trees model
# fitted on the resampled training data.
model = ExtraTreesClassifier(n_estimators=250, random_state=42)
model.fit(X_train, y_train.values.ravel())
importances = model.feature_importances_
# The cutoff is derived from the non-zero importances only:
# their standard deviation plus their minimum.
no_zero_importance = importances[np.where(importances > 0)]
cutoff = np.std(no_zero_importance) + np.min(no_zero_importance)
indices = np.where(importances > cutoff)[0]
# Append the names of the selected features to the running collection.
fs_elements = np.concatenate((fs_elements, np.array(features)[indices]), axis=0)
# for i in indices:
#     print(features[i], importances[i])
# Keep only the selected feature columns in both splits.
X_train_fs = X_train[:, indices]
X_test_fs = X_test[:, indices]
def main(X=None, y=None):  # ndarray
    """Compare eight over-samplers on a 4x2 grid of scatter plots.

    For every sampler the data is resampled, the resulting class counts are
    printed, and one subplot is drawn showing the original points (columns
    0 and 1 of ``X``) together with the rows the sampler appended.

    Args:
        X: Feature matrix; wrapped in a ``pandas.DataFrame`` before use.
            Only columns ``0`` and ``1`` are plotted.
        y: Class labels, one per row of ``X``. The first four panels colour
            labels ``1`` and ``0`` explicitly; other label values are not
            drawn there.

    Raises:
        ValueError: If ``X`` or ``y`` is left at its ``None`` default.
    """
    if X is None or y is None:
        raise ValueError('main() requires both X and y')

    fig, axes = plt.subplots(4, 2, figsize=(10, 10))
    X = pd.DataFrame(X)
    print('\nOriginal dataset shape %s' % Counter(y), '\n')

    font2 = {
        'family': 'Times New Roman',
        'size': 20,
    }

    # Positional class masks, computed once so each class is drawn with a
    # single scatter call instead of one call per data point.
    labels = pd.Series(y).values
    is_pos = labels == 1
    is_neg = labels == 0

    # One entry per subplot, listed in the order the samplers are run:
    # (row, col, sampler class, extra constructor kwargs, report template,
    #  report trailer, title, draw-classes-separately flag).
    # NOTE(review): ``all_smote_v2`` is a project module not visible here;
    # presumably it holds the weighted variants of the imbalanced-learn
    # samplers - confirm.
    panels = [
        (0, 0, all_smote_v2.SMOTE, {},
         'weight smote %s', '\n\n\n', '(a) weight-SMOTE ', True),
        (0, 1, over_sampling.SMOTE, {},
         'smote %s', '\n\n\n', '(b) SMOTE ', True),
        (1, 1, over_sampling.BorderlineSMOTE, {'kind': "borderline-1"},
         'borderline-1 shape %s', '\n\n\n\n', '(d) boderline-SMOTE1', True),
        (1, 0, all_smote_v2.BorderlineSMOTE,
         {'kind': "weight-borderline-smote"},
         'weight-borderline shape %s', '\n\n\n', '(c) weight-boderline',
         True),
        (2, 0, all_smote_v2.KMeansSMOTE, {'kind': 'kmeans-borderline'},
         'weight-kmeans-smote:\t %s', '\n\n\n\n', 'weight_kmeans_smote',
         False),
        (2, 1, over_sampling.KMeansSMOTE, {},
         'kmeans-smote:\t %s', '\n\n\n\n', 'kmeans_smote', False),
        (3, 0, all_smote_v2.SVMSMOTE, {},
         'weight-SVM_smote:\t %s', '\n\n\n\n', 'weight_svm_smote', False),
        (3, 1, over_sampling.SVMSMOTE, {},
         'SVM_smote:\t %s', '\n\n\n\n', 'svm_smote', False),
    ]

    for row, col, sampler_cls, extra, report, trailer, title, by_class in panels:
        ax = axes[row][col]
        sampler = sampler_cls(random_state=42, **extra)
        X_res, y_res = sampler.fit_resample(X, y)
        print(report % Counter(y_res), trailer)

        # Rows past the original length are taken to be the ones the
        # sampler appended.
        X_new = X_res.iloc[len(X):, :]

        if by_class:
            # Label 1 first, then label 0, then the new samples on top.
            ax.scatter(X[0][is_pos], X[1][is_pos], c='tan', s=25)
            ax.scatter(X[0][is_neg], X[1][is_neg], c='darkcyan', s=25)
            ax.scatter(X_new[0], X_new[1], c='red', marker='+', s=50)
            ax.set_title(title, font2)
        else:
            # Original points coloured by label, new samples in faint red.
            ax.scatter(X[0], X[1], c=y, alpha=0.5)
            ax.scatter(X_new[0], X_new[1], c='red', alpha=0.2)
            ax.set_title(title)