def smote_data(data_train, target_train):
    y = np.bincount(target_train)
    ratio = 1.5  # float(y[2] + y[1]) / float(y[0])
    # smote = SMOTE(ratio=ratio, verbose=True, kind='regular')
    # smox, smoy = smote.fit_transform(data_train, target_train)
    OS = OverSampler(ratio=ratio, verbose=True)
    osx, osy = OS.fit_transform(data_train, target_train)
    return osx, osy
def oversample_data(X_t, y_t, ratio):
    x_columns = X_t.columns
    X_t = X_t.reset_index(drop=True).as_matrix()
    y_t = y_t.reset_index(drop=True).as_matrix()
    smote = OverSampler(ratio=ratio, verbose=False)
    smox, smoy = smote.fit_transform(X_t, y_t)
    X_t = pd.DataFrame(smox, columns=x_columns)
    y_t = pd.Series(smoy)
    return X_t, y_t
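# --- Usage sketch (not from the original sources) -------------------------
# A minimal example of calling oversample_data above on toy data. The toy
# DataFrame/Series, the ratio value, and the commented import path for the
# legacy UnbalancedDataset package are illustrative assumptions; note the
# helper relies on the older pandas as_matrix() API.
# from unbalanced_dataset import OverSampler  # assumed import for these snippets
import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
X_toy = pd.DataFrame(rng.rand(100, 3), columns=['f1', 'f2', 'f3'])
y_toy = pd.Series([0] * 90 + [1] * 10)   # 9:1 class imbalance

# ratio computed the same way other snippets in this file compute it
ratio_toy = float((y_toy == 1).sum()) / float((y_toy == 0).sum())
X_bal, y_bal = oversample_data(X_toy, y_toy, ratio_toy)
print(y_bal.value_counts())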
def split(self, x_data, y_data):
    Xt, Yt, Xv, Yv = super(OverUnderSplitter, self).split(x_data, y_data)
    Xt_smote, Yt_smote = OverSampler(
        ratio=self._over_sample).fit_transform(Xt.as_matrix(), Yt.as_matrix())
    Xt_smote, Yt_smote = UnderSampler(
        ratio=self._under_sample).fit_transform(Xt_smote, Yt_smote)
    return Xt_smote, Yt_smote, Xv, Yv
def test_rest(x, y, verbose=False):
    print('Random under-sampling')
    US = UnderSampler(verbose=verbose)
    usx, usy = US.fit_transform(x, y)

    print('Tomek links')
    TL = TomekLinks(verbose=verbose)
    tlx, tly = TL.fit_transform(x, y)

    print('Clustering centroids')
    CC = ClusterCentroids(verbose=verbose)
    ccx, ccy = CC.fit_transform(x, y)

    print('NearMiss-1')
    NM1 = NearMiss(version=1, verbose=verbose)
    nm1x, nm1y = NM1.fit_transform(x, y)

    print('NearMiss-2')
    NM2 = NearMiss(version=2, verbose=verbose)
    nm2x, nm2y = NM2.fit_transform(x, y)

    print('NearMiss-3')
    NM3 = NearMiss(version=3, verbose=verbose)
    nm3x, nm3y = NM3.fit_transform(x, y)

    print('Neighbourhood Cleaning Rule')
    NCR = NeighbourhoodCleaningRule(verbose=verbose)
    ncrx, ncry = NCR.fit_transform(x, y)

    print('Random over-sampling')
    OS = OverSampler(verbose=verbose)
    ox, oy = OS.fit_transform(x, y)

    print('SMOTE Tomek links')
    STK = SMOTETomek(verbose=verbose)
    stkx, stky = STK.fit_transform(x, y)

    print('SMOTE ENN')
    SENN = SMOTEENN(verbose=verbose)
    sennx, senny = SENN.fit_transform(x, y)

    print('EasyEnsemble')
    EE = EasyEnsemble(verbose=verbose)
    eex, eey = EE.fit_transform(x, y)
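# --- Usage sketch (not from the original sources) -------------------------
# One way to exercise test_rest on a synthetic imbalanced dataset;
# make_classification and its parameters here are illustrative assumptions.
from sklearn.datasets import make_classification

x_demo, y_demo = make_classification(n_samples=500, n_features=10,
                                     n_informative=5, weights=[0.9, 0.1],
                                     random_state=0)
test_rest(x_demo, y_demo, verbose=True)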
def sampling():
    verbose = False
    y = np.bincount(target_train1)
    print(y)
    ratio = float(y[2]) / float(y[1])

    # Random over-sampling
    OS = OverSampler(ratio=ratio, verbose=verbose)
    osx, osy = OS.fit_transform(data_train1, target_train1)
    random_methods(osx, osy)

    # SMOTE
    smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular')
    smox, smoy = smote.fit_transform(data_train1, target_train1)
    random_methods(smox, smoy)

    # SMOTE borderline 1
    bsmote1 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline1')
    bs1x, bs1y = bsmote1.fit_transform(data_train, target_train)
    random_methods(bs1x, bs1y)

    # SMOTE borderline 2
    bsmote2 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline2')
    bs2x, bs2y = bsmote2.fit_transform(data_train1, target_train1)
    random_methods(bs2x, bs2y)

    # SMOTE SVM
    svm_args = {'class_weight': 'auto'}
    svmsmote = SMOTE(ratio=ratio, verbose=verbose, kind='svm', **svm_args)
    svsx, svsy = svmsmote.fit_transform(data_train1, target_train1)
    random_methods(svsx, svsy)

    # SMOTE Tomek links
    STK = SMOTETomek(ratio=ratio, verbose=verbose)
    stkx, stky = STK.fit_transform(data_train1, target_train1)
    random_methods(stkx, stky)

    # SMOTE ENN
    SENN = SMOTEENN(ratio=ratio, verbose=verbose)
    ennx, enny = SENN.fit_transform(data_train1, target_train1)
    random_methods(ennx, enny)

    # EasyEnsemble
    EE = EasyEnsemble(verbose=verbose)
    eex, eey = EE.fit_transform(data_train1, target_train1)
    random_methods(eex, eey)

    # BalanceCascade
    BS = BalanceCascade(verbose=verbose)
    bsx, bsy = BS.fit_transform(data_train1, target_train1)
    random_methods(bsx, bsy)
def _parallel_build_trees(tree, forest, X, y):
    # Choose the resampling strategy for this tree from the forest setting.
    if forest.sampling is None:
        sampler = BootstrapSampler(random_state=tree.random_state)
    elif forest.sampling == 'up':
        sampler = OverSampler(random_state=tree.random_state, verbose=False)
    elif forest.sampling == 'down':
        sampler = UnderSampler(random_state=tree.random_state, verbose=False)
    else:
        raise ValueError('Unknown sampling option: %r' % forest.sampling)
    X_sample, y_sample = sampler.fit_transform(X, y)
    tree.fit(X_sample, y_sample, check_input=False)
    return tree
def _sample_values(X, y, method=None, ratio=1, verbose=False):
    """Perform any kind of sampling (over- and under-sampling).

    Parameters
    ----------
    X : array, shape = [n_samples, n_features]
        Data.

    y : array, shape = [n_samples]
        Target.

    method : str, optional, default: None
        Over- or under-sampling method.

    ratio : float
        Unbalanced class ratio.

    Returns
    -------
    X, y : tuple
        Sampled X and y.
    """
    if method == 'SMOTE':
        sampler = SMOTE(ratio=ratio, verbose=verbose)
    elif method == 'SMOTEENN':
        ratio = ratio * 0.3
        sampler = SMOTEENN(ratio=ratio, verbose=verbose)
    elif method == 'random_over_sample':
        sampler = OverSampler(ratio=ratio, verbose=verbose)
    elif method == 'random_under_sample':
        sampler = UnderSampler(verbose=verbose)
    elif method == 'TomekLinks':
        sampler = TomekLinks(verbose=verbose)
    else:
        raise ValueError('Unknown sampling method: %r' % method)

    return sampler.fit_transform(X, y)
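# --- Usage sketch (not from the original sources) -------------------------
# Illustrates the method dispatch in _sample_values with synthetic arrays;
# the data and the chosen ratio are assumptions made for the example only.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.rand(200, 4)
y_demo = np.array([0] * 180 + [1] * 20)   # imbalanced binary target

X_res, y_res = _sample_values(X_demo, y_demo,
                              method='random_over_sample', ratio=8.0,
                              verbose=True)
print(np.bincount(y_res.astype(int)))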
def random_over_sampling(self):
    OS = OverSampler(ratio=self._ratio, verbose=self.verbose)
    osx, osy = OS.fit_transform(self.x, self.y)
    return osx, osy
def apply_sampling(X_data, Y_data, sampling, n_states, maxlen):
    ratio = float(np.count_nonzero(Y_data == 1)) / \
        float(np.count_nonzero(Y_data == 0))
    X_data = np.reshape(X_data, (len(X_data), n_states * maxlen))

    # Random over-sampling
    if sampling == 'OverSampler':
        OS = OverSampler(ratio=ratio, verbose=True)
    # Random under-sampling
    elif sampling == 'UnderSampler':
        OS = UnderSampler(verbose=True)
    # Tomek links under-sampling
    elif sampling == 'TomekLinks':
        OS = TomekLinks(verbose=True)
    # Over-sampling
    elif sampling == 'SMOTE':
        OS = SMOTE(ratio=1, verbose=True, kind='regular')
    # Over-sampling followed by under-sampling
    elif sampling == 'SMOTETomek':
        OS = SMOTETomek(ratio=ratio, verbose=True)
    # Under-sampling
    elif sampling == 'OneSidedSelection':
        OS = OneSidedSelection(verbose=True)
    # Under-sampling
    elif sampling == 'CondensedNearestNeighbour':
        OS = CondensedNearestNeighbour(verbose=True)
    # Under-sampling
    elif sampling == 'NearMiss':
        OS = NearMiss(version=1, verbose=True)
    # Under-sampling
    elif sampling == 'NeighbourhoodCleaningRule':
        OS = NeighbourhoodCleaningRule(verbose=True)
    # Unknown sampler: terminate
    else:
        print('Unknown sampling option %r, exiting...' % sampling)
        sys.exit()

    # print('shape ' + str(X.shape))
    X_data, Y_data = OS.fit_transform(X_data, Y_data)
    return X_data, Y_data
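# --- Usage sketch (not from the original sources) -------------------------
# apply_sampling flattens sequence data to (n_samples, n_states * maxlen)
# before resampling; the shapes and labels below are illustrative assumptions.
import numpy as np

n_states, maxlen = 4, 10
X_seq = np.random.RandomState(1).rand(300, maxlen, n_states)
Y_seq = np.array([0] * 270 + [1] * 30)

X_flat, Y_bal = apply_sampling(X_seq, Y_seq, 'OverSampler', n_states, maxlen)
print(X_flat.shape, np.bincount(Y_bal.astype(int)))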
del df['ID']
id = df_test['ID']
del df_test['ID']

from src.transfomations import remove_correlated
_, to_remove = remove_correlated(df, 0.99)
df_test.drop(to_remove, axis=1, inplace=True)

# Drop near-constant features before fitting the model.
variance_threshold = VarianceThreshold(threshold=0.001)
df = variance_threshold.fit_transform(df)

ratio = float(sum(target)) / float(len(target) - sum(target))
print(ratio)
smote = OverSampler(ratio=ratio * 2, verbose=True)
print(type(target))
# df, target = smote.fit_transform(df.as_matrix(), target.as_matrix())

# Apply the same variance filter to the test set.
df_test = variance_threshold.transform(df_test)

gbc = GradientBoostingClassifier()
gbc.fit(df, target)
# best = randomizedSearch.best_estimator_
# print(randomizedSearch.best_params_)
scores = cross_validation.cross_val_score(gbc, df, target, cv=5,
                                          scoring='roc_auc',
                   edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
axes[2, 1].set_title('One-sided selection', fontsize=fs)

# Neighbourhood cleaning rule
axes[2, 2].scatter(ncrx_vis[ncry == 0, 0], ncrx_vis[ncry == 0, 1],
                   label="Class #0", alpha=0.5, edgecolor=almost_black,
                   facecolor=palette[0], linewidth=0.15)
axes[2, 2].scatter(ncrx_vis[ncry == 1, 0], ncrx_vis[ncry == 1, 1],
                   label="Class #1", alpha=0.5, edgecolor=almost_black,
                   facecolor=palette[2], linewidth=0.15)
axes[2, 2].set_title('Neighbourhood cleaning rule', fontsize=fs)

plt.show()

# Generate the new datasets using the over-sampling methods
verbose = False
ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))

# Random over-sampling
OS = OverSampler(ratio=ratio, verbose=verbose)
osx, osy = OS.fit_transform(x, y)

# SMOTE
smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular')
smox, smoy = smote.fit_transform(x, y)

# SMOTE borderline 1
bsmote1 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline1')
bs1x, bs1y = bsmote1.fit_transform(x, y)

# SMOTE borderline 2
bsmote2 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline2')
bs2x, bs2y = bsmote2.fit_transform(x, y)

# SMOTE SVM
svm_args = {'class_weight': 'auto'}
svmsmote = SMOTE(ratio=ratio, verbose=verbose, kind='svm', **svm_args)
svsx, svsy = svmsmote.fit_transform(x, y)

# SMOTE Tomek links
y = pd.read_csv(tain_path, header=None, index_col=False, names=colnames,
                skiprows=[0], usecols=[8])
y = y['violation'].values
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.333, random_state=0)
main_x = X.values
main_y = y

verbose = False
ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))

# Random over-sampling, then split the balanced data into train/test sets.
OS = OverSampler(ratio=ratio, verbose=verbose)
x, y = OS.fit_transform(main_x, main_y)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.333,
                                                    random_state=0)

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score

clf = RandomForestClassifier(n_estimators=10)
scores = cross_val_score(clf, X_test, y_test)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_score = clf.predict_proba(X_test)[:, 1]
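# --- Possible continuation (not from the original script) -----------------
# One way to report results for the oversampled split above using standard
# sklearn metrics; the choice of metrics is an assumption.
from sklearn.metrics import roc_auc_score, classification_report

print('Cross-validation scores: %s (mean %.3f)' % (scores, scores.mean()))
print('ROC AUC on the held-out split: %.3f' % roc_auc_score(y_test, y_score))
print(classification_report(y_test, y_pred))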