def sampling():
    """Resample the training set with several samplers and evaluate each.

    Reads the module-level ``data_train1`` / ``target_train1``, builds one
    over-/under-sampling object after another, resamples the training data
    with it and hands the resampled arrays to ``random_methods``.

    The ``ratio`` passed to the ratio-based samplers is the number of
    samples labelled ``2`` divided by the number labelled ``1`` in
    ``target_train1`` (taken from ``np.bincount``).

    Returns
    -------
    None
        All results are produced by ``random_methods`` as side effects.
    """
    verbose = False

    # Per-label counts; printed so the class balance is visible in the log.
    y = np.bincount(target_train1)
    print(y)
    ratio = float(y[2]) / float(y[1])

    def _resample_and_evaluate(sampler):
        # Every sampler is fitted on the SAME training set the ratio was
        # computed from, then evaluated straight away.
        resampled_x, resampled_y = sampler.fit_transform(
            data_train1, target_train1)
        random_methods(resampled_x, resampled_y)

    # 'Random over-sampling'
    _resample_and_evaluate(OverSampler(ratio=ratio, verbose=verbose))

    # 'SMOTE'
    _resample_and_evaluate(
        SMOTE(ratio=ratio, verbose=verbose, kind='regular'))

    # 'SMOTE borderline 1'
    # (previously fitted on data_train/target_train, unlike every other
    # sampler here and unlike the data the ratio is derived from)
    _resample_and_evaluate(
        SMOTE(ratio=ratio, verbose=verbose, kind='borderline1'))

    # 'SMOTE borderline 2'
    _resample_and_evaluate(
        SMOTE(ratio=ratio, verbose=verbose, kind='borderline2'))

    # 'SMOTE SVM'
    svm_args = {'class_weight': 'auto'}
    _resample_and_evaluate(
        SMOTE(ratio=ratio, verbose=verbose, kind='svm', **svm_args))

    # 'SMOTE Tomek links'
    _resample_and_evaluate(SMOTETomek(ratio=ratio, verbose=verbose))

    # 'SMOTE ENN'
    _resample_and_evaluate(SMOTEENN(ratio=ratio, verbose=verbose))

    # 'EasyEnsemble' (constructed without a ratio)
    _resample_and_evaluate(EasyEnsemble(verbose=verbose))

    # 'BalanceCascade' (constructed without a ratio)
    _resample_and_evaluate(BalanceCascade(verbose=verbose))
def _sample_values(X, y, method=None, ratio=1, verbose=False): """Perform any kind of sampling(over and under). Parameters ---------- X : array, shape = [n_samples, n_features] Data. y : array, shape = [n_samples] Target. method : str, optional default: None Over or under smapling method. ratio: float Unbalanced class ratio. Returns ------- X, y : tuple Sampled X and y. """ if method == 'SMOTE': sampler = SMOTE(ratio=ratio, verbose=verbose) elif method == 'SMOTEENN': ratio = ratio * 0.3 sampler = SMOTEENN(ratio=ratio, verbose=verbose) elif method == 'random_over_sample': sampler = OverSampler(ratio=ratio, verbose=verbose) elif method == 'random_under_sample': sampler = UnderSampler(verbose=verbose) elif method == 'TomekLinks': sampler = TomekLinks(verbose=verbose) return sampler.fit_transform(X, y)
def _ones_to_zeros_ratio(labels):
    """Return (count of labels equal to 1) / (count equal to 0) as a float."""
    ones = np.count_nonzero(labels == 1)
    zeros = np.count_nonzero(labels == 0)
    return float(ones) / float(zeros)


# Read only column index 8 of the training CSV. The file's first row is
# skipped and the column names are supplied by `colnames`.
violation_frame = pd.read_csv(
    tain_path,
    header=None,
    index_col=False,
    names=colnames,
    skiprows=[0],
    usecols=[8],
)
y = violation_frame['violation'].values

# NOTE: an earlier train/test split of the raw X / y is disabled; the split
# below is performed on the resampled data instead.
main_x = X.values
main_y = y

verbose = False
ratio = _ones_to_zeros_ratio(y)

# 'SMOTE ENN': resample the full data set, then recompute the ratio on the
# resampled labels.
SENN = SMOTEENN(ratio=ratio, verbose=verbose)
x, y = SENN.fit_transform(main_x, main_y)
ratio = _ones_to_zeros_ratio(y)

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.333, random_state=0)

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score

clf = RandomForestClassifier(n_estimators=10)

# Cross-validation scores are computed on the held-out split.
scores = cross_val_score(clf, X_test, y_test)

# Fit on the training split, then predict the held-out split.
fitted_clf = clf.fit(X_train, y_train)
y_pred = fitted_clf.predict(X_test)