def split(self, x_data, y_data):
    """Split the data, then rebalance only the training partition.

    The parent splitter produces the train/validation partition. The
    training part is over-sampled with ``SMOTE`` (configured through
    ``self._smote_params``) and the result is then passed through
    ``UnderSampler`` with ``ratio=self._under_sample``. The validation part
    is returned exactly as the parent produced it.

    Parameters
    ----------
    x_data, y_data
        Features and labels, forwarded unchanged to the parent ``split``.

    Returns
    -------
    tuple
        ``(x_train_resampled, y_train_resampled, x_valid, y_valid)``.
    """
    x_train, y_train, x_valid, y_valid = super(SMOTESplitter, self).split(
        x_data, y_data)

    # ``.values`` returns the same ndarray as the deprecated ``.as_matrix()``
    # (removed in pandas 1.0) and exists in every pandas release.
    # Presumably the parent split returns pandas objects -- confirm.
    smote = SMOTE(**self._smote_params)
    x_over, y_over = smote.fit_transform(x_train.values, y_train.values)

    under_sampler = UnderSampler(ratio=self._under_sample)
    x_balanced, y_balanced = under_sampler.fit_transform(x_over, y_over)

    return x_balanced, y_balanced, x_valid, y_valid
def sampling():
    """Run ``random_methods`` on the training data rebalanced by each sampler.

    Reads the module-level ``data_train1`` / ``target_train1``, builds each
    resampling strategy in turn, resamples the training data with it and
    hands the resampled arrays to ``random_methods``. Nothing is returned.
    """
    verbose = False
    y = np.bincount(target_train1)
    # Single-argument call form prints the same text on Python 2 and 3.
    print(y)
    # Count of label 2 divided by count of label 1.
    ratio = float(y[2]) / float(y[1])

    # (sampler class, keyword arguments), evaluated strictly in this order.
    sampler_specs = [
        # 'Random over-sampling'
        (OverSampler, {'ratio': ratio, 'verbose': verbose}),
        # 'SMOTE'
        (SMOTE, {'ratio': ratio, 'verbose': verbose, 'kind': 'regular'}),
        # 'SMOTE borderline 1'
        (SMOTE, {'ratio': ratio, 'verbose': verbose, 'kind': 'borderline1'}),
        # 'SMOTE borderline 2'
        (SMOTE, {'ratio': ratio, 'verbose': verbose, 'kind': 'borderline2'}),
        # 'SMOTE SVM'
        (SMOTE, {'ratio': ratio, 'verbose': verbose, 'kind': 'svm',
                 'class_weight': 'auto'}),
        # 'SMOTE Tomek links'
        (SMOTETomek, {'ratio': ratio, 'verbose': verbose}),
        # 'SMOTE ENN'
        (SMOTEENN, {'ratio': ratio, 'verbose': verbose}),
        # 'EasyEnsemble'
        (EasyEnsemble, {'verbose': verbose}),
        # 'BalanceCascade'
        (BalanceCascade, {'verbose': verbose}),
    ]

    for sampler_cls, sampler_kwargs in sampler_specs:
        # Instantiate lazily so each sampler is built right before it is used.
        sampler = sampler_cls(**sampler_kwargs)
        # Every strategy is fitted on data_train1 / target_train1, the same
        # data the ratio was computed from. The 'borderline1' run used to
        # read data_train / target_train, inconsistent with all other runs.
        resampled_x, resampled_y = sampler.fit_transform(
            data_train1, target_train1)
        random_methods(resampled_x, resampled_y)
def resample(self, X, y, t, fold):
    """Resample one (timestep, fold) training set and record the result.

    Parameters
    ----------
    X
        Feature matrix handed to the resampler.
    y
        Indexable label container; ``y[0]`` is the label vector passed to
        the resampler and the whole object is forwarded to
        ``build_smoted_label_tuple``.
    t, fold
        Timestep and fold identifiers, used for logging and as storage keys.

    Returns
    -------
    tuple
        ``(X, y)`` unchanged when resampling is disabled or the configured
        method is not supported, otherwise ``(Xsmote, ysmote_tuple)``.
    """
    if not self.resample_method:
        return X, y

    start = time.time()
    if self.verbose:
        ptf('> Resampling for timestep %d, fold %d' % (t, fold), self.logfile)

    # create resampler
    if self.resample_method == 'under':
        print('UNDER SAMPLING is not implemented yet')
        return X, y

    resampler = None
    if (self.resample_method == 'over'
            and self.oversample_method.lower() == 'smote'):
        resampler = SMOTE(**self.oversample_arguments)

    if resampler is None:
        # Covers every unsupported combination, so ``resampler`` can never be
        # used while unbound further down.
        print('Your resampling method is not implemented yet')
        return X, y

    # '%s %s' formatting reproduces the output of a two-item print statement
    # while staying valid on both Python 2 and Python 3.
    print('%s %s' % (type(X), type(y)))
    print('%s %s' % (X.shape, y[0].shape))

    Xsmote, ysmote = resampler.fit_transform(X, y[0])  # resample
    ysmote_tuple = self.build_smoted_label_tuple(ysmote, y, fold)

    if self.debug:
        # Counts of label 0 and label 1 before and after resampling.
        print('%s %s' % (np.sum(y[0] == 0), np.sum(ysmote == 0)))
        print('%s %s' % (np.sum(y[0] == 1), np.sum(ysmote == 1)))

    if self.on_disk:
        self.pickle_time_step(ysmote_tuple, 'trigger_resample_labels',
                              fold=fold, t=t)
        self.pickle_time_step(Xsmote, 'trigger_resample_features',
                              fold=fold, t=t)
    else:
        self.trigger_resample_labels[fold][t] = ysmote_tuple
        self.trigger_resample_features[fold][t] = Xsmote

    end = time.time()
    if self.verbose:
        ptf('... %d s' % (end - start), self.logfile)

    return Xsmote, ysmote_tuple
def apply_sampling(X_data, Y_data, sampling, n_states, maxlen):
    """Flatten the samples and resample them with the sampler named ``sampling``.

    ``X_data`` is reshaped to ``(len(X_data), n_states * maxlen)`` before the
    chosen sampler is fitted. ``sampling`` must be one of the sampler names
    in the table below; any other value prints a message and terminates the
    process through ``sys.exit()``.

    Returns the resampled ``(X_data, Y_data)`` pair.
    """
    ones = np.count_nonzero(Y_data == 1)
    zeros = np.count_nonzero(Y_data == 0)
    ratio = float(ones) / float(zeros)

    flat_shape = (len(X_data), n_states * maxlen)
    X_data = np.reshape(X_data, flat_shape)

    # Lazy factories: only the selected sampler is ever constructed.
    factories = {
        # Random over-sampling
        'OverSampler': lambda: OverSampler(ratio=ratio, verbose=True),
        # Random under-sampling
        'UnderSampler': lambda: UnderSampler(verbose=True),
        # Tomek under-sampling
        'TomekLinks': lambda: TomekLinks(verbose=True),
        # Over-sampling (fixed ratio of 1, not the computed one)
        'SMOTE': lambda: SMOTE(ratio=1, verbose=True, kind='regular'),
        # Over-sampling followed by under-sampling
        'SMOTETomek': lambda: SMOTETomek(ratio=ratio, verbose=True),
        # Under-sampling variants
        'OneSidedSelection': lambda: OneSidedSelection(verbose=True),
        'CondensedNearestNeighbour':
            lambda: CondensedNearestNeighbour(verbose=True),
        'NearMiss': lambda: NearMiss(version=1, verbose=True),
        'NeighbourhoodCleaningRule':
            lambda: NeighbourhoodCleaningRule(verbose=True),
    }

    make_sampler = factories.get(sampling)
    if make_sampler is None:
        # Unknown sampler name: report and terminate.
        print('Wrong sampling variable you have set... Exiting...')
        sys.exit()

    resampled_x, resampled_y = make_sampler().fit_transform(X_data, Y_data)
    return resampled_x, resampled_y
def _sample_values(X, y, method=None, ratio=1, verbose=False): """Perform any kind of sampling(over and under). Parameters ---------- X : array, shape = [n_samples, n_features] Data. y : array, shape = [n_samples] Target. method : str, optional default: None Over or under smapling method. ratio: float Unbalanced class ratio. Returns ------- X, y : tuple Sampled X and y. """ if method == 'SMOTE': sampler = SMOTE(ratio=ratio, verbose=verbose) elif method == 'SMOTEENN': ratio = ratio * 0.3 sampler = SMOTEENN(ratio=ratio, verbose=verbose) elif method == 'random_over_sample': sampler = OverSampler(ratio=ratio, verbose=verbose) elif method == 'random_under_sample': sampler = UnderSampler(verbose=verbose) elif method == 'TomekLinks': sampler = TomekLinks(verbose=verbose) return sampler.fit_transform(X, y)
# Read only CSV column 8 as the label column; the first file row is skipped
# and column names come from ``colnames``.
y = pd.read_csv(tain_path, header=None, index_col=False, names=colnames, skiprows=[0], usecols=[8])
y = y['violation'].values
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.333, random_state=0)
main_x = X.values
main_y = y
verbose = False
# Count of label-1 rows divided by count of label-0 rows.
ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))
# 'SMOTE'
smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular')
# From here on ``y`` holds the resampled labels, not the labels read from disk.
x, y = smote.fit_transform(main_x, main_y)
# Ratio recomputed on the resampled labels; it is not read again in this fragment.
ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))
# NOTE(review): the split is taken from the already resampled data, so the
# held-out part can contain resampled rows -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.333, random_state=0)
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
clf = RandomForestClassifier(n_estimators=10)
# NOTE(review): cross-validation is run on the held-out split (X_test, y_test),
# not on the training split -- confirm this is intended.
scores = cross_val_score(clf, X_test, y_test)
# Fit on the training split and predict the held-out split.
y_pred = clf.fit(X_train, y_train).predict(X_test)
# NOTE(review): statement nesting below was reconstructed from a
# whitespace-collapsed source -- confirm the loop extent against the original.
n_source, n_target, n_samples = None, None, None
# Accumulate targets and feature rows from every training archive.
# NOTE(review): ``targets`` is read below but not assigned before the loop in
# this fragment; presumably it is initialised to None earlier -- confirm.
for f in args.read_train:
    npzfile = np.load(f)
    print "Read %d instances from %s" \
        % (npzfile['feature_matrix'].shape[0], f.name)
    # There must be exactly one target entry per feature row.
    assert npzfile['targets'].size == npzfile['feature_matrix'].shape[0]
    tgt, fm = npzfile['targets'], npzfile['feature_matrix']
    print "target size: ", tgt.shape
    # NOTE(review): the nested sum presumably expects a 2-D targets array -- confirm.
    print "positive examples: ", sum(sum(tgt))
    # Flatten the targets to one dimension.
    tgt = tgt.reshape(tgt.size)
    if args.smote:
        # Count of label-0 entries divided by count of label-1 entries.
        ratio = float(np.count_nonzero(tgt == 0)) / \
            float(np.count_nonzero(tgt == 1))
        OS = SMOTE(ratio=ratio, kind='regular')
        fm, tgt = OS.fit_transform(fm, tgt)
    # The first archive initialises the accumulators; later ones are appended.
    if targets is None:
        targets = tgt
        m = fm
    else:
        print "Before concat: ", targets.shape, tgt.shape
        targets = np.concatenate((targets, tgt), axis=0)
        m = np.concatenate((m, fm), axis=0)
        print "After concat: ", targets.shape, tgt.shape
    # Accumulated targets and feature rows must stay aligned.
    assert targets.size == m.shape[0]
assert m.shape[0] == targets.shape[0]
print "Sum of targets: ", sum(targets)
label_index = 11
folds = os.listdir(location)
delimiter = '\t'
# Prefix every entry with ``location`` (plain string concatenation, so
# ``location`` must already end with a path separator).
folds[:] = [location + i for i in folds]
verbose = True

## LOAD RAW DATA ##
# One entry per fold file, in the order of ``folds``:
#   raw_data[k]     -- every line of fold k split on ``delimiter``
#   feat_content[k] -- the ``content_index`` column of each row
#   labels[k]       -- 0 where the ``label_index`` column equals 'f', else 1
raw_data = []
feat_content = []
labels = []
for fold_path in folds:
    # The context manager closes the file even if reading raises.
    with open(fold_path, 'r') as fold_file:
        rows = [line.split(delimiter) for line in fold_file]
    raw_data.append(rows)
    feat_content.append([row[content_index] for row in rows])
    # NOTE(review): lines are split without stripping the newline; if the
    # label is the last column it will never equal 'f' -- confirm the layout.
    labels.append([0 if row[label_index] == 'f' else 1 for row in rows])

## EVALUATE ##
resampler = SMOTE(verbose=verbose)
pip = Pipeline(feat_content, labels, resampler, verbose)
f1_complete = pip.validation()
# Read only CSV column 8 as the label column; the first file row is skipped
# and column names come from ``colnames``.
y = pd.read_csv(tain_path, header=None, index_col=False, names=colnames, skiprows=[0], usecols=[8])
y = y['violation'].values
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.333, random_state=0)
main_x = X.values
main_y = y
verbose = False
# Count of label-1 rows divided by count of label-0 rows.
ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))
# 'SMOTE borderline 2'
bsmote2 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline2')
# From here on ``y`` holds the resampled labels, not the labels read from disk.
x, y = bsmote2.fit_transform(main_x, main_y)
# Ratio recomputed on the resampled labels; it is not read again in this fragment.
ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))
# NOTE(review): the split is taken from the already resampled data, so the
# held-out part can contain resampled rows -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.333, random_state=0)
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
clf = RandomForestClassifier(n_estimators=10)
# NOTE(review): cross-validation is run on the held-out split (X_test, y_test),
# not on the training split -- confirm this is intended.
scores = cross_val_score(clf, X_test, y_test)
# Fit on the training split and predict the held-out split.
y_pred = clf.fit(X_train, y_train).predict(X_test)
def f6(x):
    """Remap a label: 7 becomes 2, values above 7 shift down by one, others are unchanged."""
    if x == 7:
        return 2
    elif x > 7:
        return x - 1
    else:
        return x

#y = map(f6, y)
y = numpy.array(y)
sm = SMOTE(kind='regular')
# NOTE(review): loop extent was reconstructed from a whitespace-collapsed
# source; only the repeated SMOTE call is assumed to be inside it -- confirm.
for i in xrange(20):
    x_metrics, y = sm.fit_transform(x_metrics, y)
clf = RandomForestClassifier(n_estimators=100, class_weight='auto')
# Out-of-fold predictions from 10-fold cross-validation.
pr = cross_validation.cross_val_predict(clf, x_metrics, y, cv=10)
#print metrics.accuracy_score(y, pr)
#print metrics.confusion_matrix(y, pr)
# Rows whose true label is 7 but which were predicted as 2 are dropped.
delete_rows_indexes = [i for i, y_i in enumerate(pr) if y_i == 2 and y[i] == 7]
x_metrics = numpy.delete(x_metrics, delete_rows_indexes, axis=0)
y = numpy.delete(y, delete_rows_indexes, axis=0)
#clf.fit(x_metrics,y)
#joblib.dump(clf, 'rand_forest_model_3.pkl')
# Cross-validated predictions recomputed on the filtered data.
pr = cross_validation.cross_val_predict(clf, x_metrics, y, cv=10)
                usecols=[3, 4, 5, 6, 7])
# Read only CSV column 8 as the label column; the first file row is skipped
# and column names come from ``colnames``.
y = pd.read_csv(tain_path, header=None, index_col=False, names=colnames, skiprows=[0], usecols=[8])
y = y['violation'].values
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.333, random_state=0)
main_x = X.values
main_y = y
verbose = False
# Count of label-1 rows divided by count of label-0 rows.
ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))
# 'SMOTE borderline 1'
bsmote1 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline1')
# From here on ``y`` holds the resampled labels, not the labels read from disk.
x, y = bsmote1.fit_transform(main_x, main_y)
# Ratio recomputed on the resampled labels; it is not read again in this fragment.
ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))
# NOTE(review): the split is taken from the already resampled data, so the
# held-out part can contain resampled rows -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.333, random_state=0)
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
clf = RandomForestClassifier(n_estimators=10)
# NOTE(review): cross-validation is run on the held-out split (X_test, y_test),
# not on the training split -- confirm this is intended.
scores = cross_val_score(clf, X_test, y_test)
# Fit on the training split and predict the held-out split.
y_pred = clf.fit(X_train, y_train).predict(X_test)
orig_X, orig_y = read_data()
# Four shuffled, stratified folds built from the labels; the same ``skf``
# object is iterated on every pass of the loop below.
skf = StratifiedKFold(orig_y, n_folds=4, shuffle=True)
# NOTE(review): no exit from this loop is visible in this fragment, and the
# nesting was reconstructed from a whitespace-collapsed source -- confirm.
while True:
    scores = []
    for train_index, test_index in skf:
        X, X_cv = orig_X[train_index], orig_X[test_index]
        y, y_cv = orig_y[train_index], orig_y[test_index]
        # Fraction of majority samples to draw with respect to samples of
        # minority class.
        sampled_X,sampled_y = X,y
        # Oversample data from the minority class.
        if P['is_smote']:
            sampled_X, sampled_y = SMOTE(k=P['k'], m=P['m'], ratio=P['ratio'], verbose=False, kind='regular').fit_transform(sampled_X, sampled_y)
        # Undersample samples from the majority class.
        sampled_X, sampled_y = UnderSampler(1.0).fit_transform(sampled_X, sampled_y)
        # Fit a scaler only for the sampled data.
        scaler = Scaler(sampled_X, sampled_y)
        sampled_X = scaler.getOriginalTransformedData()
        #model = RandomForestClassifier(n_estimators=100).fit(sampled_X, sampled_y)
        #model = RandomForestClassifier(n_estimators=P['n_estimators'], criterion=P['criterion'], max_depth=P['max_depth'], min_samples_split=P['min_samples_split'], min_samples_leaf=P['min_samples_leaf'], min_weight_fraction_leaf=P['min_weight_fraction_leaf'], max_features=P['max_features'], max_leaf_nodes=P['max_leaf_nodes'], bootstrap=P['bootstrap'], oob_score=P['oob_score'], n_jobs=8, random_state=None, verbose=0, warm_start=False, class_weight=None).fit(sampled_X, sampled_y)
        model = MLPClassifier(activation=P['activation'], algorithm=P['algorithm'], alpha=P['alpha'], hidden_layer_sizes=P['layer'], learning_rate=P['learning_rate'], tol=P['tol'], random_state=1).fit(sampled_X, sampled_y)
        #model = xgb.XGBClassifier(max_depth=P['max_depth'], n_estimators=P['n_estimators'], learning_rate=P['learning_rate'], nthread=8, subsample=P['subsample'], colsample_bylevel=P['colsample_bylevel']).fit(sampled_X, sampled_y, eval_metric=P['eval_metric'])
        # The held-out fold goes through the scaler fitted on the sampled
        # training data; column 1 of the probabilities is scored.
        prediction_cv = model.predict_proba(scaler.transform(X_cv))
        auc_score = roc_auc_score(y_cv, prediction_cv[:,1])
        scores.append(auc_score)
        log("***roc_auc_score:%f" % auc_score)