def apply_sampling(X_data, Y_data, sampling, n_states, maxlen): ratio = float(np.count_nonzero(Y_data == 1)) / \ float(np.count_nonzero(Y_data == 0)) X_data = np.reshape(X_data, (len(X_data), n_states * maxlen)) # 'Random over-sampling' if sampling == 'OverSampler': OS = OverSampler(ratio=ratio, verbose=True) # 'Random under-sampling' elif sampling == 'UnderSampler': OS = UnderSampler(verbose=True) # 'Tomek under-sampling' elif sampling == 'TomekLinks': OS = TomekLinks(verbose=True) # Oversampling elif sampling == 'SMOTE': OS = SMOTE(ratio=1, verbose=True, kind='regular') # Oversampling - Undersampling elif sampling == 'SMOTETomek': OS = SMOTETomek(ratio=ratio, verbose=True) # Undersampling elif sampling == 'OneSidedSelection': OS = OneSidedSelection(verbose=True) # Undersampling elif sampling == 'CondensedNearestNeighbour': OS = CondensedNearestNeighbour(verbose=True) # Undersampling elif sampling == 'NearMiss': OS = NearMiss(version=1, verbose=True) # Undersampling elif sampling == 'NeighbourhoodCleaningRule': OS = NeighbourhoodCleaningRule(verbose=True) # ERROR: WRONG SAMPLER, TERMINATE else: print('Wrong sampling variable you have set... Exiting...') sys.exit() # print('shape ' + str(X.shape)) X_data, Y_data = OS.fit_transform(X_data, Y_data) return X_data, Y_data
def test_rest(x, y): print('Random under-sampling') US = UnderSampler(verbose=verbose) usx, usy = US.fit_transform(x, y) print('Tomek links') TL = TomekLinks(verbose=verbose) tlx, tly = TL.fit_transform(x, y) print('Clustering centroids') CC = ClusterCentroids(verbose=verbose) ccx, ccy = CC.fit_transform(x, y) print('NearMiss-1') NM1 = NearMiss(version=1, verbose=verbose) nm1x, nm1y = NM1.fit_transform(x, y) print('NearMiss-2') NM2 = NearMiss(version=2, verbose=verbose) nm2x, nm2y = NM2.fit_transform(x, y) print('NearMiss-3') NM3 = NearMiss(version=3, verbose=verbose) nm3x, nm3y = NM3.fit_transform(x, y) print('Neighboorhood Cleaning Rule') NCR = NeighbourhoodCleaningRule(verbose=verbose) ncrx, ncry = NCR.fit_transform(x, y) print('Random over-sampling') OS = OverSampler(verbose=verbose) ox, oy = OS.fit_transform(x, y) print('SMOTE Tomek links') STK = SMOTETomek(verbose=verbose) stkx, stky = STK.fit_transform(x, y) print('SMOTE ENN') SENN = SMOTEENN(verbose=verbose) sennx, senny = SENN.fit_transform(x, y) print('EasyEnsemble') EE = EasyEnsemble(verbose=verbose) eex, eey = EE.fit_transform(x, y)
def near_miss3(self): NM3 = NearMiss(version=3, verbose=self.verbose) nm3x, nm3y = NM3.fit_transform(self.x, self.y) print "Near Miss 3 transformed" return nm3x, nm3y
def near_miss2(self): NM2 = NearMiss(version=2, verbose=self.verbose) nm2x, nm2y = NM2.fit_transform(self.x, self.y) print "Near Miss 2 Transformed" return nm2x, nm2y
def near_miss1(self): NM1 = NearMiss(version=1, verbose=self.verbose) nm1x, nm1y = NM1.fit_transform(self.x, self.y) print "Near Miss Transformed" return nm1x, nm1y
usecols=[3, 4, 5, 6, 7]) y = pd.read_csv(tain_path, header=None, index_col=False, names=colnames, skiprows=[0], usecols=[8]) y = y['violation'].values # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.333, random_state=0) main_x = X.values main_y = y verbose = False # 'NearMiss-1' NM1 = NearMiss(version=1, verbose=verbose) x, y = NM1.fit_transform(main_x, main_y) ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0)) X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.333, random_state=0) from sklearn.ensemble import RandomForestClassifier from sklearn.cross_validation import cross_val_score clf = RandomForestClassifier(n_estimators=10) scores = cross_val_score(clf, X_test, y_test) y_pred = clf.fit(X_train, y_train).predict(X_test)
plt.legend() plt.show() # Generate the new dataset using under-sampling method verbose = False # 'Random under-sampling' US = UnderSampler(verbose=verbose) usx, usy = US.fit_transform(x, y) # 'Tomek links' TL = TomekLinks(verbose=verbose) tlx, tly = TL.fit_transform(x, y) # 'Clustering centroids' CC = ClusterCentroids(verbose=verbose) ccx, ccy = CC.fit_transform(x, y) # 'NearMiss-1' NM1 = NearMiss(version=1, verbose=verbose) nm1x, nm1y = NM1.fit_transform(x, y) # 'NearMiss-2' NM2 = NearMiss(version=2, verbose=verbose) nm2x, nm2y = NM2.fit_transform(x, y) # 'NearMiss-3' NM3 = NearMiss(version=3, verbose=verbose) nm3x, nm3y = NM3.fit_transform(x, y) # 'Condensed Nearest Neighbour' CNN = CondensedNearestNeighbour(size_ngh=51, n_seeds_S=51, verbose=verbose) cnnx, cnny = CNN.fit_transform(x, y) # 'One-Sided Selection' OSS = OneSidedSelection(size_ngh=51, n_seeds_S=51, verbose=verbose) ossx, ossy = OSS.fit_transform(x, y) # 'Neighboorhood Cleaning Rule' NCR = NeighbourhoodCleaningRule(size_ngh=51, verbose=verbose)
'mem_requested', 'disk', 'violation'] tain_path = r'/home/askrey/Dropbox/Project_step_by_step/3_create_database/csvs/frull_db_2.csv' X = pd.read_csv(tain_path, header = None, index_col = False ,names = colnames, skiprows = [0], usecols = [3,4,5,6,7]) y = pd.read_csv(tain_path, header = None, index_col = False ,names = colnames, skiprows = [0], usecols = [8]) y = y['violation'].values # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.333, random_state=0) main_x = X.values main_y = y verbose = False # 'NearMiss-2' NM2 = NearMiss(version=2, verbose=verbose) x, y = NM2.fit_transform(main_x, main_y) ratio = float(np.count_nonzero(y==1)) / float(np.count_nonzero(y==0)) X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.333, random_state=0) from sklearn.ensemble import RandomForestClassifier from sklearn.cross_validation import cross_val_score clf = RandomForestClassifier(n_estimators=10) scores = cross_val_score(clf, X_test, y_test) y_pred = clf.fit(X_train, y_train).predict(X_test) y_score = clf.fit(X_train, y_train).predict_proba(X_test)[:,1]
usecols=[3, 4, 5, 6, 7]) y = pd.read_csv(tain_path, header=None, index_col=False, names=colnames, skiprows=[0], usecols=[8]) y = y['violation'].values # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.333, random_state=0) main_x = X.values main_y = y verbose = False # 'NearMiss-3' NM3 = NearMiss(version=3, verbose=verbose) x, y = NM3.fit_transform(main_x, main_y) ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0)) X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.333, random_state=0) from sklearn.ensemble import RandomForestClassifier from sklearn.cross_validation import cross_val_score clf = RandomForestClassifier(n_estimators=10) scores = cross_val_score(clf, X_test, y_test) y_pred = clf.fit(X_train, y_train).predict(X_test)