def gradient_boosting():
    """Cross-validate a gradient-boosting classifier on EGSmote-resampled folds.

    Uses the module-level ``X`` and ``y``. The data is split into 10
    stratified folds; for every fold only the training part is oversampled
    with ``EGSmote`` and the fitted model is scored on the untouched test
    part with ``evaluate2``.

    Returns:
        list: ``["GBC", m0, m1, m2]`` where ``m0``..``m2`` are the per-fold
        means of the first three values returned by ``evaluate2`` (treated
        here as F-score, G-mean and AUC, in that order).
    """
    # A seed only has an effect when shuffling is enabled, and recent
    # scikit-learn versions raise ValueError if random_state is given with
    # shuffle=False.  The original passed random_state=True, i.e. the seed 1.
    kfold = model_selection.StratifiedKFold(
        n_splits=10, shuffle=True, random_state=1)

    fold_scores = []
    for train_index, test_index in kfold.split(X, y):
        X_t, X_test = X[train_index], X[test_index]
        y_t, y_test = y[train_index], y[test_index]

        # Oversample the training part only, so the test fold stays untouched.
        X_train, y_train = EGSmote().fit_resample(X_t, y_t)

        gbc = GradientBoostingClassifier(
            n_estimators=100, learning_rate=0.01, max_depth=3)
        gbc.fit(X_train, y_train)

        y_predict = gbc.predict(X_test)
        # NOTE(review): predict() already returns class labels; this threshold
        # only maps them onto {0, 1} and assumes the labels are numeric.
        y_pred = np.where(y_predict > 0.5, 1, 0)
        fold_scores.append(evaluate2(y_test, y_pred))

    fold_scores = np.asarray(fold_scores)
    return [
        "GBC",
        fold_scores[:, 0].mean(),
        fold_scores[:, 1].mean(),
        fold_scores[:, 2].mean(),
    ]
def logistic_training():
    """Cross-validate logistic regression on EGSmote-resampled folds.

    Uses the module-level ``X`` and ``y``. The data is split into 10
    stratified folds; for every fold only the training part is oversampled
    with ``EGSmote`` and the fitted model is scored on the untouched test
    part with ``evaluate2``.

    Returns:
        list: ``["LR", m0, m1, m2]`` where ``m0``..``m2`` are the per-fold
        means of the first three values returned by ``evaluate2`` (treated
        here as F-score, G-mean and AUC, in that order).
    """
    # A seed only has an effect when shuffling is enabled, and recent
    # scikit-learn versions raise ValueError if random_state is given with
    # shuffle=False.  The original passed random_state=True, i.e. the seed 1.
    kfold = model_selection.StratifiedKFold(
        n_splits=10, shuffle=True, random_state=1)

    fold_scores = []
    for train_index, test_index in kfold.split(X, y):
        X_t, X_test = X[train_index], X[test_index]
        y_t, y_test = y[train_index], y[test_index]

        # Oversample the training part only, so the test fold stays untouched.
        X_train, y_train = EGSmote().fit_resample(X_t, y_t)

        model = LogisticRegression(max_iter=120)
        model.fit(X_train, y_train)

        y_predict = model.predict(X_test)
        # NOTE(review): predict() already returns class labels; this threshold
        # only maps them onto {0, 1} and assumes the labels are numeric.
        y_pred = np.where(y_predict > 0.5, 1, 0)
        fold_scores.append(evaluate2(y_test, y_pred))

    fold_scores = np.asarray(fold_scores)
    return [
        "LR",
        fold_scores[:, 0].mean(),
        fold_scores[:, 1].mean(),
        fold_scores[:, 2].mean(),
    ]
def KNN():
    """Cross-validate a k-nearest-neighbours classifier on EGSmote-resampled folds.

    Uses the module-level ``X`` and ``y``. The data is split into 10
    stratified folds; for every fold only the training part is oversampled
    with ``EGSmote`` and a 5-neighbour classifier (Minkowski metric with
    ``p=2``) is scored on the untouched test part with ``evaluate2``.

    Returns:
        list: ``["KNN", m0, m1, m2]`` where ``m0``..``m2`` are the per-fold
        means of the first three values returned by ``evaluate2`` (treated
        here as F-score, G-mean and AUC, in that order).
    """
    # A seed only has an effect when shuffling is enabled, and recent
    # scikit-learn versions raise ValueError if random_state is given with
    # shuffle=False.  The original passed random_state=True, i.e. the seed 1.
    kfold = model_selection.StratifiedKFold(
        n_splits=10, shuffle=True, random_state=1)

    fold_scores = []
    for train_index, test_index in kfold.split(X, y):
        X_t, X_test = X[train_index], X[test_index]
        y_t, y_test = y[train_index], y[test_index]

        # Oversample the training part only, so the test fold stays untouched.
        X_train, y_train = EGSmote().fit_resample(X_t, y_t)

        classifier = KNeighborsClassifier(
            n_neighbors=5, metric='minkowski', p=2)
        classifier.fit(X_train, y_train)

        # Predicted labels are scored as-is (no thresholding).
        y_pred = classifier.predict(X_test)
        fold_scores.append(evaluate2(y_test, y_pred))

    fold_scores = np.asarray(fold_scores)
    return [
        "KNN",
        fold_scores[:, 0].mean(),
        fold_scores[:, 1].mean(),
        fold_scores[:, 2].mean(),
    ]
def parse_input_zoo_data(filename, header='infer'):
    """Load a CSV dataset, hold out a test split and SMOTE-balance the rest.

    The last column of the file is taken as the target and all preceding
    columns as features. 20% of the rows are held out (``random_state=0``);
    the remaining rows are oversampled with ``SMOTE``.

    Args:
        filename: Path of the CSV file to read.
        header: Accepted for backward compatibility but not used; the file
            is always read with pandas' default header handling.

    Returns:
        tuple: ``(input_database, labels, classes, X_test, y_test)`` where
        ``input_database`` is ``{0: X_train}`` (the resampled training
        features), ``labels`` and ``classes`` are two separate lists holding
        the resampled training targets, and ``X_test`` / ``y_test`` are the
        held-out, non-resampled rows.
    """
    df = pd.read_csv(filename)
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values

    X_t, X_test, y_t, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

    # fit_resample is the supported imbalanced-learn entry point; the old
    # fit_sample alias has been removed from current releases.
    # NOTE(review): SMOTE() is unseeded, so the resampled set differs between
    # runs even though the train/test split is fixed.
    X_train, y_train = SMOTE().fit_resample(X_t, y_t)

    # Callers receive two independent lists with the same content.
    labels = y_train.tolist()
    classes = y_train.tolist()
    input_database = {0: X_train}

    return input_database, labels, classes, X_test, y_test
# Five-fold stratified cross-validation: every fold is oversampled with
# EGSmote on its training part only and then scored by logistic_training.
print("---------------------------------------------------------")
kfold = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=True)
scorings = []
iter = 0
for iter, (train_index, test_index) in enumerate(kfold.split(X, y), start=1):
    print("Itertion: {} => Processing".format(iter))

    X_t, y_t = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Alternatives tried here earlier and currently disabled: SMOTE(),
    # OldGeometricSMOTE(), or no resampling at all (X_train, y_train = X_t, y_t).
    GSMOTE = EGSmote()
    X_train, y_train = GSMOTE.fit_resample(X_t, y_t)

    # Only logistic regression is evaluated; gradient_boosting, XGBoost, KNN
    # and decision_tree were further candidates and are currently disabled.
    performance1 = logistic_training(X_train, y_train, X_test, y_test)

    # One entry per evaluated classifier for this fold.
    fold_score = [performance1]
    scorings.append(fold_score)
# Partition the dataset
from sklearn.model_selection import train_test_split

date_file = "../../data/CICID-11372.csv"
X, y = pp.pre_process(date_file)

# X / y keep the 80% split used below; X_t / y_test receive the 20% hold-out.
X, X_t, y, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Project onto two principal components so the data can be drawn in 2D.
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# From here on ``gs`` refers to the sampler instance, not the class.
gs = gs()
X_resampled, y_resampled = gs.fit_resample(X, y)

# Reuse the projection fitted on the original data for the resampled data.
X_res_vis = pca.transform(X_resampled)

# Two side-by-side axes; the left one shows the data before resampling.
f, (ax1, ax2) = plt.subplots(1, 2)

# Labels are compared as the strings '0' and '1'.
class0_mask = y == '0'
class1_mask = y == '1'
scatter_style = dict(alpha=0.5, marker='.')

c0 = ax1.scatter(X_vis[class0_mask, 0], X_vis[class0_mask, 1],
                 label="Class #0", **scatter_style)
c1 = ax1.scatter(X_vis[class1_mask, 0], X_vis[class1_mask, 1],
                 label="Class #1", **scatter_style)
# Learning-curve experiment: F1 score of a linear classifier on the KDDCUP
# data for increasing training-set sizes.
from gsmote.comparison_testing import preprocessing as pp
import pandas as pd
from gsmote import EGSmote
# LinearRegression stays importable for any later code that still uses it.
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import learning_curve

date_file = "../../data/KDDCUP0.csv"
X, y = pp.pre_process(date_file)

# Oversampling is currently disabled; the sampler is only instantiated.
sm = EGSmote()
# X,y = sm.fit_resample(X,y)

# NOTE(review): 7000 sits between 600 and 1000 and may be a typo for 700.
# It is kept as written.
train_sizes = [
    100, 500, 600, 7000, 1000, 1500, 2000, 3000, 10000, 15000, 20000,
    30000, 40000, 50000, 60000, 70000
]

# "f1" is a classification metric and cannot score the continuous output of
# LinearRegression, so a classifier is used as the estimator instead.
# NOTE(review): scoring="f1" assumes binary labels with positive class 1;
# confirm against what pp.pre_process returns.
train_sizes, train_scores, validation_scores = learning_curve(
    estimator=LogisticRegression(),
    X=X,
    y=y,
    train_sizes=train_sizes,
    cv=16,
    shuffle=True,
    scoring="f1")

print('Training scores:\n\n', train_scores)
print('\n', '-' * 70)  # separator to make the output easy to read
print('\nValidation scores:\n\n', validation_scores)

# F1 is a "greater is better" score, so the fold means are reported as-is.
# The sign flip is only appropriate for negated error metrics.
train_scores_mean = train_scores.mean(axis=1)
validation_scores_mean = validation_scores.mean(axis=1)
print('Mean training scores\n\n', pd.Series(train_scores_mean, index=train_sizes))