def RF(opts):
    """Build the Random Forest model, prepare the data and hand both to ``train_ml``.

    The data returned by ``data_process(opts)`` is L2-normalized per feature
    column (``axis=0``) over the whole dataset, 33% of the rows are held out
    for testing, and a further 10% of the remaining rows is split off and
    left unused by this function.

    Args:
        opts: Run options. Passed to ``data_process``; ``opts.sets`` is
            forwarded to ``train_ml``.

    Returns:
        The ``ML_Model`` wrapper around the ``RandomForestClassifier``.
    """
    FOLDER = 'clean_vpn12_rf'
    param_grid = {
        # Keys must be real estimator parameter names: the original
        # 'critire' key / 'gain' value are not accepted by
        # RandomForestClassifier ('criterion' takes 'gini' or 'entropy').
        'criterion': ['gini', 'entropy'],
        'n_estimators': [200, 700],
        # 'auto' was an alias of 'sqrt' for classifiers and was removed in
        # scikit-learn 1.3, so only the two distinct options are listed.
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': np.arange(2, 30, 2),
        'max_depth': np.arange(2, 31),
    }
    classifier = RandomForestClassifier(n_jobs=-1, oob_score=True)
    rf = ML_Model("Random Forest", classifier, param_grid)

    X_train, y_train = data_process(opts)
    X_train = normalize(X_train, norm='l2', axis=0, copy=True, return_norm=False)
    X_train, X_test, y_train, y_test = train_test_split(
        X_train, y_train, test_size=0.33, random_state=42)
    # The validation part is not consumed here; the split is kept because it
    # determines which rows remain in the training set.
    X_train, _X_val, y_train, _y_val = train_test_split(
        X_train, y_train, test_size=0.1, random_state=42)

    size, dim = np.shape(X_train)[0], np.shape(X_train)[1]
    print(size, dim)

    rf.model_path = FOLDER
    train_ml(rf, X_train, y_train, X_test, y_test, opts.sets, FOLDER, random=True)
    return rf
def XGB(opts):
    """Build the XGBoost model, prepare the data and hand both to ``train_ml``.

    The data returned by ``data_process(opts)`` is L2-normalized per feature
    column (``axis=0``) over the whole dataset, 33% of the rows are held out
    for testing, and a further 10% of the remaining rows is split off and
    left unused by this function.

    Args:
        opts: Run options. Passed to ``data_process``; ``opts.sets`` is
            forwarded to ``train_ml``.

    Returns:
        The ``ML_Model`` wrapper around the ``XGBClassifier``.
    """
    FOLDER = 'clean_vpn12_xgb'
    classifier = XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        objective='multi:softmax',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
        # XGBoost's parameter is spelled 'num_class'; the original
        # 'num_classes' is not a recognised parameter name.
        num_class=12,
    )
    param_grid = {
        'max_depth': range(3, 10, 2),
        'min_child_weight': range(1, 6, 2),
        'gamma': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 1.5, 2, 5],
        'subsample': [i / 10.0 for i in range(5, 11)],
        'colsample_bytree': [i / 10.0 for i in range(5, 11)],
        'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
    }
    # A distinct name for the wrapper avoids rebinding one local to two
    # different kinds of object (estimator, then ML_Model).
    xgb_model = ML_Model('XGBoost', classifier, param_grid)

    X_train, y_train = data_process(opts)
    X_train = normalize(X_train, norm='l2', axis=0, copy=True, return_norm=False)
    X_train, X_test, y_train, y_test = train_test_split(
        X_train, y_train, test_size=0.33, random_state=42)
    # The validation part is not consumed here; the split is kept because it
    # determines which rows remain in the training set.
    X_train, _X_val, y_train, _y_val = train_test_split(
        X_train, y_train, test_size=0.1, random_state=42)

    size, dim = np.shape(X_train)[0], np.shape(X_train)[1]
    print(size, dim)

    xgb_model.model_path = FOLDER
    train_ml(xgb_model, X_train, y_train, X_test, y_test, opts.sets, FOLDER, random=True)
    return xgb_model
def NBMULTI(opts):
    """Prepare the data and pass a Multinomial Naive Bayes model to ``train_ml``.

    Features from ``data_process(opts)`` are max-normalized per column
    (``axis=0``) over the whole dataset. 30% of the rows are held out for
    testing and a further 10% of the remainder is split off and not used
    here. The model is wrapped in ``ML_Model`` without a parameter grid.

    Args:
        opts: Run options. Passed to ``data_process``; ``opts.sets`` is
            forwarded to ``train_ml``.

    Returns:
        The ``ML_Model`` wrapper around the ``MultinomialNB`` estimator.
    """
    model_dir = 'clean_vpn12_NB-Multi'
    estimator = MultinomialNB()

    features, labels = data_process(opts)
    features = normalize(
        features, norm='max', axis=0, copy=True, return_norm=False)

    fit_X, test_X, fit_y, test_y = train_test_split(
        features, labels, test_size=0.3, random_state=42)
    # Second split only shrinks the training portion; its validation half
    # is discarded by this function.
    fit_X, _val_X, fit_y, _val_y = train_test_split(
        fit_X, fit_y, test_size=0.1, random_state=42)

    fit_shape = np.shape(fit_X)
    n_rows = fit_shape[0]
    n_cols = fit_shape[1]
    print(n_rows, n_cols)

    nb = ML_Model("MultinomialNB", estimator, None)
    nb.model_path = model_dir
    train_ml(nb, fit_X, fit_y, test_X, test_y, opts.sets, model_dir, random=True)
    return nb
def LR(opts):
    """Prepare the data and pass a one-vs-rest logistic regression to ``train_ml``.

    Features from ``data_process(opts)`` are max-normalized per column
    (``axis=0``) over the whole dataset. 30% of the rows are held out for
    testing and a further 10% of the remainder is split off and not used
    here. The search space covers the regularisation strength ``C`` only.

    Args:
        opts: Run options. Passed to ``data_process``; ``opts.sets`` is
            forwarded to ``train_ml``.

    Returns:
        The ``ML_Model`` wrapper around the ``LogisticRegression`` estimator.
    """
    model_dir = 'clean_vpn12_lr'
    estimator = LogisticRegression(multi_class='ovr', penalty='l2')
    search_space = {
        'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    }
    lr = ML_Model("Log. Regression", estimator, search_space)

    features, labels = data_process(opts)
    features = normalize(
        features, norm='max', axis=0, copy=True, return_norm=False)

    fit_X, test_X, fit_y, test_y = train_test_split(
        features, labels, test_size=0.3, random_state=42)
    # Second split only shrinks the training portion; its validation half
    # is discarded by this function.
    fit_X, _val_X, fit_y, _val_y = train_test_split(
        fit_X, fit_y, test_size=0.1, random_state=42)

    fit_shape = np.shape(fit_X)
    n_rows = fit_shape[0]
    n_cols = fit_shape[1]
    print(n_rows, n_cols)

    lr.model_path = model_dir
    train_ml(lr, fit_X, fit_y, test_X, test_y, opts.sets, model_dir, random=True)
    return lr
def LINSVC(opts):
    """Prepare the data and pass a linear SVM to ``train_ml``.

    Features from ``data_process(opts)`` are L2-normalized per column
    (``axis=0``) over the whole dataset. 33% of the rows are held out for
    testing and a further 10% of the remainder is split off and not used
    here. The search space covers ``C`` in ``range(1, 200, 50)``.

    Args:
        opts: Run options. Passed to ``data_process``; ``opts.sets`` is
            forwarded to ``train_ml``.

    Returns:
        The ``ML_Model`` wrapper around the ``svm.LinearSVC`` estimator.
    """
    # NOTE(review): this directory says 'vpnn' while the sibling builders
    # use 'vpn12' - confirm whether the difference is intentional.
    model_dir = 'clean_vpnn_linearsvc'
    estimator = svm.LinearSVC()
    search_space = {'C': range(1, 200, 50)}
    svmsvc = ML_Model("SVM-Linear", estimator, search_space)

    features, labels = data_process(opts)
    features = normalize(
        features, norm='l2', axis=0, copy=True, return_norm=False)

    fit_X, test_X, fit_y, test_y = train_test_split(
        features, labels, test_size=0.33, random_state=42)
    # Second split only shrinks the training portion; its validation half
    # is discarded by this function.
    fit_X, _val_X, fit_y, _val_y = train_test_split(
        fit_X, fit_y, test_size=0.1, random_state=42)

    fit_shape = np.shape(fit_X)
    n_rows = fit_shape[0]
    n_cols = fit_shape[1]
    print(n_rows, n_cols)

    svmsvc.model_path = model_dir
    train_ml(svmsvc, fit_X, fit_y, test_X, test_y, opts.sets, model_dir, random=True)
    return svmsvc
def SVMSVC(opts):
    """Prepare the data and pass a kernel SVM (``svm.SVC``) to ``train_ml``.

    Features from ``data_process(opts)`` are L2-normalized per column
    (``axis=0``) over the whole dataset. 33% of the rows are held out for
    testing and a further 10% of the remainder is split off and not used
    here. The search space is log-spaced: ``C`` in 1e-2..1e10 and ``gamma``
    in 1e-9..1e3, 13 points each.

    Args:
        opts: Run options. Passed to ``data_process``; ``opts.sets`` is
            forwarded to ``train_ml``.

    Returns:
        The ``ML_Model`` wrapper around the ``svm.SVC`` estimator.
    """
    model_dir = 'clean_vpn12_svc'
    estimator = svm.SVC()
    c_values = np.logspace(-2, 10, 13)
    gamma_values = np.logspace(-9, 3, 13)
    search_space = {'gamma': gamma_values, 'C': c_values}

    features, labels = data_process(opts)
    features = normalize(
        features, norm='l2', axis=0, copy=True, return_norm=False)

    fit_X, test_X, fit_y, test_y = train_test_split(
        features, labels, test_size=0.33, random_state=42)
    # Second split only shrinks the training portion; its validation half
    # is discarded by this function.
    fit_X, _val_X, fit_y, _val_y = train_test_split(
        fit_X, fit_y, test_size=0.1, random_state=42)

    fit_shape = np.shape(fit_X)
    n_rows = fit_shape[0]
    n_cols = fit_shape[1]
    print(n_rows, n_cols)

    svmsvc = ML_Model('SVM-SVC', estimator, search_space)
    svmsvc.model_path = model_dir
    train_ml(svmsvc, fit_X, fit_y, test_X, test_y, opts.sets, model_dir, random=True)
    return svmsvc
def DTree(opts):
    """Prepare the data and pass a decision tree to ``train_ml``.

    Features from ``data_process(opts)`` are L2-normalized per column
    (``axis=0``) over the whole dataset. 33% of the rows are held out for
    testing and a further 10% of the remainder is split off and not used
    here. The search space is a list of four independent sub-grids:
    impurity thresholds for each criterion, tree depth, and minimum split
    size.

    Args:
        opts: Run options. Passed to ``data_process``; ``opts.sets`` is
            forwarded to ``train_ml``.

    Returns:
        The ``ML_Model`` wrapper around the ``DecisionTreeClassifier``.
    """
    model_dir = 'clean_vpn12_dtr'
    estimator = DecisionTreeClassifier()

    # Each criterion gets its own threshold range: 0..1 for entropy and
    # 0..0.5 for gini, 50 evenly spaced points each.
    entropy_grid = {
        'criterion': ['entropy'],
        'min_impurity_decrease': np.linspace(0, 1, 50),
    }
    gini_grid = {
        'criterion': ['gini'],
        'min_impurity_decrease': np.linspace(0, 0.5, 50),
    }
    depth_grid = {'max_depth': np.arange(2, 31)}
    split_grid = {'min_samples_split': np.arange(2, 30, 2)}
    search_space = [entropy_grid, gini_grid, depth_grid, split_grid]

    dtree = ML_Model('Decision Tree', estimator, search_space)

    features, labels = data_process(opts)
    features = normalize(
        features, norm='l2', axis=0, copy=True, return_norm=False)

    fit_X, test_X, fit_y, test_y = train_test_split(
        features, labels, test_size=0.33, random_state=42)
    # Second split only shrinks the training portion; its validation half
    # is discarded by this function.
    fit_X, _val_X, fit_y, _val_y = train_test_split(
        fit_X, fit_y, test_size=0.1, random_state=42)

    fit_shape = np.shape(fit_X)
    n_rows = fit_shape[0]
    n_cols = fit_shape[1]
    print(n_rows, n_cols)

    dtree.model_path = model_dir
    train_ml(dtree, fit_X, fit_y, test_X, test_y, opts.sets, model_dir, random=True)
    return dtree
def ALLModels(opts):
    """Build several models on one shared data split and train them together.

    The data returned by ``data_process(opts)`` is max-normalized per feature
    column (``axis=0``) over the whole dataset, 30% of the rows are held out
    for testing, and a further 10% of the remaining rows is split off and
    left unused by this function. Random Forest, Logistic Regression,
    Bernoulli Naive Bayes, XGBoost and SVC wrappers are then created, passed
    to ``final_train`` and summarised with
    ``ML_Model.models_metric_summary``.

    Args:
        opts: Run options. Passed to ``data_process``; ``opts.sets`` is
            forwarded to ``final_train``.

    Returns:
        None.
    """
    models = []

    X_train, y_train = data_process(opts)
    X_train = normalize(X_train, norm='max', axis=0, copy=True, return_norm=False)
    X_train, X_test, y_train, y_test = train_test_split(
        X_train, y_train, test_size=0.3, random_state=42)
    # The validation part is not consumed here; the split is kept because it
    # determines which rows remain in the training set.
    X_train, _X_val, y_train, _y_val = train_test_split(
        X_train, y_train, test_size=0.1, random_state=42)

    # --- Random Forest ---
    FOLDER = 'clean_vpn12_rf'
    param_grid = {
        # Keys must be real estimator parameter names: the original
        # 'critire' key / 'gain' value are not accepted by
        # RandomForestClassifier ('criterion' takes 'gini' or 'entropy').
        'criterion': ['gini', 'entropy'],
        'n_estimators': [200, 700],
        # 'auto' was an alias of 'sqrt' for classifiers and was removed in
        # scikit-learn 1.3, so only the two distinct options are listed.
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': np.arange(2, 30, 2),
        'max_depth': np.arange(2, 31),
    }
    classifier = RandomForestClassifier(n_jobs=-1, oob_score=True)
    rf = ML_Model("Random Forest", classifier, param_grid)
    rf.model_path = FOLDER
    models.append(rf)

    # --- Logistic Regression ---
    FOLDER = 'clean_vpn12_lr'
    classifier = LogisticRegression(multi_class='ovr', penalty='l2')
    param_grid = dict(C=[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000])
    lr = ML_Model("Log. Regression", classifier, param_grid)
    lr.model_path = FOLDER
    models.append(lr)

    # --- Bernoulli Naive Bayes (no parameter grid) ---
    classifier = BernoulliNB()
    FOLDER = 'clean_vpn12_NB-Bonulina'
    nb = ML_Model("NB-Bernoulli", classifier, None)
    nb.model_path = FOLDER
    models.append(nb)

    # --- XGBoost ---
    FOLDER = 'clean_vpn12_xgb'
    classifier = XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        objective='multi:softmax',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
        # XGBoost's parameter is spelled 'num_class'; the original
        # 'num_classes' is not a recognised parameter name.
        num_class=12,
    )
    param_grid = {
        'max_depth': range(3, 10, 2),
        'min_child_weight': range(1, 6, 2),
        'gamma': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 1.5, 2, 5],
        'subsample': [i / 10.0 for i in range(5, 11)],
        'colsample_bytree': [i / 10.0 for i in range(5, 11)],
        'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
    }
    xgb_model = ML_Model('XGBoost', classifier, param_grid)
    xgb_model.model_path = FOLDER
    models.append(xgb_model)

    # --- SVC ---
    FOLDER = 'clean_vpn12_svc'
    classifier = svm.SVC()
    C_range = np.logspace(-2, 10, 13)
    gamma_range = np.logspace(-9, 3, 13)
    param_grid = dict(gamma=gamma_range, C=C_range)
    svmsvc = ML_Model('SVM-SVC', classifier, param_grid)
    svmsvc.model_path = FOLDER
    models.append(svmsvc)

    final_train(models, X_train, y_train, X_test, y_test, opts.sets)
    ML_Model.models_metric_summary(models)