def stackedmodel_p(train, query, train_cols):
    """Train a stacked ensemble (Random Forest + AdaBoost of shallow trees, with a
    logistic-regression meta-learner) on `train`, report 30-fold CV accuracy, and
    predict the 'malicious' label for `query`.

    Parameters
    ----------
    train : DataFrame holding the feature columns `train_cols` and a 'malicious'
        target column; its feature columns are scaled in place.
    query : DataFrame holding `train_cols` and a 'URL' column; mutated in place
        (features scaled, a 'result' prediction column is added).
    train_cols : list of feature column names used for training and prediction.

    Returns
    -------
    The predicted 'result' Series for `query`.
    """
    # Instantiate the base learners and the meta-classifier.
    rf = RandomForestClassifier(n_estimators=150)
    hybrid_model_AB = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=10)
    lrd = LogisticRegression(solver='lbfgs', max_iter=300)
    # BUG FIX: the meta-classifier was the undefined name `lra` (NameError at runtime);
    # it must be the LogisticRegression instance created above (`lrd`).
    clf_stackd = StackingClassifier(classifiers=[rf, hybrid_model_AB],
                                    meta_classifier=lrd,
                                    use_probas=True,
                                    use_features_in_secondary=True)
    # Clean the data by standardizing the feature columns (mutates both frames in place).
    train[train_cols] = preprocessing.scale(train[train_cols])
    query[train_cols] = preprocessing.scale(query[train_cols])
    # Fit the stacked model.
    print(clf_stackd.fit(train[train_cols], train['malicious']))
    # Cross-validate and report the model's performance.
    scores = cv.cross_val_score(clf_stackd, train[train_cols], train['malicious'], cv=30)
    # BUG FIX: the report previously named the wrong ensemble members
    # ("Gaussian Naive Bayes"); this stack is Random Forest + AdaBoost.
    print('Estimated score Random Forest & AdaBoost: %0.5f (+/- %0.5f)'
          % (scores.mean(), scores.std() / 2))
    # Predict the target for the query set.
    query['result'] = clf_stackd.predict(query[train_cols])
    # Print the predicted results.
    print(query[['URL', 'result']])
    return query['result']
def stackedmodel_n(train, query, train_cols):
    """Train a stacked ensemble (Random Forest + Decision Tree, with a
    logistic-regression meta-learner) on `train`, report 30-fold CV accuracy, and
    predict the 'malicious' label for `query`.

    Parameters
    ----------
    train : DataFrame holding the feature columns `train_cols` and a 'malicious'
        target column; its feature columns are scaled in place.
    query : DataFrame holding `train_cols` and a 'URL' column; mutated in place
        (features scaled, a 'result' prediction column is added).
    train_cols : list of feature column names used for training and prediction.

    Returns
    -------
    The predicted 'result' Series for `query`.
    """
    # Instantiate the base learners and the meta-classifier.
    rf = RandomForestClassifier(n_estimators=150)
    tree = DecisionTreeClassifier(min_impurity_decrease=0)
    lrb = LogisticRegression(solver='lbfgs', max_iter=300)
    # BUG FIX: the meta-classifier was the undefined name `lra` (NameError at runtime);
    # it must be the LogisticRegression instance created above (`lrb`).
    clf_stackb = StackingClassifier(classifiers=[rf, tree],
                                    meta_classifier=lrb,
                                    use_probas=True,
                                    use_features_in_secondary=True)
    # Clean the data by standardizing the feature columns (mutates both frames in place).
    train[train_cols] = preprocessing.scale(train[train_cols])
    query[train_cols] = preprocessing.scale(query[train_cols])
    # Fit the stacked model.
    print(clf_stackb.fit(train[train_cols], train['malicious']))
    # Cross-validate and report the model's performance.
    scores = cv.cross_val_score(clf_stackb, train[train_cols], train['malicious'], cv=30)
    print('Estimated score Random Forest & Decision Tree: %0.5f (+/- %0.5f)'
          % (scores.mean(), scores.std() / 2))
    # Predict the target for the query set.
    query['result'] = clf_stackb.predict(query[train_cols])
    # Print the predicted results.
    print(query[['URL', 'result']])
    return query['result']
def stackedmodel_e(train, query, train_cols):
    """Train a six-way stacked ensemble (Random Forest, Gaussian NB, Decision Tree,
    MLP, AdaBoost and SVM, with a logistic-regression meta-learner) on `train`,
    report 30-fold CV accuracy, and predict the 'malicious' label for `query`.

    Parameters
    ----------
    train : DataFrame holding the feature columns `train_cols` and a 'malicious'
        target column; its feature columns are scaled in place.
    query : DataFrame holding `train_cols` and a 'URL' column; mutated in place
        (features scaled, a 'result' prediction column is added).
    train_cols : list of feature column names used for training and prediction.

    Returns
    -------
    The predicted 'result' Series for `query`.
    """
    # Instantiate the base learners and the meta-classifier.
    rf = RandomForestClassifier(n_estimators=150)
    gnb = GaussianNB()
    tree = DecisionTreeClassifier(min_impurity_decrease=0)
    mlp = MLPClassifier(alpha=0.001, hidden_layer_sizes=([100, 100, 100]), max_iter=900)
    hybrid_model_AB = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=10)
    clf = svm.SVC()
    lrm = LogisticRegression(solver='lbfgs', max_iter=1000)
    # BUG FIX: the stack referenced the undefined names `clf_stackz` and `lrz`
    # (NameError at runtime) and left five of the instantiated base learners unused.
    # Stack all six base classifiers, as the report string below describes, with
    # the LogisticRegression created above (`lrm`) as the meta-classifier.
    clf_stackm = StackingClassifier(classifiers=[rf, gnb, tree, mlp, hybrid_model_AB, clf],
                                    meta_classifier=lrm,
                                    use_probas=True,
                                    use_features_in_secondary=True)
    # Clean the data by standardizing the feature columns (mutates both frames in place).
    train[train_cols] = preprocessing.scale(train[train_cols])
    query[train_cols] = preprocessing.scale(query[train_cols])
    # Fit the stacked model.
    print(clf_stackm.fit(train[train_cols], train['malicious']))
    # Cross-validate and report the model's performance.
    scores = cv.cross_val_score(clf_stackm, train[train_cols], train['malicious'], cv=30)
    print('Estimated score Random Forest,Gaussian Naive Bayes,Decision Forest,MLPs,AdaBoost & SVM: %0.5f (+/- %0.5f)'
          % (scores.mean(), scores.std() / 2))
    # Predict the target for the query set.
    query['result'] = clf_stackm.predict(query[train_cols])
    # Print the predicted results.
    print(query[['URL', 'result']])
    return query['result']
def modelfit(alg, dtrain, dtest, predictors, target, IDcol, filename):
    """Fit `alg` on the training data, report train RMSE and 20-fold CV RMSE,
    predict `target` for the test set, and export a submission CSV.

    Parameters
    ----------
    alg : a fitted-API estimator (has .fit / .predict).
    dtrain : training DataFrame holding `predictors` and `target` columns.
    dtest : test DataFrame holding `predictors`; mutated in place (a `target`
        prediction column is added).
    predictors : list of feature column names.
    target : name of the target column.
    IDcol : list of identifier column names to carry into the submission file;
        NOTE: mutated in place (`target` is appended).
    filename : path of the CSV submission file to write.
    """
    # Fit the algorithm on the data.
    alg.fit(dtrain[predictors], dtrain[target])
    # Predict on the training set.
    dtrain_predictions = alg.predict(dtrain[predictors])
    # Perform cross-validation.
    # BUG FIX: 'mean_squared_error' is the removed legacy scorer name; modern
    # scikit-learn requires 'neg_mean_squared_error'. The np.abs below already
    # makes the computation sign-agnostic, so the RMSE math is unchanged.
    cv_score = cross_validate.cross_val_score(alg, dtrain[predictors], dtrain[target],
                                              cv=20, scoring='neg_mean_squared_error')
    cv_score = np.sqrt(np.abs(cv_score))
    # Print the model report.
    # BUG FIX: converted Python-2 `print` statements to calls — the rest of this
    # file uses Python-3 print calls, so the py2 statements were syntax errors.
    print("\nModel Report")
    print("RMSE : %.4g" % np.sqrt(metrics.mean_squared_error(dtrain[target].values,
                                                             dtrain_predictions)))
    print("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (
        np.mean(cv_score), np.std(cv_score), np.min(cv_score), np.max(cv_score)))
    # Predict on the testing data.
    dtest[target] = alg.predict(dtest[predictors])
    # Export the submission file.
    IDcol.append(target)
    submission = pd.DataFrame({x: dtest[x] for x in IDcol})
    submission.to_csv(filename, index=False)
def forest_classifier(train, query, train_cols):
    """Train a Random Forest on `train`, report 30-fold CV accuracy, and predict
    the 'firmware_bool' label for `query`.

    Parameters
    ----------
    train : DataFrame holding the feature columns `train_cols` and a
        'firmware_bool' target column.
    query : DataFrame holding `train_cols` and a 'url' column; mutated in place
        (a 'result' prediction column is added).
    train_cols : list of feature column names used for training and prediction.

    Returns
    -------
    The predicted 'result' Series for `query`.
    """
    rf = RandomForestClassifier(n_estimators=150)
    # BUG FIX: converted Python-2 `print` statements to calls — the rest of this
    # file uses Python-3 print calls, so the py2 statements were syntax errors.
    print(rf.fit(train[train_cols], train['firmware_bool']))
    # Cross-validate and report the model's performance.
    scores = cv.cross_val_score(rf, train[train_cols], train['firmware_bool'], cv=30)
    print('Estimated score RandomForestClassifier: %0.5f (+/- %0.5f)'
          % (scores.mean(), scores.std() / 2))
    # Predict the target for the query set and show the results.
    query['result'] = rf.predict(query[train_cols])
    print(query[['url', 'result']])
    # Return the predictions, consistent with the stackedmodel_* functions.
    return query['result']
def cross_validation():
    """Load the dataset, fit an RBF-kernel SVM on all rows (last column is the
    label), report 5-fold CV accuracy, and return the fitted classifier.

    Returns
    -------
    The fitted SVC instance.
    """
    dataset = load_data()
    print(dataset)
    row, col = dataset.shape
    # Split into features (all but the last column) and labels (last column).
    X = dataset[:, :col - 1]
    y = dataset[:, -1]
    clf = SVC(kernel='rbf', C=1000)
    # BUG FIX: the original called clf.fit(X, Y) with the undefined name `Y`
    # (NameError at runtime); the label vector defined above is `y`.
    clf.fit(X, y)
    # BUG FIX: dropped the long-removed `score_func=None` keyword — modern
    # cross_val_score takes `scoring=` instead, and None was the default anyway.
    scores = cs.cross_val_score(clf, X, y, cv=5)
    print("Accuracy: %0.2f (+- %0.2f)" % (scores.mean(), scores.std()))
    return clf
def svm_classifier(train, query, train_cols):
    """Train an SVM on `train`, report 30-fold CV accuracy, and predict the
    'firmware_bool' label for `query`.

    Parameters
    ----------
    train : DataFrame of feature data holding `train_cols` and a 'firmware_bool'
        target column; its feature columns are scaled in place.
    query : DataFrame of feature data to be classified (string features removed),
        holding `train_cols` and a 'url' column; mutated in place (features
        scaled, a 'result' prediction column is added).
    train_cols : list of feature column names used for training and prediction.

    Returns
    -------
    The predicted 'result' Series for `query`.
    """
    # The classifier.
    clf = svm.SVC()
    # Standardize the feature columns (mutates both frames in place).
    train[train_cols] = preprocessing.scale(train[train_cols])
    query[train_cols] = preprocessing.scale(query[train_cols])
    # BUG FIX: converted Python-2 `print` statements to calls — the rest of this
    # file uses Python-3 print calls, so the py2 statements were syntax errors.
    print(clf.fit(train[train_cols], train['firmware_bool']))
    # Cross-validate and report the model's performance.
    scores = cv.cross_val_score(clf, train[train_cols], train['firmware_bool'], cv=30)
    print('Estimated score SVM: %0.5f (+/- %0.5f)' % (scores.mean(), scores.std() / 2))
    # Predict the target for the query set and show the results.
    query['result'] = clf.predict(query[train_cols])
    print(query[['url', 'result']])
    # Return the predictions, consistent with the stackedmodel_* functions.
    return query['result']
def svm_classifier(train, query, train_cols):
    """Train an SVM on `train`, report 30-fold CV accuracy, and predict the
    'malicious' label for `query`.

    Parameters
    ----------
    train : DataFrame holding the feature columns `train_cols` and a 'malicious'
        target column; its feature columns are scaled in place.
    query : DataFrame holding `train_cols` and a 'URL' column; mutated in place
        (features scaled, a 'result' prediction column is added).
    train_cols : list of feature column names used for training and prediction.

    Returns
    -------
    The predicted 'result' Series for `query`.
    """
    clf = svm.SVC()
    # Standardize the feature columns (mutates both frames in place).
    train[train_cols] = preprocessing.scale(train[train_cols])
    query[train_cols] = preprocessing.scale(query[train_cols])
    # Fit the model.
    print(clf.fit(train[train_cols], train['malicious']))
    # Cross-validate and report the model's performance.
    scores = cv.cross_val_score(clf, train[train_cols], train['malicious'], cv=30)
    print('Estimated score SVM: %0.5f (+/- %0.5f)' % (scores.mean(), scores.std() / 2))
    # Predict the target for the query set and show the results.
    query['result'] = clf.predict(query[train_cols])
    print(query[['URL', 'result']])
    # CONSISTENCY FIX: every parallel model function (stackedmodel_*) returns the
    # prediction Series; this one previously returned None implicitly.
    return query['result']