colsample_btree=1, # 所有特征建立决策树 scale_pos_weight=1, # 解决样本个数不平衡的问题 random_state=27, # 随机数 slient=0, ) xg.fit(feature_train_balance, label_train_balance) xg_pred = xg.predict_proba(feature_test_balance)[:, 1] xg_evaluation = valid.evaluate( label_test_balance, xg_pred, save_path="../data/xg_evaluation.json" ) plot_evaluation(label_test_balance, xg_pred, "../figure", method="XG") #%% lb = LogitBoost(n_estimators=200, random_state=0) # base_estimator=LogisticRegression() lb.fit(feature_train_balance, label_train_balance) lb_pred = lb.predict_proba(feature_test_balance)[:, 1] lb_evaluation = evaluate( label_test_balance, lb_pred, save_path="../data/lb_evaluation.json" ) plot_evaluation(label_test_balance, lb_pred, "../figure", method="LB") #%% from feature import valid #%% # Auto-tunan_column_listne model for pandemic # 1. XGboost # 2. XGboost - additive learning # 3. LogisticBoosting - additive learning # 4. dummy Logistic # 5. Transfer Logistic from preprocess import transform
def _toy_dataset_test(load_func, test_size=(1. / 3), random_state=0,
                      min_score_train=0.9, min_score_test=0.9):
    """Run the shared LogitBoost sanity checks on a toy dataset.

    Parameters
    ----------
    load_func : callable
        Zero-argument loader returning an object with ``data``, ``target``
        and ``target_names`` attributes.
    test_size : float
        Forwarded to ``train_test_split``.
    random_state : int
        Seed used both for the train/test split and for each model.
    min_score_train, min_score_test : float
        Lowest acceptable ``score()`` on the training / testing samples.

    Raises
    ------
    AssertionError
        If any accuracy, shape, staged-output or contribution check fails.
    """
    # Load the data; labels are the class names rather than integer codes
    dataset = load_func()
    class_names = dataset.target_names
    features = dataset.data
    labels = class_names[dataset.target]

    n_classes = len(class_names)
    is_binary = n_classes == 2

    # Stratified, shuffled train/test split
    split = train_test_split(features, labels, test_size=test_size,
                             shuffle=True, stratify=labels,
                             random_state=random_state)
    X_train, X_test, y_train, y_test = split

    for bootstrap in (True, False):
        model = LogitBoost(bootstrap=bootstrap, random_state=random_state)
        model.fit(X_train, y_train)

        # Accuracy on both samples must reach the requested minimum
        train_accuracy = model.score(X_train, y_train)
        test_accuracy = model.score(X_test, y_test)
        assert score_ok(train_accuracy, min_score_train), \
            ("Failed with bootstrap=%s: training score %.3f less than %.3f"
             % (bootstrap, train_accuracy, min_score_train))
        assert score_ok(test_accuracy, min_score_test), \
            ("Failed with bootstrap=%s: testing score %.3f less than %.3f"
             % (bootstrap, test_accuracy, min_score_test))

        probabilities = model.predict_proba(X_test)
        decision = model.decision_function(X_test)
        n_test = X_test.shape[0]

        # predict_proba() is always (n_samples, n_classes), whereas
        # decision_function() is 1-D for binary problems
        assert probabilities.shape == (n_test, n_classes)
        expected_decision_shape = \
            (n_test, ) if is_binary else (n_test, n_classes)
        assert decision.shape == expected_decision_shape

        # The final stage of every staged method must agree with the
        # corresponding non-staged method
        stages_predict = np.asarray(list(model.staged_predict(X_test)))
        stages_proba = np.asarray(list(model.staged_predict_proba(X_test)))
        stages_decision = \
            np.asarray(list(model.staged_decision_function(X_test)))
        stages_score = np.asarray(list(model.staged_score(X_test, y_test)))

        np.testing.assert_equal(stages_predict[-1], model.predict(X_test))
        np.testing.assert_almost_equal(stages_proba[-1],
                                       model.predict_proba(X_test))
        np.testing.assert_almost_equal(stages_decision[-1],
                                       model.decision_function(X_test))
        np.testing.assert_almost_equal(stages_score[-1],
                                       model.score(X_test, y_test))

        # contributions() yields one non-negative value per estimator
        contributions = model.contributions(X_train)
        assert contributions.shape == (model.n_estimators, )
        assert np.all(contributions >= 0)


def score_ok(score, minimum):
    """Return whether ``score`` reaches ``minimum``."""
    return score >= minimum
def model_comp(X_train, X_test, y_train, y_test, title=""):
    """Fit eight classifiers, report their test-set log loss, return three.

    Every model is trained on ``(X_train, y_train)`` and scored with
    ``log_loss`` on its ``predict_proba(X_test)`` output.  The report is
    printed and also handed to ``write_to_file``.

    Args:
        X_train: Training features, passed to each model's ``fit``.
        X_test: Test features, passed to each model's ``predict_proba``.
        y_train: Training labels.
        y_test: Test labels; must support ``len()``.
        title: Heading placed on the first line of the report.

    Returns:
        The fitted ``(xgboost_model, random_forest_model, adaboost_model)``.
    """
    seed = 8

    # Insertion order is the order in which the models are fitted.
    models = {
        "xgboost": XGBClassifier(
            learning_rate=0.01, max_depth=3, n_estimators=700,
            random_state=seed),
        "gradient_boost": GradientBoostingClassifier(
            learning_rate=0.01, max_depth=4, max_features='log2',
            min_samples_leaf=4, n_estimators=280, subsample=0.25,
            random_state=seed),
        "random_forest": RandomForestClassifier(
            n_estimators=300, max_depth=3, verbose=1, random_state=seed),
        "svm": SVC(
            kernel='poly', probability=True, verbose=1, random_state=seed),
        "knn": KNeighborsClassifier(n_neighbors=3),
        # Seeded like every other stochastic model here; without it the
        # MLP draws its initial weights from the global RNG and the "ELM"
        # log loss changes from run to run.
        "elm": MLPClassifier(
            hidden_layer_sizes=(80, ), activation='logistic',
            learning_rate_init=0.01, verbose=1, random_state=seed),
        "adaboost": AdaBoostClassifier(
            n_estimators=300, learning_rate=0.01, random_state=seed),
        "logitboost": LogitBoost(
            n_estimators=300, learning_rate=0.01, random_state=seed),
    }

    for model in models.values():
        model.fit(X_train, y_train)

    # Prediction order is kept explicit so verbose output from the
    # estimators appears in the same sequence as before.
    scoring_order = (
        "random_forest", "gradient_boost", "xgboost", "svm",
        "knn", "elm", "adaboost", "logitboost",
    )
    losses = {
        name: log_loss(y_test, models[name].predict_proba(X_test))
        for name in scoring_order
    }

    separator = "\n------------------"
    report = "".join([
        "\n" + title,
        "\nLength of test data: " + str(len(y_test)),
        separator,
        "\nGradient Boost Log Loss " + str(losses["gradient_boost"]),
        "\nRandom Forest Log Loss " + str(losses["random_forest"]),
        "\nXGBoost Log Loss " + str(losses["xgboost"]),
        separator,
        "\nSVM Log Loss " + str(losses["svm"]),
        "\nKNN Log Loss " + str(losses["knn"]),
        "\nELM Log Loss " + str(losses["elm"]),
        "\nAdaBoost Log Loss " + str(losses["adaboost"]),
        "\nLogitBoost Log Loss " + str(losses["logitboost"]),
    ])

    print(report)
    write_to_file(report)

    return models["xgboost"], models["random_forest"], models["adaboost"]