Example #1
from feature import valid
from logitboost import LogitBoost
from xgboost import XGBClassifier

# NOTE: snippet truncated upstream; earlier XGBClassifier arguments are missing.
xg = XGBClassifier(
    colsample_bytree=1,  # use all features when building each tree
    scale_pos_weight=1,  # positive-class weight for imbalanced samples
    random_state=27,  # random seed
    silent=0,  # legacy verbosity flag (0 = print training messages)
)
xg.fit(feature_train_balance, label_train_balance)
xg_pred = xg.predict_proba(feature_test_balance)[:, 1]
xg_evaluation = valid.evaluate(
    label_test_balance, xg_pred, save_path="../data/xg_evaluation.json"
)
plot_evaluation(label_test_balance, xg_pred, "../figure", method="XG")
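#%%
# `valid.evaluate` and `plot_evaluation` are project-local helpers not shown
# here. As a hedged illustration only (not the project's actual code), an
# evaluate() of this shape might compute threshold-free metrics from the
# predicted probabilities and optionally dump them to JSON:
import json

from sklearn.metrics import average_precision_score, roc_auc_score

def evaluate_sketch(y_true, y_score, save_path=None):
    """Sketch of an evaluate() helper: score, optionally persist, return."""
    metrics = {
        "roc_auc": roc_auc_score(y_true, y_score),
        "average_precision": average_precision_score(y_true, y_score),
    }
    if save_path is not None:
        with open(save_path, "w") as f:
            json.dump(metrics, f, indent=2)
    return metrics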
#%%

lb = LogitBoost(n_estimators=200, random_state=0)  # base_estimator=LogisticRegression()
lb.fit(feature_train_balance, label_train_balance)
lb_pred = lb.predict_proba(feature_test_balance)[:, 1]
lb_evaluation = valid.evaluate(
    label_test_balance, lb_pred, save_path="../data/lb_evaluation.json"
)
plot_evaluation(label_test_balance, lb_pred, "../figure", method="LB")
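#%%
# LogitBoost fits regression base learners to its working responses, so a
# custom `base_estimator` should be a regressor (the package default is a
# depth-1 DecisionTreeRegressor stump). A sketch with deeper regression
# trees; the max_depth value here is an illustrative assumption:
from sklearn.tree import DecisionTreeRegressor

lb_deep = LogitBoost(base_estimator=DecisionTreeRegressor(max_depth=3),
                     n_estimators=200, random_state=0)
lb_deep.fit(feature_train_balance, label_train_balance)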
#%%
# Auto-tune models for the pandemic data; a continued-training sketch for
# item 2 follows below.
# 1. XGBoost
# 2. XGBoost - additive learning
# 3. LogitBoost - additive learning
# 4. dummy Logistic
# 5. Transfer Logistic
from preprocess import transform
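#%%
# "Additive learning" in item 2 above refers to continued training: XGBoost
# can resume boosting from an existing booster via the `xgb_model` argument
# of fit(), adding new trees on top of the old ones. A minimal
# self-contained sketch (synthetic data; the old/new split is illustrative):
from sklearn.datasets import make_classification
from xgboost import XGBClassifier

X_demo, y_demo = make_classification(n_samples=1000, random_state=27)
X_old, y_old = X_demo[:600], y_demo[:600]
X_new, y_new = X_demo[600:], y_demo[600:]

base_model = XGBClassifier(n_estimators=200, random_state=27)
base_model.fit(X_old, y_old)

# Continue boosting from the fitted trees instead of starting from scratch.
updated_model = XGBClassifier(n_estimators=100, random_state=27)
updated_model.fit(X_new, y_new, xgb_model=base_model.get_booster())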
Example #2

import numpy as np
from logitboost import LogitBoost
from sklearn.model_selection import train_test_split


def _toy_dataset_test(load_func,
                      test_size=(1. / 3),
                      random_state=0,
                      min_score_train=0.9,
                      min_score_test=0.9):
    """Create a classification unit test from a scikit-learn toy dataset."""
    # Fetch the dataset
    data = load_func()
    X = data.data
    y = data.target_names[data.target]

    # Distinct classes
    classes = data.target_names
    n_classes = len(classes)

    # Binary/multiclass classification indicator
    is_binary = (n_classes == 2)

    # Shuffle data and split it into training/testing samples
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=test_size, shuffle=True, stratify=y,
                         random_state=random_state)

    for bootstrap in (True, False):
        # Fit a LogitBoost model
        logitboost = LogitBoost(bootstrap=bootstrap, random_state=random_state)
        logitboost.fit(X_train, y_train)

        # Compute accuracy scores and assert minimum accuracy
        score_train = logitboost.score(X_train, y_train)
        score_test = logitboost.score(X_test, y_test)
        assert score_train >= min_score_train, \
            ("Failed with bootstrap=%s: training score %.3f less than %.3f"
             % (bootstrap, score_train, min_score_train))
        assert score_test >= min_score_test, \
            ("Failed with bootstrap=%s: testing score %.3f less than %.3f"
             % (bootstrap, score_test, min_score_test))

        # Get probabilities and the decision function
        predict_proba = logitboost.predict_proba(X_test)
        decision_function = logitboost.decision_function(X_test)

        # predict_proba() should always return (n_samples, n_classes)
        assert predict_proba.shape == (X_test.shape[0], n_classes)

        # decision_function() shape depends on the classification task
        if is_binary:
            assert decision_function.shape == (X_test.shape[0], )
        else:
            assert decision_function.shape == (X_test.shape[0], n_classes)

        # Check that the last item of a staged method is the same as a regular
        # method
        staged_predict = np.asarray(list(logitboost.staged_predict(X_test)))
        staged_predict_proba = \
            np.asarray(list(logitboost.staged_predict_proba(X_test)))
        staged_decision_function = \
            np.asarray(list(logitboost.staged_decision_function(X_test)))
        staged_score = \
            np.asarray(list(logitboost.staged_score(X_test, y_test)))

        np.testing.assert_equal(staged_predict[-1], logitboost.predict(X_test))
        np.testing.assert_almost_equal(staged_predict_proba[-1],
                                       logitboost.predict_proba(X_test))
        np.testing.assert_almost_equal(staged_decision_function[-1],
                                       logitboost.decision_function(X_test))
        np.testing.assert_almost_equal(staged_score[-1],
                                       logitboost.score(X_test, y_test))

        # contributions() should return one non-negative number for each
        # estimator in the ensemble
        contrib = logitboost.contributions(X_train)
        assert contrib.shape == (logitboost.n_estimators, )
        assert np.all(contrib >= 0)
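
# The helper above would typically be driven by scikit-learn's toy dataset
# loaders; these wrappers are an illustrative assumption, not part of the
# original suite shown here:
from sklearn.datasets import load_breast_cancer, load_iris

def test_breast_cancer():
    # Binary classification on the breast cancer dataset.
    _toy_dataset_test(load_breast_cancer)

def test_iris():
    # Three-class classification on the iris dataset.
    _toy_dataset_test(load_iris)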
Example #3
from logitboost import LogitBoost
from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.metrics import log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier


def model_comp(X_train, X_test, y_train, y_test, title=""):
    """Fit eight classifiers on the same split and compare test log loss."""
    xgboost_model = XGBClassifier(learning_rate=0.01,
                                  max_depth=3,
                                  n_estimators=700,
                                  random_state=8)
    gradient_boost_model = GradientBoostingClassifier(learning_rate=0.01,
                                                      max_depth=4,
                                                      max_features='log2',
                                                      min_samples_leaf=4,
                                                      n_estimators=280,
                                                      subsample=0.25,
                                                      random_state=8)
    random_forest_model = RandomForestClassifier(n_estimators=300,
                                                 max_depth=3,
                                                 verbose=1,
                                                 random_state=8)
    svm_model = SVC(kernel='poly', probability=True, verbose=1, random_state=8)
    knn_model = KNeighborsClassifier(n_neighbors=3)
    elm_model = MLPClassifier(hidden_layer_sizes=(80, ),
                              activation='logistic',
                              learning_rate_init=0.01,
                              verbose=1)
    adaboost_model = AdaBoostClassifier(n_estimators=300,
                                        learning_rate=0.01,
                                        random_state=8)
    logitboost_model = LogitBoost(n_estimators=300,
                                  learning_rate=0.01,
                                  random_state=8)

    xgboost_model.fit(X_train, y_train)
    gradient_boost_model.fit(X_train, y_train)
    random_forest_model.fit(X_train, y_train)
    svm_model.fit(X_train, y_train)
    knn_model.fit(X_train, y_train)
    elm_model.fit(X_train, y_train)
    adaboost_model.fit(X_train, y_train)
    logitboost_model.fit(X_train, y_train)

    p_random_forest = random_forest_model.predict_proba(X_test)
    p_gradient_boost = gradient_boost_model.predict_proba(X_test)
    p_xgboost = xgboost_model.predict_proba(X_test)
    p_svm = svm_model.predict_proba(X_test)
    p_knn = knn_model.predict_proba(X_test)
    p_elm = elm_model.predict_proba(X_test)
    p_adaboost = adaboost_model.predict_proba(X_test)
    p_logitboost = logitboost_model.predict_proba(X_test)

    random_forest_ll = log_loss(y_test, p_random_forest)
    gradient_boost_ll = log_loss(y_test, p_gradient_boost)
    xgboost_ll = log_loss(y_test, p_xgboost)
    svm_ll = log_loss(y_test, p_svm)
    knn_ll = log_loss(y_test, p_knn)
    elm_ll = log_loss(y_test, p_elm)
    adaboost_ll = log_loss(y_test, p_adaboost)
    logitboost_ll = log_loss(y_test, p_logitboost)

    strng0 = "\n" + title
    strtest = "\nLength of test data: " + str(len(y_test))
    strng2 = "\n------------------"
    strng4 = "\nGradient Boost Log Loss " + str(gradient_boost_ll)
    strng5 = "\nRandom Forest Log Loss " + str(random_forest_ll)
    strng6 = "\nXGBoost Log Loss " + str(xgboost_ll)
    strng7 = "\n------------------"
    strng9 = "\nSVM Log Loss " + str(svm_ll)
    strng10 = "\nKNN Log Loss " + str(knn_ll)
    strng11 = "\nELM Log Loss " + str(elm_ll)
    strng12 = "\nAdaBoost Log Loss " + str(adaboost_ll)
    strng13 = "\nLogitBoost Log Loss " + str(logitboost_ll)
    prntstr = strng0 + strtest + strng2 + strng4 + strng5 + strng6 + strng7 + strng9 + strng10 + strng11 + strng12 + strng13
    print(prntstr)
    write_to_file(prntstr)

    return xgboost_model, random_forest_model, adaboost_model
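
# A hedged usage sketch for model_comp() (synthetic data; in the original
# project, the data and write_to_file come from elsewhere, so a stand-in
# helper is defined here):
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

def write_to_file(text, path="model_comp_results.txt"):
    # Stand-in for the project's logging helper (assumed signature).
    with open(path, "a") as f:
        f.write(text)

X_demo, y_demo = make_classification(n_samples=500, n_features=20,
                                     random_state=8)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.25,
                                          stratify=y_demo, random_state=8)
xgb_m, rf_m, ada_m = model_comp(X_tr, X_te, y_tr, y_te,
                                title="Synthetic demo")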