Example #1
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder


def GradBoost_LR(train_x, train_y, test_x, test_y, parameters=None):
    '''
    Fits a Gradient Boosting classifier (optionally tuned with GridSearchCV)
    and a Logistic Regression on the one-hot encoded leaf indices of the
    boosted trees.
    :param train_x: training features
    :param train_y: training labels
    :param test_x: test features
    :param test_y: test labels
    :param parameters: 'default', a dict of parameters for GridSearchCV,
        or None to skip the grid search
    :return: fpr, tpr, auc_score
    '''
    #Logistic Regression should be fitted on a different set to
    #prevent overfitting
    train_x, train_x_lr, train_y, train_y_lr = train_test_split(
        train_x, train_y, test_size=0.3, random_state=42)

    if parameters == 'default':
        #Reasonable default search grid
        parameters = {"max_depth": [3, 4], "n_estimators": range(100, 600, 50)}
    if isinstance(parameters, dict):
        gb = GradientBoostingClassifier()
        clf_gb = GridSearchCV(gb, parameters, scoring='roc_auc')
    else:
        #Optimal parameters obtained previously from GridSearchCV
        clf_gb = GradientBoostingClassifier(n_estimators=500)

    #One-hot encode the leaf indices produced by the boosted trees so they
    #can be used as input features for the Logistic Regression
    enc_gb = OneHotEncoder()
    lr_gb = LogisticRegression(n_jobs=-1)

    clf_gb.fit(train_x, train_y)
    #GridSearchCV does not expose apply(); use the underlying (best) estimator
    gb_model = clf_gb.best_estimator_ if isinstance(clf_gb, GridSearchCV) else clf_gb
    enc_gb.fit(gb_model.apply(train_x)[:, :, 0])
    #Fit the Logistic Regression on the held-out split to limit overfitting
    lr_gb.fit(enc_gb.transform(gb_model.apply(train_x_lr)[:, :, 0]), train_y_lr)

    predictions = lr_gb.predict_proba(
        enc_gb.transform(gb_model.apply(test_x)[:, :, 0]))[:, 1]
    fpr, tpr, _ = roc_curve(test_y, predictions, pos_label=1.0)
    score = round(roc_auc_score(test_y, predictions), 4)
    return fpr, tpr, score
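
A minimal usage sketch for the function above, on a synthetic binary classification problem (the dataset and variable names here are illustrative assumptions, not part of the original source):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# parameters=None skips the grid search and uses the hard-coded estimator
fpr, tpr, auc_score = GradBoost_LR(X_train, y_train, X_test, y_test, parameters=None)
print("ROC AUC:", auc_score)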
Example #2
    #     plt.axvline(mode(median_each_tree), color="g", label="mode")
    # except:
    #     pass
    plt.axvline(weighted_mean, color="r", label="weighted")
    plt.legend(loc="best")
    #Save before show(); with interactive backends the figure can be blank
    #if it is written to disk after plt.show()
    plt.savefig("/Users/lls/Documents/mlhalos_files/regression/feature_importances_tests/estimators_particle_" + str(
        particle_id) + ".png")
    plt.show()
    plt.clf()

    # weight medians by number of leaves in the tree

####### COMPARE MEAN OF MEANS VS MEDIANS OF MEDIANS FOR SMALL MASS HALOS ######

small_mass = np.where((log_halo_mass_testing <= 11))[0]
leaf_nodes_train_set = rf_random.apply(train_feat_04_corr)
leaf_nodes_test_set = rf_random.apply(test_feat_04_corr)

mean_mean = []
median_median = []
mean_median = []
median_mean = []

for particle_id in small_mass[12584:]:

    mean_each_tree = []
    median_each_tree = []

    for i in range(rf_random.n_estimators):
        training_ids_in_leaf_node_tree_i = np.where(leaf_nodes_train_set[:, i] == leaf_nodes_test_set[particle_id, i])[0]
        median_i = np.median(log_mass_training[training_ids_in_leaf_node_tree_i])
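
The inner loop is cut off in this excerpt. A hedged sketch of how the per-tree statistics could be aggregated into the four lists initialized above (the variable names follow the excerpt; the completion itself is an assumption, not the original code):

        # Per-tree mean and median of the training targets sharing the leaf
        mean_i = np.mean(log_mass_training[training_ids_in_leaf_node_tree_i])
        mean_each_tree.append(mean_i)
        median_each_tree.append(median_i)

    # Aggregate across trees for this particle
    mean_mean.append(np.mean(mean_each_tree))
    mean_median.append(np.mean(median_each_tree))
    median_mean.append(np.median(mean_each_tree))
    median_median.append(np.median(median_each_tree))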
Example #3
import pickle

import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split

# clf is the GridSearchCV model fitted earlier in the script (not shown here)
#print(clf.cv_results_)

# The sklearn API models are picklable
print("Pickling sklearn API models")
# must open in binary format to pickle
pickle.dump(clf, open("best_boston.pkl", "wb"))
clf2 = pickle.load(open("best_boston.pkl", "rb"))
# print(np.allclose(clf.predict(X), clf2.predict(X)))
# y_score = clf.predict_proba(X)[:,1]
# fpr,tpr,thresholds = roc_curve(y, y_score, pos_label=1)
# plt.plot([0,1],[0,1],'r--',fpr,tpr,'b')
# plt.show()
# Early-stopping

# digits is assumed to have been loaded earlier in the script (a binary
# subset, since eval_metric="error" below is a binary classification metric)
X = digits['data']
y = digits['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Note: in recent xgboost releases, early_stopping_rounds and eval_metric
# are passed to the XGBClassifier constructor rather than to fit()
clf = xgb.XGBClassifier()
clf.fit(X_train,
        y_train,
        early_stopping_rounds=10,
        eval_metric="error",
        eval_set=[(X_test, y_test)])

print(clf.feature_importances_)  # print the feature importances
xgb.plot_importance(clf)  # plot the feature importances
xgb.plot_tree(clf)
plt.show()

print(clf.apply(X))  # print the index of the leaf each sample ends up in for every tree
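
After fitting with early stopping, the round at which training stopped can also be inspected; a short hedged sketch (attribute availability varies across xgboost versions, so treat these names as assumptions to verify):

# Best boosting round found by early stopping (version-dependent attributes)
print("best iteration:", clf.best_iteration)
print("best score:", clf.best_score)

# Accuracy of the fitted classifier on the held-out evaluation split
print("test accuracy:", clf.score(X_test, y_test))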