Example #1
def histGradientModel(X_train,Y_train):
    # use Hist Gradient Boosting Classifier
    from sklearn.experimental import enable_hist_gradient_boosting
    from sklearn.ensemble import HistGradientBoostingClassifier
    histgrad=HistGradientBoostingClassifier()
    histgrad.fit(X_train,Y_train)
    print('\nHist Gradient Boosting Training Score:',histgrad.score(X_train,Y_train))
    return histgrad,histgrad.score(X_train,Y_train)
Example #2
def hist_gradient_boosting_classifier(x_train, y_train, x_test, y_test):
    from sklearn.experimental import enable_hist_gradient_boosting
    from sklearn.ensemble import HistGradientBoostingClassifier
    ensem = HistGradientBoostingClassifier()
    ensem.fit(x_train, y_train)
    value = ensem.score(x_test, y_test)
    return "{0:.2f}".format(value)
Example #3
File: p4.py Project: i72sijia/IMD
def clasificar_HistGradientBoostingClassifier(X, y, df, trainInputs,
                                              trainOutputs, testInputs,
                                              testOutputs, graphname):
    # imports assumed to exist at module level in the original project
    import numpy as np
    from sklearn.ensemble import HistGradientBoostingClassifier
    from sklearn.model_selection import cross_val_score

    print("\n[" + str(graphname) + "]")
    scoreArray = np.array([])
    clf = HistGradientBoostingClassifier()
    scores = cross_val_score(clf, X, y, cv=10)
    clf = clf.fit(trainInputs, trainOutputs)
    precisionTrain = clf.score(trainInputs, trainOutputs)
    precisionTest = clf.score(testInputs, testOutputs)
    print("\tCCR train = %.2f%% | CCR test = %.2f%%" %
          (precisionTrain * 100, precisionTest * 100))
    prediccion_test = clf.predict(testInputs)
    print(prediccion_test)
    print(testOutputs)
    return precisionTest
Example #4
def k_fold_cross_val(Xs, y_var, k=10):
    # imports assumed to exist at module level in the original project
    import numpy as np
    from sklearn import tree
    from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
    from sklearn.metrics import f1_score

    clf = tree.DecisionTreeClassifier()
    clf_forest = RandomForestClassifier(n_estimators=10)
    clf_boost = HistGradientBoostingClassifier()

    num_folds = k
    N = Xs.shape[0]
    test_size = int(N / num_folds)

    test_idxs = np.random.permutation(N)[:num_folds * test_size].reshape(
        num_folds, test_size)

    total_score = np.asarray([0., 0., 0.])
    total_F1_score = np.asarray([0., 0., 0.])

    for i in range(num_folds):
        print("Iteration " + str(i) + ":")
        test_i = Xs.index.isin(test_idxs[i])
        df_train, df_test = Xs[~test_i], Xs[test_i]
        y_train, y_test = y_var[~test_i], y_var[test_i]

        clf = clf.fit(df_train.to_numpy(), y_train.to_numpy().ravel())
        score_b = clf.score(df_test.to_numpy(), y_test.to_numpy().ravel())

        clf_forest = clf_forest.fit(df_train.to_numpy(),
                                    y_train.to_numpy().ravel())
        score_f = clf_forest.score(df_test.to_numpy(),
                                   y_test.to_numpy().ravel())

        clf_boost = clf_boost.fit(df_train.to_numpy(),
                                  y_train.to_numpy().ravel())
        score_h = clf_boost.score(df_test.to_numpy(),
                                  y_test.to_numpy().ravel())

        y_hat = clf.predict(df_test.to_numpy())
        f1_b = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (tree):", f1_b)

        y_hat = clf_forest.predict(df_test.to_numpy())
        f1_f = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (forest):", f1_f)

        y_hat = clf_boost.predict(df_test.to_numpy())
        f1_boost = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (boost):", f1_boost)

        print("Prediction scores for (tree,forest,boost):", score_b, score_f,
              score_h)
        total_score += np.asarray([score_b, score_f, score_h])
        total_F1_score += np.asarray([f1_b, f1_f, f1_boost])

    print("Avg. accuracy scores for (tree,forest,boost):",
          total_score / num_folds)
    print("Avg. F1 scores for (tree,forest,boost):",
          total_F1_score / num_folds)

    return clf, clf_forest, clf_boost
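A hedged usage sketch for k_fold_cross_val (the file name and column name below are hypothetical). One subtlety worth noting: the fold assignment via Xs.index.isin(test_idxs[i]) compares positional ids against index labels, so it assumes Xs carries a default 0..N-1 RangeIndex; reset_index is applied here to guarantee that:

import pandas as pd

# hypothetical input: a feature table plus a binary 'label' column
df = pd.read_csv('data.csv')
Xs = df.drop(columns=['label']).reset_index(drop=True)
y_var = df['label'].reset_index(drop=True)

clf, clf_forest, clf_boost = k_fold_cross_val(Xs, y_var, k=10)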
Example #5
def test_missing_values_trivial():
    # sanity check for missing values support. With only one feature and
    # y == isnan(X), the gbdt is supposed to reach perfect accuracy on the
    # training set.

    n_samples = 100
    n_features = 1
    rng = np.random.RandomState(0)

    X = rng.normal(size=(n_samples, n_features))
    mask = rng.binomial(1, 0.5, size=X.shape).astype(bool)
    X[mask] = np.nan
    y = mask.ravel()
    gb = HistGradientBoostingClassifier()
    gb.fit(X, y)

    assert gb.score(X, y) == pytest.approx(1)
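The same built-in missing-value handling applies at prediction time, so no imputation step is needed for new data either; a minimal follow-up sketch, reusing the fitted gb and the rng generator from the test above:

    # new data with the same "label == isnan(X)" pattern as the training set
    X_new = rng.normal(size=(20, n_features))
    new_mask = rng.binomial(1, 0.5, size=X_new.shape).astype(bool)
    X_new[new_mask] = np.nan

    # NaNs are routed through the learned splits directly; on this trivial
    # relationship the predictions should again match the mask
    print(gb.predict(X_new))
    print(gb.score(X_new, new_mask.ravel()))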
Example #6
        # Note: the original snippet passed GradientBoostingClassifier arguments
        # (loss='deviance', n_estimators, criterion='friedman_mse', max_features,
        # presort) that HistGradientBoostingClassifier does not accept; they are
        # mapped here to the closest HistGradientBoostingClassifier parameters.
        clf = HistGradientBoostingClassifier(learning_rate=0.1,
                                             max_iter=n_estimators,
                                             max_depth=max_depth,
                                             random_state=None,
                                             verbose=0,
                                             warm_start=warm_start)
        clf.fit(X_train, y_train)

        print("n_estimators : ", n_estimators)
        print("learning rate  :", lr)
        print("Accuracy score (training): {0:.3f}".format(
            clf.score(X_train, y_train)))
        print("Accuracy score (validation): {0:.3f}".format(
            clf.score(X_test, y_test)))
        print("\n")

        filename = 'LGBM' + str(n_estimators) + str(lr) + str(
            max_depth) + str(max_features) + str(warm_start) + str(
                clf.score(X_test, y_test)) + "%" + '.sav'
        if clf.score(X_test, y_test) > 0.93:
            pickle.dump(clf, open(filename, 'wb'))

y_predict = clf.predict(X_test)
score = accuracy_score(y_test, y_predict)
print("n_estimators = ", n_estimators)
print("max_features = ", max_features)
print("warm_start = ", warm_start)
Example #7
def HGB():
    actions = [
        'change_lane', 'pull_over', 'slow', 'stop', 'straight', 'turn_left',
        'turn_right', 'wait_to_turn_left'
    ]

    data_points = []
    with open('/Users/zephyryau/Documents/study/INF552/Project/data.csv',
              'r') as fd:
        for row in fd:
            row_list = row[:-1].split(',')
            sample = [float(i) for i in row_list[:-1]]
            sample.append(actions.index(row_list[-1]))
            if len(sample) == 76:
                data_points.append(sample)

    data_points_xycl = np.array(data_points)
    data_points_xyc = data_points_xycl[:, :-1]
    y = data_points_xycl[:, -1]

    # centralize datapoints and normalize
    data_points_xy_cent = []
    for row in data_points_xyc:
        # print(row)
        avg_x = row[3]
        avg_y = row[4]
        head_length = ((row[0] - row[3])**2 + (row[1] - row[4])**2)**0.5
        shoulder_length = ((row[3] - row[6])**2 + (row[4] - row[7])**2)**0.5
        new_row = []
        for i in range(16):  # first 16 points
            new_row.append((row[3 * i] - avg_x) / shoulder_length)
            new_row.append((row[3 * i + 1] - avg_y) / head_length)
            new_row.append(row[3 * i + 2])  # conf

        data_points_xy_cent.append(new_row)

    result_point = []
    with open(
            '/Users/zephyryau/Documents/study/INF552/Project/input_picture/result.csv',
            'r') as fd:
        for row in fd:
            row_list = row[:-1].split(',')
            sample = [float(i) for i in row_list[:-1]]
            sample.append(actions.index(row_list[-1]))
            if len(sample) == 76:
                result_point.append(sample)

    result_point_xycl = np.array(result_point)
    result_point_xyc = result_point_xycl[:, :-1]
    result_point_y = result_point_xycl[:, -1]

    result_point_xy_cent = []
    for row in result_point_xyc:
        # print(row)
        avg_x = row[3]
        avg_y = row[4]
        head_length = ((row[0] - row[3])**2 + (row[1] - row[4])**2)**0.5
        shoulder_length = ((row[3] - row[6])**2 + (row[4] - row[7])**2)**0.5
        new_row = []
        for i in range(16):  # first 16 points
            new_row.append((row[3 * i] - avg_x) / shoulder_length)
            new_row.append((row[3 * i + 1] - avg_y) / head_length)
            new_row.append(row[3 * i + 2])  # conf

        result_point_xy_cent.append(new_row)
    '''sum = 0
    gesture_results = []
    for i in range(100):
        data_points_xy_train, data_points_xy_test, y_train, y_test = train_test_split(data_points_xy_cent, y, test_size=0.3)
        clf = MLPClassifier(hidden_layer_sizes=(512,))
        clf.fit(data_points_xy_train, y_train)
        gesture_results.append(clf.predict([result_point_xy_cent[0]])[0])
        score = clf.score(data_points_xy_test, y_test)
        #print(score)
        sum += score'''

    X_train, X_test, y_train, y_test = train_test_split(data_points_xy_cent,
                                                        y,
                                                        test_size=0.4)
    scaler = preprocessing.StandardScaler().fit(X_train)
    #print(scaler.mean_)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    r_X_scaled = scaler.transform(result_point_xy_cent)

    sum = 0
    clf = HistGradientBoostingClassifier()
    for i in range(10):
        X_train, X_test, y_train, y_test = train_test_split(
            data_points_xy_cent, y, test_size=0.4)
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        clf.fit(X_train_scaled, y_train)
        #print(clf.n_iter_, end=" ")
        score_train = clf.score(X_train_scaled, y_train)
        #print(score_train, end=" ")
        score_test = clf.score(X_test_scaled, y_test)
        sum += score_test  #print(score_test)

    tf = (clf.predict([r_X_scaled[0]])[0] == result_point_y[0])

    return clf.predict([r_X_scaled[0]])[0], sum / 10, tf
Example #8
        so no preprocessing of missing values is needed
 - With 10,000 or more samples it is much faster than regular gradient boosting
'''
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_validate  # assumed imported earlier in the original script

hgb = HistGradientBoostingClassifier(random_state=42)
sc = cross_validate(hgb,
                    train_input,
                    train_target,
                    return_train_score=True,
                    n_jobs=-1)
print(np.mean(sc['train_score']), np.mean(sc['test_score']))
# 0.9321723946453317 0.8801241948619236
hgb.fit(train_input, train_target)
print(hgb.score(test_input, test_target))
# 0.8723076923076923 -> about 87% accuracy
'''
XGBoost
=> pip install xgboost
LGBM (LightGBM)
=> pip install lightgbm
'''
from xgboost import XGBClassifier
xgb = XGBClassifier(tree_method='hist', random_state=42)
sc = cross_validate(xgb,
                    train_input,
                    train_target,
                    return_train_score=True,
                    n_jobs=-1)
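The docstring above also mentions LightGBM, but the snippet only exercises XGBoost; a minimal sketch of the analogous cross-validation, assuming lightgbm is installed and the same train_input / train_target arrays:

from lightgbm import LGBMClassifier

lgb = LGBMClassifier(random_state=42)
sc = cross_validate(lgb,
                    train_input,
                    train_target,
                    return_train_score=True,
                    n_jobs=-1)
print(np.mean(sc['train_score']), np.mean(sc['test_score']))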
Example #9
print('@' * 50)
best_clf = None

for alpha in np.arange(0.2, 1.01, 0.2):
    for beta in np.arange(0.2, 1.01, 0.2):
        all_sample_weights = getWeights(data, alpha * wPep_t, beta * wXL_t)

        clf = HistGradientBoostingClassifier(
            scoring="f1",
            monotonic_cst=monotonic_cst,
            tol=1e-7,
            random_state=42,
            validation_fraction=None)  #, early_stopping=True)
        clf.fit(X, y, sample_weight=all_sample_weights)
        print("alpha,beta: " + str(alpha) + "\t" + str(beta))
        print("Loss on all data (sample weight): {:.2f}".format(
            clf.score(X, y, sample_weight=all_sample_weights)))

        p = clf.predict_proba(
            data.loc[:, data.columns != 'Label'])[:, 1]  # prob for class=1 (target)
        p = pd.DataFrame({'p-value': p})
        data.reset_index(drop=True, inplace=True)
        p.reset_index(drop=True, inplace=True)
        data2 = pd.concat([data, p], axis=1)
        data2 = calcQ(data2, scoreColName="p-value")
        data2["Rank"] = 1
        # store best fit
        nXLauc, XLauc = evalXL(data2, plot=False, maxQ=0.1)
        print("pAUC(peptides), pAUC(XLs): " + str(nXLauc) + "\t" + str(XLauc))
        print("sum(pAUC): " + str(nXLauc + XLauc))
Example #10
print(result.importances_mean)
# [0.08876275 0.23438522 0.08027708]
# n_repeats (default 5): number of times each feature is shuffled for the comparison; here 10 repeats are used

result = permutation_importance(hgb,
                                test_input,
                                test_target,
                                n_repeats=10,
                                random_state=42,
                                n_jobs=-1)
print(result.importances)
print(result.importances_std)
print(result.importances_mean)
# [0.05969231 0.20238462 0.049     ]

hgb.score(test_input, test_target)
# 0.8723076923076923

#####
# XGBoost
###

import xgboost

from xgboost import XGBClassifier
xgb = XGBClassifier(tree_method='hist', random_state=42)
scores = cross_validate(xgb,
                        train_input,
                        train_target,
                        return_train_score=True,
                        n_jobs=-1)