Example #1
# back-transform the log-scale predictions to the original SalePrice scale
XGB_new = XGB_preds.apply(lambda x: np.power(np.e, x).astype('int64'))

GBR = GradientBoostingRegressor()
GBR.fit(X_train, Y_train)
GBR_preds = pd.DataFrame(GBR.predict(X_test),columns=['salePrice'],index=Y_test.index)
print(mean_absolute_error(Y_test,GBR_preds))

GBR_new = GBR_preds.apply(lambda x: np.power(np.e,x).astype('int64'))

sns.swarmplot(x=GBR_preds['salePrice'],y=Y_test)
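# The np.power(np.e, x) transform above suggests the models were trained on
# log(SalePrice); a hedged sketch (assuming Y_test is also on the log scale)
# comparing the error back in the original price units:
Y_test_orig = np.exp(Y_test)
print(mean_absolute_error(Y_test_orig, GBR_new['salePrice']))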
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# KFold with a fixed random_state requires shuffle=True in recent sklearn
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

# cross_val_score defaults to R^2 for regressors
results = cross_val_score(XGB, X, y, cv=kfold)
print("R^2: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

results2 = cross_val_score(RFR, X, y, cv=kfold)
print("R^2: %.2f%% (%.2f%%)" % (results2.mean()*100, results2.std()*100))

results3 = cross_val_score(GBR, X, y, cv=kfold)
print("R^2: %.2f%% (%.2f%%)" % (results3.mean()*100, results3.std()*100))

# R^2 on the training set
print(XGB.score(X_train, Y_train))
print(RFR.score(X_train, Y_train))
print(GBR.score(X_train, Y_train))

# R^2 on the test set
print(XGB.score(X_test, Y_test))
print(RFR.score(X_test, Y_test))
print(GBR.score(X_test, Y_test))
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    shuffle=True,
                                                    random_state=77)

#2. Model
model = XGBRegressor(n_estimators=100, learning_rate=0.01, n_jobs=8)

#3. Train

model.fit(x_train,
          y_train,
          verbose=1,
          eval_metric=['rmse', 'logloss', 'mae'],
          eval_set=[(x_train, y_train), (x_test, y_test)])

aaa = model.score(x_test, y_test)
print('score : ', aaa)

y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print('r2 : ', r2)

# validation_0 = train
# validation_1 = test

# print('==================================')
# results = model.evals_result()
# print(results)
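# A hedged sketch of pulling the eval_set history back out of the fitted
# model and plotting the RMSE learning curves (validation_0 = train,
# validation_1 = test, matching the eval_set order above):
import matplotlib.pyplot as plt

results = model.evals_result()
epochs = len(results['validation_0']['rmse'])
plt.plot(range(epochs), results['validation_0']['rmse'], label='train')
plt.plot(range(epochs), results['validation_1']['rmse'], label='test')
plt.legend()
plt.ylabel('rmse')
plt.show()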
Example #3
from sklearn.model_selection import train_test_split

train = pd.read_csv('../data/train_empty.csv')

features = ['store_nbr',  'item_nbr', #  'units',  'station_nbr',
 'tmax',  'tmin',  'tavg',  'depart',  'dewpoint',  'wetbulb',
 'heat',  'cool',  'snowfall',  'preciptotal',  'stnpressure',
 'sealevel',  'resultspeed',  'resultdir',  'avgspeed',
 'HZ',  'FU',  'UP',  'TSSN',  'VCTS',  'DZ',  'BR',  'FG',
 'BCFG',  'DU',  'FZRA',  'TS',  'RA',  'PL',  'GS',  'GR',
 'FZDZ',  'VCFG',  'PRFG',  'FG+',  'TSRA',  'FZFG',  'BLDU',
 'MIFG',  'SQ',  'BLSN',  'SN',  'SG',
#  'month',
#  'day',
 'day_length']
#  'sunset_hour',
#  'sunset_minute',
#  'sunrise_hour',
#  'sunrise_minute']

from xgboost import XGBRegressor

# The sklearn-style XGBRegressor accepts plain numpy arrays and treats
# np.nan as missing, so the raw arrays can be split directly (a DMatrix
# cannot be indexed by train_test_split).
X = train[features].values
y = train["units"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
clf = XGBRegressor()
clf.fit(X_train, y_train)

print(clf.score(X_test, y_test))
Example #4
#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  #naming the dataframe columns
# print(featureScores.nlargest(10, 'Score'))  # print the 10 best features

#Using XGBoost as a model
no_survived = train.drop("Survived", axis=1)
survived = train["Survived"]
xgb = XGBRegressor()
xgb.fit(no_survived, survived, verbose=False)

#making the prediction
X_test = test.copy()
XGBPredict = np.round(xgb.predict(X_test), 0)
print("XGBoost")
print(round(xgb.score(no_survived, survived) * 100, 2))  #got a score of 60.82
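# The 60.82 above is R^2 * 100 (XGBRegressor.score), not an accuracy. A
# hedged sketch of the training accuracy after rounding the regression
# output to 0/1, for a fairer comparison with the classifiers below:
xgb_train_acc = (np.round(xgb.predict(no_survived)) == survived).mean()
print(round(xgb_train_acc * 100, 2))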

#Random Forest 74.162%, 77.033% with titles
RF = RandomForestClassifier(n_estimators=100)
RF.fit(no_survived, survived)
RFPredict = RF.predict(X_test)
RF.score(no_survived, survived)
RFScore = round(RF.score(no_survived, survived) * 100, 2)
print("Random Forest")
print(RFScore)

#Decision Tree 75.598%, 75.119% with titles
decisionTree = DecisionTreeClassifier()
decisionTree.fit(no_survived, survived)
decisionTreePredict = decisionTree.predict(X_test)
decisionTree.score(no_survived, survived)
def rest(F, X, Y, X_train, y_train, X_test, y_test):
    best_ada_score = float('-inf')
    best_ada_rmse = float('inf')
    best_ada_score_f = -1
    best_ada_rmse_f = -1

    best_xg_score = float('-inf')
    best_xg_rmse = float('inf')
    best_xg_score_f = -1
    best_xg_rmse_f = -1

    best_svr_score = float('-inf')
    best_svr_rmse = float('inf')
    best_svr_score_f = -1
    best_svr_rmse_f = -1

    for f in F:
        print("\npca %d" % f)
        # # PCA Feature Selection

        X_mu, X_Z = pca(f, X.values)
        X_pca = pca_proj(X.values, X_mu, X_Z)

        X_train_mu, X_train_Z = pca(f, X_train.values)

        X_train_pca = pca_proj(X_train.values, X_train_mu, X_train_Z)

        X_test_pca = pca_proj(X_test.values, X_train_mu, X_train_Z)

        # # AdaBoost
        print("\nAdaBoost")
        from sklearn.ensemble import AdaBoostRegressor
        adaBoost = AdaBoostRegressor()
        k_z, k_mse, k_rmse, b_z, b_mse, b_rmse = evaluate_model(
            adaBoost, f, X.values, Y.values.ravel(), k=5, B=5)
        if k_rmse < best_ada_rmse:
            best_ada_rmse = k_rmse
            best_ada_rmse_f = f

        adaBoost.fit(X_train_pca, y_train.values.ravel())
        ada_score = adaBoost.score(X_test_pca, y_test.values.ravel())
        print(ada_score)
        if ada_score > best_ada_score:
            best_ada_score = ada_score
            best_ada_score_f = f

        #View Predicted values
        predicted = adaBoost.predict(X_test_pca)
        ada_pred = y_test.copy()
        ada_pred['predicted'] = predicted
        ada_pred.head()

        # # XGBoost Regressor
        print("\nXGBoost")
        from xgboost import XGBRegressor

        xgb = XGBRegressor(max_depth=3,
                           learning_rate=0.2,
                           booster='gbtree',
                           n_estimators=70)

        k_z, k_mse, k_rmse, b_z, b_mse, b_rmse = evaluate_model(
            xgb, f, X.values, Y.values.ravel(), k=5, B=5)
        if k_rmse < best_xg_rmse:
            best_xg_rmse = k_rmse
            best_xg_rmse_f = f

        xgb.fit(X_train_pca, y_train)
        xgb_score = xgb.score(X_test_pca, y_test.values.ravel())
        print(xgb_score)
        if xgb_score > best_xg_score:
            best_xg_score = xgb_score
            best_xg_score_f = f

        predicted = xgb.predict(X_test_pca)
        xgb_pred = y_test.copy()
        xgb_pred['predicted'] = predicted
        xgb_pred.head()

        # # SVM (SVR)
        print("\nSVR")
        from sklearn import svm

        svr_model = svm.SVR(kernel="poly", coef0=-3500, gamma='scale')
        # coef0 only works with poly and sigmoid kernels
        # it just puts that value instead of the column of 1's

        # without it, this model breaks for some reason

        k_z, k_mse, k_rmse, b_z, b_mse, b_rmse = evaluate_model(
            svr_model, f, X.values, Y.values.ravel(), k=5, B=5)
        if k_rmse < best_svr_rmse:
            best_svr_rmse = k_rmse
            best_svr_rmse_f = f

        # epsilon, degree
        svr_model.fit(X_train_pca, y_train.values.ravel())
        svr_score = svr_model.score(X_test_pca, y_test.values.ravel())
        print(svr_score)
        if svr_score > best_svr_score:
            best_svr_score = svr_score
            best_svr_score_f = f

        svr_predicted = svr_model.predict(X_test_pca)
        svr_pred = y_test.copy()
        svr_pred["predicted"] = svr_predicted
        svr_pred.head()

    return ((best_ada_score, best_ada_score_f),
            (best_ada_rmse, best_ada_rmse_f), (best_xg_score, best_xg_score_f),
            (best_xg_rmse, best_xg_rmse_f), (best_svr_score, best_svr_score_f),
            (best_svr_rmse, best_svr_rmse_f))
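# A hedged usage sketch of rest() above; F holds candidate PCA dimensions
# (values illustrative), and X, Y, X_train, y_train, X_test, y_test are
# assumed to be the caller's own pandas frames:
best = rest([5, 10, 20], X, Y, X_train, y_train, X_test, y_test)
print(best)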
Example #6
#-----------------------------------------------------------------------------#
# XGBoost
from xgboost import XGBRegressor

# initialize model
xgb_clf = XGBRegressor()
# fit model
xgb_clf.fit(X_train, y_train)

#predictions: test data
y_pred = xgb_clf.predict(X_test)

print('\n\n\nXGBoost')
#Scores
print('Train score')
print(xgb_clf.score(X_train, y_train))
print('Test score')
print(xgb_clf.score(X_test, y_test))
print('-------------------------------------------------------')

# MAE
print('Mean absolute error')
print(mean_absolute_error(y_test, y_pred))
print('-------------------------------------------------------')

# MSE
print('Mean squared error')
print(mean_squared_error(y_test, y_pred))
print('-------------------------------------------------------')

# R-squared
print('R-squared')
print(r2_score(y_test, y_pred))
print('-------------------------------------------------------')


x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=66,
                                                    shuffle=True)

#2. Model
# model = DecisionTreeRegressor(max_depth=4)
# model = RandomForestRegressor(max_depth=4)
# model = GradientBoostingRegressor(max_depth=4)
model = XGBRegressor(n_jobs=-1)

#3. Train
model.fit(x_train, y_train)

#4. Evaluate, predict
acc = model.score(x_test, y_test)
print(model.feature_importances_)
print("acc : ", acc)

#5. Visualization
import matplotlib.pyplot as plt
import numpy as np


def plot_feature_importances_dataset(model):
    n_features = dataset.data.shape[1]
    plt.barh(np.arange(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), dataset.feature_names)
    plt.xlabel("Feature Improtances")
    plt.ylabel("Features")
    plt.ylim(-1, n_features)
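# Hedged usage sketch of the helper above: it reads the global `dataset`, so
# this assumes the features came from a sklearn Bunch such as load_diabetes();
# swap in whatever dataset actually produced x_train here.
from sklearn.datasets import load_diabetes
dataset = load_diabetes()
plot_feature_importances_dataset(model)
plt.show()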
Example #8
model = XGBRegressor(n_estimators=550,
                     learning_rate=0.05,
                     max_depth=8,
                     colsample_bytree=0.7,
                     reg_alpha=1,
                     scale_pos_weight=1,
                     reg_lambda=1.1,
                     n_jobs=6)

model.fit(x_train,
          y_train1,
          verbose=False,
          eval_metric=['logloss', 'mae'],
          eval_set=[(x_train, y_train1), (x_test, y_test1)],
          early_stopping_rounds=20)
score1 = model.score(x_test, y_test1)
print("score1 : %.4f" % (score1 * 100.0))
# print(model.feature_importances_)
y_pred_1 = model.predict(x_test)
mae1 = mean_absolute_error(y_test1, y_pred_1)
print('mae1 : %.4f' % (mae1))
y_pred1 = model.predict(x_pred)

model.fit(x_train,
          y_train2,
          verbose=False,
          eval_metric=['logloss', 'mae'],
          eval_set=[(x_train, y_train2), (x_test, y_test2)],
          early_stopping_rounds=20)
score2 = model.score(x_test, y_test2)
print("score2 : %.4f" % (score2 * 100.0))
Example #9
# Shape of train and test data
print(X_train_temp.shape, X_val_temp.shape)

# --------------
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Code starts here
dt = DecisionTreeRegressor(random_state=5)
dt.fit(X_train, y_train)
accuracy = dt.score(X_val, y_val)
y_pred = dt.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(accuracy)
print(y_pred)
print(rmse)

# --------------
from xgboost import XGBRegressor

xgb = XGBRegressor(max_depth=50, learning_rate=0.83, n_estimators=100)
xgb.fit(X_train, y_train)
accuracy = xgb.score(X_val, y_val)
y_pred = xgb.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(accuracy)
print(y_pred)
print(rmse)

# Code ends here
Example #10
# sweep n_estimators for the random forest and record the test-set MSE
ploter = []
for i in range(1, 100):
    model = forest.RandomForestRegressor(max_depth=6, n_estimators=i)
    model.fit(x_train, y_train)
    ploter.append((i, mean_squared_error(y_test, model.predict(x_test))))
    print(i, '/', 99)
plt.plot([x for x, y in ploter], [y for x, y in ploter], c='b', linewidth=1.5)
plt.show()
pre_data = model_2.predict(test_data)
pre_data = pd.DataFrame(pre_data)
#
# simple = pd.read_csv('SampleSubmission.csv')
# simple['Item_Outlet_Sales'] = pre_data
# simple.to_csv('generated_SampleSubmission.csv')

model_3 = XGBRegressor(max_depth=4, n_estimators=12, learning_rate=.2)
model_3.fit(x_train, y_train)
print('model 3:', mean_squared_error(y_test, model_3.predict(x_test)))
print('score :', model_3.score(x_test, y_test))
# 8,40 = 1324449
# 4, 34 = 1150544

# ploter=[]
# for i in range(1, 80):
#     model = XGBRegressor(max_depth=4, n_estimators=i, learning_rate=.2)
#     model.fit(x_train, y_train)
#     ploter.append((i, mean_squared_error(y_test, model.predict(x_test))))
#     print(i,'/',99)
# plt.plot([x for x, y in ploter], [y for x, y in ploter], c='r', marker='.')
# plt.show()
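# A hedged alternative to the manual sweeps above: let GridSearchCV
# cross-validate the same depth / n_estimators grid (the ranges here are
# illustrative, not taken from the original runs):
from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': [3, 4, 6, 8], 'n_estimators': [10, 20, 40, 80]}
search = GridSearchCV(XGBRegressor(learning_rate=.2), param_grid,
                      scoring='neg_mean_squared_error', cv=3)
search.fit(x_train, y_train)
print('best params :', search.best_params_)
print('best CV MSE :', -search.best_score_)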

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=66,
                                                    shuffle=True)

#2. Model
# model = DecisionTreeRegressor(max_depth=4)
# model = RandomForestRegressor(max_depth=4)
# model = GradientBoostingRegressor(max_depth=4)
model = XGBRegressor(n_jobs=-1)

#3. Train
model.fit(x_train, y_train)

#4. Evaluate, predict
acc = model.score(x_test, y_test)
print(model.feature_importances_)
print("acc : ", acc)

#5. Visualization
import matplotlib.pyplot as plt
import numpy as np
'''
def plot_feature_importances_dataset(model):
    n_features = dataset.data.shape[1]
    plt.barh(np.arange(n_features), model.feature_importances_,
        align='center')
    plt.yticks(np.arange(n_features), dataset.feature_names)
    plt.xlabel("Feature Importances")
    plt.ylabel("Features")
    plt.ylim(-1, n_features)
'''
Example #12
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)
plt.scatter(y_test, y_pred)

print(rf.score(x_test, y_test))

print('MAE :', " ", metrics.mean_absolute_error(y_test, y_pred))
print('MSE :', " ", metrics.mean_squared_error(y_test, y_pred))
print('RMSE :', " ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# XGBoost

xgb = XGBRegressor(n_estimators=500,
                   max_depth=4,
                   learning_rate=0.1,
                   early_stopping_rounds=10)
xgb.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=False)

y_pred = xgb.predict(x_test)
plt.scatter(y_test, y_pred)

print(xgb.score(x_test, y_test))

plot_importance(xgb)  # plot_importance comes from the xgboost package

print('MAE :', " ", metrics.mean_absolute_error(y_test, y_pred))
print('MSE :', " ", metrics.mean_squared_error(y_test, y_pred))
print('RMSE :', " ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
Example #13
X = data.data
y = data.target

Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

# In[]:
# B. Weak learner hyperparameters:
# 4. booster (choice of weak learner)
for booster in ["gbtree", "gblinear", "dart"]:
    reg = XGBR(n_estimators=260,
               learning_rate=0.25,
               random_state=420,
               booster=booster,
               silent=True).fit(Xtrain, Ytrain)
    print(booster)
    print(reg.score(Xtest, Ytest))

# gblinear (the linear weak learner) performs worst: this suggests the Boston housing dataset is not linear (the features X and the target y are not linearly related)

# In[]:
# 5. objective (the loss function)
# sklearn-API XGB: objective defaults to reg:linear
reg = XGBR(n_estimators=270,
           subsample=0.75,
           learning_rate=0.13,
           random_state=420).fit(Xtrain, Ytrain)
print(reg.score(Xtest, Ytest))
print(MSE(Ytest, reg.predict(Xtest)))
# In[]:
# native xgboost library: obj defaults to binary:logistic
# load the data with the DMatrix class
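# Hedged sketch of the native API the two comments above point at: wrap the
# arrays in DMatrix and call xgb.train directly instead of the sklearn
# wrapper (the objective name assumes a recent xgboost release):
import xgboost as xgb

dtrain = xgb.DMatrix(Xtrain, label=Ytrain)
dtest = xgb.DMatrix(Xtest, label=Ytest)
params = {'objective': 'reg:squarederror', 'eta': 0.13}
booster = xgb.train(params, dtrain, num_boost_round=270,
                    evals=[(dtrain, 'train'), (dtest, 'test')])
print(MSE(Ytest, booster.predict(dtest)))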
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)

print(ada.score(X_test, y_test))
print('Parameters currently in use:\n')
pprint(ada.get_params())

# XGBoost
# XGBoost and AdaBoost are weak at multiclass classification. The reason can be found in the LightGBM literature review.

from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score

xgb = XGBRegressor()
xgb.fit(X_train, y_train)
xgb.score(X_test, y_test)

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from keras import optimizers


def baseline_model():
    # create model
    model = Sequential()
    model.add(
        Dense(64, input_dim=25, kernel_initializer='normal',
              activation='relu'))