# We test which approach works best: classification, regression, or stacking.
# shuffle=True is required here: recent scikit-learn raises a ValueError when
# random_state is passed to StratifiedKFold without shuffling enabled.
cv = StratifiedKFold(5, shuffle=True, random_state=model_random_state)
# Start from the grid-search winner, then override a few params by hand.
updated_dict = gridsearch.best_params_
updated_dict['learning_rate'] = .1
updated_dict['n_estimators'] = 800
updated_dict['min_child_weight'] = 50

#%%
print('===============XGboost regression with rounding===============')
# Regressor whose continuous predictions get rounded to the ordinal labels
# by the quadratic_weighted_kappa_round scorer.
# NOTE(review): early_stopping_rounds normally requires an eval_set at fit
# time (recent XGBoost raises without one) — presumably evaluation.cv_scores
# supplies it; confirm.
xgbr = XGBRegressor(early_stopping_rounds=80,
                    n_jobs=-1,
                    random_state=model_random_state)
xgbr.set_params(**updated_dict)
# Cross-validate with the rounding-aware kappa scorer, keeping the fitted
# estimators for later inspection.
xgbr_scores = evaluation.cv_scores(
    xgbr,
    X_train,
    y_train,
    cv=cv,
    scoring=quadratic_weighted_kappa_round,
    return_estimator=True,
)

# train mean of score: 0.6473931873941305
# train std of score: 0.001041262887225388
# test mean of score: 0.6063831053298916
# test std of score: 0.003201456042199307

#%%
print('===============XGboost regression with decision tree===============')
# Stacking overfits very easily, so model complexity is deliberately reduced.
# NOTE(review): sclf is constructed but never used below — best_model
# (presumably defined earlier in the file) is what actually gets configured
# and cross-validated; confirm which estimator was intended.
sclf = StackingClassifier(
    classifiers=[xgbr],
    meta_classifier=DecisionTreeClassifier(min_samples_leaf=500,
                                           random_state=model_random_state))
# Override the tuned params with a lower-variance configuration.
updated_dict.update({
    'n_estimators': 1500,
    'subsample': .25,
    # 'max_depth': 5,
    'min_child_weight': 60,
    'colsample_bytree': .33,
})
best_model.set_params(**updated_dict)
# Build the final model: weight each sample by its recency — years elapsed
# since 2017-01-01, offset by 1 — so later observations count more.
time_series = pd.to_datetime(
    X.merge(df[['utc_time']], left_index=True, right_index=True)['utc_time'])
# NOTE(review): dividing by np.timedelta64(1, 'Y') is rejected by newer
# pandas/numpy ('Y' is not a fixed-length unit); may need
# pd.Timedelta(days=365.25) instead — confirm against the pinned versions.
X_weights = list(
    pd.to_timedelta(
        [l - pd.Timestamp('2017-01-01 00:00:00')
         for l in time_series]) / np.timedelta64(1, 'Y') + 1)
# shuffle=True is required: recent scikit-learn raises a ValueError when
# random_state is passed to KFold without shuffling enabled.
scores = evaluation.cv_scores(best_model,
                              X_rfe,
                              y,
                              cv=KFold(3, shuffle=True,
                                       random_state=model_random_state),
                              fit_params={'sample_weight': X_weights},
                              return_estimator=True)
final_model = scores['estimator'][0]
# train mean of r2: 0.9683232695378964
# train std of r2: 0.00011232320526037112
# test mean of r2: 0.9538122261617176
# test std of r2: 0.0002890399242091812
# train mean of smape: 0.2213808872433647
# train std of smape: 0.0384519096425457
# test mean of smape: 0.22190557395182622
# test std of smape: 0.0067101691226868435

#%%
# Save the final model: keep the estimator fitted on the first CV fold.
# NOTE(review): this repeats the identical assignment made right after the
# cross-validation above — harmless, but one of the two is redundant.
final_model = scores['estimator'][0]
# Example 3 (original stray text: "예제 #3" and a bare "0" — leftover
# notebook residue, not executable code)
# best train score: 0.9083334290707444
# best train std: 0.004247777959102244
# best test score: 0.8011678911600456
# best test std: 0.05801191776853861

# In[115]:


# LightGBM regressor for PM2.5: start from the randomized-search winner (rc),
# then apply hand-tuned overrides with stronger regularization.
lgbr = lgb.LGBMRegressor(objective='regression')
updated_dict = rc.best_params_
updated_dict['learning_rate'] = .01
updated_dict['n_estimators'] = 400
updated_dict['reg_alpha'] = 1
updated_dict['reg_lambda'] = 1
lgbr.set_params(**updated_dict)
# shuffle=True is required: recent scikit-learn raises a ValueError when
# random_state is passed to KFold without shuffling enabled.
scores = evaluation.cv_scores(lgbr,
                              train_X_PM25,
                              train_y_PM25,
                              cv=KFold(5, shuffle=True,
                                       random_state=model_random_state),
                              return_estimator=True)
final_model_PM25 = scores['estimator'][0]
# train mean of r2: 0.9105595243830468
# train std of r2: 0.004123702698672325
# test mean of r2: 0.8026625934690428
# test std of r2: 0.056931438217053924
# train mean of smape: 0.29400950618095223
# train std of smape: 0.007571337452039795
# test mean of smape: 0.3703110126085242
# test std of smape: 0.05225316940873302

#  2. PM10

# In[116]: