# We want to test classification, regression, and stacking to see which is best.
# NOTE(fix): shuffle=True is required whenever random_state is set —
# scikit-learn >= 0.24 raises ValueError for StratifiedKFold(random_state=...)
# with the default shuffle=False.
cv = StratifiedKFold(5, shuffle=True, random_state=model_random_state)

# Start from the grid-search winner, then hand-tune a few parameters.
updated_dict = gridsearch.best_params_
updated_dict['learning_rate'] = .1
updated_dict['n_estimators'] = 800
updated_dict['min_child_weight'] = 50

#%%
print('===============XGboost regression with rounding===============')
# NOTE(review): early_stopping_rounds requires an eval_set at fit time;
# presumably evaluation.cv_scores supplies one — confirm, otherwise XGBoost
# will error during fitting.
xgbr = XGBRegressor(random_state=model_random_state,
                    n_jobs=-1,
                    early_stopping_rounds=80)
xgbr.set_params(**updated_dict)
xgbr_scores = evaluation.cv_scores(xgbr,
                                   X_train,
                                   y_train,
                                   cv=cv,
                                   scoring=quadratic_weighted_kappa_round,
                                   return_estimator=True)
# train mean of score: 0.6473931873941305
# train std of score: 0.001041262887225388
# test mean of score: 0.6063831053298916
# test std of score: 0.003201456042199307

#%%
print('===============XGboost regression with decision tree===============')
# It is very easy for stacking to get overfitted, so we reduce the model
# complexity here: a large min_samples_leaf keeps the meta-learner shallow.
sclf = StackingClassifier(classifiers=[xgbr],
                          meta_classifier=DecisionTreeClassifier(
                              min_samples_leaf=500,
                              random_state=model_random_state))
# Hand-tuned overrides on top of the searched hyper-parameters.
updated_dict['n_estimators'] = 1500
updated_dict['subsample'] = .25
# updated_dict['max_depth'] = 5
updated_dict['min_child_weight'] = 60
updated_dict['colsample_bytree'] = .33
best_model.set_params(**updated_dict)

# Build the final model here: weight samples by recency so newer observations
# count more. Each weight is (years elapsed since 2017-01-01) + 1.
# NOTE(review): the 'Y' unit for np.timedelta64 division is deprecated in
# recent NumPy — consider dividing by pd.Timedelta(days=365.25) instead.
time_series = pd.to_datetime(
    X.merge(df[['utc_time']], left_index=True,
            right_index=True)['utc_time'])
X_weights = list(
    pd.to_timedelta(
        [l - pd.Timestamp('2017-01-01 00:00:00') for l in time_series])
    / np.timedelta64(1, 'Y') + 1)

# NOTE(fix): shuffle=True is required whenever random_state is set —
# scikit-learn >= 0.24 raises ValueError for KFold(random_state=...) with
# the default shuffle=False.
scores = evaluation.cv_scores(best_model,
                              X_rfe,
                              y,
                              cv=KFold(3, shuffle=True,
                                       random_state=model_random_state),
                              fit_params={'sample_weight': X_weights},
                              return_estimator=True)
final_model = scores['estimator'][0]
# train mean of r2: 0.9683232695378964
# train std of r2: 0.00011232320526037112
# test mean of r2: 0.9538122261617176
# test std of r2: 0.0002890399242091812
# train mean of smape: 0.2213808872433647
# train std of smape: 0.0384519096425457
# test mean of smape: 0.22190557395182622
# test std of smape: 0.0067101691226868435

#%%
# Save the final model (keep the first cross-validated estimator).
final_model = scores['estimator'][0]
# best train score: 0.9083334290707444
# best train std: 0.004247777959102244
# best test score: 0.8011678911600456
# best test std: 0.05801191776853861

# In[115]:
# LightGBM regressor for PM2.5: start from the random-search winner and
# hand-tune learning rate, tree count, and L1/L2 regularization.
lgbr = lgb.LGBMRegressor(objective='regression')
updated_dict = rc.best_params_
updated_dict['learning_rate'] = .01
updated_dict['n_estimators'] = 400
updated_dict['reg_alpha'] = 1
updated_dict['reg_lambda'] = 1
lgbr.set_params(**updated_dict)

# NOTE(fix): shuffle=True is required whenever random_state is set —
# scikit-learn >= 0.24 raises ValueError for KFold(random_state=...) with
# the default shuffle=False.
scores = evaluation.cv_scores(lgbr,
                              train_X_PM25,
                              train_y_PM25,
                              cv=KFold(5, shuffle=True,
                                       random_state=model_random_state),
                              return_estimator=True)
final_model_PM25 = scores['estimator'][0]
# train mean of r2: 0.9105595243830468
# train std of r2: 0.004123702698672325
# test mean of r2: 0.8026625934690428
# test std of r2: 0.056931438217053924
# train mean of smape: 0.29400950618095223
# train std of smape: 0.007571337452039795
# test mean of smape: 0.3703110126085242
# test std of smape: 0.05225316940873302

# 2. PM10

# In[116]: