def test_unsupported_meta_regressor():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    lasso = Lasso()
    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge],
                                meta_regressor=lasso)
    with pytest.raises(TypeError):
        stack.fit(X1, y, sample_weight=w).predict(X1)
def test_predict_meta_features():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf')
    ridge = Ridge(random_state=1)
    stregr = StackingCVRegressor(regressors=[lr, ridge],
                                 meta_regressor=svr_rbf)
    X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.3)
    stregr.fit(X_train, y_train)
    test_meta_features = stregr.predict(X_test)
    assert test_meta_features.shape[0] == X_test.shape[0]
def test_train_meta_features_():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf')
    ridge = Ridge(random_state=1)
    stregr = StackingCVRegressor(regressors=[lr, ridge],
                                 meta_regressor=svr_rbf,
                                 store_train_meta_features=True)
    X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.3)
    stregr.fit(X_train, y_train)
    train_meta_features = stregr.train_meta_features_
    assert train_meta_features.shape[0] == X_train.shape[0]
def test_different_models():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge],
                                meta_regressor=svr_rbf)
    stack.fit(X1, y).predict(X1)
    mse = 0.21
    got = np.mean((stack.predict(X1) - y) ** 2)
    assert round(got, 2) == mse
def test_multivariate():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge],
                                meta_regressor=svr_rbf)
    stack.fit(X2, y).predict(X2)
    mse = 0.19
    got = np.mean((stack.predict(X2) - y) ** 2)
    assert round(got, 2) == mse, '%f != %f' % (round(got, 2), mse)
def test_use_features_in_secondary():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge],
                                meta_regressor=svr_rbf,
                                cv=3,
                                use_features_in_secondary=True)
    stack.fit(X1, y).predict(X1)
    mse = 0.2
    got = np.mean((stack.predict(X1) - y) ** 2)
    assert round(got, 2) == mse, '%f != %f' % (round(got, 2), mse)
def test_weight_ones():
    # sample_weight = None and sample_weight = ones
    # should give the same result, provided that the
    # randomness of the models is controlled
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge],
                                meta_regressor=svr_rbf,
                                cv=KFold(5, shuffle=True, random_state=5))
    pred1 = stack.fit(X1, y).predict(X1)
    pred2 = stack.fit(X1, y, sample_weight=np.ones(40)).predict(X1)
    assert np.max(np.abs(pred1 - pred2)) < 1e-3
def test_internals():
    lr = LinearRegression()
    regressors = [lr, lr, lr, lr, lr]
    cv = 10
    stack = StackingCVRegressor(regressors=[lr, lr, lr, lr, lr],
                                meta_regressor=lr,
                                cv=cv)
    stack.fit(X3, y2)
    assert stack.predict(X3).mean() == y2.mean()
    assert stack.meta_regr_.intercept_ == 0.0
    assert stack.meta_regr_.coef_[0] == 0.0
    assert stack.meta_regr_.coef_[1] == 0.0
    assert stack.meta_regr_.coef_[2] == 0.0
    assert len(stack.regr_) == len(regressors)
def test_sample_weight():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge],
                                meta_regressor=svr_rbf,
                                cv=KFold(4, shuffle=True, random_state=7))
    pred1 = stack.fit(X1, y, sample_weight=w).predict(X1)
    mse = 0.21  # 0.20770
    got = np.mean((stack.predict(X1) - y) ** 2)
    assert round(got, 2) == mse, "Expected %.2f, but got %.5f" % (mse, got)

    pred2 = stack.fit(X1, y).predict(X1)
    maxdiff = np.max(np.abs(pred1 - pred2))
    assert maxdiff > 1e-3, "max diff is %.4f" % maxdiff
def test_sparse_matrix_inputs():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge],
                                meta_regressor=svr_rbf,
                                random_state=42)

    # dense
    stack.fit(X1, y).predict(X1)
    mse = 0.21
    got = np.mean((stack.predict(X1) - y) ** 2)
    assert round(got, 2) == mse, got

    # sparse
    stack.fit(sparse.csr_matrix(X1), y)

    if Version(sklearn_version) < Version("0.21"):
        expected_value = 0.20
    else:
        expected_value = 0.19

    got = np.mean((stack.predict(sparse.csr_matrix(X1)) - y) ** 2)
    assert round(got, 2) == expected_value, got
def test_get_params():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf')
    ridge = Ridge(random_state=1)
    stregr = StackingCVRegressor(regressors=[ridge, lr],
                                 meta_regressor=svr_rbf)

    got = sorted(list({s.split('__')[0] for s in stregr.get_params().keys()}))
    expect = ['cv',
              'linearregression',
              'meta-svr',
              'meta_regressor',
              'regressors',
              'ridge',
              'shuffle',
              'store_train_meta_features',
              'use_features_in_secondary']
    assert got == expect, got
def test_weight_unsupported_with_no_weight():
    # should be okay since we do not pass weight
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    lasso = Lasso()
    stack = StackingCVRegressor(regressors=[svr_lin, lr, lasso],
                                meta_regressor=ridge)
    stack.fit(X1, y).predict(X1)

    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge],
                                meta_regressor=lasso)
    stack.fit(X1, y).predict(X1)
def test_sparse_matrix_inputs_with_features_in_secondary():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge],
                                meta_regressor=svr_rbf,
                                random_state=42,
                                use_features_in_secondary=True)

    # dense
    stack.fit(X1, y).predict(X1)
    mse = 0.20
    got = np.mean((stack.predict(X1) - y) ** 2)
    assert round(got, 2) == mse, got

    # sparse
    stack.fit(sparse.csr_matrix(X1), y)
    mse = 0.20
    got = np.mean((stack.predict(sparse.csr_matrix(X1)) - y) ** 2)
    assert round(got, 2) == mse, got
cols100 = [i for i, j in enumerate(var) if j in tmp]
pipe4 = make_pipeline(ColumnSelector(cols=tuple(cols100)),
                      RandomForestRegressor(n_estimators=100, n_jobs=-1))
#cols137 = [i for i,j in enumerate(var) if j in var]
#pipe5 = make_pipeline(ColumnSelector(cols=tuple(cols137)),
#                      RandomForestRegressor(n_estimators=100,n_jobs=-1))

lasso = Lasso()
lasso = Lasso(alpha=0.5)

stack1 = StackingCVRegressor(regressors=(pipe1, pipe2, pipe3, pipe4),
                             use_features_in_secondary=True,
                             meta_regressor=lasso)

tr_x = tr.loc[:, var]
#tr_x2 = tr.loc[:,var2]
tr_y = tr.loc[:, 'elect_down']
#tr_y2 = tr.loc[:,'square_elect_down']
te_N = te.loc[te.typhoon == 'NESATANDHAITANG', var]
te_M = te.loc[te.typhoon == 'MEGI', var]

stack1.fit(tr_x.values, tr_y.values)
Nes = stack1.predict(te_N.values)
Meg = stack1.predict(te_M.values)

test = pd.read_csv('/Users/charlie/Desktop/Taipower/data/submit.csv')
#test.NesatAndHaitang = Nes *(602539/2471910.1150000058)
#test.Megi = Meg *(4180000/3816284.0750000146)
# XGBoost Regressor
xgboost = XGBRegressor()

# Ridge Regressor
ridge_alphas = [0.1, 0.3, 1, 3, 5, 10, 15, 18, 20, 30, 50, 75, 100]
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=ridge_alphas, cv=kf))

# Gradient Boosting Regressor
gbr = GradientBoostingRegressor()

# Random Forest Regressor
rf = RandomForestRegressor()

# Stack up all the models above, optimized using xgboost
stack_gen = StackingCVRegressor(regressors=(xgboost, lightgbm, ridge, gbr, rf),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

# Scores
scores = {}

score = cv_rmse(lightgbm)
print("lightgbm: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['lgb'] = (score.mean(), score.std())

score = cv_rmse(xgboost)
print("xgboost: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['xgb'] = (score.mean(), score.std())

score = cv_rmse(ridge)
def stacked_model(self):
    self.stacked_model = StackingCVRegressor(
        regressors=(self.ridge, self.lasso, self.elasticnet,
                    self.gradient_boost, self.lightgbm, self.xgboost),
        meta_regressor=self.xgboost,
        use_features_in_secondary=True)
score_dict['AdaBoost'] = diff
diff


# In[381]:


from mlxtend.regressor import StackingCVRegressor

best_stack = MultiOutputRegressor(
    StackingCVRegressor(
        regressors=(KNeighborsRegressor(
                        n_neighbors=best_param_nn['n_neighbors'],
                        algorithm=best_param_nn['algorithm']),
                    xgb.XGBRegressor(
                        n_estimators=best_param_xgb['estimator__n_estimators'],
                        eta=best_param_xgb['estimator__eta'],
                        gamma=best_param_xgb['estimator__gamma'],
                        max_depth=best_param_xgb['estimator__max_depth'])),
        meta_regressor=RandomForestRegressor(
            n_estimators=best_param_rf['n_estimators'],
            criterion=best_param_rf['criterion'],
            max_leaf_nodes=best_param_rf['max_leaf_nodes'],
            n_jobs=best_param_rf['n_jobs'],
            warm_start=best_param_rf['warm_start']),
        n_jobs=-1,
        refit=False))
best_stack.fit(x_train[:100], y_train[:100])


# In[382]:


pred_y = best_stack.predict(x_test)
diff = mean_absolute_error(y_test, pred_y)
score_dict['StackingCVRegressor'] = diff
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNet
from mlxtend.regressor import StackingCVRegressor
from sklearn.model_selection import GridSearchCV

rf = RandomForestRegressor()
en = ElasticNet()
gbr = GradientBoostingRegressor()
etr = ExtraTreesRegressor()
ada = AdaBoostRegressor()

stack = StackingCVRegressor(regressors=(en, gbr, etr, ada),
                            meta_regressor=rf,
                            random_state=4)
grid = GridSearchCV(estimator=stack,
                    param_grid={'meta_regressor__n_estimators': [10, 100]},
                    cv=10,
                    refit=True)
grid.fit(trainpc, y)
grid_predict = grid.predict(testpc)
grid_test = pd.DataFrame(grid_predict)

# model fitting with KerasRegressor
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
XGBR = XGBRegressor(
    learning_rate=0.01,
    n_estimators=3500,
    max_depth=3,
    min_child_weight=0,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    scale_pos_weight=1,
    reg_alpha=0.00006,
    seed=100
)

STACKING = StackingCVRegressor(regressors=(RIDGE, BRIDGE, LASSO,
                                           #ELASTICNET,
                                           GBMR, LGBMR, XGBR),
                               meta_regressor=XGBR,
                               use_features_in_secondary=True,
                               random_state=0)

print(datetime.now(), 'RIDGE: ', end="")
RIDGE_MODEL = RIDGE.fit(X_train, y)
print(rmsle(y, RIDGE_MODEL.predict(X_train)))

print(datetime.now(), 'BRIDGE: ', end="")
BRIDGE.fit(X_train, y)
print(rmsle(y, BRIDGE.predict(X_train)))

print(datetime.now(), 'LASSO: ', end="")
LASSO_MODEL = LASSO.fit(X_train, y)
print(rmsle(y, LASSO_MODEL.predict(X_train)))
                         max_depth=5,
                         learning_rate=0.03,
                         colsample_bytree=0.8,
                         subsample=0.7,
                         booster='gbtree')

xgb_cols = [
    'weather', 'atemp', 'humidity', 'windspeed', 'holiday', 'workingday',
    'Hour', 'week_day', 'Year', 'Day', 'season'
]

params = {'depth': 6, 'learning_rate': 0.05, 'iterations': 150}
cat_model = CatBoostRegressor(1000)
cat_model.fit(train[xgb_cols], train['registered_log'])

lr = LinearRegression()
streg_model = StackingCVRegressor(
    regressors=[cat_model, rf_model, gbm_model, xgb_model],
    meta_regressor=lr)

scores_casual_cat = cross_val_score(cat_model,
                                    train[xgb_cols],
                                    train['casual_log'],
                                    cv=5,
                                    scoring=make_scorer(log_rmsle,
                                                        greater_is_better=False))
scores_r_cat = cross_val_score(cat_model,
                               train[xgb_cols],
                               train['registered_log'],
                               cv=5,
                               scoring=make_scorer(log_rmsle,
                                                   greater_is_better=False))
scores_casual_xgb = cross_val_score(xgb_model,
                    random_state=1,
                    activation='tanh',
                    max_iter=10000,
                    )

lasso = Lasso(max_iter=5000, alpha=0.001, random_state=SEED)
enet = ElasticNet(random_state=SEED, alpha=0.001)
ridge = Ridge(alpha=1, random_state=SEED)
rf = RandomForestRegressor(n_estimators=1024,
                           bootstrap=True,
                           max_features='auto',
                           min_samples_leaf=1,
                           min_samples_split=2,
                           random_state=SEED,
                           )
xgb = GradientBoostingRegressor(random_state=SEED,
                                n_estimators=1024,
                                learning_rate=0.05,
                                )

stack = StackingCVRegressor(regressors=(ridge, lasso, rf, xgb, enet, mlp),
                            meta_regressor=lasso,
                            verbose=1,
                            n_jobs=2,
                            use_features_in_secondary=True)

# clf_label_zip = zip([ridge, lasso, rf, xgb, mlp, stack],
#                     ['Ridge', 'Lasso', 'Random Forest', 'xgb',
#                      'mlp', 'StackingClassifier'])
clf_label_zip = [(ridge, 'Ridge'), (lasso, 'Lasso')]


def get_statistic():
    df = pd.read_csv(root_path + in_file, index_col='index')
    # ageing only
    # df.rename(columns = {'SPICE1':'label'}, inplace = True)
    # pred_results = []
    results = []
    for i in range(6):
        # for i in [0]:
        if i == 0:
    'reg_alpha': 0,
    'reg_lambda': 1
}

#model = xgb.XGBRegressor(**other_params)
#mgb = GridSearchCV(estimator=model, param_grid=cv_params, scoring='neg_mean_squared_error', cv=5, verbose=1)
#mgb.fit(train_X, train_Y)
#print('Best parameter values: {0}'.format(mgb.best_params_))
#print('Best model score: {0}'.format(-mgb.best_score_))
#myxgb = mgb.best_estimator_
myxgb = xgb.XGBRegressor(**other_params)

###############################-- Model ensembling --######################################
stack = StackingCVRegressor(regressors=[myxgb, myRFR, mylgb],
                            meta_regressor=bayes,
                            use_features_in_secondary=True,
                            cv=8)
stack.fit(train_X, train_Y)
pred_Y = stack.predict(test_X)
mse = mean_squared_error(test_Y, pred_Y)
print('mse: %.10f' % mse)

folds = KFold(n_splits=7, shuffle=True, random_state=2019)
#mean = []
#for fold, (i, j) in enumerate(folds.split(train_X1, train_Y1)):
#    print("fold {}".format(fold+1))
#    trn_X, trn_Y = train_X1[i], train_Y1[i]
#    tsn_X, tsn_Y = train_X1[j], train_Y1[j]
#
#    stack = stack
r = ridge.fit(train, y_train)
predictions = ridge.predict(test)
RIDGE = np.expm1(predictions)

elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNet(max_iter=1e7, alpha=0.0004, l1_ratio=0.9))
score = rmsle_cv(elasticnet)
print(f"ElasticNet score: {score.mean():.4f} ({score.std():.4f})")
e = elasticnet.fit(train, y_train)
predictions = elasticnet.predict(test)
EN = np.expm1(predictions)

stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet,
                                            lgb, gbr, svr),
                                meta_regressor=lgb,
                                use_features_in_secondary=True)
score = rmsle_cv(stack_gen)
print(f"Stack score: {score.mean():.4f} ({score.std():.4f})")
sg = stack_gen.fit(train, y_train)
predictions = stack_gen.predict(test)
STACK = np.expm1(predictions)

for i in range(LASSO.size):
    if LASSO[i] < 55000 or LASSO[i] > 500000:
        LASSO[i] = XGB[i]
    else:
        LASSO[i] = .1 * XGB[i] + .05 * LGB[i] + .05 * LASSO[i] + \
            .2 * SVR[i] + .05 * GBR[i] + .25 * RIDGE[i] + .05 * EN[i] + .25 * STACK[i]

submission = pd.DataFrame()
                              verbose=-1)

# Stack model
baggingmodel_lasso = BaggingRegressor(base_estimator=lasso)
baggingmodel_ENet = BaggingRegressor(base_estimator=ENet)
# baggingmodel_KRR = BaggingRegressor(base_estimator=KRR)
baggingmodel_svr = BaggingRegressor(base_estimator=svr)
baggingmodel_GBoost = BaggingRegressor(base_estimator=GBoost)
GBoost = GBoost
baggingmodel_lgb = BaggingRegressor(base_estimator=model_lgb)
baggingmodel_xgb = BaggingRegressor(base_estimator=model_xgb)

stackmodel = make_pipeline(
    imp_median,
    StackingCVRegressor(regressors=(lasso, ENet, GBoost, model_xgb, model_lgb),
                        meta_regressor=model_xgb,
                        use_features_in_secondary=True))

# TestModel(stackmodel, 1, 0.20, True)
#stackmodel: RMSLE: 0.012 | MAPE: 7.785
#baggingmodel_xgb: RMSLE: 0.012 | MAPE: 7.482
#baggingmodel_lgb: RMSLE: 0.012 | MAPE: 7.568
#baggingmodel_GBoost:
#GBoost:

# sbaggingmodel_lasso = MakePrediction(baggingmodel_lasso); print("sbaggingmodel_lasso done")
# sbaggingmodel_ENet = MakePrediction(baggingmodel_ENet); print("baggingmodel_ENet done")
# sbaggingmodel_KRR = MakePrediction(baggingmodel_KRR); print("baggingmodel_KRR done")
# sbaggingmodel_svr = MakePrediction(baggingmodel_svr); print("baggingmodel_svr done")
sGBoost = MakePrediction(baggingmodel_GBoost)
print("baggingmodel_GBoost done")
                                min_samples_split=10,
                                loss='huber',
                                random_state=0)

rf = RandomForestRegressor(n_estimators=1200,
                           max_depth=15,
                           min_samples_split=5,
                           min_samples_leaf=5,
                           max_features=None,
                           oob_score=True,
                           random_state=0,
                           n_jobs=-1)

# Stack up all the models above, optimized using xgboost
stack_gen = StackingCVRegressor(regressors=(lasso, xgboost, lightgbm,
                                            svr, ridge, gbr, rf),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)


def elapsed_time(time_start, time_stop):
    print('Elapsed time:', timedelta(seconds=round(time_stop - time_start, 0)))


#%% MODELLING : GET BASELINE CROSS-VALIDATION SCORES - MANDATORY
scores = {}

print('Baseline Cross-Validation Scores (RMSLE)')

print('lasso')
t_start = perf_counter()
elasticnet_l1ratios = [0.8, 0.85, 0.9, 0.95, 1]

#lasso
lasso_alphas = [5e-5, 1e-4, 5e-4, 1e-3]

#ridge
ridge_alphas = [13.5, 14, 14.5, 15, 15.5]

MODELS = {
    "elasticnet": make_pipeline(RobustScaler(),
                                ElasticNetCV(max_iter=1e7,
                                             alphas=elasticnet_alphas,
                                             l1_ratio=elasticnet_l1ratios)),
    "lasso": make_pipeline(RobustScaler(),
                           LassoCV(max_iter=1e7,
                                   alphas=lasso_alphas,
                                   random_state=42)),
    "ridge": make_pipeline(RobustScaler(),
                           RidgeCV(alphas=ridge_alphas)),
    "gradb": GradientBoostingRegressor(n_estimators=6000,
                                       learning_rate=0.01,
                                       max_depth=4,
                                       max_features='sqrt',
                                       min_samples_leaf=15,
                                       min_samples_split=10,
                                       loss='huber',
                                       random_state=42),
    "svr": make_pipeline(RobustScaler(),
                         SVR(C=20, epsilon=0.008, gamma=0.0003)),
    "xgboost": XGBRegressor(learning_rate=0.01,
                            n_estimators=6000,
                            max_depth=3,
                            min_child_weight=0,
                            gamma=0,
                            subsample=0.7,
                            colsample_bytree=0.7,
                            objective='reg:squarederror',
                            nthread=-1,
                            scale_pos_weight=1,
                            seed=27,
                            reg_alpha=0.00006,
                            random_state=42)}

MODELS_stack = StackingCVRegressor(regressors=(MODELS['elasticnet'],
                                               MODELS['gradb'],
                                               MODELS['lasso'],
                                               MODELS['ridge'],
                                               MODELS['svr'],
                                               MODELS['xgboost']),
                                   meta_regressor=MODELS['xgboost'],
                                   use_features_in_secondary=True)
rf = RandomForestRegressor(max_depth=15,
                           min_samples_split=5,
                           min_samples_leaf=5,
                           max_features=None,
                           oob_score=True)

ridge_alphas = [1e-3, 1e-2, 1e-1, 1, 0.1, 0.3, 1, 3, 5, 10, 15, 18, 20, 30, 50, 75, 100]
ridge = make_pipeline(RobustScaler(),
                      RidgeCV(alphas=ridge_alphas, normalize=True, cv=cv))

lasso_alphas = np.logspace(-10, 0.1, 140)
lasso = make_pipeline(RobustScaler(),
                      LassoCV(alphas=lasso_alphas, normalize=True, tol=0.1, cv=cv))

# Stacking the models
stack = StackingCVRegressor(regressors=(lgbm, rf, svr, ridge, lasso),
                            meta_regressor=lgbm,
                            use_features_in_secondary=True)


# Define a scoring system
def mse(y, y_pred):
    return mean_squared_error(y, y_pred)


def cv_mse(model, X, y):
    rmse = -cross_val_score(model, X, y,
                            scoring='neg_mean_squared_error', cv=cv)
    return rmse


# Test the models' accuracy
all_models = {'Random Forest': rf,
# # Extra Trees
# et_model = ExtraTreesRegressor(n_estimators=100, n_jobs=4, min_samples_split=25,
#                                min_samples_leaf=35, max_features=150)
# results = cross_val_score(et_model, train, y_train, cv=5, scoring='r2')
# print("ET score: %.4f (%.4f)" % (results.mean(), results.std()))

stack = StackingCVRegressor(
    #meta_regressor=Ridge(alpha=10),
    meta_regressor=ElasticNet(l1_ratio=0.1, alpha=1.5),
    regressors=(svm_pipe, en_pipe, xgb_pipe, rf_model, lgbm_model))
    #regressors=(svm_pipe, en_pipe, xgb_pipe, rf_model))

# cv_pred = cross_val_predict(stack, train, y_train, cv=5)
# print("R2 score: %.4f" % r2_score(y_train, cv_pred))
# exit()

## R2 score: 0.5600 (en_pipe, rf_model)
## R2 score: 0.5601 (svm_pipe, en_pipe, xgb_pipe, rf_model, et_model)
## R2 score: 0.5605 (svm_pipe, en_pipe, xgb_pipe, rf_model, et_model, lgbm_model)
## R2 score: 0.5618 (svm_pipe, en_pipe, xgb_pipe, rf_model, lgbm_model)

stack.fit(train, y_train)
y_test = stack.predict(test)
            ('get', PipeExtractor(size_features)),
        ])),
        ('means', Pipeline([
            ('get', PipeExtractor(mean_features)),
        ])),
        ('medians', Pipeline([
            ('get', PipeExtractor(median_features)),
        ])),
    ])),
])

p31 = make_pipeline(pipe3, lgbr2)
p32 = make_pipeline(pipe3, lr1)

p8 = StackingCVRegressor(
    regressors=[p11, p12, p13, p21, p22, p23],
    meta_regressor=lassoCV,
    cv=5)

pipe = p8

mode = 'Submit'

if mode == 'Val':
    cv = cross_val_score(pipe, train_df, y_train, cv=5)
    print("R^2 Score: %0.4f (+/- %0.3f) [%s]" % (
        cv.mean(), cv.std(), pipe.__class__))
elif mode == 'Grid':
    params = {
        for regr in self.regressors:
            regr.fit(X, y)
        return self

    def predict(self, X):
        self.predictions = np.column_stack([regr.predict(X)
                                            for regr in self.regressors])
        return np.mean(self.predictions, axis=1)


lasso = Lasso(alpha=0.01)
ridge = Ridge(alpha=0.1)
en = ElasticNet(alpha=0.0005, l1_ratio=0.5)
svr = SVR(C=1500, epsilon=7)
rf = RandomForestRegressor(n_estimators=75, min_samples_leaf=6,
                           min_samples_split=2, max_depth=4, max_features=5)
gbm = GradientBoostingRegressor(subsample=0.7, min_samples_leaf=6)
xgbm = xgb.sklearn.XGBRegressor(n_estimators=75, colsample_bytree=0.8,
                                max_depth=2, subsample=0.5)

stacked = StackingCVRegressor(regressors=(lasso, ridge, en, xgbm, svr, rf),
                              meta_regressor=Lasso(alpha=128),
                              cv=5,
                              use_features_in_secondary=True)
stack_nofeats = StackingCVRegressor(regressors=(lasso, ridge, en, xgbm, svr, rf),
                                    meta_regressor=Lasso(alpha=15),
                                    cv=5)
average = AveragingRegressor((lasso, ridge, en, svr, rf, gbm, xgbm, stacked))

# lasso.fit(X, y)
# ridge.fit(X, y)
# stack_nofeats.fit(X,y)
# stacked.fit(X,y)
# exit()

# import seaborn as sns
# import matplotlib.pyplot as plt
#
# average.fit(X,y)
# average.predict(X)
# sns.pairplot(pd.DataFrame(average.predictions))
rf = RandomForestRegressor(n_estimators=50, max_depth=5,
                           random_state=2018, n_jobs=8)
xgb = XGBRegressor(n_estimators=50, learning_rate=0.75,
                   random_state=2018, n_jobs=8)
lgb = LGBMRegressor(n_estimators=50, learning_rate=0.75,
                    random_state=2018, n_jobs=8)
svr = SVR(kernel='rbf', gamma='auto')
lr = LinearRegression(n_jobs=8)
models = [rf, xgb, lgb, svr]

y_pred_self = StackingModels(models=models, meta_model=lr,
                             X_train=X_train, X_test=X_test,
                             y_train=y_train, use_probas=False,
                             task_mode='reg')
mse = mean_squared_error(y_test, y_pred_self)
print('MyModel: MSE = {:.6f}'.format(mse))

stack_reg = StackingCVRegressor(regressors=models, meta_regressor=lr,
                                cv=5).fit(X_train, y_train)
y_pred_mlxtend = stack_reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred_mlxtend)
print('Mlxtend: MSE = {:.6f}'.format(mse))
                       seed=27,
                       reg_alpha=0.00006)

# using gradient boosting algorithm
gbr = GradientBoostingRegressor(n_estimators=3000,
                                learning_rate=0.05,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)

# using ensemble
stack_gen = StackingCVRegressor(regressors=(lasso, gbr, svr),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

# training our methods
score = cv_rmse(lasso)
print("LASSO: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), )

score = cv_rmse(svr)
print("SVR: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), )

score = cv_rmse(gbr)
print("gbr: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), )

# fitting ensemble of algorithms
print('stack_gen')
stack_gen_model = stack_gen.fit(np.array(train), np.array(y_label))
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)

# Random Forest Regressor
rf = RandomForestRegressor(n_estimators=1200,
                           max_depth=15,
                           min_samples_split=5,
                           min_samples_leaf=5,
                           max_features=None,
                           oob_score=True,
                           random_state=42)

# Stack up all the models above, optimized using xgboost
stack_gen = StackingCVRegressor(regressors=(xgboost, lightgbm, svr, ridge, gbr, rf),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

scores = {}

score = cv_rmse(lightgbm)
print("lightgbm: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['lgb'] = (score.mean(), score.std())

## RANDOM FOREST CLASSIFIER
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=10, criterion="entropy")
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
# Support Vector Regressor
svr = make_pipeline(RobustScaler(), SVR(C=20, epsilon=0.008, gamma=0.0003))

# Gradient Boosting Regressor
gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.01,
                                max_depth=4, max_features='sqrt',
                                min_samples_leaf=15, min_samples_split=10,
                                loss='huber', random_state=42)
gbr2 = GradientBoostingRegressor(n_estimators=1500, learning_rate=0.01,
                                 max_depth=1, max_features='sqrt',
                                 min_samples_leaf=15, min_samples_split=10,
                                 loss='huber', random_state=42)

# Random Forest Regressor
rf = RandomForestRegressor(n_estimators=1200, max_depth=15,
                           min_samples_split=5, min_samples_leaf=5,
                           max_features=None, oob_score=True,
                           random_state=42)
rf2 = RandomForestRegressor(n_estimators=1200, max_depth=5,
                            min_samples_split=5, min_samples_leaf=5,
                            max_features=None, oob_score=True,
                            random_state=42)

# Stack up all the models above, optimized using xgboost
stack_gen = StackingCVRegressor(regressors=(xgboost, lightgbm, svr, ridge,
                                            gbr, gbr2, xgboost2),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

"""# **MAE scores for several different machine learning algorithms**"""

# Title
scores = {}

score = cv_rmse(lightgbm)
print("lightgbm: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['lgb'] = (score.mean(), score.std())

score = cv_rmse(xgboost)
print("xgboost: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['xgb'] = (score.mean(), score.std())
    'gamma': 0,
    'reg_alpha': 0,
    'reg_lambda': 1
}

model = xgb.XGBRegressor(**other_params)
mgb = GridSearchCV(estimator=model, param_grid=cv_params,
                   scoring='neg_mean_squared_error', cv=5, verbose=1)
mgb.fit(train_X, train_Y)
print('Best parameter values: {0}'.format(mgb.best_params_))
print('Best model score: {0}'.format(-mgb.best_score_))
myxgb = mgb.best_estimator_

##############################-- Model ensembling --######################################
stack = StackingCVRegressor(regressors=[myGBR, myxgb],
                            meta_regressor=LinearRegression(),
                            use_features_in_secondary=True,
                            cv=5)
stack.fit(train_X, train_Y)
pred_Y = stack.predict(test_X)
print(mean_squared_error(test_Y, pred_Y))

Y_pred = stack.predict(test)
results = pd.DataFrame(Y_pred, columns=['target'])
results.to_csv("results.txt", index=False, header=False)
print("over")
                       n_estimators=3460,
                       max_depth=3,
                       min_child_weight=0,
                       gamma=0,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:linear',
                       nthread=4,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006))

# We use a stacked model, which basically means the predictions of our base
# regressors act as input to our meta-regressor, with SalePrice as the target.
stack_gen = StackingCVRegressor(regressors=(ridge, elasticnet, lasso, xgboost, lightgbm),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

stackX = np.array(X)
stacky = np.array(y)

# Fit the models
elasticnet.fit(X, y)
lasso.fit(X, y)
ridge.fit(X, y)
xgboost.fit(X, y)
lightgbm.fit(X, y)
stack_gen.fit(stackX, stacky)

# We take a weighted average of our predictions; what this does is add some
# variance at the expense of losing a little bias.
stack_preds = ((0.2 * elasticnet.predict(test_data)) +
catboost = cbr.CatBoostRegressor(random_state=666,
                                 n_estimators=5676,
                                 l2_leaf_reg=4.819227931494778,
                                 learning_rate=0.017224166221196126,
                                 max_depth=5,
                                 silent=True)
gbm = lgb.LGBMRegressor(n_estimators=565,
                        objective='regression',
                        learning_rate=0.11517897606077306,
                        max_depth=2,
                        min_data_in_leaf=2,
                        num_leaves=222,
                        random_state=666)

stack = StackingCVRegressor(regressors=(lr, lasso, ridge, xgboost, catboost, gbm),
                            meta_regressor=lasso,
                            random_state=666)

for clf, label in zip([lr, lasso, ridge, xgboost, catboost, gbm, stack],
                      ['LR', 'Lasso', 'Ridge', 'XGBoost', 'CatBoost',
                       'LightGBM', 'StackingCVRegressor']):
    scores = cross_val_score(clf, X_train, Y_train, cv=10,
                             scoring='neg_root_mean_squared_error')
    print("Neg. RMSE Score: %0.3f (+/- %0.3f) [%s]"
          % (scores.mean(), scores.std(), label))

stack.fit(X_train, Y_train)
x_train, y_train = processData.get_training_data()

# param_alpha = np.arange(1e-4, 1e-3, 1e-4)
param_alpha = [0.1, 1, 10]
# param_alpha = [0.1]

# The StackingCVRegressor uses scikit-learn's check_cv internally, which
# doesn't support a random seed. Thus NumPy's random seed needs to be
# specified explicitly for deterministic behavior.
RANDOM_SEED = 33
np.random.seed(RANDOM_SEED)

stregr = StackingCVRegressor(
    regressors=[Ridge(), Lasso()],
    # meta_regressor=ElasticNet(),
    meta_regressor=RandomForestRegressor(random_state=RANDOM_SEED),
    use_features_in_secondary=True)

# param_n_estimators = [100, 1000, 10000]
param_n_estimators = [100]
# elastic_net_param_alpha = np.arange(1e-4, 1e-3, 1e-4)
# elastic_net_param_l1_ratio = np.arange(0.1, 1.0, 0.1)
# param_grid = {'ridge__alpha': param_alpha, 'lasso__alpha': param_alpha,
#               'meta-elasticnet__alpha': elastic_net_param_alpha,
#               'meta-elasticnet__l1_ratio': elastic_net_param_l1_ratio,
#               'meta-elasticnet__max_iter': [1000]
#               }
param_grid = {
    'ridge__alpha': param_alpha,
    'lasso__alpha': param_alpha,
# Kernel Ridge Regression: made robust to outliers
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))

# LASSO Regression: made robust to outliers
lasso = make_pipeline(
    RobustScaler(),
    LassoCV(max_iter=1e7, alphas=alphas2, random_state=42, cv=kfolds))

# Elastic Net Regression: made robust to outliers
elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNetCV(max_iter=1e7, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))

stack_gen = StackingCVRegressor(regressors=(ridge, elasticnet, lightgbm),
                                meta_regressor=elasticnet,
                                use_features_in_secondary=True)

# store models, scores and prediction values
models = {
    'Ridge': ridge,
    'Lasso': lasso,
    'ElasticNet': elasticnet,
    'lightgbm': lightgbm,
    'xgboost': xgboost
}
predictions = {}
scores = {}

# Training the models
for name, model in models.items():
                         verbose=-1,
                         # min_data_in_leaf=2,
                         # min_sum_hessian_in_leaf=11
                         )

xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=3460,
                       max_depth=3,
                       min_child_weight=0,
                       gamma=0,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:linear',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006)

# stack
stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet,
                                            gbr, xgboost, lightgbm),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

print('TEST score on CV')

score = cv_rmse(ridge)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),
      datetime.now(), )

score = cv_rmse(lasso)
print("Lasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),
      datetime.now(), )

score = cv_rmse(elasticnet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),
      datetime.now(), )

score = cv_rmse(svr)
print("SVR score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),
      datetime.now(), )
print("start") dataset=wholeDataset.values print(dataset.shape) X = dataset[0:1460,2:276] Y = trainDataset['SalePrice'] X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3) xgbclassifier = xgb.XGBRegressor(colsample_bytree=0.7, gamma=0, min_child_weight=1.5, n_estimators=1000, reg_alpha=0.75, reg_lambda=0.45, subsample=0.6, seed=42, max_features=220) xgbclassifier.fit(X_train, Y_train) rfclassifier = RandomForestRegressor(n_estimators = 1000, max_features=220) rfclassifier.fit(X_train, Y_train) stack_gen = StackingCVRegressor(regressors=(xgbclassifier,rfclassifier), meta_regressor=xgbclassifier, use_features_in_secondary=True) stack_gen.fit(X_train,Y_train) print("Score rf",rfclassifier.score(X_test, Y_test)) print("Score xgb",xgbclassifier.score(X_test, Y_test)) print("Score stack",stack_gen.score(X_test, Y_test))
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)

# Random Forest Regressor
rf = RandomForestRegressor(n_estimators=1200,   # 1,200 trees
                           max_depth=15,        # maximum tree depth
                           min_samples_split=5,
                           min_samples_leaf=5,
                           max_features=None,
                           oob_score=True,
                           random_state=42)

# Stack up all the models above, optimized using xgboost
stack_gen = StackingCVRegressor(regressors=(xgboost, lightgbm, svr, ridge, gbr, rf),
                                meta_regressor=xgboost,  # second-level meta-learner is xgboost
                                use_features_in_secondary=True)


# In[273]:


"""
Train models

Get cross-validation scores for each model.
Train each model (e.g. lightgbm) on its own with k-fold cross-validation
to see what prediction score it gets and check its performance.
"""
rf = RandomForestRegressor(random_state=rand, n_estimators=10)
rf = model_train(rf, X_matrix_train, y_train_new)
model_eval(rf, X_matrix_train, y_train_new)
model_eval(rf, X_matrix_test, y_test_new)


# In[ ]:


import random

rand = random.seed(9001)

ridge = Ridge(random_state=rand)
lasso = Lasso(random_state=rand)
rf = RandomForestRegressor(random_state=rand)

stack = StackingCVRegressor(regressors=(lasso, ridge),
                            meta_regressor=rf,
                            random_state=rand,
                            use_features_in_secondary=True)

params = {'lasso__alpha': [0.1, 1.0, 2.0],
          'ridge__alpha': [0.1, 1.0, 2.0]}

grid = GridSearchCV(estimator=stack,
                    param_grid={
                        'lasso__alpha': [x / 5.0 for x in range(1, 2)],
                        'ridge__alpha': [x / 20.0 for x in range(1, 2)],
                    },
                    cv=2,
                    refit=True)

X = np.hstack(X_matrix_train)
grid.fit(X, y_train_new)