def test_features_in_secondary():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    rf = RandomForestRegressor(n_estimators=10, random_state=2)
    ridge = Ridge(random_state=0)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stack = StackingRegressor(regressors=[svr_lin, lr, ridge, rf],
                              meta_regressor=svr_rbf,
                              use_features_in_secondary=True)

    stack.fit(X1, y).predict(X1)
    mse = 0.14
    got = np.mean((stack.predict(X1) - y) ** 2)
    print(got)
    assert round(got, 2) == mse

    stack = StackingRegressor(regressors=[svr_lin, lr, ridge, rf],
                              meta_regressor=svr_rbf,
                              use_features_in_secondary=False)

    stack.fit(X1, y).predict(X1)
    mse = 0.12
    got = np.mean((stack.predict(X1) - y) ** 2)
    print(got)
    assert round(got, 2) == mse
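# --- Assumed module-level setup (sketch, not from the original) ---
# The tests above and below reference shared fixtures (X1, X2, y, y2, w) and
# imports that are not shown in this excerpt. A minimal setup that makes them
# self-contained could look like the following; the exact data in the real
# test module may differ, so the hard-coded mse/score constants in these
# tests are only valid for the original fixtures.
import numpy as np
import pytest
from numpy.testing import assert_almost_equal
from scipy import sparse
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from mlxtend.regressor import StackingRegressor

np.random.seed(1)
X1 = np.sort(5 * np.random.rand(40, 1), axis=0)  # univariate inputs
X2 = np.sort(5 * np.random.rand(40, 2), axis=0)  # bivariate inputs
y = np.sin(X1).ravel()                           # noisy sine target
y[::5] += 3 * (0.5 - np.random.rand(8))
y2 = np.sin(X2[:, 0]) + np.cos(X2[:, 1])         # assumed second target
w = np.random.random(40)                         # sample weights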
def test_get_coeff():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    stregr = StackingRegressor(regressors=[svr_lin, lr],
                               meta_regressor=ridge)
    stregr.fit(X1, y)
    got = stregr.coef_
    expect = np.array([0.4874216, 0.45518317])
    assert_almost_equal(got, expect)
def test_get_intercept():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    stregr = StackingRegressor(regressors=[svr_lin, lr],
                               meta_regressor=ridge)
    stregr.fit(X1, y)
    got = stregr.intercept_
    expect = 0.024
    assert round(got, 3) == expect
def test_predict_meta_features():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf')
    ridge = Ridge(random_state=1)
    stregr = StackingRegressor(regressors=[lr, ridge],
                               meta_regressor=svr_rbf)
    X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.3)
    stregr.fit(X_train, y_train)
    # the test name refers to the meta features, so call
    # predict_meta_features rather than predict (the original called predict)
    test_meta_features = stregr.predict_meta_features(X_test)
    assert test_meta_features.shape[0] == X_test.shape[0]
def test_multivariate_class():
    lr = LinearRegression()
    ridge = Ridge(random_state=1)
    meta = LinearRegression(normalize=True)
    stregr = StackingRegressor(regressors=[lr, ridge],
                               meta_regressor=meta)
    stregr.fit(X2, y2).predict(X2)
    mse = 0.122
    got = np.mean((stregr.predict(X2) - y2) ** 2)
    assert round(got, 3) == mse
def test_train_meta_features_():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf')
    ridge = Ridge(random_state=1)
    stregr = StackingRegressor(regressors=[lr, ridge],
                               meta_regressor=svr_rbf,
                               store_train_meta_features=True)
    X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.3)
    stregr.fit(X_train, y_train)
    train_meta_features = stregr.train_meta_features_
    assert train_meta_features.shape[0] == X_train.shape[0]
def test_different_models():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    stregr.fit(X1, y).predict(X1)
    mse = 0.21
    got = np.mean((stregr.predict(X1) - y) ** 2)
    assert round(got, 2) == mse
def test_multivariate():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    stregr.fit(X2, y).predict(X2)
    mse = 0.218
    got = np.mean((stregr.predict(X2) - y) ** 2)
    assert round(got, 3) == mse
def test_weight_unsupported_meta():
    # a meta regressor with no support for sample_weight should raise an error
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    lasso = Lasso(random_state=1)
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=lasso)
    with pytest.raises(TypeError):
        stregr.fit(X1, y, sample_weight=w).predict(X1)
def test_multivariate():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    stregr.fit(X2, y).predict(X2)
    mse = 0.218
    got = np.mean((stregr.predict(X2) - y) ** 2)
    print(got)
    assert round(got, 3) == mse
def test_multivariate_class():
    lr = LinearRegression()
    ridge = Ridge(random_state=1)
    meta = LinearRegression(normalize=True)
    stregr = StackingRegressor(regressors=[lr, ridge],
                               meta_regressor=meta)
    stregr.fit(X2, y2).predict(X2)
    mse = 0.12
    got = np.mean((stregr.predict(X2) - y2) ** 2.)
    # there seems to be an issue with the following test on Windows
    # sometimes via Appveyor
    assert round(got, 2) == mse, got
def test_weight_ones():
    # a sample weight of all ones should produce the same outcome as no weight
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    pred1 = stregr.fit(X1, y).predict(X1)
    pred2 = stregr.fit(X1, y, sample_weight=np.ones(40)).predict(X1)
    maxdiff = np.max(np.abs(pred1 - pred2))
    assert maxdiff < 1e-3, "max diff is %.4f" % maxdiff
def regressionStacking(df):
    # StackingRegressor expects ndarray inputs
    X_train, X_test, y_train, y_test = trainDataSplit(df)
    randomforest_regressor = RandomForestRegressor()

    # # lightgbm's native API is not a scikit-learn estimator, so mlxtend
    # # does not support it directly
    # lgb_train = lightgbm.Dataset(X_train, y_train)
    # lgb_eval = lightgbm.Dataset(X_test, y_test, reference=lgb_train)
    #
    # # specify your configurations as a dict
    # params = {
    #     'task': 'train',
    #     'boosting_type': 'gbdt',
    #     'objective': 'regression',
    #     'metric': {'l2', 'auc'},
    #     'num_leaves': 2 ** 10,
    #     'learning_rate': 1.0,
    #     'feature_fraction': 0.9,
    #     'bagging_fraction': 0.8,
    #     'bagging_freq': 5,
    #     'verbose': 0
    # }
    # lightgbm_regressor = lightgbm.train(params,
    #                                     lgb_train,
    #                                     num_boost_round=20,
    #                                     valid_sets=lgb_eval,
    #                                     early_stopping_rounds=5)

    lasso_regressor = Lasso()
    dnn_regressor = MLPRegressor()
    linearRegression_regressor = LinearRegression()
    stacking_regressor = StackingRegressor(
        regressors=[randomforest_regressor, lasso_regressor, dnn_regressor],
        meta_regressor=linearRegression_regressor)
    # fit on the training labels (the original passed X_train twice by mistake)
    stacking_regressor.fit(X_train, y_train)
    y_pred = stacking_regressor.predict(X_test)
    criterion_df, predict_result = predictResultOutput(stacking_regressor,
                                                       X_test, y_test, y_pred)
    # save model
    joblib.dump(stacking_regressor, 'stacking.model')
    return criterion_df, predict_result
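# Added note (sketch, not from the original): the commented-out block above
# uses LightGBM's native lightgbm.train / lightgbm.Dataset API, which indeed
# is not a scikit-learn estimator. LightGBM does, however, ship a
# scikit-learn-compatible wrapper, and that wrapper works with mlxtend's
# StackingRegressor. Parameter values below are illustrative only.
from lightgbm import LGBMRegressor

lightgbm_regressor = LGBMRegressor(boosting_type='gbdt',
                                   num_leaves=2 ** 10,
                                   learning_rate=1.0,
                                   subsample=0.8,
                                   colsample_bytree=0.9)
# it could then be stacked alongside the other base regressors, e.g.
# StackingRegressor(regressors=[randomforest_regressor, lasso_regressor,
#                               dnn_regressor, lightgbm_regressor],
#                   meta_regressor=linearRegression_regressor)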
def sbg_mlxtend_ensamble(iterate):
    iterate += 501
    lin_mod = linear_model.LinearRegression()
    bsn_rdg = linear_model.BayesianRidge()
    elstc_nt = ElasticNet(alpha=0.2, l1_ratio=1)
    ridge = Ridge(alpha=0.01, tol=0.1, solver='sag')
    svr_rbf = svm.SVR(kernel='rbf', C=1e3, gamma=0.1)
    # note: n_iter is the pre-0.19 scikit-learn name (max_iter in later versions)
    sgd_reg = linear_model.SGDRegressor(penalty='l2', alpha=0.001, n_iter=1000)
    # normalize takes a bool, not the string 'True'
    lasso_reg = linear_model.Lasso(alpha=1, max_iter=3000, normalize=True,
                                   selection='random', tol=0.001)
    rndm_frst = RandomForestRegressor(max_depth=5, n_estimators=10)
    stregr = StackingRegressor(regressors=[sgd_reg, rndm_frst],
                               meta_regressor=ridge)

    X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y2,
                                                        test_size=0.20,
                                                        random_state=iterate)
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    stregr.fit(X_train, y_train)
    y_pred = stregr.predict(X_test)

    #print("Mean Squared Error: %.4f"
    #      % np.mean((y_pred - y_test.values) ** 2))
    #print('Variance Score: %.4f' % stregr.score(X_test, y_test.values))

    dev_Memory = abs(y_pred - y_test.values)
    mean_dev = np.mean(dev_Memory)
    mse_Memory = np.sqrt(np.sum(dev_Memory ** 2) / dev_Memory.size)
    mape = np.mean(dev_Memory / y_test.values)
    max_pe = np.max(dev_Memory)
    max_ne = np.max(np.negative(dev_Memory))

    new_data1 = pd.DataFrame(y_pred)
    new_data2 = pd.DataFrame(y_test.values)
    new_data = pd.merge(new_data1, new_data2,
                        left_index=True, right_index=True)

    filename12 = r'C:\Users\epatdeb\AlphaCANDI\SBG_Rawinput_1.6\latest\Logs\AlphaCandi17_MlxEnsmbl_Memory.log'
    logging.basicConfig(filename=filename12, level=logging.DEBUG)
    logging.info(
        "tensor_bp sbg_mlxtend_ensamble iter:%s \n \n y_pred/y_test: \n %s \n"
        " mae:%s mse:%s mape:%s max_pe:%s max_ne:%s",
        iterate, new_data, mean_dev, mse_Memory, mape, max_pe, max_ne)
    logging.shutdown()
    return mean_squared_error(y_test, y_pred), mean_dev, mape
def test_weight_unsupported_regressor():
    # including a regressor that does not support sample_weight
    # should raise an error
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    knn = KNeighborsRegressor()
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge, knn],
                               meta_regressor=svr_rbf)
    with pytest.raises(TypeError):
        stregr.fit(X1, y, sample_weight=w).predict(X1)
def test_weight_unsupported_regressor():
    # including a regressor that does not support sample_weight
    # should raise an error
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    lasso = Lasso(random_state=1)
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge, lasso],
                               meta_regressor=svr_rbf)
    with pytest.raises(TypeError):
        stregr.fit(X1, y, sample_weight=w).predict(X1)
def test_weight_unsupported_with_no_weight():
    # passing no weight to regressors without sample_weight support
    # should not be a problem
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    lasso = Lasso(random_state=1)
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge, lasso],
                               meta_regressor=svr_rbf)
    stregr.fit(X1, y).predict(X1)

    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=lasso)
    stregr.fit(X1, y).predict(X1)
def test_weight_unsupported_with_no_weight():
    # passing no weight to regressors without sample_weight support
    # should not be a problem
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    knn = KNeighborsRegressor()
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge, knn],
                               meta_regressor=svr_rbf)
    stregr.fit(X1, y).predict(X1)

    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=knn)
    stregr.fit(X1, y).predict(X1)
def test_weight_unsupported_with_no_weight():
    # passing no weight to regressors without sample_weight support
    # should not be a problem
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    lasso = Lasso(random_state=1)
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge, lasso],
                               meta_regressor=svr_rbf)
    stregr.fit(X1, y).predict(X1)

    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=lasso)
    stregr.fit(X1, y).predict(X1)
def test_predictions_from_sparse_matrix():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    stregr = StackingRegressor(regressors=[svr_lin, lr],
                               meta_regressor=ridge)

    # dense
    stregr.fit(X1, y)
    print(stregr.score(X1, y))
    assert round(stregr.score(X1, y), 2) == 0.61

    # sparse
    stregr.fit(sparse.csr_matrix(X1), y)
    print(stregr.score(X1, y))
    assert round(stregr.score(X1, y), 2) == 0.61
def test_sample_weight():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    pred1 = stregr.fit(X1, y, sample_weight=w).predict(X1)
    mse = 0.22
    got = np.mean((stregr.predict(X1) - y) ** 2)
    assert round(got, 2) == mse

    # make sure that this is not equivalent to the model with no weight
    pred2 = stregr.fit(X1, y).predict(X1)
    maxdiff = np.max(np.abs(pred1 - pred2))
    assert maxdiff > 1e-3, "max diff is %.4f" % maxdiff
def Gbc():
    from sklearn.ensemble import GradientBoostingClassifier, AdaBoostRegressor
    from sklearn.linear_model import LogisticRegression
    from mlxtend.regressor import StackingRegressor
    from sklearn.svm import SVR

    adaboost = AdaBoostRegressor()
    lr = LogisticRegression()  # the original was missing the parentheses
    gb = GradientBoostingClassifier()
    svr = SVR(kernel='linear')
    svr_rbf = SVR(kernel='rbf')
    # note: gb is a classifier; mixing it into a regressor stack is unusual
    regressors = [svr, adaboost, gb]
    stregr = StackingRegressor(regressors=regressors, meta_regressor=svr_rbf)
    stregr.fit(X_train, y_train)
    outpred = stregr.predict(X_valid)
    evaluate_strategy(outpred)
def train_model(X_train, y_train):
    clf1 = LinearSVR()
    clf2 = LinearRegression()
    clf3 = Ridge()
    clf4 = LGBMRegressor()
    svr_linear = LinearSVR()
    sr = StackingRegressor(regressors=[clf1, clf2, clf3, clf4],
                           meta_regressor=svr_linear)
    sr.fit(X_train, y_train)
    result = sr.predict(X_train)
    score = get_rmse_score(result, y_train)
    print("RMSE Score train: %.4f" % score)
    return sr
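# Hedged sketch (assumption): get_rmse_score is not defined in this snippet.
# Given how it is called above, a plausible implementation is:
import numpy as np
from sklearn.metrics import mean_squared_error

def get_rmse_score(pred, actual):
    # root-mean-squared error between predictions and targets
    return np.sqrt(mean_squared_error(actual, pred))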
def test_sample_weight():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    pred1 = stregr.fit(X1, y, sample_weight=w).predict(X1)
    mse = 0.22
    got = np.mean((stregr.predict(X1) - y) ** 2)
    assert round(got, 2) == mse

    # make sure that this is not equivalent to the model with no weight
    pred2 = stregr.fit(X1, y).predict(X1)
    maxdiff = np.max(np.abs(pred1 - pred2))
    assert maxdiff > 1e-3, "max diff is %.4f" % maxdiff
def test_get_coeff_fail():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf')
    ridge = Ridge(random_state=1)
    stregr = StackingRegressor(regressors=[ridge, lr],
                               meta_regressor=svr_rbf)
    stregr = stregr.fit(X1, y)
    # accessing coef_ on a non-linear meta regressor raises AttributeError;
    # without this context manager the test would simply error out
    with pytest.raises(AttributeError):
        got = stregr.coef_
def test_predictions_from_sparse_matrix():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    stregr = StackingRegressor(regressors=[svr_lin, lr],
                               meta_regressor=ridge)

    # dense
    stregr.fit(X1, y)
    print(stregr.score(X1, y))
    assert round(stregr.score(X1, y), 2) == 0.61

    # sparse
    stregr.fit(sparse.csr_matrix(X1), y)
    print(stregr.score(X1, y))
    assert round(stregr.score(X1, y), 2) == 0.61
def test_get_coeff_fail():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    ridge = Ridge(random_state=1)
    stregr = StackingRegressor(regressors=[ridge, lr],
                               meta_regressor=svr_rbf)
    with pytest.raises(AttributeError):
        stregr = stregr.fit(X1, y)
        r = stregr.coef_
        assert r
def stackModel(self):
    train_X = self.X.as_matrix()
    train_Y = self.Y.as_matrix()
    test_X = self.Test.as_matrix()
    # train_X = data_scaler(train_X)
    X_train, X_test, y_train, y_test = train_test_split(train_X, train_Y,
                                                        test_size=0.2,
                                                        random_state=1)
    gbdt = GradientBoostingRegressor(loss='ls', alpha=0.9, n_estimators=500,
                                     learning_rate=0.05, max_depth=8,
                                     subsample=0.8, min_samples_split=9,
                                     max_leaf_nodes=10)
    xgb = XGBRegressor(max_depth=5, n_estimators=500, learning_rate=0.05,
                       silent=False)
    lr = LinearRegression()
    rfg = RandomForestRegressor(bootstrap=False, max_features=0.05,
                                min_samples_leaf=11, min_samples_split=8,
                                n_estimators=100)
    svr_rbf = SVR(kernel='rbf')
    stregr = StackingRegressor(regressors=[gbdt, xgb, lr, rfg],
                               meta_regressor=svr_rbf)
    stregr.fit(X_train, y_train)
    stregr.predict(X_train)

    # Evaluate and visualize the fit (the ** 0.5 makes this an RMSE,
    # so the label now says so)
    print("Root Mean Squared Error: %.6f"
          % (np.mean((stregr.predict(X_train) - y_train) ** 2) ** 0.5))
    error(stregr.predict(X_test), y_test)

    # online
    result = stregr.predict(test_X)
    save_to_file(result, self.uid, "../result/result_12.09_2_stacking.csv")

    with plt.style.context(('seaborn-whitegrid')):
        plt.scatter(X_train, y_train, c='lightgray')
        plt.plot(X_train, stregr.predict(X_train), c='darkgreen', lw=2)
        plt.show()
def test_different_models():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    y_pred = stregr.fit(X1, y).predict(X1)
    mse = 0.214
    got = np.mean((stregr.predict(X1) - y) ** 2)
    assert round(got, 3) == mse
def test_sparse_matrix_inputs_and_features_in_secondary():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    rf = RandomForestRegressor(random_state=2)
    ridge = Ridge(random_state=0)
    svr_rbf = SVR(kernel='rbf')
    stack = StackingRegressor(regressors=[svr_lin, lr, ridge, rf],
                              meta_regressor=svr_rbf,
                              use_features_in_secondary=True)

    # dense
    stack.fit(X1, y).predict(X1)
    mse = 0.14
    got = np.mean((stack.predict(X1) - y) ** 2)
    assert round(got, 2) == mse

    # sparse
    stack.fit(sparse.csr_matrix(X1), y)
    mse = 0.14
    got = np.mean((stack.predict(sparse.csr_matrix(X1)) - y) ** 2)
    assert round(got, 2) == mse
def train(self, X, y):
    features = X
    labels = y
    # test/train split
    X_train, X_test, Y_train, Y_test = train_test_split(X, y,
                                                        test_size=0.25,
                                                        random_state=4)
    # Ridge
    regcv = linear_model.RidgeCV(
        alphas=[0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75])
    regcv.fit(features, labels)
    regcv.alpha_
    reg = linear_model.Ridge(alpha=regcv.alpha_)
    reg.fit(features, labels)

    # GB
    params = {
        'n_estimators': 100,
        'max_depth': 5,
        'min_samples_split': 2,
        'learning_rate': 0.1,
        'loss': 'ls'
    }
    gbr = ensemble.GradientBoostingRegressor(**params)
    gbr.fit(features, labels)

    # blended model
    meta = linear_model.LinearRegression()
    blender = StackingRegressor(regressors=[reg, gbr], meta_regressor=meta)
    _ = blender.fit(features, labels)
    y_pred = blender.predict(X_test)

    # Python 3 prints (the original used Python 2 print statements)
    print("***** TRAINING STATS ********")
    scores = cross_val_score(blender, features, labels, cv=10)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    mean_diff = np.mean(np.abs(np.exp(Y_test) - np.exp(y_pred)))
    p_mean_diff = np.mean(mean_diff / np.exp(Y_test))
    print("Mean Error:\t %.0f/%0.3f%%" % (mean_diff, p_mean_diff * 100))
    print("***** TRAINING STATS ********")
    return blender
scaler = StandardScaler()
scaler.fit(train_x)
#train_x = scaler.transform(train_x)
#LGB.fit(train_x, y_train)
rid = KNeighborsRegressor(n_jobs=3, n_neighbors=4)
rf = LinearRegression()
# renamed from `str`, which shadowed the builtin
stack_reg = StackingRegressor(regressors=[LGB, rid], verbose=1,
                              meta_regressor=rf)
print('Overall RMPSE')
cv = cross_validate(stack_reg, train_x, y_train,
                    scoring=('neg_mean_squared_error'),
                    return_train_score=False, cv=10)
print(np.sqrt(np.abs(np.mean(cv['test_score']))))

# Grabbing Feature Importance #
#print('grabbing feature importance')
#LGB.fit(train_x, y_train)
#feature_df = pd.DataFrame({'Cols': train_x.columns, 'Vals': LGB.feature_importances_})
#feature_df = feature_df.sort_values(['Vals'], ascending=[0])

# Use when Submitting Below #
'''
test_x = temp.tail(1459)
test_x = scaler.transform(test_x)
stack_reg.fit(train_x, y_train)
preds = np.expm1(stack_reg.predict(test_x))
id_array = list(range(1461, 2920))
submission_frame = pd.DataFrame({'id': id_array, 'SalePrice': preds})
submission_frame = submission_frame[['id', 'SalePrice']]
submission_frame.to_csv('out.csv', index=False)
'''
#==============================================================================
# 4) LGBMRegressor model
#==============================================================================
# from lightgbm import LGBMRegressor
#
# model_lgb = LGBMRegressor()
#==============================================================================
# 5) Stacked (fused) model
#==============================================================================
from mlxtend.regressor import StackingRegressor

regressors = [model_xgb, model_rfg, model_gb]
model = StackingRegressor(regressors=regressors, meta_regressor=model_xgb)
# model = model_gb
model.fit(train_text, train_labels)
# print('The parameters of the best model are: ')
# print(model.best_params_)
preds = model.predict(train_text)
print('The pearsonr of training set is {}'.format(
    pearsonr(list(train_labels), list(preds))[0]))
print('The MSE of training set is {}'.format(
    mean_squared_error(list(train_labels), list(preds))))

#==============================================================================
# Predict on the test set
#==============================================================================
preds = model.predict(test_text)
print('The pearsonr of test set is {}'.format(
    pearsonr(list(test_labels), list(preds))[0]))
print('The MSE of test set is {}'.format(
    mean_squared_error(list(test_labels), list(preds))))
def main():
    """ load data """
    train_set = pd.read_csv('../data/train.csv')
    test_set = pd.read_csv('../data/test.csv')

    """ Remove Outliers """
    outliers = train_set[(train_set['GrLivArea'] > 4000)
                         & (train_set['SalePrice'] < 300000)].index
    train_set.drop(outliers, inplace=True)

    """ fix SalePrice skewness """
    train_set["SalePrice"] = np.log1p(train_set["SalePrice"])
    y_train_values = train_set["SalePrice"].values

    """ prepare combined data """
    train_set_id = train_set['Id']
    test_set_id = test_set['Id']
    train_set_rows = train_set.shape[0]
    test_set_rows = test_set.shape[0]

    train_set.drop('Id', axis=1, inplace=True)
    test_set.drop('Id', axis=1, inplace=True)
    train_set.drop('SalePrice', axis=1, inplace=True)
    combined_data = pd.concat((train_set, test_set))

    """ create data transform pipeline """
    transform_pipeline = Pipeline(steps=[
        ('NaNFixer', NaNFixer()),
        ('SkewFixer', SkewFixer()),
        ('Scaler', Scaler()),
        ('FeatureDropper', FeatureDropper()),
        ('Dummyfier', Dummyfier()),
        #('TrainDataSeparator', TrainDataSeparator(train_set_rows=train_set_rows)),
    ])
    transformed_data = transform_pipeline.transform(combined_data)
    train_data = transformed_data[:train_set_rows]
    predict_data = transformed_data[train_set_rows:]

    """ try various regressors """
    rf_param = {'n_estimators': [10, 12],
                'max_depth': [3],
                'n_jobs': [-1]}
    ls_param = {'alpha': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
                'max_iter': [10000],
                'normalize': [True, False]}
    elnet_param = {'alpha': [0.0008, 0.004, 0.005],
                   'l1_ratio': [0.08, 0.1, 0.3],
                   'max_iter': [10000]}
    ridge_param = {'alpha': [35, 40, 45, 50, 55, 60, 65, 70, 80, 90]}
    # gbm_param = {"n_estimators": [1000],
    #              'min_child_weight': [1, 5, 10],
    #              'gamma': [0.1, 0.5, 1, 1.5, 2, 5],
    #              'subsample': [0.6, 0.8, 1.0],
    #              'colsample_bytree': [0.6, 0.8, 1.0],
    #              'max_depth': [3, 4, 5],
    #              'eta': [0.01],
    #              'eval_metric': ['mae']}
    # the original left both gbm_param definitions commented out, which would
    # raise a NameError below; keep the minimal variant active
    gbm_param = {"n_estimators": [1000]}
    lgb_params = {
        'objective': ['regression'],
        'num_leaves': [255],
        'max_depth': [8],
        'bagging_seed': [3],
        'boosting_type': ['gbdt']
        # ,
        # 'min_sum_hessian_in_leaf': [100],
        # 'learning_rate': np.linspace(0.05, 0.1, 3),
        # 'bagging_fraction': np.linspace(0.7, 0.9, 3),
        # 'bagging_freq': np.linspace(30, 50, 3, dtype='int'),
        # 'max_bin': [15, 63, 255],
    }

    # grid(SVR()).grid_get(X_scaled, y_log,
    #                      {'C': [11, 13, 15], 'kernel': ["rbf"],
    #                       "gamma": [0.0003, 0.0004],
    #                       "epsilon": [0.008, 0.009]})
    # param_grid = {'alpha': [0.2, 0.3, 0.4], 'kernel': ["polynomial"],
    #               'degree': [3], 'coef0': [0.8, 1]}
    # grid(KernelRidge()).grid_get(X_scaled, y_log, param_grid)

    rf = get_best_estimator(train_data, y_train_values,
                            estimator=RandomForestRegressor(),
                            params=rf_param)
    elnet = get_best_estimator(train_data, y_train_values,
                               estimator=ElasticNet(),
                               params=elnet_param)
    lso = get_best_estimator(train_data, y_train_values,
                             estimator=Lasso(),
                             params=ls_param)
    rdg = get_best_estimator(train_data, y_train_values,
                             estimator=Ridge(),
                             params=ridge_param)
    gbm = get_best_estimator(train_data, y_train_values,
                             estimator=xgb.XGBRegressor(),
                             params=gbm_param)
    lbm = get_best_estimator(train_data, y_train_values,
                             estimator=lgb.LGBMRegressor(),
                             params=lgb_params)

    model = StackingRegressor(regressors=[rf, elnet, lso, rdg, gbm, lbm],
                              meta_regressor=Lasso(alpha=0.0005))

    # Fit the model on our data
    model.fit(train_data, y_train_values)
    y_pred = model.predict(train_data)
    print(sqrt(mean_squared_error(y_train_values, y_pred)))

    # Predict test set
    ensembled = np.expm1(model.predict(predict_data))

    """ export submission data """
    submission = pd.DataFrame({"Id": test_set_id, "SalePrice": ensembled})
    submission.to_csv('submission.csv', index=False)

    """ Ensemble Weights """
    from scipy.optimize import minimize

    regressors = [rf, elnet, lso, rdg, gbm, lbm]
    predictions = []
    for clf in regressors:
        predictions.append(clf.predict(train_data))  # listing all our predictions

    def mse_func(weights):
        # scipy minimize will pass the weights as a numpy array
        final_prediction = 0
        for weight, prediction in zip(weights, predictions):
            final_prediction += weight * prediction
        return mean_squared_error(y_train_values, final_prediction)

    starting_values = [0.5] * len(predictions)  # minimize needs a starting value
    bounds = [(0, 1)] * len(predictions)  # weights are bound between 0 and 1
    res = minimize(mse_func, starting_values, bounds=bounds, method='SLSQP')

    print('Result Assessment: {message_algo}'.format(message_algo=res['message']))
    print('Ensemble Score: {best_score}'.format(best_score=res['fun']))
    print('Best Weights: {weights}'.format(weights=res['x']))
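    # Added sketch (not in the original, variable names are hypothetical):
    # apply the optimized weights to the held-out predictions, mirroring the
    # np.expm1 inverse transform used for the stacked submission above.
    weighted_pred = np.expm1(sum(weight * clf.predict(predict_data)
                                 for weight, clf in zip(res['x'], regressors)))
    weighted_submission = pd.DataFrame({"Id": test_set_id,
                                        "SalePrice": weighted_pred})
    weighted_submission.to_csv('submission_weighted.csv', index=False)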
def main():
    print("Reading in Data")
    # final prediction
    train = pd.read_csv('cleaned_train20180129_111517.csv')
    test = pd.read_csv('cleaned_test20180129_111517.csv')
    # to validate the A-leaderboard results:
    #train = pd.read_csv('cleaned_train20180129_102513.csv')
    #test = pd.read_csv('cleaned_test20180129_102513.csv')

    test = test.drop(['id'], axis=1)
    train = train.drop(['id'], axis=1)
    y_train = train['血糖']

    # pred_proba holds the per-sample class weights for the test-set
    # blood-glucose ('血糖') target
    threshold = 6.5
    test_num = len(test)
    train_num = len(train)
    bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat, X_train, pred_proba = fuck_columns(
        train, test, threshold)

    print("training linear model")
    pred_bigger, pred_less, linear_bigger, linear_less = linear_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine the predictions with the weights
    linear_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    print("training lasso model")
    pred_bigger, pred_less, lasso_bigger, lasso_less = lasso_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine the predictions with the weights
    lasso_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    print("training ENet model")
    pred_bigger, pred_less, ENet_bigger, ENet_less = ENet_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine the predictions with the weights
    ENet_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    print("training ensemble models...")
    print("RandomForestRegressor...")
    pred_bigger, pred_less, rf_bigger, rf_less = rf_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine the predictions with the weights
    rf_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    print("GradientBoostingRegressor...")
    pred_bigger, pred_less, gb_bigger, gb_less = GBoost_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine the predictions with the weights
    gb_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    print("LGBMRegressor...")
    pred_bigger, pred_less, lgb_bigger, lgb_less = LGBM_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine the predictions with the weights
    lgb_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    print("XGBRegressor...")
    pred_bigger, pred_less, xgb_bigger, xgb_less = xgb_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine the predictions with the weights
    xgb_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    '''
    Stacking Learning
    '''
    print("StackingRegressor...")
    stacked_averaged_bigger_models = StackingRegressor(
        regressors=[linear_bigger, lasso_bigger, ENet_bigger],
        meta_regressor=gb_bigger)
    stacked_averaged_less_models = StackingRegressor(
        regressors=[linear_less, lasso_less, ENet_less],
        meta_regressor=gb_less)

    # fit the models
    stacked_averaged_bigger_models.fit(bigger_thr_X, bigger_thr_y)
    stacked_averaged_less_models.fit(less_thr_X, less_thr_y)
    # predict on the test set
    stacked_bigger_pred = stacked_averaged_bigger_models.predict(test_concat)
    stacked_less_pred = stacked_averaged_less_models.predict(test_concat)
    # combine the predictions with the weights
    stacked_pred_res = np.array([
        stacked_less_pred[i] * pred_proba[i][0]
        + stacked_bigger_pred[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    ensemble = (stacked_pred_res * 0.40 + xgb_pred_res * 0.40
                + lgb_pred_res * 0.20)
    # blend the stacking result with the linear model
    new_ensemble = np.array([
        linear_pred_res[i] * pred_proba[i][0] + ensemble[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    sub = pd.DataFrame({'pred': ensemble})
    sub_wig = pd.DataFrame({'pred': new_ensemble})
    sub.to_csv('submission_b.csv', header=None, index=False)
    sub_wig.to_csv('submission_b_wig.csv', header=None, index=False)
y = np.sin(X).ravel()
y[::5] += 3 * (0.5 - np.random.rand(8))

# Initializing models
lr = LinearRegression()
svr_lin = SVR(kernel='linear')
ridge = Ridge(random_state=1)
svr_rbf = SVR(kernel='rbf')
stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                           meta_regressor=svr_rbf)

# Training the stacking regressor
stregr.fit(X, y)
stregr.predict(X)

# Evaluate and visualize the fit
print("Mean Squared Error: %.4f" % np.mean((stregr.predict(X) - y) ** 2))
print('Variance Score: %.4f' % stregr.score(X, y))

with plt.style.context(('seaborn-whitegrid')):
    plt.scatter(X, y, c='lightgray')
    plt.plot(X, stregr.predict(X), c='darkgreen', lw=2)
    plt.show()

# Example 2 - Stacked Regression and GridSearch
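# Sketch for Example 2 (assumed, following the mlxtend documentation
# pattern): StackingRegressor exposes the parameters of its base and meta
# regressors to GridSearchCV under prefixed names such as 'ridge__alpha' and
# 'meta-svr__C' (the 'meta-<name>__' prefix matches the convention used
# elsewhere in this file; newer mlxtend versions use 'meta_regressor__').
from sklearn.model_selection import GridSearchCV

params = {'ridge__alpha': [0.1, 1.0, 10.0],
          'svr__C': [0.1, 1.0, 10.0],
          'meta-svr__C': [0.1, 1.0, 10.0]}
grid = GridSearchCV(estimator=stregr, param_grid=params, cv=5, refit=True)
grid.fit(X, y)
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))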
K.set_session(sess)
np.random.seed(7)
rn.seed(7)

from mlxtend.regressor import StackingRegressor

rf = RandomForestRegressor(n_estimators=54, max_depth=None, random_state=8)
ext = ExtraTreesRegressor(n_estimators=584, min_samples_split=2,
                          random_state=8)

def create_model():
    model = Sequential()
    model.add(Dense(540, input_dim=8, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='adam', metrics=['mse', 'mae'])
    return model

nn = KerasRegressor(build_fn=create_model, epochs=32, batch_size=32,
                    verbose=0)
clf = StackingRegressor(regressors=[nn, ext], meta_regressor=rf)

scores = []
for train, test in kfold.split(X, y):
    clf.fit(X[train], y[train])
    score = clf.score(X[test], y[test])
    print(score)
    scores.append(score)
print("%.3f%% (+/- %.3f)" % (np.mean(scores), np.std(scores)))
def predict():
    '''
    For rendering results on the HTML GUI
    '''
    features = [x for x in request.form.values()]
    #final_features = [np.array(int_features)]
    #prediction = model.predict(final_features)
    #output = round(prediction[0], 2)
    features = np.array(features)
    features = features.reshape(1, 6)
    features = pd.DataFrame(data=features,
                            columns=['Name', 'Genre', 'Comments', 'Likes',
                                     'Popularity', 'Followers'])

    df = pd.read_csv('data.csv')
    cv = {'Comments': int, 'Likes': int, 'Popularity': int, 'Followers': int}
    df = df.astype(cv)
    features = features.astype(cv)

    #x = df[df['Views'] == 0].index
    df.drop(index=df[df['Views'] < df['Likes']].index, axis=1, inplace=True)
    df.drop(index=df[df['Views'] < df['Comments']].index, axis=1,
            inplace=True)
    df.drop(index=df[df['Views'] < df['Popularity']].index, axis=1,
            inplace=True)

    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    # note: this expression computes an outlier mask but discards the result
    (df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))
    df = df[~((df < (Q1 - 3 * IQR)) | (df > (Q3 + 3 * IQR))).any(axis=1)]
    df = df.drop(columns=['Unique_ID', 'Country', 'Song_Name', 'Timestamp',
                          'index'])

    y = df['Views']
    df = df.drop(columns=['Views'])
    be = BinaryEncoder()
    df = be.fit_transform(df)
    f = be.transform(features)

    X = df.iloc[:, :]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=0)

    rg1 = AdaBoostRegressor()
    rg1.fit(X_train, y_train)
    #ypred = rg1.predict(X_test)
    #sqrt(mean_squared_error(y_test, ypred))

    rg2 = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1)
    # para = {'n_estimators': [250, 300], 'learning_rate': [1, 0.1, 0.01]}
    # grid = GridSearchCV(estimator=rg8, param_grid=para, verbose=1, cv=10, n_jobs=-1)
    rg2.fit(X_train, y_train)
    #ypred = rg2.predict(X_test)
    #sqrt(mean_squared_error(y_test, ypred))

    rg3 = RandomForestRegressor(random_state=0, n_estimators=20, max_depth=15)
    # para = {'n_estimators': [5, 10, 30, 20], 'max_depth': [5, 8, 20, 17]}
    # grid = GridSearchCV(estimator=rg9, param_grid=para, cv=10, verbose=1, n_jobs=-1)
    rg3.fit(X_train, y_train)
    #ypred = rg3.predict(X_test)
    #sqrt(mean_squared_error(y_test, ypred))

    rg6 = StackingRegressor([rg1, rg2], meta_regressor=rg3)
    rg6.fit(X_train, y_train)
    #ypred = rg6.predict(X_test)
    #sqrt(mean_squared_error(y_test, ypred))

    f = f.iloc[:, :]
    y_pred = rg6.predict(f)
    y_pred = y_pred.astype(int)
    return render_template(
        'index.html',
        prediction_text='Number of Views is {}'.format(y_pred))
def stacklearning(self):
    class extAll(BaseEstimator, TransformerMixin):
        def __init__(self):
            pass

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            return self

        def predict(self, X):
            return self

    class extMorgan(BaseEstimator, TransformerMixin):
        def __init__(self):
            pass

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            _, morgan, _ = sepTables(X)
            return morgan

    class extMACCS(BaseEstimator, TransformerMixin):
        def __init__(self):
            pass

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            maccs, morgan, _ = sepTables(X)
            maccs = pd.concat([morgan, maccs], axis=1)
            return maccs

    class extDescriptor(BaseEstimator, TransformerMixin):
        def __init__(self):
            pass

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            maccs, morgan, descriptor = sepTables(X)
            descriptor = pd.concat([morgan, descriptor], axis=1)
            descriptor = pd.concat([maccs, descriptor], axis=1)
            return descriptor

    class extPCA(BaseEstimator, TransformerMixin):
        def __init__(self):
            pass

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            model = PCA(n_components=64)
            _, morgan, _ = sepTables(X)
            morgan = morgan.reset_index().drop('index', axis=1)
            W = pd.DataFrame(model.fit_transform(X))
            W = pd.concat([morgan, W], axis=1)
            return W

    lgbm = LGBMRegressor(boosting_type='gbdt', num_leaves=60,
                         learning_rate=0.06)
    rgf = RGFRegressor(max_leaf=1000, algorithm="RGF", test_interval=100,
                       loss="LS", verbose=False, l2=1.0)
    rgf1 = RGFRegressor(max_leaf=1000, algorithm="RGF", test_interval=100,
                        loss="LS", verbose=False, l2=1.0)
    rgf2 = RGFRegressor(max_leaf=1000, algorithm="RGF", test_interval=100,
                        loss="LS", verbose=False, l2=1.0)
    rgf3 = RGFRegressor(max_leaf=1000, algorithm="RGF", test_interval=100,
                        loss="LS", verbose=False, l2=1.0)
    rgf4 = RGFRegressor(max_leaf=1000, algorithm="RGF", test_interval=100,
                        loss="LS", verbose=False, l2=1.0)

    pipe1 = make_pipeline(extMACCS(), rgf)
    pipe2 = make_pipeline(extMorgan(), rgf1)
    pipe3 = make_pipeline(extDescriptor(), rgf2)
    pipe4 = make_pipeline(extPCA(), rgf3)
    pipe7 = make_pipeline(extDescriptor(), rgf4)
    pipe8 = make_pipeline(extDescriptor(), rgf4)

    xgb = xgboost.XGBRegressor()
    nbrs = KNeighborsRegressor(2)
    svr = SVR(gamma='auto', kernel='linear')
    sgd = SGDRegressor(max_iter=1000)
    pls = PLSRegression(n_components=3)
    ext = ExtraTreesRegressor(n_estimators=30, max_features=20,
                              min_samples_split=5, max_depth=50,
                              min_samples_leaf=5)

    pipe5 = make_pipeline(extMorgan(), nbrs)
    pipe6 = make_pipeline(extMACCS(), rgf)
    alldata = make_pipeline(extAll())
    meta = RandomForestRegressor(max_depth=20, random_state=0,
                                 n_estimators=400)

    stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3],
                               meta_regressor=rgf, verbose=1)
    #stack2 = StackingRegressor(regressors=[stack1, nbrs, svr, pls, rgf],
    #                           meta_regressor=lgbm, verbose=1)
    stack2 = StackingRegressor(regressors=[stack1, pipe5, pipe7, pipe1],
                               meta_regressor=rgf, verbose=1)

    scores = cross_val_score(stack2, X, y, cv=10)
    print("R^2 Score: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), 'stacking'))
    stack1_score = cross_val_score(stack1, X, y, cv=10)
    rgf_score = cross_val_score(rgf, X, y, cv=10)

    stack2.fit(X_train, y_train)
    y_pred = stack2.predict(X_train)
    y_val = stack2.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

    rgf.fit(X_train, y_train)
    y_pred = rgf.predict(X_train)
    y_val = rgf.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

    pipe1.fit(X_train, y_train)
    y_pred = pipe1.predict(X_train)
    y_val = pipe1.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

    cols = np.arange(1, 550, 1).tolist()
    cols = X.columns.tolist()
    cols = [1, 2, 3]

    # Initializing Classifiers
    reg1 = Ridge(random_state=1)
    #reg2 = ExtraTreesRegressor()
    reg2 = ExtraTreesRegressor(n_estimators=50, max_features=50,
                               min_samples_split=5, max_depth=50,
                               min_samples_leaf=5)
    reg3 = SVR(gamma='auto', kernel='linear')
    reg4 = LGBMRegressor(boosting_type='gbdt', num_leaves=60,
                         learning_rate=0.06)
    pls = PLSRegression(n_components=3)
    pipe1 = make_pipeline(ColumnSelector(cols=cols),
                          ExtraTreesRegressor(n_estimators=50))
    #linear = SGDRegressor(max_iter=1000)
    rgf = RGFRegressor(max_leaf=1000, algorithm="RGF", test_interval=100,
                       loss="LS", verbose=False, l2=1.0)
    nbrs = KNeighborsRegressor(2)
    pipe2 = make_pipeline(ColumnSelector(cols=cols), KNeighborsRegressor(31))

    meta = ExtraTreesRegressor(n_estimators=50, max_features=7,
                               min_samples_split=5, max_depth=50,
                               min_samples_leaf=5)
    stackReg = StackingRegressor(regressors=[reg1, reg2, reg3, pipe1, pls,
                                             nbrs, rgf],
                                 meta_regressor=meta, verbose=1)
    stackReg.fit(X_train, y_train)
    y_pred = stackReg.predict(X_train)
    y_val = stackReg.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

    # fit reg4 before predicting with it (the original fitted rgf here and
    # then predicted with the never-fitted reg4)
    reg4.fit(X_train, y_train)
    y_pred = reg4.predict(X_train)
    y_val = reg4.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
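# Hedged sketch (assumption): calcRMSE and calcCorr are not defined in these
# snippets. Given how they are called above, plausible implementations are:
import numpy as np

def calcRMSE(pred, actual):
    # root-mean-squared error
    return np.sqrt(np.mean((np.asarray(pred) - np.asarray(actual)) ** 2))

def calcCorr(pred, actual):
    # Pearson correlation coefficient between predictions and targets
    return np.corrcoef(np.asarray(pred).ravel(),
                       np.asarray(actual).ravel())[0, 1]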
model_rf = RandomForestRegressor(n_estimators=200,
                                 max_features=0.26326530612244903,
                                 criterion='mse')
model_extra_tree = ExtraTreesRegressor(n_estimators=200, criterion='mse')
model_gb = GradientBoostingRegressor(n_estimators=100, max_depth=5,
                                     random_state=43)
model_lr = LinearRegression()
svr_rbf = SVR(kernel='rbf')
svr_lin = SVR(kernel='linear')
ridge = Ridge()
model_xgb2 = XGBRegressor(max_depth=10, n_estimators=100)

# note: the original referenced an undefined `model_xgb` here; model_xgb2 is
# the only XGB model defined in this snippet
model_vote = VotingClassifier(
    estimators=[('xgb', model_xgb2), ('rf', model_rf), ('gb', model_gb)])

sclf = StackingRegressor(regressors=[model_extra_tree, model_xgb2, model_rf],
                         meta_regressor=model_lr)

time_split = TimeSeriesSplit(n_splits=5)
print(cross_val_score(sclf, X=train.as_matrix(), y=target.as_matrix(),
                      scoring=SMAPE, cv=time_split).mean())

sclf.fit(X=train, y=target)
preds = sclf.predict(test)
sample_submission['y'] = preds
print(sample_submission[sample_submission['y'] < 0])
sample_submission['y'] = sample_submission['y'].map(
    lambda x: x if x > 0 else 0.0)
sample_submission.to_csv("my_submission_24_2.tsv", sep=',', index=False)
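# Hedged sketch (assumption): the SMAPE scorer passed to cross_val_score
# above is not defined in this snippet. A common definition of symmetric
# mean absolute percentage error, wrapped for scikit-learn, could be:
import numpy as np
from sklearn.metrics import make_scorer

def smape(y_true, y_pred):
    # symmetric MAPE in percent; guard against zero denominators
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_true - y_pred) / np.where(denom == 0, 1.0, denom)
    diff[denom == 0] = 0.0
    return 100.0 * np.mean(diff)

SMAPE = make_scorer(smape, greater_is_better=False)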
                # (truncated: the opening of this xgb.train call is missing
                # from the snippet)
                xgbtrain, num_boost_round=2889,
                early_stopping_rounds=50, evals=watchlist)

rfreg = RandomForestRegressor(random_state=1, max_depth=15)
ridge_reg = Ridge(normalize=True)
lasso_reg = Lasso()
linear_reg = LinearRegression(normalize=True)
stacking_reg = StackingRegressor(regressors=[rfreg, ridge_reg, lasso_reg],
                                 meta_regressor=linear_reg)

feature = [x for x in train_zero_var.columns if x not in ['Value']]
# X_train, X_test, y_train, y_test = train_test_split(train_zero_var[feature],
#                                                     train_zero_var['Value'],
#                                                     test_size=0.2,
#                                                     random_state=0)
stacking_reg.fit(X_train, y_train)
stacking_test = pd.DataFrame(stacking_reg.predict(X_test))
stacking_test.columns = ['stacking_pred']
y_test = pd.DataFrame(y_test)
y_test.columns = ['Value']
mean_squared_error(stacking_test['stacking_pred'], y_test['Value'])

train_zero_var = train_zero_var.reset_index()

# predict for Random Forest
rf_pred = pd.DataFrame()
for idx in range(0, 5):
    train = train_zero_var[train_zero_var['index'] % 5 != idx]
    test = train_zero_var[train_zero_var['index'] % 5 == idx]
    stacking_feature = [
        x for x in train.columns if x not in ['index', 'Value']
def useXYtrain(x, y, times):
    flag = 0
    for i in range(0, len(Selected_learnerCode)):
        if Selected_learnerCode[i] != '':
            flag += 1
    if flag == 0:
        print('No proper learner\n')
        return

    stacking_MSE = [[], [], [], [], [], []]
    MSE = [[], [], [], [], [], [], []]
    R_square = [[], [], [], [], [], [], [], []]
    Ada_MSE = []
    Ada_r_square = []

    for i in range(0, times):
        print('Trial ' + str(i + 1) + ':\n')
        Learners_map = {}
        Learners = []
        X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                            test_size=0.20)

        svr = SVR(C=1.0, epsilon=0.2)
        parameters = {'C': np.logspace(-3, 3, 7),
                      'gamma': np.logspace(-3, 3, 7)}
        print("GridSearch starting...")
        clfsvr = GridSearchCV(svr, parameters, n_jobs=-1,
                              scoring='neg_mean_squared_error')
        clfsvr.fit(X_train, y_train)
        print('The parameters of the best model are: ')
        print(clfsvr.best_params_)
        y_pred = clfsvr.best_estimator_.predict(X_test)
        # drawTrain(y_pred, y_test, 'SVR', i)
        # SVR_MSE.append(mean_squared_error(y_test, y_pred))
        yy = clfsvr.best_estimator_.predict(x)
        R_square[0].append(drawTrain(y, yy, 'SVR', i))
        MSE[0].append(mean_squared_error(y_test, y_pred))
        if 'SVR' in Selected_learnerCode:
            print('SVR Mean squared error is '
                  + str(mean_squared_error(y_test, y_pred)) + "\n")
            Learners.append(clfsvr.best_estimator_)
            Learners_map['SVR'] = svr

        """ann = Regressor(layers=[Layer("Sigmoid", units=14),
                                   Layer("Linear")],
                           learning_rate=0.02, random_state=2018, n_iter=10)
        ann.fit(X_train, y_train)
        y_pred = ann.predict(X_test)
        print('ANN Mean squared error is '
              + str(mean_squared_error(y_test, y_pred)) + "\n")"""

        parameters = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 1000]}
        rfr = RandomForestRegressor(n_estimators=200, random_state=0)
        # drawTrain(rfr, x, y, 'RFR', i)
        # rfr = RandomForestRegressor(n_estimators=200, random_state=0)
        clfrfr = GridSearchCV(rfr, parameters, n_jobs=-1,
                              scoring='neg_mean_squared_error')
        clfrfr.fit(X_train, y_train)
        print('The parameters of the best model are: ')
        print(clfrfr.best_params_)
        y_pred = clfrfr.best_estimator_.predict(X_test)
        yy = clfrfr.best_estimator_.predict(x)
        MSE[1].append(mean_squared_error(y_test, y_pred))
        R_square[1].append(drawTrain(y, yy, 'RFR', i))
        # RFR_MSE.append(mean_squared_error(y_test, y_pred))
        if 'RFR' in Selected_learnerCode:
            print('RFR Mean squared error is '
                  + str(mean_squared_error(y_test, y_pred)) + "\n")
            Learners.append(clfrfr.best_estimator_)
            Learners_map['RFR'] = rfr

        parameters = {'alpha': np.logspace(-2, 2, 5)}
        lasso = Lasso(alpha=0.05, random_state=1, max_iter=1000)
        # drawTrain(lasso, x, y, 'LASSO', i)
        clflasso = GridSearchCV(lasso, parameters, n_jobs=-1,
                                scoring='neg_mean_squared_error')
        clflasso.fit(X_train, y_train)
        yy = clflasso.best_estimator_.predict(x)
        print('The parameters of the best model are: ')
        print(clflasso.best_params_)
        y_pred = clflasso.best_estimator_.predict(X_test)
        R_square[2].append(drawTrain(y, yy, 'LASSO', i))
        MSE[2].append(mean_squared_error(y_test, y_pred))
        if 'LASSO' in Selected_learnerCode:
            print('LASSO Mean squared error is '
                  + str(mean_squared_error(y_test, y_pred)) + "\n")
            # file.write('LASSO Mean squared error is ' + str(mean_squared_error(y_test, y_pred)) + "\n")
            Learners.append(clflasso.best_estimator_)
            Learners_map['LASSO'] = lasso

        # drawTrain(ENet, X_train, y_train, X_test, y_test, 'Elastic NET', i)
        parameters = {'alpha': np.logspace(-2, 2, 5),
                      'l1_ratio': np.linspace(0, 1.0, 11)}
        # ENet = ElasticNet(alpha=0.05, l1_ratio=.9, random_state=3)
        # drawTrain(ENet, x, y, 'Elastic NET', i)
        ENet = ElasticNet(alpha=0.05, l1_ratio=.9, random_state=3)
        clfENet = GridSearchCV(ENet, parameters, n_jobs=-1,
                               scoring='neg_mean_squared_error')
        clfENet.fit(X_train, y_train)
        print('The parameters of the best model are: ')
        print(clfENet.best_params_)
        y_pred = clfENet.best_estimator_.predict(X_test)
        yy = clfENet.best_estimator_.predict(x)
        MSE[3].append(mean_squared_error(y_test, y_pred))
        R_square[3].append(drawTrain(y, yy, 'Elastic Net', i))
        if 'ENET' in Selected_learnerCode:
            print('Elastic Net Mean squared error is '
                  + str(mean_squared_error(y_test, y_pred)) + "\n")
            Learners.append(clfENet.best_estimator_)
            Learners_map['ENET'] = ENet

        parameters = {'n_estimators': [100, 500, 1000, 2000, 3000, 5000]}
        GBoost = GradientBoostingRegressor(n_estimators=3000,
                                           learning_rate=0.05, max_depth=4,
                                           max_features='sqrt',
                                           min_samples_leaf=15,
                                           min_samples_split=10,
                                           loss='huber', random_state=5)
        clfGBoost = GridSearchCV(GBoost, parameters, n_jobs=-1,
                                 scoring='neg_mean_squared_error')
        clfGBoost.fit(X_train, y_train)
        print('The parameters of the best model are: ')
        print(clfGBoost.best_params_)
        y_pred = clfGBoost.best_estimator_.predict(X_test)
        yy = clfGBoost.best_estimator_.predict(x)
        MSE[4].append(mean_squared_error(y_test, y_pred))
        # GBoost_MSE.append(mean_squared_error(y_test, y_pred))
        R_square[4].append(drawTrain(y, yy, 'Gradient Boosting', i))
        if 'GBOOST' in Selected_learnerCode:
            print('GBoost squared error is '
                  + str(mean_squared_error(y_test, y_pred)) + "\n")
            Learners.append(clfGBoost.best_estimator_)
            Learners_map['GBOOST'] = GBoost

        # Adaboost
        # Adaboost = AdaBoostRegressor(base_estimator=SVR(C=1.0, epsilon=0.2))
        Adaboost = AdaBoostRegressor()
        Adaboost.fit(X_train, y_train)
        y_pred = Adaboost.predict(X_test)
        yy = Adaboost.predict(x)
        R_square[5].append(drawTrain(y, yy, 'Adaboost', i))
        print('Adaboost with SVR squared error is '
              + str(mean_squared_error(y_test, y_pred)) + "\n")
        Ada_MSE.append(mean_squared_error(y_test, y_pred))

        # BAGGING
        baggingModel = baggingAveragingModels(
            models=(clfsvr.best_estimator_, clfrfr.best_estimator_,
                    clfENet.best_estimator_, clfGBoost.best_estimator_,
                    clflasso.best_estimator_))
        baggingModel.fit(X_train, y_train)
        y_pred = baggingModel.predict(X_test)
        MSE[5].append(mean_squared_error(y_test, y_pred))
        yy = baggingModel.predict(x)
        R_square[6].append(drawTrain(y, yy, 'Bagging', i))
        print('Bagging before selected squared error is '
              + str(mean_squared_error(y_test, y_pred)) + "\n")

        baggingModel = baggingAveragingModels(models=tuple(Learners))
        # drawTrain(baggingModel, X_train, y_train, X_test, y_test, 'Bagging', i)
        # baggingModel = baggingAveragingModels(models=tuple(Learners))
        baggingModel.fit(X_train, y_train)
        y_pred = baggingModel.predict(X_test)
        MSE[6].append(mean_squared_error(y_test, y_pred))
        yy = baggingModel.predict(x)
        R_square[7].append(drawTrain(y, yy, 'Bagging', i))
        print('Bagging after selected squared error is '
              + str(mean_squared_error(y_test, y_pred)) + "\n")

        stacking_R_square = [[], [], [], [], [], []]
        All_learner = ['SVR', 'RFR', 'LASSO', 'ENET', 'GBOOST']
        for k in range(0, len(Selected_learnerCode)):
            """learnerList = []
            for kk in range(0, len(Selected_learnerCode)):
                if Selected_learnerCode[kk] != '':
                    learnerList.append(Learners_map[Selected_learnerCode[kk]])"""
            """stacked_averaged_models = StackingAveragedModels(
                base_models=tuple(learnerList),
                meta_model=Learners_map[All_learner[k]])
            drawTrain(stacked_averaged_models, X_train, y_train, X_test,
                      y_test, 'stacking with ' + All_learner[k], i)"""
            # stacked_averaged_models = StackingAveragedModels(
            #     base_models=tuple(learnerList),
            #     meta_model=Learners_map[All_learner[k]])
            params = {}
            """if 'SVR' in Selected_learnerCode:
                params['svr__C'] = np.logspace(-3, 3, 7)
                params['svr__gamma'] = np.logspace(-3, 3, 7)
            if 'RFR' in Selected_learnerCode:
                params['randomforestregressor__n_estimators'] = [10, 50, 100, 500, 1000]
            if 'LASSO' in Selected_learnerCode:
                params['lasso__alpha'] = np.logspace(-2, 2, 5)
            if 'ENET' in Selected_learnerCode:
                params['elasticnet__alpha'] = np.logspace(-2, 2, 5)
            if 'GBOOST' in Selected_learnerCode:
                params['gradientboostingregressor__n_estimators'] = [100, 500, 1000, 2000, 3000, 5000]"""
            if k == 0:
                params['meta-svr__C'] = np.logspace(-3, 3, 7)
                params['meta-svr__gamma'] = np.logspace(-3, 3, 7)
            if k == 1:
                params['meta-randomforestregressor__n_estimators'] = [
                    10, 50, 100, 500, 1000
                ]
            if k == 2:
                params['meta-lasso__alpha'] = np.logspace(-2, 2, 5)
            if k == 3:
                params['meta-elasticnet__alpha'] = np.logspace(-2, 2, 5)
            if k == 4:
                params['meta-gradientboostingregressor__n_estimators'] = [
                    100, 500, 1000, 2000, 3000, 5000
                ]
            """params = {'svr__C': np.logspace(-3, 3, 7),
                         'svr__gamma': np.logspace(-3, 3, 7),
                         'randomforestregressor__n_estimators': [10, 50, 100, 500, 1000],
                         'lasso__alpha': np.logspace(-2, 2, 5),
                         'elasticnet__alpha': np.logspace(-2, 2, 5),
                         'gradientboostingregressor__n_estimators': [100, 500, 1000, 2000, 3000, 5000]}"""

            stacked_averaged_models = StackingRegressor(
                regressors=Learners,
                meta_regressor=Learners_map[All_learner[k]])
            grid = GridSearchCV(estimator=stacked_averaged_models,
                                param_grid=params)
            grid.fit(X_train, y_train)
            y_pred = grid.best_estimator_.predict(X_test)
            yy = grid.best_estimator_.predict(x)
            stacking_R_square[k].append(
                drawTrain(y, yy, 'stacking with ' + All_learner[k], i))
            print('Stacking with metamodel is ' + All_learner[k]
                  + ' squared error is '
                  + str(mean_squared_error(y_test, y_pred)) + "\n")
            # file.write('Stacking with metamodel is lasso squared error is ' + str(mean_squared_error(y_test, y_pred)) + "\n")
            stacking_MSE[k].append(mean_squared_error(y_test, y_pred))

        # stacked_averaged_models = StackingAveragedModels(
        #     base_models=tuple(learnerList), meta_model=baggingModel)
        # drawTrain(stacked_averaged_models, X_train, y_train, X_test, y_test,
        #           'stacking with Bagging models', i)
        """stacked_averaged_models = StackingAveragedModels(
            base_models=tuple(learnerList),
            meta_model=Learners_map[All_learner[k]])"""
        stacked_averaged_models = StackingRegressor(
            regressors=Learners, meta_regressor=baggingModel)
        # grid = GridSearchCV(estimator=stacked_averaged_models, param_grid=params)
        stacked_averaged_models.fit(X_train, y_train)
        y_pred = stacked_averaged_models.predict(X_test)
        yy = stacked_averaged_models.predict(x)
        stacking_R_square[5].append(
            drawTrain(y, yy, 'stacking with bagging', i))
        print('Stacking with metamodel is bagging models squared error is '
              + str(mean_squared_error(y_test, y_pred)) + "\n")
        # file.write('Stacking with metamodel is lasso squared error is ' + str(mean_squared_error(y_test, y_pred)) + "\n")
        stacking_MSE[5].append(mean_squared_error(y_test, y_pred))
        gc.collect()

    print("Adaboost mean is " + str(np.mean(Ada_MSE)))

    min_stacking_MSE = []
    for i in range(0, times):
        minMSE = stacking_MSE[0][i]
        for j in range(1, 6):
            if stacking_MSE[j][i] < minMSE:
                minMSE = stacking_MSE[j][i]
        min_stacking_MSE.append(minMSE)

    plot_x = np.linspace(1, times, times)
    if len(MSE[0]) > 0:
        plt.plot(plot_x, MSE[0], 'b')
    if len(MSE[1]) > 0:
        plt.plot(plot_x, MSE[1], 'r')
    if len(MSE[2]) > 0:
        plt.plot(plot_x, MSE[2], 'y')
    if len(MSE[3]) > 0:
        plt.plot(plot_x, MSE[3], 'k')
    if len(MSE[4]) > 0:
        plt.plot(plot_x, MSE[4], 'g')
    if len(MSE[5]) > 0:
        plt.plot(plot_x, MSE[5], 'm')
    if len(MSE[6]) > 0:
        plt.plot(plot_x, MSE[6], color='coral', linestyle=':', marker='|')
    plt.plot(plot_x, min_stacking_MSE, color='cyan')
    plt.xlabel('Repeat times')
    plt.ylabel('MSE')
    plt.legend(('SVR avg = ' + str(np.mean(MSE[0])),
                'RFR avg = ' + str(np.mean(MSE[1])),
                'Lasso avg=' + str(np.mean(MSE[2])),
                'Enet avg=' + str(np.mean(MSE[3])),
                'Gboost avg = ' + str(np.mean(MSE[4])),
                'Bagging before avg = ' + str(np.mean(MSE[5])),
                'Bagging after avg = ' + str(np.mean(MSE[6])),
                'St-LIBS avg = ' + str(np.mean(min_stacking_MSE))),
               loc='upper right')
    plt.title('Different learning machine')
    plt.savefig('DifferentLearner.png')
    plt.clf()
    plt.plot()

    plot_x = np.linspace(1, times, times)
    plt.plot(plot_x, Ada_MSE, 'b')
    plt.plot(plot_x, MSE[6], 'r')
    plt.plot(plot_x, min_stacking_MSE, 'g')
    plt.legend(('Adaboost avg = ' + str(np.mean(Ada_MSE)),
                'Bagging avg = ' + str(np.mean(MSE[6])),
                'St-LIBS avg = ' + str(np.mean(min_stacking_MSE))),
               loc='upper right')
    plt.title('Bagging VS St-LIBS VS Adaboost')
    plt.xlabel('Repeat times')
    plt.ylabel('MSE')
    plt.savefig('Bagging VS St-LIBS&Adaboost.png')
    plt.clf()
    plt.plot()

    plot_x = np.linspace(1, times, times)
    if len(stacking_MSE[0]) > 0:
        plt.plot(plot_x, stacking_MSE[0], 'b')
    if len(stacking_MSE[1]) > 0:
        plt.plot(plot_x, stacking_MSE[1], 'r')
    if len(stacking_MSE[2]) > 0:
        plt.plot(plot_x, stacking_MSE[2], 'y')
    if len(stacking_MSE[3]) > 0:
        plt.plot(plot_x, stacking_MSE[3], 'k')
    if len(stacking_MSE[4]) > 0:
        plt.plot(plot_x, stacking_MSE[4], 'g')
    if len(stacking_MSE[5]) > 0:
        plt.plot(plot_x, stacking_MSE[5], 'm')
    plt.legend(('SVR avg = ' + str(np.mean(stacking_MSE[0])),
                'RFR avg = ' + str(np.mean(stacking_MSE[1])),
                'Lasso avg=' + str(np.mean(stacking_MSE[2])),
                'Enet avg=' + str(np.mean(stacking_MSE[3])),
                'Gboost avg = ' + str(np.mean(stacking_MSE[4])),
                'Bagging avg = ' + str(np.mean(stacking_MSE[5]))),
               loc='upper right')
    plt.title('Different meta-learning machine(Adaboost avg MSE='
              + str(np.mean(Ada_MSE)) + ')')
    plt.xlabel('Repeat times')
    plt.ylabel('MSE')
    plt.savefig('DifferentMetaLearner.png')
    plt.clf()
    plt.plot()

    index = ['SVR', 'RFR', 'LASSO', 'ENET', 'Gboost', 'BAGGING1', 'BAGGING2']
    mse_file = pd.DataFrame(index=index, data=MSE)
    mse_file.to_csv('MSE.csv', encoding='utf-8')

    index = ['SVR', 'RFR', 'LASSO', 'ENET', 'Gboost', 'BAGGING']
    mse_file = pd.DataFrame(index=index, data=stacking_MSE)
    mse_file.to_csv('stacking_MSE.csv', encoding='utf-8')

    mse_file = pd.DataFrame(data=min_stacking_MSE)
    mse_file.to_csv('min_stacking_MSE.csv', encoding='utf-8')

    index = ['SVR', 'RFR', 'LASSO', 'ENET', 'Gboost', 'Adaboost',
             'BAGGING1', 'BAGGING2']
    r_file = pd.DataFrame(index=index, data=R_square)
    r_file.to_csv('R_square.csv', encoding='utf-8')

    index = ['SVR', 'RFR', 'LASSO', 'ENET', 'Gboost', 'BAGGING']
    mse_file = pd.DataFrame(index=index, data=stacking_R_square)
    mse_file.to_csv('stacking_R_square.csv', encoding='utf-8')
gbm_penetration_rate = lgb.LGBMRegressor(
    n_estimators=200,
    subsample_freq=1,
    subsample=0.8,
    colsample_bytree=0.8,
    learning_rate=0.05,
    max_depth=8,
    num_leaves=256,
    objective='xentropy',
    device='gpu',
)
# note: subsample_freq and num_leaves are LightGBM parameters; XGBRegressor
# does not define them, so they are passed here as unknown keywords
xgb_penetration_rate = xgb.XGBRegressor(n_estimators=200,
                                        subsample_freq=1,
                                        subsample=0.7,
                                        colsample_bytree=0.7,
                                        learning_rate=0.1,
                                        max_depth=8,
                                        num_leaves=256,
                                        objective='reg:logistic',
                                        n_jobs=-1)
meta_reg = Ridge()
stregr = StackingRegressor(
    regressors=[gbm_penetration_rate, xgb_penetration_rate],
    meta_regressor=meta_reg)
stregr.fit(X_train, y_train[:, 0])
print(1 - stregr.score(X_val, y_val[:, 0]))
                             # (truncated: the opening of this XGB regressor
                             # constructor is missing from the snippet)
                             silent=1,
                             random_state=7,
                             nthread=-1)
gbm_b = GradientBoostingRegressor(learning_rate=0.05, n_estimators=2000,
                                  max_depth=4, max_features='log2',
                                  min_samples_leaf=15, min_samples_split=10,
                                  loss='huber')
stackmodel = StackingRegressor(
    regressors=[ElNet_b, lasso_b, ridge_b, svr_b, model_xgb_b, gbm_b],
    meta_regressor=Lasso(alpha=0.00035))
stackmodel.fit(x_train, y_train)
stacked = stackmodel.predict(x_test)
rmse_stacked = np.sqrt(mean_squared_error(y_train,
                                          stackmodel.predict(x_train)))
stacked_pred = np.expm1(stacked)

# Averaged model
ensembled = np.expm1((0.25 * ridge.predict(x_test).reshape(-1, 1))
                     + (0.2 * ElNet.predict(x_test).reshape(-1, 1))
                     + (0.2 * lasso.predict(x_test).reshape(-1, 1))
                     + (0.15 * model_xgb.predict(x_test).reshape(-1, 1))
                     + (0.2 * GBoost.predict(x_test).reshape(-1, 1)))

# Print the performance of each model
obj = pd.DataFrame([[
plt.ylabel('Accuracy')
plt.show()

# In[368]:

from mlxtend.regressor import StackingRegressor

lr = LinearRegression()
sclf = StackingRegressor(regressors=[grid_search, abr, rfr],
                         meta_regressor=lr)

print('3-fold cross validation:\n')
for clf, label in zip([grid_search, abr, rfr, sclf],
                      ['grid_search', 'abr', 'rfr', 'StackingRegressor']):
    scores = cross_val_score(clf, X, y)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))

sclf.fit(X_train, y_train)
predictions = sclf.predict(X_test)

# In[370]:

train_sizes, train_score, test_score = learning_curve(
    sclf, X, y, train_sizes=[0.1, 0.2, 0.4, 0.6, 0.8, 1], cv=3)
train_error = 1 - np.mean(train_score, axis=1)
test_error = 1 - np.mean(test_score, axis=1)
plt.plot(train_sizes, 1 - train_error, 'o-', color='r', label='training')
plt.plot(train_sizes, 1 - test_error, 'o-', color='g', label='testing')
plt.legend(loc='best')
plt.xlabel('training examples')
plt.ylabel('Accuracy')
plt.show()