def test_features_in_secondary():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    rf = RandomForestRegressor(random_state=2)
    ridge = Ridge(random_state=0)
    svr_rbf = SVR(kernel='rbf')
    stack = StackingRegressor(regressors=[svr_lin, lr, ridge, rf],
                              meta_regressor=svr_rbf,
                              use_features_in_secondary=True)
    stack.fit(X1, y).predict(X1)
    mse = 0.14
    got = np.mean((stack.predict(X1) - y) ** 2)
    print(got)
    assert round(got, 2) == mse

    stack = StackingRegressor(regressors=[svr_lin, lr, ridge, rf],
                              meta_regressor=svr_rbf,
                              use_features_in_secondary=False)
    # dense
    stack.fit(X1, y).predict(X1)
    mse = 0.12
    got = np.mean((stack.predict(X1) - y) ** 2)
    print(got)
    assert round(got, 2) == mse
def test_features_in_secondary():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    rf = RandomForestRegressor(n_estimators=10, random_state=2)
    ridge = Ridge(random_state=0)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stack = StackingRegressor(regressors=[svr_lin, lr, ridge, rf],
                              meta_regressor=svr_rbf,
                              use_features_in_secondary=True)
    stack.fit(X1, y).predict(X1)
    mse = 0.14
    got = np.mean((stack.predict(X1) - y) ** 2)
    print(got)
    assert round(got, 2) == mse

    stack = StackingRegressor(regressors=[svr_lin, lr, ridge, rf],
                              meta_regressor=svr_rbf,
                              use_features_in_secondary=False)
    # dense
    stack.fit(X1, y).predict(X1)
    mse = 0.12
    got = np.mean((stack.predict(X1) - y) ** 2)
    print(got)
    assert round(got, 2) == mse
def test(self):
    df = pd.read_csv('MorganMACCS.csv')
    baseDf = df
    extractDf = df['CAS'].isin(ejectCAS)
    df = df[~df['CAS'].isin(ejectCAS)]
    y = df['logTox']
    dropList = ['CAS', 'toxValue', 'logTox', 'HDonor', 'HAcceptors',
                'AromaticHeterocycles', 'AromaticCarbocycles', 'FractionCSP3']
    # dropList = ['CAS', 'toxValue', 'logTox']
    X = df.drop(columns=dropList)

    # Normalize: drop all-zero fingerprint columns, z-score the rest
    for name in X.columns:
        if name.isdecimal():
            if X[name].sum() == 0:
                print(name)
                X = X.drop(columns=name)
            else:
                std = X[name].std()
                mean = X[name].mean()
                X[name] = (X[name] - mean) / std

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=2)

    cols = np.arange(1, 550, 1).tolist()
    cols = X.columns.tolist()
    cols = [1, 2, 3]  # note: the two assignments above are overwritten here

    # Initializing regressors
    reg1 = Ridge(random_state=1)
    # reg2 = ExtraTreesRegressor()
    reg2 = ExtraTreesRegressor(n_estimators=50, max_features=50,
                               min_samples_split=5, max_depth=50,
                               min_samples_leaf=5)
    reg3 = SVR(gamma='auto', kernel='linear')
    reg4 = LGBMRegressor(boosting_type='gbdt', num_leaves=60,
                         learning_rate=0.06)
    pls = PLSRegression(n_components=3)
    pipe1 = make_pipeline(ColumnSelector(cols=cols),
                          ExtraTreesRegressor(n_estimators=50))
    # linear = SGDRegressor(max_iter=1000)
    rgf = RGFRegressor(max_leaf=1000, algorithm="RGF", test_interval=100,
                       loss="LS", verbose=False, l2=1.0)
    nbrs = KNeighborsRegressor(2)
    pipe2 = make_pipeline(ColumnSelector(cols=cols), KNeighborsRegressor(31))
    meta = ExtraTreesRegressor(n_estimators=50, max_features=7,
                               min_samples_split=5, max_depth=50,
                               min_samples_leaf=5)

    stackReg = StackingRegressor(regressors=[reg1, reg2, reg3, pipe1, pls,
                                             nbrs, rgf],
                                 meta_regressor=meta, verbose=1)
    stackReg.fit(X_train, y_train)
    y_pred = stackReg.predict(X_train)
    y_val = stackReg.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

    reg4.fit(X_train, y_train)
    y_pred = reg4.predict(X_train)
    y_val = reg4.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
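# The snippet above calls calcRMSE and calcCorr without defining them; a
# minimal sketch of what such helpers plausibly look like. The
# implementations below are assumptions, not the original author's code.
import numpy as np

def calcRMSE(y_pred, y_true):
    # Root mean squared error (assumed implementation).
    return np.sqrt(np.mean((np.asarray(y_pred) - np.asarray(y_true)) ** 2))

def calcCorr(y_pred, y_true):
    # Pearson correlation coefficient (assumed implementation).
    return np.corrcoef(np.asarray(y_pred), np.asarray(y_true))[0, 1]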
class RegressorBlender:
    def __init__(self, x_train, x_test, y_train, y_test=None):
        x_train.drop(['Unnamed: 0', 'Id'], axis=1, inplace=True)
        x_test.drop(['Unnamed: 0', 'Id'], axis=1, inplace=True)
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train['y'].values
        if y_test is not None:  # guard on y_test, not the always-set y_train
            self.y_test = y_test['y'].values

    def reg_blend(self):
        meta_reg = LinearRegression()
        reg1 = model.svm_regressor()
        reg2 = model.randomforest_regressor()
        reg3 = model.xgb_regressor()
        self.blend = StackingRegressor(regressors=[reg1, reg2, reg3],
                                       meta_regressor=meta_reg)
        self.blend.fit(self.x_train, self.y_train)
        return self.blend

    def score(self):
        scores = cross_val_score(self.blend, X=self.x_train, y=self.y_train,
                                 cv=10, verbose=2)
        return scores

    def prediction(self):
        y_pred = self.blend.predict(self.x_test)
        return y_pred
class Blend:
    def __init__(self, x_train, x_test, y_train):
        x_train.drop(['Unnamed: 0', 'PromoInterval', 'Date'], axis=1,
                     inplace=True)
        x_test.drop(['Unnamed: 0', 'Id', 'PromoInterval', 'Date'], axis=1,
                    inplace=True)
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train['Sales'].values

    def blending(self):
        meta_reg = LinearRegression()
        reg1 = model.svm_regressor()
        reg2 = model.randomforest_regressor()
        reg3 = model.xgb_regressor()
        self.blend = StackingRegressor(regressors=[reg1, reg2, reg3],
                                       meta_regressor=meta_reg)
        self.blend.fit(self.x_train, self.y_train)
        return self.blend

    def score(self):
        scores = cross_val_score(self.blend, X=self.x_train, y=self.y_train,
                                 cv=10, verbose=2)
        # scoring='neg_mean_squared_error'
        return scores

    def prediction(self):
        y_pred = self.blend.predict(self.x_test)
        y_pred = np.expm1(y_pred)  # invert the log1p applied to Sales
        return y_pred
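# Both classes above call into a `model` helper module that is not shown; a
# minimal sketch of the factory functions it would need to expose. Only the
# function names come from the snippets; the estimators and hyperparameters
# below are placeholder assumptions.
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

def svm_regressor():
    return SVR(kernel='rbf')

def randomforest_regressor():
    return RandomForestRegressor(n_estimators=100)

def xgb_regressor():
    return XGBRegressor(n_estimators=100)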
def test_predict_meta_features():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf')
    ridge = Ridge(random_state=1)
    stregr = StackingRegressor(regressors=[lr, ridge], meta_regressor=svr_rbf)
    X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.3)
    stregr.fit(X_train, y_train)
    test_meta_features = stregr.predict(X_test)
    assert test_meta_features.shape[0] == X_test.shape[0]
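# The test name refers to predict_meta_features, a method mlxtend's
# StackingRegressor exposes, although the body above only exercises
# predict. A hedged sketch of the extra shape check such a test could add;
# the expected column count assumes one meta-feature column per base
# regressor for a single-target problem.
def check_meta_features(stregr, X_test, n_regressors=2):
    meta = stregr.predict_meta_features(X_test)
    assert meta.shape == (X_test.shape[0], n_regressors)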
def test_multivariate_class():
    lr = LinearRegression()
    ridge = Ridge(random_state=1)
    meta = LinearRegression(normalize=True)
    stregr = StackingRegressor(regressors=[lr, ridge], meta_regressor=meta)
    stregr.fit(X2, y2).predict(X2)
    mse = 0.122
    got = np.mean((stregr.predict(X2) - y2) ** 2)
    assert round(got, 3) == mse
def stacking(self):
    from sklearn.svm import SVR
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import RobustScaler, MinMaxScaler
    from sklearn.preprocessing import StandardScaler
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    from xgboost import XGBRegressor
    import lightgbm as lgb
    from lightgbm import LGBMRegressor
    import xgboost as xgb
    from mlxtend.regressor import StackingRegressor
    import scipy as sc
    import scipy.stats  # ensure the stats submodule is loaded for sc.stats

    s = make_pipeline(RobustScaler(), SVR(kernel='rbf', C=10, gamma=0.005))
    rf = make_pipeline(
        RandomForestRegressor(random_state=641, n_estimators=250, max_depth=9))
    GBoost = GradientBoostingRegressor(n_estimators=330, learning_rate=0.01,
                                       max_depth=12, max_features='sqrt',
                                       min_samples_leaf=1,
                                       min_samples_split=42, loss='ls',
                                       random_state=40, subsample=1)
    model_xgb = xgb.XGBRegressor(colsample_bytree=1, gamma=5,
                                 learning_rate=0.01, max_depth=11,
                                 min_child_weight=1.7817, n_estimators=500,
                                 reg_alpha=0.8, reg_lambda=5,
                                 subsample=0.5213, silent=1, seed=1024,
                                 nthread=-1)
    model_lgb = LGBMRegressor(objective='regression', num_leaves=4,
                              learning_rate=0.05, n_estimators=290,
                              max_bin=147, subsample=0.65,
                              colsample_bytree=0.7, feature_fraction_seed=46,
                              subsample_freq=9, min_child_samples=20,
                              min_child_weight=0.001)

    regressors = [s, rf, GBoost, model_lgb, model_xgb]
    stregr = StackingRegressor(regressors=regressors,
                               meta_regressor=model_xgb)
    stregr.fit(self.X_train, self.y_train)
    print("the model is stacking and the test's pearsonr is: ",
          sc.stats.pearsonr(self.y_test, stregr.predict(self.X_test))[0])
    return stregr
def test_multivariate():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    stregr.fit(X2, y).predict(X2)
    mse = 0.218
    got = np.mean((stregr.predict(X2) - y) ** 2)
    assert round(got, 3) == mse
def test_multivariate():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    stregr.fit(X2, y).predict(X2)
    mse = 0.22
    got = np.mean((stregr.predict(X2) - y) ** 2)
    assert round(got, 2) == mse
def test_different_models():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    stregr.fit(X1, y).predict(X1)
    mse = 0.21
    got = np.mean((stregr.predict(X1) - y) ** 2)
    assert round(got, 2) == mse
def test_multivariate_class():
    lr = LinearRegression()
    ridge = Ridge(random_state=1)
    meta = LinearRegression(normalize=True)
    stregr = StackingRegressor(regressors=[lr, ridge], meta_regressor=meta)
    stregr.fit(X2, y2).predict(X2)
    mse = 0.12
    got = np.mean((stregr.predict(X2) - y2) ** 2.)
    # there seems to be an issue with the following test on Windows
    # sometimes via Appveyor
    assert round(got, 2) == mse, got
def test_different_models():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    y_pred = stregr.fit(X1, y).predict(X1)
    mse = 0.214
    got = np.mean((stregr.predict(X1) - y) ** 2)
    assert round(got, 3) == mse
def mlx_reg_1(self):
    lr, lr_pred = self.linear_regr()
    rf, rf_pred = self.random_forest_regr()
    lasso, lasso_pred = self.lasso_regr()
    sclf = StackingRegresorMLX(
        regressors=[lr, rf, lasso],
        meta_regressor=RandomForestRegressor(ccp_alpha=0.1,
                                             max_features="auto",
                                             n_estimators=30))
    sclf.fit(self.x_train, self.y_train)
    return sclf.predict(self.x_test)
def test_sparse_matrix_inputs_and_features_in_secondary():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    rf = RandomForestRegressor(n_estimators=10, random_state=2)
    ridge = Ridge(random_state=0)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stack = StackingRegressor(regressors=[svr_lin, lr, ridge, rf],
                              meta_regressor=svr_rbf,
                              use_features_in_secondary=True)

    # dense
    stack.fit(X1, y).predict(X1)
    mse = 0.14
    got = np.mean((stack.predict(X1) - y) ** 2)
    assert round(got, 2) == mse

    # sparse
    stack.fit(sparse.csr_matrix(X1), y)
    mse = 0.14
    got = np.mean((stack.predict(sparse.csr_matrix(X1)) - y) ** 2)
    assert round(got, 2) == mse
def regressionStacking(df):
    # StackingRegressor's input data type is ndarray
    X_train, X_test, y_train, y_test = trainDataSplit(df)
    randomforest_regressor = RandomForestRegressor()

    # # lightgbm's native API is not a scikit-learn estimator, so mlxtend
    # # does not support it
    # lgb_train = lightgbm.Dataset(X_train, y_train)
    # lgb_eval = lightgbm.Dataset(X_test, y_test, reference=lgb_train)
    #
    # # specify your configurations as a dict
    # params = {
    #     'task': 'train',
    #     'boosting_type': 'gbdt',
    #     'objective': 'regression',
    #     'metric': {'l2', 'auc'},
    #     'num_leaves': 2 ** 10,
    #     'learning_rate': 1.0,
    #     'feature_fraction': 0.9,
    #     'bagging_fraction': 0.8,
    #     'bagging_freq': 5,
    #     'verbose': 0
    # }
    # lightgbm_regressor = lightgbm.train(params,
    #                                     lgb_train,
    #                                     num_boost_round=20,
    #                                     valid_sets=lgb_eval,
    #                                     early_stopping_rounds=5)

    lasso_regressor = Lasso()
    dnn_regressor = MLPRegressor()
    linearRegression_regressor = LinearRegression()
    stacking_regressor = StackingRegressor(
        regressors=[randomforest_regressor, lasso_regressor, dnn_regressor],
        meta_regressor=linearRegression_regressor)
    stacking_regressor.fit(X_train, y_train)  # was fit(X_train, X_train), a bug
    y_pred = stacking_regressor.predict(X_test)
    criterion_df, predict_result = predictResultOutput(stacking_regressor,
                                                       X_test, y_test, y_pred)
    # save model
    joblib.dump(stacking_regressor, 'stacking.model')
    return criterion_df, predict_result
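# The commented-out block above abandons lightgbm's native train() API
# because mlxtend expects scikit-learn estimators. A minimal sketch of the
# usual workaround: lightgbm's scikit-learn wrapper, which can be passed to
# StackingRegressor like any other regressor. The parameter values mirror
# the abandoned native-API dict; everything else is an assumption.
from lightgbm import LGBMRegressor

lightgbm_regressor = LGBMRegressor(boosting_type='gbdt',
                                   objective='regression',
                                   num_leaves=2 ** 10,
                                   learning_rate=1.0,
                                   n_estimators=20)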
def sbg_mlxtend_ensamble(iterate):
    iterate += 501
    lin_mod = linear_model.LinearRegression()
    bsn_rdg = linear_model.BayesianRidge()
    elstc_nt = ElasticNet(alpha=0.2, l1_ratio=1)
    ridge = Ridge(alpha=0.01, tol=0.1, solver='sag')
    svr_rbf = svm.SVR(kernel='rbf', C=1e3, gamma=0.1)
    sgd_reg = linear_model.SGDRegressor(penalty='l2', alpha=0.001,
                                        n_iter=1000)
    lasso_reg = linear_model.Lasso(alpha=1, max_iter=3000, normalize=True,
                                   selection='random', tol=0.001)
    rndm_frst = RandomForestRegressor(max_depth=5, n_estimators=10)
    stregr = StackingRegressor(regressors=[sgd_reg, rndm_frst],
                               meta_regressor=ridge)

    X_train, X_test, y_train, y_test = train_test_split(
        df_X, df_Y2, test_size=0.20, random_state=iterate)
    # `sc` is assumed to be a scaler instance (e.g. StandardScaler)
    # defined elsewhere in the module
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    stregr.fit(X_train, y_train)
    y_pred = stregr.predict(X_test)
    # print("Mean Squared Error: %.4f"
    #       % np.mean((y_pred - y_test.values) ** 2))
    # print('Variance Score: %.4f' % stregr.score(X_test, y_test.values))

    dev_Memory = abs(y_pred - y_test.values)
    mean_dev = np.mean(dev_Memory)
    mse_Memory = np.sqrt(np.sum(dev_Memory ** 2) / dev_Memory.size)
    mape = np.mean(dev_Memory / y_test.values)
    max_pe = np.max(dev_Memory)
    max_ne = np.max(np.negative(dev_Memory))

    new_data1 = pd.DataFrame(y_pred)
    new_data2 = pd.DataFrame(y_test.values)
    new_data = pd.merge(new_data1, new_data2, left_index=True,
                        right_index=True)

    filename12 = r'C:\Users\epatdeb\AlphaCANDI\SBG_Rawinput_1.6\latest\Logs\AlphaCandi17_MlxEnsmbl_Memory.log'
    logging.basicConfig(filename=filename12, level=logging.DEBUG)
    logging.info(
        "tensor_bp sbg_mlxtend_ensamble iter:%s \n \n y_pred/y_test: \n %s \n "
        "mae:%s mse:%s mape:%s max_pe:%s max_ne:%s",
        iterate, new_data, mean_dev, mse_Memory, mape, max_pe, max_ne)
    logging.shutdown()
    return mean_squared_error(y_test, y_pred), mean_dev, mape
def Gbc():
    from sklearn.ensemble import GradientBoostingClassifier, AdaBoostRegressor
    from sklearn.linear_model import LogisticRegression
    from mlxtend.regressor import StackingRegressor
    from sklearn.svm import SVR

    adaboost = AdaBoostRegressor()
    lr = LogisticRegression()  # was missing the call parentheses; also unused
    gb = GradientBoostingClassifier()
    svr = SVR(kernel='linear')
    svr_rbf = SVR(kernel='rbf')

    regressors = [svr, adaboost, gb]
    stregr = StackingRegressor(regressors=regressors, meta_regressor=svr_rbf)
    stregr.fit(X_train, y_train)
    outpred = stregr.predict(X_valid)
    evaluate_strategy(outpred)
def test_sample_weight():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    pred1 = stregr.fit(X1, y, sample_weight=w).predict(X1)
    mse = 0.22
    got = np.mean((stregr.predict(X1) - y) ** 2)
    assert round(got, 2) == mse

    # make sure that this is not equivalent to the model with no weight
    pred2 = stregr.fit(X1, y).predict(X1)
    maxdiff = np.max(np.abs(pred1 - pred2))
    assert maxdiff > 1e-3, "max diff is %.4f" % maxdiff
def test_sample_weight():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                               meta_regressor=svr_rbf)
    pred1 = stregr.fit(X1, y, sample_weight=w).predict(X1)
    mse = 0.22
    got = np.mean((stregr.predict(X1) - y) ** 2)
    assert round(got, 2) == mse

    # make sure that this is not equivalent to the model with no weight
    pred2 = stregr.fit(X1, y).predict(X1)
    maxdiff = np.max(np.abs(pred1 - pred2))
    assert maxdiff > 1e-3, "max diff is %.4f" % maxdiff
def train_model(X_train, y_train):
    clf1 = LinearSVR()
    clf2 = LinearRegression()
    clf3 = Ridge()
    clf4 = LGBMRegressor()
    svr_linear = LinearSVR()
    sr = StackingRegressor(regressors=[clf1, clf2, clf3, clf4],
                           meta_regressor=svr_linear)
    sr.fit(X_train, y_train)
    result = sr.predict(X_train)
    score = get_rmse_score(result, y_train)
    print("RMSE Score train: %.4f" % score)
    return sr
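# get_rmse_score is not defined in the snippet above; a minimal assumed
# implementation consistent with how it is called.
import numpy as np
from sklearn.metrics import mean_squared_error

def get_rmse_score(y_pred, y_true):
    # RMSE between predictions and targets (assumed implementation).
    return np.sqrt(mean_squared_error(y_true, y_pred))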
def train(self, X, y):
    features = X
    labels = y

    # test/train split
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.25,
                                                        random_state=4)

    # Ridge
    regcv = linear_model.RidgeCV(
        alphas=[0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75])
    regcv.fit(features, labels)
    reg = linear_model.Ridge(alpha=regcv.alpha_)  # use the CV-selected alpha
    reg.fit(features, labels)

    # GB
    params = {
        'n_estimators': 100,
        'max_depth': 5,
        'min_samples_split': 2,
        'learning_rate': 0.1,
        'loss': 'ls'
    }
    gbr = ensemble.GradientBoostingRegressor(**params)
    gbr.fit(features, labels)

    # blended model
    meta = linear_model.LinearRegression()
    blender = StackingRegressor(regressors=[reg, gbr], meta_regressor=meta)
    _ = blender.fit(features, labels)
    y_pred = blender.predict(X_test)

    print("***** TRAINING STATS ********")
    scores = cross_val_score(blender, features, labels, cv=10)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    mean_diff = np.mean(np.abs(np.exp(Y_test) - np.exp(y_pred)))
    p_mean_diff = np.mean(mean_diff / np.exp(Y_test))
    print("Mean Error:\t %.0f/%0.3f%%" % (mean_diff, p_mean_diff * 100))
    print("***** TRAINING STATS ********")
    return blender
def stackModel(self):
    train_X = self.X.as_matrix()
    train_Y = self.Y.as_matrix()
    test_X = self.Test.as_matrix()
    # train_X = data_scaler(train_X)
    X_train, X_test, y_train, y_test = train_test_split(train_X, train_Y,
                                                        test_size=0.2,
                                                        random_state=1)
    gbdt = GradientBoostingRegressor(loss='ls', alpha=0.9, n_estimators=500,
                                     learning_rate=0.05, max_depth=8,
                                     subsample=0.8, min_samples_split=9,
                                     max_leaf_nodes=10)
    xgb = XGBRegressor(max_depth=5, n_estimators=500, learning_rate=0.05,
                       silent=False)
    lr = LinearRegression()
    rfg = RandomForestRegressor(bootstrap=False, max_features=0.05,
                                min_samples_leaf=11, min_samples_split=8,
                                n_estimators=100)
    svr_rbf = SVR(kernel='rbf')
    stregr = StackingRegressor(regressors=[gbdt, xgb, lr, rfg],
                               meta_regressor=svr_rbf)
    stregr.fit(X_train, y_train)
    stregr.predict(X_train)

    # Evaluate and visualize the fit; the printed value is an RMSE, not an MSE
    print("Root Mean Squared Error: %.6f"
          % (np.mean((stregr.predict(X_train) - y_train) ** 2) ** 0.5))
    error(stregr.predict(X_test), y_test)

    # online
    result = stregr.predict(test_X)
    save_to_file(result, self.uid, "../result/result_12.09_2_stacking.csv")

    with plt.style.context(('seaborn-whitegrid')):
        plt.scatter(X_train, y_train, c='lightgray')
        plt.plot(X_train, stregr.predict(X_train), c='darkgreen', lw=2)
        plt.show()
plt.ylabel('Accuracy')
plt.show()


# In[368]:

from mlxtend.regressor import StackingRegressor

lr = LinearRegression()
sclf = StackingRegressor(regressors=[grid_search, abr, rfr],
                         meta_regressor=lr)

print('3-fold cross validation:\n')
for clf, label in zip([grid_search, abr, rfr, sclf],
                      ['grid_search', 'abr', 'rfr', 'StackingRegressor']):
    scores = cross_val_score(clf, X, y)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))

sclf.fit(X_train, y_train)
predictions = sclf.predict(X_test)


# In[370]:

train_sizes, train_score, test_score = learning_curve(
    sclf, X, y, train_sizes=[0.1, 0.2, 0.4, 0.6, 0.8, 1], cv=3)
train_error = 1 - np.mean(train_score, axis=1)
test_error = 1 - np.mean(test_score, axis=1)
plt.plot(train_sizes, 1 - train_error, 'o-', color='r', label='training')
plt.plot(train_sizes, 1 - test_error, 'o-', color='g', label='testing')
plt.legend(loc='best')
plt.xlabel('training examples')
plt.ylabel('Accuracy')
plt.show()
                      num_boost_round=2889,
                      early_stopping_rounds=50,
                      evals=watchlist)

rfreg = RandomForestRegressor(random_state=1, max_depth=15)
ridge_reg = Ridge(normalize=True)
lasso_reg = Lasso()
linear_reg = LinearRegression(normalize=True)
stacking_reg = StackingRegressor(regressors=[rfreg, ridge_reg, lasso_reg],
                                 meta_regressor=linear_reg)

feature = [x for x in train_zero_var.columns if x not in ['Value']]
# X_train, X_test, y_train, y_test = train_test_split(
#     train_zero_var[feature], train_zero_var['Value'], test_size=0.2,
#     random_state=0)
stacking_reg.fit(X_train, y_train)
stacking_test = pd.DataFrame(stacking_reg.predict(X_test))
stacking_test.columns = ['stacking_pred']
y_test = pd.DataFrame(y_test)
y_test.columns = ['Value']
mean_squared_error(stacking_test['stacking_pred'], y_test['Value'])

train_zero_var = train_zero_var.reset_index()

# predict for Random Forest
rf_pred = pd.DataFrame()
for idx in range(0, 5):
    train = train_zero_var[train_zero_var['index'] % 5 != idx]
    test = train_zero_var[train_zero_var['index'] % 5 == idx]
    stacking_feature = [
        x for x in train.columns if x not in ['index', 'Value']
    ]
                             nthread=-1)
gbm_b = GradientBoostingRegressor(learning_rate=0.05, n_estimators=2000,
                                  max_depth=4, max_features='log2',
                                  min_samples_leaf=15, min_samples_split=10,
                                  loss='huber')
stackmodel = StackingRegressor(
    regressors=[ElNet_b, lasso_b, ridge_b, svr_b, model_xgb_b, gbm_b],
    meta_regressor=Lasso(alpha=0.00035))
stackmodel.fit(x_train, y_train)
stacked = stackmodel.predict(x_test)
rmse_stacked = np.sqrt(mean_squared_error(y_train,
                                          stackmodel.predict(x_train)))
stacked_pred = np.expm1(stacked)

# Averaged model
ensembled = np.expm1((0.25 * ridge.predict(x_test).reshape(-1, 1)) +
                     (0.2 * ElNet.predict(x_test).reshape(-1, 1)) +
                     (0.2 * lasso.predict(x_test).reshape(-1, 1)) +
                     (0.15 * model_xgb.predict(x_test).reshape(-1, 1)) +
                     (0.2 * GBoost.predict(x_test).reshape(-1, 1)))

# Print the performance of each model
obj = pd.DataFrame([[
    score_ridge, rdg_trainRMSE, rdg_testRMSE,
    rmse_ridge_test - rmse_ridge_train, 0.11866
def predict():
    '''
    For rendering results on HTML GUI
    '''
    features = [x for x in request.form.values()]
    # final_features = [np.array(int_features)]
    # prediction = model.predict(final_features)
    # output = round(prediction[0], 2)
    features = np.array(features)
    features = features.reshape(1, 6)
    features = pd.DataFrame(data=features,
                            columns=['Name', 'Genre', 'Comments', 'Likes',
                                     'Popularity', 'Followers'])

    df = pd.read_csv('data.csv')
    cv = {'Comments': int, 'Likes': int, 'Popularity': int, 'Followers': int}
    df = df.astype(cv)
    features = features.astype(cv)

    # x = df[df['Views'] == 0].index
    df.drop(index=df[df['Views'] < df['Likes']].index, axis=1, inplace=True)
    df.drop(index=df[df['Views'] < df['Comments']].index, axis=1,
            inplace=True)
    df.drop(index=df[df['Views'] < df['Popularity']].index, axis=1,
            inplace=True)

    # Remove extreme outliers beyond 3 * IQR
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    # (df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))  # unused leftover
    df = df[~((df < (Q1 - 3 * IQR)) | (df > (Q3 + 3 * IQR))).any(axis=1)]

    df = df.drop(columns=['Unique_ID', 'Country', 'Song_Name', 'Timestamp',
                          'index'])
    y = df['Views']
    df = df.drop(columns=['Views'])

    be = BinaryEncoder()
    df = be.fit_transform(df)
    f = be.transform(features)

    X = df.iloc[:, :]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=0)

    rg1 = AdaBoostRegressor()
    rg1.fit(X_train, y_train)
    # ypred = rg1.predict(X_test)
    # sqrt(mean_squared_error(y_test, ypred))

    rg2 = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1)
    # para = {'n_estimators': [250, 300], 'learning_rate': [1, 0.1, 0.01]}
    # grid = GridSearchCV(estimator=rg8, param_grid=para, verbose=1, cv=10,
    #                     n_jobs=-1)
    rg2.fit(X_train, y_train)
    # ypred = rg2.predict(X_test)
    # sqrt(mean_squared_error(y_test, ypred))

    rg3 = RandomForestRegressor(random_state=0, n_estimators=20, max_depth=15)
    # para = {'n_estimators': [5, 10, 30, 20], 'max_depth': [5, 8, 20, 17]}
    # grid = GridSearchCV(estimator=rg9, param_grid=para, cv=10, verbose=1,
    #                     n_jobs=-1)
    rg3.fit(X_train, y_train)
    # ypred = rg3.predict(X_test)
    # sqrt(mean_squared_error(y_test, ypred))

    rg6 = StackingRegressor([rg1, rg2], meta_regressor=rg3)
    rg6.fit(X_train, y_train)
    # ypred = rg6.predict(X_test)
    # sqrt(mean_squared_error(y_test, ypred))

    f = f.iloc[:, :]
    y_pred = rg6.predict(f)
    y_pred = y_pred.astype(int)
    return render_template(
        'index.html',
        prediction_text='Number of Views is {}'.format(y_pred))
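# A sketch of the imports the Flask endpoint above appears to rely on; the
# exact set is an assumption reconstructed from the names used in the body.
import numpy as np
import pandas as pd
from flask import request, render_template
from category_encoders import BinaryEncoder
from sklearn.ensemble import (AdaBoostRegressor, GradientBoostingRegressor,
                              RandomForestRegressor)
from sklearn.model_selection import train_test_split
from mlxtend.regressor import StackingRegressor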
#==============================================================================
# 4) LGBMRegressor model
#==============================================================================
# from lightgbm import LGBMRegressor
#
# model_lgb = LGBMRegressor()

#==============================================================================
# 5) Stacked ensemble model
#==============================================================================
from mlxtend.regressor import StackingRegressor

regressors = [model_xgb, model_rfg, model_gb]
model = StackingRegressor(regressors=regressors, meta_regressor=model_xgb)
# model = model_gb
model.fit(train_text, train_labels)
# print('The parameters of the best model are: ')
# print(model.best_params_)

preds = model.predict(train_text)
print('The pearsonr of training set is {}'.format(
    pearsonr(list(train_labels), list(preds))[0]))
print('The MSE of training set is {}'.format(
    mean_squared_error(list(train_labels), list(preds))))

#==============================================================================
# Predict on the test set
#==============================================================================
preds = model.predict(test_text)
print('The pearsonr of test set is {}'.format(
    pearsonr(list(test_labels), list(preds))[0]))
print('The MSE of test set is {}'.format(
    mean_squared_error(list(test_labels), list(preds))))
          'max_bin': 8192,
          'verbosity': 10}

modelL1 = lgb.LGBMRegressor(**params)
modelL2 = lgb.LGBMRegressor(**params2)

metaregr = Ridge(solver="sag", max_iter=300)
stregr = StackingRegressor(regressors=[modelR1, modelR3, modelL1, modelL2],
                           meta_regressor=metaregr, verbose=10)
stregr.fit(X, y)
print('Weights/Iter of Regressors=', stregr.coef_)
# preds = stregr.predict(X)
# print('RMSE=', mean_squared_error(y, preds)**0.5)

pred_test = stregr.predict(X_test)
submission['price'] = np.expm1(pred_test)
submission.to_csv("FF_LR_Meta.csv", index=False)

#==============================================================================
# GridSearch Cross Validation
# paramsGSCV = {
#     'ridge__alpha': [0.4, 1.0],
#     'lgbmregressor__max_depth': [4, 5],
#     'meta-ridge__alpha': [1.0]
# }
# stregr.get_params().keys()  # Print list of available parameters
# grid = GridSearchCV(estimator=stregr,
#                     param_grid=paramsGSCV,
#                     cv=4,
y[::5] += 3 * (0.5 - np.random.rand(8))

# Initializing models
lr = LinearRegression()
svr_lin = SVR(kernel='linear')
ridge = Ridge(random_state=1)
svr_rbf = SVR(kernel='rbf')
stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                           meta_regressor=svr_rbf)

# Training the stacking regressor
stregr.fit(X, y)
stregr.predict(X)

# Evaluate and visualize the fit
print("Mean Squared Error: %.4f" % np.mean((stregr.predict(X) - y) ** 2))
print('Variance Score: %.4f' % stregr.score(X, y))

with plt.style.context(('seaborn-whitegrid')):
    plt.scatter(X, y, c='lightgray')
    plt.plot(X, stregr.predict(X), c='darkgreen', lw=2)
    plt.show()


# Example 2 - Stacked Regression and GridSearch
from sklearn.model_selection import GridSearchCV
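# Example 2 above is cut off right after the GridSearchCV import. A minimal
# sketch of how such a grid search plausibly continues, using mlxtend's
# '<estimatorname>__<param>' and 'meta-<estimatorname>__<param>' naming
# convention (also visible in other snippets here); the grid values
# themselves are illustrative assumptions.
params = {'ridge__alpha': [0.01, 0.1, 1.0, 10.0],
          'svr__C': [0.1, 1.0, 10.0],        # targets the linear-kernel SVR
          'meta-svr__C': [0.1, 1.0, 10.0]}   # targets the RBF meta-regressor

grid = GridSearchCV(estimator=stregr, param_grid=params, cv=5, refit=True)
grid.fit(X, y)
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))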
def main():
    print("Reading in Data")
    # final prediction
    train = pd.read_csv('cleaned_train20180129_111517.csv')
    test = pd.read_csv('cleaned_test20180129_111517.csv')
    # validate against the leaderboard-A result
    # train = pd.read_csv('cleaned_train20180129_102513.csv')
    # test = pd.read_csv('cleaned_test20180129_102513.csv')
    test = test.drop(['id'], axis=1)
    train = train.drop(['id'], axis=1)
    y_train = train['血糖']

    # pred_proba holds the below/above-threshold weights for the test set
    threshold = 6.5
    test_num = len(test)
    train_num = len(train)
    (bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat,
     X_train, pred_proba) = fuck_columns(train, test, threshold)

    print("training linear model")
    pred_bigger, pred_less, linear_bigger, linear_less = linear_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine the predictions with the weights
    linear_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    print("training lasso model")
    pred_bigger, pred_less, lasso_bigger, lasso_less = lasso_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine the predictions with the weights
    lasso_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    print("training ENet model")
    pred_bigger, pred_less, ENet_bigger, ENet_less = ENet_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine the predictions with the weights
    ENet_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    print("training ensemble models...")
    print("RandomForestRegressor...")
    pred_bigger, pred_less, rf_bigger, rf_less = rf_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine the predictions with the weights
    rf_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    print("GradientBoostingRegressor...")
    pred_bigger, pred_less, gb_bigger, gb_less = GBoost_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine the predictions with the weights
    gb_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    print("LGBMRegressor...")
    pred_bigger, pred_less, lgb_bigger, lgb_less = LGBM_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine the predictions with the weights
    lgb_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    print("XGBRegressor...")
    pred_bigger, pred_less, xgb_bigger, xgb_less = xgb_model(
        bigger_thr_X, bigger_thr_y, less_thr_X, less_thr_y, test_concat)
    # combine the predictions with the weights
    xgb_pred_res = np.array([
        pred_less[i] * pred_proba[i][0] + pred_bigger[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    '''
    Stacking Learning
    '''
    print("StackingRegressor...")
    stacked_averaged_bigger_models = StackingRegressor(
        regressors=[linear_bigger, lasso_bigger, ENet_bigger],
        meta_regressor=gb_bigger)
    stacked_averaged_less_models = StackingRegressor(
        regressors=[linear_less, lasso_less, ENet_less],
        meta_regressor=gb_less)
    # fit the models
    stacked_averaged_bigger_models.fit(bigger_thr_X, bigger_thr_y)
    stacked_averaged_less_models.fit(less_thr_X, less_thr_y)
    # predict on the test set
    stacked_bigger_pred = stacked_averaged_bigger_models.predict(test_concat)
    stacked_less_pred = stacked_averaged_less_models.predict(test_concat)
    # combine the predictions with the weights
    stacked_pred_res = np.array([
        stacked_less_pred[i] * pred_proba[i][0] +
        stacked_bigger_pred[i] * pred_proba[i][1] for i in range(test_num)
    ])

    ensemble = (stacked_pred_res * 0.40 + xgb_pred_res * 0.40 +
                lgb_pred_res * 0.20)
    # blend the stacking result with the linear model
    new_ensemble = np.array([
        linear_pred_res[i] * pred_proba[i][0] + ensemble[i] * pred_proba[i][1]
        for i in range(test_num)
    ])

    sub = pd.DataFrame({'pred': ensemble})
    sub_wig = pd.DataFrame({'pred': new_ensemble})
    sub.to_csv('submission_b.csv', header=None, index=False)
    sub_wig.to_csv('submission_b_wig.csv', header=None, index=False)
def useXYtrain(x, y, times):
    flag = 0
    for i in range(0, len(Selected_learnerCode)):
        if Selected_learnerCode[i] != '':
            flag += 1
    if flag == 0:
        print('No proper learner\n')
        return

    stacking_MSE = [[], [], [], [], [], []]
    MSE = [[], [], [], [], [], [], []]
    R_square = [[], [], [], [], [], [], [], []]
    Ada_MSE = []
    Ada_r_square = []

    for i in range(0, times):
        print('Trial ' + str(i + 1) + ':\n')
        Learners_map = {}
        Learners = []
        X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                            test_size=0.20)

        svr = SVR(C=1.0, epsilon=0.2)
        parameters = {
            'C': np.logspace(-3, 3, 7),
            'gamma': np.logspace(-3, 3, 7)
        }
        print("GridSearch starting...")
        clfsvr = GridSearchCV(svr, parameters, n_jobs=-1,
                              scoring='neg_mean_squared_error')
        clfsvr.fit(X_train, y_train)
        print('The parameters of the best model are: ')
        print(clfsvr.best_params_)
        y_pred = clfsvr.best_estimator_.predict(X_test)
        # drawTrain(y_pred, y_test, 'SVR', i)
        # SVR_MSE.append(mean_squared_error(y_test, y_pred))
        yy = clfsvr.best_estimator_.predict(x)
        R_square[0].append(drawTrain(y, yy, 'SVR', i))
        MSE[0].append(mean_squared_error(y_test, y_pred))
        if 'SVR' in Selected_learnerCode:
            print('SVR Mean squared error is '
                  + str(mean_squared_error(y_test, y_pred)) + "\n")
            Learners.append(clfsvr.best_estimator_)
            Learners_map['SVR'] = svr

        """ann = Regressor(layers=[Layer("Sigmoid", units=14),
                                   Layer("Linear")],
                           learning_rate=0.02, random_state=2018, n_iter=10)
        ann.fit(X_train, y_train)
        y_pred = ann.predict(X_test)
        print('ANN Mean squared error is '
              + str(mean_squared_error(y_test, y_pred)) + "\n")"""

        parameters = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 1000]}
        rfr = RandomForestRegressor(n_estimators=200, random_state=0)
        # drawTrain(rfr, x, y, 'RFR', i)
        # rfr = RandomForestRegressor(n_estimators=200, random_state=0)
        clfrfr = GridSearchCV(rfr, parameters, n_jobs=-1,
                              scoring='neg_mean_squared_error')
        clfrfr.fit(X_train, y_train)
        print('The parameters of the best model are: ')
        print(clfrfr.best_params_)
        y_pred = clfrfr.best_estimator_.predict(X_test)
        yy = clfrfr.best_estimator_.predict(x)
        MSE[1].append(mean_squared_error(y_test, y_pred))
        R_square[1].append(drawTrain(y, yy, 'RFR', i))
        # RFR_MSE.append(mean_squared_error(y_test, y_pred))
        if 'RFR' in Selected_learnerCode:
            print('RFR Mean squared error is '
                  + str(mean_squared_error(y_test, y_pred)) + "\n")
            Learners.append(clfrfr.best_estimator_)
            Learners_map['RFR'] = rfr

        parameters = {'alpha': np.logspace(-2, 2, 5)}
        lasso = Lasso(alpha=0.05, random_state=1, max_iter=1000)
        # drawTrain(lasso, x, y, 'LASSO', i)
        clflasso = GridSearchCV(lasso, parameters, n_jobs=-1,
                                scoring='neg_mean_squared_error')
        clflasso.fit(X_train, y_train)
        yy = clflasso.best_estimator_.predict(x)
        print('The parameters of the best model are: ')
        print(clflasso.best_params_)
        y_pred = clflasso.best_estimator_.predict(X_test)
        R_square[2].append(drawTrain(y, yy, 'LASSO', i))
        MSE[2].append(mean_squared_error(y_test, y_pred))
        if 'LASSO' in Selected_learnerCode:
            print('LASSO Mean squared error is '
                  + str(mean_squared_error(y_test, y_pred)) + "\n")
            # file.write('LASSO Mean squared error is '
            #            + str(mean_squared_error(y_test, y_pred)) + "\n")
            Learners.append(clflasso.best_estimator_)
            Learners_map['LASSO'] = lasso

        # drawTrain(ENet, X_train, y_train, X_test, y_test, 'Elastic NET', i)
        parameters = {
            'alpha': np.logspace(-2, 2, 5),
            'l1_ratio': np.linspace(0, 1.0, 11)
        }
        # ENet = ElasticNet(alpha=0.05, l1_ratio=.9, random_state=3)
        # drawTrain(ENet, x, y, 'Elastic NET', i)
        ENet = ElasticNet(alpha=0.05, l1_ratio=.9, random_state=3)
        clfENet = GridSearchCV(ENet, parameters, n_jobs=-1,
                               scoring='neg_mean_squared_error')
        clfENet.fit(X_train, y_train)
        print('The parameters of the best model are: ')
        print(clfENet.best_params_)
        y_pred = clfENet.best_estimator_.predict(X_test)
        yy = clfENet.best_estimator_.predict(x)
        MSE[3].append(mean_squared_error(y_test, y_pred))
        R_square[3].append(drawTrain(y, yy, 'Elastic Net', i))
        if 'ENET' in Selected_learnerCode:
            print('Elastic Net Mean squared error is '
                  + str(mean_squared_error(y_test, y_pred)) + "\n")
            Learners.append(clfENet.best_estimator_)
            Learners_map['ENET'] = ENet

        parameters = {'n_estimators': [100, 500, 1000, 2000, 3000, 5000]}
        GBoost = GradientBoostingRegressor(n_estimators=3000,
                                           learning_rate=0.05, max_depth=4,
                                           max_features='sqrt',
                                           min_samples_leaf=15,
                                           min_samples_split=10,
                                           loss='huber', random_state=5)
        clfGBoost = GridSearchCV(GBoost, parameters, n_jobs=-1,
                                 scoring='neg_mean_squared_error')
        clfGBoost.fit(X_train, y_train)
        print('The parameters of the best model are: ')
        print(clfGBoost.best_params_)
        y_pred = clfGBoost.best_estimator_.predict(X_test)
        yy = clfGBoost.best_estimator_.predict(x)
        MSE[4].append(mean_squared_error(y_test, y_pred))
        # GBoost_MSE.append(mean_squared_error(y_test, y_pred))
        R_square[4].append(drawTrain(y, yy, 'Gradient Boosting', i))
        if 'GBOOST' in Selected_learnerCode:
            print('GBoost squared error is '
                  + str(mean_squared_error(y_test, y_pred)) + "\n")
            Learners.append(clfGBoost.best_estimator_)
            Learners_map['GBOOST'] = GBoost

        # Adaboost
        # Adaboost = AdaBoostRegressor(base_estimator=SVR(C=1.0, epsilon=0.2))
        Adaboost = AdaBoostRegressor()
        Adaboost.fit(X_train, y_train)
        y_pred = Adaboost.predict(X_test)
        yy = Adaboost.predict(x)
        R_square[5].append(drawTrain(y, yy, 'Adaboost', i))
        print('Adaboost with SVR squared error is '
              + str(mean_squared_error(y_test, y_pred)) + "\n")
        Ada_MSE.append(mean_squared_error(y_test, y_pred))

        # BAGGING
        baggingModel = baggingAveragingModels(
            models=(clfsvr.best_estimator_, clfrfr.best_estimator_,
                    clfENet.best_estimator_, clfGBoost.best_estimator_,
                    clflasso.best_estimator_))
        baggingModel.fit(X_train, y_train)
        y_pred = baggingModel.predict(X_test)
        MSE[5].append(mean_squared_error(y_test, y_pred))
        yy = baggingModel.predict(x)
        R_square[6].append(drawTrain(y, yy, 'Bagging', i))
        print('Bagging before selected squared error is '
              + str(mean_squared_error(y_test, y_pred)) + "\n")

        baggingModel = baggingAveragingModels(models=tuple(Learners))
        # drawTrain(baggingModel, X_train, y_train, X_test, y_test,
        #           'Bagging', i)
        # baggingModel = baggingAveragingModels(models=tuple(Learners))
        baggingModel.fit(X_train, y_train)
        y_pred = baggingModel.predict(X_test)
        MSE[6].append(mean_squared_error(y_test, y_pred))
        yy = baggingModel.predict(x)
        R_square[7].append(drawTrain(y, yy, 'Bagging', i))
        print('Bagging after selected squared error is '
              + str(mean_squared_error(y_test, y_pred)) + "\n")

        stacking_R_square = [[], [], [], [], [], []]
        All_learner = ['SVR', 'RFR', 'LASSO', 'ENET', 'GBOOST']
        for k in range(0, len(Selected_learnerCode)):
            """learnerList = []
            for kk in range(0, len(Selected_learnerCode)):
                if Selected_learnerCode[kk] != '':
                    learnerList.append(
                        Learners_map[Selected_learnerCode[kk]])"""
            """stacked_averaged_models = StackingAveragedModels(
                base_models=tuple(learnerList),
                meta_model=Learners_map[All_learner[k]])
            drawTrain(stacked_averaged_models, X_train, y_train, X_test,
                      y_test, 'stacking with ' + All_learner[k], i)"""
            # stacked_averaged_models = StackingAveragedModels(
            #     base_models=tuple(learnerList),
            #     meta_model=Learners_map[All_learner[k]])
            params = {}
            """if 'SVR' in Selected_learnerCode:
                params['svr__C'] = np.logspace(-3, 3, 7)
                params['svr__gamma'] = np.logspace(-3, 3, 7)
            if 'RFR' in Selected_learnerCode:
                params['randomforestregressor__n_estimators'] = \
                    [10, 50, 100, 500, 1000]
            if 'LASSO' in Selected_learnerCode:
                params['lasso__alpha'] = np.logspace(-2, 2, 5)
            if 'ENET' in Selected_learnerCode:
                params['elasticnet__alpha'] = np.logspace(-2, 2, 5)
            if 'GBOOST' in Selected_learnerCode:
                params['gradientboostingregressor__n_estimators'] = \
                    [100, 500, 1000, 2000, 3000, 5000]"""
            if k == 0:
                params['meta-svr__C'] = np.logspace(-3, 3, 7)
                params['meta-svr__gamma'] = np.logspace(-3, 3, 7)
            if k == 1:
                params['meta-randomforestregressor__n_estimators'] = [
                    10, 50, 100, 500, 1000
                ]
            if k == 2:
                params['meta-lasso__alpha'] = np.logspace(-2, 2, 5)
            if k == 3:
                params['meta-elasticnet__alpha'] = np.logspace(-2, 2, 5)
            if k == 4:
                params['meta-gradientboostingregressor__n_estimators'] = [
                    100, 500, 1000, 2000, 3000, 5000
                ]
            """params = {
                'svr__C': np.logspace(-3, 3, 7),
                'svr__gamma': np.logspace(-3, 3, 7),
                'randomforestregressor__n_estimators':
                    [10, 50, 100, 500, 1000],
                'lasso__alpha': np.logspace(-2, 2, 5),
                'elasticnet__alpha': np.logspace(-2, 2, 5),
                'gradientboostingregressor__n_estimators':
                    [100, 500, 1000, 2000, 3000, 5000],
            }"""
            stacked_averaged_models = StackingRegressor(
                regressors=Learners,
                meta_regressor=Learners_map[All_learner[k]])
            grid = GridSearchCV(estimator=stacked_averaged_models,
                                param_grid=params)
            grid.fit(X_train, y_train)
            y_pred = grid.best_estimator_.predict(X_test)
            yy = grid.best_estimator_.predict(x)
            stacking_R_square[k].append(
                drawTrain(y, yy, 'stacking with ' + All_learner[k], i))
            print('Stacking with metamodel is ' + All_learner[k]
                  + ' squared error is '
                  + str(mean_squared_error(y_test, y_pred)) + "\n")
            # file.write('Stacking with metamodel is lasso squared error is '
            #            + str(mean_squared_error(y_test, y_pred)) + "\n")
            stacking_MSE[k].append(mean_squared_error(y_test, y_pred))

        # stacked_averaged_models = StackingAveragedModels(
        #     base_models=tuple(learnerList), meta_model=baggingModel)
        # drawTrain(stacked_averaged_models, X_train, y_train, X_test, y_test,
        #           'stacking with Bagging models', i)
        """stacked_averaged_models = StackingAveragedModels(
            base_models=tuple(learnerList),
            meta_model=Learners_map[All_learner[k]])"""
        stacked_averaged_models = StackingRegressor(
            regressors=Learners, meta_regressor=baggingModel)
        # grid = GridSearchCV(estimator=stacked_averaged_models,
        #                     param_grid=params)
        stacked_averaged_models.fit(X_train, y_train)
        y_pred = stacked_averaged_models.predict(X_test)
        yy = stacked_averaged_models.predict(x)
        stacking_R_square[5].append(
            drawTrain(y, yy, 'stacking with bagging', i))
        print('Stacking with metamodel is bagging models squared error is '
              + str(mean_squared_error(y_test, y_pred)) + "\n")
        # file.write('Stacking with metamodel is lasso squared error is '
        #            + str(mean_squared_error(y_test, y_pred)) + "\n")
        stacking_MSE[5].append(mean_squared_error(y_test, y_pred))
        gc.collect()

    print("Adaboost mean is " + str(np.mean(Ada_MSE)))
    min_stacking_MSE = []
    for i in range(0, times):
        minMSE = stacking_MSE[0][i]
        for j in range(1, 6):
            if stacking_MSE[j][i] < minMSE:
                minMSE = stacking_MSE[j][i]
        min_stacking_MSE.append(minMSE)

    plot_x = np.linspace(1, times, times)
    if len(MSE[0]) > 0:
        plt.plot(plot_x, MSE[0], 'b')
    if len(MSE[1]) > 0:
        plt.plot(plot_x, MSE[1], 'r')
    if len(MSE[2]) > 0:
        plt.plot(plot_x, MSE[2], 'y')
    if len(MSE[3]) > 0:
        plt.plot(plot_x, MSE[3], 'k')
    if len(MSE[4]) > 0:
        plt.plot(plot_x, MSE[4], 'g')
    if len(MSE[5]) > 0:
        plt.plot(plot_x, MSE[5], 'm')
    if len(MSE[6]) > 0:
        plt.plot(plot_x, MSE[6], color='coral', linestyle=':', marker='|')
    plt.plot(plot_x, min_stacking_MSE, color='cyan')
    plt.xlabel('Repeat times')
    plt.ylabel('MSE')
    plt.legend(('SVR avg = ' + str(np.mean(MSE[0])),
                'RFR avg = ' + str(np.mean(MSE[1])),
                'Lasso avg=' + str(np.mean(MSE[2])),
                'Enet avg=' + str(np.mean(MSE[3])),
                'Gboost avg = ' + str(np.mean(MSE[4])),
                'Bagging before avg = ' + str(np.mean(MSE[5])),
                'Bagging after avg = ' + str(np.mean(MSE[6])),
                'St-LIBS avg = ' + str(np.mean(min_stacking_MSE))),
               loc='upper right')
    plt.title('Different learning machine')
    plt.savefig('DifferentLearner.png')
    plt.clf()
    plt.plot()

    plot_x = np.linspace(1, times, times)
    plt.plot(plot_x, Ada_MSE, 'b')
    plt.plot(plot_x, MSE[6], 'r')
    plt.plot(plot_x, min_stacking_MSE, 'g')
    plt.legend(('Adaboost avg = ' + str(np.mean(Ada_MSE)),
                'Bagging avg = ' + str(np.mean(MSE[6])),
                'St-LIBS avg = ' + str(np.mean(min_stacking_MSE))),
               loc='upper right')
    plt.title('Bagging VS St-LIBS VS Adaboost')
    plt.xlabel('Repeat times')
    plt.ylabel('MSE')
    plt.savefig('Bagging VS St-LIBS&Adaboost.png')
    plt.clf()
    plt.plot()

    plot_x = np.linspace(1, times, times)
    if len(stacking_MSE[0]) > 0:
        plt.plot(plot_x, stacking_MSE[0], 'b')
    if len(stacking_MSE[1]) > 0:
        plt.plot(plot_x, stacking_MSE[1], 'r')
    if len(stacking_MSE[2]) > 0:
        plt.plot(plot_x, stacking_MSE[2], 'y')
    if len(stacking_MSE[3]) > 0:
        plt.plot(plot_x, stacking_MSE[3], 'k')
    if len(stacking_MSE[4]) > 0:
        plt.plot(plot_x, stacking_MSE[4], 'g')
    if len(stacking_MSE[5]) > 0:
        plt.plot(plot_x, stacking_MSE[5], 'm')
    plt.legend(('SVR avg = ' + str(np.mean(stacking_MSE[0])),
                'RFR avg = ' + str(np.mean(stacking_MSE[1])),
                'Lasso avg=' + str(np.mean(stacking_MSE[2])),
                'Enet avg=' + str(np.mean(stacking_MSE[3])),
                'Gboost avg = ' + str(np.mean(stacking_MSE[4])),
                'Bagging avg = ' + str(np.mean(stacking_MSE[5]))),
               loc='upper right')
    plt.title('Different meta-learning machine(Adaboost avg MSE='
              + str(np.mean(Ada_MSE)) + ')')
    plt.xlabel('Repeat times')
    plt.ylabel('MSE')
    plt.savefig('DifferentMetaLearner.png')
    plt.clf()
    plt.plot()

    index = ['SVR', 'RFR', 'LASSO', 'ENET', 'Gboost', 'BAGGING1', 'BAGGING2']
    mse_file = pd.DataFrame(index=index, data=MSE)
    mse_file.to_csv('MSE.csv', encoding='utf-8')
    index = ['SVR', 'RFR', 'LASSO', 'ENET', 'Gboost', 'BAGGING']
    mse_file = pd.DataFrame(index=index, data=stacking_MSE)
    mse_file.to_csv('stacking_MSE.csv', encoding='utf-8')
    mse_file = pd.DataFrame(data=min_stacking_MSE)
    mse_file.to_csv('min_stacking_MSE.csv', encoding='utf-8')
    index = ['SVR', 'RFR', 'LASSO', 'ENET', 'Gboost', 'Adaboost', 'BAGGING1',
             'BAGGING2']
    r_file = pd.DataFrame(index=index, data=R_square)
    r_file.to_csv('R_square.csv', encoding='utf-8')
    index = ['SVR', 'RFR', 'LASSO', 'ENET', 'Gboost', 'BAGGING']
    mse_file = pd.DataFrame(index=index, data=stacking_R_square)
    mse_file.to_csv('stacking_R_square.csv', encoding='utf-8')
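# useXYtrain relies on a baggingAveragingModels helper that is not shown; a
# minimal sketch of such a prediction-averaging ensemble, an assumption
# consistent with how it is constructed and called above.
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin, clone

class baggingAveragingModels(BaseEstimator, RegressorMixin):
    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        # Fit a clone of every base model on the full training data.
        self.models_ = [clone(m) for m in self.models]
        for m in self.models_:
            m.fit(X, y)
        return self

    def predict(self, X):
        # Average the base-model predictions column-wise.
        preds = np.column_stack([m.predict(X) for m in self.models_])
        return np.mean(preds, axis=1)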
y[::5] += 3 * (0.5 - np.random.rand(8))

# Initializing models
lr = LinearRegression()
svr_lin = SVR(kernel='linear')
ridge = Ridge(random_state=1)
svr_rbf = SVR(kernel='rbf')
stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
                           meta_regressor=svr_rbf)

# Training the stacking regressor
stregr.fit(X, y)
stregr.predict(X)

# Evaluate and visualize the fit
print("Mean Squared Error: %.4f" % np.mean((stregr.predict(X) - y) ** 2))
print('Variance Score: %.4f' % stregr.score(X, y))

with plt.style.context(('seaborn-whitegrid')):
    plt.scatter(X, y, c='lightgray')
    plt.plot(X, stregr.predict(X), c='darkgreen', lw=2)
    plt.show()

print(stregr)
model_rf = RandomForestRegressor(n_estimators=200,
                                 max_features=0.26326530612244903,
                                 criterion='mse')
model_extra_tree = ExtraTreesRegressor(n_estimators=200, criterion='mse')
model_gb = GradientBoostingRegressor(n_estimators=100, max_depth=5,
                                     random_state=43)
model_lr = LinearRegression()
svr_rbf = SVR(kernel='rbf')
svr_lin = SVR(kernel='linear')
ridge = Ridge()
model_xgb2 = XGBRegressor(max_depth=10, n_estimators=100)
model_vote = VotingClassifier(
    estimators=[('xgb', model_xgb), ('rf', model_rf), ('gb', model_gb)])
sclf = StackingRegressor(regressors=[model_extra_tree, model_xgb2, model_rf],
                         meta_regressor=model_lr)

time_split = TimeSeriesSplit(n_splits=5)
print(cross_val_score(sclf, X=train.as_matrix(), y=target.as_matrix(),
                      scoring=SMAPE, cv=time_split).mean())

sclf.fit(X=train, y=target)
preds = sclf.predict(test)
sample_submission['y'] = preds
print(sample_submission[sample_submission['y'] < 0])
sample_submission['y'] = sample_submission['y'].map(
    lambda x: x if x > 0 else 0.0)
sample_submission.to_csv("my_submission_24_2.tsv", sep=',', index=False)
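# SMAPE is passed to cross_val_score above but never defined; a minimal
# assumed scorer built with sklearn's make_scorer.
import numpy as np
from sklearn.metrics import make_scorer

def smape(y_true, y_pred):
    # Symmetric mean absolute percentage error (assumed definition).
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    return np.mean(np.abs(y_pred - y_true) / denom) * 100

SMAPE = make_scorer(smape, greater_is_better=False)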