X_train, X_test, y_train, y_test = train_test_split(train_X_reduced, train_y, test_size=0.20, random_state=42) ######################################################################################################### model_lasso = Lasso(alpha=0.000507, random_state=1) model_ridge = Ridge(alpha=10.0) model_svr = SVR(C=15, epsilon=0.009, gamma=0.0004, kernel='rbf') model_ENet = ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3, max_iter=10000) model_KRR = KernelRidge(alpha=0.5, kernel='polynomial', degree=2, coef0=2.5) model_byr = BayesianRidge() model_rforest = RandomForestRegressor(n_estimators=210) model_lsvr = LinearSVR() model_sgd = SGDRegressor() model_extra = ExtraTreesRegressor() model_xgb = XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05, max_depth=4, min_child_weight=1.7817, n_estimators=3000, reg_alpha=0.4640, reg_lambda=0.88, subsample=0.5213,
def Ridge_Regression():
    """Create an unfitted Bayesian ridge regressor.

    Returns:
        A new ``BayesianRidge`` instance constructed with
        ``compute_score=True``; the caller is responsible for fitting it.
    """
    return BayesianRidge(compute_score=True)
# Draw one random image per sample (flattened to size**2 features).
# n_samples, size, coef and snr are defined above the visible source.
X = np.random.randn(n_samples, size**2)
for x in X:  # smooth data
    # Each row is reshaped to a (size, size) image, blurred, and written back
    # in place through the row view.
    x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel()
# Standardise every feature column in place.
X -= X.mean(axis=0)
X /= X.std(axis=0)
y = np.dot(X, coef.ravel())
noise = np.random.randn(y.shape[0])
# Scale the noise relative to the norm of the clean signal; snr controls the
# ratio through exp(snr / 20).
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise  # add noise
###############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
# NOTE(review): KFold(len(y), 2) looks like the legacy
# sklearn.cross_validation signature (n, n_folds) - confirm the pinned version.
cv = KFold(len(y), 2)  # cross-validation generator for model selection
ridge = BayesianRidge()
# Cache directory for the agglomeration step; it is not removed anywhere in
# the visible code.
cachedir = tempfile.mkdtemp()
mem = Memory(cachedir=cachedir, verbose=1)
# Ward agglomeration followed by BayesianRidge
A = grid_to_graph(n_x=size, n_y=size)
ward = WardAgglomeration(n_clusters=10, connectivity=A, memory=mem, n_components=1)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X, y)  # set the best parameters
# Take the ridge coefficients (last pipeline step) and map them back to the
# original feature space through the ward step (first pipeline step).
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
# Continuation of an if/elif dispatch on alg.name whose opening branches are
# outside the visible source. Each branch imports its estimator lazily and
# builds it from the attributes stored on alg.input_variables.
elif alg.name == 'LinearRegression':
    # GPU (cuML) implementation when RAPIDS is enabled, scikit-learn otherwise.
    if NVIDIA_RAPIDS_ENABLED:
        from cuml.linear_model import LinearRegression
        model = LinearRegression(**alg.input_variables.__dict__)
    else:
        from sklearn.linear_model import LinearRegression
        model = LinearRegression(**alg.input_variables.__dict__)
elif alg.name == 'SupportVectorRegression':
    if NVIDIA_RAPIDS_ENABLED:
        from cuml.svm import SVR
    else:
        from sklearn.svm import SVR
    # NOTE(review): assumed to sit after the if/else so both back-ends build
    # a model - confirm against the original indentation.
    model = SVR(**alg.input_variables.__dict__)
# The remaining branches are scikit-learn only; warn_not_gpu_support(alg) is
# called after each one is built.
elif alg.name == 'BayesianRidgeRegression':
    from sklearn.linear_model import BayesianRidge
    model = BayesianRidge(**alg.input_variables.__dict__)
    warn_not_gpu_support(alg)
# From here on the name alone is not enough: alg.type must also be
# 'regression'.
elif alg.name == 'AdaBoost' and alg.type == 'regression':
    from sklearn.ensemble import AdaBoostRegressor
    model = AdaBoostRegressor(**alg.input_variables.__dict__)
    warn_not_gpu_support(alg)
elif alg.name == 'GradientBoosting' and alg.type == 'regression':
    from sklearn.ensemble import GradientBoostingRegressor
    model = GradientBoostingRegressor(**alg.input_variables.__dict__)
    warn_not_gpu_support(alg)
elif alg.name == 'RandomForest' and alg.type == 'regression':
    from sklearn.ensemble import RandomForestRegressor
    model = RandomForestRegressor(**alg.input_variables.__dict__)
    warn_not_gpu_support(alg)
elif alg.name == 'XGBoost' and alg.type == 'regression':
    # The rest of this branch lies beyond the visible source.
    from xgboost.sklearn import XGBRegressor
# Model blending: stack the LightGBM and XGBoost predictions and fit a
# BayesianRidge meta-learner on the two-column matrix.
print('stacking...')
train_stack = np.vstack([oof_lgb, oof_xgb]).transpose()
test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()

folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
# Number of meta-models that will be fitted (n_splits * n_repeats). Deriving
# it from the splitter keeps the test-prediction average correct if the fold
# configuration above is ever changed.
n_stack_fits = folds_stack.get_n_splits()
oof_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

    clf_3 = BayesianRidge()
    clf_3.fit(trn_data, trn_y)

    # With repeated K-fold every row is validated once per repeat, so a later
    # repeat overwrites the out-of-fold value written by an earlier one.
    oof_stack[val_idx] = clf_3.predict(val_data)
    # Average the test predictions over all fitted meta-models.
    predictions += clf_3.predict(test_stack) / n_stack_fits

res_stack = mean_squared_error(target.values, oof_stack)
print('lgb:{:<8.8f}, xgb:{:<8.8f}, stack:{:<8.8f}'.format(res_lgb, res_xgb, res_stack))

# Save: load the submission template and put the blended predictions in
# column 1.
sub_df = pd.read_csv(pre_root_path + '/jinnan_round1_submit_20181227.csv', header=None)
sub_df[1] = predictions
# sub_df[1] = sub_df[1].apply(lambda x:round(x, 3))
# The assignment above overwrites the values read from the template; the
# result is written to a new file without index or header row.
sub_df.to_csv(result_path + '/jinnan_round1_submit_20181227_1.csv', index=False, header=False)
print('save done!')
# Tail of a test whose `def` line is above the visible source; imputer,
# imputation_order and d are defined there.
# Feature indices in the order the fitted imputer visited them.
ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_]
if imputation_order == 'roman':
    # First pass must visit features 1 .. d-1 in ascending order.
    assert np.all(ordered_idx[:d - 1] == np.arange(1, d))
elif imputation_order == 'arabic':
    # First pass must visit features d-1 .. 1 in descending order.
    assert np.all(ordered_idx[:d - 1] == np.arange(d - 1, 0, -1))
elif imputation_order == 'random':
    # The two passes must not use the same visiting order.
    ordered_idx_round_1 = ordered_idx[:d - 1]
    ordered_idx_round_2 = ordered_idx[d - 1:]
    assert ordered_idx_round_1 != ordered_idx_round_2
elif 'ending' in imputation_order:
    # Only the total number of visits is checked: two passes over d-1 features.
    assert len(ordered_idx) == 2 * (d - 1)


@pytest.mark.parametrize(
    "predictor",
    [DummyRegressor(), BayesianRidge(), ARDRegression()])
def test_mice_predictors(predictor):
    # Smoke test: MICEImputer.fit_transform must run with each predictor.
    # There are no assertions; the test passes if nothing raises.
    rng = np.random.RandomState(0)

    n = 100
    d = 10
    # Sparse random matrix; zeros are treated as the missing value below.
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()

    imputer = MICEImputer(missing_values=0,
                          n_imputations=1,
                          n_burn_in=1,
                          predictor=predictor,
                          random_state=rng)
    imputer.fit_transform(X)
model, data.values, y_train, scoring='neg_mean_squared_error', cv=kf) def print_score(model, name, data=train): score = cross_val(model, data) print(' {}: {:.5f} {:.5f}'.format(name, score.mean(), score.std())) def print_mse(y, pred, name): mse = mean_squared_error(y, pred) print(' {}: {:.8f}'.format(name, mse)) lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.00055)) ridge = make_pipeline(RobustScaler(), Ridge(alpha=25, tol=0.00001)) bayesian_ridge = make_pipeline(RobustScaler(), BayesianRidge()) elastic_net = make_pipeline(RobustScaler(), ElasticNet(alpha=0.00055, l1_ratio=0.7)) svr = make_pipeline(RobustScaler(), SVR(C=10, epsilon=0.001, shrinking=False)) print('\nTesting different regression algorithms, scores:') print_score(lasso, 'Lasso') print_score(ridge, 'Ridge Regression') print_score(bayesian_ridge, 'Bayesian Ridge Regression') print_score(elastic_net, 'Elastic Net') print_score(svr, 'Support Vector Regressor') # fit train data to all models, predict train and test, print mean_squared_error for trainings data lasso.fit(train, y_train) lasso_train_pred = lasso.predict(train) lasso_pred = lasso.predict(test)
def predict_ahead(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Make a single forecast with a Bayesian Ridge Regression model

    The model is (re)trained when no model exists yet or when the call
    counter has reached ``train_frequency``; otherwise the previously
    trained pipeline is reused.

    Parameters
    ----------
    df : pandas DataFrame
        the training (streamed) data to model

    Returns
    -------
    predictions : pandas DataFrame
        the forecast -> (1 row, W columns) where W is the forecast_window
    """
    # preprocess the data for supervised machine learning
    X, Y, X_new = self.preprocessing(df, binary=False)

    # NOTE(review): attributes are written with object.__setattr__ throughout,
    # presumably because the class is frozen - confirm against the class
    # definition.
    if self._counter >= self.train_frequency or self._model is None:
        object.__setattr__(self, "_counter", 0)

        # set up a machine learning pipeline
        model = MultiOutputRegressor(BayesianRidge(), n_jobs=N_JOBS)
        pipeline = Pipeline(
            [
                ("var", VarianceThreshold()),
                # ('poly', PolynomialFeatures(2)),  # longer run time, potentially more accurate
                # ('var2', VarianceThreshold()),  # use this if 'poly' is used
                # ('shape', QuantileTransformer(output_distribution="normal")),  # make input variables normally distributed
                ("scale", MinMaxScaler()),
                ("model", model),
            ]
        )

        if self.tune_model:
            # set up cross validation for time series
            tscv = TimeSeriesSplit(n_splits=3)

            # set up the tuner
            parameters = {
                "model__estimator__n_iter": [300],
                "model__estimator__tol": [1e-3],
                "model__estimator__alpha_1": [1e-2, 1e-6, 1e-10],
                "model__estimator__lambda_1": [1e-2, 1e-6, 1e-10],
                "model__estimator__alpha_2": [1e-2, 1e-6, 1e-10],
                "model__estimator__lambda_2": [1e-2, 1e-6, 1e-10],
            }
            # Pass the splitter object itself. An integer cv would select a
            # plain K-fold split, which validates on data that precedes the
            # training rows and so ignores the time ordering.
            grid = RandomizedSearchCV(
                pipeline,
                parameters,
                n_iter=16,
                cv=tscv,
                random_state=0,
                n_jobs=1,
            )
            object.__setattr__(
                self,
                "_model",
                grid.fit(X, Y).best_estimator_,  # search for the best model
            )
        else:
            object.__setattr__(
                self, "_model", pipeline.fit(X, Y)  # train the model
            )

    predictions = self._model.predict(X_new)  # forecast
    predictions = pd.DataFrame(predictions)
    # count this call towards the next scheduled retraining
    object.__setattr__(self, "_counter", self._counter + 1)
    return predictions
zeroThreshold=1e-5) else: pipeline.verify(auto_X.sample(frac=0.05, random_state=13)) store_pkl(pipeline, name) mpg = DataFrame(pipeline.predict(auto_X), columns=["mpg"]) store_csv(mpg, name) if "Auto" in datasets: build_auto( AdaBoostRegressor(DecisionTreeRegressor(random_state=13, min_samples_leaf=5), random_state=13, n_estimators=17), "AdaBoostAuto") build_auto(ARDRegression(normalize=True), "BayesianARDAuto") build_auto(BayesianRidge(normalize=True), "BayesianRidgeAuto") build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=2), "DecisionTreeAuto", compact=False) build_auto( BaggingRegressor(DecisionTreeRegressor(random_state=13, min_samples_leaf=5), random_state=13, n_estimators=3, max_features=0.5), "DecisionTreeEnsembleAuto") build_auto(DummyRegressor(strategy="median"), "DummyAuto") build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto") build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5), "ExtraTreesAuto") build_auto(GradientBoostingRegressor(random_state=13, init=None), "GradientBoostingAuto")
def fit_bridge(X, y):
    """Fit a default-configured Bayesian ridge regressor.

    Args:
        X: training features, passed straight to ``BayesianRidge.fit``.
        y: training targets, passed straight to ``BayesianRidge.fit``.

    Returns:
        The fitted ``BayesianRidge`` estimator.
    """
    from sklearn.linear_model import BayesianRidge

    # fit() hands back the estimator itself, so the call can be chained.
    return BayesianRidge().fit(X, y)
# Give X every column that x1 has (filled with 0), then sort the columns of
# both frames by name so they line up position by position.
# NOTE(review): columns present in X but missing from x1 are not added to x1
# here - confirm that case cannot occur.
for column in x1:
    if column not in X:
        X[column] = 0
X = X.sort_index(axis=1)
x1 = x1.sort_index(axis=1)
from sklearn.model_selection import train_test_split
# Reproducible 67/33 train/test split.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.33, random_state=0)
# Fit a Bayesian ridge regression (not a plain linear regression) on the
# training set.
from sklearn.linear_model import BayesianRidge
regressor = BayesianRidge()
fitResult = regressor.fit(Xtrain, Ytrain)
# Test-set predictions; not evaluated anywhere in the visible code.
YPredTest = regressor.predict(Xtest)
print('Intercept: \n', regressor.intercept_)
print('Coefficients: \n', regressor.coef_)
# The return value is discarded, so this only displays something when run in
# a notebook/REPL cell.
df2.head()
# Predict the target for the unlabeled frame x1 and store it on df2.
y_pred = regressor.predict(x1)
#print(y_pred)
df2['Income'] = y_pred
#print(df2)
# Return value discarded as well (see df2.head() above).
df2.describe()
# rs is a splitter created above the visible source; the return value of this
# call is not used.
rs.get_n_splits(X)
X_trainset = None
y_trainset = None
X_testset = None
y_testset = None
# Every iteration overwrites the previous assignment, so only the last split
# produced by rs survives the loop.
for train_index, test_index in rs.split(X, y):
    X_trainset, X_testset = X[train_index], X[test_index]
    y_trainset, y_testset = y[train_index], y[test_index]
# ## Model training
from sklearn.linear_model import BayesianRidge, HuberRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
regression_model = BayesianRidge()
regression_model.fit(X_trainset, y_trainset)
#
# bagging = BaggingRegressor(BayesianRidge(),n_estimators=10)
# bagging = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=3, random_state=0, loss='ls')
# bagging.fit(X_trainset, y_trainset)
# regression_model = bagging
# joblib.dump(regression_model, "reg_0003-001.m")
## Predict on the test set
# NOTE(review): the held-out split is replaced by the full data set here, so
# the predictions below include the rows the model was trained on - confirm
# this is intended.
X_testset = X
y_testset = y
lines = ""
# regression_model = joblib.load("reg_0003-001.m")
result = regression_model.predict(X_testset)
mse = 0.0
####### Our Model for Comprison na_mask_train = ~X_train.loc[X_train_odds.index].isna().T.any() X_train_odds_comp = X_train.loc[X_train_odds.index].dropna() # X_train_odds_comp = X_train_odds_comp.fillna(X_train_odds_comp.mean()) na_mask_val = ~X_val.loc[X_val_odds.index].isna().T.any() X_val_odds_comp = X_val.loc[X_val_odds.index].dropna() # X_val_odds_comp = X_val_odds_comp.fillna(X_val_odds_comp.mean()) X_train_odds = X_train_odds[na_mask_train] X_val_odds = X_val_odds[na_mask_val] y_train_odds = y_train_odds[na_mask_train] y_val_odds = y_val_odds[na_mask_val] lm = BayesianRidge().fit(X_train_odds.median(axis=1).values.reshape(-1,1), y_train_odds) predictions = lm.predict(X_val_odds.median(axis=1).values.reshape(-1,1)) print(mean_squared_error(y_val_odds, predictions)) lm.score(X_val_odds.median(axis=1).values.reshape(-1,1), y_val_odds) # X_train_odds_comp_tot = pd.concat([X_train.loc[X_train_odds.index], X_train_odds], axis=1) # X_val_odds_comp_tot = pd.concat([X_val.loc[X_val_odds.index], X_val_odds], axis=1) ####### Scale data select features standardscaler = StandardScaler() X_trainscaled_odds_comp = standardscaler.fit_transform(X_train_odds_comp[featurestouse]) X_valscaled_odds_comp = standardscaler.transform(X_val_odds_comp[featurestouse]) # standardscaler = StandardScaler() # X_trainscaled_odds_comp_tot = standardscaler.fit_transform(X_train_odds_comp_tot[featurestouse]) # X_valscaled_odds_comp_tot = standardscaler.transform(X_val_odds_comp_tot[featurestouse])
}, { 'name': 'Lasso', 'model': Lasso() }, { 'name': 'ElasticNet', 'model': ElasticNet() }, { 'name': 'LassoLarsDefault', 'model': LassoLars() }, { 'name': 'BayesianRidgeDefault', 'model': BayesianRidge() }, { 'name': 'ARDRegressionDefault', 'model': ARDRegression(fit_intercept=True) }, { 'name': 'ARDRegression', 'model': ARDRegression(fit_intercept=True, threshold_lambda=10000) }, { 'name': 'ARDRegressionOptim1', 'model': ARDRegression(fit_intercept=True, n_iter=100,
def ml_regression(x_train,
                  y_train,
                  x_test,
                  y_test,
                  cross_validation=False,
                  show=False):
    """
    Build, train, and test the data set with classical machine learning
    regression models.

    Each regressor is fitted once on the training data and scored on the
    test data with ``regression_scores``.

    Parameters
    ----------
    x_train, y_train : training features and targets, passed to ``fit``.
    x_test, y_test : test features and targets used for scoring.
    cross_validation : bool
        Kept for backward compatibility. Cross-validated training has been
        removed; passing True only emits a warning.
    show : bool
        Forwarded to ``regression_scores``; when True the timing and the
        scores of every model are also printed.

    Returns
    -------
    pandas DataFrame
        One row per model (indexed by model name) with the columns
        'Time (s)', 'Test loss' and 'Test R2 score', sorted by test loss
        and rounded to two decimals.
    """
    from time import time
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import BayesianRidge
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor

    regressors = (LinearRegression(), BayesianRidge(), DecisionTreeRegressor(),
                  KNeighborsRegressor(n_neighbors=10), AdaBoostRegressor(),
                  RandomForestRegressor(100))

    names = [
        "Linear", "Bayesian Ridge", "Decision Tree", "KNeighbors", "AdaBoost",
        "Random Forest"
    ]

    col = ['Time (s)', 'Test loss', 'Test R2 score']

    # The flag no longer changes the computation, so warn once up front
    # instead of once per model.
    if cross_validation:
        warnings.warn('Cross-validation removed')

    # Collect plain rows and build the frame once: DataFrame.append no longer
    # exists in pandas 2.x, and building from numeric rows keeps the columns
    # numeric so that sort_values/round behave as intended.
    rows = []
    for name, clf in zip(names, regressors):
        print(name)

        t0 = time()
        # Fitting the model without cross validation
        clf.fit(x_train, y_train)
        train_time = np.around(time() - t0, 1)
        y_pred = clf.predict(x_test)
        loss, r2 = regression_scores(y_test, y_pred, show=show)

        rows.append([train_time, loss, r2])

        if show:
            print("-" * 20)
            print("Training Time: \t {:.1f} s".format(train_time))
            print("Test loss: \t\t {:.4f}".format(loss))
            print("Test R2-score: \t {:.3f}\n".format(r2))

    results = pd.DataFrame(rows, columns=col, index=names)
    return results.sort_values('Test loss').round(2)
# Label column: the close price forecast_out rows ahead. The last
# forecast_out labels are therefore NaN.
df['Prediction'] = df_close.shift(-forecast_out)
# print(df.tail())

# Features are every column except the label. The axis is passed by keyword
# because the positional form of DataFrame.drop is rejected by pandas >= 2.0.
X = np.array(df.drop(['Prediction'], axis=1))
# NOTE(review): scaling uses statistics of the full data set, including the
# rows later used for testing and forecasting.
X = preprocessing.scale(X)

X_forecast = X[-forecast_out:]  # last forecast_out rows: no label available yet
X = X[:-forecast_out]  # drop those rows from the training features

y = np.array(df['Prediction'])
y = y[:-forecast_out]  # drop the NaN labels so X and y stay aligned

# NOTE(review): `cross_validation` is whatever the file imported under that
# name; the sklearn module of that name was removed in scikit-learn 0.20.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3)

# Training
clf = BayesianRidge()
clf.fit(X_train, y_train)

# Testing: score() is the R^2 of the predictions on the held-out rows.
confidence = clf.score(X_test, y_test)
print("confidence: ", confidence)

forecast_prediction = clf.predict(X_forecast)
print(forecast_prediction)
# 随机提取10个特征出来作为样本特征 relevant_features = np.random.randint(0, n_features, 10) # 基于先验分布,产生特征对应的初始权值 for i in relevant_features: w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_)) # 产生alpha为50的噪声 alpha_ = 50. noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples) # 产生目标数据 y = np.dot(X, w) + noise ############################################################################### # 使用贝叶斯脊回归拟合数据 clf = BayesianRidge(compute_score=True) clf.fit(X, y) # 使用最小二乘法拟合数据 ols = LinearRegression() ols.fit(X, y) ############################################################################### # 作图比较两个方法的结果 plt.figure(figsize=(6, 5)) plt.title("Weights of the model") plt.plot(clf.coef_, 'b-', label="Bayesian Ridge estimate") plt.plot(w, 'g-', label="Ground truth") plt.plot(ols.coef_, 'r--', label="OLS estimate") plt.xlabel("Features") plt.ylabel("Values of the weights")
def dict_method_reg():
    """Assemble the catalogue of regressors and their grid-search settings.

    Returns:
        dict: maps a method label (for example ``"KNR-set"``) to a four-item
        list ``[estimator, cv, scoring, param_grid]``. Entries are inserted
        in the order they appear below.

    The kernel-based entries read the names ``ker`` and ``kernel`` from the
    enclosing scope.
    """
    dict_method = {}

    # ------------------------------ 1st part ------------------------------
    # K-nearest-neighbours regression
    knr = neighbors.KNeighborsRegressor(
        n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30,
        p=2, metric='minkowski')
    dict_method["KNR-set"] = [knr, 5, 'r2', [{
        'n_neighbors': [3, 4, 5, 6, 7],
        "weights": ['uniform', "distance"],
        "leaf_size": [10, 20, 30]
    }]]

    # Support vector regression
    svr = SVR(kernel='rbf', gamma='auto', degree=3, tol=1e-3, epsilon=0.1,
              shrinking=False, max_iter=2000)
    dict_method["SVR-set"] = [svr, 5, 'r2', [{
        'C': [10000, 100, 50, 10, 5, 2.5, 1, 0.5, 0.1, 0.01],
        'kernel': ker
    }]]

    # Kernel ridge regression
    krr = kernel_ridge.KernelRidge(alpha=1, gamma="scale", degree=3, coef0=1,
                                   kernel_params=None)
    dict_method['KRR-set'] = [krr, 5, 'r2', [{
        'alpha': [100, 50, 10, 5, 2.5, 1, 0.5, 0.1, 0.01, 0.001, 1e-4, 1e-5],
        'kernel': ker
    }]]

    # Gaussian process regression
    gpr = gaussian_process.GaussianProcessRegressor(
        kernel=kernel, alpha=1e-10, optimizer='fmin_l_bfgs_b',
        n_restarts_optimizer=0, normalize_y=False, copy_X_train=True,
        random_state=0)
    dict_method["GPR-set"] = [gpr, 5, 'r2', [{
        'alpha': [1e-3, 1e-2],
        'kernel': ker
    }]]

    # ------------------------------ 2nd part ------------------------------
    # Random forest
    rfr = RandomForestRegressor(
        n_estimators=500, max_depth=None, min_samples_split=2,
        min_samples_leaf=1, min_weight_fraction_leaf=0.0,
        max_leaf_nodes=None, min_impurity_decrease=0.0,
        min_impurity_split=None, bootstrap=True, oob_score=False,
        random_state=None, verbose=0, warm_start=False)
    dict_method["RFR-em"] = [rfr, 5, 'r2', [{
        'max_depth': [4, 5, 6, 7],
    }]]

    # Gradient boosting
    gbr = GradientBoostingRegressor(
        loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0,
        criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
        min_weight_fraction_leaf=0., max_depth=3, min_impurity_decrease=0.,
        min_impurity_split=None, init=None, random_state=None,
        max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None,
        warm_start=False)
    dict_method['GBR-em'] = [gbr, 5, 'r2', [{
        'max_depth': [3, 4, 5, 6],
        'min_samples_split': [2, 3],
        'learning_rate': [0.1, 0.05]
    }]]

    # AdaBoost over a depth-limited decision tree; the same tree object is
    # also the only candidate in the parameter grid.
    dt3 = DecisionTreeRegressor(criterion="mse", splitter="best",
                                max_features=None, max_depth=7,
                                min_samples_split=4)
    adabr = AdaBoostRegressor(dt3, n_estimators=200, learning_rate=0.05,
                              random_state=0)
    dict_method["AdaBR-em"] = [adabr, 5, 'explained_variance',
                               [{"base_estimator": [dt3]}]]

    # Single decision tree
    dtr = DecisionTreeRegressor(
        criterion="mse", splitter="best", max_depth=None,
        min_samples_split=2, min_samples_leaf=1,
        min_weight_fraction_leaf=0., max_features=None, random_state=0,
        max_leaf_nodes=None, min_impurity_decrease=0.,
        min_impurity_split=None)
    dict_method['DTR-em'] = [dtr, 5, 'r2', [{
        'max_depth': [2, 3, 4, 5, 6, 7, 8],
        "min_samples_split": [2, 3, 4],
        "min_samples_leaf": [1, 2]
    }]]

    # Elastic net
    enet = ElasticNet(alpha=1.0, l1_ratio=0.7, fit_intercept=True,
                      normalize=False, precompute=False, max_iter=1000,
                      copy_X=True, tol=0.0001, warm_start=False,
                      positive=False, random_state=None)
    dict_method["EN-L1"] = [enet, 5, 'r2', [{
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
        'l1_ratio': [0.3, 0.5, 0.8]
    }]]

    # Lasso
    lasso = Lasso(alpha=1.0, fit_intercept=True, normalize=True,
                  precompute=False, copy_X=True, max_iter=3000, tol=0.001,
                  warm_start=False, positive=False, random_state=None)
    dict_method["LASSO-L1"] = [lasso, 5, 'r2', [{
        'alpha': [
            0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10, 100,
            1000
        ],
        "tol": [0.001, 0.01, 0.1]
    }]]

    # Bayesian ridge
    brr = BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False,
                        copy_X=True, fit_intercept=True, lambda_1=1e-06,
                        lambda_2=1e-06, n_iter=300, normalize=False,
                        tol=0.01, verbose=False)
    dict_method['BRR-L1'] = [brr, 5, 'r2', [{
        'alpha_1': [1e-07, 1e-06, 1e-05],
        'alpha_2': [1e-07, 1e-06, 1e-05]
    }]]

    # Stochastic gradient descent regression
    sgdr = SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
                        fit_intercept=True, l1_ratio=0.15,
                        learning_rate='invscaling', loss='squared_loss',
                        max_iter=1000, penalty='l2', power_t=0.25,
                        random_state=0, shuffle=True, tol=0.01, verbose=0,
                        warm_start=False)
    dict_method['SGDR-L1'] = [sgdr, 5, 'r2', [{
        'alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 1e-05],
        'loss': ['squared_loss', "huber"],
        "penalty": ["l1", "l2"]
    }]]

    # Passive-aggressive regression
    par = PassiveAggressiveRegressor(
        C=1.0, fit_intercept=True, max_iter=1000, tol=0.001,
        early_stopping=False, validation_fraction=0.1, n_iter_no_change=5,
        shuffle=True, verbose=0, loss='epsilon_insensitive', epsilon=0.1,
        random_state=None, warm_start=False, average=False)
    dict_method['PAR-L1'] = [par, 5, 'r2', [{
        'C': [1.0e8, 1.0e6, 10000, 100, 50, 10, 5, 2.5, 1, 0.5, 0.1, 0.01]
    }]]

    return dict_method
def fit_transform(self, X, y=None):
    """Fits the imputer on X and returns the transformed X.

    Runs ``n_burn_in + n_imputations`` rounds of feature-by-feature
    imputation and returns the average of the filled matrices produced
    after the burn-in rounds, with the originally observed entries
    restored.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Input data, where "n_samples" is the number of samples and
        "n_features" is the number of features.

    y : ignored.

    Returns
    -------
    Xt : array-like, shape (n_samples, n_features)
         The imputed input data.
    """
    # Reuse an existing random_state_ if the attribute is already set;
    # otherwise fall back to one built from the constructor parameter.
    self.random_state_ = getattr(self, "random_state_",
                                 check_random_state(self.random_state))

    # Default predictor is BayesianRidge; a user-supplied one is cloned so
    # the caller's object is left untouched.
    if self.predictor is None:
        from sklearn.linear_model import BayesianRidge
        self._predictor = BayesianRidge()
    else:
        self._predictor = clone(self.predictor)

    # np.nan stands for "no bound was given" on either side.
    self._min_value = np.nan if self.min_value is None else self.min_value
    self._max_value = np.nan if self.max_value is None else self.max_value

    self.initial_imputer_ = None
    X, X_filled, mask_missing_values = self._initial_imputation(X)

    # edge case: in case the user specifies 0 for n_imputations,
    # then there is no need to do burn in and the result should be
    # just the initial imputation (before clipping)
    if self.n_imputations < 1:
        return X_filled

    X_filled = np.clip(X_filled, self._min_value, self._max_value)

    # order in which to impute
    # note this is probably too slow for large feature data (d > 100000)
    # and a better way would be good.
    # see: https://goo.gl/KyCNwj and subsequent comments
    ordered_idx = self._get_ordered_idx(mask_missing_values)

    abs_corr_mat = self._get_abs_corr_mat(X_filled)

    # impute data
    n_rounds = self.n_burn_in + self.n_imputations
    n_samples, n_features = X_filled.shape
    # Accumulator for the post-burn-in rounds; averaged after the loop.
    Xt = np.zeros((n_samples, n_features), dtype=X.dtype)
    self.imputation_sequence_ = []
    if self.verbose > 0:
        print("[MICE] Completing matrix with shape %s" % (X.shape, ))
    start_t = time()
    for i_rnd in range(n_rounds):
        # With 'random' ordering the visiting order is recomputed at the
        # start of every round.
        if self.imputation_order == 'random':
            ordered_idx = self._get_ordered_idx(mask_missing_values)

        for feat_idx in ordered_idx:
            neighbor_feat_idx = self._get_neighbor_feat_idx(
                n_features, feat_idx, abs_corr_mat)
            X_filled, predictor = self._impute_one_feature(
                X_filled,
                mask_missing_values,
                feat_idx,
                neighbor_feat_idx,
                predictor=None,
                fit_mode=True)
            # Record (feature, neighbours, predictor) for every step, in
            # the order the steps were performed.
            predictor_triplet = MICETriplet(feat_idx, neighbor_feat_idx,
                                            predictor)
            self.imputation_sequence_.append(predictor_triplet)

        # Burn-in rounds are run but do not contribute to the average.
        if i_rnd >= self.n_burn_in:
            Xt += X_filled
        if self.verbose > 0:
            print('[MICE] Ending imputation round '
                  '%d/%d, elapsed time %0.2f' % (i_rnd + 1, n_rounds,
                                                 time() - start_t))

    Xt /= self.n_imputations
    # Observed entries are copied back unchanged from the input.
    Xt[~mask_missing_values] = X[~mask_missing_values]
    return Xt
# Column 3: k-nearest-neighbours estimates on the (unnormalised) train and
# validation features.
ytrain_est[:, 3] = knn.predict(Xtrain)
yval_est[:, 3] = knn.predict(Xval)

# Column 4: support vector regression on the normalised features.
svmnorm = SVR(tol=tol, gamma='auto').fit(Xtrain_norm, ytrain)
predictions[:, 4] = svmnorm.predict(Xtest_norm)
ytrain_est[:, 4] = svmnorm.predict(Xtrain_norm)
yval_est[:, 4] = svmnorm.predict(Xval_norm)

# Column 5: linear support vector regression.
svmlnorm = LinearSVR(max_iter=5000).fit(Xtrain_norm, ytrain)
predictions[:, 5] = svmlnorm.predict(Xtest_norm)
ytrain_est[:, 5] = svmlnorm.predict(Xtrain_norm)
yval_est[:, 5] = svmlnorm.predict(Xval_norm)

# Column 6: Bayesian ridge regression (the variable is named gnb).
gnb = BayesianRidge().fit(Xtrain_norm, ytrain)
predictions[:, 6] = gnb.predict(Xtest_norm)
ytrain_est[:, 6] = gnb.predict(Xtrain_norm)
yval_est[:, 6] = gnb.predict(Xval_norm)

# Column 7: Huber regression.
hr = HuberRegressor().fit(Xtrain_norm, ytrain)
predictions[:, 7] = hr.predict(Xtest_norm)
ytrain_est[:, 7] = hr.predict(Xtrain_norm)
yval_est[:, 7] = hr.predict(Xval_norm)

# eval
d_train = xgb.DMatrix(data=Xtrain_norm, label=ytrain, feature_names=Xtrain.columns)
def bayesian_ridge_regression():
    """Fit Bayesian ridge and OLS models on simulated data.

    The first part fits both models on seeded synthetic data with sparse
    Gaussian weights; the second part fits a Bayesian ridge on polynomial
    (Vandermonde) features of a noisy 1-D curve and predicts with standard
    deviations. The plotting code is commented out and, in the visible
    code, the function returns nothing, so all results are discarded.
    """
    # #############################################################################
    # Generating simulated data with Gaussian weights
    np.random.seed(0)
    n_samples, n_features = 100, 100
    X = np.random.randn(n_samples, n_features)  # Create Gaussian data
    # Create weights with a precision lambda_ of 4.
    lambda_ = 4.
    w = np.zeros(n_features)
    # Only keep 10 weights of interest
    # (indices come from randint, so fewer than 10 distinct ones are possible)
    relevant_features = np.random.randint(0, n_features, 10)
    for i in relevant_features:
        w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_))
    # Create noise with a precision alpha of 50.
    alpha_ = 50.
    noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples)
    # Create the target
    y = np.dot(X, w) + noise

    # #############################################################################
    # Fit the Bayesian Ridge Regression and an OLS for comparison
    clf = BayesianRidge(compute_score=True)
    clf.fit(X, y)

    ols = LinearRegression()
    ols.fit(X, y)

    # #############################################################################
    # Plot true weights, estimated weights, histogram of the weights, and
    # predictions with standard deviations
    # lw = 2
    # plt.figure(figsize=(6, 5))
    # plt.title("Weights of the model")
    # plt.plot(clf.coef_, color='lightgreen', linewidth=lw,
    #          label="Bayesian Ridge estimate")
    # plt.plot(w, color='gold', linewidth=lw, label="Ground truth")
    # plt.plot(ols.coef_, color='navy', linestyle='--', label="OLS estimate")
    # plt.xlabel("Features")
    # plt.ylabel("Values of the weights")
    # plt.legend(loc="best", prop=dict(size=12))

    # plt.figure(figsize=(6, 5))
    # plt.title("Histogram of the weights")
    # plt.hist(clf.coef_, bins=n_features, color='gold', log=True,
    #          edgecolor='black')
    # plt.scatter(clf.coef_[relevant_features], np.full(len(relevant_features), 5.),
    #             color='navy', label="Relevant features")
    # plt.ylabel("Features")
    # plt.xlabel("Values of the weights")
    # plt.legend(loc="upper left")

    # plt.figure(figsize=(6, 5))
    # plt.title("Marginal log-likelihood")
    # plt.plot(clf.scores_, color='navy', linewidth=lw)
    # plt.ylabel("Score")
    # plt.xlabel("Iterations")

    # Plotting some predictions for polynomial regression
    def f(x, noise_amount):
        # sqrt(x) * sin(x) plus standard-normal noise scaled by noise_amount
        y = np.sqrt(x) * np.sin(x)
        noise = np.random.normal(0, 1, len(x))
        return y + noise_amount * noise

    degree = 10
    X = np.linspace(0, 10, 100)
    y = f(X, noise_amount=0.1)
    clf_poly = BayesianRidge()
    # np.vander(X, degree) supplies the polynomial feature columns.
    clf_poly.fit(np.vander(X, degree), y)

    # The evaluation grid runs to 11, slightly beyond the training range
    # [0, 10].
    X_plot = np.linspace(0, 11, 25)
    y_plot = f(X_plot, noise_amount=0)
    y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True)
def train(self):
    """Train the models that combine the predictions of the base methods.

    When test data is available, more than one base method exists and the
    combiner has not been trained yet, the (up to four) base methods with
    the smallest MAE are selected, their predictions are stacked column-wise
    and every method listed in ``self.combine_methods`` is fitted on them.
    Otherwise the combiner falls back to plain averaging.

    In both cases the object is marked as trained and saved to
    ``self.model_dir``.

    Returns:
        str: always ``'Done'``.

    Raises:
        ValueError: if training is required but ``self.model_type`` is not
            one of ``'pv'``, ``'wind'``, ``'load'`` or ``'fa'``.
    """
    X_test, y_test, act_test, X_cnn_test, X_lstm_test = self.load_data()

    if X_test.shape[0] > 0 and len(
            self.methods) > 1 and self.istrained == False:
        # Every supported model type uses the same data preparation; any
        # other type is reported explicitly instead of failing later on an
        # unbound local variable.
        if self.model_type not in {'pv', 'wind', 'load', 'fa'}:
            raise ValueError(
                'Unsupported model_type for combine training: %r' %
                (self.model_type, ))

        if self.resampling == True:
            prepare = self.resampling_for_combine
        else:
            prepare = self.without_resampling
        pred_resample, y_resample, results = prepare(
            X_test, y_test, act_test, X_cnn_test, X_lstm_test)

        # Keep the four methods with the smallest MAE, then keep only those
        # within 0.02 MAE of the best one.
        self.best_methods = results.nsmallest(4, 'mae').index.tolist()
        results = results.loc[self.best_methods]
        results['diff'] = results['mae'] - results['mae'].iloc[0]
        best_of_best = results.iloc[np.where(
            results['diff'] <= 0.02)].index.tolist()

        # Pad the selection back to four entries by repeating the best
        # method (and, when only one survived, adding the runner-up).
        if len(best_of_best) == 1:
            best_of_best.extend(
                [best_of_best[0], best_of_best[0], self.best_methods[1]])
        elif len(best_of_best) == 2:
            best_of_best.extend([best_of_best[0], best_of_best[0]])
        elif len(best_of_best) == 3:
            best_of_best.append(best_of_best[0])

        self.best_methods = best_of_best

        # Stack the selected predictions column-wise, in sorted method
        # order, and clip negative predictions to zero.
        X_pred = np.hstack(
            [pred_resample[method] for method in sorted(self.best_methods)])
        X_pred[X_pred < 0] = 0

        X_pred, y_resample = shuffle(X_pred, y_resample)
        self.weight_size = len(self.best_methods)
        self.model = dict()
        for combine_method in self.combine_methods:
            if combine_method == 'rls':
                self.logger.info('RLS training')
                self.logger.info('\n')
                self.model[combine_method] = dict()
                w = self.rls_fit(X_pred, y_resample)
                self.model[combine_method]['w'] = w

            elif combine_method == 'bcp':
                self.logger.info('BCP training')
                self.logger.info('\n')
                self.model[combine_method] = dict()
                w = self.bcp_fit(X_pred, y_resample)
                self.model[combine_method]['w'] = w

            elif combine_method == 'mlp':
                self.logger.info('MLP training')
                self.logger.info('\n')
                # Three independent random train/validation/test splits.
                cvs = []
                for _ in range(3):
                    X_train1, X_test1, y_train1, y_test1 = train_test_split(
                        X_pred, y_resample, test_size=0.15)
                    X_train, X_val, y_train, y_val = train_test_split(
                        X_train1, y_train1, test_size=0.15)
                    cvs.append(
                        [X_train, y_train, X_val, y_val, X_test1, y_test1])
                mlp_model = sklearn_model(
                    self.static_data,
                    self.model_dir,
                    self.rated,
                    'mlp',
                    self.n_jobs,
                    is_combine=True,
                    path_group=self.static_data['path_group'])
                self.model[combine_method] = mlp_model.train(cvs)

            elif combine_method == 'bayesian_ridge':
                self.logger.info('bayesian_ridge training')
                self.logger.info('\n')
                self.model[combine_method] = BayesianRidge()
                self.model[combine_method].fit(X_pred, y_resample)

            elif combine_method == 'elastic_net':
                self.logger.info('elastic_net training')
                self.logger.info('\n')
                self.model[combine_method] = ElasticNetCV(cv=5)
                self.model[combine_method].fit(X_pred, y_resample)

            elif combine_method == 'ridge':
                self.logger.info('ridge training')
                self.logger.info('\n')
                self.model[combine_method] = RidgeCV(cv=5)
                self.model[combine_method].fit(X_pred, y_resample)

        self.logger.info('End of combine models training')
    else:
        # Nothing to train on (or already trained): fall back to averaging.
        self.combine_methods = ['average']

    self.istrained = True
    self.save(self.model_dir)

    return 'Done'
# Reproducible 70/30 split of the features and the time target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_time, test_size=0.3, random_state=0)

# Display label -> unfitted estimator, evaluated in insertion order.
Regressor = {
    'Random Forest Regressor': RandomForestRegressor(n_estimators=200),
    'Gradient Boosting Regressor': GradientBoostingRegressor(n_estimators=500),
    'ExtraTrees Regressor': ExtraTreesRegressor(n_estimators=500,
                                                min_samples_split=5),
    'Bayesian Ridge': BayesianRidge(),
    'Elastic Net CV': ElasticNetCV()
}

for name in Regressor:
    clf = Regressor[name]
    print(name)
    # Fit on the training part, then report score() on the held-out part
    # under the label 'acc'.
    print('acc', clf.fit(X_train, y_train).score(X_test, y_test))
    #print('new_acc',get_acc(y_test,clf.predict(X_test),10))
    # print(f'R2: {r2_score(y_test, clf.predict(X_test)):.2f}')
    # print(f'MAE: {mean_absolute_error(y_test, clf.predict(X_test)):.2f}')
    # print(f'MSE: {mean_squared_error(y_test, clf.predict(X_test)):.2f}')
#!/usr/bin/env python import pandas as pd from sklearn.model_selection import train_test_split from sklearn.linear_model import BayesianRidge from sklearn import datasets from sklearn.utils import shuffle import numpy as np boston = datasets.load_boston() X, Y = shuffle(boston.data, boston.target, random_state=13) X = X.astype(np.float32) offset = int(X.shape[0] * 0.9) X_train, Y_train = X[:offset], Y[:offset] X_test, Y_test = X[offset:], Y[offset:] regressor = BayesianRidge(compute_score=True) regressor.fit(X_train, Y_train) score = regressor.score(X_test, Y_test) print(score)
def regressionfunctions(X_temp, Y_temp, which_regs):
    """Grid-search a linear baseline plus the selected regressors; keep the best.

    A ``GridSearchCV`` over ``LinearRegression`` is always fitted first and
    serves as the initial "best" model. Every regressor enabled in
    ``which_regs`` is then grid-searched with the same K-fold splitter, and
    replaces the current best when its adjusted score (``adj_r_sqrd`` of
    ``GridSearchCV.score`` on the full data) is strictly higher.

    Parameters
    ----------
    X_temp : array-like
        Feature matrix passed to ``fit``/``score``/``predict``.
    Y_temp : array-like with a ``ravel`` method
        Targets; flattened with ``ravel()`` for fitting.
    which_regs : dict
        Maps a model key ("omp", "muelnet", "elnet", "rfr", "mlp", "br",
        "ard", "svr", "nusvr") to a flag; entries comparing equal to ``True``
        are evaluated.

    Returns
    -------
    tuple
        ``(best_score_all, best_estimator_all, rsquared, models_output_dic)``:
        the best adjusted score, the fitted ``GridSearchCV`` that achieved it,
        its unadjusted score, and a dict keyed by model name ("lr" plus every
        successfully fitted model) holding ``"model"``, ``"mod_score"`` and
        ``"predicted_Y"``.

    Notes
    -----
    Relies on the module-level names ``model_pars_name_dic``, ``cv`` and
    ``adj_r_sqrd``. A model whose search raises is reported on stdout and
    skipped (best-effort), but ``KeyboardInterrupt``/``SystemExit`` are no
    longer swallowed.
    """
    tunedpars_lr = model_pars_name_dic["tunedpars_lr"]

    reg_ref_dic = {
        "omp": OrthogonalMatchingPursuit(),
        "muelnet": MultiTaskElasticNet(),
        "elnet": ElasticNet(),
        "rfr": RandomForestRegressor(random_state=0),
        "mlp": MLPRegressor(learning_rate="adaptive", random_state=0),
        "br": BayesianRidge(),
        "ard": ARDRegression(),
        "svr": SVR(),
        "nusvr": NuSVR()
    }
    tunedpars_dic = {
        "rfr": model_pars_name_dic["tunedpars_rfr"],
        "mlp": model_pars_name_dic["tunedpars_mlp"],
        "br": model_pars_name_dic["tunedpars_br"],
        "ard": model_pars_name_dic["tunedpars_ard"],
        "svr": model_pars_name_dic["tunedpars_svr"],
        "nusvr": model_pars_name_dic["tunedpars_nusvr"],
        "elnet": model_pars_name_dic["tunedpars_elnet"],
        "muelnet": model_pars_name_dic["tunedpars_muelnet"],
        "omp": model_pars_name_dic["tunedpars_omp"]
    }

    models_output_dic = dict()

    # `== True` (rather than plain truthiness) is kept on purpose so that the
    # set of selected models is exactly what existing callers get today.
    model_dic = {
        key: [tunedpars_dic[key], reg_ref_dic[key]]
        for key, enabled in which_regs.items()
        if enabled == True  # noqa: E712
    }

    cv1 = KFold(n_splits=cv, shuffle=True, random_state=1)

    # Baseline: plain linear regression is always evaluated.
    reg = GridSearchCV(LinearRegression(), tunedpars_lr, cv=cv1, n_jobs=-1,
                       return_train_score=True)
    reg.fit(X_temp, Y_temp.ravel())
    rsquared = reg.score(X_temp, Y_temp)
    best_score_all = adj_r_sqrd(rsquared)
    best_estimator_all = reg
    models_output_dic["lr"] = {
        "model": reg,
        "mod_score": rsquared,
        "predicted_Y": reg.predict(X_temp)
    }

    for mod_name, (tunedpars, estimator) in model_dic.items():
        reg = GridSearchCV(estimator, tunedpars, cv=cv1, n_jobs=-1,
                           return_train_score=True)
        try:
            reg.fit(X_temp, Y_temp.ravel())
            mod_score = reg.score(X_temp, Y_temp)
            models_output_dic[mod_name] = {
                "model": reg,
                "mod_score": mod_score,
                "predicted_Y": reg.predict(X_temp)
            }
            adjusted_score = adj_r_sqrd(mod_score)
            if adjusted_score > best_score_all:
                rsquared = mod_score
                best_score_all = adjusted_score
                best_estimator_all = reg
        except Exception as exc:
            # Best-effort: one failing model must not abort the comparison,
            # but say why it failed instead of hiding the error.
            print("####In except: ", mod_name, repr(exc))

    return best_score_all, best_estimator_all, rsquared, models_output_dic
import pandas as pd
try:
    # Current location of KFold (scikit-learn >= 0.18).
    from sklearn.model_selection import KFold
    _LEGACY_KFOLD = False
except ImportError:
    # Very old scikit-learn releases only ship the legacy module.
    from sklearn.cross_validation import KFold
    _LEGACY_KFOLD = True
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor

# Store the algorithms into a dictionary
reg_all = {
    'Linear Regression': LinearRegression(),
    'Support Vector Machine': SVR(),
    'Byesian Ridge': BayesianRidge(),
    'Lasso': Lasso(),
    'K Neighbors Regression': KNeighborsRegressor(n_neighbors=2)
}


# Read Data
def read_file(filename, n_fold_input=5):
    """Load a tab-separated data file and prepare K-fold splits for it.

    Parameters
    ----------
    filename : str or path-like
        Header-less, tab-separated file. Every column but the last is a
        feature; the last column is the target.
    n_fold_input : int, default 5
        Number of folds.

    Returns
    -------
    tuple
        ``(data_x, data_y, kf)`` where ``data_x`` holds the feature columns,
        ``data_y`` the target column, and ``kf`` is an iterable of
        ``(train_indices, test_indices)`` pairs, one per fold. With the
        legacy ``sklearn.cross_validation`` module this is the ``KFold``
        object itself; with ``sklearn.model_selection`` it is the
        materialised list of splits, so ``for train, test in kf`` works
        either way.
    """
    data = pd.read_csv(filename, sep='\t', header=None)
    data_x = data.iloc[:, :-1]
    data_y = data.iloc[:, -1]

    # Split data by KFold
    if _LEGACY_KFOLD:
        kf = KFold(len(data_y), n_fold_input)
    else:
        # The modern KFold is not iterable itself; materialise the splits so
        # callers that loop over `kf` (or pass it as `cv=`) keep working.
        kf = list(KFold(n_splits=n_fold_input).split(data_x))
    return data_x, data_y, kf
def BayesianRidge_Model():
    """Print the mean 10-fold cross-validated RMSE of a default BayesianRidge.

    Training data comes from ``load_data()``; the test split it also returns
    is not used here.
    """
    features, targets, _x_test, _ = load_data()
    estimator = BayesianRidge()

    # The scorer reports *negative* MSE per fold, so flip the sign before
    # taking the square root to obtain one RMSE value per fold.
    neg_mse_per_fold = cross_val_score(
        estimator,
        features,
        targets,
        cv=10,
        scoring='neg_mean_squared_error',
    )
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    print(np.mean(rmse_per_fold))
mean_squared_error(y_test, model.predict(X_test))) logging.info("Linear Regression | MSE: " + str(mean_squared_error(y_test, model.predict(X_test)))) ##### Decision Tree Regression ##### from sklearn.tree import DecisionTreeRegressor model = DecisionTreeRegressor() model.fit(X_train, y_train) print("Decision Tree Regression | Accuracy Score:", model.score(X_test, y_test)) logging.info("Decision Tree Regression | Accuracy Score: " + str(model.score(X_test, y_test))) print("Decision Tree Regression | MSE:", mean_squared_error(y_test, model.predict(X_test))) logging.info("Decision Tree Regression | MSE: " + str(mean_squared_error(y_test, model.predict(X_test)))) ##### Bayesian Ridge ##### from sklearn.linear_model import BayesianRidge model = BayesianRidge() model.fit(X_train, y_train) print("Bayesian Ridge | Accuracy Score:", model.score(X_test, y_test)) logging.info("Bayesian Ridge | Accuracy Score: " + str(model.score(X_test, y_test))) print("Bayesian Ridge | MSE:", mean_squared_error(y_test, model.predict(X_test))) logging.info("Bayesian Ridge | MSE: " + str(mean_squared_error(y_test, model.predict(X_test))))
pca = PCA(n_components=410) X_scaled=pca.fit_transform(X_scaled) test_X_scaled = pca.transform(test_X_scaled) print(X_scaled.shape, test_X_scaled.shape) '''modeling&evalution''' #34 # define cross validation strategy def rmse_cv(model,X,y): rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5)) return rmse #35 #We choose 13 models and use 5-folds cross-calidation to evaluate these models. models = [LinearRegression(),Ridge(),Lasso(alpha=0.01,max_iter=10000),RandomForestRegressor(),GradientBoostingRegressor(),SVR(),LinearSVR(), ElasticNet(alpha=0.001,max_iter=10000),SGDRegressor(),BayesianRidge(),KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5), ExtraTreesRegressor(),XGBRegressor()] #36 names = ["LR", "Ridge", "Lasso", "RF", "GBR", "SVR", "LinSVR", "Ela","SGD","Bay","Ker","Extra","Xgb"] for name, model in zip(names, models): score = rmse_cv(model, X_scaled, y_log) print("{}: {:.6f}, {:.4f}".format(name,score.mean(),score.std())) #37 #Next we do some hyperparameters tuning. First define a gridsearch method. class grid(): def __init__(self, model): self.model = model def grid_get(self, X, y, param_grid):
def task2(data):
    """Fit ten regressors on price features and forecast the adjusted close.

    The last 1% of rows (rounded up) have no known future label; their
    scaled features are used to produce one forecast per model. The
    forecasts are attached to new rows dated on the consecutive calendar
    days following the last index label of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'Adj Close', 'Volume', 'High', 'Low',
        'Close' and 'Open'. Index labels must support adding a
        ``datetime.timedelta`` and calling ``strftime`` (e.g. timestamps).

    Returns
    -------
    tuple of 12 lists
        ``(dates, adj_close, forecast_reg, forecast_pol2, forecast_pol3,
        forecast_knn, forecast_las, forecast_byr, forecast_lar,
        forecast_omp, forecast_ard, forecast_sgd)``. ``dates`` are the index
        labels formatted as 'YYYY-MM-DD'; every list has one entry per row
        of the extended frame, with NaN where a column has no value.

    Side effects
    ------------
    Prints each model's test-set score (times 100) to stdout.
    """
    df = data
    # Explicit copy so the column assignments below never touch `data`.
    dfreg = df.loc[:, ['Adj Close', 'Volume']].copy()
    dfreg['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
    dfreg['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0

    # Missing values are replaced by a sentinel (they are not dropped).
    dfreg.fillna(value=-99999, inplace=True)

    # We want to separate 1 percent of the data to forecast
    forecast_out = int(math.ceil(0.01 * len(dfreg)))

    # The label is the adjusted close `forecast_out` rows into the future.
    forecast_col = 'Adj Close'
    dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)

    # Keyword form: the positional `axis` argument is rejected by pandas 2.x.
    X = np.array(dfreg.drop(columns=['label']))
    # Scale the features so every column has a comparable distribution.
    X = preprocessing.scale(X)

    # Rows without a known label are kept aside for forecasting; the rest
    # are used for model fitting and evaluation.
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]
    y = np.array(dfreg['label'])[:-forecast_out]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

    # (column suffix, label used in the printed report, estimator)
    # The order fixes both the print order and the order of the returned
    # forecast lists.
    models = [
        ('reg', 'linear regression', LinearRegression(n_jobs=-1)),
        ('pol2', 'quadratic regression 2',
         make_pipeline(PolynomialFeatures(2), Ridge())),
        ('pol3', 'quadratic regression 3',
         make_pipeline(PolynomialFeatures(3), Ridge())),
        ('knn', 'knn regression', KNeighborsRegressor(n_neighbors=2)),
        ('las', 'lasso regression', Lasso()),
        ('byr', 'Bayesian Ridge regression', BayesianRidge()),
        ('lar', 'Lasso LARS regression', LassoLars(alpha=.1)),
        ('omp', 'OMP regression',
         OrthogonalMatchingPursuit(n_nonzero_coefs=2)),
        ('ard', 'ARD regression', ARDRegression(compute_score=True)),
        ('sgd', 'SGD regression',
         SGDRegressor(random_state=0, max_iter=1000, tol=1e-3)),
    ]

    for _, _, estimator in models:
        estimator.fit(X_train, y_train)

    # "Confidence" here is simply each estimator's `score` on the test split.
    for _, label, estimator in models:
        confidence = estimator.score(X_test, y_test)
        print(f'The {label} confidence is:', confidence * 100)

    forecasts = {
        key: estimator.predict(X_lately) for key, _, estimator in models
    }

    # Capture the last observed date ONCE, before any row is appended. The
    # previous code re-derived it with a hard-coded `iloc[-26]`, which is
    # only correct when exactly 25 forecast rows were appended.
    last_date = dfreg.index[-1]
    forecast_dates = [
        last_date + datetime.timedelta(days=offset)
        for offset in range(1, forecast_out + 1)
    ]

    # Append one all-NaN row per forecast date.
    dfreg['Forecast_reg'] = np.nan
    for when in forecast_dates:
        dfreg.loc[when] = [np.nan] * len(dfreg.columns)

    # Fill one forecast column per model. `.loc[row, col]` writes into the
    # frame itself; chained `dfreg[col].loc[row] = v` may write to a
    # temporary copy instead.
    for key, _, _ in models:
        column = 'Forecast_' + key
        dfreg[column] = np.nan
        for when, value in zip(forecast_dates, forecasts[key]):
            dfreg.loc[when, column] = value

    date_labels = [stamp.strftime('%Y-%m-%d') for stamp in dfreg.index]
    forecast_lists = [
        dfreg['Forecast_' + key].to_list() for key, _, _ in models
    ]
    return tuple([date_labels, dfreg['Adj Close'].to_list()] + forecast_lists)