def runARDRegressor(self): lm = ARDRegression(fit_intercept=True, normalize=True) print("runARDRegressor\n") lm.fit(self.m_X_train, self.m_y_train) predictY = lm.predict(self.m_X_test) score = lm.score(self.m_X_test, self.m_y_test) predictTraingY = lm.predict(self.m_X_train) self.displayPredictPlot(predictY) self.displayResidualPlot(predictY, predictTraingY) self.dispalyModelResult(lm, predictY, score)
def task2(data): df = data dfreg = df.loc[:, ['Adj Close', 'Volume']] dfreg['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0 dfreg['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0 # Drop missing value dfreg.fillna(value=-99999, inplace=True) # We want to separate 1 percent of the data to forecast forecast_out = int(math.ceil(0.01 * len(dfreg))) # Separating the label here, we want to predict the AdjClose forecast_col = 'Adj Close' dfreg['label'] = dfreg[forecast_col].shift(-forecast_out) X = np.array(dfreg.drop(['label'], 1)) # Scale the X so that everyone can have the same distribution for linear regression X = preprocessing.scale(X) # Finally We want to find Data Series of late X and early X (train) for model generation and evaluation X_lately = X[-forecast_out:] X = X[:-forecast_out] # Separate label and identify it as y y = np.array(dfreg['label']) y = y[:-forecast_out] #Split data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) ################## ################## ################## # Linear regression clfreg = LinearRegression(n_jobs=-1) clfreg.fit(X_train, y_train) # Quadratic Regression 2 clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge()) clfpoly2.fit(X_train, y_train) # Quadratic Regression 3 clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge()) clfpoly3.fit(X_train, y_train) # KNN Regression clfknn = KNeighborsRegressor(n_neighbors=2) clfknn.fit(X_train, y_train) # Lasso Regression clflas = Lasso() clflas.fit(X_train, y_train) # Multitask Lasso Regression # clfmtl = MultiTaskLasso(alpha=1.) # clfmtl.fit(X_train, y_train).coef_ # Bayesian Ridge Regression clfbyr = BayesianRidge() clfbyr.fit(X_train, y_train) # Lasso LARS Regression clflar = LassoLars(alpha=.1) clflar.fit(X_train, y_train) # Orthogonal Matching Pursuit Regression clfomp = OrthogonalMatchingPursuit(n_nonzero_coefs=2) clfomp.fit(X_train, y_train) # Automatic Relevance Determination Regression clfard = ARDRegression(compute_score=True) clfard.fit(X_train, y_train) # Logistic Regression # clflgr = linear_model.LogisticRegression(penalty='l1', solver='saga', tol=1e-6, max_iter=int(1e6), warm_start=True) # coefs_ = [] # for c in cs: # clflgr.set_params(C=c) # clflgr.fit(X_train, y_train) # coefs_.append(clflgr.coef_.ravel().copy()) clfsgd = SGDRegressor(random_state=0, max_iter=1000, tol=1e-3) clfsgd.fit(X_train, y_train) ################## ################## ################## #Create confindence scores confidencereg = clfreg.score(X_test, y_test) confidencepoly2 = clfpoly2.score(X_test, y_test) confidencepoly3 = clfpoly3.score(X_test, y_test) confidenceknn = clfknn.score(X_test, y_test) confidencelas = clflas.score(X_test, y_test) # confidencemtl = clfmtl.score(X_test, y_test) confidencebyr = clfbyr.score(X_test, y_test) confidencelar = clflar.score(X_test, y_test) confidenceomp = clfomp.score(X_test, y_test) confidenceard = clfard.score(X_test, y_test) confidencesgd = clfsgd.score(X_test, y_test) # results print('The linear regression confidence is:', confidencereg * 100) print('The quadratic regression 2 confidence is:', confidencepoly2 * 100) print('The quadratic regression 3 confidence is:', confidencepoly3 * 100) print('The knn regression confidence is:', confidenceknn * 100) print('The lasso regression confidence is:', confidencelas * 100) # print('The lasso regression confidence is:',confidencemtl*100) print('The Bayesian Ridge regression confidence is:', confidencebyr * 100) print('The Lasso LARS regression confidence is:', confidencelar * 100) print('The OMP regression confidence is:', confidenceomp * 100) print('The ARD regression confidence is:', confidenceard * 100) print('The SGD regression confidence is:', confidencesgd * 100) #Create new columns forecast_reg = clfreg.predict(X_lately) forecast_pol2 = clfpoly2.predict(X_lately) forecast_pol3 = clfpoly3.predict(X_lately) forecast_knn = clfknn.predict(X_lately) forecast_las = clflas.predict(X_lately) forecast_byr = clfbyr.predict(X_lately) forecast_lar = clflar.predict(X_lately) forecast_omp = clfomp.predict(X_lately) forecast_ard = clfard.predict(X_lately) forecast_sgd = clfsgd.predict(X_lately) #Process all new columns data dfreg['Forecast_reg'] = np.nan last_date = dfreg.iloc[-1].name last_unix = last_date next_unix = last_unix + datetime.timedelta(days=1) for i in forecast_reg: next_date = next_unix next_unix += datetime.timedelta(days=1) dfreg.loc[next_date] = [np.nan for _ in range(len(dfreg.columns))] dfreg['Forecast_reg'].loc[next_date] = i dfreg['Forecast_pol2'] = np.nan last_date = dfreg.iloc[-26].name last_unix = last_date next_unix = last_unix + datetime.timedelta(days=1) for i in forecast_pol2: next_date = next_unix next_unix += datetime.timedelta(days=1) dfreg['Forecast_pol2'].loc[next_date] = i dfreg['Forecast_pol3'] = np.nan last_date = dfreg.iloc[-26].name last_unix = last_date next_unix = last_unix + datetime.timedelta(days=1) for i in forecast_pol3: next_date = next_unix next_unix += datetime.timedelta(days=1) dfreg['Forecast_pol3'].loc[next_date] = i dfreg['Forecast_knn'] = np.nan last_date = dfreg.iloc[-26].name last_unix = last_date next_unix = last_unix + datetime.timedelta(days=1) for i in forecast_knn: next_date = next_unix next_unix += datetime.timedelta(days=1) dfreg['Forecast_knn'].loc[next_date] = i dfreg['Forecast_las'] = np.nan last_date = dfreg.iloc[-26].name last_unix = last_date next_unix = last_unix + datetime.timedelta(days=1) for i in forecast_las: next_date = next_unix next_unix += datetime.timedelta(days=1) dfreg['Forecast_las'].loc[next_date] = i dfreg['Forecast_byr'] = np.nan last_date = dfreg.iloc[-26].name last_unix = last_date next_unix = last_unix + datetime.timedelta(days=1) for i in forecast_byr: next_date = next_unix next_unix += datetime.timedelta(days=1) dfreg['Forecast_byr'].loc[next_date] = i dfreg['Forecast_lar'] = np.nan last_date = dfreg.iloc[-26].name last_unix = last_date next_unix = last_unix + datetime.timedelta(days=1) for i in forecast_lar: next_date = next_unix next_unix += datetime.timedelta(days=1) dfreg['Forecast_lar'].loc[next_date] = i dfreg['Forecast_omp'] = np.nan last_date = dfreg.iloc[-26].name last_unix = last_date next_unix = last_unix + datetime.timedelta(days=1) for i in forecast_omp: next_date = next_unix next_unix += datetime.timedelta(days=1) dfreg['Forecast_omp'].loc[next_date] = i dfreg['Forecast_ard'] = np.nan last_date = dfreg.iloc[-26].name last_unix = last_date next_unix = last_unix + datetime.timedelta(days=1) for i in forecast_ard: next_date = next_unix next_unix += datetime.timedelta(days=1) dfreg['Forecast_ard'].loc[next_date] = i dfreg['Forecast_sgd'] = np.nan last_date = dfreg.iloc[-26].name last_unix = last_date next_unix = last_unix + datetime.timedelta(days=1) for i in forecast_sgd: next_date = next_unix next_unix += datetime.timedelta(days=1) dfreg['Forecast_sgd'].loc[next_date] = i return dfreg.index.format(formatter=lambda x: x.strftime( '%Y-%m-%d')), dfreg['Adj Close'].to_list( ), dfreg['Forecast_reg'].to_list(), dfreg['Forecast_pol2'].to_list( ), dfreg['Forecast_pol3'].to_list(), dfreg['Forecast_knn'].to_list( ), dfreg['Forecast_las'].to_list(), dfreg['Forecast_byr'].to_list( ), dfreg['Forecast_lar'].to_list(), dfreg['Forecast_omp'].to_list( ), dfreg['Forecast_ard'].to_list(), dfreg['Forecast_sgd'].to_list()
def calc_ard_regression(X_train, X_test, y_train, y_test): reg = ARDRegression().fit(X_train, y_train) reg.score(X_train, y_train) return calc_spearmanr_from_regressor(reg, X_test, y_test)
# ## ARD регрессия # In[356]: ard = ARDRegression() ard.fit(X_train, y_train) np.random.seed(0) print("среднее значение отклика обучающей выборки: %f" % np.mean(ard.predict(X_train))) print( "корень из среднеквадратичной ошибки прогноза средним значением на обучающей выборке, обучение: %f" % np.sqrt(mean_squared_error(ard.predict(X_train), y_train))) print( "корень из среднеквадратичной ошибки прогноза средним значением на обучающей выборке, тест: %f" % np.sqrt(mean_squared_error(ard.predict(X_test), y_test))) print('коэффициент детерминации: %f' % ard.score(X_test, y_test)) print('абсолютная ошибка: %f' % mean_absolute_error(y_test, ard.predict(X_test))) # In[357]: print(list(np.array(y_test)[:10])) print(list(map(lambda x: int(round(x)), (ard.predict(X_test))[:10]))) # ## Случайный лес # Построим на обучающей выборке случайный лес: # In[358]: rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=3)
print "测试集R2:", r2_score(test_Y, test_Y_pred) tss, rss, ess, r2 = xss(Y, bayesianRidge.predict(X)) print "TSS(Total Sum of Squares): ", tss print "RSS(Residual Sum of Squares): ", rss print "ESS(Explained Sum of Squares): ", ess print "R^2: ", r2 print "\n**********测试ARDRegression类**********" ardRegression = ARDRegression() # 拟合训练集 ardRegression.fit(train_X, train_Y.values.ravel()) # 打印模型的系数 print "系数:", ardRegression.coef_ print "截距:", ardRegression.intercept_ print '训练集R2: ', r2_score(train_Y, ardRegression.predict(train_X)) # 对于线性回归模型, 一般使用均方误差(Mean Squared Error,MSE)或者 # 均方根误差(Root Mean Squared Error,RMSE)在测试集上的表现来评该价模型的好坏. test_Y_pred = ardRegression.predict(test_X) print "测试集得分:", ardRegression.score(test_X, test_Y) print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred) print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred)) print "测试集R2:", r2_score(test_Y, test_Y_pred) tss, rss, ess, r2 = xss(Y, ardRegression.predict(X)) print "TSS(Total Sum of Squares): ", tss print "RSS(Residual Sum of Squares): ", rss print "ESS(Explained Sum of Squares): ", ess print "R^2: ", r2