def forecast_mlr(_data, _col, predict, option, session_id):
    """Fit a regression on all but the trailing hold-out rows and predict every row.

    Side effects: writes the fitted model's text summary to
    ``UPLOADE_DIR + '/temp/mlr/text/summary.txt'`` and saves one
    ``SummaryModel`` record with the fit statistics.

    Parameters
    ----------
    _data : pandas.DataFrame
        Input data containing the columns named in ``_col`` and ``predict``.
    _col : list
        Names of the explanatory columns.
    predict : str
        Name of the response column.
    option : sequence
        ``option[0]``: number of trailing rows held out of the fit;
        ``option[1]``: estimation method, one of ``'ols'``, ``'wls'``,
        ``'glm_po'``; ``option[2]``: model label stored in ``SummaryModel``;
        ``option[3]``: a constant term is added unless ``int(option[3]) == 1``.
    session_id
        Not used by this function; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(data_stat, data_stat_preview, data_ori)``. ``data_stat`` is always
        an empty frame. The other two are independent copies of a frame with
        the columns ``index``, ``original`` and ``predict`` covering every row
        of ``_data``.

    Raises
    ------
    ValueError
        If ``option[1]`` is not one of the supported estimation methods.
    """
    estimate_method = option[1]
    holdout = int(option[0])

    # Never populated here; returned empty so the 3-tuple shape is unchanged.
    data_stat = pd.DataFrame(columns=[])

    # Model inputs. ``_data[:-0]`` would be an empty frame, so a zero
    # hold-out keeps every row for fitting.
    _data_model = _data[:-holdout] if holdout != 0 else _data
    X = _data_model[_col].fillna(0)
    if int(option[3]) != 1:
        X = sm.add_constant(X)  # include a constant term
    Y = _data_model[predict].fillna(0)
    X_all = _data[_col].fillna(0)
    Y_all = _data[predict].fillna(0)

    # Build the prediction model.
    if estimate_method == 'ols':
        model = smf.OLS(Y, X)
    elif estimate_method == 'wls':
        model = smf.WLS(Y, X)
    elif estimate_method == 'glm_po':
        model = smf.GLM(Y, X, family=sm.families.Poisson())
    else:
        raise ValueError(
            "Unsupported estimation method: {!r}".format(estimate_method)
        )
    result = model.fit()

    # Save the summary as text; the context manager closes the file even if
    # rendering the summary fails.
    with open(UPLOADE_DIR + '/temp/mlr/text/summary.txt', 'w') as f:
        f.write(str(result.summary()))

    # Compute predictions as the linear combination of the fitted
    # coefficients and the (NaN-filled) inputs, for every method.
    Y_pre = pd.DataFrame(columns=[])
    for i in _col:
        Y_pre[i] = result.params[i] * X_all[i]
    # Add the intercept only when a constant was actually fitted. Using
    # ``result.params[0]`` unconditionally would add the first slope
    # coefficient whenever no constant is present.
    intercept = 0.0
    if 'const' in result.params.index and 'const' not in _col:
        intercept = result.params['const']
    pred_df = Y_pre.sum(axis=1) + intercept

    # NOTE(review): the 'glm_po' result object may not expose
    # ``rsquared``/``rsquared_adj`` — confirm this path works as intended.
    insert_data = SummaryModel(
        model=option[2],
        method=estimate_method,
        aic=round(result.aic, 3),
        bic=round(result.bic, 3),
        rsq=round(result.rsquared, 3),
        rsq_adj=round(result.rsquared_adj, 3),
        holdout=holdout,
    )
    insert_data.save()

    # Original + predicted values, used both for the preview and for the
    # graph on the Result screen.
    # NOTE(review): ``Y_all`` and ``pred_df`` are aligned on a 0..n-1 index
    # here; presumably ``_data`` uses a default integer index — confirm.
    data_rsp = pd.DataFrame(columns=[])
    data_rsp['index'] = list(range(len(_data)))
    data_rsp['original'] = Y_all
    data_rsp['predict'] = pred_df

    # The accumulator frames started empty, so "appending" reduces to a copy
    # (``DataFrame.append`` no longer exists in pandas 2.x).
    data_stat_preview = data_rsp.copy()
    data_ori = data_rsp.copy()

    return data_stat, data_stat_preview, data_ori
# NOTE(review): this fragment was reconstructed from whitespace-collapsed
# source. The loop / if nesting below is the most plausible reading — confirm
# against the original file. Names such as ``mreturnP``, ``pos``, ``currYM``,
# ``beta`` and ``betas`` are not defined in this fragment.
nextYM = list(mreturnP.YM)[pos + 61]
currRF = list(mreturnP.RF)[pos + 60]

# (output label, factor column regressed on), in output-column order.
# NOTE(review): the label 'CMA' is fitted on the 'RMW' column and the label
# 'RF' on the 'CMA' column. This looks like shifted labels, but it is kept
# unchanged because later code may rely on these column names — confirm.
_factor_columns = [
    ('Mkt_RF', 'Mkt_RF'),
    ('SMB', 'SMB'),
    ('HML', 'HML'),
    ('CMA', 'RMW'),
    ('RF', 'CMA'),
]

for tkt in common_tickers:
    stockReturn = currReturn[currReturn.YM.isin(mRets.YM)][tkt]
    # Drop every row that has a NaN in any column.
    combined = pd.concat([stockReturn, mRets], axis=1)
    cleaned = combined.dropna(axis=0)
    betaReturn = cleaned[tkt] - cleaned['RF']
    # Fit one single-factor weighted regression per factor.
    # Compare by value: ``is not 0`` tests object identity, not equality.
    if len(cleaned) != 0:
        # NOTE(review): ``1/23`` is integer division (== 0) under Python 2
        # without ``from __future__ import division`` — confirm intent.
        aweights = [math.pow(0.5, math.pow((1/23), x))
                    for x in range(len(cleaned))]
        row = {'Ticker': [tkt], 'YM': [currYM]}
        for label, factor in _factor_columns:
            fit = sm.WLS(betaReturn.values, cleaned[factor].values,
                         weights=aweights).fit()
            row[label] = [fit.params[0]]
        params = pd.DataFrame(row)
        beta = pd.concat([beta, params])
betas = pd.concat([betas, beta])

# Mean liquidity per ticker for the current period. A single combined mask
# avoids the implicit reindex that chained boolean indexing triggers.
currLiquidity = liquidity[
    liquidity.Ticker.isin(common_tickers) & (liquidity.YM == currYM)
].groupby('Ticker').mean()
currLiquidity = currLiquidity.reset_index()
currLiquidity.index = currLiquidity.Ticker

# NOTE(review): the second mask is built from ``equityPrice`` while the frame
# being filtered is ``equityPriceC`` — possibly a typo; left unchanged.
currPrice = equityPriceC[equityPriceC.Ticker.isin(common_tickers)][equityPrice.YM == currYM].groupby('Ticker').mean()
currPrice = currPrice.reset_index()
currPrice.index = currPrice.Ticker
existingTickers = currPrice.Ticker.unique()
def robust_irls_regression(x, y, c, penalty, max_iter=100, tol=1e-8):
    """Run a robust linear regression with iterated weighted least squares.

    Parameters
    ----------
    x : float
        predictor vector of size n
    y : float
        target variable of size n
    c : float
        Tuning hyperparameter passed to the weight function of the chosen
        penalty.
    penalty : string
        Type of penalty applied during the iterated weighted least squares
        (IWLS) regression. Weights are updated for 'Huber' and 'Tukey'; for
        any other non-None value the initial weights are reused unchanged.
    max_iter : int
        Maximum number of iterations for IWLS. Default is 100
    tol : float
        Tolerance for the norm of the difference between successive
        coefficient estimates and between successive robust estimates of the
        residual spread. Defaults to 1e-8

    Returns
    -------
    coefs : float
        Vector of regression coefficients, intercept first.

    Raises
    ------
    ValueError
        If ``penalty`` is None.
    """
    # Fail fast, before any model fitting is done.
    if penalty is None:
        raise ValueError("Specify either 'Huber' or 'Tukey' penalty!")

    x = np.c_[np.ones(len(x)), x]  # prepend a column of 1's (intercept)

    # Initial coefficients from an unweighted least-squares fit.
    results = smf.WLS(y, sm.add_constant(x)).fit()
    coefs = results.params
    old_coefs = coefs
    residuals = results.resid

    # Initial robust estimate of the spread of the residuals.
    robust_sd = sm_scale.mad(residuals)
    old_robust_sd = robust_sd

    # Initial weights, one per observation. They are kept as a vector:
    # ``x.T * weights`` equals ``x.T @ diag(weights)`` without building an
    # n-by-n matrix.
    weights = np.asarray(1.0 / (residuals**2))

    for _ in range(max_iter):
        # Update regression coefficients (weighted normal equations).
        weighted_xt = x.T * weights
        coefs = LA.solve(np.dot(weighted_xt, x), np.dot(weighted_xt, y))

        # Update residuals and their robust spread, then standardize.
        residuals = y - np.dot(coefs, x.T)
        robust_sd = sm_scale.mad(residuals)
        standardized_residuals = residuals / robust_sd

        # Update weights. ``huber_weight`` / ``tukey_weight`` are defined
        # outside this block; presumably they return one weight per
        # observation — confirm.
        if penalty == 'Huber':
            weights = np.asarray(huber_weight(standardized_residuals, c=c))
        elif penalty == "Tukey":
            weights = np.asarray(tukey_weight(standardized_residuals, c=c))

        # Stop once both the coefficients and the residual spread are stable.
        if LA.norm(robust_sd - old_robust_sd) < tol and \
                LA.norm(coefs - old_coefs) < tol:
            break

        old_coefs = coefs
        old_robust_sd = robust_sd

    return coefs
def iterative_wls(x, y, tol=1e-6, max_iter=100):
    """Run a weighted least squares regression with iteratively refined variance.

    Each pass estimates the conditional variance of the residuals by a kernel
    regression of the log-squared residuals on the predictor(s), then refits
    the linear model with the inverse of that variance as weights.
    (This is computationally intensive!)

    Parameters
    ----------
    x : float
        predictor vector of size n (an n-by-k array of continuous predictors
        is also accepted)
    y : float
        target variable of size n
    tol : float
        Iteration stops once the largest absolute change in any coefficient
        between successive fits is at most ``tol``. Defaults to 1e-6
    max_iter : int
        Maximum number of refits. Default is 100

    Returns
    -------
    coefs : float
        Vector of regression coefficients, intercept first.
    """
    # Keep the raw predictors (without the intercept column) for the
    # variance model; one continuous ('c') variable per column.
    predictors = np.asarray(x)
    if predictors.ndim == 1:
        predictors = predictors.reshape(-1, 1)
    var_type = 'c' * predictors.shape[1]

    x = np.c_[np.ones(len(x)), x]  # prepend a column of 1's (intercept)
    iteration = 0
    old_coefs = None

    # Unweighted fit for the initial estimates.
    regression = sm.WLS(y, sm.add_constant(x)).fit()
    coefs = regression.params

    while old_coefs is None or (np.max(abs(coefs - old_coefs)) > tol
                                and iteration < max_iter):
        # A residual can be exactly 0, where the log is undefined; silence
        # the resulting warnings and substitute a placeholder value.
        with np.errstate(divide='ignore', invalid='ignore'):
            old_coefs = coefs
            squared_residuals = regression.resid**2
            # NOTE(review): the placeholder 1e-12 is used as the *log* of
            # the squared residual (i.e. a squared residual of about 1);
            # ``np.log(1e-12)`` may have been intended — confirm.
            log_squared_residuals = np.where(squared_residuals > 0,
                                             np.log(squared_residuals),
                                             1e-12)
            # Conditional mean of the log-squared residuals given the
            # predictors: the residuals are the response (endog) and the
            # predictors the regressors (exog).
            model = nparam_kreg.KernelReg(endog=log_squared_residuals,
                                          exog=predictors,
                                          var_type=var_type)
            # Exponentiate to get the predicted variance; weights are its
            # inverse.
            weights = np.exp(model.fit()[0])**-1

        # Update regression coefficients.
        regression = sm.WLS(y, sm.add_constant(x), weights=weights).fit()
        coefs = regression.params
        iteration += 1

    return coefs
# NOTE(review): reconstructed from a whitespace-collapsed fragment. ``df``,
# ``w``, ``arange``, ``DataFrame`` and the initial ``lm`` object are not
# defined in this fragment.
data = np.matrix(df)
x, y = data[:, 1], data[:, 2]
lm.fit(x, y)

# Formula-based least-squares fit; ``lm`` is rebound to the fitted result.
lm = sm.ols(formula='y ~ x', data=df).fit()
print(lm.summary())  # parenthesised form works on Python 2 and 3
exog = pd.DataFrame({'x': [10, 15]})
lm.predict(exog)
lm.resid
lm.params

# Weighted Least Squares
nsamp = df.shape[0]
Y = np.array(df['y'])
X = np.c_[np.ones(nsamp), np.array(df['x'])]
fm1 = sm.WLS(Y, X, weights=1 / w**2)
res_fm1 = fm1.fit()
res_fm1.summary()

# Plots
plt.figure()
plt.scatter(x, y)
x_range = arange(0, 20, .1)
exog = DataFrame({'x': x_range})
y_pred = lm.predict(exog)
plt.plot(x_range, y_pred)
plt.close()

# Q-Q plot of the residuals (the original passed the undefined name
# ``resid`` instead of ``res``).
res = lm.resid
fig = sma.qqplot(res)
plt.show()