def test_linearity(x, y, n_knots=5, verbose=True):
    """Test linearity between two variables.

    Run a linear regression of y on x, and take the residuals.
    Fit the residuals with a natural spline with `n_knots` knots.
    Conduct a joint F-test for all columns in the natural spline basis
    matrix.

    Parameters
    ----------
    x, y : array-like
        Predictor and response of the first-stage regression.
    n_knots : int
        The spline basis is requested from patsy with ``df = n_knots - 1``
        and a centering constraint.
    verbose : bool
        When true, print a one-line report (N, df, F, p).

    Returns
    -------
    The p-value of the joint F-test, rounded to six decimals.

    Example:
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> x = np.linspace(0., 1., 101)
    >>> y = 5 * x + 3 + rng.random(size=101) / 5
    >>> test_linearity(x, y, n_knots=5, verbose=False)
    0.194032
    """
    residuals = OLS(y, add_constant(x)).fit().resid
    basis_matrix = patsy.dmatrix(
        f"cr(x, df={n_knots - 1}, constraints='center') - 1",
        {'x': x},
        return_type='dataframe')
    results = OLS(residuals, basis_matrix).fit()
    p_value = np.round(results.f_pvalue, 6)
    # The report used to be printed unconditionally, which made the
    # ``verbose=False`` call in the example above emit an extra line.
    if verbose:
        nobs = results.nobs
        f_value = results.fvalue
        print('Test for Linearity: '
              f'N = {nobs:.0f}; df={nobs - n_knots - 1:.0f}; '
              f'F = {f_value:.3f}; p = {p_value:.6f}.')
    return p_value
def _fit_backward(self):
    """Refit the stored model by backward elimination on p-values.

    The endog/exog arrays of ``self._model.model`` are wrapped in pandas
    objects indexed by ``self._observations_idx``.  The regression is then
    refitted repeatedly, each time dropping the non-intercept column with
    the largest p-value, until that largest p-value no longer exceeds
    ``self.sig_level_removal``.  The final fit replaces ``self._model``.

    A column named 'Intercept' must be present in the design matrix.
    """
    source = self._model.model
    endog = pd.Series(source.endog.copy(),
                      name=self.dependent_variable,
                      index=self._observations_idx)
    exog = pd.DataFrame(source.exog,
                        columns=source.exog_names,
                        index=self._observations_idx)
    while True:
        results = OLS(endog, exog, missing='drop').fit()
        candidates = results.pvalues.drop('Intercept')
        # A NaN maximum (no candidates left) also ends the loop.
        if not candidates.max() > self.sig_level_removal:
            break
        exog = exog.drop(candidates.idxmax(), axis=1)
    self._model = results
    return
def get_cointLst(corrList, df_is):
    """Keep the pairs whose spread residuals look stationary both ways.

    Called in main.  For every ``pair`` in ``corrList`` (a list whose first
    two items are column names of ``df_is`` and which has at least four
    items), each series is regressed on the other with a constant, and the
    ADF p-value of both residual series is computed.  A pair is kept only
    when both p-values are below 0.01; the direction with the smaller
    p-value decides the naming and which regression parameters are stored.

    Returns a list of rows: ``[label, <pair fields...>, adf_pvalue,
    *ols_params]``.
    """
    kept = []
    for pair in corrList:
        first, second = pair[0], pair[1]
        series_a = df_is[first].values
        series_b = df_is[second].values
        b_on_a = OLS(series_b, add_constant(series_a)).fit()
        a_on_b = OLS(series_a, add_constant(series_b)).fit()

        p_forward = adfuller(b_on_a.resid)[1]
        if not p_forward < 0.01:
            continue
        p_reverse = adfuller(a_on_b.resid)[1]
        # Strong cointegration is required on both sides.
        if not p_reverse < 0.01:
            continue

        if p_forward < p_reverse:
            row = (["{0}_{1}".format(first, second)] + pair + [p_forward]
                   + list(b_on_a.params))
        else:
            row = (["{0}_{1}".format(second, first)]
                   + [second, first, pair[2], pair[3], p_reverse]
                   + list(a_on_b.params))
        kept.append(row)
    return kept
def backwardElimination(x, SL):
    """Backward elimination on p-values with an adjusted-R^2 rollback.

    Repeatedly fits ``OLS(y, x)`` and removes the column whose p-value is
    the largest while that p-value exceeds ``SL``.  If removing a column
    does not increase the adjusted R^2, a rolled-back matrix is returned
    immediately.

    :param x: 2-D numpy array of regressors (indexed as ``x[:, j]``).
    :param SL: significance level above which a column is removed.
    :return: the reduced (or rolled-back) regressor matrix.

    NOTE(review): ``y`` is not a parameter; it is read from the enclosing
    scope.
    NOTE(review): the original indentation was lost; the trailing
    ``else: break`` is attached to ``if maxVar > SL`` here - confirm.
    """
    numVars = len(x[0])
    # Scratch buffer for removed columns.  The shape is hard-coded, so the
    # column copy below only works for 50-row inputs with at most 6
    # columns, and the integer dtype truncates fractional values.
    temp = np.zeros((50, 6)).astype(int)
    for i in range(0, numVars):
        regressor_OLS = OLS(y, x).fit()
        print(regressor_OLS.summary())
        maxVar = max(regressor_OLS.pvalues).astype(float)
        adjR_before = regressor_OLS.rsquared_adj.astype(float)
        if maxVar > SL:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    # Remember the column before deleting it.
                    temp[:, j] = x[:, j]
                    x = np.delete(x, j, 1)
                    tmp_regressor = OLS(y, x).fit()
                    adjR_after = tmp_regressor.rsquared_adj.astype(float)
                    if (adjR_before >= adjR_after):
                        # Removal did not help: append columns 0 and j of
                        # the scratch buffer, then delete column j of the
                        # stacked matrix, and stop.
                        x_rollback = np.hstack((x, temp[:, [0, j]]))
                        x_rollback = np.delete(x_rollback, j, 1)
                        print(regressor_OLS.summary())
                        return x_rollback
                    else:
                        continue
        else:
            # No p-value exceeds SL any more.
            break
    return x
def stepwise_selection(data, target, SL_in=0.05, SL_out=0.05):
    """Select regressors by bidirectional stepwise p-value screening.

    Forward step: every not-yet-selected column of ``data`` is tried in
    turn alongside the current selection (plus a constant); the one with
    the smallest p-value is added when that p-value is below ``SL_in``.
    Backward step: while the largest p-value among the selected columns is
    at least ``SL_out``, that column is removed.  The search stops when no
    candidate passes the forward test.

    Parameters
    ----------
    data : pandas.DataFrame of candidate regressors.
    target : response passed to ``OLS``.
    SL_in, SL_out : entry and removal significance levels.
        NOTE(review): ``SL_in > SL_out`` can make a column enter and leave
        forever; callers should keep ``SL_in <= SL_out``.

    Returns
    -------
    list of selected column names, in order of entry.
    """
    initial_features = data.columns.tolist()
    best_features = []
    while initial_features:
        # Keep the column order of `data` so ties are broken the same way
        # on every run (a set difference has no stable order).
        remaining_features = [
            name for name in initial_features if name not in best_features
        ]
        # An explicit float dtype is required: an index-only Series is
        # object-typed and idxmin() rejects object dtype on current pandas.
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            model = OLS(
                target,
                sm.add_constant(data[best_features + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        # NaN (nothing left to try) fails this test and ends the search.
        if not min_p_value < SL_in:
            break
        best_features.append(new_pval.idxmin())
        while best_features:
            best_features_with_constant = sm.add_constant(
                data[best_features])
            # Position 0 is the prepended constant.
            p_values = OLS(
                target, best_features_with_constant).fit().pvalues.iloc[1:]
            max_p_value = p_values.max()
            if max_p_value >= SL_out:
                best_features.remove(p_values.idxmax())
            else:
                break
    return best_features
def find_apex(decel):
    """Return the split point that best separates two linear trends.

    Every tenth index label of ``decel`` (starting at position 10) is tried
    as a split.  The 'accelY' values on each side are regressed on a
    constant and a 0..n-1 ramp, and the label with the smallest combined
    sum of squared residuals is returned (the first one on ties).
    """
    def _trend_ssr(segment):
        # SSR of a straight-line fit against the sample position.
        return OLS(segment, add_constant(range(len(segment)))).fit().ssr

    candidates = [
        (t, _trend_ssr(decel[:t]['accelY']), _trend_ssr(decel[t:]['accelY']))
        for t in decel.index[10::10]
    ]
    best_split = min(candidates, key=lambda entry: entry[1] + entry[2])
    return best_split[0]
def calc_gwi(obs,obs_years,reg_type='mon',base_low=1850.,base_high=1900, name=''):
    """Regress observed temperatures on forced anthropogenic/natural responses.

    The observations are re-based to their mean over
    ``base_low <= year < base_high + 1``.  Forcings are read from a fixed
    text file, run through ``fair_scm`` separately for the natural
    (total minus anthropogenic) and anthropogenic parts, re-based over the
    same period and interpolated onto ``obs_years``.  An OLS regression of
    the observations on both responses plus a constant gives the scale
    factors, which are printed.

    :param obs: observed temperatures (array, same length as obs_years).
    :param obs_years: time axis of the observations.
    :param reg_type: 'mon' interpolates at ``obs_years``; any other value
        interpolates at ``obs_years + 0.5``.
    :param base_low: first year of the base period (inclusive).
    :param base_high: last year of the base period (inclusive).
    :param name: label prefixed to the printed scale factors.
    :return: ``(awi, nwi)`` - the scaled anthropogenic and natural series.

    NOTE(review): the original indentation was lost; the "final half year"
    projection is placed after the if/else here - confirm it was not meant
    to run only in the monthly branch (the factor 12 suggests monthly data).
    """
    #Express the observations relative to the base period
    obs = obs - np.mean(obs[np.logical_and(obs_years>=base_low,obs_years<(base_high+1))])

    #Load the best estimate forcings from Piers
    # The path is relative to the current working directory.
    forc_file = './Data/Annualforcings_Mar2014_GHGrevised.txt'
    data = np.genfromtxt(forc_file,skip_header=4)
    # Column 0 holds the years; columns 13 and 14 are used as total and
    # anthropogenic forcing.
    years = data[:,0]
    tot_forc = data[:,13]
    ant_forc = data[:,14]

    #Integrate anthropogenic and natural forcing with standard FAIR parameters
    # The first return value of fair_scm (C) is not used.
    C, t_nat = fair_scm(other_rf=tot_forc-ant_forc)
    C, t_anthro = fair_scm(other_rf=ant_forc)
    #Express relative to the centre of the base period
    t_nat = t_nat - np.mean(t_nat[np.logical_and(years>=base_low,years<base_high+1)])
    t_anthro = t_anthro - np.mean(t_anthro[np.logical_and(years>=base_low,years<base_high+1)])
    # -----------------------------------------------

    # Prepare the temperatures run through FaIR, so they lie on same year-grid as observations, so they can be compared
    # -----------------------------------------------
    #Interpolate the annual forced responses to the grid of the observed data
    if reg_type !='mon':
        t_nat = np.interp(obs_years+0.5, years+0.5, t_nat)
        t_anthro = np.interp(obs_years+0.5, years+0.5, t_anthro)
    else:
        t_nat = np.interp(obs_years, years+0.5, t_nat)
        t_anthro = np.interp(obs_years, years+0.5, t_anthro)

    #Linearly project the final half year
    # Points later than the last forcing mid-year are replaced by a straight
    # line through the last two in-range points; the slope is multiplied by 12.
    t_anthro[obs_years>(years[-1]+0.5)] = 12*(t_anthro[obs_years<=(years[-1]+0.5)][-1] - t_anthro[obs_years<=(years[-1]+0.5)][-2]) * (obs_years[obs_years>(years[-1]+0.5)] - obs_years[obs_years<=(years[-1]+0.5)][-1]) \
    +t_anthro[obs_years<=(years[-1]+0.5)][-1]
    t_nat[obs_years>(years[-1]+0.5)] = 12*(t_nat[obs_years<=(years[-1]+0.5)][-1] - t_nat[obs_years<=(years[-1]+0.5)][-2]) * (obs_years[obs_years>(years[-1]+0.5)] - obs_years[obs_years<=(years[-1]+0.5)][-1]) \
    +t_nat[obs_years<=(years[-1]+0.5)][-1]
    # -----------------------------------------------

    #Use the OLS regression function to complete an OLS regression of observations data on natural and anthropogenic warming with a constant
    y = np.copy(obs)
    x = DataFrame({'x1': (t_anthro), 'x2': (t_nat)})
    # add constant vector on to dataframe we will fit to temp observations
    x = statsmodels.tools.tools.add_constant(x)
    # complete OLS regression of anthropogenic and natural temperatures (found from FaIR integrated best estimate forcing) onto given observed temperature dataset.
    model = OLS(y, x)
    result = model.fit()
    # collect output scaling factors for anthro and natural temperature timeseries
    sf = result.params

    #Form scaled anthropogenic warming index
    awi = t_anthro * sf['x1']
    #Scaled natural warming index
    nwi = t_nat * sf['x2']
    #Scaled total externally forced warming index
    # NOTE(review): gwi is computed but neither returned nor used.
    gwi = awi + nwi

    print(name, ' AWI scale factor: ', sf['x1'], '\n', name, ' NWI scale factor: ', sf['x2'])

    return awi, nwi
def factor_alpha_beta(
    factor_data: pd.DataFrame,
    returns: pd.DataFrame = None,
    demeaned: bool = True,
    group_adjust: bool = False,
    equal_weight: bool = False,
):
    """
    Compute the factor's annualized alpha (excess return) and its beta.

    For every forward-return period, the factor returns are regressed on
    the mean forward return of the universe (plus a constant).

    Parameters
    ---
    :param factor_data: DataFrame with a MultiIndex of ['date', 'asset'];
                        values include the factor value, forward returns,
                        factor quantile and, optionally, factor group
    :param returns: factor forward returns; defaults to None, in which case
                    they are computed by calling `factor_returns`.
                    A Series is accepted and is not modified.
    :param demeaned: whether the returns are based on a long-short portfolio
    :param group_adjust: whether to apply industry neutralization
    :param equal_weight: forwarded to `factor_returns`

    Returns
    ---
    DataFrame with rows "Ann. alpha" and "beta" and one column per period.
    Both are NaN for a period whose regression does not yield exactly two
    parameters.
    """
    if returns is None:
        returns = factor_returns(
            factor_data, demeaned, group_adjust, equal_weight
        )

    universe_ret = (
        factor_data.groupby(level="datetime")[get_forward_returns_columns(
            factor_data.columns
        )].mean().loc[returns.index]
    )

    if isinstance(returns, pd.Series):
        # to_frame(name=...) labels the column without renaming the
        # caller's Series (assigning returns.name mutated the argument).
        returns = returns.to_frame(name=universe_ret.columns.values[0])

    alpha_beta = pd.DataFrame()
    for period in returns.columns.values:
        x = add_constant(universe_ret[period].values)
        y = returns[period].values
        reg_fit = OLS(y, x).fit()
        try:
            alpha, beta = reg_fit.params
        except ValueError:
            alpha_beta.loc["Ann. alpha", period] = np.nan
            alpha_beta.loc["beta", period] = np.nan
        else:
            # Number of holding periods per year.
            freq_adjust = pd.Timedelta(days=DAYS_PER_YEAR) / pd.Timedelta(
                utils.get_period(period.replace("period_", ""))
            )
            alpha_beta.loc["Ann. alpha", period] = \
                (1 + alpha)**freq_adjust - 1.0
            alpha_beta.loc["beta", period] = beta
    return alpha_beta
def remove_outliers(train, targetField, dropVal, studentResid, verbose=True):
    """
    Remove outliers from training data based on statsmodels OLS Fit
    studentized residuals and specified drop values across features

    :param pandas.DataFrame train: data for training
    :param str targetField: target from train/ test :py:class:`pandas.DataFrame`
    :param obj dropVal: value to drop rows across; ``None`` disables it
    :param float studentResid: rows whose absolute studentized residual is
        not below this number are removed; ``None`` disables it
    :param bool verbose: flag to print out OLS summary information and number
        of outlier removed
    :return: the filtered :py:class:`pandas.DataFrame`
    """
    train = train.dropna()
    if dropVal is not None:
        # Keep only rows in which no cell equals dropVal.
        train = train.loc[(train != dropVal).all(axis=1)]
    design = train[[i for i in train if i != targetField]]
    target = train[targetField]
    design = StandardScaler().fit_transform(design)
    # Fit once and reuse the results for both the test and the summary.
    results = OLS(target, design).fit()
    mask = np.ones(train.shape[0], dtype=bool)
    if studentResid is not None:
        # Use the caller's threshold; it was previously ignored in favour of
        # a hard-coded 2.
        mask = (results.outlier_test()['student_resid'].abs()
                < studentResid)
    if verbose:
        print(results.summary())
        print('Removed:' + str(train.shape[0] - int(mask.sum())))
    return train.loc[mask]
def run_acc_compare(self, print_summary=False, data_df=None):
    """Regress the 'accuracy' column on all remaining columns with OLS.

    When ``data_df`` is not supplied, ``self.set_flat_c_stats_df()`` is
    called and ``self.flat_c_stats_df`` is used.  Rows with missing values
    are dropped in place, object-typed columns are handed to
    ``self.floatify_df`` and the model is fitted without adding a constant
    column.  The design matrix, response, model and fitted results are
    stored on ``self`` as ``X_float_df``, ``y_df``, ``model`` and
    ``model_result``.
    """
    if data_df is None:
        self.set_flat_c_stats_df()
        data_df = self.flat_c_stats_df
    # In place: the frame passed in (or stored on self) loses its NaN rows.
    data_df.dropna(inplace=True, axis=0)

    response = data_df.loc[:, 'accuracy']
    predictors = data_df.drop(labels='accuracy', axis=1, inplace=False)

    column_dtypes = dict(predictors.dtypes)
    object_columns = []
    for column, dtype in column_dtypes.items():
        if dtype == 'object':
            object_columns.append(column)

    self.X_float_df = self.floatify_df(predictors, object_columns)
    self.y_df = response
    self.model = OLS(self.y_df, self.X_float_df)
    self.model_result = self.model.fit()

    if not print_summary:
        return
    print('OLS results for modeldict:')
    print(self.modeldict)
    print(self.model_result.summary())
def prosperity_score_regression(cards, metadata,
                                score_columns=score_column_names):
    """
    Perform a linear regression to determine the degree to which the
    Prosperity add-on treasure and victory cards contribute to a good score.

    The response is the row-wise mean of ``score_columns`` in ``metadata``
    (rows with a non-finite mean are ignored).  The regressors are one
    column per card found on level 1 of ``cards['currency'].columns`` plus
    a column of ones named 'Average game score'.  The OLS summary is
    printed; nothing is returned.
    """
    prosperity = set(cards['currency'].columns.get_level_values(1))

    scores = np.mean(metadata.loc[:, tuple(score_columns)], axis=1)

    # Ignore missing cells
    refine_idx = np.isfinite(scores)
    scores = scores[refine_idx]

    set_counts = pd.concat([
        pd.DataFrame(cards.loc[refine_idx, pd.IndexSlice[:, :, c]].values,
                     columns=[c]) for c in prosperity
    ] + [
        pd.DataFrame(np.ones((scores.size, 1)),
                     columns=['Average game score'])
    ], axis=1).fillna(0)
    # The frames above are built from raw values and carry a fresh
    # 0..n-1 index; statsmodels refuses pandas endog/exog whose indices
    # differ, so reuse the index of the filtered scores.
    set_counts.index = scores.index

    results = OLS(scores, set_counts).fit()
    print(results.summary())
def linear_regression(data):
    """
    Fit a straight line through every series of a mapping.

    input parameter :
        - data : mapping of key -> sequence of values (json file's content)
    output :
        - dict mapping each key to ``[coefficient, intercept]`` of the OLS
          line fitted against the positions 0, 1, ..., len(values) - 1
    packages :
        - numpy (ones, arange, column_stack)
        - OLS
    """
    def _line_fit(values):
        count = len(values)
        # Column 0: intercept term, column 1: position of the observation.
        design = np.column_stack((np.ones(count), np.arange(0, count)))
        fitted = OLS(values, design).fit()
        return [fitted.params[1], fitted.params[0]]

    return {key: _line_fit(values) for key, values in data.items()}
def residual_k(self):
    """
    Residual series of the K factors.

    Each column of ``self.mtx_factors`` is regressed on all the other
    columns; no intercept column is added to the regressors.

    Returns:
        np.array, shape=(k Factors, T periods)
    """
    from statsmodels.api import OLS

    factors = self.mtx_factors
    n_factors = self.K
    residuals = []
    for target in range(n_factors):
        others = [col for col in range(n_factors) if col != target]
        # A one-element list keeps the response two-dimensional.
        fitted = OLS(factors[:, [target]], factors[:, others]).fit()
        residuals.append(fitted.resid)
    return np.array(residuals)
def nuevo_regress():
    """Regress the global score on the mathematics score and inspect residuals.

    Fits ``OLS(DATASET.puntaje_global, DATASET.puntaje_matematicas)``
    (no constant is added), prints the model summary and the result of
    ``anderson`` on the residuals, then passes the residuals to
    ``grafica_qq``.
    """
    fitted = OLS(DATASET.puntaje_global, DATASET.puntaje_matematicas).fit()
    report = fitted.summary()
    residuals = fitted.resid
    print(report)
    normality = anderson(residuals)
    print(normality)
    grafica_qq(residuals)
def optimal_spreads_regression(cov_matrix, mid, market_rel_spread):
    """Regress absolute market spreads on variance and a constant column.

    The response is ``market_rel_spread * mid``.  The regressors, indexed
    by ``mid.index``, are three times the diagonal of ``cov_matrix``
    ('Variance') and a column of ones ('Inverse decay').

    Returns ``(risk_aversion, intensity_decay, r_squared)`` where
    ``risk_aversion`` is the 'Variance' coefficient and ``intensity_decay``
    is 2 divided by the 'Inverse decay' coefficient.
    """
    design = pd.DataFrame({'Variance': 3 * np.diag(cov_matrix)},
                          index=mid.index)
    design['Inverse decay'] = 1
    absolute_spread = market_rel_spread * mid
    ols_fit = OLS(absolute_spread, design).fit()
    coefficients = ols_fit.params
    return (coefficients['Variance'],
            2 / coefficients['Inverse decay'],
            ols_fit.rsquared)
def _compute_vif(exog, exog_idx, weights=None, model_config=None):
    """
    Compute variance inflation factor, VIF, for one exogenous variable
    for OLS and WLS that allows weights.

    Parameters
    ----------
    exog: 2-D array of features [X_1, X_2, ..., X_n] (one column each)
    exog_idx: column index of the feature whose VIF is computed
    weights: optional weights; when given, WLS is used instead of OLS
    model_config: optional overrides for
        {"hasconst": True, "cov_type": "HC3"}; keys that are left out keep
        their default.  "cov_type" is only used on the WLS path.

    Returns
    -------
    vif: 1 / (1 - R^2) of the regression of column ``exog_idx`` on all
        other columns
    """
    # Merge over the defaults so a partial config no longer raises KeyError.
    config = {"hasconst": True, "cov_type": "HC3"}
    if model_config is not None:
        config.update(model_config)

    k_vars = exog.shape[1]
    x_i = exog[:, exog_idx]
    mask = np.arange(k_vars) != exog_idx
    x_noti = exog[:, mask]
    if weights is None:
        r_squared_i = OLS(x_i, x_noti,
                          hasconst=config["hasconst"]).fit().rsquared
    else:
        r_squared_i = WLS(x_i, x_noti,
                          hasconst=config["hasconst"],
                          weights=weights).fit(
                              cov_type=config["cov_type"]).rsquared
    vif = 1. / (1. - r_squared_i)
    return vif
def alpha_beta(self):
    """Return ``(alpha, beta)`` of ``self.r - 1`` against the mean of ``self.X - 1``.

    The regressor is the row-wise mean of ``self.X - 1``; a column of ones
    provides the intercept.  The intercept is multiplied by 252 before it
    is returned as alpha.
    """
    benchmark = (self.X - 1).mean(1)
    design = np.column_stack((np.ones(len(self.r)), benchmark))
    coefficients = OLS(self.r - 1, design).fit().params
    return coefficients.const * 252, coefficients.x1
def capm(y: pd.Series, bases: pd.DataFrame, rf=0.0, fee=0.0):
    """Fit ``R = alpha + rf + beta * (Rm - rf)`` and build the proxy portfolio.

    ``rf`` and ``fee`` are divided by ``_freq(y.index)`` before use.  The
    percentage changes of ``y`` (minus ``rf``) are regressed on the
    percentage changes of ``bases`` (minus ``rf``) plus an 'Intercept'
    column; rows with missing values are dropped by the model.

    Returns a dict with:
        alpha    - intercept multiplied by the frequency
        betas    - coefficients of the ``bases`` columns
        cumproxy - cumulative product of 1 + proxy returns
        model    - the fitted OLS results
        residual - cumulative product of 1 + (asset return - proxy return)
    """
    scale = _freq(y.index)
    rf = rf / scale
    fee = fee / scale

    asset_returns = y.pct_change()
    excess = asset_returns - rf
    excess.name = y.name
    base_excess = bases.pct_change().sub(rf, axis=0)

    design = base_excess.assign(Intercept=1)
    model = OLS(excess, design, missing="drop").fit()
    betas = model.params[bases.columns]

    # reconstruct artificial portfolio; the weight not held in the bases
    # earns rf + fee
    leftover = 1 - betas.sum()
    proxy = base_excess @ betas + leftover * (rf + fee)
    cumproxy = (1 + proxy).cumprod()

    # residual portfolio
    active = asset_returns - cumproxy.pct_change()

    return {
        "alpha": model.params["Intercept"] * scale,
        "betas": betas,
        "cumproxy": cumproxy,
        "model": model,
        "residual": (1 + active).cumprod(),
    }
def _capm_mu(self, asset, markets, mu, sigma, X):
    """Calculate mean estimated by CAPM.

    Regresses the per-period excess return of ``asset`` on the excess
    returns of ``markets`` (rows with missing values removed) and returns
    ``self.rfr + (mu[markets] - self.rfr) . beta``.  For the assets 'KRE'
    and 'DPST' a positive ``alpha - 0.2 * alpha_std`` is added on top.
    ``sigma`` is not used.
    """
    freq = tools.freq(X.index)
    per_period_rfr = self.rfr / freq
    excess = X[[asset] + markets].dropna() - 1 - per_period_rfr
    res = OLS(excess[asset], add_constant(excess[markets])).fit()

    beta = res.params.drop(['const'])
    prev_mu = mu[asset]
    new_mu = self.rfr + (mu[markets] - self.rfr).dot(beta)

    alpha = res.params.const * freq
    alpha_std = freq * np.sqrt(res.cov_params().loc['const', 'const'])

    if self.verbose:
        print(f'Beta of {[x for x in beta.round(2)]} changed {asset} mean return from {prev_mu:.1%} to {new_mu:.1%} with alpha {alpha:.2%} ({alpha_std:.2%})')

    # be benevolent and add alpha if it is positive
    # k = 0.2 was fine tuned on DPST in order to get it out of the portfolio
    k = 0.2
    if not (alpha - k * alpha_std > 0 and asset in ('KRE', 'DPST')):
        return new_mu
    if self.verbose:
        print(f' Adding alpha of {alpha - k * alpha_std:.2%} for {asset}')
    return new_mu + (alpha - k * alpha_std)
def RL_LR_correlation(sessions, fig_no=1):
    '''Correlate the effect of stimulation on the transition predictor with
    RL model parameters across subjects.

    An RL model is fitted to all trials and a logistic regression model is
    fitted separately to stim and non-stim trials.  The per-session fits
    are averaged per subject, the stim minus non-stim difference of the
    'trans_CR' predictor is plotted against 'G_mb', and two regressions
    (a simple one and an OLS on several RL parameters) are printed.
    '''
    def _session_params(population_fit):
        # One row of fitted parameters per session.
        return np.vstack([sf['params_T']
                          for sf in population_fit['session_fits']])

    # Fit RL model to all trials.
    RL_agent = rl.MFmoMF_MB_dec(['bs','rb','ec','mc'])
    RL_fit = mf.fit_population(sessions, RL_agent)

    # Fit regression model separately to stim and non-stim trials; the same
    # model object is reconfigured between the two fits.
    LR_model = lr.config_log_reg()
    LR_model.trial_select['trial_mask'] = 'stim_trials'
    LR_fits = {}
    for label, inverted in (('stim', False), ('nons', True)):
        LR_model.trial_select['invert_mask'] = inverted
        LR_fits[label] = mf.fit_population(sessions, LR_model)

    # Make data frame with parameter fits for each subject.
    stim_params = _session_params(LR_fits['stim'])
    nons_params = _session_params(LR_fits['nons'])
    RL_params = _session_params(RL_fit)
    ses_df = pd.DataFrame({pn: RL_params[:,i]
                           for i,pn in enumerate(RL_agent.param_names)})
    ses_df['d_trans'] = (
        stim_params[:,LR_model.param_names.index('trans_CR')] -
        nons_params[:,LR_model.param_names.index('trans_CR')])
    ses_df['subject'] = np.array([s.subject_ID for s in sessions])
    sub_df = ses_df.groupby('subject').mean()

    # Plot correlation of G_mb with stim effect on transition predictor.
    plt.figure(fig_no, clear=True, figsize=[3.3,3])
    regplot('G_mb', 'd_trans', sub_df)
    plt.xlabel('Model-based weight')
    plt.ylabel('Stim change in\ntransition predictor')
    plt.tight_layout()
    simple = linregress(sub_df['G_mb'], sub_df['d_trans'])
    print('Slope: {:.3f} r: {:.3f} P value: {:.4f}'.format(
        simple.slope, simple.rvalue, simple.pvalue))

    # Regress stim effect with multiple RL model parameters.
    design = sub_df[['G_mb','G_td','G_tdm','mc']]
    design.insert(0,'const',1)
    multiple = OLS(sub_df['d_trans'], design).fit()
    print(multiple.summary())
def prs_betaci(q, prs, df):
    """Effect of being in a PRS percentile band versus the 40-60% band.

    ``q = (q0, q1)`` are percentiles counted from the top (1 is the
    highest); they are converted to the quantiles ``(100 - q) / 100`` of
    ``df[prs]``.  Rows inside that band or inside the 40%-60% band are
    kept, and the phenotype is regressed on the covariate-model prediction
    and a band indicator (Logit when ``is_bin`` is true, OLS otherwise).

    Relies on ``is_bin``, ``models``, ``covariates`` and ``phe_code`` from
    the enclosing scope.

    Returns ``(b, ci, mean)``: the indicator effect (odds ratio for the
    Logit case), the absolute distances from ``b`` to the two confidence
    bounds, and the mean phenotype inside the band.  Returns
    ``(None, (None, None), None)`` on perfect separation.
    """
    (q0, q1) = q
    we_print = (q0 == 2)
    # pandas has 99 as the highest; we have 1 as the highest.
    # A stray trailing comma used to turn q0 into a 1-tuple here.
    q0 = df[prs].quantile((100 - q0) / 100.0)
    q1 = df[prs].quantile((100 - q1) / 100.0)
    q40 = df[prs].quantile(0.4)
    q60 = df[prs].quantile(0.6)

    in_band = (q0 <= df[prs]) & (df[prs] <= q1)
    in_middle = (q40 <= df[prs]) & (df[prs] <= q60)
    iids = df.index[in_band | in_middle]

    subset_prs = df.loc[iids, prs]
    band_flag = (q0 <= subset_prs) & (subset_prs <= q1)
    covar_pred = models['PRS']['COVAR']['train'].predict(
        df.loc[iids, covariates])

    if is_bin:
        data = np.vstack((expit(covar_pred), band_flag)).T
        try:
            m = Logit(df.loc[iids, phe_code], data).fit(disp=0)
        except PerfectSeparationError:
            return None, (None, None), None
        # params is a labelled Series; index it by position explicitly.
        b = np.exp(m.params.iloc[1])
        ci = np.abs(np.exp(m.conf_int().iloc[1, :].values) - b)
    else:
        data = np.vstack((covar_pred, band_flag)).T
        m = OLS(df.loc[iids, phe_code], data).fit(disp=0)
        b = m.params.iloc[1]
        ci = np.abs(m.conf_int().iloc[1, :].values - b)

    if we_print:
        print(b, [b - ci[0], b + ci[1]])
    return b, ci, df.loc[in_band, phe_code].mean()
def FamaMacbeth_statsmodels(ff3, returns, plot_return=False):
    """Two-stage Fama-MacBeth regression.

    First stage: one time-series regression per column of ``returns`` on
    the factors in ``ff3`` (plus a constant) to estimate factor loadings.
    Second stage: one cross-sectional regression per row of ``returns`` on
    those loadings to estimate the risk premia.

    ``plot_return`` is accepted but not used.

    Returns ``(betas, lambdas)``: a DataFrame of loadings (assets x
    factors) and a list with the second-stage parameters of every period.
    """
    factors = add_constant(ff3)

    # First stage: factor loadings, the constant is discarded.
    loadings = [
        OLS(endog=returns.loc[returns.index, equity],
            exog=factors,
            missing='drop').fit().params.drop('const')
        for equity in returns
    ]
    betas = pd.DataFrame(loadings,
                         columns=ff3.columns,
                         index=returns.columns)

    # Second stage: risk premium of every period.
    lambdas = [
        OLS(endog=returns.loc[period, betas.index],
            exog=betas,
            missing='drop').fit().params
        for period in returns.index
    ]
    return betas, lambdas
def testPow(n):
    """Return the R^2 of SalePrice regressed on ``OverallQual ** n``.

    Both columns are read from ``trainData``; a constant is added to the
    single powered regressor.
    """
    quality = trainData.OverallQual.values.reshape(-1, 1)
    price = trainData.SalePrice
    design = sm.add_constant(quality**n)
    return OLS(price.values, design).fit().rsquared
def run():
    """Fit one OLS model per selected response and export the predictions.

    The responses and predictors are the columns of ``Y`` and ``X`` that
    are selected in ``listboxY`` / ``listboxX``.  Rows of ``data`` without
    missing values are used for fitting; rows with missing values are the
    ones predicted.  Results are written to ``saveFile``: a "SUMMARY"
    sheet with all point predictions, and one sheet per response holding
    the prediction frame followed by the model summary tables.

    Sets the globals ``trainY``, ``trainX``, ``result1`` and ``result2``.
    """
    global trainY, trainX, result1, result2

    selected_y = set(listboxY.curselection())
    selected_x = set(listboxX.curselection())
    varsY = [
        name for pos, name in enumerate(Y.columns.tolist())
        if pos in selected_y
    ]
    varsX = [
        name for pos, name in enumerate(X.columns.tolist())
        if pos in selected_x
    ]

    has_missing = data.isnull().any(axis=1)
    trainY = Y[~has_missing]
    # has_constant='add' forces the constant column on both frames; the
    # default silently skips it when a column happens to be constant (for
    # instance a single row to predict), which breaks predict().
    trainX = add_constant(X[~has_missing][varsX], has_constant='add')
    testX = add_constant(X[has_missing][varsX], has_constant='add')
    result0 = DataFrame(columns=varsY)

    if not varsY:
        messagebox.showinfo('提示', '至少选中一个结果变量!')
        return
    if not varsX:
        messagebox.showinfo('提示', '至少选中一个预测变量!')
        return

    # The context manager saves and closes the workbook on exit; explicit
    # writer.save()/close() calls are not needed (save() no longer exists
    # in pandas 2).
    with ExcelWriter(saveFile, engine="openpyxl") as writer:
        for varY in varsY:
            # Select the response by name: the loop counter only matches
            # the column position when the first columns of Y are selected.
            fit = OLS(trainY[varY], trainX).fit()
            print(fit.summary2().tables)
            result0[varY] = fit.predict(testX)
            # Rewritten on every pass so the sheet stays first in the
            # workbook and ends up with all responses.
            result0.to_excel(writer, sheet_name="SUMMARY",
                             header=True, index=True)
            result1 = fit.get_prediction(testX).summary_frame()
            result1.to_excel(writer, sheet_name=varY,
                             header=True, index=True)
            result2 = fit.summary2().tables
            first_row = result1.shape[0] + 2
            result2[0].iloc[:, [0, 1]].to_excel(
                writer, sheet_name=varY, header=False, index=False,
                startrow=first_row, startcol=0)
            result2[0].iloc[:, [2, 3]].to_excel(
                writer, sheet_name=varY, header=False, index=False,
                startrow=first_row, startcol=5)
            result2[1].to_excel(
                writer, sheet_name=varY, header=True, index=True,
                startrow=result1.shape[0] + result2[0].shape[0] + 3)
    messagebox.showinfo('提示', '执行完成!')
def fit(self, x, y):
    """Fit a quadratic OLS curve and evaluate it on ``AGES``.

    Stores the predictions at ``AGES`` in ``self.m`` and the first element
    returned by ``wls_prediction_std`` for the same points in ``self.s``.

    Returns ``self``.
    """
    def _quadratic_design(column):
        # Degree-2 polynomial features of a single column.
        return PolynomialFeatures(2).fit_transform(column)

    observed = array(x).reshape(-1, 1)
    model = OLS(y, _quadratic_design(observed)).fit()

    age_design = _quadratic_design(AGES.reshape(-1, 1))
    self.m = model.predict(age_design)
    self.s = wls_prediction_std(model, age_design)[0]
    return self
def get_half_life(Z):
    """Estimate the half-life of mean reversion of a series.

    Regresses the one-step change ``Z[t] - Z[t-1]`` on the lagged level
    ``Z[t-1]`` (with an intercept) and returns
    ``int(-log(2) / slope)``.

    The first observation has no lag and is left out of the regression;
    it used to be paired with an artificial lag of 0, which added a
    fabricated data point.

    :param Z: one-dimensional sequence of levels.
    :return: half-life as an int (truncated).
    """
    levels = np.asarray(Z, dtype=float)
    z_lag = levels[:-1]
    z_ret = np.diff(levels)

    # adds intercept terms to X for regression
    z_lag2 = add_constant(z_lag)

    model = OLS(z_ret, z_lag2).fit()

    return int(-np.log(2) / model.params[1])
def _capm(self):
    """Fit excess portfolio returns against excess ``ucrp_r`` returns.

    The per-period risk-free rate is ``self.rf_rate / self.freq()``.  The
    response is ``self.r - 1`` minus the risk-free rate scaled by the
    non-cash weight (``1 - self.B.CASH`` when a 'CASH' column exists,
    otherwise 1).  The regressors are a column of ones and
    ``self.ucrp_r - rfr - 1``.

    Returns the fitted OLS results.
    """
    period_rfr = self.rf_rate / self.freq()
    benchmark = self.ucrp_r - period_rfr
    cash_weight = self.B.CASH if 'CASH' in self.B.columns else 0

    response = self.r - 1 - (1 - cash_weight) * period_rfr
    design = np.column_stack((np.ones(len(self.r)), benchmark - 1))
    return OLS(response, design).fit()
def intermediate():
    """Run the local linear regression and store its sufficient statistics.

    Reads the dependent and independent variables via ``io_helper``,
    removes nulls, featurizes the covariates and fits an OLS with an
    explicit intercept column.  The saved JSON holds the formatted summary,
    the column names, the column means, ``X^T * X``, the row count and the
    residual scale.  When no rows remain, zero placeholders are stored.

    Raises ``errors.UserError`` when no covariables are selected or when
    the dependent variable is nominal.

    NOTE(review): 'means' and 'X^T * X' are numpy arrays; if ``json`` is
    the standard-library module they are not serializable - confirm.
    """
    result_keys = ('summary', 'columns', 'means', 'X^T * X', 'count',
                   'scale')

    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    frame = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    frame = utils.remove_nulls(frame, errors='ignore')
    y = frame.pop(dep_var['name'])

    featurizer = _create_featurizer(indep_vars)
    X = pd.DataFrame(featurizer.transform(frame),
                     columns=featurizer.columns,
                     index=frame.index)

    if not indep_vars:
        raise errors.UserError('No covariables selected.')

    # Distributed linear regression only works for continuous variables
    if utils.is_nominal(dep_var):
        raise errors.UserError(
            'Dependent variable must be continuous in distributed mode. Use SGD Regression for '
            'nominal variables instead.')

    if frame.empty:
        logging.warning('All values are NAN, returning zero values')
        values = ({}, [], 0, 0, 0, 0)
    else:
        # Compute linear-regression
        X.insert(loc=0, column='intercept', value=1.)
        fitted = OLS(y, X).fit()
        logging.info(fitted.summary())
        values = (
            format_output(fitted),
            list(X.columns),
            X.mean().values,
            X.T.values.dot(X.values),
            len(X),
            fitted.scale,
        )
    result = dict(zip(result_keys, values))

    # Store results
    io_helper.save_results(json.dumps(result), 'application/json')
def backwardElimination(x, sl):
    """Backward elimination of regressors by p-value.

    Fits ``OLS(y, x)`` once per original column and prints the summary each
    time.  Whenever the largest p-value exceeds ``sl``, every column whose
    p-value equals that maximum is deleted from ``x``.

    :param x: 2-D numpy array of regressors.
    :param sl: significance level above which a column is removed.
    :return: the reduced regressor matrix.

    NOTE(review): ``y`` is not a parameter; it is read from the enclosing
    scope.
    """
    n_columns = len(x[0])
    for step in range(n_columns):
        fitted = OLS(y, x).fit()
        worst = max(fitted.pvalues).astype(float)
        print(fitted.summary())
        if not worst > sl:
            continue
        for col in range(n_columns - step):
            if fitted.pvalues[col].astype(float) == worst:
                x = np.delete(x, col, 1)
    return x
def stats_models(self, X_train, y_train, show_summary=False):
    '''
    Perform OLS with statsmodels (a constant is added to ``X_train``).

    :param X_train: regressors
    :param y_train: response
    :param show_summary: when true, print the model summary
    :return: the fitted results object
    '''
    X = sm.add_constant(X_train)
    results_stats = OLS(y_train, X).fit()
    if show_summary:
        # summary() only builds the table; it has to be printed to be seen.
        print(results_stats.summary())
    return results_stats