def _get_drop_vari(self, attributes):
    '''regress endog on exog without one of the variables

    This uses a k_vars loop; only the requested attributes of each
    auxiliary OLS results instance are stored.

    Parameters
    ----------
    attributes : list of str
        Names of the attributes of the auxiliary OLS results instance
        that are stored and returned.

    not yet used
    '''
    from collections import defaultdict
    from statsmodels.sandbox.tools.cross_val import LeaveOneOut

    endog = self.results.model.endog
    exog = self.exog

    cv_iter = LeaveOneOut(self.k_vars)
    res_loo = defaultdict(list)
    for inidx, outidx in cv_iter:
        # fit once per dropped variable, then collect all attributes
        res_i = self.model_class(endog, exog[:, inidx]).fit()
        for att in attributes:
            res_loo[att].append(getattr(res_i, att))
    return res_loo
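# Minimal standalone sketch of the leave-one-variable-out pattern above;
# the toy data are illustrative. LeaveOneOut(k) yields boolean (keep, drop)
# masks over the k columns, so exog[:, inidx] drops one regressor per
# iteration. R^2 is uncentered here since no constant is included.
import numpy as np
import statsmodels.api as sm
from statsmodels.sandbox.tools.cross_val import LeaveOneOut

rng = np.random.default_rng(0)
exog = rng.normal(size=(30, 4))
endog = exog @ np.array([1.0, 0.5, 0.0, -1.0]) + rng.normal(size=30)

rsquared_wo = [sm.OLS(endog, exog[:, inidx]).fit().rsquared
               for inidx, outidx in LeaveOneOut(exog.shape[1])]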
def _res_looo(self):
    '''collect required results from the LOOO loop

    All results will be attached; currently only 'params', 'mse_resid'
    and 'det_cov_params' are stored.

    Regresses endog on exog, dropping one observation at a time.
    This uses a nobs loop; only attributes of the OLS results instance
    are stored.
    '''
    from statsmodels.sandbox.tools.cross_val import LeaveOneOut

    def get_det_cov_params(res):
        return np.linalg.det(res.cov_params())

    endog = self.endog
    exog = self.exog

    # np.float is removed in modern NumPy; use the builtin float dtype
    params = np.zeros(exog.shape, dtype=float)
    mse_resid = np.zeros(endog.shape, dtype=float)
    det_cov_params = np.zeros(endog.shape, dtype=float)

    cv_iter = LeaveOneOut(self.nobs)
    for inidx, outidx in cv_iter:
        res_i = self.model_class(endog[inidx], exog[inidx]).fit()
        params[outidx] = res_i.params
        mse_resid[outidx] = res_i.mse_resid
        det_cov_params[outidx] = get_det_cov_params(res_i)

    return dict(params=params, mse_resid=mse_resid,
                det_cov_params=det_cov_params)
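# Companion sketch for the leave-one-observation-out loop above, on toy
# data: refit per fold and store attributes of each results instance at
# the held-out position, exactly as _res_looo does.
import numpy as np
import statsmodels.api as sm
from statsmodels.sandbox.tools.cross_val import LeaveOneOut

rng = np.random.default_rng(1)
exog = sm.add_constant(rng.normal(size=(20, 2)))
endog = exog @ np.array([1.0, 2.0, -0.5]) + rng.normal(size=20)

params = np.zeros(exog.shape)
mse_resid = np.zeros(endog.shape)
for inidx, outidx in LeaveOneOut(len(endog)):
    res_i = sm.OLS(endog[inidx], exog[inidx]).fit()
    params[outidx] = res_i.params
    mse_resid[outidx] = res_i.mse_resid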
def loo(self, **kwargs):
    """Leave-one-out cross-validation.

    Calculates summary statistics for each iteration of leave-one-out
    cross-validation, specifically `model_mse` for the fit on the
    training set and `pred_mse` to measure prediction error on the
    held-out sample.

    Parameters
    ----------
    **kwargs : dict
        Keyword arguments used to tune the parameter estimation.

    Returns
    -------
    pd.DataFrame
        model_mse : np.array, float
            Mean sum of squares error for each iteration of the
            cross-validation.
        pred_mse : np.array, float
            Prediction mean sum of squares error for each iteration of
            the cross-validation.

    See Also
    --------
    fit
    statsmodels.regression.linear_model
    """
    # number of observations (i.e. samples)
    nobs = self.response_matrix.shape[0]
    cv_iter = LeaveOneOut(nobs)

    results = pd.DataFrame(index=self.response_matrix.index,
                           columns=['model_mse', 'pred_mse'],
                           dtype=np.float64)

    for i, (train, test) in enumerate(cv_iter):
        sample_id = self.response_matrix.index[i]

        res_i = OLSModel(self.response_matrix.iloc[train],
                         self.design_matrix.iloc[train])
        res_i.fit(**kwargs)

        # model error on the training set
        predicted = res_i.predict(X=self.design_matrix.iloc[train])
        r = self.response_matrix.iloc[train].values
        p = predicted.values
        model_resid = (r - p) ** 2
        model_mse = np.mean(model_resid.sum(axis=0))
        results.loc[sample_id, 'model_mse'] = model_mse

        # prediction error on the held-out sample
        predicted = res_i.predict(X=self.design_matrix.iloc[test])
        r = self.response_matrix.iloc[test].values
        p = predicted.values
        pred_resid = (r - p) ** 2
        pred_mse = np.mean(pred_resid.sum(axis=0))
        results.loc[sample_id, 'pred_mse'] = pred_mse
    return results
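# Hypothetical call site for the `loo` method above; `model` stands in for
# a fitted instance of the class that defines it (the name `model` is an
# assumption, not a documented API).
cv = model.loo()
print(cv.head())                 # per-sample model_mse and pred_mse
worst = cv['pred_mse'].idxmax()  # sample whose held-out prediction is worst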
def loo(self, **kwargs):
    """Leave-one-out cross-validation.

    Calculates summary statistics for each iteration of leave-one-out
    cross-validation, specifically `mse` on the entire model and
    `pred_err` to measure prediction error.

    Parameters
    ----------
    **kwargs : dict
        Keyword arguments used to tune the parameter estimation.

    Returns
    -------
    pd.DataFrame
        mse : np.array, float
            Mean sum of squares error for each iteration of the
            cross-validation.
        pred_err : np.array, float
            Prediction mean sum of squares error for each iteration of
            the cross-validation.

    See Also
    --------
    fit
    statsmodels.regression.linear_model
    """
    # number of observations (i.e. samples)
    nobs = self.balances.shape[0]
    cv_iter = LeaveOneOut(nobs)

    endog = self.balances
    exog_names = self.results[0].model.exog_names
    exog = pd.DataFrame(self.results[0].model.exog,
                        index=self.balances.index,
                        columns=exog_names)

    results = pd.DataFrame(index=self.balances.index,
                           columns=['mse', 'pred_err'])

    for i, (inidx, outidx) in enumerate(cv_iter):
        sample_id = self.balances.index[i]

        model_i = _fit_ols(y=endog.loc[inidx], x=exog.loc[inidx], **kwargs)
        res_i = [r.fit(**kwargs) for r in model_i]

        # mean sum of squares error
        sse = sum(r.ssr for r in res_i)
        # degrees of freedom of the residuals
        dfe = res_i[0].df_resid
        results.loc[sample_id, 'mse'] = sse / dfe

        # prediction error on the left-out point
        predicted = np.hstack([r.predict(exog.loc[outidx]) for r in res_i])
        pred_sse = np.sum((predicted - self.balances.loc[outidx]) ** 2)
        results.loc[sample_id, 'pred_err'] = pred_sse.sum()
    return results
def lovo(self, **kwargs):
    """Leave-one-variable-out cross-validation.

    Calculates summary statistics for each iteration of
    leave-one-variable-out cross-validation, specifically `Rsquared` and
    `mse` on the entire model. This technique is particularly useful for
    feature selection.

    Parameters
    ----------
    **kwargs : dict
        Keyword arguments used to tune the parameter estimation.

    Returns
    -------
    pd.DataFrame
        mse : np.array, float
            Mean sum of squares error for each iteration of the
            cross-validation.
        Rsquared : np.array, float
            Coefficient of determination for each variable left out.
    """
    endog = self.balances
    exog_names = self.results[0].model.exog_names
    exog = pd.DataFrame(self.results[0].model.exog,
                        index=self.balances.index,
                        columns=exog_names)

    cv_iter = LeaveOneOut(len(exog_names))
    results = pd.DataFrame(index=exog_names, columns=['mse', 'Rsquared'])

    for i, (inidx, outidx) in enumerate(cv_iter):
        feature_id = exog_names[i]

        res_i = _fit_ols(endog, exog.loc[:, inidx], **kwargs)
        res_i = [r.fit(**kwargs) for r in res_i]

        # See `statsmodels.regression.linear_model.RegressionResults`
        # for more explanation of `ess` and `ssr`.
        # sum of squares of the regression
        ssr = sum(r.ess for r in res_i)
        # sum of squares error
        sse = sum(r.ssr for r in res_i)
        # overall coefficient of determination (i.e. R^2)
        sst = sse + ssr
        results.loc[feature_id, 'Rsquared'] = 1 - sse / sst

        # degrees of freedom of the residuals
        dfe = res_i[0].df_resid
        results.loc[feature_id, 'mse'] = sse / dfe
    return results
def lovo(self, **kwargs):
    """Leave-one-variable-out cross-validation.

    Calculates summary statistics for each iteration of
    leave-one-variable-out cross-validation, specifically `Rsquared` and
    `mse` on the entire model. This technique is particularly useful for
    feature selection.

    Parameters
    ----------
    **kwargs : dict
        Keyword arguments used to tune the parameter estimation.

    Returns
    -------
    pd.DataFrame
        mse : np.array, float
            Mean sum of squares error for each iteration of the
            cross-validation.
        Rsquared : np.array, float
            Coefficient of determination for each variable left out.
        R2diff : np.array, float
            Decrease in Rsquared for each variable left out.
    """
    cv_iter = LeaveOneOut(len(self.design_matrix.columns))
    results = pd.DataFrame(index=self.design_matrix.columns,
                           columns=['mse', 'Rsquared', 'R2diff'],
                           dtype=np.float64)

    for i, (inidx, outidx) in enumerate(cv_iter):
        feature_id = self.design_matrix.columns[i]

        res_i = OLSModel(Y=self.response_matrix,
                         Xs=self.design_matrix.iloc[:, inidx])
        res_i.fit(**kwargs)

        predicted = res_i.predict()
        r = self.response_matrix.values
        p = predicted.values
        model_resid = (r - p) ** 2
        model_mse = np.mean(model_resid.sum(axis=0))

        results.loc[feature_id, 'mse'] = model_mse
        results.loc[feature_id, 'Rsquared'] = res_i.r2
        results.loc[feature_id, 'R2diff'] = self.r2 - res_i.r2
    return results
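# Hypothetical feature-screening use of `lovo` above; `model` is again an
# assumed fitted instance. Variables whose removal causes the largest drop
# in R^2 (largest `R2diff`) carry the most unique explanatory signal.
cv = model.lovo()
ranked = cv.sort_values('R2diff', ascending=False)
print(ranked[['Rsquared', 'R2diff']])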
def fit_find_nfact(self, maxfact=None, skip_crossval=True, cv_iter=None):
    '''estimate the model and selection criteria for up to maxfact factors

    The selection criteria that are calculated are AIC, BIC, and R2_adj,
    and additionally the cross-validation prediction error sum of squares
    if `skip_crossval` is False. Cross-validation is not used by default
    because it can be time consuming to calculate.

    By default the cross-validation method is leave-one-out on the full
    dataset. A different cross-validation sample can be specified as an
    argument to `cv_iter`.

    Results are attached in `results_find_nfact`.
    '''
    if not hasattr(self, 'factors'):
        self.calc_factors()

    hasconst = self.hasconst
    if maxfact is None:
        maxfact = self.factors.shape[1] - hasconst
    if (maxfact + hasconst) < 1:
        raise ValueError(
            'nothing to do, number of factors (incl. constant) should '
            'be at least 1')

    # temporary safety
    maxfact = min(maxfact, 10)

    y0 = self.endog
    results = []
    # reusing self.factors is faster than rerunning the PCA for each k
    # and gives the same result
    for k in range(1, maxfact + hasconst):  # k now includes the constant
        fact = self.factors[:, :k]
        res = sm.OLS(y0, fact).fit()

        if not skip_crossval:
            if cv_iter is None:
                cv_iter = LeaveOneOut(len(y0))
            prederr2 = 0.
            for inidx, outidx in cv_iter:
                res_l1o = sm.OLS(y0[inidx], fact[inidx, :]).fit()
                # accumulate the squared held-out prediction error as a
                # scalar so `results` stays a clean float array
                prederr2 += ((y0[outidx] - res_l1o.model.predict(
                    res_l1o.params, fact[outidx, :]))**2.).item()
        else:
            prederr2 = np.nan

        results.append([k, res.aic, res.bic, res.rsquared_adj, prederr2])

    self.results_find_nfact = results = np.array(results)
    self.best_nfact = np.r_[(np.argmin(results[:, 1:3], 0),
                             np.argmax(results[:, 3], 0),
                             np.argmin(results[:, -1], 0))]
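# Usage sketch for `fit_find_nfact`; `FactorModel` is a stand-in for
# whichever class defines the method, and `endog`/`exog` are assumed data
# arrays -- both are assumptions for illustration.
fm = FactorModel(endog, exog)
fm.fit_find_nfact(maxfact=5, skip_crossval=False)
print(fm.results_find_nfact)  # rows: [k, aic, bic, rsquared_adj, prederr2]
print(fm.best_nfact)          # argmin/argmax rows by AIC, BIC, R2_adj, LOO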
results = []
xred, fact, eva, eve = pca(x0, keepdim=0, normalize=1)
for k in range(0, x0.shape[1] + 1):
    # reusing the full-dimensional PCA is faster than rerunning
    # pca(x0, keepdim=k, normalize=1) for each k and gives the same result
    fact_wconst = sm.add_constant(fact[:, :k], prepend=False)
    res = sm.OLS(y0, fact_wconst).fit()

    # leave-one-out prediction error sum of squares
    prederr2 = 0.
    for inidx, outidx in LeaveOneOut(len(y0)):
        resl1o = sm.OLS(y0[inidx], fact_wconst[inidx, :]).fit()
        # model.predict needs the estimated params as its first argument
        prederr2 += ((y0[outidx] - resl1o.model.predict(
            resl1o.params, fact_wconst[outidx, :]))**2.).item()

    results.append([k, res.aic, res.bic, res.rsquared_adj, prederr2])

results = np.array(results)
print(results)
print('best result for k, by AIC, BIC, R2_adj, L1O')
print(np.r_[(np.argmin(results[:, 1:3], 0),
             np.argmax(results[:, 3], 0),
             np.argmin(results[:, -1], 0))])

from statsmodels.iolib.table import (SimpleTable, default_txt_fmt,
                                     default_latex_fmt, default_html_fmt)
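# A plausible continuation of the script above, rendering `results` with
# the SimpleTable import it pulls in; headers follow the list layout used
# when `results` was built, and the formatting choices are illustrative.
headers = ['AIC', 'BIC', 'R2_adj', 'L1O']
stubs = ['k=%d' % int(k) for k in results[:, 0]]
tbl = SimpleTable(np.round(results[:, 1:], 3).tolist(),
                  headers, stubs, txt_fmt=default_txt_fmt)
print(tbl)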
def _res_looo(self):
    """collect required results from the LOOO loop

    All results will be attached; currently only 'params', 'mse_resid'
    and 'det_cov_params' are stored.

    Reestimates the model with endog and exog dropping one observation
    at a time. This uses a nobs loop; only attributes of the results
    instance are stored.

    Warning: This will need refactoring and API changes to be able to
    add options.
    """
    from statsmodels.sandbox.tools.cross_val import LeaveOneOut

    def get_det_cov_params(res):
        return np.linalg.det(res.cov_params())

    endog = self.results.model.endog
    exog = self.results.model.exog

    init_kwds = self.results.model._get_init_kwds()
    # We need to drop obs also from the extra arrays
    freq_weights = init_kwds.pop('freq_weights')
    var_weights = init_kwds.pop('var_weights')
    offset = offset_ = init_kwds.pop('offset')
    exposure = exposure_ = init_kwds.pop('exposure')
    n_trials = init_kwds.pop('n_trials', None)
    # family Binomial creates `n`, i.e. `n_trials`; we need to reset it
    # TODO: figure out how to do this properly
    if hasattr(init_kwds['family'], 'initialize'):
        # assume we have Binomial
        is_binomial = True
    else:
        is_binomial = False

    params = np.zeros(exog.shape, dtype=float)
    scale = np.zeros(endog.shape, dtype=float)
    det_cov_params = np.zeros(endog.shape, dtype=float)

    cv_iter = LeaveOneOut(self.nobs)
    for inidx, outidx in cv_iter:
        if offset is not None:
            offset_ = offset[inidx]
        if exposure is not None:
            exposure_ = exposure[inidx]
        if n_trials is not None:
            init_kwds['n_trials'] = n_trials[inidx]

        mod_i = self.model_class(endog[inidx], exog[inidx],
                                 offset=offset_,
                                 exposure=exposure_,
                                 freq_weights=freq_weights[inidx],
                                 var_weights=var_weights[inidx],
                                 **init_kwds)
        if is_binomial:
            mod_i.family.n = init_kwds['n_trials']
        res_i = mod_i.fit(start_params=self.results.params,
                          method='newton')

        params[outidx] = res_i.params.copy()
        scale[outidx] = res_i.scale
        det_cov_params[outidx] = get_det_cov_params(res_i)

    return dict(params=params, scale=scale,
                mse_resid=scale,  # alias for now
                det_cov_params=det_cov_params)
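# Standalone sketch of the det(cov_params) diagnostic used above: the
# determinant of the parameter covariance condenses each leave-one-out
# refit into a single precision number. Toy Poisson-GLM data,
# illustrative only.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(2)
exog = sm.add_constant(rng.normal(size=(50, 2)))
endog = rng.poisson(np.exp(exog @ np.array([0.2, 0.5, -0.3])))

res = sm.GLM(endog, exog, family=sm.families.Poisson()).fit()
det_cov = np.linalg.det(res.cov_params())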