"""Example: scikits.statsmodels.OLS """ from scikits.statsmodels.datasets.longley import Load import scikits.statsmodels as models import numpy as np data = Load() data.exog = models.tools.add_constant(data.exog) ols_model = models.OLS(data.endog, data.exog) ols_results = ols_model.fit() # the Longley dataset is well known to have high multicollinearity # one way to find the condition number is as follows # normalize the independent variables to have unit length, Greene 4.9 norm_x = np.ones_like(data.exog) for i in range(ols_model.df_model): norm_x[:,i] = data.exog[:,i]/np.linalg.norm(data.exog[:,i]) norm_xtx = np.dot(norm_x.T,norm_x) eigs = np.linalg.eigvals(norm_xtx) collin = np.sqrt(eigs.max()/eigs.min()) print collin # clearly there is a big problem with multicollinearity # the rule of thumb is any number of 20 requires attention # for instance, consider the longley dataset with the last observation dropped ols_results2 = models.OLS(data.endog[:-1], data.exog[:-1,:]).fit() # all of our coefficients change considerably in percentages
if __name__ == '__main__':

    #A: josef-pktd

    import scikits.statsmodels as sm
    from scikits.statsmodels import OLS
    from scikits.statsmodels.datasets.longley import Load
    from scikits.statsmodels.iolib.table import (SimpleTable, default_txt_fmt,
                                                 default_latex_fmt,
                                                 default_html_fmt)
    import numpy as np
    # NOTE: LeaveOneOut and LeavePOut are cross-validation iterators; this
    # import is an assumption, the original snippet did not show where they
    # come from
    from scikits.statsmodels.sandbox.tools.cross_val import (LeaveOneOut,
                                                             LeavePOut)

    data = Load()
    data.exog = sm.tools.add_constant(data.exog)

    # leave-one-out: refit on all but one observation and look at the
    # prediction error for the held-out observation
    for inidx, outidx in LeaveOneOut(len(data.endog)):
        res = sm.OLS(data.endog[inidx], data.exog[inidx,:]).fit()
        print data.endog[outidx], res.model.predict(data.exog[outidx,:]),
        print data.endog[outidx] - res.model.predict(data.exog[outidx,:])

    # leave-two-out: collect the parameter estimates from each subsample
    resparams = []
    for inidx, outidx in LeavePOut(len(data.endog), 2):
        res = sm.OLS(data.endog[inidx], data.exog[inidx,:]).fit()
        #print data.endog[outidx], res.model.predict(data.exog[outidx,:]),
        #print ((data.endog[outidx] - res.model.predict(data.exog[outidx,:]))**2).sum()
        resparams.append(res.params)

    resparams = np.array(resparams)

    doplots = 1
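    # a minimal sketch (an assumption, not the original plotting code) of what
    # the doplots flag might gate: histograms of the leave-two-out parameter
    # estimates collected in resparams
    if doplots:
        import matplotlib.pyplot as plt
        for i in range(resparams.shape[1]):
            plt.figure()
            plt.hist(resparams[:, i], bins=10)
            plt.title('leave-2-out estimates of parameter %d' % i)
        plt.show()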