def sample_properties():
    """Log summary statistics of a sample dataset, for explanation purposes.

    Downloads the Wooldridge ``401k.csv`` file over HTTP, loads it into a
    DataFrame and writes the output of ``describe()`` to the debug log.
    Nothing is returned.
    """
    # Source of the Wooldridge example data.
    data_url = 'http://samba.fsv.cuni.cz/~cahlik/Backup/Ekonometrie/Data%20Wooldridge%20Stata/401k.csv'
    wooldridge = pd.read_csv(data_url)
    summary = wooldridge.describe()
    logging.debug(summary)
def confidence_interval(self):
    """Return the two-sided 95% confidence interval for the mean of ``self.X``.

    The interval is ``x_bar +/- t * S / sqrt(N)``, where ``S`` is the sample
    standard deviation of ``self.X``, ``N`` is the number of observations and
    ``t`` is the 97.5th percentile of Student's t distribution with ``N - 1``
    degrees of freedom.

    Returns:
        A ``(lower, upper)`` tuple of scalars.

    Raises:
        statistics.StatisticsError: if ``self.X`` holds fewer than two
            values (raised by ``statistics.stdev``).
    """
    # NOTE(review): assumes self.X is a 1-D sequence of numbers -- confirm.
    N = len(self.X)

    # get sample std
    S = statistics.stdev(self.X)
    logging.debug(S)

    # The standard error of the mean divides by sqrt(N). Dividing by the
    # square root of the data itself gives an element-wise array of
    # meaningless bounds instead of a single interval.
    se = S / numpy.sqrt(N)

    # get t statistic
    pctile = stats.t.ppf(0.975, df=N - 1)
    x_bar = numpy.mean(self.X)
    return (x_bar - pctile * se, x_bar + pctile * se)
def test_pred_interval(show_plot=False):
    """Check LinModel's prediction interval against the statsmodels one.

    Generates a small synthetic dataset, fits ``LinModel`` on every column
    except ``y``, and computes both the confidence interval for the mean and
    the prediction interval. The same data is then fitted with statsmodels
    OLS and ``wls_prediction_std``; the test asserts that the upper
    prediction bounds from the two implementations agree overall.

    Args:
        show_plot: when true, draw two stacked panels. The top panel shows
            the LinModel prediction and mean intervals; the bottom panel
            shows the statsmodels interval.
    """
    from ml_ext import examples

    (_coefs, df) = examples.gen_simplemodel_data(n=50, k=3)
    # DataFrame.sort() was removed from pandas; sort_values() replaces it.
    # Sorting by X1 keeps the line plots below monotone along the x axis.
    df.sort_values('X1', inplace=True)

    lr = LinModel()
    X = df[df.columns[df.columns != 'y']]
    y = df.y
    lr.fit(X=X, y=y)
    lr.summary()

    df_ci = lr.get_confidence_interval_for_mean(X)
    df_pi = lr.get_prediction_interval(X)

    # Now use statsmodels to compare
    from statsmodels.sandbox.regression.predstd import wls_prediction_std
    import statsmodels.api as sm

    re = sm.OLS(y, X).fit()
    prstd, iv_l, iv_u = wls_prediction_std(re)

    if show_plot:
        (fig, ax) = plt.subplots(nrows=2, ncols=1, figsize=[14, 12])
        cols = sns.color_palette('husl', n_colors=4)

        # Top panel: LinModel prediction interval and mean interval.
        ax[0].scatter(X.X1, y, label='y', color=cols[3], alpha=0.4)
        ax[0].plot(X.X1, df_pi['upper_pred'], label='pred', color=cols[1], alpha=0.5)
        ax[0].plot(X.X1, df_pi['lower_pred'], color=cols[1], alpha=0.5)
        ax[0].plot(X.X1, df_ci['upper_mean'], color=cols[2], alpha=0.5)
        ax[0].plot(X.X1, df_ci['lower_mean'], label='mean_ci', color=cols[2], alpha=0.5)
        ax[0].scatter(X.X1, df_pi['y_hat'], label='y_hat', color=cols[0], alpha=0.5)
        ax[0].legend(loc='best')

        # Bottom panel: statsmodels prediction interval for comparison.
        ax[1].scatter(X.X1, y, label='y', color=cols[3], alpha=0.4)
        ax[1].scatter(X.X1, df_ci['y_hat'], label='y_hat', color=cols[0], alpha=0.5)
        ax[1].plot(X.X1, iv_u, label='wls', color=cols[1], alpha=0.5)
        ax[1].plot(X.X1, iv_l, color=cols[1], alpha=0.5)
        ax[1].legend(loc='best')

    # Get the difference between the upper bounds from each implementation,
    # as a percentage of the statsmodels total, and check it is within 0.1%.
    overall_diff = 100 * numpy.sum(iv_u - df_pi['upper_pred']) / numpy.sum(iv_u)
    logging.debug("Overall % difference in prediction ranges for upper bound: {}".format(overall_diff))
    # Compare the magnitude: a signed check would let an arbitrarily large
    # negative discrepancy pass.
    assert abs(overall_diff) < 0.1