示例#1
0
文件: lin_model.py 项目: ejokeeffe/ML
    def get_prediction_interval(self,X=[]):
        """

        Return the 95% prediction interval for every observation in X.

        For an observation x_0 the standard error of the prediction is

            se = s_y*sqrt(1 + 1/nobs + (x_0-X_bar)*inv(Z_M_Z)*(x_0-X_bar).T)

        and the interval is y_hat +/- t*se, where t is the two-sided 95%
        quantile of Student's t distribution.

        X is a DataFrame or a 2-D array-like. A DataFrame may contain more
        columns than the model uses, only self.independent_ is pulled out.
        Any other input is used as given, so it must already hold exactly
        the model columns in model order.

        Returns a DataFrame with one row per observation (default integer
        index) and the columns 'upper_pred', 'lower_pred', 'y_hat' and
        'percent_ci' (full interval width as a percentage of |y_hat|).

        """
        if isinstance(X,pd.core.frame.DataFrame):
            X=X[self.independent_]

        # Work on plain arrays so rows are matched by position; the result
        # frame has a fresh 0..n-1 index that need not match X's own labels.
        y_hat=numpy.asarray(self.predict(X),dtype=float).ravel()
        x_vals=numpy.atleast_2d(numpy.asarray(X,dtype=float))

        if "alpha" in self.independent_:
            # The offset column is dropped before centring.
            # NOTE(review): this assumes 'alpha' is the first column and that
            # X_bar holds the means of the remaining columns - confirm.
            x_vals=x_vals[:,1:]
        x_0_x_bar=x_vals-numpy.asarray(self.X_bar,dtype=float).ravel()

        # Invert once; the matrix does not depend on the observation.
        z_m_z_inv=inv(numpy.asarray(self.Z_M_Z,dtype=float))
        # Row-wise quadratic form d_i * inv(Z_M_Z) * d_i.T for every row d_i.
        quad_form=numpy.einsum('ij,jk,ik->i',x_0_x_bar,z_m_z_inv,x_0_x_bar)

        alpha=0.05
        # NOTE(review): degrees of freedom are df_resid+1, kept as in the
        # original - confirm against how df_resid is defined.
        t_val=stats.t.ppf(1-alpha/2,self.df_resid+1)

        # 1.0/nobs keeps true division even for an integer nobs on Python 2.
        se_e=self.s_y*numpy.sqrt(1+1.0/self.nobs+quad_form)
        half_width=t_val*se_e

        return pd.DataFrame({'upper_pred':y_hat+half_width,
                             'lower_pred':y_hat-half_width,
                             'y_hat':y_hat,
                             'percent_ci':100*2*half_width/numpy.abs(y_hat)},
                            columns=['upper_pred','lower_pred','y_hat','percent_ci'])
示例#2
0
文件: lin_model.py 项目: ejokeeffe/ML
    def get_confidence_intervals_for_coefs(self):
        """

        Build the upper and lower 95% confidence bounds for the fitted
        coefficients, one row per variable named in self.independent_.

        Each bound is b_k +/- t*self.se[k] (self.se[k] is presumably the
        standard error of coefficient k), which follows from

        t_k=(b_k-beta_k)/sqrt(s^2 * Skk)

        Returns a DataFrame indexed by variable name with the columns
        'upper', 'lower' and 'b' (the coefficient itself).

        """
        significance=0.05
        t_crit=stats.t.ppf(1-significance/2,self.df_resid+1)

        n_coefs=len(self.coef_)
        bounds=pd.DataFrame({'upper':numpy.zeros(n_coefs),
                             'lower':numpy.zeros(n_coefs),
                             'b':self.coef_},
                            index=self.independent_)

        # One half-width per variable, read positionally from self.se.
        half_widths=t_crit*numpy.array(
            [self.se[pos] for pos in range(len(self.independent_))])
        centres=bounds['b'].values

        bounds['upper']=centres+half_widths
        bounds['lower']=centres-half_widths
        return bounds
示例#3
0
文件: lin_model.py 项目: ejokeeffe/ML
    def get_confidence_interval_for_mean(self,X=[]):
        """

        Calculates the 95% confidence interval of the fitted mean for each
        datapoint, given a model fit. This is the confidence interval of the
        model, not the prediction interval.

        For every row w_i of X the variance of the fitted mean is

            s_c_2 = s_y^2 * w_i*inv(X_dash_X)*w_i.T

        and the interval is y_hat +/- t*sqrt(s_c_2).

        X is a DataFrame or array-like. A DataFrame is reduced to the
        columns in self.independent_; anything else is used as given.

        Returns a DataFrame with the columns 'y_hat', 'upper_mean' and
        'lower_mean'.

        """
        if isinstance(X,pd.core.frame.DataFrame):
            X=X[self.independent_]
        y_hat=self.predict(X)
        w=numpy.atleast_2d(numpy.asarray(X,dtype=float))

        x_dash_x_inv=inv(numpy.asarray(self.X_dash_X,dtype=float))
        # Only the diagonal of w*inv(X_dash_X)*w.T is needed, so compute the
        # row-wise quadratic forms directly instead of the full n x n product.
        s_c_2=numpy.power(self.s_y,2)*numpy.einsum('ij,jk,ik->i',w,x_dash_x_inv,w)

        # 95% confidence interval, so alpha = 0.05
        alpha=0.05
        # NOTE(review): degrees of freedom are df_resid+1, kept as in the
        # original - confirm against how df_resid is defined.
        t_val=stats.t.ppf(1-alpha/2,self.df_resid+1)
        half_width=t_val*numpy.sqrt(s_c_2)
        upper=y_hat+half_width
        lower=y_hat-half_width

        df=pd.DataFrame({'y_hat':y_hat,'upper_mean':upper,'lower_mean':lower})
        return (df)