Example #1
File: lin_model.py Project: ejokeeffe/ML
def test_pred_interval(show_plot=False):
    from ml_ext import examples
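    #plt, sns, numpy and logging are assumed to be imported at module level (the names are used below)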
    (coefs,df)=examples.gen_simplemodel_data(n=50,k=3)
    df.sort_values('X1',inplace=True)  #DataFrame.sort was removed in pandas 0.20+
    lr=LinModel()
    X=df[df.columns[df.columns!='y']]
    y=df.y


    lr.fit(X=X,y=y)
    lr.summary()
    df_ci=lr.get_confidence_interval_for_mean(X)
    df_pi=lr.get_prediction_interval(X)

    #Now use statsmodels to compare
    from statsmodels.sandbox.regression.predstd import wls_prediction_std
    import statsmodels.api as sm
    re = sm.OLS(y, X).fit()
    prstd, iv_l, iv_u = wls_prediction_std(re)
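    #wls_prediction_std returns the prediction standard error together with the lower
    #and upper bounds of the (default 95%) prediction interval for each observation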

    if show_plot:
        (fig,ax)=plt.subplots(nrows=2,ncols=1,figsize=[14,12])

        cols=sns.color_palette('husl',n_colors=4)
        ax[0].scatter(X.X1,y,label='y',color=cols[3],alpha=0.4)
        
        ax[0].plot(X.X1,df_pi['upper_pred'],label='pred',color=cols[1],alpha=0.5)
        ax[0].plot(X.X1,df_pi['lower_pred'],color=cols[1],alpha=0.5)
        ax[0].plot(X.X1,df_ci['upper_mean'],color=cols[2],alpha=0.5)
        ax[0].plot(X.X1,df_ci['lower_mean'],label='mean_ci',color=cols[2],alpha=0.5)
        ax[0].scatter(X.X1,df_pi['y_hat'],label='y_hat',color=cols[0],alpha=0.5)
        ax[0].legend(loc='best')

        ax[1].scatter(X.X1,y,label='y',color=cols[3],alpha=0.4)
        ax[1].scatter(X.X1,df_ci['y_hat'],label='y_hat',color=cols[0],alpha=0.5)
        ax[1].plot(X.X1,iv_u,label='wls',color=cols[1],alpha=0.5)
        ax[1].plot(X.X1,iv_l,color=cols[1],alpha=0.5)
        ax[1].legend(loc='best')

    #get the % difference between the upper bounds from each and check they agree to within 0.1%
    overall_diff=100*numpy.abs(numpy.sum(iv_u-df_pi['upper_pred']))/numpy.sum(iv_u)
    logging.debug("Overall % difference in prediction ranges for upper bound: {}".format(overall_diff))
    assert overall_diff<0.1
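
The same comparison can also be made against statsmodels' newer prediction API. A minimal
sketch, assuming the same X, y and df_pi as in the test above (the compare_upper_bounds
helper name is hypothetical):

import numpy
import statsmodels.api as sm

def compare_upper_bounds(X, y, df_pi, alpha=0.05):
    """Hypothetical helper: % difference between this module's upper prediction
    bound and the one statsmodels reports for the same observations."""
    res = sm.OLS(y, X).fit()
    #summary_frame holds obs_ci_lower/obs_ci_upper (prediction interval) and
    #mean_ci_lower/mean_ci_upper (confidence interval for the mean) per observation
    frame = res.get_prediction(X).summary_frame(alpha=alpha)
    return 100 * numpy.abs(numpy.sum(frame['obs_ci_upper'] - df_pi['upper_pred'])) \
        / numpy.sum(frame['obs_ci_upper'])
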
Example #2
File: lin_model.py Project: ejokeeffe/ML
    def fit(self, X, y, n_jobs=1):
        """
        y can be series or array
        X can be dataframe or ndarray (N datapoints x M features)
        """
        self = super(LinModel, self).fit(X, y, n_jobs)


        self.nobs=X.shape[0]
        self.nparams=X.shape[1]
        #remove an extra 1 for the alpha (k-1)
        self.df_model=X.shape[1]-1
        #(n-k-1) - we always assume an alpha is present
        self.df_resid=self.nobs-X.shape[1]-1
        #standard error of the regression 
        y_bar=y.mean()
        y_hat=self.predict(X)

        self.raw_data=X
        self.training=y
        # logging.debug(X)
        self.fittedvalues=y_hat
        #explained sum of squares
        SSE=numpy.sum([numpy.power(val-y_bar,2) for val in y_hat])
        e=numpy.matrix(y-y_hat).T
        self.resid=numpy.ravel(e)
        # logging.debug(y_bar)
        # logging.debug(y)
        SST=numpy.sum([numpy.power(val-y_bar,2) for val in y])
        SSR=numpy.sum([numpy.power(x,2) for x in e])
        self.ssr=SSR
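        #naming note: SSE here is the *explained* sum of squares and SSR the *residual*
        #sum of squares, so SST = SSE + SSR whenever the model includes an intercept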
        #print(SSR)
        
        #mean squared error of the residuals (unbiased)
        #square root of this is the standard error of the regression
        s_2 = SSR / (self.df_resid+1)
        self.s_y=numpy.sqrt(s_2)
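        #df_resid + 1 = n - k (k counting every column of X, intercept included),
        #so s_2 = SSR/(n - k) is the usual unbiased variance estimate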
        self.RMSE_pc=metrics.get_RMSE_pc(y,y_hat)
        # logging.debug("s_y = {}".format(self.s_y))

        #Also get the means of the independent variables
        if isinstance(X,pd.core.frame.DataFrame):
            #assume the intercept column is called 'alpha'
            self.X_bar=X[X.columns[X.columns!='alpha']].mean()
            Z=numpy.matrix(X[X.columns[X.columns!='alpha']])
        else:
            #assume the intercept is the first column (X is an ndarray here, so no .values)
            self.X_bar=numpy.mean(X,axis=0)[1:]
            Z=numpy.matrix(X[:,1:])
        
        i_n=numpy.matrix(numpy.ones(self.nobs))
        #centering matrix M_0 = I - (1/n)*J; i_n is a row vector, so the all-ones
        #matrix J is i_n.T*i_n, and 1.0/n avoids numpy's integer negative-power error
        M_0=numpy.matrix(numpy.eye(self.nobs))-(1.0/self.nobs)*i_n.T*i_n
        self.Z_M_Z=Z.T*M_0*Z
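        #Z'M_0*Z is the centred scatter matrix sum_i (z_i - z_bar)(z_i - z_bar)' of the
        #non-intercept regressors (presumably consumed by get_confidence_interval_for_mean
        #and get_prediction_interval)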
        #standard error of each estimator b_k: sqrt of the diagonal of s^2*(X'X)^-1
        X_mat=numpy.matrix(X)   #works for both DataFrame and ndarray inputs
        self.X_dash_X=X_mat.T*X_mat
        se=numpy.sqrt(numpy.diagonal(s_2 * numpy.linalg.inv(self.X_dash_X)))

        self.se= se
        self.t = self.coef_ / se
        self.p = 2 * (1 - stats.t.cdf(numpy.abs(self.t), y.shape[0] - X.shape[1]))
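        #two-sided p-values for H0: beta_k = 0, using a t distribution with
        #n - k degrees of freedom (y.shape[0] - X.shape[1] == self.df_resid + 1)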

        self.independent_ = []
        if isinstance(X,pd.DataFrame):
            self.independent_=X.columns.values
        #t_val=stats.t.ppf(1-0.05/2,y.shape[0] - X.shape[1])

        
        
        #R2 - 1-SSR/SST
        self.rsquared=1-SSR/SST
        #adjusted r2
        #1-[(1-R2)(n-1)/(n-k-1)]
        self.rsquared_adj=1-(((1-self.rsquared)*(self.nobs-1))/self.df_resid)
        #f-value
        f_value=(self.rsquared/(self.df_model))/\
            ((1-self.rsquared)/(self.df_resid+1))
        self.f_stat=f_value
        #upper-tail probability (survival function), not the density, gives the p-value
        self.f_pvalue=stats.f.sf(f_value,self.df_model,self.df_resid+1)
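
As a sanity check, the statistics computed in fit can be compared against a statsmodels
OLS fit on the same data. A minimal sketch, assuming X already contains the intercept
('alpha') column and that LinModel lives at the import path shown (both assumptions,
not confirmed by the source):

import numpy
import statsmodels.api as sm
from ml_ext import examples
from ml_ext.lin_model import LinModel   #assumed import path for this class

(coefs, df) = examples.gen_simplemodel_data(n=50, k=3)
X = df[df.columns[df.columns != 'y']]
y = df.y

lm = LinModel()
lm.fit(X=X, y=y)
res = sm.OLS(y, X).fit()

#if the assumptions above hold, these should agree to numerical precision
assert numpy.isclose(lm.rsquared, res.rsquared)
assert numpy.isclose(lm.f_stat, res.fvalue)
assert numpy.allclose(lm.se, res.bse)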