# Imports implied by the aliases used in this snippet (np, npr, st).
import numpy as np
import numpy.random as npr
import statsmodels.api as st

def errors_sampling(self):
    beta_boot = []
    res_boot = []
    for b in range(self.B):
        # resample row indices with replacement
        ind = npr.randint(0, len(self.X), len(self.X))
        sample_X = self.X[ind, :]
        sample_resid = self.resid[ind]
        sample_Y = np.zeros(len(self.X))
        if self.method == "linear":
            for i in range(len(self.X)):
                # rebuild the response from the fit plus a resampled residual
                sample_Y[i] = np.dot(sample_X[i, :], self.beta) + np.sqrt(self.var) * sample_resid[i]
            model_sample = st.GLM(sample_Y, sample_X, st.families.Gaussian())
        if self.method == "logistic":
            for i in range(len(self.X)):
                exp_eta = np.exp(np.dot(sample_X[i, :], self.beta))
                p = exp_eta / (1 + exp_eta)  # fitted probability
                # perturb the probability with a resampled Pearson residual,
                # scaled by the Bernoulli standard deviation sqrt(p*(1-p))
                cutoff = p + np.sqrt(p * (1 - p)) * sample_resid[i]
                sample_Y[i] = 1 if cutoff > 0.5 else 0
            model_sample = st.GLM(sample_Y, sample_X, st.families.Binomial())
        res_sample = model_sample.fit()
        beta_sample = res_sample.params
        if self.method == "linear":
            resid_sample = res_sample.resid_deviance
        if self.method == "logistic":
            resid_sample = res_sample.resid_pearson
        sd_sample = np.std(resid_sample)
        beta_boot.append(beta_sample)
        res_boot.append(sd_sample)

    return beta_boot
def bootstrap_H0_CS(self, k, hyp):
    S_b = np.zeros(self.B)
    # STEP 1: estimate the parameters and errors under H0
    gamma, Residus, X_H0, vrais, y_prime = self.Estim_H0(k, hyp)
    # STEP 2: resample the residuals at random
    test_ES = []
    for b in range(self.B):
        ind = npr.randint(0, len(Residus), len(Residus))
        # STEP 3: estimate the betas and variances by fitting a regression on the bootstrapped sample
        if self.method == "linear":
            model_sample = st.GLM(y_prime[ind], X_H0[ind, :], st.families.Gaussian())
        if self.method == "logistic":
            model_sample = st.GLM(y_prime[ind], X_H0[ind, :], st.families.Binomial())
        res_sample = model_sample.fit()
        beta_sample = res_sample.params
        # STEP 4: store the estimated betas and errors in a list
        X_sub = X_H0.copy()  # [ind, :]
        X = self.X.copy()  # [ind, :]
        Y_sub = self.y.copy()  # [ind]
        if self.method == "linear":
            test_ES.append(self.Fisher(X, X_sub, Y_sub, y_prime, beta_sample, self.beta))
        if self.method == "logistic":
            test_ES.append(-2 * (res_sample.llf - vrais))
    return test_ES
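# A hedged usage sketch (not from the source): turning the statistics returned by
# bootstrap_H0_CS into a bootstrap p-value. `model` stands for a fitted instance of
# this class and `test_obs` for the observed test statistic on the original sample;
# both names are hypothetical.
#
#     test_ES = model.bootstrap_H0_CS(k=1, hyp=0)
#     p_value = np.mean(np.array(test_ES) >= test_obs)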

        
        
def bootstrap_H0_ES(self, k, hyp):
    # STEP 1: estimate the parameters and errors under H0
    gamma, Residus, X_H0, vrais, y_prime = self.Estim_H0(k, hyp)
    var_H0 = (1 / (len(X_H0) - X_H0.shape[1] - 1)) * np.sum(Residus**2)
    # STEP 2: resample the residuals at random
    test_ES = []
    y_hat_list = []
    for b in range(self.B):
        ind = npr.randint(0, len(Residus), len(Residus))
        # STEP 3: rebuild the response under H0 from the bootstrapped residuals
        y_hat = np.zeros(len(X_H0))
        m = 0
        for i in ind:
            if self.method == "linear":
                y_hat[m] = np.dot(X_H0[i, :], gamma) + np.sqrt(var_H0) * Residus[i]
            if self.method == "logistic":
                exp_eta = np.exp(np.dot(X_H0[i, :], gamma))
                p = exp_eta / (1 + exp_eta)  # fitted probability under H0
                cutoff = p + np.sqrt(p * (1 - p)) * Residus[i]
                y_hat[m] = 1 if cutoff > 0.5 else 0
            m = m + 1
        y_hat_list.append(y_hat)
        if self.method == "logistic":
            model_sample = st.GLM(y_hat, X_H0[ind, :], st.families.Binomial())
            res_sample = model_sample.fit()
            beta_sample = res_sample.params
        # STEP 4: store the estimated betas and errors in a list
        if self.method == "linear":
            test_ES.append(self.Fisher(self.X, X_H0, y_hat, gamma, self.beta))
        if self.method == "logistic":
            test_ES.append(-2 * (res_sample.llf - vrais))
    return test_ES
def case_sampling(self):
    beta_boot = []
    sd_hat = np.zeros(self.B)
    for b in range(self.B):
        # resample full (X, y) cases with replacement
        ind = npr.randint(0, len(self.X), len(self.X))
        sample_X = self.X[ind, :]
        sample_Y = self.y[ind]
        if self.method == "linear":
            model_sample = st.GLM(sample_Y, sample_X, st.families.Gaussian())
        if self.method == "logistic":
            model_sample = st.GLM(sample_Y, sample_X, st.families.Binomial())
        res_sample = model_sample.fit()
        beta_sample = res_sample.params
        beta_boot.append(beta_sample)
        sd_hat[b] = np.std(self.X[ind, :] / np.sqrt(np.var(self.X[ind, :]) / np.mean(self.y[ind])**2
                           + self.X[ind, :] * np.var(self.y[ind]) / np.mean(self.y[ind])**4))
    return beta_boot, sd_hat
def __init__(self, X, y, method, alpha=0.05, B=1000):
    self.X = X  # !!! remove y !!!
    self.y = y
    self.alpha = alpha
    self.B = B
    self.method = method
    if method == "linear":
        Reg = st.GLM(y, X, st.families.Gaussian())
    if method == "logistic":
        Reg = st.GLM(y, X, st.families.Binomial())
    results = Reg.fit()
    self.results = results
    # regression outputs
    self.beta = results.params
    if method == "linear":
        self.resid = results.resid_deviance
    if method == "logistic":
        self.resid = results.resid_pearson
    self.var = (1 / (len(X) - X.shape[1] - 1)) * np.sum(self.resid**2)
    self.y_pred = results.fittedvalues
    self.std_beta = results.bse
def Estim_H0(self, k, hyp):
    y = self.y.copy()
    if hyp != 0:
        # shift the response by the hypothesized coefficient for column k
        y = y - hyp * self.X[:, k]

    # drop column k from the design matrix to fit the model under H0
    X = np.delete(self.X, k, axis=1).copy()
    if self.method == "linear":
        model = st.families.Gaussian()
        Reg = st.GLM(y, X, model)
        results = Reg.fit()
        Beta = results.params
        resid = results.resid_deviance
        vraise = 0
    if self.method == "logistic":
        model = st.families.Binomial()
        Reg = st.GLM(y, X, model)
        results = Reg.fit()
        Beta = results.params
        resid = results.resid_pearson
        vraise = results.llf
    return Beta, resid, X, vraise, y
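# A minimal usage sketch, assuming the methods above live in a class (hypothetically
# named BootstrapGLM here; the real class name is not shown in this excerpt) and the
# synthetic data below is made up for illustration.
def demo_bootstrap():
    npr.seed(0)
    n, p = 100, 3
    X = np.column_stack([np.ones(n), npr.randn(n, p - 1)])  # design with intercept
    beta_true = np.array([1.0, 0.5, -0.3])
    y = X @ beta_true + npr.randn(n)
    model = BootstrapGLM(X, y, method="linear", B=200)  # hypothetical class name
    beta_boot = model.errors_sampling()  # residual bootstrap of the coefficients
    # 95% percentile intervals from the bootstrap distribution
    beta_ci = np.percentile(np.array(beta_boot), [2.5, 97.5], axis=0)
    return beta_ci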
Example #7
# Imports implied by the aliases used in this snippet (np, pd, patsy, scipy, sreg, sm).
import numpy as np
import pandas as pd
import patsy
import scipy.stats
import statsmodels.api as sreg  # assumed alias; GLM is exposed on statsmodels.api
import statsmodels as sm
import statsmodels.genmod.families.family  # makes sm.genmod.families.family resolvable


def logistic_regression(df,
                        model,
                        groupby=None,
                        compute_cpd=True,
                        standardize=False):

    # should we use group by?
    usegroupby = groupby is not None

    # If we're using group by, find the unique values to group by.
    if usegroupby:
        gb_u = df[groupby].drop_duplicates()  # unique values of the groupby variables
        ncond = gb_u.shape[0]
    else:  # otherwise we're not using group by
        ncond = 1

    mout = []
    for i in range(ncond):

        # get the subset of data given by the groupby variable;
        # otherwise use the entire dataframe.
        if usegroupby:
            thisdf = df.loc[np.sum(df[groupby] == gb_u.iloc[i, :], axis=1) ==
                            len(groupby)]
        else:
            thisdf = df

        # convert data into regressand (y) and regression matrix (X) based on the model formula.
        y, X = patsy.dmatrices(model, thisdf, return_type='dataframe')
        if standardize:
            for c in X.columns:
                if c != 'Intercept':
                    X[c] = scipy.stats.mstats.zscore(X[c])
        # create and fit the model object
        mdl = sreg.GLM(endog=y,
                       exog=X,
                       family=sm.genmod.families.family.Binomial())
        thismout = mdl.fit()
        thismout.bic = thismout.deviance + np.log(X.shape[0]) * len(
            thismout.params)
        thismout.rank = np.linalg.matrix_rank(X)
        thismout.npar = X.shape[1]
        thismout.fullrank = thismout.rank == thismout.npar
        # placeholder for computing the coefficient of partial determination
        if compute_cpd:
            pass
        else:
            pass

        # store results
        mout.append(thismout)

    # convert output from the GLMResults objects into a dictionary, which is later
    # converted into a pandas table.
    mout_dict = {
        'bic': [m.bic for m in mout],
        'deviance': [m.deviance for m in mout],
        'df_model': [m.df_model for m in mout],
        'df_resid': [m.df_resid for m in mout],
        'fittedvalues': [m.fittedvalues for m in mout],
        'llf': [m.llf for m in mout],
        'mu': [m.mu for m in mout],
        'npar': [m.npar for m in mout],
        'null_deviance': [m.null_deviance for m in mout],
        'rank': [m.rank for m in mout],
        'resid_deviance': [m.resid_deviance for m in mout],
        'scale': [m.scale for m in mout]
    }

    # flatten parameter/p-value output into one parameter per column.
    for i in range(X.shape[1]):
        mout_dict['b_' + X.columns[i]] = [m.params[i] for m in mout]
        mout_dict['p_' + X.columns[i]] = [m.pvalues[i] for m in mout]

    # add groupby information to the output data structure
    if usegroupby:
        for i in range(ncond):
            for gbcond in groupby:
                if i == 0:
                    mout_dict[gbcond] = []
                mout_dict[gbcond].append(gb_u[gbcond].iloc[i])

    # convert the dictionary into a dataframe
    return pd.DataFrame(mout_dict)
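# A hedged usage sketch (not from the source): calling logistic_regression on a toy
# DataFrame with a patsy formula, grouped by a made-up 'session' column. All column
# names below are illustrative assumptions.
def demo_logistic_regression():
    rng = np.random.default_rng(1)
    n = 200
    df = pd.DataFrame({
        'choice': rng.integers(0, 2, n),      # binary regressand
        'value': rng.standard_normal(n),      # continuous regressor
        'session': rng.integers(0, 2, n),     # grouping variable
    })
    out = logistic_regression(df,
                              model='choice ~ value',
                              groupby=['session'],
                              standardize=True)
    return out  # one row per session, with b_/p_ columns for each parameter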