Exemplo n.º 1
0
    def test_custom_models(self, sdata):
        """Smoke test: IPTW weights from a custom denominator model feed a weighted GEE.

        The denominator model is an L1-penalised ``LogisticRegression``. No assertion is
        made on the estimates; the test passes when every step runs without raising.
        """
        covariates = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
        custom_denominator = LogisticRegression(penalty='l1', C=1.0, random_state=203)

        weighter = IPTW(sdata, treatment='art', standardize='unexposed', stabilized=True)
        weighter.regression_models(covariates, custom_model_denominator=custom_denominator)
        weighter.fit()
        sdata['iptw'] = weighter.Weight

        # Weighted GEE with a binomial family and identity link, independence working correlation
        linear_risk_family = sm.families.family.Binomial(sm.families.links.identity)
        marginal_model = smf.gee('dead ~ art',
                                 sdata['id'],
                                 sdata,
                                 cov_struct=sm.cov_struct.Independence(),
                                 family=linear_risk_family,
                                 weights=sdata['iptw'])
        marginal_model.fit()
Exemplo n.º 2
0
    def outcome_model(self, model, print_results=True):
        """Fit the regression model for the outcome (also referred to as the Q-model). This must be
        specified before the fit function. If it is not, an error will be raised.

        model:
            -right-hand side of the regression formula, i.e. 'var1 + var2 + var3 + var4'. The variables
             must be columns of the pandas dataframe supplied at initialization, and the exposure
             should be among them
        print_results:
            -whether to print the fitted model summary to the terminal. Default is True
        """
        # Binary outcomes use a binomial/logit model; everything else a Gaussian/identity model
        if self.outcome_type == 'binary':
            family = sm.families.family.Binomial(sm.families.links.logit)
        else:
            family = sm.families.family.Gaussian(sm.families.links.identity)
        formula = self.outcome + ' ~ ' + model

        # Unweighted data are fit by GLM; weighted data by GEE grouped on the row index
        if self._weights is None:
            estimator = smf.glm(formula, self.gf, family=family)
        else:
            estimator = smf.gee(formula,
                                self.gf.index,
                                self.gf,
                                family=family,
                                weights=self.gf[self._weights])

        # NOTE(review): the fitted results are stored on the instance under the same name as this
        # method, so after the first call `self.outcome_model` refers to the results object.
        self.outcome_model = estimator.fit()

        # Only an explicit True triggers printing (identity check, not truthiness)
        if print_results is True:
            print(self.outcome_model.summary())
        self.model_fit = True
Exemplo n.º 3
0
def modeler(model, lists, linkdist=sm.families.family.Poisson()):
    """Fit one GEE model and append its summary statistics to the storage lists.

    The model is fit on the module-level dataframe ``df`` with ``'id'`` as the grouping
    variable and an independence working correlation. Exactly one value is appended to each
    of the first four entries of ``lists``; if anything fails, ``NaN`` is appended instead.

    Parameters
    ----------
    model : str
        Model formula passed to ``smf.gee``. It must produce a term named ``'Vac'``.
    lists : sequence of list
        Storage lists, in order: coefficient at position 1, its standard error, an indicator
        (1/0) of whether the ``'Vac'`` confidence interval contains the module-level
        ``true_direct_ve``, and the ratio of the exponentiated interval limits.
    linkdist : statsmodels family, optional
        Distribution family for the GEE. Default is Poisson.
    """
    # Both names are only read here; they are expected to exist at module level.
    global df, true_direct_ve
    try:
        ind = sm.cov_struct.Independence()
        log = smf.gee(model, 'id', df, family=linkdist, cov_struct=ind).fit()

        # Explicit positional access: the results are indexed by term name, and relying on
        # integer fallback would eventually raise and be silently recorded as NaN below.
        dvebeta = log.params.iloc[1]
        dvese = log.bse.iloc[1]

        # Confidence limits for 'Vac' (columns 0 and 1 are the lower and upper limits)
        vac_ci = log.conf_int().loc['Vac']
        dlcl, ducl = vac_ci[0], vac_ci[1]
        dciv = 1 if dlcl < true_direct_ve < ducl else 0
        dclr = np.exp(ducl) / np.exp(dlcl)

        row = (dvebeta, dvese, dciv, dclr)
    except Exception:
        # Best effort: a failed fit (e.g. non-convergence) is recorded as NaN so the storage
        # lists stay aligned. KeyboardInterrupt/SystemExit are deliberately not caught.
        row = (np.nan, np.nan, np.nan, np.nan)

    for position, value in enumerate(row):
        lists[position].append(value)
Exemplo n.º 4
0
    def outcome_model(self, model, print_results=True):
        """Fit the regression model for the outcome (also referred to as the Q-model). This must be
        specified before the fit function. If it is not, an error will be raised.

        Parameters
        ----------
        model : str
            Right-hand side of the regression formula. The variables must be columns of the pandas
            dataframe supplied at initialization, and the exposure should be among them,
            i.e. 'art + age + male'
        print_results : bool, optional
            Whether to print the fitted model summary to the terminal. Default is True
        """
        # Any outcome type other than 'binary' or 'normal' falls through to Poisson
        family_by_type = {
            'binary': sm.families.family.Binomial,
            'normal': sm.families.family.Gaussian,
        }
        family = family_by_type.get(self.outcome_type, sm.families.family.Poisson)()
        formula = self.outcome + ' ~ ' + model

        # Unweighted data are fit by GLM; weighted data by GEE grouped on the row index
        if self._weights is None:
            estimator = smf.glm(formula, self.gf, family=family)
        else:
            estimator = smf.gee(formula,
                                self.gf.index,
                                self.gf,
                                family=family,
                                weights=self.gf[self._weights])
        self._outcome_model = estimator.fit()

        if print_results:
            print(self._outcome_model.summary())
Exemplo n.º 5
0
    def censoring_model(self, model, restriction=None, print_results=True):
        """Fit the regression model for censoring. Specifying this model is optional, but is recommended
        when censoring occurs in your data set. Otherwise, you will be assuming non-informative censoring

        Parameters
        ----------
        model:
            Right-hand side of the regression formula for the '__uncensored__' indicator. The variables
            must be columns of the pandas dataframe supplied at initialization. Format follows patsy
            standards. For example) 'var1 + var2 + var3 + var4'
        restriction : str, optional
            Expression used to restrict the population that the regression model is fit to. Useful for
            Intent-to-Treat model fitting. The pandas dataframe must be referred to as 'g'.
            For example) "g['art']==1"
        print_results : bool, optional
            Whether to print the fitted model summary to the terminal. Default is True
        """
        # The local name `g` is part of the contract: `restriction` is evaluated against it.
        g = self.gf.copy()
        if restriction is not None:
            # NOTE: eval executes the supplied string as Python; only pass trusted expressions
            g = g.loc[eval(restriction)].copy()

        family = sm.families.family.Binomial()
        formula = '__uncensored__ ~ ' + model

        if self._weights is None:
            # Unweighted g-formula: ordinary GLM
            fitted = smf.glm(formula, g, family=family).fit()
        else:
            # Weighted g-formula: GEE grouped on the id variable
            fitted = smf.gee(formula,
                             self.idvar,
                             g,
                             weights=g[self._weights],
                             family=family).fit()
        self.cens_model = fitted

        if print_results:
            print(self.cens_model.summary())
        self._censor_model_fit = True
Exemplo n.º 6
0
    def test_match_sas_smr_u_stabilized(self, sdata):
        """Stabilized IPTW standardized to the unexposed should reproduce the SAS reference values."""
        expected_rd = -0.080048197
        expected_rd_ci = -0.153567335, -0.006529058

        covariates = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
        weighter = IPTW(sdata, treatment='art', standardize='unexposed', stabilized=True)
        weighter.regression_models(covariates)
        weighter.fit()
        sdata['iptw'] = weighter.Weight

        # Weighted GEE with a binomial family and identity link, independence working correlation
        linear_risk_family = sm.families.family.Binomial(sm.families.links.identity)
        fitted = smf.gee('dead ~ art',
                         sdata['id'],
                         sdata,
                         cov_struct=sm.cov_struct.Independence(),
                         family=linear_risk_family,
                         weights=sdata['iptw']).fit()

        # Position 1 is the 'art' term (position 0 is the intercept)
        npt.assert_allclose(fitted.params[1], expected_rd, rtol=1e-5)
        observed_ci = (fitted.conf_int()[0][1], fitted.conf_int()[1][1])
        npt.assert_allclose(observed_ci, expected_rd_ci, rtol=1e-4)
Exemplo n.º 7
0
    def exposure_model(self, model, restriction=None, print_results=True):
        """Fit the regression model for the exposure. This is used for natural course estimation of the
        Monte Carlo g-formula. This must be specified before calling the fit function.

        Parameters
        ----------
        model : str
            Right-hand side of the regression formula for the exposure. The variables must be columns
            of the pandas dataframe supplied at initialization. Format follows patsy standards
            For example) 'var1 + var2 + var3 + var4'
        restriction : str, optional
            Expression used to restrict the population that the regression model is fit to. Useful for
            Intent-to-Treat model fitting. The pandas dataframe must be referred to as 'g'.
            For example) "g['art']==1"
        print_results : bool, optional
            Whether to print the fitted model summary to the terminal. Default is True
        """
        # The local name `g` is part of the contract: `restriction` is evaluated against it.
        g = self.gf.copy()
        if restriction is not None:
            # NOTE: eval executes the supplied string as Python; only pass trusted expressions
            g = g.loc[eval(restriction)].copy()

        family = sm.families.family.Binomial()
        formula = self.exposure + ' ~ ' + model

        if self._weights is None:
            # Unweighted g-formula: ordinary GLM
            fitted = smf.glm(formula, g, family=family).fit()
        else:
            # Weighted g-formula: GEE grouped on the id variable
            fitted = smf.gee(formula,
                             self.idvar,
                             g,
                             weights=g[self._weights],
                             family=family).fit()
        self.exp_model = fitted

        if print_results:
            print(self.exp_model.summary())
        self._exposure_model_fit = True
Exemplo n.º 8
0
    def test_match_sas_unstabilized(self, sdata):
        """Unstabilized IPTW should reproduce the SAS weight total, risk difference and interval."""
        expected_weight_sum = 1038.051
        expected_rd = -0.081519085
        expected_rd_ci = -0.156199938, -0.006838231

        covariates = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
        weighter = IPTW(sdata, treatment='art', stabilized=False)
        weighter.regression_models(covariates)
        weighter.fit()
        sdata['iptw'] = weighter.Weight

        # The weight total is compared over rows with no missing values
        observed_weight_sum = np.sum(sdata.dropna()['iptw'])
        npt.assert_allclose(observed_weight_sum, expected_weight_sum, rtol=1e-4)

        # Weighted GEE with a binomial family and identity link, independence working correlation
        linear_risk_family = sm.families.family.Binomial(sm.families.links.identity)
        fitted = smf.gee('dead ~ art',
                         sdata['id'],
                         sdata,
                         cov_struct=sm.cov_struct.Independence(),
                         family=linear_risk_family,
                         weights=sdata['iptw']).fit()

        # Position 1 is the 'art' term (position 0 is the intercept)
        npt.assert_allclose(fitted.params[1], expected_rd, rtol=1e-5)
        observed_ci = (fitted.conf_int()[0][1], fitted.conf_int()[1][1])
        npt.assert_allclose(observed_ci, expected_rd_ci, rtol=1e-4)
Exemplo n.º 9
0
    def exposure_model(self, model, restriction=None, print_results=True):
        """Fit the logistic regression model for the exposure. This must be specified before the fit
        function. If it is not, an error will be raised.

        model:
            -variables to include in the model for predicting the exposure. Must be contained within the
             input pandas dataframe when initialized. Format is the same as the functional form
             Example) 'var1 + var2 + var3 + var4'
        restriction:
            -used to restrict the population to fit the logistic regression model to. Useful for
             Intent-to-Treat model fitting. The pandas dataframe must be referred to as 'g'
             Example) "g['art']==1"
        print_results:
            -whether to print the logistic regression results to the terminal. Default is True
        """
        # The local name `g` matters: `restriction` is evaluated as Python code in this scope.
        g = self.gf.copy()
        if restriction is not None:
            # NOTE: eval executes the supplied string; only pass trusted expressions
            g = g.loc[eval(restriction)].copy()
        linkdist = sm.families.family.Binomial(sm.families.links.logit)

        if self._weights is None:  # Unweighted g-formula
            self.exp_model = smf.glm(self.exposure + ' ~ ' + model,
                                     g,
                                     family=linkdist).fit()
        else:  # Weighted g-formula
            # GEE is grouped on the id variable and weighted by the configured weight column
            self.exp_model = smf.gee(self.exposure + ' ~ ' + model,
                                     self.idvar,
                                     g,
                                     weights=g[self._weights],
                                     family=linkdist).fit()

        if print_results:
            print(self.exp_model.summary())
        # Flag that the exposure model has been fit
        self._exposure_model_fit = True
Exemplo n.º 10
0
def RR(formula, idvar, df, printOutput=True):
    """Relative risk regression for dichotomous outcomes.

    Fits a GEE with a working Poisson family (the original author describes this as using an
    empirical, "robust", variance estimator) and exponentiates the coefficients and their
    confidence limits. Nothing is returned; the results are stored as function attributes:
    ``RR.rrs`` (relative risks) and ``RR.ci`` (limits, columns 'LCL' and 'UCL'), each with
    the first row of the fitted table dropped.

    Arguments:
    1) formula - a formula expression for the model.
    2) idvar - an identifier for each independent observation of the data (typically a row).
    3) df - the pandas dataframe.
    4) printOutput - whether the function should print the output.

    Example code:
        import pandas as pd
        carrot = pd.read_stata("https://stats.idre.ucla.edu/stat/stata/faq/eyestudy.dta")
        RR("lenses ~ carrot + gender + latitude", "id", carrot)
        # Note: this choice of reference category is different than the IDRE analysis

    References:
        Lumley, T., Kronmal, R., & Ma, S. (2006).
        Relative risk regression in medical research:
        models, contrasts, estimators, and algorithms.
    """
    fitted = smf.gee(formula, idvar, df, family=sm.families.Poisson()).fit()
    if printOutput:
        print("Relative Risk Regression")
        print(
            "-----------------------------------------------------------------------------------"
        )
        print(fitted.summary())

    # A Poisson working model can produce fitted values above one; report how many
    predicted = fitted.fittedvalues
    if printOutput:
        print("Additional diagnostics:")
        n_above_one = sum(predicted > 1)
        print(n_above_one,
              "observations have fitted probabilities greater than one")
        print((n_above_one / len(predicted)) * 100,
              "% of observations have fitted probabilities greater than one")
        print(
            "=============================================================================="
        )
        print("Relative Risk:")

    # Exponentiated coefficients, first row dropped
    rr_table = np.exp(fitted.params).to_frame().rename(columns={0: 'RR'})
    RR.rrs = rr_table.drop(rr_table.index[[0]])
    if printOutput:
        print(RR.rrs)
        print(
            "------------------------------------------------------------------------------"
        )
        print("95% Confidence Intervals for Relative Risk:")

    # Exponentiated confidence limits, first row dropped
    ci_table = np.exp(fitted.conf_int().rename(columns={0: 'LCL', 1: 'UCL'}))
    RR.ci = ci_table.drop(ci_table.index[[0]])
    if printOutput:
        print(RR.ci)
        print(
            "=============================================================================="
        )
Exemplo n.º 11
0
def test_missing():
    """Formula-based GEE must drop rows with missing values like a manual ``dropna`` does.

    Regression test for gh-1877: a model built from a formula on data containing NaN is
    compared against a model built directly from the same data with the NaN rows removed.
    """
    # gh-1877
    data = [['id', 'al', 'status', 'fake', 'grps'],
            ['4A', 'A', 1, 1, 0],
            ['5A', 'A', 1, 2.0, 1],
            ['6A', 'A', 1, 3, 2],
            ['7A', 'A', 1, 2.0, 3],
            ['8A', 'A', 1, 1, 4],
            ['9A', 'A', 1, 2.0, 5],
            ['11A', 'A', 1, 1, 6],
            ['12A', 'A', 1, 2.0, 7],
            ['13A', 'A', 1, 1, 8],
            ['14A', 'A', 1, 1, 9],
            ['15A', 'A', 1, 1, 10],
            ['16A', 'A', 1, 2.0, 11],
            ['17A', 'A', 1, 3.0, 12],
            ['18A', 'A', 1, 3.0, 13],
            ['19A', 'A', 1, 2.0, 14],
            ['20A', 'A', 1, 2.0, 15],
            ['2C', 'C', 0, 3.0, 0],
            ['3C', 'C', 0, 1, 1],
            ['4C', 'C', 0, 1, 2],
            ['5C', 'C', 0, 2.0, 3],
            ['6C', 'C', 0, 1, 4],
            ['9C', 'C', 0, 1, 5],
            ['10C', 'C', 0, 3, 6],
            ['12C', 'C', 0, 3, 7],
            ['14C', 'C', 0, 2.5, 8],
            ['15C', 'C', 0, 1, 9],
            ['17C', 'C', 0, 1, 10],
            ['22C', 'C', 0, 1, 11],
            ['23C', 'C', 0, 1, 12],
            ['24C', 'C', 0, 1, 13],
            ['32C', 'C', 0, 2.0, 14],
            ['35C', 'C', 0, 1, 15]]

    df = pd.DataFrame(data[1:], columns=data[0])
    # Turn every fake == 1 into a missing value. `.loc` replaces the removed `.ix` indexer.
    df.loc[df.fake == 1, 'fake'] = np.nan
    mod = smf.gee('status ~ fake', data=df, groups='grps',
                  cov_struct=sm.cov_struct.Independence(),
                  family=sm.families.Binomial())

    # Reference model: drop the missing rows by hand and add an explicit constant column
    df = df.dropna()
    df['constant'] = 1

    mod2 = GEE(df.status, df[['constant', 'fake']], groups=df.grps,
               cov_struct=sm.cov_struct.Independence(),
               family=sm.families.Binomial())

    assert_equal(mod.endog, mod2.endog)
    assert_equal(mod.exog, mod2.exog)
    assert_equal(mod.groups, mod2.groups)

    res = mod.fit()
    res2 = mod2.fit()

    assert_almost_equal(res.params.values, res2.params.values)
Exemplo n.º 12
0
def test_missing():
    """Formula-based GEE must drop rows with missing values like a manual ``dropna`` does.

    Regression test for gh-1877: a model built from a formula on data containing NaN is
    compared against a model built directly from the same data with the NaN rows removed.
    """
    # gh-1877
    data = [['id', 'al', 'status', 'fake', 'grps'],
            ['4A', 'A', 1, 1, 0],
            ['5A', 'A', 1, 2.0, 1],
            ['6A', 'A', 1, 3, 2],
            ['7A', 'A', 1, 2.0, 3],
            ['8A', 'A', 1, 1, 4],
            ['9A', 'A', 1, 2.0, 5],
            ['11A', 'A', 1, 1, 6],
            ['12A', 'A', 1, 2.0, 7],
            ['13A', 'A', 1, 1, 8],
            ['14A', 'A', 1, 1, 9],
            ['15A', 'A', 1, 1, 10],
            ['16A', 'A', 1, 2.0, 11],
            ['17A', 'A', 1, 3.0, 12],
            ['18A', 'A', 1, 3.0, 13],
            ['19A', 'A', 1, 2.0, 14],
            ['20A', 'A', 1, 2.0, 15],
            ['2C', 'C', 0, 3.0, 0],
            ['3C', 'C', 0, 1, 1],
            ['4C', 'C', 0, 1, 2],
            ['5C', 'C', 0, 2.0, 3],
            ['6C', 'C', 0, 1, 4],
            ['9C', 'C', 0, 1, 5],
            ['10C', 'C', 0, 3, 6],
            ['12C', 'C', 0, 3, 7],
            ['14C', 'C', 0, 2.5, 8],
            ['15C', 'C', 0, 1, 9],
            ['17C', 'C', 0, 1, 10],
            ['22C', 'C', 0, 1, 11],
            ['23C', 'C', 0, 1, 12],
            ['24C', 'C', 0, 1, 13],
            ['32C', 'C', 0, 2.0, 14],
            ['35C', 'C', 0, 1, 15]]

    df = pd.DataFrame(data[1:], columns=data[0])
    # Turn every fake == 1 into a missing value. `.loc` replaces the removed `.ix` indexer.
    df.loc[df.fake == 1, 'fake'] = np.nan
    mod = smf.gee('status ~ fake', data=df, groups='grps',
                  cov_struct=sm.cov_struct.Independence(),
                  family=sm.families.Binomial())

    # Reference model: drop the missing rows by hand and add an explicit constant column
    df = df.dropna()
    df['constant'] = 1

    mod2 = GEE(df.status, df[['constant', 'fake']], groups=df.grps,
               cov_struct=sm.cov_struct.Independence(),
               family=sm.families.Binomial())

    assert_equal(mod.endog, mod2.endog)
    assert_equal(mod.exog, mod2.exog)
    assert_equal(mod.groups, mod2.groups)

    res = mod.fit()
    res2 = mod2.fit()

    assert_almost_equal(res.params.values, res2.params.values)
Exemplo n.º 13
0
 def trend_model(df, group_var, formula):
     """
     Trend modeling with generalized estimating equations (binomial family).

     Observations are grouped by `group_var`; per the original author this accounts for
     the dependency structure (nesting) within journal. Returns the fitted GEE results.
     """
     binomial_gee = smf.gee(formula,
                            group_var,
                            data=df,
                            family=sm.families.Binomial())
     return binomial_gee.fit()
Exemplo n.º 14
0
def generalized_estimating_equation_example():
    """Fit a Poisson GEE on the 'epil' dataset from the R package MASS and print its summary.

    Observations are grouped by 'subject' with an exchangeable working correlation.
    """
    epil = sm.datasets.get_rdataset('epil', package='MASS').data

    model = smf.gee('y ~ age + trt + base',
                    'subject',
                    epil,
                    cov_struct=sm.cov_struct.Exchangeable(),
                    family=sm.families.Poisson())
    print(model.fit().summary())
Exemplo n.º 15
0
    def outcome_model(self, model, continuous_distribution='gaussian', print_results=True):
        r"""Specify and fit the outcome model, then store predictions under both exposure levels.

        For a binary outcome this is a logistic regression model

        .. math::

            \widehat{\Pr}(Y|A,L) = logit^{-1}(\widehat{\beta_0} + \widehat{\beta_1} A + \widehat{\beta} L)

        After fitting, predictions with the exposure set to 1 and to 0 for every row are saved in
        the '_pY1_' and '_pY0_' columns of the stored dataframe.

        Parameters
        ----------
        model : str
            Independent variables to predict the outcome. For example, 'var1 + var2 + var3 + var4'
        continuous_distribution : str, optional
            Distribution to use for continuous outcomes. Options are 'gaussian' (or 'normal') for normal
            distributions and 'poisson' for Poisson distributions. Ignored for non-continuous outcomes
        print_results : bool, optional
            Whether to print the fitted model results. Default is True (prints results)

        Raises
        ------
        ValueError
            If the outcome is continuous and `continuous_distribution` is not a supported option
        """
        self._out_model = self._outcome + ' ~ ' + model

        # Choose the distribution family
        if not self._continuous_outcome:
            family = sm.families.family.Binomial()
        elif continuous_distribution in ('gaussian', 'normal'):
            family = sm.families.family.Gaussian()
        elif continuous_distribution == 'poisson':
            family = sm.families.family.Poisson()
        else:
            raise ValueError("Only 'gaussian' and 'poisson' distributions are supported")

        # Unweighted data are fit by GLM; weighted data by GEE grouped on the row index
        if self._weight_ is None:
            fitted = smf.glm(self._out_model, self.df, family=family).fit()
        else:
            fitted = smf.gee(self._out_model,
                             self.df.index,
                             self.df,
                             weights=self.df[self._weight_],
                             family=family).fit()

        if print_results:
            print('\n----------------------------------------------------------------')
            print('MODEL: ' + self._out_model)
            print('-----------------------------------------------------------------')
            print(fitted.summary())

        # Predict the outcome with everyone exposed, then with everyone unexposed
        for level, column in ((1, '_pY1_'), (0, '_pY0_')):
            counterfactual = self.df.copy()
            counterfactual[self._exposure] = level
            self.df[column] = fitted.predict(counterfactual)
        self._fit_outcome_ = True
Exemplo n.º 16
0
    def outcome_model(self, model, restriction=None, print_results=True):
        """Fit the regression model for the outcome. Must be specified before the fit function.

        Parameters
        ----------
        model:
            Right-hand side of the regression formula for the outcome. The variables must be columns
            of the pandas dataframe supplied at initialization. Format follows patsy standards
            For example) 'var1 + var2 + var3 + var4'
        restriction : str, optional
            Expression used to restrict the population that the regression model is fit to. Useful for
            Intent-to-Treat model fitting. The pandas dataframe must be referred to as 'g'.
            For example) "g['art']==1"
        print_results : bool, optional
            Whether to print the fitted model summary to the terminal. Default is True

        Raises
        ------
        ValueError
            If weights are in use together with competing events
        """
        # The local name `g` is part of the contract: `restriction` is evaluated against it.
        g = self.gf.copy()
        if restriction is not None:
            # NOTE: eval executes the supplied string as Python; only pass trusted expressions
            g = g.loc[eval(restriction)].copy()
        family = sm.families.family.Binomial()

        weighted = self._weights is not None
        if weighted and self._competing_event:
            raise ValueError(
                "The weighted MonteCarloGFormula is not supported for competing events"
            )

        formula = self.outcome + ' ~ ' + model
        if weighted:
            # Weighted g-formula: GEE grouped on the id variable
            self.out_model = smf.gee(formula,
                                     self.idvar,
                                     g,
                                     weights=g[self._weights],
                                     family=family).fit()
        elif self._competing_event:
            # Unweighted with competing events: multinomial logit
            self.out_model = sm.MNLogit.from_formula(formula, g).fit()
        else:
            # Unweighted, single event type: ordinary GLM
            self.out_model = smf.glm(formula, g, family=family).fit()

        if print_results:
            print(self.out_model.summary())
        self._outcome_model_fit = True
Exemplo n.º 17
0
    def test_match_iptw_continuous(self, cdata):
        """StochasticIPTW contrast (p=1 minus p=0) should match the IPTW marginal structural model."""
        covariates = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
        cdata = cdata.dropna().copy()

        # Reference estimate: unstabilized IPTW followed by a weighted Gaussian GEE
        weighter = IPTW(cdata, treatment='art', stabilized=False)
        weighter.regression_models(covariates)
        weighter.fit()
        cdata['iptw'] = weighter.Weight
        msm = smf.gee('cd4_wk45 ~ art',
                      cdata['id'],
                      cdata,
                      cov_struct=sm.cov_struct.Independence(),
                      family=sm.families.family.Gaussian(),
                      weights=cdata['iptw']).fit()

        # Stochastic treatment: marginal outcome with treatment probability 1.0, then 0.0
        stochastic = StochasticIPTW(cdata, treatment='art', outcome='cd4_wk45')
        stochastic.treatment_model(model=covariates, print_results=False)
        stochastic.fit(p=1.0)
        outcome_all_treated = stochastic.marginal_outcome
        stochastic.fit(p=0.0)
        outcome_none_treated = stochastic.marginal_outcome

        # Position 1 is the 'art' coefficient
        npt.assert_allclose(msm.params[1], outcome_all_treated - outcome_none_treated, atol=1e-4)
Exemplo n.º 18
0
    def fit(self, X, y=None):
        """Fit a Poisson GEE of `y` on every column of `X` except the group-by feature.

        The group-by feature is converted into a derived '<feature>_group' column that serves as
        the GEE grouping variable. The model and its results are stored on `self.mod` and
        `self.res`, and the unique values of `y` on `self.class_labels`. Returns `self`.
        """
        # Same settings as the documentation's example
        self.fam = sm.families.Poisson()
        self.ind = sm.cov_struct.Exchangeable()

        # Save the seen class labels
        self.class_labels = np.unique(y)

        group_column = self.group_by_feature + "_group"
        predictors = [name for name in X.columns if name != self.group_by_feature]
        formula_in = 'y ~ ' + ' + '.join(predictors)

        # Work on a copy so the caller's frame is not modified
        training = X.copy()
        training['y'] = y
        # Original author's intent: "roughly make ten groups".
        # NOTE(review): (v * 10) // 10 is close to floor(v) - confirm this yields the intended bins.
        training[group_column] = (training[self.group_by_feature] * 10) // 10

        self.mod = smf.gee(formula_in,
                           group_column,
                           training,
                           cov_struct=self.ind,
                           family=self.fam)
        self.res = self.mod.fit()

        return self
Exemplo n.º 19
0
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import math
import pandas as pd
import random

# data = sm.datasets.get_rdataset("epil", "MASS").data
# print(data)
# md = smf.gee("y ~ age + trt + base", "subject", data,
#              cov_struct=sm.cov_struct.Independence(),
#              family=sm.families.Poisson())
# mdf = md.fit()
# print(mdf.summary())
# Build a synthetic data set: x2 holds 100 single-element lists ([0.0] ... [99.0]) and y is
# a linear trend in that value plus a sine term scaled by a random draw from random.random().
y=[]
x2=[[xx] for xx in np.arange(0.,100.,1.)]
for kk in x2:
    # 3.14 presumably stands in for pi - confirm
    y.append(0.5*kk[0]+3+10*math.sin(kk[0]/3.14)*random.random())
panddf=pd.DataFrame({'x2':x2,'y':y})

# NOTE(review): the formula string "x2" contains no '~' (no response term) and "y" is passed
# in the groups position; the 'x2' column also holds lists rather than numbers. This looks
# unfinished compared with the commented-out example above - confirm the intended model.
md2 = smf.gee("x2", "y", panddf,
             cov_struct=sm.cov_struct.Independence(),
             family=sm.families.Poisson())
mdf2 = md2.fit()
print(mdf2.summary())
          "Smokers F", "Smokers M", "Hospital beds", "Life expectancy", "HDI")
# plot specifications: heatmap of the correlation matrix on a fixed [-1, 1] colour scale
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap="Spectral", square=True)
ax.set_xticklabels(labels, size=8, rotation=25, horizontalalignment="right")
ax.set_yticklabels(labels, size=8, rotation=25, verticalalignment="top")
plt.title("Correlation Covid-related Variables", fontsize=18)
# Save before showing: once the window opened by show() is closed the figure may already be
# released, in which case a later savefig() writes a blank image.
plt.savefig(fname="./Plots/corrmatrix.png", dpi=1080)
plt.show()
plt.close()
# gee models: Gaussian family, exchangeable working correlation, grouped by "location"
fam = sm.families.Gaussian()
ind = sm.cov_struct.Exchangeable()

# Both models share the same right-hand side; only the response differs.
gee_predictors = " + ".join([
    "stringency_index",
    "population_density",
    "median_age",
    "aged_65_older",
    "aged_70_older",
    "gdp_per_capita",
    "cardiovasc_death_rate",
    "diabetes_prevalence",
    "female_smokers",
    "male_smokers",
    "hospital_beds_per_thousand",
    "life_expectancy",
    "human_development_index",
])

# Model 1: response CFR_log
mod_gee = smf.gee("CFR_log ~ " + gee_predictors,
                  "location",
                  data=data,
                  cov_struct=ind,
                  family=fam)
result_gee = mod_gee.fit()
print(result_gee.summary())

# Model 2: response case_fatality_ratio
mod_gee2 = smf.gee("case_fatality_ratio ~ " + gee_predictors,
                   "location",
                   data=data,
                   cov_struct=ind,
                   family=fam)
result_gee2 = mod_gee2.fit()
print(result_gee2.summary())

mod_gee3 = smf.gee(
Exemplo n.º 21
0
    def fit(self, continuous_distribution='gaussian'):
        """Fit the specified marginal structural model using the calculated inverse probability of
        treatment weights.

        The final weight is the product of the treatment weights, the missing-data weights (when
        present) and the user-supplied weight column (when present). Rows with any missing value
        are dropped before fitting. Every model is a weighted GEE grouped on the row index with an
        independence working correlation.

        For a continuous outcome the results are stored in `average_treatment_effect`. Otherwise
        three models are fit and stored in `risk_difference` (identity link), `risk_ratio`
        (log link) and `odds_ratio` (default binomial link). Each table is indexed by term label.

        Parameters
        ----------
        continuous_distribution : str, optional
            Distribution for continuous outcomes: 'gaussian' (or 'normal') or 'poisson'. Only used
            when the outcome is continuous. Default is 'gaussian'

        Raises
        ------
        ValueError
            If no denominator model has been fit, no marginal structural model has been specified,
            or `continuous_distribution` is not a supported option
        """
        if self.__mdenom is None:
            raise ValueError(
                'No model has been fit to generated predicted probabilities')

        if self.ms_model is None:
            raise ValueError('No marginal structural model has been specified')

        if self._miss_flag and not self._fit_missing_:
            warnings.warn(
                "All missing outcome data is assumed to be missing completely at random. To relax this "
                "assumption to outcome data is missing at random please use the `missing_model()` "
                "function", UserWarning)

        ind = sm.cov_struct.Independence()
        full_msm = self.outcome + ' ~ ' + self.ms_model

        # Combine every available weight into a single column
        df = self.df.copy()
        full_weight = self.iptw
        if self.ipmw is not None:
            full_weight = full_weight * self.ipmw
        if self._weight_ is not None:
            full_weight = full_weight * self.df[self._weight_]
        df['_ipfw_'] = full_weight
        df = df.dropna()

        def _fit_weighted_gee(family):
            # Fit the marginal structural model with the combined weights and the given family
            return smf.gee(full_msm,
                           df.index,
                           df,
                           cov_struct=ind,
                           family=family,
                           weights=df['_ipfw_']).fit()

        def _estimate_table(fitted, estimate_label, se_label, transform=None):
            # Tabulate estimates, standard errors and 95% limits, indexed by term label.
            # `transform` (if given) is applied to the estimates and limits, not the SEs.
            table = pd.DataFrame()
            table['labels'] = np.asarray(fitted.params.index)
            table.set_index(keys=['labels'], inplace=True)

            limits = fitted.conf_int()
            estimates = np.asarray(fitted.params)
            lower = np.asarray(limits[0])
            upper = np.asarray(limits[1])
            if transform is not None:
                estimates, lower, upper = transform(estimates), transform(lower), transform(upper)

            table[estimate_label] = estimates
            table[se_label] = np.asarray(fitted.bse)
            table['95%LCL'] = lower
            table['95%UCL'] = upper
            return table

        if self._continuous_outcome:
            if continuous_distribution in ('gaussian', 'normal'):
                f = sm.families.family.Gaussian()
            elif continuous_distribution == 'poisson':
                f = sm.families.family.Poisson()
            else:
                raise ValueError(
                    "Only 'gaussian' and 'poisson' distributions are supported"
                )
            self._continuous_y_type = continuous_distribution
            self.average_treatment_effect = _estimate_table(
                _fit_weighted_gee(f), 'ATE', 'SE(ATE)')

        else:
            # Ignoring DomainWarnings from statsmodels
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', DomainWarning)

                # Estimating Risk Difference (identity link)
                f = sm.families.family.Binomial(sm.families.links.identity())
                self.risk_difference = _estimate_table(
                    _fit_weighted_gee(f), 'RD', 'SE(RD)')

                # Estimating Risk Ratio (log link); estimates and limits are exponentiated
                f = sm.families.family.Binomial(sm.families.links.log())
                self.risk_ratio = _estimate_table(
                    _fit_weighted_gee(f), 'RR', 'SE(log(RR))', transform=np.exp)

                # Estimating Odds Ratio (default link); estimates and limits are exponentiated
                f = sm.families.family.Binomial()
                self.odds_ratio = _estimate_table(
                    _fit_weighted_gee(f), 'OR', 'SE(log(OR))', transform=np.exp)
Exemplo n.º 22
0
# Notebook-style fragment: bare expressions (e.g. `gmod.rho`) only display a value in an
# interactive session, and `gmod` is rebound several times, so statement order matters.

# --- GLS with autoregressive errors on the globwarm data (defined outside this fragment) ---
globwarm = globwarm.dropna()
# Predictors are columns 1..8 by position, plus a constant
X = sm.add_constant(globwarm.iloc[:, 1:9])
# NOTE(review): rho=1 is presumably the AR order rather than a coefficient value - confirm
gmod = sm.GLSAR(globwarm.nhtemp, X, rho=1)
res = gmod.iterative_fit(maxiter=6)
gmod.rho

# The same iteration spelled out by hand: fit, re-estimate the AR coefficient from the
# residuals with Yule-Walker, and rebuild the model with the updated coefficient.
gmod = sm.GLSAR(globwarm.nhtemp, X, rho=1)
for i in range(6):
    results = gmod.fit()
    print("AR coefficients: {0}".format(gmod.rho))
    rho, sigma = sm.regression.yule_walker(results.resid, order=gmod.order)
    gmod = sm.GLSAR(globwarm.nhtemp, X, rho)

# --- Oat variety data: mixed model and GEE, both grouped by block ---
oatvar = pd.read_csv("oatvar.csv", index_col=0)
oatvar['variety'] = oatvar['variety'].astype('category')
# Copy 'yield' to 'grams' (presumably because `yield` is a Python keyword and cannot be
# used as a name in a formula - confirm)
oatvar['grams'] = oatvar['yield']
oatvar.head()

mmod = smf.mixedlm("grams ~ variety", oatvar, groups=oatvar['block']).fit()
mmod.summary()

# GEE with an exchangeable working correlation; `gmod` now holds the fitted GEE results
ind = sm.cov_struct.Exchangeable()
gmod = smf.gee("grams ~ variety", "block", oatvar, cov_struct=ind).fit()
gmod.summary()

# Summary of the covariance structure object after fitting
ind.summary()

fpe = pd.read_csv("fpe.csv", index_col=0)
fpe.head()
Exemplo n.º 23
0
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Poisson GEE on the 'epil' data set from the R package MASS, clustering on 'subject'.
# get_rdataset fetches the data set by name, so this line may need network access.
data = sm.datasets.get_rdataset('epil', package='MASS').data
fam = sm.families.Poisson()
# Exchangeable working correlation within each subject
ind = sm.cov_struct.Exchangeable()
mod = smf.gee("y ~ age + trt + base", "subject", data,
              cov_struct=ind, family=fam)

res = mod.fit()
print(res.summary())
Exemplo n.º 24
0
    rd_results.append(r_all - r_none)
    rr_results.append(r_all / r_none)

# Percentile-based 95% intervals from the risk difference / risk ratio values collected above
print('RD 95% CI:', np.percentile(rd_results, q=[2.5, 97.5]))
print('RR 95% CI:', np.percentile(rr_results, q=[2.5, 97.5]))
# IPTW: compute stabilized weights for treatment 'art', then fit weighted GEE models of 'dead' on 'art'
model = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
df['iptw'] = ze.ipw.iptw(df,
                         treatment='art',
                         model_denominator=model,
                         stabilized=True)
ind = sm.cov_struct.Independence()
# Binomial family with identity link: the 'art' coefficient is on the risk-difference scale
# NOTE(review): the link is passed as a class, not an instance -- confirm the installed statsmodels accepts this
f = sm.families.family.Binomial(sm.families.links.identity)
linrisk = smf.gee('dead ~ art',
                  df['id'],
                  df,
                  cov_struct=ind,
                  family=f,
                  weights=df['iptw']).fit()
linrisk.summary()
# Binomial family with log link: the 'art' coefficient is on the log risk-ratio scale
f = sm.families.family.Binomial(sm.families.links.log)
log = smf.gee('dead ~ art',
              df['id'],
              df,
              cov_struct=ind,
              family=f,
              weights=df['iptw']).fit()
log.summary()
# Double-Robust: set up the estimator and specify its exposure model (same covariates as the IPTW model)
sdr = SimpleDoubleRobust(df, exposure='art', outcome='dead')
sdr.exposure_model(
    'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
Exemplo n.º 25
0
    def _sequential_regression(self, treatment, tmax):
        """Hidden function that executes the sequential regression estimation for g-formula

        The long data set is reshaped to one row per individual (via self._long_to_wide), an indicator of
        following the treatment plan is built for every time point, and the outcome model (self._modelform) is
        then fit backwards through time, starting at the last time point. From the second fit onwards, the
        observed outcome is replaced by the prediction made for the next later time point wherever that
        prediction is not missing.

        Parameters
        ----------
        treatment : str
            'all' sets the exposure to 1, 'none' sets the exposure to 0. Any other string is treated as a custom
            treatment rule and is evaluated with eval(); it must refer to the DataFrame as 'g'. 'natural' is
            not implemented for this estimator
        tmax : optional
            Largest value of the time column (self.time_out) kept for estimation. None keeps every time. A
            value that does not occur in the data raises a UserWarning; rows with time <= tmax are still kept

        Returns
        -------
        Mean of the predicted outcome at the first time point. When weights were specified, the mean is
        weighted by the first time point's weights

        Raises
        ------
        ValueError
            If treatment is 'natural', or if any individual has more than one outcome event (recurrent
            outcomes are not supported)
        """
        # TODO allow option to include different estimation models for each time point or the same model
        if treatment == 'natural':
            # Thoughts: MC estimator needs natural course as a check. This should not apply to SR estimator
            raise ValueError(
                'Natural course estimation is not clear to me with Sequential Regression Estimator. '
                'Therefore, "natural" is not implemented')

        # If custom treatment, it gets evaluated here
        # SECURITY: the custom treatment string is run through eval(); never pass untrusted input as treatment.
        # The string refers to the local name 'g', so that name must stay 'g' here and in step 2.3
        g = self.gf
        if treatment not in ['all', 'none']:
            # NOTE(review): g is still self.gf itself here (no copy), so '__indicator' is also added to self.gf
            g['__indicator'] = np.where(eval(treatment), 1, 0)

        # Restricting based on tmax argument
        if tmax is not None:
            if tmax not in list(self.gf[self.time_out].unique()):
                warnings.warn(
                    "The t_max argument specifies a time that is not observed in the data. All times less than "
                    "the specified t_max argument included in the estimation procedure",
                    UserWarning)
            g = g.loc[g[self.time_out] <= tmax].copy()

        # Converting dataframe from long-to-wide for easier estimation
        column_labels = list(
            g.columns
        )  # Getting all column labels (important to match with formula)
        df = self._long_to_wide(df=g, id=self.idvar, t=self.time_out)
        linkdist = sm.families.family.Binomial()
        rt_points = sorted(list(g[self.time_out].unique()),
                           reverse=True)  # Getting all t's to backward loop
        t_points = sorted(list(g[self.time_out].unique()),
                          reverse=False)  # Getting all t's to forward loop

        # Checking for recurrent outcomes. Recurrent are not currently supported
        outcome_columns = [self.outcome + '_' + str(t) for t in t_points]
        if pd.Series(df[outcome_columns].sum(axis=1, skipna=True) > 1).any():
            raise ValueError(
                'Looks like your data has multiple outcomes. Recurrent outcomes are not currently '
                'supported')

        # Step 1: Creating indicator for individuals who followed counterfactual outcome
        treat_t_points = []
        for position, t in enumerate(t_points):
            indicator_label = '__indicator_' + str(t)
            check_label = '__check_' + str(t)

            # Following treatment strategy
            # alternative: if treat all, can do simple multiplication. if treat none, can do (1-A) simple multiplication
            if treatment == 'all':
                # 1 if exposed, 0 if unexposed, missing otherwise
                df[indicator_label] = np.where(
                    df[self.exposure + '_' + str(t)] == 0, 0, np.nan)
                df[indicator_label] = np.where(
                    df[self.exposure + '_' + str(t)] == 1, 1,
                    df[indicator_label])
            elif treatment == 'none':
                # 1 if unexposed, 0 if exposed, missing otherwise
                df[indicator_label] = np.where(
                    df[self.exposure + '_' + str(t)] == 0, 1, np.nan)
                df[indicator_label] = np.where(
                    df[self.exposure + '_' + str(t)] == 1, 0,
                    df[indicator_label])
            # custom exposure pattern: the indicator columns are not built in this loop

            treat_t_points.append(indicator_label)
            # Product of every indicator up to t and the outcome at t
            df[check_label] = df[treat_t_points +
                                 [self.outcome + '_' + str(t)]].prod(
                                     axis=1, skipna=True)

            # This following check carries forward the outcome under the counterfactual treatment
            if position > 0:
                previous_check = '__check_' + str(t_points[position - 1])
                df[check_label] = np.where(df[previous_check] == 1, 1,
                                           df[check_label])

        # Step 2: Sequential Regression Estimation
        for step, t in enumerate(rt_points):
            # 2.1) Relabel everything to match with the specified model (selecting out that timepoint is within)
            d_labels = {c + '_' + str(t): c for c in column_labels}
            g = df.filter(regex='_' + str(t)).rename(
                mapper=d_labels, axis=1).reset_index().copy()
            g[self.time_out] = t

            # 2.2) Fit the model to the observed data
            if step > 0:
                # Uses previous predicted values to estimate. rt_points runs backwards in time, so the
                # previous iteration handled the next later time point
                later_prediction = df['__pred_' + self.outcome + '_' +
                                      str(rt_points[step - 1])]
                g[self.outcome] = np.where(later_prediction.isna(),
                                           g[self.outcome], later_prediction)

            if self._weights is None:
                m = smf.glm(self.outcome + ' ~ ' + self._modelform,
                            g,
                            family=linkdist).fit()  # GLM
            else:
                m = smf.gee(self.outcome + ' ~ ' + self._modelform,
                            self.idvar,
                            g,
                            weights=df[self._weights + '_' + str(t)],
                            family=linkdist).fit()  # Weighted, so GEE
            if self._printseqregresults:
                print(m.summary())

            # 2.3) Getting Counterfactual Treatment Values
            if treatment == 'all':
                g[self.exposure] = 1
            elif treatment == 'none':
                g[self.exposure] = 0
            else:
                # Custom rule is re-evaluated against this time point's data (still named 'g')
                g[self.exposure] = np.where(eval(treatment), 1, 0)

            # Predicted values based on counterfactual treatment strategy from predicted model
            df['__pred_' + self.outcome + '_' + str(t)] = np.where(
                df[self.outcome + '_' + str(t)].isna(), np.nan, m.predict(g))
            # If followed counterfactual treatment & had outcome, then always considered to have outcome past that t
            df['__cf_' + self.outcome + '_' + str(t)] = np.where(
                (df['__check_' + str(t)] == 1), 1,
                df['__pred_' + self.outcome + '_' + str(t)])

        # Step 3) Returning estimated results
        if self._weights is None:
            return np.mean(df['__pred_' + self.outcome + '_' +
                              str(t_points[0])])
        else:
            return np.average(
                df['__pred_' + self.outcome + '_' + str(t_points[0])],
                weights=df[self._weights + '_' + str(t_points[0])])
Exemplo n.º 26
0
    def add_covariate_model(self,
                            label,
                            covariate,
                            model,
                            restriction=None,
                            recode=None,
                            var_type='binary',
                            print_results=True):
        """Add a specified regression model for time-varying confounders. Unlike the exposure and outcome models, a
        covariate model does NOT have to be specified. Additionally, *n* covariate models can be specified for *n*
        time-varying covariates. Additional models are added by repeated calls for this function with the corresponding
        covariates and predictive regression equations

        This argument is only used for the Monte Carlo g-formula. The sequential regression only requires specification
        of the outcome model.

        The model is fit immediately. Without weights, a binary covariate uses a binomial GLM and a continuous
        covariate uses GLS. With weights, both use a weighted GEE grouped by the id variable, with a binomial or
        Gaussian family respectively. Families are built with their default links (logit and identity).

        Parameters
        ----------
        label : int
            Integer label for the covariate model. Covariate models are fit in ascending order within
             TimeVaryGFormula. Must be exactly of type int
        covariate : str
            Column label for time-varying confounder to be predicted
        model : str
            Variables to include in the model for predicting the covariate. Must be contained within the input
            pandas dataframe when initialized. Format follows patsy
            For example) 'var1 + var2 + var3 + var4'
        restriction : str, optional
            Used to restrict the population to fit the regression model to. Useful for Intent-to-Treat
            model fitting. The pandas dataframe must be referred to as 'g'. For example) "g['art']==1"
            The string is run through eval(), so it must never come from an untrusted source
        recode : str, optional
            This variable is vitally important for various functional forms implemented later in models. This
            is used to run some background code to recreate functional forms as the g-formula is estimated via fit()
            For an example, let's say we have age but we want the functional form to be quadratic. For this, we
            would set the recode="g['age_sq'] = g['age']**2;" Similar to TimeFixedGFormula, 'g' must be specified as the
            DataFrame object with the corresponding indexes. Also lines of executable code should end with ';', so
            Python knows that the line ends there. The string is stored here and executed later
        var_type : str, optional
            Type of variable that the covariate is. Current options include 'binary' or 'continuous'
        print_results : bool, optional
            Whether to print the fitted regression model results to the terminal. Default is True

        Raises
        ------
        ValueError
            If label is not an int, or var_type is neither 'binary' nor 'continuous'
        """
        if type(label) is not int:
            raise ValueError('Label must be an integer')

        # Building predictive model
        # The local name must stay 'g': the restriction string refers to the DataFrame by that name
        g = self.gf.copy()
        if restriction is not None:
            g = g.loc[eval(restriction)].copy()

        if var_type not in ('binary', 'continuous'):
            raise ValueError(
                'Only binary or continuous covariates are currently supported'
            )

        formula = covariate + ' ~ ' + model
        if self._weights is None:  # Unweighted g-formula
            if var_type == 'binary':
                m = smf.glm(formula, g, family=sm.families.family.Binomial())
            else:
                m = smf.gls(formula, g)  # GLS takes no family argument
        else:  # Weighted g-formula
            if var_type == 'binary':
                linkdist = sm.families.family.Binomial()
            else:
                # Default link of the Gaussian family is the identity
                linkdist = sm.families.family.Gaussian()
            m = smf.gee(formula,
                        self.idvar,
                        g,
                        weights=g[self._weights],
                        family=linkdist)

        f = m.fit()
        if print_results:
            print(f.summary())

        # Adding to lists, it is used to predict variables later on for the time-varying...
        self._covariate_models.append(f)
        self._covariate_model_index.append(label)
        self._covariate.append(covariate)
        self._covariate_type.append(var_type)
        if recode is None:
            self._covariate_recode.append(
                'None')  # Must be string for exec() to use later
        else:
            self._covariate_recode.append(recode)
Exemplo n.º 27
0
        # Expand each of the `renshu` subjects in this cell into one row per week (3 weeks),
        # appending the rows to the long-format frame `tmp`
        for j in range(0, renshu):
            zu += 1  # running group (subject) identifier
            for shij in range(0, 3):
                # Outcome is 1 when the response code for this week is 'N', otherwise 0
                zhi = 1 if key[shij] == 'N' else 0
                temp = pd.DataFrame([{'周数': shij, '值': zhi, '组': zu}])
                temp['诊断严重程度'] = data_temp['诊断严重程度']
                temp['治疗'] = data_temp['治疗']
                # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
                tmp = pd.concat([tmp, temp])
# Optional export of the long-format data (disabled):
# tmp.to_csv('D:/结果数据_抑郁症治疗.csv',encoding='gbk')
# Recode diagnosis severity (mild -> 0, severe -> 1) and treatment (standard -> 0, new drug -> 1)
tmp['诊断严重程度'] = tmp['诊断严重程度'].replace({'轻微': 0, '严重': 1})
tmp['治疗'] = tmp['治疗'].replace({'标准': 0, '新药': 1})
# Rebuild a clean 0..n-1 index and drop the old one
tmp = tmp.reset_index()
del tmp['index']
# Autoregressive structure is created here but not used in the visible code
va = sm.cov_struct.Autoregressive()
fam = sm.families.Binomial()
ind = sm.cov_struct.Independence()
# Matches the results in the book
# Binomial GEE of value on severity, treatment, week and the treatment-by-week interaction, grouped by subject
mod = smf.gee("值 ~ 诊断严重程度 + 治疗 + 周数+治疗:周数",
              "组",
              tmp,
              cov_struct=ind,
              family=fam)
res = mod.fit()
res.summary()

# 2. Multinomial GEE -- probably NominalGEE
# 3. Ordinal outcome -- OrdinalGEE
Exemplo n.º 28
0
    def add_covariate_model(self,
                            label,
                            covariate,
                            model,
                            restriction=None,
                            recode=None,
                            var_type='binary',
                            print_results=True):
        """
        Build the model for the specified covariate. This is to deal with time-varying confounders.
        Does NOT have to be specified, unlike the exposure and outcome models. The order in which these
        models are fit is based on the provided integer labels

        The model is fit when this function is called. Unweighted: binomial GLM for a binary covariate, GLS for
        a continuous covariate. Weighted: GEE grouped by the id variable with a binomial or Gaussian family.
        Families use their default links (logit for binomial, identity for Gaussian)

        Input:

        label:
            -integer label for the covariate model. Covariate models are fit in ascending order within
             TimeVaryGFormula. A ValueError is raised if this is not exactly an int
        covariate:
            -variable to be predicted
        model:
            -variables to include in the model for predicting the covariate. Must be contained within the input
             pandas dataframe when initialized. Format is the same as the functional form,
             i.e. 'var1 + var2 + var3 + var4'
        restriction:
            -used to restrict the population to fit the regression model to. Useful for Intent-to-Treat
             model fitting. The pandas dataframe must be referred to as 'g'
             Example) "g['art']==1"
             The string is passed to eval(), so it must never come from an untrusted source
        recode:
            -This variable is vitally important for various functional forms implemented later in models. This
             is used to run some background code to recreate functional forms as the g-formula is fit via fit()
             For an example, let's say we have age but we want the functional form to be cubic. For this, we
             would set the recode="g['age_cu'] = g['age']**3;" Similar to TimeFixedGFormula, 'g' must be
             specified as the data frame object with the corresponding indexes. Also lines of executable code
             should end with ';', so Python knows that the line ends there. The string is stored here and
             executed later
        var_type:
            -type of variable that the covariate is. Current options include 'binary' or 'continuous'. Any
             other value raises a ValueError
        print_results:
            -whether to print the fitted regression results to the terminal. Default is True
        """
        if type(label) is not int:
            raise ValueError('Label must be an integer')

        # Building predictive model
        # Keep the name 'g': the restriction expression is evaluated against it
        g = self.gf.copy()
        if restriction is not None:
            g = g.loc[eval(restriction)].copy()

        binary = var_type == 'binary'
        if not binary and var_type != 'continuous':
            raise ValueError(
                'Only binary or continuous covariates are currently supported'
            )

        equation = covariate + ' ~ ' + model
        if self._weights is None:  # Unweighted g-formula
            if binary:
                # Binomial() defaults to the logit link
                m = smf.glm(equation, g, family=sm.families.family.Binomial())
            else:
                # GLS has no family argument, so no family object is built here
                m = smf.gls(equation, g)
        else:  # Weighted g-formula
            # Default links are used so that no link class has to be passed to the family constructor
            if binary:
                linkdist = sm.families.family.Binomial()
            else:
                linkdist = sm.families.family.Gaussian()
            m = smf.gee(equation,
                        self.idvar,
                        g,
                        weights=g[self._weights],
                        family=linkdist)

        f = m.fit()
        if print_results:
            print(f.summary())

        # Adding to lists, it is used to predict variables later on for the time-varying...
        self._covariate_models.append(f)
        self._covariate_model_index.append(label)
        self._covariate.append(covariate)
        self._covariate_type.append(var_type)
        if recode is None:
            self._covariate_recode.append(
                'None')  # Must be string for exec() to use later
        else:
            self._covariate_recode.append(recode)