Пример #1
0
    def test_default_time(self):
        """
        Check that the time defaults work correctly.
        """

        endog, exog, group = load_data("gee_logistic_1.csv")

        # Time values for the autoregressive model
        T = np.zeros(len(endog))
        idx = set(group)
        for ii in idx:
            jj = np.flatnonzero(group == ii)
            T[jj] = lrange(len(jj))

        family = Binomial()
        va = Autoregressive()

        md1 = GEE(endog, exog, group, family=family, cov_struct=va)
        mdf1 = md1.fit()

        md2 = GEE(endog, exog, group, time=T, family=family, cov_struct=va)
        mdf2 = md2.fit()

        assert_almost_equal(mdf1.params, mdf2.params, decimal=6)
        assert_almost_equal(mdf1.standard_errors(),
                            mdf2.standard_errors(),
                            decimal=6)
Пример #2
0
    def test_formulas(self):
        """
        Check formulas, especially passing groups and time as either
        variable names or arrays.
        """

        n = 100
        Y = np.random.normal(size=n)
        X1 = np.random.normal(size=n)
        mat = np.concatenate((np.ones((n, 1)), X1[:, None]), axis=1)
        Time = np.random.uniform(size=n)
        groups = np.kron(lrange(20), np.ones(5))

        data = pd.DataFrame({"Y": Y, "X1": X1, "Time": Time, "groups": groups})

        va = Autoregressive()
        family = Gaussian()

        mod1 = GEE(Y, mat, groups, time=Time, family=family, cov_struct=va)
        rslt1 = mod1.fit()

        mod2 = GEE.from_formula("Y ~ X1",
                                groups,
                                data,
                                time=Time,
                                family=family,
                                cov_struct=va)
        rslt2 = mod2.fit()

        mod3 = GEE.from_formula("Y ~ X1",
                                groups,
                                data,
                                time="Time",
                                family=family,
                                cov_struct=va)
        rslt3 = mod3.fit()

        mod4 = GEE.from_formula("Y ~ X1",
                                "groups",
                                data,
                                time=Time,
                                family=family,
                                cov_struct=va)
        rslt4 = mod4.fit()

        mod5 = GEE.from_formula("Y ~ X1",
                                "groups",
                                data,
                                time="Time",
                                family=family,
                                cov_struct=va)
        rslt5 = mod5.fit()

        assert_almost_equal(rslt1.params, rslt2.params, decimal=8)
        assert_almost_equal(rslt1.params, rslt3.params, decimal=8)
        assert_almost_equal(rslt1.params, rslt4.params, decimal=8)
        assert_almost_equal(rslt1.params, rslt5.params, decimal=8)

        check_wrapper(rslt2)
Пример #3
0
 def gendat_ar1():
     ars = AR_simulator()
     ars.ngroups = 200
     ars.params = np.r_[0, -0.8, 1.2, 0, 0.5]
     ars.error_sd = 2
     ars.dparams = [ar,]
     ars.simulate()
     return ars, Autoregressive()
Пример #4
0
 def gendat_ar0(msg = False):
     ars = AR_simulator()
     ars.ngroups = 200
     ars.params = np.r_[0, -1, 1, 0, 0.5]
     ars.error_sd = 2
     ars.dparams = [ar,]
     ars.simulate()
     return ars, Autoregressive()
Пример #5
0
    def test_autoregressive(self):

        dep_params_true = [0, 0.589208623896, 0.559823804948]

        params_true = [[1.08043787, 1.12709319, 0.90133927],
                       [0.9613677, 1.05826987, 0.90832055],
                       [1.05370439, 0.96084864, 0.93923374]]

        np.random.seed(342837482)

        num_group = 100
        ar_param = 0.5
        k = 3

        ga = Gaussian()

        for gsize in 1, 2, 3:

            ix = np.arange(gsize)[:, None] - np.arange(gsize)[None, :]
            ix = np.abs(ix)
            cmat = ar_param**ix
            cmat_r = np.linalg.cholesky(cmat)

            endog = []
            exog = []
            groups = []
            for i in range(num_group):
                x = np.random.normal(size=(gsize, k))
                exog.append(x)
                expval = x.sum(1)
                errors = np.dot(cmat_r, np.random.normal(size=gsize))
                endog.append(expval + errors)
                groups.append(i * np.ones(gsize))

            endog = np.concatenate(endog)
            groups = np.concatenate(groups)
            exog = np.concatenate(exog, axis=0)

            ar = Autoregressive()
            md = GEE(endog, exog, groups, family=ga, cov_struct=ar)
            mdf = md.fit()
            assert_almost_equal(ar.dep_params, dep_params_true[gsize - 1])
            assert_almost_equal(mdf.params, params_true[gsize - 1])
Пример #6
0
    def test_logistic(self):
        """
        R code for comparing results:

        library(gee)
        Z = read.csv("results/gee_logistic_1.csv", header=FALSE)
        Y = Z[,2]
        Id = Z[,1]
        X1 = Z[,3]
        X2 = Z[,4]
        X3 = Z[,5]

        mi = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
                 corstr="independence")
        smi = summary(mi)
        u = coefficients(smi)
        cfi = paste(u[,1], collapse=",")
        sei = paste(u[,4], collapse=",")

        me = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
                 corstr="exchangeable")
        sme = summary(me)
        u = coefficients(sme)
        cfe = paste(u[,1], collapse=",")
        see = paste(u[,4], collapse=",")

        ma = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
                 corstr="AR-M")
        sma = summary(ma)
        u = coefficients(sma)
        cfa = paste(u[,1], collapse=",")
        sea = paste(u[,4], collapse=",")

        sprintf("cf = [[%s],[%s],[%s]]", cfi, cfe, cfa)
        sprintf("se = [[%s],[%s],[%s]]", sei, see, sea)
        """

        endog, exog, group = load_data("gee_logistic_1.csv")

        # Time values for the autoregressive model
        T = np.zeros(len(endog))
        idx = set(group)
        for ii in idx:
            jj = np.flatnonzero(group == ii)
            T[jj] = lrange(len(jj))

        family = Binomial()
        ve = Exchangeable()
        vi = Independence()
        va = Autoregressive()

        # From R gee
        cf = [[
            0.0167272965285882, 1.13038654425893, -1.86896345082962,
            1.09397608331333
        ],
              [
                  0.0178982283915449, 1.13118798191788, -1.86133518416017,
                  1.08944256230299
              ],
              [
                  0.0109621937947958, 1.13226505028438, -1.88278757333046,
                  1.09954623769449
              ]]
        se = [[
            0.127291720283049, 0.166725808326067, 0.192430061340865,
            0.173141068839597
        ],
              [
                  0.127045031730155, 0.165470678232842, 0.192052750030501,
                  0.173174779369249
              ],
              [
                  0.127240302296444, 0.170554083928117, 0.191045527104503,
                  0.169776150974586
              ]]

        for j, v in enumerate((vi, ve, va)):
            md = GEE(endog, exog, group, T, family, v)
            mdf = md.fit()
            if id(v) != id(va):
                assert_almost_equal(mdf.params, cf[j], decimal=6)
                assert_almost_equal(mdf.standard_errors(), se[j], decimal=6)

        # Test with formulas
        D = np.concatenate((endog[:, None], group[:, None], exog[:, 1:]),
                           axis=1)
        D = pd.DataFrame(D)
        D.columns = [
            "Y",
            "Id",
        ] + ["X%d" % (k + 1) for k in range(exog.shape[1] - 1)]
        for j, v in enumerate((vi, ve)):
            md = GEE.from_formula("Y ~ X1 + X2 + X3",
                                  "Id",
                                  D,
                                  family=family,
                                  cov_struct=v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=6)
            assert_almost_equal(mdf.standard_errors(), se[j], decimal=6)
Пример #7
0
# model comparison with likelihood ratio test
LR = 2 * (model_panel2_results.llf - model_panel1_results.llf)
p = chi2.sf(LR, 2) 
print('p: %.30f' % p) 

# provides a summary of the number of zeros
print(US_cases_long_demogr_week['cases_count_pos'].describe())
print(US_cases_long_demogr_week['cases_count_pos'].value_counts())
count_total = sum(US_cases_long_demogr_week['cases_count_pos'].value_counts().to_dict().values())
count_zero = US_cases_long_demogr_week['cases_count_pos'].value_counts()[0.0]
print("Count of zero is {}, about {:.4f} of the data.".format(count_zero, count_zero / count_total ))

# Approach one to generalized linear models for panel data: Generalized Estimating Equations
# poisson model
poi=Poisson()
ar=Autoregressive()
gee_model0 = GEE.from_formula("cases_count_pos ~ week_of_year + percent_age65over + percent_female + percent_black", groups="state", \
    data=US_cases_long_demogr_week, time='week_of_year', cov_struct=ar, family=poi, offset=np.log(np.asarray(US_cases_long_demogr_week["pop_count_2019"])))
gee_model0_results = gee_model0.fit(maxiter=200)
print(gee_model0_results.summary())
print(ar.summary())
print("scale=%.2f" % (gee_model0_results.scale))

# There is warning -- "IterationLimitWarning: Iteration limit reached prior to convergence" even if I specify maxiter = 2000. So, in this case,
# specific starting values are needed to get the estimating algorithm to converge.
# First run with exchangeable dependence structure. We know from this model that the within-state correlation is roughly 0.077.
fam = Poisson()
ex = Exchangeable()
ex_model = GEE.from_formula("cases_count_pos ~ week_of_year + percent_age65over + percent_female + percent_black", groups="state", \
    data=US_cases_long_demogr_week, cov_struct=ex, family=fam, offset=np.log(np.asarray(US_cases_long_demogr_week["pop_count_2019"])))
ex_results = ex_model.fit()