예제 #1
0
    def test_missing(self):
        """
        Test missing data handling for calling from the api.  Missing
        data handling does not currently work for formulas.

        A model built with ``missing='drop'`` on data containing NaNs
        is compared with a model built on the same data after the
        incomplete rows have been removed by hand.
        """

        # Seed the generator so the simulated data, and therefore the
        # estimates being compared, are the same on every run.
        np.random.seed(34234)

        endog = np.random.normal(size=100)
        exog = np.random.normal(size=(100, 3))
        exog[:, 0] = 1
        groups = np.kron(lrange(20), np.ones(5))

        # Five rows get a missing value: rows 0, 5 and 6 through endog
        # and rows 10 and 11 through exog.
        endog[0] = np.nan
        endog[5:7] = np.nan
        exog[10:12, 1] = np.nan

        mod1 = GEE(endog, exog, groups, missing='drop')
        rslt1 = mod1.fit()

        # Sizes are integers, so they are compared exactly.
        assert len(mod1.endog) == 95
        assert mod1.exog.shape == (95, 3)

        # Rows that are complete in both endog and exog.
        ii = np.isfinite(endog) & np.isfinite(exog).all(1)

        mod2 = GEE(endog[ii], exog[ii, :], groups[ii], missing='none')
        rslt2 = mod2.fit()

        assert_almost_equal(rslt1.params, rslt2.params)
        assert_almost_equal(rslt1.bse, rslt2.bse)
예제 #2
0
    def test_nested_linear(self):
        # Fit a Gaussian GEE to the nested linear data set, first with
        # an Independence structure and then with a Nested structure
        # started from the first fit, comparing both with stored values.

        endog, exog, group = load_data("gee_nested_linear_1.csv")
        gaussian = Gaussian()

        # Labels passed as `dep_data`: five 0s followed by five 1s for
        # every ten observations, arranged as a single column.
        one_block = [0] * 5 + [1] * 5
        n_blocks = endog.shape[0] // 10
        group_n = np.array(one_block * n_blocks)[:, None]

        indep_model = GEE(endog, exog, group, None, gaussian, Independence())
        indep_fit = indep_model.fit()

        # From statsmodels.GEE (not an independent test)
        assert_almost_equal(
            indep_fit.params,
            np.r_[-0.1671073, 1.00467426, -2.01723004, 0.97297106],
            decimal=6)
        assert_almost_equal(
            indep_fit.standard_errors(),
            np.r_[0.08629606, 0.04058653, 0.04067038, 0.03777989],
            decimal=6)

        nested_model = GEE(endog, exog, group, None, gaussian, Nested(),
                           dep_data=group_n)
        nested_fit = nested_model.fit(start_params=indep_fit.params)

        # From statsmodels.GEE (not an independent test)
        assert_almost_equal(
            nested_fit.params,
            np.r_[-0.16655319, 1.02183688, -2.00858719, 1.00101969],
            decimal=6)
        assert_almost_equal(
            nested_fit.standard_errors(),
            np.r_[0.08632616, 0.02913582, 0.03114428, 0.02893991],
            decimal=6)
예제 #3
0
    def test_default_time(self):
        """
        Check that the time defaults work correctly.

        A model built without a `time` argument is compared with one
        that is given each observation's position within its group.
        """

        endog, exog, group = load_data("gee_logistic_1.csv")

        binomial = Binomial()
        ar_struct = Autoregressive()

        # Time values for the autoregressive model: 0, 1, 2, ... in
        # row order within every group.
        times = np.zeros(len(endog))
        for label in set(group):
            rows = np.flatnonzero(group == label)
            times[rows] = np.arange(len(rows))

        default_fit = GEE(endog, exog, group, family=binomial,
                          cov_struct=ar_struct).fit()

        explicit_fit = GEE(endog, exog, group, time=times, family=binomial,
                           cov_struct=ar_struct).fit()

        assert_almost_equal(default_fit.params, explicit_fit.params,
                            decimal=6)
        assert_almost_equal(default_fit.standard_errors(),
                            explicit_fit.standard_errors(),
                            decimal=6)
예제 #4
0
    def test_missing(self):
        # Test missing data handling for calling from the api.  Missing
        # data handling does not currently work for formulas.
        #
        # A model built with missing='drop' on data containing NaNs is
        # compared with a model built on the same data after the
        # incomplete rows have been removed by hand.

        # Seed the generator so the simulated data, and therefore the
        # estimates being compared, are the same on every run.
        np.random.seed(34234)

        endog = np.random.normal(size=100)
        exog = np.random.normal(size=(100, 3))
        exog[:, 0] = 1
        groups = np.kron(lrange(20), np.ones(5))

        # Five rows get a missing value: rows 0, 5 and 6 through endog
        # and rows 10 and 11 through exog.
        endog[0] = np.nan
        endog[5:7] = np.nan
        exog[10:12, 1] = np.nan

        mod1 = GEE(endog, exog, groups, missing='drop')
        rslt1 = mod1.fit()

        # Sizes are integers, so they are compared exactly.
        assert len(mod1.endog) == 95
        assert mod1.exog.shape == (95, 3)

        # Rows that are complete in both endog and exog.
        ii = np.isfinite(endog) & np.isfinite(exog).all(1)

        mod2 = GEE(endog[ii], exog[ii, :], groups[ii], missing='none')
        rslt2 = mod2.fit()

        assert_almost_equal(rslt1.params, rslt2.params)
        assert_almost_equal(rslt1.bse, rslt2.bse)
예제 #5
0
    def test_nominal(self):
        # Fit a GEE with a Multinomial(3) family to the nominal data
        # set under two dependence structures and compare estimates
        # and standard errors with stored values.

        endog, exog, groups = load_data("gee_nominal_1.csv", icept=False)
        multinomial = Multinomial(3)

        def fit_nominal(dependence, **fit_options):
            # Build the model, apply the nominal setup, then fit.
            model = GEE(endog, exog, groups, None, multinomial, dependence)
            model.setup_nominal()
            return model.fit(**fit_options)

        def check(fit, params, std_errors):
            # Compare a fit with stored values to five decimals.
            assert_almost_equal(fit.params, params, decimal=5)
            assert_almost_equal(fit.standard_errors(), std_errors,
                                decimal=5)

        # Test with independence correlation
        indep_fit = fit_nominal(Independence())

        # From statsmodels.GEE (not an independent test)
        check(indep_fit,
              np.r_[0.44944752, 0.45569985, -0.92007064, -0.46766728],
              np.r_[0.09801821, 0.07718842, 0.13229421, 0.08544553])

        # Test with global odds ratio dependence
        odds_fit = fit_nominal(GlobalOddsRatio("nominal"),
                               start_params=indep_fit.params)

        # From statsmodels.GEE (not an independent test)
        check(odds_fit,
              np.r_[0.45397549, 0.42278345, -0.91997131, -0.50115943],
              np.r_[0.09646057, 0.07405713, 0.1324629, 0.09025019])
예제 #6
0
    def test_nominal(self):
        # Fit a GEE with a Multinomial(3) family to the nominal data
        # set under two dependence structures and compare estimates
        # and standard errors with stored values.

        endog, exog, groups = load_data("gee_nominal_1.csv", icept=False)
        multinomial = Multinomial(3)

        def fit_nominal(dependence, **fit_options):
            # Build the model, apply the nominal setup, then fit.
            model = GEE(endog, exog, groups, None, multinomial, dependence)
            model.setup_nominal()
            return model.fit(**fit_options)

        def check(fit, params, std_errors):
            # Compare a fit with stored values to five decimals.
            assert_almost_equal(fit.params, params, decimal=5)
            assert_almost_equal(fit.standard_errors(), std_errors,
                                decimal=5)

        # Test with independence correlation
        indep_fit = fit_nominal(Independence())

        # From statsmodels.GEE (not an independent test)
        check(indep_fit,
              np.r_[0.44944752, 0.45569985, -0.92007064, -0.46766728],
              np.r_[0.09801821, 0.07718842, 0.13229421, 0.08544553])

        # Test with global odds ratio dependence
        odds_fit = fit_nominal(GlobalOddsRatio("nominal"),
                               start_params=indep_fit.params)

        # From statsmodels.GEE (not an independent test)
        check(odds_fit,
              np.r_[0.45397549, 0.42278345, -0.91997131, -0.50115943],
              np.r_[0.09646057, 0.07405713, 0.1324629, 0.09025019])
예제 #7
0
    def test_nested_linear(self):
        # Fit a Gaussian GEE to the nested linear data set, first with
        # an Independence structure and then with a Nested structure
        # started from the first fit, comparing both with stored values.

        endog, exog, group = load_data("gee_nested_linear_1.csv")
        gaussian = Gaussian()

        # Labels passed as `dep_data`: five 0s followed by five 1s for
        # every ten observations, arranged as a single column.
        one_block = [0] * 5 + [1] * 5
        n_blocks = endog.shape[0] // 10
        group_n = np.array(one_block * n_blocks)[:, None]

        indep_model = GEE(endog, exog, group, None, gaussian, Independence())
        indep_fit = indep_model.fit()

        # From statsmodels.GEE (not an independent test)
        assert_almost_equal(
            indep_fit.params,
            np.r_[-0.1671073, 1.00467426, -2.01723004, 0.97297106],
            decimal=6)
        assert_almost_equal(
            indep_fit.standard_errors(),
            np.r_[0.08629606, 0.04058653, 0.04067038, 0.03777989],
            decimal=6)

        nested_model = GEE(endog, exog, group, None, gaussian, Nested(),
                           dep_data=group_n)
        nested_fit = nested_model.fit(start_params=indep_fit.params)

        # From statsmodels.GEE (not an independent test)
        assert_almost_equal(
            nested_fit.params,
            np.r_[-0.16655319, 1.02183688, -2.00858719, 1.00101969],
            decimal=6)
        assert_almost_equal(
            nested_fit.standard_errors(),
            np.r_[0.08632616, 0.02913582, 0.03114428, 0.02893991],
            decimal=6)
예제 #8
0
    def test_default_time(self):
        # Check that the time defaults work correctly: a model built
        # without a `time` argument is compared with one that is given
        # each observation's position within its group.

        endog, exog, group = load_data("gee_logistic_1.csv")

        binomial = Binomial()
        ar_struct = Autoregressive()

        # Time values for the autoregressive model: 0, 1, 2, ... in
        # row order within every group.
        times = np.zeros(len(endog))
        for label in set(group):
            rows = np.flatnonzero(group == label)
            times[rows] = np.arange(len(rows))

        default_fit = GEE(endog, exog, group, family=binomial,
                          cov_struct=ar_struct).fit()

        explicit_fit = GEE(endog, exog, group, time=times, family=binomial,
                           cov_struct=ar_struct).fit()

        assert_almost_equal(default_fit.params, explicit_fit.params,
                            decimal=6)
        assert_almost_equal(default_fit.standard_errors(),
                            explicit_fit.standard_errors(),
                            decimal=6)
예제 #9
0
    def test_scoretest(self):
        # Regression tests for the score test results of constrained
        # Gaussian/Independence fits, followed by a comparison of score
        # test p-values with Wald p-values.

        np.random.seed(6432)
        n = 200  # Must be divisible by 4
        exog = np.random.normal(size=(n, 4))
        signal = exog[:, 0] + exog[:, 1] + exog[:, 2]
        endog = signal + 3*np.random.normal(size=n)
        group = np.kron(np.arange(n/4), np.ones(4))

        # Each case: constraint right-hand side, then the stored score
        # statistic and p-value.
        cases = (
            (0., 1.08126334, 0.2984151086),     # Test under the null.
            (1.0, 3.491110965, 0.0616991659),   # Test under the alternative.
        )
        for rhs, statistic, pvalue in cases:
            lhs = np.array([[1., -1, 0, 0]])
            model = GEE(endog, exog, group, family=Gaussian(),
                        cov_struct=Independence(),
                        constraint=(lhs, np.array([rhs, ])))
            # score_test_results is read from the model after fitting.
            model.fit()
            assert_almost_equal(model.score_test_results["statistic"],
                                statistic)
            assert_almost_equal(model.score_test_results["p-value"],
                                pvalue)

        # Compare to Wald tests
        exog = np.random.normal(size=(n, 2))
        L = np.array([[1, -1]])
        R = np.array([0.])
        contrast = np.r_[1, -1]
        for step in range(10):
            slope = 0.5 + step/10.
            endog = exog[:, 0] + slope*exog[:, 1] + np.random.normal(size=n)

            free_fit = GEE(endog, exog, group, family=Gaussian(),
                           cov_struct=Independence()).fit()

            constrained = GEE(endog, exog, group, family=Gaussian(),
                              cov_struct=Independence(), constraint=(L, R))
            constrained.fit()

            # Wald z-statistic for the contrast from the free fit.
            variance = np.dot(contrast,
                              np.dot(free_fit.cov_params(), contrast))
            wald_z = np.dot(contrast, free_fit.params) / np.sqrt(variance)
            wald_p = 2*norm.cdf(-np.abs(wald_z))
            score_p = constrained.score_test_results["p-value"]
            assert_array_less(np.abs(wald_p - score_p), 0.02)
예제 #10
0
    def test_scoretest(self):
        # Regression tests for the score test results of constrained
        # Gaussian/Independence fits, followed by a comparison of score
        # test p-values with Wald p-values.

        np.random.seed(6432)
        n = 200  # Must be divisible by 4
        exog = np.random.normal(size=(n, 4))
        signal = exog[:, 0] + exog[:, 1] + exog[:, 2]
        endog = signal + 3*np.random.normal(size=n)
        group = np.kron(np.arange(n/4), np.ones(4))

        # Each case: constraint right-hand side, then the stored score
        # statistic and p-value.
        cases = (
            (0., 1.08126334, 0.2984151086),     # Test under the null.
            (1.0, 3.491110965, 0.0616991659),   # Test under the alternative.
        )
        for rhs, statistic, pvalue in cases:
            lhs = np.array([[1., -1, 0, 0]])
            model = GEE(endog, exog, group, family=Gaussian(),
                        cov_struct=Independence(),
                        constraint=(lhs, np.array([rhs, ])))
            # score_test_results is read from the model after fitting.
            model.fit()
            assert_almost_equal(model.score_test_results["statistic"],
                                statistic)
            assert_almost_equal(model.score_test_results["p-value"],
                                pvalue)

        # Compare to Wald tests
        exog = np.random.normal(size=(n, 2))
        L = np.array([[1, -1]])
        R = np.array([0.])
        contrast = np.r_[1, -1]
        for step in range(10):
            slope = 0.5 + step/10.
            endog = exog[:, 0] + slope*exog[:, 1] + np.random.normal(size=n)

            free_fit = GEE(endog, exog, group, family=Gaussian(),
                           cov_struct=Independence()).fit()

            constrained = GEE(endog, exog, group, family=Gaussian(),
                              cov_struct=Independence(), constraint=(L, R))
            constrained.fit()

            # Wald z-statistic for the contrast from the free fit.
            variance = np.dot(contrast,
                              np.dot(free_fit.cov_params(), contrast))
            wald_z = np.dot(contrast, free_fit.params) / np.sqrt(variance)
            wald_p = 2*norm.cdf(-np.abs(wald_z))
            score_p = constrained.score_test_results["p-value"]
            assert_array_less(np.abs(wald_p - score_p), 0.02)
예제 #11
0
    def test_ordinal(self):
        # Fit a Binomial GEE with an ordinal GlobalOddsRatio structure
        # to the recoded ordinal data set and compare the estimates
        # and standard errors with stored values.

        endog_orig, exog_orig, groups = load_data("gee_ordinal_1.csv",
                                                  icept=False)

        # One array with endog in the first column, the covariates in
        # the middle and the group labels in the last column.
        data = np.column_stack((endog_orig, exog_orig, groups))

        # Recode as cumulative indicators
        endog, exog, intercepts, nlevel = gee_setup_ordinal(data, 0)

        # The last column of the combined array is used as the group
        # labels; the remaining columns form the design matrix.
        combined = np.concatenate((intercepts, exog), axis=1)
        groups = combined[:, -1]
        exog1 = combined[:, :-1]

        start = gee_ordinal_starting_values(endog_orig,
                                            exog_orig.shape[1])

        model = GEE(endog, exog1, groups, None, Binomial(),
                    GlobalOddsRatio(nlevel, "ordinal"))
        fit = model.fit(start_params=start)

        expected_params = np.r_[
            1.09238131, 0.02148193, -0.39879146, -0.01855666,
            0.02983409, 1.18123172, 0.01845318, -1.10233886]
        expected_bse = np.r_[
            0.10878752, 0.10326078, 0.11171241, 0.05488705,
            0.05995019, 0.0916574, 0.05951445, 0.08539281]

        assert_almost_equal(fit.params, expected_params, decimal=5)
        assert_almost_equal(fit.bse, expected_bse, decimal=5)
예제 #12
0
    def test_ordinal_pandas(self):
        # Same ordinal fit as the array-based test, but the data are
        # handed to gee_setup_ordinal as a DataFrame and the response
        # is selected by column name.

        endog_orig, exog_orig, groups = load_data("gee_ordinal_1.csv",
                                                  icept=False)

        # endog in the first column, covariates in the middle and the
        # group labels last.
        stacked = np.column_stack((endog_orig, exog_orig, groups))
        data = pd.DataFrame(
            stacked,
            columns=["endog", "x1", "x2", "x3", "x4", "x5", "group"])

        # Recode as cumulative indicators
        endog, exog, intercepts, nlevel = gee_setup_ordinal(data, "endog")

        # The last column of the combined array is used as the group
        # labels; the remaining columns form the design matrix.
        combined = np.concatenate((intercepts, exog), axis=1)
        groups = combined[:, -1]
        exog1 = combined[:, :-1]

        start = gee_ordinal_starting_values(endog_orig, exog_orig.shape[1])

        model = GEE(endog, exog1, groups, None, Binomial(),
                    GlobalOddsRatio(nlevel, "ordinal"))
        fit = model.fit(start_params=start)

        expected_params = np.r_[
            1.09238131, 0.02148193, -0.39879146, -0.01855666,
            0.02983409, 1.18123172, 0.01845318, -1.10233886]
        expected_bse = np.r_[
            0.10878752, 0.10326078, 0.11171241, 0.05488705,
            0.05995019, 0.0916574, 0.05951445, 0.08539281]

        assert_almost_equal(fit.params, expected_params, decimal=2)
        assert_almost_equal(fit.bse, expected_bse, decimal=2)
예제 #13
0
    def test_formulas(self):
        """
        Check formulas, especially passing groups and time as either
        variable names or arrays.

        Every formula-based fit is compared with a fit built directly
        from the arrays.
        """

        n = 100
        Y = np.random.normal(size=n)
        X1 = np.random.normal(size=n)
        Time = np.random.uniform(size=n)
        groups = np.kron(lrange(20), np.ones(5))

        # Design matrix for the array-based model: a column of ones
        # followed by X1.
        mat = np.concatenate((np.ones((n, 1)), X1[:, None]), axis=1)

        data = pd.DataFrame({"Y": Y, "X1": X1, "Time": Time, "groups": groups})

        va = Autoregressive()
        family = Gaussian()

        array_fit = GEE(Y, mat, groups, time=Time, family=family,
                        cov_struct=va).fit()

        # groups and time given as arrays or as column names, in all
        # four combinations.
        combinations = [
            (groups, Time),
            (groups, "Time"),
            ("groups", Time),
            ("groups", "Time"),
        ]
        formula_fits = [
            GEE.from_formula("Y ~ X1", groups_arg, data, time=time_arg,
                             family=family, cov_struct=va).fit()
            for groups_arg, time_arg in combinations
        ]

        for formula_fit in formula_fits:
            assert_almost_equal(array_fit.params, formula_fit.params,
                                decimal=8)

        # The wrapper check uses the fit with both arguments as arrays.
        check_wrapper(formula_fits[0])
예제 #14
0
def test_missing():
    # gh-1877
    # A formula-based model built from data containing missing values
    # is compared with a model built directly from the rows that are
    # left after dropping the missing values by hand.
    data = [['id', 'al', 'status', 'fake', 'grps'],
            ['4A', 'A', 1, 1, 0],
            ['5A', 'A', 1, 2.0, 1],
            ['6A', 'A', 1, 3, 2],
            ['7A', 'A', 1, 2.0, 3],
            ['8A', 'A', 1, 1, 4],
            ['9A', 'A', 1, 2.0, 5],
            ['11A', 'A', 1, 1, 6],
            ['12A', 'A', 1, 2.0, 7],
            ['13A', 'A', 1, 1, 8],
            ['14A', 'A', 1, 1, 9],
            ['15A', 'A', 1, 1, 10],
            ['16A', 'A', 1, 2.0, 11],
            ['17A', 'A', 1, 3.0, 12],
            ['18A', 'A', 1, 3.0, 13],
            ['19A', 'A', 1, 2.0, 14],
            ['20A', 'A', 1, 2.0, 15],
            ['2C', 'C', 0, 3.0, 0],
            ['3C', 'C', 0, 1, 1],
            ['4C', 'C', 0, 1, 2],
            ['5C', 'C', 0, 2.0, 3],
            ['6C', 'C', 0, 1, 4],
            ['9C', 'C', 0, 1, 5],
            ['10C', 'C', 0, 3, 6],
            ['12C', 'C', 0, 3, 7],
            ['14C', 'C', 0, 2.5, 8],
            ['15C', 'C', 0, 1, 9],
            ['17C', 'C', 0, 1, 10],
            ['22C', 'C', 0, 1, 11],
            ['23C', 'C', 0, 1, 12],
            ['24C', 'C', 0, 1, 13],
            ['32C', 'C', 0, 2.0, 14],
            ['35C', 'C', 0, 1, 15]]

    # The first row of `data` holds the column names.
    df = pd.DataFrame(data[1:], columns=data[0])

    # Mark every `fake` value equal to 1 as missing.  `.loc` is used
    # because `DataFrame.ix` is no longer available in current pandas.
    df.loc[df.fake == 1, 'fake'] = np.nan
    mod = smf.gee('status ~ fake', data=df, groups='grps',
                  cov_struct=sm.cov_struct.Independence(),
                  family=sm.families.Binomial())

    # Take an explicit copy so the column added below is written to an
    # independent frame rather than to the result of dropna().
    df = df.dropna().copy()
    df['constant'] = 1

    mod2 = GEE(df.status, df[['constant', 'fake']], groups=df.grps,
               cov_struct=sm.cov_struct.Independence(),
               family=sm.families.Binomial())

    assert_equal(mod.endog, mod2.endog)
    assert_equal(mod.exog, mod2.exog)
    assert_equal(mod.groups, mod2.groups)

    res = mod.fit()
    res2 = mod2.fit()

    assert_almost_equal(res.params.values, res2.params.values)
예제 #15
0
def test_missing():
    # gh-1877
    # A formula-based model built from data containing missing values
    # is compared with a model built directly from the rows that are
    # left after dropping the missing values by hand.
    data = [['id', 'al', 'status', 'fake', 'grps'],
            ['4A', 'A', 1, 1, 0],
            ['5A', 'A', 1, 2.0, 1],
            ['6A', 'A', 1, 3, 2],
            ['7A', 'A', 1, 2.0, 3],
            ['8A', 'A', 1, 1, 4],
            ['9A', 'A', 1, 2.0, 5],
            ['11A', 'A', 1, 1, 6],
            ['12A', 'A', 1, 2.0, 7],
            ['13A', 'A', 1, 1, 8],
            ['14A', 'A', 1, 1, 9],
            ['15A', 'A', 1, 1, 10],
            ['16A', 'A', 1, 2.0, 11],
            ['17A', 'A', 1, 3.0, 12],
            ['18A', 'A', 1, 3.0, 13],
            ['19A', 'A', 1, 2.0, 14],
            ['20A', 'A', 1, 2.0, 15],
            ['2C', 'C', 0, 3.0, 0],
            ['3C', 'C', 0, 1, 1],
            ['4C', 'C', 0, 1, 2],
            ['5C', 'C', 0, 2.0, 3],
            ['6C', 'C', 0, 1, 4],
            ['9C', 'C', 0, 1, 5],
            ['10C', 'C', 0, 3, 6],
            ['12C', 'C', 0, 3, 7],
            ['14C', 'C', 0, 2.5, 8],
            ['15C', 'C', 0, 1, 9],
            ['17C', 'C', 0, 1, 10],
            ['22C', 'C', 0, 1, 11],
            ['23C', 'C', 0, 1, 12],
            ['24C', 'C', 0, 1, 13],
            ['32C', 'C', 0, 2.0, 14],
            ['35C', 'C', 0, 1, 15]]

    # The first row of `data` holds the column names.
    df = pd.DataFrame(data[1:], columns=data[0])

    # Mark every `fake` value equal to 1 as missing.  `.loc` is used
    # because `DataFrame.ix` is no longer available in current pandas.
    df.loc[df.fake == 1, 'fake'] = np.nan
    mod = smf.gee('status ~ fake', data=df, groups='grps',
                  cov_struct=sm.cov_struct.Independence(),
                  family=sm.families.Binomial())

    # Take an explicit copy so the column added below is written to an
    # independent frame rather than to the result of dropna().
    df = df.dropna().copy()
    df['constant'] = 1

    mod2 = GEE(df.status, df[['constant', 'fake']], groups=df.grps,
               cov_struct=sm.cov_struct.Independence(),
               family=sm.families.Binomial())

    assert_equal(mod.endog, mod2.endog)
    assert_equal(mod.exog, mod2.exog)
    assert_equal(mod.groups, mod2.groups)

    res = mod.fit()
    res2 = mod2.fit()

    assert_almost_equal(res.params.values, res2.params.values)
예제 #16
0
    def test_weighted(self):
        # Checks of the `weights` argument: a hand-computable case, a
        # comparison with Stata values, and a smoke test.

        # Simple check where the answer can be computed by hand: the
        # first ten observations have response 1 and weight 2, the
        # last ten have response 0 and weight 1.
        endog = np.r_[np.ones(10), np.zeros(10)]
        weights = np.r_[2 * np.ones(10), np.ones(10)]
        exog = np.ones(20)
        groups = np.repeat(np.arange(10), 2)
        by_hand = GEE(endog, exog, groups, weights=weights).fit()
        assert_allclose(by_hand.params, np.r_[2/3.])

        # Comparison against stata using groups with different sizes.
        weights = np.r_[np.ones(10), 2 * np.ones(10)]
        endog = np.r_[1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 6,
                      7, 8, 7, 8]
        exog1 = np.r_[1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4,
                      3, 3, 3, 3]
        groups = np.r_[1, 1, 2, 2, 2, 2, 4, 4, 5, 5, 6, 6, 6, 6,
                       8, 8, 9, 9, 10, 10]
        exog = np.column_stack((np.ones(20), exog1))

        # Comparison using independence model
        indep_model = GEE(endog, exog, groups, weights=weights,
                          cov_struct=sm.cov_struct.Independence())
        n_total = 20
        g = np.mean([2, 4, 2, 2, 4, 2, 2, 2])
        fac = n_total / float(n_total - g)
        indep_fit = indep_model.fit(ddof_scale=0, scaling_factor=fac)

        assert_allclose(indep_fit.params, np.r_[1.247573, 1.436893],
                        atol=1e-6)
        assert_allclose(indep_fit.scale, 1.808576)

        # Stata multiplies robust SE by sqrt(N / (N - g)), where N is
        # the total sample size and g is the average group size.
        assert_allclose(indep_fit.bse, np.r_[0.895366, 0.3425498],
                        atol=1e-5)

        # Comparison using exchangeable model
        # Smoke test for now
        exch_model = GEE(endog, exog, groups, weights=weights,
                         cov_struct=sm.cov_struct.Exchangeable())
        exch_model.fit(ddof_scale=0)
예제 #17
0
    def test_weighted(self):
        # Checks of the `weights` argument: a hand-computable case, a
        # comparison with Stata values, and a smoke test.

        # Simple check where the answer can be computed by hand: the
        # first ten observations have response 1 and weight 2, the
        # last ten have response 0 and weight 1.
        endog = np.r_[np.ones(10), np.zeros(10)]
        weights = np.r_[2 * np.ones(10), np.ones(10)]
        exog = np.ones(20)
        groups = np.repeat(np.arange(10), 2)
        by_hand = GEE(endog, exog, groups, weights=weights).fit()
        assert_allclose(by_hand.params, np.r_[2/3.])

        # Comparison against stata using groups with different sizes.
        weights = np.r_[np.ones(10), 2 * np.ones(10)]
        endog = np.r_[1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 6,
                      7, 8, 7, 8]
        exog1 = np.r_[1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4,
                      3, 3, 3, 3]
        groups = np.r_[1, 1, 2, 2, 2, 2, 4, 4, 5, 5, 6, 6, 6, 6,
                       8, 8, 9, 9, 10, 10]
        exog = np.column_stack((np.ones(20), exog1))

        # Comparison using independence model
        indep_model = GEE(endog, exog, groups, weights=weights,
                          cov_struct=sm.cov_struct.Independence())
        n_total = 20
        g = np.mean([2, 4, 2, 2, 4, 2, 2, 2])
        fac = n_total / float(n_total - g)
        indep_fit = indep_model.fit(ddof_scale=0, scaling_factor=fac)

        assert_allclose(indep_fit.params, np.r_[1.247573, 1.436893],
                        atol=1e-6)
        assert_allclose(indep_fit.scale, 1.808576)

        # Stata multiplies robust SE by sqrt(N / (N - g)), where N is
        # the total sample size and g is the average group size.
        assert_allclose(indep_fit.bse, np.r_[0.895366, 0.3425498],
                        atol=1e-5)

        # Comparison using exchangeable model
        # Smoke test for now
        exch_model = GEE(endog, exog, groups, weights=weights,
                         cov_struct=sm.cov_struct.Exchangeable())
        exch_model.fit(ddof_scale=0)
예제 #18
0
    def test_post_estimation(self):
        # Check that `fittedvalues` and `resid` of a Gaussian fit with
        # an Exchangeable structure match the values computed directly
        # from the estimated parameters.

        endog, exog, group = load_data("gee_linear_1.csv")

        model = GEE(endog, exog, group, None, Gaussian(), Exchangeable())
        fit = model.fit()

        linear_pred = np.dot(exog, fit.params)
        assert_almost_equal(linear_pred, fit.fittedvalues)
        assert_almost_equal(endog - linear_pred, fit.resid)
예제 #19
0
    def test_post_estimation(self):
        # Check that `fittedvalues` and `resid` of a Gaussian fit with
        # an Exchangeable structure match the values computed directly
        # from the estimated parameters.

        endog, exog, group = load_data("gee_linear_1.csv")

        model = GEE(endog, exog, group, None, Gaussian(), Exchangeable())
        fit = model.fit()

        linear_pred = np.dot(exog, fit.params)
        assert_almost_equal(linear_pred, fit.fittedvalues)
        assert_almost_equal(endog - linear_pred, fit.resid)
예제 #20
0
    def test_wrapper(self):
        # Fit a Poisson GEE with an Independence structure on pandas
        # inputs and pass the result to check_wrapper.

        raw_endog, raw_exog, raw_groups = load_data("gee_poisson_1.csv",
                                                    icept=False)

        # Convert the loaded data to pandas objects before modelling.
        endog = pd.Series(raw_endog)
        exog = pd.DataFrame(raw_exog)
        group_n = pd.Series(raw_groups)

        model = GEE(endog, exog, group_n, None, Poisson(), Independence())
        check_wrapper(model.fit())
예제 #21
0
    def test_wrapper(self):
        # Fit a Poisson GEE with an Independence structure on pandas
        # inputs and pass the result to check_wrapper.

        raw_endog, raw_exog, raw_groups = load_data("gee_poisson_1.csv",
                                                    icept=False)

        # Convert the loaded data to pandas objects before modelling.
        endog = pd.Series(raw_endog)
        exog = pd.DataFrame(raw_exog)
        group_n = pd.Series(raw_groups)

        model = GEE(endog, exog, group_n, None, Poisson(), Independence())
        check_wrapper(model.fit())
예제 #22
0
    def test_autoregressive(self):
        # Simulate Gaussian data with autoregressive within-group
        # errors for group sizes 1, 2 and 3, fit an Autoregressive GEE
        # to each data set and compare with stored values.

        # Stored values, one entry per group size.
        dep_params_true = [0, 0.589208623896, 0.559823804948]

        params_true = [[1.08043787, 1.12709319, 0.90133927],
                       [0.9613677, 1.05826987, 0.90832055],
                       [1.05370439, 0.96084864, 0.93923374]]

        np.random.seed(342837482)

        num_group = 100
        ar_param = 0.5
        k = 3

        ga = Gaussian()

        expected = zip((1, 2, 3), dep_params_true, params_true)
        for gsize, dep_expected, params_expected in expected:

            # Matrix with ar_param ** |i - j| in position (i, j), and
            # its Cholesky factor used to generate the errors.
            positions = np.arange(gsize)
            lags = np.abs(positions[:, None] - positions[None, :])
            cmat_r = np.linalg.cholesky(ar_param ** lags)

            endog_parts = []
            exog_parts = []
            group_parts = []
            for label in range(num_group):
                # The covariates are drawn before the errors.
                x = np.random.normal(size=(gsize, k))
                noise = np.dot(cmat_r, np.random.normal(size=gsize))
                exog_parts.append(x)
                endog_parts.append(x.sum(1) + noise)
                group_parts.append(label * np.ones(gsize))

            endog = np.concatenate(endog_parts)
            groups = np.concatenate(group_parts)
            exog = np.concatenate(exog_parts, axis=0)

            ar = Autoregressive()
            fit = GEE(endog, exog, groups, family=ga, cov_struct=ar).fit()

            assert_almost_equal(ar.dep_params, dep_expected)
            assert_almost_equal(fit.params, params_expected)
예제 #23
0
    def test_autoregressive(self):
        # Simulate Gaussian data with autoregressive within-group
        # errors for group sizes 1, 2 and 3, fit an Autoregressive GEE
        # to each data set and compare with stored values.

        # Stored values, one entry per group size.
        dep_params_true = [0, 0.589208623896, 0.559823804948]

        params_true = [[1.08043787, 1.12709319, 0.90133927],
                       [0.9613677, 1.05826987, 0.90832055],
                       [1.05370439, 0.96084864, 0.93923374]]

        np.random.seed(342837482)

        num_group = 100
        ar_param = 0.5
        k = 3

        ga = Gaussian()

        expected = zip((1, 2, 3), dep_params_true, params_true)
        for gsize, dep_expected, params_expected in expected:

            # Matrix with ar_param ** |i - j| in position (i, j), and
            # its Cholesky factor used to generate the errors.
            positions = np.arange(gsize)
            lags = np.abs(positions[:, None] - positions[None, :])
            cmat_r = np.linalg.cholesky(ar_param ** lags)

            endog_parts = []
            exog_parts = []
            group_parts = []
            for label in range(num_group):
                # The covariates are drawn before the errors.
                x = np.random.normal(size=(gsize, k))
                noise = np.dot(cmat_r, np.random.normal(size=gsize))
                exog_parts.append(x)
                endog_parts.append(x.sum(1) + noise)
                group_parts.append(label * np.ones(gsize))

            endog = np.concatenate(endog_parts)
            groups = np.concatenate(group_parts)
            exog = np.concatenate(exog_parts, axis=0)

            ar = Autoregressive()
            fit = GEE(endog, exog, groups, family=ga, cov_struct=ar).fit()

            assert_almost_equal(ar.dep_params, dep_expected)
            assert_almost_equal(fit.params, params_expected)
예제 #24
0
    def test_formulas(self):
        """
        Check formulas, especially passing groups and time as either
        variable names or arrays.

        Every formula-based fit is compared with a fit built directly
        from the arrays.
        """

        n = 100
        Y = np.random.normal(size=n)
        X1 = np.random.normal(size=n)
        Time = np.random.uniform(size=n)
        groups = np.kron(lrange(20), np.ones(5))

        # Design matrix for the array-based model: a column of ones
        # followed by X1.
        mat = np.concatenate((np.ones((n, 1)), X1[:, None]), axis=1)

        data = pd.DataFrame({"Y": Y, "X1": X1, "Time": Time, "groups": groups})

        va = Autoregressive()
        family = Gaussian()

        array_fit = GEE(Y, mat, groups, time=Time, family=family,
                        cov_struct=va).fit()

        # groups and time given as arrays or as column names, in all
        # four combinations.
        combinations = [
            (groups, Time),
            (groups, "Time"),
            ("groups", Time),
            ("groups", "Time"),
        ]
        formula_fits = [
            GEE.from_formula("Y ~ X1", groups_arg, data, time=time_arg,
                             family=family, cov_struct=va).fit()
            for groups_arg, time_arg in combinations
        ]

        for formula_fit in formula_fits:
            assert_almost_equal(array_fit.params, formula_fit.params,
                                decimal=8)

        # The wrapper check uses the fit with both arguments as arrays.
        check_wrapper(formula_fits[0])
예제 #25
0
    def test_linear_constrained(self):
        # Fit Gaussian GEE models under a linear constraint on the
        # last coefficient and check that its estimate is zero to ten
        # decimals, for two dependence structures.

        n_obs = 300
        exog = np.random.normal(size=(n_obs, 4))
        exog[:, 0] = 1
        coefs = np.r_[1, 1, 0, 0.2]
        endog = np.dot(exog, coefs) + np.random.normal(size=n_obs)

        # 100 groups of three consecutive observations each.
        group = np.repeat(np.arange(100), 3)

        # Constraint pair passed to GEE: L selects the fourth
        # coefficient and R holds the value 0.
        L = np.array([[0, 0, 0, 1]])
        R = np.array([0])

        gaussian = Gaussian()
        for dependence in (Independence(), Exchangeable()):
            model = GEE(endog, exog, group, None, gaussian, dependence,
                        constraint=(L, R))
            fit = model.fit()
            assert_almost_equal(fit.params[3], 0, decimal=10)
예제 #26
0
    def test_ordinal(self):
        # Fit a Binomial GEE with an ordinal GlobalOddsRatio structure
        # after the model's ordinal setup and compare the estimates
        # and standard errors with stored values.

        endog, exog, groups = load_data("gee_ordinal_1.csv", icept=False)

        model = GEE(endog, exog, groups, None, Binomial(),
                    GlobalOddsRatio("ordinal"))
        model.setup_ordinal()
        fit = model.fit()

        expected_params = np.r_[
            1.09238131, 0.02148193, -0.39879146, -0.01855666,
            0.02983409, 1.18123172, 0.01845318, -1.10233886]
        expected_bse = np.r_[
            0.10878752, 0.10326078, 0.11171241, 0.05488705,
            0.05995019, 0.0916574, 0.05951445, 0.08539281]

        assert_almost_equal(fit.params, expected_params, decimal=5)
        assert_almost_equal(fit.bse, expected_bse, decimal=5)
예제 #27
0
    def test_ordinal(self):
        """Fit an ordinal GEE and compare it with reference estimates.

        The data come from ``gee_ordinal_1.csv`` (loaded with
        ``icept=False``).  The model uses the Binomial family and a
        ``GlobalOddsRatio("ordinal")`` dependence structure;
        ``setup_ordinal`` is called before fitting.  Parameters and
        ``bse`` must match the stored values to 5 decimals.
        """

        data = load_data("gee_ordinal_1.csv", icept=False)
        endog, exog, groups = data

        dep_struct = GlobalOddsRatio("ordinal")
        ordinal_model = GEE(endog, exog, groups, None, Binomial(),
                            dep_struct)
        ordinal_model.setup_ordinal()
        fit = ordinal_model.fit()

        # Reference values: coefficients first, then standard errors.
        ref_params = np.r_[1.09238131, 0.02148193,
                           -0.39879146, -0.01855666,
                           0.02983409, 1.18123172,
                           0.01845318, -1.10233886]
        ref_bse = np.r_[0.10878752, 0.10326078,
                        0.11171241, 0.05488705,
                        0.05995019, 0.0916574,
                        0.05951445, 0.08539281]

        assert_almost_equal(fit.params, ref_params, decimal=5)
        assert_almost_equal(fit.bse, ref_bse, decimal=5)
예제 #28
0
    def test_linear_constrained(self):
        """Verify that ``constraint=(L, R)`` is respected by the fit.

        ``L`` is a single row selecting the fourth coefficient and ``R``
        is zero.  A Gaussian GEE is fit under that constraint with both
        an Independence and an Exchangeable dependence structure, and
        the fourth fitted parameter is required to be zero to 10
        decimals each time.
        """

        nobs = 300

        # Regressors with a constant first column.
        exog = np.random.normal(size=(nobs, 4))
        exog[:, 0] = 1

        # Responses: linear predictor plus normal noise.  The noise is
        # drawn after exog, matching the original order of random calls.
        linpred = np.dot(exog, np.r_[1, 1, 0, 0.2])
        endog = linpred + np.random.normal(size=nobs)

        # Cluster labels 0..99, each repeated three times in a row.
        group = np.kron(np.arange(100), np.r_[1, 1, 1])

        dep_structs = (Independence(), Exchangeable())
        family = Gaussian()

        L = np.r_[[[0, 0, 0, 1]]]
        R = np.r_[0, ]

        for dep in dep_structs:
            fitted = GEE(endog, exog, group, None, family, dep,
                         constraint=(L, R)).fit()
            assert_almost_equal(fitted.params[3], 0, decimal=10)
예제 #29
0
    def test_margins(self):
        """Smoke-test ``GEEMargins`` on a simulated binary-response fit.

        No numerical values are compared: the test only requires that
        fitting the model, constructing ``GEEMargins`` from the result
        and calling its ``summary`` method all complete without raising.
        """

        n = 300

        # Covariates: a constant column, an indicator of exog[:, 2] < 0,
        # and two untouched normal columns.
        exog = np.random.normal(size=(n, 4))
        exog[:, 0] = 1
        exog[:, 1] = 1 * (exog[:, 2] < 0)

        # Binary responses generated from a logistic model.
        beta = np.r_[0, 1, -1, 0.5]
        linpred = np.dot(exog, beta)
        prob = 1 / (1 + np.exp(-linpred))
        endog = 1 * (np.random.uniform(size=n) < prob)

        # Clusters of four consecutive rows; every time value is zero.
        group = np.kron(np.arange(n / 4), np.ones(4))
        time = np.zeros((n, 1))

        model = GEE(endog, exog, group, time, Binomial(), Exchangeable())
        result = model.fit()

        margins = GEEMargins(result, ())
        margins.summary()
예제 #30
0
    def test_margins(self):
        """Run ``GEEMargins`` end to end on simulated data.

        This is a smoke test: it asserts nothing about the numbers and
        passes as long as the fit, the ``GEEMargins`` construction and
        the ``summary`` call do not raise.
        """

        n = 300

        # Design matrix: intercept, a 0/1 indicator derived from the
        # third column, and two normal columns.
        exog = np.random.normal(size=(n, 4))
        exog[:, 0] = 1
        negative_third = exog[:, 2] < 0
        exog[:, 1] = 1 * negative_third

        group = np.kron(np.arange(n / 4), np.ones(4))
        time = np.zeros((n, 1))

        # Success probabilities from the inverse logit of the linear
        # predictor, then Bernoulli responses via uniform draws.
        beta = np.r_[0, 1, -1, 0.5]
        prob = 1 / (1 + np.exp(-np.dot(exog, beta)))
        draws = np.random.uniform(size=n)
        endog = 1 * (draws < prob)

        family = Binomial()
        dep_struct = Exchangeable()
        fitted = GEE(endog, exog, group, time, family, dep_struct).fit()

        GEEMargins(fitted, ()).summary()
예제 #31
0
    def test_poisson(self):
        """
        Check Poisson GEE estimates against reference values from R gee.

        The model is fit from arrays and again through
        ``GEE.from_formula``, each time with an Independence and an
        Exchangeable dependence structure.  Parameters must agree with
        the references to 5 decimals and standard errors to 6.

        R code recorded alongside the reference values:

        library(gee)
        Z = read.csv("results/gee_poisson_1.csv", header=FALSE)
        Y = Z[,2]
        Id = Z[,1]
        X1 = Z[,3]
        X2 = Z[,4]
        X3 = Z[,5]
        X4 = Z[,6]
        X5 = Z[,7]

        mi = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
                corstr="independence", scale.fix=TRUE)
        smi = summary(mi)
        u = coefficients(smi)
        cfi = paste(u[,1], collapse=",")
        sei = paste(u[,4], collapse=",")

        me = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
                corstr="exchangeable", scale.fix=TRUE)
        sme = summary(me)

        u = coefficients(sme)
        cfe = paste(u[,1], collapse=",")
        see = paste(u[,4], collapse=",")

        sprintf("cf = [[%s],[%s]]", cfi, cfe)
        sprintf("se = [[%s],[%s]]", sei, see)
        """

        family = Poisson()

        endog, exog, group_n = load_data("gee_poisson_1.csv")

        # The same two structure objects are used for the array fits and
        # then again for the formula fits.
        dep_structs = (Independence(), Exchangeable())

        # From R gee: one row per dependence structure, in the order of
        # dep_structs.
        cf = [
            [-0.0364450410793481, -0.0543209391301178,
             0.0156642711741052, 0.57628591338724,
             -0.00465659951186211, -0.477093153099256],
            [-0.0315615554826533, -0.0562589480840004,
             0.0178419412298561, 0.571512795340481,
             -0.00363255566297332, -0.475971696727736],
        ]
        se = [
            [0.0611309237214186, 0.0390680524493108,
             0.0334234174505518, 0.0366860768962715,
             0.0304758505008105, 0.0316348058881079],
            [0.0610840153582275, 0.0376887268649102,
             0.0325168379415177, 0.0369786751362213,
             0.0296141014225009, 0.0306115470200955],
        ]

        for dep, cf_ref, se_ref in zip(dep_structs, cf, se):
            result = GEE(endog, exog, group_n, None, family, dep).fit()
            assert_almost_equal(result.params, cf_ref, decimal=5)
            assert_almost_equal(result.standard_errors(), se_ref,
                                decimal=6)

        # Test with formulas.  The first column of exog is left out of
        # the frame; the remaining columns are named X1, X2, ...
        pieces = (endog[:, None], group_n[:, None], exog[:, 1:])
        frame = pd.DataFrame(np.concatenate(pieces, axis=1))
        frame.columns = ["Y", "Id"] + ["X%d" % k
                                       for k in range(1, exog.shape[1])]

        for dep, cf_ref, se_ref in zip(dep_structs, cf, se):
            model = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id",
                                     frame, family=family, cov_struct=dep)
            result = model.fit()
            assert_almost_equal(result.params, cf_ref, decimal=5)
            assert_almost_equal(result.standard_errors(), se_ref,
                                decimal=6)
예제 #32
0
from statsmodels.genmod.families import Gaussian, Binomial, Poisson
from statsmodels.genmod.dependence_structures import (Exchangeable,
    Independence, GlobalOddsRatio, Autoregressive, Nested)

from statsmodels.genmod.tests import gee_gaussian_simulation_check as gees

# gen_gendat_ar0(0.6) returns a callable; calling it yields the pair
# (da, va) that is passed to GEE below.
da, va = gees.gen_gendat_ar0(0.6)()
ga = Gaussian()

# Constraint pair handed to GEE as ``constraint=(lhs, rhs)``.
lhs = np.array([[0., 1, 1, 0, 0]])
rhs = np.r_[0.,]

# Optional sections to run; the constrained fit is skipped unless
# 'constraint' is added to this list.
example = []
if 'constraint' in example:
    md = GEE(da.endog, da.exog, da.group, da.time, ga, va,
             constraint=(lhs, rhs))
    mdf = md.fit()
    print(mdf.summary())


# Unconstrained fit; always runs and prints its summary.
md2 = GEE(da.endog, da.exog, da.group, da.time, ga, va, constraint=None)
mdf2 = md2.fit()
print('\n\n')
print(mdf2.summary())


# Set use_t and df_resid on the result by hand before calling t_test
# with an identity restriction matrix (one row per parameter).
mdf2.use_t = False
mdf2.df_resid = np.diff(mdf2.model.exog.shape)
tt2 = mdf2.t_test(np.eye(len(mdf2.params)))
# need master to get wald_test
#print mdf2.wald_test(np.eye(len(mdf2.params))[1:])
예제 #33
0
    def test_linear(self):
        """
        Check Gaussian GEE estimates against reference values from R gee.

        The model is fit from arrays and again through
        ``GEE.from_formula``, each time with an Independence and an
        Exchangeable dependence structure.  Parameters and standard
        errors must agree with the references to 10 decimals.

        R code recorded alongside the reference values:

        library(gee)

        Z = read.csv("results/gee_linear_1.csv", header=FALSE)
        Y = Z[,2]
        Id = Z[,1]
        X1 = Z[,3]
        X2 = Z[,4]
        X3 = Z[,5]
        mi = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
                 corstr="independence", tol=1e-8, maxit=100)
        smi = summary(mi)
        u = coefficients(smi)

        cfi = paste(u[,1], collapse=",")
        sei = paste(u[,4], collapse=",")

        me = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
                 corstr="exchangeable", tol=1e-8, maxit=100)
        sme = summary(me)
        u = coefficients(sme)

        cfe = paste(u[,1], collapse=",")
        see = paste(u[,4], collapse=",")

        sprintf("cf = [[%s],[%s]]", cfi, cfe)
        sprintf("se = [[%s],[%s]]", sei, see)
        """

        family = Gaussian()

        endog, exog, group = load_data("gee_linear_1.csv")

        # The same two structure objects are used for the array fits and
        # then again for the formula fits.
        dep_structs = (Independence(), Exchangeable())

        # From R gee: one row per dependence structure, in the order of
        # dep_structs.
        cf = [
            [-0.01850226507491, 0.81436304278962,
             -1.56167635393184, 0.794239361055003],
            [-0.0182920577154767, 0.814898414022467,
             -1.56194040106201, 0.793499517527478],
        ]
        se = [
            [0.0440733554189401, 0.0479993639119261,
             0.0496045952071308, 0.0479467597161284],
            [0.0440369906460754, 0.0480069787567662,
             0.049519758758187, 0.0479760443027526],
        ]

        for dep, cf_ref, se_ref in zip(dep_structs, cf, se):
            result = GEE(endog, exog, group, None, family, dep).fit()
            assert_almost_equal(result.params, cf_ref, decimal=10)
            assert_almost_equal(result.standard_errors(), se_ref,
                                decimal=10)

        # Test with formulas.  The first column of exog is left out of
        # the frame; the remaining columns are named X1, X2, ...
        pieces = (endog[:, None], group[:, None], exog[:, 1:])
        frame = pd.DataFrame(np.concatenate(pieces, axis=1))
        frame.columns = ["Y", "Id"] + ["X%d" % k
                                       for k in range(1, exog.shape[1])]

        for dep, cf_ref, se_ref in zip(dep_structs, cf, se):
            model = GEE.from_formula("Y ~ X1 + X2 + X3", "Id", frame,
                                     family=family, cov_struct=dep)
            result = model.fit()
            assert_almost_equal(result.params, cf_ref, decimal=10)
            assert_almost_equal(result.standard_errors(), se_ref,
                                decimal=10)
예제 #34
0
    for jg, gendat in enumerate(gendats):

        dparams = []
        params = []
        std_errors = []
        pvalues = []

        for j in range(nrep):

            da, va, mt, constraint = gendat()

            beta = da.starting_values(0)

            md = GEE(da.endog_ex, da.exog_ex, da.group_ex, None, mt, va)
            mdf = md.fit(start_params=beta)

            if mdf is None:
                continue

            scale_inv = 1 / md.estimate_scale()

            dparams.append(np.r_[va.dparams, scale_inv])

            params.append(np.asarray(mdf.params))
            std_errors.append(np.asarray(mdf.standard_errors))

            da, va, mt, constraint = gendat()

            beta = da.starting_values(constraint[0].shape[0])
예제 #35
0
    def test_logistic(self):
        """
        Check Binomial GEE estimates against reference values from R gee.

        Array-based fits are run with Independence, Exchangeable and
        Autoregressive dependence structures; the autoregressive fit is
        executed but its estimates are not compared.  The Independence
        and Exchangeable fits are then repeated through
        ``GEE.from_formula``.  All comparisons use 6 decimals.

        R code for comparing results:

        library(gee)
        Z = read.csv("results/gee_logistic_1.csv", header=FALSE)
        Y = Z[,2]
        Id = Z[,1]
        X1 = Z[,3]
        X2 = Z[,4]
        X3 = Z[,5]

        mi = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
                 corstr="independence")
        smi = summary(mi)
        u = coefficients(smi)
        cfi = paste(u[,1], collapse=",")
        sei = paste(u[,4], collapse=",")

        me = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
                 corstr="exchangeable")
        sme = summary(me)
        u = coefficients(sme)
        cfe = paste(u[,1], collapse=",")
        see = paste(u[,4], collapse=",")

        ma = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
                 corstr="AR-M")
        sma = summary(ma)
        u = coefficients(sma)
        cfa = paste(u[,1], collapse=",")
        sea = paste(u[,4], collapse=",")

        sprintf("cf = [[%s],[%s],[%s]]", cfi, cfe, cfa)
        sprintf("se = [[%s],[%s],[%s]]", sei, see, sea)
        """

        endog, exog, group = load_data("gee_logistic_1.csv")

        # Time values for the autoregressive model: within each group
        # the rows are numbered 0, 1, 2, ... in their order of appearance.
        T = np.zeros(len(endog))
        for label in set(group):
            rows = np.flatnonzero(group == label)
            T[rows] = lrange(len(rows))

        family = Binomial()
        ve = Exchangeable()
        vi = Independence()
        va = Autoregressive()

        # From R gee: rows are independence, exchangeable, AR-M.
        cf = [
            [0.0167272965285882, 1.13038654425893,
             -1.86896345082962, 1.09397608331333],
            [0.0178982283915449, 1.13118798191788,
             -1.86133518416017, 1.08944256230299],
            [0.0109621937947958, 1.13226505028438,
             -1.88278757333046, 1.09954623769449],
        ]
        se = [
            [0.127291720283049, 0.166725808326067,
             0.192430061340865, 0.173141068839597],
            [0.127045031730155, 0.165470678232842,
             0.192052750030501, 0.173174779369249],
            [0.127240302296444, 0.170554083928117,
             0.191045527104503, 0.169776150974586],
        ]

        for dep, cf_ref, se_ref in zip((vi, ve, va), cf, se):
            result = GEE(endog, exog, group, T, family, dep).fit()
            if dep is va:
                # The autoregressive fit only has to run; its estimates
                # are not compared with the AR-M reference row.
                continue
            assert_almost_equal(result.params, cf_ref, decimal=6)
            assert_almost_equal(result.standard_errors(), se_ref,
                                decimal=6)

        # Test with formulas.  The first column of exog is left out of
        # the frame; the remaining columns are named X1, X2, ...
        pieces = (endog[:, None], group[:, None], exog[:, 1:])
        frame = pd.DataFrame(np.concatenate(pieces, axis=1))
        frame.columns = ["Y", "Id"] + ["X%d" % k
                                       for k in range(1, exog.shape[1])]

        # zip stops after two items, so only the first two reference
        # rows are used here.
        for dep, cf_ref, se_ref in zip((vi, ve), cf, se):
            model = GEE.from_formula("Y ~ X1 + X2 + X3", "Id", frame,
                                     family=family, cov_struct=dep)
            result = model.fit()
            assert_almost_equal(result.params, cf_ref, decimal=6)
            assert_almost_equal(result.standard_errors(), se_ref,
                                decimal=6)
예제 #36
0
    def test_poisson(self):
        """
        Compare Poisson GEE fits with estimates produced by R's gee.

        Independence and Exchangeable dependence structures are each fit
        twice: once from arrays and once through ``GEE.from_formula``.
        Parameters are checked to 5 decimals, standard errors to 6.

        R code recorded alongside the reference values:

        library(gee)
        Z = read.csv("results/gee_poisson_1.csv", header=FALSE)
        Y = Z[,2]
        Id = Z[,1]
        X1 = Z[,3]
        X2 = Z[,4]
        X3 = Z[,5]
        X4 = Z[,6]
        X5 = Z[,7]

        mi = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
                corstr="independence", scale.fix=TRUE)
        smi = summary(mi)
        u = coefficients(smi)
        cfi = paste(u[,1], collapse=",")
        sei = paste(u[,4], collapse=",")

        me = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
                corstr="exchangeable", scale.fix=TRUE)
        sme = summary(me)

        u = coefficients(sme)
        cfe = paste(u[,1], collapse=",")
        see = paste(u[,4], collapse=",")

        sprintf("cf = [[%s],[%s]]", cfi, cfe)
        sprintf("se = [[%s],[%s]]", sei, see)
        """

        family = Poisson()

        endog, exog, group_n = load_data("gee_poisson_1.csv")

        # The same two structure objects serve the array fits and then
        # the formula fits.
        dep_structs = (Independence(), Exchangeable())

        # From R gee: first row independence, second row exchangeable.
        cf = [
            [-0.0364450410793481, -0.0543209391301178,
             0.0156642711741052, 0.57628591338724,
             -0.00465659951186211, -0.477093153099256],
            [-0.0315615554826533, -0.0562589480840004,
             0.0178419412298561, 0.571512795340481,
             -0.00363255566297332, -0.475971696727736],
        ]
        se = [
            [0.0611309237214186, 0.0390680524493108,
             0.0334234174505518, 0.0366860768962715,
             0.0304758505008105, 0.0316348058881079],
            [0.0610840153582275, 0.0376887268649102,
             0.0325168379415177, 0.0369786751362213,
             0.0296141014225009, 0.0306115470200955],
        ]

        for dep, cf_ref, se_ref in zip(dep_structs, cf, se):
            result = GEE(endog, exog, group_n, None, family, dep).fit()
            assert_almost_equal(result.params, cf_ref, decimal=5)
            assert_almost_equal(result.standard_errors(), se_ref, decimal=6)

        # Test with formulas.  The first column of exog is left out of
        # the frame; the remaining columns are named X1, X2, ...
        pieces = (endog[:, None], group_n[:, None], exog[:, 1:])
        frame = pd.DataFrame(np.concatenate(pieces, axis=1))
        covariate_names = ["X%d" % k for k in range(1, exog.shape[1])]
        frame.columns = ["Y", "Id"] + covariate_names

        for dep, cf_ref, se_ref in zip(dep_structs, cf, se):
            model = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5",
                                     frame,
                                     None,
                                     groups=frame.loc[:, "Id"],
                                     family=family,
                                     covstruct=dep)
            result = model.fit()
            assert_almost_equal(result.params, cf_ref, decimal=5)
            assert_almost_equal(result.standard_errors(), se_ref, decimal=6)
        pvalues = []
        params = []
        std_errors = []
        dparams = []

        for j in range(nrep):

            da, va = gendat()
            ga = Poisson()

            # Poisson seems to be more sensitive to starting values,
            # so we run the independence model first.
            md = GEE(da.endog, da.exog, da.group, da.time, ga,
                     Independence())
            mdf = md.fit()

            md = GEE(da.endog, da.exog, da.group, da.time, ga, va)
            mdf = md.fit(start_params = mdf.params)
            if mdf is None or (not mdf.converged):
                print("Failed to converge")
                continue

            scale_inv = 1. / md.estimate_scale()
            dparams.append(np.r_[va.dparams, scale_inv])
            params.append(np.asarray(mdf.params))
            std_errors.append(np.asarray(mdf.standard_errors))

            da,va = gendat()
            ga = Poisson()
예제 #38
0
# Loop over data generating models
for gendat in gendats:

    pvalues = []
    params = []
    std_errors = []
    dparams = []

    for j in range(nrep):

        da, va = gendat()
        ga = Gaussian()

        md = GEE(da.endog, da.exog, da.group, da.time, ga, va)
        mdf = md.fit()

        scale_inv = 1 / md.estimate_scale()
        dparams.append(np.r_[va.dparams, scale_inv])
        params.append(np.asarray(mdf.params))
        std_errors.append(np.asarray(mdf.standard_errors))

        da, va = gendat()
        ga = Gaussian()

        md = GEE(da.endog,
                 da.exog,
                 da.group,
                 da.time,
                 ga,
                 va,
예제 #39
0
    def test_linear(self):
        """
        Compare Gaussian GEE fits with estimates produced by R's gee.

        Independence and Exchangeable dependence structures are each fit
        twice: once from arrays and once through ``GEE.from_formula``.
        Parameters and standard errors are checked to 10 decimals.

        R code recorded alongside the reference values:

        library(gee)

        Z = read.csv("results/gee_linear_1.csv", header=FALSE)
        Y = Z[,2]
        Id = Z[,1]
        X1 = Z[,3]
        X2 = Z[,4]
        X3 = Z[,5]
        mi = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
                 corstr="independence", tol=1e-8, maxit=100)
        smi = summary(mi)
        u = coefficients(smi)

        cfi = paste(u[,1], collapse=",")
        sei = paste(u[,4], collapse=",")

        me = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
                 corstr="exchangeable", tol=1e-8, maxit=100)
        sme = summary(me)
        u = coefficients(sme)

        cfe = paste(u[,1], collapse=",")
        see = paste(u[,4], collapse=",")

        sprintf("cf = [[%s],[%s]]", cfi, cfe)
        sprintf("se = [[%s],[%s]]", sei, see)
        """

        family = Gaussian()

        endog, exog, group = load_data("gee_linear_1.csv")

        # The same two structure objects serve the array fits and then
        # the formula fits.
        dep_structs = (Independence(), Exchangeable())

        # From R gee: first row independence, second row exchangeable.
        cf = [
            [-0.01850226507491, 0.81436304278962,
             -1.56167635393184, 0.794239361055003],
            [-0.0182920577154767, 0.814898414022467,
             -1.56194040106201, 0.793499517527478],
        ]
        se = [
            [0.0440733554189401, 0.0479993639119261,
             0.0496045952071308, 0.0479467597161284],
            [0.0440369906460754, 0.0480069787567662,
             0.049519758758187, 0.0479760443027526],
        ]

        for dep, cf_ref, se_ref in zip(dep_structs, cf, se):
            result = GEE(endog, exog, group, None, family, dep).fit()
            assert_almost_equal(result.params, cf_ref, decimal=10)
            assert_almost_equal(result.standard_errors(), se_ref, decimal=10)

        # Test with formulas.  The first column of exog is left out of
        # the frame; the remaining columns are named X1, X2, ...
        pieces = (endog[:, None], group[:, None], exog[:, 1:])
        frame = pd.DataFrame(np.concatenate(pieces, axis=1))
        covariate_names = ["X%d" % k for k in range(1, exog.shape[1])]
        frame.columns = ["Y", "Id"] + covariate_names

        for dep, cf_ref, se_ref in zip(dep_structs, cf, se):
            model = GEE.from_formula("Y ~ X1 + X2 + X3",
                                     frame,
                                     None,
                                     groups=frame.loc[:, "Id"],
                                     family=family,
                                     covstruct=dep)
            result = model.fit()
            assert_almost_equal(result.params, cf_ref, decimal=10)
            assert_almost_equal(result.standard_errors(), se_ref, decimal=10)
예제 #40
0
from statsmodels.genmod.families import Gaussian, Binomial, Poisson
from statsmodels.genmod.dependence_structures import (Exchangeable,
    Independence, GlobalOddsRatio, Autoregressive, Nested)

from statsmodels.genmod.tests import gee_gaussian_simulation_check as gees

# gen_gendat_ar0(0.6) returns a callable; calling it yields the pair
# (da, va) that is passed to GEE below.
da, va = gees.gen_gendat_ar0(0.6)()
ga = Gaussian()

# Constraint pair handed to GEE as ``constraint=(lhs, rhs)``.
lhs = np.array([[0., 1, 1, 0, 0],])
rhs = np.r_[0.,]

# Optional sections to run; the constrained fit is skipped unless
# 'constraint' is added to this list.
example = []
if 'constraint' in example:
    md = GEE(da.endog, da.exog, da.group, da.time, ga, va,
             constraint=(lhs, rhs))
    mdf = md.fit()
    # print is called as a function so the script also parses on
    # Python 3; with a single argument the output is the same on Python 2.
    print(mdf.summary())


# Unconstrained fit; always runs and prints its summary.
md2 = GEE(da.endog, da.exog, da.group, da.time, ga, va,
          constraint=None)
mdf2 = md2.fit()
print('\n\n')
print(mdf2.summary())


# Set use_t and df_resid on the result by hand before calling t_test
# with an identity restriction matrix (one row per parameter).
mdf2.use_t = False
mdf2.df_resid = np.diff(mdf2.model.exog.shape)
tt2 = mdf2.t_test(np.eye(len(mdf2.params)))
# need master to get wald_test
#print(mdf2.wald_test(np.eye(len(mdf2.params))[1:]))
    for jg,gendat in enumerate(gendats):

        dparams = []
        params = []
        std_errors = []
        pvalues = []

        for j in range(nrep):

            da, va, mt, constraint = gendat()

            beta = da.starting_values(0)

            md = GEE(da.endog_ex, da.exog_ex, da.group_ex, None,
                     mt, va)
            mdf = md.fit(start_params = beta)

            if mdf is None:
                continue

            scale_inv = 1 / md.estimate_scale()

            dparams.append(np.r_[va.dparams, scale_inv])

            params.append(np.asarray(mdf.params))
            std_errors.append(np.asarray(mdf.standard_errors))

            da, va, mt, constraint = gendat()

            beta = da.starting_values(constraint[0].shape[0])
예제 #42
0
from statsmodels.genmod.generalized_estimating_equations import GEE, GEEMargins

from statsmodels.genmod.families import Gaussian, Binomial, Poisson
from statsmodels.genmod.dependence_structures import Exchangeable, Independence, GlobalOddsRatio, Autoregressive, Nested

from statsmodels.genmod.tests import gee_gaussian_simulation_check as gees

# Calling gen_gendat_ar0(0.6) gives a generator function; invoking that
# in turn produces the (da, va) pair used for every fit below.
da, va = gees.gen_gendat_ar0(0.6)()
ga = Gaussian()

# Left- and right-hand side passed together as ``constraint=(lhs, rhs)``.
lhs = np.array([[0.0, 1, 1, 0, 0]])
rhs = np.r_[0.0,]

# The constrained fit only runs when "constraint" is listed here.
example = []
if "constraint" in example:
    md = GEE(
        da.endog, da.exog, da.group, da.time, ga, va,
        constraint=(lhs, rhs),
    )
    mdf = md.fit()
    print(mdf.summary())


# Fit without a constraint and print the summary after two blank lines.
md2 = GEE(
    da.endog, da.exog, da.group, da.time, ga, va,
    constraint=None,
)
mdf2 = md2.fit()
print("\n\n")
print(mdf2.summary())


# use_t and df_resid are overwritten on the result before t_test is
# called with an identity restriction matrix (one row per parameter).
mdf2.use_t = False
mdf2.df_resid = np.diff(mdf2.model.exog.shape)
tt2 = mdf2.t_test(np.eye(len(mdf2.params)))
# need master to get wald_test
# print mdf2.wald_test(np.eye(len(mdf2.params))[1:])
예제 #43
0
    def test_logistic(self):
        """
        Check Binomial GEE estimates against reference values from R gee.

        Array-based fits are run with Independence, Exchangeable and
        Autoregressive dependence structures; the autoregressive fit is
        executed but its estimates are not compared.  The Independence
        and Exchangeable fits are then repeated through
        ``GEE.from_formula``, and the summary of the last fit is printed
        to catch run-time errors.  All comparisons use 6 decimals.

        R code for comparing results:

        library(gee)
        Z = read.csv("results/gee_logistic_1.csv", header=FALSE)
        Y = Z[,2]
        Id = Z[,1]
        X1 = Z[,3]
        X2 = Z[,4]
        X3 = Z[,5]

        mi = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
                 corstr="independence")
        smi = summary(mi)
        u = coefficients(smi)
        cfi = paste(u[,1], collapse=",")
        sei = paste(u[,4], collapse=",")

        me = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
                 corstr="exchangeable")
        sme = summary(me)
        u = coefficients(sme)
        cfe = paste(u[,1], collapse=",")
        see = paste(u[,4], collapse=",")

        ma = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
                 corstr="AR-M")
        sma = summary(ma)
        u = coefficients(sma)
        cfa = paste(u[,1], collapse=",")
        sea = paste(u[,4], collapse=",")

        sprintf("cf = [[%s],[%s],[%s]]", cfi, cfe, cfa)
        sprintf("se = [[%s],[%s],[%s]]", sei, see, sea)
        """

        endog, exog, group = load_data("gee_logistic_1.csv")

        # Time values for the autoregressive model: within each group
        # the rows are numbered 0, 1, 2, ... in their order of appearance.
        T = np.zeros(len(endog))
        for ii in set(group):
            jj = np.flatnonzero(group == ii)
            T[jj] = np.arange(len(jj))

        family = Binomial()
        ve = Exchangeable()
        vi = Independence()
        va = Autoregressive()

        # From R gee: rows are independence, exchangeable, AR-M.
        cf = [
            [0.0167272965285882, 1.13038654425893,
             -1.86896345082962, 1.09397608331333],
            [0.0178982283915449, 1.13118798191788,
             -1.86133518416017, 1.08944256230299],
            [0.0109621937947958, 1.13226505028438,
             -1.88278757333046, 1.09954623769449],
        ]
        se = [
            [0.127291720283049, 0.166725808326067,
             0.192430061340865, 0.173141068839597],
            [0.127045031730155, 0.165470678232842,
             0.192052750030501, 0.173174779369249],
            [0.127240302296444, 0.170554083928117,
             0.191045527104503, 0.169776150974586],
        ]

        for j, v in enumerate((vi, ve, va)):
            md = GEE(endog, exog, group, T, family, v)
            mdf = md.fit()
            # The autoregressive fit only has to run; its estimates are
            # not compared with the AR-M reference row.
            if v is not va:
                assert_almost_equal(mdf.params, cf[j], decimal=6)
                assert_almost_equal(mdf.standard_errors(), se[j], decimal=6)

        # Test with formulas.  The first column of exog is left out of
        # the frame; the remaining columns are named X1, X2, ...
        D = np.concatenate((endog[:, None], group[:, None], exog[:, 1:]),
                           axis=1)
        D = pd.DataFrame(D)
        D.columns = ["Y", "Id"] + ["X%d" % (k + 1)
                                   for k in range(exog.shape[1] - 1)]
        for j, v in enumerate((vi, ve)):
            md = GEE.from_formula("Y ~ X1 + X2 + X3",
                                  D,
                                  None,
                                  groups=D.loc[:, "Id"],
                                  family=family,
                                  covstruct=v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=6)
            assert_almost_equal(mdf.standard_errors(), se[j], decimal=6)

        # Check for run-time exceptions in summary.  print is called as
        # a function so the module also parses on Python 3.
        print(mdf.summary())
    for gendat in gendats:

        pvalues = []
        params = []
        std_errors = []
        dparams = []

        for j in range(nrep):

            da, va = gendat()
            ga = Poisson()

            # Poisson seems to be more sensitive to starting values,
            # so we run the independence model first.
            md = GEE(da.endog, da.exog, da.group, da.time, ga, Independence())
            mdf = md.fit()

            md = GEE(da.endog, da.exog, da.group, da.time, ga, va)
            mdf = md.fit(start_params=mdf.params)
            if mdf is None or (not mdf.converged):
                print("Failed to converge")
                continue

            scale_inv = 1. / md.estimate_scale()
            dparams.append(np.r_[va.dparams, scale_inv])
            params.append(np.asarray(mdf.params))
            std_errors.append(np.asarray(mdf.standard_errors))

            da, va = gendat()
            ga = Poisson()
    # Loop over data generating models
    for gendat in gendats:

        pvalues = []
        params = []
        std_errors = []
        dep_params = []

        for j in range(nrep):

            da,va = gendat()
            ga = Gaussian()

            md = GEE(da.endog, da.exog, da.group, da.time, ga, va)
            mdf = md.fit()

            scale_inv = 1 / md.estimate_scale()
            dep_params.append(np.r_[va.dep_params, scale_inv])
            params.append(np.asarray(mdf.params))
            std_errors.append(np.asarray(mdf.standard_errors()))

            da,va = gendat()
            ga = Gaussian()

            md = GEE(da.endog, da.exog, da.group, da.time, ga, va,
                     constraint=(lhs, rhs))
            mdf = md.fit()
            score = md.score_test_results
            pvalue = score["p-value"]
            pvalues.append(pvalue)