Example #1
    def test_scoretest(self):
        # Regression tests

        np.random.seed(6432)
        n = 200 # Must be divisible by 4
        exog = np.random.normal(size=(n, 4))
        endog = exog[:, 0] + exog[:, 1] + exog[:, 2]
        endog += 3*np.random.normal(size=n)
        group = np.kron(np.arange(n // 4), np.ones(4))

        # Test under the null.
        L = np.array([[1., -1, 0, 0]])
        R = np.array([0.,])
        family = Gaussian()
        va = Independence()
        mod1 = GEE(endog, exog, group, family=family,
                  cov_struct=va, constraint=(L, R))
        rslt1 = mod1.fit()
        assert_almost_equal(mod1.score_test_results["statistic"],
                            1.08126334)
        assert_almost_equal(mod1.score_test_results["p-value"],
                            0.2984151086)

        # Test under the alternative.
        L = np.array([[1., -1, 0, 0]])
        R = np.array([1.0,])
        family = Gaussian()
        va = Independence()
        mod2 = GEE(endog, exog, group, family=family,
                   cov_struct=va, constraint=(L, R))
        rslt2 = mod2.fit()
        assert_almost_equal(mod2.score_test_results["statistic"],
                            3.491110965)
        assert_almost_equal(mod2.score_test_results["p-value"],
                            0.0616991659)

        # Compare to Wald tests
        exog = np.random.normal(size=(n, 2))
        L = np.array([[1, -1]])
        R = np.array([0.])
        f = np.r_[1, -1]
        for i in range(10):
            endog = exog[:, 0] + (0.5 + i/10.)*exog[:, 1] +\
                    np.random.normal(size=n)
            family = Gaussian()
            va = Independence()
            mod0 = GEE(endog, exog, group, family=family,
                       cov_struct=va)
            rslt0 = mod0.fit()
            family = Gaussian()
            va = Independence()
            mod1 = GEE(endog, exog, group, family=family,
                       cov_struct=va, constraint=(L, R))
            rslt1 = mod1.fit()
            se = np.sqrt(np.dot(f, np.dot(rslt0.cov_params(), f)))
            wald_z = np.dot(f, rslt0.params) / se
            wald_p = 2*norm.cdf(-np.abs(wald_z))
            score_p = mod1.score_test_results["p-value"]
            assert_array_less(np.abs(wald_p - score_p), 0.02)
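The test snippets on this page are excerpted from a larger statsmodels test module and omit their imports. A minimal import block that should make most of them runnable might look like the following (module paths are from the public statsmodels API; load_data and lrange are helpers from the statsmodels test suite and compat layer, so treat those names as assumptions here):

import numpy as np
import pandas as pd
from scipy.stats import norm
from numpy.testing import (assert_almost_equal, assert_allclose,
                           assert_array_less, assert_equal)

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.generalized_estimating_equations import (
    GEE, NominalGEE, NominalGEEResults, NominalGEEResultsWrapper)
from statsmodels.genmod.cov_struct import (Autoregressive, Exchangeable,
                                           GlobalOddsRatio, Independence,
                                           Nested)
from statsmodels.genmod.families import (Binomial, Gamma, Gaussian, Poisson,
                                         links)
from statsmodels.genmod.generalized_linear_model import GLM

# `load_data` (CSV fixtures) and `lrange` are test/compat helpers and are not
# part of the public API; the `Multinomial` family used in some NominalGEE
# examples is likewise internal to statsmodels.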
Example #2
    def setup_class(cls):

        endog, exog, group_n = load_data("gee_poisson_1.csv")

        family = Poisson()
        vi = Independence()
        # Test with formulas
        D = np.concatenate((endog[:, None], group_n[:, None], exog[:, 1:]),
                           axis=1)
        D = pd.DataFrame(D)
        D.columns = (["Y", "Id"] +
                     ["X%d" % (k + 1) for k in range(exog.shape[1] - 1)])

        cls.mod = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5",
                                   "Id",
                                   D,
                                   family=family,
                                   cov_struct=vi)

        cls.start_params = np.array([
            -0.03644504, -0.05432094, 0.01566427, 0.57628591, -0.0046566,
            -0.47709315
        ])
Example #3
    def test_nested_linear(self):

        family = Gaussian()

        endog, exog, group = load_data("gee_nested_linear_1.csv")

        group_n = []
        for i in range(endog.shape[0] // 10):
            group_n.extend([0] * 5)
            group_n.extend([1] * 5)
        group_n = np.array(group_n)[:, None]

        dp = Independence()
        md = GEE(endog, exog, group, None, family, dp)
        mdf1 = md.fit()

        # From statsmodels.GEE (not an independent test)
        cf = np.r_[-0.1671073, 1.00467426, -2.01723004, 0.97297106]
        se = np.r_[0.08629606, 0.04058653, 0.04067038, 0.03777989]
        assert_almost_equal(mdf1.params, cf, decimal=6)
        assert_almost_equal(mdf1.standard_errors(), se, decimal=6)

        ne = Nested()
        md = GEE(endog, exog, group, None, family, ne, dep_data=group_n)
        mdf2 = md.fit(start_params=mdf1.params)

        # From statsmodels.GEE (not an independent test)
        cf = np.r_[-0.16655319, 1.02183688, -2.00858719, 1.00101969]
        se = np.r_[0.08632616, 0.02913582, 0.03114428, 0.02893991]
        assert_almost_equal(mdf2.params, cf, decimal=6)
        assert_almost_equal(mdf2.standard_errors(), se, decimal=6)
Example #4
    def test_compare_poisson(self):

        vs = Independence()
        family = Poisson()

        Y = np.ceil(-np.log(np.random.uniform(size=100)))
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = GEE.from_formula("Y ~ X1 + X2 + X3",
                                groups,
                                D,
                                family=family,
                                cov_struct=vs)
        rslt1 = mod1.fit()

        mod2 = smf.poisson("Y ~ X1 + X2 + X3", data=D)
        rslt2 = mod2.fit(disp=False)

        assert_almost_equal(rslt1.params.values,
                            rslt2.params.values,
                            decimal=10)
Example #5
    def test_nominal(self):

        family = Multinomial(3)

        endog, exog, groups = load_data("gee_nominal_1.csv", icept=False)

        # Test with independence correlation
        va = Independence()
        mod1 = NominalGEE(endog, exog, groups, None, family, va)
        rslt1 = mod1.fit()

        # Regression test
        cf1 = np.r_[0.44944752, 0.45569985, -0.92007064, -0.46766728]
        se1 = np.r_[0.09801821, 0.07718842, 0.13229421, 0.08544553]
        assert_almost_equal(rslt1.params, cf1, decimal=5)
        assert_almost_equal(rslt1.standard_errors(), se1, decimal=5)

        # Test with global odds ratio dependence
        va = GlobalOddsRatio("nominal")
        mod2 = NominalGEE(endog, exog, groups, None, family, va)
        rslt2 = mod2.fit(start_params=rslt1.params)

        # Regression test
        cf2 = np.r_[0.45448248, 0.41945568, -0.92008924, -0.50485758]
        se2 = np.r_[0.09632274, 0.07433944, 0.13264646, 0.0911768]
        assert_almost_equal(rslt2.params, cf2, decimal=5)
        assert_almost_equal(rslt2.standard_errors(), se2, decimal=5)

        # Make sure we get the correct results type
        assert_equal(type(rslt1), NominalGEEResultsWrapper)
        assert_equal(type(rslt1._results), NominalGEEResults)
Example #6
    def test_compare_OLS(self):
        # Gaussian GEE with independence correlation should agree
        # exactly with OLS for parameter estimates and standard errors
        # derived from the naive covariance estimate.

        vs = Independence()
        family = Gaussian()

        Y = np.random.normal(size=100)
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.kron(lrange(20), np.ones(5))

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        md = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                              family=family, cov_struct=vs)
        mdf = md.fit()

        ols = smf.ols("Y ~ X1 + X2 + X3", data=D).fit()

        # don't use wrapper, asserts_xxx don't work
        ols = ols._results

        assert_almost_equal(ols.params, mdf.params, decimal=10)

        se = mdf.standard_errors(cov_type="naive")
        assert_almost_equal(ols.bse, se, decimal=10)

        naive_tvalues = mdf.params / \
            np.sqrt(np.diag(mdf.cov_naive))
        assert_almost_equal(naive_tvalues, ols.tvalues, decimal=10)
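The comment at the top of this test states the key equivalence: with a Gaussian family and an independence working correlation, GEE reproduces the OLS point estimates, and its naive (model-based) standard errors match the OLS ones. A self-contained sketch of the same check on synthetic data (the variable names and data here are illustrative only):

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.cov_struct import Independence

rng = np.random.RandomState(0)
d = pd.DataFrame({"y": rng.normal(size=100),
                  "x": rng.normal(size=100),
                  "g": np.repeat(np.arange(20), 5)})

gee = smf.gee("y ~ x", "g", d, family=sm.families.Gaussian(),
              cov_struct=Independence()).fit()
ols = smf.ols("y ~ x", d).fit()

print(np.allclose(gee.params, ols.params))                          # True
print(np.allclose(gee.standard_errors(cov_type="naive"), ols.bse))  # True
# The robust (sandwich) errors, gee.standard_errors(), generally differ.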
Example #7
    def test_nominal(self):

        endog, exog, groups = load_data("gee_nominal_1.csv",
                                        icept=False)

        # Test with independence correlation
        va = Independence()
        mod1 = NominalGEE(endog, exog, groups, cov_struct=va)
        rslt1 = mod1.fit()

        # Regression test
        cf1 = np.r_[0.450009, 0.451959, -0.918825, -0.468266]
        se1 = np.r_[0.08915936, 0.07005046, 0.12198139, 0.08281258]
        assert_allclose(rslt1.params, cf1, rtol=1e-5, atol=1e-5)
        assert_allclose(rslt1.standard_errors(), se1, rtol=1e-5, atol=1e-5)

        # Test with global odds ratio dependence
        va = GlobalOddsRatio("nominal")
        mod2 = NominalGEE(endog, exog, groups, cov_struct=va)
        rslt2 = mod2.fit(start_params=rslt1.params)

        # Regression test
        cf2 = np.r_[0.455365, 0.415334, -0.916589, -0.502116]
        se2 = np.r_[0.08803614, 0.06628179, 0.12259726, 0.08411064]
        assert_allclose(rslt2.params, cf2, rtol=1e-5, atol=1e-5)
        assert_allclose(rslt2.standard_errors(), se2, rtol=1e-5, atol=1e-5)

        # Make sure we get the correct results type
        assert_equal(type(rslt1), NominalGEEResultsWrapper)
        assert_equal(type(rslt1._results), NominalGEEResults)
Example #8
    def test_poisson_epil(self):

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        fname = os.path.join(cur_dir, "results", "epil.csv")
        data = pd.read_csv(fname)

        fam = Poisson()
        ind = Independence()
        mod1 = GEE.from_formula("y ~ age + trt + base",
                                data["subject"],
                                data,
                                cov_struct=ind,
                                family=fam)
        rslt1 = mod1.fit()

        # Coefficients should agree with GLM
        from statsmodels.genmod.generalized_linear_model import GLM
        from statsmodels.genmod import families

        mod2 = GLM.from_formula("y ~ age + trt + base",
                                data,
                                family=families.Poisson())
        rslt2 = mod2.fit(scale="X2")

        # don't use wrapper, asserts_xxx don't work
        rslt1 = rslt1._results
        rslt2 = rslt2._results

        assert_almost_equal(rslt1.params, rslt2.params, decimal=6)
        assert_almost_equal(rslt1.scale, rslt2.scale, decimal=6)
Example #9
def gendat_overdispersed():
    exs = Overdispersed_simulator()
    exs.params = np.r_[2., 0.2, 0.2, -0.1, -0.2]
    exs.ngroups = 200
    exs.scale_inv = 2.
    exs.dparams = []
    exs.simulate()
    return exs, Independence()
Example #10
def BuildPoissonModels(hist_data, feature_list, comp_data=None):
    '''Build score predictions via (linear) Poisson regression.'''
    hist_data_1 = hist_data[["team_1_score"] + feature_list]
    hist_data_2 = hist_data[["team_2_score"] + feature_list]

    formula_1 = "team_1_score ~ " + " + ".join(feature_list)
    formula_2 = "team_2_score ~ " + " + ".join(feature_list)

    # Use the GEE package with an independence working correlation to fit the
    # Poisson models. GEE solves quasi-likelihood estimating equations rather
    # than maximizing a full likelihood; with an independence structure the
    # point estimates match an ordinary Poisson GLM.
    fam = Poisson()
    ind = Independence()

    model_1 = GEE.from_formula(formula_1,
                               "team_1_score",
                               hist_data,
                               cov_struct=ind,
                               family=fam)
    model_2 = GEE.from_formula(formula_2,
                               "team_2_score",
                               hist_data,
                               cov_struct=ind,
                               family=fam)

    model_1_fit = model_1.fit()
    model_2_fit = model_2.fit()
    print(model_1_fit.summary())

    hist_data['team_1_score_pred'] = model_1_fit.predict(hist_data)
    hist_data['team_2_score_pred'] = model_2_fit.predict(hist_data)

    # return historical data if comp_data wasn't passed.
    if comp_data is None:
        return hist_data

    # prepare comp data
    comp_data['team_1_score_pred'] = model_1_fit.predict(
        comp_data[feature_list])
    comp_data['team_2_score_pred'] = model_2_fit.predict(
        comp_data[feature_list])

    pred_cols = ['team_1_score_pred', 'team_2_score_pred']
    comp_data['team_1_prob'] = comp_data[pred_cols].apply(
        lambda x: 1 - skellam.cdf(0, x['team_1_score_pred'],
                                  x['team_2_score_pred']), axis=1)
    comp_data['team_tie_prob'] = comp_data[pred_cols].apply(
        lambda x: skellam.pmf(0, x['team_1_score_pred'],
                              x['team_2_score_pred']), axis=1)
    comp_data['team_2_prob'] = comp_data[pred_cols].apply(
        lambda x: skellam.cdf(-1, x['team_1_score_pred'],
                              x['team_2_score_pred']), axis=1)

    return hist_data, comp_data
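A hypothetical call to the function above, using synthetic data whose column names follow the function's expectations (it also assumes the module imports GEE, Poisson, and Independence from statsmodels and skellam from scipy.stats); the elo_diff feature is invented purely for illustration:

import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
n = 400
hist = pd.DataFrame({"elo_diff": rng.normal(size=n)})
hist["team_1_score"] = rng.poisson(np.exp(0.3 + 0.2 * hist["elo_diff"]))
hist["team_2_score"] = rng.poisson(np.exp(0.2 - 0.2 * hist["elo_diff"]))

# Fit on historical data only; pass comp_data= to also get win/tie/loss
# probabilities for upcoming matches.
hist = BuildPoissonModels(hist, ["elo_diff"])
print(hist[["team_1_score_pred", "team_2_score_pred"]].head())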
Example #11
    def setup_class(cls):

        endog, exog, groups = load_data("gee_nominal_1.csv", icept=False)

        # Test with independence correlation
        va = Independence()
        cls.mod = NominalGEE(endog, exog, groups, cov_struct=va)
        cls.start_params = np.array(
            [0.44944752, 0.45569985, -0.92007064, -0.46766728])
Example #12
    def test_wrapper(self):

        endog, exog, groups = load_data("gee_nominal_1.csv", icept=False)
        endog = pd.Series(endog, name='yendog')
        exog = pd.DataFrame(exog)
        groups = pd.Series(groups, name='the_group')

        va = Independence()
        mod = NominalGEE(endog, exog, groups, cov_struct=va)
        rslt2 = mod.fit()

        check_wrapper(rslt2)
Example #13
    def setup_class(cls):


        endog, exog, group_n = load_data("gee_poisson_1.csv")

        family = Poisson()
        vi = Independence()

        cls.mod = GEE(endog, exog, group_n, None, family, vi)

        cls.start_params = np.array([-0.03644504, -0.05432094,  0.01566427,
                                      0.57628591, -0.0046566,  -0.47709315])
Example #14
    def test_wrapper(self):

        endog, exog, group_n = load_data("gee_poisson_1.csv", icept=False)
        endog = pd.Series(endog)
        exog = pd.DataFrame(exog)
        group_n = pd.Series(group_n)

        family = Poisson()
        vi = Independence()

        mod = GEE(endog, exog, group_n, None, family, vi)
        rslt2 = mod.fit()

        check_wrapper(rslt2)
Example #15
    def test_linear_constrained(self):

        family = Gaussian()

        exog = np.random.normal(size=(300, 4))
        exog[:, 0] = 1
        endog = np.dot(exog, np.r_[1, 1, 0, 0.2]) +\
            np.random.normal(size=300)
        group = np.kron(np.arange(100), np.r_[1, 1, 1])

        vi = Independence()
        ve = Exchangeable()

        L = np.r_[[[0, 0, 0, 1]]]
        R = np.r_[0, ]

        for j, v in enumerate((vi, ve)):
            md = GEE(endog, exog, group, None, family, v, constraint=(L, R))
            mdf = md.fit()
            assert_almost_equal(mdf.params[3], 0, decimal=10)
Example #16
    def setup_class(cls):

        vs = Independence()
        family = families.Gaussian()
        np.random.seed(987126)
        Y = np.random.normal(size=100)
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.kron(np.arange(20), np.ones(5))

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        md = GEE.from_formula("Y ~ X1 + X2 + X3",
                              groups,
                              D,
                              family=family,
                              cov_struct=vs)
        cls.result1 = md.fit()

        cls.result2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=D).fit()
Example #17
    def setup_class(cls):
        vs = Independence()
        family = families.Poisson()
        np.random.seed(987126)
        Y = np.exp(1 + np.random.normal(size=100))
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = GEE.from_formula("Y ~ X1 + X2 + X3",
                                groups,
                                D,
                                family=family,
                                cov_struct=vs)
        cls.result1 = mod1.fit()

        mod2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=D, family=family)
        cls.result2 = mod2.fit(disp=False)
Example #18
    def setup_class(cls):
        # adjusted for Gamma, not in test_gee.py
        vs = Independence()
        family = families.Gamma(link=links.log)
        np.random.seed(987126)
        #Y = np.random.normal(size=100)**2
        Y = np.exp(0.1 + np.random.normal(size=100))  # log-normal
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = GEE.from_formula("Y ~ X1 + X2 + X3",
                                groups,
                                D,
                                family=family,
                                cov_struct=vs)
        cls.result1 = mod1.fit()

        mod2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=D, family=family)
        cls.result2 = mod2.fit(disp=False)
Example #19
    def test_logistic(self):
        """
        R code for comparing results:

        library(gee)
        Z = read.csv("results/gee_logistic_1.csv", header=FALSE)
        Y = Z[,2]
        Id = Z[,1]
        X1 = Z[,3]
        X2 = Z[,4]
        X3 = Z[,5]

        mi = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
                 corstr="independence")
        smi = summary(mi)
        u = coefficients(smi)
        cfi = paste(u[,1], collapse=",")
        sei = paste(u[,4], collapse=",")

        me = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
                 corstr="exchangeable")
        sme = summary(me)
        u = coefficients(sme)
        cfe = paste(u[,1], collapse=",")
        see = paste(u[,4], collapse=",")

        ma = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
                 corstr="AR-M")
        sma = summary(ma)
        u = coefficients(sma)
        cfa = paste(u[,1], collapse=",")
        sea = paste(u[,4], collapse=",")

        sprintf("cf = [[%s],[%s],[%s]]", cfi, cfe, cfa)
        sprintf("se = [[%s],[%s],[%s]]", sei, see, sea)
        """

        endog, exog, group = load_data("gee_logistic_1.csv")

        # Time values for the autoregressive model
        T = np.zeros(len(endog))
        idx = set(group)
        for ii in idx:
            jj = np.flatnonzero(group == ii)
            T[jj] = lrange(len(jj))

        family = Binomial()
        ve = Exchangeable()
        vi = Independence()
        va = Autoregressive()

        # From R gee
        cf = [[0.0167272965285882, 1.13038654425893, -1.86896345082962,
               1.09397608331333],
              [0.0178982283915449, 1.13118798191788, -1.86133518416017,
               1.08944256230299],
              [0.0109621937947958, 1.13226505028438, -1.88278757333046,
               1.09954623769449]]
        se = [[0.127291720283049, 0.166725808326067, 0.192430061340865,
               0.173141068839597],
              [0.127045031730155, 0.165470678232842, 0.192052750030501,
               0.173174779369249],
              [0.127240302296444, 0.170554083928117, 0.191045527104503,
               0.169776150974586]]

        for j, v in enumerate((vi, ve, va)):
            md = GEE(endog, exog, group, T, family, v)
            mdf = md.fit()
            if id(v) != id(va):
                assert_almost_equal(mdf.params, cf[j], decimal=6)
                assert_almost_equal(mdf.standard_errors(), se[j], decimal=6)

        # Test with formulas
        D = np.concatenate((endog[:, None], group[:, None], exog[:, 1:]),
                           axis=1)
        D = pd.DataFrame(D)
        D.columns = (["Y", "Id"] +
                     ["X%d" % (k + 1) for k in range(exog.shape[1] - 1)])
        for j, v in enumerate((vi, ve)):
            md = GEE.from_formula("Y ~ X1 + X2 + X3",
                                  "Id",
                                  D,
                                  family=family,
                                  cov_struct=v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=6)
            assert_almost_equal(mdf.standard_errors(), se[j], decimal=6)
Example #20
def run_permutation_test(dependent, network, number_of_permutations,
                         output_path):
    nodes = pd.DataFrame.from_dict(dict(network.nodes(data=True)),
                                   orient='index')
    degree = pd.DataFrame.from_dict(dict(network.degree()), orient='index')
    centrality = pd.DataFrame.from_dict(dict(
        nx.betweenness_centrality(network)),
                                        orient='index')
    h1 = pd.concat([nodes, degree, centrality], axis=1).reset_index(0)
    h1.columns = [
        'ID', 'Age', 'Species', 'type', 'Location', 'Sex', 'degree',
        'centrality'
    ]
    h1['degree_dist'] = h1.degree / float(h1.degree.max())

    equation = dependent + "~ Age + Sex"
    from statsmodels.genmod.generalized_estimating_equations import GEE
    from statsmodels.genmod.cov_struct import (Exchangeable, Independence,
                                               Autoregressive)
    from statsmodels.genmod.families import Poisson
    fam = Poisson()
    ind = Independence()

    model = GEE.from_formula(equation,
                             "Location",
                             h1,
                             cov_struct=ind,
                             family=fam)
    main_model_result = model.fit()
    main_result = pd.DataFrame(main_model_result.params).T

    degree_random_coeff = []
    for i in range(number_of_permutations):
        rand_h1 = h1.copy()
        rand_h1[dependent] = np.random.permutation(h1[dependent])
        fam = Poisson()
        ind = Independence()
        model = GEE.from_formula(equation,
                                 "Location",
                                 rand_h1,
                                 cov_struct=ind,
                                 family=fam)
        result = model.fit()
        degree_random_coeff.append(result.params)

    d = pd.DataFrame.from_records(degree_random_coeff)
    import seaborn as sns
    f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)
    ax1.hist(d['Age[T.HY]'], bins=100)
    ax1.axvline(x=main_result['Age[T.HY]'].values[0], color='#fc9272')
    p = (d['Age[T.HY]'] > main_result['Age[T.HY]'].values[0]
         ).sum() / float(number_of_permutations)
    p = min(p, 1 - p)
    ax1.set_xlabel(
        'Coefficient Age: Hatch Year\n(ref: After Hatch Year)\np= ' +
        '{0:.2f}'.format(p))
    ax1.set_ylabel('Frequency')

    ax2.hist(d['Age[T.UNK]'], bins=100)
    ax2.axvline(x=main_result['Age[T.UNK]'].values[0], color='#fc9272')
    p = (d['Age[T.UNK]'] > main_result['Age[T.UNK]'].values[0]
         ).sum() / float(number_of_permutations)
    p = min(p, 1 - p)

    ax2.set_xlabel('Coefficient Age: Unknown\n(ref: After Hatch Year)\np= ' +
                   '{0:.2f}'.format(p))

    ax3.hist(d['Sex[T.M]'], bins=100)
    ax3.axvline(x=main_result['Sex[T.M]'].values[0], color='#fc9272')
    p = (d['Sex[T.M]'] > main_result['Sex[T.M]'].values[0]
         ).sum() / float(number_of_permutations)
    p = min(p, 1 - p)

    ax3.set_xlabel('Coefficient Sex: Male\n (ref: Female)\np= ' +
                   '{0:.2f}'.format(p))
    title = 'permutation test for ' + dependent
    f.suptitle(title)
    plt.tight_layout()
    plt.savefig(output_path + '/' + dependent + '_Permutation_test.png',
                dpi=300)
    plt.show()
Example #21
    # Loop over data generating models
    for gendat in gendats:

        pvalues = []
        params = []
        std_errors = []
        dparams = []

        for j in range(nrep):

            da, va = gendat()
            ga = Poisson()

            # Poisson seems to be more sensitive to starting values,
            # so we run the independence model first.
            md = GEE(da.endog, da.exog, da.group, da.time, ga, Independence())
            mdf = md.fit()

            md = GEE(da.endog, da.exog, da.group, da.time, ga, va)
            mdf = md.fit(start_params=mdf.params)
            if mdf is None or (not mdf.converged):
                print("Failed to converge")
                continue

            scale_inv = 1. / md.estimate_scale()
            dparams.append(np.r_[va.dparams, scale_inv])
            params.append(np.asarray(mdf.params))
            std_errors.append(np.asarray(mdf.standard_errors()))

            da, va = gendat()
            ga = Poisson()
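The comment in this snippet describes a warm-start pattern: fit with an independence working structure first, then pass those estimates as start_params when fitting the dependence structure of interest. A self-contained sketch of that pattern on synthetic Poisson data (Exchangeable is used here only as an example second-stage structure):

import numpy as np
from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod.cov_struct import Exchangeable, Independence
from statsmodels.genmod.families import Poisson

rng = np.random.RandomState(0)
n_groups, per_group = 100, 4
group = np.repeat(np.arange(n_groups), per_group)
exog = np.column_stack([np.ones(n_groups * per_group),
                        rng.normal(size=n_groups * per_group)])
endog = rng.poisson(np.exp(exog @ np.r_[0.5, 0.3]))

# Stage 1: independence fit, used only to get reasonable starting values.
start = GEE(endog, exog, group, family=Poisson(),
            cov_struct=Independence()).fit().params

# Stage 2: refit with the working dependence structure of interest.
rslt = GEE(endog, exog, group, family=Poisson(),
           cov_struct=Exchangeable()).fit(start_params=start)
print(rslt.params)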
Example #23
    def test_poisson(self):
        """
        library(gee)
        Z = read.csv("results/gee_poisson_1.csv", header=FALSE)
        Y = Z[,2]
        Id = Z[,1]
        X1 = Z[,3]
        X2 = Z[,4]
        X3 = Z[,5]
        X4 = Z[,6]
        X5 = Z[,7]

        mi = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
                corstr="independence", scale.fix=TRUE)
        smi = summary(mi)
        u = coefficients(smi)
        cfi = paste(u[,1], collapse=",")
        sei = paste(u[,4], collapse=",")

        me = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
                corstr="exchangeable", scale.fix=TRUE)
        sme = summary(me)

        u = coefficients(sme)
        cfe = paste(u[,1], collapse=",")
        see = paste(u[,4], collapse=",")

        sprintf("cf = [[%s],[%s]]", cfi, cfe)
        sprintf("se = [[%s],[%s]]", sei, see)
        """

        family = Poisson()

        endog, exog, group_n = load_data("gee_poisson_1.csv")

        vi = Independence()
        ve = Exchangeable()

        # From R gee
        cf = [[-0.0364450410793481, -0.0543209391301178, 0.0156642711741052,
               0.57628591338724, -0.00465659951186211, -0.477093153099256],
              [-0.0315615554826533, -0.0562589480840004, 0.0178419412298561,
               0.571512795340481, -0.00363255566297332, -0.475971696727736]]
        se = [[0.0611309237214186, 0.0390680524493108, 0.0334234174505518,
               0.0366860768962715, 0.0304758505008105, 0.0316348058881079],
              [0.0610840153582275, 0.0376887268649102, 0.0325168379415177,
               0.0369786751362213, 0.0296141014225009, 0.0306115470200955]]

        for j, v in enumerate((vi, ve)):
            md = GEE(endog, exog, group_n, None, family, v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=5)
            assert_almost_equal(mdf.standard_errors(), se[j], decimal=6)

        # Test with formulas
        D = np.concatenate((endog[:, None], group_n[:, None], exog[:, 1:]),
                           axis=1)
        D = pd.DataFrame(D)
        D.columns = (["Y", "Id"] +
                     ["X%d" % (k + 1) for k in range(exog.shape[1] - 1)])
        for j, v in enumerate((vi, ve)):
            md = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5",
                                  "Id",
                                  D,
                                  family=family,
                                  cov_struct=v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=5)
            assert_almost_equal(mdf.standard_errors(), se[j], decimal=6)
Example #24
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 12 11:36:51 2016

@author: emg
"""

import numpy as np
import pandas as pd
from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod.cov_struct import (Exchangeable, Independence,
                                           Autoregressive)
from statsmodels.genmod.families import Poisson

fam = Poisson()
ind = Independence()
# `authors` is assumed to be a pandas DataFrame defined elsewhere in the original script.
model1 = GEE.from_formula("author_count ~ top + mod",
                          "author",
                          authors,
                          cov_struct=ind,
                          family=fam)
result1 = model1.fit()
print(result1.summary())
Example #25
    def test_linear(self):
        """
        library(gee)

        Z = read.csv("results/gee_linear_1.csv", header=FALSE)
        Y = Z[,2]
        Id = Z[,1]
        X1 = Z[,3]
        X2 = Z[,4]
        X3 = Z[,5]
        mi = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
                 corstr="independence", tol=1e-8, maxit=100)
        smi = summary(mi)
        u = coefficients(smi)

        cfi = paste(u[,1], collapse=",")
        sei = paste(u[,4], collapse=",")

        me = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
                 corstr="exchangeable", tol=1e-8, maxit=100)
        sme = summary(me)
        u = coefficients(sme)

        cfe = paste(u[,1], collapse=",")
        see = paste(u[,4], collapse=",")

        sprintf("cf = [[%s],[%s]]", cfi, cfe)
        sprintf("se = [[%s],[%s]]", sei, see)
        """

        family = Gaussian()

        endog, exog, group = load_data("gee_linear_1.csv")

        vi = Independence()
        ve = Exchangeable()

        # From R gee
        cf = [[-0.01850226507491, 0.81436304278962, -1.56167635393184,
               0.794239361055003],
              [-0.0182920577154767, 0.814898414022467, -1.56194040106201,
               0.793499517527478]]
        se = [[0.0440733554189401, 0.0479993639119261, 0.0496045952071308,
               0.0479467597161284],
              [0.0440369906460754, 0.0480069787567662, 0.049519758758187,
               0.0479760443027526]]

        for j, v in enumerate((vi, ve)):
            md = GEE(endog, exog, group, None, family, v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=10)
            assert_almost_equal(mdf.standard_errors(), se[j], decimal=10)

        # Test with formulas
        D = np.concatenate((endog[:, None], group[:, None], exog[:, 1:]),
                           axis=1)
        D = pd.DataFrame(D)
        D.columns = (["Y", "Id"] +
                     ["X%d" % (k + 1) for k in range(exog.shape[1] - 1)])
        for j, v in enumerate((vi, ve)):
            md = GEE.from_formula("Y ~ X1 + X2 + X3",
                                  "Id",
                                  D,
                                  family=family,
                                  cov_struct=v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=10)
            assert_almost_equal(mdf.standard_errors(), se[j], decimal=10)