예제 #1
0
    def test_missing_formula(self):
        # With the formula interface, missing='drop' must give the same
        # fit as removing the incomplete rows by hand beforehand.

        nobs = 100
        resp = np.random.normal(size=nobs)
        cov1 = np.random.normal(size=nobs)
        cov2 = np.random.normal(size=nobs)
        cov3 = np.random.normal(size=nobs)
        groups = np.kron(lrange(20), np.ones(5))

        # Five incomplete rows: 0, 5, 6 (response) and 10, 11 (exog2).
        resp[0] = np.nan
        resp[5:7] = np.nan
        cov2[10:12] = np.nan

        frame = pd.DataFrame({"endog": resp, "exog1": cov1, "exog2": cov2,
                              "exog3": cov3, "groups": groups})
        formula = "endog ~ exog1 + exog2 + exog3"

        model_drop = GEE.from_formula(formula, groups, frame, missing='drop')
        fit_drop = model_drop.fit()

        assert_almost_equal(len(model_drop.endog), 95)
        assert_almost_equal(np.asarray(model_drop.exog.shape), np.r_[95, 4])

        # Manual clean-up: keep the group labels aligned with surviving rows.
        complete = frame.dropna()
        complete_groups = groups[complete.index.values]

        model_clean = GEE.from_formula(formula, complete_groups, complete,
                                       missing='none')
        fit_clean = model_clean.fit()

        assert_almost_equal(fit_drop.params.values, fit_clean.params.values)
        assert_almost_equal(fit_drop.bse.values, fit_clean.bse.values)
예제 #2
0
    def test_missing_formula(self):
        # Missing-data handling for formulas: automatic dropping of NaN rows
        # is compared against a fit on data cleaned with DataFrame.dropna.

        y = np.random.normal(size=100)
        x1 = np.random.normal(size=100)
        x2 = np.random.normal(size=100)
        x3 = np.random.normal(size=100)
        groups = np.kron(lrange(20), np.ones(5))

        # Inject NaNs into three response values and two exog2 values.
        y[0] = np.nan
        y[5:7] = np.nan
        x2[10:12] = np.nan

        columns = {"endog": y, "exog1": x1, "exog2": x2,
                   "exog3": x3, "groups": groups}
        data = pd.DataFrame(columns)

        fml = "endog ~ exog1 + exog2 + exog3"

        auto_model = GEE.from_formula(fml, groups, data, missing='drop')
        auto_fit = auto_model.fit()

        # 100 rows minus the 5 incomplete ones; intercept plus 3 covariates.
        assert_almost_equal(len(auto_model.endog), 95)
        assert_almost_equal(np.asarray(auto_model.exog.shape), np.r_[95, 4])

        data = data.dropna()
        groups = groups[data.index.values]

        manual_model = GEE.from_formula(fml, groups, data, missing='none')
        manual_fit = manual_model.fit()

        assert_almost_equal(auto_fit.params.values, manual_fit.params.values)
        assert_almost_equal(auto_fit.bse.values, manual_fit.bse.values)
예제 #3
0
    def test_formulas(self):
        """
        Formula-based fits must match the array-based fit whether
        groups and time are given as column names or as arrays.
        """

        n = 100
        Y = np.random.normal(size=n)
        X1 = np.random.normal(size=n)
        mat = np.concatenate((np.ones((n, 1)), X1[:, None]), axis=1)
        Time = np.random.uniform(size=n)
        groups = np.kron(lrange(20), np.ones(5))

        data = pd.DataFrame({"Y": Y, "X1": X1, "Time": Time, "groups": groups})

        va = Autoregressive()
        family = Gaussian()

        # Reference fit built directly from arrays.
        reference = GEE(Y, mat, groups, time=Time, family=family,
                        cov_struct=va).fit()

        # Every (groups, time) combination of array vs. column name.
        variants = [
            (groups, Time),
            (groups, "Time"),
            ("groups", Time),
            ("groups", "Time"),
        ]

        formula_fits = []
        for groups_arg, time_arg in variants:
            model = GEE.from_formula("Y ~ X1", groups_arg, data,
                                     time=time_arg, family=family,
                                     cov_struct=va)
            formula_fits.append(model.fit())

        for fit in formula_fits:
            assert_almost_equal(reference.params, fit.params, decimal=8)

        check_wrapper(formula_fits[0])
def BuildPoissonModels(hist_data, feature_list, comp_data=None):
    """Build score predictions via (log-linear) Poisson regression.

    One Poisson model per team score is fitted with GEE under an
    independence working correlation, using the columns named in
    ``feature_list`` as regressors.

    Parameters
    ----------
    hist_data : pandas.DataFrame
        Historical results containing ``team_1_score``, ``team_2_score``
        and every column in ``feature_list``.  Modified in place: the
        columns ``team_1_score_pred`` and ``team_2_score_pred`` are added.
    feature_list : list of str
        Column names used as the right-hand side of both formulas.
    comp_data : pandas.DataFrame, optional
        Fixtures to predict.  Modified in place: predicted scores and the
        columns ``team_1_prob``, ``team_tie_prob`` and ``team_2_prob``
        are added.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        ``hist_data`` alone when ``comp_data`` is None, otherwise
        ``(hist_data, comp_data)``.
    """
    rhs = " + ".join(feature_list)
    formula_1 = "team_1_score ~ " + rhs
    formula_2 = "team_2_score ~ " + rhs

    # GEE with an independence working correlation is used to fit the
    # Poisson models.
    # NOTE(review): the response column is passed as the group label; kept
    # as in the original, but confirm this grouping is intended.
    fam = Poisson()
    ind = Independence()

    model_1 = GEE.from_formula(formula_1,
                               "team_1_score",
                               hist_data,
                               cov_struct=ind,
                               family=fam)
    model_2 = GEE.from_formula(formula_2,
                               "team_2_score",
                               hist_data,
                               cov_struct=ind,
                               family=fam)

    model_1_fit = model_1.fit()
    model_2_fit = model_2.fit()
    print(model_1_fit.summary())

    hist_data['team_1_score_pred'] = model_1_fit.predict(hist_data)
    hist_data['team_2_score_pred'] = model_2_fit.predict(hist_data)

    # Return historical data only if comp_data wasn't passed.
    if comp_data is None:
        return hist_data

    # Predicted scores for the fixtures.
    comp_data['team_1_score_pred'] = model_1_fit.predict(
        comp_data[feature_list])
    comp_data['team_2_score_pred'] = model_2_fit.predict(
        comp_data[feature_list])

    # The score difference is modelled as Skellam(mu_1, mu_2).  The
    # distribution functions broadcast over arrays, so all rows are
    # evaluated in one call instead of a per-row apply.
    mu_1 = comp_data['team_1_score_pred'].values
    mu_2 = comp_data['team_2_score_pred'].values

    # P(diff > 0): team 1 wins.
    comp_data['team_1_prob'] = 1 - skellam.cdf(0, mu_1, mu_2)
    # P(diff == 0): tie.
    comp_data['team_tie_prob'] = skellam.pmf(0, mu_1, mu_2)
    # P(diff <= -1): team 2 wins.
    comp_data['team_2_prob'] = skellam.cdf(-1, mu_1, mu_2)

    return hist_data, comp_data
예제 #5
0
    def test_predict_exposure(self):
        # predict() must give the same values whether offset and exposure
        # come from the fitted model or are passed in again explicitly.

        n = 50
        X1 = np.random.normal(size=n)
        X2 = np.random.normal(size=n)
        groups = np.kron(np.arange(25), np.r_[1, 1])
        offset = np.random.uniform(1, 2, size=n)
        exposure = np.random.uniform(1, 2, size=n)
        mean = 0.1*(X1 + X2) + offset + np.log(exposure)
        Y = np.random.poisson(mean, size=n)
        data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups,
                             "offset": offset, "exposure": exposure})

        model = GEE.from_formula("Y ~ X1 + X2", groups, data,
                                 family=Poisson(),
                                 offset="offset", exposure="exposure")
        result = model.fit()
        assert_equal(result.converged, True)

        off_col = data["offset"]
        exp_col = data["exposure"]
        tail = slice(-10, None)

        full_preds = [
            result.predict(),
            result.predict(offset=off_col),
            result.predict(exposure=exp_col),
            result.predict(offset=off_col, exposure=exp_col),
        ]
        # Last ten rows, with the design matrix built from the formula.
        tail_formula = result.predict(exog=data[tail],
                                      offset=off_col[tail],
                                      exposure=exp_col[tail])
        # Last ten rows, passing the design matrix directly (no patsy).
        tail_matrix = result.predict(exog=result.model.exog[tail],
                                     offset=off_col[tail],
                                     exposure=exp_col[tail],
                                     transform=False)

        baseline = full_preds[0]
        for other in full_preds[1:]:
            assert_allclose(baseline, other)
        assert_allclose(baseline[tail], tail_formula)
        assert_allclose(baseline[tail], tail_matrix)
예제 #6
0
    def test_compare_OLS(self):
        # A Gaussian GEE under an independence working correlation is
        # expected to reproduce the OLS coefficients, and the OLS standard
        # errors when the naive covariance estimate is used.

        vs = Independence()
        family = Gaussian()

        nobs = 100
        Y = np.random.normal(size=nobs)
        X1 = np.random.normal(size=nobs)
        X2 = np.random.normal(size=nobs)
        X3 = np.random.normal(size=nobs)
        groups = np.kron(lrange(20), np.ones(5))

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
        formula = "Y ~ X1 + X2 + X3"

        gee_fit = GEE.from_formula(formula, groups, D,
                                   family=family, cov_struct=vs).fit()

        # Unwrap the OLS results: the assert helpers do not work with
        # the pandas-returning wrapper.
        ols_fit = smf.ols(formula, data=D).fit()._results

        assert_almost_equal(ols_fit.params, gee_fit.params, decimal=10)

        naive_se = gee_fit.standard_errors(cov_type="naive")
        assert_almost_equal(ols_fit.bse, naive_se, decimal=10)

        naive_sd = np.sqrt(np.diag(gee_fit.cov_naive))
        naive_tvalues = gee_fit.params / naive_sd
        assert_almost_equal(naive_tvalues, ols_fit.tvalues, decimal=10)
예제 #7
0
    def test_predict_exposure(self):
        # Predictions must not depend on whether offset/exposure are taken
        # from the model or supplied again to predict().

        nobs = 50
        X1 = np.random.normal(size=nobs)
        X2 = np.random.normal(size=nobs)
        groups = np.kron(np.arange(25), np.r_[1, 1])
        offset = np.random.uniform(1, 2, size=nobs)
        exposure = np.random.uniform(1, 2, size=nobs)
        Y = np.random.poisson(0.1*(X1 + X2) + offset + np.log(exposure),
                              size=nobs)
        frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups,
                              "offset": offset, "exposure": exposure})

        fitted = GEE.from_formula("Y ~ X1 + X2", groups, frame,
                                  family=Poisson(), offset="offset",
                                  exposure="exposure").fit()
        assert_equal(fitted.converged, True)

        default_pred = fitted.predict()
        with_offset = fitted.predict(offset=frame["offset"])
        with_exposure = fitted.predict(exposure=frame["exposure"])
        with_both = fitted.predict(offset=frame["offset"],
                                   exposure=frame["exposure"])

        # Keyword arguments shared by both last-ten-rows predictions.
        last_ten = dict(offset=frame["offset"][-10:],
                        exposure=frame["exposure"][-10:])
        via_formula = fitted.predict(exog=frame[-10:], **last_ten)
        # without patsy
        via_matrix = fitted.predict(exog=fitted.model.exog[-10:],
                                    transform=False, **last_ten)

        assert_allclose(default_pred, with_offset)
        assert_allclose(default_pred, with_exposure)
        assert_allclose(default_pred, with_both)
        assert_allclose(default_pred[-10:], via_formula)
        assert_allclose(default_pred[-10:], via_matrix)
예제 #8
0
    def test_sensitivity(self):
        # params_sensitivity(0, 0.5, 2) should return two fits whose
        # dependence parameters are held at 0.0 and 0.5.

        va = Exchangeable()
        family = Gaussian()

        # The seed and the order of the draws below determine the
        # regression values checked at the end; do not reorder.
        np.random.seed(34234)
        n = 100
        Y = np.random.normal(size=n)
        X1 = np.random.normal(size=n)
        X2 = np.random.normal(size=n)
        groups = np.kron(np.arange(50), np.r_[1, 1])

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2})

        mod = GEE.from_formula("Y ~ X1 + X2", groups, D,
                               family=family, cov_struct=va)
        rslt = mod.fit()
        ps = rslt.params_sensitivity(0, 0.5, 2)
        assert_almost_equal(len(ps), 2)
        assert_almost_equal([x.cov_struct.dep_params for x in ps],
                            [0.0, 0.5])

        # Regression test.  Convert params to an array before taking the
        # first element: integer keys on a label-indexed pandas Series are
        # deprecated as positional lookups.
        assert_almost_equal([np.asarray(x.params)[0] for x in ps],
                            [0.1696214707458818, 0.17836097387799127])
예제 #9
0
    def t_est_missing(self):
        # Rows with NaN in the response or a covariate should be dropped
        # when missing='drop' is requested.
        # NOTE(review): the "t_est_" prefix presumably keeps this out of
        # test collection -- confirm before renaming.

        nobs = 100
        Y = np.random.normal(size=nobs)
        X1 = np.random.normal(size=nobs)
        X2 = np.random.normal(size=nobs)
        X3 = np.random.normal(size=nobs)
        groups = np.kron(lrange(20), np.ones(5))

        # Five incomplete rows: 0, 5, 6 (Y) and 10, 11 (X2).
        Y[0] = np.nan
        Y[5:7] = np.nan
        X2[10:12] = np.nan

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3,
                          "groups": groups})

        md = GEE.from_formula("Y ~ X1 + X2 + X3", D, None,
                              groups=D["groups"], missing='drop')
        # The fit result is not inspected; fitting only has to succeed.
        md.fit()

        assert len(md.endog) == 95
        assert md.exog.shape == (95, 4)
예제 #10
0
    def test_sensitivity(self):
        # params_sensitivity(0, 0.5, 2) should return two fits whose
        # dependence parameters are held at 0.0 and 0.5.

        va = Exchangeable()
        family = Gaussian()

        # NOTE(review): no seed is set in this method, so the regression
        # values below presumably rely on RNG state established elsewhere
        # (e.g. in a setup hook) -- confirm.
        n = 100
        Y = np.random.normal(size=n)
        X1 = np.random.normal(size=n)
        X2 = np.random.normal(size=n)
        groups = np.kron(np.arange(50), np.r_[1, 1])

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2})

        mod = GEE.from_formula("Y ~ X1 + X2",
                               groups,
                               D,
                               family=family,
                               cov_struct=va)
        rslt = mod.fit()
        ps = rslt.params_sensitivity(0, 0.5, 2)
        assert_almost_equal(len(ps), 2)
        assert_almost_equal([x.cov_struct.dep_params for x in ps], [0.0, 0.5])

        # Regression test.  Convert params to an array before taking the
        # first element: integer keys on a label-indexed pandas Series are
        # deprecated as positional lookups.
        assert_almost_equal([np.asarray(x.params)[0] for x in ps],
                            np.r_[-0.1256575, -0.126747036])
예제 #11
0
    def setup_class(cls):
        # Build a formula-based Poisson GEE from the reference data set and
        # store it, with starting values, on the class.

        endog, exog, group_n = load_data("gee_poisson_1.csv")

        family = Poisson()
        vi = Independence()

        # Test with formulas: response, group id, then the covariates
        # (the first exog column is left out).
        n_covariates = exog.shape[1] - 1
        names = ["Y", "Id"]
        names += ["X%d" % (k + 1) for k in range(n_covariates)]
        stacked = np.concatenate(
            (endog[:, None], group_n[:, None], exog[:, 1:]), axis=1)
        D = pd.DataFrame(stacked)
        D.columns = names

        cls.mod = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id", D,
                                   family=family, cov_struct=vi)

        cls.start_params = np.array([-0.03644504, -0.05432094, 0.01566427,
                                     0.57628591, -0.0046566, -0.47709315])
예제 #12
0
    def test_compare_OLS(self):
        """
        A Gaussian GEE under an independence working correlation is
        expected to reproduce OLS coefficients, and OLS t-values when
        the naive covariance estimate is used.
        """

        vs = Independence()
        family = Gaussian()

        nobs = 100
        Y = np.random.normal(size=nobs)
        X1 = np.random.normal(size=nobs)
        X2 = np.random.normal(size=nobs)
        X3 = np.random.normal(size=nobs)
        groups = np.kron(range(20), np.ones(5))

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
        formula = "Y ~ X1 + X2 + X3"

        gee_fit = GEE.from_formula(formula, D, None, groups=groups,
                                   family=family, covstruct=vs).fit()

        ols = sm.ols(formula, data=D).fit()

        assert_almost_equal(ols.params.values, gee_fit.params, decimal=10)

        naive_sd = np.sqrt(np.diag(gee_fit.naive_covariance))
        naive_tvalues = gee_fit.params / naive_sd
        assert_almost_equal(naive_tvalues, ols.tvalues, decimal=10)
예제 #13
0
    def test_compare_OLS(self):
        # Parameter estimates, naive standard errors and naive t-values of
        # a Gaussian/independence GEE are compared against OLS.

        vs = Independence()
        family = Gaussian()

        Y = np.random.normal(size=100)
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.kron(lrange(20), np.ones(5))

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        gee_model = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                                     family=family, cov_struct=vs)
        gee_res = gee_model.fit()

        ols_wrapped = smf.ols("Y ~ X1 + X2 + X3", data=D).fit()
        # Work with the raw results object; the assert helpers do not
        # accept the wrapper.
        ols_res = ols_wrapped._results

        assert_almost_equal(ols_res.params, gee_res.params, decimal=10)

        assert_almost_equal(ols_res.bse,
                            gee_res.standard_errors(cov_type="naive"),
                            decimal=10)

        naive_variances = np.diag(gee_res.cov_naive)
        naive_tvalues = gee_res.params / np.sqrt(naive_variances)
        assert_almost_equal(naive_tvalues, ols_res.tvalues, decimal=10)
예제 #14
0
    def test_poisson_epil(self):
        # Poisson GEE with independence structure on the epil data set,
        # checked against a Poisson GLM.

        here = os.path.dirname(os.path.abspath(__file__))
        data = pd.read_csv(os.path.join(here, "results", "epil.csv"))

        formula = "y ~ age + trt + base"

        gee_model = GEE.from_formula(formula, data["subject"], data,
                                     cov_struct=Independence(),
                                     family=Poisson())
        gee_wrapped = gee_model.fit()

        # Coefficients should agree with GLM
        from statsmodels.genmod.generalized_linear_model import GLM
        from statsmodels.genmod import families

        glm_model = GLM.from_formula(formula, data,
                                     family=families.Poisson())
        glm_wrapped = glm_model.fit(scale="X2")

        # Compare raw results; the assert helpers do not work with the
        # wrappers.
        gee_res = gee_wrapped._results
        glm_res = glm_wrapped._results

        assert_almost_equal(gee_res.params, glm_res.params, decimal=6)
        assert_almost_equal(gee_res.scale, glm_res.scale, decimal=6)
예제 #15
0
    def test_poisson_epil(self):
        # Poisson GEE with independence structure on the epil data set,
        # checked against a Poisson GLM.

        test_dir = os.path.dirname(os.path.abspath(__file__))
        csv_path = os.path.join(test_dir, "results", "epil.csv")
        data = pd.read_csv(csv_path)

        formula = "y ~ age + trt + base"

        gee_fit = GEE.from_formula(formula, data, groups=data["subject"],
                                   cov_struct=Independence(),
                                   family=Poisson()).fit()

        # Coefficients should agree with GLM
        from statsmodels.genmod.generalized_linear_model import GLM
        from statsmodels.genmod import families

        glm_fit = GLM.from_formula(
            formula, data, family=families.Poisson()).fit(scale="X2")

        assert_almost_equal(gee_fit.params, glm_fit.params, decimal=6)
        assert_almost_equal(gee_fit.scale, glm_fit.scale, decimal=6)
예제 #16
0
    def test_compare_OLS(self):
        """
        With a Gaussian family and independence working correlation,
        GEE coefficients and naive t-values are expected to match OLS.
        """

        vs = Independence()
        family = Gaussian()

        Y = np.random.normal(size=100)
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.kron(range(20), np.ones(5))

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        gee_kwargs = dict(groups=groups, family=family, covstruct=vs)
        md = GEE.from_formula("Y ~ X1 + X2 + X3", D, None, **gee_kwargs)
        gee_res = md.fit()

        ols_res = sm.ols("Y ~ X1 + X2 + X3", data=D).fit()

        assert_almost_equal(ols_res.params.values, gee_res.params,
                            decimal=10)

        naive_variances = np.diag(gee_res.naive_covariance)
        naive_tvalues = gee_res.params / np.sqrt(naive_variances)
        assert_almost_equal(naive_tvalues, ols_res.tvalues, decimal=10)
예제 #17
0
    def test_predict(self):
        # Check predict() with and without explicit exog/offset, and on
        # new exog data, for a Gaussian GEE fitted with an offset.

        n = 50
        X1 = np.random.normal(size=n)
        X2 = np.random.normal(size=n)
        groups = np.kron(np.arange(25), np.r_[1, 1])
        offset = np.random.uniform(1, 2, size=n)
        Y = np.random.normal(0.1*(X1 + X2) + offset, size=n)
        data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups,
                             "offset": offset})

        fml = "Y ~ X1 + X2"
        model = GEE.from_formula(fml, groups, data, family=Gaussian(),
                                 offset="offset")
        result = model.fit()
        assert_equal(result.converged, True)

        pred1 = result.predict()
        pred2 = result.predict(offset=data.offset)
        pred3 = result.predict(exog=data[["X1", "X2"]], offset=data.offset)
        pred4 = result.predict(exog=data[["X1", "X2"]], offset=0*data.offset)
        pred5 = result.predict(offset=0*data.offset)

        assert_allclose(pred1, pred2)
        assert_allclose(pred1, pred3)
        # A zero offset must differ from the default by exactly the offset.
        assert_allclose(pred1, pred4 + data.offset)
        assert_allclose(pred1, pred5 + data.offset)

        x1_new = np.random.normal(size=10)
        x2_new = np.random.normal(size=10)
        new_exog = pd.DataFrame({"X1": x1_new, "X2": x2_new})
        pred6 = result.predict(exog=new_exog)
        # Convert to an array before indexing by position: integer keys on
        # a label-indexed pandas Series are deprecated as positional
        # lookups.
        params = np.asarray(result.params)
        pred6_correct = params[0] + params[1]*x1_new + params[2]*x2_new
        assert_allclose(pred6, pred6_correct)
예제 #18
0
    def test_compare_poisson(self):
        # Poisson GEE with independence structure should give the same
        # coefficients as the formula-based Poisson model.

        vs = Independence()
        family = Poisson()

        nobs = 100
        Y = np.ceil(-np.log(np.random.uniform(size=nobs)))
        X1 = np.random.normal(size=nobs)
        X2 = np.random.normal(size=nobs)
        X3 = np.random.normal(size=nobs)
        groups = np.random.randint(0, 4, size=nobs)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
        formula = "Y ~ X1 + X2 + X3"

        gee_fit = GEE.from_formula(formula, groups, D,
                                   family=family, cov_struct=vs).fit()

        poisson_fit = sm.poisson(formula, data=D).fit(disp=False)

        assert_almost_equal(gee_fit.params.values,
                            poisson_fit.params.values,
                            decimal=10)
예제 #19
0
    def test_poisson_epil(self):
        # Fit the epil data with a Poisson/independence GEE and check the
        # coefficients and scale against a Poisson GLM.

        base_dir = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(base_dir, "results", "epil.csv")
        epil = pd.read_csv(path)

        fam = Poisson()
        ind = Independence()
        gee_model = GEE.from_formula("y ~ age + trt + base",
                                     epil["subject"], epil,
                                     cov_struct=ind, family=fam)
        gee_wrapped = gee_model.fit()

        # Coefficients should agree with GLM
        from statsmodels.genmod.generalized_linear_model import GLM
        from statsmodels.genmod import families

        glm_model = GLM.from_formula("y ~ age + trt + base", epil,
                                     family=families.Poisson())
        glm_wrapped = glm_model.fit(scale="X2")

        # The assert helpers do not work with the wrappers, so compare
        # the underlying results objects.
        raw_gee, raw_glm = gee_wrapped._results, glm_wrapped._results

        assert_almost_equal(raw_gee.params, raw_glm.params, decimal=6)
        assert_almost_equal(raw_gee.scale, raw_glm.scale, decimal=6)
예제 #20
0
    def test_formulas(self):
        """
        Compare an array-based fit with formula-based fits in which
        groups and time are passed as column names or as arrays.
        """

        n = 100
        Y = np.random.normal(size=n)
        X1 = np.random.normal(size=n)
        mat = np.concatenate((np.ones((n, 1)), X1[:, None]), axis=1)
        Time = np.random.uniform(size=n)
        groups = np.kron(lrange(20), np.ones(5))

        data = pd.DataFrame({"Y": Y, "X1": X1, "Time": Time, "groups": groups})

        va = Autoregressive()
        family = Gaussian()
        # Options shared by every model below.
        shared = dict(family=family, cov_struct=va)

        array_fit = GEE(Y, mat, groups, time=Time, **shared).fit()

        # groups as array, time as array
        fit_aa = GEE.from_formula("Y ~ X1", groups, data,
                                  time=Time, **shared).fit()
        # groups as array, time as column name
        fit_an = GEE.from_formula("Y ~ X1", groups, data,
                                  time="Time", **shared).fit()
        # groups as column name, time as array
        fit_na = GEE.from_formula("Y ~ X1", "groups", data,
                                  time=Time, **shared).fit()
        # groups as column name, time as column name
        fit_nn = GEE.from_formula("Y ~ X1", "groups", data,
                                  time="Time", **shared).fit()

        for formula_fit in (fit_aa, fit_an, fit_na, fit_nn):
            assert_almost_equal(array_fit.params, formula_fit.params,
                                decimal=8)

        check_wrapper(fit_aa)
예제 #21
0
    def setup_class(cls):
        # Fit the same Poisson model with GEE (independence) and with GLM
        # on seeded random data; both results are stored on the class.
        vs = Independence()
        family = families.Poisson()

        # Seed and draw order fix the data set; do not reorder.
        np.random.seed(987126)
        nobs = 100
        Y = np.exp(1 + np.random.normal(size=nobs))
        X1 = np.random.normal(size=nobs)
        X2 = np.random.normal(size=nobs)
        X3 = np.random.normal(size=nobs)
        groups = np.random.randint(0, 4, size=nobs)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
        formula = "Y ~ X1 + X2 + X3"

        gee_model = GEE.from_formula(formula, groups, D,
                                     family=family, cov_struct=vs)
        cls.result1 = gee_model.fit()

        glm_model = GLM.from_formula(formula, data=D, family=family)
        cls.result2 = glm_model.fit(disp=False)
예제 #22
0
    def test_compare_poisson(self):
        # Poisson GEE with independence structure should give the same
        # coefficients as the formula-based Poisson model.

        vs = Independence()
        family = Poisson()

        nobs = 100
        Y = np.ceil(-np.log(np.random.uniform(size=nobs)))
        X1 = np.random.normal(size=nobs)
        X2 = np.random.normal(size=nobs)
        X3 = np.random.normal(size=nobs)
        groups = np.random.randint(0, 4, size=nobs)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
        formula = "Y ~ X1 + X2 + X3"

        gee_model = GEE.from_formula(formula, D, None, groups=groups,
                                     family=family, covstruct=vs)
        gee_fit = gee_model.fit()

        poisson_fit = sm.poisson(formula, data=D).fit()

        assert_almost_equal(poisson_fit.params.values, gee_fit.params,
                            decimal=10)
예제 #23
0
    def setup_class(cls):
        # Fit the same Gaussian model with GEE (independence) and with GLM
        # on seeded random data; both results are stored on the class.

        vs = Independence()
        family = families.Gaussian()

        # Seed and draw order fix the data set; do not reorder.
        np.random.seed(987126)
        nobs = 100
        Y = np.random.normal(size=nobs)
        X1 = np.random.normal(size=nobs)
        X2 = np.random.normal(size=nobs)
        X3 = np.random.normal(size=nobs)
        # 20 groups of 5 consecutive observations each.
        groups = np.kron(np.arange(20), np.ones(5))

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
        formula = "Y ~ X1 + X2 + X3"

        gee_model = GEE.from_formula(formula, groups, D,
                                     family=family, cov_struct=vs)
        cls.result1 = gee_model.fit()

        glm_model = GLM.from_formula(formula, data=D)
        cls.result2 = glm_model.fit()
예제 #24
0
    def setup_class(cls):
        # Create the formula-based Poisson GEE model and its starting
        # values from the reference data set.

        endog, exog, group_n = load_data("gee_poisson_1.csv")

        family = Poisson()
        vi = Independence()

        # Test with formulas.  Column layout: response, group id, then all
        # exog columns except the first.
        pieces = (endog[:, None], group_n[:, None], exog[:, 1:])
        D = pd.DataFrame(np.concatenate(pieces, axis=1))

        covariate_names = []
        for k in range(exog.shape[1] - 1):
            covariate_names.append("X%d" % (k + 1))
        D.columns = ["Y", "Id"] + covariate_names

        cls.mod = GEE.from_formula(
            "Y ~ X1 + X2 + X3 + X4 + X5",
            "Id",
            D,
            family=family,
            cov_struct=vi,
        )

        cls.start_params = np.array([
            -0.03644504, -0.05432094, 0.01566427,
            0.57628591, -0.0046566, -0.47709315,
        ])
예제 #25
0
    def test_compare_logit(self):
        # Binomial GEE with independence structure should give the same
        # coefficients as the formula-based logit model.

        vs = Independence()
        family = Binomial()

        nobs = 100
        Y = 1*(np.random.normal(size=nobs) < 0)
        X1 = np.random.normal(size=nobs)
        X2 = np.random.normal(size=nobs)
        X3 = np.random.normal(size=nobs)
        groups = np.random.randint(0, 4, size=nobs)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
        formula = "Y ~ X1 + X2 + X3"

        gee_model = GEE.from_formula(formula, D, None, groups=groups,
                                     family=family, cov_struct=vs)
        gee_fit = gee_model.fit()

        logit_fit = sm.logit(formula, data=D).fit(disp=False)

        assert_almost_equal(logit_fit.params.values, gee_fit.params,
                            decimal=10)
예제 #26
0
    def test_predict(self):
        # Check predict() with and without explicit exog/offset, and on
        # new exog data, for a Gaussian GEE fitted with an offset.

        n = 50
        X1 = np.random.normal(size=n)
        X2 = np.random.normal(size=n)
        groups = np.kron(np.arange(25), np.r_[1, 1])
        offset = np.random.uniform(1, 2, size=n)
        Y = np.random.normal(0.1 * (X1 + X2) + offset, size=n)
        data = pd.DataFrame({
            "Y": Y,
            "X1": X1,
            "X2": X2,
            "groups": groups,
            "offset": offset
        })

        fml = "Y ~ X1 + X2"
        model = GEE.from_formula(fml,
                                 groups,
                                 data,
                                 family=Gaussian(),
                                 offset="offset")
        result = model.fit()
        assert_equal(result.converged, True)

        pred1 = result.predict()
        pred2 = result.predict(offset=data.offset)
        pred3 = result.predict(exog=data[["X1", "X2"]], offset=data.offset)
        pred4 = result.predict(exog=data[["X1", "X2"]], offset=0 * data.offset)
        pred5 = result.predict(offset=0 * data.offset)

        assert_allclose(pred1, pred2)
        assert_allclose(pred1, pred3)
        # A zero offset must differ from the default by exactly the offset.
        assert_allclose(pred1, pred4 + data.offset)
        assert_allclose(pred1, pred5 + data.offset)

        x1_new = np.random.normal(size=10)
        x2_new = np.random.normal(size=10)
        new_exog = pd.DataFrame({"X1": x1_new, "X2": x2_new})
        pred6 = result.predict(exog=new_exog)
        # Convert to an array before indexing by position: integer keys on
        # a label-indexed pandas Series are deprecated as positional
        # lookups.
        params = np.asarray(result.params)
        pred6_correct = params[0] + params[1] * x1_new + params[2] * x2_new
        assert_allclose(pred6, pred6_correct)
예제 #27
0
    def test_offset_formula(self):
        """
        Offset and exposure given to `from_formula` as column names or
        as arrays must lead to the same fitted parameters.
        """

        n = 50
        X1 = np.random.normal(size=n)
        X2 = np.random.normal(size=n)
        groups = np.kron(np.arange(25), np.r_[1, 1])
        offset = np.random.uniform(1, 2, size=n)
        # exposure is exp(offset), so either one describes the same shift.
        exposure = np.exp(offset)
        Y = np.random.poisson(0.1*(X1 + X2) + 2*offset, size=n)
        data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups,
                             "offset": offset, "exposure": exposure})

        fml = "Y ~ X1 + X2"

        # Baseline: offset given as a column name.
        baseline = GEE.from_formula(fml, groups, data, family=Poisson(),
                                    offset="offset").fit()
        assert_equal(baseline.converged, True)

        # Equivalent specifications, each started from the baseline fit.
        equivalent_specs = (
            {"offset": offset},
            {"exposure": exposure},
            {"exposure": "exposure"},
        )
        for spec in equivalent_specs:
            model = GEE.from_formula(fml, groups, data, family=Poisson(),
                                     **spec)
            alt = model.fit(start_params=baseline.params)
            assert_allclose(baseline.params, alt.params)
            assert_equal(alt.converged, True)

        # Offset and exposure together should equal a doubled offset.
        combined = GEE.from_formula(fml, groups, data, family=Poisson(),
                                    exposure="exposure",
                                    offset="offset").fit()
        assert_equal(combined.converged, True)

        doubled_model = GEE.from_formula(fml, groups, data, family=Poisson(),
                                         offset=2*offset)
        doubled = doubled_model.fit(start_params=combined.params)
        assert_allclose(combined.params, doubled.params)
        assert_equal(doubled.converged, True)
예제 #28
0
    def setup_class(cls):
        """Fit one log-link Gamma model with GEE and with GLM.

        The GEE fit (independence working covariance) is stored on the
        class as ``result1`` and the GLM fit as ``result2``.
        """
        # adjusted for Gamma, not in test_gee.py
        formula = "Y ~ X1 + X2 + X3"
        gamma_family = families.Gamma(link=links.log)

        # Draw order after seeding is Y, X1, X2, X3, then the group labels.
        np.random.seed(987126)
        frame = pd.DataFrame({
            "Y": np.exp(0.1 + np.random.normal(size=100)),  # log-normal
            "X1": np.random.normal(size=100),
            "X2": np.random.normal(size=100),
            "X3": np.random.normal(size=100),
        })
        cluster_ids = np.random.randint(0, 4, size=100)

        gee_model = GEE.from_formula(formula, cluster_ids, frame,
                                     family=gamma_family,
                                     cov_struct=Independence())
        cls.result1 = gee_model.fit()

        glm_model = GLM.from_formula(formula, data=frame, family=gamma_family)
        cls.result2 = glm_model.fit(disp=False)
예제 #29
0
    def setup_class(cls):
        """Fit one Gaussian model with GEE and with GLM.

        The GEE fit (independence working covariance, 20 groups of 5
        consecutive rows) is stored as ``result1``; the GLM fit with its
        default family is stored as ``result2``.
        """
        formula = "Y ~ X1 + X2 + X3"

        # Draw order after seeding is Y, X1, X2, X3.
        np.random.seed(987126)
        frame = pd.DataFrame({
            "Y": np.random.normal(size=100),
            "X1": np.random.normal(size=100),
            "X2": np.random.normal(size=100),
            "X3": np.random.normal(size=100),
        })
        cluster_ids = np.kron(np.arange(20), np.ones(5))

        gee_model = GEE.from_formula(formula, cluster_ids, frame,
                                     family=families.Gaussian(),
                                     cov_struct=Independence())
        cls.result1 = gee_model.fit()

        cls.result2 = GLM.from_formula(formula, data=frame).fit()
예제 #30
0
    def test_compare_logit(self):
        """Check that an independence binomial GEE matches ``sm.logit``.

        Both models are fit to the same simulated binary response with
        the same formula; the coefficient vectors must agree to ten
        decimal places.
        """
        vs = Independence()
        family = Binomial()

        Y = 1*(np.random.normal(size=100) < 0)
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                                family=family, cov_struct=vs)
        rslt1 = mod1.fit()

        mod2 = sm.logit("Y ~ X1 + X2 + X3", data=D)
        # disp=False keeps the optimizer's convergence report out of the
        # test output.
        rslt2 = mod2.fit(disp=False)

        assert_almost_equal(rslt1.params, rslt2.params, decimal=10)
예제 #31
0
    def test_compare_poisson(self):
        """Check that an independence Poisson GEE matches ``sm.poisson``.

        Both models are fit to the same simulated response with the same
        formula; the coefficients must agree to ten decimal places.
        """
        formula = "Y ~ X1 + X2 + X3"

        # Draw order is Y, X1, X2, X3, then the group labels.
        frame = pd.DataFrame({
            "Y": np.ceil(-np.log(np.random.uniform(size=100))),
            "X1": np.random.normal(size=100),
            "X2": np.random.normal(size=100),
            "X3": np.random.normal(size=100),
        })
        cluster_ids = np.random.randint(0, 4, size=100)

        gee_fit = GEE.from_formula(formula, cluster_ids, frame,
                                   family=Poisson(),
                                   cov_struct=Independence()).fit()
        poisson_fit = sm.poisson(formula, data=frame).fit(disp=False)

        assert_almost_equal(gee_fit.params, poisson_fit.params, decimal=10)
예제 #32
0
    def setup_class(cls):
        """Fit one Poisson-family model with GEE and with GLM.

        The GEE fit (independence working covariance) is stored on the
        class as ``result1`` and the GLM fit as ``result2``.
        """
        formula = "Y ~ X1 + X2 + X3"
        poisson_family = families.Poisson()

        # Draw order after seeding is Y, X1, X2, X3, then the group labels.
        np.random.seed(987126)
        frame = pd.DataFrame({
            "Y": np.exp(1 + np.random.normal(size=100)),
            "X1": np.random.normal(size=100),
            "X2": np.random.normal(size=100),
            "X3": np.random.normal(size=100),
        })
        cluster_ids = np.random.randint(0, 4, size=100)

        gee_model = GEE.from_formula(formula, cluster_ids, frame,
                                     family=poisson_family,
                                     cov_struct=Independence())
        cls.result1 = gee_model.fit()

        glm_model = GLM.from_formula(formula, data=frame,
                                     family=poisson_family)
        cls.result2 = glm_model.fit(disp=False)
예제 #33
0
    def t_est_missing(self):
        """Fit a GEE with ``missing='drop'`` on data that contain NaNs.

        Three response values and two X2 values (five distinct rows) are
        set to NaN, so the model is expected to keep 95 of the 100 rows.
        """
        n_obs = 100
        # Draw order is Y, X1, X2, X3.
        Y, X1, X2, X3 = (np.random.normal(size=n_obs) for _ in range(4))
        groups = np.kron(lrange(20), np.ones(5))

        # Punch holes in the response and in one covariate.
        Y[[0, 5, 6]] = np.nan
        X2[[10, 11]] = np.nan

        frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3,
                              "groups": groups})

        model = GEE.from_formula("Y ~ X1 + X2 + X3", frame, None,
                                 groups=frame["groups"], missing='drop')
        # The fit result itself is not inspected; the call only has to run.
        model.fit()

        assert len(model.endog) == 95
        assert model.exog.shape == (95, 4)
예제 #34
0
    def test_offset_formula(self):
        """Test various ways of passing offset and exposure to `from_formula`.

        An offset given as a column name, an offset given as an array, an
        exposure given as an array and an exposure given as a column name
        must all produce the same coefficients.  Supplying both exposure
        and offset by name must match an explicit offset of ``2*offset``.
        """
        n = 50
        X1 = np.random.normal(size=n)
        X2 = np.random.normal(size=n)
        groups = np.kron(np.arange(25), np.r_[1, 1])
        offset = np.random.uniform(1, 2, size=n)
        exposure = np.exp(offset)
        Y = np.random.poisson(0.1*(X1 + X2) + 2*offset, size=n)
        data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups,
                             "offset": offset, "exposure": exposure})

        fml = "Y ~ X1 + X2"

        # Reference fit: offset looked up by column name.
        reference = GEE.from_formula(fml, groups, data, family=Poisson(),
                                     offset="offset").fit()
        assert_equal(reference.converged, True)

        # Equivalent specifications, each warm-started from the reference.
        equivalent_specs = (
            {"offset": offset},
            {"exposure": exposure},
            {"exposure": "exposure"},
        )
        for spec in equivalent_specs:
            model = GEE.from_formula(fml, groups, data, family=Poisson(),
                                     **spec)
            fitted = model.fit(start_params=reference.params)
            assert_allclose(reference.params, fitted.params)
            assert_equal(fitted.converged, True)

        # Exposure and offset together, both by column name.
        combined = GEE.from_formula(fml, groups, data, family=Poisson(),
                                    exposure="exposure",
                                    offset="offset").fit()
        assert_equal(combined.converged, True)

        doubled = GEE.from_formula(fml, groups, data, family=Poisson(),
                                   offset=2*offset)
        doubled_fit = doubled.fit(start_params=combined.params)
        assert_allclose(combined.params, doubled_fit.params)
        assert_equal(doubled_fit.converged, True)
예제 #35
0
    def test_compare_poisson(self):
        """Check that an independence Poisson GEE matches ``sm.poisson``.

        Both models are fit to the same simulated response with the same
        formula; the coefficients must agree to ten decimal places.
        """
        formula = "Y ~ X1 + X2 + X3"

        # Draw order is Y, X1, X2, X3, then the group labels.
        frame = pd.DataFrame({
            "Y": np.ceil(-np.log(np.random.uniform(size=100))),
            "X1": np.random.normal(size=100),
            "X2": np.random.normal(size=100),
            "X3": np.random.normal(size=100),
        })
        cluster_ids = np.random.randint(0, 4, size=100)

        gee_model = GEE.from_formula(formula, frame, None,
                                     groups=cluster_ids,
                                     family=Poisson(),
                                     covstruct=Independence())
        gee_fit = gee_model.fit()

        poisson_fit = sm.poisson(formula, data=frame).fit()

        assert_almost_equal(poisson_fit.params.values, gee_fit.params,
                            decimal=10)
예제 #36
0
    def setup_class(cls):
        """Fit one log-link Gamma model with GEE and with GLM.

        ``result1`` holds the GEE fit (independence working covariance)
        and ``result2`` holds the GLM fit; both are stored on the class.
        """
        # adjusted for Gamma, not in test_gee.py
        formula = "Y ~ X1 + X2 + X3"
        gamma_family = families.Gamma(link=links.log)

        np.random.seed(987126)
        # Draw order after seeding is Y, X1, X2, X3, then the group labels.
        response = np.exp(0.1 + np.random.normal(size=100))  # log-normal
        covariates = [np.random.normal(size=100) for _ in range(3)]
        cluster_ids = np.random.randint(0, 4, size=100)

        frame = pd.DataFrame({"Y": response,
                              "X1": covariates[0],
                              "X2": covariates[1],
                              "X3": covariates[2]})

        cls.result1 = GEE.from_formula(formula, cluster_ids, frame,
                                       family=gamma_family,
                                       cov_struct=Independence()).fit()
        cls.result2 = GLM.from_formula(formula, data=frame,
                                       family=gamma_family).fit(disp=False)
예제 #37
0
    def test_poisson_epil(self):
        """Compare an independence Poisson GEE with a Poisson GLM.

        The data are read from ``results/epil.csv`` next to this file.
        Coefficients and scale of the two fits must agree to six decimal
        places.
        """
        here = os.path.dirname(os.path.abspath(__file__))
        data = pd.read_csv(os.path.join(here, "results", "epil.csv"))
        formula = "y ~ age + trt + base"

        gee_model = GEE.from_formula(formula, data,
                                     groups=data["subject"],
                                     cov_struct=Independence(),
                                     family=Poisson())
        gee_fit = gee_model.fit()

        # Coefficients should agree with GLM
        from statsmodels.genmod.generalized_linear_model import GLM
        from statsmodels.genmod import families

        glm_model = GLM.from_formula(formula, data,
                                     family=families.Poisson())
        glm_fit = glm_model.fit(scale="X2")

        assert_almost_equal(gee_fit.params, glm_fit.params, decimal=6)
        assert_almost_equal(gee_fit.scale, glm_fit.scale, decimal=6)
예제 #38
0
    def test_compare_logit(self):
        """Check that an independence binomial GEE matches ``sm.logit``.

        Both models are fit to the same simulated binary response with
        the same formula; the coefficients must agree to ten decimal
        places.
        """
        formula = "Y ~ X1 + X2 + X3"

        # Draw order is Y, X1, X2, X3, then the group labels.
        frame = pd.DataFrame({
            "Y": 1 * (np.random.normal(size=100) < 0),
            "X1": np.random.normal(size=100),
            "X2": np.random.normal(size=100),
            "X3": np.random.normal(size=100),
        })
        cluster_ids = np.random.randint(0, 4, size=100)

        gee_model = GEE.from_formula(formula, frame, None,
                                     groups=cluster_ids,
                                     family=Binomial(),
                                     cov_struct=Independence())
        gee_fit = gee_model.fit()

        logit_fit = sm.logit(formula, data=frame).fit(disp=False)

        assert_almost_equal(logit_fit.params.values, gee_fit.params,
                            decimal=10)
예제 #39
0
    def test_compare_logit(self):
        """Check that an independence binomial GEE matches ``sm.logit``.

        Both models are fit to the same simulated binary response with
        the same formula; the coefficient vectors must agree to ten
        decimal places.
        """
        vs = Independence()
        family = Binomial()

        Y = 1 * (np.random.normal(size=100) < 0)
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = GEE.from_formula("Y ~ X1 + X2 + X3",
                                groups,
                                D,
                                family=family,
                                cov_struct=vs)
        rslt1 = mod1.fit()

        mod2 = sm.logit("Y ~ X1 + X2 + X3", data=D)
        # disp=False keeps the optimizer's convergence report out of the
        # test output.
        rslt2 = mod2.fit(disp=False)

        assert_almost_equal(rslt1.params, rslt2.params, decimal=10)
예제 #40
0
def _fit_location_gee(equation, frame):
    """Fit a Poisson GEE of `equation` on `frame`, grouped by "Location"."""
    from statsmodels.genmod.generalized_estimating_equations import GEE
    from statsmodels.genmod.cov_struct import Independence
    from statsmodels.genmod.families import Poisson

    # Fresh family / covariance objects for every fit, as in the
    # original per-iteration construction.
    model = GEE.from_formula(equation,
                             "Location",
                             frame,
                             cov_struct=Independence(),
                             family=Poisson())
    return model.fit()


def _plot_permutation_panel(ax, permuted, observed, term, label,
                            number_of_permutations):
    """Draw the permuted coefficients for `term` and mark the observed one.

    The x-axis label is `label` followed by the p-value formatted to two
    decimal places.
    """
    observed_value = observed[term].values[0]
    ax.hist(permuted[term], bins=100)
    ax.axvline(x=observed_value, color='#fc9272')

    # Share of permuted coefficients above the observed one, folded so
    # that the smaller of the two tails is reported.
    p = (permuted[term] > observed_value).sum() / float(
        number_of_permutations)
    p = min(p, 1 - p)
    ax.set_xlabel(label + '{0:.2f}'.format(p))


def run_permutation_test(dependent, network, number_of_permutations,
                         output_path):
    """Run a permutation test on the Age and Sex coefficients of a GEE.

    A Poisson GEE ``dependent ~ Age + Sex`` grouped by "Location" is fit
    to a node table built from `network`.  The dependent column is then
    shuffled `number_of_permutations` times and the model refit each
    time.  The permuted coefficients are shown as three histograms with
    the observed coefficient marked; the figure is saved and displayed.

    Parameters
    ----------
    dependent : str
        Name of the node-table column used as response, e.g. "degree",
        "centrality" or "degree_dist".
    network : graph
        Object providing ``nodes(data=True)`` and ``degree()`` and
        accepted by ``nx.betweenness_centrality``.
        NOTE(review): the column renaming below assumes every node
        carries exactly five attributes, ordered Age, Species, type,
        Location, Sex; confirm against the caller.
    number_of_permutations : int
        Number of shuffled refits.
    output_path : str
        Directory prefix; the figure is written to
        ``output_path + '/' + dependent + '_Permutation_test.png'``.

    Returns
    -------
    None
    """
    # One row per node: attributes, degree and betweenness centrality.
    nodes = pd.DataFrame.from_dict(dict(network.nodes(data=True)),
                                   orient='index')
    degree = pd.DataFrame.from_dict(dict(network.degree()), orient='index')
    centrality = pd.DataFrame.from_dict(
        dict(nx.betweenness_centrality(network)), orient='index')
    h1 = pd.concat([nodes, degree, centrality], axis=1).reset_index(0)
    h1.columns = [
        'ID', 'Age', 'Species', 'type', 'Location', 'Sex', 'degree',
        'centrality'
    ]
    h1['degree_dist'] = h1.degree / float(h1.degree.max())

    equation = dependent + "~ Age + Sex"

    # Observed coefficients, as a one-row frame keyed by term name.
    main_result = pd.DataFrame(_fit_location_gee(equation, h1).params).T

    # Null distribution: refit after shuffling the response.
    degree_random_coeff = []
    for _ in range(number_of_permutations):
        rand_h1 = h1.copy()
        rand_h1[dependent] = np.random.permutation(h1[dependent])
        degree_random_coeff.append(
            _fit_location_gee(equation, rand_h1).params)
    d = pd.DataFrame.from_records(degree_random_coeff)

    # NOTE(review): `sns` is never used in this function.  The import is
    # kept because importing seaborn may restyle matplotlib in some
    # releases -- confirm before removing.
    import seaborn as sns  # noqa: F401

    f, axes = plt.subplots(1, 3, sharey=True)
    panels = (
        ('Age[T.HY]',
         'Coefficient Age: Hatch Year\n(ref: After Hatch Year)\np= '),
        ('Age[T.UNK]',
         'Coefficient Age: Unknown\n(ref: After Hatch Year)\np= '),
        ('Sex[T.M]',
         'Coefficient Sex: Male\n (ref: Female)\np= '),
    )
    for ax, (term, label) in zip(axes, panels):
        _plot_permutation_panel(ax, d, main_result, term, label,
                                number_of_permutations)
    # The y axis is shared, so only the first panel is labelled.
    axes[0].set_ylabel('Frequency')

    title = 'permutation test for ' + dependent
    f.suptitle(title)
    plt.tight_layout()
    plt.savefig(output_path + '/' + dependent + '_Permutation_test.png',
                dpi=300)
    plt.show()
예제 #41
0
# Likelihood-ratio statistic between the two panel fits, referred to a
# chi-square distribution with 2 degrees of freedom.
LR = 2 * (model_panel2_results.llf - model_panel1_results.llf)
p = chi2.sf(LR, 2)
print('p: %.30f' % p)

# Summarise the response and report how many observations are zero.
cases_pos = US_cases_long_demogr_week['cases_count_pos']
print(cases_pos.describe())
print(cases_pos.value_counts())
count_total = sum(cases_pos.value_counts().to_dict().values())
count_zero = cases_pos.value_counts()[0.0]
print("Count of zero is {}, about {:.4f} of the data.".format(count_zero, count_zero / count_total ))

# Approach one to generalized linear models for panel data:
# Generalized Estimating Equations.
# Poisson model with an autoregressive working correlation; log
# population enters as the offset.
poi = Poisson()
ar = Autoregressive()
gee_model0 = GEE.from_formula(
    "cases_count_pos ~ week_of_year + percent_age65over + percent_female + percent_black",
    groups="state",
    data=US_cases_long_demogr_week,
    time='week_of_year',
    cov_struct=ar,
    family=poi,
    offset=np.log(np.asarray(US_cases_long_demogr_week["pop_count_2019"])),
)
gee_model0_results = gee_model0.fit(maxiter=200)
print(gee_model0_results.summary())
print(ar.summary())
print("scale=%.2f" % (gee_model0_results.scale))

# The fit above warns "IterationLimitWarning: Iteration limit reached
# prior to convergence", reportedly even with maxiter = 2000, so specific
# starting values are needed for the estimating algorithm to converge.
# First run with an exchangeable dependence structure; per the original
# author's note, this model puts the within-state correlation at roughly
# 0.077.
fam = Poisson()
ex = Exchangeable()
ex_model = GEE.from_formula(
    "cases_count_pos ~ week_of_year + percent_age65over + percent_female + percent_black",
    groups="state",
    data=US_cases_long_demogr_week,
    cov_struct=ex,
    family=fam,
    offset=np.log(np.asarray(US_cases_long_demogr_week["pop_count_2019"])),
)
ex_results = ex_model.fit()
print(ex_results.summary())
print(ex.summary())
import numpy as np
import pandas as pd
import math
from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod.cov_struct import (Exchangeable,
    Independence,Autoregressive)
from statsmodels.genmod.families import Poisson
# Family and working covariance shared by every window fit below.
fam = Poisson()
ind = Independence()


df = pd.read_csv("file:///C:/Users/Luke/Documents/drugi_test.csv")


count = 0
for i in range(60, len(df)):
    # Rolling window: the 60 rows immediately before position i.
    df_tmp = df.head(i).tail(60)
    model1 = GEE.from_formula("liczba ~ indeks", "indeks", df_tmp,
                              cov_struct=ind, family=fam)
    results = model1.fit()

    if 117 < i < 120:
        # DataFrame.get_value was removed from pandas; .at is the
        # label-based scalar accessor that replaces it.
        print(df_tmp.at[60, "indeks"])

    if results.pvalues.Intercept < 0.05 and results.pvalues.indeks < 0.05:
        # Both coefficients significant: sum the fitted mean exp(a + b*x)
        # over the steps following the window.
        # NOTE(review): range(1, 15) covers 14 steps while the fallback
        # below multiplies the window mean by 15 -- confirm which horizon
        # is intended.
        suma = 0
        for n in range(1, 15):
            suma += math.exp(results.params.Intercept
                             + results.params.indeks * (i + n))
        print(str(i)+": " + str(suma))
    else:
        # Otherwise fall back to the window mean of the response.
        prediction = df_tmp.mean().liczba*15
        print(str(i)+": " + str(prediction))
예제 #43
0
    def test_poisson(self):
        """Compare Poisson GEE fits with reference values from R's gee.

        Independence and exchangeable working covariances are checked,
        first through the array interface and then through formulas.
        """
        # R code used to produce the reference values:
        #
        # library(gee)
        # Z = read.csv("results/gee_poisson_1.csv", header=FALSE)
        # Y = Z[,2]
        # Id = Z[,1]
        # X1 = Z[,3]
        # X2 = Z[,4]
        # X3 = Z[,5]
        # X4 = Z[,6]
        # X5 = Z[,7]
        #
        # mi = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
        #         corstr="independence", scale.fix=TRUE)
        # smi = summary(mi)
        # u = coefficients(smi)
        # cfi = paste(u[,1], collapse=",")
        # sei = paste(u[,4], collapse=",")
        #
        # me = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
        #         corstr="exchangeable", scale.fix=TRUE)
        # sme = summary(me)
        #
        # u = coefficients(sme)
        # cfe = paste(u[,1], collapse=",")
        # see = paste(u[,4], collapse=",")
        #
        # sprintf("cf = [[%s],[%s]]", cfi, cfe)
        # sprintf("se = [[%s],[%s]]", sei, see)

        family = Poisson()
        endog, exog, group_n = load_data("gee_poisson_1.csv")
        cov_structs = (Independence(), Exchangeable())

        # From R gee: one row per covariance structure.
        r_coefs = [
            [-0.0364450410793481, -0.0543209391301178,
             0.0156642711741052, 0.57628591338724,
             -0.00465659951186211, -0.477093153099256],
            [-0.0315615554826533, -0.0562589480840004,
             0.0178419412298561, 0.571512795340481,
             -0.00363255566297332, -0.475971696727736],
        ]
        r_ses = [
            [0.0611309237214186, 0.0390680524493108,
             0.0334234174505518, 0.0366860768962715,
             0.0304758505008105, 0.0316348058881079],
            [0.0610840153582275, 0.0376887268649102,
             0.0325168379415177, 0.0369786751362213,
             0.0296141014225009, 0.0306115470200955],
        ]

        for cov, coefs, ses in zip(cov_structs, r_coefs, r_ses):
            fit = GEE(endog, exog, group_n, None, family, cov).fit()
            assert_almost_equal(fit.params, coefs, decimal=5)
            assert_almost_equal(fit.standard_errors(), ses, decimal=6)

        # Test with formulas: response, group id, then every exog column
        # after the first, named X1, X2, ...
        names = ["Y", "Id"] + ["X%d" % k for k in range(1, exog.shape[1])]
        stacked = np.concatenate(
            (endog[:, None], group_n[:, None], exog[:, 1:]), axis=1)
        frame = pd.DataFrame(stacked)
        frame.columns = names

        for cov, coefs, ses in zip(cov_structs, r_coefs, r_ses):
            model = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id",
                                     frame, family=family, cov_struct=cov)
            fit = model.fit()
            assert_almost_equal(fit.params, coefs, decimal=5)
            assert_almost_equal(fit.standard_errors(), ses, decimal=6)
예제 #44
0
    def test_linear(self):
        """Compare Gaussian GEE fits with reference values from R's gee.

        Independence and exchangeable working covariances are checked,
        first through the array interface and then through formulas.
        """
        # R code used to produce the reference values:
        #
        # library(gee)
        #
        # Z = read.csv("results/gee_linear_1.csv", header=FALSE)
        # Y = Z[,2]
        # Id = Z[,1]
        # X1 = Z[,3]
        # X2 = Z[,4]
        # X3 = Z[,5]
        # mi = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
        #          corstr="independence", tol=1e-8, maxit=100)
        # smi = summary(mi)
        # u = coefficients(smi)
        #
        # cfi = paste(u[,1], collapse=",")
        # sei = paste(u[,4], collapse=",")
        #
        # me = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
        #          corstr="exchangeable", tol=1e-8, maxit=100)
        # sme = summary(me)
        # u = coefficients(sme)
        #
        # cfe = paste(u[,1], collapse=",")
        # see = paste(u[,4], collapse=",")
        #
        # sprintf("cf = [[%s],[%s]]", cfi, cfe)
        # sprintf("se = [[%s],[%s]]", sei, see)

        family = Gaussian()
        endog, exog, group = load_data("gee_linear_1.csv")
        cov_structs = (Independence(), Exchangeable())

        # From R gee: one row per covariance structure.
        r_coefs = [
            [-0.01850226507491, 0.81436304278962,
             -1.56167635393184, 0.794239361055003],
            [-0.0182920577154767, 0.814898414022467,
             -1.56194040106201, 0.793499517527478],
        ]
        r_ses = [
            [0.0440733554189401, 0.0479993639119261,
             0.0496045952071308, 0.0479467597161284],
            [0.0440369906460754, 0.0480069787567662,
             0.049519758758187, 0.0479760443027526],
        ]

        for cov, coefs, ses in zip(cov_structs, r_coefs, r_ses):
            fit = GEE(endog, exog, group, None, family, cov).fit()
            assert_almost_equal(fit.params, coefs, decimal=10)
            assert_almost_equal(fit.standard_errors(), ses, decimal=10)

        # Test with formulas: response, group id, then every exog column
        # after the first, named X1, X2, ...
        names = ["Y", "Id"] + ["X%d" % k for k in range(1, exog.shape[1])]
        stacked = np.concatenate(
            (endog[:, None], group[:, None], exog[:, 1:]), axis=1)
        frame = pd.DataFrame(stacked)
        frame.columns = names

        for cov, coefs, ses in zip(cov_structs, r_coefs, r_ses):
            model = GEE.from_formula("Y ~ X1 + X2 + X3", "Id", frame,
                                     family=family, cov_struct=cov)
            fit = model.fit()
            assert_almost_equal(fit.params, coefs, decimal=10)
            assert_almost_equal(fit.standard_errors(), ses, decimal=10)
예제 #45
0
    def test_logistic(self):
        """Compare binomial GEE fits with reference values from R's gee.

        Independence and exchangeable fits are compared with the R values
        through both the array and the formula interface.  The
        autoregressive model is fit through the array interface but its
        estimates are not compared.
        """
        # R code for comparing results:
        #
        # library(gee)
        # Z = read.csv("results/gee_logistic_1.csv", header=FALSE)
        # Y = Z[,2]
        # Id = Z[,1]
        # X1 = Z[,3]
        # X2 = Z[,4]
        # X3 = Z[,5]
        #
        # mi = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
        #          corstr="independence")
        # smi = summary(mi)
        # u = coefficients(smi)
        # cfi = paste(u[,1], collapse=",")
        # sei = paste(u[,4], collapse=",")
        #
        # me = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
        #          corstr="exchangeable")
        # sme = summary(me)
        # u = coefficients(sme)
        # cfe = paste(u[,1], collapse=",")
        # see = paste(u[,4], collapse=",")
        #
        # ma = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
        #          corstr="AR-M")
        # sma = summary(ma)
        # u = coefficients(sma)
        # cfa = paste(u[,1], collapse=",")
        # sea = paste(u[,4], collapse=",")
        #
        # sprintf("cf = [[%s],[%s],[%s]]", cfi, cfe, cfa)
        # sprintf("se = [[%s],[%s],[%s]]", sei, see, sea)

        endog, exog, group = load_data("gee_logistic_1.csv")

        # Time values for the autoregressive model: position of each
        # observation within its group.
        T = np.zeros(len(endog))
        for label in set(group):
            members = np.flatnonzero(group == label)
            T[members] = lrange(len(members))

        family = Binomial()
        ve = Exchangeable()
        vi = Independence()
        va = Autoregressive()

        # From R gee: rows are independence, exchangeable, AR.
        r_coefs = [
            [0.0167272965285882, 1.13038654425893,
             -1.86896345082962, 1.09397608331333],
            [0.0178982283915449, 1.13118798191788,
             -1.86133518416017, 1.08944256230299],
            [0.0109621937947958, 1.13226505028438,
             -1.88278757333046, 1.09954623769449],
        ]
        r_ses = [
            [0.127291720283049, 0.166725808326067,
             0.192430061340865, 0.173141068839597],
            [0.127045031730155, 0.165470678232842,
             0.192052750030501, 0.173174779369249],
            [0.127240302296444, 0.170554083928117,
             0.191045527104503, 0.169776150974586],
        ]

        for cov, coefs, ses in zip((vi, ve, va), r_coefs, r_ses):
            fit = GEE(endog, exog, group, T, family, cov).fit()
            if cov is va:
                # The autoregressive fit only has to run.
                continue
            assert_almost_equal(fit.params, coefs, decimal=6)
            assert_almost_equal(fit.standard_errors(), ses, decimal=6)

        # Test with formulas: response, group id, then every exog column
        # after the first, named X1, X2, ...
        names = ["Y", "Id"] + ["X%d" % k for k in range(1, exog.shape[1])]
        stacked = np.concatenate(
            (endog[:, None], group[:, None], exog[:, 1:]), axis=1)
        frame = pd.DataFrame(stacked)
        frame.columns = names

        # zip stops after the two structures listed here, so the AR
        # reference row is not used.
        for cov, coefs, ses in zip((vi, ve), r_coefs, r_ses):
            model = GEE.from_formula("Y ~ X1 + X2 + X3", "Id", frame,
                                     family=family, cov_struct=cov)
            fit = model.fit()
            assert_almost_equal(fit.params, coefs, decimal=6)
            assert_almost_equal(fit.standard_errors(), ses, decimal=6)
예제 #46
0
    vals = np.size(dataset2.iloc[:, i].unique())
    if vals >= 100:
        vals = 100

    plt.hist(dataset2.iloc[:, i], bins=vals, color='#3F5D7D')
# Lay out the current figure inside the given rect, which leaves a margin
# at the top and bottom.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])

# Correlation with Response Variable: bar chart of each dataset2 column's
# correlation with dataset.TC.
dataset2.corrwith(dataset.TC).plot.bar(figsize=(20, 10),
                                       title="Correlation with TC",
                                       fontsize=15,
                                       rot=45,
                                       grid=True)

# Names of the predictor columns used in the formula below.
X = ['MJJ', 'JJA', 'JAS']
# building the model

fam = Poisson()
ind = Independence()
# NOTE(review): this call passes neither a groups argument nor a data
# frame to from_formula; other callers pass both -- confirm what should
# be supplied here.
model1 = GEE.from_formula("TC ~ MJJ + JJA + JAS", cov_struct=ind, family=fam)
result1 = model1.fit()
print(result1.summary())

# testing the model

# NOTE(review): `poisson_res` is not assigned anywhere in the visible
# fragment (the fit above is stored in `result1`), and X is a list of
# column names rather than data -- confirm the intended predict() call.
predVals = poisson_res.predict(X)

# Real values as red stars, predictions as blue circles.
# NOTE(review): the bare name `TC` is not assigned in the visible
# fragment; presumably dataset.TC -- verify.
plt.plot(range(len(TC)), TC, 'r*-', range(len(TC)), predVals, 'bo-')
plt.title('Train dataset Real vs. Predicted Values')
plt.legend(['Real Values', 'Predicted Values'])
plt.show()
예제 #47
0
    def test_linear(self):
        """
        Compare Gaussian GEE fits with reference values from R's gee.

        R code used to produce the reference values:

        library(gee)

        Z = read.csv("results/gee_linear_1.csv", header=FALSE)
        Y = Z[,2]
        Id = Z[,1]
        X1 = Z[,3]
        X2 = Z[,4]
        X3 = Z[,5]
        mi = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
                 corstr="independence", tol=1e-8, maxit=100)
        smi = summary(mi)
        u = coefficients(smi)

        cfi = paste(u[,1], collapse=",")
        sei = paste(u[,4], collapse=",")

        me = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
                 corstr="exchangeable", tol=1e-8, maxit=100)
        sme = summary(me)
        u = coefficients(sme)

        cfe = paste(u[,1], collapse=",")
        see = paste(u[,4], collapse=",")

        sprintf("cf = [[%s],[%s]]", cfi, cfe)
        sprintf("se = [[%s],[%s]]", sei, see)
        """
        family = Gaussian()
        endog, exog, group = load_data("gee_linear_1.csv")
        cov_structs = (Independence(), Exchangeable())

        # From R gee: one row per covariance structure.
        r_coefs = [
            [-0.01850226507491, 0.81436304278962,
             -1.56167635393184, 0.794239361055003],
            [-0.0182920577154767, 0.814898414022467,
             -1.56194040106201, 0.793499517527478],
        ]
        r_ses = [
            [0.0440733554189401, 0.0479993639119261,
             0.0496045952071308, 0.0479467597161284],
            [0.0440369906460754, 0.0480069787567662,
             0.049519758758187, 0.0479760443027526],
        ]

        for cov, coefs, ses in zip(cov_structs, r_coefs, r_ses):
            fit = GEE(endog, exog, group, None, family, cov).fit()
            assert_almost_equal(fit.params, coefs, decimal=10)
            assert_almost_equal(fit.standard_errors(), ses, decimal=10)

        # Test with formulas: response, group id, then every exog column
        # after the first, named X1, X2, ...
        names = ["Y", "Id"] + ["X%d" % k for k in range(1, exog.shape[1])]
        stacked = np.concatenate(
            (endog[:, None], group[:, None], exog[:, 1:]), axis=1)
        frame = pd.DataFrame(stacked)
        frame.columns = names

        for cov, coefs, ses in zip(cov_structs, r_coefs, r_ses):
            model = GEE.from_formula("Y ~ X1 + X2 + X3",
                                     frame,
                                     None,
                                     groups=frame.loc[:, "Id"],
                                     family=family,
                                     covstruct=cov)
            fit = model.fit()
            assert_almost_equal(fit.params, coefs, decimal=10)
            assert_almost_equal(fit.standard_errors(), ses, decimal=10)
예제 #48
0
    def test_poisson(self):
        """
        Compare Poisson GEE fits with reference values from R's gee.

        R code used to produce the reference values:

        library(gee)
        Z = read.csv("results/gee_poisson_1.csv", header=FALSE)
        Y = Z[,2]
        Id = Z[,1]
        X1 = Z[,3]
        X2 = Z[,4]
        X3 = Z[,5]
        X4 = Z[,6]
        X5 = Z[,7]

        mi = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
                corstr="independence", scale.fix=TRUE)
        smi = summary(mi)
        u = coefficients(smi)
        cfi = paste(u[,1], collapse=",")
        sei = paste(u[,4], collapse=",")

        me = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
                corstr="exchangeable", scale.fix=TRUE)
        sme = summary(me)

        u = coefficients(sme)
        cfe = paste(u[,1], collapse=",")
        see = paste(u[,4], collapse=",")

        sprintf("cf = [[%s],[%s]]", cfi, cfe)
        sprintf("se = [[%s],[%s]]", sei, see)
        """
        family = Poisson()
        endog, exog, group_n = load_data("gee_poisson_1.csv")
        cov_structs = (Independence(), Exchangeable())

        # From R gee: one row per covariance structure.
        r_coefs = [
            [-0.0364450410793481, -0.0543209391301178,
             0.0156642711741052, 0.57628591338724,
             -0.00465659951186211, -0.477093153099256],
            [-0.0315615554826533, -0.0562589480840004,
             0.0178419412298561, 0.571512795340481,
             -0.00363255566297332, -0.475971696727736],
        ]
        r_ses = [
            [0.0611309237214186, 0.0390680524493108,
             0.0334234174505518, 0.0366860768962715,
             0.0304758505008105, 0.0316348058881079],
            [0.0610840153582275, 0.0376887268649102,
             0.0325168379415177, 0.0369786751362213,
             0.0296141014225009, 0.0306115470200955],
        ]

        for cov, coefs, ses in zip(cov_structs, r_coefs, r_ses):
            fit = GEE(endog, exog, group_n, None, family, cov).fit()
            assert_almost_equal(fit.params, coefs, decimal=5)
            assert_almost_equal(fit.standard_errors(), ses, decimal=6)

        # Test with formulas: response, group id, then every exog column
        # after the first, named X1, X2, ...
        names = ["Y", "Id"] + ["X%d" % k for k in range(1, exog.shape[1])]
        stacked = np.concatenate(
            (endog[:, None], group_n[:, None], exog[:, 1:]), axis=1)
        frame = pd.DataFrame(stacked)
        frame.columns = names

        for cov, coefs, ses in zip(cov_structs, r_coefs, r_ses):
            model = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5",
                                     frame,
                                     None,
                                     groups=frame.loc[:, "Id"],
                                     family=family,
                                     covstruct=cov)
            fit = model.fit()
            assert_almost_equal(fit.params, coefs, decimal=5)
            assert_almost_equal(fit.standard_errors(), ses, decimal=6)
예제 #49
0
    def test_logistic(self):
        """
        Compare binomial GEE fits with reference values from R's gee.

        Independence and exchangeable fits are compared with the R values
        through both the array and the formula interface.  The
        autoregressive model is fit through the array interface but its
        estimates are not compared.  Finally ``summary()`` is called on
        the last fit to check that it runs without raising.

        R code for comparing results:

        library(gee)
        Z = read.csv("results/gee_logistic_1.csv", header=FALSE)
        Y = Z[,2]
        Id = Z[,1]
        X1 = Z[,3]
        X2 = Z[,4]
        X3 = Z[,5]

        mi = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
                 corstr="independence")
        smi = summary(mi)
        u = coefficients(smi)
        cfi = paste(u[,1], collapse=",")
        sei = paste(u[,4], collapse=",")

        me = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
                 corstr="exchangeable")
        sme = summary(me)
        u = coefficients(sme)
        cfe = paste(u[,1], collapse=",")
        see = paste(u[,4], collapse=",")

        ma = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
                 corstr="AR-M")
        sma = summary(ma)
        u = coefficients(sma)
        cfa = paste(u[,1], collapse=",")
        sea = paste(u[,4], collapse=",")

        sprintf("cf = [[%s],[%s],[%s]]", cfi, cfe, cfa)
        sprintf("se = [[%s],[%s],[%s]]", sei, see, sea)
        """

        endog, exog, group = load_data("gee_logistic_1.csv")

        # Time values for the autoregressive model: position of each
        # observation within its group.
        T = np.zeros(len(endog))
        for ii in set(group):
            jj = np.flatnonzero(group == ii)
            T[jj] = np.arange(len(jj))

        family = Binomial()
        ve = Exchangeable()
        vi = Independence()
        va = Autoregressive()

        # From R gee: rows are independence, exchangeable, AR.
        cf = [
            [0.0167272965285882, 1.13038654425893,
             -1.86896345082962, 1.09397608331333],
            [0.0178982283915449, 1.13118798191788,
             -1.86133518416017, 1.08944256230299],
            [0.0109621937947958, 1.13226505028438,
             -1.88278757333046, 1.09954623769449],
        ]
        se = [
            [0.127291720283049, 0.166725808326067,
             0.192430061340865, 0.173141068839597],
            [0.127045031730155, 0.165470678232842,
             0.192052750030501, 0.173174779369249],
            [0.127240302296444, 0.170554083928117,
             0.191045527104503, 0.169776150974586],
        ]

        for j, v in enumerate((vi, ve, va)):
            md = GEE(endog, exog, group, T, family, v)
            mdf = md.fit()
            # The autoregressive fit only has to run; its estimates are
            # not compared with R.
            if v is not va:
                assert_almost_equal(mdf.params, cf[j], decimal=6)
                assert_almost_equal(mdf.standard_errors(), se[j], decimal=6)

        # Test with formulas
        D = np.concatenate((endog[:, None], group[:, None], exog[:, 1:]),
                           axis=1)
        D = pd.DataFrame(D)
        D.columns = [
            "Y",
            "Id",
        ] + ["X%d" % (k + 1) for k in range(exog.shape[1] - 1)]
        for j, v in enumerate((vi, ve)):
            md = GEE.from_formula("Y ~ X1 + X2 + X3",
                                  D,
                                  None,
                                  groups=D.loc[:, "Id"],
                                  family=family,
                                  covstruct=v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=6)
            assert_almost_equal(mdf.standard_errors(), se[j], decimal=6)

        # Check for run-time exceptions in summary.  The call form of
        # print is valid on both Python 2 and Python 3; the statement
        # form is a SyntaxError on Python 3.
        print(mdf.summary())
예제 #50
0
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 12 11:36:51 2016

@author: emg
"""

import numpy as np
import pandas as pd
from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod.cov_struct import (Exchangeable, Independence,
                                           Autoregressive)
from statsmodels.genmod.families import Poisson

# Poisson GEE of author_count on top and mod, grouped by the "author"
# column, with an independence working covariance.
fam, ind = Poisson(), Independence()
model1 = GEE.from_formula(
    "author_count ~ top + mod", "author", authors, cov_struct=ind, family=fam
)
result1 = model1.fit()
print(result1.summary())