Example #1
    def setup_class(cls):
        np.random.seed(4321)
        n = 20
        p = 5
        exog = np.empty((n, p))
        exog[:, 0] = 1
        exog[:, 1] = np.random.randint(low=-5, high=5, size=n)
        x = np.repeat(np.array([1, 2, 3, 4]), n // 4)
        exog[:, 2:] = get_dummies(x)
        beta = np.array([-1, 0.1, -0.05, .2, 0.35])
        lin_pred = (exog * beta).sum(axis=1)
        family = sm.families.Binomial
        link = sm.families.links.log
        endog = gen_endog(lin_pred, family, link, binom_version=0)
        mod1 = sm.GLM(endog, exog, family=family(link=link()))
        cls.res1 = mod1.fit(rtol=1e-10, atol=0, tol_criterion='params',
                            scaletype='x2')

        agg = pd.DataFrame(exog)
        agg['endog'] = endog
        agg_endog = agg.groupby([0, 1, 2, 3, 4]).sum()[['endog']]
        agg_wt = agg.groupby([0, 1, 2, 3, 4]).count()[['endog']]
        agg_exog = np.array(agg_endog.index.tolist())
        agg_wt = agg_wt['endog']
        avg_endog = agg_endog['endog'] / agg_wt
        mod2 = sm.GLM(avg_endog, agg_exog,
                      family=family(link=link()),
                      var_weights=agg_wt)
        cls.res2 = mod2.fit(rtol=1e-10, atol=0, tol_criterion='params')
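
The pattern above, fitting once on the raw 0/1 outcomes and once on per-group
success proportions with the group counts passed as var_weights, produces the
same coefficients. A minimal self-contained sketch of that equivalence (the
simulated data and names below are illustrative assumptions, not part of the
test):

import numpy as np
import pandas as pd
import statsmodels.api as sm
from numpy.testing import assert_allclose

np.random.seed(0)
g = np.random.randint(0, 4, size=400)            # four covariate patterns
exog = pd.get_dummies(g, dtype=float).to_numpy()
prob = 1 / (1 + np.exp(-exog @ np.array([-1.0, 0.0, 0.5, 1.0])))
endog = np.random.binomial(1, prob)

# Fit on the raw 0/1 outcomes.
res_full = sm.GLM(endog, exog, family=sm.families.Binomial()).fit()

# Aggregate to one row per covariate pattern: model the success
# proportion and pass the group sizes as var_weights.
df = pd.DataFrame(exog)
df['endog'] = endog
grouped = df.groupby([0, 1, 2, 3])['endog']
counts, props = grouped.count(), grouped.mean()
agg_exog = np.array(props.index.tolist())
res_agg = sm.GLM(props, agg_exog, family=sm.families.Binomial(),
                 var_weights=counts).fit()

assert_allclose(res_full.params, res_agg.params, rtol=1e-6)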
Example #2
    def setup_class(cls):
        np.random.seed(4321)
        n = 10000
        p = 5
        exog = np.empty((n, p))
        exog[:, 0] = 1
        exog[:, 1] = np.random.randint(low=-5, high=5, size=n)
        x = np.repeat(np.array([1, 2, 3, 4]), n // 4)
        exog[:, 2:] = get_dummies(x)
        beta = np.array([7, 0.1, -0.05, .2, 0.35])
        lin_pred = (exog * beta).sum(axis=1)
        family = sm.families.Tweedie
        link = sm.families.links.log
        endog = gen_endog(lin_pred, family, link)
        mod1 = sm.GLM(endog, exog, family=family(link=link(), var_power=1.5))
        cls.res1 = mod1.fit(rtol=1e-20, atol=0, tol_criterion='params')

        agg = pd.DataFrame(exog)
        agg['endog'] = endog
        agg_endog = agg.groupby([0, 1, 2, 3, 4]).sum()[['endog']]
        agg_wt = agg.groupby([0, 1, 2, 3, 4]).count()[['endog']]
        agg_exog = np.array(agg_endog.index.tolist())
        agg_wt = agg_wt['endog']
        agg_endog = agg_endog['endog']
        mod2 = sm.GLM(agg_endog, agg_exog,
                      family=family(link=link(), var_power=1.5),
                      exposure=agg_wt, var_weights=agg_wt ** 0.5)
        cls.res2 = mod2.fit(rtol=1e-20, atol=0, tol_criterion='params')
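
A note on the weights in this aggregated Tweedie fit: the group size enters
the mean through exposure (with a log link, exposure multiplies mu by n),
while var_weights = agg_wt ** 0.5 supplies the factor n**(p - 1) for
p = var_power = 1.5. That is what reconciles the variances: a total of n
observations has variance n * phi * mu**p, the aggregated model assigns it
(phi / w) * (n * mu)**p, and the two agree exactly when w = n**(p - 1).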
Example #3
    def setup_class(cls):
        np.random.seed(4321)
        n = 10000
        p = 5
        exog = np.empty((n, p))
        exog[:, 0] = 1
        exog[:, 1] = np.random.randint(low=-5, high=5, size=n)
        x = np.repeat(np.array([1, 2, 3, 4]), n // 4)
        exog[:, 2:] = get_dummies(x)
        beta = np.array([-1, 0.1, -0.05, .2, 0.35])
        lin_pred = (exog * beta).sum(axis=1)
        family = sm.families.Poisson
        link = sm.families.links.log
        endog = gen_endog(lin_pred, family, link)
        mod1 = sm.GLM(endog, exog, family=family(link=link()))
        cls.res1 = mod1.fit()

        agg = pd.DataFrame(exog)
        agg['endog'] = endog
        agg_endog = agg.groupby([0, 1, 2, 3, 4]).sum()[['endog']]
        agg_wt = agg.groupby([0, 1, 2, 3, 4]).count()[['endog']]
        agg_exog = np.array(agg_endog.index.tolist())
        agg_wt = agg_wt['endog']
        avg_endog = agg_endog['endog'] / agg_wt
        mod2 = sm.GLM(avg_endog, agg_exog, family=family(link=link()),
                      var_weights=agg_wt)
        cls.res2 = mod2.fit()
Example #4
    def setup_class(cls):
        np.random.seed(4321)
        n = 10000
        p = 5
        exog = np.empty((n, p))
        exog[:, 0] = 1
        exog[:, 1] = np.random.randint(low=-5, high=5, size=n)
        x = np.repeat(np.array([1, 2, 3, 4]), n // 4)
        exog[:, 2:] = get_dummies(x)
        beta = np.array([-1, 0.1, -0.05, .2, 0.35])
        lin_pred = (exog * beta).sum(axis=1)
        family = sm.families.Binomial
        link = sm.families.links.log
        endog = gen_endog(lin_pred, family, link, binom_version=0)
        wt = np.random.randint(1, 5, n)
        mod1 = sm.GLM(endog, exog, family=family(link=link()),
                      freq_weights=wt)
        cls.res1 = mod1.fit()

        exog_dup = np.repeat(exog, wt, axis=0)
        endog_dup = np.repeat(endog, wt)
        mod2 = sm.GLM(endog_dup, exog_dup, family=family(link=link()))
        cls.res2 = mod2.fit()
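
Example #4 relies on freq_weights being exactly equivalent to physically
repeating each row wt[i] times, including the log-likelihood and standard
errors, since frequency weights change the effective sample size. A
self-contained sketch of that equivalence (Poisson data chosen for brevity;
names are illustrative assumptions):

import numpy as np
import statsmodels.api as sm

np.random.seed(1)
exog = sm.add_constant(np.random.normal(size=50))
endog = np.random.poisson(np.exp(0.3 * exog[:, 1]))
wt = np.random.randint(1, 4, size=50)

# Weighted fit versus the same data with rows duplicated wt[i] times.
res_w = sm.GLM(endog, exog, family=sm.families.Poisson(),
               freq_weights=wt).fit()
res_d = sm.GLM(np.repeat(endog, wt), np.repeat(exog, wt, axis=0),
               family=sm.families.Poisson()).fit()

np.testing.assert_allclose(res_w.params, res_d.params, rtol=1e-6)
np.testing.assert_allclose(res_w.bse, res_d.bse, rtol=1e-6)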
Example #5
    def setup_class(cls):
        from .results.results_glm import CancerLog
        res2 = CancerLog
        endog = res2.endog
        exog = res2.exog[:, :-1]
        exog = sm.add_constant(exog, prepend=True)

        aweights = np.repeat(1, len(endog))
        aweights[::5] = 5
        aweights[::13] = 3
        model = sm.GLM(endog, exog,
                       family=sm.families.Gamma(link=sm.families.links.log()),
                       var_weights=aweights)
        cls.res1 = model.fit(rtol=1e-25, atol=0)
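
Unlike the frequency weights in Example #4, the var_weights here act as
analytic weights: statsmodels scales each observation's variance as
Var(y_i) = phi * V(mu_i) / w_i, so the rows given weight 5 or 3 are treated
as correspondingly less noisy, without changing the nominal sample size or
the residual degrees of freedom.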
Example #6
    def setup(self):
        # Re-fit before each test, because the tests mutate the results.
        x = self.exog
        np.random.seed(987689)
        y = x.sum(1) + np.random.randn(x.shape[0])
        self.results = sm.GLM(y, self.exog).fit()
Example #7
def test_gradient_irls():
    # Compare the results when using gradient optimization and IRLS.
    # TODO: Find working examples for inverse_squared link
    np.random.seed(87342)

    fams = [(sm.families.Binomial, [links.logit, links.probit, links.cloglog,
                                    links.log, links.cauchy]),
            (sm.families.Poisson, [links.log, links.identity, links.sqrt]),
            (sm.families.Gamma, [links.log, links.identity,
                                 links.inverse_power]),
            (sm.families.Gaussian, [links.identity, links.log,
                                    links.inverse_power]),
            (sm.families.InverseGaussian, [links.log, links.identity,
                                           links.inverse_power,
                                           links.inverse_squared]),
            (sm.families.NegativeBinomial, [links.log, links.inverse_power,
                                            links.inverse_squared,
                                            links.identity])]

    n = 100
    p = 3
    exog = np.random.normal(size=(n, p))
    exog[:, 0] = 1

    skip_one = False
    for family_class, family_links in fams:
        for link in family_links:
            for binom_version in [0, 1]:

                if family_class != sm.families.Binomial and binom_version == 1:
                    continue

                if (family_class, link) == (sm.families.Poisson,
                                            links.identity):
                    lin_pred = 20 + exog.sum(1)
                elif (family_class, link) == (sm.families.Binomial, links.log):
                    lin_pred = -1 + exog.sum(1) / 8
                elif (family_class, link) == (sm.families.Poisson, links.sqrt):
                    lin_pred = 2 + exog.sum(1)
                elif (family_class, link) == (sm.families.InverseGaussian,
                                              links.log):
                    # skip_zero = True
                    lin_pred = -1 + exog.sum(1)
                elif (family_class, link) == (sm.families.InverseGaussian,
                                              links.identity):
                    lin_pred = 20 + 5 * exog.sum(1)
                    lin_pred = np.clip(lin_pred, 1e-4, np.inf)
                elif (family_class, link) == (sm.families.InverseGaussian,
                                              links.inverse_squared):
                    lin_pred = 0.5 + exog.sum(1) / 5
                    continue  # skip due to non-convergence
                elif (family_class, link) == (sm.families.InverseGaussian,
                                              links.inverse_power):
                    lin_pred = 1 + exog.sum(1) / 5
                elif (family_class, link) == (sm.families.NegativeBinomial,
                                              links.identity):
                    lin_pred = 20 + 5 * exog.sum(1)
                    lin_pred = np.clip(lin_pred, 1e-4, np.inf)
                elif (family_class, link) == (sm.families.NegativeBinomial,
                                              links.inverse_squared):
                    lin_pred = 0.1 + np.random.uniform(size=exog.shape[0])
                    continue  # skip due to non-convergence
                elif (family_class, link) == (sm.families.NegativeBinomial,
                                              links.inverse_power):
                    lin_pred = 1 + exog.sum(1) / 5

                elif (family_class, link) == (sm.families.Gaussian,
                                              links.inverse_power):
                    # adding skip because of convergence failure
                    skip_one = True
                # GH#4620
                # the following fails with identity link, because endog < 0
                # elif family_class == fam.Gamma:
                #     lin_pred = (0.5 * exog.sum(1) +
                #                 np.random.uniform(size=exog.shape[0]))
                else:
                    lin_pred = np.random.uniform(size=exog.shape[0])

                endog = gen_endog(lin_pred, family_class, link, binom_version)

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    mod_irls = sm.GLM(endog, exog,
                                      family=family_class(link=link()))
                rslt_irls = mod_irls.fit(method="IRLS")

                if (family_class, link) not in [(sm.families.Poisson,
                                                 links.sqrt),
                                                (sm.families.Gamma,
                                                 links.inverse_power),
                                                (sm.families.InverseGaussian,
                                                 links.identity)]:
                    # GH#4620
                    check_score_hessian(rslt_irls)

                # Try with and without starting values.
                for max_start_irls, start_params in [(0, rslt_irls.params),
                                                     (3, None)]:
                    # TODO: skip convergence failures for now
                    if max_start_irls > 0 and skip_one:
                        continue
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        mod_gradient = sm.GLM(endog, exog,
                                              family=family_class(link=link()))
                    rslt_gradient = mod_gradient.fit(
                        max_start_irls=max_start_irls,
                        start_params=start_params,
                        method="newton",
                        maxiter=300)

                    assert_allclose(rslt_gradient.params,
                                    rslt_irls.params, rtol=1e-6, atol=5e-5)

                    assert_allclose(rslt_gradient.llf, rslt_irls.llf,
                                    rtol=1e-6, atol=1e-6)

                    assert_allclose(rslt_gradient.scale, rslt_irls.scale,
                                    rtol=1e-6, atol=1e-6)

                    # Get the standard errors using expected information.
                    ehess = mod_gradient.hessian(rslt_gradient.params,
                                                 observed=False)
                    gradient_bse = np.sqrt(-np.diag(np.linalg.inv(ehess)))
                    assert_allclose(gradient_bse, rslt_irls.bse,
                                    rtol=1e-6, atol=5e-5)
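
The final block of this test recomputes the gradient fit's standard errors
from the expected (Fisher) information before comparing against IRLS, because
IRLS covariances are based on expected information while gradient-based
results need not be. A self-contained sketch of that computation (the Poisson
model below is an illustrative assumption, not part of the test):

import numpy as np
import statsmodels.api as sm

np.random.seed(2)
x = sm.add_constant(np.random.normal(size=200))
y = np.random.poisson(np.exp(0.2 * x[:, 1]))
mod = sm.GLM(y, x, family=sm.families.Poisson())
res = mod.fit(method="newton")

# Expected information at the estimate; the standard errors are the square
# roots of the diagonal of the negative inverse Hessian.
ehess = mod.hessian(res.params, observed=False)
bse_expected = np.sqrt(-np.diag(np.linalg.inv(ehess)))

np.testing.assert_allclose(bse_expected, mod.fit(method="IRLS").bse,
                           rtol=1e-6)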
Example #8
def test_wtd_gradient_irls():
    # Compare the results when using gradient optimization and IRLS.
    # TODO: Find working examples for inverse_squared link
    np.random.seed(87342)

    fam = sm.families
    lnk = sm.families.links
    families = [(fam.Binomial, [lnk.logit, lnk.probit, lnk.cloglog, lnk.log,
                                lnk.cauchy]),
                (fam.Poisson, [lnk.log, lnk.identity, lnk.sqrt]),
                (fam.Gamma, [lnk.log, lnk.identity, lnk.inverse_power]),
                (fam.Gaussian, [lnk.identity, lnk.log, lnk.inverse_power]),
                (fam.InverseGaussian, [lnk.log, lnk.identity,
                                       lnk.inverse_power,
                                       lnk.inverse_squared]),
                (fam.NegativeBinomial, [lnk.log, lnk.inverse_power,
                                        lnk.inverse_squared, lnk.identity])]

    n = 100
    p = 3
    exog = np.random.normal(size=(n, p))
    exog[:, 0] = 1

    skip_one = False
    for family_class, family_links in families:
        for link in family_links:
            for binom_version in [0, 1]:
                method = 'bfgs'

                if family_class != fam.Binomial and binom_version == 1:
                    continue
                elif family_class == fam.Binomial and link == lnk.cloglog:
                    # Can't get gradient to converge with var_weights here
                    continue
                elif family_class == fam.Binomial and link == lnk.log:
                    # Can't get gradient to converge with var_weights here
                    continue
                elif (family_class, link) == (fam.Poisson, lnk.identity):
                    lin_pred = 20 + exog.sum(1)
                elif (family_class, link) == (fam.Binomial, lnk.log):
                    lin_pred = -1 + exog.sum(1) / 8
                elif (family_class, link) == (fam.Poisson, lnk.sqrt):
                    lin_pred = -2 + exog.sum(1)
                elif (family_class, link) == (fam.Gamma, lnk.log):
                    # Can't get gradient to converge with var_weights here
                    continue
                elif (family_class, link) == (fam.Gamma, lnk.identity):
                    # Can't get gradient to converge with var_weights here
                    continue
                elif (family_class, link) == (fam.Gamma, lnk.inverse_power):
                    # Can't get gradient to converge with var_weights here
                    continue
                elif (family_class, link) == (fam.Gaussian, lnk.log):
                    # Can't get gradient to converge with var_weights here
                    continue
                elif (family_class, link) == (fam.Gaussian, lnk.inverse_power):
                    # Can't get gradient to converge with var_weights here
                    continue
                elif (family_class, link) == (fam.InverseGaussian, lnk.log):
                    # Can't get gradient to converge with var_weights here
                    lin_pred = -1 + exog.sum(1)
                    continue
                elif (family_class, link) == (fam.InverseGaussian,
                                              lnk.identity):
                    # Can't get gradient to converge with var_weights here
                    lin_pred = 20 + 5 * exog.sum(1)
                    lin_pred = np.clip(lin_pred, 1e-4, np.inf)
                    continue
                elif (family_class, link) == (fam.InverseGaussian,
                                              lnk.inverse_squared):
                    lin_pred = 0.5 + exog.sum(1) / 5
                    continue  # skip due to non-convergence
                elif (family_class, link) == (fam.InverseGaussian,
                                              lnk.inverse_power):
                    lin_pred = 1 + exog.sum(1) / 5
                    method = 'newton'
                elif (family_class, link) == (fam.NegativeBinomial,
                                              lnk.identity):
                    lin_pred = 20 + 5 * exog.sum(1)
                    lin_pred = np.clip(lin_pred, 1e-3, np.inf)
                    method = 'newton'
                elif (family_class, link) == (fam.NegativeBinomial,
                                              lnk.inverse_squared):
                    lin_pred = 0.1 + np.random.uniform(size=exog.shape[0])
                    continue  # skip due to non-convergence
                elif (family_class, link) == (fam.NegativeBinomial,
                                              lnk.inverse_power):
                    # Can't get gradient to converge with var_weights here
                    lin_pred = 1 + exog.sum(1) / 5
                    continue

                elif (family_class, link) == (fam.Gaussian, lnk.inverse_power):
                    # adding skip because of convergence failure; note this
                    # branch is unreachable, since the same pair already hits
                    # `continue` above
                    skip_one = True
                else:
                    lin_pred = np.random.uniform(size=exog.shape[0])

                endog = gen_endog(lin_pred, family_class, link, binom_version)
                if binom_version == 0:
                    wts = np.ones_like(endog)
                    tmp = np.random.randint(2,
                                            5,
                                            size=(endog > endog.mean()).sum())
                    wts[endog > endog.mean()] = tmp
                else:
                    wts = np.ones(shape=endog.shape[0])
                    y = endog[:, 0] / endog.sum(axis=1)
                    tmp = np.random.gamma(2, size=(y > y.mean()).sum())
                    wts[y > y.mean()] = tmp

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    mod_irls = sm.GLM(endog, exog, var_weights=wts,
                                      family=family_class(link=link()))
                    rslt_irls = mod_irls.fit(method="IRLS", atol=1e-10,
                                             tol_criterion='params')

                # Try with and without starting values.
                for max_start_irls, start_params in ((0, rslt_irls.params),
                                                     (3, None)):
                    # TODO: skip convergence failures for now
                    if max_start_irls > 0 and skip_one:
                        continue
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        mod_gradient = sm.GLM(endog, exog, var_weights=wts,
                                              family=family_class(link=link()))
                    rslt_gradient = mod_gradient.fit(
                        max_start_irls=max_start_irls,
                        start_params=start_params,
                        method=method)
                    assert_allclose(rslt_gradient.params,
                                    rslt_irls.params,
                                    rtol=1e-6, atol=5e-5)

                    assert_allclose(rslt_gradient.llf, rslt_irls.llf,
                                    rtol=1e-6, atol=1e-6)

                    assert_allclose(rslt_gradient.scale, rslt_irls.scale,
                                    rtol=1e-6, atol=1e-6)

                    # Get the standard errors using expected information.
                    ehess = mod_gradient.hessian(rslt_gradient.params,
                                                 observed=False)
                    gradient_bse = np.sqrt(-np.diag(np.linalg.inv(ehess)))
                    assert_allclose(gradient_bse, rslt_irls.bse,
                                    rtol=1e-6, atol=5e-5)