Example #1
def test_zero_penalty():
    x, y, poly = multivariate_sample_data()
    alphas = [0, 0]
    gam_gs = GLMGam(y, smoother=poly, alpha=alphas)
    gam_gs_res = gam_gs.fit()
    y_est_gam = gam_gs_res.predict()

    glm = GLM(y, poly.basis).fit()
    y_est = glm.predict()

    assert_allclose(y_est, y_est_gam)
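Note that multivariate_sample_data is a helper from the statsmodels test suite. A self-contained sketch of the same zero-penalty equivalence, with made-up synthetic data and assuming the statsmodels GAM API (statsmodels >= 0.9), could look like:

import numpy as np
from numpy.testing import assert_allclose
from statsmodels.gam.api import GLMGam, BSplines
from statsmodels.genmod.generalized_linear_model import GLM

np.random.seed(0)
x = np.random.uniform(0, 1, (200, 2))
y = x[:, 0] ** 2 + np.random.normal(scale=0.1, size=200)

bs = BSplines(x, df=[6, 6], degree=[3, 3])
# with alpha=0 the penalized fit reduces to a plain GLM on the spline basis
res_gam = GLMGam(y, smoother=bs, alpha=[0, 0]).fit(method='pirls',
                                                   max_start_irls=0,
                                                   disp=0, maxiter=5000)
res_glm = GLM(y, bs.basis).fit()
assert_allclose(res_gam.predict(), res_glm.predict(), rtol=1e-6)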
Example #2
    def init(cls):
        cls.res2 = cls.mod2.fit()
        mod = GLM(cls.endog, cls.exogc,
                  offset=0.5 * cls.exog[:, cls.idx_c].squeeze())
        mod.exog_names[:] = ['const', 'x2', 'x3', 'x4']
        cls.res1 = mod.fit()
        cls.idx_p_uc = np.arange(cls.exogc.shape[1])
Example #3
def ppglmfit(X,Y):
    '''
    The GLM solver in statsmodels is very general. It accepts any link
    function and expects that, if you want a constant term in your model,
    you have already added a column of ones to your design matrix. This
    wrapper simplifies using GLM to fit the common case of a Poisson
    point-process model where the constant term has not been explicitly
    added to the design matrix.

    Args:
        X: N_observations x N_features design matrix.
        Y: Binary point process observations
    Returns:
        μ, B: the constant (baseline) term and the parameter estimates
        for the GLM model.
    '''
    if mean(Y) > 0.1:
        print('Caution: spike rate very high, is Poisson assumption valid?')
    if sum(Y) < 100:
        print('Caution: fewer than 100 spikes to fit model')
    # add a constant column to X if the first column is not already constant
    if not all(X[:, 0] == X[0, 0]):
        X = hstack([ones((shape(X)[0], 1), dtype=X.dtype), X])
    poisson_model = GLM(Y, X, family=Poisson())
    poisson_results = poisson_model.fit()
    M = poisson_results.params
    return M[0], M[1:]
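A hypothetical end-to-end call of ppglmfit on simulated spikes; the star-import names the function relies on are pulled from numpy explicitly here, and the true parameters are invented for illustration:

import numpy as np
from numpy import mean, sum, all, ones, hstack, shape  # names ppglmfit uses
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Poisson

np.random.seed(0)
X = np.random.randn(5000, 2)                       # two features, no constant
rate = np.exp(-4.0 + X.dot(np.array([0.5, -0.25])))
Y = (np.random.rand(5000) < rate).astype(float)    # sparse binary spike train

mu_hat, B_hat = ppglmfit(X, Y)
print(mu_hat, B_hat)  # should land near -4.0 and [0.5, -0.25]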
Example #4
    def setup_class(cls):
        df = data_bin
        res = GLM(df['constrict'], df[['const', 'log_rate', 'log_volumne']],
                  family=families.Binomial()).fit(attach_wls=True, atol=1e-10)

        cls.infl1 = res.get_influence()
        cls.infl0 = MLEInfluence(res)
Example #5
def test_cov_params():

    np.random.seed(0)
    n = 1000
    x = np.random.uniform(0, 1, (n, 2))
    x = x - x.mean()
    y = x[:, 0] * x[:, 0] + np.random.normal(0, .01, n)
    y -= y.mean()

    bsplines = BSplines(x, degree=[3] * 2, df=[10] * 2, constraints='center')
    alpha = [0, 0]
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(method='pirls', max_start_irls=0,
                              disp=0, maxiter=5000)
    glm = GLM(y, bsplines.basis)
    res_glm = glm.fit()

    assert_allclose(res_glm.cov_params(), res_glm_gam.cov_params(),
                    rtol=0.0025)

    alpha = 1e-13
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(method='pirls', max_start_irls=0,
                              disp=0, maxiter=5000)

    assert_allclose(res_glm.cov_params(), res_glm_gam.cov_params(),
                    atol=1e-10)

    res_glm_gam = glm_gam.fit(method='bfgs', max_start_irls=0,
                              disp=0, maxiter=5000, maxfun=5000)

    assert_allclose(res_glm.cov_params(), res_glm_gam.cov_params(),
                    rtol=1e-4, atol=1e-8)
Example #6
def test_poisson_residuals():
    nobs, k_exog = 100, 5
    np.random.seed(987125)
    x = np.random.randn(nobs, k_exog - 1)
    x = add_constant(x)

    y_true = x.sum(1) / 2
    y = y_true + 2 * np.random.randn(nobs)
    exposure = 1 + np.arange(nobs) // 4

    yp = np.random.poisson(np.exp(y_true) * exposure)
    yp[10:15] += 10

    fam = sm.families.Poisson()
    mod_poi_e = GLM(yp, x, family=fam, exposure=exposure)
    res_poi_e = mod_poi_e.fit()

    mod_poi_w = GLM(yp / exposure, x, family=fam, var_weights=exposure)
    res_poi_w = mod_poi_w.fit()

    assert_allclose(res_poi_e.resid_response / exposure,
                    res_poi_w.resid_response)
    assert_allclose(res_poi_e.resid_pearson, res_poi_w.resid_pearson)
    assert_allclose(res_poi_e.resid_deviance, res_poi_w.resid_deviance)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        assert_allclose(res_poi_e.resid_anscombe, res_poi_w.resid_anscombe)
    assert_allclose(res_poi_e.resid_anscombe_unscaled,
                    res_poi_w.resid_anscombe_unscaled)
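Related background, as a self-contained sketch: in statsmodels a Poisson exposure is implemented as an offset of log(exposure), so the two parameterizations give identical coefficients (the var_weights relation exercised above is a separate equivalence; data here is synthetic):

import numpy as np
import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLM

np.random.seed(2)
x = sm.add_constant(np.random.randn(50))
expo = np.arange(1, 51, dtype=float)
yp = np.random.poisson(np.exp(x.sum(1) / 4) * expo)

res_e = GLM(yp, x, family=sm.families.Poisson(), exposure=expo).fit()
res_o = GLM(yp, x, family=sm.families.Poisson(), offset=np.log(expo)).fit()
print(np.allclose(res_e.params, res_o.params))  # True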
Example #7
    def setup_class(cls):
        cls.res2 = results_st.results_poisson_hc1
        mod = GLM(endog, exog, family=families.Poisson())
        cls.res1 = mod.fit(cov_type='HC1')

        cls.bse_rob = cls.res1.bse

        cls.corr_fact = cls.get_correction_factor(cls.res1, sub_kparams=False)
Example #8
    def setup_class(cls):
        cls.cov_type = 'cluster'

        mod1 = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))

        mod2 = OLS(endog, exog)
        cls.res2 = mod2.fit(cov_type='cluster', cov_kwds=dict(groups=group))
Example #9
    def setup_class(cls):
        cls.cov_type = 'HC0'

        mod1 = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = mod1.fit(cov_type='HC0')

        mod2 = OLS(endog, exog)
        cls.res2 = mod2.fit(cov_type='HC0')
Example #10
    def test_glm(self):
        # preliminary, getting started with a basic test for GLM.get_prediction
        from statsmodels.genmod.generalized_linear_model import GLM

        res_wls = self.res_wls
        mod_wls = res_wls.model
        y, X, wi = mod_wls.endog, mod_wls.exog, mod_wls.weights

        w_sqrt = np.sqrt(wi)  # notation wi is weights, `w` is var
        mod_glm = GLM(y * w_sqrt, X * w_sqrt[:, None])

        # compare using t distribution
        res_glm = mod_glm.fit(use_t=True)
        pred_glm = res_glm.get_prediction()
        sf_glm = pred_glm.summary_frame()

        pred_res_wls = res_wls.get_prediction()
        sf_wls = pred_res_wls.summary_frame()
        n_compare = 30   # in glm with predict wendog
        assert_allclose(sf_glm.values[:n_compare],
                        sf_wls.values[:n_compare, :4])

        # compare using normal distribution

        res_glm = mod_glm.fit() # default use_t=False
        pred_glm = res_glm.get_prediction()
        sf_glm = pred_glm.summary_frame()

        res_wls = mod_wls.fit(use_t=False)
        pred_res_wls = res_wls.get_prediction()
        sf_wls = pred_res_wls.summary_frame()
        assert_allclose(sf_glm.values[:n_compare],
                        sf_wls.values[:n_compare, :4])

        # function for parameter transformation
        # should be separate test method
        from statsmodels.genmod._prediction import params_transform_univariate
        rates = params_transform_univariate(res_glm.params, res_glm.cov_params())

        rates2 = np.column_stack((np.exp(res_glm.params),
                                  res_glm.bse * np.exp(res_glm.params),
                                  np.exp(res_glm.conf_int())))
        assert_allclose(rates.summary_frame().values, rates2, rtol=1e-13)

        from statsmodels.genmod.families import links

        # with identity transform
        pt = params_transform_univariate(res_glm.params, res_glm.cov_params(),
                                         link=links.identity())

        assert_allclose(pt.tvalues, res_glm.tvalues, rtol=1e-13)
        assert_allclose(pt.se_mean, res_glm.bse, rtol=1e-13)
        ptt = pt.t_test()
        assert_allclose(ptt[0], res_glm.tvalues, rtol=1e-13)
        assert_allclose(ptt[1], res_glm.pvalues, rtol=1e-13)

        # prediction with exog and no weights does not error
        res_glm = mod_glm.fit()
        pred_glm = res_glm.get_prediction(X)
Example #11
    def init(cls):
        cov_type = 'HC0'
        cls.res2 = cls.mod2.fit(cov_type=cov_type)
        mod = GLM(cls.endog, cls.exogc,
                  offset=0.5 * cls.exog[:, cls.idx_c].squeeze(),
                  var_weights=cls.aweights)
        mod.exog_names[:] = ['const', 'x2', 'x3', 'x4']
        cls.res1 = mod.fit(cov_type=cov_type)
        cls.idx_p_uc = np.arange(cls.exogc.shape[1])
Example #12
    def setup_class(cls):
        endog_bin = (endog > endog.mean()).astype(int)
        cls.cov_type = 'cluster'

        mod1 = GLM(endog_bin, exog, family=families.Binomial())
        cls.res1 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))

        mod1 = smd.Logit(endog_bin, exog)
        cls.res2 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))
Example #13
    def setup_class(cls):
        endog_bin = (endog > endog.mean()).astype(int)
        cls.cov_type = 'cluster'

        mod1 = GLM(endog_bin, exog,
                   family=families.Binomial(link=links.CDFLink()))
        cls.res1 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))

        mod1 = smd.Probit(endog_bin, exog)
        cls.res2 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))
Example #14
    def setup_class(cls):
        cls.res2 = results_st.results_poisson_hc1
        mod = GLM(endog, exog, family=families.Poisson())
        cls.res1 = mod.fit()

        #res_hc0_ = cls.res1.get_robustcov_results('HC1')
        get_robustcov_results(cls.res1._results, 'HC1', use_self=True)
        cls.bse_rob = cls.res1.bse

        cls.corr_fact = cls.get_correction_factor(cls.res1, sub_kparams=False)
Example #15
    def setup_class(cls):

        cls.cov_type = 'HAC'

        kwds = {'maxlags': 2}
        mod1 = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

        mod2 = OLS(endog, exog)
        cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds)
Example #16
    def setup_class(cls):
        cls.res2 = results_st.results_poisson_hc1
        mod = GLM(endog, exog, family=families.Poisson())
        cls.res1 = mod.fit(cov_type='HC1')

        cls.bse_rob = cls.res1.bse
        nobs, k_vars = mod.exog.shape
        corr_fact = nobs / float(nobs - 1.)
        # for bse we need the sqrt of the correction factor
        cls.corr_fact = np.sqrt(1. / corr_fact)
Example #17
    def _initialize(cls):
        y, x = cls.y, cls.x

        modp = GLM(y, x, family=family.Poisson())
        cls.res2 = modp.fit()

        mod = GLMPenalized(y, x, family=family.Poisson(), penal=cls.penalty)
        mod.pen_weight = 0
        cls.res1 = mod.fit(method='bfgs', maxiter=100, disp=0)

        cls.atol = 5e-6
Example #18
    def setup_class(cls):
        endog_bin = (endog > endog.mean()).astype(int)
        cls.cov_type = 'cluster'

        mod1 = GLM(endog_bin, exog, family=families.Binomial(link=links.probit()))
        cls.res1 = mod1.fit(method='newton',
                            cov_type='cluster', cov_kwds=dict(groups=group))

        mod1 = smd.Probit(endog_bin, exog)
        cls.res2 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))
        cls.rtol = 1e-6
Example #19
    def setup_class(cls):
        yi = np.array([0, 2, 14, 19, 30])
        ni = 40 * np.ones(len(yi))
        xi = np.arange(1, len(yi) + 1)
        exog = np.column_stack((np.ones(len(yi)), xi))
        endog = np.column_stack((yi, ni - yi))

        res = GLM(endog, exog, family=families.Binomial()).fit()

        cls.infl1 = res.get_influence()
        cls.infl0 = MLEInfluence(res)
        cls.cd_rtol = 5e-5
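A hedged sketch of what the influence objects above expose, reusing the same small binomial dataset; attribute names follow statsmodels' GLMInfluence and assume a version with GLMResults.get_influence:

import numpy as np
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families

yi = np.array([0, 2, 14, 19, 30])
ni = 40 * np.ones(len(yi))
xi = np.arange(1, len(yi) + 1)
exog = np.column_stack((np.ones(len(yi)), xi))
endog = np.column_stack((yi, ni - yi))

res = GLM(endog, exog, family=families.Binomial()).fit()
infl = res.get_influence()
print(infl.hat_matrix_diag)    # leverage per observation
print(infl.cooks_distance[0])  # Cook's distance
print(infl.summary_frame())    # per-observation influence table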
Example #20
    def setup_class(cls):

        cls.cov_type = 'HAC'

        # check kernel specified as string
        kwds = {'kernel': 'bartlett', 'maxlags': 2}
        mod1 = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

        mod2 = OLS(endog, exog)
        kwds2 = {'maxlags': 2}
        cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds2)
Example #21
    def setup_class(cls):
        cls.res2 = results_st.results_poisson_hc1
        mod = GLM(endog, exog, family=families.Poisson())
        cls.res1 = mod.fit()

        #res_hc0_ = cls.res1.get_robustcov_results('HC1')
        get_robustcov_results(cls.res1._results, 'HC1', use_self=True)
        cls.bse_rob = cls.res1.bse
        nobs, k_vars = mod.exog.shape
        corr_fact = nobs / float(nobs - 1.)
        # for bse we need the sqrt of the correction factor
        cls.corr_fact = np.sqrt(1. / corr_fact)
Example #22
    def setup_class(cls):

        cls.cov_type = 'HAC'

        kwds = {'kernel': sw.weights_uniform, 'maxlags': 2}
        mod1 = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

        # check kernel specified as string
        mod2 = OLS(endog, exog)
        kwds2 = {'kernel': 'uniform', 'maxlags': 2}
        cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds2)
Example #23
    def _initialize(cls):
        y, x = cls.y, cls.x
        modp = GLM(y, x[:, :cls.k_nonzero], family=family.Poisson())
        cls.res2 = modp.fit()

        mod = GLMPenalized(y, x, family=family.Poisson(), penal=cls.penalty)
        mod.pen_weight *= 1.5  # same as discrete Poisson
        mod.penal.tau = 0.05
        cls.res1 = mod.fit(method='bfgs', maxiter=100)

        cls.exog_index = slice(None, cls.k_nonzero, None)

        cls.atol = 5e-3
Example #24
    def _initialize(cls):
        y, x = cls.y, cls.x
        modp = GLM(y, x[:, :cls.k_nonzero], family=family.Binomial())
        cls.res2 = modp.fit(disp=0)

        mod = GLMPenalized(y, x, family=family.Binomial(), penal=cls.penalty)
        mod.pen_weight *= .5
        mod.penal.tau = 0.05
        cls.res1 = mod.fit(method='bfgs', maxiter=100, disp=0)

        cls.exog_index = slice(None, cls.k_nonzero, None)

        cls.atol = 5e-3
Example #25
    def setup_class(cls):

        cls.cov_type = 'HAC'

        kwds = {'kernel': sw.weights_uniform, 'maxlags': 2}
        mod1 = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

        mod2 = OLS(endog, exog)
        cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds)

        # for debugging
        cls.res3 = mod2.fit(cov_type='HAC', cov_kwds={'maxlags': 2})
Example #26
    def setup_class(cls):
        cls.cov_type = 'hac-panel'
        # time index is just made up to have a test case
        groups = np.repeat(np.arange(5), 7)[:-1]
        mod1 = GLM(endog.copy(), exog.copy(), family=families.Gaussian())
        kwds = dict(groups=pd.Series(groups),  # check for #3606
                    maxlags=2,
                    kernel=sw.weights_uniform,
                    use_correction='hac',
                    df_correction=False)
        cls.res1 = mod1.fit(cov_type='hac-panel', cov_kwds=kwds)

        mod2 = OLS(endog, exog)
        cls.res2 = mod2.fit(cov_type='hac-panel', cov_kwds=kwds)
Example #27
    def setup_class(cls):
        cls.idx = slice(None)  # params sequence same as Stata
        #res1ul = Logit(data.endog, data.exog).fit(method="newton", disp=0)
        cls.res2 = reslogit.results_constraint2

        mod1 = GLM(spector_data.endog, spector_data.exog,
                   family=families.Binomial())

        constr = 'x1 - x3 = 0'
        cls.res1m = mod1.fit_constrained(constr, atol=1e-10)

        R, q = cls.res1m.constraints.coefs, cls.res1m.constraints.constants
        cls.res1 = fit_constrained(mod1, R, q, fit_kwds={'atol': 1e-10})
        cls.constraints_rq = (R, q)
Example #28
    def setup_class(cls):
        cls.res2 = results_st.results_poisson_clu
        mod = GLM(endog, exog, family=families.Poisson())
        cls.res1 = res1 = mod.fit()

        get_robustcov_results(cls.res1._results, 'cluster',
                              groups=group,
                              use_correction=True,
                              df_correction=True,  # TODO: has no effect
                              use_t=False,  # True,
                              use_self=True)
        cls.bse_rob = cls.res1.bse

        cls.corr_fact = cls.get_correction_factor(cls.res1)
Example #29
    def setup_class(cls):
        cls.cov_type = 'hac-groupsum'
        # time index is just made up to have a test case
        time = np.tile(np.arange(7), 5)[:-1]
        mod1 = GLM(endog, exog, family=families.Gaussian())
        kwds = dict(time=pd.Series(time),  # check for #3606
                    maxlags=2,
                    use_correction='hac',
                    df_correction=False)
        cls.res1 = mod1.fit(cov_type='hac-groupsum', cov_kwds=kwds)
        cls.res1b = mod1.fit(cov_type='nw-groupsum', cov_kwds=kwds)

        mod2 = OLS(endog, exog)
        cls.res2 = mod2.fit(cov_type='hac-groupsum', cov_kwds=kwds)
Example #30
    def setup_class(cls):
        cls.cov_type = 'hac-panel'
        # time index is just made up to have a test case
        time = np.tile(np.arange(7), 5)[:-1]
        mod1 = GLM(endog.copy(), exog.copy(), family=families.Gaussian())
        kwds = dict(time=time,
                    maxlags=2,
                    kernel=sw.weights_uniform,
                    use_correction='hac',
                    df_correction=False)
        cls.res1 = mod1.fit(cov_type='hac-panel', cov_kwds=kwds)
        cls.res1b = mod1.fit(cov_type='nw-panel', cov_kwds=kwds)

        mod2 = OLS(endog, exog)
        cls.res2 = mod2.fit(cov_type='hac-panel', cov_kwds=kwds)
Example #31
    def __init__(self):
        '''
        Test Binomial family with canonical logit link using star98 dataset.
        '''
        self.decimal_resids = DECIMAL_1
        self.decimal_bic = DECIMAL_2

        from statsmodels.datasets.star98 import load
        from .results.results_glm import Star98
        data = load()
        data.exog = add_constant(data.exog, prepend=False)
        self.res1 = GLM(data.endog, data.exog,
                        family=sm.families.Binomial()).fit()
        #NOTE: if you want to replicate with RModel
        #res2 = RModel(data.endog[:,0]/trials, data.exog, r.glm,
        #        family=r.binomial, weights=trials)

        self.res2 = Star98()
Example #32
    def __init__(self):
        '''
        Tests Poisson family with canonical log link.

        Test results were obtained by R.
        '''
        from .results.results_glm import Cpunish
        from statsmodels.datasets.cpunish import load
        self.data = load()
        self.data.exog[:, 3] = np.log(self.data.exog[:, 3])
        self.data.exog = add_constant(self.data.exog, prepend=False)
        self.res1 = GLM(self.data.endog,
                        self.data.exog,
                        family=sm.families.Poisson()).fit()
        self.res2 = Cpunish()
        # compare with discrete, start close to save time
        modd = discrete.Poisson(self.data.endog, self.data.exog)
        self.resd = modd.fit(start_params=self.res1.params * 0.9, disp=False)
Example #33
    def _delta_hat_estimation(self, temp_y, temp_x, temp_t):
        """Estimates delta to correct treatment estimation"""
        H_a = []

        for idx, treatment in enumerate(np.asarray(temp_t)):
            if treatment == 1:
                H_a.append(1 / self.pi_hat1[idx])
            elif treatment == 0:
                H_a.append(-1 / self.pi_hat0[idx])

        H_a = np.array(H_a)

        # create a GLM with H_a as the design and y_hat_a as a fixed offset
        targeting_model = GLM(endog=np.asarray(temp_y),
                              exog=H_a,
                              offset=np.asarray(self.y_hat_a)).fit()

        return targeting_model.params[0]
Example #34
    def setup_class(cls):
        from statsmodels.base._constraints import fit_constrained

        cls.res2 = results.results_noexposure_constraint
        # 2 is dropped baseline for categorical
        cls.idx = [7, 3, 4, 5, 6, 0, 1]

        # example without offset
        formula = 'deaths ~ logpyears + smokes + C(agecat)'
        mod = GLM.from_formula(formula, data=data,
                               family=families.Poisson())

        constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
        lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
        cls.res1 = fit_constrained(mod, lc.coefs, lc.constants,
                                   fit_kwds={'atol': 1e-10})
        cls.constraints = lc
        cls.res1m = mod.fit_constrained(constr, atol=1e-10)
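A minimal self-contained sketch of fit_constrained on a formula-based GLM, using invented Poisson data; the constraint uses the same string syntax as above:

import numpy as np
import pandas as pd
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families

np.random.seed(123)
df = pd.DataFrame({'x1': np.random.rand(200), 'x2': np.random.rand(200)})
df['y'] = np.random.poisson(np.exp(0.3 + 0.5 * df['x1'] + 0.5 * df['x2']))

mod = GLM.from_formula('y ~ x1 + x2', data=df, family=families.Poisson())
res = mod.fit_constrained('x1 = x2')  # force the two slopes to be equal
print(res.params)  # the x1 and x2 coefficients come out identical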
Example #35
    def setup_class(cls):
        fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
        # faking aweights by using normalized freq_weights
        fweights = np.array(fweights)
        wsum = fweights.sum()
        nobs = len(cpunish_data.endog)
        aweights = fweights / wsum * nobs
        cls.corr_fact = np.sqrt((wsum - 1.) / wsum)

        cls.res1 = GLM(
            cpunish_data.endog,
            cpunish_data.exog,
            family=sm.families.Poisson(),
            freq_weights=fweights).fit(
                cov_type='HC0')  #, cov_kwds={'use_correction':False})
        # compare with discrete, start close to save time
        #modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
        cls.res2 = res_stata.results_poisson_fweight_hc1
Example #36
    def __init__(self):
        '''
        Test Gaussian family with canonical identity link
        '''
        # Test Precisions
        self.decimal_resids = DECIMAL_3
        self.decimal_params = DECIMAL_2
        self.decimal_bic = DECIMAL_0
        self.decimal_bse = DECIMAL_3

        from statsmodels.datasets.longley import load
        self.data = load()
        self.data.exog = add_constant(self.data.exog, prepend=False)
        self.res1 = GLM(self.data.endog,
                        self.data.exog,
                        family=sm.families.Gaussian()).fit()
        from .results.results_glm import Longley
        self.res2 = Longley()
Example #37
def test_warnings_raised():
    weights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
    # faking aweights by using normalized freq_weights
    weights = np.array(weights)

    gid = np.arange(1, 17 + 1) // 2

    cov_kwds = {'groups': gid, 'use_correction': False}

    with pytest.warns(SpecificationWarning):
        res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                   family=sm.families.Poisson(), freq_weights=weights
                   ).fit(cov_type='cluster', cov_kwds=cov_kwds)
        res1.summary()

    with pytest.warns(SpecificationWarning):
        res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                   family=sm.families.Poisson(), var_weights=weights
                   ).fit(cov_type='cluster', cov_kwds=cov_kwds)
        res1.summary()
Example #38
def test_est_unregularized_naive():

    # tests that the shapes of all the intermediate steps
    # remain correct for unregularized naive estimation,
    # for both OLS and GLM

    np.random.seed(435265)
    X = np.random.normal(size=(50, 3))
    y = np.random.randint(0, 2, size=50)
    beta = np.random.normal(size=3)
    mod = OLS(y, X)
    res = _est_unregularized_naive(mod, 0, 2, fit_kwds={"alpha": 0.5})

    assert_equal(res.shape, beta.shape)

    mod = GLM(y, X, family=Binomial())
    res = _est_unregularized_naive(mod, 0, 2, fit_kwds={"alpha": 0.5})

    assert_equal(res.shape, beta.shape)
Example #39
    def setup_class(cls):
        fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
        # faking aweights by using normalized freq_weights
        fweights = np.array(fweights)
        wsum = fweights.sum()
        nobs = len(cpunish_data.endog)
        aweights = fweights / wsum * nobs

        cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                       family=sm.families.Poisson(), var_weights=aweights
                       ).fit()

        # Need to copy to avoid inplace adjustment
        from copy import copy
        cls.res2 = copy(res_stata.results_poisson_aweight_nonrobust)
        cls.res2.resids = cls.res2.resids.copy()

        # Need to adjust resids for pearson and deviance to add weights
        cls.res2.resids[:, 3:5] *= np.sqrt(aweights[:, np.newaxis])
Example #40
    def __init__(self):
        '''
        Tests Gamma family with canonical inverse link (power -1)
        '''
        # Test Precisions
        self.decimal_aic_R = -1  # TODO: off by about 1, we agree with Stata
        self.decimal_resids = DECIMAL_2

        from statsmodels.datasets.scotland import load
        from results.results_glm import Scotvote
        data = load()
        data.exog = add_constant(data.exog)
        res1 = GLM(data.endog, data.exog,
                   family=sm.families.Gamma()).fit()
        self.res1 = res1
#        res2 = RModel(data.endog, data.exog, r.glm, family=r.Gamma)
        res2 = Scotvote()
        res2.aic_R += 2 # R doesn't count degree of freedom for scale with gamma
        self.res2 = res2
Example #41
    def init(self):
        nobs = self.nobs
        y_true, x, exog = self.y_true, self.x, self.exog
        if not hasattr(self, 'scale'):
            scale = 1
        else:
            scale = self.scale

        f = self.family

        self.mu_true = mu_true = f.link.inverse(y_true)

        np.random.seed(8765993)
        #y_obs = np.asarray([stats.poisson.rvs(p) for p in mu], float)
        y_obs = self.rvs(mu_true, scale=scale, size=nobs)  # this should work
        m = GAM(y_obs, x, family=f)  # TODO: y_obs is passed twice, to __init__ and to fit
        m.fit(y_obs, maxiter=100)
        res_gam = m.results
        self.res_gam = res_gam  #attached for debugging
        self.mod_gam = m  #attached for debugging

        res_glm = GLM(y_obs, exog, family=f).fit()

        #Note: there still are some naming inconsistencies
        self.res1 = res1 = Dummy()  #for gam model
        #res2 = Dummy() #for benchmark
        self.res2 = res2 = res_glm  #reuse existing glm results, will add additional

        #eta in GLM terminology
        res2.y_pred = res_glm.model.predict(res_glm.params, exog, linear=True)
        res1.y_pred = res_gam.predict(x)
        res1.y_predshort = res_gam.predict(x[:10])  #, linear=True)

        #mu
        res2.mu_pred = res_glm.model.predict(res_glm.params,
                                             exog,
                                             linear=False)
        res1.mu_pred = res_gam.mu

        #parameters
        slopes = [i for ss in m.smoothers for i in ss.params[1:]]
        const = res_gam.alpha + sum([ss.params[1] for ss in m.smoothers])
        res1.params = np.array([const] + slopes)
Example #42
def test_score_test_OLS():
    # nicer example than Longley
    from statsmodels.regression.linear_model import OLS
    np.random.seed(5)
    nobs = 100
    sige = 0.5
    x = np.random.uniform(0, 1, size=(nobs, 5))
    x[:, 0] = 1
    beta = 1. / np.arange(1., x.shape[1] + 1)
    y = x.dot(beta) + sige * np.random.randn(nobs)

    res_ols = OLS(y, x).fit()
    res_olsc = OLS(y, x[:, :-2]).fit()
    co = res_ols.compare_lm_test(res_olsc, demean=False)

    res_glm = GLM(y, x[:, :-2], family=sm.families.Gaussian()).fit()
    co2 = res_glm.model.score_test(res_glm.params, exog_extra=x[:, -2:])
    # difference in df_resid versus nobs in scale see #1786
    assert_allclose(co[0] * 97 / 100., co2[0], rtol=1e-13)
Example #43
    def setup_class(cls):
        fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
        # faking aweights by using normalized freq_weights
        fweights = np.array(fweights)
        wsum = fweights.sum()
        nobs = len(cpunish_data.endog)
        aweights = fweights / wsum * nobs

        # This is really close when corr_fact = (wsum - 1.) / wsum, but to
        # avoid having to loosen the precision of the assert_allclose, I'm
        # doing this manually. It's *possible* that lowering the IRLS
        # convergence criterion in Stata and here will make this less sketchy.
        cls.corr_fact = np.sqrt((wsum - 1.) / wsum) * 0.98518473599905609
        cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                       family=sm.families.Poisson(), var_weights=aweights
                       ).fit(cov_type='HC0')  # , cov_kwds={'use_correction': False}
        # compare with discrete, start close to save time
        # modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
        cls.res2 = res_stata.results_poisson_aweight_hc1
Example #44
    def test_basic(self):
        res1 = self.res1
        res2 = self.res2

        assert_allclose(self.eff, res2.TE, rtol=1e-13)
        assert_allclose(self.var_eff, res2.seTE**2, rtol=1e-13)

        assert_allclose(res1.mean_effect_fe, res2.TE_fixed, rtol=1e-13)
        # R meta does not adjust sd FE for HKSJ
        assert_allclose(res1.sd_eff_w_fe, res2.seTE_fixed, rtol=1e-13)

        assert_allclose(res1.q, res2.Q, rtol=1e-13)
        assert_allclose(res1.tau2, res2.tau2, rtol=1e-10)

        assert_allclose(res1.mean_effect_re, res2.TE_random, rtol=1e-13)
        assert_allclose(res1.sd_eff_w_re_hksj, res2.seTE_random, rtol=1e-13)

        th = res1.test_homogeneity()
        q, pv = th
        df = th.df
        assert_allclose(q, res2.Q, rtol=1e-13)
        assert_allclose(pv, res2.pval_Q, rtol=1e-13)
        assert_allclose(df, res2.df_Q, rtol=1e-13)

        assert_allclose(res1.i2, res2.I2, rtol=1e-13)
        assert_allclose(res1.h2, res2.H**2, rtol=1e-13)

        ci = res1.conf_int(use_t=True)  # fe, re, fe_wls, re_wls
        # R meta does not adjust FE for HKSJ, still uses normal dist
        # assert_allclose(ci[0][0], res2.lower_fixed, atol=1e-10)
        # assert_allclose(ci[0][1], res2.upper_fixed, atol=1e-10)
        assert_allclose(ci[3][0], res2.lower_random, rtol=1e-13)
        assert_allclose(ci[3][1], res2.upper_random, rtol=1e-10)

        ci = res1.conf_int(use_t=False)  # fe, re, fe_wls, re_wls
        assert_allclose(ci[0][0], res2.lower_fixed, rtol=1e-13)
        assert_allclose(ci[0][1], res2.upper_fixed, rtol=1e-13)

        weights = 1 / self.var_eff
        mod_glm = GLM(self.eff, np.ones(len(self.eff)),
                      var_weights=weights)
        res_glm = mod_glm.fit()
        assert_allclose(res_glm.params, res2.TE_fixed, rtol=1e-13)

        weights = 1 / (self.var_eff + res1.tau2)
        mod_glm = GLM(self.eff, np.ones(len(self.eff)),
                      var_weights=weights)
        res_glm = mod_glm.fit()
        assert_allclose(res_glm.params, res2.TE_random, rtol=1e-13)
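The last two fits rely on a standard identity worth seeing in isolation: a constant-only Gaussian GLM with var_weights = 1/variance reproduces the inverse-variance-weighted (fixed-effect) mean. A self-contained sketch with made-up effect sizes:

import numpy as np
from statsmodels.genmod.generalized_linear_model import GLM

eff = np.array([0.2, 0.35, 0.1, 0.25])          # invented study effects
var_eff = np.array([0.01, 0.02, 0.015, 0.005])  # invented variances
w = 1.0 / var_eff
res = GLM(eff, np.ones(len(eff)), var_weights=w).fit()
print(res.params[0], np.sum(w * eff) / np.sum(w))  # the same number twice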
Example #45
    def setup_class(cls):
        from statsmodels.genmod.generalized_linear_model import GLM
        from statsmodels.genmod import families
        from statsmodels.base._constraints import fit_constrained

        cls.res2 = results.results_exposure_constraint
        cls.idx = [6, 2, 3, 4, 5, 0]  # 2 is dropped baseline for categorical

        # example with offset
        formula = 'deaths ~ smokes + C(agecat)'
        mod = GLM.from_formula(formula,
                               data=data,
                               family=families.Poisson(),
                               offset=np.log(data['pyears'].values))

        constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
        lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
        cls.res1 = fit_constrained(mod, lc.coefs, lc.constants)
        cls.constraints = lc
        cls.res1m = mod.fit_constrained(constr)._results
Example #46
    def setup_class(cls):
        fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
        # faking aweights by using normalized freq_weights
        fweights = np.array(fweights)
        wsum = fweights.sum()
        nobs = len(cpunish_data.endog)
        aweights = fweights / wsum * nobs

        gid = np.arange(1, 17 + 1) // 2
        n_groups = len(np.unique(gid))

        # no wnobs yet in the sandwich covariance calculation
        cls.corr_fact = 1 / np.sqrt(n_groups / (n_groups - 1))  # np.sqrt((wsum - 1.) / wsum)
        cov_kwds = {'groups': gid, 'use_correction': False}
        cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                       family=sm.families.Poisson(), freq_weights=fweights
                       ).fit(cov_type='cluster', cov_kwds=cov_kwds)
        # compare with discrete, start close to save time
        #modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
        cls.res2 = res_stata.results_poisson_fweight_clu1
Example #47
    def __init__(self):
        '''
        Tests the Inverse Gaussian family in GLM.

        Notes
        -----
        Used the rndivgx.ado file provided by Hardin and Hilbe to
        generate the data.  Results are read from model_results, which
        were obtained by running R_ig.s
        '''
        # Test Precisions
        self.decimal_aic_R = DECIMAL_0
        self.decimal_loglike = DECIMAL_0

        from results.results_glm import InvGauss
        res2 = InvGauss()
        res1 = GLM(res2.endog, res2.exog,
                   family=sm.families.InverseGaussian()).fit()
        self.res1 = res1
        self.res2 = res2
Example #48
def compute_chi2_null_test(model_results, data, dep_var, max_iter, l2_weight):
    """
    Likelihood-ratio test against the null (intercept-only) model:
    -2 * (LL(null) - LL(model)) is asymptotically chi-squared with
    df_model degrees of freedom.
    """
    null_formula = '%s ~ 1' % (dep_var)
    null_model = GLM.from_formula(null_formula,
                                  data,
                                  family=Binomial(link=logit()))
    null_model_results = null_model.fit_regularized(maxiter=max_iter,
                                                    method='elastic_net',
                                                    alpha=l2_weight,
                                                    L1_wt=0.0)
    model_loglike = model_results.model.loglike(model_results.params)
    null_model_loglike = null_model_results.model.loglike(
        null_model_results.params)
    llr = -2 * (null_model_loglike - model_loglike)
    model_df = model_results.model.df_model
    p_val = chi2.sf(llr, model_df)
    return llr, model_df, p_val
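A hypothetical, self-contained call of compute_chi2_null_test with simulated data; the full-model fit mirrors the function's own elastic-net null fit, and the link-class spelling follows the snippet's older statsmodels API:

import numpy as np
import pandas as pd
from scipy.stats import chi2
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Binomial
from statsmodels.genmod.families.links import logit

np.random.seed(1)
data = pd.DataFrame({'x': np.random.randn(300)})
data['y'] = (np.random.rand(300) < 1 / (1 + np.exp(-data['x']))).astype(int)

full = GLM.from_formula('y ~ x', data, family=Binomial(link=logit())
                        ).fit_regularized(maxiter=100, method='elastic_net',
                                          alpha=0.1, L1_wt=0.0)
llr, df_model, p_val = compute_chi2_null_test(full, data, 'y',
                                              max_iter=100, l2_weight=0.1)
print(llr, df_model, p_val)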
Example #49
def test_est_regularized_debiased():

    # tests that the shapes of all the intermediate steps
    # remain correct for regularized debiased estimation,
    # for both OLS and GLM

    np.random.seed(435265)
    X = np.random.normal(size=(50, 3))
    y = np.random.randint(0, 2, size=50)
    beta = np.random.normal(size=3)
    mod = OLS(y, X)
    res = _est_regularized_debiased(mod, 0, 2, fit_kwds={"alpha": 0.5})
    bhat = res[0]
    grad = res[1]
    ghat_l = res[2]
    that_l = res[3]

    assert_(isinstance(res, tuple))
    assert_equal(bhat.shape, beta.shape)
    assert_equal(grad.shape, beta.shape)
    assert_(isinstance(ghat_l, list))
    assert_(isinstance(that_l, list))
    assert_equal(len(ghat_l), len(that_l))
    assert_equal(ghat_l[0].shape, (2, ))
    assert_(isinstance(that_l[0], float))

    mod = GLM(y, X, family=Binomial())
    res = _est_regularized_debiased(mod, 0, 2, fit_kwds={"alpha": 0.5})
    bhat = res[0]
    grad = res[1]
    ghat_l = res[2]
    that_l = res[3]

    assert_(isinstance(res, tuple))
    assert_equal(bhat.shape, beta.shape)
    assert_equal(grad.shape, beta.shape)
    assert_(isinstance(ghat_l, list))
    assert_(isinstance(that_l, list))
    assert_equal(len(ghat_l), len(that_l))
    assert_equal(ghat_l[0].shape, (2, ))
    assert_(isinstance(that_l[0], float))
Example #50
    def setup_class(cls):
        vs = Independence()
        family = families.Poisson()
        np.random.seed(987126)
        Y = np.exp(1 + np.random.normal(size=100))
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = GEE.from_formula("Y ~ X1 + X2 + X3",
                                groups,
                                D,
                                family=family,
                                cov_struct=vs)
        cls.result1 = mod1.fit()

        mod2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=D, family=family)
        cls.result2 = mod2.fit(disp=False)
Example #51
    def __init__(self, model, taylor):
        self.model = model
        self.stats = (model.dm_statistics
                      if hasattr(model, "dm_statistics") else None)
        self.dm = pd.DataFrame({
            lev: t.data[:, i]
            for t in model.fixed_terms.values()
            for i, lev in enumerate(t.levels)
        })
        self.priors = {}
        missing = "drop" if self.model.dropna else "none"
        self.mle = GLM(
            endog=self.model.y.data,
            exog=self.dm,
            family=self.model.family.smfamily(),
            missing=missing,
        ).fit()
        self.taylor = taylor
        with open(join(dirname(__file__), "config", "derivs.txt"),
                  "r") as file:
            self.deriv = [next(file).strip("\n") for x in range(taylor + 1)]
Example #52
    def _fit_mle(self):
        """Fits MLE of the common part of the model.

        This used to be called in the class instantiation, but there is no need to fit the GLM when
        there are no automatic priors. So this method is only called when needed.
        """
        missing = "drop" if self.model.dropna else "none"
        try:
            self.mle = GLM(
                endog=self.model.response.data,
                exog=self.dm,
                family=self.model.family.smfamily(self.model.family.smlink),
                missing=missing,
            ).fit()
        except PerfectSeparationError as error:
            msg = "Perfect separation detected, automatic priors are not available. "
            msg += "Please indicate priors manually."
            raise PerfectSeparationError(msg) from error
        except:
            print("Unexpected error:", sys.exc_info()[0])
            raise
Example #53
    def setup_class(cls):

        vs = Independence()
        family = families.Gaussian()
        np.random.seed(987126)
        Y = np.random.normal(size=100)
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.kron(np.arange(20), np.ones(5))

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        md = GEE.from_formula("Y ~ X1 + X2 + X3",
                              groups,
                              D,
                              family=family,
                              cov_struct=vs)
        cls.result1 = md.fit()

        cls.result2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=D).fit()
Example #54
def test_warnings_raised():
    if sys.version_info < (3, 4):
        raise SkipTest
    weights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
    # faking aweights by using normalized freq_weights
    weights = np.array(weights)

    gid = np.arange(1, 17 + 1) // 2

    cov_kwds = {'groups': gid, 'use_correction': False}
    with warnings.catch_warnings(record=True) as w:
        res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                   family=sm.families.Poisson(), freq_weights=weights
                   ).fit(cov_type='cluster', cov_kwds=cov_kwds)
        res1.summary()
        assert len(w) >= 1

    with warnings.catch_warnings(record=True) as w:
        res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                   family=sm.families.Poisson(), var_weights=weights
                   ).fit(cov_type='cluster', cov_kwds=cov_kwds)
        res1.summary()
        assert len(w) >= 1
Example #55
    def fit_scores(self, balance=True, nmodels=None, k=3):
        if not self.formula:
            # use all columns in the model (untransformed)
            self.formula = '{} ~ {}'.format(self.yvar, '+'.join(self.xvars))
            if self.stepwise:
                print("Optimizing Formula via forward stepwise selection...")
                # use all columns + transformed columns in the model
                self.formula, self.swdata = \
                    self.forward_stepwise(self.balanced_sample(), self.yvar, k=k)
        if balance:
            if nmodels is None:
                # fit multiple models based on imbalance severity
                # (rounded up to the nearest tenth)
                minor, major = [
                    self.data[self.data[self.yvar] == i]
                    for i in (self.minority, self.majority)
                ]
                nmodels = int(np.ceil((len(major) / len(minor)) / 10) * 10)
            self.nmodels = nmodels
            for i in range(nmodels):
                progress(
                    i + 1,
                    nmodels,
                    prestr="Fitting {} Models on Balanced Samples...".format(
                        nmodels))

                # sample from the majority to create a balanced dataset
                df = self.balanced_sample()
                y_samp, X_samp = patsy.dmatrices(self.formula,
                                                 data=df,
                                                 return_type='dataframe')
                glm = GLM(y_samp, X_samp, family=sm.families.Binomial())
                res = glm.fit()
                self.model_accurracy.append(
                    self._scores_to_accuracy(res, X_samp, y_samp))
                self.models.append(res)
            print("\nAverage Accuracy: {}%".format(
                round(np.mean(self.model_accurracy) * 100, 2)))
        else:
            # ignore any imbalance and fit one model
            self.nmodels = 1
            print('\nFitting 1 (Unbalanced) Model...')
            glm = GLM(self.y, self.X, family=sm.families.Binomial())
            res = glm.fit()
            self.model_accurracy.append(
                self._scores_to_accuracy(res, self.X, self.y))
            self.models.append(res)
            print("Accuracy", round(np.mean(self.model_accurracy[0]) * 100, 2))
Example #56
    def __init__(self):
        '''
        Test Negative Binomial family with canonical log link
        '''
        # Test Precision
        self.decimal_resid = DECIMAL_1
        self.decimal_params = DECIMAL_3
        self.decimal_resids = -1 # 1 % mismatch at 0
        self.decimal_fittedvalues = DECIMAL_1

        from statsmodels.datasets.committee import load
        self.data = load()
        self.data.exog[:, 2] = np.log(self.data.exog[:, 2])
        interaction = self.data.exog[:, 2] * self.data.exog[:, 1]
        self.data.exog = np.column_stack((self.data.exog, interaction))
        self.data.exog = add_constant(self.data.exog)
        self.res1 = GLM(self.data.endog, self.data.exog,
                        family=sm.families.NegativeBinomial()).fit()
        from results.results_glm import Committee
        res2 = Committee()
        res2.aic_R += 2 # They don't count a degree of freedom for the scale
        self.res2 = res2
Example #57
    def setup_class(cls):
        # adjusted for Gamma, not in test_gee.py
        vs = Independence()
        family = families.Gamma(link=links.log)
        np.random.seed(987126)
        #Y = np.random.normal(size=100)**2
        Y = np.exp(0.1 + np.random.normal(size=100))  # log-normal
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = GEE.from_formula("Y ~ X1 + X2 + X3",
                                groups,
                                D,
                                family=family,
                                cov_struct=vs)
        cls.result1 = mod1.fit()

        mod2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=D, family=family)
        cls.result2 = mod2.fit(disp=False)
Example #58
def test_calc_wdesign_mat():

    # separately tests that _calc_wdesign_mat
    # returns sensible results
    #
    # regression test

    np.random.seed(435265)
    X = np.random.normal(size=(3, 3))
    y = np.random.randint(0, 2, size=3)
    beta = np.random.normal(size=3)
    mod = OLS(y, X)
    dmat = _calc_wdesign_mat(mod, beta, {})
    assert_allclose(dmat, np.array([[1.306314, -0.024897, 1.326498],
                                    [-0.539219, -0.483028, -0.703503],
                                    [-3.327987, 0.524541, -0.139761]]),
                    atol=1e-6, rtol=0)

    mod = GLM(y, X, family=Binomial())
    dmat = _calc_wdesign_mat(mod, beta, {})
    assert_allclose(dmat, np.array([[0.408616, -0.007788, 0.41493],
                                    [-0.263292, -0.235854, -0.343509],
                                    [-0.11241, 0.017718, -0.004721]]),
                    atol=1e-6, rtol=0)
Example #59
    def setupClass(cls):
        self = cls  # alias

        fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
        # faking aweights by using normalized freq_weights
        fweights = np.array(fweights)
        wsum = fweights.sum()
        nobs = len(cpunish_data.endog)
        aweights = fweights / wsum * nobs

        self.res1 = GLM(cpunish_data.endog,
                        cpunish_data.exog,
                        family=sm.families.Poisson(),
                        var_weights=aweights).fit()
        # compare with discrete, start close to save time
        modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)

        # Need to copy to avoid inplace adjustment
        from copy import copy
        self.res2 = copy(res_stata.results_poisson_aweight_nonrobust)
        self.res2.resids = self.res2.resids.copy()

        # Need to adjust resids for pearson and deviance to add weights
        self.res2.resids[:, 3:5] *= np.sqrt(aweights[:, np.newaxis])
Example #60
    def __init__(self):
        from results.results_glm import Lbw
        self.res2 = Lbw()
        self.res1 = GLM(self.res2.endog, self.res2.exog,
                        family=sm.families.Binomial()).fit()