示例#1
0
def test_poisson_residuals():
    """Compare residuals of a Poisson GLM fitted with ``exposure`` against
    the same model fitted on rates (``yp / exposure``) with ``var_weights``.
    """
    n_obs, n_cols = 100, 5
    np.random.seed(987125)
    design = add_constant(np.random.randn(n_obs, n_cols - 1))

    lin_pred = design.sum(1) / 2
    # The value is not used, but the draw is kept so the random stream
    # consumed before the Poisson sample below stays the same.
    _ = lin_pred + 2 * np.random.randn(n_obs)
    expo = 1 + np.arange(n_obs) // 4

    counts = np.random.poisson(np.exp(lin_pred) * expo)
    counts[10:15] += 10

    poisson = sm.families.Poisson()
    res_expo = GLM(counts, design, family=poisson, exposure=expo).fit()
    res_wgt = GLM(counts / expo, design, family=poisson,
                  var_weights=expo).fit()

    # response residuals are on the count scale for the exposure model
    assert_allclose(res_expo.resid_response / expo, res_wgt.resid_response)
    for attr in ("resid_pearson", "resid_deviance"):
        assert_allclose(getattr(res_expo, attr), getattr(res_wgt, attr))

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        assert_allclose(res_expo.resid_anscombe, res_wgt.resid_anscombe)
    assert_allclose(res_expo.resid_anscombe_unscaled,
                    res_wgt.resid_anscombe)
示例#2
0
def test_poisson_residuals():
    """Residuals from an ``exposure`` Poisson GLM must agree with those of
    the rate model (``yp / exposure``) weighted by ``var_weights``.
    """
    np.random.seed(987125)
    rows, cols = 100, 5
    regressors = np.random.randn(rows, cols - 1)
    regressors = add_constant(regressors)

    eta = regressors.sum(1) / 2
    # Result unused; the call is retained because it advances the random
    # state ahead of the Poisson draw.
    noisy_eta = eta + 2 * np.random.randn(rows)  # noqa: F841
    exposure_vec = 1 + np.arange(rows) // 4

    y_count = np.random.poisson(np.exp(eta) * exposure_vec)
    y_count[10:15] += 10

    family_poisson = sm.families.Poisson()
    model_exposure = GLM(y_count, regressors, family=family_poisson,
                         exposure=exposure_vec)
    fit_exposure = model_exposure.fit()

    model_weights = GLM(y_count / exposure_vec, regressors,
                        family=family_poisson, var_weights=exposure_vec)
    fit_weights = model_weights.fit()

    assert_allclose(fit_exposure.resid_response / exposure_vec,
                    fit_weights.resid_response)
    assert_allclose(fit_exposure.resid_pearson, fit_weights.resid_pearson)
    assert_allclose(fit_exposure.resid_deviance, fit_weights.resid_deviance)
    with warnings.catch_warnings():
        # accessing resid_anscombe may emit a FutureWarning; silence it here
        warnings.simplefilter("ignore", category=FutureWarning)
        assert_allclose(fit_exposure.resid_anscombe,
                        fit_weights.resid_anscombe)
    assert_allclose(fit_exposure.resid_anscombe_unscaled,
                    fit_weights.resid_anscombe)
示例#3
0
    def test_glm(self):
        """Check ``GLM.get_prediction`` against the stored WLS results and
        exercise ``params_transform_univariate``.
        """
        # preliminary, basic test for GLM.get_prediction
        from statsmodels.genmod.generalized_linear_model import GLM

        wls_res = self.res_wls
        wls_mod = wls_res.model
        y_obs, design, wts = wls_mod.endog, wls_mod.exog, wls_mod.weights

        # ``wts`` are the WLS weights; scale both sides by their square root
        root_w = np.sqrt(wts)
        glm_mod = GLM(y_obs * root_w, design * root_w[:, None])

        n_compare = 30  # in glm with predict wendog

        # compare using t distribution
        glm_res = glm_mod.fit(use_t=True)
        frame_glm = glm_res.get_prediction().summary_frame()
        frame_wls = wls_res.get_prediction().summary_frame()
        assert_allclose(frame_glm.values[:n_compare],
                        frame_wls.values[:n_compare, :4])

        # compare using normal distribution
        glm_res = glm_mod.fit()  # default use_t=False
        frame_glm = glm_res.get_prediction().summary_frame()
        wls_res = wls_mod.fit(use_t=False)
        frame_wls = wls_res.get_prediction().summary_frame()
        assert_allclose(frame_glm.values[:n_compare],
                        frame_wls.values[:n_compare, :4])

        # function for parameter transformation
        # should be separate test method
        from statsmodels.genmod._prediction import params_transform_univariate
        transformed = params_transform_univariate(glm_res.params,
                                                  glm_res.cov_params())

        exp_params = np.exp(glm_res.params)
        expected = np.column_stack((exp_params,
                                    glm_res.bse * np.exp(glm_res.params),
                                    np.exp(glm_res.conf_int())))
        assert_allclose(transformed.summary_frame().values, expected,
                        rtol=1e-13)

        from statsmodels.genmod.families import links

        # with identity transform the statistics must equal those of the fit
        ident = params_transform_univariate(glm_res.params,
                                            glm_res.cov_params(),
                                            link=links.identity())

        assert_allclose(ident.tvalues, glm_res.tvalues, rtol=1e-13)
        assert_allclose(ident.se_mean, glm_res.bse, rtol=1e-13)
        t_result = ident.t_test()
        assert_allclose(t_result[0], glm_res.tvalues, rtol=1e-13)
        assert_allclose(t_result[1], glm_res.pvalues, rtol=1e-13)

        # prediction with exog and no weights does not error
        glm_res = glm_mod.fit()
        glm_res.get_prediction(design)
示例#4
0
    def setup_class(cls):
        """Fit a GLM and a discrete Probit on a binarized ``endog``, both
        with cluster-robust covariance, and store them as ``res1``/``res2``.
        """
        # dichotomize the response at its mean
        binary_y = (endog > endog.mean()).astype(int)
        cls.cov_type = 'cluster'

        # NOTE(review): Gaussian family with a CDF link is compared with
        # Probit here -- confirm this pairing is intended.
        glm_family = families.Gaussian(link=links.CDFLink())
        glm_mod = GLM(binary_y, exog, family=glm_family)
        cls.res1 = glm_mod.fit(cov_type='cluster',
                               cov_kwds={'groups': group})

        probit_mod = smd.Probit(binary_y, exog)
        cls.res2 = probit_mod.fit(cov_type='cluster',
                                  cov_kwds={'groups': group})
示例#5
0
    def test_glm(self):
        """Compare ``GLM.get_prediction`` summary frames with WLS ones and
        verify ``params_transform_univariate`` for log and identity links.
        """
        # preliminary, getting started with basic test for GLM.get_prediction
        from statsmodels.genmod.generalized_linear_model import GLM

        ref_res = self.res_wls
        ref_mod = ref_res.model
        response, regressors, weights_wls = (ref_mod.endog, ref_mod.exog,
                                             ref_mod.weights)

        # transform the data with the square root of the WLS weights
        sqrt_weights = np.sqrt(weights_wls)
        model = GLM(response * sqrt_weights,
                    regressors * sqrt_weights[:, None])

        # number of leading rows compared (in glm with predict wendog)
        n_compare = 30

        # compare using t distribution
        fit = model.fit(use_t=True)
        glm_frame = fit.get_prediction().summary_frame()
        wls_frame = ref_res.get_prediction().summary_frame()
        assert_allclose(glm_frame.values[:n_compare],
                        wls_frame.values[:n_compare, :4])

        # compare using normal distribution
        fit = model.fit()  # default use_t=False
        glm_frame = fit.get_prediction().summary_frame()

        ref_res = ref_mod.fit(use_t=False)
        wls_frame = ref_res.get_prediction().summary_frame()
        assert_allclose(glm_frame.values[:n_compare],
                        wls_frame.values[:n_compare, :4])

        # function for parameter transformation
        # should be separate test method
        from statsmodels.genmod._prediction import params_transform_univariate
        rate_res = params_transform_univariate(fit.params, fit.cov_params())

        point = np.exp(fit.params)
        std_err = fit.bse * np.exp(fit.params)
        interval = np.exp(fit.conf_int())
        rate_expected = np.column_stack((point, std_err, interval))
        assert_allclose(rate_res.summary_frame().values, rate_expected,
                        rtol=1e-13)

        from statsmodels.genmod.families import links

        # with identity transform
        identity_res = params_transform_univariate(
            fit.params, fit.cov_params(), link=links.identity())

        assert_allclose(identity_res.tvalues, fit.tvalues, rtol=1e-13)
        assert_allclose(identity_res.se_mean, fit.bse, rtol=1e-13)
        tt = identity_res.t_test()
        assert_allclose(tt[0], fit.tvalues, rtol=1e-13)
        assert_allclose(tt[1], fit.pvalues, rtol=1e-13)
示例#6
0
    def setup_class(cls):
        """Store cluster-robust fits of a GLM (``res1``) and of a discrete
        Probit model (``res2``) on ``endog`` split at its mean.
        """
        cls.cov_type = 'cluster'
        above_mean = (endog > endog.mean()).astype(int)

        # NOTE(review): the GLM uses Gaussian(link=CDFLink()) while the
        # reference is Probit -- verify that this is deliberate.
        glm_model = GLM(above_mean, exog,
                        family=families.Gaussian(link=links.CDFLink()))
        cls.res1 = glm_model.fit(cov_type='cluster',
                                 cov_kwds={'groups': group})

        reference = smd.Probit(above_mean, exog)
        cls.res2 = reference.fit(cov_type='cluster',
                                 cov_kwds={'groups': group})
    def setup_class(cls):
        """Fit a Binomial GLM (``res1``) and a discrete Logit (``res2``) on
        the binarized ``endog``, both with cluster-robust covariance.
        """
        # response is 1 where endog exceeds its mean, else 0
        binary_y = (endog > endog.mean()).astype(int)
        cls.cov_type = 'cluster'

        glm_mod = GLM(binary_y, exog, family=families.Binomial())
        cls.res1 = glm_mod.fit(cov_type='cluster',
                               cov_kwds={'groups': group})

        logit_mod = smd.Logit(binary_y, exog)
        cls.res2 = logit_mod.fit(cov_type='cluster',
                                 cov_kwds={'groups': group})
示例#8
0
    def setup_class(cls):
        """Prepare cluster-robust results: Binomial GLM in ``res1`` and
        discrete Logit in ``res2``, using ``endog`` dichotomized at its mean.
        """
        cls.cov_type = 'cluster'
        above_mean = (endog > endog.mean()).astype(int)

        binomial_glm = GLM(above_mean, exog, family=families.Binomial())
        cls.res1 = binomial_glm.fit(cov_type='cluster',
                                    cov_kwds={'groups': group})

        logit = smd.Logit(above_mean, exog)
        cls.res2 = logit.fit(cov_type='cluster', cov_kwds={'groups': group})
    def setup_class(cls):
        """Fit a Binomial GLM with probit link via the ``newton`` method
        (``res1``) and a discrete Probit (``res2``), both cluster-robust,
        and set the comparison tolerance ``rtol``.
        """
        binary_y = (endog > endog.mean()).astype(int)
        cls.cov_type = 'cluster'
        cls.rtol = 1e-6

        probit_family = families.Binomial(link=links.probit())
        glm_mod = GLM(binary_y, exog, family=probit_family)
        cls.res1 = glm_mod.fit(method='newton', cov_type='cluster',
                               cov_kwds={'groups': group})

        probit_mod = smd.Probit(binary_y, exog)
        cls.res2 = probit_mod.fit(cov_type='cluster',
                                  cov_kwds={'groups': group})
示例#10
0
    def setup_class(cls):
        """Draw Poisson counts with mean ``endog`` and fit them with a
        Poisson GLM (``res1``) and a discrete Poisson model (``res2``),
        both using HC0 covariance.
        """
        np.random.seed(987125643)  # not intentional seed
        counts = np.random.poisson(endog)
        cls.cov_type = 'HC0'

        glm_mod = GLM(counts, exog, family=families.Poisson())
        cls.res1 = glm_mod.fit(cov_type='HC0')

        discrete_mod = smd.Poisson(counts, exog)
        cls.res2 = discrete_mod.fit(cov_type='HC0')

        # NOTE(review): the tolerance is attached to the results object, not
        # to ``cls`` -- confirm which attribute the checks actually read.
        cls.res1.rtol = 1e-11
示例#11
0
    def setup_class(cls):
        """Fit a Gaussian GLM and OLS with ``hac-groupsum`` covariance; the
        GLM is additionally fitted under the ``nw-groupsum`` name (``res1b``).
        """
        cls.cov_type = 'hac-groupsum'
        # time index is just made up to have a test case
        periods = np.tile(np.arange(7), 5)[:-1]
        hac_options = {
            'time': pd.Series(periods),  # check for #3606
            'maxlags': 2,
            'use_correction': 'hac',
            'df_correction': False,
        }

        glm_mod = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = glm_mod.fit(cov_type='hac-groupsum',
                               cov_kwds=hac_options)
        cls.res1b = glm_mod.fit(cov_type='nw-groupsum',
                                cov_kwds=hac_options)

        ols_mod = OLS(endog, exog)
        cls.res2 = ols_mod.fit(cov_type='hac-groupsum',
                               cov_kwds=hac_options)
示例#12
0
    def setup_class(cls):
        """Store ``hac-groupsum`` fits of a Gaussian GLM (``res1``, plus
        ``res1b`` under the ``nw-groupsum`` name) and of OLS (``res2``).
        """
        cls.cov_type = 'hac-groupsum'
        # time index is just made up to have a test case
        periods = np.tile(np.arange(7), 5)[:-1]
        hac_options = {
            'time': periods,
            'maxlags': 2,
            'use_correction': 'hac',
            'df_correction': False,
        }

        gaussian_glm = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = gaussian_glm.fit(cov_type='hac-groupsum',
                                    cov_kwds=hac_options)
        cls.res1b = gaussian_glm.fit(cov_type='nw-groupsum',
                                     cov_kwds=hac_options)

        reference = OLS(endog, exog)
        cls.res2 = reference.fit(cov_type='hac-groupsum',
                                 cov_kwds=hac_options)
示例#13
0
    def test_basic(self):
        """Compare the combined-effect results in ``res1`` with the
        reference values in ``res2`` and with weighted GLM fits.
        """
        ours, ref = self.res1, self.res2
        tight = 1e-13

        assert_allclose(self.eff, ref.TE, rtol=tight)
        assert_allclose(self.var_eff, ref.seTE**2, rtol=tight)

        assert_allclose(ours.mean_effect_fe, ref.TE_fixed, rtol=tight)
        # R meta does not adjust sd FE for HKSJ
        assert_allclose(ours.sd_eff_w_fe, ref.seTE_fixed, rtol=tight)

        assert_allclose(ours.q, ref.Q, rtol=tight)
        assert_allclose(ours.tau2, ref.tau2, rtol=1e-10)

        assert_allclose(ours.mean_effect_re, ref.TE_random, rtol=tight)
        assert_allclose(ours.sd_eff_w_re_hksj, ref.seTE_random, rtol=tight)

        # homogeneity test unpacks into (statistic, p-value) and carries df
        homog = ours.test_homogeneity()
        stat, pvalue = homog
        dof = homog.df
        assert_allclose(stat, ref.Q, rtol=tight)
        assert_allclose(pvalue, ref.pval_Q, rtol=tight)
        assert_allclose(dof, ref.df_Q, rtol=tight)

        assert_allclose(ours.i2, ref.I2, rtol=tight)
        assert_allclose(ours.h2, ref.H**2, rtol=tight)

        intervals = ours.conf_int(use_t=True)  # fe, re, fe_wls, re_wls
        # R meta does not adjust FE for HKSJ, still uses normal dist
        # assert_allclose(ci[0][0], res2.lower_fixed, atol=1e-10)
        # assert_allclose(ci[0][1], res2.upper_fixed, atol=1e-10)
        re_wls_low, re_wls_upp = intervals[3][0], intervals[3][1]
        assert_allclose(re_wls_low, ref.lower_random, rtol=tight)
        assert_allclose(re_wls_upp, ref.upper_random, rtol=1e-10)

        intervals = ours.conf_int(use_t=False)  # fe, re, fe_wls, re_wls
        assert_allclose(intervals[0][0], ref.lower_fixed, rtol=tight)
        assert_allclose(intervals[0][1], ref.upper_fixed, rtol=tight)

        # constant-only GLM with inverse-variance weights gives the FE mean
        const = np.ones(len(self.eff))
        fe_fit = GLM(self.eff, const, var_weights=1 / self.var_eff).fit()
        assert_allclose(fe_fit.params, ref.TE_fixed, rtol=tight)

        # adding tau2 to the variances gives the RE mean
        const = np.ones(len(self.eff))
        re_weights = 1 / (self.var_eff + ours.tau2)
        re_fit = GLM(self.eff, const, var_weights=re_weights).fit()
        assert_allclose(re_fit.params, ref.TE_random, rtol=tight)
示例#14
0
    def setup_class(cls):
        """Fit a Gaussian GLM on copies of the data and OLS on the originals,
        all with ``hac-panel`` covariance and a uniform kernel; ``res1b``
        repeats the GLM fit under the ``nw-panel`` name.
        """
        cls.cov_type = 'hac-panel'
        # time index is just made up to have a test case
        periods = np.tile(np.arange(7), 5)[:-1]
        panel_options = {
            'time': periods,
            'maxlags': 2,
            'kernel': sw.weights_uniform,
            'use_correction': 'hac',
            'df_correction': False,
        }

        glm_mod = GLM(endog.copy(), exog.copy(), family=families.Gaussian())
        cls.res1 = glm_mod.fit(cov_type='hac-panel', cov_kwds=panel_options)
        cls.res1b = glm_mod.fit(cov_type='nw-panel', cov_kwds=panel_options)

        ols_mod = OLS(endog, exog)
        cls.res2 = ols_mod.fit(cov_type='hac-panel', cov_kwds=panel_options)
示例#15
0
 def init(cls):
     """Fit ``mod2`` (``res2``) and a GLM on ``exogc`` whose offset is half
     of the ``idx_c`` column(s) of ``exog`` (``res1``).
     """
     cls.res2 = cls.mod2.fit()

     half_offset = 0.5 * cls.exog[:, cls.idx_c].squeeze()
     glm_mod = GLM(cls.endog, cls.exogc, offset=half_offset)
     # overwrite the parameter names in place
     glm_mod.exog_names[:] = ['const', 'x2', 'x3', 'x4']
     cls.res1 = glm_mod.fit()

     n_cols = cls.exogc.shape[1]
     cls.idx_p_uc = np.arange(n_cols)
示例#16
0
    def _initialize(cls):
        """Fit a Binomial GLM on the first ``k_nonzero`` columns (``res2``)
        and a penalized Binomial GLM on all columns (``res1``), both with a
        constant offset and HC0 covariance.
        """
        endog_y, exog_x = cls.y, cls.x
        const_offset = np.full(len(endog_y), -0.25)  # also check offset
        robust = 'HC0'

        oracle = GLM(endog_y, exog_x[:, :cls.k_nonzero],
                     family=family.Binomial(), offset=const_offset)
        cls.res2 = oracle.fit(cov_type=robust, method='newton',
                              maxiter=1000, disp=0)

        penalized = GLMPenalized(endog_y, exog_x, family=family.Binomial(),
                                 offset=const_offset, penal=cls.penalty)
        penalized.pen_weight *= 1  # lower than in other cases
        penalized.penal.tau = 0.05
        cls.res1 = penalized.fit(cov_type=robust, method='bfgs',
                                 max_start_irls=0, maxiter=100, disp=0,
                                 trim=0.001)

        # leading ``k_nonzero`` entries are the ones compared
        cls.exog_index = slice(cls.k_nonzero)

        cls.atol = 1e-3
        cls.k_params = cls.k_nonzero
示例#17
0
    def setup_class(cls):
        """Fit a frequency-weighted Poisson GLM with cluster-robust
        covariance (``res1``) and attach the stored reference results
        (``res2``) together with the standard-error correction factor.

        Warnings raised while building and fitting the model are recorded
        and discarded, without requiring that any warning occurs.
        """
        # function-scope import so the block does not depend on the module
        # header
        import warnings

        fweights = np.array(
            [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3])

        gid = np.arange(1, 17 + 1) // 2
        n_groups = len(np.unique(gid))

        # no wnobs yet in sandwich covariance calculation
        # (alternative considered: np.sqrt((wsum - 1.) / wsum) with
        # wsum = fweights.sum())
        cls.corr_fact = 1 / np.sqrt(n_groups / (n_groups - 1))
        cov_kwds = {'groups': gid, 'use_correction': False}

        # ``pytest.warns(None)`` is deprecated (pytest 7) and an error in
        # pytest 8; recording with the "always" filter behaves the same.
        with warnings.catch_warnings(record=True):
            warnings.simplefilter("always")
            mod = GLM(cpunish_data.endog,
                      cpunish_data.exog,
                      family=sm.families.Poisson(),
                      freq_weights=fweights)
            cls.res1 = mod.fit(cov_type='cluster', cov_kwds=cov_kwds)

        # compare with discrete, start close to save time
        #modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
        cls.res2 = res_stata.results_poisson_fweight_clu1
示例#18
0
    def _initialize(cls):
        """Fit a Gaussian GLM on the first ``k_nonzero`` columns (``res2``)
        and an L2-constraint-penalized Gaussian GLM on all columns
        (``res1``), both with HC0 covariance via ``bfgs``.
        """
        exog_x = cls.x
        # adding 10 to avoid strict rtol at predicted values close to zero
        endog_y = cls.y + 10
        robust = 'HC0'
        n_cols = exog_x.shape[1]

        oracle = GLM(endog_y, exog_x[:, :cls.k_nonzero],
                     family=family.Gaussian())
        cls.res2 = oracle.fit(cov_type=robust, method='bfgs', maxiter=100,
                              disp=0)

        # penalty weight is 1.0 for columns with index >= 4, else 0.0
        col_weights = (np.arange(n_cols) >= 4).astype(float)
        constraint = smpen.L2ContraintsPenalty(weights=col_weights)
        penalized = GLMPenalized(endog_y, exog_x, family=family.Gaussian(),
                                 penal=constraint)
        # make pen_weight large to force redundant to close to zero
        penalized.pen_weight *= 500
        cls.res1 = penalized.fit(cov_type=robust, method='bfgs',
                                 maxiter=100, disp=0, trim=False)

        cls.exog_index = slice(cls.k_nonzero)
        cls.k_params = n_cols
        cls.atol = 1e-5
        cls.rtol = 1e-5
示例#19
0
 def init(cls):
     """Store the fit of ``mod2`` as ``res2`` and the fit of an offset GLM
     on ``exogc`` as ``res1``; ``idx_p_uc`` indexes every ``exogc`` column.
     """
     cls.res2 = cls.mod2.fit()

     # offset is 0.5 times the selected column(s) of the full design
     offset_term = 0.5 * cls.exog[:, cls.idx_c].squeeze()
     constrained = GLM(cls.endog, cls.exogc, offset=offset_term)
     constrained.exog_names[:] = ['const', 'x2', 'x3', 'x4']
     cls.res1 = constrained.fit()
     cls.idx_p_uc = np.arange(cls.exogc.shape[1])
示例#20
0
    def kfold_cv(self, d, formula, k):
        """Estimate train and test accuracy of a Binomial GLM by k-fold
        cross-validation.

        Parameters
        ----------
        d : DataFrame
            Data to split; rows are shuffled before the folds are cut.
        formula : object
            Not used by this method; kept for interface compatibility.
        k : int
            Number of folds. Each fold has ``len(d) // k`` rows and the
            remainder is merged into the last fold.

        Returns
        -------
        tuple
            ``(mean train accuracy, mean test accuracy)`` over the folds
            whose fit did not raise ``PerfectSeparationError``.

        Raises
        ------
        ValueError
            If ``d`` is non-empty and ``len(d) // k`` is smaller than one,
            because the fold window would then never advance.
        """
        n = len(d)
        partition = n // k
        if n > 0 and partition < 1:
            raise ValueError(
                "k={} is not a valid number of folds for {} rows".format(k, n))
        d = d.sample(n, replace=False)
        current, last = 0, partition
        train_accs = []
        test_accs = []
        while current < n:
            # fewer than ``partition`` rows would remain: extend this fold
            # to the end of the data
            if last > n - partition:
                last = n
            test = d.iloc[current:last]
            train = d.drop(test.index)

            y = train[[self.yvar]]
            X = self.select_from_design(train.columns).loc[train.index]
            yt = test[[self.yvar]]
            Xt = self.select_from_design(test.columns).loc[test.index]
            # only columns available in both design subsets can be used
            shared = list(set(Xt.columns) & set(X.columns))
            glm = GLM(y, X[shared], family=sm.families.Binomial())
            try:
                res = glm.fit()
                train_acc = self._scores_to_accuracy(res, X[shared], y)
                test_acc = self._scores_to_accuracy(res, Xt[shared], yt)
                train_accs.append(train_acc)
                test_accs.append(test_acc)
            except PerfectSeparationError:
                # best effort: the fold is skipped, not fatal
                print("Perfectly Separated!")
            current = last
            last += partition
        return np.mean(train_accs), np.mean(test_accs)
示例#21
0
def ppglmfit(X,Y):
    '''
    The GLM solver in statsmodels is very general. It accepts any link
    function and expects that, if you want a constant term in your model,
    that you have already manually added a column of ones to your
    design matrix. This wrapper simplifies using GLM to fit the common
    case of a Poisson point-process model, where the constant term has
    not been explicitly added to the design matrix

    A column of ones is prepended only when the first column of ``X`` is
    not constant. Two sanity warnings are printed (not raised): one when
    the mean of ``Y`` exceeds 0.1 and one when ``Y`` sums to less than 100.

    Parameters
    ----------
    X: N_observations x N_features design matrix.
    Y: Binary point process observations

    Returns
    -------
    μ, B: the offset and parameter estimates for the GLM model.
    '''
    if np.mean(Y)>0.1:
        print('Caution: spike rate very high, is Poisson assumption valid?')
    if np.sum(Y)<100:
        print('Caution: fewer than 100 spikes to fit model')
    # add constant value to X, if the 1st column is not constant
    # (np.all reduces in NumPy instead of iterating element by element)
    if not np.all(X[:,0]==X[0,0]):
        X = np.hstack([np.ones((X.shape[0],1),dtype=X.dtype), X])
    poisson_model   = GLM(Y,X,family=Poisson())
    poisson_results = poisson_model.fit()
    M = poisson_results.params
    # first coefficient belongs to the constant column
    return M[0],M[1:]
示例#22
0
def test_cov_params():
    """Check that ``GLMGam`` with zero or near-zero penalty reproduces the
    parameter covariance of a plain ``GLM`` on the same spline basis.
    """
    np.random.seed(0)
    n_obs = 1000
    raw = np.random.uniform(0, 1, (n_obs, 2))
    x_c = raw - raw.mean()
    y_c = x_c[:, 0] * x_c[:, 0] + np.random.normal(0, .01, n_obs)
    y_c -= y_c.mean()

    splines = BSplines(x_c, degree=[3] * 2, df=[10] * 2,
                       constraints='center')
    pirls_opts = dict(method='pirls', max_start_irls=0, disp=0,
                      maxiter=5000)

    # unpenalized GAM versus plain GLM on the spline basis
    gam_fit = GLMGam(y_c, smoother=splines, alpha=[0, 0]).fit(**pirls_opts)
    glm_fit = GLM(y_c, splines.basis).fit()
    assert_allclose(glm_fit.cov_params(), gam_fit.cov_params(), rtol=0.0025)

    # tiny scalar penalty, PIRLS
    gam_small = GLMGam(y_c, smoother=splines, alpha=1e-13)
    gam_fit = gam_small.fit(**pirls_opts)
    assert_allclose(glm_fit.cov_params(), gam_fit.cov_params(), atol=1e-10)

    # same model, BFGS optimizer
    gam_fit = gam_small.fit(method='bfgs', max_start_irls=0, disp=0,
                            maxiter=5000, maxfun=5000)
    assert_allclose(glm_fit.cov_params(), gam_fit.cov_params(), rtol=1e-4,
                    atol=1e-8)
示例#23
0
def ppglmfit(X,Y):
    '''
    Fit a Poisson GLM to point-process observations, prepending a column
    of ones to the design matrix when its first column is not constant.

    statsmodels' GLM does not add a constant term on its own; this wrapper
    covers the common case in which the caller has not added one.

    Args:
        X: N_observations x N_features design matrix.
        Y: Binary point process observations
    Returns:
        μ, B: the offset and parameter estimates for the GLM model.
    '''
    # sanity warnings only; the fit proceeds regardless
    if mean(Y)>0.1:
        print('Caution: spike rate very high, is Poisson assumption valid?')
    if sum(Y)<100:
        print('Caution: fewer than 100 spikes to fit model')
    # add constant value to X, if the 1st column is not constant
    first_column_constant = all(X[:,0]==X[0,0])
    if not first_column_constant:
        constant = ones((shape(X)[0],1),dtype=X.dtype)
        X = hstack([constant, X])
    coefficients = GLM(Y,X,family=Poisson()).fit().params
    return coefficients[0],coefficients[1:]
示例#24
0
    def setup_class(cls):
        """Store ``hac-panel`` fits (uniform kernel) of a Gaussian GLM on
        copied data (``res1``, and ``res1b`` under the ``nw-panel`` name)
        and of OLS on the original data (``res2``).
        """
        import statsmodels.stats.sandwich_covariance as sw
        cls.cov_type = 'hac-panel'
        # time index is just made up to have a test case
        periods = np.tile(np.arange(7), 5)[:-1]
        panel_options = {
            'time': periods,
            'maxlags': 2,
            'kernel': sw.weights_uniform,
            'use_correction': 'hac',
            'df_correction': False,
        }

        gaussian_glm = GLM(endog.copy(), exog.copy(),
                           family=families.Gaussian())
        cls.res1 = gaussian_glm.fit(cov_type='hac-panel',
                                    cov_kwds=panel_options)
        cls.res1b = gaussian_glm.fit(cov_type='nw-panel',
                                     cov_kwds=panel_options)

        reference = OLS(endog, exog)
        cls.res2 = reference.fit(cov_type='hac-panel',
                                 cov_kwds=panel_options)
示例#25
0
def test_cov_params():
    """Covariance of ``GLMGam`` estimates with (almost) no penalization
    must match that of ``GLM`` fitted on the B-spline basis.
    """
    np.random.seed(0)
    sample_size = 1000
    covariates = np.random.uniform(0, 1, (sample_size, 2))
    covariates = covariates - covariates.mean()
    first = covariates[:, 0]
    response = first * first + np.random.normal(0, .01, sample_size)
    response -= response.mean()

    smoother = BSplines(covariates, degree=[3] * 2, df=[10] * 2,
                        constraints='center')

    # reference: ordinary GLM on the spline basis
    gam_zero = GLMGam(response, smoother=smoother, alpha=[0, 0])
    fit_gam = gam_zero.fit(method='pirls', max_start_irls=0, disp=0,
                           maxiter=5000)
    fit_glm = GLM(response, smoother.basis).fit()
    cov_glm = fit_glm.cov_params()

    assert_allclose(cov_glm, fit_gam.cov_params(), rtol=0.0025)

    # negligible penalty weight
    gam_tiny = GLMGam(response, smoother=smoother, alpha=1e-13)
    fit_gam = gam_tiny.fit(method='pirls', max_start_irls=0, disp=0,
                           maxiter=5000)
    assert_allclose(fit_glm.cov_params(), fit_gam.cov_params(), atol=1e-10)

    fit_gam = gam_tiny.fit(method='bfgs', max_start_irls=0, disp=0,
                           maxiter=5000, maxfun=5000)
    assert_allclose(fit_glm.cov_params(), fit_gam.cov_params(), rtol=1e-4,
                    atol=1e-8)
示例#26
0
    def fit_scores(self, balance=True, nmodels=None, k=3):
        """Fit one or more Binomial GLMs and record them with their accuracy.

        Parameters
        ----------
        balance : bool
            If true, fit ``nmodels`` models, each on a fresh balanced
            sample; otherwise fit a single model on ``self.X``/``self.y``.
        nmodels : int or None
            Number of balanced models. When ``None`` it is derived from the
            majority/minority size ratio, rounded up to a multiple of ten.
        k : int
            Passed to ``forward_stepwise`` when stepwise selection is on.

        Results are appended to ``self.models`` and
        ``self.model_accurracy``; ``self.nmodels`` is set to the number of
        models requested. Nothing is returned.
        """
        if not self.formula:
            # use all columns in the model (untransformed)
            self.formula = '{} ~ {}'.format(self.yvar, '+'.join(self.xvars))
            if self.stepwise:
                print("Optimizing Forumla via forward stepwise selection...")
                # use all columns + transformed columns in model
                self.formula, self.swdata = \
                   self.forward_stepwise(self.balanced_sample(), self.yvar, k=k)
        if balance:
            if nmodels is None:
                # fit multiple models based on imbalance severity
                # (rounded up to the nearest multiple of ten)
                minor, major = [
                    self.data[self.data[self.yvar] == i]
                    for i in (self.minority, self.majority)
                ]
                # explicit float division: integer division would truncate
                # the ratio before ``ceil`` is applied
                imbalance = len(major) / float(len(minor))
                nmodels = int(np.ceil(imbalance / 10) * 10)
            self.nmodels = nmodels
            for i in range(nmodels):
                progress(
                    i + 1,
                    nmodels,
                    prestr="Fitting {} Models on Balanced Samples...".format(
                        nmodels))

                # sample from majority to create balance dataset
                df = self.balanced_sample()
                y_samp, X_samp = patsy.dmatrices(self.formula,
                                                 data=df,
                                                 return_type='dataframe')
                glm = GLM(y_samp, X_samp, family=sm.families.Binomial())
                res = glm.fit()
                self.model_accurracy.append(
                    self._scores_to_accuracy(res, X_samp, y_samp))
                self.models.append(res)
            # single pre-formatted string keeps the output identical whether
            # ``print`` is a statement or a function
            print("\nAverage Accuracy: {}%".format(
                round(np.mean(self.model_accurracy) * 100, 2)))
        else:
            # ignore any imbalance and fit one model
            self.nmodels = 1
            print('\nFitting 1 (Unbalanced) Model...')
            glm = GLM(self.y, self.X, family=sm.families.Binomial())
            res = glm.fit()
            self.model_accurracy.append(
                self._scores_to_accuracy(res, self.X, self.y))
            self.models.append(res)
            print("Accuracy {}".format(
                round(np.mean(self.model_accurracy[0]) * 100, 2)))
示例#27
0
    def setup_class(cls):
        """Fit a Gaussian GLM (``res1``) and OLS (``res2``) on the same
        data, both with HC0 covariance.
        """
        robust = 'HC0'
        cls.cov_type = robust

        glm_mod = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = glm_mod.fit(cov_type=robust)

        ols_mod = OLS(endog, exog)
        cls.res2 = ols_mod.fit(cov_type=robust)
示例#28
0
    def setup_class(cls):
        """Fit a Gaussian GLM (``res1``) and OLS (``res2``) with
        cluster-robust covariance grouped by ``group``.
        """
        cls.cov_type = 'cluster'

        glm_mod = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = glm_mod.fit(cov_type='cluster',
                               cov_kwds={'groups': group})

        ols_mod = OLS(endog, exog)
        cls.res2 = ols_mod.fit(cov_type='cluster',
                               cov_kwds={'groups': group})
示例#29
0
    def setup_class(cls):
        """Store HC0 fits of a Gaussian GLM (``res1``) and of OLS
        (``res2``) for comparison.
        """
        cls.cov_type = 'HC0'

        gaussian_glm = GLM(endog, exog, family=families.Gaussian())
        reference = OLS(endog, exog)

        cls.res1 = gaussian_glm.fit(cov_type='HC0')
        cls.res2 = reference.fit(cov_type='HC0')
示例#30
0
    def setup_class(cls):
        """Store cluster-robust fits of a Gaussian GLM (``res1``) and of
        OLS (``res2``), clustered on ``group``.
        """
        cls.cov_type = 'cluster'

        gaussian_glm = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = gaussian_glm.fit(cov_type='cluster',
                                    cov_kwds={'groups': group})

        reference = OLS(endog, exog)
        cls.res2 = reference.fit(cov_type='cluster',
                                 cov_kwds={'groups': group})
    def setup_class(cls):
        """Fit a Poisson GLM with HC1 covariance (``res1``) and attach the
        stored reference results (``res2``), the robust standard errors and
        the correction factor.
        """
        cls.res2 = results_st.results_poisson_hc1

        poisson_glm = GLM(endog, exog, family=families.Poisson())
        fitted = poisson_glm.fit(cov_type='HC1')
        cls.res1 = fitted

        cls.bse_rob = cls.res1.bse

        cls.corr_fact = cls.get_correction_factor(cls.res1,
                                                  sub_kparams=False)
示例#32
0
 def init(cls):
     """Fit ``mod2`` (``res2``) and an offset, variance-weighted GLM on
     ``exogc`` (``res1``), both with HC0 covariance.
     """
     robust = 'HC0'
     cls.res2 = cls.mod2.fit(cov_type=robust)

     # offset is half of the ``idx_c`` column(s) of the full design
     half_offset = 0.5 * cls.exog[:, cls.idx_c].squeeze()
     glm_mod = GLM(cls.endog, cls.exogc, offset=half_offset,
                   var_weights=cls.aweights)
     glm_mod.exog_names[:] = ['const', 'x2', 'x3', 'x4']
     cls.res1 = glm_mod.fit(cov_type=robust)

     n_cols = cls.exogc.shape[1]
     cls.idx_p_uc = np.arange(n_cols)
示例#33
0
def fit_poisson(X, Y):
    """ Fits the Poisson regression model with the training data

        The response is the per-row total of ``Y`` (sum along axis 1).

        :param X: the feature matrix
        :param Y: the label matrix; its rows are summed to form the response
        :return: the fitted Poisson model (instance of statsmodels.genmod.generalized_linear_model.GLMResults)
    """
    # The builtin ``sum`` does not accept ``axis``; call numpy's sum
    # explicitly so the function does not depend on the builtin being
    # shadowed at module level.
    import numpy as np

    row_totals = np.sum(Y, axis=1)
    pr = GLM(row_totals, X, family=Poisson())
    return pr.fit()
示例#34
0
 def init(cls):
     """Create the HC0 fits stored as ``cls.res1`` and ``cls.res2``.

     ``cls.res2`` comes from ``cls.mod2``. ``cls.res1`` is a GLM on
     ``cls.exogc`` with variance weights ``cls.aweights`` and an offset of
     0.5 times column ``cls.idx_c`` of ``cls.exog``.
     """
     hc0 = 'HC0'
     cls.res2 = cls.mod2.fit(cov_type=hc0)

     offset_term = 0.5 * cls.exog[:, cls.idx_c].squeeze()
     weighted_glm = GLM(cls.endog, cls.exogc,
                        offset=offset_term,
                        var_weights=cls.aweights)
     # Replace the parameter names in place.
     weighted_glm.exog_names[:] = ['const', 'x2', 'x3', 'x4']
     cls.res1 = weighted_glm.fit(cov_type=hc0)

     n_columns = cls.exogc.shape[1]
     cls.idx_p_uc = np.arange(n_columns)
示例#35
0
    def setup_class(cls):
        """Fit a Poisson GLM with HC1 covariance and compute a bse factor.

        ``cls.res2`` is the reference ``results_st.results_poisson_hc1``;
        ``cls.corr_fact`` is ``sqrt(1 / (nobs / (nobs - 1)))``.
        """
        cls.res2 = results_st.results_poisson_hc1

        poisson_glm = GLM(endog, exog, family=families.Poisson())
        cls.res1 = poisson_glm.fit(cov_type='HC1')
        cls.bse_rob = cls.res1.bse

        n_rows, _ = poisson_glm.exog.shape
        scale = (n_rows) / float(n_rows - 1.)
        # for bse we need sqrt of correction factor
        cls.corr_fact = np.sqrt(1. / scale)
示例#36
0
    def setup_class(cls):
        """Fit a Gaussian GLM and OLS with HAC covariance, ``maxlags=2``.

        The same options dict is passed to both fits; results are stored
        as ``cls.res1`` (GLM) and ``cls.res2`` (OLS).
        """
        cls.cov_type = 'HAC'

        hac_options = {'maxlags': 2}

        glm_model = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = glm_model.fit(cov_type='HAC', cov_kwds=hac_options)

        ols_model = OLS(endog, exog)
        cls.res2 = ols_model.fit(cov_type='HAC', cov_kwds=hac_options)
示例#37
0
    def setup_class(cls):
        """Set up HAC (``maxlags=2``) fits of a Gaussian GLM and OLS."""
        cls.cov_type = 'HAC'

        # Shared by both fits, as in the original.
        lag_options = dict(maxlags=2)

        cls.res1 = GLM(endog, exog, family=families.Gaussian()).fit(
            cov_type='HAC', cov_kwds=lag_options)

        cls.res2 = OLS(endog, exog).fit(cov_type='HAC',
                                        cov_kwds=lag_options)
    def setup_class(cls):
        """Fit a Poisson GLM, then switch it to HC1 covariance in place.

        ``get_robustcov_results`` is called with ``use_self=True`` on the
        underlying results object before the standard errors are read.
        """
        cls.res2 = results_st.results_poisson_hc1

        poisson_glm = GLM(endog, exog, family=families.Poisson())
        fitted = poisson_glm.fit()
        cls.res1 = fitted

        # Must run before ``bse`` is read below.
        get_robustcov_results(fitted._results, 'HC1', use_self=True)
        cls.bse_rob = fitted.bse

        cls.corr_fact = cls.get_correction_factor(fitted, sub_kparams=False)
示例#39
0
    def setup_class(cls):
        """Set up a Poisson GLM fit with HC1 covariance.

        Stores the reference results (``cls.res2``), the fit
        (``cls.res1``), its standard errors (``cls.bse_rob``) and
        ``cls.corr_fact = sqrt(1 / (nobs / (nobs - 1)))``.
        """
        cls.res2 = results_st.results_poisson_hc1

        model = GLM(endog, exog, family=families.Poisson())
        fitted = model.fit(cov_type='HC1')
        cls.res1 = fitted
        cls.bse_rob = fitted.bse

        n_obs, _ = model.exog.shape
        hc1_ratio = (n_obs) / float(n_obs - 1.)
        # for bse we need sqrt of correction factor
        cls.corr_fact = np.sqrt(1. / hc1_ratio)
示例#40
0
    def _initialize(cls):
        """Fit a plain Poisson GLM and a GLMPenalized with zero weight.

        ``cls.res2`` is the unpenalized GLM fit; ``cls.res1`` is the
        ``GLMPenalized`` fit (bfgs) after setting ``pen_weight`` to 0.
        """
        endog_, exog_ = cls.y, cls.x

        cls.res2 = GLM(endog_, exog_, family=family.Poisson()).fit()

        penalized = GLMPenalized(endog_, exog_, family=family.Poisson(),
                                 penal=cls.penalty)
        # Zero weight switches the penalty contribution off.
        penalized.pen_weight = 0
        cls.res1 = penalized.fit(method='bfgs', maxiter=100, disp=0)

        cls.atol = 5e-6
示例#41
0
    def _initialize(cls):
        """Create the unpenalized reference and zero-weight penalized fits.

        Both models use ``cls.y`` / ``cls.x`` with a Poisson family. The
        penalized model has ``pen_weight`` set to 0 before fitting.
        """
        response = cls.y
        design = cls.x

        reference_model = GLM(response, design, family=family.Poisson())
        cls.res2 = reference_model.fit()

        penalized_model = GLMPenalized(response, design,
                                       family=family.Poisson(),
                                       penal=cls.penalty)
        penalized_model.pen_weight = 0
        fit_options = dict(method='bfgs', maxiter=100, disp=0)
        cls.res1 = penalized_model.fit(**fit_options)

        cls.atol = 5e-6
示例#42
0
    def setup_class(cls):
        """Build influence objects for a Binomial GLM and a Logit model.

        Both models are fit by Newton with ``tol=1e-10`` on ``data_bin``.
        ``cls.infl1`` comes from the GLM fit, ``cls.infl0`` from Logit.
        """
        from statsmodels.discrete.discrete_model import Logit

        frame = data_bin
        regressors = ['const', 'log_rate', 'log_volumne']

        glm_fit = GLM(frame['constrict'], frame[regressors],
                      family=families.Binomial()).fit(method="newton",
                                                      tol=1e-10)
        logit_fit = Logit(frame['constrict'], frame[regressors]).fit(
            method="newton", tol=1e-10)

        cls.infl1 = glm_fit.get_influence()
        cls.infl0 = logit_fit.get_influence()
示例#43
0
    def setup_class(cls):
        """Fit GLM with a string HAC kernel and OLS with the default one.

        The GLM fit passes ``kernel='bartlett'`` by name; the OLS fit
        passes only ``maxlags``.
        """
        cls.cov_type = 'HAC'

        # check kernel specified as string
        glm_options = dict(kernel='bartlett', maxlags=2)
        cls.res1 = GLM(endog, exog, family=families.Gaussian()).fit(
            cov_type='HAC', cov_kwds=glm_options)

        ols_options = dict(maxlags=2)
        cls.res2 = OLS(endog, exog).fit(cov_type='HAC',
                                        cov_kwds=ols_options)
示例#44
0
    def setup_class(cls):
        """Fit GLM with a callable uniform kernel and OLS with its name.

        The GLM fit passes ``sw.weights_uniform`` as the HAC kernel and
        the OLS fit passes the string ``'uniform'``, so the string form of
        the kernel option is exercised.
        """
        cls.cov_type = 'HAC'

        kwds = {'kernel': sw.weights_uniform, 'maxlags': 2}
        mod1 = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

        # check kernel as string
        mod2 = OLS(endog, exog)
        kwds2 = {'kernel': 'uniform', 'maxlags': 2}
        # Previously ``kwds`` was passed here, which left ``kwds2`` unused
        # and the string-kernel path untested.
        cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds2)
示例#45
0
    def setup_class(cls):
        """Fit a Poisson GLM and convert it to HC1 covariance in place.

        ``cls.corr_fact`` is ``sqrt(1 / (nobs / (nobs - 1)))`` with
        ``nobs`` taken from the model's design matrix.
        """
        cls.res2 = results_st.results_poisson_hc1

        model = GLM(endog, exog, family=families.Poisson())
        fitted = model.fit()
        cls.res1 = fitted

        # In-place update (use_self=True); must precede reading ``bse``.
        get_robustcov_results(fitted._results, 'HC1', use_self=True)
        cls.bse_rob = fitted.bse

        n_obs, _ = model.exog.shape
        ratio = (n_obs) / float(n_obs - 1.)
        # for bse we need sqrt of correction factor
        cls.corr_fact = np.sqrt(1. / ratio)
示例#46
0
    def setup_class(cls):
        """Set up a Poisson GLM whose covariance is switched to HC1.

        The fit uses the default covariance first; HC1 is then applied to
        the underlying results object with ``use_self=True``.
        """
        cls.res2 = results_st.results_poisson_hc1

        poisson_glm = GLM(endog, exog, family=families.Poisson())
        cls.res1 = poisson_glm.fit()

        raw_results = cls.res1._results
        get_robustcov_results(raw_results, 'HC1', use_self=True)
        cls.bse_rob = cls.res1.bse

        n_rows, _ = poisson_glm.exog.shape
        hc1_ratio = (n_rows) / float(n_rows - 1.)
        # for bse we need sqrt of correction factor
        cls.corr_fact = np.sqrt(1. / hc1_ratio)
    def setup_class(cls):
        """Fit GLM with a callable uniform kernel and OLS with its name.

        ``cls.res1`` is a Gaussian GLM fit using ``sw.weights_uniform`` as
        the HAC kernel; ``cls.res2`` is an OLS fit using the string
        ``'uniform'``.
        """
        cls.cov_type = 'HAC'

        kwds = {'kernel': sw.weights_uniform, 'maxlags': 2}
        mod1 = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

        # check kernel as string
        mod2 = OLS(endog, exog)
        kwds2 = {'kernel': 'uniform', 'maxlags': 2}
        # Use the string-kernel options; passing ``kwds`` here left
        # ``kwds2`` dead and skipped the check announced above.
        cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds2)
示例#48
0
    def setup_class(cls):
        """Build influence objects for a Poisson GLM and discrete Poisson.

        ``cls.infl0`` comes from the GLM fit (``attach_wls=True``) and
        ``cls.infl1`` from the discrete ``Poisson`` fit.
        """
        from statsmodels.discrete.discrete_model import Poisson

        frame = data_bin
        regressors = ['const', 'log_rate', 'log_volumne']

        glm_model = GLM(frame['constrict'], frame[regressors],
                        family=families.Poisson())
        glm_fit = glm_model.fit(attach_wls=True, atol=1e-10)

        discrete_model = Poisson(frame['constrict'], frame[regressors])
        discrete_fit = discrete_model.fit(tol=1e-10)

        cls.infl0 = glm_fit.get_influence()
        cls.infl1 = discrete_fit.get_influence()
示例#49
0
    def setup_class(cls):
        """Fit GLM and OLS with HAC covariance using a uniform kernel.

        ``cls.res3`` is an additional OLS fit that passes only
        ``maxlags``, kept for debugging.
        """
        cls.cov_type = 'HAC'

        uniform_options = dict(kernel=sw.weights_uniform, maxlags=2)

        glm_model = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = glm_model.fit(cov_type='HAC', cov_kwds=uniform_options)

        ols_model = OLS(endog, exog)
        cls.res2 = ols_model.fit(cov_type='HAC', cov_kwds=uniform_options)

        # for debugging
        cls.res3 = ols_model.fit(cov_type='HAC', cov_kwds=dict(maxlags=2))
示例#50
0
    def _initialize(cls):
        """Fit a reduced Binomial GLM and a penalized full Binomial model.

        ``cls.res2`` uses only the first ``cls.k_nonzero`` columns of
        ``cls.x``; ``cls.res1`` is a ``GLMPenalized`` fit on all columns
        with halved ``pen_weight`` and ``penal.tau = 0.05``.
        """
        response, design = cls.y, cls.x
        n_active = cls.k_nonzero

        reduced = GLM(response, design[:, :n_active],
                      family=family.Binomial())
        cls.res2 = reduced.fit(disp=0)

        penalized = GLMPenalized(response, design, family=family.Binomial(),
                                 penal=cls.penalty)
        penalized.pen_weight *= .5
        penalized.penal.tau = 0.05
        cls.res1 = penalized.fit(method='bfgs', maxiter=100, disp=0)

        # Selects the columns that the reduced model also contains.
        cls.exog_index = slice(None, cls.k_nonzero, None)

        cls.atol = 5e-3
示例#51
0
    def _initialize(cls):
        """Fit a reduced Poisson GLM and a penalized full Poisson model.

        ``cls.res2`` uses only the first ``cls.k_nonzero`` columns of
        ``cls.x``; ``cls.res1`` is a ``GLMPenalized`` fit on all columns
        with ``pen_weight`` scaled by 1.5 and ``penal.tau = 0.05``.
        """
        response, design = cls.y, cls.x
        n_active = cls.k_nonzero

        reduced = GLM(response, design[:, :n_active],
                      family=family.Poisson())
        cls.res2 = reduced.fit()

        penalized = GLMPenalized(response, design, family=family.Poisson(),
                                 penal=cls.penalty)
        penalized.pen_weight *= 1.5  # same as discrete Poisson
        penalized.penal.tau = 0.05
        cls.res1 = penalized.fit(method='bfgs', maxiter=100)

        cls.exog_index = slice(None, cls.k_nonzero, None)

        cls.atol = 5e-3
示例#52
0
    def setup_class(cls):
        """Fit a Poisson GLM, then apply cluster covariance in place.

        ``get_robustcov_results`` is called on the underlying results
        object with ``use_self=True`` and ``group`` as cluster labels.
        """
        cls.res2 = results_st.results_poisson_clu

        poisson_glm = GLM(endog, exog, family=families.Poisson())
        fitted = poisson_glm.fit()
        cls.res1 = fitted

        # In-place update; must happen before ``bse`` is read.
        get_robustcov_results(
            fitted._results, 'cluster',
            groups=group,
            use_correction=True,
            df_correction=True,  # TODO has no effect
            use_t=False,  # True,
            use_self=True,
        )
        cls.bse_rob = fitted.bse

        cls.corr_fact = cls.get_correction_factor(fitted)
示例#53
0
    def setup_class(cls):
        """Fit a Gaussian GLM and OLS with ``hac-panel`` covariance.

        The same options dict (groups as a pandas Series, uniform kernel,
        ``maxlags=2``) is passed to both fits.
        """
        cls.cov_type = 'hac-panel'

        # time index is just made up to have a test case
        panel_ids = np.repeat(np.arange(5), 7)[:-1]
        glm_model = GLM(endog.copy(), exog.copy(),
                        family=families.Gaussian())
        panel_options = {
            'groups': pd.Series(panel_ids),  # check for #3606
            'maxlags': 2,
            'kernel': sw.weights_uniform,
            'use_correction': 'hac',
            'df_correction': False,
        }
        cls.res1 = glm_model.fit(cov_type='hac-panel',
                                 cov_kwds=panel_options)

        ols_model = OLS(endog, exog)
        cls.res2 = ols_model.fit(cov_type='hac-panel',
                                 cov_kwds=panel_options)
示例#54
0
    def _initialize(cls):
        """Fit Binomial GLM and zero-weight GLMPenalized with an offset.

        Only the first four columns of ``cls.x`` are used. The penalized
        fit starts from 0.9 times the parameters of the plain GLM fit.
        """
        response = cls.y
        design = cls.x[:, :4]
        const_offset = -0.25 * np.ones(len(response))  # also check offset

        plain = GLM(response, design, family=family.Binomial(),
                    offset=const_offset)
        cls.res2 = plain.fit(method='bfgs', max_start_irls=100)

        penalized = GLMPenalized(response, design,
                                 family=family.Binomial(),
                                 offset=const_offset, penal=cls.penalty)
        penalized.pen_weight = 0
        scaled_start = cls.res2.params*0.9
        cls.res1 = penalized.fit(method='bfgs', max_start_irls=3,
                                 maxiter=100, disp=0,
                                 start_params=scaled_start)

        cls.atol = 1e-10
        cls.k_params = 4
示例#55
0
    def setup_class(cls):
        """Fit a Poisson GLM with frequency weights and HC0 covariance.

        ``cls.res1`` is the GLM fit on ``cpunish_data`` with
        ``freq_weights``; ``cls.res2`` is the stored reference
        ``res_stata.results_poisson_fweight_hc1``. ``cls.corr_fact`` is
        ``sqrt((wsum - 1) / wsum)`` where ``wsum`` is the weight total.
        """
        fweights = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3,
                             1, 1, 1, 2, 2, 2, 3, 3])
        wsum = fweights.sum()
        # The normalized "aweights" and ``nobs`` computed here before were
        # never used, so they are no longer built.
        cls.corr_fact = np.sqrt((wsum - 1.) / wsum)

        mod = GLM(cpunish_data.endog, cpunish_data.exog,
                  family=sm.families.Poisson(),
                  freq_weights=fweights)
        cls.res1 = mod.fit(cov_type='HC0')
        cls.res2 = res_stata.results_poisson_fweight_hc1
    def setup_class(cls):
        """Fit a Poisson GLM with cluster covariance via ``fit`` options.

        ``cls.corr_fact`` is ``sqrt((nobs - 1) / (nobs - k_params))``,
        with ``nobs`` from the design matrix and ``k_params`` from the
        fitted parameters.
        """
        cls.res2 = results_st.results_poisson_clu

        poisson_glm = GLM(endog, exog, family=families.Poisson())
        cluster_options = dict(groups=group,
                               use_correction=True,
                               df_correction=True)  # TODO has no effect
        fitted = poisson_glm.fit(cov_type='cluster',
                                 cov_kwds=cluster_options,
                                 use_t=False)  # True,
        cls.res1 = fitted
        cls.bse_rob = fitted.bse

        n_obs, _ = poisson_glm.exog.shape
        n_params = len(fitted.params)
        #n_groups = len(np.unique(group))
        ratio = (n_obs-1.) / float(n_obs - n_params)
        # for bse we need sqrt of correction factor
        cls.corr_fact = np.sqrt(ratio)
示例#57
0
    def setup_class(cls):
        """Fit a clustered Poisson GLM and clear ``normalized_cov_params``.

        After fitting, ``normalized_cov_params`` on the underlying
        results object is set to ``None`` before ``bse`` is read.
        """
        cls.res2 = results_st.results_poisson_clu

        poisson_glm = GLM(endog, exog, family=families.Poisson())
        cluster_options = dict(groups=group,
                               use_correction=True,
                               df_correction=True)  # TODO has no effect
        fitted = poisson_glm.fit(cov_type='cluster',
                                 cov_kwds=cluster_options,
                                 use_t=False)  # True,
        cls.res1 = fitted

        # The model results, t_test, ... should also work without
        # normalized_cov_params, see #2209
        # Note: we cannot set on the wrapper res1, we need res1._results
        fitted._results.normalized_cov_params = None

        cls.bse_rob = fitted.bse

        cls.corr_fact = cls.get_correction_factor(fitted)