Example #1
    def __init__(self):
        '''
        Test Negative Binomial family with canonical log link
        '''
        # Test Precision
        self.decimal_resid = DECIMAL_1
        self.decimal_params = DECIMAL_3
        self.decimal_resids = -1  # 1 % mismatch at 0
        self.decimal_fittedvalues = DECIMAL_1

        from statsmodels.datasets.committee import load
        self.data = load()
        self.data.exog[:, 2] = np.log(self.data.exog[:, 2])
        interaction = self.data.exog[:, 2] * self.data.exog[:, 1]
        self.data.exog = np.column_stack((self.data.exog, interaction))
        self.data.exog = add_constant(self.data.exog, prepend=False)
        self.res1 = GLM(self.data.endog,
                        self.data.exog,
                        family=sm.families.NegativeBinomial()).fit()
        from .results.results_glm import Committee
        res2 = Committee()
        res2.aic_R += 2  # They don't count a degree of freedom for the scale
        self.res2 = res2
Example #2
    def setup_class(cls):
        cls.res2 = results_st.results_poisson_clu
        mod = GLM(endog, exog, family=families.Poisson())
        cls.res1 = res1 = mod.fit(cov_type='cluster',
                                  cov_kwds=dict(groups=group,
                                                use_correction=True,
                                                df_correction=True),  #TODO has no effect
                                  use_t=False, #True,
                                                )

        # The model results, t_test, ... should also work without
        # normalized_cov_params, see #2209
        # Note: we cannot set on the wrapper res1, we need res1._results
        cls.res1._results.normalized_cov_params = None

        cls.bse_rob = cls.res1.bse

        nobs, k_vars = mod.exog.shape
        k_params = len(cls.res1.params)
        #n_groups = len(np.unique(group))
        corr_fact = (nobs-1.) / float(nobs - k_params)
        # for bse we need sqrt of correction factor
        cls.corr_fact = np.sqrt(corr_fact)
Example #3
    def __init__(self):
        # Test Precisions
        self.decimal_bic = DECIMAL_1
        self.decimal_aic_R = DECIMAL_1
        self.decimal_aic_Stata = DECIMAL_3
        self.decimal_loglike = DECIMAL_1
        self.decimal_resids = DECIMAL_3

        nobs = 100
        x = np.arange(nobs)
        np.random.seed(54321)
        y = 1.0 + 2.0 * x + x**2 + 0.1 * np.random.randn(nobs)
        self.X = np.c_[np.ones((nobs, 1)), x, x**2]
        self.y_inv = (1. + .02 * x +
                      .001 * x**2)**-1 + .001 * np.random.randn(nobs)
        InverseLink_Model = GLM(self.y_inv,
                                self.X,
                                family=sm.families.Gaussian(
                                    sm.families.links.inverse_power))
        InverseLink_Res = InverseLink_Model.fit()
        self.res1 = InverseLink_Res
        from .results.results_glm import GaussianInverse
        self.res2 = GaussianInverse()
Example #4
    def setup_class(cls):
        fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
        # faking aweights by using normalized freq_weights
        fweights = np.array(fweights)
        wsum = fweights.sum()
        nobs = len(cpunish_data.endog)
        aweights = fweights / wsum * nobs

        gid = np.arange(1, 17 + 1) // 2
        n_groups = len(np.unique(gid))

        # no wnobs yet in sandwich covariance calculation
        cls.corr_fact = 1 / np.sqrt(n_groups / (n_groups - 1))
        # np.sqrt((wsum - 1.) / wsum)
        cov_kwds = {'groups': gid, 'use_correction': False}
        with pytest.warns(None):
            mod = GLM(cpunish_data.endog,
                      cpunish_data.exog,
                      family=sm.families.Poisson(),
                      freq_weights=fweights)
            cls.res1 = mod.fit(cov_type='cluster', cov_kwds=cov_kwds)

        cls.res2 = res_stata.results_poisson_fweight_clu1
Example #5
    def _initialize(cls):
        y, x = cls.y, cls.x
        # adding 10 to avoid strict rtol at predicted values close to zero
        y = y + 10
        cov_type = 'HC0'
        modp = GLM(y, x[:, :cls.k_nonzero], family=family.Gaussian())
        cls.res2 = modp.fit(cov_type=cov_type,
                            method='bfgs',
                            maxiter=100,
                            disp=0)

        mod = GLMPenalized(y, x, family=family.Gaussian(), penal=cls.penalty)
        mod.pen_weight *= 1.5  # same as discrete Poisson
        mod.penal.tau = 0.05
        cls.res1 = mod.fit(cov_type=cov_type,
                           method='bfgs',
                           maxiter=100,
                           disp=0)

        cls.exog_index = slice(None, cls.k_nonzero, None)

        cls.atol = 5e-6
        cls.rtol = 1e-6
Example #6
    def setup_class(cls):
        cls.idx = slice(None)  # params sequence same as Stata
        #res1ul = Logit(data.endog, data.exog).fit(method="newton", disp=0)
        cls.res2 = reslogit.results_constraint2_robust

        mod1 = GLM(spector_data.endog, spector_data.exog,
                   family=families.Binomial())

        # not used to match Stata for HC
        # nobs, k_params = mod1.exog.shape
        # k_params -= 1   # one constraint
        cov_type = 'HC0'
        cov_kwds = {'scaling_factor': 32/31}
        # looks like nobs / (nobs - 1) and not (nobs - 1.) / (nobs - k_params)
        constr = 'x1 - x3 = 0'
        cls.res1m = mod1.fit_constrained(constr, cov_type=cov_type,
                                         cov_kwds=cov_kwds, atol=1e-10)

        R, q = cls.res1m.constraints.coefs, cls.res1m.constraints.constants
        cls.res1 = fit_constrained(mod1, R, q, fit_kwds={'atol': 1e-10,
                                                         'cov_type': cov_type,
                                                         'cov_kwds': cov_kwds})
        cls.constraints_rq = (R, q)
Example #7
    def _get_intercept_stats(self, add_slopes=True):
        # start with mean and variance of Y on the link scale
        mod = GLM(
            endog=self.model.response.data,
            exog=np.repeat(1, len(self.model.response.data)),
            family=self.model.family.smfamily(self.model.family.smlink),
            missing="drop" if self.model.dropna else "none",
        ).fit()
        mu = mod.params
        # multiply SE by sqrt(N) to turn it into (approx.) sigma(Y) on link scale
        sigma = (mod.cov_params()[0] * len(mod.mu))**0.5

        # modify mu and sigma based on means and sigmas of slope priors.
        if len(self.model.common_terms) > 1 and add_slopes:
            means = np.array([x["mu"] for x in self.priors.values()])
            sigmas = np.array([x["sigma"] for x in self.priors.values()])
            # add to intercept prior
            index = list(self.priors.keys())
            mu -= np.dot(means, self.stats["mean_x"][index])
            sigma = (sigma**2 +
                     np.dot(sigmas**2, self.stats["mean_x"][index]**2))**0.5

        return mu, sigma
Example #8
    def setupClass(cls):
        self = cls  # alias

        fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
        # faking aweights by using normalized freq_weights
        fweights = np.array(fweights)
        wsum = fweights.sum()
        nobs = len(cpunish_data.endog)
        aweights = fweights / wsum * nobs

        # This is really close when corr_fact = (wsum - 1.) / wsum, but to
        # avoid having to loosen the precision of the assert_allclose, I'm
        # doing this manually. It's *possible* that lowering the IRLS
        # convergence criterion in Stata and here would make this less sketchy.
        self.corr_fact = np.sqrt((wsum - 1.) / wsum) * 0.98518473599905609
        self.res1 = GLM(
            cpunish_data.endog,
            cpunish_data.exog,
            family=sm.families.Poisson(),
            var_weights=aweights).fit(
                cov_type='HC0')  #, cov_kwds={'use_correction':False})
        # compare with discrete, start close to save time
        # modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
        self.res2 = res_stata.results_poisson_aweight_hc1
Example #9
def test_calc_wdesign_mat():

    # separately tests that _calc_wdesign_mat
    # returns sensible results
    #
    # regression test

    np.random.seed(435265)
    X = np.random.normal(size=(3, 3))
    y = np.random.randint(0, 2, size=3)
    beta = np.random.normal(size=3)
    mod = OLS(y, X)
    dmat = _calc_wdesign_mat(mod, beta, {})
    assert_allclose(dmat, np.array([[1.306314, -0.024897, 1.326498],
                                    [-0.539219, -0.483028, -0.703503],
                                    [-3.327987, 0.524541, -0.139761]]),
                    atol=1e-6, rtol=0)

    mod = GLM(y, X, family=Binomial())
    dmat = _calc_wdesign_mat(mod, beta, {})
    assert_allclose(dmat, np.array([[0.408616, -0.007788, 0.41493],
                                    [-0.263292, -0.235854, -0.343509],
                                    [-0.11241, 0.017718, -0.004721]]),
                    atol=1e-6, rtol=0)
Example #10
def test_influence_glm_bernoulli():
    # example uses Finney's data and is used in Pregibon 1981

    df = data_bin
    results_sas = np.asarray(results_sas_df)

    res = GLM(df['constrict'],
              df[['const', 'log_rate', 'log_volumne']],
              family=families.Binomial()).fit(attach_wls=True, atol=1e-10)

    infl = res.get_influence(observed=False)

    k_vars = 3
    assert_allclose(infl.dfbetas, results_sas[:, 5:8], atol=1e-4)
    assert_allclose(infl.d_params,
                    results_sas[:, 5:8] * res.bse.values,
                    atol=1e-4)
    assert_allclose(infl.cooks_distance[0] * k_vars,
                    results_sas[:, 8],
                    atol=6e-5)
    assert_allclose(infl.hat_matrix_diag, results_sas[:, 4], atol=6e-5)

    c_bar = infl.cooks_distance[0] * 3 * (1 - infl.hat_matrix_diag)
    assert_allclose(c_bar, results_sas[:, 9], atol=6e-5)
Example #11
    def setupClass(cls):
        self = cls  # alias

        fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
        # faking aweights by using normalized freq_weights
        fweights = np.array(fweights)
        wsum = fweights.sum()
        nobs = len(cpunish_data.endog)
        aweights = fweights / wsum * nobs

        self.res1 = GLM(cpunish_data.endog,
                        cpunish_data.exog,
                        family=sm.families.Poisson(),
                        var_weights=aweights).fit()
        # compare with discrete, start close to save time
        modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)

        # Need to copy to avoid inplace adjustment
        from copy import copy
        self.res2 = copy(res_stata.results_poisson_aweight_nonrobust)
        self.res2.resids = self.res2.resids.copy()

        # Need to adjust resids for pearson and deviance to add weights
        self.res2.resids[:, 3:5] *= np.sqrt(aweights[:, np.newaxis])
Example #12
    def setup_class(cls):
        cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                       family=sm.families.Poisson()).fit()

        cls.res2 = res_stata.results_poisson_none_nonrobust
Example #13
plt.rc("figure", figsize=(16, 8))
plt.rc("font", size=14)

import statsmodels.stats.tests.test_influence

test_module = statsmodels.stats.tests.test_influence.__file__
cur_dir = os.path.abspath(os.path.dirname(test_module))

file_name = "binary_constrict.csv"
file_path = os.path.join(cur_dir, "results", file_name)
df = pd.read_csv(file_path, index_col=0)

res = GLM(
    df["constrict"],
    df[["const", "log_rate", "log_volumne"]],
    family=families.Binomial(),
).fit(attach_wls=True, atol=1e-10)
print(res.summary())

# ## get the influence measures
#
# GLMResults has a `get_influence` method similar to OLSResults, which
# returns an instance of the GLMInfluence class. This class has methods and
# (cached) attributes to inspect influence and outlier measures.
#
# These measures are based on a one-step approximation to the results
# for deleting one observation. One-step approximations are usually accurate
# for small changes but underestimate the magnitude of large changes. Even
# though large changes are underestimated, they still clearly show the
# effect of influential observations.
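#
# A minimal usage sketch (assuming the `summary_frame` method and cached
# attributes of GLMInfluence in current statsmodels): collect the
# per-observation measures and list the points with the largest Cook's
# distance.

infl = res.get_influence(observed=False)
summ_df = infl.summary_frame()
print(summ_df.sort_values("cooks_d", ascending=False).head())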
Example #14
    def fit_scores(self, balance=True, nmodels=None):
        """
        Fits logistic regression model(s) used for
        generating propensity scores

        Parameters
        ----------
        balance : bool
            Should balanced datasets be used?
            (n_control == n_test)
        nmodels : int
            How many models should be fit?
            Score becomes the average of the <nmodels> models if nmodels > 1

        Returns
        -------
        None
        """
        # reset models if refitting
        if len(self.models) > 0:
            self.models = []
        if len(self.model_accuracy) > 0:
            self.model_accuracy = []
        if not self.formula:
            # use all columns in the model
            self.formula = "{} ~ {}".format(self.yvar, "+".join(self.xvars))
        if balance:
            if nmodels is None:
                # fit multiple models based on imbalance severity (rounded up to the nearest ten)
                minor, major = [
                    self.data[self.data[self.yvar] == i]
                    for i in (self.minority, self.majority)
                ]
                nmodels = int(np.ceil((len(major) / len(minor)) / 10) * 10)
            self.nmodels = nmodels
            i = 0
            errors = 0
            while i < nmodels and errors < 5:
                uf_progress(i + 1,
                            nmodels,
                            prestr="Fitting Models on Balanced Samples")
                # sample from majority to create a balanced dataset
                df = self.balanced_sample()
                df = pd.concat(
                    [
                        uf_drop_static_cols(df[df[self.yvar] == 1],
                                            yvar=self.yvar),
                        uf_drop_static_cols(df[df[self.yvar] == 0],
                                            yvar=self.yvar),
                    ],
                    sort=True,
                )
                y_samp, X_samp = patsy.dmatrices(self.formula,
                                                 data=df,
                                                 return_type="dataframe")
                X_samp.drop(self.yvar, axis=1, errors="ignore", inplace=True)
                # print("y_samp:",y_samp)
                # print("X_samp:",X_samp)
                glm = GLM(y_samp, X_samp, family=sm.families.Binomial())

                try:
                    res = glm.fit()
                    # print("GLM", res.summary())
                    self.model_accuracy.append(
                        self._scores_to_accuracy(res, X_samp, y_samp))
                    self.models.append(res)
                    i = i + 1
                except Exception as e:
                    errors = (
                        errors + 1
                    )  # to avoid infinite loop for misspecified matrix
                    print("Error: {}".format(e))
            print(
                "\nAverage Accuracy:",
                "{}%".format(round(np.mean(self.model_accuracy) * 100, 2)),
            )
        else:
            # ignore any imbalance and fit one model
            print("Fitting 1 (Unbalanced) Model...")
            # print("self.y", self.y)
            # print("self.X", self.X)
            glm = GLM(self.y, self.X, family=sm.families.Binomial())
            res = glm.fit()
            self.model_accuracy.append(
                self._scores_to_accuracy(res, self.X, self.y))
            self.models.append(res)
            print("\nAccuracy",
                  round(np.mean(self.model_accuracy[0]) * 100, 2))
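Example #15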
    def test_glm(self):
        # preliminary, getting started with a basic test for GLM.get_prediction
        from statsmodels.genmod.generalized_linear_model import GLM

        res_wls = self.res_wls
        mod_wls = res_wls.model
        y, X, wi = mod_wls.endog, mod_wls.exog, mod_wls.weights

        w_sqrt = np.sqrt(wi)  # notation wi is weights, `w` is var
        mod_glm = GLM(y * w_sqrt, X * w_sqrt[:, None])

        # compare using t distribution
        res_glm = mod_glm.fit(use_t=True)
        pred_glm = res_glm.get_prediction()
        sf_glm = pred_glm.summary_frame()

        pred_res_wls = res_wls.get_prediction()
        sf_wls = pred_res_wls.summary_frame()
        n_compare = 30  # in glm with predict wendog
        assert_allclose(sf_glm.values[:n_compare],
                        sf_wls.values[:n_compare, :4])

        # compare using normal distribution

        res_glm = mod_glm.fit()  # default use_t=False
        pred_glm = res_glm.get_prediction()
        sf_glm = pred_glm.summary_frame()

        res_wls = mod_wls.fit(use_t=False)
        pred_res_wls = res_wls.get_prediction()
        sf_wls = pred_res_wls.summary_frame()
        assert_allclose(sf_glm.values[:n_compare],
                        sf_wls.values[:n_compare, :4])

        # function for parameter transformation
        # should be separate test method
        from statsmodels.genmod._prediction import params_transform_univariate
        rates = params_transform_univariate(res_glm.params,
                                            res_glm.cov_params())

        rates2 = np.column_stack(
            (np.exp(res_glm.params), res_glm.bse * np.exp(res_glm.params),
             np.exp(res_glm.conf_int())))
        assert_allclose(rates.summary_frame().values, rates2, rtol=1e-13)

        from statsmodels.genmod.families import links

        # with identity transform
        pt = params_transform_univariate(res_glm.params,
                                         res_glm.cov_params(),
                                         link=links.identity())

        assert_allclose(pt.tvalues, res_glm.tvalues, rtol=1e-13)
        assert_allclose(pt.se_mean, res_glm.bse, rtol=1e-13)
        ptt = pt.t_test()
        assert_allclose(ptt[0], res_glm.tvalues, rtol=1e-13)
        assert_allclose(ptt[1], res_glm.pvalues, rtol=1e-13)

        # prediction with exog and no weights does not error
        res_glm = mod_glm.fit()
        pred_glm = res_glm.get_prediction(X)

        # check that list works, issue 4437
        x = res_glm.model.exog.mean(0)
        pred_res3 = res_glm.get_prediction(x)
        ci3 = pred_res3.conf_int()
        pred_res3b = res_glm.get_prediction(x.tolist())
        ci3b = pred_res3b.conf_int()
        assert_allclose(pred_res3b.se_mean, pred_res3.se_mean, rtol=1e-13)
        assert_allclose(ci3b, ci3, rtol=1e-13)
        res_df = pred_res3b.summary_frame()
        assert_equal(res_df.index.values, [0])

        x = res_glm.model.exog[-2:]
        pred_res3 = res_glm.get_prediction(x)
        ci3 = pred_res3.conf_int()
        pred_res3b = res_glm.get_prediction(x.tolist())
        ci3b = pred_res3b.conf_int()
        assert_allclose(pred_res3b.se_mean, pred_res3.se_mean, rtol=1e-13)
        assert_allclose(ci3b, ci3, rtol=1e-13)
        res_df = pred_res3b.summary_frame()
        assert_equal(res_df.index.values, [0, 1])
Example #16
    plt.legend(loc='upper left')
    plt.title('gam.GAM Poisson')

    counter = 2
    for ii, xx in zip(['z', 'x1', 'x2'], [z, x[:, 0], x[:, 1]]):
        sortidx = np.argsort(xx)
        #plt.figure()
        plt.subplot(2, 2, counter)
        plt.plot(xx[sortidx], p[sortidx], 'k.', alpha=0.5)
        plt.plot(xx[sortidx], yp[sortidx], 'b.', label='true')
        plt.plot(xx[sortidx], y_pred[sortidx], 'r.', label='GAM')
        plt.legend(loc='upper left')
        plt.title('gam.GAM Poisson ' + ii)
        counter += 1

    res = GLM(p, exog_reduced, family=f).fit()

    #plot component, compared to true component
    x1 = x[:, 0]
    x2 = x[:, 1]
    f1 = exog[:, :order + 1].sum(1) - 1  #take out constant
    f2 = exog[:, order + 1:].sum(1) - 1
    plt.figure()
    # Note: need to correct for the constant, which is indeterminately distributed
    #plt.plot(x1, m.smoothers[0](x1)-m.smoothers[0].params[0]+1, 'r')
    #better would be subtract f(0) m.smoothers[0](np.array([0]))
    plt.plot(x1, f1, linewidth=2)
    plt.plot(x1, m.smoothers[0](x1) - m.smoothers[0].params[0], 'r')

    plt.figure()
    plt.plot(x2, f2, linewidth=2)
Example #17
    def init(cls):
        cls.res2 = cls.mod2.fit()
        mod = GLM(cls.endog, cls.exog)
        mod.exog_names[:] = ['const', 'x1', 'x2', 'x3', 'x4']
        cls.res1 = mod.fit_constrained('x1=0.5')
Example #18
results_oos = m.predict(points=X_test[:, 0:2], P=X_test[:, 2:],
                        exog_scale=m.exog_scale, exog_resid=m.exog_resid)
rmse_oos = np.sqrt(np.mean((y_test - results_oos.predictions)**2))
lik_oos = np.sum(poisson(x=y_test,
                         mu=np.clip(results_oos.predictions, a_min=0.0001, a_max=None)))
row = pd.Series({'RMSE_IS': rmse_is, 'LIK_IS': lik_is,
                 'RMSE_OOS': rmse_oos, 'LIK_OOS': lik_oos}, name='GWR (count)')
results = results.append(row)


# =============================================================================
# Linear Kriging with count features
# =============================================================================

X_train, X_test, _, y_train, y_test, _, _, _, _, _ = data_pipeline(feature_engineering=True,
                                                                   feature_type='count')

m = GLM(endog=y_train.reshape((-1,)), exog=X_train[:, 2:],
        family=Poisson(link=sm.genmod.families.links.log))
results = m.fit()


res = y_train.reshape((-1,)) - results.fittedvalues
#
kernel = ConstantKernel() * RBF(10, (1e-2, 1e2)) + WhiteKernel()
gp = GaussianProcessRegressor(kernel=kernel,
                              n_restarts_optimizer=1)
gp.fit(X_train[:, 0:2], res)
#
pred_oos = m.predict(exog=X_test[:, 2:], params=results.params) + gp.predict(X_test[:, 0:2])
pred_is = m.predict(exog=X_train[:, 2:], params=results.params) + gp.predict(X_train[:, 0:2])
#
rmse_oos = np.sqrt(np.mean((pred_oos - y_test.reshape((-1,)))**2))
lik_oos = np.sum(poisson(x=y_test.reshape((-1,)),
                         mu=np.clip(pred_oos, a_min=0.0001, a_max=None)))
Example #19
    def _scale_group_specific(self, term):

        # these default priors are only defined for HalfNormal priors
        if term.prior.args["sigma"].name != "HalfNormal":
            return

        sigma_corr = term.prior.scale

        # recreate the corresponding common effect data
        fix_data = term.data.sum(axis=1)

        # handle intercepts and cell means
        if term.constant:
            _, sigma = self._get_intercept_stats()
            sigma *= sigma_corr
        # handle slopes
        else:
            exists = [
                x for x in self.dm.columns  # pylint: disable=not-an-iterable
                if np.array_equal(fix_data, self.dm[x].values)
            ]
            # handle case where there IS a corresponding common effect
            if exists and exists[0] in self.priors.keys():
                sigma = self.priors[exists[0]]["sigma"]
            # handle case where there IS NOT a corresponding common effect
            else:
                # the usual case: add the group specific effect data as a common effect
                # in the design matrix
                if not exists:
                    fix_dataframe = pd.DataFrame(fix_data)
                    # things break if column names are integers (the default)
                    fix_dataframe.rename(
                        columns={
                            c: "_" + str(c)
                            for c in fix_dataframe.columns  # pylint: disable=not-an-iterable
                        },
                        inplace=True,
                    )
                    exog = self.dm.join(fix_dataframe)
                # this handles the corner case where there technically is the
                # corresponding common effect, but the parameterization differs
                # between the common- and group-specific-effect specification. usually
                # this means the common effects use cell-means coding but the
                # group specific effects use k-1 coding
                else:
                    group = term.name.split("|")[1]
                    exog = self.model.group_specific_terms.values()
                    exog = [
                        v.data.sum(1) for v in exog
                        if v.name.split("|")[-1] == group
                    ]
                    index = ["_" + str(i) for i in range(len(exog))]
                    exog = pd.DataFrame(exog, index=index).T
                # this will replace self.mle (which is missing predictors)
                missing = "drop" if self.model.dropna else "none"
                full_mod = GLM(
                    endog=self.model.response.data,
                    exog=exog,
                    family=self.model.family.smfamily(),
                    missing=missing,
                ).fit()
                sigma = self._get_slope_stats(exog=exog,
                                              predictor=fix_data,
                                              full_mod=full_mod,
                                              sigma_corr=sigma_corr)

        # set the prior sigma.
        term.prior.args["sigma"].update(sigma=np.squeeze(np.atleast_1d(sigma)))
def local_fdr(zscores, null_proportion=1.0, null_pdf=None, deg=7, nbins=30):
    """
    Calculate local FDR values for a list of Z-scores.

    Parameters
    ----------
    zscores : array-like
        A vector of Z-scores
    null_proportion : float
        The assumed proportion of true null hypotheses
    null_pdf : function mapping reals to positive reals
        The density of null Z-scores; if None, use standard normal
    deg : integer
        The maximum exponent in the polynomial expansion of the
        density of non-null Z-scores
    nbins : integer
        The number of bins for estimating the marginal density
        of Z-scores.

    Returns
    -------
    fdr : array-like
        A vector of FDR values

    References
    ----------
    B Efron (2008).  Microarrays, Empirical Bayes, and the Two-Groups
    Model.  Statistical Science 23:1, 1-22.

    Examples
    --------
    Basic use (the null Z-scores are taken to be standard normal):

    >>> from statsmodels.stats.multitest import local_fdr
    >>> import numpy as np
    >>> zscores = np.random.randn(30)
    >>> fdr = local_fdr(zscores)

    Use a Gaussian null distribution estimated from the data:

    >>> null = EmpiricalNull(zscores)
    >>> fdr = local_fdr(zscores, null_pdf=null.pdf)
    """

    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod.generalized_linear_model import families
    from statsmodels.regression.linear_model import OLS

    # Bins for Poisson modeling of the marginal Z-score density
    minz = min(zscores)
    maxz = max(zscores)
    bins = np.linspace(minz, maxz, nbins)

    # Bin counts
    zhist = np.histogram(zscores, bins)[0]

    # Bin centers
    zbins = (bins[:-1] + bins[1:]) / 2

    # The design matrix at bin centers
    dmat = np.vander(zbins, deg + 1)

    # Use this to get starting values for Poisson regression
    md = OLS(np.log(1 + zhist), dmat).fit()

    # Poisson regression
    md = GLM(zhist, dmat,
             family=families.Poisson()).fit(start_params=md.params)

    # The design matrix for all Z-scores
    dmat_full = np.vander(zscores, deg + 1)

    # The height of the estimated marginal density of Z-scores,
    # evaluated at every observed Z-score.
    fz = md.predict(dmat_full) / (len(zscores) * (bins[1] - bins[0]))

    # The null density.
    if null_pdf is None:
        f0 = np.exp(-0.5 * zscores**2) / np.sqrt(2 * np.pi)
    else:
        f0 = null_pdf(zscores)

    # The local FDR values
    fdr = null_proportion * f0 / fz

    fdr = np.clip(fdr, 0, 1)

    return fdr
Example #21
    def fit(self, flow_df, relevance_column=constants.RELEVANCE):
        """
        Fit the gravity model parameters to the flows in `flow_df`.
        Can fit globally or singly constrained gravity models using a
        Generalized Linear Model (GLM) with a Poisson regression.

        Parameters
        ----------
        flow_df  :  FlowDataFrame where the flows are stored and with info about the spatial tessellation.
            In addition to the default columns, the spatial tessellation must contain the column
            "relevance": float, number of opportunities at the location
                (e.g., population or total number of visits).

        Returns
        -------

        X  :  list of independent variables (features) used in the GLM fit.

        y  :  list of dependent variables (flows) used in the GLM fit.

        poisson_results  :  statsmodels.genmod.generalized_linear_model.GLMResultsWrapper
            statsmodels object with information on the fit's quality and predictions.

        References
        ----------

        .. [1] Agresti, Alan.
            "Categorical data analysis."
            Vol. 482. John Wiley & Sons, 2003.

        .. [2] Flowerdew, Robin, and Murray Aitkin.
            "A method of fitting the gravity model based on the Poisson distribution."
            Journal of regional science 22.2 (1982): 191-202.

        """
        self.lats_lngs = flow_df.tessellation.geometry.apply(utils.get_geom_centroid, args=[True]).values
        self.weights = flow_df.tessellation[relevance_column].fillna(0).values
        self.tileid2index = dict(
            [(tileid, i) for i, tileid in enumerate(flow_df.tessellation[constants.TILE_ID].values)])

        self.X, self.y = [], []  # independent (X) and dependent (y) variables

        # flow_df.progress_apply(lambda flow_example: self._update_training_set(flow_example),
        #                        axis=1)
        flow_df.apply(lambda flow_example: self._update_training_set(flow_example), axis=1)

        # Perform GLM fit
        poisson_model = GLM(self.y, self.X, family=sm.genmod.families.family.Poisson(link=sm.genmod.families.links.log))
        poisson_results = poisson_model.fit()

        # Set best fit parameters
        if self._gravity_type == 'globally constrained':
            self._origin_exp = poisson_results.params[1]
            self._destination_exp = poisson_results.params[2]
            self._deterrence_func_args = [poisson_results.params[3]]
        else:  # if singly constrained
            self._origin_exp = 1.
            self._destination_exp = poisson_results.params[-2]
            self._deterrence_func_args = [poisson_results.params[-1]]

        # we delete the instance variables we do not need anymore
        del self.X
        del self.y
Example #22
from numpy.testing import assert_allclose

assert_allclose(pred_res2.se_obs, prstd, rtol=1e-13)
assert_allclose(ci2, np.column_stack((iv_l, iv_u)), rtol=1e-13)

print(pred_res2.summary_frame().head())

res_wls_n = mod_wls.fit(use_t=False)
pred_wls_n = res_wls_n.get_prediction()
print(pred_wls_n.summary_frame().head())

from statsmodels.genmod.generalized_linear_model import GLM

w_sqrt = np.sqrt(w)
mod_glm = GLM(y / w_sqrt, X / w_sqrt[:, None])
res_glm = mod_glm.fit()
pred_glm = res_glm.get_prediction()
print(pred_glm.summary_frame().head())

res_glm_t = mod_glm.fit(use_t=True)
pred_glm_t = res_glm_t.get_prediction()
print(pred_glm_t.summary_frame().head())

rates = params_transform_univariate(res_glm.params, res_glm.cov_params())
print('\nRates exp(params)')
print(rates.summary_frame())

rates2 = np.column_stack(
    (np.exp(res_glm.params), res_glm.bse * np.exp(res_glm.params),
     np.exp(res_glm.conf_int())))
Example #23
    def mod(y, x):
        return GLM(y, x, family=families.Binomial())
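Example #24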
U_Const = statsmodels.tools.add_constant(U)

# In[85]:

from statsmodels.discrete.discrete_model import Poisson

mpr = Poisson(V, U_Const)
res_mpr = mpr.fit()

# In[93]:

from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families

mod = GLM(V, U_Const, family=families.Poisson())
res = mod.fit()
print(res.summary())

# ### Overdispersion

# In[95]:

# Overdispersion

print(res.pearson_chi2 / res.df_resid)

# #### The ratio pearson chi2 / residual deviance is clearly greater than 1, which indicates overdispersion

# ### Frequency of zeros in the data
Example #25
    def __init__(self):
        self.setup_class()  # why does nose do it properly
        from statsmodels.genmod.generalized_linear_model import GLM
        from statsmodels.genmod import families
        self.mod = lambda y, x: GLM(y, x, family=families.Binomial())
        self.y = self.y_bin
Example #26
import pandas as pd

from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families

import statsmodels.stats.tests.test_influence
test_module = statsmodels.stats.tests.test_influence.__file__
cur_dir = os.path.abspath(os.path.dirname(test_module))

file_name = 'binary_constrict.csv'
file_path = os.path.join(cur_dir, 'results', file_name)
df = pd.read_csv(file_path, index_col=0)

res = GLM(
    df['constrict'],
    df[['const', 'log_rate', 'log_volumne']],
    family=families.Binomial()).fit(
        attach_wls=True, atol=1e-10)
print(res.summary())

# ## get the influence measures
#
# GLMResults has a `get_influence` method similar to OLSResults, which
# returns an instance of the GLMInfluence class. This class has methods and
# (cached) attributes to inspect influence and outlier measures.
#
# These measures are based on a one-step approximation to the results
# for deleting one observation. One-step approximations are usually accurate
# for small changes but underestimate the magnitude of large changes. Even
# though large changes are underestimated, they still clearly show the
# effect of influential observations.
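#
# A minimal usage sketch (assuming the plotting helpers of GLMInfluence in
# current statsmodels): index plot of Cook's distance, flagging observations
# above twice the mean value.

infl = res.get_influence(observed=False)
fig = infl.plot_index(y_var="cooks", threshold=2 * infl.cooks_distance[0].mean())
fig.tight_layout(pad=1.0)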
Example #27
    def _get_slope_stats(self,
                         exog,
                         predictor,
                         sigma_corr,
                         full_mod=None,
                         points=4):
        """
        Parameters
        ----------
            full_mod : statsmodels.genmod.generalized_linear_model.GLM
                Statsmodels GLM to replace MLE model. For when ``'predictor'`` is not in the common
                part of the model.
            points : int
                Number of points to use for LL approximation.
        """

        if full_mod is None:
            full_mod = self.mle

        # figure out which column of exog to drop for the null model
        keeps = [
            i for i, x in enumerate(list(exog.columns))
            if not np.array_equal(predictor, exog[x].values.flatten())
        ]
        i = [x for x in range(exog.shape[1]) if x not in keeps][0]

        # get log-likelihood values from beta=0 to beta=MLE
        values = np.linspace(0.0, full_mod.params[i], points)
        # if there are multiple predictors, use statsmodels to optimize the LL
        if keeps:
            null = [
                GLM(endog=self.model.response.data,
                    exog=exog,
                    family=self.model.family.smfamily()).fit_constrained(
                        str(exog.columns[i]) + "=" + str(val))
                for val in values[:-1]
            ]
            null = np.append(null, full_mod)
            log_likelihood = np.array([x.llf for x in null])
        # if just a single predictor, use statsmodels to evaluate the LL
        else:
            null = [
                self.model.family.smfamily().loglike(
                    np.squeeze(self.model.response.data), val * predictor)
                for val in values[:-1]
            ]
            log_likelihood = np.append(null, full_mod.llf)

        # compute params of quartic approximation to log-likelihood
        # c: intercept, d: shift parameter
        # a: quartic coefficient, b: quadratic coefficient

        intercept, shift_parameter = log_likelihood[-1], -(
            full_mod.params[i].item())
        X = np.array([(values + shift_parameter)**4,
                      (values + shift_parameter)**2]).T
        coef_a, coef_b = np.squeeze(
            np.linalg.multi_dot([
                np.linalg.inv(np.dot(X.T, X)), X.T,
                (log_likelihood[:, None] - intercept)
            ]))

        # m, v: mean and variance of beta distribution of correlations
        # p, q: corresponding shape parameters of beta distribution
        mean = 0.5
        variance = sigma_corr**2 / 4
        p = mean * (mean * (1 - mean) / variance - 1)
        q = (1 - mean) * (mean * (1 - mean) / variance - 1)
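        # (method of moments for a Beta(p, q) distribution: given mean m and
        #  variance v, p = m * (m * (1 - m) / v - 1) and
        #  q = (1 - m) * (m * (1 - m) / v - 1); here m = 0.5, v = sigma_corr**2 / 4)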

        # function to return central moments of rescaled beta distribution
        def moment(k):
            return (2 * p / (p + q))**k * hyp2f1(p, -k, p + q, (p + q) / p)

        # evaluate the derivatives of beta = f(correlation).
        # dict 'point' gives points about which to Taylor expand. We want to
        # expand about the mean (generally 0), but some of the derivatives
        # do not exist at 0. Evaluating at a point very close to 0 (e.g., .001)
        # generally gives good results, but the higher order the expansion, the
        # further from 0 we need to evaluate the derivatives, or they blow up.
        point = dict(zip(range(1, 14), 2**np.linspace(-1, 5, 13) / 100))
        vals = dict(a=coef_a,
                    b=coef_b,
                    n=len(self.model.response.data),
                    r=point[self.taylor])
        _deriv = [eval(x, globals(), vals) for x in self.deriv]  # pylint: disable=eval-used

        # compute and return the approximate sigma
        def term(i, j):
            return (1 / np.math.factorial(i) * 1 / np.math.factorial(j) *
                    _deriv[i] * _deriv[j] *
                    (moment(i + j) - moment(i) * moment(j)))

        terms = [
            term(i, j) for i in range(1, self.taylor + 1)
            for j in range(1, self.taylor + 1)
        ]
        return np.array(terms).sum()**0.5
Example #28
    def setup_class(cls):
        cls.res2 = results_st.results_poisson_clu
        mod = smd.Poisson(endog, exog)
        mod = GLM(endog, exog, family=families.Poisson())
        cls.res1 = mod.fit()
        cls.get_robust_clu()
Example #29
    def __init__(self):
        from results.results_glm import Lbw
        self.res2 = Lbw()
        self.res1 = GLM(self.res2.endog, self.res2.exog,
                        family=sm.families.Binomial()).fit()
Example #30
    def init(cls):
        cov_type = 'HC0'
        cls.res2 = cls.mod2.fit(cov_type=cov_type)
        mod = GLM(cls.endog, cls.exog, var_weights=cls.aweights)
        mod.exog_names[:] = ['const', 'x1', 'x2', 'x3', 'x4']
        cls.res1 = mod.fit_constrained('x1=0.5', cov_type=cov_type)