Example #1
    def __init__(self, endog, exog, exog_infl=None, offset=None,
                 inflation='logit', exposure=None, missing='none', **kwargs):
        super(GenericZeroInflated, self).__init__(endog, exog, offset=offset,
                                                  exposure=exposure,
                                                  missing=missing, **kwargs)

        if exog_infl is None:
            self.k_inflate = 1
            self.exog_infl = np.ones((endog.size, self.k_inflate),
                                     dtype=np.float64)
        else:
            self.exog_infl = exog_infl
            self.k_inflate = exog_infl.shape[1]

        if len(exog.shape) == 1:
            self.k_exog = 1
        else:
            self.k_exog = exog.shape[1]

        self.infl = inflation
        if inflation == 'logit':
            self.model_infl = Logit(np.zeros(self.exog_infl.shape[0]),
                                    self.exog_infl)
            self._hessian_inflate = self._hessian_logit
        elif inflation == 'probit':
            self.model_infl = Probit(np.zeros(self.exog_infl.shape[0]),
                                    self.exog_infl)
            self._hessian_inflate = self._hessian_probit

        else:
            raise TypeError("inflation == %s, which is not handled"
                % inflation)

        self.inflation = inflation
        self.k_extra = self.k_inflate

        if len(self.exog) != len(self.exog_infl):
            raise ValueError('exog and exog_infl have different number of '
                             'observations. `missing` handling is not supported')

        infl_names = ['inflate_%s' % i for i in self.model_infl.data.param_names]
        self.exog_names[:] = infl_names + list(self.exog_names)
        self.exog_infl = np.asarray(self.exog_infl, dtype=np.float64)

        self._init_keys.extend(['exog_infl', 'inflation'])
        self._null_drop_keys = ['exog_infl']
Example #2
def probit_reg(x, y):
    """Univariate probit regression with an intercept."""
    # prepend a constant column so the model includes an intercept
    x = np.column_stack((np.ones(len(x)), x))
    pm = Probit(y, x)
    return pm.fit().params
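
A hypothetical usage sketch for probit_reg, assuming Probit comes from statsmodels.discrete.discrete_model; the data below are synthetic, so the fitted coefficients only approximate the true values:

import numpy as np
from statsmodels.discrete.discrete_model import Probit

rng = np.random.default_rng(0)
x = rng.normal(size=200)
# latent-index DGP: y = 1 when 0.5 + 1.5*x + noise crosses zero
y = (0.5 + 1.5 * x + rng.normal(size=200) > 0).astype(float)
print(probit_reg(x, y))  # roughly [0.5, 1.5], up to sampling noise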
Example #3
class GenericZeroInflated(CountModel):
    __doc__ = """
    Generic Zero Inflated Model

    %(params)s
    %(extra_params)s

    Attributes
    ----------
    endog : ndarray
        A reference to the endogenous response variable
    exog : ndarray
        A reference to the exogenous design.
    exog_infl : ndarray
        A reference to the zero-inflated exogenous design.
    """ % {'params' : base._model_params_doc,
           'extra_params' : _doc_zi_params + base._missing_param_doc}

    def __init__(self, endog, exog, exog_infl=None, offset=None,
                 inflation='logit', exposure=None, missing='none', **kwargs):
        super(GenericZeroInflated, self).__init__(endog, exog, offset=offset,
                                                  exposure=exposure,
                                                  missing=missing, **kwargs)

        if exog_infl is None:
            self.k_inflate = 1
            self._no_exog_infl = True
            self.exog_infl = np.ones((endog.size, self.k_inflate),
                                     dtype=np.float64)
        else:
            self.exog_infl = exog_infl
            self.k_inflate = exog_infl.shape[1]
            self._no_exog_infl = False

        if len(exog.shape) == 1:
            self.k_exog = 1
        else:
            self.k_exog = exog.shape[1]

        self.infl = inflation
        if inflation == 'logit':
            self.model_infl = Logit(np.zeros(self.exog_infl.shape[0]),
                                    self.exog_infl)
            self._hessian_inflate = self._hessian_logit
        elif inflation == 'probit':
            self.model_infl = Probit(np.zeros(self.exog_infl.shape[0]),
                                    self.exog_infl)
            self._hessian_inflate = self._hessian_probit

        else:
            raise ValueError("inflation == %s, which is not handled"
                             % inflation)

        self.inflation = inflation
        self.k_extra = self.k_inflate

        if len(self.exog) != len(self.exog_infl):
            raise ValueError('exog and exog_infl have different number of '
                             'observations. `missing` handling is not supported')

        infl_names = ['inflate_%s' % i for i in self.model_infl.data.param_names]
        self.exog_names[:] = infl_names + list(self.exog_names)
        self.exog_infl = np.asarray(self.exog_infl, dtype=np.float64)

        self._init_keys.extend(['exog_infl', 'inflation'])
        self._null_drop_keys = ['exog_infl']

    def _get_exogs(self):
        """list of exogs, for internal use in post-estimation
        """
        return (self.exog, self.exog_infl)

    def loglike(self, params):
        """
        Loglikelihood of Generic Zero Inflated model.

        Parameters
        ----------
        params : array_like
            The parameters of the model.

        Returns
        -------
        loglike : float
            The log-likelihood function of the model evaluated at `params`.
            See notes.

        Notes
        -----
        .. math:: \\ln L=\\sum_{y_{i}=0}\\ln(w_{i}+(1-w_{i})*P_{main\\_model})+
            \\sum_{y_{i}>0}(\\ln(1-w_{i})+L_{main\\_model})

        where :math:`P` is the pdf of the main model and :math:`L` is the
        loglikelihood function of the main model.
        """
        return np.sum(self.loglikeobs(params))

    def loglikeobs(self, params):
        """
        Loglikelihood for observations of Generic Zero Inflated model.

        Parameters
        ----------
        params : array_like
            The parameters of the model.

        Returns
        -------
        loglike : ndarray
            The log likelihood for each observation of the model evaluated
            at `params`. See Notes for definition.

        Notes
        -----
        .. math:: \\ln L_{i}=\\ln(w_{i}+(1-w_{i})*P_{main\\_model})
            \\text{ if } y_{i}=0

        .. math:: \\ln L_{i}=\\ln(1-w_{i})+L_{main\\_model}
            \\text{ if } y_{i}>0

        where :math:`P` is the pdf of the main model and :math:`L` is the
        loglikelihood function of the main model, for observations
        :math:`i=1,...,n`.
        """
        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        y = self.endog
        w = self.model_infl.predict(params_infl)

        w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps)
        llf_main = self.model_main.loglikeobs(params_main)
        zero_idx = np.nonzero(y == 0)[0]
        nonzero_idx = np.nonzero(y)[0]

        llf = np.zeros_like(y, dtype=np.float64)
        llf[zero_idx] = (np.log(w[zero_idx] +
            (1 - w[zero_idx]) * np.exp(llf_main[zero_idx])))
        llf[nonzero_idx] = np.log(1 - w[nonzero_idx]) + llf_main[nonzero_idx]

        return llf

    @Appender(DiscreteModel.fit.__doc__)
    def fit(self, start_params=None, method='bfgs', maxiter=35,
            full_output=1, disp=1, callback=None,
            cov_type='nonrobust', cov_kwds=None, use_t=None, **kwargs):
        if start_params is None:
            offset = getattr(self, "offset", 0) + getattr(self, "exposure", 0)
            if np.size(offset) == 1 and offset == 0:
                offset = None
            start_params = self._get_start_params()

        if callback is None:
            # work around perfect separation callback #3895
            callback = lambda *x: x

        mlefit = super(GenericZeroInflated, self).fit(start_params=start_params,
                       maxiter=maxiter, disp=disp, method=method,
                       full_output=full_output, callback=callback,
                       **kwargs)

        zipfit = self.result_class(self, mlefit._results)
        result = self.result_class_wrapper(zipfit)

        if cov_kwds is None:
            cov_kwds = {}

        result._get_robustcov_results(cov_type=cov_type,
                                      use_self=True, use_t=use_t, **cov_kwds)
        return result

    @Appender(DiscreteModel.fit_regularized.__doc__)
    def fit_regularized(self, start_params=None, method='l1',
            maxiter='defined_by_method', full_output=1, disp=1, callback=None,
            alpha=0, trim_mode='auto', auto_trim_tol=0.01, size_trim_tol=1e-4,
            qc_tol=0.03, **kwargs):

        _validate_l1_method(method)

        if np.size(alpha) == 1 and alpha != 0:
            k_params = self.k_exog + self.k_inflate
            alpha = alpha * np.ones(k_params)

        extra = self.k_extra - self.k_inflate
        alpha_p = alpha[:-(self.k_extra - extra)] if (self.k_extra
            and np.size(alpha) > 1) else alpha
        if start_params is None:
            offset = getattr(self, "offset", 0) + getattr(self, "exposure", 0)
            if np.size(offset) == 1 and offset == 0:
                offset = None
            start_params = self.model_main.fit_regularized(
                start_params=start_params, method=method, maxiter=maxiter,
                full_output=full_output, disp=0, callback=callback,
                alpha=alpha_p, trim_mode=trim_mode, auto_trim_tol=auto_trim_tol,
                size_trim_tol=size_trim_tol, qc_tol=qc_tol, **kwargs).params
            start_params = np.append(np.ones(self.k_inflate), start_params)
        cntfit = super(CountModel, self).fit_regularized(
                start_params=start_params, method=method, maxiter=maxiter,
                full_output=full_output, disp=disp, callback=callback,
                alpha=alpha, trim_mode=trim_mode, auto_trim_tol=auto_trim_tol,
                size_trim_tol=size_trim_tol, qc_tol=qc_tol, **kwargs)

        discretefit = self.result_class_reg(self, cntfit)
        return self.result_class_reg_wrapper(discretefit)

    def score_obs(self, params):
        """
        Generic Zero Inflated model score (gradient) vector of the log-likelihood

        Parameters
        ----------
        params : array_like
            The parameters of the model

        Returns
        -------
        score : ndarray, 2-D
            The score (first derivative of the loglikelihood function) for
            each observation, evaluated at `params`
        """
        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        y = self.endog
        w = self.model_infl.predict(params_infl)
        w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps)
        score_main = self.model_main.score_obs(params_main)
        llf_main = self.model_main.loglikeobs(params_main)
        llf = self.loglikeobs(params)
        zero_idx = np.nonzero(y == 0)[0]
        nonzero_idx = np.nonzero(y)[0]

        mu = self.model_main.predict(params_main)

        # TODO: need to allow for complex to use CS numerical derivatives
        dldp = np.zeros((self.exog.shape[0], self.k_exog), dtype=np.float64)
        dldw = np.zeros_like(self.exog_infl, dtype=np.float64)

        dldp[zero_idx,:] = (score_main[zero_idx].T *
                     (1 - (w[zero_idx]) / np.exp(llf[zero_idx]))).T
        dldp[nonzero_idx,:] = score_main[nonzero_idx]

        if self.inflation == 'logit':
            dldw[zero_idx,:] =  (self.exog_infl[zero_idx].T * w[zero_idx] *
                                 (1 - w[zero_idx]) *
                                 (1 - np.exp(llf_main[zero_idx])) /
                                  np.exp(llf[zero_idx])).T
            dldw[nonzero_idx,:] = -(self.exog_infl[nonzero_idx].T *
                                    w[nonzero_idx]).T
        elif self.inflation == 'probit':
            return approx_fprime(params, self.loglikeobs)

        return np.hstack((dldw, dldp))

    def score(self, params):
        return self.score_obs(params).sum(0)

    def _hessian_main(self, params):
        pass

    def _hessian_logit(self, params):
        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        y = self.endog
        w = self.model_infl.predict(params_infl)
        w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps)
        score_main = self.model_main.score_obs(params_main)
        llf_main = self.model_main.loglikeobs(params_main)
        llf = self.loglikeobs(params)
        zero_idx = np.nonzero(y == 0)[0]
        nonzero_idx = np.nonzero(y)[0]

        hess_arr = np.zeros((self.k_inflate, self.k_exog + self.k_inflate))

        pmf = np.exp(llf)

        #d2l/dw2
        for i in range(self.k_inflate):
            for j in range(i, -1, -1):
                hess_arr[i, j] = ((
                    self.exog_infl[zero_idx, i] * self.exog_infl[zero_idx, j] *
                    (w[zero_idx] * (1 - w[zero_idx]) * ((1 -
                    np.exp(llf_main[zero_idx])) * (1 - 2 * w[zero_idx]) *
                    np.exp(llf[zero_idx]) - (w[zero_idx] - w[zero_idx]**2) *
                    (1 - np.exp(llf_main[zero_idx]))**2) /
                    pmf[zero_idx]**2)).sum() -
                    (self.exog_infl[nonzero_idx, i] * self.exog_infl[nonzero_idx, j] *
                    w[nonzero_idx] * (1 - w[nonzero_idx])).sum())

        #d2l/dpdw
        for i in range(self.k_inflate):
            for j in range(self.k_exog):
                hess_arr[i, j + self.k_inflate] = -(score_main[zero_idx, j] *
                    w[zero_idx] * (1 - w[zero_idx]) *
                    self.exog_infl[zero_idx, i] / pmf[zero_idx]).sum()

        return hess_arr

    def _hessian_probit(self, params):
        pass

    def hessian(self, params):
        """
        Generic Zero Inflated model Hessian matrix of the loglikelihood

        Parameters
        ----------
        params : array_like
            The parameters of the model

        Returns
        -------
        hess : ndarray, (k_vars, k_vars)
            The Hessian, second derivative of loglikelihood function,
            evaluated at `params`

        """
        hess_arr_main = self._hessian_main(params)
        hess_arr_infl = self._hessian_inflate(params)

        if hess_arr_main is None or hess_arr_infl is None:
            return approx_hess(params, self.loglike)

        dim = self.k_exog + self.k_inflate

        hess_arr = np.zeros((dim, dim))

        hess_arr[:self.k_inflate,:] = hess_arr_infl
        hess_arr[self.k_inflate:,self.k_inflate:] = hess_arr_main

        tri_idx = np.triu_indices(self.k_exog + self.k_inflate, k=1)
        hess_arr[tri_idx] = hess_arr.T[tri_idx]

        return hess_arr

    def predict(self, params, exog=None, exog_infl=None, exposure=None,
                offset=None, which='mean', y_values=None):
        """
        Predict response variable or other statistic given exogenous variables.

        Parameters
        ----------
        params : array_like
            The parameters of the model.
        exog : ndarray, optional
            Explanatory variables for the main count model.
            If ``exog`` is None, then the data from the model will be used.
        exog_infl : ndarray, optional
            Explanatory variables for the zero-inflation model.
            ``exog_infl`` has to be provided if ``exog`` was provided unless
            ``exog_infl`` in the model is only a constant.
        offset : ndarray, optional
            Offset is added to the linear predictor of the mean function with
            coefficient equal to 1.
            Default is zero if exog is not None, and the model offset if exog
            is None.
        exposure : ndarray, optional
            Log(exposure) is added to the linear predictor with coefficient
            equal to 1. If exposure is specified, then it will be logged by
            the method. The user does not need to log it first.
            Default is one if exog is not None, and it is the model exposure
            if exog is None.
        which : str (optional)
            Statistic to predict. Default is 'mean'.

            - 'mean' : the conditional expectation of endog E(y | x),
              i.e. exp of linear predictor.
            - 'linear' : the linear predictor of the mean function.
            - 'var' : returns the estimated variance of endog implied by the
              model.
            - 'mean-main' : mean of the main count model
            - 'prob-main' : probability of selecting the main model.
              The probability of zero inflation is ``1 - prob-main``.
            - 'mean-nonzero' : expected value conditional on having observation
              larger than zero, E(y | X, y>0)
            - 'prob-zero' : probability of observing a zero count. P(y=0 | x)
            - 'prob' : probabilities of each count from 0 to max(endog), or
              for y_values if those are provided. This is a multivariate
              return (2-dim when predicting for several observations).

        y_values : array_like
            Values of the random variable endog at which pmf is evaluated.
            Only used if ``which="prob"``
        """
        no_exog = False
        if exog is None:
            no_exog = True
            exog = self.exog

        if exog_infl is None:
            if no_exog:
                exog_infl = self.exog_infl
            else:
                if self._no_exog_infl:
                    exog_infl = np.ones((len(exog), 1))
        else:
            exog_infl = np.asarray(exog_infl)
            if exog_infl.ndim == 1 and self.k_inflate == 1:
                exog_infl = exog_infl[:, None]

        if exposure is None:
            if no_exog:
                exposure = getattr(self, 'exposure', 0)
            else:
                exposure = 0
        else:
            exposure = np.log(exposure)

        if offset is None:
            if no_exog:
                offset = getattr(self, 'offset', 0)
            else:
                offset = 0

        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        prob_main = 1 - self.model_infl.predict(params_infl, exog_infl)

        lin_pred = np.dot(exog, params_main[:self.exog.shape[1]]) + exposure + offset

        # Refactor: This is pretty hacky,
        # there should be an appropriate predict method in model_main
        # this is just prob(y=0 | model_main)
        tmp_exog = self.model_main.exog
        tmp_endog = self.model_main.endog
        tmp_offset = getattr(self.model_main, 'offset', False)
        tmp_exposure = getattr(self.model_main, 'exposure', False)
        self.model_main.exog = exog
        self.model_main.endog = np.zeros((exog.shape[0]))
        self.model_main.offset = offset
        self.model_main.exposure = exposure
        llf = self.model_main.loglikeobs(params_main)
        self.model_main.exog = tmp_exog
        self.model_main.endog = tmp_endog
        # tmp_offset might be an array with elementwise equality testing
        #if np.size(tmp_offset) == 1 and tmp_offset[0] == 'no':
        if tmp_offset is False:
            del self.model_main.offset
        else:
            self.model_main.offset = tmp_offset
        #if np.size(tmp_exposure) == 1 and tmp_exposure[0] == 'no':
        if tmp_exposure is False:
            del self.model_main.exposure
        else:
            self.model_main.exposure = tmp_exposure
        # end hack

        prob_zero = (1 - prob_main) + prob_main * np.exp(llf)

        if which == 'mean':
            return prob_main * np.exp(lin_pred)
        elif which == 'mean-main':
            return np.exp(lin_pred)
        elif which == 'linear':
            return lin_pred
        elif which == 'mean-nonzero':
            return prob_main * np.exp(lin_pred) / (1 - prob_zero)
        elif which == 'prob-zero':
            return prob_zero
        elif which == 'prob-main':
            return prob_main
        elif which == 'var':
            mu = np.exp(lin_pred)
            return self._predict_var(params, mu, 1 - prob_main)
        elif which == 'prob':
            return self._predict_prob(params, exog, exog_infl, exposure,
                                      offset, y_values=y_values)
        else:
            raise ValueError('which = %s is not available' % which)

    def _derivative_predict(self, params, exog=None, transform='dydx'):
        """NotImplemented
        """
        raise NotImplementedError

    def _derivative_exog(self, params, exog=None, transform="dydx",
                         dummy_idx=None, count_idx=None):
        """NotImplemented
        """
        raise NotImplementedError

    def _deriv_mean_dparams(self, params):
        """
        Derivative of the expected endog with respect to the parameters.

        Parameters
        ----------
        params : ndarray
            parameter at which score is evaluated

        Returns
        -------
        The value of the derivative of the expected endog with respect
        to the parameter vector.
        """
        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        w = self.model_infl.predict(params_infl)
        w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps)
        mu = self.model_main.predict(params_main)

        score_infl = self.model_infl._deriv_mean_dparams(params_infl)
        score_main = self.model_main._deriv_mean_dparams(params_main)

        dmat_infl = - mu[:, None] * score_infl
        dmat_main = (1 - w[:, None]) * score_main

        dmat = np.column_stack((dmat_infl, dmat_main))
        return dmat

    def _deriv_score_obs_dendog(self, params):
        """derivative of score_obs w.r.t. endog

        Parameters
        ----------
        params : ndarray
            parameter at which score is evaluated

        Returns
        -------
        derivative : ndarray_2d
            The derivative of the score_obs with respect to endog.
        """
        raise NotImplementedError

        # The below currently does not work, discontinuity at zero
        # see https://github.com/statsmodels/statsmodels/pull/7951#issuecomment-996355875  # noqa
        from statsmodels.tools.numdiff import _approx_fprime_scalar
        endog_original = self.endog

        def f(y):
            if y.ndim == 2 and y.shape[1] == 1:
                y = y[:, 0]
            self.endog = y
            self.model_main.endog = y
            sf = self.score_obs(params)
            self.endog = endog_original
            self.model_main.endog = endog_original
            return sf

        ds = _approx_fprime_scalar(self.endog[:, None], f, epsilon=1e-2)

        return ds
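
Since GenericZeroInflated is an internal base class, the usual entry point is one of its public subclasses. Below is a minimal sketch on synthetic data, assuming statsmodels' ZeroInflatedPoisson from statsmodels.discrete.count_model; the parameter values and variable names are illustrative:

import numpy as np
import statsmodels.api as sm
from statsmodels.discrete.count_model import ZeroInflatedPoisson

rng = np.random.default_rng(0)
n = 500
exog = sm.add_constant(rng.normal(size=(n, 1)))
counts = rng.poisson(np.exp(0.3 + 0.5 * exog[:, 1]))
counts[rng.uniform(size=n) < 0.3] = 0  # inflate ~30% of observations to zero

# exog_infl=None means the inflation part is a constant-only logit
model = ZeroInflatedPoisson(counts, exog, exog_infl=None, inflation='logit')
res = model.fit(disp=0)
print(res.params)                          # inflate_const, const, x1
print(res.predict(which='prob-zero')[:5])  # P(y=0 | x) per observation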
Example #4
'''Probit analysis plus plotting a 3D graph of the hit rate distribution
with respect to delta and theta'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the 3d projection
from statsmodels.discrete.discrete_model import Probit

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Read the data
data = pd.read_csv("HL.csv")
print(data)
col = ["delta", "epsilon", "cross_term"]
dep_var = data["hits"].tolist()
X = data[col]
theta = data["theta"]

probit_model = Probit(dep_var, X)
result = probit_model.fit()
print(result.summary())

z = np.array(data["hits"].tolist())
x = np.array(data["epsilon"].tolist())
y = np.array(data["delta"].tolist())
print(z)
ax.scatter(x, y, z, s=1, c=None, depthshade=True)
plt.show()
Example #5
def multiple_estimate(g,
                      zvec,
                      ins,
                      betas,
                      model=3,
                      estimator_types=[1, 2, 3, 4, 5],
                      runs=1000,
                      estoc_distr="Probit",
                      estoc_params=[0, 1],
                      silent=True,
                      tau_param=0.5):
    N = g.number_of_nodes()
    if model == 3 and estoc_distr in ["Tau-Exposure", "Tau-Exposure-Binario"]:
        tau_param = ins[1]

    # In case the frac vector is needed
    if any(x in (2, 3, 4) for x in estimator_types):

        # frac vector: fraction of treated neighbors
        frac = np.empty(shape=(N))
        for i in range(N):
            soma = np.float64(0)
            for k in g.neighbors(i):
                soma += np.float64(zvec[k])
            frac[i] = soma / g.degree(i)

        # Feature matrix
        features = np.empty(shape=(N, 3))
        for j in range(N):
            features[j][0] = 1
            features[j][1] = zvec[j]
            features[j][2] = frac[j]

    # In case the tau-exposure estimator is used
    if 5 in estimator_types:
        # Bitmaps
        c1 = np.zeros(shape=(N))
        c0 = np.zeros(shape=(N))

        # Split the nodes into the two exposure groups
        for i in range(N):
            soma = np.float64(0.0)
            for k in g.neighbors(i):
                soma += np.float64(zvec[k])
            frac = soma / g.degree(i)

            if zvec[i] == 0 and frac <= (1 - tau_param):
                c0[i] = 1

            elif zvec[i] == 1 and frac >= tau_param:
                c1[i] = 1

        # Group sizes
        tam_c1 = int(sum(c1))
        tam_c0 = int(sum(c0))

        if tam_c0 == 0:
            tam_c0 = 1
        if tam_c1 == 0:
            tam_c1 = 1

    # Arrays for the results
    predicoes = np.empty(shape=(len(estimator_types), len(betas), runs))
    ATE = np.empty(shape=(len(estimator_types), len(betas), runs))

    # Generate the data and store it in the arrays
    for j in range(len(betas)):
        beta = betas[j]

        cons1 = 1.0 / (1.0 + np.exp(-sum(beta)))
        cons2 = 1.0 / (1.0 + np.exp(-beta[0]))

        # Estimate
        for k in range(runs):
            if estoc_distr == "Logit":
                U = np.random.uniform(0.0, 1.0, N)
                yvec = np.array([
                    1 * (U[x] <
                         (1.0 / (1.0 + np.exp(-np.dot(features[x], beta)))))
                    for x in range(N)
                ])

                Zigual1 = np.array([1 * (U[x] < cons1) for x in range(N)])
                Zigual0 = np.array([1 * (U[x] < cons2) for x in range(N)])
                real = (sum(Zigual1) - sum(Zigual0)) / N

            else:
                U = np.random.normal(estoc_params[0], estoc_params[1], N)
                yvec = simulate(g, model, zvec, beta, ins, U)
                real = real_ATE(g, model, beta, ins, U)

            for i in range(len(estimator_types)):
                est_model = estimator_types[i]

                # SUTVA
                if est_model == 1:
                    z1_count = 0
                    z0_count = 0

                    sum_resp_z1 = 0
                    sum_resp_z0 = 0

                    # Sum the values
                    for l in range(N):
                        if zvec[l] == 1:
                            z1_count += 1
                            sum_resp_z1 += yvec[l]
                        else:
                            z0_count += 1
                            sum_resp_z0 += yvec[l]

                    # Edge cases
                    if z1_count == 0:
                        sum_resp_z1 = 0
                        z1_count = 1

                    if z0_count == 0:
                        sum_resp_z0 = 0
                        z0_count = 1

                    predicoes[i][j][k] = (sum_resp_z1 / z1_count -
                                          sum_resp_z0 / z0_count)

                # Linear
                elif est_model == 2:
                    lr = linear_model.LinearRegression().fit(features,
                                                             yvec).coef_
                    predicoes[i][j][k] = (lr[1] + lr[2])

                # Probit
                elif est_model == 3:
                    vals = Probit(yvec, features).fit(disp=0).params
                    predicoes[i][j][k] = (norm.cdf(sum(vals)) -
                                          norm.cdf(vals[0]))

                # Logit
                elif est_model == 4:
                    vals = Logit(yvec, features).fit(disp=0).params
                    predicoes[i][j][k] = (
                        (np.exp(-vals[0]) - np.exp(-sum(vals))) /
                        ((1 + np.exp(-vals[0])) * (1 + np.exp(-sum(vals)))))

                # Tau exposure
                elif est_model == 5:
                    soma_c1 = 0
                    soma_c0 = 0

                    for l in range(N):
                        if c1[l] == 1:
                            soma_c1 += yvec[l]
                        elif c0[l] == 1:
                            soma_c0 += yvec[l]

                    predicoes[i][j][k] = (soma_c1 / tam_c1 - soma_c0 / tam_c0)

                ATE[i][j][k] = real

                if not silent:
                    print("est: {}/{}| beta: {}/{}| rodada: {}/{}".format(
                        i + 1, len(estimator_types), j + 1, len(betas), k + 1,
                        runs))

    return ([predicoes, ATE])
Example #6
def estimate(g, zvec, yvec, est_model):
    N = len(zvec)

    # SUTVA
    if est_model == 1:
        z1 = 0
        z0 = 0

        sum_resp_z1 = 0
        sum_resp_z0 = 0

        # Sum the values
        for i in range(N):
            if zvec[i] == 1:
                z1 += 1
                sum_resp_z1 += yvec[i]
            else:
                z0 += 1
                sum_resp_z0 += yvec[i]

        # Edge cases
        if z1 == 0:
            sum_resp_z1 = 0
            z1 = 1

        if z0 == 0:
            sum_resp_z0 = 0
            z0 = 1

        return (sum_resp_z1 / z1 - sum_resp_z0 / z0)

    # tau vector: fraction of treated neighbors
    tau = np.empty(shape=(N))
    for i in range(N):
        soma = 0.0
        for k in g.neighbors(i):
            soma += np.float64(zvec[k])
        tau[i] = soma / g.degree(i)

    # Feature matrix
    features = np.empty(shape=(N, 3))
    for j in range(N):
        features[j][0] = 1
        features[j][1] = zvec[j]
        features[j][2] = tau[j]

    # Linear
    if est_model == 2:
        lr = linear_model.LinearRegression().fit(features, yvec).coef_
        return (lr[1] + lr[2])

    # Probit
    if est_model == 3:
        vals = Probit(yvec, features).fit(disp=0).params
        return (norm.cdf(sum(vals)) - norm.cdf(vals[0]))

    # Logit
    if est_model == 4:
        vals = Logit(yvec, features).fit(disp=0).params
        return ((np.exp(-vals[0]) - np.exp(-sum(vals))) /
                ((1 + np.exp(-vals[0])) * (1 + np.exp(-sum(vals)))))
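
A hypothetical usage sketch for estimate(), assuming a networkx graph and the same module-level imports the snippet relies on (numpy, scipy.stats.norm, sklearn's linear_model, and statsmodels' Probit/Logit):

import networkx as nx
import numpy as np

# every node of a Barabasi-Albert graph has degree >= m, so g.degree(i) > 0
g = nx.barabasi_albert_graph(300, 3, seed=0)
rng = np.random.default_rng(0)
zvec = rng.binomial(1, 0.5, g.number_of_nodes())
yvec = rng.binomial(1, 0.3 + 0.2 * zvec)  # outcome depends on own treatment
print(estimate(g, zvec, yvec, est_model=3))  # probit-based ATE estimate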
Example #7
File: mcPS.py Project: lnsongxf/Mphil
def main():
    # Magic numbers
    dMux = 0
    dSigmax = 1
    dMuepsilon = 0
    dSigmaepsilon = 1
    dMueta = 0
    dSigmaeta = 1
    iNobs = 1000
    vdBeta = np.array([1, 2])
    vdZeta = np.array([3, 4])
    vdDezinho = np.array([0])
    iSeed = 6969
    iNgroups = 11
    iIter = 1000

    # Initialisation
    np.random.seed(iSeed)
    vdBeta = np.array(vdBeta).reshape(-1, 1)
    vdZeta = np.array(vdZeta).reshape(-1, 1)
    iLenbeta = len(vdBeta)

    # Start the iterations
    ## Create objects to store the ATE, variance, test statistics and R-Squares
    dvATE = np.ones(iIter)
    dvVar = np.ones(iIter)
    dvTtest = np.ones(iIter)
    dvRsquared = np.ones(iIter)

    for i in range(iIter - 1):
        mdX = fnGenX(iNobs, iLenbeta, dMux, dSigmax)
        iLenX = mdX.shape[1]
        vdEpsilon = fnGenError(iNobs, dMuepsilon, dSigmaepsilon)
        vdPstar = fnGenPstar(mdX, vdBeta, vdEpsilon)
        vdD = fnGenTreat(vdPstar)
        vdEta = fnGenError(iNobs, dMueta, dSigmaeta)
        vdY = fnGenY(vdD, vdDezinho, mdX, vdZeta, vdEta)

        ## Create a dataframe with everything together
        ### Note: the column names are hard-coded; if the size of X changes,
        ### they must be updated manually
        dfData = pd.DataFrame(np.hstack([vdY, vdD, mdX]),
                              columns=['vdY', 'vdD', 'vdX1', 'vdX2'])
        dfData["vdD"] = dfData["vdD"] == 1
        ### Can work out later in a better layout for these descriptives
        #print dfData.groupby('vdD').describe().unstack(1).reset_index()

        # Estimation
        model = Probit(dfData['vdD'],
                       dfData[dfData.columns[-mdX.shape[1]:]].copy())
        probit_model = model.fit()
        #print(probit_model.summary())
        dRsquare = probit_model.prsquared
        # Get the predicted probabilities
        vdProbs = probit_model.predict(
            dfData[dfData.columns[-mdX.shape[1]:]].copy())

        ## Looking at the estimated probabilities
        #plt.figure(figsize=[10,8])
        #n, bins, patches = plt.hist(x=vdProbs, bins=8, color='#0504aa',alpha=0.7, rwidth=0.85)
        #plt.grid(axis='y', alpha=0.75)
        #plt.xlabel('Value',fontsize=15)
        #plt.ylabel('Frequency',fontsize=15)
        #plt.xticks(fontsize=15)
        #plt.yticks(fontsize=15)
        #plt.ylabel('Frequency',fontsize=15)
        #plt.title('Propensity Score Histogram',fontsize=15)
        #plt.show()

        ## Building the groups
        vdGroups = np.linspace(0, 1, iNgroups)
        ## Putting back Y, treatment and the propensity score
        dfFinalData = pd.DataFrame(np.hstack(
            [vdY, vdD, vdProbs.reshape(-1, 1)]),
                                   columns=['vdY', 'vdD', 'vdPS'])

        #dfGroup1  = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[0]) & (dfFinalData['vdPS'] < vdGroups[1])]
        dfGroup2 = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[1])
                                   & (dfFinalData['vdPS'] < vdGroups[2])]
        dfGroup3 = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[2])
                                   & (dfFinalData['vdPS'] < vdGroups[3])]
        dfGroup4 = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[3])
                                   & (dfFinalData['vdPS'] < vdGroups[4])]
        dfGroup5 = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[4])
                                   & (dfFinalData['vdPS'] < vdGroups[5])]
        dfGroup6 = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[5])
                                   & (dfFinalData['vdPS'] < vdGroups[6])]
        dfGroup7 = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[6])
                                   & (dfFinalData['vdPS'] < vdGroups[7])]
        dfGroup8 = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[7])
                                   & (dfFinalData['vdPS'] < vdGroups[8])]
        dfGroup9 = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[8])
                                   & (dfFinalData['vdPS'] < vdGroups[9])]
        #dfGroup10 = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[9]) & (dfFinalData['vdPS'] < vdGroups[10])]

        #dMean1 = dfGroup1.groupby('vdD').mean().iloc[1, 0] - dfGroup1.groupby('vdD').mean().iloc[0, 0]
        dMean2 = (dfGroup2.groupby('vdD').mean().iloc[1, 0] -
                  dfGroup2.groupby('vdD').mean().iloc[0, 0]) * (
                      dfGroup2.shape[0] / float(iNobs))
        dMean3 = (dfGroup3.groupby('vdD').mean().iloc[1, 0] -
                  dfGroup3.groupby('vdD').mean().iloc[0, 0]) * (
                      dfGroup3.shape[0] / float(iNobs))
        dMean4 = (dfGroup4.groupby('vdD').mean().iloc[1, 0] -
                  dfGroup4.groupby('vdD').mean().iloc[0, 0]) * (
                      dfGroup4.shape[0] / float(iNobs))
        dMean5 = (dfGroup5.groupby('vdD').mean().iloc[1, 0] -
                  dfGroup5.groupby('vdD').mean().iloc[0, 0]) * (
                      dfGroup5.shape[0] / float(iNobs))
        dMean6 = (dfGroup6.groupby('vdD').mean().iloc[1, 0] -
                  dfGroup6.groupby('vdD').mean().iloc[0, 0]) * (
                      dfGroup6.shape[0] / float(iNobs))
        dMean7 = (dfGroup7.groupby('vdD').mean().iloc[1, 0] -
                  dfGroup7.groupby('vdD').mean().iloc[0, 0]) * (
                      dfGroup7.shape[0] / float(iNobs))
        dMean8 = (dfGroup8.groupby('vdD').mean().iloc[1, 0] -
                  dfGroup8.groupby('vdD').mean().iloc[0, 0]) * (
                      dfGroup8.shape[0] / float(iNobs))
        dMean9 = (dfGroup9.groupby('vdD').mean().iloc[1, 0] -
                  dfGroup9.groupby('vdD').mean().iloc[0, 0]) * (
                      dfGroup9.shape[0] / float(iNobs))
        #dMean10 = dfGroup10.groupby('vdD').mean().iloc[1, 0] - dfGroup10.groupby('vdD').mean().iloc[0, 0]

        dATE = dMean2 + dMean3 + dMean4 + dMean5 + dMean6 + dMean7 + dMean8 + dMean9

        # Add an extra column with the mean of the corresponding treatment or no treatment inside the same block
        dfGroup2['vdYmean'] = dfGroup2.groupby("vdD")["vdY"].transform('mean')
        dfGroup3['vdYmean'] = dfGroup3.groupby("vdD")["vdY"].transform('mean')
        dfGroup4['vdYmean'] = dfGroup4.groupby("vdD")["vdY"].transform('mean')
        dfGroup5['vdYmean'] = dfGroup5.groupby("vdD")["vdY"].transform('mean')
        dfGroup6['vdYmean'] = dfGroup6.groupby("vdD")["vdY"].transform('mean')
        dfGroup7['vdYmean'] = dfGroup7.groupby("vdD")["vdY"].transform('mean')
        dfGroup8['vdYmean'] = dfGroup8.groupby("vdD")["vdY"].transform('mean')
        dfGroup9['vdYmean'] = dfGroup9.groupby("vdD")["vdY"].transform('mean')

        # Take the difference between the individual Y and the average of the corresponding group (by treated and non-treated)
        dfGroup2['dvDiffSquared'] = (dfGroup2['vdY'] - dfGroup2['vdYmean'])**2
        dfGroup3['dvDiffSquared'] = (dfGroup3['vdY'] - dfGroup3['vdYmean'])**2
        dfGroup4['dvDiffSquared'] = (dfGroup4['vdY'] - dfGroup4['vdYmean'])**2
        dfGroup5['dvDiffSquared'] = (dfGroup5['vdY'] - dfGroup5['vdYmean'])**2
        dfGroup6['dvDiffSquared'] = (dfGroup6['vdY'] - dfGroup6['vdYmean'])**2
        dfGroup7['dvDiffSquared'] = (dfGroup7['vdY'] - dfGroup7['vdYmean'])**2
        dfGroup8['dvDiffSquared'] = (dfGroup8['vdY'] - dfGroup8['vdYmean'])**2
        dfGroup9['dvDiffSquared'] = (dfGroup9['vdY'] - dfGroup9['vdYmean'])**2

        # For each line, add the number of individuals in the same treatment (or no treatment) group
        dfGroup2['iSizeGroup'] = dfGroup2.groupby("vdD")["vdY"].transform(
            'count')
        dfGroup3['iSizeGroup'] = dfGroup3.groupby("vdD")["vdY"].transform(
            'count')
        dfGroup4['iSizeGroup'] = dfGroup4.groupby("vdD")["vdY"].transform(
            'count')
        dfGroup5['iSizeGroup'] = dfGroup5.groupby("vdD")["vdY"].transform(
            'count')
        dfGroup6['iSizeGroup'] = dfGroup6.groupby("vdD")["vdY"].transform(
            'count')
        dfGroup7['iSizeGroup'] = dfGroup7.groupby("vdD")["vdY"].transform(
            'count')
        dfGroup8['iSizeGroup'] = dfGroup8.groupby("vdD")["vdY"].transform(
            'count')
        dfGroup9['iSizeGroup'] = dfGroup9.groupby("vdD")["vdY"].transform(
            'count')

        # Divide the squared difference by the square of the size of the corresponding group
        dfGroup2['dvDiffSquaredDivided'] = dfGroup2[
            'dvDiffSquared'] / dfGroup2['iSizeGroup']**2
        dfGroup3['dvDiffSquaredDivided'] = dfGroup3[
            'dvDiffSquared'] / dfGroup3['iSizeGroup']**2
        dfGroup4['dvDiffSquaredDivided'] = dfGroup4[
            'dvDiffSquared'] / dfGroup4['iSizeGroup']**2
        dfGroup5['dvDiffSquaredDivided'] = dfGroup5[
            'dvDiffSquared'] / dfGroup5['iSizeGroup']**2
        dfGroup6['dvDiffSquaredDivided'] = dfGroup6[
            'dvDiffSquared'] / dfGroup6['iSizeGroup']**2
        dfGroup7['dvDiffSquaredDivided'] = dfGroup7[
            'dvDiffSquared'] / dfGroup7['iSizeGroup']**2
        dfGroup8['dvDiffSquaredDivided'] = dfGroup8[
            'dvDiffSquared'] / dfGroup8['iSizeGroup']**2
        dfGroup9['dvDiffSquaredDivided'] = dfGroup9[
            'dvDiffSquared'] / dfGroup9['iSizeGroup']**2

        # Sum the V term for treated and non-treated individuals and multiply by the size of the block divided by population squared
        dVGroup2 = (dfGroup2.groupby("vdD").sum().iloc[1, 5] +
                    dfGroup2.groupby("vdD").sum().iloc[0, 5]) * (
                        (dfGroup2.shape[0] / float(iNobs))**2)
        dVGroup3 = (dfGroup3.groupby("vdD").sum().iloc[1, 5] +
                    dfGroup3.groupby("vdD").sum().iloc[0, 5]) * (
                        (dfGroup3.shape[0] / float(iNobs))**2)
        dVGroup4 = (dfGroup4.groupby("vdD").sum().iloc[1, 5] +
                    dfGroup4.groupby("vdD").sum().iloc[0, 5]) * (
                        (dfGroup4.shape[0] / float(iNobs))**2)
        dVGroup5 = (dfGroup5.groupby("vdD").sum().iloc[1, 5] +
                    dfGroup5.groupby("vdD").sum().iloc[0, 5]) * (
                        (dfGroup5.shape[0] / float(iNobs))**2)
        dVGroup6 = (dfGroup6.groupby("vdD").sum().iloc[1, 5] +
                    dfGroup6.groupby("vdD").sum().iloc[0, 5]) * (
                        (dfGroup6.shape[0] / float(iNobs))**2)
        dVGroup7 = (dfGroup7.groupby("vdD").sum().iloc[1, 5] +
                    dfGroup7.groupby("vdD").sum().iloc[0, 5]) * (
                        (dfGroup7.shape[0] / float(iNobs))**2)
        dVGroup8 = (dfGroup8.groupby("vdD").sum().iloc[1, 5] +
                    dfGroup8.groupby("vdD").sum().iloc[0, 5]) * (
                        (dfGroup8.shape[0] / float(iNobs))**2)
        dVGroup9 = (dfGroup9.groupby("vdD").sum().iloc[1, 5] +
                    dfGroup9.groupby("vdD").sum().iloc[0, 5]) * (
                        (dfGroup9.shape[0] / float(iNobs))**2)

        # Compute the variance
        dVar = dVGroup2 + dVGroup3 + dVGroup4 + dVGroup5 + dVGroup6 + dVGroup7 + dVGroup8 + dVGroup9

        # Output
        #print ("ATE= %g" % dATE)
        #print ("Estimated Variance = %g" % dVar)

        # Compute the test statistic
        dTTest = dATE / (math.sqrt(dVar / iNobs))

        # Store results
        dvATE[i] = dATE
        dvVar[i] = dVar
        dvTtest[i] = dTTest
        dvRsquared[i] = dRsquare


    # Report results
    print(pd.DataFrame(stats.describe(dvATE[:-1])))
Example #8
class ProbitRegression(Learner):
    """
	The probit regression learning algorithm. Given data, this class
	constructs and stores a probability unit regression model that can
	be used to quantify the probability of testing data-points taking
	on certain class values.
	"""
    def __init__(self, alpha: float, **params: any):
        """
		Initialises the Probit regression algorithm.

		:param alpha: regularization term alpha.
		:param params: Ignored.
		"""
        super().__init__(**params)
        self.name = 'Probit Regression'
        self.alpha = alpha
        self.gamma = 0.5
        self.add_intercept = True
        self.binary_points = True

        self.beta = list()
        self.data: Optional[RecordSet] = None
        self.model: Optional[Probit] = None  # will be set during fit

    def fit(self, rs: RecordSet) -> None:
        """
		Fit a Probit regression model.

		:param rs: The record set to fit with.
		"""
        # set params
        self.data = cp.deepcopy(rs)
        patterns = self.data.entries[:, :-1]
        out = self.data.entries[:, -1:]

        if self.add_intercept:
            intercept = np.ones((patterns.shape[0], 1))
            patterns = np.hstack((intercept, patterns))

        # guard: alpha == 0 makes the regularized fit unreliable
        if self.alpha == 0:
            raise ValueError("Probit alpha too low to obtain reliable results")

        self.model = Probit(endog=out.ravel(), exog=patterns)
        self.model = self.model.fit_regularized(alpha=self.alpha,
                                                maxiter=10e5,
                                                disp=False)

    def predict(self, rs: RecordSet) -> np.ndarray:
        """
		Assigns a predicted class label to the given record sets.

		:param rs: The record set to assign predictions to.
		:return: A column vector of predictions corresponding to the record set's rows.
		"""
        # set params
        patterns = rs.entries[:, :-1]

        if self.add_intercept:
            intercept = np.ones((patterns.shape[0], 1))
            patterns = np.hstack((intercept, patterns))

        # predict
        predictions = self.model.predict(exog=patterns)

        if self.binary_points:
            predictions = self.discrete_points(predictions=predictions)

        # return 2d
        predictions = np.reshape(predictions, (-1, 1))
        return predictions

    def discrete_points(self, predictions):
        """
		Turns probabilities into discrete classes

		:param predictions: The predicted class probabilities
		:return: A vector with discrete classes
		"""
        n = predictions.shape[0]
        for i in range(0, n):
            if predictions[i] >= self.gamma:
                predictions[i] = 1
            else:
                predictions[i] = 0
        return predictions
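
The same fit-then-threshold pattern can be sketched without the Learner/RecordSet scaffolding; everything below is illustrative and self-contained:

import numpy as np
from statsmodels.discrete.discrete_model import Probit

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))
y = (X @ np.array([1.0, -1.0]) + rng.normal(size=200) > 0).astype(float)

design = np.hstack((np.ones((200, 1)), X))   # prepend intercept column
res = Probit(y, design).fit_regularized(alpha=0.1, disp=False)

probs = res.predict(design)                  # P(y = 1 | x)
labels = (probs >= 0.5).astype(int)          # threshold at gamma = 0.5
print(labels[:10])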
Example #9
def table2_reg(df_reg, disp_it):
    """Function to create the tables for the first probit models.
    
        Args:
        dataFrame containing the categorial variables as dummies and the interaction terms
        disp_it boolean value indicating whether information about iterations should be displayed
        
        Returns:
        -------
        A table containing the regression output of the first 4 model specifications.
    """
    #first model
    Y = df_reg['_oral']
    X = df_reg[['sales', 'd1970', 'dsalesX1970', '_Phys', 'd_PhysX1970', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970']] 
    X['int'] = np.repeat(1, len(Y))
    model1 = Probit(Y,X)
    probit_model1 = model1.fit(cov_type='cluster', cov_kwds={'groups': df_reg['_region']}, disp = disp_it)
    #print(probit_model1.summary()) #got same results as paper
     
        #compute margins (get_margeff)
    probit_margeff1 = probit_model1.get_margeff()
    #probit_margeff1.summary()
     

    #second model
    Y = df_reg['_oral']
    X = df_reg[['sales', 'd1970', 'dsalesX1970', '_Phys', 'd_PhysX1970', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970', 'any', 'anyX1970']]
    X['int'] = np.repeat(1, len(Y))
    model2 = Probit(Y,X)
    probit_model2 = model2.fit(cov_type='cluster', cov_kwds={'groups': df_reg['_region']}, disp = disp_it)
    #print(probit_model2.summary()) #got same results as paper
     
        #compute margins (get_margeff)
    probit_margeff2 = probit_model2.get_margeff()
    probit_margeff2.summary()
    
    #third model
    Y = df_reg['_oral']
    X = df_reg[['sales', 'd1970', 'dsalesX1970', '_Phys', 'd_PhysX1970', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970', 'any', 'anyX1970','d_agecat20', 'd_agecat25', 'd_agecat30', 'd_agecat35',\
                'd_agecat20X1970', 'd_agecat25X1970', 'd_agecat30X1970', 'd_agecat35X1970','_Catholic' ,'_CatholicX1970',\
                'd_ed_cat9', 'd_ed_cat12', 'd_ed_cat13', 'd_ed_cat16', 'd_ed_cat9X1970', 'd_ed_cat12X1970', 'd_ed_cat13X1970', \
                'd_ed_cat16X1970', 'd_hinccat1', 'd_hinccat2', 'd_hinccat3', 'd_hinccat4', 'd_hinccat1X1970', 'd_hinccat2X1970',
                'd_hinccat3X1970', 'd_hinccat4X1970']]
    X['int'] = np.repeat(1, len(Y))
    model3 = Probit(Y,X)
    probit_model3 = model3.fit(cov_type='cluster', cov_kwds={'groups': df_reg['_region']}, disp = disp_it)
    #print(probit_model3.summary()) 
     
        #compute margins (get_margeff)
    probit_margeff3 = probit_model3.get_margeff()
    #probit_margeff3.summary()
    
    #fourth model
    Y = df_reg['_oral']
    X = df_reg[['sales', 'd1970', 'dsalesX1970', '_Phys', 'd_PhysX1970', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970', 'any', 'anyX1970','d_agecat20', 'd_agecat25', 'd_agecat30', 'd_agecat35',\
                'd_agecat20X1970', 'd_agecat25X1970', 'd_agecat30X1970', 'd_agecat35X1970','_Catholic' ,'_CatholicX1970',\
                'd_ed_cat9', 'd_ed_cat12', 'd_ed_cat13', 'd_ed_cat16', 'd_ed_cat9X1970', 'd_ed_cat12X1970', 'd_ed_cat13X1970', \
                'd_ed_cat16X1970', 'd_hinccat1', 'd_hinccat2', 'd_hinccat3', 'd_hinccat4', 'd_hinccat1X1970', 'd_hinccat2X1970',
                'd_hinccat3X1970', 'd_hinccat4X1970', 'd_idealcat2', 'd_idealcat3', 'd_idealcat4', 'd_idealcat5', 'd_idealcat2X1970', \
                'd_idealcat3X1970', 'd_idealcat4X1970', 'd_idealcat5X1970']]
    X['int'] = np.repeat(1, len(Y))
    model4 = Probit(Y,X)
    probit_model4 = model4.fit(cov_type='cluster', cov_kwds={'groups': df_reg['_region']}, disp = disp_it)
    #print(probit_model4.summary()) 
     
        #compute margins (get_margeff)
    probit_margeff4 = probit_model4.get_margeff()
    #print(probit_margeff4.summary())

    table = pd.DataFrame({'(1)': [], '(2)': [], '(3)': [], '(4)': []})
    table[' '] = ['Sales ban', '','p-value', 'Sales ban x 1(1970)', ' ','p-value', 'Observations', 'Log Likelihood', \
                         'Additional Covariates', 'Legal Variables']    
    table = table.set_index(' ')
    table['(1)'] = [round(probit_margeff1.margeff[0],3), '({})'.format(round(probit_margeff1.margeff_se[0],3)), round(probit_margeff1.pvalues[0],3), round(probit_margeff1.margeff[2],3), \
                    '({})'.format(round(probit_margeff1.margeff_se[2],3)), round(probit_margeff1.pvalues[2],3), round(probit_margeff1.results.nobs,3), round(probit_margeff1.results.llf,3),\
                        'R','PX' ]
    table['(2)'] = [round(probit_margeff2.margeff[0],3), '({})'.format(round(probit_margeff2.margeff_se[0],3)), round(probit_margeff2.pvalues[0],3), round(probit_margeff2.margeff[2],3), \
                    '({})'.format(round(probit_margeff2.margeff_se[2],3)), round(probit_margeff2.pvalues[2],3), round(probit_margeff2.results.nobs,3), round(probit_margeff2.results.llf,3),\
                        'R','PX, AD' ]
    table['(3)'] = [round(probit_margeff3.margeff[0],3), '({})'.format(round(probit_margeff3.margeff_se[0],3)), round(probit_margeff3.pvalues[0],3), round(probit_margeff3.margeff[2],3), \
                    '({})'.format(round(probit_margeff3.margeff_se[2],3)), round(probit_margeff3.pvalues[2],3), round(probit_margeff3.results.nobs,3), round(probit_margeff3.results.llf,3),\
                        'R,A,C,E,I','PX, AD' ]
    table['(4)'] = [round(probit_margeff4.margeff[0],3), '({})'.format(round(probit_margeff4.margeff_se[0],3)), round(probit_margeff4.pvalues[0],3), round(probit_margeff4.margeff[2],3), \
                    '({})'.format(round(probit_margeff4.margeff_se[2],3)), round(probit_margeff4.pvalues[2],3), round(probit_margeff4.results.nobs,3), round(probit_margeff4.results.llf,3),\
                        'R,A,C,E,I','PX, AD, K' ]
    
    return table, model1, model2, model3, model4
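
For readers without the underlying survey data, here is a minimal, hypothetical sketch of the clustered-probit-plus-margins pattern used above, on synthetic data (the variable names are illustrative):

import numpy as np
import pandas as pd
from statsmodels.discrete.discrete_model import Probit

rng = np.random.default_rng(0)
n = 400
df = pd.DataFrame({'sales': rng.binomial(1, 0.5, n),
                   'region': rng.integers(1, 5, n)})
y = (0.6 * df['sales'] - 0.2 + rng.normal(size=n) > 0).astype(int)
X = df[['sales']].copy()
X['int'] = 1.0  # constant term, as in the snippets above

res = Probit(y, X).fit(cov_type='cluster',
                       cov_kwds={'groups': df['region']}, disp=0)
print(res.get_margeff().summary())  # average marginal effect of sales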
Example #10
def table3_reg(df_reg, disp_it):
    """Function to create the tables for the second probit models.
    
        Args:
        dataFrame containing the categorial variables as dummies and the interaction terms
        
        Returns:
        -------
        A table containing the regression output of the 8 model specifications for the second table.
    """
    #1. _everuse_d as dependent variable 
    #first model
    Y = df_reg['_everuse_d']
    X = df_reg[['sales', 'd1970','d1965', 'dsalesX1970','dsalesX1965', '_Phys', 'd_PhysX1970', 'd_PhysX1965', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970', 'dreg2X1965', 'dreg3X1965', 'dreg4X1965']] 
    X['int'] = np.repeat(1, len(Y))
    model1 = Probit(Y,X)
    probit_model1 = model1.fit(cov_type='cluster', cov_kwds={'groups': df_reg['_region']}, disp = disp_it)
    #print(probit_model1.summary()) #got same results as paper
     
        #compute margins (get_margeff)
    probit_margeff1 = probit_model1.get_margeff()
    #probit_margeff1.summary()
     

    #second model
    Y = df_reg['_everuse_d']
    X = df_reg[['sales', 'd1970','d1965', 'dsalesX1970','dsalesX1965', '_Phys', 'd_PhysX1970', 'd_PhysX1965', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970', 'dreg2X1965', 'dreg3X1965', 'dreg4X1965', 'any', 'anyX1970', 'anyX1965']]
    X['int'] = np.repeat(1, len(Y))
    model2 = Probit(Y,X)
    probit_model2 = model2.fit(cov_type='cluster', cov_kwds={'groups': df_reg['_region']}, disp = disp_it)
    #print(probit_model2.summary()) #got same results as paper
     
        #compute margins (get_margeff)
    probit_margeff2 = probit_model2.get_margeff()
    probit_margeff2.summary()
    
    #third model
    Y = df_reg['_everuse_d']
    X = df_reg[['sales', 'd1970','d1965', 'dsalesX1970','dsalesX1965', '_Phys', 'd_PhysX1970', 'd_PhysX1965', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970','dreg2X1965', 'dreg3X1965', 'dreg4X1965', 'any', 'anyX1970',
                'anyX1965','d_agecat20', 'd_agecat25', 'd_agecat30', 'd_agecat35', 'd_agecat20X1970', \
                'd_agecat25X1970', 'd_agecat30X1970', 'd_agecat35X1970','d_agecat20X1965', 'd_agecat25X1965', \
                'd_agecat30X1965', 'd_agecat35X1965','_Catholic' ,'_CatholicX1970', '_CatholicX1965',\
                'd_ed_cat9', 'd_ed_cat12', 'd_ed_cat13', 'd_ed_cat16', 'd_ed_cat9X1970', 'd_ed_cat12X1970', \
                'd_ed_cat13X1970',  'd_ed_cat9X1965', 'd_ed_cat12X1965', 'd_ed_cat13X1965', \
                'd_ed_cat16X1970','d_ed_cat16X1965', 'd_hinccat1', 'd_hinccat2', 'd_hinccat3', 'd_hinccat4', \
                'd_hinccat1X1970', 'd_hinccat2X1970', \
                'd_hinccat3X1970', 'd_hinccat4X1970',  'd_hinccat1X1965', 'd_hinccat2X1965', 'd_hinccat3X1965', \
                'd_hinccat4X1965']]
    X['int'] = np.repeat(1, len(Y))
    model3 = Probit(Y,X)
    probit_model3 = model3.fit(cov_type='cluster', cov_kwds={'groups': df_reg['_region']}, disp = disp_it)
    #print(probit_model3.summary()) 
     
        #compute margins (get_margeff)
    probit_margeff3 = probit_model3.get_margeff()
    probit_margeff3.summary()
    
    #fourth model
    Y = df_reg['_everuse_d']
    X = df_reg[['sales', 'd1970','d1965', 'dsalesX1970','dsalesX1965', '_Phys', 'd_PhysX1970', 'd_PhysX1965', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970','dreg2X1965', 'dreg3X1965', 'dreg4X1965', 'any', 'anyX1970',
                'anyX1965','d_agecat20', 'd_agecat25', 'd_agecat30', 'd_agecat35', 'd_agecat20X1970', \
                'd_agecat25X1970', 'd_agecat30X1970', 'd_agecat35X1970','d_agecat20X1965', 'd_agecat25X1965', \
                'd_agecat30X1965', 'd_agecat35X1965','_Catholic' ,'_CatholicX1970', '_CatholicX1965',\
                'd_ed_cat9', 'd_ed_cat12', 'd_ed_cat13', 'd_ed_cat16', 'd_ed_cat9X1970', 'd_ed_cat12X1970', \
                'd_ed_cat13X1970',  'd_ed_cat9X1965', 'd_ed_cat12X1965', 'd_ed_cat13X1965', \
                'd_ed_cat16X1970','d_ed_cat16X1965', 'd_hinccat1', 'd_hinccat2', 'd_hinccat3', 'd_hinccat4', \
                'd_hinccat1X1970', 'd_hinccat2X1970', \
                'd_hinccat3X1970', 'd_hinccat4X1970',  'd_hinccat1X1965', 'd_hinccat2X1965', 'd_hinccat3X1965', \
                'd_hinccat4X1965', 'd_idealcat2', 'd_idealcat3', 'd_idealcat4', 'd_idealcat5', 'd_idealcat2X1970', \
                'd_idealcat3X1970', 'd_idealcat4X1970', 'd_idealcat5X1970', 'd_idealcat2X1965', \
                'd_idealcat3X1965', 'd_idealcat4X1965', 'd_idealcat5X1965']]
                
    X['int'] = np.repeat(1, len(Y))
    model4 = Probit(Y,X)
    probit_model4 = model4.fit(cov_type='cluster', cov_kwds={'groups': df_reg['_region']}, disp = disp_it)
    #print(probit_model4.summary()) 
     
        #compute margins (get_margeff)
    probit_margeff4 = probit_model4.get_margeff()
    probit_margeff4.summary()
    
    #store results
    model1_help = model1
    model2_help = model2
    model3_help = model3
    model4_help = model4
    
    
    #2. _barrier as dependent variable
    #first model
    Y = df_reg['_barrier']
    X = df_reg[['sales', 'd1970','d1965', 'dsalesX1970','dsalesX1965', '_Phys', 'd_PhysX1970', 'd_PhysX1965', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970', 'dreg2X1965', 'dreg3X1965', 'dreg4X1965']] 
    X['int'] = np.repeat(1, len(Y))
    model1 = Probit(Y,X)
    probit_model1 = model1.fit(cov_type='cluster', cov_kwds={'groups': df_reg['_region']}, disp = disp_it)
    #print(probit_model1.summary()) #got same results as paper
     
        #compute margins (get_margeff)
    probit_margeffb1 = probit_model1.get_margeff()
    probit_margeffb1.summary()
     

    #second model
    Y = df_reg['_barrier']
    X = df_reg[['sales', 'd1970','d1965', 'dsalesX1970','dsalesX1965', '_Phys', 'd_PhysX1970', 'd_PhysX1965', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970', 'dreg2X1965', 'dreg3X1965', 'dreg4X1965', 'any', 'anyX1970', 'anyX1965']]
    X['int'] = np.repeat(1, len(Y))
    model2 = Probit(Y,X)
    probit_model2 = model2.fit(cov_type='cluster', cov_kwds={'groups': df_reg['_region']}, disp = disp_it)
    #print(probit_model2.summary()) #got same results as paper
     
        #compute margins (get_margeff)
    probit_margeffb2 = probit_model2.get_margeff()
    probit_margeffb2.summary()
    
    #third model
    Y = df_reg['_barrier']
    X = df_reg[['sales', 'd1970','d1965', 'dsalesX1970','dsalesX1965', '_Phys', 'd_PhysX1970', 'd_PhysX1965', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970','dreg2X1965', 'dreg3X1965', 'dreg4X1965', 'any', 'anyX1970',
                'anyX1965','d_agecat20', 'd_agecat25', 'd_agecat30', 'd_agecat35', 'd_agecat20X1970', \
                'd_agecat25X1970', 'd_agecat30X1970', 'd_agecat35X1970','d_agecat20X1965', 'd_agecat25X1965', \
                'd_agecat30X1965', 'd_agecat35X1965','_Catholic' ,'_CatholicX1970', '_CatholicX1965',\
                'd_ed_cat9', 'd_ed_cat12', 'd_ed_cat13', 'd_ed_cat16', 'd_ed_cat9X1970', 'd_ed_cat12X1970', \
                'd_ed_cat13X1970',  'd_ed_cat9X1965', 'd_ed_cat12X1965', 'd_ed_cat13X1965', \
                'd_ed_cat16X1970','d_ed_cat16X1965', 'd_hinccat1', 'd_hinccat2', 'd_hinccat3', 'd_hinccat4', \
                'd_hinccat1X1970', 'd_hinccat2X1970', \
                'd_hinccat3X1970', 'd_hinccat4X1970',  'd_hinccat1X1965', 'd_hinccat2X1965', 'd_hinccat3X1965', \
                'd_hinccat4X1965']]
    X = X.assign(int=1)  # intercept column; .assign() returns a copy, avoiding SettingWithCopyWarning
    model3 = Probit(Y,X)
    probit_model3 = model3.fit(cov_type='cluster', cov_kwds={'groups': df_reg['_region']}, disp = disp_it)
    #print(probit_model3.summary()) 
     
        #compute margins (get_margeff)
    probit_margeffb3 = probit_model3.get_margeff()
    probit_margeffb3.summary()
    
    #fourth model
    Y = df_reg['_barrier']
    X = df_reg[['sales', 'd1970','d1965', 'dsalesX1970','dsalesX1965', '_Phys', 'd_PhysX1970', 'd_PhysX1965', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970','dreg2X1965', 'dreg3X1965', 'dreg4X1965', 'any', 'anyX1970',
                'anyX1965','d_agecat20', 'd_agecat25', 'd_agecat30', 'd_agecat35', 'd_agecat20X1970', \
                'd_agecat25X1970', 'd_agecat30X1970', 'd_agecat35X1970','d_agecat20X1965', 'd_agecat25X1965', \
                'd_agecat30X1965', 'd_agecat35X1965','_Catholic' ,'_CatholicX1970', '_CatholicX1965',\
                'd_ed_cat9', 'd_ed_cat12', 'd_ed_cat13', 'd_ed_cat16', 'd_ed_cat9X1970', 'd_ed_cat12X1970', \
                'd_ed_cat13X1970',  'd_ed_cat9X1965', 'd_ed_cat12X1965', 'd_ed_cat13X1965', \
                'd_ed_cat16X1970','d_ed_cat16X1965', 'd_hinccat1', 'd_hinccat2', 'd_hinccat3', 'd_hinccat4', \
                'd_hinccat1X1970', 'd_hinccat2X1970', \
                'd_hinccat3X1970', 'd_hinccat4X1970',  'd_hinccat1X1965', 'd_hinccat2X1965', 'd_hinccat3X1965', \
                'd_hinccat4X1965', 'd_idealcat2', 'd_idealcat3', 'd_idealcat4', 'd_idealcat5', 'd_idealcat2X1970', \
                'd_idealcat3X1970', 'd_idealcat4X1970', 'd_idealcat5X1970', 'd_idealcat2X1965', \
                'd_idealcat3X1965', 'd_idealcat4X1965', 'd_idealcat5X1965']]
                
    X = X.assign(int=1)  # intercept column; .assign() returns a copy, avoiding SettingWithCopyWarning
    model4 = Probit(Y,X)
    probit_model4 = model4.fit(cov_type='cluster', cov_kwds={'groups': df_reg['_region']}, disp = disp_it)
    #print(probit_model4.summary()) 
     
        #compute margins (get_margeff)
    probit_margeffb4 = probit_model4.get_margeff()
    probit_margeffb4.summary()


    #3. create table for output
    
    # build the index first: assigning a 27-row column to an empty frame raises ValueError
    row_labels = ['Ever used Pill', 'Sales ban', '', 'p-value', 'Sales ban x 1(1965)', ' ', 'p-value',
                  'Sales ban x 1(1970)', ' ', 'p-value', 'Observations', 'Log Likelihood', ' ',
                  'Ever used barrier', 'Sales ban', '', 'p-value', 'Sales ban x 1(1965)', ' ', 'p-value',
                  'Sales ban x 1(1970)', ' ', 'p-value', 'Observations', 'Log Likelihood',
                  'Additional Covariates', 'Legal Variables']
    table = pd.DataFrame(index=pd.Index(row_labels, name=' '))
    table['(1)'] = [' ', round(probit_margeff1.margeff[0],3), '({})'.format(round(probit_margeff1.margeff_se[0],3)),\
                    round(probit_margeff1.pvalues[0],3), round(probit_margeff1.margeff[4],3), \
                    '({})'.format(round(probit_margeff1.margeff_se[4],3)), round(probit_margeff1.pvalues[4],3),\
                    round(probit_margeff1.margeff[3],3), \
                    '({})'.format(round(probit_margeff1.margeff_se[3],3)), round(probit_margeff1.pvalues[3],3),\
                    round(probit_margeff1.results.nobs,3), round(probit_margeff1.results.llf,3),\
                    ' ', ' ', round(probit_margeffb1.margeff[0],3), '({})'.format(round(probit_margeffb1.margeff_se[0],3)),\
                    round(probit_margeffb1.pvalues[0],3), round(probit_margeffb1.margeff[4],3), \
                    '({})'.format(round(probit_margeffb1.margeff_se[4],3)), round(probit_margeffb1.pvalues[4],3),\
                    round(probit_margeffb1.margeff[3],3), '({})'.format(round(probit_margeffb1.margeff_se[3],3)),\
                    round(probit_margeffb1.pvalues[3],3), round(probit_margeffb1.results.nobs,3),\
                    round(probit_margeffb1.results.llf,3), 'R','PX']
        
    table['(2)'] = [' ', round(probit_margeff2.margeff[0],3), '({})'.format(round(probit_margeff2.margeff_se[0],3)),\
                    round(probit_margeff2.pvalues[0],3), round(probit_margeff2.margeff[4],3), \
                    '({})'.format(round(probit_margeff2.margeff_se[4],3)), round(probit_margeff2.pvalues[4],3),\
                    round(probit_margeff2.margeff[3],3), \
                    '({})'.format(round(probit_margeff2.margeff_se[3],3)), round(probit_margeff2.pvalues[3],3),\
                    round(probit_margeff2.results.nobs,3), round(probit_margeff2.results.llf,3),\
                    ' ', ' ', round(probit_margeffb2.margeff[0],3), '({})'.format(round(probit_margeffb2.margeff_se[0],3)),\
                    round(probit_margeffb2.pvalues[0],3), round(probit_margeffb2.margeff[4],3), \
                    '({})'.format(round(probit_margeffb2.margeff_se[4],3)), round(probit_margeffb2.pvalues[4],3),\
                    round(probit_margeffb2.margeff[3],3), '({})'.format(round(probit_margeffb2.margeff_se[3],3)),\
                    round(probit_margeffb2.pvalues[3],3), round(probit_margeffb2.results.nobs,3),\
                    round(probit_margeffb2.results.llf,3), \
                    'R','PX, AD' ]
        
    table['(3)'] = [' ', round(probit_margeff3.margeff[0],3), '({})'.format(round(probit_margeff3.margeff_se[0],3)),\
                    round(probit_margeff3.pvalues[0],3), round(probit_margeff3.margeff[4],3), \
                    '({})'.format(round(probit_margeff3.margeff_se[4],3)), round(probit_margeff3.pvalues[4],3),\
                    round(probit_margeff3.margeff[3],3), \
                    '({})'.format(round(probit_margeff3.margeff_se[3],3)), round(probit_margeff3.pvalues[3],3),\
                    round(probit_margeff3.results.nobs,3), round(probit_margeff3.results.llf,3),\
                    ' ', ' ', round(probit_margeffb3.margeff[0],3), '({})'.format(round(probit_margeffb3.margeff_se[0],3)),\
                    round(probit_margeffb3.pvalues[0],3), round(probit_margeffb3.margeff[4],3), \
                    '({})'.format(round(probit_margeffb3.margeff_se[4],3)), round(probit_margeffb3.pvalues[4],3),\
                    round(probit_margeffb3.margeff[3],3), '({})'.format(round(probit_margeffb3.margeff_se[3],3)),\
                    round(probit_margeffb3.pvalues[3],3), round(probit_margeffb3.results.nobs,3),\
                    round(probit_margeffb3.results.llf,3),
                    'R,A,C,E,I','PX, AD' ]
        
    table['(4)'] = [' ', round(probit_margeff4.margeff[0],3), '({})'.format(round(probit_margeff4.margeff_se[0],3)),\
                    round(probit_margeff4.pvalues[0],3), round(probit_margeff4.margeff[4],3), \
                    '({})'.format(round(probit_margeff4.margeff_se[4],3)), round(probit_margeff4.pvalues[4],3),\
                    round(probit_margeff4.margeff[3],3), \
                    '({})'.format(round(probit_margeff4.margeff_se[3],3)), round(probit_margeff4.pvalues[3],3),\
                    round(probit_margeff4.results.nobs,3), round(probit_margeff4.results.llf,3),\
                    ' ', ' ', round(probit_margeffb4.margeff[0],3), '({})'.format(round(probit_margeffb4.margeff_se[0],3)),\
                    round(probit_margeffb4.pvalues[0],3), round(probit_margeffb4.margeff[4],3), \
                    '({})'.format(round(probit_margeffb4.margeff_se[4],3)), round(probit_margeffb4.pvalues[4],3),\
                    round(probit_margeffb4.margeff[3],3), '({})'.format(round(probit_margeffb4.margeff_se[3],3)),\
                    round(probit_margeffb4.pvalues[3],3), round(probit_margeffb4.results.nobs,3),\
                    round(probit_margeffb4.results.llf,3),
                    'R,A,C,E,I','PX, AD, K' ]

    
    return table, model1, model2, model3, model4, model1_help, model2_help, model3_help, model4_help
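
# Hedged usage sketch (caller name hypothetical): the returned DataFrame can
# be exported with pandas' built-in writers, e.g.
#
#     table, *models = build_probit_table(df_reg)
#     print(table.to_latex())
#     table.to_csv('probit_margins.csv')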
Example #11
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.discrete.discrete_model import Probit


def flip_bits(y, p):
    """Flip each entry of y with probability p (label noise)."""
    # draw the random mask once; the original compared a boolean mask to p again
    mask = np.random.rand(y.shape[0], 1) < p
    y[mask] = 1 - y[mask]
    return y


n, d = 100, 2
data_x = np.random.randn(n, d)
w = np.random.randn(d, 1)
data_y = flip_bits((data_x @ w > 0).astype(np.float64), 0)

lam = 1e-2

# statsmodel.Probit
sm_probit_reg = Probit(exog=data_x, endog=data_y).fit(disp=0, method='bfgs')
sm_probit_prob = sm_probit_reg.predict(exog=data_x)

# Our Implementation:
probit_reg = ProbitReg()

# EM:
em_w, obj_trace_em = probit_reg.probreg_fit_em(data_x, data_y, lam)
em_ypred, em_prob = probit_reg.predict(data_x, em_w)

# gradient:
gradient_w, obj_trace_gradient = probit_reg.probit_reg_fit_gradient(
    data_x, data_y, lam)
gradient_ypred, gradient_prob = probit_reg.predict(data_x, gradient_w)

plt.figure()
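
# A minimal sketch (not the truncated ProbitReg class above) of what a
# gradient-based penalized probit fit could look like; `lam` is the L2
# penalty weight and the data names are reused from the snippet.
from scipy.optimize import minimize
from scipy.stats import norm

def probit_penalized_nll(w, X, y, lam):
    # negative probit log-likelihood plus an L2 penalty
    p = np.clip(norm.cdf(X @ w), 1e-10, 1 - 1e-10)
    return -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p)) + lam * (w @ w)

w_hat = minimize(probit_penalized_nll, np.zeros(d),
                 args=(data_x, data_y.ravel(), lam)).x
prob_hat = norm.cdf(data_x @ w_hat)  # predicted P(y=1), comparable to sm_probit_prob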
Example #12
import os

import numpy as np
import pandas as pd

from statsmodels.regression.linear_model import OLS
from statsmodels.discrete.discrete_model import Probit
from statsmodels.treatment.treatment_effects import TreatmentEffect

from .results import results_teffects as res_st

cur_dir = os.path.abspath(os.path.dirname(__file__))

file_name = 'cataneo2.csv'
file_path = os.path.join(cur_dir, 'results', file_name)

dta_cat = pd.read_csv(file_path)

formula = 'mbsmoke_ ~ mmarried_ + mage + mage2 + fbaby_ + medu'
res_probit = Probit.from_formula(formula, dta_cat).fit()

methods = [
    ("ra", res_st.results_ra),
    ("ipw", res_st.results_ipw),
    ("aipw", res_st.results_aipw),
    ("aipw_wls", res_st.results_aipw_wls),
    ("ipw_ra", res_st.results_ipwra),
]


class TestTEffects:
    @classmethod
    def setup_class(cls):
        formula_outcome = 'bweight ~ prenatal1_ + mmarried_ + mage + fbaby_'
        mod = OLS.from_formula(formula_outcome, dta_cat)
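
# Hedged usage sketch (signature assumed from this test's context): the
# estimators listed in `methods` can be computed with TreatmentEffect, using
# the probit model above for the selection (propensity) step:
#
#     mod = OLS.from_formula('bweight ~ prenatal1_ + mmarried_ + mage + fbaby_',
#                            dta_cat)
#     teff = TreatmentEffect(mod, np.asarray(dta_cat['mbsmoke_']),
#                            results_select=res_probit)
#     print(teff.ipw())    # inverse probability weighting
#     print(teff.aipw())   # augmented (doubly robust) IPW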
Example #13
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Probit


# In[13]:


y = df["Outcome"]
x = df[["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "Age", "DiabetesPedigreeFunction"]]

logit_model = sm.Logit(y, x)
result = logit_model.fit()
print(result.summary())


# # Probit Regression

# In[14]:


probitmodel = Probit(y, x)
probit_model = probitmodel.fit()

print(probit_model.summary())
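
# Note: both models above are fit without an intercept; statsmodels does not
# add one automatically. A hedged sketch of the same probit with a constant:
x_const = sm.add_constant(x)
probit_const = Probit(y, x_const).fit()
print(probit_const.summary())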

Example #14
    def __init__(self,
                 endog,
                 exog,
                 exog_infl=None,
                 offset=None,
                 inflation='logit',
                 exposure=None,
                 missing='none',
                 **kwargs):
        super(GenericZeroInflated, self).__init__(endog,
                                                  exog,
                                                  offset=offset,
                                                  exposure=exposure,
                                                  missing=missing,
                                                  **kwargs)

        if exog_infl is None:
            self.k_inflate = 1
            self._no_exog_infl = True
            self.exog_infl = np.ones((endog.size, self.k_inflate),
                                     dtype=np.float64)
        else:
            self.exog_infl = exog_infl
            self.k_inflate = exog_infl.shape[1]
            self._no_exog_infl = False

        if len(exog.shape) == 1:
            self.k_exog = 1
        else:
            self.k_exog = exog.shape[1]

        self.infl = inflation
        if inflation == 'logit':
            self.model_infl = Logit(np.zeros(self.exog_infl.shape[0]),
                                    self.exog_infl)
            self._hessian_inflate = self._hessian_logit
        elif inflation == 'probit':
            self.model_infl = Probit(np.zeros(self.exog_infl.shape[0]),
                                     self.exog_infl)
            self._hessian_inflate = self._hessian_probit

        else:
            raise ValueError("inflation == %s, which is not handled" %
                             inflation)

        self.inflation = inflation
        self.k_extra = self.k_inflate

        if len(self.exog) != len(self.exog_infl):
            raise ValueError(
                'exog and exog_infl have different number of '
                'observations. `missing` handling is not supported')

        infl_names = [
            'inflate_%s' % i for i in self.model_infl.data.param_names
        ]
        self.exog_names[:] = infl_names + list(self.exog_names)
        self.exog_infl = np.asarray(self.exog_infl, dtype=np.float64)

        self._init_keys.extend(['exog_infl', 'inflation'])
        self._null_drop_keys = ['exog_infl']
Example #15
import os

# Python 3: raw_input() was renamed to input()
debug = input("please attach to pid:{}, then press any key".format(
    os.getpid()))
modp = Poisson(y, x)
resp = modp.fit()
print(resp.params)

mod = PoissonPenalized(y, x)
res = mod.fit(method='bfgs', maxiter=1000)
print(res.params)

############### Penalized Probit
y_star = linpred + 0.25 * np.random.randn(nobs)
y2 = (y_star > 0.75).astype(float)
print(y_star.mean(), y2.mean())

res0 = Probit(y2, x).fit()
print(res0.summary())
res_oracle = Probit(y2, x[:, :k_nonzero]).fit()
print(res_oracle.params)

res_oracle.pred_table()
margeff = res_oracle.get_margeff()
print(margeff.summary())

modl = ProbitPenalized(y2, x)
modl.penal.tau = 0
resl = modl.fit(method='newton', disp=True)
print(resl.params)
print(resl.params - res0.params)

res_regl = Probit(y2, x).fit_regularized(alpha=10)
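
# The snippet above relies on `y`, `x`, `linpred`, `nobs` and `k_nonzero`
# defined in a truncated earlier part. A hypothetical setup under which the
# penalized-probit section would run (values purely illustrative):
#
#     import numpy as np
#     nobs, k_vars, k_nonzero = 500, 10, 4
#     x = np.column_stack([np.ones(nobs), np.random.randn(nobs, k_vars - 1)])
#     beta = np.r_[np.ones(k_nonzero), np.zeros(k_vars - k_nonzero)]
#     linpred = x @ beta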
Example #16
class GenericZeroInflated(CountModel):
    __doc__ = """
    Generic Zero Inflated model for count data

    %(params)s
    %(extra_params)s

    Attributes
    ----------
    endog : array
        A reference to the endogenous response variable
    exog : array
        A reference to the exogenous design.
    exog_infl : array
        A reference to the zero-inflated exogenous design.
    """ % {'params' : base._model_params_doc,
           'extra_params' : _doc_zi_params + base._missing_param_doc}

    def __init__(self, endog, exog, exog_infl=None, offset=None,
                 inflation='logit', exposure=None, missing='none', **kwargs):
        super(GenericZeroInflated, self).__init__(endog, exog, offset=offset,
                                                  exposure=exposure,
                                                  missing=missing, **kwargs)

        if exog_infl is None:
            self.k_inflate = 1
            self.exog_infl = np.ones((endog.size, self.k_inflate),
                                     dtype=np.float64)
        else:
            self.exog_infl = exog_infl
            self.k_inflate = exog_infl.shape[1]

        if len(exog.shape) == 1:
            self.k_exog = 1
        else:
            self.k_exog = exog.shape[1]

        self.infl = inflation
        if inflation == 'logit':
            self.model_infl = Logit(np.zeros(self.exog_infl.shape[0]),
                                    self.exog_infl)
            self._hessian_inflate = self._hessian_logit
        elif inflation == 'probit':
            self.model_infl = Probit(np.zeros(self.exog_infl.shape[0]),
                                     self.exog_infl)
            self._hessian_inflate = self._hessian_probit

        else:
            raise ValueError("inflation == %s, which is not handled"
                             % inflation)

        self.inflation = inflation
        self.k_extra = self.k_inflate

        if len(self.exog) != len(self.exog_infl):
            raise ValueError('exog and exog_infl have different number of '
                             'observations. `missing` handling is not supported')

        infl_names = ['inflate_%s' % i for i in self.model_infl.data.param_names]
        self.exog_names[:] = infl_names + list(self.exog_names)
        self.exog_infl = np.asarray(self.exog_infl, dtype=np.float64)

        self._init_keys.extend(['exog_infl', 'inflation'])
        self._null_drop_keys = ['exog_infl']

    def loglike(self, params):
        """
        Loglikelihood of Generic Zero Inflated model

        Parameters
        ----------
        params : array-like
            The parameters of the model.

        Returns
        -------
        loglike : float
            The log-likelihood function of the model evaluated at `params`.
            See notes.

        Notes
        -----
        .. math:: \\ln L=\\sum_{y_{i}=0}\\ln(w_{i}+(1-w_{i})*P_{main\\_model})+
            \\sum_{y_{i}>0}(\\ln(1-w_{i})+L_{main\\_model})

        where :math:`P` is the pdf of the main model and :math:`L` is the
        loglikelihood of the main model.

        """
        return np.sum(self.loglikeobs(params))

    def loglikeobs(self, params):
        """
        Loglikelihood for observations of Generic Zero Inflated model

        Parameters
        ----------
        params : array-like
            The parameters of the model.

        Returns
        -------
        loglike : ndarray
            The log likelihood for each observation of the model evaluated
            at `params`. See Notes

        Notes
        -----
        .. math:: \\ln L=\\ln(w_{i}+(1-w_{i})*P_{main\\_model})+
            \\ln(1-w_{i})+L_{main\\_model}

        where :math:`P` is the pdf of the main model and :math:`L` is the
        loglikelihood of the main model, for observations :math:`i=1,...,n`.

        """
        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        y = self.endog
        w = self.model_infl.predict(params_infl)

        w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps)
        llf_main = self.model_main.loglikeobs(params_main)
        zero_idx = np.nonzero(y == 0)[0]
        nonzero_idx = np.nonzero(y)[0]

        llf = np.zeros_like(y, dtype=np.float64)
        llf[zero_idx] = (np.log(w[zero_idx] +
            (1 - w[zero_idx]) * np.exp(llf_main[zero_idx])))
        llf[nonzero_idx] = np.log(1 - w[nonzero_idx]) + llf_main[nonzero_idx]

        return llf

    def fit(self, start_params=None, method='bfgs', maxiter=35,
            full_output=1, disp=1, callback=None,
            cov_type='nonrobust', cov_kwds=None, use_t=None, **kwargs):
        if start_params is None:
            offset = getattr(self, "offset", 0) + getattr(self, "exposure", 0)
            if np.size(offset) == 1 and offset == 0:
                offset = None
            start_params = self._get_start_params()

        if callback is None:
            # work around perfect separation callback #3895
            callback = lambda *x: x

        mlefit = super(GenericZeroInflated, self).fit(start_params=start_params,
                       maxiter=maxiter, disp=disp, method=method,
                       full_output=full_output, callback=callback,
                       **kwargs)

        zipfit = self.result_class(self, mlefit._results)
        result = self.result_class_wrapper(zipfit)

        if cov_kwds is None:
            cov_kwds = {}

        result._get_robustcov_results(cov_type=cov_type,
                                      use_self=True, use_t=use_t, **cov_kwds)
        return result

    fit.__doc__ = DiscreteModel.fit.__doc__

    def fit_regularized(self, start_params=None, method='l1',
            maxiter='defined_by_method', full_output=1, disp=1, callback=None,
            alpha=0, trim_mode='auto', auto_trim_tol=0.01, size_trim_tol=1e-4,
            qc_tol=0.03, **kwargs):

        if np.size(alpha) == 1 and alpha != 0:
            k_params = self.k_exog + self.k_inflate
            alpha = alpha * np.ones(k_params)

        extra = self.k_extra - self.k_inflate
        alpha_p = alpha[:-(self.k_extra - extra)] if (self.k_extra
            and np.size(alpha) > 1) else alpha
        if start_params is None:
            offset = getattr(self, "offset", 0) + getattr(self, "exposure", 0)
            if np.size(offset) == 1 and offset == 0:
                offset = None
            start_params = self.model_main.fit_regularized(
                start_params=start_params, method=method, maxiter=maxiter,
                full_output=full_output, disp=0, callback=callback,
                alpha=alpha_p, trim_mode=trim_mode, auto_trim_tol=auto_trim_tol,
                size_trim_tol=size_trim_tol, qc_tol=qc_tol, **kwargs).params
            start_params = np.append(np.ones(self.k_inflate), start_params)
        cntfit = super(CountModel, self).fit_regularized(
                start_params=start_params, method=method, maxiter=maxiter,
                full_output=full_output, disp=disp, callback=callback,
                alpha=alpha, trim_mode=trim_mode, auto_trim_tol=auto_trim_tol,
                size_trim_tol=size_trim_tol, qc_tol=qc_tol, **kwargs)

        if method in ['l1', 'l1_cvxopt_cp']:
            discretefit = self.result_class_reg(self, cntfit)
        else:
            raise ValueError(
                    "argument method == %s, which is not handled" % method)

        return self.result_class_reg_wrapper(discretefit)

    fit_regularized.__doc__ = DiscreteModel.fit_regularized.__doc__

    def score_obs(self, params):
        """
        Generic Zero Inflated model score (gradient) vector of the log-likelihood

        Parameters
        ----------
        params : array-like
            The parameters of the model

        Returns
        -------
        score : ndarray, 1-D
            The score vector of the model, i.e. the first derivative of the
            loglikelihood function, evaluated at `params`
        """
        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        y = self.endog
        w = self.model_infl.predict(params_infl)
        w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps)
        score_main = self.model_main.score_obs(params_main)
        llf_main = self.model_main.loglikeobs(params_main)
        llf = self.loglikeobs(params)
        zero_idx = np.nonzero(y == 0)[0]
        nonzero_idx = np.nonzero(y)[0]

        mu = self.model_main.predict(params_main)

        dldp = np.zeros((self.exog.shape[0], self.k_exog), dtype=np.float64)
        dldw = np.zeros_like(self.exog_infl, dtype=np.float64)

        dldp[zero_idx,:] = (score_main[zero_idx].T *
                     (1 - (w[zero_idx]) / np.exp(llf[zero_idx]))).T
        dldp[nonzero_idx,:] = score_main[nonzero_idx]

        if self.inflation == 'logit':
            dldw[zero_idx,:] =  (self.exog_infl[zero_idx].T * w[zero_idx] *
                                 (1 - w[zero_idx]) *
                                 (1 - np.exp(llf_main[zero_idx])) /
                                  np.exp(llf[zero_idx])).T
            dldw[nonzero_idx,:] = -(self.exog_infl[nonzero_idx].T *
                                    w[nonzero_idx]).T
        elif self.inflation == 'probit':
            return approx_fprime(params, self.loglikeobs)

        return np.hstack((dldw, dldp))

    def score(self, params):
        return self.score_obs(params).sum(0)

    def _hessian_main(self, params):
        pass

    def _hessian_logit(self, params):
        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        y = self.endog
        w = self.model_infl.predict(params_infl)
        w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps)
        score_main = self.model_main.score_obs(params_main)
        llf_main = self.model_main.loglikeobs(params_main)
        llf = self.loglikeobs(params)
        zero_idx = np.nonzero(y == 0)[0]
        nonzero_idx = np.nonzero(y)[0]

        hess_arr = np.zeros((self.k_inflate, self.k_exog + self.k_inflate))

        pmf = np.exp(llf)

        #d2l/dw2
        for i in range(self.k_inflate):
            for j in range(i, -1, -1):
                hess_arr[i, j] = ((
                    self.exog_infl[zero_idx, i] * self.exog_infl[zero_idx, j] *
                    (w[zero_idx] * (1 - w[zero_idx]) * ((1 -
                    np.exp(llf_main[zero_idx])) * (1 - 2 * w[zero_idx]) *
                    np.exp(llf[zero_idx]) - (w[zero_idx] - w[zero_idx]**2) *
                    (1 - np.exp(llf_main[zero_idx]))**2) /
                    pmf[zero_idx]**2)).sum() -
                    (self.exog_infl[nonzero_idx, i] * self.exog_infl[nonzero_idx, j] *
                    w[nonzero_idx] * (1 - w[nonzero_idx])).sum())

        #d2l/dpdw
        for i in range(self.k_inflate):
            for j in range(self.k_exog):
                hess_arr[i, j + self.k_inflate] = -(score_main[zero_idx, j] *
                    w[zero_idx] * (1 - w[zero_idx]) *
                    self.exog_infl[zero_idx, i] / pmf[zero_idx]).sum()

        return hess_arr

    def _hessian_probit(self, params):
        pass

    def hessian(self, params):
        """
        Generic Zero Inflated model Hessian matrix of the loglikelihood

        Parameters
        ----------
        params : array-like
            The parameters of the model

        Returns
        -------
        hess : ndarray, (k_vars, k_vars)
            The Hessian, second derivative of loglikelihood function,
            evaluated at `params`

        Notes
        -----
        """
        hess_arr_main = self._hessian_main(params)
        hess_arr_infl = self._hessian_inflate(params)

        if hess_arr_main is None or hess_arr_infl is None:
            return approx_hess(params, self.loglike)

        dim = self.k_exog + self.k_inflate

        hess_arr = np.zeros((dim, dim))

        hess_arr[:self.k_inflate,:] = hess_arr_infl
        hess_arr[self.k_inflate:,self.k_inflate:] = hess_arr_main

        tri_idx = np.triu_indices(self.k_exog + self.k_inflate, k=1)
        hess_arr[tri_idx] = hess_arr.T[tri_idx]

        return hess_arr

    def predict(self, params, exog=None, exog_infl=None, exposure=None,
                offset=None, which='mean'):
        """
        Predict response variable of a count model given exogenous variables.

        Parameters
        ----------
        params : array-like
            The parameters of the model
        exog : array, optional
            A reference to the exogenous design.
            If not supplied, the exog used in fitting is used.
        exog_infl : array, optional
            A reference to the zero-inflated exogenous design.
            If not supplied, the exog_infl used in fitting is used.
        offset : array, optional
            Offset is added to the linear prediction with coefficient equal to 1.
        exposure : array, optional
            Log(exposure) is added to the linear prediction with coefficient
            equal to 1. If exposure is specified, then it will be logged by the method.
            The user does not need to log it first.
        which : string, optional
            Define values that will be predicted.
            'mean', 'mean-main', 'linear', 'mean-nonzero', 'prob-zero', 'prob', 'prob-main'
            Default is 'mean'.

        Notes
        -----
        """
        if exog is None:
            exog = self.exog

        if exog_infl is None:
            exog_infl = self.exog_infl

        if exposure is None:
            exposure = getattr(self, 'exposure', 0)
        else:
            exposure = np.log(exposure)

        if offset is None:
            offset = 0

        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        prob_main = 1 - self.model_infl.predict(params_infl, exog_infl)

        lin_pred = np.dot(exog, params_main[:self.exog.shape[1]]) + exposure + offset

        # Refactor: This is pretty hacky,
        # there should be an appropriate predict method in model_main
        # this is just prob(y=0 | model_main)
        tmp_exog = self.model_main.exog
        tmp_endog = self.model_main.endog
        tmp_offset = getattr(self.model_main, 'offset', ['no'])
        tmp_exposure = getattr(self.model_main, 'exposure', ['no'])
        self.model_main.exog = exog
        self.model_main.endog = np.zeros((exog.shape[0]))
        self.model_main.offset = offset
        self.model_main.exposure = exposure
        llf = self.model_main.loglikeobs(params_main)
        self.model_main.exog = tmp_exog
        self.model_main.endog = tmp_endog
        # tmp_offset might be an array with elementwise equality testing
        if len(tmp_offset) == 1 and tmp_offset[0] == 'no':
            del self.model_main.offset
        else:
            self.model_main.offset = tmp_offset
        if len(tmp_exposure) == 1 and tmp_exposure[0] == 'no':
            del self.model_main.exposure
        else:
            self.model_main.exposure = tmp_exposure
        # end hack

        prob_zero = (1 - prob_main) + prob_main * np.exp(llf)

        if which == 'mean':
            return prob_main * np.exp(lin_pred)
        elif which == 'mean-main':
            return np.exp(lin_pred)
        elif which == 'linear':
            return lin_pred
        elif which == 'mean-nonzero':
            return prob_main * np.exp(lin_pred) / (1 - prob_zero)
        elif which == 'prob-zero':
            return prob_zero
        elif which == 'prob-main':
            return prob_main
        elif which == 'prob':
            return self._predict_prob(params, exog, exog_infl, exposure, offset)
        else:
            raise ValueError('which = %s is not available' % which)
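
# GenericZeroInflated is a base class; a hedged end-to-end sketch using the
# concrete ZeroInflatedPoisson subclass on simulated data (all names and
# numbers below are illustrative):
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
n_obs = 1000
exog_zi = sm.add_constant(rng.standard_normal((n_obs, 1)))
mu = np.exp(exog_zi @ np.array([0.5, 0.3]))
structural_zero = rng.random(n_obs) < 0.3   # ~30% excess zeros
endog_zi = np.where(structural_zero, 0, rng.poisson(mu))

zip_res = sm.ZeroInflatedPoisson(endog_zi, exog_zi, exog_infl=exog_zi,
                                 inflation='logit').fit(disp=0)
print(zip_res.summary())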
Example #17
class GenericZeroInflated(CountModel):
    __doc__ = """
    Generic Zero Inflated model for count data

    %(params)s
    %(extra_params)s

    Attributes
    ----------
    endog : array
        A reference to the endogenous response variable
    exog : array
        A reference to the exogenous design.
    exog_infl : array
        A reference to the zero-inflated exogenous design.
    """ % {
        'params': base._model_params_doc,
        'extra_params': _doc_zi_params + base._missing_param_doc
    }

    def __init__(self,
                 endog,
                 exog,
                 exog_infl=None,
                 offset=None,
                 inflation='logit',
                 exposure=None,
                 missing='none',
                 **kwargs):
        super(GenericZeroInflated, self).__init__(endog,
                                                  exog,
                                                  offset=offset,
                                                  exposure=exposure,
                                                  missing=missing,
                                                  **kwargs)

        if exog_infl is None:
            self.k_inflate = 1
            self.exog_infl = np.ones((endog.size, self.k_inflate),
                                     dtype=np.float64)
        else:
            self.exog_infl = exog_infl
            self.k_inflate = exog_infl.shape[1]

        if len(exog.shape) == 1:
            self.k_exog = 1
        else:
            self.k_exog = exog.shape[1]

        self.infl = inflation
        if inflation == 'logit':
            self.model_infl = Logit(np.zeros(self.exog_infl.shape[0]),
                                    self.exog_infl)
            self._hessian_inflate = self._hessian_logit
        elif inflation == 'probit':
            self.model_infl = Probit(np.zeros(self.exog_infl.shape[0]),
                                     self.exog_infl)
            self._hessian_inflate = self._hessian_probit

        else:
            raise ValueError("inflation == %s, which is not handled" %
                             inflation)

        self.inflation = inflation
        self.k_extra = self.k_inflate

        if len(self.exog) != len(self.exog_infl):
            raise ValueError(
                'exog and exog_infl have different number of '
                'observations. `missing` handling is not supported')

        infl_names = [
            'inflate_%s' % i for i in self.model_infl.data.param_names
        ]
        self.exog_names[:] = infl_names + list(self.exog_names)
        self.exog_infl = np.asarray(self.exog_infl, dtype=np.float64)

        self._init_keys.extend(['exog_infl', 'inflation'])
        self._null_drop_keys = ['exog_infl']

    def loglike(self, params):
        """
        Loglikelihood of Generic Zero Inflated model

        Parameters
        ----------
        params : array_like
            The parameters of the model.

        Returns
        -------
        loglike : float
            The log-likelihood function of the model evaluated at `params`.
            See notes.

        Notes
        -----
        .. math:: \\ln L=\\sum_{y_{i}=0}\\ln(w_{i}+(1-w_{i})*P_{main\\_model})+
            \\sum_{y_{i}>0}(\\ln(1-w_{i})+L_{main\\_model})

        where :math:`P` is the pdf of the main model and :math:`L` is the
        loglikelihood of the main model.

        """
        return np.sum(self.loglikeobs(params))

    def loglikeobs(self, params):
        """
        Loglikelihood for observations of Generic Zero Inflated model

        Parameters
        ----------
        params : array_like
            The parameters of the model.

        Returns
        -------
        loglike : ndarray
            The log likelihood for each observation of the model evaluated
            at `params`. See Notes

        Notes
        -----
        .. math:: \\ln L=\\ln(w_{i}+(1-w_{i})*P_{main\\_model})+
            \\ln(1-w_{i})+L_{main\\_model}

        where :math:`P` is the pdf of the main model and :math:`L` is the
        loglikelihood of the main model, for observations :math:`i=1,...,n`.

        """
        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        y = self.endog
        w = self.model_infl.predict(params_infl)

        w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps)
        llf_main = self.model_main.loglikeobs(params_main)
        zero_idx = np.nonzero(y == 0)[0]
        nonzero_idx = np.nonzero(y)[0]

        llf = np.zeros_like(y, dtype=np.float64)
        llf[zero_idx] = (
            np.log(w[zero_idx] +
                   (1 - w[zero_idx]) * np.exp(llf_main[zero_idx])))
        llf[nonzero_idx] = np.log(1 - w[nonzero_idx]) + llf_main[nonzero_idx]

        return llf

    def fit(self,
            start_params=None,
            method='bfgs',
            maxiter=35,
            full_output=1,
            disp=1,
            callback=None,
            cov_type='nonrobust',
            cov_kwds=None,
            use_t=None,
            **kwargs):
        if start_params is None:
            offset = getattr(self, "offset", 0) + getattr(self, "exposure", 0)
            if np.size(offset) == 1 and offset == 0:
                offset = None
            start_params = self._get_start_params()

        if callback is None:
            # work around perfect separation callback #3895
            callback = lambda *x: x

        mlefit = super(GenericZeroInflated,
                       self).fit(start_params=start_params,
                                 maxiter=maxiter,
                                 disp=disp,
                                 method=method,
                                 full_output=full_output,
                                 callback=callback,
                                 **kwargs)

        zipfit = self.result_class(self, mlefit._results)
        result = self.result_class_wrapper(zipfit)

        if cov_kwds is None:
            cov_kwds = {}

        result._get_robustcov_results(cov_type=cov_type,
                                      use_self=True,
                                      use_t=use_t,
                                      **cov_kwds)
        return result

    fit.__doc__ = DiscreteModel.fit.__doc__

    def fit_regularized(self,
                        start_params=None,
                        method='l1',
                        maxiter='defined_by_method',
                        full_output=1,
                        disp=1,
                        callback=None,
                        alpha=0,
                        trim_mode='auto',
                        auto_trim_tol=0.01,
                        size_trim_tol=1e-4,
                        qc_tol=0.03,
                        **kwargs):

        _validate_l1_method(method)

        if np.size(alpha) == 1 and alpha != 0:
            k_params = self.k_exog + self.k_inflate
            alpha = alpha * np.ones(k_params)

        extra = self.k_extra - self.k_inflate
        alpha_p = alpha[:-(self.k_extra - extra)] if (
            self.k_extra and np.size(alpha) > 1) else alpha
        if start_params is None:
            offset = getattr(self, "offset", 0) + getattr(self, "exposure", 0)
            if np.size(offset) == 1 and offset == 0:
                offset = None
            start_params = self.model_main.fit_regularized(
                start_params=start_params,
                method=method,
                maxiter=maxiter,
                full_output=full_output,
                disp=0,
                callback=callback,
                alpha=alpha_p,
                trim_mode=trim_mode,
                auto_trim_tol=auto_trim_tol,
                size_trim_tol=size_trim_tol,
                qc_tol=qc_tol,
                **kwargs).params
            start_params = np.append(np.ones(self.k_inflate), start_params)
        cntfit = super(CountModel,
                       self).fit_regularized(start_params=start_params,
                                             method=method,
                                             maxiter=maxiter,
                                             full_output=full_output,
                                             disp=disp,
                                             callback=callback,
                                             alpha=alpha,
                                             trim_mode=trim_mode,
                                             auto_trim_tol=auto_trim_tol,
                                             size_trim_tol=size_trim_tol,
                                             qc_tol=qc_tol,
                                             **kwargs)

        discretefit = self.result_class_reg(self, cntfit)
        return self.result_class_reg_wrapper(discretefit)

    fit_regularized.__doc__ = DiscreteModel.fit_regularized.__doc__

    def score_obs(self, params):
        """
        Generic Zero Inflated model score (gradient) vector of the log-likelihood

        Parameters
        ----------
        params : array_like
            The parameters of the model

        Returns
        -------
        score : ndarray, 1-D
            The score vector of the model, i.e. the first derivative of the
            loglikelihood function, evaluated at `params`
        """
        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        y = self.endog
        w = self.model_infl.predict(params_infl)
        w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps)
        score_main = self.model_main.score_obs(params_main)
        llf_main = self.model_main.loglikeobs(params_main)
        llf = self.loglikeobs(params)
        zero_idx = np.nonzero(y == 0)[0]
        nonzero_idx = np.nonzero(y)[0]

        mu = self.model_main.predict(params_main)

        dldp = np.zeros((self.exog.shape[0], self.k_exog), dtype=np.float64)
        dldw = np.zeros_like(self.exog_infl, dtype=np.float64)

        dldp[zero_idx, :] = (score_main[zero_idx].T *
                             (1 - (w[zero_idx]) / np.exp(llf[zero_idx]))).T
        dldp[nonzero_idx, :] = score_main[nonzero_idx]

        if self.inflation == 'logit':
            dldw[zero_idx, :] = (self.exog_infl[zero_idx].T * w[zero_idx] *
                                 (1 - w[zero_idx]) *
                                 (1 - np.exp(llf_main[zero_idx])) /
                                 np.exp(llf[zero_idx])).T
            dldw[nonzero_idx, :] = -(self.exog_infl[nonzero_idx].T *
                                     w[nonzero_idx]).T
        elif self.inflation == 'probit':
            return approx_fprime(params, self.loglikeobs)

        return np.hstack((dldw, dldp))

    def score(self, params):
        return self.score_obs(params).sum(0)

    def _hessian_main(self, params):
        pass

    def _hessian_logit(self, params):
        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        y = self.endog
        w = self.model_infl.predict(params_infl)
        w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps)
        score_main = self.model_main.score_obs(params_main)
        llf_main = self.model_main.loglikeobs(params_main)
        llf = self.loglikeobs(params)
        zero_idx = np.nonzero(y == 0)[0]
        nonzero_idx = np.nonzero(y)[0]

        hess_arr = np.zeros((self.k_inflate, self.k_exog + self.k_inflate))

        pmf = np.exp(llf)

        #d2l/dw2
        for i in range(self.k_inflate):
            for j in range(i, -1, -1):
                hess_arr[i, j] = ((
                    self.exog_infl[zero_idx, i] * self.exog_infl[zero_idx, j] *
                    (w[zero_idx] * (1 - w[zero_idx]) *
                     ((1 - np.exp(llf_main[zero_idx])) *
                      (1 - 2 * w[zero_idx]) * np.exp(llf[zero_idx]) -
                      (w[zero_idx] - w[zero_idx]**2) *
                      (1 - np.exp(llf_main[zero_idx]))**2) / pmf[zero_idx]**2)
                ).sum() - (self.exog_infl[nonzero_idx, i] *
                           self.exog_infl[nonzero_idx, j] * w[nonzero_idx] *
                           (1 - w[nonzero_idx])).sum())

        #d2l/dpdw
        for i in range(self.k_inflate):
            for j in range(self.k_exog):
                hess_arr[i, j + self.k_inflate] = -(
                    score_main[zero_idx, j] * w[zero_idx] * (1 - w[zero_idx]) *
                    self.exog_infl[zero_idx, i] / pmf[zero_idx]).sum()

        return hess_arr

    def _hessian_probit(self, params):
        pass

    def hessian(self, params):
        """
        Generic Zero Inflated model Hessian matrix of the loglikelihood

        Parameters
        ----------
        params : array_like
            The parameters of the model

        Returns
        -------
        hess : ndarray, (k_vars, k_vars)
            The Hessian, second derivative of loglikelihood function,
            evaluated at `params`

        Notes
        -----
        """
        hess_arr_main = self._hessian_main(params)
        hess_arr_infl = self._hessian_inflate(params)

        if hess_arr_main is None or hess_arr_infl is None:
            return approx_hess(params, self.loglike)

        dim = self.k_exog + self.k_inflate

        hess_arr = np.zeros((dim, dim))

        hess_arr[:self.k_inflate, :] = hess_arr_infl
        hess_arr[self.k_inflate:, self.k_inflate:] = hess_arr_main

        tri_idx = np.triu_indices(self.k_exog + self.k_inflate, k=1)
        hess_arr[tri_idx] = hess_arr.T[tri_idx]

        return hess_arr

    def predict(self,
                params,
                exog=None,
                exog_infl=None,
                exposure=None,
                offset=None,
                which='mean'):
        """
        Predict response variable of a count model given exogenous variables.

        Parameters
        ----------
        params : array_like
            The parameters of the model
        exog : array, optional
            A reference to the exogenous design.
            If not supplied, the exog used in fitting is used.
        exog_infl : array, optional
            A reference to the zero-inflated exogenous design.
            If not supplied, the exog_infl used in fitting is used.
        offset : array, optional
            Offset is added to the linear prediction with coefficient equal to 1.
        exposure : array, optional
            Log(exposure) is added to the linear prediction with coefficient
            equal to 1. If exposure is specified, then it will be logged by the method.
            The user does not need to log it first.
        which : string, optional
            Define values that will be predicted.
            'mean', 'mean-main', 'linear', 'mean-nonzero', 'prob-zero', 'prob', 'prob-main'
            Default is 'mean'.

        Notes
        -----
        """
        if exog is None:
            exog = self.exog

        if exog_infl is None:
            exog_infl = self.exog_infl

        if exposure is None:
            exposure = getattr(self, 'exposure', 0)
        else:
            exposure = np.log(exposure)

        if offset is None:
            offset = 0

        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        prob_main = 1 - self.model_infl.predict(params_infl, exog_infl)

        lin_pred = np.dot(exog,
                          params_main[:self.exog.shape[1]]) + exposure + offset

        # Refactor: This is pretty hacky,
        # there should be an appropriate predict method in model_main
        # this is just prob(y=0 | model_main)
        tmp_exog = self.model_main.exog
        tmp_endog = self.model_main.endog
        tmp_offset = getattr(self.model_main, 'offset', ['no'])
        tmp_exposure = getattr(self.model_main, 'exposure', ['no'])
        self.model_main.exog = exog
        self.model_main.endog = np.zeros((exog.shape[0]))
        self.model_main.offset = offset
        self.model_main.exposure = exposure
        llf = self.model_main.loglikeobs(params_main)
        self.model_main.exog = tmp_exog
        self.model_main.endog = tmp_endog
        # tmp_offset might be an array with elementwise equality testing
        if len(tmp_offset) == 1 and tmp_offset[0] == 'no':
            del self.model_main.offset
        else:
            self.model_main.offset = tmp_offset
        if len(tmp_exposure) == 1 and tmp_exposure[0] == 'no':
            del self.model_main.exposure
        else:
            self.model_main.exposure = tmp_exposure
        # end hack

        prob_zero = (1 - prob_main) + prob_main * np.exp(llf)

        if which == 'mean':
            return prob_main * np.exp(lin_pred)
        elif which == 'mean-main':
            return np.exp(lin_pred)
        elif which == 'linear':
            return lin_pred
        elif which == 'mean-nonzero':
            return prob_main * np.exp(lin_pred) / (1 - prob_zero)
        elif which == 'prob-zero':
            return prob_zero
        elif which == 'prob-main':
            return prob_main
        elif which == 'prob':
            return self._predict_prob(params, exog, exog_infl, exposure,
                                      offset)
        else:
            raise ValueError('which = %s is not available' % which)
Example #18
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('F3.csv')
X = dataset.iloc[:, 0:3].values
y = dataset.iloc[:, 3].values

y = (y == 1).astype(float)  # binarize the target for the probit

from statsmodels.discrete.discrete_model import Probit

import statsmodels.api as sm

X = sm.add_constant(X)

model = Probit(y, X.astype(float))
probit_model = model.fit()

print(probit_model.summary())

print('-------------------- Predict ---------------------')
dataset = pd.read_csv('selectedSimEval.txt')

AX = dataset.iloc[:, 1:4].values

AX = sm.add_constant(AX)

Ay_pred = probit_model.predict(AX)

Ay_pred = Ay_pred.reshape(len(Ay_pred), 1)
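
# Follow-up sketch: turn the predicted probabilities into 0/1 labels; the 0.5
# threshold is an assumption, not part of the original script.
Ay_class = (Ay_pred > 0.5).astype(int)
print(Ay_class[:10])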