def __init__(self, endog, exog, exog_infl=None, offset=None,
             inflation='logit', exposure=None, missing='none', **kwargs):
    """
    Set up a zero-inflated count model.

    Parameters
    ----------
    endog, exog : array_like
        Response and design of the main count model; forwarded to the
        parent constructor together with `offset`, `exposure`, `missing`
        and `**kwargs`.
    exog_infl : array_like, optional
        Design of the inflation model. When omitted, a single column of
        ones (constant-only inflation) with ``endog.size`` rows is used.
    inflation : {'logit', 'probit'}
        Binary model used for the inflation part.

    Raises
    ------
    TypeError
        If `inflation` is neither 'logit' nor 'probit'.
    ValueError
        If `exog` and `exog_infl` have a different number of rows.
    """
    super(GenericZeroInflated, self).__init__(endog, exog, offset=offset,
                                              exposure=exposure,
                                              missing=missing, **kwargs)

    if exog_infl is None:
        self.k_inflate = 1
        self.exog_infl = np.ones((endog.size, self.k_inflate),
                                 dtype=np.float64)
    else:
        self.exog_infl = exog_infl
        self.k_inflate = exog_infl.shape[1]

    if len(exog.shape) == 1:
        self.k_exog = 1
    else:
        self.k_exog = exog.shape[1]

    self.infl = inflation
    # The inflation model is only used through its predict/derivative
    # methods, so it is built with an all-zero placeholder response.
    if inflation == 'logit':
        self.model_infl = Logit(np.zeros(self.exog_infl.shape[0]),
                                self.exog_infl)
        self._hessian_inflate = self._hessian_logit
    elif inflation == 'probit':
        self.model_infl = Probit(np.zeros(self.exog_infl.shape[0]),
                                 self.exog_infl)
        self._hessian_inflate = self._hessian_probit
    else:
        raise TypeError("inflation == %s, which is not handled"
                        % inflation)

    self.inflation = inflation
    self.k_extra = self.k_inflate

    if len(self.exog) != len(self.exog_infl):
        # The trailing space matters: the two literals are concatenated.
        raise ValueError('exog and exog_infl have different number of '
                         'observation. `missing` handling is not supported')

    # Inflation parameters come first in the parameter vector.
    infl_names = ['inflate_%s' % i
                  for i in self.model_infl.data.param_names]
    self.exog_names[:] = infl_names + list(self.exog_names)
    self.exog_infl = np.asarray(self.exog_infl, dtype=np.float64)

    self._init_keys.extend(['exog_infl', 'inflation'])
    self._null_drop_keys = ['exog_infl']
def probit_reg(x, y):
    """Univariate probit regression.

    Fits ``P(y = 1 | x) = Phi(b0 + b1 * x)`` and returns the fitted
    parameters ``(b0, b1)`` as produced by ``Probit(...).fit().params``.

    Parameters
    ----------
    x : array_like
        The single regressor; flattened to one column.
    y : array_like
        Binary response with one entry per element of `x`.
    """
    x = np.asarray(x).reshape(-1, 1)
    # The intercept column is sized from the data; it used to be fixed at
    # ten rows, which only worked for samples of exactly ten observations.
    design = np.column_stack((np.ones(len(x)), x))
    return Probit(y, design).fit().params
class GenericZeroInflated(CountModel):
    __doc__ = """
    Generic Zero Inflated Model

    %(params)s
    %(extra_params)s

    Attributes
    ----------
    endog : ndarray
        A reference to the endogenous response variable
    exog : ndarray
        A reference to the exogenous design.
    exog_infl : ndarray
        A reference to the zero-inflated exogenous design.
    """ % {'params' : base._model_params_doc,
           'extra_params' : _doc_zi_params + base._missing_param_doc}

    def __init__(self, endog, exog, exog_infl=None, offset=None,
                 inflation='logit', exposure=None, missing='none', **kwargs):
        super(GenericZeroInflated, self).__init__(endog, exog, offset=offset,
                                                  exposure=exposure,
                                                  missing=missing, **kwargs)

        if exog_infl is None:
            # Constant-only inflation model.
            self.k_inflate = 1
            self._no_exog_infl = True
            self.exog_infl = np.ones((endog.size, self.k_inflate),
                                     dtype=np.float64)
        else:
            self.exog_infl = exog_infl
            self.k_inflate = exog_infl.shape[1]
            self._no_exog_infl = False

        if len(exog.shape) == 1:
            self.k_exog = 1
        else:
            self.k_exog = exog.shape[1]

        self.infl = inflation
        # The inflation model is only used through its predict/derivative
        # methods, so it is built with an all-zero placeholder response.
        if inflation == 'logit':
            self.model_infl = Logit(np.zeros(self.exog_infl.shape[0]),
                                    self.exog_infl)
            self._hessian_inflate = self._hessian_logit
        elif inflation == 'probit':
            self.model_infl = Probit(np.zeros(self.exog_infl.shape[0]),
                                     self.exog_infl)
            self._hessian_inflate = self._hessian_probit
        else:
            raise ValueError("inflation == %s, which is not handled"
                             % inflation)

        self.inflation = inflation
        self.k_extra = self.k_inflate

        if len(self.exog) != len(self.exog_infl):
            # The trailing space matters: the two literals are concatenated.
            raise ValueError('exog and exog_infl have different number of '
                             'observation. `missing` handling is not '
                             'supported')

        # Inflation parameters come first in the parameter vector.
        infl_names = ['inflate_%s' % i
                      for i in self.model_infl.data.param_names]
        self.exog_names[:] = infl_names + list(self.exog_names)
        self.exog_infl = np.asarray(self.exog_infl, dtype=np.float64)

        self._init_keys.extend(['exog_infl', 'inflation'])
        self._null_drop_keys = ['exog_infl']

    def _get_exogs(self):
        """list of exogs, for internal use in post-estimation
        """
        return (self.exog, self.exog_infl)

    def loglike(self, params):
        """
        Loglikelihood of Generic Zero Inflated model.

        Parameters
        ----------
        params : array_like
            The parameters of the model.

        Returns
        -------
        loglike : float
            The log-likelihood function of the model evaluated at `params`.
            See notes.

        Notes
        -----
        .. math:: \\ln L=\\sum_{y_{i}=0}\\ln(w_{i}+(1-w_{i})*P_{main\\_model})+
            \\sum_{y_{i}>0}(\\ln(1-w_{i})+L_{main\\_model})
            where P - pdf of main model, L - loglike function of main model.
        """
        return np.sum(self.loglikeobs(params))

    def loglikeobs(self, params):
        """
        Loglikelihood for observations of Generic Zero Inflated model.

        Parameters
        ----------
        params : array_like
            The parameters of the model.

        Returns
        -------
        loglike : ndarray
            The log likelihood for each observation of the model evaluated
            at `params`. See Notes for definition.

        Notes
        -----
        .. math:: \\ln L=\\ln(w_{i}+(1-w_{i})*P_{main\\_model})+
            \\ln(1-w_{i})+L_{main\\_model}
            where P - pdf of main model, L - loglike function of main model.

        for observations :math:`i=1,...,n`
        """
        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        y = self.endog
        w = self.model_infl.predict(params_infl)

        # Keep w strictly inside (0, 1) so the logs below stay finite.
        w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps)
        llf_main = self.model_main.loglikeobs(params_main)
        zero_idx = np.nonzero(y == 0)[0]
        nonzero_idx = np.nonzero(y)[0]

        llf = np.zeros_like(y, dtype=np.float64)
        llf[zero_idx] = (np.log(w[zero_idx] +
                                (1 - w[zero_idx]) *
                                np.exp(llf_main[zero_idx])))
        llf[nonzero_idx] = np.log(1 - w[nonzero_idx]) + llf_main[nonzero_idx]

        return llf

    @Appender(DiscreteModel.fit.__doc__)
    def fit(self, start_params=None, method='bfgs', maxiter=35,
            full_output=1, disp=1, callback=None,
            cov_type='nonrobust', cov_kwds=None, use_t=None, **kwargs):
        if start_params is None:
            start_params = self._get_start_params()

        if callback is None:
            # work around perfect separation callback #3895
            callback = lambda *x: x

        mlefit = super(GenericZeroInflated, self).fit(
            start_params=start_params, maxiter=maxiter, disp=disp,
            method=method, full_output=full_output, callback=callback,
            **kwargs)

        zipfit = self.result_class(self, mlefit._results)
        result = self.result_class_wrapper(zipfit)

        if cov_kwds is None:
            cov_kwds = {}

        result._get_robustcov_results(cov_type=cov_type,
                                      use_self=True, use_t=use_t, **cov_kwds)
        return result

    @Appender(DiscreteModel.fit_regularized.__doc__)
    def fit_regularized(self, start_params=None, method='l1',
                        maxiter='defined_by_method', full_output=1, disp=1,
                        callback=None, alpha=0, trim_mode='auto',
                        auto_trim_tol=0.01, size_trim_tol=1e-4, qc_tol=0.03,
                        **kwargs):
        _validate_l1_method(method)

        # Expand a scalar penalty to one weight per parameter.
        if np.size(alpha) == 1 and alpha != 0:
            k_params = self.k_exog + self.k_inflate
            alpha = alpha * np.ones(k_params)

        # Penalty passed to the main model when computing start values.
        extra = self.k_extra - self.k_inflate
        alpha_p = alpha[:-(self.k_extra - extra)] if (self.k_extra
            and np.size(alpha) > 1) else alpha
        if start_params is None:
            # Start from a regularized fit of the main model, with the
            # inflation parameters initialised at one.
            start_params = self.model_main.fit_regularized(
                start_params=start_params, method=method, maxiter=maxiter,
                full_output=full_output, disp=0, callback=callback,
                alpha=alpha_p, trim_mode=trim_mode,
                auto_trim_tol=auto_trim_tol, size_trim_tol=size_trim_tol,
                qc_tol=qc_tol, **kwargs).params
            start_params = np.append(np.ones(self.k_inflate), start_params)

        cntfit = super(CountModel, self).fit_regularized(
            start_params=start_params, method=method, maxiter=maxiter,
            full_output=full_output, disp=disp, callback=callback,
            alpha=alpha, trim_mode=trim_mode, auto_trim_tol=auto_trim_tol,
            size_trim_tol=size_trim_tol, qc_tol=qc_tol, **kwargs)

        discretefit = self.result_class_reg(self, cntfit)
        return self.result_class_reg_wrapper(discretefit)

    def score_obs(self, params):
        """
        Generic Zero Inflated model score (gradient) vector of the
        log-likelihood

        Parameters
        ----------
        params : array_like
            The parameters of the model

        Returns
        -------
        score : ndarray
            The per-observation score of the model, i.e. the first
            derivative of the loglikelihood function, evaluated at
            `params`. Computed analytically for logit inflation and by
            numerical differentiation for probit inflation.
        """
        if self.inflation == 'probit':
            # No analytic score for probit inflation.
            return approx_fprime(params, self.loglikeobs)

        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        y = self.endog
        w = self.model_infl.predict(params_infl)
        w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps)
        score_main = self.model_main.score_obs(params_main)
        llf_main = self.model_main.loglikeobs(params_main)
        llf = self.loglikeobs(params)
        zero_idx = np.nonzero(y == 0)[0]
        nonzero_idx = np.nonzero(y)[0]

        # TODO: need to allow for complex to use CS numerical derivatives
        dldp = np.zeros((self.exog.shape[0], self.k_exog), dtype=np.float64)
        dldw = np.zeros_like(self.exog_infl, dtype=np.float64)

        dldp[zero_idx,:] = (score_main[zero_idx].T *
                            (1 - (w[zero_idx]) / np.exp(llf[zero_idx]))).T
        dldp[nonzero_idx,:] = score_main[nonzero_idx]

        if self.inflation == 'logit':
            dldw[zero_idx,:] = (self.exog_infl[zero_idx].T * w[zero_idx] *
                                (1 - w[zero_idx]) *
                                (1 - np.exp(llf_main[zero_idx])) /
                                np.exp(llf[zero_idx])).T
            dldw[nonzero_idx,:] = -(self.exog_infl[nonzero_idx].T *
                                    w[nonzero_idx]).T

        return np.hstack((dldw, dldp))

    def score(self, params):
        """Score vector: `score_obs` summed over observations."""
        return self.score_obs(params).sum(0)

    def _hessian_main(self, params):
        """Hessian block of the main model.

        Returns None here, which makes `hessian` fall back to numerical
        differentiation.
        """
        pass

    def _hessian_logit(self, params):
        """Rows of the Hessian belonging to the logit inflation parameters.

        Returns an array of shape ``(k_inflate, k_inflate + k_exog)``. In
        the leading square block only the lower triangle (including the
        diagonal) is filled.
        """
        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        y = self.endog
        w = self.model_infl.predict(params_infl)
        w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps)
        score_main = self.model_main.score_obs(params_main)
        llf_main = self.model_main.loglikeobs(params_main)
        llf = self.loglikeobs(params)
        zero_idx = np.nonzero(y == 0)[0]
        nonzero_idx = np.nonzero(y)[0]

        hess_arr = np.zeros((self.k_inflate, self.k_exog + self.k_inflate))

        pmf = np.exp(llf)

        #d2l/dw2
        for i in range(self.k_inflate):
            for j in range(i, -1, -1):
                hess_arr[i, j] = ((
                    self.exog_infl[zero_idx, i] * self.exog_infl[zero_idx, j] *
                    (w[zero_idx] * (1 - w[zero_idx]) * ((1 -
                    np.exp(llf_main[zero_idx])) * (1 - 2 * w[zero_idx]) *
                    np.exp(llf[zero_idx]) - (w[zero_idx] - w[zero_idx]**2) *
                    (1 - np.exp(llf_main[zero_idx]))**2) /
                    pmf[zero_idx]**2)).sum() -
                    (self.exog_infl[nonzero_idx, i] *
                     self.exog_infl[nonzero_idx, j] *
                     w[nonzero_idx] * (1 - w[nonzero_idx])).sum())

        #d2l/dpdw
        for i in range(self.k_inflate):
            for j in range(self.k_exog):
                hess_arr[i, j + self.k_inflate] = -(score_main[zero_idx, j] *
                    w[zero_idx] * (1 - w[zero_idx]) *
                    self.exog_infl[zero_idx, i] / pmf[zero_idx]).sum()

        return hess_arr

    def _hessian_probit(self, params):
        """Hessian rows for probit inflation.

        Returns None here, which makes `hessian` fall back to numerical
        differentiation.
        """
        pass

    def hessian(self, params):
        """
        Generic Zero Inflated model Hessian matrix of the loglikelihood

        Parameters
        ----------
        params : array_like
            The parameters of the model

        Returns
        -------
        hess : ndarray, (k_vars, k_vars)
            The Hessian, second derivative of loglikelihood function,
            evaluated at `params`

        Notes
        -----
        If either the main-model block or the inflation block is not
        available analytically (the helper returns None), the whole Hessian
        is computed numerically from `loglike`.
        """
        hess_arr_main = self._hessian_main(params)
        hess_arr_infl = self._hessian_inflate(params)

        if hess_arr_main is None or hess_arr_infl is None:
            return approx_hess(params, self.loglike)

        dim = self.k_exog + self.k_inflate

        hess_arr = np.zeros((dim, dim))

        hess_arr[:self.k_inflate,:] = hess_arr_infl
        hess_arr[self.k_inflate:,self.k_inflate:] = hess_arr_main

        # Overwrite the strict upper triangle with the mirrored lower one.
        tri_idx = np.triu_indices(self.k_exog + self.k_inflate, k=1)
        hess_arr[tri_idx] = hess_arr.T[tri_idx]

        return hess_arr

    def predict(self, params, exog=None, exog_infl=None, exposure=None,
                offset=None, which='mean', y_values=None):
        """
        Predict response variable or other statistic given exogenous
        variables.

        Parameters
        ----------
        params : array_like
            The parameters of the model.
        exog : ndarray, optional
            Explanatory variables for the main count model.
            If ``exog`` is None, then the data from the model will be used.
        exog_infl : ndarray, optional
            Explanatory variables for the zero-inflation model.
            ``exog_infl`` has to be provided if ``exog`` was provided unless
            ``exog_infl`` in the model is only a constant.
        offset : ndarray, optional
            Offset is added to the linear predictor of the mean function
            with coefficient equal to 1.
            Default is zero if exog is not None, and the model offset if
            exog is None.
        exposure : ndarray, optional
            Log(exposure) is added to the linear predictor with coefficient
            equal to 1. If exposure is specified, then it will be logged by
            the method. The user does not need to log it first.
            Default is one if exog is not None, and it is the model
            exposure if exog is None.
        which : str (optional)
            Statistic to predict. Default is 'mean'.

            - 'mean' : the conditional expectation of endog E(y | x),
              i.e. exp of linear predictor.
            - 'linear' : the linear predictor of the mean function.
            - 'var' : returns the estimated variance of endog implied by
              the model.
            - 'mean-main' : mean of the main count model
            - 'prob-main' : probability of selecting the main model.
              The probability of zero inflation is ``1 - prob-main``.
            - 'mean-nonzero' : expected value conditional on having
              observation larger than zero, E(y | X, y>0)
            - 'prob-zero' : probability of observing a zero count.
              P(y=0 | x)
            - 'prob' : probabilities of each count from 0 to max(endog), or
              for y_values if those are provided. This is a multivariate
              return (2-dim when predicting for several observations).

        y_values : array_like
            Values of the random variable endog at which pmf is evaluated.
            Only used if ``which="prob"``

        Raises
        ------
        ValueError
            If `which` is not one of the options listed above.
        """
        no_exog = False
        if exog is None:
            no_exog = True
            exog = self.exog

        if exog_infl is None:
            if no_exog:
                exog_infl = self.exog_infl
            else:
                if self._no_exog_infl:
                    exog_infl = np.ones((len(exog), 1))
        else:
            exog_infl = np.asarray(exog_infl)
            if exog_infl.ndim == 1 and self.k_inflate == 1:
                exog_infl = exog_infl[:, None]

        if exposure is None:
            if no_exog:
                exposure = getattr(self, 'exposure', 0)
            else:
                exposure = 0
        else:
            exposure = np.log(exposure)

        if offset is None:
            if no_exog:
                offset = getattr(self, 'offset', 0)
            else:
                offset = 0

        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        prob_main = 1 - self.model_infl.predict(params_infl, exog_infl)

        lin_pred = (np.dot(exog, params_main[:self.exog.shape[1]])
                    + exposure + offset)

        # Refactor: This is pretty hacky,
        # there should be an appropriate predict method in model_main
        # this is just prob(y=0 | model_main)
        model_main = self.model_main
        zero_endog = np.zeros((exog.shape[0]))
        tmp_exog = model_main.exog
        tmp_endog = model_main.endog
        tmp_offset = getattr(model_main, 'offset', False)
        tmp_exposure = getattr(model_main, 'exposure', False)
        model_main.exog = exog
        model_main.endog = zero_endog
        model_main.offset = offset
        model_main.exposure = exposure
        try:
            llf = model_main.loglikeobs(params_main)
        finally:
            # Always put the main model back, even if loglikeobs raised.
            model_main.exog = tmp_exog
            model_main.endog = tmp_endog
            # tmp_offset might be an array with elementwise equality
            # testing, hence the identity check against the sentinel.
            if tmp_offset is False:
                del model_main.offset
            else:
                model_main.offset = tmp_offset
            if tmp_exposure is False:
                del model_main.exposure
            else:
                model_main.exposure = tmp_exposure
        # end hack

        prob_zero = (1 - prob_main) + prob_main * np.exp(llf)

        if which == 'mean':
            return prob_main * np.exp(lin_pred)
        elif which == 'mean-main':
            return np.exp(lin_pred)
        elif which == 'linear':
            return lin_pred
        elif which == 'mean-nonzero':
            return prob_main * np.exp(lin_pred) / (1 - prob_zero)
        elif which == 'prob-zero':
            return prob_zero
        elif which == 'prob-main':
            return prob_main
        elif which == 'var':
            mu = np.exp(lin_pred)
            return self._predict_var(params, mu, 1 - prob_main)
        elif which == 'prob':
            return self._predict_prob(params, exog, exog_infl, exposure,
                                      offset, y_values=y_values)
        else:
            raise ValueError('which = %s is not available' % which)

    def _derivative_predict(self, params, exog=None, transform='dydx'):
        """NotImplemented
        """
        raise NotImplementedError

    def _derivative_exog(self, params, exog=None, transform="dydx",
                         dummy_idx=None, count_idx=None):
        """NotImplemented
        """
        raise NotImplementedError

    def _deriv_mean_dparams(self, params):
        """
        Derivative of the expected endog with respect to the parameters.

        Parameters
        ----------
        params : ndarray
            parameter at which score is evaluated

        Returns
        -------
        The value of the derivative of the expected endog with respect
        to the parameter vector.
        """
        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        w = self.model_infl.predict(params_infl)
        w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps)
        mu = self.model_main.predict(params_main)

        score_infl = self.model_infl._deriv_mean_dparams(params_infl)
        score_main = self.model_main._deriv_mean_dparams(params_main)

        # mean = (1 - w) * mu, differentiated by the product rule.
        dmat_infl = - mu[:, None] * score_infl
        dmat_main = (1 - w[:, None]) * score_main

        dmat = np.column_stack((dmat_infl, dmat_main))
        return dmat

    def _deriv_score_obs_dendog(self, params):
        """derivative of score_obs w.r.t. endog

        Parameters
        ----------
        params : ndarray
            parameter at which score is evaluated

        Returns
        -------
        derivative : ndarray_2d
            The derivative of the score_obs with respect to endog.

        Raises
        ------
        NotImplementedError
            Always. A finite-difference implementation does not work
            because of the discontinuity at zero, see
            https://github.com/statsmodels/statsmodels/pull/7951#issuecomment-996355875
        """
        raise NotImplementedError
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy
'''Probit analysis plus plotting 3D graph of hit rate distribution with respect to delta and theta'''
from mpl_toolkits.mplot3d import Axes3D

# Prepare an empty 3-D axes for the scatter plot drawn at the end.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Load the data set and echo it.
data = pd.read_csv("HL.csv")
print(data)

# Probit regression of "hits" on the three regressors listed in `col`.
col = ["delta", "epsilon", "cross_term"]
hits = data["hits"].tolist()
dep_var = hits
X = data[col]
theta = data["theta"]
probit_model = Probit(dep_var, X)
result = probit_model.fit()
print(result.summary())

# Scatter of hits (z) against epsilon (x) and delta (y).
z = np.array(hits)
x = np.array(data["epsilon"].tolist())
y = np.array(data["delta"].tolist())
print(z)
ax.scatter(x, y, z, s=1, c=None, depthshade=True)
plt.show()
def multiple_estimate(g, zvec, ins, betas, model=3,
                      estimator_types=(1, 2, 3, 4, 5), runs=1000,
                      estoc_distr="Probit", estoc_params=(0, 1),
                      silent=True, tau_param=0.5):
    """Monte Carlo comparison of treatment-effect estimators on a graph.

    For every coefficient vector in `betas`, responses are simulated `runs`
    times and each estimator in `estimator_types` is applied.

    Parameters
    ----------
    g : graph-like
        Must provide ``number_of_nodes()``, ``neighbors(i)`` and
        ``degree(i)``; nodes are addressed as ``0 .. N-1``.
    zvec : sequence
        Treatment indicator per node (1 = treated).
    ins
        Extra inputs forwarded to ``simulate`` / ``real_ATE``. ``ins[1]``
        replaces `tau_param` when ``model == 3`` and `estoc_distr` is one
        of the "Tau-Exposure" variants.
    betas : sequence of sequences
        Coefficient vectors to evaluate.
    model : int
        Forwarded to ``simulate`` / ``real_ATE``.
    estimator_types : sequence of int
        1 = difference in means (SUTVA), 2 = linear regression,
        3 = probit, 4 = logit, 5 = tau-exposure. Other codes leave the
        corresponding prediction entries unset.
    runs : int
        Number of simulated data sets per coefficient vector.
    estoc_distr : str
        ``"Logit"`` draws responses from a logistic model on the features
        (intercept, own treatment, treated-neighbour fraction); any other
        value draws normal noise and delegates to ``simulate``.
    estoc_params : sequence of two numbers
        Mean and standard deviation of the normal noise.
    silent : bool
        When False, progress is printed after every estimate.
    tau_param : float
        Exposure threshold of the tau-exposure estimator.

    Returns
    -------
    list
        ``[predicoes, ATE]``, two arrays of shape
        ``(len(estimator_types), len(betas), runs)`` holding the estimates
        and the corresponding true effects.
    """
    N = g.number_of_nodes()
    if model == 3 and estoc_distr in ("Tau-Exposure", "Tau-Exposure-Binario"):
        tau_param = ins[1]

    # The feature matrix is needed by the regression estimators AND by the
    # "Logit" data-generating process, whatever estimators are requested.
    needs_features = (estoc_distr == "Logit"
                      or any(t in (2, 3, 4) for t in estimator_types))
    needs_tau_groups = 5 in estimator_types

    z_arr = np.array([zvec[i] for i in range(N)])
    treated = z_arr == 1
    z1_count = int(treated.sum())
    z0_count = N - z1_count

    # Fraction of treated neighbours per node, computed once and shared.
    if needs_features or needs_tau_groups:
        frac = np.empty(shape=(N))
        for i in range(N):
            soma = np.float64(0)
            for k in g.neighbors(i):
                soma += np.float64(zvec[k])
            frac[i] = soma / g.degree(i)

    # Feature matrix: intercept, own treatment, treated-neighbour fraction.
    if needs_features:
        features = np.column_stack((np.ones(N), z_arr, frac))

    # Tau-exposure groups: untreated nodes with few treated neighbours (c0)
    # versus treated nodes with many treated neighbours (c1).
    if needs_tau_groups:
        in_c0 = (z_arr == 0) & (frac <= (1 - tau_param))
        in_c1 = (z_arr == 1) & (frac >= tau_param)
        # Empty groups divide by one instead of zero.
        tam_c1 = max(int(in_c1.sum()), 1)
        tam_c0 = max(int(in_c0.sum()), 1)

    # Result arrays
    predicoes = np.empty(shape=(len(estimator_types), len(betas), runs))
    ATE = np.empty(shape=(len(estimator_types), len(betas), runs))

    for j, beta in enumerate(betas):
        cons1 = 1.0 / (1.0 + np.exp(-sum(beta)))
        cons2 = 1.0 / (1.0 + np.exp(-beta[0]))

        for k in range(runs):
            # Generate one data set and its true effect.
            if estoc_distr == "Logit":
                U = np.random.uniform(0.0, 1.0, N)
                prob = 1.0 / (1.0 + np.exp(-np.dot(features, beta)))
                yvec = (U < prob).astype(int)
                real = (np.sum(U < cons1) - np.sum(U < cons2)) / N
            else:
                U = np.random.normal(estoc_params[0], estoc_params[1], N)
                yvec = simulate(g, model, zvec, beta, ins, U)
                real = real_ATE(g, model, beta, ins, U)
            y_arr = np.asarray(yvec)

            for i, est_model in enumerate(estimator_types):
                # SUTVA: difference in means; an empty arm contributes 0.
                if est_model == 1:
                    predicoes[i][j][k] = (
                        y_arr[treated].sum() / max(z1_count, 1)
                        - y_arr[~treated].sum() / max(z0_count, 1))

                # Linear
                elif est_model == 2:
                    lr = linear_model.LinearRegression().fit(features,
                                                             yvec).coef_
                    predicoes[i][j][k] = (lr[1] + lr[2])

                # Probit
                elif est_model == 3:
                    vals = Probit(yvec, features).fit(disp=0).params
                    predicoes[i][j][k] = (norm.cdf(sum(vals)) -
                                          norm.cdf(vals[0]))

                # Logit
                elif est_model == 4:
                    vals = Logit(yvec, features).fit(disp=0).params
                    predicoes[i][j][k] = (
                        (np.exp(-vals[0]) - np.exp(-sum(vals))) /
                        ((1 + np.exp(-vals[0])) * (1 + np.exp(-sum(vals)))))

                # Tau exposure
                elif est_model == 5:
                    predicoes[i][j][k] = (y_arr[in_c1].sum() / tam_c1 -
                                          y_arr[in_c0].sum() / tam_c0)

                ATE[i][j][k] = real

                if not silent:
                    print("est: {}/{}| beta: {}/{}| rodada: {}/{}".format(
                        i + 1, len(estimator_types), j + 1, len(betas),
                        k + 1, runs))

    return [predicoes, ATE]
def estimate(g, zvec, yvec, est_model):
    """Estimate the treatment effect from one observed data set.

    `est_model` selects the estimator: 1 = difference in means (SUTVA),
    2 = linear regression, 3 = probit, 4 = logit. The regression estimators
    use the features (intercept, own treatment, fraction of treated
    neighbours in `g`). Any other code yields None.
    """
    N = len(zvec)

    # SUTVA: mean response of treated minus mean response of the rest.
    if est_model == 1:
        responses = {True: [], False: []}
        for i in range(N):
            responses[bool(zvec[i] == 1)].append(yvec[i])
        with_treatment = responses[True]
        without_treatment = responses[False]
        # An empty arm contributes 0 (sum 0 divided by 1).
        return (sum(with_treatment) / max(len(with_treatment), 1) -
                sum(without_treatment) / max(len(without_treatment), 1))

    # Fraction of treated neighbours for every node.
    tau = np.empty(shape=(N))
    for node in range(N):
        treated_neighbours = sum(
            (np.float64(zvec[k]) for k in g.neighbors(node)), 0.0)
        tau[node] = treated_neighbours / g.degree(node)

    # Feature matrix: intercept, own treatment, neighbour fraction.
    features = np.empty(shape=(N, 3))
    features[:, 0] = 1
    features[:, 1] = [zvec[j] for j in range(N)]
    features[:, 2] = tau

    if est_model == 2:
        # Linear
        coefs = linear_model.LinearRegression().fit(features, yvec).coef_
        return (coefs[1] + coefs[2])
    elif est_model == 3:
        # Probit
        vals = Probit(yvec, features).fit(disp=0).params
        return (norm.cdf(sum(vals)) - norm.cdf(vals[0]))
    elif est_model == 4:
        # Logit
        vals = Logit(yvec, features).fit(disp=0).params
        exp_base = np.exp(-vals[0])
        exp_full = np.exp(-sum(vals))
        return ((exp_base - exp_full) /
                ((1 + exp_base) * (1 + exp_full)))
    return None
def _fnBlockContribution(dfBlock, iNobs):
    """Return one propensity-score block's contribution to ATE and variance.

    `dfBlock` needs the columns 'vdY' (outcome) and 'vdD' (treatment) and
    must contain both treatment arms, otherwise the positional lookups
    below raise IndexError. The block is not modified.
    """
    dWeight = dfBlock.shape[0] / float(iNobs)
    gbY = dfBlock.groupby('vdD')['vdY']

    # Treated minus untreated mean outcome, weighted by the block share.
    vdMeanByArm = gbY.mean()
    dATE = (vdMeanByArm.iloc[1] - vdMeanByArm.iloc[0]) * dWeight

    # Squared deviation from the own-arm mean, divided by the squared arm
    # size, summed per arm and scaled by the squared block share.
    vdDiffSquared = (dfBlock['vdY'] - gbY.transform('mean'))**2
    vdDiffSquaredDivided = vdDiffSquared / gbY.transform('count')**2
    vdVByArm = vdDiffSquaredDivided.groupby(dfBlock['vdD']).sum()
    dVar = (vdVByArm.iloc[1] + vdVByArm.iloc[0]) * dWeight**2

    return dATE, dVar


def main():
    """Monte Carlo study of blocking on an estimated propensity score.

    Each iteration simulates data, fits a probit model for the treatment,
    splits the sample into equal-width propensity-score blocks (the lowest
    and highest block are left out) and combines the within-block
    differences in means into an ATE estimate with its variance and test
    statistic. A summary of the ATE estimates is printed at the end.
    """
    # Magic numbers
    dMux = 0
    dSigmax = 1
    dMuepsilon = 0
    dSigmaepsilon = 1
    dMueta = 0
    dSigmaeta = 1
    iNobs = 1000
    vdBeta = np.array([1, 2])
    vdZeta = np.array([3, 4])
    vdDezinho = np.array([0])
    iSeed = 6969
    iNgroups = 11
    iIter = 1000

    # Initialisation
    np.random.seed(iSeed)
    vdBeta = np.array(vdBeta).reshape(-1, 1)
    vdZeta = np.array(vdZeta).reshape(-1, 1)
    iLenbeta = len(vdBeta)

    # Edges of the propensity-score blocks (the same in every iteration).
    vdGroups = np.linspace(0, 1, iNgroups)

    # Storage for the ATE, variance, test statistics and R-squares. The last
    # slot is never filled and is dropped when reporting.
    dvATE = np.ones(iIter)
    dvVar = np.ones(iIter)
    dvTtest = np.ones(iIter)
    dvRsquared = np.ones(iIter)

    for i in range(iIter - 1):
        # Simulate one data set.
        mdX = fnGenX(iNobs, iLenbeta, dMux, dSigmax)
        vdEpsilon = fnGenError(iNobs, dMuepsilon, dSigmaepsilon)
        vdPstar = fnGenPstar(mdX, vdBeta, vdEpsilon)
        vdD = fnGenTreat(vdPstar)
        vdEta = fnGenError(iNobs, dMueta, dSigmaeta)
        vdY = fnGenY(vdD, vdDezinho, mdX, vdZeta, vdEta)

        # One frame with outcome, treatment and regressors; regressor names
        # follow the width of X instead of being hard-coded.
        vsXNames = ['vdX%d' % (iCol + 1) for iCol in range(mdX.shape[1])]
        dfData = pd.DataFrame(np.hstack([vdY, vdD, mdX]),
                              columns=['vdY', 'vdD'] + vsXNames)
        dfData["vdD"] = dfData["vdD"] == 1

        # Estimation
        dfX = dfData[vsXNames].copy()
        probit_model = Probit(dfData['vdD'], dfX).fit()
        dRsquare = probit_model.prsquared

        # Predicted probabilities; np.asarray because predict may return a
        # pandas Series, which has no reshape method.
        vdProbs = np.asarray(probit_model.predict(dfX.copy()))

        # Outcome, treatment and propensity score together.
        dfFinalData = pd.DataFrame(
            np.hstack([vdY, vdD, vdProbs.reshape(-1, 1)]),
            columns=['vdY', 'vdD', 'vdPS'])

        # Accumulate the contributions of the interior blocks only.
        dATE = 0.0
        dVar = 0.0
        for iGroup in range(1, iNgroups - 2):
            vbInBlock = ((dfFinalData['vdPS'] >= vdGroups[iGroup]) &
                         (dfFinalData['vdPS'] < vdGroups[iGroup + 1]))
            dATEBlock, dVarBlock = _fnBlockContribution(
                dfFinalData.loc[vbInBlock], iNobs)
            dATE += dATEBlock
            dVar += dVarBlock

        # Compute the test statistic
        dTTest = dATE / (math.sqrt(dVar / iNobs))

        # Store results
        dvATE[i] = dATE
        dvVar[i] = dVar
        dvTtest[i] = dTTest
        dvRsquared[i] = dRsquare

    # Report results
    print(pd.DataFrame(stats.describe(dvATE[:-1])))
class ProbitRegression(Learner):
    """
    The probit regression learning algorithm.

    Given data, this class constructs and stores a probability unit regression
    mdl that can be used to quantify the probability of testing data-points
    taking on certain class values.
    """

    def __init__(self, alpha: float, **params: any):
        """
        Initialises the Probit regression algorithm.

        :param alpha: regularization term alpha (passed to
            ``Probit.fit_regularized``; must be non-zero, see ``fit``).
        :param params: Forwarded to the parent ``Learner`` initialiser.
        """
        super().__init__(**params)
        self.name = 'Probit Regression'
        self.alpha = alpha
        # Probability threshold used by ``discrete_points``.
        self.gamma = 0.5
        # When True, a column of ones is prepended to the patterns.
        self.add_intercept = True
        # When True, ``predict`` returns 0/1 labels instead of probabilities.
        self.binary_points = True
        # Not used by the methods of this class.
        self.beta = list()
        self.data: Optional[RecordSet] = None
        self.model: Optional[Probit] = None  # will be set during fit

    def fit(self, rs: RecordSet) -> None:
        """
        fit a Probit regression mdl

        The last column of ``rs.entries`` is taken as the outcome, all other
        columns as the patterns.

        :param rs: The record set to fit with.
        :raises ValueError: if ``alpha`` is zero.
        """
        # set params (work on a private copy of the record set)
        self.data = cp.deepcopy(rs)
        patterns = self.data.entries[:, :-1]
        out = self.data.entries[:, -1:]

        if self.add_intercept:
            intercept = np.ones((patterns.shape[0], 1))
            patterns = np.hstack((intercept, patterns))

        # avoid error
        if self.alpha == 0:
            # ValueError is still caught by callers using ``except Exception``.
            raise ValueError("Alpha Probit too low to obtain reliable results")

        self.model = Probit(endog=out.ravel(), exog=patterns)
        # After this call ``self.model`` holds the fitted results object.
        self.model = self.model.fit_regularized(alpha=self.alpha,
                                                maxiter=10e5,
                                                disp=False)

    def predict(self, rs: RecordSet) -> np.ndarray:
        """
        Assigns a predicted class label to the given record sets.

        :param rs: The record set to assign predictions to.
        :return: A column vector of predictions corresponding to the record
            set's rows.
        """
        # set params (the last column is dropped, mirroring ``fit``)
        patterns = rs.entries[:, :-1]

        if self.add_intercept:
            intercept = np.ones((patterns.shape[0], 1))
            patterns = np.hstack((intercept, patterns))

        # predict
        predictions = self.model.predict(exog=patterns)

        if self.binary_points:
            predictions = self.discrete_points(predictions=predictions)

        # return 2d
        predictions = np.reshape(predictions, (-1, 1))
        return predictions

    def discrete_points(self, predictions):
        """
        Turns probabilities into discrete classes

        Values greater than or equal to ``self.gamma`` become 1, all others
        (including NaN) become 0. The array is modified in place.

        :param predictions: The predicted class probabilities
        :return: A vector with discrete classes (the same array object)
        """
        # Compute the mask once, before any element is overwritten.
        positive = predictions >= self.gamma
        predictions[positive] = 1
        predictions[~positive] = 0
        return predictions
def table2_reg(df_reg, disp_it):
    """Function to create the tables for the first probit models.

    Four nested probit specifications are fitted with `_oral` as the
    dependent variable. Every design matrix gets an intercept column named
    'int' appended, and standard errors are clustered on `_region`.

    Args:
        df_reg: dataFrame containing the categorial variables as dummies and
            the interaction terms
        disp_it: boolean value indicating whether information about
            iterations should be displayed

    Returns:
    -------
        A tuple ``(table, model1, model2, model3, model4)``. ``table``
        contains the regression output of the first 4 model specifications
        (marginal effects, standard errors and p-values at regressor
        positions 0 and 2, i.e. 'sales' and 'dsalesX1970', plus observations
        and log likelihood). ``model1`` .. ``model4`` are the corresponding
        ``Probit`` model objects.
    """
    base_cols = ['sales', 'd1970', 'dsalesX1970', '_Phys', 'd_PhysX1970',
                 'dreg2', 'dreg3', 'dreg4',
                 'dreg2X1970', 'dreg3X1970', 'dreg4X1970']
    legal_cols = ['any', 'anyX1970']
    demographic_cols = [
        'd_agecat20', 'd_agecat25', 'd_agecat30', 'd_agecat35',
        'd_agecat20X1970', 'd_agecat25X1970', 'd_agecat30X1970',
        'd_agecat35X1970', '_Catholic', '_CatholicX1970',
        'd_ed_cat9', 'd_ed_cat12', 'd_ed_cat13', 'd_ed_cat16',
        'd_ed_cat9X1970', 'd_ed_cat12X1970', 'd_ed_cat13X1970',
        'd_ed_cat16X1970',
        'd_hinccat1', 'd_hinccat2', 'd_hinccat3', 'd_hinccat4',
        'd_hinccat1X1970', 'd_hinccat2X1970', 'd_hinccat3X1970',
        'd_hinccat4X1970']
    ideal_cols = ['d_idealcat2', 'd_idealcat3', 'd_idealcat4', 'd_idealcat5',
                  'd_idealcat2X1970', 'd_idealcat3X1970', 'd_idealcat4X1970',
                  'd_idealcat5X1970']

    # Each specification extends the previous one; the column order matters
    # because the table below reads marginal effects by position.
    specifications = [
        base_cols,
        base_cols + legal_cols,
        base_cols + legal_cols + demographic_cols,
        base_cols + legal_cols + demographic_cols + ideal_cols,
    ]

    Y = df_reg['_oral']

    def fit_specification(columns):
        """Fit one clustered probit; return (model, marginal effects)."""
        # Explicit copy: the intercept is added to a private frame, never to
        # a selection that pandas may treat as tied to df_reg.
        X = df_reg[columns].copy()
        X['int'] = np.repeat(1, len(Y))
        model = Probit(Y, X)
        result = model.fit(cov_type='cluster',
                           cov_kwds={'groups': df_reg['_region']},
                           disp=disp_it)
        return model, result.get_margeff()

    def table_column(margeff, covariates, legal):
        """Build one table column from a marginal-effects object."""
        rows = []
        for position in (0, 2):
            rows.append(round(margeff.margeff[position], 3))
            rows.append('({})'.format(round(margeff.margeff_se[position], 3)))
            rows.append(round(margeff.pvalues[position], 3))
        rows.append(round(margeff.results.nobs, 3))
        rows.append(round(margeff.results.llf, 3))
        rows.extend([covariates, legal])
        return rows

    fitted = [fit_specification(columns) for columns in specifications]

    table = pd.DataFrame({'(1)': [], '(2)': [], '(3)': [], '(4)': []})
    table[' '] = ['Sales ban', '', 'p-value', 'Sales ban x 1(1970)', ' ',
                  'p-value', 'Observations', 'Log Likelihood',
                  'Additional Covariates', 'Legal Variables']
    table = table.set_index(' ')

    footers = [('R', 'PX'),
               ('R', 'PX, AD'),
               ('R,A,C,E,I', 'PX, AD'),
               ('R,A,C,E,I', 'PX, AD, K')]
    for number, ((_, margeff), (covariates, legal)) in enumerate(
            zip(fitted, footers), start=1):
        table['({})'.format(number)] = table_column(margeff, covariates, legal)

    model1, model2, model3, model4 = [model for model, _ in fitted]
    return table, model1, model2, model3, model4
def table3_reg(df_reg, disp_it):
    """Function to create the tables for the second probit models.

    The same four nested specifications are fitted twice: once with
    `_everuse_d` and once with `_barrier` as the dependent variable. Every
    design matrix gets an intercept column named 'int' appended, and standard
    errors are clustered on `_region`.

    Args:
        df_reg: dataFrame containing the categorial variables as dummies and
            the interaction terms
        disp_it: boolean value indicating whether information about
            iterations should be displayed

    Returns:
    -------
        A tuple ``(table, model1, model2, model3, model4, model1_help,
        model2_help, model3_help, model4_help)``. ``table`` contains the
        regression output of the 8 model specifications for the second table.
        ``model1`` .. ``model4`` are the ``Probit`` models for `_barrier`;
        ``model1_help`` .. ``model4_help`` are the ``Probit`` models for
        `_everuse_d`.
    """
    base_cols = ['sales', 'd1970', 'd1965', 'dsalesX1970', 'dsalesX1965',
                 '_Phys', 'd_PhysX1970', 'd_PhysX1965',
                 'dreg2', 'dreg3', 'dreg4',
                 'dreg2X1970', 'dreg3X1970', 'dreg4X1970',
                 'dreg2X1965', 'dreg3X1965', 'dreg4X1965']
    legal_cols = ['any', 'anyX1970', 'anyX1965']
    demographic_cols = [
        'd_agecat20', 'd_agecat25', 'd_agecat30', 'd_agecat35',
        'd_agecat20X1970', 'd_agecat25X1970', 'd_agecat30X1970',
        'd_agecat35X1970',
        'd_agecat20X1965', 'd_agecat25X1965', 'd_agecat30X1965',
        'd_agecat35X1965',
        '_Catholic', '_CatholicX1970', '_CatholicX1965',
        'd_ed_cat9', 'd_ed_cat12', 'd_ed_cat13', 'd_ed_cat16',
        'd_ed_cat9X1970', 'd_ed_cat12X1970', 'd_ed_cat13X1970',
        'd_ed_cat9X1965', 'd_ed_cat12X1965', 'd_ed_cat13X1965',
        'd_ed_cat16X1970', 'd_ed_cat16X1965',
        'd_hinccat1', 'd_hinccat2', 'd_hinccat3', 'd_hinccat4',
        'd_hinccat1X1970', 'd_hinccat2X1970', 'd_hinccat3X1970',
        'd_hinccat4X1970',
        'd_hinccat1X1965', 'd_hinccat2X1965', 'd_hinccat3X1965',
        'd_hinccat4X1965']
    ideal_cols = ['d_idealcat2', 'd_idealcat3', 'd_idealcat4', 'd_idealcat5',
                  'd_idealcat2X1970', 'd_idealcat3X1970', 'd_idealcat4X1970',
                  'd_idealcat5X1970',
                  'd_idealcat2X1965', 'd_idealcat3X1965', 'd_idealcat4X1965',
                  'd_idealcat5X1965']

    # Each specification extends the previous one; the column order matters
    # because the table below reads marginal effects by position.
    specifications = [
        base_cols,
        base_cols + legal_cols,
        base_cols + legal_cols + demographic_cols,
        base_cols + legal_cols + demographic_cols + ideal_cols,
    ]

    def fit_outcome(outcome):
        """Fit all four specifications for one dependent variable.

        Returns a list of (model, marginal effects) pairs, one per
        specification, in order.
        """
        Y = df_reg[outcome]
        fitted = []
        for columns in specifications:
            # Explicit copy: the intercept is added to a private frame, never
            # to a selection that pandas may treat as tied to df_reg.
            X = df_reg[columns].copy()
            X['int'] = np.repeat(1, len(Y))
            model = Probit(Y, X)
            result = model.fit(cov_type='cluster',
                               cov_kwds={'groups': df_reg['_region']},
                               disp=disp_it)
            fitted.append((model, result.get_margeff()))
        return fitted

    def panel_rows(margeff):
        """Rows of one table panel for one marginal-effects object."""
        rows = []
        # Positions 0, 4, 3 are 'sales', 'dsalesX1965', 'dsalesX1970' in the
        # regressor lists above (1965 is reported before 1970).
        for position in (0, 4, 3):
            rows.append(round(margeff.margeff[position], 3))
            rows.append('({})'.format(round(margeff.margeff_se[position], 3)))
            rows.append(round(margeff.pvalues[position], 3))
        rows.append(round(margeff.results.nobs, 3))
        rows.append(round(margeff.results.llf, 3))
        return rows

    # 1. _everuse_d as dependent variable
    pill_fits = fit_outcome('_everuse_d')
    # 2. _barrier as dependent variable
    barrier_fits = fit_outcome('_barrier')

    # 3. create table for output
    table = pd.DataFrame({'(1)': [], '(2)': [], '(3)': [], '(4)': []})
    table[' '] = ['Ever used Pill', 'Sales ban', '', 'p-value',
                  'Sales ban x 1(1965)', ' ', 'p-value',
                  'Sales ban x 1(1970)', ' ', 'p-value',
                  'Obersvations', 'Log Likelihood', ' ',
                  'Ever used barrier', 'Sales ban', '', 'p-value',
                  'Sales ban x 1(1965)', ' ', 'p-value',
                  'Sales ban x 1(1970)', ' ', 'p-value',
                  'Obersvations', 'Log Likelihood',
                  'Additional Covariates', 'Legal Variables']
    table = table.set_index(' ')

    footers = [('R', 'PX'),
               ('R', 'PX, AD'),
               ('R,A,C,E,I', 'PX, AD'),
               ('R,A,C,E,I', 'PX, AD, K')]
    for number, ((_, pill_me), (_, barrier_me), (covariates, legal)) in \
            enumerate(zip(pill_fits, barrier_fits, footers), start=1):
        table['({})'.format(number)] = (
            [' '] + panel_rows(pill_me)
            + [' ', ' '] + panel_rows(barrier_me)
            + [covariates, legal])

    # Pill models are returned as the *_help values. The fourth one used to
    # be a second reference to the third model (copy-paste slip).
    model1_help, model2_help, model3_help, model4_help = [
        model for model, _ in pill_fits]
    model1, model2, model3, model4 = [model for model, _ in barrier_fits]
    return (table, model1, model2, model3, model4,
            model1_help, model2_help, model3_help, model4_help)
def flip_bits(y, p):
    """Flip each entry of a 0/1 (or boolean) array with probability `p`.

    The array is modified in place and also returned. With ``p == 0`` nothing
    is flipped, because the uniform draws lie in [0, 1).

    Parameters
    ----------
    y : ndarray
        Labels to corrupt; every entry gets its own independent draw.
    p : float
        Probability of flipping an entry.

    Returns
    -------
    ndarray
        The same array object as `y`.
    """
    # `flip` is already the boolean selection. The previous code indexed with
    # `flip < p`, comparing booleans to p, which for 0 < p < 1 selected the
    # complement of the intended entries.
    flip = np.random.rand(*y.shape) < p
    y[flip] = 1 - y[flip]
    return y


# Synthetic, linearly separable data (p=0 leaves the labels unflipped).
n, d = 100, 2
data_x = np.random.randn(n, d)
w = np.random.randn(d, 1)
data_y = flip_bits((data_x @ w > 0), 0)

lam = 1e-2

# statsmodel.Probit
sm_probit_reg = Probit(exog=data_x, endog=data_y).fit(disp=0, method='bfgs')
sm_probit_prob = sm_probit_reg.predict(exog=data_x)

# Our Implementation:
probit_reg = ProbitReg()

# EM:
em_w, obj_trace_em = probit_reg.probreg_fit_em(data_x, data_y, lam)
em_ypred, em_prob = probit_reg.predict(data_x, em_w)

# gradient:
gradient_w, obj_trace_gradient = probit_reg.probit_reg_fit_gradient(
    data_x, data_y, lam)
gradient_ypred, gradient_prob = probit_reg.predict(data_x, gradient_w)

plt.figure()
from statsmodels.regression.linear_model import OLS
from statsmodels.discrete.discrete_model import Probit
from statsmodels.treatment.treatment_effects import (TreatmentEffect)

from .results import results_teffects as res_st

# Load the fixture data set that sits in the `results` directory next to this
# module.
cur_dir = os.path.abspath(os.path.dirname(__file__))
file_name = 'cataneo2.csv'
file_path = os.path.join(cur_dir, 'results', file_name)
dta_cat = pd.read_csv(file_path)

# Probit model for the binary variable `mbsmoke_`, fitted once at import time.
formula = 'mbsmoke_ ~ mmarried_ + mage + mage2 + fbaby_ + medu'
res_probit = Probit.from_formula(formula, dta_cat).fit()

# Pairs of (method name, reference results object from `results_teffects`).
# NOTE(review): presumably used to parametrize the tests — confirm.
methods = [
    ("ra", res_st.results_ra),
    ("ipw", res_st.results_ipw),
    ("aipw", res_st.results_aipw),
    ("aipw_wls", res_st.results_aipw_wls),
    ("ipw_ra", res_st.results_ipwra),
]


class TestTEffects():

    @classmethod
    def setup_class(cls):
        # Outcome model: OLS of `bweight` on the listed covariates.
        formula_outcome = 'bweight ~ prenatal1_ + mmarried_ + mage + fbaby_'
        # NOTE(review): `mod` is only bound locally here and never stored on
        # `cls` — confirm whether the remainder of the setup is missing.
        mod = OLS.from_formula(formula_outcome, dta_cat)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
# Two consecutive style calls; the second one is the last to be applied.
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Probit

# In[13]:

# Binary outcome and the eight predictor columns taken from `df`.
# NOTE(review): no constant column is added to `x`, so both models below are
# fitted without an intercept unless `df` already provides one — confirm that
# this is intended.
y = df["Outcome"]
x = df[["Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
        "Insulin", "BMI", "Age", "DiabetesPedigreeFunction"]]

# Logit fit on the same data, printed for comparison with the probit below.
logit_model = sm.Logit(y,x)
result=logit_model.fit()
print(result.summary())

# # Probit Regression

# In[14]:

# `probitmodel` is the model object; `probit_model` holds the fitted results.
probitmodel = Probit(y,x)
probit_model = probitmodel.fit()
print(probit_model.summary())
def __init__(self, endog, exog, exog_infl=None, offset=None,
             inflation='logit', exposure=None, missing='none', **kwargs):
    """Set up the count model together with its zero-inflation sub-model.

    Parameters
    ----------
    endog, exog, offset, exposure, missing, **kwargs
        Forwarded unchanged to the parent class initialiser.
    exog_infl : array-like with a 2-D ``shape``, optional
        Design matrix of the inflation model. When omitted, a single column
        of ones with ``endog.size`` rows is used.
    inflation : {'logit', 'probit'}
        Which binary model describes the inflation part.

    Raises
    ------
    ValueError
        If `inflation` is neither 'logit' nor 'probit', or if `exog` and
        `exog_infl` do not have the same number of rows.
    """
    super(GenericZeroInflated, self).__init__(endog, exog, offset=offset,
                                              exposure=exposure,
                                              missing=missing, **kwargs)

    if exog_infl is None:
        # Constant-only inflation model.
        self.k_inflate = 1
        self._no_exog_infl = True
        self.exog_infl = np.ones((endog.size, self.k_inflate),
                                 dtype=np.float64)
    else:
        self.exog_infl = exog_infl
        self.k_inflate = exog_infl.shape[1]
        self._no_exog_infl = False

    if len(exog.shape) == 1:
        self.k_exog = 1
    else:
        self.k_exog = exog.shape[1]

    self.infl = inflation
    # The inflation sub-model is built on an all-zero placeholder response;
    # only its design matrix comes from the caller.
    if inflation == 'logit':
        self.model_infl = Logit(np.zeros(self.exog_infl.shape[0]),
                                self.exog_infl)
        self._hessian_inflate = self._hessian_logit
    elif inflation == 'probit':
        self.model_infl = Probit(np.zeros(self.exog_infl.shape[0]),
                                 self.exog_infl)
        self._hessian_inflate = self._hessian_probit
    else:
        raise ValueError("inflation == %s, which is not handled"
                         % inflation)

    self.inflation = inflation
    self.k_extra = self.k_inflate

    if len(self.exog) != len(self.exog_infl):
        # The two literals are concatenated; the trailing space keeps the
        # words apart (the message used to read "number ofobservation").
        raise ValueError(
            'exog and exog_infl have different number of '
            'observations. `missing` handling is not supported')

    # Inflation parameters come first in the parameter vector, so their
    # names are placed in front of the main-model names (in place).
    infl_names = ['inflate_%s' % i
                  for i in self.model_infl.data.param_names]
    self.exog_names[:] = infl_names + list(self.exog_names)
    self.exog_infl = np.asarray(self.exog_infl, dtype=np.float64)

    self._init_keys.extend(['exog_infl', 'inflation'])
    self._null_drop_keys = ['exog_infl']
# Pause so that a debugger can be attached to this process before the fits
# run. `raw_input` only exists on Python 2; on Python 3 the same function is
# called `input`, so pick whichever is available.
try:
    _read_line = raw_input
except NameError:
    _read_line = input
debug = _read_line("please attach to pid:{},then press any key".format(
    os.getpid()))

# Unpenalized Poisson fit as the reference.
modp = Poisson(y, x)
resp = modp.fit()
print(resp.params)

# Penalized Poisson fit on the same data.
mod = PoissonPenalized(y, x)
res = mod.fit(method='bfgs', maxiter=1000)
print(res.params)

############### Penalized Probit

# Binary response from a noisy latent variable thresholded at 0.75.
y_star = linpred + 0.25 * np.random.randn(nobs)
y2 = (y_star > 0.75).astype(float)
# Bare expression: the values are only shown in an interactive session.
y_star.mean(), y2.mean()

# Probit on all columns of x.
res0 = Probit(y2, x).fit()
print(res0.summary())

# Probit restricted to the first `k_nonzero` columns of x.
res_oracle = Probit(y2, x[:, :k_nonzero]).fit()
print(res_oracle.params)
res_oracle.pred_table()
margeff = res_oracle.get_margeff()
print(margeff.summary())

# Penalized probit with `penal.tau` set to 0, compared against the plain fit.
modl = ProbitPenalized(y2, x)
modl.penal.tau = 0
resl = modl.fit(method='newton', disp=True)
print(resl.params)
print(resl.params - res0.params)

# Built-in regularized probit fit for comparison.
res_regl = Probit(y2, x).fit_regularized(alpha=10)
class GenericZeroInflated(CountModel): __doc__ = """ Generiz Zero Inflated model for count data %(params)s %(extra_params)s Attributes ----------- endog : array A reference to the endogenous response variable exog : array A reference to the exogenous design. exog_infl: array A reference to the zero-inflated exogenous design. """ % {'params' : base._model_params_doc, 'extra_params' : _doc_zi_params + base._missing_param_doc} def __init__(self, endog, exog, exog_infl=None, offset=None, inflation='logit', exposure=None, missing='none', **kwargs): super(GenericZeroInflated, self).__init__(endog, exog, offset=offset, exposure=exposure, missing=missing, **kwargs) if exog_infl is None: self.k_inflate = 1 self.exog_infl = np.ones((endog.size, self.k_inflate), dtype=np.float64) else: self.exog_infl = exog_infl self.k_inflate = exog_infl.shape[1] if len(exog.shape) == 1: self.k_exog = 1 else: self.k_exog = exog.shape[1] self.infl = inflation if inflation == 'logit': self.model_infl = Logit(np.zeros(self.exog_infl.shape[0]), self.exog_infl) self._hessian_inflate = self._hessian_logit elif inflation == 'probit': self.model_infl = Probit(np.zeros(self.exog_infl.shape[0]), self.exog_infl) self._hessian_inflate = self._hessian_probit else: raise TypeError("inflation == %s, which is not handled" % inflation) self.inflation = inflation self.k_extra = self.k_inflate if len(self.exog) != len(self.exog_infl): raise ValueError('exog and exog_infl have different number of' 'observation. `missing` handling is not supported') infl_names = ['inflate_%s' % i for i in self.model_infl.data.param_names] self.exog_names[:] = infl_names + list(self.exog_names) self.exog_infl = np.asarray(self.exog_infl, dtype=np.float64) self._init_keys.extend(['exog_infl', 'inflation']) self._null_drop_keys = ['exog_infl'] def loglike(self, params): """ Loglikelihood of Generic Zero Inflated model Parameters ---------- params : array-like The parameters of the model. 
Returns ------- loglike : float The log-likelihood function of the model evaluated at `params`. See notes. Notes -------- .. math:: \\ln L=\\sum_{y_{i}=0}\\ln(w_{i}+(1-w_{i})*P_{main\\_model})+ \\sum_{y_{i}>0}(\\ln(1-w_{i})+L_{main\\_model}) where P - pdf of main model, L - loglike function of main model. """ return np.sum(self.loglikeobs(params)) def loglikeobs(self, params): """ Loglikelihood for observations of Generic Zero Inflated model Parameters ---------- params : array-like The parameters of the model. Returns ------- loglike : ndarray The log likelihood for each observation of the model evaluated at `params`. See Notes Notes -------- .. math:: \\ln L=\\ln(w_{i}+(1-w_{i})*P_{main\\_model})+ \\ln(1-w_{i})+L_{main\\_model} where P - pdf of main model, L - loglike function of main model. for observations :math:`i=1,...,n` """ params_infl = params[:self.k_inflate] params_main = params[self.k_inflate:] y = self.endog w = self.model_infl.predict(params_infl) w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps) llf_main = self.model_main.loglikeobs(params_main) zero_idx = np.nonzero(y == 0)[0] nonzero_idx = np.nonzero(y)[0] llf = np.zeros_like(y, dtype=np.float64) llf[zero_idx] = (np.log(w[zero_idx] + (1 - w[zero_idx]) * np.exp(llf_main[zero_idx]))) llf[nonzero_idx] = np.log(1 - w[nonzero_idx]) + llf_main[nonzero_idx] return llf def fit(self, start_params=None, method='bfgs', maxiter=35, full_output=1, disp=1, callback=None, cov_type='nonrobust', cov_kwds=None, use_t=None, **kwargs): if start_params is None: offset = getattr(self, "offset", 0) + getattr(self, "exposure", 0) if np.size(offset) == 1 and offset == 0: offset = None start_params = self._get_start_params() if callback is None: # work around perfect separation callback #3895 callback = lambda *x: x mlefit = super(GenericZeroInflated, self).fit(start_params=start_params, maxiter=maxiter, disp=disp, method=method, full_output=full_output, callback=callback, **kwargs) zipfit = 
self.result_class(self, mlefit._results) result = self.result_class_wrapper(zipfit) if cov_kwds is None: cov_kwds = {} result._get_robustcov_results(cov_type=cov_type, use_self=True, use_t=use_t, **cov_kwds) return result fit.__doc__ = DiscreteModel.fit.__doc__ def fit_regularized(self, start_params=None, method='l1', maxiter='defined_by_method', full_output=1, disp=1, callback=None, alpha=0, trim_mode='auto', auto_trim_tol=0.01, size_trim_tol=1e-4, qc_tol=0.03, **kwargs): if np.size(alpha) == 1 and alpha != 0: k_params = self.k_exog + self.k_inflate alpha = alpha * np.ones(k_params) extra = self.k_extra - self.k_inflate alpha_p = alpha[:-(self.k_extra - extra)] if (self.k_extra and np.size(alpha) > 1) else alpha if start_params is None: offset = getattr(self, "offset", 0) + getattr(self, "exposure", 0) if np.size(offset) == 1 and offset == 0: offset = None start_params = self.model_main.fit_regularized( start_params=start_params, method=method, maxiter=maxiter, full_output=full_output, disp=0, callback=callback, alpha=alpha_p, trim_mode=trim_mode, auto_trim_tol=auto_trim_tol, size_trim_tol=size_trim_tol, qc_tol=qc_tol, **kwargs).params start_params = np.append(np.ones(self.k_inflate), start_params) cntfit = super(CountModel, self).fit_regularized( start_params=start_params, method=method, maxiter=maxiter, full_output=full_output, disp=disp, callback=callback, alpha=alpha, trim_mode=trim_mode, auto_trim_tol=auto_trim_tol, size_trim_tol=size_trim_tol, qc_tol=qc_tol, **kwargs) if method in ['l1', 'l1_cvxopt_cp']: discretefit = self.result_class_reg(self, cntfit) else: raise TypeError( "argument method == %s, which is not handled" % method) return self.result_class_reg_wrapper(discretefit) fit_regularized.__doc__ = DiscreteModel.fit_regularized.__doc__ def score_obs(self, params): """ Generic Zero Inflated model score (gradient) vector of the log-likelihood Parameters ---------- params : array-like The parameters of the model Returns ------- score : ndarray, 1-D The 
score vector of the model, i.e. the first derivative of the loglikelihood function, evaluated at `params` """ params_infl = params[:self.k_inflate] params_main = params[self.k_inflate:] y = self.endog w = self.model_infl.predict(params_infl) w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps) score_main = self.model_main.score_obs(params_main) llf_main = self.model_main.loglikeobs(params_main) llf = self.loglikeobs(params) zero_idx = np.nonzero(y == 0)[0] nonzero_idx = np.nonzero(y)[0] mu = self.model_main.predict(params_main) dldp = np.zeros((self.exog.shape[0], self.k_exog), dtype=np.float64) dldw = np.zeros_like(self.exog_infl, dtype=np.float64) dldp[zero_idx,:] = (score_main[zero_idx].T * (1 - (w[zero_idx]) / np.exp(llf[zero_idx]))).T dldp[nonzero_idx,:] = score_main[nonzero_idx] if self.inflation == 'logit': dldw[zero_idx,:] = (self.exog_infl[zero_idx].T * w[zero_idx] * (1 - w[zero_idx]) * (1 - np.exp(llf_main[zero_idx])) / np.exp(llf[zero_idx])).T dldw[nonzero_idx,:] = -(self.exog_infl[nonzero_idx].T * w[nonzero_idx]).T elif self.inflation == 'probit': return approx_fprime(params, self.loglikeobs) return np.hstack((dldw, dldp)) def score(self, params): return self.score_obs(params).sum(0) def _hessian_main(self, params): pass def _hessian_logit(self, params): params_infl = params[:self.k_inflate] params_main = params[self.k_inflate:] y = self.endog w = self.model_infl.predict(params_infl) w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps) score_main = self.model_main.score_obs(params_main) llf_main = self.model_main.loglikeobs(params_main) llf = self.loglikeobs(params) zero_idx = np.nonzero(y == 0)[0] nonzero_idx = np.nonzero(y)[0] hess_arr = np.zeros((self.k_inflate, self.k_exog + self.k_inflate)) pmf = np.exp(llf) #d2l/dw2 for i in range(self.k_inflate): for j in range(i, -1, -1): hess_arr[i, j] = (( self.exog_infl[zero_idx, i] * self.exog_infl[zero_idx, j] * (w[zero_idx] * (1 - w[zero_idx]) * ((1 - np.exp(llf_main[zero_idx])) * (1 - 2 * 
w[zero_idx]) * np.exp(llf[zero_idx]) - (w[zero_idx] - w[zero_idx]**2) * (1 - np.exp(llf_main[zero_idx]))**2) / pmf[zero_idx]**2)).sum() - (self.exog_infl[nonzero_idx, i] * self.exog_infl[nonzero_idx, j] * w[nonzero_idx] * (1 - w[nonzero_idx])).sum()) #d2l/dpdw for i in range(self.k_inflate): for j in range(self.k_exog): hess_arr[i, j + self.k_inflate] = -(score_main[zero_idx, j] * w[zero_idx] * (1 - w[zero_idx]) * self.exog_infl[zero_idx, i] / pmf[zero_idx]).sum() return hess_arr def _hessian_probit(self, params): pass def hessian(self, params): """ Generic Zero Inflated model Hessian matrix of the loglikelihood Parameters ---------- params : array-like The parameters of the model Returns ------- hess : ndarray, (k_vars, k_vars) The Hessian, second derivative of loglikelihood function, evaluated at `params` Notes ----- """ hess_arr_main = self._hessian_main(params) hess_arr_infl = self._hessian_inflate(params) if hess_arr_main is None or hess_arr_infl is None: return approx_hess(params, self.loglike) dim = self.k_exog + self.k_inflate hess_arr = np.zeros((dim, dim)) hess_arr[:self.k_inflate,:] = hess_arr_infl hess_arr[self.k_inflate:,self.k_inflate:] = hess_arr_main tri_idx = np.triu_indices(self.k_exog + self.k_inflate, k=1) hess_arr[tri_idx] = hess_arr.T[tri_idx] return hess_arr def predict(self, params, exog=None, exog_infl=None, exposure=None, offset=None, which='mean'): """ Predict response variable of a count model given exogenous variables. Parameters ---------- params : array-like The parameters of the model exog : array, optional A reference to the exogenous design. If not assigned, will be used exog from fitting. exog_infl : array, optional A reference to the zero-inflated exogenous design. If not assigned, will be used exog from fitting. offset : array, optional Offset is added to the linear prediction with coefficient equal to 1. exposure : array, optional Log(exposure) is added to the linear prediction with coefficient equal to 1. 
If exposure is specified, then it will be logged by the method. The user does not need to log it first. which : string, optional Define values that will be predicted. 'mean', 'mean-main', 'linear', 'mean-nonzero', 'prob-zero, 'prob', 'prob-main' Default is 'mean'. Notes ----- """ if exog is None: exog = self.exog if exog_infl is None: exog_infl = self.exog_infl if exposure is None: exposure = getattr(self, 'exposure', 0) else: exposure = np.log(exposure) if offset is None: offset = 0 params_infl = params[:self.k_inflate] params_main = params[self.k_inflate:] prob_main = 1 - self.model_infl.predict(params_infl, exog_infl) lin_pred = np.dot(exog, params_main[:self.exog.shape[1]]) + exposure + offset # Refactor: This is pretty hacky, # there should be an appropriate predict method in model_main # this is just prob(y=0 | model_main) tmp_exog = self.model_main.exog tmp_endog = self.model_main.endog tmp_offset = getattr(self.model_main, 'offset', ['no']) tmp_exposure = getattr(self.model_main, 'exposure', ['no']) self.model_main.exog = exog self.model_main.endog = np.zeros((exog.shape[0])) self.model_main.offset = offset self.model_main.exposure = exposure llf = self.model_main.loglikeobs(params_main) self.model_main.exog = tmp_exog self.model_main.endog = tmp_endog # tmp_offset might be an array with elementwise equality testing if len(tmp_offset) == 1 and tmp_offset[0] == 'no': del self.model_main.offset else: self.model_main.offset = tmp_offset if len(tmp_exposure) == 1 and tmp_exposure[0] == 'no': del self.model_main.exposure else: self.model_main.exposure = tmp_exposure # end hack prob_zero = (1 - prob_main) + prob_main * np.exp(llf) if which == 'mean': return prob_main * np.exp(lin_pred) elif which == 'mean-main': return np.exp(lin_pred) elif which == 'linear': return lin_pred elif which == 'mean-nonzero': return prob_main * np.exp(lin_pred) / (1 - prob_zero) elif which == 'prob-zero': return prob_zero elif which == 'prob-main': return prob_main elif which == 
'prob': return self._predict_prob(params, exog, exog_infl, exposure, offset) else: raise ValueError('which = %s is not available' % which)
class GenericZeroInflated(CountModel):
    __doc__ = """
    Generic Zero Inflated model for count data

    %(params)s
    %(extra_params)s

    Attributes
    ----------
    endog : array
        A reference to the endogenous response variable
    exog : array
        A reference to the exogenous design.
    exog_infl: array
        A reference to the zero-inflated exogenous design.
    """ % {'params': base._model_params_doc,
           'extra_params': _doc_zi_params + base._missing_param_doc}

    def __init__(self, endog, exog, exog_infl=None, offset=None,
                 inflation='logit', exposure=None, missing='none', **kwargs):
        super(GenericZeroInflated, self).__init__(endog, exog, offset=offset,
                                                  exposure=exposure,
                                                  missing=missing, **kwargs)

        if exog_infl is None:
            # No inflation regressors supplied: use a single column of ones.
            self.k_inflate = 1
            self.exog_infl = np.ones((endog.size, self.k_inflate),
                                     dtype=np.float64)
        else:
            self.exog_infl = exog_infl
            self.k_inflate = exog_infl.shape[1]

        if len(exog.shape) == 1:
            self.k_exog = 1
        else:
            self.k_exog = exog.shape[1]

        self.infl = inflation
        # The inflation model is only used through its ``predict``; it is
        # built on an all-zero dummy response.
        if inflation == 'logit':
            self.model_infl = Logit(np.zeros(self.exog_infl.shape[0]),
                                    self.exog_infl)
            self._hessian_inflate = self._hessian_logit
        elif inflation == 'probit':
            self.model_infl = Probit(np.zeros(self.exog_infl.shape[0]),
                                     self.exog_infl)
            self._hessian_inflate = self._hessian_probit
        else:
            raise ValueError("inflation == %s, which is not handled"
                             % inflation)

        self.inflation = inflation
        self.k_extra = self.k_inflate

        if len(self.exog) != len(self.exog_infl):
            raise ValueError('exog and exog_infl have different number of '
                             'observation. `missing` handling is not supported')

        # Parameter layout is [inflation params, main-model params].
        infl_names = ['inflate_%s' % i
                      for i in self.model_infl.data.param_names]
        self.exog_names[:] = infl_names + list(self.exog_names)
        self.exog_infl = np.asarray(self.exog_infl, dtype=np.float64)

        self._init_keys.extend(['exog_infl', 'inflation'])
        self._null_drop_keys = ['exog_infl']

    def loglike(self, params):
        """
        Loglikelihood of Generic Zero Inflated model

        Parameters
        ----------
        params : array_like
            The parameters of the model.

        Returns
        -------
        loglike : float
            The log-likelihood function of the model evaluated at `params`.
            See notes.

        Notes
        --------
        .. math:: \\ln L=\\sum_{y_{i}=0}\\ln(w_{i}+(1-w_{i})*P_{main\\_model})+
            \\sum_{y_{i}>0}(\\ln(1-w_{i})+L_{main\\_model})
            where P - pdf of main model, L - loglike function of main model.
        """
        return np.sum(self.loglikeobs(params))

    def loglikeobs(self, params):
        """
        Loglikelihood for observations of Generic Zero Inflated model

        Parameters
        ----------
        params : array_like
            The parameters of the model.

        Returns
        -------
        loglike : ndarray
            The log likelihood for each observation of the model evaluated
            at `params`. See Notes

        Notes
        --------
        For observations with :math:`y_{i}=0`

        .. math:: \\ln L_{i}=\\ln(w_{i}+(1-w_{i})*P_{main\\_model})

        and for observations with :math:`y_{i}>0`

        .. math:: \\ln L_{i}=\\ln(1-w_{i})+L_{main\\_model}

        where P - pdf of main model, L - loglike function of main model,
        for observations :math:`i=1,...,n`
        """
        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        y = self.endog
        w = self.model_infl.predict(params_infl)
        # Keep w strictly inside (0, 1) so the logs below stay finite.
        eps = np.finfo(float).eps
        w = np.clip(w, eps, 1 - eps)
        llf_main = self.model_main.loglikeobs(params_main)
        zero_idx = np.nonzero(y == 0)[0]
        nonzero_idx = np.nonzero(y)[0]

        llf = np.zeros_like(y, dtype=np.float64)
        llf[zero_idx] = np.log(w[zero_idx] +
                               (1 - w[zero_idx]) * np.exp(llf_main[zero_idx]))
        llf[nonzero_idx] = np.log(1 - w[nonzero_idx]) + llf_main[nonzero_idx]

        return llf

    def fit(self, start_params=None, method='bfgs', maxiter=35,
            full_output=1, disp=1, callback=None,
            cov_type='nonrobust', cov_kwds=None, use_t=None, **kwargs):
        if start_params is None:
            start_params = self._get_start_params()

        if callback is None:
            # work around perfect separation callback #3895
            callback = lambda *x: x

        mlefit = super(GenericZeroInflated, self).fit(
            start_params=start_params, maxiter=maxiter, disp=disp,
            method=method, full_output=full_output, callback=callback,
            **kwargs)

        zipfit = self.result_class(self, mlefit._results)
        result = self.result_class_wrapper(zipfit)

        if cov_kwds is None:
            cov_kwds = {}

        # The covariance type is applied on the wrapped result, not passed
        # to the parent ``fit``.
        result._get_robustcov_results(cov_type=cov_type,
                                      use_self=True, use_t=use_t, **cov_kwds)
        return result

    fit.__doc__ = DiscreteModel.fit.__doc__

    def fit_regularized(self, start_params=None, method='l1',
                        maxiter='defined_by_method', full_output=1, disp=1,
                        callback=None, alpha=0, trim_mode='auto',
                        auto_trim_tol=0.01, size_trim_tol=1e-4, qc_tol=0.03,
                        **kwargs):
        _validate_l1_method(method)

        if np.size(alpha) == 1 and alpha != 0:
            k_params = self.k_exog + self.k_inflate
            alpha = alpha * np.ones(k_params)

        # Penalty weights handed to the main model for its preliminary fit.
        # NOTE(review): this drops the trailing ``k_inflate`` entries although
        # the inflation parameters come first in ``params``; harmless for a
        # uniform alpha -- confirm the intended layout for a vector alpha.
        if self.k_extra and np.size(alpha) > 1:
            alpha_p = alpha[:-self.k_inflate]
        else:
            alpha_p = alpha

        if start_params is None:
            # Start from a regularized fit of the main model, with ones for
            # the inflation parameters.
            start_params = self.model_main.fit_regularized(
                start_params=start_params, method=method, maxiter=maxiter,
                full_output=full_output, disp=0, callback=callback,
                alpha=alpha_p, trim_mode=trim_mode,
                auto_trim_tol=auto_trim_tol, size_trim_tol=size_trim_tol,
                qc_tol=qc_tol, **kwargs).params
            start_params = np.append(np.ones(self.k_inflate), start_params)

        # Deliberately resolves past CountModel in the MRO.
        cntfit = super(CountModel, self).fit_regularized(
            start_params=start_params, method=method, maxiter=maxiter,
            full_output=full_output, disp=disp, callback=callback,
            alpha=alpha, trim_mode=trim_mode, auto_trim_tol=auto_trim_tol,
            size_trim_tol=size_trim_tol, qc_tol=qc_tol, **kwargs)

        discretefit = self.result_class_reg(self, cntfit)
        return self.result_class_reg_wrapper(discretefit)

    fit_regularized.__doc__ = DiscreteModel.fit_regularized.__doc__

    def score_obs(self, params):
        """
        Generic Zero Inflated model score (gradient) vector of the
        log-likelihood for each observation

        Parameters
        ----------
        params : array_like
            The parameters of the model

        Returns
        -------
        score : ndarray, 2-D
            The first derivative of the per-observation loglikelihood,
            evaluated at `params`. Columns are ordered as the inflation
            parameters followed by the main-model parameters. With probit
            inflation the derivative is computed numerically.
        """
        if self.inflation == 'probit':
            # No analytic score for probit inflation; skip the analytic work.
            return approx_fprime(params, self.loglikeobs)

        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        y = self.endog
        w = self.model_infl.predict(params_infl)
        eps = np.finfo(float).eps
        w = np.clip(w, eps, 1 - eps)
        score_main = self.model_main.score_obs(params_main)
        llf_main = self.model_main.loglikeobs(params_main)
        llf = self.loglikeobs(params)
        zero_idx = np.nonzero(y == 0)[0]
        nonzero_idx = np.nonzero(y)[0]

        w_zero = w[zero_idx]
        w_nonzero = w[nonzero_idx]
        pmf_zero = np.exp(llf[zero_idx])

        dldp = np.zeros((self.exog.shape[0], self.k_exog), dtype=np.float64)
        dldw = np.zeros_like(self.exog_infl, dtype=np.float64)

        dldp[zero_idx, :] = (score_main[zero_idx].T *
                             (1 - w_zero / pmf_zero)).T
        dldp[nonzero_idx, :] = score_main[nonzero_idx]

        if self.inflation == 'logit':
            dldw[zero_idx, :] = (self.exog_infl[zero_idx].T * w_zero *
                                 (1 - w_zero) *
                                 (1 - np.exp(llf_main[zero_idx])) /
                                 pmf_zero).T
            dldw[nonzero_idx, :] = -(self.exog_infl[nonzero_idx].T *
                                     w_nonzero).T

        return np.hstack((dldw, dldp))

    def score(self, params):
        """Score vector: per-observation scores summed over observations."""
        return self.score_obs(params).sum(0)

    def _hessian_main(self, params):
        """Main-model Hessian block; None means no analytic form here."""
        return None

    def _hessian_logit(self, params):
        """
        Analytic Hessian rows for the inflation parameters (logit link).

        Returns an array of shape (k_inflate, k_inflate + k_exog): the
        leading square block is d2l/dw2 (full symmetric), the remaining
        columns are the cross derivatives d2l/dpdw with the main-model
        parameters.
        """
        k_infl = self.k_inflate
        params_infl = params[:k_infl]
        params_main = params[k_infl:]

        y = self.endog
        w = self.model_infl.predict(params_infl)
        eps = np.finfo(float).eps
        w = np.clip(w, eps, 1 - eps)
        score_main = self.model_main.score_obs(params_main)
        llf_main = self.model_main.loglikeobs(params_main)
        llf = self.loglikeobs(params)
        zero_idx = np.nonzero(y == 0)[0]
        nonzero_idx = np.nonzero(y)[0]

        z_zero = self.exog_infl[zero_idx]
        z_nonzero = self.exog_infl[nonzero_idx]
        w_zero = w[zero_idx]
        w_nonzero = w[nonzero_idx]
        p_main_zero = np.exp(llf_main[zero_idx])   # P(y=0 | main model)
        pmf_zero = np.exp(llf[zero_idx])           # zero-inflated P(y=0)
        dw_zero = w_zero * (1 - w_zero)            # dw / d(linear predictor)

        hess_arr = np.zeros((k_infl, self.k_exog + k_infl))

        # d2l/dw2
        weight_zero = (dw_zero *
                       ((1 - p_main_zero) * (1 - 2 * w_zero) * pmf_zero -
                        dw_zero * (1 - p_main_zero)**2) / pmf_zero**2)
        weight_nonzero = w_nonzero * (1 - w_nonzero)
        hess_arr[:, :k_infl] = (np.dot(z_zero.T * weight_zero, z_zero) -
                                np.dot(z_nonzero.T * weight_nonzero,
                                       z_nonzero))

        # d2l/dpdw
        # For y == 0 the score in p is score_main * (1 - w / pmf), and
        # d(w / pmf)/dw = P_main(0) / pmf**2, so the cross derivative carries
        # a factor P_main(0) / pmf**2 (not 1 / pmf). Observations with
        # y > 0 contribute nothing.
        weight_cross = dw_zero * p_main_zero / pmf_zero**2
        hess_arr[:, k_infl:] = -np.dot(z_zero.T * weight_cross,
                                       score_main[zero_idx, :self.k_exog])

        return hess_arr

    def _hessian_probit(self, params):
        """Probit inflation Hessian; None means no analytic form here."""
        return None

    def hessian(self, params):
        """
        Generic Zero Inflated model Hessian matrix of the loglikelihood

        Parameters
        ----------
        params : array_like
            The parameters of the model

        Returns
        -------
        hess : ndarray, (k_vars, k_vars)
            The Hessian, second derivative of loglikelihood function,
            evaluated at `params`

        Notes
        -----
        Falls back to a numerical Hessian when either the main-model block
        or the inflation block has no analytic implementation.
        """
        hess_arr_main = self._hessian_main(params)
        hess_arr_infl = self._hessian_inflate(params)

        if hess_arr_main is None or hess_arr_infl is None:
            return approx_hess(params, self.loglike)

        k_infl = self.k_inflate
        dim = self.k_exog + k_infl

        hess_arr = np.zeros((dim, dim))
        hess_arr[:k_infl, :] = hess_arr_infl
        hess_arr[k_infl:, k_infl:] = hess_arr_main

        # The diagonal blocks may be filled in their lower triangle only:
        # mirror lower -> upper.
        tri_idx = np.triu_indices(dim, k=1)
        hess_arr[tri_idx] = hess_arr.T[tri_idx]

        # The mirroring above also copies the (empty) lower-left block over
        # the cross derivatives, so write the cross block back on both sides.
        cross = np.asarray(hess_arr_infl)[:, k_infl:]
        hess_arr[:k_infl, k_infl:] = cross
        hess_arr[k_infl:, :k_infl] = cross.T

        return hess_arr

    def predict(self, params, exog=None, exog_infl=None, exposure=None,
                offset=None, which='mean'):
        """
        Predict response variable of a count model given exogenous variables.

        Parameters
        ----------
        params : array_like
            The parameters of the model
        exog : array, optional
            A reference to the exogenous design.
            If not assigned, will be used exog from fitting.
        exog_infl : array, optional
            A reference to the zero-inflated exogenous design.
            If not assigned, will be used exog from fitting.
        offset : array, optional
            Offset is added to the linear prediction with coefficient equal
            to 1.
        exposure : array, optional
            Log(exposure) is added to the linear prediction with coefficient
            equal to 1. If exposure is specified, then it will be logged by
            the method. The user does not need to log it first.
        which : string, optional
            Define values that will be predicted.
            'mean', 'mean-main', 'linear', 'mean-nonzero', 'prob-zero',
            'prob', 'prob-main'
            Default is 'mean'.

        Raises
        ------
        ValueError
            If `which` is not one of the options listed above.

        Notes
        -----
        """
        if exog is None:
            exog = self.exog

        if exog_infl is None:
            exog_infl = self.exog_infl

        if exposure is None:
            exposure = getattr(self, 'exposure', 0)
        else:
            exposure = np.log(exposure)

        if offset is None:
            offset = 0

        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        prob_main = 1 - self.model_infl.predict(params_infl, exog_infl)

        lin_pred = (np.dot(exog, params_main[:self.exog.shape[1]]) +
                    exposure + offset)

        # Refactor: This is pretty hacky,
        # there should be an appropriate predict method in model_main
        # this is just prob(y=0 | model_main)
        # model_main is temporarily pointed at the prediction data with an
        # all-zero response. A sentinel marks attributes that did not exist,
        # and try/finally restores the state even if loglikeobs raises.
        main = self.model_main
        absent = object()
        saved = {'exog': main.exog,
                 'endog': main.endog,
                 'offset': getattr(main, 'offset', absent),
                 'exposure': getattr(main, 'exposure', absent)}
        try:
            main.exog = exog
            main.endog = np.zeros((exog.shape[0]))
            main.offset = offset
            main.exposure = exposure
            llf = main.loglikeobs(params_main)
        finally:
            for attr_name, old_value in saved.items():
                if old_value is not absent:
                    setattr(main, attr_name, old_value)
                elif hasattr(main, attr_name):
                    delattr(main, attr_name)
        # end hack

        prob_zero = (1 - prob_main) + prob_main * np.exp(llf)

        if which == 'mean':
            return prob_main * np.exp(lin_pred)
        elif which == 'mean-main':
            return np.exp(lin_pred)
        elif which == 'linear':
            return lin_pred
        elif which == 'mean-nonzero':
            return prob_main * np.exp(lin_pred) / (1 - prob_zero)
        elif which == 'prob-zero':
            return prob_zero
        elif which == 'prob-main':
            return prob_main
        elif which == 'prob':
            return self._predict_prob(params, exog, exog_infl, exposure,
                                      offset)
        else:
            raise ValueError('which = %s is not available' % which)
import pandas as pd
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Probit

# Importing the dataset: the first three columns are the regressors and the
# fourth column is the label; the response is True where the label equals 1.
dataset = pd.read_csv('F3.csv')
raw_X = dataset.iloc[:, 0:3].values
y = dataset.iloc[:, 3].values
y = (y == 1)

# add_constant leaves the data unchanged when it already finds a constant
# column, so record whether an intercept column was really prepended.
X = sm.add_constant(raw_X)
intercept_added = X.shape[1] == raw_X.shape[1] + 1

model = Probit(y, X.astype(float))
probit_model = model.fit()
print(probit_model.summary())

print('-------------------- Predict ---------------------')
dataset = pd.read_csv('selectedSimEval.txt')
AX = dataset.iloc[:, 1:4].values
if intercept_added:
    # Force the intercept column: with the default has_constant='skip' an
    # evaluation file whose feature column happens to be constant (e.g. a
    # single-row file) would get no intercept and predict() would fail on a
    # column-count mismatch with the fitted parameters.
    AX = sm.add_constant(AX, has_constant='add')
Ay_pred = probit_model.predict(AX)
# Column vector, one prediction per evaluation row.
Ay_pred = Ay_pred.reshape(len(Ay_pred), 1)