import numpy
import statsmodels.formula.api as smf


def backward_selected(data, response, sls=0.01):
    """Logistic model designed by backward selection.

    Parameters:
    -----------
    data: pandas DataFrame with all possible predictors and response
    response: string, name of response column in data
    sls: significance level of a variable to stay in the model

    Returns:
    --------
    model: an "optimal" fitted statsmodels logistic model with an
    intercept selected by backward selection
    """
    remaining = set(data.columns)
    remaining.remove(response)
    while remaining:
        formula = "{} ~ {} + 1".format(response, ' + '.join(remaining))
        # Drop the intercept from the p-values so it can never be
        # flagged as the worst candidate and removed by mistake.
        scores = smf.logit(formula, data).fit(disp=0).pvalues.drop("Intercept")
        worst_new_score = scores.max()
        worst_candidate = scores.idxmax()
        if worst_new_score > sls:
            remaining.remove(worst_candidate)
        else:
            break
    formula = "{} ~ {} + 1".format(response, ' + '.join(remaining))
    model = smf.logit(formula, data).fit()
    return model
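
# A minimal usage sketch for backward_selected(), not part of the original
# routine. The data and column names ("x1", "x2", "noise", "y") are
# synthetic and purely illustrative; any DataFrame with numeric predictors
# and a 0/1 response column works the same way.
def _demo_backward_selected():
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    n = 500
    df = pd.DataFrame({
        "x1": rng.normal(size=n),
        "x2": rng.normal(size=n),
        "noise": rng.normal(size=n),   # irrelevant predictor, should be dropped
    })
    xb = 1.5 * df["x1"] - 1.0 * df["x2"]
    df["y"] = (rng.random(n) < 1.0 / (1.0 + np.exp(-xb))).astype(int)

    model = backward_selected(df, "y", sls=0.01)
    print(model.model.formula)   # "noise" should be eliminated
    print(model.summary())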
def stepwiseModel(data, response, sle=0.05, sls=0.01):
    """Logistic model designed by stepwise selection.

    Parameters:
    -----------
    data: pandas DataFrame with all possible predictors and response
    response: string, name of response column in data
    sle: significance level for a variable to enter the model
    sls: significance level for a variable to stay in the model

    Returns:
    --------
    model: an "optimal" fitted statsmodels logistic model with an
    intercept selected by stepwise selection
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    while remaining:
        # Forward step: refit with each remaining candidate and pick
        # the one with the smallest p-value.
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response,
                                           ' + '.join(selected + [candidate]))
            score = smf.logit(formula, data).fit(disp=0).pvalues[candidate]
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop(0)
        if best_new_score <= sle:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            # Backward step: after each entry, drop the worst selected
            # predictor if its p-value exceeds sls. The intercept is
            # excluded so it is never a removal candidate.
            formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
            scores = smf.logit(formula, data).fit(disp=0).pvalues.drop("Intercept")
            worst_new_score = scores.max()
            worst_candidate = scores.idxmax()
            if worst_new_score > sls:
                selected.remove(worst_candidate)
                remaining.add(worst_candidate)
                # Stop if the variable that just entered was dropped
                # again, to avoid an infinite enter/drop cycle.
                if best_candidate == worst_candidate:
                    break
        else:
            break
    formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
    model = smf.logit(formula, data).fit()
    return model
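
# A minimal usage sketch for stepwiseModel(), not part of the original
# routine. As above, the data and column names are synthetic and purely
# illustrative. Keeping sls <= sle is the conventional choice: a variable
# must clear a stricter bar to stay than to enter, which discourages the
# enter/drop cycling the function guards against.
def _demo_stepwiseModel():
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(1)
    n = 500
    df = pd.DataFrame({
        "x1": rng.normal(size=n),
        "x2": rng.normal(size=n),
        "x3": rng.normal(size=n),   # pure noise
    })
    xb = 1.0 * df["x1"] + 0.8 * df["x2"]
    df["y"] = (rng.random(n) < 1.0 / (1.0 + np.exp(-xb))).astype(int)

    model = stepwiseModel(df, "y", sle=0.05, sls=0.01)
    print(model.model.formula)   # expect "x1" and "x2" selected, "x3" excluded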
# Assumed aliases for the names referenced below; the original snippet
# used `gll` and `logit` without imports.
from statsmodels.base.model import GenericLikelihoodModel as gll
from statsmodels.discrete.discrete_model import Logit as logit

# NOTE: _ll_hdgpoisson() (the observation-level log-likelihood) and
# ztgpoisson() (the zero-truncated generalized Poisson model) are assumed
# to be defined elsewhere in this module.


def hdgpoisson(Y, X1, X2):
    """Estimate a hurdle generalized Poisson regression, which is the
    composite of a point mass at zero and a zero-truncated generalized
    Poisson distribution. In the model output, estimated coefficients
    prefixed with "P0:" predict the probability of a zero outcome and
    coefficients prefixed with "MU:" predict the frequency outcome of
    the zero-truncated generalized Poisson.

    Parameters:
    -----------
    Y: a pandas Series for the frequency outcome with integer values,
    including zeros.
    X1: a pandas DataFrame with the probability model variables that are
    all numeric values.
    X2: a pandas DataFrame with the count model variables that are all
    numeric values.

    Example:
    --------
    hdgpoisson(Y, X1, X2).fit().summary()
    """

    class hdgpoisson(gll):
        def __init__(self, endog, exog, **kwds):
            super(hdgpoisson, self).__init__(endog, exog, **kwds)

        def nloglikeobs(self, params):
            # The last parameter is the dispersion; the rest split into
            # the zero-probability block and the count block.
            _s = params[-1]
            d1 = _X1.shape[1]
            beta1 = params[:d1]
            beta2 = params[d1:-1]
            ll = _ll_hdgpoisson(self.endog, self.exog[:, :d1],
                                self.exog[:, d1:], beta1, beta2, _s)
            return (-ll)

        def fit(self, start_params=None, maxiter=10000, maxfun=5000,
                method="ncg", **kwds):
            self.exog_names.append('_S')
            if start_params is None:
                # Warm-start from a logit on the zero indicator and a
                # zero-truncated generalized Poisson on the positives.
                start_params = numpy.concatenate([p10, p20])
            return (super(hdgpoisson, self).fit(
                start_params=start_params, method=method,
                maxiter=maxiter, maxfun=maxfun, **kwds))

    _Y = Y.copy()
    _X1 = X1.copy()
    _X2 = X2.copy()
    _X1.insert(loc=0, column="_CONST", value=1)
    _X1.columns = ["P0:" + _ for _ in _X1.columns]
    _X2.insert(loc=0, column="_CONST", value=1)
    _X2.columns = ["MU:" + _ for _ in _X2.columns]
    _X = _X1.join(_X2)
    p10 = logit(numpy.where(_Y == 0, 1, 0), _X1).fit(disp=0).params
    p20 = ztgpoisson(Y[Y > 0], X2[Y > 0]).fit(disp=0).params
    return (hdgpoisson(_Y, _X))
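
# A minimal usage sketch for hdgpoisson(), not part of the original
# routine. It runs only if _ll_hdgpoisson() and ztgpoisson() are defined,
# as the function above requires. The data here are synthetic and purely
# illustrative: an ordinary Poisson count with extra zeros injected to
# mimic the hurdle, with made-up column names "z1" and "w1".
def _demo_hdgpoisson():
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(42)
    n = 1000
    X1 = pd.DataFrame({"z1": rng.normal(size=n)})
    X2 = pd.DataFrame({"w1": rng.normal(size=n)})
    counts = rng.poisson(np.exp(0.5 + 0.3 * X2["w1"]))
    # Force a share of observations to zero, driven by z1, to create
    # the point mass that the "P0:" block of the model captures.
    zeros = rng.random(n) < 1.0 / (1.0 + np.exp(-(0.2 + 0.8 * X1["z1"])))
    Y = pd.Series(np.where(zeros, 0, counts))

    result = hdgpoisson(Y, X1, X2).fit()
    print(result.summary())   # "P0:" and "MU:" coefficient blocks, plus _S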