Example #1
0
def backward_selected(data, response, sls=0.01):
    """Logistic model designed by backward selection.

    Starting from the full model, repeatedly refit and drop the predictor
    with the largest p-value until every remaining predictor's p-value is
    at or below ``sls``.

    Parameters:
    -----------
    data: pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    sls: significance level of a variable to stay in the model

    Returns:
    --------
    model: an "optimal" fitted statsmodels logistic model (smf.logit)
           with an intercept selected by backward selection
    """
    remaining = set(data.columns)
    remaining.remove(response)
    while remaining:
        # sorted() makes the formula (and thus the fit) reproducible across
        # runs despite set iteration order.
        formula = "{} ~ {} + 1".format(response, ' + '.join(sorted(remaining)))
        scores = smf.logit(formula, data).fit().pvalues
        # The intercept is never a removal candidate: dropping it here avoids
        # a KeyError on remaining.remove('Intercept') when the intercept
        # happens to have the largest p-value.
        scores = scores.drop('Intercept', errors='ignore')
        worst_new_score = scores.max()
        worst_candidate = scores.idxmax()
        if worst_new_score > sls:
            remaining.remove(worst_candidate)
        else:
            # All remaining predictors are significant at level sls.
            break
    formula = "{} ~ {} + 1".format(response, ' + '.join(sorted(remaining)))
    model = smf.logit(formula, data).fit()
    return model
Example #2
0
def stepwiseModel(data, response, sle=0.05, sls=0.01):
    """Logistic model designed by stepwise selection.

    Alternates forward steps (add the candidate with the smallest p-value,
    if it is at or below ``sle``) with backward checks (drop any selected
    predictor whose p-value exceeds ``sls``).

    Parameters:
    -----------
    data: pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    sle: significance level of a variable into the model
    sls: significance level of a variable to stay in the model

    Returns:
    --------
    model: an "optimal" fitted statsmodels logistic model (smf.logit)
           with an intercept selected by stepwise selection
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    while remaining:
        # Forward step: score every candidate by its own p-value when added
        # to the currently selected predictors.
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response,
                                           ' + '.join(selected + [candidate]))
            score = smf.logit(formula, data).fit().pvalues[candidate]
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop(0)
        if best_new_score <= sle:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            # Backward check: after adding, verify no selected predictor has
            # become insignificant at level sls.
            formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
            scores = smf.logit(formula, data).fit().pvalues
            # The intercept is never a removal candidate: dropping it avoids
            # selected.remove('Intercept') raising ValueError when the
            # intercept has the largest p-value.
            scores = scores.drop('Intercept', errors='ignore')
            worst_new_score = scores.max()
            worst_candidate = scores.idxmax()
            if worst_new_score > sls:
                selected.remove(worst_candidate)
                remaining.add(worst_candidate)
                # The variable we just added was immediately removed again:
                # stop to avoid an infinite add/remove cycle.
                if best_candidate == worst_candidate: break
        else:
            # No candidate is significant enough to enter the model.
            break
    formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
    model = smf.logit(formula, data).fit()
    return model
Example #3
0
def hdgpoisson(Y, X1, X2):
    """
  The function estimates a hurdle generalized poisson regression, which is the
  composite between a point mass at zero and a zero-truncated generalized
  poisson distribution.
  In the model outcome, estimated coefficients starting with "P0:" are used
  to predict the probability of zero outcomes and estimated coefficients
  starting with "MU:" are used to predict frequency outcomes for a
  zero-truncated generalized poisson.
  Parameters:
    Y  : a pandas series for the frequency outcome with integer values, including zeros.
    X1 : a pandas dataframe with the probability model variables that are all numeric values.
    X2 : a pandas dataframe with the count model variables that are all numeric values.
  Example:
    hdgpoisson(Y, X1, X2).fit().summary()
  """
    # Inner likelihood model; relies on the enclosing scope for _X1 (to split
    # params into the two linear predictors) and p10/p20 (starting values).
    class hdgpoisson(gll):
        def __init__(self, endog, exog, **kwds):
            super(hdgpoisson, self).__init__(endog, exog, **kwds)

        def nloglikeobs(self, params):
            # Layout of params: [beta1 (P0 block) | beta2 (MU block) | _S].
            _s = params[-1]
            d1 = _X1.shape[1]
            beta1 = params[:d1]
            beta2 = params[d1:-1]
            ll = _ll_hdgpoisson(self.endog, self.exog[:, :d1],
                                self.exog[:, d1:], beta1, beta2, _s)
            # GenericLikelihoodModel minimizes, so return the negative
            # log-likelihood per observation.
            return (-ll)

        def fit(self,
                start_params=None,
                maxiter=10000,
                maxfun=5000,
                method="ncg",
                **kwds):
            # Guard so repeated fit() calls do not append a duplicate '_S'
            # column name to exog_names.
            if '_S' not in self.exog_names:
                self.exog_names.append('_S')
            # 'is None', not '== None': callers may pass a numpy array, and
            # array == None would raise on truth-value evaluation.
            if start_params is None:
                start_params = numpy.concatenate([p10, p20])
            return (super(hdgpoisson, self).fit(start_params=start_params,
                                                method=method,
                                                maxiter=maxiter,
                                                maxfun=maxfun,
                                                **kwds))

    _Y = Y.copy()
    _X1 = X1.copy()
    _X2 = X2.copy()
    # Prepend an explicit intercept to each design matrix and prefix the
    # column names so the two coefficient blocks are distinguishable in the
    # fitted summary.
    _X1.insert(loc=0, column="_CONST", value=1)
    _X1.columns = ["P0:" + _ for _ in _X1.columns]
    _X2.insert(loc=0, column="_CONST", value=1)
    _X2.columns = ["MU:" + _ for _ in _X2.columns]
    _X = _X1.join(_X2)
    # Starting values: a logit on the zero indicator for the P0 block, and a
    # zero-truncated generalized poisson on the positive counts for the MU
    # block (NOTE(review): ztgpoisson is passed the raw X2 — presumably it
    # adds its own constant/dispersion; confirm params align with _X2 + _S).
    p10 = logit(numpy.where(_Y == 0, 1, 0), _X1).fit(disp=0).params
    p20 = ztgpoisson(Y[Y > 0], X2[Y > 0]).fit(disp=0).params
    return (hdgpoisson(_Y, _X))