Example #1
 def setupClass(cls):
     data = sm.datasets.spector.load()
     data.exog = sm.add_constant(data.exog, prepend=True)
     cls.res1 = Logit(data.endog, data.exog).fit_regularized(
             method="l1", alpha=0, disp=0, acc=1e-15, maxiter=1000,
             trim_mode='auto', auto_trim_tol=0.01)
     cls.res2 = Logit(data.endog, data.exog).fit(disp=0, tol=1e-15)
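As a standalone check of what this fixture sets up, the sketch below (a toy script, assuming the spector dataset bundled with statsmodels and the load_pandas loader) confirms that an L1 fit with alpha=0 reproduces the ordinary MLE:

import numpy as np
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Logit

data = sm.datasets.spector.load_pandas()          # small dataset bundled with statsmodels
exog = sm.add_constant(data.exog, prepend=True)

res_l1 = Logit(data.endog, exog).fit_regularized(
    method="l1", alpha=0, disp=0, acc=1e-15, maxiter=1000,
    trim_mode='auto', auto_trim_tol=0.01)
res_mle = Logit(data.endog, exog).fit(disp=0, tol=1e-15)

# with alpha=0 the L1 penalty vanishes, so both fits should agree
assert np.allclose(res_l1.params, res_mle.params, atol=1e-4)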
Example #2
 def test_cvxopt_versus_slsqp(self):
     # Compares results from cvxopt to the standard slsqp
     if has_cvxopt:
         self.alpha = 3. * np.array([0, 1, 1, 1.]) #/ self.data.endog.shape[0]
         res_slsqp = Logit(self.data.endog, self.data.exog).fit_regularized(
             method="l1", alpha=self.alpha, disp=0, acc=1e-10, maxiter=1000,
             trim_mode='auto')
         res_cvxopt = Logit(self.data.endog, self.data.exog).fit_regularized(
             method="l1_cvxopt_cp", alpha=self.alpha, disp=0, abstol=1e-10,
             trim_mode='auto', auto_trim_tol=0.01, maxiter=1000)
         assert_almost_equal(res_slsqp.params, res_cvxopt.params, DECIMAL_4)
     else:
         raise SkipTest("Skipped test_cvxopt since cvxopt is not available")
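The has_cvxopt flag above comes from the surrounding test module; it is presumably set with a standard optional-import guard along these lines (the variable name is taken from the snippet, the try/except itself is an assumption):

try:
    import cvxopt  # noqa: F401  -- optional solver backend for method="l1_cvxopt_cp"
    has_cvxopt = True
except ImportError:
    has_cvxopt = False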
Example #3
 def setupClass(cls):
     cls.kvars = 4 # Number of variables
     cls.m = 3 # Number of unregularized parameters
     data = sm.datasets.spector.load()
     data.exog = sm.add_constant(data.exog, prepend=True)
     # Do a regularized fit with alpha, effectively dropping the last column
     alpha = np.array([0, 0, 0, 10])
     cls.res_reg = Logit(data.endog, data.exog).fit_regularized(
         method="l1", alpha=alpha, disp=0, acc=1e-15, maxiter=2000,
         trim_mode='auto')
     # Actually drop the last column and do an unregularized fit
     exog_no_PSI = data.exog[:, :cls.m]
     cls.res_unreg = Logit(data.endog, exog_no_PSI).fit(disp=0, tol=1e-15)
Example #4
def score(df):

    X, y = get_X_y(df)

    vif = variance_inflation_factor
    print('VIF: ')
    for i in range(X.shape[1]):
        print(vif(X, i))

    X = add_constant(X)

    model = Logit(y, X).fit()
    print(model.summary(xname=names))

    kfold = KFold(n_splits=5)

    accuracies = []
    precisions = []
    recalls = []

    for train_index, test_index in kfold.split(X):
        model = LogisticRegression(solver="lbfgs")
        model.fit(X[train_index], y[train_index])
        y_predict = model.predict(X[test_index])
        y_true = y[test_index]
        accuracies.append(accuracy_score(y_true, y_predict))
        precisions.append(precision_score(y_true, y_predict))
        recalls.append(recall_score(y_true, y_predict))

    print("Accuracy:", np.average(accuracies))
    print("Precision:", np.average(precisions))
    print("Recall:", np.average(recalls))
Example #5
 def score(self,
           X,
           confounder_types,
           assignment='assignment',
           store_model_fit=False,
           intercept=True):
     df = X[[assignment]]
     regression_confounders = []
     for confounder, var_type in confounder_types.items():
         if var_type == 'o' or var_type == 'u':
             c_dummies = pd.get_dummies(X[[confounder]], prefix=confounder)
             if len(c_dummies.columns) == 1:
                 df[c_dummies.columns] = c_dummies[c_dummies.columns]
                 regression_confounders.extend(c_dummies.columns)
             else:
                 df[c_dummies.columns[1:]] = c_dummies[
                     c_dummies.columns[1:]]
                 regression_confounders.extend(c_dummies.columns[1:])
         else:
             regression_confounders.append(confounder)
             df.loc[:, confounder] = X[confounder].copy()  #
             df.loc[:, confounder] = X[confounder].copy()  #
     if intercept:
         df.loc[:, 'intercept'] = 1.
         regression_confounders.append('intercept')
     logit = Logit(df[assignment], df[regression_confounders])
     result = logit.fit()
     if store_model_fit:
         self.model_fit = result
     X.loc[:,
           'propensity score'] = result.predict(df[regression_confounders])
     return X
Example #6
    def setup_class(cls):
        cls.idx = slice(None)  # params sequence same as Stata
        # res1ul = Logit(data.endog, data.exog).fit(method="newton", disp=0)
        cls.res2 = reslogit.results_constraint2_robust

        mod1 = Logit(spector_data.endog, spector_data.exog)

        # not used to match Stata for HC
        # nobs, k_params = mod1.exog.shape
        # k_params -= 1   # one constraint
        cov_type = 'HC0'
        cov_kwds = {'scaling_factor': 32 / 31}
        # looks like nobs / (nobs - 1) and not (nobs - 1.) / (nobs - k_params)}
        constr = 'x1 - x3 = 0'
        cls.res1m = mod1.fit_constrained(
            constr,
            cov_type=cov_type,
            cov_kwds=cov_kwds,
            tol=1e-10,
        )

        R, q = cls.res1m.constraints.coefs, cls.res1m.constraints.constants
        cls.res1 = fit_constrained(mod1,
                                   R,
                                   q,
                                   fit_kwds={
                                       'tol': 1e-10,
                                       'cov_type': cov_type,
                                       'cov_kwds': cov_kwds
                                   })
        cls.constraints_rq = (R, q)
Example #7
def checkmodel(X_train, y_train):
    X = X_train
    X_const = add_constant(X, prepend=True)
    y = y_train
    logit_model = Logit(y, X_const).fit()
    print(logit_model.summary())
    return (logit_model)
def forward_selection(dataframe, target, list_to_dummify, criteria='bic'):
    '''
    runs a forward selection process to select the best predictor set based on bic or aic
    returns a dictionary with the variable set and aic/bic at each step
    ----------
    criteria: default value bic, otherwise aic is used
    list_to_dummify: a list of columns in string format that require dummification before modeling
    '''
    #create target array, intercept only dataframe, and list of variables to select from
    X = pd.DataFrame()
    y = dataframe[target]
    X['const'] = np.ones(dataframe.shape[0])
    var_list = list(dataframe.columns)
    var_list.remove(target)

    #create empty dictionary to store output of each step
    models = {'model_vars': [], 'scoring_crit': []}

    #define while loop that will run until all variables have been selected
    while len(var_list) > 0:

        #define empty list to store aic/bic values temporarily for step attempt
        crit_vals = []

        #try adding variables one by one to find the lowest aic/bic model for the current step
        for var in var_list:
            #create temporary df with all previously selected variables + the new variable being tried
            tempX = pd.concat([X, dataframe[var]], axis=1)
            #dummify the variable if necessary
            if var in list_to_dummify:
                tempX = dummify_columns(tempX, [var])
            #fit the logistic model
            logit = Logit(y, tempX)
            fitted_logit = logit.fit()
            #store aic or bic in a list for each variable attempted
            if criteria == 'bic':
                crit_vals += [fitted_logit.bic]
            else:
                crit_vals += [fitted_logit.aic]

        #find the index of the lowest bic model and store the name of the variable which produced it
        min_crit_idx = crit_vals.index(min(crit_vals))
        best_var = var_list[min_crit_idx]

        #add the best variable to the df
        X = pd.concat([X, dataframe[best_var]], axis=1)

        #store the variables and aic/bic for the best model at the current step
        models['model_vars'] += [list(X.columns)]
        models['scoring_crit'] += [min(crit_vals)]

        #dummify the added variable if necessary
        if best_var in list_to_dummify:
            X = dummify_columns(X, [best_var])

        #remove the added variable from the variable list and track progress
        var_list.remove(best_var)
        print('adding var: %s' % (best_var))

    return models
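dummify_columns is defined elsewhere in that project. A compact, self-contained sketch of the same BIC-driven forward-selection loop on synthetic data, skipping the dummification step:

import numpy as np
import pandas as pd
from statsmodels.discrete.discrete_model import Logit

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(400, 4)), columns=list('abcd'))
y = (df['a'] - 2 * df['b'] + rng.logistic(size=400) > 0).astype(float)

X = pd.DataFrame({'const': np.ones(len(df))})
remaining = list(df.columns)
while remaining:
    # fit one candidate model per remaining variable and keep the lowest-BIC one
    bics = {v: Logit(y, pd.concat([X, df[v]], axis=1)).fit(disp=0).bic
            for v in remaining}
    best = min(bics, key=bics.get)
    X = pd.concat([X, df[best]], axis=1)
    remaining.remove(best)
    print('adding var: %s (bic %.2f)' % (best, bics[best]))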
def basic_significance(dataframe, list_to_dummify, target):
    '''
    fits a non-regularized logistic model to target using dataframe predictors
    prints model accuracy and outputs significant coefficients ordered by absolute magnitude
    ----------
    list_to_dummify: a list of columns in string format that require dummification before modeling
    '''
    #process the dataframe
    df = dataframe.copy()
    df = dummify_columns(df, list_to_dummify)
    X, y = xy_split(df, target)
    X = add_constant(X)
    #fit the model
    logit = Logit(y, X)
    fitted_logit = logit.fit()
    #store accuracy
    c_mat = confusion_matrix(
        y, np.round(logit.predict(fitted_logit.params)))
    accuracy = sum(c_mat.diagonal()) / np.sum(c_mat)
    print('model train accuracy: %s' % (accuracy))
    #store significant coefs
    coefs = pd.DataFrame(fitted_logit.pvalues[fitted_logit.pvalues < 0.05])
    coefs['coefs'] = fitted_logit.params.filter(items=coefs.index)
    coefs.columns = ['p-values', 'coefs']
    coefs['abs_coefs'] = np.abs(coefs.coefs)
    coefs = coefs.sort_values(by='abs_coefs', ascending=False)
    coefs = coefs.drop('abs_coefs', axis=1)
    return fitted_logit, coefs
Example #10
def OLD_regress(density):

    dcols = sorted([c for c in density.columns if isinstance(c,float)])

    uchoice = density.choice.values
    uvals = density[dcols].values

    isnan = np.isnan(uchoice)
    if np.sum(isnan)>0:
        print('Excluding {:0.0f}% nans.'.format(np.mean(isnan)*100))
    uchoice = uchoice[~isnan]
    uvals = uvals[~isnan]

    try:
        reg = Logit(uchoice,uvals).fit(disp=False)
        reg_params = reg.params
        reg_err = np.abs(reg.conf_int(alpha=0.05).T - reg.params)
    except (np.linalg.LinAlgError, sm.tools.sm_exceptions.PerfectSeparationError):
        reg_params = np.nan * np.zeros(uvals.shape[1])
        reg_err = np.nan * np.zeros([2, len(reg_params)])
        

    res = pd.DataFrame(index=dcols)
    res.loc[:,'weight'] = reg_params
    if not np.any(np.isnan(reg_err)):
        assert np.allclose(reg_err[0],reg_err[1]) # symmetrical errorbars
        res.loc[:,'yerr'] = reg_err[0] # half of confidence interval
    else:
        res.loc[:,'yerr'] = np.nan

    return res
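The symmetry assertion above holds because a Logit results object builds conf_int() as params ± z_0.975 · bse, so both half-widths equal the same multiple of the standard error. A quick sketch on synthetic data:

import numpy as np
import statsmodels.api as sm
from scipy import stats

rng = np.random.default_rng(0)
exog = sm.add_constant(rng.normal(size=(200, 2)))
endog = (exog @ np.array([0.2, 1.0, -0.5]) + rng.logistic(size=200) > 0).astype(float)

res = sm.Logit(endog, exog).fit(disp=False)
half_width = np.abs(res.conf_int(alpha=0.05).T - res.params)   # same construction as above
assert np.allclose(half_width[0], half_width[1])               # symmetric error bars
assert np.allclose(half_width[0], stats.norm.ppf(0.975) * res.bse)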
Example #11
    def fit_logit(self):
        '''
        Takes in the DF and fits a logistic regression of y on X.
        Prints baseline (mode) model diagnostics, fitted model diagnostics, and the ROC curve.
        Returns the SMOTEd X and y values.
        '''
        self.y = self.df['repeat'].values
        self.X = self.df.drop(['repeat', 'CustomerNo'], axis=1).values
        #smote the data
        self.X_smote, self.y_smote = smote(self.X, self.y, 0.5)
        self.X_const = add_constant(self.X_smote, prepend=True)
        logit_model = Logit(self.y_smote, self.X_const).fit()
        print(logit_model.summary())
        y_predict = logit_model.predict(self.X_const)

    #check a baseline model that just assigns the mode to each individual
        mode_model_acc, mode_model_precision, mode_model_recall = self.mode_cross_val(
            self.X_smote, self.y_smote)
        print("ModelAccuracy: {}, ModelPrecision: {}, ModelRecall: {}".format(
            mode_model_acc, mode_model_precision, mode_model_recall))

        model_acc, model_precision, model_recall = self.logit_cross_val(
            self.X_smote, self.y_smote)
        print("ModelAccuracy: {}, ModelPrecision: {}, ModelRecall: {}".format(
            model_acc, model_precision, model_recall))

        return self.X_smote, self.y_smote
def logreg(X, y, train_test=True, roc=True):
    '''
    INPUT:
        - X: 2-D feature matrix
        - y: target vector
        - train_test: boolean
        - roc: boolean
    OUTPUT:
        - fitted: fitted LogitResults
    Runs statsmodels Logistic Regression and prints summary.  Uses
    train_test_split to split data if train_test = True.  Plots and shows ROC
    curve if roc = True.  Returns fitted Logistic Regression model.
    '''

    if train_test:
        X_train, X_test, y_train, y_test = train_test_split(X, y)
    else:
        X_train, X_test, y_train, y_test = X, X, y, y

    vifs, filtered = get_vifs(X_train)
    X_train, X_test = X_train[filtered], X_test[filtered]

    log_reg = Logit(y_train, add_constant(X_train, has_constant='add'))
    fitted = log_reg.fit(method='bfgs', maxiter=500)
    try:
        print(fitted.summary())
    except:
        return logreg(X, y)
    if roc:
        plot_roc(y_test, fitted.predict(add_constant(X_test, has_constant='add')))

    return fitted
Example #13
def analyze_statsmodel(df_X, df_y):
    X = df_X.to_numpy()
    X_const = add_constant(X, prepend=True)
    y = df_y.to_numpy()

    logit_model = Logit(y, X_const).fit()
    print(logit_model.summary())
Example #14
 def setupClass(cls):
     data = sm.datasets.spector.load()
     data.exog = sm.add_constant(data.exog, prepend=False)
     cls.res1 = Logit(data.endog, data.exog).fit(method="newton", disp=0)
     res2 = Spector()
     res2.logit()
     cls.res2 = res2
Example #15
    def __init__(self,
                 endog,
                 exog,
                 exog_infl=None,
                 offset=None,
                 inflation='logit',
                 exposure=None,
                 missing='none',
                 **kwargs):
        super(GenericZeroInflated, self).__init__(endog,
                                                  exog,
                                                  offset=offset,
                                                  exposure=exposure,
                                                  missing=missing,
                                                  **kwargs)

        if exog_infl is None:
            self.k_inflate = 1
            self.exog_infl = np.ones((endog.size, self.k_inflate),
                                     dtype=np.float64)
        else:
            self.exog_infl = exog_infl
            self.k_inflate = exog_infl.shape[1]

        if len(exog.shape) == 1:
            self.k_exog = 1
        else:
            self.k_exog = exog.shape[1]

        self.infl = inflation
        if inflation == 'logit':
            self.model_infl = Logit(np.zeros(self.exog_infl.shape[0]),
                                    self.exog_infl)
            self._hessian_inflate = self._hessian_logit
        elif inflation == 'probit':
            self.model_infl = Probit(np.zeros(self.exog_infl.shape[0]),
                                     self.exog_infl)
            self._hessian_inflate = self._hessian_probit

        else:
            raise TypeError("inflation == %s, which is not handled" %
                            inflation)

        self.inflation = inflation
        self.k_extra = self.k_inflate

        if len(self.exog) != len(self.exog_infl):
            raise ValueError(
                'exog and exog_infl have different number of '
                'observations. `missing` handling is not supported')

        infl_names = [
            'inflate_%s' % i for i in self.model_infl.data.param_names
        ]
        self.exog_names[:] = infl_names + list(self.exog_names)
        self.exog_infl = np.asarray(self.exog_infl, dtype=np.float64)

        self._init_keys.extend(['exog_infl', 'inflation'])
        self._null_drop_keys = ['exog_infl']
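This is the statsmodels GenericZeroInflated initializer; in practice the 'logit' inflation branch above is exercised through subclasses such as ZeroInflatedPoisson. A small sketch on synthetic data (coefficients and zero-inflation rate are arbitrary):

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
n = 500
exog = sm.add_constant(rng.normal(size=(n, 1)))
exog_infl = np.ones((n, 1))                       # intercept-only inflation model
y = rng.poisson(np.exp(0.3 + 0.5 * exog[:, 1]))
y[rng.random(n) < 0.3] = 0                        # inject extra zeros

mod = sm.ZeroInflatedPoisson(y, exog, exog_infl=exog_infl, inflation='logit')
res = mod.fit(maxiter=500, disp=0)
print(res.params)   # the leading parameter(s) belong to the inflation (logit) part, per the exog_names ordering above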
Example #16
 def setupClass(cls):
     data = sm.datasets.spector.load()
     data.exog = sm.add_constant(data.exog, prepend=True)
     cls.model = Logit(data.endog, data.exog)
     cls.alphas = np.array([[0.1, 0.1, 0.1, 0.1], [0.4, 0.4, 0.5, 0.5],
                            [0.5, 0.5, 1, 1]])  #/ data.exog.shape[0]
     cls.res1 = DiscreteL1()
     cls.res1.sweep()
Example #17
def sm_summary(X, docs, y):
    vectorizer.fit(docs)
    bc.fit(vector(docs), y)
    bc_predict = np.reshape(bc.predict(vector(docs)), (-1, 1))
    X = np.append(X, bc_predict, axis=1)
    X = add_constant(X)

    model = Logit(y, X).fit()
    print(model.summary())
Example #18
def get_logit_coef(X, y, cols=None):
    if cols:
        X_fit = X[cols]
    else:
        X_fit = X
    X_fit = sm.add_constant(X_fit)
    logit = Logit(y, X_fit)
    fit = logit.fit()
    print(fit.summary())
def log_reg(X_train, Y_train, X_val):
    from statsmodels.discrete.discrete_model import Logit
    from statsmodels.tools import add_constant
    X_train = add_constant(X_train)
    X_val = add_constant(X_val)
    logit = Logit(Y_train, X_train)
    fit = logit.fit(method='bfgs', maxiter=10000)
    logitprobs = fit.predict(X_val)
    return logitprobs
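A usage sketch for log_reg on made-up frames; the helper adds the constant itself, so raw feature columns are passed in (column names are illustrative):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X_train = pd.DataFrame(rng.normal(size=(150, 2)), columns=['x1', 'x2'])
Y_train = (X_train['x1'] - 0.5 * X_train['x2'] + rng.logistic(size=150) > 0).astype(int)
X_val = pd.DataFrame(rng.normal(size=(30, 2)), columns=['x1', 'x2'])

probs = log_reg(X_train, Y_train, X_val)   # predicted P(y=1) for each validation row
print(probs[:5])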
Example #20
def select_features(X, y):
    if len(set(y)) == 2:
        model = Logit(y, X)
    else:
        model = OLS(y, X)
    res = model.fit(disp=False)
    features = multitest.multipletests(res.pvalues, method='holm')[0]
    X = X[:, features]
    return X
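select_features leans on statsmodels' multipletests for the Holm correction. A standalone sketch of the same masking step on synthetic data (the OLS branch is omitted here):

import numpy as np
from statsmodels.discrete.discrete_model import Logit
from statsmodels.stats import multitest

rng = np.random.default_rng(0)
X = np.column_stack([np.ones(300), rng.normal(size=(300, 4))])
y = (X[:, 1] - X[:, 2] + rng.logistic(size=300) > 0).astype(float)

res = Logit(y, X).fit(disp=False)
keep = multitest.multipletests(res.pvalues, method='holm')[0]   # boolean mask of significant coefficients
X_selected = X[:, keep]
print('kept columns:', np.flatnonzero(keep))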
Example #21
def logit_reg():
    X_smoted, X_test, y_smoted, y_test = prep_X_y(df, constant=True)
    lm = Logit(y_smoted, X_smoted).fit(method='powell')
    y_pred = lm.predict(X_test).round(0)
    print('Statsmodels Logit Regression--------------------------------')
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    return lm
Example #22
 def setupClass(cls):
     data = sm.datasets.spector.load()
     data.exog = sm.add_constant(data.exog, prepend=True)
     cls.alpha = 3 * np.array([0., 1., 1., 1.]) #/ data.exog.shape[0]
     cls.res1 = Logit(data.endog, data.exog).fit_regularized(
         method="l1", alpha=cls.alpha, disp=0, trim_mode='size',
         size_trim_tol=1e-5, acc=1e-10, maxiter=1000)
     res2 = DiscreteL1()
     res2.logit()
     cls.res2 = res2
Example #23
    def _initialize(cls):
        y, x = cls.y, cls.x

        modp = Logit(y, x)
        cls.res2 = modp.fit(disp=0)

        mod = LogitPenalized(y, x, penal=cls.penalty)
        mod.pen_weight = 0
        cls.res1 = mod.fit(disp=0)

        cls.atol = 1e-4  # why not closer ?
Example #24
def logregress(xi, xj, *args, **kwargs):
    x = np.vstack((xi, xj))[::3]
    y = np.vstack((np.zeros((xi.shape[0], 1)), np.ones((xj.shape[0], 1))))[::3]
    scaler = MinMaxScaler([-1, 1])
    scaler.fit(x)
    x = scaler.transform(x)
    #clf = LogisticRegression(random_state=0).fit(x, y[:, 0])
    model = Logit(y, x)
    res = model.fit()
    #print(res.prsquared)
    return res.prsquared
Example #25
    def setup_class(cls):
        df = data_bin
        mod = GLM(df['constrict'], df[['const', 'log_rate', 'log_volumne']],
                  family=families.Binomial())
        res = mod.fit(method="newton", tol=1e-10)
        from statsmodels.discrete.discrete_model import Logit
        mod2 = Logit(df['constrict'], df[['const', 'log_rate', 'log_volumne']])
        res2 = mod2.fit(method="newton", tol=1e-10)

        cls.infl1 = res.get_influence()
        cls.infl0 = res2.get_influence()
Example #26
def create_Logit(X, y):
    '''
    creates statsmodels logistic regression model with 'linked click ' as target variable
    INPUT: pandas dataframe
    OUTPUT: statsmodels Logistic Regression model
    '''
    X = X.copy()
    X['constant'] = 1
    X.pop('email_id')
    logit = Logit(y, X)
    model = logit.fit(maxiter=400)
    return model
Example #27
def fit_model(X: pd.DataFrame, y: pd.Series) -> BinaryResultsWrapper:
    """Fits and returns dynamicBt model

    Args:
        X: predictor variables
        y: response variable

    Returns:
        Results wrapper
    """
    model = Logit(y, X).fit(method="newton")
    return model
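fit_model does not add an intercept, so the caller is expected to pass a design matrix that already contains one. A minimal usage sketch with toy frames (column names are made up; the Logit import is the one used throughout these examples):

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
X = pd.DataFrame({'x1': rng.normal(size=200), 'x2': rng.normal(size=200)})
X = sm.add_constant(X)                                  # intercept column added by the caller
y = pd.Series((X['x1'] + rng.logistic(size=200) > 0).astype(int), name='y')

res = fit_model(X, y)
print(res.params)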
Example #28
    def _initialize(cls):
        y, x = cls.y, cls.x
        modp = Logit(y, x[:, :cls.k_nonzero])
        cls.res2 = modp.fit(disp=0)

        mod = LogitPenalized(y, x, penal=cls.penalty)
        mod.pen_weight *= .5
        mod.penal.tau = 0.05
        cls.res1 = mod.fit(method='bfgs', maxiter=100, disp=0)

        cls.exog_index = slice(None, cls.k_nonzero, None)

        cls.atol = 5e-3
Example #29
def licata_fit(t, max_rl=None):
    """Using symbols from Licata 2017
    Model from Busse 2011 JNeuro

    convention: -1 means left, 1 means right
    """
    if max_rl is None:
        max_rl = np.max(np.abs((t.nR-t.nL).values))

    r = np.abs(t.nR-t.nL).values / max_rl
    r[t.side==0] = -r[t.side==0]

    ch = t.choice.values.copy()
    #ch[ch==0] = -1

    h_success = t.last_outcome.values.astype(int)
    h_success[t.last_choice==0] *= -1
    # interpretation of this: 1 if previous trial was R-choice&correct, -1 if L-choice&correct
    # b/c R choices are coded as 1, fit weights on this regressor are interpreted as: higher positive weight means correct-and-stay, more negative weight means correct-and-switch

    h_fail = (t.last_outcome==0).astype(int).values
    h_fail[t.last_choice==0] *= -1
    # interpretation of this: 1 if previous trial was R-choice&error, -1 if L-choice&error
    # b/c R choices are coded as 1, fit weights on this regressor are interpreted as: higher positive weight means error-and-stay, more negative weight means error-and-switch

    b0 = np.ones_like(r)

    y = ch
    x = np.array([b0, r, h_success, h_fail]).T

    #print(x.min(axis=0), x.max(axis=0))

    # run GLM
    
    # version 1:
    """
    logit_link = sm.genmod.families.links.logit
    glm_binom = sm.GLM(
            y,
            x,
            family=sm.families.Binomial(link=logit_link))
    glm_result = glm_binom.fit(maxiter=1000, method='bfgs')
    """

    # version 2:
    glm_result = Logit(y,x).fit(maxiter=1000, method='powell', disp=False)

    params = glm_result.params  
    err = glm_result.bse

    return params,err
Example #30
    def score(
        self,
        X,
        confounder_types,
        assignment="assignment",
        store_model_fit=False,
        intercept=True,
        propensity_score_name="propensity score",
    ):
        """
        Fit a propensity score model using the data in X and the confounders listed in confounder_types. This adds
        the propensity scores to the dataframe, and returns the new dataframe.

        :param X: The data set, with (at least) an assignment, set of confounders, and an outcome
        :param assignment: A categorical variable (currently, 0 or 1) indicating test or control group, resp.
        :param outcome: The outcome of interest.  Should be real-valued or ordinal.
        :param confounder_types: A dictionary of variable_name: variable_type pairs of strings, where
        variable_type is in {'c', 'o', 'd'}, for 'continuous', 'ordinal', and 'discrete'.
        :param store_model_fit: boolean, Whether to store the model as an attribute of the class, as
        self.propensity_score_model
        :param intercept: Whether to include an intercept in the logistic regression model
        :return: A new dataframe with the propensity scores included
        """
        df = X[[assignment]].copy()
        regression_confounders = []
        for confounder, var_type in confounder_types.items():
            if var_type == "o" or var_type == "u":
                c_dummies = pd.get_dummies(X[[confounder]], prefix=confounder)
                if len(c_dummies.columns) == 1:
                    df = pd.concat([df, c_dummies[c_dummies.columns]], axis=1)
                    regression_confounders.extend(c_dummies.columns)
                else:
                    df = pd.concat([df, c_dummies[c_dummies.columns[1:]]],
                                   axis=1)
                    regression_confounders.extend(c_dummies.columns[1:])
            else:
                regression_confounders.append(confounder)
                df.loc[:, confounder] = X[confounder].copy()
                df.loc[:, confounder] = X[confounder].copy()
        if intercept:
            df.loc[:, "intercept"] = 1.0
            regression_confounders.append("intercept")
        logit = Logit(df[assignment], df[regression_confounders])
        model = logit.fit()
        if store_model_fit:
            self.propensity_score_model = model
        X.loc[:, propensity_score_name] = model.predict(
            df[regression_confounders])
        return X
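For reference, a self-contained sketch of the same propensity-score construction done directly with Logit on synthetic data (column names and confounder types are made up):

import numpy as np
import pandas as pd
from statsmodels.discrete.discrete_model import Logit

rng = np.random.default_rng(0)
X = pd.DataFrame({
    "assignment": rng.integers(0, 2, size=500),
    "age": rng.normal(40, 10, size=500),               # continuous confounder
    "group": rng.choice(["a", "b", "c"], size=500),    # categorical confounder
})

df = X[["assignment"]].copy()
dummies = pd.get_dummies(X[["group"]], prefix="group", drop_first=True, dtype=float)
df = pd.concat([df, dummies], axis=1)
df["age"] = X["age"]
df["intercept"] = 1.0

confounders = list(dummies.columns) + ["age", "intercept"]
result = Logit(df["assignment"], df[confounders]).fit(disp=0)
X["propensity score"] = result.predict(df[confounders])
print(X["propensity score"].describe())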