def setupClass(cls):
    data = sm.datasets.spector.load()
    data.exog = sm.add_constant(data.exog, prepend=True)
    cls.res1 = Logit(data.endog, data.exog).fit_regularized(
        method="l1", alpha=0, disp=0, acc=1e-15, maxiter=1000,
        trim_mode='auto', auto_trim_tol=0.01)
    cls.res2 = Logit(data.endog, data.exog).fit(disp=0, tol=1e-15)

def test_cvxopt_versus_slsqp(self):
    # Compares results from cvxopt to the standard slsqp
    if has_cvxopt:
        self.alpha = 3. * np.array([0, 1, 1, 1.])  # / self.data.endog.shape[0]
        res_slsqp = Logit(self.data.endog, self.data.exog).fit_regularized(
            method="l1", alpha=self.alpha, disp=0, acc=1e-10,
            maxiter=1000, trim_mode='auto')
        res_cvxopt = Logit(self.data.endog, self.data.exog).fit_regularized(
            method="l1_cvxopt_cp", alpha=self.alpha, disp=0, abstol=1e-10,
            trim_mode='auto', auto_trim_tol=0.01, maxiter=1000)
        assert_almost_equal(res_slsqp.params, res_cvxopt.params, DECIMAL_4)
    else:
        raise SkipTest("Skipped test_cvxopt since cvxopt is not available")

def setupClass(cls):
    cls.kvars = 4  # Number of variables
    cls.m = 3  # Number of unregularized parameters
    data = sm.datasets.spector.load()
    data.exog = sm.add_constant(data.exog, prepend=True)
    # Do a regularized fit with alpha, effectively dropping the last column
    alpha = np.array([0, 0, 0, 10])
    cls.res_reg = Logit(data.endog, data.exog).fit_regularized(
        method="l1", alpha=alpha, disp=0, acc=1e-15, maxiter=2000,
        trim_mode='auto')
    # Actually drop the last column and do an unregularized fit
    exog_no_PSI = data.exog[:, :cls.m]
    cls.res_unreg = Logit(data.endog, exog_no_PSI).fit(disp=0, tol=1e-15)

def score(df):
    X, y = get_X_y(df)
    vif = variance_inflation_factor
    print('VIF: ')
    for i in range(X.shape[1]):
        print(vif(X, i))
    X = add_constant(X)
    model = Logit(y, X).fit()
    # `names` is assumed to be defined at module scope (labels for the summary)
    print(model.summary(xname=names))
    kfold = KFold(n_splits=5)
    accuracies = []
    precisions = []
    recalls = []
    for train_index, test_index in kfold.split(X):
        model = LogisticRegression(solver="lbfgs")
        model.fit(X[train_index], y[train_index])
        y_predict = model.predict(X[test_index])
        y_true = y[test_index]
        accuracies.append(accuracy_score(y_true, y_predict))
        precisions.append(precision_score(y_true, y_predict))
        recalls.append(recall_score(y_true, y_predict))
    print("Accuracy:", np.average(accuracies))
    print("Precision:", np.average(precisions))
    print("Recall:", np.average(recalls))

def score(self, X, confounder_types, assignment='assignment',
          store_model_fit=False, intercept=True):
    df = X[[assignment]]
    regression_confounders = []
    for confounder, var_type in confounder_types.items():
        if var_type == 'o' or var_type == 'u':
            c_dummies = pd.get_dummies(X[[confounder]], prefix=confounder)
            if len(c_dummies.columns) == 1:
                df[c_dummies.columns] = c_dummies[c_dummies.columns]
                regression_confounders.extend(c_dummies.columns)
            else:
                df[c_dummies.columns[1:]] = c_dummies[c_dummies.columns[1:]]
                regression_confounders.extend(c_dummies.columns[1:])
        else:
            regression_confounders.append(confounder)
            df.loc[:, confounder] = X[confounder].copy()
    if intercept:
        df.loc[:, 'intercept'] = 1.
        regression_confounders.append('intercept')
    logit = Logit(df[assignment], df[regression_confounders])
    result = logit.fit()
    if store_model_fit:
        self.model_fit = result
    X.loc[:, 'propensity score'] = result.predict(df[regression_confounders])
    return X

def setup_class(cls):
    cls.idx = slice(None)  # params sequence same as Stata
    # res1ul = Logit(data.endog, data.exog).fit(method="newton", disp=0)
    cls.res2 = reslogit.results_constraint2_robust
    mod1 = Logit(spector_data.endog, spector_data.exog)
    # not used to match Stata for HC
    # nobs, k_params = mod1.exog.shape
    # k_params -= 1  # one constraint
    cov_type = 'HC0'
    cov_kwds = {'scaling_factor': 32 / 31}
    # looks like nobs / (nobs - 1) and not (nobs - 1.) / (nobs - k_params)
    constr = 'x1 - x3 = 0'
    cls.res1m = mod1.fit_constrained(constr, cov_type=cov_type,
                                     cov_kwds=cov_kwds, tol=1e-10)
    R, q = cls.res1m.constraints.coefs, cls.res1m.constraints.constants
    cls.res1 = fit_constrained(mod1, R, q,
                               fit_kwds={'tol': 1e-10,
                                         'cov_type': cov_type,
                                         'cov_kwds': cov_kwds})
    cls.constraints_rq = (R, q)

def checkmodel(X_train, y_train):
    X = X_train
    X_const = add_constant(X, prepend=True)
    y = y_train
    logit_model = Logit(y, X_const).fit()
    print(logit_model.summary())
    return logit_model

def forward_selection(dataframe, target, list_to_dummify, criteria='bic'):
    '''
    Runs a forward selection process to select the best predictor set
    based on BIC or AIC.
    Returns a dictionary with the variable set and AIC/BIC at each step.
    ----------
    criteria: 'bic' by default, otherwise AIC is used
    list_to_dummify: a list of columns in string format that require
        dummification before modeling
    '''
    # create target array, intercept-only dataframe, and list of variables to select from
    X = pd.DataFrame()
    y = dataframe[target]
    X['const'] = np.ones(dataframe.shape[0])
    var_list = list(dataframe.columns)
    var_list.remove(target)
    # create empty dictionary to store the output of each step
    models = {'model_vars': [], 'scoring_crit': []}
    # run until all variables have been selected
    while len(var_list) > 0:
        # temporarily store AIC/BIC values for this step
        crit_vals = []
        # try adding variables one by one and find the lowest-criterion model for this step
        for var in var_list:
            # all previously selected variables plus the new candidate
            tempX = pd.concat([X, dataframe[var]], axis=1)
            # dummify the variable if necessary
            if var in list_to_dummify:
                tempX = dummify_columns(tempX, [var])
            # fit the logistic model
            logit = Logit(y, tempX)
            fitted_logit = logit.fit()
            # store AIC or BIC for each variable attempted
            if criteria == 'bic':
                crit_vals += [fitted_logit.bic]
            else:
                crit_vals += [fitted_logit.aic]
        # find the index of the lowest-criterion model and the variable that produced it
        min_crit_idx = crit_vals.index(min(crit_vals))
        best_var = var_list[min_crit_idx]
        # add the best variable to the df
        X = pd.concat([X, dataframe[best_var]], axis=1)
        # store the variables and AIC/BIC for the best model at the current step
        models['model_vars'] += [list(X.columns)]
        models['scoring_crit'] += [min(crit_vals)]
        # dummify the added variable if necessary
        if best_var in list_to_dummify:
            X = dummify_columns(X, [best_var])
        # remove the added variable from the variable list and track progress
        var_list.remove(best_var)
        print('adding var: %s' % best_var)
    return models

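# A minimal usage sketch for forward_selection (not from the original source):
# `df`, the 'outcome' target, and the 'group' column are illustrative
# assumptions; dummify_columns must be in scope, as in the snippet above.
results = forward_selection(df, target='outcome',
                            list_to_dummify=['group'], criteria='aic')
best_step = results['scoring_crit'].index(min(results['scoring_crit']))
print(results['model_vars'][best_step])  # predictor set with the lowest AIC
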
def basic_significance(dataframe, list_to_dummify, target):
    '''
    Fits a non-regularized logistic model to target using dataframe predictors.
    Prints model accuracy and outputs significant coefficients ordered by
    absolute magnitude.
    ----------
    list_to_dummify: a list of columns in string format that require
        dummification before modeling
    '''
    # process the dataframe
    df = dataframe.copy()
    df = dummify_columns(df, list_to_dummify)
    X, y = xy_split(df, target)
    X = add_constant(X)
    # fit the model
    logit = Logit(y, X)
    fitted_logit = logit.fit()
    # store accuracy
    c_mat = confusion_matrix(y, np.round(fitted_logit.predict(X)))
    accuracy = sum(c_mat.diagonal()) / np.sum(c_mat)
    print('model train accuracy: %s' % accuracy)
    # store significant coefs
    coefs = pd.DataFrame(fitted_logit.pvalues[fitted_logit.pvalues < 0.05])
    coefs['coefs'] = fitted_logit.params.filter(items=coefs.index)
    coefs.columns = ['p-values', 'coefs']
    coefs['abs_coefs'] = np.abs(coefs.coefs)
    coefs = coefs.sort_values(by='abs_coefs', ascending=False)
    coefs = coefs.drop('abs_coefs', axis=1)
    return fitted_logit, coefs

def OLD_regress(density):
    dcols = sorted([c for c in density.columns if isinstance(c, float)])
    uchoice = density.choice.values
    uvals = density[dcols].values
    isnan = np.isnan(uchoice)
    if np.sum(isnan) > 0:
        print('Excluding {:0.0f}% nans.'.format(np.mean(isnan) * 100))
        uchoice = uchoice[~isnan]
        uvals = uvals[~isnan]
    try:
        reg = Logit(uchoice, uvals).fit(disp=False)
        reg_params = reg.params
        reg_err = np.abs(reg.conf_int(alpha=0.05).T - reg.params)
    except (np.linalg.LinAlgError,
            sm.tools.sm_exceptions.PerfectSeparationError):
        reg_params = np.nan * np.zeros(uvals.shape[1])
        reg_err = np.nan * np.zeros([2, len(reg_params)])
    res = pd.DataFrame(index=dcols)
    res.loc[:, 'weight'] = reg_params
    if not np.any(np.isnan(reg_err)):
        assert np.allclose(reg_err[0], reg_err[1])  # symmetrical error bars
        res.loc[:, 'yerr'] = reg_err[0]  # half of the confidence interval
    else:
        res.loc[:, 'yerr'] = np.nan
    return res

def fit_logit(self):
    '''
    Takes in a DF and does logistic regression for X vs y.
    Prints baseline mode-model diagnostics, predicted model diagnostics,
    and the ROC curve.
    Returns SMOTE X and y values.
    '''
    self.y = self.df['repeat'].values
    self.X = self.df.drop(['repeat', 'CustomerNo'], axis=1).values
    # SMOTE the data
    self.X_smote, self.y_smote = smote(self.X, self.y, 0.5)
    self.X_const = add_constant(self.X_smote, prepend=True)
    logit_model = Logit(self.y_smote, self.X_const).fit()
    print(logit_model.summary())
    y_predict = logit_model.predict(self.X_const)
    # check a baseline model that just assigns the mode to each individual
    (mode_model_acc, mode_model_precision,
     mode_model_recall) = self.mode_cross_val(self.X_smote, self.y_smote)
    print("ModelAccuracy: {}, ModelPrecision: {}, ModelRecall: {}".format(
        mode_model_acc, mode_model_precision, mode_model_recall))
    model_acc, model_precision, model_recall = self.logit_cross_val(
        self.X_smote, self.y_smote)
    print("ModelAccuracy: {}, ModelPrecision: {}, ModelRecall: {}".format(
        model_acc, model_precision, model_recall))
    return self.X_smote, self.y_smote

def logreg(X, y, train_test=True, roc=True):
    '''
    INPUT:
        - X: 2-D feature matrix
        - y: target vector
        - train_test: boolean
        - roc: boolean
    OUTPUT:
        - fitted: fitted LogitResults

    Runs a statsmodels logistic regression and prints the summary.
    Uses train_test_split to split the data if train_test=True.
    Plots and shows the ROC curve if roc=True.
    Returns the fitted logistic regression model.
    '''
    if train_test:
        X_train, X_test, y_train, y_test = train_test_split(X, y)
    else:
        X_train, X_test, y_train, y_test = X, X, y, y
    vifs, filtered = get_vifs(X_train)
    X_train, X_test = X_train[filtered], X_test[filtered]
    log_reg = Logit(y_train, add_constant(X_train, has_constant='add'))
    fitted = log_reg.fit(method='bfgs', maxiter=500)
    try:
        print(fitted.summary())
    except Exception:
        return logreg(X, y)
    if roc:
        plot_roc(y_test,
                 fitted.predict(add_constant(X_test, has_constant='add')))
    return fitted

def analyze_statsmodel(df_X, df_y):
    X = df_X.to_numpy()
    X_const = add_constant(X, prepend=True)
    y = df_y.to_numpy()
    logit_model = Logit(y, X_const).fit()
    print(logit_model.summary())

def setupClass(cls):
    data = sm.datasets.spector.load()
    data.exog = sm.add_constant(data.exog, prepend=False)
    cls.res1 = Logit(data.endog, data.exog).fit(method="newton", disp=0)
    res2 = Spector()
    res2.logit()
    cls.res2 = res2

def __init__(self, endog, exog, exog_infl=None, offset=None,
             inflation='logit', exposure=None, missing='none', **kwargs):
    super(GenericZeroInflated, self).__init__(endog, exog, offset=offset,
                                              exposure=exposure,
                                              missing=missing, **kwargs)
    if exog_infl is None:
        self.k_inflate = 1
        self.exog_infl = np.ones((endog.size, self.k_inflate),
                                 dtype=np.float64)
    else:
        self.exog_infl = exog_infl
        self.k_inflate = exog_infl.shape[1]

    if len(exog.shape) == 1:
        self.k_exog = 1
    else:
        self.k_exog = exog.shape[1]

    self.infl = inflation
    if inflation == 'logit':
        self.model_infl = Logit(np.zeros(self.exog_infl.shape[0]),
                                self.exog_infl)
        self._hessian_inflate = self._hessian_logit
    elif inflation == 'probit':
        self.model_infl = Probit(np.zeros(self.exog_infl.shape[0]),
                                 self.exog_infl)
        self._hessian_inflate = self._hessian_probit
    else:
        raise TypeError("inflation == %s, which is not handled" % inflation)

    self.inflation = inflation
    self.k_extra = self.k_inflate

    if len(self.exog) != len(self.exog_infl):
        raise ValueError('exog and exog_infl have different numbers of '
                         'observations. `missing` handling is not supported')

    infl_names = ['inflate_%s' % i for i in self.model_infl.data.param_names]
    self.exog_names[:] = infl_names + list(self.exog_names)
    self.exog_infl = np.asarray(self.exog_infl, dtype=np.float64)

    self._init_keys.extend(['exog_infl', 'inflation'])
    self._null_drop_keys = ['exog_infl']

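# A hedged, self-contained sketch (assumption: the __init__ above is
# statsmodels' GenericZeroInflated, the base class of ZeroInflatedPoisson,
# so the 'logit' inflation branch is exercised by the fit below; the
# synthetic data are illustrative only).
import numpy as np
import statsmodels.api as sm
from statsmodels.discrete.count_model import ZeroInflatedPoisson

rng = np.random.default_rng(0)
n = 500
x = sm.add_constant(rng.normal(size=(n, 1)))
# synthetic zero-inflated counts: a Bernoulli "always zero" state mixed
# with a Poisson whose mean depends on the regressor
always_zero = rng.random(n) < 0.3
y = np.where(always_zero, 0, rng.poisson(np.exp(0.5 + 0.3 * x[:, 1])))

# exog_infl=None hits the k_inflate=1 constant-column branch in __init__
res = ZeroInflatedPoisson(y, x, inflation='logit').fit(disp=0)
print(res.summary())
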
def setupClass(cls):
    data = sm.datasets.spector.load()
    data.exog = sm.add_constant(data.exog, prepend=True)
    cls.model = Logit(data.endog, data.exog)
    cls.alphas = np.array([[0.1, 0.1, 0.1, 0.1],
                           [0.4, 0.4, 0.5, 0.5],
                           [0.5, 0.5, 1, 1]])  # / data.exog.shape[0]
    cls.res1 = DiscreteL1()
    cls.res1.sweep()

def sm_summary(X, docs, y):
    vectorizer.fit(docs)
    bc.fit(vector(docs), y)
    bc_predict = np.reshape(bc.predict(vector(docs)), (-1, 1))
    X = np.append(X, bc_predict, axis=1)
    X = add_constant(X)
    model = Logit(y, X).fit()
    print(model.summary())

def get_logit_coef(X, y, cols=None):
    if cols:
        X_fit = X[cols]
    else:
        X_fit = X
    X_fit = sm.add_constant(X_fit)
    logit = Logit(y, X_fit)
    fit = logit.fit()
    print(fit.summary())

def log_reg(X_train, Y_train, X_val):
    from statsmodels.discrete.discrete_model import Logit
    from statsmodels.tools import add_constant
    X_train = add_constant(X_train)
    X_val = add_constant(X_val)
    logit = Logit(Y_train, X_train)
    fit = logit.fit(method='bfgs', maxiter=10000)
    logitprobs = fit.predict(X_val)
    return logitprobs

def select_features(X, y):
    # binary target -> logistic regression; otherwise OLS
    if len(set(y)) == 2:
        model = Logit(y, X)
    else:
        model = OLS(y, X)
    res = model.fit(disp=False)
    # keep only features whose p-values survive the Holm correction
    features = multitest.multipletests(res.pvalues, method='holm')[0]
    X = X[:, features]
    return X

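# An illustrative run of select_features (the synthetic data are an
# assumption; Logit, OLS, and statsmodels.stats.multitest as multitest are
# assumed imported, as in the function above).
import numpy as np

rng = np.random.default_rng(1)
n = 200
X = np.column_stack([np.ones(n), rng.normal(size=(n, 2))])
# binary target driven only by the second column, so the Holm-corrected
# filter should tend to keep the constant and that column
p = 1 / (1 + np.exp(-(0.5 + 2.0 * X[:, 1])))
y = (rng.random(n) < p).astype(float)

X_kept = select_features(X, y)
print(X_kept.shape)  # columns surviving the significance filter
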
def logit_reg():
    X_smoted, X_test, y_smoted, y_test = prep_X_y(df, constant=True)
    lm = Logit(y_smoted, X_smoted).fit(method='powell')
    y_pred = lm.predict(X_test).round(0)
    print('Statsmodels Logit Regression--------------------------------')
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    return lm

def setupClass(cls):
    data = sm.datasets.spector.load()
    data.exog = sm.add_constant(data.exog, prepend=True)
    cls.alpha = 3 * np.array([0., 1., 1., 1.])  # / data.exog.shape[0]
    cls.res1 = Logit(data.endog, data.exog).fit_regularized(
        method="l1", alpha=cls.alpha, disp=0, trim_mode='size',
        size_trim_tol=1e-5, acc=1e-10, maxiter=1000)
    res2 = DiscreteL1()
    res2.logit()
    cls.res2 = res2

def _initialize(cls):
    y, x = cls.y, cls.x
    modp = Logit(y, x)
    cls.res2 = modp.fit(disp=0)

    mod = LogitPenalized(y, x, penal=cls.penalty)
    mod.pen_weight = 0
    cls.res1 = mod.fit(disp=0)

    cls.atol = 1e-4  # why not closer?

def logregress(xi, xj, *args, **kwargs):
    # label xi samples 0 and xj samples 1, keeping every third row
    x = np.vstack((xi, xj))[::3]
    y = np.vstack((np.zeros((xi.shape[0], 1)),
                   np.ones((xj.shape[0], 1))))[::3]
    scaler = MinMaxScaler([-1, 1])
    scaler.fit(x)
    x = scaler.transform(x)
    # clf = LogisticRegression(random_state=0).fit(x, y[:, 0])
    model = Logit(y, x)
    res = model.fit()
    # print(res.prsquared)
    return res.prsquared

def setup_class(cls):
    df = data_bin
    mod = GLM(df['constrict'], df[['const', 'log_rate', 'log_volumne']],
              family=families.Binomial())
    res = mod.fit(method="newton", tol=1e-10)

    from statsmodels.discrete.discrete_model import Logit
    mod2 = Logit(df['constrict'], df[['const', 'log_rate', 'log_volumne']])
    res2 = mod2.fit(method="newton", tol=1e-10)

    cls.infl1 = res.get_influence()
    cls.infl0 = res2.get_influence()

def create_Logit(X, y):
    '''
    Creates a statsmodels logistic regression model with 'linked click'
    as the target variable.

    INPUT: pandas dataframe
    OUTPUT: statsmodels logistic regression model
    '''
    X = X.copy()
    X['constant'] = 1
    X.pop('email_id')
    logit = Logit(y, X)
    model = logit.fit(maxiter=400)
    return model

def fit_model(X: pd.DataFrame, y: pd.Series) -> BinaryResultsWrapper:
    """Fits and returns the dynamicBt model.

    Args:
        X: predictor variables
        y: response variable

    Returns:
        Results wrapper
    """
    model = Logit(y, X).fit(method="newton")
    return model

def _initialize(cls):
    y, x = cls.y, cls.x
    modp = Logit(y, x[:, :cls.k_nonzero])
    cls.res2 = modp.fit(disp=0)

    mod = LogitPenalized(y, x, penal=cls.penalty)
    mod.pen_weight *= .5
    mod.penal.tau = 0.05
    cls.res1 = mod.fit(method='bfgs', maxiter=100, disp=0)

    cls.exog_index = slice(None, cls.k_nonzero, None)
    cls.atol = 5e-3

def licata_fit(t, max_rl=None):
    """Using symbols from Licata 2017; model from Busse 2011 JNeuro.

    Convention: -1 means left, 1 means right.
    """
    if max_rl is None:
        max_rl = np.max(np.abs((t.nR - t.nL).values))
    r = np.abs(t.nR - t.nL).values / max_rl
    r[t.side == 0] = -r[t.side == 0]

    ch = t.choice.values.copy()
    # ch[ch==0] = -1

    h_success = t.last_outcome.values.astype(int)
    h_success[t.last_choice == 0] *= -1
    # Interpretation: 1 if the previous trial was an R choice and correct,
    # -1 if an L choice and correct. Because R choices are coded as 1, fit
    # weights on this regressor read as: more positive means correct-and-stay,
    # more negative means correct-and-switch.

    h_fail = (t.last_outcome == 0).astype(int).values
    h_fail[t.last_choice == 0] *= -1
    # Interpretation: 1 if the previous trial was an R choice and an error,
    # -1 if an L choice and an error. More positive weight means
    # error-and-stay, more negative means error-and-switch.

    b0 = np.ones_like(r)
    y = ch
    x = np.array([b0, r, h_success, h_fail]).T
    # print(x.min(axis=0), x.max(axis=0))

    # run GLM
    # version 1:
    """
    logit_link = sm.genmod.families.links.logit
    glm_binom = sm.GLM(y, x, family=sm.families.Binomial(link=logit_link))
    glm_result = glm_binom.fit(maxiter=1000, method='bfgs')
    """
    # version 2:
    glm_result = Logit(y, x).fit(maxiter=1000, method='powell', disp=False)

    params = glm_result.params
    err = glm_result.bse
    return params, err

def score(self, X, confounder_types, assignment="assignment",
          store_model_fit=False, intercept=True,
          propensity_score_name="propensity score"):
    """
    Fit a propensity score model using the data in X and the confounders
    listed in confounder_types. This adds the propensity scores to the
    dataframe and returns the new dataframe.

    :param X: The data set, with (at least) an assignment, a set of
        confounders, and an outcome
    :param assignment: A categorical variable (currently, 0 or 1)
        indicating test or control group, respectively
    :param confounder_types: A dictionary of variable_name: variable_type
        pairs of strings, where variable_type is in {'c', 'o', 'd'}, for
        'continuous', 'ordinal', and 'discrete'
    :param store_model_fit: boolean, whether to store the model as an
        attribute of the class, as self.propensity_score_model
    :param intercept: Whether to include an intercept in the logistic
        regression model
    :return: A new dataframe with the propensity scores included
    """
    df = X[[assignment]].copy()
    regression_confounders = []
    for confounder, var_type in confounder_types.items():
        if var_type == "o" or var_type == "u":
            c_dummies = pd.get_dummies(X[[confounder]], prefix=confounder)
            if len(c_dummies.columns) == 1:
                df = pd.concat([df, c_dummies[c_dummies.columns]], axis=1)
                regression_confounders.extend(c_dummies.columns)
            else:
                df = pd.concat([df, c_dummies[c_dummies.columns[1:]]], axis=1)
                regression_confounders.extend(c_dummies.columns[1:])
        else:
            regression_confounders.append(confounder)
            df.loc[:, confounder] = X[confounder].copy()
    if intercept:
        df.loc[:, "intercept"] = 1.0
        regression_confounders.append("intercept")
    logit = Logit(df[assignment], df[regression_confounders])
    model = logit.fit()
    if store_model_fit:
        self.propensity_score_model = model
    X.loc[:, propensity_score_name] = model.predict(df[regression_confounders])
    return X

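# A hedged usage sketch: this score() appears to match the method on the
# causality library's PropensityScoreMatching class; the import path, column
# names, and synthetic data below are assumptions for illustration only.
import numpy as np
import pandas as pd
from causality.estimation.parametric import PropensityScoreMatching

rng = np.random.default_rng(2)
n = 300
X = pd.DataFrame({
    "assignment": rng.integers(0, 2, n),
    "age": rng.normal(40, 10, n),                  # continuous confounder ('c')
    "severity": rng.choice(["low", "med", "high"], n),  # ordinal ('o'), dummified
})
matcher = PropensityScoreMatching()
scored = matcher.score(X, confounder_types={"age": "c", "severity": "o"},
                       assignment="assignment")
print(scored["propensity score"].describe())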