def score(df):
    X, y = get_X_y(df)
    vif = variance_inflation_factor
    print('VIF: ')
    for i in range(X.shape[1]):
        print(vif(X, i))
    X = add_constant(X)
    model = Logit(y, X).fit()
    print(model.summary(xname=names))  # `names` is assumed to be defined at module level
    kfold = KFold(n_splits=5)
    accuracies = []
    precisions = []
    recalls = []
    for train_index, test_index in kfold.split(X):
        model = LogisticRegression(solver="lbfgs")
        model.fit(X[train_index], y[train_index])
        y_predict = model.predict(X[test_index])
        y_true = y[test_index]
        accuracies.append(accuracy_score(y_true, y_predict))
        precisions.append(precision_score(y_true, y_predict))
        recalls.append(recall_score(y_true, y_predict))
    print("Accuracy:", np.average(accuracies))
    print("Precision:", np.average(precisions))
    print("Recall:", np.average(recalls))
def basic_significance(dataframe, list_to_dummify, target):
    '''
    fits a non-regularized logistic model to target using dataframe predictors
    prints model accuracy and outputs significant coefficients ordered by absolute magnitude
    ----------
    list_to_dummify: a list of columns in string format that require dummification before modeling
    '''
    # process the dataframe
    df = dataframe.copy()
    df = dummify_columns(df, list_to_dummify)
    X, y = xy_split(df, target)
    X = add_constant(X)
    # fit the model
    logit = Logit(y, X)
    fitted_logit = logit.fit()
    # store accuracy
    c_mat = confusion_matrix(y, np.round(logit.predict(fitted_logit.params)))
    accuracy = sum(c_mat.diagonal()) / np.sum(c_mat)
    print('model train accuracy: %s' % (accuracy))
    # store significant coefs
    coefs = pd.DataFrame(fitted_logit.pvalues[fitted_logit.pvalues < 0.05])
    coefs['coefs'] = fitted_logit.params.filter(items=coefs.index)
    coefs.columns = ['p-values', 'coefs']
    coefs['abs_coefs'] = np.abs(coefs.coefs)
    coefs = coefs.sort_values(by='abs_coefs', ascending=False)
    coefs = coefs.drop('abs_coefs', axis=1)
    return fitted_logit, coefs
def SM_logit(X, y):
    """Fit a logistic regression using statsmodels Logit and return the coefficient array."""
    logit = Logit(y, X)
    result = logit.fit()
    coeff = result.params
    return coeff
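# Hypothetical usage sketch for SM_logit (synthetic data, not from the source);
# assumes statsmodels' Logit is already imported at module level as above.
import numpy as np
from statsmodels.tools import add_constant

rng = np.random.default_rng(0)
X_demo = add_constant(rng.normal(size=(200, 3)))                # intercept plus 3 predictors
y_demo = (X_demo[:, 1] + rng.normal(size=200) > 0).astype(int)  # binary outcome driven by the first predictor
print(SM_logit(X_demo, y_demo))                                 # coefficient array, intercept first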
def runAnalysis(self, y):
    log_res = [0 for i in range(0, self.m)]
    for i in range(0, self.m):
        I = [i]
        I.extend([-1])
        x = self.X[:, I]
        lr = LR(y, x)
        res_lr = lr.fit(disp=0)
        if self.params == "Coef":
            log_res[i] = float(res_lr.params[0])
        if self.params == "Odds":
            coef = float(res_lr.params[0])
            log_res[i] = math.exp(coef)
        """
        if self.params == "pval":
            log_res[i] = ;
        if self.params == "logpval":
            pval = ;
            if pval > 0:
                log_res[i] = -np.log10(pval);
            else:
                log_res[i] = -1.0;
        """
    return np.asarray(log_res)
def test_attack(n0, n1, numCov, err=.001):
    n = n0 + n1
    x = [rand.randint(0, 2) for i in range(0, n)]
    y = [1 for i in range(0, n)]
    for i in range(0, n0):
        y[i] = 0
    covs = [[rand.randint(0, 1) for i in range(0, n)] for j in range(0, numCov)]
    ORs = []
    for i in range(0, numCov):
        print(i)
        ret = [x]
        ret.append(covs[i])
        X = np.asarray(ret).T
        X = AC(X, False)
        lr = LR(y, X)
        res_lr = lr.fit(disp=0)
        OR = math.exp(float(res_lr.params[0]))
        ret.append(attack(y, covs, OR, err))
        ORs.append(OR)
    print("The number of matches is: ")
    for r in ret:
        print(len(r))
    print(ORs)
    for r in ret:
        print(r)
    print(len(ret))
def forward_selection(dataframe, target, list_to_dummify, criteria='bic'):
    '''
    runs a forward selection process to select the best predictor set based on bic or aic
    returns a dictionary with the variable set and aic/bic at each step
    ----------
    criteria: default value 'bic'; otherwise aic is used
    list_to_dummify: a list of columns in string format that require dummification before modeling
    '''
    # create target array, intercept-only dataframe, and list of variables to select from
    X = pd.DataFrame()
    y = dataframe[target]
    X['const'] = np.ones(dataframe.shape[0])
    var_list = list(dataframe.columns)
    var_list.remove(target)
    # create empty dictionary to store the output of each step
    models = {'model_vars': [], 'scoring_crit': []}
    # run until all variables have been selected
    while len(var_list) > 0:
        # empty list to store aic/bic values temporarily for this step
        crit_vals = []
        # try the remaining variables one by one to find the lowest-criterion model for the current step
        for var in var_list:
            # temporary df with all previously selected variables plus the new variable being tried
            tempX = pd.concat([X, dataframe[var]], axis=1)
            # dummify the variable if necessary
            if var in list_to_dummify:
                tempX = dummify_columns(tempX, [var])
            # fit the logistic model
            logit = Logit(y, tempX)
            fitted_logit = logit.fit()
            # store aic or bic for each variable attempted
            if criteria == 'bic':
                crit_vals += [fitted_logit.bic]
            else:
                crit_vals += [fitted_logit.aic]
        # find the index of the lowest-criterion model and the variable which produced it
        min_crit_idx = crit_vals.index(min(crit_vals))
        best_var = var_list[min_crit_idx]
        # add the best variable to the df
        X = pd.concat([X, dataframe[best_var]], axis=1)
        # store the variables and aic/bic for the best model at the current step
        models['model_vars'] += [list(X.columns)]
        models['scoring_crit'] += [min(crit_vals)]
        # dummify the added variable if necessary
        if best_var in list_to_dummify:
            X = dummify_columns(X, [best_var])
        # remove the added variable from the variable list and track progress
        var_list.remove(best_var)
        print('adding var: %s' % (best_var))
    return models
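# Hypothetical usage sketch for forward_selection (synthetic data, not from the
# source); assumes pandas, numpy, and statsmodels' Logit are imported at module
# level, and passes an empty list_to_dummify so dummify_columns is never needed.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
demo = pd.DataFrame(rng.normal(size=(300, 3)), columns=['x1', 'x2', 'x3'])
demo['outcome'] = (demo['x1'] - demo['x2'] + rng.normal(size=300) > 0).astype(int)
steps = forward_selection(demo, 'outcome', [], criteria='aic')
print(steps['model_vars'][-1], steps['scoring_crit'][-1])   # final variable set and its AIC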
def score(self, X, confounder_types, assignment='assignment', store_model_fit=False, intercept=True):
    df = X[[assignment]]
    regression_confounders = []
    for confounder, var_type in confounder_types.items():
        if var_type == 'o' or var_type == 'u':
            c_dummies = pd.get_dummies(X[[confounder]], prefix=confounder)
            if len(c_dummies.columns) == 1:
                df[c_dummies.columns] = c_dummies[c_dummies.columns]
                regression_confounders.extend(c_dummies.columns)
            else:
                df[c_dummies.columns[1:]] = c_dummies[c_dummies.columns[1:]]
                regression_confounders.extend(c_dummies.columns[1:])
        else:
            regression_confounders.append(confounder)
            df.loc[:, confounder] = X[confounder].copy()
    if intercept:
        df.loc[:, 'intercept'] = 1.
        regression_confounders.append('intercept')
    logit = Logit(df[assignment], df[regression_confounders])
    result = logit.fit()
    if store_model_fit:
        self.model_fit = result
    X.loc[:, 'propensity score'] = result.predict(df[regression_confounders])
    return X
def test_perfect_prediction():
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    iris_dir = os.path.join(cur_dir, '..', '..', 'genmod', 'tests', 'results')
    iris_dir = os.path.abspath(iris_dir)
    iris = np.genfromtxt(os.path.join(iris_dir, 'iris.csv'), delimiter=",",
                         skip_header=1)
    y = iris[:, -1]
    X = iris[:, :-1]
    X = X[y != 2]
    y = y[y != 2]
    X = sm.add_constant(X, prepend=True)
    mod = Logit(y, X)
    assert_raises(PerfectSeparationError, mod.fit)
    # turn off raising PerfectSeparationError
    mod.raise_on_perfect_prediction = False
    mod.fit(disp=False)  # should not raise
def logreg(X, y, train_test=True, roc=True):
    '''
    INPUT:
    - X: 2-D feature matrix
    - y: target vector
    - train_test: boolean
    - roc: boolean
    OUTPUT:
    - fitted: fitted LogitResults

    Runs statsmodels Logistic Regression and prints the summary.
    Uses train_test_split to split the data if train_test = True.
    Plots and shows the ROC curve if roc = True.
    Returns the fitted Logistic Regression model.
    '''
    if train_test:
        X_train, X_test, y_train, y_test = train_test_split(X, y)
    else:
        X_train, X_test, y_train, y_test = X, X, y, y
    vifs, filtered = get_vifs(X_train)
    X_train, X_test = X_train[filtered], X_test[filtered]
    log_reg = Logit(y_train, add_constant(X_train, has_constant='add'))
    fitted = log_reg.fit(method='bfgs', maxiter=500)
    try:
        print(fitted.summary())
    except:
        return logreg(X, y)
    if roc:
        plot_roc(y_test, fitted.predict(add_constant(X_test, has_constant='add')))
    return fitted
def score(self, X, confounder_types, assignment='assignment', store_model_fit=False, intercept=True):
    df = X[[assignment]]
    regression_confounders = []
    for confounder, var_type in confounder_types.items():
        if var_type == 'o' or var_type == 'u':
            c_dummies = pd.get_dummies(X[[confounder]], prefix=confounder)
            if len(c_dummies.columns) == 1:
                df[c_dummies.columns] = c_dummies[c_dummies.columns]
                regression_confounders.extend(c_dummies.columns)
            else:
                df[c_dummies.columns[1:]] = c_dummies[c_dummies.columns[1:]]
                regression_confounders.extend(c_dummies.columns[1:])
        else:
            regression_confounders.append(confounder)
            df.loc[:, confounder] = X[confounder].copy()
    if intercept:
        df.loc[:, 'intercept'] = 1.
        regression_confounders.append('intercept')
    logit = Logit(df[assignment], df[regression_confounders])
    result = logit.fit()
    if store_model_fit:
        self.model_fit = result
    X.loc[:, 'propensity score'] = result.predict(df[regression_confounders])
    return X
def test_attributes(self):
    data = ds.df
    mask_drop = data['apply'] == "somewhat likely"
    data2 = data.loc[~mask_drop, :].copy()
    # we need to remove the category also from the Categorical Index
    data2['apply'] = data2['apply'].cat.remove_categories("somewhat likely")

    # standard fit with pandas input
    modp = OrderedModel(data2['apply'],
                        data2[['pared', 'public', 'gpa']],
                        distr='logit')
    resp = modp.fit(method='bfgs', disp=False)

    exog = add_constant(data2[['pared', 'public', 'gpa']], prepend=False)
    mod_logit = Logit(data2['apply'].cat.codes, exog)
    res_logit = mod_logit.fit()

    attributes = "bse df_resid llf aic bic llnull".split()
    attributes += "llnull llr llr_pvalue prsquared".split()

    assert_allclose(resp.params[:3], res_logit.params[:3], rtol=1e-5)
    assert_allclose(resp.params[3], -res_logit.params[3], rtol=1e-5)
    for attr in attributes:
        assert_allclose(getattr(resp, attr), getattr(res_logit, attr), rtol=1e-4)

    resp = modp.fit(method='bfgs', disp=False,
                    cov_type="hac", cov_kwds={"maxlags": 2})
    res_logit = mod_logit.fit(method='bfgs', disp=False,
                              cov_type="hac", cov_kwds={"maxlags": 2})
    for attr in attributes:
        assert_allclose(getattr(resp, attr), getattr(res_logit, attr), rtol=1e-4)

    resp = modp.fit(method='bfgs', disp=False, cov_type="hc1")
    res_logit = mod_logit.fit(method='bfgs', disp=False, cov_type="hc1")
    for attr in attributes:
        assert_allclose(getattr(resp, attr), getattr(res_logit, attr), rtol=1e-4)
def Log_Calc(y, x):
    x1 = AC(x, False)
    lr = LR(y, x1)
    try:
        res_lr = lr.fit(disp=0)
    except:
        return -1
    return math.exp(res_lr.params[0])
def log_reg(X_train, Y_train, X_val):
    from statsmodels.discrete.discrete_model import Logit
    from statsmodels.tools import add_constant
    X_train = add_constant(X_train)
    X_val = add_constant(X_val)
    logit = Logit(Y_train, X_train)
    fit = logit.fit(method='bfgs', maxiter=10000)
    logitprobs = fit.predict(X_val)
    return logitprobs
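# Hypothetical usage sketch for log_reg (synthetic data, not from the source).
import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
train = pd.DataFrame(rng.normal(size=(200, 2)), columns=['f1', 'f2'])
labels = (train['f1'] + rng.normal(size=200) > 0).astype(int)
holdout = pd.DataFrame(rng.normal(size=(20, 2)), columns=['f1', 'f2'])
probs = log_reg(train, labels, holdout)   # predicted probabilities for the holdout rows
print(probs[:5])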
def select_features(X, y):
    if len(list(set(list(y)))) == 2:
        model = Logit(y, X)
    else:
        model = OLS(y, X)
    res = model.fit(disp=False)
    features = ind = multitest.multipletests(res.pvalues, method='holm')[0]
    X = X[:, features]
    return X
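# Hypothetical usage sketch for select_features (synthetic data, not from the
# source); assumes Logit, OLS, and statsmodels.stats.multitest are imported at
# module level, as the function body requires.
import numpy as np

rng = np.random.default_rng(3)
X_demo = rng.normal(size=(500, 5))
y_demo = (X_demo[:, 0] - X_demo[:, 1] + rng.normal(size=500) > 0).astype(int)
X_kept = select_features(X_demo, y_demo)   # keeps only columns surviving the Holm correction
print(X_kept.shape)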
def get_logit_coef(X, y, cols=None):
    if cols:
        X_fit = X[cols]
    else:
        X_fit = X
    X_fit = sm.add_constant(X_fit)
    logit = Logit(y, X_fit)
    fit = logit.fit()
    print(fit.summary())
class LogisticRegression(object):
    def __init__(self):
        pass

    def fit(self, X, y, **kwargs):
        self.model = Logit(y, X)
        self.result = self.model.fit()

    def predict_proba(self, X):
        return self.result.predict(X)
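# Hypothetical usage sketch for the Logit-backed wrapper defined directly above
# (synthetic data, not from the source); assumes statsmodels' Logit is imported
# at module level. The wrapper adds no intercept, so one is included in X here.
import numpy as np

rng = np.random.default_rng(4)
X_demo = np.column_stack([np.ones(300), rng.normal(size=(300, 2))])  # explicit intercept column
y_demo = (X_demo[:, 1] + rng.normal(size=300) > 0).astype(int)
clf = LogisticRegression()
clf.fit(X_demo, y_demo)
print(clf.predict_proba(X_demo[:5]))   # fitted probabilities for the first five rows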
def _initialize(cls):
    y, x = cls.y, cls.x
    modp = Logit(y, x)
    cls.res2 = modp.fit(disp=0)

    mod = LogitPenalized(y, x, penal=cls.penalty)
    mod.pen_weight = 0
    cls.res1 = mod.fit(disp=0)

    cls.atol = 1e-4  # why not closer ?
class LogisticRegression(object):
    def __init__(self):
        pass

    def fit(self, X, y, **kwargs):
        from statsmodels.discrete.discrete_model import Logit
        self.model = Logit(y, X)
        self.result = self.model.fit()

    def predict_proba(self, X):
        return self.result.predict(X)
def logregress(xi, xj, *args, **kwargs):
    x = np.vstack((xi, xj))[::3]
    y = np.vstack((np.zeros((xi.shape[0], 1)),
                   np.ones((xj.shape[0], 1))))[::3]
    scaler = MinMaxScaler([-1, 1])
    scaler.fit(x)
    x = scaler.transform(x)
    # clf = LogisticRegression(random_state=0).fit(x, y[:, 0])
    model = Logit(y, x)
    res = model.fit()
    # print(res.prsquared)
    return res.prsquared
def setup_class(cls):
    df = data_bin
    mod = GLM(df['constrict'], df[['const', 'log_rate', 'log_volumne']],
              family=families.Binomial())
    res = mod.fit(method="newton", tol=1e-10)

    from statsmodels.discrete.discrete_model import Logit
    mod2 = Logit(df['constrict'], df[['const', 'log_rate', 'log_volumne']])
    res2 = mod2.fit(method="newton", tol=1e-10)

    cls.infl1 = res.get_influence()
    cls.infl0 = res2.get_influence()
def create_Logit(X, y):
    '''
    creates a statsmodels logistic regression model with 'linked click' as the target variable
    INPUT: pandas dataframe
    OUTPUT: statsmodels Logistic Regression model
    '''
    X = X.copy()
    X['constant'] = 1
    X.pop('email_id')
    logit = Logit(y, X)
    model = logit.fit(maxiter=400)
    return model
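# Hypothetical usage sketch for create_Logit (synthetic data, not from the
# source); the frame needs an 'email_id' column because the function drops it.
import numpy as np
import pandas as pd

rng = np.random.default_rng(5)
emails = pd.DataFrame({
    'email_id': np.arange(400),
    'hour_sent': rng.integers(0, 24, size=400),
    'past_opens': rng.poisson(3, size=400),
})
clicked = (emails['past_opens'] + rng.normal(size=400) > 3).astype(int)
model = create_Logit(emails, clicked)
print(model.params)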
def _initialize(cls):
    y, x = cls.y, cls.x
    modp = Logit(y, x[:, :cls.k_nonzero])
    cls.res2 = modp.fit(disp=0)

    mod = LogitPenalized(y, x, penal=cls.penalty)
    mod.pen_weight *= .5
    mod.penal.tau = 0.05
    cls.res1 = mod.fit(method='bfgs', maxiter=100, disp=0)

    cls.exog_index = slice(None, cls.k_nonzero, None)
    cls.atol = 5e-3
def score(
    self,
    X,
    confounder_types,
    assignment="assignment",
    store_model_fit=False,
    intercept=True,
    propensity_score_name="propensity score",
):
    """
    Fit a propensity score model using the data in X and the confounders listed in confounder_types.
    This adds the propensity scores to the dataframe, and returns the new dataframe.

    :param X: The data set, with (at least) an assignment, set of confounders, and an outcome
    :param assignment: A categorical variable (currently, 0 or 1) indicating test or control group, resp.
    :param outcome: The outcome of interest. Should be real-valued or ordinal.
    :param confounder_types: A dictionary of variable_name: variable_type pairs of strings, where
        variable_type is in {'c', 'o', 'd'}, for 'continuous', 'ordinal', and 'discrete'.
    :param store_model_fit: boolean, whether to store the model as an attribute of the class, as
        self.propensity_score_model
    :param intercept: Whether to include an intercept in the logistic regression model
    :return: A new dataframe with the propensity scores included
    """
    df = X[[assignment]].copy()
    regression_confounders = []
    for confounder, var_type in confounder_types.items():
        if var_type == "o" or var_type == "u":
            c_dummies = pd.get_dummies(X[[confounder]], prefix=confounder)
            if len(c_dummies.columns) == 1:
                df = pd.concat([df, c_dummies[c_dummies.columns]], axis=1)
                regression_confounders.extend(c_dummies.columns)
            else:
                df = pd.concat([df, c_dummies[c_dummies.columns[1:]]], axis=1)
                regression_confounders.extend(c_dummies.columns[1:])
        else:
            regression_confounders.append(confounder)
            df.loc[:, confounder] = X[confounder].copy()
    if intercept:
        df.loc[:, "intercept"] = 1.0
        regression_confounders.append("intercept")
    logit = Logit(df[assignment], df[regression_confounders])
    model = logit.fit()
    if store_model_fit:
        self.propensity_score_model = model
    X.loc[:, propensity_score_name] = model.predict(df[regression_confounders])
    return X
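# Hypothetical usage sketch (not from the source): the method above is shown
# detached from its class, so this attaches it to a minimal holder class and
# scores a synthetic dataframe. Assumes pandas, numpy, and statsmodels' Logit
# are imported at module level.
import numpy as np
import pandas as pd

class _DemoScorer:
    pass

_DemoScorer.score = score   # reuse the propensity-score method defined above

rng = np.random.default_rng(6)
frame = pd.DataFrame({
    'assignment': rng.integers(0, 2, size=500),
    'age': rng.normal(40, 10, size=500),
    'smoker': rng.integers(0, 2, size=500),
})
scored = _DemoScorer().score(frame, {'age': 'c', 'smoker': 'c'}, store_model_fit=True)
print(scored['propensity score'].head())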
def regressor(y, X, model_type=model_type):
    if model_type == "linear":
        regressor = sm.OLS(y, X).fit()
    elif model_type == "logistic":
        # df = pd.DataFrame({'x':[X], 'y':[y]})
        # regressor = sm.logit('y~x', data=df)
        regressor = Logit(y, X)
        regressor = regressor.fit()
    else:
        print("\nWrong model type: " + model_type + "\nThe linear model type is selected.")
        model_type = "linear"
        regressor = sm.OLS(y, X).fit()
    return regressor
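# Hypothetical usage sketch for regressor (synthetic data, not from the source);
# assumes statsmodels.api as sm and Logit are imported at module level, and
# passes model_type explicitly since the default value references a module-level name.
import numpy as np

rng = np.random.default_rng(7)
X_demo = np.column_stack([np.ones(250), rng.normal(size=(250, 2))])
y_demo = (X_demo[:, 1] + rng.normal(size=250) > 0).astype(int)
fit = regressor(y_demo, X_demo, model_type="logistic")
print(fit.params)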
def test_attack_nocovar(n0, n1, err=.001):
    n = n0 + n1
    x = [rand.randint(0, 2) for i in range(0, n)]
    y = [1 for i in range(0, n)]
    for i in range(0, n0):
        y[i] = 0
    x = AC(x, False)
    lr = LR(y, x)
    res_lr = lr.fit(disp=0)
    OR = math.exp(float(res_lr.params[0]))
    ret = attack_no_covar(n0, n1, OR, err)
    print("The number of matches is: " + str(len(ret)))
    print(OR)
    for r in ret:
        print(r)
def _LRT(dependent_var_name: str,
         independent_var_names: List[str],
         study_df: pd.DataFrame) -> Tuple[dict, dict]:
    from statsmodels.discrete.discrete_model import Logit
    from scipy.linalg import LinAlgError
    from statsmodels.tools.sm_exceptions import PerfectSeparationError
    from tqdm import tqdm

    dic_pvalues = {}
    dic_errors = {}
    for independent_var_name in tqdm(independent_var_names, position=0, leave=True):
        print(independent_var_name)
        subset_df = study_df.loc[:, [dependent_var_name, independent_var_name]]\
            .dropna(how="any")\
            .copy()
        if subset_df.shape[0] == 0:
            dic_pvalues[independent_var_name] = np.nan
            dic_errors[independent_var_name] = "All NaN"
            continue
        if subset_df[independent_var_name].dtype in ["object", "bool"]:
            subset_df = pd.get_dummies(subset_df,
                                       columns=[independent_var_name],
                                       drop_first=False)\
                .iloc[:, 0:-1]
        y = subset_df[dependent_var_name].cat.codes
        X = subset_df.drop(dependent_var_name, axis=1)\
            .assign(intercept=1)
        model = Logit(y, X)
        try:
            results = model.fit(disp=0)
            params = results.params.drop("intercept", axis=0)
            conf = np.exp(results.conf_int().drop("intercept", axis=0))
            conf['OR'] = np.exp(params)
            conf["pvalue"] = results.pvalues.drop("intercept", axis=0)
            conf = conf.rename({0: 'lb', 1: 'ub'}, axis=1)
            dic_or = conf.to_dict(orient="index")
            dic_pvalues[independent_var_name] = {"llr_pvalue": results.llr_pvalue,
                                                 **dic_or}
        except (LinAlgError, PerfectSeparationError) as e:
            dic_pvalues[independent_var_name] = np.nan
            dic_errors[independent_var_name] = str(e)
    return dic_pvalues, dic_errors
class LogReg:
    def __init__(self):
        self.coef_ = None

    def fit(self, X, y):
        X = add_constant(X)
        self.lr = Logit(y, X)
        self.l_fitted = self.lr.fit()
        self.coef_ = self.l_fitted.params[:-1]

    def predict(self, X):
        if self.coef_ is None:
            print('you must first fit the model')
            return
        X = add_constant(X)
        return self.lr.predict(self.l_fitted.params, X)
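# Hypothetical usage sketch for LogReg (synthetic data, not from the source);
# assumes Logit and add_constant are imported at module level. Note that coef_
# stores all but the last fitted parameter, which presumes the constant column
# sits last in the design matrix.
import numpy as np

rng = np.random.default_rng(8)
X_demo = rng.normal(size=(300, 2))
y_demo = (X_demo[:, 0] + rng.normal(size=300) > 0).astype(int)
reg = LogReg()
reg.fit(X_demo, y_demo)
print(reg.predict(X_demo[:5]))   # fitted probabilities for the first five rows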
def get_logit_results(feature_set, feature_names, n, pos_words, neg_words, diff_func=None):
    """
    Fit logistic regression to predict pos_words from neg_words
    according to the mean difference between their feature values
    (up to training month n).

    Parameters:
    -----------
    feature_set : [pandas.DataFrame]
        Rows = words, cols = dates.
    feature_names : [str]
    n : int
    pos_words : [str]
    neg_words : [str]
    diff_func : func(x,y : z)
        Compute difference z between vectors x and y.

    Returns:
    --------
    logit_results : statsmodels.discrete.discrete_model.LogitResults
    """
    M = len(feature_set)
    stats = pd.concat([s.iloc[:, 0:n] for s in feature_set], axis=1)
    if diff_func is None:
        diff_func = lambda x, y: x - y
    X, Y = get_differenced_data(pos_words, neg_words, stats, diff_func)
    # mean of differences
    X_mean = np.hstack([
        np.mean(X.iloc[:, i * n:(i + 1) * n - 1], axis=1).values.reshape(-1, 1)
        for i in range(M)
    ])
    X = pd.DataFrame(MinMaxScaler().fit_transform(X_mean), columns=feature_names)
    # remove stats with 0 variance
    X = X.loc[:, X.var() > 0.]
    X = add_constant(X)
    logit = Logit(Y, X)
    logit_results = logit.fit()
    return logit_results
def attack_no_covar(n0, n1, OR, err=.001):
    n = n0 + n1
    y = [1 for i in range(0, n)]
    for i in range(0, n0):
        y[i] = 0
    num_match = 0
    ret = []
    ## iterate through all possibilities and test
    for i0 in range(0, n0 + 1):  ## number in controls with 2 minor alleles
        for i1 in range(0, (n0 - i0 + 1)):  ## number in controls with 1 minor allele
            for j0 in range(0, n1 + 1):  ## number in cases with 2 minor alleles
                for j1 in range(0, (n1 - j0 + 1)):  ## number in cases with 1 minor allele
                    i2 = n0 - i0 - i1
                    j2 = n1 - j0 - j1
                    x = [0 for i in range(0, n)]
                    x[:i0] = [2 for i in range(0, i0)]
                    cur = i0
                    x[cur:cur + i1] = [1 for i in range(0, i1)]
                    cur = cur + i1
                    cur = cur + i2
                    x[cur:cur + j0] = [2 for i in range(0, j0)]
                    cur = cur + j0
                    x[cur:cur + j1] = [1 for i in range(0, j1)]
                    res_lr = ""
                    x = AC(x, False)
                    lr = LR(y, x)
                    try:
                        res_lr = lr.fit(disp=0)
                    except:
                        # print(x)
                        continue
                    try:
                        OR_cur = math.exp(float(res_lr.params[0]))
                    except:
                        continue
                    if round_sig(OR_cur, err) == round_sig(OR, err):
                        num_match = num_match + 1
                        ret.append([i0, i1, i2, j0, j1, j2])
                        # print("match!\n")
    return ret
def simple_model(motif_results_A, non_results_A, motif_results_B, non_results_B):
    all_results = motif_results_A + non_results_A + motif_results_B + non_results_B
    is_diplo = np.array([check_is_diplo(result) for result in all_results], dtype="int")
    total_gpc = (len(motif_results_A) + len(non_results_A))
    is_gpc = np.zeros(len(all_results), dtype="int")
    is_gpc[:total_gpc] = 1
    motif = np.zeros(len(all_results), dtype="int")
    motif[:len(motif_results_A)] = 1
    motif[total_gpc:total_gpc + len(motif_results_B)] = 1
    X = np.hstack((np.ones_like(is_gpc)[:, None],
                   is_gpc[:, None],
                   is_diplo[:, None],
                   (is_gpc * is_diplo)[:, None]))
    print(np.sum(X * motif[:, None], axis=0) / np.sum(X, axis=0))
    print(np.sum((1 - X) * motif[:, None], axis=0) / np.sum((1 - X), axis=0))
    y = motif
    model = Logit(y, X)
    result = model.fit()
    print(result.summary())
def main():
    """ Method to test the implementation """
    my_log = MyLogisticRegression()
    sk_log = LogisticRegression(C=1000)
    X = np.random.random((50, 4))
    y = np.random.randint(2, size=50)[:, None]
    my_log.fit(X, y)
    sk_log.fit(X, y)
    exog = add_constant(X)
    lr = Logit(y, exog)
    lrf = lr.fit()
    # print(sk_log.coef_, my_log.W, lrf.summary())
    assert np.allclose(sk_log.coef_, my_log.W.T, .1), 'incorrect coefs'
def attack(y, cov, OR, err=.001, bnd=.5, numStep=10):
    n = n0 + n1  # n0 and n1 are assumed to be defined at module level
    num_match = 0
    ret = []
    COV = [y]
    COV.extend(cov)
    COV = np.asarray(COV).T
    iter = IterTable(COV)
    cur = 0
    iter.next()
    [yCur, covCur] = iter.get()
    while not iter.isDone():
        # print(cur)
        cur = cur + 1
        [yCur, covCur] = iter.get()
        lr = LR(yCur, AC(covCur, False))
        try:
            res_lr = lr.fit(disp=0)
        except:
            iter.next()
            continue
        OR_cur = 1.0
        try:
            OR_cur = math.exp(float(res_lr.params[0]))
        except:
            iter.next()
            continue
        if abs(OR_cur - OR) < err:
            print("match")
            num_match = num_match + 1
            ret.append(iter.getTable())
        # if abs(OR - OR_cur) / OR_cur > bnd:
        #     for i in range(0, numStep):
        #         iter.next()
        iter.next()
    return ret
def logit(df, y_var, X_vars, add_intercept=True):
    """
    This function replicates logit in STATA, for the logit model.
    There should be at least one fixed-effect variable and at most two.
    The y variable should be a 0-1 variable.

    Inputs.
    ---------
    df: pd.DataFrame, the data for the regression.
    y_var: str, the column name of the dependent variable
    X_vars: list of str, the list of explanatory variable names

    Outputs.
    ---------
    res: obj
    """
    new_df = df.copy()
    new_df = new_df.dropna()
    y = new_df[y_var]
    if add_intercept:
        new_df['intercept'] = 1.0
        X = new_df[['intercept'] + X_vars]
    else:
        X = new_df[X_vars]
    logit_mod = Logit(endog=y, exog=X, check_rank=True, missing="drop")
    res = logit_mod.fit(start_params=None, method='newton', maxiter=35,
                        full_output=1, disp=1, callback=None)
    return res
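# Hypothetical usage sketch for logit (synthetic data, not from the source);
# assumes pandas and statsmodels' Logit are imported at module level.
import numpy as np
import pandas as pd

rng = np.random.default_rng(9)
firms = pd.DataFrame({
    'treated': rng.integers(0, 2, size=400),
    'size': rng.normal(size=400),
})
firms['default'] = ((firms['treated'] - firms['size'] + rng.normal(size=400)) > 0).astype(int)
res = logit(firms, 'default', ['treated', 'size'])
print(res.summary())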