def VariableMiningMnlogit(df, y):
    """Searches variables using multinomial logistic regression to find
    ones that predict the target dependent variable 'y'.

    Args:
        df (DataFrame): DataFrame that holds all the variables.
        y (string): Column name of dependent variable y.

    Returns:
        variables (list): A list of tuples, each containing a pseudo
            R-squared value and a variable name.
    """
    variables = []
    for name in df.columns:
        try:
            # skip near-constant variables
            if df[name].var() < 1e-7:
                continue
            formula = '{} ~ {}'.format(y, name)
            model = smf.mnlogit(formula, data=df)
            # skip variables whose missing values drop more than half the rows
            nobs = len(model.endog)
            if nobs < len(df) / 2:
                continue
            results = model.fit()
        except Exception:
            continue
        variables.append((results.prsquared, name))
    return variables
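A minimal usage sketch with toy data (the DataFrame and column names here are illustrative, not from the original source):

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
df = pd.DataFrame({'x1': rng.normal(size=300), 'x2': rng.normal(size=300)})
# three-class target driven by x1; x2 is pure noise
df['target'] = pd.cut(df.x1 + 0.5 * rng.normal(size=300), 3, labels=False)

variables = VariableMiningMnlogit(df, 'target')
for prsq, name in sorted(variables, reverse=True):  # best predictors first
    print('%0.3f  %s' % (prsq, name))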
def fit(self, data: List[pd.DataFrame], fit_method: str = 'ncg', disp: int = 0) -> None:
    # fit one multinomial logit model per timestep
    num_timesteps = len(data)
    self.models = []
    for t in range(num_timesteps):
        model = smf.mnlogit('A ~ ' + self.formula,
                            data=data[t].astype({'A': int}))
        self.models.append(model.fit(method=fit_method, disp=disp))
def __init__(self, formula=None, data=None, weights=None, **kwargs):
    if formula:
        y, X = patsy.dmatrices(formula, data, 1)
        self._y_design_info = y.design_info
        self._X_design_info = X.design_info
        self._model = mnlogit(formula, data, **kwargs)
        self._fit = self._model.fit(maxiter=10000)
        self._betas = self._fit.params
    else:
        self._y_design_info = None
        self._X_design_info = None
        self._model = None
        self._fit = None
        self._betas = None
def __init__(self, formula=None, data=None, weights=None, **kwargs):
    # Convert all variables raised to a power to float64. This prevents
    # mis-specification of probabilities in cases of variable overflow
    # (if the original var was compressed to a smaller-bit integer/float).
    if isinstance(data, pd.DataFrame):
        power_vars = list(set(re.findall(r'(?<=power\().+?(?=,)', formula)))
        for var in power_vars:
            data[var] = data[var].astype('float64')
    if formula:
        y, X = patsy.dmatrices(formula, data, 1)
        self._y_design_info = y.design_info
        self._X_design_info = X.design_info
        self._model = mnlogit(formula, data, **kwargs)
        self._fit = self._model.fit(maxiter=10000)
        self._betas = self._fit.params
    else:
        self._y_design_info = None
        self._X_design_info = None
        self._model = None
        self._fit = None
        self._betas = None
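For reference, the regex pulls the first argument out of each power(var, n) term in the formula (the power function itself is assumed to be supplied to the formula namespace elsewhere); a small illustration:

import re

formula = 'y ~ power(x1, 2) + x2 + power(x3, 3)'
power_vars = list(set(re.findall(r'(?<=power\().+?(?=,)', formula)))
print(power_vars)  # ['x1', 'x3'] in some order, since set() drops duplicates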
def Forward_Select(data, response, modeltype, metric):
    """Select variables using forward selection before building a linear model.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response
    response : string, name of response column in data
    modeltype : accepts both "Regression" and "Classification" type problems
    metric : the criterion whose improvement drives selection. It must be a
        known statsmodels results attribute, e.g. 'rsquared', 'rsquared_adj',
        'aic', or 'bic'.

    Returns:
    --------
    model : an "optimal" fitted statsmodels linear model with an intercept,
        selected by forward selection and evaluated by the chosen metric
        (adjusted R-squared, AIC, BIC, etc.).
    selected : variables that are selected by this algorithm.
    """
    ############################################################################
    ##### CAUTION: if you have Scipy 1.0 you need the workaround below, since
    ##### stats.chisqprob was removed; without it glm.summary() raises an ERROR.
    ##### This is a dumb workaround until Scipy 1.0 is patched - I should not
    ##### have upgraded from scipy 0.19 to scipy 1.0, it is full of bugs!
    stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
    #### For those who have Scipy 0.19 or older, you can comment out the line above.
    ############################################################################
    start_time = time.time()
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    maxiter = 1000
    if metric in ('rsquared', 'rsquared_adj'):
        current_score, best_new_score = 0.0, 0.0
    else:
        current_score, best_new_score = np.inf, np.inf
    iterations = 1
    if data[response].dtype == object:
        response_char = 'C(' + response + ')'
        data[response], factors = factorize_class(data[response])
    else:
        response_char = response
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            if data[candidate].dtype == object:
                ### Categorical variables are wrapped in C(). In smf formula
                ### notation you don't have to add 1: the intercept is automatic.
                if selected == []:
                    formula = "{} ~ {}".format(response_char, 'C(' + candidate + ')')
                else:
                    formula = "{} ~ {} + {}".format(response_char,
                                                    ' + '.join(selected),
                                                    'C(' + candidate + ')')
            else:
                formula = "{} ~ {}".format(response_char,
                                           ' + '.join(selected + [candidate]))
            if modeltype == 'Regression':
                model = smf.ols(formula, data).fit()
            else:
                if len(data[response].value_counts()) > 2:
                    try:
                        model = smf.mnlogit(formula=formula, data=data).fit(
                            maxiter=maxiter, disp=0)
                    except Exception:
                        model = smf.glm(formula=formula, data=data,
                                        family=sm.families.Binomial()).fit(
                                            maxiter=maxiter)
                else:
                    try:
                        model = smf.logit(formula=formula, data=data).fit(
                            maxiter=maxiter, disp=0)
                    except Exception:
                        model = smf.glm(formula=formula, data=data,
                                        family=sm.families.Binomial()).fit(
                                            maxiter=maxiter)
            try:
                score = getattr(model, metric)
            except AttributeError:
                metric = 'aic'
                print('Metric not recognized. Choosing default = %s' % metric)
                score = getattr(model, metric)
            iterations += 1
            scores_with_candidates.append((score, candidate))
        if metric in ('rsquared', 'rsquared_adj'):
            # higher is better: ascending sort, so pop() takes the best
            scores_with_candidates.sort(reverse=False)
        else:
            # lower is better (aic/bic): descending sort, so pop() takes the best
            scores_with_candidates.sort(reverse=True)
        best_new_score, best_candidate = scores_with_candidates.pop()
        if metric in ('rsquared', 'rsquared_adj'):
            if current_score < best_new_score:
                remaining.remove(best_candidate)
                selected.append(best_candidate)
                current_score = best_new_score
        else:
            if current_score > best_new_score:
                remaining.remove(best_candidate)
                selected.append(best_candidate)
                current_score = best_new_score
    tempform = []
    print('Time taken for %d iterations (minutes): %0.2f'
          % (iterations, (time.time() - start_time) / 60))
    for eachcol in selected:
        if tempform == []:
            if data[eachcol].dtype == object:
                ### Categorical variables are wrapped in C()
                tempform = 'C(' + eachcol + ')'
            else:
                tempform = eachcol
        else:
            if data[eachcol].dtype == object:
                ### Categorical variables are wrapped in C()
                tempform = "{} + {}".format(tempform, 'C(' + eachcol + ')')
            else:
                tempform = "{} + {}".format(tempform, eachcol)
    ### when all is done, put the formula together ####
    formula = "{} ~ {}".format(response_char, tempform)
    if modeltype == 'Regression':
        model = smf.ols(formula, data).fit()
    else:
        if len(data[response].value_counts()) > 2:
            try:
                model = smf.mnlogit(formula=formula, data=data).fit(
                    maxiter=maxiter, disp=0)
            except Exception:
                model = smf.glm(formula=formula, data=data,
                                family=sm.families.Binomial()).fit(maxiter=maxiter)
        else:
            try:
                model = smf.logit(formula, data).fit(maxiter=maxiter, disp=0)
            except Exception:
                model = smf.glm(formula=formula, data=data,
                                family=sm.families.Binomial()).fit(maxiter=maxiter)
    print('Score = %0.2f, Number Selected = %d\nmodel formula: %s'
          % (current_score, len(selected), formula))
    print('Time taken for Final Model (minutes): %0.2f'
          % ((time.time() - start_time) / 60))
    print(model.summary())
    return model, selected
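A minimal usage sketch for the regression path, with toy data (it assumes the module-level imports the function relies on, shown here for self-containment):

import time
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

rng = np.random.default_rng(1)
df = pd.DataFrame({'x1': rng.normal(size=200),
                   'x2': rng.normal(size=200),
                   'x3': rng.normal(size=200)})
df['y'] = 2.0 * df.x1 - 1.0 * df.x2 + rng.normal(size=200)

# forward-select predictors of a continuous response by adjusted R-squared
model, selected = Forward_Select(df, 'y', 'Regression', 'rsquared_adj')
print(selected)  # x1 and x2 come first; noise x3 may or may not survive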
## read the variables
relevant = MiningReport(variables, n=60)

## make model with relevant predictive variables
model = smf.poisson('numbabes ~ ager + educat + C(race) + totincr', data=join)
results = model.fit()
results.summary()

## predict numbabes
columns = ['ager', 'race', 'educat', 'totincr']
new = pd.DataFrame([[35, 1, 16, 14]], columns=columns)
predict_babes = results.predict(new)
print('predict_babes:\n', predict_babes)

## predict married/divorced
model = smf.mnlogit('rmarital ~ ager + C(race) + totincr + educat', data=join)
results = model.fit()
results.summary()

## test individual: mnlogit predict() returns one probability per rmarital category
new = pd.DataFrame({
    'ager': [25],
    'race': [2],
    'totincr': [11],
    'educat': [12]
})
predict = results.predict(new)
print('predict:\n', predict)
def _estimate_gstar_(self, pooled_data, data_to_predict, distribution):
    # Treatment of individual
    f = sm.families.family.Binomial()
    treat_i_model = smf.glm(self._gi_model, pooled_data, family=f).fit()
    pred = treat_i_model.predict(data_to_predict)
    pr_i = np.where(data_to_predict[self.exposure] == 1, pred, 1 - pred)
    if self._verbose_:
        print('==============================================================================')
        print('gstar-model: A')
        print(treat_i_model.summary())

    # Treatment of direct contacts
    if distribution is None:
        f = sm.families.family.Binomial()
        cond_vars = patsy.dmatrix(self._gs_model, pooled_data, return_type='matrix')
        pred_vars = patsy.dmatrix(self._gs_model, data_to_predict, return_type='matrix')
        pr_s = np.array([1.] * data_to_predict.shape[0])
        for c in self._nonparam_cols_:
            # Estimating probability
            treat_s_model = sm.GLM(pooled_data[c], cond_vars, family=f).fit()
            pred = treat_s_model.predict(pred_vars)
            pr_s *= np.where(data_to_predict[c] == 1, pred, 1 - pred)
            if self._verbose_:
                print('==============================================================================')
                print('gstar-model: ' + c)
                print(treat_s_model.summary())
            # Stacking vector to the end of the array
            cond_vars = np.c_[cond_vars, np.array(pooled_data[c])]
            pred_vars = np.c_[pred_vars, np.array(data_to_predict[c])]

    elif distribution == 'normal':
        gs_model = self._gs_measure_ + ' ~ ' + self._gs_model
        treat_s_model = smf.ols(gs_model, pooled_data).fit()
        pred = treat_s_model.predict(data_to_predict)
        pr_s = norm.pdf(data_to_predict[self._gs_measure_], pred,
                        np.sqrt(treat_s_model.mse_resid))
        if self._verbose_:
            print('==============================================================================')
            print('gstar-model: ' + self._gs_measure_)
            print(treat_s_model.summary())

    elif distribution == 'poisson':
        f = sm.families.family.Poisson()
        gs_model = self._gs_measure_ + ' ~ ' + self._gs_model
        treat_s_model = smf.glm(gs_model, pooled_data, family=f).fit()
        pred = treat_s_model.predict(data_to_predict)
        pr_s = poisson.pmf(data_to_predict[self._gs_measure_], pred)
        if self._verbose_:
            print('==============================================================================')
            print('gstar-model: ' + self._gs_measure_)
            print(treat_s_model.summary())

    elif distribution == 'multinomial':
        gs_model = self._gs_measure_ + ' ~ ' + self._gs_model
        treat_s_model = smf.mnlogit(gs_model, pooled_data).fit(disp=False)
        pred = treat_s_model.predict(data_to_predict)
        values = pd.get_dummies(data_to_predict[self._gs_measure_])
        # accumulate P(S = s_observed); float zeros, since an integer array
        # would raise on in-place addition of float probabilities
        pr_s = np.zeros(data_to_predict.shape[0])
        for i in data_to_predict[self._gs_measure_].unique():
            pr_s += pred[i] * values[i]
        if self._verbose_:
            print('==============================================================================')
            print('gstar-model: ' + self._gs_measure_)
            print(treat_s_model.summary())

    elif distribution == 'threshold':
        f = sm.families.family.Binomial()
        gs_model = self._gs_measure_ + ' ~ ' + self._gs_model
        treat_s_model = smf.glm(gs_model, pooled_data, family=f).fit()
        pred = treat_s_model.predict(data_to_predict)
        pr_s = np.where(data_to_predict[self._gs_measure_] == 1, pred, 1 - pred)
        if self._verbose_:
            print('==============================================================================')
            print('gstar-model: ' + self._gs_measure_)
            print(treat_s_model.summary())

    else:
        raise ValueError("Invalid distribution choice")

    # Creating estimated probability
    return pr_i * pr_s
df['dyslexia'] = df['{{dyslexia}}']

# make dyslexia a categorical variable
# (bracket assignment: attribute assignment would not create a new column)
df['dyslexia'] = df.dyslexia.astype('category')

# remove trials based on comprehension < 2/3

# --- (D1)
# just remove trials
df = df[df.correct_rate > 0.6]

# --- (D2)
# drop entire participants
bad_uuid = set()
for i, row in df.iterrows():
    if row.correct_rate < 0.6:
        bad_uuid.add(str(row.uuid))
df = df[~df.uuid.isin(bad_uuid)]

# --- (F1)
# fit a linear mixed effects model
fml = '{{formula}}'
model = smf.mixedlm(fml, df, groups=df.uuid).fit()
print(model.summary())

# --- (F2)
# fit a multinomial logit model to accuracy
df['acc'] = 3 - pd.Categorical(df.correct_rate).codes
fml = 'acc ~ page_condition*dyslexia_bin'
model = smf.mnlogit(fml, df).fit()
print(model.summary())
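For clarity, the acc recoding maps the distinct correct_rate values to descending integer codes, so the highest rate becomes acc 0; a tiny illustration with made-up rates:

import pandas as pd

rates = pd.Series([1.0, 2/3, 1.0, 1/3, 0.0])
print(3 - pd.Categorical(rates).codes)  # [0 1 0 2 3]: best rate maps to acc 0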
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats
from statsmodels.miscmodels.ordinal_model import OrderedModel
from pandas.api.types import CategoricalDtype

data = pd.read_csv(r"D:/书籍资料整理/属性数据分析/钝吻鳄例子.csv")

# 1. Baseline-category logit
data['食物选择'] = data['食物选择'].replace({'I': 3, 'F': 2, 'O': 1})
mdl = smf.mnlogit(formula='食物选择~长度', data=data)
result = mdl.fit()
# Unlike the book, statsmodels still takes the first (lowest) value as the
# baseline category. Since the response is nominal and order is irrelevant,
# setting O (other) = 1 reproduces the book's results. The two fitted
# equations are invertebrate vs. other and fish vs. other; to study
# invertebrate vs. fish, subtract equation (2) from equation (1).
result.summary()

# 2. Cumulative logit model for an ordinal response
data = pd.read_csv(r"D:/书籍资料整理/属性数据分析/政治意识与党派.csv")
# data['意识形态'] = data['意识形态'].replace({'很自由': 1, '有点自由': 2, '中等': 3, '有点保守': 4, '很保守': 5})
data['政治党派'] = data['政治党派'].replace({'民主党人': 1, '共和党人': 0})
# Expand the frequency column '值' into one row per observation.
# (DataFrame.append was removed in pandas 2.0, so build a list of rows instead.)
rows = []
for i in range(0, 20):
    rows.extend([data.loc[i]] * data.iloc[i]['值'])
tmp = pd.DataFrame(rows).reset_index(drop=True)
del tmp['值']
# tmp.to_csv(r'D:/书籍资料整理/属性数据分析/政治意识与党派_整理数据.csv')
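The script imports OrderedModel but stops before fitting the cumulative logit it sets up. A minimal sketch of that step, assuming the commented-out recoding of '意识形态' to the ordered codes 1-5 has been applied to tmp (this fit is an illustration, not part of the original script):

# hypothetical continuation: cumulative logit via OrderedModel
ordinal = CategoricalDtype(categories=[1, 2, 3, 4, 5], ordered=True)
tmp['意识形态'] = tmp['意识形态'].astype(ordinal)
mod = OrderedModel(tmp['意识形态'], tmp[['政治党派']], distr='logit')
res = mod.fit(method='bfgs', disp=0)
print(res.summary())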
# wrangle education level
edu_order = [
    'pre-high school', 'high school', 'professional school', 'college',
    'graduate school', 'PhD', 'postdoctoral'
]
tp = pd.CategoricalDtype(categories=edu_order, ordered=True)
df['edu_level'] = df.education.astype(tp).cat.codes

# check correlation between IVs
ivs = df[['img_width', 'num_words', 'page_condition', 'age']]
print(ivs.corr(), '\n')
print(pd.crosstab(df.english_native, df.dyslexia, normalize='columns'), '\n')
print(pd.crosstab(df.device, df.dyslexia, normalize='columns'), '\n')

# fit a multinomial logit model to accuracy
# (mnlogit takes no 'groups' argument; that keyword belongs to mixedlm)
df['acc'] = 3 - pd.Categorical(df.correct_rate).codes
print(df.groupby('acc').size(), '\n')
fml = 'acc ~ page_condition*dyslexia_bin'
model = smf.mnlogit(fml, df).fit()
print(model.summary(), '\n')

# remove trials based on comprehension < 2/3
df = df[df.correct_rate > 0.6]

# fit a linear mixed effects model
fml = 'log_speed ~ img_width + num_words + page_condition*dyslexia' \
      ' + age + english_native'
model = smf.mixedlm(fml, df, groups=df.uuid).fit()
print(model.summary())
chat_up['Successx'] = chat_up['Success'].replace({
    'Get Phone Number': 1,
    'Go Home with Person': 2,
    'No response/Walk Off': 0
})
chat_up['Genderx'] = chat_up['Gender'].replace({'Male': 0, 'Female': 1})
chat_up['Gen_Funny'] = chat_up['Genderx'] * chat_up['Funny']
chat_up['Gen_Sex'] = chat_up['Genderx'] * chat_up['Sex']

# In[26]:

import statsmodels.formula.api as smf

ml01 = smf.mnlogit(
    'Successx ~ Funny + Sex + Good_Mate + Genderx + Gen_Funny + Gen_Sex',
    chat_up).fit()
print(ml01.summary())

# In[27]:

print(np.exp(ml01.params))

# ## Checking Assumptions

# ### Assumptions of Multicollinearity

# In[29]:

from statsmodels.stats.outliers_influence import variance_inflation_factor
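The multicollinearity cell breaks off at the import; a minimal sketch of the VIF check it presumably leads into (the predictor list mirrors the model above; this cell is not from the original notebook):

# hypothetical continuation: one VIF per predictor
import pandas as pd
import statsmodels.api as sm

predictors = chat_up[['Funny', 'Sex', 'Good_Mate', 'Genderx', 'Gen_Funny', 'Gen_Sex']]
X = sm.add_constant(predictors)
vifs = pd.Series([variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
                 index=X.columns)
print(vifs.drop('const'))  # rule of thumb: VIF > 10 flags problematic collinearity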