from functools import reduce

import numpy
import pandas
from statsmodels.api import Logit


def get_trained_logit_model():
    """
    In 'data/training_data/' specific ETFs were visually inspected and
    white noise (0) and not white noise (1) were assigned.  This data is
    loaded here to train the logistic parameters, but you can use this
    functionality as a template to train your own

    :ARGS:

        :class:`NoneType`

    :RETURNS:

        a fitted :class:`statsmodels.Logit` Logistic regression that has
        been fit to the training data
    """
    f = pandas.ExcelFile('../data/training_data/Trained Data.xlsx')
    data = reduce(lambda a, b: numpy.vstack([a, b]),
                  map(lambda x: f.parse(x, index_col=0)[['ln_chg', 'Y']],
                      f.sheet_names))
    data = pandas.DataFrame(data, columns=['ln_chg', 'Y'])

    # add an intercept for the model (required by statsmodels.api.Logit)
    data['intercept'] = 1.0

    # fit the model
    logit_model = Logit(endog=data['Y'], exog=data[['intercept', 'ln_chg']])
    return logit_model.fit()
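
# A minimal usage sketch (assumes the workbook above is present).  The fitted
# model expects an intercept plus the 'ln_chg' column, matching the function
# body; the synthetic series below just stands in for a real ETF's log changes.
if __name__ == '__main__':
    fitted = get_trained_logit_model()
    new_obs = pandas.DataFrame({'intercept': 1.0,
                                'ln_chg': numpy.random.randn(10) * 0.01})
    # probability that each observation is "not white noise" (class 1)
    print(fitted.predict(new_obs[['intercept', 'ln_chg']]))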
import numpy as np
from scipy.special import expit
from statsmodels.api import Logit, OLS
from statsmodels.tools.sm_exceptions import PerfectSeparationError


def prs_betaci(q, prs, df):
    # NOTE: relies on module-level globals: is_bin, models, covariates, phe_code
    (q0, q1) = q
    we_print = (q0 == 2)
    # pandas has 99 as the highest; we have 1 as the highest
    # (no trailing comma here: it would turn q0 into a 1-tuple)
    q0 = df[prs].quantile((100 - q0) / 100.0)
    q1 = df[prs].quantile((100 - q1) / 100.0)
    q40 = df[prs].quantile(0.4)
    q60 = df[prs].quantile(0.6)
    iids = df.index[((q0 <= df[prs]) & (df[prs] <= q1)) |
                    ((q40 <= df[prs]) & (df[prs] <= q60))]
    if is_bin:
        data = np.vstack((expit(models['PRS']['COVAR']['train']
                                .predict(df.loc[iids, covariates])),
                          (q0 <= df.loc[iids, prs]) &
                          (df.loc[iids, prs] <= q1))).T
        try:
            m = Logit(df.loc[iids, phe_code], data).fit(disp=0)
        except PerfectSeparationError:
            return None, (None, None), None
        b = np.exp(m.params[1])
        ci = np.abs(np.exp(m.conf_int().iloc[1, :].values) - b)
    else:
        data = np.vstack((models['PRS']['COVAR']['train']
                          .predict(df.loc[iids, covariates]),
                          (q0 <= df.loc[iids, prs]) &
                          (df.loc[iids, prs] <= q1))).T
        m = OLS(df.loc[iids, phe_code], data).fit()  # OLS.fit has no 'disp' option
        b = m.params[1]
        ci = np.abs(m.conf_int().iloc[1, :].values - b)
    if we_print:
        print(b, [b - ci[0], b + ci[1]])
    return b, ci, df.loc[(q0 <= df[prs]) & (df[prs] <= q1), phe_code].mean()
import pandas as pd
from statsmodels.api import Logit

# get_all_eval_measures is defined elsewhere in this module


def model_fit(store_path, X_df_path, y_df_path, feature_key="Gender",
              X_cols=None, testing=False, include_prc=False):
    # X_cols=None (instead of a mutable [] default) lets read_hdf load all columns
    if testing:
        # If testing, just print the X and y columns
        print(store_path, X_df_path, y_df_path, feature_key, X_cols)
        return feature_key, ({"llf": 0.1}, "TEMP SUMMARY")
    # Not testing. Fit the models and return the measures
    print(store_path, X_df_path, y_df_path, feature_key, X_cols)
    X = pd.read_hdf(store_path, key=X_df_path, columns=X_cols)
    y = pd.read_hdf(store_path, key=y_df_path)
    print("Created dataframes, feature_key=%s" % feature_key)
    print("X.shape = %s, y.shape = %s" % (X.shape, y.shape))
    model = Logit(y, X)
    res = model.fit()
    predict = res.predict()
    measures = get_all_eval_measures(predict, model.endog,
                                     include_prc=include_prc)
    measures["llf"] = res.llf
    measures["aic"] = res.aic
    measures["bic"] = res.bic
    measures["prsquared"] = res.prsquared
    measures["df_model"] = res.df_model
    return feature_key, (measures, res.summary2())
import numpy as np
from sklearn.linear_model import LogisticRegression
from statsmodels.api import Logit


def run_LR(model_dir, trainSet, testSet, timestep):
    # get shape
    H, W, C = trainSet.shape[1], trainSet.shape[2], trainSet.shape[3]
    train_len, test_len = trainSet.shape[0], testSet.shape[0]

    # get XY features
    trainX, trainY = getXSYS(trainSet, timestep)
    testX, testY = getXSYS(testSet, timestep)
    print('Train set shape: X/Y', trainX.shape, trainY.shape)
    print('Test set shape: X/Y', testX.shape, testY.shape)

    # check class imbalance in the labels (counting the features would
    # say nothing about the positive/negative balance)
    neg, pos = np.bincount(trainY.flatten().astype(int))
    weight_ratio = neg / pos
    print('Weight ratio:', round(weight_ratio, 5))

    # logit
    logit_model = Logit(trainY, trainX)
    result = logit_model.fit()
    print(result.summary2())

    # LR
    logreg = LogisticRegression(
        class_weight={1: weight_ratio})  # balance pos/neg in training set
    logreg.fit(trainX, trainY)
    predY = logreg.predict(testX)
    y_true = testY.reshape((-1, H, W, C))
    y_pred = predY.reshape((-1, H, W, C))
    print('#Positive predictions: ', y_pred[y_pred != 0].shape[0], '\n')

    return y_true, y_pred
import logging

import numpy as np
from statsmodels.api import Logit
from statsmodels.tools.sm_exceptions import PerfectSeparationError


def _fit_logit(X, y):
    metadata = {}
    lm = Logit(y, X)
    try:
        flm = lm.fit(method='bfgs')
        logging.info(flm.summary())
        output = format_output(flm)
        metadata = {
            'summary': str(flm.summary()),
            'summary2': str(flm.summary2())
        }
    except (np.linalg.LinAlgError, PerfectSeparationError, ValueError) as e:
        # Perfect separation or singular matrix - use NaN
        logging.warning(e)
        output = {
            col: {
                "coef": None,
                "std_err": None,
                "t_values": None,
                "p_values": None,
            }
            for col in X.columns
        }
    return output, metadata
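
# A minimal sketch of exercising _fit_logit on synthetic data.  format_output
# is external to this snippet, so the demo stubs it with a params dict;
# replace the stub with the real formatter in context.
if __name__ == '__main__':
    import pandas as pd

    def format_output(fitted):  # hypothetical stand-in for the real helper
        return {name: {'coef': coef} for name, coef in fitted.params.items()}

    rng = np.random.default_rng(0)
    X_demo = pd.DataFrame({'const': 1.0, 'x1': rng.normal(size=200)})
    p = 1 / (1 + np.exp(-X_demo['x1']))
    y_demo = (rng.random(200) < p).astype(int)
    out, meta = _fit_logit(X_demo, y_demo)
    print(out)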
import pandas as pd
from statsmodels.api import Logit

# get_all_eval_measures is defined elsewhere in this module


def model_fit(y, X, X_cols, y_col, feature_key="Gender",
              testing=False, include_prc=False):
    if testing:
        # If testing, just print the X and y columns
        print(X_cols, y_col)
        return feature_key, ({"llf": 0.1}, "TEMP SUMMARY")
    # Not testing. Fit the models and return the measures
    print(feature_key, X.shape, X_cols, y.shape, y_col)
    X = pd.DataFrame(X, columns=X_cols)
    y = pd.Series(y, name=y_col)
    print("Created dataframes.")
    model = Logit(y, X)
    res = model.fit()
    measures = get_all_eval_measures(res, model.endog,
                                     include_prc=include_prc)
    return feature_key, (measures, res.summary2())
def validate_data_predictors(data, outcome, predictors, probabilities,
                             survival_time=False):
    """Validates that for each predictor column, all values are within
    the range 0-1

    Notes
    -----
    If a predictor has probability `True`, checks that the column
    `data[predictor]` has all values in the appropriate range. If a predictor
    has probability `False`, converts all values in that column with logistic
    regression

    Parameters
    ----------
    data : pd.DataFrame
        the data set
    outcome : str
        the column to use as 'outcome'
    predictors : list(str)
        the list of predictors for the analysis
    probabilities : list(bool)
        list marking whether a predictor is a probability
    survival_time : bool
        if the analysis is a survival time analysis
    """
    for i in range(0, len(predictors)):
        if probabilities[i]:
            # validate that any predictors with probability TRUE are b/t 0 and 1
            if (max(data[predictors[i]]) > 1) or (min(data[predictors[i]]) < 0):
                raise ValueError("{val} must be between 0 and 1"
                                 .format(val=repr(predictors[i])))
        else:
            if survival_time:
                from statsmodels.sandbox.cox import CoxPH  # TODO
            else:
                from statsmodels.api import Logit
                # predictor is not a probability, convert with logistic
                # regression; fitted Logit results expose the predicted
                # probabilities through predict(), not a y_pred attribute
                model = Logit(data[outcome], data[predictors[i]])
                data[predictors[i]] = model.fit().predict()
    return data
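
# Hedged usage sketch: a probability column passes the range check unchanged,
# while a raw marker column is replaced by fitted probabilities.  All column
# names here are illustrative.
if __name__ == '__main__':
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    df_demo = pd.DataFrame({'outcome': rng.integers(0, 2, 100)})
    df_demo['marker'] = df_demo['outcome'] + rng.normal(0, 1, 100)
    df_demo['prob'] = np.clip(df_demo['marker'], 0.01, 0.99)  # pretend probability
    out = validate_data_predictors(df_demo, 'outcome',
                                   ['prob', 'marker'], [True, False])
    print(out[['prob', 'marker']].describe())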
import numpy as np
from statsmodels.api import Logit, OLS


def _model(x: np.ndarray, y: np.ndarray, model: str):
    """
    :param x: n-D array
    :param y: 1-D array
    :param model: {'linear' or 'logistic'}
    :return: fitted regression results
    """
    # dispatch on the documented values ('linear' vs 'logistic')
    if model == 'linear':
        model_ = OLS(y, x).fit()
    else:
        model_ = Logit(y, x).fit()
    return model_
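
# Minimal sketch of dispatching between OLS and Logit with the helper above,
# using synthetic data and an explicit constant column.
if __name__ == '__main__':
    from statsmodels.api import add_constant

    x_demo = add_constant(np.random.randn(100, 2))
    y_cont = x_demo @ np.array([1.0, 2.0, -1.0]) + np.random.randn(100)
    y_bin = (y_cont > y_cont.mean()).astype(int)

    ols_res = _model(x_demo, y_cont, model='linear')
    logit_res = _model(x_demo, y_bin, model='logistic')
    print(ols_res.params, logit_res.params)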
def create_model_object(self):
    model_mat = copy.deepcopy(self.model_data)

    # convert booleans to floats explicitly
    for c in model_mat.columns:
        if model_mat[c].dtype == bool:
            model_mat[c] = model_mat[c].astype(float)

    # scale specified vars to N(0,1)
    for c in self.scale_vars_list:
        try:
            xbar = model_mat[c].mean()
            s = model_mat[c].std()
            model_mat[c] = model_mat[c].apply(lambda x: (x - xbar) / s)
            del xbar, s
        except KeyError:
            print('Warning: specified variable to scale, %s, is not '
                  'included in model covariates' % c)

    # drop rows with na
    model_mat.dropna(inplace=True)

    # add constant if needed
    if self.add_constant:
        model_mat = pd.concat([
            pd.DataFrame(data=[1] * model_mat.shape[0],
                         index=model_mat.index,
                         columns=['const']),
            model_mat
        ], axis=1)

    self.endog_matrix = model_mat[self.endog_name]
    self.exog_matrix = model_mat[[c for c in model_mat.columns
                                  if c != self.endog_name]]
    self.model = Logit(endog=self.endog_matrix, exog=self.exog_matrix)
from statsmodels.api import Logit, add_constant


def logit_fit(x_data, y, name='train'):
    """Fit a logistic regression and plot the Gini and KS curves.

    Parameters
    ----------
    x_data: dataframe, cleaned feature variables of the training data;
        a constant term is added automatically
    y: series or 1darray, the target variable
    name: name of the trained model

    Returns
    ----------
    result: the results object returned by statsmodels.api.Logit.fit()
    model_eval: ModelEval, the model-evaluation object"""
    model_data = add_constant(x_data)
    logit_reg = Logit(y, model_data)
    result = logit_reg.fit(disp=False)
    prob = result.predict(model_data)
    model_eval = ModelEval(-prob, y, name, plot=False)
    a = "************************************"
    print(a + " " + name + " " + a)
    print(result.summary2())
    model_eval.giniks_plot()
    return result, model_eval
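
# Hedged usage sketch for logit_fit: ModelEval is an external helper in this
# project, so only the calling convention is shown.
#
#   result, model_eval = logit_fit(x_train_df, y_train, name='train')
#   result.summary2()         # coefficient table, also printed by the function
#   model_eval.giniks_plot()  # Gini / KS curves for the fitted scores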
def fit(self):
    """
    Fit the model; save and report results. This currently uses the
    Statsmodels Logit class with default estimation settings. (It will
    shift to ChoiceModels once more infrastructure is in place.)

    The `fit()` method can be run as many times as desired. Results will
    not be saved with Orca or ModelManager until the `register()` method
    is run.

    Parameters
    ----------
    None

    Returns
    -------
    None
    """
    # TO DO - verify that params are in place for estimation

    # Workaround for a temporary statsmodels bug:
    # https://github.com/statsmodels/statsmodels/issues/3931
    from scipy import stats
    stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

    df = get_data(tables=self.tables,
                  filters=self.filters,
                  model_expression=self.model_expression)

    m = Logit.from_formula(data=df, formula=self.model_expression)
    results = m.fit()

    self.name = self._generate_name()
    self.summary_table = str(results.summary())
    print(self.summary_table)

    # For now, we can just save the summary table and the fitted parameters.
    # Later on we will probably want programmatic access to more details
    # about the fit (e.g. for autospec), but we can add that when it's needed.
    self.fitted_parameters = results.params.tolist()  # params is a pd.Series
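
# Logit.from_formula, used above, builds the design matrix from a patsy
# formula and a DataFrame; a self-contained sketch with synthetic data:
if __name__ == '__main__':
    import numpy as np
    import pandas as pd
    from statsmodels.api import Logit

    df_demo = pd.DataFrame({'x': np.random.default_rng(0).normal(size=200)})
    df_demo['chose'] = (np.random.default_rng(1).random(200) <
                        1 / (1 + np.exp(-df_demo['x']))).astype(int)
    print(Logit.from_formula('chose ~ x', data=df_demo).fit(disp=False).summary())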
FullRaw = Tip.drop(['sex', 'day', 'time'], axis=1)
FullRaw = pd.concat([FullRaw, dummyDf], axis=1)
FullRaw['smoker'] = np.where(FullRaw['smoker'] == 'No', 1, 0)

from sklearn.model_selection import train_test_split

Train, Test = train_test_split(FullRaw, test_size=0.3, random_state=123)

Train_X = Train.drop(['smoker'], axis=1)
Train_Y = Train['smoker'].copy()
Test_X = Test.drop(['smoker'], axis=1)
Test_Y = Test['smoker'].copy()

from statsmodels.api import Logit

M1_Model = Logit(Train_Y, Train_X).fit()
M1_Model.summary()

Test_pred = M1_Model.predict(Test_X)

from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score

Test['Test_prob'] = Test_pred
Test['Test_Class'] = np.where(Test['Test_prob'] > 0.5, 1, 0)

# sklearn metrics expect (y_true, y_pred)
Con_Mat = confusion_matrix(Test_Y, Test['Test_Class'])
sum(np.diag(Con_Mat)) / Test_Y.shape[0] * 100

from sklearn.metrics import roc_auc_score, roc_curve

# AUC is computed from (y_true, predicted probabilities), so the continuous
# scores, not the thresholded classes, go in the second position
ROC = roc_auc_score(Test_Y, Test['Test_prob'])
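
# The remaining imported metrics follow the same (y_true, y_pred) convention;
# a quick sketch on the test predictions above:
print('Precision:', precision_score(Test_Y, Test['Test_Class']))
print('Recall   :', recall_score(Test_Y, Test['Test_Class']))
print('F1       :', f1_score(Test_Y, Test['Test_Class']))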
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
from statsmodels.api import Logit, add_constant
from statsmodels.formula.api import logit

# get_categories and display (IPython) are assumed to be provided elsewhere
# in this module


class BinaryLogisticRegression:
    """
    Class for binary logistic regression models based on the excellent
    statsmodels package.

    Parameters
    ----------
    method : 'enter' or 'backward'
        Method for predictors selection
    include_constant : bool (CURRENTLY UNAVAILABLE)
        Whether to include constant in the model
    classification_cutoff : float
        Minimum probability to assign a prediction value 1
    sig_level_entry : float (CURRENTLY UNAVAILABLE)
        Max significance level to include predictor in the model
    sig_level_removal : float
        Min significance level to exclude predictor from the model

    Attributes
    ----------
    predictions : pd.Series
        Predicted values
    classification_table : pd.DataFrame
        A classification table
    precision_and_recall : pd.DataFrame
        Table with precision, recall, and F1-score of the model
    variables_excluded : list
        Variables excluded because of zero variance
    variables_included : list
        Variables included in a model
    N : int
        Number of observations included in a model
    r2_pseudo_macfadden : float
        McFadden's pseudo coefficient of determination
    r2_pseudo_cox_snell : float
        Cox & Snell's pseudo coefficient of determination
    r2_pseudo_nagelkerke : float
        Nagelkerke's pseudo coefficient of determination
    loglikelihood : float
        -2LL
    coefficients : pd.Series
        Regression coefficients
    coefficients_sterrors : pd.Series
        Standard errors of regression coefficients
    coefficients_wald_statistics : pd.Series
        Wald statistic of regression coefficients
    coefficients_zvalues : pd.Series
        z-statistic of regression coefficients
    coefficients_pvalues : pd.Series
        P-values of regression coefficients
    coefficients_exp : pd.Series
        e ** regression coefficients
    """

    def __init__(
            self,
            method='enter',
            include_constant=True,
            classification_cutoff=0.5,
            sig_level_entry=0.05,
            sig_level_removal=0.05,
    ):
        self.method = method.lower().strip()
        self.include_constant = include_constant
        self.classification_cutoff = classification_cutoff
        self.sig_level_entry = sig_level_entry
        self.sig_level_removal = sig_level_removal

    def fit(self,
            data,
            formula,
            categorical_variables=None,
            max_iterations=100,
            show_results=True,
            confidence_intervals=True,
            use_patsy_notation=False,
            n_decimals=3):
        """
        Fit model to the given data using formula.

        Parameters
        ----------
        data : pd.DataFrame
            Data to fit a model
        formula : str
            Formula of a model specification, e.g. 'y ~ x1 + x2';
            should be passed either in Patsy (statsmodels) notation
            or using the following rules:
            '*' for interaction of the variables,
            ':' for interaction & main effects, i.e., 'y ~ x:z' equals to
            'y ~ x + z + x*z' (unlike the Patsy notation).
            If you use Patsy notation, please specify the parameter
            use_patsy_notation=True.
        categorical_variables : list
            List of names of the variables that should be considered
            categorical. These variables would be automatically converted
            into sets of dummy variables. If you want to use this option,
            please make sure that you don't have nested names of variables
            (e.g. 'imdb' and 'imdb_rate' at the same time), otherwise this
            option results in an incorrect procedure.
        max_iterations : int
            Maximum iterations for convergence
        show_results : bool
            Whether to show results of analysis
        confidence_intervals : bool
            Whether to include coefficients' confidence intervals in the
            summary table
        use_patsy_notation : bool
            Turn this on if you use strictly Patsy's rules to define a
            formula. See more:
            https://patsy.readthedocs.io/en/latest/quickstart.html
        n_decimals : int
            Number of digits to round results when showing them

        Returns
        -------
        self
            The current instance of the BinaryLogisticRegression class
        """
        self._data = data.copy()
        self.categorical_variables = categorical_variables
        self._show_ci = confidence_intervals
        self.max_iterations = max_iterations

        if '=' in formula:
            formula = formula.replace('=', '~')
        if not use_patsy_notation:
            formula = formula.replace('*', '^').replace(':', '*').replace('^', ':')
        self.formula = formula
        self.dependent_variable = self.formula.split('~')[0].strip()
        dep_cats = get_categories(self._data[self.dependent_variable])
        self._dep_cats = dep_cats

        if len(dep_cats) != 2:
            raise ValueError(
                f"""A dependent variable should have exactly 2 unique categories.
                The provided variable has {len(dep_cats)}.""")

        self._mapper = {dep_cats[0]: 0, dep_cats[1]: 1}
        self._inv_mapper = {0: dep_cats[0], 1: dep_cats[1]}

        if not is_numeric_dtype(self._data[self.dependent_variable]):
            self._data[self.dependent_variable] = self._data[
                self.dependent_variable].map(self._mapper).astype(int)

        # won't work correctly if some variables have nested names
        # (e.g. kinopoisk_rate and kinopoisk_rate_count)
        if categorical_variables is not None:
            if not isinstance(categorical_variables, list):
                raise ValueError(
                    f"""Categorical variables should be passed as list.
                    Type {type(categorical_variables)} was passed instead.""")
            else:
                for variable in categorical_variables:
                    formula = formula.replace(variable, f'C({variable})')

        self._optimizer = 'newton'
        try:
            self._model = logit(formula=formula, data=self._data).fit(
                maxiter=self.max_iterations,
                warn_convergence=False,
                disp=False,
                method=self._optimizer,
                full_output=True)
        except np.linalg.LinAlgError:
            self._optimizer = 'bfgs'
            self._model = logit(formula=formula, data=self._data).fit(
                maxiter=self.max_iterations,
                warn_convergence=False,
                disp=False,
                method=self._optimizer,
                full_output=True)

        self._model_params = {
            'maxiter': self.max_iterations,
            'warn_convergence': False,
            'disp': False,
            'method': self._optimizer,
            'full_output': True
        }

        self._observations_idx = list(self._model.fittedvalues.index)
        self.variables_excluded = self._identify_variables_without_variation()

        if len(self.variables_excluded) > 0:
            y = pd.Series(self._model.model.endog.copy(),
                          index=self._observations_idx,
                          name=self.dependent_variable)
            X = self._remove_variables_without_variation()
            self._model = Logit(y, X, missing='drop').fit(**self._model_params)
            self.variables_excluded = [
                BinaryLogisticRegression._translate_from_patsy_notation(x)
                for x in self.variables_excluded
            ]

        if self.method == 'backward':
            self._fit_backward()

        self._get_statistics_from_model()
        self.predictions = self.predict()
        self.classification_table = self.get_classification_table()
        self.precision_and_recall = self.get_precision_and_recall()

        if show_results:
            self.show_results(n_decimals)
        if len(self.variables_excluded) > 0:
            print('------------------\n')
            print(f"Following variables were excluded due to zero variance: "
                  f"{'; '.join(self.variables_excluded)}")

        return self

    def _fit_backward(self):
        y_train = pd.Series(self._model.model.endog.copy(),
                            name=self.dependent_variable,
                            index=self._observations_idx)
        X_train = pd.DataFrame(self._model.model.exog,
                               columns=self._model.model.exog_names,
                               index=self._observations_idx)
        model = Logit(y_train, X_train, missing='drop')
        results = model.fit(**self._model_params)
        max_pvalue = results.pvalues.drop('Intercept').max()
        while max_pvalue > self.sig_level_removal:
            x_to_drop = results.pvalues.drop('Intercept').idxmax()
            X_train = X_train.drop(x_to_drop, axis=1)
            model = Logit(y_train, X_train, missing='drop')
            results = model.fit(**self._model_params)
            max_pvalue = results.pvalues.drop('Intercept').max()
        self._model = results
        return

    def _identify_variables_without_variation(self):
        if self.include_constant:
            mask = self._model.model.exog.var(axis=0)[1:] == 0
        else:
            mask = self._model.model.exog.var(axis=0) == 0
        variables_included = [
            x for x in list(self._model.params.index) if x != 'Intercept'
        ]
        return list(np.array(variables_included)[mask])

    def _remove_variables_without_variation(self):
        X = pd.DataFrame(self._model.model.exog,
                         columns=self._model.model.exog_names,
                         index=self._observations_idx)
        X = X.drop(self.variables_excluded, axis=1)
        return X

    @staticmethod
    def _translate_from_patsy_notation(effect):
        effect = effect\
            .replace(':', ' * ')\
            .replace('C(', '')\
            .replace('T.', '')\
            .replace('[', ' = "')\
            .replace(']', '"')\
            .replace(')', '')
        return effect

    def _get_statistics_from_model(self):
        self.N = self._model.nobs
        self.r2_pseudo_macfadden = self._model.prsquared
        self.r2_pseudo_cox_snell = 1 - np.exp(-self._model.llr / self.N)
        self.r2_pseudo_nagelkerke = self.r2_pseudo_cox_snell / (
            1 - np.exp(-(-2 * self._model.llnull) / self.N))
        self.loglikelihood = -2 * self._model.llf
        self.coefficients = self._model.params.copy()
        self.coefficients_sterrors = self._model.bse.copy()
        self.coefficients_wald_statistics = self._model.tvalues.copy()**2
        self.coefficients_zvalues = self._model.tvalues.copy()
        self.coefficients_pvalues = self._model.pvalues.copy()
        self.coefficients_exp = self.coefficients.apply(np.exp)

        variables_included = [
            x for x in list(self.coefficients.index) if x != 'Intercept'
        ]
        self._variables_included_patsy = variables_included.copy()
        variables_included = [
            BinaryLogisticRegression._translate_from_patsy_notation(x)
            for x in variables_included
        ]
        self.variables_included = variables_included

        if self.include_constant:
            self._params_idx = ['Constant'] + variables_included
        else:
            self._params_idx = variables_included.copy()
        for stats in [
                self.coefficients, self.coefficients_pvalues,
                self.coefficients_sterrors, self.coefficients_zvalues,
                self.coefficients_wald_statistics, self.coefficients_exp
        ]:
            stats.index = self._params_idx
        return

    def summary(self):
        """
        Summary table with requested information related to regression
        coefficients.

        Returns
        -------
        pd.DataFrame
            A summary table
        """
        statistics = [
            self.coefficients, self.coefficients_sterrors,
            self.coefficients_wald_statistics, self.coefficients_pvalues,
            self.coefficients_exp
        ]
        columns = ['B', 'Std. Error', 'Wald', 'p-value', 'Exp(B)']
        if self._show_ci:
            statistics.append(self.coefficients_confidence_interval)
            columns.extend(list(self.coefficients_confidence_interval.columns))
        statistics = pd.concat(statistics, axis=1)
        statistics.columns = columns
        statistics.index = self._params_idx
        return statistics

    @property
    def coefficients_confidence_interval(self):
        ci = self._model.conf_int()
        ci.index = self._params_idx
        ci.columns = ['LB CI (95%)', 'UB CI (95%)']
        return ci

    def show_results(self, n_decimals):
        """
        Show results of the analysis in a readable form.

        Parameters
        ----------
        n_decimals : int
            Number of digits to round results when showing them
        """
        phrase = 'method {}'
        print('\nLOGISTIC REGRESSION SUMMARY\n')
        if self._model.mle_retvals['converged'] == True:
            print('Estimation was converged successfully.')
        else:
            print('Estimation was NOT converged successfully.')
            print('Please enlarge the number of iterations.')
        print('------------------\n')
        print('Dependent variable encoding')
        display(self.get_dependent_variable_codes().style\
                .set_caption(phrase.format('.get_dependent_variable_codes()')))
        print('------------------\n')
        print('Model summary')
        display(self.summary_r2().style\
                .set_caption(phrase.format('.summary_r2()'))\
                .set_precision(n_decimals))
        print('------------------\n')
        print('Classification table')
        display(self.get_classification_table().style\
                .set_caption(phrase.format('.get_classification_table()'))\
                .set_precision(n_decimals))
        print('------------------\n')
        print('Precision and recall')
        display(self.get_precision_and_recall().style\
                .set_caption(phrase.format('.get_precision_and_recall()'))\
                .set_precision(n_decimals))
        print('------------------\n')
        print('Coefficients')
        display(self.summary().style\
                .format(None, na_rep="")\
                .set_caption(phrase.format('.summary()'))\
                .set_precision(n_decimals))

    def summary_r2(self):
        """
        Summary table with information related to pseudo coefficients of
        determination.

        Returns
        -------
        pd.DataFrame
            A summary table
        """
        ll = self.loglikelihood
        mf = self.r2_pseudo_macfadden
        cs = self.r2_pseudo_cox_snell
        nk = self.r2_pseudo_nagelkerke
        statistics = [[ll, mf, cs, nk]]
        columns = [
            '-2 Log likelihood',
            "McFadden's Pseudo R2",
            "Cox&Snell's Pseudo R2",
            "Nagelkerke's Pseudo R2",
        ]
        statistics = pd.DataFrame(statistics, columns=columns, index=[''])
        return statistics

    def get_dependent_variable_codes(self):
        """
        Get information on how categories of a dependent variable were encoded.

        Returns
        -------
        pd.DataFrame
            A table explaining encodings
        """
        mapper = self._mapper
        result = pd.DataFrame(
            [list(mapper.items())[0], list(mapper.items())[1]],
            columns=['Original value', 'Model value'],
            index=['', ' '])
        return result

    def get_classification_table(self):
        """
        Get a classification table.

        Returns
        -------
        pd.DataFrame
            A classification table
        """
        all_categories = self._dep_cats
        classification = pd.DataFrame(self._model.pred_table(),
                                      columns=self._dep_cats,
                                      index=self._dep_cats)
        classification.index.name = 'Observed'
        classification.columns.name = 'Predicted'
        classification['All'] = classification.sum(axis=1)
        classification.loc['All'] = classification.sum()
        n = classification.loc['All', 'All']
        for category in all_categories:
            classification.loc[category, 'All'] = classification.loc[
                category, category] / classification.loc[category, 'All'] * 100
            classification.loc['All', category] = \
                classification.loc['All', category] / n * 100
        classification.loc['All', 'All'] = np.diagonal(
            classification.loc[all_categories, all_categories]).sum() / n * 100
        classification.index = all_categories + ['Percent predicted']
        classification.index.name = 'Observed'
        classification.columns = all_categories + ['Percent correct']
        classification.columns.name = 'Predicted'
        return classification

    def get_precision_and_recall(self):
        """
        Estimate precision, recall, and F-score for all the categories.

        Returns
        -------
        pd.DataFrame
            A table with estimated metrics
        """
        preds = self.classification_table.iloc[:-1, :-1]
        results = []
        categories = list(preds.index)
        for current_category in categories:
            idx = [cat for cat in categories if cat != current_category]
            tp = preds.loc[current_category, current_category]
            fp = preds.loc[idx, current_category].sum()
            fn = preds.loc[current_category, idx].sum()
            if fp == 0:
                precision = 0
            else:
                precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            if precision + recall != 0:
                f1 = 2 * (precision * recall) / (precision + recall)
            else:
                f1 = 0
            results.append([precision, recall, f1])
        results = pd.DataFrame(results,
                               index=categories,
                               columns=['Precision', 'Recall', 'F score'])
        results.loc['Mean'] = results.mean()
        return results

    def predict(
            self,
            data=None,
            group_membership=True,
            probability=False,
            logit=False,
            add_to_data=False,
    ):
        """
        Predict values of a dependent variable using the fitted model.

        Parameters
        ----------
        data : pd.DataFrame
            Data for prediction; may be not specified if you want to
            predict values for the same data that were used to fit a model
        group_membership : bool
            Whether to predict observation's membership to categories of
            a dependent variable
        probability : bool
            Whether to predict exact probability
        logit : bool
            Whether to predict a logit value
        add_to_data : bool
            Whether to merge predictions with the given data.
            Currently, this option returns data with a sorted index

        Returns
        -------
        pd.DataFrame
            Predictions
        """
        name_memb = f'{self.dependent_variable} (predicted)'
        name_prob = f'{self.dependent_variable} (predicted prob.)'
        name_logit = f'{self.dependent_variable} (predicted logit)'
        all_columns = [name_memb, name_prob, name_logit]
        columns_to_show = []
        if group_membership:
            columns_to_show.append(name_memb)
        if probability:
            columns_to_show.append(name_prob)
        if logit:
            columns_to_show.append(name_logit)
        cutoff = self.classification_cutoff
        if data is None:
            data_init = self._data.copy()
            logit = self._model.fittedvalues
            prob = logit.apply(lambda x: np.exp(x) / (1 + np.exp(x)))
            memb = prob.apply(lambda x: 1 if x >= cutoff else 0).map(
                self._inv_mapper)
            result = pd.DataFrame(index=self._observations_idx,
                                  columns=all_columns)
            result[name_memb] = memb
            result[name_prob] = prob
            result[name_logit] = logit
            result = result[columns_to_show]
            if add_to_data:
                return pd.concat([data_init, result], axis=1)
            else:
                return result
        else:
            # the 'logit' parameter shadows the module-level formula function,
            # so re-import it under a different name for this branch
            from statsmodels.formula.api import logit as logit_formula
            aux_model = logit_formula(self.formula, data).fit(**self._model_params)
            aux_data_idx = aux_model.fittedvalues.index
            aux_data_cols = aux_model.model.exog_names
            aux_data_cols = [
                BinaryLogisticRegression._translate_from_patsy_notation(x)
                for x in aux_data_cols
            ]
            aux_data = pd.DataFrame(aux_model.model.exog,
                                    index=aux_data_idx,
                                    columns=aux_data_cols)
            aux_X = add_constant(aux_data[self.variables_included].copy())
            aux_y = aux_model.model.endog.copy()
            aux_model = Logit(aux_y, aux_X,
                              missing='drop').fit(**self._model_params)
            logit = aux_model.fittedvalues
            prob = logit.apply(lambda x: np.exp(x) / (1 + np.exp(x)))
            memb = prob.apply(lambda x: 1 if x >= cutoff else 0).map(
                self._inv_mapper)
            result = pd.DataFrame(index=aux_data_idx, columns=all_columns)
            result[name_memb] = memb
            result[name_prob] = prob
            result[name_logit] = logit
            result = result[columns_to_show]
            if add_to_data:
                return pd.concat([data, result], axis=1)
            else:
                return result

    def save_independent_variables(self, data=None, add_to_data=False):
        """
        Produce values of independent variable remained in a fitted model.
        This option is useful if you don't create dummy variables or
        interaction effects manually but want to use them in a further
        analysis. Only variables remained in a model are returned
        (those that are shown in a summary table).

        Parameters
        ----------
        data : pd.DataFrame
            Data for which independent variables are requested;
            may be not specified if you want to save values for the same
            data that were used to fit a model
        add_to_data : bool
            Whether to merge new values with the given data.
            Currently, this option returns data with a sorted index

        Returns
        -------
        pd.DataFrame
            Values of independent variables
        """
        if data is None:
            data = self._data.copy()
            if self.include_constant:
                result = self._model.model.exog[:, 1:].copy()
            else:
                result = self._model.model.exog.copy()
            columns = [x for x in self.variables_included if x != 'Constant']
            result = pd.DataFrame(result,
                                  columns=columns,
                                  index=self._observations_idx)
        else:
            aux_model = logit(self.formula, data).fit(**self._model_params)
            aux_data_idx = aux_model.fittedvalues.index
            aux_data_cols = aux_model.model.exog_names
            aux_data_cols = [
                BinaryLogisticRegression._translate_from_patsy_notation(x)
                for x in aux_data_cols
            ]
            aux_data = pd.DataFrame(aux_model.model.exog,
                                    index=aux_data_idx,
                                    columns=aux_data_cols)
            result = aux_data[self.variables_included]
        if add_to_data:
            result = pd.concat([data, result], axis=1)
        return result

    def save_residuals(self,
                       unstandardized=True,
                       standardized=False,
                       logit=False,
                       deviance=False,
                       add_to_data=False):
        """
        Produce values of various residuals. Residuals are returned only
        for data used to fit a model.

        Parameters
        ----------
        unstandardized : bool
            Whether to save unstandardized (raw) residuals
        standardized : bool
            Whether to save standardized (z-scores) residuals
        logit : bool
            Whether to save logit residuals
        deviance : bool
            Whether to save deviance residuals
        add_to_data : bool
            Whether to merge new values with data.
            Currently, this option returns data with a sorted index

        Returns
        -------
        pd.DataFrame
            Requested residuals
        """
        columns_to_show = [
            f'{k.capitalize().replace("ized", ".").replace("eted", ".").replace("_", " ")} res.'
            for k, v in vars().items() if v == True and k != 'add_to_data'
        ]
        result = []
        res_unstand = self._model.resid_response
        res_unstand.name = 'Unstandard. res.'
        res_stand = self._model.resid_pearson
        res_stand.name = 'Standard. res.'
        res_deviance = self._model.resid_dev
        res_deviance.name = 'Deviance res.'
        preds_prob = self.predict(group_membership=False, probability=True)
        res_logit = res_unstand / (preds_prob * (1 - preds_prob)).iloc[:, 0]
        res_logit.name = 'Logit res.'
        result.extend([res_unstand, res_stand, res_deviance, res_logit])
        result = pd.concat(result, axis=1)
        result = result[columns_to_show].copy()
        if add_to_data:
            result = pd.concat([self._data, result], axis=1)
        return result
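
# A hedged end-to-end sketch of BinaryLogisticRegression on synthetic data.
# get_categories is an external helper of this module; a plausible stand-in
# is defined here only so the sketch is self-contained.
def get_categories(series):
    return sorted(series.dropna().unique().tolist())

if __name__ == '__main__':
    rng = np.random.default_rng(0)
    demo = pd.DataFrame({'x1': rng.normal(size=300),
                         'x2': rng.normal(size=300)})
    p = 1 / (1 + np.exp(-(0.8 * demo['x1'] - demo['x2'])))
    demo['y'] = np.where(rng.random(300) < p, 'yes', 'no')
    blr = BinaryLogisticRegression(method='backward')
    blr.fit(demo, 'y ~ x1 + x2', show_results=False)
    print(blr.summary())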
import numpy as np
import pandas as pd
from scipy.stats import chi2
from statsmodels.api import Logit, add_constant


def fit(self, X, y, print_detail=False):
    """Stepwise logistic regression. Uses the Score test for entry and the
    Wald test for removal.

    Parameters
    ----------
    X: array-like, n_sample * p_features. Feature data; a constant term is
        added automatically.
    y: array-like, the target variable.
    print_detail: bool, whether to print the details of each step of the
        stepwise variable selection.

    Returns
    -------
    result: same type as the return value of the fit method of a
        statsmodels.api.Logit object; the model selected by the stepwise
        procedure."""

    def score_test(Xtest, y_true, y_predict):
        """Score test for a variable entering at a forward step. The function
        assumes the new variable comes last: Xtest contains vars_old (already
        fitted, producing y_predict) plus var_new (the variable under test).
        The Score test assumes the coefficient of the tested variable is 0,
        so although Xtest contains its data, the fitted parameters are
        computed without it."""
        u = np.dot(Xtest.T, y_true - y_predict)  # first derivative (score)
        h = np.dot(
            Xtest.T * (y_predict * (1 - y_predict)).values.reshape(len(y_predict)),
            Xtest)  # second derivative (information)
        score = np.dot(np.dot(u.T, np.linalg.inv(h)), u)  # score is a 1*1 array
        p_value = chi2.sf(score, 1)  # the score statistic is chi2 with 1 df
        return score, p_value

    def print_wrap(*obj):
        if print_detail:
            print(*obj)

    X = add_constant(X)
    xenter = ['const']
    xwait = list(X.columns.drop('const'))
    logit_mod = Logit(y, X[xenter])
    logit_res = logit_mod.fit(disp=0)
    y_predict = logit_res.predict(X[xenter])
    step = 0
    while xwait:  # stop condition 1: all variables have entered the model
        # entry test
        score = pd.Series(name='Score', dtype=float)
        pvalue = pd.Series(name='P>chi2', dtype=float)
        for xname in xwait:
            tmpX = X[xenter + [xname]]
            score[xname], pvalue[xname] = score_test(tmpX, y, y_predict)
        step += 1
        print_wrap("step {}: Variables Entry test:\n".format(step),
                   pd.concat([score, pvalue], axis=1))
        if pvalue.min() <= self.entry:
            # let the most significant variable enter
            xin = pvalue.idxmin()  # idxmin: we need the label, not the position
            xenter.append(xin)
            xwait.remove(xin)
            print_wrap("step {0}: {1} entered.\n".format(step, xin))
        else:  # stop condition 2: no variable meets the entry criterion
            print_wrap("Stopped 2: No vars can get entered any more.\n")
            break

        # remove test
        while True:  # reaching here means a new variable just entered
            logit_mod = Logit(y, X[xenter])
            logit_res = logit_mod.fit(disp=0)
            y_predict = logit_res.predict(X[xenter])
            test = logit_res.wald_test_terms().dframe  # Wald test
            pvalue = test['P>chi2'].iloc[1:]  # the constant is not tested
            step += 1
            print_wrap("step {}: Variables remove test:\n".format(step), test)
            if pvalue.max() < self.stay:
                xout = None
                print_wrap("step {}: No Variables removed:\n".format(step))
                break  # all variables are significant, remove none
            else:
                xout = pvalue.idxmax()  # idxmax: again the label is needed
                xenter.remove(xout)
                xwait.append(xout)
                print_wrap("step {0}: {1} removed.\n".format(step, xout))
                if xin == xout:
                    break
        # stop condition 3: the variable that just entered was removed again
        if xin == xout:
            print_wrap("Stopped 3: last var entered also got removed.\n")
            break
    else:
        print_wrap("Stopped 1: all var available got entered.\n")
    return Logit(y, X[xenter]).fit(disp=0)
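
# Hypothetical driver for the stepwise fit above: the method assumes it lives
# on a class exposing the entry/stay significance levels, so a minimal host
# class is sketched here together with synthetic data.
class StepwiseLogit:
    def __init__(self, entry=0.05, stay=0.05):
        self.entry, self.stay = entry, stay

    fit = fit  # reuse the stepwise fit defined above

if __name__ == '__main__':
    rng = np.random.default_rng(1)
    X_demo = pd.DataFrame(rng.normal(size=(500, 4)), columns=list('abcd'))
    p = 1 / (1 + np.exp(-(X_demo['a'] - 2 * X_demo['b'])))
    y_demo = pd.Series((rng.random(500) < p).astype(int), name='target')
    result = StepwiseLogit().fit(X_demo, y_demo)
    print(result.summary2())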
import warnings

from numpy import linspace
from pandas import Series
from scipy.stats import ttest_ind as ttest  # 'ttest' assumed to be ttest_ind
from statsmodels.api import Logit


class PropensityScore:
    """
    Parameters
    ----------
    outcome : str
        This should be the name of the binary variable to predict.
    test_vars : list
        A list of the variables to test.
    df : DataFrame
        The pandas DataFrame that contains all of the data.
    init_vars : str or list, optional
        Variables to always have included in the propensity score.
        The default is None.
    add_cons : Boolean, optional
        Select this to add a constant to model. The default is True.
    disp : Boolean, optional
        Display the final model including dropped variables.
        The default is True.
    cutoff_ord1 : Numeric, optional
        The log gain cutoff for first order covariates. The default is 1.
    cutoff_ord2 : Numeric, optional
        The log gain cutoff for second order covariates. The default is 2.71.
    t_strata : Numeric, optional
        The cutoff for the t-statistic for the calculated strata.
        The default is 1.
    n_min : {'n_min_strata': int1, 'n_min_tc': int2} or 'auto'
        The minimum number of units in each strata or treated/control
        individuals in strata. The default is 'auto' in which case the
        number per strata is the number of covariates tested in the
        propensity score (just linear ones) + 2 (or K+2) while the minimum
        number of treated and control individuals per strata is 3.
        If not auto, the input needs to be a dictionary that explicitly
        specifies: {'n_min_strata': int1, 'n_min_tc': int2}

    Raises
    ------
    ValueError
        If variables are improperly defined, this prints out warnings.

    Returns
    -------
    self.data : DataFrame
        This includes a new frame of just the outcome and potential
        covariates.
    self.dropped_vars : list
        The variables that did not make the cut for singularity reasons.
    self.model : sm.Logit.fit() model
        This is the raw model on the final set of variables from Statsmodels
    self.propscore : Series
        This is the propensity score as calculated by
        self.model.fittedvalues. This may not match dimension of data due to
        dropped missing values, but index will align properly.
    self.strata : Series
        The calculated strata. Missing propensity scores and values outside
        of min of treated group or max of control group are coded as NaN.
    self.logodds : Series
        The linearized propensity score. Will be the same dimension as
        propscore.
    self.test_vars_ord2 : list
        The full list of tested second order variables for reference.
    self.trim_range : tuple
        The result of calculating the optimal trim min and max propensity
        score values.
    self.in_trim : Series (True/False)
        An array where True means that the propensity score falls within
        the trim min/max range.
    """

    def __init__(self, outcome, test_vars, df, init_vars=None, add_cons=True,
                 disp=True, cutoff_ord1=1, cutoff_ord2=2.71, t_strata=1,
                 n_min='auto'):
        # double checking some inputs
        if type(outcome) != str:
            raise ValueError(
                'y must be a string variable name in the DataFrame.')
        if type(test_vars) != list:
            raise ValueError('X must be a list of covariates to test.')
        self.outcome = outcome
        self.test_vars = test_vars
        self.add_cons = add_cons
        self.init_vars = init_vars
        if init_vars and type(init_vars) == str:
            covs = [init_vars] + test_vars
        elif init_vars and type(init_vars) == list:
            covs = init_vars + test_vars
        else:
            covs = test_vars
        if n_min == 'auto':
            n_min_strata = len(covs) + 2
            n_min_tc = 3
        else:
            if type(n_min) != dict:
                raise ValueError('n_min must be "auto" or a dictionary')
            elif ('n_min_tc' not in n_min) or ('n_min_strata' not in n_min):
                raise ValueError('Must specify both n_min_strata (ex. K+2) '
                                 'and n_min_tc (ex. 3)')
            n_min_strata = n_min['n_min_strata']
            n_min_tc = n_min['n_min_tc']
        if 'propscore' in covs + [outcome] or 'logodds' in covs + [outcome]:
            raise ValueError(
                'You cannot have variables labeled "propscore" or "logodds"')
        data = df[[outcome] + covs].copy()
        ord2_vars = []
        dropped_vars = []
        # looping through covariates
        for idx, cc in enumerate(covs):
            # first a gut check to make sure all the variables aren't singular
            if len(data[cc].dropna().unique()) == 1:
                raise ValueError('{} only takes on one value'.format(cc))
            # for all variables generate the interaction terms
            if idx < len(covs):
                for jj in covs[idx + 1:]:
                    testvar = data[cc] * data[jj]
                    if (not testvar.equals(data[cc])
                            and not testvar.equals(data[jj])
                            and len(testvar.dropna().unique()) > 1):
                        data.loc[:, 'X'.join([cc, jj])] = testvar
                        ord2_vars.append('X'.join([cc, jj]))
                    else:
                        dropped_vars.append('X'.join([cc, jj]))
            # for continuous variables, generate squared term
            if not data[cc].equals(data[cc]**2):
                data.loc[:, '{}_sq'.format(cc)] = data[cc]**2
                ord2_vars.append('{}_sq'.format(cc))
            else:
                dropped_vars.append('{}_sq'.format(cc))
        if add_cons:
            data.loc[:, '_cons'] = 1
        self.data = data
        self.dropped_vars = dropped_vars
        self.test_vars_ord2 = ord2_vars
        # =====================================================================
        # Actually calculating propensity score
        # =====================================================================
        linear = self.model_from_group(self.test_vars,
                                       cutoff=cutoff_ord1,
                                       init_vars=self.init_vars)
        squared = self.model_from_group(ord2_vars,
                                        cutoff=cutoff_ord2,
                                        init_vars=linear)
        if add_cons:
            self.model = Logit(self.data[self.outcome],
                               self.data[squared + ['_cons']],
                               missing='drop').fit(disp=False)
        else:
            self.model = Logit(self.data[self.outcome],
                               self.data[squared],
                               missing='drop').fit(disp=False)
        self.logodds = self.model.fittedvalues.rename('logodds')
        self.propscore = Series(self.model.predict(),
                                index=self.logodds.index,
                                name='propscore')
        self.trim_range = self.calc_trim(self.propscore)
        self.in_trim = (self.propscore.ge(self.trim_range[0])
                        & self.propscore.le(self.trim_range[1])).rename('in_trim')
        self.strata = self.stratify(self.data[self.outcome],
                                    self.logodds,
                                    t_max=t_strata,
                                    n_min_strata=n_min_strata,
                                    n_min_tc=n_min_tc)
        if disp:
            print(self.model.summary())
            print('The following vars were infeasible: {}'.format(', '.join(
                self.dropped_vars)))
            print('Stratification produced {} strata'.format(
                len(self.strata.dropna().unique())))

    def best_in_group(self, newvars, basevars=None):
        '''
        Get the best variable for score among a set of new variables
        '''
        if not basevars and self.add_cons:
            basevars = ['_cons']
        elif basevars and self.add_cons:
            basevars = basevars + ['_cons']
        elif not basevars and not self.add_cons:
            raise ValueError(
                'Must specify at least one covariate for baseline model')
        origmod = Logit(self.data[self.outcome], self.data[basevars],
                        missing='drop').fit(disp=False)
        list_llf = []
        for cc in newvars:
            try:
                newmod = Logit(self.data[self.outcome],
                               self.data[basevars + [cc]],
                               missing='drop').fit(disp=False)
                # warn if adding this covariate drops more than 5% of the
                # baseline sample
                if newmod.nobs / origmod.nobs < .95:
                    warnings.warn('Using {} causes more than 5% '
                                  'of the sample to be dropped'.format(cc))
                list_llf.append(newmod.llf)
            except Exception:
                if cc not in self.dropped_vars:
                    self.dropped_vars.append(cc)
                list_llf.append(origmod.llf)
        idx = list_llf.index(max(list_llf))
        # return the best variable and its likelihood-ratio gain
        return newvars[idx], 2 * (list_llf[idx] - origmod.llf)

    def model_from_group(self, test_vars, cutoff, init_vars=None):
        '''
        Iterate through a list over and over until no more contribution
        '''
        remaining = test_vars.copy()
        if init_vars and type(init_vars) == str:
            final = [init_vars]
            init_vars = [init_vars]
        elif init_vars and type(init_vars) == list:
            final = init_vars.copy()
        else:
            final = []
        while len(remaining) > 0:
            temp, gain_add = self.best_in_group(remaining, basevars=final)
            if gain_add > cutoff:
                final.append(temp)
                remaining.remove(temp)
            else:
                break
        return final

    # we will define a static method so that we can call this on any
    # generic series
    @staticmethod
    def stratify(outcome, logodds, n_min_strata, n_min_tc=3, t_max=1):
        """
        Calculate strata from a given outcome variable and log-odds.
        Specify the cutoff for the t-statistic in t_max, or the minimum
        number of observations for each strata in n_min_strata and the
        number of treated or control observations per strata in n_min_tc.

        Parameters
        ----------
        outcome : Series
            Binary variable denoting treatment outcome
        logodds : Series
            The calculated log-odds for that (transformation of propensity
            score).
        n_min_strata : Int
            The minimum number of observations per strata.
        n_min_tc : Int
            The minimum number of treated or control observations per strata.
            Default is 3.
        t_max : Float
            The maximum t-statistic value acceptable in a strata before
            splitting. Default is 1.

        Returns
        -------
        strata : Series
            The calculated strata. Missing propensity scores and values
            outside of min of treated group or max of control group are
            coded as NaN.
        """
        if type(outcome) != Series or type(logodds) != Series:
            raise ValueError('Expecting pandas series as inputs')

        # helper function to facilitate indexing
        def above_med(x):
            return (x >= x.median()).astype(int)

        outcome = outcome.rename('outcome').to_frame()
        df = outcome.join(logodds)
        minmax = df.groupby('outcome')['logodds'].agg(['max', 'min'])
        df = df.loc[df.logodds.ge(minmax.loc[1, 'min'])
                    & df.logodds.le(minmax.loc[0, 'max'])
                    & df.logodds.notnull()]
        # initialize the strata, potential blocks, and the change while loop
        df.loc[:, 'strata'] = 0
        df.loc[:, 'block'] = 0
        change = True
        while change == True:
            # get the medians of the strata
            df.loc[:, 'medgrp'] = df.groupby('strata')['logodds'].apply(above_med)
            for ii in df.strata.unique():
                # simplify the notation
                sub = df.loc[df.strata.eq(ii), :].copy()
                # calculate t-stat and a grouper with number of groups
                t_test = ttest(sub.loc[sub.outcome.eq(1), 'logodds'],
                               sub.loc[sub.outcome.eq(0), 'logodds'],
                               nan_policy='omit').statistic
                n = sub.groupby(['medgrp', 'outcome'])['logodds'].count()
                # make new blocks
                if (t_test > t_max and min(n) >= n_min_tc
                        and min(n.groupby('medgrp').sum()) >= n_min_strata):
                    df.loc[df.strata.eq(ii), 'block'] = df.loc[
                        df.strata.eq(ii), 'medgrp']
            if df.block.sum() == 0:
                change = False
            else:
                # getting ready for next loop
                df.strata = df.groupby(['strata', 'block']).ngroup()
                df.block = 0
        return outcome.join(df.strata).strata

    # we will define a static method so that we can call this on any
    # generic series
    @staticmethod
    def calc_trim(propscore):
        y = 1 / (propscore * (1 - propscore))
        if y.max() <= (2 / y.count()) * (y.sum()):
            return 0, 1
        for gamma in linspace(y.max(), 0, 10000):
            lhs_estimand = (gamma / y.count()) * (y.le(gamma).sum())
            rhs_estimand = (2 / y.count()) * ((y.le(gamma) * y).sum())
            if lhs_estimand < rhs_estimand:
                break
        alpha = .5 - ((.25 - (1 / gamma))**.5)
        return alpha, 1 - alpha
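
# calc_trim is a static method, so the optimal-trimming rule can be exercised
# on any propensity-score Series without building the full class; a synthetic
# sketch:
if __name__ == '__main__':
    import numpy as np

    ps = Series(np.random.default_rng(0).uniform(0.02, 0.98, size=1000),
                name='propscore')
    lo, hi = PropensityScore.calc_trim(ps)
    print('keep propensity scores in [{:.3f}, {:.3f}]'.format(lo, hi))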
Temp_Column_Name = VIF_Df.loc[VIF_Df['VIF'] == Max_VIF, 'Column_Name']
print(Temp_Column_Name, ": ", Max_VIF)

# drop only columns whose VIF is 10 or higher
if (Max_VIF >= 10):
    print(Temp_Column_Name, Max_VIF)
    Train_X_Copy = Train_X_Copy.drop(Temp_Column_Name, axis=1)
    High_VIF_Column_Names.extend(Temp_Column_Name)

Train_x.drop(['Loan_Amount_Term', 'Self_Employed', 'Gender'],
             axis=1, inplace=True)
Test_x.drop(['Loan_Amount_Term', 'Self_Employed', 'Gender'],
            axis=1, inplace=True)
Train_x.shape
Test_x.shape

from statsmodels.api import Logit

Model1 = Logit(Train_y, Train_x).fit()
Model1.summary()

col_names = ['ApplicantIncome', 'Dependents']
Model2 = Logit(Train_y, Train_x.drop(col_names, axis=1)).fit()
Model2.summary()

Test_x.drop(['ApplicantIncome', 'Dependents'], axis=1, inplace=True)
Test_x['Predict'] = Model2.predict(Test_x)
Test_x.columns
Test_x['Predict'][0:6]

import numpy as np

Test_x['Test_class'] = np.where(Test_x['Predict'] >= 0.5, 1, 0)
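
# The VIF_Df referenced above is not built in this fragment; a common
# construction (assuming Train_X_Copy is an all-numeric DataFrame) would be:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

VIF_Df = pd.DataFrame({
    'Column_Name': Train_X_Copy.columns,
    'VIF': [variance_inflation_factor(Train_X_Copy.values, i)
            for i in range(Train_X_Copy.shape[1])]
})
Max_VIF = VIF_Df['VIF'].max()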
class LogisticRegression:
    def __init__(self,
                 endog_name_f=None,
                 exog_name_f=None,
                 data_f=None,
                 add_constant_f=True,
                 scale_vars_list_f=list(),
                 interaction_name_f=list(),
                 convert_bool_dict_f=dict(),
                 convert_ord_list_f=list(),
                 cat_col_omit_dict_f=dict(),
                 hier_model_vars_dict_f=dict(),
                 hier_exog_var_names_f=list(),
                 classification_threshold_f=0.5,
                 **kwds):
        self.endog_name = endog_name_f
        self.exog_name = exog_name_f
        self.data = data_f.reindex()
        self.add_constant = add_constant_f
        self.interaction_name = interaction_name_f
        self.convert_bool_dict = convert_bool_dict_f
        self.convert_ord_list = convert_ord_list_f
        self.hier_model_vars_dict = hier_model_vars_dict_f
        self.hier_exog_var_names = hier_exog_var_names_f
        self.cat_col_names = list()
        self.cat_col_omit_dict = cat_col_omit_dict_f
        self.cat_col_drop_names = list()
        self.dummy_col_omit_list = list()
        self.scale_vars_list = scale_vars_list_f
        self.classification_threshold = classification_threshold_f
        self.exog_name_model = None
        self.model_data = None
        self.model = None
        self.model_result = None
        self.est_coef = dict()
        self.exog_matrix = None
        self.endog_matrix = None
        self.fitted_values = None
        self.refresh_model_data()

    def check_for_exog_conflict(self):
        t_bool_ord = set(self.convert_bool_dict.keys()).intersection(
            set(self.convert_ord_list))
        t_cat_bool = set(self.cat_col_omit_dict.keys()).intersection(
            set(self.convert_bool_dict.keys()))
        t_cat_ord = set(self.cat_col_omit_dict.keys()).intersection(
            set(self.convert_ord_list))
        t_hier_exog = set(self.exog_name).intersection(
            set(self.hier_exog_var_names))

        if len(t_bool_ord) > 0:
            print('WARNING: appearing in both boolean and ordinal variable '
                  'lists: %s' % ', '.join(t_bool_ord))
        if len(t_cat_ord) > 0:
            print('WARNING: appearing in both categorical and ordinal variable '
                  'lists: %s, ignoring categorical' % ', '.join(t_cat_ord))
        if len(t_cat_bool) > 0:
            print('WARNING: appearing in both categorical and boolean variable '
                  'lists: %s, ignoring categorical' % ', '.join(t_cat_bool))
        if len(t_hier_exog) > 0:
            print('WARNING: appearing in both exogenous and hierarchical '
                  'exogenous variable lists: %s' % ', '.join(t_hier_exog))

    def convert_cat_to_dummies(self):
        # get list of exogenous variables that are categorical and need to be converted
        self.cat_col_names = [
            x for x in self.exog_name
            if ((x not in list(self.convert_bool_dict.keys()))
                and (x not in self.convert_ord_list)
                and (self.data[x].dtype == 'O'))
        ]
        prefix_sep = '_'
        # default omitted level for each categorical variable is its mode
        [
            self.cat_col_omit_dict.update(
                {x: self.data[x].mode(dropna=True).values[0]})
            for x in self.cat_col_names
            if x not in list(self.cat_col_omit_dict.keys())
        ]
        self.cat_col_drop_names = [
            k + prefix_sep + v for k, v in self.cat_col_omit_dict.items()
        ]
        if len(self.cat_col_names) > 0:
            return pd.get_dummies(self.data[self.cat_col_names],
                                  prefix_sep=prefix_sep,
                                  columns=self.cat_col_names,
                                  dtype=bool)
        else:
            return None

    def convert_to_bool(self):
        t_df = pd.DataFrame()
        t_col_names = list()
        for k, v in self.convert_bool_dict.items():
            t_col_names.append(k + '_' + v + '_TF')
            t_df = pd.concat([t_df, self.data[k] == v], axis=1)
        t_df.columns = t_col_names
        return t_df

    def convert_to_ordinal(self):
        t_df = pd.DataFrame()
        t_col_names = list()
        for c in self.convert_ord_list:
            t_col_names.append(c + '_ORD')
            t_df = pd.concat([t_df, self.data[c].astype(int)], axis=1)
        t_df.columns = t_col_names
        return t_df

    def create_hier_vars(self):
        t_df = pd.DataFrame()
        for c in self.hier_model_vars_dict.keys():
            ext = self.hier_model_vars_dict[c]['external_model']
            t_model = LogisticRegression(
                endog_name_f=ext.endog_name,
                exog_name_f=ext.exog_name,
                data_f=ext.data,
                add_constant_f=ext.add_constant,
                scale_vars_list_f=ext.scale_vars_list,
                convert_ord_list_f=ext.convert_ord_list,
                convert_bool_dict_f=ext.convert_bool_dict,
                cat_col_omit_dict_f=ext.cat_col_omit_dict,
                interaction_name_f=ext.interaction_name,
                classification_threshold_f=self.hier_model_vars_dict[c][
                    'classification_threshold'])
            t_model.create_model_object()
            # predictions come from the already-fitted external model,
            # applied to the exog matrix of the freshly coded data
            t_pred_prob, t_pred_class = ext.make_predictions(
                pred_data=t_model.exog_matrix,
                select_coef=self.hier_model_vars_dict[c]['select_coef'])
            t_col_names = list(t_df.columns) + [c, c + '_TF']
            t_df = pd.concat([t_df, t_pred_prob, t_pred_class], axis=1)
            t_df.columns = t_col_names
        return t_df

    def create_interactions(self):
        def create_dummy_df(data_f, v1, v2, drop_list_f):
            prefix_sep = '_'
            if (data_f[v1].dtype == bool) and (data_f[v2].dtype == bool):
                # both boolean - create the interaction effect directly
                t_df = pd.DataFrame(data_f[v1] & data_f[v2],
                                    columns=[v1 + ' * ' + v2 + '_INT'])
                return t_df, ({v1: None}, {v2: None})
            elif (data_f[v1].dtype != bool) and (data_f[v2].dtype != bool):
                # both categorical
                v1_dummies = pd.get_dummies(data_f[v1],
                                            prefix_sep=prefix_sep, dtype=bool)
                v1_omit = data_f[v1].mode(dropna=True).values[0] \
                    if v1 not in list(drop_list_f.keys()) else drop_list_f[v1]
                v2_dummies = pd.get_dummies(data_f[v2],
                                            prefix_sep=prefix_sep, dtype=bool)
                v2_omit = data_f[v2].mode(dropna=True).values[0] \
                    if v2 not in list(drop_list_f.keys()) else drop_list_f[v2]
                t_df = pd.DataFrame(index=data_f.index)
                for c1 in [x for x in v1_dummies.columns if x != v1_omit]:
                    for c2 in [x for x in v2_dummies.columns if x != v2_omit]:
                        t_df = pd.concat([
                            t_df,
                            pd.DataFrame(v1_dummies[c1] & v2_dummies[c2],
                                         columns=[c1 + ' * ' + c2 + '_INT'])
                        ], axis=1)
                return t_df, ({v1: v1_omit}, {v2: v2_omit})
            else:
                # exactly one boolean
                if data_f[v1].dtype == bool:
                    vb, vd = v1, v2
                else:
                    vb, vd = v2, v1
                vd_dummies = pd.get_dummies(data_f[vd],
                                            prefix_sep=prefix_sep, dtype=bool)
                vd_omit = data_f[vd].mode(dropna=True).values[0] \
                    if vd not in list(drop_list_f.keys()) else drop_list_f[vd]
                t_df = pd.DataFrame(index=data_f.index)
                for c in [x for x in vd_dummies.columns if x != vd_omit]:
                    t_df = pd.concat([
                        t_df,
                        pd.DataFrame(data_f[vb] & vd_dummies[c],
                                     columns=[vb + ' * ' + c + '_INT'])
                    ], axis=1)
                return t_df, ({vb: None}, {vd: None})

        t_all_data = pd.concat([
            self.data,
            self.model_data[np.setdiff1d(self.model_data.columns,
                                         self.data.columns)]
        ], axis=1)
        t_df = pd.DataFrame(index=self.data.index)
        t_dummy_col_omit_list = list()
        for int_act_col1, int_act_col2 in self.interaction_name:
            t_dummy, t_dummy_omit = create_dummy_df(
                data_f=t_all_data,
                v1=int_act_col1,
                v2=int_act_col2,
                drop_list_f=self.cat_col_omit_dict)
            t_df = pd.concat([t_df, t_dummy], axis=1)
            t_dummy_col_omit_list.append(t_dummy_omit)
            del t_dummy, t_dummy_omit
        del int_act_col1, int_act_col2
        self.dummy_col_omit_list = t_dummy_col_omit_list
        return t_df

    def code_variables(self):
        # get new variable matrices
        if len(self.convert_bool_dict) > 0:
            df_bool_f = self.convert_to_bool()
        else:
            df_bool_f = None
        if len(self.convert_ord_list) > 0:
            df_ord_f = self.convert_to_ordinal()
        else:
            df_ord_f = None
        df_cat_f = self.convert_cat_to_dummies()
        return df_bool_f, df_ord_f, df_cat_f

    def refresh_model_data(self):
        df_bool_f, df_ord_f, df_cat_f = self.code_variables()
        self.check_for_exog_conflict()
        t_remain_exog = [
            x for x in self.exog_name
            if ((x not in list(self.convert_bool_dict.keys()))
                and (x not in list(self.convert_ord_list))
                and (x not in self.cat_col_names))
        ]
        if df_cat_f is not None:
            df_cat_f_dropped_omit = df_cat_f[[
                c for c in df_cat_f.columns if c not in self.cat_col_drop_names
            ]]
        else:
            df_cat_f_dropped_omit = None
        self.model_data = pd.concat([
            self.data[self.endog_name], self.data[t_remain_exog], df_bool_f,
            df_ord_f, df_cat_f_dropped_omit
        ], axis=1)

        # add predictions for fold based on estimation of lower model
        if len(self.hier_model_vars_dict) > 0:
            df_hier_f = self.create_hier_vars()
            self.data[df_hier_f.columns] = df_hier_f
            self.model_data[self.hier_exog_var_names] = df_hier_f[
                self.hier_exog_var_names]

        # add interaction variables
        if len(self.interaction_name) > 0:
            df_interaction_f = self.create_interactions()
            self.model_data[[x for x in df_interaction_f.columns
                             ]] = df_interaction_f

        self.exog_name_model = [
            x for x in self.model_data if x != self.endog_name
        ]

    def create_model_object(self):
        model_mat = copy.deepcopy(self.model_data)
        # convert booleans to floats explicitly
        for c in model_mat.columns:
            if model_mat[c].dtype == bool:
                model_mat[c] = model_mat[c].astype(float)
        # scale specified vars to N(0,1)
        for c in self.scale_vars_list:
            try:
                xbar = model_mat[c].mean()
                s = model_mat[c].std()
                model_mat[c] = model_mat[c].apply(lambda x: (x - xbar) / s)
                del xbar, s
            except KeyError:
                print('Warning: specified variable to scale, %s, is not '
                      'included in model covariates' % c)
        # drop rows with NA
        model_mat.dropna(inplace=True)
        # add constant if needed
        if self.add_constant:
            model_mat = pd.concat([
                pd.DataFrame(data=[1] * model_mat.shape[0],
                             index=model_mat.index,
                             columns=['const']), model_mat
            ], axis=1)
        self.endog_matrix = model_mat[self.endog_name]
        self.exog_matrix = model_mat[[
            c for c in model_mat.columns if c != self.endog_name
        ]]
        self.model = Logit(endog=self.endog_matrix, exog=self.exog_matrix)

    def estimate_model(self):
        self.refresh_model_data()
        self.create_model_object()
        self.model_result = self.model.fit()
        self.est_coef.update(
            dict(
                zip(list(self.exog_matrix.columns),
                    self.model_result._results.params)))
        self.make_predictions()  # predict values of training data
        print(self.model_result.summary())

    def make_predictions(self, pred_data=None, select_coef=None):
        def utility_calc(coef_fff, data_fff):
            return np.matmul(np.array(data_fff),
                             np.array(coef_fff).reshape(len(coef_fff),
                                                        1)).flatten()

        def matrix_pred_calc(coef_ff, data_ff):
            return np.exp(utility_calc(coef_ff, data_ff)) / (
                1 + np.exp(utility_calc(coef_ff, data_ff))).flatten()

        def classify_pred(prob_ff, threshold_ff):
            return prob_ff > threshold_ff

        if pred_data is None:
            if select_coef is None:
                self.fitted_values = self.model_result.predict(self.exog_matrix)
                return self.fitted_values, classify_pred(
                    self.fitted_values, self.classification_threshold)
            else:
                t_pred = pd.Series(matrix_pred_calc(
                    coef_ff=[self.est_coef.get(key) for key in select_coef],
                    data_ff=self.exog_matrix[select_coef]),
                    index=self.exog_matrix.index)
                return t_pred, classify_pred(t_pred,
                                             self.classification_threshold)
        else:
            if select_coef is None:
                t_pred = self.model_result.predict(pred_data.loc[:, [
                    x for x in pred_data.columns
                    if x in list(self.est_coef.keys())
                ]])
                return t_pred, classify_pred(t_pred,
                                             self.classification_threshold)
            else:
                t_pred = pd.Series(matrix_pred_calc(
                    coef_ff=[self.est_coef.get(key) for key in select_coef],
                    data_ff=pred_data[select_coef]),
                    index=pred_data.index)
                return t_pred, classify_pred(t_pred,
                                             self.classification_threshold)
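# --- Usage sketch for the wrapper above (added, illustrative; the data and
# --- column names are synthetic assumptions, not from the original source).
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 200
df = pd.DataFrame({
    'income': rng.normal(50, 12, n).round(1),
    'region': rng.choice(['N', 'S', 'W'], size=n),
})
# default probability falls with income, with noise, so no perfect separation
p_default = 1 / (1 + np.exp((df['income'] - 50) / 10))
df['default'] = (rng.uniform(size=n) < p_default).astype(int)

mod = LogisticRegression(endog_name_f='default',
                         exog_name_f=['income', 'region'],
                         data_f=df,
                         scale_vars_list_f=['income'])
mod.estimate_model()                     # fits the Logit and prints the summary
probs, classes = mod.make_predictions()  # probabilities and classifications at the threshold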
def fit_model(X, y, co=0.1):
    # binarize y at the cutoff `co`, clip the regressors to [0, 1],
    # and drop rows with missing values before fitting
    sm = Logit((y.clip(0, 1) > co).astype(float), X.clip(0, 1), missing='drop')
    return sm.fit(disp=False)
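# --- Usage sketch (added, illustrative; the arrays below are made up).
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
X = pd.DataFrame({'const': 1.0, 'tf_level': rng.uniform(0, 1, 200)})
y = pd.Series(rng.uniform(0, 1, 200))
res = fit_model(X, y, co=0.1)
print(res.params)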
# (imports assumed earlier in the original script)
import statsmodels.api as sm
from statsmodels.api import GLM, Logit
from pandas import DataFrame

# Matrix of predictor variables: height and weight from data frame
# into an Nx2 array.
hw_exog = heights_weights[['Height', 'Weight']].values

# Logit model 1: Using GLM and the Binomial Family w/ the Logit Link
# Note I have to add constants to the `exog` matrix. The prepend = True
# argument prevents a warning about future change to the default argument.
logit_model = GLM(male,
                  sm.add_constant(hw_exog, prepend=True),
                  family=sm.families.Binomial(sm.families.links.logit))
logit_model.fit().summary()

# Get the coefficient parameters.
logit_pars = logit_model.fit().params

# Logit model 2: Using the Logit function.
logit_model2 = Logit(male, sm.add_constant(hw_exog, prepend=True))
logit_model2.fit().summary()

# Get the coefficient parameters.
logit_pars2 = logit_model2.fit().params

# Compare the two methods again. They give the same parameters.
DataFrame({'GLM': logit_pars, 'Logit': logit_pars2})

# Draw a separating line in the [height, weight]-space.
# The line will separate the space into predicted-male
# and predicted-female regions.
# Get the intercept and slope of the line based on the logit coefficients.
intercept = -logit_pars['const'] / logit_pars['x2']
slope = -logit_pars['x1'] / logit_pars['x2']
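# --- Derivation note (added): the boundary-line formulas follow from setting
# --- the predicted log-odds to zero, i.e. const + x1*Height + x2*Weight = 0,
# --- which rearranges to Weight = -(const/x2) - (x1/x2)*Height. Hence
# --- intercept = -const/x2 and slope = -x1/x2, as computed above.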
import numpy as np
from statsmodels.api import Logit, add_constant
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
print("Done ...")

print("\n*** Recreate Train Data ***")
dfX_train = dfTrain[allCols]
y_train = dfTrain[clsVars].values
print("Done ...")

# model object
print("\n*** Model ***")
# add intercept manually
dfX_train_const = add_constant(dfX_train)
# build model and fit training data
model = Logit(y_train, dfX_train_const).fit()
# print the model summary
print(model.summary())
print("Done ...")

################################
# Classification - Predict Train
# evaluate : Accuracy & Confusion Metrics
################################

# probability distribution for train data
prob_train = model.predict(dfX_train_const)
# sort the prob dist for visualization
sorted_train = sorted(prob_train.values)
index_train = np.arange(len(sorted_train))
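# --- Sketch (added): turn the probabilities into classes and score them with
# --- the sklearn metrics imported above; the 0.5 cutoff is an assumption.
pred_train = (prob_train >= 0.5).astype(int)
print(accuracy_score(y_train, pred_train))
print(confusion_matrix(y_train, pred_train))
print(classification_report(y_train, pred_train))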
# ## Model

# ### Logistic Regression

# In[ ]:

import numpy as np
from statsmodels.api import Logit, add_constant

titanic_ = add_constant(titanic)

# In[ ]:

model_ = Logit(titanic_['Survived'], titanic_.drop(['Survived'], axis=1))
result = model_.fit()
result.summary()

# In[ ]:

odd_ratio = np.exp(result.params)
odd_ratio

# ### Extract the target variable
# Create a dataframe with X as the input features and y as the target (Survived)

# In[ ]:
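# --- Interpretation note (added): each entry of odd_ratio is exp(beta) for
# --- the corresponding coefficient. For example, a hypothetical Fare
# --- coefficient of 0.02 gives np.exp(0.02) ~ 1.02, i.e. about 2% higher
# --- odds of survival per unit increase in Fare, holding the rest fixed.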
for poly in polys[target_gene]:
    in_central = poly.contains_points(
        atlas_coords.ix[:, ['X', 'Z'], time_point].T
    )
    not_expr = atlas_expr.ix[:, target_gene, time_point] < co
    in_central |= not_expr
    print(sum(in_central))
    #in_central = (x_coord < 45)
    #in_central = x_coord_scale < 0.6

    #fitter = logistic.LogisticRegression(fit_intercept=False)
    #fitter.fit(X.ix[in_central, :], y.ix[in_central] > co)
    sm_fitter = Logit(y.ix[in_central].clip(0, 1),
                      X.ix[in_central].clip(0, 1))
    sm_fit = sm_fitter.fit()

    Y_tmp = atlas_expr.ix[in_central, target_gene, time_point].copy()
    Y_tmp /= Y_tmp.max()
    Y_tmp = 1.0 * (Y_tmp > .5)

    all_regs = atlas_expr.ix[:, all_regs, time_point].count(axis=1) > 0
    all_regs = all_regs.index[all_regs]

    #if True:
    #if (poly == poly1) or (poly == poly2) or (poly == poly12):
    if target_gene == 'hb':
        #best_tfs = ['bcdP', 'hkb', 'hkb2', 'KrP', 'bcdP2', 'const']
        #best_tfs = ['bcdP', 'bcdP2', 'gtP', 'kni', 'hkb', 'KrP', 'const']
        pass  # (snippet continues in the original source)
def fit_model(df, formula, title="Full", fp=None, filename="Model", save=False):
    """
    Fit a model, collect stats, and save predictions and the model.

    df: dataframe
    formula: formula
    title: title of model (Default: "Full")
    fp: file pointer (Default: None)
    filename: model and data file prefix (Default: "Model")
    save: whether to save predictions, model, both, or none
          ["Both", "Data", "Model", False] (Default: False)
    """
    # `ems`, `plot_prc`, and `save_data` are helpers from the original
    # module (not shown here)
    if df.shape[0] < 10:
        print("Too few instances. Skipping. Make sure you have at least 10 instances.")
        return None, None
    print("Modelling Model[%s] with %s instances" % (title, df.shape[0]))
    print("Using formula:\n %s" % (formula))
    print("Generating patsy matrices")
    y, X = patsy.dmatrices(formula, df, return_type="dataframe")
    print("Initializing model")
    model = Logit(y, X)
    print("Fitting model")
    res = model.fit()
    print(title, "\n", res.summary2())
    print("Confusion Matrix:", res.pred_table())
    precision = ems.precision(res.pred_table())
    recall = ems.recall(res.pred_table())
    accuracy = ems.accuracy(res.pred_table())
    f_score = ems.fscore_measure(res.pred_table())
    rmse = ems.rmse(res.predict(), model.endog)
    mae = ems.mae(res.predict(), model.endog)
    auc = ems.auc(res.predict(), model.endog)
    prc = ems.prc(res.predict(), model.endog)
    prc_filename = "%s.pdf" % filename
    plot_prc(prc, prc_filename)
    evaluation_metrics = (
        "[Model Measures]: Confusion Matrix: %s\n"
        "RMSE: %s\tMAE: %s\tAUC: %s\n"
        "Precision: %s\tRecall: %s\tAccuracy: %s\tF1-Score: %s\n"
        "PRC:\n%s"
    ) % (res.pred_table(), rmse, mae, auc, precision, recall, accuracy,
         f_score, prc_filename)
    print(evaluation_metrics)
    print("[save=%s]" % save, "" if save else "Not", "Saving Model to %s" % filename)
    if fp is not None:
        print("Modelling Model[%s] with %s instances" % (title, df.shape[0]), file=fp)
        print("Using formula:\n %s" % (formula), file=fp)
        print(title, "\n", res.summary2(), file=fp)
        print(evaluation_metrics, file=fp)
        print("[save=%s]" % save, "" if save else "Not",
              "Saving Model to %s" % filename, file=fp)
    model_save, data_save = False, False
    if save == "Both":
        model_save, data_save = True, True
    if save == "Model" or model_save:
        model_file = "%s.pkl" % filename
        res.save(model_file, remove_data=True)  # save the model
    if save == "Data" or data_save:
        data_file = "%s.data.txt" % filename  # include predictions
        print("df.index", df.index)
        save_data(df[["from_id", "is_self_cite"]], res.predict(), filename=data_file)
    print("Done Saving")
    return model, res
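# --- Usage sketch (added, illustrative): the dataframe, formula, and column
# --- names below are assumptions, not from the original module.
# model, res = fit_model(citations_df,
#                        "is_self_cite ~ year_gap + n_common_authors",
#                        title="SelfCite", filename="selfcite_model",
#                        save="Both")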
def logregress(df, X, y):
    # `_items` and `_value` are helpers defined elsewhere in the source
    # (not shown here); they pull regressor columns and the target out of df
    dfX = _items(df, X)
    dfy = _value(df, y)
    model = Logit(dfy, dfX)
    result = model.fit_regularized()
    return result.summary()
def logregress_loose(X, y, *args, **kwargs):
    # `_series` is a helper defined elsewhere in the source (not shown here)
    X = list(zip(*(_series(x) for x in X)))
    y = _series(y)
    model = Logit(y, X)
    result = model.fit(*args, **kwargs)
    return result.summary()
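# --- Note (added): logregress() above uses Logit.fit_regularized(),
# --- statsmodels' L1-penalized estimator, while logregress_loose() uses the
# --- plain fit(). The penalized variant is the more robust choice when the
# --- regressors are collinear or the classes are quasi-separable.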