def _compareFit(self, q):
    """
    Fit the quantile model at ``q`` and an OLS model, and collect both fits.

    For each of the two fits the first two parameters (intercept and first
    slope) are reported together with the bounds of their confidence
    intervals.

    Parameters
    ----------
    q : float
        Quantile passed to ``self.fit``; echoed back under ``'quantile'``.

    Returns
    -------
    dict
        Keys ``'quantile'``, then for ``α₀`` and ``ρ₁`` and for each of the
        ``(τ)`` and ``(OLS)`` fits: the estimate, ``:LB`` and ``:UB``.
    """

    def _estimates(result):
        # One conf_int() call per fitted model instead of four.
        params = result.params
        intervals = result.conf_int()
        # The intercept row is looked up under the label 'const'; the slope
        # row under whatever label the second parameter carries.
        intercept_ci = intervals.loc['const']
        slope_ci = intervals.loc[params.index[1]]
        # .iloc makes the positional access explicit; plain ``series[0]`` on
        # a label-indexed Series is deprecated in recent pandas.
        return (
            params.iloc[0], intercept_ci.iloc[0], intercept_ci.iloc[1],
            params.iloc[1], slope_ci.iloc[0], slope_ci.iloc[1],
        )

    # QAR
    (alpha_tau, alpha_tau_lb, alpha_tau_ub,
     rho_tau, rho_tau_lb, rho_tau_ub) = _estimates(self.fit(q))

    # OLS
    (alpha_ols, alpha_ols_lb, alpha_ols_ub,
     rho_ols, rho_ols_lb, rho_ols_ub) = _estimates(OLS(self.y, self.X).fit())

    return {
        'quantile': q,
        'α₀(τ)': alpha_tau,
        'α₀(τ):LB': alpha_tau_lb,
        'α₀(τ):UB': alpha_tau_ub,
        'α₀(OLS)': alpha_ols,
        'α₀(OLS):LB': alpha_ols_lb,
        'α₀(OLS):UB': alpha_ols_ub,
        'ρ₁(τ)': rho_tau,
        'ρ₁(τ):LB': rho_tau_lb,
        'ρ₁(τ):UB': rho_tau_ub,
        'ρ₁(OLS)': rho_ols,
        'ρ₁(OLS):LB': rho_ols_lb,
        'ρ₁(OLS):UB': rho_ols_ub,
    }
class LinearRegression:
    """
    Class for OLS regression models based on the excellent statsmodels package.

    Parameters
    ----------
    method : 'enter' or 'backward'
        Method for predictors selection
    include_constant : bool
        (CURRENTLY UNAVAILABLE) Whether to include constant in the model
    sig_level_entry : float
        (CURRENTLY UNAVAILABLE) Max significance level to include predictor
        in the model
    sig_level_removal : float
        Min significance level to exclude predictor from the model

    Attributes
    ----------
    variables_excluded : list
        Variables excluded because of zero variance
    variables_included : list
        Variables included in a model
    predictions : pd.Series
        Predicted values
    N : int
        Number of observations included in a model
    r2 : float
        R-squared (coefficient of determination)
    r2_adjusted : float
        Adjusted r-squared
    F : float
        F-statistic
    F_pvalue : float
        P-value for F-statistic
    ess : float
        Explained sum of squares
    rss : float
        Residual sum of squares
    tss : float
        Total sum of squares
    coefficients : pd.Series
        Regression coefficients
    coefficients_sterrors : pd.Series
        Standard errors of regression coefficients
    coefficients_tvalues : pd.Series
        T-statistics of regression coefficients
    coefficients_pvalues : pd.Series
        P-values of regression coefficients
    """

    def __init__(self,
                 method='enter',
                 include_constant=True,
                 sig_level_entry=0.05,
                 sig_level_removal=0.05):
        self.method = method.lower().strip()
        self.include_constant = include_constant
        self.sig_level_entry = sig_level_entry
        self.sig_level_removal = sig_level_removal

    def fit(self,
            data,
            formula,
            categorical_variables=None,
            show_results=True,
            confidence_intervals=True,
            collinearity_statistics=False,
            use_patsy_notation=False,
            n_decimals=3):
        """
        Fit model to the given data using formula.

        Parameters
        ----------
        data : pd.DataFrame
            Data to fit a model
        formula : str
            Formula of a model specification, e.g. 'y ~ x1 + x2';
            should be passed either in Patsy (statsmodels) notation
            or using the following rules:
            '*' for interaction of the variables,
            ':' for interaction & main effects,
            i.e., 'y ~ x:z' equals to 'y ~ x + z + x*z' (unlike the Patsy
            notation). If you use Patsy notation, please specify the
            parameter use_patsy_notation=True.
        categorical_variables : list
            List of names of the variables that should be considered
            categorical. These variables would be automatically converted
            into sets of dummy variables. Names are matched as whole
            identifiers, so a name that is a prefix of another variable
            (e.g. 'imdb' and 'imdb_rate') is handled correctly.
        show_results : bool
            Whether to show results of analysis
        confidence_intervals : bool
            Whether to include coefficients' confidence intervals
            in the summary table
        collinearity_statistics : bool
            Whether to include coefficients' tolerance and VIF
            in the summary table
        use_patsy_notation : bool
            Turn this on if you use strictly Patsy's rules to define a
            formula.
            See more: https://patsy.readthedocs.io/en/latest/quickstart.html
        n_decimals : int
            Number of digits to round results when showing them

        Returns
        -------
        self
            The current instance of the LinearRegression class

        Raises
        ------
        ValueError
            If categorical_variables is given but is not a list
        """
        import re

        self._data = data.copy()
        self.categorical_variables = categorical_variables
        self._show_ci = confidence_intervals
        self._show_col = collinearity_statistics

        if '=' in formula:
            formula = formula.replace('=', '~')

        if not use_patsy_notation:
            # Swap '*' and ':' via a temporary marker to convert the
            # class's own notation into Patsy's.
            formula = formula.replace('*', '^').replace(':', '*').replace(
                '^', ':')

        self.formula = formula

        if categorical_variables is not None:
            if not isinstance(categorical_variables, list):
                raise ValueError(
                    "Categorical variables should be passed as list. "
                    f"Type {type(categorical_variables)} was passed instead.")
            for variable in categorical_variables:
                # Whole-identifier match: a plain substring replace would
                # also rewrite 'imdb' inside 'imdb_rate'.
                pattern = r'(?<![\w.])' + re.escape(variable) + r'(?![\w.])'
                wrapped = f'C({variable})'
                formula = re.sub(pattern, lambda _m, w=wrapped: w, formula)

        # Formula actually handed to statsmodels (with C() wrapping); reused
        # whenever a design matrix has to be rebuilt for other data.
        self._design_formula = formula

        self._model = ols(formula=formula, data=data).fit()
        self._observations_idx = list(self._model.fittedvalues.index)
        self.dependent_variable = self._model.model.endog_names

        self.variables_excluded = self._identify_variables_without_variation()

        if len(self.variables_excluded) > 0:
            y = pd.Series(self._model.model.endog.copy(),
                          index=self._observations_idx,
                          name=self.dependent_variable)
            X = self._remove_variables_without_variation()
            self._model = OLS(y, X, missing='drop').fit()
            self.variables_excluded = [
                LinearRegression._translate_from_patsy_notation(x)
                for x in self.variables_excluded
            ]

        if self.method == 'backward':
            self._fit_backward()

        self._get_statistics_from_model()
        self.predictions = self.predict()

        if show_results:
            self.show_results(n_decimals)
            if len(self.variables_excluded) > 0:
                print('------------------\n')
                print(
                    f"Following variables were excluded due to zero variance: {'; '.join(self.variables_excluded)}"
                )

        return self

    def _build_design_matrix(self, data):
        """
        Evaluate the fitted formula on `data` and return its design matrix.

        The whole formula is evaluated, so `data` has to contain every
        column the formula refers to. Column names are translated from
        Patsy notation to the notation used in `variables_included`.
        """
        aux_model = ols(self._design_formula, data).fit()
        columns = [
            LinearRegression._translate_from_patsy_notation(x)
            for x in aux_model.model.exog_names
        ]
        return pd.DataFrame(aux_model.model.exog,
                            index=aux_model.fittedvalues.index,
                            columns=columns)

    def predict(
            self,
            data=None,
            add_to_data=False,
    ):
        """
        Predict values of a dependent variable for the given data
        using the fitted model.

        Parameters
        ----------
        data : pd.DataFrame
            Data for predictions, may be not specified if you want to
            predict values for the same data that were used to fit a model
        add_to_data : bool
            Whether to merge predictions with the given data.
            Currently, this option returns data with a sorted index

        Returns
        -------
        pd.DataFrame
            Predictions
        """
        name = f'{self.dependent_variable} (predicted)'

        if data is None:
            data_init = self._data.copy()
            data_init[name] = self._model.fittedvalues
            if add_to_data:
                return data_init
            return data_init[name].copy()

        # Apply the coefficients estimated in fit() to the new design matrix.
        # (Re-estimating a model on `data` would return that model's fitted
        # values, not predictions of the fitted model.)
        design = self._build_design_matrix(data)
        predictors = design[self.variables_included]
        weights = self.coefficients[self.variables_included]
        values = np.dot(predictors.values.astype(float),
                        weights.values.astype(float))
        if self.include_constant:
            values = values + self.coefficients['Constant']

        result = pd.Series(values, index=predictors.index, name=name)
        if add_to_data:
            result = pd.concat([data, result], axis=1, sort=False)
        return result

    def _get_statistics_from_model(self):
        """Copy the statistics of the current statsmodels fit onto self."""
        self.N = self._model.nobs
        self.r2 = self._model.rsquared
        self.r2_adjusted = self._model.rsquared_adj
        self.F = self._model.fvalue
        self.F_pvalue = self._model.f_pvalue
        self.ess = self._model.ess
        self.rss = self._model.ssr
        if self.include_constant:
            self.tss = self._model.centered_tss
        else:
            self.tss = self._model.uncentered_tss
        self.ms_model = self._model.mse_model
        self.ms_resid = self._model.mse_resid
        self.ms_total = self._model.mse_total
        self.dof_model = self._model.df_model
        self.dof_resid = self._model.df_resid
        self.dof_total = self.dof_model + self.dof_resid

        self.coefficients = self._model.params.copy()
        self.coefficients_sterrors = self._model.bse.copy()
        self.coefficients_tvalues = self._model.tvalues.copy()
        self.coefficients_pvalues = self._model.pvalues.copy()

        variables_included = [
            x for x in list(self.coefficients.index) if x != 'Intercept'
        ]
        self._variables_included_patsy = variables_included.copy()

        variables_included = [
            LinearRegression._translate_from_patsy_notation(x)
            for x in variables_included
        ]
        self.variables_included = variables_included

        if self.include_constant:
            self._params_idx = ['Constant'] + variables_included
        else:
            self._params_idx = variables_included.copy()

        # Re-label every coefficient table with the readable names.
        for stats in [
                self.coefficients, self.coefficients_pvalues,
                self.coefficients_sterrors, self.coefficients_tvalues
        ]:
            stats.index = self._params_idx

        return

    @property
    def coefficients_beta(self):
        """Standardized coefficients; NaN is reported for the constant."""
        b = np.array(self._model.params)[1:]
        std_y = self._model.model.endog.std(axis=0)
        std_x = self._model.model.exog.std(axis=0)[1:]
        beta = list(b * (std_x / std_y))

        if self.include_constant:
            beta = [np.nan] + beta

        result = pd.Series(beta, index=self._params_idx)
        return result

    @property
    def coefficients_confidence_interval(self):
        """Confidence bounds of the coefficients as a two-column table."""
        ci = self._model.conf_int()
        ci.index = self._params_idx
        ci.columns = ['LB CI (95%)', 'UB CI (95%)']
        return ci

    @property
    def coefficients_VIF(self):
        """
        Variance inflation factors: the diagonal of the inverse correlation
        matrix of the predictors; NaN is reported for the constant.
        """
        x = self._model.model.exog[:, 1:].copy()
        if x.shape[1] == 0:
            diag = []
        else:
            # np.corrcoef replaces the SciPy alias, which is no longer
            # shipped. With a single predictor it returns a scalar, hence
            # atleast_2d before inverting.
            corr = np.atleast_2d(np.corrcoef(x, rowvar=False))
            diag = list(np.linalg.inv(corr).diagonal())
        if self.include_constant:
            diag = [np.nan] + diag
        return pd.Series(diag, index=self._params_idx)

    @property
    def coefficients_tolerance(self):
        """Tolerance of each coefficient, i.e. 1 / VIF."""
        return 1 / self.coefficients_VIF

    @staticmethod
    def _translate_from_patsy_notation(effect):
        """Turn a Patsy term name (e.g. 'C(x)[T.a]:z') into readable form."""
        effect = effect\
        .replace(':', ' * ')\
        .replace('C(', '')\
        .replace('T.', '')\
        .replace('[', ' = "')\
        .replace(']', '"')\
        .replace(')', '')

        return effect

    @staticmethod
    def _styled(table, caption, n_decimals, na_rep=None):
        """
        Return a Styler for `table` with a caption and rounded numbers.

        Uses ``Styler.format(precision=...)`` where available and falls back
        to ``Styler.set_precision`` on pandas versions whose ``format`` does
        not accept ``precision``.
        """
        styler = table.style.set_caption(caption)
        try:
            return styler.format(precision=n_decimals, na_rep=na_rep)
        except TypeError:
            if na_rep is not None:
                styler = styler.format(None, na_rep=na_rep)
            return styler.set_precision(n_decimals)

    def show_results(self, n_decimals):
        """
        Show results of the analysis in a readable form.

        Parameters
        ----------
        n_decimals : int
            Number of digits to round results when showing them
        """
        phrase = 'method {}'

        print('\nLINEAR REGRESSION SUMMARY')
        print('------------------\n')
        print('Model summary')
        display(LinearRegression._styled(
            self.summary_r2(), phrase.format('.summary_r2()'), n_decimals))
        print('------------------\n')
        print('ANOVA')
        display(LinearRegression._styled(
            self.summary_F(), phrase.format('.summary_F()'), n_decimals,
            na_rep=""))
        print('------------------\n')
        print('Coefficients')
        display(LinearRegression._styled(
            self.summary(), phrase.format('.summary()'), n_decimals,
            na_rep=""))

    def summary(self):
        """
        Summary table with requested information related to regression
        coefficients.

        Returns
        -------
        pd.DataFrame
            A summary table
        """
        statistics = [
            self.coefficients, self.coefficients_sterrors,
            self.coefficients_beta, self.coefficients_tvalues,
            self.coefficients_pvalues
        ]

        columns = ['B', 'Std. Error', 'Beta', 't', 'p-value']

        if self._show_ci:
            # Evaluate the property once; each access recomputes conf_int().
            intervals = self.coefficients_confidence_interval
            statistics.append(intervals)
            columns.extend(list(intervals.columns))

        if self._show_col:
            vif = self.coefficients_VIF
            statistics.append(1 / vif)
            statistics.append(vif)
            columns.extend(['Tolerance', 'VIF'])

        statistics = pd.concat(statistics, axis=1)
        statistics.columns = columns
        statistics.index = self._params_idx

        return statistics

    def summary_r2(self):
        """
        Summary table with information related to coefficient of
        determination.

        Returns
        -------
        pd.DataFrame
            A summary table
        """
        r = self.r2**0.5
        r2 = self.r2
        r2_adj = self.r2_adjusted

        statistics = [[r, r2, r2_adj]]
        columns = ['R', 'R Squared', 'Adj. R Squared']

        statistics = pd.DataFrame(statistics, columns=columns, index=[''])

        return statistics

    def summary_F(self):
        """
        Summary table with information related to F-statistic.

        Returns
        -------
        pd.DataFrame
            A summary table
        """
        results = [[
            self.ess, self.dof_model, self.ms_model, self.F, self.F_pvalue
        ], [self.rss, self.dof_resid, self.ms_resid, np.nan, np.nan],
                   [self.tss, self.dof_total, np.nan, np.nan, np.nan]]

        results = pd.DataFrame(
            results,
            columns=['Sum of Squares', 'df', 'Mean Square', 'F', 'p-value'],
            index=['Regression', 'Residual', 'Total'])

        return results

    def _fit_backward(self):
        """
        Backward elimination: repeatedly drop the predictor with the largest
        p-value while that p-value exceeds `sig_level_removal`.
        """
        y_train = pd.Series(self._model.model.endog.copy(),
                            name=self.dependent_variable,
                            index=self._observations_idx)
        X_train = pd.DataFrame(self._model.model.exog,
                               columns=self._model.model.exog_names,
                               index=self._observations_idx)

        results = OLS(y_train, X_train, missing='drop').fit()
        pvalues = results.pvalues.drop('Intercept')

        # With no predictors left, max() is NaN and the comparison is False,
        # which ends the loop.
        while pvalues.max() > self.sig_level_removal:
            X_train = X_train.drop(pvalues.idxmax(), axis=1)
            results = OLS(y_train, X_train, missing='drop').fit()
            pvalues = results.pvalues.drop('Intercept')

        self._model = results

        return

    def _identify_variables_without_variation(self):
        """Return the Patsy names of predictors whose variance is zero."""
        if self.include_constant:
            mask = self._model.model.exog.var(axis=0)[1:] == 0
        else:
            mask = self._model.model.exog.var(axis=0) == 0

        variables_included = [
            x for x in list(self._model.params.index) if x != 'Intercept'
        ]

        return list(np.array(variables_included)[mask])

    def _remove_variables_without_variation(self):
        """Return the design matrix without the zero-variance columns."""
        X = pd.DataFrame(self._model.model.exog,
                         columns=self._model.model.exog_names,
                         index=self._observations_idx)
        X = X.drop(self.variables_excluded, axis=1)
        return X

    def save_independent_variables(self, data=None, add_to_data=False):
        """
        Produce values of independent variable remained in a fitted model.
        This option is useful if you don't create dummy variables or
        interaction effects manually but want to use them in a further
        analysis. Only variables remained in a model are returned
        (those that are shown in a summary table).

        Parameters
        ----------
        data : pd.DataFrame
            Data for which independent variables are requested;
            may be not specified if you want to save values for the same
            data that were used to fit a model
        add_to_data : bool
            Whether to merge new values with the given data.
            Currently, this option returns data with a sorted index

        Returns
        -------
        pd.DataFrame
            Values of independent variables
        """
        if data is None:
            data = self._data.copy()
            if self.include_constant:
                result = self._model.model.exog[:, 1:].copy()
            else:
                result = self._model.model.exog.copy()
            columns = [x for x in self.variables_included if x != 'Constant']
            result = pd.DataFrame(result,
                                  columns=columns,
                                  index=self._observations_idx)
        else:
            result = self._build_design_matrix(data)[self.variables_included]

        if add_to_data:
            result = pd.concat([data, result], axis=1, sort=False)

        return result

    def save_residuals(self,
                       unstandardized=True,
                       standardized=False,
                       studentized=False,
                       deleted=False,
                       studentized_deleted=False,
                       add_to_data=False):
        """
        Produce values of various residuals. Residuals are returned only for
        data used to fit a model.

        Parameters
        ----------
        unstandardized : bool
            Whether to save unstandardized (raw) residuals
        standardized : bool
            Whether to save standardized (z-scores) residuals
        studentized : bool
            Whether to save studentized residuals
        deleted : bool
            Whether to save deleted residuals
        studentized_deleted : bool
            Whether to save studentized deleted residuals
        add_to_data : bool
            Whether to merge new values with data.
            Currently, this option returns data with a sorted index

        Returns
        -------
        pd.DataFrame
            Requested residuals
        """
        infl = OLSInfluence(self._model)
        raw = infl.resid

        # (column label, requested?, producer). Producers are evaluated only
        # for requested residuals; labels are spelled out rather than derived
        # from the parameter names.
        producers = [
            ('Unstandard. res.', unstandardized,
             lambda: raw),
            ('Standard. res.', standardized,
             lambda: (raw - raw.mean()) / raw.std()),
            ('Student. res.', studentized,
             lambda: infl.resid_studentized_internal),
            ('Del. res.', deleted,
             lambda: infl.resid_press),
            ('Student. del. res.', studentized_deleted,
             lambda: infl.resid_studentized_external),
        ]

        # New Series objects are built so that the residual series held by
        # the fitted model are not renamed in place.
        columns = [
            pd.Series(np.asarray(make()), index=raw.index, name=label)
            for label, wanted, make in producers if wanted
        ]

        if columns:
            result = pd.concat(columns, axis=1)
        else:
            result = pd.DataFrame(index=raw.index)

        if add_to_data:
            result = pd.concat([self._data, result], axis=1)

        return result

    #following two methods are still in progress

    @staticmethod
    def _turn_all_rows_to_fancy(summary):
        """Apply `_turn_one_row_to_fancy` to every row of a summary table."""
        return summary.apply(
            lambda x: LinearRegression._turn_one_row_to_fancy(x), axis=1)

    @staticmethod
    def _turn_one_row_to_fancy(row):
        """Format one row as 'coef<stars> \\n (std. error)'."""
        coef = round(row['B'].item(), 3)
        sterr = round(row['Std. Error'].item(), 3)
        pval = row['p-value'].item()

        if pval <= 0.01:
            mark = '***'
        elif pval <= 0.05:
            mark = '**'
        elif pval <= 0.1:
            mark = '*'
        else:
            mark = ''

        result = f'{coef}{mark} \n ({sterr})'
        return result
def singlesplit(self, X, y, nleft, model_selector):
    """
    Run one sample split: select variables on Sample I, test them on Sample II.

    The rows are split at random into ``nleft`` rows (Sample I) and the
    remaining rows (Sample II). ``model_selector`` is fitted on Sample I; the
    variables with non-zero coefficients are then fitted with OLS on
    Sample II and their p-values are Bonferroni-adjusted by the number of
    selected variables. A split whose selection is too large for OLS on
    Sample II is redrawn.

    Parameters
    ----------
    X : np.ndarray
        Design matrix, indexed as ``X[rows, :]``.
    y : array-like
        Response, indexed by row positions / boolean mask.
    nleft : int
        Number of rows in Sample I.
    model_selector : estimator
        Object providing ``fit(X=, y=)`` and ``coef_`` (and ``set_params``
        when ``self.manual_lam`` is set).

    Returns
    -------
    tuple
        ``(pvals_v, p_sel, coefs, lci_v, uci_v, ses_v, df_res)``; entries of
        unselected variables keep their defaults (p-value 1, coefficient 0,
        infinite interval bounds and standard error).

    Notes
    -----
    Terminates the process through ``sys.exit`` when the OLS output fails
    the sanity checks or when more than ``self.repeat_max`` splits yield a
    too-large model.
    """
    ######## Split a sample into two subsamples ########
    n, p = X.shape
    nright = n - nleft

    pvals_v = np.ones(p)
    # np.inf: the np.Inf alias no longer exists in NumPy 2.
    lci_v = np.full(p, -np.inf)
    uci_v = np.full(p, np.inf)
    coefs = np.zeros(p)
    ses_v = np.full(p, np.inf)
    df_res = 0
    p_sel = 0

    tryagain = True
    count = 0
    while tryagain:
        ######## Randomly split the sample #######
        split = np.random.choice(np.arange(n), nleft, replace=False)
        # Sample II must be the complement of Sample I. `~split` on an
        # integer index array is a bitwise NOT (i -> -i-1), i.e. another set
        # of nleft rows, so a boolean membership mask is used instead.
        in_left = np.zeros(n, dtype=bool)
        in_left[split] = True
        xleft = X[split, :]
        yleft = y[split]
        xright = X[~in_left, :]
        yright = y[~in_left]

        ######## Model selection on Sample I #######
        if self.manual_lam:
            # Regularization path: K log-spaced values from max_lambda down
            # to max_lambda * eps. max_lambda is the largest absolute entry
            # of X'y (summing the entries first would make the max a no-op),
            # and the path is spaced in base 10 consistently.
            eps = 0.001
            K = 100
            max_lambda = np.max(np.abs(np.dot(xleft.T, yleft))) / n
            lambda_path = np.logspace(math.log10(max_lambda),
                                      math.log10(max_lambda * eps), K)
            model_selector.set_params(alphas=lambda_path,
                                      normalize=True,
                                      tol=1e-3)

        model_selector.fit(X=xleft, y=yleft)
        sel_nonzero = model_selector.coef_ != 0  # mask of selected variables
        p_sel = int(np.sum(sel_nonzero))  # number of selected variables

        ######## Check the selection is applicable for OLS ########
        if (p_sel + 1) >= nright:
            # More parameters than rows in Sample II: no p-values possible.
            tryagain = True
            count = count + 1
            print("Too large model selected in a sample-split")
        if p_sel == 0:
            print("Empty model selected, it is OK")
            tryagain = False
        if p_sel > 0 and (p_sel + 1) < nright:
            tryagain = False

            ######## Fit Sample II with the reduced features using OLS ########
            lm = OLS(yright, xright[:, sel_nonzero]).fit(method="qr")
            df_res = lm.df_resid
            sel_pval = np.asarray(lm.pvalues)
            coefs[sel_nonzero] = lm.params
            ses_v[sel_nonzero] = lm.bse

            # Sanity checks for p-values
            if len(sel_pval) != p_sel:
                sys.exit(
                    "The statsmodels.OLS didn't return the correct number of p-values for the provided submodel."
                )
            if not (np.all(sel_pval >= 0) and np.all(sel_pval <= 1)):
                sys.exit(
                    "The statsmodels.OLS returned p-values below 0 or above 1."
                )

            ######## Multiple testing adjustment: Bonferroni ########
            pvals_v[sel_nonzero] = np.minimum(sel_pval * p_sel, 1)

            ######## Confidence intervals ########
            # Warn as soon as any gamma * B is not (numerically) an integer.
            if np.any(np.abs(self.gamma * self.B % 1) > pow(10, -5)):
                print(
                    "Duality might be violated because of choice of gamma. Use steps of length 1 / B"
                )
            sel_ci = np.asarray(lm.conf_int(alpha=self.ci_level))
            lci_v[sel_nonzero] = sel_ci[:, 0]
            uci_v[sel_nonzero] = sel_ci[:, 1]
            ######## End of C.I. ########

        if count > self.repeat_max:
            print(
                "Exceed max repeat times,sample splits resulted in too large models."
            )
            sys.exit()

    return pvals_v, p_sel, coefs, lci_v, uci_v, ses_v, df_res
def singlesplit(self, X, y):
    """
    Run one sample split: lasso selection on Sample I, OLS tests on Sample II.

    ``floor(n * self.fraction)`` rows are drawn at random without
    replacement as Sample I; the remaining rows form Sample II. The selector
    is fitted on Sample I over a log-spaced penalty path, and the selected
    variables are fitted with OLS on Sample II. Their p-values are
    Bonferroni-adjusted once, by the number of selected variables. A split
    whose selection is too large for OLS on Sample II is redrawn.

    Parameters
    ----------
    X : np.ndarray
        Design matrix, indexed as ``X[rows, :]``.
    y : array-like
        Response, indexed by row positions / boolean mask.

    Returns
    -------
    tuple
        ``(pvals, coefs, lci, uci)`` when ``self.ci`` is truthy, otherwise
        ``(pvals, coefs)``. Entries of unselected variables keep their
        defaults (p-value 1, coefficient 0, infinite interval bounds).

    Notes
    -----
    Sets ``self.nleft`` and ``self.nright``. Terminates the process through
    ``sys.exit`` when there is not enough data, when the OLS output fails
    the sanity checks, or when more than ``self.repeat_max`` splits yield a
    too-large model.

    ``sel_model`` is read from the enclosing module scope; the fallback
    estimator uses its own local name so that this lookup is not shadowed.
    NOTE(review): confirm that a module-level ``sel_model`` is what is meant
    here (rather than an attribute of ``self``).
    """
    n, p = X.shape
    # int(): np.floor returns a float, which is not a valid sample size.
    self.nleft = int(np.floor(n * self.fraction))
    self.nright = n - self.nleft
    if not (self.nleft >= 1 and self.nright >= 1):
        print("Not enough data for splitting")
        sys.exit()

    pvals_v = np.ones(p)
    # np.inf: the np.Inf alias no longer exists in NumPy 2.
    lci_v = np.full(p, -np.inf)
    uci_v = np.full(p, np.inf)
    coefs = np.zeros(p)

    count = 0
    while True:
        # Draw Sample I without replacement from all rows (randint(1, n)
        # repeats rows and can never pick row 0). Sample II is the
        # complement, taken through a boolean mask because `~split` on an
        # integer array is a bitwise NOT rather than a set complement.
        split = np.random.choice(n, self.nleft, replace=False)
        in_left = np.zeros(n, dtype=bool)
        in_left[split] = True
        xleft = X[split, :]
        yleft = y[split]
        xright = X[~in_left, :]
        yright = y[~in_left]

        # Penalty path for the lasso: K log-spaced values from max_lambda
        # down to max_lambda * eps, where max_lambda is the largest absolute
        # entry of X'y scaled by n.
        eps = 0.001
        K = 100
        max_lambda = np.max(np.abs(np.dot(xleft.T, yleft))) / n
        lambda_path = np.logspace(math.log10(max_lambda),
                                  math.log10(max_lambda * eps), K)
        sel_model.set_params(alphas=lambda_path)
        sel_model.fit(X=xleft, y=yleft)
        sel_nonzero = np.where(sel_model.coef_ != 0)[0]
        p_sel = len(sel_nonzero)

        ## Classical situation:
        ## a model with intercept is used, hence p_sel + 1 < nrow(xright),
        ## otherwise p-values can *not* be calculated
        if p_sel > 0 and p_sel >= (self.nright - 1):
            count = count + 1
            print("Too large model selected in a sample-split")
            if count > self.repeat_max:
                print("Exceed max repeat times,sample splits resulted in too large models.")
                sys.exit()
            if count > 5:
                # Adaptive lasso on the current split: ridge coefficients
                # give the weights, LassoLars selects on the reweighted
                # design.
                # NOTE(review): the selection is used for this split when it
                # is small enough; confirm this is the intended use of the
                # fallback.
                init = RidgeCV(fit_intercept=False, cv=10).fit(xleft, yleft)
                w = abs(init.coef_)
                adaptive_model = LassoLars(fit_intercept=False,
                                           normalize=False,
                                           fit_path=False)
                adaptive_model.fit(xleft * w, y=yleft)
                sel_nonzero = np.where(np.ravel(adaptive_model.coef_) != 0)[0]
                p_sel = len(sel_nonzero)
            if p_sel > 0 and p_sel >= (self.nright - 1):
                continue  # still too large: draw a new split
        break

    if p_sel == 0:
        print("Empty model selected")
    else:
        # Fit Sample II with the selected features using OLS.
        lm = OLS(yright, xright[:, sel_nonzero]).fit()
        sel_pval = np.asarray(lm.pvalues)
        coefs[sel_nonzero] = lm.params
        if len(sel_pval) != p_sel:
            print("The classical OLS didn't return the correct number of p-values for the provided submodel.")
            sys.exit()
        if not ((sel_pval >= 0).all() and (sel_pval <= 1).all()):
            print("The classical OLS returned p-values below 0 or above 1.")
            sys.exit()

        # Multi-test adjustment with the Bonferroni method, applied once.
        # (Multiplying the adjusted vector by p_sel again would inflate the
        # p-values and, for an empty model, turn every p-value into 0.)
        pvals_v[sel_nonzero] = np.minimum(sel_pval * p_sel, 1)

        # Calculate confidence intervals
        if self.ci:
            if not np.all(np.abs(self.gamma * self.B % 1) <= pow(10, -5)):
                print("Duality might be violated because of choice of gamma. Use steps of length 1 / B")
            sel_ci = np.array(lm.conf_int(alpha=self.ci_level))
            lci_v[sel_nonzero] = sel_ci[:, 0]
            uci_v[sel_nonzero] = sel_ci[:, 1]

    # The shape of the result depends on self.ci only, not on which branch
    # was taken above.
    if self.ci:
        return pvals_v, coefs, lci_v, uci_v
    return pvals_v, coefs