# weight_plot assumes the usual plotting aliases: matplotlib.pyplot as plt
# and seaborn as sns.
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.regression.linear_model import RegressionResultsWrapper


def weight_plot(model_results: RegressionResultsWrapper, score='t'):
    # Column names ('Coef.', '[0.025', '0.975]') follow the layout of
    # summary2()'s coefficient table.
    summary = model_results.summary2().tables[1]
    summary['abs.t'] = summary[score].abs()
    summary = summary.sort_values('abs.t', ascending=True)

    fig, ax = plt.subplots(figsize=(12, 0.5 * len(summary)))
    sns.despine(fig, left=True, bottom=True)

    for i, (name, row) in enumerate(summary.iterrows()):
        # plot the point estimate and its confidence interval
        ax.plot(row[['[0.025', 'Coef.', '0.975]']], [i, i, i],
                'ko-', ms=5., lw=2., markevery=[1])
        # add the vertical markers at the interval bounds
        ax.vlines(row['[0.025'], i - 0.15, i + 0.15)
        ax.vlines(row['0.975]'], i - 0.15, i + 0.15)
        ax.annotate("%.2f" % row['Coef.'], (row['Coef.'], i),
                    xytext=(-6, 4), textcoords='offset points')

    # add the horizontal guide lines
    ax.hlines(list(range(len(summary))),
              [ax.get_xlim()[0]] * len(summary), summary['[0.025'],
              colors='lightgray', linestyle='--')
    ax.xaxis.set_visible(False)

    # add the y labels
    ax.set_yticks(list(range(len(summary))))
    ax.set_yticklabels(summary.index)
    ax.vlines([0], -1, len(summary), colors='k', linestyle='--')
    ax.set_title("Weight plot", loc='left', size=18, pad=-20)
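
# A minimal usage sketch for weight_plot, assuming statsmodels' formula API
# and synthetic data; the DataFrame and its column names are illustrative.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
df = pd.DataFrame({'x1': rng.normal(size=100), 'x2': rng.normal(size=100)})
df['y'] = 2.0 * df['x1'] - 0.5 * df['x2'] + rng.normal(size=100)

results = smf.ols('y ~ x1 + x2', data=df).fit()
weight_plot(results)  # coefficients sorted by |t|, 95% CIs as whiskers
plt.show()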

# _package_attrs is a method of a model-like wrapper class (note
# `self.formula` and `self._debug`); ModelDesc comes from patsy, the
# statsmodels names from statsmodels.regression.linear_model, and
# RRegressionResults is defined elsewhere in the same module.
from patsy import ModelDesc
from statsmodels.regression.linear_model import (RegressionResults,
                                                 RegressionResultsWrapper)


def _package_attrs(self, attrs):
    # Sometimes features are retrieved from the wrapper (stargazer does
    # this), other times from the actual result (statsmodels' summary_col
    # does this), so we set them on both.
    rres = RRegressionResults()

    # Use patsy to extract the target variable:
    fobj = ModelDesc.from_formula(self.formula)
    rres.target = fobj.lhs_termlist[0].name()

    rres.model = self

    # We need to hijack this rather than subclassing because stargazer does
    # not use "isinstance()" but "type()":
    wrap = RegressionResultsWrapper(rres)

    # All items except "params" are @cache_readonly and need first to be
    # deleted, and then redefined:
    for attr in attrs:
        if attr not in ('params',):
            if hasattr(rres, attr):
                delattr(rres, attr)
        setattr(rres, attr, attrs[attr])
        setattr(wrap, attr, attrs[attr])
        self._debug("Set {} to {}".format(attr, attrs[attr]))

    rres.__class__ = RegressionResults
    return wrap

# show_result assumes statsmodels.api as sm, plus the plot_polynomial and
# print_coefficient helpers defined elsewhere in this module.
import pandas
import statsmodels.api as sm
from statsmodels.regression.linear_model import RegressionResultsWrapper


def show_result(poly_data: pandas.DataFrame,
                regression_model: RegressionResultsWrapper,
                predict_model: pandas.DataFrame, y_param: str,
                degree: int):
    # #7: keep only the polynomial feature columns
    poly_features = [
        i for i in poly_data.columns.values if i.startswith('power_')
    ]
    pred_model = regression_model.predict(
        sm.add_constant(predict_model[poly_features]))
    # #9: plot the fitted curve, then report the coefficients
    plot_polynomial(poly_data, pred_model, y_param)
    print('-----coefficient of degree {deg}------'.format(deg=degree))
    print_coefficient(regression_model.params)
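
# A hedged sketch of the inputs show_result expects: a DataFrame holding
# 'power_1'..'power_N' feature columns and a model fitted on them. The
# target column 'price' and the noise level are illustrative assumptions,
# and plot_polynomial / print_coefficient must exist as noted above.
import numpy as np

degree = 3
x = np.linspace(0, 1, 50)
poly_data = pandas.DataFrame(
    {'power_{}'.format(d): x ** d for d in range(1, degree + 1)})
poly_data['price'] = 2 + 3 * x - x ** 2 + np.random.normal(0, 0.1, 50)

features = ['power_{}'.format(d) for d in range(1, degree + 1)]
model = sm.OLS(poly_data['price'],
               sm.add_constant(poly_data[features])).fit()
show_result(poly_data, model, poly_data, 'price', degree)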

import altair as alt
import pandas as pd
from statsmodels.regression.linear_model import RegressionResultsWrapper


def plot_confidence_intervals(res: RegressionResultsWrapper) -> alt.Chart:
    """Return an Altair box-and-whisker chart of the regression
    coefficients' point estimates and confidence intervals.
    """
    # Enable the custom theme (streamlit_theme is defined elsewhere):
    alt.themes.register("streamlit", streamlit_theme)
    alt.themes.enable("streamlit")

    conf_int = res.conf_int()  # 95% C.I.
    # Stack the lower and upper bounds into one long-format column
    conf_int = conf_int.stack()
    conf_int.name = "estimate"
    conf_int = pd.DataFrame(conf_int)
    conf_int = (conf_int.reset_index().rename(columns={
        'level_0': 'regressor',
        'level_1': 'interval'
    }))

    chart = alt.Chart(conf_int).mark_boxplot().encode(
        x='regressor:O', y='estimate:Q').properties(width=200, height=500)
    return chart
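
# Example call with a fitted statsmodels result; pandas-based exog keeps
# conf_int() returning a DataFrame, as the function expects. The data is
# synthetic, streamlit_theme must be defined for the theme lines to run,
# and the chart would be rendered in Streamlit via st.altair_chart(chart).
import numpy as np
import statsmodels.api as sm

X = sm.add_constant(
    pd.DataFrame(np.random.normal(size=(100, 2)), columns=['x1', 'x2']))
y = X @ np.array([1.0, 2.0, -0.5]) + np.random.normal(size=100)
res = sm.OLS(y, X).fit()
chart = plot_confidence_intervals(res)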

# expression_fields evaluates a fitted two-covariate model on a regular
# grid. It assumes numpy as np; `regres` is taken to be an alias for
# statsmodels' results wrapper, as the annotation suggests.
from typing import Tuple

import numpy as np
from statsmodels.regression.linear_model import \
    RegressionResultsWrapper as regres


def expression_fields(
    xs: np.ndarray,
    ys: np.ndarray,
    results: regres,
    n_ticks: int = 400,
) -> Tuple[np.ndarray, np.ndarray, Tuple[int, int]]:
    # Grid limits come from the two covariate columns (column 0 is the
    # intercept); `ys` is accepted but not used here.
    mx = np.max(xs[:, 1])
    mn = np.min(xs[:, 1])
    xx = np.linspace(mn, mx, n_ticks)

    mx = np.max(xs[:, 2])
    mn = np.min(xs[:, 2])
    yy = np.linspace(mn, mx, n_ticks)

    X, Y = np.meshgrid(xx, yy)
    shape = X.shape
    Xf = X.flatten()
    Yf = Y.flatten()
    # Rebuild a design matrix [1, x, y] for every grid point
    XY = np.hstack((np.ones((Xf.shape[0], 1)),
                    Xf[:, np.newaxis],
                    Yf[:, np.newaxis]))
    Z = results.predict(XY)

    return (XY[:, 1:], Z, shape)
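
# A minimal sketch of the expected inputs: a design matrix with columns
# [intercept, x, y] and a fitted OLS result on synthetic data.
import statsmodels.api as sm

coords = np.random.uniform(size=(200, 2))
xs = np.hstack((np.ones((200, 1)), coords))
ys = xs @ np.array([1.0, 0.5, -0.3]) + np.random.normal(0, 0.1, 200)
results = sm.OLS(ys, xs).fit()

XY, Z, shape = expression_fields(xs, ys, results, n_ticks=100)
Z_grid = Z.reshape(shape)  # back to meshgrid shape for contour plotting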

# fit() below is QuantReg's estimator from
# statsmodels.regression.quantile_regression; it relies on names available
# at module level there: numpy as np, warnings, scipy's stats and norm,
# pinv and chain_dot, the `kernels` dict, the hall_sheather / bofinger /
# chamberlain bandwidth functions, QuantRegResults, and the
# ConvergenceWarning / IterationLimitWarning classes.
def fit(self, q=.5, vcov='robust', kernel='epa', bandwidth='hsheather',
        max_iter=1000, p_tol=1e-6, **kwargs):
    """
    Solve by Iterative Weighted Least Squares

    Parameters
    ----------
    q : float
        Quantile; must be between 0 and 1.
    vcov : str
        Method used to calculate the variance-covariance matrix of the
        parameters. Default is ``robust``:

        - robust : heteroskedasticity robust standard errors (as suggested
          in Greene 6th edition)
        - iid : iid errors (as in Stata 12)
    kernel : str
        Kernel to use in the kernel density estimation for the asymptotic
        covariance matrix:

        - epa: Epanechnikov
        - cos: Cosine
        - gau: Gaussian
        - par: Parzen
        - biw: Biweight
    bandwidth : str
        Bandwidth selection method in kernel density estimation for the
        asymptotic covariance estimate (full references in the QuantReg
        docstring):

        - hsheather: Hall-Sheather (1988)
        - bofinger: Bofinger (1975)
        - chamberlain: Chamberlain (1994)
    """
    if q < 0 or q > 1:
        raise Exception('q must be between 0 and 1')

    kern_names = ['biw', 'cos', 'epa', 'gau', 'par']
    if kernel not in kern_names:
        raise Exception("kernel must be one of " + ', '.join(kern_names))
    else:
        kernel = kernels[kernel]

    if bandwidth == 'hsheather':
        bandwidth = hall_sheather
    elif bandwidth == 'bofinger':
        bandwidth = bofinger
    elif bandwidth == 'chamberlain':
        bandwidth = chamberlain
    else:
        raise Exception("bandwidth must be in 'hsheather', 'bofinger', "
                        "'chamberlain'")

    endog = self.endog
    exog = self.exog
    nobs = self.nobs
    exog_rank = np.linalg.matrix_rank(self.exog)
    self.rank = exog_rank
    self.df_model = float(self.rank - self.k_constant)
    self.df_resid = self.nobs - self.rank
    n_iter = 0
    xstar = exog

    beta = np.ones(exog_rank)
    # TODO: better start, initial beta is used only for convergence check

    # Note the following does not work yet,
    # the iteration loop always starts with OLS as initial beta
    # if start_params is not None:
    #     if len(start_params) != rank:
    #         raise ValueError('start_params has wrong length')
    #     beta = start_params
    # else:
    #     # start with OLS
    #     beta = np.dot(np.linalg.pinv(exog), endog)

    diff = 10
    cycle = False

    history = dict(params=[], mse=[])
    while n_iter < max_iter and diff > p_tol and not cycle:
        n_iter += 1
        beta0 = beta
        xtx = np.dot(xstar.T, exog)
        xty = np.dot(xstar.T, endog)
        beta = np.dot(pinv(xtx), xty)
        resid = endog - np.dot(exog, beta)

        # clip near-zero residuals to avoid dividing by zero below
        mask = np.abs(resid) < .000001
        resid[mask] = ((resid[mask] >= 0) * 2 - 1) * .000001
        resid = np.where(resid < 0, q * resid, (1 - q) * resid)
        resid = np.abs(resid)
        xstar = exog / resid[:, np.newaxis]
        diff = np.max(np.abs(beta - beta0))
        history['params'].append(beta)
        history['mse'].append(np.mean(resid * resid))

        if (n_iter >= 300) and (n_iter % 100 == 0):
            # check for a convergence cycle, should not happen
            for ii in range(2, 10):
                if np.all(beta == history['params'][-ii]):
                    cycle = True
                    warnings.warn("Convergence cycle detected",
                                  ConvergenceWarning)
                    break

    if n_iter == max_iter:
        warnings.warn("Maximum number of iterations (" + str(max_iter) +
                      ") reached.", IterationLimitWarning)

    e = endog - np.dot(exog, beta)
    # Greene (2008, p.407) writes that Stata 6 uses this bandwidth:
    # h = 0.9 * np.std(e) / (nobs**0.2)
    # Instead, we calculate bandwidth as in Stata 12
    iqre = stats.scoreatpercentile(e, 75) - stats.scoreatpercentile(e, 25)
    h = bandwidth(nobs, q)
    h = min(np.std(endog),
            iqre / 1.34) * (norm.ppf(q + h) - norm.ppf(q - h))

    fhat0 = 1. / (nobs * h) * np.sum(kernel(e / h))

    if vcov == 'robust':
        d = np.where(e > 0, (q / fhat0)**2, ((1 - q) / fhat0)**2)
        xtxi = pinv(np.dot(exog.T, exog))
        xtdx = np.dot(exog.T * d[np.newaxis, :], exog)
        vcov = chain_dot(xtxi, xtdx, xtxi)
    elif vcov == 'iid':
        vcov = (1. / fhat0)**2 * q * (1 - q) * pinv(np.dot(exog.T, exog))
    else:
        raise Exception("vcov must be 'robust' or 'iid'")

    lfit = QuantRegResults(self, beta, normalized_cov_params=vcov)

    lfit.q = q
    lfit.iterations = n_iter
    lfit.sparsity = 1. / fhat0
    lfit.bandwidth = h
    lfit.history = history

    return RegressionResultsWrapper(lfit)
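
# The method above is reached through the public statsmodels API; a quick
# usage sketch on synthetic data with heavy-tailed noise:
import numpy as np
import statsmodels.api as sm

x = sm.add_constant(np.random.uniform(size=200))
y = 1.0 + 2.0 * x[:, 1] + np.random.standard_t(3, size=200)
median_fit = sm.QuantReg(y, x).fit(q=0.5, vcov='robust', kernel='epa')
print(median_fit.params)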

import pandas as pd
import statsmodels.api as sm
from statsmodels.regression.linear_model import RegressionResultsWrapper


def get_rss(model: RegressionResultsWrapper, data: pd.DataFrame,
            input_model: list, param_name: str) -> float:
    """Residual sum of squares of `model` over `data[input_model]`,
    measured against the observed column `param_name`."""
    prediction = model.predict(sm.add_constant(data[input_model]))
    residuals = data[param_name] - prediction
    rss = (residuals ** 2).sum()
    return rss
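
# Example use of get_rss; the column names are illustrative.
import numpy as np

data = pd.DataFrame({'x1': np.random.normal(size=50)})
data['y'] = 3 * data['x1'] + np.random.normal(size=50)
model = sm.OLS(data['y'], sm.add_constant(data[['x1']])).fit()
print(get_rss(model, data, ['x1'], 'y'))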