def _plot_leverage_resid2(results, influence, alpha=.05, ax=None,
                          **kwargs):
    """
    Plot leverage against squared z-scored residuals and label outliers.

    Parameters
    ----------
    results : results instance
        Provides ``nobs`` and ``model.data.row_labels`` and is handed to
        ``_high_leverage`` to obtain the leverage cut-off.
    influence : influence instance
        Provides ``hat_matrix_diag`` and ``resid``.
    alpha : float
        Observations whose absolute z-scored residual exceeds
        ``norm.ppf(1 - alpha / 2)`` are labelled.
    ax : Axes instance, optional
        Forwarded to ``utils.create_mpl_ax``.
    **kwargs
        Forwarded to ``ax.plot`` for the points.

    Returns
    -------
    fig : matplotlib Figure
        The figure obtained from ``utils.create_mpl_ax``.
    """
    from scipy.stats import norm, zscore

    fig, ax = utils.create_mpl_ax(ax)

    hat_diag = influence.hat_matrix_diag
    z_resid = zscore(influence.resid)
    z_resid_sq = z_resid**2

    ax.plot(z_resid_sq, hat_diag, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    high_leverage = hat_diag > _high_leverage(results)
    # norm or t here if standardized?
    z_cutoff = norm.ppf(1. - alpha / 2)
    high_resid = np.abs(z_resid) > z_cutoff

    n_obs = int(results.nobs)
    point_labels = results.model.data.row_labels
    if point_labels is None:
        # No row labels on the data: fall back to the observation numbers.
        point_labels = lrange(n_obs)

    # Label a point when either criterion flags it.
    flagged = np.where(np.logical_or(high_leverage, high_resid))[0]
    ax = utils.annotate_axes(flagged, point_labels,
                             lzip(z_resid_sq, hat_diag),
                             [(0, 5)] * n_obs, "large",
                             ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig
def _plot_leverage_resid2(results, influence, alpha=.05, ax=None, **kwargs):
    """
    Scatter the hat-matrix diagonal against squared standardized residuals.

    Points are labelled when their leverage exceeds ``_high_leverage(results)``
    or when the absolute z-scored residual exceeds the two-sided normal
    quantile ``norm.ppf(1 - alpha / 2)``.

    Parameters
    ----------
    results : results instance
        Source of ``nobs`` and ``model.data.row_labels``.
    influence : influence instance
        Source of ``hat_matrix_diag`` and ``resid``.
    alpha : float
        Level used for the residual cut-off.
    ax : Axes instance, optional
        Passed through ``utils.create_mpl_ax``.
    **kwargs
        Extra keyword arguments for ``ax.plot``.

    Returns
    -------
    fig : matplotlib Figure
        Figure returned by ``utils.create_mpl_ax``.
    """
    from scipy.stats import zscore, norm

    fig, ax = utils.create_mpl_ax(ax)

    lev = influence.hat_matrix_diag
    std_resid = zscore(influence.resid)
    sq_resid = std_resid**2

    ax.plot(sq_resid, lev, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    too_much_leverage = lev > _high_leverage(results)
    # norm or t here if standardized?
    too_large_resid = np.abs(std_resid) > norm.ppf(1. - alpha / 2)
    to_label = np.where(
        np.logical_or(too_much_leverage, too_large_resid))[0]

    nobs = int(results.nobs)
    labels = results.model.data.row_labels
    labels = lrange(nobs) if labels is None else labels

    # Every label is placed 5 points above its marker, centred.
    offsets = [(0, 5)] * nobs
    ax = utils.annotate_axes(to_label, labels, lzip(sq_resid, lev), offsets,
                             "large", ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig
def _plot_index(self, y, ylabel, threshold=None, title=None, ax=None,
                **kwds):
    """
    Scatter ``y`` against the observation index and label large points.

    Parameters
    ----------
    y : array_like
        Values to plot, one per observation in ``self.endog``.
    ylabel : str
        Label for the y-axis.
    threshold : 'all' or number
        With ``'all'`` every point is labelled; otherwise points with
        ``abs(y) > threshold`` are labelled.
    title : str, optional
        Plot title; "Index Plot" when None.
    ax : Axes instance, optional
        Forwarded to ``utils.create_mpl_ax``.
    **kwds
        Forwarded to ``ax.scatter``.

    Returns
    -------
    fig : matplotlib Figure
        Figure returned by ``utils.create_mpl_ax``.
    """
    from statsmodels.graphics import utils

    fig, ax = utils.create_mpl_ax(ax)
    plot_title = "Index Plot" if title is None else title

    n_obs = len(self.endog)
    obs_index = np.arange(n_obs)
    ax.scatter(obs_index, y, **kwds)

    if threshold == 'all':
        to_label = np.ones(n_obs, np.bool_)
    else:
        to_label = np.abs(y) > threshold
    label_shift = 3 * np.ones(n_obs)

    # add point labels
    point_labels = self.results.model.data.row_labels
    if point_labels is None:
        point_labels = np.arange(n_obs)
    ax = utils.annotate_axes(np.where(to_label)[0], point_labels,
                             lzip(obs_index, y),
                             lzip(-label_shift, label_shift),
                             "large", ax)

    text_style = {"fontsize": 16, "color": "black"}
    ax.set_ylabel(ylabel, **text_style)
    ax.set_xlabel("Observation", **text_style)
    ax.set_title(plot_title, **text_style)
    return fig
def _plot_index(self, y, ylabel, threshold=None, title=None, ax=None,
                **kwds):
    """
    Draw an index plot of ``y`` and annotate the points that stand out.

    Parameters
    ----------
    y : array_like
        One value per observation; plotted against ``0 .. nobs - 1`` where
        ``nobs = len(self.endog)``.
    ylabel : str
        Text for the y-axis label.
    threshold : 'all' or number
        ``'all'`` annotates every point, any other value annotates the points
        where ``abs(y)`` is greater than it.
    title : str, optional
        Title text, defaulting to "Index Plot".
    ax : Axes instance, optional
        Handed to ``utils.create_mpl_ax``.
    **kwds
        Passed on to ``ax.scatter``.

    Returns
    -------
    fig : matplotlib Figure
        Figure returned by ``utils.create_mpl_ax``.
    """
    from statsmodels.graphics import utils

    fig, ax = utils.create_mpl_ax(ax)
    if title is None:
        title = "Index Plot"

    nobs = len(self.endog)
    positions = np.arange(nobs)
    ax.scatter(positions, y, **kwds)

    annotate_everything = threshold == 'all'
    if annotate_everything:
        mask = np.ones(nobs, np.bool_)
    else:
        mask = np.abs(y) > threshold

    # add point labels
    row_labels = self.results.model.data.row_labels
    if row_labels is None:
        row_labels = np.arange(nobs)

    # Each label is offset by (-3, 3) points from its marker.
    shift = 3 * np.ones(nobs)
    selected = np.where(mask)[0]
    ax = utils.annotate_axes(selected, row_labels, lzip(positions, y),
                             lzip(-shift, shift), "large", ax)

    font = {"fontsize": 16, "color": "black"}
    ax.set_ylabel(ylabel, **font)
    ax.set_xlabel("Observation", **font)
    ax.set_title(title, **font)
    return fig
def _influence_plot(results, influence, external=True, alpha=.05,
                    criterion="cooks", size=48, plot_alpha=.75, ax=None,
                    **kwargs):
    """
    Scatter studentized residuals against leverage, sized by influence.

    Parameters
    ----------
    results : results instance
        Supplies ``df_resid`` and ``model.data.row_labels`` and is passed to
        ``_high_leverage``.
    influence : influence instance
        Supplies ``cooks_distance``, ``dffits``, ``hat_matrix_diag``,
        ``resid_studentized_external`` and ``resid_studentized``.
    external : bool
        If true use ``influence.resid_studentized_external``, otherwise
        ``influence.resid_studentized``.
    alpha : float
        Points with ``abs(resid) > t.ppf(1 - alpha / 2, results.df_resid)``
        are labelled.
    criterion : str
        Case-insensitive. Values starting with "coo" size the points by
        ``cooks_distance[0]``, values starting with "dff" by
        ``abs(dffits[0])``.
    size : float
        The criterion is rescaled linearly to marker areas between ``8**2``
        and ``size**2``.
    plot_alpha : float
        ``alpha`` passed to ``ax.scatter``.
    ax : Axes instance, optional
        Forwarded to ``utils.create_mpl_ax``.
    **kwargs
        Accepted but not used.

    Returns
    -------
    fig : matplotlib Figure
        Figure returned by ``utils.create_mpl_ax``.

    Raises
    ------
    ValueError
        If ``criterion`` matches neither prefix.
    """
    from scipy import stats

    fig, ax = utils.create_mpl_ax(ax)

    crit = criterion.lower()
    if crit.startswith('coo'):
        psize = influence.cooks_distance[0]
    elif crit.startswith('dff'):
        psize = np.abs(influence.dffits[0])
    else:
        raise ValueError("Criterion %s not understood" % criterion)

    # scale the variables
    # TODO: what is the correct scaling and the assumption here?
    # we want plots to be comparable across different plots
    # so we would need to use the expected distribution of criterion probably
    smallest_area = 8**2
    old_range = np.ptp(psize)
    new_range = size**2 - smallest_area
    psize = (psize - psize.min()) * new_range / old_range + smallest_area

    leverage = influence.hat_matrix_diag
    if external:
        resids = influence.resid_studentized_external
    else:
        resids = influence.resid_studentized

    cutoff = stats.t.ppf(1. - alpha / 2, results.df_resid)
    large_points = np.logical_or(np.abs(resids) > cutoff,
                                 leverage > _high_leverage(results))

    ax.scatter(leverage, resids, s=psize, alpha=plot_alpha)

    # add point labels
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(len(resids))
    # Push each label away from its marker by an amount tied to marker size.
    label_shift = (psize / 2)**.5
    ax = utils.annotate_axes(np.where(large_points)[0], labels,
                             lzip(leverage, resids),
                             lzip(-label_shift, label_shift),
                             "x-large", ax)

    # TODO: make configurable or let people do it ex-post?
    font = {"fontsize": 16, "color": "black"}
    ax.set_ylabel("Studentized Residuals", **font)
    ax.set_xlabel("H Leverage", **font)
    ax.set_title("Influence Plot", **font)
    return fig
def plot_leverage_resid2(results, alpha=.05, label_kwargs={}, ax=None,
                         **kwargs):
    """
    Plots leverage statistics vs. normalized residuals squared

    Parameters
    ----------
    results : results instance
        A regression results instance
    alpha : float
        Specifies the cut-off for large-standardized residuals. Residuals
        are assumed to be distributed N(0, 1) with alpha=alpha.
    label_kwargs : dict
        Accepted for API compatibility; it is not used by this function.
    ax : Axes instance
        Matplotlib Axes instance
    **kwargs
        Keyword arguments passed to ``ax.plot`` for the points.

    Returns
    -------
    fig : matplotlib Figure
        A matplotlib figure instance.
    """
    from scipy.stats import zscore, norm

    fig, ax = utils.create_mpl_ax(ax)

    infl = results.get_influence()
    leverage = infl.hat_matrix_diag
    resid = zscore(results.resid)
    resid2 = resid**2
    ax.plot(resid2, leverage, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    large_leverage = leverage > _high_leverage(results)
    # norm or t here if standardized?
    cutoff = norm.ppf(1. - alpha / 2)
    large_resid = np.abs(resid) > cutoff

    # range() only accepts integers, so cast nobs once and reuse it.
    nobs = int(results.nobs)
    labels = results.model.data.row_labels
    if labels is None:
        labels = list(range(nobs))
    index = np.where(np.logical_or(large_leverage, large_resid))[0]
    # On Python 3 zip() is a one-shot iterator; pass a real list of points.
    points = list(zip(resid2, leverage))
    ax = utils.annotate_axes(index, labels, points, [(0, 5)] * nobs,
                             "large", ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig
def _influence_plot(results, influence, external=True, alpha=.05,
                    criterion="cooks", size=48, plot_alpha=.75, ax=None,
                    **kwargs):
    """
    Influence plot: studentized residuals vs. leverage with sized markers.

    Parameters
    ----------
    results : results instance
        Used for ``df_resid``, ``model.data.row_labels`` and the
        ``_high_leverage`` cut-off.
    influence : influence instance
        Used for ``cooks_distance``, ``dffits``, ``hat_matrix_diag`` and the
        studentized residuals.
    external : bool
        Selects ``resid_studentized_external`` (True) or
        ``resid_studentized`` (False) from ``influence``.
    alpha : float
        Level of the two-sided t cut-off for large residuals.
    criterion : str
        "coo..." uses ``cooks_distance[0]``, "dff..." uses
        ``abs(dffits[0])`` for the marker sizes (case-insensitive).
    size : float
        Largest marker area is ``size**2``; the smallest is ``8**2``.
    plot_alpha : float
        Transparency passed to ``ax.scatter``.
    ax : Axes instance, optional
        Passed through ``utils.create_mpl_ax``.
    **kwargs
        Accepted but not used.

    Returns
    -------
    fig : matplotlib Figure
        Figure returned by ``utils.create_mpl_ax``.

    Raises
    ------
    ValueError
        If ``criterion`` is not understood.
    """
    infl = influence
    fig, ax = utils.create_mpl_ax(ax)

    prefix = criterion.lower()[:3]
    if prefix == 'coo':
        marker_area = infl.cooks_distance[0]
    elif prefix == 'dff':
        marker_area = np.abs(infl.dffits[0])
    else:
        raise ValueError("Criterion %s not understood" % criterion)

    # scale the variables
    # TODO: what is the correct scaling and the assumption here?
    # we want plots to be comparable across different plots
    # so we would need to use the expected distribution of criterion probably
    old_range = np.ptp(marker_area)
    new_range = size**2 - 8**2
    shifted = marker_area - marker_area.min()
    marker_area = shifted * new_range / old_range + 8**2

    leverage = infl.hat_matrix_diag
    resids = (infl.resid_studentized_external if external
              else infl.resid_studentized)

    from scipy import stats

    cutoff = stats.t.ppf(1. - alpha / 2, results.df_resid)
    large_resid = np.abs(resids) > cutoff
    large_leverage = leverage > _high_leverage(results)
    selected = np.where(np.logical_or(large_resid, large_leverage))[0]

    ax.scatter(leverage, resids, s=marker_area, alpha=plot_alpha)

    # add point labels
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(len(resids))
    half_root = (marker_area / 2)**.5
    ax = utils.annotate_axes(selected, labels, lzip(leverage, resids),
                             lzip(-half_root, half_root), "x-large", ax)

    # TODO: make configurable or let people do it ex-post?
    font = {"fontsize": 16, "color": "black"}
    ax.set_ylabel("Studentized Residuals", **font)
    ax.set_xlabel("H Leverage", **font)
    ax.set_title("Influence Plot", **font)
    return fig
def plot_leverage_resid2(results, alpha=.05, label_kwargs={}, ax=None,
                         **kwargs):
    """
    Plots leverage statistics vs. normalized residuals squared

    Parameters
    ----------
    results : results instance
        A regression results instance
    alpha : float
        Specifies the cut-off for large-standardized residuals. Residuals
        are assumed to be distributed N(0, 1) with alpha=alpha.
    label_kwargs : dict
        Accepted for API compatibility; it is not used by this function.
    ax : Axes instance
        Matplotlib Axes instance
    **kwargs
        Keyword arguments passed to ``ax.plot`` for the points.

    Returns
    -------
    fig : matplotlib Figure
        A matplotlib figure instance.
    """
    from scipy.stats import norm, zscore

    fig, ax = utils.create_mpl_ax(ax)

    leverage = results.get_influence().hat_matrix_diag
    resid = zscore(results.resid)
    resid_sq = resid**2

    ax.plot(resid_sq, leverage, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    large_leverage = leverage > _high_leverage(results)
    # norm or t here if standardized?
    cutoff = norm.ppf(1. - alpha / 2)
    large_resid = np.abs(resid) > cutoff
    index = np.where(np.logical_or(large_leverage, large_resid))[0]

    # Cast nobs before handing it to range(), which rejects non-integers.
    n_obs = int(results.nobs)
    labels = results.model.data.row_labels
    if labels is None:
        labels = list(range(n_obs))

    # Materialise the (x, y) pairs: a Python 3 zip object is an iterator,
    # not a sequence.
    ax = utils.annotate_axes(index, labels, list(zip(resid_sq, leverage)),
                             [(0, 5)] * n_obs, "large",
                             ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig
def plot_partregress(endog, exog_i, exog_others, data=None,
                     title_kwargs={}, obs_labels=True, label_kwargs={},
                     ax=None, ret_coords=False, **kwargs):
    """Plot partial regression for a single regressor.

    Parameters
    ----------
    endog : ndarray or string
        endogenous or response variable. If string is given, you can use an
        arbitrary translation as with a formula.
    exog_i : ndarray or string
        exogenous, explanatory variable. If string is given, you can use an
        arbitrary translation as with a formula.
    exog_others : ndarray or list of strings
        other exogenous, explanatory variables. If a list of strings is
        given, each item is a term in formula. You can use an arbitrary
        translation as with a formula. The effect of these variables will be
        removed by OLS regression.
    data : DataFrame, dict, or recarray
        Some kind of data structure with names if the other variables are
        given as strings.
    title_kwargs : dict
        Keyword arguments to pass on for the title. The key to control the
        fonts is fontdict.
    obs_labels : bool or array-like
        Whether or not to annotate the plot points with their observation
        labels. If obs_labels is a boolean, the point labels will try to do
        the right thing. First it will try to use the index of data, then
        fall back to the index of exog_i. Alternatively, you may give an
        array-like object corresponding to the observation numbers.
    label_kwargs : dict
        Keyword arguments that control annotate for the observation labels.
        The dict is copied, never modified; ``ha="center"`` and
        ``va="bottom"`` are always applied.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure
        being created.
    ret_coords : bool
        If True will return the coordinates of the points in the plot. You
        can use this to add your own annotations.
    kwargs
        The keyword arguments passed to plot for the points.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.
    coords : list, optional
        If ret_coords is True, return a tuple of arrays (x_coords, y_coords).

    Raises
    ------
    ValueError
        If the observation labels do not have the same length as `exog_i`.

    Notes
    -----
    The slope of the fitted line is that of `exog_i` in the full
    multiple regression. The individual points can be used to assess the
    influence of points on the estimated coefficient.

    If `exog_others` is empty there is nothing to partial out, and the raw
    `exog_i` (x-axis) and `endog` (y-axis) values are plotted instead of
    residuals.

    See Also
    --------
    plot_partregress_grid : Plot partial regression for a set of regressors.

    Examples
    --------
    Load the Statewide Crime data set and plot partial regression of the rate
    of high school graduation (hs_grad) on the murder rate(murder).

    The effects of the percent of the population living in urban areas (urban),
    below the poverty line (poverty) , and in a single person household (single)
    are removed by OLS regression.

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt

    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> sm.graphics.plot_partregress(endog='murder', exog_i='hs_grad',
    ...                              exog_others=['urban', 'poverty', 'single'],
    ...                              data=crime_data.data, obs_labels=False)
    >>> plt.show()

    .. plot:: plots/graphics_regression_partregress.py

    More detailed examples can be found in the Regression Plots notebook
    on the examples page.
    """
    # NOTE: there is no interaction between possible missing data and
    # obs_labels yet, so this will need to be tweaked a bit for this case
    fig, ax = utils.create_mpl_ax(ax)

    # strings, use patsy to transform to data
    if isinstance(endog, string_types):
        endog = dmatrix(endog + "-1", data)

    if isinstance(exog_others, string_types):
        RHS = dmatrix(exog_others, data)
    elif isinstance(exog_others, list):
        RHS = "+".join(exog_others)
        RHS = dmatrix(RHS, data)
    else:
        RHS = exog_others
    RHS_isemtpy = False
    if isinstance(RHS, np.ndarray) and RHS.size == 0:
        RHS_isemtpy = True
    elif isinstance(RHS, pd.DataFrame) and RHS.empty:
        RHS_isemtpy = True
    if isinstance(exog_i, string_types):
        exog_i = dmatrix(exog_i + "-1", data)

    # all arrays or pandas-like
    # x_coords / y_coords are set in both branches so that labelling and
    # ret_coords work whether or not there was anything to partial out.
    row_labels = None
    if RHS_isemtpy:
        # The regressor goes on the x-axis so the points agree with the axis
        # labels and with the fitted line (slope of endog on exog_i).
        x_coords = np.asarray(exog_i).ravel()
        y_coords = np.asarray(endog).ravel()
        fitted_line = OLS(endog, exog_i).fit()
        x_axis_endog_name = ('x' if isinstance(exog_i, np.ndarray)
                             else exog_i.name)
        y_axis_endog_name = ('y' if isinstance(endog, np.ndarray)
                             else endog.design_info.column_names[0])
    else:
        res_yaxis = OLS(endog, RHS).fit()
        res_xaxis = OLS(exog_i, RHS).fit()
        x_coords = res_xaxis.resid
        y_coords = res_yaxis.resid
        x_axis_endog_name = res_xaxis.model.endog_names
        y_axis_endog_name = res_yaxis.model.endog_names
        row_labels = res_xaxis.model.data.row_labels
        fitted_line = OLS(y_coords, x_coords).fit()
    ax.plot(x_coords, y_coords, 'o', **kwargs)

    # np.asarray gives positional access whatever container params is.
    fig = abline_plot(0, np.asarray(fitted_line.params)[0], color='k', ax=ax)

    if x_axis_endog_name == 'y':  # for no names regression will just get a y
        x_axis_endog_name = 'x'  # this is misleading, so use x
    ax.set_xlabel("e(%s | X)" % x_axis_endog_name)
    ax.set_ylabel("e(%s | X)" % y_axis_endog_name)
    ax.set_title('Partial Regression Plot', **title_kwargs)

    # NOTE: if we want to get super fancy, we could annotate if a point is
    # clicked using this widget
    # http://stackoverflow.com/questions/4652439/
    # is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/
    # 4674445#4674445
    if obs_labels is True:
        if data is not None:
            obs_labels = data.index
        elif hasattr(exog_i, "index"):
            obs_labels = exog_i.index
        else:
            obs_labels = row_labels
        # NOTE: row_labels can be None.
        # Maybe we should fix this to never be the case.
        if obs_labels is None:
            obs_labels = lrange(len(exog_i))

    if obs_labels is not False:  # could be array-like
        if len(obs_labels) != len(exog_i):
            raise ValueError("obs_labels does not match length of exog_i")
        # Work on a copy so neither the shared default dict nor the caller's
        # dict is modified.
        annotate_kwargs = dict(label_kwargs)
        annotate_kwargs.update(ha="center", va="bottom")
        ax = utils.annotate_axes(lrange(len(obs_labels)), obs_labels,
                                 lzip(x_coords, y_coords),
                                 [(0, 5)] * len(obs_labels), "x-large",
                                 ax=ax, **annotate_kwargs)

    if ret_coords:
        return fig, (x_coords, y_coords)
    else:
        return fig
def influence_plot(results, external=True, alpha=.05, criterion="cooks",
                   size=48, plot_alpha=.75, ax=None, **kwargs):
    """
    Plot of influence in regression. Plots studentized resids vs. leverage.

    Parameters
    ----------
    results : results instance
        A fitted model.
    external : bool
        Whether to use externally or internally studentized residuals. It is
        recommended to leave external as True.
    alpha : float
        The alpha value to identify large studentized residuals. Large means
        abs(resid_studentized) > t.ppf(1-alpha/2, dof=results.df_resid)
    criterion : str {'DFFITS', 'Cooks'}
        Which criterion to base the size of the points on. Options are
        DFFITS or Cook's D.
    size : float
        The range of `criterion` is mapped to 8**2 - size**2 in points.
    plot_alpha : float
        The `alpha` of the plotted points.
    ax : matplotlib Axes instance
        An instance of a matplotlib Axes.
    **kwargs
        Accepted but not used.

    Returns
    -------
    fig : matplotlib figure
        The matplotlib figure that contains the Axes.

    Raises
    ------
    ValueError
        If `criterion` starts with neither "coo" nor "dff" (ignoring case).

    Notes
    -----
    Row labels are drawn for the observations in which the leverage, measured
    by the diagonal of the hat matrix, is high or the residuals are large, as
    the combination of large residuals and a high influence value indicates
    an influence point. The value of large residuals can be controlled using
    the `alpha` parameter. Large leverage points are identified as
    hat_i > 2 * (df_model + 1)/nobs.
    """
    from scipy import stats

    fig, ax = utils.create_mpl_ax(ax)

    infl = results.get_influence()
    crit = criterion.lower()
    if crit.startswith('coo'):
        psize = infl.cooks_distance[0]
    elif crit.startswith('dff'):
        psize = np.abs(infl.dffits[0])
    else:
        raise ValueError("Criterion %s not understood" % criterion)

    # scale the variables
    # TODO: what is the correct scaling and the assumption here?
    # we want plots to be comparable across different plots
    # so we would need to use the expected distribution of criterion probably
    min_area = 8**2
    old_range = np.ptp(psize)
    new_range = size**2 - min_area
    psize = (psize - psize.min()) * new_range / old_range + min_area

    leverage = infl.hat_matrix_diag
    resids = (infl.resid_studentized_external if external
              else infl.resid_studentized_internal)

    cutoff = stats.t.ppf(1. - alpha / 2, results.df_resid)
    large_points = np.logical_or(np.abs(resids) > cutoff,
                                 leverage > _high_leverage(results))

    ax.scatter(leverage, resids, s=psize, alpha=plot_alpha)

    # add point labels
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(len(resids))
    # Offset each label by an amount tied to its marker size.
    shift = (psize / 2)**.5
    ax = utils.annotate_axes(np.where(large_points)[0], labels,
                             lzip(leverage, resids), lzip(-shift, shift),
                             "x-large", ax)

    # TODO: make configurable or let people do it ex-post?
    font = {"fontsize": 16, "color": "black"}
    ax.set_ylabel("Studentized Residuals", **font)
    ax.set_xlabel("H Leverage", **font)
    ax.set_title("Influence Plot", **font)
    return fig
# #888](https://github.com/statsmodels/statsmodels/issues/808))
weights = rob_crime_model.weights
idx = weights > 0
X = rob_crime_model.model.exog[idx.values]

# Rescale the retained weights so that they average to one.
ww = weights[idx] / weights[idx].mean()
# Row-wise sum of X * pinv(X).T is the diagonal of X @ pinv(X).
hat_matrix_diag = ww * (X * np.linalg.pinv(X).T).sum(1)

# Squared residuals, normalised to sum to one.
resid = rob_crime_model.resid
resid2 = resid**2
resid2 /= resid2.sum()

nobs = int(idx.sum())
hm = hat_matrix_diag.mean()
rm = resid2.mean()

from statsmodels.graphics import utils

fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(resid2[idx], hat_matrix_diag, 'o')
ax = utils.annotate_axes(
    range(nobs),
    labels=rob_crime_model.model.data.row_labels[idx],
    points=lzip(resid2[idx], hat_matrix_diag),
    offset_points=[(-5, 5)] * nobs,
    size="large",
    ax=ax)
ax.set_xlabel("resid2")
ax.set_ylabel("leverage")

# Reference lines at the mean normalised squared residual and mean leverage.
ylim = ax.get_ylim()
ax.vlines(rm, ylim[0], ylim[1])
xlim = ax.get_xlim()
ax.hlines(hm, xlim[0], xlim[1])
ax.margins(0, 0)
def plot_partregress(endog, exog_i, exog_others, data=None,
                     title_kwargs={}, obs_labels=True, label_kwargs={},
                     ax=None, ret_coords=False, **kwargs):
    """Plot partial regression for a single regressor.

    Parameters
    ----------
    endog : ndarray or string
        endogenous or response variable. If string is given, you can use an
        arbitrary translation as with a formula.
    exog_i : ndarray or string
        exogenous, explanatory variable. If string is given, you can use an
        arbitrary translation as with a formula.
    exog_others : ndarray or list of strings
        other exogenous, explanatory variables. If a list of strings is
        given, each item is a term in formula. You can use an arbitrary
        translation as with a formula. The effect of these variables will be
        removed by OLS regression.
    data : DataFrame, dict, or recarray
        Some kind of data structure with names if the other variables are
        given as strings.
    title_kwargs : dict
        Keyword arguments to pass on for the title. The key to control the
        fonts is fontdict.
    obs_labels : bool or array-like
        Whether or not to annotate the plot points with their observation
        labels. If obs_labels is a boolean, the point labels will try to do
        the right thing. First it will try to use the index of data, then
        fall back to the index of exog_i. Alternatively, you may give an
        array-like object corresponding to the observation numbers.
    label_kwargs : dict
        Keyword arguments that control annotate for the observation labels.
        The dict is copied, never modified; ``ha="center"`` and
        ``va="bottom"`` are always applied.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure
        being created.
    ret_coords : bool
        If True will return the coordinates of the points in the plot. You
        can use this to add your own annotations.
    kwargs
        The keyword arguments passed to plot for the points.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.
    coords : list, optional
        If ret_coords is True, return a tuple of arrays (x_coords, y_coords).

    Raises
    ------
    ValueError
        If the observation labels do not have the same length as `exog_i`.

    Notes
    -----
    The slope of the fitted line is that of `exog_i` in the full
    multiple regression. The individual points can be used to assess the
    influence of points on the estimated coefficient.

    If `exog_others` is empty there is nothing to partial out, and the raw
    `exog_i` (x-axis) and `endog` (y-axis) values are plotted instead of
    residuals.

    See Also
    --------
    plot_partregress_grid : Plot partial regression for a set of regressors.
    """
    # NOTE: there is no interaction between possible missing data and
    # obs_labels yet, so this will need to be tweaked a bit for this case
    fig, ax = utils.create_mpl_ax(ax)

    # strings, use patsy to transform to data
    if isinstance(endog, string_types):
        endog = dmatrix(endog + "-1", data)

    if isinstance(exog_others, string_types):
        RHS = dmatrix(exog_others, data)
    elif isinstance(exog_others, list):
        RHS = "+".join(exog_others)
        RHS = dmatrix(RHS, data)
    else:
        RHS = exog_others
    RHS_isemtpy = False
    if isinstance(RHS, np.ndarray) and RHS.size == 0:
        RHS_isemtpy = True
    elif isinstance(RHS, pd.DataFrame) and RHS.empty:
        RHS_isemtpy = True
    if isinstance(exog_i, string_types):
        exog_i = dmatrix(exog_i + "-1", data)

    # all arrays or pandas-like
    # The plotted coordinates are recorded in both branches; the labelling
    # and ret_coords code below must not depend on the residual regressions
    # having been run.
    fallback_labels = None
    if RHS_isemtpy:
        # Plot the regressor on x and the response on y, consistent with the
        # axis labels and the slope of the fitted line.
        x_coords = np.asarray(exog_i).ravel()
        y_coords = np.asarray(endog).ravel()
        ax.plot(x_coords, y_coords, 'o', **kwargs)
        fitted_line = OLS(endog, exog_i).fit()
        x_axis_endog_name = ('x' if isinstance(exog_i, np.ndarray)
                             else exog_i.name)
        y_axis_endog_name = ('y' if isinstance(endog, np.ndarray)
                             else endog.design_info.column_names[0])
    else:
        res_yaxis = OLS(endog, RHS).fit()
        res_xaxis = OLS(exog_i, RHS).fit()
        x_coords = res_xaxis.resid
        y_coords = res_yaxis.resid
        x_axis_endog_name = res_xaxis.model.endog_names
        y_axis_endog_name = res_yaxis.model.endog_names
        fallback_labels = res_xaxis.model.data.row_labels
        ax.plot(x_coords, y_coords, 'o', **kwargs)
        fitted_line = OLS(y_coords, x_coords).fit()

    # Positional access to the single slope, whatever container params is.
    slope = np.asarray(fitted_line.params)[0]
    fig = abline_plot(0, slope, color='k', ax=ax)

    if x_axis_endog_name == 'y':  # for no names regression will just get a y
        x_axis_endog_name = 'x'  # this is misleading, so use x
    ax.set_xlabel("e(%s | X)" % x_axis_endog_name)
    ax.set_ylabel("e(%s | X)" % y_axis_endog_name)
    ax.set_title('Partial Regression Plot', **title_kwargs)

    # NOTE: if we want to get super fancy, we could annotate if a point is
    # clicked using this widget
    # http://stackoverflow.com/questions/4652439/
    # is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/
    # 4674445#4674445
    if obs_labels is True:
        if data is not None:
            obs_labels = data.index
        elif hasattr(exog_i, "index"):
            obs_labels = exog_i.index
        else:
            obs_labels = fallback_labels
        # NOTE: row_labels can be None.
        # Maybe we should fix this to never be the case.
        if obs_labels is None:
            obs_labels = lrange(len(exog_i))

    if obs_labels is not False:  # could be array-like
        if len(obs_labels) != len(exog_i):
            raise ValueError("obs_labels does not match length of exog_i")
        # Copy first: updating label_kwargs in place would modify the shared
        # default dict and the caller's dict.
        annotate_kwargs = dict(label_kwargs)
        annotate_kwargs.update(ha="center", va="bottom")
        ax = utils.annotate_axes(lrange(len(obs_labels)), obs_labels,
                                 lzip(x_coords, y_coords),
                                 [(0, 5)] * len(obs_labels), "x-large",
                                 ax=ax, **annotate_kwargs)

    if ret_coords:
        return fig, (x_coords, y_coords)
    else:
        return fig
def influence_plot(results, external=True, alpha=.05, criterion="cooks",
                   size=48, plot_alpha=.75, ax=None, **kwargs):
    """
    Plot of influence in regression. Plots studentized resids vs. leverage.

    Parameters
    ----------
    results : results instance
        A fitted model.
    external : bool
        Whether to use externally or internally studentized residuals. It is
        recommended to leave external as True.
    alpha : float
        The alpha value to identify large studentized residuals. Large means
        abs(resid_studentized) > t.ppf(1-alpha/2, dof=results.df_resid)
    criterion : str {'DFFITS', 'Cooks'}
        Which criterion to base the size of the points on. Options are
        DFFITS or Cook's D. Matching is case-insensitive on the first three
        letters.
    size : float
        The range of `criterion` is mapped to 8**2 - size**2 in points.
    plot_alpha : float
        The `alpha` of the plotted points.
    ax : matplotlib Axes instance
        An instance of a matplotlib Axes.
    **kwargs
        Accepted but not used.

    Returns
    -------
    fig : matplotlib figure
        The matplotlib figure that contains the Axes.

    Raises
    ------
    ValueError
        If `criterion` is neither a Cook's distance nor a DFFITS spelling.

    Notes
    -----
    Row labels are drawn for the observations in which the leverage, measured
    by the diagonal of the hat matrix, is high or the residuals are large, as
    the combination of large residuals and a high influence value indicates
    an influence point. The value of large residuals can be controlled using
    the `alpha` parameter. Large leverage points are identified as
    hat_i > 2 * (df_model + 1)/nobs.
    """
    fig, ax = utils.create_mpl_ax(ax)

    infl = results.get_influence()

    # "cooks" selects Cook's distance and "dffits" selects |DFFITS|.
    crit = criterion.lower()
    if crit.startswith('coo'):
        psize = infl.cooks_distance[0]
    elif crit.startswith('dff'):
        psize = np.abs(infl.dffits[0])
    else:
        raise ValueError("Criterion %s not understood" % criterion)

    # scale the variables
    # TODO: what is the correct scaling and the assumption here?
    # we want plots to be comparable across different plots
    # so we would need to use the expected distribution of criterion probably
    old_range = np.ptp(psize)
    new_range = size**2 - 8**2

    psize = (psize - psize.min()) * new_range / old_range + 8**2

    leverage = infl.hat_matrix_diag
    if external:
        resids = infl.resid_studentized_external
    else:
        resids = infl.resid_studentized_internal

    from scipy import stats

    cutoff = stats.t.ppf(1. - alpha / 2, results.df_resid)
    large_resid = np.abs(resids) > cutoff
    large_leverage = leverage > _high_leverage(results)
    large_points = np.logical_or(large_resid, large_leverage)

    ax.scatter(leverage, resids, s=psize, alpha=plot_alpha)

    # add point labels
    labels = results.model.data.row_labels
    if labels is None:
        labels = list(range(len(resids)))
    # Python 3 zip objects are one-shot iterators, so build real lists for
    # the label positions and offsets.
    shift = (psize / 2)**.5
    ax = utils.annotate_axes(np.where(large_points)[0], labels,
                             list(zip(leverage, resids)),
                             list(zip(-shift, shift)),
                             "x-large", ax)

    # TODO: make configurable or let people do it ex-post?
    font = {"fontsize": 16, "color": "black"}
    ax.set_ylabel("Studentized Residuals", **font)
    ax.set_xlabel("H Leverage", **font)
    ax.set_title("Influence Plot", **font)
    return fig
def plot_partregress(endog, exog_i, exog_others, data=None,
                     title_kwargs={}, obs_labels=True, label_kwargs={},
                     ax=None, ret_coords=False, **kwargs):
    """Plot partial regression for a single regressor.

    Parameters
    ----------
    endog : ndarray or string
        endogenous or response variable. If string is given, you can use an
        arbitrary translation as with a formula.
    exog_i : ndarray or string
        exogenous, explanatory variable. If string is given, you can use an
        arbitrary translation as with a formula.
    exog_others : ndarray or list of strings
        other exogenous, explanatory variables. If a list of strings is
        given, each item is a term in formula. You can use an arbitrary
        translation as with a formula. The effect of these variables will be
        removed by OLS regression.
    data : DataFrame, dict, or recarray
        Some kind of data structure with names if the other variables are
        given as strings.
    title_kwargs : dict
        Keyword arguments to pass on for the title. The key to control the
        fonts is fontdict.
    obs_labels : bool or array-like
        Whether or not to annotate the plot points with their observation
        labels. If obs_labels is a boolean, the point labels will try to do
        the right thing. First it will try to use the index of data, then
        fall back to the index of exog_i. Alternatively, you may give an
        array-like object corresponding to the observation numbers.
    label_kwargs : dict
        Keyword arguments that control annotate for the observation labels.
        The dict is copied, never modified; ``ha="center"`` and
        ``va="bottom"`` are always applied.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure
        being created.
    ret_coords : bool
        If True will return the coordinates of the points in the plot. You
        can use this to add your own annotations.
    kwargs
        The keyword arguments passed to plot for the points.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.
    coords : list, optional
        If ret_coords is True, return a tuple of arrays (x_coords, y_coords).

    Raises
    ------
    ValueError
        If the observation labels do not have the same length as `exog_i`.

    Notes
    -----
    The slope of the fitted line is that of `exog_i` in the full
    multiple regression. The individual points can be used to assess the
    influence of points on the estimated coefficient.

    See Also
    --------
    plot_partregress_grid : Plot partial regression for a set of regressors.
    """
    # NOTE: there is no interaction between possible missing data and
    # obs_labels yet, so this will need to be tweaked a bit for this case
    fig, ax = utils.create_mpl_ax(ax)

    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
    try:
        text_types = basestring
    except NameError:
        text_types = str

    if (isinstance(endog, text_types) or
            isinstance(exog_others, (text_types, list)) or
            isinstance(exog_i, text_types)):
        # patsy is only needed when a formula-style argument was given
        from patsy import dmatrix

    # strings, use patsy to transform to data
    if isinstance(endog, text_types):
        endog = dmatrix(endog + "-1", data)

    if isinstance(exog_others, text_types):
        # Build the design matrix from the formula string itself.
        RHS = dmatrix(exog_others, data)
    elif isinstance(exog_others, list):
        RHS = dmatrix("+".join(exog_others), data)
    else:
        RHS = exog_others
    if isinstance(exog_i, text_types):
        exog_i = dmatrix(exog_i + "-1", data)

    # all arrays or pandas-like
    res_yaxis = OLS(endog, RHS).fit()
    res_xaxis = OLS(exog_i, RHS).fit()
    x_coords = res_xaxis.resid
    y_coords = res_yaxis.resid

    ax.plot(x_coords, y_coords, 'o', **kwargs)
    fitted_line = OLS(y_coords, x_coords).fit()

    fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax)

    x_axis_endog_name = res_xaxis.model.endog_names
    if x_axis_endog_name == 'y':  # for no names regression will just get a y
        x_axis_endog_name = 'x'  # this is misleading, so use x
    ax.set_xlabel("e(%s | X)" % x_axis_endog_name)
    ax.set_ylabel("e(%s | X)" % res_yaxis.model.endog_names)
    ax.set_title('Partial Regression Plot', **title_kwargs)

    # NOTE: if we want to get super fancy, we could annotate if a point is
    # clicked using this widget
    # http://stackoverflow.com/questions/4652439/
    # is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/
    # 4674445#4674445
    if obs_labels is True:
        if data is not None:
            obs_labels = data.index
        elif hasattr(exog_i, "index"):
            obs_labels = exog_i.index
        else:
            obs_labels = res_xaxis.model.data.row_labels
        # NOTE: row_labels can be None.
        # Maybe we should fix this to never be the case.
        if obs_labels is None:
            obs_labels = list(range(len(exog_i)))

    if obs_labels is not False:  # could be array-like
        if len(obs_labels) != len(exog_i):
            raise ValueError("obs_labels does not match length of exog_i")
        # Copy so that neither the shared default nor the caller's dict is
        # modified.
        annotate_kwargs = dict(label_kwargs)
        annotate_kwargs.update(ha="center", va="bottom")
        # Real lists rather than Python 3 range/zip iterators.
        ax = utils.annotate_axes(list(range(len(obs_labels))), obs_labels,
                                 list(zip(x_coords, y_coords)),
                                 [(0, 5)] * len(obs_labels), "x-large",
                                 ax=ax, **annotate_kwargs)

    if ret_coords:
        return fig, (x_coords, y_coords)
    else:
        return fig
X = rob_crime_model.model.exog[idx]
# Rescale the retained weights so that they average to one.
ww = weights[idx] / weights[idx].mean()
# Row-wise sum of X * pinv(X).T is the diagonal of X @ pinv(X).
hat_matrix_diag = ww * (X * np.linalg.pinv(X).T).sum(1)
resid = rob_crime_model.resid
# Squared residuals, normalised to sum to one.
resid2 = resid ** 2
resid2 /= resid2.sum()
nobs = int(idx.sum())
hm = hat_matrix_diag.mean()
rm = resid2.mean()

from statsmodels.graphics import utils

fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(resid2[idx], hat_matrix_diag, "o")
ax = utils.annotate_axes(
    range(nobs),
    labels=rob_crime_model.model.data.row_labels[idx],
    # A bare zip object is a one-shot iterator on Python 3 and cannot be
    # indexed; hand over a concrete list of (x, y) pairs instead.
    points=list(zip(resid2[idx], hat_matrix_diag)),
    offset_points=[(-5, 5)] * nobs,
    size="large",
    ax=ax,
)
ax.set_xlabel("resid2")
ax.set_ylabel("leverage")
# Reference lines at the mean normalised squared residual and mean leverage.
ylim = ax.get_ylim()
ax.vlines(rm, *ylim)
xlim = ax.get_xlim()
ax.hlines(hm, *xlim)
ax.margins(0, 0)