def plot_ccpr(results, exog_idx, ax=None): """Plot CCPR against one regressor. Generates a CCPR (component and component-plus-residual) plot. Parameters ---------- results : result instance A regression results instance. exog_idx : int or string Exogenous, explanatory variable. If string is given, it should be the variable name that you want to use, and you can use arbitrary translations as with a formula. ax : Matplotlib AxesSubplot instance, optional If given, it is used to plot in instead of a new figure being created. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid. Notes ----- The CCPR plot provides a way to judge the effect of one regressor on the response variable by taking into account the effects of the other independent variables. The partial residuals plot is defined as Residuals + B_i*X_i versus X_i. The component adds the B_i*X_i versus X_i to show where the fitted line would lie. Care should be taken if X_i is highly correlated with any of the other independent variables. If this is the case, the variance evident in the plot will be an underestimate of the true variance. References ---------- http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm """ fig, ax = utils.create_mpl_ax(ax) exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) x1 = results.model.exog[:, exog_idx] #namestr = ' for %s' % self.name if self.name else '' x1beta = x1*results._results.params[exog_idx] ax.plot(x1, x1beta + results.resid, 'o') from statsmodels.tools.tools import add_constant mod = OLS(x1beta, add_constant(x1)).fit() params = mod.params fig = abline_plot(*params, **dict(ax=ax)) #ax.plot(x1, x1beta, '-') ax.set_title('Component and component plus residual plot') ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx)) ax.set_xlabel("%s" % exog_name) return fig
def plot_added_variable(results, focus_exog, resid_type=None, use_glm_weights=True, fit_kwargs=None, ax=None): # Docstring attached below model = results.model fig, ax = utils.create_mpl_ax(ax) endog_resid, focus_exog_resid =\ added_variable_resids(results, focus_exog, resid_type=resid_type, use_glm_weights=use_glm_weights, fit_kwargs=fit_kwargs) ax.plot(focus_exog_resid, endog_resid, 'o', alpha=0.6) ax.set_title('Added variable plot', fontsize='large') if type(focus_exog) is str: xname = focus_exog else: xname = model.exog_names[focus_exog] ax.set_xlabel(xname, size=15) ax.set_ylabel(model.endog_names + " residuals", size=15) return fig
def plot_rsquare(self, ncomp=None, ax=None): """ Box plots of the individual series R-square against the number of PCs Parameters ---------- ncomp : int, optional Number of components ot include in the plot. If None, will plot the minimum of 10 or the number of computed components ax : Matplotlib axes instance, optional An axes on which to draw the graph. If omitted, new a figure is created Returns ------- fig : figure Handle to the figure """ import statsmodels.graphics.utils as gutils fig, ax = gutils.create_mpl_ax(ax) ncomp = 10 if ncomp is None else ncomp ncomp = min(ncomp, self._ncomp) # R2s in rows, series in columns r2s = 1.0 - self._ess_indiv / self._tss_indiv r2s = r2s[1:] r2s = r2s[:ncomp] ax.boxplot(r2s.T) ax.set_title('Individual Input $R^2$') ax.set_ylabel('$R^2$') ax.set_xlabel('Number of Included Principal Components') return fig
def _plot_leverage_resid2(results, influence, alpha=.05, ax=None, **kwargs): from scipy.stats import zscore, norm fig, ax = utils.create_mpl_ax(ax) infl = influence leverage = infl.hat_matrix_diag resid = zscore(infl.resid) ax.plot(resid**2, leverage, 'o', **kwargs) ax.set_xlabel("Normalized residuals**2") ax.set_ylabel("Leverage") ax.set_title("Leverage vs. Normalized residuals squared") large_leverage = leverage > _high_leverage(results) #norm or t here if standardized? cutoff = norm.ppf(1.-alpha/2) large_resid = np.abs(resid) > cutoff labels = results.model.data.row_labels if labels is None: labels = lrange(int(results.nobs)) index = np.where(np.logical_or(large_leverage, large_resid))[0] ax = utils.annotate_axes(index, labels, lzip(resid**2, leverage), [(0, 5)]*int(results.nobs), "large", ax=ax, ha="center", va="bottom") ax.margins(.075, .075) return fig
def _plot_index(self, y, ylabel, threshold=None, title=None, ax=None,**kwds): from statsmodels.graphics import utils fig, ax = utils.create_mpl_ax(ax) if title is None: title = "Index Plot" nobs = len(self.endog) index = np.arange(nobs) ax.scatter(index, y, **kwds) if threshold == 'all': large_points = np.ones(nobs, np.bool_) else: large_points = np.abs(y) > threshold psize = 3 * np.ones(nobs) # add point labels labels = self.results.model.data.row_labels if labels is None: labels = np.arange(nobs) ax = utils.annotate_axes(np.where(large_points)[0], labels, lzip(index, y), lzip(-psize, psize), "large", ax) font = {"fontsize" : 16, "color" : "black"} ax.set_ylabel(ylabel, **font) ax.set_xlabel("Observation", **font) ax.set_title(title, **font) return fig
def seasonal_plot(grouped_x, xticklabels, ylabel=None, ax=None): """ Consider using one of month_plot or quarter_plot unless you need irregular plotting. Parameters ---------- grouped_x : iterable of DataFrames Should be a GroupBy object (or similar pair of group_names and groups as DataFrames) with a DatetimeIndex or PeriodIndex """ fig, ax = utils.create_mpl_ax(ax) start = 0 ticks = [] for season, df in grouped_x: df = df.copy() # or sort balks for series. may be better way sort_values(df, inplace=True) nobs = len(df) x_plot = np.arange(start, start + nobs) ticks.append(x_plot.mean()) ax.plot(x_plot, df.values, 'k') ax.hlines(df.values.mean(), x_plot[0], x_plot[-1], colors='k') start += nobs ax.set_xticks(ticks) ax.set_xticklabels(xticklabels) ax.set_ylabel(ylabel) ax.margins(.1, .05) return fig
def plot_scree(self, ncomp=None, log_scale=True, cumulative=False, ax=None): """ Plot of the ordered eigenvalues Parameters ---------- ncomp : int, optional Number of components ot include in the plot. If None, will included the same as the number of components computed log_scale : boot, optional Flag indicating whether ot use a log scale for the y-axis cumulative : bool, optional Flag indicating whether to plot the eigenvalues or cumulative eigenvalues ax : Matplotlib axes instance, optional An axes on which to draw the graph. If omitted, new a figure is created Returns ------- fig : figure Handle to the figure """ import statsmodels.graphics.utils as gutils fig, ax = gutils.create_mpl_ax(ax) ncomp = self._ncomp if ncomp is None else ncomp vals = np.asarray(self.eigenvals) vals = vals[:self._ncomp] if cumulative: vals = np.cumsum(vals) if log_scale: ax.set_yscale('log') ax.plot(np.arange(ncomp), vals[: ncomp], 'bo') ax.autoscale(tight=True) xlim = np.array(ax.get_xlim()) sp = xlim[1] - xlim[0] xlim += 0.02 * np.array([-sp, sp]) ax.set_xlim(xlim) ylim = np.array(ax.get_ylim()) scale = 0.02 if log_scale: sp = np.log(ylim[1] / ylim[0]) ylim = np.exp(np.array([np.log(ylim[0]) - scale * sp, np.log(ylim[1]) + scale * sp])) else: sp = ylim[1] - ylim[0] ylim += scale * np.array([-sp, sp]) ax.set_ylim(ylim) ax.set_title('Scree Plot') ax.set_ylabel('Eigenvalue') ax.set_xlabel('Component Number') fig.tight_layout() return fig
def plot_fit(results, exog_idx, y_true=None, ax=None, **kwargs): """Plot fit against one regressor. This creates one graph with the scatterplot of observed values compared to fitted values. Parameters ---------- results : result instance result instance with resid, model.endog and model.exog as attributes x_var : int or str Name or index of regressor in exog matrix. y_true : array_like (optional) If this is not None, then the array is added to the plot ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. kwargs The keyword arguments are passed to the plot command for the fitted values points. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. """ fig, ax = utils.create_mpl_ax(ax) exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) results = maybe_unwrap_results(results) #maybe add option for wendog, wexog y = results.model.endog x1 = results.model.exog[:, exog_idx] x1_argsort = np.argsort(x1) y = y[x1_argsort] x1 = x1[x1_argsort] ax.plot(x1, y, 'bo', label=results.model.endog_names) if not y_true is None: ax.plot(x1, y_true[x1_argsort], 'b-', label='True values') title = 'Fitted values versus %s' % exog_name prstd, iv_l, iv_u = wls_prediction_std(results) ax.plot(x1, results.fittedvalues[x1_argsort], 'D', color='r', label='fitted', **kwargs) ax.vlines(x1, iv_l[x1_argsort], iv_u[x1_argsort], linewidth=1, color='k', alpha=.7) #ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1, # color='k') ax.set_title(title) ax.set_xlabel(exog_name) ax.set_ylabel(results.model.endog_names) ax.legend(loc='best') return fig
def plot_fit(res, exog_idx, exog_name='', y_true=None, ax=None, fontsize='small'): """Plot fit against one regressor. This creates one graph with the scatterplot of observed values compared to fitted values. Parameters ---------- res : result instance result instance with resid, model.endog and model.exog as attributes exog_idx : int index of regressor in exog matrix y_true : array_like (optional) If this is not None, then the array is added to the plot ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. Notes ----- This is currently very simple, no options or varnames yet. """ fig, ax = utils.create_mpl_ax(ax) if exog_name == '': exog_name = 'variable %d' % exog_idx #maybe add option for wendog, wexog y = res.model.endog x1 = res.model.exog[:, exog_idx] x1_argsort = np.argsort(x1) y = y[x1_argsort] x1 = x1[x1_argsort] ax.plot(x1, y, 'bo', label='observed') if not y_true is None: ax.plot(x1, y_true[x1_argsort], 'b-', label='true') title = 'fitted versus regressor %s' % exog_name else: title = 'fitted versus regressor %s' % exog_name prstd, iv_l, iv_u = wls_prediction_std(res) ax.plot(x1, res.fittedvalues[x1_argsort], 'k-', label='fitted') #'k-o') #ax.plot(x1, iv_u, 'r--') #ax.plot(x1, iv_l, 'r--') ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1, color='k') ax.set_title(title, fontsize=fontsize) return fig
def plot_partial(self, smooth_index, plot_se=True, cpr=False, include_constant=True, ax=None): """plot the contribution of a smooth term to the linear prediction Parameters ---------- smooth_index : int index of the smooth term within list of smooth terms plot_se : book If plot_se is true, then the confidence interval for the linear prediction will be added to the plot. cpr : bool If cpr (component plus residual) is true, the a scatter plot of the partial working residuals will be added to the plot. include_constant : bool If true, then the estimated intercept is added to the prediction and its standard errors. This avoids that the confidence interval has zero width at the imposed identification constraint, e.g. either at a reference point or at the mean. ax : None or matplotlib axis instance If ax is not None, then the plot will be added to it. Returns ------- fig : matplotlib Figure instance """ from statsmodels.graphics.utils import _import_mpl, create_mpl_ax _import_mpl() variable = smooth_index y_est, se = self.partial_values(variable, include_constant=include_constant) smoother = self.model.smoother x = smoother.smoothers[variable].x sort_index = np.argsort(x) x = x[sort_index] y_est = y_est[sort_index] se = se[sort_index] fig, ax = create_mpl_ax(ax) ax.plot(x, y_est, c='blue', lw=2) if plot_se: ax.plot(x, y_est + 1.96 * se, '-', c='blue') ax.plot(x, y_est - 1.96 * se, '-', c='blue') if cpr: # TODO: resid_response doesn't make sense with nonlinear link # use resid_working ? cpr_ = y_est + self.resid_working ax.plot(x, cpr_, '.', lw=2) ax.set_xlabel(smoother.smoothers[variable].variable_name) return fig
def _influence_plot(results, influence, external=True, alpha=.05, criterion="cooks", size=48, plot_alpha=.75, ax=None, **kwargs): infl = influence fig, ax = utils.create_mpl_ax(ax) if criterion.lower().startswith('coo'): psize = infl.cooks_distance[0] elif criterion.lower().startswith('dff'): psize = np.abs(infl.dffits[0]) else: raise ValueError("Criterion %s not understood" % criterion) # scale the variables #TODO: what is the correct scaling and the assumption here? #we want plots to be comparable across different plots #so we would need to use the expected distribution of criterion probably old_range = np.ptp(psize) new_range = size**2 - 8**2 psize = (psize - psize.min()) * new_range/old_range + 8**2 leverage = infl.hat_matrix_diag if external: resids = infl.resid_studentized_external else: resids = infl.resid_studentized from scipy import stats cutoff = stats.t.ppf(1.-alpha/2, results.df_resid) large_resid = np.abs(resids) > cutoff large_leverage = leverage > _high_leverage(results) large_points = np.logical_or(large_resid, large_leverage) ax.scatter(leverage, resids, s=psize, alpha=plot_alpha) # add point labels labels = results.model.data.row_labels if labels is None: labels = lrange(len(resids)) ax = utils.annotate_axes(np.where(large_points)[0], labels, lzip(leverage, resids), lzip(-(psize/2)**.5, (psize/2)**.5), "x-large", ax) #TODO: make configurable or let people do it ex-post? font = {"fontsize" : 16, "color" : "black"} ax.set_ylabel("Studentized Residuals", **font) ax.set_xlabel("H Leverage", **font) ax.set_title("Influence Plot", **font) return fig
def plot_contour(self, mu_low, mu_high, var_low, var_high, mu_step, var_step, levs=[.2, .1, .05, .01, .001]): """ Returns a plot of the confidence region for a univariate mean and variance. Parameters ---------- mu_low : float Lowest value of the mean to plot mu_high : float Highest value of the mean to plot var_low : float Lowest value of the variance to plot var_high : float Highest value of the variance to plot mu_step : float Increments to evaluate the mean var_step : float Increments to evaluate the mean levs : list Which values of significance the contour lines will be drawn. Default is [.2, .1, .05, .01, .001] Returns ------- fig : matplotlib figure instance The contour plot """ fig, ax = utils.create_mpl_ax() ax.set_ylabel('Variance') ax.set_xlabel('Mean') mu_vect = list(np.arange(mu_low, mu_high, mu_step)) var_vect = list(np.arange(var_low, var_high, var_step)) z = [] for sig0 in var_vect: self.sig2_0 = sig0 for mu0 in mu_vect: z.append(self._opt_var(mu0, pval=True)) z = np.asarray(z).reshape(len(var_vect), len(mu_vect)) ax.contour(mu_vect, var_vect, z, levels=levs) return fig
def plot_leverage_resid2(results, alpha=.05, label_kwargs={}, ax=None, **kwargs): """ Plots leverage statistics vs. normalized residuals squared Parameters ---------- results : results instance A regression results instance alpha : float Specifies the cut-off for large-standardized residuals. Residuals are assumed to be distributed N(0, 1) with alpha=alpha. label_kwargs : dict The keywords to pass to annotate for the labels. ax : Axes instance Matplotlib Axes instance Returns ------- fig : matplotlib Figure A matplotlib figure instance. """ from scipy.stats import zscore, norm, t fig, ax = utils.create_mpl_ax(ax) infl = results.get_influence() leverage = infl.hat_matrix_diag resid = zscore(results.resid) ax.plot(resid**2, leverage, 'o', **kwargs) ax.set_xlabel("Normalized residuals**2") ax.set_ylabel("Leverage") ax.set_title("Leverage vs. Normalized residuals squared") large_leverage = leverage > _high_leverage(results) #norm or t here if standardized? cutoff = norm.ppf(1.-alpha/2) large_resid = np.abs(resid) > cutoff labels = results.model.data.row_labels if labels is None: labels = range(results.nobs) index = np.where(np.logical_or(large_leverage, large_resid))[0] ax = utils.annotate_axes(index, labels, zip(resid**2, leverage), [(0, 5)]*int(results.nobs), "large", ax=ax, ha="center", va="bottom") ax.margins(.075, .075) return fig
def plot_ccpr_ax(res, exog_idx=None, ax=None): """Plot CCPR against one regressor. Generates a CCPR (component and component-plus-residual) plot. Parameters ---------- res : result instance uses exog and params of the result instance exog_idx : int (column) index of the exog used in the plot ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- plot_ccpr : Creates CCPR plot for multiple regressors in a plot grid. References ---------- See http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm """ fig, ax = utils.create_mpl_ax(ax) x1 = res.model.exog[:,exog_idx] #namestr = ' for %s' % self.name if self.name else '' x1beta = x1*res.params[1] ax.plot(x1, x1beta + res.resid, 'o') ax.plot(x1, x1beta, '-') ax.set_title('X_%d beta_%d plus residuals versus exog (CCPR)' % \ (exog_idx, exog_idx)) return fig
def plot_partregress_ax(endog, exog_i, exog_others, varname='', title_fontsize=None, ax=None): """Plot partial regression for a single regressor. Parameters ---------- endog : ndarray endogenous or response variable exog_i : ndarray exogenous, explanatory variable exog_others : ndarray other exogenous, explanatory variables, the effect of these variables will be removed by OLS regression varname : str name of the variable used in the title ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- plot_partregress : Plot partial regression for a set of regressors. """ fig, ax = utils.create_mpl_ax(ax) res1a = OLS(endog, exog_others).fit() res1b = OLS(exog_i, exog_others).fit() ax.plot(res1b.resid, res1a.resid, 'o') res1c = OLS(res1a.resid, res1b.resid).fit() ax.plot(res1b.resid, res1c.fittedvalues, '-', color='k') ax.set_title('Partial Regression plot %s' % varname, fontsize=title_fontsize)# + namestr) return fig
def plot_ceres_residuals(results, focus_exog, frac=0.66, cond_means=None, ax=None): # Docstring attached below model = results.model focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model) presid = ceres_resids(results, focus_exog, frac=frac, cond_means=cond_means) focus_exog_vals = model.exog[:, focus_col] fig, ax = utils.create_mpl_ax(ax) ax.plot(focus_exog_vals, presid, 'o', alpha=0.6) ax.set_title('CERES residuals plot', fontsize='large') ax.set_xlabel(focus_exog, size=15) ax.set_ylabel("Component plus residual", size=15) return fig
def plot_partial_residuals(results, focus_exog, ax=None): # Docstring attached below model = results.model focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model) pr = partial_resids(results, focus_exog) focus_exog_vals = results.model.exog[:, focus_col] fig, ax = utils.create_mpl_ax(ax) ax.plot(focus_exog_vals, pr, 'o', alpha=0.6) ax.set_title('Partial residuals plot', fontsize='large') if type(focus_exog) is str: xname = focus_exog else: xname = model.exog_names[focus_exog] ax.set_xlabel(xname, size=15) ax.set_ylabel("Component plus residual", size=15) return fig
def seasonal_plot(grouped_x, xticklabels, ylabel=None, ax=None): """ Consider using one of month_plot or quarter_plot unless you need irregular plotting. Parameters ---------- grouped_x : iterable of DataFrames Should be a GroupBy object (or similar pair of group_names and groups as DataFrames) with a DatetimeIndex or PeriodIndex xticklabels : list of str List of season labels, one for each group. ylabel : str Lable for y axis ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. """ fig, ax = utils.create_mpl_ax(ax) start = 0 ticks = [] for season, df in grouped_x: df = df.copy() # or sort balks for series. may be better way df.sort_index() nobs = len(df) x_plot = np.arange(start, start + nobs) ticks.append(x_plot.mean()) ax.plot(x_plot, df.values, 'k') ax.hlines(df.values.mean(), x_plot[0], x_plot[-1], colors='r', linewidth=3) start += nobs ax.set_xticks(ticks) ax.set_xticklabels(xticklabels) ax.set_ylabel(ylabel) ax.margins(.1, .05) return fig
def plot_acf(x, ax=None, lags=None, *, alpha=.05, use_vlines=True, adjusted=False, fft=False, missing='none', title='Autocorrelation', zero=True, vlines_kwargs=None, **kwargs): """ Plot the autocorrelation function Plots lags on the horizontal and the correlations on vertical axis. Parameters ---------- x : array_like Array of time-series values ax : AxesSubplot, optional If given, this subplot is used to plot in instead of a new figure being created. lags : {int, array_like}, optional An int or array of lag values, used on horizontal axis. Uses np.arange(lags) when lags is an int. If not provided, ``lags=np.arange(len(corr))`` is used. alpha : scalar, optional If a number is given, the confidence intervals for the given level are returned. For instance if alpha=.05, 95 % confidence intervals are returned where the standard deviation is computed according to Bartlett's formula. If None, no confidence intervals are plotted. use_vlines : bool, optional If True, vertical lines and markers are plotted. If False, only markers are plotted. The default marker is 'o'; it can be overridden with a ``marker`` kwarg. adjusted : bool If True, then denominators for autocovariance are n-k, otherwise n fft : bool, optional If True, computes the ACF via FFT. missing : str, optional A string in ['none', 'raise', 'conservative', 'drop'] specifying how the NaNs are to be treated. title : str, optional Title to place on plot. Default is 'Autocorrelation' zero : bool, optional Flag indicating whether to include the 0-lag autocorrelation. Default is True. vlines_kwargs : dict, optional Optional dictionary of keyword arguments that are passed to vlines. **kwargs : kwargs, optional Optional keyword arguments that are directly passed on to the Matplotlib ``plot`` and ``axhline`` functions. Returns ------- Figure If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- matplotlib.pyplot.xcorr matplotlib.pyplot.acorr Notes ----- Adapted from matplotlib's `xcorr`. Data are plotted as ``plot(lags, corr, **kwargs)`` kwargs is used to pass matplotlib optional arguments to both the line tracing the autocorrelations and for the horizontal line at 0. These options must be valid for a Line2D object. vlines_kwargs is used to pass additional optional arguments to the vertical lines connecting each autocorrelation to the axis. These options must be valid for a LineCollection object. Examples -------- >>> import pandas as pd >>> import matplotlib.pyplot as plt >>> import statsmodels.api as sm >>> dta = sm.datasets.sunspots.load_pandas().data >>> dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008')) >>> del dta["YEAR"] >>> sm.graphics.tsa.plot_acf(dta.values.squeeze(), lags=40) >>> plt.show() .. plot:: plots/graphics_tsa_plot_acf.py """ fig, ax = utils.create_mpl_ax(ax) lags, nlags, irregular = _prepare_data_corr_plot(x, lags, zero) vlines_kwargs = {} if vlines_kwargs is None else vlines_kwargs confint = None # acf has different return type based on alpha acf_x = acf(x, nlags=nlags, alpha=alpha, fft=fft, adjusted=adjusted, missing=missing) if alpha is not None: acf_x, confint = acf_x _plot_corr(ax, title, acf_x, confint, lags, irregular, use_vlines, vlines_kwargs, **kwargs) return fig
def interaction_plot(x, trace, response, func=np.mean, ax=None, plottype='b', xlabel=None, ylabel=None, colors=None, markers=None, linestyles=None, legendloc='best', legendtitle=None, **kwargs): """ Interaction plot for factor level statistics. Note. If categorial factors are supplied levels will be internally recoded to integers. This ensures matplotlib compatibility. Uses a DataFrame to calculate an `aggregate` statistic for each level of the factor or group given by `trace`. Parameters ---------- x : array_like The `x` factor levels constitute the x-axis. If a `pandas.Series` is given its name will be used in `xlabel` if `xlabel` is None. trace : array_like The `trace` factor levels will be drawn as lines in the plot. If `trace` is a `pandas.Series` its name will be used as the `legendtitle` if `legendtitle` is None. response : array_like The reponse or dependent variable. If a `pandas.Series` is given its name will be used in `ylabel` if `ylabel` is None. func : function Anything accepted by `pandas.DataFrame.aggregate`. This is applied to the response variable grouped by the trace levels. ax : axes, optional Matplotlib axes instance plottype : str {'line', 'scatter', 'both'}, optional The type of plot to return. Can be 'l', 's', or 'b' xlabel : str, optional Label to use for `x`. Default is 'X'. If `x` is a `pandas.Series` it will use the series names. ylabel : str, optional Label to use for `response`. Default is 'func of response'. If `response` is a `pandas.Series` it will use the series names. colors : list, optional If given, must have length == number of levels in trace. markers : list, optional If given, must have length == number of levels in trace linestyles : list, optional If given, must have length == number of levels in trace. legendloc : {None, str, int} Location passed to the legend command. legendtitle : {None, str} Title of the legend. **kwargs These will be passed to the plot command used either plot or scatter. If you want to control the overall plotting options, use kwargs. Returns ------- Figure The figure given by `ax.figure` or a new instance. Examples -------- >>> import numpy as np >>> np.random.seed(12345) >>> weight = np.random.randint(1,4,size=60) >>> duration = np.random.randint(1,3,size=60) >>> days = np.log(np.random.randint(1,30, size=60)) >>> fig = interaction_plot(weight, duration, days, ... colors=['red','blue'], markers=['D','^'], ms=10) >>> import matplotlib.pyplot as plt >>> plt.show() .. plot:: import numpy as np from statsmodels.graphics.factorplots import interaction_plot np.random.seed(12345) weight = np.random.randint(1,4,size=60) duration = np.random.randint(1,3,size=60) days = np.log(np.random.randint(1,30, size=60)) fig = interaction_plot(weight, duration, days, colors=['red','blue'], markers=['D','^'], ms=10) import matplotlib.pyplot as plt #plt.show() """ from pandas import DataFrame fig, ax = utils.create_mpl_ax(ax) response_name = ylabel or getattr(response, 'name', 'response') ylabel = '%s of %s' % (func.__name__, response_name) xlabel = xlabel or getattr(x, 'name', 'X') legendtitle = legendtitle or getattr(trace, 'name', 'Trace') ax.set_ylabel(ylabel) ax.set_xlabel(xlabel) x_values = x_levels = None if isinstance(x[0], str): x_levels = [l for l in np.unique(x)] x_values = lrange(len(x_levels)) x = _recode(x, dict(zip(x_levels, x_values))) data = DataFrame(dict(x=x, trace=trace, response=response)) plot_data = data.groupby(['trace', 'x']).aggregate(func).reset_index() # return data # check plot args n_trace = len(plot_data['trace'].unique()) linestyles = ['-'] * n_trace if linestyles is None else linestyles markers = ['.'] * n_trace if markers is None else markers colors = rainbow(n_trace) if colors is None else colors if len(linestyles) != n_trace: raise ValueError("Must be a linestyle for each trace level") if len(markers) != n_trace: raise ValueError("Must be a marker for each trace level") if len(colors) != n_trace: raise ValueError("Must be a color for each trace level") if plottype == 'both' or plottype == 'b': for i, (values, group) in enumerate(plot_data.groupby(['trace'])): # trace label label = str(group['trace'].values[0]) ax.plot(group['x'], group['response'], color=colors[i], marker=markers[i], label=label, linestyle=linestyles[i], **kwargs) elif plottype == 'line' or plottype == 'l': for i, (values, group) in enumerate(plot_data.groupby(['trace'])): # trace label label = str(group['trace'].values[0]) ax.plot(group['x'], group['response'], color=colors[i], label=label, linestyle=linestyles[i], **kwargs) elif plottype == 'scatter' or plottype == 's': for i, (values, group) in enumerate(plot_data.groupby(['trace'])): # trace label label = str(group['trace'].values[0]) ax.scatter(group['x'], group['response'], color=colors[i], label=label, marker=markers[i], **kwargs) else: raise ValueError("Plot type %s not understood" % plottype) ax.legend(loc=legendloc, title=legendtitle) ax.margins(.1) if all([x_levels, x_values]): ax.set_xticks(x_values) ax.set_xticklabels(x_levels) return fig
def plot_acf(x, ax=None, lags=None, alpha=.05, use_vlines=True, unbiased=False, fft=False, **kwargs): """Plot the autocorrelation function Plots lags on the horizontal and the correlations on vertical axis. Parameters ---------- x : array_like Array of time-series values ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. lags : array_like, optional Array of lag values, used on horizontal axis. If not given, ``lags=np.arange(len(corr))`` is used. alpha : scalar, optional If a number is given, the confidence intervals for the given level are returned. For instance if alpha=.05, 95 % confidence intervals are returned where the standard deviation is computed according to Bartlett's formula. If None, no confidence intervals are plotted. use_vlines : bool, optional If True, vertical lines and markers are plotted. If False, only markers are plotted. The default marker is 'o'; it can be overridden with a ``marker`` kwarg. unbiased : bool If True, then denominators for autocovariance are n-k, otherwise n fft : bool, optional If True, computes the ACF via FFT. **kwargs : kwargs, optional Optional keyword arguments that are directly passed on to the Matplotlib ``plot`` and ``axhline`` functions. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- matplotlib.pyplot.xcorr matplotlib.pyplot.acorr mpl_examples/pylab_examples/xcorr_demo.py Notes ----- Adapted from matplotlib's `xcorr`. Data are plotted as ``plot(lags, corr, **kwargs)`` """ fig, ax = utils.create_mpl_ax(ax) if lags is None: lags = np.arange(len(x)) nlags = len(lags) - 1 else: nlags = lags lags = np.arange(lags + 1) # +1 for zero lag confint = None # acf has different return type based on alpha if alpha is None: acf_x = acf(x, nlags=nlags, alpha=alpha, fft=fft, unbiased=unbiased) else: acf_x, confint = acf(x, nlags=nlags, alpha=alpha, fft=fft, unbiased=unbiased) if use_vlines: ax.vlines(lags, [0], acf_x, **kwargs) ax.axhline(**kwargs) kwargs.setdefault('marker', 'o') kwargs.setdefault('markersize', 5) kwargs.setdefault('linestyle', 'None') ax.margins(.05) ax.plot(lags, acf_x, **kwargs) ax.set_title("Autocorrelation") if confint is not None: # center the confidence interval TODO: do in acf? ax.fill_between(lags, confint[:, 0] - acf_x, confint[:, 1] - acf_x, alpha=.25) return fig
def plot_fit_obs(self, col_name, lowess_args=None, lowess_min_n=40, jitter=None, plot_points=True, ax=None): """ Plot fitted versus imputed or observed values as a scatterplot. Parameters ---------- col_name : string The variable to be plotted on the horizontal axis. lowess_args : dict-like Keyword arguments passed to lowess fit. A dictionary of dictionaries, keys are 'o' and 'i' denoting 'observed' and 'imputed', respectively. lowess_min_n : integer Minimum sample size to plot a lowess fit jitter : float or tuple Standard deviation for jittering points in the plot. Either a single scalar applied to both axes, or a tuple containing x-axis jitter and y-axis jitter, respectively. plot_points : bool If True, the data points are plotted. ax : matplotlib axes object Axes on which to plot, created if not provided. Returns ------- The matplotlib figure on which the plot is drawn. """ from statsmodels.graphics import utils as gutils from statsmodels.nonparametric.smoothers_lowess import lowess if lowess_args is None: lowess_args = {} if ax is None: fig, ax = gutils.create_mpl_ax(ax) else: fig = ax.get_figure() ax.set_position([0.1, 0.1, 0.7, 0.8]) ixi = self.ix_miss[col_name] ixo = self.ix_obs[col_name] vec1 = np.asarray(self.data[col_name]) # Fitted values formula = self.conditional_formula[col_name] endog, exog = patsy.dmatrices(formula, self.data, return_type="dataframe") results = self.results[col_name] vec2 = results.predict(exog=exog) vec2 = self._get_predicted(vec2) if jitter is not None: if np.isscalar(jitter): jitter = (jitter, jitter) vec1 += jitter[0] * np.random.normal(size=len(vec1)) vec2 += jitter[1] * np.random.normal(size=len(vec2)) # Plot the points keys = ['o', 'i'] ixs = {'o': ixo, 'i': ixi} lak = {'o': 'obs', 'i': 'imp'} color = {'o': 'orange', 'i': 'lime'} if plot_points: for ky in keys: ix = ixs[ky] ax.plot(vec1[ix], vec2[ix], 'o', color=color[ky], label=lak[ky], alpha=0.6) # Plot the lowess fits for ky in keys: ix = ixs[ky] if len(ix) < lowess_min_n: continue if ky in lowess_args: la = lowess_args[ky] else: la = {} ix = ixs[ky] lfit = lowess(vec2[ix], vec1[ix], **la) ax.plot(lfit[:, 0], lfit[:, 1], '-', color=color[ky], alpha=0.6, lw=4, label=lak[ky]) ha, la = ax.get_legend_handles_labels() leg = fig.legend(ha, la, 'center right', numpoints=1) leg.draw_frame(False) ax.set_xlabel(col_name + " observed or imputed") ax.set_ylabel(col_name + " fitted") return fig
def plot_missing_pattern(self, ax=None, row_order="pattern", column_order="pattern", hide_complete_rows=False, hide_complete_columns=False, color_row_patterns=True): """ Generate an image showing the missing data pattern. Parameters ---------- ax : matplotlib axes Axes on which to draw the plot. row_order : string The method for ordering the rows. Must be one of 'pattern', 'proportion', or 'raw'. column_order : string The method for ordering the columns. Must be one of 'pattern', 'proportion', or 'raw'. hide_complete_rows : boolean If True, rows with no missing values are not drawn. hide_complete_columns : boolean If True, columns with no missing values are not drawn. color_row_patterns : boolean If True, color the unique row patterns, otherwise use grey and white as colors. Returns ------- A figure containing a plot of the missing data pattern. """ # Create an indicator matrix for missing values. miss = np.zeros(self.data.shape) cols = self.data.columns for j, col in enumerate(cols): ix = self.ix_miss[col] miss[ix, j] = 1 # Order the columns as requested if column_order == "proportion": ix = np.argsort(miss.mean(0)) elif column_order == "pattern": cv = np.cov(miss.T) u, s, vt = np.linalg.svd(cv, 0) ix = np.argsort(cv[:, 0]) elif column_order == "raw": ix = np.arange(len(cols)) else: raise ValueError( column_order + " is not an allowed value for `column_order`.") miss = miss[:, ix] cols = [cols[i] for i in ix] # Order the rows as requested if row_order == "proportion": ix = np.argsort(miss.mean(1)) elif row_order == "pattern": x = 2**np.arange(miss.shape[1]) rky = np.dot(miss, x) ix = np.argsort(rky) elif row_order == "raw": ix = np.arange(miss.shape[0]) else: raise ValueError( row_order + " is not an allowed value for `row_order`.") miss = miss[ix, :] if hide_complete_rows: ix = np.flatnonzero((miss == 1).any(1)) miss = miss[ix, :] if hide_complete_columns: ix = np.flatnonzero((miss == 1).any(0)) miss = miss[:, ix] cols = [cols[i] for i in ix] from statsmodels.graphics import utils as gutils from matplotlib.colors import LinearSegmentedColormap if ax is None: fig, ax = gutils.create_mpl_ax(ax) else: fig = ax.get_figure() if color_row_patterns: x = 2**np.arange(miss.shape[1]) rky = np.dot(miss, x) _, rcol = np.unique(rky, return_inverse=True) miss *= 1 + rcol[:, None] ax.imshow(miss, aspect="auto", interpolation="nearest", cmap='gist_ncar_r') else: cmap = LinearSegmentedColormap.from_list("_", ["white", "darkgrey"]) ax.imshow(miss, aspect="auto", interpolation="nearest", cmap=cmap) ax.set_ylabel("Cases") ax.set_xticks(range(len(cols))) ax.set_xticklabels(cols, rotation=90) return fig
def plot_predict( result, start=None, end=None, dynamic=False, alpha=0.05, ax=None, **predict_kwargs, ): """ Parameters ---------- result : Result Any model result supporting ``get_prediction``. start : int, str, or datetime, optional Zero-indexed observation number at which to start forecasting, i.e., the first forecast is start. Can also be a date string to parse or a datetime type. Default is the the zeroth observation. end : int, str, or datetime, optional Zero-indexed observation number at which to end forecasting, i.e., the last forecast is end. Can also be a date string to parse or a datetime type. However, if the dates index does not have a fixed frequency, end must be an integer index if you want out of sample prediction. Default is the last observation in the sample. dynamic : bool, int, str, or datetime, optional Integer offset relative to `start` at which to begin dynamic prediction. Can also be an absolute date string to parse or a datetime type (these are not interpreted as offsets). Prior to this observation, true endogenous values will be used for prediction; starting with this observation and continuing through the end of prediction, forecasted endogenous values will be used instead. alpha : {float, None} The tail probability not covered by the confidence interval. Must be in (0, 1). Confidence interval is constructed assuming normally distributed shocks. If None, figure will not show the confidence interval. ax : AxesSubplot matplotlib Axes instance to use **predict_kwargs Any additional keyword arguments to pass to ``result.get_prediction``. Returns ------- Figure matplotlib Figure containing the prediction plot """ from statsmodels.graphics.utils import _import_mpl, create_mpl_ax _ = _import_mpl() fig, ax = create_mpl_ax(ax) from statsmodels.tsa.base.prediction import PredictionResults # use predict so you set dates pred: PredictionResults = result.get_prediction(start=start, end=end, dynamic=dynamic, **predict_kwargs) mean = pred.predicted_mean if isinstance(mean, (pd.Series, pd.DataFrame)): x = mean.index mean.plot(ax=ax, label="forecast") else: x = np.arange(mean.shape[0]) ax.plot(x, mean) if alpha is not None: label = f"{1-alpha:.0%} confidence interval" ci = pred.conf_int(alpha) conf_int = np.asarray(ci) ax.fill_between( x, conf_int[:, 0], conf_int[:, 1], color="gray", alpha=0.5, label=label, ) ax.legend(loc="best") return fig
def plot_fit(results, exog_idx, y_true=None, ax=None, vlines=True, **kwargs): """ Plot fit against one regressor. This creates one graph with the scatterplot of observed values compared to fitted values. Parameters ---------- results : Results A result instance with resid, model.endog and model.exog as attributes. exog_idx : {int, str} Name or index of regressor in exog matrix. y_true : array_like. optional If this is not None, then the array is added to the plot. ax : AxesSubplot, optional If given, this subplot is used to plot in instead of a new figure being created. vlines : bool, optional If this not True, then the uncertainty of the fit is not plotted. **kwargs The keyword arguments are passed to the plot command for the fitted values points. Returns ------- Figure If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. Examples -------- Load the Statewide Crime data set and perform linear regression with `poverty` and `hs_grad` as variables and `murder` as the response >>> import statsmodels.api as sm >>> import matplotlib.pyplot as plt >>> data = sm.datasets.statecrime.load_pandas().data >>> murder = data['murder'] >>> X = data[['poverty', 'hs_grad']] >>> X["constant"] = 1 >>> y = murder >>> model = sm.OLS(y, X) >>> results = model.fit() Create a plot just for the variable 'Poverty': >>> fig, ax = plt.subplots() >>> fig = sm.graphics.plot_fit(results, 0, ax=ax) >>> ax.set_ylabel("Murder Rate") >>> ax.set_xlabel("Poverty Level") >>> ax.set_title("Linear Regression") >>> plt.show() .. plot:: plots/graphics_plot_fit_ex.py """ fig, ax = utils.create_mpl_ax(ax) exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) results = maybe_unwrap_results(results) #maybe add option for wendog, wexog y = results.model.endog x1 = results.model.exog[:, exog_idx] x1_argsort = np.argsort(x1) y = y[x1_argsort] x1 = x1[x1_argsort] ax.plot(x1, y, 'bo', label=results.model.endog_names) if y_true is not None: ax.plot(x1, y_true[x1_argsort], 'b-', label='True values') title = 'Fitted values versus %s' % exog_name ax.plot(x1, results.fittedvalues[x1_argsort], 'D', color='r', label='fitted', **kwargs) if vlines is True: _, iv_l, iv_u = wls_prediction_std(results) ax.vlines(x1, iv_l[x1_argsort], iv_u[x1_argsort], linewidth=1, color='k', alpha=.7) #ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1, # color='k') ax.set_title(title) ax.set_xlabel(exog_name) ax.set_ylabel(results.model.endog_names) ax.legend(loc='best', numpoints=1) return fig
def plot_missing_pattern(self, ax=None, row_order="pattern", column_order="pattern", hide_complete_rows=False, hide_complete_columns=False, color_row_patterns=True): """ Generate an image showing the missing data pattern. Parameters ---------- ax : matplotlib axes Axes on which to draw the plot. row_order : string The method for ordering the rows. Must be one of 'pattern', 'proportion', or 'raw'. column_order : string The method for ordering the columns. Must be one of 'pattern', 'proportion', or 'raw'. hide_complete_rows : boolean If True, rows with no missing values are not drawn. hide_complete_columns : boolean If True, columns with no missing values are not drawn. color_row_patterns : boolean If True, color the unique row patterns, otherwise use grey and white as colors. Returns ------- A figure containing a plot of the missing data pattern. """ # Create an indicator matrix for missing values. miss = np.zeros(self.data.shape) cols = self.data.columns for j, col in enumerate(cols): ix = self.ix_miss[col] miss[ix, j] = 1 # Order the columns as requested if column_order == "proportion": ix = np.argsort(miss.mean(0)) elif column_order == "pattern": cv = np.cov(miss.T) u, s, vt = np.linalg.svd(cv, 0) ix = np.argsort(cv[:, 0]) elif column_order == "raw": ix = np.arange(len(cols)) else: raise ValueError(column_order + " is not an allowed value for `column_order`.") miss = miss[:, ix] cols = [cols[i] for i in ix] # Order the rows as requested if row_order == "proportion": ix = np.argsort(miss.mean(1)) elif row_order == "pattern": x = 2**np.arange(miss.shape[1]) rky = np.dot(miss, x) ix = np.argsort(rky) elif row_order == "raw": ix = np.arange(miss.shape[0]) else: raise ValueError(row_order + " is not an allowed value for `row_order`.") miss = miss[ix, :] if hide_complete_rows: ix = np.flatnonzero((miss == 1).any(1)) miss = miss[ix, :] if hide_complete_columns: ix = np.flatnonzero((miss == 1).any(0)) miss = miss[:, ix] cols = [cols[i] for i in ix] from statsmodels.graphics import utils as gutils from matplotlib.colors import LinearSegmentedColormap if ax is None: fig, ax = gutils.create_mpl_ax(ax) else: fig = ax.get_figure() if color_row_patterns: x = 2**np.arange(miss.shape[1]) rky = np.dot(miss, x) _, rcol = np.unique(rky, return_inverse=True) miss *= 1 + rcol[:, None] ax.imshow(miss, aspect="auto", interpolation="nearest", cmap='gist_ncar_r') else: cmap = LinearSegmentedColormap.from_list("_", ["white", "darkgrey"]) ax.imshow(miss, aspect="auto", interpolation="nearest", cmap=cmap) ax.set_ylabel("Cases") ax.set_xticks(range(len(cols))) ax.set_xticklabels(cols, rotation=90) return fig
def plot_pacf(x, ax=None, lags=None, alpha=.05, method='ywunbiased', use_vlines=True, title='Partial Autocorrelation', zero=True, **kwargs): """ Plot the partial autocorrelation function Parameters ---------- x : array_like Array of time-series values ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. lags : int or array_like, optional int or Array of lag values, used on horizontal axis. Uses np.arange(lags) when lags is an int. If not provided, ``lags=np.arange(len(corr))`` is used. alpha : float, optional If a number is given, the confidence intervals for the given level are returned. For instance if alpha=.05, 95 % confidence intervals are returned where the standard deviation is computed according to 1/sqrt(len(x)) method : {'ywunbiased', 'ywmle', 'ols'} Specifies which method for the calculations to use: - yw or ywunbiased : yule walker with bias correction in denominator for acovf. Default. - ywm or ywmle : yule walker without bias correction - ols - regression of time series on lags of it and on constant - ld or ldunbiased : Levinson-Durbin recursion with bias correction - ldb or ldbiased : Levinson-Durbin recursion without bias correction use_vlines : bool, optional If True, vertical lines and markers are plotted. If False, only markers are plotted. The default marker is 'o'; it can be overridden with a ``marker`` kwarg. title : str, optional Title to place on plot. Default is 'Partial Autocorrelation' zero : bool, optional Flag indicating whether to include the 0-lag autocorrelation. Default is True. **kwargs : kwargs, optional Optional keyword arguments that are directly passed on to the Matplotlib ``plot`` and ``axhline`` functions. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- matplotlib.pyplot.xcorr matplotlib.pyplot.acorr mpl_examples/pylab_examples/xcorr_demo.py Notes ----- Plots lags on the horizontal and the correlations on vertical axis. Adapted from matplotlib's `xcorr`. Data are plotted as ``plot(lags, corr, **kwargs)`` """ fig, ax = utils.create_mpl_ax(ax) lags, nlags, irregular = _prepare_data_corr_plot(x, lags, zero) confint = None if alpha is None: acf_x = pacf(x, nlags=nlags, alpha=alpha, method=method) else: acf_x, confint = pacf(x, nlags=nlags, alpha=alpha, method=method) _plot_corr(ax, title, acf_x, confint, lags, irregular, use_vlines, **kwargs) return fig
def plot_acf(x, ax=None, lags=None, alpha=.05, use_vlines=True, unbiased=False, fft=False, title='Autocorrelation', zero=True, **kwargs): """Plot the autocorrelation function Plots lags on the horizontal and the correlations on vertical axis. Parameters ---------- x : array_like Array of time-series values ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. lags : int or array_like, optional int or Array of lag values, used on horizontal axis. Uses np.arange(lags) when lags is an int. If not provided, ``lags=np.arange(len(corr))`` is used. alpha : scalar, optional If a number is given, the confidence intervals for the given level are returned. For instance if alpha=.05, 95 % confidence intervals are returned where the standard deviation is computed according to Bartlett's formula. If None, no confidence intervals are plotted. use_vlines : bool, optional If True, vertical lines and markers are plotted. If False, only markers are plotted. The default marker is 'o'; it can be overridden with a ``marker`` kwarg. unbiased : bool If True, then denominators for autocovariance are n-k, otherwise n fft : bool, optional If True, computes the ACF via FFT. title : str, optional Title to place on plot. Default is 'Autocorrelation' zero : bool, optional Flag indicating whether to include the 0-lag autocorrelation. Default is True. **kwargs : kwargs, optional Optional keyword arguments that are directly passed on to the Matplotlib ``plot`` and ``axhline`` functions. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- matplotlib.pyplot.xcorr matplotlib.pyplot.acorr mpl_examples/pylab_examples/xcorr_demo.py Notes ----- Adapted from matplotlib's `xcorr`. Data are plotted as ``plot(lags, corr, **kwargs)`` """ fig, ax = utils.create_mpl_ax(ax) lags, nlags, irregular = _prepare_data_corr_plot(x, lags, zero) confint = None # acf has different return type based on alpha if alpha is None: acf_x = acf(x, nlags=nlags, alpha=alpha, fft=fft, unbiased=unbiased) else: acf_x, confint = acf(x, nlags=nlags, alpha=alpha, fft=fft, unbiased=unbiased) _plot_corr(ax, title, acf_x, confint, lags, irregular, use_vlines, **kwargs) return fig
def plot_fit(results, exog_idx, y_true=None, ax=None, **kwargs): """Plot fit against one regressor. This creates one graph with the scatterplot of observed values compared to fitted values. Parameters ---------- results : result instance result instance with resid, model.endog and model.exog as attributes x_var : int or str Name or index of regressor in exog matrix. y_true : array_like (optional) If this is not None, then the array is added to the plot ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. kwargs The keyword arguments are passed to the plot command for the fitted values points. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. Examples -------- Load the Statewide Crime data set and perform linear regression with `poverty` and `hs_grad` as variables and `murder` as the response >>> import statsmodels.api as sm >>> import matplotlib.pyplot as plt >>> data = sm.datasets.statecrime.load_pandas().data >>> murder = data['murder'] >>> X = data[['poverty', 'hs_grad']] >>> X["constant"] = 1 >>> y = murder >>> model = sm.OLS(y, X) >>> results = model.fit() Create a plot just for the variable 'Poverty': >>> fig, ax = plt.subplots() >>> fig = sm.graphics.plot_fit(results, 0, ax=ax) >>> ax.set_ylabel("Murder Rate") >>> ax.set_xlabel("Poverty Level") >>> ax.set_title("Linear Regression") >>> plt.show() .. plot:: plots/graphics_plot_fit_ex.py """ fig, ax = utils.create_mpl_ax(ax) exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) results = maybe_unwrap_results(results) #maybe add option for wendog, wexog y = results.model.endog x1 = results.model.exog[:, exog_idx] x1_argsort = np.argsort(x1) y = y[x1_argsort] x1 = x1[x1_argsort] ax.plot(x1, y, 'bo', label=results.model.endog_names) if not y_true is None: ax.plot(x1, y_true[x1_argsort], 'b-', label='True values') title = 'Fitted values versus %s' % exog_name prstd, iv_l, iv_u = wls_prediction_std(results) ax.plot(x1, results.fittedvalues[x1_argsort], 'D', color='r', label='fitted', **kwargs) ax.vlines(x1, iv_l[x1_argsort], iv_u[x1_argsort], linewidth=1, color='k', alpha=.7) #ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1, # color='k') ax.set_title(title) ax.set_xlabel(exog_name) ax.set_ylabel(results.model.endog_names) ax.legend(loc='best', numpoints=1) return fig
def plot_ccf(x, y, ax=None, lags=None, *, alpha=.05, use_vlines=True, unbiased=False, title='Cross-correlation', vlines_kwargs=None, **kwargs): """ Plot the cross-correlation function Plots lags on the horizontal and the correlations on vertical axis. Parameters ---------- x : array_like Array of time-series values y: array_like Array of time-series values ax : AxesSubplot, optional If given, this subplot is used to plot in instead of a new figure being created. lags : {int, array_like}, optional An int or array of lag values, used on horizontal axis. Uses np.arange(lags) when lags is an int. If not provided, ``lags=np.arange(len(corr))`` is used. alpha : scalar, optional If a number is given, the confidence intervals for the given level are returned. For instance if alpha=.05, 95 % confidence intervals are returned where the standard deviation is computed according to Bartlett's formula. If None, no confidence intervals are plotted. use_vlines : bool, optional If True, vertical lines and markers are plotted. If False, only markers are plotted. The default marker is 'o'; it can be overridden with a ``marker`` kwarg. unbiased : bool If True, then denominators for autocovariance are n-k, otherwise n title : str, optional Title to place on plot. Default is 'Cross-correlation' vlines_kwargs : dict, optional Optional dictionary of keyword arguments that are passed to vlines. **kwargs : kwargs, optional Optional keyword arguments that are directly passed on to the Matplotlib ``plot`` and ``axhline`` functions. Returns ------- Figure If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- matplotlib.pyplot.xcorr matplotlib.pyplot.acorr Notes ----- Adapted from https://github.com/statsmodels/statsmodels/blob/master/statsmodels/graphics/tsaplots.py """ fig, ax = utils.create_mpl_ax(ax) lags, nlags, irregular = _prepare_data_corr_plot_ccf(x, lags) vlines_kwargs = {} if vlines_kwargs is None else vlines_kwargs confint = None ccf_data = ccf(y, x, lag_max=nlags) if alpha is not None: z = norm.ppf(1 - alpha / 2) sl = z / np.sqrt(len(x)) confint = np.empty((2 * nlags + 1, 2)) confint[:, 0] = ccf_data - sl confint[:, 1] = ccf_data + sl _plot_corr(ax, title, ccf_data, confint, lags, irregular, use_vlines, vlines_kwargs, **kwargs) return fig
def mv_mean_contour(self, mu1_low, mu1_upp, mu2_low, mu2_upp, step1, step2, levs=(.001, .01, .05, .1, .2), var1_name=None, var2_name=None, plot_dta=False): """ Creates a confidence region plot for the mean of bivariate data Parameters ---------- m1_low : float Minimum value of the mean for variable 1 m1_upp : float Maximum value of the mean for variable 1 mu2_low : float Minimum value of the mean for variable 2 mu2_upp : float Maximum value of the mean for variable 2 step1 : float Increment of evaluations for variable 1 step2 : float Increment of evaluations for variable 2 levs : list Levels to be drawn on the contour plot. Default = (.001, .01, .05, .1, .2) plot_dta : bool If True, makes a scatter plot of the data on top of the contour plot. Defaultis False. var1_name : str Name of variable 1 to be plotted on the x-axis var2_name : str Name of variable 2 to be plotted on the y-axis Notes ----- The smaller the step size, the more accurate the intervals will be If the function returns optimization failed, consider narrowing the boundaries of the plot Examples -------- >>> import statsmodels.api as sm >>> two_rvs = np.random.standard_normal((20,2)) >>> el_analysis = sm.emplike.DescStat(two_rvs) >>> contourp = el_analysis.mv_mean_contour(-2, 2, -2, 2, .1, .1) >>> contourp.show() """ if self.endog.shape[1] != 2: raise ValueError('Data must contain exactly two variables') fig, ax = utils.create_mpl_ax() if var2_name is None: ax.set_ylabel('Variable 2') else: ax.set_ylabel(var2_name) if var1_name is None: ax.set_xlabel('Variable 1') else: ax.set_xlabel(var1_name) x = np.arange(mu1_low, mu1_upp, step1) y = np.arange(mu2_low, mu2_upp, step2) pairs = itertools.product(x, y) z = [] for i in pairs: z.append(self.mv_test_mean(np.asarray(i))[0]) X, Y = np.meshgrid(x, y) z = np.asarray(z) z = z.reshape(X.shape[1], Y.shape[0]) ax.contour(x, y, z.T, levels=levs) if plot_dta: ax.plot(self.endog[:, 0], self.endog[:, 1], 'bo') return fig
def _influence_plot(results, influence, external=True, alpha=.05, criterion="cooks", size=48, plot_alpha=.75, ax=None, leverage=None, resid=None, **kwargs): # leverage and resid kwds are used only internally for MLEInfluence infl = influence fig, ax = utils.create_mpl_ax(ax) if criterion.lower().startswith('coo'): psize = infl.cooks_distance[0] elif criterion.lower().startswith('dff'): psize = np.abs(infl.dffits[0]) else: raise ValueError("Criterion %s not understood" % criterion) # scale the variables #TODO: what is the correct scaling and the assumption here? #we want plots to be comparable across different plots #so we would need to use the expected distribution of criterion probably old_range = np.ptp(psize) new_range = size**2 - 8**2 psize = (psize - psize.min()) * new_range / old_range + 8**2 if leverage is None: leverage = infl.hat_matrix_diag if resid is None: ylabel = "Studentized Residuals" if external: resid = infl.resid_studentized_external else: resid = infl.resid_studentized else: resid = np.asarray(resid) ylabel = "Residuals" from scipy import stats cutoff = stats.t.ppf(1. - alpha / 2, results.df_resid) large_resid = np.abs(resid) > cutoff large_leverage = leverage > _high_leverage(results) large_points = np.logical_or(large_resid, large_leverage) ax.scatter(leverage, resid, s=psize, alpha=plot_alpha) # add point labels labels = results.model.data.row_labels if labels is None: labels = lrange(len(resid)) ax = utils.annotate_axes( np.where(large_points)[0], labels, lzip(leverage, resid), lzip(-(psize / 2)**.5, (psize / 2)**.5), "x-large", ax) # TODO: make configurable or let people do it ex-post? font = {"fontsize": 16, "color": "black"} ax.set_ylabel(ylabel, **font) ax.set_xlabel("Leverage", **font) ax.set_title("Influence Plot", **font) return fig
def abline_plot(intercept=None, slope=None, horiz=None, vert=None, model_results=None, ax=None, **kwargs): """ Plot a line given an intercept and slope. Parameters ---------- intercept : float The intercept of the line. slope : float The slope of the line. horiz : float or array_like Data for horizontal lines on the y-axis. vert : array_like Data for verterical lines on the x-axis. model_results : statsmodels results instance Any object that has a two-value `params` attribute. Assumed that it is (intercept, slope). ax : axes, optional Matplotlib axes instance. **kwargs Options passed to matplotlib.pyplot.plt. Returns ------- Figure The figure given by `ax.figure` or a new instance. Examples -------- >>> import numpy as np >>> import statsmodels.api as sm >>> np.random.seed(12345) >>> X = sm.add_constant(np.random.normal(0, 20, size=30)) >>> y = np.dot(X, [25, 3.5]) + np.random.normal(0, 30, size=30) >>> mod = sm.OLS(y,X).fit() >>> fig = sm.graphics.abline_plot(model_results=mod) >>> ax = fig.axes[0] >>> ax.scatter(X[:,1], y) >>> ax.margins(.1) >>> import matplotlib.pyplot as plt >>> plt.show() .. plot:: plots/graphics_regression_abline.py """ if ax is not None: # get axis limits first thing, do not change these x = ax.get_xlim() else: x = None fig, ax = utils.create_mpl_ax(ax) if model_results: intercept, slope = model_results.params if x is None: x = [model_results.model.exog[:, 1].min(), model_results.model.exog[:, 1].max()] else: if not (intercept is not None and slope is not None): raise ValueError("specify slope and intercepty or model_results") if x is None: x = ax.get_xlim() data_y = [x[0]*slope+intercept, x[1]*slope+intercept] ax.set_xlim(x) #ax.set_ylim(y) from matplotlib.lines import Line2D class ABLine2D(Line2D): def __init__(self, *args, **kwargs): super(ABLine2D, self).__init__(*args, **kwargs) self.id_xlim_callback = None self.id_ylim_callback = None def remove(self): ax = self.axes if self.id_xlim_callback: ax.callbacks.disconnect(self.id_xlim_callback) if self.id_ylim_callback: ax.callbacks.disconnect(self.id_ylim_callback) super(ABLine2D, self).remove() def update_datalim(self, ax): ax.set_autoscale_on(False) children = ax.get_children() ablines = [child for child in children if child is self] abline = ablines[0] x = ax.get_xlim() y = [x[0] * slope + intercept, x[1] * slope + intercept] abline.set_data(x, y) ax.figure.canvas.draw() # TODO: how to intercept something like a margins call and adjust? line = ABLine2D(x, data_y, **kwargs) ax.add_line(line) line.id_xlim_callback = ax.callbacks.connect('xlim_changed', line.update_datalim) line.id_ylim_callback = ax.callbacks.connect('ylim_changed', line.update_datalim) if horiz: ax.hline(horiz) if vert: ax.vline(vert) return fig
def plot_fit_obs(self, col_name, lowess_args=None, lowess_min_n=40, jitter=None, plot_points=True, ax=None): """ Plot fitted versus imputed or observed values as a scatterplot. Parameters ---------- col_name : string The variable to be plotted on the horizontal axis. lowess_args : dict-like Keyword arguments passed to lowess fit. A dictionary of dictionaries, keys are 'o' and 'i' denoting 'observed' and 'imputed', respectively. lowess_min_n : integer Minimum sample size to plot a lowess fit jitter : float or tuple Standard deviation for jittering points in the plot. Either a single scalar applied to both axes, or a tuple containing x-axis jitter and y-axis jitter, respectively. plot_points : bool If True, the data points are plotted. ax : matplotlib axes object Axes on which to plot, created if not provided. Returns ------- The matplotlib figure on which the plot is drawn. """ from statsmodels.graphics import utils as gutils from statsmodels.nonparametric.smoothers_lowess import lowess import pandas as pd if lowess_args is None: lowess_args = {} if ax is None: fig, ax = gutils.create_mpl_ax(ax) else: fig = ax.get_figure() ax.set_position([0.1, 0.1, 0.7, 0.8]) ixi = self.ix_miss[col_name] ixo = self.ix_obs[col_name] vec1 = np.asarray(self.data[col_name]) # Fitted values formula = self.conditional_formula[col_name] endog, exog = patsy.dmatrices(formula, self.data, return_type="dataframe") results = self.results[col_name] vec2 = results.predict(exog=exog) vec2 = self._get_predicted(vec2) if jitter is not None: if np.isscalar(jitter): jitter = (jitter, jitter) vec1 += jitter[0] * np.random.normal(size=len(vec1)) vec2 += jitter[1] * np.random.normal(size=len(vec2)) # Plot the points keys = ['o', 'i'] ixs = {'o': ixo, 'i': ixi} lak = {'o': 'obs', 'i': 'imp'} color = {'o': 'orange', 'i': 'lime'} if plot_points: for ky in keys: ix = ixs[ky] ax.plot(vec1[ix], vec2[ix], 'o', color=color[ky], label=lak[ky], alpha=0.6) # Plot the lowess fits for ky in keys: ix = ixs[ky] if len(ix) < lowess_min_n: continue if ky in lowess_args: la = lowess_args[ky] else: la = {} ix = ixs[ky] lfit = lowess(vec2[ix], vec1[ix], **la) ax.plot(lfit[:, 0], lfit[:, 1], '-', color=color[ky], alpha=0.6, lw=4, label=lak[ky]) ha, la = ax.get_legend_handles_labels() leg = fig.legend(ha, la, 'center right', numpoints=1) leg.draw_frame(False) ax.set_xlabel(col_name + " observed or imputed") ax.set_ylabel(col_name + " fitted") return fig
def split_plot3(grouped_x, keyOrder, ydim, labdim, xticklabels=None, ylabel=None, ax=None, label=False, keyHighlight=None, ylim=None): fig, ax = utils.create_mpl_ax(ax) start = 0 ticks = [] tmpxlabels = [] for key in keyOrder: df = grouped_x.get_group(key) nobs = len(df) x_plot = arange(start, start + nobs) ticks.append(x_plot.mean()) #Third parameter is color; 'k' is black ax.plot(x_plot, df[ydim], 'g', linestyle='--') #named colors: http://matplotlib.org/examples/color/named_colors.html highlightColors = [ 'mistyrose', 'lightsalmon', 'salmon', 'tomato', 'orangered' ] baseColors = [ 'silver', 'lightblue', 'paleturquoise', 'lightcyan', 'lightgreen' ] if key == keyHighlight: colors = highlightColors else: colors = baseColors for i in range(0, nobs): if ylim is not None and (df[ydim].iloc[i] <= ylim[0] or df[ydim].iloc[i] >= ylim[1]): continue if label: if nobs <= len(colors): if keyHighlight == 'leader': if int(df[labdim].iloc[i]) == 1: color = highlightColors[i - nobs] else: color = baseColors[i - nobs] else: color = colors[i - nobs] else: color = 'pink' ax.text(x_plot[i], df[ydim].iloc[i], int(df[labdim].iloc[i]), bbox=dict(boxstyle='round,pad=0.3', color=color)) #elif df[ydim].iloc[i]>self.RC1SIZE: # ax.text(x_plot[i]-1, df[ydim].iloc[i], int(df[labdim].iloc[i]), # bbox=dict( boxstyle='round,pad=0.3',color='pink')) #facecolor='none',edgecolor='black', else: ax.plot(x_plot[i], df[labdim].iloc[i], 'or') #ax.hlines(df.values.mean(), x_plot[0], x_plot[-1], colors='k') start += nobs tmpxlabels.append(key) if xticklabels is None: xticklabels = tmpxlabels elif isinstance(xticklabels, dict): xticklabels = [ xticklabels[x] if x in xticklabels else x for x in tmpxlabels ] ax.set_xticks(ticks) ax.set_xticklabels(xticklabels) if ylabel is not None: ax.set_ylabel(ylabel) ax.margins(.1, .05) return fig
def plot_survfunc(survfuncs, ax=None): """ Plot one or more survivor functions. Parameters ---------- survfuncs : object or array-like A single SurvfuncRight object, or a list or SurvfuncRight objects that are plotted together. Returns ------- A figure instance on which the plot was drawn. Examples -------- Add a legend: >>> import statsmodels.api as sm >>> from statsmodels.duration.survfunc import plot_survfunc >>> data = sm.datasets.get_rdataset("flchain", "survival").data >>> df = data.loc[data.sex == "F", :] >>> sf0 = sm.SurvfuncRight(df["futime"], df["death"]) >>> sf1 = sm.SurvfuncRight(3.0 * df["futime"], df["death"]) >>> fig = plot_survfunc([sf0, sf1]) >>> ax = fig.get_axes()[0] >>> ax.set_position([0.1, 0.1, 0.64, 0.8]) >>> ha, lb = ax.get_legend_handles_labels() >>> leg = fig.legend((ha[0], ha[1]), (lb[0], lb[1]), 'center right') Change the line colors: >>> fig = plot_survfunc([sf0, sf1]) >>> ax = fig.get_axes()[0] >>> ax.set_position([0.1, 0.1, 0.64, 0.8]) >>> ha, lb = ax.get_legend_handles_labels() >>> ha[0].set_color('purple') >>> ha[1].set_color('orange') """ fig, ax = utils.create_mpl_ax(ax) # If we have only a single survival function to plot, put it into # a list. try: assert(type(survfuncs[0]) is SurvfuncRight) except: survfuncs = [survfuncs] for gx, sf in enumerate(survfuncs): # The estimated survival function does not include a point at # time 0, include it here for plotting. surv_times = np.concatenate(([0], sf.surv_times)) surv_prob = np.concatenate(([1], sf.surv_prob)) # If the final times are censoring times they are not included # in the survival function so we add them here mxt = max(sf.time) if mxt > surv_times[-1]: surv_times = np.concatenate((surv_times, [mxt])) surv_prob = np.concatenate((surv_prob, [surv_prob[-1]])) label = getattr(sf, "title", "Group %d" % (gx + 1)) li, = ax.step(surv_times, surv_prob, '-', label=label, lw=2, where='post') # Plot the censored points. ii = np.flatnonzero(np.logical_not(sf.status)) ti = sf.time[ii] jj = np.searchsorted(surv_times, ti) - 1 sp = surv_prob[jj] ax.plot(ti, sp, '+', ms=12, color=li.get_color(), label=label + " points") ax.set_ylim(0, 1.01) return fig
def plot_acf( x, ax=None, lags=None, *, alpha=0.05, use_vlines=True, adjusted=False, fft=False, missing="none", title="Autocorrelation", zero=True, auto_ylims=False, bartlett_confint=True, vlines_kwargs=None, **kwargs, ): """ Plot the autocorrelation function Plots lags on the horizontal and the correlations on vertical axis. Parameters ---------- x : array_like Array of time-series values ax : AxesSubplot, optional If given, this subplot is used to plot in instead of a new figure being created. lags : {int, array_like}, optional An int or array of lag values, used on horizontal axis. Uses np.arange(lags) when lags is an int. If not provided, ``lags=np.arange(len(corr))`` is used. alpha : scalar, optional If a number is given, the confidence intervals for the given level are returned. For instance if alpha=.05, 95 % confidence intervals are returned where the standard deviation is computed according to Bartlett's formula. If None, no confidence intervals are plotted. use_vlines : bool, optional If True, vertical lines and markers are plotted. If False, only markers are plotted. The default marker is 'o'; it can be overridden with a ``marker`` kwarg. adjusted : bool If True, then denominators for autocovariance are n-k, otherwise n fft : bool, optional If True, computes the ACF via FFT. missing : str, optional A string in ['none', 'raise', 'conservative', 'drop'] specifying how the NaNs are to be treated. title : str, optional Title to place on plot. Default is 'Autocorrelation' zero : bool, optional Flag indicating whether to include the 0-lag autocorrelation. Default is True. auto_ylims : bool, optional If True, adjusts automatically the y-axis limits to ACF values. bartlett_confint : bool, default True Confidence intervals for ACF values are generally placed at 2 standard errors around r_k. The formula used for standard error depends upon the situation. If the autocorrelations are being used to test for randomness of residuals as part of the ARIMA routine, the standard errors are determined assuming the residuals are white noise. The approximate formula for any lag is that standard error of each r_k = 1/sqrt(N). See section 9.4 of [1] for more details on the 1/sqrt(N) result. For more elementary discussion, see section 5.3.2 in [2]. For the ACF of raw data, the standard error at a lag k is found as if the right model was an MA(k-1). This allows the possible interpretation that if all autocorrelations past a certain lag are within the limits, the model might be an MA of order defined by the last significant autocorrelation. In this case, a moving average model is assumed for the data and the standard errors for the confidence intervals should be generated using Bartlett's formula. For more details on Bartlett formula result, see section 7.2 in [1]. vlines_kwargs : dict, optional Optional dictionary of keyword arguments that are passed to vlines. **kwargs : kwargs, optional Optional keyword arguments that are directly passed on to the Matplotlib ``plot`` and ``axhline`` functions. Returns ------- Figure If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- matplotlib.pyplot.xcorr matplotlib.pyplot.acorr Notes ----- Adapted from matplotlib's `xcorr`. Data are plotted as ``plot(lags, corr, **kwargs)`` kwargs is used to pass matplotlib optional arguments to both the line tracing the autocorrelations and for the horizontal line at 0. These options must be valid for a Line2D object. vlines_kwargs is used to pass additional optional arguments to the vertical lines connecting each autocorrelation to the axis. These options must be valid for a LineCollection object. References ---------- [1] Brockwell and Davis, 1987. Time Series Theory and Methods [2] Brockwell and Davis, 2010. Introduction to Time Series and Forecasting, 2nd edition. Examples -------- >>> import pandas as pd >>> import matplotlib.pyplot as plt >>> import statsmodels.api as sm >>> dta = sm.datasets.sunspots.load_pandas().data >>> dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008')) >>> del dta["YEAR"] >>> sm.graphics.tsa.plot_acf(dta.values.squeeze(), lags=40) >>> plt.show() .. plot:: plots/graphics_tsa_plot_acf.py """ fig, ax = utils.create_mpl_ax(ax) lags, nlags, irregular = _prepare_data_corr_plot(x, lags, zero) vlines_kwargs = {} if vlines_kwargs is None else vlines_kwargs confint = None # acf has different return type based on alpha acf_x = acf( x, nlags=nlags, alpha=alpha, fft=fft, bartlett_confint=bartlett_confint, adjusted=adjusted, missing=missing, ) if alpha is not None: acf_x, confint = acf_x[:2] _plot_corr( ax, title, acf_x, confint, lags, irregular, use_vlines, vlines_kwargs, auto_ylims=auto_ylims, **kwargs, ) return fig
def plot_acf(x, ax=None, lags=None, alpha=.05, use_vlines=True, unbiased=False, fft=False, **kwargs): """Plot the autocorrelation function Plots lags on the horizontal and the correlations on vertical axis. Parameters ---------- x : array_like Array of time-series values ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. lags : array_like, optional Array of lag values, used on horizontal axis. If not given, ``lags=np.arange(len(corr))`` is used. alpha : scalar, optional If a number is given, the confidence intervals for the given level are returned. For instance if alpha=.05, 95 % confidence intervals are returned where the standard deviation is computed according to Bartlett's formula. If None, no confidence intervals are plotted. use_vlines : bool, optional If True, vertical lines and markers are plotted. If False, only markers are plotted. The default marker is 'o'; it can be overridden with a ``marker`` kwarg. unbiased : bool If True, then denominators for autocovariance are n-k, otherwise n fft : bool, optional If True, computes the ACF via FFT. **kwargs : kwargs, optional Optional keyword arguments that are directly passed on to the Matplotlib ``plot`` and ``axhline`` functions. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- matplotlib.pyplot.xcorr matplotlib.pyplot.acorr mpl_examples/pylab_examples/xcorr_demo.py Notes ----- Adapted from matplotlib's `xcorr`. Data are plotted as ``plot(lags, corr, **kwargs)`` """ fig, ax = utils.create_mpl_ax(ax) if lags is None: lags = np.arange(len(x)) nlags = len(lags) - 1 else: nlags = lags lags = np.arange(lags + 1) # +1 for zero lag acf_x, confint = acf(x, nlags=nlags, alpha=alpha, fft=fft, unbiased=unbiased) if use_vlines: ax.vlines(lags, [0], acf_x, **kwargs) ax.axhline(**kwargs) # center the confidence interval TODO: do in acf? confint = confint - confint.mean(1)[:,None] kwargs.setdefault('marker', 'o') kwargs.setdefault('markersize', 5) kwargs.setdefault('linestyle', 'None') ax.margins(.05) ax.plot(lags, acf_x, **kwargs) ax.fill_between(lags, confint[:,0], confint[:,1], alpha=.25) ax.set_title("Autocorrelation") return fig
def plot_bivariate(self, col1_name, col2_name, lowess_args=None, lowess_min_n=40, jitter=None, plot_points=True, ax=None): """ Plot observed and imputed values for two variables. Displays a scatterplot of one variable against another. The points are colored according to whether the values are observed or imputed. Parameters ---------- col1_name : string The variable to be plotted on the horizontal axis. col2_name : string The variable to be plotted on the vertical axis. lowess_args : dictionary A dictionary of dictionaries, keys are 'ii', 'io', 'oi' and 'oo', where 'o' denotes 'observed' and 'i' denotes imputed. See Notes for details. lowess_min_n : integer Minimum sample size to plot a lowess fit jitter : float or tuple Standard deviation for jittering points in the plot. Either a single scalar applied to both axes, or a tuple containing x-axis jitter and y-axis jitter, respectively. plot_points : bool If True, the data points are plotted. ax : matplotlib axes object Axes on which to plot, created if not provided. Returns ------- The matplotlib figure on which the plot id drawn. """ from statsmodels.graphics import utils as gutils from statsmodels.nonparametric.smoothers_lowess import lowess if lowess_args is None: lowess_args = {} if ax is None: fig, ax = gutils.create_mpl_ax(ax) else: fig = ax.get_figure() ax.set_position([0.1, 0.1, 0.7, 0.8]) ix1i = self.ix_miss[col1_name] ix1o = self.ix_obs[col1_name] ix2i = self.ix_miss[col2_name] ix2o = self.ix_obs[col2_name] ix_ii = np.intersect1d(ix1i, ix2i) ix_io = np.intersect1d(ix1i, ix2o) ix_oi = np.intersect1d(ix1o, ix2i) ix_oo = np.intersect1d(ix1o, ix2o) vec1 = np.asarray(self.data[col1_name]) vec2 = np.asarray(self.data[col2_name]) if jitter is not None: if np.isscalar(jitter): jitter = (jitter, jitter) vec1 += jitter[0] * np.random.normal(size=len(vec1)) vec2 += jitter[1] * np.random.normal(size=len(vec2)) # Plot the points keys = ['oo', 'io', 'oi', 'ii'] lak = {'i': 'imp', 'o': 'obs'} ixs = {'ii': ix_ii, 'io': ix_io, 'oi': ix_oi, 'oo': ix_oo} color = {'oo': 'grey', 'ii': 'red', 'io': 'orange', 'oi': 'lime'} if plot_points: for ky in keys: ix = ixs[ky] lab = lak[ky[0]] + "/" + lak[ky[1]] ax.plot(vec1[ix], vec2[ix], 'o', color=color[ky], label=lab, alpha=0.6) # Plot the lowess fits for ky in keys: ix = ixs[ky] if len(ix) < lowess_min_n: continue if ky in lowess_args: la = lowess_args[ky] else: la = {} ix = ixs[ky] lfit = lowess(vec2[ix], vec1[ix], **la) if plot_points: ax.plot(lfit[:, 0], lfit[:, 1], '-', color=color[ky], alpha=0.6, lw=4) else: lab = lak[ky[0]] + "/" + lak[ky[1]] ax.plot(lfit[:, 0], lfit[:, 1], '-', color=color[ky], alpha=0.6, lw=4, label=lab) ha, la = ax.get_legend_handles_labels() pad = 0.0001 if plot_points else 0.5 leg = fig.legend(ha, la, 'center right', numpoints=1, handletextpad=pad) leg.draw_frame(False) ax.set_xlabel(col1_name) ax.set_ylabel(col2_name) return fig
def mosaic(data, index=None, ax=None, horizontal=True, gap=0.005, properties=lambda key: None, labelizer=None, title='', statistic=False, axes_label=True, label_rotation=0.0): """Create a mosaic plot from a contingency table. It allows to visualize multivariate categorical data in a rigorous and informative way. Parameters ---------- data : dict, pandas.Series, np.ndarray, pandas.DataFrame The contingency table that contains the data. Each category should contain a non-negative number with a tuple as index. It expects that all the combination of keys to be representes; if that is not true, will automatically consider the missing values as 0. The order of the keys will be the same as the one of insertion. If a dict of a Series (or any other dict like object) is used, it will take the keys as labels. If a np.ndarray is provided, it will generate a simple numerical labels. index: list, optional Gives the preferred order for the category ordering. If not specified will default to the given order. It doesn't support named indexes for hierarchical Series. If a DataFrame is provided, it expects a list with the name of the columns. ax : matplotlib.Axes, optional The graph where display the mosaic. If not given, will create a new figure horizontal : bool, optional (default True) The starting direction of the split (by default along the horizontal axis) gap : float or array of floats The list of gaps to be applied on each subdivision. If the lenght of the given array is less of the number of subcategories (or if it's a single number) it will extend it with exponentially decreasing gaps labelizer : function (key) -> string, optional A function that generate the text to display at the center of each tile base on the key of that tile properties : function (key) -> dict, optional A function that for each tile in the mosaic take the key of the tile and returns the dictionary of properties of the generated Rectangle, like color, hatch or similar. A default properties set will be provided fot the keys whose color has not been defined, and will use color variation to help visually separates the various categories. It should return None to indicate that it should use the default property for the tile. A dictionary of the properties for each key can be passed, and it will be internally converted to the correct function statistic: bool, optional (default False) if true will use a crude statistical model to give colors to the plot. If the tile has a containt that is more than 2 standard deviation from the expected value under independence hipotesys, it will go from green to red (for positive deviations, blue otherwise) and will acquire an hatching when crosses the 3 sigma. title: string, optional The title of the axis axes_label: boolean, optional Show the name of each value of each category on the axis (default) or hide them. label_rotation: float or list of float the rotation of the axis label (if present). If a list is given each axis can have a different rotation Returns ---------- fig : matplotlib.Figure The generate figure rects : dict A dictionary that has the same keys of the original dataset, that holds a reference to the coordinates of the tile and the Rectangle that represent it See Also ---------- A Brief History of the Mosaic Display Michael Friendly, York University, Psychology Department Journal of Computational and Graphical Statistics, 2001 Mosaic Displays for Loglinear Models. Michael Friendly, York University, Psychology Department Proceedings of the Statistical Graphics Section, 1992, 61-68. Mosaic displays for multi-way contingecy tables. Michael Friendly, York University, Psychology Department Journal of the american statistical association March 1994, Vol. 89, No. 425, Theory and Methods Examples ---------- The most simple use case is to take a dictionary and plot the result >>> data = {'a': 10, 'b': 15, 'c': 16} >>> mosaic(data, title='basic dictionary') >>> pylab.show() A more useful example is given by a dictionary with multiple indices. In this case we use a wider gap to a better visual separation of the resulting plot >>> data = {('a', 'b'): 1, ('a', 'c'): 2, ('d', 'b'): 3, ('d', 'c'): 4} >>> mosaic(data, gap=0.05, title='complete dictionary') >>> pylab.show() The same data can be given as a simple or hierarchical indexed Series >>> rand = np.random.random >>> from itertools import product >>> >>> tuples = list(product(['bar', 'baz', 'foo', 'qux'], ['one', 'two'])) >>> index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) >>> data = pd.Series(rand(8), index=index) >>> mosaic(data, title='hierarchical index series') >>> pylab.show() The third accepted data structureis the np array, for which a very simple index will be created. >>> rand = np.random.random >>> data = 1+rand((2,2)) >>> mosaic(data, title='random non-labeled array') >>> pylab.show() If you need to modify the labeling and the coloring you can give a function tocreate the labels and one with the graphical properties starting from the key tuple >>> data = {'a': 10, 'b': 15, 'c': 16} >>> props = lambda key: {'color': 'r' if 'a' in key else 'gray'} >>> labelizer = lambda k: {('a',): 'first', ('b',): 'second', ('c',): 'third'}[k] >>> mosaic(data, title='colored dictionary', properties=props, labelizer=labelizer) >>> pylab.show() Using a DataFrame as source, specifying the name of the columns of interest >>> gender = ['male', 'male', 'male', 'female', 'female', 'female'] >>> pet = ['cat', 'dog', 'dog', 'cat', 'dog', 'cat'] >>> data = pandas.DataFrame({'gender': gender, 'pet': pet}) >>> mosaic(data, ['pet', 'gender']) >>> pylab.show() """ from pylab import Rectangle fig, ax = utils.create_mpl_ax(ax) # normalize the data to a dict with tuple of strings as keys data = _normalize_data(data, index) # split the graph into different areas rects = _hierarchical_split(data, horizontal=horizontal, gap=gap) # if there is no specified way to create the labels # create a default one if labelizer is None: labelizer = lambda k: "\n".join(k) if statistic: default_props = _statistical_coloring(data) else: default_props = _create_default_properties(data) if isinstance(properties, dict): color_dict = properties properties = lambda key: color_dict.get(key, None) for k, v in list(rects.items()): # create each rectangle and put a label on it x, y, w, h = v conf = properties(k) props = conf if conf else default_props[k] text = labelizer(k) Rect = Rectangle((x, y), w, h, label=text, **props) ax.add_patch(Rect) ax.text(x + w / 2, y + h / 2, text, ha='center', va='center', size='smaller') #creating the labels on the axis #o clearing it if axes_label: if np.iterable(label_rotation): rotation = label_rotation else: rotation = [label_rotation] * 4 labels = _create_labels(rects, horizontal, ax, rotation) else: ax.set_xticks([]) ax.set_xticklabels([]) ax.set_yticks([]) ax.set_yticklabels([]) ax.set_title(title) return fig, rects
def plot_imputed_hist(self, col_name, ax=None, imp_hist_args=None, obs_hist_args=None, all_hist_args=None): """ Display imputed values for one variable as a histogram. Parameters ---------- col_name : string The name of the variable to be plotted. ax : matplotlib axes An axes on which to draw the histograms. If not provided, one is created. imp_hist_args : dict Keyword arguments to be passed to pyplot.hist when creating the histogram for imputed values. obs_hist_args : dict Keyword arguments to be passed to pyplot.hist when creating the histogram for observed values. all_hist_args : dict Keyword arguments to be passed to pyplot.hist when creating the histogram for all values. Returns ------- The matplotlib figure on which the histograms were drawn """ from statsmodels.graphics import utils as gutils if imp_hist_args is None: imp_hist_args = {} if obs_hist_args is None: obs_hist_args = {} if all_hist_args is None: all_hist_args = {} if ax is None: fig, ax = gutils.create_mpl_ax(ax) else: fig = ax.get_figure() ax.set_position([0.1, 0.1, 0.7, 0.8]) ixm = self.ix_miss[col_name] ixo = self.ix_obs[col_name] imp = self.data[col_name].iloc[ixm] obs = self.data[col_name].iloc[ixo] for di in imp_hist_args, obs_hist_args, all_hist_args: if 'histtype' not in di: di['histtype'] = 'step' ha, la = [], [] if len(imp) > 0: h = ax.hist(np.asarray(imp), **imp_hist_args) ha.append(h[-1][0]) la.append("Imp") h1 = ax.hist(np.asarray(obs), **obs_hist_args) h2 = ax.hist(np.asarray(self.data[col_name]), **all_hist_args) ha.extend([h1[-1][0], h2[-1][0]]) la.extend(["Obs", "All"]) leg = fig.legend(ha, la, 'center right', numpoints=1) leg.draw_frame(False) ax.set_xlabel(col_name) ax.set_ylabel("Frequency") return fig
def abline_plot(intercept=None, slope=None, horiz=None, vert=None, model_results=None, ax=None, **kwargs): """ Plots a line given an intercept and slope. intercept : float The intercept of the line slope : float The slope of the line horiz : float or array-like Data for horizontal lines on the y-axis vert : array-like Data for verterical lines on the x-axis model_results : statsmodels results instance Any object that has a two-value `params` attribute. Assumed that it is (intercept, slope) ax : axes, optional Matplotlib axes instance kwargs Options passed to matplotlib.pyplot.plt Returns ------- fig : Figure The figure given by `ax.figure` or a new instance. Examples -------- >>> import numpy as np >>> import statsmodels.api as sm >>> np.random.seed(12345) >>> X = sm.add_constant(np.random.normal(0, 20, size=30), prepend=True) >>> y = np.dot(X, [25, 3.5]) + np.random.normal(0, 30, size=30) >>> mod = sm.OLS(y,X).fit() >>> fig = abline_plot(model_results=mod) >>> ax = fig.axes >>> ax.scatter(X[:,1], y) >>> ax.margins(.1) >>> import matplotlib.pyplot as plt >>> plt.show() """ if ax is not None: # get axis limits first thing, don't change these x = ax.get_xlim() y = ax.get_ylim() else: x = None fig,ax = utils.create_mpl_ax(ax) if model_results: intercept, slope = model_results.params if x is None: x = [model_results.model.exog[:,1].min(), model_results.model.exog[:,1].max()] else: if not (intercept is not None and slope is not None): raise ValueError("specify slope and intercepty or model_results") if x is None: x = ax.get_xlim() data_y = [x[0]*slope+intercept, x[1]*slope+intercept] ax.set_xlim(x) #ax.set_ylim(y) from matplotlib.lines import Line2D class ABLine2D(Line2D): def update_datalim(self, ax): ax.set_autoscale_on(False) children = ax.get_children() abline = [children[i] for i in range(len(children)) if isinstance(children[i], ABLine2D)][0] x = ax.get_xlim() y = [x[0]*slope+intercept, x[1]*slope+intercept] abline.set_data(x,y) ax.figure.canvas.draw() line = ABLine2D(x, data_y, **kwargs) ax.add_line(line) ax.callbacks.connect('xlim_changed', line.update_datalim) ax.callbacks.connect('ylim_changed', line.update_datalim) if horiz: ax.hline(horiz) if vert: ax.vline(vert) return fig
def plot_power(self, dep_var='nobs', nobs=None, effect_size=None, alpha=0.05, ax=None, title=None, plt_kwds=None, **kwds): '''plot power with number of observations or effect size on x-axis Parameters ---------- dep_var : string in ['nobs', 'effect_size', 'alpha'] This specifies which variable is used for the horizontal axis. If dep_var='nobs' (default), then one curve is created for each value of ``effect_size``. If dep_var='effect_size' or alpha, then one curve is created for each value of ``nobs``. nobs : scalar or array_like specifies the values of the number of observations in the plot effect_size : scalar or array_like specifies the values of the effect_size in the plot alpha : float or array_like The significance level (type I error) used in the power calculation. Can only be more than a scalar, if ``dep_var='alpha'`` ax : None or axis instance If ax is None, than a matplotlib figure is created. If ax is a matplotlib axis instance, then it is reused, and the plot elements are created with it. title : string title for the axis. Use an empty string, ``''``, to avoid a title. plt_kwds : None or dict not used yet kwds : optional keywords for power function These remaining keyword arguments are used as arguments to the power function. Many power function support ``alternative`` as a keyword argument, two-sample test support ``ratio``. Returns ------- fig : matplotlib figure instance Notes ----- This works only for classes where the ``power`` method has ``effect_size``, ``nobs`` and ``alpha`` as the first three arguments. If the second argument is ``nobs1``, then the number of observations in the plot are those for the first sample. TODO: fix this for FTestPower and GofChisquarePower TODO: maybe add line variable, if we want more than nobs and effectsize ''' #if pwr_kwds is None: # pwr_kwds = {} from statsmodels.graphics import utils from statsmodels.graphics.plottools import rainbow fig, ax = utils.create_mpl_ax(ax) import matplotlib.pyplot as plt colormap = plt.cm.Dark2 #pylint: disable-msg=E1101 plt_alpha = 1 #0.75 lw = 2 if dep_var == 'nobs': colors = rainbow(len(effect_size)) colors = [colormap(i) for i in np.linspace(0, 0.9, len(effect_size))] for ii, es in enumerate(effect_size): power = self.power(es, nobs, alpha, **kwds) ax.plot(nobs, power, lw=lw, alpha=plt_alpha, color=colors[ii], label='es=%4.2F' % es) xlabel = 'Number of Observations' elif dep_var in ['effect size', 'effect_size', 'es']: colors = rainbow(len(nobs)) colors = [colormap(i) for i in np.linspace(0, 0.9, len(nobs))] for ii, n in enumerate(nobs): power = self.power(effect_size, n, alpha, **kwds) ax.plot(effect_size, power, lw=lw, alpha=plt_alpha, color=colors[ii], label='N=%4.2F' % n) xlabel = 'Effect Size' elif dep_var in ['alpha']: # experimental nobs as defining separate lines colors = rainbow(len(nobs)) for ii, n in enumerate(nobs): power = self.power(effect_size, n, alpha, **kwds) ax.plot(alpha, power, lw=lw, alpha=plt_alpha, color=colors[ii], label='N=%4.2F' % n) xlabel = 'alpha' else: raise ValueError('depvar not implemented') if title is None: title = 'Power of Test' ax.set_xlabel(xlabel) ax.set_title(title) ax.legend(loc='lower right') return fig
yhat=phi1*np.hstack((0,y[0:(n-1)])) sigma=np.var(y,ddof=1) nu=np.hstack((sigma,sigma*(1-phi1**(2))[1:n])) tau=nu/sigma sigmahat=np.mean((y-yhat)**2/tau) nuhat=sigmahat*(1-phi1**(2)) nuhat2=np.sqrt(nuhat) ciar=(1-phi1**(2)) nuhat3=np.sqrt(sigma*ciar) ####Fitting Regular Models arma_mod10 = sm.tsa.ARMA(y, (1,0)).fit(trend='nc') #arma_mod10.summary() syar=arma_mod10.sigma2/(1-arma_mod10.params[0]**2) car=(1-arma_mod10.params[0]**2) sear=np.sqrt(sigma*car) pdf = matplotlib.backends.backend_pdf.PdfPages("outputAR.pdf") ax=None fig, ax = utils.create_mpl_ax(ax) ax.vlines(sT[1:100], ymin=np.zeros(100), ymax=nuhat3[1:100]) ax.scatter(sT[1:100],nuhat3[1:100],c='green') ax.axhline(np.sqrt(sigma),color='red') ax.axhline(sear,color='blue') ax.axhline(np.mean(nuhat3[1:100]),color='black') ax.set_ylim([0, 1]) pdf.savefig(1) pdf.close()
def plot_pacf(x, ax=None, lags=None, alpha=.05, method='ywadjusted', use_vlines=True, title='Partial Autocorrelation', zero=True, vlines_kwargs=None, **kwargs): """ Plot the partial autocorrelation function Parameters ---------- x : array_like Array of time-series values ax : AxesSubplot, optional If given, this subplot is used to plot in instead of a new figure being created. lags : {int, array_like}, optional An int or array of lag values, used on horizontal axis. Uses np.arange(lags) when lags is an int. If not provided, ``lags=np.arange(len(corr))`` is used. alpha : float, optional If a number is given, the confidence intervals for the given level are returned. For instance if alpha=.05, 95 % confidence intervals are returned where the standard deviation is computed according to 1/sqrt(len(x)) method : {'ywunbiased', 'ywmle', 'ols'} Specifies which method for the calculations to use: - yw or ywunbiased : yule walker with bias correction in denominator for acovf. Default. - ywm or ywmle : yule walker without bias correction - ols - regression of time series on lags of it and on constant - ld or ldunbiased : Levinson-Durbin recursion with bias correction - ldb or ldbiased : Levinson-Durbin recursion without bias correction use_vlines : bool, optional If True, vertical lines and markers are plotted. If False, only markers are plotted. The default marker is 'o'; it can be overridden with a ``marker`` kwarg. title : str, optional Title to place on plot. Default is 'Partial Autocorrelation' zero : bool, optional Flag indicating whether to include the 0-lag autocorrelation. Default is True. vlines_kwargs : dict, optional Optional dictionary of keyword arguments that are passed to vlines. **kwargs : kwargs, optional Optional keyword arguments that are directly passed on to the Matplotlib ``plot`` and ``axhline`` functions. Returns ------- Figure If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- matplotlib.pyplot.xcorr matplotlib.pyplot.acorr Notes ----- Plots lags on the horizontal and the correlations on vertical axis. Adapted from matplotlib's `xcorr`. Data are plotted as ``plot(lags, corr, **kwargs)`` kwargs is used to pass matplotlib optional arguments to both the line tracing the autocorrelations and for the horizontal line at 0. These options must be valid for a Line2D object. vlines_kwargs is used to pass additional optional arguments to the vertical lines connecting each autocorrelation to the axis. These options must be valid for a LineCollection object. Examples -------- >>> import pandas as pd >>> import matplotlib.pyplot as plt >>> import statsmodels.api as sm >>> dta = sm.datasets.sunspots.load_pandas().data >>> dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008')) >>> del dta["YEAR"] >>> sm.graphics.tsa.plot_acf(dta.values.squeeze(), lags=40) >>> plt.show() .. plot:: plots/graphics_tsa_plot_pacf.py """ fig, ax = utils.create_mpl_ax(ax) vlines_kwargs = {} if vlines_kwargs is None else vlines_kwargs lags, nlags, irregular = _prepare_data_corr_plot(x, lags, zero) confint = None if alpha is None: acf_x = pacf(x, nlags=nlags, alpha=alpha, method=method) else: acf_x, confint = pacf(x, nlags=nlags, alpha=alpha, method=method) _plot_corr(ax, title, acf_x, confint, lags, irregular, use_vlines, vlines_kwargs, **kwargs) return fig
def plot_partregress(endog, exog_i, exog_others, data=None, title_kwargs={}, obs_labels=True, label_kwargs={}, ax=None, ret_coords=False, eval_env=1, **kwargs): """Plot partial regression for a single regressor. Parameters ---------- endog : {ndarray, str} The endogenous or response variable. If string is given, you can use a arbitrary translations as with a formula. exog_i : {ndarray, str} The exogenous, explanatory variable. If string is given, you can use a arbitrary translations as with a formula. exog_others : {ndarray, list[str]} Any other exogenous, explanatory variables. If a list of strings is given, each item is a term in formula. You can use a arbitrary translations as with a formula. The effect of these variables will be removed by OLS regression. data : {DataFrame, dict} Some kind of data structure with names if the other variables are given as strings. title_kwargs : dict Keyword arguments to pass on for the title. The key to control the fonts is fontdict. obs_labels : {bool, array_like} Whether or not to annotate the plot points with their observation labels. If obs_labels is a boolean, the point labels will try to do the right thing. First it will try to use the index of data, then fall back to the index of exog_i. Alternatively, you may give an array-like object corresponding to the observation numbers. label_kwargs : dict Keyword arguments that control annotate for the observation labels. ax : AxesSubplot, optional If given, this subplot is used to plot in instead of a new figure being created. ret_coords : bool If True will return the coordinates of the points in the plot. You can use this to add your own annotations. eval_env : int Patsy eval environment if user functions and formulas are used in defining endog or exog. **kwargs The keyword arguments passed to plot for the points. Returns ------- fig : Figure If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. coords : list, optional If ret_coords is True, return a tuple of arrays (x_coords, y_coords). See Also -------- plot_partregress_grid : Plot partial regression for a set of regressors. Notes ----- The slope of the fitted line is the that of `exog_i` in the full multiple regression. The individual points can be used to assess the influence of points on the estimated coefficient. Examples -------- Load the Statewide Crime data set and plot partial regression of the rate of high school graduation (hs_grad) on the murder rate(murder). The effects of the percent of the population living in urban areas (urban), below the poverty line (poverty) , and in a single person household (single) are removed by OLS regression. >>> import statsmodels.api as sm >>> import matplotlib.pyplot as plt >>> crime_data = sm.datasets.statecrime.load_pandas() >>> sm.graphics.plot_partregress(endog='murder', exog_i='hs_grad', ... exog_others=['urban', 'poverty', 'single'], ... data=crime_data.data, obs_labels=False) >>> plt.show() .. plot:: plots/graphics_regression_partregress.py More detailed examples can be found in the Regression Plots notebook on the examples page. """ #NOTE: there is no interaction between possible missing data and #obs_labels yet, so this will need to be tweaked a bit for this case fig, ax = utils.create_mpl_ax(ax) print("eval_env:", eval_env) # strings, use patsy to transform to data if isinstance(endog, str): endog = dmatrix(endog + "-1", data, eval_env=eval_env) if isinstance(exog_others, str): RHS = dmatrix(exog_others, data, eval_env=eval_env) elif isinstance(exog_others, list): RHS = "+".join(exog_others) RHS = dmatrix(RHS, data, eval_env=eval_env) else: RHS = exog_others RHS_isemtpy = False if isinstance(RHS, np.ndarray) and RHS.size==0: RHS_isemtpy = True elif isinstance(RHS, pd.DataFrame) and RHS.empty: RHS_isemtpy = True if isinstance(exog_i, str): exog_i = dmatrix(exog_i + "-1", data, eval_env=eval_env) # all arrays or pandas-like if RHS_isemtpy: endog = np.asarray(endog) exog_i = np.asarray(exog_i) ax.plot(endog, exog_i, 'o', **kwargs) fitted_line = OLS(endog, exog_i).fit() x_axis_endog_name = 'x' if isinstance(exog_i, np.ndarray) else exog_i.name y_axis_endog_name = 'y' if isinstance(endog, np.ndarray) else endog.design_info.column_names[0] else: res_yaxis = OLS(endog, RHS).fit() res_xaxis = OLS(exog_i, RHS).fit() xaxis_resid = res_xaxis.resid yaxis_resid = res_yaxis.resid x_axis_endog_name = res_xaxis.model.endog_names y_axis_endog_name = res_yaxis.model.endog_names ax.plot(xaxis_resid, yaxis_resid, 'o', **kwargs) fitted_line = OLS(yaxis_resid, xaxis_resid).fit() fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax) if x_axis_endog_name == 'y': # for no names regression will just get a y x_axis_endog_name = 'x' # this is misleading, so use x ax.set_xlabel("e(%s | X)" % x_axis_endog_name) ax.set_ylabel("e(%s | X)" % y_axis_endog_name) ax.set_title('Partial Regression Plot', **title_kwargs) # NOTE: if we want to get super fancy, we could annotate if a point is # clicked using this widget # http://stackoverflow.com/questions/4652439/ # is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/ # 4674445#4674445 if obs_labels is True: if data is not None: obs_labels = data.index elif hasattr(exog_i, "index"): obs_labels = exog_i.index else: obs_labels = res_xaxis.model.data.row_labels #NOTE: row_labels can be None. #Maybe we should fix this to never be the case. if obs_labels is None: obs_labels = lrange(len(exog_i)) if obs_labels is not False: # could be array_like if len(obs_labels) != len(exog_i): raise ValueError("obs_labels does not match length of exog_i") label_kwargs.update(dict(ha="center", va="bottom")) ax = utils.annotate_axes(lrange(len(obs_labels)), obs_labels, lzip(res_xaxis.resid, res_yaxis.resid), [(0, 5)] * len(obs_labels), "x-large", ax=ax, **label_kwargs) if ret_coords: return fig, (res_xaxis.resid, res_yaxis.resid) else: return fig
def plot_power(self, dep_var='nobs', nobs=None, effect_size=None, alpha=0.05, ax=None, title=None, plt_kwds=None, **kwds): '''plot power with number of observations or effect size on x-axis Parameters ---------- dep_var : string in ['nobs', 'effect_size', 'alpha'] This specifies which variable is used for the horizontal axis. If dep_var='nobs' (default), then one curve is created for each value of ``effect_size``. If dep_var='effect_size' or alpha, then one curve is created for each value of ``nobs``. nobs : scalar or array_like specifies the values of the number of observations in the plot effect_size : scalar or array_like specifies the values of the effect_size in the plot alpha : float or array_like The significance level (type I error) used in the power calculation. Can only be more than a scalar, if ``dep_var='alpha'`` ax : None or axis instance If ax is None, than a matplotlib figure is created. If ax is a matplotlib axis instance, then it is reused, and the plot elements are created with it. title : string title for the axis. Use an empty string, ``''``, to avoid a title. plt_kwds : None or dict not used yet kwds : optional keywords for power function These remaining keyword arguments are used as arguments to the power function. Many power function support ``alternative`` as a keyword argument, two-sample test support ``ratio``. Returns ------- fig : matplotlib figure instance Notes ----- This works only for classes where the ``power`` method has ``effect_size``, ``nobs`` and ``alpha`` as the first three arguments. If the second argument is ``nobs1``, then the number of observations in the plot are those for the first sample. TODO: fix this for FTestPower and GofChisquarePower TODO: maybe add line variable, if we want more than nobs and effectsize ''' #if pwr_kwds is None: # pwr_kwds = {} from statsmodels.graphics import utils from statsmodels.graphics.plottools import rainbow fig, ax = utils.create_mpl_ax(ax) import matplotlib.pyplot as plt colormap = plt.cm.Dark2 #pylint: disable-msg=E1101 plt_alpha = 1 #0.75 lw = 2 if dep_var == 'nobs': colors = rainbow(len(effect_size)) colors = [ colormap(i) for i in np.linspace(0, 0.9, len(effect_size)) ] for ii, es in enumerate(effect_size): power = self.power(es, nobs, alpha, **kwds) ax.plot(nobs, power, lw=lw, alpha=plt_alpha, color=colors[ii], label='es=%4.2F' % es) xlabel = 'Number of Observations' elif dep_var in ['effect size', 'effect_size', 'es']: colors = rainbow(len(nobs)) colors = [colormap(i) for i in np.linspace(0, 0.9, len(nobs))] for ii, n in enumerate(nobs): power = self.power(effect_size, n, alpha, **kwds) ax.plot(effect_size, power, lw=lw, alpha=plt_alpha, color=colors[ii], label='N=%4.2F' % n) xlabel = 'Effect Size' elif dep_var in ['alpha']: # experimental nobs as defining separate lines colors = rainbow(len(nobs)) for ii, n in enumerate(nobs): power = self.power(effect_size, n, alpha, **kwds) ax.plot(alpha, power, lw=lw, alpha=plt_alpha, color=colors[ii], label='N=%4.2F' % n) xlabel = 'alpha' else: raise ValueError('depvar not implemented') if title is None: title = 'Power of Test' ax.set_xlabel(xlabel) ax.set_title(title) ax.legend(loc='lower right') return fig
def plot_ccpr(results, exog_idx, ax=None): """ Plot CCPR against one regressor. Generates a component and component-plus-residual (CCPR) plot. Parameters ---------- results : result instance A regression results instance. exog_idx : {int, str} Exogenous, explanatory variable. If string is given, it should be the variable name that you want to use, and you can use arbitrary translations as with a formula. ax : AxesSubplot, optional If given, it is used to plot in instead of a new figure being created. Returns ------- Figure If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid. Notes ----- The CCPR plot provides a way to judge the effect of one regressor on the response variable by taking into account the effects of the other independent variables. The partial residuals plot is defined as Residuals + B_i*X_i versus X_i. The component adds the B_i*X_i versus X_i to show where the fitted line would lie. Care should be taken if X_i is highly correlated with any of the other independent variables. If this is the case, the variance evident in the plot will be an underestimate of the true variance. References ---------- http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm Examples -------- Using the state crime dataset plot the effect of the rate of single households ('single') on the murder rate while accounting for high school graduation rate ('hs_grad'), percentage of people in an urban area, and rate of poverty ('poverty'). >>> import statsmodels.api as sm >>> import matplotlib.pyplot as plot >>> import statsmodels.formula.api as smf >>> crime_data = sm.datasets.statecrime.load_pandas() >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single', ... data=crime_data.data).fit() >>> sm.graphics.plot_ccpr(results, 'single') >>> plt.show() .. plot:: plots/graphics_regression_ccpr.py """ fig, ax = utils.create_mpl_ax(ax) exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) results = maybe_unwrap_results(results) x1 = results.model.exog[:, exog_idx] #namestr = ' for %s' % self.name if self.name else '' x1beta = x1*results.params[exog_idx] ax.plot(x1, x1beta + results.resid, 'o') from statsmodels.tools.tools import add_constant mod = OLS(x1beta, add_constant(x1)).fit() params = mod.params fig = abline_plot(*params, **dict(ax=ax)) #ax.plot(x1, x1beta, '-') ax.set_title('Component and component plus residual plot') ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx)) ax.set_xlabel("%s" % exog_name) return fig
def plot_partregress(endog, exog_i, exog_others, data=None, title_kwargs={}, obs_labels=True, label_kwargs={}, ax=None, ret_coords=False, **kwargs): """Plot partial regression for a single regressor. Parameters ---------- endog : ndarray or string endogenous or response variable. If string is given, you can use a arbitrary translations as with a formula. exog_i : ndarray or string exogenous, explanatory variable. If string is given, you can use a arbitrary translations as with a formula. exog_others : ndarray or list of strings other exogenous, explanatory variables. If a list of strings is given, each item is a term in formula. You can use a arbitrary translations as with a formula. The effect of these variables will be removed by OLS regression. data : DataFrame, dict, or recarray Some kind of data structure with names if the other variables are given as strings. title_kwargs : dict Keyword arguments to pass on for the title. The key to control the fonts is fontdict. obs_labels : bool or array-like Whether or not to annotate the plot points with their observation labels. If obs_labels is a boolean, the point labels will try to do the right thing. First it will try to use the index of data, then fall back to the index of exog_i. Alternatively, you may give an array-like object corresponding to the obseveration numbers. labels_kwargs : dict Keyword arguments that control annotate for the observation labels. ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. ret_coords : bool If True will return the coordinates of the points in the plot. You can use this to add your own annotations. kwargs The keyword arguments passed to plot for the points. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. coords : list, optional If ret_coords is True, return a tuple of arrays (x_coords, y_coords). Notes ----- The slope of the fitted line is the that of `exog_i` in the full multiple regression. The individual points can be used to assess the influence of points on the estimated coefficient. See Also -------- plot_partregress_grid : Plot partial regression for a set of regressors. """ #NOTE: there is no interaction between possible missing data and #obs_labels yet, so this will need to be tweaked a bit for this case fig, ax = utils.create_mpl_ax(ax) # strings, use patsy to transform to data if isinstance(endog, string_types): endog = dmatrix(endog + "-1", data) if isinstance(exog_others, string_types): RHS = dmatrix(exog_others, data) elif isinstance(exog_others, list): RHS = "+".join(exog_others) RHS = dmatrix(RHS, data) else: RHS = exog_others if isinstance(exog_i, string_types): exog_i = dmatrix(exog_i + "-1", data) # all arrays or pandas-like res_yaxis = OLS(endog, RHS).fit() res_xaxis = OLS(exog_i, RHS).fit() ax.plot(res_xaxis.resid, res_yaxis.resid, 'o', **kwargs) fitted_line = OLS(res_yaxis.resid, res_xaxis.resid).fit() fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax) x_axis_endog_name = res_xaxis.model.endog_names if x_axis_endog_name == 'y': # for no names regression will just get a y x_axis_endog_name = 'x' # this is misleading, so use x ax.set_xlabel("e(%s | X)" % x_axis_endog_name) ax.set_ylabel("e(%s | X)" % res_yaxis.model.endog_names) ax.set_title('Partial Regression Plot', **title_kwargs) #NOTE: if we want to get super fancy, we could annotate if a point is #clicked using this widget #http://stackoverflow.com/questions/4652439/ #is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/ #4674445#4674445 if obs_labels is True: if data is not None: obs_labels = data.index elif hasattr(exog_i, "index"): obs_labels = exog_i.index else: obs_labels = res_xaxis.model.data.row_labels #NOTE: row_labels can be None. #Maybe we should fix this to never be the case. if obs_labels is None: obs_labels = lrange(len(exog_i)) if obs_labels is not False: # could be array-like if len(obs_labels) != len(exog_i): raise ValueError("obs_labels does not match length of exog_i") label_kwargs.update(dict(ha="center", va="bottom")) ax = utils.annotate_axes(lrange(len(obs_labels)), obs_labels, lzip(res_xaxis.resid, res_yaxis.resid), [(0, 5)] * len(obs_labels), "x-large", ax=ax, **label_kwargs) if ret_coords: return fig, (res_xaxis.resid, res_yaxis.resid) else: return fig
def plot_imputed_hist(self, col_name, ax=None, imp_hist_args=None, obs_hist_args=None, all_hist_args=None): """ Display imputed values for one variable as a histogram. Parameters ---------- col_name : string The name of the variable to be plotted. ax : matplotlib axes An axes on which to draw the histograms. If not provided, one is created. imp_hist_args : dict Keyword arguments to be passed to pyplot.hist when creating the histogram for imputed values. obs_hist_args : dict Keyword arguments to be passed to pyplot.hist when creating the histogram for observed values. all_hist_args : dict Keyword arguments to be passed to pyplot.hist when creating the histogram for all values. Returns ------- The matplotlib figure on which the histograms were drawn """ from statsmodels.graphics import utils as gutils from matplotlib.colors import LinearSegmentedColormap if imp_hist_args is None: imp_hist_args = {} if obs_hist_args is None: obs_hist_args = {} if all_hist_args is None: all_hist_args = {} if ax is None: fig, ax = gutils.create_mpl_ax(ax) else: fig = ax.get_figure() ax.set_position([0.1, 0.1, 0.7, 0.8]) ixm = self.ix_miss[col_name] ixo = self.ix_obs[col_name] imp = self.data[col_name].iloc[ixm] obs = self.data[col_name].iloc[ixo] for di in imp_hist_args, obs_hist_args, all_hist_args: if 'histtype' not in di: di['histtype'] = 'step' ha, la = [], [] if len(imp) > 0: h = ax.hist(np.asarray(imp), **imp_hist_args) ha.append(h[-1][0]) la.append("Imp") h1 = ax.hist(np.asarray(obs), **obs_hist_args) h2 = ax.hist(np.asarray(self.data[col_name]), **all_hist_args) ha.extend([h1[-1][0], h2[-1][0]]) la.extend(["Obs", "All"]) leg = fig.legend(ha, la, 'center right', numpoints=1) leg.draw_frame(False) ax.set_xlabel(col_name) ax.set_ylabel("Frequency") return fig
def plot_ccpr(results, exog_idx, ax=None): """Plot CCPR against one regressor. Generates a CCPR (component and component-plus-residual) plot. Parameters ---------- results : result instance A regression results instance. exog_idx : int or string Exogenous, explanatory variable. If string is given, it should be the variable name that you want to use, and you can use arbitrary translations as with a formula. ax : Matplotlib AxesSubplot instance, optional If given, it is used to plot in instead of a new figure being created. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid. Notes ----- The CCPR plot provides a way to judge the effect of one regressor on the response variable by taking into account the effects of the other independent variables. The partial residuals plot is defined as Residuals + B_i*X_i versus X_i. The component adds the B_i*X_i versus X_i to show where the fitted line would lie. Care should be taken if X_i is highly correlated with any of the other independent variables. If this is the case, the variance evident in the plot will be an underestimate of the true variance. References ---------- http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm """ fig, ax = utils.create_mpl_ax(ax) exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) results = maybe_unwrap_results(results) x1 = results.model.exog[:, exog_idx] #namestr = ' for %s' % self.name if self.name else '' x1beta = x1*results.params[exog_idx] ax.plot(x1, x1beta + results.resid, 'o') from statsmodels.tools.tools import add_constant mod = OLS(x1beta, add_constant(x1)).fit() params = mod.params fig = abline_plot(*params, **dict(ax=ax)) #ax.plot(x1, x1beta, '-') ax.set_title('Component and component plus residual plot') ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx)) ax.set_xlabel("%s" % exog_name) return fig
def plot_pacf(x, ax=None, lags=None, alpha=.05, method='ywm', use_vlines=True, **kwargs): """Plot the partial autocorrelation function Plots lags on the horizontal and the correlations on vertical axis. Parameters ---------- x : array_like Array of time-series values ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. lags : array_like, optional Array of lag values, used on horizontal axis. If not given, ``lags=np.arange(len(corr))`` is used. alpha : scalar, optional If a number is given, the confidence intervals for the given level are returned. For instance if alpha=.05, 95 % confidence intervals are returned where the standard deviation is computed according to 1/sqrt(len(x)) method : 'ywunbiased' (default) or 'ywmle' or 'ols' specifies which method for the calculations to use: - yw or ywunbiased : yule walker with bias correction in denominator for acovf - ywm or ywmle : yule walker without bias correction - ols - regression of time series on lags of it and on constant - ld or ldunbiased : Levinson-Durbin recursion with bias correction - ldb or ldbiased : Levinson-Durbin recursion without bias correction use_vlines : bool, optional If True, vertical lines and markers are plotted. If False, only markers are plotted. The default marker is 'o'; it can be overridden with a ``marker`` kwarg. **kwargs : kwargs, optional Optional keyword arguments that are directly passed on to the Matplotlib ``plot`` and ``axhline`` functions. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- matplotlib.pyplot.xcorr matplotlib.pyplot.acorr mpl_examples/pylab_examples/xcorr_demo.py Notes ----- Adapted from matplotlib's `xcorr`. Data are plotted as ``plot(lags, corr, **kwargs)`` """ fig, ax = utils.create_mpl_ax(ax) if lags is None: lags = np.arange(len(x)) nlags = len(lags) - 1 else: nlags = lags lags = np.arange(lags + 1) # +1 for zero lag confint = None if alpha is None: acf_x = pacf(x, nlags=nlags, alpha=alpha, method=method) else: acf_x, confint = pacf(x, nlags=nlags, alpha=alpha, method=method) if use_vlines: ax.vlines(lags, [0], acf_x, **kwargs) ax.axhline(**kwargs) # center the confidence interval TODO: do in acf? kwargs.setdefault('marker', 'o') kwargs.setdefault('markersize', 5) kwargs.setdefault('linestyle', 'None') ax.margins(.05) ax.plot(lags, acf_x, **kwargs) ax.set_title("Partial Autocorrelation") if confint is not None: # center the confidence interval TODO: do in acf? ax.fill_between(lags, confint[:, 0] - acf_x, confint[:, 1] - acf_x, alpha=.25) return fig
def influence_plot(results, external=True, alpha=.05, criterion="cooks", size=48, plot_alpha=.75, ax=None, **kwargs): """ Plot of influence in regression. Plots studentized resids vs. leverage. Parameters ---------- results : results instance A fitted model. external : bool Whether to use externally or internally studentized residuals. It is recommended to leave external as True. alpha : float The alpha value to identify large studentized residuals. Large means abs(resid_studentized) > t.ppf(1-alpha/2, dof=results.df_resid) criterion : str {'DFFITS', 'Cooks'} Which criterion to base the size of the points on. Options are DFFITS or Cook's D. size : float The range of `criterion` is mapped to 10**2 - size**2 in points. plot_alpha : float The `alpha` of the plotted points. ax : matplotlib Axes instance An instance of a matplotlib Axes. Returns ------- fig : matplotlib figure The matplotlib figure that contains the Axes. Notes ----- Row labels for the observations in which the leverage, measured by the diagonal of the hat matrix, is high or the residuals are large, as the combination of large residuals and a high influence value indicates an influence point. The value of large residuals can be controlled using the `alpha` parameter. Large leverage points are identified as hat_i > 2 * (df_model + 1)/nobs. """ fig, ax = utils.create_mpl_ax(ax) infl = results.get_influence() if criterion.lower().startswith('dff'): psize = infl.cooks_distance[0] elif criterion.lower().startswith('coo'): psize = np.abs(infl.dffits[0]) else: raise ValueError("Criterion %s not understood" % criterion) # scale the variables #TODO: what is the correct scaling and the assumption here? #we want plots to be comparable across different plots #so we would need to use the expected distribution of criterion probably old_range = np.ptp(psize) new_range = size**2 - 8**2 psize = (psize - psize.min()) * new_range/old_range + 8**2 leverage = infl.hat_matrix_diag if external: resids = infl.resid_studentized_external else: resids = infl.resid_studentized_internal from scipy import stats cutoff = stats.t.ppf(1.-alpha/2, results.df_resid) large_resid = np.abs(resids) > cutoff large_leverage = leverage > _high_leverage(results) large_points = np.logical_or(large_resid, large_leverage) ax.scatter(leverage, resids, s=psize, alpha=plot_alpha) # add point labels labels = results.model.data.row_labels if labels is None: labels = lrange(len(resids)) ax = utils.annotate_axes(np.where(large_points)[0], labels, lzip(leverage, resids), lzip(-(psize/2)**.5, (psize/2)**.5), "x-large", ax) #TODO: make configurable or let people do it ex-post? font = {"fontsize" : 16, "color" : "black"} ax.set_ylabel("Studentized Residuals", **font) ax.set_xlabel("H Leverage", **font) ax.set_title("Influence Plot", **font) return fig
def plot_pacf(x, ax=None, lags=None, alpha=.05, method='ywm', use_vlines=True, **kwargs): """Plot the partial autocorrelation function Plots lags on the horizontal and the correlations on vertical axis. Parameters ---------- x : array_like Array of time-series values ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. lags : array_like, optional Array of lag values, used on horizontal axis. If not given, ``lags=np.arange(len(corr))`` is used. alpha : scalar, optional If a number is given, the confidence intervals for the given level are returned. For instance if alpha=.05, 95 % confidence intervals are returned where the standard deviation is computed according to 1/sqrt(len(x)) method : 'ywunbiased' (default) or 'ywmle' or 'ols' specifies which method for the calculations to use: - yw or ywunbiased : yule walker with bias correction in denominator for acovf - ywm or ywmle : yule walker without bias correction - ols - regression of time series on lags of it and on constant - ld or ldunbiased : Levinson-Durbin recursion with bias correction - ldb or ldbiased : Levinson-Durbin recursion without bias correction use_vlines : bool, optional If True, vertical lines and markers are plotted. If False, only markers are plotted. The default marker is 'o'; it can be overridden with a ``marker`` kwarg. **kwargs : kwargs, optional Optional keyword arguments that are directly passed on to the Matplotlib ``plot`` and ``axhline`` functions. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- matplotlib.pyplot.xcorr matplotlib.pyplot.acorr mpl_examples/pylab_examples/xcorr_demo.py Notes ----- Adapted from matplotlib's `xcorr`. Data are plotted as ``plot(lags, corr, **kwargs)`` """ fig, ax = utils.create_mpl_ax(ax) if lags is None: lags = np.arange(len(x)) nlags = len(lags) - 1 else: nlags = lags lags = np.arange(lags + 1) # +1 for zero lag acf_x, confint = pacf(x, nlags=nlags, alpha=alpha, method=method) if use_vlines: ax.vlines(lags, [0], acf_x, **kwargs) ax.axhline(**kwargs) # center the confidence interval TODO: do in acf? confint = confint - confint.mean(1)[:,None] kwargs.setdefault('marker', 'o') kwargs.setdefault('markersize', 5) kwargs.setdefault('linestyle', 'None') ax.margins(.05) ax.plot(lags, acf_x, **kwargs) ax.fill_between(lags, confint[:,0], confint[:,1], alpha=.25) ax.set_title("Partial Autocorrelation") return fig
def mosaic(data, index=None, ax=None, horizontal=True, gap=0.005, properties=lambda key: None, labelizer=None, title='', statistic=False, axes_label=True, label_rotation=0.0): """Create a mosaic plot from a contingency table. It allows to visualize multivariate categorical data in a rigorous and informative way. Parameters ---------- data : {dict, Series, ndarray, DataFrame} The contingency table that contains the data. Each category should contain a non-negative number with a tuple as index. It expects that all the combination of keys to be represents; if that is not true, will automatically consider the missing values as 0. The order of the keys will be the same as the one of insertion. If a dict of a Series (or any other dict like object) is used, it will take the keys as labels. If a np.ndarray is provided, it will generate a simple numerical labels. index : list, optional Gives the preferred order for the category ordering. If not specified will default to the given order. It does not support named indexes for hierarchical Series. If a DataFrame is provided, it expects a list with the name of the columns. ax : Axes, optional The graph where display the mosaic. If not given, will create a new figure horizontal : bool, optional The starting direction of the split (by default along the horizontal axis) gap : {float, sequence[float]} The list of gaps to be applied on each subdivision. If the length of the given array is less of the number of subcategories (or if it's a single number) it will extend it with exponentially decreasing gaps properties : dict[str, callable], optional A function that for each tile in the mosaic take the key of the tile and returns the dictionary of properties of the generated Rectangle, like color, hatch or similar. A default properties set will be provided fot the keys whose color has not been defined, and will use color variation to help visually separates the various categories. It should return None to indicate that it should use the default property for the tile. A dictionary of the properties for each key can be passed, and it will be internally converted to the correct function labelizer : dict[str, callable], optional A function that generate the text to display at the center of each tile base on the key of that tile title : str, optional The title of the axis statistic : bool, optional If true will use a crude statistical model to give colors to the plot. If the tile has a constraint that is more than 2 standard deviation from the expected value under independence hypothesis, it will go from green to red (for positive deviations, blue otherwise) and will acquire an hatching when crosses the 3 sigma. axes_label : bool, optional Show the name of each value of each category on the axis (default) or hide them. label_rotation : {float, list[float]} The rotation of the axis label (if present). If a list is given each axis can have a different rotation Returns --------- fig : Figure The figure containing the plot. rects : dict A dictionary that has the same keys of the original dataset, that holds a reference to the coordinates of the tile and the Rectangle that represent it. References ---------- A Brief History of the Mosaic Display Michael Friendly, York University, Psychology Department Journal of Computational and Graphical Statistics, 2001 Mosaic Displays for Loglinear Models. Michael Friendly, York University, Psychology Department Proceedings of the Statistical Graphics Section, 1992, 61-68. Mosaic displays for multi-way contingency tables. Michael Friendly, York University, Psychology Department Journal of the american statistical association March 1994, Vol. 89, No. 425, Theory and Methods Examples ---------- >>> import numpy as np >>> import pandas as pd >>> import matplotlib.pyplot as plt >>> from statsmodels.graphics.mosaicplot import mosaic The most simple use case is to take a dictionary and plot the result >>> data = {'a': 10, 'b': 15, 'c': 16} >>> mosaic(data, title='basic dictionary') >>> plt.show() A more useful example is given by a dictionary with multiple indices. In this case we use a wider gap to a better visual separation of the resulting plot >>> data = {('a', 'b'): 1, ('a', 'c'): 2, ('d', 'b'): 3, ('d', 'c'): 4} >>> mosaic(data, gap=0.05, title='complete dictionary') >>> plt.show() The same data can be given as a simple or hierarchical indexed Series >>> rand = np.random.random >>> from itertools import product >>> tuples = list(product(['bar', 'baz', 'foo', 'qux'], ['one', 'two'])) >>> index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) >>> data = pd.Series(rand(8), index=index) >>> mosaic(data, title='hierarchical index series') >>> plt.show() The third accepted data structure is the np array, for which a very simple index will be created. >>> rand = np.random.random >>> data = 1+rand((2,2)) >>> mosaic(data, title='random non-labeled array') >>> plt.show() If you need to modify the labeling and the coloring you can give a function tocreate the labels and one with the graphical properties starting from the key tuple >>> data = {'a': 10, 'b': 15, 'c': 16} >>> props = lambda key: {'color': 'r' if 'a' in key else 'gray'} >>> labelizer = lambda k: {('a',): 'first', ('b',): 'second', ... ('c',): 'third'}[k] >>> mosaic(data, title='colored dictionary', properties=props, ... labelizer=labelizer) >>> plt.show() Using a DataFrame as source, specifying the name of the columns of interest >>> gender = ['male', 'male', 'male', 'female', 'female', 'female'] >>> pet = ['cat', 'dog', 'dog', 'cat', 'dog', 'cat'] >>> data = pd.DataFrame({'gender': gender, 'pet': pet}) >>> mosaic(data, ['pet', 'gender'], title='DataFrame as Source') >>> plt.show() .. plot :: plots/graphics_mosaicplot_mosaic.py """ if isinstance(data, DataFrame) and index is None: raise ValueError("You must pass an index if data is a DataFrame." " See examples.") from matplotlib.patches import Rectangle #from pylab import Rectangle fig, ax = utils.create_mpl_ax(ax) # normalize the data to a dict with tuple of strings as keys data = _normalize_data(data, index) # split the graph into different areas rects = _hierarchical_split(data, horizontal=horizontal, gap=gap) # if there is no specified way to create the labels # create a default one if labelizer is None: labelizer = lambda k: "\n".join(k) if statistic: default_props = _statistical_coloring(data) else: default_props = _create_default_properties(data) if isinstance(properties, dict): color_dict = properties properties = lambda key: color_dict.get(key, None) for k, v in rects.items(): # create each rectangle and put a label on it x, y, w, h = v conf = properties(k) props = conf if conf else default_props[k] text = labelizer(k) Rect = Rectangle((x, y), w, h, label=text, **props) ax.add_patch(Rect) ax.text(x + w / 2, y + h / 2, text, ha='center', va='center', size='smaller') #creating the labels on the axis #o clearing it if axes_label: if np.iterable(label_rotation): rotation = label_rotation else: rotation = [label_rotation] * 4 labels = _create_labels(rects, horizontal, ax, rotation) else: ax.set_xticks([]) ax.set_xticklabels([]) ax.set_yticks([]) ax.set_yticklabels([]) ax.set_title(title) return fig, rects
def plot_pacf(x, ax=None, lags=None, alpha=.05, method='ywm', use_vlines=True, title='Partial Autocorrelation', zero=True, **kwargs): """Plot the partial autocorrelation function Plots lags on the horizontal and the correlations on vertical axis. Parameters ---------- x : array_like Array of time-series values ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. lags : int or array_like, optional int or Array of lag values, used on horizontal axis. Uses np.arange(lags) when lags is an int. If not provided, ``lags=np.arange(len(corr))`` is used. alpha : scalar, optional If a number is given, the confidence intervals for the given level are returned. For instance if alpha=.05, 95 % confidence intervals are returned where the standard deviation is computed according to 1/sqrt(len(x)) method : 'ywunbiased' (default) or 'ywmle' or 'ols' specifies which method for the calculations to use: - yw or ywunbiased : yule walker with bias correction in denominator for acovf - ywm or ywmle : yule walker without bias correction - ols - regression of time series on lags of it and on constant - ld or ldunbiased : Levinson-Durbin recursion with bias correction - ldb or ldbiased : Levinson-Durbin recursion without bias correction use_vlines : bool, optional If True, vertical lines and markers are plotted. If False, only markers are plotted. The default marker is 'o'; it can be overridden with a ``marker`` kwarg. title : str, optional Title to place on plot. Default is 'Partial Autocorrelation' zero : bool, optional Flag indicating whether to include the 0-lag autocorrelation. Default is True. **kwargs : kwargs, optional Optional keyword arguments that are directly passed on to the Matplotlib ``plot`` and ``axhline`` functions. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- matplotlib.pyplot.xcorr matplotlib.pyplot.acorr mpl_examples/pylab_examples/xcorr_demo.py Notes ----- Adapted from matplotlib's `xcorr`. Data are plotted as ``plot(lags, corr, **kwargs)`` """ fig, ax = utils.create_mpl_ax(ax) lags, nlags, irregular = _prepare_data_corr_plot(x, lags, zero) confint = None if alpha is None: acf_x = pacf(x, nlags=nlags, alpha=alpha, method=method) else: acf_x, confint = pacf(x, nlags=nlags, alpha=alpha, method=method) _plot_corr(ax, title, acf_x, confint, lags, irregular, use_vlines, **kwargs) return fig
def plot_scree(self, ncomp=None, log_scale=True, cumulative=False, ax=None): """ Plot of the ordered eigenvalues Parameters ---------- ncomp : int, optional Number of components ot include in the plot. If None, will included the same as the number of components computed log_scale : boot, optional Flag indicating whether ot use a log scale for the y-axis cumulative : bool, optional Flag indicating whether to plot the eigenvalues or cumulative eigenvalues ax : Matplotlib axes instance, optional An axes on which to draw the graph. If omitted, new a figure is created Returns ------- fig : figure Handle to the figure """ import statsmodels.graphics.utils as gutils fig, ax = gutils.create_mpl_ax(ax) ncomp = self._ncomp if ncomp is None else ncomp vals = np.asarray(self.eigenvals) vals = vals[:self._ncomp] if cumulative: vals = np.cumsum(vals) if log_scale: ax.set_yscale('log') ax.plot(np.arange(ncomp), vals[:ncomp], 'bo') ax.autoscale(tight=True) xlim = np.array(ax.get_xlim()) sp = xlim[1] - xlim[0] xlim += 0.02 * np.array([-sp, sp]) ax.set_xlim(xlim) ylim = np.array(ax.get_ylim()) scale = 0.02 if log_scale: sp = np.log(ylim[1] / ylim[0]) ylim = np.exp( np.array([ np.log(ylim[0]) - scale * sp, np.log(ylim[1]) + scale * sp ])) else: sp = ylim[1] - ylim[0] ylim += scale * np.array([-sp, sp]) ax.set_ylim(ylim) ax.set_title('Scree Plot') ax.set_ylabel('Eigenvalue') ax.set_xlabel('Component Number') fig.tight_layout() return fig
def plot_survfunc(survfuncs, ax=None): """ Plot one or more survivor functions. Parameters ---------- survfuncs : object or array-like A single SurvfuncRight object, or a list or SurvfuncRight objects that are plotted together. Returns ------- A figure instance on which the plot was drawn. Examples -------- Add a legend: >>> import statsmodels.api as sm >>> from statsmodels.duration.survfunc import plot_survfunc >>> data = sm.datasets.get_rdataset("flchain", "survival").data >>> df = data.loc[data.sex == "F", :] >>> sf0 = sm.SurvfuncRight(df["futime"], df["death"]) >>> sf1 = sm.SurvfuncRight(3.0 * df["futime"], df["death"]) >>> fig = plot_survfunc([sf0, sf1]) >>> ax = fig.get_axes()[0] >>> ax.set_position([0.1, 0.1, 0.64, 0.8]) >>> ha, lb = ax.get_legend_handles_labels() >>> leg = fig.legend((ha[0], ha[1]), (lb[0], lb[1]), 'center right') Change the line colors: >>> fig = plot_survfunc([sf0, sf1]) >>> ax = fig.get_axes()[0] >>> ax.set_position([0.1, 0.1, 0.64, 0.8]) >>> ha, lb = ax.get_legend_handles_labels() >>> ha[0].set_color('purple') >>> ha[1].set_color('orange') """ fig, ax = utils.create_mpl_ax(ax) # If we have only a single survival function to plot, put it into # a list. try: assert (type(survfuncs[0]) is SurvfuncRight) except: survfuncs = [survfuncs] for gx, sf in enumerate(survfuncs): # The estimated survival function does not include a point at # time 0, include it here for plotting. surv_times = np.concatenate(([0], sf.surv_times)) surv_prob = np.concatenate(([1], sf.surv_prob)) # If the final times are censoring times they are not included # in the survival function so we add them here mxt = max(sf.time) if mxt > surv_times[-1]: surv_times = np.concatenate((surv_times, [mxt])) surv_prob = np.concatenate((surv_prob, [surv_prob[-1]])) label = getattr(sf, "title", "Group %d" % (gx + 1)) li, = ax.step(surv_times, surv_prob, '-', label=label, lw=2, where='post') # Plot the censored points. ii = np.flatnonzero(np.logical_not(sf.status)) ti = sf.time[ii] jj = np.searchsorted(surv_times, ti) - 1 sp = surv_prob[jj] ax.plot(ti, sp, '+', ms=12, color=li.get_color(), label=label + " points") ax.set_ylim(0, 1.01) return fig