def _setup_subplots(
    subplots,
    nseries,
    sharex=False,
    sharey=False,
    figsize=None,
    ax=None,
    layout=None,
    layout_type="vertical",
):
    """Create (or reuse) the figure and axes that a plot will be drawn on.

    When ``subplots`` is truthy, ``nseries`` axes are requested from pandas'
    ``_subplots`` helper, forwarding the sharing, size and layout options.
    Otherwise a single axes is used: a new one on a new figure when ``ax``
    is None, or the supplied ``ax``, whose figure is resized when
    ``figsize`` is given.

    Returns
    -------
    (fig, axes)
        The figure and the axes after being passed through pandas'
        ``_flatten`` helper.
    """
    from pandas.plotting._tools import _subplots, _flatten

    if subplots:
        fig, axes = _subplots(
            naxes=nseries,
            sharex=sharex,
            sharey=sharey,
            figsize=figsize,
            ax=ax,
            layout=layout,
            layout_type=layout_type,
        )
    elif ax is None:
        # Nothing to reuse: start a brand new single-axes figure.
        fig = plt.figure(figsize=figsize)
        axes = fig.add_subplot(111)
    else:
        # Draw on the caller's axes, resizing its figure only on request.
        fig = ax.get_figure()
        if figsize is not None:
            fig.set_size_inches(figsize)
        axes = ax

    return fig, _flatten(axes)
def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False,
                   diagonal='hist', marker='.', density_kwds=None,
                   hist_kwds=None, range_padding=0.05, **kwds):
    """
    Draw a matrix of scatter plots.

    Only the numeric columns of ``frame`` are plotted. Cell ``[i, j]`` of the
    returned axes array shows column ``j`` on x against column ``i`` on y;
    diagonal cells show the distribution of the single column instead.

    Parameters
    ----------
    frame : DataFrame
    alpha : float, optional
        amount of transparency applied
    figsize : (float,float), optional
        a tuple (width, height) in inches
    ax : Matplotlib axis object, optional
    grid : bool, optional
        accepted for API compatibility; this function does not read it
    diagonal : {'hist', 'kde'}
        pick between 'kde' and 'hist' for
        either Kernel Density Estimation or Histogram
        plot in the diagonal ('density' is accepted as an alias of 'kde');
        any other value leaves the diagonal empty
    marker : str, optional
        Matplotlib marker type, default '.'
    hist_kwds : other plotting keyword arguments
        To be passed to hist function
    density_kwds : other plotting keyword arguments
        To be passed to kernel density estimate plot
    range_padding : float, optional
        relative extension of axis range in x and y
        with respect to (x_max - x_min) or (y_max - y_min),
        default 0.05
    kwds : other plotting keyword arguments
        To be passed to scatter function

    Returns
    -------
    axes
        The 2-D array of axes produced by ``_subplots(..., squeeze=False)``.

    Examples
    --------
    >>> df = DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
    >>> scatter_matrix(df, alpha=0.2)
    """
    df = frame._get_numeric_data()
    n = df.columns.size
    naxes = n * n
    fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax,
                          squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = notna(df)

    marker = _get_marker_compat(marker)

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # GH 14855
    kwds.setdefault('edgecolors', 'none')

    # Padded (min, max) of every column, computed on non-missing values only.
    boundaries_list = []
    for a in df.columns:
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    # enumerate() replaces the Python-2 era zip(lrange(n), ...) pairing.
    for i, a in enumerate(df.columns):
        for j, b in enumerate(df.columns):
            ax = axes[i, j]

            if i == j:
                values = df[a].values[mask[a].values]

                # Deal with the diagonal by drawing a histogram there.
                if diagonal == 'hist':
                    ax.hist(values, **hist_kwds)

                elif diagonal in ('kde', 'density'):
                    from scipy.stats import gaussian_kde
                    y = values
                    gkde = gaussian_kde(y)
                    ind = np.linspace(y.min(), y.max(), 1000)
                    ax.plot(ind, gkde.evaluate(ind), **density_kwds)

                ax.set_xlim(boundaries_list[i])

            else:
                # Only rows where both columns are present can be plotted.
                common = (mask[a] & mask[b]).values

                ax.scatter(df[b][common], df[a][common],
                           marker=marker, alpha=alpha, **kwds)

                ax.set_xlim(boundaries_list[j])
                ax.set_ylim(boundaries_list[i])

            ax.set_xlabel(b)
            ax.set_ylabel(a)

            # Keep tick labels only on the left column and the bottom row.
            if j != 0:
                ax.yaxis.set_visible(False)
            if i != n - 1:
                ax.xaxis.set_visible(False)

    if len(df.columns) > 1:
        # The top-left cell is a diagonal plot, so its y axis is in
        # count/density units. Borrow the data-unit ticks of its right-hand
        # neighbour and rescale their positions into the cell's own y range.
        lim1 = boundaries_list[0]
        locs = axes[0][1].yaxis.get_majorticklocs()
        locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])]
        adj = (locs - lim1[0]) / (lim1[1] - lim1[0])

        lim0 = axes[0][0].get_ylim()
        adj = adj * (lim0[1] - lim0[0]) + lim0[0]
        axes[0][0].yaxis.set_ticks(adj)

        if np.all(locs == locs.astype(int)):
            # if all ticks are int
            locs = locs.astype(int)
        axes[0][0].yaxis.set_ticklabels(locs)

    _set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)

    return axes
def _joyplot(data, grid=False, labels=None, sublabels=None, xlabels=True,
             label_strings=None, xlabelsize=None, xrot=None, ylabelsize=None,
             yrot=None, ax=None, figsize=None, hist=False, bins=10,
             fade=False, xlim=None, ylim='max', fill=True, linecolor=None,
             overlap=1, background=None, range_style='all', x_range=None,
             tails=0.2, title=None, x_spacing=None, legend=False,
             loc="upper right", colormap=None, color=None, x_title=None,
             **kwargs):
    """
    Internal method.
    Draw a joyplot from an appropriately nested collection of lists
    using matplotlib and pandas.

    Parameters
    ----------
    data : nested collection
        Iterated as groups -> subgroups -> values. One axis is drawn per
        group and one density (or histogram series) per subgroup.
    grid : boolean or {'x', 'y', 'both'}, default False
        Which grid lines to show; True behaves like 'both'.
    labels : list, default None
        One label per group; its length must equal ``len(data)``.
    sublabels : list, default None
        One label per subgroup, used for the legend. When None the legend
        is disabled.
    xlabels : boolean, default True
        Whether to show the x tick labels on the shared background axis.
    label_strings : list, default None
        Optional per-group strings. When non-empty, entry ``i`` replaces
        both the density label and the axis label of group ``i``.
    xlabelsize, xrot, ylabelsize, yrot, xlim
        Accepted for signature compatibility; not read by this function.
    ax : matplotlib axes object, default None
    figsize : tuple
        The size of the figure to create in inches by default
    hist : boolean, default False
        Draw histograms instead of densities.
    bins : integer, default 10
        Number of histogram bins to be used
    fade : boolean, default False
        When True, ``kwargs['alpha']`` is set per group via ``_get_alpha``.
    ylim : {'max', 'own'} or value accepted by ``Axes.set_ylim``
        'max' gives all axes a common y range, 'own' leaves them untouched.
    fill : boolean, default True
    linecolor : color, default None
        Defaults to "k" when ``fill is True``; when ``fill`` is falsy each
        density line takes the colour of its own element.
    overlap : number, default 1
        Drives the negative vertical padding between axes.
    background : color, default None
    range_style : {'all', 'own', 'group'} or list/ndarray
        How the x range of each density is chosen.
    x_range : default None
        Explicit global x range; computed from all values when None.
    tails : float, default 0.2
        Forwarded to ``_x_range`` for the 'own' and 'group' styles.
    title : str, default None
    x_spacing : default None
        Forwarded to ``_setup_axis``.
    legend : boolean, default False
    loc : default "upper right"
        Legend location; "lower" locations put the legend on the last axis.
    colormap, color : default None
        A single value or a list; see ``_get_color`` below.
    x_title : str, default None
        Label for the shared x axis.
    kwarg : other plotting keyword arguments
        To be passed to hist/kde plot function

    Returns
    -------
    (fig, axes)
        The figure and the list of per-group axes followed by the shared
        background axis.
    """
    if fill is True and linecolor is None:
        linecolor = "k"

    if sublabels is None:
        legend = False

    # None replaces the former mutable ``[]`` default.
    if label_strings is None:
        label_strings = []

    def _get_color(i, num_axes, j, num_subgroups):
        """Pick the colour for subgroup ``j`` of group ``i``."""
        if isinstance(color, list):
            return color[i]
        elif color is not None:
            return color
        elif isinstance(colormap, list):
            return colormap[j](i/num_axes)
        elif color is None and colormap is None:
            return plt.rcParams['axes.prop_cycle'].by_key()['color'][j]
        else:
            return colormap(i/num_axes)

    ygrid = (grid is True or grid == 'y' or grid == 'both')
    xgrid = (grid is True or grid == 'x' or grid == 'both')

    num_axes = len(data)

    if x_range is None:
        global_x_range = _x_range([v for g in data for sg in g for v in sg])
    else:
        global_x_range = _x_range(x_range, 0.0)
    global_x_min, global_x_max = min(global_x_range), max(global_x_range)

    # Each plot will have its own axis
    fig, axes = _subplots(naxes=num_axes, ax=ax, squeeze=False,
                          sharex=True, sharey=False, figsize=figsize,
                          layout_type='vertical')
    _axes = _flatten(axes)

    # The legend must be drawn in the last axis if we want it at the bottom.
    # (The original referenced the undefined name ``num_axis`` here.)
    if loc in (3, 4, 8) or 'lower' in str(loc):
        legend_axis = num_axes - 1
    else:
        legend_axis = 0

    # A couple of simple checks.
    if labels is not None:
        assert len(labels) == num_axes
    if sublabels is not None:
        assert all(len(g) == len(sublabels) for g in data)
    if isinstance(colormap, list):
        assert all(len(g) == len(colormap) for g in data)

    for i, group in enumerate(data):
        a = _axes[i]
        group_zorder = i
        if fade:
            kwargs['alpha'] = _get_alpha(i, num_axes)

        num_subgroups = len(group)

        if hist:
            # matplotlib hist() already handles multiple subgroups in a
            # histogram
            a.hist(group, label=sublabels, bins=bins,
                   range=[global_x_min, global_x_max],
                   edgecolor=linecolor, zorder=group_zorder, **kwargs)
        else:
            for j, subgroup in enumerate(group):

                # Compute the x_range of the current plot
                if range_style == 'all':
                    # All plots have the same range
                    element_x_range = global_x_range
                elif range_style == 'own':
                    # Each plot has its own range
                    element_x_range = _x_range(subgroup, tails)
                elif range_style == 'group':
                    # Each plot has a range that covers the whole group
                    element_x_range = _x_range(group, tails)
                elif isinstance(range_style, (list, np.ndarray)):
                    # All plots have exactly the range passed as argument
                    element_x_range = _x_range(range_style, 0.0)
                else:
                    raise NotImplementedError("Unrecognized range style.")

                sublabel = None if sublabels is None else sublabels[j]

                element_zorder = group_zorder + j/(num_subgroups+1)
                element_color = _get_color(i, num_axes, j, num_subgroups)

                # Resolve the line colour per element. Assigning back to
                # ``linecolor`` would make every later line reuse the colour
                # of the first element drawn.
                element_linecolor = linecolor
                if not fill and linecolor is None:
                    element_linecolor = element_color

                # A per-group string, when supplied, overrides the sublabel.
                if label_strings:
                    density_label = label_strings[i]
                else:
                    density_label = sublabel

                plot_density(a, element_x_range, subgroup,
                             fill=fill, linecolor=element_linecolor,
                             label=density_label, zorder=element_zorder,
                             color=element_color, bins=bins, **kwargs)

        # Setup the current axis: transparency, labels, spines.
        if labels is None:
            col_name = None
        elif label_strings:
            col_name = label_strings[i]
        else:
            col_name = labels[i]
        _setup_axis(a, global_x_range, col_name=col_name, grid=ygrid,
                    x_spacing=x_spacing)

        # When needed, draw the legend
        if legend and i == legend_axis:
            a.legend(loc=loc)
            # Bypass alpha values, in case
            for p in a.get_legend().get_patches():
                p.set_alpha(1.0)
            for line in a.get_legend().get_lines():
                line.set_alpha(1.0)

    # Final adjustments

    # Set the y limit for the density plots.
    # Since the y range in the subplots can vary significantly,
    # different options are available.
    if ylim == 'max':
        # Set all yaxis limit to the same value (max range among all)
        max_ylim = max(axis.get_ylim()[1] for axis in _axes)
        min_ylim = min(axis.get_ylim()[0] for axis in _axes)
        for axis in _axes:
            axis.set_ylim([min_ylim - 0.1*(max_ylim-min_ylim), max_ylim])

    elif ylim == 'own':
        # Do nothing, each axis keeps its own ylim
        pass

    else:
        # Set all yaxis max lim to the argument value ylim
        try:
            for axis in _axes:
                axis.set_ylim(ylim)
        except Exception:
            # Best effort: an unusable ylim is reported, not raised.
            print("Warning: the value of ylim must be either 'max', 'own', "
                  "or a tuple of length 2. "
                  "The value you provided has no effect.")

    # Compute a final axis, used to apply global settings
    last_axis = fig.add_subplot(1, 1, 1)

    # Background color
    if background is not None:
        last_axis.patch.set_facecolor(background)

    for side in ['top', 'bottom', 'left', 'right']:
        last_axis.spines[side].set_visible(_DEBUG)

    # This looks hacky, but all the axes share the x-axis,
    # so they have the same lims and ticks
    last_axis.set_xlim(_axes[0].get_xlim())
    if xlabels is True:
        last_axis.set_xticks(_axes[0].get_xticks()[1:-1])
        last_axis.set_xticklabels(_axes[0].get_xticks()[1:-1])
        for t in last_axis.get_xticklabels():
            t.set_visible(True)

        # If grid is enabled, do not allow xticks (they are ugly)
        if xgrid:
            last_axis.tick_params(axis='both', which='both', length=0)
    else:
        last_axis.xaxis.set_visible(False)

    last_axis.yaxis.set_visible(False)
    last_axis.grid(xgrid)

    # set the x axis title if you want it
    if x_title is not None:
        last_axis.set_xlabel(x_title)

    # Last axis on the back
    last_axis.zorder = min(axis.zorder for axis in _axes) - 1
    _axes = list(_axes) + [last_axis]

    if title is not None:
        plt.title(title)

    # The magic overlap happens here.
    h_pad = 5 + (- 5*(1 + overlap))
    # Lay out the figure we drew on, which need not be pyplot's current one.
    fig.tight_layout(h_pad=h_pad)

    return fig, _axes
def _joyplot(data, grid=False, labels=None, sublabels=None, xlabels=True,
             xlabelsize=None, xrot=None, ylabelsize=None, yrot=None,
             ax=None, figsize=None, hist=False, bins=10, fade=False,
             xlim=None, ylim='max', fill=True, linecolor=None, overlap=1,
             background=None, range_style='all', x_range=None, tails=0.2,
             title=None, legend=False, loc="upper right", colormap=None,
             color=None, **kwargs):
    """
    Internal method.
    Draw a joyplot from an appropriately nested collection of lists
    using matplotlib and pandas.

    Parameters
    ----------
    data : nested collection
        Iterated as groups -> subgroups -> values. One axis is drawn per
        group and one density (or histogram series) per subgroup.
    grid : boolean or {'x', 'y', 'both'}, default False
        Which grid lines to show; True behaves like 'both'.
    labels : list, default None
        One label per group; its length must equal ``len(data)``.
    sublabels : list, default None
        One label per subgroup, used for the legend. When None the legend
        is disabled.
    xlabels : boolean, default True
        Whether to show the x tick labels on the shared background axis.
    xlabelsize : int, default None
        If specified changes the x-axis label size
    xrot : float, default None
        rotation of x axis labels
    ylabelsize : int, default None
        If specified changes the y-axis label size
    yrot : float, default None
        rotation of y axis labels
    ax : matplotlib axes object, default None
    figsize : tuple
        The size of the figure to create in inches by default
    hist : boolean, default False
        Draw histograms instead of densities.
    bins : integer, default 10
        Number of histogram bins to be used
    fade : boolean, default False
        When True, ``kwargs['alpha']`` is set per group via ``_get_alpha``.
    xlim
        Accepted for signature compatibility; not read by this function.
    ylim : {'max', 'own'} or value accepted by ``Axes.set_ylim``
        'max' gives all axes a common y range, 'own' leaves them untouched.
    fill : boolean, default True
    linecolor : color, default None
        Defaults to "k" when ``fill is True``.
    overlap : number, default 1
        Drives the negative vertical padding between axes.
    background : color, default None
    range_style : {'all', 'own', 'group'} or list/ndarray
        How the x range of each density is chosen.
    x_range : default None
        Explicit global x range; computed from all values when None.
    tails : float, default 0.2
        Forwarded to ``_x_range`` for the 'own' and 'group' styles.
    title : str, default None
    legend : boolean, default False
    loc : default "upper right"
        Legend location; "lower" locations put the legend on the last axis.
    colormap, color : default None
        A single value or a list; see ``_get_color`` below.
    kwarg : other plotting keyword arguments
        To be passed to hist/kde plot function

    Returns
    -------
    (fig, axes)
        The figure and the list of per-group axes followed by the shared
        background axis.
    """
    if fill is True and linecolor is None:
        linecolor = "k"

    if sublabels is None:
        legend = False

    def _get_color(i, num_axes, j, num_subgroups):
        """Pick the colour for subgroup ``j`` of group ``i``."""
        if isinstance(color, list):
            # A colour list is indexed per subgroup when there are several
            # subgroups, otherwise per group.
            return color[j] if num_subgroups > 1 else color[i]
        elif color is not None:
            return color
        elif isinstance(colormap, list):
            return colormap[j](i / num_axes)
        elif color is None and colormap is None:
            cycle_colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
            # Wrap around so more subgroups than cycle colours still works.
            return cycle_colors[j % len(cycle_colors)]
        else:
            return colormap(i / num_axes)

    ygrid = (grid is True or grid == 'y' or grid == 'both')
    xgrid = (grid is True or grid == 'x' or grid == 'both')

    num_axes = len(data)

    if x_range is None:
        global_x_range = _x_range([v for g in data for sg in g for v in sg])
    else:
        global_x_range = _x_range(x_range, 0.0)
    global_x_min, global_x_max = min(global_x_range), max(global_x_range)

    # Each plot will have its own axis
    fig, axes = _subplots(naxes=num_axes, ax=ax, squeeze=False,
                          sharex=True, sharey=False, figsize=figsize,
                          layout_type='vertical')
    _axes = _flatten(axes)

    # The legend must be drawn in the last axis if we want it at the bottom.
    # (The original referenced the undefined name ``num_axis`` here.)
    if loc in (3, 4, 8) or 'lower' in str(loc):
        legend_axis = num_axes - 1
    else:
        legend_axis = 0

    # A couple of simple checks.
    if labels is not None:
        assert len(labels) == num_axes
    if sublabels is not None:
        assert all(len(g) == len(sublabels) for g in data)
    if isinstance(color, list):
        assert all(len(g) <= len(color) for g in data)
    if isinstance(colormap, list):
        assert all(len(g) == len(colormap) for g in data)

    for i, group in enumerate(data):
        a = _axes[i]
        group_zorder = i
        if fade:
            kwargs['alpha'] = _get_alpha(i, num_axes)

        num_subgroups = len(group)

        if hist:
            # matplotlib hist() already handles multiple subgroups in a
            # histogram
            a.hist(group, label=sublabels, bins=bins, color=color,
                   range=[global_x_min, global_x_max],
                   edgecolor=linecolor, zorder=group_zorder, **kwargs)
        else:
            for j, subgroup in enumerate(group):

                # Compute the x_range of the current plot
                if range_style == 'all':
                    # All plots have the same range
                    element_x_range = global_x_range
                elif range_style == 'own':
                    # Each plot has its own range
                    element_x_range = _x_range(subgroup, tails)
                elif range_style == 'group':
                    # Each plot has a range that covers the whole group
                    element_x_range = _x_range(group, tails)
                elif isinstance(range_style, (list, np.ndarray)):
                    # All plots have exactly the range passed as argument
                    element_x_range = _x_range(range_style, 0.0)
                else:
                    raise NotImplementedError("Unrecognized range style.")

                sublabel = None if sublabels is None else sublabels[j]

                element_zorder = group_zorder + j / (num_subgroups + 1)
                element_color = _get_color(i, num_axes, j, num_subgroups)

                plot_density(a, element_x_range, subgroup,
                             fill=fill, linecolor=linecolor, label=sublabel,
                             zorder=element_zorder, color=element_color,
                             bins=bins, **kwargs)

        # Setup the current axis: transparency, labels, spines.
        col_name = None if labels is None else labels[i]
        _setup_axis(a, global_x_range, col_name=col_name, grid=ygrid,
                    ylabelsize=ylabelsize, yrot=yrot)

        # When needed, draw the legend
        if legend and i == legend_axis:
            a.legend(loc=loc)
            # Bypass alpha values, in case
            for p in a.get_legend().get_patches():
                p.set_facecolor(p.get_facecolor())
                p.set_alpha(1.0)
            for line in a.get_legend().get_lines():
                line.set_alpha(1.0)

    # Final adjustments

    # Set the y limit for the density plots.
    # Since the y range in the subplots can vary significantly,
    # different options are available.
    if ylim == 'max':
        # Set all yaxis limit to the same value (max range among all)
        max_ylim = max(axis.get_ylim()[1] for axis in _axes)
        min_ylim = min(axis.get_ylim()[0] for axis in _axes)
        for axis in _axes:
            axis.set_ylim([min_ylim - 0.1 * (max_ylim - min_ylim), max_ylim])

    elif ylim == 'own':
        # Do nothing, each axis keeps its own ylim
        pass

    else:
        # Set all yaxis lim to the argument value ylim
        try:
            for axis in _axes:
                axis.set_ylim(ylim)
        except Exception:
            # Best effort: an unusable ylim is reported, not raised.
            print(
                "Warning: the value of ylim must be either 'max', 'own', "
                "or a tuple of length 2. "
                "The value you provided has no effect."
            )

    # Compute a final axis, used to apply global settings
    last_axis = fig.add_subplot(1, 1, 1)

    # Background color
    if background is not None:
        last_axis.patch.set_facecolor(background)

    for side in ['top', 'bottom', 'left', 'right']:
        last_axis.spines[side].set_visible(_DEBUG)

    # This looks hacky, but all the axes share the x-axis,
    # so they have the same lims and ticks
    last_axis.set_xlim(_axes[0].get_xlim())
    if xlabels is True:
        last_axis.set_xticks(np.array(_axes[0].get_xticks()[1:-1]))
        for t in last_axis.get_xticklabels():
            t.set_visible(True)
            t.set_fontsize(xlabelsize)
            t.set_rotation(xrot)

        # If grid is enabled, do not allow xticks (they are ugly)
        if xgrid:
            last_axis.tick_params(axis='both', which='both', length=0)
    else:
        last_axis.xaxis.set_visible(False)

    last_axis.yaxis.set_visible(False)
    last_axis.grid(xgrid)

    # Last axis on the back
    last_axis.zorder = min(axis.zorder for axis in _axes) - 1
    _axes = list(_axes) + [last_axis]

    if title is not None:
        plt.title(title)

    # The magic overlap happens here.
    h_pad = 5 + (-5 * (1 + overlap))
    fig.tight_layout(h_pad=h_pad)

    return fig, _axes
def plot_scatter_matrix(
        data,
        cols,
        alpha=0.8,
        figsize=None,
        ax=None,
        grid=False,
        diagonal="hist",
        marker=".",
        density_kwds=None,
        hist_kwds={'bins': 20},
        range_padding=0.05,
        plot_axes="lower",  # "all", "lower", "upper"
        **kwds):
    """Draw a scatter-plot matrix of ``data[cols]`` and show it.

    Only the numeric columns of the selection are plotted. Cell ``[i, j]``
    shows column ``j`` on x against column ``i`` on y; diagonal cells show
    the distribution of the single column. Cells are annotated with the
    pairwise correlation coefficient, and ``plt.show()`` is called before
    returning.

    Parameters
    ----------
    data : DataFrame
    cols : list
        Columns of ``data`` to plot.
    alpha : float, default 0.8
        Transparency of the scatter markers.
    figsize : (float, float), optional
        Figure size in inches; ``(15, 9)`` is used when None.
    ax : Matplotlib axis object, optional
    grid : bool, optional
        Accepted for signature compatibility; not read by this function.
    diagonal : {'hist', 'kde', 'density'}
        Plot type for the diagonal; any other value leaves it empty.
    marker : str, default '.'
        Matplotlib marker; unknown markers fall back to "o".
    density_kwds : dict, optional
        Passed to the density line plot.
    hist_kwds : dict, default {'bins': 20}
        Passed to the histogram plot.
    range_padding : float, default 0.05
        Relative extension of each axis range beyond the data range.
    plot_axes : {'lower', 'all', 'upper'}
        Which off-diagonal triangle(s) receive scatter plots; the other
        cells are hidden.
    kwds : other plotting keyword arguments
        Passed to the scatter function.

    Returns
    -------
    axes
        The 2-D array of axes produced by ``_subplots(..., squeeze=False)``.
    """
    features = data[cols]

    def _get_marker_compat(marker):
        """Return ``marker`` if matplotlib knows it, else "o"."""
        if marker not in mlines.lineMarkers:
            return "o"
        return marker

    df = features._get_numeric_data()
    n = df.columns.size
    naxes = n * n
    # Honour the caller's figsize; (15, 9) was previously hard-coded and the
    # argument silently ignored.
    if figsize is None:
        figsize = (15, 9)
    fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = notna(df)

    marker = _get_marker_compat(marker)

    # Copy so the shared default dict can never be modified through us.
    hist_kwds = dict(hist_kwds) if hist_kwds else {}
    density_kwds = density_kwds or {}

    kwds.setdefault("edgecolors", "none")

    # Padded (min, max) of every column, computed on non-missing values only.
    boundaries_list = []
    for a in df.columns:
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.0
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    for i, a in enumerate(df.columns):
        for j, b in enumerate(df.columns):
            ax = axes[i, j]
            # Hidden by default; re-shown only for cells that get a plot.
            ax.set_visible(False)

            if i == j:
                values = df[a].values[mask[a].values]

                # Deal with the diagonal by drawing a histogram there.
                if diagonal == "hist":
                    ax.hist(values, **hist_kwds)

                elif diagonal in ("kde", "density"):
                    y = values
                    gkde = gaussian_kde(y)
                    ind = np.linspace(y.min(), y.max(), 1000)
                    ax.plot(ind, gkde.evaluate(ind), **density_kwds)

                ax.set_xlim(boundaries_list[i])
                ax.set_visible(True)

            elif plot_axes == "all" or (i > j and plot_axes == "lower") or (
                    i < j and plot_axes == "upper"):
                # Only rows where both columns are present can be plotted.
                common = (mask[a] & mask[b]).values

                ax.scatter(df[b][common], df[a][common],
                           marker=marker, alpha=alpha, **kwds)

                ax.set_xlim(boundaries_list[j])
                ax.set_ylim(boundaries_list[i])
                ax.set_visible(True)

            ax.set_xlabel(b, rotation=40)
            ax.set_ylabel(a, rotation=40)

            if plot_axes in ("all", "lower"):
                # Tick labels only on the left column and the bottom row.
                if j != 0:
                    ax.yaxis.set_visible(False)
                if i != n - 1:
                    ax.xaxis.set_visible(False)
            elif plot_axes == "upper":
                # y labels only on the diagonal, x labels on top of row 0.
                if i != j:
                    ax.yaxis.set_visible(False)
                if i == 0:
                    ax.xaxis.tick_top()
                    ax.xaxis.set_label_position('top')
                else:
                    ax.xaxis.set_visible(False)

    if len(df.columns) > 1:
        # The top-left cell is a diagonal plot, so its y axis is in
        # count/density units. Borrow the data-unit ticks of its right-hand
        # neighbour and rescale their positions into the cell's own y range.
        lim1 = boundaries_list[0]
        locs = axes[0][1].yaxis.get_majorticklocs()
        locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])]
        adj = (locs - lim1[0]) / (lim1[1] - lim1[0])

        lim0 = axes[0][0].get_ylim()
        adj = adj * (lim0[1] - lim0[0]) + lim0[0]
        axes[0][0].yaxis.set_ticks(adj)

        if np.all(locs == locs.astype(int)):
            # if all ticks are int
            locs = locs.astype(int)
        axes[0][0].yaxis.set_ticklabels(locs)

    _set_ticks_props(axes, xlabelsize=6, xrot=0, ylabelsize=6, yrot=0)
    axes[0][0].yaxis.set_visible(False)

    corrs = df.corr().values

    # k=1 selects the lower triangle, the diagonal and the first
    # super-diagonal. Use numpy directly rather than pyplot's ``plt.np``.
    # NOTE(review): cells hidden above are annotated too; presumably only
    # the visible ones are meant to show text -- confirm.
    for i, j in zip(*np.tril_indices_from(axes, k=1)):
        axes[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2),
                            xycoords='axes fraction', ha='center',
                            va='center', size=12)

    plt.show()
    return axes