def singlecdf(ax, data, weights, lower, upper, bins, color, label=None, **kwargs):
    """
    Plot a single CDF curve on ``ax``.

    The CDF is computed with ``gs.cdf``. If it has more than 1000 points it is
    re-sampled onto a coarser probability grid (0.001 steps in both tails, 0.01
    steps in between) before plotting, to keep the number of plotted points small.

    Parameters:
        ax: Matplotlib axis that the line is drawn on (``ax.plot`` is called)
        data: values passed to ``gs.cdf``
        weights: weights passed to ``gs.cdf``
        lower: ``lower`` argument passed to ``gs.cdf``
        upper: ``upper`` argument passed to ``gs.cdf``
        bins: ``bins`` argument passed to ``gs.cdf``
        color: line color passed to ``ax.plot``
        label: legend label passed to ``ax.plot``
        **kwargs: additional keyword arguments passed to ``ax.plot``

    Returns:
        The return value of ``ax.plot`` (a list of line artists).
    """
    cdf_x, cdfvals = gs.cdf(data, weights=weights, lower=lower, upper=upper, bins=bins)
    # Matplotlib is a memory hog if too many points are used. Limit the number of points the
    # CDF is built with to roughly 1000. The tails are given extra attention to make sure they
    # are defined nicely.
    if len(cdf_x) > 1000:
        cdfinterp = scipy.interpolate.interp1d(x=cdfvals, y=cdf_x)
        pmin, pmax = cdfvals.min(), cdfvals.max()
        grid = np.concatenate([
            np.arange(pmin, 0.1, 0.001),
            np.arange(0.1, 0.9, 0.01),
            np.arange(0.9, pmax, 0.001),
        ])
        # interp1d raises outside [pmin, pmax]; np.arange with a float step may overshoot its
        # stop value, and the fixed 0.1/0.9 break points may lie outside the CDF range
        grid = grid[(grid >= pmin) & (grid <= pmax)]
        # Always keep both end points so that the plotted curve spans the full CDF
        cdfvals = np.unique(np.concatenate([[pmin], grid, [pmax]]))
        # A single vectorized evaluation replaces the per-point Python loop
        cdf_x = cdfinterp(cdfvals)
    fig = ax.plot(cdf_x, cdfvals, color=color, label=label, **kwargs)
    return fig
def scatter_plot(x, y, wt=None, nmax=None, s=None, c=None, alpha=None, cmap=None, clim=None,
                 cbar=False, cbar_label=None, stat_blk=None, stat_xy=None, stat_ha=None,
                 stat_fontsize=None, roundstats=None, sigfigs=None, xlim=None, ylim=None,
                 xlabel=None, ylabel=None, output_file=None, out_kws=None, title=None,
                 grid=None, axis_xy=None, label='_nolegend_', ax=None, figsize=None,
                 return_plot=False, logx=None, logy=None, **kwargs):
    '''
    Scatter plot that mimics the GSLIB scatter_plot program, providing summary statistics,
    kernel density estimate coloring, etc. Non-finite values (NaN or inf) in ``x``, ``y`` or
    ``wt`` are treated as null and removed from the plot and statistics.

    Parameters:
        x(np.ndarray or pd.Series): 1-D array with the variable to plot on the x-axis.
        y(np.ndarray or pd.Series): 1-D array with the variable to plot on the y-axis.

    Keyword arguments:
        wt(np.ndarray or pd.DataFrame): 1-D array with weights that are used in the calculation
            of displayed statistics.
        s(float or np.ndarray or pd.Series): size of each scatter point. Based on
            Parameters['plotting.scatter_plot.s'] if None.
        c(color or np.ndarray or pd.Series): color of each scatter point, as an array or valid
            Matplotlib color. Alternatively, 'KDE' may be specified to color each point according
            to its associated kernel density estimate. 'KDE' may be followed by a percentile
            between 1 and 100 (e.g., 'KDEp95' or 'KDE95') to clip the upper end of the density
            values used for coloring. Based on Parameters['plotting.scatter_plot.c'] if None.
        nmax (int): specify the maximum number of scatter points that should be displayed, which
            may be necessary due to the time-requirements of plotting many data. If specified
            and exceeded, nmax randomly drawn points are plotted. Note that this does not
            impact summary statistics.
        alpha(float): opacity of the scatter. Based on Parameters['plotting.scatter_plot.alpha']
            if None.
        cmap (str): A matplotlib colormap object or a registered matplotlib colormap name.
            Based on Parameters['plotting.scatter_plot.cmap'] if None and ``c`` provides one
            value per plotted point.
        clim (float tuple): Data minimum and maximum values
        cbar (bool): Indicate if a colorbar should be plotted or not. Ignored when the points
            are not color-mapped (e.g., a single color is used).
        cbar_label (str): Colorbar title
        stat_blk(str or list): statistics to place in the plot, which should be 'all' or
            a list that may contain ['count', 'pearson', 'spearmanr', 'noweightflag']
            ('spearman' is accepted as an alias of 'spearmanr'). Based on
            Parameters['plotting.scatter_plot.stat_blk'] if None. Set to False to disable.
        stat_xy (float tuple): X, Y coordinates of the annotated statistics in axes space.
            Based on Parameters['plotting.scatter_plot.stat_xy'] if None.
        stat_ha (str): Horizontal alignment parameter for the annotated statistics. Can be
            ``'right'``, ``'left'``, or ``'center'``. If None, based on
            Parameters['plotting.stat_ha']
        stat_fontsize (float): the fontsize for the statistics block. If None, based on
            Parameters['plotting.stat_fontsize']. If less than 1, it is the fraction of the
            matplotlib.rcParams['font.size']. If greater than 1, it the absolute font size.
        roundstats (bool): Indicate if the statistics should be rounded to the number of digits
            or to a number of significant figures (e.g., 0.000 vs. 1.14e-5). The number of
            digits or figures used is set by the parameter ``sigfigs``. Based on
            Parameters['plotting.roundstats'] if None.
        sigfigs (int): Number of significant figures or number of digits (depending on
            ``roundstats``) to display for the float statistics. Based on
            Parameters['plotting.sigfigs'] if None.
        xlim(tuple): x-axis limits - xlim[0] to xlim[1]. Based on the data if None
        ylim(tuple): y-axis limits - ylim[0] to ylim[1]. Based on the data if None.
        xlabel(str): label of the x-axis, extracted from x if None
        ylabel(str): label of the y-axis, extracted from y if None
        output_file (str): Output figure file name and location
        out_kws (dict): Optional dictionary of permissible keyword arguments to pass to
            :func:`gs.export_image() <pygeostat.plotting.export_image.export_image>`
        title(str): plot title
        grid(bool): plot grid lines in each panel? Based on Parameters['plotting.grid'] if None.
        axis_xy(bool): if True, mimic a GSLIB-style scatter_plot, where only the bottom and left
            axes lines are displayed. Based on Parameters['plotting.axis_xy'] if None.
        label(str): label of scatter for legend
        ax(Matplotlib axis handle): if None, create a new figure and axis handles
        figsize(tuple): size of the figure, if creating a new one when ax = None
        return_plot (bool): if True, also return the object returned by matplotlib's scatter
        logx, logy (str): permissible mpl axis scale, like `log`
        **kwargs: Optional permissible keyword arguments to pass to matplotlib's scatter
            function

    Return:
        ax(Matplotlib axis handle), or the tuple ``(ax, plot)`` if ``return_plot`` is True

    Raises:
        ValueError: if x is not 1-D, if x, y and wt do not share the same shape, if a KDE
            percentile or ``stat_blk`` entry is invalid, or if a negative lower axis limit is
            used with a log axis.

    **Examples:**

    Basic scatter example:

    .. plot::

        import pygeostat as gs

        # Load the data
        data_file = gs.ExampleData('point3d_ind_mv')

        # Select a couple of variables
        x, y = data_file[data_file.variables[0]], data_file[data_file.variables[1]]

        # Scatter plot with default parameters
        gs.scatter_plot(x, y, figsize=(5, 5), cmap='hot')

        # Scatter plot without correlation and with a color bar:
        gs.scatter_plot(x, y, nmax=2000, stat_blk=False, cbar=True, figsize=(5, 5))

        # Scatter plot with the a constant color, transparency and all statistics
        # Also locate the statistics where they are better seen
        gs.scatter_plot(x, y, c='k', alpha=0.2, nmax=2000, stat_blk='all', stat_xy=(.95, .95),
                        figsize=(5, 5))
    '''
    # Import packages
    from scipy.stats import gaussian_kde
    import pygeostat as gs
    from .utils import _set_stat_fontsize
    # Figure out the plotting axes
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize)
    # Labels if present (must be extracted before x and y are converted to numpy arrays)
    if xlabel is None:
        xlabel = gs.get_label(x)
    if ylabel is None:
        ylabel = gs.get_label(y)
    # Check the input data
    if isinstance(x, (pd.DataFrame, pd.Series)):
        x = x.values
    if x.ndim > 1:
        raise ValueError('x should be one-dimension!')
    if isinstance(y, (pd.DataFrame, pd.Series)):
        y = y.values
    if y.shape != x.shape:
        raise ValueError('x and y should be the same shape!')
    # Check the weights
    if isinstance(wt, (pd.DataFrame, pd.Series)):
        wt = wt.values
    elif wt is None:
        wt = np.ones(x.shape)
    if wt.shape != x.shape:
        raise ValueError('x, y and wt should be the same shape!')
    # Remove non-finite values if present. Note that np.logical_and only combines two arrays
    # (a third positional argument is the output buffer), so the masks are chained with `&`.
    idx = np.isfinite(x) & np.isfinite(y) & np.isfinite(wt)
    x, y, wt = x[idx], y[idx], wt[idx]
    # Draw a random sub-sample if required. The boolean indexing above already produced new
    # arrays, and xplot/yplot are never modified in place, so no further copy is needed.
    xplot, yplot = x, y
    idx1 = None
    if isinstance(nmax, (int, np.integer)) and len(xplot) > nmax:
        idx1 = np.random.randint(0, len(xplot), nmax)
        xplot, yplot = xplot[idx1], yplot[idx1]
    # There's probably a lot of edge cases to this testing that are not yet
    # handled
    if isinstance(c, (pd.DataFrame, pd.Series)):
        if cbar_label is None:
            cbar_label = gs.get_label(c)
        c = c.values
    if isinstance(c, np.ndarray):
        # Apply the same filtering and sub-sampling as was applied to x and y
        c = c[idx]
        if idx1 is not None:
            c = c[idx1]
    # Calculate kernel density estimate at data locations if necessary
    if c is None:
        c = Parameters['plotting.scatter_plot.c']
    kde = False
    if isinstance(c, str):
        if c.lower()[:3] == 'kde':
            pval = c.lower()[3:]
            # Points are colored based on KDE, estimated in log space for log axes
            if logy:
                ykde = yplot.copy()
                ykde[ykde <= 0] = Parameters['plotting.log_lowerval']
                ykde = np.log(ykde)
            else:
                ykde = yplot
            if logx:
                xkde = xplot.copy()
                xkde[xkde <= 0] = Parameters['plotting.log_lowerval']
                xkde = np.log(xkde)
            else:
                xkde = xplot
            xy = np.stack((xkde, ykde), axis=1)
            density = gaussian_kde(xy.T)
            c = density.evaluate(xy.T)
            # Rescale the density to [0, 1]
            c = (c - c.min()) / (c.max() - c.min())
            if len(pval) > 0:
                # An optional percentile (e.g. 'kdep95') clips the upper density values
                try:
                    ipval = int(pval[1:] if pval.startswith('p') else pval)
                except ValueError as err:
                    raise ValueError('Could not interpret {} as a kde percentile!'.
                                     format(pval)) from err
                if not 1 <= ipval <= 100:
                    raise ValueError('kde percentiles must be 1 <= p <= 100 ')
                ipval -= 1
                cdfx, cdfy = gs.cdf(c, bins=101)
                clipval = np.interp(ipval / 100, cdfy, cdfx)
                c[c > clipval] = clipval
            kde = True
        else:
            # A single named color: there is nothing to show in a colorbar
            cbar = False
    # Draw parameters from Parameters if necessary
    if s is None:
        s = Parameters['plotting.scatter_plot.s']
    if alpha is None:
        alpha = Parameters['plotting.scatter_plot.alpha']
    if stat_blk is None:
        stat_blk = Parameters['plotting.scatter_plot.stat_blk']
    if roundstats is None:
        roundstats = Parameters['plotting.roundstats']
    if sigfigs is None:
        sigfigs = Parameters['plotting.sigfigs']
    # Set-up some parameters. Colors are only mapped through a colormap if c provides one
    # value per plotted point.
    ticklocs = ticklabels = None
    color_mapped = (c is not None and not isinstance(c, str) and np.ndim(c) > 0
                    and len(c) == xplot.shape[0])
    if not color_mapped:
        cmap = False
        cbar = False
    elif cmap is None:
        cmap = Parameters['plotting.scatter_plot.cmap']
    if cmap is not False:
        clim, ticklocs, ticklabels = gs.get_contcbarargs(c, sigfigs, clim)
    if clim is None:
        clim = (None, None)
    # Set-up plot if no axis is supplied using the ImageGrid method if required or the regular way
    cax = None
    fig, ax, cax = gs.setup_plot(ax, cax=cax, cbar=cbar, figsize=figsize)
    # Scatter - Matplotlib uses its default size/color if s/c are None
    plot = ax.scatter(xplot, yplot, s=s, c=c, alpha=alpha, label=label,
                      cmap=None if cmap is False else cmap,
                      vmin=clim[0], vmax=clim[1], **kwargs)
    # Setup the colorbar if required
    if cbar:
        if kde:
            if clim[0] is not None and clim[1] is not None:
                ticklocs = np.linspace(clim[0], clim[1], 3)
            else:
                ticklocs = [0, 0.5, 1]
            ticklabels = ['Low', 'Med.', 'High']
            cbar_label = 'Kernel Density Estimate'
        cbar = fig.colorbar(plot, cax=cax, ticks=ticklocs)
        # Configure the color bar
        if ticklabels is not None:
            cbar.ax.set_yticklabels(ticklabels, ha='left')
        cbar.ax.tick_params(axis='y', pad=2)
        if cbar_label is not None:
            cbar.set_label(cbar_label, ha='center', va='top', labelpad=2)
    # Set the axis extents
    if xlim is None:
        xlim = (np.min(x), np.max(x))
    if ylim is None:
        ylim = (np.min(y), np.max(y))
    # A lower limit of exactly zero is replaced with a small positive value for log axes
    if logx and xlim[0] <= 0:
        if xlim[0] == 0:
            xlim = [Parameters['plotting.log_lowerval'], xlim[1]]
        else:
            raise ValueError('ERROR: invalid xlim for a log x-axis!')
    if logy and ylim[0] <= 0:
        if ylim[0] == 0:
            ylim = [Parameters['plotting.log_lowerval'], ylim[1]]
        else:
            raise ValueError('ERROR: invalid ylim for a log y-axis!')
    # Set the formatting attributes
    gs.format_plot(ax, xlabel, ylabel, title, grid, axis_xy, xlim, ylim, logx, logy)
    # Setup the correlation
    if stat_blk:
        stats = ['pearson', 'spearmanr', 'count', 'noweightflag']
        # Error checking and conversion to a list of stats
        if isinstance(stat_blk, str):
            if stat_blk == 'all':
                stat_blk = stats[:-1]
            else:
                stat_blk = [stat_blk]
        elif isinstance(stat_blk, tuple):
            stat_blk = list(stat_blk)
        if not isinstance(stat_blk, list):
            raise ValueError('invalid stat_blk')
        # 'spearman' is accepted as an alias of 'spearmanr' (builds a new list, so that a
        # list passed by the caller is not modified)
        stat_blk = ['spearmanr' if stat == 'spearman' else stat for stat in stat_blk]
        for stat in stat_blk:
            if stat not in stats:
                raise ValueError('invalid stat_blk')

        def _round(value):
            """Round to a number of digits or to a number of significant figures"""
            if roundstats:
                return round(value, sigfigs)
            return gs.round_sigfig(value, sigfigs)

        # Build the txtstats, one line per statistic
        lines = []
        if 'count' in stat_blk:
            lines.append(r'$n = $' + str(x.shape[0]))
        if 'pearson' in stat_blk:
            lines.append(r'$\rho = $' + str(_round(gs.weighted_correlation(x, y, wt))))
        if 'spearmanr' in stat_blk:
            lines.append(r'$\rho_s = $' + str(_round(gs.weighted_correlation_rank(x, y, wt))))
        txtstats = '\n'.join(lines)
        # Note if weights were used
        if len(np.unique(wt)) > 1 and 'noweightflag' not in stat_blk:
            txtstats = txtstats + '\n\nweights used'
        # Sort the location and font size
        if stat_xy is None:
            stat_xy = Parameters['plotting.scatter_plot.stat_xy']
        if stat_ha is None:
            stat_ha = Parameters['plotting.stat_ha']
        # Anchor the text so that it grows away from the nearest horizontal axis
        va = 'top' if stat_xy[1] > 0.5 else 'bottom'
        stat_fontsize = _set_stat_fontsize(stat_fontsize)
        # Draw to plot
        ax.text(stat_xy[0], stat_xy[1], txtstats, va=va, ha=stat_ha, transform=ax.transAxes,
                fontsize=stat_fontsize, linespacing=0.8)
    # Handle dictionary defaults
    if out_kws is None:
        out_kws = dict()
    if output_file or ('pdfpages' in out_kws):
        gs.export_image(output_file, **out_kws)
    if return_plot:
        return ax, plot
    return ax
def plot(self, c='k', cmap=None, catdata=None, ax=None, cax=None, figsize=None, s=15, lw=0.1,
         cbar=None, grid=True, legend_loc='lower right', title=None, vlim=None,
         legstr='Cluster', xlabel=None, ylabel=None):
    """
    Scatter plot of the first two columns of ``self.coords``, colored either by category
    (with a legend) or by a continuous variable (with an optional colorbar).

    Parameters
    ----------
    c : str, ndarray
        a single color or a ndata-long array of colors
    cmap : str, dict
        Either a mpl compatible cmap string, or if `catdata` a dictionary of {k: color} mapping
        each category to a specific color
    catdata : bool, dict
        If a dictionary is passed, the mapping {k: name} is expected. If None and `c` is not a
        string with 12 or fewer unique values, `c` is treated as categorical.
    ax, cax, figsize, cbar :
        passed to ``gs.setup_plot``; `cbar` also controls if a colorbar is drawn for
        continuous data
    s, lw :
        marker size and marker edge line width passed to matplotlib's scatter
    grid :
        passed to ``ax.grid``
    legend_loc : str, tuple
        legend location (categorical data only). A string is used as the ``loc`` of the
        legend, whereas a tuple is used as ``bbox_to_anchor`` with ``loc='upper left'``.
    title : str
        title of the plot, if provided
    vlim : tuple
        color limits for continuous data. If None and `c` is an array, the minimum of `c`
        and the value at index 95 of the 100-bin CDF returned by ``gs.cdf`` are used.
    legstr : str
        prefix of the legend labels, used if `catdata` is not a dictionary
    xlabel, ylabel : str
        axis labels, which default to '$MDS_1$' and '$MDS_2$'

    Returns
    -------
    ax : the matplotlib axis that was drawn on

    Raises
    ------
    ImportError
        if pygeostat cannot be imported
    """
    colors = c
    try:
        import pygeostat as gs
    except ImportError as err:
        raise ImportError("ERROR: this function requires pygeostat!") from err
    coords = self.coords
    # setup the figure
    fig, ax, cax = gs.setup_plot(ax, cbar=cbar, cax=cax, figsize=figsize)
    # deal with non-array input
    if hasattr(colors, 'values'):
        colors = colors.values
    if catdata is None and not isinstance(colors, str) and len(np.unique(colors)) <= 12:
        catdata = True
    if catdata:
        # plot categories, one scatter call per category so that each gets a legend entry
        catdict = catdata if isinstance(catdata, dict) else None
        ucolors = np.unique(colors)
        cmap = cmap_handling(cmap, len(ucolors), catdict)
        for i, ucolor in enumerate(ucolors):
            if catdict is None:
                label = '{} {}'.format(legstr, ucolor)
            else:
                label = catdict[ucolor]
            idx = colors == ucolor
            ax.scatter(coords[idx, 0], coords[idx, 1], c=cmap[i], s=s, lw=lw, label=label,
                       zorder=10)
        if isinstance(legend_loc, str):
            ax.legend(loc=legend_loc, scatterpoints=1, handletextpad=0.05)
        elif isinstance(legend_loc, tuple):
            ax.legend(loc='upper left', bbox_to_anchor=legend_loc, scatterpoints=1,
                      handletextpad=0.05)
    else:
        # plot continuous data with a colorbar
        # The default color limits are only needed (and only computable) for continuous
        # data, so they are determined here rather than before the categorical check
        if vlim is None:
            if colors is not None and not isinstance(colors, str):
                # if vlim is `None` get the 95 percentile as the max
                vlim = (np.min(colors), gs.cdf(colors, bins=100)[0][95])
            else:
                vlim = (None, None)
        mappable = ax.scatter(coords[:, 0], coords[:, 1], c=colors, s=s, lw=lw, cmap=cmap,
                              vmin=vlim[0], vmax=vlim[1], zorder=10)
        if cbar:
            vlim, ticklocs, ticklabels = gs.get_contcbarargs(colors, 2, vlim, nticks=8)
            colorbar = fig.colorbar(mappable, cax=cax, ticks=ticklocs)
            colorbar.ax.set_yticklabels(ticklabels, ha='left')
    ax.grid(grid)
    ax.set_ylabel('$MDS_2$' if ylabel is None else ylabel)
    ax.set_xlabel('$MDS_1$' if xlabel is None else xlabel)
    if title:
        ax.set_title(title)
    return ax
def histogram_plot(data, var=None, weights=None, cat=None, catdict=None, bins=None, icdf=False, lower=None, upper=None, ax=None, figsize=None, xlim=None, ylim=None, title=None, xlabel=None, stat_blk=None, stat_xy=None, stat_ha=None, roundstats=None, sigfigs=None, color=None, edgecolor=None, edgeweights=None, grid=None, axis_xy=None, label_count=False, rotateticks=None, plot_style=None, custom_style=None, output_file=None, out_kws=None, stat_fontsize=None, stat_linespacing=None, logx=False, **kwargs): """ Generates a matplotlib style histogram with summary statistics. Trimming is now only applied to NaN values (Pygeostat null standard). The only required required parameter is ``data``. If ``xlabel`` is left to its default value of ``None`` and the input data is contained in a pandas dataframe or series, the column information will be used to label the x-axis. Two statistics block sets are available: ``'all'`` and the default ``'minimal'``. The statistics block can be customized to a user defined list and order. Available statistics are as follows: >>> ['count', 'mean', 'stdev', 'cvar', 'max', 'upquart', 'median', 'lowquart', 'min', ... 'p10', 'p90'] The way in which the values within the statistics block are rounded and displayed can be controlled using the parameters ``roundstats`` and ``sigfigs``. Please review the documentation of the :func:`gs.set_style() <pygeostat.plotting.set_style.set_style>` and :func:`gs.export_image() <pygeostat.plotting.export_image.export_image>` functions for details on their parameters so that their use in this function can be understood. Parameters: data (np.ndarray, pd.DataFrame/Series, or gs.DataFile): data array, which must be 1D unless var is provided. The exception being a DataFile, if data.variables is a single name. var (str): name of the variable in data, which is required if data is not 1D. weights (np.ndarray, pd.DataFrame/Series, or gs.DataFile or str): 1D array of declustering weights for the data. 
Alternatively the declustering weights name in var. If data is a DataFile, it may be string in data.columns, or True to use data.weights (if data.weights is not None). cat (bool or str): either a cat column in data.data, or if True uses data.cat if data.cat is not None catdict (dict or bool): overrides bins. If a categorical variable is being plotted, provide a dictionary where keys are numeric (categorical codes) and values are their associated labels (categorical names). The bins will be set so that the left edge (and associated label) of each bar is inclusive to each category. May also be set to True, if data is a DataFile and data.catdict is initialized. bins (int or list): Number of bins to use, or a list of bins icdf (bool): Indicator to plot a CDF or not lower (float): Lower limit for histogram upper (float): Upper limit for histogram ax (mpl.axis): Matplotlib axis to plot the figure figsize (tuple): Figure size (width, height) xlim (float tuple): Minimum and maximum limits of data along the x axis ylim (float tuple): Minimum and maximum limits of data along the y axis title (str): Title for the plot xlabel (str): X-axis label stat_blk (bool): Indicate if statistics are plotted or not stat_xy (float tuple): X, Y coordinates of the annotated statistics in figure space. Based on Parameters['plotting.histogram_plot.stat_xy'] if a histogram and Parameters['plotting.histogram_plot.stat_xy'] if a CDF, which defaults to the top right when a PDF is plotted and the bottom right if a CDF is plotted. stat_ha (str): Horizontal alignment parameter for the annotated statistics. Can be ``'right'``, ``'left'``, or ``'center'``. If None, based on Parameters['plotting.stat_ha'] stat_fontsize (float): the fontsize for the statistics block. If None, based on Parameters['plotting.stat_fontsize']. If less than 1, it is the fraction of the matplotlib.rcParams['font.size']. If greater than 1, it the absolute font size. 
roundstats (bool): Indicate if the statistics should be rounded to the number of digits or to a number of significant figures (e.g., 0.000 vs. 1.14e-5). The number of digits or figures used is set by the parameter ``sigfigs``. sigfigs (int): Number of significant figures or number of digits (depending on ``roundstats``) to display for the float statistics color (str or int or dict): Any permissible matplotlib color or a integer which is used to draw a color from the pygeostat color pallet ``pallet_pastel``> May also be a dictionary of colors, which are used for each bar (useful for categories). colors.keys() must align with bins[:-1] if a dictionary is passed. Drawn from Parameters['plotting.cmap_cat'] if catdict is used and their keys align. edgecolor (str): Any permissible matplotlib color for the edge of a histogram bar grid(bool): plots the major grid lines if True. Based on Parameters['plotting.grid'] if None. axis_xy (bool): converts the axis to GSLIB-style axis visibility (only left and bottom visible) if axis_xy is True. Based on Parameters['plotting.axis_xy'] if None. label_count (bool): label the number of samples found for each category in catdict. Does nothing if no catdict is found rotateticks (bool tuple): Indicate if the axis tick labels should be rotated (x, y) plot_style (str): Use a predefined set of matplotlib plotting parameters as specified by :class:`gs.GridDef <pygeostat.data.grid_definition.GridDef>`. Use ``False`` or ``None`` to turn it off custom_style (dict): Alter some of the predefined parameters in the ``plot_style`` selected. output_file (str): Output figure file name and location out_kws (dict): Optional dictionary of permissible keyword arguments to pass to :func:`gs.export_image() <pygeostat.plotting.export_image.export_image>` **kwargs: Optional permissible keyword arguments to pass to either: (1) matplotlib's hist function if a PDF is plotted or (2) matplotlib's plot function if a CDF is plotted. 
Returns: ax (ax): matplotlib Axes object with the histogram **Examples:** A simple call: .. plot:: import pygeostat as gs # load some data dfl = gs.ExampleData("point3d_ind_mv") # plot the histogram_plot gs.histogram_plot(dfl, var="Phi", bins=30) | Change the colour, number of significant figures displayed in the statistics, and pass some keyword arguments to matplotlibs hist function: .. plot:: import pygeostat as gs # load some data dfl = gs.ExampleData("point3d_ind_mv") # plot the histogram_plot gs.histogram_plot(dfl, var="Phi", color='#c2e1e5', sigfigs=5, log=True, density=True) | Plot a CDF while also displaying all available statistics, which have been shifted up: .. plot:: import pygeostat as gs # load some data dfl = gs.ExampleData("point3d_ind_mv") # plot the histogram_plot gs.histogram_plot(dfl, var="Phi", icdf=True, stat_blk='all', stat_xy=(1, 0.75)) # Change the CDF line colour by grabbing the 3rd colour from the color pallet # ``cat_vibrant`` and increase its width by passing a keyword argument to matplotlib's # plot function. Also define a custom statistics block: gs.histogram_plot(dfl, var="Phi", icdf=True, color=3, lw=3.5, stat_blk=['count','upquart']) | Generate histograms of Phi considering the categories: .. plot:: import pygeostat as gs # load some data dfl = gs.ExampleData("point3d_ind_mv") cats = [1, 2, 3, 4, 5] colors = gs.catcmapfromcontinuous("Spectral", 5).colors # build the required cat dictionaries dfl.catdict = {c: "RT {:02d}".format(c) for c in cats} colordict = {c: colors[i] for i, c in enumerate(cats)} # plot the histogram_plot f, axs = plt.subplots(2, 1, figsize=(8, 6)) for var, ax in zip(["Phi", "Sw"], axs): gs.histogram_plot(dfl, var=var, cat=True, color=colordict, bins=40, figsize=(8, 4), ax=ax, xlabel=False, title=var) | Generate cdf subplots considering the categories: .. 
plot:: import pygeostat as gs # load some data dfl = gs.ExampleData("point3d_ind_mv") cats = [1, 2, 3, 4, 5] colors = gs.catcmapfromcontinuous("Spectral", 5).colors # build the required cat dictionaries dfl.catdict = {c: "RT {:02d}".format(c) for c in cats} colordict = {c: colors[i] for i, c in enumerate(cats)} # plot the histogram_plot f, axs = plt.subplots(2, 2, figsize=(12, 9)) axs=axs.flatten() for var, ax in zip(dfl.variables, axs): gs.histogram_plot(dfl, var=var, icdf=True, cat=True, color=colordict, ax=ax) Recreate the `Proportion` class plot .. plot:: import pygeostat as gs # load some data dfl = gs.ExampleData("point3d_ind_mv") cats = [1, 2, 3, 4, 5] colors = gs.catcmapfromcontinuous("Spectral", 5).colors # build the required cat dictionaries dfl.catdict = {c: "RT {:02d}".format(c) for c in cats} colordict = {c: colors[i] for i, c in enumerate(cats)} # plot the histogram_plot ax = gs.histogram_plot(dfl, cat=True, color=colordict, figsize=(7, 4), rotateticks=(45, 0), label_count=True) """ import pygeostat as gs from .utils import format_plot, _set_stat_fontsize, _format_grid, _format_tick_labels, setup_plot, catcmapfromcontinuous from .cmaps import _cat_pastel_data, _cat_vibrant_data import copy # Now converting to a numpy array, as encountering some odd pandas performance, and there's # no major disadvantagve to application of a numpy in this context to my knowledge - RMB # If a list is passed convert it to a series so that trimming can take place # weights if isinstance(weights, str): if isinstance(data, pd.DataFrame) or isinstance(data, gs.DataFile): weights = data[weights] elif isinstance(weights, bool): if weights: if isinstance(data, gs.DataFile): if data.weights is None: raise ValueError('weights=True but data.weights is None!') elif isinstance(data.weights, list): raise ValueError( 'weights=True but data.weights is a list!') weights = data[data.weights].values else: raise ValueError( 'weights=True is only valid if data is a DataFile!') else: weights 
= None if isinstance(weights, pd.Series) or isinstance(weights, pd.DataFrame): weights = weights.values # cats for continuous histogram_plots if isinstance(cat, str): if isinstance(data, pd.DataFrame) or isinstance(data, gs.DataFile): cat = data[cat] elif isinstance(cat, bool): if cat: if isinstance(data, gs.DataFile): if data.cat is None: raise ValueError('cat=True but data.cat is None!') cat = data[data.cat].values if catdict is None and data.catdict is None: raise ValueError("pass a `catdict` when setting `cat`") else: catdict = data.catdict else: raise ValueError( 'cat=True is only valid if data is a DataFile!') else: cat = None if isinstance(cat, pd.Series) or isinstance(cat, pd.DataFrame): cat = cat.values # Handle categorical dictionary if isinstance(catdict, bool): if catdict: if not isinstance(data, gs.DataFile): raise ValueError( 'catdict as a bool is only valid if data is a DataFile!') if data.catdict is None: raise ValueError( 'catdict as a bool is only valid if data is not None!') catdict = data.catdict # Variable # Handle data that is 2-D and/or a DataFile if isinstance(var, str): if isinstance(data, pd.DataFrame) or isinstance(data, gs.DataFile): if isinstance(cat, str): cat = data[cat] data = data[var] else: raise ValueError( 'var as a string is only valid if data is a DataFile or DataFrame!' 
) elif isinstance(data, gs.DataFile): if isinstance(data.variables, str): data = data[data.variables] elif cat is not None: if isinstance(cat, str): data = data[cat] elif var is None and isinstance(cat, (np.ndarray, list)): data = cat elif len(data.columns) == 1: data = data.data else: raise ValueError( 'Could not coerce data (DataFile) into a 1D dataset!') # Get the xlabel if possible before converting to a numpy array if isinstance(data, pd.Series) or isinstance(data, pd.DataFrame): if xlabel is None: xlabel = gs.get_label(data) data = data.values elif isinstance(data, list): data = np.array(data) if isinstance(cat, (pd.Series, pd.DataFrame)): cat = cat.values # Should be numpy by now... if data.ndim > 1: if data.shape[1] > 1: raise ValueError('Could not coerce data into a 1D dataset!') else: data = data.flatten() # Handle Null values if needed idx = np.isnan(data) nullcnt = np.sum(idx) if nullcnt > 0: data = data[~idx] if weights is not None: weights = weights[~idx] if cat is not None: cat = cat[~idx] # Handle dictionary defaults if out_kws is None: out_kws = dict() # Set-up plot if no axis is supplied _, ax, _ = setup_plot(ax, figsize=figsize, aspect=False) # Infer some default parameters if weights is None: weights = np.ones(len(data)) / len(data) else: weights = weights / np.sum(weights) # Some quick error checks assert (np.all(weights) >= 0.0), 'weights less than 0 not valid' # Categories if isinstance(catdict, dict) and var is None: if not all([isinstance(float(i), float) for i in catdict.keys()]): raise ValueError( 'if catdict is dict., all keys should be an int/float!') # The bins are set to begin at the start of each category # bins go from 0.5 to (icat + 1) + 0.5 # label is centered at (icat + 1) bins = np.arange(len(catdict) + 1) + 0.5 if color is None and isinstance(catdict, dict): # Color each bin by the category color? 
if isinstance(Parameters['plotting.cmap_cat'], dict): temp = Parameters['plotting.cmap_cat'] if list(sorted(temp.keys())) == list(sorted(catdict.keys())): color = temp else: color = catcmapfromcontinuous(Parameters["plotting.cmap"], len(catdict)).colors if isinstance(color, dict): if list(sorted(color.keys())) != list(sorted(catdict.keys())): raise ValueError(('if color is a dictionary, keys must align with ' 'bins[:-1]! Consider using a single color.')) temp = color color = [] for _, v in sorted(temp.items()): color.append(v) # Color setup if isinstance(color, int): # Grab a color from ``cat_vibrant`` if an integer is passed color = _cat_pastel_data[color % len(_cat_vibrant_data)] if not icdf: if color is None: color = Parameters['plotting.histogram_plot.facecolor'] if edgecolor is None: edgecolor = Parameters['plotting.histogram_plot.edgecolor'] if edgeweights is None: if "lw" in kwargs: edgeweights = kwargs.pop("lw") else: edgeweights = Parameters["plotting.histogram_plot.edgeweight"] else: if color is None and icdf: color = Parameters['plotting.histogram_plot.cdfcolor'] plotdata = copy.deepcopy(data) plotweights = copy.deepcopy(weights) if xlim is not None: plotdata[data < xlim[0]] = xlim[0] plotdata[data > xlim[1]] = xlim[1] # Main plotting if icdf: def singlecdf(ax, data, weights, lower, upper, bins, color, label=None, **kwargs): """ local function to plot a single cdf """ cdf_x, cdfvals = gs.cdf(data, weights=weights, lower=lower, upper=upper, bins=bins) # Matplotlib is a memory hog if to many points are used. Limit the number of points the CDF # is build with to 1000. The tails are given extra attention to make sure they are defined # nicely. 
if len(cdf_x) > 1000: cdfinterp = scipy.interpolate.interp1d(x=cdfvals, y=cdf_x) cdfvals = np.concatenate([ np.arange(cdfvals.min(), 0.1, 0.001), np.arange(0.1, 0.9, 0.01), np.arange(0.9, cdfvals.max(), 0.001) ]) cdf_x = [] for val in cdfvals: cdf_x.append(cdfinterp(val)) cdf_x = np.array(cdf_x) fig = ax.plot(cdf_x, cdfvals, color=color, label=label, **kwargs) return fig if catdict is not None: if var is not None: stat_blk = False for icat, c in enumerate(catdict): clr = color[icat] catidx = cat == c fig = singlecdf(ax, plotdata[catidx], plotweights[catidx], lower, upper, bins, clr, label=catdict[c], **kwargs) else: raise ValueError( "`icdf=True` and `catdict` only makes sense with a `var` defined" ) else: fig = singlecdf(ax, plotdata, plotweights, lower, upper, bins, color, **kwargs) if ylim is None: ylim = (0, 1.0) else: if bins is None: bins = Parameters['plotting.histogram_plot.histbins'] label = kwargs.pop("label", None) if bins is None: if len(plotdata) < 200: bins = 20 elif len(plotdata) < 500: bins = 25 else: bins = 30 if logx: if catdict is not None: raise ValueError('Cannot have logx with catdict!') if xlim is None: minv = np.log10(max(plotdata.min(), 1e-10)) maxv = np.log10(plotdata.max()) else: minv = np.log10(max(xlim[0], 1e-10)) maxv = np.log10(xlim[1]) if np.isnan([minv, maxv]).any(): raise ValueError( 'ERROR converting your data to log base! are there negatives?' 
) bins = np.logspace(minv, maxv, bins) if catdict is not None: if var is None: for icat, cat in enumerate(catdict): plotdata[data == cat] = icat + 1 histclr = None else: # generate lists of data per cat plotdata = [plotdata[cat == c] for c in catdict] plotweights = [weights[cat == c] for c in catdict] label = list(catdict.values()) histtype = kwargs.pop("histtype", "stepfilled") stat_blk = False if "stacked" not in kwargs: kwargs["stacked"] = True histclr = color histtype = kwargs.pop("histtype", "bar") if not isinstance(color, list): ax.hist(plotdata, bins, weights=plotweights, color=color, edgecolor=edgecolor, histtype=histtype, label=label, lw=edgeweights, **kwargs) else: _, _, patches = ax.hist(plotdata, bins, weights=plotweights, histtype=histtype, color=histclr, edgecolor=edgecolor, label=label, lw=edgeweights, **kwargs) try: for patch, clr in zip(patches, color): patch.set_facecolor(clr) except (AttributeError, ValueError): pass if catdict is not None and label_count: nd = len(data) for icat, cat in enumerate(catdict): count = np.count_nonzero(data == cat) pcat = (weights * (data == cat).astype(float)).sum() ax.text(icat + 1, pcat, count, ha="center", va="bottom") # Summary stats if stat_blk is None: stat_blk = Parameters['plotting.histogram_plot.stat_blk'] if stat_xy is None: if icdf: stat_xy = Parameters['plotting.histogram_plot.stat_xy_cdf'] else: stat_xy = Parameters['plotting.histogram_plot.stat_xy'] if stat_blk: if sigfigs is None: sigfigs = Parameters['plotting.sigfigs'] if roundstats is None: roundstats = Parameters['plotting.roundstats'] if stat_ha is None: stat_ha = Parameters['plotting.stat_ha'] if stat_linespacing is None: stat_linespacing = Parameters['plotting.stat_linespacing'] if stat_linespacing is None: stat_linespacing = 1.0 # Force no bins and upper/lower for median cdf_x, cdfvals = gs.cdf(data, weights=weights) # Currently defined statistics, possible to add more quite simply if np.mean(data) == 0: cdata = float("nan") elif roundstats: 
cdata = round((np.std(data) / np.mean(data)), sigfigs) else: cdata = gs.round_sigfig((np.std(data) / np.mean(data)), sigfigs) if roundstats: mean = round(gs.weighted_mean(data, weights), sigfigs) median = round(gs.percentile_from_cdf(cdf_x, cdfvals, 50.0), sigfigs) stdev = round(np.sqrt(gs.weighted_variance(data, weights)), sigfigs) minval = round(np.min(data), sigfigs) maxval = round(np.max(data), sigfigs) upquart = round(np.percentile(data, 75), sigfigs) lowquart = round(np.percentile(data, 25), sigfigs) p10 = round(np.percentile(data, 10), sigfigs) p90 = round(np.percentile(data, 90), sigfigs) else: mean = gs.round_sigfig(gs.weighted_mean(data, weights), sigfigs) median = gs.round_sigfig( gs.percentile_from_cdf(cdf_x, cdfvals, 50.0), sigfigs) stdev = gs.round_sigfig( np.sqrt(gs.weighted_variance(data, weights)), sigfigs) minval = gs.round_sigfig(np.min(data), sigfigs) maxval = gs.round_sigfig(np.max(data), sigfigs) upquart = gs.round_sigfig(np.percentile(data, 75), sigfigs) lowquart = gs.round_sigfig(np.percentile(data, 25), sigfigs) p10 = gs.round_sigfig(np.percentile(data, 10), sigfigs) p90 = gs.round_sigfig(np.percentile(data, 90), sigfigs) statistics = { 'mean': (r'$m = %g$' % mean), 'median': (r'$x_{{50}} = %g$' % median), 'count': ('$n = %i$' % len(data)), 'count_trimmed': ('$n_{trim} = %i$' % nullcnt), 'stdev': (r'$\sigma = %g$' % stdev), 'cvar': ('$CV = %g$' % cdata), 'min': ('$x_{{min}} = %g$' % minval), 'max': ('$x_{{max}} = %g$' % maxval), 'upquart': ('$x_{{75}} = %g$' % upquart), 'lowquart': ('$x_{{25}} = %g$' % lowquart), 'p10': ('$x_{{10}} = %g$' % p10), 'p90': ('$x_{{90}} = %g$' % p90) } # Default statistic sets if stat_blk == 'varlabel' and 'label' in kwargs: statistics['varlabel'] = kwargs['label'] statsets = { 'minimal': ['count', 'mean', 'median', 'stdev'], 'all': [ 'count', 'mean', 'stdev', 'cvar', 'max', 'upquart', 'median', 'lowquart', 'min' ], 'varlabel': [ 'varlabel', 'count', 'mean', 'stdev', 'cvar', 'max', 'upquart', 'median', 
'lowquart', 'min' ], 'none': None } # Use a default statistic set if isinstance(stat_blk, bool) and stat_blk: stat_blk = 'all' if isinstance(stat_blk, str): if stat_blk in statsets: stat_blk = statsets[stat_blk] else: print('WARNING: stats value of: "' + stat_blk + '" does not exist - ' 'default to no stats') stat_blk = None # Use a supplied statistic set, but check for bad ones else: badstats = [s for s in stat_blk if s not in statistics] stat_blk = [s for s in stat_blk if s in statistics] for badstat in badstats: print('WARNING: stats value of: "' + badstat + '" does not exist - ' 'It was removed from summary statistics list') # Form the stats string if stat_blk: if nullcnt != 0: stat_blk.insert(stat_blk.index('count') + 1, 'count_trimmed') stat_blk = [statistics[s] for s in stat_blk] txtstats = '\n'.join(stat_blk) if len(np.unique(weights)) > 1: txtstats = txtstats + '\n\nweights used' if stat_xy[1] > 0.5: va = 'top' else: va = 'bottom' # Set the stat_fontsize stat_fontsize = _set_stat_fontsize(stat_fontsize) ax.text(stat_xy[0], stat_xy[1], txtstats, va=va, ha=stat_ha, transform=ax.transAxes, fontsize=stat_fontsize, linespacing=stat_linespacing) # Label as required if icdf: ylabel = 'Cumulative Distribution Function' elif 'density' in kwargs: ylabel = 'Probability Density Function (PDF)' else: ylabel = 'Frequency' ax = format_plot(ax, xlabel, ylabel, title, axis_xy=axis_xy, xlim=xlim, ylim=ylim, logx=logx) if catdict is not None and var is None: ticlocs = [i + 1 for i in range(len(catdict.keys()))] ax.set_xticks(ticlocs) ax.set_xticklabels(catdict.values()) ax.set_xlim(0.25, len(catdict) + 0.75) elif catdict is not None and var is not None: ax.legend() _format_tick_labels(ax, rotateticks) # format_plot doesn't handle some specialized axis_xy and grid requirements # for histogram_plot... 
if icdf: # Ensure that we have top spline, in case it was removed above ax.spines['top'].set_visible(True) _format_grid(ax, grid, below=False) else: # The grid should be below for a histogram _format_grid(ax, grid, below=True) # Export figure if output_file or ('pdfpages' in out_kws): gs.export_image(output_file, **out_kws) return ax
def accsim(truth, reals, pinc=0.05):
    """
    Calculates the proportion of locations where the true value falls within symmetric p-PI
    intervals when completing a jackknife study. A portion of the data is excluded from the
    conditioning dataset and the excluded sample locations simulated values are then checked.

    .. seealso::

        Pyrcz, M. J., & Deutsch, C. V. (2014). Geostatistical Reservoir Modeling (2nd ed.). New
        York, NY: Oxford University Press, p. 350-351.

    Arguments:
        truth: Tidy (long-form) 1D data where a single column contains the true values. A
            pandas dataframe/series or numpy array can be passed
        reals: Tidy (long-form) 2D data where a single column contains values from a single
            realization and each row contains the simulated values from a single truth
            location. A pandas dataframe or numpy array can be passed

    Keyword Arguments:
        pinc (float): Increments between the probability intervals to calculate within (0, 1)

    Returns:
        propavg (pd.DataFrame): Dataframe with the probability intervals (``ProbInt``) and the
        fraction of locations whose true value falls within each interval (``FracIn``)

        sumstats (dict): Dictionary with the keys ``avgvar`` (average of the per-location
        variance of the simulated values), ``mse`` (mean squared difference between
        ``ProbInt`` and ``FracIn``), ``acc`` (fraction of intervals where ``FracIn`` is at
        least ``ProbInt``), ``pre`` (one minus twice the average excess of ``FracIn`` over
        ``ProbInt``, counting only intervals where ``FracIn`` is at least ``ProbInt``), and
        ``goo`` (one minus the average penalty, where the penalty is the excess when
        ``FracIn`` is at least ``ProbInt`` and twice the shortfall otherwise)

    Raises:
        ValueError: If ``pinc`` is not within (0, 1), if ``truth`` or ``reals`` is not one of
            the accepted types, or if ``truth`` and ``reals`` cannot be combined column-wise.
    """
    import pandas as pd

    # An increment outside (0, 1) produces no probability intervals, which previously
    # surfaced as an unhelpful ZeroDivisionError further down.
    if not 0 < pinc < 1:
        raise ValueError("The argument `pinc` must be within (0, 1)")

    # Handle input
    if isinstance(truth, (pd.Series, pd.DataFrame)):
        truth = truth.values
    elif not isinstance(truth, np.ndarray):
        raise ValueError(
            "The argument `truth` must be a pd.DataFrame, pd.Series, or np.ndarray"
        )
    if truth.ndim == 1:
        # Make the truth a single column so it can be joined with the realizations
        truth = np.reshape(truth, (truth.shape[0], 1))
    if isinstance(reals, pd.DataFrame):
        reals = reals.values
    elif not isinstance(reals, np.ndarray):
        raise ValueError(
            "The argument `reals` must be a pd.DataFrame or np.ndarray")
    try:
        # Column 0 holds the truth, the remaining columns hold the realizations
        data = np.concatenate((truth, reals), axis=1)
    except (TypeError, ValueError) as err:
        raise ValueError(
            "The `truth` and `reals` data could not be coerced into a pd.DataFrame"
        ) from err

    # Imported only once the inputs are known to be usable
    import pygeostat as gs

    # Initialize some variables
    pints = np.arange(pinc, 1, pinc)
    propindic = {pint: [] for pint in pints}
    variances = []

    # Calculate the indicator responses and local variances
    for row in data:
        true_value = row[0]
        sims = row[1:]
        cdf_x, cdf_p = gs.cdf(sims)
        variances.append(np.var(sims))
        # The percentile of the truth within the local CDF does not depend on the interval
        # width, so it is evaluated once per location rather than once per interval.
        if cdf_x[0] <= true_value <= cdf_x[-1]:
            p = gs.z_percentile(true_value, cdf_x, cdf_p)
        else:
            # Truth outside the simulated range: it is never inside any interval
            p = None
        for pint in pints:
            plower = 0.5 - (pint / 2)
            pupper = 0.5 + (pint / 2)
            inside = p is not None and plower <= p <= pupper
            propindic[pint].append(1 if inside else 0)

    # Calculate the average proportions and average variance
    propavg = pd.DataFrame(
        [[pint, np.average(propindic[pint])] for pint in pints],
        columns=['ProbInt', 'FracIn'])

    # Calculate the summary statistics
    avgvar = np.average(variances)
    probint = propavg['ProbInt'].values
    fracin = propavg['FracIn'].values
    mse = ((probint - fracin)**2).mean()
    acc = 0
    pre = 0
    goo = 0
    # Iterate over the column arrays directly instead of positional `values[0]` lookups on a
    # string-labelled row Series, which pandas has deprecated.
    for expected, observed in zip(probint, fracin):
        if observed >= expected:
            acc = acc + 1
            pre = pre + (observed - expected)
            goo = goo + (observed - expected)
        else:
            goo = goo + (2 * (expected - observed))
    nint = len(propavg)
    acc = acc / nint
    pre = 1 - ((2 * pre) / nint)
    goo = 1 - (goo / nint)
    sumstats = {
        'avgvar': avgvar,
        'mse': mse,
        'acc': acc,
        'pre': pre,
        'goo': goo
    }
    return propavg, sumstats