Exemplo n.º 1
0
 def singlecdf(ax,
               data,
               weights,
               lower,
               upper,
               bins,
               color,
               label=None,
               **kwargs):
     """Plot a single CDF of ``data`` on ``ax``.

     The CDF is computed with ``gs.cdf`` using the supplied ``weights``, ``lower``,
     ``upper`` and ``bins``. If it has more than 1000 points it is resampled onto a
     coarser probability grid before plotting.

     Parameters:
         ax: Matplotlib axis to draw on.
         data: values passed to ``gs.cdf``.
         weights: weights passed to ``gs.cdf``.
         lower: lower limit passed to ``gs.cdf``.
         upper: upper limit passed to ``gs.cdf``.
         bins: bins argument passed to ``gs.cdf``.
         color: line color passed to ``ax.plot``.
         label: legend label passed to ``ax.plot``.
         **kwargs: additional keyword arguments passed to ``ax.plot``.

     Returns:
         The value returned by ``ax.plot``.
     """
     cdf_x, cdfvals = gs.cdf(data,
                             weights=weights,
                             lower=lower,
                             upper=upper,
                             bins=bins)
     # Matplotlib is a memory hog if too many points are used. Limit the number of points the
     # CDF is built with to roughly 1000. The tails (below 0.1 and above 0.9) use a step that
     # is ten times finer than the body so that they stay well defined.
     if len(cdf_x) > 1000:
         cdfinterp = scipy.interpolate.interp1d(x=cdfvals, y=cdf_x)
         pmin, pmax = cdfvals.min(), cdfvals.max()
         probs = np.concatenate([
             np.arange(pmin, 0.1, 0.001),
             np.arange(0.1, 0.9, 0.01),
             np.arange(0.9, pmax, 0.001),
             [pmax],  # np.arange excludes its stop value; keep the last CDF point
         ])
         # Clip to the interpolation domain so interp1d never sees an out-of-range
         # probability (e.g. when pmin > 0.1, or from floating point overshoot in arange),
         # then drop the duplicates that clipping may create.
         cdfvals = np.unique(np.clip(probs, pmin, pmax))
         # interp1d is vectorised: evaluate all probabilities in one call
         cdf_x = cdfinterp(cdfvals)
     fig = ax.plot(cdf_x, cdfvals, color=color, label=label, **kwargs)
     return fig
Exemplo n.º 2
0
def scatter_plot(x, y, wt=None, nmax=None, s=None, c=None, alpha=None, cmap=None, clim=None, cbar=False,
                 cbar_label=None, stat_blk=None, stat_xy=None, stat_ha=None, stat_fontsize=None,
                 roundstats=None, sigfigs=None, xlim=None, ylim=None, xlabel=None, ylabel=None,
                 output_file=None, out_kws=None,
                 title=None, grid=None, axis_xy=None, label='_nolegend_', ax=None, figsize=None,
                 return_plot=False, logx=None, logy=None, **kwargs):
    """
    Scatter plot that mimics the GSLIB scatter_plot program, providing summary statistics, kernel
    density estimate coloring, etc. NaN values are treated as null and removed from the plot and
    statistics.

    Parameters:
        x(np.ndarray or pd.Series): 1-D array with the variable to plot on the x-axis.
        y(np.ndarray or pd.Series): 1-D array with the variable to plot on the y-axis.

    Keyword arguments:
        wt(np.ndarray or pd.DataFrame): 1-D array with weights that are used in the calculation of
            displayed statistics.
        s(float or np.ndarray or pd.Series): size of each scatter point. Based on
            Parameters['plotting.scatter_plot.s'] if None.
        c(color or np.ndarray or pd.Series): color of each scatter point, as an array or valid
            Matplotlib color. Alternatively, 'KDE' may be specified to color each point according
            to its associated kernel density estimate. A percentile suffix (e.g., 'KDEp90' or
            'KDE90') clips the density colors at that percentile. Based on
            Parameters['plotting.scatter_plot.c'] if None.
        nmax (int): specify the maximum number of scatter points that should be displayed, which
            may be necessary due to the time-requirements of plotting many data. If specified,
            a nmax-length random sub-sample of the data is plotted. Note that this does not impact
            summary statistics.
        alpha(float): opacity of the scatter. Based on Parameters['plotting.scatter_plot.alpha'] if None.
        cmap (str): A matplotlib colormap object or a registered matplotlib colormap name. Based
            on Parameters['plotting.scatter_plot.cmap'] if None.
        clim (float tuple): Data minimum and maximum values
        cbar (bool): Indicate if a colorbar should be plotted or not
        cbar_label (str): Colorbar title
        stat_blk(str or list): statistics to place in the plot, which should be 'all' or
            a list that may contain ['count', 'pearson', 'spearmanr', 'noweightflag']
            ('spearman' is accepted as an alias of 'spearmanr'). Based on
            Parameters['plotting.scatter_plot.stat_blk'] if None. Set to False to disable.
        stat_xy (float tuple): X, Y coordinates of the annotated statistics in figure
            space. Based on Parameters['plotting.scatter_plot.stat_xy'] if None.
        stat_ha (str): Horizontal alignment parameter for the annotated statistics. Can be
            ``'right'``, ``'left'``, or ``'center'``. If None, based on
            Parameters['plotting.stat_ha']
        stat_fontsize (float): the fontsize for the statistics block. If None, based on
            Parameters['plotting.stat_fontsize']. If less than 1, it is the fraction of the
            matplotlib.rcParams['font.size']. If greater than 1, it the absolute font size.
        roundstats (bool): Indicate if the statistics should be rounded to the number of digits or
            to a number of significant figures (e.g., 0.000 vs. 1.14e-5). The number of digits or
            figures used is set by the parameter ``sigfigs``. Based on
            Parameters['plotting.roundstats'] if None.
        sigfigs (int): Number of significant figures or number of digits (depending on
            ``roundstats``) to display for the float statistics. Based on
            Parameters['plotting.sigfigs'] if None.
        xlim(tuple): x-axis limits - xlim[0] to xlim[1]. Based on the data if None
        ylim(tuple): y-axis limits - ylim[0] to ylim[1]. Based on the data if None.
        xlabel(str): label of the x-axis, extracted from x if None
        ylabel(str): label of the y-axis, extracted from y if None
        output_file (str): Output figure file name and location
        out_kws (dict): Optional dictionary of permissible keyword arguments to pass to
            :func:`gs.export_image() <pygeostat.plotting.export_image.export_image>`
        title(str): plot title
        grid(bool): plot grid lines in each panel? Based on Parameters['plotting.grid'] if None.
        axis_xy(bool): if True, mimic a GSLIB-style scatter_plot, where only the bottom and left axes
            lines are displayed. Based on Parameters['plotting.axis_xy'] if None.
        label(str): label of scatter for legend
        ax(Matplotlib axis handle): if None, create a new figure and axis handles
        figsize(tuple): size of the figure, if creating a new one when ax = None
        return_plot (bool): if True, also return the object returned by matplotlib's scatter
        logx, logy (str): permissible mpl axis scale, like `log`
        **kwargs: Optional permissible keyword arguments to pass to matplotlib's scatter function

    Return:
        ax(Matplotlib axis handle), or the tuple (ax, plot) if ``return_plot`` is True

    Raises:
        ValueError: if x is not one-dimensional, if x, y and wt differ in shape, if a KDE
            percentile cannot be interpreted, if the axis limits are invalid for a log axis, or
            if ``stat_blk`` is invalid.

    **Examples:**

    Basic scatter example:

    .. plot::

        import pygeostat as gs

        # Load the data
        data_file = gs.ExampleData('point3d_ind_mv')

        # Select a couple of variables
        x, y = data_file[data_file.variables[0]], data_file[data_file.variables[1]]

        # Scatter plot with default parameters
        gs.scatter_plot(x, y, figsize=(5, 5), cmap='hot')

        # Scatter plot without correlation and with a color bar:
        gs.scatter_plot(x, y, nmax=2000, stat_blk=False, cbar=True, figsize=(5, 5))

        # Scatter plot with the a constant color, transparency and all statistics
        # Also locate the statistics where they are better seen
        gs.scatter_plot(x, y, c='k', alpha=0.2, nmax=2000, stat_blk='all', stat_xy=(.95, .95),
                   figsize=(5, 5))
    """
    # Import packages
    from scipy.stats import gaussian_kde
    import pygeostat as gs
    from .utils import _set_stat_fontsize
    # Figure out the plotting axes
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize)
    # Labels if present (must be extracted before x and y are converted to arrays)
    if xlabel is None:
        xlabel = gs.get_label(x)
    if ylabel is None:
        ylabel = gs.get_label(y)
    # Check the input data
    if isinstance(x, (pd.DataFrame, pd.Series)):
        x = x.values
    if x.ndim > 1:
        raise ValueError('x should be one-dimension!')
    if isinstance(y, (pd.DataFrame, pd.Series)):
        y = y.values
    if y.shape != x.shape:
        raise ValueError('x and y should be the same shape!')
    # Check the weights
    if isinstance(wt, (pd.DataFrame, pd.Series)):
        wt = wt.values
    elif wt is None:
        wt = np.ones(x.shape)
    if wt.shape != x.shape:
        raise ValueError('x, y and wt should be the same shape!')
    # Remove nans if present. The three masks are combined with `&`: the third positional
    # argument of np.logical_and is `out`, so it cannot be used to combine three conditions.
    idx = np.isfinite(x) & np.isfinite(y) & np.isfinite(wt)
    x, y, wt = x[idx], y[idx], wt[idx]
    # Draw a random sub-sample if requested. idx1 defaults to every point so that it is always
    # defined when an array of colors has to be sub-sampled below.
    xplot, yplot = x.copy(), y.copy()
    idx1 = np.arange(len(xplot))
    if isinstance(nmax, (int, np.integer)) and len(xplot) > nmax:
        idx1 = np.random.randint(0, len(xplot), nmax)
        xplot = xplot[idx1]
        yplot = yplot[idx1]
    # There's probably a lot of edge cases to this testing that are not yet
    # handled
    if isinstance(c, (pd.DataFrame, pd.Series)):
        if cbar_label is None:
            cbar_label = gs.get_label(c)
        c = c.values
    if isinstance(c, np.ndarray):
        # Apply the same nan-filter and sub-sample that were applied to x and y
        c = c[idx]
        c = c[idx1]
    # Calculate kernel density estimate at data locations if necessary
    if c is None:
        c = Parameters['plotting.scatter_plot.c']
    kde = False
    if isinstance(c, str):
        if c.lower()[:3] == 'kde':
            pval = c.lower()[3:]
            # Points are colored based on KDE, evaluated in log space for log axes
            if logy:
                ykde = yplot.copy()
                ykde[ykde <= 0] = Parameters['plotting.log_lowerval']
                ykde = np.log(ykde)
            else:
                ykde = yplot
            if logx:
                xkde = xplot.copy()
                xkde[xkde <= 0] = Parameters['plotting.log_lowerval']
                xkde = np.log(xkde)
            else:
                xkde = xplot
            xy = np.stack((xkde, ykde), axis=1)
            density = gaussian_kde(xy.T)
            c = density.evaluate(xy.T)
            # Rescale the density to [0, 1]
            c = (c - c.min()) / (c.max() - c.min())
            if len(pval) > 0:
                try:
                    if pval.startswith('p'):
                        ipval = int(pval[1:])
                    else:
                        ipval = int(pval)
                except ValueError:
                    raise ValueError('Could not interpret {} as a kde percentile!'.
                                     format(pval))
                # Explicit check rather than assert, which is stripped under `python -O`
                if not 1 <= ipval <= 100:
                    raise ValueError('kde percentiles must be 1 <= p <= 100 ')
                ipval -= 1
                cdfx, cdfy = gs.cdf(c, bins=101)
                clipval = np.interp(ipval / 100, cdfy, cdfx)
                c[c > clipval] = clipval
            kde = True
        else:
            # A single named color cannot be described by a colorbar
            cbar = False
    # Draw parameters from Parameters if necessary
    if s is None:
        s = Parameters['plotting.scatter_plot.s']
    if alpha is None:
        alpha = Parameters['plotting.scatter_plot.alpha']
    if stat_blk is None:
        stat_blk = Parameters['plotting.scatter_plot.stat_blk']
    if roundstats is None:
        roundstats = Parameters['plotting.roundstats']
    if sigfigs is None:
        sigfigs = Parameters['plotting.sigfigs']
    # Set-up some parameters. Color mapping only applies when c holds one value per plotted
    # point; a missing color or a color string is never color mapped.
    if c is None or isinstance(c, str) or len(c) != xplot.shape[0]:
        cmap = False
    elif cmap is None:
        cmap = Parameters['plotting.scatter_plot.cmap']
    if cmap is not False:
        clim, ticklocs, ticklabels = gs.get_contcbarargs(c, sigfigs, clim)
    if clim is None:
        clim = (None, None)
    # Set-up plot if no axis is supplied using the ImageGrid method if required or the regular way
    cax = None
    fig, ax, cax = gs.setup_plot(ax, cax=cax, cbar=cbar, figsize=figsize)
    # Scatter - let Matplotlib use the default size/color if None
    scatter_kws = dict(alpha=alpha, label=label, cmap=None if cmap is False else cmap,
                       vmin=clim[0], vmax=clim[1])
    if s is not None:
        scatter_kws['s'] = s
    if c is not None:
        scatter_kws['c'] = c
    plot = ax.scatter(xplot, yplot, **scatter_kws, **kwargs)
    # Setup the colorbar if required
    if cbar:
        if kde:
            if clim[0] is not None and clim[1] is not None:
                ticklocs = np.linspace(clim[0], clim[1], 3)
            else:
                ticklocs = [0, 0.5, 1]
            ticklabels = ['Low', 'Med.', 'High']
            cbar_label = 'Kernel Density Estimate'
        colorbar = fig.colorbar(plot, cax=cax, ticks=ticklocs)
        # Configure the color bar
        colorbar.ax.set_yticklabels(ticklabels, ha='left')
        colorbar.ax.tick_params(axis='y', pad=2)
        if cbar_label is not None:
            colorbar.set_label(cbar_label, ha='center', va='top', labelpad=2)
    # Set the axis extents
    if xlim is None:
        xlim = (np.min(x), np.max(x))
    if ylim is None:
        ylim = (np.min(y), np.max(y))
    # A lower limit of exactly zero is replaced by a small positive value on log axes
    if logx and xlim[0] <= 0:
        if xlim[0] == 0:
            xlim = [Parameters['plotting.log_lowerval'], xlim[1]]
        else:
            raise ValueError('ERROR: invalid xlim for a log x-axis!')
    if logy and ylim[0] <= 0:
        if ylim[0] == 0:
            ylim = [Parameters['plotting.log_lowerval'], ylim[1]]
        else:
            raise ValueError('ERROR: invalid ylim for a log y-axis!')
    # Set the formatting attributes
    gs.format_plot(ax, xlabel, ylabel, title, grid, axis_xy, xlim, ylim, logx, logy)
    # Setup the correlation
    if stat_blk:
        stats = ['pearson', 'spearmanr', 'count', 'noweightflag']
        # Error checking and conversion to a list of stats
        if isinstance(stat_blk, str):
            if stat_blk == 'all':
                stat_blk = stats[:-1]
            else:
                stat_blk = [stat_blk]
        elif isinstance(stat_blk, tuple):
            stat_blk = list(stat_blk)
        if not isinstance(stat_blk, list):
            raise ValueError('invalid stat_blk')
        # Accept 'spearman' as an alias; a new list is built so the caller's list is not mutated
        stat_blk = ['spearmanr' if stat == 'spearman' else stat for stat in stat_blk]
        if any(stat not in stats for stat in stat_blk):
            raise ValueError('invalid stat_blk')

        def _format_stat(value):
            # Round to a number of digits or to a number of significant figures
            if roundstats:
                return round(value, sigfigs)
            return gs.round_sigfig(value, sigfigs)

        # Build the txtstats
        txtstats = ''
        if 'count' in stat_blk:
            txtstats += r'$n = $'+str(x.shape[0])
        if 'pearson' in stat_blk:
            corr = _format_stat(gs.weighted_correlation(x, y, wt))
            txtstats += '\n'+r'$\rho = $'+str(corr)
        if 'spearmanr' in stat_blk:
            corr = _format_stat(gs.weighted_correlation_rank(x, y, wt))
            txtstats += '\n'+r'$\rho_s = $'+str(corr)
        # Note if weights were used
        if len(np.unique(wt)) > 1 and 'noweightflag' not in stat_blk:
            txtstats = txtstats + '\n\nweights used'
        # Sort the location and font size
        if stat_xy is None:
            stat_xy = Parameters['plotting.scatter_plot.stat_xy']
        if stat_ha is None:
            stat_ha = Parameters['plotting.stat_ha']
        # Anchor the text block so that it grows away from the nearest horizontal edge
        if stat_xy[1] > 0.5:
            va = 'top'
        else:
            va = 'bottom'
        stat_fontsize = _set_stat_fontsize(stat_fontsize)
        # Draw to plot
        ax.text(stat_xy[0], stat_xy[1], txtstats, va=va, ha=stat_ha, transform=ax.transAxes,
                fontsize=stat_fontsize, linespacing=0.8)

    # Handle dictionary defaults
    if out_kws is None:
        out_kws = dict()

    if output_file or ('pdfpages' in out_kws):
        gs.export_image(output_file, **out_kws)

    if return_plot:
        return ax, plot
    return ax
Exemplo n.º 3
0
 def plot(self,
          c='k',
          cmap=None,
          catdata=None,
          ax=None,
          cax=None,
          figsize=None,
          s=15,
          lw=0.1,
          cbar=None,
          grid=True,
          legend_loc='lower right',
          title=None,
          vlim=None,
          legstr='Cluster',
          xlabel=None,
          ylabel=None):
     """
     Scatter the first two columns of ``self.coords``, colored either by category (with a
     legend) or by a continuous variable (optionally with a colorbar).

     Parameters
     ----------
     c : str, ndarray
         a single color or a ndata-long array of colors
     cmap : str, dict
         Either a mpl compatible cmap string, or if `catdata` a dictionary of {k: color} mapping
         each category to a specific color
     catdata : bool, dict
         If a dictionary is passed, the mapping {k: name} is expected. If None and `c` is not
         a string with at most 12 unique values, `c` is treated as categorical.
     ax : mpl.axis
         axis passed to ``gs.setup_plot``
     cax : mpl.axis
         colorbar axis passed to ``gs.setup_plot``
     figsize : tuple
         figure size passed to ``gs.setup_plot``
     s : float
         marker size passed to matplotlib's scatter
     lw : float
         marker edge line width passed to matplotlib's scatter
     cbar : bool
         if True and the data is continuous, draw a colorbar
     grid : bool
         passed to ``ax.grid``
     legend_loc : str, tuple
         a legend location string, or a tuple used as ``bbox_to_anchor`` with the legend
         anchored by its upper left corner. Only used for categorical data.
     title : str
         plot title, if any
     vlim : tuple
         (vmin, vmax) of the continuous color scale. If None and `c` is an array, the minimum
         of `c` and the value at index 95 of the x-values returned by
         ``gs.cdf(c, bins=100)`` are used.
     legstr : str
         prefix of the legend labels when no category names are given
     xlabel, ylabel : str
         axis labels, '$MDS_1$' and '$MDS_2$' if None

     Returns
     -------
     ax : mpl.axis
         the axis that was plotted on

     Raises
     ------
     ImportError
         if pygeostat cannot be imported
     """
     colors = c
     # Only a failed import is translated; a bare except would also swallow e.g.
     # KeyboardInterrupt and report it as a missing package.
     try:
         import pygeostat as gs
     except ImportError as err:
         raise ImportError("ERROR: this function requires pygeostat!") from err
     coords = self.coords
     # setup the figure
     fig, ax, cax = gs.setup_plot(ax, cbar=cbar, cax=cax, figsize=figsize)
     # deal with non-array input
     if hasattr(colors, 'values'):
         colors = colors.values
     if catdata is None and not isinstance(
             colors, str) and len(np.unique(colors)) <= 12:
         catdata = True
     # plot categories
     if catdata:
         catdict = catdata if isinstance(catdata, dict) else None
         ucolors = np.unique(colors)
         catcolors = cmap_handling(cmap, len(ucolors), catdict)
         for i, ucolor in enumerate(ucolors):
             if catdict is None:
                 label = '{} {}'.format(legstr, ucolor)
             else:
                 label = catdict[ucolor]
             idx = colors == ucolor
             ax.scatter(coords[idx, 0],
                        coords[idx, 1],
                        c=catcolors[i],
                        s=s,
                        lw=lw,
                        label=label,
                        zorder=10)
         if isinstance(legend_loc, str):
             ax.legend(loc=legend_loc, scatterpoints=1, handletextpad=0.05)
         elif isinstance(legend_loc, tuple):
             ax.legend(loc='upper left',
                       bbox_to_anchor=legend_loc,
                       scatterpoints=1,
                       handletextpad=0.05)
     # plot continous data with a colorbar
     else:
         # The color limits are only needed here, so they are not computed for categorical
         # data (where the values may not even be numeric).
         if vlim is None:
             if colors is not None and not isinstance(colors, str):
                 # if vlim is `None` get the 95 percentile as the max
                 vlim = (np.min(colors), gs.cdf(colors, bins=100)[0][95])
             else:
                 vlim = (None, None)
         plot = ax.scatter(coords[:, 0],
                           coords[:, 1],
                           c=colors,
                           s=s,
                           lw=lw,
                           cmap=cmap,
                           vmin=vlim[0],
                           vmax=vlim[1],
                           zorder=10)
         if cbar:
             vlim, ticklocs, ticklabels = gs.get_contcbarargs(colors,
                                                              2,
                                                              vlim,
                                                              nticks=8)
             colorbar = fig.colorbar(plot, cax=cax, ticks=ticklocs)
             colorbar.ax.set_yticklabels(ticklabels, ha='left')
     ax.grid(grid)
     ax.set_ylabel('$MDS_2$' if ylabel is None else ylabel)
     ax.set_xlabel('$MDS_1$' if xlabel is None else xlabel)
     if title:
         ax.set_title(title)
     return ax
Exemplo n.º 4
0
def histogram_plot(data,
                   var=None,
                   weights=None,
                   cat=None,
                   catdict=None,
                   bins=None,
                   icdf=False,
                   lower=None,
                   upper=None,
                   ax=None,
                   figsize=None,
                   xlim=None,
                   ylim=None,
                   title=None,
                   xlabel=None,
                   stat_blk=None,
                   stat_xy=None,
                   stat_ha=None,
                   roundstats=None,
                   sigfigs=None,
                   color=None,
                   edgecolor=None,
                   edgeweights=None,
                   grid=None,
                   axis_xy=None,
                   label_count=False,
                   rotateticks=None,
                   plot_style=None,
                   custom_style=None,
                   output_file=None,
                   out_kws=None,
                   stat_fontsize=None,
                   stat_linespacing=None,
                   logx=False,
                   **kwargs):
    """
    Generates a matplotlib style histogram with summary statistics. Trimming is now only applied
    to NaN values (Pygeostat null standard).

    The only required required parameter is ``data``. If ``xlabel`` is left to its default value of
    ``None`` and the input data is contained in a pandas dataframe or series, the column
    information will be used to label the x-axis.

    Two statistics block sets are available: ``'all'`` and the default ``'minimal'``. The
    statistics block can be customized to a user defined list and order. Available statistics are
    as follows:

    >>> ['count', 'mean', 'stdev', 'cvar', 'max', 'upquart', 'median', 'lowquart', 'min',
    ...  'p10', 'p90']

    The way in which the values within the statistics block are rounded and displayed can be
    controlled using the parameters ``roundstats`` and ``sigfigs``.

    Please review the documentation of the :func:`gs.set_style()
    <pygeostat.plotting.set_style.set_style>` and :func:`gs.export_image()
    <pygeostat.plotting.export_image.export_image>` functions for details on their parameters so that
    their use in this function can be understood.

    Parameters:
        data (np.ndarray, pd.DataFrame/Series, or gs.DataFile): data array, which must be 1D
            unless var is provided. The exception being a DataFile, if data.variables
            is a single name.
        var (str): name of the variable in data, which is required if data is not 1D.
        weights (np.ndarray, pd.DataFrame/Series, or gs.DataFile or str): 1D array of declustering
             weights for the data. Alternatively the declustering weights name in var. If data
             is a DataFile, it may be string in data.columns, or True to use data.weights
             (if data.weights is not None).
        cat (bool or str): either a cat column in data.data, or if True uses data.cat if data.cat
            is not None
        catdict (dict or bool): overrides bins. If a categorical variable is being plotted, provide
            a dictionary where keys are numeric (categorical codes) and values are their associated
            labels (categorical names). The bins will be set so that the left edge (and associated
            label) of each bar is inclusive to each category. May also be set to True, if data is
            a DataFile and data.catdict is initialized.
        bins (int or list): Number of bins to use, or a list of bins
        icdf (bool): Indicator to plot a CDF or not
        lower (float): Lower limit for histogram
        upper (float): Upper limit for histogram
        ax (mpl.axis): Matplotlib axis to plot the figure
        figsize (tuple): Figure size (width, height)
        xlim (float tuple): Minimum and maximum limits of data along the x axis
        ylim (float tuple): Minimum and maximum limits of data along the y axis
        title (str): Title for the plot
        xlabel (str): X-axis label
        stat_blk (bool): Indicate if statistics are plotted or not
        stat_xy (float tuple): X, Y coordinates of the annotated statistics in figure
            space. Based on Parameters['plotting.histogram_plot.stat_xy'] if a histogram and
            Parameters['plotting.histogram_plot.stat_xy'] if a CDF, which defaults to the top right when
            a PDF is plotted and the bottom right if a CDF is plotted.
        stat_ha (str): Horizontal alignment parameter for the annotated statistics. Can be
            ``'right'``, ``'left'``, or ``'center'``. If None, based on
            Parameters['plotting.stat_ha']
        stat_fontsize (float): the fontsize for the statistics block. If None, based on
            Parameters['plotting.stat_fontsize']. If less than 1, it is the fraction of the
            matplotlib.rcParams['font.size']. If greater than 1, it the absolute font size.
        roundstats (bool): Indicate if the statistics should be rounded to the number of digits or
            to a number of significant figures (e.g., 0.000 vs. 1.14e-5). The number of digits or
            figures used is set by the parameter ``sigfigs``. sigfigs (int): Number of significant
            figures or number of digits (depending on ``roundstats``) to display for the float
            statistics
        color (str or int or dict): Any permissible matplotlib color or a integer which is used to draw
            a color from the pygeostat color pallet ``pallet_pastel``> May also be a dictionary of colors,
            which are used for each bar (useful for categories). colors.keys() must align with bins[:-1]
            if a dictionary is passed. Drawn from Parameters['plotting.cmap_cat'] if catdict is used
            and their keys align.
        edgecolor (str): Any permissible matplotlib color for the edge of a histogram bar
        grid(bool): plots the major grid lines if True. Based on Parameters['plotting.grid']
            if None.
        axis_xy (bool): converts the axis to GSLIB-style axis visibility (only left and bottom
            visible) if axis_xy is True. Based on Parameters['plotting.axis_xy'] if None.
        label_count (bool): label the number of samples found for each category in catdict. Does
            nothing if no catdict is found
        rotateticks (bool tuple): Indicate if the axis tick labels should be rotated (x, y)
        plot_style (str): Use a predefined set of matplotlib plotting parameters as specified by
            :class:`gs.GridDef <pygeostat.data.grid_definition.GridDef>`. Use ``False`` or ``None``
            to turn it off
        custom_style (dict): Alter some of the predefined parameters in the ``plot_style`` selected.
        output_file (str): Output figure file name and location
        out_kws (dict): Optional dictionary of permissible keyword arguments to pass to
            :func:`gs.export_image() <pygeostat.plotting.export_image.export_image>`
        **kwargs: Optional permissible keyword arguments to pass to either: (1) matplotlib's hist
            function if a PDF is plotted or (2) matplotlib's plot function if a CDF is plotted.

    Returns:
        ax (ax): matplotlib Axes object with the histogram

    **Examples:**

    A simple call:

    .. plot::

        import pygeostat as gs
        # load some data
        dfl = gs.ExampleData("point3d_ind_mv")
        # plot the histogram_plot
        gs.histogram_plot(dfl, var="Phi", bins=30)

    |

    Change the colour, number of significant figures displayed in the statistics, and pass some
    keyword arguments to matplotlibs hist function:

    .. plot::

        import pygeostat as gs
        # load some data
        dfl = gs.ExampleData("point3d_ind_mv")
        # plot the histogram_plot
        gs.histogram_plot(dfl, var="Phi", color='#c2e1e5', sigfigs=5, log=True, density=True)

    |

    Plot a CDF while also displaying all available statistics, which have been shifted up:

    .. plot::

        import pygeostat as gs
        # load some data
        dfl = gs.ExampleData("point3d_ind_mv")
        # plot the histogram_plot
        gs.histogram_plot(dfl, var="Phi", icdf=True, stat_blk='all', stat_xy=(1, 0.75))
        # Change the CDF line colour by grabbing the 3rd colour from the color pallet
        # ``cat_vibrant`` and increase its width by passing a keyword argument to matplotlib's
        # plot function. Also define a custom statistics block:
        gs.histogram_plot(dfl, var="Phi", icdf=True, color=3, lw=3.5, stat_blk=['count','upquart'])

    |

    Generate histograms of Phi considering the categories:

    .. plot::

        import pygeostat as gs
        # load some data
        dfl = gs.ExampleData("point3d_ind_mv")
        cats = [1, 2, 3, 4, 5]
        colors = gs.catcmapfromcontinuous("Spectral", 5).colors
        # build the required cat dictionaries
        dfl.catdict = {c: "RT {:02d}".format(c) for c in cats}
        colordict =  {c: colors[i] for i, c in enumerate(cats)}
        # plot the histogram_plot
        f, axs = plt.subplots(2, 1, figsize=(8, 6))
        for var, ax in zip(["Phi", "Sw"], axs):
            gs.histogram_plot(dfl, var=var, cat=True, color=colordict, bins=40, figsize=(8, 4), ax=ax,
                       xlabel=False, title=var)

    |

    Generate cdf subplots considering the categories:

    .. plot::

        import pygeostat as gs
        # load some data
        dfl = gs.ExampleData("point3d_ind_mv")
        cats = [1, 2, 3, 4, 5]
        colors = gs.catcmapfromcontinuous("Spectral", 5).colors
        # build the required cat dictionaries
        dfl.catdict = {c: "RT {:02d}".format(c) for c in cats}
        colordict =  {c: colors[i] for i, c in enumerate(cats)}
        # plot the histogram_plot
        f, axs = plt.subplots(2, 2, figsize=(12, 9))
        axs=axs.flatten()
        for var, ax in zip(dfl.variables, axs):
            gs.histogram_plot(dfl, var=var, icdf=True, cat=True, color=colordict, ax=ax)

    Recreate the `Proportion` class plot

    .. plot::

        import pygeostat as gs
        # load some data
        dfl = gs.ExampleData("point3d_ind_mv")
        cats = [1, 2, 3, 4, 5]
        colors = gs.catcmapfromcontinuous("Spectral", 5).colors
        # build the required cat dictionaries
        dfl.catdict = {c: "RT {:02d}".format(c) for c in cats}
        colordict =  {c: colors[i] for i, c in enumerate(cats)}
        # plot the histogram_plot
        ax = gs.histogram_plot(dfl, cat=True, color=colordict, figsize=(7, 4), rotateticks=(45, 0),
                        label_count=True)

    """
    import pygeostat as gs
    from .utils import format_plot, _set_stat_fontsize, _format_grid, _format_tick_labels, setup_plot, catcmapfromcontinuous
    from .cmaps import _cat_pastel_data, _cat_vibrant_data
    import copy
    # Now converting to a numpy array, as encountering some odd pandas performance, and there's
    # no major disadvantagve to application of a numpy in this context to my knowledge - RMB
    # If a list is passed convert it to a series so that trimming can take place
    # weights
    if isinstance(weights, str):
        if isinstance(data, pd.DataFrame) or isinstance(data, gs.DataFile):
            weights = data[weights]
    elif isinstance(weights, bool):
        if weights:
            if isinstance(data, gs.DataFile):
                if data.weights is None:
                    raise ValueError('weights=True but data.weights is None!')
                elif isinstance(data.weights, list):
                    raise ValueError(
                        'weights=True but data.weights is a list!')
                weights = data[data.weights].values
            else:
                raise ValueError(
                    'weights=True is only valid if data is a DataFile!')
        else:
            weights = None
    if isinstance(weights, pd.Series) or isinstance(weights, pd.DataFrame):
        weights = weights.values
    # cats for continuous histogram_plots
    if isinstance(cat, str):
        if isinstance(data, pd.DataFrame) or isinstance(data, gs.DataFile):
            cat = data[cat]
    elif isinstance(cat, bool):
        if cat:
            if isinstance(data, gs.DataFile):
                if data.cat is None:
                    raise ValueError('cat=True but data.cat is None!')
                cat = data[data.cat].values
                if catdict is None and data.catdict is None:
                    raise ValueError("pass a `catdict` when setting `cat`")
                else:
                    catdict = data.catdict
            else:
                raise ValueError(
                    'cat=True is only valid if data is a DataFile!')
        else:
            cat = None
    if isinstance(cat, pd.Series) or isinstance(cat, pd.DataFrame):
        cat = cat.values
    # Handle categorical dictionary
    if isinstance(catdict, bool):
        if catdict:
            if not isinstance(data, gs.DataFile):
                raise ValueError(
                    'catdict as a bool is only valid if data is a DataFile!')
            if data.catdict is None:
                raise ValueError(
                    'catdict as a bool is only valid if data is not None!')
            catdict = data.catdict
    # Variable
    # Handle data that is 2-D and/or a DataFile
    if isinstance(var, str):
        if isinstance(data, pd.DataFrame) or isinstance(data, gs.DataFile):
            if isinstance(cat, str):
                cat = data[cat]
            data = data[var]
        else:
            raise ValueError(
                'var as a string is only valid if data is a DataFile or DataFrame!'
            )
    elif isinstance(data, gs.DataFile):
        if isinstance(data.variables, str):
            data = data[data.variables]
        elif cat is not None:
            if isinstance(cat, str):
                data = data[cat]
            elif var is None and isinstance(cat, (np.ndarray, list)):
                data = cat
        elif len(data.columns) == 1:
            data = data.data
        else:
            raise ValueError(
                'Could not coerce data (DataFile) into a 1D dataset!')
    # Get the xlabel if possible before converting to a numpy array
    if isinstance(data, pd.Series) or isinstance(data, pd.DataFrame):
        if xlabel is None:
            xlabel = gs.get_label(data)
        data = data.values
    elif isinstance(data, list):
        data = np.array(data)
    if isinstance(cat, (pd.Series, pd.DataFrame)):
        cat = cat.values
    # Should be numpy by now...
    if data.ndim > 1:
        if data.shape[1] > 1:
            raise ValueError('Could not coerce data into a 1D dataset!')
        else:
            data = data.flatten()
    # Handle Null values if needed
    idx = np.isnan(data)
    nullcnt = np.sum(idx)
    if nullcnt > 0:
        data = data[~idx]
        if weights is not None:
            weights = weights[~idx]
        if cat is not None:
            cat = cat[~idx]
    # Handle dictionary defaults
    if out_kws is None:
        out_kws = dict()
    # Set-up plot if no axis is supplied
    _, ax, _ = setup_plot(ax, figsize=figsize, aspect=False)
    # Infer some default parameters
    if weights is None:
        weights = np.ones(len(data)) / len(data)
    else:
        weights = weights / np.sum(weights)
    # Some quick error checks
    assert (np.all(weights) >= 0.0), 'weights less than 0 not valid'
    # Categories
    if isinstance(catdict, dict) and var is None:
        if not all([isinstance(float(i), float) for i in catdict.keys()]):
            raise ValueError(
                'if catdict is dict., all keys should be an int/float!')
        # The bins are set to begin at the start of each category
        # bins go from 0.5 to (icat + 1) + 0.5
        # label is centered at (icat + 1)
        bins = np.arange(len(catdict) + 1) + 0.5
    if color is None and isinstance(catdict, dict):
        # Color each bin by the category color?
        if isinstance(Parameters['plotting.cmap_cat'], dict):
            temp = Parameters['plotting.cmap_cat']
            if list(sorted(temp.keys())) == list(sorted(catdict.keys())):
                color = temp
        else:
            color = catcmapfromcontinuous(Parameters["plotting.cmap"],
                                          len(catdict)).colors
    if isinstance(color, dict):
        if list(sorted(color.keys())) != list(sorted(catdict.keys())):
            raise ValueError(('if color is a dictionary, keys must align with '
                              'bins[:-1]! Consider using a single color.'))
        temp = color
        color = []
        for _, v in sorted(temp.items()):
            color.append(v)
    # Color setup
    if isinstance(color, int):
        # Grab a color from ``cat_vibrant`` if an integer is passed
        color = _cat_pastel_data[color % len(_cat_vibrant_data)]
    if not icdf:
        if color is None:
            color = Parameters['plotting.histogram_plot.facecolor']
        if edgecolor is None:
            edgecolor = Parameters['plotting.histogram_plot.edgecolor']
        if edgeweights is None:
            if "lw" in kwargs:
                edgeweights = kwargs.pop("lw")
            else:
                edgeweights = Parameters["plotting.histogram_plot.edgeweight"]
    else:
        if color is None and icdf:
            color = Parameters['plotting.histogram_plot.cdfcolor']
    plotdata = copy.deepcopy(data)
    plotweights = copy.deepcopy(weights)
    if xlim is not None:
        plotdata[data < xlim[0]] = xlim[0]
        plotdata[data > xlim[1]] = xlim[1]
    # Main plotting
    if icdf:

        def singlecdf(ax,
                      data,
                      weights,
                      lower,
                      upper,
                      bins,
                      color,
                      label=None,
                      **kwargs):
            """Local function to plot a single CDF on ``ax``.

            The CDF is computed with ``gs.cdf`` from ``data`` using the given
            ``weights``, ``lower``, ``upper`` and ``bins``. When it has more than
            1000 points it is resampled on a fixed probability grid before being
            drawn. ``color``, ``label`` and any extra ``kwargs`` are forwarded to
            ``ax.plot``, whose return value is returned.
            """
            cdf_x, cdfvals = gs.cdf(data,
                                    weights=weights,
                                    lower=lower,
                                    upper=upper,
                                    bins=bins)
            # Matplotlib is a memory hog if too many points are used. Limit the number of points
            # the CDF is built with to roughly 1000. The tails are sampled more finely than the
            # middle so that they remain well defined.
            if len(cdf_x) > 1000:
                cdfinterp = scipy.interpolate.interp1d(x=cdfvals, y=cdf_x)
                pmin = cdfvals.min()
                pmax = cdfvals.max()
                probs = np.concatenate([
                    np.arange(pmin, 0.1, 0.001),
                    np.arange(0.1, 0.9, 0.01),
                    np.arange(0.9, pmax, 0.001),
                    # np.arange excludes its stop value; add the last point explicitly so the
                    # curve reaches the top of the CDF
                    [pmax],
                ])
                # A float-step arange may overshoot its stop by rounding, and the fixed middle
                # range may fall outside [pmin, pmax]; keep every probability inside the
                # interpolator's domain so it cannot raise an out-of-bounds error.
                probs = np.clip(probs, pmin, pmax)
                # The interpolator accepts arrays: evaluate all probabilities in one call
                cdf_x = cdfinterp(probs)
                cdfvals = probs
            fig = ax.plot(cdf_x, cdfvals, color=color, label=label, **kwargs)
            return fig

        if catdict is not None:
            if var is not None:
                stat_blk = False
                for icat, c in enumerate(catdict):
                    clr = color[icat]
                    catidx = cat == c
                    fig = singlecdf(ax,
                                    plotdata[catidx],
                                    plotweights[catidx],
                                    lower,
                                    upper,
                                    bins,
                                    clr,
                                    label=catdict[c],
                                    **kwargs)
            else:
                raise ValueError(
                    "`icdf=True` and `catdict` only makes sense with a `var` defined"
                )
        else:
            fig = singlecdf(ax, plotdata, plotweights, lower, upper, bins,
                            color, **kwargs)
        if ylim is None:
            ylim = (0, 1.0)
    else:
        if bins is None:
            bins = Parameters['plotting.histogram_plot.histbins']
        label = kwargs.pop("label", None)
        if bins is None:
            if len(plotdata) < 200:
                bins = 20
            elif len(plotdata) < 500:
                bins = 25
            else:
                bins = 30
        if logx:
            if catdict is not None:
                raise ValueError('Cannot have logx with catdict!')
            if xlim is None:
                minv = np.log10(max(plotdata.min(), 1e-10))
                maxv = np.log10(plotdata.max())
            else:
                minv = np.log10(max(xlim[0], 1e-10))
                maxv = np.log10(xlim[1])
            if np.isnan([minv, maxv]).any():
                raise ValueError(
                    'ERROR converting your data to log base! are there negatives?'
                )
            bins = np.logspace(minv, maxv, bins)
        if catdict is not None:
            if var is None:
                for icat, cat in enumerate(catdict):
                    plotdata[data == cat] = icat + 1
                histclr = None
            else:
                # generate lists of data per cat
                plotdata = [plotdata[cat == c] for c in catdict]
                plotweights = [weights[cat == c] for c in catdict]
                label = list(catdict.values())
                histtype = kwargs.pop("histtype", "stepfilled")
                stat_blk = False
                if "stacked" not in kwargs:
                    kwargs["stacked"] = True
                histclr = color
        histtype = kwargs.pop("histtype", "bar")
        if not isinstance(color, list):
            ax.hist(plotdata,
                    bins,
                    weights=plotweights,
                    color=color,
                    edgecolor=edgecolor,
                    histtype=histtype,
                    label=label,
                    lw=edgeweights,
                    **kwargs)
        else:
            _, _, patches = ax.hist(plotdata,
                                    bins,
                                    weights=plotweights,
                                    histtype=histtype,
                                    color=histclr,
                                    edgecolor=edgecolor,
                                    label=label,
                                    lw=edgeweights,
                                    **kwargs)
            try:
                for patch, clr in zip(patches, color):
                    patch.set_facecolor(clr)
            except (AttributeError, ValueError):
                pass
        if catdict is not None and label_count:
            nd = len(data)
            for icat, cat in enumerate(catdict):
                count = np.count_nonzero(data == cat)
                pcat = (weights * (data == cat).astype(float)).sum()
                ax.text(icat + 1, pcat, count, ha="center", va="bottom")
    # Summary stats
    if stat_blk is None:
        stat_blk = Parameters['plotting.histogram_plot.stat_blk']
    if stat_xy is None:
        if icdf:
            stat_xy = Parameters['plotting.histogram_plot.stat_xy_cdf']
        else:
            stat_xy = Parameters['plotting.histogram_plot.stat_xy']
    if stat_blk:
        if sigfigs is None:
            sigfigs = Parameters['plotting.sigfigs']
        if roundstats is None:
            roundstats = Parameters['plotting.roundstats']
        if stat_ha is None:
            stat_ha = Parameters['plotting.stat_ha']
        if stat_linespacing is None:
            stat_linespacing = Parameters['plotting.stat_linespacing']
        if stat_linespacing is None:
            stat_linespacing = 1.0
        # Force no bins and upper/lower for median
        cdf_x, cdfvals = gs.cdf(data, weights=weights)
        # Currently defined statistics, possible to add more quite simply
        if np.mean(data) == 0:
            cdata = float("nan")
        elif roundstats:
            cdata = round((np.std(data) / np.mean(data)), sigfigs)
        else:
            cdata = gs.round_sigfig((np.std(data) / np.mean(data)), sigfigs)
        if roundstats:
            mean = round(gs.weighted_mean(data, weights), sigfigs)
            median = round(gs.percentile_from_cdf(cdf_x, cdfvals, 50.0),
                           sigfigs)
            stdev = round(np.sqrt(gs.weighted_variance(data, weights)),
                          sigfigs)
            minval = round(np.min(data), sigfigs)
            maxval = round(np.max(data), sigfigs)
            upquart = round(np.percentile(data, 75), sigfigs)
            lowquart = round(np.percentile(data, 25), sigfigs)
            p10 = round(np.percentile(data, 10), sigfigs)
            p90 = round(np.percentile(data, 90), sigfigs)
        else:
            mean = gs.round_sigfig(gs.weighted_mean(data, weights), sigfigs)
            median = gs.round_sigfig(
                gs.percentile_from_cdf(cdf_x, cdfvals, 50.0), sigfigs)
            stdev = gs.round_sigfig(
                np.sqrt(gs.weighted_variance(data, weights)), sigfigs)
            minval = gs.round_sigfig(np.min(data), sigfigs)
            maxval = gs.round_sigfig(np.max(data), sigfigs)
            upquart = gs.round_sigfig(np.percentile(data, 75), sigfigs)
            lowquart = gs.round_sigfig(np.percentile(data, 25), sigfigs)
            p10 = gs.round_sigfig(np.percentile(data, 10), sigfigs)
            p90 = gs.round_sigfig(np.percentile(data, 90), sigfigs)
        statistics = {
            'mean': (r'$m = %g$' % mean),
            'median': (r'$x_{{50}} = %g$' % median),
            'count': ('$n = %i$' % len(data)),
            'count_trimmed': ('$n_{trim} = %i$' % nullcnt),
            'stdev': (r'$\sigma = %g$' % stdev),
            'cvar': ('$CV = %g$' % cdata),
            'min': ('$x_{{min}} = %g$' % minval),
            'max': ('$x_{{max}} = %g$' % maxval),
            'upquart': ('$x_{{75}} = %g$' % upquart),
            'lowquart': ('$x_{{25}} = %g$' % lowquart),
            'p10': ('$x_{{10}} = %g$' % p10),
            'p90': ('$x_{{90}} = %g$' % p90)
        }
        # Default statistic sets
        if stat_blk == 'varlabel' and 'label' in kwargs:
            statistics['varlabel'] = kwargs['label']
        statsets = {
            'minimal': ['count', 'mean', 'median', 'stdev'],
            'all': [
                'count', 'mean', 'stdev', 'cvar', 'max', 'upquart', 'median',
                'lowquart', 'min'
            ],
            'varlabel': [
                'varlabel', 'count', 'mean', 'stdev', 'cvar', 'max', 'upquart',
                'median', 'lowquart', 'min'
            ],
            'none':
            None
        }
        # Use a default statistic set
        if isinstance(stat_blk, bool) and stat_blk:
            stat_blk = 'all'
        if isinstance(stat_blk, str):
            if stat_blk in statsets:
                stat_blk = statsets[stat_blk]
            else:
                print('WARNING: stats value of: "' + stat_blk +
                      '" does not exist - '
                      'default to no stats')
                stat_blk = None
        # Use a supplied statistic set, but check for bad ones
        else:
            badstats = [s for s in stat_blk if s not in statistics]
            stat_blk = [s for s in stat_blk if s in statistics]
            for badstat in badstats:
                print('WARNING: stats value of: "' + badstat +
                      '" does not exist - '
                      'It was removed from summary statistics list')
        # Form the stats string
        if stat_blk:
            if nullcnt != 0:
                stat_blk.insert(stat_blk.index('count') + 1, 'count_trimmed')
            stat_blk = [statistics[s] for s in stat_blk]
            txtstats = '\n'.join(stat_blk)
            if len(np.unique(weights)) > 1:
                txtstats = txtstats + '\n\nweights used'
            if stat_xy[1] > 0.5:
                va = 'top'
            else:
                va = 'bottom'
            # Set the stat_fontsize
            stat_fontsize = _set_stat_fontsize(stat_fontsize)
            ax.text(stat_xy[0],
                    stat_xy[1],
                    txtstats,
                    va=va,
                    ha=stat_ha,
                    transform=ax.transAxes,
                    fontsize=stat_fontsize,
                    linespacing=stat_linespacing)
    # Label as required
    if icdf:
        ylabel = 'Cumulative Distribution Function'
    elif 'density' in kwargs:
        ylabel = 'Probability Density Function (PDF)'
    else:
        ylabel = 'Frequency'
    ax = format_plot(ax,
                     xlabel,
                     ylabel,
                     title,
                     axis_xy=axis_xy,
                     xlim=xlim,
                     ylim=ylim,
                     logx=logx)
    if catdict is not None and var is None:
        ticlocs = [i + 1 for i in range(len(catdict.keys()))]
        ax.set_xticks(ticlocs)
        ax.set_xticklabels(catdict.values())
        ax.set_xlim(0.25, len(catdict) + 0.75)
    elif catdict is not None and var is not None:
        ax.legend()
    _format_tick_labels(ax, rotateticks)
    # format_plot doesn't handle some specialized axis_xy and grid requirements
    # for histogram_plot...
    if icdf:
        # Ensure that we have top spline, in case it was removed above
        ax.spines['top'].set_visible(True)
        _format_grid(ax, grid, below=False)
    else:
        # The grid should be below for a histogram
        _format_grid(ax, grid, below=True)
    # Export figure
    if output_file or ('pdfpages' in out_kws):
        gs.export_image(output_file, **out_kws)
    return ax
Exemplo n.º 5
0
def accsim(truth, reals, pinc=0.05):
    """
    Calculates the proportion of locations where the true value falls within symmetric p-PI
    intervals when completing a jackknife study. A portion of the data is excluded from the
    conditioning dataset and the excluded sample locations simulated values are then checked.

    .. seealso::

        Pyrcz, M. J., & Deutsch, C. V. (2014). Geostatistical Reservoir Modeling (2nd ed.). New
        York, NY: Oxford University Press, p. 350-351.

    Arguments:
        truth: Tidy (long-form) 1D data where a single column containing the true values.
            A pandas dataframe/series or numpy array can be passed
        reals: Tidy (long-form) 2D data where a single column contains values from a single
            realizations and each row contains the simulated values from a single truth location.
            A pandas dataframe or numpy matrix can be passed

    Keyword Arguments:
        pinc (float): Increments between the probability intervals to calculate within (0, 1)

    Returns:
        (propavg, sumstats) where ``propavg`` (pd.DataFrame) holds the probability intervals
        (column ``ProbInt``) and the fraction of true values within each interval (column
        ``FracIn``), and ``sumstats`` (dict) contains the average variance (``avgvar``), mean
        squared error (``mse``), accuracy measure (``acc``), precision measure (``pre``), and a
        goodness measure (``goo``)

    Raises:
        ValueError: if ``pinc`` is not within (0, 1), if ``truth``/``reals`` are of an
            unsupported type, or if they cannot be combined into a single table

    """
    import pandas as pd
    # An increment outside (0, 1) yields no probability intervals, which would otherwise only
    # surface later as a division by zero
    if not 0 < pinc < 1:
        raise ValueError("The argument `pinc` must be within (0, 1)")
    # Handle input
    if isinstance(truth, (pd.Series, pd.DataFrame)):
        truth = truth.values
    elif not isinstance(truth, np.ndarray):
        raise ValueError(
            "The argument `truth` must be a pd.DataFrame, pd.Series, or np.matrix"
        )
    if len(truth.shape) == 1:
        # Make the truth a single column so it can be placed beside the realizations
        truth = np.reshape(truth, (truth.shape[0], 1))
    if isinstance(reals, pd.DataFrame):
        reals = reals.values
    elif not isinstance(reals, np.ndarray):
        raise ValueError(
            "The argument `reals` must be a pd.DataFrame or np.matrix")
    try:
        data = np.concatenate((truth, reals), axis=1)
        data = pd.DataFrame(data=data)
    except Exception as err:
        raise ValueError(
            "The `truth` and `reals` data could not be coerced into a pd.DataFrame"
        ) from err
    # Imported only once the inputs are known to be valid
    import pygeostat as gs
    # Initialize some variables
    pints = np.arange(pinc, 1, pinc)
    propindic = {pint: [] for pint in pints}
    variances = []
    # Calculate the indicator responses and local variances
    for _, row in data.iterrows():
        # Column 0 is the truth, the remaining columns are the realizations
        truthval = row.iloc[0]
        simvals = row.iloc[1:].values
        cdf_x, cdfvals = gs.cdf(simvals)
        variances.append(np.var(simvals))
        # The percentile of the truth does not depend on the probability interval, so it is
        # evaluated once per location; it is left undefined when the truth falls outside the
        # range of the local CDF
        if cdf_x[0] <= truthval <= cdf_x[-1]:
            p = gs.z_percentile(truthval, cdf_x, cdfvals)
        else:
            p = None
        for pint in pints:
            plower = 0.5 - (pint / 2)
            pupper = 0.5 + (pint / 2)
            if p is not None and plower <= p <= pupper:
                indic = 1
            else:
                indic = 0
            propindic[pint].append(indic)
    # Calculate the average proportions and average variance
    propavg = pd.DataFrame(
        [[pint, np.average(propindic[pint])] for pint in pints],
        columns=['ProbInt', 'FracIn'])
    # Calculate the summary statistics
    avgvar = np.average(variances)
    probints = propavg['ProbInt'].values
    fracins = propavg['FracIn'].values
    mse = ((probints - fracins)**2).mean()
    acc = 0
    pre = 0
    goo = 0
    for probint, fracin in zip(probints, fracins):
        if fracin >= probint:
            acc = acc + 1
            pre = pre + (fracin - probint)
            goo = goo + (fracin - probint)
        else:
            goo = goo + (2 * (probint - fracin))
    nints = len(propavg)
    acc = acc / nints
    pre = 1 - ((2 * pre) / nints)
    goo = 1 - (goo / nints)
    sumstats = {
        'avgvar': avgvar,
        'mse': mse,
        'acc': acc,
        'pre': pre,
        'goo': goo
    }

    return propavg, sumstats