Пример #1
0
def confidence_interval(data, confidence=0.99):
    """Compute a Student-t confidence interval for the mean of `data`.

    Parameters
    ----------
    data : array-like
        Input sample; it is passed through `as_flattened_numpy` before use.
    confidence : float, default=0.99
        Confidence level forwarded to `scipy.stats.t.interval`.

    Returns
    -------
    interval : tuple
        The `(lower, upper)` endpoints returned by `scipy.stats.t.interval`,
        centred on the sample mean and scaled by the standard error of the mean.
    """
    sample = as_flattened_numpy(data)
    # degrees of freedom for the t-distribution: n - 1
    dof = sample.shape[0] - 1
    centre = np.mean(sample)
    std_err = stats.sem(sample)
    return stats.t.interval(confidence, dof, loc=centre, scale=std_err)
Пример #2
0
def auto_fit(data, option="slim"):
    """Score a set of candidate distributions against the given data.

    Parameters
    ----------
    data : np.ndarray (n,)
        The sample to fit; flattened via `as_flattened_numpy` first.
    option : str/tuple of str, default="slim"
        Selector forwarded to `_get_distribution_set` to pick the candidate
        distributions (documented choices are "slim" and "full").

    Returns
    -------
    df : DataFrame
        Whatever `_get_qqplot_score_correlate` produces for the flattened data
        and the selected distributions (one result per fitted model).
    """
    # arguments are evaluated left to right: flatten first, then pick candidates
    return _get_qqplot_score_correlate(
        as_flattened_numpy(data),
        _get_distribution_set(option),
    )
Пример #3
0
def univariate_kde(
    X: np.ndarray,
    bins: Optional[np.ndarray] = None,
    kde_name: str = "norm",
    kde_range: float = 1e-3,
    smoothen_kde: bool = True,
    verbose: int = 0,
    return_dist: bool = False,
):
    """Build the (x, y) coordinates of a density curve for a 1D sample.

    .. note:: `X` is flattened before use.

    Parameters
    ----------
    X : np.ndarray
        The sample; flattened via `as_flattened_numpy`.
    bins : np.ndarray, optional
        Evaluation points used for discrete distributions.
        If None, obtained from `get_bins`.
    kde_name : str, optional, default='norm'
        Name of the distribution handed to `fit_model`.
        If 'freeform', a `scipy.stats.gaussian_kde` is fitted instead.
    kde_range : float, default=1e-3
        For continuous distributions the curve spans the quantiles
        `kde_range` to `1 - kde_range` of the fitted model.
    smoothen_kde : bool, default=True
        For discrete distributions, use `_smooth_kde` rather than the raw pmf.
    verbose : int, optional, default=0
        Forwarded to `fit_model`.
    return_dist : bool, default=False
        If True, the fitted model is returned as a third element.

    Returns
    -------
    x_kde : np.ndarray
        The x-coordinates of the curve
    y_kde : np.ndarray
        The density (or mass) evaluated at `x_kde`
    model : fitted model (only if `return_dist` is True)
        Either the `gaussian_kde` object or the model returned by `fit_model`.
    """
    n_points = 200
    discrete_names = [*_get_discrete_single(), *_get_discrete_multiple()]

    sample = as_flattened_numpy(X)
    if bins is None:
        bins = get_bins(sample)

    if kde_name == "freeform":
        # non-parametric estimate evaluated across the observed range
        model = stats.gaussian_kde(sample)
        x_kde = np.linspace(sample.min(), sample.max(), n_points)
        y_kde = model.pdf(x_kde)
    else:
        model, _ = fit_model(sample,
                             kde_name,
                             verbose=verbose,
                             return_params=True)
        if kde_name not in discrete_names:
            # continuous: evaluate between the two tail quantiles
            lower = model.ppf(kde_range)
            upper = model.ppf(1 - kde_range)
            x_kde = np.linspace(lower, upper, n_points)
            y_kde = model.pdf(x_kde)
        elif smoothen_kde:
            x_kde, y_kde = _smooth_kde(bins, model)
        else:
            x_kde, y_kde = bins, model.pmf(bins)

    if not return_dist:
        return x_kde, y_kde
    return x_kde, y_kde, model
Пример #4
0
def bibox1d(X: _ArrayLike,
            Y: _ArrayLike,
            colors: Optional[_ListLike] = None,
            labels: Optional[_ListLike] = None,
            measured: Optional[str] = None,
            ax: Optional[mpl.axes.Axes] = None,
            mannwhitney: bool = True,
            with_strip: bool = False,
            vertical: bool = True,
            notch: bool = False,
            capsize: float = 1.0,
            outliers: bool = True,
            grid: bool = True,
            width: Union[float, List[float]] = 0.7,
            label_rotation: float = 0.0,
            label_max_length: int = 25,
            spines: Optional[_ListLike] = None,
            strip_jitter: float = 0.15,
            theme: str = "white_circle",
            **plot_kwargs):
    """Plots two 1-dimensional boxplots using vectors `X`, `Y`.

    Parameters
    ----------
    X : list/tuple/np.ndarray/pd.Series (1d)
        The first data column to draw. Must be numeric.
    Y : list/tuple/np.ndarray/pd.Series (1d)
        The second data column to draw. Must be numeric.
    colors : list of str, optional
        If None, uses a default color
    labels : list of str, optional
        Labels for the two boxes. If None, the `name` of `X`/`Y` is used when
        they are pandas.Series, otherwise an empty string.
    measured : str, optional
        A label to define what the measurement is
    ax : matplotlib.ax object, optional, default=None
        If None, creates a plot.
    mannwhitney : bool, default=True
        If True, performs a two-sided Mann-Whitney U test between the values
        and annotates the result above/beside the boxes
    with_strip : bool, default=False
        If True, draws a stripplot over the top of the boxplot, in a similar colour
        `outliers` are set to False in this case
    vertical : bool, default=True
        Determines whether to draw the plot vertically or horizontally
    notch : bool, default=False
        Determines whether to draw a notched plot
    capsize : float, default=1.0
        Defines the length of the caps
    outliers : bool, default=True
        If True, displays fliers as outliers
    grid : bool, default=True
        If True: draws gridlines for the numeric axis
    width : float, default=0.7
        Determines the width/height of the box
    label_rotation : float, default=0
        The degrees of rotation to the ticklabels
    label_max_length : int, default=25
        If any label exceeds this length, it truncates it
    spines : list, optional
        Defines which spines are to be visible, from
        {'top', 'left', 'bottom', 'right'}. If None, the spine on the side of
        the Mann-Whitney annotation is hidden when `mannwhitney` is True,
        otherwise all four are shown.
    strip_jitter : float, default=0.15
        With stripplot, defines the amount of jitter in the variables.
        Must lie in [0, 1].
    theme : str, default="white_circle"
        Choose a 'theme' for the outliers, e.g. 'white_circle', 'red_square',
        'green_diamond'

    Other Parameters
    ----------------
    plot_kwargs : dict
        keyword arguments to pass to `ax.boxplot`

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-boxplot

    See Also
    --------
    matplotlib.pyplot.boxplot

    References
    ----------
    Inspiration from https://github.com/jbmouret/matplotlib_for_papers#colored-boxes
    """
    instance_check((X, Y), (list, tuple, np.ndarray, pd.Series))
    instance_check((colors, labels, spines), (type(None), list, pd.Index))
    instance_check(ax, (type(None), mpl.axes.Axes))
    instance_check((mannwhitney, vertical, notch, outliers, grid, with_strip),
                   bool)
    instance_check((capsize, width, strip_jitter, label_rotation),
                   (float, int))
    instance_check(theme, str)
    instance_check(label_max_length, int)
    bounds_check(strip_jitter, 0.0, 1.0)

    _X = as_flattened_numpy(X)
    _Y = as_flattened_numpy(Y)
    _style = _get_flier_style(theme)

    if ax is None:
        # tall figure for vertical boxes, wide figure for horizontal ones
        _, ax = plt.subplots(figsize=(3.5, 7) if vertical else (7, 3.5))

    if with_strip:
        # the strip already shows every point, so fliers would be duplicates
        outliers = False

    if spines is None:
        if not mannwhitney:
            spines = ("bottom", "left", "top", "right")
        elif vertical:
            # leave the top free for the significance annotation
            spines = ("bottom", "left", "right")
        else:
            # leave the right free for the significance annotation
            spines = ("bottom", "left", "top")
    # sort out labels
    if labels is None:
        labels = [
            X.name if isinstance(X, pd.Series) else "",
            Y.name if isinstance(Y, pd.Series) else "",
        ]
    box_alpha = 0.5 if with_strip else 1.0

    patch_obj = ax.boxplot([_X, _Y],
                           vert=vertical,
                           patch_artist=True,
                           showfliers=outliers,
                           notch=notch,
                           widths=width,
                           flierprops=_style,
                           boxprops=dict(alpha=box_alpha),
                           **plot_kwargs)
    # define boxplot extras
    _define_boxplot_arguments(ax, patch_obj, vertical, measured, grid, spines,
                              capsize, None)
    # define basic colours - overrides if needs be
    colors = _kcolor_arrangement(patch_obj, colors)
    # label axes
    _label_axes(ax, labels, vertical, label_rotation, label_max_length)
    # if we have stripplot, draw this
    if with_strip:
        _overlay_stripplot(_X, ax, 1, width, colors[0], vertical, outliers,
                           strip_jitter)
        _overlay_stripplot(_Y, ax, 2, width, colors[1], vertical, outliers,
                           strip_jitter)

    # if we have mann-whitney append this info
    if mannwhitney:
        # Ask for the two-sided p-value explicitly instead of doubling the
        # default result: newer SciPy releases already return a two-sided
        # p-value by default, so doubling would inflate it (even beyond 1).
        _, p = mannwhitneyu(_X, _Y, alternative="two-sided")
        star = _get_stars(p)
        # get dimensions to annotate
        joined = np.concatenate((_X, _Y))
        _max, _min = np.max(joined), np.min(joined)
        # the text sits 10% of the data range beyond the maximum
        text_pos = _max + np.abs(_max - _min) * 0.1

        # boxes are drawn at positions 1 and 2 on the categorical axis
        if vertical:
            bar_start, bar_end = (1, _max), (2, _max)
            text_xy = (1.5, text_pos)
        else:
            bar_start, bar_end = (_max, 2), (_max, 1)
            text_xy = (text_pos, 1.5)

        # draw the bracket joining the two boxes
        ax.annotate(
            "",
            xy=bar_start,
            xycoords="data",
            xytext=bar_end,
            textcoords="data",
            arrowprops=dict(arrowstyle="-",
                            ec="#666666",
                            connectionstyle="bar,fraction=0.2"),
        )
        # add mw text
        ax.text(
            text_xy[0],
            text_xy[1],
            star,
            horizontalalignment="center",
            verticalalignment="center",
        )

    return ax
Пример #5
0
def box1d(X: _ArrayLike,
          color: Optional[str] = None,
          label: Optional[str] = None,
          ax: Optional[mpl.axes.Axes] = None,
          with_strip: bool = False,
          vertical: bool = True,
          notch: bool = False,
          capsize: float = 1.0,
          outliers: bool = True,
          axis_scale: Optional[Union[str, Callable]] = None,
          grid: bool = True,
          width: float = 0.7,
          label_rotation: float = 0.0,
          label_max_length: int = 25,
          spines: Optional[_ListLike] = None,
          theme: str = "white_circle",
          **plot_kwargs):
    """Draw a single boxplot for one vector of values.

    Parameters
    ----------
    X : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw. Must be numeric.
    color : str, optional
        If None, uses a default color
    label : str, optional
        Axis label used when `X` is not a pandas.Series.
        When `X` is a pandas.Series its `name` is used instead.
    ax : matplotlib.ax object, optional, default=None
        If None, creates a plot.
    with_strip : bool, default=False
        If True, overlays a stripplot on the box (drawn at half opacity);
        `outliers` is forced to False in this case
    vertical : bool, default=True
        Determines whether to draw the plot vertically or horizontally
    notch : bool, default=False
        Determines whether to draw a notched plot
    capsize : float, default=1.0
        Defines the length of the caps
    outliers : bool, default=True
        If True, displays fliers as outliers
    axis_scale : str/callable, optional
        If given, the data is transformed through `_convert_x_scale` first.
        If str, use {'log', 'sqrt', 'log2'}
        If callable, must reference a `np.*` function which takes array X and returns X'
    grid : bool, default=True
        If True: draws gridlines for the numeric axis
    width : float, default=0.7
        Determines the width/height of the box. Must lie in [0, 1].
    label_rotation : float, default=0
        The degrees of rotation to the ticklabels
    label_max_length : int, default=25
        If any label exceeds this length, it truncates it
    spines : list, optional
        Defines which spines are to be visible; all four are shown if None
    theme : str, default="white_circle"
        Choose a 'theme' for the outliers, e.g. 'red_square', 'green_diamond'

    Other Parameters
    ----------------
    plot_kwargs : dict
        keyword arguments to pass to `ax.boxplot`

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-boxplot
    """
    # --- argument validation ---
    instance_check(X, (np.ndarray, pd.Series, list, tuple))
    instance_check((vertical, notch, outliers, grid, with_strip), bool)
    instance_check(spines, (type(None), list))
    instance_check(theme, str)
    instance_check((label, color), (type(None), str))
    instance_check((capsize, width), float)
    instance_check(label_rotation, (int, float))
    instance_check(label_max_length, int)
    bounds_check(width, 0.0, 1.0)

    # --- data preparation ---
    values = as_flattened_numpy(X)
    flier_style = _get_flier_style(theme)
    if axis_scale:
        values = _convert_x_scale(values, axis_scale)

    # --- resolve defaults ---
    if with_strip:
        # the strip already shows every point
        outliers = False
    if ax is None:
        figsize = (2.5, 5) if vertical else (5, 2.5)
        _, ax = plt.subplots(figsize=figsize)
    if spines is None:
        spines = ("left", "top", "right", "bottom")
    box_alpha = 0.5 if with_strip else 1.0

    # --- drawing ---
    patch_obj = ax.boxplot(values,
                           vert=vertical,
                           patch_artist=True,
                           showfliers=outliers,
                           notch=notch,
                           widths=width,
                           boxprops={"alpha": box_alpha},
                           flierprops=flier_style,
                           **plot_kwargs)
    _define_boxplot_arguments(ax, patch_obj, vertical, None, grid, spines,
                              capsize, axis_scale)
    color = _color_arrangement(ax, patch_obj, color)

    # a Series name wins over the explicit label
    axis_label = X.name if isinstance(X, pd.Series) else label
    _label_axes(ax, axis_label, vertical, label_rotation, label_max_length)

    if with_strip:
        _overlay_stripplot(values,
                           ax,
                           1,
                           width,
                           color,
                           vertical,
                           outliers,
                           strip_jitter=0.15)
    return ax
Пример #6
0
def histogram(X: _ArrayLike,
              kde: str = "freeform",
              bins: Optional[Union[int, _ListLike]] = None,
              density: bool = True,
              stat: bool = False,
              ax: Optional[mpl.axes.Axes] = None,
              x_label: str = "",
              title: str = "",
              kde_range: float = 1e-3,
              smoothen_kde: bool = True,
              verbose: int = 0,
              *hist_args,
              **hist_kwargs) -> mpl.axes.Axes:
    """Draws pretty histograms using `X`.

    Parameters
    ----------
    X : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw. Must be numeric.
    kde : str/tuple of str, optional, default="freeform"
        If None, does not draw a KDE plot
        If 'freeform': fits the best KDE to the points
        If 'auto': attempts to fit the best `continuous` distribution
        If list/tuple: uses 'auto' to fit the best distribution out of options
        else, choose from available distributions in `scipy.stats`
    bins : int or list-like, optional
        If None, the bins are obtained from `get_bins`
    density : bool, default=True
        If True, uses density approximation.
        Forced to True whenever a KDE is drawn.
    stat : bool, default=False
        If True, sets statistical variables (mean, sd, skew, kurtosis) in legend
    ax : matplotlib.ax object, optional, default=None
        If None, creates one.
    x_label : str, optional, default=""
        If empty, a label is generated by `_assign_x_label`.
    title : str, optional, default=""
        Title set on the axes.
    kde_range : float, default=1e-3
        Defines the precision on the KDE range if plotted between
        (kde_range, 1 - kde_range). Must be > 0.
    smoothen_kde : bool, default=True
        If discrete-distribution, applies smoothing function to KDE if True
    verbose : int, default=0
        If > 0, prints out useful messages

    Other Parameters
    ----------------
    hist_args : list
        Arguments to pass to `ax.hist`
    hist_kwargs : dict
        Keyword arguments to pass to `ax.hist`

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-histogram

    Raises
    ------
    ValueError
        If `kde` is a string that is neither 'auto', 'freeform' nor an
        attribute of `scipy.stats`.
    """
    instance_check(X, (np.ndarray, pd.Series, list, tuple))
    instance_check((density, stat, smoothen_kde), bool)
    instance_check((title, x_label), str)
    instance_check(kde, (str, type(None), list, tuple))
    instance_check(kde_range, float)
    bounds_check(verbose, 0, 4)

    # convert to numpy.
    _X = as_flattened_numpy(X)
    # make bins if set to None
    if bins is None:
        bins = get_bins(_X)
    if kde:
        # a density curve only lines up with a density-normalised histogram
        density = True
    if ax is None:
        _, ax = plt.subplots(figsize=(8, 5))

    # the summary statistics, when requested, become the legend label
    legend_kws = {}
    if stat:
        legend_kws["label"] = "mean: {:0.2f}, sd: {:0.3f},\n skew: {:0.3f} kurt: {:0.3f}".format(
            np.nanmean(_X), np.nanstd(_X), stats.skew(_X), stats.kurtosis(_X))

    # plot the histogram
    _plot_hist(_X,
               ax,
               *hist_args,
               bins=bins,
               density=density,
               rwidth=0.9,
               **legend_kws,
               **hist_kwargs)
    if stat:
        ax.legend(loc="best")

    ax.set_title(title)
    ax.set_ylabel("Density" if density else "Counts")

    auto_fitted = None
    model = None
    if kde is not None:
        if kde == "auto" or isinstance(kde, (list, tuple)):
            # score the candidates and keep the one with the highest `r`
            auto_fitted = auto_fit(_X, kde)
            best_model_ = auto_fitted.loc[auto_fitted["r"].idxmax()]
            kde_name = best_model_.name
        elif (kde == "freeform") or hasattr(stats, kde):
            kde_name = kde
        else:
            raise ValueError(
                "kde value '{}' not found in scipy.stats".format(kde))

        # honour the caller's `kde_range` rather than a hard-coded 1e-3
        x_kde, y_kde, model = univariate_kde(
            _X,
            bins,
            kde_name,
            kde_range=kde_range,
            smoothen_kde=smoothen_kde,
            verbose=verbose,
            return_dist=True,
        )
        # plot
        ax.plot(x_kde, y_kde, "-", color="r")

    if x_label == "":
        x_label = _assign_x_label(
            title,
            X.name if isinstance(X, pd.Series) else "",
            kde is not None,
            auto_fitted,
            model if not kde == "freeform" else None,
        )

    ax.set_xlabel(x_label)

    return ax
Пример #7
0
def scatter_slim(X: _ArrayLike,
                 Y: _ArrayLike,
                 bins: Optional[int] = None,
                 threshold: Union[int, float] = 50,
                 **turbo_kws):
    """
    Generates a slim-down scatterplot.

    This is useful where there are thousands of points overlapping, and for visualization and storage size,
    you only plot so many points within a given bin area.

    Parameters
    ----------
    X : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw on the x-axis. Flattens if np.ndarray
    Y : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw on the y-axis. Flattens if np.ndarray
    bins : int, optional
        Number of bins along each axis used to split the X,Y domain.
        If None, the average of the Freedman-Diaconis estimates for X and Y
        is used.
    threshold : int or float
        If int, the maximum number of points kept in each bin.
        If float, the proportion of points [0..1] to keep in each bin.
        Any other type keeps every point.
    turbo_kws : dict
        Keyword arguments to pass to `turb.plot.scatter`. All other arguments go to `ax.scatter`.

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-scatter

    Notes
    -----
    Points are subsampled with `np.random.choice`, so the result depends on
    the global NumPy random state.
    """

    # defines some turbo keywords, everything else is scatter_kws
    turbo_keys = {
        'c', 's', 'marker', 'dense', 'fit_line', 'ax', 'alpha', 'cmap',
        'legend', 'colorbar', 'with_jitter', 'x_label', 'y_label', 'x_scale',
        'y_scale', 'legend_outside', 'title', 'with_grid', 'fit_line_degree'
    }
    t_kws = {k: v for k, v in turbo_kws.items() if k in turbo_keys}
    mpl_kws = {k: v for k, v in turbo_kws.items() if k not in turbo_keys}

    # get subset where missing values from either are dropped
    _X = as_flattened_numpy(X)
    _Y = as_flattened_numpy(Y)
    # paired values
    _X, _Y = remove_na(_X, _Y, paired=True)

    # get the bins
    if bins is None:
        # average the per-axis estimates (integer division)
        bins = (freedman_diaconis_bins(_X) + freedman_diaconis_bins(_Y)) // 2
    else:
        # ensure its non-negative
        nonnegative(bins, int)

    # only the bin edges are needed; the counts are recomputed per bin below
    _, xs, ys = np.histogram2d(_X, _Y, bins=bins)

    # resolve how the threshold is interpreted once, outside the loops;
    # isinstance also recognises NumPy scalar types
    by_count = (isinstance(threshold, (int, np.integer))
                and not isinstance(threshold, bool))
    by_fraction = isinstance(threshold, (float, np.floating))

    last = bins - 1
    indices = []
    # loop through all the bins and compute a valid sample subset
    for i in range(bins):
        # every bin is half-open except the last, which must include its
        # right edge - otherwise the points at the maximum are lost
        below_hi = _X <= xs[i + 1] if i == last else _X < xs[i + 1]
        column = np.flatnonzero((_X >= xs[i]) & below_hi)
        if column.shape[0] == 0:
            continue
        # restrict the y comparison to the points inside this x-column
        y_col = _Y[column]
        for j in range(bins):
            below_hi = y_col <= ys[j + 1] if j == last else y_col < ys[j + 1]
            i_b = column[(y_col >= ys[j]) & below_hi]
            i_bn = i_b.shape[0]
            # empty bins contribute nothing
            if i_bn == 0:
                continue
            samp_size = i_bn
            if by_count:
                samp_size = min(i_bn, threshold)
            elif by_fraction:
                samp_size = min(i_bn, int(i_bn * threshold))
            # sample without replacement from the bin's members
            indices.append(np.random.choice(i_b, samp_size, replace=False))

    ni = np.hstack(indices)
    # x and y is now selected using ni
    return scatter(_X[ni], _Y[ni], **t_kws, **mpl_kws)
Пример #8
0
def scatter(X: _ArrayLike,
            Y: _ArrayLike,
            c: Union[str, _ArrayLike] = "k",
            marker: Union[str, _ArrayLike] = "o",
            s: Optional[Union[_Numeric, _ArrayLike]] = None,
            dense: bool = False,
            fit_line: bool = False,
            ax: Optional[mpl.axes.Axes] = None,
            alpha: Optional[float] = None,
            cmap: str = "viridis",
            legend: bool = True,
            colorbar: bool = True,
            with_jitter: bool = False,
            x_label: Optional[str] = None,
            y_label: Optional[str] = None,
            x_scale: str = "linear",
            y_scale: str = "linear",
            legend_outside: bool = False,
            title: str = "",
            with_grid: bool = False,
            fit_line_degree: int = 1,
            **scatter_kws):
    """Generates a scatterplot, with some useful features added to it.

    Parameters
    ----------
    X : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw on the x-axis. Flattens if np.ndarray
    Y : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw on the y-axis. Flattens if np.ndarray
    c : str/list/tuple/np.ndarray/pd.Series (1d), default='k'
        The colour of the points.
        If array, colors must be a categorical/valid float type, uses cmap
    marker : str/list/tuple/np.ndarray/pd.Series (1d), default='o'
        The marker style of the points.
        If type=list/array, array must be a categorical/str-like type to map to matplotlib markers
        If dense=True, treats each marker as a circle, ignores this input
    s : int/float/list/tuple/np.ndarray/pd.Series (1d), optional
        Size of each point.
        If dense=True, this value is set automatically.
        If type=list/array, array must be array of floats
    dense : bool
        If True, draws the uniform densities instead of the actual points
    fit_line : bool
        If True, draws a line of best fit on the data
    ax : matplotlib.ax.Axes, optional, default=None
        If None, creates one.
    alpha : float, optional
        Sets the alpha for colour. If dense is True, this value is set automatically
    cmap : str, default="viridis"
        The default colormap for continuous-valued c.
    legend : bool, default=True
        Draws a legend if the 'c' variable is discrete
    colorbar : bool, default=True
        Draws a colorbar if the 'c' variable is continuous
    with_jitter : bool, default=False
        If True, and dense=True, adds some jitter to the uniform points
    x_label : str, optional
        Label for the x-axis. If None and X is a pandas.Series, its name is used
    y_label : str, optional
        Label for the y-axis. If None and Y is a pandas.Series, its name is used
    x_scale : str, default="linear"
        Choose from {'linear', 'log', 'symlog', 'logit'}, see `matplotlib.ax.set_xscale`
    y_scale : str, default="linear"
        Choose from {'linear', 'log', 'symlog', 'logit'}, see `matplotlib.ax.set_yscale`
    legend_outside : bool, default=False
        If True, plots the legend outside the plot at (1, 1)
    title : str, default=""
        Optional title at the top of the axes
    with_grid : bool, default=False
        If True, draws a grid
    fit_line_degree : int, default=1
        If fit_line=True, Determines the degree to which a line is fitted to the data,
         allows polynomials

    Other Parameters
    ----------------
    scatter_kws : dict
        Keyword arguments to pass to `ax.scatter`

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-scatter

    Warns
    -----
    UserWarning
        If more than 15000 points remain after dropping missing values and
        `dense` is False.
    """

    instance_check((X, Y), (list, tuple, np.ndarray, Series))
    instance_check((c, marker), (str, list, tuple, np.ndarray, Series, Index))
    instance_check(
        s, (type(None), int, float, list, tuple, np.ndarray, Series, Index))
    instance_check(alpha, (type(None), float))
    instance_check(ax, (type(None), mpl.axes.Axes))
    instance_check(
        (dense, with_jitter, fit_line, with_grid, legend, legend_outside),
        bool)
    instance_check((x_label, y_label, title, x_scale, y_scale),
                   (type(None), str))
    instance_check(fit_line_degree, int)

    arrays_equal_size(X, Y)
    if isinstance(marker, str):
        belongs(marker, _marker_set())

    # get subset where missing values from either are dropped
    _X = as_flattened_numpy(X)
    _Y = as_flattened_numpy(Y)
    # remove values not found in both.
    _X, _Y = remove_na(_X, _Y, paired=True)

    # warn the user if n is large to maybe consider dense option?
    n_points = _X.shape[0]
    if n_points > 15000 and not dense:
        # report the cleaned array's size: `X` itself may be a plain
        # list/tuple, which has no `.shape`
        warn(
            "Data input n={} is large, consider setting dense=True or using function `scatter_slim`."
            .format(n_points),
            UserWarning,
        )

    # normalise list-like sizes/markers to arrays and check their lengths
    if isinstance(s, (list, tuple)) and not dense:
        s = as_flattened_numpy(s)
        arrays_equal_size(X, Y, s)
    if isinstance(marker, (list, tuple)) and not dense:
        marker = np.asarray(marker)
        arrays_equal_size(X, Y, marker)

    if not isinstance(c, str):
        # do some prep work on the color variable.
        palette, _cmode = cat_array_to_color(c, cmap=cmap)
        # perform size check
        arrays_equal_size(X, Y, palette)
    else:
        palette = c
        _cmode = "static"

    if ax is None:
        _, ax = plt.subplots(figsize=(8, 5))

    if dense:
        # alpha, s are set in this case
        alpha = 0.8
        marker = "o"
        # perform density plotting, capped at 50 bins per axis
        bins_x = min(freedman_diaconis_bins(_X), 50)
        bins_y = min(freedman_diaconis_bins(_Y), 50)
        # the bin counts become the marker sizes
        s, xs, ys = np.histogram2d(_X, _Y, bins=(bins_x, bins_y))
        # create a mesh from the lower edge of every bin
        # NOTE(review): `s` is indexed (x_bin, y_bin) whereas the default
        # meshgrid output is shaped (y_bin, x_bin) - confirm that
        # `_draw_scatter` accounts for this, else sizes may be transposed.
        xp, yp = np.meshgrid(xs[:-1], ys[:-1])
        if with_jitter:
            xp += np.random.rand(*xp.shape) / (_X.max() - _X.min())
            yp += np.random.rand(*yp.shape) / (_Y.max() - _Y.min())
    else:
        if alpha is None:
            alpha = _select_best_alpha(n_points)
        if s is None:
            s = _select_best_size(n_points)
        xp = _X
        yp = _Y

    # draw
    _ = _draw_scatter(xp,
                      yp,
                      palette,
                      s,
                      marker,
                      alpha,
                      ax,
                      cmap=cmap,
                      **scatter_kws)

    # optionally fit a line of best fit
    if fit_line:
        _draw_line_best_fit(_X, _Y, palette, ax, fit_line_degree)

    if with_grid:
        ax.grid()

    # associate legend if colour map is used
    if _cmode == "discrete" and legend:
        map_legend(c, palette, marker, ax, legend_outside)
    elif _cmode == "continuous" and colorbar:
        # add colorbar
        _make_colorbar(c, ax, cmap)

    # apply x-label, y-label, title; an explicit label wins over a Series name
    if isinstance(x_label, str):
        ax.set_xlabel(x_label)
    elif isinstance(X, Series):
        ax.set_xlabel(X.name)

    if isinstance(y_label, str):
        ax.set_ylabel(y_label)
    elif isinstance(Y, Series):
        ax.set_ylabel(Y.name)

    ax.set_xscale(x_scale)
    ax.set_yscale(y_scale)
    ax.set_title(title)

    return ax
Пример #9
0
def bar1d(
    X: _ArrayLike,
    Y: Optional[_ListLike] = None,
    c: Optional[Union[_ArrayLike, str]] = "k",
    vert: bool = True,
    sort: bool = True,
    ax: Optional[mpl.axes.Axes] = None,
    scale: str = "linear",
    annotate: bool = False,
    legend: bool = False,
    width: float = 0.8,
    label_rotation: float = 0.0,
    value_label: Optional[str] = None,
    sort_by: str = "values",
    cmap: str = "Blues",
    linesAt: Optional[Union[_Numeric, _ListLike]] = None,
):
    """Plots a 1 dimensional barplot.

    Parameters
    ----------
    X : list, tuple, np.ndarray, pd.Series
        Categorical/string/time labels for the data.
        If pandas.Series (and Y is None), the index supplies the labels and
        the values supply the bar heights; values must be numeric.
        If not a Series and Y is None, X is treated as the values and the
        labels are the positions 0..len(X)-1.
    Y : list, tuple, np.ndarray, optional
        If None, X must contain the data itself. Must be numeric dtype.
    c : str/list/tuple/np.ndarray/pd.Series (1d), optional
        Defines the colour of each bar.
        If str, colours all of the bars with the same
        If array, must be a categorical type.
        If None, uses an automatic qualitative palette
    vert : bool, default=True
        Determines whether the plot is vertical or horizontal
    sort : bool, default=True
        Sorts the data or labels
    ax : matplotlib.ax.Axes, optional, default=None
        If None, creates one.
    scale : str, default="linear"
        Determines how to scale the numeric axis.
    annotate : bool, default=False
        Determines whether values should be annotated
    legend : bool, default=False
        Choose whether to display a legend. Ignored when `c` is a str.
    width : float, default=0.8
        The width of each bar in the barplot
    label_rotation : float, default=0
        The degrees of rotation to the ticklabels
    value_label : str, optional
        Defines a name for the numerical axis.
        If None and X is a pandas.Series (with Y None), `X.name` is used.
    sort_by : str, default="values"
        Defines how to sort the data if sort=True.
        Choose from {'values', 'labels'}
    cmap : str, default="Blues"
        Defines a colormap if color values are specified
    linesAt : int, float, list, tuple, optional
        If set, defines one or more lines to add to the barplot

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-barplot

    Raises
    ------
    ValueError
        If `sort` is True and `sort_by` is not one of 'values', 'labels'.
    """
    belongs(sort_by, ("values", "labels"))

    # define plot if not set
    if ax is None:
        _, ax = plt.subplots(figsize=(8, 5))

    # X is either numerical (in which case there is no Y, or categorical labels)
    if Y is None:
        # in this case, X must contain all the data
        if isinstance(X, pd.Series):
            _labels = as_flattened_numpy(X.index)
            _values = as_flattened_numpy(X.values)
            _ticks = np.arange(X.shape[0])
            # only fall back to the series name: an explicitly passed
            # value_label must not be silently discarded
            if value_label is None:
                value_label = X.name
        else:
            _labels = _ticks = np.arange(len(X))
            _values = as_flattened_numpy(X)
    else:
        # X is labels, Y are numeric values (assume!)
        _labels = as_flattened_numpy(X)
        _values = as_flattened_numpy(Y)
        _ticks = np.arange(_labels.shape[0])

    # sort out colour
    pal = _determine_color_palette(c, _ticks.shape[0], cmap)

    # perform sorting here
    if sort:
        if sort_by == "values":
            _order = np.argsort(_values)
        elif sort_by == "labels":
            _order = np.argsort(_labels)
        else:
            raise ValueError(
                "sort_by '{}': must be ['values', 'labels']".format(sort_by)
            )
        # per-bar colours have to be reordered together with the data;
        # a single colour (str) or automatic palette (None) is left as-is
        if c is not None and not isinstance(c, str):
            _labels, _values, pal = _apply_data_sort(_order, _labels, _values, pal)
        else:
            _labels, _values = _apply_data_sort(_order, _labels, _values)

    # plot the bar
    _plot_bar_orient(
        ax,
        _ticks,
        _labels,
        _values,
        c=pal,
        w=width,
        vert=vert,
        lrot=label_rotation,
        annotate=annotate,
        lines=linesAt,
        vlabel=value_label,
    )
    # the numeric axis is y for vertical bars, x for horizontal bars
    if vert:
        ax.set_yscale(scale)
    else:
        ax.set_xscale(scale)

    # map a legend to it
    if legend and not isinstance(c, str):
        map_legend(c, pal, "o", ax, False)

    return ax
Пример #10
0
def annotate(X: Union[np.ndarray, Series, List, Tuple],
             Y: Union[np.ndarray, Series, List, Tuple],
             T: Union[np.ndarray, Series, List, Tuple],
             subset: Optional[Union[np.ndarray, Series, List, Tuple]] = None,
             ax: mpl.axes.Axes = None,
             word_shorten: Optional[int] = None,
             **annotate_kws):
    """Annotates a matplotlib plot with text.

    Offsets are pre-determined according to the scale of the plot: when X is
    of floating-point dtype, every x-position is increased by 1/30 of the
    range of X and every y-position is decreased by 1/30 of the range of Y.
    The inputs themselves are never modified.

    Parameters
    ----------
    X : list/tuple/np.ndarray/pd.Series (1d)
        The x-positions of the text.
    Y : list/tuple/np.ndarray/pd.Series (1d)
        The y-positions of the text.
    T : list/tuple/np.ndarray/pd.Series (1d)
        The text array.
    subset : list/tuple/np.ndarray/pd.Series (1d)
        An array of indices to select a subset from.
    ax : matplotlib.ax.Axes, optional, default=None
        If None, creates one.
    word_shorten : int, optional
        If not None, shortens annotated strings to be more concise and displayable

    Other Parameters
    ----------------
    annotate_kws : dict
        Other keywords to pass to `ax.annotate`

    Returns
    -------
    ax : matplotlib.ax.Axes
        The same matplotlib plot, or the one generated
    """
    instance_check((X, Y), (list, tuple, np.ndarray, Series))
    instance_check(T, (list, tuple, np.ndarray, Series, Index))
    instance_check(subset,
                   (type(None), list, tuple, np.ndarray, Series, Index))
    instance_check(ax, (type(None), mpl.axes.Axes))
    arrays_equal_size(X, Y, T)
    # convert to numpy; copies so the in-place offsets below never touch
    # the caller's data.
    _X = as_flattened_numpy(X).copy()
    _Y = as_flattened_numpy(Y).copy()
    _T = as_flattened_numpy(T)

    if word_shorten:
        _T = shorten(_T, newl=word_shorten)

    # nudge the text away from the points. Skipped for empty input, where
    # max()/min() on a zero-size array would raise.
    if _X.dtype.kind == "f" and _X.size > 0:
        _X += (_X.max() - _X.min()) / 30.0
        if _Y.dtype.kind in "iu":
            # an integer array cannot absorb a float offset in-place
            # (numpy refuses the float64 -> int cast), so promote first
            _Y = _Y.astype(float)
        _Y += -((_Y.max() - _Y.min()) / 30.0)

    if ax is None:
        _, ax = plt.subplots(figsize=(8, 5))
    if subset is None:
        for x, y, t in it.zip_longest(_X, _Y, _T):
            ax.annotate(t, xy=(x, y), **annotate_kws)
    else:
        for i in subset:
            ax.annotate(_T[i], xy=(_X[i], _Y[i]), **annotate_kws)

    return ax