Example #1
def shorten(s, newl: int = 15, strategy: str = "middle"):
    """Shortens a string or array of strings to length `newl`.

    Parameters
    ----------
    s : str / list of str / np.ndarray / pd.Series / pd.Index
        The string or list of strings to shorten
    newl : int, default=15
        The number of characters to preserve (5 on each side + spaces)
    strategy : str, default="middle"
        Choose from {'middle', 'end'}; determines where the ellipsis ('...') is placed

    Returns
    -------
    ns : str / list of str
        A shortened string or array of strings
    """
    instance_check(s, (str, list, tuple, np.ndarray, Series, Index))
    instance_check(newl, int)
    belongs(strategy, ("middle", "end"))

    if isinstance(s, str):
        return _shorten_string(s, newl, strategy)
    else:
        # create a partial passing in keyword arguments to every call.
        _shorten_part = partial(_shorten_string,
                                approp_len=newl,
                                strategy=strategy)
        # map through the strings and shorten them.
        return list(map(_shorten_part, s))
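A minimal usage sketch (hypothetical labels):
# shorten a list of long labels to 15 characters, placing the ellipsis in the middle
labels = ["very_long_column_name_alpha", "very_long_column_name_beta"]
short_labels = shorten(labels, newl=15, strategy="middle")
# a single string returns a single shortened string
short_single = shorten("an_extremely_long_identifier", newl=15, strategy="end")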
Example #2
def _generate_diag_like_grid(n, direction, ax_size):
    """ Direction is in [row, column]"""
    belongs(direction, ["row", "column"])

    f1, f2 = nearest_factors(n, shape="diag")
    axf1, axf2 = ax_size
    fmax, fmin = max(f1, f2), min(f1, f2)
    # get longest one
    tup, nc, nr = (
        ((axf1 * fmin, axf2 * fmax), fmin, fmax)
        if direction == "row"
        else ((axf1 * fmax, axf2 * fmin), fmax, fmin)
    )
    fig, axes = plt.subplots(ncols=nc, nrows=nr, figsize=tup)
    axes = _clean_axes_objects(n, axes)
    return fig, axes
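A minimal sketch of how this helper might be called (illustrative values; assumes the returned axes form a flat iterable):
fig, axes = _generate_diag_like_grid(6, "row", ax_size=(3, 3))
for ax in axes:
    ax.plot([0, 1], [0, 1])
fig.tight_layout()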
Example #3
def bleep(_func=None, *, note="C") -> Callable:
    """Provides automatic sound release when a function has completed.

    .. note:: this requires the `simpleaudio` package to run.

    Note chord progression is played at the *end* of the function, and not the start.

    Parameters
    ----------
    note : str
        Must be one of the notes 'A' through 'G'

    Examples
    --------
    >>> from turbopanda.dev import bleep
    >>> @bleep
    ... def f(x):
    ...     # compute some long function here
    ...     pass
    """
    belongs(note, list(_get_notes_all()))

    # define decorator
    def _decorator_wrap(func):
        @wraps(func)
        def _bleep_function(*args, **kwargs):
            # enter try-catch and if success, positive noise, or failure, negative noise.
            try:
                result = func(*args, **kwargs)
                # make positive noise
                _play_arpeggio(note.upper(), key="major")
                # return
                return result
            except Exception as e:
                # make negative noise
                _play_arpeggio(note.upper(), key="minor")
                print(e.args)

        return _bleep_function

    if _func is None:
        return _decorator_wrap
    else:
        return _decorator_wrap(_func)
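A usage sketch, assuming the `simpleaudio` dependency is installed (the arpeggio plays once the function returns or raises):
@bleep(note="E")
def long_running_job():
    # stand-in for an expensive computation
    return 42

long_running_job()  # plays a major arpeggio on success, a minor one on failure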
Example #4
def rename_axis(self,
                ops: Tuple[str, str],
                selector: Optional[SelectorType] = None,
                axis: int = 1):
    """Perform a chain of .str.replace operations on one of the axes.

    .. note:: strings that are unchanged remain the same (are not NA'd).

    Parameters
    ----------
    ops : list of tuple (2,)
        The first value of each tuple is the string to find; the second is its replacement.
        At this stage only *direct* replacements are accepted; no regex.
        Operations are performed in order.
    selector : None, str, or tuple args, optional
        Contains either types, meta column names, column names or regex-compliant strings
        If None, all column names are subject to potential renaming
    axis : int, optional
        Choose from {1, 0} 1 = columns, 0 = index.

    Returns
    -------
    self
    """
    # check ops is right format
    is_twotuple(ops)
    belongs(axis, [0, 1])

    curr_cols = sel_cols = inspect(self.df_,
                                   self.meta_,
                                   self.selectors_,
                                   selector,
                                   axis=axis,
                                   mode="view")
    # perform the replacement operations on the selected column names
    curr_cols = string_replace(curr_cols, ops)
    # rename using mapping
    _rename_axis(self.df_, self.meta_, sel_cols, curr_cols, axis=axis)
    return self
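A sketch of a typical call, assuming `mdf` is a MetaPanda instance (replacement pairs are illustrative):
# strip a prefix and replace spaces with underscores across the column axis
mdf.rename_axis([("prefix_", ""), (" ", "_")], selector=None, axis=1)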
Example #5
def save(
    fig_obj: plt.Figure,
    plot_type: str,
    name: str = "example1",
    save_types: Tuple[str, ...] = ("png", "pdf"),
    fp: str = "./",
    dpi: int = 360,
    savemode: str = "first",
) -> bool:
    """Saves a matplotlib figure in many formats.

    Given a matplotlib.Figure object, save appropriate numbers of Figures to the respective
    folders.

    Parameters
    ----------
    fig_obj : plt.Figure
        The figure object to save.
    plot_type : str
        Choose from:
            {"scatter", "kde", "heatmap", "cluster", "bar", "hist", "kde", "quiver",
            "box", "line", "venn", "multi", "pie"}
    name : str, optional
        The name of the file, this may be added to based on the other parameters
    save_types : tuple of str, optional
        Choose any from {"png", "pdf", "svg", "eps", "ps"}
    fp : str, optional
        The file path to the root directory of saving images
    dpi : int, optional
        The resolution in dots per inch; set to high if you want a good image
    savemode : str, optional
        Choose from {'first', 'update'}
            if first, only saves if file isn't present
            if update, overrides saved figure if present

    Warnings
    --------
    UserWarning
        If figure file itself already exists

    Raises
    ------
    IOError
        If the filepath does not exist
    TypeError
        If the arguments do not match their declared type
    ValueError
        If `plot_type` or `savemode` is not an acceptable value

    Returns
    -------
    success : bool
        Whether it was successful or not
    """
    accepted_types = (
        "scatter",
        "kde",
        "heatmap",
        "cluster",
        "bar",
        "hist",
        "kde",
        "quiver",
        "box",
        "line",
        "venn",
        "multi",
        "pie",
    )
    file_types_supported = ("png", "pdf", "svg", "eps", "ps")
    accepted_savemodes = ("first", "update")

    instance_check(fig_obj, (plt.Figure, mpl.figure.Figure))
    instance_check(name, str)
    instance_check(fp, str)
    belongs(plot_type, accepted_types)
    belongs(savemode, accepted_savemodes)

    for st in save_types:
        if st not in file_types_supported:
            TypeError("save_type: [%s] not supported" % st)

    # correct to ensure filepath has / at end
    if not fp.endswith("/"):
        fp += "/"

    # check whether the filepath exists
    if os.path.exists(fp):
        for t in save_types:
            # if the directory does not exist, create it!
            if not os.path.isdir(fp + "_" + t):
                os.mkdir(fp + "_" + t)
            # check if the figures themselves already exist.
            filename = "{}_{}/{}_{}.{}".format(fp, t, plot_type, name, t)
            if os.path.isfile(filename):
                warnings.warn(
                    "Figure: '{}' already exists: Using savemode: {}".format(
                        filename, savemode),
                    UserWarning,
                )
                if savemode == "update":
                    fig_obj.savefig(filename,
                                    format=t,
                                    bbox_inches="tight",
                                    dpi=dpi)
            else:
                # make the file
                fig_obj.savefig(filename,
                                format=t,
                                bbox_inches="tight",
                                dpi=dpi)
    else:
        raise IOError("filepath: [%s] does not exist." % fp)
    return True
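A minimal sketch, assuming matplotlib.pyplot is imported as plt (writes e.g. ./_png/scatter_example1.png and ./_pdf/scatter_example1.pdf provided ./ exists):
fig, ax = plt.subplots()
ax.scatter([1, 2, 3], [4, 5, 6])
save(fig, "scatter", name="example1", save_types=("png", "pdf"), fp="./")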
Example #6
def partial_bicorr(data: pd.DataFrame,
                   x: str,
                   y: str,
                   covar: Union[str, List[str], Tuple[str, ...], pd.Index],
                   method: str = "spearman",
                   tail: str = "two-sided",
                   output: str = 'score') -> Union[float, dict]:
    """Partial and semi-partial correlation.

    Adapted from the `pingouin` library, made by Raphael Vallat.

    .. [1] https://github.com/raphaelvallat/pingouin/blob/master/pingouin/correlation.py

    Parameters
    ----------
    data : pd.DataFrame
        The full dataset including covariates.
    x, y : str
        Names of the x and y columns in ``data``.
    covar : list of str
        Covariate(s). Column names of the covariates.
            covar must be made of continuous columns.
            If x, y are not continuous, will perform logistic regression
            to generate residuals.
    method : string
        Specify which method to use for the computation of the correlation
        coefficient. Available methods are ::
        'pearson' : Pearson product-moment correlation
        'spearman' : Spearman rank-order correlation
        'biserial' : Biserial correlation (continuous and boolean data)
        'kendall' : Kendall’s tau (ordinal data)
        'percbend' : percentage bend correlation (robust)
        'shepherd' : Shepherd's pi correlation (robust Spearman)
        'skipped' : skipped correlation (robust Spearman, requires sklearn)
    tail : string
        Specify whether to return the 'one-sided' or 'two-sided' p-value.
    output : str, default='score'
        Determines whether to display the full output or
            just the correlation (r) score
            options are {'score', 'full'}.

    Returns
    -------
    stats : float/dict
        Test summary ::
        'n' : Sample size (after NaN removal)
        'outliers' : number of outliers (only for 'shepherd' or 'skipped')
        'r' : Correlation coefficient
        'CI95' : 95% parametric confidence intervals
        'r2' : R-squared
        'adj_r2' : Adjusted R-squared
        'method' : pearson/spearman/biserial... etc
        'p-val' : one or two tailed p-value
        'BF10' : Bayes Factor of the alternative hypothesis (Pearson only)
        'power' : achieved power of the test (= 1 - type II error).

    Notes
    -----
    From [4]_:
    “With *partial correlation*, we find the correlation between :math:`x`
    and :math:`y` holding :math:`C` constant for both :math:`x` and
    :math:`y`. Sometimes, however, we want to hold :math:`C` constant for
    just :math:`x` or just :math:`y`. In that case, we compute a
    *semi-partial correlation*. A partial correlation is computed between
    two residuals. A semi-partial correlation is computed between one
    residual and another raw (or unresidualized) variable.”
    Note that if you are not interested in calculating the statistics and
    p-values but only the partial correlation matrix, a (faster)
    alternative is to use the :py:func:`pingouin.pcorr` method (see example 4).
    Rows with missing values are automatically removed from data. Results have
    been tested against the `ppcor` R package.

    References
    ----------
    .. [2] https://en.wikipedia.org/wiki/Partial_correlation
    .. [3] https://cran.r-project.org/web/packages/ppcor/index.html
    .. [4] https://gist.github.com/fabianp/9396204419c7b638d38f
    .. [5] http://faculty.cas.usf.edu/mbrannick/regression/Partial.html
    """
    # perform all checks in the public method..
    instance_check(data, pd.DataFrame)
    instance_check((x, y), str)
    instance_check(covar, (str, list, tuple, pd.Index))
    belongs(tail, ("one-sided", "two-sided"))
    belongs(
        method,
        (
            "pearson",
            "spearman",
            "kendall",
            "biserial",
            "percbend",
            "shepherd",
            "skipped",
        ),
    )
    belongs(output, ('score', 'full'))
    # perform a check to make sure every column in `covar`
    # is continuous.
    if not is_dataframe_float(data[covar]):
        raise TypeError("`covar` variables in `partial_bicorr` "
                        "all must be of type `float`/continuous.")

    return _partial_bicorr_inner(data,
                                 x,
                                 y,
                                 covar,
                                 tail=tail,
                                 method=method,
                                 output=output)
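A hypothetical call on a small synthetic DataFrame, assuming numpy/pandas are imported as np/pd (column names are illustrative):
rng = np.random.RandomState(0)
df = pd.DataFrame(rng.normal(size=(100, 3)), columns=["x", "y", "z"])
r = partial_bicorr(df, "x", "y", covar="z", method="spearman")       # r score only
full = partial_bicorr(df, "x", "y", covar=["z"], output="full")      # full summary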
Example #7
def bicorr(x: pd.Series,
           y: pd.Series,
           method: str = "spearman",
           tail: str = "two-sided",
           output: str = "score") -> Union[float, dict]:
    """(Robust) correlation between two variables.

    Adapted from the `pingouin` library, made by Raphael Vallat.

    .. [1] https://github.com/raphaelvallat/pingouin/blob/master/pingouin/correlation.py

    Parameters
    ----------
    x, y : pd.Series
        First and second set of observations. x and y must be independent.
    method : str
        Specify which method to use for the computation of the correlation
        coefficient. Available methods are ::
        'pearson' : Pearson product-moment correlation
        'spearman' : Spearman rank-order correlation
        'kendall' : Kendall’s tau (ordinal data)
        'biserial' : Biserial correlation (continuous and boolean data)
        'percbend' : percentage bend correlation (robust)
        'shepherd' : Shepherd's pi correlation (robust Spearman)
        'skipped' : skipped correlation (robust Spearman, requires sklearn)
    tail : str
        Specify whether to return 'one-sided' or 'two-sided' p-value.
    output : str, default='score'
        Determines whether to display the full output or
            just the correlation (r) score
            options are {'score', 'full'}.

    Returns
    -------
    stats : float/dict
        Test summary ::
        'n' : Sample size (after NaN removal)
        'outliers' : number of outliers (only for 'shepherd' or 'skipped')
        'r' : Correlation coefficient
        'CI95' : 95% parametric confidence intervals
        'r2' : R-squared
        'adj_r2' : Adjusted R-squared
        'method' : pearson/spearman/biserial... etc
        'p-val' : one or two tailed p-value
        'power' : achieved power of the test (= 1 - type II error).

    Notes
    -----
    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed. Correlations of -1 or +1 imply
    an exact linear relationship.
    The Spearman correlation is a nonparametric measure of the monotonicity of
    the relationship between two datasets. Unlike the Pearson correlation,
    the Spearman correlation does not assume that both datasets are normally
    distributed. Correlations of -1 or +1 imply an exact monotonic
    relationship.
    Kendall’s tau is a measure of the correspondence between two rankings.
    Values close to 1 indicate strong agreement, values close to -1 indicate
    strong disagreement.
    The percentage bend correlation [1]_ is a robust method that
    protects against univariate outliers.
    The Shepherd's pi [2]_ and skipped [3]_, [4]_ correlations are both robust
    methods that returns the Spearman's rho after bivariate outliers removal.
    Note that the skipped correlation requires that the scikit-learn
    package is installed (for computing the minimum covariance determinant).
    Please note that rows with NaN are automatically removed.

    References
    ----------
    .. [1] Wilcox, R.R., 1994. The percentage bend correlation coefficient.
       Psychometrika 59, 601–616. https://doi.org/10.1007/BF02294395
    .. [2] Schwarzkopf, D.S., De Haas, B., Rees, G., 2012. Better ways to
       improve standards in brain-behavior correlation analysis. Front.
       Hum. Neurosci. 6, 200. https://doi.org/10.3389/fnhum.2012.00200
    .. [3] Rousselet, G.A., Pernet, C.R., 2012. Improving standards in
       brain-behavior correlation analyses. Front. Hum. Neurosci. 6, 119.
       https://doi.org/10.3389/fnhum.2012.00119
    .. [4] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust correlation
       analyses: false positive and power validation using a new open
       source matlab toolbox. Front. Psychol. 3, 606.
       https://doi.org/10.3389/fpsyg.2012.00606
    """
    # perform all checks in the public method.. rather than repeating them internally.
    # check type
    instance_check((x, y), pd.Series)
    belongs(tail, ("one-sided", "two-sided"))
    belongs(
        method,
        (
            "pearson",
            "spearman",
            "kendall",
            "biserial",
            "percbend",
            "shepherd",
            "skipped",
        ),
    )
    belongs(output, ('score', 'full'))
    # Check size
    if x.shape[0] != y.shape[0]:
        raise ValueError("x and y must have the same length.")

    if output == "score":
        return _bicorr_inner_score(x, y, method)
    else:
        return _bicorr_inner_full(x, y, method, tail=tail)
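A hypothetical call on two synthetic Series, assuming numpy/pandas are imported as np/pd:
rng = np.random.RandomState(1)
x = pd.Series(rng.normal(size=200), name="x")
y = pd.Series(0.5 * x + rng.normal(size=200), name="y")
r = bicorr(x, y, method="pearson")                        # correlation score only
summary = bicorr(x, y, method="spearman", output="full")  # full test summary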
Example #8
def hist_grid(mdf: Union[DataFrame, "MetaPanda"],
              subset: SelectorType,
              arrange: str = "square",
              plot_size: int = 3,
              shared_dist: str = "auto",
              savepath: Optional[Union[str, bool]] = None,
              **hist_kws):
    """
    Plots a grid of histograms comparing the distributions in a MetaPanda
    selector.

    Parameters
    ----------
    mdf : pd.DataFrame or turb.MetaPanda
        The dataset
    subset : str or list/tuple of str
        Contains either types, meta column names, column names or regex-compliant strings
    arrange : str
        Choose from ['square', 'row', 'column']. Square arranges the plot as square-like as possible. Row
        prioritises plots row-like, and column-wise for column.
    plot_size : int, default=3
        The size of each axes
    shared_dist : str/tuple of str/dict, default="auto"
        Determines which KDE to fit to the data; set to None if you don't want one
        If tuple/list: attempts using these specified distributions
        If dict: maps column name (k) to distribution choice (v)
    savepath : None, bool, str
        Saves the figure to file. If bool, uses the name in `mdf`; if str, uses the given path; if None, the figure is not saved.

    Other Parameters
    ----------------
    hist_kws : dict
        Keywords to pass to `turb.plot.histogram`

    Returns
    -------
    None
    """
    # checks
    instance_check(shared_dist, (type(None), str, list, tuple, dict))
    instance_check(savepath, (type(None), str, bool))
    nonnegative(plot_size, int)
    belongs(arrange, ["square", "row", "column"])
    # make a metapanda if we have a dataframe.
    _mdf = MetaPanda(mdf) if isinstance(mdf, DataFrame) else mdf

    # get selector
    selection = _mdf.view(subset)
    # assuming we've selected something...
    if selection.size > 0:
        fig, axes = gridplot(len(selection), arrange, ax_size=plot_size)

        if not isinstance(shared_dist, dict):
            for i, x in enumerate(selection):
                _ = histogram(_mdf[x].dropna(),
                              ax=axes[i],
                              title=x,
                              kde=shared_dist,
                              **hist_kws)
            fig.tight_layout()
        else:
            for i, (x, d) in enumerate(shared_dist.items()):
                _ = histogram(_mdf[x].dropna(),
                              ax=axes[i],
                              title=x,
                              kde=d,
                              **hist_kws)
            # iterate over any 'remaining' columns in selection and handle appropriately
            remaining = difference(selection, tuple(shared_dist.keys()))
            if remaining.shape[0] > 0:
                for i, x in enumerate(remaining):
                    _ = histogram(_mdf[x].dropna(),
                                  ax=axes[i + len(shared_dist)],
                                  title=x,
                                  kde="auto",
                                  **hist_kws)
            fig.tight_layout()

        if isinstance(savepath, bool):
            save(fig, "hist", _mdf.name_)
        elif isinstance(savepath, str):
            save(fig, "hist", _mdf.name_, fp=savepath)
Example #9
def best_model(cv_results,
               y_var: str = "test",
               minimize: bool = True,
               score: str = "RMSE",
               **box_kws):
    """Determines the best model (min or max) and plots the boxplot of all resulting best models.

    Parameters
    ----------
    cv_results : MetaPanda
        The results from a call to `fit_grid`.
    y_var : str
        Choose from {'test', 'train'}
        If 'test': draws the test score
        If 'train': draws the training score
    minimize : bool
        If True, selects best smallest score, else select best largest score
    score : str
        The name of the scoring function
    box_kws : dict, optional
        Keyword arguments to pass to `plt.boxplot`.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object
    """
    instance_check(minimize, bool)
    instance_check(score, str)
    belongs(y_var, ("train", "test"))

    sely = pattern("mean_%s_score" % y_var, cv_results.columns, False)
    # create figures
    fig = plt.figure(figsize=(8, 5))
    ax = fig.add_subplot(111)
    # downcast to the underlying DataFrame if a MetaPanda was given
    res = cv_results.df_ if not isinstance(cv_results,
                                           pd.DataFrame) else cv_results
    # transform.
    if res[sely].squeeze().mean() < 0.0:
        res = res.pipe(absolute, "(?:split[0-9]+|mean)_(?:train|test)_score")
    # for each 'model', arrange data into boxplot
    if minimize:
        indices = res.groupby("model")[sely].idxmin()
    else:
        indices = res.groupby("model")[sely].idxmax()
    # arrange data (`res` is a plain DataFrame at this point)
    result_p = res.loc[indices, pattern("split[0-9]+_%s_score" % y_var, res.columns, False)]
    # reorder based on the best score
    re_order = result_p.median(axis=1).sort_values()
    result_p = result_p.reindex(re_order.index)
    # get best score name
    indices = switcheroo(indices).reindex(re_order.index)
    # plot
    bp = ax.boxplot(result_p, patch_artist=True, showfliers=False, **box_kws)
    # fetch package names and map them to colors - returned as pd.Series
    packages = find_model_family(indices.values)
    # map colors to each of the packages.
    mapping = dictzip(set_like(packages),
                      color_qualitative(len(set_like(packages))))
    mapped_cols = packages.map(mapping)
    # iterate over boxes and colour
    for box, col in zip(bp["boxes"], mapped_cols):
        box.set(facecolor=col, linewidth=1.2)
    plt.setp(bp["medians"], linewidth=1.5)
    # additional box requirements
    ax.set_xlabel("Model")
    ax.set_ylabel("%s %s" % (y_var, score))
    ax.set_xticklabels(indices.values)
    ax.tick_params("x", rotation=45)
    ax.grid()
    for tick in ax.get_xmajorticklabels():
        tick.set_horizontalalignment("right")
    # generate legend
    ax.legend(legend_line(mapping),
              list(mapping.keys()),
              bbox_to_anchor=(1.03, 1.03))
    plt.show()

    return fig
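A sketch of a typical call, assuming `cv_results` was produced by a prior `fit_grid` run:
fig = best_model(cv_results, y_var="test", minimize=True, score="RMSE")
fig.savefig("best_models.png", dpi=300)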
Example #10
def cachedec(
    _func=None,
    *,
    filename: str = "example1.pkl",
    compress: int = 0,
    return_as: str = "MetaPanda"
) -> Callable:
    """Provides automatic decorator caching for objects.

    Especially compatible with `turb.MetaPanda` or `pd.DataFrame`.

    .. note:: this is a decorator function, not to be called directly. All parameters
       must be passed as keyword arguments.

    Parameters
    ----------
    _func : callable, optional
        The decorated function; filled automatically when used as a bare decorator.
    filename : str, optional
        The name of the file to cache to, or read from. This is fixed.
         Accepts {'json', 'csv', 'pkl'} extensions only.
    compress : int [0-9] or 2-tuple, optional
        Optional compression level for the data. 0 or False is no compression.
        Higher value means more compression, but also slower read and
        write times. Using a value of 3 is often a good compromise.
        See the notes for more details.
        If compress is True, the compression level used is 3.
        If compress is a 2-tuple, the first element must correspond to a string
        between supported compressors (e.g 'zlib', 'gzip', 'bz2', 'lzma'
        'xz'), the second element must be an integer from 0 to 9, corresponding
        to the compression level.
    return_as : str, default="MetaPanda"
        Accepts {'pandas', 'MetaPanda'}
        Only applies if filename is "csv" or "json". Attempts to cast the return object
        as something palatable to the user.

    Warnings
    --------
    ImportWarning
        Returned object from cache isn't of type {pd.DataFrame, MetaPanda}

    Raises
    ------
    TypeError
        `filename` isn't of type `str`
    ValueError
        `filename` extension isn't found in {'json', 'csv', 'pkl'}

    Returns
    -------
    mp : turb.MetaPanda / object
        The MetaPanda object if {'csv' or 'json'}, otherwise uses
        serialized pickling which can return an arbitrary object.

    Examples
    --------
    For example, we call as a decorator to our custom function:
    >>> from turbopanda import cachedec
    >>> @cachedec(filename='meta_file.json')
    ... def f(x):
    ...     return turb.MetaPanda(x)
    These also work with numpy arrays or python objects by using `joblib`:
    >>> @cachedec("meta.pkl")
    >>> def g(x):
    ...     return [1, 2, [3, 4], {"hi":"moto"}]
    """
    # check it is string
    instance_check(filename, str)
    file_ext = filename.rsplit(".")[-1]
    # check that file ends with json or csv
    belongs(file_ext, ("json", "csv", "pkl"))

    # define decorator
    def _decorator_cache(func):
        """Basic decorator."""

        @functools.wraps(func)
        def _wrapper_cache(*args, **kwargs):
            # if we find the file
            if os.path.isfile(filename):
                # if its .csv or .json, use `read`
                if file_ext in ("json", "csv"):
                    # read it in
                    mdf = read(filename)
                    _set_index_def(mdf.df_)
                    if return_as == "MetaPanda":
                        return mdf
                    else:
                        return mdf.df_
                else:
                    if is_joblib_installed(raise_error=True):
                        import joblib

                        mdf = joblib.load(filename)
                        return mdf
            else:
                # returns MetaPanda or pandas.DataFrame
                mpf = func(*args, **kwargs)
                if isinstance(mpf, MetaPanda):
                    # save file
                    mpf.write(filename)
                    if return_as == "MetaPanda":
                        return mpf
                    else:
                        return mpf.df_
                elif isinstance(mpf, DataFrame):
                    # save - bumping index into the file.
                    mpf.reset_index().to_csv(filename, index=None)
                    if return_as == "MetaPanda":
                        return MetaPanda(mpf)
                    else:
                        return mpf
                else:
                    if is_joblib_installed(raise_error=True):
                        import joblib
                        # attempt to use joblib to dump
                        joblib.dump(mpf, filename, compress=compress)
                        return mpf

        return _wrapper_cache

    if _func is None:
        return _decorator_cache
    else:
        return _decorator_cache(_func)
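A usage sketch with a hypothetical long-running function (the first call computes and pickles via joblib; later calls read the cache):
@cachedec(filename="expensive_result.pkl")
def expensive(n):
    # stand-in for a slow computation
    return [i ** 2 for i in range(n)]

result = expensive(10)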
Example #11
def scatter(X: _ArrayLike,
            Y: _ArrayLike,
            c: Union[str, _ArrayLike] = "k",
            marker: Union[str, _ArrayLike] = "o",
            s: Optional[Union[_Numeric, _ArrayLike]] = None,
            dense: bool = False,
            fit_line: bool = False,
            ax: Optional[mpl.axes.Axes] = None,
            alpha: Optional[float] = None,
            cmap: str = "viridis",
            legend: bool = True,
            colorbar: bool = True,
            with_jitter: bool = False,
            x_label: Optional[str] = None,
            y_label: Optional[str] = None,
            x_scale: str = "linear",
            y_scale: str = "linear",
            legend_outside: bool = False,
            title: str = "",
            with_grid: bool = False,
            fit_line_degree: int = 1,
            **scatter_kws):
    """Generates a scatterplot, with some useful features added to it.

    Parameters
    ----------
    X : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw on the x-axis. Flattens if np.ndarray
    Y : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw on the y-axis. Flattens if np.ndarray
    c : str/list/tuple/np.ndarray/pd.Series (1d), default='k'
        The colour of the points.
        If array, colors must be a categorical/valid float type, uses cmap
    marker : str/list/tuple/np.ndarray/pd.Series (1d), default='o'
        The marker style of the points.
        If type=list/array, array must be a categorical/str-like type to map to matplotlib markers
        If dense=True, treats each marker as a circle, ignores this input
    s : int/float/list/tuple/np.ndarray/pd.Series (1d), optional
        Size of each point.
        If dense=True, this value is set automatically.
        If type=list/array, array must be array of floats
    dense : bool
        If True, draws the uniform densities instead of the actual points
    fit_line : bool
        If True, draws a line of best fit on the data
    ax : matplotlib.ax.Axes, optional, default=None
        If None, creates one.
    alpha : float, optional
        Sets the alpha for colour. If dense is True, this value is set automatically
    cmap : str, default="viridis"
        The default colormap for continuous-valued c.
    legend : bool, default=True
        Draws a legend if the 'c' variable is discrete
    colorbar : bool, default=True
        Draws a colorbar if the 'c' variable is continuous
    with_jitter : bool, default=False
        If True, and dense=True, adds some jitter to the uniform points
    x_label : str, optional
        The x-axis label; if None and X is a pandas.Series, uses X.name
    y_label : str, optional
        The y-axis label; if None and Y is a pandas.Series, uses Y.name
    x_scale : str, default="linear"
        Choose from {'linear', 'log', 'symlog', 'logit'}, see `matplotlib.ax.set_xscale`
    y_scale : str, default="linear"
        Choose from {'linear', 'log', 'symlog', 'logit'}, see `matplotlib.ax.set_yscale`
    legend_outside : bool, default=False
        If True, plots the legend outside the plot at (1, 1)
    title : str, default=""
        Optional title at the top of the axes
    with_grid : bool, default=False
        If True, draws a grid
    fit_line_degree : int, default=1
        If fit_line=True, determines the degree to which a line is fitted to the data;
        allows polynomials

    Other Parameters
    ----------------
    scatter_kws : dict
        Keyword arguments to pass to `ax.scatter`

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-scatter
    """

    instance_check((X, Y), (list, tuple, np.ndarray, Series))
    instance_check((c, marker), (str, list, tuple, np.ndarray, Series, Index))
    instance_check(
        s, (type(None), int, float, list, tuple, np.ndarray, Series, Index))
    instance_check(alpha, (type(None), float))
    instance_check(ax, (type(None), mpl.axes.Axes))
    instance_check(
        (dense, with_jitter, fit_line, with_grid, legend, legend_outside),
        bool)
    instance_check((x_label, y_label, title, x_scale, y_scale),
                   (type(None), str))
    instance_check(fit_line_degree, int)

    arrays_equal_size(X, Y)
    if isinstance(marker, str):
        belongs(marker, _marker_set())

    # get subset where missing values from either are dropped
    _X = as_flattened_numpy(X)
    _Y = as_flattened_numpy(Y)
    # remove values not found in both.
    _X, _Y = remove_na(_X, _Y, paired=True)

    # warn the user if n is large to maybe consider dense option?
    if _X.shape[0] > 15000 and not dense:
        warn(
            "Data input n={} is large, consider setting dense=True or using function `scatter_slim`."
            .format(_X.shape[0]),
            UserWarning,
        )

    # reconfigure colors if qualitative
    if isinstance(s, (list, tuple)) and not dense:
        s = as_flattened_numpy(s)
        arrays_equal_size(X, Y, s)
    if isinstance(marker, (list, tuple)) and not dense:
        marker = np.asarray(marker)
        arrays_equal_size(X, Y, marker)

    if not isinstance(c, str):
        # do some prep work on the color variable.
        palette, _cmode = cat_array_to_color(c, cmap=cmap)
        # perform size check
        arrays_equal_size(X, Y, palette)
    else:
        palette = c
        _cmode = "static"

    if ax is None:
        fig, ax = plt.subplots(figsize=(8, 5))

    if dense:
        # alpha, s are set in this case
        alpha = 0.8
        marker = "o"
        # perform density plotting
        bins_x = min(freedman_diaconis_bins(_X), 50)
        bins_y = min(freedman_diaconis_bins(_Y), 50)
        # estimate counts using histogram2d
        s, xs, ys = np.histogram2d(_X, _Y, bins=(bins_x, bins_y))
        # create a mesh
        xp, yp = np.meshgrid(xs[:-1], ys[:-1])
        if with_jitter:
            xp += np.random.rand(*xp.shape) / (_X.max() - _X.min())
            yp += np.random.rand(*yp.shape) / (_Y.max() - _Y.min())
    else:
        if alpha is None:
            alpha = _select_best_alpha(_X.shape[0])
        if s is None:
            s = _select_best_size(_X.shape[0])
        xp = _X
        yp = _Y

    # draw
    _ = _draw_scatter(xp,
                      yp,
                      palette,
                      s,
                      marker,
                      alpha,
                      ax,
                      cmap=cmap,
                      **scatter_kws)

    # optionally fit a line of best fit
    if fit_line:
        _draw_line_best_fit(_X, _Y, palette, ax, fit_line_degree)

    if with_grid:
        ax.grid()

    # associate legend if colour map is used
    if _cmode == "discrete" and legend:
        map_legend(c, palette, marker, ax, legend_outside)
    elif _cmode == "continuous" and colorbar:
        # add colorbar
        _make_colorbar(c, ax, cmap)

    # apply x-label, y-label, title
    if isinstance(x_label, str):
        ax.set_xlabel(x_label)
    elif isinstance(X, Series):
        ax.set_xlabel(X.name)

    if isinstance(y_label, str):
        ax.set_ylabel(y_label)
    elif isinstance(Y, Series):
        ax.set_ylabel(Y.name)

    ax.set_xscale(x_scale)
    ax.set_yscale(y_scale)
    ax.set_title(title)

    return ax
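A minimal sketch on synthetic data, assuming numpy is imported as np (names and values are illustrative):
rng = np.random.RandomState(2)
x = rng.normal(size=500)
y = 2.0 * x + rng.normal(size=500)
ax = scatter(x, y, fit_line=True, x_label="x", y_label="y", title="toy data")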
Example #12
def bar1d(
    X: _ArrayLike,
    Y: Optional[_ListLike] = None,
    c: Optional[Union[_ArrayLike, str]] = "k",
    vert: bool = True,
    sort: bool = True,
    ax: Optional[mpl.axes.Axes] = None,
    scale: str = "linear",
    annotate: bool = False,
    legend: bool = False,
    width: float = 0.8,
    label_rotation: float = 0.0,
    value_label: Optional[str] = None,
    sort_by: str = "values",
    cmap: str = "Blues",
    linesAt: Optional[Union[_Numeric, _ListLike]] = None,
):
    """Plots a 1 dimensional barplot.

    Parameters
    ----------
    X : list, tuple, np.ndarray, pd.Series
        Categorical/string/time labels for the data.
        If pandas.Series, Index must be categorical, Values must be numeric.
    Y : list, tuple, np.ndarray, optional
        If None, X must be a pd.Series. Must be numeric dtype.
    c : str/list/tuple/np.ndarray/pd.Series (1d), optional
        Defines the colour of each bar.
        If str, colours all of the bars with the same colour
        If array, must be a categorical type.
        If None, uses an automatic qualitative palette
    vert : bool, default=True
        Determines whether the plot is vertical or horizontal
    sort : bool, default=True
        Sorts the data or labels
    ax : matplotlib.ax.Axes, optional, default=None
        If None, creates one.
    scale : str, default="linear"
        Determines how to scale the numeric axis.
    annotate : bool, default=False
        Determines whether values should be annotated
    legend : bool, default=False
        Choose whether to display a legend
    width : float, default=0.8
        The width of each bar in the barplot
    label_rotation : float, default=0
        The degrees of rotation to the ticklabels
    value_label : str, optional
        Defines a name for the numerical axis
    sort_by : str, default="values"
        Defines how to sort the data if sort=True.
        Choose from {'values', 'labels'}
    cmap : str, default="Blues"
        Defines a colormap if color values are specified
    linesAt : int, float, list, tuple, optional
        If set, defines one or more vertical lines to add to the barplot

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-barplot
    """
    # define plot if not set

    belongs(sort_by, ("values", "labels"))

    if ax is None:
        fig, ax = plt.subplots(figsize=(8, 5))

    # X is either numerical (in which case there is no Y) or categorical labels
    if Y is None:
        # in this case, X must contain all the data
        if isinstance(X, pd.Series):
            _labels = as_flattened_numpy(X.index)
            _values = as_flattened_numpy(X.values)
            _ticks = np.arange(X.shape[0])
            value_label = X.name
        else:
            _labels = _ticks = np.arange(len(X))
            _values = as_flattened_numpy(X)
    else:
        # X is labels, Y are numeric values (assume!)
        _labels = as_flattened_numpy(X)
        _values = as_flattened_numpy(Y)
        _ticks = np.arange(_labels.shape[0])

    # sort out colour
    pal = _determine_color_palette(c, _ticks.shape[0], cmap)

    # perform sorting here
    if sort:
        if sort_by == "values":
            _order = np.argsort(_values)
        elif sort_by == "labels":
            _order = np.argsort(_labels)
        else:
            raise ValueError(
                "sort_by '{}': must be ['values', 'labels']".format(sort_by)
            )
        # apply sort
        if not isinstance(c, (type(None), str)):
            _labels, _values, pal = _apply_data_sort(_order, _labels, _values, pal)
        else:
            _labels, _values = _apply_data_sort(_order, _labels, _values)

    # plot the bar
    _plot_bar_orient(
        ax,
        _ticks,
        _labels,
        _values,
        c=pal,
        w=width,
        vert=vert,
        lrot=label_rotation,
        annotate=annotate,
        lines=linesAt,
        vlabel=value_label,
    )
    # orient scale
    if vert:
        ax.set_yscale(scale)
    else:
        ax.set_xscale(scale)

    # map a legend to it
    if legend and not isinstance(c, str):
        map_legend(c, pal, "o", ax, False)

    return ax
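A minimal sketch with a labelled pandas Series, assuming pandas is imported as pd (values are illustrative):
counts = pd.Series([14, 9, 21], index=["apples", "oranges", "pears"], name="count")
ax = bar1d(counts, sort=True, annotate=True, label_rotation=45.0)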
Example #13
 def test_belongs2(self):
     l1 = ['apples', 'oranges', 'pears']
     assert utils.belongs("apples", l1)
Example #14
 def test_belongs1(self, s):
     l1 = ['apples', 'oranges', 'pears']
     with pytest.raises(ValueError):
         assert utils.belongs(s, l1)
Example #15
def gridplot(
    n_plots: int,
    arrange: str = "square",
    ax_size: Union[int, Tuple[int, int]] = 2,
    annotate_labels: bool = False,
    annotate_offset: float = 0.01,
    **annotate_args
):
    """Determines the most optimal shape for a set of plots.

    Parameters
    ----------
    n_plots : int
        The total number of plots.
    arrange : str, default="square"
        Choose from {'square', 'row', 'column'}. Indicates preference for direction of plots.
    ax_size : int, default=2
        The square size of each plot.
    annotate_labels : bool, default=False
        If True, adds A, B,.. K label to top-left corner of each axes.
    annotate_offset : float, default=0.01
        Determines the amount of offset for each label

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure
    axes : list of matplotlib.ax.Axes
        A list of axes to use.
    """
    instance_check(annotate_labels, bool)
    nonnegative((n_plots,), int)
    belongs(arrange, ["square", "row", "column"])

    annot_props = {
        "weight": "bold",
        "horizontalalignment": "left",
        "verticalalignment": "center",
    }
    # update with args
    annot_props.update(annotate_args)
    if isinstance(ax_size, int):
        fs = np.array([ax_size, ax_size])
    else:
        fs = np.array(ax_size)

    if n_plots == 1:
        fig, ax = plt.subplots(figsize=fs)
        # wrap ax as a list to iterate over.
        if annotate_labels:
            fig.text(0.01, 0.98, "A", **annot_props)
        return fig, [ax]
    else:
        fig, ax = (
            _generate_square_like_grid(n_plots, ax_size=fs)
            if arrange == "square"
            else _generate_diag_like_grid(n_plots, arrange, ax_size=fs)
        )
        # add annotation labels to each axes
        if annotate_labels:
            # we use tight layout to make sure text isn't overlapping
            fig.tight_layout()
            for a, n in zip(ax, string.ascii_uppercase):
                pos_ = a.get_position().bounds
                # add label
                fig.text(
                    pos_[0] - annotate_offset,
                    pos_[1] + pos_[3] + annotate_offset,
                    n,
                    **annot_props
                )
        return fig, ax
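A minimal sketch (five axes arranged row-first, each 2x2 inches, with corner labels):
fig, axes = gridplot(5, arrange="row", ax_size=2, annotate_labels=True)
for ax in axes:
    ax.plot([0, 1], [1, 0])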
Example #16
def cached_chunk(
    func: Callable,
    param_name: str,
    param_values: Union[List, Tuple],
    parallel: bool = True,
    filename: str = "example1.json",
    verbose: int = 0,
    *args,
    **kwargs
) -> "MetaPanda":
    """Provides chunked automatic {.json, .csv} caching for `turb.MetaPanda` or `pd.DataFrame`.

    .. note:: custom function must return a `pd.DataFrame` or `turb.MetaPanda` object.

    Parameters
    ----------
    func : function
        A custom function returning the pd.DataFrame/MetaPanda
    param_name : str
        The keyword name of the parameter in question to iterate over
    param_values : list/tuple of something
        The values associated with the parameter to iterate over
    parallel : bool, default=True
        Determines whether to use `joblib` to compute independent chunks in parallel or not
    filename : str, optional
        The name of the file to cache to, or read from. This is fixed.
        Accepts {'json', 'csv'} formats.
    verbose : int, optional
        If > 0, prints out useful information
    *args : list, optional
        Arguments to pass to function(...)
    **kwargs : dict, optional
        Keyword arguments to pass to function(...)

    Warnings
    --------
    FutureWarning
        Returned object from cache isn't of type {pd.DataFrame, MetaPanda}

    Raises
    ------
    TypeError
        `filename` isn't of type `str`
    ValueError
        `filename` extension isn't found in {'json', 'csv'}

    Returns
    -------
    mp : MetaPanda
        The MetaPanda object

    See Also
    --------
    cached : Provides automatic {.json, .csv} caching for `turb.MetaPanda` or `pd.DataFrame`.
    """
    # check it is string
    instance_check(filename, str)
    instance_check(param_name, str)
    instance_check(param_values, (list, tuple, dict))

    if not callable(func):
        raise ValueError("function is not callable")
    # check that file ends with json or csv
    belongs(filename.rsplit(".", 1)[-1], ("json", "csv"))

    # if the final file exists, perform as normal.
    if os.path.isfile(filename):
        if verbose > 0:
            print("reading in cached file: {}".format(filename))
        # read it in
        mdf = read(filename)
        _set_index_def(mdf.df_)
        return mdf
    else:
        # create a bunch of chunks by repeatedly calling cache.
        if parallel:
            # note: `.df_` is extracted after the parallel call returns, since
            # `joblib.delayed` yields a lazy task object rather than a MetaPanda.
            _cached_chunks = joblib.Parallel(joblib.cpu_count())(
                joblib.delayed(cached)(
                    func,
                    insert_suffix(filename, "_chunk%d" % i),
                    verbose=verbose,
                    *args,
                    **dictcopy(kwargs, {param_name: chunk})
                )
                for i, chunk in enumerate(param_values)
            )
            _mdf_chunks = [_c.df_ for _c in _cached_chunks]
        else:
            _mdf_chunks = [
                cached(
                    func,
                    insert_suffix(filename, "_chunk%d" % i),
                    verbose=verbose,
                    *args,
                    **dictcopy(kwargs, {param_name: chunk})
                ).df_
                for i, chunk in enumerate(param_values)
            ]
        # join together the chunks
        mpf = _stack_rows(_mdf_chunks)
        # save file - return type must be a MetaPanda or error occurs!
        mpf.write(filename)
        # now delete the 'chunked' files.
        for i in range(len(param_values)):
            os.remove(insert_suffix(filename, "_chunk%d" % i))

        return mpf
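A usage sketch with a hypothetical loader whose `year` argument is chunked over (serial mode shown for simplicity; assumes pandas imported as pd):
def load_year(year):
    return pd.DataFrame({"year": [year], "value": [year * 2]})

mdf = cached_chunk(load_year, "year", [2018, 2019, 2020],
                   parallel=False, filename="years.csv")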
Example #17
def scatter_grid(
    mdf: Union[DataFrame, "MetaPanda"],
    x: SelectorType,
    y: SelectorType,
    arrange: str = "square",
    plot_size: int = 3,
    best_fit: bool = True,
    best_fit_deg: int = 1,
    savepath: Optional[Union[bool, str]] = None,
):
    """
    Plots a grid of scatter plots comparing each column selected by `x`
    against each column selected by `y`.

    Parameters
    ----------
    mdf : turb.MetaPanda
        The dataset
    x : str or list/tuple of str
        Contains either types, meta column names, column names or regex-compliant strings
    y : str or list/tuple of str
        Contains either types, meta column names, column names or regex-compliant strings
    arrange : str
        Choose from ['square', 'row', 'column']. Square arranges the plot as square-like as possible. Row
        prioritises plots row-like, and column-wise for column.
    plot_size : int
        The size of each axes
    best_fit : bool
        If True, draws a line of best fit
    best_fit_deg : int, default=1
        The degree of the line of best fit, can draw polynomial
    savepath : None, bool, str
        saves the figure to file. If bool, uses the name in mdf, else uses given string.

    Returns
    -------
    None
    """
    from turbopanda.corr import bicorr

    # checks
    instance_check((plot_size, best_fit_deg), int)
    instance_check(savepath, (type(None), str, bool))
    instance_check(best_fit, bool)
    nonnegative((
        best_fit_deg,
        plot_size,
    ))
    belongs(arrange, ["square", "row", "column"])

    # make a metapanda if we have a dataframe.
    _mdf = MetaPanda(mdf) if isinstance(mdf, DataFrame) else mdf

    # get selector
    x_sel = _mdf.view(x)
    y_sel = _mdf.view(y)
    # create a product between x and y and plot
    prod = list(it.product(x_sel, y_sel))

    if len(prod) > 0:
        fig, axes = gridplot(len(prod), arrange, ax_size=plot_size)
        for i, (_x, _y) in enumerate(prod):
            # pair x, y
            __x, __y = remove_na(_mdf[_x].values, _mdf[_y].values, paired=True)
            axes[i].scatter(__x.flatten(), __y, alpha=0.5)
            # line of best fit
            if best_fit:
                xn = np.linspace(__x.min(), __x.max(), 100)
                z = np.polyfit(__x.flatten(), __y, deg=best_fit_deg)
                axes[i].plot(xn, np.polyval(z, xn), "k--")

            # spearman correlation (bicorr defaults to method="spearman", output="score")
            pair_corr = bicorr(_mdf[_x], _mdf[_y])
            axes[i].set_title("r={:0.3f}".format(pair_corr))
            axes[i].set_xlabel(_x)
            axes[i].set_ylabel(_y)

        fig.tight_layout()

        if isinstance(savepath, bool):
            save(fig, "scatter", _mdf.name_)
        elif isinstance(savepath, str):
            save(fig, "scatter", _mdf.name_, fp=savepath)
Example #18
def cached(
    func: Callable, filename: str = "example1.json", verbose: int = 0, *args, **kwargs
) -> "MetaPanda":
    """Provides automatic {.json, .csv} caching for `turb.MetaPanda` or `pd.DataFrame`.

    .. note:: this is a direct-call cache function, not a decorator.

    For example, we call `cached` as a wrapper to our custom function:
    >>> import turbopanda as turb
    >>> def f(x):
    ...     return turb.MetaPanda(x)
    >>> data = cached(f, 'meta_file.json')

    .. note:: custom function must return a `pd.DataFrame` or `turb.MetaPanda` object.

    Parameters
    ----------
    func : function
        A custom function returning the pd.DataFrame/MetaPanda
    filename : str, optional
        The name of the file to cache to, or read from. This is fixed.
        Accepts {'json', 'csv'} formats.
    verbose : int, optional
        If > 0, prints out useful information
    *args : list, optional
        Arguments to pass to function(...)
    **kwargs : dict, optional
        Keyword arguments to pass to function(...)

    Warnings
    --------
    FutureWarning
        Returned object from cache isn't of type {pd.DataFrame, MetaPanda}

    Raises
    ------
    TypeError
        `filename` isn't of type `str`
    ValueError
        `filename` extension isn't found in {'json', 'csv'}

    Returns
    -------
    mp : MetaPanda
        The MetaPanda object

    See Also
    --------
    cache : Provides automatic {.json, .csv} decorator caching for `turb.MetaPanda` or `pd.DataFrame`.
    """
    # check it is string
    instance_check(filename, str)
    instance_check(verbose, int)
    instance_check(func, "__call__")

    # check that file ends with json or csv
    belongs(filename.rsplit(".", 1)[-1], ("json", "csv"))

    if os.path.isfile(filename):
        if verbose > 0:
            print("reading in cached file: {}".format(filename))
        # read it in
        mdf = read(filename)
        _set_index_def(mdf.df_)
        return mdf
    else:
        if verbose > 0:
            print("running function '{}' for cache".format(func.__name__))
        # returns MetaPanda or pandas.DataFrame
        mpf = func(*args, **kwargs)
        if isinstance(mpf, MetaPanda):
            # save file
            mpf.write(filename)
            return mpf
        elif isinstance(mpf, DataFrame):
            # save - bumping index into the file.
            mpf.reset_index().to_csv(filename, index=None)
            return MetaPanda(mpf)
        else:
            if verbose > 0:
                print(
                    "returned object from cache not of type [DataFrame, MetaPanda], not cached"
                )
            return mpf
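A usage sketch with a hypothetical builder function (computed and written on the first call, read back afterwards; assumes pandas imported as pd):
def build(n):
    return pd.DataFrame({"a": range(n)})

mdf = cached(build, "build_cache.csv", verbose=1, n=100)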
Example #19
def correlate(
        data: Union[pd.DataFrame, MetaPanda],
        x: Optional[SelectorType] = None,
        y: Optional[SelectorType] = None,
        covar: Optional[SelectorType] = None,
        cartesian_covar: bool = False,
        output: str = "full",
        method: str = "spearman",
        verbose: int = 0,
) -> pd.DataFrame:
    """Correlates X and Y together to generate a list of correlations.

    If X/Y are MetaPandas, returns a MetaPanda object, else returns pandas.DataFrame

    Parameters
    ----------
    data : pd.DataFrame / MetaPanda
        The full dataset.
    x : (str, list, tuple, pd.Index), optional
        Subset of input(s) for column names.
            if None, uses the full dataset. Y must be None in this case also.
    y : (str, list, tuple, pd.Index), optional
        Subset of output(s) for column names.
            if None, uses the full dataset (from optional `x` subset)
    covar : (str, list, tuple, pd.Index), optional
        set of covariate(s). Covariates are needed to compute partial correlations.
            If None, uses standard correlation.
    cartesian_covar : bool, default=False
        If True and covar is not None, controls for each element
        in covar individually using the cartesian product
    output : str, default="full"
        Choose from {'full', 'score'}. Score just returns `r` number.
    method : str, default="spearman"
        Method to correlate with. Choose from:
            'pearson' : Pearson product-moment correlation
            'spearman' : Spearman rank-order correlation
            'kendall' : Kendall’s tau (ordinal data)
            'biserial' : Biserial correlation (continuous and boolean data only)
            'percbend' : percentage bend correlation (robust)
            'shepherd' : Shepherd's pi correlation (robust Spearman)
            'skipped' : skipped correlation (robust Spearman, requires sklearn)
    verbose : int, default=0
        If > 0, prints out useful debugging messages

    Returns
    -------
    R : pd.DataFrame
        correlation rows (based on pingouin structure)

    Examples
    --------
    >>> import turbopanda as turb
    >>> data = turb.read('example.json')
    >>> R = turb.correlate(data) # uses full dataset
                 X         M         Y      Mbin      Ybin
    X     1.000000  0.392251  0.059771 -0.014405 -0.149210
    M     0.392251  1.000000  0.545618 -0.015622 -0.094309
    Y     0.059771  0.545618  1.000000 -0.007009  0.161334
    Mbin -0.014405 -0.015622 -0.007009  1.000000 -0.076614
    Ybin -0.149210 -0.094309  0.161334 -0.076614  1.000000
    >>> R = turb.correlate(data, x=('X', 'M', 'Y')) # uses subset of dataset
                 X         M         Y
    X     1.000000  0.392251  0.059771
    M     0.392251  1.000000  0.545618
    Y     0.059771  0.545618  1.000000

    # correlates X columns against Ybin
    >>> R = turb.correlate(data, x=('X', 'M', 'Y'), y='Ybin')
                    X         M         Y
    Ybin     1.000000  0.392251  0.059771

    # correlates X against Ybin, controlling for Y
    >>> R = turb.correlate(data, x='X', y='Ybin', covar='Y')
                     X
    Ybin     -0.149210

    # using a different technique
    >>> R = turb.correlate(data, method="shepherd")
                 X         M         Y      Mbin      Ybin
    X     1.000000  0.392251  0.059771 -0.014405 -0.149210
    M     0.392251  1.000000  0.545618 -0.015622 -0.094309
    Y     0.059771  0.545618  1.000000 -0.007009  0.161334
    Mbin -0.014405 -0.015622 -0.007009  1.000000 -0.076614
    Ybin -0.149210 -0.094309  0.161334 -0.076614  1.000000
    """

    # data cannot be NONE
    instance_check(data, (pd.DataFrame, MetaPanda))
    instance_check((x, y, covar), (type(None), str, list, tuple, pd.Index))
    instance_check(cartesian_covar, bool)
    belongs(
        method,
        (
            "pearson",
            "spearman",
            "kendall",
            "biserial",
            "percbend",
            "shepherd",
            "skipped",
        ),
    )
    belongs(output, ("full","score"))
    bounds_check(verbose, 0, 4)

    # downcast to dataframe option
    df = data.df_ if not isinstance(data, pd.DataFrame) else data
    # downcast if list/tuple/pd.index is of length 1
    x = x[0] if (isinstance(x, (tuple, list, pd.Index)) and len(x) == 1) else x
    y = y[0] if (isinstance(y, (tuple, list, pd.Index)) and len(y) == 1) else y

    # convert using `view` if we have string instances.
    if isinstance(x, str):
        x = pattern(x, df.columns)
    if isinstance(y, str):
        y = pattern(y, df.columns)
    if isinstance(covar, str):
        covar = pattern(covar, df.columns)

    # perform a check to make sure every column in `covar` is continuous.
    if covar is not None:
        if not is_dataframe_float(data[covar]):
            raise TypeError(
                "`covar` variables in `correlate` all must be of type `float`/continuous."
            )

    # execute various use cases based on the presence of x, y, and covar, respectively.
    if x is None and y is None:
        # here just perform matrix-based correlation
        comb = it.combinations_with_replacement(df.columns, 2)
        niter = (df.columns.shape[0]**2) // 2 + (df.columns.shape[0] // 2)
    elif isinstance(x, (list, tuple, pd.Index)) and y is None:
        # use a subset of x, in union with covar
        comb = it.combinations_with_replacement(x, 2)
        niter = (len(x)**2) // 2 + (len(x) // 2)
    elif isinstance(x, (list, tuple, pd.Index)) and isinstance(y, str):
        # list of x, y str -> matrix-vector cartesian product
        comb = it.product(x, [y])
        niter = len(x)
    elif isinstance(y, (list, tuple, pd.Index)) and isinstance(x, str):
        # list of y, x str -> matrix-vector cartesian product
        comb = it.product(y, [x])
        niter = len(y)
    elif isinstance(x, (list, tuple, pd.Index)) and isinstance(
            y, (list, tuple, pd.Index)
    ):
        # list of x, y -> cartesian product of x: y terms
        comb = it.product(x, y)
        niter = len(x) * len(y)
    else:
        raise ValueError("X: {}; Y: {}; Z: {} combination unknown.".format(x, y, covar))
    # return the combination of these effects.
    return _corr_combination(
        df, comb, niter, covar, cartesian_covar, method, output, verbose
    )
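A usage sketch reusing the hypothetical column names from the docstring examples (partial correlation controlling for a continuous covariate):
R = correlate(data, x=("X", "M"), y="Ybin", covar="Y", method="spearman")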