def all_low_cardinality_to_categorical(df: pd.DataFrame,
                                       threshold: float = 0.5) -> pd.DataFrame:
    """Cast all low-cardinality object columns to the 'category' dtype.

    An object column is selected when its number of unique values divided by
    the number of rows is strictly less than `threshold`.

    Parameters
    ----------
    df : pd.DataFrame
        The dataset to convert. The conversion is applied to a copy.
    threshold : float, default=0.5
        Maximum unique-to-row ratio; checked with
        `bounds_check(threshold, 0.0, 1.0)`.

    Returns
    -------
    pd.DataFrame
        `df` itself when it holds no object columns, otherwise the result of
        `_multi_assign` applied to a copy of `df`.
    """
    bounds_check(threshold, 0.0, 1.0)

    # Nothing to convert: hand the input back untouched and skip the copy.
    if df.select_dtypes(include=["object"]).shape[1] == 0:
        return df

    df_to_use = df.copy()
    n_rows = df_to_use.shape[0]

    def _to_category(x):
        return x.astype("category")

    def _low_cardinality_columns(x):
        # ratio of distinct values to rows, per object column
        unique_ratio = x.select_dtypes(include=["object"]).nunique().div(n_rows)
        return unique_ratio[unique_ratio.lt(threshold)].index

    # Previously this code sat after `return df` inside the guard above, so it
    # never ran and the function returned None for frames with object columns.
    return _multi_assign(df_to_use, _to_category, _low_cardinality_columns)
# NOTE(review): this definition looks truncated in the visible source: the
# parameter list is never closed, the docstring has no terminating quotes and
# several calls below stop right after their opening parenthesis. The code is
# left exactly as found; restore the missing lines before relying on it.
def learning(
    df: "MetaPanda",
    y: str,
    x: Optional[SelectorType] = None,
    train_n: Optional[np.ndarray] = None,
    permute_n: int = 0,
    cv: Tuple[int, int] = (5, 15),
    model: str = "LinearRegression",
    cache: Optional[str] = None,
    plot: bool = False,
    verbose: int = 0,
    """Fits a basic model to generate cross-validated training/test scores for different training set sizes.

    A cross-validation generator splits the whole dataset `k` times in training and test data. Subsets of the training set with
    varying sizes will be used to train the estimator and a score for each training subset size and the test set will be computed.
    Afterwards, the scores will be averaged over all `k` runs for each training subset size.

    df : MetaPanda (n_samples, n_features)
        The main dataset.
    y : str
        A selected y column.
    x : list/tuple of str/selector, optional
        A list of selected column names for x or MetaPanda `selector`.
    train_n : int/array-like, with shape (n_ticks,) dtype float or int, optional
        Relative or absolute numbers of training examples that will be used to generate
        learning curve related data.
        If None: uses `linspace(.1, .9, 8)`
        If int: uses `linspace(.1, .9, n)`
    permute_n : int (default 0)
        The number of times to permute y, if > 0, then does full permutation analysis (making 4th plot)
    cv : int/tuple, optional (5, 10)
        If int: just reflects number of cross-validations
        If Tuple: (cross_validation, n_repeats) `for RepeatedKFold`
    model : str/estimator sklearn model that implements `fit` and `predict` methods
        The name of a scikit-learn model, or the model object itself.
    cache : str, optional
        TODO: Not Implemented yet.
        If not None, stores the resulting model parts in JSON and reloads if present.
    plot : bool, optional
        If True, produces `.plot.learning_curve` inplace.
    verbose : int, optional
        If > 0, prints out statements depending on level.

    Other Parameters
    model_kws : dict, optional
        Keywords to pass to the sklearn model which are not parameterized.

    results : MetaPanda (n_ticks, 8)
        The results matrix of mean and std scores
    permute_ : np.ndarray (permute_n,), optional
        The permutation scores associated with the permutation analysis

    Shorthand names for the models, i.e `lm` for LinearRegression or `gauss` for a GaussianProcessRegressor, are accepted.

    By default, `fit_learning` uses the root mean squared error (RMSE). There is currently no option to change this.

    By default, this model assumes you are working with a regression problem. Classification compatibility
    will arrive in a later version.

    `permute_n` is set to 0 by default, if you want a permutation histogram, this value must be > 0.

    See Also
    fit_basic : Performs a rudimentary fit model with no parameter searching.
    fit_grid : Performs exhaustive grid search analysis on the models selected.

    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.
    # perform checks
    instance_check(df, pd.DataFrame, MetaPanda)
    instance_check(y, str)
    instance_check(train_n, (type(None), int, list, tuple, np.ndarray))
    instance_check(permute_n, int)
    instance_check(cv, (int, tuple))
    # instance_check(cache, (type(None), str))
    instance_check(plot, bool)
    bounds_check(verbose, 0, 4)

    # set dataset if a pandas object
    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    # retrieve x columns if none
    _xcols = select_xcols(_df, x, y)
    # NOTE(review): the conditional expression binds tighter than the comma, so
    # this evaluates as `(cv if ... else cv), 1`: k is always `cv` itself and
    # repeats is always 1. Presumably `else (cv, 1)` was intended -- confirm.
    k, repeats = cv if isinstance(cv, tuple) else cv, 1
    lm, pkg_name = find_sklearn_model(model, "regression")
    # assign keywords to lm

    # normalise train_n: None -> 8 evenly spaced fractions in [0.1, 0.9],
    # int -> that many evenly spaced fractions in the same interval
    if train_n is None:
        train_n = np.linspace(0.1, 0.9, 8)
    elif isinstance(train_n, int):
        train_n = np.linspace(0.1, 0.9, train_n)
    # ml ready
    _x, _y = preprocess_continuous_X_y(_df, _xcols, y)

    rep = RepeatedKFold(n_splits=k, n_repeats=repeats)
    # NOTE(review): the arguments of this call are missing from the visible source.
    vars_ = learning_curve(
    # permutation analysis if permute_n > 0
    if permute_n > 0:
        # NOTE(review): the arguments of this call are missing as well.
        perm_score_, perm_scorez_, pval = permutation_test_score(

    # outputs
    output_labels_ = ["train_score", "test_score", "fit_time", "score_time"]
    # format as df
    # entries 1-4 of vars_ are reduced over axis 1 (mean, then std) and named
    # with the "mean_"/"std_" prefixes; entry 0 becomes the N column further down
    results = pd.DataFrame(
        # stack them together
                np.stack([np.mean(vars_[i], axis=1) for i in range(1, 5)], axis=1),
                np.stack([np.std(vars_[i], axis=1) for i in range(1, 5)], axis=1),
                map(lambda s: "mean_" + s, output_labels_),
                map(lambda s: "std_" + s, output_labels_),
    # add N column
    results["N"] = vars_[0]
    R = MetaPanda(results)
    if plot and permute_n > 0:
        lcurve(R, perm_scorez_)
    # NOTE(review): the body of the `elif plot` branch is missing here.
    elif plot:
    # return as MetaPanda
    # NOTE(review): as written the second return is unreachable; presumably an
    # `else:` line was lost between the two returns -- confirm.
    if permute_n > 0:
        return R, perm_score_, perm_scorez_, pval
        return R
# NOTE(review): this definition looks truncated in the visible source: the
# docstring has no terminating quotes, a dict literal is never closed and the
# caching calls stop right after their opening parenthesis. The code is left
# exactly as found; restore the missing lines before relying on it.
def grid(df: Union[pd.DataFrame, "MetaPanda"],
         y: str,
         x: Optional[SelectorType] = None,
         models=("Ridge", "Lasso"),
         cv: Union[int, Tuple[int, int]] = 5,
         cache: Optional[str] = None,
         plot: bool = False,
         chunks: bool = False,
         verbose: int = 0,
         **grid_kws) -> "MetaPanda":
    """Performs exhaustive grid search analysis on the models selected.

    This function aims to encapsulate much of the functionality associated around `GridSearchCV` class
    within scikit-learn. With in-built caching options, flexible selection of inputs and outputs with the
    MetaPanda class.

    df : pd.DataFrame/MetaPanda
        The main dataset.
    y : str
        A selected y column.
    x : list/tuple of str, optional
        A list of selected column names for x or MetaPanda `selector`.
    models : list/dict, default=["Ridge", "Lasso"]
        tuple: list of model names, uses default parameters
        dict: key (model name), value tuple (parameter names) / dict: key (parameter name), value (list of values)
    cv : int/tuple, default=5
        If int: just reflects number of cross-validations
        If Tuple: (cross_validation, n_repeats) `for RepeatedKFold`
    cache : str, optional
        If not None, cache is a filename handle for caching the `cv_results` as a JSON/csv file.
    plot : bool, optional
        If True, produces appropriate plot determining for each parameter.
    chunks : bool, optional
        If True, and if cache is not None: caches the ML gridsearch into equal-sized chunks.
        This saves chunk files which means that if part of the pipeline breaks, you can start from the previous chunk.
    verbose : int, optional
        If > 0, prints out statements depending on level.

    Other Parameters
    grid_kws : dict, optional
        Additional keywords to assign to GridSearchCV.

        If one of the parameters has wrong input type

    cv_results : MetaPanda
        A dataframe result from GridSearchCV detailing iterations and all scores.

    Notes ----- From version 0.2.3 the `chunks` argument allows for fitting by parts. This means that breaks
    throughout a large pipeline will result only in losses up to the previous chunk. Chunk files are saved as
    '%filename_chunk%i.csv' so beware of clashes. Make sure to set `chunks=True` and `cache=str` where the `models`
    parameter is time-expensive.

    By default, `grid` tunes using the root mean squared error (RMSE). There is currently no option to change this.

    By default, this model assumes you are working with a regression problem. Classification compatibility
    will arrive in a later version.

    See Also
    basic : Performs a rudimentary fit model with no parameter searching.
    sklearn.model_selection.GridSearchCV : Exhaustive search over specified parameter values for an estimator

    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.
    # checks
    instance_check(df, (pd.DataFrame, MetaPanda))
    instance_check(x, (type(None), str, list, tuple, pd.Index))
    instance_check(y, str)
    instance_check(cv, (int, tuple))
    instance_check(cache, (type(None), str))
    instance_check((plot, chunks), bool)
    bounds_check(verbose, 0, 4)

    # NOTE(review): as written the tuple->list conversion and the list/dict
    # check only run when `models` is a single sklearn model, right after it
    # was wrapped in a list. Presumably an `else:` line was lost -- confirm.
    if is_sklearn_model(models):
        models = [models]
        if isinstance(models, tuple):
            models = list(models)
        instance_check(models, (list, dict))

    # set dataset if a pandas object
    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    # retrieve x columns if none
    # set up cv, repeats
    # NOTE(review): the conditional expression binds tighter than the comma, so
    # this evaluates as `(cv if ... else cv), 1`: k is always `cv` itself and
    # repeats is always 1. Presumably `else (cv, 1)` was intended -- confirm.
    k, repeats = cv if isinstance(cv, tuple) else cv, 1

    # do caching
    # Inner worker: builds a one-step Pipeline, runs GridSearchCV with a
    # RepeatedKFold splitter and returns cv_results_ wrapped in a MetaPanda.
    def _perform_fit(_df: MetaPanda, _x, _y, _k: int, _repeats: int, _models):
        rep = RepeatedKFold(n_splits=_k, n_repeats=_repeats)
        # the header is 'model_est'
        header = "model"
        # any basic regression model
        pipe = Pipeline([(header, LinearRegression())])
        # get paramgrid - the magic happens here!
        pgrid = make_parameter_grid(_models, header=header)
        # join default grid parameters to given grid_kws
        # NOTE(review): the closing brace of this dict and any merge with
        # `grid_kws` are not visible in the source.
        def_grid_params = {
            "scoring": "neg_root_mean_squared_error",
            "n_jobs": -2,
            "verbose": verbose,
            "return_train_score": True,
        # create gridsearch
        gs = GridSearchCV(pipe, param_grid=pgrid, cv=rep, **def_grid_params)
        # make ml ready
        __xnp, __y = preprocess_continuous_X_y(_df, _x, _y)
        # fit the grid - expensive.
        gs.fit(__xnp, __y)
        # generate result
        _result = pd.DataFrame(gs.cv_results_)
        # associate model column to respective results
        # (keeps the text before the first "(" of each estimator's string form)
        _result["model"] = _result["param_model"].apply(
            lambda f: str(f).split("(")[0])
        # set as MetaPanda
        _met_result = MetaPanda(_result)
        # cast down parameter columns to appropriate type
        _met_result.transform(pd.to_numeric, object, errors="ignore")
        return _met_result

    # NOTE(review): the three calls below are cut off after their first line and
    # the `else:` branches separating them are not visible.
    if cache is not None:
        if chunks:
            # if dictionary, we need to split this into 1-sized list/dict blocks.
            values = dictchunk(models, 1) if isinstance(models,
                                                        dict) else models
            _cv_results = cached_chunk(
            _cv_results = cache_f(
        _cv_results = _perform_fit(_df=_df,

    # NOTE(review): the body of the plotting branch is missing.
    if plot:

    return _cv_results
# NOTE(review): this definition looks truncated in the visible source: the
# docstring has no terminating quotes and the final list comprehension is
# missing its opening call and closing bracket. The docstring also documents
# `models`, `cache`, `plot` and `chunks`, none of which appear in the visible
# signature. The code is left exactly as found.
def optimize(df: "MetaPanda",
             x: SelectorType,
             y: str,
             cv: int = 5,
             verbose: int = 0):
    """Performs optimization grid analysis on the models selected.

    This uses `scipy.optimize` function to minimize continuous parameters, for example `alpha` in a Lasso model.

    .. note:: optimization only works on *continuous* parameters with each model.

    TODO: complete `.ml.fit.optimize` function

    df : MetaPanda
        The main dataset.
    x : list/tuple of str
        A list of selected column names for x or MetaPanda `selector`.
    y : str
        A selected y column.
    models : tuple/dict
        tuple: list of model names, uses default parameters
        dict: key (model name), value tuple (parameter names) / dict: key (parameter name), value (list of values)
    cv : int/tuple, optional (5, 10)
        If int: just reflects number of cross-validations
        If Tuple: (cross_validation, n_repeats) `for RepeatedKFold`
    cache : str, optional
        If not None, cache is a filename handle for caching the `cv_results` as a JSON/csv file.
    plot : bool, optional
        If True, produces appropriate plot determining for each parameter.
    chunks : bool, optional
        If True, and if cache is not None: caches the ML gridsearch into equal-sized chunks.
        This saves chunk files which means that if part of the pipeline breaks, you can start from the previous chunk.
    verbose : int, optional
        If > 0, prints out statements depending on level.

    cv_results : MetaPanda
        A dataframe result from GridSearchCV detailing iterations and all scores.

    By default, `optimize` tunes using the root mean squared error (RMSE).
       There is currently no option to change this.

    By default, this model assumes you are working with a regression problem. Classification compatibility
        will arrive in a later version.

    See Also
    grid : Performs exhaustive grid search analysis on the models selected.
    sklearn.model_selection.GridSearchCV : Exhaustive search over specified parameter values for an estimator

    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.
    # checks
    instance_check(df, MetaPanda)
    instance_check(x, (str, list, tuple, pd.Index))
    instance_check(y, str)
    nonnegative((cv, verbose), int)
    # NOTE(review): `models` is read here and below but is not a parameter of
    # the visible signature; unless it exists at module level this raises
    # NameError -- confirm whether a `models` parameter was lost.
    instance_check(models, (tuple, list, dict))
    bounds_check(verbose, 0, 4)

    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    _xcols = select_xcols(_df, x, y)
    _xnp, _y = preprocess_continuous_X_y(_df, _xcols, y)

    # define the parameter sets
    param_sets = make_optimize_grid(models)

    # one pass per (model name, parameter set) pair
    for m, params in zip(models, param_sets):
        model = find_sklearn_model(m)[0]
        inits, bounds = optimize_grid_for_model(params)
        # minimize for every i element
        # NOTE(review): the call being built for each `i` is cut off here, and
        # nothing is visibly returned from the function.
        mins = [
                args=(_xnp, _y, model, params, cv),
            ) for i in inits

# NOTE(review): this definition looks truncated in the visible source: the
# parameter list is never closed, the docstring has no terminating quotes and
# several calls stop right after their opening parenthesis. `theme` is used in
# the body and described in the docstring but is not in the visible signature.
# The code is left exactly as found.
def widebox(data: Union[List, np.ndarray, pd.DataFrame],
            colors: Optional[_ListLike] = None,
            measured: Optional[str] = None,
            ax: Optional[mpl.axes.Axes] = None,
            vert: bool = True,
            sort: bool = True,
            outliers: bool = True,
            notch: bool = False,
            with_strip: bool = False,
            capsize: float = 1.0,
            width: float = 0.7,
            grid: bool = True,
            title: Optional[str] = None,
            label_rotation: float = 0.0,
            label_max_length: int = 25,
            spines: Optional[_ListLike] = None,
            strip_jitter: float = 0.15,
    """Plots a 2D boxplot with data oriented in wide-form.

    data : list, np.ndarray or pd.DataFrame (2d)
        The raw data to plot as a box.
        If data is of type pd.DataFrame: columns represent X-axis
    colors : list, tuple, optional
        Represents colors for each x-variable
    measured : str, optional
        A name for the measured variable
    ax : matplotlib.ax object, optional, default=None
        If None, creates a plot.
    vert : bool, default=True
        Determines whether to draw the plot vertically or horizontally
    sort : bool, default=True
        Determines whether to sort the data by numerical value
    outliers : bool, default=True
        If True, displays fliers as outliers
    notch : bool, default=False
        Determines whether to draw a notched plot
    with_strip : bool, default=False
        If True, draws a stripplot over the top of the boxplot, in a similar colour
        `outliers` are set to False in this case
    capsize : float, default=1.0
        Defines the length of the caps
    width : float, default=0.7
        Determines the width/height of the box
    grid : bool, default=True
        If True: draws gridlines for the numeric axis
    title : str, optional
        Sets the title of the axes if a string is passed
    label_rotation : float, default=0
        The degrees of rotation to the ticklabels
    label_max_length : int, default=25
        If any label exceeds this length, it truncates it
    spines : tuple, default=('top','left',bottom','right')
        Defines which spines are to be visible
    strip_jitter : float, default=0.15
        With stripplot, defines the amount of jitter in the variables
    theme : str, default="white_circle"
        Choose a 'theme' for the outliers, from {'red_square', 'green_diamond'}

    Other Parameters
    plot_kwargs : dict
        keyword arguments to pass to `ax.boxplot`

    ax : matplotlib.ax object
        Allows further modifications to the axes post-boxplot

    See Also

    Inspiration from https://github.com/jbmouret/matplotlib_for_papers#colored-boxes
    instance_check(data, (list, np.ndarray, pd.DataFrame))
    instance_check((colors, spines), (type(None), list, pd.Index))
    instance_check(ax, (type(None), mpl.axes.Axes))
    instance_check((vert, sort, notch, outliers, grid, with_strip), bool)
    instance_check((capsize, width, strip_jitter, label_rotation),
                   (float, int))
    instance_check(theme, str)
    instance_check(label_max_length, int)
    bounds_check(width, 0.0, 1.0)
    bounds_check(strip_jitter, 0.0, 1.0)
    bounds_check(label_rotation, 0.0, 360.0)

    if isinstance(data, pd.DataFrame):
        # select float, int subset
        ss = data.select_dtypes(include=[float, int])
        _data = np.asarray(ss)
        _labels = ss.columns
    # NOTE(review): as written the raise sits inside this branch, so list and
    # ndarray input always fails; presumably an `else:` line was lost -- confirm.
    elif isinstance(data, (list, np.ndarray)):
        _data = np.asarray(data)
        _labels = None
        raise TypeError("data matrix is not of type np.ndarray")

    _style = _get_flier_style(theme)

    # negative-exponential increase in figure size with more features
    def _figure_spacing(x):
        return np.exp(-0.35 * x) * x

    # the strip overlay replaces the flier markers
    if with_strip:
        outliers = False
    # NOTE(review): the first subplots call below is cut off mid-argument.
    if ax is None and vert:
        fig, ax = plt.subplots(figsize=(2.5 + _figure_spacing(_data.shape[1]),
    elif ax is None and not vert:
        fig, ax = plt.subplots(figsize=(7,
                                        2.5 + _figure_spacing(_data.shape[1])))
    if spines is None:
        spines = ("left", "top", "right", "bottom")

    # sort the data by the mean if selected
    # (columns are reordered by ascending column mean)
    # NOTE(review): `_labels` is set to None for list/ndarray input above, and
    # None cannot be indexed with `_order` -- verify once that path is restored.
    if sort:
        _order = np.argsort(np.mean(_data, axis=0))
        _data = _data[:, _order]
        _labels = _labels[_order]

    box_alpha = 1.0 if not with_strip else 0.5

    # NOTE(review): the remaining boxplot arguments are missing.
    patch_obj = ax.boxplot(_data,

    # define boxplot extras
    _define_boxplot_arguments(ax, patch_obj, vert, measured, grid, spines,
                              capsize, None)
    # define basic colours - overrides if needs be
    colors = _kcolor_arrangement(patch_obj, colors, k=_data.shape[1])
    # label axes
    _label_axes(ax, _labels, vert, label_rotation, label_max_length)
    # NOTE(review): the body of the title branch is missing.
    if title is not None:
    # perform stripplots
    if with_strip:
        # boxplot positions start at 1, hence n + 1
        for n in range(_data.shape[1]):
            # plot x strips
            _overlay_stripplot(_data[:, n], ax, n + 1, width, colors[n], vert,
                               outliers, strip_jitter)

    return ax
# NOTE(review): this definition looks truncated in the visible source: the
# parameter list is never closed, the docstring has no terminating quotes and
# several calls and literals are missing their closing lines. The code is left
# exactly as found; restore the missing lines before relying on it.
def bibox1d(X: _ArrayLike,
            Y: _ArrayLike,
            colors: Optional[_ListLike] = None,
            labels: Optional[_ListLike] = None,
            measured: Optional[str] = None,
            ax: Optional[mpl.axes.Axes] = None,
            mannwhitney: bool = True,
            with_strip: bool = False,
            vertical: bool = True,
            notch: bool = False,
            capsize: float = 1.0,
            outliers: bool = True,
            grid: bool = True,
            width: Union[float, List[float]] = 0.7,
            label_rotation: float = 0.0,
            label_max_length: int = 25,
            spines: Optional[_ListLike] = None,
            strip_jitter: float = 0.15,
            theme: str = "white_circle",
    """Plots two 1-dimensional boxplots using vectors `X`, `Y`.

    X : list/tuple/np.ndarray/pd.Series (1d)
        The first data column to draw. Must be numeric.
    Y : list/tuple/np.ndarray/pd.Series (1d)
        The second data column to draw. Must be numeric.
    colors : str/list of str, optional
        If None, uses a default color
    labels : str/list of str, optional
        If set, draws this on the appropriate axis, if None, does nothing
        If X/Y is of type pandas.Series, uses this label instead.
    measured : str, optional
        A label to define what the measurement is
    ax : matplotlib.ax object, optional, default=None
        If None, creates a plot.
    mannwhitney : bool, default=True
        If True, performs a Mann-Whitney U test between the values
    with_strip : bool, default=False
        If True, draws a stripplot over the top of the boxplot, in a similar colour
        `outliers` are set to False in this case
    vertical : bool, default=True
        Determines whether to draw the plot vertically or horizontally
    notch : bool, default=False
        Determines whether to draw a notched plot
    capsize : float, default=1.0
        Defines the length of the caps
    outliers : bool, default=True
        If True, displays fliers as outliers
    grid : bool, default=True
        If True: draws gridlines for the numeric axis
    width : float, default=0.7
        Determines the width/height of the box
    label_rotation : float, default=0
        The degrees of rotation to the ticklabels
    label_max_length : int, default=25
        If any label exceeds this length, it truncates it
    spines : tuple, default=('top','left',bottom','right')
        Defines which spines are to be visible
    strip_jitter : float, default=0.15
        With stripplot, defines the amount of jitter in the variables
    theme : str, default="white_circle"
        Choose a 'theme' for the outliers, from {'red_square', 'green_diamond'}

    Other Parameters
    plot_kwargs : dict
        keyword arguments to pass to `ax.boxplot`

    ax : matplotlib.ax object
        Allows further modifications to the axes post-boxplot

    See Also

    Inspiration from https://github.com/jbmouret/matplotlib_for_papers#colored-boxes
    instance_check((X, Y), (list, tuple, np.ndarray, pd.Series))
    instance_check((colors, labels, spines), (type(None), list, pd.Index))
    instance_check(ax, (type(None), mpl.axes.Axes))
    # NOTE(review): the type argument and closing parenthesis of the next check
    # are missing from the visible source.
    instance_check((mannwhitney, vertical, notch, outliers, grid, with_strip),
    instance_check((capsize, width, strip_jitter, label_rotation),
                   (float, int))
    instance_check(theme, str)
    instance_check(label_max_length, int)
    bounds_check(strip_jitter, 0.0, 1.0)

    _X = as_flattened_numpy(X)
    _Y = as_flattened_numpy(Y)
    _style = _get_flier_style(theme)

    if ax is None and vertical:
        fig, ax = plt.subplots(figsize=(3.5, 7))
    elif ax is None and not vertical:
        fig, ax = plt.subplots(figsize=(7, 3.5))

    # the strip overlay replaces the flier markers
    if with_strip:
        outliers = False

    # NOTE(review): as written the horizontal Mann-Whitney branch assigns
    # `spines` twice, so the three-spine tuple is immediately overwritten and
    # `spines` stays None when `mannwhitney` is False. Presumably an `else:`
    # line was lost before the four-spine default -- confirm.
    if spines is None:
        if vertical and mannwhitney:
            spines = ("bottom", "left", "right")
        elif not vertical and mannwhitney:
            spines = ("bottom", "left", "top")
            spines = ("bottom", "left", "top", "right")
    # sort out labels
    # NOTE(review): the closing bracket of this list is missing.
    if labels is None:
        labels = [
            X.name if isinstance(X, pd.Series) else "",
            Y.name if isinstance(Y, pd.Series) else "",
    box_alpha = 1.0 if not with_strip else 0.5

    # NOTE(review): the remaining boxplot arguments are missing.
    patch_obj = ax.boxplot([_X, _Y],
    # define boxplot extras
    _define_boxplot_arguments(ax, patch_obj, vertical, measured, grid, spines,
                              capsize, None)
    # define basic colours - overrides if needs be
    colors = _kcolor_arrangement(patch_obj, colors)
    # label axes
    _label_axes(ax, labels, vertical, label_rotation, label_max_length)
    # if we have stripplot, draw this
    if with_strip:
        # plot x strips
        # NOTE(review): both calls are cut off before their final arguments.
        _overlay_stripplot(_X, ax, 1, width, colors[0], vertical, outliers,
        _overlay_stripplot(_Y, ax, 2, width, colors[1], vertical, outliers,

    # if we have mann-whitney append this info
    if mannwhitney:
        # determine mann-whitney U test
        z, p = mannwhitneyu(_X, _Y)
        # p-value * 2
        p *= 2
        star = _get_stars(p)
        # get dimensions to annotate
        joined = np.concatenate((_X, _Y))
        _max, _min = np.max(joined), np.min(joined)
        # annotate on mann-whitney test
        # NOTE(review): the annotate/text calls that own the keyword arguments
        # below, and the `else:` for the horizontal case, are not visible.
        if vertical:
                xy=(1, _max),
                xytext=(2, _max),
            # add mw text
                _max + np.abs(_max - _min) * 0.1,
                xy=(_max, 2),
                xytext=(_max, 1),
            # add mw text
                _max + np.abs(_max - _min) * 0.1,

    return ax
# NOTE(review): this definition looks truncated in the visible source: the
# parameter list is never closed, the docstring has no terminating quotes and
# the boxplot, labelling and stripplot calls are incomplete. The code is left
# exactly as found; restore the missing lines before relying on it.
def box1d(X: _ArrayLike,
          color: Optional[str] = None,
          label: Optional[str] = None,
          ax: Optional[mpl.axes.Axes] = None,
          with_strip: bool = False,
          vertical: bool = True,
          notch: bool = False,
          capsize: float = 1.0,
          outliers: bool = True,
          axis_scale: Optional[Union[str, Callable]] = None,
          grid: bool = True,
          width: float = 0.7,
          label_rotation: float = 0.0,
          label_max_length: int = 25,
          spines: Optional[_ListLike] = None,
          theme: str = "white_circle",
    """Plots a 1-dimensional boxplot using a vector.

    X : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw. Must be numeric.
    color : str, optional
        If None, uses a default color
    label : str, optional
        If set, draws this on the appropriate axis, if None, does nothing
        If X is of type pandas.Series, uses this label instead.
    ax : matplotlib.ax object, optional, default=None
        If None, creates a plot.
    with_strip : bool, default=False
        If True, draws a stripplot over the top of the boxplot, in a similar colour
        `outliers` are set to False in this case
    vertical : bool, default=True
        Determines whether to draw the plot vertically or horizontally
    notch : bool, default=False
        Determines whether to draw a notched plot
    capsize : float, default=1.0
        Defines the length of the caps
    outliers : bool, default=True
        If True, displays outfliers as outliers
    axis_scale: str/callable, optional
        Scales the data along the axis.
        If str, use {'log', 'sqrt', 'log2'}
        If callable, must reference a `np.*` function which takes array X and returns X'
    grid : bool, default=True
        If True: draws gridlines for the numeric axis
    width : float, default=0.7
        Determines the width/height of the box
    label_rotation : float, default=0
        The degrees of rotation to the ticklabels
    label_max_length : int, default=25
        If any label exceeds this length, it truncates it
    spines : tuple, default=('top','left',bottom','right')
        Defines which spines are to be visible
    theme : str, default="white_circle"
        Choose a 'theme' for the outliers, from {'red_square', 'green_diamond'}

    Other Parameters
    plot_kwargs : dict
        keyword arguments to pass to `ax.boxplot`

    ax : matplotlib.ax object
        Allows further modifications to the axes post-boxplot

    instance_check(X, (np.ndarray, pd.Series, list, tuple))
    instance_check((vertical, notch, outliers, grid, with_strip), bool)
    instance_check(spines, (type(None), list))
    instance_check(theme, str)
    instance_check((label, color), (type(None), str))
    instance_check((capsize, width), float)
    instance_check(label_rotation, (int, float))
    instance_check(label_max_length, int)
    bounds_check(width, 0.0, 1.0)

    # convert option to numpy
    _X = as_flattened_numpy(X)
    _style = _get_flier_style(theme)
    # convert X data if we have axis_scale
    if axis_scale:
        _X = _convert_x_scale(_X, axis_scale)

    # the strip overlay replaces the flier markers
    if with_strip:
        outliers = False
    if ax is None and vertical:
        fig, ax = plt.subplots(figsize=(2.5, 5))
    elif ax is None and not vertical:
        fig, ax = plt.subplots(figsize=(5, 2.5))
    if spines is None:
        spines = ("left", "top", "right", "bottom")
    box_alpha = 1.0 if not with_strip else 0.5

    # NOTE(review): the remaining boxplot arguments are missing.
    patch_obj = ax.boxplot(_X,
    # define basic arguments
    _define_boxplot_arguments(ax, patch_obj, vertical, None, grid, spines,
                              capsize, axis_scale)
    # define colour features
    color = _color_arrangement(ax, patch_obj, color)
    # label the appropriate axes
    # NOTE(review): the call that receives this argument is not visible; the
    # expression prefers the Series name over the `label` argument.
        X.name if isinstance(X, pd.Series) else label,
    # plot the strips
    # NOTE(review): the body of the stripplot branch is missing.
    if with_strip:
    return ax
# NOTE(review): this definition looks truncated in the visible source: the
# docstring has no terminating quotes, several branch bodies are empty and the
# `univariate_kde(` / `_assign_x_label(` calls are never closed. The code is
# left exactly as found; restore the missing lines before relying on it.
def histogram(X: _ArrayLike,
              kde: str = "freeform",
              bins: Optional[Union[int, _ListLike]] = None,
              density: bool = True,
              stat: bool = False,
              ax: Optional[mpl.axes.Axes] = None,
              x_label: str = "",
              title: str = "",
              kde_range: float = 1e-3,
              smoothen_kde: bool = True,
              verbose: int = 0,
              **hist_kwargs) -> mpl.axes.Axes:
    """Draws pretty histograms using `X`.

    X : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw. Must be numeric.
    kde : str/tuple of str, optional, default="freeform"
        If None, does not draw a KDE plot
        If 'freeform': fits the best KDE to the points
        If 'auto': attempts to fit the best `continuous` distribution
        If list/tuple: uses 'auto' to fit the best distribution out of options
        else, choose from available distributions in `scipy.stats`
    bins : int, optional
        If None, uses optimal algorithm to find best bin count
    density : bool, default=True
        If True, uses density approximation
    stat : bool, default=False
        If True, sets statistical variables in legend
    ax : matplotlib.ax object, optional, default=None
        If None, creates one.
    x_label : str, optional, default=None
        If None, uses `x-axis`.
    title : str, optional, default=""
        If None, uses `Default Title`
    kde_range : float, default=1e-3
        Defines the precision on the KDE range if plotted between (1e-3, 1-1e-3)
        Must be > 0.
    smoothen_kde : bool, default=True
        If discrete-distribution, applies smoothing function to KDE if True
    verbose : int, default=0
        If > 0, prints out useful messages

    Other Parameters
    args ; list
        Arguments to pass to `ax.hist`
    kwargs : dict
        Keyword arguments to pass to `ax.hist`

    ax : matplotlib.ax object
        Allows further modifications to the axes post-histogram
    instance_check(X, (np.ndarray, pd.Series, list, tuple))
    instance_check((density, stat, smoothen_kde), bool)
    instance_check((title, x_label), str)
    instance_check(kde, (str, type(None), list, tuple))
    instance_check(kde_range, float)
    bounds_check(verbose, 0, 4)

    # convert to numpy.
    _X = as_flattened_numpy(X)
    # make bins if set to None
    if bins is None:
        # if X is float, use freedman_diaconis_bins determinant, else simply np.arange for integer input.
        bins = get_bins(_X)
    # any truthy `kde` overrides the caller's `density` choice
    if kde:
        density = True
    # plot histogram
    if ax is None:
        fig, ax = plt.subplots(figsize=(8, 5))

    # NOTE(review): the two histogram-drawing calls (with and without the stat
    # label) and the `else:` between them are missing from the visible source.
    if stat:
        stat_label = "mean: {:0.2f}, sd: {:0.3f},\n skew: {:0.3f} kurt: {:0.3f}".format(
            np.nanmean(_X), np.nanstd(_X), stats.skew(_X), stats.kurtosis(_X))
        # plot the histogram
        # plot the histogram


    # NOTE(review): the body of the density branch is missing.
    if density:

    if kde is not None:
        if kde == "auto" or isinstance(kde, (list, tuple)):
            # uses slim parameters by default
            auto_fitted = auto_fit(_X, kde)
            # keep the fitted row with the largest "r" value
            best_model_ = auto_fitted.loc[auto_fitted["r"].idxmax()]
            # set kde to the name given
            x_kde, y_kde, model = univariate_kde(
        elif (kde == "freeform") or hasattr(stats, kde):
            # fetches the kde if possible
            auto_fitted = None
            x_kde, y_kde, model = univariate_kde(
            raise ValueError(
                "kde value '{}' not found in scipy.stats".format(kde))

        # plot
        # NOTE(review): as written the two resets below run right after the KDE
        # is drawn; presumably they belong to a lost `else:` of `kde is not None`.
        ax.plot(x_kde, y_kde, "-", color="r")
        auto_fitted = None
        model = None

    if x_label == "":
        x_label = _assign_x_label(
            X.name if isinstance(X, pd.Series) else "",
            kde is not None,
            model if not kde == "freeform" else None,


    return ax
 def test_bounds_check(self):
     """Check that `utils.bounds_check` is truthy for values inside their bounds."""
     # float case: pi lies strictly between two bounds a hair either side of it
     lower = math.pi - 0.00001
     upper = math.pi + 0.00001
     assert utils.bounds_check(math.pi, lower, upper)
     # integer case: 5 lies between 4 and 6
     assert utils.bounds_check(5, 4, 6)
def basic(
    df: Union[pd.DataFrame, "MetaPanda"],
    y: str,
    x: Optional[SelectorType] = None,
    cv: Union[int, Tuple[int, int]] = 5,
    model: str = "LinearRegression",
    cache: Optional[str] = None,
    plot: bool = False,
    verbose: int = 0,
    **model_kws
):
    """Performs a rudimentary fit model with no parameter searching.

    This function helps to provide a broad overview of how successful a given model is on the
    inputs of x -> y. `cv` returns scoring and timing metrics, as well as coefficients if available, whereas
    `yp` provides predicted values for each given `y`.

    Parameters
    ----------
    df : DataFrame / MetaPanda
        The main dataset.
    y : str
        Target/dependent variable (as column)
    x : list / tuple of str, optional
        A list of selected column names for independent variables. If None uses all except `y` column
    cv : int / tuple, default=5
        If int: just reflects number of cross-validations
        If Tuple: (cross_validation, n_repeats) for `RepeatedKFold`
    model : str / sklearn model, default="LinearRegression"
        The name of a scikit-learn model, or the model object itself.
    cache : str, optional
        If not None, stores the resulting model parts in JSON and reloads if present.
    plot : bool, default=False
        If True, produces `overview_plot` inplace.
    verbose : int, default=0
        If > 0, prints out statements depending on level.

    Other Parameters
    ----------------
    model_kws : dict, optional
        Keywords to pass to the sklearn model which are not parameterized.

    Returns
    -------
    cv : MetaPanda
        A dataframe result of cross-validated repeats. Can include w_ coefficients.
    yp : pd.DataFrame
        The predictions for each of y, as a single column named after `y`.

    Raises
    ------
    AssertionError
        If `model` is not recognised by `is_sklearn_model`.

    Notes
    -----
    Shorthand names for the models, i.e `lm` for LinearRegression
        or `gauss` for a GaussianProcessRegressor, are accepted.

    By default, `fit_basic` uses the root mean squared error (RMSE). There is currently no option to change this.

    By default, this model assumes you are working with a regression problem. Classification compatibility
    will arrive in a later version.

    See Also
    --------
    fit_grid : Performs exhaustive grid search analysis on the models selected.

    References
    ----------
    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.
    """
    # checks
    instance_check(df, (pd.DataFrame, MetaPanda))
    instance_check(x, (type(None), str, list, tuple, pd.Index))
    instance_check(y, str)
    instance_check(cv, (int, tuple))
    instance_check(cache, (type(None), str))
    instance_check(plot, bool)
    bounds_check(verbose, 0, 4)
    # the placeholder was previously never filled in, so the message showed a literal '{}'
    assert is_sklearn_model(model), "model '{}' is not a valid sklearn model.".format(model)

    # downcast to a plain DataFrame and resolve the feature columns
    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    _xcols = select_xcols(_df, x, y)

    rep = _define_regression_kfold_object(cv)
    lm, pkg_name = find_sklearn_model(model, "regression")
    # assign keywords to lm
    # NOTE(review): assumes `lm` is an estimator instance exposing `set_params` - confirm.
    if model_kws:
        lm.set_params(**model_kws)
    # make data set machine learning ready.
    _x, _y = preprocess_continuous_X_y(_df, _xcols, y)

    if verbose > 0:
        # `_df` is a pandas DataFrame here, so its dimensions come from `.shape`
        # (the MetaPanda-only `n_`/`p_` attributes are not available on it).
        print(
            "full dataset: {}/{} -> ML: {}/{}({},{})".format(
                _df.shape[0], _df.shape[1], _df.shape[0], _df.shape[1], _x.shape[1], 1
            )
        )

    # function 1: performing cross-validated fit.
    def _perform_cv_fit(
        _x: np.ndarray, _columns: pd.Index, _y: np.ndarray, _rep, _lm, package_name: str
    ) -> "MetaPanda":
        # imported locally so this block does not depend on a module-level import
        from sklearn.model_selection import cross_validate

        # cv cross-validate and wrap.
        # NOTE(review): keyword arguments reconstructed; `return_estimator` is required by the
        # "estimator" column used below, RMSE scoring follows the docstring - confirm the rest.
        score_mat = pd.DataFrame(
            cross_validate(
                _lm,
                _x,
                _y,
                cv=_rep,
                return_estimator=True,
                return_train_score=True,
                scoring="neg_root_mean_squared_error",
            )
        )
        # append results to cv
        # if repeatedkfold, add n_repeats
        if isinstance(_rep, RepeatedKFold):
            # RepeatedKFold stores the per-repeat fold count in `cvargs`, not as an attribute
            n_folds = getattr(_rep, "n_splits", None)
            if n_folds is None:
                n_folds = _rep.cvargs["n_splits"]
            score_mat["k"] = np.repeat(np.arange(n_folds), _rep.n_repeats)
        else:
            score_mat["k"] = np.arange(_rep.n_splits)
        # extract coefficients
        coef = _extract_coefficients_from_model(score_mat, _columns, package_name)
        # integrate coefficients
        if not isinstance(coef, (list, tuple)):
            score_mat = score_mat.join(coef.add_prefix("w__"))
        # drop estimator
        score_mat.drop("estimator", axis=1, inplace=True)
        # wrap as metapanda and return
        return MetaPanda(score_mat)

    # function 2: performing cross-validated predictions.
    def _perform_prediction_fit(
        _x: np.ndarray, _y: np.ndarray, _ind: pd.Index, _yn: str, _rep, _lm
    ) -> pd.DataFrame:
        return pd.Series(cross_val_predict(_lm, _x, _y, cv=_rep), index=_ind).to_frame(
            _yn
        )

    if cache is not None:
        cache_cv = insert_suffix(cache, "_cv")
        cache_yp = insert_suffix(cache, "_yp")
        # NOTE(review): the argument order of `cache_function` is assumed to be
        # (function, filename, verbose, *function_args) - verify against its definition.
        _cv = cache_function(
            _perform_cv_fit, cache_cv, verbose, _x, _xcols, _y, rep, lm, pkg_name
        )
        _yp = cache_function(
            _perform_prediction_fit, cache_yp, verbose, _x, _y, _df.index, y, rep, lm
        )
    else:
        _cv = _perform_cv_fit(_x, _xcols, _y, rep, lm, pkg_name)
        # arguments follow the helper signature (_x, _y, _ind, _yn, _rep, _lm);
        # previously `_df` was passed in the `_x` slot, shifting every argument by one.
        _yp = _perform_prediction_fit(_x, _y, _df.index, y, rep, lm)

    if plot:
        overview(_df, x, y, _cv, _yp)
    # return both.
    return _cv, _yp
def pca(
    df: Union[np.ndarray, pd.DataFrame, MetaPanda],
    x: Optional[SelectorType] = None,
    preprocess: bool = True,
    refit: bool = False,
    with_transform: bool = False,
    plot: bool = False,
    whiten: bool = False,
    sparsity: float = 0.0,
    variance_threshold: float = 0.9,
    plot_kwargs: Optional[Dict] = None,
):
    """Fits a PCA model to the data set.

    .. note:: Supports vectorization and `Param`. See `turb.vectorize`.

    Parameters
    ----------
    df : np.ndarray / pd.DataFrame / MetaPanda
        The full dataset
    x : selector
        A subset of df to select (if MetaPanda), optionally
    preprocess : bool, default=True
        Preprocesses the data matrix X if set. Only preprocesses if pandas.DataFrame or above
        Uses the `.pipe.clean1` function which includes zscore,
            dropping object columns and NA.
    refit : bool, default=False
        If True, a second PCA model is fitted using the 'best'
        proportional variance/AUC which is returned.
    with_transform : bool, default=False
        If True, returns transformed `X` as a second argument.
    plot : bool, default=False
        If True, plots an 'overview' of the PCA result
    whiten : bool, default=False
        When True (False by default) the components_ vectors are multiplied by the square root of n_samples
        and then divided by the singular values to ensure uncorrelated outputs with unit component-wise variances.
        Whitening will remove some information from the transformed signal (the relative variance scales of the
         components) but can sometimes improve the predictive accuracy of the downstream estimators
          by making their data respect some hard-wired assumptions.
    sparsity : float, default = 0.0
        If `sparsity` > 0, uses `SparsePCA` algorithm to induce sparse components using
         L1 norm.
    variance_threshold : float, default=0.9
        Determines the threshold of 'cumulative proportional variance'
         to select a refitted model from. Must be 0 <= `variance_threshold` <= 1.
    plot_kwargs : dict, optional
        optional arguments to pass to `pca_overview`.

    Returns
    -------
    model : sklearn.decomposition.PCA
        A PCA model
    X_t : np.ndarray/pd.DataFrame, optional
        The transformed input matrix `X`. Returned if `with_transform` is True.
        A DataFrame (sharing the index of the fitted matrix) when that matrix
        carries an index, otherwise the raw transformed array.
    """
    instance_check(df, (np.ndarray, pd.DataFrame, MetaPanda))
    instance_check((preprocess, plot, whiten, refit, with_transform), bool)
    instance_check(plot_kwargs, (type(None), dict))
    instance_check(sparsity, float)
    bounds_check(variance_threshold, 0.0, 1.0)

    # define our selected columns
    if x is None:
        if not isinstance(df, np.ndarray):
            x = df.columns
        else:
            # arrays carry no column names, so synthesise them from the column count
            x = pd.Index(patproduct("X%d", range(df.shape[1])))
    # extract x columns
    if isinstance(df, MetaPanda):
        cols = df.view(x)
    else:
        cols = x

    # generate ML ready subset
    if preprocess and not isinstance(df, np.ndarray):
        _x = preprocess_continuous_X(df, cols)
    else:
        _x = df

    def _transformed(fitted):
        # re-attach the row index only when the fitted matrix actually has one;
        # plain arrays have no `.index`, so they are returned as transformed arrays.
        x_t = fitted.transform(_x)
        if hasattr(_x, "index"):
            return pd.DataFrame(x_t, index=_x.index)
        return x_t

    # determine PCA model
    _model = _create_pca_model(_x.shape[1], sparsity, whiten)
    #  fit the model
    _model.fit(_x)

    if plot:
        if plot_kwargs is None:
            plot_kwargs = {}
        # NOTE(review): plotting function name taken from the `plot_kwargs` documentation
        # above - confirm it matches the imported helper.
        pca_overview(
            _model, labels=cols, cutoff_selection=variance_threshold, **plot_kwargs
        )

    result = _model
    # if we refit the model, refit it and return
    if refit:
        # calculate best index (N)
        _ycum = np.cumsum(_model.explained_variance_ratio_)
        above = np.flatnonzero(_ycum > variance_threshold)
        # if the cumulative variance never exceeds the threshold (e.g. threshold of 1.0),
        # keep every component instead of failing on an empty lookup
        new_n = int(above[0]) + 1 if above.size > 0 else int(_ycum.shape[0])
        # fit a new PCA model.
        _pcan = _create_pca_model(new_n, sparsity, whiten)
        _pcan.fit(_x)
        result = _pcan

    if with_transform:
        return result, _transformed(result)
    return result
def correlate(
        data: Union[pd.DataFrame, MetaPanda],
        x: Optional[SelectorType] = None,
        y: Optional[SelectorType] = None,
        covar: Optional[SelectorType] = None,
        cartesian_covar: bool = False,
        output: str = "full",
        method: str = "spearman",
        verbose: int = 0,
) -> pd.DataFrame:
    """Correlates X and Y together to generate a list of correlations.

    If X/Y are MetaPandas, returns a MetaPanda object, else returns pandas.DataFrame

    Parameters
    ----------
    data : pd.DataFrame / MetaPanda
        The full dataset.
    x : (str, list, tuple, pd.Index), optional
        Subset of input(s) for column names.
            if None, uses the full dataset. Y must be None in this case also.
    y : (str, list, tuple, pd.Index), optional
        Subset of output(s) for column names.
            if None, uses the full dataset (from optional `x` subset)
    covar : (str, list, tuple, pd.Index), optional
        set of covariate(s). Covariates are needed to compute partial correlations.
            If None, uses standard correlation.
    cartesian_covar : bool, default=False
        If True, and if covar is not None, separates every
            element in covar to individually control for
        using the cartesian product
    output : str, default="full"
        Choose from {'full', 'score'}. Score just returns `r` number.
    method : str, default="spearman"
        Method to correlate with. Choose from:
            'pearson' : Pearson product-moment correlation
            'spearman' : Spearman rank-order correlation
            'kendall' : Kendall's tau (ordinal data)
            'biserial' : Biserial correlation (continuous and boolean data only)
            'percbend' : percentage bend correlation (robust)
            'shepherd' : Shepherd's pi correlation (robust Spearman)
            'skipped' : skipped correlation (robust Spearman, requires sklearn)
    verbose : int, default=0
        If > 0, prints out useful debugging messages

    Returns
    -------
    R : pd.DataFrame
        correlation rows (based on pingouin structure)

    Raises
    ------
    TypeError
        If `covar` is given and its columns are not all float/continuous.
    ValueError
        If the combination of `x` and `y` types is not supported.

    Examples
    --------
    >>> import turbopanda as turb
    >>> data = turb.read('example.json')
    >>> R = turb.correlate(data) # uses full dataset
                 X         M         Y      Mbin      Ybin
    X     1.000000  0.392251  0.059771 -0.014405 -0.149210
    M     0.392251  1.000000  0.545618 -0.015622 -0.094309
    Y     0.059771  0.545618  1.000000 -0.007009  0.161334
    Mbin -0.014405 -0.015622 -0.007009  1.000000 -0.076614
    Ybin -0.149210 -0.094309  0.161334 -0.076614  1.000000
    >>> R = turb.correlate(data, x=('X', 'M', 'Y')) # uses subset of dataset
                 X         M         Y
    X     1.000000  0.392251  0.059771
    M     0.392251  1.000000  0.545618
    Y     0.059771  0.545618  1.000000

    # correlates X columns against Ybin
    >>> R = turb.correlate(data, x=('X', 'M', 'Y'), y='Ybin')
                    X         M         Y
    Ybin     1.000000  0.392251  0.059771

    # correlates X against Ybin controlling for Y
    >>> R = turb.correlate(data, x='X', y='Ybin', covar='Y')
                     X
    Ybin     -0.149210

    # using a different technique
    >>> R = turb.correlate(data, method="shepherd")
                 X         M         Y      Mbin      Ybin
    X     1.000000  0.392251  0.059771 -0.014405 -0.149210
    M     0.392251  1.000000  0.545618 -0.015622 -0.094309
    Y     0.059771  0.545618  1.000000 -0.007009  0.161334
    Mbin -0.014405 -0.015622 -0.007009  1.000000 -0.076614
    Ybin -0.149210 -0.094309  0.161334 -0.076614  1.000000
    """
    # data cannot be NONE
    instance_check(data, (pd.DataFrame, MetaPanda))
    instance_check((x, y, covar), (type(None), str, list, tuple, pd.Index))
    instance_check(cartesian_covar, bool)
    belongs(output, ("full", "score"))
    bounds_check(verbose, 0, 4)

    # downcast to dataframe option
    df = data.df_ if not isinstance(data, pd.DataFrame) else data
    # downcast if list/tuple/pd.index is of length 1
    x = x[0] if (isinstance(x, (tuple, list, pd.Index)) and len(x) == 1) else x
    y = y[0] if (isinstance(y, (tuple, list, pd.Index)) and len(y) == 1) else y

    # convert using `view` if we have string instances.
    if isinstance(x, str):
        x = pattern(x, df.columns)
    if isinstance(y, str):
        y = pattern(y, df.columns)
    if isinstance(covar, str):
        covar = pattern(covar, df.columns)

    # perform a check to make sure every column in `covar` is continuous.
    if covar is not None:
        # index the downcast DataFrame (the same object the column names were resolved
        # against); `list(...)` avoids pandas treating a tuple as a single column key.
        if not is_dataframe_float(df[list(covar)]):
            raise TypeError(
                "`covar` variables in `correlate` all must be of type `float`/continuous."
            )

    sequence_types = (list, tuple, pd.Index)

    # execute various use cases based on the presence of x, y, and covar, respectively.
    if x is None and y is None:
        # here just perform matrix-based correlation
        n_cols = df.columns.shape[0]
        comb = it.combinations_with_replacement(df.columns, 2)
        # number of unordered pairs with replacement is n(n+1)/2; the previous
        # `n**2 // 2 + n // 2` form undercounted by one for odd n.
        niter = n_cols * (n_cols + 1) // 2
    elif isinstance(x, sequence_types) and y is None:
        # use a subset of x, in union with covar
        comb = it.combinations_with_replacement(x, 2)
        niter = len(x) * (len(x) + 1) // 2
    elif isinstance(x, sequence_types) and isinstance(y, str):
        # list of x, y str -> matrix-vector cartesian product
        comb = it.product(x, [y])
        niter = len(x)
    elif isinstance(y, sequence_types) and isinstance(x, str):
        # list of y, x str -> matrix-vector cartesian product
        comb = it.product(y, [x])
        niter = len(y)
    elif isinstance(x, sequence_types) and isinstance(y, sequence_types):
        # list of x, y -> cartesian product of x: y terms
        comb = it.product(x, y)
        niter = len(x) * len(y)
    else:
        raise ValueError("X: {}; Y: {}; Z: {} combination unknown.".format(x, y, covar))
    # return the combination of these effects.
    return _corr_combination(
        df, comb, niter, covar, cartesian_covar, method, output, verbose
    )