def dummy_categorical(cat: pd.Series) -> pd.DataFrame:
    """Given a pd.Series of type 'category', return boolean dummies as a matrix."""
    instance_check(cat, pd.Series)
    if cat.dtype == "category":
        # 'np.bool' was removed in NumPy >= 1.24; the builtin `bool` works identically here
        return pd.get_dummies(cat).add_prefix("is_").astype(bool)
    else:
        raise TypeError("'cat' Series is {}, not of type 'category'".format(cat.dtype))
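# Example (a hedged sketch of typical usage of `dummy_categorical` above):
#   >>> s = pd.Series(["a", "b", "a"], dtype="category")
#   >>> dummy_categorical(s)
#        is_a   is_b
#   0   True  False
#   1  False   True
#   2   True  False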
def shorten(s, newl: int = 15, strategy: str = "middle"):
    """Shortens a string or array of strings to length `newl`.

    Parameters
    ----------
    s : str / list of str / np.ndarray / pd.Series / pd.Index
        The string or list of strings to shorten
    newl : int, default=15
        The number of characters to preserve (5 on each side + spaces)
    strategy : str, default="middle"
        Choose from {'middle', 'end'}; determines where the dots are placed

    Returns
    -------
    ns : str / list of str
        A shortened string or array of strings
    """
    instance_check(s, (str, list, tuple, np.ndarray, Series, Index))
    instance_check(newl, int)
    belongs(strategy, ("middle", "end"))

    if isinstance(s, str):
        return _shorten_string(s, newl, strategy)
    else:
        # create a partial passing in keyword arguments to every call.
        _shorten_part = partial(_shorten_string, approp_len=newl, strategy=strategy)
        # map through the strings and shorten them.
        return list(map(_shorten_part, s))
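# Example (a hedged sketch): middle-shortening keeps both ends of each string.
#   >>> shorten("transcription_factor_pathway", newl=15)
#   'transc..athway'
#   >>> shorten(["a_very_long_column_name", "ok"], newl=10, strategy="end")
#   ['a_very_l..', 'ok']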
def cat_array_to_color(array, cmap="Blues", rstart=0., rend=1.):
    """Given some list/array of values, find some way of mapping this to colour values.

    Parameters
    ----------
    array : np.ndarray
        An array of values.
    cmap : str, dict, list-like
        Either a string or an array referencing the 'unique' colours.
        If dict (k = name, v = hex color)
    rstart : float [0..1]
        The start of the color range
    rend : float [0..1]
        The end of the color range
    """
    instance_check(cmap, (str, dict, list, tuple))
    # map to numpy
    _array = (
        np.asarray(array).flatten()
        if not isinstance(array, (np.ndarray, pd.Series))
        else array
    )
    # if boolean, cast as a 'string' ('np.str' was removed in NumPy >= 1.24)
    if _array.dtype.kind == "b":
        _array = _array.astype(str)
    if (_array.dtype.kind == "U") | (_array.dtype.kind == "O"):
        # i.e. we have a string array
        name_uniq = unique_ordered(_array)
        col_uniq = _colormap_to_hex(
            cm.get_cmap(cmap)(np.linspace(rstart, rend, len(name_uniq)))
        )
        name_col_map = dict(zip(name_uniq, col_uniq))
        # substitute each name for its mapped hex colour
        cl = [name_col_map[s] for s in _array]
        return np.asarray(cl), "discrete"
    else:
        return _array, "continuous"
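# Example (a hedged sketch): string/boolean arrays map to discrete hex colours,
# numeric arrays pass straight through as a continuous scale.
#   >>> cat_array_to_color(np.array(["lo", "hi", "lo"]), cmap="Blues")
#   (array(['#...', '#...', '#...'], dtype='<U7'), 'discrete')
#   >>> cat_array_to_color(np.array([0.1, 0.5, 0.9]))
#   (array([0.1, 0.5, 0.9]), 'continuous')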
def common_substrings(
    a: Union[str, List[str]],
    b: Optional[Union[str, List[str]]] = None,
    min_length: int = 2,
) -> Union[str, Series]:
    """Given at least one pair of strings, find all the best common substring matches.

    By default, if only `a` is passed, the pairwise combinations between all values
    in the list are used; otherwise with `a` + `b`, the cartesian product of the
    two lists is used.

    Parameters
    ----------
    a : str/list of str
        A word or list of words to find the common substring to
    b : str/list of str, optional
        A word or list of words to find the common substring to.
        If None, pairwise combinations in `a` are used
    min_length : int, default=2
        The minimum accepted length of string for a given pair

    Returns
    -------
    z_up : str/Series
        str returned if (a, b) are strs, else Series of value counts
    """
    instance_check(a, (str, list, tuple, Index))
    instance_check(b, (type(None), str, list, tuple, Index))
    nonnegative(min_length, int)
    # prevent a case where a can be a str, b is None
    disallow_instance_pair(a, str, b, type(None))

    filters = ("", "_", "__", "-")

    if isinstance(a, str) and b is None:
        return a
    elif isinstance(a, str) and isinstance(b, str):
        return _single_common_substring_match(a, b)
    else:
        if isinstance(a, str):
            a = [a]
        elif isinstance(b, str):
            b = [b]
        # determine pair set.
        if b is None:
            # combination iterator
            pair_groups = it.combinations(a, 2)
        else:
            # cartesian product iterator
            pair_groups = it.product(a, b)
        # generate pairs
        z = [_single_common_substring_match(i, j) for i, j in pair_groups]

        def filter_func(x):
            """Filters out junk elements, too-short strings and single occurrences."""
            return (x in filters) or (len(x) < min_length) or (z.count(x) <= 1)

        # filter out naff elements
        z_up = list(it.filterfalse(filter_func, z))
        # save as series value counts.
        return Series(z_up, dtype=object).value_counts()
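# Example (a hedged sketch): pairwise common substrings across one list; the exact
# value counts depend on the private `_single_common_substring_match` helper.
#   >>> common_substrings(["gene_x1", "gene_x2", "gene_y1"])
#   gene_    3
#   dtype: int64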
def string_replace(strings: Union[str, List[str], Tuple[str, ...], Series, Index],
                   *operations: Tuple[str, str]):
    """Performs all replace operations on the strings, in order.

    By default, if `operations` is a single list of these tuples, it will work also.

    Parameters
    ----------
    strings : str, list, tuple, Series, Index
        A string or set of strings to possibly rename
    operations : arguments of tuple-2 (before, after) strings
        The 'before' string chooses a subsection to change, and 'after' is the replacement

    Returns
    -------
    strings_new : list, tuple, Series, Index
        The replaced-string array

    Examples
    --------
    This function allows you to perform an ordered set of changes to an array of strings:

    >>> from turbopanda.str import string_replace as strrepl
    >>> strrepl(['hello', 'i am', 'pleased'], ("i", "u"))
    ['hello', 'u am', 'pleased']

    We can perform multiple changes:

    >>> strrepl(['hello', 'i am', 'pleased'], ("i", "you"), ("am", "are"))
    ['hello', 'you are', 'pleased']

    The changes also obey the order, so potentially the same string can be changed
    more than once:

    >>> strrepl(['hello', 'i am', 'pleased'], ("hello", "goodbye"), ("good", "bad"))
    ['badbye', 'i am', 'pleased']

    Note that for backwards compatibility, if there is only one argument and it is a
    list, it is treated as a stack of operation tuples to parse:

    >>> strrepl(['hello', 'i am', 'pleased'], [("hello", "goodbye"), ("good", "bad")])
    ['badbye', 'i am', 'pleased']

    See Also
    --------
    re.search
    turbopanda.str.pattern
    """
    # perform check
    instance_check(strings, (str, list, tuple, Series, Index))
    # convert single list to op list
    if len(operations) == 1 and isinstance(operations[0], list):
        operations = operations[0]

    if isinstance(strings, str):
        return reduce(lambda sold, arg: sold.replace(*arg), [strings, *operations])
    elif len(strings) == 0:
        return strings
    else:
        strings_new = list(
            reduce(
                lambda sold, arg: map(lambda s: s.replace(*arg), sold),
                [strings, *operations],
            )
        )
        return transform_copy(strings, strings_new)
def test_instance_check(self):
    # single example
    x = ['abba', 'father', 'cross']
    assert utils.instance_check(x, list)
    y = np.array([1, 2, 3], dtype=float)
    assert utils.instance_check(y, np.ndarray)
    # multiples given a tuple
    ij = True
    ji = False
    assert utils.instance_check((ij, ji), bool)
def multivariate_gaussians(n, k, C=0.5):
    """Creates k multivariate Gaussian distributions with sample size n, according to correlations C.

    All Gaussians have mu = 0, sigma = [ratio of C].

    Parameters
    ----------
    n : int
        Sample size
    k : int, list of int
        Dimensionality of one group (int) or each group of multivariate Gaussians (list of int)
    C : float, list of float
        Correlation strength [-1...1] for all groups (float) or each group (list of float)

    Returns
    -------
    X : np.ndarray (n, sum(k))
        Multivariate Gaussian synthetic data
    """
    if n < 1:
        raise ValueError("'n' must be > 0")
    # 'np.int'/'np.float' aliases were removed in NumPy >= 1.24;
    # accept builtins and NumPy scalar types instead
    instance_check(k, (int, np.integer, list, tuple))
    instance_check(C, (float, np.floating, list, tuple))
    # if C is a list, ensure k and C are the same length
    if isinstance(C, (list, tuple)) and isinstance(k, (list, tuple)):
        arrays_equal_size(k, C)
    # handle single k case
    if isinstance(k, (int, np.integer)):
        # just one gaussian group
        if isinstance(C, (float, np.floating)):
            return np.random.multivariate_normal(
                np.zeros(k), covariance_matrix(k, C, random_direction=False), size=n
            )
        else:
            raise ValueError("'C' must be of type 'float' when 'k' is of type 'int'")
    else:
        # must be a list, iterate over it
        result = []
        for i, p in enumerate(k):
            mu = np.zeros(p)
            # collect the correlation ratio if it's a float, or from a list
            c = C if isinstance(C, (float, np.floating)) else C[i]
            # compute covariance matrix
            cov = covariance_matrix(p, c, random_direction=False)
            # make data
            X = np.random.multivariate_normal(mu, cov, size=n)
            result.append(X)
        return np.hstack(result)
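# Example (a hedged sketch): two groups of correlated Gaussians, stacked column-wise.
#   >>> X = multivariate_gaussians(n=200, k=[3, 4], C=[0.2, 0.8])
#   >>> X.shape
#   (200, 7)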
def _shorten_string(s: str, approp_len: int = 15, strategy: str = "middle") -> str:
    instance_check(s, str)
    if len(s) <= approp_len:
        return s
    elif strategy == "end":
        return s[:approp_len - 2] + ".."
    elif strategy == "middle":
        midpoint = (approp_len - 2) // 2
        return s[:midpoint] + ".." + s[-midpoint:]
    else:
        # only 'middle' and 'end' are implemented
        raise ValueError("strategy '{}' not in {}".format(strategy, ("middle", "end")))
def dtypes(self, grouped: bool = True) -> Union[pd.Series, pd.DataFrame]:
    """Determine the grouped data types in the dataset.

    Parameters
    ----------
    grouped : bool, optional
        If True, returns the value_counts of each data type, else returns the direct types.

    Returns
    -------
    true_types : pd.Series/pd.DataFrame
        A series of index (group/name) and value (count/type)
    """
    instance_check(grouped, bool)
    return (
        self.meta_["true_type"].value_counts()
        if grouped
        else self.meta_["true_type"]
    )
def expand(self, column: str, sep: str = ","):
    """Expand out a 'stacked' id column to a longer-form DataFrame.

    Expands out a 'stacked' id column to a longer-form DataFrame,
    and re-merges the remaining data back in.

    Parameters
    ----------
    column : str
        The name of the column to expand, must be of datatype [object]
    sep : str, optional
        The separating string to use.

    Raises
    ------
    ValueError
        If `column` is not found in `df_` or `meta_`, or `column` is not stackable

    Returns
    -------
    self

    See Also
    --------
    shrink : Contracts an expanded id column back into a shorter-form DataFrame.
    """
    instance_check((column, sep), str)
    if column not in self.df_.columns:
        raise ValueError("column '{}' not found in df".format(column))

    self._df = pd.merge(
        # expand out id column
        self.df_[column].str.strip().str.split(sep).explode(),
        self.df_.dropna(subset=[column]).drop(column, axis=1),
        left_index=True,
        right_index=True,
    )
    self._df.columns.name = "colnames"
    return self
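# Example (a hedged sketch; assumes a MetaPanda wrapping a stacked id column):
#   >>> mdf = MetaPanda(pd.DataFrame({"ids": ["a,b", "c"], "val": [1, 2]}))
#   >>> mdf.expand("ids").df_
#     ids  val
#   0   a    1
#   0   b    1
#   1   c    2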
def apply(self, f_name: str, *f_args, **f_kwargs):
    """Apply a `pd.DataFrame` function to `df_`.

    e.g. mdf.apply("groupby", ["counter", "refseq_id"], as_index=False)
    applies self.df_.groupby() to the data, and the return value is stored in df_;
    assumes a pandas.DataFrame is returned.

    Parameters
    ----------
    f_name : str
        The name of the function
    f_args : list/tuple, optional
        Arguments to pass to the function
    f_kwargs : dict, optional
        Keyword arguments to pass to the function

    Returns
    -------
    self
    """
    instance_check(f_name, str)
    self._apply_function(f_name, *f_args, **f_kwargs)
    return self
def kde2d(X: Union[np.ndarray, Series, List, Tuple],
          Y: Union[np.ndarray, Series, List, Tuple],
          c: str = "red",
          ax: mpl.axes.Axes = None,
          fill: bool = False,
          with_scatter: bool = False,
          **contour_kwargs):
    """TODO: Generates a 2D KDE using contours."""
    instance_check((X, Y), (list, tuple, np.ndarray, Series))
    instance_check(c, str)
    instance_check((fill, with_scatter), bool)
    # `ax` may legitimately be None here; we create one below in that case
    instance_check(ax, (type(None), mpl.axes.Axes))
    arrays_equal_size(X, Y)

    # calculate density
    _X, _Y = remove_na(np.asarray(X), np.asarray(Y), paired=True)
    H = density(_X, _Y)
    offx = np.abs(_X.max() - _X.min()) / 15.0
    offy = np.abs(_Y.max() - _Y.min()) / 15.0
    _alpha = 0.5 if with_scatter else 1.0
    extent = (_X.min() - offx, _X.max() + offx, _Y.min() - offy, _Y.max() + offy)

    if ax is None:
        fig, ax = plt.subplots(figsize=(8, 5))

    if fill:
        # note: contourf/contour take `colors`, not `color`
        ax.contourf(H, extent=extent, colors=c, alpha=_alpha)
    else:
        cset = ax.contour(H, extent=extent, colors=c, **contour_kwargs)
        ax.clabel(cset, inline=1, fontsize=10)

    if with_scatter:
        ax.scatter(_X, _Y, c=c, alpha=_alpha)

    return ax
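# Example (a hedged sketch): filled 2D KDE of two correlated variables.
#   >>> rng = np.random.default_rng(0)
#   >>> x = rng.normal(size=500)
#   >>> y = 0.5 * x + rng.normal(scale=0.5, size=500)
#   >>> kde2d(x, y, c="blue", fill=True, with_scatter=True)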
def density(
    X: np.ndarray,
    Y: Optional[np.ndarray] = None,
    Z: Optional[np.ndarray] = None,
    r: Optional[int] = None,
) -> np.ndarray:
    """Estimates the density of X using binning; accepts np.ndarray.

    Parameters
    ----------
    X : np.ndarray (n,)
        The first dimension
    Y : np.ndarray (n,), optional
        The second dimension
    Z : np.ndarray (n,), optional
        The third dimension
    r : int, optional
        The number of bins for each dimension.
        If None, uses the Freedman-Diaconis rule

    Returns
    -------
    d : np.ndarray (r,...)
        The density in binned dimensions
    """
    instance_check(X, np.ndarray)
    instance_check((Y, Z), (type(None), np.ndarray))
    instance_check(r, (type(None), int))

    if r is None:
        r = min(freedman_diaconis_bins(X), 50)
    else:
        nonnegative(r, int)

    if Y is None and Z is None:
        _X = remove_na(X)
        return np.histogram(_X, bins=r, density=True)[0]
    elif Z is None:
        _X, _Y = remove_na(X, Y, paired=True)
        return np.histogram2d(_X, _Y, bins=(r, r), density=True)[0]
    else:
        return np.histogramdd(np.vstack((X, Y, Z)).T, bins=(r, r, r), density=True)[0]
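# Example (a hedged sketch): 1D and 2D binned density estimates.
#   >>> x = np.random.normal(size=1000)
#   >>> d1 = density(x, r=20)       # shape (20,)
#   >>> d2 = density(x, x, r=20)    # shape (20, 20)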
def cachedec(
    _func=None,
    *,
    filename: str = "example1.pkl",
    compress: int = 0,
    return_as: str = "MetaPanda"
) -> Callable:
    """Provides automatic decorator caching for objects.

    Especially compatible with `turb.MetaPanda` or `pd.DataFrame`.

    .. note:: this is a decorator function, not to be called directly.
        All parameters must be passed as keyword arguments.

    Parameters
    ----------
    _func
    filename : str, optional
        The name of the file to cache to, or read from. This is fixed.
        Accepts {'json', 'csv', 'pkl'} extensions only.
    compress : int [0-9] or 2-tuple, optional
        Optional compression level for the data. 0 or False is no compression.
        Higher value means more compression, but also slower read and
        write times. Using a value of 3 is often a good compromise.
        See the notes for more details.
        If compress is True, the compression level used is 3.
        If compress is a 2-tuple, the first element must correspond to a string
        between supported compressors (e.g. 'zlib', 'gzip', 'bz2', 'lzma', 'xz'),
        the second element must be an integer from 0 to 9, corresponding to the
        compression level.
    return_as : str, default="MetaPanda"
        Accepts {'pandas', 'MetaPanda'}.
        Only applies if filename is "csv" or "json". Attempts to cast the return
        object as something palatable to the user.

    Warnings
    --------
    ImportWarning
        Returned object from cache isn't of type {pd.DataFrame, MetaPanda}

    Raises
    ------
    TypeError
        `filename` isn't of type `str`
    ValueError
        `filename` extension isn't found in {'json', 'csv', 'pkl'}

    Returns
    -------
    mp : turb.MetaPanda / object
        The MetaPanda object if {'csv' or 'json'}, otherwise uses serialized
        pickling which can return an arbitrary object.

    Examples
    --------
    For example, we call as a decorator to our custom function
    (note that `filename` must be passed by keyword):

    >>> from turbopanda import cachedec
    >>> @cachedec(filename='meta_file.json')
    >>> def f(x):
    ...     return turb.MetaPanda(x)

    These also work with numpy arrays or python objects by using `joblib`:

    >>> @cachedec(filename="meta.pkl")
    >>> def g(x):
    ...     return [1, 2, [3, 4], {"hi": "moto"}]
    """
    # check it is string
    instance_check(filename, str)
    file_ext = filename.rsplit(".")[-1]
    # check that file ends with json, csv or pkl
    belongs(file_ext, ("json", "csv", "pkl"))

    # define decorator
    def _decorator_cache(func):
        """Basic decorator."""

        @functools.wraps(func)
        def _wrapper_cache(*args, **kwargs):
            # if we find the file
            if os.path.isfile(filename):
                # if it's .csv or .json, use `read`
                if file_ext in ("json", "csv"):
                    # read it in
                    mdf = read(filename)
                    _set_index_def(mdf.df_)
                    return mdf if return_as == "MetaPanda" else mdf.df_
                elif is_joblib_installed(raise_error=True):
                    import joblib

                    return joblib.load(filename)
            else:
                # returns MetaPanda or pandas.DataFrame
                mpf = func(*args, **kwargs)
                if isinstance(mpf, MetaPanda):
                    # save file
                    mpf.write(filename)
                    return mpf if return_as == "MetaPanda" else mpf.df_
                elif isinstance(mpf, DataFrame):
                    # save - bumping index into the file.
                    mpf.reset_index().to_csv(filename, index=None)
                    return MetaPanda(mpf) if return_as == "MetaPanda" else mpf
                elif is_joblib_installed(raise_error=True):
                    import joblib

                    # attempt to use joblib to dump
                    joblib.dump(mpf, filename, compress=compress)
                    return mpf

        return _wrapper_cache

    if _func is None:
        return _decorator_cache
    else:
        return _decorator_cache(_func)
def cached_chunk(
    func: Callable,
    param_name: str,
    param_values: Union[List, Tuple],
    parallel: bool = True,
    filename: str = "example1.json",
    verbose: int = 0,
    *args,
    **kwargs
) -> "MetaPanda":
    """Provides chunked automatic {.json, .csv} caching for `turb.MetaPanda` or `pd.DataFrame`.

    .. note:: custom function must return a `pd.DataFrame` or `turb.MetaPanda` object.

    Parameters
    ----------
    func : function
        A custom function returning the pd.DataFrame/MetaPanda
    param_name : str
        The keyword name of the parameter in question to iterate over
    param_values : list/tuple of something
        The values associated with the parameter to iterate over
    parallel : bool, default=True
        Determines whether to use `joblib` to compute independent chunks in parallel or not
    filename : str, optional
        The name of the file to cache to, or read from. This is fixed.
        Accepts {'json', 'csv'} formats.
    verbose : int, optional
        If > 0, prints out useful information
    *args : list, optional
        Arguments to pass to function(...)
    **kwargs : dict, optional
        Keyword arguments to pass to function(...)

    Warnings
    --------
    FutureWarning
        Returned object from cache isn't of type {pd.DataFrame, MetaPanda}

    Raises
    ------
    TypeError
        `filename` isn't of type `str`
    ValueError
        `filename` extension isn't found in {'json', 'csv'}

    Returns
    -------
    mp : MetaPanda
        The MetaPanda object

    See Also
    --------
    cached : Provides automatic {.json, .csv} caching for `turb.MetaPanda` or `pd.DataFrame`.
    """
    # check it is string
    instance_check(filename, str)
    instance_check(param_name, str)
    instance_check(param_values, (list, tuple, dict))
    if not callable(func):
        raise ValueError("function is not callable")
    # check that file ends with json or csv
    belongs(filename.rsplit(".", 1)[-1], ("json", "csv"))

    # if the final file exists, perform as normal.
    if os.path.isfile(filename):
        if verbose > 0:
            print("reading in cached file: {}".format(filename))
        # read it in
        mdf = read(filename)
        _set_index_def(mdf.df_)
        return mdf
    else:
        # create a bunch of chunks by repeatedly calling cache.
        if parallel:
            # note: `.df_` must be taken *after* Parallel runs; delayed(...)(...)
            # returns a (func, args, kwargs) tuple which has no `.df_` attribute
            _results = joblib.Parallel(joblib.cpu_count())(
                joblib.delayed(cached)(
                    func,
                    insert_suffix(filename, "_chunk%d" % i),
                    verbose=verbose,
                    *args,
                    **dictcopy(kwargs, {param_name: chunk})
                )
                for i, chunk in enumerate(param_values)
            )
            _mdf_chunks = [r.df_ for r in _results]
        else:
            _mdf_chunks = [
                cached(
                    func,
                    insert_suffix(filename, "_chunk%d" % i),
                    verbose=verbose,
                    *args,
                    **dictcopy(kwargs, {param_name: chunk})
                ).df_
                for i, chunk in enumerate(param_values)
            ]
        # join together the chunks
        mpf = _stack_rows(_mdf_chunks)
        # save file - return type must be a MetaPanda or error occurs!
        mpf.write(filename)
        # now delete the 'chunked' files.
        for i in range(len(param_values)):
            os.remove(insert_suffix(filename, "_chunk%d" % i))
        return mpf
def learning(
    df: "MetaPanda",
    y: str,
    x: Optional[SelectorType] = None,
    train_n: Optional[np.ndarray] = None,
    permute_n: int = 0,
    cv: Tuple[int, int] = (5, 15),
    model: str = "LinearRegression",
    cache: Optional[str] = None,
    plot: bool = False,
    verbose: int = 0,
    **model_kws
):
    """Fits a basic model to generate cross-validated training/test scores for
    different training set sizes.

    A cross-validation generator splits the whole dataset `k` times in training
    and test data. Subsets of the training set with varying sizes will be used to
    train the estimator, and a score for each training subset size and the test
    set will be computed. Afterwards, the scores will be averaged over all `k`
    runs, for each training subset size.

    Parameters
    ----------
    df : MetaPanda (n_samples, n_features)
        The main dataset.
    y : str
        A selected y column.
    x : list/tuple of str/selector, optional
        A list of selected column names for x or MetaPanda `selector`.
    train_n : int/array-like, with shape (n_ticks,), dtype float or int, optional
        Relative or absolute numbers of training examples that will be used to
        generate learning curve related data.
        If None: uses `linspace(.1, .9, 8)`
        If int: uses `linspace(.1, .9, n)`
    permute_n : int, default=0
        The number of times to permute y; if > 0, then does full permutation
        analysis (making 4th plot)
    cv : int/tuple, default=(5, 15)
        If int: just reflects number of cross-validations
        If tuple: (cross_validation, n_repeats) for `RepeatedKFold`
    model : str/estimator, sklearn model implementing `fit` and `predict` methods
        The name of a scikit-learn model, or the model object itself.
    cache : str, optional
        TODO: Not Implemented yet.
        If not None, stores the resulting model parts in JSON and reloads if present.
    plot : bool, optional
        If True, produces `.plot.learning_curve` inplace.
    verbose : int, optional
        If > 0, prints out statements depending on level.

    Other Parameters
    ----------------
    model_kws : dict, optional
        Keywords to pass to the sklearn model which are not parameterized.

    Returns
    -------
    results : MetaPanda (n_ticks, 8)
        The results matrix of mean and std scores
    permute_ : np.ndarray (permute_n,), optional
        The permutation scores associated with the permutation analysis

    Notes
    -----
    Shorthand names for the models, i.e. `lm` for LinearRegression or `gauss`
    for a GaussianProcessRegressor, are accepted.

    By default, `fit_learning` uses the root mean squared error (RMSE).
    There is currently no option to change this.

    By default, this model assumes you are working with a regression problem.
    Classification compatibility will arrive in a later version.

    `permute_n` is set to 0 by default; if you want a permutation histogram,
    this value must be > 0.

    See Also
    --------
    fit_basic : Performs a rudimentary fit model with no parameter searching.
    fit_grid : Performs exhaustive grid search analysis on the models selected.

    References
    ----------
    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al.,
       JMLR 12, pp. 2825-2830, 2011.
    """
    # perform checks
    instance_check(df, (pd.DataFrame, MetaPanda))
    instance_check(y, str)
    instance_check(train_n, (type(None), int, list, tuple, np.ndarray))
    instance_check(permute_n, int)
    instance_check(cv, (int, tuple))
    # instance_check(cache, (type(None), str))
    instance_check(plot, bool)
    bounds_check(verbose, 0, 4)

    # set dataset if a pandas object
    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    # retrieve x columns if none
    _xcols = select_xcols(_df, x, y)
    # note the parentheses: without them, the comma binds last and
    # `k` would receive the whole tuple while `repeats` is always 1
    k, repeats = cv if isinstance(cv, tuple) else (cv, 1)
    lm, pkg_name = find_sklearn_model(model, "regression")
    # assign keywords to lm
    lm.set_params(**model_kws)

    if train_n is None:
        train_n = np.linspace(0.1, 0.9, 8)
    elif isinstance(train_n, int):
        train_n = np.linspace(0.1, 0.9, train_n)

    # ml ready
    _x, _y = preprocess_continuous_X_y(_df, _xcols, y)

    rep = RepeatedKFold(n_splits=k, n_repeats=repeats)
    vars_ = learning_curve(
        lm,
        _x,
        _y,
        train_sizes=train_n,
        cv=rep,
        scoring="neg_root_mean_squared_error",
        n_jobs=-2,
        verbose=verbose,
        return_times=True,
    )

    # permutation analysis if permute_n > 0
    if permute_n > 0:
        perm_score_, perm_scorez_, pval = permutation_test_score(
            lm,
            _x,
            _y,
            cv=rep,
            n_permutations=permute_n,
            scoring="neg_root_mean_squared_error",
            n_jobs=-2,
            verbose=verbose,
        )

    # outputs
    output_labels_ = ["train_score", "test_score", "fit_time", "score_time"]
    # format as df
    results = pd.DataFrame(
        # stack them together
        np.hstack(
            (
                np.stack([np.mean(vars_[i], axis=1) for i in range(1, 5)], axis=1),
                np.stack([np.std(vars_[i], axis=1) for i in range(1, 5)], axis=1),
            )
        ),
        columns=list(
            it.chain(
                map(lambda s: "mean_" + s, output_labels_),
                map(lambda s: "std_" + s, output_labels_),
            )
        ),
    )
    # add N column
    results["N"] = vars_[0]

    R = MetaPanda(results)

    if plot and permute_n > 0:
        lcurve(R, perm_scorez_)
    elif plot:
        lcurve(R)
    # return as MetaPanda
    if permute_n > 0:
        return R, perm_score_, perm_scorez_, pval
    else:
        return R
def hist_grid(
    mdf: Union[DataFrame, "MetaPanda"],
    subset: SelectorType,
    arrange: str = "square",
    plot_size: int = 3,
    shared_dist: str = "auto",
    savepath: Optional[Union[str, bool]] = None,
    **hist_kws
):
    """Plots a grid of histograms comparing the distributions in a MetaPanda selector.

    Parameters
    ----------
    mdf : pd.DataFrame / turb.MetaPanda
        The dataset
    subset : str or list/tuple of str
        Contains either types, meta column names, column names or regex-compliant strings
    arrange : str
        Choose from {'square', 'row', 'column'}. 'square' arranges the plot as
        square-like as possible; 'row' prioritises plots row-like, and 'column'
        column-wise.
    plot_size : int, default=3
        The size of each axes
    shared_dist : str/tuple of str/dict, default="auto"
        Determines what KDE to fit to the data; set to None if you don't want one.
        If tuple/list: attempts using these specified distributions
        If dict: maps column name (k) to distribution choice (v)
    savepath : None, bool, str
        Saves the figure to file. If bool, uses the name in mdf, else uses the
        given string. If None, no figure is saved.

    Other Parameters
    ----------------
    hist_kws : dict
        Keywords to pass to `turb.plot.histogram`

    Returns
    -------
    None
    """
    # checks
    instance_check(shared_dist, (type(None), str, list, tuple, dict))
    instance_check(savepath, (type(None), str, bool))
    nonnegative(plot_size, int)
    belongs(arrange, ["square", "row", "column"])
    # make a metapanda if we have a dataframe.
    _mdf = MetaPanda(mdf) if isinstance(mdf, DataFrame) else mdf

    # get selector
    selection = _mdf.view(subset)
    # assuming we've selected something...
    if selection.size > 0:
        fig, axes = gridplot(len(selection), arrange, ax_size=plot_size)

        if not isinstance(shared_dist, dict):
            for i, x in enumerate(selection):
                _ = histogram(
                    _mdf[x].dropna(), ax=axes[i], title=x, kde=shared_dist, **hist_kws
                )
            fig.tight_layout()
        else:
            for i, (x, d) in enumerate(shared_dist.items()):
                _ = histogram(_mdf[x].dropna(), ax=axes[i], title=x, kde=d, **hist_kws)
            # iterate over any 'remaining' columns in selection and handle appropriately
            remaining = difference(selection, tuple(shared_dist.keys()))
            if remaining.shape[0] > 0:
                for i, x in enumerate(remaining):
                    _ = histogram(
                        _mdf[x].dropna(),
                        ax=axes[i + len(shared_dist)],
                        title=x,
                        kde="auto",
                        **hist_kws
                    )
            fig.tight_layout()

        if isinstance(savepath, bool):
            save(fig, "hist", _mdf.name_)
        elif isinstance(savepath, str):
            save(fig, "hist", _mdf.name_, fp=savepath)
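# Example (a hedged sketch): histogram grid over all float columns. `mdf` is a
# placeholder MetaPanda/DataFrame; "norm" assumes the underlying `histogram`
# accepts a scipy-style distribution name for its KDE fit.
#   >>> hist_grid(mdf, "float", arrange="row", shared_dist="norm")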
def overview_pca(
    model,
    distance_color: bool = True,
    labels: Optional[pd.Index] = None,
    cutoff_selection: float = 0.9,
    n_samples_annotate: int = 6,
    n_pcs: int = 5,
    ax_size: int = 4,
):
    """Provides an overview plot from a PCA result.

    Parameters
    ----------
    model : sklearn.decomposition.PCA
        A fitted PCA model.
    distance_color : bool, default=True
        If True, plots the magnitude of each PC as a color
    labels : np.ndarray (n,) of str / pd.Series / list / tuple, optional
        If not None, provides a label for every PC component (dimension),
        and annotates the most 'outlier'-like samples in plot 1
    cutoff_selection : float, default=0.9
        The cutoff for proportional variance to select for
    n_samples_annotate : int, default=6
        Defines the number of labels to show if `labels` is not None in plot 1
    n_pcs : int, default=5
        The number of principal components to consider in plot 3
    ax_size : int, default=4
        The default size for each axes.
    """
    instance_check(distance_color, bool)
    instance_check(labels, (type(None), np.ndarray, pd.Series, pd.Index, list, tuple))
    nonnegative((n_samples_annotate, n_pcs, ax_size), int)

    if labels is not None:
        fig, axes = gridplot(3, ax_size=ax_size)
    else:
        fig, axes = gridplot(2, ax_size=ax_size)
    # clip annotation/PC counts to the fitted number of components
    if n_samples_annotate > model.n_components_:
        n_samples_annotate = model.n_components_ - 1
    if n_pcs > model.n_components_:
        n_pcs = model.n_components_ - 1

    # 1 plot the scatter of PC
    _plot_pca_scatter(model, axes[0], distance_color)
    # 2 plot the line AUC for explained variance
    _explained_variance_plot(model, axes[1], cutoff=cutoff_selection)
    # if annotate, we annotate the scatter plot with samples.
    if labels is not None:
        # check to make sure labels is same length as components
        _annotate_on_magnitude(model, labels, n_samples_annotate, axes[0])
        # 3 plot the top N components by the 'most important eigenvector values'
        _x3, _y3, _sel_labels = _best_principle_eigenvectors(
            model, labels=labels, k=n_samples_annotate, p=n_pcs
        )
        _best_eigenvector_plot(
            _x3, _y3, _sel_labels, axes[-1], nk=(n_samples_annotate, n_pcs)
        )
        axes[-1].set_title("Top {} eigenvectors".format(n_samples_annotate))

    fig.tight_layout()
def bicorr(
    x: pd.Series,
    y: pd.Series,
    method: str = "spearman",
    tail: str = "two-sided",
    output: str = "score",
) -> Union[float, dict]:
    """(Robust) correlation between two variables.

    Adapted from the `pingouin` library, made by Raphael Vallat:
    https://github.com/raphaelvallat/pingouin/blob/master/pingouin/correlation.py

    Parameters
    ----------
    x, y : pd.Series
        First and second set of observations. x and y must be independent.
    method : str
        Specify which method to use for the computation of the correlation
        coefficient. Available methods are ::

        'pearson' : Pearson product-moment correlation
        'spearman' : Spearman rank-order correlation
        'kendall' : Kendall's tau (ordinal data)
        'biserial' : Biserial correlation (continuous and boolean data)
        'percbend' : percentage bend correlation (robust)
        'shepherd' : Shepherd's pi correlation (robust Spearman)
        'skipped' : skipped correlation (robust Spearman, requires sklearn)

    tail : str
        Specify whether to return 'one-sided' or 'two-sided' p-value.
    output : str, default='score'
        Determines whether to display the full output or just the correlation (r)
        score. Options are {'score', 'full'}.

    Returns
    -------
    stats : float/dict
        Test summary ::

        'n' : Sample size (after NaN removal)
        'outliers' : number of outliers (only for 'shepherd' or 'skipped')
        'r' : Correlation coefficient
        'CI95' : 95% parametric confidence intervals
        'r2' : R-squared
        'adj_r2' : Adjusted R-squared
        'method' : pearson/spearman/biserial... etc
        'p-val' : one or two tailed p-value
        'power' : achieved power of the test (= 1 - type II error).

    Notes
    -----
    The Pearson correlation coefficient measures the linear relationship between
    two datasets. Strictly speaking, Pearson's correlation requires that each
    dataset be normally distributed. Correlations of -1 or +1 imply an exact
    linear relationship.

    The Spearman correlation is a nonparametric measure of the monotonicity of
    the relationship between two datasets. Unlike the Pearson correlation, the
    Spearman correlation does not assume that both datasets are normally
    distributed. Correlations of -1 or +1 imply an exact monotonic relationship.

    Kendall's tau is a measure of the correspondence between two rankings.
    Values close to 1 indicate strong agreement, values close to -1 indicate
    strong disagreement.

    The percentage bend correlation [1]_ is a robust method that protects
    against univariate outliers.

    The Shepherd's pi [2]_ and skipped [3]_, [4]_ correlations are both robust
    methods that return the Spearman's rho after bivariate outliers removal.
    Note that the skipped correlation requires that the scikit-learn package is
    installed (for computing the minimum covariance determinant).

    Please note that rows with NaN are automatically removed.

    References
    ----------
    .. [1] Wilcox, R.R., 1994. The percentage bend correlation coefficient.
       Psychometrika 59, 601-616. https://doi.org/10.1007/BF02294395

    .. [2] Schwarzkopf, D.S., De Haas, B., Rees, G., 2012. Better ways to improve
       standards in brain-behavior correlation analysis. Front. Hum. Neurosci.
       6, 200. https://doi.org/10.3389/fnhum.2012.00200

    .. [3] Rousselet, G.A., Pernet, C.R., 2012. Improving standards in
       brain-behavior correlation analyses. Front. Hum. Neurosci. 6, 119.
       https://doi.org/10.3389/fnhum.2012.00119

    .. [4] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust correlation
       analyses: false positive and power validation using a new open source
       matlab toolbox. Front. Psychol. 3, 606.
       https://doi.org/10.3389/fpsyg.2012.00606
    """
    # perform all checks in the public method,
    # rather than repeating them internally.
    # check type
    instance_check((x, y), pd.Series)
    belongs(tail, ("one-sided", "two-sided"))
    belongs(
        method,
        (
            "pearson",
            "spearman",
            "kendall",
            "biserial",
            "percbend",
            "shepherd",
            "skipped",
        ),
    )
    belongs(output, ("score", "full"))
    # check size
    if x.shape[0] != y.shape[0]:
        raise ValueError("x and y must have the same length.")

    if output == "score":
        return _bicorr_inner_score(x, y, method)
    else:
        return _bicorr_inner_full(x, y, method, tail=tail)
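# Example (a hedged sketch): robust correlation between two Series.
#   >>> a = pd.Series(np.random.normal(size=100))
#   >>> b = a * 0.5 + np.random.normal(size=100)
#   >>> bicorr(a, b, method="percbend")                  # float r score
#   >>> bicorr(a, b, method="shepherd", output="full")   # dict with n, r, CI95, ...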
def grid(
    df: Union[pd.DataFrame, "MetaPanda"],
    y: str,
    x: Optional[SelectorType] = None,
    models=("Ridge", "Lasso"),
    cv: Union[int, Tuple[int, int]] = 5,
    cache: Optional[str] = None,
    plot: bool = False,
    chunks: bool = False,
    verbose: int = 0,
    **grid_kws
) -> "MetaPanda":
    """Performs exhaustive grid search analysis on the models selected.

    This function aims to encapsulate much of the functionality associated around
    the `GridSearchCV` class within scikit-learn, with in-built caching options
    and flexible selection of inputs and outputs with the MetaPanda class.

    Parameters
    ----------
    df : pd.DataFrame/MetaPanda
        The main dataset.
    y : str
        A selected y column.
    x : list/tuple of str, optional
        A list of selected column names for x or MetaPanda `selector`.
    models : list/dict, default=("Ridge", "Lasso")
        tuple: list of model names, uses default parameters
        dict: key (model name), value tuple (parameter names) / dict:
        key (parameter name), value (list of values)
    cv : int/tuple, default=5
        If int: just reflects number of cross-validations
        If tuple: (cross_validation, n_repeats) for `RepeatedKFold`
    cache : str, optional
        If not None, cache is a filename handle for caching the `cv_results`
        as a JSON/csv file.
    plot : bool, optional
        If True, produces an appropriate plot for each parameter.
    chunks : bool, optional
        If True, and if cache is not None: caches the ML gridsearch into
        equal-sized chunks. This saves chunk files which means that if part of
        the pipeline breaks, you can start from the previous chunk.
    verbose : int, optional
        If > 0, prints out statements depending on level.

    Other Parameters
    ----------------
    grid_kws : dict, optional
        Additional keywords to assign to GridSearchCV.

    Raises
    ------
    TypeError
        If one of the parameters has the wrong input type

    Returns
    -------
    cv_results : MetaPanda
        A dataframe result from GridSearchCV detailing iterations and all scores.

    Notes
    -----
    From version 0.2.3 the `chunks` argument allows for fitting by parts. This
    means that breaks throughout a large pipeline will result only in losses up
    to the previous chunk. Chunk files are saved as '%filename_chunk%i.csv' so
    beware of clashes. Make sure to set `chunks=True` and `cache=str` where the
    `models` parameter is time-expensive.

    By default, `grid` tunes using the root mean squared error (RMSE).
    There is currently no option to change this.

    By default, this model assumes you are working with a regression problem.
    Classification compatibility will arrive in a later version.

    See Also
    --------
    basic : Performs a rudimentary fit model with no parameter searching.
    sklearn.model_selection.GridSearchCV : Exhaustive search over specified
        parameter values for an estimator

    References
    ----------
    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al.,
       JMLR 12, pp. 2825-2830, 2011.
    """
    # checks
    instance_check(df, (pd.DataFrame, MetaPanda))
    instance_check(x, (type(None), str, list, tuple, pd.Index))
    instance_check(y, str)
    instance_check(cv, (int, tuple))
    instance_check(cache, (type(None), str))
    instance_check((plot, chunks), bool)
    bounds_check(verbose, 0, 4)

    if is_sklearn_model(models):
        models = [models]
    else:
        if isinstance(models, tuple):
            models = list(models)
        instance_check(models, (list, dict))

    # set dataset if a pandas object
    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    # set up cv, repeats; note the parentheses, otherwise `repeats` is always 1
    k, repeats = cv if isinstance(cv, tuple) else (cv, 1)

    # do caching
    def _perform_fit(_df: MetaPanda, _x, _y, _k: int, _repeats: int, _models):
        rep = RepeatedKFold(n_splits=_k, n_repeats=_repeats)
        # the header is 'model_est'
        header = "model"
        # any basic regression model
        pipe = Pipeline([(header, LinearRegression())])
        # get paramgrid - the magic happens here!
        pgrid = make_parameter_grid(_models, header=header)
        # join default grid parameters to given grid_kws
        def_grid_params = {
            "scoring": "neg_root_mean_squared_error",
            "n_jobs": -2,
            "verbose": verbose,
            "return_train_score": True,
        }
        def_grid_params.update(grid_kws)
        # create gridsearch
        gs = GridSearchCV(pipe, param_grid=pgrid, cv=rep, **def_grid_params)
        # make ml ready
        __xnp, __y = preprocess_continuous_X_y(_df, _x, _y)
        # fit the grid - expensive.
        gs.fit(__xnp, __y)
        # generate result
        _result = pd.DataFrame(gs.cv_results_)
        # associate model column to respective results
        _result["model"] = _result["param_model"].apply(lambda f: str(f).split("(")[0])
        # set as MetaPanda
        _met_result = MetaPanda(_result)
        # cast down parameter columns to appropriate type
        _met_result.transform(pd.to_numeric, object, errors="ignore")
        return _met_result

    if cache is not None:
        if chunks:
            # if dictionary, we need to split this into 1-sized list/dict blocks.
            values = dictchunk(models, 1) if isinstance(models, dict) else models
            _cv_results = cached_chunk(
                _perform_fit,
                "_models",
                values,
                False,
                cache,
                verbose,
                _df=_df,
                _x=x,
                _y=y,
                _k=k,
                _repeats=repeats,
                _models=models,
            )
        else:
            _cv_results = cache_f(
                cache,
                _perform_fit,
                _df=_df,
                _x=x,
                _y=y,
                _k=k,
                _repeats=repeats,
                _models=models,
            )
    else:
        _cv_results = _perform_fit(
            _df=_df, _x=x, _y=y, _k=k, _repeats=repeats, _models=models
        )

    if plot:
        parameter_tune(_cv_results)

    return _cv_results
def optimize(
    df: "MetaPanda", x: SelectorType, y: str, models, cv: int = 5, verbose: int = 0
):
    """Performs optimization grid analysis on the models selected.

    This uses the `scipy.optimize` function to minimize continuous parameters,
    for example `alpha` in a Lasso model.

    .. note:: optimization only works on *continuous* parameters with each model.

    TODO: complete `.ml.fit.optimize` function

    Parameters
    ----------
    df : MetaPanda
        The main dataset.
    x : list/tuple of str
        A list of selected column names for x or MetaPanda `selector`.
    y : str
        A selected y column.
    models : tuple/dict
        tuple: list of model names, uses default parameters
        dict: key (model name), value tuple (parameter names) / dict:
        key (parameter name), value (list of values)
    cv : int, default=5
        The number of cross-validations
    verbose : int, optional
        If > 0, prints out statements depending on level.

    Returns
    -------
    cv_results : MetaPanda
        A dataframe result detailing iterations and all scores.

    Notes
    -----
    By default, `optimize` tunes using the root mean squared error (RMSE).
    There is currently no option to change this.

    By default, this model assumes you are working with a regression problem.
    Classification compatibility will arrive in a later version.

    See Also
    --------
    grid : Performs exhaustive grid search analysis on the models selected.
    sklearn.model_selection.GridSearchCV : Exhaustive search over specified
        parameter values for an estimator

    References
    ----------
    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al.,
       JMLR 12, pp. 2825-2830, 2011.
    """
    # checks
    instance_check(df, MetaPanda)
    instance_check(x, (str, list, tuple, pd.Index))
    instance_check(y, str)
    nonnegative((cv, verbose), int)
    instance_check(models, (tuple, list, dict))
    bounds_check(verbose, 0, 4)

    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    _xcols = select_xcols(_df, x, y)
    _xnp, _y = preprocess_continuous_X_y(_df, _xcols, y)
    # define the parameter sets
    param_sets = make_optimize_grid(models)

    for m, params in zip(models, param_sets):
        model = find_sklearn_model(m)[0]
        inits, bounds = optimize_grid_for_model(params)
        # minimize for every i element
        mins = [
            so.minimize(
                _min_cross_val_scores,
                x0=i,
                args=(_xnp, _y, model, params, cv),
                bounds=bounds,
            )
            for i in inits
        ]
    # TODO: aggregate `mins` into a results object and return it
    pass
def bibox1d(
    X: _ArrayLike,
    Y: _ArrayLike,
    colors: Optional[_ListLike] = None,
    labels: Optional[_ListLike] = None,
    measured: Optional[str] = None,
    ax: Optional[mpl.axes.Axes] = None,
    mannwhitney: bool = True,
    with_strip: bool = False,
    vertical: bool = True,
    notch: bool = False,
    capsize: float = 1.0,
    outliers: bool = True,
    grid: bool = True,
    width: Union[float, List[float]] = 0.7,
    label_rotation: float = 0.0,
    label_max_length: int = 25,
    spines: Optional[_ListLike] = None,
    strip_jitter: float = 0.15,
    theme: str = "white_circle",
    **plot_kwargs
):
    """Plots two 1-dimensional boxplots using vectors `X`, `Y`.

    Parameters
    ----------
    X : list/tuple/np.ndarray/pd.Series (1d)
        The first data column to draw. Must be numeric.
    Y : list/tuple/np.ndarray/pd.Series (1d)
        The second data column to draw. Must be numeric.
    colors : str/list of str, optional
        If None, uses a default color
    labels : str/list of str, optional
        If set, draws this on the appropriate axis; if None, does nothing.
        If X/Y is of type pandas.Series, uses its name instead.
    measured : str, optional
        A label to define what the measurement is
    ax : matplotlib.ax object, optional, default=None
        If None, creates a plot.
    mannwhitney : bool, default=True
        If True, performs a Mann-Whitney U test between the values
    with_strip : bool, default=False
        If True, draws a stripplot over the top of the boxplot, in a similar colour.
        `outliers` is set to False in this case.
    vertical : bool, default=True
        Determines whether to draw the plot vertically or horizontally
    notch : bool, default=False
        Determines whether to draw a notched plot
    capsize : float, default=1.0
        Defines the length of the caps
    outliers : bool, default=True
        If True, displays fliers as outliers
    grid : bool, default=True
        If True: draws gridlines for the numeric axis
    width : float, default=0.7
        Determines the width/height of the box
    label_rotation : float, default=0
        The degrees of rotation to the ticklabels
    label_max_length : int, default=25
        If any label exceeds this length, it truncates it
    spines : tuple, default=('top', 'left', 'bottom', 'right')
        Defines which spines are to be visible
    strip_jitter : float, default=0.15
        With stripplot, defines the amount of jitter in the variables
    theme : str, default="white_circle"
        Choose a 'theme' for the outliers, from {'red_square', 'green_diamond'}

    Other Parameters
    ----------------
    plot_kwargs : dict
        keyword arguments to pass to `ax.boxplot`

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-boxplot

    See Also
    --------
    matplotlib.pyplot.boxplot

    References
    ----------
    Inspiration from https://github.com/jbmouret/matplotlib_for_papers#colored-boxes
    """
    instance_check((X, Y), (list, tuple, np.ndarray, pd.Series))
    instance_check((colors, labels, spines), (type(None), list, pd.Index))
    instance_check(ax, (type(None), mpl.axes.Axes))
    instance_check((mannwhitney, vertical, notch, outliers, grid, with_strip), bool)
    instance_check((capsize, width, strip_jitter, label_rotation), (float, int))
    instance_check(theme, str)
    instance_check(label_max_length, int)
    bounds_check(strip_jitter, 0.0, 1.0)

    _X = as_flattened_numpy(X)
    _Y = as_flattened_numpy(Y)
    _style = _get_flier_style(theme)

    if ax is None and vertical:
        fig, ax = plt.subplots(figsize=(3.5, 7))
    elif ax is None and not vertical:
        fig, ax = plt.subplots(figsize=(7, 3.5))
    if with_strip:
        outliers = False

    if spines is None:
        if vertical and mannwhitney:
            spines = ("bottom", "left", "right")
        elif not vertical and mannwhitney:
            spines = ("bottom", "left", "top")
        else:
            spines = ("bottom", "left", "top", "right")

    # sort out labels
    if labels is None:
        labels = [
            X.name if isinstance(X, pd.Series) else "",
            Y.name if isinstance(Y, pd.Series) else "",
        ]

    box_alpha = 1.0 if not with_strip else 0.5

    patch_obj = ax.boxplot(
        [_X, _Y],
        vert=vertical,
        patch_artist=True,
        showfliers=outliers,
        notch=notch,
        widths=width,
        flierprops=_style,
        boxprops=dict(alpha=box_alpha),
        **plot_kwargs
    )
    # define boxplot extras
    _define_boxplot_arguments(
        ax, patch_obj, vertical, measured, grid, spines, capsize, None
    )
    # define basic colours - overrides if needs be
    colors = _kcolor_arrangement(patch_obj, colors)
    # label axes
    _label_axes(ax, labels, vertical, label_rotation, label_max_length)
    # if we have stripplot, draw this
    if with_strip:
        # plot x strips
        _overlay_stripplot(_X, ax, 1, width, colors[0], vertical, outliers, strip_jitter)
        _overlay_stripplot(_Y, ax, 2, width, colors[1], vertical, outliers, strip_jitter)
    # if we have mann-whitney, append this info
    if mannwhitney:
        # determine mann-whitney U test
        z, p = mannwhitneyu(_X, _Y)
        # double the one-sided p-value to make it two-sided
        p *= 2
        star = _get_stars(p)
        # get dimensions to annotate
        joined = np.concatenate((_X, _Y))
        _max, _min = np.max(joined), np.min(joined)
        # annotate on mann-whitney test
        if vertical:
            ax.annotate(
                "",
                xy=(1, _max),
                xycoords="data",
                xytext=(2, _max),
                textcoords="data",
                arrowprops=dict(
                    arrowstyle="-", ec="#666666", connectionstyle="bar,fraction=0.2"
                ),
            )
            # add mw text
            ax.text(
                1.5,
                _max + np.abs(_max - _min) * 0.1,
                star,
                horizontalalignment="center",
                verticalalignment="center",
            )
        else:
            ax.annotate(
                "",
                xy=(_max, 2),
                xycoords="data",
                xytext=(_max, 1),
                textcoords="data",
                arrowprops=dict(
                    arrowstyle="-", ec="#666666", connectionstyle="bar,fraction=0.2"
                ),
            )
            # add mw text
            ax.text(
                _max + np.abs(_max - _min) * 0.1,
                1.5,
                star,
                horizontalalignment="center",
                verticalalignment="center",
            )

    return ax
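# Example (a hedged sketch): compare two samples with a Mann-Whitney annotation.
#   >>> a = np.random.normal(0.0, 1.0, size=80)
#   >>> b = np.random.normal(0.6, 1.0, size=80)
#   >>> bibox1d(a, b, labels=["control", "treated"], with_strip=True)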
def widebox(
    data: Union[List, np.ndarray, pd.DataFrame],
    colors: Optional[_ListLike] = None,
    measured: Optional[str] = None,
    ax: Optional[mpl.axes.Axes] = None,
    vert: bool = True,
    sort: bool = True,
    outliers: bool = True,
    notch: bool = False,
    with_strip: bool = False,
    capsize: float = 1.0,
    width: float = 0.7,
    grid: bool = True,
    title: Optional[str] = None,
    label_rotation: float = 0.0,
    label_max_length: int = 25,
    spines: Optional[_ListLike] = None,
    strip_jitter: float = 0.15,
    theme="white_circle",
    **plot_kwargs
):
    """Plots a 2D boxplot with data oriented in wide-form.

    Parameters
    ----------
    data : list, np.ndarray or pd.DataFrame (2d)
        The raw data to plot as a box.
        If data is of type pd.DataFrame: columns represent the X-axis
    colors : list, tuple, optional
        Represents colors for each x-variable
    measured : str, optional
        A name for the measured variable
    ax : matplotlib.ax object, optional, default=None
        If None, creates a plot.
    vert : bool, default=True
        Determines whether to draw the plot vertically or horizontally
    sort : bool, default=True
        Determines whether to sort the data by numerical value
    outliers : bool, default=True
        If True, displays fliers as outliers
    notch : bool, default=False
        Determines whether to draw a notched plot
    with_strip : bool, default=False
        If True, draws a stripplot over the top of the boxplot, in a similar colour.
        `outliers` is set to False in this case.
    capsize : float, default=1.0
        Defines the length of the caps
    width : float, default=0.7
        Determines the width/height of the box
    grid : bool, default=True
        If True: draws gridlines for the numeric axis
    title : str, optional
        Sets the title of the axes if a string is passed
    label_rotation : float, default=0
        The degrees of rotation to the ticklabels
    label_max_length : int, default=25
        If any label exceeds this length, it truncates it
    spines : tuple, default=('top', 'left', 'bottom', 'right')
        Defines which spines are to be visible
    strip_jitter : float, default=0.15
        With stripplot, defines the amount of jitter in the variables
    theme : str, default="white_circle"
        Choose a 'theme' for the outliers, from {'red_square', 'green_diamond'}

    Other Parameters
    ----------------
    plot_kwargs : dict
        keyword arguments to pass to `ax.boxplot`

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-boxplot

    See Also
    --------
    matplotlib.pyplot.boxplot
    seaborn.boxplot
    seaborn.boxenplot

    References
    ----------
    Inspiration from https://github.com/jbmouret/matplotlib_for_papers#colored-boxes
    """
    instance_check(data, (list, np.ndarray, pd.DataFrame))
    instance_check((colors, spines), (type(None), list, pd.Index))
    instance_check(ax, (type(None), mpl.axes.Axes))
    instance_check((vert, sort, notch, outliers, grid, with_strip), bool)
    instance_check((capsize, width, strip_jitter, label_rotation), (float, int))
    instance_check(theme, str)
    instance_check(label_max_length, int)
    bounds_check(width, 0.0, 1.0)
    bounds_check(strip_jitter, 0.0, 1.0)
    bounds_check(label_rotation, 0.0, 360.0)

    if isinstance(data, pd.DataFrame):
        # select float, int subset
        ss = data.select_dtypes(include=[float, int])
        _data = np.asarray(ss)
        _labels = ss.columns
    elif isinstance(data, (list, np.ndarray)):
        _data = np.asarray(data)
        _labels = None
    else:
        raise TypeError("data matrix is not of type np.ndarray")

    _style = _get_flier_style(theme)

    # negative-exponential increase in figure size with more features
    def _figure_spacing(x):
        return np.exp(-0.35 * x) * x

    if with_strip:
        outliers = False
    if ax is None and vert:
        fig, ax = plt.subplots(figsize=(2.5 + _figure_spacing(_data.shape[1]), 7))
    elif ax is None and not vert:
        fig, ax = plt.subplots(figsize=(7, 2.5 + _figure_spacing(_data.shape[1])))
    if spines is None:
        spines = ("left", "top", "right", "bottom")

    # sort the data by the mean if selected
    if sort:
        _order = np.argsort(np.mean(_data, axis=0))
        _data = _data[:, _order]
        # guard: list/ndarray input carries no column labels to reorder
        if _labels is not None:
            _labels = _labels[_order]

    box_alpha = 1.0 if not with_strip else 0.5

    patch_obj = ax.boxplot(
        _data,
        vert=vert,
        patch_artist=True,
        widths=width,
        showfliers=outliers,
        notch=notch,
        flierprops=_style,
        boxprops=dict(alpha=box_alpha),
        **plot_kwargs
    )
    # define boxplot extras
    _define_boxplot_arguments(
        ax, patch_obj, vert, measured, grid, spines, capsize, None
    )
    # define basic colours - overrides if needs be
    colors = _kcolor_arrangement(patch_obj, colors, k=_data.shape[1])
    # label axes
    _label_axes(ax, _labels, vert, label_rotation, label_max_length)
    if title is not None:
        ax.set_title(title)
    # perform stripplots
    if with_strip:
        for n in range(_data.shape[1]):
            # plot x strips
            _overlay_stripplot(
                _data[:, n], ax, n + 1, width, colors[n], vert, outliers, strip_jitter
            )
    return ax
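# Example (a hedged sketch): wide-form boxplot straight from a DataFrame.
#   >>> df = pd.DataFrame(np.random.normal(size=(100, 4)),
#   ...                   columns=["a", "b", "c", "d"])
#   >>> widebox(df, title="four gaussian columns", with_strip=True)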
def select(self, sc: str) -> pd.Index:
    """View a subset of columns using a flexible `eval`-like string.

    Select merely returns the columns of interest selected using this selector.

    Selections of columns can be done by:
        type [object, int, float, numpy.dtype*, pandas.CategoricalDtype]
        callable (function) that returns [bool list] of length p
        pd.Index
        str [regex, df.column name, cached name, meta.column name
            (that references a boolean column)]
        list/tuple of the above

    .. note:: We do not currently incorporate the use of brackets.

    Parameters
    ----------
    sc : str-like
        The selection string to find an optimal subset of columns.

    Warnings
    --------
    UserWarning
        If the selection returned is empty.

    Returns
    -------
    sel : pd.Index
        The list of column names selected, or empty

    See Also
    --------
    view : View a selection of columns in `df_`.
    search : View the intersection of search terms, for columns in `df_`.

    Examples
    --------
    You can use string names of types to select columns of a certain type:

    >>> import turbopanda as turb
    >>> import pandas as pd
    >>> mdf = turb.MetaPanda(pd.DataFrame({'a': [1., 2.], 'b': [3, 4]}))
    >>> mdf.select("float")
    Index(['a'], dtype='object', name='colnames')

    Inverses can also be selected using the tilde `~`:

    >>> mdf.select("~float")
    Index(['b'], dtype='object', name='colnames')

    Multiple terms can be joined together, including regex expressions NOT
    containing `&` or `|`; for instance, to select all float columns with
    names x1, x2 or x3:

    >>> mdf.select("float & x[1-3]")
    """
    instance_check(sc, str)

    terms = [c.strip() for c in re.split("[&|]", sc)]
    operator = re.findall("[&|]", sc)
    if len(terms) < 1:
        return pd.Index([])
    else:
        grp = [
            self.view_not(t[1:]) if t.startswith("~") else self.view(t)
            for t in terms
        ]
        full = grp[0]
        for mg, op in zip(grp[1:], operator):
            if op == "&":
                full = intersect(full, mg)
            elif op == "|":
                full = union(full, mg)
        return full
def partial_bicorr(
    data: pd.DataFrame,
    x: str,
    y: str,
    covar: Union[str, List[str], Tuple[str, ...], pd.Index],
    method: str = "spearman",
    tail: str = "two-sided",
    output: str = "score",
) -> Union[float, dict]:
    """Partial and semi-partial correlation.

    Adapted from the `pingouin` library, made by Raphael Vallat:
    https://github.com/raphaelvallat/pingouin/blob/master/pingouin/correlation.py

    Parameters
    ----------
    data : pd.DataFrame
        The full dataset including covariates.
    x, y : str
        x and y. Must be names of columns in ``data``.
    covar : str/list of str
        Covariate(s). Column names of the covariates. `covar` must be made of
        continuous columns. If x, y are not continuous, will perform logistic
        regression to generate residuals.
    method : str
        Specify which method to use for the computation of the correlation
        coefficient. Available methods are ::

        'pearson' : Pearson product-moment correlation
        'spearman' : Spearman rank-order correlation
        'biserial' : Biserial correlation (continuous and boolean data)
        'kendall' : Kendall's tau (ordinal data)
        'percbend' : percentage bend correlation (robust)
        'shepherd' : Shepherd's pi correlation (robust Spearman)
        'skipped' : skipped correlation (robust Spearman, requires sklearn)

    tail : str
        Specify whether to return the 'one-sided' or 'two-sided' p-value.
    output : str, default='score'
        Determines whether to display the full output or just the correlation (r)
        score. Options are {'score', 'full'}.

    Returns
    -------
    stats : float/dict
        Test summary ::

        'n' : Sample size (after NaN removal)
        'outliers' : number of outliers (only for 'shepherd' or 'skipped')
        'r' : Correlation coefficient
        'CI95' : 95% parametric confidence intervals
        'r2' : R-squared
        'adj_r2' : Adjusted R-squared
        'method' : pearson/spearman/biserial... etc
        'p-val' : one or two tailed p-value
        'BF10' : Bayes Factor of the alternative hypothesis (Pearson only)
        'power' : achieved power of the test (= 1 - type II error).

    Notes
    -----
    From [4]_:

        "With *partial correlation*, we find the correlation between :math:`x`
        and :math:`y` holding :math:`C` constant for both :math:`x` and
        :math:`y`. Sometimes, however, we want to hold :math:`C` constant for
        just :math:`x` or just :math:`y`. In that case, we compute a
        *semi-partial correlation*. A partial correlation is computed between
        two residuals. A semi-partial correlation is computed between one
        residual and another raw (or unresidualized) variable."

    Note that if you are not interested in calculating the statistics and
    p-values but only the partial correlation matrix, a (faster) alternative is
    to use the :py:func:`pingouin.pcorr` method (see example 4).

    Rows with missing values are automatically removed from data. Results have
    been tested against the `ppcor` R package.

    References
    ----------
    .. [2] https://en.wikipedia.org/wiki/Partial_correlation
    .. [3] https://cran.r-project.org/web/packages/ppcor/index.html
    .. [4] https://gist.github.com/fabianp/9396204419c7b638d38f
    .. [5] http://faculty.cas.usf.edu/mbrannick/regression/Partial.html
    """
    # perform all checks in the public method..
    instance_check(data, pd.DataFrame)
    instance_check((x, y), str)
    instance_check(covar, (str, list, tuple, pd.Index))
    belongs(tail, ("one-sided", "two-sided"))
    belongs(
        method,
        (
            "pearson",
            "spearman",
            "kendall",
            "biserial",
            "percbend",
            "shepherd",
            "skipped",
        ),
    )
    belongs(output, ("score", "full"))
    # perform a check to make sure every column in `covar` is continuous.
    if not is_dataframe_float(data[covar]):
        raise TypeError(
            "`covar` variables in `partial_bicorr` "
            "all must be of type `float`/continuous."
        )

    return _partial_bicorr_inner(
        data, x, y, covar, tail=tail, method=method, output=output
    )
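# Example (a hedged sketch): correlation of x and y after controlling for z.
#   >>> rng = np.random.default_rng(1)
#   >>> z = rng.normal(size=300)
#   >>> frame = pd.DataFrame({
#   ...     "x": z + rng.normal(size=300),
#   ...     "y": z + rng.normal(size=300),
#   ...     "z": z,
#   ... })
#   >>> partial_bicorr(frame, "x", "y", covar=["z"])  # ~0 once z is held constant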
def cached(
    func: Callable, filename: str = "example1.json", verbose: int = 0, *args, **kwargs
) -> "MetaPanda":
    """Provides automatic {.json, .csv} caching for `turb.MetaPanda` or `pd.DataFrame`.

    .. note:: this is a direct-call cache function. Not cached.

    For example, we call `cached` as a wrapper to our custom function:

    >>> import turbopanda as turb
    >>> def f(x):
    ...     return turb.MetaPanda(x)
    >>> data = cached(f, 'meta_file.json')

    .. note:: custom function must return a `pd.DataFrame` or `turb.MetaPanda` object.

    Parameters
    ----------
    func : function
        A custom function returning the pd.DataFrame/MetaPanda
    filename : str, optional
        The name of the file to cache to, or read from. This is fixed.
        Accepts {'json', 'csv'} formats.
    verbose : int, optional
        If > 0, prints out useful information
    *args : list, optional
        Arguments to pass to function(...)
    **kwargs : dict, optional
        Keyword arguments to pass to function(...)

    Warnings
    --------
    FutureWarning
        Returned object from cache isn't of type {pd.DataFrame, MetaPanda}

    Raises
    ------
    TypeError
        `filename` isn't of type `str`
    ValueError
        `filename` extension isn't found in {'json', 'csv'}

    Returns
    -------
    mp : MetaPanda
        The MetaPanda object

    See Also
    --------
    cachedec : Provides automatic {.json, .csv} decorator caching for
        `turb.MetaPanda` or `pd.DataFrame`.
    """
    # check it is string
    instance_check(filename, str)
    instance_check(verbose, int)
    instance_check(func, "__call__")
    # check that file ends with json or csv
    belongs(filename.rsplit(".", 1)[-1], ("json", "csv"))

    if os.path.isfile(filename):
        if verbose > 0:
            print("reading in cached file: {}".format(filename))
        # read it in
        mdf = read(filename)
        _set_index_def(mdf.df_)
        return mdf
    else:
        if verbose > 0:
            print("running function '{}' for cache".format(func.__name__))
        # returns MetaPanda or pandas.DataFrame
        mpf = func(*args, **kwargs)
        if isinstance(mpf, MetaPanda):
            # save file
            mpf.write(filename)
            return mpf
        elif isinstance(mpf, DataFrame):
            # save - bumping index into the file.
            mpf.reset_index().to_csv(filename, index=None)
            return MetaPanda(mpf)
        else:
            if verbose > 0:
                print(
                    "returned object from cache not of type "
                    "[DataFrame, MetaPanda], not cached"
                )
            return mpf
def color_qualitative(n: Union[int, List, Tuple], sharp: bool = True) -> List[str]:
    """Generates a qualitative palette as a list of hex colors.

    Parameters
    ----------
    n : int, list or tuple
        The number of hex colors to return, or the list/tuple of elements.
    sharp : bool
        If True, only uses strong/sharp colors, else uses pastel colors.

    Returns
    -------
    L : list
        list of hex colors of length (n,).
    """
    instance_check(n, (int, list, tuple))
    instance_check(sharp, bool)

    if isinstance(n, (list, tuple)):
        n = len(n)

    lt8_sharp = ("Accent", "Dark2")
    lt8_pastel = ("Pastel2", "Set2")
    # lt9 = ('Set1', 'Pastel1')
    # lt10 = ['tab10']
    # lt12 = ['Set3']
    lt20 = ("tab20", "tab20b", "tab20c")

    # choose a random cmap within the appropriate size bucket for n
    if n <= 8 and sharp:
        return _colormap_to_hex(
            getattr(cm, np.random.choice(lt8_sharp))(np.linspace(0, 1, n))
        )
    elif n <= 8 and not sharp:
        return _colormap_to_hex(
            getattr(cm, np.random.choice(lt8_pastel))(np.linspace(0, 1, n))
        )
    elif n <= 9 and sharp:
        return palette_cmap(n, "Set1")
    elif n <= 9 and not sharp:
        return palette_cmap(n, "Pastel1")
    elif n <= 10:
        return palette_cmap(n, "tab10")
    elif n <= 12:
        return palette_cmap(n, "Set3")
    elif n <= 20:
        return _colormap_to_hex(
            getattr(cm, np.random.choice(lt20))(np.linspace(0, 1, n))
        )
    else:
        # for n > 20, cycle one of the lt20 colormaps until n is reached
        return list(
            it.islice(
                it.cycle(
                    _colormap_to_hex(
                        getattr(cm, np.random.choice(lt20))(np.linspace(0, 1, 20))
                    )
                ),
                0,
                n,
            )
        )
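# --- illustrative sketch (not library code) --------------------------------
# `color_qualitative` picks a matplotlib colormap at random within each size
# bucket, so seed numpy first if you need reproducible palettes:
#
# >>> np.random.seed(0)
# >>> color_qualitative(5)                              # 5 sharp hex colors
# >>> color_qualitative(['a', 'b', 'c'], sharp=False)   # 3 pastel hex colors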
def box1d(X: _ArrayLike,
          color: Optional[str] = None,
          label: Optional[str] = None,
          ax: Optional[mpl.axes.Axes] = None,
          with_strip: bool = False,
          vertical: bool = True,
          notch: bool = False,
          capsize: float = 1.0,
          outliers: bool = True,
          axis_scale: Optional[Union[str, Callable]] = None,
          grid: bool = True,
          width: float = 0.7,
          label_rotation: float = 0.0,
          label_max_length: int = 25,
          spines: Optional[_ListLike] = None,
          theme: str = "white_circle",
          **plot_kwargs):
    """Plots a 1-dimensional boxplot using a vector.

    Parameters
    ----------
    X : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw. Must be numeric.
    color : str, optional
        If None, uses a default color
    label : str, optional
        If set, draws this on the appropriate axis; if None, does nothing.
        If X is of type pandas.Series, its name is used instead.
    ax : matplotlib.ax object, optional, default=None
        If None, creates a plot.
    with_strip : bool, default=False
        If True, draws a stripplot over the top of the boxplot in a similar
        colour; `outliers` is set to False in this case
    vertical : bool, default=True
        Determines whether to draw the plot vertically or horizontally
    notch : bool, default=False
        Determines whether to draw a notched plot
    capsize : float, default=1.0
        Defines the length of the caps
    outliers : bool, default=True
        If True, displays the outliers as fliers
    axis_scale : str/callable, optional
        Scales the data along the axis.
        If str, use {'log', 'sqrt', 'log2'}
        If callable, must reference a `np.*` function which takes array X
        and returns X'
    grid : bool, default=True
        If True: draws gridlines for the numeric axis
    width : float, default=0.7
        Determines the width/height of the box
    label_rotation : float, default=0
        The degrees of rotation to the ticklabels
    label_max_length : int, default=25
        If any label exceeds this length, it truncates it
    spines : tuple, default=('top', 'left', 'bottom', 'right')
        Defines which spines are to be visible
    theme : str, default="white_circle"
        Choose a 'theme' for the outliers, from {'white_circle',
        'red_square', 'green_diamond'}

    Other Parameters
    ----------------
    plot_kwargs : dict
        keyword arguments to pass to `ax.boxplot`

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-boxplot
    """
    instance_check(X, (np.ndarray, pd.Series, list, tuple))
    instance_check((vertical, notch, outliers, grid, with_strip), bool)
    instance_check(spines, (type(None), list))
    instance_check(theme, str)
    instance_check((label, color), (type(None), str))
    instance_check((capsize, width), float)
    instance_check(label_rotation, (int, float))
    instance_check(label_max_length, int)
    bounds_check(width, 0.0, 1.0)

    # coerce the input into a flattened numpy array
    _X = as_flattened_numpy(X)
    _style = _get_flier_style(theme)
    # convert X data if we have axis_scale
    if axis_scale:
        _X = _convert_x_scale(_X, axis_scale)
    if with_strip:
        outliers = False
    if ax is None and vertical:
        fig, ax = plt.subplots(figsize=(2.5, 5))
    elif ax is None and not vertical:
        fig, ax = plt.subplots(figsize=(5, 2.5))
    if spines is None:
        spines = ("left", "top", "right", "bottom")

    box_alpha = 1.0 if not with_strip else 0.5

    patch_obj = ax.boxplot(
        _X,
        vert=vertical,
        patch_artist=True,
        showfliers=outliers,
        notch=notch,
        widths=width,
        boxprops=dict(alpha=box_alpha),
        flierprops=_style,
        **plot_kwargs
    )
    # define basic arguments
    _define_boxplot_arguments(
        ax, patch_obj, vertical, None, grid, spines, capsize, axis_scale
    )
    # define colour features
    color = _color_arrangement(ax, patch_obj, color)
    # label the appropriate axes
    _label_axes(
        ax,
        X.name if isinstance(X, pd.Series) else label,
        vertical,
        label_rotation,
        label_max_length,
    )
    # plot the strips
    if with_strip:
        _overlay_stripplot(
            _X, ax, 1, width, color, vertical, outliers, strip_jitter=0.15
        )
    return ax
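# --- illustrative sketch (not library code) --------------------------------
# Drawing a horizontal, log-scaled boxplot with a strip overlay; the data
# here are synthetic:
#
# >>> import numpy as np
# >>> sample = np.random.gamma(2.0, 2.0, size=500)
# >>> ax = box1d(sample, label='gamma sample', axis_scale='log',
# ...            with_strip=True, vertical=False)
# >>> plt.show()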
def save(
    fig_obj: plt.Figure,
    plot_type: str,
    name: str = "example1",
    save_types: Tuple[str, ...] = ("png", "pdf"),
    fp: str = "./",
    dpi: int = 360,
    savemode: str = "first",
) -> bool:
    """Saves a matplotlib figure in many formats.

    Given a matplotlib.Figure object, save appropriate numbers of Figures
    to the respective folders.

    Parameters
    ----------
    fig_obj : plt.Figure
        The figure object to save.
    plot_type : str
        Choose from:
        {"scatter", "kde", "heatmap", "cluster", "bar", "hist",
         "quiver", "box", "line", "venn", "multi", "pie"}
    name : str, optional
        The name of the file; this may be added to based on the other
        parameters
    save_types : tuple of str, optional
        Choose any from {"png", "pdf", "svg", "eps", "ps"}
    fp : str, optional
        The file path to the root directory of saving images
    dpi : int, optional
        The resolution in dots per inch; set to high if you want a good image
    savemode : str, optional
        Choose from {'first', 'update'}:
        if 'first', only saves if the file isn't present;
        if 'update', overrides the saved figure if present

    Warnings
    --------
    UserWarning
        If the figure file itself already exists

    Raises
    ------
    IOError
        If the filepath does not exist
    TypeError
        If the arguments do not match their declared type
    ValueError
        If `plot_type`, `savemode` does not belong to an acceptable argument

    Returns
    -------
    success : bool
        Whether it was successful or not
    """
    accepted_types = (
        "scatter",
        "kde",
        "heatmap",
        "cluster",
        "bar",
        "hist",
        "quiver",
        "box",
        "line",
        "venn",
        "multi",
        "pie",
    )
    file_types_supported = ("png", "pdf", "svg", "eps", "ps")
    accepted_savemodes = ("first", "update")

    instance_check(fig_obj, (plt.Figure, mpl.figure.Figure))
    instance_check(name, str)
    instance_check(fp, str)
    belongs(plot_type, accepted_types)
    belongs(savemode, accepted_savemodes)
    for st in save_types:
        if st not in file_types_supported:
            raise TypeError("save_type: [%s] not supported" % st)

    # correct to ensure filepath has / at end
    if not fp.endswith("/"):
        fp += "/"

    # check whether the filepath exists
    if os.path.exists(fp):
        for t in save_types:
            # if the directory does not exist, create it!
            if not os.path.isdir(fp + "_" + t):
                os.mkdir(fp + "_" + t)
            # check if the figure itself already exists.
            filename = "{}_{}/{}_{}.{}".format(fp, t, plot_type, name, t)
            if os.path.isfile(filename):
                warnings.warn(
                    "Figure: '{}' already exists: using savemode: {}".format(
                        filename, savemode
                    ),
                    UserWarning,
                )
                if savemode == "update":
                    fig_obj.savefig(
                        filename, format=t, bbox_inches="tight", dpi=dpi
                    )
            else:
                # make the file
                fig_obj.savefig(filename, format=t, bbox_inches="tight", dpi=dpi)
    else:
        raise IOError("filepath: [%s] does not exist." % fp)
    return True
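# --- illustrative sketch (not library code) --------------------------------
# Saving a histogram figure; note that `fp` must already exist, and files
# land in per-format subdirectories, e.g. './figures/_png/hist_demo.png':
#
# >>> fig, ax = plt.subplots()
# >>> _ = ax.hist(np.random.randn(100))
# >>> save(fig, "hist", name="demo", fp="./figures/")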
def overview(
    df,
    x: SelectorType,
    y: str,
    cv,
    yp,
    plot_names: Optional[List[str]] = None,
    plot_size: int = 3,
):
    """Presents an overview of the results of a machine learning basic run.

    Parameters
    ----------
    df : MetaPanda
        The raw dataset.
    x : selector
        The input selection to the model
    y : str
        The target vector
    cv : MetaPanda
        The cross-validation results from a call to `fit_basic`
    yp : MetaPanda
        The resulting fitted values from a call to `fit_basic`
    plot_names : list of str, optional
        Names of specific plot types to draw.
        Choose any combo of:
        {'resid_fitted', 'score', 'actual_predicted', 'coef',
         'correlation', 'qqplot', 'cooks'}
        If None: draws ALL.
    plot_size : int, optional
        Defines the size of each plot.

    Returns
    -------
    None. Draws the plots; nothing is returned.
    """
    from turbopanda.corr._correlate import correlate, row_to_matrix
    from turbopanda.stats import cook_distance

    instance_check(plot_names, (type(None), list, tuple))
    instance_check(y, str)

    options_ = (
        "resid_fitted",
        "score",
        "actual_predicted",
        "coef",
        "correlation",
        "qqplot",
        "cooks",
    )

    """ Prepare data here. """
    # set yp as series
    yp = yp[y].squeeze()
    # pair them and remove NA
    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    _xcols = select_xcols(_df, x, y)
    _x, _y = preprocess_continuous_X_y(_df, _xcols, y)

    # the 'correlation' plot is only feasible for 2..49 input columns
    options_yes_ = (True, True, True, True, 1 < len(_xcols) < 50, True, True)
    # compress down options
    option_compressed = list(it.compress(options_, options_yes_))

    """ Make plots here """
    if plot_names is None:
        plot_names = options_
    # overlap plots
    overlap_ = sorted(intersect(option_compressed, plot_names), key=options_.index)
    # make plots
    fig, ax = gridplot(len(overlap_), ax_size=plot_size)
    I = it.count()

    if "score" in overlap_:
        # boxplot of the cross-validated scores
        _boxplot_scores(ax[next(I)], cv)
    if "resid_fitted" in overlap_:
        # fitted vs. residual plot
        _fitted_vs_residual(ax[next(I)], _y, yp)
    if "actual_predicted" in overlap_:
        # KDE plot estimation between y and yhat
        _actual_vs_predicted(ax[next(I)], _y, yp)
    if "coef" in overlap_:
        # coefficient plot
        coefficient(cv, ax[next(I)])
    if "correlation" in overlap_ and (1 < len(_xcols) < 50):
        # correlation matrix
        corr = correlate(df, x)
        _cmatrix = row_to_matrix(corr)
        _basic_correlation_matrix(ax[next(I)], _cmatrix)
    if "qqplot" in overlap_:
        # q-q plot
        stats.probplot(_df[y], dist="norm", plot=ax[next(I)])
    if "cooks" in overlap_:
        # outlier detection using Cook's distance plot
        _c = cook_distance(df, x, y, yp)
        _cooks(ax[next(I)], _c)

    fig.tight_layout()
    plt.show()
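# --- illustrative sketch (not library code) --------------------------------
# Typical use of `overview` after a model run; the exact return signature of
# `fit_basic` shown here is an assumption, not a documented guarantee:
#
# >>> cv, yp = fit_basic(df, x, y="target")  # doctest: +SKIP
# >>> overview(df, x, "target", cv, yp, plot_names=['score', 'qqplot'])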