def shorten(s, newl: int = 15, strategy: str = "middle"):
    """Shortens a string or array of strings to length `newl`.

    Parameters
    ----------
    s : str / list of str / np.ndarray / pd.Series / pd.Index
        The string or list of strings to shorten
    newl : int, default=15
        The number of characters to preserve (5 on each side + spaces)
    strategy : str, default="middle"
        Choose from {'middle', 'end'}, determines where to put dots...

    Returns
    -------
    ns : str / list of str
        A shortened string or array of strings
    """
    instance_check(s, (str, list, tuple, np.ndarray, Series, Index))
    instance_check(newl, int)
    belongs(strategy, ("middle", "end"))

    if isinstance(s, str):
        return _shorten_string(s, newl, strategy)
    else:
        # create a partial passing in keyword arguments to every call.
        _shorten_part = partial(_shorten_string, approp_len=newl, strategy=strategy)
        # map through the strings and shorten them.
        return list(map(_shorten_part, s))
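# Hypothetical usage sketch (not part of the original source); the import path
# `turbopanda.utils` is an assumption.
# >>> from turbopanda.utils import shorten
# >>> shorten("a_very_long_column_name_for_display", newl=15)
# >>> shorten(["first_long_label", "second_long_label"], newl=10, strategy="end")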
def _generate_diag_like_grid(n, direction, ax_size):
    """Direction is in [row, column]"""
    belongs(direction, ["row", "column"])

    f1, f2 = nearest_factors(n, shape="diag")
    axf1, axf2 = ax_size
    fmax, fmin = max(f1, f2), min(f1, f2)
    # get longest one
    tup, nc, nr = (
        ((axf1 * fmin, axf2 * fmax), fmin, fmax)
        if direction == "row"
        else ((axf1 * fmax, axf2 * fmin), fmax, fmin)
    )
    fig, axes = plt.subplots(ncols=nc, nrows=nr, figsize=tup)
    axes = _clean_axes_objects(n, axes)
    return fig, axes
def bleep(_func=None, *, note="C") -> Callable:
    """Provides an automatic sound cue when a function has completed.

    .. note:: this requires the `simpleaudio` package to run.

    The chord progression is played at the *end* of the function, not the start.

    Parameters
    ----------
    note : str
        Must be one of 'A' through 'G'

    Examples
    --------
    >>> from turbopanda.dev import bleep
    >>> @bleep
    ... def f(x):
    ...     # compute some long function here
    ...     pass
    """
    belongs(note, list(_get_notes_all()))

    # define decorator
    def _decorator_wrap(func):
        @wraps(func)
        def _bleep_function(*args, **kwargs):
            # enter try-catch and if success, positive noise, or failure, negative noise.
            try:
                result = func(*args, **kwargs)
                # make positive noise
                _play_arpeggio(note.upper(), key="major")
                # return
                return result
            except Exception as e:
                # make negative noise
                _play_arpeggio(note.upper(), key="minor")
                print(e.args)

        return _bleep_function

    if _func is None:
        return _decorator_wrap
    else:
        return _decorator_wrap(_func)
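# Hypothetical usage sketch (illustration only); the keyword-argument form is
# inferred from the signature above.
# >>> from turbopanda.dev import bleep
# >>> @bleep(note="E")
# ... def long_running_fit(X, y):
# ...     ...  # plays a major arpeggio on success, a minor one on exception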
def rename_axis(self,
                ops: Tuple[str, str],
                selector: Optional[SelectorType] = None,
                axis: int = 1):
    """Perform a chain of .str.replace operations on one of the axes.

    .. note:: strings that are unchanged remain the same (are not NA'd).

    Parameters
    ----------
    ops : list of tuple (2,)
        Where the first value of each tuple is the string to find, with its replacement.
        At this stage we only accept *direct* replacements. No regex.
        Operations are performed 'in order'.
    selector : None, str, or tuple args, optional
        Contains either types, meta column names, column names or regex-compliant strings.
        If None, all column names are subject to potential renaming
    axis : int, optional
        Choose from {1, 0}; 1 = columns, 0 = index.

    Returns
    -------
    self
    """
    # check ops is right format
    is_twotuple(ops)
    belongs(axis, [0, 1])

    curr_cols = sel_cols = inspect(
        self.df_, self.meta_, self.selectors_, selector, axis=axis, mode="view"
    )
    # performs the replacement operation inplace
    curr_cols = string_replace(curr_cols, ops)
    # rename using mapping
    _rename_axis(self.df_, self.meta_, sel_cols, curr_cols, axis=axis)
    return self
def save(
    fig_obj: plt.Figure,
    plot_type: str,
    name: str = "example1",
    save_types: Tuple[str, ...] = ("png", "pdf"),
    fp: str = "./",
    dpi: int = 360,
    savemode: str = "first",
) -> bool:
    """Saves a matplotlib figure in many formats.

    Given a matplotlib.Figure object, save appropriate numbers of Figures
    to the respective folders.

    Parameters
    ----------
    fig_obj : plt.Figure
        The figure object to save.
    plot_type : str
        Choose from:
        {"scatter", "kde", "heatmap", "cluster", "bar", "hist", "quiver",
         "box", "line", "venn", "multi", "pie"}
    name : str, optional
        The name of the file, this may be added to based on the other parameters
    save_types : tuple of str, optional
        Choose any from {"png", "pdf", "svg", "eps", "ps"}
    fp : str, optional
        The file path to the root directory of saving images
    dpi : int, optional
        The resolution in dots per inch; set to high if you want a good image
    savemode : str, optional
        Choose from {'first', 'update'}.
        If 'first', only saves if the file isn't present.
        If 'update', overrides the saved figure if present.

    Warnings
    --------
    UserWarning
        If the figure file itself already exists

    Raises
    ------
    IOError
        If the filepath does not exist
    TypeError
        If the arguments do not match their declared type
    ValueError
        If `plot_type`, `savemode` does not belong to an acceptable argument

    Returns
    -------
    success : bool
        Whether it was successful or not
    """
    accepted_types = (
        "scatter", "kde", "heatmap", "cluster", "bar", "hist",
        "quiver", "box", "line", "venn", "multi", "pie",
    )
    file_types_supported = ("png", "pdf", "svg", "eps", "ps")
    accepted_savemodes = ("first", "update")

    instance_check(fig_obj, (plt.Figure, mpl.figure.Figure))
    instance_check(name, str)
    instance_check(fp, str)
    belongs(plot_type, accepted_types)
    belongs(savemode, accepted_savemodes)
    for st in save_types:
        if st not in file_types_supported:
            raise TypeError("save_type: [%s] not supported" % st)

    # correct to ensure filepath has / at end
    if not fp.endswith("/"):
        fp += "/"

    # check whether the filepath exists
    if os.path.exists(fp):
        for t in save_types:
            # if the directory does not exist, create it!
            if not os.path.isdir(fp + "_" + t):
                os.mkdir(fp + "_" + t)
            # check if the figures themselves already exist.
            filename = "{}_{}/{}_{}.{}".format(fp, t, plot_type, name, t)
            if os.path.isfile(filename):
                warnings.warn(
                    "Figure: '{}' already exists: Using savemode: {}".format(
                        filename, savemode),
                    UserWarning,
                )
                if savemode == "update":
                    fig_obj.savefig(filename, format=t, bbox_inches="tight", dpi=dpi)
            else:
                # make the file
                fig_obj.savefig(filename, format=t, bbox_inches="tight", dpi=dpi)
    else:
        raise IOError("filepath: [%s] does not exist." % fp)

    return True
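# Hypothetical usage sketch (illustration only); the import path is an assumption.
# >>> import matplotlib.pyplot as plt
# >>> from turbopanda.plot import save
# >>> fig, ax = plt.subplots()
# >>> ax.plot([0, 1], [0, 1])
# >>> save(fig, "line", name="identity", save_types=("png",), fp="./figures/")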
def partial_bicorr(data: pd.DataFrame, x: str, y: str, covar: Union[str, List[str], Tuple[str, ...], pd.Index], method: str = "spearman", tail: str = "two-sided", output: str = 'score') -> Union[float, dict]: """Partial and semi-partial correlation. Adapted from the `pingouin` library, made by Raphael Vallat. .. [1] https://github.com/raphaelvallat/pingouin/blob/master/pingouin/correlation.py Parameters ---------- data : pd.DataFrame The full dataset including covariates. x, y : str, list of str x and y. Must be names of columns in ``data``. covar : list of str Covariate(s). Column names of the covariates. covar must be made of continuous columns. If x, y are not continuous, will perform logistic regression to generate residuals. method : string Specify which method to use for the computation of the correlation coefficient. Available methods are :: 'pearson' : Pearson product-moment correlation 'spearman' : Spearman rank-order correlation 'biserial' : Biserial correlation (continuous and boolean data) 'kendall' : Kendall’s tau (ordinal data) 'percbend' : percentage bend correlation (robust) 'shepherd' : Shepherd's pi correlation (robust Spearman) 'skipped' : skipped correlation (robust Spearman, requires sklearn) tail : string Specify whether to return the 'one-sided' or 'two-sided' p-value. output : str, default='score' Determines whether to display the full output or just the correlation (r) score options are {'score', 'full'}. Returns ------- stats : float/dict Test summary :: 'n' : Sample size (after NaN removal) 'outliers' : number of outliers (only for 'shepherd' or 'skipped') 'r' : Correlation coefficient 'CI95' : 95% parametric confidence intervals 'r2' : R-squared 'adj_r2' : Adjusted R-squared 'method' : pearson/spearman/biserial... etc 'p-val' : one or two tailed p-value 'BF10' : Bayes Factor of the alternative hypothesis (Pearson only) 'power' : achieved power of the test (= 1 - type II error). Notes ----- From [4]_: “With *partial correlation*, we find the correlation between :math:`x` and :math:`y` holding :math:`C` constant for both :math:`x` and :math:`y`. Sometimes, however, we want to hold :math:`C` constant for just :math:`x` or just :math:`y`. In that case, we compute a *semi-partial correlation*. A partial correlation is computed between two residuals. A semi-partial correlation is computed between one residual and another raw (or unresidualized) variable.” Note that if you are not interested in calculating the statistics and p-values but only the partial correlation matrix, a (faster) alternative is to use the :py:func:`pingouin.pcorr` method (see example 4). Rows with missing values are automatically removed from data. Results have been tested against the `ppcor` R package. References ---------- .. [2] https://en.wikipedia.org/wiki/Partial_correlation .. [3] https://cran.r-project.org/web/packages/ppcor/index.html .. [4] https://gist.github.com/fabianp/9396204419c7b638d38f .. [5] http://faculty.cas.usf.edu/mbrannick/regression/Partial.html """ # perform all checks in the public method.. instance_check(data, pd.DataFrame) instance_check((x, y), str) instance_check(covar, (str, list, tuple, pd.Index)) belongs(tail, ("one-sided", "two-sided")) belongs( method, ( "pearson", "spearman", "kendall", "biserial", "percbend", "shepherd", "skipped", ), ) belongs(output, ('score', 'full')) # perform a check to make sure every column in `covar` # is continuous. 
    if not is_dataframe_float(data[covar]):
        raise TypeError("`covar` variables in `partial_bicorr` "
                        "all must be of type `float`/continuous.")

    return _partial_bicorr_inner(data, x, y, covar,
                                 tail=tail, method=method, output=output)
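# Hypothetical usage sketch (illustration only); column names are made up and
# the import path is an assumption.
# >>> import turbopanda as turb
# >>> from turbopanda.corr import partial_bicorr
# >>> data = turb.read('example.json').df_
# >>> partial_bicorr(data, "X", "Y", covar=["M"], method="spearman", output="full")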
def bicorr(x: pd.Series,
           y: pd.Series,
           method: str = "spearman",
           tail: str = "two-sided",
           output: str = "score") -> Union[float, dict]:
    """(Robust) correlation between two variables.

    Adapted from the `pingouin` library, made by Raphael Vallat.

    .. [1] https://github.com/raphaelvallat/pingouin/blob/master/pingouin/correlation.py

    Parameters
    ----------
    x, y : pd.Series
        First and second set of observations. x and y must be independent.
    method : str
        Specify which method to use for the computation of the correlation
        coefficient. Available methods are ::
            'pearson' : Pearson product-moment correlation
            'spearman' : Spearman rank-order correlation
            'kendall' : Kendall’s tau (ordinal data)
            'biserial' : Biserial correlation (continuous and boolean data)
            'percbend' : percentage bend correlation (robust)
            'shepherd' : Shepherd's pi correlation (robust Spearman)
            'skipped' : skipped correlation (robust Spearman, requires sklearn)
    tail : str
        Specify whether to return 'one-sided' or 'two-sided' p-value.
    output : str, default='score'
        Determines whether to display the full output or just the correlation (r) score
        options are {'score', 'full'}.

    Returns
    -------
    stats : float/dict
        Test summary ::
            'n' : Sample size (after NaN removal)
            'outliers' : number of outliers (only for 'shepherd' or 'skipped')
            'r' : Correlation coefficient
            'CI95' : 95% parametric confidence intervals
            'r2' : R-squared
            'adj_r2' : Adjusted R-squared
            'method' : pearson/spearman/biserial... etc
            'p-val' : one or two tailed p-value
            'power' : achieved power of the test (= 1 - type II error).

    Notes
    -----
    The Pearson correlation coefficient measures the linear relationship between
    two datasets. Strictly speaking, Pearson's correlation requires that each
    dataset be normally distributed. Correlations of -1 or +1 imply an exact
    linear relationship.

    The Spearman correlation is a nonparametric measure of the monotonicity of
    the relationship between two datasets. Unlike the Pearson correlation, the
    Spearman correlation does not assume that both datasets are normally
    distributed. Correlations of -1 or +1 imply an exact monotonic relationship.

    Kendall’s tau is a measure of the correspondence between two rankings.
    Values close to 1 indicate strong agreement, values close to -1 indicate
    strong disagreement.

    The percentage bend correlation [1]_ is a robust method that protects
    against univariate outliers.

    The Shepherd's pi [2]_ and skipped [3]_, [4]_ correlations are both robust
    methods that return the Spearman's rho after bivariate outliers removal.
    Note that the skipped correlation requires that the scikit-learn package is
    installed (for computing the minimum covariance determinant).

    Please note that rows with NaN are automatically removed.

    References
    ----------
    .. [1] Wilcox, R.R., 1994. The percentage bend correlation coefficient.
       Psychometrika 59, 601–616. https://doi.org/10.1007/BF02294395
    .. [2] Schwarzkopf, D.S., De Haas, B., Rees, G., 2012. Better ways to improve
       standards in brain-behavior correlation analysis. Front. Hum. Neurosci. 6,
       200. https://doi.org/10.3389/fnhum.2012.00200
    .. [3] Rousselet, G.A., Pernet, C.R., 2012. Improving standards in
       brain-behavior correlation analyses. Front. Hum. Neurosci. 6, 119.
       https://doi.org/10.3389/fnhum.2012.00119
    .. [4] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust correlation
       analyses: false positive and power validation using a new open source
       matlab toolbox. Front. Psychol. 3, 606.
       https://doi.org/10.3389/fpsyg.2012.00606
    """
    # perform all checks in the public method, rather than repeating them internally.
    # check type
    instance_check((x, y), pd.Series)
    belongs(tail, ("one-sided", "two-sided"))
    belongs(
        method,
        (
            "pearson",
            "spearman",
            "kendall",
            "biserial",
            "percbend",
            "shepherd",
            "skipped",
        ),
    )
    belongs(output, ('score', 'full'))
    # Check size
    if x.shape[0] != y.shape[0]:
        raise ValueError("x and y must have the same length.")

    if output == "score":
        return _bicorr_inner_score(x, y, method)
    else:
        return _bicorr_inner_full(x, y, method, tail=tail)
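# Hypothetical usage sketch (illustration only); the import path is an assumption.
# >>> import pandas as pd
# >>> from turbopanda.corr import bicorr
# >>> a = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
# >>> b = pd.Series([1.2, 1.9, 3.4, 3.9, 5.1])
# >>> bicorr(a, b, method="spearman")                  # float r score
# >>> bicorr(a, b, method="pearson", output="full")    # dict with 'r', 'p-val', ...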
def hist_grid(mdf: Union[DataFrame, "MetaPanda"], subset: SelectorType, arrange: str = "square", plot_size: int = 3, shared_dist: str = "auto", savepath: Optional[Union[str, bool]] = None, **hist_kws): """ Plots a grid of histograms comparing the distributions in a MetaPanda selector. Parameters -------- mdf : turb.MetaPanda The dataset subset : str or list/tuple of str Contains either types, meta column names, column names or regex-compliant strings arrange : str Choose from ['square', 'row', 'column']. Square arranges the plot as square-like as possible. Row prioritises plots row-like, and column-wise for column. plot_size : int, default=3 The size of each axes shared_dist : str/tuple of str/dict, default="auto" Determines what KDE to fit to the data, set to None if you don't want If tuple/list: attempts using these specified distributions If dict: maps column name (k) to distribution choice (v) savepath : None, bool, str saves the figure to file. If bool, uses the name in mdf, else uses given string. If None, no fig is saved. Other Parameters ---------------- hist_kws : dict Keywords to pass to `turb.plot.histogram` Returns ------- None """ # checks instance_check(shared_dist, (type(None), str, list, tuple, dict)) instance_check(savepath, (type(None), str, bool)) nonnegative(plot_size, int) belongs(arrange, ["square", "row", "column"]) # make a metapanda if we have a dataframe. _mdf = MetaPanda(mdf) if isinstance(mdf, DataFrame) else mdf # get selector selection = _mdf.view(subset) # assuming we've selected something... if selection.size > 0: fig, axes = gridplot(len(selection), arrange, ax_size=plot_size) if not isinstance(shared_dist, dict): for i, x in enumerate(selection): _ = histogram(_mdf[x].dropna(), ax=axes[i], title=x, kde=shared_dist, **hist_kws) fig.tight_layout() else: for i, (x, d) in enumerate(shared_dist.items()): _ = histogram(_mdf[x].dropna(), ax=axes[i], title=x, kde=d, **hist_kws) # iterate over any 'remaining' columns in selection and handle appropriately remaining = difference(selection, tuple(shared_dist.keys())) if remaining.shape[0] > 0: for i, x in enumerate(remaining): _ = histogram(_mdf[x].dropna(), ax=axes[i + len(shared_dist)], title=x, kde="auto", **hist_kws) fig.tight_layout() if isinstance(savepath, bool): save(fig, "hist", _mdf.name_) elif isinstance(savepath, str): save(fig, "hist", _mdf.name_, fp=savepath)
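# Hypothetical usage sketch (illustration only); the dataset, selector string and
# import path are assumptions.
# >>> import turbopanda as turb
# >>> from turbopanda.plot import hist_grid
# >>> mdf = turb.read("example.json")
# >>> hist_grid(mdf, subset="X|M|Y", arrange="square", plot_size=3, shared_dist="norm")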
def best_model(cv_results,
               y_var: str = "test",
               minimize: bool = True,
               score: str = "RMSE",
               **box_kws):
    """Determines the best model (min or max) and plots the boxplot of all resulting best models.

    Parameters
    ----------
    cv_results : MetaPanda
        The results from a call to `fit_grid`.
    y_var : str
        Choose from {'test', 'train'}
        If 'test': draws the test score
        If 'train': draws the training score
    minimize : bool
        If True, selects best smallest score, else select best largest score
    score : str
        The name of the scoring function
    box_kws : dict, optional
        Keyword arguments to pass to `plt.boxplot`.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object
    """
    instance_check(minimize, bool)
    instance_check(score, str)
    belongs(y_var, ("train", "test"))

    sely = pattern("mean_%s_score" % y_var, cv_results.columns, False)

    # create figures
    fig = plt.figure(figsize=(8, 5))
    ax = fig.add_subplot(111)
    # create a copy
    res = cv_results.df_ if not isinstance(cv_results, pd.DataFrame) else cv_results
    # transform.
    if res[sely].squeeze().mean() < 0.0:
        res = res.pipe(absolute, "(?:split[0-9]+|mean)_(?:train|test)_score")
    # for each 'model', arrange data into boxplot
    if minimize:
        indices = res.groupby("model")[sely].idxmin()
    else:
        indices = res.groupby("model")[sely].idxmax()

    # arrange data (`res` is a DataFrame here, so select the split columns by pattern)
    result_p = res.loc[indices, pattern("split[0-9]+_%s_score" % y_var, res.columns, False)]
    # reorder based on the best score
    re_order = result_p.median(axis=1).sort_values()
    result_p = result_p.reindex(re_order.index)
    # get best score name
    indices = switcheroo(indices).reindex(re_order.index)
    # plot
    bp = ax.boxplot(result_p, patch_artist=True, showfliers=False, **box_kws)
    # fetch package names and map them to colors - returned as pd.Series
    packages = find_model_family(indices.values)
    # map colors to each of the packages.
    mapping = dictzip(set_like(packages), color_qualitative(len(set_like(packages))))
    mapped_cols = packages.map(mapping)
    # iterate over boxes and colour
    for box, col in zip(bp["boxes"], mapped_cols):
        box.set(facecolor=col, linewidth=1.2)
    plt.setp(bp["medians"], linewidth=1.5)
    # additional box requirements
    ax.set_xlabel("Model")
    ax.set_ylabel("%s %s" % (y_var, score))
    ax.set_xticklabels(indices.values)
    ax.tick_params("x", rotation=45)
    ax.grid()
    for tick in ax.get_xmajorticklabels():
        tick.set_horizontalalignment("right")
    # generate legend
    ax.legend(legend_line(mapping), list(mapping.keys()), bbox_to_anchor=(1.03, 1.03))
    plt.show()
    return fig
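# Hypothetical usage sketch (illustration only); the `fit_grid` call and the
# import path are assumptions.
# >>> from turbopanda.ml import fit_grid, best_model
# >>> cv_results = fit_grid(...)          # grid-search results over several models
# >>> fig = best_model(cv_results, y_var="test", minimize=True, score="RMSE")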
def cachedec(
    _func=None,
    *,
    filename: str = "example1.pkl",
    compress: int = 0,
    return_as: str = "MetaPanda"
) -> Callable:
    """Provides automatic decorator caching for objects.

    Especially compatible with `turb.MetaPanda` or `pd.DataFrame`.

    .. note:: this is a decorator function, not to be called directly.
        All parameters must be passed as keyword arguments.

    Parameters
    ----------
    _func
    filename : str, optional
        The name of the file to cache to, or read from. This is fixed.
        Accepts {'json', 'csv', 'pkl'} extensions only.
    compress : int [0-9] or 2-tuple, optional
        Optional compression level for the data. 0 or False is no compression.
        Higher value means more compression, but also slower read and write times.
        Using a value of 3 is often a good compromise. See the notes for more details.
        If compress is True, the compression level used is 3.
        If compress is a 2-tuple, the first element must correspond to a string
        between supported compressors (e.g 'zlib', 'gzip', 'bz2', 'lzma' 'xz'),
        the second element must be an integer from 0 to 9, corresponding to the
        compression level.
    return_as : str, default="MetaPanda"
        Accepts {'pandas', 'MetaPanda'}
        Only applies if filename is "csv" or "json". Attempts to cast the return
        object as something palatable to the user.

    Warnings
    --------
    ImportWarning
        Returned object from cache isn't of type {pd.DataFrame, MetaPanda}

    Raises
    ------
    TypeError
        `filename` isn't of type `str`
    ValueError
        `filename` extension isn't found in {'json', 'csv', 'pkl'}

    Returns
    -------
    mp : turb.MetaPanda / object
        The MetaPanda object if {'csv' or 'json'}, otherwise uses serialized
        pickling which can return an arbitrary object.

    Examples
    --------
    For example, we call as a decorator to our custom function:

    >>> from turbopanda import cachedec
    >>> @cachedec(filename='meta_file.json')
    ... def f(x):
    ...     return turb.MetaPanda(x)

    These also work with numpy arrays or python objects by using `joblib`:

    >>> @cachedec(filename="meta.pkl")
    ... def g(x):
    ...     return [1, 2, [3, 4], {"hi": "moto"}]
    """
    # check it is string
    instance_check(filename, str)
    file_ext = filename.rsplit(".")[-1]
    # check that file ends with json or csv
    belongs(file_ext, ("json", "csv", "pkl"))

    # define decorator
    def _decorator_cache(func):
        """Basic decorator."""

        @functools.wraps(func)
        def _wrapper_cache(*args, **kwargs):
            # if we find the file
            if os.path.isfile(filename):
                # if its .csv or .json, use `read`
                if file_ext in ("json", "csv"):
                    # read it in
                    mdf = read(filename)
                    _set_index_def(mdf.df_)
                    if return_as == "MetaPanda":
                        return mdf
                    else:
                        return mdf.df_
                else:
                    if is_joblib_installed(raise_error=True):
                        import joblib

                        mdf = joblib.load(filename)
                        return mdf
            else:
                # returns MetaPanda or pandas.DataFrame
                mpf = func(*args, **kwargs)
                if isinstance(mpf, MetaPanda):
                    # save file
                    mpf.write(filename)
                    if return_as == "MetaPanda":
                        return mpf
                    else:
                        return mpf.df_
                elif isinstance(mpf, DataFrame):
                    # save - bumping index into the file.
                    mpf.reset_index().to_csv(filename, index=None)
                    if return_as == "MetaPanda":
                        return MetaPanda(mpf)
                    else:
                        return mpf
                else:
                    if is_joblib_installed(raise_error=True):
                        import joblib

                        # attempt to use joblib to dump
                        joblib.dump(mpf, filename, compress=compress)
                        return mpf

        return _wrapper_cache

    if _func is None:
        return _decorator_cache
    else:
        return _decorator_cache(_func)
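# Hypothetical usage sketch (illustration only); the file name and data are made up.
# >>> import numpy as np
# >>> from turbopanda import cachedec
# >>> @cachedec(filename="expensive_result.pkl", compress=3)
# ... def expensive_compute(n):
# ...     return np.random.rand(n, n)
# >>> A = expensive_compute(500)   # computed once, then loaded from cache afterwards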
def scatter(X: _ArrayLike,
            Y: _ArrayLike,
            c: Union[str, _ArrayLike] = "k",
            marker: Union[str, _ArrayLike] = "o",
            s: Optional[Union[_Numeric, _ArrayLike]] = None,
            dense: bool = False,
            fit_line: bool = False,
            ax: Optional[mpl.axes.Axes] = None,
            alpha: Optional[float] = None,
            cmap: str = "viridis",
            legend: bool = True,
            colorbar: bool = True,
            with_jitter: bool = False,
            x_label: Optional[str] = None,
            y_label: Optional[str] = None,
            x_scale: str = "linear",
            y_scale: str = "linear",
            legend_outside: bool = False,
            title: str = "",
            with_grid: bool = False,
            fit_line_degree: int = 1,
            **scatter_kws):
    """Generates a scatterplot, with some useful features added to it.

    Parameters
    ----------
    X : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw on the x-axis. Flattens if np.ndarray
    Y : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw on the y-axis. Flattens if np.ndarray
    c : str/list/tuple/np.ndarray/pd.Series (1d), default='k'
        The colour of the points.
        If array, colors must be a categorical/valid float type, uses cmap
    marker : str/list/tuple/np.ndarray/pd.Series (1d), default='o'
        The marker style of the points.
        If type=list/array, array must be a categorical/str-like type to map to matplotlib markers
        If dense=True, treats each marker as a circle, ignores this input
    s : int/float/list/tuple/np.ndarray/pd.Series (1d), optional
        Size of each point.
        If dense=True, this value is set automatically.
        If type=list/array, array must be array of floats
    dense : bool
        If True, draws the uniform densities instead of the actual points
    fit_line : bool
        If True, draws a line of best fit on the data
    ax : matplotlib.ax.Axes, optional, default=None
        If None, creates one.
    alpha : float, optional
        Sets the alpha for colour. If dense is True, this value is set automatically
    cmap : str, default="viridis"
        The default colormap for continuous-valued c.
    legend : bool, default=True
        Draws a legend if the 'c' variable is discrete
    colorbar : bool, default=True
        Draws a colorbar if the 'c' variable is continuous
    with_jitter : bool, default=False
        If True, and dense=True, adds some jitter to the uniform points
    x_label : str, default="x-axis"
        If X is not a pandas.Series, this is used
    y_label : str, default="y-axis"
        If Y is not a pandas.Series, this is used
    x_scale : str, default="linear"
        Choose from {'linear', 'log', 'symlog', 'logit'}, see `matplotlib.ax.set_xscale`
    y_scale : str, default="linear"
        Choose from {'linear', 'log', 'symlog', 'logit'}, see `matplotlib.ax.set_yscale`
    legend_outside : bool, default=False
        If True, plots the legend outside the plot at (1, 1)
    title : str, default=""
        Optional title at the top of the axes
    with_grid : bool, default=False
        If True, draws a grid
    fit_line_degree : int, default=1
        If fit_line=True, determines the degree to which a line is fitted to the data,
        allows polynomials

    Other Parameters
    ----------------
    scatter_kws : dict
        Keyword arguments to pass to `ax.scatter`

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-scatter
    """
    instance_check((X, Y), (list, tuple, np.ndarray, Series))
    instance_check((c, marker), (str, list, tuple, np.ndarray, Series, Index))
    instance_check(
        s, (type(None), int, float, list, tuple, np.ndarray, Series, Index))
    instance_check(alpha, (type(None), float))
    instance_check(ax, (type(None), mpl.axes.Axes))
    instance_check(
        (dense, with_jitter, fit_line, with_grid, legend, legend_outside), bool)
    instance_check((x_label, y_label, title, x_scale, y_scale), (type(None), str))
    instance_check(fit_line_degree, int)

    arrays_equal_size(X, Y)
    if isinstance(marker, str):
        belongs(marker, _marker_set())

    # get subset where missing values from either are dropped
    _X = as_flattened_numpy(X)
    _Y = as_flattened_numpy(Y)
    # remove values not found in both.
    _X, _Y = remove_na(_X, _Y, paired=True)

    # warn the user if n is large to maybe consider dense option?
    if _X.shape[0] > 15000 and not dense:
        warn(
            "Data input n={} is large, consider setting dense=True or using function `scatter_slim`."
            .format(_X.shape[0]),
            UserWarning,
        )

    # reconfigure colors if qualitative
    if isinstance(s, (list, tuple)) and not dense:
        s = as_flattened_numpy(s)
        arrays_equal_size(X, Y, s)
    if isinstance(marker, (list, tuple)) and not dense:
        marker = np.asarray(marker)
        arrays_equal_size(X, Y, marker)

    if not isinstance(c, str):
        # do some prep work on the color variable.
        palette, _cmode = cat_array_to_color(c, cmap=cmap)
        # perform size check
        arrays_equal_size(X, Y, palette)
    else:
        palette = c
        _cmode = "static"

    if ax is None:
        fig, ax = plt.subplots(figsize=(8, 5))

    if dense:
        # alpha, s are set in this case
        alpha = 0.8
        marker = "o"
        # perform density plotting
        bins_x = min(freedman_diaconis_bins(_X), 50)
        bins_y = min(freedman_diaconis_bins(_Y), 50)
        # estimate counts using histogram2d
        s, xs, ys = np.histogram2d(_X, _Y, bins=(bins_x, bins_y))
        # create a mesh
        xp, yp = np.meshgrid(xs[:-1], ys[:-1])
        if with_jitter:
            xp += np.random.rand(*xp.shape) / (_X.max() - _X.min())
            yp += np.random.rand(*yp.shape) / (_Y.max() - _Y.min())
    else:
        if alpha is None:
            alpha = _select_best_alpha(_X.shape[0])
        if s is None:
            s = _select_best_size(_X.shape[0])
        xp = _X
        yp = _Y

    # draw
    _ = _draw_scatter(xp, yp, palette, s, marker, alpha, ax, cmap=cmap, **scatter_kws)

    # optionally fit a line of best fit
    if fit_line:
        _draw_line_best_fit(_X, _Y, palette, ax, fit_line_degree)

    if with_grid:
        ax.grid()

    # associate legend if colour map is used
    if _cmode == "discrete" and legend:
        map_legend(c, palette, marker, ax, legend_outside)
    elif _cmode == "continuous" and colorbar:
        # add colorbar
        _make_colorbar(c, ax, cmap)

    # apply x-label, y-label, title
    if isinstance(x_label, str):
        ax.set_xlabel(x_label)
    elif isinstance(X, Series):
        ax.set_xlabel(X.name)

    if isinstance(y_label, str):
        ax.set_ylabel(y_label)
    elif isinstance(Y, Series):
        ax.set_ylabel(Y.name)

    ax.set_xscale(x_scale)
    ax.set_yscale(y_scale)
    ax.set_title(title)

    return ax
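# Hypothetical usage sketch (illustration only); the import path is an assumption.
# >>> import numpy as np
# >>> from turbopanda.plot import scatter
# >>> x = np.random.randn(200)
# >>> y = 2 * x + np.random.randn(200) * 0.5
# >>> ax = scatter(x, y, fit_line=True, x_label="x", y_label="y", title="demo")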
def bar1d( X: _ArrayLike, Y: Optional[_ListLike] = None, c: Optional[Union[_ArrayLike, str]] = "k", vert: bool = True, sort: bool = True, ax: Optional[mpl.axes.Axes] = None, scale: str = "linear", annotate: bool = False, legend: bool = False, width: float = 0.8, label_rotation: float = 0.0, value_label: Optional[str] = None, sort_by: str = "values", cmap: str = "Blues", linesAt: Optional[Union[_Numeric, _ListLike]] = None, ): """Plots a 1 dimensional barplot. Parameters ---------- X : list, tuple, np.ndarray, pd.Series Categorical/string/time labels for the data. If pandas.Series, Index must be categorical, Values must be numeric. Y : list, tuple, np.ndarray, optional If None, X must be a pd.Series. Must be numeric dtype. c : str/list/tuple/np.ndarray/pd.Series (1d), optional Defines the colour of each bar. If str, colours all of the bars with the same If array, must be a categorical type. If None, uses an automatic qualitative palette vert : bool, default=True Determines whether the plot is vertical or horizontal sort : bool, default=True Sorts the data or labels ax : matplotlib.ax.Axes, optional, default=None If None, creates one. scale : str, default="linear" Determines how to scale the numeric axis. annotate : bool, default=False Determines whether values should be annotated legend : bool, default=False Choose whether to display a legend width : float, default=0.8 The width of each bar in the barplot label_rotation : float, default=0 The degrees of rotation to the ticklabels value_label : str, optional Defines a name for the numerical axis sort_by : str, default="values" Defines how to sort the data if sort=True. Choose from {'values', 'labels'} cmap : str, default="Blues" Defines a colormap if color values are specified linesAt : int, float, list, tuple, optional If set, defines one or more vertical lines to add to the barplot Returns ------- ax : matplotlib.ax object Allows further modifications to the axes post-boxplot """ # define plot if not set belongs(sort_by, ("values", "labels")) if ax is None: fig, ax = plt.subplots(figsize=(8, 5)) # X is either numerical (in which case there is no Y, or categorical labels) if Y is None: # in this case, X must contain all the data if isinstance(X, pd.Series): _labels = as_flattened_numpy(X.index) _values = as_flattened_numpy(X.values) _ticks = np.arange(X.shape[0]) value_label = X.name else: _labels = _ticks = np.arange(len(X)) _values = as_flattened_numpy(X) else: # X is labels, Y are numeric values (assume!) _labels = as_flattened_numpy(X) _values = as_flattened_numpy(Y) _ticks = np.arange(_labels.shape[0]) # sort out colour pal = _determine_color_palette(c, _ticks.shape[0], cmap) # perform sorting here if sort: if sort_by == "values": _order = np.argsort(_values) elif sort_by == "labels": _order = np.argsort(_labels) else: raise ValueError( "sort_by '{}': must be ['values', 'labels']".format(sort_by) ) # apply sort if not isinstance(c, (type(None), str)): _labels, _values, pal = _apply_data_sort(_order, _labels, _values, pal) else: _labels, _values = _apply_data_sort(_order, _labels, _values) # plot the bar _plot_bar_orient( ax, _ticks, _labels, _values, c=pal, w=width, vert=vert, lrot=label_rotation, annotate=annotate, lines=linesAt, vlabel=value_label, ) # orient scale if vert: ax.set_yscale(scale) else: ax.set_xscale(scale) # map a legend to it if legend and not isinstance(c, str): map_legend(c, pal, "o", ax, False) return ax
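# Hypothetical usage sketch (illustration only); the import path is an assumption.
# >>> import pandas as pd
# >>> from turbopanda.plot import bar1d
# >>> counts = pd.Series([10, 4, 7], index=["apples", "oranges", "pears"])
# >>> ax = bar1d(counts, annotate=True, sort_by="values")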
def test_belongs2(self):
    l1 = ['apples', 'oranges', 'pears']
    assert utils.belongs("apples", l1)
def test_belongs1(self, s):
    l1 = ['apples', 'oranges', 'pears']
    with pytest.raises(ValueError):
        assert utils.belongs(s, l1)
def gridplot(
    n_plots: int,
    arrange: str = "square",
    ax_size: Union[int, Tuple[int, int]] = 2,
    annotate_labels: bool = False,
    annotate_offset: float = 0.01,
    **annotate_args
):
    """Determines the most optimal shape for a set of plots.

    Parameters
    ----------
    n_plots : int
        The total number of plots.
    arrange : str, default="square"
        Choose from {'square', 'row', 'column'}. Indicates preference for direction of plots.
    ax_size : int, default=2
        The square size of each plot.
    annotate_labels : bool, default=False
        If True, adds A, B,.. K label to top-left corner of each axes.
    annotate_offset : float, default=0.01
        Determines the amount of offset for each label

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure
    axes : list of matplotlib.ax.Axes
        A list of axes to use.
    """
    instance_check(annotate_labels, bool)
    nonnegative((n_plots,), int)
    belongs(arrange, ["square", "row", "column"])

    annot_props = {
        "weight": "bold",
        "horizontalalignment": "left",
        "verticalalignment": "center",
    }
    # update with args
    annot_props.update(annotate_args)

    if isinstance(ax_size, int):
        fs = np.array([ax_size, ax_size])
    else:
        fs = np.array(ax_size)

    if n_plots == 1:
        fig, ax = plt.subplots(figsize=fs)
        # wrap ax as a list to iterate over.
        if annotate_labels:
            fig.text(0.01, 0.98, "A", **annot_props)
        return fig, [ax]
    else:
        fig, ax = (
            _generate_square_like_grid(n_plots, ax_size=fs)
            if arrange == "square"
            else _generate_diag_like_grid(n_plots, arrange, ax_size=fs)
        )
        # add annotation labels
        if annotate_labels:
            # we use tight layout to make sure text isn't overlapping
            fig.tight_layout()
            for a, n in zip(ax, string.ascii_uppercase):
                pos_ = a.get_position().bounds
                # add label
                fig.text(
                    pos_[0] - annotate_offset,
                    pos_[1] + pos_[3] + annotate_offset,
                    n,
                    **annot_props
                )
        return fig, ax
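# Hypothetical usage sketch (illustration only); the import path is an assumption.
# >>> from turbopanda.plot import gridplot
# >>> fig, axes = gridplot(6, arrange="row", ax_size=3, annotate_labels=True)
# >>> for ax in axes:
# ...     ax.plot([0, 1], [0, 1])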
def cached_chunk( func: Callable, param_name: str, param_values: Union[List, Tuple], parallel: bool = True, filename: str = "example1.json", verbose: int = 0, *args, **kwargs ) -> "MetaPanda": """Provides chunked automatic {.json, .csv} caching for `turb.MetaPanda` or `pd.DataFrame`. .. note:: custom function must return a `pd.DataFrame` or `turb.MetaPanda` object. Parameters -------- func : function A custom function returning the pd.DataFrame/MetaPanda param_name : str The keyword name of the parameter in question to iterate over param_values : list/tuple of something The values associated with the parameter to iterate over parallel : bool, default=True Determines whether to use `joblib` to compute independent chunks in parallel or not filename : str, optional The name of the file to cache to, or read from. This is fixed. Accepts {'json', 'csv'} formats. verbose : int, optional If > 0, prints out useful information *args : list, optional Arguments to pass to function(...) **kwargs : dict, optional Keyword arguments to pass to function(...) Warnings -------- FutureWarning Returned object from cache isn't of type {pd.DataFrame, MetaPanda} Raises ------ TypeError `filename` isn't of type `str` ValueError `filename` extension isn't found in {'json', 'csv'} Returns ------- mp : MetaPanda The MetaPanda object See Also -------- cached : Provides automatic {.json, .csv} caching for `turb.MetaPanda` or `pd.DataFrame`. """ # check it is string instance_check(filename, str) instance_check(param_name, str) instance_check(param_values, (list, tuple, dict)) if not callable(func): raise ValueError("function is not callable") # check that file ends with json or csv belongs(filename.rsplit(".", 1)[-1], ("json", "csv")) # if the final file exists, perform as normal. if os.path.isfile(filename): if verbose > 0: print("reading in cached file: {}".format(filename)) # read it in mdf = read(filename) _set_index_def(mdf.df_) return mdf else: # create a bunch of chunks by repeatedly calling cache. if parallel: _mdf_chunks = joblib.Parallel(joblib.cpu_count())( joblib.delayed(cached)( func, insert_suffix(filename, "_chunk%d" % i), verbose=verbose, *args, **dictcopy(kwargs, {param_name: chunk}) ).df_ for i, chunk in enumerate(param_values) ) else: _mdf_chunks = [ cached( func, insert_suffix(filename, "_chunk%d" % i), verbose=verbose, *args, **dictcopy(kwargs, {param_name: chunk}) ).df_ for i, chunk in enumerate(param_values) ] # join together the chunks mpf = _stack_rows(_mdf_chunks) # save file - return type must be a MetaPanda or error occurs! mpf.write(filename) # now delete the 'chunked' files. for i in range(len(param_values)): os.remove(insert_suffix(filename, "_chunk%d" % i)) return mpf
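# Hypothetical usage sketch (illustration only); `load_year` and its query are made up.
# >>> from turbopanda.dev import cached_chunk
# >>> def load_year(year):
# ...     return some_expensive_query(year)   # must return a DataFrame/MetaPanda
# >>> mdf = cached_chunk(load_year, "year", [2018, 2019, 2020], filename="years.json")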
def scatter_grid( mdf: Union[DataFrame, "MetaPanda"], x: SelectorType, y: SelectorType, arrange: str = "square", plot_size: int = 3, best_fit: bool = True, best_fit_deg: int = 1, savepath: Optional[Union[bool, str]] = None, ): """ Plots a grid of scatter plots comparing each column for MetaPanda in selector to y target value. Parameters -------- mdf : turb.MetaPanda The dataset x : str or list/tuple of str Contains either types, meta column names, column names or regex-compliant strings y : str or list/tuple of str Contains either types, meta column names, column names or regex-compliant strings arrange : str Choose from ['square', 'row', 'column']. Square arranges the plot as square-like as possible. Row prioritises plots row-like, and column-wise for column. plot_size : int The size of each axes best_fit : bool If True, draws a line of best fit best_fit_deg : int, default=1 The degree of the line of best fit, can draw polynomial savepath : None, bool, str saves the figure to file. If bool, uses the name in mdf, else uses given string. Returns ------- None """ from turbopanda.corr import bicorr # checks instance_check((plot_size, best_fit_deg), int) instance_check(savepath, (type(None), str, bool)) instance_check(best_fit, bool) nonnegative(( best_fit_deg, plot_size, )) belongs(arrange, ["square", "row", "column"]) # make a metapanda if we have a dataframe. _mdf = MetaPanda(mdf) if isinstance(mdf, DataFrame) else mdf # get selector x_sel = _mdf.view(x) y_sel = _mdf.view(y) # create a product between x and y and plot prod = list(it.product(x_sel, y_sel)) if len(prod) > 0: fig, axes = gridplot(len(prod), arrange, ax_size=plot_size) for i, (_x, _y) in enumerate(prod): # pair x, y __x, __y = remove_na(_mdf[_x].values, _mdf[_y].values, paired=True) axes[i].scatter(__x.flatten(), __y, alpha=0.5) # line of best fit if best_fit: xn = np.linspace(__x.min(), __x.max(), 100) z = np.polyfit(__x.flatten(), __y, deg=best_fit_deg) axes[i].plot(xn, np.polyval(z, xn), "k--") # spearman correlation pair_corr = bicorr(_mdf[_x], _mdf[_y]).loc["spearman", "r"] axes[i].set_title("r={:0.3f}".format(pair_corr)) axes[i].set_xlabel(_x) axes[i].set_ylabel(_y) fig.tight_layout() if isinstance(savepath, bool): save(fig, "scatter", _mdf.name_) elif isinstance(savepath, str): save(fig, "scatter", _mdf.name_, fp=savepath)
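# Hypothetical usage sketch (illustration only); dataset and selector strings are made up.
# >>> import turbopanda as turb
# >>> from turbopanda.plot import scatter_grid
# >>> mdf = turb.read("example.json")
# >>> scatter_grid(mdf, x="X|M", y="Y", best_fit=True, best_fit_deg=2)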
def cached( func: Callable, filename: str = "example1.json", verbose: int = 0, *args, **kwargs ) -> "MetaPanda": """Provides automatic {.json, .csv} caching for `turb.MetaPanda` or `pd.DataFrame`. .. note:: this is a direct-call cache function. Not cached. For example, we call `cached` as a wrapper to our custom function: >>> import turbopanda as turb >>> def f(x): ... return turb.MetaPanda(x) >>> data = cached(f, 'meta_file.json') .. note:: custom function must return a `pd.DataFrame` or `turb.MetaPanda` object. Parameters -------- func : function A custom function returning the pd.DataFrame/MetaPanda filename : str, optional The name of the file to cache to, or read from. This is fixed. Accepts {'json', 'csv'} formats. verbose : int, optional If > 0, prints out useful information *args : list, optional Arguments to pass to function(...) **kwargs : dict, optional Keyword arguments to pass to function(...) Warnings -------- FutureWarning Returned object from cache isn't of type {pd.DataFrame, MetaPanda} Raises ------ TypeError `filename` isn't of type `str` ValueError `filename` extension isn't found in {'json', 'csv'} Returns ------- mp : MetaPanda The MetaPanda object See Also -------- cache : Provides automatic {.json, .csv} decorator caching for `turb.MetaPanda` or `pd.DataFrame`. """ # check it is string instance_check(filename, str) instance_check(verbose, int) instance_check(func, "__call__") # check that file ends with json or csv belongs(filename.rsplit(".", 1)[-1], ("json", "csv")) if os.path.isfile(filename): if verbose > 0: print("reading in cached file: {}".format(filename)) # read it in mdf = read(filename) _set_index_def(mdf.df_) return mdf else: if verbose > 0: print("running function '{}' for cache".format(func.__name__)) # returns MetaPanda or pandas.DataFrame mpf = func(*args, **kwargs) if isinstance(mpf, MetaPanda): # save file mpf.write(filename) return mpf elif isinstance(mpf, DataFrame): # save - bumping index into the file. mpf.reset_index().to_csv(filename, index=None) return MetaPanda(mpf) else: if verbose > 0: print( "returned object from cache not of type [DataFrame, MetaPanda], not cached" ) return mpf
def correlate(
    data: Union[pd.DataFrame, MetaPanda],
    x: Optional[SelectorType] = None,
    y: Optional[SelectorType] = None,
    covar: Optional[SelectorType] = None,
    cartesian_covar: bool = False,
    output: str = "full",
    method: str = "spearman",
    verbose: int = 0,
) -> pd.DataFrame:
    """Correlates X and Y together to generate a list of correlations.

    If X/Y are MetaPandas, returns a MetaPanda object, else returns pandas.DataFrame

    Parameters
    ----------
    data : pd.DataFrame / MetaPanda
        The full dataset.
    x : (str, list, tuple, pd.Index), optional
        Subset of input(s) for column names.
        if None, uses the full dataset. Y must be None in this case also.
    y : (str, list, tuple, pd.Index), optional
        Subset of output(s) for column names.
        if None, uses the full dataset (from optional `x` subset)
    covar : (str, list, tuple, pd.Index), optional
        set of covariate(s). Covariates are needed to compute partial correlations.
        If None, uses standard correlation.
    cartesian_covar : bool, default=False
        If True, and if covar is not None, separates every element in covar to
        individually control for using the cartesian product
    output : str, default="full"
        Choose from {'full', 'score'}. Score just returns `r` number.
    method : str, default="spearman"
        Method to correlate with. Choose from ::
            'pearson' : Pearson product-moment correlation
            'spearman' : Spearman rank-order correlation
            'kendall' : Kendall’s tau (ordinal data)
            'biserial' : Biserial correlation (continuous and boolean data only)
            'percbend' : percentage bend correlation (robust)
            'shepherd' : Shepherd's pi correlation (robust Spearman)
            'skipped' : skipped correlation (robust Spearman, requires sklearn)
    verbose : int, default=0
        If > 0, prints out useful debugging messages

    Returns
    -------
    R : pd.DataFrame
        correlation rows (based on pingouin structure)

    Examples
    --------
    >>> import turbopanda as turb
    >>> data = turb.read('example.json')
    >>> R = turb.correlate(data)  # uses full dataset
                 X         M         Y      Mbin      Ybin
    X     1.000000  0.392251  0.059771 -0.014405 -0.149210
    M     0.392251  1.000000  0.545618 -0.015622 -0.094309
    Y     0.059771  0.545618  1.000000 -0.007009  0.161334
    Mbin -0.014405 -0.015622 -0.007009  1.000000 -0.076614
    Ybin -0.149210 -0.094309  0.161334 -0.076614  1.000000
    >>> R = turb.correlate(data, x=('X', 'M', 'Y'))  # uses subset of dataset
              X         M         Y
    X  1.000000  0.392251  0.059771
    M  0.392251  1.000000  0.545618
    Y  0.059771  0.545618  1.000000

    # correlates X columns against Ybin
    >>> R = turb.correlate(data, x=('X', 'M', 'Y'), y='Ybin')
                 X         M         Y
    Ybin  1.000000  0.392251  0.059771

    # correlates X against Ybin controlling for Y
    >>> R = turb.correlate(data, x='X', y='Ybin', covar='Y')
                 X
    Ybin -0.149210

    # using a different technique
    >>> R = turb.correlate(data, method="shepherd")
                 X         M         Y      Mbin      Ybin
    X     1.000000  0.392251  0.059771 -0.014405 -0.149210
    M     0.392251  1.000000  0.545618 -0.015622 -0.094309
    Y     0.059771  0.545618  1.000000 -0.007009  0.161334
    Mbin -0.014405 -0.015622 -0.007009  1.000000 -0.076614
    Ybin -0.149210 -0.094309  0.161334 -0.076614  1.000000
    """
    # data cannot be NONE
    instance_check(data, (pd.DataFrame, MetaPanda))
    instance_check((x, y, covar), (type(None), str, list, tuple, pd.Index))
    instance_check(cartesian_covar, bool)
    belongs(
        method,
        (
            "pearson",
            "spearman",
            "kendall",
            "biserial",
            "percbend",
            "shepherd",
            "skipped",
        ),
    )
    belongs(output, ("full", "score"))
    bounds_check(verbose, 0, 4)

    # downcast to dataframe option
    df = data.df_ if not isinstance(data, pd.DataFrame) else data
    # downcast if list/tuple/pd.index is of length 1
    x = x[0] if (isinstance(x, (tuple, list, pd.Index)) and len(x) == 1) else x
    y = y[0] if (isinstance(y, (tuple, list, pd.Index)) and len(y) == 1) else y
    # convert using `view` if we have string instances.
    if isinstance(x, str):
        x = pattern(x, df.columns)
    if isinstance(y, str):
        y = pattern(y, df.columns)
    if isinstance(covar, str):
        covar = pattern(covar, df.columns)

    # perform a check to make sure every column in `covar` is continuous.
    if covar is not None:
        if not is_dataframe_float(data[covar]):
            raise TypeError(
                "`covar` variables in `correlate` all must be of type `float`/continuous."
            )

    # execute various use cases based on the presence of x, y, and covar, respectively.
    if x is None and y is None:
        # here just perform matrix-based correlation
        comb = it.combinations_with_replacement(df.columns, 2)
        niter = (df.columns.shape[0] ** 2) // 2 + (df.columns.shape[0] // 2)
    elif isinstance(x, (list, tuple, pd.Index)) and y is None:
        # use a subset of x, in union with covar
        comb = it.combinations_with_replacement(x, 2)
        niter = (len(x) ** 2) // 2 + (len(x) // 2)
    elif isinstance(x, (list, tuple, pd.Index)) and isinstance(y, str):
        # list of x, y str -> matrix-vector cartesian product
        comb = it.product(x, [y])
        niter = len(x)
    elif isinstance(y, (list, tuple, pd.Index)) and isinstance(x, str):
        # list of y, x str -> matrix-vector cartesian product
        comb = it.product(y, [x])
        niter = len(y)
    elif isinstance(x, (list, tuple, pd.Index)) and isinstance(
        y, (list, tuple, pd.Index)
    ):
        # list of x, y -> cartesian product of x: y terms
        comb = it.product(x, y)
        niter = len(x) * len(y)
    else:
        raise ValueError("X: {}; Y: {}; Z: {} combination unknown.".format(x, y, covar))

    # return the combination of these effects.
    return _corr_combination(
        df, comb, niter, covar, cartesian_covar, method, output, verbose
    )