def dummy_categorical(cat: pd.Series) -> pd.DataFrame:
    """Given a pd.Series of type 'category', return boolean dummies as a matrix."""
    instance_check(cat, pd.Series)
    if cat.dtype == "category":
        # 'np.bool' was removed in NumPy >= 1.24; the builtin `bool` works identically here
        return pd.get_dummies(cat).add_prefix("is_").astype(bool)
    else:
        raise TypeError("'cat' Series is {}, not of type 'category'".format(cat.dtype))
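# Example (a hedged sketch of typical usage of `dummy_categorical` above):
#   >>> s = pd.Series(["a", "b", "a"], dtype="category")
#   >>> dummy_categorical(s)
#        is_a   is_b
#   0   True  False
#   1  False   True
#   2   True  False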
def shorten(s, newl: int = 15, strategy: str = "middle"):
    """Shortens a string or array of strings to length `newl`.

    Parameters
    ----------
    s : str / list of str / np.ndarray / pd.Series / pd.Index
        The string or list of strings to shorten
    newl : int, default=15
        The number of characters to preserve (5 on each side + spaces)
    strategy : str, default="middle"
        Choose from {'middle', 'end'}; determines where the dots are placed

    Returns
    -------
    ns : str / list of str
        A shortened string or array of strings
    """
    instance_check(s, (str, list, tuple, np.ndarray, Series, Index))
    instance_check(newl, int)
    belongs(strategy, ("middle", "end"))

    if isinstance(s, str):
        return _shorten_string(s, newl, strategy)
    else:
        # create a partial passing in keyword arguments to every call.
        _shorten_part = partial(_shorten_string, approp_len=newl, strategy=strategy)
        # map through the strings and shorten them.
        return list(map(_shorten_part, s))
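# Example (a hedged sketch): middle-shortening keeps both ends of each string.
#   >>> shorten("transcription_factor_pathway", newl=15)
#   'transc..athway'
#   >>> shorten(["a_very_long_column_name", "ok"], newl=10, strategy="end")
#   ['a_very_l..', 'ok']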
def cat_array_to_color(array, cmap="Blues", rstart=0., rend=1.):
    """Given some list/array of values, find some way of mapping this to colour values.

    Parameters
    ----------
    array : np.ndarray
        An array of values.
    cmap : str, dict, list-like
        Either a string or an array referencing the 'unique' colours.
        If dict (k = name, v = hex color)
    rstart : float [0..1]
        The start of the color range
    rend : float [0..1]
        The end of the color range
    """
    instance_check(cmap, (str, dict, list, tuple))
    # map to numpy
    _array = (
        np.asarray(array).flatten()
        if not isinstance(array, (np.ndarray, pd.Series))
        else array
    )
    # if boolean, cast as a 'string' ('np.str' was removed in NumPy >= 1.24)
    if _array.dtype.kind == "b":
        _array = _array.astype(str)
    if (_array.dtype.kind == "U") | (_array.dtype.kind == "O"):
        # i.e. we have a string array
        name_uniq = unique_ordered(_array)
        col_uniq = _colormap_to_hex(
            cm.get_cmap(cmap)(np.linspace(rstart, rend, len(name_uniq)))
        )
        name_col_map = dict(zip(name_uniq, col_uniq))
        # substitute each name for its mapped hex colour
        cl = [name_col_map[s] for s in _array]
        return np.asarray(cl), "discrete"
    else:
        return _array, "continuous"
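# Example (a hedged sketch): string/boolean arrays map to discrete hex colours,
# numeric arrays pass straight through as a continuous scale.
#   >>> cat_array_to_color(np.array(["lo", "hi", "lo"]), cmap="Blues")
#   (array(['#...', '#...', '#...'], dtype='<U7'), 'discrete')
#   >>> cat_array_to_color(np.array([0.1, 0.5, 0.9]))
#   (array([0.1, 0.5, 0.9]), 'continuous')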
def common_substrings(
    a: Union[str, List[str]],
    b: Optional[Union[str, List[str]]] = None,
    min_length: int = 2,
) -> Union[str, Series]:
    """Given at least one pair of strings, find all the best common substring matches.

    By default, if only `a` is passed, the pairwise combinations between all values
    in the list are used; otherwise with `a` + `b`, the cartesian product of the
    two lists is used.

    Parameters
    ----------
    a : str/list of str
        A word or list of words to find the common substring to
    b : str/list of str, optional
        A word or list of words to find the common substring to.
        If None, pairwise combinations in `a` are used
    min_length : int, default=2
        The minimum accepted length of string for a given pair

    Returns
    -------
    z_up : str/Series
        str returned if (a, b) are strs, else Series of value counts
    """
    instance_check(a, (str, list, tuple, Index))
    instance_check(b, (type(None), str, list, tuple, Index))
    nonnegative(min_length, int)
    # prevent a case where a can be a str, b is None
    disallow_instance_pair(a, str, b, type(None))

    filters = ("", "_", "__", "-")

    if isinstance(a, str) and b is None:
        return a
    elif isinstance(a, str) and isinstance(b, str):
        return _single_common_substring_match(a, b)
    else:
        if isinstance(a, str):
            a = [a]
        elif isinstance(b, str):
            b = [b]
        # determine pair set.
        if b is None:
            # combination iterator
            pair_groups = it.combinations(a, 2)
        else:
            # cartesian product iterator
            pair_groups = it.product(a, b)
        # generate pairs
        z = [_single_common_substring_match(i, j) for i, j in pair_groups]

        def filter_func(x):
            """Filters out junk elements, too-short strings and single occurrences."""
            return (x in filters) or (len(x) < min_length) or (z.count(x) <= 1)

        # filter out naff elements
        z_up = list(it.filterfalse(filter_func, z))
        # save as series value counts.
        return Series(z_up, dtype=object).value_counts()
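# Example (a hedged sketch): pairwise common substrings across one list; the exact
# value counts depend on the private `_single_common_substring_match` helper.
#   >>> common_substrings(["gene_x1", "gene_x2", "gene_y1"])
#   gene_    3
#   dtype: int64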
def string_replace(strings: Union[str, List[str], Tuple[str, ...], Series, Index],
                   *operations: Tuple[str, str]):
    """Performs all replace operations on the strings, in order.

    By default, if `operations` is a single list of these tuples, it will work also.

    Parameters
    ----------
    strings : str, list, tuple, Series, Index
        A string or set of strings to possibly rename
    operations : arguments of tuple-2 (before, after) strings
        The 'before' string chooses a subsection to change, and 'after' is the replacement

    Returns
    -------
    strings_new : list, tuple, Series, Index
        The replaced-string array

    Examples
    --------
    This function allows you to perform an ordered set of changes to an array of strings:

    >>> from turbopanda.str import string_replace as strrepl
    >>> strrepl(['hello', 'i am', 'pleased'], ("i", "u"))
    ['hello', 'u am', 'pleased']

    We can perform multiple changes:

    >>> strrepl(['hello', 'i am', 'pleased'], ("i", "you"), ("am", "are"))
    ['hello', 'you are', 'pleased']

    The changes also obey the order, so potentially the same string can be changed
    more than once:

    >>> strrepl(['hello', 'i am', 'pleased'], ("hello", "goodbye"), ("good", "bad"))
    ['badbye', 'i am', 'pleased']

    Note that for backwards compatibility, if there is only one argument and it is a
    list, it is treated as a stack of operation tuples to parse:

    >>> strrepl(['hello', 'i am', 'pleased'], [("hello", "goodbye"), ("good", "bad")])
    ['badbye', 'i am', 'pleased']

    See Also
    --------
    re.search
    turbopanda.str.pattern
    """
    # perform check
    instance_check(strings, (str, list, tuple, Series, Index))
    # convert single list to op list
    if len(operations) == 1 and isinstance(operations[0], list):
        operations = operations[0]

    if isinstance(strings, str):
        return reduce(lambda sold, arg: sold.replace(*arg), [strings, *operations])
    elif len(strings) == 0:
        return strings
    else:
        strings_new = list(
            reduce(
                lambda sold, arg: map(lambda s: s.replace(*arg), sold),
                [strings, *operations],
            )
        )
        return transform_copy(strings, strings_new)
def test_instance_check(self):
    # single example
    x = ['abba', 'father', 'cross']
    assert utils.instance_check(x, list)
    y = np.array([1, 2, 3], dtype=float)
    assert utils.instance_check(y, np.ndarray)
    # multiples given a tuple
    ij = True
    ji = False
    assert utils.instance_check((ij, ji), bool)
def multivariate_gaussians(n, k, C=0.5):
    """Creates k multivariate Gaussian distributions with sample size n, according to correlations C.

    All Gaussians have mu = 0, sigma = [ratio of C].

    Parameters
    ----------
    n : int
        Sample size
    k : int, list of int
        Dimensionality of one group (int) or each group of multivariate Gaussians (list of int)
    C : float, list of float
        Correlation strength [-1...1] for all groups (float) or each group (list of float)

    Returns
    -------
    X : np.ndarray (n, sum(k))
        Multivariate Gaussian synthetic data
    """
    if n < 1:
        raise ValueError("'n' must be > 0")
    # 'np.int'/'np.float' aliases were removed in NumPy >= 1.24;
    # accept builtins and NumPy scalar types instead
    instance_check(k, (int, np.integer, list, tuple))
    instance_check(C, (float, np.floating, list, tuple))
    # if C is a list, ensure k and C are the same length
    if isinstance(C, (list, tuple)) and isinstance(k, (list, tuple)):
        arrays_equal_size(k, C)
    # handle single k case
    if isinstance(k, (int, np.integer)):
        # just one gaussian group
        if isinstance(C, (float, np.floating)):
            return np.random.multivariate_normal(
                np.zeros(k), covariance_matrix(k, C, random_direction=False), size=n
            )
        else:
            raise ValueError("'C' must be of type 'float' when 'k' is of type 'int'")
    else:
        # must be a list, iterate over it
        result = []
        for i, p in enumerate(k):
            mu = np.zeros(p)
            # collect the correlation ratio if it's a float, or from a list
            c = C if isinstance(C, (float, np.floating)) else C[i]
            # compute covariance matrix
            cov = covariance_matrix(p, c, random_direction=False)
            # make data
            X = np.random.multivariate_normal(mu, cov, size=n)
            result.append(X)
        return np.hstack(result)
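# Example (a hedged sketch): two groups of correlated Gaussians, stacked column-wise.
#   >>> X = multivariate_gaussians(n=200, k=[3, 4], C=[0.2, 0.8])
#   >>> X.shape
#   (200, 7)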
def _shorten_string(s: str, approp_len: int = 15, strategy: str = "middle") -> str:
    instance_check(s, str)
    if len(s) <= approp_len:
        return s
    elif strategy == "end":
        return s[:approp_len - 2] + ".."
    elif strategy == "middle":
        midpoint = (approp_len - 2) // 2
        return s[:midpoint] + ".." + s[-midpoint:]
    else:
        # only 'middle' and 'end' are implemented
        raise ValueError("strategy '{}' not in {}".format(strategy, ("middle", "end")))
def dtypes(self, grouped: bool = True) -> Union[pd.Series, pd.DataFrame]:
    """Determine the grouped data types in the dataset.

    Parameters
    ----------
    grouped : bool, optional
        If True, returns the value_counts of each data type, else returns the direct types.

    Returns
    -------
    true_types : pd.Series/pd.DataFrame
        A series of index (group/name) and value (count/type)
    """
    instance_check(grouped, bool)
    return (
        self.meta_["true_type"].value_counts()
        if grouped
        else self.meta_["true_type"]
    )
def expand(self, column: str, sep: str = ","):
    """Expand out a 'stacked' id column to a longer-form DataFrame.

    Expands out a 'stacked' id column to a longer-form DataFrame,
    and re-merges the remaining data back in.

    Parameters
    ----------
    column : str
        The name of the column to expand, must be of datatype [object]
    sep : str, optional
        The separating string to use.

    Raises
    ------
    ValueError
        If `column` is not found in `df_` or `meta_`, or `column` is not stackable

    Returns
    -------
    self

    See Also
    --------
    shrink : Contracts an expanded id column back into a shorter-form DataFrame.
    """
    instance_check((column, sep), str)
    if column not in self.df_.columns:
        raise ValueError("column '{}' not found in df".format(column))

    self._df = pd.merge(
        # expand out id column
        self.df_[column].str.strip().str.split(sep).explode(),
        self.df_.dropna(subset=[column]).drop(column, axis=1),
        left_index=True,
        right_index=True,
    )
    self._df.columns.name = "colnames"
    return self
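# Example (a hedged sketch; assumes a MetaPanda wrapping a stacked id column):
#   >>> mdf = MetaPanda(pd.DataFrame({"ids": ["a,b", "c"], "val": [1, 2]}))
#   >>> mdf.expand("ids").df_
#     ids  val
#   0   a    1
#   0   b    1
#   1   c    2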
def apply(self, f_name: str, *f_args, **f_kwargs):
    """Apply a `pd.DataFrame` function to `df_`.

    e.g. mdf.apply("groupby", ["counter", "refseq_id"], as_index=False)
    applies self.df_.groupby() to the data, and the return value is stored in df_;
    assumes a pandas.DataFrame is returned.

    Parameters
    ----------
    f_name : str
        The name of the function
    f_args : list/tuple, optional
        Arguments to pass to the function
    f_kwargs : dict, optional
        Keyword arguments to pass to the function

    Returns
    -------
    self
    """
    instance_check(f_name, str)
    self._apply_function(f_name, *f_args, **f_kwargs)
    return self
def kde2d(X: Union[np.ndarray, Series, List, Tuple],
          Y: Union[np.ndarray, Series, List, Tuple],
          c: str = "red",
          ax: mpl.axes.Axes = None,
          fill: bool = False,
          with_scatter: bool = False,
          **contour_kwargs):
    """TODO: Generates a 2D KDE using contours."""
    instance_check((X, Y), (list, tuple, np.ndarray, Series))
    instance_check(c, str)
    instance_check((fill, with_scatter), bool)
    # `ax` may legitimately be None here; we create one below in that case
    instance_check(ax, (type(None), mpl.axes.Axes))
    arrays_equal_size(X, Y)

    # calculate density
    _X, _Y = remove_na(np.asarray(X), np.asarray(Y), paired=True)
    H = density(_X, _Y)
    offx = np.abs(_X.max() - _X.min()) / 15.0
    offy = np.abs(_Y.max() - _Y.min()) / 15.0
    _alpha = 0.5 if with_scatter else 1.0
    extent = (_X.min() - offx, _X.max() + offx, _Y.min() - offy, _Y.max() + offy)

    if ax is None:
        fig, ax = plt.subplots(figsize=(8, 5))

    if fill:
        # note: contourf/contour take `colors`, not `color`
        ax.contourf(H, extent=extent, colors=c, alpha=_alpha)
    else:
        cset = ax.contour(H, extent=extent, colors=c, **contour_kwargs)
        ax.clabel(cset, inline=1, fontsize=10)

    if with_scatter:
        ax.scatter(_X, _Y, c=c, alpha=_alpha)

    return ax
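# Example (a hedged sketch): filled 2D KDE of two correlated variables.
#   >>> rng = np.random.default_rng(0)
#   >>> x = rng.normal(size=500)
#   >>> y = 0.5 * x + rng.normal(scale=0.5, size=500)
#   >>> kde2d(x, y, c="blue", fill=True, with_scatter=True)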
def density(
    X: np.ndarray,
    Y: Optional[np.ndarray] = None,
    Z: Optional[np.ndarray] = None,
    r: Optional[int] = None,
) -> np.ndarray:
    """Estimates the density of X using binning; accepts np.ndarray.

    Parameters
    ----------
    X : np.ndarray (n,)
        The first dimension
    Y : np.ndarray (n,), optional
        The second dimension
    Z : np.ndarray (n,), optional
        The third dimension
    r : int, optional
        The number of bins for each dimension.
        If None, uses the Freedman-Diaconis rule

    Returns
    -------
    d : np.ndarray (r,...)
        The density in binned dimensions
    """
    instance_check(X, np.ndarray)
    instance_check((Y, Z), (type(None), np.ndarray))
    instance_check(r, (type(None), int))

    if r is None:
        r = min(freedman_diaconis_bins(X), 50)
    else:
        nonnegative(r, int)

    if Y is None and Z is None:
        _X = remove_na(X)
        return np.histogram(_X, bins=r, density=True)[0]
    elif Z is None:
        _X, _Y = remove_na(X, Y, paired=True)
        return np.histogram2d(_X, _Y, bins=(r, r), density=True)[0]
    else:
        return np.histogramdd(np.vstack((X, Y, Z)).T, bins=(r, r, r), density=True)[0]
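# Example (a hedged sketch): 1D and 2D binned density estimates.
#   >>> x = np.random.normal(size=1000)
#   >>> d1 = density(x, r=20)       # shape (20,)
#   >>> d2 = density(x, x, r=20)    # shape (20, 20)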
def cachedec(
    _func=None,
    *,
    filename: str = "example1.pkl",
    compress: int = 0,
    return_as: str = "MetaPanda"
) -> Callable:
    """Provides automatic decorator caching for objects.

    Especially compatible with `turb.MetaPanda` or `pd.DataFrame`.

    .. note:: this is a decorator function, not to be called directly.
        All parameters must be passed as keyword arguments.

    Parameters
    ----------
    _func
    filename : str, optional
        The name of the file to cache to, or read from. This is fixed.
        Accepts {'json', 'csv', 'pkl'} extensions only.
    compress : int [0-9] or 2-tuple, optional
        Optional compression level for the data. 0 or False is no compression.
        Higher value means more compression, but also slower read and
        write times. Using a value of 3 is often a good compromise.
        See the notes for more details.
        If compress is True, the compression level used is 3.
        If compress is a 2-tuple, the first element must correspond to a string
        between supported compressors (e.g. 'zlib', 'gzip', 'bz2', 'lzma', 'xz'),
        the second element must be an integer from 0 to 9, corresponding to the
        compression level.
    return_as : str, default="MetaPanda"
        Accepts {'pandas', 'MetaPanda'}.
        Only applies if filename is "csv" or "json". Attempts to cast the return
        object as something palatable to the user.

    Warnings
    --------
    ImportWarning
        Returned object from cache isn't of type {pd.DataFrame, MetaPanda}

    Raises
    ------
    TypeError
        `filename` isn't of type `str`
    ValueError
        `filename` extension isn't found in {'json', 'csv', 'pkl'}

    Returns
    -------
    mp : turb.MetaPanda / object
        The MetaPanda object if {'csv' or 'json'}, otherwise uses serialized
        pickling which can return an arbitrary object.

    Examples
    --------
    For example, we call as a decorator to our custom function
    (note that `filename` must be passed by keyword):

    >>> from turbopanda import cachedec
    >>> @cachedec(filename='meta_file.json')
    >>> def f(x):
    ...     return turb.MetaPanda(x)

    These also work with numpy arrays or python objects by using `joblib`:

    >>> @cachedec(filename="meta.pkl")
    >>> def g(x):
    ...     return [1, 2, [3, 4], {"hi": "moto"}]
    """
    # check it is string
    instance_check(filename, str)
    file_ext = filename.rsplit(".")[-1]
    # check that file ends with json, csv or pkl
    belongs(file_ext, ("json", "csv", "pkl"))

    # define decorator
    def _decorator_cache(func):
        """Basic decorator."""

        @functools.wraps(func)
        def _wrapper_cache(*args, **kwargs):
            # if we find the file
            if os.path.isfile(filename):
                # if it's .csv or .json, use `read`
                if file_ext in ("json", "csv"):
                    # read it in
                    mdf = read(filename)
                    _set_index_def(mdf.df_)
                    return mdf if return_as == "MetaPanda" else mdf.df_
                elif is_joblib_installed(raise_error=True):
                    import joblib

                    return joblib.load(filename)
            else:
                # returns MetaPanda or pandas.DataFrame
                mpf = func(*args, **kwargs)
                if isinstance(mpf, MetaPanda):
                    # save file
                    mpf.write(filename)
                    return mpf if return_as == "MetaPanda" else mpf.df_
                elif isinstance(mpf, DataFrame):
                    # save - bumping index into the file.
                    mpf.reset_index().to_csv(filename, index=None)
                    return MetaPanda(mpf) if return_as == "MetaPanda" else mpf
                elif is_joblib_installed(raise_error=True):
                    import joblib

                    # attempt to use joblib to dump
                    joblib.dump(mpf, filename, compress=compress)
                    return mpf

        return _wrapper_cache

    if _func is None:
        return _decorator_cache
    else:
        return _decorator_cache(_func)
def cached_chunk(
    func: Callable,
    param_name: str,
    param_values: Union[List, Tuple],
    parallel: bool = True,
    filename: str = "example1.json",
    verbose: int = 0,
    *args,
    **kwargs
) -> "MetaPanda":
    """Provides chunked automatic {.json, .csv} caching for `turb.MetaPanda` or `pd.DataFrame`.

    .. note:: custom function must return a `pd.DataFrame` or `turb.MetaPanda` object.

    Parameters
    ----------
    func : function
        A custom function returning the pd.DataFrame/MetaPanda
    param_name : str
        The keyword name of the parameter in question to iterate over
    param_values : list/tuple of something
        The values associated with the parameter to iterate over
    parallel : bool, default=True
        Determines whether to use `joblib` to compute independent chunks in parallel or not
    filename : str, optional
        The name of the file to cache to, or read from. This is fixed.
        Accepts {'json', 'csv'} formats.
    verbose : int, optional
        If > 0, prints out useful information
    *args : list, optional
        Arguments to pass to function(...)
    **kwargs : dict, optional
        Keyword arguments to pass to function(...)

    Warnings
    --------
    FutureWarning
        Returned object from cache isn't of type {pd.DataFrame, MetaPanda}

    Raises
    ------
    TypeError
        `filename` isn't of type `str`
    ValueError
        `filename` extension isn't found in {'json', 'csv'}

    Returns
    -------
    mp : MetaPanda
        The MetaPanda object

    See Also
    --------
    cached : Provides automatic {.json, .csv} caching for `turb.MetaPanda` or `pd.DataFrame`.
    """
    # check it is string
    instance_check(filename, str)
    instance_check(param_name, str)
    instance_check(param_values, (list, tuple, dict))
    if not callable(func):
        raise ValueError("function is not callable")
    # check that file ends with json or csv
    belongs(filename.rsplit(".", 1)[-1], ("json", "csv"))

    # if the final file exists, perform as normal.
    if os.path.isfile(filename):
        if verbose > 0:
            print("reading in cached file: {}".format(filename))
        # read it in
        mdf = read(filename)
        _set_index_def(mdf.df_)
        return mdf
    else:
        # create a bunch of chunks by repeatedly calling cache.
        if parallel:
            # note: `.df_` must be taken *after* Parallel runs; delayed(...)(...)
            # returns a (func, args, kwargs) tuple which has no `.df_` attribute
            _results = joblib.Parallel(joblib.cpu_count())(
                joblib.delayed(cached)(
                    func,
                    insert_suffix(filename, "_chunk%d" % i),
                    verbose=verbose,
                    *args,
                    **dictcopy(kwargs, {param_name: chunk})
                )
                for i, chunk in enumerate(param_values)
            )
            _mdf_chunks = [r.df_ for r in _results]
        else:
            _mdf_chunks = [
                cached(
                    func,
                    insert_suffix(filename, "_chunk%d" % i),
                    verbose=verbose,
                    *args,
                    **dictcopy(kwargs, {param_name: chunk})
                ).df_
                for i, chunk in enumerate(param_values)
            ]
        # join together the chunks
        mpf = _stack_rows(_mdf_chunks)
        # save file - return type must be a MetaPanda or error occurs!
        mpf.write(filename)
        # now delete the 'chunked' files.
        for i in range(len(param_values)):
            os.remove(insert_suffix(filename, "_chunk%d" % i))
        return mpf
def learning(
    df: "MetaPanda",
    y: str,
    x: Optional[SelectorType] = None,
    train_n: Optional[np.ndarray] = None,
    permute_n: int = 0,
    cv: Tuple[int, int] = (5, 15),
    model: str = "LinearRegression",
    cache: Optional[str] = None,
    plot: bool = False,
    verbose: int = 0,
    **model_kws
):
    """Fits a basic model to generate cross-validated training/test scores for
    different training set sizes.

    A cross-validation generator splits the whole dataset `k` times in training
    and test data. Subsets of the training set with varying sizes will be used to
    train the estimator, and a score for each training subset size and the test
    set will be computed. Afterwards, the scores will be averaged over all `k`
    runs, for each training subset size.

    Parameters
    ----------
    df : MetaPanda (n_samples, n_features)
        The main dataset.
    y : str
        A selected y column.
    x : list/tuple of str/selector, optional
        A list of selected column names for x or MetaPanda `selector`.
    train_n : int/array-like, with shape (n_ticks,), dtype float or int, optional
        Relative or absolute numbers of training examples that will be used to
        generate learning curve related data.
        If None: uses `linspace(.1, .9, 8)`
        If int: uses `linspace(.1, .9, n)`
    permute_n : int, default=0
        The number of times to permute y; if > 0, then does full permutation
        analysis (making 4th plot)
    cv : int/tuple, default=(5, 15)
        If int: just reflects number of cross-validations
        If tuple: (cross_validation, n_repeats) for `RepeatedKFold`
    model : str/estimator, sklearn model implementing `fit` and `predict` methods
        The name of a scikit-learn model, or the model object itself.
    cache : str, optional
        TODO: Not Implemented yet.
        If not None, stores the resulting model parts in JSON and reloads if present.
    plot : bool, optional
        If True, produces `.plot.learning_curve` inplace.
    verbose : int, optional
        If > 0, prints out statements depending on level.

    Other Parameters
    ----------------
    model_kws : dict, optional
        Keywords to pass to the sklearn model which are not parameterized.

    Returns
    -------
    results : MetaPanda (n_ticks, 8)
        The results matrix of mean and std scores
    permute_ : np.ndarray (permute_n,), optional
        The permutation scores associated with the permutation analysis

    Notes
    -----
    Shorthand names for the models, i.e. `lm` for LinearRegression or `gauss`
    for a GaussianProcessRegressor, are accepted.

    By default, `fit_learning` uses the root mean squared error (RMSE).
    There is currently no option to change this.

    By default, this model assumes you are working with a regression problem.
    Classification compatibility will arrive in a later version.

    `permute_n` is set to 0 by default; if you want a permutation histogram,
    this value must be > 0.

    See Also
    --------
    fit_basic : Performs a rudimentary fit model with no parameter searching.
    fit_grid : Performs exhaustive grid search analysis on the models selected.

    References
    ----------
    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al.,
       JMLR 12, pp. 2825-2830, 2011.
    """
    # perform checks
    instance_check(df, (pd.DataFrame, MetaPanda))
    instance_check(y, str)
    instance_check(train_n, (type(None), int, list, tuple, np.ndarray))
    instance_check(permute_n, int)
    instance_check(cv, (int, tuple))
    # instance_check(cache, (type(None), str))
    instance_check(plot, bool)
    bounds_check(verbose, 0, 4)

    # set dataset if a pandas object
    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    # retrieve x columns if none
    _xcols = select_xcols(_df, x, y)
    # note the parentheses: without them, the comma binds last and
    # `k` would receive the whole tuple while `repeats` is always 1
    k, repeats = cv if isinstance(cv, tuple) else (cv, 1)
    lm, pkg_name = find_sklearn_model(model, "regression")
    # assign keywords to lm
    lm.set_params(**model_kws)

    if train_n is None:
        train_n = np.linspace(0.1, 0.9, 8)
    elif isinstance(train_n, int):
        train_n = np.linspace(0.1, 0.9, train_n)

    # ml ready
    _x, _y = preprocess_continuous_X_y(_df, _xcols, y)

    rep = RepeatedKFold(n_splits=k, n_repeats=repeats)
    vars_ = learning_curve(
        lm,
        _x,
        _y,
        train_sizes=train_n,
        cv=rep,
        scoring="neg_root_mean_squared_error",
        n_jobs=-2,
        verbose=verbose,
        return_times=True,
    )

    # permutation analysis if permute_n > 0
    if permute_n > 0:
        perm_score_, perm_scorez_, pval = permutation_test_score(
            lm,
            _x,
            _y,
            cv=rep,
            n_permutations=permute_n,
            scoring="neg_root_mean_squared_error",
            n_jobs=-2,
            verbose=verbose,
        )

    # outputs
    output_labels_ = ["train_score", "test_score", "fit_time", "score_time"]
    # format as df
    results = pd.DataFrame(
        # stack them together
        np.hstack(
            (
                np.stack([np.mean(vars_[i], axis=1) for i in range(1, 5)], axis=1),
                np.stack([np.std(vars_[i], axis=1) for i in range(1, 5)], axis=1),
            )
        ),
        columns=list(
            it.chain(
                map(lambda s: "mean_" + s, output_labels_),
                map(lambda s: "std_" + s, output_labels_),
            )
        ),
    )
    # add N column
    results["N"] = vars_[0]

    R = MetaPanda(results)

    if plot and permute_n > 0:
        lcurve(R, perm_scorez_)
    elif plot:
        lcurve(R)
    # return as MetaPanda
    if permute_n > 0:
        return R, perm_score_, perm_scorez_, pval
    else:
        return R
def hist_grid(
    mdf: Union[DataFrame, "MetaPanda"],
    subset: SelectorType,
    arrange: str = "square",
    plot_size: int = 3,
    shared_dist: str = "auto",
    savepath: Optional[Union[str, bool]] = None,
    **hist_kws
):
    """Plots a grid of histograms comparing the distributions in a MetaPanda selector.

    Parameters
    ----------
    mdf : pd.DataFrame / turb.MetaPanda
        The dataset
    subset : str or list/tuple of str
        Contains either types, meta column names, column names or regex-compliant strings
    arrange : str
        Choose from {'square', 'row', 'column'}. 'square' arranges the plot as
        square-like as possible; 'row' prioritises plots row-like, and 'column'
        column-wise.
    plot_size : int, default=3
        The size of each axes
    shared_dist : str/tuple of str/dict, default="auto"
        Determines what KDE to fit to the data; set to None if you don't want one.
        If tuple/list: attempts using these specified distributions
        If dict: maps column name (k) to distribution choice (v)
    savepath : None, bool, str
        Saves the figure to file. If bool, uses the name in mdf, else uses the
        given string. If None, no figure is saved.

    Other Parameters
    ----------------
    hist_kws : dict
        Keywords to pass to `turb.plot.histogram`

    Returns
    -------
    None
    """
    # checks
    instance_check(shared_dist, (type(None), str, list, tuple, dict))
    instance_check(savepath, (type(None), str, bool))
    nonnegative(plot_size, int)
    belongs(arrange, ["square", "row", "column"])
    # make a metapanda if we have a dataframe.
    _mdf = MetaPanda(mdf) if isinstance(mdf, DataFrame) else mdf

    # get selector
    selection = _mdf.view(subset)
    # assuming we've selected something...
    if selection.size > 0:
        fig, axes = gridplot(len(selection), arrange, ax_size=plot_size)

        if not isinstance(shared_dist, dict):
            for i, x in enumerate(selection):
                _ = histogram(
                    _mdf[x].dropna(), ax=axes[i], title=x, kde=shared_dist, **hist_kws
                )
            fig.tight_layout()
        else:
            for i, (x, d) in enumerate(shared_dist.items()):
                _ = histogram(_mdf[x].dropna(), ax=axes[i], title=x, kde=d, **hist_kws)
            # iterate over any 'remaining' columns in selection and handle appropriately
            remaining = difference(selection, tuple(shared_dist.keys()))
            if remaining.shape[0] > 0:
                for i, x in enumerate(remaining):
                    _ = histogram(
                        _mdf[x].dropna(),
                        ax=axes[i + len(shared_dist)],
                        title=x,
                        kde="auto",
                        **hist_kws
                    )
            fig.tight_layout()

        if isinstance(savepath, bool):
            save(fig, "hist", _mdf.name_)
        elif isinstance(savepath, str):
            save(fig, "hist", _mdf.name_, fp=savepath)
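# Example (a hedged sketch): histogram grid over all float columns. `mdf` is a
# placeholder MetaPanda/DataFrame; "norm" assumes the underlying `histogram`
# accepts a scipy-style distribution name for its KDE fit.
#   >>> hist_grid(mdf, "float", arrange="row", shared_dist="norm")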
def overview_pca(
    model,
    distance_color: bool = True,
    labels: Optional[pd.Index] = None,
    cutoff_selection: float = 0.9,
    n_samples_annotate: int = 6,
    n_pcs: int = 5,
    ax_size: int = 4,
):
    """Provides an overview plot from a PCA result.

    Parameters
    ----------
    model : sklearn.decomposition.PCA
        A fitted PCA model.
    distance_color : bool, default=True
        If True, plots the magnitude of each PC as a color
    labels : np.ndarray (n,) of str / pd.Series / list / tuple, optional
        If not None, provides a label for every PC component (dimension),
        and annotates the most 'outlier'-like samples in plot 1
    cutoff_selection : float, default=0.9
        The cutoff for proportional variance to select for
    n_samples_annotate : int, default=6
        Defines the number of labels to show if `labels` is not None in plot 1
    n_pcs : int, default=5
        The number of principal components to consider in plot 3
    ax_size : int, default=4
        The default size for each axes.
    """
    instance_check(distance_color, bool)
    instance_check(labels, (type(None), np.ndarray, pd.Series, pd.Index, list, tuple))
    nonnegative((n_samples_annotate, n_pcs, ax_size), int)

    if labels is not None:
        fig, axes = gridplot(3, ax_size=ax_size)
    else:
        fig, axes = gridplot(2, ax_size=ax_size)
    # clip annotation/PC counts to the fitted number of components
    if n_samples_annotate > model.n_components_:
        n_samples_annotate = model.n_components_ - 1
    if n_pcs > model.n_components_:
        n_pcs = model.n_components_ - 1

    # 1 plot the scatter of PC
    _plot_pca_scatter(model, axes[0], distance_color)
    # 2 plot the line AUC for explained variance
    _explained_variance_plot(model, axes[1], cutoff=cutoff_selection)
    # if annotate, we annotate the scatter plot with samples.
    if labels is not None:
        # check to make sure labels is same length as components
        _annotate_on_magnitude(model, labels, n_samples_annotate, axes[0])
        # 3 plot the top N components by the 'most important eigenvector values'
        _x3, _y3, _sel_labels = _best_principle_eigenvectors(
            model, labels=labels, k=n_samples_annotate, p=n_pcs
        )
        _best_eigenvector_plot(
            _x3, _y3, _sel_labels, axes[-1], nk=(n_samples_annotate, n_pcs)
        )
        axes[-1].set_title("Top {} eigenvectors".format(n_samples_annotate))

    fig.tight_layout()
def bicorr(
    x: pd.Series,
    y: pd.Series,
    method: str = "spearman",
    tail: str = "two-sided",
    output: str = "score",
) -> Union[float, dict]:
    """(Robust) correlation between two variables.

    Adapted from the `pingouin` library, made by Raphael Vallat:
    https://github.com/raphaelvallat/pingouin/blob/master/pingouin/correlation.py

    Parameters
    ----------
    x, y : pd.Series
        First and second set of observations. x and y must be independent.
    method : str
        Specify which method to use for the computation of the correlation
        coefficient. Available methods are ::

        'pearson' : Pearson product-moment correlation
        'spearman' : Spearman rank-order correlation
        'kendall' : Kendall's tau (ordinal data)
        'biserial' : Biserial correlation (continuous and boolean data)
        'percbend' : percentage bend correlation (robust)
        'shepherd' : Shepherd's pi correlation (robust Spearman)
        'skipped' : skipped correlation (robust Spearman, requires sklearn)

    tail : str
        Specify whether to return 'one-sided' or 'two-sided' p-value.
    output : str, default='score'
        Determines whether to display the full output or just the correlation (r)
        score. Options are {'score', 'full'}.

    Returns
    -------
    stats : float/dict
        Test summary ::

        'n' : Sample size (after NaN removal)
        'outliers' : number of outliers (only for 'shepherd' or 'skipped')
        'r' : Correlation coefficient
        'CI95' : 95% parametric confidence intervals
        'r2' : R-squared
        'adj_r2' : Adjusted R-squared
        'method' : pearson/spearman/biserial... etc
        'p-val' : one or two tailed p-value
        'power' : achieved power of the test (= 1 - type II error).

    Notes
    -----
    The Pearson correlation coefficient measures the linear relationship between
    two datasets. Strictly speaking, Pearson's correlation requires that each
    dataset be normally distributed. Correlations of -1 or +1 imply an exact
    linear relationship.

    The Spearman correlation is a nonparametric measure of the monotonicity of
    the relationship between two datasets. Unlike the Pearson correlation, the
    Spearman correlation does not assume that both datasets are normally
    distributed. Correlations of -1 or +1 imply an exact monotonic relationship.

    Kendall's tau is a measure of the correspondence between two rankings.
    Values close to 1 indicate strong agreement, values close to -1 indicate
    strong disagreement.

    The percentage bend correlation [1]_ is a robust method that protects
    against univariate outliers.

    The Shepherd's pi [2]_ and skipped [3]_, [4]_ correlations are both robust
    methods that return the Spearman's rho after bivariate outliers removal.
    Note that the skipped correlation requires that the scikit-learn package is
    installed (for computing the minimum covariance determinant).

    Please note that rows with NaN are automatically removed.

    References
    ----------
    .. [1] Wilcox, R.R., 1994. The percentage bend correlation coefficient.
       Psychometrika 59, 601-616. https://doi.org/10.1007/BF02294395

    .. [2] Schwarzkopf, D.S., De Haas, B., Rees, G., 2012. Better ways to improve
       standards in brain-behavior correlation analysis. Front. Hum. Neurosci.
       6, 200. https://doi.org/10.3389/fnhum.2012.00200

    .. [3] Rousselet, G.A., Pernet, C.R., 2012. Improving standards in
       brain-behavior correlation analyses. Front. Hum. Neurosci. 6, 119.
       https://doi.org/10.3389/fnhum.2012.00119

    .. [4] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust correlation
       analyses: false positive and power validation using a new open source
       matlab toolbox. Front. Psychol. 3, 606.
       https://doi.org/10.3389/fpsyg.2012.00606
    """
    # perform all checks in the public method,
    # rather than repeating them internally.
    # check type
    instance_check((x, y), pd.Series)
    belongs(tail, ("one-sided", "two-sided"))
    belongs(
        method,
        (
            "pearson",
            "spearman",
            "kendall",
            "biserial",
            "percbend",
            "shepherd",
            "skipped",
        ),
    )
    belongs(output, ("score", "full"))
    # check size
    if x.shape[0] != y.shape[0]:
        raise ValueError("x and y must have the same length.")

    if output == "score":
        return _bicorr_inner_score(x, y, method)
    else:
        return _bicorr_inner_full(x, y, method, tail=tail)
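# Example (a hedged sketch): robust correlation between two Series.
#   >>> a = pd.Series(np.random.normal(size=100))
#   >>> b = a * 0.5 + np.random.normal(size=100)
#   >>> bicorr(a, b, method="percbend")                  # float r score
#   >>> bicorr(a, b, method="shepherd", output="full")   # dict with n, r, CI95, ...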
def grid(
    df: Union[pd.DataFrame, "MetaPanda"],
    y: str,
    x: Optional[SelectorType] = None,
    models=("Ridge", "Lasso"),
    cv: Union[int, Tuple[int, int]] = 5,
    cache: Optional[str] = None,
    plot: bool = False,
    chunks: bool = False,
    verbose: int = 0,
    **grid_kws
) -> "MetaPanda":
    """Performs exhaustive grid search analysis on the models selected.

    This function aims to encapsulate much of the functionality associated around
    the `GridSearchCV` class within scikit-learn, with in-built caching options
    and flexible selection of inputs and outputs with the MetaPanda class.

    Parameters
    ----------
    df : pd.DataFrame/MetaPanda
        The main dataset.
    y : str
        A selected y column.
    x : list/tuple of str, optional
        A list of selected column names for x or MetaPanda `selector`.
    models : list/dict, default=("Ridge", "Lasso")
        tuple: list of model names, uses default parameters
        dict: key (model name), value tuple (parameter names) / dict:
        key (parameter name), value (list of values)
    cv : int/tuple, default=5
        If int: just reflects number of cross-validations
        If tuple: (cross_validation, n_repeats) for `RepeatedKFold`
    cache : str, optional
        If not None, cache is a filename handle for caching the `cv_results`
        as a JSON/csv file.
    plot : bool, optional
        If True, produces an appropriate plot for each parameter.
    chunks : bool, optional
        If True, and if cache is not None: caches the ML gridsearch into
        equal-sized chunks. This saves chunk files which means that if part of
        the pipeline breaks, you can start from the previous chunk.
    verbose : int, optional
        If > 0, prints out statements depending on level.

    Other Parameters
    ----------------
    grid_kws : dict, optional
        Additional keywords to assign to GridSearchCV.

    Raises
    ------
    TypeError
        If one of the parameters has the wrong input type

    Returns
    -------
    cv_results : MetaPanda
        A dataframe result from GridSearchCV detailing iterations and all scores.

    Notes
    -----
    From version 0.2.3 the `chunks` argument allows for fitting by parts. This
    means that breaks throughout a large pipeline will result only in losses up
    to the previous chunk. Chunk files are saved as '%filename_chunk%i.csv' so
    beware of clashes. Make sure to set `chunks=True` and `cache=str` where the
    `models` parameter is time-expensive.

    By default, `grid` tunes using the root mean squared error (RMSE).
    There is currently no option to change this.

    By default, this model assumes you are working with a regression problem.
    Classification compatibility will arrive in a later version.

    See Also
    --------
    basic : Performs a rudimentary fit model with no parameter searching.
    sklearn.model_selection.GridSearchCV : Exhaustive search over specified
        parameter values for an estimator

    References
    ----------
    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al.,
       JMLR 12, pp. 2825-2830, 2011.
    """
    # checks
    instance_check(df, (pd.DataFrame, MetaPanda))
    instance_check(x, (type(None), str, list, tuple, pd.Index))
    instance_check(y, str)
    instance_check(cv, (int, tuple))
    instance_check(cache, (type(None), str))
    instance_check((plot, chunks), bool)
    bounds_check(verbose, 0, 4)

    if is_sklearn_model(models):
        models = [models]
    else:
        if isinstance(models, tuple):
            models = list(models)
        instance_check(models, (list, dict))

    # set dataset if a pandas object
    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    # set up cv, repeats; note the parentheses, otherwise `repeats` is always 1
    k, repeats = cv if isinstance(cv, tuple) else (cv, 1)

    # do caching
    def _perform_fit(_df: MetaPanda, _x, _y, _k: int, _repeats: int, _models):
        rep = RepeatedKFold(n_splits=_k, n_repeats=_repeats)
        # the header is 'model_est'
        header = "model"
        # any basic regression model
        pipe = Pipeline([(header, LinearRegression())])
        # get paramgrid - the magic happens here!
        pgrid = make_parameter_grid(_models, header=header)
        # join default grid parameters to given grid_kws
        def_grid_params = {
            "scoring": "neg_root_mean_squared_error",
            "n_jobs": -2,
            "verbose": verbose,
            "return_train_score": True,
        }
        def_grid_params.update(grid_kws)
        # create gridsearch
        gs = GridSearchCV(pipe, param_grid=pgrid, cv=rep, **def_grid_params)
        # make ml ready
        __xnp, __y = preprocess_continuous_X_y(_df, _x, _y)
        # fit the grid - expensive.
        gs.fit(__xnp, __y)
        # generate result
        _result = pd.DataFrame(gs.cv_results_)
        # associate model column to respective results
        _result["model"] = _result["param_model"].apply(lambda f: str(f).split("(")[0])
        # set as MetaPanda
        _met_result = MetaPanda(_result)
        # cast down parameter columns to appropriate type
        _met_result.transform(pd.to_numeric, object, errors="ignore")
        return _met_result

    if cache is not None:
        if chunks:
            # if dictionary, we need to split this into 1-sized list/dict blocks.
            values = dictchunk(models, 1) if isinstance(models, dict) else models
            _cv_results = cached_chunk(
                _perform_fit,
                "_models",
                values,
                False,
                cache,
                verbose,
                _df=_df,
                _x=x,
                _y=y,
                _k=k,
                _repeats=repeats,
                _models=models,
            )
        else:
            _cv_results = cache_f(
                cache,
                _perform_fit,
                _df=_df,
                _x=x,
                _y=y,
                _k=k,
                _repeats=repeats,
                _models=models,
            )
    else:
        _cv_results = _perform_fit(
            _df=_df, _x=x, _y=y, _k=k, _repeats=repeats, _models=models
        )

    if plot:
        parameter_tune(_cv_results)

    return _cv_results
def optimize(
    df: "MetaPanda", x: SelectorType, y: str, models, cv: int = 5, verbose: int = 0
):
    """Performs optimization grid analysis on the models selected.

    This uses the `scipy.optimize` function to minimize continuous parameters,
    for example `alpha` in a Lasso model.

    .. note:: optimization only works on *continuous* parameters with each model.

    TODO: complete `.ml.fit.optimize` function

    Parameters
    ----------
    df : MetaPanda
        The main dataset.
    x : list/tuple of str
        A list of selected column names for x or MetaPanda `selector`.
    y : str
        A selected y column.
    models : tuple/dict
        tuple: list of model names, uses default parameters
        dict: key (model name), value tuple (parameter names) / dict:
        key (parameter name), value (list of values)
    cv : int, default=5
        The number of cross-validations
    verbose : int, optional
        If > 0, prints out statements depending on level.

    Returns
    -------
    cv_results : MetaPanda
        A dataframe result detailing iterations and all scores.

    Notes
    -----
    By default, `optimize` tunes using the root mean squared error (RMSE).
    There is currently no option to change this.

    By default, this model assumes you are working with a regression problem.
    Classification compatibility will arrive in a later version.

    See Also
    --------
    grid : Performs exhaustive grid search analysis on the models selected.
    sklearn.model_selection.GridSearchCV : Exhaustive search over specified
        parameter values for an estimator

    References
    ----------
    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al.,
       JMLR 12, pp. 2825-2830, 2011.
    """
    # checks
    instance_check(df, MetaPanda)
    instance_check(x, (str, list, tuple, pd.Index))
    instance_check(y, str)
    nonnegative((cv, verbose), int)
    instance_check(models, (tuple, list, dict))
    bounds_check(verbose, 0, 4)

    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    _xcols = select_xcols(_df, x, y)
    _xnp, _y = preprocess_continuous_X_y(_df, _xcols, y)
    # define the parameter sets
    param_sets = make_optimize_grid(models)

    for m, params in zip(models, param_sets):
        model = find_sklearn_model(m)[0]
        inits, bounds = optimize_grid_for_model(params)
        # minimize for every i element
        mins = [
            so.minimize(
                _min_cross_val_scores,
                x0=i,
                args=(_xnp, _y, model, params, cv),
                bounds=bounds,
            )
            for i in inits
        ]
    # TODO: aggregate `mins` into a results object and return it
    pass
def bibox1d(
    X: _ArrayLike,
    Y: _ArrayLike,
    colors: Optional[_ListLike] = None,
    labels: Optional[_ListLike] = None,
    measured: Optional[str] = None,
    ax: Optional[mpl.axes.Axes] = None,
    mannwhitney: bool = True,
    with_strip: bool = False,
    vertical: bool = True,
    notch: bool = False,
    capsize: float = 1.0,
    outliers: bool = True,
    grid: bool = True,
    width: Union[float, List[float]] = 0.7,
    label_rotation: float = 0.0,
    label_max_length: int = 25,
    spines: Optional[_ListLike] = None,
    strip_jitter: float = 0.15,
    theme: str = "white_circle",
    **plot_kwargs
):
    """Plots two 1-dimensional boxplots using vectors `X`, `Y`.

    Parameters
    ----------
    X : list/tuple/np.ndarray/pd.Series (1d)
        The first data column to draw. Must be numeric.
    Y : list/tuple/np.ndarray/pd.Series (1d)
        The second data column to draw. Must be numeric.
    colors : str/list of str, optional
        If None, uses a default color
    labels : str/list of str, optional
        If set, draws this on the appropriate axis; if None, does nothing.
        If X/Y is of type pandas.Series, uses its name instead.
    measured : str, optional
        A label to define what the measurement is
    ax : matplotlib.ax object, optional, default=None
        If None, creates a plot.
    mannwhitney : bool, default=True
        If True, performs a Mann-Whitney U test between the values
    with_strip : bool, default=False
        If True, draws a stripplot over the top of the boxplot, in a similar colour.
        `outliers` is set to False in this case.
    vertical : bool, default=True
        Determines whether to draw the plot vertically or horizontally
    notch : bool, default=False
        Determines whether to draw a notched plot
    capsize : float, default=1.0
        Defines the length of the caps
    outliers : bool, default=True
        If True, displays fliers as outliers
    grid : bool, default=True
        If True: draws gridlines for the numeric axis
    width : float, default=0.7
        Determines the width/height of the box
    label_rotation : float, default=0
        The degrees of rotation to the ticklabels
    label_max_length : int, default=25
        If any label exceeds this length, it truncates it
    spines : tuple, default=('top', 'left', 'bottom', 'right')
        Defines which spines are to be visible
    strip_jitter : float, default=0.15
        With stripplot, defines the amount of jitter in the variables
    theme : str, default="white_circle"
        Choose a 'theme' for the outliers, from {'red_square', 'green_diamond'}

    Other Parameters
    ----------------
    plot_kwargs : dict
        keyword arguments to pass to `ax.boxplot`

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-boxplot

    See Also
    --------
    matplotlib.pyplot.boxplot

    References
    ----------
    Inspiration from https://github.com/jbmouret/matplotlib_for_papers#colored-boxes
    """
    instance_check((X, Y), (list, tuple, np.ndarray, pd.Series))
    instance_check((colors, labels, spines), (type(None), list, pd.Index))
    instance_check(ax, (type(None), mpl.axes.Axes))
    instance_check((mannwhitney, vertical, notch, outliers, grid, with_strip), bool)
    instance_check((capsize, width, strip_jitter, label_rotation), (float, int))
    instance_check(theme, str)
    instance_check(label_max_length, int)
    bounds_check(strip_jitter, 0.0, 1.0)

    _X = as_flattened_numpy(X)
    _Y = as_flattened_numpy(Y)
    _style = _get_flier_style(theme)

    if ax is None and vertical:
        fig, ax = plt.subplots(figsize=(3.5, 7))
    elif ax is None and not vertical:
        fig, ax = plt.subplots(figsize=(7, 3.5))
    if with_strip:
        outliers = False

    if spines is None:
        if vertical and mannwhitney:
            spines = ("bottom", "left", "right")
        elif not vertical and mannwhitney:
            spines = ("bottom", "left", "top")
        else:
            spines = ("bottom", "left", "top", "right")

    # sort out labels
    if labels is None:
        labels = [
            X.name if isinstance(X, pd.Series) else "",
            Y.name if isinstance(Y, pd.Series) else "",
        ]

    box_alpha = 1.0 if not with_strip else 0.5

    patch_obj = ax.boxplot(
        [_X, _Y],
        vert=vertical,
        patch_artist=True,
        showfliers=outliers,
        notch=notch,
        widths=width,
        flierprops=_style,
        boxprops=dict(alpha=box_alpha),
        **plot_kwargs
    )
    # define boxplot extras
    _define_boxplot_arguments(
        ax, patch_obj, vertical, measured, grid, spines, capsize, None
    )
    # define basic colours - overrides if needs be
    colors = _kcolor_arrangement(patch_obj, colors)
    # label axes
    _label_axes(ax, labels, vertical, label_rotation, label_max_length)
    # if we have stripplot, draw this
    if with_strip:
        # plot x strips
        _overlay_stripplot(_X, ax, 1, width, colors[0], vertical, outliers, strip_jitter)
        _overlay_stripplot(_Y, ax, 2, width, colors[1], vertical, outliers, strip_jitter)
    # if we have mann-whitney, append this info
    if mannwhitney:
        # determine mann-whitney U test
        z, p = mannwhitneyu(_X, _Y)
        # double the one-sided p-value to make it two-sided
        p *= 2
        star = _get_stars(p)
        # get dimensions to annotate
        joined = np.concatenate((_X, _Y))
        _max, _min = np.max(joined), np.min(joined)
        # annotate on mann-whitney test
        if vertical:
            ax.annotate(
                "",
                xy=(1, _max),
                xycoords="data",
                xytext=(2, _max),
                textcoords="data",
                arrowprops=dict(
                    arrowstyle="-", ec="#666666", connectionstyle="bar,fraction=0.2"
                ),
            )
            # add mw text
            ax.text(
                1.5,
                _max + np.abs(_max - _min) * 0.1,
                star,
                horizontalalignment="center",
                verticalalignment="center",
            )
        else:
            ax.annotate(
                "",
                xy=(_max, 2),
                xycoords="data",
                xytext=(_max, 1),
                textcoords="data",
                arrowprops=dict(
                    arrowstyle="-", ec="#666666", connectionstyle="bar,fraction=0.2"
                ),
            )
            # add mw text
            ax.text(
                _max + np.abs(_max - _min) * 0.1,
                1.5,
                star,
                horizontalalignment="center",
                verticalalignment="center",
            )

    return ax
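# Example (a hedged sketch): compare two samples with a Mann-Whitney annotation.
#   >>> a = np.random.normal(0.0, 1.0, size=80)
#   >>> b = np.random.normal(0.6, 1.0, size=80)
#   >>> bibox1d(a, b, labels=["control", "treated"], with_strip=True)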
def widebox(
    data: Union[List, np.ndarray, pd.DataFrame],
    colors: Optional[_ListLike] = None,
    measured: Optional[str] = None,
    ax: Optional[mpl.axes.Axes] = None,
    vert: bool = True,
    sort: bool = True,
    outliers: bool = True,
    notch: bool = False,
    with_strip: bool = False,
    capsize: float = 1.0,
    width: float = 0.7,
    grid: bool = True,
    title: Optional[str] = None,
    label_rotation: float = 0.0,
    label_max_length: int = 25,
    spines: Optional[_ListLike] = None,
    strip_jitter: float = 0.15,
    theme="white_circle",
    **plot_kwargs
):
    """Plots a 2D boxplot with data oriented in wide-form.

    Parameters
    ----------
    data : list, np.ndarray or pd.DataFrame (2d)
        The raw data to plot as a box.
        If data is of type pd.DataFrame: columns represent the X-axis
    colors : list, tuple, optional
        Represents colors for each x-variable
    measured : str, optional
        A name for the measured variable
    ax : matplotlib.ax object, optional, default=None
        If None, creates a plot.
    vert : bool, default=True
        Determines whether to draw the plot vertically or horizontally
    sort : bool, default=True
        Determines whether to sort the data by numerical value
    outliers : bool, default=True
        If True, displays fliers as outliers
    notch : bool, default=False
        Determines whether to draw a notched plot
    with_strip : bool, default=False
        If True, draws a stripplot over the top of the boxplot, in a similar colour.
        `outliers` is set to False in this case.
    capsize : float, default=1.0
        Defines the length of the caps
    width : float, default=0.7
        Determines the width/height of the box
    grid : bool, default=True
        If True: draws gridlines for the numeric axis
    title : str, optional
        Sets the title of the axes if a string is passed
    label_rotation : float, default=0
        The degrees of rotation to the ticklabels
    label_max_length : int, default=25
        If any label exceeds this length, it truncates it
    spines : tuple, default=('top', 'left', 'bottom', 'right')
        Defines which spines are to be visible
    strip_jitter : float, default=0.15
        With stripplot, defines the amount of jitter in the variables
    theme : str, default="white_circle"
        Choose a 'theme' for the outliers, from {'red_square', 'green_diamond'}

    Other Parameters
    ----------------
    plot_kwargs : dict
        keyword arguments to pass to `ax.boxplot`

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-boxplot

    See Also
    --------
    matplotlib.pyplot.boxplot
    seaborn.boxplot
    seaborn.boxenplot

    References
    ----------
    Inspiration from https://github.com/jbmouret/matplotlib_for_papers#colored-boxes
    """
    instance_check(data, (list, np.ndarray, pd.DataFrame))
    instance_check((colors, spines), (type(None), list, pd.Index))
    instance_check(ax, (type(None), mpl.axes.Axes))
    instance_check((vert, sort, notch, outliers, grid, with_strip), bool)
    instance_check((capsize, width, strip_jitter, label_rotation), (float, int))
    instance_check(theme, str)
    instance_check(label_max_length, int)
    bounds_check(width, 0.0, 1.0)
    bounds_check(strip_jitter, 0.0, 1.0)
    bounds_check(label_rotation, 0.0, 360.0)

    if isinstance(data, pd.DataFrame):
        # select float, int subset
        ss = data.select_dtypes(include=[float, int])
        _data = np.asarray(ss)
        _labels = ss.columns
    elif isinstance(data, (list, np.ndarray)):
        _data = np.asarray(data)
        _labels = None
    else:
        raise TypeError("data matrix is not of type np.ndarray")

    _style = _get_flier_style(theme)

    # negative-exponential increase in figure size with more features
    def _figure_spacing(x):
        return np.exp(-0.35 * x) * x

    if with_strip:
        outliers = False
    if ax is None and vert:
        fig, ax = plt.subplots(figsize=(2.5 + _figure_spacing(_data.shape[1]), 7))
    elif ax is None and not vert:
        fig, ax = plt.subplots(figsize=(7, 2.5 + _figure_spacing(_data.shape[1])))
    if spines is None:
        spines = ("left", "top", "right", "bottom")

    # sort the data by the mean if selected
    if sort:
        _order = np.argsort(np.mean(_data, axis=0))
        _data = _data[:, _order]
        # guard: list/ndarray input carries no column labels to reorder
        if _labels is not None:
            _labels = _labels[_order]

    box_alpha = 1.0 if not with_strip else 0.5

    patch_obj = ax.boxplot(
        _data,
        vert=vert,
        patch_artist=True,
        widths=width,
        showfliers=outliers,
        notch=notch,
        flierprops=_style,
        boxprops=dict(alpha=box_alpha),
        **plot_kwargs
    )
    # define boxplot extras
    _define_boxplot_arguments(
        ax, patch_obj, vert, measured, grid, spines, capsize, None
    )
    # define basic colours - overrides if needs be
    colors = _kcolor_arrangement(patch_obj, colors, k=_data.shape[1])
    # label axes
    _label_axes(ax, _labels, vert, label_rotation, label_max_length)
    if title is not None:
        ax.set_title(title)
    # perform stripplots
    if with_strip:
        for n in range(_data.shape[1]):
            # plot x strips
            _overlay_stripplot(
                _data[:, n], ax, n + 1, width, colors[n], vert, outliers, strip_jitter
            )
    return ax
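# Example (a hedged sketch): wide-form boxplot straight from a DataFrame.
#   >>> df = pd.DataFrame(np.random.normal(size=(100, 4)),
#   ...                   columns=["a", "b", "c", "d"])
#   >>> widebox(df, title="four gaussian columns", with_strip=True)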
def select(self, sc: str) -> pd.Index:
    """View a subset of columns using a flexible `eval`-like string.

    Select merely returns the columns of interest selected using this selector.

    Selections of columns can be done by:
        type [object, int, float, numpy.dtype*, pandas.CategoricalDtype]
        callable (function) that returns [bool list] of length p
        pd.Index
        str [regex, df.column name, cached name, meta.column name
            (that references a boolean column)]
        list/tuple of the above

    .. note:: We do not currently incorporate the use of brackets.

    Parameters
    ----------
    sc : str-like
        The selection string to find an optimal subset of columns.

    Warnings
    --------
    UserWarning
        If the selection returned is empty.

    Returns
    -------
    sel : pd.Index
        The list of column names selected, or empty

    See Also
    --------
    view : View a selection of columns in `df_`.
    search : View the intersection of search terms, for columns in `df_`.

    Examples
    --------
    You can use string names of types to select columns of a certain type:

    >>> import turbopanda as turb
    >>> import pandas as pd
    >>> mdf = turb.MetaPanda(pd.DataFrame({'a': [1., 2.], 'b': [3, 4]}))
    >>> mdf.select("float")
    Index(['a'], dtype='object', name='colnames')

    Inverses can also be selected using the tilde `~`:

    >>> mdf.select("~float")
    Index(['b'], dtype='object', name='colnames')

    Multiple terms can be joined together, including regex expressions NOT
    containing `&` or `|`; for instance, to select all float columns with
    names x1, x2 or x3:

    >>> mdf.select("float & x[1-3]")
    """
    instance_check(sc, str)

    terms = [c.strip() for c in re.split("[&|]", sc)]
    operator = re.findall("[&|]", sc)
    if len(terms) < 1:
        return pd.Index([])
    else:
        grp = [
            self.view_not(t[1:]) if t.startswith("~") else self.view(t)
            for t in terms
        ]
        full = grp[0]
        for mg, op in zip(grp[1:], operator):
            if op == "&":
                full = intersect(full, mg)
            elif op == "|":
                full = union(full, mg)
        return full
def partial_bicorr(
    data: pd.DataFrame,
    x: str,
    y: str,
    covar: Union[str, List[str], Tuple[str, ...], pd.Index],
    method: str = "spearman",
    tail: str = "two-sided",
    output: str = "score",
) -> Union[float, dict]:
    """Partial and semi-partial correlation.

    Adapted from the `pingouin` library, made by Raphael Vallat:
    https://github.com/raphaelvallat/pingouin/blob/master/pingouin/correlation.py

    Parameters
    ----------
    data : pd.DataFrame
        The full dataset including covariates.
    x, y : str
        x and y. Must be names of columns in ``data``.
    covar : str/list of str
        Covariate(s). Column names of the covariates. `covar` must be made of
        continuous columns. If x, y are not continuous, will perform logistic
        regression to generate residuals.
    method : str
        Specify which method to use for the computation of the correlation
        coefficient. Available methods are ::

        'pearson' : Pearson product-moment correlation
        'spearman' : Spearman rank-order correlation
        'biserial' : Biserial correlation (continuous and boolean data)
        'kendall' : Kendall's tau (ordinal data)
        'percbend' : percentage bend correlation (robust)
        'shepherd' : Shepherd's pi correlation (robust Spearman)
        'skipped' : skipped correlation (robust Spearman, requires sklearn)

    tail : str
        Specify whether to return the 'one-sided' or 'two-sided' p-value.
    output : str, default='score'
        Determines whether to display the full output or just the correlation (r)
        score. Options are {'score', 'full'}.

    Returns
    -------
    stats : float/dict
        Test summary ::

        'n' : Sample size (after NaN removal)
        'outliers' : number of outliers (only for 'shepherd' or 'skipped')
        'r' : Correlation coefficient
        'CI95' : 95% parametric confidence intervals
        'r2' : R-squared
        'adj_r2' : Adjusted R-squared
        'method' : pearson/spearman/biserial... etc
        'p-val' : one or two tailed p-value
        'BF10' : Bayes Factor of the alternative hypothesis (Pearson only)
        'power' : achieved power of the test (= 1 - type II error).

    Notes
    -----
    From [4]_:

        "With *partial correlation*, we find the correlation between :math:`x`
        and :math:`y` holding :math:`C` constant for both :math:`x` and
        :math:`y`. Sometimes, however, we want to hold :math:`C` constant for
        just :math:`x` or just :math:`y`. In that case, we compute a
        *semi-partial correlation*. A partial correlation is computed between
        two residuals. A semi-partial correlation is computed between one
        residual and another raw (or unresidualized) variable."

    Note that if you are not interested in calculating the statistics and
    p-values but only the partial correlation matrix, a (faster) alternative is
    to use the :py:func:`pingouin.pcorr` method (see example 4).

    Rows with missing values are automatically removed from data. Results have
    been tested against the `ppcor` R package.

    References
    ----------
    .. [2] https://en.wikipedia.org/wiki/Partial_correlation
    .. [3] https://cran.r-project.org/web/packages/ppcor/index.html
    .. [4] https://gist.github.com/fabianp/9396204419c7b638d38f
    .. [5] http://faculty.cas.usf.edu/mbrannick/regression/Partial.html
    """
    # perform all checks in the public method..
    instance_check(data, pd.DataFrame)
    instance_check((x, y), str)
    instance_check(covar, (str, list, tuple, pd.Index))
    belongs(tail, ("one-sided", "two-sided"))
    belongs(
        method,
        (
            "pearson",
            "spearman",
            "kendall",
            "biserial",
            "percbend",
            "shepherd",
            "skipped",
        ),
    )
    belongs(output, ("score", "full"))
    # perform a check to make sure every column in `covar` is continuous.
    if not is_dataframe_float(data[covar]):
        raise TypeError(
            "`covar` variables in `partial_bicorr` "
            "all must be of type `float`/continuous."
        )

    return _partial_bicorr_inner(
        data, x, y, covar, tail=tail, method=method, output=output
    )
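# Example (a hedged sketch): correlation of x and y after controlling for z.
#   >>> rng = np.random.default_rng(1)
#   >>> z = rng.normal(size=300)
#   >>> frame = pd.DataFrame({
#   ...     "x": z + rng.normal(size=300),
#   ...     "y": z + rng.normal(size=300),
#   ...     "z": z,
#   ... })
#   >>> partial_bicorr(frame, "x", "y", covar=["z"])  # ~0 once z is held constant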
def cached(
    func: Callable, filename: str = "example1.json", verbose: int = 0, *args, **kwargs
) -> "MetaPanda":
    """Provides automatic {.json, .csv} caching for `turb.MetaPanda` or `pd.DataFrame`.

    .. note:: this is a direct-call cache function. Not cached.

    For example, we call `cached` as a wrapper to our custom function:

    >>> import turbopanda as turb
    >>> def f(x):
    ...     return turb.MetaPanda(x)
    >>> data = cached(f, 'meta_file.json')

    .. note:: custom function must return a `pd.DataFrame` or `turb.MetaPanda` object.

    Parameters
    ----------
    func : function
        A custom function returning the pd.DataFrame/MetaPanda
    filename : str, optional
        The name of the file to cache to, or read from. This is fixed.
        Accepts {'json', 'csv'} formats.
    verbose : int, optional
        If > 0, prints out useful information
    *args : list, optional
        Arguments to pass to function(...)
    **kwargs : dict, optional
        Keyword arguments to pass to function(...)

    Warnings
    --------
    FutureWarning
        Returned object from cache isn't of type {pd.DataFrame, MetaPanda}

    Raises
    ------
    TypeError
        `filename` isn't of type `str`
    ValueError
        `filename` extension isn't found in {'json', 'csv'}

    Returns
    -------
    mp : MetaPanda
        The MetaPanda object

    See Also
    --------
    cachedec : Provides automatic {.json, .csv} decorator caching for
        `turb.MetaPanda` or `pd.DataFrame`.
    """
    # check it is string
    instance_check(filename, str)
    instance_check(verbose, int)
    instance_check(func, "__call__")
    # check that file ends with json or csv
    belongs(filename.rsplit(".", 1)[-1], ("json", "csv"))

    if os.path.isfile(filename):
        if verbose > 0:
            print("reading in cached file: {}".format(filename))
        # read it in
        mdf = read(filename)
        _set_index_def(mdf.df_)
        return mdf
    else:
        if verbose > 0:
            print("running function '{}' for cache".format(func.__name__))
        # returns MetaPanda or pandas.DataFrame
        mpf = func(*args, **kwargs)
        if isinstance(mpf, MetaPanda):
            # save file
            mpf.write(filename)
            return mpf
        elif isinstance(mpf, DataFrame):
            # save - bumping index into the file.
            mpf.reset_index().to_csv(filename, index=None)
            return MetaPanda(mpf)
        else:
            if verbose > 0:
                print(
                    "returned object from cache not of type "
                    "[DataFrame, MetaPanda], not cached"
                )
            return mpf
def color_qualitative(n: Union[int, List, Tuple], sharp: bool = True) -> List[str]:
    """Generates a qualitative palette as a list of hex colors.

    Parameters
    ----------
    n : int, list or tuple
        The number of hex colors to return, or the list/tuple of elements.
    sharp : bool
        If True, only uses strong/sharp colors, else uses pastel colors.

    Returns
    -------
    L : list
        list of hex colors of length (n,).
    """
    instance_check(n, (int, list, tuple))
    instance_check(sharp, bool)

    if isinstance(n, (list, tuple)):
        n = len(n)

    lt8_sharp = ("Accent", "Dark2")
    lt8_pastel = ("Pastel2", "Set2")
    # lt9 = ('Set1', 'Pastel1')
    # lt10 = ['tab10']
    # lt12 = ['Set3']
    lt20 = ("tab20", "tab20b", "tab20c")

    # choose a random cmap within the appropriate size bucket for n
    if n <= 8 and sharp:
        return _colormap_to_hex(
            getattr(cm, np.random.choice(lt8_sharp))(np.linspace(0, 1, n))
        )
    elif n <= 8 and not sharp:
        return _colormap_to_hex(
            getattr(cm, np.random.choice(lt8_pastel))(np.linspace(0, 1, n))
        )
    elif n <= 9 and sharp:
        return palette_cmap(n, "Set1")
    elif n <= 9 and not sharp:
        return palette_cmap(n, "Pastel1")
    elif n <= 10:
        return palette_cmap(n, "tab10")
    elif n <= 12:
        return palette_cmap(n, "Set3")
    elif n <= 20:
        return _colormap_to_hex(
            getattr(cm, np.random.choice(lt20))(np.linspace(0, 1, n))
        )
    else:
        # for n > 20, cycle one of the lt20 colormaps until n is reached
        return list(
            it.islice(
                it.cycle(
                    _colormap_to_hex(
                        getattr(cm, np.random.choice(lt20))(np.linspace(0, 1, 20))
                    )
                ),
                0,
                n,
            )
        )
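# --- illustrative sketch (not library code) --------------------------------
# `color_qualitative` picks a matplotlib colormap at random within each size
# bucket, so seed numpy first if you need reproducible palettes:
#
# >>> np.random.seed(0)
# >>> color_qualitative(5)                              # 5 sharp hex colors
# >>> color_qualitative(['a', 'b', 'c'], sharp=False)   # 3 pastel hex colors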
def box1d(X: _ArrayLike,
          color: Optional[str] = None,
          label: Optional[str] = None,
          ax: Optional[mpl.axes.Axes] = None,
          with_strip: bool = False,
          vertical: bool = True,
          notch: bool = False,
          capsize: float = 1.0,
          outliers: bool = True,
          axis_scale: Optional[Union[str, Callable]] = None,
          grid: bool = True,
          width: float = 0.7,
          label_rotation: float = 0.0,
          label_max_length: int = 25,
          spines: Optional[_ListLike] = None,
          theme: str = "white_circle",
          **plot_kwargs):
    """Plots a 1-dimensional boxplot using a vector.

    Parameters
    ----------
    X : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw. Must be numeric.
    color : str, optional
        If None, uses a default color
    label : str, optional
        If set, draws this on the appropriate axis; if None, does nothing.
        If X is of type pandas.Series, its name is used instead.
    ax : matplotlib.ax object, optional, default=None
        If None, creates a plot.
    with_strip : bool, default=False
        If True, draws a stripplot over the top of the boxplot in a similar
        colour; `outliers` is set to False in this case
    vertical : bool, default=True
        Determines whether to draw the plot vertically or horizontally
    notch : bool, default=False
        Determines whether to draw a notched plot
    capsize : float, default=1.0
        Defines the length of the caps
    outliers : bool, default=True
        If True, displays the outliers as fliers
    axis_scale : str/callable, optional
        Scales the data along the axis.
        If str, use {'log', 'sqrt', 'log2'}
        If callable, must reference a `np.*` function which takes array X
        and returns X'
    grid : bool, default=True
        If True: draws gridlines for the numeric axis
    width : float, default=0.7
        Determines the width/height of the box
    label_rotation : float, default=0
        The degrees of rotation to the ticklabels
    label_max_length : int, default=25
        If any label exceeds this length, it truncates it
    spines : tuple, default=('top', 'left', 'bottom', 'right')
        Defines which spines are to be visible
    theme : str, default="white_circle"
        Choose a 'theme' for the outliers, from {'white_circle',
        'red_square', 'green_diamond'}

    Other Parameters
    ----------------
    plot_kwargs : dict
        keyword arguments to pass to `ax.boxplot`

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-boxplot
    """
    instance_check(X, (np.ndarray, pd.Series, list, tuple))
    instance_check((vertical, notch, outliers, grid, with_strip), bool)
    instance_check(spines, (type(None), list))
    instance_check(theme, str)
    instance_check((label, color), (type(None), str))
    instance_check((capsize, width), float)
    instance_check(label_rotation, (int, float))
    instance_check(label_max_length, int)
    bounds_check(width, 0.0, 1.0)

    # coerce the input into a flattened numpy array
    _X = as_flattened_numpy(X)
    _style = _get_flier_style(theme)
    # convert X data if we have axis_scale
    if axis_scale:
        _X = _convert_x_scale(_X, axis_scale)
    if with_strip:
        outliers = False
    if ax is None and vertical:
        fig, ax = plt.subplots(figsize=(2.5, 5))
    elif ax is None and not vertical:
        fig, ax = plt.subplots(figsize=(5, 2.5))
    if spines is None:
        spines = ("left", "top", "right", "bottom")

    box_alpha = 1.0 if not with_strip else 0.5

    patch_obj = ax.boxplot(
        _X,
        vert=vertical,
        patch_artist=True,
        showfliers=outliers,
        notch=notch,
        widths=width,
        boxprops=dict(alpha=box_alpha),
        flierprops=_style,
        **plot_kwargs
    )
    # define basic arguments
    _define_boxplot_arguments(
        ax, patch_obj, vertical, None, grid, spines, capsize, axis_scale
    )
    # define colour features
    color = _color_arrangement(ax, patch_obj, color)
    # label the appropriate axes
    _label_axes(
        ax,
        X.name if isinstance(X, pd.Series) else label,
        vertical,
        label_rotation,
        label_max_length,
    )
    # plot the strips
    if with_strip:
        _overlay_stripplot(
            _X, ax, 1, width, color, vertical, outliers, strip_jitter=0.15
        )
    return ax
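# --- illustrative sketch (not library code) --------------------------------
# Drawing a horizontal, log-scaled boxplot with a strip overlay; the data
# here are synthetic:
#
# >>> import numpy as np
# >>> sample = np.random.gamma(2.0, 2.0, size=500)
# >>> ax = box1d(sample, label='gamma sample', axis_scale='log',
# ...            with_strip=True, vertical=False)
# >>> plt.show()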
def save(
    fig_obj: plt.Figure,
    plot_type: str,
    name: str = "example1",
    save_types: Tuple[str, ...] = ("png", "pdf"),
    fp: str = "./",
    dpi: int = 360,
    savemode: str = "first",
) -> bool:
    """Saves a matplotlib figure in many formats.

    Given a matplotlib.Figure object, save appropriate numbers of Figures
    to the respective folders.

    Parameters
    ----------
    fig_obj : plt.Figure
        The figure object to save.
    plot_type : str
        Choose from:
        {"scatter", "kde", "heatmap", "cluster", "bar", "hist",
         "quiver", "box", "line", "venn", "multi", "pie"}
    name : str, optional
        The name of the file; this may be added to based on the other
        parameters
    save_types : tuple of str, optional
        Choose any from {"png", "pdf", "svg", "eps", "ps"}
    fp : str, optional
        The file path to the root directory of saving images
    dpi : int, optional
        The resolution in dots per inch; set to high if you want a good image
    savemode : str, optional
        Choose from {'first', 'update'}:
        if 'first', only saves if the file isn't present;
        if 'update', overrides the saved figure if present

    Warnings
    --------
    UserWarning
        If the figure file itself already exists

    Raises
    ------
    IOError
        If the filepath does not exist
    TypeError
        If the arguments do not match their declared type
    ValueError
        If `plot_type`, `savemode` does not belong to an acceptable argument

    Returns
    -------
    success : bool
        Whether it was successful or not
    """
    accepted_types = (
        "scatter",
        "kde",
        "heatmap",
        "cluster",
        "bar",
        "hist",
        "quiver",
        "box",
        "line",
        "venn",
        "multi",
        "pie",
    )
    file_types_supported = ("png", "pdf", "svg", "eps", "ps")
    accepted_savemodes = ("first", "update")

    instance_check(fig_obj, (plt.Figure, mpl.figure.Figure))
    instance_check(name, str)
    instance_check(fp, str)
    belongs(plot_type, accepted_types)
    belongs(savemode, accepted_savemodes)
    for st in save_types:
        if st not in file_types_supported:
            raise TypeError("save_type: [%s] not supported" % st)

    # correct to ensure filepath has / at end
    if not fp.endswith("/"):
        fp += "/"

    # check whether the filepath exists
    if os.path.exists(fp):
        for t in save_types:
            # if the directory does not exist, create it!
            if not os.path.isdir(fp + "_" + t):
                os.mkdir(fp + "_" + t)
            # check if the figure itself already exists.
            filename = "{}_{}/{}_{}.{}".format(fp, t, plot_type, name, t)
            if os.path.isfile(filename):
                warnings.warn(
                    "Figure: '{}' already exists: using savemode: {}".format(
                        filename, savemode
                    ),
                    UserWarning,
                )
                if savemode == "update":
                    fig_obj.savefig(
                        filename, format=t, bbox_inches="tight", dpi=dpi
                    )
            else:
                # make the file
                fig_obj.savefig(filename, format=t, bbox_inches="tight", dpi=dpi)
    else:
        raise IOError("filepath: [%s] does not exist." % fp)
    return True
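# --- illustrative sketch (not library code) --------------------------------
# Saving a histogram figure; note that `fp` must already exist, and files
# land in per-format subdirectories, e.g. './figures/_png/hist_demo.png':
#
# >>> fig, ax = plt.subplots()
# >>> _ = ax.hist(np.random.randn(100))
# >>> save(fig, "hist", name="demo", fp="./figures/")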
def overview(
    df,
    x: SelectorType,
    y: str,
    cv,
    yp,
    plot_names: Optional[List[str]] = None,
    plot_size: int = 3,
):
    """Presents an overview of the results of a machine learning basic run.

    Parameters
    ----------
    df : MetaPanda
        The raw dataset.
    x : selector
        The input selection to the model
    y : str
        The target vector
    cv : MetaPanda
        The cross-validation results from a call to `fit_basic`
    yp : MetaPanda
        The resulting fitted values from a call to `fit_basic`
    plot_names : list of str, optional
        Names of specific plot types to draw.
        Choose any combo of:
        {'resid_fitted', 'score', 'actual_predicted', 'coef',
         'correlation', 'qqplot', 'cooks'}
        If None: draws ALL.
    plot_size : int, optional
        Defines the size of each plot.

    Returns
    -------
    None. Draws the plots; nothing is returned.
    """
    from turbopanda.corr._correlate import correlate, row_to_matrix
    from turbopanda.stats import cook_distance

    instance_check(plot_names, (type(None), list, tuple))
    instance_check(y, str)

    options_ = (
        "resid_fitted",
        "score",
        "actual_predicted",
        "coef",
        "correlation",
        "qqplot",
        "cooks",
    )

    """ Prepare data here. """
    # set yp as series
    yp = yp[y].squeeze()
    # pair them and remove NA
    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    _xcols = select_xcols(_df, x, y)
    _x, _y = preprocess_continuous_X_y(_df, _xcols, y)

    # the 'correlation' plot is only feasible for 2..49 input columns
    options_yes_ = (True, True, True, True, 1 < len(_xcols) < 50, True, True)
    # compress down options
    option_compressed = list(it.compress(options_, options_yes_))

    """ Make plots here """
    if plot_names is None:
        plot_names = options_
    # overlap plots
    overlap_ = sorted(intersect(option_compressed, plot_names), key=options_.index)
    # make plots
    fig, ax = gridplot(len(overlap_), ax_size=plot_size)
    I = it.count()

    if "score" in overlap_:
        # boxplot of the cross-validated scores
        _boxplot_scores(ax[next(I)], cv)
    if "resid_fitted" in overlap_:
        # fitted vs. residual plot
        _fitted_vs_residual(ax[next(I)], _y, yp)
    if "actual_predicted" in overlap_:
        # KDE plot estimation between y and yhat
        _actual_vs_predicted(ax[next(I)], _y, yp)
    if "coef" in overlap_:
        # coefficient plot
        coefficient(cv, ax[next(I)])
    if "correlation" in overlap_ and (1 < len(_xcols) < 50):
        # correlation matrix
        corr = correlate(df, x)
        _cmatrix = row_to_matrix(corr)
        _basic_correlation_matrix(ax[next(I)], _cmatrix)
    if "qqplot" in overlap_:
        # q-q plot
        stats.probplot(_df[y], dist="norm", plot=ax[next(I)])
    if "cooks" in overlap_:
        # outlier detection using Cook's distance plot
        _c = cook_distance(df, x, y, yp)
        _cooks(ax[next(I)], _c)

    fig.tight_layout()
    plt.show()
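# --- illustrative sketch (not library code) --------------------------------
# Typical use of `overview` after a model run; the exact return signature of
# `fit_basic` shown here is an assumption, not a documented guarantee:
#
# >>> cv, yp = fit_basic(df, x, y="target")  # doctest: +SKIP
# >>> overview(df, x, "target", cv, yp, plot_names=['score', 'qqplot'])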