def select_xcols(df: pd.DataFrame, xs, y):
    """Resolve which columns of `df` are used as inputs for ML.

    Parameters
    ----------
    df : pd.DataFrame
        The dataset whose columns are selected from.
    xs : None, str or other
        If None: every column of `df` except `y` is returned.
        If str: forwarded to `pattern` together with `df.columns`.
        Anything else is handed back untouched.
    y : str
        Name of the target column; only consulted when `xs` is None.

    Returns
    -------
    The resolved column selection.
    """
    # a string selector is delegated to the pattern matcher
    if isinstance(xs, str):
        return pattern(xs, df.columns)
    # an explicit (non-string) selection is passed straight through
    if xs is not None:
        return xs
    # nothing specified: take everything that is not the target
    target = pd.Index([y])
    return df.columns.difference(target)
def get_best_model(cv_results, minimize: bool = True):
    """Returns the best model (with correct params) given the cv_results from a `fit_grid` call.

    The idea behind this function is to fetch from the pool of models the best model
    which could be fed directly into `fit_basic` to get the detailed plots.

    Parameters
    ----------
    cv_results : MetaPanda or pd.DataFrame
        A dataframe result from `.ml.fit.grid`. Must contain the columns
        "mean_test_score" and "model"; hyper-parameters are read from the
        columns matched by the "param_model__" pattern.
    minimize : bool
        Determines whether the scoring function is minimized or maximized

    Returns
    -------
    M : sklearn model
        A parameterized sklearn model (unfitted).

    Notes
    -----
    The returned model is not fitted, you will need to do this yourself.

    See Also
    --------
    fit_basic : Performs a rudimentary fit model with no parameter searching
    """
    # work on the underlying DataFrame whether given a MetaPanda or a DataFrame
    res = cv_results.df_ if not isinstance(cv_results, pd.DataFrame) else cv_results

    scores = res["mean_test_score"]
    select = scores.idxmin() if minimize else scores.idxmax()

    # instantiate a model from its text name
    inst_M = find_sklearn_model(res.loc[select, "model"])[0]

    # only keep parameter columns that are actually set (non-NaN) for the best row
    param_columns = pattern("param_model__", res.loc[select].dropna().index, False)
    _old_params = res.loc[select, param_columns]
    # strip the "param_model__" style header so the names match the sklearn model;
    # `n` must be passed by keyword: it is keyword-only in pandas >= 2.0
    _old_params.index = _old_params.index.str.rsplit("__", n=1).str[-1]

    # cast whole-number floats down to ints
    params = {
        name: int(value) if isinstance(value, float) and value.is_integer() else value
        for name, value in _old_params.to_dict().items()
    }

    # set parameters in to the model.
    inst_M.set_params(**params)
    return inst_M
def absolute(df: pd.DataFrame, pat: str = None) -> pd.DataFrame:
    """Apply `np.abs` to a sub-selection of columns.

    Parameters
    ----------
    df : pd.DataFrame
        The dataset to operate on.
    pat : str, optional
        Pattern passed to `pattern` (with `extended_regex=False`) to choose
        the columns. If None, all columns of `df` are chosen.

    Returns
    -------
    pd.DataFrame
        Whatever `_multi_assign` returns for `df`, `np.abs` and the column selector.
    """

    def _choose_columns(x):
        # `x` is supplied by `_multi_assign`; it is only used when a pattern is given
        if pat is None:
            return df.columns.tolist()
        return list(pattern(pat, x, extended_regex=False))

    return _multi_assign(df, np.abs, _choose_columns)
def best_model(cv_results, y_var: str = "test", minimize: bool = True, score: str = "RMSE", **box_kws):
    """Determines the best model (min or max) and plots the boxplot of all resulting best models.

    Parameters
    ----------
    cv_results : MetaPanda or pd.DataFrame
        The results from a call to `fit_grid`.
    y_var : str
        Choose from {'test', 'train'}
        If 'test': draws the test score
        If 'train': draws the training score
    minimize : bool
        If True, selects best smallest score, else select best largest score
    score : str
        The name of the scoring function
    box_kws : dict, optional
        Keyword arguments to pass to `plt.boxplot`.

    Returns
    -------
    fig : matplotlib.figure
        The figure object

    Notes
    -----
    `plt.show()` is called before the figure is returned.
    """
    instance_check(minimize, bool)
    instance_check(score, str)
    belongs(y_var, ("train", "test"))

    sely = pattern("mean_%s_score" % y_var, cv_results.columns, False)
    # downcast to the underlying DataFrame (no copy is made here)
    res = cv_results.df_ if not isinstance(cv_results, pd.DataFrame) else cv_results

    # negative scores (mean below zero) are flipped to absolute values
    if res[sely].squeeze().mean() < 0.0:
        res = res.pipe(absolute, "(?:split[0-9]+|mean)_(?:train|test)_score")

    # for each 'model', find the row holding its best score
    grouped = res.groupby("model")[sely]
    indices = grouped.idxmin() if minimize else grouped.idxmax()

    # arrange data: per-split scores of each best row.
    # `res` is a plain DataFrame at this point, so it has neither `.df_` nor `.view`;
    # the split columns are looked up on its own columns instead.
    split_cols = pattern("split[0-9]+_%s_score" % y_var, res.columns, False)
    result_p = res.loc[indices, split_cols]

    # reorder based on the best score
    re_order = result_p.median(axis=1).sort_values()
    result_p = result_p.reindex(re_order.index)
    # get best score name
    indices = switcheroo(indices).reindex(re_order.index)

    # create figures only once the data is ready, so a failure above leaves no empty figure
    fig = plt.figure(figsize=(8, 5))
    ax = fig.add_subplot(111)
    bp = ax.boxplot(result_p, patch_artist=True, showfliers=False, **box_kws)

    # fetch package names and map them to colors - returned as pd.Series
    packages = find_model_family(indices.values)
    # map colors to each of the packages.
    families = set_like(packages)
    mapping = dictzip(families, color_qualitative(len(families)))
    mapped_cols = packages.map(mapping)

    # iterate over boxes and colour
    for box, col in zip(bp["boxes"], mapped_cols):
        box.set(facecolor=col, linewidth=1.2)
    plt.setp(bp["medians"], linewidth=1.5)

    # additional box requirements
    ax.set_xlabel("Model")
    ax.set_ylabel("%s %s" % (y_var, score))
    ax.set_xticklabels(indices.values)
    ax.tick_params("x", rotation=45)
    ax.grid()
    for tick in ax.get_xmajorticklabels():
        tick.set_horizontalalignment("right")

    # generate legend
    ax.legend(legend_line(mapping), list(mapping.keys()), bbox_to_anchor=(1.03, 1.03))
    plt.show()
    return fig
def correlate(
    data: Union[pd.DataFrame, MetaPanda],
    x: Optional[SelectorType] = None,
    y: Optional[SelectorType] = None,
    covar: Optional[SelectorType] = None,
    cartesian_covar: bool = False,
    output: str = "full",
    method: str = "spearman",
    verbose: int = 0,
) -> pd.DataFrame:
    """Correlates X and Y together to generate a list of correlations.

    Parameters
    ----------
    data : pd.DataFrame / MetaPanda
        The full dataset.
    x : (str, list, tuple, pd.Index), optional
        Subset of input(s) for column names.
            if None, uses the full dataset. Y must be None in this case also.
    y : (str, list, tuple, pd.Index), optional
        Subset of output(s) for column names.
            if None, uses the full dataset (from optional `x` subset)
    covar : (str, list, tuple, pd.Index), optional
        set of covariate(s). Covariates are needed to compute partial correlations.
            If None, uses standard correlation.
    cartesian_covar : bool, default=False
        If True, and if covar is not None, separates every
            element in covar to individually control for
        using the cartesian product
    output : str, default="full"
        Choose from {'full', 'score'}. Score just returns `r` number.
    method : str, default="spearman"
        Method to correlate with. Choose from:
            'pearson' : Pearson product-moment correlation
            'spearman' : Spearman rank-order correlation
            'kendall' : Kendall's tau (ordinal data)
            'biserial' : Biserial correlation (continuous and boolean data only)
            'percbend' : percentage bend correlation (robust)
            'shepherd' : Shepherd's pi correlation (robust Spearman)
            'skipped' : skipped correlation (robust Spearman, requires sklearn)
    verbose : int, default=0
        If > 0, prints out useful debugging messages

    Returns
    -------
    R : pd.DataFrame
        correlation rows (based on pingouin structure)

    Raises
    ------
    TypeError
        If `covar` is given and its columns are not all float/continuous.
    ValueError
        If the combination of `x` and `y` is not one of the supported cases.

    Examples
    --------
    >>> import turbopanda as turb
    >>> data = turb.read('example.json')
    >>> R = turb.correlate(data) # uses full dataset
                 X         M         Y      Mbin      Ybin
    X     1.000000  0.392251  0.059771 -0.014405 -0.149210
    M     0.392251  1.000000  0.545618 -0.015622 -0.094309
    Y     0.059771  0.545618  1.000000 -0.007009  0.161334
    Mbin -0.014405 -0.015622 -0.007009  1.000000 -0.076614
    Ybin -0.149210 -0.094309  0.161334 -0.076614  1.000000
    >>> R = turb.correlate(data, x=('X', 'M', 'Y')) # uses subset of dataset
              X         M         Y
    X  1.000000  0.392251  0.059771
    M  0.392251  1.000000  0.545618
    Y  0.059771  0.545618  1.000000
    >>> # correlates X columns against Ybin
    >>> R = turb.correlate(data, x=('X', 'M', 'Y'), y='Ybin')
                 X         M         Y
    Ybin  1.000000  0.392251  0.059771
    >>> # correlates X against Ybin controlling for Y
    >>> R = turb.correlate(data, x='X', y='Ybin', covar='Y')
                 X
    Ybin -0.149210
    >>> # using a different technique
    >>> R = turb.correlate(data, method="shepherd")
                 X         M         Y      Mbin      Ybin
    X     1.000000  0.392251  0.059771 -0.014405 -0.149210
    M     0.392251  1.000000  0.545618 -0.015622 -0.094309
    Y     0.059771  0.545618  1.000000 -0.007009  0.161334
    Mbin -0.014405 -0.015622 -0.007009  1.000000 -0.076614
    Ybin -0.149210 -0.094309  0.161334 -0.076614  1.000000
    """
    # data cannot be NONE
    instance_check(data, (pd.DataFrame, MetaPanda))
    instance_check((x, y, covar), (type(None), str, list, tuple, pd.Index))
    instance_check(cartesian_covar, bool)
    belongs(
        method,
        (
            "pearson",
            "spearman",
            "kendall",
            "biserial",
            "percbend",
            "shepherd",
            "skipped",
        ),
    )
    belongs(output, ("full", "score"))
    bounds_check(verbose, 0, 4)

    seq_types = (list, tuple, pd.Index)

    # downcast to dataframe option
    df = data.df_ if not isinstance(data, pd.DataFrame) else data
    # downcast if list/tuple/pd.index is of length 1
    x = x[0] if (isinstance(x, seq_types) and len(x) == 1) else x
    y = y[0] if (isinstance(y, seq_types) and len(y) == 1) else y

    # convert using `pattern` if we have string instances.
    if isinstance(x, str):
        x = pattern(x, df.columns)
    if isinstance(y, str):
        y = pattern(y, df.columns)
    if isinstance(covar, str):
        covar = pattern(covar, df.columns)

    # perform a check to make sure every column in `covar` is continuous.
    # The downcast frame is indexed (with a list, so tuples select several columns)
    # so DataFrame and MetaPanda inputs are checked the same way.
    if covar is not None and not is_dataframe_float(df[list(covar)]):
        raise TypeError(
            "`covar` variables in `correlate` all must be of type `float`/continuous."
        )

    # execute various use cases based on the presence of x, y, and covar, respectively.
    if x is None and y is None:
        # here just perform matrix-based correlation
        comb = it.combinations_with_replacement(df.columns, 2)
        n_cols = df.columns.shape[0]
        # number of unordered pairs with repetition: n(n+1)/2
        niter = n_cols * (n_cols + 1) // 2
    elif isinstance(x, seq_types) and y is None:
        # use a subset of x, in union with covar
        comb = it.combinations_with_replacement(x, 2)
        niter = len(x) * (len(x) + 1) // 2
    elif isinstance(x, seq_types) and isinstance(y, str):
        # list of x, y str -> matrix-vector cartesian product
        comb = it.product(x, [y])
        niter = len(x)
    elif isinstance(y, seq_types) and isinstance(x, str):
        # list of y, x str -> matrix-vector cartesian product
        comb = it.product(y, [x])
        niter = len(y)
    elif isinstance(x, seq_types) and isinstance(y, seq_types):
        # list of x, y -> cartesian product of x: y terms
        comb = it.product(x, y)
        niter = len(x) * len(y)
    else:
        raise ValueError("X: {}; Y: {}; Z: {} combination unknown.".format(x, y, covar))

    # return the combination of these effects.
    return _corr_combination(
        df, comb, niter, covar, cartesian_covar, method, output, verbose
    )
def melt(
    df,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name=None,
    index_name="index",
    include_index=True,
    include_regex=True,
    include_question_guess=True,
):
    """Unpivot a DataFrame from wide format to long format, optionally leaving identifier variables set.

    .. note:: Does not accept MultiIndex pandas.DataFrames.

    Parameters
    ----------
    df : DataFrame
    id_vars : str, tuple, list or ndarray, optional
        Column(s) to use as identifier variables.
        If None: when `value_vars` is given, every other column is used;
            otherwise no identifier columns are used
        If str: uses a regex pattern if `include_regex` is True, else it is
            treated as a single column name
    value_vars : str, tuple, list, or ndarray, optional
        Column(s) to unpivot. If not specified, uses all columns that
        are not set as `id_vars`
        If str: uses a regex pattern if `include_regex` is True, else it is
            treated as a single column name
    var_name : str, optional
        Name to use for the `variable` column. If None, a name is guessed from
        `common_substrings` of the value columns, falling back to the name of
        `df.columns` and finally to "variable"
    value_name : str, optional
        Name to use for the `value` column. If None, "value" is used
    index_name : str, default="index"
        A name to give to the index if it doesn't have a name value
    include_index : bool, default=True
        If True, it includes the current index column(s) into the `id_vars`
    include_regex : bool, default=True
        If True, uses regular expressions for `id_vars` and `value_vars` if they are `str`
    include_question_guess : bool, default=True
        If True, guessed variable names have a question mark `?` after them

    Returns
    -------
    dfn : pd.DataFrame
        New melted DataFrame

    See Also
    --------
    pandas.DataFrame.melt
    pandas.DataFrame.pivot_table
    """
    # check inputs
    instance_check(df, pd.DataFrame)
    instance_check(
        (id_vars, value_vars),
        (type(None), str, list, tuple, np.ndarray, pd.Series, pd.Index),
    )
    instance_check((var_name, value_name, index_name), (type(None), str))
    instance_check((include_regex, include_question_guess, include_index), bool)

    _columns = df.columns.tolist()
    _index = df.index

    # a string is either a regex to expand or a single column name; it must never
    # reach `list(...)` below, which would split it into characters
    if isinstance(id_vars, str):
        id_vars = pattern(id_vars, df) if include_regex else [id_vars]
    if isinstance(value_vars, str):
        value_vars = pattern(value_vars, df) if include_regex else [value_vars]

    if id_vars is None:
        if value_vars is None:
            id_vars = []
        else:
            # everything that is not a value column, in original column order
            taken = set(value_vars)
            id_vars = [col for col in _columns if col not in taken]
    else:
        id_vars = list(id_vars)

    if value_vars is None:
        # everything that is not an identifier column, in original column order
        taken = set(id_vars)
        value_vars = [col for col in _columns if col not in taken]
    else:
        value_vars = list(value_vars)

    # if we include the index, we need to reset it
    if include_index:
        # add in the index cols into the data; an unnamed index comes out as "index"
        df = df.reset_index().rename(columns={"index": index_name})
        id_vars.append(_index.name if _index.name is not None else index_name)

    # update var_name
    if var_name is None:
        # guess a name from the common substrings of the value columns
        valns = common_substrings(value_vars)
        columns_name = df.columns.name
        if isinstance(valns, pd.Series) and valns.shape[0] > 0:
            _var_name = valns.idxmax()
        elif isinstance(valns, str):
            _var_name = valns
        elif isinstance(columns_name, str) and columns_name != "":
            # `columns.name` defaults to None, which must not be used as a name
            _var_name = columns_name
        else:
            _var_name = "variable"
        # if we have question guess, add it on
        if include_question_guess:
            _var_name += "?"
    else:
        _var_name = var_name

    _value_name = "value" if value_name is None else value_name

    return pd.melt(
        df,
        id_vars=id_vars,
        value_vars=value_vars,
        var_name=_var_name,
        value_name=_value_name,
    )