def transform_dict_like(obj, func, *args, **kwargs):
    """
    Compute transform in the case of a dict-like func.

    Parameters
    ----------
    obj : object
        Object to transform. It must expose ``ndim`` and ``_gotitem`` and,
        when ``ndim != 1``, ``columns``.
    func : dict
        Mapping of label -> transform specification. Each specification is
        forwarded to ``transform`` together with the selected column.
    *args, **kwargs
        Forwarded unchanged to ``transform`` for every entry of ``func``.

    Returns
    -------
    The per-label results combined with ``concat(..., axis=1)``.

    Raises
    ------
    SpecificationError
        If a key of ``func`` is not a column of a non 1-dim ``obj``, or if a
        value of ``func`` is itself a dict (nested renamer).
    ValueError
        If no entry of ``func`` could be transformed.
    """
    from pandas.core.reshape.concat import concat

    if obj.ndim != 1:
        missing = set(func.keys()) - set(obj.columns)
        if missing:
            try:
                cols = sorted(missing)
            except TypeError:
                # Labels of mixed, non-comparable types (e.g. 1 and "b")
                # cannot be sorted directly; order them deterministically so
                # the intended SpecificationError is still raised.
                cols = sorted(missing, key=lambda c: (type(c).__name__, repr(c)))
            raise SpecificationError(f"Column(s) {cols} do not exist")

    if any(isinstance(v, dict) for v in func.values()):
        # GH 15931 - deprecation of renaming keys
        raise SpecificationError("nested renamer is not supported")

    results = {}
    for name, how in func.items():
        colg = obj._gotitem(name, ndim=1)
        try:
            results[name] = transform(colg, how, 0, *args, **kwargs)
        except Exception as err:
            # Best effort: a column whose transform fails is left out of the
            # result.  Only "did not transform" is fatal; the bare ``raise``
            # keeps the original traceback intact.
            if str(err) == "Function did not transform":
                raise

    # combine results
    if not results:
        raise ValueError("Transform function failed")
    return concat(results, axis=1)
def transform_dict_like(
    obj: FrameOrSeries,
    func: AggFuncTypeDict,
    *args,
    **kwargs,
):
    """
    Compute transform in the case of a dict-like func.

    Parameters
    ----------
    obj : Series or DataFrame
        Object to transform.
    func : dict-like
        Mapping of label -> transform specification.  When any value is a
        list, tuple or dict, every other value is wrapped in a one-element
        list before the specifications are forwarded to ``transform``.
    *args, **kwargs
        Forwarded unchanged to ``transform`` for every entry of ``func``.

    Returns
    -------
    The per-label results combined with ``concat(..., axis=1)``.

    Raises
    ------
    ValueError
        If ``func`` is empty, or if no entry of ``func`` could be transformed.
    SpecificationError
        If a key of ``func`` is not a column of a non 1-dim ``obj``, or if a
        value of ``func`` is dict-like (nested renamer).
    """
    from pandas.core.reshape.concat import concat

    # len() rather than truthiness: func may be a Series
    if len(func) == 0:
        raise ValueError("No transform functions were provided")

    if obj.ndim != 1:
        # Check for missing columns on a frame
        missing = set(func.keys()) - set(obj.columns)
        if missing:
            try:
                cols = sorted(missing)
            except TypeError:
                # Labels of mixed, non-comparable types (e.g. 1 and "b")
                # cannot be sorted directly; order them deterministically so
                # the intended SpecificationError is still raised.
                cols = sorted(missing, key=lambda c: (type(c).__name__, repr(c)))
            raise SpecificationError(f"Column(s) {cols} do not exist")

    # Can't use func.values(); wouldn't work for a Series
    if any(is_dict_like(v) for _, v in func.items()):
        # GH 15931 - deprecation of renaming keys
        raise SpecificationError("nested renamer is not supported")

    aggregator_types = (list, tuple, dict)

    # if we have a dict of any non-scalars
    # eg. {'A' : ['mean']}, normalize all to
    # be list-likes
    # Cannot use func.values() because arg may be a Series
    if any(isinstance(v, aggregator_types) for _, v in func.items()):
        func = {
            k: v if isinstance(v, aggregator_types) else [v]
            for k, v in func.items()
        }

    # Messages of errors that must propagate instead of dropping the column.
    fatal_messages = (
        "Function did not transform",
        "No transform functions were provided",
    )

    results: Dict[Label, FrameOrSeriesUnion] = {}
    for name, how in func.items():
        colg = obj._gotitem(name, ndim=1)
        try:
            results[name] = transform(colg, how, 0, *args, **kwargs)
        except Exception as err:
            # Best effort: any other failure just leaves this column out.
            if str(err) in fatal_messages:
                raise

    # combine results
    if len(results) == 0:
        raise ValueError("Transform function failed")
    return concat(results, axis=1)
def reconstruct_func(
    func: AggFuncType | None, **kwargs
) -> tuple[bool, AggFuncType | None, list[str] | None, list[int] | None]:
    """
    Normalize ``func`` and detect whether named aggregation (relabeling) is used.

    With named aggregation ``func`` is None and ``kwargs`` carries the column
    and aggregation function information to be parsed.  Without it, ``func``
    is a string (e.g. 'min'), a Callable, a list of them
    (e.g. ['min', np.max]), or a dictionary of column name to
    str/Callable/list of them (e.g. {'A': 'min'} or
    {'A': [np.min, lambda x: x]}).

    Parameters
    ----------
    func : agg function (e.g. 'min' or Callable) or list of agg functions
        (e.g. ['min', np.max]) or dictionary (e.g. {'A': ['min', np.max]}).
    **kwargs : dict
        Passed to ``is_multi_agg_with_relabel`` and
        ``normalize_keyword_aggregation`` for relabelling.

    Returns
    -------
    relabelling : bool
        Whether relabelling applies.
    func
        The normalized func when relabelling, otherwise ``func`` unchanged.
    columns
        Column names when relabelling, otherwise None.
    order
        Column indices when relabelling, otherwise None.

    Raises
    ------
    SpecificationError
        If ``func`` is a list containing duplicates and there is no
        relabelling.
    TypeError
        If ``func`` is None and ``kwargs`` does not describe a relabelling.

    Examples
    --------
    >>> reconstruct_func(None, **{"foo": ("col", "min")})
    (True, defaultdict(<class 'list'>, {'col': ['min']}), ('foo',), array([0]))

    >>> reconstruct_func("min")
    (False, 'min', None, None)
    """
    relabeling = func is None and is_multi_agg_with_relabel(**kwargs)

    if relabeling:
        normalized, columns, order = normalize_keyword_aggregation(kwargs)
        return relabeling, normalized, columns, order

    if func is None:
        # nicer error message
        raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).")

    if isinstance(func, list) and len(set(func)) < len(func):
        # GH 28426 will raise error if duplicated function names are used and
        # there is no reassigned name
        raise SpecificationError(
            "Function names must be unique if there is no new column names "
            "assigned"
        )

    return relabeling, func, None, None
def _agg_1dim(name, how, subset=None):
    """
    Aggregate the 1-dim selection ``name`` of ``obj`` with ``how``.

    ``obj`` is not a parameter; it is resolved from the enclosing scope.

    Raises
    ------
    SpecificationError
        If the selection returned by ``obj._gotitem`` is not 1-dimensional.
    """
    column = obj._gotitem(name, ndim=1, subset=subset)
    if column.ndim == 1:
        return column.aggregate(how)
    raise SpecificationError("nested dictionary is ambiguous in aggregation")
def aggregate(self, func=None, *args, **kwargs):
    """
    Aggregate the groups with ``func``.

    Parameters
    ----------
    func : dict, list-like, str or None
        * dict or None: normalised through ``reconstruct_func``; with None
          the aggregation is described by ``kwargs`` (named aggregation).
        * other list-like: delegated to ``self._default_to_pandas``.
        * str: resolved through ``self.__getattr__`` and called when the
          attribute is callable.
    *args, **kwargs
        Forwarded to the aggregation.  When ``func`` is None the keyword
        arguments are the named-aggregation specs and are not forwarded.

    Returns
    -------
    The aggregation result; with named aggregation its columns are
    reordered by ``order`` and renamed to ``new_columns``.

    Raises
    ------
    NotImplementedError
        If ``self._axis`` is not 0.
    SpecificationError
        If a key of the normalised dict is not in ``self._df.columns``.
    """
    if self._axis != 0:
        # This is not implemented in pandas,
        # so we throw a different message
        raise NotImplementedError("axis other than 0 is not supported")

    relabeling_required = False
    if isinstance(func, dict) or func is None:

        def _as_method_name(fn):
            # We convert to the string version of the function for simplicity.
            # getattr() guards callables that carry no __name__
            # (e.g. functools.partial objects).
            name = getattr(fn, "__name__", None)
            if callable(fn) and name is not None and name in dir(self):
                return name
            return fn

        relabeling_required, func_dict, new_columns, order = reconstruct_func(
            func, **kwargs
        )
        func_dict = {k: _as_method_name(v) for k, v in func_dict.items()}

        if any(i not in self._df.columns for i in func_dict.keys()):
            from pandas.core.base import SpecificationError

            raise SpecificationError("nested renamer is not supported")
        if func is None:
            # The keyword arguments were the named-aggregation specs and have
            # been folded into func_dict; they must not be forwarded to the
            # aggregation as extra keyword arguments.
            kwargs = {}
        func = func_dict
    elif is_list_like(func):
        return self._default_to_pandas(
            lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs),
            *args,
            **kwargs,
        )
    elif isinstance(func, str):
        # Using "getattr" here masks possible AttributeError which we throw
        # in __getattr__, so we should call __getattr__ directly instead.
        agg_func = self.__getattr__(func)
        if callable(agg_func):
            return agg_func(*args, **kwargs)

    result = self._apply_agg_function(
        func,
        drop=self._as_index,
        *args,
        **kwargs,
    )

    if relabeling_required:
        result = result.iloc[:, order]
        result.columns = new_columns

    return result
def transform_dict_like(
    obj: FrameOrSeries,
    func: AggFuncTypeDict,
    *args,
    **kwargs,
):
    """
    Transform ``obj`` label by label according to a dict-like ``func``.

    Parameters
    ----------
    obj : Series or DataFrame
        Object to transform.
    func : dict-like
        Mapping of label -> transform specification; each specification is
        forwarded to ``transform`` together with the selected column.
    *args, **kwargs
        Forwarded unchanged to ``transform`` for every entry of ``func``.

    Returns
    -------
    The per-label results combined with ``concat(..., axis=1)``.

    Raises
    ------
    ValueError
        If ``func`` is empty, or if no entry of ``func`` could be transformed.
    SpecificationError
        If a key of ``func`` is not a column of a non 1-dim ``obj``, or if a
        value of ``func`` is dict-like (nested renamer).
    """
    from pandas.core.reshape.concat import concat

    if len(func) == 0:
        raise ValueError("No transform functions were provided")

    if obj.ndim != 1:
        # Check for missing columns on a frame
        absent = set(func.keys()) - set(obj.columns)
        if len(absent) > 0:
            cols_sorted = list(safe_sort(list(absent)))
            raise SpecificationError(f"Column(s) {cols_sorted} do not exist")

    # Can't use func.values(); wouldn't work for a Series
    for _, spec in func.items():
        if is_dict_like(spec):
            # GH 15931 - deprecation of renaming keys
            raise SpecificationError("nested renamer is not supported")

    # Errors carrying one of these messages propagate; any other failure
    # only leaves the affected label out of the result.
    fatal_messages = (
        "Function did not transform",
        "No transform functions were provided",
    )

    transformed: Dict[Hashable, FrameOrSeriesUnion] = {}
    for label, how in func.items():
        column = obj._gotitem(label, ndim=1)
        try:
            transformed[label] = transform(column, how, 0, *args, **kwargs)
        except Exception as err:
            if str(err) in fatal_messages:
                raise err

    # combine results
    if not transformed:
        raise ValueError("Transform function failed")
    return concat(transformed, axis=1)
def aggregate(self, func=None, *args, **kwargs):
    """
    Aggregate the groups with ``func``.

    Parameters
    ----------
    func : dict, list-like, str, callable or None
        * dict or None (None is treated as an empty dict): executed through
          the query compiler's ``groupby_dict_agg``.
        * other list-like: delegated to ``self._default_to_pandas``.
        * str: when ``getattr(self, func, None)`` is callable it is called
          with ``*args, **kwargs``.
        * anything else: passed to ``self._apply_agg_function``.
    *args, **kwargs
        Forwarded to the selected aggregation.

    Raises
    ------
    NotImplementedError
        If ``self._axis`` is not 0.
    SpecificationError
        If ``func`` is a dict with a key that is not in ``self._df.columns``.
    """
    if self._axis != 0:
        # This is not implemented in pandas,
        # so we throw a different message
        raise NotImplementedError("axis other than 0 is not supported")

    if func is None or isinstance(func, dict):
        if func is None:
            func = {}
        elif any(key not in self._df.columns for key in func.keys()):
            from pandas.core.base import SpecificationError

            raise SpecificationError("nested renamer is not supported")

        by_is_compiler = isinstance(self._by, type(self._query_compiler))
        by = list(self._by.columns) if by_is_compiler else self._by

        # We convert to the string version of the function for simplicity.
        func_dict = {}
        for key, fn in func.items():
            if callable(fn) and fn.__name__ in dir(self):
                func_dict[key] = fn.__name__
            else:
                func_dict[key] = fn

        # Columns handed to the query compiler: the aggregated ones plus the
        # "by" columns when all of them are present in the frame.
        subset_cols = list(func_dict.keys())
        if by_is_compiler and all(
            c in self._df.columns for c in self._by.columns
        ):
            subset_cols = subset_cols + list(self._by.columns)

        frame_type = type(self._df)
        aggregated = self._df[subset_cols]._query_compiler.groupby_dict_agg(
            by=by,
            func_dict=func_dict,
            groupby_args=self._kwargs,
            agg_args=kwargs,
            drop=self._drop,
        )
        return frame_type(query_compiler=aggregated)

    if is_list_like(func):
        return self._default_to_pandas(
            lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs),
            *args,
            **kwargs,
        )

    if isinstance(func, str):
        method = getattr(self, func, None)
        if callable(method):
            return method(*args, **kwargs)

    return self._apply_agg_function(
        lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs),
        drop=self._as_index,
        *args,
        **kwargs,
    )
def normalize_dictlike_arg(
    self, how: str, obj: FrameOrSeriesUnion, func: AggFuncTypeDict
) -> AggFuncTypeDict:
    """
    Validate and normalize a dict-like argument.

    Checks that a nested renamer is not passed and, when ``obj`` is not
    1-dimensional, that every key of ``func`` is one of its columns.  When
    the values are a mix of list/tuple/dict and other objects, the other
    objects are wrapped in one-element lists.

    Parameters
    ----------
    how : {"apply", "agg", "transform"}
    obj : Series or DataFrame
    func : dict-like

    Returns
    -------
    ``func`` itself, or a new dict with all values normalized to list-likes.

    Raises
    ------
    SpecificationError
        If a value is dict-like, or if ``how == "agg"``, ``obj`` is a Series
        and a value is list-like.
    KeyError
        If ``obj`` is not 1-dimensional and a key is not in ``obj.columns``.
    """
    assert how in ("apply", "agg", "transform")

    # Can't use func.values(); wouldn't work for a Series
    series_agg_with_lists = (
        how == "agg"
        and isinstance(obj, ABCSeries)
        and any(is_list_like(value) for _, value in func.items())
    )
    if series_agg_with_lists or any(
        is_dict_like(value) for _, value in func.items()
    ):
        # GH 15931 - deprecation of renaming keys
        raise SpecificationError("nested renamer is not supported")

    if obj.ndim != 1:
        # Check for missing columns on a frame
        absent = set(func.keys()) - set(obj.columns)
        if len(absent) > 0:
            cols_sorted = list(safe_sort(list(absent)))
            raise KeyError(f"Column(s) {cols_sorted} do not exist")

    aggregator_types = (list, tuple, dict)

    # if we have a dict of any non-scalars
    # eg. {'A' : ['mean']}, normalize all to
    # be list-likes
    # Cannot use func.values() because arg may be a Series
    if not any(isinstance(value, aggregator_types) for _, value in func.items()):
        return func

    normalized: AggFuncTypeDict = {
        key: value if isinstance(value, aggregator_types) else [value]
        for key, value in func.items()
    }
    return normalized
def validate_dictlike_arg(
    self, how: str, obj: FrameOrSeriesUnion, func: AggFuncTypeDict
) -> None:
    """
    Raise if a dict-like argument is invalid.

    Checks that a nested renamer is not passed and, when ``obj`` is not
    1-dimensional, that every key of ``func`` is one of its columns.

    Parameters
    ----------
    how : {"apply", "agg", "transform"}
    obj : Series or DataFrame
    func : dict-like

    Raises
    ------
    SpecificationError
        If a value is dict-like, or if ``how == "agg"``, ``obj`` is a Series
        and a value is list-like.
    KeyError
        If ``obj`` is not 1-dimensional and a key is not in ``obj.columns``.
    """
    assert how in ("apply", "agg", "transform")

    # Can't use func.values(); wouldn't work for a Series
    series_agg_with_lists = (
        how == "agg"
        and isinstance(obj, ABCSeries)
        and any(is_list_like(value) for _, value in func.items())
    )
    if series_agg_with_lists or any(
        is_dict_like(value) for _, value in func.items()
    ):
        # GH 15931 - deprecation of renaming keys
        raise SpecificationError("nested renamer is not supported")

    if obj.ndim == 1:
        return

    # Check for missing columns on a frame
    absent = set(func.keys()) - set(obj.columns)
    if len(absent) > 0:
        cols_sorted = list(safe_sort(list(absent)))
        raise KeyError(f"Column(s) {cols_sorted} do not exist")
def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion:
    """
    Aggregate ``self.obj`` according to the dict-like ``self.f``.

    Parameters
    ----------
    _axis : int, 0 or 1
        Axis to compute aggregation on.  Only 0 is accepted.

    Returns
    -------
    Result of aggregation: a concatenation of the per-key results when all
    of them are NDFrames, otherwise a Series of the per-key results.

    Raises
    ------
    ValueError
        If ``_axis`` is not 0, or if the per-key results mix NDFrames and
        other objects.
    SpecificationError
        If a nested renamer is passed, or (all values scalar) a key is not a
        column of the selected DataFrame.
    KeyError
        If a value is a list/tuple and its key is not a column of the
        selected DataFrame.
    """
    obj = self.obj
    spec = cast(AggFuncTypeDict, self.f)
    aggregator_types = (list, tuple, dict)

    if _axis != 0:  # pragma: no cover
        raise ValueError("Can only pass dict with axis=0")

    selected_obj = obj._selected_obj

    # if we have a dict of any non-scalars
    # eg. {'A' : ['mean']}, normalize all to
    # be list-likes
    # Cannot use arg.values() because arg may be a Series
    if any(isinstance(value, aggregator_types) for _, value in spec.items()):
        normalized: AggFuncTypeDict = {}
        for label, value in spec.items():
            # the keys must be in the columns
            # for ndim=2, or renamers for ndim=1

            # ok for now, but deprecated
            # {'A': { 'ra': 'mean' }}
            # {'A': { 'ra': ['mean'] }}
            # {'ra': ['mean']}

            # not ok
            # {'ra' : { 'A' : 'mean' }}
            if isinstance(value, dict):
                raise SpecificationError("nested renamer is not supported")
            if isinstance(selected_obj, ABCSeries):
                raise SpecificationError("nested renamer is not supported")
            if (
                isinstance(selected_obj, ABCDataFrame)
                and label not in selected_obj.columns
            ):
                raise KeyError(f"Column '{label}' does not exist!")

            if isinstance(value, aggregator_types):
                normalized[label] = value
            else:
                normalized[label] = [value]
        spec = normalized
    else:
        # deprecation of renaming keys
        # GH 15931
        labels = list(spec.keys())
        if isinstance(selected_obj, ABCDataFrame) and len(
            selected_obj.columns.intersection(labels)
        ) != len(labels):
            present = selected_obj.columns.intersection(labels)
            absent = set(labels) - set(present)
            cols = list(safe_sort(list(absent)))
            raise SpecificationError(f"Column(s) {cols} do not exist")

    from pandas.core.reshape.concat import concat

    results = {}
    if selected_obj.ndim == 1:
        # key only used for output
        column = obj._gotitem(obj._selection, ndim=1)
        for label, how in spec.items():
            results[label] = column.agg(how)
    else:
        # key used for column selection and output
        for label, how in spec.items():
            results[label] = obj._gotitem(label, ndim=1).agg(how)

    # set the final keys
    labels = list(spec.keys())

    # Avoid making two isinstance calls in all and any below
    ndframe_flags = [isinstance(res, ABCNDFrame) for res in results.values()]

    # combine results
    if all(ndframe_flags):
        non_empty = [label for label in labels if not results[label].empty]
        # Have to check, if at least one DataFrame is not empty.
        labels_to_use = non_empty if non_empty != [] else labels
        concat_axis = 0 if isinstance(obj, ABCSeries) else 1
        return concat(
            {label: results[label] for label in labels_to_use}, axis=concat_axis
        )

    if any(ndframe_flags):
        # There is a mix of NDFrames and scalars
        raise ValueError(
            "cannot perform both aggregation "
            "and transformation operations "
            "simultaneously"
        )

    from pandas import Series

    # we have a dict of scalars
    # GH 36212 use name only if obj is a series
    if obj.ndim == 1:
        obj = cast("Series", obj)
        name = obj.name
    else:
        name = None

    return Series(results, name=name)
def aggregate(self, func=None, *args, **kwargs):
    """
    Aggregate the groups with ``func``.

    Parameters
    ----------
    func : dict, list-like, callable, str or None
        * builtin function whose name is an attribute of ``self``: replaced
          by that name and handled as a str.
        * dict or None: normalised through ``reconstruct_func``; with None
          the aggregation is described by ``kwargs`` (named aggregation).
        * other list-like: delegated to ``self._default_to_pandas``.
        * callable: run through ``self._wrap_aggregation`` and
          ``self._check_index``.
        * str: resolved through ``self.__getattr__`` and called when the
          attribute is callable.
    *args, **kwargs
        Forwarded to the selected aggregation.

    Returns
    -------
    The aggregation result; with named aggregation its columns are
    reordered and renamed.

    Raises
    ------
    NotImplementedError
        If ``self._axis`` is not 0.
    SpecificationError
        If a key of the normalised dict is not in ``self._df.columns``.
    """
    if self._axis != 0:
        # This is not implemented in pandas,
        # so we throw a different message
        raise NotImplementedError("axis other than 0 is not supported")

    if (
        callable(func)
        and isinstance(func, BuiltinFunctionType)
        and func.__name__ in dir(self)
    ):
        func = func.__name__

    relabeling_required = False
    if isinstance(func, dict) or func is None:

        def stringify(fn):
            # Replace callables that are also attributes of the groupby
            # object by their name, recursing into non-str iterables.
            if isinstance(fn, Iterable) and not isinstance(fn, str):
                return [stringify(item) for item in fn]
            if callable(fn) and fn.__name__ in dir(self):
                return fn.__name__
            return fn

        relabeling_required, func_dict, new_columns, order = reconstruct_func(
            func, **kwargs
        )
        func_dict = {col: stringify(fn) for col, fn in func_dict.items()}

        by_is_aggregated = (
            relabeling_required
            and not self._as_index
            and any(col in func_dict for col in self._internal_by)
        )
        if by_is_aggregated:
            ErrorMessage.missmatch_with_pandas(
                operation="GroupBy.aggregate(**dictionary_renaming_aggregation)",
                message=(
                    "intersection of the columns to aggregate and 'by' is not yet supported when 'as_index=False', "
                    "columns with group names of the intersection will not be presented in the result. "
                    "To achieve the desired result rewrite the original code from:\n"
                    "df.groupby('by_column', as_index=False).agg(agg_func=('by_column', agg_func))\n"
                    "to the:\n"
                    "df.groupby('by_column').agg(agg_func=('by_column', agg_func)).reset_index()"
                ),
            )

        if any(key not in self._df.columns for key in func_dict.keys()):
            from pandas.core.base import SpecificationError

            raise SpecificationError("nested renamer is not supported")
        if func is None:
            # kwargs held the named-aggregation specs; do not forward them
            kwargs = {}
        func = func_dict
    elif is_list_like(func):
        return self._default_to_pandas(
            lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs),
            *args,
            **kwargs,
        )
    elif callable(func):
        aggregated = self._wrap_aggregation(
            qc_method=type(self._query_compiler).groupby_agg,
            numeric_only=False,
            agg_func=func,
            agg_args=args,
            agg_kwargs=kwargs,
            how="axis_wise",
        )
        return self._check_index(aggregated)
    elif isinstance(func, str):
        # Using "getattr" here masks possible AttributeError which we throw
        # in __getattr__, so we should call __getattr__ directly instead.
        method = self.__getattr__(func)
        if callable(method):
            return method(*args, **kwargs)

    result = self._wrap_aggregation(
        qc_method=type(self._query_compiler).groupby_agg,
        numeric_only=False,
        agg_func=func,
        agg_args=args,
        agg_kwargs=kwargs,
        how="axis_wise",
    )

    if not relabeling_required:
        return result

    if not self._as_index:
        # The leading columns of the result are the "by" columns: keep them
        # in place and shift the requested order past them.
        nby_cols = len(result.columns) - len(new_columns)
        order = np.concatenate([np.arange(nby_cols), order + nby_cols])
        by_cols = result.columns[:nby_cols]
        new_columns = pandas.Index(new_columns)
        if by_cols.nlevels != new_columns.nlevels:
            by_cols = by_cols.remove_unused_levels()
            empty_levels = []
            for position, level in enumerate(by_cols.levels):
                if len(level) == 1 and level[0] == "":
                    empty_levels.append(position)
            by_cols = by_cols.droplevel(empty_levels)
        new_columns = by_cols.append(new_columns)

    result = result.iloc[:, order]
    result.columns = new_columns
    return result
def aggregate(self, func=None, *args, **kwargs):
    """
    Aggregate the groups with ``func``.

    Parameters
    ----------
    func : dict, list-like, callable, str or None
        * builtin function whose name is an attribute of ``self``: replaced
          by that name and handled as a str.
        * dict or None: normalised through ``reconstruct_func``; with None
          the aggregation is described by ``kwargs`` (named aggregation).
        * other list-like: delegated to ``self._default_to_pandas``.
        * callable: wrapped and passed to ``self._apply_agg_function``.
        * str: resolved through ``self.__getattr__`` and called when the
          attribute is callable.
    *args, **kwargs
        Forwarded to the selected aggregation.

    Returns
    -------
    The aggregation result; with named aggregation its columns are
    reordered and renamed.

    Raises
    ------
    NotImplementedError
        If ``self._axis`` is not 0.
    SpecificationError
        If a key of the normalised dict is not in ``self._df.columns``.
    """
    if self._axis != 0:
        # This is not implemented in pandas,
        # so we throw a different message
        raise NotImplementedError("axis other than 0 is not supported")

    if (
        callable(func)
        and isinstance(func, BuiltinFunctionType)
        and func.__name__ in dir(self)
    ):
        func = func.__name__

    relabeling_required = False
    if isinstance(func, dict) or func is None:

        def stringify(fn):
            # Replace callables that are also attributes of the groupby
            # object by their name, recursing into non-str iterables.
            if isinstance(fn, Iterable) and not isinstance(fn, str):
                return [stringify(item) for item in fn]
            if callable(fn) and fn.__name__ in dir(self):
                return fn.__name__
            return fn

        relabeling_required, func_dict, new_columns, order = reconstruct_func(
            func, **kwargs
        )
        func_dict = {col: stringify(fn) for col, fn in func_dict.items()}

        if any(key not in self._df.columns for key in func_dict.keys()):
            from pandas.core.base import SpecificationError

            raise SpecificationError("nested renamer is not supported")
        if func is None:
            # kwargs held the named-aggregation specs; do not forward them
            kwargs = {}
        func = func_dict
    elif is_list_like(func):
        return self._default_to_pandas(
            lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs),
            *args,
            **kwargs,
        )
    elif callable(func):
        return self._apply_agg_function(
            lambda grp, *args, **kwargs: grp.aggregate(func, *args, **kwargs),
            *args,
            **kwargs,
        )
    elif isinstance(func, str):
        # Using "getattr" here masks possible AttributeError which we throw
        # in __getattr__, so we should call __getattr__ directly instead.
        method = self.__getattr__(func)
        if callable(method):
            return method(*args, **kwargs)

    result = self._apply_agg_function(func, *args, **kwargs)

    if not relabeling_required:
        return result

    if not self._as_index:
        # The leading columns of the result are the "by" columns: keep them
        # in place and shift the requested order past them.
        nby_cols = len(result.columns) - len(new_columns)
        order = np.concatenate([np.arange(nby_cols), order + nby_cols])
        by_cols = result.columns[:nby_cols]
        new_columns = pandas.Index(new_columns)
        if by_cols.nlevels != new_columns.nlevels:
            by_cols = by_cols.remove_unused_levels()
            empty_levels = []
            for position, level in enumerate(by_cols.levels):
                if len(level) == 1 and level[0] == "":
                    empty_levels.append(position)
            by_cols = by_cols.droplevel(empty_levels)
        new_columns = by_cols.append(new_columns)

    result = result.iloc[:, order]
    result.columns = new_columns
    return result
def aggregate(obj, arg: AggFuncType, *args, **kwargs):
    """
    provide an implementation for the aggregators

    Parameters
    ----------
    obj : object
        Object to aggregate.  Depending on ``arg`` it must provide
        ``_try_aggregate_string_function``, ``_selected_obj``, ``_selection``,
        ``_gotitem``, ``ndim``, ``name`` or ``_get_cython_func``.
    arg : string, dict, function
    *args : args to pass on to the function
    **kwargs : kwargs to pass on to the function
        The special key ``_axis`` is popped and used as the aggregation axis;
        when it is absent ``getattr(obj, "axis", 0)`` is used.

    Returns
    -------
    tuple of result, how

    Raises
    ------
    ValueError
        If ``arg`` is a dict and the axis is not 0, or if the per-key results
        mix NDFrames and other objects.
    SpecificationError
        If a nested renamer is passed, or (all values scalar) a key is not a
        column of the selected DataFrame.
    KeyError
        If a value is a list/tuple and its key is not a column of the
        selected DataFrame.

    Notes
    -----
    how can be a string describe the required post-processing, or
    None if not required
    """
    aggregator_types = (list, tuple, dict)

    _axis = kwargs.pop("_axis", None)
    if _axis is None:
        _axis = getattr(obj, "axis", 0)

    if isinstance(arg, str):
        return obj._try_aggregate_string_function(arg, *args, **kwargs), None

    if isinstance(arg, dict):
        # aggregate based on the passed dict
        if _axis != 0:  # pragma: no cover
            raise ValueError("Can only pass dict with axis=0")

        selected_obj = obj._selected_obj

        # if we have a dict of any non-scalars
        # eg. {'A' : ['mean']}, normalize all to
        # be list-likes
        if any(isinstance(x, aggregator_types) for x in arg.values()):
            new_arg: Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]] = {}
            for k, v in arg.items():
                if not isinstance(v, aggregator_types):
                    new_arg[k] = [v]
                else:
                    new_arg[k] = v

                # the keys must be in the columns
                # for ndim=2, or renamers for ndim=1

                # ok for now, but deprecated
                # {'A': { 'ra': 'mean' }}
                # {'A': { 'ra': ['mean'] }}
                # {'ra': ['mean']}

                # not ok
                # {'ra' : { 'A' : 'mean' }}
                if isinstance(v, dict):
                    raise SpecificationError("nested renamer is not supported")
                elif isinstance(selected_obj, ABCSeries):
                    raise SpecificationError("nested renamer is not supported")
                elif (
                    isinstance(selected_obj, ABCDataFrame)
                    and k not in selected_obj.columns
                ):
                    raise KeyError(f"Column '{k}' does not exist!")

            arg = new_arg

        else:
            # deprecation of renaming keys
            # GH 15931
            keys = list(arg.keys())
            if isinstance(selected_obj, ABCDataFrame) and len(
                selected_obj.columns.intersection(keys)
            ) != len(keys):
                missing = set(keys) - set(selected_obj.columns.intersection(keys))
                try:
                    cols = sorted(missing)
                except TypeError:
                    # Labels of mixed, non-comparable types (e.g. 1 and "b")
                    # cannot be sorted directly; order them deterministically
                    # so the intended SpecificationError is still raised.
                    cols = sorted(
                        missing, key=lambda c: (type(c).__name__, repr(c))
                    )
                raise SpecificationError(f"Column(s) {cols} do not exist")

        from pandas.core.reshape.concat import concat

        if selected_obj.ndim == 1:
            # key only used for output
            colg = obj._gotitem(obj._selection, ndim=1)
            results = {key: colg.agg(how) for key, how in arg.items()}
        else:
            # key used for column selection and output
            results = {
                key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()
            }

        # set the final keys
        keys = list(arg.keys())

        # Avoid making two isinstance calls in all and any below
        is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()]

        # combine results
        if all(is_ndframe):
            keys_to_use = [k for k in keys if not results[k].empty]
            # Have to check, if at least one DataFrame is not empty.
            keys_to_use = keys_to_use if keys_to_use != [] else keys
            axis = 0 if isinstance(obj, ABCSeries) else 1
            result = concat({k: results[k] for k in keys_to_use}, axis=axis)
        elif any(is_ndframe):
            # There is a mix of NDFrames and scalars
            raise ValueError(
                "cannot perform both aggregation "
                "and transformation operations "
                "simultaneously"
            )
        else:
            from pandas import Series

            # we have a dict of scalars
            # GH 36212 use name only if obj is a series
            if obj.ndim == 1:
                obj = cast("Series", obj)
                name = obj.name
            else:
                name = None

            result = Series(results, name=name)

        return result, True
    elif is_list_like(arg):
        # we require a list, but not an 'str'
        return aggregate_multiple_funcs(obj, arg, _axis=_axis), None
    else:
        result = None

    if callable(arg):
        f = obj._get_cython_func(arg)
        if f and not args and not kwargs:
            return getattr(obj, f)(), None

    # caller can react
    return result, True
def transform(
    obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args, **kwargs
) -> FrameOrSeries:
    """
    Transform a DataFrame or Series

    Parameters
    ----------
    obj : DataFrame or Series
        Object to compute the transform on.
    func : string, function, list, or dictionary
        Function(s) to compute the transform with.
    axis : {0 or 'index', 1 or 'columns'}
        Axis along which the function is applied:

        * 0 or 'index': apply function to each column.
        * 1 or 'columns': apply function to each row.
    *args, **kwargs
        Forwarded to the transform function(s).

    Returns
    -------
    DataFrame or Series
        Result of applying ``func`` along the given axis of the
        Series or DataFrame.

    Raises
    ------
    ValueError
        If the transform function fails or does not transform.  When the
        function itself raised, that exception is attached as the cause.
    SpecificationError
        If ``func`` is a dict with a key that is not a column of a
        DataFrame, or with a dict value (nested renamer).
    """
    from pandas.core.reshape.concat import concat

    is_series = obj.ndim == 1

    if obj._get_axis_number(axis) == 1:
        assert not is_series
        return transform(obj.T, func, 0, *args, **kwargs).T

    if isinstance(func, list):
        if is_series:
            func = {com.get_callable_name(v) or v: v for v in func}
        else:
            func = {col: func for col in obj}

    if isinstance(func, dict):
        if not is_series:
            missing = set(func.keys()) - set(obj.columns)
            if missing:
                try:
                    cols = sorted(missing)
                except TypeError:
                    # Labels of mixed, non-comparable types (e.g. 1 and "b")
                    # cannot be sorted directly; order them deterministically
                    # so the intended SpecificationError is still raised.
                    cols = sorted(
                        missing, key=lambda c: (type(c).__name__, repr(c))
                    )
                raise SpecificationError(f"Column(s) {cols} do not exist")

        if any(isinstance(v, dict) for v in func.values()):
            # GH 15931 - deprecation of renaming keys
            raise SpecificationError("nested renamer is not supported")

        results = {}
        for name, how in func.items():
            colg = obj._gotitem(name, ndim=1)
            try:
                results[name] = transform(colg, how, 0, *args, **kwargs)
            except Exception as err:
                # Best effort: a column whose transform fails is left out.
                # Only "did not transform" is fatal; the bare ``raise`` keeps
                # the original traceback intact.
                if str(err) == "Function did not transform":
                    raise

        # combine results
        if len(results) == 0:
            raise ValueError("Transform function failed")
        return concat(results, axis=1)

    # func is either str or callable
    try:
        if isinstance(func, str):
            result = obj._try_aggregate_string_function(func, *args, **kwargs)
        else:
            f = obj._get_cython_func(func)
            if f and not args and not kwargs:
                result = getattr(obj, f)()
            else:
                try:
                    result = obj.apply(func, args=args, **kwargs)
                except Exception:
                    # fall back to calling func on the whole object
                    result = func(obj, *args, **kwargs)
    except Exception as err:
        # Chain explicitly so the underlying failure is reported as the cause.
        raise ValueError("Transform function failed") from err

    # Functions that transform may return empty Series/DataFrame
    # when the dtype is not appropriate
    if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty:
        raise ValueError("Transform function failed")
    if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals(
        obj.index
    ):
        raise ValueError("Function did not transform")

    return result
def aggregate(self, func=None, *args, **kwargs):
    """
    Aggregate the groups with ``func``.

    Parameters
    ----------
    func : dict, list-like, callable, str or None
        * builtin function whose name is an attribute of ``self``: replaced
          by that name and handled as a str.
        * dict or None: normalised through ``reconstruct_func``; with None
          the aggregation is described by ``kwargs`` (named aggregation).
        * other list-like: delegated to ``self._default_to_pandas``.
        * callable: wrapped and passed to ``self._apply_agg_function``.
        * str: resolved through ``self.__getattr__`` and called when the
          attribute is callable.
    *args, **kwargs
        Forwarded to the selected aggregation.

    Returns
    -------
    The aggregation result; with named aggregation its columns are
    reordered by ``order`` and renamed to ``new_columns``.

    Raises
    ------
    NotImplementedError
        If ``self._axis`` is not 0.
    SpecificationError
        If a key of the normalised dict is not in ``self._df.columns``.
    """
    if self._axis != 0:
        # This is not implemented in pandas,
        # so we throw a different message
        raise NotImplementedError("axis other than 0 is not supported")

    if (
        callable(func)
        and isinstance(func, BuiltinFunctionType)
        and func.__name__ in dir(self)
    ):
        func = func.__name__

    relabeling_required = False
    if isinstance(func, dict) or func is None:

        def stringify(fn):
            # Replace callables that are also attributes of the groupby
            # object by their name, recursing into non-str iterables.
            if isinstance(fn, Iterable) and not isinstance(fn, str):
                return [stringify(item) for item in fn]
            if callable(fn) and fn.__name__ in dir(self):
                return fn.__name__
            return fn

        relabeling_required, func_dict, new_columns, order = reconstruct_func(
            func, **kwargs
        )
        func_dict = {col: stringify(fn) for col, fn in func_dict.items()}

        if any(key not in self._df.columns for key in func_dict.keys()):
            from pandas.core.base import SpecificationError

            raise SpecificationError("nested renamer is not supported")
        if func is None:
            # kwargs held the named-aggregation specs; do not forward them
            kwargs = {}
        func = func_dict
    elif is_list_like(func):
        return self._default_to_pandas(
            lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs),
            *args,
            **kwargs,
        )
    elif callable(func):
        return self._apply_agg_function(
            lambda grp, *args, **kwargs: grp.aggregate(func, *args, **kwargs),
            *args,
            **kwargs,
        )
    elif isinstance(func, str):
        # Using "getattr" here masks possible AttributeError which we throw
        # in __getattr__, so we should call __getattr__ directly instead.
        method = self.__getattr__(func)
        if callable(method):
            return method(*args, **kwargs)

    result = self._apply_agg_function(func, *args, **kwargs)

    if not relabeling_required:
        return result

    result = result.iloc[:, order]
    result.columns = new_columns
    return result
def aggregate(obj, arg: AggFuncType, *args, **kwargs):
    """
    provide an implementation for the aggregators

    Parameters
    ----------
    obj : object
        Object to aggregate.  Depending on ``arg`` it must provide
        ``_try_aggregate_string_function``, ``_selected_obj``, ``_selection``,
        ``_selection_list``, ``_gotitem``, ``ndim``, ``name`` or
        ``_get_cython_func``.
    arg : string, dict, function
    *args : args to pass on to the function
    **kwargs : kwargs to pass on to the function
        The special key ``_axis`` is popped and used as the aggregation axis;
        when it is absent ``getattr(obj, "axis", 0)`` is used.

    Returns
    -------
    tuple of result, how

    Raises
    ------
    ValueError
        If ``arg`` is a dict and the axis is not 0, or if a dict of Series
        cannot be concatenated.
    SpecificationError
        If a nested renamer is passed, or (all values scalar) a key is not a
        column of the selected DataFrame.
    KeyError
        If a value is a list/tuple and its key is not a column of the
        selected DataFrame.

    Notes
    -----
    how can be a string describe the required post-processing, or
    None if not required
    """
    aggregator_types = (list, tuple, dict)

    _axis = kwargs.pop("_axis", None)
    if _axis is None:
        _axis = getattr(obj, "axis", 0)

    if isinstance(arg, str):
        return obj._try_aggregate_string_function(arg, *args, **kwargs), None

    if isinstance(arg, dict):
        # aggregate based on the passed dict
        if _axis != 0:  # pragma: no cover
            raise ValueError("Can only pass dict with axis=0")

        selected_obj = obj._selected_obj

        # if we have a dict of any non-scalars
        # eg. {'A' : ['mean']}, normalize all to
        # be list-likes
        if any(isinstance(x, aggregator_types) for x in arg.values()):
            new_arg: Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]] = {}
            for k, v in arg.items():
                if not isinstance(v, aggregator_types):
                    new_arg[k] = [v]
                else:
                    new_arg[k] = v

                # the keys must be in the columns
                # for ndim=2, or renamers for ndim=1

                # ok for now, but deprecated
                # {'A': { 'ra': 'mean' }}
                # {'A': { 'ra': ['mean'] }}
                # {'ra': ['mean']}

                # not ok
                # {'ra' : { 'A' : 'mean' }}
                if isinstance(v, dict):
                    raise SpecificationError("nested renamer is not supported")
                elif isinstance(selected_obj, ABCSeries):
                    raise SpecificationError("nested renamer is not supported")
                elif (
                    isinstance(selected_obj, ABCDataFrame)
                    and k not in selected_obj.columns
                ):
                    raise KeyError(f"Column '{k}' does not exist!")

            arg = new_arg

        else:
            # deprecation of renaming keys
            # GH 15931
            keys = list(arg.keys())
            if isinstance(selected_obj, ABCDataFrame) and len(
                selected_obj.columns.intersection(keys)
            ) != len(keys):
                missing = set(keys) - set(selected_obj.columns.intersection(keys))
                try:
                    cols = sorted(missing)
                except TypeError:
                    # Labels of mixed, non-comparable types (e.g. 1 and "b")
                    # cannot be sorted directly; order them deterministically
                    # so the intended SpecificationError is still raised.
                    cols = sorted(
                        missing, key=lambda c: (type(c).__name__, repr(c))
                    )
                raise SpecificationError(f"Column(s) {cols} do not exist")

        from pandas.core.reshape.concat import concat

        def _agg_1dim(name, how, subset=None):
            """
            aggregate a 1-dim with how
            """
            colg = obj._gotitem(name, ndim=1, subset=subset)
            if colg.ndim != 1:
                raise SpecificationError(
                    "nested dictionary is ambiguous in aggregation"
                )
            return colg.aggregate(how)

        def _agg_2dim(name, how):
            """
            aggregate a 2-dim with how
            """
            # ``name`` is unused but required: _agg invokes every aggregator
            # as func(fname, agg_how).
            colg = obj._gotitem(obj._selection, ndim=2, subset=selected_obj)
            return colg.aggregate(how)

        def _agg(arg, func):
            """
            run the aggregations over the arg with func
            return a dict
            """
            return {fname: func(fname, agg_how) for fname, agg_how in arg.items()}

        # set the final keys
        keys = list(arg.keys())

        if obj._selection is not None:

            sl = set(obj._selection_list)

            # we are a Series like object,
            # but may have multiple aggregations
            if len(sl) == 1:
                result = _agg(
                    arg, lambda fname, agg_how: _agg_1dim(obj._selection, agg_how)
                )

            # we are selecting the same set as we are aggregating
            elif not len(sl - set(keys)):
                result = _agg(arg, _agg_1dim)

            # we are a DataFrame, with possibly multiple aggregations
            else:
                result = _agg(arg, _agg_2dim)

        # no selection
        else:
            try:
                result = _agg(arg, _agg_1dim)
            except SpecificationError:
                # we are aggregating expecting all 1d-returns
                # but we have 2d
                result = _agg(arg, _agg_2dim)

        # combine results

        def is_any_series() -> bool:
            # return a boolean if we have *any* nested series
            return any(isinstance(r, ABCSeries) for r in result.values())

        def is_any_frame() -> bool:
            # return a boolean if we have *any* nested frame
            return any(isinstance(r, ABCDataFrame) for r in result.values())

        # ``result`` is always the dict built by _agg at this point.
        if is_any_frame():
            # we have a dict of DataFrames
            # return a MI DataFrame
            keys_to_use = [k for k in keys if not result[k].empty]
            # Have to check, if at least one DataFrame is not empty.
            keys_to_use = keys_to_use if keys_to_use != [] else keys
            return (
                concat([result[k] for k in keys_to_use], keys=keys_to_use, axis=1),
                True,
            )

        elif isinstance(obj, ABCSeries) and is_any_series():
            # we have a dict of Series
            # return a MI Series
            try:
                result = concat(result)
            except TypeError as err:
                # we want to give a nice error here if
                # we have non-same sized objects, so
                # we don't automatically broadcast
                raise ValueError(
                    "cannot perform both aggregation "
                    "and transformation operations "
                    "simultaneously"
                ) from err

            return result, True

        # fall thru
        from pandas import DataFrame, Series

        try:
            result = DataFrame(result)
        except ValueError:
            # we have a dict of scalars
            # GH 36212 use name only if obj is a series
            if obj.ndim == 1:
                obj = cast("Series", obj)
                name = obj.name
            else:
                name = None

            result = Series(result, name=name)

        return result, True
    elif is_list_like(arg):
        # we require a list, but not an 'str'
        return aggregate_multiple_funcs(obj, arg, _axis=_axis), None
    else:
        result = None

    if callable(arg):
        f = obj._get_cython_func(arg)
        if f and not args and not kwargs:
            return getattr(obj, f)(), None

    # caller can react
    return result, True