Exemplo n.º 1
0
def transform_dict_like(obj, func, *args, **kwargs):
    """
    Compute transform in the case of a dict-like func.

    Every key of ``func`` selects one column of ``obj`` through
    ``obj._gotitem(key, ndim=1)``; the matching value is handed to
    ``transform`` for that column and the per-key results are concatenated
    along ``axis=1``.

    Parameters
    ----------
    obj : object exposing ``ndim``, ``_gotitem`` and, when ``ndim != 1``,
        ``columns`` (presumably a DataFrame or Series -- confirm with caller)
    func : dict
        Mapping of key -> transform specification.
    *args, **kwargs
        Forwarded unchanged to ``transform``.

    Returns
    -------
    The result of ``concat(results, axis=1)``.

    Raises
    ------
    ValueError
        If ``func`` is empty, or if no key produced a result.
    SpecificationError
        If ``obj`` is not 1-dimensional and a key is missing from
        ``obj.columns``, or if any value of ``func`` is itself a dict.
    Exception
        Any exception raised by ``transform`` whose message is exactly
        "Function did not transform" is re-raised unchanged.
    """
    from pandas.core.reshape.concat import concat

    if len(func) == 0:
        # Without this guard an empty mapping would fall through to the
        # generic "Transform function failed" error below, which wrongly
        # suggests that a function was actually attempted.
        raise ValueError("No transform functions were provided")

    if obj.ndim != 1:
        # Check for missing columns on a frame
        cols = sorted(set(func.keys()) - set(obj.columns))
        if cols:
            raise SpecificationError(f"Column(s) {cols} do not exist")

    if any(isinstance(v, dict) for v in func.values()):
        # GH 15931 - deprecation of renaming keys
        raise SpecificationError("nested renamer is not supported")

    results = {}
    for name, how in func.items():
        colg = obj._gotitem(name, ndim=1)
        try:
            results[name] = transform(colg, how, 0, *args, **kwargs)
        except Exception as err:
            # Deliberate best effort: a key whose transform fails is dropped
            # from the output.  Only the "did not transform" signal is
            # propagated; a bare ``raise`` keeps the original traceback.
            if str(err) == "Function did not transform":
                raise

    # combine results
    if not results:
        raise ValueError("Transform function failed")
    return concat(results, axis=1)
Exemplo n.º 2
0
def transform_dict_like(
    obj: FrameOrSeries,
    func: AggFuncTypeDict,
    *args,
    **kwargs,
):
    """
    Apply a dict-like transform specification to ``obj``.

    Each key selects a column through ``obj._gotitem(key, ndim=1)`` and the
    associated value is passed to ``transform``.  When at least one value is
    a list, tuple or dict, every other value is wrapped in a one-element
    list first.  The per-key results are concatenated along ``axis=1``.

    Raises
    ------
    ValueError
        If ``func`` is empty or if no key produced a result.
    SpecificationError
        If a key is missing from the columns of a non 1-dimensional ``obj``
        or if any value is dict-like.
    """
    from pandas.core.reshape.concat import concat

    if len(func) == 0:
        raise ValueError("No transform functions were provided")

    if obj.ndim != 1:
        # Every requested key has to name an existing column of the frame.
        missing = sorted(set(func.keys()) - set(obj.columns))
        if len(missing) > 0:
            raise SpecificationError(f"Column(s) {missing} do not exist")

    # Walk items() rather than values(); values() wouldn't work for a Series.
    for _, spec in func.items():
        if is_dict_like(spec):
            # GH 15931 - deprecation of renaming keys
            raise SpecificationError("nested renamer is not supported")

    # If any value is a non-scalar, e.g. {'A': ['mean']}, wrap the scalar
    # values so that all of them are list-likes.
    container_types = (list, tuple, dict)
    if any(isinstance(spec, container_types) for _, spec in func.items()):
        func = {
            key: spec if isinstance(spec, container_types) else [spec]
            for key, spec in func.items()
        }

    # Messages that must reach the caller; any other failure drops the key.
    propagated_messages = (
        "Function did not transform",
        "No transform functions were provided",
    )

    transformed: Dict[Label, FrameOrSeriesUnion] = {}
    for key, spec in func.items():
        column = obj._gotitem(key, ndim=1)
        try:
            transformed[key] = transform(column, spec, 0, *args, **kwargs)
        except Exception as err:
            if str(err) in propagated_messages:
                raise err

    # combine results
    if len(transformed) == 0:
        raise ValueError("Transform function failed")
    return concat(transformed, axis=1)
Exemplo n.º 3
0
def reconstruct_func(
    func: AggFuncType | None, **kwargs
) -> tuple[bool, AggFuncType | None, list[str] | None, list[int] | None]:
    """
    Decide whether relabeling applies and rebuild ``func`` accordingly.

    Relabeling applies when ``func`` is None and
    ``is_multi_agg_with_relabel(**kwargs)`` holds; in that case ``kwargs``
    carries the column / aggregation information and is parsed by
    ``normalize_keyword_aggregation``.  Otherwise ``func`` is returned as
    given, e.g. a string ('min'), a callable, a list of them
    (['min', np.max]) or a dictionary ({'A': 'min'}).

    Parameters
    ----------
    func: agg function (e.g. 'min' or Callable) or list of agg functions
        (e.g. ['min', np.max]) or dictionary (e.g. {'A': ['min', np.max]}).
    **kwargs: dict, kwargs used in is_multi_agg_with_relabel and
        normalize_keyword_aggregation function for relabelling

    Returns
    -------
    relabelling: bool, if there is relabelling or not
    func: normalized and mangled func
    columns: column names, None when there is no relabelling
    order: column indices, None when there is no relabelling

    Raises
    ------
    SpecificationError
        If there is no relabelling and ``func`` is a list with duplicates.
    TypeError
        If there is no relabelling and ``func`` is None.

    Examples
    --------
    >>> reconstruct_func(None, **{"foo": ("col", "min")})
    (True, defaultdict(<class 'list'>, {'col': ['min']}), ('foo',), array([0]))

    >>> reconstruct_func("min")
    (False, 'min', None, None)
    """
    relabeling = func is None and is_multi_agg_with_relabel(**kwargs)

    if relabeling:
        func, columns, order = normalize_keyword_aggregation(kwargs)
        return relabeling, func, columns, order

    if func is None:
        # nicer error message
        raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).")

    if isinstance(func, list) and len(func) > len(set(func)):
        # GH 28426 will raise error if duplicated function names are used and
        # there is no reassigned name
        raise SpecificationError(
            "Function names must be unique if there is no new column names assigned"
        )

    return relabeling, func, None, None
Exemplo n.º 4
0
 def _agg_1dim(name, how, subset=None):
     """
     Aggregate the 1-dimensional selection ``name`` of ``obj`` with ``how``.

     ``obj`` comes from the enclosing scope.  A SpecificationError is raised
     when the selection returned by ``obj._gotitem`` is not 1-dimensional.
     """
     selection = obj._gotitem(name, ndim=1, subset=subset)
     if selection.ndim == 1:
         return selection.aggregate(how)
     raise SpecificationError("nested dictionary is ambiguous in aggregation")
Exemplo n.º 5
0
    def aggregate(self, func=None, *args, **kwargs):
        """
        Aggregate using ``func``.

        * dict or None: the specification is rebuilt with
          ``reconstruct_func`` and handed to ``_apply_agg_function``; when
          relabeling is required the result columns are reordered and
          renamed afterwards.
        * other list-likes: delegated through ``_default_to_pandas``.
        * str: the attribute of that name is looked up on this object and
          called when it is callable.
        * anything else: handed to ``_apply_agg_function``.

        Raises
        ------
        NotImplementedError
            If the groupby axis is not 0.
        SpecificationError
            If a key of the dict specification is not a column of the frame.
        """
        if self._axis != 0:
            # pandas itself does not implement this, hence the custom message
            raise NotImplementedError("axis other than 0 is not supported")

        relabeling_required = False
        if func is None or isinstance(func, dict):
            relabeling_required, parsed, new_columns, order = reconstruct_func(
                func, **kwargs
            )
            # For simplicity, a callable whose name matches an attribute of
            # this object is replaced by that name.
            func_dict = {}
            for key, value in parsed.items():
                if callable(value) and value.__name__ in dir(self):
                    value = value.__name__
                func_dict[key] = value

            for key in func_dict.keys():
                if key not in self._df.columns:
                    from pandas.core.base import SpecificationError

                    raise SpecificationError("nested renamer is not supported")
            func = func_dict
        elif is_list_like(func):
            return self._default_to_pandas(
                lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs),
                *args,
                **kwargs,
            )
        elif isinstance(func, str):
            # "getattr" would mask an AttributeError thrown by __getattr__,
            # so __getattr__ is invoked directly.
            agg_func = self.__getattr__(func)
            if callable(agg_func):
                return agg_func(*args, **kwargs)

        result = self._apply_agg_function(
            func, *args, drop=self._as_index, **kwargs
        )
        if not relabeling_required:
            return result

        result = result.iloc[:, order]
        result.columns = new_columns
        return result
Exemplo n.º 6
0
def transform_dict_like(
    obj: FrameOrSeries,
    func: AggFuncTypeDict,
    *args,
    **kwargs,
):
    """
    Run a dict-like transform specification against ``obj``.

    For every ``key -> how`` pair the column ``obj._gotitem(key, ndim=1)``
    is passed to ``transform`` together with ``how``; the collected results
    are concatenated along ``axis=1``.

    Raises
    ------
    ValueError
        If ``func`` is empty or if no key produced a result.
    SpecificationError
        If a key is missing from the columns of a non 1-dimensional ``obj``
        or if any value is dict-like.
    """
    from pandas.core.reshape.concat import concat

    if len(func) == 0:
        raise ValueError("No transform functions were provided")

    if obj.ndim != 1:
        # On a frame, all keys must be existing columns.
        unknown = set(func.keys()) - set(obj.columns)
        if unknown:
            unknown_sorted = list(safe_sort(list(unknown)))
            raise SpecificationError(f"Column(s) {unknown_sorted} do not exist")

    # items() is used on purpose: values() wouldn't work for a Series.
    for _, spec in func.items():
        if is_dict_like(spec):
            # GH 15931 - deprecation of renaming keys
            raise SpecificationError("nested renamer is not supported")

    # Failures carrying one of these messages are passed on to the caller;
    # any other failing key is left out of the output.
    propagated_messages = (
        "Function did not transform",
        "No transform functions were provided",
    )

    collected: Dict[Hashable, FrameOrSeriesUnion] = {}
    for key, spec in func.items():
        column = obj._gotitem(key, ndim=1)
        try:
            collected[key] = transform(column, spec, 0, *args, **kwargs)
        except Exception as err:
            if str(err) in propagated_messages:
                raise err

    # combine results
    if len(collected) == 0:
        raise ValueError("Transform function failed")
    return concat(collected, axis=1)
Exemplo n.º 7
0
    def aggregate(self, func=None, *args, **kwargs):
        """
        Aggregate using ``func``.

        * dict or None: the specification goes to the query compiler's
          ``groupby_dict_agg`` (None is treated as an empty dict).
        * other list-likes: delegated through ``_default_to_pandas``.
        * str: the attribute of that name is looked up on this object and
          called when it is callable.
        * anything else: wrapped in a lambda and handed to
          ``_apply_agg_function``.

        Raises
        ------
        NotImplementedError
            If the groupby axis is not 0.
        SpecificationError
            If a key of the dict specification is not a column of the frame.
        """
        if self._axis != 0:
            # pandas itself does not implement this, hence the custom message
            raise NotImplementedError("axis other than 0 is not supported")

        if func is None or isinstance(func, dict):
            if func is None:
                func = {}
            elif any(key not in self._df.columns for key in func.keys()):
                from pandas.core.base import SpecificationError

                raise SpecificationError("nested renamer is not supported")

            by_is_compiler = isinstance(self._by, type(self._query_compiler))
            by = list(self._by.columns) if by_is_compiler else self._by

            # For simplicity, a callable whose name matches an attribute of
            # this object is replaced by that name.
            func_dict = {}
            for key, value in func.items():
                if callable(value) and value.__name__ in dir(self):
                    value = value.__name__
                func_dict[key] = value

            # The "by" columns are added to the selection only when all of
            # them are columns of the frame.
            subset_cols = list(func_dict.keys())
            if by_is_compiler and all(
                col in self._df.columns for col in self._by.columns
            ):
                subset_cols = subset_cols + list(self._by.columns)

            new_compiler = self._df[subset_cols]._query_compiler.groupby_dict_agg(
                by=by,
                func_dict=func_dict,
                groupby_args=self._kwargs,
                agg_args=kwargs,
                drop=self._drop,
            )
            return type(self._df)(query_compiler=new_compiler)

        if is_list_like(func):
            return self._default_to_pandas(
                lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs),
                *args,
                **kwargs,
            )

        if isinstance(func, str):
            named_method = getattr(self, func, None)
            if callable(named_method):
                return named_method(*args, **kwargs)

        return self._apply_agg_function(
            lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs),
            *args,
            drop=self._as_index,
            **kwargs,
        )
    def normalize_dictlike_arg(
        self, how: str, obj: FrameOrSeriesUnion, func: AggFuncTypeDict
    ) -> AggFuncTypeDict:
        """
        Validate and normalize a dict-like argument.

        A nested renamer is rejected, and for a non 1-dimensional ``obj``
        every key must be one of ``obj.columns``.  When at least one value is
        a list, tuple or dict, the remaining values are wrapped in
        one-element lists and a new dict is returned; otherwise ``func`` is
        returned unchanged.

        Raises
        ------
        SpecificationError
            If any value is dict-like, or if ``how`` is "agg", ``obj`` is a
            Series and any value is list-like.
        KeyError
            If a key is not a column of a non 1-dimensional ``obj``.
        """
        assert how in ("apply", "agg", "transform")

        # items() is used instead of values(); values() wouldn't work for a
        # Series
        series_agg_with_lists = (
            how == "agg"
            and isinstance(obj, ABCSeries)
            and any(is_list_like(spec) for _, spec in func.items())
        )
        if series_agg_with_lists or any(
            is_dict_like(spec) for _, spec in func.items()
        ):
            # GH 15931 - deprecation of renaming keys
            raise SpecificationError("nested renamer is not supported")

        if obj.ndim != 1:
            # On a frame, all keys must be existing columns.
            missing = set(func.keys()) - set(obj.columns)
            if len(missing) > 0:
                missing_sorted = list(safe_sort(list(missing)))
                raise KeyError(f"Column(s) {missing_sorted} do not exist")

        # If any value is a non-scalar, e.g. {'A': ['mean']}, wrap the
        # scalar values so that all of them are list-likes.
        container_types = (list, tuple, dict)
        if not any(isinstance(spec, container_types) for _, spec in func.items()):
            return func
        return {
            key: spec if isinstance(spec, container_types) else [spec]
            for key, spec in func.items()
        }
Exemplo n.º 9
0
    def validate_dictlike_arg(self, how: str, obj: FrameOrSeriesUnion,
                              func: AggFuncTypeDict) -> None:
        """
        Check a dict-like argument and raise when it is not acceptable.

        A nested renamer is rejected, and for a non 1-dimensional ``obj``
        every key must be one of ``obj.columns``.

        Raises
        ------
        SpecificationError
            If any value is dict-like, or if ``how`` is "agg", ``obj`` is a
            Series and any value is list-like.
        KeyError
            If a key is not a column of a non 1-dimensional ``obj``.
        """
        assert how in ("apply", "agg", "transform")

        # items() is used instead of values(); values() wouldn't work for a
        # Series
        nested = False
        if how == "agg" and isinstance(obj, ABCSeries):
            nested = any(is_list_like(spec) for _, spec in func.items())
        if not nested:
            nested = any(is_dict_like(spec) for _, spec in func.items())
        if nested:
            # GH 15931 - deprecation of renaming keys
            raise SpecificationError("nested renamer is not supported")

        if obj.ndim == 1:
            return

        # On a frame, all keys must be existing columns.
        missing = set(func.keys()) - set(obj.columns)
        if missing:
            missing_sorted = list(safe_sort(list(missing)))
            raise KeyError(f"Column(s) {missing_sorted} do not exist")
Exemplo n.º 10
0
    def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion:
        """
        Aggregate ``self.obj`` with the dict-like specification ``self.f``.

        Parameters
        ----------
        _axis : int, 0 or 1
            Axis to compute aggregation on; only 0 is accepted.

        Returns
        -------
        Result of aggregation: the concatenated per-key results when all of
        them are NDFrames, otherwise a Series built from the scalar results.

        Raises
        ------
        ValueError
            If ``_axis`` is not 0, or if the per-key results are a mix of
            NDFrames and scalars.
        SpecificationError
            For a nested renamer, or when a key of a scalar-only
            specification is not a column of the selected frame.
        KeyError
            When a key of a specification containing list / tuple values is
            not a column of the selected frame.
        """
        obj = self.obj
        arg = cast(AggFuncTypeDict, self.f)

        if _axis != 0:  # pragma: no cover
            raise ValueError("Can only pass dict with axis=0")

        selected_obj = obj._selected_obj
        container_types = (list, tuple, dict)

        # items() is used instead of values() because arg may be a Series
        if any(isinstance(how, container_types) for _, how in arg.items()):
            # At least one non-scalar, e.g. {'A': ['mean']}: wrap the scalar
            # values so that all of them are list-likes.
            normalized: AggFuncTypeDict = {}
            for key, how in arg.items():
                normalized[key] = how if isinstance(how, container_types) else [how]

                # the keys must be in the columns
                # for ndim=2, or renamers for ndim=1

                # ok for now, but deprecated
                # {'A': { 'ra': 'mean' }}
                # {'A': { 'ra': ['mean'] }}
                # {'ra': ['mean']}

                # not ok
                # {'ra' : { 'A' : 'mean' }}
                if isinstance(how, dict) or isinstance(selected_obj, ABCSeries):
                    raise SpecificationError("nested renamer is not supported")
                if (
                    isinstance(selected_obj, ABCDataFrame)
                    and key not in selected_obj.columns
                ):
                    raise KeyError(f"Column '{key}' does not exist!")

            arg = normalized
        else:
            # deprecation of renaming keys
            # GH 15931
            requested = list(arg.keys())
            if isinstance(selected_obj, ABCDataFrame):
                present = selected_obj.columns.intersection(requested)
                if len(present) != len(requested):
                    cols = list(safe_sort(list(set(requested) - set(present))))
                    raise SpecificationError(f"Column(s) {cols} do not exist")

        from pandas.core.reshape.concat import concat

        results = {}
        if selected_obj.ndim == 1:
            # key only used for output
            selection = obj._gotitem(obj._selection, ndim=1)
            for key, how in arg.items():
                results[key] = selection.agg(how)
        else:
            # key used for column selection and output
            for key, how in arg.items():
                results[key] = obj._gotitem(key, ndim=1).agg(how)

        # set the final keys
        keys = list(arg.keys())

        # Computed once so that all() and any() share the isinstance checks
        ndframe_flags = [isinstance(res, ABCNDFrame) for res in results.values()]

        # combine results
        if all(ndframe_flags):
            non_empty = [key for key in keys if not results[key].empty]
            # Fall back to every key when none of the results is non-empty.
            chosen = non_empty or keys
            concat_axis = 0 if isinstance(obj, ABCSeries) else 1
            return concat({key: results[key] for key in chosen}, axis=concat_axis)

        if any(ndframe_flags):
            # There is a mix of NDFrames and scalars
            raise ValueError(
                "cannot perform both aggregation "
                "and transformation operations "
                "simultaneously"
            )

        from pandas import Series

        # we have a dict of scalars
        # GH 36212 use name only if obj is a series
        name = cast("Series", obj).name if obj.ndim == 1 else None
        return Series(results, name=name)
Exemplo n.º 11
0
    def aggregate(self, func=None, *args, **kwargs):
        """
        Aggregate using ``func``.

        * builtin function whose name is an attribute of this object:
          replaced by that name and then treated as a str.
        * dict or None: the specification is rebuilt with
          ``reconstruct_func`` and run through ``_wrap_aggregation``; when
          relabeling is required the result columns are reordered and
          renamed afterwards.
        * other list-likes: delegated through ``_default_to_pandas``.
        * callable: run through ``_wrap_aggregation`` and ``_check_index``.
        * str: the attribute of that name is looked up on this object and
          called when it is callable.

        Raises
        ------
        NotImplementedError
            If the groupby axis is not 0.
        SpecificationError
            If a key of the dict specification is not a column of the frame.
        """
        if self._axis != 0:
            # pandas itself does not implement this, hence the custom message
            raise NotImplementedError("axis other than 0 is not supported")

        is_known_builtin = (
            callable(func)
            and isinstance(func, BuiltinFunctionType)
            and func.__name__ in dir(self)
        )
        if is_known_builtin:
            func = func.__name__

        relabeling_required = False
        if func is None or isinstance(func, dict):

            def to_method_name(fn):
                # Non-string iterables are converted element by element.
                if isinstance(fn, Iterable) and not isinstance(fn, str):
                    return [to_method_name(item) for item in fn]
                if callable(fn) and fn.__name__ in dir(self):
                    return fn.__name__
                return fn

            relabeling_required, parsed, new_columns, order = reconstruct_func(
                func, **kwargs
            )
            func_dict = {}
            for col, fn in parsed.items():
                func_dict[col] = to_method_name(fn)

            aggregates_by_column = (
                relabeling_required
                and not self._as_index
                and any(col in func_dict for col in self._internal_by)
            )
            if aggregates_by_column:
                ErrorMessage.missmatch_with_pandas(
                    operation="GroupBy.aggregate(**dictionary_renaming_aggregation)",
                    message=(
                        "intersection of the columns to aggregate and 'by' is not yet supported when 'as_index=False', "
                        "columns with group names of the intersection will not be presented in the result. "
                        "To achieve the desired result rewrite the original code from:\n"
                        "df.groupby('by_column', as_index=False).agg(agg_func=('by_column', agg_func))\n"
                        "to the:\n"
                        "df.groupby('by_column').agg(agg_func=('by_column', agg_func)).reset_index()"
                    ),
                )

            for col in func_dict.keys():
                if col not in self._df.columns:
                    from pandas.core.base import SpecificationError

                    raise SpecificationError("nested renamer is not supported")
            if func is None:
                # The keyword arguments were the specification itself; they
                # are not forwarded to the aggregation.
                kwargs = {}
            func = func_dict
        elif is_list_like(func):
            return self._default_to_pandas(
                lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs),
                *args,
                **kwargs,
            )
        elif callable(func):
            aggregated = self._wrap_aggregation(
                qc_method=type(self._query_compiler).groupby_agg,
                numeric_only=False,
                agg_func=func,
                agg_args=args,
                agg_kwargs=kwargs,
                how="axis_wise",
            )
            return self._check_index(aggregated)
        elif isinstance(func, str):
            # "getattr" would mask an AttributeError thrown by __getattr__,
            # so __getattr__ is invoked directly.
            named_method = self.__getattr__(func)
            if callable(named_method):
                return named_method(*args, **kwargs)

        result = self._wrap_aggregation(
            qc_method=type(self._query_compiler).groupby_agg,
            numeric_only=False,
            agg_func=func,
            agg_args=args,
            agg_kwargs=kwargs,
            how="axis_wise",
        )
        if not relabeling_required:
            return result

        if not self._as_index:
            # The leading columns of the result that are not covered by the
            # new names keep their position in front of the relabeled ones.
            leading_count = len(result.columns) - len(new_columns)
            order = np.concatenate(
                [np.arange(leading_count), order + leading_count]
            )
            leading_cols = result.columns[:leading_count]
            new_columns = pandas.Index(new_columns)
            if leading_cols.nlevels != new_columns.nlevels:
                leading_cols = leading_cols.remove_unused_levels()
                blank_levels = []
                for position, level in enumerate(leading_cols.levels):
                    if len(level) == 1 and level[0] == "":
                        blank_levels.append(position)
                leading_cols = leading_cols.droplevel(blank_levels)
            new_columns = leading_cols.append(new_columns)

        result = result.iloc[:, order]
        result.columns = new_columns
        return result
Exemplo n.º 12
0
    def aggregate(self, func=None, *args, **kwargs):
        """
        Aggregate using ``func``.

        * builtin function whose name is an attribute of this object:
          replaced by that name and then treated as a str.
        * dict or None: the specification is rebuilt with
          ``reconstruct_func`` and handed to ``_apply_agg_function``; when
          relabeling is required the result columns are reordered and
          renamed afterwards.
        * other list-likes: delegated through ``_default_to_pandas``.
        * callable: wrapped in a lambda and handed to
          ``_apply_agg_function``.
        * str: the attribute of that name is looked up on this object and
          called when it is callable.

        Raises
        ------
        NotImplementedError
            If the groupby axis is not 0.
        SpecificationError
            If a key of the dict specification is not a column of the frame.
        """
        if self._axis != 0:
            # pandas itself does not implement this, hence the custom message
            raise NotImplementedError("axis other than 0 is not supported")

        is_known_builtin = (
            callable(func)
            and isinstance(func, BuiltinFunctionType)
            and func.__name__ in dir(self)
        )
        if is_known_builtin:
            func = func.__name__

        relabeling_required = False
        if func is None or isinstance(func, dict):

            def to_method_name(fn):
                # Non-string iterables are converted element by element.
                if isinstance(fn, Iterable) and not isinstance(fn, str):
                    return [to_method_name(item) for item in fn]
                if callable(fn) and fn.__name__ in dir(self):
                    return fn.__name__
                return fn

            relabeling_required, parsed, new_columns, order = reconstruct_func(
                func, **kwargs
            )
            func_dict = {}
            for col, fn in parsed.items():
                func_dict[col] = to_method_name(fn)

            for col in func_dict.keys():
                if col not in self._df.columns:
                    from pandas.core.base import SpecificationError

                    raise SpecificationError("nested renamer is not supported")
            if func is None:
                # The keyword arguments were the specification itself; they
                # are not forwarded to the aggregation.
                kwargs = {}
            func = func_dict
        elif is_list_like(func):
            return self._default_to_pandas(
                lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs),
                *args,
                **kwargs,
            )
        elif callable(func):
            return self._apply_agg_function(
                lambda grp, *args, **kwargs: grp.aggregate(func, *args, **kwargs),
                *args,
                **kwargs,
            )
        elif isinstance(func, str):
            # "getattr" would mask an AttributeError thrown by __getattr__,
            # so __getattr__ is invoked directly.
            named_method = self.__getattr__(func)
            if callable(named_method):
                return named_method(*args, **kwargs)

        result = self._apply_agg_function(func, *args, **kwargs)
        if not relabeling_required:
            return result

        if not self._as_index:
            # The leading columns of the result that are not covered by the
            # new names keep their position in front of the relabeled ones.
            leading_count = len(result.columns) - len(new_columns)
            order = np.concatenate(
                [np.arange(leading_count), order + leading_count]
            )
            leading_cols = result.columns[:leading_count]
            new_columns = pandas.Index(new_columns)
            if leading_cols.nlevels != new_columns.nlevels:
                leading_cols = leading_cols.remove_unused_levels()
                blank_levels = []
                for position, level in enumerate(leading_cols.levels):
                    if len(level) == 1 and level[0] == "":
                        blank_levels.append(position)
                leading_cols = leading_cols.droplevel(blank_levels)
            new_columns = leading_cols.append(new_columns)

        result = result.iloc[:, order]
        result.columns = new_columns
        return result
Exemplo n.º 13
0
def aggregate(obj, arg: AggFuncType, *args, **kwargs):
    """
    Dispatch an aggregation request according to the type of ``arg``.

    Parameters
    ----------
    obj : object to aggregate
    arg : string, dict, list-like or function
    *args : args to pass on to the function
    **kwargs : kwargs to pass on to the function; an ``_axis`` entry is
        removed and used as the axis, falling back to
        ``getattr(obj, "axis", 0)`` when it is missing or None

    Returns
    -------
    tuple of result, how

    Notes
    -----
    ``how`` is None for the string, list-like and cython-shortcut paths and
    True otherwise.  When none of the paths applies the result is None.
    """
    _axis = kwargs.pop("_axis", None)
    if _axis is None:
        _axis = getattr(obj, "axis", 0)

    if isinstance(arg, str):
        return obj._try_aggregate_string_function(arg, *args, **kwargs), None

    if isinstance(arg, dict):
        # aggregate based on the passed dict
        if _axis != 0:  # pragma: no cover
            raise ValueError("Can only pass dict with axis=0")

        selected_obj = obj._selected_obj
        container_types = (list, tuple, dict)

        if any(isinstance(how, container_types) for how in arg.values()):
            # At least one non-scalar, e.g. {'A': ['mean']}: wrap the scalar
            # values so that all of them are list-likes.
            normalized: Dict[Label, Union[AggFuncTypeBase,
                                          List[AggFuncTypeBase]]] = {}
            for key, how in arg.items():
                normalized[key] = how if isinstance(how, container_types) else [how]

                # the keys must be in the columns
                # for ndim=2, or renamers for ndim=1

                # ok for now, but deprecated
                # {'A': { 'ra': 'mean' }}
                # {'A': { 'ra': ['mean'] }}
                # {'ra': ['mean']}

                # not ok
                # {'ra' : { 'A' : 'mean' }}
                if isinstance(how, dict) or isinstance(selected_obj, ABCSeries):
                    raise SpecificationError("nested renamer is not supported")
                if (
                    isinstance(selected_obj, ABCDataFrame)
                    and key not in selected_obj.columns
                ):
                    raise KeyError(f"Column '{key}' does not exist!")

            arg = normalized
        else:
            # deprecation of renaming keys
            # GH 15931
            requested = list(arg.keys())
            if isinstance(selected_obj, ABCDataFrame):
                present = selected_obj.columns.intersection(requested)
                if len(present) != len(requested):
                    cols = sorted(set(requested) - set(present))
                    raise SpecificationError(f"Column(s) {cols} do not exist")

        from pandas.core.reshape.concat import concat

        results = {}
        if selected_obj.ndim == 1:
            # key only used for output
            selection = obj._gotitem(obj._selection, ndim=1)
            for key, how in arg.items():
                results[key] = selection.agg(how)
        else:
            # key used for column selection and output
            for key, how in arg.items():
                results[key] = obj._gotitem(key, ndim=1).agg(how)

        # set the final keys
        keys = list(arg.keys())

        # Computed once so that all() and any() share the isinstance checks
        ndframe_flags = [isinstance(res, ABCNDFrame) for res in results.values()]

        # combine results
        if all(ndframe_flags):
            non_empty = [key for key in keys if not results[key].empty]
            # Fall back to every key when none of the results is non-empty.
            chosen = non_empty or keys
            concat_axis = 0 if isinstance(obj, ABCSeries) else 1
            combined = concat({key: results[key] for key in chosen}, axis=concat_axis)
            return combined, True

        if any(ndframe_flags):
            # There is a mix of NDFrames and scalars
            raise ValueError(
                "cannot perform both aggregation "
                "and transformation operations "
                "simultaneously"
            )

        from pandas import Series

        # we have a dict of scalars
        # GH 36212 use name only if obj is a series
        name = cast("Series", obj).name if obj.ndim == 1 else None
        return Series(results, name=name), True

    if is_list_like(arg):
        # we require a list, but not an 'str'
        return aggregate_multiple_funcs(obj, arg, _axis=_axis), None

    if callable(arg):
        cython_name = obj._get_cython_func(arg)
        if cython_name and not args and not kwargs:
            return getattr(obj, cython_name)(), None

    # caller can react
    return None, True
Exemplo n.º 14
0
def transform(obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args,
              **kwargs) -> FrameOrSeries:
    """
    Transform a DataFrame or Series.

    Parameters
    ----------
    obj : DataFrame or Series
        Object to compute the transform on.
    func : string, function, list, or dictionary
        Function(s) to compute the transform with.
    axis : {0 or 'index', 1 or 'columns'}
        Axis along which the function is applied:

        * 0 or 'index': apply function to each column.
        * 1 or 'columns': apply function to each row.
    *args
        Positional arguments forwarded to ``func``.
    **kwargs
        Keyword arguments forwarded to ``func``.

    Returns
    -------
    DataFrame or Series
        Result of applying ``func`` along the given axis of the
        Series or DataFrame.

    Raises
    ------
    ValueError
        If the transform function fails or does not transform. When the
        failure comes from calling a single str/callable ``func``, the
        original exception is attached as ``__cause__``.
    SpecificationError
        If a dict ``func`` names columns that are not in ``obj`` or
        contains nested dicts.
    """
    is_series = obj.ndim == 1

    if obj._get_axis_number(axis) == 1:
        # Row-wise transform: transpose, transform column-wise, transpose back.
        assert not is_series
        return transform(obj.T, func, 0, *args, **kwargs).T

    if isinstance(func, list):
        # Normalize a list to a dict so that only one multi-function path
        # remains below.
        if is_series:
            func = {com.get_callable_name(v) or v: v for v in func}
        else:
            func = {col: func for col in obj}

    if isinstance(func, dict):
        # Local import; only the dict path needs to combine results.
        from pandas.core.reshape.concat import concat

        if not is_series:
            cols = sorted(set(func.keys()) - set(obj.columns))
            if cols:
                raise SpecificationError(f"Column(s) {cols} do not exist")

        if any(isinstance(v, dict) for v in func.values()):
            # GH 15931 - deprecation of renaming keys
            raise SpecificationError("nested renamer is not supported")

        results = {}
        for name, how in func.items():
            colg = obj._gotitem(name, ndim=1)
            try:
                results[name] = transform(colg, how, 0, *args, **kwargs)
            except Exception as err:
                # Best effort per key: a key whose function fails is dropped
                # from the output, but a function that ran without
                # transforming is always an error.
                if str(err) == "Function did not transform":
                    raise

        # combine results
        if not results:
            raise ValueError("Transform function failed")
        return concat(results, axis=1)

    # func is either str or callable
    try:
        if isinstance(func, str):
            result = obj._try_aggregate_string_function(func, *args, **kwargs)
        else:
            f = obj._get_cython_func(func)
            if f and not args and not kwargs:
                result = getattr(obj, f)()
            else:
                try:
                    result = obj.apply(func, args=args, **kwargs)
                except Exception:
                    # Fall back to calling func on the whole object.
                    result = func(obj, *args, **kwargs)
    except Exception as err:
        # Keep the underlying error reachable for debugging.
        raise ValueError("Transform function failed") from err

    is_ndframe = isinstance(result, (ABCSeries, ABCDataFrame))

    # Functions that transform may return empty Series/DataFrame
    # when the dtype is not appropriate
    if is_ndframe and result.empty:
        raise ValueError("Transform function failed")
    # A transform must hand back an object aligned on the input's index.
    if not is_ndframe or not result.index.equals(obj.index):
        raise ValueError("Function did not transform")

    return result
Exemplo n.º 15
0
    def aggregate(self, func=None, *args, **kwargs):
        """
        Aggregate the groups using ``func``.

        Parameters
        ----------
        func : str, callable, list-like, dict or None, default None
            What to aggregate with. ``None`` and dicts are normalized
            through ``reconstruct_func`` (``None`` takes its specification
            from ``kwargs``).
        *args
            Positional arguments forwarded to the aggregation.
        **kwargs
            Keyword arguments forwarded to the aggregation. They are
            discarded after normalization when ``func`` is None.

        Returns
        -------
        The aggregation result. For dict/None input that requires
        relabeling, columns are reordered and renamed as computed by
        ``reconstruct_func``.

        Raises
        ------
        NotImplementedError
            If this object's axis is not 0.
        SpecificationError
            If a key of the normalized dict is not a column of the
            underlying frame.
        """
        if self._axis != 0:
            # This is not implemented in pandas,
            # so we throw a different message
            raise NotImplementedError("axis other than 0 is not supported")

        # A builtin such as ``sum`` is swapped for the same-named method.
        if (
            callable(func)
            and isinstance(func, BuiltinFunctionType)
            and func.__name__ in dir(self)
        ):
            func = func.__name__

        relabeling_required = False
        if isinstance(func, dict) or func is None:
            # Computed once; the helper below may be called many times.
            own_attrs = dir(self)

            def try_get_str_func(fn):
                """Replace callables named like one of our attributes by that name."""
                if not isinstance(fn, str) and isinstance(fn, Iterable):
                    return [try_get_str_func(f) for f in fn]
                # Not every callable has ``__name__`` (e.g. functools.partial
                # objects, callable instances); leave those untouched.
                name = getattr(fn, "__name__", None)
                return name if callable(fn) and name in own_attrs else fn

            relabeling_required, func_dict, new_columns, order = reconstruct_func(
                func, **kwargs
            )
            func_dict = {col: try_get_str_func(fn) for col, fn in func_dict.items()}

            if any(i not in self._df.columns for i in func_dict.keys()):
                from pandas.core.base import SpecificationError

                raise SpecificationError("nested renamer is not supported")
            if func is None:
                # The keyword arguments were the aggregation spec itself.
                kwargs = {}
            func = func_dict
        elif is_list_like(func):
            return self._default_to_pandas(
                lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs),
                *args,
                **kwargs,
            )
        elif callable(func):
            return self._apply_agg_function(
                lambda grp, *args, **kwargs: grp.aggregate(func, *args, **kwargs),
                *args,
                **kwargs,
            )
        elif isinstance(func, str):
            # Using "getattr" here masks possible AttributeError which we throw
            # in __getattr__, so we should call __getattr__ directly instead.
            agg_func = self.__getattr__(func)
            if callable(agg_func):
                return agg_func(*args, **kwargs)

        result = self._apply_agg_function(
            func,
            *args,
            **kwargs,
        )

        if relabeling_required:
            result = result.iloc[:, order]
            result.columns = new_columns
        return result
Exemplo n.º 16
0
def aggregate(obj, arg: AggFuncType, *args, **kwargs):
    """
    Provide an implementation for the aggregators.

    Parameters
    ----------
    obj : object to aggregate
        Must offer the private selection helpers used below
        (``_selected_obj``, ``_selection``, ``_gotitem`` and friends).
    arg : string, dict, function
    *args : args to pass on to the function
    **kwargs : kwargs to pass on to the function
        A ``_axis`` entry, if present, is removed and used as the axis;
        otherwise ``obj.axis`` (default 0) is used.

    Returns
    -------
    tuple of result, how

    Notes
    -----
    ``how`` tells the caller whether post-processing is required. This
    implementation returns ``None`` for a string, a list-like, or a
    callable dispatched to a cython function, and ``True`` otherwise.
    When ``arg`` is none of the handled kinds, ``(None, True)`` is returned
    so that the caller can react.
    """
    is_aggregator = lambda x: isinstance(x, (list, tuple, dict))

    _axis = kwargs.pop("_axis", None)
    if _axis is None:
        _axis = getattr(obj, "axis", 0)

    if isinstance(arg, str):
        return obj._try_aggregate_string_function(arg, *args, **kwargs), None

    if isinstance(arg, dict):
        # aggregate based on the passed dict
        if _axis != 0:  # pragma: no cover
            raise ValueError("Can only pass dict with axis=0")

        selected_obj = obj._selected_obj

        # if we have a dict of any non-scalars
        # eg. {'A' : ['mean']}, normalize all to
        # be list-likes
        if any(is_aggregator(x) for x in arg.values()):
            new_arg: Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]] = {}
            for k, v in arg.items():
                if not isinstance(v, (tuple, list, dict)):
                    new_arg[k] = [v]
                else:
                    new_arg[k] = v

                # the keys must be in the columns
                # for ndim=2, or renamers for ndim=1

                # ok for now, but deprecated
                # {'A': { 'ra': 'mean' }}
                # {'A': { 'ra': ['mean'] }}
                # {'ra': ['mean']}

                # not ok
                # {'ra' : { 'A' : 'mean' }}
                if isinstance(v, dict):
                    raise SpecificationError("nested renamer is not supported")
                elif isinstance(selected_obj, ABCSeries):
                    raise SpecificationError("nested renamer is not supported")
                elif (
                    isinstance(selected_obj, ABCDataFrame)
                    and k not in selected_obj.columns
                ):
                    raise KeyError(f"Column '{k}' does not exist!")

            arg = new_arg

        else:
            # deprecation of renaming keys
            # GH 15931
            keys = list(arg.keys())
            if isinstance(selected_obj, ABCDataFrame) and len(
                selected_obj.columns.intersection(keys)
            ) != len(keys):
                cols = sorted(set(keys) - set(selected_obj.columns.intersection(keys)))
                raise SpecificationError(f"Column(s) {cols} do not exist")

        from pandas.core.reshape.concat import concat

        def _agg_1dim(name, how, subset=None):
            """
            aggregate a 1-dim with how
            """
            colg = obj._gotitem(name, ndim=1, subset=subset)
            if colg.ndim != 1:
                raise SpecificationError(
                    "nested dictionary is ambiguous in aggregation"
                )
            return colg.aggregate(how)

        def _agg_2dim(name, how):
            """
            aggregate a 2-dim with how

            ``name`` is accepted only because ``_agg`` calls every worker
            as ``func(fname, agg_how)``; the whole selection is aggregated
            regardless of the key.
            """
            colg = obj._gotitem(obj._selection, ndim=2, subset=selected_obj)
            return colg.aggregate(how)

        def _agg(arg, func):
            """
            run the aggregations over the arg with func
            return a dict
            """
            return {fname: func(fname, agg_how) for fname, agg_how in arg.items()}

        # set the final keys
        keys = list(arg.keys())

        if obj._selection is not None:

            sl = set(obj._selection_list)

            # we are a Series like object,
            # but may have multiple aggregations
            if len(sl) == 1:

                result = _agg(
                    arg, lambda fname, agg_how: _agg_1dim(obj._selection, agg_how)
                )

            # we are selecting the same set as we are aggregating
            elif not len(sl - set(keys)):

                result = _agg(arg, _agg_1dim)

            # we are a DataFrame, with possibly multiple aggregations
            else:

                result = _agg(arg, _agg_2dim)

        # no selection
        else:

            try:
                result = _agg(arg, _agg_1dim)
            except SpecificationError:

                # we are aggregating expecting all 1d-returns
                # but we have 2d
                result = _agg(arg, _agg_2dim)

        # combine results; ``result`` is always the dict built by ``_agg``

        def is_any_series() -> bool:
            # return a boolean if we have *any* nested series
            return any(isinstance(r, ABCSeries) for r in result.values())

        def is_any_frame() -> bool:
            # return a boolean if we have *any* nested frames
            return any(isinstance(r, ABCDataFrame) for r in result.values())

        if is_any_frame():
            # we have a dict of DataFrames
            # return a MI DataFrame

            keys_to_use = [k for k in keys if not result[k].empty]
            # Have to check, if at least one DataFrame is not empty.
            keys_to_use = keys_to_use if keys_to_use != [] else keys
            return (
                concat([result[k] for k in keys_to_use], keys=keys_to_use, axis=1),
                True,
            )

        elif isinstance(obj, ABCSeries) and is_any_series():

            # we have a dict of Series
            # return a MI Series
            try:
                result = concat(result)
            except TypeError as err:
                # we want to give a nice error here if
                # we have non-same sized objects, so
                # we don't automatically broadcast

                raise ValueError(
                    "cannot perform both aggregation "
                    "and transformation operations "
                    "simultaneously"
                ) from err

            return result, True

        # fall thru
        from pandas import DataFrame, Series

        try:
            result = DataFrame(result)
        except ValueError:
            # we have a dict of scalars

            # GH 36212 use name only if obj is a series
            if obj.ndim == 1:
                obj = cast("Series", obj)
                name = obj.name
            else:
                name = None

            result = Series(result, name=name)

        return result, True
    elif is_list_like(arg):
        # we require a list, but not an 'str'
        return aggregate_multiple_funcs(obj, arg, _axis=_axis), None
    else:
        result = None

    if callable(arg):
        f = obj._get_cython_func(arg)
        if f and not args and not kwargs:
            return getattr(obj, f)(), None

    # caller can react
    return result, True