示例#1
0
文件: parsers.py 项目: RehanSD/modin
    def get_dtypes(cls, dtypes_ids):
        """
        Compute one dtype per column that is valid across all partitions.

        Parameters
        ----------
        dtypes_ids : list
            References to the per-partition dtypes objects; they are resolved
            with ``cls.materialize``.

        Returns
        -------
        pandas.Series, dtype or None
            ``None`` when every materialized dtypes object is empty. Otherwise
            the dtypes of the first partition when all partitions agree with
            it, or the per-column result of ``find_common_type_cat`` over all
            partitions (squeezed along axis 0) when they differ.
        """
        # Every materialized element is a Series: column names form the index,
        # the column types seen by that particular partition are the values.
        per_partition = cls.materialize(dtypes_ids)
        if not any(len(part) for part in per_partition):
            return None

        # One column per partition, one row per frame column.
        dtypes_table = pandas.concat(per_partition, axis=1)
        first_partition = dtypes_table.iloc[:, 0]

        if dtypes_table.eq(first_partition, axis=0).all(axis=None):
            # All partitions agree, so the first one describes the whole frame.
            return first_partition

        ErrorMessage.missmatch_with_pandas(
            operation="read_*",
            message="Data types of partitions are different! "
            "Please refer to the troubleshooting section of the Modin documentation "
            "to fix this issue",
        )

        # Partitions disagree: pick a common dtype for each column (row of the
        # table) among the dtypes reported by all partitions.
        common_dtypes = dtypes_table.apply(
            lambda row: find_common_type_cat(row.values),
            axis=1,
        )
        return common_dtypes.squeeze(axis=0)
示例#2
0
文件: groupby.py 项目: RehanSD/modin
    def caller(
        cls,
        query_compiler,
        by,
        map_func,
        reduce_func,
        axis,
        groupby_kwargs,
        agg_args,
        agg_kwargs,
        drop=False,
        method=None,
        default_to_pandas_func=None,
    ):
        """
        Run a GroupBy aggregation using the TreeReduce (map + reduce) scheme.

        Parameters
        ----------
        query_compiler : BaseQueryCompiler
            Frame whose rows/columns are grouped.
        by : BaseQueryCompiler, column or index label, Grouper or list of such
            Object(s) defining the groups.
        map_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
            Function applied to the groupby object during the Map phase.
        reduce_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
            Function applied to the groupby object during the Reduce phase.
        axis : {0, 1}
            Axis to group and aggregate along: 0 is the index axis, 1 is the
            column axis.
        groupby_kwargs : dict
            Arguments forwarded to pandas.DataFrame.groupby.
        agg_args : list-like
            Positional arguments for the aggregation functions.
        agg_kwargs : dict
            Keyword arguments for the aggregation functions.
        drop : bool, default: False
            Whether the by-data originates from the `self` frame.
        method : str, optional
            Name of the GroupBy aggregation; a hint enabling special casing.
        default_to_pandas_func : callable(pandas.DataFrameGroupBy) -> pandas.DataFrame, optional
            pandas equivalent of `map_func + reduce_func`, used when the
            operation defaults to pandas. When omitted, `map_func` is used.

        Returns
        -------
        The same type as `query_compiler`
            QueryCompiler holding the result of the GroupBy aggregation.
        """
        # Default to pandas when grouping along a non-index axis, or when no
        # `level` was given and `by` is either a Grouper or something that is
        # neither a query compiler of the same type nor hashable.
        # Evaluation order (and short-circuiting) mirrors `A or B and C`.
        use_pandas = axis != 0 or (
            groupby_kwargs.get("level", None) is None
            and (
                (not isinstance(by, type(query_compiler)) and not hashable(by))
                or isinstance(by, pandas.Grouper)
            )
        )
        if use_pandas:
            # 'by' may be a 2D query compiler holding the columns to group by, in
            # which case the cast yields a pandas DataFrame; 'GroupBy.validate_by'
            # then splits such a 2D 'by' into a list of 1D Series.
            by = GroupBy.validate_by(try_cast_to_pandas(by, squeeze=True))

            if default_to_pandas_func is None:
                if isinstance(map_func, dict):
                    default_to_pandas_func = lambda grp: grp.agg(map_func)  # noqa: E731
                else:
                    default_to_pandas_func = map_func

            return query_compiler.default_to_pandas(
                lambda df: default_to_pandas_func(
                    df.groupby(by=by, axis=axis, **groupby_kwargs),
                    *agg_args,
                    **agg_kwargs,
                )
            )

        # The bug only shows up for a Categorical 'by', so checking the 'by' dtypes
        # first would be more precise; however, computing 'dtypes' when they are not
        # yet known may be slow, so the check is skipped on purpose.
        effective_kwargs = groupby_kwargs
        if isinstance(by, type(query_compiler)) and not groupby_kwargs.get(
            "sort", True
        ):
            ErrorMessage.missmatch_with_pandas(
                operation="df.groupby(categorical_by, sort=False)",
                message=(
                    "the groupby keys will be sorted anyway, although the 'sort=False' was passed. "
                    "See the following issue for more details: "
                    "https://github.com/modin-project/modin/issues/3571"
                ),
            )
            # Work on a copy so the caller's dictionary is left untouched.
            effective_kwargs = groupby_kwargs.copy()
            effective_kwargs["sort"] = True

        map_fn, reduce_fn = cls.build_map_reduce_functions(
            by=by,
            axis=axis,
            groupby_kwargs=effective_kwargs,
            map_func=map_func,
            reduce_func=reduce_func,
            agg_args=agg_args,
            agg_kwargs=agg_kwargs,
            drop=drop,
            method=method,
        )

        # When `by` carries a ModinFrame, its partitions are broadcasted to every
        # `self` partition in an engine-defined way (modin_frame.groupby_reduce).
        # Otherwise `by` has already been bound to the Map function inside
        # `build_map_reduce_functions`.
        broadcastable_by = getattr(by, "_modin_frame", None)
        if isinstance(map_func, dict):
            apply_indices = list(map_func.keys())
        else:
            apply_indices = None

        reduced_frame = query_compiler._modin_frame.groupby_reduce(
            axis,
            broadcastable_by,
            map_fn,
            reduce_fn,
            apply_indices=apply_indices,
        )

        result = query_compiler.__constructor__(reduced_frame)
        # Drop the placeholder index name if the result carries it.
        if result.index.name == "__reduced__":
            result.index.name = None
        return result
示例#3
0
    def aggregate(self, func=None, *args, **kwargs):
        """
        Aggregate the groups with the given specification.

        Parameters
        ----------
        func : str, callable, dict, list-like or None, default: None
            Aggregation specification. A dict or ``None`` is parsed by
            ``reconstruct_func`` (with ``**kwargs``); a list-like defaults to
            pandas; a callable is passed to the query compiler's
            ``groupby_agg``; a string is first looked up as an attribute of
            this object and called if that attribute is callable.
        *args : iterable
            Positional arguments forwarded to the aggregation.
        **kwargs : dict
            Keyword arguments forwarded to the aggregation. They are consumed
            by ``reconstruct_func`` and not forwarded when `func` is ``None``.

        Returns
        -------
        object
            Result of the aggregation as produced by the chosen code path.

        Raises
        ------
        NotImplementedError
            If the groupby axis is not 0.
        SpecificationError
            If a key of the parsed dictionary specification is not a column
            of the underlying frame.
        """
        if self._axis != 0:
            # pandas has no implementation for this case,
            # hence a message of our own
            raise NotImplementedError("axis other than 0 is not supported")

        def run_groupby_agg(agg_func, agg_kwargs):
            # Single place that hands the work to the query compiler's
            # axis-wise `groupby_agg`.
            return self._wrap_aggregation(
                qc_method=type(self._query_compiler).groupby_agg,
                numeric_only=False,
                agg_func=agg_func,
                agg_args=args,
                agg_kwargs=agg_kwargs,
                how="axis_wise",
            )

        # A builtin function whose name matches an attribute of this object
        # is replaced by that name.
        if (
            callable(func)
            and isinstance(func, BuiltinFunctionType)
            and func.__name__ in dir(self)
        ):
            func = func.__name__

        relabeling_required = False
        if func is None or isinstance(func, dict):

            def to_attr_name(fn):
                # Recurse into non-string iterables; swap a callable for its
                # name when this object exposes an attribute with that name.
                if isinstance(fn, Iterable) and not isinstance(fn, str):
                    return [to_attr_name(item) for item in fn]
                if callable(fn) and fn.__name__ in dir(self):
                    return fn.__name__
                return fn

            relabeling_required, agg_spec, new_columns, order = reconstruct_func(
                func, **kwargs
            )
            agg_spec = {col: to_attr_name(fn) for col, fn in agg_spec.items()}

            if (
                relabeling_required
                and not self._as_index
                and any(col in agg_spec for col in self._internal_by)
            ):
                ErrorMessage.missmatch_with_pandas(
                    operation="GroupBy.aggregate(**dictionary_renaming_aggregation)",
                    message=(
                        "intersection of the columns to aggregate and 'by' is not yet supported when 'as_index=False', "
                        "columns with group names of the intersection will not be presented in the result. "
                        "To achieve the desired result rewrite the original code from:\n"
                        "df.groupby('by_column', as_index=False).agg(agg_func=('by_column', agg_func))\n"
                        "to the:\n"
                        "df.groupby('by_column').agg(agg_func=('by_column', agg_func)).reset_index()"
                    ),
                )

            if not all(col in self._df.columns for col in agg_spec):
                from pandas.core.base import SpecificationError

                raise SpecificationError("nested renamer is not supported")
            if func is None:
                # The keyword arguments were the specification itself.
                kwargs = {}
            func = agg_spec
        elif is_list_like(func):
            return self._default_to_pandas(
                lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs),
                *args,
                **kwargs,
            )
        elif callable(func):
            return self._check_index(run_groupby_agg(func, kwargs))
        elif isinstance(func, str):
            # "getattr" would mask the AttributeError raised from __getattr__,
            # so __getattr__ is invoked directly.
            named_method = self.__getattr__(func)
            if callable(named_method):
                return named_method(*args, **kwargs)

        result = run_groupby_agg(func, kwargs)
        if not relabeling_required:
            return result

        if not self._as_index:
            # The leading columns of the result that are not part of
            # `new_columns` are kept in front; `order` is shifted past them.
            n_by = len(result.columns) - len(new_columns)
            order = np.concatenate([np.arange(n_by), order + n_by])
            by_columns = result.columns[:n_by]
            new_columns = pandas.Index(new_columns)
            if by_columns.nlevels != new_columns.nlevels:
                by_columns = by_columns.remove_unused_levels()
                # Levels consisting solely of the empty string are dropped.
                blank_levels = [
                    pos
                    for pos, level in enumerate(by_columns.levels)
                    if len(level) == 1 and level[0] == ""
                ]
                by_columns = by_columns.droplevel(blank_levels)
            new_columns = by_columns.append(new_columns)

        result = result.iloc[:, order]
        result.columns = new_columns
        return result
示例#4
0
    def __getitem__(self, key):
        """
        Select a subset of columns from a DataFrameGroupBy object.

        Parameters
        ----------
        key : list or str
            Column name(s) to keep from the original object.

        Returns
        -------
        DataFrameGroupBy or SeriesGroupBy
            Groupby object restricted to the requested columns.

        Raises
        ------
        NotImplementedError
            Column lookups on GroupBy with arbitrary Series in by is not yet supported.
        """
        # Arguments shared by both kinds of resulting groupby objects
        shared_kwargs = {**self._kwargs}
        shared_kwargs.update(
            by=self._by,
            axis=self._axis,
            idx_name=self._idx_name,
            squeeze=self._squeeze,
        )

        # How the type of the result is deduced:
        #   1. A list-like `key` or `as_index is False` gives a DataFrameGroupBy
        #   2. Anything else gives a SeriesGroupBy
        #   3. The origin of `by` has no influence on the result type
        # Examples:
        #   - drop: any, as_index: any, __getitem__(key: list_like) -> DataFrameGroupBy
        #   - drop: any, as_index: False, __getitem__(key: any) -> DataFrameGroupBy
        #   - drop: any, as_index: True, __getitem__(key: label) -> SeriesGroupBy
        if not is_list_like(key):
            if self._as_index:
                if (
                    self._is_multi_by
                    and isinstance(self._by, list)
                    and any(not (hashable(o) and o in self._df) for o in self._by)
                ):
                    raise NotImplementedError(
                        "Column lookups on GroupBy with arbitrary Series in by"
                        " is not yet supported."
                    )
                return SeriesGroupBy(
                    self._df[key],
                    drop=False,
                    **shared_kwargs,
                )
            # Single label with `as_index=False`: treat it as a one-element list
            key = [key]

        by_labels = frozenset(self._internal_by)
        if by_labels.intersection(key):
            ErrorMessage.missmatch_with_pandas(
                operation="GroupBy.__getitem__",
                message=(
                    "intersection of the selection and 'by' columns is not yet supported, "
                    "to achieve the desired result rewrite the original code from:\n"
                    "df.groupby('by_column')['by_column']\n"
                    "to the:\n"
                    "df.groupby(df['by_column'].copy())['by_column']"
                ),
            )
        # Keep the 'by' columns next to the selection, in the frame's column order
        wanted = by_labels.union(key)
        selected_columns = [col for col in self._df.columns if col in wanted]
        return DataFrameGroupBy(
            self._df[selected_columns],
            drop=self._drop,
            **shared_kwargs,
        )