def get_dtypes(cls, dtypes_ids):
    """
    Get common for all partitions dtype for each of the columns.

    Parameters
    ----------
    dtypes_ids : list
        Array with references to the partitions dtypes objects.

    Returns
    -------
    frame_dtypes : pandas.Series, dtype or None
        Resulting dtype or pandas.Series where column names are used as index
        and types of columns are used as values for full resulting frame.
        ``None`` is returned when every materialized partition dtypes object
        is empty (this includes an empty `dtypes_ids`).
    """
    # each element in `partitions_dtypes` is a Series, where column names are
    # used as index and types of columns for different partitions are used as values
    partitions_dtypes = cls.materialize(dtypes_ids)
    # `len(...) == 0` is used on purpose: the truth value of a Series is ambiguous
    if all(len(dtype) == 0 for dtype in partitions_dtypes):
        return None

    combined_part_dtypes = pandas.concat(partitions_dtypes, axis=1)
    frame_dtypes = combined_part_dtypes.iloc[:, 0]
    # `concat` labels the columns by partition position, so the first column
    # is named `0`; that label is meaningless to callers, drop it
    frame_dtypes.name = None

    if not combined_part_dtypes.eq(frame_dtypes, axis=0).all(axis=None):
        ErrorMessage.missmatch_with_pandas(
            operation="read_*",
            message="Data types of partitions are different! "
            "Please refer to the troubleshooting section of the Modin documentation "
            "to fix this issue",
        )

        # concat all elements of `partitions_dtypes` and find common dtype
        # for each of the column among all partitions
        frame_dtypes = combined_part_dtypes.apply(
            lambda row: find_common_type_cat(row.values),
            axis=1,
        ).squeeze(axis=0)

    return frame_dtypes
def caller(
    cls,
    query_compiler,
    by,
    map_func,
    reduce_func,
    axis,
    groupby_kwargs,
    agg_args,
    agg_kwargs,
    drop=False,
    method=None,
    default_to_pandas_func=None,
):
    """
    Run a GroupBy aggregation using the TreeReduce (Map + Reduce) scheme.

    Parameters
    ----------
    query_compiler : BaseQueryCompiler
        Frame to group.
    by : BaseQueryCompiler, column or index label, Grouper or list of such
        Object that determine groups.
    map_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
        Function to apply to the `GroupByObject` at the Map phase.
    reduce_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
        Function to apply to the `GroupByObject` at the Reduce phase.
    axis : {0, 1}
        Axis to group and apply aggregation function along.
        0 means index axis when 1 means column axis.
    groupby_kwargs : dict
        Dictionary which carries arguments for pandas.DataFrame.groupby.
    agg_args : list-like
        Positional arguments to pass to the aggregation functions.
    agg_kwargs : dict
        Keyword arguments to pass to the aggregation functions.
    drop : bool, default: False
        Indicates whether or not by-data came from the `self` frame.
    method : str, optional
        Name of the GroupBy aggregation function. This is a hint to be able
        to do special casing.
    default_to_pandas_func : callable(pandas.DataFrameGroupBy) -> pandas.DataFrame, optional
        The pandas aggregation function equivalent to the `map_func + reduce_func`.
        Used in case of defaulting to pandas. If not specified `map_func` is used.

    Returns
    -------
    The same type as `query_compiler`
        QueryCompiler which carries the result of GroupBy aggregation.
    """
    # The TreeReduce path is skipped for non-zero axis, and (when no `level`
    # is given) for a `by` that is neither a query compiler nor hashable, or
    # that is a `pandas.Grouper`.
    fallback_to_pandas = axis != 0 or (
        groupby_kwargs.get("level", None) is None
        and (
            (not isinstance(by, type(query_compiler)) and not hashable(by))
            or isinstance(by, pandas.Grouper)
        )
    )
    if fallback_to_pandas:
        # Since 'by' may be a 2D query compiler holding columns to group by,
        # to_pandas will also produce a pandas DataFrame containing them.
        # So splitting 2D 'by' into a list of 1D Series using 'GroupBy.validate_by':
        pandas_by = GroupBy.validate_by(try_cast_to_pandas(by, squeeze=True))

        if default_to_pandas_func is not None:
            pandas_agg = default_to_pandas_func
        elif isinstance(map_func, dict):
            pandas_agg = lambda grp: grp.agg(map_func)  # noqa: E731
        else:
            pandas_agg = map_func

        return query_compiler.default_to_pandas(
            lambda df: pandas_agg(
                df.groupby(by=pandas_by, axis=axis, **groupby_kwargs),
                *agg_args,
                **agg_kwargs,
            )
        )

    # The bug only occurs in the case of Categorical 'by', so we might want to check whether any of
    # the 'by' dtypes is Categorical before going into this branch, however triggering 'dtypes'
    # computation if they're not computed may take time, so we don't do it
    sort_disabled = not groupby_kwargs.get("sort", True)
    if sort_disabled and isinstance(by, type(query_compiler)):
        ErrorMessage.missmatch_with_pandas(
            operation="df.groupby(categorical_by, sort=False)",
            message=(
                "the groupby keys will be sorted anyway, although the 'sort=False' was passed. "
                "See the following issue for more details: "
                "https://github.com/modin-project/modin/issues/3571"
            ),
        )
        # work on a copy so the caller's dictionary is left untouched
        groupby_kwargs = {**groupby_kwargs, "sort": True}

    mapper, reducer = cls.build_map_reduce_functions(
        by=by,
        axis=axis,
        groupby_kwargs=groupby_kwargs,
        map_func=map_func,
        reduce_func=reduce_func,
        agg_args=agg_args,
        agg_kwargs=agg_kwargs,
        drop=drop,
        method=method,
    )

    # If `by` is a ModinFrame, then its partitions will be broadcasted to every
    # `self` partition in a way determined by engine (modin_frame.groupby_reduce)
    # Otherwise `by` was already bound to the Map function in `build_map_reduce_functions`.
    by_frame = getattr(by, "_modin_frame", None)

    if isinstance(map_func, dict):
        columns_to_apply = list(map_func.keys())
    else:
        columns_to_apply = None

    reduced_frame = query_compiler._modin_frame.groupby_reduce(
        axis, by_frame, mapper, reducer, apply_indices=columns_to_apply
    )

    result = query_compiler.__constructor__(reduced_frame)
    # strip the internal placeholder name from the resulting index
    if result.index.name == "__reduced__":
        result.index.name = None
    return result
def aggregate(self, func=None, *args, **kwargs):
    # Aggregate the groups with `func`, dispatching on its type:
    #   - dict or None (keyword-style renaming aggregation): normalized with
    #     `reconstruct_func` and executed through `_wrap_aggregation`;
    #   - any other list-like: executed through `_default_to_pandas`;
    #   - callable: executed through `_wrap_aggregation`, then `_check_index`;
    #   - str: resolved with `__getattr__` and called if the attribute is callable.
    # Anything that does not return early is executed through `_wrap_aggregation`.
    #
    # Raises NotImplementedError when the groupby axis is not 0 and
    # SpecificationError when a dict key is not a column of the grouped frame.
    #
    # NOTE(review): the original defines no docstring here; presumably one is
    # inherited from elsewhere — confirm before adding a docstring.
    if self._axis != 0:
        # This is not implemented in pandas,
        # so we throw a different message
        raise NotImplementedError("axis other than 0 is not supported")

    # Builtins (e.g. `sum`) that share a name with an attribute of this object
    # are replaced by that name so the string dispatch below handles them.
    if (
        callable(func)
        and isinstance(func, BuiltinFunctionType)
        and func.__name__ in dir(self)
    ):
        func = func.__name__

    relabeling_required = False
    if isinstance(func, dict) or func is None:

        def try_get_str_func(fn):
            if not isinstance(fn, str) and isinstance(fn, Iterable):
                return [try_get_str_func(f) for f in fn]
            # Not every callable has `__name__` (e.g. `functools.partial`
            # objects or callable instances); those are passed through as is
            # instead of failing with AttributeError.
            fn_name = getattr(fn, "__name__", None) if callable(fn) else None
            if fn_name is not None and fn_name in dir(self):
                return fn_name
            return fn

        relabeling_required, func_dict, new_columns, order = reconstruct_func(
            func, **kwargs
        )
        func_dict = {col: try_get_str_func(fn) for col, fn in func_dict.items()}

        if (
            relabeling_required
            and not self._as_index
            and any(col in func_dict for col in self._internal_by)
        ):
            ErrorMessage.missmatch_with_pandas(
                operation="GroupBy.aggregate(**dictionary_renaming_aggregation)",
                message=(
                    "intersection of the columns to aggregate and 'by' is not yet supported when 'as_index=False', "
                    + "columns with group names of the intersection will not be presented in the result. "
                    + "To achieve the desired result rewrite the original code from:\n"
                    + "df.groupby('by_column', as_index=False).agg(agg_func=('by_column', agg_func))\n"
                    + "to the:\n"
                    + "df.groupby('by_column').agg(agg_func=('by_column', agg_func)).reset_index()"
                ),
            )

        if any(i not in self._df.columns for i in func_dict):
            from pandas.core.base import SpecificationError

            raise SpecificationError("nested renamer is not supported")
        if func is None:
            # the keyword arguments were consumed by `reconstruct_func`
            kwargs = {}
        func = func_dict
    elif is_list_like(func):
        return self._default_to_pandas(
            lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs),
            *args,
            **kwargs,
        )
    elif callable(func):
        return self._check_index(
            self._wrap_aggregation(
                qc_method=type(self._query_compiler).groupby_agg,
                numeric_only=False,
                agg_func=func,
                agg_args=args,
                agg_kwargs=kwargs,
                how="axis_wise",
            )
        )
    elif isinstance(func, str):
        # Using "getattr" here masks possible AttributeError which we throw
        # in __getattr__, so we should call __getattr__ directly instead.
        agg_func = self.__getattr__(func)
        if callable(agg_func):
            return agg_func(*args, **kwargs)

    result = self._wrap_aggregation(
        qc_method=type(self._query_compiler).groupby_agg,
        numeric_only=False,
        agg_func=func,
        agg_args=args,
        agg_kwargs=kwargs,
        how="axis_wise",
    )

    if relabeling_required:
        if not self._as_index:
            # The leading `nby_cols` columns of the result are not part of
            # `new_columns`; keep them in place and shift `order` past them.
            nby_cols = len(result.columns) - len(new_columns)
            order = np.concatenate([np.arange(nby_cols), order + nby_cols])
            by_cols = result.columns[:nby_cols]
            new_columns = pandas.Index(new_columns)
            if by_cols.nlevels != new_columns.nlevels:
                # drop the levels that hold nothing but the empty string
                by_cols = by_cols.remove_unused_levels()
                empty_levels = [
                    i
                    for i, level in enumerate(by_cols.levels)
                    if len(level) == 1 and level[0] == ""
                ]
                by_cols = by_cols.droplevel(empty_levels)
            new_columns = by_cols.append(new_columns)
        result = result.iloc[:, order]
        result.columns = new_columns
    return result
def __getitem__(self, key):
    """
    Select a subset of columns from this DataFrameGroupBy object.

    Parameters
    ----------
    key : list or str
        Names of columns to use as subset of original object.

    Returns
    -------
    DataFrameGroupBy or SeriesGroupBy
        Result of indexing operation.

    Raises
    ------
    NotImplementedError
        Column lookups on GroupBy with arbitrary Series in by is not yet supported.
    """
    # Arguments shared by both kinds of the resulting groupby object
    shared_kwargs = self._kwargs.copy()
    shared_kwargs.update(
        by=self._by,
        axis=self._axis,
        idx_name=self._idx_name,
        squeeze=self._squeeze,
    )

    # Type deduction rules for the result:
    #   1. A list-like `key` or `as_index is False` gives a DataFrameGroupBy
    #   2. Anything else gives a SeriesGroupBy
    #   3. The origin of `by` does not affect the result type
    # Examples:
    #   - drop: any, as_index: any, __getitem__(key: list_like) -> DataFrameGroupBy
    #   - drop: any, as_index: False, __getitem__(key: any) -> DataFrameGroupBy
    #   - drop: any, as_index: True, __getitem__(key: label) -> SeriesGroupBy
    if not is_list_like(key):
        if self._as_index:
            by_has_foreign_items = (
                self._is_multi_by
                and isinstance(self._by, list)
                and any(not hashable(o) or o not in self._df for o in self._by)
            )
            if by_has_foreign_items:
                raise NotImplementedError(
                    "Column lookups on GroupBy with arbitrary Series in by"
                    + " is not yet supported."
                )
            return SeriesGroupBy(
                self._df[key],
                drop=False,
                **shared_kwargs,
            )
        # a scalar key with `as_index=False` is handled as a one-item list
        key = [key]

    by_labels = frozenset(self._internal_by)
    if by_labels.intersection(key):
        ErrorMessage.missmatch_with_pandas(
            operation="GroupBy.__getitem__",
            message=(
                "intersection of the selection and 'by' columns is not yet supported, "
                + "to achieve the desired result rewrite the original code from:\n"
                + "df.groupby('by_column')['by_column']\n"
                + "to the:\n"
                + "df.groupby(df['by_column'].copy())['by_column']"
            ),
        )
    # keep the `by` columns next to the selection, in the frame's column order
    wanted = by_labels.union(key)
    selected = [label for label in self._df.columns if label in wanted]
    return DataFrameGroupBy(
        self._df[selected],
        drop=self._drop,
        **shared_kwargs,
    )