def args_cast(self, *args, **kwargs):
    """
    Cast the arguments to pandas and invoke the wrapped default function.

    Every Modin object contained in the positional and keyword arguments is
    replaced by its pandas representation before `wrapper` is called.
    """
    pandas_args = try_cast_to_pandas(args)
    pandas_kwargs = try_cast_to_pandas(kwargs)
    return wrapper(self, *pandas_args, **pandas_kwargs)
def _default_to_pandas(self, f, *args, **kwargs):
    """
    Run `f` on a pandas groupby object built from the underlying frame.

    Parameters
    ----------
    f : callable
        The function to apply to each group.
    *args : list
        Extra positional arguments passed to `f`.
    **kwargs : dict
        Extra keyword arguments passed to `f`.

    Returns
    -------
    The result of ``self._df._default_to_pandas`` for the wrapped function.
    """
    by_is_qc = isinstance(self._by, type(self._query_compiler))
    if not by_is_qc:
        by = self._by
    elif len(self._by.columns) != 1:
        # Several key columns: group by their labels
        by = list(self._by.columns)
    elif self._drop:
        # Single key column: refer to it by label when `self._drop` is set
        by = self._by.columns[0]
    else:
        by = self._by.to_pandas().squeeze()
    by = try_cast_to_pandas(by)

    def groupby_on_multiple_columns(df, *args, **kwargs):
        grouped = df.groupby(by=by, axis=self._axis, **self._kwargs)
        return f(grouped, *args, **kwargs)

    return self._df._default_to_pandas(groupby_on_multiple_columns, *args, **kwargs)
def align_datetime_dtypes(*dfs):
    """
    Give every passed frame a DateTime dtype for the same set of columns.

    A column is cast to DateTime in all frames as soon as any frame of `dfs`
    holds a DateTime dtype for it; the dtype of the first such frame is used.

    Parameters
    ----------
    *dfs : iterable of DataFrames
        DataFrames to align DateTime dtypes.

    Notes
    -----
    Passed Modin frames may be casted to pandas in the result.
    """
    target_dtypes = {}
    for frame in dfs:
        for label, dtype in frame.dtypes.items():
            # The first DateTime dtype seen for a column wins
            if label in target_dtypes:
                continue
            if is_datetime64_any_dtype(dtype):
                target_dtypes[label] = dtype

    # Nothing to cast: hand the frames back untouched. This also avoids casting
    # empty OmniSci frames to pandas:
    # https://github.com/modin-project/modin/issues/3428
    if len(target_dtypes) == 0:
        return dfs

    # OmniSci has difficulties with casting to certain dtypes (i.e. datetime64),
    # so every frame is cast to pandas before doing 'astype'
    return tuple(try_cast_to_pandas(frame).astype(target_dtypes) for frame in dfs)
def _index_grouped(self):
    """
    Return the groups of this GroupBy object, computing them on first use.

    The computed value is stored in ``self._index_grouped_cache`` and handed
    back unchanged on every later access.

    Returns
    -------
    The ``groups`` of a pandas groupby (multi-column case) or the result of
    ``groupby`` on the labels of the grouped axis (single-key case).
    """
    if self._index_grouped_cache is not None:
        return self._index_grouped_cache

    if hasattr(self._by, "columns") and len(self._by.columns) > 1:
        keys = list(self._by.columns)
        multi_key = True
    else:
        keys = self._by
        multi_key = self._is_multi_by

    if multi_key:
        # The frame is collected (to_pandas) and then grouped, so the pandas
        # implementation is what actually runs. Warn the user about it.
        ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
        ErrorMessage.default_to_pandas("Groupby with multiple columns")
        only_str_labels = isinstance(keys, list) and all(
            isinstance(key, str) for key in keys
        )
        if only_str_labels:
            # Only the key columns are materialized
            key_columns = self._df._query_compiler.getitem_column_array(keys)
            pandas_df = key_columns.to_pandas()
        else:
            keys = try_cast_to_pandas(keys)
            pandas_df = self._df._to_pandas()
        groups = pandas_df.groupby(by=keys).groups
    else:
        if isinstance(self._by, type(self._query_compiler)):
            keys = self._by.to_pandas().squeeze().values
        else:
            keys = self._by
        axis_labels = self._index if self._axis == 0 else self._columns
        groups = axis_labels.groupby(keys)

    self._index_grouped_cache = groups
    return self._index_grouped_cache
def _index_grouped(self):
    """
    Return the groups of this GroupBy object, computing them on first use.

    The computed value is stored in ``self._index_grouped_cache`` and handed
    back unchanged on every later access.

    Returns
    -------
    The ``groups`` of a pandas groupby (multi-column case) or the result of
    ``groupby`` on the labels of the grouped axis (single-key case).
    """
    if self._index_grouped_cache is not None:
        return self._index_grouped_cache

    if hasattr(self._by, "columns") and len(self._by.columns) > 1:
        keys, multi_key = list(self._by.columns), True
    else:
        keys, multi_key = self._by, self._is_multi_by

    if not multi_key:
        if isinstance(self._by, type(self._query_compiler)):
            keys = self._by.to_pandas().squeeze().values
        else:
            keys = self._by
        axis_labels = self._index if self._axis == 0 else self._columns
        self._index_grouped_cache = axis_labels.groupby(keys)
        return self._index_grouped_cache

    # The frame is collected (to_pandas) and then grouped, so the pandas
    # implementation is what actually runs. Warn the user about it.
    ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
    ErrorMessage.default_to_pandas("Groupby with multiple columns")

    # True when every key is hashable and names either something contained in
    # the frame or one of the index names of the grouped axis
    keys_are_labels = isinstance(keys, list) and all(
        hashable(key)
        and (
            key in self._df
            or key in self._df._query_compiler.get_index_names(self._axis)
        )
        for key in keys
    )
    if keys_are_labels:
        key_columns = self._df._query_compiler.getitem_column_array(keys)
        pandas_df = key_columns.to_pandas()
    else:
        keys = try_cast_to_pandas(keys, squeeze=True)
        pandas_df = self._df._to_pandas()

    self._index_grouped_cache = pandas_df.groupby(by=keys).groups
    return self._index_grouped_cache
def caller(
    cls,
    query_compiler,
    by,
    axis,
    groupby_args,
    map_args,
    map_func,
    numeric_only=True,
    **kwargs,
):
    """
    Execute a GroupBy aggregation, defaulting to pandas for unsupported `by`.

    Parameters
    ----------
    query_compiler : query compiler
        Frame to group.
    by : query compiler, hashable label, pandas.Grouper or other object
        Object that determines the groups.
    axis : int
        Axis to group along. Only 0 is accepted outside the default-to-pandas path.
    groupby_args : dict
        Keyword arguments forwarded to ``DataFrame.groupby``.
    map_args : dict
        Keyword arguments forwarded to the aggregation function.
    map_func : dict or callable
        Aggregation to apply. When defaulting to pandas a dict is applied
        through ``agg`` on the groupby object.
    numeric_only : bool, default: True
        Whether to restrict the frame to the columns reported by
        ``_numeric_columns(True)`` before grouping.
    **kwargs : dict
        Extra arguments forwarded to ``cls.build_map_reduce_functions``.

    Returns
    -------
    The result of ``query_compiler.default_to_pandas`` on the fallback path,
    otherwise a new object built with ``query_compiler.__constructor__``.
    """
    # NOTE: the `or hashable(by)` must be outside of the `isinstance` call. If it
    # is written inside the class-info argument, `type(query_compiler) or ...`
    # always evaluates to the type and hashable keys are never recognized.
    is_supported_by = isinstance(by, type(query_compiler)) or hashable(by)
    if not is_supported_by or isinstance(by, pandas.Grouper):
        by = try_cast_to_pandas(by, squeeze=True)
        default_func = (
            (lambda grp: grp.agg(map_func)) if isinstance(map_func, dict) else map_func
        )
        return query_compiler.default_to_pandas(
            lambda df: default_func(
                df.groupby(by=by, axis=axis, **groupby_args), **map_args
            )
        )
    assert axis == 0, "Can only groupby reduce with axis=0"

    if numeric_only:
        qc = query_compiler.getitem_column_array(
            query_compiler._modin_frame._numeric_columns(True)
        )
    else:
        qc = query_compiler

    map_fn, reduce_fn = cls.build_map_reduce_functions(
        by=by,
        axis=axis,
        groupby_args=groupby_args,
        map_func=map_func,
        map_args=map_args,
        **kwargs,
    )

    # `None` when `by` does not carry a `_modin_frame` attribute
    broadcastable_by = getattr(by, "_modin_frame", None)
    apply_indices = list(map_func.keys()) if isinstance(map_func, dict) else None
    new_modin_frame = qc._modin_frame.groupby_reduce(
        axis, broadcastable_by, map_fn, reduce_fn, apply_indices=apply_indices
    )

    result = query_compiler.__constructor__(new_modin_frame)
    # "__reduced__" is treated as a placeholder index name and is cleared
    if result.index.name == "__reduced__":
        result.index.name = None
    return result
def _default_to_pandas(self, f, *args, **kwargs):
    """
    Run function `f` through the default-to-pandas mechanism.

    Parameters
    ----------
    f : callable
        The function to apply to each group.
    *args : list
        Extra positional arguments to pass to `f`.
    **kwargs : dict
        Extra keyword arguments to pass to `f`.

    Returns
    -------
    modin.pandas.DataFrame
        A new Modin DataFrame with the result of the pandas function.
    """
    by_is_qc = isinstance(self._by, type(self._query_compiler))
    if by_is_qc and len(self._by.columns) == 1:
        if self._drop:
            by = self._by.columns[0]
        else:
            by = self._by.to_pandas().squeeze()
    elif self._drop and by_is_qc:
        # A query compiler 'by' is turned into a list of column labels only
        # when it comes from the self frame (drop is True)
        by = list(self._by.columns)
    else:
        by = self._by

    by = try_cast_to_pandas(by, squeeze=True)
    # 'by' may be a 2D query compiler holding the columns to group by, in which
    # case the cast above yields a pandas DataFrame containing them.
    # 'GroupBy.validate_by' splits such a 2D 'by' into a list of 1D Series.
    by = GroupBy.validate_by(by)

    def groupby_on_multiple_columns(df, *args, **kwargs):
        grouped = df.groupby(
            by=by, axis=self._axis, squeeze=self._squeeze, **self._kwargs
        )
        return f(grouped, *args, **kwargs)

    return self._df._default_to_pandas(groupby_on_multiple_columns, *args, **kwargs)
def _default_to_pandas(self, f, *args, **kwargs):
    """
    Run function `f` through the default-to-pandas mechanism.

    Parameters
    ----------
    f : callable
        The function to apply to each group.
    *args : list
        Extra positional arguments to pass to `f`.
    **kwargs : dict
        Extra keyword arguments to pass to `f`.

    Returns
    -------
    modin.pandas.DataFrame
        A new Modin DataFrame with the result of the pandas function.
    """
    if not isinstance(self._by, type(self._query_compiler)):
        by = self._by
    elif len(self._by.columns) != 1:
        # Several key columns: group by their labels
        by = list(self._by.columns)
    elif self._drop:
        # Single key column: refer to it by label when `self._drop` is set
        by = self._by.columns[0]
    else:
        by = self._by.to_pandas().squeeze()
    by = try_cast_to_pandas(by, squeeze=True)

    def groupby_on_multiple_columns(df, *args, **kwargs):
        grouped = df.groupby(
            by=by, axis=self._axis, squeeze=self._squeeze, **self._kwargs
        )
        return f(grouped, *args, **kwargs)

    return self._df._default_to_pandas(groupby_on_multiple_columns, *args, **kwargs)
def run_and_compare(
    fn,
    data,
    data2=None,
    force_lazy=True,
    force_arrow_execute=False,
    allow_subqueries=False,
    comparator=df_equals,
    **kwargs,
):
    """
    Run `fn` against pandas and Modin frames and compare the outcomes.

    `fn` is called with ``lib`` set to the library in use and with ``df1``,
    ``df2`` and ``df`` (an alias of ``df1``) built from `data` and `data2`.
    If the pandas run raises, the Modin run is expected to raise the same
    exception type; otherwise both results are handed to `comparator`.
    """

    def run_modin(
        fn,
        data,
        data2,
        force_lazy,
        force_arrow_execute,
        allow_subqueries,
        constructor_kwargs,
        **kwargs,
    ):
        """Build the Modin frames, run `fn` on them and return its result."""
        modin_df1 = pd.DataFrame(data, **constructor_kwargs)
        modin_df2 = pd.DataFrame(data2, **constructor_kwargs)
        kwargs["df1"] = modin_df1
        kwargs["df2"] = modin_df2
        kwargs["df"] = modin_df1

        # `force_lazy` takes precedence over `force_arrow_execute` for the inputs
        if force_lazy:
            input_mode = "lazy"
        elif force_arrow_execute:
            input_mode = "arrow"
        else:
            input_mode = None
        if input_mode is not None:
            for frame in (modin_df1, modin_df2):
                set_execution_mode(frame, input_mode)

        exp_res = fn(lib=pd, **kwargs)

        # For the result the precedence is the other way around
        if force_arrow_execute:
            set_execution_mode(exp_res, "arrow", allow_subqueries)
        elif force_lazy:
            set_execution_mode(exp_res, None, allow_subqueries)

        return exp_res

    constructor_kwargs = kwargs.pop("constructor_kwargs", {})
    modin_run_args = dict(
        fn=fn,
        data=data,
        data2=data2,
        force_lazy=force_lazy,
        force_arrow_execute=force_arrow_execute,
        allow_subqueries=allow_subqueries,
        constructor_kwargs=constructor_kwargs,
    )

    try:
        pandas_df1 = pandas.DataFrame(data, **constructor_kwargs)
        kwargs["df1"] = pandas_df1
        kwargs["df2"] = pandas.DataFrame(data2, **constructor_kwargs)
        kwargs["df"] = pandas_df1
        ref_res = fn(lib=pandas, **kwargs)
    except Exception as e:
        with pytest.raises(type(e)):
            exp_res = run_modin(**modin_run_args, **kwargs)
            # Touch the index of the result inside the `raises` block
            _ = exp_res.index
    else:
        exp_res = run_modin(**modin_run_args, **kwargs)

        # Currently, strings are converted to categories when exported from OmniSci,
        # this makes the equality comparison fail. Converting string cols back to
        # their original dtypes until the issue is resolved:
        # https://github.com/modin-project/modin/issues/2747
        if isinstance(exp_res, pd.DataFrame):
            external_dtypes = exp_res.dtypes
            exp_res = try_cast_to_pandas(exp_res)
            internal_dtypes = exp_res.dtypes

            new_schema = {
                col: external_dtypes[col]
                for col in exp_res.columns
                if internal_dtypes[col] == "category"
                and external_dtypes[col] != "category"
            }
            exp_res = exp_res.astype(new_schema)

        comparator(ref_res, exp_res)
def _compute_index_grouped(self, numerical=False):
    """
    Construct an index of group IDs.

    Parameters
    ----------
    numerical : bool, default: False
        Whether a group indices should be positional (True) or label-based (False).

    Returns
    -------
    dict
        A dict of {group name -> group indices} values.

    See Also
    --------
    pandas.core.groupby.GroupBy.groups
    """
    # Group indices are computed by pure pandas, so a warning is raised
    ErrorMessage.default_to_pandas("Group indices computation")

    # Level keys and column keys are kept apart since they are serialized
    # in different ways
    by = None
    level = []
    if self._level is not None:
        level = self._level if isinstance(self._level, list) else [self._level]
    elif isinstance(self._by, list):
        by = []
        for key in self._by:
            if hashable(key) and key in self._query_compiler.get_index_names(
                self._axis
            ):
                level.append(key)
            else:
                by.append(key)
    else:
        by = self._by

    is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)
    # `dropna` param is the only one that matters for the group indices result
    dropna = self._kwargs.get("dropna", True)

    if is_multi_by and hasattr(self._by, "columns"):
        by = list(self._by.columns)

    if is_multi_by:
        # The frame is collected (to_pandas) and then grouped, so the pandas
        # implementation is what actually runs.
        ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
        keys_are_labels = isinstance(by, list) and all(
            is_label(self._df, key, self._axis) for key in by
        )
        if keys_are_labels:
            key_columns = self._df._query_compiler.getitem_column_array(by)
            pandas_df = key_columns.to_pandas()
        else:
            by = try_cast_to_pandas(by, squeeze=True)
            pandas_df = self._df._to_pandas()
        by = wrap_into_list(by, level)
        grouped = pandas_df.groupby(by=by, dropna=dropna)
        return grouped.indices if numerical else grouped.groups

    if isinstance(self._by, type(self._query_compiler)):
        by = self._by.to_pandas().squeeze().values
    elif self._by is None:
        index = self._query_compiler.get_axis(self._axis)
        # Keep only the levels referenced in `level`, by name or by position
        levels_to_drop = [
            pos
            for pos, name in enumerate(index.names)
            if not (name in level or pos in level)
        ]
        by = index.droplevel(levels_to_drop)
        if isinstance(by, pandas.MultiIndex):
            by = by.reorder_levels(level)
    else:
        by = self._by

    axis_labels = self._query_compiler.get_axis(self._axis)
    if numerical:
        # Positional indices of the groups are wanted, so the grouping is done
        # on a `RangeIndex` instead of the actual index labels
        axis_labels = pandas.RangeIndex(len(axis_labels))

    # `pandas.Index.groupby` doesn't take any parameters except `by`, so the
    # Index is converted to a Series to be able to process `dropna=False`
    if dropna:
        return axis_labels.groupby(by)
    grouped = axis_labels.to_series().groupby(by, dropna=dropna)
    return grouped.indices if numerical else grouped.groups
def _index_grouped(self):
    """
    Return the groups of this GroupBy object, computing them on first use.

    The computed value is stored in ``self._index_grouped_cache`` and handed
    back unchanged on every later access.

    Returns
    -------
    The ``groups`` of a pandas groupby (multi-key case) or the result of
    ``groupby`` on the labels of the grouped axis (single-key case).
    """
    if self._index_grouped_cache is not None:
        return self._index_grouped_cache

    # Level keys and column keys are kept apart since they are serialized
    # in different ways
    by = None
    level = []
    if self._level is not None:
        level = self._level if isinstance(self._level, list) else [self._level]
    elif isinstance(self._by, list):
        by = []
        for key in self._by:
            if hashable(key) and key in self._query_compiler.get_index_names(
                self._axis
            ):
                level.append(key)
            else:
                by.append(key)
    else:
        by = self._by

    is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)

    if is_multi_by and hasattr(self._by, "columns"):
        by = list(self._by.columns)

    if is_multi_by:
        # The frame is collected (to_pandas) and then grouped, so the pandas
        # implementation is what actually runs. Warn the user about it.
        ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
        ErrorMessage.default_to_pandas("Groupby with multiple columns")
        keys_are_labels = isinstance(by, list) and all(
            is_label(self._df, key, self._axis) for key in by
        )
        if keys_are_labels:
            key_columns = self._df._query_compiler.getitem_column_array(by)
            pandas_df = key_columns.to_pandas()
        else:
            by = try_cast_to_pandas(by, squeeze=True)
            pandas_df = self._df._to_pandas()
        by = wrap_into_list(by, level)
        groups = pandas_df.groupby(by=by).groups
    else:
        if isinstance(self._by, type(self._query_compiler)):
            by = self._by.to_pandas().squeeze().values
        elif self._by is None:
            index = self._query_compiler.get_axis(self._axis)
            # Keep only the levels referenced in `level`, by name or by position
            levels_to_drop = [
                pos
                for pos, name in enumerate(index.names)
                if not (name in level or pos in level)
            ]
            by = index.droplevel(levels_to_drop)
            if isinstance(by, pandas.MultiIndex):
                by = by.reorder_levels(level)
        else:
            by = self._by
        axis_labels = self._index if self._axis == 0 else self._columns
        groups = axis_labels.groupby(by)

    self._index_grouped_cache = groups
    return self._index_grouped_cache
def test_simple_row_groupby(by, as_index, col1_category):
    """
    Compare a broad set of groupby operations between Modin and pandas.

    The same five-column frame is grouped by `by` in both libraries and each
    operation below is evaluated on the two groupby objects through the
    ``eval_*`` helpers.

    Parameters
    ----------
    by : object
        Grouping key(s). List elements that are `GetColumn` instances are
        resolved against the frame being grouped.
    as_index : bool
        Forwarded to ``groupby`` as ``as_index``.
    col1_category : bool
        Whether "col1" is cast to the "category" dtype before grouping.

    Notes
    -----
    NOTE(review): the arguments are presumably supplied by pytest
    parametrization that is not visible in this block - confirm.
    """
    pandas_df = pandas.DataFrame({
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, np.NaN, 7],
        "col3": [np.NaN, np.NaN, 12, 10],
        "col4": [17, 13, 16, 15],
        "col5": [-4, -5, -6, -7],
    })

    if col1_category:
        pandas_df = pandas_df.astype({"col1": "category"})

    modin_df = from_pandas(pandas_df)
    # Number of rows requested from `head` / `tail` below
    n = 1

    def maybe_get_columns(df, by):
        # Resolve `GetColumn` entries of a list `by` against `df`;
        # anything else is returned unchanged.
        if isinstance(by, list):
            return [o(df) if isinstance(o, GetColumn) else o for o in by]
        else:
            return by

    modin_groupby = modin_df.groupby(by=maybe_get_columns(modin_df, by),
                                     as_index=as_index)

    # The pandas side must not receive Modin objects as keys
    pandas_by = maybe_get_columns(pandas_df, try_cast_to_pandas(by))
    pandas_groupby = pandas_df.groupby(by=pandas_by, as_index=as_index)

    modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
    eval_ngroups(modin_groupby, pandas_groupby)
    eval_shift(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.sem(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_mean(modin_groupby, pandas_groupby)
    eval_any(modin_groupby, pandas_groupby)
    eval_min(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmax(), is_default=True)
    eval_ndim(modin_groupby, pandas_groupby)
    if not check_df_columns_have_nans(modin_df, by):
        # cum* functions produce undefined results for columns with NaNs so we run them only when "by" columns contain no NaNs
        eval_general(modin_groupby, pandas_groupby, lambda df: df.cumsum(axis=0))
        eval_general(modin_groupby, pandas_groupby, lambda df: df.cummax(axis=0))
        eval_general(modin_groupby, pandas_groupby, lambda df: df.cummin(axis=0))
        eval_general(modin_groupby, pandas_groupby, lambda df: df.cumprod(axis=0))

    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.pct_change(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )

    # Workaround for Pandas bug #34656. Recreate groupby object for Pandas
    pandas_groupby = pandas_df.groupby(by=pandas_by, as_index=as_index)
    # Both a lambda and a builtin are passed to `apply`
    apply_functions = [lambda df: df.sum(), min]
    for func in apply_functions:
        eval_apply(modin_groupby, pandas_groupby, func)

    eval_dtypes(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.first(), is_default=True)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.backfill(), is_default=True)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.bfill(), is_default=True)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmin(), is_default=True)
    eval_prod(modin_groupby, pandas_groupby)
    # std / var / skew are only compared when `as_index` is truthy
    if as_index:
        eval_std(modin_groupby, pandas_groupby)
        eval_var(modin_groupby, pandas_groupby)
        eval_skew(modin_groupby, pandas_groupby)

    agg_functions = ["min", "max"]
    for func in agg_functions:
        eval_agg(modin_groupby, pandas_groupby, func)
        eval_aggregate(modin_groupby, pandas_groupby, func)

    eval_general(modin_groupby, pandas_groupby, lambda df: df.last(), is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.mad(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_general(modin_groupby, pandas_groupby, lambda df: df.rank())
    eval_max(modin_groupby, pandas_groupby)
    eval_len(modin_groupby, pandas_groupby)
    eval_sum(modin_groupby, pandas_groupby)
    eval_ngroup(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.nunique())
    eval_median(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.head(n), is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.cov(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )

    if not check_df_columns_have_nans(modin_df, by):
        # Pandas groupby.transform does not work correctly with NaN values in grouping columns. See Pandas bug 17093.
        transform_functions = [lambda df: df + 4, lambda df: -df - 10]
        for func in transform_functions:
            eval_general(
                modin_groupby,
                pandas_groupby,
                lambda df: df.transform(func),
                check_exception_type=None,
            )

    pipe_functions = [lambda dfgb: dfgb.sum()]
    for func in pipe_functions:
        eval_pipe(modin_groupby, pandas_groupby, func)

    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.corr(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_fillna(modin_groupby, pandas_groupby)
    eval_count(modin_groupby, pandas_groupby)
    # `size` is not compared when the current backend is "BaseOnPython"
    if get_current_backend() != "BaseOnPython":
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.size(),
            check_exception_type=None,
        )
    eval_general(modin_groupby, pandas_groupby, lambda df: df.tail(n), is_default=True)
    eval_quantile(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.take(), is_default=True)
    if isinstance(by, list) and not any(
            isinstance(o, (pd.Series, pandas.Series)) for o in by):
        # Not yet supported for non-original-column-from-dataframe Series in by:
        eval___getattr__(modin_groupby, pandas_groupby, "col3")
    eval_groups(modin_groupby, pandas_groupby)
def caller(
    cls,
    query_compiler,
    by,
    axis,
    groupby_args,
    map_args,
    map_func,
    reduce_func,
    reduce_args,
    numeric_only=True,
    drop=False,
    method=None,
    default_to_pandas_func=None,
):
    """
    Run a GroupBy aggregation using the MapReduce approach.

    Parameters
    ----------
    query_compiler : BaseQueryCompiler
        Frame to group.
    by : BaseQueryCompiler, column or index label, Grouper or list of such
        Object that determine groups.
    axis : {0, 1}, default: 0
        Axis to group and apply aggregation function along.
        0 means index axis when 1 means column axis.
    groupby_args : dict
        Dictionary which carries arguments for pandas.DataFrame.groupby.
    map_args : dict
        Arguments which will be passed to `map_func`.
    map_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
        Function to apply to the `GroupByObject` at the Map phase.
    reduce_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
        Function to apply to the `GroupByObject` at the Reduce phase.
    reduce_args : dict
        Arguments which will be passed to `reduce_func`.
    numeric_only : bool, default: True
        Whether or not to drop non-numeric columns before executing GroupBy.
    drop : bool, default: False
        Indicates whether or not by-data came from the `self` frame.
    method : str, optional
        Name of the GroupBy aggregation function. This is a hint to be able to do special casing.
    default_to_pandas_func : callable(pandas.DataFrameGroupBy) -> pandas.DataFrame, optional
        The pandas aggregation function equivalent to the `map_func + reduce_func`.
        Used in case of defaulting to pandas. If not specified `map_func` is used.

    Returns
    -------
    The same type as `query_compiler`
        QueryCompiler which carries the result of GroupBy aggregation.
    """
    # Without an explicit level, only a query compiler or a hashable key that
    # is not a Grouper is handled by the MapReduce path
    level_is_unset = groupby_args.get("level", None) is None
    if level_is_unset and (
        (not isinstance(by, type(query_compiler)) and not hashable(by))
        or isinstance(by, pandas.Grouper)
    ):
        by = try_cast_to_pandas(by, squeeze=True)
        if default_to_pandas_func is None:
            default_to_pandas_func = (
                (lambda grp: grp.agg(map_func))
                if isinstance(map_func, dict)
                else map_func
            )
        return query_compiler.default_to_pandas(
            lambda df: default_to_pandas_func(
                df.groupby(by=by, axis=axis, **groupby_args), **map_args
            )
        )
    assert axis == 0, "Can only groupby reduce with axis=0"

    qc = (
        query_compiler.getitem_column_array(
            query_compiler._modin_frame.numeric_columns(True)
        )
        if numeric_only
        else query_compiler
    )

    map_fn, reduce_fn = cls.build_map_reduce_functions(
        by=by,
        axis=axis,
        groupby_args=groupby_args,
        map_func=map_func,
        map_args=map_args,
        reduce_func=reduce_func,
        reduce_args=reduce_args,
        drop=drop,
        method=method,
    )

    # If `by` is a ModinFrame, then its partitions will be broadcasted to every
    # `self` partition in a way determined by engine (modin_frame.groupby_reduce)
    # Otherwise `by` was already bound to the Map function in `build_map_reduce_functions`.
    broadcastable_by = getattr(by, "_modin_frame", None)
    if isinstance(map_func, dict):
        apply_indices = list(map_func.keys())
    else:
        apply_indices = None
    reduced_frame = qc._modin_frame.groupby_reduce(
        axis, broadcastable_by, map_fn, reduce_fn, apply_indices=apply_indices
    )

    result = query_compiler.__constructor__(reduced_frame)
    # "__reduced__" is treated as a placeholder index name and is cleared
    if result.index.name == "__reduced__":
        result.index.name = None
    return result
def align_datetime_dtypes(*dfs):
    """
    Make all of the passed frames have DateTime dtype for the same columns.

    Cast column type of the certain frame to the DateTime type if any frame in
    the `dfs` sequence has DateTime type for this column. Object columns that
    hold only `datetime.time` values (or missing values) in some frame are
    converted to `datetime.time` in every frame.

    Parameters
    ----------
    *dfs : iterable of DataFrames
        DataFrames to align DateTime dtypes.

    Returns
    -------
    tuple or list of DataFrames
        `dfs` itself (a tuple) when nothing has to be cast, otherwise a list
        with one casted pandas frame per input frame.

    Notes
    -----
    Passed Modin frames may be casted to pandas in the result.
    """
    datetime_cols = {}
    time_cols = set()
    for df in dfs:
        for col, dtype in df.dtypes.items():
            # If we already decided to cast this column to DateTime no more actions are needed
            if col not in datetime_cols and is_datetime64_any_dtype(dtype):
                datetime_cols[col] = dtype
            # datetime.time is considered to be an 'object' dtype in pandas that's why
            # we have to explicitly check the values type in the column
            elif (
                dtype == np.dtype("O")
                and col not in time_cols
                # OmniSci has difficulties with empty frames, so explicitly skip them
                # https://github.com/modin-project/modin/issues/3428
                and len(df) > 0
                and all(
                    isinstance(val, datetime.time) or pandas.isna(val)
                    for val in df[col]
                )
            ):
                time_cols.add(col)

    if not datetime_cols and not time_cols:
        return dfs

    def convert_to_time(value):
        """Convert passed value to `datetime.time`, keeping missing values as is."""
        if isinstance(value, datetime.time):
            return value
        elif isinstance(value, str):
            return datetime.time.fromisoformat(value)
        elif pandas.isna(value):
            # The detection above tolerates missing values in time columns, so the
            # conversion has to tolerate them too: `datetime.time(nan)` and
            # `datetime.time(None)` both raise a TypeError.
            return value
        else:
            return datetime.time(value)

    time_cols_list = list(time_cols)
    casted_dfs = []
    for df in dfs:
        # OmniSci has difficulties with casting to certain dtypes (i.e. datetime64),
        # so casting it to pandas
        pandas_df = try_cast_to_pandas(df)
        if datetime_cols:
            pandas_df = pandas_df.astype(datetime_cols)
        if time_cols:
            pandas_df[time_cols_list] = pandas_df[time_cols_list].applymap(
                convert_to_time
            )
        casted_dfs.append(pandas_df)

    return casted_dfs
def caller(
    query_compiler,
    by,
    axis,
    groupby_args,
    map_args,
    reduce_args=None,
    numeric_only=True,
    drop=False,
):
    """
    Execute a GroupBy aggregation in a map phase followed by a reduce phase.

    `map_func`, `reduce_func` and `call_kwds` are not parameters of this
    function; they are resolved from the enclosing scope, which is not
    visible in this block.

    Parameters
    ----------
    query_compiler : query compiler
        Frame to group.
    by : query compiler, str or other object
        Object that determines the groups. Anything that is neither a query
        compiler nor a str makes the call default to pandas.
    axis : int
        Axis to group along. Only 0 is accepted outside the default-to-pandas path.
    groupby_args : dict
        Keyword arguments forwarded to ``DataFrame.groupby``. The dict is copied
        before it is modified.
    map_args : dict
        Keyword arguments forwarded to `map_func`.
    reduce_args : dict, optional
        Keyword arguments forwarded to `reduce_func`. ``None`` is treated as
        an empty dict.
    numeric_only : bool, default: True
        Whether to restrict the frame to the columns reported by
        ``_numeric_columns(True)`` before grouping.
    drop : bool, default: False
        Flag consulted when the grouping column is re-inserted into or dropped
        from the result. NOTE(review): presumably tells whether the by-data
        came from the grouped frame itself - confirm against the caller.

    Returns
    -------
    The result of ``query_compiler.default_to_pandas`` on the fallback path,
    otherwise a new object built with ``query_compiler.__constructor__``.
    """
    # `reduce_args` is unpacked with `**` in the reduce phase, which raises a
    # TypeError for the `None` default, so normalize it first
    if reduce_args is None:
        reduce_args = {}

    if not isinstance(by, (type(query_compiler), str)):
        by = try_cast_to_pandas(by, squeeze=True)
        return query_compiler.default_to_pandas(
            lambda df: map_func(
                df.groupby(by=by, axis=axis, **groupby_args), **map_args
            )
        )
    assert axis == 0, "Can only groupby reduce with axis=0"

    if numeric_only:
        qc = query_compiler.getitem_column_array(
            query_compiler._modin_frame._numeric_columns(True)
        )
    else:
        qc = query_compiler
    # since we're going to modify `groupby_args` dict in a `compute_map`,
    # we want to copy it to not propagate these changes into source dict, in case
    # of unsuccessful end of function
    groupby_args = groupby_args.copy()

    # Remember the requested values; they are restored in the reduce phase
    as_index = groupby_args.get("as_index", True)
    observed = groupby_args.get("observed", False)

    if isinstance(by, str):

        def _map(df):
            # Set `as_index` to True to track the metadata of the grouping
            # object It is used to make sure that between phases we are
            # constructing the right index and placing columns in the correct
            # order.
            groupby_args["as_index"] = True
            groupby_args["observed"] = True

            result = map_func(
                df.groupby(by=by, axis=axis, **groupby_args), **map_args
            )
            # The _modin_groupby_ prefix indicates that this is the first
            # partition, and since we may need to insert the grouping data in
            # the reduce phase
            if (
                not isinstance(result.index, pandas.MultiIndex)
                and result.index.name is not None
                and result.index.name in result.columns
            ):
                result.index.name = "{}{}".format(
                    "_modin_groupby_", result.index.name
                )
            return result

    else:

        def _map(df, other):
            def compute_map(df, other):
                # Set `as_index` to True to track the metadata of the grouping object
                # It is used to make sure that between phases we are constructing the
                # right index and placing columns in the correct order.
                groupby_args["as_index"] = True
                groupby_args["observed"] = True

                other = other.squeeze(axis=axis ^ 1)
                if isinstance(other, pandas.DataFrame):
                    # Attach the key columns that `df` does not already have
                    # and group by their labels
                    df = pandas.concat(
                        [df] + [other[[o for o in other if o not in df]]],
                        axis=1,
                    )
                    other = list(other.columns)
                result = map_func(
                    df.groupby(by=other, axis=axis, **groupby_args), **map_args
                )
                # if `other` has category dtype, then pandas will drop that
                # column after groupby, inserting it back to correctly process
                # reduce phase
                if (
                    drop
                    and not as_index
                    and isinstance(other, pandas.Series)
                    and isinstance(other.dtype, pandas.CategoricalDtype)
                    and result.index.name is not None
                    and result.index.name not in result.columns
                ):
                    result.insert(
                        loc=0, column=result.index.name, value=result.index
                    )
                # The _modin_groupby_ prefix indicates that this is the first partition,
                # and since we may need to insert the grouping data in the reduce phase
                if (
                    not isinstance(result.index, pandas.MultiIndex)
                    and result.index.name is not None
                    and result.index.name in result.columns
                ):
                    result.index.name = "{}{}".format(
                        "_modin_groupby_", result.index.name
                    )
                return result

            try:
                return compute_map(df, other)
            # This will happen with Arrow buffer read-only errors. We don't want to copy
            # all the time, so this will try to fast-path the code first.
            except ValueError:
                return compute_map(df.copy(), other.copy())

    def _reduce(df):
        def compute_reduce(df):
            # The index levels become the leading columns after `reset_index`
            other_len = len(df.index.names)
            df = df.reset_index(drop=False)
            # See note above about setting `as_index`
            groupby_args["as_index"] = as_index
            groupby_args["observed"] = observed
            if other_len > 1:
                by_part = list(df.columns[0:other_len])
            else:
                by_part = df.columns[0]
            result = reduce_func(
                df.groupby(by=by_part, axis=axis, **groupby_args), **reduce_args
            )
            # Strip the prefix that the map phase added to the index name
            if (
                not isinstance(result.index, pandas.MultiIndex)
                and result.index.name is not None
                and "_modin_groupby_" in result.index.name
            ):
                result.index.name = result.index.name[len("_modin_groupby_"):]
            if isinstance(by_part, str) and by_part in result.columns:
                if "_modin_groupby_" in by_part and drop:
                    col_name = by_part[len("_modin_groupby_"):]
                    new_result = result.drop(columns=col_name, errors="ignore")
                    new_result.columns = [
                        col_name if "_modin_groupby_" in c else c
                        for c in new_result.columns
                    ]
                    return new_result
                else:
                    # The grouping column is kept only for the "size" method
                    return (
                        result.drop(columns=by_part)
                        if call_kwds.get("method", None) != "size"
                        else result
                    )
            return result

        try:
            return compute_reduce(df)
        # This will happen with Arrow buffer read-only errors. We don't want to copy
        # all the time, so this will try to fast-path the code first.
        except ValueError:
            return compute_reduce(df.copy())

    # TODO: try to precompute `new_index` and `new_columns`
    if isinstance(by, str):
        new_modin_frame = qc._modin_frame._map_reduce(
            axis, _map, reduce_func=_reduce, preserve_index=False
        )
    else:
        new_modin_frame = qc._modin_frame.groupby_reduce(
            axis, by._modin_frame, _map, _reduce
        )
    result = query_compiler.__constructor__(new_modin_frame)
    # "__reduced__" is treated as a placeholder index name and is cleared
    if result.index.name == "__reduced__":
        result.index.name = None
    return result
def _index_grouped(self):
    """
    Build (once) and return the index of group IDs.

    The result is cached in ``self._index_grouped_cache``.

    Returns
    -------
    dict
        A dict of {group name -> group labels} values.

    See Also
    --------
    pandas.core.groupby.GroupBy.groups
    """
    if self._index_grouped_cache is not None:
        return self._index_grouped_cache

    # Level keys and column keys are kept apart since they are serialized
    # in different ways
    column_keys = None
    level = []
    if self._level is not None:
        level = self._level if isinstance(self._level, list) else [self._level]
    elif isinstance(self._by, list):
        column_keys = []
        for key in self._by:
            names_index_level = hashable(
                key
            ) and key in self._query_compiler.get_index_names(self._axis)
            if names_index_level:
                level.append(key)
            else:
                column_keys.append(key)
    else:
        column_keys = self._by

    is_multi_by = self._is_multi_by or (
        column_keys is not None and len(level) > 0
    )

    if is_multi_by and hasattr(self._by, "columns"):
        column_keys = list(self._by.columns)

    if not is_multi_by:
        if isinstance(self._by, type(self._query_compiler)):
            grouper = self._by.to_pandas().squeeze().values
        elif self._by is None:
            index = self._query_compiler.get_axis(self._axis)
            # Keep only the levels referenced in `level`, by name or by position
            levels_to_drop = [
                pos
                for pos, name in enumerate(index.names)
                if not (name in level or pos in level)
            ]
            grouper = index.droplevel(levels_to_drop)
            if isinstance(grouper, pandas.MultiIndex):
                grouper = grouper.reorder_levels(level)
        else:
            grouper = self._by

        if self._axis == 0:
            self._index_grouped_cache = self._index.groupby(grouper)
        else:
            self._index_grouped_cache = self._columns.groupby(grouper)
        return self._index_grouped_cache

    # The frame is collected (to_pandas) and then grouped, so the pandas
    # implementation is what actually runs. Warn the user about it.
    ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
    ErrorMessage.default_to_pandas("Groupby with multiple columns")
    if isinstance(column_keys, list) and all(
        is_label(self._df, key, self._axis) for key in column_keys
    ):
        pandas_df = self._df._query_compiler.getitem_column_array(
            column_keys
        ).to_pandas()
    else:
        column_keys = try_cast_to_pandas(column_keys, squeeze=True)
        pandas_df = self._df._to_pandas()
    all_keys = wrap_into_list(column_keys, level)
    self._index_grouped_cache = pandas_df.groupby(by=all_keys).groups
    return self._index_grouped_cache
def caller(
    cls,
    query_compiler,
    by,
    map_func,
    reduce_func,
    axis,
    groupby_kwargs,
    agg_args,
    agg_kwargs,
    drop=False,
    method=None,
    default_to_pandas_func=None,
):
    """
    Run a GroupBy aggregation using the TreeReduce approach.

    Parameters
    ----------
    query_compiler : BaseQueryCompiler
        Frame to group.
    by : BaseQueryCompiler, column or index label, Grouper or list of such
        Object that determine groups.
    map_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
        Function to apply to the `GroupByObject` at the Map phase.
    reduce_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
        Function to apply to the `GroupByObject` at the Reduce phase.
    axis : {0, 1}
        Axis to group and apply aggregation function along.
        0 means index axis when 1 means column axis.
    groupby_kwargs : dict
        Dictionary which carries arguments for pandas.DataFrame.groupby.
    agg_args : list-like
        Positional arguments to pass to the aggregation functions.
    agg_kwargs : dict
        Keyword arguments to pass to the aggregation functions.
    drop : bool, default: False
        Indicates whether or not by-data came from the `self` frame.
    method : str, optional
        Name of the GroupBy aggregation function. This is a hint to be able to do special casing.
    default_to_pandas_func : callable(pandas.DataFrameGroupBy) -> pandas.DataFrame, optional
        The pandas aggregation function equivalent to the `map_func + reduce_func`.
        Used in case of defaulting to pandas. If not specified `map_func` is used.

    Returns
    -------
    The same type as `query_compiler`
        QueryCompiler which carries the result of GroupBy aggregation.
    """
    # Default to pandas for any non-zero axis, or - when no level is given -
    # for a `by` that is a Grouper or is neither a query compiler nor hashable
    if axis != 0 or (
        groupby_kwargs.get("level", None) is None
        and (
            (not isinstance(by, type(query_compiler)) and not hashable(by))
            or isinstance(by, pandas.Grouper)
        )
    ):
        by = try_cast_to_pandas(by, squeeze=True)
        # Since 'by' may be a 2D query compiler holding columns to group by,
        # to_pandas will also produce a pandas DataFrame containing them.
        # So splitting 2D 'by' into a list of 1D Series using 'GroupBy.validate_by':
        by = GroupBy.validate_by(by)
        if default_to_pandas_func is None:
            default_to_pandas_func = (
                (lambda grp: grp.agg(map_func))
                if isinstance(map_func, dict)
                else map_func
            )
        return query_compiler.default_to_pandas(
            lambda df: default_to_pandas_func(
                df.groupby(by=by, axis=axis, **groupby_kwargs),
                *agg_args,
                **agg_kwargs,
            )
        )

    # The bug only occurs in the case of Categorical 'by', so we might want to check whether any of
    # the 'by' dtypes is Categorical before going into this branch, however triggering 'dtypes'
    # computation if they're not computed may take time, so we don't do it
    unsorted_requested = not groupby_kwargs.get("sort", True)
    if unsorted_requested and isinstance(by, type(query_compiler)):
        ErrorMessage.missmatch_with_pandas(
            operation="df.groupby(categorical_by, sort=False)",
            message=(
                "the groupby keys will be sorted anyway, although the 'sort=False' was passed. "
                "See the following issue for more details: "
                "https://github.com/modin-project/modin/issues/3571"
            ),
        )
        # Work on a copy so the dict of the caller is left untouched
        groupby_kwargs = groupby_kwargs.copy()
        groupby_kwargs["sort"] = True

    map_fn, reduce_fn = cls.build_map_reduce_functions(
        by=by,
        axis=axis,
        groupby_kwargs=groupby_kwargs,
        map_func=map_func,
        reduce_func=reduce_func,
        agg_args=agg_args,
        agg_kwargs=agg_kwargs,
        drop=drop,
        method=method,
    )

    # If `by` is a ModinFrame, then its partitions will be broadcasted to every
    # `self` partition in a way determined by engine (modin_frame.groupby_reduce)
    # Otherwise `by` was already bound to the Map function in `build_map_reduce_functions`.
    broadcastable_by = getattr(by, "_modin_frame", None)
    if isinstance(map_func, dict):
        apply_indices = list(map_func.keys())
    else:
        apply_indices = None
    reduced_frame = query_compiler._modin_frame.groupby_reduce(
        axis, broadcastable_by, map_fn, reduce_fn, apply_indices=apply_indices
    )

    result = query_compiler.__constructor__(reduced_frame)
    # "__reduced__" is treated as a placeholder index name and is cleared
    if result.index.name == "__reduced__":
        result.index.name = None
    return result
def args_cast(self, *args, **kwargs):
    """
    Pass the arguments through `try_cast_to_pandas` and call `wrapper` with them.

    Both the positional and the keyword arguments are converted before the call.
    """
    pandas_args = try_cast_to_pandas(args)
    pandas_kwargs = try_cast_to_pandas(kwargs)
    return wrapper(self, *pandas_args, **pandas_kwargs)