def apply(self, function): """Apply a transformation function over the grouped chunk. """ if not callable(function): raise TypeError("type {!r} is not callable", type(function)) df, segs = self.as_df() ends = chain(segs[1:], [None]) chunks = [df[s:e] for s, e in zip(segs, ends)] return concat([function(chk) for chk in chunks])
def test_datetime_drop_duplicates():
    date_df = cudf.DataFrame()
    date_df["date"] = date_range("11/20/2018", periods=6, freq="D")
    date_df["value"] = np.random.sample(len(date_df))

    # Appending the first four rows again creates four exact duplicates,
    # which drop_duplicates() should remove.
    df = concat([date_df, date_df[:4]])
    assert_df(df[:-4], df.drop_duplicates())

    # The result should be the same with a reset index...
    df2 = df.reset_index()
    assert_df(df2[:-4], df2.drop_duplicates())

    # ...and with the datetime column as the index.
    df3 = df.set_index("date")
    assert_df(df3[:-4], df3.drop_duplicates())
def apply(self, function): """Apply a transformation function over the grouped chunk. This uses numba's CUDA JIT compiler to convert the Python transformation function into a CUDA kernel, thus will have a compilation overhead during the first run. Parameters ---------- func : function The transformation function that will be executed on the CUDA GPU. Examples -------- .. code-block:: python from cudf import DataFrame df = DataFrame() df['key'] = [0, 0, 1, 1, 2, 2, 2] df['val'] = [0, 1, 2, 3, 4, 5, 6] groups = df.groupby(['key'], method='cudf') # Define a function to apply to each row in a group def mult(df): df['out'] = df['key'] * df['val'] return df result = groups.apply(mult) print(result) Output: .. code-block:: python key val out 0 0 0 0 1 0 1 0 2 1 2 2 3 1 3 3 4 2 4 8 5 2 5 10 6 2 6 12 """ if not callable(function): raise TypeError("type {!r} is not callable", type(function)) df, segs = self.as_df() ends = chain(segs[1:], [None]) chunks = [df[s:e] for s, e in zip(segs, ends)] return concat([function(chk) for chk in chunks])
def get_dummies(
    df,
    prefix=None,
    prefix_sep="_",
    dummy_na=False,
    columns=None,
    cats={},
    sparse=False,
    drop_first=False,
    dtype="int8",
):
    """Returns a dataframe whose columns are the one-hot encodings of all
    columns in `df`.

    Parameters
    ----------
    df : cudf.DataFrame
        dataframe to encode
    prefix : str, dict, or sequence, optional
        prefix to append. Either a str (to apply a constant prefix), a dict
        mapping column names to prefixes, or a sequence of prefixes with the
        same length as the number of columns. If not supplied, defaults to
        the empty string
    prefix_sep : str, dict, or sequence, optional, default '_'
        separator to use when appending prefixes
    dummy_na : boolean, optional
        Currently a non-functional argument in RAPIDS; passing True raises
        NotImplementedError.
    columns : sequence of str, optional
        Names of columns to encode. If not provided, will attempt to encode
        all columns. Note this is different from pandas' default behavior,
        which encodes all columns with dtype object or categorical
    cats : dict, optional
        dictionary mapping column names to sequences of integers representing
        that column's category. See `cudf.DataFrame.one_hot_encoding` for
        more information. If not supplied, it will be computed
    sparse : boolean, optional
        Currently a non-functional argument in RAPIDS; passing True raises
        NotImplementedError.
    drop_first : boolean, optional
        Currently a non-functional argument in RAPIDS; passing True raises
        NotImplementedError.
    dtype : str, optional
        output dtype, default 'int8'
    """
    if dummy_na:
        raise NotImplementedError("dummy_na is not supported yet")

    if sparse:
        raise NotImplementedError("sparse is not supported yet")

    if drop_first:
        raise NotImplementedError("drop_first is not supported yet")

    from cudf.multi import concat

    # TODO: This has to go away once we start supporting uint8.
    if dtype == np.uint8:
        dtype = "int8"

    encode_fallback_dtypes = ["object", "category"]

    if columns is None or len(columns) == 0:
        columns = df.select_dtypes(include=encode_fallback_dtypes).columns

    def length_check(obj, name):
        err_msg = (
            "Length of '{name}' ({len_obj}) did not match the "
            "length of the columns being encoded ({len_required})."
        )
        if utils.is_list_like(obj):
            if len(obj) != len(columns):
                err_msg = err_msg.format(
                    name=name, len_obj=len(obj), len_required=len(columns)
                )
                raise ValueError(err_msg)

    length_check(prefix, "prefix")
    length_check(prefix_sep, "prefix_sep")

    if prefix is None:
        prefix = columns

    if isinstance(prefix, str):
        prefix_map = {}
    elif isinstance(prefix, dict):
        prefix_map = prefix
    else:
        prefix_map = dict(zip(columns, prefix))

    if isinstance(prefix_sep, str):
        prefix_sep_map = {}
    elif isinstance(prefix_sep, dict):
        prefix_sep_map = prefix_sep
    else:
        prefix_sep_map = dict(zip(columns, prefix_sep))

    # If we have no columns to encode, we need to drop
    # the fallback columns (if any).
    if len(columns) == 0:
        return df.select_dtypes(exclude=encode_fallback_dtypes)
    else:
        df_list = []

        for name in columns:
            # Prefer the column's stored categories; otherwise compute them.
            if hasattr(df[name]._column, "categories"):
                unique = df[name]._column.categories
            else:
                unique = df[name].unique()

            col_enc_df = df.one_hot_encoding(
                name,
                prefix=prefix_map.get(name, prefix),
                cats=cats.get(name, unique),
                prefix_sep=prefix_sep_map.get(name, prefix_sep),
                dtype=dtype,
            )
            df_list.append(col_enc_df)

        return concat(df_list, axis=1).drop(labels=columns)
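# A minimal usage sketch for get_dummies() above, calling it directly as
# defined here. The frame contents and the resulting indicator column
# names are illustrative assumptions, not output captured from the
# library.
def _sketch_get_dummies_usage():
    import cudf

    df = cudf.DataFrame()
    df["id"] = [1, 2, 3]
    df["color"] = ["red", "blue", "red"]

    # Encode only 'color'; each category becomes an int8 indicator column
    # named '<prefix><prefix_sep><category>', and 'color' itself is
    # dropped from the result.
    enc = get_dummies(df, prefix="color", columns=["color"])
    # Expected columns: 'id' plus per-category indicators such as
    # 'color_blue' and 'color_red'.
    return enc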