Example #1
    def apply(self, function):
        """Apply a transformation function over the grouped chunk.
        """
        if not callable(function):
            raise TypeError("type {!r} is not callable".format(type(function)))

        # Slice the concatenated frame back into per-group chunks using the
        # segment start offsets, apply the function to each chunk, and
        # reassemble the results.
        df, segs = self.as_df()
        ends = chain(segs[1:], [None])
        chunks = [df[s:e] for s, e in zip(segs, ends)]
        return concat([function(chk) for chk in chunks])
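The slicing idiom above converts a list of segment start offsets into half-open chunk bounds. A minimal standalone sketch of the same idiom, using plain Python lists as a stand-in for the cudf DataFrame (the data values are illustrative assumptions):

from itertools import chain

segs = [0, 2, 4]                 # start offset of each group
data = [10, 11, 20, 21, 30, 31]  # stand-in for the concatenated frame

# Each group ends where the next one begins; the last runs to the end.
ends = chain(segs[1:], [None])
chunks = [data[s:e] for s, e in zip(segs, ends)]
print(chunks)  # [[10, 11], [20, 21], [30, 31]]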
Example #2
def test_datetime_drop_duplicates():

    date_df = cudf.DataFrame()
    date_df["date"] = date_range("11/20/2018", periods=6, freq="D")
    date_df["value"] = np.random.sample(len(date_df))

    df = concat([date_df, date_df[:4]])
    assert_df(df[:-4], df.drop_duplicates())

    df2 = df.reset_index()
    assert_df(df2[:-4], df2.drop_duplicates())

    df3 = df.set_index("date")
    assert_df(df3[:-4], df3.drop_duplicates())
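The test leans on names defined elsewhere in its module. A minimal set of imports that would make it self-contained might look like the following; assert_df as a pandas-based comparison and date_range coming from pandas are assumptions, not necessarily the library's actual test helpers:

import numpy as np
import pandas as pd
import cudf
from cudf import concat
from pandas import date_range

def assert_df(expected, actual):
    # Hypothetical stand-in: move both frames off the GPU and compare.
    pd.testing.assert_frame_equal(expected.to_pandas(), actual.to_pandas())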
Example #3
    def apply(self, function):
        """Apply a transformation function over the grouped chunk.

        This uses numba's CUDA JIT compiler to convert the Python
        transformation function into a CUDA kernel, and thus incurs a
        compilation overhead on the first run.

        Parameters
        ----------
        function : callable
          The transformation function that will be executed on the CUDA GPU.

        Examples
        --------
        .. code-block:: python

          from cudf import DataFrame
          df = DataFrame()
          df['key'] = [0, 0, 1, 1, 2, 2, 2]
          df['val'] = [0, 1, 2, 3, 4, 5, 6]
          groups = df.groupby(['key'], method='cudf')

          # Define a function to apply to each row in a group
          def mult(df):
            df['out'] = df['key'] * df['val']
            return df

          result = groups.apply(mult)
          print(result)

        Output:

        .. code-block:: python

             key  val  out
          0    0    0    0
          1    0    1    0
          2    1    2    2
          3    1    3    3
          4    2    4    8
          5    2    5   10
          6    2    6   12
        """
        if not callable(function):
            raise TypeError("type {!r} is not callable".format(type(function)))

        # Slice the concatenated frame back into per-group chunks using the
        # segment start offsets, apply the function to each chunk, and
        # reassemble the results.
        df, segs = self.as_df()
        ends = chain(segs[1:], [None])
        chunks = [df[s:e] for s, e in zip(segs, ends)]
        return concat([function(chk) for chk in chunks])
Example #4
def get_dummies(
    df,
    prefix=None,
    prefix_sep="_",
    dummy_na=False,
    columns=None,
    cats=None,
    sparse=False,
    drop_first=False,
    dtype="int8",
):
    """ Returns a dataframe whose columns are the one hot encodings of all
    columns in `df`

    Parameters
    ----------
    df : cudf.DataFrame
        dataframe to encode
    prefix : str, dict, or sequence, optional
        prefix to append. Either a str (to apply a constant prefix), dict
        mapping column names to prefixes, or sequence of prefixes to apply with
        the same length as the number of columns. If not supplied, defaults
        to the empty string
    prefix_sep : str, dict, or sequence, optional, default '_'
        separator to use when appending prefixes
    dummy_na : boolean, optional
        Right now this is NON-FUNCTIONAL argument in rapids.
    cats : dict, optional
        dictionary mapping column names to sequences of integers representing
        that column's category. See `cudf.DataFrame.one_hot_encoding` for more
        information. if not supplied, it will be computed
    sparse : boolean, optional
        Right now this is NON-FUNCTIONAL argument in rapids.
    drop_first : boolean, optional
        Right now this is NON-FUNCTIONAL argument in rapids.
    columns : sequence of str, optional
        Names of columns to encode. If not provided, will attempt to encode all
        columns. Note this is different from pandas default behavior, which
        encodes all columns with dtype object or categorical
    dtype : str, optional
        output dtype, default 'int8'
    """
    if cats is None:
        cats = {}

    if dummy_na:
        raise NotImplementedError("dummy_na is not supported yet")

    if sparse:
        raise NotImplementedError("sparse is not supported yet")

    if drop_first:
        raise NotImplementedError("drop_first is not supported yet")

    from cudf.multi import concat

    # TODO: This has to go away once we start supporting uint8.
    if dtype == np.uint8:
        dtype = "int8"

    encode_fallback_dtypes = ["object", "category"]

    if columns is None or len(columns) == 0:
        columns = df.select_dtypes(include=encode_fallback_dtypes).columns

    def length_check(obj, name):
        err_msg = ("Length of '{name}' ({len_obj}) did not match the "
                   "length of the columns being encoded ({len_required}).")

        if utils.is_list_like(obj):
            if len(obj) != len(columns):
                err_msg = err_msg.format(name=name,
                                         len_obj=len(obj),
                                         len_required=len(columns))
                raise ValueError(err_msg)

    length_check(prefix, "prefix")
    length_check(prefix_sep, "prefix_sep")

    if prefix is None:
        prefix = columns

    if isinstance(prefix, str):
        prefix_map = {}
    elif isinstance(prefix, dict):
        prefix_map = prefix
    else:
        prefix_map = dict(zip(columns, prefix))

    if isinstance(prefix_sep, str):
        prefix_sep_map = {}
    elif isinstance(prefix_sep, dict):
        prefix_sep_map = prefix_sep
    else:
        prefix_sep_map = dict(zip(columns, prefix_sep))

    # If we have no columns to encode, we need to drop fallback columns (if any)
    if len(columns) == 0:
        return df.select_dtypes(exclude=encode_fallback_dtypes)
    else:
        df_list = []

        for name in columns:
            if hasattr(df[name]._column, "categories"):
                unique = df[name]._column.categories
            else:
                unique = df[name].unique()

            col_enc_df = df.one_hot_encoding(
                name,
                prefix=prefix_map.get(name, prefix),
                cats=cats.get(name, unique),
                prefix_sep=prefix_sep_map.get(name, prefix_sep),
                dtype=dtype,
            )
            df_list.append(col_enc_df)

        return concat(df_list, axis=1).drop(labels=columns)
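A minimal usage sketch, assuming a GPU environment with cudf installed; the column values and the exact output layout are illustrative assumptions:

import cudf

df = cudf.DataFrame()
df["animal"] = ["cat", "dog", "cat", "bird"]

enc = get_dummies(df, prefix="is", columns=["animal"])
print(enc)
# Expected: one int8 indicator column per category, e.g.
#    is_bird  is_cat  is_dog
# 0        0       1       0
# 1        0       0       1
# 2        0       1       0
# 3        1       0       0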