Exemplo n.º 1
0
def test_cdt_eq(data, ordered):
    """Exercise equality comparisons of ``cudf.CategoricalDtype``.

    A dtype built from ``data``/``ordered`` is expected to compare equal to
    the string ``"category"``, to itself, to a dtype with ``categories=None``
    and the same ordering, and to an identically constructed dtype. It is
    expected to compare unequal to a dtype whose ``ordered`` flag is flipped.
    """
    subject = cudf.CategoricalDtype(categories=data, ordered=ordered)

    # Comparisons that must hold.
    assert subject == "category"
    assert subject == subject

    without_categories = cudf.CategoricalDtype(categories=None,
                                               ordered=ordered)
    assert subject == without_categories

    identical = cudf.CategoricalDtype(categories=data, ordered=ordered)
    assert subject == identical

    # Flipping the ordering must break equality. ``not ... == ...`` is used
    # on purpose so that ``__eq__`` (not ``__ne__``) is what gets exercised.
    flipped_ordering = cudf.CategoricalDtype(categories=data,
                                             ordered=not ordered)
    assert not subject == flipped_ordering
Exemplo n.º 2
0
def test_is_categorical_dispatch():
    """Check ``is_categorical_dtype`` on pandas and cuDF categorical objects.

    Covers dtype instances, Series and Index objects from both libraries.
    """
    values = [1, 2, 3]

    # Categorical dtype instances.
    for lib in (pd, cudf):
        assert is_categorical_dtype(lib.CategoricalDtype(values))

    # Series holding categorical data.
    for lib in (cudf, pd):
        assert is_categorical_dtype(lib.Series(values, dtype="category"))

    # Indexes holding categorical data.
    for lib in (pd, cudf):
        assert is_categorical_dtype(lib.Index(values, dtype="category"))
Exemplo n.º 3
0
def _match_categorical_dtypes_both(lcol: CategoricalColumn,
                                   rcol: CategoricalColumn,
                                   how: str) -> Tuple[ColumnBase, ColumnBase]:
    """Bring two categorical join-key columns to compatible dtypes.

    Parameters
    ----------
    lcol, rcol : CategoricalColumn
        Left and right key columns.
    how : str
        Join kind. ``"inner"`` and ``"left"``/``"leftanti"``/``"leftsemi"``
        are handled explicitly; every other value takes the
        "merge categories" path.

    Returns
    -------
    Tuple[ColumnBase, ColumnBase]
        The (possibly cast) left and right columns.

    Raises
    ------
    TypeError
        If the two dtypes differ and exactly one side is ordered, or if both
        sides are ordered (in which case the dtypes can only differ by their
        categories).
    """
    left_dtype = lcol.dtype
    right_dtype = rcol.dtype

    # Equal dtypes: nothing to cast.
    if left_dtype == right_dtype:
        return lcol, rcol

    # Only one side ordered -> ambiguous, refuse.
    if left_dtype.ordered != right_dtype.ordered:
        raise TypeError("Merging on categorical variables with mismatched"
                        " ordering is ambiguous")

    # Both ordered but dtypes unequal: the categories must be what differs,
    # and ordered categoricals with different categories cannot be merged.
    if left_dtype.ordered and right_dtype.ordered:
        raise TypeError(f"{how} merge between categoricals with "
                        "different categories is only valid when "
                        "neither side is ordered")

    # From here on neither side can be ordered.
    assert not (left_dtype.ordered or right_dtype.ordered)

    if how == "inner":
        # Drop the categorical wrapper on both sides and match the resulting
        # columns instead (presumably the underlying category values --
        # see ``_decategorize``).
        left_plain = lcol.cat()._decategorize()
        right_plain = rcol.cat()._decategorize()
        return _match_join_keys(left_plain, right_plain, how)

    if how in {"left", "leftanti", "leftsemi"}:
        # The left dtype wins; only the right column is cast.
        return lcol, rcol.astype(left_dtype)

    # Any other join kind: cast both sides to an unordered categorical whose
    # categories are the de-duplicated concatenation of both category sets.
    all_categories = cudf.concat(
        [left_dtype.categories, right_dtype.categories]
    ).unique()
    shared_dtype = cudf.CategoricalDtype(categories=all_categories,
                                         ordered=False)
    return lcol.astype(shared_dtype), rcol.astype(shared_dtype)
Exemplo n.º 4
0
def _libcudf_to_output_castrules(lcol, rcol, how):
    """
    Decide which dtype a merge key column should be cast to once
    libcudf has processed it, i.e. whether it must be promoted back
    to a categorical dtype.

    Rules implemented here:

    - Equal left/right dtypes: that dtype is returned unchanged.
    - ``how == "inner"`` with both sides categorical: the string
      ``"category"``.
    - ``how == "left"`` with a categorical left side: the left dtype.
    - ``how == "right"`` with a categorical right side: the right
      dtype.
    - ``how == "outer"`` with both sides categorical: a new
      ``cudf.CategoricalDtype`` whose categories are the unique
      concatenation of both category sets, using the left side's
      ``ordered`` flag.

    In every other situation ``None`` is returned.
    """
    left_dtype = lcol.dtype
    right_dtype = rcol.dtype

    if pd.api.types.is_dtype_equal(left_dtype, right_dtype):
        return left_dtype

    left_is_cat = isinstance(left_dtype, CategoricalDtype)
    right_is_cat = isinstance(right_dtype, CategoricalDtype)
    both_are_cat = left_is_cat and right_is_cat

    # Only categorical keys currently need a cast on the way out.
    if how == "inner" and both_are_cat:
        return "category"

    if how == "left" and left_is_cat:
        return left_dtype

    if how == "right" and right_is_cat:
        return right_dtype

    if how == "outer" and both_are_cat:
        union_of_categories = cudf.concat(
            [left_dtype.categories, right_dtype.categories]
        ).unique()
        return cudf.CategoricalDtype(categories=union_of_categories,
                                     ordered=left_dtype.ordered)

    return None
Exemplo n.º 5
0
def test_merging_categorical_columns():
    """Merge two categorized dask-cudf frames on a shared categorical column.

    Skipped when the installed dask does not provide
    ``union_categoricals_dispatch``. The merged ``cat_col`` is expected to be
    an unordered categorical carrying the categories of both inputs.
    """
    try:
        from dask.dataframe.dispatch import (  # noqa: F401
            union_categoricals_dispatch,
        )
    except ImportError:
        pytest.skip(
            "need a version of dask that has union_categoricals_dispatch")

    def partition_and_categorize(frame):
        # Split into two partitions, then categorize the shared key column.
        partitioned = dgd.from_cudf(frame, npartitions=2)
        return dd.categorical.categorize(partitioned, columns=["cat_col"])

    left = partition_and_categorize(
        cudf.DataFrame({
            "id_1": [0, 1, 2, 3],
            "cat_col": ["a", "b", "f", "f"],
        })
    )
    right = partition_and_categorize(
        cudf.DataFrame({
            "id_2": [111, 112, 113],
            "cat_col": ["g", "h", "f"],
        })
    )

    # Only "f" appears on both sides, so both left rows holding "f" pair with
    # the single matching right row.
    merged_cat_dtype = cudf.CategoricalDtype(
        categories=["a", "b", "f", "g", "h"], ordered=False)
    expected = cudf.DataFrame({
        "id_1": [2, 3],
        "cat_col": cudf.Series(["f", "f"], dtype=merged_cat_dtype),
        "id_2": [113, 113],
    })

    dd.assert_eq(left.merge(right), expected)
Exemplo n.º 6
0
def test_cdf_to_pandas(data, ordered):
    """``cudf.CategoricalDtype.to_pandas`` must equal the pandas dtype built
    from the same ``data`` and ``ordered`` arguments."""
    pandas_dtype = pd.CategoricalDtype(data, ordered)
    converted = cudf.CategoricalDtype(categories=data,
                                      ordered=ordered).to_pandas()
    assert pandas_dtype == converted
Exemplo n.º 7
0
def test_categorical_dtype(categories, ordered):
    """Compare a ``cudf.CategoricalDtype`` against the pandas dtype built
    from the same ``categories`` and ``ordered`` arguments."""
    dtype_kwargs = {"categories": categories, "ordered": ordered}
    assert_eq(
        pd.CategoricalDtype(**dtype_kwargs),
        cudf.CategoricalDtype(**dtype_kwargs),
    )
Exemplo n.º 8
0
def categorical_dtype_cudf(categories=None, ordered=None):
    """Build a ``cudf.CategoricalDtype`` from the given arguments.

    Both ``categories`` and ``ordered`` are forwarded unchanged as keyword
    arguments.
    """
    dtype = cudf.CategoricalDtype(categories=categories, ordered=ordered)
    return dtype
Exemplo n.º 9
0
def _input_to_libcudf_castrules_both_cat(lcol, rcol, how):
    """
    Based off the left and right operands, determine the libcudf
    merge dtype or error for corner cases where the merge cannot
    proceed. This function handles categorical variables.

    Categorical variable typecasting logic depends on both `how`
    and the specifics of the categorical variables to be merged.
    Merging categorical variables when only one side is ordered
    is ambiguous and not allowed. Merging when both categoricals
    are ordered is allowed, but only when the two dtypes compare
    equal, and will result in the common dtype.

    When both sides are unordered and the dtypes differ, the result
    depends on the kind of join:

    - For ``"inner"``, the categoricals are demoted: the result is
      whatever `_input_to_libcudf_castrules_any` returns for the two
      sets of categories.
    - For ``"left"`` or ``"right"``, the result is the left or right
      dtype respectively.
    - For ``"outer"``, the result is an unordered categorical whose
      categories are the union of the categories from both sides.

    Any other value of `how` falls through and yields ``None``.

    Raises
    ------
    TypeError
        If either operand is not a ``CategoricalDtype``, if exactly
        one side is ordered, or if both sides are ordered but the
        dtypes differ.
    """
    ltype = lcol.dtype
    rtype = rcol.dtype

    # This function is only to be used to resolve the result when both
    # sides are categorical. The negation must cover the whole conjunction:
    # ``not A and B`` parses as ``(not A) and B`` and would let a
    # non-categorical operand slip through in two of the three bad cases.
    if not (isinstance(ltype, CategoricalDtype)
            and isinstance(rtype, CategoricalDtype)):
        raise TypeError("Both operands must be CategoricalDtype")

    # true for every configuration
    if ltype == rtype:
        return ltype

    # raise for any join where ordering doesn't match
    if ltype.ordered != rtype.ordered:
        raise TypeError("Merging on categorical variables with mismatched"
                        " ordering is ambiguous")
    elif ltype.ordered and rtype.ordered:
        # if we get to here, categories must be what causes the
        # dtype equality check to fail. And we can never merge
        # two ordered categoricals with different categories
        raise TypeError(f"{how} merge between categoricals with "
                        "different categories is only valid when "
                        "neither side is ordered")

    elif how == "inner":
        # neither ordered, so categories must be different
        # demote to underlying types
        return _input_to_libcudf_castrules_any(ltype.categories,
                                               rtype.categories, how)

    elif how == "left":
        return ltype
    elif how == "right":
        return rtype

    elif how == "outer":
        new_cats = cudf.concat([ltype.categories, rtype.categories]).unique()
        return cudf.CategoricalDtype(categories=new_cats, ordered=False)

    # Unrecognised join kinds are not resolved here.
    return None
Exemplo n.º 10
0
 (pd.Series(dtype="str"), False),
 (pd.Series(dtype="unicode"), False),
 (pd.Series(dtype="datetime64[s]"), False),
 (pd.Series(dtype="timedelta64[s]"), False),
 (pd.Series(dtype="category"), True),
 (pd.Series(dtype="object"), False),
 # cuDF dtypes.
 (cudf.CategoricalDtype, True),
 (cudf.ListDtype, False),
 (cudf.StructDtype, False),
 (cudf.Decimal128Dtype, False),
 (cudf.Decimal64Dtype, False),
 (cudf.Decimal32Dtype, False),
 (cudf.IntervalDtype, False),
 # cuDF dtype instances.
 (cudf.CategoricalDtype("a"), True),
 (cudf.ListDtype(int), False),
 (cudf.StructDtype({"a": int}), False),
 (cudf.Decimal128Dtype(5, 2), False),
 (cudf.Decimal64Dtype(5, 2), False),
 (cudf.Decimal32Dtype(5, 2), False),
 (cudf.IntervalDtype(int), False),
 # cuDF objects
 (cudf.Series(dtype="bool"), False),
 (cudf.Series(dtype="int"), False),
 (cudf.Series(dtype="float"), False),
 (cudf.Series(dtype="str"), False),
 (cudf.Series(dtype="datetime64[s]"), False),
 (cudf.Series(dtype="timedelta64[s]"), False),
 (cudf.Series(dtype="category"), True),
 (cudf.Series(dtype=cudf.Decimal128Dtype(5, 2)), False),
Exemplo n.º 11
0
def find_common_type(dtypes):
    """
    Wrapper over np.find_common_type to handle special cases

    Special handling, in order:

    1. Empty input returns ``None``.
    2. If any dtype is categorical: when all are unordered categoricals
       whose categories share one dtype, a ``cudf.CategoricalDtype`` over
       the unique concatenated categories is returned; when all are
       unordered categoricals with differing category dtypes a
       ``ValueError`` is raised; otherwise object dtype is returned.
    3. If any dtype is decimal: when all are decimal or numeric the result
       comes from ``_find_common_type_decimal`` applied to the decimal
       dtypes; otherwise object dtype is returned.
    4. Datetime dtypes, and then timedelta dtypes, are each collapsed via
       ``np.result_type`` before the final ``np.find_common_type`` call
       ("M8", "M8" -> "M8" | "m8", "m8" -> "m8").
    5. A ``float16`` result is widened to ``float32``.

    Parameters
    ----------
    dtypes : sized iterable of dtypes to find the common type of

    Returns
    -------
    dtype : the common dtype, or None if input is empty

    Raises
    ------
    ValueError
        If all inputs are unordered categoricals but their categories do
        not share a single underlying dtype.
    """

    if len(dtypes) == 0:
        return None

    is_categorical = cudf.api.types.is_categorical_dtype
    is_decimal = cudf.api.types.is_decimal_dtype

    # Categoricals are handled up front since they're not hashable and
    # therefore can't be put in a set.
    if any(is_categorical(dtype) for dtype in dtypes):
        all_unordered_categoricals = all(
            is_categorical(dtype) and not getattr(dtype, "ordered", False)
            for dtype in dtypes
        )
        if not all_unordered_categoricals:
            # TODO: Should this be an error case (mixing categorical with other
            # dtypes) or should this return object? Unclear if we have enough
            # information to decide right now, may have to come back to this as
            # usage of find_common_type increases.
            return cudf.dtype("O")

        category_value_dtypes = {dtype._categories.dtype for dtype in dtypes}
        if len(category_value_dtypes) != 1:
            raise ValueError(
                "Only unordered categories of the same underlying type "
                "may be coerced to a common type.")

        combined_categories = cudf.core.column.concat_columns(
            [dtype._categories for dtype in dtypes])
        return cudf.CategoricalDtype(combined_categories.unique())

    # Collapse duplicates.
    unique_dtypes = set(dtypes)

    if any(is_decimal(dtype) for dtype in unique_dtypes):
        only_decimal_or_numeric = all(
            is_decimal(dtype) or cudf.api.types.is_numeric_dtype(dtype)
            for dtype in unique_dtypes
        )
        if not only_decimal_or_numeric:
            return cudf.dtype("O")
        return _find_common_type_decimal(
            [dtype for dtype in unique_dtypes if is_decimal(dtype)])

    # Datetime ("M") and timedelta ("m") dtypes are each reduced to a single
    # dtype with np.result_type before the generic resolution below.
    datetime_dtypes = {
        dtype for dtype in unique_dtypes
        if cudf.api.types.is_datetime_dtype(dtype)
    }
    if datetime_dtypes:
        unique_dtypes = unique_dtypes - datetime_dtypes
        unique_dtypes.add(np.result_type(*datetime_dtypes))

    timedelta_dtypes = {
        dtype for dtype in unique_dtypes
        if pd.api.types.is_timedelta64_dtype(dtype)
    }
    if timedelta_dtypes:
        unique_dtypes = unique_dtypes - timedelta_dtypes
        unique_dtypes.add(np.result_type(*timedelta_dtypes))

    resolved = np.find_common_type(list(unique_dtypes), [])
    if resolved == np.dtype("float16"):
        # float16 results are widened to float32.
        return cudf.dtype("float32")
    return cudf.dtype(resolved)
Exemplo n.º 12
0
 (float, np.dtype("float64")),
 (cudf.ListDtype("int64"), cudf.ListDtype("int64")),
 ("float16", np.dtype("float32")),
 (np.dtype("U"), np.dtype("object")),
 ("timedelta64", np.dtype("<m8")),
 ("timedelta64[ns]", np.dtype("<m8[ns]")),
 ("timedelta64[ms]", np.dtype("<m8[ms]")),
 ("timedelta64[D]", np.dtype("<m8[D]")),
 ("<m8[s]", np.dtype("<m8[s]")),
 ("datetime64", np.dtype("<M8")),
 ("datetime64[ns]", np.dtype("<M8[ns]")),
 ("datetime64[ms]", np.dtype("<M8[ms]")),
 ("datetime64[D]", np.dtype("<M8[D]")),
 ("<M8[s]", np.dtype("<M8[s]")),
 (cudf.ListDtype("int64"), cudf.ListDtype("int64")),
 ("category", cudf.CategoricalDtype()),
 (
     cudf.CategoricalDtype(categories=("a", "b", "c")),
     cudf.CategoricalDtype(categories=("a", "b", "c")),
 ),
 (
     pd.CategoricalDtype(categories=("a", "b", "c")),
     cudf.CategoricalDtype(categories=("a", "b", "c")),
 ),
 (
     # this is a pandas.core.arrays.numpy_.PandasDtype...
     pd.array([1], dtype="int16").dtype,
     np.dtype("int16"),
 ),
 (pd.IntervalDtype("int"), cudf.IntervalDtype("int64")),
 (cudf.IntervalDtype("int"), cudf.IntervalDtype("int64")),