Example #1
    def _copy_type_metadata(self, other, include_index: bool = True):
        """
        Copy type metadata from each column of `other` to the corresponding
        column of `self`.
        See `ColumnBase._with_type_metadata` for more information.
        """
        for name, col, other_col in zip(self._data.keys(), self._data.values(),
                                        other._data.values()):
            # libcudf APIs lose all information about GeoColumns, operating
            # solely on the underlying base data. Therefore, our only recourse
            # is to recreate a new GeoColumn with the same underlying data.
            # Since there's no easy way to create a GeoColumn from a
            # NumericalColumn, we're forced to do so manually.
            if isinstance(other_col, GeoColumn):
                col = GeoColumn(other_col._geo, other_col._meta,
                                cudf.Index(col))

            self._data.set_by_label(name,
                                    col._with_type_metadata(other_col.dtype),
                                    validate=False)

        if include_index:
            if self._index is not None and other._index is not None:
                self._index._copy_type_metadata(other._index)
                # When other._index is a CategoricalIndex but self._index is
                # not, the column-level copy above has already converted the
                # underlying column to a CategoricalColumn, so rebuild
                # self._index so that its class matches the new column type.
                if isinstance(
                        other._index,
                        cudf.core.index.CategoricalIndex) and not isinstance(
                            self._index, cudf.core.index.CategoricalIndex):
                    self._index = cudf.Index(self._index._column)

        return self
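
A minimal usage sketch of what this method preserves, using only public cudf API (the GeoColumn handling above is cuspatial-specific and not shown): operations that go through libcudf return plain columns, and the result frame re-applies dtype metadata such as categorical categories from the input. This assumes standard cudf behavior; the snippet is illustrative, not part of the method.

import cudf

# A gather routed through libcudf comes back as a plain column; the metadata
# copy is what keeps the categorical dtype on the result.
s = cudf.Series(["a", "b", "a"], dtype="category")
taken = s.take([0, 2])
assert taken.dtype == s.dtype  # categories and orderedness preserved
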
Example #2
def test_timedelta_datetime_index_ops_misc(datetime_data, timedelta_data,
                                           datetime_dtype, timedelta_dtype):
    gdt = cudf.Index(datetime_data, dtype=datetime_dtype)
    gtd = cudf.Index(timedelta_data, dtype=timedelta_dtype)

    pdt = gdt.to_pandas()
    ptd = gtd.to_pandas()

    assert_eq(gdt - gtd, pdt - ptd)
    assert_eq(gdt + gtd, pdt + ptd)
Example #3
    def astype(self, dtype, copy=False):
        """
        Create an Index with values cast to dtypes. The class of a new Index
        is determined by dtype. When conversion is impossible, a ValueError
        exception is raised.

        Parameters
        ----------
        dtype : numpy dtype
            Use a numpy.dtype to cast entire Index object to.
        copy : bool, default False
            By default, astype always returns a newly allocated object.
            If copy is set to False and internal requirements on dtype are
            satisfied, the original data is used to create a new Index
            or the original Index is returned.

        Returns
        -------
        Index
            Index with values cast to specified dtype.

        Examples
        --------
        >>> import cudf
        >>> index = cudf.Index([1, 2, 3])
        >>> index
        Int64Index([1, 2, 3], dtype='int64')
        >>> index.astype('float64')
        Float64Index([1.0, 2.0, 3.0], dtype='float64')
        """
        if is_dtype_equal(dtype, self.dtype):
            return self.copy(deep=copy)

        return cudf.Index(self.copy(deep=copy)._values.astype(dtype),
                          name=self.name)
Example #4
def test_categorical_basic():
    cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"])
    cudf_cat = cudf.Index(cat)

    pdsr = pd.Series(cat, index=["p", "q", "r", "s", "t"])
    sr = cudf.Series(cat, index=["p", "q", "r", "s", "t"])
    assert_eq(pdsr.cat.codes, sr.cat.codes, check_dtype=False)

    # Test attributes
    assert_eq(pdsr.cat.categories, sr.cat.categories)
    assert pdsr.cat.ordered == sr.cat.ordered

    np.testing.assert_array_equal(
        pdsr.cat.codes.values, sr.cat.codes.to_array()
    )

    string = str(sr)
    expect_str = """
p a
q a
r b
s c
t a
"""
    assert all(x == y for x, y in zip(string.split(), expect_str.split()))
    assert_eq(cat.codes, cudf_cat.codes.to_array())
Example #5
def _union_categoricals(
    to_union: List[Union[cudf.Series, cudf.Index]],
    sort_categories: bool = False,
    ignore_order: bool = False,
):
    """Combine categorical data.

    This API is currently internal but should be exposed once full support for
    cudf.Categorical is ready.
    """
    # TODO(s) in the order specified :
    # 1. The return type needs to be changed
    #    to cudf.Categorical once it is implemented.
    # 2. Make this API public (i.e., to resemble
    #    pd.api.types.union_categoricals)

    if ignore_order:
        raise TypeError("ignore_order is not yet implemented")

    result_col = cudf.core.column.CategoricalColumn._concat(
        [obj._column for obj in to_union])
    if sort_categories:
        sorted_categories = result_col.categories.sort_by_values(
            ascending=True)[0]
        result_col = result_col.reorder_categories(
            new_categories=sorted_categories)

    return cudf.Index(result_col)
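
For reference, a short sketch of the public pandas analog mentioned in the TODOs; the helper above mirrors its semantics but returns a cudf.Index of the concatenated categorical rather than a Categorical.

import pandas as pd
from pandas.api.types import union_categoricals

a = pd.Categorical(["b", "a"])
b = pd.Categorical(["c", "b"])
# sort_categories=True sorts the unioned categories, matching the
# sort_categories branch of _union_categoricals above.
result = union_categoricals([a, b], sort_categories=True)
# result: ['b', 'a', 'c', 'b'] with Categories (3, object): ['a', 'b', 'c']
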
Example #6
def test_index_sample_basic(n, frac, replace):
    psr = pd.Series([1, 2, 3, 4, 5])
    gindex = cudf.Index(psr)
    random_state = 0

    kind = None

    try:
        pout = psr.sample(n=n,
                          frac=frac,
                          replace=replace,
                          random_state=random_state)
    except BaseException as e:
        kind = type(e)
        msg = str(e)

    if kind is not None:
        with pytest.raises(kind, match=msg):
            gout = gindex.sample(n=n,
                                 frac=frac,
                                 replace=replace,
                                 random_state=random_state)
    else:
        gout = gindex.sample(n=n,
                             frac=frac,
                             replace=replace,
                             random_state=random_state)

    if kind is not None:
        return

    assert pout.shape == gout.shape
Example #7
    def __setitem__(self, key, value):
        if cudf.utils.dtypes.is_scalar(
                value) and cudf._lib.scalar._is_null_host_scalar(value):
            to_add_categories = 0
        else:
            to_add_categories = len(
                cudf.Index(value).difference(self.categories))

        if to_add_categories > 0:
            raise ValueError("Cannot setitem on a Categorical with a new "
                             "category, set the categories first")

        if cudf.utils.dtypes.is_scalar(value):
            value = self._encode(value) if value is not None else value
        else:
            value = cudf.core.column.as_column(value).astype(self.dtype)
            value = value.codes
        codes = self.codes
        codes[key] = value
        out = cudf.core.column.build_categorical_column(
            categories=self.categories,
            codes=codes,
            mask=codes.base_mask,
            size=codes.size,
            offset=self.offset,
            ordered=self.ordered,
        )
        self._mimic_inplace(out, inplace=True)
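
A brief usage sketch of the behavior implemented above, through the public Series interface: assigning a value that is already a category succeeds, while assigning an unseen category raises the ValueError constructed above.

import cudf

sr = cudf.Series(["a", "b", "a"], dtype="category")
sr[0] = "b"  # "b" is an existing category, so it is encoded and set in place

try:
    sr[1] = "z"  # "z" is not among the categories
except ValueError:
    pass  # "Cannot setitem on a Categorical with a new category, ..."
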
Example #8
def test_timedelta_index_ops_with_scalars(data, other_scalars, dtype, op):
    gtdi = cudf.Index(data=data, dtype=dtype)
    ptdi = gtdi.to_pandas()

    if op == "add":
        expected = ptdi + other_scalars
        actual = gtdi + other_scalars
    elif op == "sub":
        expected = ptdi - other_scalars
        actual = gtdi - other_scalars
    elif op == "truediv":
        expected = ptdi / other_scalars
        actual = gtdi / other_scalars
    elif op == "floordiv":
        expected = ptdi // other_scalars
        actual = gtdi // other_scalars

    assert_eq(expected, actual)

    if op == "add":
        expected = other_scalars + ptdi
        actual = other_scalars + gtdi
    elif op == "sub":
        expected = other_scalars - ptdi
        actual = other_scalars - gtdi
    elif op == "truediv":
        expected = other_scalars / ptdi
        actual = other_scalars / gtdi
    elif op == "floordiv":
        expected = other_scalars // ptdi
        actual = other_scalars // gtdi

    assert_eq(expected, actual)
Example #9
def test_timedelta_index_ops_with_cudf_scalars(data, cpu_scalar, dtype, op):
    gtdi = cudf.Index(data=data, dtype=dtype)
    ptdi = gtdi.to_pandas()

    gpu_scalar = cudf.Scalar(cpu_scalar)

    if op == "add":
        expected = ptdi + cpu_scalar
        actual = gtdi + gpu_scalar
    elif op == "sub":
        expected = ptdi - cpu_scalar
        actual = gtdi - gpu_scalar
    elif op == "truediv":
        expected = ptdi / cpu_scalar
        actual = gtdi / gpu_scalar
    elif op == "floordiv":
        expected = ptdi // cpu_scalar
        actual = gtdi // gpu_scalar

    assert_eq(expected, actual)

    if op == "add":
        expected = cpu_scalar + ptdi
        actual = gpu_scalar + gtdi
    elif op == "sub":
        expected = cpu_scalar - ptdi
        actual = gpu_scalar - gtdi
    elif op == "truediv":
        expected = cpu_scalar / ptdi
        actual = gpu_scalar / gtdi
    elif op == "floordiv":
        expected = cpu_scalar // ptdi
        actual = gpu_scalar // gtdi

    assert_eq(expected, actual)
Example #10
    def _setitem_tuple_arg(self, key, value):
        if isinstance(self._df.index, cudf.MultiIndex) or isinstance(
                self._df.columns, pd.MultiIndex):
            raise NotImplementedError(
                "Setting values using df.loc[] not supported on "
                "DataFrames with a MultiIndex")

        try:
            columns = self._get_column_selection(key[1])
        except KeyError:
            if not self._df.empty and isinstance(key[0], slice):
                pos_range = get_label_range_or_mask(self._df.index,
                                                    key[0].start, key[0].stop,
                                                    key[0].step)
                idx = self._df.index[pos_range]
            elif self._df.empty and isinstance(key[0], slice):
                idx = None
            else:
                idx = cudf.Index(key[0])
            if is_scalar(value):
                length = len(idx) if idx is not None else 1
                value = as_column(value, length=length)

            new_col = cudf.Series(value, index=idx)
            if not self._df.empty:
                new_col = new_col._align_to_index(self._df.index, how="right")

            if self._df.empty:
                self._df.index = (idx if idx is not None else cudf.RangeIndex(
                    len(new_col)))
            self._df._data.insert(key[1], new_col)
        else:
            for col in columns:
                self._df[col].loc[key[0]] = value
Example #11
def test_index_to_arrow(data):
    pdi = pd.Index(data)
    gdi = cudf.Index(data)

    expected_arrow_array = pa.Array.from_pandas(pdi)
    got_arrow_array = gdi.to_arrow()

    assert_eq(expected_arrow_array, got_arrow_array)
Example #12
def test_is_categorical_dispatch():
    assert is_categorical_dtype(pd.CategoricalDtype([1, 2, 3]))
    assert is_categorical_dtype(cudf.CategoricalDtype([1, 2, 3]))

    assert is_categorical_dtype(cudf.Series([1, 2, 3], dtype="category"))
    assert is_categorical_dtype(pd.Series([1, 2, 3], dtype="category"))

    assert is_categorical_dtype(pd.Index([1, 2, 3], dtype="category"))
    assert is_categorical_dtype(cudf.Index([1, 2, 3], dtype="category"))
Example #13
def test_multiindex_sample_basic(n, frac, replace, axis):
    # Sampling columns (axis=1) with replacement can repeat a column, and
    # cudf does not currently support duplicate column names.
    if axis == 1 and replace:
        return
    pdf = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5],
            "float": [0.05, 0.2, 0.3, 0.2, 0.25],
            "int": [1, 3, 5, 4, 2],
        },
    )
    mul_index = cudf.Index(DataFrame.from_pandas(pdf))
    random_state = 0

    try:
        pout = pdf.sample(
            n=n,
            frac=frac,
            replace=replace,
            random_state=random_state,
            axis=axis,
        )
    except BaseException:
        assert_exceptions_equal(
            lfunc=pdf.sample,
            rfunc=mul_index.sample,
            lfunc_args_and_kwargs=(
                [],
                {
                    "n": n,
                    "frac": frac,
                    "replace": replace,
                    "random_state": random_state,
                    "axis": axis,
                },
            ),
            rfunc_args_and_kwargs=(
                [],
                {
                    "n": n,
                    "frac": frac,
                    "replace": replace,
                    "random_state": random_state,
                    "axis": axis,
                },
            ),
        )
    else:
        gout = mul_index.sample(
            n=n,
            frac=frac,
            replace=replace,
            random_state=random_state,
            axis=axis,
        )
        assert pout.shape == gout.shape
Example #14
    def __getitem__(self, arg):
        if isinstance(arg, tuple):
            arg = list(arg)
        data = self._sr._column[arg]

        if (isinstance(data, (dict, list)) or _is_scalar_or_zero_d_array(data)
                or _is_null_host_scalar(data)):
            return data
        return self._sr._from_data({self._sr.name: data},
                                   index=cudf.Index(self._sr.index.take(arg)))
Example #15
def test_index_difference_sort_error():
    pdi = pd.Index([1, 2, 3])
    gdi = cudf.Index([1, 2, 3])

    assert_exceptions_equal(
        pdi.difference,
        gdi.difference,
        ([pdi], {"sort": True}),
        ([gdi], {"sort": True}),
    )
Example #16
    def run(
        self,
        train_df: cudf.DataFrame,
        test_df: Optional[cudf.DataFrame] = None,
        log: bool = False,
    ):
        with timer(self.name, log=log):
            self.create_features(train_df, test_df=test_df)
            prefix = self.prefix + "_" if self.prefix else ""
            suffix = "_" + self.suffix if self.suffix else ""
            self.train.columns = cudf.Index(
                [str(c) for c in self.train.columns]).to_array()
            self.valid.columns = cudf.Index(
                [str(c) for c in self.valid.columns]).to_array()
            self.test.columns = cudf.Index(
                [str(c) for c in self.test.columns]).to_array()
            self.train.columns = prefix + self.train.columns + suffix
            self.valid.columns = prefix + self.valid.columns + suffix
            self.test.columns = prefix + self.test.columns + suffix
        return self
Example #17
def test_index_tolist(data, dtype):
    gdi = cudf.Index(data, dtype=dtype)

    with pytest.raises(
            TypeError,
            match=re.escape(
                r"cuDF does not support conversion to host memory "
                r"via `tolist()` method. Consider using "
                r"`.to_arrow().to_pylist()` to construct a Python list."),
    ):
        gdi.tolist()
Example #18
def test_index_iter_error(data, dtype):
    gdi = cudf.Index(data, dtype=dtype)

    with pytest.raises(
            TypeError,
            match=re.escape(
                f"{gdi.__class__.__name__} object is not iterable. "
                f"Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` "
                f"if you wish to iterate over the values."),
    ):
        iter(gdi)
Example #19
def test_index_difference_sort_error():
    pdi = pd.Index([1, 2, 3])
    gdi = cudf.Index([1, 2, 3])

    try:
        pdi.difference(pdi, sort=True)
    except Exception as e:
        with pytest.raises(type(e), match=str(e)):
            gdi.difference(gdi, sort=True)
    else:
        raise AssertionError("Expected pdi.difference to fail when sort=True")
Example #20
    def difference(self, other, sort=None):
        """
        Return a new Index with elements from the index that are not in
        `other`.

        This is the set difference of two Index objects.

        Parameters
        ----------
        other : Index or array-like
        sort : False or None, default None
            Whether to sort the resulting index. By default, cudf
            attempts to sort the values, but catches any TypeError
            raised when comparing incomparable elements.

            * None : Attempt to sort the result, but catch any TypeErrors
              from comparing incomparable elements.
            * False : Do not sort the result.

        Returns
        -------
        difference : Index

        Examples
        --------
        >>> import cudf
        >>> idx1 = cudf.Index([2, 1, 3, 4])
        >>> idx1
        Int64Index([2, 1, 3, 4], dtype='int64')
        >>> idx2 = cudf.Index([3, 4, 5, 6])
        >>> idx2
        Int64Index([3, 4, 5, 6], dtype='int64')
        >>> idx1.difference(idx2)
        Int64Index([1, 2], dtype='int64')
        >>> idx1.difference(idx2, sort=False)
        Int64Index([2, 1], dtype='int64')
        """
        if sort not in {None, False}:
            raise ValueError(f"The 'sort' keyword only takes the values "
                             f"of None or False; {sort} was passed.")

        other = cudf.Index(other)

        if is_mixed_with_object_dtype(self, other):
            difference = self.copy()
        else:
            difference = self.join(other, how="leftanti")
            if self.dtype != other.dtype:
                difference = difference.astype(self.dtype)

        if sort is None and len(other):
            return difference.sort_values()

        return difference
Example #21
def test_categorical_allow_nan():
    gs = cudf.Series([1, 2, np.nan, 10, np.nan, None], nan_as_null=False)
    gs = gs.astype("category")
    expected_codes = cudf.Series([0, 1, 3, 2, 3, None], dtype="uint8")
    assert_eq(expected_codes, gs.cat.codes)

    expected_categories = cudf.Index([1.0, 2.0, 10.0, np.nan], dtype="float64")
    assert_eq(expected_categories, gs.cat.categories)

    actual_ps = gs.to_pandas()
    expected_ps = pd.Series([1.0, 2.0, np.nan, 10.0, np.nan, np.nan],
                            dtype="category")
    assert_eq(actual_ps, expected_ps)
Example #22
def test_cudf_factorize_index():
    data = [1, 2, 3, 4, 5]

    pi = pd.Index(data)
    gi = cudf.Index(data)

    expect = pd.factorize(pi)
    got = cudf.factorize(gi)

    assert len(expect) == len(got)

    np.testing.assert_array_equal(expect[0], got[0].get())
    np.testing.assert_array_equal(expect[1], got[1].values.get())
Example #23
def test_multiindex_sample_basic(n, frac, replace, axis):
    # Sampling columns (axis=1) with replacement can repeat a column, and
    # cudf does not currently support duplicate column names.
    if axis == 1 and replace:
        return
    pdf = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5],
            "float": [0.05, 0.2, 0.3, 0.2, 0.25],
            "int": [1, 3, 5, 4, 2],
        },
    )
    mul_index = cudf.Index(DataFrame.from_pandas(pdf))
    random_state = 0

    kind = None

    try:
        pout = pdf.sample(
            n=n,
            frac=frac,
            replace=replace,
            random_state=random_state,
            axis=axis,
        )
    except BaseException as e:
        kind = type(e)
        msg = str(e)

    if kind is not None:
        with pytest.raises(kind, match=msg):
            gout = mul_index.sample(
                n=n,
                frac=frac,
                replace=replace,
                random_state=random_state,
                axis=axis,
            )
    else:
        gout = mul_index.sample(
            n=n,
            frac=frac,
            replace=replace,
            random_state=random_state,
            axis=axis,
        )

    if kind is not None:
        return

    assert pout.shape == gout.shape
Example #24
def _pivot(df, index, columns):
    """
    Reorganize the values of the DataFrame according to the given
    index and columns.

    Parameters
    ----------
    df : DataFrame
    index : cudf.Index
        Index labels of the result
    columns : cudf.Index
        Column labels of the result
    """
    columns_labels, columns_idx = columns._encode()
    index_labels, index_idx = index._encode()
    column_labels = columns_labels.to_pandas().to_flat_index()

    # the result of pivot always has a multicolumn
    result = cudf.core.column_accessor.ColumnAccessor(multiindex=True,
                                                      level_names=(None, ) +
                                                      columns._data.names)

    def as_tuple(x):
        return x if isinstance(x, tuple) else (x, )

    for v in df:
        names = [as_tuple(v) + as_tuple(name) for name in column_labels]
        nrows = len(index_labels)
        ncols = len(names)
        num_elements = nrows * ncols
        if num_elements > 0:
            col = df._data[v]
            scatter_map = (columns_idx * np.int32(nrows)) + index_idx
            target = cudf.core.frame.Frame({
                None:
                cudf.core.column.column_empty_like(col,
                                                   masked=True,
                                                   newsize=nrows * ncols)
            })
            target._data[None][scatter_map] = col
            result_frames = target._split(range(nrows, nrows * ncols, nrows))
            result.update({
                name: next(iter(f._columns))
                for name, f in zip(names, result_frames)
            })

    return cudf.DataFrame._from_data(result,
                                     index=cudf.Index(index_labels,
                                                      name=index.name))
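
A hedged sketch of the public entry point that ends up in this helper; it assumes cudf's DataFrame.pivot mirrors the pandas signature. With `values` omitted, the result carries the two-level column index noted in the "multicolumn" comment above.

import cudf

df = cudf.DataFrame({
    "row": ["r0", "r0", "r1"],
    "col": ["c0", "c1", "c0"],
    "val": [1, 2, 3],
})
# Columns of the result form a MultiIndex, e.g. ("val", "c0"), ("val", "c1").
pivoted = df.pivot(index="row", columns="col")
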
Example #25
    def _clean_nulls_from_index(self):
        """
        Convert all NA values (if any) in the Index to `<NA>` as a
        preprocessing step for the `__repr__` methods.

        This involves changing the type of the Index to StringIndex;
        it is the responsibility of the `__repr__` methods using this
        method to replace or handle the representation of the actual
        types correctly.
        """
        if self._values.has_nulls():
            return cudf.Index(self._values.astype("str").fillna(cudf._NA_REP),
                              name=self.name)
        else:
            return self
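
A small illustration of why this preprocessing exists, assuming default cudf null handling: an Index containing nulls renders them as `<NA>` by falling back to a string representation for display only.

import cudf

idx = cudf.Index([1, None, 3])
# repr() goes through _clean_nulls_from_index, so the output reads roughly
# Int64Index([1, <NA>, 3], dtype='int64'); the underlying dtype is unchanged.
print(repr(idx))
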
Example #26
def test_multiindex_to_arrow():
    pdf = pd.DataFrame({
        "a": [1, 2, 1, 2, 3],
        "b": [1.0, 2.0, 3.0, 4.0, 5.0],
        "c": np.array([1, 2, 3, None, 5], dtype="datetime64[s]"),
        "d": ["a", "b", "c", "d", "e"],
    })
    pdf["a"] = pdf["a"].astype("category")
    df = cudf.from_pandas(pdf)
    gdi = cudf.Index(df)

    expected = pa.Table.from_pandas(pdf)
    got = gdi.to_arrow()

    assert_eq(expected, got)
Example #27
def test_categorical_index_with_nan_repr():
    cat_index = cudf.Index(
        cudf.Series([1, 2, np.nan, 10, np.nan, None],
                    nan_as_null=False).astype("category"))

    expected_repr = (
        "CategoricalIndex([1.0, 2.0, NaN, 10.0, NaN, <NA>], "
        "categories=[1.0, 2.0, 10.0, NaN], ordered=False, dtype='category')")

    assert cat_index.__repr__() == expected_repr

    sliced_expected_repr = (
        "CategoricalIndex([NaN, 10.0, NaN, <NA>], "
        "categories=[1.0, 2.0, 10.0, NaN], ordered=False, dtype='category')")

    assert cat_index[2:].__repr__() == sliced_expected_repr
Example #28
def test_factorize_result_classes():
    data = [1, 2, 3]

    labels, cats = cudf.factorize(cudf.Series(data))

    assert isinstance(labels, cp.ndarray)
    assert isinstance(cats, cudf.BaseIndex)

    labels, cats = cudf.factorize(cudf.Index(data))

    assert isinstance(labels, cp.ndarray)
    assert isinstance(cats, cudf.BaseIndex)

    labels, cats = cudf.factorize(cp.array(data))

    assert isinstance(labels, cp.ndarray)
    assert isinstance(cats, cp.ndarray)
Example #29
def _pivot(df, index, columns):
    """
    Reorganize the values of the DataFrame according to the given
    index and columns.

    Parameters
    ----------
    df : DataFrame
    index : cudf.core.index.Index
        Index labels of the result
    columns : cudf.core.index.Index
        Column labels of the result
    """
    columns_labels, columns_idx = columns._encode()
    index_labels, index_idx = index._encode()
    column_labels = columns_labels.to_pandas().to_flat_index()

    # the result of pivot always has a multicolumn
    result = cudf.core.column_accessor.ColumnAccessor(
        multiindex=True, level_names=(None,) + columns._data.names
    )

    def as_tuple(x):
        return x if isinstance(x, tuple) else (x,)

    for v in df:
        names = [as_tuple(v) + as_tuple(name) for name in column_labels]
        col = df._data[v]
        result.update(
            cudf.DataFrame._from_table(
                col.scatter_to_table(
                    index_idx,
                    columns_idx,
                    names,
                    nrows=len(index_labels),
                    ncols=len(names),
                )
            )._data
        )

    return cudf.DataFrame(
        result, index=cudf.Index(index_labels, name=index.name)
    )
Example #30
def test_timedelta_index_properties(data, dtype, name):
    gdi = cudf.Index(data, dtype=dtype, name=name)
    pdi = gdi.to_pandas()

    def local_assert(expected, actual):
        if actual._values.null_count:
            assert_eq(expected, actual.astype("float64"))
        else:
            assert_eq(expected, actual)

    expected_days = pdi.days
    actual_days = gdi.days

    local_assert(expected_days, actual_days)

    expected_seconds = pdi.seconds
    actual_seconds = gdi.seconds

    local_assert(expected_seconds, actual_seconds)

    expected_microseconds = pdi.microseconds
    actual_microseconds = gdi.microseconds

    local_assert(expected_microseconds, actual_microseconds)

    expected_nanoseconds = pdi.nanoseconds
    actual_nanoseconds = gdi.nanoseconds

    local_assert(expected_nanoseconds, actual_nanoseconds)

    expected_components = pdi.components
    actual_components = gdi.components

    if actual_components.isnull().any().any():
        assert_eq(expected_components, actual_components.astype("float"))
    else:
        assert_eq(
            expected_components,
            actual_components,
            check_index_type=not actual_components.empty,
        )