Пример #1
0
    def hash_object_cudf_index(ind, index=None):
        """Hash a cuDF index with ``safe_hash``.

        A ``cudf.MultiIndex`` is converted to a frame (``index=False``) and
        hashed as a whole.  Anything else is converted to a column, wrapped
        in a ``cudf.Series`` and hashed.  The ``index`` argument is accepted
        but not used by this function.
        """
        if not isinstance(ind, cudf.MultiIndex):
            return safe_hash(cudf.Series(column.as_column(ind)))
        return safe_hash(ind.to_frame(index=False))
Пример #2
0
 def find_last_value(self, value, closest=False):
     """
     Return the offset of the last element equal to *value*.

     *value* is parsed with ``pd.to_datetime``, converted to a column of
     this column's dtype and reduced to its numerical representation; the
     lookup itself is delegated to the numerical view of this column.
     *closest* is forwarded unchanged to that lookup.
     """
     parsed = pd.to_datetime(value)
     typed_col = column.as_column(parsed, dtype=self.dtype)
     numeric_value = typed_col.as_numerical[0]
     return self.as_numerical.find_last_value(numeric_value, closest=closest)
Пример #3
0
def _normalize_find_and_replace_input(input_column_dtype, col_to_normalize):
    """Convert *col_to_normalize* to a column of *input_column_dtype*.

    The input is first turned into a column.  Its "source" dtype is then
    determined as follows:

    * ``list`` input: the result of ``min_numeric_column_type`` on the
      column; a single-element list is additionally checked to survive a
      round trip through ``input_column_dtype`` (NaN is exempt), after which
      the source dtype is taken to be ``input_column_dtype`` itself.
    * any object with a ``dtype`` attribute: that attribute.

    Raises
    ------
    TypeError
        If the input is neither a list nor has a ``dtype`` attribute, if a
        single-element list does not round-trip through the target dtype,
        or if the source dtype is a float while the target is an integer,
        or compares greater than the target dtype.
    """
    normalized_column = column.as_column(col_to_normalize)

    if isinstance(col_to_normalize, list):
        source_dtype = min_numeric_column_type(normalized_column)
        # Scalar case: a one-element list must be exactly representable
        # in the target dtype.
        if len(col_to_normalize) == 1:
            casted = input_column_dtype.type(col_to_normalize[0])
            if not np.isnan(casted) and (casted != col_to_normalize[0]):
                raise TypeError(
                    f"Cannot safely cast non-equivalent "
                    f"{col_to_normalize[0]} "
                    f"to {input_column_dtype.name}"
                )
            source_dtype = input_column_dtype
    elif hasattr(col_to_normalize, "dtype"):
        source_dtype = col_to_normalize.dtype
    else:
        raise TypeError(f"Type {type(col_to_normalize)} not supported")

    float_into_int = (
        source_dtype.kind == "f" and input_column_dtype.kind == "i"
    )
    if float_into_int or (source_dtype > input_column_dtype):
        raise TypeError(
            f"Potentially unsafe cast for non-equivalent "
            f"{source_dtype.name} "
            f"to {input_column_dtype.name}"
        )
    return normalized_column.astype(input_column_dtype)
Пример #4
0
    def deserialize(cls, header: dict, frames: list) -> CategoricalColumn:
        """Rebuild a ``CategoricalColumn`` from a header and its frames.

        ``frames`` is consumed in this order: the dtype frames
        (``header["dtype_frames_count"]`` of them), then the data frames
        (``header["data_frames_count"]`` of them), then, only when
        ``header`` contains a ``"mask"`` entry, one mask frame.
        """
        dtype_stop = header["dtype_frames_count"]
        dtype = CategoricalDtype.deserialize(
            header["dtype"], frames[:dtype_stop]
        )
        data_stop = dtype_stop + header["data_frames_count"]

        # NOTE(review): pickle.loads can execute arbitrary code; this is only
        # safe if headers always come from a trusted peer. Confirm.
        codes_type = pickle.loads(header["data"]["type-serialized"])
        codes = codes_type.deserialize(
            header["data"], frames[dtype_stop:data_stop]
        )

        mask = (
            Buffer.deserialize(header["mask"], [frames[data_stop]])
            if "mask" in header
            else None
        )

        # The categorical column carries no data buffer of its own; the
        # deserialized codes are attached as its single child.
        rebuilt = column.build_column(
            data=None,
            dtype=dtype,
            mask=mask,
            children=(column.as_column(codes.base_data, dtype=codes.dtype),),
        )
        return cast(CategoricalColumn, rebuilt)
Пример #5
0
    def as_numerical_column(self, dtype, **kwargs):
        """Convert this column to a numerical column of *dtype*.

        The conversion routine is looked up in
        ``_str_to_numeric_typecast_functions`` and is handed a pointer to a
        freshly allocated device array through the ``devptr`` keyword.
        ``int8``/``int16`` targets are converted as ``int32`` and
        ``datetime64`` targets are stored as ``int64`` (with the datetime
        unit forwarded as ``units``); the final ``astype`` brings the result
        to the requested dtype.  When this column has nulls, a null mask is
        built from ``self.data`` and attached to the result.
        """
        out_dtype = np.dtype(dtype)
        mem_dtype = str_dtype = out_dtype

        if out_dtype.type in (np.int8, np.int16):
            mem_dtype = str_dtype = np.dtype(np.int32)
        elif out_dtype.type is np.datetime64:
            kwargs["units"] = np.datetime_data(out_dtype)[0]
            mem_dtype = np.dtype(np.int64)

        result_buf = rmm.device_array(shape=len(self), dtype=mem_dtype)
        kwargs["devptr"] = libcudf.cudf.get_ctype_ptr(result_buf)

        convert = _str_to_numeric_typecast_functions[str_dtype]
        convert(self.str(), **kwargs)

        result = column.as_column(result_buf)

        if self.null_count > 0:
            mask_len = utils.calc_chunk_size(len(self.data),
                                             utils.mask_bitsize)
            mask_buf = rmm.device_array(mask_len, dtype="int8")
            mask_ptr = libcudf.cudf.get_ctype_ptr(mask_buf)
            self.data.set_null_bitmask(mask_ptr, bdevmem=True)
            result = result.set_mask(Buffer(mask_buf))

        return result.astype(out_dtype)
Пример #6
0
    def fillna(self,
               fill_value: Any = None,
               method: Any = None,
               dtype: Dtype = None) -> CategoricalColumn:
        """
        Return a categorical column with nulls replaced by *fill_value*.

        A column that is not nullable is returned as-is.  A scalar
        *fill_value* is translated to its code (the default NA value is
        passed through without encoding); a non-scalar *fill_value* is
        converted to a column, re-coded against this column's categories and
        reduced to its codes.  The fill itself is delegated to the parent
        class, and the result is wrapped back into a categorical column with
        this column's categories and ordering.  *dtype* is not used here.

        Raises
        ------
        ValueError
            If encoding a scalar *fill_value* raises ``ValueError``.
        """
        if not self.nullable:
            return self

        if fill_value is not None:
            if not np.isscalar(fill_value):
                fill_value = column.as_column(fill_value, nan_as_null=False)
                # TODO: only required if fill_value has a subset of the
                # categories:
                fill_value = fill_value.cat()._set_categories(
                    fill_value.cat().categories,
                    self.categories,
                    is_unique=True,
                )
                fill_codes = column.as_column(fill_value.codes)
                fill_value = fill_codes.astype(self.codes.dtype)
            elif fill_value == self.default_na_value():
                fill_value = self.codes.dtype.type(fill_value)
            else:
                try:
                    fill_value = self._encode(fill_value)
                    fill_value = self.codes.dtype.type(fill_value)
                except ValueError as err:
                    raise ValueError(
                        "fill value must be in categories") from err

        filled = super().fillna(value=fill_value, method=method)

        return column.build_categorical_column(
            categories=self.dtype.categories._values,
            codes=column.as_column(filled.base_data, dtype=filled.dtype),
            offset=filled.offset,
            size=filled.size,
            mask=filled.base_mask,
            ordered=self.dtype.ordered,
        )
Пример #7
0
    def find_and_replace(
        self,
        to_replace: ColumnLike,
        replacement: ColumnLike,
        all_nan: bool = False,
    ) -> NumericalColumn:
        """
        Return a column in which each value of *to_replace* is replaced by
        the value at the same position in *replacement*.

        Both inputs are converted to columns and must produce the same
        column type.  If neither is numerical, an unchanged copy is
        returned.  A single replacement value is broadcast over all of
        *to_replace*.  Duplicate entries in *to_replace* keep their last
        replacement, and a null entry in *to_replace* is applied through
        ``fillna`` before the remaining pairs are handed to libcudf.

        Raises
        ------
        TypeError
            If *to_replace* and *replacement* convert to different column
            types.
        """
        to_replace_col = column.as_column(to_replace)
        replacement_col = column.as_column(replacement)

        if type(to_replace_col) != type(replacement_col):
            raise TypeError(
                f"to_replace and value should be of same types,"
                f"got to_replace dtype: {to_replace_col.dtype} and "
                f"value dtype: {replacement_col.dtype}")

        # Nothing to do when neither side is numerical.
        if not (isinstance(to_replace_col, NumericalColumn)
                or isinstance(replacement_col, NumericalColumn)):
            return self.copy()

        to_replace_col = _normalize_find_and_replace_input(
            self.dtype, to_replace)
        replacement_col = (
            column.as_column(replacement, dtype=self.dtype)
            if all_nan
            else _normalize_find_and_replace_input(self.dtype, replacement)
        )

        replaced = self.copy()
        if len(replacement_col) == 1:
            if len(to_replace_col) > 1:
                # Broadcast the single replacement to one per target value.
                broadcast = utils.scalar_broadcast_to(
                    replacement[0], (len(to_replace_col), ), self.dtype)
                replacement_col = column.as_column(broadcast)
            elif len(to_replace_col) == 0:
                return replaced

        to_replace_col, replacement_col, replaced = numeric_normalize_types(
            to_replace_col, replacement_col, replaced)

        pairs = cudf.DataFrame({"old": to_replace_col,
                                "new": replacement_col})
        pairs = pairs.drop_duplicates(
            subset=["old"], keep="last", ignore_index=True)

        old_values = pairs._data["old"]
        if old_values.null_count == 1:
            # A null target is handled as a fill, then dropped from the map.
            null_replacement = pairs._data["new"][old_values.isna()][0]
            replaced = replaced.fillna(null_replacement)
            pairs = pairs.dropna(subset=["old"])

        return libcudf.replace.replace(replaced, pairs["old"]._column,
                                       pairs["new"]._column)
Пример #8
0
def get_sorted_inds(by, ascending=True, na_position="last"):
    """
        Compute the indices that sort the given column(s).

        Parameters
        ----------
        by : Column or list of Column
            Column or list of Column objects to sort by.
        ascending : bool or list of bool, default True
            If True, sort values in ascending order, otherwise descending.
            A list gives one flag per column.
        na_position : {'first', 'last'}, default 'last'
            'first' puts NaNs at the beginning, 'last' puts NaNs at the end.
            Ignored (with a logged warning) when ``ascending`` is not a
            plain ``True``/``False``.

        Returns
        -------
        col_inds : cuDF Column of indices sorted based on input

        Raises
        ------
        ValueError
            If ``ascending`` is neither a scalar nor a sequence.

        Difference from pandas:
          * Support axis='index' only.
          * Not supporting: inplace, kind
          * Ascending can be a list of bools to control per column
    """
    sort_columns = [by] if isinstance(by, ColumnBase) else by

    col_inds = column.as_column(
        cudautils.arange(len(sort_columns[0]), dtype="int32"))

    # Translate na_position into the integer flag passed to libcudf.
    # This needs to be updated to handle list of bools for ascending.
    if ascending is True or ascending is False:
        if na_position == "last":
            na_position = 0 if ascending else 1
        elif na_position == "first":
            na_position = 1 if ascending else 0
    else:
        logging.warning(
            "When using a sequence of booleans for `ascending`, `na_position` "
            "flag is not yet supported and defaults to treating nulls as "
            "greater than all numbers")
        na_position = 0

    # A scalar flag is repeated once per sort column.
    if np.isscalar(ascending):
        ascending = [ascending] * len(sort_columns)
    if not isinstance(ascending, collections.abc.Sequence):
        raise ValueError("Must use a boolean or list of booleans")

    # libcudf uses 0 for ascending, so every flag is inverted before the
    # array is copied to the device.
    inverted_flags = np.array([not flag for flag in ascending], dtype="int8")
    ascending = rmm.to_device(inverted_flags)

    libcudf.sort.order_by(sort_columns, col_inds, ascending, na_position)

    return col_inds
Пример #9
0
 def normalize_chunks(self, size, chunks):
     """Return the chunk leading offsets as a device array view.

     An ``int`` *chunks* is a chunk size: offsets are generated from 0 up
     to *size* in steps of *chunks*.  Anything else is taken to already be
     an array of leading offsets and is converted to a column.
     """
     if not isinstance(chunks, int):
         # *chunks* is an array of chunk leading offset
         return column.as_column(chunks).data_array_view
     # *chunks* is the chunksize
     return column.arange(0, size, chunks).data_array_view
Пример #10
0
 def normalize_chunks(self, size, chunks):
     """Return the chunk leading offsets.

     An integer *chunks* is a chunk size: offsets are generated with
     ``cudautils.arange`` from 0 up to *size* in steps of *chunks*.
     Anything else is taken to already be an array of leading offsets; it
     is converted to a column and its data array view is returned.
     """
     if not isinstance(chunks, six.integer_types):
         # *chunks* is an array of chunk leading offset
         offsets_col = column.as_column(chunks)
         return offsets_col.data_array_view
     # *chunks* is the chunksize
     return cudautils.arange(0, size, chunks)
Пример #11
0
    def add_categories(self, new_categories, **kwargs):
        """Append *new_categories* to the current categories.

        The new categories are converted to a column and appended after the
        existing ones.  The column is re-coded only when the combined
        categories are not considered equal to the current ones.  All
        keyword arguments are forwarded to the equality check, the
        re-coding step and ``_return_or_inplace``.
        """
        extra = column.as_column(new_categories)
        combined = self._column.categories.append(extra)
        if self._categories_equal(combined, **kwargs):
            out_col = self._column
        else:
            out_col = self._set_categories(combined, **kwargs)

        return self._return_or_inplace(out_col, **kwargs)
Пример #12
0
def _create_empty_categorical_column(
    categorical_column: CategoricalColumn, dtype: "CategoricalDtype"
) -> CategoricalColumn:
    """Build a categorical column of *dtype* whose codes are all the NA value.

    The result has the size, offset and mask of *categorical_column*, the
    categories and ordering of *dtype*, and codes filled with
    ``categorical_column.default_na_value()``.
    """
    new_categories = column.as_column(dtype.categories)
    # NOTE(review): np.dtype() is given the codes object itself, not its
    # dtype; presumably numpy resolves this through the object's ``dtype``
    # attribute. Confirm.
    na_codes = cudf.utils.utils.scalar_broadcast_to(
        categorical_column.default_na_value(),
        categorical_column.size,
        np.dtype(categorical_column.cat().codes),
    )
    return column.build_categorical_column(
        categories=new_categories,
        codes=column.as_column(na_codes),
        offset=categorical_column.offset,
        size=categorical_column.size,
        mask=categorical_column.base_mask,
        ordered=dtype.ordered,
    )
Пример #13
0
    def add_categories(self, new_categories, **kwargs):
        """
        Add new categories.

        `new_categories` are placed after the existing categories
        (the last/highest positions) and are unused by any value
        directly after this call.

        Parameters
        ----------

        new_categories : category or list-like of category
            The new categories to be included.

        inplace : bool, default False
            Whether to add the categories inplace or to return
            a copy of this categorical with the categories added.

        Returns
        -------
        cat
            Categorical with new categories added or
            None if inplace.

        Examples
        --------
        >>> import cudf
        >>> s = cudf.Series([1, 2], dtype="category")
        >>> s
        0    1
        1    2
        dtype: category
        Categories (2, int64): [1, 2]
        >>> s.cat.add_categories([0, 3, 4])
        0    1
        1    2
        dtype: category
        Categories (5, int64): [1, 2, 0, 3, 4]
        >>> s
        0    1
        1    2
        dtype: category
        Categories (2, int64): [1, 2]
        >>> s.cat.add_categories([0, 3, 4], inplace=True)
        >>> s
        0    1
        1    2
        dtype: category
        Categories (5, int64): [1, 2, 0, 3, 4]
        """
        appended = self._column.categories.append(
            column.as_column(new_categories)
        )
        # Re-code only when the combined categories differ from the
        # current ones.
        if self._categories_equal(appended, **kwargs):
            result_col = self._column
        else:
            result_col = self._set_categories(appended, **kwargs)

        return self._return_or_inplace(result_col, **kwargs)
Пример #14
0
 def wrapper(*args, **kwargs):
     """Call ``passed_attr`` and wrap an nvstrings result in a ``Series``.

     Any other return value is passed through unchanged.  A wrapped result
     takes its index and name from the enclosing ``self``.
     """
     ret = passed_attr(*args, **kwargs)
     if not isinstance(ret, nvstrings.nvstrings):
         return ret
     return Series(
         column.as_column(ret),
         index=self._index,
         name=self._name,
     )
Пример #15
0
    def _getitem_tuple_arg(self, arg):
        """Resolve a two-element ``iloc`` key against ``self._df``.

        ``arg[1]`` selects columns and ``arg[0]`` selects rows.  Columns are
        gathered first, then rows, and the result is downcast to a Series
        when ``_can_downcast_to_series`` allows it.

        Row selection depends on the index type and on ``arg[0]``:

        * MultiIndex: a slice is applied with ``[]``; anything else goes
          through the index's ``_get_row_major``.  This branch always returns
          from inside the ``if``.
        * Otherwise: a slice or scalar becomes a ``_slice`` call (negative
          scalars are wrapped by the frame length); any other key is
          converted to a column and used as a boolean mask or a gather map.
        """
        from cudf import MultiIndex
        from cudf.core.column import column
        from cudf.core.index import as_index

        # Iloc Step 1:
        # Gather the columns specified by the second tuple arg
        columns_df = self._get_column_selection(arg[1])
        columns_df._index = self._df._index

        # Iloc Step 2:
        # Gather the rows specified by the first tuple arg
        if isinstance(columns_df.index, MultiIndex):
            if isinstance(arg[0], slice):
                df = columns_df[arg[0]]
            else:
                df = columns_df.index._get_row_major(columns_df, arg[0])
            if (len(df) == 1 and len(columns_df) >= 1) and not (
                isinstance(arg[0], slice) or isinstance(arg[1], slice)
            ):
                # Pandas returns a numpy scalar in this case
                return df.iloc[0]
            if self._can_downcast_to_series(df, arg):
                return self._downcast_to_series(df, arg)
            return df
        else:
            if isinstance(arg[0], slice):
                df = columns_df._slice(arg[0])
            elif is_scalar(arg[0]):
                index = arg[0]
                # Wrap a negative position around the end of the frame.
                if index < 0:
                    index += len(columns_df)
                df = columns_df._slice(slice(index, index + 1, 1))
            else:
                # NOTE: `arg` is rebound here, so the later steps see the
                # row key as a column, not the caller's original object.
                arg = (column.as_column(arg[0]), arg[1])
                if pd.api.types.is_bool_dtype(arg[0]):
                    df = columns_df._apply_boolean_mask(arg[0])
                else:
                    df = columns_df._gather(arg[0])

        # Iloc Step 3:
        # Reindex
        if df.shape[0] == 1:  # we have a single row without an index
            df.index = as_index(self._df.index[arg[0]])

        # Iloc Step 4:
        # Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)

        # A slice that produced a frame with no rows and no columns gets a
        # RangeIndex built from the slice's resolved start/stop.
        # NOTE(review): `step` is unpacked but not passed to RangeIndex —
        # confirm this is intended.
        if df.shape[0] == 0 and df.shape[1] == 0 and isinstance(arg[0], slice):
            from cudf.core.index import RangeIndex

            slice_len = len(self._df)
            start, stop, step = arg[0].indices(slice_len)
            df._index = RangeIndex(start, stop)
        return df
Пример #16
0
def make_aggregate_nullmask(df, columns=None, op="and"):
    """Combine the null masks of the nullable columns of *df* with *op*.

    Columns are taken from *columns*, or from ``df.columns`` when that is
    empty or ``None``.  Non-nullable columns are skipped.  The first
    nullable column's mask is copied as the starting value and every
    further mask is folded in with a libcudf binary operation.  Returns
    ``None`` when no selected column is nullable.
    """
    combined = None
    for name in columns or df.columns:
        if df[name].nullable:
            current = df[name].nullmask
            if combined is None:
                combined = column.as_column(
                    current.copy(), dtype=utils.mask_dtype
                )
            else:
                combined = libcudfxx.binaryop.binaryop(
                    column.as_column(current), combined, op, combined.dtype
                )

    return combined
Пример #17
0
def test_gather_single_col():
    """Gathering from ``arange(100)`` must return the gather map itself."""
    indices = np.array([0, 1, 2, 3, 5, 8, 13, 21], dtype=np.int32)
    source = column.as_column(np.arange(100), dtype=np.int32)

    gathered = libcudf.copying.gather(source, rmm.to_device(indices))

    # The source holds 0..99, so each gathered value equals its index.
    np.testing.assert_array_equal(gathered.to_array(), indices)
Пример #18
0
    def searchsorted(self, value, side="left"):
        """Find insertion positions for *value* via libcudf ``search_sorted``.

        Raises
        ------
        ValueError
            If this column's categories are not ordered.
        TypeError
            If *value*, converted to a column, has a different dtype than
            this column.
        """
        if not self.ordered:
            raise ValueError("Requires ordered categories")

        needles = column.as_column(value)
        if needles.dtype == self.dtype:
            return libcudf.search.search_sorted(self, needles, side)
        raise TypeError("Categoricals can only compare with the same type")
Пример #19
0
 def __init__(self, values, **kwargs):
     """Build the index from *values*.

     A ``StringColumn`` is copied, a ``StringIndex`` contributes a copy of
     its values, and anything else is moved to the device through
     nvstrings and converted to a column.
     """
     kwargs = _setdefault_name(values, kwargs)
     if isinstance(values, (StringColumn, StringIndex)):
         # A StringColumn is copied directly; a StringIndex via its values.
         if isinstance(values, StringColumn):
             source = values
         else:
             source = values._values
         string_col = source.copy()
     else:
         string_col = column.as_column(nvstrings.to_device(values))
     super(StringIndex, self).__init__(string_col, **kwargs)
Пример #20
0
 def find_first_value(self,
                      value: ScalarLike,
                      closest: bool = False) -> int:
     """
     Return the offset of the first element equal to *value*.

     *value* is parsed with ``pd.to_datetime``, converted to a column of
     this column's dtype and reduced to its numerical representation; the
     lookup itself is delegated to the numerical view of this column.
     *closest* is forwarded unchanged to that lookup.
     """
     parsed = pd.to_datetime(value)
     typed_col = column.as_column(parsed, dtype=self.dtype)
     numeric_value = typed_col.as_numerical[0]
     return self.as_numerical.find_first_value(numeric_value,
                                               closest=closest)
Пример #21
0
def _append_new_row_inplace(col: ColumnLike, value: ScalarLike):
    """Append the scalar `value` to the end of `col`, modifying `col`.

    Both `col` and `value` are first cast to the common type of
    ``type(value)`` and ``col.dtype``; the concatenated result then
    replaces the contents of `col` in place.
    """
    common_type = find_common_type([type(value), col.dtype])
    new_row = as_column(value, dtype=common_type)
    widened = col.astype(common_type)
    extended = concat_columns([widened, new_row])

    col._mimic_inplace(extended, inplace=True)
Пример #22
0
def make_aggregate_nullmask(df, columns=None, op="and"):
    """Combine the null masks of the nullable columns of *df* with *op*.

    Columns are taken from *columns*, or from ``df._data`` when that is
    empty or ``None``.  Non-nullable columns are skipped.  The first
    nullable column's mask is copied as the starting value and every
    further mask is folded in with a libcudf binary operation.  Returns
    ``None`` when no selected column is nullable.
    """
    combined = None
    for key in columns or df._data:
        if not cudf.core.dataframe.extract_col(df, key).nullable:
            continue
        # NOTE(review): nullability is checked on the extracted column but
        # the mask is read through ``df[key]``. Confirm both always refer
        # to the same column.
        current = df[key].nullmask

        if combined is None:
            combined = column.as_column(current.copy(),
                                        dtype=utils.mask_dtype)
        else:
            combined = libcudf.binaryop.binaryop(column.as_column(current),
                                                 combined, op,
                                                 combined.dtype)

    return combined
Пример #23
0
    def fillna(self, fill_value):
        """Return a new column with nulls replaced by *fill_value*.

        A scalar *fill_value* is converted to ``np.datetime64`` in this
        column's time unit; anything else is converted to a column
        (``nan_as_null=False``).  The result is rebuilt without a mask.
        """
        fill_value = (
            np.datetime64(fill_value, self.time_unit)
            if is_scalar(fill_value)
            else column.as_column(fill_value, nan_as_null=False)
        )

        filled = libcudf.replace.replace_nulls(self, fill_value)
        return column.build_column(filled.data, filled.dtype, mask=None)
Пример #24
0
 def unique(self, method="sort"):
     """Return the unique values of this column.

     *method* names the algorithm to use; only ``"sort"`` is implemented
     and any other value raises ``NotImplementedError``.
     """
     if method != "sort":
         raise NotImplementedError(
             "non sort based unique() not implemented yet")
     segment_starts, sorted_values = self._unique_segments()
     # Gather the values at the positions given by the segments.
     return column.as_column(sorted_values)[segment_starts]
Пример #25
0
def test_as_column_scalar_with_nan(nan_as_null):
    """Broadcasting a NaN scalar must match a Series built from NaNs."""
    length = 10
    nan_scalar = np.nan

    reference = cudf.Series(
        [np.nan] * length, nan_as_null=nan_as_null
    ).to_array()

    broadcast_col = as_column(
        nan_scalar, length=length, nan_as_null=nan_as_null
    )
    actual = cudf.Series(broadcast_col).to_array()

    np.testing.assert_equal(reference, actual)
Пример #26
0
 def unique(self):
     """Return a categorical column holding the unique values.

     Uniqueness is computed on the numerical view of this column; the
     resulting codes are wrapped back with the same categories and
     ordering.
     """
     unique_codes = self.as_numerical.unique()
     codes_col = column.as_column(
         unique_codes.base_data, dtype=unique_codes.dtype
     )
     return column.build_categorical_column(
         categories=self.categories,
         codes=codes_col,
         mask=unique_codes.base_mask,
         offset=unique_codes.offset,
         size=unique_codes.size,
         ordered=self.ordered,
     )
Пример #27
0
 def sort_by_values(self, ascending=True, na_position="last"):
     """Sort this column, returning ``(sorted_column, indices)``.

     Sorting is delegated to the numerical view of this column; the sorted
     codes are wrapped back into a categorical column with this column's
     categories and ordering, and the indices are returned as received.
     """
     sorted_codes, inds = self.as_numerical.sort_by_values(
         ascending, na_position
     )
     codes_col = column.as_column(
         sorted_codes.base_data, dtype=sorted_codes.dtype
     )
     sorted_col = column.build_categorical_column(
         categories=self.dtype.categories,
         codes=codes_col,
         mask=sorted_codes.base_mask,
         size=sorted_codes.size,
         ordered=self.dtype.ordered,
     )
     return sorted_col, inds
Пример #28
0
    def extend(self, array):
        """Append the contents of *array* to this buffer.

        Capacity for ``array.size`` more elements is ensured first; the
        input is then converted to a column, cast to this buffer's dtype
        and copied into the region right after the current contents.
        """
        from cudf.core.column import column

        needed = array.size
        self._sentry_capacity(needed)

        incoming = column.as_column(array).astype(self.dtype)
        source_mem = incoming.data.mem

        destination = self.mem[self.size : self.size + needed]
        destination.copy_to_device(source_mem)
        self.size += needed
Пример #29
0
    def fillna(self, fill_value, inplace=False):
        """Replace nulls with *fill_value*.

        A scalar *fill_value* is converted to ``np.datetime64`` in this
        column's time unit; anything else is converted to a column
        (``nan_as_null=False``).  The mask is dropped from the filled
        result, which is then passed with *inplace* to ``_mimic_inplace``.
        """
        fill_value = (
            np.datetime64(fill_value, self.time_unit)
            if is_scalar(fill_value)
            else column.as_column(fill_value, nan_as_null=False)
        )

        filled = libcudf.replace.replace_nulls(self, fill_value)
        unmasked = filled.replace(mask=None)
        return self._mimic_inplace(unmasked, inplace)
Пример #30
0
    def _set_categories(self, new_categories, **kwargs):
        """Re-code the parent column against *new_categories*.

        Recognised keyword arguments:

        * ``is_unique``: when truthy, skip de-duplicating *new_categories*.
        * ``ordered``: ordering of the resulting dtype (defaults to the
          current ordering).
        * ``inplace``: when truthy, update the parent column and return
          ``None``; otherwise return a new column.

        Notes
        -----
        Assumes ``new_categories`` is the same dtype as the current categories
        """

        from cudf import DataFrame, Series

        old_cats = self._parent.categories
        target_cats = column.as_column(new_categories)

        # The mapping below requires unique target categories.
        assume_unique = kwargs.get("is_unique", False)
        if not assume_unique and not target_cats.is_unique:
            # drop_duplicates() instead of unique() to preserve order
            target_cats = Series(target_cats).drop_duplicates()._column

        codes_now = self.codes
        row_order = cudautils.arange(len(codes_now))
        old_code_values = cudautils.arange(
            len(old_cats), dtype=codes_now.dtype)
        new_code_values = cudautils.arange(
            len(target_cats), dtype=codes_now.dtype)

        new_lookup = DataFrame(
            {"new_codes": new_code_values, "cats": target_cats})
        old_lookup = DataFrame(
            {"old_codes": old_code_values, "cats": old_cats})
        current = DataFrame({"old_codes": codes_now, "order": row_order})

        # Match old categories to new ones (left join on the old side),
        # giving an old-code -> new-code table.
        code_map = old_lookup.merge(new_lookup, on="cats", how="left")
        # Apply that table to the current codes, then restore row order.
        recoded = current.merge(code_map, on="old_codes", how="left")
        recoded = recoded.sort_values(by="order").reset_index(True)

        ordered = kwargs.get("ordered", self.ordered)
        new_codes = recoded["new_codes"]._column
        new_dtype = CategoricalDtype(categories=target_cats, ordered=ordered)

        if not kwargs.get("inplace", False):
            return column.build_column(
                data=None,
                dtype=new_dtype,
                mask=new_codes.mask,
                children=(new_codes, ),
            )

        parent = self._parent
        parent.data = None
        parent.mask = new_codes.mask
        parent.dtype = new_dtype
        parent.children = (new_codes, )
        return None