Example #1
def digitize(
    column: ColumnBase, bins: np.ndarray, right: bool = False
) -> ColumnBase:
    """Return the indices of the bins to which each value in column belongs.

    Parameters
    ----------
    column : Column
        Input column.
    bins : Column-like
        1-D column-like object of bins with same type as `column`, should be
        monotonically increasing.
    right : bool
        Indicates whether interval contains the right or left bin edge.

    Returns
    -------
    A column containing the indices
    """
    if column.dtype != bins.dtype:
        raise ValueError(
            "digitize() expects bins and input column to have the same dtype."
        )

    bin_col = as_column(bins, dtype=bins.dtype)
    if bin_col.nullable:
        raise ValueError("`bins` cannot contain null entries.")

    return as_column(
        libcudf.sort.digitize(column.as_frame(), bin_col.as_frame(), right)
    )
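A minimal usage sketch (hedged: `s._column` is a cudf implementation detail, and `digitize` is assumed to be in scope as defined above); the semantics mirror `np.digitize`:

import numpy as np
import cudf

s = cudf.Series([0.2, 6.4, 3.0, 1.6], dtype="float64")
bins = np.array([0.0, 1.0, 2.5, 4.0, 10.0], dtype="float64")

# Map each value to the index of the bin it falls into,
# as np.digitize would.
indices = cudf.Series(digitize(s._column, bins, right=False))
print(indices)  # expected bin indices: 1, 4, 3, 2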
Example #2
def build_categorical_column(
    categories, codes, mask=None, size=None, offset=0, ordered=None
):
    """
    Build a CategoricalColumn

    Parameters
    ----------
    categories : Column
        Column of categories
    codes : Column
        Column of codes, the size of the resulting Column will be
        the size of `codes`
    mask : Buffer
        Null mask
    size : int, optional
    offset : int, optional
    ordered : bool
        Indicates whether the categories are ordered
    """
    dtype = CategoricalDtype(categories=as_column(categories), ordered=ordered)
    return build_column(
        data=None,
        dtype=dtype,
        mask=mask,
        size=size,
        offset=offset,
        children=(as_column(codes),),
    )
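A small sketch of constructing a categorical column by hand (hedged: assumes `as_column` and `build_categorical_column` are importable from `cudf.core.column`, as in the cudf versions these snippets come from):

import cudf
from cudf.core.column import as_column, build_categorical_column

categories = as_column(["a", "b", "c"])
codes = as_column([0, 2, 1, 0], dtype="int8")  # indices into `categories`

cat_col = build_categorical_column(
    categories=categories, codes=codes, ordered=False
)
print(cudf.Series(cat_col))  # a, c, b, a with a category dtype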
Example #3
def haversine_distance(p1_lon, p1_lat, p2_lon, p2_lat):
    """ Compute the haversine distances between an arbitrary list of lon/lat
    pairs

    Parameters
    ----------
    p1_lon
        longitude of first set of coords
    p1_lat
        latitude of first set of coords
    p2_lon
        longitude of second set of coords
    p2_lat
        latitude of second set of coords

    Returns
    -------
    result : cudf.Series
        The distance between all pairs of lon/lat coordinates
    """

    p1_lon, p1_lat, p2_lon, p2_lat = normalize_point_columns(
        as_column(p1_lon),
        as_column(p1_lat),
        as_column(p2_lon),
        as_column(p2_lat),
    )
    return cpp_haversine_distance(p1_lon, p1_lat, p2_lon, p2_lat)
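For reference, a hedged sketch of the same computation through the public cuspatial API (assuming a working cuspatial installation):

import cudf
import cuspatial

# Great-circle distance between Paris (2.35E, 48.85N) and
# New York (74.0W, 40.7N).
dist = cuspatial.haversine_distance(
    cudf.Series([2.35]),   # p1_lon
    cudf.Series([48.85]),  # p1_lat
    cudf.Series([-74.0]),  # p2_lon
    cudf.Series([40.7]),   # p2_lat
)
print(dist)  # roughly 5,800 km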
Example #4
    def from_sequences(
            cls,
            arbitrary: Sequence[ColumnLike]) -> "cudf.core.column.ListColumn":
        """
        Create a list column from a list of column-like sequences.
        """
        data_col = column.column_empty(0)
        mask_col = []
        offset_col = [0]
        offset = 0

        # Build Data, Mask & Offsets
        for data in arbitrary:
            if cudf._lib.scalar._is_null_host_scalar(data):
                mask_col.append(False)
                offset_col.append(offset)
            else:
                mask_col.append(True)
                data_col = data_col.append(as_column(data))
                offset += len(data)
                offset_col.append(offset)

        offset_col = column.as_column(offset_col, dtype="int32")

        # Build ListColumn
        res = cls(
            size=len(arbitrary),
            dtype=cudf.ListDtype(data_col.dtype),
            mask=cudf._lib.transform.bools_to_mask(as_column(mask_col)),
            offset=0,
            null_count=0,
            children=(offset_col, data_col),
        )
        return res
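To make the offsets/mask bookkeeping concrete, a hedged sketch of the layout this produces for `[[1, 2], None, [3]]` (the `ListColumn` import path is an assumption):

import cudf
from cudf.core.column.lists import ListColumn

lc = ListColumn.from_sequences([[1, 2], None, [3]])
# Internally, per the loop above:
#   mask    -> [True, False, True]   (row 1 is null)
#   offsets -> [0, 2, 2, 3]          (null rows repeat the running offset)
#   data    -> [1, 2, 3]             (non-null elements, flattened)
print(cudf.Series(lc))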
Example #5
def polygon_bounding_boxes(poly_offsets, ring_offsets, xs, ys):
    """Compute the minimum bounding-boxes for a set of polygons.

    Parameters
    ----------
    poly_offsets
        Begin indices of the first ring in each polygon (i.e. prefix-sum)
    ring_offsets
        Begin indices of the first point in each ring (i.e. prefix-sum)
    xs
        Polygon point x-coordinates
    ys
        Polygon point y-coordinates

    Returns
    -------
    result : cudf.DataFrame
        minimum bounding boxes for each polygon

        x_min : cudf.Series
            the minimum x-coordinate of each bounding box
        y_min : cudf.Series
            the minimum y-coordinate of each bounding box
        x_max : cudf.Series
            the maximum x-coordinate of each bounding box
        y_max : cudf.Series
            the maximum y-coordinate of each bounding box
    """
    poly_offsets = as_column(poly_offsets, dtype="int32")
    ring_offsets = as_column(ring_offsets, dtype="int32")
    xs, ys = normalize_point_columns(as_column(xs), as_column(ys))
    return DataFrame._from_table(
        cpp_polygon_bounding_boxes(poly_offsets, ring_offsets, xs, ys)
    )
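A hedged usage sketch via the public cuspatial API: a single triangle described by one ring.

import cuspatial

bboxes = cuspatial.polygon_bounding_boxes(
    [0],              # poly_offsets: polygon 0 starts at ring 0
    [0],              # ring_offsets: ring 0 starts at point 0
    [0.0, 2.0, 1.0],  # xs
    [0.0, 0.0, 2.0],  # ys
)
print(bboxes)  # x_min=0.0, y_min=0.0, x_max=2.0, y_max=2.0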
Example #6
def polyline_bounding_boxes(poly_offsets, xs, ys, expansion_radius):
    """Compute the minimum bounding-boxes for a set of polylines.

    Parameters
    ----------
    poly_offsets
        Begin indices of the first point in each polyline (i.e. prefix-sum)
    xs
        Polyline point x-coordinates
    ys
        Polyline point y-coordinates
    expansion_radius
        radius of each polyline point

    Returns
    -------
    result : cudf.DataFrame
        minimum bounding boxes for each polyline

        x_min : cudf.Series
            the minimum x-coordinate of each bounding box
        y_min : cudf.Series
            the minimum y-coordinate of each bounding box
        x_max : cudf.Series
            the maximum x-coordinate of each bounding box
        y_max : cudf.Series
            the maximum y-coordinate of each bounding box
    """
    poly_offsets = as_column(poly_offsets, dtype="int32")
    xs, ys = normalize_point_columns(as_column(xs), as_column(ys))
    return DataFrame._from_data(
        *cpp_polyline_bounding_boxes(poly_offsets, xs, ys, expansion_radius)
    )
Example #7
    def read_partition(fs,
                       piece,
                       columns,
                       index,
                       categories=(),
                       partitions=(),
                       **kwargs):
        if columns is not None:
            columns = list(columns)  # copy so the caller's list isn't mutated
        if isinstance(index, list):
            columns += index

        if isinstance(piece, str):
            path = piece
            row_group = None
            partition_keys = []
        else:
            (path, row_group, partition_keys) = piece

        strings_to_cats = kwargs.get("strings_to_categorical", False)
        if cudf.utils.ioutils._is_local_filesystem(fs):
            df = cudf.read_parquet(
                path,
                engine="cudf",
                columns=columns,
                row_groups=row_group,
                strings_to_categorical=strings_to_cats,
                **kwargs.get("read", {}),
            )
        else:
            with fs.open(path, mode="rb") as f:
                df = cudf.read_parquet(
                    f,
                    engine="cudf",
                    columns=columns,
                    row_groups=row_group,
                    strings_to_categorical=strings_to_cats,
                    **kwargs.get("read", {}),
                )

        if index and (index[0] in df.columns):
            df = df.set_index(index[0])
        if partition_keys:
            if partitions is None:
                raise ValueError("Must pass partition sets")
            for i, (name, index2) in enumerate(partition_keys):
                categories = [
                    val.as_py() for val in partitions.levels[i].dictionary
                ]

                col = as_column(index2).as_frame().repeat(len(df))._data[None]
                df[name] = build_categorical_column(
                    categories=categories,
                    codes=as_column(col.base_data, dtype=col.dtype),
                    size=col.size,
                    offset=col.offset,
                    ordered=False,
                )

        return df
Example #8
def create_multihot_col(offsets, elements):
    """
    offsets = cudf Series with offset values for the list data
    elements = cudf Series with the list data flattened to 1-d
    """
    if isinstance(elements, pd.Series):
        col = pd.Series()
        lh, rh = pd.Series(offsets[1:]).reset_index(drop=True), pd.Series(
            offsets[:-1]).reset_index(drop=True)
        vals_per_entry = lh - rh
        vals_used = 0
        entries = []
        for vals_count in vals_per_entry:
            vals_count = int(vals_count)
            entry = elements[vals_used:vals_used + vals_count]
            if len(entry) == 1:
                entry = entry[0]
            vals_used += vals_count
            entries.append(entry.values)
        col = col.append(pd.Series(entries))
    else:
        offsets = as_column(offsets, dtype="int32")
        elements = as_column(elements)
        col = _build_cudf_list_column(elements, offsets)
        col = cudf.Series(col)
    return col
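A hedged sketch of the cudf path: offsets `[0, 2, 3]` plus the flattened elements `[1, 2, 3]` describe the two lists `[1, 2]` and `[3]`.

import cudf

offsets = cudf.Series([0, 2, 3])   # list boundaries, len == n_rows + 1
elements = cudf.Series([1, 2, 3])  # all list values, flattened to 1-d
multihot = create_multihot_col(offsets, elements)
print(multihot)  # a list-dtype Series: [[1, 2], [3]]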
Example #9
    def from_numpy(cls, array):
        cast_dtype = array.dtype.type == np.int64
        if array.dtype.kind == "M":
            time_unit, _ = np.datetime_data(array.dtype)
            cast_dtype = time_unit in ("D", "W", "M", "Y") or (
                len(array) > 0 and (isinstance(array[0], str)
                                    or isinstance(array[0], dt.datetime)))
        elif not cast_dtype:
            raise ValueError(
                "Cannot infer datetime dtype from np.array dtype `%s`"
                % array.dtype
            )

        if cast_dtype:
            array = array.astype(np.dtype("datetime64[s]"))
        assert array.dtype.itemsize == 8

        mask = None
        if np.any(np.isnat(array)):
            null = cudf.core.column.column_empty_like(array,
                                                      masked=True,
                                                      newsize=1)
            col = libcudf.replace.replace(
                as_column(Buffer(array), dtype=array.dtype),
                as_column(
                    Buffer(np.array([np.datetime64("NaT")],
                                    dtype=array.dtype)),
                    dtype=array.dtype,
                ),
                null,
            )
            mask = col.mask

        return cls(data=Buffer(array), mask=mask, dtype=array.dtype)
Example #10
def _build_cudf_list_column(new_elements, new_offsets):
    if not HAS_GPU:
        return []
    return build_column(
        None,
        dtype=cudf.core.dtypes.ListDtype(new_elements.dtype),
        size=new_offsets.size - 1,
        children=(as_column(new_offsets), as_column(new_elements)),
    )
Example #11
def trajectory_distances_and_speeds(
    num_trajectories, object_ids, xs, ys, timestamps
):
    """
    Compute the distance traveled and speed of sets of trajectories

    Parameters
    ----------
    num_trajectories
        number of trajectories (unique object ids)
    object_ids
        column of object (e.g., vehicle) ids
    xs
        column of x-coordinates (in kilometers)
    ys
        column of y-coordinates (in kilometers)
    timestamps
        column of timestamps in any resolution

    Returns
    -------
    result : cudf.DataFrame
        distance : cudf.Series
            trajectory distance (in meters)
        speed : cudf.Series
            trajectory speed (in meters/second)

    Examples
    --------
    Compute the distances and speeds of derived trajectories

    >>> objects, traj_offsets = cuspatial.derive_trajectories(...)
    >>> dists_and_speeds = cuspatial.trajectory_distances_and_speeds(
            len(traj_offsets),
            objects['object_id'],
            objects['x'],
            objects['y'],
            objects['timestamp']
        )
    >>> print(dists_and_speeds)
                       distance          speed
        trajectory_id
        0                1000.0  100000.000000
        1                1000.0  111111.109375
    """

    object_ids = as_column(object_ids, dtype=np.int32)
    xs, ys = normalize_point_columns(as_column(xs), as_column(ys))
    timestamps = normalize_timestamp_column(as_column(timestamps))
    df = DataFrame._from_table(
        cpp_trajectory_distances_and_speeds(
            num_trajectories, object_ids, xs, ys, timestamps
        )
    )
    df.index.name = "trajectory_id"
    return df
Example #12
def _proc_inf_strings(col):
    """Convert "inf/infinity" strings into "Inf", the native string
    representing infinity in libcudf
    """
    col = libstrings.replace_multi(
        col,
        as_column(["+", "inf", "inity"]),
        as_column(["", "Inf", ""]),
    )
    return col
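A hedged illustration of the three simultaneous replacements ("+" is dropped, "inf" becomes "Inf", and the "inity" suffix is dropped), assuming `_proc_inf_strings` and `as_column` are in scope:

import cudf
from cudf.core.column import as_column

col = as_column(["inf", "+infinity", "-inf", "1.5"])
print(cudf.Series(_proc_inf_strings(col)))
# expected: ["Inf", "Inf", "-Inf", "1.5"]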
Example #13
def trajectory_bounding_boxes(num_trajectories, object_ids, xs, ys):
    """ Compute the bounding boxes of sets of trajectories.

    Parameters
    ----------
    num_trajectories
        number of trajectories (unique object ids)
    object_ids
        column of object (e.g., vehicle) ids
    xs
        column of x-coordinates (in kilometers)
    ys
        column of y-coordinates (in kilometers)

    Returns
    -------
    result : cudf.DataFrame
        minimum bounding boxes (in kilometers) for each trajectory

        x_min : cudf.Series
            the minimum x-coordinate of each bounding box
        y_min : cudf.Series
            the minimum y-coordinate of each bounding box
        x_max : cudf.Series
            the maximum x-coordinate of each bounding box
        y_max : cudf.Series
            the maximum y-coordinate of each bounding box

    Examples
    --------
    Compute the minimum bounding boxes of derived trajectories

    >>> objects, traj_offsets = trajectory.derive_trajectories(
            [0, 0, 1, 1],  # object_id
            [0, 2, 1, 3],  # x
            [0, 2, 1, 3],  # y
            [0, 10, 0, 10] # timestamp
        )
    >>> traj_bounding_boxes = cuspatial.trajectory_bounding_boxes(
            len(traj_offsets),
            objects['object_id'],
            objects['x'],
            objects['y']
        )
    >>> print(traj_bounding_boxes)
        x_min   y_min   x_max   y_max
    0     0.0     0.0     2.0     2.0
    1     1.0     1.0     3.0     3.0
    """

    object_ids = as_column(object_ids, dtype=np.int32)
    xs, ys = normalize_point_columns(as_column(xs), as_column(ys))
    return DataFrame._from_table(
        cpp_trajectory_bounding_boxes(num_trajectories, object_ids, xs, ys)
    )
Example #14
def _proc_inf_strings(col):
    """Convert "inf/infinity" strings into "Inf", the native string
    representing infinity in libcudf
    """
    # TODO: This can be handled by libcudf in the future;
    # see StringColumn.as_numerical_column
    col = libstrings.replace_multi(
        col,
        as_column(["+", "inf", "inity"]),
        as_column(["", "Inf", ""]),
    )
    return col
Example #15
def derive_trajectories(object_ids, xs, ys, timestamps):
    """
    Derive trajectories from object ids, points, and timestamps.

    Parameters
    ----------
    object_ids
        column of object (e.g., vehicle) ids
    xs
        column of x-coordinates (in kilometers)
    ys
        column of y-coordinates (in kilometers)
    timestamps
        column of timestamps in any resolution

    Returns
    -------
    result : tuple (objects, traj_offsets)
        objects : cudf.DataFrame
            object_ids, xs, ys, and timestamps sorted by
            ``(object_id, timestamp)``, used by ``trajectory_bounding_boxes``
            and ``trajectory_distances_and_speeds``
        traj_offsets : cudf.Series
            offsets of discovered trajectories

    Examples
    --------
    Compute sorted objects and discovered trajectories

    >>> objects, traj_offsets = cuspatial.derive_trajectories(
            [0, 0, 1, 1],  # object_id
            [1, 0, 3, 2],  # x
            [0, 0, 1, 1],  # y
            [0, 10, 0, 10] # timestamp
        )
    >>> print(traj_offsets)
        0  0
        1  2
    >>> print(objects)
           object_id       x       y  timestamp
        0          0       1       0          0
        1          0       0       0         10
        2          1       3       1          0
        3          1       2       1         10
    """

    object_ids = as_column(object_ids, dtype=np.int32)
    xs, ys = normalize_point_columns(as_column(xs), as_column(ys))
    timestamps = normalize_timestamp_column(as_column(timestamps))
    objects, traj_offsets = cpp_derive_trajectories(
        object_ids, xs, ys, timestamps
    )
    return DataFrame._from_table(objects), Series(data=traj_offsets)
Example #16
    def execute(self,
                requests: List[InferenceRequest]) -> List[InferenceResponse]:
        """Transforms the input batches by running through a NVTabular workflow.transform
        function.
        """
        responses = []
        for request in requests:
            # create a cudf DataFrame from the triton request
            input_df = cudf.DataFrame({
                name: _convert_tensor(get_input_tensor_by_name(request, name))
                for name in self.input_dtypes
            })

            for name, dtype in self.input_multihots.items():
                values = as_column(
                    _convert_tensor(
                        get_input_tensor_by_name(request, name + "__values")))
                nnzs = as_column(
                    _convert_tensor(
                        get_input_tensor_by_name(request, name + "__nnzs")))
                input_df[name] = build_column(None,
                                              dtype=dtype,
                                              size=nnzs.size - 1,
                                              children=(nnzs, values))

            # use our NVTabular workflow to transform the dataframe
            output_df = nvtabular.workflow._transform_partition(
                input_df, [self.workflow.column_group])

            # convert back to a triton response
            output_tensors = []
            for name in output_df.columns:
                col = output_df[name]
                if is_list_dtype(col.dtype):
                    # convert list values to match TF dataloader
                    values = col.list.leaves.values_host.astype(
                        self.output_dtypes[name + "__values"])
                    values = values.reshape(len(values), 1)
                    output_tensors.append(Tensor(name + "__values", values))

                    offsets = col._column.offsets.values_host.astype(
                        self.output_dtypes[name + "__nnzs"])
                    nnzs = offsets[1:] - offsets[:-1]
                    nnzs = nnzs.reshape(len(nnzs), 1)
                    output_tensors.append(Tensor(name + "__nnzs", nnzs))
                else:
                    d = col.values_host.astype(self.output_dtypes[name])
                    d = d.reshape(len(d), 1)
                    output_tensors.append(Tensor(name, d))

            responses.append(InferenceResponse(output_tensors))

        return responses
Example #17
    def __getitem__(self, arg):
        from cudf.core.column import column

        if isinstance(arg, Number):
            arg = int(arg)
            return self.element_indexing(arg)
        elif isinstance(arg, slice):

            if is_categorical_dtype(self):
                codes = self.codes[arg]
                return build_categorical_column(
                    categories=self.categories,
                    codes=as_column(codes.base_data, dtype=codes.dtype),
                    mask=codes.base_mask,
                    ordered=self.ordered,
                    size=codes.size,
                    offset=codes.offset,
                )

            start, stop, stride = arg.indices(len(self))

            if start < 0:
                start = start + len(self)
            if stop < 0:
                stop = stop + len(self)

            if start >= stop:
                return column_empty(0, self.dtype, masked=True)
            # compute mask slice
            if stride == 1 or stride is None:

                return libcudfxx.copying.column_slice(self, [start, stop])[0]
            else:
                # Need to create a gather map for given slice with stride
                gather_map = as_column(
                    cupy.arange(
                        start=start,
                        stop=stop,
                        step=stride,
                        dtype=np.dtype(np.int32),
                    ))
                return self.as_frame()._gather(gather_map)._as_column()

        else:
            arg = column.as_column(arg)
            if len(arg) == 0:
                arg = column.as_column([], dtype="int32")
            if pd.api.types.is_integer_dtype(arg.dtype):
                return self.take(arg)
            if pd.api.types.is_bool_dtype(arg.dtype):
                return self.apply_boolean_mask(arg)
            raise NotImplementedError(type(arg))
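A hedged illustration of the strided-slice branch above: any step other than 1 is served by a `cupy.arange` gather map rather than a contiguous `column_slice` (`s._column` is an internal attribute):

import cudf

s = cudf.Series([10, 20, 30, 40, 50])
col = s._column                 # internal ColumnBase
print(cudf.Series(col[1:5:2]))  # gathers rows 1 and 3 -> [20, 40]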
Example #18
    def create_multihot_col(self, offsets, data):
        """
        offsets = cudf Series with offset values for the list data
        data = cudf Series with the list data flattened to 1-d
        """
        offs = as_column(offsets, dtype="int32")
        encoded = as_column(data)
        col = build_column(
            None,
            size=offs.size - 1,
            dtype=cudf.core.dtypes.ListDtype(encoded.dtype),
            children=(offs, encoded),
        )
        return cudf.Series(col)
Example #19
    def read_metadata(*args, **kwargs):
        meta, stats, parts, index = ArrowEngine.read_metadata(*args, **kwargs)

        # If `strings_to_categorical==True`, convert objects to int32
        strings_to_cats = kwargs.get("strings_to_categorical", False)

        new_meta = cudf.DataFrame(index=meta.index)
        for col in meta.columns:
            if meta[col].dtype == "O":
                new_meta[col] = as_column(
                    meta[col], dtype="int32" if strings_to_cats else "object")
            else:
                new_meta[col] = as_column(meta[col])

        return (new_meta, stats, parts, index)
Example #20
    def find_and_replace(
        self,
        to_replace: ColumnLike,
        replacement: ColumnLike,
        all_nan: bool = False,
    ) -> NumericalColumn:
        """
        Return col with *to_replace* replaced with *value*.
        """
        to_replace_col = as_column(to_replace)
        replacement_col = as_column(replacement)

        if type(to_replace_col) != type(replacement_col):
            raise TypeError(
                f"to_replace and value should be of same types,"
                f"got to_replace dtype: {to_replace_col.dtype} and "
                f"value dtype: {replacement_col.dtype}"
            )

        if not isinstance(to_replace_col, NumericalColumn) and not isinstance(
            replacement_col, NumericalColumn
        ):
            return self.copy()

        to_replace_col = _normalize_find_and_replace_input(
            self.dtype, to_replace
        )
        if all_nan:
            replacement_col = column.as_column(replacement, dtype=self.dtype)
        else:
            replacement_col = _normalize_find_and_replace_input(
                self.dtype, replacement
            )
        replaced = self.copy()
        if len(replacement_col) == 1 and len(to_replace_col) > 1:
            replacement_col = column.as_column(
                utils.scalar_broadcast_to(
                    replacement[0], (len(to_replace_col),), self.dtype
                )
            )
        elif len(replacement_col) == 1 and len(to_replace_col) == 0:
            return replaced
        to_replace_col, replacement_col, replaced = numeric_normalize_types(
            to_replace_col, replacement_col, replaced
        )
        return libcudf.replace.replace(
            replaced, to_replace_col, replacement_col
        )
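A hedged usage sketch against a numeric column (`s._column` is an internal attribute):

import cudf

s = cudf.Series([1, 2, 3, 4])
# Replace 1 -> 10 and 3 -> 30, element-wise.
out = s._column.find_and_replace([1, 3], [10, 30])
print(cudf.Series(out))  # [10, 2, 30, 4]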
Example #21
def scalar_broadcast_to(scalar, size, dtype=None):
    if isinstance(size, (tuple, list)):
        size = size[0]

    if scalar is None or (isinstance(scalar, (np.datetime64, np.timedelta64))
                          and np.isnat(scalar)):
        if dtype is None:
            dtype = "object"
        return column.column_empty(size, dtype=dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        if dtype is None:
            return _categorical_scalar_broadcast_to(scalar, size)
        else:
            return scalar_broadcast_to(scalar.categories[0],
                                       size).astype(dtype)

    scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
    dtype = scalar.dtype

    if np.dtype(dtype).kind in ("O", "U"):
        gather_map = column.full(size, 0, dtype="int32")
        scalar_str_col = column.as_column([scalar], dtype="str")
        return scalar_str_col[gather_map]
    else:
        out_col = column.column_empty(size, dtype=dtype)
        if out_col.size != 0:
            out_col.data_array_view[:] = scalar
        return out_col
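A hedged usage sketch (the import path is an assumption based on where this helper has historically lived in cudf):

from cudf.utils.utils import scalar_broadcast_to

col = scalar_broadcast_to(7, 4, dtype="int64")
print(len(col))  # 4; every element holds the scalar 7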
Example #22
    def _gather(self, gather_map):
        if not pd.api.types.is_integer_dtype(gather_map.dtype):
            gather_map = gather_map.astype("int32")
        result = self.__class__._from_table(
            libcudfxx.copying.gather(self, as_column(gather_map))
        )
        result._copy_categories(self)
        return result
Example #23
def find_last(arr, val, compare="eq"):
    """
    Return the index of the last occurrence of *val* in *arr*, or, when
    *compare* is not "eq", the last index at which ``arr compare val``
    holds. Returns -1 if there is no match.

    Parameters
    ----------
    arr : device array
    val : scalar
    compare : str ('gt', 'lt', or 'eq' (default))
    """
    found = rmm.device_array_like(arr)
    if found.size > 0:
        if compare == "gt":
            gpu_mark_gt.forall(found.size)(arr, val, found, -1)
        elif compare == "lt":
            gpu_mark_lt.forall(found.size)(arr, val, found, -1)
        else:
            if arr.dtype in ("float32", "float64"):
                gpu_mark_found_float.forall(found.size)(arr, val, found, -1)
            else:
                gpu_mark_found_int.forall(found.size)(arr, val, found, -1)
    from cudf.core.column import as_column

    found_col = as_column(found)
    max_index = found_col.max()
    return max_index
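A hedged sketch, assuming the legacy `rmm.to_device` helper and that `find_last` is in scope as defined above:

import numpy as np
import rmm

arr = rmm.to_device(np.array([1, 3, 2, 3, 0], dtype="int32"))
print(find_last(arr, 3))        # 3: last index where arr == 3
print(find_last(arr, 2, "gt"))  # 3: last index where arr > 2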
Example #24
    def set_by_label(self, key: Any, value: Any, validate: bool = True):
        """
        Add (or modify) column by name.

        Parameters
        ----------
        key
            name of the column
        value : column-like
            The value to insert into the column.
        validate : bool
            If True, the provided value will be coerced to a column and
            validated before setting (Default value = True).
        """
        key = self._pad_key(key)
        if validate:
            value = column.as_column(value)
            if len(self._data) > 0:
                if len(value) != self._column_length:
                    raise ValueError("All columns must be of equal length")
            else:
                self._column_length = len(value)

        self._data[key] = value
        self._clear_cache()
Example #25
    def as_string_column(self, dtype, **kwargs):
        if len(self) > 0:
            return string._numeric_to_str_typecast_functions[
                np.dtype(self.dtype)
            ](self, **kwargs)
        else:
            return as_column([], dtype="object")
Example #26
def _index_or_values_interpolation(column, index=None):
    """
    Interpolate over a float column. Assumes a linear interpolation
    strategy, using the index of the data to denote the spacing of the
    x values. For example, the data and index [1.0, NaN, 4.0], [1, 3, 4]
    would result in [1.0, 3.0, 4.0].
    """
    # figure out where the nans are
    mask = cp.isnan(column)

    # trivial cases, all nan or no nans
    num_nan = mask.sum()
    if num_nan == 0 or num_nan == len(column):
        return column

    to_interp = Frame(data={None: column}, index=index)
    known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask))

    known_x = known_x_and_y._index._column.values
    known_y = known_x_and_y._data.columns[0].values

    result = cp.interp(to_interp._index.values, known_x, known_y)

    # leading NaNs cannot be interpolated, so re-mask everything before
    # the first non-NaN value
    first_valid_idx = (mask == 0).argmax().item()
    result[:first_valid_idx] = np.nan
    return result
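A hedged numeric check of the docstring example with cupy directly: with known points (1, 1.0) and (4, 4.0), linear interpolation at x = 3 gives 1.0 + (3 - 1) * (4.0 - 1.0) / (4 - 1) = 3.0.

import cupy as cp

x = cp.array([1.0, 3.0, 4.0])          # full index
known_x = cp.array([1.0, 4.0])         # index positions of non-NaN values
known_y = cp.array([1.0, 4.0])         # the non-NaN values themselves
print(cp.interp(x, known_x, known_y))  # [1. 3. 4.]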
Example #27
    def _can_downcast_to_series(self, df, arg):
        """
        This method encapsulates the logic used to determine whether
        the result of a loc/iloc operation should be "downcast" from a
        DataFrame to a Series.
        """
        from cudf.core.column import as_column

        if isinstance(df, cudf.Series):
            return False
        nrows, ncols = df.shape
        if nrows == 1:
            if type(arg[0]) is slice:
                if not is_scalar(arg[1]):
                    return False
            else:
                # row selection using boolean indexing - never downcasts
                if pd.api.types.is_bool_dtype(as_column(arg[0]).dtype):
                    return False
            dtypes = df.dtypes.values.tolist()
            all_numeric = all(
                [pd.api.types.is_numeric_dtype(t) for t in dtypes])
            if all_numeric:
                return True
        if ncols == 1:
            if type(arg[1]) is slice:
                if not is_scalar(arg[0]):
                    return False
            if isinstance(arg[1], tuple):
                # Multiindex indexing with a slice
                if any(isinstance(v, slice) for v in arg):
                    return False
            return True
        return False
Example #28
def scalar_broadcast_to(scalar, size, dtype=None):
    from cudf.utils.dtypes import to_cudf_compatible_scalar, is_string_dtype
    from cudf.core.column import column_empty

    if isinstance(size, (tuple, list)):
        size = size[0]

    if scalar is None:
        if dtype is None:
            dtype = "object"
        return column_empty(size, dtype=dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        return scalar_broadcast_to(scalar.categories[0], size).astype(dtype)

    if isinstance(scalar, str) and (is_string_dtype(dtype) or dtype is None):
        dtype = "object"
    else:
        scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
        dtype = scalar.dtype

    if np.dtype(dtype) == np.dtype("object"):
        from cudf.core.column import as_column

        gather_map = cupy.zeros(size, dtype="int32")
        scalar_str_col = as_column([scalar], dtype="str")
        return scalar_str_col[gather_map]
    else:
        out_col = column_empty(size, dtype=dtype)
        if out_col.size != 0:
            out_col.data_array_view[:] = scalar
        return out_col
Example #29
    def __init__(
        self,
        data: Union[MutableMapping, ColumnAccessor] = None,
        multiindex: bool = False,
        level_names=None,
    ):
        if data is None:
            data = {}
        # TODO: we should validate the keys of `data`
        if isinstance(data, ColumnAccessor):
            multiindex = multiindex or data.multiindex
            level_names = level_names or data.level_names
            self._data = data._data
            self.multiindex = multiindex
            self._level_names = level_names
        else:
            # This code path is performance-critical for copies and should be
            # modified with care.
            self._data = {}
            if data:
                data = dict(data)
                # Faster than next(iter(data.values()))
                column_length = len(data[next(iter(data))])
                for k, v in data.items():
                    # Much faster to avoid the function call if possible; the
                    # extra isinstance is negligible if we do have to make a
                    # column from something else.
                    if not isinstance(v, column.ColumnBase):
                        v = column.as_column(v)
                    if len(v) != column_length:
                        raise ValueError("All columns must be of equal length")
                    self._data[k] = v

            self.multiindex = multiindex
            self._level_names = level_names
Example #30
def scalar_broadcast_to(scalar, size, dtype=None):
    from cudf.utils.cudautils import fill_value
    from cudf.utils.dtypes import to_cudf_compatible_scalar, is_string_dtype
    from cudf.core.column import column_empty

    if isinstance(size, (tuple, list)):
        size = size[0]

    if scalar is None:
        if dtype is None:
            dtype = "object"
        return column_empty(size, dtype=dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        return scalar_broadcast_to(scalar.categories[0], size).astype(dtype)

    if isinstance(scalar, str) and (is_string_dtype(dtype) or dtype is None):
        dtype = "object"
    else:
        scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
        dtype = scalar.dtype

    if np.dtype(dtype) == np.dtype("object"):
        import nvstrings
        from cudf.core.column import as_column
        from cudf.utils.cudautils import zeros

        gather_map = zeros(size, dtype="int32")
        scalar_str_col = as_column(nvstrings.to_device([scalar]))
        return scalar_str_col[gather_map]
    else:
        da = rmm.device_array((size, ), dtype=dtype)
        if da.size != 0:
            fill_value(da, scalar)
        return da