def digitize(
    column: ColumnBase, bins: np.ndarray, right: bool = False
) -> ColumnBase:
    """Return the indices of the bins to which each value in column belongs.

    Parameters
    ----------
    column : Column
        Input column.
    bins : Column-like
        1-D column-like object of bins with the same type as `column`;
        should be monotonically increasing.
    right : bool
        Indicates whether interval contains the right or left bin edge.

    Returns
    -------
    A column containing the indices
    """
    if column.dtype != bins.dtype:
        raise ValueError(
            "Digitize() expects bins and input column to have the same dtype."
        )

    bin_col = as_column(bins, dtype=bins.dtype)
    if bin_col.nullable:
        raise ValueError("`bins` cannot contain null entries.")

    return as_column(
        libcudf.sort.digitize(column.as_frame(), bin_col.as_frame(), right)
    )
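# Illustration (not part of the original source): the bin-edge semantics of
# `right` mirror np.digitize. A minimal CPU sketch of the same behavior,
# assuming only NumPy is available:
import numpy as np

values = np.array([0.5, 1.0, 2.5, 4.0])
bins = np.array([1.0, 2.0, 3.0])

# right=False: bins[i-1] <= x < bins[i]; right=True: bins[i-1] < x <= bins[i]
print(np.digitize(values, bins, right=False))  # [0 1 2 3]
print(np.digitize(values, bins, right=True))   # [0 0 2 3]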
def build_categorical_column(
    categories, codes, mask=None, size=None, offset=0, ordered=None
):
    """
    Build a CategoricalColumn

    Parameters
    ----------
    categories : Column
        Column of categories
    codes : Column
        Column of codes, the size of the resulting Column will be
        the size of `codes`
    mask : Buffer
        Null mask
    size : int, optional
    offset : int, optional
    ordered : bool
        Indicates whether the categories are ordered
    """
    dtype = CategoricalDtype(categories=as_column(categories), ordered=ordered)

    return build_column(
        data=None,
        dtype=dtype,
        mask=mask,
        size=size,
        offset=offset,
        children=(as_column(codes),),
    )
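# Illustration (not part of the original source): the categories/codes split
# above is the same dictionary encoding pandas uses. A small pandas sketch of
# the equivalent construction:
import pandas as pd

cat = pd.Categorical.from_codes(
    codes=[0, 1, 0, 2],          # integer indices into `categories`
    categories=["a", "b", "c"],
    ordered=False,
)
print(cat)  # ['a', 'b', 'a', 'c'], Categories (3, object): ['a', 'b', 'c']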
def haversine_distance(p1_lon, p1_lat, p2_lon, p2_lat):
    """Compute the haversine distances between an arbitrary list of lon/lat
    pairs

    Parameters
    ----------
    p1_lon
        longitude of first set of coords
    p1_lat
        latitude of first set of coords
    p2_lon
        longitude of second set of coords
    p2_lat
        latitude of second set of coords

    Returns
    -------
    result : cudf.Series
        The distance between all pairs of lon/lat coordinates
    """
    p1_lon, p1_lat, p2_lon, p2_lat = normalize_point_columns(
        as_column(p1_lon),
        as_column(p1_lat),
        as_column(p2_lon),
        as_column(p2_lat),
    )
    return cpp_haversine_distance(p1_lon, p1_lat, p2_lon, p2_lat)
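# Illustration (not part of the original source): the haversine formula this
# wrapper hands off to the GPU kernel, sketched in NumPy. The earth radius in
# kilometers is an assumption here, chosen to match cuspatial's km-based
# coordinate conventions:
import numpy as np

def haversine_np(lon1, lat1, lon2, lat2, radius_km=6371.0):
    # great-circle distance between (lon1, lat1) and (lon2, lat2)
    lon1, lat1, lon2, lat2 = map(np.radians, (lon1, lat1, lon2, lat2))
    dlon, dlat = lon2 - lon1, lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    return 2 * radius_km * np.arcsin(np.sqrt(a))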
def from_sequences(
    cls, arbitrary: Sequence[ColumnLike]
) -> "cudf.core.column.ListColumn":
    """
    Create a list column for list of column-like sequences
    """
    data_col = column.column_empty(0)
    mask_col = []
    offset_col = [0]
    offset = 0

    # Build Data, Mask & Offsets
    for data in arbitrary:
        if cudf._lib.scalar._is_null_host_scalar(data):
            mask_col.append(False)
            offset_col.append(offset)
        else:
            mask_col.append(True)
            data_col = data_col.append(as_column(data))
            offset += len(data)
            offset_col.append(offset)

    offset_col = column.as_column(offset_col, dtype="int32")

    # Build ListColumn
    res = cls(
        size=len(arbitrary),
        dtype=cudf.ListDtype(data_col.dtype),
        mask=cudf._lib.transform.bools_to_mask(as_column(mask_col)),
        offset=0,
        null_count=0,
        children=(offset_col, data_col),
    )
    return res
def polygon_bounding_boxes(poly_offsets, ring_offsets, xs, ys):
    """Compute the minimum bounding-boxes for a set of polygons.

    Parameters
    ----------
    poly_offsets
        Begin indices of the first ring in each polygon (i.e. prefix-sum)
    ring_offsets
        Begin indices of the first point in each ring (i.e. prefix-sum)
    xs
        Polygon point x-coordinates
    ys
        Polygon point y-coordinates

    Returns
    -------
    result : cudf.DataFrame
        minimum bounding boxes for each polygon

        x_min : cudf.Series
            the minimum x-coordinate of each bounding box
        y_min : cudf.Series
            the minimum y-coordinate of each bounding box
        x_max : cudf.Series
            the maximum x-coordinate of each bounding box
        y_max : cudf.Series
            the maximum y-coordinate of each bounding box
    """
    poly_offsets = as_column(poly_offsets, dtype="int32")
    ring_offsets = as_column(ring_offsets, dtype="int32")
    xs, ys = normalize_point_columns(as_column(xs), as_column(ys))
    return DataFrame._from_table(
        cpp_polygon_bounding_boxes(poly_offsets, ring_offsets, xs, ys)
    )
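# Illustration (not part of the original source): the prefix-sum offsets
# convention used here and in the polyline version below, reduced to a CPU
# sketch. Per-group minima/maxima over flattened coordinates fall out of
# np.minimum.reduceat / np.maximum.reduceat (the GPU version additionally
# resolves ring_offsets through poly_offsets):
import numpy as np

xs = np.array([0.0, 1.0, 2.0, 5.0, 6.0])
point_offsets = np.array([0, 3])  # group 0 -> points 0..2, group 1 -> 3..4

x_min = np.minimum.reduceat(xs, point_offsets)  # [0.0, 5.0]
x_max = np.maximum.reduceat(xs, point_offsets)  # [2.0, 6.0]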
def polyline_bounding_boxes(poly_offsets, xs, ys, expansion_radius):
    """Compute the minimum bounding-boxes for a set of polylines.

    Parameters
    ----------
    poly_offsets
        Begin indices of the first point in each polyline (i.e. prefix-sum)
    xs
        Polyline point x-coordinates
    ys
        Polyline point y-coordinates
    expansion_radius
        radius of each polyline point

    Returns
    -------
    result : cudf.DataFrame
        minimum bounding boxes for each polyline

        x_min : cudf.Series
            the minimum x-coordinate of each bounding box
        y_min : cudf.Series
            the minimum y-coordinate of each bounding box
        x_max : cudf.Series
            the maximum x-coordinate of each bounding box
        y_max : cudf.Series
            the maximum y-coordinate of each bounding box
    """
    poly_offsets = as_column(poly_offsets, dtype="int32")
    xs, ys = normalize_point_columns(as_column(xs), as_column(ys))
    return DataFrame._from_data(
        *cpp_polyline_bounding_boxes(poly_offsets, xs, ys, expansion_radius)
    )
def read_partition(
    fs, piece, columns, index, categories=(), partitions=(), **kwargs
):
    if columns is not None:
        columns = [c for c in columns]
    if isinstance(index, list):
        columns += index

    if isinstance(piece, str):
        path = piece
        row_group = None
        partition_keys = []
    else:
        (path, row_group, partition_keys) = piece

    strings_to_cats = kwargs.get("strings_to_categorical", False)
    if cudf.utils.ioutils._is_local_filesystem(fs):
        df = cudf.read_parquet(
            path,
            engine="cudf",
            columns=columns,
            row_groups=row_group,
            strings_to_categorical=strings_to_cats,
            **kwargs.get("read", {}),
        )
    else:
        with fs.open(path, mode="rb") as f:
            df = cudf.read_parquet(
                f,
                engine="cudf",
                columns=columns,
                row_groups=row_group,
                strings_to_categorical=strings_to_cats,
                **kwargs.get("read", {}),
            )

    if index and (index[0] in df.columns):
        df = df.set_index(index[0])

    if partition_keys:
        if partitions is None:
            raise ValueError("Must pass partition sets")
        for i, (name, index2) in enumerate(partition_keys):
            categories = [
                val.as_py() for val in partitions.levels[i].dictionary
            ]
            col = as_column(index2).as_frame().repeat(len(df))._data[None]
            df[name] = build_categorical_column(
                categories=categories,
                codes=as_column(col.base_data, dtype=col.dtype),
                size=col.size,
                offset=col.offset,
                ordered=False,
            )

    return df
def create_multihot_col(offsets, elements):
    """
    offsets = cudf series with offset values for list data
    elements = cudf series with the list data flattened to 1-d
    """
    if isinstance(elements, pd.Series):
        col = pd.Series(dtype="object")
        # length of each list entry, from consecutive offset differences
        lh = pd.Series(offsets[1:]).reset_index(drop=True)
        rh = pd.Series(offsets[:-1]).reset_index(drop=True)
        vals_per_entry = lh - rh
        vals_used = 0
        entries = []
        for vals_count in vals_per_entry:
            vals_count = int(vals_count)
            # positional slicing; .iloc avoids label-based lookups
            entry = elements.iloc[vals_used : vals_used + vals_count].values
            if len(entry) == 1:
                entry = entry[0]
            vals_used += vals_count
            entries.append(entry)
        col = col.append(pd.Series(entries))
    else:
        offsets = as_column(offsets, dtype="int32")
        elements = as_column(elements)
        col = _build_cudf_list_column(elements, offsets)
        col = cudf.Series(col)
    return col
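# Illustration (not part of the original source): the offsets layout used
# above and throughout these list-column helpers, shown on the CPU. Row i
# spans elements[offsets[i]:offsets[i + 1]] of the flattened values:
import numpy as np

elements = np.array([10, 20, 30, 40, 50])
offsets = np.array([0, 2, 2, 5])  # rows: [10, 20], [], [30, 40, 50]

rows = [
    elements[offsets[i] : offsets[i + 1]].tolist()
    for i in range(len(offsets) - 1)
]
print(rows)  # [[10, 20], [], [30, 40, 50]]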
def from_numpy(cls, array):
    cast_dtype = array.dtype.type == np.int64
    if array.dtype.kind == "M":
        time_unit, _ = np.datetime_data(array.dtype)
        cast_dtype = time_unit in ("D", "W", "M", "Y") or (
            len(array) > 0
            and (
                isinstance(array[0], str)
                or isinstance(array[0], dt.datetime)
            )
        )
    elif not cast_dtype:
        raise ValueError(
            "Cannot infer datetime dtype from np.array dtype `%s`"
            % (array.dtype)
        )

    if cast_dtype:
        array = array.astype(np.dtype("datetime64[s]"))
    assert array.dtype.itemsize == 8

    mask = None
    if np.any(np.isnat(array)):
        # Replace NaT values with nulls and capture the resulting null mask
        null = cudf.core.column.column_empty_like(
            array, masked=True, newsize=1
        )
        col = libcudf.replace.replace(
            as_column(Buffer(array), dtype=array.dtype),
            as_column(
                Buffer(
                    np.array([np.datetime64("NaT")], dtype=array.dtype)
                ),
                dtype=array.dtype,
            ),
            null,
        )
        mask = col.mask

    return cls(data=Buffer(array), mask=mask, dtype=array.dtype)
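# Illustration (not part of the original source): the NaT detection that
# drives the null mask above, shown with plain NumPy:
import numpy as np

arr = np.array(["2020-01-01", "NaT", "2020-01-03"], dtype="datetime64[s]")
print(np.isnat(arr))  # [False  True False]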
def _build_cudf_list_column(new_elements, new_offsets):
    if not HAS_GPU:
        return []
    return build_column(
        None,
        dtype=cudf.core.dtypes.ListDtype(new_elements.dtype),
        size=new_offsets.size - 1,
        children=(as_column(new_offsets), as_column(new_elements)),
    )
def trajectory_distances_and_speeds(
    num_trajectories, object_ids, xs, ys, timestamps
):
    """
    Compute the distance traveled and speed of sets of trajectories

    Parameters
    ----------
    num_trajectories
        number of trajectories (unique object ids)
    object_ids
        column of object (e.g., vehicle) ids
    xs
        column of x-coordinates (in kilometers)
    ys
        column of y-coordinates (in kilometers)
    timestamps
        column of timestamps in any resolution

    Returns
    -------
    result : cudf.DataFrame
        distance : cudf.Series
            trajectory distance (in meters)
        speed : cudf.Series
            trajectory speed (in meters/second)

    Examples
    --------
    Compute the distances and speeds of derived trajectories

    >>> objects, traj_offsets = cuspatial.derive_trajectories(...)
    >>> dists_and_speeds = cuspatial.trajectory_distances_and_speeds(
    ...     len(traj_offsets),
    ...     objects['object_id'],
    ...     objects['x'],
    ...     objects['y'],
    ...     objects['timestamp']
    ... )
    >>> print(dists_and_speeds)
                   distance          speed
    trajectory_id
    0                1000.0  100000.000000
    1                1000.0  111111.109375
    """
    object_ids = as_column(object_ids, dtype=np.int32)
    xs, ys = normalize_point_columns(as_column(xs), as_column(ys))
    timestamps = normalize_timestamp_column(as_column(timestamps))
    df = DataFrame._from_table(
        cpp_trajectory_distances_and_speeds(
            num_trajectories, object_ids, xs, ys, timestamps
        )
    )
    df.index.name = "trajectory_id"
    return df
def trajectory_bounding_boxes(num_trajectories, object_ids, xs, ys):
    """
    Compute the bounding boxes of sets of trajectories.

    Parameters
    ----------
    num_trajectories
        number of trajectories (unique object ids)
    object_ids
        column of object (e.g., vehicle) ids
    xs
        column of x-coordinates (in kilometers)
    ys
        column of y-coordinates (in kilometers)

    Returns
    -------
    result : cudf.DataFrame
        minimum bounding boxes (in kilometers) for each trajectory

        x_min : cudf.Series
            the minimum x-coordinate of each bounding box
        y_min : cudf.Series
            the minimum y-coordinate of each bounding box
        x_max : cudf.Series
            the maximum x-coordinate of each bounding box
        y_max : cudf.Series
            the maximum y-coordinate of each bounding box

    Examples
    --------
    Compute the minimum bounding boxes of derived trajectories

    >>> objects, traj_offsets = cuspatial.derive_trajectories(
    ...     [0, 0, 1, 1],   # object_id
    ...     [0, 1, 2, 3],   # x
    ...     [0, 0, 1, 1],   # y
    ...     [0, 10, 0, 10]  # timestamp
    ... )
    >>> traj_bounding_boxes = cuspatial.trajectory_bounding_boxes(
    ...     len(traj_offsets),
    ...     objects['object_id'],
    ...     objects['x'],
    ...     objects['y']
    ... )
    >>> print(traj_bounding_boxes)
       x_min  y_min  x_max  y_max
    0    0.0    0.0    2.0    2.0
    1    1.0    1.0    3.0    3.0
    """
    object_ids = as_column(object_ids, dtype=np.int32)
    xs, ys = normalize_point_columns(as_column(xs), as_column(ys))
    return DataFrame._from_table(
        cpp_trajectory_bounding_boxes(num_trajectories, object_ids, xs, ys)
    )
def _proc_inf_strings(col):
    """Convert "inf/infinity" strings into "Inf", the native string
    representing infinity in libcudf
    """
    # TODO: This can be handled by libcudf in the future; see
    # StringColumn.as_numerical_column
    col = libstrings.replace_multi(
        col,
        as_column(["+", "inf", "inity"]),
        as_column(["", "Inf", ""]),
    )
    return col
def derive_trajectories(object_ids, xs, ys, timestamps):
    """
    Derive trajectories from object ids, points, and timestamps.

    Parameters
    ----------
    object_ids
        column of object (e.g., vehicle) ids
    xs
        column of x-coordinates (in kilometers)
    ys
        column of y-coordinates (in kilometers)
    timestamps
        column of timestamps in any resolution

    Returns
    -------
    result : tuple (objects, traj_offsets)
        objects : cudf.DataFrame
            object_ids, xs, ys, and timestamps sorted by
            ``(object_id, timestamp)``, used by ``trajectory_bounding_boxes``
            and ``trajectory_distances_and_speeds``
        traj_offsets : cudf.Series
            offsets of discovered trajectories

    Examples
    --------
    Compute sorted objects and discovered trajectories

    >>> objects, traj_offsets = cuspatial.derive_trajectories(
    ...     [0, 0, 1, 1],   # object_id
    ...     [0, 1, 2, 3],   # x
    ...     [0, 0, 1, 1],   # y
    ...     [10, 0, 10, 0]  # timestamp
    ... )
    >>> print(traj_offsets)
    0    0
    1    2
    >>> print(objects)
       object_id  x  y  timestamp
    0          0  1  0          0
    1          0  0  0         10
    2          1  3  1          0
    3          1  2  1         10
    """
    object_ids = as_column(object_ids, dtype=np.int32)
    xs, ys = normalize_point_columns(as_column(xs), as_column(ys))
    timestamps = normalize_timestamp_column(as_column(timestamps))
    objects, traj_offsets = cpp_derive_trajectories(
        object_ids, xs, ys, timestamps
    )
    return DataFrame._from_table(objects), Series(data=traj_offsets)
def execute(
    self, requests: List[InferenceRequest]
) -> List[InferenceResponse]:
    """Transforms the input batches by running them through an NVTabular
    workflow.transform function.
    """
    responses = []
    for request in requests:
        # create a cudf DataFrame from the triton request
        input_df = cudf.DataFrame(
            {
                name: _convert_tensor(get_input_tensor_by_name(request, name))
                for name in self.input_dtypes
            }
        )

        for name, dtype in self.input_multihots.items():
            values = as_column(
                _convert_tensor(
                    get_input_tensor_by_name(request, name + "__values")
                )
            )
            nnzs = as_column(
                _convert_tensor(
                    get_input_tensor_by_name(request, name + "__nnzs")
                )
            )
            input_df[name] = build_column(
                None, dtype=dtype, size=nnzs.size - 1, children=(nnzs, values)
            )

        # use our NVTabular workflow to transform the dataframe
        output_df = nvtabular.workflow._transform_partition(
            input_df, [self.workflow.column_group]
        )

        # convert back to a triton response
        output_tensors = []
        for name in output_df.columns:
            col = output_df[name]
            if is_list_dtype(col.dtype):
                # convert list values to match TF dataloader
                values = col.list.leaves.values_host.astype(
                    self.output_dtypes[name + "__values"]
                )
                values = values.reshape(len(values), 1)
                output_tensors.append(Tensor(name + "__values", values))

                offsets = col._column.offsets.values_host.astype(
                    self.output_dtypes[name + "__nnzs"]
                )
                nnzs = offsets[1:] - offsets[:-1]
                nnzs = nnzs.reshape(len(nnzs), 1)
                output_tensors.append(Tensor(name + "__nnzs", nnzs))
            else:
                d = col.values_host.astype(self.output_dtypes[name])
                d = d.reshape(len(d), 1)
                output_tensors.append(Tensor(name, d))

        responses.append(InferenceResponse(output_tensors))
    return responses
def __getitem__(self, arg):
    from cudf.core.column import column

    if isinstance(arg, Number):
        arg = int(arg)
        return self.element_indexing(arg)
    elif isinstance(arg, slice):
        if is_categorical_dtype(self):
            codes = self.codes[arg]
            return build_categorical_column(
                categories=self.categories,
                codes=as_column(codes.base_data, dtype=codes.dtype),
                mask=codes.base_mask,
                ordered=self.ordered,
                size=codes.size,
                offset=codes.offset,
            )

        start, stop, stride = arg.indices(len(self))
        if start < 0:
            start = start + len(self)
        if stop < 0:
            stop = stop + len(self)

        if start >= stop:
            return column_empty(0, self.dtype, masked=True)
        # compute mask slice
        if stride == 1 or stride is None:
            return libcudfxx.copying.column_slice(self, [start, stop])[0]
        else:
            # Need to create a gather map for given slice with stride
            gather_map = as_column(
                cupy.arange(
                    start=start,
                    stop=stop,
                    step=stride,
                    dtype=np.dtype(np.int32),
                )
            )
            return self.as_frame()._gather(gather_map)._as_column()
    else:
        arg = column.as_column(arg)
        if len(arg) == 0:
            arg = column.as_column([], dtype="int32")
        if pd.api.types.is_integer_dtype(arg.dtype):
            return self.take(arg)
        if pd.api.types.is_bool_dtype(arg.dtype):
            return self.apply_boolean_mask(arg)
        raise NotImplementedError(type(arg))
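# Illustration (not part of the original source): how a strided slice turns
# into a gather map in the stride branch above. For col[1:8:3] on a column of
# length 10, slice(1, 8, 3).indices(10) yields (1, 8, 3), and the map is just
# an arange over that range:
import numpy as np

start, stop, stride = slice(1, 8, 3).indices(10)
print(np.arange(start=start, stop=stop, step=stride, dtype=np.int32))  # [1 4 7]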
def create_multihot_col(self, offsets, data):
    """
    offsets = cudf series with offset values for list data
    data = cudf series with the list data flattened to 1-d
    """
    offs = as_column(offsets, dtype="int32")
    encoded = as_column(data)
    col = build_column(
        None,
        size=offs.size - 1,
        dtype=cudf.core.dtypes.ListDtype(encoded.dtype),
        children=(offs, encoded),
    )
    return cudf.Series(col)
def read_metadata(*args, **kwargs):
    meta, stats, parts, index = ArrowEngine.read_metadata(*args, **kwargs)

    # If `strings_to_categorical==True`, convert objects to int32
    strings_to_cats = kwargs.get("strings_to_categorical", False)
    new_meta = cudf.DataFrame(index=meta.index)
    for col in meta.columns:
        if meta[col].dtype == "O":
            new_meta[col] = as_column(
                meta[col], dtype="int32" if strings_to_cats else "object"
            )
        else:
            new_meta[col] = as_column(meta[col])

    return (new_meta, stats, parts, index)
def find_and_replace(
    self,
    to_replace: ColumnLike,
    replacement: ColumnLike,
    all_nan: bool = False,
) -> NumericalColumn:
    """
    Return col with *to_replace* replaced with *replacement*.
    """
    to_replace_col = as_column(to_replace)
    replacement_col = as_column(replacement)

    if type(to_replace_col) != type(replacement_col):
        raise TypeError(
            f"to_replace and value should be of the same type, "
            f"got to_replace dtype: {to_replace_col.dtype} and "
            f"value dtype: {replacement_col.dtype}"
        )

    if not isinstance(to_replace_col, NumericalColumn) and not isinstance(
        replacement_col, NumericalColumn
    ):
        return self.copy()

    to_replace_col = _normalize_find_and_replace_input(
        self.dtype, to_replace
    )
    if all_nan:
        replacement_col = column.as_column(replacement, dtype=self.dtype)
    else:
        replacement_col = _normalize_find_and_replace_input(
            self.dtype, replacement
        )
    replaced = self.copy()
    if len(replacement_col) == 1 and len(to_replace_col) > 1:
        replacement_col = column.as_column(
            utils.scalar_broadcast_to(
                replacement[0], (len(to_replace_col),), self.dtype
            )
        )
    elif len(replacement_col) == 1 and len(to_replace_col) == 0:
        return replaced
    to_replace_col, replacement_col, replaced = numeric_normalize_types(
        to_replace_col, replacement_col, replaced
    )
    return libcudf.replace.replace(
        replaced, to_replace_col, replacement_col
    )
def scalar_broadcast_to(scalar, size, dtype=None):
    if isinstance(size, (tuple, list)):
        size = size[0]

    if scalar is None or (
        isinstance(scalar, (np.datetime64, np.timedelta64))
        and np.isnat(scalar)
    ):
        if dtype is None:
            dtype = "object"
        return column.column_empty(size, dtype=dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        if dtype is None:
            return _categorical_scalar_broadcast_to(scalar, size)
        else:
            return scalar_broadcast_to(scalar.categories[0], size).astype(
                dtype
            )

    scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
    dtype = scalar.dtype

    if np.dtype(dtype).kind in ("O", "U"):
        gather_map = column.full(size, 0, dtype="int32")
        scalar_str_col = column.as_column([scalar], dtype="str")
        return scalar_str_col[gather_map]
    else:
        out_col = column.column_empty(size, dtype=dtype)
        if out_col.size != 0:
            out_col.data_array_view[:] = scalar
        return out_col
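# Illustration (not part of the original source): the string branch above
# broadcasts by gathering a one-row column with an all-zeros map, while the
# numeric branch is a device-side fill. CPU equivalents of both tricks:
import numpy as np

# string path: gather row 0 repeatedly
arr = np.array(["hi"], dtype=object)
gather_map = np.zeros(4, dtype="int32")
print(arr[gather_map])   # ['hi' 'hi' 'hi' 'hi']

# numeric path: a plain fill
print(np.full(4, 3.14))  # [3.14 3.14 3.14 3.14]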
def _gather(self, gather_map):
    if not pd.api.types.is_integer_dtype(gather_map.dtype):
        gather_map = gather_map.astype("int32")
    result = self.__class__._from_table(
        libcudfxx.copying.gather(self, as_column(gather_map))
    )
    result._copy_categories(self)
    return result
def find_last(arr, val, compare="eq"):
    """
    Returns the index of the last occurrence of *val* in *arr* (or, if
    *compare* is not "eq", of the last occurrence where *arr* *compare*
    *val* holds). Otherwise, returns -1.

    Parameters
    ----------
    arr : device array
    val : scalar
    compare : str ('gt', 'lt', or 'eq' (default))
    """
    found = rmm.device_array_like(arr)
    if found.size > 0:
        if compare == "gt":
            gpu_mark_gt.forall(found.size)(arr, val, found, -1)
        elif compare == "lt":
            gpu_mark_lt.forall(found.size)(arr, val, found, -1)
        else:
            if arr.dtype in ("float32", "float64"):
                gpu_mark_found_float.forall(found.size)(arr, val, found, -1)
            else:
                gpu_mark_found_int.forall(found.size)(arr, val, found, -1)
    from cudf.core.column import as_column

    found_col = as_column(found)
    max_index = found_col.max()
    return max_index
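# Illustration (not part of the original source): a CPU reference for the
# same semantics, assuming NumPy only — index of the last element satisfying
# the comparison, else -1:
import numpy as np

def find_last_np(arr, val, compare="eq"):
    ops = {"eq": np.equal, "gt": np.greater, "lt": np.less}
    hits = np.flatnonzero(ops[compare](arr, val))
    return int(hits[-1]) if hits.size else -1

print(find_last_np(np.array([1, 3, 2, 3, 0]), 3))  # 3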
def set_by_label(self, key: Any, value: Any, validate: bool = True):
    """
    Add (or modify) column by name.

    Parameters
    ----------
    key
        name of the column
    value : column-like
        The value to insert into the column.
    validate : bool
        If True, the provided value will be coerced to a column and
        validated before setting (Default value = True).
    """
    key = self._pad_key(key)
    if validate:
        value = column.as_column(value)
        if len(self._data) > 0:
            if len(value) != self._column_length:
                raise ValueError("All columns must be of equal length")
        else:
            self._column_length = len(value)

    self._data[key] = value
    self._clear_cache()
def as_string_column(self, dtype, **kwargs):
    if len(self) > 0:
        return string._numeric_to_str_typecast_functions[
            np.dtype(self.dtype)
        ](self, **kwargs)
    else:
        return as_column([], dtype="object")
def _index_or_values_interpolation(column, index=None):
    """
    Interpolate over a float column. Assumes a linear interpolation
    strategy using the index of the data to denote spacing of the x
    values. For example the data and index [1.0, NaN, 4.0], [1, 3, 4]
    would result in [1.0, 3.0, 4.0].
    """
    # figure out where the nans are
    mask = cp.isnan(column)

    # trivial cases, all nan or no nans
    num_nan = mask.sum()
    if num_nan == 0 or num_nan == len(column):
        return column

    to_interp = Frame(data={None: column}, index=index)
    known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask))

    known_x = known_x_and_y._index._column.values
    known_y = known_x_and_y._data.columns[0].values

    result = cp.interp(to_interp._index.values, known_x, known_y)

    # find the first non-nan value: leading NaNs have no left neighbor to
    # interpolate from, so they remain NaN
    first_nan_idx = (mask == 0).argmax().item()
    result[:first_nan_idx] = np.nan
    return result
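# Illustration (not part of the original source): the docstring example
# checked against np.interp directly — the NaN at x=3 is interpolated
# linearly between (1, 1.0) and (4, 4.0):
import numpy as np

print(np.interp([1, 3, 4], [1, 4], [1.0, 4.0]))  # [1. 3. 4.]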
def _can_downcast_to_series(self, df, arg):
    """
    This method encapsulates the logic used to determine whether or not
    the result of a loc/iloc operation should be "downcasted" from a
    DataFrame to a Series
    """
    from cudf.core.column import as_column

    if isinstance(df, cudf.Series):
        return False
    nrows, ncols = df.shape
    if nrows == 1:
        if type(arg[0]) is slice:
            if not is_scalar(arg[1]):
                return False
        else:
            # row selection using boolean indexing - never downcasts
            if pd.api.types.is_bool_dtype(as_column(arg[0]).dtype):
                return False
        dtypes = df.dtypes.values.tolist()
        all_numeric = all(
            [pd.api.types.is_numeric_dtype(t) for t in dtypes]
        )
        if all_numeric:
            return True
    if ncols == 1:
        if type(arg[1]) is slice:
            if not is_scalar(arg[0]):
                return False
        if isinstance(arg[1], tuple):
            # Multiindex indexing with a slice
            if any(isinstance(v, slice) for v in arg):
                return False
        return True
    return False
def scalar_broadcast_to(scalar, size, dtype=None):
    from cudf.utils.dtypes import to_cudf_compatible_scalar, is_string_dtype
    from cudf.core.column import column_empty

    if isinstance(size, (tuple, list)):
        size = size[0]

    if scalar is None:
        if dtype is None:
            dtype = "object"
        return column_empty(size, dtype=dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        return scalar_broadcast_to(scalar.categories[0], size).astype(dtype)

    if isinstance(scalar, str) and (is_string_dtype(dtype) or dtype is None):
        dtype = "object"
    else:
        scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
        dtype = scalar.dtype

    if np.dtype(dtype) == np.dtype("object"):
        from cudf.core.column import as_column

        gather_map = cupy.zeros(size, dtype="int32")
        scalar_str_col = as_column([scalar], dtype="str")
        return scalar_str_col[gather_map]
    else:
        out_col = column_empty(size, dtype=dtype)
        if out_col.size != 0:
            out_col.data_array_view[:] = scalar
        return out_col
def __init__(
    self,
    data: Union[MutableMapping, ColumnAccessor] = None,
    multiindex: bool = False,
    level_names=None,
):
    if data is None:
        data = {}
    # TODO: we should validate the keys of `data`
    if isinstance(data, ColumnAccessor):
        multiindex = multiindex or data.multiindex
        level_names = level_names or data.level_names
        self._data = data._data
        self.multiindex = multiindex
        self._level_names = level_names
    else:
        # This code path is performance-critical for copies and should be
        # modified with care.
        self._data = {}
        if data:
            data = dict(data)
            # Faster than next(iter(data.values()))
            column_length = len(data[next(iter(data))])
            for k, v in data.items():
                # Much faster to avoid the function call if possible; the
                # extra isinstance is negligible if we do have to make a
                # column from something else.
                if not isinstance(v, column.ColumnBase):
                    v = column.as_column(v)
                if len(v) != column_length:
                    raise ValueError("All columns must be of equal length")
                self._data[k] = v
        self.multiindex = multiindex
        self._level_names = level_names
def scalar_broadcast_to(scalar, size, dtype=None):
    from cudf.utils.cudautils import fill_value
    from cudf.utils.dtypes import to_cudf_compatible_scalar, is_string_dtype
    from cudf.core.column import column_empty

    if isinstance(size, (tuple, list)):
        size = size[0]

    if scalar is None:
        if dtype is None:
            dtype = "object"
        return column_empty(size, dtype=dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        return scalar_broadcast_to(scalar.categories[0], size).astype(dtype)

    if isinstance(scalar, str) and (is_string_dtype(dtype) or dtype is None):
        dtype = "object"
    else:
        scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
        dtype = scalar.dtype

    if np.dtype(dtype) == np.dtype("object"):
        import nvstrings
        from cudf.core.column import as_column
        from cudf.utils.cudautils import zeros

        gather_map = zeros(size, dtype="int32")
        scalar_str_col = as_column(nvstrings.to_device([scalar]))
        return scalar_str_col[gather_map]
    else:
        da = rmm.device_array((size,), dtype=dtype)
        if da.size != 0:
            fill_value(da, scalar)
        return da