def run(self, df, **launch_params):
    """
    Execute the kernel over ``df`` and return a copy with the outputs added.

    Parameters
    ----------
    df : DataFrame
        Frame supplying the input columns named by ``self.incols``.
    **launch_params
        Forwarded unchanged to ``self.launch_kernel``.

    Returns
    -------
    DataFrame
        A copy of ``df`` holding one additional column per entry of
        ``self.outcols``.
    """
    # Gather the input buffers. When ``incols`` is a dict it maps a frame
    # column name to the kernel argument name that column is bound to.
    inputs = {}
    if isinstance(self.incols, dict):
        for frame_name, arg_name in self.incols.items():
            inputs[arg_name] = df[frame_name]._column.data_array_view
    else:
        for frame_name in self.incols:
            inputs[frame_name] = df[frame_name]._column.data_array_view

    # Allocate one unmasked output buffer per declared output column
    outputs = {
        out_name: column.column_empty(len(df), out_dtype, False).data_array_view
        for out_name, out_dtype in self.outcols.items()
    }

    # Bind arguments; later sources override earlier ones on a name clash
    call_args = {}
    call_args.update(inputs)
    call_args.update(outputs)
    call_args.update(self.kwargs)
    bound = self.sig.bind(**call_args)

    # Launch kernel
    self.launch_kernel(df, bound.args, **launch_params)

    # With pessimistic nulls every output shares one mask aggregated over
    # the input columns
    out_mask = (
        make_aggregate_nullmask(df, columns=self.incols)
        if self.pessimistic_nulls
        else None
    )

    # Attach the outputs to a copy of the input frame, in sorted name order
    result = df.copy()
    for out_name in sorted(self.outcols):
        result[out_name] = Series(
            outputs[out_name], index=result.index, nan_as_null=False
        )
        if out_mask is not None:
            result[out_name] = result[out_name].set_mask(
                out_mask.data_array_view
            )
    return result
def from_sequences(
    cls, arbitrary: Sequence[ColumnLike]
) -> "cudf.core.column.ListColumn":
    """
    Build a list column from a sequence of column-like sequences.

    Parameters
    ----------
    arbitrary : Sequence[ColumnLike]
        One entry per output row. Entries recognised as null host
        scalars become null rows; every other entry is converted with
        ``as_column`` and appended to the child data column.

    Returns
    -------
    cudf.core.column.ListColumn
        Column whose children are the int32 offsets column and the
        concatenated data column.
    """
    values = column.column_empty(0)
    validity = []
    offsets = [0]
    running_total = 0

    # Build data, mask and offsets
    for item in arbitrary:
        if cudf._lib.scalar._is_null_host_scalar(item):
            # A null row adds no data, so it repeats the previous offset
            validity.append(False)
        else:
            validity.append(True)
            values = values.append(as_column(item))
            running_total += len(item)
        offsets.append(running_total)

    offsets_col = column.as_column(offsets, dtype="int32")

    # Build the list column
    # NOTE(review): null_count is passed as 0 even when the mask marks
    # null rows - confirm the constructor recomputes it.
    return cls(
        size=len(arbitrary),
        dtype=cudf.ListDtype(values.dtype),
        mask=cudf._lib.transform.bools_to_mask(as_column(validity)),
        offset=0,
        null_count=0,
        children=(offsets_col, values),
    )
def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False):
    """
    Apply the binary operation ``op`` to two numeric operands.

    Parameters
    ----------
    lhs, rhs : column, scalar or None
        The operands. When one side is a scalar, the output length and
        nullability follow the other side. A ``None`` operand forces a
        masked output.
    op : str
        Operation name handed to ``libcudf.binops.apply_op``.
    out_dtype : dtype
        dtype of the allocated output column.
    reflect : bool, default False
        If True, swap ``lhs`` and ``rhs`` before applying ``op``.

    Returns
    -------
    column
        The result column. For comparison operations nulls are filled
        with ``True`` for ``"ne"`` and ``False`` otherwise.
    """
    if reflect:
        lhs, rhs = rhs, lhs

    libcudf.nvtx.nvtx_range_push("CUDF_BINARY_OP", "orange")
    try:
        # Work out the output size and whether it needs a null mask
        if np.isscalar(lhs):
            masked = rhs.nullable
            row_count = len(rhs)
        elif np.isscalar(rhs):
            masked = lhs.nullable
            row_count = len(lhs)
        elif rhs is None:
            masked = True
            row_count = len(lhs)
        elif lhs is None:
            masked = True
            row_count = len(rhs)
        else:
            masked = lhs.nullable or rhs.nullable
            row_count = len(lhs)

        is_op_comparison = op in ("lt", "gt", "le", "ge", "eq", "ne")

        out = column.column_empty(row_count, dtype=out_dtype, masked=masked)
        libcudf.binops.apply_op(lhs, rhs, out, op)

        if is_op_comparison:
            out = out.fillna(op == "ne")
    finally:
        # Always close the profiling range, even when the operation raises,
        # so that push/pop calls stay balanced.
        libcudf.nvtx.nvtx_range_pop()

    return out
def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False):
    """
    Apply the binary operation ``op`` to two numeric operands.

    Parameters
    ----------
    lhs, rhs : column or scalar
        The operands. When one side is a scalar, the output length,
        null mask requirement and name follow the other side.
    op : str
        Operation name handed to ``libcudf.binops.apply_op``.
    out_dtype : dtype
        dtype of the allocated output column and of the returned view.
    reflect : bool, default False
        If True, swap ``lhs`` and ``rhs`` before applying ``op``.

    Returns
    -------
    NumericalColumn
        View over the output column. It is named after the non-scalar
        operand when exactly one operand is a scalar, otherwise unnamed.
    """
    if reflect:
        lhs, rhs = rhs, lhs

    libcudf.nvtx.nvtx_range_push("CUDF_BINARY_OP", "orange")
    try:
        # Work out the output size, mask requirement and name
        name = None
        if np.isscalar(lhs):
            masked = rhs.has_null_mask
            row_count = len(rhs)
            name = rhs.name
        elif np.isscalar(rhs):
            masked = lhs.has_null_mask
            row_count = len(lhs)
            name = lhs.name
        else:
            masked = lhs.has_null_mask or rhs.has_null_mask
            row_count = len(lhs)

        is_op_comparison = op in ("lt", "gt", "le", "ge", "eq", "ne")

        out = column.column_empty(row_count, dtype=out_dtype, masked=masked)

        # Call and fix null_count
        null_count = libcudf.binops.apply_op(lhs, rhs, out, op)

        if is_op_comparison:
            # Nulls compare as True only for "ne"
            out.fillna(op == "ne", inplace=True)
        else:
            out = out.replace(null_count=null_count)

        result = out.view(NumericalColumn, dtype=out_dtype, name=name)
    finally:
        # Always close the profiling range, even when the operation raises,
        # so that push/pop calls stay balanced.
        libcudf.nvtx.nvtx_range_pop()

    return result
def __init__(self, **kwargs):
    """
    Parameters
    ----------
    data : Buffer
        The code values
    mask : Buffer; optional
        The validity mask
    null_count : int; optional
        The number of null values in the mask.
    categories : iterable; optional
        The categories. Defaults to an empty list.
    ordered : bool
        whether the categorical has a logical ordering (e.g. less than).
        Required: a ``KeyError`` is raised when it is not supplied.

    Notes
    -----
    ``ordered`` and ``categories`` are consumed here; a ``dtype`` entry
    is added and the remaining keyword arguments are forwarded to the
    parent constructor.
    """
    ordered = bool(kwargs.pop("ordered"))
    categories = kwargs.pop("categories", [])

    if len(categories) == 0:
        # Default to String dtype when there are no categories, like
        # pandas does
        categories = column.column_empty(0, np.dtype("object"), masked=False)
    else:
        categories = column.as_column(categories)

    kwargs["dtype"] = CategoricalDtype(
        categories=column.as_column(categories), ordered=ordered
    )
    super(CategoricalColumn, self).__init__(**kwargs)

    self._categories = categories
    self._ordered = ordered
def as_string_column(self, dtype, **kwargs):
    """
    Convert this column to a string column.

    Parameters
    ----------
    dtype
        Not used by this implementation.
    **kwargs
        Forwarded to the conversion routine registered for this
        column's dtype.

    Returns
    -------
    column
        The converted column, or an empty unmasked object-dtype column
        when this column has no rows.
    """
    from cudf.core.column import string

    if len(self) == 0:
        return column.column_empty(0, dtype="object", masked=False)

    # The conversion routine is selected by the column's numpy dtype
    converter = string._numeric_to_str_typecast_functions[np.dtype(self.dtype)]
    return converter(self, **kwargs)
def as_string_column(self, dtype, **kwargs):
    """
    Convert this column to a string column.

    Parameters
    ----------
    dtype
        Not used by this implementation.
    **kwargs
        Forwarded to the conversion routine. When ``format`` is missing
        or falsy it is filled in from ``_dtype_to_format_conversion``
        keyed by this column's dtype name, falling back to
        ``"%Y-%m-%d %H:%M:%S"``.

    Returns
    -------
    column
        The converted column, or an empty unmasked object-dtype column
        when this column has no rows.
    """
    if not kwargs.get("format"):
        kwargs["format"] = _dtype_to_format_conversion.get(
            self.dtype.name, "%Y-%m-%d %H:%M:%S"
        )

    if len(self) == 0:
        return column.column_empty(0, dtype="object", masked=False)

    converter = string._numeric_to_str_typecast_functions[np.dtype(self.dtype)]
    return converter(self, **kwargs)
def _string_safe_hash(df):
    """
    Hash the columns of ``df`` after replacing string columns by hashes.

    Parameters
    ----------
    df : DataFrame
        Frame to hash. It is shallow-copied, so column replacement
        happens on the copy.

    Returns
    -------
    The result of ``hash_columns()`` on the copy, in which every
    ``StringColumn`` has been replaced by an int32 column of hashes.
    """
    frame = df.copy(deep=False)
    for name in frame.columns:
        source = frame[name]._column
        if not isinstance(source, StringColumn):
            continue
        # Write the string hashes directly into a freshly allocated buffer
        hashed = column.column_empty(len(frame), dtype="int32", masked=False)
        source.data_array_view.hash(devptr=hashed.data.ptr)
        frame[name] = hashed
    return frame.hash_columns()
def as_string_column(self, dtype, format=None):
    """
    Convert this column to a string column.

    Parameters
    ----------
    dtype
        Not used by this implementation.
    format : str; optional
        Format passed to the conversion routine. When ``None`` it is
        looked up in ``_dtype_to_format_conversion`` by this column's
        dtype name, falling back to ``"%Y-%m-%d %H:%M:%S"``.

    Returns
    -------
    column
        The converted column, or an empty unmasked object-dtype column
        when this column has no rows.
    """
    if format is None:
        format = _dtype_to_format_conversion.get(
            self.dtype.name, "%Y-%m-%d %H:%M:%S"
        )

    if len(self) == 0:
        return column.column_empty(0, dtype="object", masked=False)

    converter = string._datetime_to_str_typecast_functions[np.dtype(self.dtype)]
    return converter(self, format)
def as_string_column(
    self, dtype: Dtype, format=None
) -> "cudf.core.column.StringColumn":
    """
    Convert this column to a string column.

    Parameters
    ----------
    dtype : Dtype
        Not used by this implementation.
    format : str; optional
        Format passed to the conversion routine. When ``None`` it is
        looked up in ``_dtype_to_format_conversion`` by this column's
        dtype name, falling back to ``"%D days %H:%M:%S"``.

    Returns
    -------
    cudf.core.column.StringColumn
        The converted column, or an empty unmasked object-dtype column
        when this column has no rows.
    """
    if format is None:
        format = _dtype_to_format_conversion.get(
            self.dtype.name, "%D days %H:%M:%S"
        )

    if len(self) == 0:
        empty = column.column_empty(0, dtype="object", masked=False)
        return cast("cudf.core.column.StringColumn", empty)

    converter = string._timedelta_to_str_typecast_functions[
        np.dtype(self.dtype)
    ]
    return converter(self, format=format)
def as_string_column(self, dtype, **kwargs):
    """
    Convert this column to a string column.

    Parameters
    ----------
    dtype
        Not used by this implementation.
    **kwargs
        Forwarded to the conversion routine after ``count``, ``nulls``,
        ``bdevmem`` and ``units`` have been set from this column.

    Returns
    -------
    column
        The converted column, or an empty unmasked object-dtype column
        when this column has no rows.
    """
    from cudf.core.column import string

    if len(self) == 0:
        return column.column_empty(0, dtype="object", masked=False)

    dev_ptr = self.data_ptr
    # Only hand over a mask pointer when the column can hold nulls
    null_ptr = self.mask_ptr if self.nullable else None

    kwargs["count"] = len(self)
    kwargs["nulls"] = null_ptr
    kwargs["bdevmem"] = True
    kwargs["units"] = self.time_unit

    converter = string._numeric_to_str_typecast_functions[np.dtype(self.dtype)]
    return as_column(converter(dev_ptr, **kwargs))
def as_numerical_column(self, dtype, **kwargs):
    """
    Parse this string column into a numerical column of ``dtype``.

    Parameters
    ----------
    dtype
        Requested output dtype. int8 and int16 are parsed as int32 and
        cast at the end; datetime64 values are parsed into an int64
        buffer and cast at the end.
    **kwargs
        Forwarded to the string-to-number conversion routine. For
        datetime64, ``units`` is set from ``dtype`` and, when ``format``
        is absent and the column is non-empty, ``format`` is guessed
        from the first non-null element.

    Returns
    -------
    column
        The parsed column cast to ``dtype``. It carries a null mask
        taken from the strings when this column has nulls.
    """
    out_dtype = np.dtype(dtype)
    mem_dtype = out_dtype  # dtype of the device buffer written into
    str_dtype = out_dtype  # key selecting the conversion routine

    if mem_dtype.type in (np.int8, np.int16):
        mem_dtype = str_dtype = np.dtype(np.int32)
    elif mem_dtype.type is np.datetime64:
        kwargs["units"] = np.datetime_data(mem_dtype)[0]
        mem_dtype = np.dtype(np.int64)
        if "format" not in kwargs and len(self.nvstrings) > 0:
            # infer on host from the first not na element
            first_valid = self[self.notna()][0]
            kwargs["format"] = pd.core.tools.datetimes._guess_datetime_format(
                first_valid
            )

    # Parse straight into a freshly allocated device buffer
    out_arr = rmm.device_array(shape=len(self), dtype=mem_dtype)
    kwargs["devptr"] = libcudf.cudf.get_ctype_ptr(out_arr)
    _str_to_numeric_typecast_functions[str_dtype](self.nvstrings, **kwargs)

    out_col = column.as_column(out_arr)

    if self.has_nulls:
        # Copy the strings' null bitmask into a buffer of matching size
        mask_size = utils.calc_chunk_size(
            len(self.nvstrings), utils.mask_bitsize
        )
        out_mask = column.column_empty(
            mask_size, dtype="int8", masked=False
        ).data
        self.nvstrings.set_null_bitmask(out_mask.ptr, bdevmem=True)
        out_col.mask = out_mask

    return out_col.astype(out_dtype)
def as_string_column(
    self, dtype: Dtype, format=None, **kwargs
) -> "cudf.core.column.StringColumn":
    """
    Convert this column to a string column.

    Parameters
    ----------
    dtype : Dtype
        Not used by this implementation.
    format : str; optional
        Format passed to the conversion routine. When ``None`` it is
        looked up in ``_dtype_to_format_conversion`` by this column's
        dtype name, falling back to ``"%Y-%m-%d %H:%M:%S"``.
    **kwargs
        Accepted but not used by this implementation.

    Returns
    -------
    cudf.core.column.StringColumn
        The converted column, or an empty unmasked object-dtype column
        when this column has no rows.
    """
    if format is None:
        format = _dtype_to_format_conversion.get(
            self.dtype.name, "%Y-%m-%d %H:%M:%S"
        )

    # The names column is only populated for the special formats;
    # otherwise an empty column is passed.
    names = (
        as_column(_DATETIME_NAMES)
        if format in _DATETIME_SPECIAL_FORMATS
        else cudf.core.column.column_empty(0, dtype="object", masked=False)
    )

    if len(self) == 0:
        empty = column.column_empty(0, dtype="object", masked=False)
        return cast("cudf.core.column.StringColumn", empty)

    converter = string._datetime_to_str_typecast_functions[
        cudf.dtype(self.dtype)
    ]
    return converter(self, format, names)
def _values(self):
    """
    Materialise the values as a column.

    Returns
    -------
    column
        A column built from ``cudautils.arange(self._start, self._stop)``
        in this object's dtype, or an empty unmasked column of that
        dtype when the length is zero.
    """
    if len(self) == 0:
        return column.column_empty(0, masked=False, dtype=self.dtype)

    values = cudautils.arange(self._start, self._stop, dtype=self.dtype)
    return column.as_column(values)
def __setitem__(self, key, value):
    """
    Set the value of self[key] to value.

    If value and self are of different types,
    value is coerced to self.dtype

    Parameters
    ----------
    key : slice, boolean mask or column-like of positions
        Selects the elements to overwrite. Slices must have a stride
        of 1. A boolean mask must be as long as the column.
    value : scalar, None or column-like
        Scalars (and ``None``) are broadcast to the number of selected
        elements; anything else must already have that length.

    Raises
    ------
    NotImplementedError
        If ``key`` is a slice with a stride other than 1.
    ValueError
        If a boolean mask has the wrong length, or if ``value`` does
        not match the number of selected elements.
    IndexError
        If the scatter reports an out-of-bounds position.
    """
    from cudf.core import column

    key_is_slice = isinstance(key, slice)
    if key_is_slice:
        key_start, key_stop, key_stride = key.indices(len(self))
        if key_stride != 1:
            raise NotImplementedError("Stride not supported in slice")
        # With a stride of 1, a slice whose start is at or past its stop
        # selects nothing. ``abs`` would count the gap as elements.
        nelem = max(key_stop - key_start, 0)
    else:
        key = column.as_column(key)
        if pd.api.types.is_bool_dtype(key.dtype):
            if len(key) != len(self):
                raise ValueError(
                    "Boolean mask must be of same length as column"
                )
            # Turn the boolean mask into the positions it selects
            key = column.as_column(cudautils.arange(len(self)))[key]
        nelem = len(key)

    if is_scalar(value):
        # Broadcast the scalar to one entry per selected element
        if is_categorical_dtype(self.dtype):
            from cudf.utils.cudautils import fill_value

            data = rmm.device_array(nelem, dtype=self.codes.dtype)
            fill_value(data, self._encode(value))
            value = build_categorical_column(
                categories=self.dtype.categories,
                codes=as_column(data),
                ordered=self.dtype.ordered,
            )
        elif value is None:
            value = column.column_empty(nelem, self.dtype, masked=True)
        else:
            to_dtype = pd.api.types.pandas_dtype(self.dtype)
            value = utils.scalar_broadcast_to(value, nelem, to_dtype)

    value = column.as_column(value).astype(self.dtype)

    if len(value) != nelem:
        msg = (
            f"Size mismatch: cannot set value "
            f"of size {len(value)} to indexing result of size "
            f"{nelem}"
        )
        raise ValueError(msg)

    if is_categorical_dtype(value.dtype):
        value = value.cat().set_categories(self.categories)
        assert self.dtype == value.dtype

    if key_is_slice:
        if nelem == 0:
            # Empty selection: nothing to copy, leave the column untouched
            return
        out = libcudf.copying.copy_range(
            self, value, key_start, key_stop, 0
        )
    else:
        try:
            out = libcudf.copying.scatter(value, key, self)
        except RuntimeError as e:
            if "out of bounds" in str(e):
                raise IndexError(
                    f"index out of bounds for column of size {len(self)}"
                ) from e
            raise

    self._mimic_inplace(out, inplace=True)