def _concat(cls, objs):
    """Concatenate several columns into a single column.

    Parameters
    ----------
    objs : sequence of columns
        Columns to concatenate. ``objs[0]`` is read unconditionally, so
        an empty sequence raises ``IndexError``.

    Returns
    -------
    column
        ``objs[0].replace(...)`` carrying the concatenated data, the
        concatenated mask (``None`` when no non-empty input has a null
        mask) and the summed null count.

    Raises
    ------
    ValueError
        If any input is not type-equivalent to the first one.
    """
    head = objs[0]
    if not all(o.is_type_equivalent(head) for o in objs):
        raise ValueError("All series must be of same type")

    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]
    newsize = sum(len(o) for o in objs)

    # Concatenate data
    mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
    data = Buffer.from_empty(mem)
    for o in objs:
        data.extend(o.data.to_gpu_array())

    # Concatenate mask if present
    if any(o.has_null_mask for o in objs):
        # FIXME: Inefficient
        # ``np.bool_`` rather than the ``np.bool`` alias: the alias was
        # deprecated in NumPy 1.20 and removed in 1.24.
        mem = rmm.device_array(shape=newsize, dtype=np.bool_)
        mask = Buffer.from_empty(mem)
        null_count = 0
        for o in objs:
            mask.extend(o._get_mask_as_column().to_gpu_array())
            null_count += o._null_count
        mask = Buffer(cudautils.compact_mask_bytes(mask.to_gpu_array()))
    else:
        mask = None
        null_count = 0

    col = head.replace(data=data, mask=mask, null_count=null_count)
    return col
def _mask_from_cuda_array_interface_desc(obj):
    """Build a mask ``Buffer`` from an object's CUDA array interface.

    Parameters
    ----------
    obj : object exposing ``__cuda_array_interface__``
        Its interface dict is checked for a ``"mask"`` entry.

    Returns
    -------
    Buffer or None
        ``None`` when the interface has no mask (or the entry is
        ``None``); otherwise a ``Buffer`` built from the mask.

    Raises
    ------
    NotImplementedError
        If the mask's typestr code is neither ``"t"`` nor ``"b"``.
    """
    from cudf.utils.utils import calc_chunk_size, mask_dtype, mask_bitsize
    from cudf.utils.cudautils import compact_mask_bytes

    mask_obj = obj.__cuda_array_interface__.get("mask", None)
    if mask_obj is None:
        return mask_obj

    mask_desc = mask_obj.__cuda_array_interface__
    ptr = mask_desc["data"][0]
    nelem = mask_desc["shape"][0]
    typestr = mask_desc["typestr"]
    typecode = typestr[1]

    if typecode == "t":
        # Typecode "t": wrap the existing memory directly, keeping
        # ``obj`` as the owner of the Buffer.
        nelem = calc_chunk_size(nelem, mask_bitsize)
        return Buffer(data=ptr, size=nelem * mask_dtype.itemsize, owner=obj)

    if typecode == "b":
        # Typecode "b": view the memory as a device array of the
        # declared dtype and pass it through compact_mask_bytes.
        bytemask = rmm.device_array_from_ptr(
            ptr, nelem=nelem, dtype=np.dtype(typestr), finalizer=None
        )
        return Buffer(compact_mask_bytes(bytemask))

    raise NotImplementedError(f"Cannot infer mask from typestr {typestr}")
def __getitem__(self, arg):
    """Index the column with a scalar or a slice.

    Parameters
    ----------
    arg : Number or slice
        A number is converted with ``int`` and looked up through
        ``element_indexing``; a slice yields a new column built with
        ``replace``.

    Raises
    ------
    NotImplementedError
        For any other argument type, or for a slice with a step other
        than 1 when the column has nulls.
    """
    if isinstance(arg, Number):
        return self.element_indexing(int(arg))

    if not isinstance(arg, slice):
        raise NotImplementedError(type(arg))

    # The normalized bounds are not used below; the call is retained so
    # any error normalize_slice raises still surfaces.
    start, stop = utils.normalize_slice(arg, len(self))

    if not self.null_count > 0:
        return self.replace(data=self.data[arg])

    if arg.step is not None and arg.step != 1:
        raise NotImplementedError(arg)

    sliced_data = self.data[arg]
    # Expand the mask to one entry per element, slice that, then pack
    # the sliced entries back with compact_mask_bytes.
    expanded = cudautils.expand_mask_bits(
        self.data.size, self.mask.to_gpu_array()
    )
    sliced_mask = Buffer(cudautils.compact_mask_bytes(expanded[arg]))
    return self.replace(data=sliced_data, mask=sliced_mask)
def as_mask(self):
    """Pack this column's boolean values into a bitmask.

    Returns
    -------
    device array
        Result of ``cudautils.compact_mask_bytes`` applied to
        ``self.to_gpu_array()``.
    """
    bool_values = self.to_gpu_array()
    return cudautils.compact_mask_bytes(bool_values)
def as_mask(self):
    """Pack this column's boolean values into a bitmask.

    Returns
    -------
    device array
        Result of ``cudautils.compact_mask_bytes`` applied to
        ``self.data_array_view``.

    Raises
    ------
    ValueError
        If the column has nulls.
    """
    if not self.has_nulls:
        bool_values = self.data_array_view
        return cudautils.compact_mask_bytes(bool_values)
    raise ValueError("Column must have no nulls.")
def column_select_by_boolmask(column, boolmask):
    """Select entries of a column using a boolean mask.

    Parameters
    ----------
    column : column
        Source column; must have ``null_count == 0``.
    boolmask : column
        Boolean selector, read through ``to_gpu_array()``.

    Returns
    -------
    (selected_column, selected_positions)
        ``selected_column`` is ``column.replace`` with the selected
        values; ``selected_positions`` is a ``NumericalColumn`` built
        from the selected entries of ``arange(len(boolmask))``.
    """
    from cudf.dataframe.numerical import NumericalColumn

    # We don't properly handle the boolmask yet
    assert column.null_count == 0

    packed_bits = cudautils.compact_mask_bytes(boolmask.to_gpu_array())
    positions = cudautils.arange(len(boolmask))

    # copy_to_dense returns a pair; only its second item is used here.
    _, dense_positions = cudautils.copy_to_dense(positions, mask=packed_bits)
    _, dense_values = cudautils.copy_to_dense(
        column.data.to_gpu_array(), mask=packed_bits
    )

    values_column = column.replace(data=Buffer(dense_values))
    positions_buffer = Buffer(dense_positions)
    positions_column = NumericalColumn(
        data=positions_buffer, dtype=positions_buffer.dtype
    )
    return values_column, positions_column
def pandas_categorical_as_column(categorical, codes=None):
    """Create a CategoricalColumn from a pandas.Categorical.

    Parameters
    ----------
    categorical : pandas.Categorical
        Supplies ``categories`` and ``ordered`` and, by default, the
        codes.
    codes : optional
        When given, used instead of ``categorical.codes``.

    Returns
    -------
    column
        Result of ``column.build_categorical_column``. A mask is
        attached only when at least one code equals ``-1``.
    """
    if codes is None:
        codes = categorical.codes
    codes = column.as_column(codes)

    # Entries whose code is -1 are the ones masked out.
    is_valid = codes != -1
    if np.all(is_valid):
        mask = None
    else:
        mask = Buffer(cudautils.compact_mask_bytes(is_valid))

    return column.build_categorical_column(
        categories=categorical.categories,
        codes=codes,
        mask=mask,
        ordered=categorical.ordered,
    )
def pandas_categorical_as_column(categorical, codes=None):
    """Create a CategoricalColumn from a pandas.Categorical.

    Parameters
    ----------
    categorical : pandas.Categorical
        Supplies ``categories`` and ``ordered`` and, by default, a copy
        of its codes.
    codes : optional
        When given, used as-is instead of ``categorical.codes``.

    Returns
    -------
    CategoricalColumn
        Carries a mask and null count only when at least one code
        equals ``-1``.
    """
    # TODO fix mutability issue in numba to avoid the .copy()
    if codes is None:
        codes = categorical.codes.copy()

    # TODO pending pandas to be improved
    #       https://github.com/pandas-dev/pandas/issues/14711
    #       https://github.com/pandas-dev/pandas/pull/16015
    is_valid = codes != -1
    params = {
        "data": Buffer(codes),
        "categories": categorical.categories,
        "ordered": categorical.ordered,
    }

    if not np.all(is_valid):
        mask_bits = cudautils.compact_mask_bytes(is_valid)
        valid_total = np.count_nonzero(is_valid)
        params["mask"] = Buffer(mask_bits)
        params["null_count"] = codes.size - valid_total

    return CategoricalColumn(**params)
def __getitem__(self, arg):
    """Index the column with a scalar, a slice, or an array-like.

    Parameters
    ----------
    arg : Number, slice, or array-like
        A number is converted with ``int`` and looked up through
        ``element_indexing``. A slice yields a new column built with
        ``replace``. Anything else is converted with
        ``columnops.as_column``: integer dtypes go to ``take``, boolean
        dtypes to ``apply_boolean_mask``.

    Raises
    ------
    NotImplementedError
        For a slice with a step other than 1 when the column has
        nulls, or an array-like that is neither integer nor boolean.
    """
    from cudf.dataframe import columnops

    if isinstance(arg, Number):
        return self.element_indexing(int(arg))

    if isinstance(arg, slice):
        if not self.null_count > 0:
            return self.replace(data=self.data[arg])

        if arg.step is not None and arg.step != 1:
            raise NotImplementedError(arg)

        sliced_data = self.data[arg]
        # For object dtype the data's ``size`` is called as a method;
        # otherwise it is read as an attribute.
        if self.dtype == "object":
            nelem = self.data.size()
        else:
            nelem = self.data.size
        # Expand the mask to one entry per element, slice that, then
        # pack the sliced entries back with compact_mask_bytes.
        expanded = cudautils.expand_mask_bits(nelem, self.mask.to_gpu_array())
        sliced_mask = Buffer(cudautils.compact_mask_bytes(expanded[arg]))
        return self.replace(data=sliced_data, mask=sliced_mask)

    indexer = columnops.as_column(arg)
    if len(indexer) == 0:
        # An empty indexer is replaced by an empty int32 column.
        indexer = columnops.as_column([], dtype="int32")
    if pd.api.types.is_integer_dtype(indexer.dtype):
        return self.take(indexer.data.mem)
    if pd.api.types.is_bool_dtype(indexer.dtype):
        return self.apply_boolean_mask(indexer)
    raise NotImplementedError(type(indexer))
def __getitem__(self, arg):
    """Index the column with a scalar, a slice, or an array-like.

    Parameters
    ----------
    arg : Number, slice, or array-like
        A number is converted with ``int`` and looked up through
        ``element_indexing``. A slice yields a new column (categorical
        columns slice their codes and are rebuilt around them).
        Anything else is converted with ``column.as_column``: integer
        dtypes go to ``take``, boolean dtypes to ``apply_boolean_mask``.

    Returns
    -------
    scalar or column
        For a slice that selects no elements, an empty masked column of
        the same dtype (``column_empty(0, self.dtype, masked=True)``).

    Raises
    ------
    NotImplementedError
        For a non-empty slice with a step other than 1 when the column
        has nulls, or an array-like that is neither integer nor boolean.
    """
    from cudf.core.column import column

    if isinstance(arg, Number):
        arg = int(arg)
        return self.element_indexing(arg)
    elif isinstance(arg, slice):

        if is_categorical_dtype(self):
            codes = self.codes[arg]
            return build_column(
                data=None,
                dtype=self.dtype,
                mask=codes.mask,
                children=(codes,),
            )

        start, stop, stride = arg.indices(len(self))
        # A slice selects nothing whenever the equivalent range is empty
        # (e.g. ``col[5:2]``), not only when start == stop. Return the
        # empty column for all of those cases instead of running the
        # slicing / mask-packing path on zero elements.
        if len(range(start, stop, stride)) == 0:
            return column_empty(0, self.dtype, masked=True)

        has_step = arg.step is not None and arg.step != 1

        # compute mask slice
        if self.has_nulls:
            if has_step:
                raise NotImplementedError(arg)

            # slicing data
            slice_data = self.data_array_view[arg]
            # slicing mask: expand to one entry per element, slice,
            # then pack again with compact_mask_bytes
            bytemask = cudautils.expand_mask_bits(
                self.size, self.mask_array_view
            )
            slice_mask = cudautils.compact_mask_bytes(bytemask[arg])
        else:
            slice_data = self.data_array_view[arg]
            slice_mask = None

        if self.dtype == "object":
            return as_column(slice_data)

        if has_step:
            slice_data = cudautils.as_contiguous(slice_data)
            slice_data = Buffer(slice_data)
        else:
            # data Buffer lifetime is tied to self:
            slice_data = Buffer(
                data=slice_data.device_ctypes_pointer.value,
                size=slice_data.nbytes,
                owner=self,
            )

        # mask Buffer lifetime is not:
        if slice_mask is not None:
            slice_mask = Buffer(slice_mask)

        return build_column(slice_data, self.dtype, mask=slice_mask)
    else:
        arg = column.as_column(arg)
        if len(arg) == 0:
            # An empty indexer is replaced by an empty int32 column.
            arg = column.as_column([], dtype="int32")
        if pd.api.types.is_integer_dtype(arg.dtype):
            return self.take(arg)
        if pd.api.types.is_bool_dtype(arg.dtype):
            return self.apply_boolean_mask(arg)
        raise NotImplementedError(type(arg))