def indices_from_labels(obj, labels):
    """Map index ``labels`` to integer positions within ``obj``'s index.

    Returns a Series of integer positions (column "_") into ``obj``,
    ordered to match the order of ``labels``.
    """
    from cudf.core.column import column

    if not isinstance(labels, cudf.MultiIndex):
        labels = column.as_column(labels)

        # Match the dtype of obj's index so the join below compares
        # like-for-like values.
        if is_categorical_dtype(obj.index):
            labels = labels.astype("category")
            codes = labels.codes.astype(obj.index._values.codes.dtype)
            labels = column.build_categorical_column(
                categories=labels.dtype.categories,
                codes=codes,
                ordered=labels.dtype.ordered,
            )
        else:
            labels = labels.astype(obj.index.dtype)

    # join is not guaranteed to maintain the index ordering
    # so we will sort it with its initial ordering which is stored
    # in column "__"
    lhs = cudf.DataFrame({"__": column.arange(len(labels))}, index=labels)
    rhs = cudf.DataFrame({"_": column.arange(len(obj))}, index=obj.index)
    return lhs.join(rhs).sort_values("__")["_"]
def column_empty(row_count, dtype="object", masked=False):
    """Allocate a new column like the given row_count and dtype.

    The returned column's buffers are uninitialized; ``masked=True``
    additionally allocates an (empty) null mask.
    """
    dtype = pd.api.types.pandas_dtype(dtype)

    def _empty_child(count, child_dtype):
        # Uninitialized child column holding `count` elements of `child_dtype`.
        nbytes = count * np.dtype(child_dtype).itemsize
        return build_column(data=Buffer.empty(nbytes), dtype=child_dtype)

    if is_categorical_dtype(dtype):
        # Categorical: data lives in the int32 codes child column.
        data = None
        children = (_empty_child(row_count, "int32"),)
    elif dtype.kind in "OU":
        # Strings: offsets child (row_count + 1 entries) plus char-data child.
        data = None
        children = (
            _empty_child(row_count + 1, "int32"),
            _empty_child(row_count, "int8"),
        )
    else:
        # Fixed-width types carry their payload directly.
        data = Buffer.empty(row_count * dtype.itemsize)
        children = ()

    mask = Buffer(cudautils.make_empty_mask(row_count)) if masked else None

    return build_column(data, dtype, mask=mask, children=children)
def melt(
    frame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
):
    """Unpivots a DataFrame from wide format to long format,
    optionally leaving identifier variables set.

    Parameters
    ----------
    frame : DataFrame
    id_vars : scalar, tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
        default: None
    value_vars : scalar, tuple, list, or ndarray, optional
        Column(s) to unpivot.
        default: all columns that are not set as `id_vars`.
    var_name : scalar
        Name to use for the `variable` column.
        default: frame.columns.name or 'variable'
    value_name : str
        Name to use for the `value` column.
        default: 'value'

    Returns
    -------
    out : DataFrame
        Melted result

    Difference from pandas:
     * Does not support 'col_level' because cuDF does not have multi-index

    Examples
    --------
    >>> import cudf
    >>> import numpy as np
    >>> df = cudf.DataFrame({'A': {0: 1, 1: 1, 2: 5},
    ...                      'B': {0: 1, 1: 3, 2: 6},
    ...                      'C': {0: 1.0, 1: np.nan, 2: 4.0},
    ...                      'D': {0: 2.0, 1: 5.0, 2: 6.0}})
    >>> cudf.melt(frame=df, id_vars=['A', 'B'], value_vars=['C', 'D'])
         A    B variable value
    0    1    1        C   1.0
    1    1    3        C
    2    5    6        C   4.0
    3    1    1        D   2.0
    4    1    3        D   5.0
    5    5    6        D   6.0
    """
    assert col_level in (None,)

    # Arg cleaning
    import collections

    def _normalize_vars(var_list, arg_name):
        # Normalize a scalar / str / sequence of column labels into a list
        # and verify every label exists in `frame`.
        if var_list is None:
            return []
        # BUGFIX: a bare string is a *single* column label, not a sequence
        # of labels — str passes the Sequence isinstance check, and
        # list("AB") would previously split it into ["A", "B"].
        if isinstance(var_list, str) or not isinstance(
            var_list, collections.abc.Sequence
        ):
            var_list = [var_list]
        var_list = list(var_list)
        missing = set(var_list) - set(frame.columns)
        if len(missing) != 0:
            raise KeyError(
                "The following '{name}' are not present"
                " in the DataFrame: {missing}"
                "".format(name=arg_name, missing=list(missing))
            )
        return var_list

    id_vars = _normalize_vars(id_vars, "id_vars")

    if value_vars is not None:
        value_vars = _normalize_vars(value_vars, "value_vars")
    else:
        # then all remaining columns in frame
        value_vars = list(frame.columns.drop(id_vars))

    # Error for unimplemented support for datatype
    dtypes = [frame[col].dtype for col in id_vars + value_vars]
    if any(is_categorical_dtype(t) for t in dtypes):
        raise NotImplementedError(
            "Categorical columns are not yet " "supported for function"
        )

    # Check dtype homogeneity in value_var
    # Because heterogeneous concat is unimplemented
    dtypes = [frame[col].dtype for col in value_vars]
    if len(dtypes) > 0:
        dtype = dtypes[0]
        if any(t != dtype for t in dtypes):
            raise ValueError("all cols in value_vars must have the same dtype")

    # overlap
    overlap = set(id_vars).intersection(set(value_vars))
    if not len(overlap) == 0:
        raise KeyError(
            "'value_vars' and 'id_vars' cannot have overlap."
            " The following 'value_vars' are ALSO present"
            " in 'id_vars': {overlap}"
            "".format(overlap=list(overlap))
        )

    N = len(frame)
    K = len(value_vars)

    def _tile(A, reps):
        # Concatenate `reps` copies of Series A end-to-end.
        series_list = [A] * reps
        if reps > 0:
            return Series._concat(objs=series_list, index=None)
        else:
            return Series([], dtype=A.dtype)

    # Step 1: tile id_vars — each id column is repeated once per value_var.
    mdata = collections.OrderedDict()
    for col in id_vars:
        mdata[col] = _tile(frame[col], K)

    # Step 2: add variable — int8 codes 0..K-1 identifying the source column.
    var_cols = []
    for i, var in enumerate(value_vars):
        var_cols.append(Series(cudautils.full(size=N, value=i, dtype=np.int8)))
    temp = Series._concat(objs=var_cols, index=None)

    if not var_name:
        var_name = "variable"

    mdata[var_name] = Series(
        build_categorical_column(
            categories=value_vars,
            codes=as_column(temp._column.base_data, dtype=temp._column.dtype),
            mask=temp._column.base_mask,
            size=temp._column.size,
            offset=temp._column.offset,
            ordered=False,
        )
    )

    # Step 3: add values — the unpivoted columns stacked vertically.
    mdata[value_name] = Series._concat(
        objs=[frame[val] for val in value_vars], index=None
    )

    return DataFrame(mdata)
def as_column(arbitrary, nan_as_null=True, dtype=None, length=None):
    """Create a Column from an arbitrary object

    Parameters
    ----------
    arbitrary : object
        Object to construct the Column from. See *Notes*.
    nan_as_null : bool,optional
        If True (default), treat NaN values in arbitrary as null.
    dtype : optional
        Optionally typecast the construted Column to the given dtype.
    length : int, optional
        If `arbitrary` is a scalar, broadcast into a Column of the given
        length.

    Returns
    -------
    A Column of the appropriate type and size.

    Notes
    -----
    Currently support inputs are:

    * ``Column``
    * ``Series``
    * ``Index``
    * Scalars (can be broadcasted to a specified `length`)
    * Objects exposing ``__cuda_array_interface__`` (e.g., numba device
      arrays)
    * Objects exposing ``__array_interface__``(e.g., numpy arrays)
    * pyarrow array
    * pandas.Categorical objects
    """
    from cudf.core.column import numerical, categorical, datetime, string
    from cudf.core.series import Series
    from cudf.core.index import Index

    # Already a column: return as-is (optionally casting).
    if isinstance(arbitrary, ColumnBase):
        if dtype is not None:
            return arbitrary.astype(dtype)
        else:
            return arbitrary
    elif isinstance(arbitrary, Series):
        data = arbitrary._column
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, Index):
        data = arbitrary._values
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, nvstrings.nvstrings):
        # Copy an nvstrings instance into offsets/chars device buffers.
        byte_count = arbitrary.byte_count()
        if byte_count > libcudfxx.MAX_STRING_COLUMN_BYTES:
            raise MemoryError(
                "Cannot construct string columns "
                "containing > {} bytes. "
                "Consider using dask_cudf to partition "
                "your data.".format(libcudfxx.MAX_STRING_COLUMN_BYTES_STR)
            )
        sbuf = Buffer.empty(arbitrary.byte_count())
        obuf = Buffer.empty(
            (arbitrary.size() + 1) * np.dtype("int32").itemsize
        )

        nbuf = None
        if arbitrary.null_count() > 0:
            mask_size = calc_chunk_size(arbitrary.size(), mask_bitsize)
            nbuf = Buffer.empty(mask_size)
            arbitrary.set_null_bitmask(nbuf.ptr, bdevmem=True)
        arbitrary.to_offsets(sbuf.ptr, obuf.ptr, None, bdevmem=True)
        # Children layout: (offsets int32, chars int8).
        children = (
            build_column(obuf, dtype="int32"),
            build_column(sbuf, dtype="int8"),
        )
        data = build_column(
            data=None, dtype="object", mask=nbuf, children=children
        )
        # Keep the source nvstrings alive / cached on the column.
        data._nvstrings = arbitrary

    elif isinstance(arbitrary, Buffer):
        if dtype is None:
            raise TypeError(f"dtype cannot be None if 'arbitrary' is a Buffer")
        data = build_column(arbitrary, dtype=dtype)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary), dtype=arbitrary.dtype)
        if (
            data.dtype in [np.float16, np.float32, np.float64]
            and arbitrary.size > 0
        ):
            # Floats: convert NaN elements to nulls when requested.
            if nan_as_null:
                mask = libcudf.unaryops.nans_to_nulls(data)
                data = data.set_mask(mask)
        elif data.dtype.kind == "M":
            # Datetimes: replace NaT values with nulls via libcudf replace.
            null = column_empty_like(data, masked=True, newsize=1)
            col = libcudf.replace.replace(
                as_column(Buffer(arbitrary), dtype=arbitrary.dtype),
                as_column(
                    Buffer(np.array([np.datetime64("NaT")], dtype=data.dtype)),
                    dtype=arbitrary.dtype,
                ),
                null,
            )
            data = datetime.DatetimeColumn(
                data=Buffer(arbitrary), dtype=data.dtype, mask=col.mask
            )
    elif hasattr(arbitrary, "__cuda_array_interface__"):
        # Zero-copy construction from any CUDA-array-interface exporter.
        desc = arbitrary.__cuda_array_interface__
        data = _data_from_cuda_array_interface_desc(arbitrary)
        mask = _mask_from_cuda_array_interface_desc(arbitrary)
        dtype = np.dtype(desc["typestr"])
        col = build_column(data, dtype=dtype, mask=mask)
        return col
    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags["C_CONTIGUOUS"]:
            arbitrary = np.ascontiguousarray(arbitrary)

        if dtype is not None:
            arbitrary = arbitrary.astype(dtype)

        if arbitrary.dtype.kind == "M":
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ("O", "U"):
            # Strings go through pyarrow (handled by the pa.Array branch).
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)
    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            pa_size, pa_offset, nbuf, obuf, sbuf = buffers_from_pyarrow(
                arbitrary
            )
            children = (
                build_column(data=obuf, dtype="int32"),
                build_column(data=sbuf, dtype="int8"),
            )

            data = string.StringColumn(
                mask=nbuf, children=children, size=pa_size, offset=pa_offset
            )
        elif isinstance(arbitrary, pa.NullArray):
            # All-null array: pick a target dtype, then recurse.
            new_dtype = pd.api.types.pandas_dtype(dtype)
            if (type(dtype) == str and dtype == "empty") or dtype is None:
                new_dtype = pd.api.types.pandas_dtype(
                    arbitrary.type.to_pandas_dtype()
                )

            if is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary),), dtype=new_dtype
                        )
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary),), dtype=new_dtype
                        )
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            codes = as_column(arbitrary.indices)
            if isinstance(arbitrary.dictionary, pa.NullArray):
                categories = as_column([], dtype="object")
            else:
                categories = as_column(arbitrary.dictionary)
            dtype = CategoricalDtype(
                categories=categories, ordered=arbitrary.type.ordered
            )
            data = categorical.CategoricalColumn(
                dtype=dtype,
                mask=codes.base_mask,
                children=(codes,),
                size=codes.size,
                offset=codes.offset,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            dtype = np.dtype("M8[{}]".format(arbitrary.type.unit))
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype
            )
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        elif isinstance(arbitrary, pa.Date64Array):
            raise NotImplementedError
            # NOTE(review): everything below the raise is unreachable —
            # presumably kept as a sketch for future Date64 support; confirm
            # before deleting.
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype="M8[ms]"
            )
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                dtype=np.dtype("M8[ms]"),
                size=pa_size,
                offset=pa_offset,
            )
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value",
                UserWarning,
            )
            data = as_column(arbitrary.cast(pa.int32())).astype("M8[ms]")
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype
            )
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        else:
            # Any other pyarrow array: treat as numerical.
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary
            )
            data = numerical.NumericalColumn(
                data=padata,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()),
                mask=pamask,
                size=pa_size,
                offset=pa_offset,
            )
    elif isinstance(arbitrary, pa.ChunkedArray):
        # Convert each chunk, then concat to a single column.
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]
        if dtype and dtype != "empty":
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = "category"
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())
        data = ColumnBase._concat(gpu_cols, dtype=new_dtype)
    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(
                pa.array(np.asarray(arbitrary), from_pandas=True)
            )
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))
    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))
    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        # Scalar: broadcast to `length` elements (default 1).
        length = length or 1
        data = as_column(
            utils.scalar_broadcast_to(arbitrary, length, dtype=dtype)
        )
        if not nan_as_null:
            data = data.fillna(np.nan)
    elif isinstance(arbitrary, memoryview):
        data = as_column(
            np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null
        )
    else:
        # Fallback: try the buffer protocol, then pyarrow, then numpy.
        try:
            data = as_column(
                memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
            )
        except TypeError:
            pa_type = None
            np_type = None
            try:
                if dtype is not None:
                    dtype = pd.api.types.pandas_dtype(dtype)
                    # Categorical targets can't be built via pa.array here;
                    # force the numpy/pandas fallback below.
                    if is_categorical_dtype(dtype):
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = np_to_pa_dtype(np.dtype(dtype))
                data = as_column(
                    pa.array(arbitrary, type=pa_type, from_pandas=nan_as_null),
                    dtype=dtype,
                    nan_as_null=nan_as_null,
                )
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                if is_categorical_dtype(dtype):
                    sr = pd.Series(arbitrary, dtype="category")
                    data = as_column(sr, nan_as_null=nan_as_null)
                elif np_type == np.str_:
                    sr = pd.Series(arbitrary, dtype="str")
                    data = as_column(sr, nan_as_null=nan_as_null)
                else:
                    data = as_column(
                        np.asarray(arbitrary, dtype=np.dtype(dtype)),
                        nan_as_null=nan_as_null,
                    )
    return data
def __setitem__(self, key, value):
    """
    Set the value of self[key] to value.

    If value and self are of different types,
    value is coerced to self.dtype.

    ``key`` may be a slice (stride 1 only), an integer index column,
    or a boolean mask the same length as the column.
    """
    from cudf.core import column

    if isinstance(key, slice):
        key_start, key_stop, key_stride = key.indices(len(self))
        if key_stride != 1:
            raise NotImplementedError("Stride not supported in slice")
        nelem = abs(key_stop - key_start)
    else:
        key = column.as_column(key)
        if pd.api.types.is_bool_dtype(key.dtype):
            if not len(key) == len(self):
                raise ValueError(
                    "Boolean mask must be of same length as column"
                )
            # Convert the boolean mask to the integer positions it selects.
            key = column.as_column(cudautils.arange(len(self)))[key]
        nelem = len(key)

    if is_scalar(value):
        if is_categorical_dtype(self.dtype):
            # Encode the scalar to its category code, then broadcast.
            from cudf.utils.cudautils import fill_value

            data = rmm.device_array(nelem, dtype=self.codes.dtype)
            fill_value(data, self._encode(value))
            value = build_categorical_column(
                categories=self.dtype.categories,
                codes=as_column(data),
                ordered=self.dtype.ordered,
            )
        elif value is None:
            # Setting None -> all-null column of the target size.
            value = column.column_empty(nelem, self.dtype, masked=True)
        else:
            to_dtype = pd.api.types.pandas_dtype(self.dtype)
            value = utils.scalar_broadcast_to(value, nelem, to_dtype)

    # Coerce the value column to self's dtype.
    value = column.as_column(value).astype(self.dtype)

    if len(value) != nelem:
        msg = (
            f"Size mismatch: cannot set value "
            f"of size {len(value)} to indexing result of size "
            f"{nelem}"
        )
        raise ValueError(msg)

    if is_categorical_dtype(value.dtype):
        # Re-map the value's codes onto self's category set.
        value = value.cat().set_categories(self.categories)
        assert self.dtype == value.dtype

    if isinstance(key, slice):
        out = libcudf.copying.copy_range(
            self, value, key_start, key_stop, 0
        )
    else:
        try:
            out = libcudf.copying.scatter(value, key, self)
        except RuntimeError as e:
            # Translate libcudf's bounds error into a Python IndexError.
            if "out of bounds" in str(e):
                raise IndexError(
                    f"index out of bounds for column of size {len(self)}"
                )
            raise

    self._mimic_inplace(out, inplace=True)
def __getitem__(self, arg):
    """Index into the column.

    ``arg`` may be an integer (element indexing), a slice, an integer
    column (take), or a boolean column (mask).
    """
    from cudf.core.column import column

    if isinstance(arg, Number):
        arg = int(arg)
        return self.element_indexing(arg)
    elif isinstance(arg, slice):

        if is_categorical_dtype(self):
            # Slice the codes; rebuild a categorical column around them.
            codes = self.codes[arg]
            return build_column(
                data=None,
                dtype=self.dtype,
                mask=codes.mask,
                children=(codes,),
            )

        start, stop, stride = arg.indices(len(self))
        if start == stop:
            return column_empty(0, self.dtype, masked=True)
        # compute mask slice
        if self.has_nulls:
            if arg.step is not None and arg.step != 1:
                raise NotImplementedError(arg)

            # slicing data
            slice_data = self.data_array_view[arg]
            # slicing mask: expand bitmask to bytes, slice, re-compact.
            data_size = self.size
            bytemask = cudautils.expand_mask_bits(
                data_size, self.mask_array_view
            )
            slice_mask = cudautils.compact_mask_bytes(bytemask[arg])
        else:
            slice_data = self.data_array_view[arg]
            slice_mask = None
        if self.dtype == "object":
            return as_column(slice_data)
        else:
            if arg.step is not None and arg.step != 1:
                # Strided slice is non-contiguous: must copy to compact it.
                slice_data = cudautils.as_contiguous(slice_data)
                slice_data = Buffer(slice_data)
            else:
                # data Buffer lifetime is tied to self:
                slice_data = Buffer(
                    data=slice_data.device_ctypes_pointer.value,
                    size=slice_data.nbytes,
                    owner=self,
                )

            # mask Buffer lifetime is not:
            if slice_mask is not None:
                slice_mask = Buffer(slice_mask)

            return build_column(slice_data, self.dtype, mask=slice_mask)
    else:
        arg = column.as_column(arg)
        if len(arg) == 0:
            arg = column.as_column([], dtype="int32")
        if pd.api.types.is_integer_dtype(arg.dtype):
            return self.take(arg)
        if pd.api.types.is_bool_dtype(arg.dtype):
            return self.apply_boolean_mask(arg)
        raise NotImplementedError(type(arg))
def _concat(cls, objs, dtype=None):
    """Concatenate a list of columns into a single column.

    Numerical inputs are first cast to a common dtype; all-null inputs
    are cast to the first non-null column's dtype; categoricals have
    their category sets unioned. Strings are concatenated via nvstrings.
    """
    from cudf.core.series import Series
    from cudf.core.column import (
        StringColumn,
        CategoricalColumn,
        NumericalColumn,
    )

    if len(objs) == 0:
        # Empty input: return an empty, masked column of the requested dtype.
        dtype = pd.api.types.pandas_dtype(dtype)
        if is_categorical_dtype(dtype):
            dtype = CategoricalDtype()
        return column_empty(0, dtype=dtype, masked=True)

    # If all columns are `NumericalColumn` with different dtypes,
    # we cast them to a common dtype.
    # Notice, we can always cast pure null columns
    not_null_cols = list(filter(lambda o: len(o) != o.null_count, objs))
    if len(not_null_cols) > 0 and (
        len(
            [
                o
                for o in not_null_cols
                if not isinstance(o, NumericalColumn)
                or np.issubdtype(o.dtype, np.datetime64)
            ]
        )
        == 0
    ):
        col_dtypes = [o.dtype for o in not_null_cols]
        # Use NumPy to find a common dtype
        common_dtype = np.find_common_type(col_dtypes, [])
        # Cast all columns to the common dtype
        for i in range(len(objs)):
            objs[i] = objs[i].astype(common_dtype)

    # Find the first non-null column:
    head = objs[0]
    for i, obj in enumerate(objs):
        if len(obj) != obj.null_count:
            head = obj
            break

    for i, obj in enumerate(objs):
        # Check that all columns are the same type:
        if not pd.api.types.is_dtype_equal(objs[i].dtype, head.dtype):
            # if all null, cast to appropriate dtype
            if len(obj) == obj.null_count:
                from cudf.core.column import column_empty_like

                objs[i] = column_empty_like(
                    head, dtype=head.dtype, masked=True, newsize=len(obj)
                )

    # Handle categories for categoricals
    if all(isinstance(o, CategoricalColumn) for o in objs):
        # Union all category sets, then re-encode every input against it.
        cats = (
            Series(ColumnBase._concat([o.categories for o in objs]))
            .drop_duplicates()
            ._column
        )
        objs = [
            o.cat()._set_categories(cats, is_unique=True) for o in objs
        ]

    head = objs[0]
    for obj in objs:
        if not (obj.dtype == head.dtype):
            raise ValueError("All series must be of same type")

    newsize = sum(map(len, objs))
    if newsize > libcudfxx.MAX_COLUMN_SIZE:
        raise MemoryError(
            "Result of concat cannot have "
            "size > {}".format(libcudfxx.MAX_COLUMN_SIZE_STR)
        )

    # Handle strings separately
    if all(isinstance(o, StringColumn) for o in objs):
        result_nbytes = sum(o._nbytes for o in objs)
        if result_nbytes > libcudfxx.MAX_STRING_COLUMN_BYTES:
            raise MemoryError(
                "Result of concat cannot have > {} bytes".format(
                    libcudfxx.MAX_STRING_COLUMN_BYTES_STR
                )
            )
        objs = [o.nvstrings for o in objs]
        return as_column(nvstrings.from_strings(*objs))

    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]
    nulls = any(col.nullable for col in objs)

    # Pre-allocate the output column's buffers.
    if is_categorical_dtype(head):
        data_dtype = head.codes.dtype
        data = None
        children = (column_empty(newsize, dtype=head.codes.dtype),)
    else:
        data_dtype = head.dtype
        data = Buffer.empty(size=newsize * data_dtype.itemsize)
        children = ()

    # Allocate output mask only if there's nulls in the input objects
    mask = None
    if nulls:
        mask = Buffer(utils.make_mask(newsize))

    col = build_column(
        data=data, dtype=head.dtype, mask=mask, children=children
    )

    # Perform the actual concatenation
    if newsize > 0:
        col = libcudf.concat._column_concat(objs, col)
    return col
def to_numeric(arg, errors="raise", downcast=None):
    """
    Convert argument into numerical types.

    Parameters
    ----------
    arg : column-convertible
        The object to convert to numeric types
    errors : {'raise', 'ignore', 'coerce'}, defaults 'raise'
        Policy to handle errors during parsing.

        * 'raise' will notify user all errors encountered.
        * 'ignore' will skip error and returns ``arg``.
        * 'coerce' will leave invalid values as nulls.
    downcast : {'integer', 'signed', 'unsigned', 'float'}, defaults None
        If set, will try to down-convert the datatype of the
        parsed results to smallest possible type. For each `downcast`
        type, this method will determine the smallest possible
        dtype from the following sets:

        * {'integer', 'signed'}: all integer types greater or equal to
          `np.int8`
        * {'unsigned'}: all unsigned types greater or equal to `np.uint8`
        * {'float'}: all floating types greater or equal to `np.float32`

        Note that downcast behavior is decoupled from parsing. Errors
        encountered during downcast is raised regardless of ``errors``
        parameter.

    Returns
    -------
    Series or ndarray
        Depending on the input, if series is passed in, series is returned,
        otherwise ndarray

    Notes
    -----
    An important difference from pandas is that this function does not accept
    mixed numeric/non-numeric type sequences. For example ``[1, 'a']``.
    A ``TypeError`` will be raised when such input is received, regardless of
    ``errors`` parameter.

    Examples
    --------
    >>> s = cudf.Series(['1', '2.0', '3e3'])
    >>> cudf.to_numeric(s)
    0       1.0
    1       2.0
    2    3000.0
    dtype: float64
    >>> cudf.to_numeric(s, downcast='float')
    0       1.0
    1       2.0
    2    3000.0
    dtype: float32
    >>> cudf.to_numeric(s, downcast='signed')
    0       1
    1       2
    2    3000
    dtype: int16
    >>> s = cudf.Series(['apple', '1.0', '3e3'])
    >>> cudf.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2      3e3
    dtype: object
    >>> cudf.to_numeric(s, errors='coerce')
    0      <NA>
    1       1.0
    2    3000.0
    dtype: float64
    """
    if errors not in {"raise", "ignore", "coerce"}:
        raise ValueError("invalid error value specified")

    if downcast not in {None, "integer", "signed", "unsigned", "float"}:
        raise ValueError("invalid downcasting method provided")

    if not can_convert_to_column(arg) or (
        hasattr(arg, "ndim") and arg.ndim > 1
    ):
        raise ValueError("arg must be column convertible")

    col = as_column(arg)
    dtype = col.dtype

    if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype):
        # Temporal types are reinterpreted as their int64 representation.
        col = col.as_numerical_column(np.dtype("int64"))
    elif is_categorical_dtype(dtype):
        cat_dtype = col.dtype.type
        if is_numerical_dtype(cat_dtype):
            col = col.as_numerical_column(cat_dtype)
        else:
            # Non-numeric categories: decode to strings, then parse.
            try:
                col = _convert_str_col(
                    col._get_decategorized_column(), errors, downcast
                )
            except ValueError as e:
                if errors == "ignore":
                    return arg
                else:
                    raise e
    elif is_string_dtype(dtype):
        try:
            col = _convert_str_col(col, errors, downcast)
        except ValueError as e:
            if errors == "ignore":
                return arg
            else:
                raise e
    elif is_list_dtype(dtype) or is_struct_dtype(dtype):
        raise ValueError("Input does not support nested datatypes")
    elif is_numerical_dtype(dtype):
        # Already numeric; nothing to parse.
        pass
    else:
        raise ValueError("Unrecognized datatype")

    # str->float conversion may require lower precision
    if col.dtype == np.dtype("f"):
        col = col.as_numerical_column("d")

    if downcast:
        downcast_type_map = {
            "integer": list(np.typecodes["Integer"]),
            "signed": list(np.typecodes["Integer"]),
            "unsigned": list(np.typecodes["UnsignedInteger"]),
        }
        # 'float' candidates start at float32 (no half-precision target).
        float_types = list(np.typecodes["Float"])
        idx = float_types.index(np.dtype(np.float32).char)
        downcast_type_map["float"] = float_types[idx:]

        type_set = downcast_type_map[downcast]

        # Candidates are ordered smallest-first; take the first safe cast.
        for t in type_set:
            downcast_dtype = np.dtype(t)
            if downcast_dtype.itemsize <= col.dtype.itemsize:
                if col.can_cast_safely(downcast_dtype):
                    col = libcudf.unary.cast(col, downcast_dtype)
                    break

    if isinstance(arg, (cudf.Series, pd.Series)):
        return cudf.Series(col)
    else:
        # ndarray-like input: return device values with nulls filled.
        col = col.fillna(col.default_na_value())
        return col.values
def assert_column_equal(
    left,
    right,
    check_dtype=True,
    check_column_type="equiv",
    check_less_precise=False,
    check_exact=False,
    check_datetimelike_compat=False,
    check_categorical=True,
    check_category_order=True,
    obj="ColumnBase",
):
    """
    Check that left and right columns are equal

    This function is intended to compare two columns and output
    any differences. Additional parameters allow varying the strictness
    of the equality checks performed.

    Parameters
    ----------
    left : Column
        left Column to compare
    right : Column
        right Column to compare
    check_dtype : bool, default True
        Whether to check the Column dtype is identical.
    check_column_type : bool or {'equiv'}, default 'equiv'
        Whether to check the columns class, dtype and
        inferred_type are identical. Currently it is idle,
        and similar to pandas.
    check_less_precise : bool or int, default False
        Not yet supported
    check_exact : bool, default False
        Whether to compare number exactly.
    check_datetimelike_compat : bool, default False
        Compare datetime-like which is comparable ignoring dtype.
    check_categorical : bool, default True
        Whether to compare internal Categorical exactly.
    check_category_order : bool, default True
        Whether to compare category order of internal Categoricals
    obj : str, default 'ColumnBase'
        Specify object name being compared, internally used to
        show appropriate assertion message.
    """
    if check_dtype is True:
        # Exact dtype check, except categoricals when the caller opted out
        # of categorical comparison.
        if (
            is_categorical_dtype(left)
            and is_categorical_dtype(right)
            and not check_categorical
        ):
            pass
        else:
            if type(left) != type(right) or left.dtype != right.dtype:
                msg1 = f"{left.dtype}"
                msg2 = f"{right.dtype}"
                raise_assert_detail(obj, "Dtypes are different", msg1, msg2)

    if check_datetimelike_compat:
        # Align datetime dtypes before comparing, then compare values only.
        if np.issubdtype(left.dtype, np.datetime64):
            right = right.astype(left.dtype)
        elif np.issubdtype(right.dtype, np.datetime64):
            left = left.astype(right.dtype)

        if np.issubdtype(left.dtype, np.datetime64):
            if not left.equals(right):
                raise AssertionError(
                    f"[datetimelike_compat=True] {left.values} "
                    f"is not equal to {right.values}."
                )
            return

    if check_exact and check_categorical:
        if is_categorical_dtype(left) and is_categorical_dtype(right):
            left_cat = left.cat().categories
            right_cat = right.cat().categories

            if check_category_order:
                assert_index_equal(
                    left_cat,
                    right_cat,
                    exact=check_dtype,
                    check_exact=True,
                    check_categorical=False,
                )
                assert_column_equal(
                    left.codes,
                    right.codes,
                    check_dtype=check_dtype,
                    check_exact=True,
                    check_categorical=False,
                    check_category_order=False,
                )

            if left.ordered != right.ordered:
                msg1 = f"{left.ordered}"
                msg2 = f"{right.ordered}"
                # BUGFIX: was a plain string "{obj} category", which printed
                # the literal placeholder instead of the object name.
                raise_assert_detail(
                    f"{obj} category", "Orders are different", msg1, msg2
                )

    if (
        not check_dtype
        and is_categorical_dtype(left)
        and is_categorical_dtype(right)
    ):
        # Compare category values directly when dtypes need not match.
        left = left.astype(left.categories.dtype)
        right = right.astype(right.categories.dtype)

    columns_equal = False
    try:
        columns_equal = left.equals(right)
    except TypeError as e:
        if str(e) != "Categoricals can only compare with the same type":
            raise e
        # Incompatible categoricals: fall back to comparing decoded values.
        if is_categorical_dtype(left) and is_categorical_dtype(right):
            left = left.astype(left.categories.dtype)
            right = right.astype(right.categories.dtype)
    if not columns_equal:
        msg1 = f"{left.to_array()}"
        msg2 = f"{right.to_array()}"
        try:
            # Percentage of differing elements, best-effort.
            diff = left.apply_boolean_mask(
                left.binary_operator("ne", right)
            ).size
            diff = diff * 100.0 / left.size
        except BaseException:
            diff = 100.0
        raise_assert_detail(
            obj,
            f"values are different ({np.round(diff, 5)} %)",
            msg1,
            msg2,
        )
def _concat(cls, objs, dtype=None):
    """Concatenate a list of columns into a single column (legacy path).

    Numerical inputs are cast to a common dtype, all-null inputs are cast
    to the first non-null column's dtype, and categoricals have their
    category sets unioned before concatenation.
    """
    from cudf.dataframe.series import Series
    from cudf.dataframe.string import StringColumn
    from cudf.dataframe.categorical import CategoricalColumn
    from cudf.dataframe.numerical import NumericalColumn

    if len(objs) == 0:
        # Empty input: build an empty column of the requested dtype.
        dtype = pd.api.types.pandas_dtype(dtype)
        if dtype.type in (np.object_, np.str_):
            return StringColumn(data=nvstrings.to_device([]), null_count=0)
        elif is_categorical_dtype(dtype):
            return CategoricalColumn(
                data=Column(Buffer.null(np.dtype("int8"))),
                null_count=0,
                ordered=False,
            )
        else:
            return Column(Buffer.null(dtype))

    # If all columns are `NumericalColumn` with different dtypes,
    # we cast them to a common dtype.
    # Notice, we can always cast pure null columns
    not_null_cols = list(filter(lambda o: len(o) != o.null_count, objs))
    if len(not_null_cols) > 0 and (
        len(
            [
                o
                for o in not_null_cols
                if not isinstance(o, NumericalColumn)
                or np.issubdtype(o.dtype, np.datetime64)
            ]
        )
        == 0
    ):
        col_dtypes = [o.dtype for o in not_null_cols]
        # Use NumPy to find a common dtype
        common_dtype = np.find_common_type(col_dtypes, [])
        # Cast all columns to the common dtype
        for i in range(len(objs)):
            objs[i] = objs[i].astype(common_dtype)

    # Find the first non-null column:
    head = objs[0]
    for i, obj in enumerate(objs):
        if len(obj) != obj.null_count:
            head = obj
            break

    for i, obj in enumerate(objs):
        # Check that all columns are the same type:
        if not objs[i].is_type_equivalent(head):
            # if all null, cast to appropriate dtype
            if len(obj) == obj.null_count:
                from cudf.dataframe.columnops import column_empty_like

                objs[i] = column_empty_like(
                    head, dtype=head.dtype, masked=True, newsize=len(obj)
                )

    # Handle categories for categoricals
    if all(isinstance(o, CategoricalColumn) for o in objs):
        # Union all category sets, then re-encode every input against it.
        cats = (
            Series(Column._concat([o.categories for o in objs]))
            .drop_duplicates()
            ._column
        )
        objs = [
            o.cat()._set_categories(cats, is_unique=True) for o in objs
        ]

    head = objs[0]
    for obj in objs:
        if not (obj.is_type_equivalent(head)):
            raise ValueError("All series must be of same type")

    # Handle strings separately
    if all(isinstance(o, StringColumn) for o in objs):
        objs = [o._data for o in objs]
        return StringColumn(data=nvstrings.from_strings(*objs))

    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]
    nulls = sum(o.null_count for o in objs)
    newsize = sum(map(len, objs))
    # Pre-allocate the output device buffer.
    mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
    data = Buffer.from_empty(mem, size=newsize)

    # Allocate output mask only if there's nulls in the input objects
    mask = None
    if nulls:
        mask = Buffer(utils.make_mask(newsize))

    col = head.replace(data=data, mask=mask, null_count=nulls)

    # Perform the actual concatenation
    if newsize > 0:
        col = _column_concat(objs, col)

    return col
def as_column(arbitrary, nan_as_null=True, dtype=None, name=None):
    """Create a Column from an arbitrary object

    Currently support inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * cuda array interface
    * numpy array
    * pyarrow array
    * pandas.Categorical
    * Object exposing ``__cuda_array_interface__``

    Parameters
    ----------
    arbitrary : object
        The object to convert (see list above).
    nan_as_null : bool, default True
        If True, floating-point NaNs (and Arrow nulls) become null
        values in the resulting column.
    dtype : dtype-like, optional
        Target dtype; honored by most (but not all) branches below.
    name : str, optional
        Name for the resulting column; falls back to ``arbitrary.name``
        when present.

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - DatetimeColumn for datetime input.
        - StringColumn for string input.
        - NumericalColumn for all other inputs.
    """
    from cudf.dataframe import numerical, categorical, datetime, string
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index
    from cudf.bindings.cudf_cpp import np_to_pa_dtype

    if name is None and hasattr(arbitrary, "name"):
        name = arbitrary.name

    if isinstance(arbitrary, Column):
        # Re-wrap an existing column's buffers without copying.
        categories = None
        if hasattr(arbitrary, "categories"):
            categories = arbitrary.categories
        data = build_column(
            arbitrary.data,
            arbitrary.dtype,
            mask=arbitrary.mask,
            categories=categories,
        )

    elif isinstance(arbitrary, Series):
        data = arbitrary._column
        if dtype is not None:
            data = data.astype(dtype)

    elif isinstance(arbitrary, Index):
        data = arbitrary._values
        if dtype is not None:
            data = data.astype(dtype)

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary,
                                         dtype=arbitrary.dtype)

    elif isinstance(arbitrary, nvstrings.nvstrings):
        data = string.StringColumn(data=arbitrary)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        # Numba device array: wrap in a Buffer, then optionally build a
        # null mask from NaN positions for float data.
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudf.bindings.utils.mask_from_devary(data)
                data = data.set_mask(mask)

    elif hasattr(arbitrary, "__cuda_array_interface__"):
        from cudf.bindings.cudf_cpp import count_nonzero_mask

        desc = arbitrary.__cuda_array_interface__
        data = _data_from_cuda_array_interface_desc(desc)
        mask = _mask_from_cuda_array_interface_desc(desc)
        if mask is not None:
            # Derive null_count from the bitmask population count.
            nelem = len(data.mem)
            nnz = count_nonzero_mask(mask.mem, size=nelem)
            null_count = nelem - nnz
        else:
            null_count = 0
        # NOTE(review): this branch returns directly, skipping the
        # trailing name-assignment fallthrough (name is passed here).
        return build_column(data, dtype=data.dtype, mask=mask, name=name,
                            null_count=null_count)

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags["C_CONTIGUOUS"]:
            arbitrary = np.ascontiguousarray(arbitrary)
        if dtype is not None:
            arbitrary = arbitrary.astype(dtype)
        if arbitrary.dtype.kind == "M":
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ("O", "U"):
            # Strings/objects: go through Arrow for conversion.
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary),
                             nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            count = len(arbitrary)
            null_count = arbitrary.null_count
            buffers = arbitrary.buffers()
            # Buffer of actual strings values
            if buffers[2] is not None:
                sbuf = np.frombuffer(buffers[2], dtype="int8")
            else:
                sbuf = np.empty(0, dtype="int8")
            # Buffer of offsets values
            obuf = np.frombuffer(buffers[1], dtype="int32")
            # Buffer of null bitmask
            nbuf = None
            if null_count > 0:
                nbuf = np.frombuffer(buffers[0], dtype="int8")
            data = as_column(
                nvstrings.from_offsets(sbuf, obuf, count, nbuf=nbuf,
                                       ncount=null_count))
        elif isinstance(arbitrary, pa.NullArray):
            # All-null array: pick a concrete dtype, then rebuild.
            new_dtype = pd.api.types.pandas_dtype(dtype)
            if (type(dtype) == str and dtype == "empty") or dtype is None:
                new_dtype = pd.api.types.pandas_dtype(
                    arbitrary.type.to_pandas_dtype())
            if is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary), ), dtype=new_dtype)
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary), ), dtype=new_dtype)
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary,
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            dtype = np.dtype("M8[{}]".format(arbitrary.type.unit))
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=dtype,
            )
        elif isinstance(arbitrary, pa.Date64Array):
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype="M8[ms]")
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype("M8[ms]"),
            )
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value",
                UserWarning,
            )
            data = as_column(arbitrary.cast(pa.int32())).astype("M8[ms]")
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=dtype,
            )
        else:
            # Generic numeric Arrow array.
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()),
            )

    elif isinstance(arbitrary, pa.ChunkedArray):
        # Convert each chunk independently, then concat on device.
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]
        if dtype and dtype != "empty":
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = "category"
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())
        data = Column._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.array(arbitrary),
                                      from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]),
                                  from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        # Single scalar -> one-element column.
        if hasattr(arbitrary, "dtype"):
            data_type = np_to_pa_dtype(arbitrary.dtype)
            # PyArrow can't construct date64 or date32 arrays from np
            # datetime types
            if pa.types.is_date64(data_type) or pa.types.is_date32(
                    data_type):
                arbitrary = arbitrary.astype("int64")
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]),
                             nan_as_null=nan_as_null)

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary), dtype=dtype,
                         nan_as_null=nan_as_null)

    else:
        # Last resort: try the buffer protocol, then Arrow, then NumPy.
        try:
            data = as_column(memoryview(arbitrary), dtype=dtype,
                             nan_as_null=nan_as_null)
        except TypeError:
            pa_type = None
            np_type = None
            try:
                if dtype is not None:
                    dtype = pd.api.types.pandas_dtype(dtype)
                    if is_categorical_dtype(dtype):
                        # Arrow cannot take a categorical dtype directly;
                        # jump to the pandas fallback below.
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = np_to_pa_dtype(np.dtype(dtype))
                data = as_column(
                    pa.array(arbitrary, type=pa_type,
                             from_pandas=nan_as_null),
                    dtype=dtype,
                    nan_as_null=nan_as_null,
                )
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                if is_categorical_dtype(dtype):
                    data = as_column(
                        pd.Series(arbitrary, dtype="category"),
                        nan_as_null=nan_as_null,
                    )
                else:
                    data = as_column(
                        np.array(arbitrary, dtype=np_type),
                        nan_as_null=nan_as_null,
                    )

    # Propagate the requested name onto the result when supported.
    if hasattr(data, "name") and (name is not None):
        data.name = name
    return data
def __setitem__(self, key, value):
    """
    Set the value of self[key] to value.

    If value and self are of different types,
    value is coerced to self.dtype.

    ``key`` may be a slice, a boolean mask column, or anything
    convertible to an integer index column; ``value`` may be a scalar
    or anything convertible to a column of matching length. Raises
    ``ValueError`` on length mismatches and ``IndexError`` on
    out-of-bounds scatter indices.
    """
    from cudf.core import column

    if isinstance(key, slice):
        # slice.indices() clamps start/stop into [0, len(self)] and
        # resolves negative/None components.
        key_start, key_stop, key_stride = key.indices(len(self))
        # NOTE(review): slice.indices already returns non-negative
        # bounds, so these two adjustments look like dead defensive
        # code — confirm before removing.
        if key_start < 0:
            key_start = key_start + len(self)
        if key_stop < 0:
            key_stop = key_stop + len(self)
        if key_start >= key_stop:
            # Empty selection: nothing to write.
            return self.copy()
        if (key_stride is None or key_stride == 1) and is_scalar(value):
            # Fast path: contiguous fill with a scalar.
            return self._fill(value, key_start, key_stop, inplace=True)
        # NOTE(review): key_stride is always an int here, so
        # `key_stride is not None` makes this condition always true and
        # the else-branch below unreachable — confirm intent.
        if key_stride != 1 or key_stride is not None or is_scalar(value):
            # Materialize the slice as an explicit index column for the
            # scatter path.
            key = as_column(
                cupy.arange(
                    start=key_start,
                    stop=key_stop,
                    step=key_stride,
                    dtype=np.dtype(np.int32),
                ))
            nelem = len(key)
        else:
            nelem = abs(key_stop - key_start)
    else:
        key = column.as_column(key)
        if pd.api.types.is_bool_dtype(key.dtype):
            # Boolean mask: convert to the integer positions it selects.
            if not len(key) == len(self):
                raise ValueError(
                    "Boolean mask must be of same length as column")
            key = column.as_column(cupy.arange(len(self)))[key]
        nelem = len(key)

    # Coerce `value` to this column's dtype.
    if is_scalar(value):
        if is_categorical_dtype(self.dtype):
            # Translate the scalar into its category code.
            value = self._encode(value)
        else:
            value = self.dtype.type(value) if value is not None else value
    else:
        if len(value) != nelem:
            msg = (f"Size mismatch: cannot set value "
                   f"of size {len(value)} to indexing result of size "
                   f"{nelem}")
            raise ValueError(msg)
        value = column.as_column(value).astype(self.dtype)
        if is_categorical_dtype(value.dtype):
            value = value.cat().set_categories(self.categories)
            assert self.dtype == value.dtype

    if (isinstance(key, slice) and (key_stride == 1
                                    or key_stride is None)
            and not is_scalar(value)):
        # Contiguous range + column value: use copy_range, which avoids
        # building an index column.
        out = libcudfxx.copying.copy_range(value, self, 0, nelem,
                                           key_start, key_stop, False)
        if is_categorical_dtype(value.dtype):
            # copy_range worked on codes; rewrap as a categorical.
            out = build_categorical_column(
                categories=value.categories,
                codes=as_column(out.base_data, dtype=out.dtype),
                mask=out.base_mask,
                size=out.size,
                offset=out.offset,
                ordered=value.ordered,
            )
    else:
        # General path: scatter `value` at positions `key`.
        try:
            if is_scalar(value):
                input = self
                if is_categorical_dtype(self.dtype):
                    # Scatter into the codes column, not the categorical.
                    input = self.codes
                out = input.as_frame()._scatter(key, [value])._as_column()
                if is_categorical_dtype(self.dtype):
                    out = build_categorical_column(
                        categories=self.categories,
                        codes=as_column(out.base_data, dtype=out.dtype),
                        mask=out.base_mask,
                        size=out.size,
                        offset=out.offset,
                        ordered=self.ordered,
                    )
            else:
                if not isinstance(value, Column):
                    value = as_column(value)
                out = (self.as_frame()._scatter(
                    key, value.as_frame())._as_column())
        except RuntimeError as e:
            # Translate libcudf's out-of-bounds error into the Python
            # exception callers expect.
            if "out of bounds" in str(e):
                raise IndexError(
                    f"index out of bounds for column of size {len(self)}")
            raise

    # Replace this column's contents with the scattered result.
    self._mimic_inplace(out, inplace=True)
def _concat(cls, objs, dtype=None):
    """Concatenate a list of columns into a single new column.

    Parameters
    ----------
    objs : list of Column
        Columns to concatenate. May be mutated in place (elements are
        cast to a common dtype / re-categorized before concatenation).
    dtype : dtype-like, optional
        Only consulted when ``objs`` is empty, to decide the dtype of
        the empty result column.

    Returns
    -------
    Column
        The concatenated column (a CategoricalColumn when all inputs
        are categorical).

    Raises
    ------
    ValueError
        If the non-null inputs do not all share one dtype.
    MemoryError
        If the result would exceed libcudf's maximum column size.
    """
    if len(objs) == 0:
        dtype = pd.api.types.pandas_dtype(dtype)
        if is_categorical_dtype(dtype):
            dtype = CategoricalDtype()
        return column_empty(0, dtype=dtype, masked=True)

    # If all columns are `NumericalColumn` with different dtypes,
    # we cast them to a common dtype.
    # Notice, we can always cast pure null columns
    not_null_cols = list(filter(lambda o: o.valid_count > 0, objs))
    # Datetime columns are excluded from the common-dtype promotion.
    if len(not_null_cols) > 0 and (len([
            o for o in not_null_cols
            if not is_numerical_dtype(o.dtype)
            or np.issubdtype(o.dtype, np.datetime64)
    ]) == 0):
        col_dtypes = [o.dtype for o in not_null_cols]
        # Use NumPy to find a common dtype
        common_dtype = np.find_common_type(col_dtypes, [])
        # Cast all columns to the common dtype
        for i in range(len(objs)):
            objs[i] = objs[i].astype(common_dtype)

    # Find the first non-null column (its dtype drives the result):
    head = objs[0]
    for i, obj in enumerate(objs):
        if obj.valid_count > 0:
            head = obj
            break

    for i, obj in enumerate(objs):
        # Check that all columns are the same type:
        if not pd.api.types.is_dtype_equal(obj.dtype, head.dtype):
            # if all null, cast to appropriate dtype
            if obj.valid_count == 0:
                objs[i] = column_empty_like(head,
                                            dtype=head.dtype,
                                            masked=True,
                                            newsize=len(obj))
            else:
                raise ValueError("All columns must be the same type")

    cats = None
    is_categorical = all(is_categorical_dtype(o.dtype) for o in objs)

    # Combine CategoricalColumn categories
    if is_categorical:
        # Combine and de-dupe the categories
        cats = (cudf.concat([o.cat().categories for o in objs
                             ]).to_series().drop_duplicates()._column)
        objs = [
            o.cat()._set_categories(cats, is_unique=True) for o in objs
        ]
        # Map `objs` into a list of the codes until we port Categorical to
        # use the libcudf++ Category data type.
        objs = [o.cat().codes._column for o in objs]
        head = head.cat().codes._column

    newsize = sum(map(len, objs))
    if newsize > libcudfxx.MAX_COLUMN_SIZE:
        raise MemoryError("Result of concat cannot have "
                          "size > {}".format(
                              libcudfxx.MAX_COLUMN_SIZE_STR))

    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]

    # Perform the actual concatenation
    if newsize > 0:
        col = libcudfxx.concat.concat_columns(objs)
    else:
        col = column_empty(0, head.dtype, masked=True)

    # Re-wrap the concatenated codes as a categorical column.
    if is_categorical:
        col = build_categorical_column(
            categories=cats,
            codes=as_column(col.base_data, dtype=col.dtype),
            mask=col.base_mask,
            size=col.size,
            offset=col.offset,
        )

    return col