def _copy_type_metadata(self, other, include_index: bool = True):
    """
    Propagate per-column type metadata from `other` onto `self`.

    Columns are paired by position: the i-th column of `self` is replaced
    by itself with the dtype metadata of the i-th column of `other`.
    See `ColumnBase._with_type_metadata` for more information.

    Parameters
    ----------
    other
        Object exposing `_data` and `_index` whose column dtypes are the
        source of the metadata.
    include_index : bool, default True
        When True, and both objects have a non-None `_index`, the index
        metadata is copied as well.

    Returns
    -------
    self
    """
    column_triples = zip(
        self._data.keys(), self._data.values(), other._data.values()
    )
    for label, own_col, src_col in column_triples:
        # libcudf APIs lose all information about GeoColumns, operating
        # solely on the underlying base data. The only recourse is to
        # build a new GeoColumn by hand around the data we got back,
        # because there is no easy NumericalColumn -> GeoColumn path.
        if isinstance(src_col, GeoColumn):
            own_col = GeoColumn(
                src_col._geo, src_col._meta, cudf.Index(own_col)
            )
        typed_col = own_col._with_type_metadata(src_col.dtype)
        self._data.set_by_label(label, typed_col, validate=False)

    if (
        include_index
        and self._index is not None
        and other._index is not None
    ):
        self._index._copy_type_metadata(other._index)
        # NOTE(review): the original comment here was cut off. What the
        # code does: if `other` has a CategoricalIndex but ours is still
        # not one after the metadata copy, rebuild our index from its
        # column through the generic `cudf.Index` constructor.
        categorical_cls = cudf.core.index.CategoricalIndex
        other_is_categorical = isinstance(other._index, categorical_cls)
        if other_is_categorical and not isinstance(
            self._index, categorical_cls
        ):
            self._index = cudf.Index(self._index._column)

    return self
def test_timedelta_datetime_index_ops_misc(
    datetime_data, timedelta_data, datetime_dtype, timedelta_dtype
):
    """Datetime index -/+ timedelta index must agree with pandas."""
    gpu_datetimes = cudf.Index(datetime_data, dtype=datetime_dtype)
    gpu_timedeltas = cudf.Index(timedelta_data, dtype=timedelta_dtype)

    # The pandas references are derived from the cudf objects so both
    # sides start from identical values.
    cpu_datetimes = gpu_datetimes.to_pandas()
    cpu_timedeltas = gpu_timedeltas.to_pandas()

    gpu_diff = gpu_datetimes - gpu_timedeltas
    cpu_diff = cpu_datetimes - cpu_timedeltas
    assert_eq(gpu_diff, cpu_diff)

    gpu_sum = gpu_datetimes + gpu_timedeltas
    cpu_sum = cpu_datetimes + cpu_timedeltas
    assert_eq(gpu_sum, cpu_sum)
def astype(self, dtype, copy=False):
    """
    Return an Index whose values are cast to `dtype`.

    The class of the returned Index is chosen by `cudf.Index` from the
    cast values.

    Parameters
    ----------
    dtype : numpy dtype
        Target dtype for the whole Index.
    copy : bool, default False
        Forwarded as `deep=copy` to `self.copy` before any cast. When
        `dtype` already equals `self.dtype`, that copy is what is
        returned.

    Returns
    -------
    Index
        Index with values cast to the specified dtype.

    Examples
    --------
    >>> import cudf
    >>> index = cudf.Index([1, 2, 3])
    >>> index
    Int64Index([1, 2, 3], dtype='int64')
    >>> index.astype('float64')
    Float64Index([1.0, 2.0, 3.0], dtype='float64')
    """
    duplicate = self.copy(deep=copy)
    if is_dtype_equal(dtype, self.dtype):
        # Nothing to cast: hand back the (shallow or deep) copy as-is.
        return duplicate
    cast_values = duplicate._values.astype(dtype)
    return cudf.Index(cast_values, name=self.name)
def test_categorical_basic():
    """Codes, categories, ordering and repr of a categorical Series."""
    row_labels = ["p", "q", "r", "s", "t"]
    cat = pd.Categorical(
        ["a", "a", "b", "c", "a"], categories=["a", "b", "c"]
    )
    cudf_cat = cudf.Index(cat)

    pdsr = pd.Series(cat, index=row_labels)
    sr = cudf.Series(cat, index=row_labels)
    assert_eq(pdsr.cat.codes, sr.cat.codes, check_dtype=False)

    # Test attributes
    assert_eq(pdsr.cat.categories, sr.cat.categories)
    assert pdsr.cat.ordered == sr.cat.ordered

    np.testing.assert_array_equal(
        pdsr.cat.codes.values, sr.cat.codes.to_array()
    )

    # Compare the printed Series token by token so that column padding
    # in the repr does not matter.
    string = str(sr)
    expect_str = """ p a q a r b s c t a """
    for got_token, want_token in zip(string.split(), expect_str.split()):
        assert got_token == want_token

    assert_eq(cat.codes, cudf_cat.codes.to_array())
def _union_categoricals(
    to_union: List[Union[cudf.Series, cudf.Index]],
    sort_categories: bool = False,
    ignore_order: bool = False,
):
    """Combine categorical data.

    This API is currently internal but should be exposed once full
    support for cudf.Categorical is ready.

    Parameters
    ----------
    to_union : list of cudf.Series or cudf.Index
        Objects whose `_column` attributes are concatenated.
    sort_categories : bool, default False
        When True, the categories of the result are reordered by their
        sorted values (ascending).
    ignore_order : bool, default False
        Not implemented; any truthy value raises `TypeError`.

    Returns
    -------
    cudf.Index
        Index built from the concatenated categorical column.
    """
    # TODO(s) in the order specified :
    # 1. The return type needs to be changed
    #    to cudf.Categorical once it is implemented.
    # 2. Make this API public (i.e., to resemble
    #    pd.api.types.union_categoricals)

    if ignore_order:
        raise TypeError("ignore_order is not yet implemented")

    source_columns = [obj._column for obj in to_union]
    combined = cudf.core.column.CategoricalColumn._concat(source_columns)

    if sort_categories:
        ordered_categories = combined.categories.sort_by_values(
            ascending=True
        )[0]
        combined = combined.reorder_categories(
            new_categories=ordered_categories
        )

    return cudf.Index(combined)
def test_index_sample_basic(n, frac, replace):
    """
    `Index.sample` must mirror `pandas.Series.sample`.

    If pandas rejects the arguments, cudf must raise the same exception
    type with the same message; otherwise the sampled shapes must match.
    """
    # Local import: only the message matching below needs it.
    import re

    psr = pd.Series([1, 2, 3, 4, 5])
    gindex = cudf.Index(psr)
    sample_kwargs = {
        "n": n,
        "frac": frac,
        "replace": replace,
        "random_state": 0,
    }

    expected_type = None
    expected_msg = None
    try:
        pout = psr.sample(**sample_kwargs)
    except Exception as err:
        # `Exception` rather than `BaseException`, so KeyboardInterrupt
        # and pytest's own control-flow exceptions are not swallowed.
        expected_type = type(err)
        expected_msg = str(err)

    if expected_type is not None:
        # `match` is applied as a regular expression; escape the pandas
        # message so it is compared literally.
        with pytest.raises(expected_type, match=re.escape(expected_msg)):
            gindex.sample(**sample_kwargs)
        return

    gout = gindex.sample(**sample_kwargs)
    assert pout.shape == gout.shape
def __setitem__(self, key, value):
    """
    Assign `value` at `key`, refusing values outside the categories.

    Raises
    ------
    ValueError
        If `value` (other than a null scalar) contains anything that is
        not already one of `self.categories`.
    """
    value_is_scalar = cudf.utils.dtypes.is_scalar(value)
    value_is_null_scalar = (
        value_is_scalar and cudf._lib.scalar._is_null_host_scalar(value)
    )

    # Null scalars never introduce a category, so only non-null input is
    # checked against the existing categories.
    if not value_is_null_scalar:
        unseen = cudf.Index(value).difference(self.categories)
        if len(unseen) > 0:
            raise ValueError(
                "Cannot setitem on a Categorical with a new "
                "category, set the categories first"
            )

    # Translate the incoming value(s) into category codes.
    if value_is_scalar:
        if value is not None:
            value = self._encode(value)
    else:
        value = cudf.core.column.as_column(value).astype(self.dtype).codes

    codes = self.codes
    codes[key] = value
    rebuilt = cudf.core.column.build_categorical_column(
        categories=self.categories,
        codes=codes,
        mask=codes.base_mask,
        size=codes.size,
        offset=self.offset,
        ordered=self.ordered,
    )
    self._mimic_inplace(rebuilt, inplace=True)
def test_timedelta_index_ops_with_scalars(data, other_scalars, dtype, op):
    """
    Binary ops between a timedelta index and a scalar must match pandas.

    Both operand orders are checked: `index <op> scalar` and
    `scalar <op> index`.
    """
    gtdi = cudf.Index(data=data, dtype=dtype)
    ptdi = gtdi.to_pandas()

    binary_ops = {
        "add": lambda lhs, rhs: lhs + rhs,
        "sub": lambda lhs, rhs: lhs - rhs,
        "truediv": lambda lhs, rhs: lhs / rhs,
        "floordiv": lambda lhs, rhs: lhs // rhs,
    }
    apply_op = binary_ops[op]

    # index on the left-hand side
    expected = apply_op(ptdi, other_scalars)
    actual = apply_op(gtdi, other_scalars)
    assert_eq(expected, actual)

    # index on the right-hand side (reflected operators)
    expected = apply_op(other_scalars, ptdi)
    actual = apply_op(other_scalars, gtdi)
    assert_eq(expected, actual)
def test_timedelta_index_ops_with_cudf_scalars(data, cpu_scalar, dtype, op):
    """
    Timedelta index ops with a `cudf.Scalar` must match pandas.

    The pandas side uses the host scalar, the cudf side the device
    scalar built from it; both operand orders are checked.
    """
    gtdi = cudf.Index(data=data, dtype=dtype)
    ptdi = gtdi.to_pandas()
    gpu_scalar = cudf.Scalar(cpu_scalar)

    binary_ops = {
        "add": lambda lhs, rhs: lhs + rhs,
        "sub": lambda lhs, rhs: lhs - rhs,
        "truediv": lambda lhs, rhs: lhs / rhs,
        "floordiv": lambda lhs, rhs: lhs // rhs,
    }
    apply_op = binary_ops[op]

    # index on the left-hand side
    expected = apply_op(ptdi, cpu_scalar)
    actual = apply_op(gtdi, gpu_scalar)
    assert_eq(expected, actual)

    # index on the right-hand side (reflected operators)
    expected = apply_op(cpu_scalar, ptdi)
    actual = apply_op(gpu_scalar, gtdi)
    assert_eq(expected, actual)
def _setitem_tuple_arg(self, key, value):
    """
    Handle `df.loc[row_key, col_key] = value`.

    If the column selection succeeds, `value` is assigned into each
    selected column at `key[0]`. If it raises `KeyError`, a new column
    labelled `key[1]` is inserted instead.

    Raises
    ------
    NotImplementedError
        If the frame's index is a `cudf.MultiIndex` or its columns are a
        `pd.MultiIndex`.
    """
    uses_multiindex = isinstance(
        self._df.index, cudf.MultiIndex
    ) or isinstance(self._df.columns, pd.MultiIndex)
    if uses_multiindex:
        raise NotImplementedError(
            "Setting values using df.loc[] not supported on "
            "DataFrames with a MultiIndex"
        )

    row_key, col_key = key[0], key[1]

    try:
        columns = self._get_column_selection(col_key)
    except KeyError:
        # Column selection failed: build a brand-new column.
        df_is_empty = self._df.empty
        row_key_is_slice = isinstance(row_key, slice)

        # Work out which index the new column should carry.
        if row_key_is_slice and not df_is_empty:
            pos_range = get_label_range_or_mask(
                self._df.index, row_key.start, row_key.stop, row_key.step
            )
            idx = self._df.index[pos_range]
        elif row_key_is_slice:
            idx = None
        else:
            idx = cudf.Index(row_key)

        if is_scalar(value):
            length = 1 if idx is None else len(idx)
            value = as_column(value, length=length)

        new_col = cudf.Series(value, index=idx)
        if df_is_empty:
            # An empty frame adopts the new column's index.
            self._df.index = (
                cudf.RangeIndex(len(new_col)) if idx is None else idx
            )
        else:
            new_col = new_col._align_to_index(self._df.index, how="right")
        self._df._data.insert(col_key, new_col)
    else:
        for col in columns:
            self._df[col].loc[row_key] = value
def test_index_to_arrow(data):
    """`Index.to_arrow` must equal the Arrow array built from pandas."""
    pandas_index = pd.Index(data)
    cudf_index = cudf.Index(data)

    expected = pa.Array.from_pandas(pandas_index)
    got = cudf_index.to_arrow()

    assert_eq(expected, got)
def test_is_categorical_dispatch():
    """`is_categorical_dtype` accepts pandas and cudf dtypes/containers."""
    categorical_likes = (
        pd.CategoricalDtype([1, 2, 3]),
        cudf.CategoricalDtype([1, 2, 3]),
        cudf.Series([1, 2, 3], dtype="category"),
        pd.Series([1, 2, 3], dtype="category"),
        pd.Index([1, 2, 3], dtype="category"),
        cudf.Index([1, 2, 3], dtype="category"),
    )
    for candidate in categorical_likes:
        assert is_categorical_dtype(candidate)
def test_multiindex_sample_basic(n, frac, replace, axis):
    """
    Sampling an index built from a DataFrame must mirror
    `pandas.DataFrame.sample`, in result shape or in the error raised.
    """
    # as we currently don't support column with same name
    if axis == 1 and replace:
        return

    pdf = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5],
            "float": [0.05, 0.2, 0.3, 0.2, 0.25],
            "int": [1, 3, 5, 4, 2],
        },
    )
    mul_index = cudf.Index(DataFrame.from_pandas(pdf))

    sample_kwargs = {
        "n": n,
        "frac": frac,
        "replace": replace,
        "random_state": 0,
        "axis": axis,
    }

    try:
        pout = pdf.sample(**sample_kwargs)
    except BaseException:
        # pandas rejected the arguments; cudf must fail the same way.
        # Each side receives its own copy of the keyword arguments.
        assert_exceptions_equal(
            lfunc=pdf.sample,
            rfunc=mul_index.sample,
            lfunc_args_and_kwargs=([], dict(sample_kwargs)),
            rfunc_args_and_kwargs=([], dict(sample_kwargs)),
        )
    else:
        gout = mul_index.sample(**sample_kwargs)
        assert pout.shape == gout.shape
def __getitem__(self, arg):
    """
    Index into the wrapped Series' column.

    A tuple `arg` is converted to a list first. If the column lookup
    yields a dict, a list, a scalar / zero-d array, or a null host
    scalar, that value is returned directly. Otherwise the result is
    wrapped via `self._sr._from_data`, with the index taken at `arg`.
    """
    if isinstance(arg, tuple):
        arg = list(arg)

    selected = self._sr._column[arg]

    is_plain_value = (
        isinstance(selected, (dict, list))
        or _is_scalar_or_zero_d_array(selected)
        or _is_null_host_scalar(selected)
    )
    if is_plain_value:
        return selected

    taken_index = cudf.Index(self._sr.index.take(arg))
    return self._sr._from_data(
        {self._sr.name: selected}, index=taken_index
    )
def test_index_difference_sort_error():
    """`difference(sort=True)` must fail identically in pandas and cudf."""
    pdi = pd.Index([1, 2, 3])
    gdi = cudf.Index([1, 2, 3])

    pandas_call = ([pdi], {"sort": True})
    cudf_call = ([gdi], {"sort": True})

    assert_exceptions_equal(
        pdi.difference,
        gdi.difference,
        pandas_call,
        cudf_call,
    )
def run(
    self,
    train_df: cudf.DataFrame,
    test_df: Optional[cudf.DataFrame] = None,
    log: bool = False,
):
    """
    Build the features and namespace the resulting column names.

    Calls `self.create_features(train_df, test_df=test_df)` inside the
    `timer` context. The columns of `self.train`, `self.valid` and
    `self.test` are then converted to strings and renamed to
    `<prefix>_<column>_<suffix>`. An empty prefix or suffix contributes
    nothing, including its separator.

    NOTE(review): `self.train`, `self.valid` and `self.test` are
    presumably populated by `create_features` — confirm in subclasses.

    Parameters
    ----------
    train_df : cudf.DataFrame
        Forwarded to `create_features`.
    test_df : cudf.DataFrame, optional
        Forwarded to `create_features`.
    log : bool, default False
        Forwarded to `timer`.

    Returns
    -------
    self
    """
    with timer(self.name, log=log):
        self.create_features(train_df, test_df=test_df)

        prefix = self.prefix + "_" if self.prefix else ""
        # The separator goes *before* the suffix. The previous
        # `self.suffix + "_"` glued the suffix onto the column name and
        # left a trailing underscore ("colsuffix_").
        suffix = "_" + self.suffix if self.suffix else ""

        for frame in (self.train, self.valid, self.test):
            # Stringify the labels first so the concatenation below
            # works for non-string column names as well.
            frame.columns = cudf.Index(
                [str(c) for c in frame.columns]
            ).to_array()
            frame.columns = prefix + frame.columns + suffix

    return self
def test_index_tolist(data, dtype):
    """`Index.tolist` is unsupported and must raise a helpful TypeError."""
    gdi = cudf.Index(data, dtype=dtype)

    # Escaped because the message contains regex metacharacters.
    expected_message = re.escape(
        r"cuDF does not support conversion to host memory "
        r"via `tolist()` method. Consider using "
        r"`.to_arrow().to_pylist()` to construct a Python list."
    )

    with pytest.raises(TypeError, match=expected_message):
        gdi.tolist()
def test_index_iter_error(data, dtype):
    """Iterating an Index must raise a TypeError naming its class."""
    gdi = cudf.Index(data, dtype=dtype)

    # Escaped because the message contains regex metacharacters.
    expected_message = re.escape(
        f"{gdi.__class__.__name__} object is not iterable. "
        f"Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` "
        f"if you wish to iterate over the values."
    )

    with pytest.raises(TypeError, match=expected_message):
        iter(gdi)
def test_index_difference_sort_error():
    """
    `difference(sort=True)` must fail in cudf exactly as it does in
    pandas: same exception type and same message.
    """
    # Local import: only the message matching below needs it.
    import re

    pdi = pd.Index([1, 2, 3])
    gdi = cudf.Index([1, 2, 3])

    try:
        pdi.difference(pdi, sort=True)
    except Exception as err:
        expected_type = type(err)
        expected_msg = str(err)
    else:
        raise AssertionError("Expected pdi.difference to fail when sort=True")

    # `match` is applied as a regular expression; escape the pandas
    # message so it is compared literally. The check runs outside the
    # `except` block so a failure here is not reported as a chained error.
    with pytest.raises(expected_type, match=re.escape(expected_msg)):
        gdi.difference(gdi, sort=True)
def difference(self, other, sort=None):
    """
    Return the elements of this Index that do not appear in `other`.

    This is the set difference of two Index objects.

    Parameters
    ----------
    other : Index or array-like
        Converted with `cudf.Index` before use.
    sort : False or None, default None
        Whether to sort the resulting index.

        * None : Sort the result (skipped when `other` is empty).
        * False : Do not sort the result.

    Returns
    -------
    difference : Index

    Raises
    ------
    ValueError
        If `sort` is neither None nor False.

    Examples
    --------
    >>> import cudf
    >>> idx1 = cudf.Index([2, 1, 3, 4])
    >>> idx1
    Int64Index([2, 1, 3, 4], dtype='int64')
    >>> idx2 = cudf.Index([3, 4, 5, 6])
    >>> idx2
    Int64Index([3, 4, 5, 6], dtype='int64')
    >>> idx1.difference(idx2)
    Int64Index([1, 2], dtype='int64')
    >>> idx1.difference(idx2, sort=False)
    Int64Index([2, 1], dtype='int64')
    """
    sort_is_supported = sort in {None, False}
    if not sort_is_supported:
        raise ValueError(
            "The 'sort' keyword only takes the values "
            f"of None or False; {sort} was passed."
        )

    other = cudf.Index(other)

    if is_mixed_with_object_dtype(self, other):
        # For this mixed-dtype case the result is just a copy of self.
        result = self.copy()
    else:
        result = self.join(other, how="leftanti")
        if self.dtype != other.dtype:
            # Cast the join result back to this index's dtype.
            result = result.astype(self.dtype)

    wants_sorted = sort is None and len(other)
    return result.sort_values() if wants_sorted else result
def test_categorical_allow_nan():
    """NaN is kept as its own category when `nan_as_null=False`."""
    raw = cudf.Series([1, 2, np.nan, 10, np.nan, None], nan_as_null=False)
    gs = raw.astype("category")

    # The null entry has a null code; NaN gets a code of its own.
    expected_codes = cudf.Series([0, 1, 3, 2, 3, None], dtype="uint8")
    assert_eq(expected_codes, gs.cat.codes)

    expected_categories = cudf.Index(
        [1.0, 2.0, 10.0, np.nan], dtype="float64"
    )
    assert_eq(expected_categories, gs.cat.categories)

    # Round trip to pandas.
    actual_ps = gs.to_pandas()
    expected_ps = pd.Series(
        [1.0, 2.0, np.nan, 10.0, np.nan, np.nan], dtype="category"
    )
    assert_eq(actual_ps, expected_ps)
def test_cudf_factorize_index():
    """`cudf.factorize` on an Index must agree with `pd.factorize`."""
    data = [1, 2, 3, 4, 5]
    pandas_index = pd.Index(data)
    cudf_index = cudf.Index(data)

    expect = pd.factorize(pandas_index)
    got = cudf.factorize(cudf_index)
    assert len(expect) == len(got)

    # `.get()` brings the cudf results to host for the comparison.
    got_codes = got[0].get()
    got_uniques = got[1].values.get()
    np.testing.assert_array_equal(expect[0], got_codes)
    np.testing.assert_array_equal(expect[1], got_uniques)
def test_multiindex_sample_basic(n, frac, replace, axis):
    """
    Sampling an index built from a DataFrame must mirror
    `pandas.DataFrame.sample`.

    If pandas rejects the arguments, cudf must raise the same exception
    type with the same message; otherwise the sampled shapes must match.
    """
    # Local import: only the message matching below needs it.
    import re

    # as we currently don't support column with same name
    if axis == 1 and replace:
        return

    pdf = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5],
            "float": [0.05, 0.2, 0.3, 0.2, 0.25],
            "int": [1, 3, 5, 4, 2],
        },
    )
    mul_index = cudf.Index(DataFrame.from_pandas(pdf))
    sample_kwargs = {
        "n": n,
        "frac": frac,
        "replace": replace,
        "random_state": 0,
        "axis": axis,
    }

    expected_type = None
    expected_msg = None
    try:
        pout = pdf.sample(**sample_kwargs)
    except Exception as err:
        # `Exception` rather than `BaseException`, so KeyboardInterrupt
        # and pytest's own control-flow exceptions are not swallowed.
        expected_type = type(err)
        expected_msg = str(err)

    if expected_type is not None:
        # `match` is applied as a regular expression; escape the pandas
        # message so it is compared literally.
        with pytest.raises(expected_type, match=re.escape(expected_msg)):
            mul_index.sample(**sample_kwargs)
        return

    gout = mul_index.sample(**sample_kwargs)
    assert pout.shape == gout.shape
def _pivot(df, index, columns):
    """
    Reorganize the values of the DataFrame according to the given
    index and columns.

    Parameters
    ----------
    df : DataFrame
    index : cudf.Index
        Index labels of the result
    columns : cudf.Index
        Column labels of the result

    Returns
    -------
    cudf.DataFrame
        Frame indexed by the distinct `index` labels, with one
        multi-level column per (column of `df`, distinct `columns`
        label) pair.
    """
    columns_labels, columns_idx = columns._encode()
    index_labels, index_idx = index._encode()
    column_labels = columns_labels.to_pandas().to_flat_index()

    # the result of pivot always has a multicolumn
    result = cudf.core.column_accessor.ColumnAccessor(
        multiindex=True, level_names=(None,) + columns._data.names
    )

    def as_tuple(x):
        return x if isinstance(x, tuple) else (x,)

    # The output shape is the same for every value column.
    nrows = len(index_labels)
    ncols = len(column_labels)
    total_cells = nrows * ncols

    for value_name in df:
        names = [
            as_tuple(value_name) + as_tuple(label) for label in column_labels
        ]
        if total_cells <= 0:
            continue

        source_col = df._data[value_name]
        # Flat destination slot of each source row: each output column
        # owns a contiguous run of `nrows` slots.
        scatter_map = (columns_idx * np.int32(nrows)) + index_idx
        flat_buffer = cudf.core.column.column_empty_like(
            source_col, masked=True, newsize=total_cells
        )
        target = cudf.core.frame.Frame({None: flat_buffer})
        target._data[None][scatter_map] = source_col

        # Cut the flat buffer back into one piece per output column.
        pieces = target._split(range(nrows, total_cells, nrows))
        result.update(
            {
                name: next(iter(piece._columns))
                for name, piece in zip(names, pieces)
            }
        )

    result_index = cudf.Index(index_labels, name=index.name)
    return cudf.DataFrame._from_data(result, index=result_index)
def _clean_nulls_from_index(self):
    """
    Replace null entries with `<NA>` as a preprocessing step for the
    `__repr__` methods.

    When nulls are present the values are cast to strings, so the
    returned object is a string-valued Index. The `__repr__` methods
    that call this are responsible for rendering the original type
    correctly. Without nulls, `self` is returned unchanged.
    """
    if not self._values.has_nulls():
        return self

    string_values = self._values.astype("str").fillna(cudf._NA_REP)
    return cudf.Index(string_values, name=self.name)
def test_multiindex_to_arrow():
    """An index built from a DataFrame converts to the same Arrow table
    as pandas produces for the source frame."""
    pdf = pd.DataFrame(
        {
            "a": [1, 2, 1, 2, 3],
            "b": [1.0, 2.0, 3.0, 4.0, 5.0],
            "c": np.array([1, 2, 3, None, 5], dtype="datetime64[s]"),
            "d": ["a", "b", "c", "d", "e"],
        }
    )
    pdf["a"] = pdf["a"].astype("category")

    gpu_frame = cudf.from_pandas(pdf)
    gpu_index = cudf.Index(gpu_frame)

    expected = pa.Table.from_pandas(pdf)
    got = gpu_index.to_arrow()

    assert_eq(expected, got)
def test_categorical_index_with_nan_repr():
    """The repr of a CategoricalIndex shows NaN and <NA> distinctly,
    both for the full index and for a slice of it."""
    source = cudf.Series(
        [1, 2, np.nan, 10, np.nan, None], nan_as_null=False
    ).astype("category")
    cat_index = cudf.Index(source)

    full_repr = (
        "CategoricalIndex([1.0, 2.0, NaN, 10.0, NaN, <NA>], "
        "categories=[1.0, 2.0, 10.0, NaN], ordered=False, dtype='category')"
    )
    assert cat_index.__repr__() == full_repr

    # Slicing keeps the full category list.
    sliced_repr = (
        "CategoricalIndex([NaN, 10.0, NaN, <NA>], "
        "categories=[1.0, 2.0, 10.0, NaN], ordered=False, dtype='category')"
    )
    assert cat_index[2:].__repr__() == sliced_repr
def test_factorize_result_classes():
    """`cudf.factorize` returns cupy labels, and uniques whose class
    follows the input: an index for Series/Index, cupy for cupy."""
    data = [1, 2, 3]

    # (input constructor, expected class of the returned uniques)
    cases = (
        (cudf.Series, cudf.BaseIndex),
        (cudf.Index, cudf.BaseIndex),
        (cp.array, cp.ndarray),
    )
    for make_input, uniques_cls in cases:
        labels, cats = cudf.factorize(make_input(data))
        assert isinstance(labels, cp.ndarray)
        assert isinstance(cats, uniques_cls)
def _pivot(df, index, columns):
    """
    Reorganize the values of the DataFrame according to the given
    index and columns.

    Parameters
    ----------
    df : DataFrame
    index : cudf.core.index.Index
        Index labels of the result
    columns : cudf.core.index.Index
        Column labels of the result

    Returns
    -------
    cudf.DataFrame
        Frame indexed by the distinct `index` labels, with multi-level
        columns built from each column of `df` and each distinct
        `columns` label.
    """
    columns_labels, columns_idx = columns._encode()
    index_labels, index_idx = index._encode()
    column_labels = columns_labels.to_pandas().to_flat_index()

    # the result of pivot always has a multicolumn
    level_names = (None,) + columns._data.names
    result = cudf.core.column_accessor.ColumnAccessor(
        multiindex=True, level_names=level_names
    )

    def as_tuple(x):
        return x if isinstance(x, tuple) else (x,)

    for value_name in df:
        names = [
            as_tuple(value_name) + as_tuple(label) for label in column_labels
        ]
        source_col = df._data[value_name]
        scattered = source_col.scatter_to_table(
            index_idx,
            columns_idx,
            names,
            nrows=len(index_labels),
            ncols=len(names),
        )
        result.update(cudf.DataFrame._from_table(scattered)._data)

    result_index = cudf.Index(index_labels, name=index.name)
    return cudf.DataFrame(result, index=result_index)
def test_timedelta_index_properties(data, dtype, name):
    """Component properties of a timedelta index must match pandas."""
    gdi = cudf.Index(data, dtype=dtype, name=name)
    pdi = gdi.to_pandas()

    def local_assert(expected, actual):
        # When the cudf result holds nulls it is cast to float64 before
        # the comparison.
        if actual._values.null_count:
            actual = actual.astype("float64")
        assert_eq(expected, actual)

    for field in ("days", "seconds", "microseconds", "nanoseconds"):
        expected_field = getattr(pdi, field)
        actual_field = getattr(gdi, field)
        local_assert(expected_field, actual_field)

    expected_components = pdi.components
    actual_components = gdi.components

    if actual_components.isnull().any().any():
        assert_eq(expected_components, actual_components.astype("float"))
    else:
        # The index type is only checked for a non-empty result.
        assert_eq(
            expected_components,
            actual_components,
            check_index_type=not actual_components.empty,
        )