def test_iloc_decimal():
    """Positional list indexing on a decimal Series returns the rows in the
    requested order."""
    decimal_dtype = cudf.Decimal64Dtype(scale=2, precision=3)

    source = cudf.Series(["1.00", "2.00", "3.00", "4.00"]).astype(
        decimal_dtype
    )
    reordered = source.iloc[[3, 2, 1, 0]]

    expected = cudf.Series(["4.00", "3.00", "2.00", "1.00"]).astype(
        decimal_dtype
    )

    # Indexes are dropped on both sides so only the values are compared.
    assert_eq(
        expected.reset_index(drop=True),
        reordered.reset_index(drop=True),
    )
def _decimal_normalize_types(*args):
    """Cast all arguments to a single shared ``Decimal64Dtype``.

    The shared dtype uses the largest scale among the arguments and a
    precision equal to that scale plus the largest ``precision - scale``
    among the arguments, clipped at ``Decimal64Dtype.MAX_PRECISION``.

    Returns
    -------
    list
        The arguments, in order, each converted with ``astype``.
    """
    arg_dtypes = [arg.dtype for arg in args]

    common_scale = max(dt.scale for dt in arg_dtypes)
    precision_minus_scale = max(dt.precision - dt.scale for dt in arg_dtypes)
    common_precision = min(
        cudf.Decimal64Dtype.MAX_PRECISION,
        common_scale + precision_minus_scale,
    )

    common_dtype = cudf.Decimal64Dtype(common_precision, common_scale)
    return [arg.astype(common_dtype) for arg in args]
def _find_common_type_decimal(dtypes):
    """Return a ``Decimal64Dtype`` able to hold all of the given decimal
    dtypes (used when columns are concatenated).

    The result takes the largest scale and the largest ``precision - scale``
    found in ``dtypes``; their sum is the required precision, clipped at
    ``Decimal64Dtype.MAX_PRECISION``.
    """
    widest_scale = max(dt.scale for dt in dtypes)
    widest_precision_minus_scale = max(
        dt.precision - dt.scale for dt in dtypes
    )

    # Clip at the maximum precision the 64-bit decimal type supports.
    required_precision = widest_scale + widest_precision_minus_scale
    clipped_precision = min(
        cudf.Decimal64Dtype.MAX_PRECISION, required_precision
    )
    return cudf.Decimal64Dtype(clipped_precision, widest_scale)
def test_empty_columns():
    """Round-trip an empty string column and an empty decimal column
    through an in-memory ORC buffer."""
    # string and decimal columns have additional steps that need to be skipped
    empty_string_col = cudf.Series([], dtype="str")
    empty_decimal_col = cudf.Series([], dtype=cudf.Decimal64Dtype(10, 1))
    expected = cudf.DataFrame(
        {"string": empty_string_col, "decimal": empty_decimal_col}
    )

    orc_buffer = BytesIO()
    expected.to_orc(orc_buffer, compression="snappy")
    round_tripped = cudf.read_orc(orc_buffer)

    assert_eq(expected, round_tripped)
def _find_common_type_decimal(dtypes):
    """Pick a decimal dtype able to hold all of the given decimal dtypes
    (used when columns are concatenated).

    The required precision is the largest scale plus the largest
    ``precision - scale`` found in ``dtypes``. The decimal type is chosen
    by comparing that precision with the ``MAX_PRECISION`` of
    ``Decimal64Dtype`` and then ``Decimal32Dtype``; the precision is
    clipped at the chosen type's ``MAX_PRECISION``.
    """
    scale = max(dt.scale for dt in dtypes)
    precision = scale + max(dt.precision - dt.scale for dt in dtypes)

    # Too wide for 64 bits: fall back to the 128-bit type.
    if precision > cudf.Decimal64Dtype.MAX_PRECISION:
        clipped = min(cudf.Decimal128Dtype.MAX_PRECISION, precision)
        return cudf.Decimal128Dtype(clipped, scale)

    # Too wide for 32 bits: use the 64-bit type.
    if precision > cudf.Decimal32Dtype.MAX_PRECISION:
        clipped = min(cudf.Decimal64Dtype.MAX_PRECISION, precision)
        return cudf.Decimal64Dtype(clipped, scale)

    clipped = min(cudf.Decimal32Dtype.MAX_PRECISION, precision)
    return cudf.Decimal32Dtype(clipped, scale)
import pandas as pd import pyarrow as pa import pytest import cudf from cudf import Scalar as pycudf_scalar from cudf._lib.copying import get_element from cudf.testing._utils import ( ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES, ) TEST_DECIMAL_TYPES = [ cudf.Decimal64Dtype(1, 1), cudf.Decimal64Dtype(4, 2), cudf.Decimal64Dtype(4, -2), ] SCALAR_VALUES = [ 0, -1, 42, 0.0, 1.0, np.int8(0), np.int8(1), np.int8(-1), np.iinfo(np.int8).min, np.iinfo(np.int8).max,
if op == "ne": expect_all = True else: expect_all = False assert (result == expect_all).all() elif dtype in DATETIME_TYPES & TIMEDELTA_TYPES: assert result._column.null_count == len(data) @pytest.mark.parametrize( "args", [ ( operator.add, ["1.5", "2.0"], cudf.Decimal64Dtype(scale=2, precision=2), ["1.5", "2.0"], cudf.Decimal64Dtype(scale=2, precision=2), ["3.0", "4.0"], cudf.Decimal64Dtype(scale=2, precision=3), ), ( operator.add, ["1.5", "2.0"], cudf.Decimal64Dtype(scale=2, precision=2), ["2.25", "1.005"], cudf.Decimal64Dtype(scale=3, precision=4), ["3.75", "3.005"], cudf.Decimal64Dtype(scale=3, precision=5), ), (
def rand_dataframe(
    dtypes_meta, rows, seed=random.randint(0, 2**32 - 1), use_threads=True
):
    """
    Generates a random table.

    Parameters
    ----------
    dtypes_meta : List of dict
        Specifies list of dtype meta data. dtype meta data should
        be a dictionary of the form example:
            {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}
        `"str"` dtype can contain an extra key `max_string_length` to
        control the maximum size of the strings being generated in each row.
        If not specified, it will default to 1000.
        `"list"` dtype additionally requires the keys `lists_max_length`,
        `nesting_max_depth` and `value_type`.
        `"decimal64"` dtype can contain an extra key `max_precision`; if not
        specified, it defaults to `cudf.Decimal64Dtype.MAX_PRECISION`.
    rows : int
        Specifies the number of rows to be generated.
    seed : int
        Specifies the `seed` value to be utilized by all downstream
        random data generation APIs. Note that the default value is
        computed once, when this function is defined, so every call that
        omits `seed` within one process uses the same seed.
    use_threads : bool
        Indicates whether to use threads pools to build the columns

    Returns
    -------
    PyArrow Table
        A Table with columns of corresponding dtypes mentioned in `dtypes_meta`

    Raises
    ------
    TypeError
        If a dtype resolves to a kind that none of the branches below handle.
    """
    # Apply seed
    random.seed(seed)
    np.random.seed(seed)
    mimesis.random.random.seed(seed)

    column_params = []
    for meta in dtypes_meta:
        dtype = copy.deepcopy(meta["dtype"])
        null_frequency = copy.deepcopy(meta["null_frequency"])
        cardinality = copy.deepcopy(meta["cardinality"])

        if dtype == "list":
            lists_max_length = meta["lists_max_length"]
            nesting_max_depth = meta["nesting_max_depth"]
            value_type = meta["value_type"]
            nesting_depth = np.random.randint(1, nesting_max_depth)

            # Determining the `dtype` from the `value_type`
            # and the nesting_depth: one ListDtype wrapper per level.
            dtype = cudf.core.dtypes.ListDtype(value_type)
            for _ in range(nesting_depth - 1):
                dtype = cudf.core.dtypes.ListDtype(dtype)

            column_params.append(
                ColumnParameters(
                    cardinality=cardinality,
                    null_frequency=null_frequency,
                    generator=list_generator(
                        dtype=value_type,
                        size=cardinality,
                        nesting_depth=nesting_depth,
                        lists_max_length=lists_max_length,
                    ),
                    is_sorted=False,
                    dtype=dtype,
                )
            )
        elif dtype == "decimal64":
            max_precision = meta.get(
                "max_precision", cudf.Decimal64Dtype.MAX_PRECISION
            )
            precision = np.random.randint(1, max_precision)
            scale = np.random.randint(0, precision)
            dtype = cudf.Decimal64Dtype(precision=precision, scale=scale)
            column_params.append(
                ColumnParameters(
                    cardinality=cardinality,
                    null_frequency=null_frequency,
                    generator=decimal_generator(dtype=dtype, size=cardinality),
                    is_sorted=False,
                    dtype=dtype,
                )
            )
        elif dtype == "category":
            column_params.append(
                ColumnParameters(
                    cardinality=cardinality,
                    null_frequency=null_frequency,
                    generator=lambda cardinality=cardinality: [
                        mimesis.random.random.randstr(unique=True, length=2000)
                        for _ in range(cardinality)
                    ],
                    is_sorted=False,
                    dtype="category",
                )
            )
        else:
            dtype = cudf.dtype(dtype)
            if dtype.kind in ("i", "u"):
                column_params.append(
                    ColumnParameters(
                        cardinality=cardinality,
                        null_frequency=null_frequency,
                        generator=int_generator(dtype=dtype, size=cardinality),
                        is_sorted=False,
                        dtype=dtype,
                    )
                )
            elif dtype.kind == "f":
                column_params.append(
                    ColumnParameters(
                        cardinality=cardinality,
                        null_frequency=null_frequency,
                        generator=float_generator(
                            dtype=dtype, size=cardinality
                        ),
                        is_sorted=False,
                        dtype=dtype,
                    )
                )
            elif dtype.kind in ("U", "O"):
                # Resolve the length now and bind it as a lambda default.
                # Reading `meta` inside the lambda would late-bind to the
                # loop variable, so a generator invoked after the loop has
                # finished would see the *last* column's meta instead of
                # this one's.
                max_string_length = meta.get("max_string_length", 1000)
                column_params.append(
                    ColumnParameters(
                        cardinality=cardinality,
                        null_frequency=null_frequency,
                        generator=lambda cardinality=cardinality, max_string_length=max_string_length: [  # noqa: E501
                            mimesis.random.random.schoice(
                                string.printable,
                                max_string_length,
                            )
                            for _ in range(cardinality)
                        ],
                        is_sorted=False,
                        dtype=dtype,
                    )
                )
            elif dtype.kind == "M":
                column_params.append(
                    ColumnParameters(
                        cardinality=cardinality,
                        null_frequency=null_frequency,
                        generator=datetime_generator(
                            dtype=dtype, size=cardinality
                        ),
                        is_sorted=False,
                        dtype=cudf.dtype(dtype),
                    )
                )
            elif dtype.kind == "m":
                column_params.append(
                    ColumnParameters(
                        cardinality=cardinality,
                        null_frequency=null_frequency,
                        generator=timedelta_generator(
                            dtype=dtype, size=cardinality
                        ),
                        is_sorted=False,
                        dtype=cudf.dtype(dtype),
                    )
                )
            elif dtype.kind == "b":
                column_params.append(
                    ColumnParameters(
                        cardinality=cardinality,
                        null_frequency=null_frequency,
                        generator=boolean_generator(cardinality),
                        is_sorted=False,
                        dtype=cudf.dtype(dtype),
                    )
                )
            else:
                raise TypeError(f"Unsupported dtype: {dtype}")
            # TODO: Add List column support once
            # https://github.com/rapidsai/cudf/pull/6075
            # is merged.

    df = get_dataframe(
        Parameters(
            num_rows=rows,
            column_parameters=column_params,
            seed=seed,
        ),
        use_threads=use_threads,
    )

    return df
(pd.Series(dtype="category"), True), (pd.Series(dtype="object"), False), # cuDF dtypes. (cudf.CategoricalDtype, True), (cudf.ListDtype, False), (cudf.StructDtype, False), (cudf.Decimal128Dtype, False), (cudf.Decimal64Dtype, False), (cudf.Decimal32Dtype, False), (cudf.IntervalDtype, False), # cuDF dtype instances. (cudf.CategoricalDtype("a"), True), (cudf.ListDtype(int), False), (cudf.StructDtype({"a": int}), False), (cudf.Decimal128Dtype(5, 2), False), (cudf.Decimal64Dtype(5, 2), False), (cudf.Decimal32Dtype(5, 2), False), (cudf.IntervalDtype(int), False), # cuDF objects (cudf.Series(dtype="bool"), False), (cudf.Series(dtype="int"), False), (cudf.Series(dtype="float"), False), (cudf.Series(dtype="str"), False), (cudf.Series(dtype="datetime64[s]"), False), (cudf.Series(dtype="timedelta64[s]"), False), (cudf.Series(dtype="category"), True), (cudf.Series(dtype=cudf.Decimal128Dtype(5, 2)), False), (cudf.Series(dtype=cudf.Decimal64Dtype(5, 2)), False), (cudf.Series(dtype=cudf.Decimal32Dtype(5, 2)), False), # TODO: Currently creating an empty Series of list type ignores the # provided type and instead makes a float64 Series.
"a": [[1, 2, 3, None], [4, None, 5], None, [6, 7]] }, ], ) def test_serialize_list_columns(data): df = cudf.DataFrame(data) recreated = df.__class__.deserialize(*df.serialize()) assert_eq(recreated, df) @pytest.mark.parametrize( "data", [ { "a": _decimal_series(["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0)) }, { "a": _decimal_series(["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0)), "b": _decimal_series(["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1)), "c": _decimal_series(["10.1", "20.2", "30.3"], dtype=cudf.Decimal64Dtype(3, 1)), }, { "a": _decimal_series(["1", None, "3"], dtype=cudf.Decimal64Dtype(1, 0)), "b":
def test_decimal_invalid_precision():
    """Building a decimal Series from each value/dtype pair below is
    expected to raise ``pa.ArrowInvalid``."""
    # (values, (precision, scale)) pairs; the dtype is built inside the
    # ``raises`` context, exactly where the Series is constructed.
    rejected_cases = (
        ([10, 20, 30], (2, 2)),
        ([Decimal("300")], (2, 1)),
    )

    for values, (precision, scale) in rejected_cases:
        with pytest.raises(pa.ArrowInvalid):
            _ = cudf.Series(
                values, dtype=cudf.Decimal64Dtype(precision, scale)
            )
@pytest.mark.parametrize( "input_obj", [[decimal.Decimal(1), cudf.NA, decimal.Decimal(3)]] ) def test_series_construction_with_nulls(input_obj): expect = pa.array(input_obj, from_pandas=True) got = cudf.Series(input_obj).to_arrow() assert expect == got @pytest.mark.parametrize( "data", [ { "a": _decimal_series( ["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0) ) }, { "a": _decimal_series( ["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0) ), "b": _decimal_series( ["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1) ), "c": _decimal_series( ["10.1", "20.2", "30.3"], dtype=cudf.Decimal64Dtype(3, 1) ), }, { "a": _decimal_series(
}, ], ) def test_masked_udf_subset_selection(data): def func(row): return row["a"] + row["b"] data = cudf.DataFrame(data) run_masked_udf_test(func, data) @pytest.mark.parametrize( "unsupported_col", [ ["a", "b", "c"], _decimal_series(["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1)), cudf.Series([1, 2, 3], dtype="category"), cudf.interval_range(start=0, end=3, closed=True), [[1, 2], [3, 4], [5, 6]], [{ "a": 1 }, { "a": 2 }, { "a": 3 }], ], ) def test_masked_udf_unsupported_dtype(unsupported_col): data = cudf.DataFrame() data["unsupported_col"] = unsupported_col
def test_decimal_column_slicing(offset, size, precision, scale):
    """Run the shared column-slicing check on a decimal column built from
    1000 random floats cast to ``Decimal64Dtype(precision, scale)``."""
    random_floats = pd.Series(np.random.rand(1000))
    float_column = cudf.core.column.as_column(random_floats)

    decimal_dtype = cudf.Decimal64Dtype(precision, scale)
    decimal_column = float_column.astype(decimal_dtype)

    column_slicing_test(decimal_column, offset, size, True)
import numpy as np import pandas as pd import pytest import cudf from cudf import Scalar as pycudf_scalar from cudf._lib.copying import get_element from cudf.tests.utils import ( ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES, ) TEST_DECIMAL_TYPES = [ cudf.Decimal64Dtype(1, 1), cudf.Decimal64Dtype(4, 2), cudf.Decimal64Dtype(4, -2), ] SCALAR_VALUES = [ 0, -1, 42, 0.0, 1.0, np.int8(0), np.int8(1), np.int8(-1), np.iinfo(np.int8).min, np.iinfo(np.int8).max,