def test_categorical_binary_add():
    """Adding a categorical series to itself must raise ``TypeError``.

    Checked for the pandas series and for the ``Series`` under test, each
    against its own expected message fragment.
    """
    categorical = pd.Categorical(
        ["a", "a", "b", "c", "a"], categories=["a", "b", "c"]
    )
    pandas_series = pd.Series(categorical)
    series = Series(categorical)

    with pytest.raises(TypeError, match="unsupported operand"):
        pandas_series + pandas_series

    with pytest.raises(
        TypeError,
        match="Series of dtype `category` cannot perform the operation: add",
    ):
        series + series
def test_str_to_datetime_error():
    """Casting strings containing ``"None"`` to datetime must fail.

    Both ``astype`` calls are routed through ``assert_exceptions_equal``
    with the exception-type check disabled; only the message is pinned.
    """
    raw_dates = ["2001-01-01", "2002-02-02", "2000-01-05", "None"]
    pandas_series = pd.Series(list(raw_dates))
    series = Series(list(raw_dates))

    message = re.escape("Could not convert `None` value to datetime")

    assert_exceptions_equal(
        lfunc=pandas_series.astype,
        rfunc=series.astype,
        lfunc_args_and_kwargs=(["datetime64[s]"],),
        rfunc_args_and_kwargs=(["datetime64[s]"],),
        check_exception_type=False,
        expected_error_message=message,
    )
def test_typecast_on_join_no_float_round():
    """Left-merge an ``int8`` key column with a ``float32`` key column.

    The right-hand keys 4.01 and 4.99 are expected to find no match, so
    the last two ``B_y`` entries are ``None``.
    """
    labels = ["a", "b", "c", "d", "e"]

    left_keys = Series([1, 2, 3, 4, 5], dtype="int8")
    right_keys = Series([1, 2, 3, 4.01, 4.99], dtype="float32")

    left = DataFrame({"join_col": left_keys, "B": labels})
    right = DataFrame({"join_col": right_keys, "B": labels})

    expect = DataFrame(
        {
            "join_col": Series([1, 2, 3, 4, 5], dtype="float32"),
            "B_x": ["a", "b", "c", "d", "e"],
            "B_y": ["a", "b", "c", None, None],
        }
    )

    got = left.merge(right, on="join_col", how="left")

    assert_eq(expect, got)
def test_series_not(dtype):
    """Compare logical and bitwise NOT on a ``Series`` against NumPy/pandas.

    Parameters
    ----------
    dtype
        Anything accepted by ``np.dtype``; the data is cast to its scalar
        type.
    """
    import pandas as pd

    scalar_type = np.dtype(dtype).type
    flags = np.random.choice([True, False], 1000)
    arr = pd.Series(flags).astype(scalar_type)
    if scalar_type is not np.bool_:
        # Scale the truthy entries by random magnitudes for non-bool types.
        magnitudes = (np.random.random(1000) * 100).astype(scalar_type)
        arr = arr * magnitudes
    sr = Series(arr)

    logical_got = cudf.logical_not(sr).to_array()
    logical_expect = np.logical_not(arr)
    np.testing.assert_equal(logical_got, logical_expect)

    bitwise_got = (~sr).to_array()
    np.testing.assert_equal(bitwise_got, ~arr)
def test_categorical_unary_ceil():
    """``ceil`` on a categorical series must raise on both sides.

    The pandas side is exercised via ``getattr(pdsr, "ceil")``; the other
    side by calling ``sr.ceil`` directly. Only the message is pinned.
    """
    categorical = pd.Categorical(
        ["a", "a", "b", "c", "a"], categories=["a", "b", "c"]
    )
    pandas_series = pd.Series(categorical)
    series = Series(categorical)

    message = "Series of dtype `category` cannot perform the operation: ceil"

    assert_exceptions_equal(
        lfunc=getattr,
        rfunc=series.ceil,
        lfunc_args_and_kwargs=([pandas_series, "ceil"],),
        check_exception_type=False,
        expected_error_message=message,
    )
def test_categorical_empty():
    """An empty categorical must round-trip its codes and attributes."""
    empty_cat = pd.Categorical([])
    pandas_series = pd.Series(empty_cat)
    series = Series(empty_cat)

    np.testing.assert_array_equal(empty_cat.codes, series.cat.codes.to_array())

    # Categorical accessor attributes should agree with pandas.
    expected_accessor = pandas_series.cat
    assert tuple(expected_accessor.categories) == tuple(series.cat.categories)
    assert expected_accessor.ordered == series.cat.ordered

    np.testing.assert_array_equal(
        expected_accessor.codes.values, series.cat.codes.to_array()
    )
    np.testing.assert_array_equal(
        expected_accessor.codes.dtype, series.cat.codes.dtype
    )
def test_categorical_binary_add():
    """``operator.add`` on categorical series must raise on both sides.

    The same operator is applied to a pair of pandas series and a pair of
    ``Series`` objects; ``assert_exceptions_equal`` performs the comparison.
    """
    categorical = pd.Categorical(
        ["a", "a", "b", "c", "a"], categories=["a", "b", "c"]
    )
    pandas_series = pd.Series(categorical)
    series = Series(categorical)

    message = "Series of dtype `category` cannot perform the operation: add"

    assert_exceptions_equal(
        lfunc=operator.add,
        rfunc=operator.add,
        lfunc_args_and_kwargs=([pandas_series, pandas_series],),
        rfunc_args_and_kwargs=([series, series],),
        expected_error_message=message,
    )
def test_fillna():
    """Filling nulls with a scalar yields a dense series of that scalar.

    Column 8 from the reader is used; the test first asserts that it
    actually has nulls.
    """
    _, schema, darr = read_data()
    reader = GpuArrowReader(schema, darr)
    column = reader[8]
    assert column.null_count

    masked = Series.from_masked_array(
        data=column.data,
        mask=column.null,
        null_count=column.null_count,
    )
    filled = masked.fillna(123)

    np.testing.assert_equal(123, filled.to_array())
    assert len(filled) == len(masked)
    assert filled.null_count == 0
def test_series_reductions(method, dtype):
    """Compare a masked ``Series`` reduction against NumPy on valid values.

    Parameters
    ----------
    method
        Name of the reduction method looked up on both objects.
    dtype
        Dtype the random data is cast to.
    """
    np.random.seed(0)
    arr = np.random.random(100)
    if np.issubdtype(dtype, np.integer):
        arr *= 100
        threshold = 10
    else:
        threshold = 0.5
    # The mask is derived before the cast, from the float values.
    mask = arr > threshold

    arr = arr.astype(dtype)
    valid_values = arr[mask]
    sr = Series.from_masked_array(arr, Series(mask).as_mask())

    def reduce(obj):
        # std/var are pinned to ddof=1 on both sides.
        kwargs = {"ddof": 1} if method in ["std", "var"] else {}
        return getattr(obj, method)(**kwargs)

    expect = reduce(valid_values)
    got = reduce(sr)
    print(expect, got)
    np.testing.assert_approx_equal(expect, got)
def test_typecast_on_join_indexes_matching_categorical():
    """Inner-join frames indexed by a ``category`` and a ``str`` column."""
    numbers = [1, 2, 3, 4, 5]

    left_keys = Series(["a", "b", "c", "d", "e"], dtype="category")
    right_keys = Series(["a", "b", "c", "d", "e"], dtype="str")

    left = DataFrame({"join_col": left_keys, "B": numbers}).set_index(
        "join_col"
    )
    right = DataFrame({"join_col": right_keys, "B": numbers}).set_index(
        "join_col"
    )

    expected_numbers = [1, 2, 3, 4, 5]
    expect = DataFrame(
        {
            "join_col": ["a", "b", "c", "d", "e"],
            "B_x": expected_numbers,
            "B_y": expected_numbers,
        }
    ).set_index("join_col")

    got = left.join(right, how="inner", lsuffix="_x", rsuffix="_y")

    assert_eq(expect, got)
def test_categorical_integer():
    """Values outside the category list become nulls.

    Checks the codes, the null count, the codes dtype and the string
    representation (compared token-by-token, ignoring whitespace layout).
    """
    categorical = pd.Categorical(
        ["a", "_", "_", "c", "a"], categories=["a", "b", "c"]
    )
    pandas_series = pd.Series(categorical)
    series = Series(categorical)

    np.testing.assert_array_equal(
        categorical.codes, series.to_array(fillna="pandas")
    )
    assert series.null_count == 2

    expected_codes = pandas_series.cat.codes
    np.testing.assert_array_equal(
        expected_codes.values, series.cat.codes.fillna(-1).to_array()
    )
    np.testing.assert_equal(expected_codes.dtype, series.cat.codes.dtype)

    rendered = str(series)
    expect_str = (
        " 0 a 1 null 2 null 3 c 4 a dtype: category"
        " Categories (3, object): [a, b, c] "
    )
    assert rendered.split() == expect_str.split()
def test_categorical_set_categories():
    """``cat.set_categories`` must match pandas when adding or removing."""
    categorical = pd.Categorical(
        ["a", "a", "b", "c", "a"], categories=["a", "b", "c"]
    )
    pandas_series = pd.Series(categorical)
    series = Series.from_categorical(categorical)

    scenarios = (
        ("a", "b", "c", "d"),  # adding category
        ("a", "b"),  # removing category
    )
    for new_categories in scenarios:
        expect = pandas_series.cat.set_categories(list(new_categories))
        got = series.cat.set_categories(list(new_categories))
        assert_eq(expect, got)
def test_reflected_ops_scalar(func, dtype, obj_class):
    """Apply ``func`` to random host data and to its GPU counterpart.

    Parameters
    ----------
    func
        Callable applied to both the host array and the GPU object.
    dtype
        Dtype passed to ``utils.gen_rand``.
    obj_class
        When equal to ``"Index"``, the GPU series is converted with
        ``as_index`` before ``func`` is applied.
    """
    is_index = obj_class == "Index"

    # Seeded host-side input data.
    np.random.seed(12)
    host_values = utils.gen_rand(dtype, 100, low=10)

    # GPU object, optionally as an index.
    gs = Series(host_values)
    if is_index:
        gs = as_index(gs)

    gs_result = func(gs)

    if is_index:
        # NOTE(review): this rebinding of ``gs`` is never read afterwards;
        # possibly ``gs_result`` was meant to be converted — confirm.
        gs = Series(gs)

    # Reference result computed on the host data.
    ps_result = func(host_values)

    np.testing.assert_allclose(ps_result, gs_result.to_array())
def test_to_from_pandas_nulls(data, nulls):
    """Round-trip a pandas series with NaT entries through ``Series``.

    Parameters
    ----------
    data
        Source values; a copy is wrapped in a pandas series.
    nulls
        ``"some"`` replaces every other element with NaT, ``"all"``
        replaces every element; any other value leaves the data untouched.
    """
    pd_data = pd.Series(data.copy())
    nat = np.datetime64("nat", "ns")

    if nulls == "some":
        # Every second position, starting at 0, becomes NaT.
        even_positions = list(range(0, len(pd_data), 2))
        pd_data[even_positions] = nat
    elif nulls == "all":
        pd_data[:] = nat

    round_tripped = Series.from_pandas(pd_data).to_pandas()

    assert_eq(pd_data, round_tripped)
def test_typecast_on_join_int_to_int(dtype_l, dtype_r):
    """Inner-merge two frames whose key columns have different dtypes.

    The expected key column uses the promoted dtype of the two inputs.

    Parameters
    ----------
    dtype_l, dtype_r
        Dtypes of the left and right join columns.
    """
    other_data = ["a", "b", "c"]

    join_data_l = Series([1, 2, 3], dtype=dtype_l)
    join_data_r = Series([1, 2, 4], dtype=dtype_r)

    gdf_l = DataFrame({"join_col": join_data_l, "B": other_data})
    gdf_r = DataFrame({"join_col": join_data_r, "B": other_data})

    # ``np.find_common_type`` is deprecated since NumPy 1.25 and removed in
    # 2.0; ``np.promote_types`` is the documented replacement.
    exp_dtype = np.promote_types(np.dtype(dtype_l), np.dtype(dtype_r))

    # Only keys 1 and 2 are present on both sides.
    exp_join_data = [1, 2]
    exp_other_data = ["a", "b"]
    exp_join_col = Series(exp_join_data, dtype=exp_dtype)

    expect = DataFrame(
        {
            "join_col": exp_join_col,
            "B_x": exp_other_data,
            "B_y": exp_other_data,
        }
    )

    got = gdf_l.merge(gdf_r, on="join_col", how="inner")

    assert_eq(expect, got)
def test_series_where(data_dtype, fill_value):
    """``Series.where`` must match pandas for ``> 0``, ``< 0`` and ``== 0``.

    Parameters
    ----------
    data_dtype
        Dtype of the ``range(10)`` input series.
    fill_value
        Replacement passed as the second argument of ``where``.
    """
    psr = pd.Series(list(range(10)), dtype=data_dtype)
    sr = Series.from_pandas(psr)

    conditions = (
        lambda s: s > 0,
        lambda s: s < 0,
        lambda s: s == 0,
    )
    for make_mask in conditions:
        expect = psr.where(make_mask(psr), fill_value)
        got = sr.where(make_mask(sr), fill_value)
        assert_eq(expect, got)
def test_series_nsmallest(data, n):
    """Indirectly tests Series.sort_values().

    Also checks ``keep="last"`` (compared after sorting by index) and that
    an invalid ``keep`` value raises on both sides.
    """
    sr = Series(data)
    psr = pd.Series(data)

    assert_eq(sr.nsmallest(n), psr.nsmallest(n))

    got_last = sr.nsmallest(n, keep="last").sort_index()
    expect_last = psr.nsmallest(n, keep="last").sort_index()
    assert_eq(got_last, expect_last)

    assert_exceptions_equal(
        lfunc=psr.nsmallest,
        rfunc=sr.nsmallest,
        lfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}),
        rfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}),
        expected_error_message='keep must be either "first", "last"',
    )
def test_string_wrap(data, width):
    """``str.wrap`` must match pandas called with explicit wrap options."""
    gs = Series(data)
    ps = pd.Series(data)

    # Options passed to the pandas side only.
    pandas_options = {
        "break_long_words": False,
        "expand_tabs": False,
        "replace_whitespace": True,
        "drop_whitespace": True,
        "break_on_hyphens": False,
    }

    got = gs.str.wrap(width=width)
    expect = ps.str.wrap(width=width, **pandas_options)

    assert_eq(got, expect)
def test_onehot_generic_index():
    """One-hot encode a column whose series carries a ``GenericIndex``."""
    np.random.seed(0)
    size = 33
    indices = np.random.randint(low=0, high=100, size=size)
    df = DataFrame()
    values = np.random.randint(low=0, high=4, size=size)
    df["fo"] = Series(values, index=GenericIndex(indices))

    out = df.one_hot_encoding(
        "fo", cats=df.fo.unique(), prefix="fo", dtype=np.int32
    )

    assert set(out.columns) == {"fo", "fo_0", "fo_1", "fo_2", "fo_3"}

    # Each encoded column must flag exactly the rows holding its value.
    for category in range(4):
        encoded = getattr(out, "fo_{}".format(category))
        np.testing.assert_array_equal(values == category, encoded.to_array())
def test_series_median(dtype, num_na):
    """Compare ``Series.median`` with pandas on masked data.

    Parameters
    ----------
    dtype
        Dtype the random data is cast to.
    num_na
        Positions below this index are masked out.
    """
    np.random.seed(0)
    arr = np.random.random(100)
    if np.issubdtype(dtype, np.integer):
        arr *= 100
    mask = np.arange(100) >= num_na

    arr = arr.astype(dtype)
    sr = Series.from_masked_array(arr, Series(mask).as_mask())
    ps = pd.Series(arr[mask], dtype=dtype)

    actual = sr.median(skipna=True)
    desired = ps.median(skipna=True)
    print(actual, desired)
    np.testing.assert_approx_equal(actual, desired)

    # skipna=False is checked only for floating dtypes: nullable integer
    # conversion to pandas (e.g. pd.Int64Dtype) is not yet supported.
    if np.issubdtype(dtype, np.floating):
        ps = sr.to_pandas()
        actual = sr.median(skipna=False)
        desired = ps.median(skipna=False)
        np.testing.assert_approx_equal(actual, desired)
def test_series_with_nulls_where(fill_value):
    """``Series.where`` must match pandas on data with leading nulls.

    The input is three ``None`` values followed by ``range(5)``; the
    conditions ``> 0``, ``< 0`` and ``== 0`` are each checked.
    """
    psr = pd.Series([None] * 3 + list(range(5)))
    sr = Series.from_pandas(psr)

    conditions = (
        lambda s: s > 0,
        lambda s: s < 0,
        lambda s: s == 0,
    )
    for make_mask in conditions:
        expect = psr.where(make_mask(psr), fill_value)
        got = sr.where(make_mask(sr), fill_value)
        assert_eq(expect, got)
def test_string_groupby_key_index():
    """Group by a string column and count; dtypes are not compared."""
    keys = ["a", "b", "c", "d", "e"]
    numbers = [1, 2, 3, 4, 5]

    pdf = pd.DataFrame()
    gdf = DataFrame()

    pdf["a"] = pd.Series(keys, dtype="str")
    gdf["a"] = Series(keys, dtype="str")
    pdf["b"] = numbers
    gdf["b"] = numbers

    expect = pdf.groupby("a").count()
    got = gdf.groupby("a").count()

    assert_eq(expect, got, check_dtype=False)
def test_series_fillna_numerical(
    data_dtype, fill_dtype, fill_type, null_value, inplace
):
    """Fill nulls in a numeric series with a scalar or another series.

    Parameters
    ----------
    data_dtype
        Dtype of the input series and of the expected array.
    fill_dtype
        Not used in the body.
    fill_type
        ``"scalar"`` or ``"series"``; selects how the fill value is built.
    null_value
        Value placed at positions 2 and 4 of the input.
    inplace
        Forwarded to ``fillna``; when true the input itself is checked.
    """
    # TODO: These tests should use Pandas' nullable int type
    # when we support a recent enough version of Pandas
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html

    if fill_type == "scalar":
        fill_value = np.random.randint(0, 5)
        filled_slots = (fill_value, fill_value)
    elif fill_type == "series":
        data = np.random.randint(0, 5, (5,))
        fill_value = pd.Series(data, dtype=data_dtype)
        filled_slots = (fill_value[2], fill_value[4])

    expect = np.array(
        [0, 1, filled_slots[0], 2, filled_slots[1]], dtype=data_dtype
    )

    sr = Series([0, 1, null_value, 2, null_value], dtype=data_dtype)
    result = sr.fillna(fill_value, inplace=inplace)

    if inplace:
        result = sr

    np.testing.assert_equal(expect, result.to_array())
def test_string_split(data, pat, n, expand, expand_raise):
    """``str.split`` must match pandas for the given pattern and limits.

    ``expand_raise`` is handed to ``raise_builder`` together with
    ``NotImplementedError`` to build the context the comparison runs in.
    One specific data set is xfailed when ``pat`` is ``None``.
    """
    unsupported_data = (["a b", " c ", " d", "e ", "f"],)
    if data in unsupported_data and pat is None:
        pytest.xfail("None pattern split algorithm not implemented yet")

    ps = pd.Series(data, dtype="str")
    gs = Series(data, dtype="str")

    split_kwargs = {"pat": pat, "n": n, "expand": expand}

    expectation = raise_builder([expand_raise], NotImplementedError)
    with expectation:
        expect = ps.str.split(**split_kwargs)
        got = gs.str.split(**split_kwargs)
        assert_eq(expect, got)
def test_datetime_scalar_timeunit_cast(timeunit):
    """A ``datetime64`` scalar must behave the same as in pandas.

    Checked both when building a series from the scalar and when
    assigning it as a frame column.
    """
    scalar = np.datetime64("2016-11-20", timeunit)

    assert_eq(pd.Series(scalar), Series(scalar))

    gdf = DataFrame()
    gdf["a"] = np.arange(5)
    gdf["b"] = scalar

    pdf = pd.DataFrame()
    pdf["a"] = np.arange(5)
    pdf["b"] = scalar

    assert_eq(pdf, gdf)
def test_string_replace_with_backrefs(find, replace):
    """``str.replace_with_backrefs`` must match pandas regex ``replace``."""
    samples = [
        "A543",
        "Z756",
        "",
        None,
        "tést-string",
        "two-thréé four-fivé",
        "abcd-éfgh",
        "tést-string-again",
    ]
    pandas_series = pd.Series(samples)
    series = Series(samples)

    got = series.str.replace_with_backrefs(find, replace)
    expected = pandas_series.str.replace(find, replace, regex=True)

    assert_eq(got, expected)
def test_sum_masked(nelem):
    """Sum of a masked ``float64`` series must equal the NumPy masked sum.

    Parameters
    ----------
    nelem
        Number of random elements generated.
    """
    dtype = np.float64
    data = gen_rand(dtype, nelem)

    packed_mask = utils.random_bitmask(nelem)
    expanded = utils.expand_bits_to_bytes(packed_mask)[:nelem]
    null_count = utils.count_zero(expanded)

    sr = Series.from_masked_array(data, packed_mask, null_count)
    got = sr.sum()

    valid = np.asarray(expanded, dtype=np.bool_)[: data.size]
    expect = data[valid].sum()

    if dtype == np.float32:
        significant = 4
    else:
        significant = 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
def test_series_nsmallest(data, n):
    """Indirectly tests Series.sort_values().

    Also checks ``keep="last"`` and that an invalid ``keep`` value raises
    ``ValueError`` with the expected message.
    """
    sr = Series(data)
    psr = pd.Series(data)

    assert_eq(sr.nsmallest(n), psr.nsmallest(n))

    got_last = sr.nsmallest(n, keep="last")
    expect_last = psr.nsmallest(n, keep="last")
    assert_eq(got_last, expect_last)

    with pytest.raises(
        ValueError, match='keep must be either "first", "last"'
    ):
        sr.nsmallest(3, keep="what")
def test_fillna_categorical(psr, fill_value, inplace):
    """``fillna`` on a categorical series must match pandas.

    Parameters
    ----------
    psr
        pandas series under test; converted with ``Series.from_pandas``.
    fill_value
        Scalar or pandas series; a pandas series is converted with
        ``cudf.from_pandas`` before being passed to the GPU side.
    inplace
        Forwarded to both ``fillna`` calls; when true the inputs
        themselves are compared.
    """
    gsr = Series.from_pandas(psr)

    fill_value_cudf = (
        cudf.from_pandas(fill_value)
        if isinstance(fill_value, pd.Series)
        else fill_value
    )

    expected = psr.fillna(fill_value, inplace=inplace)
    got = gsr.fillna(fill_value_cudf, inplace=inplace)

    if inplace:
        expected, got = psr, gsr

    assert_eq(expected, got)
def test_strings_rsplit(data, n, expand):
    """``str.rsplit`` must match pandas with no, ``","`` and ``"-"`` delimiter.

    The default-delimiter case is compared through
    ``pd.testing.assert_frame_equal`` after resetting the index, with the
    index type check disabled.
    """
    gs = Series(data)
    ps = pd.Series(data)

    expect_default = ps.str.rsplit(n=n, expand=expand).reset_index()
    got_default = gs.str.rsplit(n=n, expand=expand).to_pandas().reset_index()
    pd.testing.assert_frame_equal(
        expect_default, got_default, check_index_type=False
    )

    for delimiter in (",", "-"):
        assert_eq(
            ps.str.rsplit(delimiter, n=n, expand=expand),
            gs.str.rsplit(delimiter, n=n, expand=expand),
        )