def test_operator_func_series_and_scalar(dtype, func, has_nulls, fill_value):
    """Compare a named binary method of a cudf Series against pandas.

    ``func`` is a method name looked up with ``getattr`` on both the cudf
    Series and its pandas counterpart; each is called with a fixed scalar
    and the supplied ``fill_value``.  When ``has_nulls == 'some'`` the cudf
    Series is built from a random validity bitmask, otherwise from the raw
    array.
    """
    nelem = 1000
    arr = utils.gen_rand(dtype, nelem) * 10000
    scalar = 59.0

    if has_nulls == 'some':
        nulls = utils.random_bitmask(nelem)
        sr = Series.from_masked_array(arr, nulls)
    else:
        sr = Series(arr)
    psr = sr.to_pandas()

    expect = getattr(psr, func)(scalar, fill_value=fill_value)
    got = getattr(sr, func)(scalar, fill_value=fill_value)

    # This is being done because of the various gymnastics required to
    # support equality for null values.  cudf.Series().to_pandas() replaces
    # nulls with None, so a bool Series becomes an object Series, which does
    # not match the output of the equality op in pandas (that stays bool).
    # Furthermore, NaN values are treated as not comparable and always give
    # False in a bool op, except for not-equal where bool(NaN != NaN) is True.
    #
    # ``np.bool_`` is used instead of the ``np.bool`` alias: the alias was
    # removed in NumPy 1.24, while ``np.bool_`` compares equal to a boolean
    # dtype on every release.
    if got.dtype == np.bool_:
        # Nulls become True only for 'ne'; every other op fills with False.
        got = got.fillna(func == 'ne')

    utils.assert_eq(expect, got)
def test_fillna_dataframe(fill_type, inplace):
    """Check ``DataFrame.fillna`` against pandas for several fill-value kinds.

    ``fill_type`` selects a scalar (``"scalar"``), a Series (``"series"``)
    or, for any other value, a dict mixing a scalar and a Series.
    """
    pdf = pd.DataFrame({"a": [1, 2, None], "b": [None, None, 5]})
    gdf = DataFrame.from_pandas(pdf)

    # Build the same fill value twice: once for pandas, once for cudf.
    if fill_type == "scalar":
        fill_value_pd = 5
        fill_value_cudf = fill_value_pd
    elif fill_type == "series":
        fill_value_pd = pd.Series([3, 4, 5])
        fill_value_cudf = Series.from_pandas(fill_value_pd)
    else:
        fill_value_pd = {"a": 5, "b": pd.Series([3, 4, 5])}
        fill_value_cudf = {
            "a": fill_value_pd["a"],
            "b": Series.from_pandas(fill_value_pd["b"]),
        }

    if not isinstance(fill_value_pd, pd.Series):
        expect = pdf.fillna(fill_value_pd)
    else:
        # https://github.com/pandas-dev/pandas/issues/27197
        # pandas DataFrame.fillna with a Series is not working, so the
        # expected frame is assembled column by column instead.
        expect = pd.DataFrame()
        for column_name in pdf.columns:
            expect[column_name] = pdf[column_name].fillna(fill_value_pd)

    got = gdf.fillna(fill_value_cudf, inplace=inplace)
    if inplace:
        # With inplace=True the filled data is checked on gdf itself.
        got = gdf

    assert_eq(expect, got)
def test_string_join_non_key_nulls(str_data_nulls):
    """Left-merge on an integer key where the right string column may hold nulls.

    NOTE(review): a second function with this exact name appears later in
    this file; if both live in the same module the later definition
    replaces this one -- confirm which copy is intended to run.
    """
    left_vals = ['a', 'b', 'c', 'd', 'e']
    left_keys = [1, 2, 3, 4, 5]
    # Right-hand keys are trimmed to the length of the parametrized data.
    right_keys = [6, 7, 8, 9, 10][:len(str_data_nulls)]

    # Left frames (pandas and cudf built the same way).
    pdf = pd.DataFrame()
    gdf = DataFrame()
    pdf['vals'] = pd.Series(left_vals, dtype='str')
    gdf['vals'] = Series(left_vals, dtype='str')
    pdf['key'] = left_keys
    gdf['key'] = left_keys

    # Right frames.
    pdf2 = pd.DataFrame()
    gdf2 = DataFrame()
    pdf2['vals'] = pd.Series(str_data_nulls, dtype='str')
    gdf2['vals'] = Series(str_data_nulls, dtype='str')
    pdf2['key'] = pd.Series(right_keys, dtype='int64')
    gdf2['key'] = Series(right_keys, dtype='int64')

    expect = pdf.merge(pdf2, on='key', how='left')
    got = gdf.merge(gdf2, on='key', how='left')

    both_empty = len(expect) == 0 and len(got) == 0
    if both_empty:
        # Normalise index and column order before comparing empty results.
        expect = expect.reset_index(drop=True)
        got = got[expect.columns]

    assert_eq(expect, got)
def test_categorical_compare_ordered():
    """Compare ordered categorical Series with ==, !=, < and > against pandas.

    NOTE(review): a second function with this name (taking a ``data``
    argument) appears later in this file; if both live in the same module
    the later definition replaces this one -- confirm.
    """
    cat1 = pd.Categorical(
        ['a', 'a', 'b', 'c', 'a'], categories=['a', 'b', 'c'], ordered=True
    )
    pdsr1 = pd.Series(cat1)
    sr1 = Series(cat1)

    cat2 = pd.Categorical(
        ['a', 'b', 'a', 'c', 'b'], categories=['a', 'b', 'c'], ordered=True
    )
    pdsr2 = pd.Series(cat2)
    sr2 = Series(cat2)

    # Equality of a series with itself: boolean dtype, all True.
    eq_result = sr1 == sr1
    assert eq_result.dtype == np.bool_
    assert type(eq_result[0]) == np.bool_
    assert np.all(eq_result)
    assert np.all(pdsr1 == pdsr1)

    # Inequality of a series with itself: nothing is True.
    ne_result = sr1 != sr1
    assert not np.any(ne_result)
    assert not np.any(pdsr1 != pdsr1)

    # Both libraries must report the categorical as ordered.
    assert pdsr1.cat.ordered
    assert sr1.cat.ordered

    # Ordering operators must agree element-wise with pandas.
    np.testing.assert_array_equal(pdsr1 < pdsr2, sr1 < sr2)
    np.testing.assert_array_equal(pdsr1 > pdsr2, sr1 > sr2)
def test_series_fillna_invalid_dtype(data_dtype):
    """``fillna`` with a float on the parametrized dtype must raise TypeError."""
    fill_value = 2.5
    gdf = Series([1, 2, None, 3], dtype=data_dtype)

    with pytest.raises(TypeError) as raises:
        gdf.fillna(fill_value)

    # The message names the NumPy scalar type of the fill value and of the
    # series dtype.
    fill_type_name = np.dtype(type(fill_value)).type.__name__
    series_type_name = gdf.dtype.type.__name__
    expected_message = "Cannot safely cast non-equivalent {} to {}".format(
        fill_type_name, series_type_name
    )
    raises.match(expected_message)
def test_validity_add(nelem):
    """Add two masked Series and check both the values and the null handling.

    Each operand is built from random data plus a random bitmask.  The
    expected result is the element-wise sum, with positions that are not
    valid in *both* operands (bitwise AND of the masks) replaced by a
    sentinel after ``fillna``.
    """
    np.random.seed(0)

    # LHS
    lhs_data = np.random.random(nelem)
    lhs_mask = utils.random_bitmask(nelem)
    lhs_bitmask = utils.expand_bits_to_bytes(lhs_mask)[:nelem]
    lhs_null_count = utils.count_zero(lhs_bitmask)
    assert lhs_null_count >= 0
    lhs = Series.from_masked_array(lhs_data, lhs_mask)
    assert lhs.null_count == lhs_null_count

    # RHS
    rhs_data = np.random.random(nelem)
    rhs_mask = utils.random_bitmask(nelem)
    rhs_bitmask = utils.expand_bits_to_bytes(rhs_mask)[:nelem]
    rhs_null_count = utils.count_zero(rhs_bitmask)
    assert rhs_null_count >= 0
    rhs = Series.from_masked_array(rhs_data, rhs_mask)
    assert rhs.null_count == rhs_null_count

    # Result
    res = lhs + rhs
    # ``np.bool_`` replaces the ``np.bool`` alias, which was removed in
    # NumPy 1.24; a real boolean array is required for the ``~`` below.
    res_mask = np.asarray(
        utils.expand_bits_to_bytes(lhs_mask & rhs_mask), dtype=np.bool_
    )[:nelem]

    # Fill NA values
    na_value = -10000
    got = res.fillna(na_value).to_array()
    expect = lhs_data + rhs_data
    expect[~res_mask] = na_value

    # Check
    print('expect')
    print(expect)
    print('got')
    print(got)

    np.testing.assert_array_equal(expect, got)
def test_series_unique():
    """``unique``/``nunique`` on a masked Series must ignore masked entries."""
    for exponent in range(5):
        size = 10 ** exponent
        arr = np.random.randint(low=-1, high=10, size=size)
        # Every -1 is treated as null via the validity mask.
        mask = arr != -1
        sr = Series.from_masked_array(arr, Series(mask).as_mask())

        assert set(arr[mask]) == set(sr.unique().to_array())
        assert len(set(arr[mask])) == sr.nunique()
def test_categorical_basic():
    """Check codes, dtype, ``.cat`` attributes and repr of a categorical."""
    cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"])
    cudf_cat = as_index(cat)

    pdsr = pd.Series(cat)
    sr = Series(cat)
    np.testing.assert_array_equal(cat.codes, sr.to_array())
    assert sr.dtype == pdsr.dtype

    # Test attributes
    assert tuple(pdsr.cat.categories) == tuple(sr.cat.categories)
    assert pdsr.cat.ordered == sr.cat.ordered
    pd_codes = pdsr.cat.codes
    gd_codes = sr.cat.codes
    np.testing.assert_array_equal(pd_codes.values, gd_codes.to_array())
    np.testing.assert_array_equal(pd_codes.dtype, gd_codes.dtype)

    # Compare the printed form token by token; only as many tokens as the
    # shorter of the two sequences are checked.
    string = str(sr)
    expect_str = """ 0 a 1 a 2 b 3 c 4 a """
    for actual_token, expected_token in zip(string.split(),
                                            expect_str.split()):
        assert actual_token == expected_token

    assert_eq(cat.codes, cudf_cat.codes.to_array())
def test_string_astype(dtype):
    """Cast a string Series to ``dtype`` and compare with pandas.

    The input strings are chosen from the prefix of ``dtype``.
    """
    if dtype.startswith("int"):
        data = ["1", "2", "3", "4", "5"]
    elif dtype.startswith("float"):
        data = ["1.0", "2.0", "3.0", "4.0", "5.0"]
    elif dtype.startswith("bool"):
        data = ["True", "False", "True", "False", "False"]
    elif dtype.startswith("datetime64"):
        data = [
            "2019-06-04T00:00:00Z",
            "2019-06-04T12:12:12Z",
            "2019-06-03T00:00:00Z",
            "2019-05-04T00:00:00Z",
            "2018-06-04T00:00:00Z",
        ]
    elif dtype in ("str", "object"):
        data = ["ab", "cd", "ef", "gh", "ij"]

    ps = pd.Series(data)
    gs = Series(data)

    if not dtype.startswith("bool"):
        expect = ps.astype(dtype)
    else:
        # Pandas str --> bool typecasting always returns True if there's a
        # string, so the expectation is built by comparing with "True".
        expect = ps == "True"

    got = gs.astype(dtype)
    assert_eq(expect, got)
def test_reflected_ops_scalar(func, dtype, obj_class):
    """Apply ``func`` to a cudf Series/Index and to pandas; compare results."""
    import pandas as pd

    # Deterministic random input, offset by 10.
    np.random.seed(12)
    random_series = pd.Series(np.random.sample(100) + 10, dtype=dtype)

    is_index = obj_class == 'Index'

    # GPU object: a Series, or an index built from it for the Index case.
    gs = Series(random_series)
    if is_index:
        gs = as_index(gs)

    gs_result = func(gs)

    # NOTE(review): `gs` is not read again after this conversion; the call
    # is kept so the executed statements stay the same.
    if is_index:
        gs = Series(gs)

    # pandas reference
    ps_result = func(random_series)

    # verify
    np.testing.assert_allclose(ps_result, gs_result)
def test_generic_ptx(dtype):
    """Compile a device function to PTX and apply it as a binary-op UDF."""
    size = 500

    lhs_arr = np.random.random(size).astype(dtype)
    lhs_col = Series(lhs_arr)._column

    rhs_arr = np.random.random(size).astype(dtype)
    rhs_col = Series(rhs_arr)._column

    @numba.cuda.jit(device=True)
    def generic_function(a, b):
        return a**3 + b

    # Both arguments use the numba type that corresponds to ``dtype``.
    nb_type = numba.numpy_support.from_dtype(np.dtype(dtype))
    type_signature = (nb_type, nb_type)

    compiled = generic_function.compile(type_signature)
    ptx_code = generic_function.inspect_ptx(type_signature).decode("utf-8")

    # The output dtype is taken from the compiled signature's return type.
    output_type = numba.numpy_support.as_dtype(
        compiled.signature.return_type
    )

    out_col = binops.apply_op_udf(
        lhs_col, rhs_col, ptx_code, output_type.type
    )

    expected = lhs_arr**3 + rhs_arr
    np.testing.assert_almost_equal(expected, out_col)
def test_string_join_non_key_nulls(str_data_nulls):
    """Left-merge two frames on ``key`` where the right strings may be null."""
    str_data = ["a", "b", "c", "d", "e"]
    other_data = [1, 2, 3, 4, 5]
    # The right-hand keys are cut to the length of the parametrized data.
    other_data_nulls = [6, 7, 8, 9, 10][:len(str_data_nulls)]

    # pandas side
    pdf = pd.DataFrame()
    pdf2 = pd.DataFrame()
    pdf["vals"] = pd.Series(str_data, dtype="str")
    pdf["key"] = other_data
    pdf2["vals"] = pd.Series(str_data_nulls, dtype="str")
    pdf2["key"] = pd.Series(other_data_nulls, dtype="int64")

    # cudf side, built with the same columns in the same order
    gdf = DataFrame()
    gdf2 = DataFrame()
    gdf["vals"] = Series(str_data, dtype="str")
    gdf["key"] = other_data
    gdf2["vals"] = Series(str_data_nulls, dtype="str")
    gdf2["key"] = Series(other_data_nulls, dtype="int64")

    expect = pdf.merge(pdf2, on="key", how="left")
    got = gdf.merge(gdf2, on="key", how="left")

    if len(expect) == 0 and len(got) == 0:
        # Align index and column order when both results are empty.
        expect = expect.reset_index(drop=True)
        got = got[expect.columns]

    assert_eq(expect, got)
def test_string_equality():
    """String ``==`` against another Series and against a scalar."""
    data1 = ["b", "c", "d", "a", "c"]
    data2 = ["a", None, "c", "a", "c"]

    ps1, ps2 = pd.Series(data1), pd.Series(data2)
    gs1, gs2 = Series(data1), Series(data2)

    # Series vs Series; nulls in the cudf result are filled with False
    # before comparing with pandas.
    expect = ps1 == ps2
    got = gs1 == gs2
    assert_eq(expect, got.fillna(False))

    # Series vs scalar
    expect = ps1 == "m"
    got = gs1 == "m"
    assert_eq(expect, got.fillna(False))

    # Single-element series vs scalar, compared without fillna
    ps1 = pd.Series(["a"])
    gs1 = Series(["a"])
    expect = ps1 == "m"
    got = gs1 == "m"
    assert_eq(expect, got)
def test_series_groupby_agg(agg):
    """``Series.groupby(...).agg(agg)`` must match pandas."""
    values = [1, 2, 3]
    s = pd.Series(values)
    g = Series(values)

    pandas_result = s.groupby(s // 2).agg(agg)
    cudf_result = g.groupby(g // 2).agg(agg)

    # dtype is not compared for 'count'.
    check_dtype = agg != 'count'
    assert_eq(pandas_result, cudf_result, check_dtype=check_dtype)
def test_dataframe_setitem_from_masked_object():
    """Missing values must yield a null mask for several construction paths."""
    ary = np.random.randn(100)
    # Put NaN at 20 randomly chosen positions.
    mask = np.zeros(100, dtype=bool)
    mask[:20] = True
    np.random.shuffle(mask)
    ary[mask] = np.nan

    # Series from a host array
    test1 = Series(ary)
    assert test1.has_null_mask
    assert test1.null_count == 20

    # DataFrame from a pandas DataFrame
    test2 = DataFrame.from_pandas(pd.DataFrame({'a': ary}))
    column_a = test2['a']
    assert column_a.has_null_mask
    assert column_a.null_count == 20

    # Series from an array moved with rmm.to_device
    gpu_ary = rmm.to_device(ary)
    test3 = Series(gpu_ary)
    assert test3.has_null_mask
    assert test3.null_count == 20

    # Column assigned from a Python list containing None
    test4 = DataFrame()
    lst = [1, 2, None, 4, 5, 6, None, 8, 9]
    test4['lst'] = lst
    assert test4['lst'].has_null_mask
    assert test4['lst'].null_count == 2
def test_categorical_compare_ordered(data):
    """Compare partitioned ordered categoricals with ==, !=, < and >."""
    cat1, cat2 = data[0], data[1]

    pdsr1 = pd.Series(cat1)
    pdsr2 = pd.Series(cat2)

    sr1 = Series(cat1)
    sr2 = Series(cat2)

    # Two-partition collections built from the cudf series.
    dsr1 = dgd.from_cudf(sr1, npartitions=2)
    dsr2 = dgd.from_cudf(sr2, npartitions=2)

    # Test equality
    eq_result = dsr1 == dsr1
    assert eq_result.dtype == np.bool_
    assert np.all(eq_result.compute().to_array())
    assert np.all(pdsr1 == pdsr1)

    # Test inequality
    ne_result = dsr1 != dsr1
    assert not np.any(ne_result.compute().to_array())
    assert not np.any(pdsr1 != pdsr1)

    assert dsr1.cat.ordered
    assert pdsr1.cat.ordered

    # Test ordered operators
    less_than = (dsr1 < dsr2).compute()
    np.testing.assert_array_equal(pdsr1 < pdsr2, less_than)
    greater_than = (dsr1 > dsr2).compute()
    np.testing.assert_array_equal(pdsr1 > pdsr2, greater_than)
def test_string_numeric_astype(dtype):
    """Cast a numeric/datetime Series to ``"str"`` and compare with pandas."""
    is_datetime = dtype.startswith("datetime64")

    if dtype.startswith("bool"):
        data = [1, 0, 1, 0, 1]
    elif dtype.startswith("int"):
        data = [1, 2, 3, 4, 5]
    elif dtype.startswith("float"):
        data = [1.0, 2.0, 3.0, 4.0, 5.0]
    elif is_datetime:
        data = [1000000000, 2000000000, 3000000000, 4000000000, 5000000000]

    if is_datetime:
        ps = pd.Series(data, dtype="datetime64[ns]")
        gs = Series.from_pandas(ps)
        # Pandas datetime64 --> str typecasting returns arbitrary format
        # depending on the data, so making it consistent unless we choose
        # to match the behavior
        expect = ps.dt.strftime("%Y-%m-%dT%H:%M:%SZ")
    else:
        ps = pd.Series(data, dtype=dtype)
        gs = Series(data, dtype=dtype)
        expect = ps.astype("str")

    got = gs.astype("str")
    assert_eq(expect, got)
def test_series_std(ddof):
    """``Series.std`` must approximately match pandas for the given ``ddof``."""
    np.random.seed(0)
    # Uniform samples shifted by -0.5.
    samples = np.random.random(100) - 0.5
    sr = Series(samples)
    pandas_series = sr.to_pandas()

    got = sr.std(ddof=ddof)
    expect = pandas_series.std(ddof=ddof)

    np.testing.assert_approx_equal(expect, got)
def test_max(dtype, nelem):
    """``Series.max`` must equal the maximum of the source data, as ``dtype``."""
    data = gen_rand(dtype, nelem)

    got = Series(data).max()
    expect = dtype(data.max())

    assert expect == got
def test_typecast_to_from_datetime(data, from_dtype, to_dtype):
    """Cast ``from_dtype`` -> ``to_dtype`` -> ``from_dtype``; compare to NumPy."""
    np_data = data.astype(from_dtype)
    gdf_data = Series(np_data)

    # Same two-step cast on the NumPy array and on the Series.
    np_round_trip = np_data.astype(to_dtype).astype(from_dtype)
    gdf_round_trip = gdf_data.astype(to_dtype).astype(from_dtype)

    np.testing.assert_equal(np_round_trip, np.array(gdf_round_trip))
def test_series_compare(cmpop):
    """``cmpop`` on Series pairs must match the same op on the source arrays."""
    arr1 = np.random.random(100)
    arr2 = np.random.random(100)
    sr1 = Series(arr1)
    sr2 = Series(arr2)

    # (series lhs, series rhs, array lhs, array rhs), checked in order.
    cases = [
        (sr1, sr1, arr1, arr1),
        (sr2, sr2, arr2, arr2),
        (sr1, sr2, arr1, arr2),
    ]
    for sr_lhs, sr_rhs, arr_lhs, arr_rhs in cases:
        np.testing.assert_equal(
            cmpop(sr_lhs, sr_rhs).to_array(), cmpop(arr_lhs, arr_rhs)
        )
def test_series_groupby(agg):
    """Call the aggregation named ``agg`` on grouped Series; compare to pandas."""
    values = [1, 2, 3]
    s = pd.Series(values)
    g = Series(values)

    pandas_groups = s.groupby(s // 2)
    cudf_groups = g.groupby(g // 2)

    # ``agg`` is a method name resolved on each groupby object.
    pandas_result = getattr(pandas_groups, agg)()
    cudf_result = getattr(cudf_groups, agg)()

    assert_eq(pandas_result, cudf_result)
def test_typecast_from_datetime_to_datetime(data, from_dtype, to_dtype):
    """Cast a Series' underlying column to ``to_dtype``; compare to NumPy."""
    np_data = data.astype(from_dtype)
    # The cast is done on the ``_column`` attribute of the Series.
    gdf_col = Series(np_data)._column

    expected = np_data.astype(to_dtype)
    actual = gdf_col.astype(to_dtype).to_array()

    np.testing.assert_equal(expected, actual)
def func(index):
    """Apply ``binop`` to an int32/float Series pair and compare with NumPy.

    ``index`` is accepted but not read in the body.
    """
    # NOTE(review): `binop` is not defined in this block; presumably it is
    # supplied by an enclosing scope -- confirm.
    values = np.random.random(100) * 10
    sr = Series(values)

    result = binop(sr.astype('int32'), sr)
    expect = binop(values.astype('int32'), values)

    np.testing.assert_almost_equal(result.to_array(), expect, decimal=5)
def test_typecast_to_datetime(data, dtype):
    """Cast ``dtype`` data to ``datetime64[ms]``; compare Series with NumPy."""
    target_dtype = 'datetime64[ms]'

    np_data = data.astype(dtype)
    gdf_data = Series(np_data)

    expected = np_data.astype(target_dtype)
    actual = gdf_data.astype(target_dtype)

    np.testing.assert_equal(expected, np.array(actual))
def test_sum(dtype, nelem):
    """``Series.sum`` must approximately equal the sum of the source data."""
    data = gen_rand(dtype, nelem)

    got = Series(data).sum()
    expect = dtype(data.sum())

    # float32 is compared to 4 significant digits, every other dtype to 6.
    if dtype == np.float32:
        significant = 4
    else:
        significant = 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
def test_misc_quantiles(data, q):
    """``Series.quantile(q)`` must match pandas on the same data."""
    from cudf.tests import utils

    expected = pd.Series(data).quantile(q)
    actual = Series(data).quantile(q)

    utils.assert_eq(expected, actual)
def test_series_sort_index(nelem, asc):
    """Sorting by value and then by index must give back the original order."""
    np.random.seed(0)
    sr = Series(100 * np.random.random(nelem))
    orig = sr.to_array()

    by_value = sr.sort_values()
    got = by_value.sort_index(ascending=asc).to_array()
    if not asc:
        # Reverse the array for descending sort
        got = got[::-1]

    np.testing.assert_array_equal(orig, got)
def test_string_unique(item):
    """``Series.unique`` on strings must match sorted pandas uniques."""
    ps = pd.Series(item)
    gs = Series(item)

    # Pandas `unique` returns a numpy array, so wrap it in a Series first.
    pandas_unique = pd.Series(ps.unique())
    # Nvstrings returns sorted unique with `None` placed before other
    # strings, so the pandas result is put into the same order.
    pandas_unique = pandas_unique.sort_values(na_position="first")
    pandas_unique = pandas_unique.reset_index(drop=True)

    cudf_unique = gs.unique()
    assert_eq(pandas_unique, cudf_unique)
def test_string_empty_astype(dtype):
    """Casting an empty string Series to ``dtype`` must match pandas."""
    empty = []
    ps = pd.Series(empty, dtype="str")
    gs = Series(empty, dtype="str")

    assert_eq(ps.astype(dtype), gs.astype(dtype))