def test_operator_func_between_series_logical(
    dtype, func, scalar_a, scalar_b, fill_value
):
    gdf_series_a = Series([scalar_a]).astype(dtype)
    gdf_series_b = Series([scalar_b]).astype(dtype)
    pdf_series_a = gdf_series_a.to_pandas()
    pdf_series_b = gdf_series_b.to_pandas()

    gdf_series_result = getattr(gdf_series_a, func)(
        gdf_series_b, fill_value=fill_value
    )
    pdf_series_result = getattr(pdf_series_a, func)(
        pdf_series_b, fill_value=fill_value
    )

    if scalar_a in [None, np.nan] and scalar_b in [None, np.nan]:
        # cudf binary operations return `None` when both the left- and
        # right-hand values are `None`, and `np.nan` when either side is
        # `np.nan`. As a consequence, converting the gdf to a pdf during
        # assert_eq yields a pdf with dtype='object' (all inputs are null).
        # To account for this, fill the nulls first ("ne" expects True).
        gdf_series_result.fillna(func == "ne", inplace=True)

    utils.assert_eq(pdf_series_result, gdf_series_result)
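# A minimal sketch (not part of the suite) of the null semantics the test
# above works around. It assumes the legacy cudf behavior described in the
# comment; `Series` is cudf's Series, as elsewhere in this file.
def _demo_null_binop_semantics():
    lhs = Series([None, np.nan, 1.0]).astype("float64")
    rhs = Series([None, 2.0, 1.0]).astype("float64")
    # Element 0: None + None -> null
    # Element 1: NaN + 2.0   -> NaN
    # Element 2: 1.0 + 1.0   -> 2.0
    out = lhs.add(rhs, fill_value=None)
    # An all-null result converts to dtype='object' in pandas, which is
    # why the test above calls fillna before assert_eq.
    return out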
def test_typecast_on_join_int_to_int(dtype_l, dtype_r):
    other_data = ["a", "b", "c"]
    join_data_l = Series([1, 2, 3], dtype=dtype_l)
    join_data_r = Series([1, 2, 4], dtype=dtype_r)

    gdf_l = DataFrame({"join_col": join_data_l, "B": other_data})
    gdf_r = DataFrame({"join_col": join_data_r, "B": other_data})

    exp_dtype = np.find_common_type([], [np.dtype(dtype_l), np.dtype(dtype_r)])

    exp_join_data = [1, 2]
    exp_other_data = ["a", "b"]
    exp_join_col = Series(exp_join_data, dtype=exp_dtype)

    expect = DataFrame(
        {
            "join_col": exp_join_col,
            "B_x": exp_other_data,
            "B_y": exp_other_data,
        }
    )

    got = gdf_l.merge(gdf_r, on="join_col", how="inner")

    assert_eq(expect, got)
def test_string_groupby_key_index():
    str_data = ["a", "b", "c", "d", "e"]
    other_data = [1, 2, 3, 4, 5]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    pdf["a"] = pd.Series(str_data, dtype="str")
    gdf["a"] = Series(str_data, dtype="str")
    pdf["b"] = other_data
    gdf["b"] = other_data

    expect = pdf.groupby("a").count()
    got = gdf.groupby("a").count()

    assert_eq(expect, got, check_dtype=False)
def test_typecast_on_join_no_float_round():
    other_data = ["a", "b", "c", "d", "e"]

    join_data_l = Series([1, 2, 3, 4, 5], dtype="int8")
    join_data_r = Series([1, 2, 3, 4.01, 4.99], dtype="float32")

    gdf_l = DataFrame({"join_col": join_data_l, "B": other_data})
    gdf_r = DataFrame({"join_col": join_data_r, "B": other_data})

    exp_join_data = [1, 2, 3, 4, 5]
    exp_Bx = ["a", "b", "c", "d", "e"]
    exp_By = ["a", "b", "c", None, None]
    exp_join_col = Series(exp_join_data, dtype="float32")

    expect = DataFrame(
        {"join_col": exp_join_col, "B_x": exp_Bx, "B_y": exp_By}
    )

    got = gdf_l.merge(gdf_r, on="join_col", how="left")

    assert_eq(expect, got)
def test_typecast_on_join_indexes():
    join_data_l = Series([1, 2, 3, 4, 5], dtype="int8")
    join_data_r = Series([1, 2, 3, 4, 6], dtype="int32")
    other_data = ["a", "b", "c", "d", "e"]

    gdf_l = DataFrame({"join_col": join_data_l, "B": other_data})
    gdf_r = DataFrame({"join_col": join_data_r, "B": other_data})

    gdf_l = gdf_l.set_index("join_col")
    gdf_r = gdf_r.set_index("join_col")

    exp_join_data = [1, 2, 3, 4]
    exp_other_data = ["a", "b", "c", "d"]

    expect = DataFrame(
        {
            "join_col": exp_join_data,
            "B_x": exp_other_data,
            "B_y": exp_other_data,
        }
    )
    expect = expect.set_index("join_col")

    got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y")

    assert_eq(expect, got)
def test_onehot_generic_index():
    np.random.seed(0)
    size = 33
    indices = np.random.randint(low=0, high=100, size=size)
    df = DataFrame()
    values = np.random.randint(low=0, high=4, size=size)
    df["fo"] = Series(values, index=GenericIndex(indices))

    out = df.one_hot_encoding(
        "fo", cats=df.fo.unique(), prefix="fo", dtype=np.int32
    )

    assert set(out.columns) == {"fo", "fo_0", "fo_1", "fo_2", "fo_3"}
    np.testing.assert_array_equal(values == 0, out.fo_0.to_array())
    np.testing.assert_array_equal(values == 1, out.fo_1.to_array())
    np.testing.assert_array_equal(values == 2, out.fo_2.to_array())
    np.testing.assert_array_equal(values == 3, out.fo_3.to_array())
def test_applymap(dtype):
    size = 500
    lhs_arr = np.random.random(size).astype(dtype)
    lhs_col = Series(lhs_arr)._column

    def generic_function(a):
        return a ** 3

    out_col = lhs_col.applymap(generic_function)
    result = lhs_arr ** 3

    np.testing.assert_almost_equal(result, out_col.to_array())
def test_string_split(data, pat, n, expand, expand_raise):
    if data in (["a b", " c ", " d", "e ", "f"],) and pat is None:
        pytest.xfail("None pattern split algorithm not implemented yet")

    ps = pd.Series(data, dtype="str")
    gs = Series(data, dtype="str")

    expectation = raise_builder([expand_raise], NotImplementedError)

    with expectation:
        expect = ps.str.split(pat=pat, n=n, expand=expand)
        got = gs.str.split(pat=pat, n=n, expand=expand)

        assert_eq(expect, got)
def array_to_series(array):
    # A chunked array becomes the concatenation of its converted chunks.
    if isinstance(array, pa.ChunkedArray):
        return Series._concat(
            [array_to_series(chunk) for chunk in array.chunks]
        )

    array_len = len(array)
    null_count = array.null_count

    # Arrow buffer layout: buffers[0] is the validity bitmask and
    # buffers[1] the data buffer (offsets, in the case of strings).
    buffers = make_device_arrays(array)
    mask, data = buffers[0], buffers[1]
    dtype = arrow_to_pandas_dtype(array.type)

    if pa.types.is_dictionary(array.type):
        from cudf.core.column import CategoricalColumn

        # Dictionary arrays map to categorical columns: the indices
        # become codes and the dictionary becomes the categories.
        codes = array_to_series(array.indices)
        categories = array_to_series(array.dictionary)
        data = CategoricalColumn(
            data=codes.data,
            mask=mask,
            null_count=null_count,
            categories=categories,
            ordered=array.type.ordered,
        )
    elif pa.types.is_string(array.type):
        import nvstrings

        # For strings, buffers[1] holds the offsets and buffers[2] the
        # character data; honor any slice offset on the arrow array.
        offs, data = buffers[1], buffers[2]
        offs = offs[array.offset:array.offset + array_len + 1]
        data = None if data is None else data.device_ctypes_pointer.value
        mask = None if mask is None else mask.device_ctypes_pointer.value
        data = nvstrings.from_offsets(
            data,
            offs.device_ctypes_pointer.value,
            array_len,
            mask,
            null_count,
            True,
        )
    elif data is not None:
        # Fixed-width data: apply the arrow array's slice offset.
        data = data[array.offset:array.offset + len(array)]

    series = Series(data, dtype=dtype)

    # Attach the validity mask if the column did not pick it up already.
    if null_count > 0 and mask is not None and not series.has_null_mask:
        return series.set_mask(mask, null_count)

    return series
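# Illustrative usage of array_to_series (a sketch, not part of the module;
# it assumes pyarrow is importable as `pa`, as above):
#
#   arr = pa.array([1, 2, None, 4], type=pa.int32())
#   sr = array_to_series(arr)            # nulls carried over via the bitmask
#
#   dict_arr = pa.array(["a", "b", "a"]).dictionary_encode()
#   cat_sr = array_to_series(dict_arr)   # becomes a categorical Series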
def test_product(dtype, nelem):
    if np.dtype(dtype).kind == "i":
        data = np.ones(nelem, dtype=dtype)
        # Set at most 30 items to [0..2). All other entries stay 1, so the
        # product is bounded by 2**30, keeping the value within 2**32.
        for _ in range(30):
            data[random.randrange(nelem)] = random.random() * 2
    else:
        data = gen_rand(dtype, nelem)

    sr = Series(data)

    got = sr.product()
    expect = np.product(data)

    significant = 4 if dtype == np.float32 else 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
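# Worked check of the bound the comment above relies on (illustrative,
# not part of the suite): at most 30 factors, each strictly below 2, with
# every remaining entry equal to 1.
def _demo_product_bound():
    worst_case = np.full(30, np.nextafter(2.0, 0.0))  # 30 factors, each < 2
    assert worst_case.prod() < 2 ** 30 < 2 ** 32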
def test_exact_quantiles_int(int_method):
    arr = np.asarray([7, 0, 3, 4, 2, 1, -1, 1, 6])
    quant_values = [0.0, 0.25, 0.33, 0.5, 1.0]

    df = pd.DataFrame(arr)
    gdf_series = Series(arr)

    q1 = gdf_series.quantile(
        quant_values, interpolation=int_method, exact=True
    )
    q2 = df.quantile(quant_values, interpolation=int_method)

    np.testing.assert_allclose(
        q1.to_pandas().values, np.array(q2.values).T.flatten(), rtol=1e-10
    )
def test_exact_quantiles(int_method):
    arr = np.asarray([6.8, 0.15, 3.4, 4.17, 2.13, 1.11, -1.01, 0.8, 5.7])
    quant_values = [0.0, 0.25, 0.33, 0.5, 1.0]

    df = pd.DataFrame(arr)
    gdf_series = Series(arr)

    q1 = gdf_series.quantile(
        quant_values, interpolation=int_method, exact=True
    )
    q2 = df.quantile(quant_values, interpolation=int_method)

    np.testing.assert_allclose(
        q1.to_pandas().values, np.array(q2.values).T.flatten(), rtol=1e-10
    )
def test_applymap_python_lambda(dtype):
    size = 500
    lhs_arr = np.random.random(size).astype(dtype)
    lhs_ser = Series(lhs_arr)

    # Note that the lambda has to be written this way; in other words,
    # the following does NOT compile with numba:
    #
    #   test_list = [1, 2, 3, 4]
    #   out_ser = lhs_ser.applymap(lambda x: x in test_list)
    out_ser = lhs_ser.applymap(lambda x: x in [1, 2, 3, 4])

    result = np.isin(lhs_arr, [1, 2, 3, 4])

    np.testing.assert_almost_equal(result, out_ser.to_array())
def test_datetime_scalar_timeunit_cast(timeunit):
    testscalar = np.datetime64("2016-11-20", timeunit)

    gs = Series(testscalar)
    ps = pd.Series(testscalar)
    assert_eq(ps, gs)

    gdf = DataFrame()
    gdf["a"] = np.arange(5)
    gdf["b"] = testscalar

    pdf = pd.DataFrame()
    pdf["a"] = np.arange(5)
    pdf["b"] = testscalar

    assert_eq(pdf, gdf)
def test_string_replace_with_backrefs(find, replace):
    s = [
        "A543",
        "Z756",
        "",
        None,
        "tést-string",
        "two-thréé four-fivé",
        "abcd-éfgh",
        "tést-string-again",
    ]
    ps = pd.Series(s)
    gs = Series(s)

    got = gs.str.replace_with_backrefs(find, replace)
    expected = ps.str.replace(find, replace, regex=True)

    assert_eq(got, expected)
def test_sum_of_squares(dtype, nelem):
    data = gen_rand(dtype, nelem)
    sr = Series(data)

    got = sr.sum_of_squares()
    expect = (data ** 2).sum()

    if np.dtype(dtype).kind == "i":
        if 0 <= expect <= np.iinfo(dtype).max:
            np.testing.assert_array_almost_equal(expect, got)
        else:
            print("overflow, passing")
    else:
        np.testing.assert_approx_equal(
            expect, got, significant=accuracy_for_dtype[dtype]
        )
def test_strings_rsplit(data, n, expand):
    gs = Series(data)
    ps = pd.Series(data)

    pd.testing.assert_frame_equal(
        ps.str.rsplit(n=n, expand=expand).reset_index(),
        gs.str.rsplit(n=n, expand=expand).to_pandas().reset_index(),
        check_index_type=False,
    )
    assert_eq(
        ps.str.rsplit(",", n=n, expand=expand),
        gs.str.rsplit(",", n=n, expand=expand),
    )
    assert_eq(
        ps.str.rsplit("-", n=n, expand=expand),
        gs.str.rsplit("-", n=n, expand=expand),
    )
def test_string_slice_str(string, number, diff):
    pds = pd.Series(string)
    gds = Series(string)

    assert_eq(pds.str.slice(start=number), gds.str.slice(start=number))
    assert_eq(pds.str.slice(stop=number), gds.str.slice(stop=number))
    assert_eq(pds.str.slice(), gds.str.slice())
    assert_eq(
        pds.str.slice(start=number, stop=number + diff),
        gds.str.slice(start=number, stop=number + diff),
    )
    if diff != 0:
        assert_eq(pds.str.slice(step=diff), gds.str.slice(step=diff))
        assert_eq(
            pds.str.slice(start=number, stop=number + diff, step=diff),
            gds.str.slice(start=number, stop=number + diff, step=diff),
        )
def test_string_groupby_key(str_data, num_keys):
    other_data = [1, 2, 3, 4, 5][: len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for i in range(num_keys):
        pdf[i] = pd.Series(str_data, dtype="str")
        gdf[i] = Series(str_data, dtype="str")
    pdf["a"] = other_data
    gdf["a"] = other_data

    expect = pdf.groupby(list(range(num_keys)), as_index=False).count()
    got = gdf.groupby(list(range(num_keys)), as_index=False).count()

    expect = expect.sort_values([0]).reset_index(drop=True)
    got = got.sort_values([0]).reset_index(drop=True)

    assert_eq(expect, got, check_dtype=False)
def test_string_char_case(case_op, data):
    gs = Series(data)
    ps = pd.Series(data)

    assert_eq(getattr(gs.str, case_op)(), getattr(ps.str, case_op)())

    assert_eq(gs.str.capitalize(), ps.str.capitalize())
    assert_eq(gs.str.isdecimal(), ps.str.isdecimal())
    assert_eq(gs.str.isalnum(), ps.str.isalnum())
    assert_eq(gs.str.isalpha(), ps.str.isalpha())
    assert_eq(gs.str.isdigit(), ps.str.isdigit())
    assert_eq(gs.str.isnumeric(), ps.str.isnumeric())
    assert_eq(gs.str.isspace(), ps.str.isspace())

    assert_eq(gs.str.isempty(), ps == "")
def test_series_reductions_concurrency(method):
    from concurrent.futures import ThreadPoolExecutor

    e = ThreadPoolExecutor(10)

    np.random.seed(0)
    srs = [Series(np.random.random(10000)) for _ in range(1)]

    def call_test(sr):
        fn = getattr(sr, method)
        if method in ["std", "var"]:
            return fn(ddof=1)
        else:
            return fn()

    def f(sr):
        return call_test(sr + 1)

    list(e.map(f, srs * 50))
def test_product(dtype, nelem):
    np.random.seed(0)
    dtype = np.dtype(dtype).type
    if np.dtype(dtype).kind in {"u", "i"}:
        data = np.ones(nelem, dtype=dtype)
        # Set at most 30 items to [0..2). All other entries stay 1, so the
        # product is bounded by 2**30, keeping the value within 2**32.
        for _ in range(30):
            data[np.random.randint(low=0, high=nelem, size=1)] = (
                np.random.uniform() * 2
            )
    else:
        data = gen_rand(dtype, nelem)

    sr = Series(data)

    got = sr.product()
    expect = np.product(data)

    significant = 4 if dtype == np.float32 else 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
def test_strings_filling_tests(data, width, fillchar):
    gs = Series(data)
    ps = pd.Series(data)

    # TODO: uncomment the .str.center tests once this is fixed:
    # https://github.com/rapidsai/cudf/issues/4354
    # (.str.center is nothing but .str.pad(side="both"))
    # assert_eq(
    #     ps.str.center(width=width, fillchar=fillchar),
    #     gs.str.center(width=width, fillchar=fillchar),
    # )
    assert_eq(
        ps.str.ljust(width=width, fillchar=fillchar),
        gs.str.ljust(width=width, fillchar=fillchar),
    )
    assert_eq(
        ps.str.rjust(width=width, fillchar=fillchar),
        gs.str.rjust(width=width, fillchar=fillchar),
    )
def test_string_str_rindex(data, sub, er):
    ps = pd.Series(data)
    gs = Series(data)

    if er is None:
        assert_eq(ps.str.rindex(sub), gs.str.rindex(sub), check_dtype=False)

    try:
        ps.str.rindex(sub)
    except er:
        pass
    else:
        assert not er

    try:
        gs.str.rindex(sub)
    except er:
        pass
    else:
        assert not er
def test_series_nlargest(data, n):
    """Indirectly tests Series.sort_values()."""
    sr = Series(data)
    psr = pd.Series(data)

    assert_eq(sr.nlargest(n), psr.nlargest(n))
    assert_eq(sr.nlargest(n, keep="last"), psr.nlargest(n, keep="last"))

    assert_exceptions_equal(
        lfunc=psr.nlargest,
        rfunc=sr.nlargest,
        lfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}),
        rfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}),
        expected_error_message='keep must be either "first", "last"',
    )
def test_string_replace_multi():
    ps = pd.Series(["hello", "goodbye"])
    gs = Series(["hello", "goodbye"])
    expect = ps.str.replace("e", "E").str.replace("o", "O")
    got = gs.str.replace(["e", "o"], ["E", "O"])
    assert_eq(expect, got)

    ps = pd.Series(["foo", "fuz", np.nan])
    gs = Series.from_pandas(ps)
    expect = ps.str.replace("f.", "ba", regex=True)
    got = gs.str.replace(["f."], ["ba"], regex=True)
    assert_eq(expect, got)

    ps = pd.Series(["f.o", "fuz", np.nan])
    gs = Series.from_pandas(ps)
    expect = ps.str.replace("f.", "ba", regex=False)
    got = gs.str.replace(["f."], ["ba"], regex=False)
    assert_eq(expect, got)
def test_string_groupby_non_key(str_data, num_cols, agg):
    other_data = [1, 2, 3, 4, 5][: len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for i in range(num_cols):
        pdf[i] = pd.Series(str_data, dtype="str")
        gdf[i] = Series(str_data, dtype="str")
    pdf["a"] = other_data
    gdf["a"] = other_data

    expect = getattr(pdf.groupby("a", as_index=False), agg)()
    got = getattr(gdf.groupby("a", as_index=False), agg)()

    expect = expect.sort_values(["a"]).reset_index(drop=True)
    got = got.sort_values(["a"]).reset_index(drop=True)

    if agg in ["min", "max"] and len(expect) == 0 and len(got) == 0:
        for i in range(num_cols):
            expect[i] = expect[i].astype("str")

    assert_eq(expect, got, check_dtype=False)
def test_categorical_masking():
    """
    Test the common operation of getting all rows that match a certain
    category.
    """
    cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"])
    pdsr = pd.Series(cat)
    sr = Series(cat)

    # check scalar comparison
    expect_matches = pdsr == "a"
    got_matches = sr == "a"

    np.testing.assert_array_equal(
        expect_matches.values, got_matches.to_array()
    )

    # mask series
    expect_masked = pdsr[expect_matches]
    got_masked = sr[got_matches]

    assert len(expect_masked) == len(got_masked)
    assert len(expect_masked) == got_masked.valid_count
    assert_eq(got_masked, expect_masked)
def test_categorical_integer():
    cat = pd.Categorical(["a", "_", "_", "c", "a"], categories=["a", "b", "c"])
    pdsr = pd.Series(cat)
    sr = Series(cat)
    np.testing.assert_array_equal(cat.codes, sr.to_array(fillna="pandas"))
    assert sr.null_count == 2

    np.testing.assert_array_equal(
        pdsr.cat.codes.values, sr.cat.codes.fillna(-1).to_array()
    )
    np.testing.assert_equal(pdsr.cat.codes.dtype, sr.cat.codes.dtype)

    string = str(sr)
    expect_str = """
0       a
1    null
2    null
3       c
4       a
dtype: category
Categories (3, object): [a, b, c]
"""
    assert string.split() == expect_str.split()
def test_string_slice_replace(string, number, diff, repr):
    pds = pd.Series(string)
    gds = Series(string)

    assert_eq(
        pds.str.slice_replace(start=number, repl=repr),
        gds.str.slice_replace(start=number, repl=repr),
        check_dtype=False,
    )
    assert_eq(
        pds.str.slice_replace(stop=number, repl=repr),
        gds.str.slice_replace(stop=number, repl=repr),
    )
    assert_eq(pds.str.slice_replace(), gds.str.slice_replace())
    assert_eq(
        pds.str.slice_replace(start=number, stop=number + diff),
        gds.str.slice_replace(start=number, stop=number + diff),
    )
    assert_eq(
        pds.str.slice_replace(start=number, stop=number + diff, repl=repr),
        gds.str.slice_replace(start=number, stop=number + diff, repl=repr),
        check_dtype=False,
    )