def test_series_rank_combinations(elem, dtype):
    """Check that ``Series.rank(method="first")`` agrees with pandas.

    ``elem`` is an iterable of iterables; it is flattened into one float64
    array and cast to ``dtype`` before being ranked by both libraries.
    """
    np.random.seed(0)
    flattened = np.fromiter(chain.from_iterable(elem), np.float64)
    values = flattened.astype(dtype)

    gdf = DataFrame()
    gdf["a"] = values
    pdf = pd.DataFrame()
    pdf["a"] = values

    got = gdf["a"].rank(method="first")
    expect = pdf["a"].rank(method="first")

    # Compare on the host side.
    assert_eq(expect, got.to_pandas())
def test_df_set_index_from_series():
    """``set_index(Series)`` keeps the source column and supports ``loc``."""
    frame = DataFrame()
    frame["a"] = list(range(10))
    frame["b"] = list(range(0, 20, 2))

    # Passing the Series itself (not its name) leaves column "b" in place.
    indexed = frame.set_index(frame["b"])
    assert list(indexed.columns) == ["a", "b"]

    # The label slice 2:6 is expected to yield labels 2, 4 and 6.
    window = indexed.loc[2:6]
    print(window)
    assert len(window) == 3
    assert list(window.index.values) == [2, 4, 6]
def test_dataframe_nsmallest(nelem, n):
    """``DataFrame.nsmallest(n, "a")`` returns the ``n`` smallest rows by "a".

    The expected row order is derived from a descending argsort of the host
    data: its last ``n`` positions, reversed, are the ``n`` smallest values
    in ascending order.
    """
    np.random.seed(0)
    aa = np.random.random(nelem)
    bb = np.random.random(nelem)

    df = DataFrame()
    df["a"] = aa
    df["b"] = bb

    res = df.nsmallest(n, "a")

    # Row positions of the n smallest "a" values, smallest first.
    order = np.argsort(-aa)[-n:][::-1]
    assert_eq(res["a"].to_array(), aa[order])
    assert_eq(res["b"].to_array(), bb[order])
    assert_eq(res.index.values, order)
def test_query_env_changing():
    """Re-running the same query string must pick up a changed local value."""
    aa = np.arange(100)
    df = DataFrame()
    df["a"] = aa
    expr = "a < @c"

    # The expression refers to ``c`` through ``@c``, so the local variable
    # has to keep exactly that name. The second iteration re-runs the same
    # expression string after ``c`` has been rebound.
    for c in (10, 50):
        got = df.query(expr)
        np.testing.assert_array_equal(aa[aa < c], got["a"].to_array())
def test_typecast_on_join_no_float_round():
    """Left-merging int8 keys with float32 keys must not round the floats.

    The right-hand keys 4.01 and 4.99 are expected NOT to match the left-hand
    keys 4 and 5, so the last two ``B_y`` entries are null.
    """
    other_data = ["a", "b", "c", "d", "e"]
    left_keys = Series([1, 2, 3, 4, 5], dtype="int8")
    right_keys = Series([1, 2, 3, 4.01, 4.99], dtype="float32")

    gdf_l = DataFrame({"join_col": left_keys, "B": other_data})
    gdf_r = DataFrame({"join_col": right_keys, "B": other_data})

    got = gdf_l.merge(gdf_r, on="join_col", how="left")

    expect = DataFrame(
        {
            "join_col": Series([1, 2, 3, 4, 5], dtype="float32"),
            "B_x": ["a", "b", "c", "d", "e"],
            "B_y": ["a", "b", "c", None, None],
        }
    )
    assert_eq(expect, got)
def test_index_join_exception_cases():
    """``Index.join`` rejects unsupported argument combinations."""
    l_df = DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]})
    r_df = DataFrame({"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]})
    how = "outer"

    # Case 1: joining two MultiIndex objects is expected to raise TypeError.
    # The indexes are built outside the ``raises`` block so that only the
    # join itself is under test.
    left_index = l_df.set_index(["a", "b"]).index
    right_index = r_df.set_index(["a", "c"]).index
    with pytest.raises(TypeError):
        left_index.join(right_index, level="a", how=how)

    # Case 2: ``level`` given as a list instead of an int or scalar is
    # expected to raise ValueError.
    left_index = l_df.set_index(["a", "b"]).index
    right_index = r_df.set_index(["a"]).index
    with pytest.raises(ValueError):
        left_index.join(right_index, level=["a"], how=how)
def _check_input_fit(self, X, is_categories=False):
    """Helper that classifies the input of fit for the multi-gpu model.

    Records the input kind through ``self._set_input_type`` ('array' or
    'df') and returns the data to fit on.

    Parameters
    ----------
    X : object
        Input data. Dask arrays and CuPy arrays are treated as 'array'
        input; anything else is treated as 'df' and returned unchanged.
    is_categories : bool, default False
        When true, array input is transposed before conversion.

    Returns
    -------
    ``DataFrame(X)`` for a CuPy array, ``to_dask_cudf(X, client=self.client)``
    for a Dask array, or ``X`` itself for any other input.
    """
    if not isinstance(X, (dask.array.core.Array, cp.ndarray)):
        self._set_input_type('df')
        return X

    self._set_input_type('array')
    if is_categories:
        X = X.transpose()

    if isinstance(X, cp.ndarray):
        return DataFrame(X)
    return to_dask_cudf(X, client=self.client)
def test_df_cat_set_index():
    """``set_index`` on a categorical column matches the pandas result."""
    df = DataFrame()
    df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    df["b"] = np.arange(len(df))

    got = df.set_index("a")
    expect = df.to_pandas().set_index("a")

    assert list(expect.columns) == list(got.columns)

    expect_labels = expect.index.values
    got_labels = got.index.values
    assert list(expect_labels) == list(got_labels)
    np.testing.assert_array_equal(expect_labels, got_labels)

    np.testing.assert_array_equal(expect["b"].values, got["b"].to_array())
def test_groupby_agg_decimal(num_groups, nelem_per_group, func):
    """Groupby aggregation ``func`` on Decimal columns matches pandas.

    When the CUDA runtime version reported by rmm is below 11000 the
    aggregation is expected to raise ``RuntimeError`` instead.
    """
    # Digits kept after / before the decimal point.
    decimal_digits = 2
    whole_digits = 2
    scale = 10 ** whole_digits
    nelem = num_groups * nelem_per_group

    # The values are de-duplicated because with duplicates idxmin and idxmax
    # may return different results than pandas (see
    # https://github.com/rapidsai/cudf/issues/7756). This does not matter for
    # the current test, since idxmin and idxmax do not work on pandas Series
    # of Decimal objects at all (see
    # https://github.com/pandas-dev/pandas/issues/40685), but it keeps the
    # test correct should that ever be enabled.
    x = np.unique((np.random.rand(nelem) * scale).round(decimal_digits))
    y = np.unique((np.random.rand(nelem) * scale).round(decimal_digits))

    # De-duplication may leave the two columns with different lengths;
    # truncate both to the shorter one.
    total_elements = min(x.size, y.size)
    x = x[:total_elements]
    y = y[:total_elements]

    # The truncation can leave one group with fewer elements, which is fine
    # and probably useful to test.
    idx_col = np.tile(np.arange(num_groups), nelem_per_group)[:total_elements]

    def as_decimal_series(values):
        # Go through str() so each Decimal carries the rounded repr of the
        # float rather than its full binary expansion.
        return pd.Series([Decimal(str(d)) for d in values])

    decimal_x = as_decimal_series(x)
    decimal_y = as_decimal_series(y)

    pdf = pd.DataFrame({"idx": idx_col, "x": decimal_x, "y": decimal_y})
    gdf = DataFrame(
        {
            "idx": idx_col,
            "x": cudf.Series(decimal_x),
            "y": cudf.Series(decimal_y),
        }
    )

    expect_df = pdf.groupby("idx", sort=True).agg(func)

    if rmm._cuda.gpu.runtimeGetVersion() < 11000:
        with pytest.raises(RuntimeError):
            gdf.groupby("idx", sort=True).agg(func)
        return

    got_df = gdf.groupby("idx", sort=True).agg(func)
    for column in ("x", "y"):
        assert_eq(expect_df[column], got_df[column], check_dtype=False)
def test_df_set_index_from_name():
    """``set_index(column_name)`` moves the column into the index."""
    frame = DataFrame()
    frame["a"] = list(range(10))
    frame["b"] = list(range(0, 20, 2))

    indexed = frame.set_index("b")
    print(indexed)

    # "b" became the index, so only "a" remains as a column.
    assert list(indexed.columns) == ["a"]

    # The label slice 2:6 is expected to yield labels 2, 4 and 6.
    window = indexed.loc[2:6]
    print(window)
    assert len(window) == 3
    assert list(window.index.values) == [2, 4, 6]
def test_dataframe_merge_order():
    """A left hash-merge on two keys yields the same frame as pandas."""

    def populate(left, right):
        # Fill both frames column by column; the same data is used for the
        # cudf and the pandas pair.
        left["id"] = [10, 11]
        left["timestamp"] = [1, 2]
        left["a"] = [3, 4]
        right["id"] = [4, 5]
        right["a"] = [7, 8]
        return left, right

    gdf1, gdf2 = populate(DataFrame(), DataFrame())
    gdf = gdf1.merge(gdf2, how="left", on=["id", "a"], method="hash")

    df1, df2 = populate(pd.DataFrame(), pd.DataFrame())
    df = df1.merge(df2, how="left", on=["id", "a"])

    assert_eq(gdf, df)
def test_dataframe_sort_values_ignore_index(index, ignore_index):
    """``sort_values(..., ignore_index=...)`` matches pandas after set_index."""
    data = {
        "a": [1, 3, 5, 2, 4],
        "b": [1, 1, 2, 2, 3],
        "c": [9, 7, 7, 7, 1],
    }
    gdf = DataFrame(data).set_index(index)
    pdf = gdf.to_pandas()

    # Sort by every remaining column, in column order.
    got = gdf.sort_values(gdf.columns, ignore_index=ignore_index)
    expect = pdf.sort_values(list(pdf.columns), ignore_index=ignore_index)

    assert_eq(expect, got)
def test_sizeof_dataframe():
    """``sys.getsizeof`` of a frame covers its data and tracks pickle size."""
    np.random.seed(0)
    nelem = 1000
    hkeys = np.arange(nelem, dtype=np.float64)
    hvals = np.random.random(nelem)

    df = DataFrame()
    df["keys"] = hkeys
    df["vals"] = hvals

    # The reported size must cover at least the raw column buffers.
    sizeof = sys.getsizeof(df)
    assert sizeof >= hkeys.nbytes + hvals.nbytes

    # The serialized size should be close to what __sizeof__ reports
    # (agreement to two significant digits).
    payload = pickle.dumps(df, protocol=pickle.HIGHEST_PROTOCOL)
    np.testing.assert_approx_equal(sizeof, len(payload), significant=2)
def test_typecast_on_join_indexes_matching_categorical():
    """Inner index-join of a categorical index with a string index."""
    labels = ["a", "b", "c", "d", "e"]
    other_data = [1, 2, 3, 4, 5]

    left_keys = Series(labels, dtype="category")
    right_keys = Series(labels, dtype="str")

    gdf_l = DataFrame({"join_col": left_keys, "B": other_data})
    gdf_l = gdf_l.set_index("join_col")
    gdf_r = DataFrame({"join_col": right_keys, "B": other_data})
    gdf_r = gdf_r.set_index("join_col")

    got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y")

    # Every key matches, so both suffixed columns carry the full data.
    expect = DataFrame(
        {
            "join_col": ["a", "b", "c", "d", "e"],
            "B_x": [1, 2, 3, 4, 5],
            "B_y": [1, 2, 3, 4, 5],
        }
    ).set_index("join_col")

    assert_eq(expect, got)
def test_factorize_series_obj(ncats, nelem):
    """``Series.factorize`` returns CuPy codes and an ``Index`` of uniques.

    ``ncats`` and ``nelem`` are accepted for parametrization but the data is
    a fixed-size (10 element) random 0/1 column.
    """
    df = DataFrame()
    np.random.seed(0)

    # initialize data frame
    df["cats"] = arr = np.random.randint(2, size=10, dtype=np.int32)

    uvals, labels = df["cats"].factorize()
    np.testing.assert_array_equal(labels.to_array(), sorted(set(arr)))

    # Use the public ``cp.ndarray`` alias rather than the private
    # ``cp.core.core`` module path, which newer CuPy releases do not expose.
    assert isinstance(uvals, cp.ndarray)
    assert isinstance(labels, Index)

    # Re-derive the codes by hand: map each unique label to its position.
    # Positional indexing is kept (instead of iterating the Index).
    encoder = {labels[idx]: idx for idx in range(len(labels))}
    handcoded = [encoder[v] for v in arr]
    np.testing.assert_array_equal(uvals.get(), handcoded)
def test_factorize(ncats, nelem):
    """``Series.factorize`` returns codes and uniques, both as ``Series``.

    ``ncats`` and ``nelem`` are accepted for parametrization but the data is
    a fixed-size (10 element) random 0/1 column.
    """
    df = DataFrame()
    np.random.seed(0)

    # initialize data frame
    arr = np.random.randint(2, size=10, dtype=np.int32)
    df["cats"] = arr

    uvals, labels = df["cats"].factorize()
    np.testing.assert_array_equal(labels.to_array(), sorted(set(arr)))
    assert isinstance(uvals, Series)
    assert isinstance(labels, Series)

    # Re-derive the codes by hand: map each unique label to its position.
    encoder = {label: code for code, label in enumerate(labels)}
    handcoded = [encoder[value] for value in arr]
    np.testing.assert_array_equal(uvals.to_array(), handcoded)
def test_typecast_on_join_int_to_int(dtype_l, dtype_r):
    """Inner merge on integer keys of differing dtypes promotes the key.

    The resulting join column is expected to have the common promoted dtype
    of ``dtype_l`` and ``dtype_r``.
    """
    other_data = ["a", "b", "c"]

    join_data_l = Series([1, 2, 3], dtype=dtype_l)
    join_data_r = Series([1, 2, 4], dtype=dtype_r)

    gdf_l = DataFrame({"join_col": join_data_l, "B": other_data})
    gdf_r = DataFrame({"join_col": join_data_r, "B": other_data})

    # ``np.find_common_type`` is deprecated and absent from NumPy 2.0;
    # ``np.promote_types`` yields the same result for a pair of dtypes.
    exp_dtype = np.promote_types(np.dtype(dtype_l), np.dtype(dtype_r))

    # Only keys 1 and 2 are present on both sides.
    exp_join_data = [1, 2]
    exp_other_data = ["a", "b"]
    exp_join_col = Series(exp_join_data, dtype=exp_dtype)

    expect = DataFrame(
        {
            "join_col": exp_join_col,
            "B_x": exp_other_data,
            "B_y": exp_other_data,
        }
    )

    got = gdf_l.merge(gdf_r, on="join_col", how="inner")

    assert_eq(expect, got)
def test_sizeof_dataframe():
    """``sys.getsizeof`` of a frame is bounded by data size and pickle size."""
    np.random.seed(0)
    nelem = 1000
    hkeys = np.arange(nelem, dtype=np.float64)
    hvals = np.random.random(nelem)

    df = DataFrame()
    df["keys"] = hkeys
    df["vals"] = hvals

    # Lower bound: the reported size covers at least the raw column buffers.
    sizeof = sys.getsizeof(df)
    assert sizeof >= hkeys.nbytes + hvals.nbytes

    # Upper bound: at least ``sizeof`` bytes were serialized.
    payload = pickle.dumps(df, protocol=pickle.HIGHEST_PROTOCOL)
    assert len(payload) >= sizeof
def test_onehot_generic_index():
    """One-hot encoding works for a column backed by a ``GenericIndex``."""
    np.random.seed(0)
    size = 33
    # Draw order matters for reproducibility: index labels first, then data.
    indices = np.random.randint(low=0, high=100, size=size)
    values = np.random.randint(low=0, high=4, size=size)

    df = DataFrame()
    df["fo"] = Series(values, index=GenericIndex(indices))

    out = df.one_hot_encoding(
        "fo", cats=df.fo.unique(), prefix="fo", dtype=np.int32
    )

    assert set(out.columns) == {"fo", "fo_0", "fo_1", "fo_2", "fo_3"}

    # Each indicator column must flag exactly the rows holding its category.
    for category in range(4):
        indicator = getattr(out, "fo_{}".format(category))
        np.testing.assert_array_equal(values == category, indicator.to_array())
def test_string_groupby_key_index():
    """Grouping by a string column and counting matches pandas."""
    str_data = ["a", "b", "c", "d", "e"]
    other_data = [1, 2, 3, 4, 5]

    pdf = pd.DataFrame()
    pdf["a"] = pd.Series(str_data, dtype="str")
    pdf["b"] = other_data

    gdf = DataFrame()
    gdf["a"] = Series(str_data, dtype="str")
    gdf["b"] = other_data

    got = gdf.groupby("a").count()
    expect = pdf.groupby("a").count()

    assert_eq(expect, got, check_dtype=False)
def test_groupby_iterate_groups():
    """Iterating a two-key groupby yields frames with constant key columns."""
    np.random.seed(0)
    nelem = 20

    df = DataFrame()
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    def assert_values_equal(arr):
        # Every element must equal the first one (broadcast comparison).
        np.testing.assert_array_equal(arr[0], arr)

    key_columns = ("key1", "key2")
    for _, group in df.groupby(["key1", "key2"]):
        host_group = group.to_pandas()
        for key in key_columns:
            assert_values_equal(host_group[key].values)
def test_onehot_random():
    """One-hot columns for random ints in [10, 17) match per-value masks."""
    low, high, size = 10, 17, 10

    src = np.random.randint(low=low, high=high, size=size)
    df = DataFrame()
    df["src"] = src

    encoded = df.one_hot_encoding(
        column="src", prefix="out_", cats=tuple(range(10, 17))
    )

    # Skip the first column ("src"); the rest are the indicator columns, one
    # per category in ascending order.
    mat = encoded.as_matrix(columns=encoded.columns[1:])

    for colidx, val in enumerate(range(low, high)):
        np.testing.assert_equal(mat[:, colidx], src == val)
def test_datetime_scalar_timeunit_cast(timeunit):
    """A ``datetime64`` scalar of any unit builds matching Series/columns."""
    testscalar = np.datetime64("2016-11-20", timeunit)

    # Scalar -> Series construction.
    assert_eq(pd.Series(testscalar), Series(testscalar))

    # Scalar broadcast into a column of an existing frame.
    gdf = DataFrame()
    pdf = pd.DataFrame()
    for frame in (gdf, pdf):
        frame["a"] = np.arange(5)
        frame["b"] = testscalar

    assert_eq(pdf, gdf)
def test_cat_series_binop_error():
    """Adding a categorical and a numeric Series raises ``TypeError``."""
    df = DataFrame()
    df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    df["b"] = np.arange(len(df))

    categorical = df["a"]
    numeric = df["b"]

    # Categorical on the left-hand side.
    with pytest.raises(
        TypeError,
        match="Series of dtype `category` cannot perform the operation: add",
    ):
        categorical + numeric

    # Numeric on the left-hand side.
    with pytest.raises(TypeError, match="'add' operator not supported"):
        numeric + categorical
def test_groupby_cats():
    """Group means over a categorical key match a host-side computation."""
    df = DataFrame()
    df["cats"] = pd.Categorical(list("aabaacaab"))
    df["vals"] = np.random.random(len(df))

    # Host copies used to compute the reference means.
    cats = df["cats"].values_host
    vals = df["vals"].to_array()

    grouped = df.groupby(["cats"], as_index=False).mean()
    got_vals, got_cats = grouped["vals"], grouped["cats"]

    # Positional access is used on the result columns rather than iteration.
    for row in range(len(got_vals)):
        in_group = cats == got_cats[row]
        np.testing.assert_almost_equal(got_vals[row], vals[in_group].mean())
def test_query_ref_env(data, fn):
    """``DataFrame.query`` with environment references matches a host mask.

    ``data`` is a ``(nelem, seed)`` pair and ``fn`` is a
    ``(expect_fn, query_expr)`` pair, where ``expect_fn(aa, bb, c, d)``
    computes the reference boolean mask for ``query_expr``.
    """
    # NOTE(review): ``query_expr`` is not visible here and presumably refers
    # to locals of this function (e.g. ``@c`` / ``@d``), so every local that
    # exists before the query call keeps its name.
    nelem, seed = data
    expect_fn, query_expr = fn
    np.random.seed(seed)

    aa = np.arange(nelem)
    bb = np.random.random(nelem) * nelem
    df = DataFrame()
    df["a"] = aa
    df["b"] = bb

    c = 2.3
    d = 1.2

    # Reference mask computed on the host arrays.
    expect_mask = expect_fn(aa, bb, c, d)
    print(expect_mask)

    filtered = df.query(query_expr)

    assert len(filtered) == np.count_nonzero(expect_mask)
    for column, host_values in (("a", aa), ("b", bb)):
        np.testing.assert_array_almost_equal(
            filtered[column].to_array(), host_values[expect_mask]
        )
def test_groupby_cats(method):
    """Group means over a categorical key match a host-side computation.

    ``method`` is forwarded to ``DataFrame.groupby``.
    """
    df = DataFrame()
    df["cats"] = pd.Categorical(list("aabaacaab"))
    df["vals"] = np.random.random(len(df))

    # Host copies used to compute the reference means.
    cats = np.asarray(list(df["cats"]))
    vals = df["vals"].to_array()

    grouped = df.groupby(["cats"], method=method, as_index=False).mean()

    for category, mean_value in zip(grouped["cats"], grouped["vals"]):
        print(category, mean_value)
        reference = vals[cats == category].mean()
        np.testing.assert_almost_equal(mean_value, reference)
def test_groupby_as_df():
    """``groupby(...).as_df()`` returns a frame plus group start offsets.

    Each ``[start, next_start)`` slice of the returned frame must hold a
    single value in both key columns.
    """
    np.random.seed(0)
    nelem = 20

    df = DataFrame()
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    def assert_values_equal(arr):
        # Every element must equal the first one (broadcast comparison).
        np.testing.assert_array_equal(arr[0], arr)

    df, segs = df.groupby(["key1", "key2"], method="cudf").as_df()

    # Pair each segment start with the next start; the last segment is
    # open-ended (``None`` slices to the end of the frame).
    stops = list(segs[1:]) + [None]
    for start, stop in zip(segs, stops):
        host_group = df[start:stop].to_pandas()
        for key in ("key1", "key2"):
            assert_values_equal(host_group[key].values)
def test_get_dummies_prefix_sep(prefix, prefix_sep):
    """``cudf.get_dummies`` honours ``prefix`` / ``prefix_sep`` like pandas."""
    data = {
        "first": ["1", "2", "3"],
        "second": ["abc", "def", "ghi"],
        "third": ["ji", "ji", "ji"],
    }
    options = {"prefix": prefix, "prefix_sep": prefix_sep}

    gdf = DataFrame(data)
    pdf = pd.DataFrame(data)

    encoded_expected = pd.get_dummies(pdf, **options)
    encoded_actual = cudf.get_dummies(gdf, **options)

    utils.assert_eq(encoded_expected, encoded_actual, check_dtype=False)
def test_string_groupby_key(str_data, num_keys):
    """Grouping by ``num_keys`` identical string columns matches pandas.

    The key columns are named ``0 .. num_keys - 1`` and all hold
    ``str_data``; column "a" holds the values being counted.
    """
    other_data = [1, 2, 3, 4, 5][:len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for key in range(num_keys):
        pdf[key] = pd.Series(str_data, dtype="str")
        gdf[key] = Series(str_data, dtype="str")
    pdf["a"] = other_data
    gdf["a"] = other_data

    def count_by_keys(frame):
        # Count per key combination, then order by the first key and reset
        # the index so both libraries are compared row for row.
        counted = frame.groupby(list(range(num_keys)), as_index=False).count()
        return counted.sort_values([0]).reset_index(drop=True)

    expect = count_by_keys(pdf)
    got = count_by_keys(gdf)

    assert_eq(expect, got, check_dtype=False)