def test_cudf_training_with_sklearn():
    """Train XGBClassifier on cuDF inputs (DataFrame and Series labels).

    Fits on GPU ('gpu_hist') with cuDF sample weights and base margins,
    then checks that predict() emits both classes for the random binary
    target.
    """
    from cudf import DataFrame as df
    from cudf import Series as ss
    import pandas as pd
    np.random.seed(1)
    X = pd.DataFrame(np.random.randn(50, 10))
    # Binary labels derived from the sign of a standard-normal draw.
    y = pd.DataFrame((np.random.randn(50) > 0).astype(np.int8))
    weights = np.random.random(50) + 1.0
    cudf_weights = df.from_pandas(pd.DataFrame(weights))
    base_margin = np.random.random(50)
    cudf_base_margin = df.from_pandas(pd.DataFrame(base_margin))
    X_cudf = df.from_pandas(X)
    y_cudf = df.from_pandas(y)
    y_cudf_series = ss(data=y.iloc[:, 0])
    # Labels must be accepted both as a cuDF DataFrame and a cuDF Series.
    for y_obj in [y_cudf, y_cudf_series]:
        clf = xgb.XGBClassifier(gpu_id=0, tree_method='gpu_hist')
        clf.fit(X_cudf, y_obj, sample_weight=cudf_weights,
                base_margin=cudf_base_margin,
                eval_set=[(X_cudf, y_obj)])
        pred = clf.predict(X_cudf)
        # Both classes should appear in the predictions.
        assert np.array_equal(np.unique(pred), np.array([0, 1]))
def _test_cudf_training(DMatrixT):
    """Helper: train with a cuDF-backed DMatrix and with numpy inputs,
    asserting both runs produce identical per-iteration train RMSE.

    Parameters
    ----------
    DMatrixT : the DMatrix class under test (e.g. ``xgb.DMatrix`` or a
        device-memory variant with the same constructor signature).
    """
    from cudf import DataFrame as df
    import pandas as pd
    np.random.seed(1)
    X = pd.DataFrame(np.random.randn(50, 10))
    y = pd.DataFrame(np.random.randn(50))
    weights = np.random.random(50) + 1.0
    cudf_weights = df.from_pandas(pd.DataFrame(weights))
    base_margin = np.random.random(50)
    cudf_base_margin = df.from_pandas(pd.DataFrame(base_margin))
    evals_result_cudf = {}
    dtrain_cudf = DMatrixT(df.from_pandas(X), df.from_pandas(y),
                           weight=cudf_weights,
                           base_margin=cudf_base_margin)
    params = {'gpu_id': 0, 'tree_method': 'gpu_hist'}
    xgb.train(params, dtrain_cudf, evals=[(dtrain_cudf, "train")],
              evals_result=evals_result_cudf)
    evals_result_np = {}
    dtrain_np = xgb.DMatrix(X, y, weight=weights, base_margin=base_margin)
    xgb.train(params, dtrain_np, evals=[(dtrain_np, "train")],
              evals_result=evals_result_np)
    # Same data + same params => identical training RMSE trajectories.
    assert np.array_equal(evals_result_cudf["train"]["rmse"],
                          evals_result_np["train"]["rmse"])
def test_dataframe_column_name_indexing():
    """String and integer column labels must both work for get/set item."""
    gdf = DataFrame()
    values = np.asarray(range(10), dtype=np.int32)
    gdf["a"] = values
    gdf[1] = values
    np.testing.assert_equal(
        gdf["a"].to_array(), np.asarray(range(10), dtype=np.int32)
    )
    np.testing.assert_equal(
        gdf[1].to_array(), np.asarray(range(10), dtype=np.int32)
    )

    # Mixed string / integer column names round-tripped through pandas.
    host = pd.DataFrame()
    nelem = 10
    host["key1"] = np.random.randint(0, 5, nelem)
    host["key2"] = np.random.randint(0, 3, nelem)
    host[1] = np.arange(1, 1 + nelem)
    host[2] = np.random.random(nelem)
    gdf = DataFrame.from_pandas(host)
    assert_eq(gdf[gdf.columns], gdf)
    assert_eq(gdf[gdf.columns[:1]], gdf[["key1"]])

    # Every non-empty subset of columns must select identically.
    for size in range(1, len(host.columns) + 1):
        for subset in combinations(host.columns, size):
            assert host[list(subset)].equals(gdf[list(subset)].to_pandas())

    # Purely integer-labelled columns.
    host = pd.DataFrame()
    for col in range(0, 10):
        host[col] = range(nelem)
    numeric_gdf = DataFrame.from_pandas(host)
    assert_eq(numeric_gdf, host)
    assert_eq(numeric_gdf[numeric_gdf.columns], numeric_gdf)
    assert_eq(numeric_gdf[numeric_gdf.columns[:3]], numeric_gdf[[0, 1, 2]])
def test_cudf_training(self):
    """Train on a cuDF-backed DMatrix and compare the per-iteration train
    RMSE against training on the equivalent numpy/pandas inputs."""
    from cudf import DataFrame as df
    import pandas as pd
    X = pd.DataFrame(np.random.randn(50, 10))
    y = pd.DataFrame(np.random.randn(50))
    weights = np.random.random(50)
    cudf_weights = df.from_pandas(pd.DataFrame(weights))
    base_margin = np.random.random(50)
    cudf_base_margin = df.from_pandas(pd.DataFrame(base_margin))
    evals_result_cudf = {}
    dtrain_cudf = xgb.DMatrix(df.from_pandas(X), df.from_pandas(y),
                              weight=cudf_weights,
                              base_margin=cudf_base_margin)
    # NOTE(review): this run passes {'gpu_id': 0} while the numpy run below
    # uses default params; presumably both fall back to the same default
    # updater so the RMSE traces still match — confirm this asymmetry is
    # intended (cf. the sibling helper that uses identical params for both).
    xgb.train({'gpu_id': 0}, dtrain_cudf, evals=[(dtrain_cudf, "train")],
              evals_result=evals_result_cudf)
    evals_result_np = {}
    dtrain_np = xgb.DMatrix(X, y, weight=weights, base_margin=base_margin)
    xgb.train({}, dtrain_np, evals=[(dtrain_np, "train")],
              evals_result=evals_result_np)
    # Identical data must yield identical training RMSE trajectories.
    assert np.array_equal(evals_result_cudf["train"]["rmse"],
                          evals_result_np["train"]["rmse"])
def test_dataframe_multi_column_nulls(
    num_cols, num_rows, dtype, nulls, ascending, na_position
):
    """Multi-column sort_values with nulls must match pandas.

    Builds ``num_cols`` random columns (named 'a', 'b', ...), injects NaNs
    according to ``nulls`` ('some' nulls a random quarter of the rows,
    'all' nulls everything), and compares cudf vs pandas sort output.
    """
    np.random.seed(0)
    by = list(string.ascii_lowercase[:num_cols])
    pdf = pd.DataFrame()
    # BUG FIX: the original always created exactly 3 columns regardless of
    # num_cols, so any parametrization with num_cols > 3 would KeyError in
    # sort_values. Create one column per requested sort key instead.
    for i in range(num_cols):
        colname = string.ascii_lowercase[i]
        data = np.random.randint(0, 26, num_rows).astype(dtype)
        if nulls == "some":
            # Null out a random quarter of the rows (guard the empty frame).
            idx = np.array([], dtype="int64")
            if num_rows > 0:
                idx = np.random.choice(
                    num_rows, size=int(num_rows / 4), replace=False
                )
            data[idx] = np.nan
        elif nulls == "all":
            data[:] = np.nan
        pdf[colname] = data
    gdf = DataFrame.from_pandas(pdf)
    got = gdf.sort_values(by, ascending=ascending, na_position=na_position)
    expect = pdf.sort_values(by, ascending=ascending, na_position=na_position)
    # Only row order matters; compare with the shuffled index dropped.
    assert_eq(
        got[by].reset_index(drop=True), expect[by].reset_index(drop=True)
    )
def test_dataframe_loc_mask(mask, arg):
    """Boolean-mask .loc row/column selection must agree with pandas."""
    host = pd.DataFrame(
        {"a": ["a", "b", "c", "d", "e"], "b": ["f", "g", "h", "i", "j"]}
    )
    device = DataFrame.from_pandas(host)
    assert_eq(host.loc[mask, arg], device.loc[mask, arg])
def generate_inputs_from_categories(categories=None, n_samples=10,
                                    seed=5060, as_array=False):
    """Build a random categorical test input.

    Draws ``n_samples`` rows from each category pool and returns a pair
    ``(device_input, host_array)`` — the device input is a cupy array when
    ``as_array`` is true, otherwise a cudf DataFrame.
    """
    if categories is None:
        # Default pools: numeric labels when an array is requested,
        # string labels otherwise.
        if as_array:
            categories = {
                'strings': list(range(1000, 4000, 3)),
                'integers': list(range(1000))
            }
        else:
            categories = {
                'strings': ['Foo', 'Bar', 'Baz'],
                'integers': list(range(1000))
            }
    rng = np.random.RandomState(seed)
    pandas_df = pd.DataFrame(
        {name: rng.choice(cat, n_samples)
         for name, cat in categories.items()})
    ary = from_df_to_array(pandas_df)
    if as_array:
        return cp.array(ary), ary
    return DataFrame.from_pandas(pandas_df), ary
def test_cudf_metainfo(self):
    """DMatrix meta info set from cuDF objects must match the numpy path.

    Exercises set_interface_info with both cuDF DataFrames and cuDF Series
    for the float fields ('weight', 'label', 'base_margin') and the uint
    field ('group'), comparing against set_float_info/set_uint_info.
    """
    from cudf import DataFrame as df
    import pandas as pd
    n = 100
    X = np.random.random((n, 2))
    dmat_cudf = xgb.DMatrix(X)
    dmat = xgb.DMatrix(X)
    floats = np.random.random(n)
    uints = np.array([4, 2, 8]).astype("uint32")
    cudf_floats = df.from_pandas(pd.DataFrame(floats))
    cudf_uints = df.from_pandas(pd.DataFrame(uints))
    # Reference: set info through the plain numpy code path.
    dmat.set_float_info('weight', floats)
    dmat.set_float_info('label', floats)
    dmat.set_float_info('base_margin', floats)
    dmat.set_uint_info('group', uints)
    dmat_cudf.set_interface_info('weight', cudf_floats)
    dmat_cudf.set_interface_info('label', cudf_floats)
    dmat_cudf.set_interface_info('base_margin', cudf_floats)
    dmat_cudf.set_interface_info('group', cudf_uints)
    # Test setting info with cudf DataFrame
    assert np.array_equal(dmat.get_float_info('weight'),
                          dmat_cudf.get_float_info('weight'))
    assert np.array_equal(dmat.get_float_info('label'),
                          dmat_cudf.get_float_info('label'))
    assert np.array_equal(dmat.get_float_info('base_margin'),
                          dmat_cudf.get_float_info('base_margin'))
    # 'group' is read back as the derived 'group_ptr' boundaries.
    assert np.array_equal(dmat.get_uint_info('group_ptr'),
                          dmat_cudf.get_uint_info('group_ptr'))
    # Test setting info with cudf Series
    dmat_cudf.set_interface_info('weight',
                                 cudf_floats[cudf_floats.columns[0]])
    dmat_cudf.set_interface_info('label',
                                 cudf_floats[cudf_floats.columns[0]])
    dmat_cudf.set_interface_info('base_margin',
                                 cudf_floats[cudf_floats.columns[0]])
    dmat_cudf.set_interface_info('group',
                                 cudf_uints[cudf_uints.columns[0]])
    assert np.array_equal(dmat.get_float_info('weight'),
                          dmat_cudf.get_float_info('weight'))
    assert np.array_equal(dmat.get_float_info('label'),
                          dmat_cudf.get_float_info('label'))
    assert np.array_equal(dmat.get_float_info('base_margin'),
                          dmat_cudf.get_float_info('base_margin'))
    assert np.array_equal(dmat.get_uint_info('group_ptr'),
                          dmat_cudf.get_uint_info('group_ptr'))
def test_dataframe_apply_boolean_mask():
    """Row filtering with a plain boolean list must match pandas."""
    host = pd.DataFrame({
        "a": [0, 1, 2, 3],
        "b": [0.1, 0.2, None, 0.3],
        "c": ["a", None, "b", "c"],
    })
    device = DataFrame.from_pandas(host)
    selector = [True, False, True, False]
    assert_eq(host[selector], device[selector])
def test_dataframe_boolean_mask_with_None():
    """A boolean list mask works; a mask Series containing None raises."""
    host = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
    device = DataFrame.from_pandas(host)
    selector = [True, False, True, False]
    host_masked = host[selector]
    device_masked = device[selector]
    assert_eq(host_masked, device_masked)
    # Null entries are not permitted in a boolean mask.
    with pytest.raises(ValueError):
        device[Series([True, False, None, False])]
def test_dataframe_sort_values_sliced(nelem, sliceobj):
    """sort_values on a sliced column must match pandas on the same slice."""
    np.random.seed(0)
    host = pd.DataFrame()
    host["a"] = np.random.random(nelem)
    expect = host[sliceobj]["a"].sort_values()
    device = DataFrame.from_pandas(host)
    got = device[sliceobj]["a"].sort_values()
    assert (got.to_pandas() == expect).all()
def test_query_empty_frames():
    """query() on an empty frame yields an empty result, like pandas."""
    empty_pdf = pd.DataFrame({"a": [], "b": []})
    empty_gdf = DataFrame.from_pandas(empty_pdf)
    expr = "a > 2"
    # Evaluate the same predicate on both sides.
    expect = empty_pdf.query(expr)
    got = empty_gdf.query(expr).to_pandas()
    assert_eq(got, expect)
def test_onehost_get_dummies_dummy_na(nan_as_null, dummy_na):
    """get_dummies dummy_na handling must line up with pandas."""
    host = pd.DataFrame({"a": [0, 1, np.nan]})
    device = DataFrame.from_pandas(host, nan_as_null=nan_as_null)
    expected = pd.get_dummies(host, dummy_na=dummy_na, columns=["a"])
    got = cudf.get_dummies(device, dummy_na=dummy_na, columns=["a"])
    if dummy_na and nan_as_null:
        # cudf names the null indicator "a_null"; pandas uses "a_nan".
        got = got.rename(columns={"a_null": "a_nan"})[expected.columns]
    utils.assert_eq(expected, got)
def test_sliced_indexing():
    """.loc lookups driven by a sliced index must match pandas."""
    col_a = list(range(4, 4 + 150))
    col_b = list(range(0, 0 + 150))
    host = pd.DataFrame({"a": col_a, "b": col_b})
    device = DataFrame.from_pandas(host)
    host = host.set_index("a")
    device = device.set_index("a")
    # Take the first half of each index and look those labels up.
    host_labels = host.index[:75]
    device_labels = device.index[:75]
    assert_eq(host.loc[host_labels], device.loc[device_labels])
def test_from_pandas_ex1():
    """from_pandas preserves columns, values, and nulls (seen as NaN)."""
    host = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
    device = DataFrame.from_pandas(host)
    assert tuple(device.columns) == tuple(host.columns)
    assert np.all(device["a"].to_numpy() == host["a"])
    matches = device["b"].to_numpy(na_value=np.nan) == host["b"]
    # the 3d element is False due to (nan == nan) == False
    assert np.all(matches == [True, True, False, True])
    assert np.isnan(device["b"].to_numpy(na_value=np.nan)[2])
    assert np.isnan(host["b"][2])
def test_dataframe_nsmallest_sliced(counts, sliceobj):
    """nsmallest on a sliced frame must match pandas."""
    nelem, n = counts
    np.random.seed(0)
    host = pd.DataFrame()
    host["a"] = np.random.random(nelem)
    host["b"] = np.random.random(nelem)
    expect = host[sliceobj].nsmallest(n, "a")
    device = DataFrame.from_pandas(host)
    got = device[sliceobj].nsmallest(n, "a")
    assert (got.to_pandas() == expect).all().all()
def test_from_pandas_with_index():
    """from_pandas must carry over a custom (non-default) index."""
    host = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
    host = host.set_index(np.asarray([4, 3, 2, 1]))
    device = DataFrame.from_pandas(host)
    # Column-wise comparison.
    assert_eq(device.a, host.a)
    assert_eq(device.b, host.b)
    # The index values survive the round trip.
    assert_eq(device.index.values, host.index.values)
    # Whole-frame comparison via the pandas testing tools.
    assert_eq(device, host)
def test_dataframe_loc(scalar, step):
    """.loc parity with pandas: scalar labels, row slices (with step),
    column lists, column-label slices, and slices over a shifted index."""
    size = 123
    pdf = pd.DataFrame(
        {
            "a": np.random.randint(low=0, high=100, size=size),
            "b": np.random.random(size).astype(np.float32),
            "c": np.random.random(size).astype(np.float64),
            "d": np.random.random(size).astype(np.float64),
        }
    )
    df = DataFrame.from_pandas(pdf)
    # Scalar label
    assert_eq(df.loc[scalar], pdf.loc[scalar])
    # Full slice
    assert_eq(df.loc[:, "c"], pdf.loc[:, "c"])
    begin = 110
    end = 122
    # Row slice with step, columns given out of order.
    assert_eq(
        df.loc[begin:end:step, ["c", "d", "a"]],
        pdf.loc[begin:end:step, ["c", "d", "a"]],
    )
    assert_eq(df.loc[begin:end, ["c", "d"]], pdf.loc[begin:end, ["c", "d"]])
    # Slicing on columns:
    assert_eq(
        df.loc[begin:end:step, "a":"c"], pdf.loc[begin:end:step, "a":"c"]
    )
    # Slicing of size 1:
    assert_eq(df.loc[begin:begin, "a"], pdf.loc[begin:begin, "a"])
    # TODO: Pandas changes the dtype here when it shouldn't
    assert_eq(
        df.loc[begin, "a":"a"], pdf.loc[begin, "a":"a"], check_dtype=False
    )
    # Make int64 index
    offset = 50
    df2 = df[offset:]
    pdf2 = pdf[offset:]
    begin = 117
    end = 122
    assert_eq(
        df2.loc[begin:end, ["c", "d", "a"]],
        pdf2.loc[begin:end, ["c", "d", "a"]],
    )
def test_query_with_index_keyword(query, a_val, b_val, c_val):
    """query() expressions referencing the index must match pandas."""
    pdf = pd.DataFrame({
        "a": [1, None, 3, 4, 5],
        "b": [5, 4, 3, 2, 1],
        "c": [12, 15, 17, 19, 27],
    })
    # BUG FIX: set_index returns a new frame; the original discarded the
    # result, so "a" never actually became the index and the queries under
    # test never exercised index-based lookups.
    pdf = pdf.set_index("a")
    gdf = DataFrame.from_pandas(pdf)
    out = gdf.query(query)
    expect = pdf.query(query)
    assert_eq(out, expect)
def test_from_pandas():
    """Basic from_pandas round trip: column names, dtypes, and lengths."""
    host = pd.DataFrame()
    host["a"] = np.arange(10, dtype=np.int32)
    host["b"] = np.arange(10, 20, dtype=np.float64)
    device = DataFrame.from_pandas(host)
    assert tuple(device.columns) == tuple(host.columns)
    for name in ("a", "b"):
        assert device[name].dtype == host[name].dtype
        assert len(device[name]) == len(host[name])
def test_dataframe_multi_column_nulls_multiple_ascending(
        ascending, na_position):
    """Per-column ascending flags with nulls must match pandas ordering."""
    host = pd.DataFrame({
        "a": [3, 1, None, 2, 2, None, 1],
        "b": [1, 2, 3, 4, 5, 6, 7]
    })
    device = DataFrame.from_pandas(host)
    expect = host.sort_values(by=["a", "b"], ascending=ascending,
                              na_position=na_position)
    actual = device.sort_values(by=["a", "b"], ascending=ascending,
                                na_position=na_position)
    assert_eq(actual, expect)
def test_rank_error_arguments(pdf):
    """Invalid rank() arguments must raise the same exceptions as pandas.

    Checks both an unknown ``method`` and an unknown ``na_option`` via
    assert_exceptions_equal (pandas = lfunc, cudf = rfunc).
    """
    gdf = DataFrame.from_pandas(pdf)

    # Unknown 'method' value.
    assert_exceptions_equal(
        lfunc=pdf["col1"].rank,
        rfunc=gdf["col1"].rank,
        lfunc_args_and_kwargs=(
            [],
            {
                "method": "randomname",
                "na_option": "keep",
                "ascending": True,
                "pct": True,
            },
        ),
        rfunc_args_and_kwargs=(
            [],
            {
                "method": "randomname",
                "na_option": "keep",
                "ascending": True,
                "pct": True,
            },
        ),
    )

    # Unknown 'na_option' value.
    assert_exceptions_equal(
        lfunc=pdf["col1"].rank,
        rfunc=gdf["col1"].rank,
        lfunc_args_and_kwargs=(
            [],
            {
                "method": "first",
                "na_option": "randomname",
                "ascending": True,
                "pct": True,
            },
        ),
        rfunc_args_and_kwargs=(
            [],
            {
                "method": "first",
                "na_option": "randomname",
                "ascending": True,
                "pct": True,
            },
        ),
    )
def test_dataframe_multi_column(num_cols, num_rows, dtype, ascending,
                                na_position):
    """Multi-column sort_values must match pandas ordering.

    Builds ``num_cols`` random integer columns named 'a', 'b', ... and
    compares cudf vs pandas sort_values over all of them.
    """
    np.random.seed(0)
    by = list(string.ascii_lowercase[:num_cols])
    pdf = pd.DataFrame()
    # BUG FIX: the original always built exactly 5 columns regardless of
    # num_cols, so any parametrization with num_cols > 5 would KeyError in
    # sort_values. Build one column per requested sort key instead.
    for i in range(num_cols):
        colname = string.ascii_lowercase[i]
        data = np.random.randint(0, 26, num_rows).astype(dtype)
        pdf[colname] = data
    gdf = DataFrame.from_pandas(pdf)
    got = gdf.sort_values(by, ascending=ascending, na_position=na_position)
    expect = pdf.sort_values(by, ascending=ascending, na_position=na_position)
    # Only row order matters; compare with the shuffled index dropped.
    assert_eq(got[by].reset_index(drop=True),
              expect[by].reset_index(drop=True))
def test_query_splitted_combine():
    """Querying two halves and concatenating equals querying the whole."""
    np.random.seed(0)
    host = pd.DataFrame({
        "x": np.random.randint(0, 5, size=10),
        "y": np.random.normal(size=10)
    })
    gdf = DataFrame.from_pandas(host)

    # Query each half of the device frame separately ...
    expr = "x > 2"
    first_half = gdf[:5].query(expr)
    second_half = gdf[5:].query(expr)

    # ... then stitch the two results back together.
    got = cudf.concat([first_half, second_half]).to_pandas()

    # Must equal a single query over the full frame.
    expect = gdf.query(expr).to_pandas()
    assert_eq(got, expect, check_index_type=True)
def test_issue_165():
    """Regression test: datetime equality in query() and boolean masks.

    Compares a datetime column against the same moment expressed as a
    Python datetime, a pandas Timestamp, and a numpy datetime64[ns], both
    through query('dates==@var') and through direct mask comparison.
    """
    df_pandas = pd.DataFrame()
    start_date = dt.datetime.strptime("2000-10-21", "%Y-%m-%d")
    # Six consecutive days starting at start_date.
    data = [(start_date + dt.timedelta(days=x)) for x in range(6)]
    df_pandas["dates"] = data
    df_pandas["num"] = [1, 2, 3, 4, 5, 6]
    df_cudf = DataFrame.from_pandas(df_pandas)

    # Python datetime.
    base = df_pandas.query("dates==@start_date")
    test = df_cudf.query("dates==@start_date")
    assert_eq(base, test)
    assert len(test) > 0
    mask = df_cudf.dates == start_date
    base_mask = df_pandas.dates == start_date
    assert_eq(mask, base_mask, check_names=False)
    assert mask.to_pandas().sum() > 0

    # pandas Timestamp.
    start_date_ts = pd.Timestamp(start_date)
    test = df_cudf.query("dates==@start_date_ts")
    base = df_pandas.query("dates==@start_date_ts")
    assert_eq(base, test)
    assert len(test) > 0
    mask = df_cudf.dates == start_date_ts
    base_mask = df_pandas.dates == start_date_ts
    assert_eq(mask, base_mask, check_names=False)
    assert mask.to_pandas().sum() > 0

    # numpy datetime64 with nanosecond resolution.
    start_date_np = np.datetime64(start_date_ts, "ns")
    test = df_cudf.query("dates==@start_date_np")
    base = df_pandas.query("dates==@start_date_np")
    assert_eq(base, test)
    assert len(test) > 0
    mask = df_cudf.dates == start_date_np
    base_mask = df_pandas.dates == start_date_np
    assert_eq(mask, base_mask, check_names=False)
    assert mask.to_pandas().sum() > 0
def test_rank_all_arguments(pdf, dtype, ascending, method, na_option, pct,
                            numeric_only):
    """rank() over all argument combinations must match pandas.

    NOTE(review): another ``test_rank_all_arguments`` is defined later in
    this module with the same name; at collection time the later definition
    shadows this one — confirm which version is intended to run.
    """
    if method == "first" and dtype == "O":
        # not supported by pandas
        return
    pdf = pdf.copy(deep=True)  # for parallel pytest
    if numeric_only:
        # Add a string column so numeric_only actually has to drop one.
        pdf["str"] = np.array(
            ["a", "b", "c", "d", "e", "1", "2", "3", "4", "5"])
    gdf = DataFrame.from_pandas(pdf)
    kwargs = {
        "method": method,
        "na_option": na_option,
        "ascending": ascending,
        "pct": pct,
        "numeric_only": numeric_only,
    }
    # Series
    assert_eq(gdf["col1"].rank(**kwargs), pdf["col1"].rank(**kwargs))
    assert_eq(gdf["col2"].rank(**kwargs), pdf["col2"].rank(**kwargs))
    if numeric_only:
        # Ranking a string Series with numeric_only drops everything;
        # both sides should agree on emptiness.
        expect = pdf["str"].rank(**kwargs)
        got = gdf["str"].rank(**kwargs)
        assert expect.empty == got.empty
        expected = pdf.select_dtypes(include=np.number)
    else:
        expected = pdf.copy(deep=True)
    # TODO: Remove per column iteration once the
    # following issue is fixed :
    # https://github.com/pandas-dev/pandas/issues/43310
    for col in expected.columns:
        expected[col] = pdf[col].rank(**kwargs)
    actual = gdf.rank(**kwargs)
    assert_eq(expected, actual)
def test_rank_all_arguments(pdf, dtype, ascending, method, na_option, pct,
                            numeric_only):
    """rank() over all argument combinations must match pandas, with the
    DataFrame-level comparison gated on a known pandas bug.

    NOTE(review): this redefines ``test_rank_all_arguments`` from earlier
    in the module and will shadow it at collection time — confirm intended.
    """
    if method == "first" and dtype == "O":
        # not supported by pandas
        return
    pdf = pdf.copy(deep=True)  # for parallel pytest
    if numeric_only:
        # Add a string column so numeric_only actually has to drop one.
        pdf["str"] = np.array(
            ["a", "b", "c", "d", "e", "1", "2", "3", "4", "5"])
    gdf = DataFrame.from_pandas(pdf)
    kwargs = {
        "method": method,
        "na_option": na_option,
        "ascending": ascending,
        "pct": pct,
        "numeric_only": numeric_only,
    }
    # Series
    assert_eq(gdf["col1"].rank(**kwargs), pdf["col1"].rank(**kwargs))
    assert_eq(gdf["col2"].rank(**kwargs), pdf["col2"].rank(**kwargs))
    if numeric_only:
        expect = pdf["str"].rank(**kwargs)
        got = gdf["str"].rank(**kwargs)
        assert expect.empty == got.empty
    # TODO: https://github.com/pandas-dev/pandas/issues/32593
    # Dataframe (bug in pandas)
    if (na_option == "top" and method == "first" and not dtype == "O" and
            ascending):
        # This combination is unaffected by the pandas bug; results match.
        assert_eq(gdf.rank(**kwargs), pdf.rank(**kwargs))
    else:
        # Elsewhere pandas misbehaves, so the outputs are expected to differ.
        with pytest.raises(AssertionError, match="values are different"):
            assert_eq(gdf.rank(**kwargs), pdf.rank(**kwargs))
def __init__(self, levels=None, codes=None, labels=None, names=None,
             **kwargs):
    """Construct a MultiIndex from levels+codes, or from pre-built
    ``source_data`` passed through kwargs.

    Parameters
    ----------
    levels : sequence of level value sequences, one per index level.
    codes : DataFrame or sequence of integer code sequences; -1 marks null.
    labels : deprecated alias for ``codes``.
    names : optional per-level names; positions default to integers.
    """
    from cudf.core.series import Series
    from cudf import DataFrame
    super().__init__()
    self._name = None
    column_names = []
    if labels:
        warnings.warn(
            "the 'labels' keyword is deprecated, use 'codes' "
            "instead",
            FutureWarning,
        )
    if labels and not codes:
        codes = labels
    # early termination enables lazy evaluation of codes
    if "source_data" in kwargs:
        # Shallow-copy so resetting the index does not mutate the caller's
        # frame.
        source_data = kwargs["source_data"].copy(deep=False)
        source_data.reset_index(drop=True, inplace=True)
        if isinstance(source_data, pd.DataFrame):
            nan_as_null = kwargs.get("nan_as_null", None)
            source_data = DataFrame.from_pandas(source_data,
                                                nan_as_null=nan_as_null)
        names = names if names is not None else source_data._data.names
        # if names are unique
        # try using those as the source_data column names:
        if len(dict.fromkeys(names)) == len(names):
            source_data.columns = names
        self._data = source_data._data
        self.names = names
        # Codes/levels are kept as passed (possibly None) for lazy
        # evaluation.
        self._codes = codes
        self._levels = levels
        return
    # name setup
    if isinstance(
        names,
        (
            Sequence,
            pd.core.indexes.frozen.FrozenNDArray,
            pd.core.indexes.frozen.FrozenList,
        ),
    ):
        # More than one unnamed level: fall back to positional names.
        if sum(x is None for x in names) > 1:
            column_names = list(range(len(codes)))
        else:
            column_names = names
    elif names is None:
        column_names = list(range(len(codes)))
    else:
        column_names = names
    if len(levels) == 0:
        raise ValueError("Must pass non-zero number of levels/codes")
    if not isinstance(codes, DataFrame) and not isinstance(
            codes[0], (Sequence, pd.core.indexes.frozen.FrozenNDArray)):
        raise TypeError("Codes is not a Sequence of sequences")
    if isinstance(codes, DataFrame):
        self._codes = codes
    elif len(levels) == len(codes):
        # Normalize each code sequence into an int64 column of a DataFrame.
        self._codes = DataFrame()
        for i, codes in enumerate(codes):
            name = column_names[i] or i
            codes = column.as_column(codes)
            self._codes[name] = codes.astype(np.int64)
    else:
        raise ValueError("MultiIndex has unequal number of levels and "
                         "codes and is inconsistent!")
    self._levels = [Series(level) for level in levels]
    self._validate_levels_and_codes(self._levels, self._codes)
    # Materialize the actual index values by gathering each level's values
    # at the positions given by its codes.
    source_data = DataFrame()
    for i, name in enumerate(self._codes.columns):
        codes = as_index(self._codes[name]._column)
        if -1 in self._codes[name].values:
            # Must account for null(s) in _source_data column
            level = DataFrame(
                {name: [None] + list(self._levels[i])},
                index=range(-1, len(self._levels[i])),
            )
        else:
            level = DataFrame({name: self._levels[i]})
        import cudf._lib as libcudf
        source_data[name] = libcudf.copying.gather(
            level, codes._data.columns[0])._data[name]
    self._data = source_data._data
    self.names = names
def test_dataframe_loc_duplicate_index_scalar():
    """Scalar .loc on a duplicated index label returns all matching rows."""
    host = pd.DataFrame({"a": [1, 2, 3, 4, 5]}, index=[1, 2, 1, 4, 2])
    device = DataFrame.from_pandas(host)
    # Label 2 appears twice; both rows must come back in order.
    assert_eq(host.loc[2], device.loc[2])
def test_dataframe_boolean_mask_with_None():
    """Boolean-list masking selects the same rows on host and device."""
    host = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
    device = DataFrame.from_pandas(host)
    selector = [True, False, True, False]
    assert_eq(host[selector], device[selector])