def test_dataframe_scatter_by_map(map_size, nelem, keep):
    """Check ``DataFrame.scatter_by_map`` for several kinds of map column.

    A frame with a string, a float, an integer and a categorical column is
    scattered on each of those columns in turn, and then again on the float
    column after moving one column, and then two columns, into the index.

    Parameters
    ----------
    map_size
        Number of partitions requested from ``scatter_by_map``; it also
        bounds the values generated for every column.
    nelem
        Number of rows in the generated frame.
    keep
        Forwarded to ``scatter_by_map`` as ``keep_index``.
    """
    strlist = ["dog", "cat", "fish", "bird", "pig", "fox", "cow", "goat"]
    np.random.seed(0)
    df = DataFrame()
    df["a"] = np.random.choice(strlist[:map_size], nelem)
    df["b"] = np.random.uniform(low=0, high=map_size, size=nelem)
    df["c"] = np.random.randint(map_size, size=nelem)
    df["d"] = df["a"].astype("category")

    def _check_scatter_by_map(parts, col):
        """Validate the partitions obtained by scattering on ``col``."""
        assert len(parts) == map_size
        name = col.name
        nrows = 0
        for i, part in enumerate(parts):
            nrows += len(part)
            if len(part) > 0:
                # Make sure the column types were preserved
                assert isinstance(part[name]._column, type(col._column))
            # Columns that cannot be cast to int32 are inspected as-is.
            try:
                sr = part[name].astype(np.int32)
            except ValueError:
                sr = part[name]
            # Every partition may hold at most one distinct map value.
            nunique = sr.nunique()
            assert nunique <= 1
            if nunique == 1 and isinstance(
                part[name]._column, NumericalColumn
            ):
                # For numeric map columns the int32 value must equal the
                # position of the partition in the returned list.
                assert sr.iloc[0] == i
        # No row may be lost or duplicated by the scatter.
        assert nrows == nelem

    for key in ("a", "b", "c", "d"):
        _check_scatter_by_map(
            df.scatter_by_map(key, map_size, keep_index=keep), df[key]
        )

    if map_size == 2 and nelem == 100:
        df.scatter_by_map("a")  # Auto-detect map_size
        with pytest.raises(ValueError):
            df.scatter_by_map("a", map_size=1, debug=True)  # Bad map_size

    # Test GenericIndex
    df2 = df.set_index("c")
    generic_result = df2.scatter_by_map("b", map_size, keep_index=keep)
    _check_scatter_by_map(generic_result, df2["b"])
    if keep:
        for frame in generic_result:
            # The index type must survive when the index is kept.
            assert isinstance(frame.index, type(df2.index))

    # Test MultiIndex
    df2 = df.set_index(["a", "c"])
    multiindex_result = df2.scatter_by_map("b", map_size, keep_index=keep)
    _check_scatter_by_map(multiindex_result, df2["b"])
    if keep:
        for frame in multiindex_result:
            # The index type must survive when the index is kept.
            assert isinstance(frame.index, type(df2.index))
def test_factorize_series_index():
    """Compare ``Series.factorize`` with pandas, before and after ``set_index``.

    The codes and the uniques returned for ``col1`` are compared against the
    pandas result, first on the freshly built frame and then once ``col2``
    has been made the index.
    """
    df = DataFrame()
    df["col1"] = ["C", "H", "C", "W", "W", "W", "W", "W", "C", "W"]
    df["col2"] = [
        2992443.0,
        2992447.0,
        2992466.0,
        2992440.0,
        2992441.0,
        2992442.0,
        2992444.0,
        2992445.0,
        2992446.0,
        2992448.0,
    ]

    def _assert_matches_pandas(frame):
        """Check codes and uniques of ``frame.col1`` against pandas."""
        got_codes, got_uniques = frame.col1.factorize()
        want_codes, want_uniques = frame.to_pandas().col1.factorize()
        # Codes come back as a device array, hence the ``.get()``.
        assert_eq(got_codes.get(), want_codes)
        assert_eq(got_uniques.to_pandas().values, want_uniques.values)

    _assert_matches_pandas(df)

    df = df.set_index("col2")
    _assert_matches_pandas(df)
def test_dataframe_sort_values_ignore_index(index, ignore_index):
    """Compare ``sort_values(..., ignore_index=...)`` with pandas.

    Parameters
    ----------
    index
        Passed to ``set_index`` on the three-column test frame.
    ignore_index
        Forwarded to ``sort_values`` on both the pandas and the GPU frame.
    """
    data = {
        "a": [1, 3, 5, 2, 4],
        "b": [1, 1, 2, 2, 3],
        "c": [9, 7, 7, 7, 1],
    }
    gdf = DataFrame(data).set_index(index)
    pdf = gdf.to_pandas()

    # Sort by every remaining column, with the same keyword on both sides.
    sort_kwargs = {"ignore_index": ignore_index}
    expect = pdf.sort_values(list(pdf.columns), **sort_kwargs)
    got = gdf.sort_values(gdf.columns, **sort_kwargs)

    assert_eq(expect, got)
def test_factorize_index_obj(ncats, nelem):
    """Check ``Index.factorize`` against a hand-built encoding.

    Parameters
    ----------
    ncats
        Exclusive upper bound for the random integer values, i.e. the
        maximum number of distinct categories (``np.random.randint``
        requires it to be positive).
    nelem
        Number of values placed in the index.
    """
    np.random.seed(0)

    # Build the data from the test parameters; these were previously ignored
    # in favour of a hard-coded ``randint(2, size=10)``, so every
    # parametrized case exercised the exact same input.
    arr = np.random.randint(ncats, size=nelem, dtype=np.int32)
    df = DataFrame()
    df["cats"] = arr
    df = df.set_index("cats")

    codes, uniques = df.index.factorize()
    # The uniques are expected to be the sorted distinct input values.
    np.testing.assert_array_equal(uniques.values.get(), sorted(set(arr)))
    assert isinstance(codes, cp.ndarray)
    assert isinstance(uniques, Index)

    # Re-encode the input by hand from the returned uniques and compare it
    # with the returned codes.
    encoder = {uniques[idx]: idx for idx in range(len(uniques))}
    handcoded = [encoder[v] for v in arr]
    np.testing.assert_array_equal(codes.get(), handcoded)