def test_multiindex_construction():
    """Check that a cudf MultiIndex built from levels/codes matches pandas.

    The cudf index is built once with positional arguments and once with
    keyword arguments; each is compared against the pandas index built from
    the same inputs.
    """
    level_values = [["a", "b"], ["c", "d"]]
    level_codes = [[0, 1], [1, 0]]

    # Positional construction on the cudf side.
    expected = pd.MultiIndex(level_values, level_codes)
    actual = cudf.MultiIndex(level_values, level_codes)
    assert_eq(expected, actual)

    # Keyword construction on the cudf side.
    expected = pd.MultiIndex(level_values, level_codes)
    actual = cudf.MultiIndex(levels=level_values, codes=level_codes)
    assert_eq(expected, actual)
def test_multiindex_construction():
    """Compare cudf and pandas MultiIndex construction from levels and codes.

    Covers both the positional and the keyword calling convention of
    ``cudf.MultiIndex``.
    """
    lvls = [["a", "b"], ["c", "d"]]
    cds = [[0, 1], [1, 0]]

    # cudf called positionally.
    pandas_index = pd.MultiIndex(lvls, cds)
    gpu_index = cudf.MultiIndex(lvls, cds)
    assert_eq(pandas_index, gpu_index)

    # cudf called with explicit keywords.
    pandas_index = pd.MultiIndex(lvls, cds)
    gpu_index = cudf.MultiIndex(levels=lvls, codes=cds)
    assert_eq(pandas_index, gpu_index)
def test_multiindex_equals():
    """Exercise ``MultiIndex.equals`` on groupby-derived and hand-built indexes."""
    frame = cudf.DataFrame(
        {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]}
    )

    def build_manual(first_level):
        # Hand-built index; only the first level's values vary between cases.
        return cudf.MultiIndex(
            levels=[first_level, [1, 2, 5]],
            codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
            names=["x", "y"],
        )

    # Index from a groupby vs. one built by hand to be identical.
    grouped_mean = frame.groupby(["x", "y"]).mean().index
    handmade = build_manual([1, 3, 4, 5])
    assert_eq(grouped_mean.equals(handmade), True)

    # Indexes from two groupbys over the same keys.
    grouped_max = frame.groupby(["x", "y"]).max().index
    assert_eq(grouped_mean.equals(grouped_max), True)

    # Two hand-built indexes with identical inputs.
    left = build_manual([1, 3, 4, 5])
    right = build_manual([1, 3, 4, 5])
    assert_eq(left.equals(right), True)

    # Indexes from groupbys over different keys must differ.
    by_xy = frame.groupby(["x", "y"]).mean().index
    by_xz = frame.groupby(["x", "z"]).mean().index
    assert_eq(by_xy.equals(by_xz), False)

    # Hand-built indexes with different first-level values must differ.
    left = build_manual([1, 3, 4, 5])
    right = build_manual([0, 3, 4, 5])
    assert_eq(left.equals(right), False)
def test_multiindex_types():
    """Compare cudf and pandas MultiIndex for int, float and string levels."""
    codes = [[0, 1], [1, 0]]
    level_sets = (
        [[0, 1], [2, 3]],  # integer levels
        [[1.2, 2.1], [1.3, 3.1]],  # float levels
        [["a", "b"], ["c", "d"]],  # string levels
    )
    for levels in level_sets:
        expected = pd.MultiIndex(levels, codes)
        actual = cudf.MultiIndex(levels, codes)
        assert_eq(expected, actual)
def test_multiindex_types():
    """Build matching pandas/cudf MultiIndexes for several level value types.

    The same codes are reused with integer, float and string level values.
    """
    shared_codes = [[0, 1], [1, 0]]
    for level_values in (
        [[0, 1], [2, 3]],
        [[1.2, 2.1], [1.3, 3.1]],
        [["a", "b"], ["c", "d"]],
    ):
        pandas_index = pd.MultiIndex(level_values, shared_codes)
        gpu_index = cudf.MultiIndex(level_values, shared_codes)
        assert_eq(pandas_index, gpu_index)
def test_multiindex_index_and_columns():
    """Assign MultiIndexes as both index and columns, then compare to pandas.

    The cudf frame receives the cudf MultiIndex objects directly. The pandas
    frame receives their pandas conversions, so the comparison does not rely
    on pandas implicitly coercing a cudf object.
    """
    gdf = cudf.DataFrame()
    gdf["x"] = np.random.randint(0, 5, 5)
    gdf["y"] = np.random.randint(0, 5, 5)
    pdf = gdf.to_pandas()

    mi = cudf.MultiIndex(
        levels=[[0, 1, 2], [3, 4]],
        codes=[[0, 0, 1, 1, 2], [0, 1, 0, 1, 1]],
        names=["x", "y"],
    )
    gdf.index = mi

    mc = cudf.MultiIndex(
        levels=[["val"], ["mean", "min"]], codes=[[0, 0], [0, 1]]
    )
    gdf.columns = mc

    # Convert explicitly: a pandas DataFrame should be given pandas indexes,
    # not cudf ones.
    pdf.index = mi.to_pandas()
    pdf.index.names = ["x", "y"]
    pdf.columns = mc.to_pandas()

    assert_eq(pdf, gdf)
def test_difference():
    """``MultiIndex.difference`` in cudf agrees with the pandas result."""
    base = cudf.MultiIndex(
        levels=[[1, 3, 4, 5], [1, 2, 5]],
        codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
        names=["x", "y"],
    )
    # Same levels as ``base`` with one extra code pair appended.
    extended = cudf.MultiIndex(
        levels=[[1, 3, 4, 5], [1, 2, 5]],
        codes=[[0, 0, 1, 2, 3, 3], [0, 2, 1, 1, 0, 2]],
        names=["x", "y"],
    )

    pandas_extended = extended.to_pandas()
    pandas_base = base.to_pandas()
    expected = pandas_extended.difference(pandas_base)

    actual = extended.difference(base)
    assert_eq(expected, actual)
def test_MI():
    """Check that cudf and pandas produce identical reprs for a MultiIndex frame.

    Compares the repr of the frame, of its index, and of its transpose.

    The pandas display options are set through ``pd.option_context`` so they
    are restored even when an assertion fails; previously a failure skipped
    the trailing ``reset_option`` calls and leaked the settings.
    """
    gdf = cudf.DataFrame(
        {
            "a": np.random.randint(0, 4, 10),
            "b": np.random.randint(0, 4, 10),
            "c": np.random.randint(0, 4, 10),
        }
    )
    levels = [["a", "b", "c", "d"], ["w", "x", "y", "z"], ["m", "n"]]
    codes = cudf.DataFrame(
        {
            "a": [0, 0, 0, 0, 1, 1, 2, 2, 3, 3],
            "b": [0, 1, 2, 3, 0, 1, 2, 3, 0, 1],
            "c": [0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
        }
    )

    # option_context restores the prior values on exit, including on failure.
    with pd.option_context(
        "display.max_rows", 999, "display.max_columns", 0
    ):
        gdf = gdf.set_index(cudf.MultiIndex(levels=levels, codes=codes))
        pdf = gdf.to_pandas()
        gdfT = gdf.T
        pdfT = pdf.T
        assert repr(gdf) == repr(pdf)
        assert repr(gdf.index) == repr(pdf.index)
        assert repr(gdfT) == repr(pdfT)
def test_multiindex_df_assignment():
    """Assigning a MultiIndex to a DataFrame index matches between cudf and pandas."""
    level_values = [["a", "b"], ["c", "d"]]
    level_codes = [[0, 1, 0], [1, 0, 1]]

    pandas_frame = pd.DataFrame({"x": [1, 2, 3]})
    gpu_frame = cudf.from_pandas(pandas_frame)

    pandas_frame.index = pd.MultiIndex(level_values, level_codes)
    gpu_frame.index = cudf.MultiIndex(levels=level_values, codes=level_codes)

    assert_eq(pandas_frame, gpu_frame)
def test_multiindex_series_assignment():
    """Assigning a MultiIndex to a Series index matches between cudf and pandas."""
    level_values = [["a", "b"], ["c", "d"]]
    level_codes = [[0, 1, 0], [1, 0, 1]]

    pandas_series = pd.Series([1, 2, 3])
    gpu_series = cudf.from_pandas(pandas_series)

    pandas_series.index = pd.MultiIndex(level_values, level_codes)
    gpu_series.index = cudf.MultiIndex(levels=level_values, codes=level_codes)

    assert_eq(pandas_series, gpu_series)
def test_multiindex_copy():
    """A shallow or deep ``copy`` of a MultiIndex compares equal to its source.

    Checked for an index produced by a groupby and for one built by hand.
    """
    frame = cudf.DataFrame(
        {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]}
    )
    from_groupby = frame.groupby(["x", "y"]).mean().index
    handmade = cudf.MultiIndex(
        levels=[[1, 3, 4, 5], [1, 2, 5]],
        codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
        names=["x", "y"],
    )

    for source in (from_groupby, handmade):
        for deep in (False, True):
            duplicate = source.copy(deep=deep)
            assert_eq(source, duplicate)
def test_multiindex_df_assignment():
    """Set a MultiIndex as a DataFrame's index in pandas and cudf and compare."""
    lvls = [["a", "b"], ["c", "d"]]
    cds = [[0, 1, 0], [1, 0, 1]]

    host_df = pd.DataFrame({"x": [1, 2, 3]})
    device_df = cudf.from_pandas(host_df)

    host_index = pd.MultiIndex(lvls, cds)
    host_df.index = host_index

    device_index = cudf.MultiIndex(levels=lvls, codes=cds)
    device_df.index = device_index

    assert_eq(host_df, device_df)
def test_array_func_missing_cudf_multi_index(func):
    """Calling ``func`` on a cudf MultiIndex must raise ``TypeError``.

    ``func`` is supplied by the caller (presumably via pytest
    parametrization that is not visible here).
    """
    midx = cudf.MultiIndex([["a", "b"], ["c", "d"]], [[0, 1], [1, 0]])
    with pytest.raises(TypeError):
        func(midx)
def test_multiindex_index_and_columns():
    """Use MultiIndexes for both index and columns and compare with pandas.

    The pandas frame is given the pandas conversions of the same cudf
    MultiIndex objects that are assigned to the cudf frame.
    """
    device_df = cudf.DataFrame()
    device_df["x"] = np.random.randint(0, 5, 5)
    device_df["y"] = np.random.randint(0, 5, 5)
    host_df = device_df.to_pandas()

    row_index = cudf.MultiIndex(
        levels=[[0, 1, 2], [3, 4]],
        codes=[[0, 0, 1, 1, 2], [0, 1, 0, 1, 1]],
        names=["x", "y"],
    )
    device_df.index = row_index

    column_index = cudf.MultiIndex(
        levels=[["val"], ["mean", "min"]],
        codes=[[0, 0], [0, 1]],
    )
    device_df.columns = column_index

    host_df.index = row_index.to_pandas()
    host_df.columns = column_index.to_pandas()

    assert_eq(host_df, device_df)
def test_multiindex_values_host():
    """``values_host`` of a cudf MultiIndex equals ``values`` of its pandas copy."""
    device_index = cudf.MultiIndex(
        levels=[[1, 3, 4, 5], [1, 2, 5]],
        codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
        names=["x", "y"],
    )
    host_index = device_index.to_pandas()

    actual = device_index.values_host
    expected = host_index.values
    assert_eq(actual, expected)
def keys(self):
    """Return the key columns as an index object.

    With more than one key column, a ``cudf.MultiIndex`` is built from a
    DataFrame whose columns are labelled by position (0, 1, ...) and named
    with ``self.names``. Otherwise the first key column is wrapped with
    ``as_index`` using the first entry of ``self.names``.
    """
    key_columns = self._key_columns
    if len(key_columns) <= 1:
        return cudf.core.index.as_index(key_columns[0], name=self.names[0])

    # Label each key column by its position.
    positional_frame = cudf.DataFrame(dict(enumerate(key_columns)))
    return cudf.MultiIndex(source_data=positional_frame, names=self.names)
def test_multiindex_equality():
    """Compare MultiIndexes from groupbys and hand-built ones with assert_eq/assert_neq.

    Each section builds a fresh ``mi1``/``mi2`` pair. Earlier revisions
    assigned ``mi1`` twice in the last three sections, so they compared
    against a stale ``mi2`` and never tested the pair their comments named.
    """
    gdf = cudf.DataFrame(
        {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]}
    )

    # Index from a groupby vs. one built by hand to be identical.
    mi1 = gdf.groupby(["x", "y"]).mean().index
    mi2 = cudf.MultiIndex(
        levels=[[1, 3, 4, 5], [1, 2, 5]],
        codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
        names=["x", "y"],
    )
    assert_eq(mi1, mi2)

    # Indexes from two groupbys over the same keys.
    mi2 = gdf.groupby(["x", "y"]).max().index
    assert_eq(mi1, mi2)

    # Two hand-built indexes with identical inputs.
    mi1 = cudf.MultiIndex(
        levels=[[1, 3, 4, 5], [1, 2, 5]],
        codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
        names=["x", "y"],
    )
    mi2 = cudf.MultiIndex(
        levels=[[1, 3, 4, 5], [1, 2, 5]],
        codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
        names=["x", "y"],
    )
    assert_eq(mi1, mi2)

    # Indexes from groupbys over different keys must not compare equal.
    mi1 = gdf.groupby(["x", "y"]).mean().index
    mi2 = gdf.groupby(["x", "z"]).mean().index
    assert_neq(mi1, mi2)

    # Hand-built indexes with different first-level values must not compare equal.
    mi1 = cudf.MultiIndex(
        levels=[[1, 3, 4, 5], [1, 2, 5]],
        codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
        names=["x", "y"],
    )
    mi2 = cudf.MultiIndex(
        levels=[[0, 3, 4, 5], [1, 2, 5]],
        codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
        names=["x", "y"],
    )
    assert_neq(mi1, mi2)
def test_multiindex_values():
    """``MultiIndex.values`` is a cupy array holding one row per index entry."""
    midx = cudf.MultiIndex(
        levels=[[1, 3, 4, 5], [1, 2, 5]],
        codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
        names=["x", "y"],
    )

    device_values = midx.values
    assert isinstance(device_values, cp.ndarray)

    # ``.get()`` is called on the cupy array before the numpy comparison.
    host_values = device_values.get()
    expected = np.array([[1, 1], [1, 5], [3, 2], [4, 2], [5, 1]])
    np.testing.assert_array_equal(host_values, expected)
def test_multiindex_levels_codes_validation():
    """Invalid levels/codes raise the same exception type in pandas and cudf.

    Each case is tried against ``pd.MultiIndex`` first and then
    ``cudf.MultiIndex``.
    """
    levels = [["a", "b"], ["c", "d"]]

    # (expected exception, positional arguments for the constructor)
    invalid_cases = [
        # Codes not a sequence of sequences
        (TypeError, (levels, [0, 1])),
        # Codes don't match levels
        (ValueError, (levels, [[0], [1], [1]])),
        # Largest code greater than number of levels
        (ValueError, (levels, [[0, 1], [0, 2]])),
        # Unequal code lengths
        (ValueError, (levels, [[0, 1], [0]])),
        # Didn't pass levels and codes
        (TypeError, ()),
        # Didn't pass non zero levels and codes
        (ValueError, ([], [])),
    ]

    for expected_error, ctor_args in invalid_cases:
        for library in (pd, cudf):
            with pytest.raises(expected_error):
                library.MultiIndex(*ctor_args)
def test_multiIndex_duplicate_names():
    """A MultiIndex whose levels share one name matches between cudf and pandas."""
    ctor_kwargs = {
        "levels": [["a", "b"], ["b", "a"]],
        "codes": [[0, 0], [0, 1]],
        "names": ["a", "a"],
    }
    device_index = cudf.MultiIndex(**ctor_kwargs)
    host_index = pd.MultiIndex(**ctor_kwargs)
    assert_eq(device_index, host_index)
def test_multiindex_iter_error():
    """``iter()`` on a cudf MultiIndex raises ``TypeError`` with the expected message."""
    midx = cudf.MultiIndex(
        levels=[[1, 3, 4, 5], [1, 2, 5]],
        codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
        names=["x", "y"],
    )

    class_name = midx.__class__.__name__
    expected_message = (
        f"{class_name} object is not iterable. "
        "Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` "
        "if you wish to iterate over the values."
    )
    # Escape the message so it is matched literally by the regex search.
    pattern = re.escape(expected_message)

    with pytest.raises(TypeError, match=pattern):
        iter(midx)
def test_frequency_features():
    """``clx.features.frequency`` on ``df`` yields the expected per-user frame.

    ``df`` is not defined in this function (presumably a module-level
    fixture). The expected columns are a two-level MultiIndex with levels
    ``["time"]`` and ``["c1", "c2", "c3"]``.
    """
    actual = clx.features.frequency(df, "user", "computer")

    expected_columns = {
        "user": ["u1", "u2", "u3"],
        "c1": [0.75, 0.00, 0.00],
        "c2": [0.25, 0.50, 0.0],
        "c3": [0.0, 0.5, 1.0],
    }
    expected = cudf.DataFrame(expected_columns).set_index("user")
    expected.columns = cudf.MultiIndex(
        names=[None, "computer"],
        codes=[[0, 0, 0], [0, 1, 2]],
        levels=[["time"], ["c1", "c2", "c3"]],
    )

    assert expected.equals(actual)
def _concat(cls, objs):
    """Concatenate several MultiIndex-like objects into one ``cudf.MultiIndex``.

    The ``_source_data`` frames of ``objs`` are concatenated with
    ``cudf.DataFrame._concat``. For each level, the resulting name is the
    first truthy name found at that position among the objects whose
    ``names`` is not ``None``; otherwise it stays ``None``.

    NOTE(review): the column labels of every frame after the first are
    overwritten in place with the first frame's labels, so the inputs'
    ``_source_data`` appear to be mutated; confirm callers expect that.
    """
    frames = [obj._source_data for obj in objs]

    # Give every later frame the first frame's column labels before concatenating.
    if len(frames) > 1:
        for frame in frames[1:]:
            frame.columns = frames[0].columns

    combined = cudf.DataFrame._concat(frames)

    level_names = [None for _ in combined.columns]
    named_objs = [obj for obj in objs if obj.names is not None]
    for obj in named_objs:
        for position, candidate in enumerate(obj.names):
            # Keep an already-chosen name; otherwise take this object's.
            level_names[position] = level_names[position] or candidate

    return cudf.MultiIndex(names=level_names, source_data=combined)
def test_dataframe_take_with_multiIndex(ntake):
    """``DataFrame.take`` on a MultiIndexed frame matches the pandas result.

    ``ntake`` is the number of row positions drawn at random (with a fixed
    seed) and passed to ``take``.
    """
    np.random.seed(0)

    row_index = cudf.MultiIndex(
        levels=[["lama", "cow", "falcon"], ["speed", "weight", "length"]],
        codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
    )
    df = DataFrame(index=row_index)

    row_count = 9
    df["ii"] = np.random.randint(0, 20, row_count)
    df["ff"] = np.random.random(row_count)

    positions = np.random.randint(0, len(df), ntake)

    actual = df.take(positions)
    expected = df.to_pandas().take(positions)
    assert_eq(actual, expected)
def test_binary_features():
    """``clx.features.binary`` on ``df`` yields the expected int32 per-user frame.

    ``df`` is not defined in this function (presumably a module-level
    fixture). The expected columns are a two-level MultiIndex with levels
    ``["time"]`` and ``["c1", "c2", "c3"]``.
    """
    actual = clx.features.binary(df, "user", "computer")

    expected = cudf.DataFrame(
        {
            "user": ["u1", "u2", "u3"],
            "c1": [1, 0, 0],
            "c2": [1, 1, 0],
            "c3": [0, 1, 1],
        }
    )
    expected = expected.set_index("user")

    # Cast each value column to int32 before the comparison.
    for column in ("c1", "c2", "c3"):
        expected[column] = expected[column].astype("int32")

    expected.columns = cudf.MultiIndex(
        names=[None, "computer"],
        codes=[[0, 0, 0], [0, 1, 2]],
        levels=[["time"], ["c1", "c2", "c3"]],
    )

    assert expected.equals(actual)
def test_MI():
    """Check that cudf and pandas produce identical reprs for a MultiIndex frame.

    Compares the repr of the frame and of its transpose, using randomly
    generated values and codes.

    ``display.max_rows`` is raised through ``pd.option_context`` so the
    previous value is restored on exit; previously the option was set
    globally and never reset, leaking into later code.
    """
    gdf = cudf.DataFrame(
        {
            "a": np.random.randint(0, 4, 10),
            "b": np.random.randint(0, 4, 10),
            "c": np.random.randint(0, 4, 10),
        }
    )
    levels = [["a", "b", "c", "d"], ["w", "x", "y", "z"], ["m", "n"]]
    # Random codes within each level's size (4, 4 and 2 values respectively).
    codes = cudf.DataFrame(
        {
            "a": np.random.randint(0, 4, 10),
            "b": np.random.randint(0, 4, 10),
            "c": np.random.randint(0, 2, 10),
        }
    )

    # option_context restores the prior value on exit, including on failure.
    with pd.option_context("display.max_rows", 999):
        gdf = gdf.set_index(cudf.MultiIndex(levels=levels, codes=codes))
        pdf = gdf.to_pandas()
        gdfT = gdf.T
        pdfT = pdf.T
        assert repr(gdf) == repr(pdf)
        assert repr(gdfT) == repr(pdfT)
], "Symbol": [ "AMZN", "AMZN", "AMZN", "MSFT", "MSFT", "MSFT", "NVDA", "NVDA", "NVDA", ], }, cudf.MultiIndex( levels=[[1001, 1002], [2001, 2002]], codes=[[1, 1, 0, 0], [0, 1, 0, 1]], names=["col1", "col2"], ), ], ) @pytest.mark.parametrize("deep", [True, False]) def test_multiindex_copy_deep(data, deep): """Test memory idendity for deep copy Case1: Constructed from GroupBy, StringColumns Case2: Constrcuted from MultiIndex, NumericColumns """ same_ref = not deep if isinstance(data, dict): import operator from functools import reduce
], "Symbol": [ "AMZN", "AMZN", "AMZN", "MSFT", "MSFT", "MSFT", "NVDA", "NVDA", "NVDA", ], }, cudf.MultiIndex( levels=[[1001, 1002], [2001, 2002]], codes=[[1, 1, 0, 0], [0, 1, 0, 1]], names=["col1", "col2"], ), ], ) @pytest.mark.parametrize("deep", [True, False]) def test_multiindex_copy_deep(data, deep): """Test memory identity for deep copy Case1: Constructed from GroupBy, StringColumns Case2: Constructed from MultiIndex, NumericColumns """ same_ref = not deep if isinstance(data, dict): import operator from functools import reduce