Пример #1
0
def test_multiindex_construction():
    """Build the same MultiIndex with pandas and cudf and compare them.

    The cudf index is constructed once from positional arguments and once
    from ``levels=`` / ``codes=`` keyword arguments.
    """
    level_values = [['a', 'b'], ['c', 'd']]
    level_codes = [[0, 1], [1, 0]]

    # Positional construction on the cudf side.
    expected = pd.MultiIndex(level_values, level_codes)
    got = cudf.MultiIndex(level_values, level_codes)
    assert_eq(expected, got)

    # Keyword construction on the cudf side.
    expected = pd.MultiIndex(level_values, level_codes)
    got = cudf.MultiIndex(levels=level_values, codes=level_codes)
    assert_eq(expected, got)
Пример #2
0
def test_multiindex_construction():
    """Compare pandas and cudf MultiIndex objects built from the same inputs.

    Two cudf call styles are exercised: positional and keyword arguments.
    """
    lvls = [["a", "b"], ["c", "d"]]
    cds = [[0, 1], [1, 0]]

    cudf_builders = (
        lambda: cudf.MultiIndex(lvls, cds),
        lambda: cudf.MultiIndex(levels=lvls, codes=cds),
    )
    for build_cudf in cudf_builders:
        # The pandas reference is rebuilt for every comparison.
        reference = pd.MultiIndex(lvls, cds)
        assert_eq(reference, build_cudf())
Пример #3
0
def test_multiindex_equals():
    """Check ``MultiIndex.equals`` on groupby-derived and hand-built indexes."""
    frame = cudf.DataFrame({
        "x": [1, 5, 3, 4, 1],
        "y": [1, 1, 2, 2, 5],
        "z": [0, 1, 0, 1, 0]
    })

    def manual(first_level):
        # Hand-built index; only the first level's values differ per case.
        return cudf.MultiIndex(
            levels=[first_level, [1, 2, 5]],
            codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
            names=["x", "y"],
        )

    # Index from a groupby vs. one written out by hand to be identical.
    from_groupby = frame.groupby(["x", "y"]).mean().index
    by_hand = manual([1, 3, 4, 5])
    assert_eq(from_groupby.equals(by_hand), True)

    # Indexes from two groupbys over the same keys.
    from_other_groupby = frame.groupby(["x", "y"]).max().index
    assert_eq(from_groupby.equals(from_other_groupby), True)

    # Two hand-built indexes with the same contents.
    left = manual([1, 3, 4, 5])
    right = manual([1, 3, 4, 5])
    assert_eq(left.equals(right), True)

    # Indexes from groupbys over different keys must differ.
    left = frame.groupby(["x", "y"]).mean().index
    right = frame.groupby(["x", "z"]).mean().index
    assert_eq(left.equals(right), False)

    # Hand-built indexes with different level values must differ.
    left = manual([1, 3, 4, 5])
    right = manual([0, 3, 4, 5])
    assert_eq(left.equals(right), False)
Пример #4
0
def test_multiindex_types():
    """Compare pandas and cudf MultiIndex for int, float and string levels."""
    codes = [[0, 1], [1, 0]]
    level_variants = (
        [[0, 1], [2, 3]],
        [[1.2, 2.1], [1.3, 3.1]],
        [["a", "b"], ["c", "d"]],
    )
    for levels in level_variants:
        expected = pd.MultiIndex(levels, codes)
        got = cudf.MultiIndex(levels, codes)
        assert_eq(expected, got)
Пример #5
0
def test_multiindex_types():
    """Check that cudf matches pandas for several level value types."""
    shared_codes = [[0, 1], [1, 0]]

    def check(level_values):
        # Build both indexes from the same inputs and compare them.
        reference = pd.MultiIndex(level_values, shared_codes)
        candidate = cudf.MultiIndex(level_values, shared_codes)
        assert_eq(reference, candidate)

    check([[0, 1], [2, 3]])              # integer levels
    check([[1.2, 2.1], [1.3, 3.1]])      # float levels
    check([['a', 'b'], ['c', 'd']])      # string levels
Пример #6
0
def test_multiindex_index_and_columns():
    """Assign a MultiIndex to both the index and columns of a DataFrame.

    The cudf frame receives cudf MultiIndex objects directly, while the
    pandas frame receives their explicit pandas conversions; the two frames
    are then compared.
    """
    gdf = cudf.DataFrame()
    gdf['x'] = np.random.randint(0, 5, 5)
    gdf['y'] = np.random.randint(0, 5, 5)
    pdf = gdf.to_pandas()
    mi = cudf.MultiIndex(levels=[[0, 1, 2], [3, 4]], codes=[[0, 0, 1, 1, 2],
                         [0, 1, 0, 1, 1]], names=['x', 'y'])
    gdf.index = mi
    mc = cudf.MultiIndex(levels=[['val'], ['mean', 'min']],
                         codes=[[0, 0], [0, 1]])
    gdf.columns = mc
    # Convert explicitly instead of handing a cudf object to pandas and
    # relying on an implicit conversion.
    pdf.index = mi.to_pandas()
    pdf.index.names = ['x', 'y']
    pdf.columns = mc.to_pandas()
    assert_eq(pdf, gdf)
Пример #7
0
def test_difference():
    """Compare ``MultiIndex.difference`` in cudf against pandas."""

    def build(codes):
        # Both indexes share levels and names and differ only in their codes.
        return cudf.MultiIndex(
            levels=[[1, 3, 4, 5], [1, 2, 5]],
            codes=codes,
            names=["x", "y"],
        )

    smaller = build([[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]])
    larger = build([[0, 0, 1, 2, 3, 3], [0, 2, 1, 1, 0, 2]])

    expected = larger.to_pandas().difference(smaller.to_pandas())
    actual = larger.difference(smaller)
    assert_eq(expected, actual)
Пример #8
0
def test_MI():
    """Compare the repr of a MultiIndex-indexed cudf frame with pandas.

    The frame, its index and its transpose are each compared by ``__repr__``.
    The pandas display options are changed only inside an ``option_context``
    so they are restored even when one of the assertions fails.
    """
    gdf = cudf.DataFrame(
        {
            "a": np.random.randint(0, 4, 10),
            "b": np.random.randint(0, 4, 10),
            "c": np.random.randint(0, 4, 10),
        }
    )
    levels = [["a", "b", "c", "d"], ["w", "x", "y", "z"], ["m", "n"]]
    codes = cudf.DataFrame(
        {
            "a": [0, 0, 0, 0, 1, 1, 2, 2, 3, 3],
            "b": [0, 1, 2, 3, 0, 1, 2, 3, 0, 1],
            "c": [0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
        }
    )
    with pd.option_context(
        "display.max_rows", 999, "display.max_columns", 0
    ):
        gdf = gdf.set_index(cudf.MultiIndex(levels=levels, codes=codes))
        pdf = gdf.to_pandas()
        gdfT = gdf.T
        pdfT = pdf.T
        assert gdf.__repr__() == pdf.__repr__()
        assert gdf.index.__repr__() == pdf.index.__repr__()
        assert gdfT.__repr__() == pdfT.__repr__()
Пример #9
0
def test_multiindex_df_assignment():
    """Assign equivalent MultiIndexes to a pandas and a cudf DataFrame."""
    level_values = [['a', 'b'], ['c', 'd']]
    level_codes = [[0, 1, 0], [1, 0, 1]]

    pdf = pd.DataFrame({'x': [1, 2, 3]})
    gdf = cudf.from_pandas(pdf)

    pdf.index = pd.MultiIndex(level_values, level_codes)
    gdf.index = cudf.MultiIndex(levels=level_values, codes=level_codes)
    assert_eq(pdf, gdf)
Пример #10
0
def test_multiindex_series_assignment():
    """Assign equivalent MultiIndexes to a pandas and a cudf Series."""
    level_values = [["a", "b"], ["c", "d"]]
    level_codes = [[0, 1, 0], [1, 0, 1]]

    pandas_series = pd.Series([1, 2, 3])
    cudf_series = cudf.from_pandas(pandas_series)

    pandas_series.index = pd.MultiIndex(level_values, level_codes)
    cudf_series.index = cudf.MultiIndex(levels=level_values,
                                        codes=level_codes)
    assert_eq(pandas_series, cudf_series)
Пример #11
0
def test_multiindex_copy():
    """Check that shallow and deep copies of a MultiIndex equal the original.

    Both a groupby-derived index and a hand-built index are copied.
    """
    frame = cudf.DataFrame({
        "x": [1, 5, 3, 4, 1],
        "y": [1, 1, 2, 2, 5],
        "z": [0, 1, 0, 1, 0]
    })

    # Index produced by a groupby.
    original = frame.groupby(["x", "y"]).mean().index
    for deep in (False, True):
        duplicate = original.copy(deep=deep)
        assert_eq(original, duplicate)

    # Index constructed by hand.
    original = cudf.MultiIndex(
        levels=[[1, 3, 4, 5], [1, 2, 5]],
        codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
        names=["x", "y"],
    )
    for deep in (False, True):
        duplicate = original.copy(deep=deep)
        assert_eq(original, duplicate)
Пример #12
0
def test_multiindex_df_assignment():
    """Give a pandas and a cudf DataFrame matching MultiIndexes and compare."""
    pandas_frame = pd.DataFrame({"x": [1, 2, 3]})
    cudf_frame = cudf.from_pandas(pandas_frame)

    pandas_index = pd.MultiIndex(
        [["a", "b"], ["c", "d"]], [[0, 1, 0], [1, 0, 1]]
    )
    pandas_frame.index = pandas_index

    cudf_index = cudf.MultiIndex(
        levels=[["a", "b"], ["c", "d"]],
        codes=[[0, 1, 0], [1, 0, 1]],
    )
    cudf_frame.index = cudf_index

    assert_eq(pandas_frame, cudf_frame)
Пример #13
0
def test_array_func_missing_cudf_multi_index(func):
    """Expect ``func`` to raise ``TypeError`` when given a cudf MultiIndex."""
    midx = cudf.MultiIndex([["a", "b"], ["c", "d"]], [[0, 1], [1, 0]])

    with pytest.raises(TypeError):
        func(midx)
Пример #14
0
def test_multiindex_index_and_columns():
    """Set MultiIndex row and column labels on cudf and pandas frames.

    The pandas frame receives the pandas conversions of the same cudf
    indexes, and the two frames are then compared.
    """
    gdf = cudf.DataFrame()
    for column in ("x", "y"):
        gdf[column] = np.random.randint(0, 5, 5)
    pdf = gdf.to_pandas()

    row_index = cudf.MultiIndex(
        levels=[[0, 1, 2], [3, 4]],
        codes=[[0, 0, 1, 1, 2], [0, 1, 0, 1, 1]],
        names=["x", "y"],
    )
    column_index = cudf.MultiIndex(
        levels=[["val"], ["mean", "min"]],
        codes=[[0, 0], [0, 1]],
    )

    gdf.index = row_index
    gdf.columns = column_index
    pdf.index = row_index.to_pandas()
    pdf.columns = column_index.to_pandas()
    assert_eq(pdf, gdf)
Пример #15
0
def test_multiindex_values_host():
    """Compare ``MultiIndex.values_host`` with the pandas ``.values``."""
    level_values = [[1, 3, 4, 5], [1, 2, 5]]
    level_codes = [[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]]
    midx = cudf.MultiIndex(
        levels=level_values, codes=level_codes, names=["x", "y"]
    )
    pandas_midx = midx.to_pandas()

    got = midx.values_host
    expected = pandas_midx.values
    assert_eq(got, expected)
Пример #16
0
 def keys(self):
     """Return the key columns as an index.

     A single key column is wrapped with ``as_index`` under the first
     name; several key columns become a ``cudf.MultiIndex`` built from a
     DataFrame whose columns are labelled by position.
     """
     key_columns = self._key_columns
     if len(key_columns) <= 1:
         return cudf.core.index.as_index(key_columns[0],
                                         name=self.names[0])

     # Label each key column with its position: 0, 1, 2, ...
     positional_frame = cudf.DataFrame(dict(enumerate(key_columns)))
     return cudf.MultiIndex(source_data=positional_frame, names=self.names)
Пример #17
0
def test_multiindex_equality():
    """Compare MultiIndexes built by groupby and by hand with assert_eq/neq.

    Each section builds its own ``mi1``/``mi2`` pair so that the assertion
    checks the two indexes described by the section comment rather than a
    leftover index from an earlier section.
    """
    # mi made from groupby
    # mi made manually to be identical
    # are they equal?
    gdf = cudf.DataFrame({
        'x': [1, 5, 3, 4, 1],
        'y': [1, 1, 2, 2, 5],
        'z': [0, 1, 0, 1, 0]
    })
    mi1 = gdf.groupby(['x', 'y']).mean().index
    mi2 = cudf.MultiIndex(levels=[[1, 3, 4, 5], [1, 2, 5]],
                          codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
                          names=['x', 'y'])
    assert_eq(mi1, mi2)

    # mi made from two groupbys, are they equal?
    mi2 = gdf.groupby(['x', 'y']).max().index
    assert_eq(mi1, mi2)

    # mi made manually twice are they equal?
    mi1 = cudf.MultiIndex(levels=[[1, 3, 4, 5], [1, 2, 5]],
                          codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
                          names=['x', 'y'])
    mi2 = cudf.MultiIndex(levels=[[1, 3, 4, 5], [1, 2, 5]],
                          codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
                          names=['x', 'y'])
    assert_eq(mi1, mi2)

    # mi made from different groupbys are they not equal?
    mi1 = gdf.groupby(['x', 'y']).mean().index
    mi2 = gdf.groupby(['x', 'z']).mean().index
    assert_neq(mi1, mi2)

    # mi made from different manuals are they not equal?
    mi1 = cudf.MultiIndex(levels=[[1, 3, 4, 5], [1, 2, 5]],
                          codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
                          names=['x', 'y'])
    mi2 = cudf.MultiIndex(levels=[[0, 3, 4, 5], [1, 2, 5]],
                          codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
                          names=['x', 'y'])
    assert_neq(mi1, mi2)
Пример #18
0
def test_multiindex_values():
    """Check that ``MultiIndex.values`` is a cupy array with the right rows."""
    expected_rows = np.array([[1, 1], [1, 5], [3, 2], [4, 2], [5, 1]])

    midx = cudf.MultiIndex(
        levels=[[1, 3, 4, 5], [1, 2, 5]],
        codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
        names=["x", "y"],
    )
    device_values = midx.values

    assert isinstance(device_values, cp.ndarray)
    # ``.get()`` is called on the cupy array before the numpy comparison.
    np.testing.assert_array_equal(device_values.get(), expected_rows)
Пример #19
0
def test_multiindex_levels_codes_validation():
    """Check that invalid levels/codes raise the same errors in both libraries.

    Every invalid input is tried against ``pd.MultiIndex`` first and then
    against ``cudf.MultiIndex``, expecting the same exception type.
    """
    levels = [["a", "b"], ["c", "d"]]
    constructors = (pd.MultiIndex, cudf.MultiIndex)

    invalid_codes = [
        # Codes not a sequence of sequences
        (TypeError, [0, 1]),
        # Codes don't match levels
        (ValueError, [[0], [1], [1]]),
        # Largest code greater than number of levels
        (ValueError, [[0, 1], [0, 2]]),
        # Unequal code lengths
        (ValueError, [[0, 1], [0]]),
    ]
    for expected_error, codes in invalid_codes:
        for make_index in constructors:
            with pytest.raises(expected_error):
                make_index(levels, codes)

    # Didn't pass levels and codes
    for make_index in constructors:
        with pytest.raises(TypeError):
            make_index()

    # Didn't pass non zero levels and codes
    for make_index in constructors:
        with pytest.raises(ValueError):
            make_index([], [])
Пример #20
0
def test_multiIndex_duplicate_names():
    """Build MultiIndexes whose two levels share a name and compare them."""

    def spec():
        # Fresh keyword arguments for each constructor call.
        return {
            "levels": [["a", "b"], ["b", "a"]],
            "codes": [[0, 0], [0, 1]],
            "names": ["a", "a"],
        }

    cudf_index = cudf.MultiIndex(**spec())
    pandas_index = pd.MultiIndex(**spec())

    assert_eq(cudf_index, pandas_index)
Пример #21
0
def test_multiindex_iter_error():
    """Expect ``iter()`` on a cudf MultiIndex to raise a specific TypeError."""
    midx = cudf.MultiIndex(
        levels=[[1, 3, 4, 5], [1, 2, 5]],
        codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
        names=["x", "y"],
    )

    # The message is escaped because ``match`` treats it as a regex.
    expected_message = (
        f"{midx.__class__.__name__} object is not iterable. "
        "Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` "
        "if you wish to iterate over the values."
    )
    pattern = re.escape(expected_message)

    with pytest.raises(TypeError, match=pattern):
        iter(midx)
Пример #22
0
def test_frequency_features():
    """Compare ``clx.features.frequency`` output with a hand-built frame."""
    actual = clx.features.frequency(df, "user", "computer")

    expected = cudf.DataFrame({
        "user": ["u1", "u2", "u3"],
        "c1": [0.75, 0.00, 0.00],
        "c2": [0.25, 0.50, 0.0],
        "c3": [0.0, 0.5, 1.0],
    }).set_index("user")

    # Two-level column labels: a single "time" group over c1..c3.
    column_labels = cudf.MultiIndex(
        names=[None, "computer"],
        codes=[[0, 0, 0], [0, 1, 2]],
        levels=[["time"], ["c1", "c2", "c3"]],
    )
    expected.columns = column_labels

    assert expected.equals(actual)
Пример #23
0
    def _concat(cls, objs):
        """Concatenate the source data of ``objs`` into one MultiIndex.

        Every frame after the first has its columns overwritten with the
        first frame's columns before concatenation. For each level the
        resulting name is the first truthy name found among the inputs,
        or ``None`` when there is none.
        """
        frames = [obj._source_data for obj in objs]

        # Relabel the later frames in place with the first frame's columns,
        # presumably so that the frames line up when concatenated.
        for frame in frames[1:]:
            frame.columns = frames[0].columns

        combined = cudf.DataFrame._concat(frames)

        level_names = [None for _ in combined.columns]
        for obj in objs:
            if obj.names is None:
                continue
            for level, name in enumerate(obj.names):
                level_names[level] = level_names[level] or name
        return cudf.MultiIndex(names=level_names, source_data=combined)
Пример #24
0
def test_dataframe_take_with_multiIndex(ntake):
    """Compare ``DataFrame.take`` on a MultiIndex-indexed frame with pandas."""
    np.random.seed(0)

    animals = ["lama", "cow", "falcon"]
    measures = ["speed", "weight", "length"]
    midx = cudf.MultiIndex(
        levels=[animals, measures],
        codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
    )
    df = DataFrame(index=midx)

    nelem = 9
    df["ii"] = np.random.randint(0, 20, nelem)
    df["ff"] = np.random.random(nelem)

    take_indices = np.random.randint(0, len(df), ntake)

    got = df.take(take_indices)
    reference = df.to_pandas().take(take_indices)

    assert_eq(got, reference)
Пример #25
0
def test_binary_features():
    """Compare ``clx.features.binary`` output with a hand-built int32 frame."""
    actual = clx.features.binary(df, "user", "computer")

    expected = cudf.DataFrame({
        "user": ["u1", "u2", "u3"],
        "c1": [1, 0, 0],
        "c2": [1, 1, 0],
        "c3": [0, 1, 1]
    }).set_index("user")

    # Cast every value column to int32.
    for column in ("c1", "c2", "c3"):
        expected[column] = expected[column].astype("int32")

    column_labels = cudf.MultiIndex(
        names=[None, "computer"],
        codes=[[0, 0, 0], [0, 1, 2]],
        levels=[["time"], ["c1", "c2", "c3"]],
    )
    expected.columns = column_labels

    assert expected.equals(actual)
Пример #26
0
def test_MI():
    """Compare the repr of a MultiIndex-indexed cudf frame with pandas.

    The frame and its transpose are each compared by ``__repr__``. The
    pandas ``display.max_rows`` option is raised only inside an
    ``option_context`` so the global setting is restored afterwards and
    does not leak into other tests.
    """
    gdf = cudf.DataFrame({
        "a": np.random.randint(0, 4, 10),
        "b": np.random.randint(0, 4, 10),
        "c": np.random.randint(0, 4, 10),
    })
    levels = [["a", "b", "c", "d"], ["w", "x", "y", "z"], ["m", "n"]]
    codes = cudf.DataFrame({
        "a": np.random.randint(0, 4, 10),
        "b": np.random.randint(0, 4, 10),
        "c": np.random.randint(0, 2, 10),
    })
    with pd.option_context("display.max_rows", 999):
        gdf = gdf.set_index(cudf.MultiIndex(levels=levels, codes=codes))
        pdf = gdf.to_pandas()
        gdfT = gdf.T
        pdfT = pdf.T
        assert gdf.__repr__() == pdf.__repr__()
        assert gdfT.__repr__() == pdfT.__repr__()
Пример #27
0
            ],
            "Symbol": [
                "AMZN",
                "AMZN",
                "AMZN",
                "MSFT",
                "MSFT",
                "MSFT",
                "NVDA",
                "NVDA",
                "NVDA",
            ],
        },
        cudf.MultiIndex(
            levels=[[1001, 1002], [2001, 2002]],
            codes=[[1, 1, 0, 0], [0, 1, 0, 1]],
            names=["col1", "col2"],
        ),
    ],
)
@pytest.mark.parametrize("deep", [True, False])
def test_multiindex_copy_deep(data, deep):
    """Test memory idendity for deep copy
        Case1: Constructed from GroupBy, StringColumns
        Case2: Constructed from MultiIndex, NumericColumns
    """
    same_ref = not deep

    if isinstance(data, dict):
        import operator
        from functools import reduce
Пример #28
0
            ],
            "Symbol": [
                "AMZN",
                "AMZN",
                "AMZN",
                "MSFT",
                "MSFT",
                "MSFT",
                "NVDA",
                "NVDA",
                "NVDA",
            ],
        },
        cudf.MultiIndex(
            levels=[[1001, 1002], [2001, 2002]],
            codes=[[1, 1, 0, 0], [0, 1, 0, 1]],
            names=["col1", "col2"],
        ),
    ],
)
@pytest.mark.parametrize("deep", [True, False])
def test_multiindex_copy_deep(data, deep):
    """Test memory identity for deep copy
    Case1: Constructed from GroupBy, StringColumns
    Case2: Constructed from MultiIndex, NumericColumns
    """
    same_ref = not deep

    if isinstance(data, dict):
        import operator
        from functools import reduce