Exemplo n.º 1
0
def test_dataframe_join_suffix():
    np.random.seed(0)

    df = DataFrame()
    for k in "abc":
        df[k] = np.random.randint(0, 5, 5)

    left = df.set_index("a")
    right = df.set_index("c")
    with pytest.raises(ValueError) as raises:
        left.join(right)
    raises.match(
        "there are overlapping columns but lsuffix"
        " and rsuffix are not defined"
    )

    got = left.join(right, lsuffix="_left", rsuffix="_right", sort=True)
    # Get expected value
    pddf = df.to_pandas()
    expect = pddf.set_index("a").join(
        pddf.set_index("c"), lsuffix="_left", rsuffix="_right"
    )
    # Check
    assert list(expect.columns) == list(got.columns)
    assert_eq(expect.index.values, got.index.values)
    for k in expect.columns:
        _check_series(expect[k].fillna(-1), got[k].fillna(-1))
Exemplo n.º 2
0
def test_dataframe_join_cats():
    lhs = DataFrame()
    lhs["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    lhs["b"] = bb = np.arange(len(lhs))
    lhs = lhs.set_index("a")

    rhs = DataFrame()
    rhs["a"] = pd.Categorical(list("abcac"), categories=list("abc"))
    rhs["c"] = cc = np.arange(len(rhs))
    rhs = rhs.set_index("a")

    got = lhs.join(rhs)
    expect = lhs.to_pandas().join(rhs.to_pandas())

    # Note: pandas make a object Index after joining
    pd.util.testing.assert_frame_equal(
        got.sort_values(by="b")
        .to_pandas()
        .sort_index()
        .reset_index(drop=True),
        expect.reset_index(drop=True),
    )

    # Just do some rough checking here.
    assert list(got.columns) == ["b", "c"]
    assert len(got) > 0
    assert set(got.index.to_pandas()) & set("abc")
    assert set(got["b"]) & set(bb)
    assert set(got["c"]) & set(cc)
Exemplo n.º 3
0
def test_typecast_on_join_indexes_matching_categorical():
    join_data_l = Series(["a", "b", "c", "d", "e"], dtype="category")
    join_data_r = Series(["a", "b", "c", "d", "e"], dtype="str")
    other_data = [1, 2, 3, 4, 5]

    gdf_l = DataFrame({"join_col": join_data_l, "B": other_data})
    gdf_r = DataFrame({"join_col": join_data_r, "B": other_data})

    gdf_l = gdf_l.set_index("join_col")
    gdf_r = gdf_r.set_index("join_col")

    exp_join_data = ["a", "b", "c", "d", "e"]
    exp_other_data = [1, 2, 3, 4, 5]

    expect = DataFrame(
        {
            "join_col": exp_join_data,
            "B_x": exp_other_data,
            "B_y": exp_other_data,
        }
    )
    expect = expect.set_index("join_col")
    got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y")

    assert_eq(expect, got)
Exemplo n.º 4
0
def test_dataframe_scatter_by_map(map_size, nelem, keep):

    strlist = ["dog", "cat", "fish", "bird", "pig", "fox", "cow", "goat"]
    np.random.seed(0)
    df = DataFrame()
    df["a"] = np.random.choice(strlist[:map_size], nelem)
    df["b"] = np.random.uniform(low=0, high=map_size, size=nelem)
    df["c"] = np.random.randint(map_size, size=nelem)
    df["d"] = df["a"].astype("category")

    def _check_scatter_by_map(dfs, col):
        assert len(dfs) == map_size
        nrows = 0
        # print(col._column)
        name = col.name
        for i, df in enumerate(dfs):
            nrows += len(df)
            if len(df) > 0:
                # Make sure the column types were preserved
                assert isinstance(df[name]._column, type(col._column))
            try:
                sr = df[name].astype(np.int32)
            except ValueError:
                sr = df[name]
            assert sr.nunique() <= 1
            if sr.nunique() == 1:
                if isinstance(df[name]._column, NumericalColumn):
                    assert sr.iloc[0] == i
        assert nrows == nelem

    _check_scatter_by_map(df.scatter_by_map("a", map_size, keep_index=keep),
                          df["a"])
    _check_scatter_by_map(df.scatter_by_map("b", map_size, keep_index=keep),
                          df["b"])
    _check_scatter_by_map(df.scatter_by_map("c", map_size, keep_index=keep),
                          df["c"])
    _check_scatter_by_map(df.scatter_by_map("d", map_size, keep_index=keep),
                          df["d"])

    if map_size == 2 and nelem == 100:
        df.scatter_by_map("a")  # Auto-detect map_size
        with pytest.raises(ValueError):
            df.scatter_by_map("a", map_size=1, debug=True)  # Bad map_size

    # Test GenericIndex
    df2 = df.set_index("c")
    generic_result = df2.scatter_by_map("b", map_size, keep_index=keep)
    _check_scatter_by_map(generic_result, df2["b"])
    if keep:
        for frame in generic_result:
            isinstance(frame.index, type(df2.index))

    # Test MultiIndex
    df2 = df.set_index(["a", "c"])
    multiindex_result = df2.scatter_by_map("b", map_size, keep_index=keep)
    _check_scatter_by_map(multiindex_result, df2["b"])
    if keep:
        for frame in multiindex_result:
            isinstance(frame.index, type(df2.index))
Exemplo n.º 5
0
def test_merge_multi(kwargs):

    left = DataFrame(
        {
            "a": [1, 2, 3, 4, 3, 5, 6],
            "b": [1, 3, 5, 7, 5, 9, 0],
            "c": ["o", "p", "q", "r", "s", "t", "u"],
            "d": ["v", "w", "x", "y", "z", "1", "2"],
        }
    )
    right = DataFrame(
        {
            "a": [0, 9, 3, 4, 3, 7, 8],
            "b": [2, 4, 5, 7, 5, 6, 8],
            "c": ["a", "b", "c", "d", "e", "f", "g"],
            "d": ["j", "i", "j", "k", "l", "m", "n"],
        }
    )

    if (
        kwargs["left_on"] is not None
        and kwargs["right_on"] is not None
        and kwargs["left_index"] is False
        and kwargs["right_index"] is False
    ):
        left = left.set_index(["c", "d"])
        right = right.set_index(["c", "d"])
    elif (
        kwargs["left_on"] is None
        and kwargs["right_on"] is None
        and kwargs["left_index"] is True
        and kwargs["right_index"] is True
    ):
        left = left.set_index(["a", "b"])
        right = right.set_index(["a", "b"])
    elif kwargs["left_on"] is not None and kwargs["right_index"] is True:
        left = left.set_index(["c", "d"])
        right = right.set_index(["a", "b"])
    elif kwargs["right_on"] is not None and kwargs["left_index"] is True:
        left = left.set_index(["a", "b"])
        right = right.set_index(["c", "d"])

    gleft = left.to_pandas()
    gright = right.to_pandas()

    kwargs["sort"] = True
    expect = gleft.merge(gright, **kwargs)
    got = left.merge(right, **kwargs)

    assert_eq(expect.sort_index().index, got.sort_index().index)

    expect.index = range(len(expect))
    got.index = range(len(got))
    expect = expect.sort_values(list(expect.columns))
    got = got.sort_values(list(got.columns))
    expect.index = range(len(expect))
    got.index = range(len(got))

    assert_eq(expect, got)
Exemplo n.º 6
0
def test_typecast_on_join_multiindices():
    join_data_l_0 = Series([1, 2, 3, 4, 5], dtype="int8")
    join_data_l_1 = Series([2, 3, 4.1, 5.9, 6], dtype="float32")
    join_data_l_2 = Series([7, 8, 9, 0, 1], dtype="float32")

    join_data_r_0 = Series([1, 2, 3, 4, 5], dtype="int32")
    join_data_r_1 = Series([2, 3, 4, 5, 6], dtype="int32")
    join_data_r_2 = Series([7, 8, 9, 0, 0], dtype="float64")

    other_data = ["a", "b", "c", "d", "e"]

    gdf_l = DataFrame(
        {
            "join_col_0": join_data_l_0,
            "join_col_1": join_data_l_1,
            "join_col_2": join_data_l_2,
            "B": other_data,
        }
    )
    gdf_r = DataFrame(
        {
            "join_col_0": join_data_r_0,
            "join_col_1": join_data_r_1,
            "join_col_2": join_data_r_2,
            "B": other_data,
        }
    )

    gdf_l = gdf_l.set_index(["join_col_0", "join_col_1", "join_col_2"])
    gdf_r = gdf_r.set_index(["join_col_0", "join_col_1", "join_col_2"])

    exp_join_data_0 = Series([1, 2], dtype="int32")
    exp_join_data_1 = Series([2, 3], dtype="float64")
    exp_join_data_2 = Series([7, 8], dtype="float64")
    exp_other_data = Series(["a", "b"])

    expect = DataFrame(
        {
            "join_col_0": exp_join_data_0,
            "join_col_1": exp_join_data_1,
            "join_col_2": exp_join_data_2,
            "B_x": exp_other_data,
            "B_y": exp_other_data,
        }
    )
    expect = expect.set_index(["join_col_0", "join_col_1", "join_col_2"])
    got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y")

    assert_eq(expect, got)
Exemplo n.º 7
0
def test_categorical_index():
    pdf = pd.DataFrame()
    pdf["a"] = [1, 2, 3]
    pdf["index"] = pd.Categorical(["a", "b", "c"])
    initial_df = DataFrame.from_pandas(pdf)
    pdf = pdf.set_index("index")
    gdf1 = DataFrame.from_pandas(pdf)
    gdf2 = DataFrame()
    gdf2["a"] = [1, 2, 3]
    gdf2["index"] = pd.Categorical(["a", "b", "c"])
    assert_eq(initial_df.index, gdf2.index)
    gdf2 = gdf2.set_index("index")

    assert isinstance(gdf1.index, CategoricalIndex)
    assert_eq(pdf, gdf1)
    assert_eq(pdf.index, gdf1.index)
    assert_eq(
        pdf.index.codes,
        gdf1.index.codes.astype(pdf.index.codes.dtype).to_array(),
    )

    assert isinstance(gdf2.index, CategoricalIndex)
    assert_eq(pdf, gdf2)
    assert_eq(pdf.index, gdf2.index)
    assert_eq(
        pdf.index.codes,
        gdf2.index.codes.astype(pdf.index.codes.dtype).to_array(),
    )
Exemplo n.º 8
0
def test_factorize_series_index():
    df = DataFrame()
    df["col1"] = ["C", "H", "C", "W", "W", "W", "W", "W", "C", "W"]
    df["col2"] = [
        2992443.0,
        2992447.0,
        2992466.0,
        2992440.0,
        2992441.0,
        2992442.0,
        2992444.0,
        2992445.0,
        2992446.0,
        2992448.0,
    ]
    assert_eq(df.col1.factorize()[0].get(), df.to_pandas().col1.factorize()[0])
    assert_eq(
        df.col1.factorize()[1].to_pandas().values,
        df.to_pandas().col1.factorize()[1].values,
    )

    df = df.set_index("col2")

    assert_eq(df.col1.factorize()[0].get(), df.to_pandas().col1.factorize()[0])
    assert_eq(
        df.col1.factorize()[1].to_pandas().values,
        df.to_pandas().col1.factorize()[1].values,
    )
Exemplo n.º 9
0
def test_df_cat_sort_index():
    df = DataFrame()
    df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    df["b"] = np.arange(len(df))

    got = df.set_index("a").sort_index()
    expect = df.to_pandas().set_index("a").sort_index()

    assert_eq(got, expect)
Exemplo n.º 10
0
def test_df_cat_set_index():
    df = DataFrame()
    df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    df["b"] = np.arange(len(df))
    got = df.set_index("a")

    pddf = df.to_pandas(nullable_pd_dtype=False)
    expect = pddf.set_index("a")

    assert_eq(got, expect)
Exemplo n.º 11
0
def test_df_cat_sort_index():
    df = DataFrame()
    df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    df["b"] = np.arange(len(df))

    got = df.set_index("a").sort_index()
    expect = df.to_pandas().set_index("a").sort_index()

    assert list(expect.columns) == list(got.columns)
    assert list(expect.index.values) == list(got.index.values)
    np.testing.assert_array_equal(expect.index.values, got.index.values)
    np.testing.assert_array_equal(expect["b"].values, got["b"].to_array())
Exemplo n.º 12
0
def test_df_set_index_from_series():
    df = DataFrame()
    df["a"] = list(range(10))
    df["b"] = list(range(0, 20, 2))

    # Check set_index(Series)
    df2 = df.set_index(df["b"])
    assert list(df2.columns) == ["a", "b"]
    sliced_strided = df2.loc[2:6]
    print(sliced_strided)
    assert len(sliced_strided) == 3
    assert list(sliced_strided.index.values) == [2, 4, 6]
Exemplo n.º 13
0
def test_index_join_exception_cases():
    l_df = DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]})
    r_df = DataFrame({"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]})

    # Join between two MultiIndex
    lhs = ["a", "b"]
    rhs = ["a", "c"]
    level = "a"
    how = "outer"
    g_lhs = l_df.set_index(lhs).index
    g_rhs = r_df.set_index(rhs).index

    with pytest.raises(TypeError):
        g_lhs.join(g_rhs, level=level, how=how)

    # Improper level value, level should be an int or scalar value
    level = ["a"]
    rhs = ["a"]
    g_lhs = l_df.set_index(lhs).index
    g_rhs = r_df.set_index(rhs).index
    with pytest.raises(ValueError):
        g_lhs.join(g_rhs, level=level, how=how)
Exemplo n.º 14
0
def test_df_set_index_from_name():
    df = DataFrame()
    df["a"] = list(range(10))
    df["b"] = list(range(0, 20, 2))

    # Check set_index(column_name)
    df2 = df.set_index("b")
    print(df2)
    # 1 less column because 'b' is used as index
    assert list(df2.columns) == ["a"]
    sliced_strided = df2.loc[2:6]
    print(sliced_strided)
    assert len(sliced_strided) == 3
    assert list(sliced_strided.index.values) == [2, 4, 6]
Exemplo n.º 15
0
def test_dataframe_sort_values_ignore_index(index, ignore_index):
    gdf = DataFrame({
        "a": [1, 3, 5, 2, 4],
        "b": [1, 1, 2, 2, 3],
        "c": [9, 7, 7, 7, 1]
    })
    gdf = gdf.set_index(index)

    pdf = gdf.to_pandas()

    expect = pdf.sort_values(list(pdf.columns), ignore_index=ignore_index)
    got = gdf.sort_values((gdf.columns), ignore_index=ignore_index)

    assert_eq(expect, got)
Exemplo n.º 16
0
def test_factorize_index_obj(ncats, nelem):
    df = DataFrame()
    np.random.seed(0)

    # initialize data frame
    df["cats"] = arr = np.random.randint(2, size=10, dtype=np.int32)
    df = df.set_index("cats")

    uvals, labels = df.index.factorize()
    np.testing.assert_array_equal(labels.values.get(), sorted(set(arr)))
    assert isinstance(uvals, cp.core.core.ndarray)
    assert isinstance(labels, Index)

    encoder = dict((labels[idx], idx) for idx in range(len(labels)))
    handcoded = [encoder[v] for v in arr]
    np.testing.assert_array_equal(uvals.get(), handcoded)