Exemplo n.º 1
0
def test_dataframe_scatter_by_map(map_size, nelem, keep):
    """Check that ``DataFrame.scatter_by_map`` partitions rows by key.

    A frame with string, float, integer and categorical columns is
    scattered on each column in turn. Every partition must contain at most
    one distinct key, and the partitions together must account for every
    row. The checks are repeated on frames carrying a single-column index
    and a multi-column index, where the index type must also survive when
    ``keep`` is true.

    Parameters
    ----------
    map_size : int
        Number of partitions requested; also bounds the keys generated for
        each column.
    nelem : int
        Number of rows in the generated frame.
    keep : bool
        Forwarded to ``scatter_by_map`` as ``keep_index``.
    """
    strlist = ["dog", "cat", "fish", "bird", "pig", "fox", "cow", "goat"]
    np.random.seed(0)
    df = DataFrame()
    df["a"] = np.random.choice(strlist[:map_size], nelem)
    df["b"] = np.random.uniform(low=0, high=map_size, size=nelem)
    df["c"] = np.random.randint(map_size, size=nelem)
    df["d"] = df["a"].astype("category")

    def _check_scatter_by_map(parts, col):
        """Validate the partitions obtained by scattering on ``col``."""
        assert len(parts) == map_size
        name = col.name
        nrows = 0
        for i, part in enumerate(parts):
            nrows += len(part)
            if len(part) > 0:
                # Make sure the column types were preserved
                assert isinstance(part[name]._column, type(col._column))
            try:
                sr = part[name].astype(np.int32)
            except ValueError:
                # Keys that cannot be cast to int32 are compared as they are
                sr = part[name]
            n_unique = sr.nunique()
            assert n_unique <= 1
            if n_unique == 1:
                if isinstance(part[name]._column, NumericalColumn):
                    # For numerical columns the key, once cast to int32,
                    # must equal the position of the partition
                    assert sr.iloc[0] == i
        assert nrows == nelem

    for key in ("a", "b", "c", "d"):
        _check_scatter_by_map(
            df.scatter_by_map(key, map_size, keep_index=keep), df[key]
        )

    if map_size == 2 and nelem == 100:
        df.scatter_by_map("a")  # Auto-detect map_size
        with pytest.raises(ValueError):
            df.scatter_by_map("a", map_size=1, debug=True)  # Bad map_size

    # Test GenericIndex (single index column), then MultiIndex
    for index_cols in ("c", ["a", "c"]):
        indexed = df.set_index(index_cols)
        result = indexed.scatter_by_map("b", map_size, keep_index=keep)
        _check_scatter_by_map(result, indexed["b"])
        if keep:
            for frame in result:
                # The original evaluated this isinstance() without asserting
                # it, so the index type was never actually verified.
                assert isinstance(frame.index, type(indexed.index))
Exemplo n.º 2
0
def test_dataframe_scatter_by_map(map_size, nelem):
    """Scatter a mixed-dtype frame on each column and validate the pieces.

    The frame holds a string column, a float column, an integer column and
    a categorical view of the string column. Scattering on any of them must
    yield ``map_size`` pieces, each with at most one distinct key after an
    int32 cast, and the pieces together must contain ``nelem`` rows.

    Parameters
    ----------
    map_size : int
        Number of pieces requested from ``scatter_by_map``.
    nelem : int
        Number of rows in the generated frame.
    """
    animals = ["dog", "cat", "fish", "bird", "pig", "fox", "cow", "goat"]
    np.random.seed(0)
    df = DataFrame()
    # The draw order below is kept fixed so the seeded data is reproducible
    df["a"] = np.random.choice(animals[:map_size], nelem)
    df["b"] = np.random.uniform(low=0, high=map_size, size=nelem)
    df["c"] = np.random.randint(map_size, size=nelem)
    df["d"] = df["a"]._column.as_categorical_column(np.int32)

    def _check_scatter_by_map(pieces, key):
        """Assert piece count, per-piece key uniqueness and total rows."""
        assert len(pieces) == map_size
        total_rows = 0
        for piece in pieces:
            total_rows += len(piece)
            distinct_keys = piece[key].astype(np.int32).nunique()
            assert distinct_keys <= 1
        assert total_rows == nelem

    for key in ("a", "b", "c", "d"):
        _check_scatter_by_map(df.scatter_by_map(key, map_size), key)

    runs_extra_checks = map_size == 2 and nelem == 100
    if runs_extra_checks:
        # Auto-detect map_size
        df.scatter_by_map("a")
        # Bad map_size
        with pytest.raises(ValueError):
            df.scatter_by_map("a", 1)