Exemplo n.º 1
0
def test_select_by_label_multiindex():
    """
    Selecting by label from a three-level MultiIndex ColumnAccessor.

    A single top-level label keeps the two remaining levels as a
    MultiIndex; a two-element partial key leaves only the last level
    and the result is no longer a MultiIndex.
    """
    source = {
        ("a", "b", "c"): [1, 2, 3],
        ("a", "b", "e"): [2, 3, 4],
        ("b", "x", ""): [4, 5, 6],
        ("a", "d", "e"): [3, 4, 5],
    }
    ca = ColumnAccessor(source, multiindex=True)

    # Both selections are checked two times in a row against the same
    # accessor (presumably to confirm that selecting does not alter
    # ``ca`` -- TODO confirm the intent with the test author).
    for _ in range(2):
        two_level_expect = ColumnAccessor(
            {
                ("b", "c"): [1, 2, 3],
                ("b", "e"): [2, 3, 4],
                ("d", "e"): [3, 4, 5],
            },
            multiindex=True,
        )
        check_ca_equal(two_level_expect, ca.select_by_label("a"))

        flat_expect = ColumnAccessor(
            {"c": [1, 2, 3], "e": [2, 3, 4]}, multiindex=False
        )
        check_ca_equal(flat_expect, ca.select_by_label(("a", "b")))
Exemplo n.º 2
0
def test_select_by_label_simple():
    """
    Selecting one label from a flat ColumnAccessor yields a
    ColumnAccessor holding only that column.
    """
    ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4]})

    got_a = ca.select_by_label("a")
    check_ca_equal(got_a, ColumnAccessor({"a": [1, 2, 3]}))

    got_b = ca.select_by_label("b")
    check_ca_equal(got_b, ColumnAccessor({"b": [2, 3, 4]}))
Exemplo n.º 3
0
def test_all_columns(simple_data):
    """
    Every value stored in a ColumnAccessor built from ``simple_data``
    must be an instance of ``cudf.core.column.ColumnBase``.
    """
    ca = ColumnAccessor(simple_data)
    assert all(
        isinstance(value, cudf.core.column.ColumnBase)
        for value in ca.values()
    )
Exemplo n.º 4
0
def test_to_pandas_multiindex_names():
    """
    ``to_pandas_index`` on a MultiIndex ColumnAccessor with
    ``level_names`` must equal a ``pd.MultiIndex`` carrying the same
    tuples and the same names.
    """
    level_names = ("foo", "bar")
    ca = ColumnAccessor(
        {("a", "b"): [1, 2, 3], ("c", "d"): [3, 4, 5]},
        multiindex=True,
        level_names=level_names,
    )

    got = ca.to_pandas_index()
    expect = pd.MultiIndex.from_tuples(
        (("a", "b"), ("c", "d")), names=("foo", "bar")
    )
    assert_eq(got, expect)
Exemplo n.º 5
0
def test_column_size_mismatch():
    """
    Building a CA from columns whose lengths differ
    must raise ``ValueError``.
    """
    mismatched = {"a": [1], "b": [1, 2]}
    with pytest.raises(ValueError):
        ColumnAccessor(mismatched)
Exemplo n.º 6
0
def test_iter(simple_data):
    """
    Test that iterating over the CA yields exactly the column names
    of the data it was built from, in the same order.
    """
    ca = ColumnAccessor(simple_data)
    # Compare the complete key sequences rather than zipping them:
    # zip() stops at the shorter input, so a CA that yielded too few
    # (or no) names would otherwise pass without any assertion failing.
    assert list(simple_data) == list(ca)
Exemplo n.º 7
0
def test_select_by_index_empty():
    """
    Selecting no positions from a three-level MultiIndex ColumnAccessor
    (via an empty slice or an empty list) must give an empty MultiIndex
    ColumnAccessor whose three level names are all ``None``.
    """
    ca = ColumnAccessor(
        {
            ("a", "b", "c"): [1, 2, 3],
            ("a", "b", "e"): [2, 3, 4],
            ("b", "x", ""): [4, 5, 6],
            ("a", "d", "e"): [3, 4, 5],
        },
        multiindex=True,
    )
    expect = ColumnAccessor(
        {}, multiindex=True, level_names=(None, None, None)
    )

    for empty_selector in (slice(None, 0), []):
        got = ca.select_by_index(empty_selector)
        check_ca_equal(expect, got)
Exemplo n.º 8
0
def test_select_by_index_simple():
    """
    Test getting column(s) by integer position: a single position,
    a list of positions and a slice.
    """
    ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4]})

    first = ca.select_by_index(0)
    check_ca_equal(first, ColumnAccessor({"a": [1, 2, 3]}))

    second = ca.select_by_index(1)
    check_ca_equal(second, ColumnAccessor({"b": [2, 3, 4]}))

    # Selecting every position must reproduce the accessor itself.
    by_list = ca.select_by_index([0, 1])
    check_ca_equal(by_list, ca)

    by_slice = ca.select_by_index(slice(0, None))
    check_ca_equal(by_slice, ca)
Exemplo n.º 9
0
def test_replace_level_values_MultiColumn():
    """
    ``rename_levels`` with ``mapper={"a": "f"}`` on level 0 of a
    two-level MultiIndex ColumnAccessor must turn every ``"a"`` in that
    level into ``"f"`` and leave the other keys and the data as they are.
    """
    original_keys = {
        ("a", 1): [1, 2, 3],
        ("a", 2): [2, 3, 4],
        ("b", 1): [3, 4, 5],
    }
    renamed_keys = {
        ("f", 1): [1, 2, 3],
        ("f", 2): [2, 3, 4],
        ("b", 1): [3, 4, 5],
    }
    ca = ColumnAccessor(original_keys, multiindex=True)
    expect = ColumnAccessor(renamed_keys, multiindex=True)

    got = ca.rename_levels(mapper={"a": "f"}, level=0)
    check_ca_equal(expect, got)
Exemplo n.º 10
0
def test_replace_level_values_RangeIndex():
    """
    ``rename_levels`` with ``mapper={"a": "f"}`` on level 0 of a flat
    (non-MultiIndex) ColumnAccessor must rename column ``"a"`` to
    ``"f"`` and leave the other columns as they are.
    """
    ca = ColumnAccessor(
        {"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]},
        multiindex=False,
    )
    expect = ColumnAccessor(
        {"f": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]},
        multiindex=False,
    )

    got = ca.rename_levels(mapper={"a": "f"}, level=0)
    check_ca_equal(expect, got)
Exemplo n.º 11
0
def test_select_by_index_multiindex():
    """
    Test getting column(s) by integer position with MultiIndex,
    using both a slice and a list of positions.
    """
    key_abc, key_abe = ("a", "b", "c"), ("a", "b", "e")
    key_bx, key_ade = ("b", "x", ""), ("a", "d", "e")

    ca = ColumnAccessor(
        {
            key_abc: [1, 2, 3],
            key_abe: [2, 3, 4],
            key_bx: [4, 5, 6],
            key_ade: [3, 4, 5],
        },
        multiindex=True,
    )

    # Slice: the first three columns, in their stored order.
    first_three = ColumnAccessor(
        {key_abc: [1, 2, 3], key_abe: [2, 3, 4], key_bx: [4, 5, 6]},
        multiindex=True,
    )
    check_ca_equal(first_three, ca.select_by_index(slice(0, 3)))

    # List: positions 0, 1 and 3, skipping the ("b", "x", "") column.
    picked = ColumnAccessor(
        {key_abc: [1, 2, 3], key_abe: [2, 3, 4], key_ade: [3, 4, 5]},
        multiindex=True,
    )
    check_ca_equal(picked, ca.select_by_index([0, 1, 3]))
Exemplo n.º 12
0
def test_select_by_label_multiindex_slice():
    """
    Label slicing on a MultiIndex ColumnAccessor: an open slice returns
    everything, and a slice between two full keys returns the columns
    from the start key through the stop key inclusive.
    """
    # pandas needs columns to be sorted to do slicing with multiindex
    sorted_data = {
        ("a", "b", "c"): [1, 2, 3],
        ("a", "b", "e"): [2, 3, 4],
        ("a", "d", "e"): [3, 4, 5],
        ("b", "x", ""): [4, 5, 6],
    }
    ca = ColumnAccessor(sorted_data, multiindex=True)

    everything = ca.select_by_label(slice(None, None))
    check_ca_equal(ca, everything)

    start_key, stop_key = ("a", "b", "e"), ("b", "x", "")
    expect = ColumnAccessor(
        {
            ("a", "b", "e"): [2, 3, 4],
            ("a", "d", "e"): [3, 4, 5],
            ("b", "x", ""): [4, 5, 6],
        },
        multiindex=True,
    )
    got = ca.select_by_label(slice(start_key, stop_key))
    check_ca_equal(expect, got)
Exemplo n.º 13
0
def test_to_pandas_simple(simple_data):
    """
    ``to_pandas_index`` must match the columns of a pandas DataFrame
    built from the same data.
    """
    got = ColumnAccessor(simple_data).to_pandas_index()
    expect = pd.DataFrame(simple_data).columns
    assert_eq(got, expect)
Exemplo n.º 14
0
def test_by_label_list():
    """
    Selecting with a list of labels returns just those columns.
    """
    col_a, col_b, col_c = [1, 2, 3], [2, 3, 4], [3, 4, 5]
    ca = ColumnAccessor({"a": col_a, "b": col_b, "c": col_c})

    wanted = ["b", "c"]
    expect = ColumnAccessor({"b": [2, 3, 4], "c": [3, 4, 5]})
    check_ca_equal(expect, ca.select_by_label(wanted))
Exemplo n.º 15
0
def timeseries(
    start="2000-01-01",
    end="2000-01-31",
    freq="1s",
    dtypes=None,
    nulls_frequency=0,
    seed=None,
):
    """Create timeseries dataframe with random data

    Parameters
    ----------
    start : datetime (or datetime-like string)
        Start of time series
    end : datetime (or datetime-like string)
        End of time series
    freq : string
        String like '2s' or '1H' or '12W' for the time series frequency
    dtypes : dict
        Mapping of column names to types.
        Valid types include {float, int, str, 'category'}.
        If none is provided, this defaults to
        ``{"name": "category", "id": int, "x": float, "y": float}``
    nulls_frequency : float
        Fill the series with the specified proportion of nulls. Default is 0.
    seed : int (optional)
        Seed handed to ``numpy.random.RandomState``

    Returns
    -------
    The result of ``cudf.from_pandas`` on the generated pandas
    DataFrame, with a freshly drawn mask applied to every column.

    Examples
    --------
    >>> import cudf as gd
    >>> gdf = gd.datasets.timeseries()
    >>> gdf.head()  # doctest: +SKIP
              timestamp    id     name         x         y
    2000-01-01 00:00:00   967    Jerry -0.031348 -0.040633
    2000-01-01 00:00:01  1066  Michael -0.262136  0.307107
    2000-01-01 00:00:02   988    Wendy -0.526331  0.128641
    2000-01-01 00:00:03  1016   Yvonne  0.620456  0.767270
    2000-01-01 00:00:04   998   Ursula  0.684902 -0.463278
    """
    if dtypes is None:
        dtypes = {"name": "category", "id": int, "x": float, "y": float}

    timestamps = pd.DatetimeIndex(
        pd.date_range(start, end, freq=freq, name="timestamp")
    )
    # A single RandomState drives both the column data and the masks, so
    # the order of the draws below determines the output for a given seed.
    rng = np.random.RandomState(seed)

    # Generate the columns in the iteration order of ``dtypes``; each
    # generator is looked up by dtype in the module-level ``make`` table.
    generated = {}
    for col_name, dtype in dtypes.items():
        generated[col_name] = make[dtype](len(timestamps), rng)

    frame = pd.DataFrame(
        generated, index=timestamps, columns=sorted(generated)
    )
    # NOTE(review): this compares a Timestamp with ``end`` exactly as the
    # caller passed it (a string by default) -- confirm that the last row
    # is dropped in the cases where that is intended.
    if frame.index[-1] == end:
        frame = frame.iloc[:-1]

    gdf = cudf.from_pandas(frame)
    for col in gdf:
        # NOTE(review): the mask is sized from the full generated index,
        # even when the last row was dropped above.
        keep = rng.choice(
            [True, False],
            size=len(timestamps),
            p=[1 - nulls_frequency, nulls_frequency],
        )
        bitmask = bools_to_mask(cudf.core.column.as_column(keep))
        masked = gdf[col]._column.set_mask(bitmask)
        replacement = cudf.Series._from_data(
            ColumnAccessor({None: masked}), index=gdf.index
        )
        gdf[col] = replacement

    return gdf
Exemplo n.º 16
0
def test_to_pandas_multiindex(mi_data):
    """
    ``to_pandas_index`` on a MultiIndex ColumnAccessor must match the
    columns of a pandas DataFrame built from the same data.
    """
    got = ColumnAccessor(mi_data, multiindex=True).to_pandas_index()
    expect = pd.DataFrame(mi_data).columns
    assert_eq(got, expect)
Exemplo n.º 17
0
def test_select_by_label_simple_slice():
    """
    A label slice on a flat ColumnAccessor includes both endpoints:
    ``slice("b", "c")`` selects columns ``"b"`` and ``"c"``.
    """
    ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]})

    label_range = slice("b", "c")
    expect = ColumnAccessor({"b": [2, 3, 4], "c": [3, 4, 5]})
    check_ca_equal(expect, ca.select_by_label(label_range))
Exemplo n.º 18
0
    def _data(self):
        """
        Return a ColumnAccessor holding a single column:
        ``self._values`` keyed by ``self.name``.
        """
        # Imported at call time rather than at module level (presumably
        # to avoid a circular import -- TODO confirm).
        from cudf.core.column_accessor import ColumnAccessor

        single_column = {self.name: self._values}
        return ColumnAccessor(single_column)