Пример #1
0
def test_cross_join() -> None:
    """Cross-join a single-row frame against 100-row and 101-row frames.

    The first 100 rows of the 101-row result must equal the full 100-row
    result.
    """
    # triggers > 100 rows implementation
    # https://github.com/pola-rs/polars/blob/5f5acb2a523ce01bc710768b396762b8e69a9e07/polars/polars-core/src/frame/cross_join.rs#L34
    single_row = pl.DataFrame({"col1": ["a"], "col2": ["d"]})

    hundred_rows = pl.DataFrame({"frame2": pl.arange(0, 100, eager=True)})
    baseline = hundred_rows.join(single_row, how="cross")

    hundred_one_rows = pl.DataFrame({"frame2": pl.arange(0, 101, eager=True)})
    joined = hundred_one_rows.join(single_row, how="cross")

    assert joined.slice(0, 100).frame_equal(baseline)
Пример #2
0
def test_arange_expr() -> None:
    """Check `pl.arange` used as an expression and in eager mode."""
    df = pl.DataFrame({"a": ["foobar", "barfoo"]})
    # The upper bound is itself an expression: count of "a" (2 rows) times 10.
    out = df[[pl.arange(0, pl.col("a").count() * 10)]]
    assert out.shape == (20, 1)
    assert out.select_at_idx(0)[-1] == 19

    # eager arange
    out = pl.arange(0, 10, 2, eager=True)
    # Compare as a plain list: `Series == list` is an element-wise comparison
    # in polars, so asserting on it directly does not pin the values.
    # arange(0, 10, 2) yields 0, 2, 4, 6, 8.
    assert out.to_list() == [0, 2, 4, 6, 8]
Пример #3
0
def test_arange_no_rows() -> None:
    """Check `arange(0, count()).over("x")` on a populated and an empty frame.

    Each case pairs the input "x" column with the expected "literal" column
    produced by the windowed range.
    """
    cases = [
        ([5, 5, 4, 4, 2, 2], [0, 1, 0, 1, 0, 1]),
        ([], []),
    ]
    for x_values, expected_literal in cases:
        frame = pl.DataFrame(dict(x=x_values))
        result = frame.with_column(
            pl.arange(0, pl.count()).over("x")  # type: ignore[union-attr]
        )
        wanted = pl.DataFrame({"x": x_values, "literal": expected_literal})
        assert result.frame_equal(wanted)
Пример #4
0
def test_arange_expr() -> None:
    """Check `pl.arange` as an expression, eagerly, and with Series bounds."""
    df = pl.DataFrame({"a": ["foobar", "barfoo"]})
    # The upper bound is itself an expression: count of "a" (2 rows) times 10.
    out = df[[pl.arange(0, pl.col("a").count() * 10)]]
    assert out.shape == (20, 1)
    assert out.select_at_idx(0)[-1] == 19

    # eager arange
    out2 = pl.arange(0, 10, 2, eager=True)
    # Compare as a plain list: `Series == list` is an element-wise comparison
    # in polars, so asserting on it directly does not pin the values.
    # arange(0, 10, 2) yields 0, 2, 4, 6, 8.
    assert out2.to_list() == [0, 2, 4, 6, 8]

    # Series bounds produce one range per row, collected into a List column.
    out3 = pl.arange(pl.Series([0, 19]), pl.Series([3, 39]), step=2, eager=True)
    assert out3.dtype == pl.List
    assert out3[0].to_list() == [0, 2]
Пример #5
0
def test_rolling_kernels_and_groupby_rolling() -> None:
    """Compare `rolling_*` expressions against `groupby_rolling` aggregations.

    For every period / closed combination, the sum, var, mean and std computed
    by the `rolling_<stat>(..., by="dt")` expressions must equal the same
    statistics aggregated through `groupby_rolling("dt", ...)`.
    """
    timestamps = [datetime(2021, 1, day) for day in (1, 2, 4, 5, 7)]
    df = pl.DataFrame({"dt": timestamps, "values": pl.arange(0, 5, eager=True)})

    # Statistic names double as output column aliases and method suffixes.
    stats = ["sum", "var", "mean", "std"]

    for period in ["1d", "2d", "3d"]:
        for closed in ["left", "right", "none", "both"]:
            rolling_exprs = [
                getattr(pl.col("values"), f"rolling_{stat}")(
                    period, by="dt", closed=closed
                ).alias(stat)
                for stat in stats
            ]
            out1 = df.select([pl.col("dt"), *rolling_exprs])

            agg_exprs = [
                getattr(pl.col("values"), stat)().alias(stat) for stat in stats
            ]
            windows = df.groupby_rolling("dt", period=period, closed=closed)
            out2 = windows.agg(agg_exprs)

            pl.testing.assert_frame_equal(out1, out2)
Пример #6
0
def test_agg_after_head() -> None:
    """Check `head(3).sum()` per group, with and without `maintain_order`.

    Column "b" counts 1..9, so the sums of the first three values per group
    are 1+2+3, 4+5 and 6+7+8.
    """
    a = [1, 1, 1, 2, 2, 3, 3, 3, 3]

    df = pl.DataFrame({"a": a, "b": pl.arange(1, len(a) + 1, eager=True)})

    expected = pl.DataFrame({"a": [1, 2, 3], "b": [6, 9, 21]})

    for maintain_order in [True, False]:
        # Pass the loop variable through so both code paths are exercised.
        out = df.groupby("a", maintain_order=maintain_order).agg(
            [pl.col("b").head(3).sum()]
        )

        # Without maintain_order the group order is not fixed; sort to compare.
        if not maintain_order:
            out = out.sort("a")

        assert out.frame_equal(expected)
Пример #7
0
def explode_correct_for_slice() -> None:
    """Check that `explode` on a sliced frame only expands the sliced rows.

    NOTE(review): the name has no `test_` prefix, so a runner that collects
    tests by that prefix would skip this function — confirm whether intended.
    """
    nested = pl.DataFrame({"b": [[1, 1], [2, 2], [3, 3], [4, 4]]})
    last_two = nested.slice(2, 2).explode(["b"])
    assert last_two["b"].to_list() == [3, 3, 4, 4]

    # Cross-join five groups with five list rows, then number the sorted rows.
    groups = pl.DataFrame({"group": pl.arange(0, 5, eager=True)})
    lists = pl.DataFrame({"b": [[1, 2, 3], [2, 3], [4], [1, 2, 3], [0]]})
    crossed = groups.join(lists, how="cross")
    df = crossed.sort("group").with_row_count()

    expected = pl.DataFrame(
        {
            "row_nr": [0, 0, 0, 1, 1, 2, 3, 3, 3, 4, 5, 5, 5, 6, 6, 7, 8, 8, 8, 9],
            "group": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
            "b": [1, 2, 3, 2, 3, 4, 1, 2, 3, 0, 1, 2, 3, 2, 3, 4, 1, 2, 3, 0],
        }
    )
    exploded = df.slice(0, 10).explode(["b"])
    assert exploded.frame_equal(expected)
Пример #8
0
def test_groupby_rolling_by_() -> None:
    """Check `groupby_rolling` with a `by` column on differently sorted input.

    The rolling count over a 3-day period per group must be the same whether
    the frame is sorted by "datetime" only or by ["group", "datetime"].
    """
    groups = pl.DataFrame({"group": pl.arange(0, 3, eager=True)})
    days = pl.DataFrame(
        {
            "datetime": pl.date_range(
                datetime(2020, 1, 1), datetime(2020, 1, 5), "1d"
            ),
        }
    )
    df = groups.join(days, how="cross")

    def rolling_count(frame: "pl.DataFrame") -> "pl.DataFrame":
        # Same rolling aggregation applied to both sort orders.
        windows = frame.groupby_rolling(
            index_column="datetime", by="group", period="3d"
        )
        return windows.agg([pl.count().alias("count")])

    out = rolling_count(df.sort("datetime"))
    expected = rolling_count(df.sort(["group", "datetime"]))
    assert out.sort(["group", "datetime"]).frame_equal(expected)

    # Three groups, each covering 2020-01-01 .. 2020-01-05.
    assert out.to_dict(False) == {
        "group": [group for group in range(3) for _ in range(5)],
        "datetime": [
            datetime(2020, 1, day, 0, 0) for _ in range(3) for day in range(1, 6)
        ],
        "count": [1, 2, 3, 3, 3] * 3,
    }
Пример #9
0
def arange(
    low: Union[int, "pl.Expr", "pl.Series"],
    high: Union[int, "pl.Expr", "pl.Series"],
    step: int = 1,
    dtype: Optional[Type[DataType]] = None,
    eager: bool = False,
) -> Union["pl.Expr", "pl.Series"]:
    """
    Create a range expression.

    This can be used in a `select`, `with_column` etc. Be sure that the range
    size is equal to the DataFrame you are collecting.

    Parameters
    ----------
    low
        Lower bound of range.
    high
        Upper bound of range.
    step
        Step size of the range.
    dtype
        Deprecated, cast later. This argument is not used by the function body.
    eager
        If eager evaluation is `True`, a Series is returned instead of an Expr.

    Examples
    --------
    >>> (
    ...     df.lazy()
    ...     .filter(pl.col("foo") < pl.arange(0, 100))
    ...     .collect()
    ... )
    """
    low_expr = pl.lazy.expr_to_lit_or_expr(low, str_to_lit=False)
    high_expr = pl.lazy.expr_to_lit_or_expr(high, str_to_lit=False)

    if not eager:
        return pl.wrap_expr(pyarange(low_expr._pyexpr, high_expr._pyexpr, step))

    # Eager mode: evaluate the lazy expression against a one-row helper frame
    # and pull the resulting column out by its alias.
    helper = pl.DataFrame({"a": [1]})
    named_range = pl.arange(low_expr, high_expr, step).alias("arange")  # type: ignore
    return helper.select(named_range)["arange"]  # type: ignore
Пример #10
0
def test_arange_expr():
    """Check `pl.arange` with an expression as its upper bound."""
    frame = pl.DataFrame({"a": ["foobar", "barfoo"]})
    # Upper bound: count of "a" (2 rows) times 10.
    upper = pl.col("a").count() * 10
    result = frame[[pl.arange(0, upper)]]
    assert result.shape == (20, 1)

    first_column = result[0]
    assert first_column[-1] == 19
Пример #11
0
def test_arange() -> None:
    """Filter a lazy frame by comparing column "a" against `arange(0, 3)`."""
    lazy_frame = pl.DataFrame({"a": [1, 1, 1]}).lazy()
    # Row-wise comparison of 1 against 0, 1, 2 keeps the first two rows.
    keep = pl.col("a") >= pl.arange(0, 3)
    result = lazy_frame.filter(keep).collect()
    assert result.frame_equal(pl.DataFrame({"a": [1, 1]}))