def test_cross_join() -> None:
    """Cross-joining 101 rows and keeping the first 100 must match a 100-row cross join."""
    # triggers > 100 rows implementation
    # https://github.com/pola-rs/polars/blob/5f5acb2a523ce01bc710768b396762b8e69a9e07/polars/polars-core/src/frame/cross_join.rs#L34
    single_row = pl.DataFrame({"col1": ["a"], "col2": ["d"]})

    # Reference result: 100 rows on the left side.
    hundred_rows = pl.DataFrame({"frame2": pl.arange(0, 100, eager=True)})
    baseline = hundred_rows.join(single_row, how="cross")

    # One row more on the left side; the first 100 output rows must equal the reference.
    hundred_one_rows = pl.DataFrame({"frame2": pl.arange(0, 101, eager=True)})
    joined = hundred_one_rows.join(single_row, how="cross")
    assert joined.slice(0, 100).frame_equal(baseline)
def test_arange_expr() -> None:
    """Check `pl.arange` both as an expression and in eager mode."""
    df = pl.DataFrame({"a": ["foobar", "barfoo"]})

    # Expression mode: the upper bound is itself an expression (2 rows * 10 = 20).
    out = df[[pl.arange(0, pl.col("a").count() * 10)]]
    assert out.shape == (20, 1)
    assert out.select_at_idx(0)[-1] == 19

    # eager arange
    eager_out = pl.arange(0, 10, 2, eager=True)
    # Compare as a plain Python list so the assertion checks every element;
    # a range from 0 to 10 (exclusive) in steps of 2 is 0, 2, 4, 6, 8.
    assert eager_out.to_list() == [0, 2, 4, 6, 8]
def test_arange_no_rows() -> None:
    """A windowed `arange(0, count())` must work on populated and on empty frames."""

    def with_group_index(frame):  # type: ignore[no-untyped-def]
        # Adds a per-"x"-group running index column.
        return frame.with_column(pl.arange(0, pl.count()).over("x"))  # type: ignore[union-attr]

    populated = pl.DataFrame({"x": [5, 5, 4, 4, 2, 2]})
    want_populated = pl.DataFrame(
        {"x": [5, 5, 4, 4, 2, 2], "literal": [0, 1, 0, 1, 0, 1]}
    )
    assert with_group_index(populated).frame_equal(want_populated)

    # Same expression on a frame without rows.
    empty = pl.DataFrame({"x": []})
    want_empty = pl.DataFrame({"x": [], "literal": []})
    assert with_group_index(empty).frame_equal(want_empty)
def test_arange_expr() -> None:
    """Check `pl.arange` as an expression, in eager mode, and with Series bounds."""
    df = pl.DataFrame({"a": ["foobar", "barfoo"]})

    # Expression mode: the upper bound is itself an expression (2 rows * 10 = 20).
    out = df[[pl.arange(0, pl.col("a").count() * 10)]]
    assert out.shape == (20, 1)
    assert out.select_at_idx(0)[-1] == 19

    # eager arange
    out2 = pl.arange(0, 10, 2, eager=True)
    # Compare as a plain Python list so the assertion checks every element;
    # a range from 0 to 10 (exclusive) in steps of 2 is 0, 2, 4, 6, 8.
    assert out2.to_list() == [0, 2, 4, 6, 8]

    # Series bounds: one range per (low, high) pair, returned as a List column.
    out3 = pl.arange(pl.Series([0, 19]), pl.Series([3, 39]), step=2, eager=True)
    assert out3.dtype == pl.List
    assert out3[0].to_list() == [0, 2]
def test_rolling_kernels_and_groupby_rolling() -> None:
    """The `rolling_*` expressions must agree with `groupby_rolling` aggregations.

    Every combination of the periods 1d/2d/3d and the four `closed` settings
    is evaluated both ways and the resulting frames are compared.
    """
    # Irregularly spaced days in January 2021.
    timestamps = [datetime(2021, 1, day) for day in (1, 2, 4, 5, 7)]
    df = pl.DataFrame({"dt": timestamps, "values": pl.arange(0, 5, eager=True)})

    for period in ["1d", "2d", "3d"]:
        for closed in ["left", "right", "none", "both"]:
            # Route 1: rolling expressions keyed on the "dt" column.
            rolling_exprs = [
                pl.col("dt"),
                pl.col("values")
                .rolling_sum(period, by="dt", closed=closed)
                .alias("sum"),
                pl.col("values")
                .rolling_var(period, by="dt", closed=closed)
                .alias("var"),
                pl.col("values")
                .rolling_mean(period, by="dt", closed=closed)
                .alias("mean"),
                pl.col("values")
                .rolling_std(period, by="dt", closed=closed)
                .alias("std"),
            ]
            via_expressions = df.select(rolling_exprs)

            # Route 2: a rolling groupby with plain aggregations.
            aggregations = [
                pl.col("values").sum().alias("sum"),
                pl.col("values").var().alias("var"),
                pl.col("values").mean().alias("mean"),
                pl.col("values").std().alias("std"),
            ]
            windows = df.groupby_rolling("dt", period=period, closed=closed)
            via_groupby = windows.agg(aggregations)

            pl.testing.assert_frame_equal(via_expressions, via_groupby)
def test_agg_after_head() -> None:
    """Summing the first three values per group must work with and without order.

    Groups of "a" hold b = [1, 2, 3], [4, 5] and [6, 7, 8, 9]; taking `head(3)`
    before `sum()` gives 6, 9 and 21.
    """
    a = [1, 1, 1, 2, 2, 3, 3, 3, 3]
    df = pl.DataFrame({"a": a, "b": pl.arange(1, len(a) + 1, eager=True)})
    expected = pl.DataFrame({"a": [1, 2, 3], "b": [6, 9, 21]})

    for maintain_order in [True, False]:
        # Pass the loop variable through so both code paths are exercised.
        out = df.groupby("a", maintain_order=maintain_order).agg(
            [pl.col("b").head(3).sum()]
        )

        # Without order preservation the group order is unspecified, so
        # normalise it before comparing.
        if not maintain_order:
            out = out.sort("a")

        assert out.frame_equal(expected)
def explode_correct_for_slice() -> None:
    """Exploding a list column on a sliced frame must only cover the sliced rows."""
    # NOTE(review): the name has no `test_` prefix, so default pytest discovery
    # presumably does not collect this function - confirm whether that is intended.

    # Simple case: slice the last two rows, then explode.
    nested = pl.DataFrame({"b": [[1, 1], [2, 2], [3, 3], [4, 4]]})
    tail = nested.slice(2, 2).explode(["b"])
    assert tail["b"].to_list() == [3, 3, 4, 4]

    # Larger case: 5 groups x 5 lists, ordered by group and numbered by row.
    groups = pl.DataFrame({"group": pl.arange(0, 5, eager=True)})
    lists = pl.DataFrame(
        {
            "b": [[1, 2, 3], [2, 3], [4], [1, 2, 3], [0]],
        }
    )
    df = groups.join(lists, how="cross").sort("group").with_row_count()

    expected = pl.DataFrame(
        {
            "row_nr": [0, 0, 0, 1, 1, 2, 3, 3, 3, 4, 5, 5, 5, 6, 6, 7, 8, 8, 8, 9],
            "group": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
            "b": [1, 2, 3, 2, 3, 4, 1, 2, 3, 0, 1, 2, 3, 2, 3, 4, 1, 2, 3, 0],
        }
    )

    # Only the first ten rows (groups 0 and 1) are exploded.
    first_ten = df.slice(0, 10)
    assert first_ten.explode(["b"]).frame_equal(expected)
def test_groupby_rolling_by_() -> None:
    """`groupby_rolling` with `by` must give the same counts regardless of input sort.

    Three groups are crossed with five consecutive days; a 3-day rolling count
    is computed once on data sorted by datetime only and once on data sorted by
    group and datetime.
    """

    def rolling_count(frame):  # type: ignore[no-untyped-def]
        # 3-day rolling row count per group.
        return frame.groupby_rolling(
            index_column="datetime", by="group", period="3d"
        ).agg([pl.count().alias("count")])

    groups = pl.DataFrame({"group": pl.arange(0, 3, eager=True)})
    days = pl.DataFrame(
        {
            "datetime": pl.date_range(
                datetime(2020, 1, 1), datetime(2020, 1, 5), "1d"
            ),
        }
    )
    df = groups.join(days, how="cross")

    out = rolling_count(df.sort("datetime"))
    expected = rolling_count(df.sort(["group", "datetime"]))
    assert out.sort(["group", "datetime"]).frame_equal(expected)

    # Each of the three groups sees the same five days and the same counts.
    assert out.to_dict(False) == {
        "group": [group for group in range(3) for _ in range(5)],
        "datetime": [
            datetime(2020, 1, day, 0, 0) for _ in range(3) for day in range(1, 6)
        ],
        "count": [1, 2, 3, 3, 3] * 3,
    }
def arange(
    low: Union[int, "pl.Expr", "pl.Series"],
    high: Union[int, "pl.Expr", "pl.Series"],
    step: int = 1,
    dtype: Optional[Type[DataType]] = None,
    eager: bool = False,
) -> Union["pl.Expr", "pl.Series"]:
    """
    Create a range expression.

    The result can be used in a `select`, `with_column` etc. Make sure the size
    of the range matches the DataFrame you are collecting.

    Parameters
    ----------
    low
        Lower bound of range.
    high
        Upper bound of range.
    step
        Step size of the range
    dtype
        deprecated, cast later (not used by this function)
    eager
        If eager evaluation is `True`, a Series is returned instead of an Expr

    Examples
    --------
    >>> (df.lazy()
        .filter(pl.col("foo") < pl.arange(0, 100))
        .collect())
    """
    low_expr = pl.lazy.expr_to_lit_or_expr(low, str_to_lit=False)
    high_expr = pl.lazy.expr_to_lit_or_expr(high, str_to_lit=False)

    if not eager:
        return pl.wrap_expr(pyarange(low_expr._pyexpr, high_expr._pyexpr, step))

    # Eager mode: build the lazy expression and evaluate it against a one-row
    # helper frame, then pull the resulting column out as a Series.
    helper = pl.DataFrame({"a": [1]})
    range_expr = pl.arange(low_expr, high_expr, step)
    evaluated = helper.select(range_expr.alias("arange"))  # type: ignore
    return evaluated["arange"]  # type: ignore
def test_arange_expr():
    """`pl.arange` must accept an expression as its upper bound."""
    df = pl.DataFrame({"a": ["foobar", "barfoo"]})

    # Two rows, so the upper bound evaluates to 2 * 10 = 20.
    upper = pl.col("a").count() * 10
    out = df[[pl.arange(0, upper)]]

    assert out.shape == (20, 1)
    first_column = out[0]
    assert first_column[-1] == 19
def test_arange() -> None:
    """A lazy filter can compare a column against an `arange` expression."""
    lazy_frame = pl.DataFrame({"a": [1, 1, 1]}).lazy()

    # Row-wise: 1 >= 0, 1 >= 1, 1 >= 2 -> only the first two rows survive.
    keep = pl.col("a") >= pl.arange(0, 3)
    result = lazy_frame.filter(keep).collect()

    assert result.frame_equal(pl.DataFrame({"a": [1, 1]}))