def test_lazy_functions():
    df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
    out = df[[pl.count("a")]]
    assert out["a"] == 3
    assert pl.count(df["a"]) == 3
    out = df[
        [
            pl.var("b"),
            pl.std("b"),
            pl.max("b"),
            pl.min("b"),
            pl.sum("b"),
            pl.mean("b"),
            pl.median("b"),
            pl.n_unique("b"),
            pl.first("b"),
            pl.last("b"),
        ]
    ]
    expected = 1.0
    assert np.isclose(out.select_at_idx(0), expected)
    assert np.isclose(pl.var(df["b"]), expected)
    expected = 1.0
    assert np.isclose(out.select_at_idx(1), expected)
    assert np.isclose(pl.std(df["b"]), expected)
    expected = 3
    assert np.isclose(out.select_at_idx(2), expected)
    assert np.isclose(pl.max(df["b"]), expected)
    expected = 1
    assert np.isclose(out.select_at_idx(3), expected)
    assert np.isclose(pl.min(df["b"]), expected)
    expected = 6
    assert np.isclose(out.select_at_idx(4), expected)
    assert np.isclose(pl.sum(df["b"]), expected)
    expected = 2
    assert np.isclose(out.select_at_idx(5), expected)
    assert np.isclose(pl.mean(df["b"]), expected)
    expected = 2
    assert np.isclose(out.select_at_idx(6), expected)
    assert np.isclose(pl.median(df["b"]), expected)
    expected = 3
    assert np.isclose(out.select_at_idx(7), expected)
    assert np.isclose(pl.n_unique(df["b"]), expected)
    expected = 1
    assert np.isclose(out.select_at_idx(8), expected)
    assert np.isclose(pl.first(df["b"]), expected)
    expected = 3
    assert np.isclose(out.select_at_idx(9), expected)
    assert np.isclose(pl.last(df["b"]), expected)

def test_lazy_functions():
    df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
    out = df[[pl.count("a")]]
    assert out[0] == 3
    assert pl.count(df["a"]) == 3
    out = df[
        [
            pl.var("b"),
            pl.std("b"),
            pl.max("b"),
            pl.min("b"),
            pl.sum("b"),
            pl.mean("b"),
            pl.median("b"),
            pl.n_unique("b"),
            pl.first("b"),
            pl.last("b"),
        ]
    ]
    expected = 1.0
    assert np.isclose(out[0], expected)
    assert np.isclose(pl.var(df["b"]), expected)
    expected = 1.0
    assert np.isclose(out[1], expected)
    assert np.isclose(pl.std(df["b"]), expected)
    expected = 3
    assert np.isclose(out[2], expected)
    assert np.isclose(pl.max(df["b"]), expected)
    expected = 1
    assert np.isclose(out[3], expected)
    assert np.isclose(pl.min(df["b"]), expected)
    expected = 6
    assert np.isclose(out[4], expected)
    assert np.isclose(pl.sum(df["b"]), expected)
    expected = 2
    assert np.isclose(out[5], expected)
    assert np.isclose(pl.mean(df["b"]), expected)
    expected = 2
    assert np.isclose(out[6], expected)
    assert np.isclose(pl.median(df["b"]), expected)
    expected = 3
    assert np.isclose(out[7], expected)
    assert np.isclose(pl.n_unique(df["b"]), expected)
    expected = 1
    assert np.isclose(out[8], expected)
    assert np.isclose(pl.first(df["b"]), expected)
    expected = 3
    assert np.isclose(out[9], expected)
    assert np.isclose(pl.last(df["b"]), expected)

def test_horizontal_agg(fruits_cars: pl.DataFrame) -> None:
    df = fruits_cars
    out = df.select(pl.max([pl.col("A"), pl.col("B")]))
    assert out[:, 0].to_list() == [5, 4, 3, 4, 5]

    out = df.select(pl.min([pl.col("A"), pl.col("B")]))
    assert out[:, 0].to_list() == [1, 2, 3, 2, 1]

def test_rolling() -> None:
    dates = [
        "2020-01-01 13:45:48",
        "2020-01-01 16:42:13",
        "2020-01-01 16:45:09",
        "2020-01-02 18:12:48",
        "2020-01-03 19:45:32",
        "2020-01-08 23:16:43",
    ]

    df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_column(
        pl.col("dt").str.strptime(pl.Datetime)
    )

    # sum, min and max of "a" over a rolling window of two days on the "dt" index column
    out = df.groupby_rolling(index_column="dt", period="2d").agg(
        [
            pl.sum("a").alias("sum_a"),
            pl.min("a").alias("min_a"),
            pl.max("a").alias("max_a"),
        ]
    )

    assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1]
    assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1]
    assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1]

def test_min_nulls_consistency() -> None:
    df = pl.DataFrame({"a": [None, 2, 3], "b": [4, None, 6], "c": [7, 5, 0]})

    # horizontal min/max skip null values instead of propagating them
    out = df.select([pl.min(["a", "b", "c"])]).to_series()
    expected = pl.Series("min", [4, 2, 0])
    testing.assert_series_equal(out, expected)

    out = df.select([pl.max(["a", "b", "c"])]).to_series()
    expected = pl.Series("max", [7, 5, 6])
    testing.assert_series_equal(out, expected)

def test_fold() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = df.select(
        [
            pl.sum(["a", "b"]),
            pl.max(["a", pl.col("b") ** 2]),
            pl.min(["a", pl.col("b") ** 2]),
        ]
    )
    assert out["sum"].series_equal(pl.Series("sum", [2.0, 4.0, 6.0]))
    assert out["max"].series_equal(pl.Series("max", [1.0, 4.0, 9.0]))
    assert out["min"].series_equal(pl.Series("min", [1.0, 2.0, 3.0]))

    out = df.select(
        pl.fold(acc=pl.lit(0), f=lambda acc, x: acc + x, exprs=pl.col("*")).alias("foo")
    )
    assert out["foo"] == [2, 4, 6]

easy_time = time.time() - t0easy
t0advanced = time.time()

t0 = time.time()
print("q6")
out = (
    x.groupby(["id4", "id5"])
    .agg([pl.median("v3").alias("v3_median"), pl.std("v3").alias("v3_std")])
    .collect()
)
print(time.time() - t0)
print("out.shape", out.shape)
print('out["v3_median"].sum()', out["v3_median"].sum())
print('out["v3_std"].sum()', out["v3_std"].sum())

t0 = time.time()
print("q7")
out = (
    x.groupby("id3")
    .agg([(pl.max("v1") - pl.min("v2")).alias("range_v1_v2")])
    .collect()
)
print(time.time() - t0)
print("out.shape", out.shape)
print('out["range_v1_v2"].sum()', out["range_v1_v2"].sum())

t0 = time.time()
print("q8")
out = (
    x.drop_nulls("v3")
    .sort("v3", reverse=True)
    .groupby("id6")
    .agg(pl.col("v3").head(2).alias("largest2_v3"))
    .explode("largest2_v3")
    .collect()
)
print(time.time() - t0)
print("out.shape", out.shape)
print('out["largest2_v3"].sum()', out["largest2_v3"].sum())

t0 = time.time()

def test_nested_min_max() -> None:
    df = pl.DataFrame({"a": [1], "b": [2], "c": [3], "d": [4]})
    # horizontal max of two horizontal mins: max(min(a, b), min(c, d)) == max(1, 3) == 3
    out = df.with_column(
        pl.max([pl.min(["a", "b"]), pl.min(["c", "d"])]).alias("t")
    )
    assert out.shape == (1, 5)
    assert out["t"][0] == 3

def test_max_min_multiple_columns(fruits_cars: pl.DataFrame) -> None:
    res = fruits_cars.select(pl.max(["A", "B"]).alias("max"))
    assert res.to_series(0).series_equal(pl.Series("max", [5, 4, 3, 4, 5]))

    res = fruits_cars.select(pl.min(["A", "B"]).alias("min"))
    assert res.to_series(0).series_equal(pl.Series("min", [1, 2, 3, 2, 1]))

import polars as pl

from .dataset import parsed_sorted as dataset


# creates a new polars.Series with differences per row
def mkdiff(cumcases: pl.Series) -> pl.Series:
    return cumcases - cumcases.shift(1)


q = dataset.with_columns(
    [
        pl.col("cumcases")
        .apply(mkdiff)
        .over(pl.col("country"))
        .take(pl.col("country").arg_unique())
        .explode()
        .alias("diffcases"),
        pl.sum("cumcases").over("country").alias("cases/country"),
        pl.sum("cumcases").over("date").alias("sum_cases/day"),
        pl.min("cumcases").over("date").alias("min_cases/day"),
        pl.max("cumcases").over("date").alias("max_cases/day"),
        pl.sum("cumcases").over(pl.col("date").year()).alias("cases/year"),
    ]
)

df = q.collect()

import polars as pl
from polars.lazy import *

reddit = pl.scan_csv("data/reddit.csv").select(
    [pl.sum("comment_karma"), pl.min("link_karma")]
)

if __name__ == "__main__":
    df = reddit.fetch()

    with open("book/src/outputs/how_can_i_aggregate.txt", "w") as f:
        f.write(str(df))

from .dataset import df

import polars as pl
from polars import col

# several aggregations over the "random" column computed in a single projection
df = df[
    [
        pl.sum("random").alias("sum"),
        pl.min("random").alias("min"),
        pl.max("random").alias("max"),
        col("random").max().alias("other_max"),
        pl.std("random").alias("std dev"),
        pl.var("random").alias("variance"),
    ]
]

          fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk),
          chk_time_sec=chkt, on_disk=on_disk)
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans

question = "max v1 - min v2 by id3"  # q7
gc.collect()
t_start = timeit.default_timer()
ans = x.groupby("id3").agg([pl.max("v1"), pl.min("v2")]).select(
    ["id3", (col("v1_max") - col("v2_min")).alias("range_v1_v2")]
).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = [ans["range_v1_v2"].cast(pl.Int64).sum()]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=in_rows, question=question,
          out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver,