Example #1
def test_lazy_functions():
    df = pl.DataFrame({
        "a": ["foo", "bar", "2"],
        "b": [1, 2, 3],
        "c": [1.0, 2.0, 3.0]
    })
    out = df[[pl.count("a")]]
    assert out["a"] == 3
    assert pl.count(df["a"]) == 3
    out = df[[
        pl.var("b"),
        pl.std("b"),
        pl.max("b"),
        pl.min("b"),
        pl.sum("b"),
        pl.mean("b"),
        pl.median("b"),
        pl.n_unique("b"),
        pl.first("b"),
        pl.last("b"),
    ]]
    expected = 1.0
    assert np.isclose(out.select_at_idx(0), expected)
    assert np.isclose(pl.var(df["b"]), expected)
    expected = 1.0
    assert np.isclose(out.select_at_idx(1), expected)
    assert np.isclose(pl.std(df["b"]), expected)
    expected = 3
    assert np.isclose(out.select_at_idx(2), expected)
    assert np.isclose(pl.max(df["b"]), expected)
    expected = 1
    assert np.isclose(out.select_at_idx(3), expected)
    assert np.isclose(pl.min(df["b"]), expected)
    expected = 6
    assert np.isclose(out.select_at_idx(4), expected)
    assert np.isclose(pl.sum(df["b"]), expected)
    expected = 2
    assert np.isclose(out.select_at_idx(5), expected)
    assert np.isclose(pl.mean(df["b"]), expected)
    expected = 2
    assert np.isclose(out.select_at_idx(6), expected)
    assert np.isclose(pl.median(df["b"]), expected)
    expected = 3
    assert np.isclose(out.select_at_idx(7), expected)
    assert np.isclose(pl.n_unique(df["b"]), expected)
    expected = 1
    assert np.isclose(out.select_at_idx(8), expected)
    assert np.isclose(pl.first(df["b"]), expected)
    expected = 3
    assert np.isclose(out.select_at_idx(9), expected)
    assert np.isclose(pl.last(df["b"]), expected)
Example #2
def test_lazy_functions():
    df = pl.DataFrame({
        "a": ["foo", "bar", "2"],
        "b": [1, 2, 3],
        "c": [1.0, 2.0, 3.0]
    })
    out = df[[pl.count("a")]]
    assert out[0] == 3
    assert pl.count(df["a"]) == 3
    out = df[[
        pl.var("b"),
        pl.std("b"),
        pl.max("b"),
        pl.min("b"),
        pl.sum("b"),
        pl.mean("b"),
        pl.median("b"),
        pl.n_unique("b"),
        pl.first("b"),
        pl.last("b"),
    ]]
    expected = 1.0
    assert np.isclose(out[0], expected)
    assert np.isclose(pl.var(df["b"]), expected)
    expected = 1.0
    assert np.isclose(out[1], expected)
    assert np.isclose(pl.std(df["b"]), expected)
    expected = 3
    assert np.isclose(out[2], expected)
    assert np.isclose(pl.max(df["b"]), expected)
    expected = 1
    assert np.isclose(out[3], expected)
    assert np.isclose(pl.min(df["b"]), expected)
    expected = 6
    assert np.isclose(out[4], expected)
    assert np.isclose(pl.sum(df["b"]), expected)
    expected = 2
    assert np.isclose(out[5], expected)
    assert np.isclose(pl.mean(df["b"]), expected)
    expected = 2
    assert np.isclose(out[6], expected)
    assert np.isclose(pl.median(df["b"]), expected)
    expected = 3
    assert np.isclose(out[7], expected)
    assert np.isclose(pl.n_unique(df["b"]), expected)
    expected = 1
    assert np.isclose(out[8], expected)
    assert np.isclose(pl.first(df["b"]), expected)
    expected = 3
    assert np.isclose(out[9], expected)
    assert np.isclose(pl.last(df["b"]), expected)
Example #3
def test_horizontal_agg(fruits_cars: pl.DataFrame) -> None:
    df = fruits_cars
    out = df.select(pl.max([pl.col("A"), pl.col("B")]))
    assert out[:, 0].to_list() == [5, 4, 3, 4, 5]

    out = df.select(pl.min([pl.col("A"), pl.col("B")]))
    assert out[:, 0].to_list() == [1, 2, 3, 2, 1]
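The asserted values are consistent with the fruits_cars fixture having A = [1, 2, 3, 4, 5] and B = [5, 4, 3, 2, 1] (an assumption inferred from the expected output; the same data appears in Example #7): pl.max and pl.min over a list of expressions reduce horizontally, per row. A plain-Python sketch of that row-wise reduction:

# Sketch with the assumed fixture values, not part of the original test.
A, B = [1, 2, 3, 4, 5], [5, 4, 3, 2, 1]
assert [max(a, b) for a, b in zip(A, B)] == [5, 4, 3, 4, 5]
assert [min(a, b) for a, b in zip(A, B)] == [1, 2, 3, 2, 1]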
Example #4
def test_rolling() -> None:
    dates = [
        "2020-01-01 13:45:48",
        "2020-01-01 16:42:13",
        "2020-01-01 16:45:09",
        "2020-01-02 18:12:48",
        "2020-01-03 19:45:32",
        "2020-01-08 23:16:43",
    ]

    df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_column(
        pl.col("dt").str.strptime(pl.Datetime)
    )

    out = df.groupby_rolling(index_column="dt", period="2d").agg(
        [
            pl.sum("a").alias("sum_a"),
            pl.min("a").alias("min_a"),
            pl.max("a").alias("max_a"),
        ]
    )

    assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1]
    assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1]
    assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1]
Example #5
def test_min_nulls_consistency() -> None:
    df = pl.DataFrame({"a": [None, 2, 3], "b": [4, None, 6], "c": [7, 5, 0]})
    out = df.select([pl.min(["a", "b", "c"])]).to_series()
    expected = pl.Series("min", [4, 2, 0])
    testing.assert_series_equal(out, expected)

    out = df.select([pl.max(["a", "b", "c"])]).to_series()
    expected = pl.Series("max", [7, 5, 6])
    testing.assert_series_equal(out, expected)
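As the expected series show, the horizontal min and max skip nulls rather than propagating them; a plain-Python sketch of the same row-wise reduction:

# Sketch only: the three rows of the DataFrame above, reduced while ignoring None.
rows = [(None, 4, 7), (2, None, 5), (3, 6, 0)]
assert [min(v for v in r if v is not None) for r in rows] == [4, 2, 0]
assert [max(v for v in r if v is not None) for r in rows] == [7, 5, 6]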
Example #6
def test_fold() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = df.select([
        pl.sum(["a", "b"]),
        pl.max(["a", pl.col("b")**2]),
        pl.min(["a", pl.col("b")**2]),
    ])
    assert out["sum"].series_equal(pl.Series("sum", [2.0, 4.0, 6.0]))
    assert out["max"].series_equal(pl.Series("max", [1.0, 4.0, 9.0]))
    assert out["min"].series_equal(pl.Series("min", [1.0, 2.0, 3.0]))

    out = df.select(
        pl.fold(acc=pl.lit(0), f=lambda acc, x: acc + x,
                exprs=pl.col("*")).alias("foo"))
    assert out["foo"] == [2, 4, 6]
Example #7
def test_window_function():
    df = pl.DataFrame({
        "A": [1, 2, 3, 4, 5],
        "fruits": ["banana", "banana", "apple", "apple", "banana"],
        "B": [5, 4, 3, 2, 1],
        "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
    })

    q = df.lazy().with_columns([
        pl.sum("A").over("fruits").alias("fruit_sum_A"),
        pl.first("B").over("fruits").alias("fruit_first_B"),
        pl.max("B").over("cars").alias("cars_max_B"),
    ])
    out = q.collect()
    assert out["cars_max_B"] == [5, 4, 5, 5, 5]

    out = df[[pl.first("B").over(["fruits", "cars"]).alias("B_first")]]
    assert out["B_first"] == [5, 4, 3, 3, 5]
Example #8
import polars as pl

q = (pl.scan_csv("data/reddit.csv").groupby("comment_karma").agg(
    [pl.col("name").n_unique().alias("unique_names"),
     pl.max("link_karma")]).sort(by_columns="unique_names", reverse=True))

df = q.fetch()
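Note that fetch() runs the lazy plan on only a limited number of input rows, which is useful while iterating on a query; running the same plan over the whole CSV would use collect() instead (a sketch under that assumption):

df_full = q.collect()  # same plan as above, executed on the full dataset rather than a sample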
Example #9
easy_time = time.time() - t0easy
t0advanced = time.time()

t0 = time.time()
print("q6")
out = (x.groupby(["id4", "id5"]).agg(
    [pl.median("v3").alias("v3_median"),
     pl.std("v3").alias("v3_std")]).collect())
print(time.time() - t0)
print("out.shape", out.shape)
print('out["v3_median"].sum()', out["v3_median"].sum())
print('out["v3_std"].sum()', out["v3_std"].sum())

t0 = time.time()
print("q7")
out = (x.groupby("id3").agg([(pl.max("v1") - pl.min("v2")).alias("range_v1_v2")
                             ]).collect())
print(time.time() - t0)
print("out.shape", out.shape)
print('out["range_v1_v2"].sum()', out["range_v1_v2"].sum())

t0 = time.time()
print("q8")
out = (x.drop_nulls("v3").sort("v3", reverse=True).groupby("id6").agg(
    pl.col("v3").head(2).alias("largest2_v3")).explode(
        "largest2_v3").collect())
print(time.time() - t0)
print("out.shape", out.shape)
print('out["largest2_v3"].sum()', out["largest2_v3"].sum())

t0 = time.time()
Example #10
def test_nested_min_max() -> None:
    df = pl.DataFrame({"a": [1], "b": [2], "c": [3], "d": [4]})
    out = df.with_column(
        pl.max([pl.min(["a", "b"]), pl.min(["c", "d"])]).alias("t"))
    assert out.shape == (1, 5)
    assert out["t"][0] == 3
Example #11
def test_max_min_multiple_columns(fruits_cars: pl.DataFrame) -> None:
    res = fruits_cars.select(pl.max(["A", "B"]).alias("max"))
    assert res.to_series(0).series_equal(pl.Series("max", [5, 4, 3, 4, 5]))

    res = fruits_cars.select(pl.min(["A", "B"]).alias("min"))
    assert res.to_series(0).series_equal(pl.Series("min", [1, 2, 3, 2, 1]))
Example #12
import polars as pl

from .dataset import parsed_sorted as dataset


# creates a new polars.Series with differences per row
def mkdiff(cumcases: pl.Series) -> pl.Series:
    return cumcases - cumcases.shift(1)


q = dataset.with_columns(
    [
        pl.col("cumcases")
        .apply(mkdiff)
        .over(pl.col("country"))
        .take(pl.col("country").arg_unique())
        .explode()
        .alias("diffcases"),
        pl.sum("cumcases").over("country").alias("cases/country"),
        pl.sum("cumcases").over("date").alias("sum_cases/day"),
        pl.min("cumcases").over("date").alias("min_cases/day"),
        pl.max("cumcases").over("date").alias("max_cases/day"),
        pl.sum("cumcases").over(pl.col("date").year()).alias("cases/year"),
    ]
)

df = q.collect()
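The mkdiff helper turns a cumulative series into per-row differences, leaving a null in the first slot; a tiny sketch with made-up numbers:

import polars as pl

# Sketch only: what mkdiff computes for one country's cumulative case counts.
s = pl.Series("cumcases", [10, 15, 22])
assert (s - s.shift(1)).to_list() == [None, 5, 7]  # daily new cases from cumulative counts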
Example #13
import polars as pl

dataset = pl.DataFrame({
    "A": [1, 2, 3, 4, 5],
    "fruits": ["banana", "banana", "apple", "apple", "banana"],
    "B": [5, 4, 3, 2, 1],
    "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
})

q = dataset.lazy().with_columns([
    pl.sum("A").over("fruits").alias("fruit_sum_A"),
    pl.first("B").over("fruits").alias("fruit_first_B"),
    pl.max("B").over("cars").alias("cars_max_B"),
])

df = q.collect()
Example #14
from .dataset import df
import polars as pl
from polars import col

df = df[[
    pl.sum("random").alias("sum"),
    pl.min("random").alias("min"),
    pl.max("random").alias("max"),
    col("random").max().alias("other_max"),
    pl.std("random").alias("std dev"),
    pl.var("random").alias("variance"),
]]
Example #15
          fun=fun,
          run=2,
          time_sec=t,
          mem_gb=m,
          cache=cache,
          chk=make_chk(chk),
          chk_time_sec=chkt,
          on_disk=on_disk)
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans

question = "max v1 - min v2 by id3"  # q7
gc.collect()
t_start = timeit.default_timer()
ans = x.groupby("id3").agg([pl.max("v1"), pl.min("v2")]).select(
    ["id3", (col("v1_max") - col("v2_min")).alias("range_v1_v2")]).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = [ans["range_v1_v2"].cast(pl.Int64).sum()]
chkt = timeit.default_timer() - t_start
write_log(task=task,
          data=data_name,
          in_rows=in_rows,
          question=question,
          out_rows=ans.shape[0],
          out_cols=ans.shape[1],
          solution=solution,
          version=ver,