# Example 1: predicate pushdown
import pypolars as pl
from pypolars.lazy import *

# scan_csv is lazy: no data is read from disk until fetch/collect runs.
reddit = pl.scan_csv("data/reddit.csv")

# Keep only rows with positive karma whose name starts with an "a".
# The three predicates stay as separate filter nodes so the rendered
# query plan shows them individually before optimization.
reddit = (
    reddit
    .filter(col("comment_karma") > 0)
    .filter(col("link_karma") > 0)
    .filter(col("name").str_contains(r"^a"))
)

if __name__ == "__main__":
    frame = reddit.fetch(int(1e7))
    with open("book/src/outputs/predicate_pushdown_0.txt", "w") as out:
        out.write(str(frame))

    # Render the query plan both before and after optimization.
    for is_optimized, image_path in (
        (False, "book/src/img/predicate_pushdown_0.png"),
        (True, "book/src/img/predicate_pushdown_0_optimized.png"),
    ):
        reddit.show_graph(
            optimized=is_optimized,
            show=False,
            output_path=image_path,
        )
# Example 2: aggregation
import pypolars as pl
from pypolars.lazy import *

# Lazily compute two aggregations over the whole frame in one projection.
aggregations = [
    pl.sum("comment_karma"),
    pl.min("link_karma"),
]
reddit = pl.scan_csv("data/reddit.csv").select(aggregations)

if __name__ == "__main__":
    frame = reddit.fetch()
    with open("book/src/outputs/how_can_i_aggregate.txt", "w") as out:
        out.write(str(frame))
# Example 3: groupby
import pypolars as pl
from pypolars.lazy import *

# Per comment_karma group: count distinct names and take the max link
# karma, then order groups by the distinct-name count, descending.
per_group = [
    col("name").n_unique().alias("unique_names"),
    col("link_karma").max(),
]
reddit = (
    pl.scan_csv("data/reddit.csv")
    .groupby("comment_karma")
    .agg(per_group)
    .sort(by_column="unique_names", reverse=True)
)

if __name__ == "__main__":
    frame = reddit.fetch()
    with open("book/src/outputs/how_can_i_groupby.txt", "w") as out:
        out.write(str(frame))
import pypolars as pl
from pypolars.lazy import *

reddit = pl.scan_csv("data/reddit.csv")
# The runescape file has no header row; expose its first column as "name".
runescape = (
    pl.scan_csv("data/runescape.csv", has_headers=False)
    .select(col("column_1").alias("name"))
)

# Three separate filter nodes: positive karma, names starting with "a".
reddit = (
    reddit
    .filter(col("comment_karma") > 0)
    .filter(col("link_karma") > 0)
    .filter(col("name").str_contains(r"^a"))
)

# Inner-join on the shared name column and keep three columns.
joined = (
    reddit
    .join(runescape, on="name", how="inner")
    .select(["name", "comment_karma", "link_karma"])
)

if __name__ == "__main__":
    # Render the join plan before and after projection pushdown.
    for is_optimized, image_path in (
        (False, "book/src/img/projection_pushdown_0.png"),
        (True, "book/src/img/projection_pushdown_0_optimized.png"),
    ):
        joined.show_graph(
            optimized=is_optimized,
            show=False,
            output_path=image_path,
        )

    frame = joined.fetch(int(1e7))
    with open("book/src/outputs/projection_pushdown_0.txt", "w") as out:
        out.write(str(frame))
# Example 5: timed join
import pypolars as pl
from pypolars.lazy import *
import time

# Lazy scans: nothing is read from disk until fetch/collect runs.
reddit = pl.scan_csv("data/reddit.csv")
# The runescape file has no header; expose its first column as "name".
runestar = pl.scan_csv("data/runescape.csv", has_headers=False).with_column(
    col("column_1").alias("name")
)

reddit = (
    reddit.filter(col("comment_karma") > 0)
    .filter(col("link_karma") > 0)
    .filter(col("name").str_contains(r"^a"))  # filter name that start with an "a"
)

joined = reddit.join(runestar, on="name", how="inner").select(
    ["name", "comment_karma", "link_karma"]
)

if __name__ == "__main__":
    # FIX: execution was previously at module level; guard it so importing
    # this module does not render graphs or fetch 1e7 rows (consistent with
    # the other examples in this book).
    t0 = time.time()

    joined.show_graph(True)  # render the optimized query plan

    df = joined.fetch(int(1e7))

    print(time.time() - t0)
    print(df)
# Example 6: macro benchmark
import pypolars as pl
from pypolars.lazy import *
import time

t0 = time.time()

# Lazy scans: no file is read until collect() executes the plan.
left = pl.scan_csv("data/join_left_80000.csv")
right = pl.scan_csv("data/join_right_80000.csv")
other = pl.scan_csv("data/10000000.csv")

# Join left/right on "key", keep rows with value > 0.5, bucket the value
# into an integer, join that against a per-group sum of the big table,
# and project the two columns of interest.
q = (
    left.join(right, on="key", how="inner")
    .filter(col("value") > 0.5)
    .with_column((col("value") * 10).cast(int))
    .join(
        other.groupby("groups").agg(pl.sum("values")),
        left_on="value",
        right_on="groups",
        how="inner",
    )
    .select(["key", "values_sum"])
)

if __name__ == "__main__":
    # FIX: execution was previously at module level; guard it so importing
    # this module does not run the benchmark.
    # FIX: removed `print(q._la)` — it read a private attribute (debug
    # leftover); the public describe_optimized_plan() below covers it.
    df = q.collect()

    t = time.time() - t0
    # with open("data/macro_bench_polars.txt", "w") as f:
    #     f.write(str(t))
    print(df)
    print(q.describe_optimized_plan())