def test_pearson_corr() -> None:
    """Check ``pl.pearson_corr`` inside a per-``era`` group aggregation.

    The correlation between ``prediction`` and ``target`` is aggregated for
    each ``era`` group twice: once with ``pl.col`` expressions as arguments
    and once with plain column names. Both spellings must yield the same
    per-group values.
    """
    frame = pl.DataFrame(
        {
            "era": [1, 1, 1, 2, 2, 2],
            "prediction": [2, 4, 5, 190, 1, 4],
            "target": [1, 3, 2, 1, 43, 3],
        }
    )
    # One expected correlation per era; maintain_order=True is passed below so
    # the results can be compared positionally.
    expected = [0.6546536707079772, -0.5477514993831792]

    def per_era(corr):
        """Aggregate ``corr`` per era and return the resulting values as a list."""
        grouped = frame.groupby("era", maintain_order=True)
        return grouped.agg(corr.alias("c"))["c"].to_list()

    # Arguments given as column expressions.
    by_expr = per_era(pl.pearson_corr(pl.col("prediction"), pl.col("target")))
    assert by_expr == pytest.approx(expected)

    # Column names can also be passed in directly.
    by_name = per_era(pl.pearson_corr("prediction", "target"))
    assert by_name == pytest.approx(expected)
# Report for the query that ran just before this point: result shape plus the
# sum of its "range_v1_v2" column.
print("out.shape", out.shape)
print('out["range_v1_v2"].sum()', out["range_v1_v2"].sum())

# q8: drop rows whose "v3" is null, sort on "v3" with reverse=True, keep the
# first two "v3" values of every "id6" group and explode them into rows.
# NOTE(review): x is presumably a lazy frame, since .collect() is what
# produces the result -- confirm against where x is created.
t0 = time.time()
print("q8")
q8_plan = (
    x.drop_nulls("v3")
    .sort("v3", reverse=True)
    .groupby("id6")
    .agg(pl.col("v3").head(2).alias("largest2_v3"))
    .explode("largest2_v3")
)
out = q8_plan.collect()
print(time.time() - t0)
print("out.shape", out.shape)
print('out["largest2_v3"].sum()', out["largest2_v3"].sum())

# q9: squared Pearson correlation of "v1" and "v2" per ("id2", "id4") group.
t0 = time.time()
print("q9")
r_squared = (pl.pearson_corr("v1", "v2") ** 2).alias("r2")
out = x.groupby(["id2", "id4"]).agg(r_squared).collect()
print(time.time() - t0)
print("out.shape", out.shape)
print('out["r2"].sum()', out["r2"].sum())

# q10: sum of "v3" and count of "v1" for every combination of the six id columns.
t0 = time.time()
print("q10")
q10_keys = ["id1", "id2", "id3", "id4", "id5", "id6"]
q10_aggs = [pl.sum("v3").alias("v3"), pl.count("v1").alias("count")]
out = x.groupby(q10_keys).agg(q10_aggs).collect()
print(time.time() - t0)
print("out.shape", out.shape)

# Timing summary. easy_time, t0advanced and t00 are not set in this part of
# the script; presumably they are recorded earlier -- verify before moving
# this section.
print("easy took:", easy_time, "s")
print("advanced took:", time.time() - t0advanced, "s")
print("total took:", time.time() - t00, "s")