def test_arange_no_rows() -> None:
    df = pl.DataFrame(dict(x=[5, 5, 4, 4, 2, 2]))
    out = df.with_column(pl.arange(0, pl.count()).over("x"))  # type: ignore[union-attr]
    assert out.frame_equal(
        pl.DataFrame({"x": [5, 5, 4, 4, 2, 2], "literal": [0, 1, 0, 1, 0, 1]})
    )

    df = pl.DataFrame(dict(x=[]))
    out = df.with_column(pl.arange(0, pl.count()).over("x"))  # type: ignore[union-attr]
    assert out.frame_equal(pl.DataFrame({"x": [], "literal": []}))
def test_count_expr() -> None:
    df = pl.DataFrame({"a": [1, 2, 3, 3, 3], "b": ["a", "a", "b", "a", "a"]})

    out = df.select(pl.count())
    assert out.shape == (1, 1)
    assert out[0, 0] == 5

    out = df.groupby("b", maintain_order=True).agg(pl.count())
    assert out["b"].to_list() == ["a", "b"]
    assert out["count"].to_list() == [4, 1]
def test_lazy_functions() -> None:
    df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
    out = df[[pl.count("a")]]
    assert out["a"] == 3
    assert pl.count(df["a"]) == 3
    out = df[
        [
            pl.var("b"),
            pl.std("b"),
            pl.max("b"),
            pl.min("b"),
            pl.sum("b"),
            pl.mean("b"),
            pl.median("b"),
            pl.n_unique("b"),
            pl.first("b"),
            pl.last("b"),
        ]
    ]
    expected = 1.0
    assert np.isclose(out.select_at_idx(0), expected)
    assert np.isclose(pl.var(df["b"]), expected)
    expected = 1.0
    assert np.isclose(out.select_at_idx(1), expected)
    assert np.isclose(pl.std(df["b"]), expected)
    expected = 3
    assert np.isclose(out.select_at_idx(2), expected)
    assert np.isclose(pl.max(df["b"]), expected)
    expected = 1
    assert np.isclose(out.select_at_idx(3), expected)
    assert np.isclose(pl.min(df["b"]), expected)
    expected = 6
    assert np.isclose(out.select_at_idx(4), expected)
    assert np.isclose(pl.sum(df["b"]), expected)
    expected = 2
    assert np.isclose(out.select_at_idx(5), expected)
    assert np.isclose(pl.mean(df["b"]), expected)
    expected = 2
    assert np.isclose(out.select_at_idx(6), expected)
    assert np.isclose(pl.median(df["b"]), expected)
    expected = 3
    assert np.isclose(out.select_at_idx(7), expected)
    assert np.isclose(pl.n_unique(df["b"]), expected)
    expected = 1
    assert np.isclose(out.select_at_idx(8), expected)
    assert np.isclose(pl.first(df["b"]), expected)
    expected = 3
    assert np.isclose(out.select_at_idx(9), expected)
    assert np.isclose(pl.last(df["b"]), expected)
def test_lazy_functions() -> None:
    df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
    out = df[[pl.count("a")]]
    assert out[0] == 3
    assert pl.count(df["a"]) == 3
    out = df[
        [
            pl.var("b"),
            pl.std("b"),
            pl.max("b"),
            pl.min("b"),
            pl.sum("b"),
            pl.mean("b"),
            pl.median("b"),
            pl.n_unique("b"),
            pl.first("b"),
            pl.last("b"),
        ]
    ]
    expected = 1.0
    assert np.isclose(out[0], expected)
    assert np.isclose(pl.var(df["b"]), expected)
    expected = 1.0
    assert np.isclose(out[1], expected)
    assert np.isclose(pl.std(df["b"]), expected)
    expected = 3
    assert np.isclose(out[2], expected)
    assert np.isclose(pl.max(df["b"]), expected)
    expected = 1
    assert np.isclose(out[3], expected)
    assert np.isclose(pl.min(df["b"]), expected)
    expected = 6
    assert np.isclose(out[4], expected)
    assert np.isclose(pl.sum(df["b"]), expected)
    expected = 2
    assert np.isclose(out[5], expected)
    assert np.isclose(pl.mean(df["b"]), expected)
    expected = 2
    assert np.isclose(out[6], expected)
    assert np.isclose(pl.median(df["b"]), expected)
    expected = 3
    assert np.isclose(out[7], expected)
    assert np.isclose(pl.n_unique(df["b"]), expected)
    expected = 1
    assert np.isclose(out[8], expected)
    assert np.isclose(pl.first(df["b"]), expected)
    expected = 3
    assert np.isclose(out[9], expected)
    assert np.isclose(pl.last(df["b"]), expected)
def test_sorted_groupby_optimization() -> None:
    df = pl.DataFrame({"a": np.random.randint(0, 5, 20)})

    # The sorted fast path should not reorder the groups, so this tests
    # that we hit the sorted optimization.
    for reverse in [True, False]:
        sorted_implicit = (
            df.with_column(pl.col("a").sort(reverse=reverse))
            .groupby("a")
            .agg(pl.count())
        )
        sorted_explicit = df.groupby("a").agg(pl.count()).sort("a", reverse=reverse)
        assert sorted_explicit.frame_equal(sorted_implicit)
def test_apply_custom_function() -> None:
    df = pl.DataFrame(
        {
            "A": [1, 2, 3, 4, 5],
            "fruits": ["banana", "banana", "apple", "apple", "banana"],
            "B": [5, 4, 3, 2, 1],
            "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
        }
    )

    # Two ways to determine the length of the groups.
    a = (
        df.lazy()
        .groupby("fruits")
        .agg(
            [
                pl.col("cars").apply(lambda groups: groups.len()).alias("custom_1"),
                pl.col("cars").apply(lambda groups: groups.len()).alias("custom_2"),
                pl.count("cars"),
            ]
        )
        .sort("custom_1", reverse=True)
    ).collect()
    expected = pl.DataFrame(
        {
            "fruits": ["banana", "apple"],
            "custom_1": [3, 2],
            "custom_2": [3, 2],
            "cars_count": [3, 2],
        }
    )
    expected["cars_count"] = expected["cars_count"].cast(pl.UInt32)
    assert a.frame_equal(expected)
def test_repeat_expansion_in_groupby() -> None:
    out = (
        pl.DataFrame({"g": [1, 2, 2, 3, 3, 3]})
        .groupby("g", maintain_order=True)
        .agg(pl.repeat(1, pl.count()).cumsum())
        .to_dict()
    )
    assert out == {"g": [1, 2, 3], "literal": [[1], [1, 2], [1, 2, 3]]}
def test_groupby_rolling_by_() -> None:
    df = pl.DataFrame({"group": pl.arange(0, 3, eager=True)}).join(
        pl.DataFrame(
            {
                "datetime": pl.date_range(
                    datetime(2020, 1, 1), datetime(2020, 1, 5), "1d"
                ),
            }
        ),
        how="cross",
    )

    out = (
        df.sort("datetime")
        .groupby_rolling(index_column="datetime", by="group", period="3d")
        .agg([pl.count().alias("count")])
    )

    expected = (
        df.sort(["group", "datetime"])
        .groupby_rolling(index_column="datetime", by="group", period="3d")
        .agg([pl.count().alias("count")])
    )
    assert out.sort(["group", "datetime"]).frame_equal(expected)
    assert out.to_dict(False) == {
        "group": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
        "datetime": [
            datetime(2020, 1, 1, 0, 0),
            datetime(2020, 1, 2, 0, 0),
            datetime(2020, 1, 3, 0, 0),
            datetime(2020, 1, 4, 0, 0),
            datetime(2020, 1, 5, 0, 0),
            datetime(2020, 1, 1, 0, 0),
            datetime(2020, 1, 2, 0, 0),
            datetime(2020, 1, 3, 0, 0),
            datetime(2020, 1, 4, 0, 0),
            datetime(2020, 1, 5, 0, 0),
            datetime(2020, 1, 1, 0, 0),
            datetime(2020, 1, 2, 0, 0),
            datetime(2020, 1, 3, 0, 0),
            datetime(2020, 1, 4, 0, 0),
            datetime(2020, 1, 5, 0, 0),
        ],
        "count": [1, 2, 3, 3, 3, 1, 2, 3, 3, 3, 1, 2, 3, 3, 3],
    }
def test_count_window() -> None:
    assert (
        pl.DataFrame(
            {
                "a": [1, 1, 2],
            }
        )
        .with_column(pl.count().over("a"))["count"]
        .to_list()
    ) == [2, 2, 1]
def test_predicate_count_vstack() -> None:
    l1 = pl.DataFrame(
        {
            "k": ["x", "y"],
            "v": [3, 2],
        }
    ).lazy()
    l2 = pl.DataFrame(
        {
            "k": ["x", "y"],
            "v": [5, 7],
        }
    ).lazy()
    assert pl.concat([l1, l2]).filter(pl.count().over("k") == 2).collect()[
        "v"
    ].to_list() == [3, 2, 5, 7]
def test_apply_custom_function() -> None:
    df = pl.DataFrame(
        {
            "A": [1, 2, 3, 4, 5],
            "fruits": ["banana", "banana", "apple", "apple", "banana"],
            "B": [5, 4, 3, 2, 1],
            "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
        }
    )

    # Two ways to determine the length of the groups.
    (
        df.lazy()
        .groupby("fruits")
        .agg(
            [
                pl.col("cars").apply(lambda groups: groups.len()).alias("custom_1"),
                pl.col("cars").apply(lambda groups: groups.len()).alias("custom_2"),
                pl.count("cars"),
            ]
        )
    ).collect()
def test_groupby_rolling_negative_offset_3914() -> None:
    df = pl.DataFrame(
        {
            "datetime": pl.date_range(datetime(2020, 1, 1), datetime(2020, 1, 5), "1d"),
        }
    )
    assert df.groupby_rolling(index_column="datetime", period="2d", offset="-4d").agg(
        pl.count().alias("count")
    )["count"].to_list() == [0, 0, 1, 2, 2]

    df = pl.DataFrame(
        {
            "ints": range(0, 20),
        }
    )
    assert df.groupby_rolling(index_column="ints", period="2i", offset="-5i").agg(
        [pl.col("ints").alias("matches")]
    )["matches"].to_list() == [
        [],
        [],
        [],
        [0],
        [0, 1],
        [1, 2],
        [2, 3],
        [3, 4],
        [4, 5],
        [5, 6],
        [6, 7],
        [7, 8],
        [8, 9],
        [9, 10],
        [10, 11],
        [11, 12],
        [12, 13],
        [13, 14],
        [14, 15],
        [15, 16],
    ]
import polars as pl

from .dataset import dataset

q = (
    dataset.lazy()
    .groupby("first_name")
    .agg(
        [
            pl.count("party"),
            pl.col("gender").list(),
            pl.first("last_name"),
        ]
    )
    .sort("party_count", reverse=True)
    .limit(5)
)

df = q.collect()
import polars as pl

dataset = pl.DataFrame(
    {
        "A": [1, 2, 3, 4, 5],
        "fruits": ["banana", "banana", "apple", "apple", "banana"],
        "B": [5, 4, 3, 2, 1],
        "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
    }
)

# three ways to determine the length of the groups
q = (
    dataset.lazy()
    .groupby("fruits")
    .agg(
        [
            pl.col("cars").apply(lambda groups: groups.len()).alias("custom_1"),
            pl.col("cars").apply(lambda groups: groups.len()).alias("custom_2"),
            pl.count("cars"),
        ]
    )
)

df = q.collect()
def test_error_on_empty_groupby() -> None:
    with pytest.raises(
        pl.ComputeError, match="expected keys in groupby operation, got nothing"
    ):
        pl.DataFrame(dict(x=[0, 0, 1, 1])).groupby([]).agg(pl.count())
print("out.shape", out.shape) print('out["largest2_v3"].sum()', out["largest2_v3"].sum()) t0 = time.time() print("q9") out = (x.groupby(["id2", "id4"]).agg( (pl.pearson_corr("v1", "v2")**2).alias("r2")).collect()) print(time.time() - t0) print("out.shape", out.shape) print('out["r2"].sum()', out["r2"].sum()) t0 = time.time() print("q10") out = (x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"]).agg( [pl.sum("v3").alias("v3"), pl.count("v1").alias("count")]).collect()) print(time.time() - t0) print("out.shape", out.shape) print("easy took:", easy_time, "s") print("advanced took:", time.time() - t0advanced, "s") print("total took:", time.time() - t00, "s") t00 = time.time() t0 = time.time() print("q1") out = x.groupby("id1").agg(pl.sum("v1")).collect() print(time.time() - t0) assert out.shape == (96, 2) assert out["v1_sum"].sum() == 28501451 t0easy = time.time()
def sub_sample_fragments(
    fragments_df,
    min_uniq_frag=200,
    sampling_fractions=sampling_fractions_default,
    stats_tsv_filename="sampling_stats.tsv",
    whitelist=None,
):
    sampling_fractions_length = len(sampling_fractions)

    # Initialize dataframe for storing all statistics results.
    stats_df = pd.DataFrame(
        {
            "mean_frag_per_bc": np.zeros(sampling_fractions_length, np.float64),
            "median_uniq_frag_per_bc": np.zeros(sampling_fractions_length, np.float64),
            "total_frag_count": np.zeros(sampling_fractions_length, np.uint32),
            "cell_barcode_count": np.zeros(sampling_fractions_length, np.uint32),
        },
        index=pd.Index(data=np.array(sampling_fractions), name="sampling_fraction"),
    )

    # Get all cell barcodes which have more than min_uniq_frag fragments.
    good_cell_barcodes = (
        fragments_df.groupby("CellBarcode")
        .agg(pl.col("FragmentCount").count().alias("nbr_frags_per_CBs"))
        .filter(pl.col("nbr_frags_per_CBs") > min_uniq_frag)
    )

    # Count all good cell barcodes.
    nbr_good_cell_barcodes = good_cell_barcodes.height

    if 1.0 in sampling_fractions:
        # As there is no need to sample when the sampling fraction is 100%,
        # the median number of unique fragments per barcode can be calculated
        # much more efficiently on the original fragments dataframe with
        # counts than on the expanded one, which is needed when sampling is
        # required.
        logger.info("Calculate statistics for sampling fraction 100.0%.")
        logger.info("Keep fragments with good barcodes.")
        fragments_for_good_bc_df = good_cell_barcodes.join(
            fragments_df,
            left_on="CellBarcode",
            right_on="CellBarcode",
            how="left",
        )

        logger.info("Calculate total number of fragments.")
        stats_df.loc[1.0, "total_frag_count"] = fragments_for_good_bc_df.select(
            [pl.col("FragmentCount").sum().alias("TotalFragCount")]
        )["TotalFragCount"][0]

        logger.info("Calculate mean number of fragments per barcode.")
        stats_df.loc[1.0, "mean_frag_per_bc"] = (
            fragments_for_good_bc_df.groupby("CellBarcode")
            .agg([pl.col("FragmentCount").sum().alias("MeanFragmentsPerCB")])
            .select([pl.col("MeanFragmentsPerCB").mean()])["MeanFragmentsPerCB"][0]
        )

        logger.info("Calculate median number of unique fragments per barcode.")
        stats_df.loc[1.0, "median_uniq_frag_per_bc"] = (
            fragments_for_good_bc_df.groupby("CellBarcode")
            .agg(pl.col("FragmentCount").count().alias("UniqueFragmentsPerCB"))
            .select(pl.col("UniqueFragmentsPerCB").median())["UniqueFragmentsPerCB"][0]
        )

        stats_df.loc[1.0, "cell_barcode_count"] = nbr_good_cell_barcodes

        # Delete dataframe to free memory.
        del fragments_for_good_bc_df

    # Create dataframe where each row contains one fragment:
    #   - Original dataframe has a count per fragment with the same cell barcode.
    #   - Create a row for each count, so we can sample fairly afterwards.
    logger.info("Create dataframe with all fragments (for sampling).")
    fragments_all_df = fragments_df.with_column(
        pl.col("FragmentCount").repeat_by(pl.col("FragmentCount"))
    ).explode("FragmentCount")

    # Delete input dataframe to free memory.
    del fragments_df

    for sampling_fraction in sampling_fractions:
        if sampling_fraction == 0.0:
            # All statistics are zero and already set when the stats_df
            # dataframe is created.
            continue
        elif sampling_fraction == 1.0:
            # Statistics for 100% sampling are already calculated, as there is
            # no need for the fragments_all_df dataframe when no sampling is
            # needed. This avoids the expensive groupby operations for the
            # calculation of the median number of unique fragments per barcode.
            continue

        logger.info(
            f"Calculate statistics for sampling fraction {round(sampling_fraction * 100, 1)}%."
        )

        # Sample x% from all fragments (with duplicates) and keep fragments
        # which have good barcodes.
        logger.info(
            f"Sample {round(sampling_fraction * 100, 1)}% from all fragments and keep fragments with good barcodes."
        )
        fragments_sampled_for_good_bc_df = good_cell_barcodes.join(
            fragments_all_df.sample(frac=sampling_fraction),
            left_on="CellBarcode",
            right_on="CellBarcode",
            how="left",
        )

        # Get number of sampled fragments (with possible duplicate fragments)
        # which have good barcodes.
        stats_df.loc[
            sampling_fraction, "total_frag_count"
        ] = fragments_sampled_for_good_bc_df.height

        logger.info("Calculate mean number of fragments per barcode.")
        stats_df.loc[sampling_fraction, "mean_frag_per_bc"] = (
            fragments_sampled_for_good_bc_df.select(
                [pl.col("CellBarcode"), pl.col("FragmentCount")]
            )
            .groupby("CellBarcode")
            .agg([pl.count("FragmentCount").alias("FragmentsPerCB")])
            .select([pl.col("FragmentsPerCB").mean().alias("MeanFragmentsPerCB")])[
                "MeanFragmentsPerCB"
            ][0]
        )

        logger.info("Calculate median number of unique fragments per barcode.")
        stats_df.loc[sampling_fraction, "median_uniq_frag_per_bc"] = (
            fragments_sampled_for_good_bc_df.groupby(
                ["CellBarcode", "Chromosome", "Start", "End"]
            )
            .agg([pl.col("FragmentCount").first().alias("FragmentCount")])
            .select([pl.col("CellBarcode"), pl.col("FragmentCount")])
            .groupby("CellBarcode")
            .agg(pl.col("FragmentCount").count().alias("UniqueFragmentsPerCB"))
            .select(pl.col("UniqueFragmentsPerCB").median())["UniqueFragmentsPerCB"][0]
        )

        stats_df.loc[sampling_fraction, "cell_barcode_count"] = nbr_good_cell_barcodes

        # Delete dataframe to free memory.
        del fragments_sampled_for_good_bc_df

    logger.info(f'Saving statistics in "{stats_tsv_filename}".')
    stats_df.to_csv(stats_tsv_filename, sep="\t")

    return stats_df
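A minimal invocation sketch, not from the source: the input path and the read step are assumptions, and the fragments table is assumed to carry the CellBarcode/Chromosome/Start/End/FragmentCount columns that the groupby calls above rely on.

# Hypothetical usage of sub_sample_fragments(); "fragments.tsv" is an assumed path.
import polars as pl

# Assumed: a headered TSV providing the columns used above.
fragments_df = pl.read_csv("fragments.tsv", sep="\t")

stats_df = sub_sample_fragments(
    fragments_df,
    min_uniq_frag=200,
    sampling_fractions=[0.0, 0.25, 0.5, 0.75, 1.0],
    stats_tsv_filename="sampling_stats.tsv",
)
print(stats_df)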
causal_STR_candidates = pl.read_csv(
    f'{ukb}/post_finemapping/intermediate_results/concordant_causal_STR_candidates.tab',
    sep='\t',
).select([
    'phenotype',
    'chrom',
    'pos',
    pl.lit(True).alias('is_causal_STR_candidate'),
])

all_STRs = pl.DataFrame(all_STRs).join(
    causal_STR_candidates.groupby(['chrom', 'pos']).agg(
        pl.col('is_causal_STR_candidate').any()
    ),
    how='left',
    left_on=['chrom', 'SNPSTR_start_pos'],
    right_on=['chrom', 'pos'],
)
# Sanity check: every (chrom, pos) pair should occur exactly once after the join.
assert all_STRs.groupby(['chrom', 'pos']).agg(pl.count()).select(
    pl.col('count').max().alias('out')
)['out'].to_numpy()[0] == 1

'''
finemapping_results = pl.read_csv(
    'post_finemapping/intermediate_results/finemapping_putatively_causal_concordance.tab',
    sep='\t'
).filter(
    ~pl.col('finemap_pip').is_null() &
    ~pl.col('susie_alpha').is_null() &
    pl.col('is_STR') &
    (pl.col('p_val') <= 1e-10)
).with_columns([
    pl.when(pl.col('susie_cs') > 0).then(pl.col('susie_alpha')).otherwise(0).alias('susie_alpha'),
    pl.when(pl.col('susie_cs_ratio') > 0).then(pl.col('susie_alpha_ratio')).otherwise(0).alias('susie_alpha_ratio'),
    pl.when(pl.col('susie_cs_hardcall') > 0).then(pl.col('susie_alpha_hardcall')).otherwise(0).alias('susie_alpha_hardcall'),
])
def map_expr(name: str) -> pl.Expr:
    # If nulls are ignored (or the column contains none), emit a struct with
    # the sum and the non-null count; otherwise emit null.
    return (
        pl.when(ignore_nulls or pl.col(name).null_count() == 0)
        .then(
            pl.struct(
                [
                    pl.sum(name).alias("sum"),
                    (pl.count() - pl.col(name).null_count()).alias("count"),
                ]
            ),
        )
        .otherwise(None)
    ).alias("out")
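A minimal usage sketch (hypothetical, not from the source): `map_expr` closes over a free variable `ignore_nulls`, so the sketch defines it in the same scope before selecting the expression.

import polars as pl

ignore_nulls = False  # assumed module-level flag, as referenced by map_expr

df = pl.DataFrame({"a": [1, 2, None]})
# With ignore_nulls False and a null present, the "out" struct is null;
# with ignore_nulls True it would hold {"sum": 3, "count": 2}.
out = df.select(map_expr("a"))
print(out)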