Example #1
def test_arange_no_rows() -> None:
    df = pl.DataFrame(dict(x=[5, 5, 4, 4, 2, 2]))
    out = df.with_column(pl.arange(0, pl.count()).over("x"))  # type: ignore[union-attr]
    assert out.frame_equal(
        pl.DataFrame({"x": [5, 5, 4, 4, 2, 2], "literal": [0, 1, 0, 1, 0, 1]})
    )

    df = pl.DataFrame(dict(x=[]))
    out = df.with_column(pl.arange(0, pl.count()).over("x"))  # type: ignore[union-attr]
    assert out.frame_equal(pl.DataFrame({"x": [], "literal": []}))
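Used this way, pl.arange(0, pl.count()).over("x") produces a 0-based row index within each group, which is what the first assertion checks. A minimal sketch of that idea, assuming the same legacy polars API (with_column, expression windows) used throughout this listing:

import polars as pl

df = pl.DataFrame({"x": ["a", "a", "b"]})
# A 0-based counter that restarts for each group keyed by "x".
out = df.with_column(pl.arange(0, pl.count()).over("x").alias("row_in_group"))
# out["row_in_group"] -> [0, 1, 0]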
Example #2
def test_count_expr() -> None:
    df = pl.DataFrame({"a": [1, 2, 3, 3, 3], "b": ["a", "a", "b", "a", "a"]})

    out = df.select(pl.count())
    assert out.shape == (1, 1)
    assert out[0, 0] == 5

    out = df.groupby("b", maintain_order=True).agg(pl.count())
    assert out["b"].to_list() == ["a", "b"]
    assert out["count"].to_list() == [4, 1]
Example #3
def test_lazy_functions():
    df = pl.DataFrame({
        "a": ["foo", "bar", "2"],
        "b": [1, 2, 3],
        "c": [1.0, 2.0, 3.0]
    })
    out = df[[pl.count("a")]]
    assert out["a"] == 3
    assert pl.count(df["a"]) == 3
    out = df[[
        pl.var("b"),
        pl.std("b"),
        pl.max("b"),
        pl.min("b"),
        pl.sum("b"),
        pl.mean("b"),
        pl.median("b"),
        pl.n_unique("b"),
        pl.first("b"),
        pl.last("b"),
    ]]
    expected = 1.0
    assert np.isclose(out.select_at_idx(0), expected)
    assert np.isclose(pl.var(df["b"]), expected)
    expected = 1.0
    assert np.isclose(out.select_at_idx(1), expected)
    assert np.isclose(pl.std(df["b"]), expected)
    expected = 3
    assert np.isclose(out.select_at_idx(2), expected)
    assert np.isclose(pl.max(df["b"]), expected)
    expected = 1
    assert np.isclose(out.select_at_idx(3), expected)
    assert np.isclose(pl.min(df["b"]), expected)
    expected = 6
    assert np.isclose(out.select_at_idx(4), expected)
    assert np.isclose(pl.sum(df["b"]), expected)
    expected = 2
    assert np.isclose(out.select_at_idx(5), expected)
    assert np.isclose(pl.mean(df["b"]), expected)
    expected = 2
    assert np.isclose(out.select_at_idx(6), expected)
    assert np.isclose(pl.median(df["b"]), expected)
    expected = 3
    assert np.isclose(out.select_at_idx(7), expected)
    assert np.isclose(pl.n_unique(df["b"]), expected)
    expected = 1
    assert np.isclose(out.select_at_idx(8), expected)
    assert np.isclose(pl.first(df["b"]), expected)
    expected = 3
    assert np.isclose(out.select_at_idx(9), expected)
    assert np.isclose(pl.last(df["b"]), expected)
Example #4
def test_lazy_functions():
    df = pl.DataFrame({
        "a": ["foo", "bar", "2"],
        "b": [1, 2, 3],
        "c": [1.0, 2.0, 3.0]
    })
    out = df[[pl.count("a")]]
    assert out[0] == 3
    assert pl.count(df["a"]) == 3
    out = df[[
        pl.var("b"),
        pl.std("b"),
        pl.max("b"),
        pl.min("b"),
        pl.sum("b"),
        pl.mean("b"),
        pl.median("b"),
        pl.n_unique("b"),
        pl.first("b"),
        pl.last("b"),
    ]]
    expected = 1.0
    assert np.isclose(out[0], expected)
    assert np.isclose(pl.var(df["b"]), expected)
    expected = 1.0
    assert np.isclose(out[1], expected)
    assert np.isclose(pl.std(df["b"]), expected)
    expected = 3
    assert np.isclose(out[2], expected)
    assert np.isclose(pl.max(df["b"]), expected)
    expected = 1
    assert np.isclose(out[3], expected)
    assert np.isclose(pl.min(df["b"]), expected)
    expected = 6
    assert np.isclose(out[4], expected)
    assert np.isclose(pl.sum(df["b"]), expected)
    expected = 2
    assert np.isclose(out[5], expected)
    assert np.isclose(pl.mean(df["b"]), expected)
    expected = 2
    assert np.isclose(out[6], expected)
    assert np.isclose(pl.median(df["b"]), expected)
    expected = 3
    assert np.isclose(out[7], expected)
    assert np.isclose(pl.n_unique(df["b"]), expected)
    expected = 1
    assert np.isclose(out[8], expected)
    assert np.isclose(pl.first(df["b"]), expected)
    expected = 3
    assert np.isclose(out[9], expected)
    assert np.isclose(pl.last(df["b"]), expected)
Example #5
def test_sorted_groupby_optimization() -> None:
    df = pl.DataFrame({"a": np.random.randint(0, 5, 20)})

    # the sorted optimization should not randomize the
    # groups, so this tests that we hit the sorted optimization
    for reverse in [True, False]:
        sorted_implicit = (df.with_column(
            pl.col("a").sort(reverse=reverse)).groupby("a").agg(pl.count()))

        sorted_explicit = df.groupby("a").agg(pl.count()).sort("a",
                                                               reverse=reverse)
        assert sorted_explicit.frame_equal(sorted_implicit)
Example #6
def test_apply_custom_function():
    df = pl.DataFrame(
        {
            "A": [1, 2, 3, 4, 5],
            "fruits": ["banana", "banana", "apple", "apple", "banana"],
            "B": [5, 4, 3, 2, 1],
            "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
        }
    )

    # two ways to determine the length of the groups.
    a = (
        df.lazy()
        .groupby("fruits")
        .agg(
            [
                pl.col("cars").apply(lambda groups: groups.len()).alias("custom_1"),
                pl.col("cars").apply(lambda groups: groups.len()).alias("custom_2"),
                pl.count("cars"),
            ]
        )
        .sort("custom_1", reverse=True)
    ).collect()
    expected = pl.DataFrame(
        {
            "fruits": ["banana", "apple"],
            "custom_1": [3, 2],
            "custom_2": [3, 2],
            "cars_count": [3, 2],
        }
    )
    expected["cars_count"] = expected["cars_count"].cast(pl.UInt32)
    assert a.frame_equal(expected)
Example #7
def test_repeat_expansion_in_groupby() -> None:
    out = (
        pl.DataFrame({"g": [1, 2, 2, 3, 3, 3]})
        .groupby("g", maintain_order=True)
        .agg(pl.repeat(1, pl.count()).cumsum())
        .to_dict()
    )
    assert out == {"g": [1, 2, 3], "literal": [[1], [1, 2], [1, 2, 3]]}
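Inside the aggregation context, pl.count() evaluates to the group length n, so pl.repeat(1, pl.count()).cumsum() materialises [1, 2, ..., n] for each group, which is exactly the "literal" column asserted above.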
Example #8
def test_groupby_rolling_by_() -> None:
    df = pl.DataFrame({"group": pl.arange(0, 3, eager=True)}).join(
        pl.DataFrame(
            {
                "datetime": pl.date_range(
                    datetime(2020, 1, 1), datetime(2020, 1, 5), "1d"
                ),
            }
        ),
        how="cross",
    )
    out = (
        df.sort("datetime")
        .groupby_rolling(index_column="datetime", by="group", period="3d")
        .agg([pl.count().alias("count")])
    )

    expected = (
        df.sort(["group", "datetime"])
        .groupby_rolling(index_column="datetime", by="group", period="3d")
        .agg([pl.count().alias("count")])
    )
    assert out.sort(["group", "datetime"]).frame_equal(expected)
    assert out.to_dict(False) == {
        "group": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
        "datetime": [
            datetime(2020, 1, 1, 0, 0),
            datetime(2020, 1, 2, 0, 0),
            datetime(2020, 1, 3, 0, 0),
            datetime(2020, 1, 4, 0, 0),
            datetime(2020, 1, 5, 0, 0),
            datetime(2020, 1, 1, 0, 0),
            datetime(2020, 1, 2, 0, 0),
            datetime(2020, 1, 3, 0, 0),
            datetime(2020, 1, 4, 0, 0),
            datetime(2020, 1, 5, 0, 0),
            datetime(2020, 1, 1, 0, 0),
            datetime(2020, 1, 2, 0, 0),
            datetime(2020, 1, 3, 0, 0),
            datetime(2020, 1, 4, 0, 0),
            datetime(2020, 1, 5, 0, 0),
        ],
        "count": [1, 2, 3, 3, 3, 1, 2, 3, 3, 3, 1, 2, 3, 3, 3],
    }
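The asserted counts follow from the rolling-window arithmetic: with period="3d" and (assuming) the default closed="right", the window for a row at time t covers (t - 3 days, t]. A small plain-Python sketch of that arithmetic:

from datetime import datetime, timedelta

days = [datetime(2020, 1, d) for d in range(1, 6)]
# Window for a row at time t is assumed to be (t - 3 days, t].
counts = [sum(t - timedelta(days=3) < s <= t for s in days) for t in days]
# counts -> [1, 2, 3, 3, 3], matching the "count" column above within each group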
Example #9
def test_count_window() -> None:
    assert (
        pl.DataFrame(
            {
                "a": [1, 1, 2],
            }
        )
        .with_column(pl.count().over("a"))["count"]
        .to_list()
    ) == [2, 2, 1]
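pl.count().over("a") broadcasts each group's size back onto the rows of that group, whereas groupby(...).agg(pl.count()) returns one row per group. A minimal sketch of the contrast, again assuming the legacy API used in these examples:

import polars as pl

df = pl.DataFrame({"a": [1, 1, 2]})
per_group = df.groupby("a").agg(pl.count())      # one row per group: sizes 2 and 1
per_row = df.with_column(pl.count().over("a"))   # one row per input row: [2, 2, 1]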
Example #10
def test_predicate_count_vstack() -> None:
    l1 = pl.DataFrame({
        "k": ["x", "y"],
        "v": [3, 2],
    }).lazy()
    l2 = pl.DataFrame({
        "k": ["x", "y"],
        "v": [5, 7],
    }).lazy()
    assert pl.concat([l1, l2]).filter(
        pl.count().over("k") == 2).collect()["v"].to_list() == [3, 2, 5, 7]
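Both lazy frames contribute one row per key, so after the concat pl.count().over("k") equals 2 for every row and the filter keeps all four values.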
Example #11
def test_apply_custom_function():
    df = pl.DataFrame({
        "A": [1, 2, 3, 4, 5],
        "fruits": ["banana", "banana", "apple", "apple", "banana"],
        "B": [5, 4, 3, 2, 1],
        "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
    })

    # two ways to determine the length of the groups.
    (df.lazy().groupby("fruits").agg([
        pl.col("cars").apply(lambda groups: groups.len()).alias("custom_1"),
        pl.col("cars").apply(lambda groups: groups.len()).alias("custom_2"),
        pl.count("cars"),
    ])).collect()
Example #12
def test_groupby_rolling_negative_offset_3914() -> None:
    df = pl.DataFrame(
        {
            "datetime": pl.date_range(datetime(2020, 1, 1), datetime(2020, 1, 5), "1d"),
        }
    )
    assert df.groupby_rolling(index_column="datetime", period="2d", offset="-4d").agg(
        pl.count().alias("count")
    )["count"].to_list() == [0, 0, 1, 2, 2]

    df = pl.DataFrame(
        {
            "ints": range(0, 20),
        }
    )

    assert df.groupby_rolling(index_column="ints", period="2i", offset="-5i",).agg(
        [pl.col("ints").alias("matches")]
    )["matches"].to_list() == [
        [],
        [],
        [],
        [0],
        [0, 1],
        [1, 2],
        [2, 3],
        [3, 4],
        [4, 5],
        [5, 6],
        [6, 7],
        [7, 8],
        [8, 9],
        [9, 10],
        [10, 11],
        [11, 12],
        [12, 13],
        [13, 14],
        [14, 15],
        [15, 16],
    ]
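The expected matches follow from the same window arithmetic with a negative offset: for an integer index i, offset="-5i" and period="2i" give (assuming the default closed="right") the window (i - 5, i - 3]. A plain-Python sketch that reproduces the assertion:

windows = [[j for j in range(20) if i - 5 < j <= i - 3] for i in range(20)]
# windows matches the expected "matches" lists asserted above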
Example #13
import polars as pl

from .dataset import dataset

q = (dataset.lazy().groupby("first_name").agg(
    [pl.count("party"),
     pl.col("gender").list(),
     pl.first("last_name")]).sort("party_count", reverse=True).limit(5))

df = q.collect()
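Note: under the legacy naming scheme used in this listing, pl.count("party") yields a column named "party_count" (compare the expected "cars_count" column in Example #6), which is the column the .sort("party_count", reverse=True) call refers to.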
Example #14
import polars as pl

dataset = pl.DataFrame(
    {
        "A": [1, 2, 3, 4, 5],
        "fruits": ["banana", "banana", "apple", "apple", "banana"],
        "B": [5, 4, 3, 2, 1],
        "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
    }
)

# three ways to determine the length of the groups
q = (
    dataset.lazy()
    .groupby("fruits")
    .agg(
        [
            pl.col("cars").apply(lambda groups: groups.len()).alias("custom_1"),
            pl.col("cars").apply(lambda groups: groups.len()).alias("custom_2"),
            pl.count("cars"),
        ]
    )
)

df = q.collect()
Example #15
def test_error_on_empty_groupby() -> None:
    with pytest.raises(
            pl.ComputeError,
            match="expected keys in groupby operation, got nothing"):
        pl.DataFrame(dict(x=[0, 0, 1, 1])).groupby([]).agg(pl.count())
Example #16
File: main.py Project: ghuls/polars
print("out.shape", out.shape)
print('out["largest2_v3"].sum()', out["largest2_v3"].sum())

t0 = time.time()
print("q9")
out = (x.groupby(["id2", "id4"]).agg(
    (pl.pearson_corr("v1", "v2")**2).alias("r2")).collect())
print(time.time() - t0)
print("out.shape", out.shape)
print('out["r2"].sum()', out["r2"].sum())

t0 = time.time()
print("q10")
out = (x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"]).agg(
    [pl.sum("v3").alias("v3"),
     pl.count("v1").alias("count")]).collect())
print(time.time() - t0)
print("out.shape", out.shape)
print("easy took:", easy_time, "s")
print("advanced took:", time.time() - t0advanced, "s")
print("total took:", time.time() - t00, "s")

t00 = time.time()
t0 = time.time()
print("q1")
out = x.groupby("id1").agg(pl.sum("v1")).collect()
print(time.time() - t0)
assert out.shape == (96, 2)
assert out["v1_sum"].sum() == 28501451

t0easy = time.time()
Example #17
def sub_sample_fragments(
    fragments_df,
    min_uniq_frag=200,
    sampling_fractions=sampling_fractions_default,
    stats_tsv_filename="sampling_stats.tsv",
    whitelist=None,
):
    sampling_fractions_length = len(sampling_fractions)

    # Initialize dataframe for storing all statistics results.
    stats_df = pd.DataFrame(
        {
            "mean_frag_per_bc":
            np.zeros(sampling_fractions_length, np.float64),
            "median_uniq_frag_per_bc":
            np.zeros(sampling_fractions_length, np.float64),
            "total_frag_count":
            np.zeros(sampling_fractions_length, np.uint32),
            "cell_barcode_count":
            np.zeros(sampling_fractions_length, np.uint32),
        },
        index=pd.Index(data=np.array(sampling_fractions),
                       name="sampling_fraction"),
    )

    # Get all cell barcodes which have more than min_uniq_frag fragments.
    good_cell_barcodes = fragments_df.groupby("CellBarcode").agg(
        pl.col("FragmentCount").count().alias('nbr_frags_per_CBs')).filter(
            pl.col("nbr_frags_per_CBs") > min_uniq_frag)

    # Count all good cell barcodes.
    nbr_good_cell_barcodes = good_cell_barcodes.height

    if 1.0 in sampling_fractions:
        # As there is no need to sample when sampling fraction is 100%,
        # the median number of unique fragments per barcode can be
        # calculated much more efficiently on the original fragments
        # file dataframe with counts than the expanded one, which is
        # needed when sampling is required.

        logger.info("Calculate statistics for sampling fraction 100.0%.")

        logger.info(f"Keep fragments with good barcodes.")
        fragments_for_good_bc_df = good_cell_barcodes.join(
            fragments_df,
            left_on="CellBarcode",
            right_on="CellBarcode",
            how="left")

        logger.info("Calculate total number of fragments.")
        stats_df.loc[1.0,
                     "total_frag_count"] = fragments_for_good_bc_df.select([
                         pl.col("FragmentCount").sum().alias("TotalFragCount")
                     ])["TotalFragCount"][0]

        logger.info("Calculate mean number of fragments per barcode.")
        stats_df.loc[
            1.0, "mean_frag_per_bc"] = fragments_for_good_bc_df.groupby(
                "CellBarcode").agg([
                    pl.col("FragmentCount").sum().alias("MeanFragmentsPerCB")
                ]).select([pl.col("MeanFragmentsPerCB").mean()
                           ])["MeanFragmentsPerCB"][0]

        logger.info("Calculate median number of unique fragments per barcode.")
        stats_df.loc[
            1.0, "median_uniq_frag_per_bc"] = fragments_for_good_bc_df.groupby(
                "CellBarcode").agg(
                    pl.col("FragmentCount").count().alias(
                        "UniqueFragmentsPerCB")).select(
                            pl.col("UniqueFragmentsPerCB").median()
                        )["UniqueFragmentsPerCB"][0]

        stats_df.loc[1.0, "cell_barcode_count"] = nbr_good_cell_barcodes

        # Delete dataframe to free memory.
        del fragments_for_good_bc_df

    # Create dataframe where each row contains one fragment:
    #   - Original dataframe has a count per fragment with the same cell barcode.
    #   - Create a row for each count, so we can sample fairly afterwards.
    logger.info("Create dataframe with all fragments (for sampling).")
    fragments_all_df = fragments_df.with_column(
        pl.col("FragmentCount").repeat_by(
            pl.col("FragmentCount"))).explode("FragmentCount")

    # Delete input dataframe to free memory.
    del fragments_df

    for sampling_fraction in sampling_fractions:
        if sampling_fraction == 0.0:
            # All statistics are zero and already set when the stats_df dataframe is created.
            continue
        elif sampling_fraction == 1.0:
            # Statistics for 100% sampling are already calculated as there is no need
            # to have the fragments_all_df dataframe as no sampling is needed.
            # This avoids the need to use the expensive groupby operations for the
            # calculations of the median number of unique fragments per barcode.
            continue

        logger.info(
            f"Calculate statistics for sampling fraction {round(sampling_fraction * 100, 1)}%."
        )

        # Sample x% from all fragments (with duplicates) and keep fragments which have good barcodes.
        logger.info(
            f"Sample {round(sampling_fraction * 100, 1)}% from all fragments and keep fragments with good barcodes."
        )
        fragments_sampled_for_good_bc_df = good_cell_barcodes.join(
            fragments_all_df.sample(frac=sampling_fraction),
            left_on="CellBarcode",
            right_on="CellBarcode",
            how="left")

        # Get number of sampled fragments (with possible duplicate fragments) which have good barcodes.
        stats_df.loc[
            sampling_fraction,
            "total_frag_count"] = fragments_sampled_for_good_bc_df.height

        logger.info("Calculate mean number of fragments per barcode.")
        stats_df.loc[
            sampling_fraction,
            "mean_frag_per_bc"] = fragments_sampled_for_good_bc_df.select([
                pl.col('CellBarcode'),
                pl.col('FragmentCount')
            ]).groupby("CellBarcode").agg(
                [pl.count("FragmentCount").alias("FragmentsPerCB")]).select([
                    pl.col("FragmentsPerCB").mean().alias("MeanFragmentsPerCB")
                ])["MeanFragmentsPerCB"][0]

        logger.info("Calculate median number of unique fragments per barcode.")
        stats_df.loc[
            sampling_fraction,
            "median_uniq_frag_per_bc"] = fragments_sampled_for_good_bc_df.groupby(
                ["CellBarcode", "Chromosome", "Start", "End"]).agg([
                    pl.col("FragmentCount").first().alias("FragmentCount")
                ]).select([pl.col("CellBarcode"),
                           pl.col("FragmentCount")
                           ]).groupby("CellBarcode").agg(
                               pl.col("FragmentCount").count().alias(
                                   "UniqueFragmentsPerCB")).select(
                                       pl.col("UniqueFragmentsPerCB").median()
                                   )["UniqueFragmentsPerCB"][0]

        stats_df.loc[sampling_fraction,
                     "cell_barcode_count"] = nbr_good_cell_barcodes

        # Delete dataframe to free memory.
        del fragments_sampled_for_good_bc_df

    logger.info(f'Saving statistics in "{stats_tsv_filename}".')
    stats_df.to_csv(stats_tsv_filename, sep="\t")

    return stats_df
Example #18
    causal_STR_candidates = pl.read_csv(
        f'{ukb}/post_finemapping/intermediate_results/concordant_causal_STR_candidates.tab',
        sep='\t').select([
            'phenotype', 'chrom', 'pos',
            pl.lit(True).alias('is_causal_STR_candidate')
        ])

    all_STRs = pl.DataFrame(all_STRs).join(
        causal_STR_candidates.groupby(['chrom', 'pos']).agg(
            pl.col('is_causal_STR_candidate').any()),
        how='left',
        left_on=['chrom', 'SNPSTR_start_pos'],
        right_on=['chrom', 'pos'])

    assert all_STRs.groupby(['chrom', 'pos']).agg(pl.count()).select(
        pl.col('count').max().alias('out'))['out'].to_numpy()[0] == 1
    '''
    finemapping_results = pl.read_csv(
        'post_finemapping/intermediate_results/finemapping_putatively_causal_concordance.tab',
        sep='\t'
    ).filter(
        ~pl.col('finemap_pip').is_null() &
        ~pl.col('susie_alpha').is_null() &
        pl.col('is_STR') &
        (pl.col('p_val') <= 1e-10)
    ).with_columns([
        pl.when(pl.col('susie_cs') > 0).then(pl.col('susie_alpha')).otherwise(0).alias('susie_alpha'),
        pl.when(pl.col('susie_cs_ratio') > 0).then(pl.col('susie_alpha_ratio')).otherwise(0).alias('susie_alpha_ratio'),
        pl.when(pl.col('susie_cs_hardcall') > 0).then(pl.col('susie_alpha_hardcall')).otherwise(0).alias('susie_alpha_hardcall'),
    ])
Example #19
 def map_expr(name: str) -> pl.Expr:
     return (pl.when(ignore_nulls or pl.col(name).null_count() == 0).then(
         pl.struct([
             pl.sum(name).alias("sum"),
             (pl.count() - pl.col(name).null_count()).alias("count"),
         ]), ).otherwise(None)).alias("out")
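Here (pl.count() - pl.col(name).null_count()) is the number of non-null values in the column, so the struct carries the sum together with the non-null count, and the whole expression evaluates to null unless ignore_nulls is set or the column contains no nulls.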