Пример #1
0
def test_n_distinct_handles_in_na_rm():
    """Check n_distinct() with na_rm given as literal, variable, expression.

    The column holds four distinct numbers plus one NA, so the expected
    count is 4 when NA is removed and 5 when it is kept.
    """
    frame = tibble(x=c([1, 2, 3, 4], NA))
    drop_na = True
    keep_na = False

    # na_rm supplied as a literal
    counted = (
        frame >> summarise(n=n_distinct(f.x, na_rm=True)) >> pull(to="list")
    )
    assert counted == [4]
    counted = (
        frame >> summarise(n=n_distinct(f.x, na_rm=False)) >> pull(to="list")
    )
    assert counted == [5]

    # na_rm supplied through a local variable
    counted = (
        frame
        >> summarise(n=n_distinct(f.x, na_rm=drop_na))
        >> pull(to="list")
    )
    assert counted == [4]
    counted = (
        frame
        >> summarise(n=n_distinct(f.x, na_rm=keep_na))
        >> pull(to="list")
    )
    assert counted == [5]

    # na_rm supplied as a boolean expression
    counted = (
        frame
        >> summarise(n=n_distinct(f.x, na_rm=True or True))
        >> pull(to="list")
    )
    assert counted == [4]
Пример #2
0
def test_cur_data_all():
    """Compare cur_data() and cur_data_all() on plain and grouped frames.

    On the grouped frame the assertions expect cur_data() to carry only
    the non-grouping column and cur_data_all() to carry both columns.
    """
    plain = tibble(x=c("b", "a", "b"), y=[1, 2, 3])
    grouped = plain >> group_by(f.x, _sort=True)

    def cell_values(frame):
        # Row-major list of every cell in the nested frame.
        return frame.values.flatten().tolist()

    nested = plain >> summarise(x=cur_data()) >> pull(f.x, to="list")
    assert nested[0].equals(plain)

    nested = plain >> summarise(x=cur_data_all()) >> pull(f.x, to="list")
    assert nested[0].equals(plain)

    per_group = grouped >> summarise(x=cur_data()) >> pull(f.x)
    assert cell_values(per_group.values[0]) == [2]
    assert cell_values(per_group.values[1]) == [1, 3]

    per_group = grouped >> summarise(x=cur_data_all()) >> pull(f.x)
    assert cell_values(per_group.values[0]) == ["a", 2]
    assert cell_values(per_group.values[1]) == ["b", 1, "b", 3]
Пример #3
0
def test_proportion_computed_correctly():
    """Every slice_* verb is expected to keep 1 of 10 rows for prop=0.11."""
    ten_rows = tibble(x=range(1, 11))

    # positional and random slicing
    kept = ten_rows >> slice_head(prop=0.11) >> nrow()
    assert kept == 1
    kept = ten_rows >> slice_tail(prop=0.11) >> nrow()
    assert kept == 1
    kept = ten_rows >> slice_sample(prop=0.11) >> nrow()
    assert kept == 1

    # value-ordered slicing, default tie handling
    kept = ten_rows >> slice_min(f.x, prop=0.11) >> nrow()
    assert kept == 1
    kept = ten_rows >> slice_max(f.x, prop=0.11) >> nrow()
    assert kept == 1

    # value-ordered slicing with ties switched off
    kept = ten_rows >> slice_max(f.x, prop=0.11, with_ties=False) >> nrow()
    assert kept == 1
    kept = ten_rows >> slice_min(f.x, prop=0.11, with_ties=False) >> nrow()
    assert kept == 1
Пример #4
0
def test_handles_passing_args():
    """filter() combines conditions built in different enclosing scopes."""
    df = tibble(x=range(1, 5))

    def apply_conditions(*conditions):
        upper = 4

        def passthrough(value):
            return value

        # Caller-supplied conditions plus a locally built upper bound.
        return df >> filter(*conditions, passthrough(upper) > f.x)

    def build_and_filter():
        lower = 2
        return apply_conditions(f.x > lower)

    kept = build_and_filter()
    assert kept.x.tolist() == [3]

    # Same check once the closed-over frame has been rebound to a grouped one.
    df >>= group_by(f.x)
    kept = build_and_filter()
    assert kept.x.obj.tolist() == [3]
Пример #5
0
def test_across():
    """arrange(across(...)) matches arranging by the columns explicitly."""
    frame = tibble(x=[1, 3, 2, 1], y=[4, 3, 2, 1])

    # no selection: compared against ordering by x then y
    arranged = frame >> arrange(across())
    wanted = frame >> arrange(f.x, f.y)
    assert arranged.equals(wanted)

    # no selection, with desc applied to each column
    arranged = frame >> arrange(across(None, desc))
    wanted = frame >> arrange(desc(f.x), desc(f.y))
    assert arranged.equals(wanted)

    # single selected column: x
    arranged = frame >> arrange(across(f.x))
    wanted = frame >> arrange(f.x)
    assert arranged.equals(wanted)

    # single selected column: y
    arranged = frame >> arrange(across(f.y))
    wanted = frame >> arrange(f.y)
    assert arranged.equals(wanted)
Пример #6
0
def test_dup_keyword_args():
    """mutate() with underscore-prefixed variants of one column name.

    The assertions expect `_b` / `__b` to be absent from the result when a
    less-prefixed name follows them, and to be kept otherwise.
    """
    base = tibble(a=1)

    result = base >> mutate(_b=f.a + 1, b=f._b * 2)
    assert_tibble_equal(result, tibble(a=1, b=4))

    # order doesn't matter
    result = base >> mutate(b=f.a + 1, _b=f.b * 2)
    assert_tibble_equal(result, tibble(a=1, b=2, _b=4))

    # support >= 2 dups
    result = base >> mutate(__b=f.a + 1, _b=f.__b * 2, b=f._b / 4.0)
    assert_tibble_equal(result, tibble(a=1, b=1.0))

    # has to be consecutive
    result = base >> mutate(__b=f.a + 1, _b=f.__b * 2, b=f._b / 4.0)
    assert_tibble_equal(result, tibble(a=1, b=1.0))

    result = base >> mutate(__b=f.a + 1, _b=f.__b * 2)
    assert_tibble_equal(result, tibble(a=1, _b=4))

    result = base >> mutate(_b=f.a + 1)
    assert_tibble_equal(result, tibble(a=1, _b=2))
Пример #7
0
def test_head_tail():
    """head()/tail() default to 6 items, accept n, and work on lists."""
    frame = tibble(x=range(20))

    # head
    top = frame >> head()
    assert top.shape[0] == 6
    top = frame >> head(3)
    assert top.shape[0] == 3
    top = list(range(10)) >> head()
    assert len(top) == 6
    # Called directly with a plain int, no implementation is expected.
    with pytest.raises(NotImplementedError):
        head(3)

    # tail
    bottom = frame >> tail()
    assert bottom.shape[0] == 6
    bottom = frame >> tail(3)
    assert bottom.shape[0] == 3
    bottom = list(range(10)) >> tail()
    assert len(bottom) == 6
    with pytest.raises(NotImplementedError):
        tail(3)
Пример #8
0
 def _get_input_data(ch1, ch2, ch3, ch4):
     """Assemble the input tibble from four upstream channels.

     Rows flagged as normal samples (``options.type_col`` equal to
     ``options.type_normal`` in the metadata derived from ``ch1``) select
     the coverage files from ``ch2`` and ``ch3``; target/antitarget files
     are read from ``ch4``.
     """
     meta = _get_metadf(ch1)
     is_normal = meta[options.type_col] == options.type_normal

     # One cell holding the combined coverage file list, or a single None
     # when no normal sample exists.
     if sum(is_normal) == 0:
         coverage = [None]
     else:
         coverage = [
             ch2.outfile[is_normal].tolist()
             + ch3.outfile[is_normal].tolist()
         ]

     targets = ch4.target_file
     antitargets = ch4.antitarget_file

     # Comma-joined sexes of the normal samples when the column exists.
     if "SampleSex" in meta.columns:
         sexes = ",".join(meta.SampleSex[is_normal])
     else:
         sexes = [None]

     return tibble(
         covfiles=coverage,
         target_file=targets,
         antitarget_file=antitargets,
         sample_sex=sexes,
     )
Пример #9
0
    class MetabolicExprNormalization(Proc):
        """Normalize the expression data using deconvolution

        Requires:
            - name: r-scran
              check: |
                {{proc.lang}} <(echo "library(scran)")
        """
        # NOTE(review): the docstring above looks machine-readable (the
        # "Requires" section) — confirm whether it is parsed at runtime
        # before rewording it.

        # Upstream processes; presumably their outputs arrive as ch1 and ch2
        # of `input_data` in this order — confirm against the framework.
        requires = MetabolicPrepareSCE, MetabolicInputs
        # Two file inputs per job.
        input = "sceobj:file, configfile:file"
        # Output file named after the stem of the input SCE object.
        output = "outfile:file:{{in.sceobj | stem0}}.sce.RDS"
        # Pair the first channel's `outfile` with the second's `configfile`.
        input_data = lambda ch1, ch2: tibble(
            sceobj=ch1.outfile,
            configfile=ch2.configfile,
        )
        # Settings exposed to the script; `refexon` comes from the config.
        envs = {"dropout": 0.75, "refexon": config.ref.refexon}
        # Interpreter taken from the configuration (`config.lang.rscript`).
        lang = config.lang.rscript
        # Script location, given as a relative file:// URL.
        script = (
            "file://../scripts/scrna_metabolic/MetabolicExprNormalization.R")
Пример #10
0
 def _get_input_data(ch1, ch2):
     """Assemble the input tibble for tumor samples.

     Tumor rows are those whose ``options.type_col`` value equals
     ``options.type_tumor`` in the metadata derived from ``ch1``. Each
     optional metadata column falls back to ``[None]`` when it is absent.
     """
     meta = _get_metadf(ch1)
     is_tumor = meta[options.type_col] == options.type_tumor

     def tumor_values(column):
         # Tumor-row values of an optional metadata column.
         if column in meta.columns:
             return getattr(meta, column)[is_tumor]
         return [None]

     chromosome_file = ch2.outfile
     vcf_files = tumor_values("SnpVcf")
     sample_ids = tumor_values("VcfSampleID")
     normal_ids = tumor_values("NormalID")

     return tibble(
         chrfile=chromosome_file,
         vcf=vcf_files,
         sample_id=sample_ids,
         normal_id=normal_ids,
     )
Пример #11
0
def test_get():
    """get() returns the frame, a cell or a sub-frame based on its args."""
    frame = tibble(x=2)
    frame.index = ["a"]

    # no arguments: the whole frame
    fetched = frame >> get()
    assert_frame_equal(fetched, frame)

    # scalar row and column, by position and by label
    fetched = frame >> get(0, 0)
    assert fetched == 2

    fetched = frame >> get("a", "x")
    assert fetched == 2

    # list selectors keep the frame shape
    fetched = frame >> get(["a"], ["x"])
    assert fetched.equals(frame)

    # row only
    fetched = frame >> get("a")
    assert fetched.equals(frame)

    # column only
    fetched = frame >> get(cols="x")
    assert fetched.equals(frame)
Пример #12
0
def test_0col_df_in_results_ignored():
    """An empty tibble passed to summarise() contributes no columns."""
    keys = tibble(x=[1, 2])

    # input with only the grouping column
    summarised = keys >> group_by(f.x) >> summarise(tibble())
    assert summarised.equals(keys)

    summarised = keys >> group_by(f.x) >> summarise(tibble(), y=65)
    wanted = keys >> mutate(y=65)
    assert summarised.equals(wanted)

    # input with an extra, non-grouping column
    wide = tibble(x=[1, 2], y=[3, 4])
    summarised = wide >> group_by(f.x) >> summarise(tibble())
    assert summarised.equals(keys)

    summarised = wide >> group_by(f.x) >> summarise(tibble(), z=98)
    wanted = keys >> mutate(z=98)
    assert summarised.equals(wanted)
Пример #13
0
def test_col_row_verbs():
    """Row-/column-wise medians, means, sums and sds, with and without NA."""
    wide = tribble(
        f.x, f.y, f.z,
        1, NA, 6,
        2, 4, 9,
        3, 6, 15,
    )

    # medians
    assert_iterable_equal(row_medians(wide), [NA, 4, 6])
    assert_iterable_equal(row_medians(wide, na_rm=True), [3.5, 4, 6])
    assert_iterable_equal(col_medians(wide), [2, NA, 9])
    assert_iterable_equal(col_medians(wide, na_rm=True), [2, 5, 9])

    # means
    assert_iterable_equal(row_means(wide), [NA, 5, 8])
    assert_iterable_equal(row_means(wide, na_rm=True), [3.5, 5, 8])
    assert_iterable_equal(col_means(wide), [2, NA, 10])
    assert_iterable_equal(col_means(wide, na_rm=True), [2, 5, 10])

    # sums
    assert_iterable_equal(row_sums(wide), [NA, 15, 24])
    assert_iterable_equal(row_sums(wide, na_rm=True), [7, 15, 24])
    assert_iterable_equal(col_sums(wide), [6, NA, 30])
    assert_iterable_equal(col_sums(wide, na_rm=True), [6, 10, 30])

    # standard deviations (compared approximately)
    row_sd_with_na = [NA, 3.605551275463989, 6.244997998398398]
    assert_iterable_equal(row_sds(wide), row_sd_with_na, approx=True)

    row_sd_no_na = [3.5355339059327378, 3.605551275463989, 6.244997998398398]
    assert_iterable_equal(
        row_sds(wide, na_rm=True), row_sd_no_na, approx=True
    )

    col_sd_with_na = [1.0, NA, 4.58257569495584]
    assert_iterable_equal(col_sds(wide), col_sd_with_na, approx=True)

    col_sd_no_na = [1.0, 1.4142135623730951, 4.58257569495584]
    assert_iterable_equal(
        col_sds(wide, na_rm=True), col_sd_no_na, approx=True
    )

    # grouped
    by_x = tibble(x=[1, 1, 2, 2], y=[3, 4, 3, 4]).group_by('x')
    assert_iterable_equal(col_sums(by_x).y, [7, 7])
    assert_iterable_equal(col_means(by_x).y, [3.5, 3.5])
    assert_iterable_equal(col_medians(by_x).y, [3.5, 3.5])
    assert_iterable_equal(col_sds(by_x).y, [0.7071, 0.7071], approx=1e-3)
Пример #14
0
    class MetabolicFeaturesIntraSubsets(Proc):
        """Intra-subset metabolic features - Enrichment analysis in details

        Requires:
            - name: r-parallel
              check: |
                {{proc.lang}} <(echo "library(parallel)")
            - name: r-scater
              check: |
                {{proc.lang}} <(echo "library(scater)")
            - name: r-fgsea
              check: |
                {{proc.lang}} <(echo "library(fgsea)")
        """
        # NOTE(review): the docstring above looks machine-readable (the
        # "Requires" section) — confirm whether it is parsed at runtime
        # before rewording it.

        # `requires` is only assigned when the "intra-subset" option is
        # truthy; otherwise this class body does not define it at all.
        if options["intra-subset"]:
            requires = MetabolicExprNormalization, MetabolicInputs
        # A list of SCE object files plus a GMT file and a config file.
        input = "sceobjs:files, gmtfile:file, configfile:file"
        # All of the first channel's `outfile` values are wrapped into one
        # list cell; `gmtfile`/`configfile` come from the second channel.
        input_data = lambda ch1, ch2: tibble(
            sceobjs=[list(ch1.outfile)],
            gmtfile=ch2.gmtfile,
            configfile=ch2.configfile,
        )
        # Output directory named after the stem of the config file.
        output = "outdir:dir:{{in.configfile | stem}}.intras-pathwayfeatures"
        # Interpreter taken from the configuration (`config.lang.rscript`).
        lang = config.lang.rscript
        # NOTE(review): presumably a display/ordering index — confirm.
        order = 4
        # Settings exposed to the script; `ncores` comes from the config.
        envs = {
            "ncores": config.misc.ncores,
            "fgsea": True,
            "prerank_method": "signal_to_noise",
            "top": 10,
        }
        # Script location, given as a relative file:// URL.
        script = (
            "file://../scripts/scrna_metabolic/MetabolicFeaturesIntraSubsets.R"
        )
        # Report template location handed over through `plugin_opts`.
        plugin_opts = {
            "report": ("file://../reports/scrna_metabolic/"
                       "MetabolicFeaturesIntraSubsets.svelte")
        }
Пример #15
0
def test_transform_register():
    """Implementations registered per input type on a transform function."""

    @func_factory(kind="transform", data_args="x")
    def double(x):
        return x * 2

    @double.register(DataFrame)
    def _(x):
        return x * 3

    # Series input before any Series-specific registration
    series = Series([2, 3])
    doubled = double(series)
    assert_iterable_equal(doubled, [4, 6])

    # Series input after registering a Series implementation
    double.register(Series, lambda x: x * 4)

    doubled = double(series)
    assert_iterable_equal(doubled, [8, 12])

    # DataFrame input uses the DataFrame implementation registered above
    frame = tibble(a=[1, 3])
    from_frame = double(frame)
    assert_iterable_equal(from_frame.a, [3, 9])

    # plain list input
    from_list = double([1, 4])
    assert_iterable_equal(from_list, [4, 16])

    # register an available string func for transform
    double.register(SeriesGroupBy, "sum")
    grouped = Series([1, -2]).groupby([1, 2])
    from_groups = double(grouped)
    assert_iterable_equal(from_groups.obj, [1, -2])

    # seriesrowwise
    double.register(SeriesRowwise, lambda x: x + 1)
    grouped.is_rowwise = True
    from_rowwise = double(grouped)
    assert_iterable_equal(from_rowwise.obj, [2, -1])
    assert from_rowwise.is_rowwise
Пример #16
0
def test_mixed_rows():
    """slice() with a mix of positive, negative and inverted selections."""
    frame = tibble(x=range(5))

    # order kept
    # positions:  0   1   2   3   4
    # negatives:         -3      -1
    # positive:               3
    picked = slice(frame, c(-c(3, 1), 3))
    assert picked.x.tolist() == [2, 4, 3]

    # positions:  0   1   2   3   4
    # negatives:             -2  -1
    # positive:               3
    picked = slice(frame, c(-f[1:3], 3))
    assert picked.x.tolist() == [4, 3, 3]

    # positions:  0   1   2   3   4
    # inverted:   0       2
    # inverted:                  -1
    picked = slice(frame, c(~c(0, 2), ~c(-1)))
    assert picked.x.tolist() == [1, 3]

    # same kind of inverted selection, through the pipe
    picked = frame >> slice(c(~f[3:], ~c(1)))
    assert picked.x.tolist() == [0, 2]
Пример #17
0
def test_row_number_handles_empty_dfs():
    """Ranking functions add their columns to an empty frame, no rows."""
    wanted_columns = [
        "a",
        "row_number_0",
        "ntile",
        "min_rank",
        "percent_rank",
        "dense_rank",
        "cume_dist",
    ]

    empty = tibble(a=[])
    ranked = empty >> mutate(
        row_number_0=row_number(),
        # row_number_a=row_number(f.a), # row_number doesn't support extra arg
        ntile=ntile(f.a, 2),
        min_rank=min_rank(f.a),
        percent_rank=percent_rank(f.a),
        dense_rank=dense_rank(f.a),
        cume_dist=cume_dist(f.a),
    )

    assert_iterable_equal(ranked.columns, wanted_columns)
    assert nrow(ranked) == 0
Пример #18
0
    class MetabolicPrepareSCE(Proc):
        """Prepare SingleCellExperiment objects

        Requires:
            - name: r-scater
              check: |
                {{proc.lang}} <(echo "library(scater)")
            - name: r-seurat
              check: |
                {{proc.lang}} <(echo "library(Seurat)")
        """
        # NOTE(review): the docstring above looks machine-readable (the
        # "Requires" section) — confirm whether it is parsed at runtime
        # before rewording it.

        # Upstream processes; presumably their outputs arrive as ch1 and ch2
        # of `input_data` in this order — confirm against the framework.
        requires = MetabolicExprImputation, MetabolicInputs
        # A list of imputed files plus one GMT file per job.
        input = "impfiles:files, gmtfile:file"
        # Imputed files are grouped by `_group_imputed_files` (defined
        # elsewhere); `gmtfile` comes from the second channel.
        input_data = lambda ch1, ch2: tibble(
            impfiles=_group_imputed_files(ch1.outfile),
            gmtfile=ch2.gmtfile,
        )
        # Output name: stem of the first imputed file, cut at its first ".",
        # followed by ".sce.RDS".
        output = (
            "outfile:file:"
            "{{in.impfiles | first | stem | split: '.' | first}}.sce.RDS")
        # Interpreter taken from the configuration (`config.lang.rscript`).
        lang = config.lang.rscript
        # Settings exposed to the script; `refexon` comes from the config.
        envs = {"refexon": config.ref.refexon}
        # Script location, given as a relative file:// URL.
        script = "file://../scripts/scrna_metabolic/MetabolicPrepareSCE.R"
Пример #19
0
def test_arguments_to_select_dont_match_vars_select_arguments():
    """Keywords `var`, `include`, `exclude` in select() are plain renames."""
    frame = tibble(a=1)

    # keyword `var`
    picked = select(frame, var=f.a)
    assert picked.equals(tibble(var=1))

    picked = select(group_by(frame, f.a), var=f.a)
    wanted = group_by(tibble(var=1), f.var)
    assert picked.equals(wanted)
    assert group_vars(picked) == group_vars(wanted)

    # keywords `exclude` / `include` on an ungrouped frame
    picked = select(frame, exclude=f.a)
    assert picked.equals(tibble(exclude=1))
    picked = select(frame, include=f.a)
    assert picked.equals(tibble(include=1))

    # keyword `exclude` on a grouped frame
    picked = select(group_by(frame, f.a), exclude=f.a)
    wanted = group_by(tibble(exclude=1), f.exclude)
    assert picked.equals(wanted)
    assert group_vars(picked) == group_vars(wanted)

    # keyword `include` on a grouped frame
    picked = select(group_by(frame, f.a), include=f.a)
    wanted = group_by(tibble(include=1), f.include)
    assert picked.equals(wanted)
    assert group_vars(picked) == group_vars(wanted)
Пример #20
0
 def rn(x):
     """Return a fixed tibble with x = 1, 2, 3; the argument is not used."""
     fixed = tibble(x=[1, 2, 3])
     return fixed
Пример #21
0
def test_sort_empty_df():
    """arrange() with no keys on an empty tibble returns an equal tibble."""
    empty = tibble()
    arranged = empty >> arrange()
    assert_tibble_equal(arranged, empty)
Пример #22
0
def test_incompatible_size_fill_with_NA():
    """bind_cols() pads the shorter frame with NA (replaced by 100 here)."""
    three_rows = tibble(x=range(1, 4))
    one_row = tibble(y=range(1, 2))

    bound = three_rows >> bind_cols(one_row)
    filled = bound.fillna(100)

    assert filled.x.tolist() == [1, 2, 3]
    assert filled.y.tolist() == [1, 100, 100]
Пример #23
0
def test_transform_hooks():
    """Exercise `pre`/`post` hooks and `meta` on a transform function.

    Registrations are made one after another on the same `times` object
    and each call is checked right after the registration it targets, so
    the statement order is significant.
    """
    @func_factory(kind="transform", data_args="x")
    def times(x, t):
        return x * t

    # This combination of arguments is expected to be rejected.
    with pytest.raises(ValueError):
        times.register(Series, meta=False, pre=1, func=None)

    # Series: no new function, only hooks. `pre` negates t, `post` adds t
    # to the result.
    times.register(
        Series,
        func=None,
        pre=lambda x, t: (x, (-t, ), {}),
        post=lambda out, x, t: out + t,
    )

    x = Series([1, 2])
    out = times(x, -1)
    assert_iterable_equal(out, [2, 3])

    # Series: replace the implementation with an addition (meta=False).
    @times.register(Series, meta=False)
    def _(x, t):
        return x + t

    out = times(x, 10)
    assert_iterable_equal(out, [11, 12])

    # SeriesGroupBy: implementation registered with meta=True.
    @times.register(SeriesGroupBy, meta=True)
    def _(x, t):
        return x + 10

    x = Series([1, 2, 1, 2]).groupby([1, 1, 2, 2])
    out = times(x, 1)
    assert_iterable_equal(out.obj, [11, 12, 11, 12])

    # SeriesGroupBy: hooks only. `pre` increments t, `post` returns the
    # result unchanged.
    times.register(
        SeriesGroupBy,
        func=None,
        pre=lambda x, t: (x, (t + 1, ), {}),
        post=lambda out, x, *args, **kwargs: out,
    )
    out = times(x, 1)
    assert_iterable_equal(out, [2, 4, 2, 4])

    # Series: a `pre` hook that returns None.
    times.register(
        Series,
        func=None,
        pre=lambda *args, **kwargs: None,
        post=lambda out, x, t: out + t,
    )
    x = Series([1, 2])
    out = times(x, 3)
    assert_iterable_equal(out, [4, 5])

    # DataFrame: element-wise power, registered with meta=True.
    @times.register(DataFrame, meta=True)
    def _(x, t):
        return x**t

    x = tibble(a=[1, 2], b=[2, 3])
    out = times(x, 3)
    assert_iterable_equal(out.a, [1, 8])
    assert_iterable_equal(out.b, [8, 27])

    # TibbleGrouped: hooks only. `pre` decrements t, `post` reorders the
    # rows to [1, 0].
    times.register(
        TibbleGrouped,
        func=None,
        pre=lambda x, t: (x, (t - 1, ), {}),
        post=lambda out, x, t: out.reindex([1, 0]),
    )
    x = x.group_by("a")
    out = times(x, 3)
    assert_iterable_equal(out.b, [6, 4])

    # TibbleGrouped: full implementation (meta=False) that multiplies by
    # t - 1 and then overwrites the cell at row 0, column 1 with 10.
    @times.register(
        TibbleGrouped,
        meta=False,
    )
    def _(x, t):
        out = x.transform(lambda d, t: d * t, 0, t - 1)
        out.iloc[0, 1] = 10
        return out

    # x is still tibble(a=[1, 2], b=[2, 3]) grouped by a
    out = times(x, 3)
    assert isinstance(out, TibbleGrouped)
    assert_iterable_equal(out.group_vars, ["a"])
    assert_iterable_equal(out.b.obj, [10, 6])
Пример #24
0
def test_complex_cols():
    """arrange() can order rows by a column of complex numbers."""
    frame = tibble(x=[1, 2, 3], y=[3 + 2j, 2 + 2j, 1 + 2j])
    arranged = frame >> arrange(f.y)
    assert_iterable_equal(arranged.x, [3, 2, 1])
Пример #25
0
def test_update_grouping():
    """Arranging a grouped frame keeps it grouped with updated group rows."""
    frame = tibble(g=[2, 2, 1, 1], x=[1, 3, 2, 4])
    arranged = frame >> group_by(f.g) >> arrange(f.x)

    assert isinstance(arranged, TibbleGrouped)
    assert group_rows(arranged) == [[0, 2], [1, 3]]
Пример #26
0
def test_df_cols():
    """arrange() can order rows by a nested data-frame column."""
    frame = tibble(x=[1, 2, 3], y=tibble(z=[3, 2, 1]))
    wanted = tibble(x=[3, 2, 1], y=tibble(z=[1, 2, 3]))

    arranged = frame >> arrange(f.y)
    # The index is reset before comparing with the freshly built frame.
    renumbered = arranged.reset_index(drop=True)
    assert renumbered.equals(wanted)
Пример #27
0
def test_na_end():
    """NA sorts last for both ascending and descending arrange()."""
    frame = tibble(x=c(4, 3, NA))  # NA makes it float

    ascending = frame >> arrange(f.x)
    assert_iterable_equal(ascending.x, [3, 4, None])

    descending = frame >> arrange(desc(f.x))
    assert_iterable_equal(descending.x, [4, 3, None])
Пример #28
0
def test_reorder_cols():
    """bind_rows() keeps the first frame's column order."""
    ordered = tibble(a=1, b=2, c=3, d=4, e=5, f=6)
    # Same columns, shuffled by `sample`.
    shuffled = ordered[sample(ordered.columns)]

    stacked = ordered >> bind_rows(shuffled)
    assert stacked.columns.tolist() == list("abcdef")
Пример #29
0
from datar.base import c, factor, letters, NA, identity, sum
from datar.dplyr import (
    n_distinct,
    summarise,
    group_by,
    pull,
)
from datar.tibble import tibble
from datar.datasets import iris
from ..conftest import assert_iterable_equal

# Module-level fixture: one column per value kind (logical, integer, factor,
# float, string), each built from the same 1, 1, 2 pattern where applicable.
# It is not referenced in the visible part of this file.
# NOTE(review): `np` is used below but is not imported in the visible import
# block — confirm numpy is imported elsewhere in the file.
df_var = tibble(
    l=c(True, False, False),  # logical
    i=c(1, 1, 2),  # integer
    # d = Sys.Date() + c(1, 1, 2),  (date column, left commented out)
    f=factor(letters[c(1, 1, 2)]),  # factor built from letters
    n=np.array(c(1, 1, 2)) + 0.5,  # float
    # t = Sys.time() + c(1, 1, 2),  (time column, left commented out)
    c=letters[c(1, 1, 2)],  # strings; `c=` is a column name only
)


def test_n_disinct_gives_the_correct_results_on_iris():
    """n_distinct() per iris column equals the number of unique values."""

    def unique_count(col):
        return len(col.unique())

    counted = iris.apply(n_distinct)
    wanted = iris.apply(unique_count)
    assert_iterable_equal(counted, wanted)


def test_n_distinct_treats_na_correctly():
    """Repeated NA values in a float vector count as one distinct value.

    Mirrors the upstream case "n_distinct treats NA correctly in the
    REALSXP case (#384)".
    """
    distinct = n_distinct(c(1.0, NA, NA), na_rm=False)
    assert distinct == 2
Пример #30
0
def test_n_distinct_respects_data():
    """n_distinct() accepts a concrete column (df.x) inside summarise()."""
    df = tibble(x=42)
    wanted = tibble(n=1)

    counted = df >> summarise(n=n_distinct(df.x))
    assert counted.equals(wanted)