def test_auto_splicing():
    """Tibbles and `across()` results given to `group_by()` are spliced.

    Each section groups `iris` in two ways and expects equal results.
    """
    # A one-column tibble vs. naming the column directly.
    by_column = iris >> group_by(f.Species)
    by_tibble = iris >> group_by(tibble(Species=iris.Species))
    assert by_column.equals(by_tibble)

    # across() over a single column vs. naming the column directly.
    by_column = iris >> group_by(f.Species)
    by_across = iris >> group_by(across(f.Species))
    assert by_column.equals(by_across)

    # across() with a function vs. mutating first and grouping afterwards.
    rounded = iris >> mutate(across(starts_with("Sepal"), round))
    expected = rounded >> group_by(f.Sepal_Length, f.Sepal_Width)
    actual = iris >> group_by(across(starts_with("Sepal"), round))
    assert expected.equals(actual)

    # across(character()), across(NULL) not supported

    # across() followed by an extra grouping column.
    rounded = iris >> mutate(across(starts_with("Sepal"), round))
    expected = rounded >> group_by(f.Sepal_Length, f.Sepal_Width, f.Species)
    actual = iris >> group_by(across(starts_with("Sepal"), round), f.Species)
    assert expected.equals(actual)

    # across() preceded by an extra grouping column.
    rounded = iris >> mutate(across(starts_with("Sepal"), round))
    expected = rounded >> group_by(f.Species, f.Sepal_Length, f.Sepal_Width)
    actual = iris >> group_by(f.Species, across(starts_with("Sepal"), round))
    assert expected.equals(actual)
def test_length1_vectors_are_recycled():
    """A scalar fills every row; a list of the wrong length raises."""
    frame = tibble(x=range(1, 5))

    recycled = mutate(frame, y=1)
    assert recycled.y.tolist() == [1, 1, 1, 1]

    # Two values cannot be recycled across four rows.
    with pytest.raises(ValueError, match="does not match length"):
        mutate(frame, y=[1, 2])
def test_row_number_with_groups():
    """`row_number()` restarts in each group and supports arithmetic."""
    grouped = tibble(x=[3, 3, 4, 4]).group_by("x")

    numbered = grouped >> mutate(n=row_number())
    assert_iterable_equal(numbered.n.obj, [1, 2, 1, 2])

    shifted = grouped >> mutate(n=row_number() + 1)
    assert_iterable_equal(shifted.n.obj, [2, 3, 2, 3])
def test_preserves_grouping():
    """Grouping survives `mutate()`, whether or not the group column changes."""
    grouped = group_by(tibble(x=[1, 2], y=2), f.x)

    # Overwriting the grouping column with a constant leaves a single group.
    overwritten = mutate(grouped, x=1)
    assert group_vars(overwritten) == ["x"]
    assert nrow(group_data(overwritten)) == 1

    # Adding an unrelated column leaves the group data as it was.
    extended = mutate(grouped, z=1)
    assert group_data(extended).equals(group_data(grouped))
def test_preserves_names():
    """Column names of a tibble assigned as a column are kept."""
    base = tibble(a=range(1, 4))

    # note it's treated as data frame
    scalar_cols = dict(zip(letters[:3], [0, 1, 2]))
    list_cols = dict(zip(letters[:3], [[0], [1], [2]]))
    from_scalars = base >> mutate(b=tibble(**scalar_cols))
    from_lists = base >> mutate(b=tibble(**list_cols))

    assert_iterable_equal(from_scalars["b"].columns, list("abc"))
    assert_iterable_equal(from_lists["b"].columns, list("abc"))
def test_keep_none_only_keeps_grouping_variables():
    """`_keep="none"` keeps only new columns, plus grouping columns if any."""
    plain = tibble(x=1, y=2)
    grouped = group_by(plain, f.x)

    kept = mutate(plain, z=1, _keep="none")
    assert kept.columns.tolist() == ["z"]

    kept = mutate(grouped, z=1, _keep="none")
    assert kept.columns.tolist() == ["x", "z"]
def test_unnamed_data_frames_are_automatically_unspliced():
    """Positional tibbles passed to `mutate()` contribute their columns."""
    single = tibble(a=1) >> mutate(tibble(b=2))
    assert_tibble_equal(single, tibble(a=1, b=2))

    # The later tibble wins when both define the same column.
    overridden = tibble(a=1) >> mutate(tibble(b=2), tibble(b=3))
    assert_tibble_equal(overridden, tibble(a=1, b=3))

    # A spliced column can be referenced by a later expression.
    referenced = tibble(a=1) >> mutate(tibble(b=2), c=f.b)
    assert_tibble_equal(referenced, tibble(a=1, b=2, c=2))
def test_return_one_row():
    """`across()` over an empty selection inside `mutate()`."""
    # not actually one row, but returns a corresponding series
    frame = tibble(x=range(1, 43))

    unnamed = frame >> mutate(across(c(), as_factor))
    assert unnamed.equals(frame)

    named = frame >> mutate(y=across(c(), as_factor))
    # empty column in pandas will be NAs
    assert named.y.isna().all()
def test_deals_with_0_groups():
    """`mutate()` works on a grouped frame that has no rows."""
    empty_grouped = tibble(x=[]) >> group_by(f.x)

    added = mutate(empty_grouped, y=f.x + 1)
    expected = tibble(x=[], y=[]) >> group_by(f.x)
    assert_iterable_equal(added, expected)
    assert group_vars(added) == group_vars(expected)

    # An aggregate over the empty column still yields a two-column result.
    aggregated = mutate(empty_grouped, y=max(f.x))
    assert aggregated.shape == (0, 2)
    assert group_vars(aggregated) == ["x"]
def test_cache_key():
    """Two `across()` calls differing only in function give distinct results."""
    grouped = tibble(g=rep([1, 2], each=2), a=range(1, 5)) >> group_by(f.g)

    # Same column selection, different functions.
    summary_cols = tibble(
        x=across(where(is_numeric), mean).a,
        y=across(where(is_numeric), max).a,
    )
    out = grouped >> mutate(summary_cols)

    expect = grouped >> mutate(x=mean(f.a), y=max(f.a))
    assert_frame_equal(out, expect)
def test_group_by_keeps_the_right_order_of_subdfs():
    """A grouped `mutate()` returns values in the original row order."""
    first_keys = ["a", "b", "c", "a", "b", "c", "a", "b", "c"]
    second_keys = ["a", "b", "c", "a", "b", "c", "a", "b", "b"]
    frame = tibble(g1=first_keys, g2=second_keys) >> mutate(x=range(9))

    regrouped = frame >> group_by(f.g1, f.g2)
    out = regrouped >> mutate(x=f.x)

    assert_iterable_equal(out.x.obj, range(9))
def test_works_on_empty_data_frames():
    """`mutate()` accepts a tibble with no rows and no columns."""
    empty = tibble()

    untouched = empty >> mutate()
    assert nrow(untouched) == 0
    assert len(untouched) == 0

    # An empty list adds a column without adding rows.
    with_column = empty >> mutate(x=[])
    assert with_column.columns.tolist() == ["x"]
    assert nrow(with_column) == 0
    assert ncol(with_column) == 1
def test_handles_data_frame_columns():
    """A tibble can become a column of plain, grouped and rowwise frames."""
    base = tibble(a=c(1, 2, 3), b=c(2, 3, 4), base_col=c(3, 4, 5))

    plain = mutate(base, new_col=tibble(x=[1, 2, 3]))
    assert_tibble_equal(plain["new_col"], tibble(x=[1, 2, 3]))

    grouped = mutate(group_by(base, f.a), new_col=tibble(x=f.a))
    assert_iterable_equal(grouped["new_col"].x.obj, [1, 2, 3])

    row_frame = rowwise(base, f.a)
    per_row = mutate(row_frame, new_col=tibble(x=f.a))
    expected = tibble(x=[1, 2, 3]) >> rowwise()
    assert_tibble_equal(per_row["new_col"], expected)
def test_lead_lag_inside_mutates_handles_expressions_as_value_for_default():
    """`lead()`/`lag()` take an expression or a list as `default`."""
    frame = tibble(x=[1, 2, 3])

    # Default given as an expression on the frame's own column.
    shifted = mutate(
        frame,
        leadn=lead(f.x, default=f.x[0]),
        lagn=lag(f.x, default=f.x[0]),
    )
    assert_iterable_equal(shifted.leadn, lead(frame.x, default=frame.x[0]))
    assert_iterable_equal(shifted.lagn, lag(frame.x, default=frame.x[0]))

    # Default given as a one-element list.
    shifted = mutate(
        frame,
        leadn=lead(f.x, default=[1]),
        lagn=lag(f.x, default=[1]),
    )
    assert_iterable_equal(shifted.leadn, lead(frame.x, default=[1]))
    assert_iterable_equal(shifted.lagn, lag(frame.x, default=[1]))
def test_mutate_cols_inside_func():
    """Column expressions work as arguments of a registered function."""
    frame = tibble(x=2, y=4, z=8)

    @register_func(None, context=None)
    def data_frame(**kwargs):
        return tibble(**kwargs)

    ratios = data_frame(x=f.x / f.y, y=f.y / f.y, z=f.z / f.y)
    out = frame >> mutate(ratios)

    # frame.y does not work on grouped data
    expect = frame >> mutate(across(everything(), lambda col: col / frame.y))
    assert out.equals(expect)
def test_works_sequentially():
    """Later `mutate()` expressions see the columns made by earlier ones."""
    frame = tibble(a=1)

    # `y` counts one more numeric column than `x` because `x` already exists.
    counted = frame >> mutate(
        x=ncol(across(where(is_numeric))),
        y=ncol(across(where(is_numeric))),
    )
    assert counted.equals(tibble(a=1, x=1, y=2))

    # Once `a` is replaced by a string, no numeric column is left to count.
    replaced = frame >> mutate(a="x", y=ncol(across(where(is_numeric))))
    assert replaced.equals(tibble(a="x", y=0))
def test_empty_mutate_returns_input():
    """`mutate()` with no expressions returns its input, grouping included."""
    df = tibble(x=1)
    gf = group_by(df, f.x)

    out = mutate(df)
    assert out.equals(df)

    out = mutate(gf)
    assert_tibble_equal(out, gf)
    # Assert on the result rather than on the input `gf`, which is grouped by
    # construction and so says nothing about what mutate() returned.
    assert isinstance(out, TibbleGrouped)
    assert group_vars(out) == ["x"]
def test_can_use_before_and_after_to_control_column_position():
    """`_before` and `_after` decide where a new column is placed."""
    frame = tibble(x=1, y=2)

    appended = mutate(frame, z=1)
    assert appended.columns.tolist() == ["x", "y", "z"]

    placed_before = mutate(frame, z=1, _before=1)
    assert placed_before.columns.tolist() == ["x", "z", "y"]

    placed_after = mutate(frame, z=1, _after=0)
    assert placed_after.columns.tolist() == ["x", "z", "y"]

    # Assigning to an existing column leaves the column order as it was.
    frame = tibble(x=1, y=2)
    reassigned = mutate(frame, x=1, _after=f.y)
    assert reassigned.columns.tolist() == ["x", "y"]
def test_if_any_all_enforce_bool():
    """`if_all()`/`if_any()` results are usable as booleans."""
    frame = tibble(x=10, y=10)

    # As filter predicates, both keep the single row.
    kept = frame >> filter(if_all(f[f.x:f.y], identity))
    assert_frame_equal(kept, frame)

    kept = frame >> filter(if_any(f[f.x:f.y], identity))
    assert_frame_equal(kept, frame)

    # As mutate expressions, both produce True.
    flagged = frame >> mutate(ok=if_all(f[f.x:f.y], identity))
    assert_frame_equal(flagged, mutate(frame, ok=True))

    flagged = frame >> mutate(ok=if_any(f[f.x:f.y], identity))
    assert_frame_equal(flagged, mutate(frame, ok=True))
def test_cur_data_all_sequentially():
    """`cur_data()` and `cur_data_all()` see columns added earlier in the call."""
    frame = tibble(a=1)
    counted = frame >> mutate(
        x=cur_data().transform(ncol),
        y=cur_data().transform(ncol),
    )
    assert counted.equals(tibble(a=1, x=1, y=2))

    # Grouped case, using cur_data_all().
    grouped = tibble(a=1, b=2) >> group_by(f.a)
    counted = grouped >> mutate(
        x=cur_data_all().transform(ncol),
        y=cur_data_all().transform(ncol),
    )
    assert counted.equals(tibble(a=1, b=2, x=2, y=3))
def test_applied_progressively():
    """`mutate()` expressions are evaluated left to right."""
    frame = tibble(x=1)

    # A new column can be used right away.
    chained = frame >> mutate(y=f["x"] + 1, z=f.y + 1)
    assert_tibble_equal(chained, tibble(x=1, y=2, z=3))

    # An existing column can be overwritten using a new column.
    rewritten = frame >> mutate(y=f.x + 1, x=f.y + 1)
    assert_tibble_equal(rewritten, tibble(x=3, y=2))

    # An overwritten column is seen with its new value.
    copied = frame >> mutate(x=2, y=f.x)
    assert_tibble_equal(copied, tibble(x=2, y=2))

    # A plain column reference behaves like a computed expression.
    frame = tibble(x=1, y=2)
    by_reference = frame >> mutate(x2=f.x, x3=f.x2 + 1)
    by_expression = frame >> mutate(x2=f.x + 0, x3=f.x2 + 1)
    assert_tibble_equal(by_reference, by_expression)
def test_attrgetter():
    """`attrgetter(..., "str")` and `pd_str()` give access to string methods."""
    frame = tibble(x=list("abc"))

    upper = frame >> mutate(y=attrgetter(f.x, "str").upper())
    assert_iterable_equal(upper.y, ["A", "B", "C"])

    upper = frame >> mutate(y=pd_str(f.x).upper())
    assert_iterable_equal(upper.y, ["A", "B", "C"])

    # Same two accessors on a grouped frame.
    grouped = frame >> group_by(g=1)

    upper = grouped >> mutate(y=attrgetter(f.x, "str").upper())
    assert_iterable_equal(upper.y.obj, ["A", "B", "C"])

    upper = grouped >> mutate(y=pd_str(f.x).upper())
    assert_iterable_equal(upper.y.obj, ["A", "B", "C"])
def test_rowwise_preserved_by_major_verbs():
    """Every verb except `summarise()` returns a rowwise frame."""
    row_frame = rowwise(tibble(x=range(1, 6), y=range(5, 0, -1)), f.x)

    result = arrange(row_frame, f.y)
    assert isinstance(result, TibbleRowwise)
    assert group_vars(result) == ["x"]

    result = filter(row_frame, f.x < 3)
    assert isinstance(result, TibbleRowwise)
    assert group_vars(result) == ["x"]

    result = mutate(row_frame, x=f.x + 1)
    assert isinstance(result, TibbleRowwise)
    assert group_vars(result) == ["x"]

    # Renaming the group column renames the group variable too.
    result = rename(row_frame, X=f.x)
    assert isinstance(result, TibbleRowwise)
    assert group_vars(result) == ["X"]

    result = select(row_frame, "x")
    assert isinstance(result, TibbleRowwise)
    assert group_vars(result) == ["x"]

    result = slice(row_frame, c(0, 0))
    assert isinstance(result, TibbleRowwise)
    assert group_vars(result) == ["x"]

    # Except for summarise
    result = summarise(row_frame, z=mean(f.x, f.y))
    assert isinstance(result, TibbleGrouped)
    assert group_vars(result) == ["x"]
def test_mutate_internally():
    """`distinct()` on a named expression matches mutate-then-distinct."""
    frame = tibble(g=c(1, 2), x=c(1, 2))

    direct = frame >> distinct(aa=f.g * 2)

    with_column = frame >> mutate(aa=f.g * 2)
    two_step = with_column >> distinct(f.aa)

    assert direct.equals(two_step)
def test_order_by():
    """`order_by()` works inside `mutate()` and raises when called outside."""
    frame = tibble(x=f[1:6])

    ordered = frame >> mutate(y=order_by(f[5:], cumsum(f.x)))
    assert_iterable_equal(ordered.y, [15, 14, 12, 9, 5])

    # Called directly on plain sequences, it raises.
    with pytest.raises(ValueError):
        order_by(seq(5, 1), cumsum(seq(1, 5)))
def test_0col_df_in_results_ignored():
    """A zero-column tibble passed to `summarise()` adds nothing."""
    keys_only = tibble(x=[1, 2])

    summarised = keys_only >> group_by(f.x) >> summarise(tibble())
    assert summarised.equals(keys_only)

    summarised = keys_only >> group_by(f.x) >> summarise(tibble(), y=65)
    expected = keys_only >> mutate(y=65)
    assert summarised.equals(expected)

    # Same checks when the input has a non-grouping column.
    wider = tibble(x=[1, 2], y=[3, 4])

    summarised = wider >> group_by(f.x) >> summarise(tibble())
    assert summarised.equals(keys_only)

    summarised = wider >> group_by(f.x) >> summarise(tibble(), z=98)
    expected = keys_only >> mutate(z=98)
    assert summarised.equals(expected)
def read_bed(bedfile, bedidx):
    """Read a BED file and return the path of the file to use afterwards.

    Args:
        bedfile: Path of a tab-separated BED file without a header row.
        bedidx: Index of this file, used to look up `stems` and
            `ignore_scores`.

    Returns:
        `bedfile` itself when it has a score column and `bedidx` is not in
        `ignore_scores`. Otherwise the path of a copy, written next to
        `outfile`, whose score is `end - start`.
    """
    _log("- Reading BED file:", bedfile)
    rewritten = outfile.parent / f"_{stems[bedidx]}.bed"
    bed = pandas.read_csv(bedfile, sep="\t", header=None)

    # Standard BED field names; only as many as the file has columns are used.
    bed_fields = [
        "chrom",
        "start",
        "end",
        "name",
        "score",
        "strand",
        "thickStart",
        "thickEnd",
        "itemRgb",
        "blockCount",
        "blockSizes",
        "blockStarts",
    ]
    bed.columns = bed_fields[:len(bed.columns)]

    has_usable_score = "score" in bed.columns and bedidx not in ignore_scores
    if has_usable_score:
        # The input already has the scores to use; nothing is rewritten.
        return bedfile

    # NOTE(review): with fewer than five input columns the new score column
    # is appended last, so it is not the fifth field - confirm consumers.
    bed = bed >> mutate(score=f.end - f.start)
    bed.to_csv(rewritten, sep="\t", index=False, header=False)
    return rewritten
def test_zero_row_dfs():
    """The main verbs work on a grouped frame that has no rows."""
    empty = tibble(a=[], b=[], g=[])
    grouped = group_by(empty, f.g, _drop=False)
    assert grouped.shape == (0, 3)
    assert group_vars(grouped) == ["g"]
    assert group_size(grouped) == []

    # summarise() drops the grouping.
    result = summarise(grouped, n=n())
    assert result.shape == (0, 2)
    assert group_vars(result) == []

    result = mutate(grouped, c=f.b + 1)
    assert result.shape == (0, 4)
    assert group_vars(result) == ["g"]
    assert group_size(result) == []

    result = filter(grouped, f.a == 100)
    assert result.shape == (0, 3)
    assert group_vars(result) == ["g"]
    assert group_size(result) == []

    result = arrange(grouped, f.a, f.g)
    assert result.shape == (0, 3)
    assert group_vars(result) == ["g"]
    assert group_size(result) == []

    # select() keeps the grouping column next to the selected one.
    result = select(grouped, f.a)
    assert result.shape == (0, 2)
    assert group_vars(result) == ["g"]
    assert group_size(result) == []
def test_nb_fail():
    """`across()` accepts a `where()` selector combined with a negated set."""
    from datar.datasets import iris

    not_petal = ~c(f["Petal_Length"], f["Petal_Width"])
    selection = where(is_double) & not_petal
    out = iris >> mutate(across(selection, round))

    rows = out >> nrow()
    assert rows == 150
def test_if_any_all_in_mutate():
    """`if_any()` and `if_all()` produce row-wise booleans in `mutate()`."""
    frame = tibble(x=c(1, 5, 10, 10), y=c(0, 0, 0, 10), z=c(10, 5, 1, 10))

    # The `all` slice ends at `any`, a column created in the same call.
    flagged = frame >> mutate(
        any=if_any(f[f.x:], lambda x: x > 8),
        all=if_all(f[f.x:f.any], lambda x: x > 8),
    )

    assert_iterable_equal(flagged["any"], [True, False, True, True])
    assert_iterable_equal(flagged["all"], [False, False, False, True])