def test_and_or():
    """Exercise `&` and `|` on column collections and inside `mutate`."""
    frame = tibble(x=1, y=2, z=3, w=4)

    # `&` between two column collections keeps only the shared column.
    common = frame >> select(c(f.x, f.y) & c(f.y, f.z))
    assert common.columns.tolist() == ["y"]

    # Inside `mutate`, `&` on two column references is expected to give
    # a boolean column.
    both_columns = frame >> mutate(a=f.x & f.y)
    assert both_columns.a.tolist() == [True]

    # A plain Python bool on the left-hand side is accepted as well.
    bool_and_column = frame >> mutate(a=True & f.y)
    assert bool_and_column.a.tolist() == [True]

    # `|` between two column collections gives their union, in order.
    either = frame >> select(c(f.x, f.y) | c(f.y, f.z))
    assert either.columns.tolist() == ["x", "y", "z"]
def test_inv():
    """Exercise `~` as column exclusion and as boolean negation."""
    # In `select`, `~` drops the referenced column.
    numbers = tibble(x=1, y=2)
    remaining = numbers >> select(~f.x)
    assert remaining.columns.tolist() == ["y"]

    # In `mutate`, `~` negates the boolean values of the column.
    flags = tibble(x=True)
    negated = flags >> mutate(y=~f.x)
    assert negated.y.tolist() == [False]
def gene_name_conversion(
    genes,
    species,
    infmt,
    outfmt,
    notfound,
):
    """Translate gene names between formats via MyGeneInfo

    Args:
        genes: A sequence of genes to look up
        species: The species the query is restricted to
            Supported: human, mouse, rat, fruitfly, nematode, zebrafish,
            thale-cress, frog and pig
        infmt: The format of the given gene names (used as query scopes)
            Available fields
            https://docs.mygene.info/en/latest/doc/query_service.html#available-fields
        outfmt: The target gene name format. A string is split on commas
            into individual field names after the query has been sent.
        notfound: How to handle genes that cannot be converted.
            use-query: Fill the missing converted names with the query name
            skip: Drop the rows whose first `outfmt` field is missing
            error: Raise an error if the first `outfmt` field is missing
                for any gene
            Any other value returns the joined result as is.

    Returns:
        A dataframe with the `query` column and the `outfmt` column(s).

    Raises:
        QueryGenesNotFound: When `notfound` is "error" and some genes have
            no value in the first `outfmt` field.
    """
    hits = mygene.querymany(
        genes,
        scopes=infmt,
        fields=outfmt,
        as_dataframe=True,
        df_index=False,
        species=species,
    )
    # Keep the top-scored hit of each query, then drop the bookkeeping
    # columns returned along with the requested fields.
    best_hits = (
        hits
        >> group_by(f.query)
        >> arrange(desc(f._score))
        >> slice_head(1)
        >> select(~c(f._id, f._score, f.notfound))
    )

    fields = outfmt
    if isinstance(fields, str):
        fields = [field.strip() for field in fields.split(",")]

    # Left-join onto the input genes so unmatched genes remain as rows
    # with missing values in the converted columns.
    result = tibble(query=genes) >> left_join(best_hits, by=f.query)

    if notfound == "use-query":
        return result >> mutate(
            across(
                fields,
                lambda col, query: if_else(is_na(col), query, col),
                query=f.query,
            )
        )

    if notfound == "skip":
        return result >> filter(~is_na(f[fields[0]]))

    if notfound == "error" and any(is_na(result[fields[0]])):
        unconverted = result >> filter(is_na(f[fields[0]])) >> pull(f.query)
        raise QueryGenesNotFound(unconverted)

    return result
from pipen import Proc from biopipen.ns.web import Download from biopipen.ns.misc import Str2File from biopipen.ns.gsea import FGSEA from datar.all import flatten, tibble, select from biopipen.core.testing import get_pipeline FGSEA = Proc.from_proc( FGSEA, requires=[Download, Str2File], input_data=lambda ch1, ch2: tibble( *flatten(ch1), ch2, _name_repair="minimal", ) >> select(0, 2, 1), envs={"clscol": "Group", "classes": ["MMP9", "CTRL"]} ) def pipeline(): return get_pipeline(__file__).set_starts(Download, Str2File).set_data( [ "https://www.ncbi.nlm.nih.gov/geo/download/" "?acc=GSE179367" "&format=file" "&file=GSE179367%5Fgene%5Fcount%2Ereal%2Etxt%2Egz", "https://www.genepattern.org/tutorial/linkedFiles/" "export_gnf.GENE_SYMBOL.gmt", ], [(
def test_neg():
    """Exercise unary `-` on a sliced column reference inside `select`."""
    frame = tibble(x=1, y=2)

    # Negating the slice `f[:1]` is expected to leave only column `y`.
    kept = frame >> select(-f[:1])
    assert kept.columns.tolist() == ["y"]
def test_or_():
    """Exercise `|` between a column collection and a plain list."""
    frame = tibble(x=1, y=2, z=3)

    # The union of (x, y) and [y, z] should select every column once.
    union = frame >> select(c(f.x, f.y) | [f.y, f.z])
    expected = tibble(x=1, y=2, z=3)
    assert_frame_equal(union, expected)
outfmt=outfmt, notfound=notfound, ) converted.columns = [colname] + converted.columns[1:].tolist() if output == "only": out = converted elif output == "keep": out = df >> right_join(converted, by=colname, suffix=["", "_converted"]) elif output == "drop": out = df >> right_join( converted, by=colname, suffix=["", "_converted"] ) >> select(~c(colname)) elif output == "replace": out = df >> right_join( converted, by=colname, suffix=["", "_converted"] ) converted_cols = out.columns[-len(converted.columns)+1:].tolist() pos = df.columns.get_indexer([colname])[0] out = out >> relocate( converted_cols, _after=pos+1 ) >> select(~c(colname)) else: raise ValueError(f"Unknown output mode: {output}.") out.to_csv(outfile, **outopts)