Пример #1
0
def test_and_or():
    """Exercise `&` and `|` both on column selections and on column values."""
    frame = tibble(x=1, y=2, z=3, w=4)

    # `&` between two selections keeps only the columns present in both.
    both = frame >> select(c(f.x, f.y) & c(f.y, f.z))
    assert list(both.columns) == ["y"]

    # Inside mutate(), `&` combines the column values themselves.
    col_and_col = frame >> mutate(a=f.x & f.y)
    assert col_and_col.a.tolist() == [True]

    # The reflected form, with a plain bool on the left-hand side.
    bool_and_col = frame >> mutate(a=True & f.y)
    assert bool_and_col.a.tolist() == [True]

    # `|` between two selections keeps the columns present in either.
    either = frame >> select(c(f.x, f.y) | c(f.y, f.z))
    assert list(either.columns) == ["x", "y", "z"]
Пример #2
0
def test_inv():
    """Exercise `~` as a column-selection complement and as a value negation."""
    # In select(), `~f.x` keeps every column except `x`.
    two_cols = tibble(x=1, y=2)
    remaining = two_cols >> select(~f.x)
    assert list(remaining.columns) == ["y"]

    # In mutate(), `~f.x` inverts the boolean values of `x`.
    flags = tibble(x=True)
    inverted = flags >> mutate(y=~f.x)
    assert inverted.y.tolist() == [False]
Пример #3
0
def gene_name_conversion(
    genes,
    species,
    infmt,
    outfmt,
    notfound,
):
    """Translate gene names from one format to another via MyGeneInfo

    Args:
        genes: A sequence of genes to look up
        species: The species the query is restricted to
            Supported: human, mouse, rat, fruitfly, nematode, zebrafish,
            thale-cress, frog and pig

        infmt: The format (query scope) of the incoming gene names
            Available fields
            https://docs.mygene.info/en/latest/doc/query_service.html#available-fields
        outfmt: The format(s) to convert to; either a list of fields or
            a comma-separated string of fields
        notfound: How to treat genes that could not be converted.
            use-query: Fill the converted columns with the original name
            skip: Drop the rows whose first `outfmt` field is missing
            error: Raise `QueryGenesNotFound` listing the failed queries

    Returns:
        A dataframe with the column `query` followed by the `outfmt`
        column(s), one row per input gene (minus dropped rows for `skip`).

    Raises:
        QueryGenesNotFound: When `notfound` is "error" and the first
            `outfmt` field is missing for at least one gene.
    """
    hits = mygene.querymany(
        genes,
        scopes=infmt,
        fields=outfmt,
        as_dataframe=True,
        df_index=False,
        species=species,
    )
    # Reduce to a single, highest-scoring hit per query and discard the
    # bookkeeping columns of the response.
    best_hits = (
        hits
        >> group_by(f.query)
        >> arrange(desc(f._score))
        >> slice_head(1)
        >> select(~c(f._id, f._score, f.notfound))
    )

    # Normalise a comma-separated string into a list of field names.
    if isinstance(outfmt, str):
        outfmt = [field.strip() for field in outfmt.split(",")]

    # Start from the input genes so that every query keeps a row, in the
    # original order, whether or not it was found.
    out = tibble(query=genes) >> left_join(best_hits, by=f.query)

    if notfound == "use-query":
        return out >> mutate(
            across(
                outfmt,
                lambda col, query: if_else(is_na(col), query, col),
                query=f.query,
            )
        )

    if notfound == "skip":
        return out >> filter(~is_na(f[outfmt[0]]))

    if notfound == "error" and any(is_na(out[outfmt[0]])):
        nagenes = out >> filter(is_na(f[outfmt[0]])) >> pull(f.query)
        raise QueryGenesNotFound(nagenes)

    return out
Пример #4
0
from pipen import Proc
from biopipen.ns.web import Download
from biopipen.ns.misc import Str2File
from biopipen.ns.gsea import FGSEA
from datar.all import flatten, tibble, select
from biopipen.core.testing import get_pipeline


# Derive a process from the imported FGSEA process and rebind the module-level
# name FGSEA to the derived one, so later references in this file use it.
FGSEA = Proc.from_proc(
    FGSEA,
    # The processes this one is declared to depend on.
    requires=[Download, Str2File],
    # Builds the input frame from two arguments, ch1 and ch2 (presumably the
    # outputs of Download and Str2File, in that order -- confirm against
    # pipen). The columns of ch1 are spread out next to ch2 without renaming
    # duplicate/empty names, then reordered to positions 0, 2, 1.
    input_data=lambda ch1, ch2: tibble(
        *flatten(ch1),
        ch2,
        _name_repair="minimal",
    ) >> select(0, 2, 1),
    # Environment values overridden for this derived process.
    envs={"clscol": "Group", "classes": ["MMP9", "CTRL"]}
)


def pipeline():
    return get_pipeline(__file__).set_starts(Download, Str2File).set_data(
        [
            "https://www.ncbi.nlm.nih.gov/geo/download/"
            "?acc=GSE179367"
            "&format=file"
            "&file=GSE179367%5Fgene%5Fcount%2Ereal%2Etxt%2Egz",
            "https://www.genepattern.org/tutorial/linkedFiles/"
            "export_gnf.GENE_SYMBOL.gmt",
        ],
        [(
Пример #5
0
def test_neg():
    """Negating a column slice inside select() leaves the other columns."""
    frame = tibble(x=1, y=2)
    remaining = frame >> select(-f[:1])
    assert list(remaining.columns) == ["y"]
Пример #6
0
def test_or_():
    """`|` between a c() selection and a plain list selects their union."""
    frame = tibble(x=1, y=2, z=3)
    expected = tibble(x=1, y=2, z=3)
    selected = frame >> select(c(f.x, f.y) | [f.y, f.z])
    assert_frame_equal(selected, expected)
Пример #7
0
    outfmt=outfmt,
    notfound=notfound,
)
# Give the first column of the conversion result the name of the input
# column, so it can serve as the join key below.
converted.columns = [colname] + converted.columns[1:].tolist()

if output == "only":
    # Write the conversion table by itself.
    out = converted

elif output in ("keep", "drop", "replace"):
    # These modes all start from the same join of the input frame with the
    # conversion result; clashing converted columns get a "_converted" suffix.
    joined = df >> right_join(
        converted,
        by=colname,
        suffix=["", "_converted"],
    )

    if output == "keep":
        # Original column stays, converted columns are appended.
        out = joined

    elif output == "drop":
        # Converted columns are appended, original column is removed.
        out = joined >> select(~c(colname))

    else:
        # "replace": move the converted columns (the trailing columns of the
        # join, one per non-key column of `converted`) next to the position
        # of the original column, then remove the original column.
        first_converted = 1 - len(converted.columns)
        converted_cols = joined.columns[first_converted:].tolist()
        pos = df.columns.get_indexer([colname])[0]
        out = (
            joined
            >> relocate(converted_cols, _after=pos + 1)
            >> select(~c(colname))
        )

else:
    raise ValueError(f"Unknown output mode: {output}.")

out.to_csv(outfile, **outopts)