Example #1
def minimal_media(
    manifest, model_folder, summarize=True, min_growth=0.1, threads=1
):
    """Calculate the minimal medium for a set of community models."""
    samples = manifest.sample_id.unique()
    paths = [
        (
            s,
            path.join(
                model_folder, manifest[manifest.sample_id == s].file.iloc[0]
            ),
        )
        for s in samples
    ]
    args = [[s, p, min_growth] for s, p in paths]
    results = workflow(_medium, args, threads)
    if any(r is None for r in results):
        raise OptimizationError(
            "Could not find a growth medium that allows the specified "
            "growth rate for all taxa in all samples :("
        )
    results = pd.concat(results, axis=0)
    if summarize:
        medium = results.groupby("reaction").flux.max().reset_index()
    else:
        medium = results
    medium["metabolite"] = medium.reaction.str.replace("EX_", "")
    return medium
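
A minimal usage sketch follows. The paths are hypothetical and assume the
models were created with the `build` workflow, which writes a manifest.csv
into the model folder.

import pandas as pd

manifest = pd.read_csv("models/manifest.csv")  # hypothetical path
medium = minimal_media(manifest, "models", summarize=True, min_growth=0.1)
print(medium.head())  # columns: reaction, flux, metabolite
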
Example #2
def tradeoff(
        manifest,
        model_folder,
        medium,
        tradeoffs=np.arange(0.1, 1.0 + 1e-6, 0.1),
        threads=1,
):
    """Run growth rate predictions for varying tradeoff values.

    Parameters
    ----------
    manifest : pandas.DataFrame
        The manifest as returned by the `build` workflow.
    model_folder : str
        The folder in which to find the files mentioned in the manifest.
    medium : pandas.DataFrame
        A growth medium. Must have columns "reaction" and "flux" denoting
        exchange reactions and their respective maximum flux.
    tradeoffs : array of floats in (0.0, 1.0]
        An array of tradeoff values to be tested. One additional simulation
        without a tradeoff (no cooperative tradeoff) will always be run and
        will have a tradeoff of "NaN".
    threads : int >=1
        The number of parallel workers to use when building models. As a
        rule of thumb you will need around 1GB of RAM for each thread.

    Returns
    -------
    pandas.DataFrame
        The predicted growth rates.
    """
    samples = manifest.sample_id.unique()
    paths = {
        s: path.join(model_folder,
                     manifest[manifest.sample_id == s].file.iloc[0])
        for s in samples
    }
    if any(t <= 0.0 or t > 1.0 for t in tradeoffs):
        raise ValueError("tradeoff values must lie in (0, 1] :(")
    medium = process_medium(medium, samples)
    args = [[p, tradeoffs, medium.flux[medium.sample_id == s]]
            for s, p in paths.items()]
    results = workflow(_tradeoff, args, threads)
    if all(r is None for r in results):
        raise OptimizationError(
            "All numerical optimizations failed. This indicates a problem "
            "with the solver or numerical instabilities. Check that you have "
            "CPLEX or Gurobi installed. You may also increase the abundance "
            "cutoff in `qiime micom build` to create simpler models or choose "
            "a more permissive solver tolerance.")
    results = pd.concat(results)
    return results
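
A usage sketch with hypothetical file names. The grouping at the end assumes
the returned frame carries `tradeoff` and `growth_rate` columns, which the
docstring does not spell out.

import pandas as pd

manifest = pd.read_csv("models/manifest.csv")  # hypothetical path
medium = pd.read_csv("medium.csv")             # columns: reaction, flux
rates = tradeoff(manifest, "models", medium, threads=2)
# Fraction of taxa growing (growth rate above numerical zero) per tradeoff.
print(rates.groupby("tradeoff").growth_rate.apply(lambda g: (g > 1e-6).mean()))
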
Example #3
def check_db_medium(model_db, medium, threads=1):
    """Complete a growth medium for all models in a database.

    Arguments
    ---------
    model_db : str
        A pre-built model database. If ending in `.qza` must be a Qiime 2
        artifact of type `MetabolicModels[JSON]`. Can also be a folder,
        zip (must end in `.zip`) file or None if the taxonomy contains a
        column `file`.
    medium : pd.DataFrame
        A growth medium. Must have columns "reaction" and "flux" denoting
        exchange reactions and their respective maximum flux. Cannot be
        sample-specific.
    threads : int >=1
        The number of parallel workers to use when building models. As a
        rule of thumb you will need around 1GB of RAM for each thread.

    Returns
    -------
    pd.DataFrame
        Returns an annotated manifest file with a column `can_grow` that tells you
        whether the model can grow on the (fixed) medium, and a column `growth_rate`
        that gives the growth rate.
    """
    medium = process_medium(medium, ["dummy"])
    medium.index = medium.global_id
    compressed = model_db.endswith(".qza") or model_db.endswith(".zip")
    if compressed:
        tdir = TemporaryDirectory(prefix="micom_")
    if model_db.endswith(".qza"):
        manifest = load_qiime_model_db(model_db, tdir.name)
    elif model_db.endswith(".zip"):
        manifest = load_zip_model_db(model_db, tdir.name)
    else:
        manifest = load_manifest(model_db)
    rank = manifest["summary_rank"].iloc[0]
    logger.info("Checking %d %s-level models on a medium with %d components." %
                (manifest.shape[0], rank, len(medium)))

    args = [(f, medium.flux) for f in manifest.file]
    results = workflow(_grow, args, threads)
    manifest["growth_rate"] = results
    manifest["can_grow"] = manifest.growth_rate.notna() & (manifest.growth_rate
                                                           > 1e-6)

    if compressed:
        tdir.cleanup()

    return manifest
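
A usage sketch against a hypothetical database file. The `id` column used at
the end is an assumption based on `complete_db_medium` below, which indexes
the manifest by `id`.

import pandas as pd

medium = pd.read_csv("medium.csv")  # columns: reaction, flux (not sample-specific)
checked = check_db_medium("genus_models.qza", medium, threads=2)
print(checked.can_grow.value_counts())
print(checked.loc[~checked.can_grow, "id"])  # taxa that fail to grow
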
Example #4
def db_annotations(
    model_db,
    threads=1,
):
    """Get metabolite annotations from a model DB.

    Arguments
    ---------
    model_db : str
        A pre-built model database. If ending in `.qza` must be a Qiime 2
        artifact of type `MetabolicModels[JSON]`. Can also be a folder,
        zip (must end in `.zip`) file or None if the taxonomy contains a
        column `file`.
    threads : int >=1
        The number of parallel workers to use when building models. As a
        rule of thumb you will need around 1GB of RAM for each thread.

    Returns
    -------
    pd.DataFrame
        Annotations for all exchanged metabolites.
    """
    compressed = model_db.endswith(".qza") or model_db.endswith(".zip")
    if compressed:
        tdir = TemporaryDirectory(prefix="micom_")
    if model_db.endswith(".qza"):
        manifest = load_qiime_model_db(model_db, tdir.name)
    elif model_db.endswith(".zip"):
        manifest = load_zip_model_db(model_db, tdir.name)
    else:
        manifest = load_manifest(model_db)
    rank = manifest["summary_rank"].iloc[0]
    logger.info("Getting annotations from %d %s-level models ." %
                (manifest.shape[0], rank))

    args = manifest.file.tolist()
    results = workflow(_annotate, args, threads)
    anns = pd.concat(results).drop_duplicates()

    if compressed:
        tdir.cleanup()

    return anns
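
Usage is a one-liner; the database name below is hypothetical.

anns = db_annotations("genus_models.qza", threads=2)
print(anns.head())
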
Example #5
def grow(
    manifest,
    model_folder,
    medium,
    tradeoff,
    threads=1,
    weights=None,
    atol=None,
    rtol=None
):
    """Simulate growth for a set of community models.

    Parameters
    ----------
    manifest : pandas.DataFrame
        The manifest as returned by the `build` workflow.
    model_folder : str
        The folder in which to find the files mentioned in the manifest.
    medium : pandas.DataFrame
        A growth medium. Must have columns "reaction" and "flux" denoting
        exchange reactions and their respective maximum flux.
    tradeoff : float in (0.0, 1.0]
        A tradeoff value. Can be chosen by running the `tradeoff` workflow or
        by experience. Tradeoff values of 0.5 for metagenomics data and 0.3
        for 16S data seem to work well.
    threads : int >=1
        The number of parallel workers to use when building models. As a
        rule of thumb you will need around 1GB of RAM for each thread.
    weights : str
        Used during the calculation of the minimal import rates.
        Will scale the fluxes by a weight factor. Can either be "mass" which will
        scale by molecular mass, a single element which will scale by
        the elemental content (for instance "C" to scale by carbon content).
        If None every metabolite will receive the same weight.
        Will be ignored if `minimize_components` is True.
    atol : float
        Absolute tolerance for the growth rates. If None will use the solver tolerance.
    rtol : float
        Relative tolerance for the growth rates. If None will use the solver tolerance.

    Returns
    -------
    GrowthResults
        A named tuple containing the growth rates and exchange fluxes for all
        samples/models.
    """
    samples = manifest.sample_id.unique()
    paths = {
        s: path.join(
            model_folder, manifest[manifest.sample_id == s].file.iloc[0])
        for s in samples
    }
    medium = process_medium(medium, samples)
    args = [
        [p, tradeoff, medium.flux[medium.sample_id == s], weights, atol, rtol]
        for s, p in paths.items()
    ]
    results = workflow(_growth, args, threads)
    if all(r is None for r in results):
        raise OptimizationError(
            "All numerical optimizations failed. This indicates a problem "
            "with the solver or numerical instabilities. Check that you have "
            "CPLEX or Gurobi installed. You may also increase the abundance "
            "cutoff to create simpler models."
        )
    growth = pd.concat(r["growth"] for r in results if r is not None)
    growth = growth[growth.taxon != "medium"]
    exchanges = pd.concat(r["exchanges"] for r in results if r is not None)
    exchanges["taxon"] = exchanges.index
    exchanges = exchanges.melt(
        id_vars=["taxon", "sample_id", "tolerance"],
        var_name="reaction", value_name="flux"
    ).dropna(subset=["flux"])
    abundance = growth[["taxon", "sample_id", "abundance"]]
    exchanges = pd.merge(exchanges, abundance,
                         on=["taxon", "sample_id"], how="outer")
    anns = pd.concat(
        r["annotations"] for r in results if r is not None
    ).drop_duplicates()
    anns.index = anns.reaction
    exchanges["metabolite"] = anns.loc[exchanges.reaction, "metabolite"].values
    exchanges["direction"] = DIRECTION[
        (exchanges.flux > 0.0).astype(int)
    ].values
    exchanges = exchanges[exchanges.flux.abs() > exchanges.tolerance]

    return GrowthResults(growth, exchanges, anns)
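
A usage sketch with hypothetical files. The unpacking order follows the
`GrowthResults(growth, exchanges, anns)` return statement above.

import pandas as pd

manifest = pd.read_csv("models/manifest.csv")  # hypothetical path
medium = pd.read_csv("medium.csv")             # columns: reaction, flux
results = grow(manifest, "models", medium, tradeoff=0.5, threads=2)
growth, exchanges, annotations = results
print(growth.head())     # per-taxon growth rates
print(exchanges.head())  # long format: taxon, sample_id, reaction, flux, ...
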
Example #6
def fix_medium(
    manifest,
    model_folder,
    medium,
    min_growth=0.1,
    max_import=1,
    minimize_components=False,
    summarize=True,
    weights=None,
    threads=1,
):
    """Augment a growth medium so all community members can grow in it.

    Arguments
    ---------
    manifest : pandas.DataFrame
        The manifest as returned by the `build` workflow.
    model_folder : str
        The folder in which to find the files mentioned in the manifest.
    medium : pandas.DataFrame
        A growth medium with columns `reaction` and `flux` denoting exchange
        reactions and their respective positive import fluxes.
    min_growth : positive float
        The minimum biomass production required for growth.
    max_import : positive float
        The maximum import rate for added imports.
    minimize_components : boolean
        Whether to minimize the number of media components rather than the
        total flux.
    summarize : boolean
        Whether to summarize the medium across all samples. If False will
        return a medium for each sample.
    weights : str
        Will scale the fluxes by a weight factor. Can either be "mass" which will
        scale by molecular mass, a single element which will scale by
        the elemental content (for instance "C" to scale by carbon content).
        If None every metabolite will receive the same weight.
        Will be ignored if `minimize_components` is True.
    threads : int
        The number of processes to use.

    Returns
    -------
    pandas.DataFrame
        A new growth medium with the smallest amount of augmentations such
        that all members of the community can grow in it.

    """
    if not isinstance(medium, pd.DataFrame):
        raise ValueError("`medium` must be a DataFrame.")

    samples = manifest.sample_id.unique()
    paths = {
        s: path.join(model_folder,
                     manifest[manifest.sample_id == s].file.iloc[0])
        for s in samples
    }
    medium = process_medium(medium, samples)
    if (medium.flux < 1e-6).any():
        medium.loc[medium.flux < 1e-6, "flux"] = 1e-6
        logger.info(
            "Some import rates were too small and were adjusted to 1e-6.")
    args = [[
        s, p, min_growth, max_import, minimize_components,
        medium.flux[medium.sample_id == s], weights
    ] for s, p in paths.items()]
    res = workflow(_fix_medium, args, n_jobs=threads, unit="model(s)")
    if all(r is None for r in res):
        raise OptimizationError(
            "All optimizations failed. You may need to increase `max_import` "
            "or lower the target growth rate.")
    final = pd.concat(res)
    if summarize:
        final = (final.groupby(["reaction", "metabolite",
                                "description"]).flux.max().reset_index())
    return final
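
A usage sketch with hypothetical inputs; note that `medium` must be a
DataFrame here, not a Series.

import pandas as pd

manifest = pd.read_csv("models/manifest.csv")  # hypothetical path
medium = pd.read_csv("medium.csv")             # columns: reaction, flux
fixed = fix_medium(manifest, "models", medium, min_growth=0.1,
                   max_import=1, summarize=True, threads=2)
print(fixed.head())  # summarized: reaction, metabolite, description, flux
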
Example #7
def complete_db_medium(
    model_db,
    medium,
    growth=0.001,
    max_added_import=1,
    minimize_components=False,
    weights=None,
    threads=1,
):
    """Complete a growth medium for all models in a database.

    Arguments
    ---------
    model_db : str
        A pre-built model database. If ending in `.qza` must be a Qiime 2
        artifact of type `MetabolicModels[JSON]`. Can also be a folder,
        zip (must end in `.zip`) file or None if the taxonomy contains a
        column `file`.
    medium : pd.DataFrame
        A growth medium. Must have columns "reaction" and "flux" denoting
        exchange reactions and their respective maximum flux. Cannot be
        sample-specific.
    growth : positive float or pandas.Series
        The minimum growth rate the model has to achieve with the (fixed) medium. If
        a Series will have a minimum growth rate for each id/taxon in the model db.
    max_added_import : positive float
        Maximum import flux for each added additional import not included in the growth
        medium. If positive will expand the medium with additional imports in order to
        fulfill the growth objective.
    minimize_components : boolean
        Whether to minimize the number of components instead of the total
        import flux. Might be more intuitive if set to True but may also be
        slow to calculate.
    weights : str
        Will scale the fluxes by a weight factor. Can either be "mass" which will
        scale by molecular mass, a single element which will scale by
        the elemental content (for instance "C" to scale by carbon content).
        If None every metabolite will receive the same weight.
        Will be ignored if `minimize_components` is True.
    threads : int >=1
        The number of parallel workers to use when building models. As a
        rule of thumb you will need around 1GB of RAM for each thread.

    Returns
    -------
    tuple of (manifest, import fluxes)
        Returns an annotated manifest file with a column `can_grow` that tells you
        whether the model can grow on the (fixed) medium, and a column `added` that
        gives the number of added imports apart from the ones in the medium.
    """
    medium = process_medium(medium, ["dummy"])
    medium.index = medium.global_id
    compressed = model_db.endswith(".qza") or model_db.endswith(".zip")
    if compressed:
        tdir = TemporaryDirectory(prefix="micom_")
    if model_db.endswith(".qza"):
        manifest = load_qiime_model_db(model_db, tdir.name)
    elif model_db.endswith(".zip"):
        manifest = load_zip_model_db(model_db, tdir.name)
    else:
        manifest = load_manifest(model_db)
    rank = manifest["summary_rank"].iloc[0]
    logger.info("Checking %d %s-level models on a medium with %d components." %
                (manifest.shape[0], rank, len(medium)))
    if not isinstance(growth, pd.Series):
        growth = pd.Series(growth, index=manifest.id)

    manifest.index = manifest.id
    args = [(
        manifest.loc[i, "file"],
        medium.flux,
        growth[i],
        max_added_import,
        minimize_components,
        weights,
    ) for i in manifest.index]
    results = workflow(_try_complete, args, threads)
    manifest["can_grow"] = [r[0] for r in results]
    manifest["added"] = [r[1] for r in results]
    imports = pd.DataFrame.from_records([r[2] for r in results]).fillna(0.0)
    imports.index = manifest.id

    if compressed:
        tdir.cleanup()

    return (manifest, imports)
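
A usage sketch; a per-taxon minimum growth rate can also be passed as a
pandas.Series keyed by model id.

import pandas as pd

medium = pd.read_csv("medium.csv")  # columns: reaction, flux
manifest, imports = complete_db_medium(
    "genus_models.qza", medium, growth=0.001, max_added_import=1, threads=2
)
print(manifest[["can_grow", "added"]].head())
print(imports.head())  # import fluxes, one row per model id
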
Example #8
def build(
    taxonomy, model_db, out_folder, cutoff=0.0001, threads=1, solver=None,
):
    """Builds a series of community models.

    This is a best-practice implementation of building community models
    for several samples in parallel.

    Parameters
    ----------
    taxonomy : pandas.DataFrame
        The taxonomy used for building the model. Must have at least the
        columns "id" and "sample_id". This must also
        contain at least a column with the same name as the rank used in
        the model database. Thus, for a genus-level database you will need
        a column `genus`. Additional taxa ranks can also be specified and
        will be used to be more stringent in taxa matching.
        Finally, the taxonomy should contain a column `abundance`. It will
        be used to quantify each individual in the community. If absent,
        MICOM will assume all individuals are present in the same amount.
    model_db : str
        A pre-built model database. If ending in `.qza` must be a Qiime 2
        artifact of type `MetabolicModels[JSON]`. Can also be a folder,
        zip (must end in `.zip`) file or None if the taxonomy contains a
        column `file`.
    out_folder : str
        The built models and a manifest file will be written to this
        folder.
    cutoff : float in [0.0, 1.0]
        Abundance cutoff. Taxa with a relative abundance smaller than this
        will not be included in the model.
    threads : int >=1
        The number of parallel workers to use when building models. As a
        rule of thumb you will need around 1GB of RAM for each thread.
    solver : str
        Name of the solver used for the linear and quadratic problems.

    Returns
    -------
    pandas.DataFrame
        The manifest for the built models. Contains taxa abundances,
        build metrics and file basenames.

    """
    os.makedirs(out_folder, exist_ok=True)
    samples = taxonomy.sample_id.unique()
    out_path = pd.Series(
        {s: os.path.join(out_folder, s + ".pickle") for s in samples}
    )
    args = [
        [s, taxonomy[taxonomy.sample_id == s], model_db,
         out_path[s], cutoff, solver]
        for s in samples
    ]
    res = workflow(build_and_save, args, threads)
    metrics = pd.concat(res)
    taxonomy = (
        taxonomy.groupby("sample_id").apply(_reduce_group)
        .dropna(axis=1).reset_index(drop=True)
    )
    taxonomy = taxonomy.loc[:, ~taxonomy.columns.isin(_ranks)]
    taxonomy["file"] = taxonomy.sample_id + ".pickle"
    taxonomy = pd.merge(taxonomy, metrics, on="sample_id")
    taxonomy.to_csv(os.path.join(out_folder, "manifest.csv"), index=False)
    return taxonomy
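
A usage sketch; the taxonomy file and database name are hypothetical.

import pandas as pd

taxonomy = pd.read_csv("taxonomy.csv")  # columns: id, sample_id, genus, abundance, ...
manifest = build(taxonomy, "genus_models.qza", "models", cutoff=1e-4, threads=2)
print(manifest.head())  # taxa abundances, build metrics, file basenames
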
Example #9
def build_database(
    manifest, out_path, rank="genus", threads=1, compress=None, progress=True
):
    """Create a model database from a set of SBML files.

    Note
    ----
    A manifest for the joined models will also be written to the output folder
    as "manifest.csv". This may contain NA entries for additional columns
    that had different values within the summarized models.

    Parameters
    ----------
    manifest : pandas.DataFrame
        A manifest of SBML files containing their filepath as well as taxonomy.
        Must contain the columns "file", "kingdom", "phylum", "class",
        "order", "family", "genus", and "species". May contain additional
        columns.
    out_path : str
        The directory where the joined models will be written.
    rank : str
        The taxonomic rank on which to summarize the models, for instance
        "genus" (the default).
    threads : int >=1
        The number of parallel workers to use when building models. As a
        rule of thumb you will need around 1GB of RAM for each thread.
    compress : bool
        Whether to compress the output. Defaults to True if `out_path` ends
        with ".zip" and to False otherwise.
    progress : bool
        Whether to show a progress bar.

    Returns
    -------
    pd.DataFrame
        The manifest of the joined models. Will still contain information
        from the original metadata.
    """
    meta = manifest.copy()
    meta.columns = meta.columns.str.lower()
    if compress is None:
        compress = out_path.endswith(".zip")

    if not REQ_FIELDS.isin(meta.columns).all():
        raise ValueError(
            "Metadata File needs to have the following "
            "columns %s." % ", ".join(REQ_FIELDS)
        )
    bad = meta.file.apply(lambda x: not os.path.exists(x))
    if any(bad):
        raise ValueError(
            "The following models are in the manifest but do "
            "not exist at the specified path: %s" % meta.file[bad]
        )

    meta = meta.groupby(rank).apply(_reduce_group).reset_index(drop=True)
    logger.info("Building %d models on rank `%s`." % (meta.shape[0], rank))
    meta.index = meta[rank].str.replace("[^\\w\\_]", "_", regex=True)
    meta["id"] = meta.index
    meta["summary_rank"] = rank

    if compress:
        with TemporaryDirectory(prefix="micom_") as tdir:
            args = [
                (tid, row, os.path.join(tdir, "%s.json" % tid))
                for tid, row in meta.iterrows()
            ]
            workflow(_summarize_models, args, threads)
            meta.file = meta.index + ".json"
            meta.to_csv(os.path.join(tdir, "manifest.csv"), index=False)
            with ZipFile(out_path, "w") as zf:
                for a in args:
                    zf.write(a[2], os.path.basename(a[2]))
                zf.write(os.path.join(tdir, "manifest.csv"), "manifest.csv")
    else:
        os.makedirs(out_path, exist_ok=True)
        args = [
            (tid, row, os.path.join(out_path, "%s.json" % tid))
            for tid, row in meta.iterrows()
        ]
        workflow(_summarize_models, args, threads)
        meta.file = meta.index + ".json"
        meta.to_csv(os.path.join(out_path, "manifest.csv"), index=False)

    return meta
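
A usage sketch; writing to a path ending in ".zip" triggers the compressed
branch above.

import pandas as pd

sbml_manifest = pd.read_csv("sbml_manifest.csv")  # file path + taxonomy columns
db_manifest = build_database(sbml_manifest, "genus_db.zip", rank="genus", threads=2)
print(db_manifest[["id", "file", "summary_rank"]].head())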