def test_qiime_db(tmp_path):
    # The Qiime 2 artifact must carry the expected semantic type and
    # unpack into a manifest of model files that exist on disk.
    meta = qf.metadata(db)
    assert "uuid" in meta
    assert meta["type"] == "MetabolicModels[JSON]"
    manifest = qf.load_qiime_model_db(db, str(tmp_path))
    assert manifest.shape[0] == 3
    assert all(path.exists(f) for f in manifest.file)
def check_db_medium(model_db, medium, threads=1):
    """Check a growth medium for all models in a database.

    Arguments
    ---------
    model_db : str
        A pre-built model database. If ending in `.qza` must be a Qiime 2
        artifact of type `MetabolicModels[JSON]`. Can also be a folder, zip
        (must end in `.zip`) file or None if the taxonomy contains a column
        `file`.
    medium : pd.DataFrame
        A growth medium. Must have columns "reaction" and "flux" denoting
        exchange reactions and their respective maximum flux. Cannot be
        sample-specific.
    threads : int >=1
        The number of parallel workers to use when building models. As a
        rule of thumb you will need around 1GB of RAM for each thread.

    Returns
    -------
    pd.DataFrame
        Returns an annotated manifest file with a column `can_grow` that
        tells you whether the model can grow on the (fixed) medium, and a
        column `growth_rate` that gives the growth rate.
    """
    medium = process_medium(medium, ["dummy"])
    medium.index = medium.global_id
    compressed = model_db.endswith(".qza") or model_db.endswith(".zip")
    if compressed:
        tdir = TemporaryDirectory(prefix="micom_")
    if model_db.endswith(".qza"):
        manifest = load_qiime_model_db(model_db, tdir.name)
    elif model_db.endswith(".zip"):
        manifest = load_zip_model_db(model_db, tdir.name)
    else:
        manifest = load_manifest(model_db)
    rank = manifest["summary_rank"][0]
    logger.info(
        "Checking %d %s-level models on a medium with %d components."
        % (manifest.shape[0], rank, len(medium))
    )

    args = [(f, medium.flux) for f in manifest.file]
    results = workflow(_grow, args, threads)
    manifest["growth_rate"] = results
    manifest["can_grow"] = manifest.growth_rate.notna() & (
        manifest.growth_rate > 1e-6
    )

    if compressed:
        tdir.cleanup()

    return manifest
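
# Usage sketch (illustrative, not part of the library API): assuming a
# genus-level database at "agora_genus.qza" and a two-component medium
# (the database path and exchange reaction IDs are hypothetical), growth
# for every model in the database could be checked like this:
#
#     medium = pd.DataFrame({
#         "reaction": ["EX_glc__D_m", "EX_o2_m"],
#         "flux": [10.0, 20.0],
#     })
#     check = check_db_medium("agora_genus.qza", medium, threads=4)
#     print(check[~check.can_grow])  # models below the 1e-6 growth tolerance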
def db_annotations(
    model_db,
    threads=1,
):
    """Get metabolite annotations from a model DB.

    Arguments
    ---------
    model_db : str
        A pre-built model database. If ending in `.qza` must be a Qiime 2
        artifact of type `MetabolicModels[JSON]`. Can also be a folder, zip
        (must end in `.zip`) file or None if the taxonomy contains a column
        `file`.
    threads : int >=1
        The number of parallel workers to use when building models. As a
        rule of thumb you will need around 1GB of RAM for each thread.

    Returns
    -------
    pd.DataFrame
        Annotations for all exchanged metabolites.
    """
    compressed = model_db.endswith(".qza") or model_db.endswith(".zip")
    if compressed:
        tdir = TemporaryDirectory(prefix="micom_")
    if model_db.endswith(".qza"):
        manifest = load_qiime_model_db(model_db, tdir.name)
    elif model_db.endswith(".zip"):
        manifest = load_zip_model_db(model_db, tdir.name)
    else:
        manifest = load_manifest(model_db)
    rank = manifest["summary_rank"][0]
    logger.info(
        "Getting annotations from %d %s-level models."
        % (manifest.shape[0], rank)
    )

    args = manifest.file.tolist()
    results = workflow(_annotate, args, threads)
    anns = pd.concat(results).drop_duplicates()

    if compressed:
        tdir.cleanup()

    return anns
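
# Usage sketch (illustrative): pulling the annotation table for the same
# hypothetical database used above and inspecting the first entries:
#
#     anns = db_annotations("agora_genus.qza", threads=4)
#     print(anns.head())  # one row per exchanged metabolite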
def __init__(
    self,
    taxonomy,
    model_db=None,
    id=None,
    name=None,
    rel_threshold=1e-6,
    solver=None,
    progress=True,
    max_exchange=100,
    mass=1,
):
    """Create a new community object.

    `micom` builds a community from a taxonomy which may simply be a list
    of model files in its simplest form. Usually, the taxonomy will
    contain additional information such as annotations for the
    individuals (for instance phylum, organism or species) and
    abundances.

    The recommended way to build a micom model is to supply a
    quantification of taxa (called "taxonomy" here) which specifies the
    taxonomic ranks for a taxon and its abundance, and a model database
    for a specific rank (for instance "genus"). MICOM will match the
    ranks from your taxonomy to the model database and assemble the
    community models from that. You will also get information about the
    construction process by calling `Community.build_metrics`.

    The most customizable way only takes a single table where
    summarization and matching to the reference database has already
    occurred. In this case you will also provide paths to model files for
    each taxon. This is the "old" way but may still be applicable if you
    want to use a custom database or want full control of matching your
    data to reference models.

    Notes
    -----
    `micom` will automatically add exchange fluxes and a community
    objective maximizing the overall growth rate of the community.

    Parameters
    ----------
    taxonomy : pandas.DataFrame
        The taxonomy used for building the model. Must have at least the
        column "id". If no model database is specified in the next
        argument it furthermore requires a column "file" which specifies
        a filepath for each model. Valid file extensions are ".pickle",
        ".xml", ".xml.gz" and ".json". If a model database is specified
        this must contain at least a column with the same name as the
        rank used in the model database. Thus, for a genus-level database
        you will need a column `genus`. Additional taxa ranks can also be
        specified and will be used to be more stringent in taxa matching.
        Finally, the taxonomy should contain a column `abundance`. It
        will be used to quantify each individual in the community. If
        absent, MICOM will assume all individuals are present in the same
        amount.
    model_db : str
        A pre-built model database. If ending in `.qza` must be a Qiime 2
        artifact of type `MetabolicModels[JSON]`. Can also be a folder,
        zip (must end in `.zip`) file or None if the taxonomy contains a
        column `file`.
    id : str, optional
        The ID for the community. Should only contain letters and
        numbers, otherwise it will be formatted as such.
    name : str, optional
        The name for the community.
    rel_threshold : float < 1, optional
        The relative abundance threshold that will be used. Describes the
        smallest relative amount of an individual that will be considered
        non-zero. All individuals with a smaller relative amount will be
        omitted.
    solver : str, optional
        Which solver to use. Will default to cplex if available which is
        better suited for large problems.
    progress : bool, optional
        Show a progress bar.
    max_exchange : positive float, optional
        During model construction exchange reactions are duplicated into
        internal and external exchange reactions. This specifies the new
        import flux bound for the *internal* exchange reaction. Import
        rates for the exchanges between the medium and outside are still
        maintained.
    mass : positive float, optional
        The total mass of the community in gDW. Used to adjust import
        fluxes which are assumed to be given as mmol/gDW*h for the entire
        community. As a consequence all import fluxes will be divided by
        that number.

    Attributes
    ----------
    taxa : list
        A list of taxa IDs in the community.
    """
    super(Community, self).__init__(id, name)

    logger.info("building new micom model {}.".format(id))

    # Validate the inputs before touching the taxonomy.
    if not (isinstance(taxonomy, pd.DataFrame) and "id" in taxonomy.columns):
        raise ValueError(
            "`taxonomy` must be a pandas DataFrame with at "
            "least a column `id` :("
        )
    if model_db is None and "file" not in taxonomy.columns:
        raise ValueError(
            "If no model database is specified you need to pass "
            "file names for models in a `file` column as well."
        )

    # Pick a solver, preferring ones with QP support.
    if not solver:
        solver = [
            s
            for s in ["cplex", "osqp", "gurobi", "glpk"]
            if s in cobra.util.solver.solvers
        ][0]
    logger.info("using the %s solver." % solver)
    if solver == "glpk":
        logger.warning(
            "No QP solver found, will use GLPK. A lot of functionality "
            "in MICOM will require a QP solver :/"
        )
    self.solver = solver
    self.solver.configuration.lp_method = "auto"
    self.solver.configuration.qp_method = "auto"
    self.solver.configuration.presolve = False
    self._rtol = rel_threshold
    self._modification = None
    self.mass = mass
    self.__db_metrics = None
    adjust_solver_config(self.solver)

    # Normalize abundances and drop taxa below the relative threshold.
    taxonomy = taxonomy.copy()
    if "abundance" not in taxonomy.columns:
        taxonomy["abundance"] = 1
    taxonomy.abundance /= taxonomy.abundance.sum()
    logger.info(
        "{} individuals with abundances below threshold".format(
            (taxonomy.abundance <= self._rtol).sum()
        )
    )
    taxonomy = taxonomy[taxonomy.abundance > self._rtol]

    compressed = False
    if model_db is not None:
        # Match the taxonomy against the model database on shared ranks.
        compressed = model_db.endswith(".qza") or model_db.endswith(".zip")
        if compressed:
            tdir = TemporaryDirectory(prefix="micom_")
        if "file" in taxonomy.columns:
            del taxonomy["file"]
        if model_db.endswith(".qza"):
            manifest = load_qiime_model_db(model_db, tdir.name)
        elif model_db.endswith(".zip"):
            manifest = load_zip_model_db(model_db, tdir.name)
        else:
            manifest = load_manifest(model_db)
        rank = manifest["summary_rank"][0]
        if rank not in taxonomy.columns:
            raise ValueError(
                "Missing the column `%s` from the taxonomy." % rank
            )
        keep_cols = [
            r
            for r in _ranks[0 : (_ranks.index(rank) + 1)]
            if r in taxonomy.columns and r in manifest.columns
        ]
        manifest = manifest[keep_cols + ["file"]]
        merged = pd.merge(taxonomy, manifest, on=keep_cols)
        self.__db_metrics = pd.Series(
            {
                "found_taxa": merged.shape[0],
                "total_taxa": taxonomy.shape[0],
                "found_fraction": merged.shape[0] / taxonomy.shape[0],
                "found_abundance_fraction": merged.abundance.sum(),
            }
        )
        logger.info(
            "Matched %g%% of total abundance in model DB."
            % (100.0 * self.__db_metrics["found_abundance_fraction"])
        )
        if self.__db_metrics["found_abundance_fraction"] < 0.5:
            logger.warning(
                "Less than 50%% of the abundance could be matched to the "
                "model database. Model `%s` may not be representative "
                "of the sample." % self.id
            )
        taxonomy = merged
        taxonomy["abundance"] /= taxonomy["abundance"].sum()

    if taxonomy.id.str.contains(r"[^A-Za-z0-9_]", regex=True).any():
        logger.warning(
            "Taxa IDs contain prohibited characters and "
            "will be reformatted."
        )
        taxonomy.id = taxonomy.id.replace(
            r"[^A-Za-z0-9_\s]+", "_", regex=True
        )

    self.__taxonomy = taxonomy
    self.__taxonomy.index = self.__taxonomy.id

    # Assemble the community model taxon by taxon.
    obj = Zero
    self.taxa = []
    index = self.__taxonomy.index
    index = tqdm(index, unit="models") if progress else index
    for idx in index:
        row = self.__taxonomy.loc[idx]
        if isinstance(row.file, list):
            if len(row.file) > 1:
                model = join_models(row.file)
                logger.info("joined {} models".format(len(row.file)))
            else:
                model = load_model(row.file[0])
        else:
            model = load_model(row.file)
        suffix = "__" + idx.replace(" ", "_").strip()
        logger.info("converting IDs for {}".format(idx))
        external = cobra.medium.find_external_compartment(model)
        logger.info(
            "Identified %s as the external compartment for %s. "
            "If that is wrong you may be in trouble..." % (external, idx)
        )
        for r in model.reactions:
            r.global_id = clean_ids(r.id)
            r.id = r.global_id + suffix
            r.community_id = idx
            # avoids https://github.com/opencobra/cobrapy/issues/926
            r._compartments = None
            # SBO terms may not be maintained
            if "sbo" in r.annotation:
                del r.annotation["sbo"]
        for m in model.metabolites:
            m.global_id = clean_ids(m.id)
            m.id = m.global_id + suffix
            m.compartment += suffix
            m.community_id = idx
        logger.info("adding reactions for {} to community".format(idx))
        self.add_reactions(model.reactions)
        o = self.solver.interface.Objective.clone(
            model.objective, model=self.solver
        )
        obj += o.expression * row.abundance
        self.taxa.append(idx)
        taxa_obj = self.problem.Constraint(
            o.expression, name="objective_" + idx, lb=0.0
        )
        self.add_cons_vars([taxa_obj])
        self.__add_exchanges(
            model.reactions,
            row,
            external_compartment=external,
            internal_exchange=max_exchange,
        )
        self.solver.update()  # to avoid dangling refs due to lazy add

    if compressed:
        tdir.cleanup()
    com_obj = add_var_from_expression(
        self, "community_objective", obj, lb=0
    )
    self.objective = self.problem.Objective(com_obj, direction="max")
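
# Usage sketch (illustrative): building a two-member community from a
# genus-level taxonomy matched against a hypothetical database file.
# IDs, abundances, and the database path are made up for the example;
# abundances are renormalized automatically during construction:
#
#     taxonomy = pd.DataFrame({
#         "id": ["Escherichia", "Bacteroides"],
#         "genus": ["Escherichia", "Bacteroides"],
#         "abundance": [0.3, 0.7],
#     })
#     com = Community(taxonomy, model_db="agora_genus.qza", id="sample_1")
#     print(com.taxa)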
def complete_db_medium(
    model_db,
    medium,
    growth=0.001,
    max_added_import=1,
    minimize_components=False,
    weights=None,
    threads=1,
):
    """Complete a growth medium for all models in a database.

    Arguments
    ---------
    model_db : str
        A pre-built model database. If ending in `.qza` must be a Qiime 2
        artifact of type `MetabolicModels[JSON]`. Can also be a folder, zip
        (must end in `.zip`) file or None if the taxonomy contains a column
        `file`.
    medium : pd.DataFrame
        A growth medium. Must have columns "reaction" and "flux" denoting
        exchange reactions and their respective maximum flux. Cannot be
        sample-specific.
    growth : positive float or pandas.Series
        The minimum growth rate the model has to achieve with the (fixed)
        medium. If a Series, it must provide a minimum growth rate for
        each id/taxon in the model db.
    max_added_import : positive float
        Maximum import flux for each additional import not included in the
        growth medium. If positive, the medium will be expanded with
        additional imports in order to fulfill the growth objective.
    minimize_components : boolean
        Whether to minimize the number of components instead of the total
        import flux. Might be more intuitive if set to True but may also
        be slow to calculate.
    weights : str
        Will scale the fluxes by a weight factor. Can either be "mass",
        which will scale by molecular mass, or a single element, which
        will scale by the elemental content (for instance "C" to scale by
        carbon content). If None, every metabolite will receive the same
        weight. Will be ignored if `minimize_components` is True.
    threads : int >=1
        The number of parallel workers to use when building models. As a
        rule of thumb you will need around 1GB of RAM for each thread.

    Returns
    -------
    tuple of (manifest, import fluxes)
        Returns an annotated manifest file with a column `can_grow` that
        tells you whether the model can grow on the (fixed) medium, and a
        column `added` that gives the number of added imports apart from
        the ones in the medium.
    """
    medium = process_medium(medium, ["dummy"])
    medium.index = medium.global_id
    compressed = model_db.endswith(".qza") or model_db.endswith(".zip")
    if compressed:
        tdir = TemporaryDirectory(prefix="micom_")
    if model_db.endswith(".qza"):
        manifest = load_qiime_model_db(model_db, tdir.name)
    elif model_db.endswith(".zip"):
        manifest = load_zip_model_db(model_db, tdir.name)
    else:
        manifest = load_manifest(model_db)
    rank = manifest["summary_rank"][0]
    logger.info(
        "Checking %d %s-level models on a medium with %d components."
        % (manifest.shape[0], rank, len(medium))
    )
    if not isinstance(growth, pd.Series):
        growth = pd.Series(growth, index=manifest.id)
    manifest.index = manifest.id

    args = [
        (
            manifest.loc[i, "file"],
            medium.flux,
            growth[i],
            max_added_import,
            minimize_components,
            weights,
        )
        for i in manifest.index
    ]
    results = workflow(_try_complete, args, threads)
    manifest["can_grow"] = [r[0] for r in results]
    manifest["added"] = [r[1] for r in results]
    imports = pd.DataFrame.from_records([r[2] for r in results]).fillna(0.0)
    imports.index = manifest.id

    if compressed:
        tdir.cleanup()

    return (manifest, imports)
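
# Usage sketch (illustrative): completing the hypothetical medium from the
# examples above so that every model reaches a growth rate of at least
# 0.05/h, allowing up to 10 mmol/(gDW h) per added import (database path
# and parameter values are made up for the example):
#
#     manifest, imports = complete_db_medium(
#         "agora_genus.qza",
#         medium,
#         growth=0.05,
#         max_added_import=10,
#         minimize_components=True,
#         threads=4,
#     )
#     print(manifest[["can_grow", "added"]])
#     print(imports.max())  # largest added import flux across models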