def maincall(inputfiles, flavor=None, init=None, mediadb=None, outputfile=None):
    """ Merge single-species models into a microbial community model. """

    if not flavor:
        flavor = config.get('sbml', 'default_flavor')

    if outputfile:
        model_id = os.path.splitext(os.path.basename(outputfile))[0]
    else:
        model_id = 'community'
        outputfile = 'community.xml'

    models = [load_cbmodel(inputfile, flavor=flavor) for inputfile in inputfiles]

    community = Community(model_id, models)
    model = community.merged_model

    if init:
        if not mediadb:
            mediadb = project_dir + config.get('input', 'media_library')

        try:
            media_db = load_media_db(mediadb)
        except IOError:
            raise IOError('Failed to load media library: ' + mediadb)

        init_env = Environment.from_compounds(media_db[init])
        init_env.apply(model, inplace=True)

    save_cbmodel(model, outputfile, flavor=flavor)
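
# Example usage (a sketch; the input files and medium name below are hypothetical,
# and 'M9' is assumed to exist in the media library):
#
#     maincall(['ecoli.xml', 'bsubtilis.xml'], init='M9', outputfile='gut_pair.xml')
#
# This merges the two single-species models into a community model with id
# 'gut_pair', constrains it to the M9 medium, and writes it to 'gut_pair.xml'.
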
def build_bigg_universe_model(outputfile=None):
    """ Download the whole BiGG universe database as a CBModel and (optionally) store in SBML.

    Args:
        outputfile (str): SBML output file (optional)

    Returns:
        CBModel: universe model
    """
    print('Downloading universal data from BiGG...')

    model = CBModel('bigg_universe')
    bigg_rxns = get_request(reactions_url)
    n = len(bigg_rxns['results'])

    for i, entry in enumerate(bigg_rxns['results']):
        build_reaction(model, entry['bigg_id'])
        progress(i, n)

    print('\n')

    if outputfile:
        save_cbmodel(model, outputfile)

    return model
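
# Example usage (a sketch; the output path is hypothetical):
#
#     universe = build_bigg_universe_model(outputfile='bigg_universe.xml')
#
# The model is built one reaction at a time from the BiGG reaction list, so the
# download is network-bound; progress is printed as it runs.
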
def download_universal_model(outputfile, cpd_annotation):
    """ Download the BiGG universal model and store it as an SBML file.

    Args:
        outputfile (str): output SBML file
        cpd_annotation (str): compound annotation table (TSV) used to annotate metabolites
    """
    print("Downloading BiGG universe...")

    cpds = pd.read_csv(cpd_annotation, sep="\t", index_col=0)
    json_model = get_request(UNIVERSE_URL)

    model = CBModel("bigg_universe")
    load_compartments(model)
    load_metabolites(json_model, model, cpds)
    load_reactions(json_model, model)

    save_cbmodel(model, outputfile)
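
# Example usage (a sketch; both file paths are hypothetical, and the compound
# annotation table is expected to be tab-separated and indexed by compound id):
#
#     download_universal_model('bigg_universe.xml.gz', 'bigg_compound_annotation.tsv')
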
def curate_universe(model, outputfile, model_specific_data, bigg_models, taxa, biomass_eq,
                    manually_curated=None, unbalanced_metabolites=None):
    """ Curate universal reaction database from initial database dump.

    Args:
        model (CBModel): universal model
        outputfile (str): output SBML file
        model_specific_data (pandas.DataFrame): model-specific data downloaded from BiGG
        bigg_models (pandas.DataFrame): additional information on BiGG models
        taxa (str): filter by taxa
        biomass_eq (str): default biomass equation
        manually_curated (pandas.DataFrame): manually curated reaction bounds (optional)
        unbalanced_metabolites (list): unbalanced metabolites that require sink reactions (optional)
    """
    print(f'Curating {taxa} universe...')
    print(f'Initial model size: {len(model.metabolites)} x {len(model.reactions)}')

    remove_compartments(model, taxa)

    model_specific_data['reaction'] = model_specific_data['reaction'].apply(lambda x: 'R_' + x)
    filter_reactions_by_kingdom(model, taxa, model_specific_data, bigg_models)

    clean_up_atp_synthases(model)
    compute_missing_formulae(model)
    curate_transport_by_weight(model)
    remove_unbalanced_reactions(model, proton_relax=True)
    fix_protons_and_charge(model)
    fix_hydrogen_stoichiometry(model)

    constrain_reversibility(model, model_specific_data, manually_curated)
    reversibility_heuristics(model)

    if unbalanced_metabolites is not None:
        create_sink_reactions(model, unbalanced_metabolites)

    create_exchange_reactions(model, default_lb=-1000, default_ub=1000)
    add_biomass_equation(model, biomass_eq)
    add_maintenance_atp(model)

    print('Removing blocked reactions and dead-end metabolites...')
    simplify(model)

    print(f'Final model size: {len(model.metabolites)} x {len(model.reactions)}')

    save_cbmodel(model, outputfile)
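
# Example usage (a sketch; all file names are hypothetical and the inputs would
# normally come from the accompanying download steps; biomass_eq is a biomass
# equation string loaded from the local biomass database):
#
#     model = load_cbmodel('bigg_universe.xml', flavor='bigg')
#     model_specific_data = pd.read_csv('model_specific_data.csv')
#     bigg_models = pd.read_csv('bigg_models.csv', index_col=0)
#     curate_universe(model, 'universe_bacteria.xml', model_specific_data,
#                     bigg_models, taxa='bacteria', biomass_eq=biomass_eq)
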
def maincall(inputfile, media, mediadb=None, universe=None, universe_file=None,
             outputfile=None, flavor=None, spent=None, verbose=False):
    """ Gap fill a metabolic model for one or more media conditions. """

    if verbose:
        print('Loading model...')

    try:
        model = load_cbmodel(inputfile, flavor=flavor)
    except IOError:
        raise IOError('Failed to load model: ' + inputfile)

    if spent:
        if verbose:
            print('Loading model for spent medium species...')

        try:
            spent_model = load_cbmodel(spent, flavor=flavor)
        except IOError:
            raise IOError('Failed to load model: ' + spent)
    else:
        spent_model = None

    if verbose:
        print('Loading reaction universe...')

    if not universe_file:
        if universe:
            universe_file = "{}{}universe_{}.xml".format(project_dir, config.get('generated', 'folder'), universe)
        else:
            universe_file = project_dir + config.get('generated', 'default_universe')

    try:
        universe_model = load_cbmodel(universe_file, flavor='cobra')
    except IOError:
        if universe:
            raise IOError('Failed to load universe "{0}". Please run build_universe.py --{0}.'.format(universe))
        else:
            raise IOError('Failed to load universe model: ' + universe_file)

    if verbose:
        print('Loading media...')

    if not mediadb:
        mediadb = project_dir + config.get('input', 'media_library')

    try:
        media_db = load_media_db(mediadb)
    except IOError:
        raise IOError('Failed to load media database: ' + mediadb)

    if verbose:
        m1, n1 = len(model.metabolites), len(model.reactions)
        print('Gap filling for {}...'.format(', '.join(media)))

    max_uptake = config.getint('gapfill', 'max_uptake')
    multiGapFill(model, universe_model, media, media_db, max_uptake=max_uptake,
                 inplace=True, spent_model=spent_model)

    if verbose:
        m2, n2 = len(model.metabolites), len(model.reactions)
        print('Added {} reactions and {} metabolites'.format(n2 - n1, m2 - m1))

    if verbose:
        print('Saving SBML file...')

    if not outputfile:
        outputfile = os.path.splitext(inputfile)[0] + '_gapfill.xml'

    if not flavor:
        flavor = config.get('sbml', 'default_flavor')

    save_cbmodel(model, outputfile, flavor=flavor)

    if verbose:
        print('Done.')
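
# Example usage (a sketch; the file name is hypothetical, and 'M9' and 'LB' are
# assumed to be defined in the media database):
#
#     maincall('ecoli_draft.xml', media=['M9', 'LB'], verbose=True)
#
# This gap fills the draft model for growth on both media and, since no output
# file is given, writes the result to 'ecoli_draft_gapfill.xml'.
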
def curate_universe(model, model_specific_data, bigg_models, biomass_eq, taxa=None,
                    thermodynamics_data=None, metabolomics_data=None, thermodynamics_method=None,
                    manually_curated=None, unbalanced_metabolites=None, use_heuristics=True,
                    remove_unbalanced=True, remove_blocked=True, outputfile=None):
    """ Curate universal reaction database from initial database dump.

    Args:
        model (CBModel): universal model
        model_specific_data (pandas.DataFrame): model-specific data downloaded from BiGG
        bigg_models (pandas.DataFrame): additional information on BiGG models
        biomass_eq (str): default biomass equation
        taxa (str): filter by taxa (optional)
        thermodynamics_data (pandas.DataFrame): used for reversibility estimation (optional)
        metabolomics_data (pandas.DataFrame): used for reversibility estimation (optional)
        thermodynamics_method (str): thermodynamics method to use (optional)
        manually_curated (pandas.DataFrame): manually curated reaction bounds (optional)
        unbalanced_metabolites (list): unbalanced metabolites that require sink reactions (optional)
        use_heuristics (bool): apply heuristic rules (no proton pumps, no reversible ATP consumers) (default: True)
        remove_unbalanced (bool): remove unbalanced reactions from model (default: True)
        remove_blocked (bool): remove blocked reactions and dead-end metabolites (default: True)
        outputfile (str): output SBML file (optional)

    Returns:
        CBModel: curated universal model

    Notes:
        Combines thermodynamics and heuristic rules to determine reaction reversibility.
        Adds exchange reactions for all extracellular metabolites.
        Adds sink reactions for a list of known unbalanced compounds.
        Adds biomass equations from a local biomass database (avoids discarding biomass precursors
        and other essential reactions/metabolites that would otherwise be structurally blocked).
    """

    print('Starting universe curation...')
    print('(initial size: {} x {})\n'.format(len(model.metabolites), len(model.reactions)))

    trusted_models = bigg_models.query('trusted == True').index.tolist()
    add_bounds_from_extracted_data(model, model_specific_data, trusted_models)

    if taxa:
        print('Filtering by taxa:', taxa)
        kingdom_map = bigg_models['domain'].to_dict()

        if taxa in {'cyanobacteria', 'bacteria'}:
            kingdoms = {'Bacteria'}
        elif taxa == 'archaea':
            kingdoms = {'Archaea', 'Bacteria'}
        else:
            raise ValueError('Unsupported taxa: ' + taxa)

        filter_reactions_by_kingdoms(model, kingdoms, kingdom_map, inplace=True)

        if taxa in {'bacteria', 'archaea'}:
            valid_compartments = {'C_c', 'C_p', 'C_e'}
        elif taxa == 'cyanobacteria':
            valid_compartments = {'C_c', 'C_p', 'C_e', 'C_u'}

        other_compartments = set(model.compartments.keys()) - valid_compartments
        model.remove_compartments(other_compartments, delete_metabolites=True, delete_reactions=True)

        print('(size: {} x {})\n'.format(len(model.metabolites), len(model.reactions)))

    if thermodynamics_data is not None:
        print('Computing thermodynamics...', end=' ')

        dG0 = thermodynamics_data['dG0'].to_dict()
        sdG0 = thermodynamics_data['sdG0'].to_dict()

        if metabolomics_data is not None:
            x0 = metabolomics_data.median(axis=1).to_dict()
        else:
            x0 = None

        compute_flux_bounds(model, dG0, sdG0, x0, method=thermodynamics_method,
                            inplace=True, override_trusted=False)
        print('done\n')

    print('Applying manual curation rules...', end=' ')

    if use_heuristics:
        reversibility_heuristics(model, no_reverse_atp=True, no_proton_pumps=False,
                                 override_trusted=False)

    # manually curated reactions
    if manually_curated is not None:
        for r_id, (lb, ub) in manually_curated.iterrows():
            if r_id in model.reactions:
                model.set_flux_bounds(r_id, lb, ub)

    print('done\n')

    if remove_unbalanced:
        # remove arbitrary 'Z' formula from photons
        if taxa == 'cyanobacteria':
            for m_id in ['M_photon_e', 'M_photon_p', 'M_photon_c']:
                model.metabolites[m_id].metadata['FORMULA'] = ''

        print('Removing unbalanced reactions...')
        remove_unbalanced_reactions(model)
        print('(size: {} x {})\n'.format(len(model.metabolites), len(model.reactions)))

    print('Creating pseudo-reactions...')

    create_exchange_reactions(model, default_lb=-1000, default_ub=1000)

    if unbalanced_metabolites:
        create_sink_reactions(model, unbalanced_metabolites)

    add_biomass_equation(model, biomass_eq)
    add_maintenance_atp(model)

    print('(size: {} x {})\n'.format(len(model.metabolites), len(model.reactions)))

    if remove_blocked:
        print('Removing blocked reactions and dead-end metabolites...')
        simplify(model)
        print('(size: {} x {})\n'.format(len(model.metabolites), len(model.reactions)))

    if outputfile:
        save_cbmodel(model, outputfile)

    print('Done.')
def maincall(inputfile, input_type='protein', outputfile=None, diamond_args=None, universe=None,
             universe_file=None, ensemble_size=None, verbose=False, debug=False, flavor=None,
             gapfill=None, blind_gapfill=False, init=None, mediadb=None, default_score=None,
             uptake_score=None, soft_score=None, soft=None, hard=None, reference=None,
             ref_score=None, recursive_mode=False):
    """ Reconstruct a genome-scale metabolic model using the CarveMe pipeline. """

    if recursive_mode:
        model_id = os.path.splitext(os.path.basename(inputfile))[0]

        if outputfile:
            outputfile = f'{outputfile}/{model_id}.xml'
        else:
            outputfile = os.path.splitext(inputfile)[0] + '.xml'
    else:
        if outputfile:
            model_id = os.path.splitext(os.path.basename(outputfile))[0]
        else:
            model_id = os.path.splitext(os.path.basename(inputfile))[0]
            outputfile = os.path.splitext(inputfile)[0] + '.xml'

    model_id = build_model_id(model_id)

    outputfolder = os.path.abspath(os.path.dirname(outputfile))

    if not os.path.exists(outputfolder):
        try:
            os.makedirs(outputfolder)
        except OSError:
            print('Unable to create output folder:', outputfolder)
            return

    if soft:
        try:
            soft_constraints = load_soft_constraints(soft)
        except IOError:
            raise IOError('Failed to load soft-constraints file: ' + soft)
    else:
        soft_constraints = None

    if hard:
        try:
            hard_constraints = load_hard_constraints(hard)
        except IOError:
            raise IOError('Failed to load hard-constraints file: ' + hard)
    else:
        hard_constraints = None

    if input_type == 'refseq':
        if verbose:
            print(f'Downloading genome {inputfile} from NCBI...')

        ncbi_table = load_ncbi_table(project_dir + config.get('input', 'refseq'))
        inputfile = download_ncbi_genome(inputfile, ncbi_table)

        if not inputfile:
            print('Failed to download genome from NCBI.')
            return

        input_type = 'protein' if inputfile.endswith('.faa.gz') else 'dna'

    if input_type == 'protein' or input_type == 'dna':
        if verbose:
            print('Running diamond...')

        diamond_db = project_dir + config.get('generated', 'diamond_db')
        blast_output = os.path.splitext(inputfile)[0] + '.tsv'

        exit_code = run_blast(inputfile, input_type, blast_output, diamond_db, diamond_args, verbose)

        if exit_code is None:
            print('Unable to run diamond (make sure diamond is available in your PATH).')
            return

        if exit_code != 0:
            print('Failed to run diamond.')
            if diamond_args is not None:
                print('Incorrect diamond args? Please check documentation or use default args.')
            return

        annotations = load_diamond_results(blast_output)
    elif input_type == 'eggnog':
        annotations = load_eggnog_data(inputfile)
    elif input_type == 'diamond':
        annotations = load_diamond_results(inputfile)
    else:
        raise ValueError('Invalid input type: ' + input_type)

    if verbose:
        print('Loading universe model...')

    if not universe_file:
        if universe:
            universe_file = f"{project_dir}{config.get('generated', 'folder')}universe_{universe}.xml.gz"
        else:
            universe_file = project_dir + config.get('generated', 'default_universe')

    try:
        universe_model = load_cbmodel(universe_file, flavor='bigg')
        universe_model.id = model_id
    except IOError:
        available = '\n'.join(glob(f"{project_dir}{config.get('generated', 'folder')}universe_*.xml.gz"))
        raise IOError(f'Failed to load universe model: {universe_file}\nAvailable universe files:\n{available}')

    if reference:
        if verbose:
            print('Loading reference model...')

        try:
            ref_model = load_cbmodel(reference)
        except Exception:
            raise IOError('Failed to load reference model.')
    else:
        ref_model = None

    if gapfill or init:
        if verbose:
            print('Loading media library...')

        if not mediadb:
            mediadb = project_dir + config.get('input', 'media_library')

        try:
            media_db = load_media_db(mediadb)
        except IOError:
            raise IOError('Failed to load media library: ' + mediadb)

    if verbose:
        print('Scoring reactions...')

    gene_annotations = pd.read_csv(project_dir + config.get('generated', 'gene_annotations'), sep='\t')
    bigg_gprs = project_dir + config.get('generated', 'bigg_gprs')
    gprs = pd.read_csv(bigg_gprs)
    gprs = gprs[gprs.reaction.isin(universe_model.reactions)]

    debug_output = model_id if debug else None
    scores, gene2gene = reaction_scoring(annotations, gprs, debug_output=debug_output)

    if scores is None:
        print('The input genome did not match sufficient genes/reactions in the database.')
        return

    if not flavor:
        flavor = config.get('sbml', 'default_flavor')

    init_env = None

    if init:
        if init in media_db:
            init_env = Environment.from_compounds(media_db[init])
        else:
            print(f'Error: medium {init} not in media database.')

    universe_model.metadata['Description'] = 'This model was built with CarveMe version ' + version

    if ensemble_size is None or ensemble_size <= 1:
        if verbose:
            print('Reconstructing a single model')

        model = carve_model(universe_model, scores, inplace=(not gapfill), default_score=default_score,
                            uptake_score=uptake_score, soft_score=soft_score,
                            soft_constraints=soft_constraints, hard_constraints=hard_constraints,
                            ref_model=ref_model, ref_score=ref_score, init_env=init_env,
                            debug_output=debug_output)
        annotate_genes(model, gene2gene, gene_annotations)
    else:
        if verbose:
            print('Building an ensemble of', ensemble_size, 'models')

        ensemble = build_ensemble(universe_model, scores, ensemble_size, init_env=init_env)
        annotate_genes(ensemble, gene2gene, gene_annotations)
        save_ensemble(ensemble, outputfile, flavor=flavor)
        return  # nothing left to do: 'model' is only defined in the single-model branch

    if model is None:
        print("Failed to build model.")
        return

    if not gapfill:
        save_cbmodel(model, outputfile, flavor=flavor)
    else:
        media = gapfill.split(',')

        if verbose:
            m1, n1 = len(model.metabolites), len(model.reactions)
            print(f"Gap filling for {', '.join(media)}...")

        max_uptake = config.getint('gapfill', 'max_uptake')

        if blind_gapfill:
            scores = None
        else:
            scores = dict(scores[['reaction', 'normalized_score']].values)

        multiGapFill(model, universe_model, media, media_db, scores=scores,
                     max_uptake=max_uptake, inplace=True)

        if verbose:
            m2, n2 = len(model.metabolites), len(model.reactions)
            print(f'Added {(n2 - n1)} reactions and {(m2 - m1)} metabolites')

        if init_env:
            # Initialize the environment again, since new exchange reactions
            # can be acquired during gap filling.
            init_env.apply(model, inplace=True, warning=False)

        save_cbmodel(model, outputfile, flavor=flavor)

    if verbose:
        print('Done.')
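
# Example usage (a sketch; the input file is hypothetical, and 'M9' and 'LB' are
# assumed to exist in the media library):
#
#     maincall('genome.faa', input_type='protein', gapfill='M9,LB', init='M9', verbose=True)
#
# This runs diamond against the precomputed database, scores the universe
# reactions, carves a single model, gap fills it for M9 and LB, re-applies the
# M9 environment, and saves the result next to the input as 'genome.xml'.
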