def fix_stoichiometry(model, term_id2clu, species_id2term_id, ub_chebi_ids, onto, r_ids_to_ignore=None):
    """
    Detects clusters whose members co-occur in the same reaction (a stoichiometry conflict)
    and launches a StoichiometryFixingThread per conflicting cluster to resolve them.

    :param model: libsbml.Model model of interest
    :param term_id2clu: dict {term_id: cluster}, updated by the spawned threads
    :param species_id2term_id: dict {species_id: ChEBI term id}
    :param ub_chebi_ids: collection of ubiquitous ChEBI term ids
    :param onto: ChEBI ontology
    :param r_ids_to_ignore: (optional) ids of reactions to skip
    """
    clu2term_ids = invert_map(term_id2clu)
    thrds = []
    conflicts = []
    # Collect, per reaction, the set of term ids (or raw species ids when unmapped)
    # of all its participants; a reaction touching >1 term is a potential conflict.
    for r in model.getListOfReactions():
        if r_ids_to_ignore and r.getId() in r_ids_to_ignore:
            continue
        t_ids = {species_id2term_id[s_id] if s_id in species_id2term_id else s_id
                 for s_id in chain((species_ref.getSpecies() for species_ref in r.getListOfReactants()),
                                   (species_ref.getSpecies() for species_ref in r.getListOfProducts()))}
        if len(t_ids) > 1:
            conflicts.append(t_ids)
    for clu, term_ids in clu2term_ids.items():
        if len(term_ids) <= 1:
            continue
        # Restrict each conflict set to this cluster's members; keep only real conflicts (>1 member).
        clu_conflicts = [set(it) for it in {tuple(t_ids & term_ids) for t_ids in conflicts} if len(it) > 1]
        # Ids resolvable in the ontology vs. ids that are plain species ids (no ChEBI term).
        real_term_ids = {t_id for t_id in term_ids if onto.get_term(t_id)}
        unmapped_s_ids = {s_id for s_id in term_ids if not onto.get_term(s_id)}
        if clu_conflicts:
            thread = StoichiometryFixingThread(model, species_id2term_id, ub_chebi_ids, unmapped_s_ids,
                                               real_term_ids, clu_conflicts, onto, clu, term_id2clu,
                                               r_ids_to_ignore=r_ids_to_ignore)
            thrds.append(thread)
            thread.start()  # This actually causes the thread to run
    for th in thrds:
        th.join()  # This waits until the thread has completed
def select_representative_terms(term_id2clu, onto):
    """
    Replaces each cluster with a tuple containing one ChEBI term id: (term_id, )
    :param term_id2clu: dict {term_id: cluster}
    :param onto: mod_sbml.onto.obo_ontology.Ontology ChEBI ontology
    :return: updated (inplace) dict term_id2clu {term_id: (cluster_representative_term_id, )}
    """
    clu2term_ids = invert_map(term_id2clu)
    used = set()  # representatives already taken by another cluster
    i = 0  # counter for generated "chebi:unknown_<i>" fake term ids
    for clu, term_ids in clu2term_ids.items():
        terms = {onto.get_term(t) for t in term_ids if onto.get_term(t)}
        common_ancestors = \
            {t for t in onto.common_points(terms, relationships=EQUIVALENT_RELATIONSHIPS)} if terms else set()
        # Prefer a common ancestor that no other cluster has claimed yet.
        options = common_ancestors - used
        if options:
            common_ancestor_term = options.pop()
        else:
            # No free ancestor: fabricate a term (named after an existing ancestor when possible).
            name = common_ancestors.pop().get_name() + " (another)" if common_ancestors else 'fake term'
            common_ancestor_term = Term(onto=onto, t_id="chebi:unknown_{0}".format(i), name=name)
            onto.add_term(common_ancestor_term)
            i += 1
        used.add(common_ancestor_term)
        for t in term_ids:
            term_id2clu[t] = (common_ancestor_term.get_id(), )
    return term_id2clu
def __init__(self, st_matrix=None, pws=None, N=None, V=None, m_id2i=None, r_id2i=None, efm_id2i=None,
             boundary_m_ids=None, r_ids=None, m_ids=None, efm_ids=None,
             r_id2gr_id=None, gr_id2r_id2c=None, efm_id2gr_id=None, m_id2gr_id=None,):
    """
    Builds the object either from ready-made st_matrix/pws objects or from raw
    matrices N (stoichiometric) and V (pathways) plus the id->index mappings.

    NOTE(review): self.r_id2i / self.m_id2i / self.efm_id2i used below are not set
    here — presumably properties delegating to st_matrix/pws; confirm on the class.
    """
    # Fall back to an id->index map taken from whichever component is available.
    if not r_id2i:
        r_id2i = st_matrix.r_id2i if st_matrix else (pws.r_id2i if pws else {})
    self.st_matrix = st_matrix
    # A raw matrix N (with mappings) overrides a supplied st_matrix.
    if N is not None and m_id2i is not None and r_id2i is not None:
        self.st_matrix = StoichiometricMatrix(N, m_id2i, r_id2i, boundary_m_ids)
    self.pws = pws
    # Likewise a raw pathway matrix V overrides a supplied pws.
    if V is not None and r_id2i is not None:
        self.pws = PathwaySet(V, r_id2i, efm_id2i)
    # Id collections default to the keys of the corresponding index maps.
    self.r_ids = set(r_ids) if r_ids else set(self.r_id2i.keys())
    self.m_ids = set(m_ids) if m_ids else set(self.m_id2i.keys())
    self.efm_ids = set(efm_ids) if efm_ids else set(self.efm_id2i.keys())
    # Grouping maps (element id -> group id), empty when not provided.
    self.r_id2gr_id = r_id2gr_id if r_id2gr_id else {}
    self.gr_id2r_id2c = gr_id2r_id2c if gr_id2r_id2c else {}
    self.efm_id2gr_id = efm_id2gr_id if efm_id2gr_id else {}
    self.gr_id2efm_ids = invert_map(self.efm_id2gr_id, list)
    self.m_id2gr_id = m_id2gr_id if m_id2gr_id else {}
    # Caches filled by later processing steps.
    self.coupled_rs = set()
    self.r_types = set()
    self.folded_efms = set()
    self.pathways = set()
def generalize_species(model, s_id2chebi_id, ub_s_ids, chebi, ub_chebi_ids, threshold=UBIQUITOUS_THRESHOLD,
                       r_ids_to_ignore=None):
    """
    Groups metabolites of the model into clusters.

    :param model: libsbml.Model model of interest
    :param s_id2chebi_id: dict {metabolite_id: ChEBI_term_id}
    :param ub_s_ids: collection of ubiquitous metabolite ids
    :param chebi: mod_sbml.onto.obo_ontology.Ontology ChEBI ontology
    :param ub_chebi_ids: collection of ubiquitous ChEBI term ids
    :param threshold: threshold for a metabolite to be considered as frequently participating
        in reactions and therefore ubiquitous
    :param r_ids_to_ignore: (optional) ids of reactions whose stoichiometry preserving constraint
        can be ignored
    :return: tuple (s_id2clu, ub_s_ids): dict {species_id: cluster} and the
        (possibly inferred) collection of ubiquitous species ids
    """
    unmapped_s_ids = {s.getId() for s in model.getListOfSpecies() if s.getId() not in s_id2chebi_id}
    term_id2clu = find_term_clustering(model, chebi, s_id2chebi_id, unmapped_s_ids, ub_chebi_ids,
                                       r_ids_to_ignore=r_ids_to_ignore)
    if term_id2clu:
        term_id2clu = select_representative_terms(term_id2clu, chebi)
        s_id2clu = compute_s_id2clu(unmapped_s_ids, model, s_id2chebi_id, term_id2clu)
        # Drop singleton clusters: a group of one species is not a generalization.
        clu2s_ids = invert_map(s_id2clu)
        for s_ids in clu2s_ids.values():
            if len(s_ids) == 1:
                del s_id2clu[s_ids.pop()]
    else:
        s_id2clu = {}
    # Infer ubiquitous species from term frequency when the caller did not supply them;
    # clustered species are excluded from the ubiquitous set.
    if not ub_s_ids:
        frequent_ch_ids = get_frequent_term_ids(model, threshold)
        ub_s_ids = select_metabolite_ids_by_term_ids(model, frequent_ch_ids) - set(s_id2clu.keys())
    # unmapped_s_ids = {s_id for s_id in unmapped_s_ids if s_id not in s_id2clu}
    # infer_clusters(model, unmapped_s_ids, s_id2clu, species_id2chebi_id, ub_chebi_ids)
    return s_id2clu, ub_s_ids
def write_detailed_r_id2c(model, r_id2c, f):
    """
    Write reactions grouped by coefficient to the open file handle *f*.

    Groups are ordered by decreasing absolute coefficient (positive before
    negative on ties); within a group, reaction ids are sorted. Each line holds
    the coefficient, the reaction id, and its formula; groups are separated by
    a blank line.
    """
    coeff2r_ids = invert_map(r_id2c)
    ordered_groups = sorted(coeff2r_ids.items(), key=lambda entry: (-abs(entry[0]), -entry[0]))
    for coeff, reaction_ids in ordered_groups:
        for reaction_id in sorted(reaction_ids):
            formula = get_sbml_r_formula(model, model.getReaction(reaction_id),
                                         show_compartments=False, show_metabolite_ids=True)
            f.write('%g\t%s:\t%s\n' % (coeff, reaction_id, formula))
        f.write('\n')
def in_species_conflict(term, candidate_sps, proposal_s_id2clu):
    """
    Return True when placing any of *candidate_sps* into the (compartment, term)
    cluster would put it together with a species sharing a reaction with it.

    Relies on `s_id2r_ids` and `clu2s_ids` from the enclosing scope.
    """
    proposal_clu2s_ids = invert_map(proposal_s_id2clu)
    for species in candidate_sps:
        target_clu = species.getCompartment(), term
        species_r_ids = set(s_id2r_ids[species.getId()])
        # Existing members plus members added by the current proposal.
        for member_s_id in clu2s_ids[target_clu] | proposal_clu2s_ids[target_clu]:
            if species_r_ids & set(s_id2r_ids[member_s_id]):
                return True
    return False
def serialize_generalization(r_id2clu, s_id2clu, sbml, chebi, path):
    """
    Serializes metabolite groups of a generalized model to an xlsx worksheet.

    NOTE(review): Python 2 only — the sort key uses tuple-unpacking lambda syntax.
    NOTE(review): `path` is unused and the workbook is never saved in the visible
    code — presumably the function continues beyond this excerpt; confirm.
    NOTE(review): create_sheet(0, "...") relies on the old openpyxl
    (index, title) signature — verify against the pinned openpyxl version.

    :param r_id2clu: dict {reaction_id: cluster} (unused in the visible code)
    :param s_id2clu: dict {species_id: (group_id, ChEBI term)}
    :param sbml: str, path to the SBML file with the groups extension
    :param chebi: ChEBI ontology
    :param path: str, output path (unused in the visible code)
    """
    doc = libsbml.SBMLReader().readSBML(sbml)
    model = doc.getModel()
    groups_plugin = model.getPlugin("groups")
    clu2r_ids, clu2s_ids = invert_map(r_id2clu), invert_map(s_id2clu)
    wb = openpyxl.Workbook()
    ws = wb.create_sheet(0, "Metabolite Groups")
    row = 1
    add_values(ws, row, 1, ["Group Id", "Group Name", "Group CHEBI", "Id", "Name", "Compartment", "CHEBI"],
               HEADER_STYLE)
    row += 1
    processed_s_ids = set()
    # One header row per group (id, name, group-level ChEBI), followed by one row per member.
    for (g_id, ch_term), s_ids in sorted(clu2s_ids.items(), key=lambda ((g_id, _), s_ids): g_id):
        group = groups_plugin.getGroup(g_id)
        add_values(ws, row, 1, [g_id, group.getName(), ch_term.get_id() if ch_term else ''])
        # Members sorted by the id suffix starting at '__'.
        for s_id in sorted(s_ids, key=lambda s_id: s_id[s_id.find('__'):]):
            species = model.getSpecies(s_id)
            # Shadows the group-level ch_term with the member's own annotation.
            ch_term = get_chebi_term_by_annotation(species, chebi)
            add_values(ws, row, 4, [s_id, species.getName(),
                                    model.getCompartment(species.getCompartment()).getName(),
                                    ch_term.get_id() if ch_term else ''])
            row += 1
        processed_s_ids |= s_ids
def _log_clusters(term_id2clu, onto, model):
    """
    Log every non-singleton cluster (largest first) with human-readable member
    names, then a summary of all cluster sizes in decreasing order.
    """
    clu2term = invert_map(term_id2clu)
    cluster_sizes = []
    logging.info("-------------------\nquotient species sets:\n-------------------")

    def describe(entity_id):
        # Prefer the ontology term name, then the model species name, then the raw id.
        term = onto.get_term(entity_id)
        if term:
            return term.get_name()
        species = model.getSpecies(entity_id)
        return species.getName() if species else entity_id

    for cluster in sorted(clu2term.keys(), key=lambda c: -len(clu2term[c])):
        members = clu2term[cluster]
        if len(members) == 1:
            continue
        cluster_sizes.append(len(members))
        logging.info("(%d)\t%s\n" % (len(members), [describe(m) for m in members]))
    logging.info("Cluster sizes: %s\n-------------------\n\n" % sorted(cluster_sizes, key=lambda s: -s))
def cover_with_onto_terms(model, onto, species_id2chebi_id, term_id2clu, ubiquitous_chebi_ids, r_ids_to_ignore=None):
    """
    After pruning the ontology (update_onto), re-covers each multi-member cluster
    with ontology terms and updates term_id2clu in place; singleton clusters and
    terms left without a new cluster are removed.

    :return: bool, whether the ontology was actually updated
    """
    onto_updated = update_onto(onto, term_id2clu)
    if onto_updated:
        # Iterates a freshly inverted copy, so deleting from term_id2clu is safe.
        for clu, t_ids in invert_map(term_id2clu).items():
            if len(t_ids) == 1:
                del term_id2clu[t_ids.pop()]
            else:
                new_t_id2clu = cover_t_ids(model, species_id2chebi_id, ubiquitous_chebi_ids, t_ids, onto, clu,
                                           r_ids_to_ignore=r_ids_to_ignore)
                for t_id in t_ids:
                    if t_id in new_t_id2clu:
                        term_id2clu[t_id] = new_t_id2clu[t_id]
                    else:
                        # No cover found for this term: drop it from the clustering.
                        del term_id2clu[t_id]
    return onto_updated
def generalize_model(in_sbml, chebi, groups_sbml, out_sbml, ub_s_ids=None, ub_chebi_ids=None, ignore_biomass=True):
    """
    Generalizes a model.

    :param in_sbml: str, path to the input SBML file
    :param chebi: mod_sbml.onto.obo_ontology.Ontology ChEBI ontology
    :param groups_sbml: str, path to the output SBML file (with groups extension)
    :param out_sbml: str, path to the output SBML file (generalized)
    :param ub_s_ids: optional, ids of ubiquitous species (will be inferred if set to None)
    :param ub_chebi_ids: optional, ids of ubiquitous ChEBI terms (will be inferred if set to None)
    :param ignore_biomass: boolean, whether to ignore the biomass reaction
        (and its stoichiometry preserving constraint)
    :return: tuple (r_id2g_eq, s_id2gr_id, s_id2chebi_id, ub_s_ids):
        dict {reaction_id: reaction_group_id}, dict {species_id: species_group_id},
        dict {species_id: ChEBI_term_id}, collection of ubiquitous species_ids.
    """
    # input_model
    input_doc = libsbml.SBMLReader().readSBML(in_sbml)
    input_model = input_doc.getModel()
    r_ids_to_ignore = get_biomass_r_ids(input_model) if ignore_biomass else None
    # Model cleanup before annotation and generalization.
    remove_is_a_reactions(input_model)
    annotate_metabolites(input_model, chebi)
    # TODO: fix comp separation
    # separate_boundary_metabolites(input_model)
    remove_unused_elements(input_model)
    logging.info("mapping species to ChEBI")
    s_id2chebi_id = get_species_id2chebi_id(input_model)
    ub_chebi_ids, ub_s_ids = get_ub_elements(input_model, chebi, s_id2chebi_id, ub_chebi_ids, ub_s_ids)
    # Shrink the ontology to the terms actually used by the model.
    terms = (t for t in (chebi.get_term(t_id) for t_id in s_id2chebi_id.values()) if t)
    old_onto_len = len(chebi)
    filter_ontology(chebi, terms, relationships=EQUIVALENT_RELATIONSHIPS, min_deepness=3)
    logging.info('Filtered the ontology from %d terms to %d' % (old_onto_len, len(chebi)))
    # Ubiquity threshold: 10% of reactions, clamped to [3, UBIQUITOUS_THRESHOLD].
    threshold = min(max(3, int(0.1 * input_model.getNumReactions())), UBIQUITOUS_THRESHOLD)
    s_id2clu, ub_s_ids = generalize_species(input_model, s_id2chebi_id, ub_s_ids, chebi, ub_chebi_ids, threshold,
                                            r_ids_to_ignore=r_ids_to_ignore)
    logging.info("generalized species")
    r_id2clu = generalize_reactions(input_model, s_id2clu, s_id2chebi_id, ub_chebi_ids,
                                    r_ids_to_ignore=r_ids_to_ignore)
    logging.info("generalized reactions")
    # Re-key species clusters as (compartment_id, term): each cluster value is (c_id, (term,)).
    clu2s_ids = {(c_id, term): s_ids for ((c_id, (term, )), s_ids) in invert_map(s_id2clu).items()}
    r_id2g_eq, s_id2gr_id = save_as_comp_generalized_sbml(input_model, out_sbml, groups_sbml, r_id2clu,
                                                          clu2s_ids, ub_s_ids, chebi)
    return r_id2g_eq, s_id2gr_id, s_id2chebi_id, ub_s_ids
def maximize(unmapped_s_ids, model, term_id2clu, species_id2term_id, ub_chebi_ids, r_ids_to_ignore=None):
    """
    Runs one MaximizingThread per multi-member cluster to grow the clustering,
    updating term_id2clu in place (shared across threads).

    :return: the (mutated) term_id2clu dict
    """
    clu2term_ids = invert_map(term_id2clu)
    s_id2clu = compute_s_id2clu(unmapped_s_ids, model, species_id2term_id, term_id2clu)
    r_id2clu = generalize_reactions(model, s_id2clu, species_id2term_id, ub_chebi_ids,
                                    r_ids_to_ignore=r_ids_to_ignore)
    thrds = []
    for (clu, term_ids) in clu2term_ids.items():
        if len(term_ids) <= 1:
            continue
        thread = MaximizingThread(model, term_ids, species_id2term_id, clu, term_id2clu, s_id2clu,
                                  ub_chebi_ids, r_id2clu, r_ids_to_ignore=r_ids_to_ignore)
        thrds.append(thread)
        thread.start()  # This actually causes the thread to run
    for th in thrds:
        th.join()  # This waits until the thread has completed
    return term_id2clu
def update_onto(onto, term_id2clu):
    """
    Removes from the ontology every common ancestor shared by more than one
    cluster (together with its generalized ancestors and equivalents), so that
    clusters no longer collide on the same ancestor.

    :param onto: ontology, modified in place
    :param term_id2clu: dict {term_id: cluster}
    :return: bool, whether any term was removed
    """
    ancestors = []
    clu2t_ids = invert_map(term_id2clu)
    # Gather common ancestors per multi-member cluster; duplicates across
    # clusters mean the ancestor is ambiguous.
    for clu, t_ids in clu2t_ids.items():
        if len(t_ids) <= 1:
            continue
        terms = {onto.get_term(t_id) for t_id in t_ids if onto.get_term(t_id)}
        if terms:
            ancestors.extend(set(onto.common_points(terms, relationships=EQUIVALENT_RELATIONSHIPS)))
    removed_something = False
    count = Counter(ancestors)
    for t in (t for t in count.keys() if count[t] > 1):
        # if this term has been already removed as an ancestor/equivalent of another term
        if not onto.get_term(t.get_id()):
            continue
        for it in onto.get_generalized_ancestors(t, relationships=EQUIVALENT_RELATIONSHIPS):
            onto.remove_term(it, True)
        for it in onto.get_equivalents(t, relationships=EQUIVALENT_RELATIONSHIPS):
            onto.remove_term(it, True)
        onto.remove_term(t, True)
        removed_something = True
    return removed_something
def infer_clusters(model, unmapped_s_ids, s_id2clu, s_id2term_id, ubiquitous_chebi_ids, r_ids_to_ignore=None):
    """
    Tries to place unmapped species into existing clusters by matching each
    unprocessed reaction's "vertical key" against those of already-clustered
    reactions; s_id2clu and unmapped_s_ids are updated in place.

    NOTE(review): deliberately disabled — the bare `return` below makes the
    whole body unreachable (see the TODO). A second identical definition of
    this function appears later in this module.
    """
    # TODO: double check it
    return
    if not unmapped_s_ids:
        return
    term_id2s_ids = invert_map(s_id2term_id)
    clu2s_ids = invert_map(s_id2clu)
    # Keep only vertical keys shared by more than one reaction.
    vk2r_ids = get_vk2r_ids(model, s_id2clu, s_id2term_id, ubiquitous_chebi_ids, r_ids_to_ignore=r_ids_to_ignore)
    vk2r_ids = {vk: r_ids for (vk, r_ids) in vk2r_ids.items() if len(r_ids) > 1}
    simplified_vk2vk_set = defaultdict(set)
    for vk in vk2r_ids.keys():
        simplified_vk2vk_set[vertical_key2simplified_vertical_key(vk)].add(vk)
    # Map each species to the reactions (with >2 participants) it takes part in.
    s_id2r_ids = defaultdict(list)
    for r in (r for r in model.getListOfReactions() if r.getNumReactants() + r.getNumProducts() > 2):
        if r_ids_to_ignore and r.getId() in r_ids_to_ignore:
            continue
        r_id = r.getId()
        for s_id in chain((species_ref.getSpecies() for species_ref in r.getListOfReactants()),
                          (species_ref.getSpecies() for species_ref in r.getListOfProducts())):
            s_id2r_ids[s_id].append(r_id)

    def in_species_conflict(term, candidate_sps, proposal_s_id2clu):
        # True when a candidate would share a reaction with a member of the target cluster.
        proposal_clu2s_ids = invert_map(proposal_s_id2clu)
        for s in candidate_sps:
            s_clu = s.getCompartment(), term
            rs = {r_id for r_id in s_id2r_ids[s.getId()]}
            clu_s_ids = clu2s_ids[s_clu] | proposal_clu2s_ids[s_clu]
            for clu_s_id in clu_s_ids:
                if {r_id for r_id in s_id2r_ids[clu_s_id]} & rs:
                    return True
        return False

    processed_r_ids = reduce(lambda s1, s2: s1 | s2, vk2r_ids.values(), set())
    for r in model.getListOfReactions():
        if r.getId() in processed_r_ids or not unmapped_s_ids & get_metabolites(r):
            continue
        if r_ids_to_ignore and r.getId() in r_ids_to_ignore:
            continue
        ub_rs, ub_ps, rs, ps = get_vertical_key(model, r, s_id2clu, s_id2term_id, ubiquitous_chebi_ids)
        vk = ub_rs, ub_ps, rs, ps
        rs, ps = set(rs), set(ps)
        # Participants already mapped (not in unmapped_s_ids).
        partial_rs, partial_ps = {(s_id, c_id) for (s_id, c_id) in rs if s_id not in unmapped_s_ids}, \
                                 {(s_id, c_id) for (s_id, c_id) in ps if s_id not in unmapped_s_ids}
        if len(ub_rs) + len(ub_ps) + len(partial_rs) + len(partial_ps) < 2:
            continue
        simplified_vk = vertical_key2simplified_vertical_key(vk)
        if simplified_vk in simplified_vk2vk_set:
            ub_rs, ub_ps = tuple(sorted(ub_rs)), tuple(sorted(ub_ps))
            for (vk_ub_rs, vk_ub_ps, vk_rs, vk_ps) in simplified_vk2vk_set[simplified_vk]:
                # Replace known species ids with their clusters in the candidate key.
                vk_rs, vk_ps = {(s_id if s_id not in s_id2clu else s_id2clu[s_id], c_id)
                                for (s_id, c_id) in vk_rs}, \
                               {(s_id if s_id not in s_id2clu else s_id2clu[s_id], c_id)
                                for (s_id, c_id) in vk_ps}
                proposal = {}
                # If the keys match in the reversed direction, flip the candidate.
                if vk_ub_rs == ub_ps and vk_ub_ps == ub_rs and not partial_rs - vk_ps and not partial_ps - vk_rs:
                    vk_ub_rs, vk_ub_ps, partial_rs, partial_ps = vk_ub_ps, vk_ub_rs, partial_ps, partial_rs
                if vk_ub_rs == ub_rs and vk_ub_ps == ub_ps and not partial_rs - vk_rs and not partial_ps - vk_ps:
                    r_s_ids = rs - vk_rs
                    p_s_ids = ps - vk_ps
                    if 0 < len(r_s_ids) <= 1 and 0 < len(p_s_ids) <= 1 and r_s_ids or p_s_ids:
                        if r_s_ids and vk_rs - rs:
                            s_id, c_id = r_s_ids.pop()
                            # if it is not a species id but a cluster, continue
                            if not isinstance(s_id, str):
                                continue
                            clu, c_id = (vk_rs - rs).pop()
                            # if it is a species id instead of a cluster, continue
                            if not isinstance(clu, tuple):
                                continue
                            candidate_sps = {model.getSpecies(it) for it in
                                             (term_id2s_ids[s_id2term_id[s_id]]
                                              if s_id in s_id2term_id else {s_id})}
                            comp, term = clu
                            if not in_species_conflict(term, candidate_sps, proposal):
                                for s in candidate_sps:
                                    proposal[s.getId()] = s.getCompartment(), term
                            else:
                                continue
                        if p_s_ids and vk_ps - ps:
                            s_id, c_id = p_s_ids.pop()
                            # if it is not a species id but a cluster, continue
                            if not isinstance(s_id, str):
                                continue
                            clu, c_id = (vk_ps - ps).pop()
                            # if it is a species id instead of a cluster, continue
                            if not isinstance(clu, tuple):
                                continue
                            candidate_sps = {model.getSpecies(it) for it in
                                             (term_id2s_ids[s_id2term_id[s_id]]
                                              if s_id in s_id2term_id else {s_id})}
                            comp, term = clu
                            if not in_species_conflict(term, candidate_sps, proposal):
                                for s in candidate_sps:
                                    proposal[s.getId()] = s.getCompartment(), term
                            else:
                                continue
                if proposal:
                    # Commit the accepted placements.
                    s_id2clu.update(proposal)
                    for s_id, clu in proposal.items():
                        clu2s_ids[clu].add(s_id)
                    unmapped_s_ids -= set(proposal.keys())
def suggest_clusters(model, unmapped_s_ids, term_id2clu, s_id2term_id, ubiquitous_chebi_ids, r_ids_to_ignore=None):
    """
    Suggests cluster assignments for unmapped species by matching reaction
    "vertical keys", writing proposals into term_id2clu in place.

    NOTE(review): deliberately disabled — the bare `return` below makes the
    whole body unreachable (see the TODO). A further copy of this function
    starts at the end of this module.
    """
    # TODO: double check it
    return
    if not unmapped_s_ids:
        return
    s_id2clu = compute_s_id2clu(set(), model, s_id2term_id, term_id2clu)
    term_id2s_ids = invert_map(s_id2term_id)
    # Keep only vertical keys shared by more than one reaction.
    vk2r_ids = get_vk2r_ids(model, s_id2clu, s_id2term_id, ubiquitous_chebi_ids, r_ids_to_ignore=r_ids_to_ignore)
    vk2r_ids = {vk: r_ids for (vk, r_ids) in vk2r_ids.items() if len(r_ids) > 1}
    processed_r_ids = reduce(lambda s1, s2: s1 | s2, vk2r_ids.values(), set())
    s_vk2vk = defaultdict(set)
    for vk in vk2r_ids.keys():
        s_vk2vk[vertical_key2simplified_vertical_key(vk)].add(vk)
    # Map each species to the reactions (with >2 participants) it takes part in.
    s_id2r_ids = defaultdict(list)
    for r in (r for r in model.getListOfReactions() if r.getNumReactants() + r.getNumProducts() > 2):
        if r_ids_to_ignore and r.getId() in r_ids_to_ignore:
            continue
        r_id = r.getId()
        for s_id in chain((species_ref.getSpecies() for species_ref in r.getListOfReactants()),
                          (species_ref.getSpecies() for species_ref in r.getListOfProducts())):
            s_id2r_ids[s_id].append(r_id)
    for r in model.getListOfReactions():
        if r.getId() in processed_r_ids or not unmapped_s_ids & get_metabolites(r):
            continue
        if r_ids_to_ignore and r.getId() in r_ids_to_ignore:
            continue
        ub_rs, ub_ps, rs, ps = get_vertical_key(model, r, s_id2clu, s_id2term_id, ubiquitous_chebi_ids)
        vk = ub_rs, ub_ps, rs, ps
        rs, ps = set(rs), set(ps)
        # Participants already mapped (not in unmapped_s_ids).
        partial_rs, partial_ps = {(s_id, c_id) for (s_id, c_id) in rs if s_id not in unmapped_s_ids}, \
                                 {(s_id, c_id) for (s_id, c_id) in ps if s_id not in unmapped_s_ids}
        if vk in vk2r_ids or len(ub_rs) + len(ub_ps) + len(partial_rs) + len(partial_ps) < 2:
            continue
        s_vk = vertical_key2simplified_vertical_key(vk)
        if s_vk in s_vk2vk:
            ub_rs, ub_ps = tuple(sorted(ub_rs)), tuple(sorted(ub_ps))
            for (vk_ub_rs, vk_ub_ps, vk_rs, vk_ps) in s_vk2vk[s_vk]:
                # Replace known species ids with their clusters in the candidate key.
                vk_rs, vk_ps = {(s_id if s_id not in s_id2clu else s_id2clu[s_id], c_id)
                                for (s_id, c_id) in vk_rs}, \
                               {(s_id if s_id not in s_id2clu else s_id2clu[s_id], c_id)
                                for (s_id, c_id) in vk_ps}
                proposal = {}
                # If the keys match in the reversed direction, flip the candidate.
                if vk_ub_rs == ub_ps and vk_ub_ps == ub_rs and not partial_rs - vk_ps and not partial_ps - vk_rs:
                    vk_ub_rs, vk_ub_ps, partial_rs, partial_ps = vk_ub_ps, vk_ub_rs, partial_ps, partial_rs
                if vk_ub_rs == ub_rs and vk_ub_ps == ub_ps and not partial_rs - vk_rs and not partial_ps - vk_ps:
                    r_s_ids = rs - vk_rs
                    p_s_ids = ps - vk_ps
                    if 0 < len(r_s_ids) <= 1 and 0 < len(p_s_ids) <= 1 and r_s_ids or p_s_ids:
                        if r_s_ids and vk_rs - rs:
                            s_id, c_id = r_s_ids.pop()
                            clu, c_id = (vk_rs - rs).pop()
                            # if it is a species id instead of a cluster, continue
                            if not isinstance(clu, tuple):
                                continue
                            # if it is not a species id but a cluster, continue
                            if not isinstance(s_id, str):
                                continue
                            candidate_sps = {model.getSpecies(sp_id) for sp_id in
                                             (term_id2s_ids[s_id2term_id[s_id]]
                                              if s_id in s_id2term_id else [s_id])}
                            comp, term = clu
                            for s in candidate_sps:
                                proposal[s.getId()] = term
                        if p_s_ids and vk_ps - ps:
                            s_id, c_id = p_s_ids.pop()
                            clu, c_id = (vk_ps - ps).pop()
                            # if it is a species id instead of a cluster, continue
                            if not isinstance(clu, tuple):
                                continue
                            # if it is not a species id but a cluster, continue
                            if not isinstance(s_id, str):
                                continue
                            candidate_sps = {model.getSpecies(it) for it in
                                             (term_id2s_ids[s_id2term_id[s_id]]
                                              if s_id in s_id2term_id else {s_id})}
                            comp, term = clu
                            for s in candidate_sps:
                                proposal[s.getId()] = term
                if proposal:
                    for s_id, clu in proposal.items():
                        # Normalize to a tuple-shaped cluster before storing.
                        term_id2clu[s_id] = (clu, ) if not (isinstance(clu, tuple)) else clu
                        unmapped_s_ids -= {s_id}
def save_as_comp_generalized_sbml(input_model, out_sbml, groups_sbml, r_id2clu, clu2s_ids, ub_sps, onto):
    """
    Serializes the generalization: optionally writes a generalized SBML model
    (out_sbml) and/or an SBML file annotated with groups (groups_sbml).

    :param input_model: libsbml.Model, the original model
    :param out_sbml: str or falsy, path for the generalized SBML output
    :param groups_sbml: str or falsy, path for the groups-annotated SBML output
    :param r_id2clu: dict {reaction_id: cluster}
    :param clu2s_ids: dict {(compartment_id, term): species_ids}
    :param ub_sps: collection of ubiquitous species ids
    :param onto: ChEBI ontology
    :return: tuple (r_id2g_eq, s_id2gr_id): dicts mapping reaction/species ids
        to their (group_id, name/term) pairs
    """
    logging.info("serializing generalization")
    s_id_increment, r_id_increment = 0, 0
    if groups_sbml:
        doc = convert_to_lev3_v1(input_model)
        groups_model = doc.getModel()
        groups_plugin = groups_model.getPlugin("groups")
        if groups_plugin:
            logging.info(" saving ubiquitous species annotations")
            s_group = groups_plugin.createGroup()
            s_group.setId("g_ubiquitous_sps")
            s_group.setKind(libsbml.GROUP_KIND_COLLECTION)
            s_group.setSBOTerm(SBO_CHEMICAL_MACROMOLECULE)
            s_group.setName("ubiquitous species")
            for s_id in ub_sps:
                member = s_group.createMember()
                member.setIdRef(s_id)
            add_annotation(s_group, libsbml.BQB_IS_DESCRIBED_BY, GROUP_TYPE_UBIQUITOUS)
    if out_sbml:
        # generalized model
        generalized_doc = libsbml.SBMLDocument(input_model.getSBMLNamespaces())
        generalized_model = generalized_doc.createModel()
        copy_elements(input_model, generalized_model)
    r_id2g_eq, s_id2gr_id = {}, {}
    if not clu2s_ids:
        logging.info(" nothing to serialize")
    else:
        clu2r_ids = invert_map(r_id2clu)
        logging.info(" creating species groups")
        for ((c_id, t), s_ids) in clu2s_ids.items():
            comp = input_model.getCompartment(c_id)
            if len(s_ids) > 1:
                # Resolve the cluster term; fall back to a name built from member names.
                t = onto.get_term(t)
                t_name, t_id = (t.get_name(), t.get_id()) if t \
                    else (' or '.join(input_model.getSpecies(s_id).getName() for s_id in s_ids), None)
                if not t_id:
                    t = t_name
                if out_sbml:
                    # One new generalized species represents the whole cluster.
                    new_species = create_species(model=generalized_model, compartment_id=comp.getId(),
                                                 type_id=None,
                                                 name="{0} ({1}) [{2}]".format(t_name, len(s_ids),
                                                                               comp.getName()))
                    add_annotation(new_species, libsbml.BQB_IS, t_id, CHEBI_PREFIX)
                    new_s_id = new_species.getId()
                else:
                    s_id_increment += 1
                    new_s_id = generate_unique_id(input_model, "s_g_", s_id_increment)
                for s_id in s_ids:
                    s_id2gr_id[s_id] = new_s_id, t
                if groups_sbml and groups_plugin:
                    # save as a group
                    s_group = groups_plugin.createGroup()
                    s_group.setId(new_s_id)
                    s_group.setKind(libsbml.GROUP_KIND_CLASSIFICATION)
                    s_group.setSBOTerm(SBO_CHEMICAL_MACROMOLECULE)
                    g_name = "{0} [{1}]".format(t_name, comp.getName())
                    s_group.setName(g_name)
                    # logging.info("%s: %d" % (g_name, len(s_ids)))
                    if t_id:
                        add_annotation(s_group, libsbml.BQB_IS, t_id, CHEBI_PREFIX)
                    for s_id in s_ids:
                        member = s_group.createMember()
                        member.setIdRef(s_id)
                    add_annotation(s_group, libsbml.BQB_IS_DESCRIBED_BY, GROUP_TYPE_EQUIV)
        # Map a species id to its group id when grouped, else keep it as is.
        generalize_species = lambda species_id: s_id2gr_id[species_id][0] if (species_id in s_id2gr_id) \
            else species_id
        s_id_to_generalize = set(s_id2gr_id.keys())
        logging.info(" creating reaction groups")
        for clu, r_ids in clu2r_ids.items():
            representative = input_model.getReaction(list(r_ids)[0])
            r_name = "generalized %s" % representative.getName()
            if out_sbml:
                reactants = dict(get_reactants(representative, stoichiometry=True))
                products = dict(get_products(representative, stoichiometry=True))
                # A singleton group touching no generalized species is copied verbatim.
                if (len(r_ids) == 1) and \
                        not ((set(reactants.keys()) | set(products.keys())) & s_id_to_generalize):
                    generalized_model.addReaction(representative)
                    continue
                r_id2st = {generalize_species(it): st for (it, st) in reactants.items()}
                p_id2st = {generalize_species(it): st for (it, st) in products.items()}
                # Reversible only if every member reaction is reversible.
                reversible = next((False for r_id in r_ids
                                   if not input_model.getReaction(r_id).getReversible()), True)
                new_r_id = create_reaction(generalized_model, r_id2st, p_id2st, name=r_name,
                                           reversible=reversible,
                                           id_=representative.getId() if len(r_ids) == 1 else None).getId()
            elif len(r_ids) > 1:
                r_id_increment += 1
                new_r_id = generate_unique_id(input_model, "r_g_", r_id_increment)
            if len(r_ids) > 1:
                for r_id in r_ids:
                    r_id2g_eq[r_id] = new_r_id, r_name
                if groups_sbml and groups_plugin:
                    # save as a group
                    r_group = groups_plugin.createGroup()
                    r_group.setId(new_r_id)
                    r_group.setKind(libsbml.GROUP_KIND_COLLECTION)
                    r_group.setSBOTerm(SBO_BIOCHEMICAL_REACTION)
                    r_group.setName(r_name)
                    for r_id in r_ids:
                        member = r_group.createMember()
                        member.setIdRef(r_id)
                    add_annotation(r_group, libsbml.BQB_IS_DESCRIBED_BY, GROUP_TYPE_EQUIV)
    if out_sbml:
        remove_unused_elements(generalized_model)
        save_as_sbml(generalized_model, out_sbml)
    if groups_sbml and groups_model:
        save_as_sbml(groups_model, groups_sbml)
    logging.info("serialized to " + groups_sbml)
    return r_id2g_eq, s_id2gr_id
def infer_clusters(model, unmapped_s_ids, s_id2clu, s_id2term_id, ubiquitous_chebi_ids, r_ids_to_ignore=None):
    """
    Tries to place unmapped species into existing clusters by matching each
    unprocessed reaction's "vertical key" against those of already-clustered
    reactions; s_id2clu and unmapped_s_ids are updated in place.

    NOTE(review): deliberately disabled — the bare `return` below makes the
    whole body unreachable (see the TODO). This is a duplicate of an earlier
    identical definition in this module; the later one wins at import time.
    """
    # TODO: double check it
    return
    if not unmapped_s_ids:
        return
    term_id2s_ids = invert_map(s_id2term_id)
    clu2s_ids = invert_map(s_id2clu)
    # Keep only vertical keys shared by more than one reaction.
    vk2r_ids = get_vk2r_ids(model, s_id2clu, s_id2term_id, ubiquitous_chebi_ids, r_ids_to_ignore=r_ids_to_ignore)
    vk2r_ids = {vk: r_ids for (vk, r_ids) in vk2r_ids.items() if len(r_ids) > 1}
    simplified_vk2vk_set = defaultdict(set)
    for vk in vk2r_ids.keys():
        simplified_vk2vk_set[vertical_key2simplified_vertical_key(vk)].add(vk)
    # Map each species to the reactions (with >2 participants) it takes part in.
    s_id2r_ids = defaultdict(list)
    for r in (r for r in model.getListOfReactions() if r.getNumReactants() + r.getNumProducts() > 2):
        if r_ids_to_ignore and r.getId() in r_ids_to_ignore:
            continue
        r_id = r.getId()
        for s_id in chain((species_ref.getSpecies() for species_ref in r.getListOfReactants()),
                          (species_ref.getSpecies() for species_ref in r.getListOfProducts())):
            s_id2r_ids[s_id].append(r_id)

    def in_species_conflict(term, candidate_sps, proposal_s_id2clu):
        # True when a candidate would share a reaction with a member of the target cluster.
        proposal_clu2s_ids = invert_map(proposal_s_id2clu)
        for s in candidate_sps:
            s_clu = s.getCompartment(), term
            rs = {r_id for r_id in s_id2r_ids[s.getId()]}
            clu_s_ids = clu2s_ids[s_clu] | proposal_clu2s_ids[s_clu]
            for clu_s_id in clu_s_ids:
                if {r_id for r_id in s_id2r_ids[clu_s_id]} & rs:
                    return True
        return False

    processed_r_ids = reduce(lambda s1, s2: s1 | s2, vk2r_ids.values(), set())
    for r in model.getListOfReactions():
        if r.getId() in processed_r_ids or not unmapped_s_ids & get_metabolites(r):
            continue
        if r_ids_to_ignore and r.getId() in r_ids_to_ignore:
            continue
        ub_rs, ub_ps, rs, ps = get_vertical_key(model, r, s_id2clu, s_id2term_id, ubiquitous_chebi_ids)
        vk = ub_rs, ub_ps, rs, ps
        rs, ps = set(rs), set(ps)
        # Participants already mapped (not in unmapped_s_ids).
        partial_rs, partial_ps = {(s_id, c_id) for (s_id, c_id) in rs if s_id not in unmapped_s_ids}, \
                                 {(s_id, c_id) for (s_id, c_id) in ps if s_id not in unmapped_s_ids}
        if len(ub_rs) + len(ub_ps) + len(partial_rs) + len(partial_ps) < 2:
            continue
        simplified_vk = vertical_key2simplified_vertical_key(vk)
        if simplified_vk in simplified_vk2vk_set:
            ub_rs, ub_ps = tuple(sorted(ub_rs)), tuple(sorted(ub_ps))
            for (vk_ub_rs, vk_ub_ps, vk_rs, vk_ps) in simplified_vk2vk_set[simplified_vk]:
                # Replace known species ids with their clusters in the candidate key.
                vk_rs, vk_ps = {(s_id if s_id not in s_id2clu else s_id2clu[s_id], c_id)
                                for (s_id, c_id) in vk_rs}, \
                               {(s_id if s_id not in s_id2clu else s_id2clu[s_id], c_id)
                                for (s_id, c_id) in vk_ps}
                proposal = {}
                # If the keys match in the reversed direction, flip the candidate.
                if vk_ub_rs == ub_ps and vk_ub_ps == ub_rs and not partial_rs - vk_ps and not partial_ps - vk_rs:
                    vk_ub_rs, vk_ub_ps, partial_rs, partial_ps = vk_ub_ps, vk_ub_rs, partial_ps, partial_rs
                if vk_ub_rs == ub_rs and vk_ub_ps == ub_ps and not partial_rs - vk_rs and not partial_ps - vk_ps:
                    r_s_ids = rs - vk_rs
                    p_s_ids = ps - vk_ps
                    if 0 < len(r_s_ids) <= 1 and 0 < len(p_s_ids) <= 1 and r_s_ids or p_s_ids:
                        if r_s_ids and vk_rs - rs:
                            s_id, c_id = r_s_ids.pop()
                            # if it is not a species id but a cluster, continue
                            if not isinstance(s_id, str):
                                continue
                            clu, c_id = (vk_rs - rs).pop()
                            # if it is a species id instead of a cluster, continue
                            if not isinstance(clu, tuple):
                                continue
                            candidate_sps = {model.getSpecies(it) for it in
                                             (term_id2s_ids[s_id2term_id[s_id]]
                                              if s_id in s_id2term_id else {s_id})}
                            comp, term = clu
                            if not in_species_conflict(term, candidate_sps, proposal):
                                for s in candidate_sps:
                                    proposal[s.getId()] = s.getCompartment(), term
                            else:
                                continue
                        if p_s_ids and vk_ps - ps:
                            s_id, c_id = p_s_ids.pop()
                            # if it is not a species id but a cluster, continue
                            if not isinstance(s_id, str):
                                continue
                            clu, c_id = (vk_ps - ps).pop()
                            # if it is a species id instead of a cluster, continue
                            if not isinstance(clu, tuple):
                                continue
                            candidate_sps = {model.getSpecies(it) for it in
                                             (term_id2s_ids[s_id2term_id[s_id]]
                                              if s_id in s_id2term_id else {s_id})}
                            comp, term = clu
                            if not in_species_conflict(term, candidate_sps, proposal):
                                for s in candidate_sps:
                                    proposal[s.getId()] = s.getCompartment(), term
                            else:
                                continue
                if proposal:
                    # Commit the accepted placements.
                    s_id2clu.update(proposal)
                    for s_id, clu in proposal.items():
                        clu2s_ids[clu].add(s_id)
                    unmapped_s_ids -= set(proposal.keys())
def generalize_model(in_sbml, chebi, groups_sbml, out_sbml, ub_s_ids=None, ub_chebi_ids=None, ignore_biomass=True):
    """
    Generalizes a model.

    NOTE(review): this module contains an earlier, identical definition of
    generalize_model; this later definition is the one that takes effect.

    :param in_sbml: str, path to the input SBML file
    :param chebi: mod_sbml.onto.obo_ontology.Ontology ChEBI ontology
    :param groups_sbml: str, path to the output SBML file (with groups extension)
    :param out_sbml: str, path to the output SBML file (generalized)
    :param ub_s_ids: optional, ids of ubiquitous species (will be inferred if set to None)
    :param ub_chebi_ids: optional, ids of ubiquitous ChEBI terms (will be inferred if set to None)
    :param ignore_biomass: boolean, whether to ignore the biomass reaction
        (and its stoichiometry preserving constraint)
    :return: tuple (r_id2g_eq, s_id2gr_id, s_id2chebi_id, ub_s_ids):
        dict {reaction_id: reaction_group_id}, dict {species_id: species_group_id},
        dict {species_id: ChEBI_term_id}, collection of ubiquitous species_ids.
    """
    # input_model
    input_doc = libsbml.SBMLReader().readSBML(in_sbml)
    input_model = input_doc.getModel()
    r_ids_to_ignore = get_biomass_r_ids(input_model) if ignore_biomass else None
    # Model cleanup before annotation and generalization.
    remove_is_a_reactions(input_model)
    annotate_metabolites(input_model, chebi)
    # TODO: fix comp separation
    # separate_boundary_metabolites(input_model)
    remove_unused_elements(input_model)
    logging.info("mapping species to ChEBI")
    s_id2chebi_id = get_species_id2chebi_id(input_model)
    ub_chebi_ids, ub_s_ids = get_ub_elements(input_model, chebi, s_id2chebi_id, ub_chebi_ids, ub_s_ids)
    # Shrink the ontology to the terms actually used by the model.
    terms = (t for t in (chebi.get_term(t_id) for t_id in s_id2chebi_id.values()) if t)
    old_onto_len = len(chebi)
    filter_ontology(chebi, terms, relationships=EQUIVALENT_RELATIONSHIPS, min_deepness=3)
    logging.info('Filtered the ontology from %d terms to %d' % (old_onto_len, len(chebi)))
    # Ubiquity threshold: 10% of reactions, clamped to [3, UBIQUITOUS_THRESHOLD].
    threshold = min(max(3, int(0.1 * input_model.getNumReactions())), UBIQUITOUS_THRESHOLD)
    s_id2clu, ub_s_ids = generalize_species(input_model, s_id2chebi_id, ub_s_ids, chebi, ub_chebi_ids, threshold,
                                            r_ids_to_ignore=r_ids_to_ignore)
    logging.info("generalized species")
    r_id2clu = generalize_reactions(input_model, s_id2clu, s_id2chebi_id, ub_chebi_ids,
                                    r_ids_to_ignore=r_ids_to_ignore)
    logging.info("generalized reactions")
    # Re-key species clusters as (compartment_id, term): each cluster value is (c_id, (term,)).
    clu2s_ids = {(c_id, term): s_ids for ((c_id, (term, )), s_ids) in invert_map(s_id2clu).items()}
    r_id2g_eq, s_id2gr_id = save_as_comp_generalized_sbml(input_model, out_sbml, groups_sbml, r_id2clu,
                                                          clu2s_ids, ub_s_ids, chebi)
    return r_id2g_eq, s_id2gr_id, s_id2chebi_id, ub_s_ids
def filter_clu_to_terms(term2clu):
    """
    Remove singleton clusters from *term2clu* in place: any term that is the
    sole member of its cluster is dropped from the mapping.
    """
    # invert_map yields a fresh dict, so deleting from term2clu while
    # iterating it is safe.
    for member_terms in invert_map(term2clu).values():
        if len(member_terms) == 1:
            lone_term = member_terms.pop()
            del term2clu[lone_term]
def suggest_clusters(model, unmapped_s_ids, term_id2clu, s_id2term_id, ubiquitous_chebi_ids, r_ids_to_ignore=None):
    """
    Tries to propose clusters for species that could not be mapped to ChEBI terms,
    by matching each unprocessed reaction against already-clustered reactions that share
    the same simplified "vertical key" (ubiquitous reactants/products plus specific ones).
    Updates term_id2clu and unmapped_s_ids in place.

    NOTE(review): the bare ``return`` right below deliberately disables this whole
    function — everything after it is dead code. Remove the ``return`` only after
    validating the logic (see the TODO).

    :param model: libsbml Model
    :param unmapped_s_ids: set of species ids with no ChEBI mapping (mutated in place)
    :param term_id2clu: dict {term_id_or_s_id: cluster} (mutated in place)
    :param s_id2term_id: dict {species_id: ChEBI term id}
    :param ubiquitous_chebi_ids: collection of ubiquitous ChEBI term ids
    :param r_ids_to_ignore: optional collection of reaction ids to skip
    """
    # TODO: double check it
    return
    if not unmapped_s_ids:
        return
    s_id2clu = compute_s_id2clu(set(), model, s_id2term_id, term_id2clu)
    term_id2s_ids = invert_map(s_id2term_id)
    # Vertical key -> reaction ids; keep only keys shared by more than one reaction.
    vk2r_ids = get_vk2r_ids(model, s_id2clu, s_id2term_id, ubiquitous_chebi_ids, r_ids_to_ignore=r_ids_to_ignore)
    vk2r_ids = {vk: r_ids for (vk, r_ids) in vk2r_ids.items() if len(r_ids) > 1}
    processed_r_ids = reduce(lambda s1, s2: s1 | s2, vk2r_ids.values(), set())
    # Group full vertical keys by their simplified form for fuzzy matching below.
    s_vk2vk = defaultdict(set)
    for vk in vk2r_ids.keys():
        s_vk2vk[vertical_key2simplified_vertical_key(vk)].add(vk)
    # Index species to the (non-trivial) reactions they participate in.
    s_id2r_ids = defaultdict(list)
    for r in (r for r in model.getListOfReactions() if r.getNumReactants() + r.getNumProducts() > 2):
        if r_ids_to_ignore and r.getId() in r_ids_to_ignore:
            continue
        r_id = r.getId()
        for s_id in chain((species_ref.getSpecies() for species_ref in r.getListOfReactants()),
                          (species_ref.getSpecies() for species_ref in r.getListOfProducts())):
            s_id2r_ids[s_id].append(r_id)
    for r in model.getListOfReactions():
        # Only consider reactions not already clustered that involve unmapped species.
        if r.getId() in processed_r_ids or not unmapped_s_ids & get_metabolites(r):
            continue
        if r_ids_to_ignore and r.getId() in r_ids_to_ignore:
            continue
        ub_rs, ub_ps, rs, ps = get_vertical_key(model, r, s_id2clu, s_id2term_id, ubiquitous_chebi_ids)
        vk = ub_rs, ub_ps, rs, ps
        rs, ps = set(rs), set(ps)
        # Keep only the (s_id, c_id) pairs whose species are already mapped.
        partial_rs, partial_ps = {(s_id, c_id) for (s_id, c_id) in rs if s_id not in unmapped_s_ids}, \
                                 {(s_id, c_id) for (s_id, c_id) in ps if s_id not in unmapped_s_ids}
        if vk in vk2r_ids or len(ub_rs) + len(ub_ps) + len(partial_rs) + len(partial_ps) < 2:
            continue
        s_vk = vertical_key2simplified_vertical_key(vk)
        if s_vk in s_vk2vk:
            ub_rs, ub_ps = tuple(sorted(ub_rs)), tuple(sorted(ub_ps))
            for (vk_ub_rs, vk_ub_ps, vk_rs, vk_ps) in s_vk2vk[s_vk]:
                # Replace species ids with their clusters where known.
                vk_rs, vk_ps = {(s_id if s_id not in s_id2clu else s_id2clu[s_id], c_id)
                                for (s_id, c_id) in vk_rs}, \
                               {(s_id if s_id not in s_id2clu else s_id2clu[s_id], c_id)
                                for (s_id, c_id) in vk_ps}
                proposal = {}
                # If the candidate key matches in the reverse direction, flip it around.
                if vk_ub_rs == ub_ps and vk_ub_ps == ub_rs and not partial_rs - vk_ps and not partial_ps - vk_rs:
                    vk_ub_rs, vk_ub_ps, partial_rs, partial_ps = vk_ub_ps, vk_ub_rs, partial_ps, partial_rs
                if vk_ub_rs == ub_rs and vk_ub_ps == ub_ps and not partial_rs - vk_rs and not partial_ps - vk_ps:
                    r_s_ids = rs - vk_rs
                    p_s_ids = ps - vk_ps
                    # NOTE(review): due to Python precedence this parses as
                    # ((0 < len(r_s_ids) <= 1) and (0 < len(p_s_ids) <= 1) and r_s_ids) or p_s_ids
                    # — probably `(r_s_ids or p_s_ids)` was intended; confirm before re-enabling.
                    if 0 < len(r_s_ids) <= 1 and 0 < len(p_s_ids) <= 1 and r_s_ids or p_s_ids:
                        if r_s_ids and vk_rs - rs:
                            s_id, c_id = r_s_ids.pop()
                            clu, c_id = (vk_rs - rs).pop()
                            # if it is a species id instead of a cluster, skip this candidate
                            if not isinstance(clu, tuple):
                                continue
                            # if it is a cluster instead of a species id, skip this candidate
                            if not isinstance(s_id, str):
                                continue
                            candidate_sps = {model.getSpecies(sp_id) for sp_id in
                                             (term_id2s_ids[s_id2term_id[s_id]] if s_id in s_id2term_id else [s_id])}
                            comp, term = clu
                            for s in candidate_sps:
                                proposal[s.getId()] = term
                        if p_s_ids and vk_ps - ps:
                            s_id, c_id = p_s_ids.pop()
                            clu, c_id = (vk_ps - ps).pop()
                            # if it is a species id instead of a cluster, skip this candidate
                            if not isinstance(clu, tuple):
                                continue
                            # if it is a cluster instead of a species id, skip this candidate
                            if not isinstance(s_id, str):
                                continue
                            candidate_sps = {model.getSpecies(it) for it in
                                             (term_id2s_ids[s_id2term_id[s_id]] if s_id in s_id2term_id else {s_id})}
                            comp, term = clu
                            for s in candidate_sps:
                                proposal[s.getId()] = term
                if proposal:
                    # Commit the proposed term for each species and mark it as mapped.
                    for s_id, clu in proposal.items():
                        term_id2clu[s_id] = (clu, ) if not (isinstance(clu, tuple)) else clu
                        unmapped_s_ids -= {s_id}
def save_as_comp_generalized_sbml(input_model, out_sbml, groups_sbml, r_id2clu, clu2s_ids, ub_sps, onto):
    """
    Serializes the generalization result: optionally a generalized SBML model (out_sbml)
    and/or the original model annotated with SBML groups (groups_sbml).

    :param input_model: libsbml Model, the original model
    :param out_sbml: str or falsy, path for the generalized SBML model (skipped if falsy)
    :param groups_sbml: str or falsy, path for the groups-annotated SBML model (skipped if falsy)
    :param r_id2clu: dict {reaction_id: cluster}
    :param clu2s_ids: dict {(compartment_id, term): species_ids}
    :param ub_sps: collection of ubiquitous species ids
    :param onto: ChEBI ontology (used to resolve term names/ids)
    :return: tuple (r_id2g_eq, s_id2gr_id):
        dict {reaction_id: (group_id, group_name)}, dict {species_id: (group_id, term)}
    """
    logging.info("serializing generalization")
    # Counters used to mint fresh group ids when no generalized model is created.
    s_id_increment, r_id_increment = 0, 0
    if groups_sbml:
        # Groups model: a level-3 copy of the input model carrying group annotations.
        doc = convert_to_lev3_v1(input_model)
        groups_model = doc.getModel()
        groups_plugin = groups_model.getPlugin("groups")
        if groups_plugin:
            logging.info(" saving ubiquitous species annotations")
            s_group = groups_plugin.createGroup()
            s_group.setId("g_ubiquitous_sps")
            s_group.setKind(libsbml.GROUP_KIND_COLLECTION)
            s_group.setSBOTerm(SBO_CHEMICAL_MACROMOLECULE)
            s_group.setName("ubiquitous species")
            for s_id in ub_sps:
                member = s_group.createMember()
                member.setIdRef(s_id)
            add_annotation(s_group, libsbml.BQB_IS_DESCRIBED_BY, GROUP_TYPE_UBIQUITOUS)
    if out_sbml:
        # Generalized model: a level-3 copy of the input with all reactions stripped;
        # generalized reactions are re-created below.
        generalized_doc = convert_to_lev3_v1(input_model)
        generalized_model = generalized_doc.getModel()
        for _ in range(0, generalized_model.getNumReactions()):
            generalized_model.removeReaction(0)
    r_id2g_eq, s_id2gr_id = {}, {}
    if not clu2s_ids:
        logging.info(" nothing to serialize")
    else:
        clu2r_ids = invert_map(r_id2clu)
        logging.info(" creating species groups")
        for ((c_id, t), s_ids) in clu2s_ids.items():
            comp = input_model.getCompartment(c_id)
            # Only clusters with more than one species give rise to a group.
            if len(s_ids) > 1:
                t = onto.get_term(t)
                # Fall back to a concatenation of species names when no ChEBI term is found.
                t_name, t_id = (t.get_name(), t.get_id()) if t \
                    else (' or '.join(input_model.getSpecies(s_id).getName() for s_id in s_ids), None)
                if not t_id:
                    t = t_name
                if out_sbml:
                    # Create the generalized species representing the whole cluster.
                    new_species = create_species(model=generalized_model, compartment_id=comp.getId(), type_id=None,
                                                 name="{0} ({1}) [{2}]".format(t_name, len(s_ids), comp.getName()))
                    add_annotation(new_species, libsbml.BQB_IS, t_id, CHEBI_PREFIX)
                    new_s_id = new_species.getId()
                else:
                    s_id_increment += 1
                    new_s_id = generate_unique_id(input_model, "s_g_", s_id_increment)
                for s_id in s_ids:
                    s_id2gr_id[s_id] = new_s_id, t
                if groups_sbml and groups_plugin:
                    # save as a group
                    s_group = groups_plugin.createGroup()
                    s_group.setId(new_s_id)
                    s_group.setKind(libsbml.GROUP_KIND_CLASSIFICATION)
                    s_group.setSBOTerm(SBO_CHEMICAL_MACROMOLECULE)
                    g_name = "{0} [{1}]".format(t_name, comp.getName())
                    s_group.setName(g_name)
                    # logging.info("%s: %d" % (g_name, len(s_ids)))
                    if t_id:
                        add_annotation(s_group, libsbml.BQB_IS, t_id, CHEBI_PREFIX)
                    for s_id in s_ids:
                        member = s_group.createMember()
                        member.setIdRef(s_id)
                    add_annotation(s_group, libsbml.BQB_IS_DESCRIBED_BY, GROUP_TYPE_EQUIV)
        # NOTE: this local lambda shadows the module-level generalize_species function
        # within this function's scope only.
        generalize_species = lambda species_id: s_id2gr_id[species_id][0] if (species_id in s_id2gr_id) else species_id
        s_id_to_generalize = set(s_id2gr_id.keys())
        logging.info(" creating reaction groups")
        for clu, r_ids in clu2r_ids.items():
            representative = input_model.getReaction(list(r_ids)[0])
            r_name = "generalized %s" % representative.getName()
            if out_sbml:
                reactants = dict(get_reactants(representative, stoichiometry=True))
                products = dict(get_products(representative, stoichiometry=True))
                # A singleton cluster whose species were not generalized is copied as-is.
                if (len(r_ids) == 1) and \
                        not ((set(reactants.keys()) | set(products.keys())) & s_id_to_generalize):
                    create_reaction(generalized_model, reactants, products, name=representative.getName(),
                                    reversible=representative.getReversible(), id_=representative.getId())
                    continue
                r_id2st = {generalize_species(it): st for (it, st) in reactants.items()}
                p_id2st = {generalize_species(it): st for (it, st) in products.items()}
                # The generalized reaction is reversible only if all grouped reactions are.
                reversible = next((False for r_id in r_ids if not input_model.getReaction(r_id).getReversible()),
                                  True)
                new_r_id = create_reaction(generalized_model, r_id2st, p_id2st, name=r_name, reversible=reversible,
                                           id_=representative.getId() if len(r_ids) == 1 else None).getId()
            elif len(r_ids) > 1:
                r_id_increment += 1
                new_r_id = generate_unique_id(input_model, "r_g_", r_id_increment)
            if len(r_ids) > 1:
                for r_id in r_ids:
                    r_id2g_eq[r_id] = new_r_id, r_name
                if groups_sbml and groups_plugin:
                    # save as a group
                    r_group = groups_plugin.createGroup()
                    r_group.setId(new_r_id)
                    r_group.setKind(libsbml.GROUP_KIND_COLLECTION)
                    r_group.setSBOTerm(SBO_BIOCHEMICAL_REACTION)
                    r_group.setName(r_name)
                    for r_id in r_ids:
                        member = r_group.createMember()
                        member.setIdRef(r_id)
                    add_annotation(r_group, libsbml.BQB_IS_DESCRIBED_BY, GROUP_TYPE_EQUIV)
    if out_sbml:
        remove_unused_elements(generalized_model)
        save_as_sbml(generalized_model, out_sbml)
    if groups_sbml and groups_model:
        save_as_sbml(groups_model, groups_sbml)
        logging.info("serialized to " + groups_sbml)
    return r_id2g_eq, s_id2gr_id