def set_all_depths_in_subgraph(ontology: Ontology, root_id: str, relations: List[str] = None, comparison_func=max,
                               current_depth: int = 0):
    """Recursively annotate every term of the branch rooted at *root_id* with a "depth" value.

    The depth of a node is its distance from the branch root. When a node is reachable
    through several paths, ``comparison_func`` decides which distance wins: ``max`` keeps
    the longest path, ``min`` the shortest.

    Args:
        ontology (Ontology): the ontology to annotate
        root_id (str): the ID of the root term of the branch to process
        relations (List[str]): list of relations to follow when descending to children
        comparison_func: function combining an already stored depth with the newly computed
            one when a node is visited more than once (e.g., ``max`` or ``min``)
        current_depth (int): the depth assigned to *root_id* itself
    """
    node_props = ontology.node(root_id)
    if "depth" in node_props:
        # node already visited through another path: let the comparison function decide
        node_props["depth"] = comparison_func(node_props["depth"], current_depth)
    else:
        node_props["depth"] = current_depth
    for child_id in ontology.children(node=root_id, relations=relations):
        set_all_depths_in_subgraph(ontology=ontology, root_id=child_id, relations=relations,
                                   comparison_func=comparison_func, current_depth=current_depth + 1)
def set_all_information_content_values(ontology: Ontology, relations: List[str] = None):
    """Compute and store information content ("IC") values for every term in the ontology.

    The supporting metrics each term needs (number of subsumers, number of leaves, depth)
    are computed first, starting from every root; roots that already carry a metric are
    skipped for that metric. IC itself is then set on every branch.

    Args:
        ontology (Ontology): the ontology to annotate
        relations (List[str]): list of relations to follow while traversing the graph
    """
    logger.info("calculating information content for all terms in ontology")
    root_ids = ontology.get_roots(relations=relations)
    # each metric pass is completed for all roots before the next starts, because the
    # final IC computation needs num_subsumers and num_leaves present everywhere
    for metric_name, metric_setter in (("num_subsumers", _set_num_subsumers_in_subgraph),
                                       ("num_leaves", _set_num_leaves_in_subgraph),
                                       ("depth", set_all_depths_in_subgraph)):
        for root_id in root_ids:
            if metric_name not in ontology.node(root_id):
                metric_setter(ontology=ontology, root_id=root_id, relations=relations)
    for root_id in root_ids:
        _set_information_content_in_subgraph(ontology=ontology, root_id=root_id,
                                             maxleaves=ontology.node(root_id)["num_leaves"],
                                             relations=relations)
def get_all_common_ancestors(node_ids: List[str], ontology: Ontology, min_distance_from_root: int = 0,
                             nodeids_blacklist: List[str] = None):
    """Collect the ancestors shared by the provided nodes.

    All starting nodes must belong to the same OBO namespace (read from the
    "OIO:hasOBONamespace" basic property value, when present).

    Args:
        node_ids (List[str]): the list of starting nodes
        ontology (Ontology): the ontology to which the provided nodes belong
        min_distance_from_root (int): only keep ancestors at least this deep
        nodeids_blacklist (List[str]): node ids to be excluded from the result

    Returns:
        list: (ancestor_id, ancestor_label, covered_starting_nodes) tuples for each
            ancestor covering more than one starting node, plus the starting nodes themselves

    Raises:
        ValueError: when the provided nodes are not all connected to the same root
    """
    # check if all ids are connected to the same root node
    common_root = None
    for current_id in node_ids:
        node_props = ontology.node(current_id)
        if "meta" not in node_props or "basicPropertyValues" not in node_props["meta"]:
            continue
        for prop_val in node_props["meta"]["basicPropertyValues"]:
            if prop_val["pred"] != "OIO:hasOBONamespace":
                continue
            if common_root and common_root != prop_val["val"]:
                raise ValueError(
                    "Cannot get common ancestors of nodes connected to different roots"
                )
            common_root = prop_val["val"]
    covered_by = defaultdict(list)
    for current_id in node_ids:
        for anc_id in ontology.ancestors(node=current_id, reflexive=True):
            anc_props = ontology.node(anc_id)
            anc_root = None
            if "meta" in anc_props and "basicPropertyValues" in anc_props["meta"]:
                for prop_val in anc_props["meta"]["basicPropertyValues"]:
                    if prop_val["pred"] == "OIO:hasOBONamespace":
                        anc_root = prop_val["val"]
            deep_enough = anc_props["depth"] >= min_distance_from_root
            same_root = not anc_root or anc_root == common_root
            allowed = not nodeids_blacklist or anc_id not in nodeids_blacklist
            if deep_enough and same_root and allowed:
                covered_by[anc_id].append(current_id)
    # keep ancestors covering several starting nodes, plus the starting nodes themselves
    return [(anc_id, ontology.label(anc_id), set(covered))
            for anc_id, covered in covered_by.items()
            if len(covered) > 1 or anc_id == covered[0]]
def get_best_nodes_ic(node_ids: List[str], ontology: Ontology, max_number_of_terms: int = 3,
                      min_distance_from_root: int = 0, slim_terms_ic_bonus_perc: int = 0, slim_set: set = None,
                      nodeids_blacklist: List[str] = None) -> Tuple[bool, List[Tuple[str, Set[str]]]]:
    """trim the list of terms by selecting the best combination of terms from the initial list or their common
    ancestors based on information content

    Args:
        node_ids (List[str]): the list of nodes to merge by common ancestor
        ontology (Ontology): the ontology
        max_number_of_terms (int): maximum number of terms that the set covering algorithm may select
        min_distance_from_root (int): consider only nodes at a minimum distance from root as potential candidate
            for trimming
        slim_terms_ic_bonus_perc (int): boost the IC value for terms that appear in the slim set by the provided
            percentage (e.g., 50 multiplies the IC by 1.5)
        slim_set (set): set of terms that belong to the slim for the provided ontology
        nodeids_blacklist (List[str]): a list of node ids to be excluded from common ancestors list

    Returns:
        Tuple[bool, List[Tuple[str, Set[str]]]]: whether the selected terms fail to cover all the initial nodes,
            together with the selected terms and the set of original terms that each of them covers
    """
    common_ancestors = get_all_common_ancestors(node_ids=node_ids, ontology=ontology,
                                                nodeids_blacklist=nodeids_blacklist)
    if "IC" not in ontology.node(common_ancestors[0][0]):
        logger.warning("ontology terms do not have information content values set")
        set_all_information_content_values(ontology=ontology)
    # candidates that are not original terms and sit too close to the root get value 0;
    # slim terms get their IC boosted.
    # fix: the bonus is documented as a percentage, so scale it by 1/100 — multiplying
    # by (1 + perc) would turn e.g. a 50% bonus into a 51-fold boost
    values = [0 if node[0] not in node_ids and ontology.node(node[0])["depth"] < min_distance_from_root else
              ontology.node(node[0])["IC"] * (1 + slim_terms_ic_bonus_perc / 100)
              if slim_set and node[0] in slim_set else
              ontology.node(node[0])["IC"] for node in common_ancestors]
    if slim_set and any(node[0] in slim_set for node in common_ancestors):
        logger.debug("some candidates are present in the slim set")
    # remove ancestors with zero IC: they carry no information
    common_ancestors = [common_ancestor for common_ancestor, value in zip(common_ancestors, values) if value > 0]
    values = [value for value in values if value > 0]
    best_terms = find_set_covering(subsets=common_ancestors, max_num_subsets=max_number_of_terms,
                                   value=values, ontology=ontology)
    covered_terms = {term for _, covered in best_terms for term in covered}
    return covered_terms != set(node_ids), best_terms
def _set_num_leaves_in_subgraph(ontology: Ontology, root_id: str, relations: List[str] = None): num_leaves = 0 for child_id in ontology.children(node=root_id): if "num_leaves" not in ontology.node(child_id): _set_num_leaves_in_subgraph(ontology=ontology, root_id=child_id, relations=relations) if ontology.node(child_id)["num_leaves"] == 0: num_leaves += 1 else: num_leaves += ontology.node(child_id)["num_leaves"] ontology.node(root_id)["num_leaves"] = num_leaves
def _set_tot_annots_in_subgraph(ontology: Ontology, root_id: str, relations: List[str] = None): if "tot_annot_genes" not in ontology.node(root_id): children = set(ontology.children(root_id, relations=relations)) children.discard(root_id) children = list(children) ontology.node(root_id)["tot_annot_genes"] = ontology.node( root_id)["rel_annot_genes"] | set([ annot_gene for child_id in children for annot_gene in _set_tot_annots_in_subgraph( ontology, child_id) ]) return ontology.node(root_id)["tot_annot_genes"]
def set_ic_ontology_struct(ontology: Ontology, relations: List[str] = None):
    """Compute and store IC values for every CLASS term, based purely on the ontology structure.

    Nodes carrying an explicit non-CLASS "type" are skipped; metrics already present on a
    root are not recomputed.

    Args:
        ontology (Ontology): the ontology to annotate
        relations (List[str]): list of relations to follow while traversing the graph
    """
    logger.info("Setting information content values based on ontology structure")

    def is_class_root(node_id):
        # nodes without an explicit "type" are treated as CLASS terms
        return "type" not in ontology.node(node_id) or ontology.node_type(node_id) == "CLASS"

    root_ids = ontology.get_roots(relations=relations)
    for metric_name, metric_setter in (("num_subsumers", _set_num_subsumers_in_subgraph),
                                       ("num_leaves", _set_num_leaves_in_subgraph),
                                       ("depth", set_all_depths_in_subgraph)):
        for root_id in root_ids:
            if metric_name not in ontology.node(root_id) and is_class_root(root_id):
                metric_setter(ontology=ontology, root_id=root_id, relations=relations)
    for root_id in root_ids:
        if is_class_root(root_id):
            _set_information_content_in_subgraph(ontology=ontology, root_id=root_id,
                                                 maxleaves=ontology.node(root_id)["num_leaves"],
                                                 relations=relations)
    logger.info("Finished setting information content values")
def _set_num_subsumers_in_subgraph(ontology: Ontology, root_id: str, relations: List[str] = None):
    """Recursively store "num_subsumers" and "set_subsumers" on the nodes of a branch.

    A node's subsumer set is itself plus the union of its parents' subsumer sets. A node
    is only annotated once *all* of its parents already carry "set_subsumers"; until then
    it is left untouched, presumably to be completed by a later traversal started from
    another root — NOTE(review): this seems to rely on the function being called once per
    ontology root; confirm against callers.

    Args:
        ontology (Ontology): the ontology to annotate
        root_id (str): the ID of the root term of the branch to process
        relations (List[str]): accepted and passed along recursively, but not applied to
            the children() call here
    """
    if "num_subsumers" not in ontology.node(root_id):
        parents = set(ontology.parents(root_id))
        parents.discard(root_id)  # ignore self-loops
        parents = list(parents)
        # compute only when every parent is ready; otherwise skip and wait for the
        # traversal arriving through the remaining parents
        if not parents or all(
                ["set_subsumers" in ontology.node(parent) for parent in parents]):
            subsumers = {subsumer for parent in parents for subsumer in ontology.node(parent)["set_subsumers"]} | \
                        {root_id}
            ontology.node(root_id)["num_subsumers"] = len(subsumers)
            ontology.node(root_id)["set_subsumers"] = subsumers
    # descend regardless of whether this node was annotated, so already-complete regions
    # still propagate downwards
    for child_id in ontology.children(node=root_id):
        _set_num_subsumers_in_subgraph(ontology, child_id, relations)
def _set_num_subsumers_in_subgraph(ontology: Ontology, root_id: str, relations: List[str] = None): parents = ontology.parents(root_id) if len(parents) == 1: ontology.node(root_id)["num_subsumers"] = ontology.node( parents[0])["num_subsumers"] + 1 else: ontology.node(root_id)["num_subsumers"] = len( ontology.ancestors(node=root_id, relations=relations, reflexive=True)) for child_id in ontology.children(node=root_id, relations=relations): _set_num_subsumers_in_subgraph(ontology=ontology, root_id=child_id, relations=relations)
def get_all_paths_to_root(node_id: str, ontology: Ontology, min_distance_from_root: int = 0,
                          relations: List[str] = None, nodeids_blacklist: List[str] = None,
                          previous_path: Union[None, List[str]] = None, root_node=None) -> Set[Tuple[str]]:
    """get all possible paths connecting a go term to its root terms

    Args:
        node_id (str): a valid GO id for the starting term
        ontology (Ontology): the go ontology
        min_distance_from_root (int): return only terms at a specified minimum distance from root terms
        relations (List[str]): the list of relations to be used
        nodeids_blacklist (List[str]): a list of node ids to exclude from the paths
        previous_path (Union[None, List[str]]): the path to get to the current node
        root_node: when set, only follow parents whose OBO namespace matches this value

    Returns:
        Set[Tuple[str]]: the set of paths connecting the specified term to its root terms, each of which contains a
            sequence of terms ids
    """
    if previous_path is None:
        previous_path = []
    new_path = previous_path[:]
    if not nodeids_blacklist or node_id not in nodeids_blacklist:
        new_path.append(node_id)
    parents = [parent for parent in ontology.parents(node=node_id, relations=relations)
               if ontology.node(parent)["depth"] >= min_distance_from_root]
    if root_node:
        # keep only parents belonging to the same namespace as the requested root
        parents_same_root = []
        for parent in parents:
            # fix: the namespace lives in the parent's node properties, not in the id
            # string itself — the node must be fetched before inspecting "meta"
            parent_props = ontology.node(parent)
            parent_root = None
            if "meta" in parent_props and "basicPropertyValues" in parent_props["meta"]:
                for basic_prop_val in parent_props["meta"]["basicPropertyValues"]:
                    if basic_prop_val["pred"] == "OIO:hasOBONamespace":
                        parent_root = basic_prop_val["val"]
            if parent_root and parent_root == root_node:
                parents_same_root.append(parent)
        parents = parents_same_root
    if len(parents) > 0:
        # go up the tree, following a depth first visit
        paths_to_return = set()
        for parent in parents:
            for path in get_all_paths_to_root(node_id=parent, ontology=ontology, previous_path=new_path,
                                              min_distance_from_root=min_distance_from_root, relations=relations,
                                              nodeids_blacklist=nodeids_blacklist, root_node=root_node):
                paths_to_return.add(path)
        return paths_to_return
    if len(new_path) == 0:
        return {(node_id,)}
    else:
        return {tuple(new_path)}
def rename_ontology_terms(ontology: Ontology, terms_replacement_regex: Dict[str, str] = None) -> None:
    """rename ontology terms based on regular expression matching

    Args:
        ontology (Ontology): the ontology containing the terms to be renamed
        terms_replacement_regex (Dict[str, str]): a dictionary containing the regular expression to be applied for
            renaming terms. Each key must be a regular expression to search for terms and the associated value
            another regular expression that defines the final result
    """
    logger.info("Renaming ontology terms")
    if not terms_replacement_regex:
        return
    for search_pattern, replacement in terms_replacement_regex.items():
        # ontology.search finds the nodes whose label matches the pattern
        for matching_node in ontology.search(search_pattern, is_regex=True):
            node_props = ontology.node(matching_node)
            node_props["label"] = re.sub(search_pattern, replacement, node_props["label"])
def set_ic_annot_freq(ontology: Ontology, annotations: AssociationSet):
    """Compute and store IC values for every term, based on annotation frequency.

    Previously computed annotation sets and IC values are cleared first. Each node then
    receives the set of genes directly annotated to it ("rel_annot_genes"), the set of
    genes annotated to it or any descendant ("tot_annot_genes"), and an "IC" value of
    -log(p), where p is the fraction of all annotated genes covered by the node. Terms
    with no annotations get the IC of the least-annotated term against a +1 smoothed total.

    Args:
        ontology (Ontology): the ontology to annotate
        annotations (AssociationSet): the gene annotations to derive frequencies from
    """
    logger.info("Setting information content values based on annotation frequency")
    # clear stale values from a previous run
    for node_id in ontology.nodes():
        node_prop = ontology.node(node_id)
        for prop_name in ("rel_annot_genes", "tot_annot_genes", "IC"):
            if prop_name in node_prop:
                del node_prop[prop_name]
    for root_id in ontology.get_roots():
        if "depth" not in ontology.node(root_id) and ("type" not in ontology.node(root_id) or
                                                      ontology.node_type(root_id) == "CLASS"):
            set_all_depths_in_subgraph(ontology=ontology, root_id=root_id)
    # genes directly annotated to each term
    node_gene_map = defaultdict(set)
    for subj, obj in annotations.associations_by_subj_obj.keys():
        node_gene_map[obj].add(subj)
    for node_id in ontology.nodes():
        ontology.node(node_id)["rel_annot_genes"] = node_gene_map[node_id]
    for root_id in ontology.get_roots():
        _set_tot_annots_in_subgraph(ontology, root_id)
    # nodes not reached from any root never got a total: give them an empty set
    for node_prop in ontology.nodes().values():
        if "tot_annot_genes" not in node_prop:
            node_prop["tot_annot_genes"] = set()
    tot_annots = len({gene for gene_set in node_gene_map.values() for gene in gene_set})
    # fix: min() over an empty sequence raises ValueError when no term carries any
    # annotation; default=1 keeps the original fallback value in that case
    min_annots = min((len(node["tot_annot_genes"]) for node in ontology.nodes().values()
                      if "tot_annot_genes" in node and len(node["tot_annot_genes"]) > 0), default=1)
    for node_prop in ontology.nodes().values():
        node_prop["IC"] = -math.log(len(node_prop["tot_annot_genes"]) / tot_annots) if \
            len(node_prop["tot_annot_genes"]) > 0 else -math.log(min_annots / (tot_annots + 1))
    logger.info("Finished setting information content values")
def _set_num_leaves_in_subgraph(ontology: Ontology, root_id: str, relations: List[str] = None): if "set_leaves" in ontology.node(root_id): return ontology.node(root_id)["set_leaves"] children = set(ontology.children(node=root_id)) children.discard(root_id) children = list(children) if not children: leaves = {root_id} num_leaves = 0 else: leaves = { leaf for child_id in children for leaf in _set_num_leaves_in_subgraph( ontology=ontology, root_id=child_id, relations=relations) } num_leaves = len(leaves) ontology.node(root_id)["num_leaves"] = num_leaves ontology.node(root_id)["set_leaves"] = leaves return leaves
def _set_information_content_in_subgraph(ontology: Ontology, root_id: str, maxleaves: int, relations: List[str] = None): node = ontology.node(root_id) node["IC"] = -math.log( (float(node["num_leaves"]) / node["num_subsumers"] + 1) / (maxleaves + 1)) for child_id in ontology.children(node=root_id, relations=relations): _set_information_content_in_subgraph(ontology=ontology, root_id=child_id, maxleaves=maxleaves, relations=relations)
def set_all_depths(ontology: Ontology, relations: List[str] = None, comparison_func=max):
    """Set the "depth" property on every term of the ontology.

    Depths are computed starting from every CLASS root; nodes not reached from any root
    get a default depth of 0.

    Args:
        ontology (Ontology): the ontology to annotate
        relations (List[str]): list of relations to follow in the traversal
        comparison_func: how to combine depths when several paths reach a node
            (``max`` for the longest path, ``min`` for the shortest)
    """
    for root_id in ontology.get_roots():
        # nodes without an explicit "type" are treated as CLASS terms
        if "type" not in ontology.node(root_id) or ontology.node_type(root_id) == "CLASS":
            set_all_depths_in_subgraph(ontology=ontology, root_id=root_id, relations=relations,
                                       comparison_func=comparison_func)
    # disconnected or skipped nodes never received a depth: default them to 0
    for node_content in ontology.nodes().values():
        node_content.setdefault("depth", 0)
def get_all_common_ancestors(node_ids: List[str], ontology: Ontology, min_distance_from_root: int = 0,
                             nodeids_blacklist: List[str] = None):
    """
    Retrieve all common ancestors for the provided list of nodes

    Args:
        node_ids (List[str]): list of starting nodes
        ontology (Ontology): the ontology to which the provided nodes belong
        min_distance_from_root (int): minimum distance from root node
        nodeids_blacklist (List[str]): node ids to be excluded from the result

    Returns:
        List[CommonAncestor]: list of common ancestors

    Raises:
        ValueError: when the provided nodes are not all connected to the same root
    """
    common_root = nodes_have_same_root(node_ids=node_ids, ontology=ontology)
    if common_root is False:
        raise ValueError(
            "Cannot get common ancestors of nodes connected to different roots"
        )
    covered_by = defaultdict(list)
    for start_id in node_ids:
        for anc_id in ontology.ancestors(node=start_id, reflexive=True):
            anc_props = ontology.node(anc_id)
            anc_root = None
            if "meta" in anc_props and "basicPropertyValues" in anc_props["meta"]:
                for prop_val in anc_props["meta"]["basicPropertyValues"]:
                    if prop_val["pred"] == "OIO:hasOBONamespace":
                        anc_root = prop_val["val"]
            # starting nodes are always eligible; other ancestors must be deep enough
            deep_enough = anc_id in node_ids or anc_props["depth"] >= min_distance_from_root
            same_root = not anc_root or anc_root == common_root
            allowed = not nodeids_blacklist or anc_id not in nodeids_blacklist
            if deep_enough and same_root and allowed:
                covered_by[anc_id].append(start_id)
    # keep ancestors covering several starting nodes, plus the starting nodes themselves
    return [CommonAncestor(node_id=anc_id, node_label=ontology.label(anc_id),
                           covered_starting_nodes=set(covered))
            for anc_id, covered in covered_by.items() if len(covered) > 1 or anc_id == covered[0]]
def _set_information_content_in_subgraph(ontology: Ontology, root_id: str, maxleaves: int, relations: List[str] = None): node = ontology.node(root_id) if str(root_id) == root_id and "ARTIFICIAL_NODE:" in root_id: node["IC"] = 0 else: if "num_leaves" in node and "num_subsumers" in node: node["IC"] = -math.log( (float(node["num_leaves"]) / node["num_subsumers"] + 1) / (maxleaves + 1)) else: logger.warning("Disconnected node: " + root_id) node["IC"] = 0 children = set(ontology.children(node=root_id, relations=relations)) children.discard(root_id) children = list(children) for child_id in children: _set_information_content_in_subgraph(ontology=ontology, root_id=child_id, maxleaves=maxleaves, relations=relations)
def nodes_have_same_root(node_ids: List[str], ontology: Ontology) -> Union[bool, str]:
    """
    Check whether all provided nodes are connected to the same root only

    Args:
        node_ids (List[str]): List of nodes to be checked
        ontology (Ontology): the ontology to which the provided nodes belong

    Returns:
        Union[bool, str]: the ID of the common root if all nodes are connected to the same
            and only root, False otherwise
    """
    shared_namespace = None
    for current_id in node_ids:
        node_props = ontology.node(current_id)
        if "meta" not in node_props or "basicPropertyValues" not in node_props["meta"]:
            continue
        for prop_val in node_props["meta"]["basicPropertyValues"]:
            if prop_val["pred"] != "OIO:hasOBONamespace":
                continue
            # a second, different namespace means the nodes span multiple roots
            if shared_namespace and shared_namespace != prop_val["val"]:
                return False
            shared_namespace = prop_val["val"]
    return shared_namespace
def get_best_nodes_lca(node_ids: List[str], ontology: Ontology, min_distance_from_root: int = 3,
                       max_num_nodes: int = 3,
                       nodeids_blacklist: List[str] = None) -> Tuple[bool, List[Tuple[str, Set[str]]]]:
    """Trim the list of terms by greedily replacing groups of terms with common ancestors.

    Candidates are the common ancestors of the starting nodes. For each candidate still to
    be processed, the candidates covering a superset of its covered nodes are compared,
    and the one(s) with the widest coverage (ties broken by greatest depth) are selected;
    candidates made redundant by a selection are dropped. If more candidates survive than
    ``max_num_nodes``, a set covering step picks the final ones.

    Args:
        node_ids (List[str]): the list of nodes to merge by common ancestor
        ontology (Ontology): the ontology
        min_distance_from_root (int): minimum depth for an ancestor to be a candidate
        max_num_nodes (int): maximum number of terms to return
        nodeids_blacklist (List[str]): node ids to be excluded from common ancestors list

    Returns:
        Tuple[bool, List[Tuple[str, Set[str]]]]: whether the selected terms fail to cover
            all the initial nodes, together with the selected terms and the original
            terms each of them covers
    """
    # candidate id -> (label, set of starting nodes it covers)
    candidates = {node_id: (node_label, covered_nodes) for node_id, node_label, covered_nodes in
                  get_all_common_ancestors(node_ids=node_ids, ontology=ontology,
                                           min_distance_from_root=min_distance_from_root,
                                           nodeids_blacklist=nodeids_blacklist)}
    cands_ids_to_process = set(candidates.keys())
    selected_cands_ids = []
    # reverse index: starting node -> candidates covering it
    node_to_cands_map = defaultdict(list)
    for cand in cands_ids_to_process:
        for node in candidates[cand][1]:
            node_to_cands_map[node].append(cand)
    while len(cands_ids_to_process) > 0:
        cand_id = cands_ids_to_process.pop()
        # other candidates whose coverage is a superset of this candidate's coverage
        comparable_cands = [(cid, cval[1]) for cid, cval in candidates.items() if cid != cand_id and all(
            [child_id in cval[1] for child_id in candidates[cand_id][1]])]
        if len(comparable_cands) > 0:
            # keep the superset candidates with the widest coverage
            max_len = max(map(lambda x: len(x[1]), comparable_cands))
            best_cands = [candidate for candidate in comparable_cands if len(candidate[1]) == max_len]
            if len(best_cands) > 1:
                # break ties by depth, keeping only the deepest candidates
                weighted_best_cands = sorted([(ontology.node(cand[0])["depth"], cand) for cand in best_cands],
                                             key=lambda x: x[0], reverse=True)
                max_weight = max(map(lambda x: x[0], weighted_best_cands))
                best_cands = [wcand[1] for wcand in weighted_best_cands if wcand[0] == max_weight]
            else:
                max_weight = ontology.node(best_cands[0][0])["depth"]
            # NOTE(review): both sides of this `or` test the same strict inequality, so the
            # second clause is unreachable; the depth comparison suggests the second was
            # meant to use `==` on the lengths — confirm intent before changing
            if len(candidates[cand_id][1]) > len(best_cands[0][1]) or \
                    (len(candidates[cand_id][1]) > len(best_cands[0][1]) and
                     ontology.node(cand_id)["depth"] > max_weight):
                best_cands = [(cand_id, candidates[cand_id][1])]
            for best_cand in best_cands:
                selected_cands_ids.append(best_cand[0])
                # drop every candidate covering a node already covered by the selection
                for node_id in candidates[best_cand[0]][1]:
                    cands_ids_to_process -= set(node_to_cands_map[node_id])
        else:
            # nothing covers a superset of this candidate: keep it as-is
            selected_cands_ids.append(cand_id)
    if len(selected_cands_ids) <= max_num_nodes:
        return False, [(node_id, candidates[node_id][1]) for node_id in selected_cands_ids]
    else:
        # too many survivors: let set covering pick the best max_num_nodes of them
        best_terms = find_set_covering(
            [(node_id, ontology.label(node_id, id_if_null=True), candidates[node_id][1])
             for node_id in selected_cands_ids], max_num_subsets=max_num_nodes)
        covered_terms = set([e for best_term_label, covered_terms in best_terms for e in covered_terms])
        return covered_terms != set(node_ids), best_terms
def get_best_nodes_naive(node_ids: List[str], ontology: Ontology, min_distance_from_root: int = 3,
                         max_num_nodes: int = 3,
                         nodeids_blacklist: List[str] = None) -> Tuple[bool, List[Tuple[str, Set[str]]]]:
    """remove terms with common ancestor and keep the ancestor term instead

    Args:
        node_ids (List[str]): the list of nodes to merge by common ancestor
        min_distance_from_root (int): set a minimum distance from root terms for ancestors that can group children
            terms
        max_num_nodes (int): maximum number of nodes to be returned by the trimming algorithm
        nodeids_blacklist (List[str]): a list of node ids to be excluded from common ancestors list
        ontology (Ontology): the ontology

    Returns:
        Tuple[bool, List[Tuple[str, Set[str]]]]: whether the merged terms fail to cover all the initial nodes,
            together with the merged terms and the original terms each of them covers
    """
    logger.debug("applying trimming through naive algorithm")
    final_terms_set = {}
    # ancestor id -> every path (from any starting node) ending at that ancestor
    ancestor_paths = defaultdict(list)
    # starting node -> its set of root paths
    term_paths = defaultdict(set)
    # step 1: get all path for each term and populate data structures
    for node_id in node_ids:
        node_root = None
        node_ont = ontology.node(node_id)
        if "meta" in node_ont and "basicPropertyValues" in node_ont["meta"]:
            for basic_prop_val in node_ont["meta"]["basicPropertyValues"]:
                if basic_prop_val["pred"] == "OIO:hasOBONamespace":
                    node_root = basic_prop_val["val"]
        paths = get_all_paths_to_root(node_id=node_id, ontology=ontology,
                                      min_distance_from_root=min_distance_from_root, relations=None,
                                      nodeids_blacklist=nodeids_blacklist, root_node=node_root)
        for path in paths:
            term_paths[node_id].add(path)
            # paths are ordered from the starting node upwards, so path[-1] is the
            # highest ancestor reached and path[0] the starting node
            ancestor_paths[path[-1]].append(path)
    # step 2: merge terms and keep common ancestors
    for node_id in sorted(node_ids):
        # shortest paths first; pop() therefore processes the longest path next
        term_paths_copy = sorted(term_paths[node_id].copy(), key=lambda x: len(x))
        while len(term_paths_copy) > 0:
            curr_path = list(term_paths_copy.pop())
            selected_highest_ancestor = curr_path.pop()
            related_paths = ancestor_paths[selected_highest_ancestor]
            if not related_paths:
                break
            covered_nodes_set = set([related_path[0] for related_path in related_paths])
            del ancestor_paths[selected_highest_ancestor]
            if curr_path:
                if all(map(lambda x: x[0] == curr_path[0], related_paths)):
                    # every related path starts at the same node: keep that node itself
                    selected_highest_ancestor = curr_path[0]
                else:
                    # walk down from the top of the current path, keeping the lowest
                    # ancestor still shared by all related paths (i indexes from the end)
                    i = -1
                    while len(curr_path) > 1:
                        i -= 1
                        curr_highest_ancestor = curr_path.pop()
                        if not all(map(lambda x: len(x) >= -i, related_paths)):
                            break
                        if all(map(lambda x: x[i] == curr_highest_ancestor, related_paths)):
                            selected_highest_ancestor = curr_highest_ancestor
                    if selected_highest_ancestor in ancestor_paths:
                        del ancestor_paths[selected_highest_ancestor]
                    for path in related_paths:
                        term_paths[path[0]].discard(path)
            final_terms_set[selected_highest_ancestor] = covered_nodes_set
            # NOTE(review): this discard loop repeats the one just above when the else
            # branch ran — presumably harmless (discard is idempotent), but confirm
            for path in related_paths:
                term_paths[path[0]].discard(path)
            if len(term_paths[node_id]) > 0:
                term_paths_copy = term_paths[node_id].copy()
            else:
                break
    if len(list(final_terms_set.keys())) <= max_num_nodes:
        return False, [(term_label, covered_terms) for term_label, covered_terms in final_terms_set.items()]
    else:
        # too many merged terms: let set covering pick the best max_num_nodes of them
        best_terms = find_set_covering(
            [(k, ontology.label(k, id_if_null=True), v) for k, v in final_terms_set.items()],
            max_num_subsets=max_num_nodes)
        covered_terms = set([e for best_term_label, covered_terms in best_terms for e in covered_terms])
        return covered_terms != set(node_ids), best_terms