def set_all_depths_in_subgraph(ontology: Ontology, root_id: str, relations: List[str] = None, comparison_func=max,
                               current_depth: int = 0):
    """Recursively annotate every term of the branch rooted at *root_id* with a "depth" value.

    The depth of a node is its distance from the branch root. When a node is reachable
    through several paths, ``comparison_func`` decides which distance wins: ``max`` keeps
    the longest path, ``min`` the shortest.

    Args:
        ontology (Ontology): the ontology to annotate
        root_id (str): the ID of the root term of the branch to process
        relations (List[str]): list of relations to follow when descending to children
        comparison_func: function combining an already stored depth with the newly computed
            one when a node is visited more than once (e.g., ``max`` or ``min``)
        current_depth (int): the depth assigned to *root_id* itself
    """
    node_props = ontology.node(root_id)
    if "depth" in node_props:
        # node already visited through another path: let the comparison function decide
        node_props["depth"] = comparison_func(node_props["depth"], current_depth)
    else:
        node_props["depth"] = current_depth
    for child_id in ontology.children(node=root_id, relations=relations):
        set_all_depths_in_subgraph(ontology=ontology, root_id=child_id, relations=relations,
                                   comparison_func=comparison_func, current_depth=current_depth + 1)
def set_all_information_content_values(ontology: Ontology, relations: List[str] = None):
    """Compute and store information content ("IC") values for every term in the ontology.

    The supporting metrics each term needs (number of subsumers, number of leaves, depth)
    are computed first, starting from every root; roots that already carry a metric are
    skipped for that metric. IC itself is then set on every branch.

    Args:
        ontology (Ontology): the ontology to annotate
        relations (List[str]): list of relations to follow while traversing the graph
    """
    logger.info("calculating information content for all terms in ontology")
    root_ids = ontology.get_roots(relations=relations)
    # each metric pass is completed for all roots before the next starts, because the
    # final IC computation needs num_subsumers and num_leaves present everywhere
    for metric_name, metric_setter in (("num_subsumers", _set_num_subsumers_in_subgraph),
                                       ("num_leaves", _set_num_leaves_in_subgraph),
                                       ("depth", set_all_depths_in_subgraph)):
        for root_id in root_ids:
            if metric_name not in ontology.node(root_id):
                metric_setter(ontology=ontology, root_id=root_id, relations=relations)
    for root_id in root_ids:
        _set_information_content_in_subgraph(ontology=ontology, root_id=root_id,
                                             maxleaves=ontology.node(root_id)["num_leaves"],
                                             relations=relations)
def get_all_common_ancestors(node_ids: List[str], ontology: Ontology, min_distance_from_root: int = 0,
                             nodeids_blacklist: List[str] = None):
    """Collect the ancestors shared by the provided nodes.

    All starting nodes must belong to the same OBO namespace (read from the
    "OIO:hasOBONamespace" basic property value, when present).

    Args:
        node_ids (List[str]): the list of starting nodes
        ontology (Ontology): the ontology to which the provided nodes belong
        min_distance_from_root (int): only keep ancestors at least this deep
        nodeids_blacklist (List[str]): node ids to be excluded from the result

    Returns:
        list: (ancestor_id, ancestor_label, covered_starting_nodes) tuples for each
            ancestor covering more than one starting node, plus the starting nodes themselves

    Raises:
        ValueError: when the provided nodes are not all connected to the same root
    """
    # check if all ids are connected to the same root node
    common_root = None
    for current_id in node_ids:
        node_props = ontology.node(current_id)
        if "meta" not in node_props or "basicPropertyValues" not in node_props["meta"]:
            continue
        for prop_val in node_props["meta"]["basicPropertyValues"]:
            if prop_val["pred"] != "OIO:hasOBONamespace":
                continue
            if common_root and common_root != prop_val["val"]:
                raise ValueError(
                    "Cannot get common ancestors of nodes connected to different roots"
                )
            common_root = prop_val["val"]
    covered_by = defaultdict(list)
    for current_id in node_ids:
        for anc_id in ontology.ancestors(node=current_id, reflexive=True):
            anc_props = ontology.node(anc_id)
            anc_root = None
            if "meta" in anc_props and "basicPropertyValues" in anc_props["meta"]:
                for prop_val in anc_props["meta"]["basicPropertyValues"]:
                    if prop_val["pred"] == "OIO:hasOBONamespace":
                        anc_root = prop_val["val"]
            deep_enough = anc_props["depth"] >= min_distance_from_root
            same_root = not anc_root or anc_root == common_root
            allowed = not nodeids_blacklist or anc_id not in nodeids_blacklist
            if deep_enough and same_root and allowed:
                covered_by[anc_id].append(current_id)
    # keep ancestors covering several starting nodes, plus the starting nodes themselves
    return [(anc_id, ontology.label(anc_id), set(covered))
            for anc_id, covered in covered_by.items()
            if len(covered) > 1 or anc_id == covered[0]]
def get_best_nodes_ic(node_ids: List[str], ontology: Ontology, max_number_of_terms: int = 3,
                      min_distance_from_root: int = 0, slim_terms_ic_bonus_perc: int = 0, slim_set: set = None,
                      nodeids_blacklist: List[str] = None) -> Tuple[bool, List[Tuple[str, Set[str]]]]:
    """trim the list of terms by selecting the best combination of terms from the initial list or their common
    ancestors based on information content

    Args:
        node_ids (List[str]): the list of nodes to merge by common ancestor
        ontology (Ontology): the ontology
        max_number_of_terms (int): maximum number of terms that the set covering algorithm may select
        min_distance_from_root (int): consider only nodes at a minimum distance from root as potential candidate
            for trimming
        slim_terms_ic_bonus_perc (int): boost the IC value for terms that appear in the slim set by the provided
            percentage (e.g., 50 multiplies the IC by 1.5)
        slim_set (set): set of terms that belong to the slim for the provided ontology
        nodeids_blacklist (List[str]): a list of node ids to be excluded from common ancestors list

    Returns:
        Tuple[bool, List[Tuple[str, Set[str]]]]: whether the selected terms fail to cover all the initial nodes,
            together with the selected terms and the set of original terms that each of them covers
    """
    common_ancestors = get_all_common_ancestors(node_ids=node_ids, ontology=ontology,
                                                nodeids_blacklist=nodeids_blacklist)
    if "IC" not in ontology.node(common_ancestors[0][0]):
        logger.warning("ontology terms do not have information content values set")
        set_all_information_content_values(ontology=ontology)
    # candidates that are not original terms and sit too close to the root get value 0;
    # slim terms get their IC boosted.
    # fix: the bonus is documented as a percentage, so scale it by 1/100 — multiplying
    # by (1 + perc) would turn e.g. a 50% bonus into a 51-fold boost
    values = [0 if node[0] not in node_ids and ontology.node(node[0])["depth"] < min_distance_from_root else
              ontology.node(node[0])["IC"] * (1 + slim_terms_ic_bonus_perc / 100)
              if slim_set and node[0] in slim_set else
              ontology.node(node[0])["IC"] for node in common_ancestors]
    if slim_set and any(node[0] in slim_set for node in common_ancestors):
        logger.debug("some candidates are present in the slim set")
    # remove ancestors with zero IC: they carry no information
    common_ancestors = [common_ancestor for common_ancestor, value in zip(common_ancestors, values) if value > 0]
    values = [value for value in values if value > 0]
    best_terms = find_set_covering(subsets=common_ancestors, max_num_subsets=max_number_of_terms,
                                   value=values, ontology=ontology)
    covered_terms = {term for _, covered in best_terms for term in covered}
    return covered_terms != set(node_ids), best_terms
def _set_num_leaves_in_subgraph(ontology: Ontology, root_id: str, relations: List[str] = None): num_leaves = 0 for child_id in ontology.children(node=root_id): if "num_leaves" not in ontology.node(child_id): _set_num_leaves_in_subgraph(ontology=ontology, root_id=child_id, relations=relations) if ontology.node(child_id)["num_leaves"] == 0: num_leaves += 1 else: num_leaves += ontology.node(child_id)["num_leaves"] ontology.node(root_id)["num_leaves"] = num_leaves
def _set_tot_annots_in_subgraph(ontology: Ontology, root_id: str, relations: List[str] = None): if "tot_annot_genes" not in ontology.node(root_id): children = set(ontology.children(root_id, relations=relations)) children.discard(root_id) children = list(children) ontology.node(root_id)["tot_annot_genes"] = ontology.node( root_id)["rel_annot_genes"] | set([ annot_gene for child_id in children for annot_gene in _set_tot_annots_in_subgraph( ontology, child_id) ]) return ontology.node(root_id)["tot_annot_genes"]
def set_ic_ontology_struct(ontology: Ontology, relations: List[str] = None):
    """Compute and store IC values for every CLASS term, based purely on the ontology structure.

    Nodes carrying an explicit non-CLASS "type" are skipped; metrics already present on a
    root are not recomputed.

    Args:
        ontology (Ontology): the ontology to annotate
        relations (List[str]): list of relations to follow while traversing the graph
    """
    logger.info("Setting information content values based on ontology structure")

    def is_class_root(node_id):
        # nodes without an explicit "type" are treated as CLASS terms
        return "type" not in ontology.node(node_id) or ontology.node_type(node_id) == "CLASS"

    root_ids = ontology.get_roots(relations=relations)
    for metric_name, metric_setter in (("num_subsumers", _set_num_subsumers_in_subgraph),
                                       ("num_leaves", _set_num_leaves_in_subgraph),
                                       ("depth", set_all_depths_in_subgraph)):
        for root_id in root_ids:
            if metric_name not in ontology.node(root_id) and is_class_root(root_id):
                metric_setter(ontology=ontology, root_id=root_id, relations=relations)
    for root_id in root_ids:
        if is_class_root(root_id):
            _set_information_content_in_subgraph(ontology=ontology, root_id=root_id,
                                                 maxleaves=ontology.node(root_id)["num_leaves"],
                                                 relations=relations)
    logger.info("Finished setting information content values")
def _set_num_subsumers_in_subgraph(ontology: Ontology, root_id: str, relations: List[str] = None):
    """Recursively store "num_subsumers" and "set_subsumers" on the nodes of a branch.

    A node's subsumer set is itself plus the union of its parents' subsumer sets. A node
    is only annotated once *all* of its parents already carry "set_subsumers"; until then
    it is left untouched, presumably to be completed by a later traversal started from
    another root — NOTE(review): this seems to rely on the function being called once per
    ontology root; confirm against callers.

    Args:
        ontology (Ontology): the ontology to annotate
        root_id (str): the ID of the root term of the branch to process
        relations (List[str]): accepted and passed along recursively, but not applied to
            the children() call here
    """
    if "num_subsumers" not in ontology.node(root_id):
        parents = set(ontology.parents(root_id))
        parents.discard(root_id)  # ignore self-loops
        parents = list(parents)
        # compute only when every parent is ready; otherwise skip and wait for the
        # traversal arriving through the remaining parents
        if not parents or all(
                ["set_subsumers" in ontology.node(parent) for parent in parents]):
            subsumers = {subsumer for parent in parents for subsumer in ontology.node(parent)["set_subsumers"]} | \
                        {root_id}
            ontology.node(root_id)["num_subsumers"] = len(subsumers)
            ontology.node(root_id)["set_subsumers"] = subsumers
    # descend regardless of whether this node was annotated, so already-complete regions
    # still propagate downwards
    for child_id in ontology.children(node=root_id):
        _set_num_subsumers_in_subgraph(ontology, child_id, relations)
def _set_num_subsumers_in_subgraph(ontology: Ontology, root_id: str, relations: List[str] = None): parents = ontology.parents(root_id) if len(parents) == 1: ontology.node(root_id)["num_subsumers"] = ontology.node( parents[0])["num_subsumers"] + 1 else: ontology.node(root_id)["num_subsumers"] = len( ontology.ancestors(node=root_id, relations=relations, reflexive=True)) for child_id in ontology.children(node=root_id, relations=relations): _set_num_subsumers_in_subgraph(ontology=ontology, root_id=child_id, relations=relations)
def get_all_paths_to_root(node_id: str, ontology: Ontology, min_distance_from_root: int = 0,
                          relations: List[str] = None, nodeids_blacklist: List[str] = None,
                          previous_path: Union[None, List[str]] = None, root_node=None) -> Set[Tuple[str]]:
    """get all possible paths connecting a go term to its root terms

    Args:
        node_id (str): a valid GO id for the starting term
        ontology (Ontology): the go ontology
        min_distance_from_root (int): return only terms at a specified minimum distance from root terms
        relations (List[str]): the list of relations to be used
        nodeids_blacklist (List[str]): a list of node ids to exclude from the paths
        previous_path (Union[None, List[str]]): the path to get to the current node
        root_node: when set, only follow parents whose OBO namespace matches this value

    Returns:
        Set[Tuple[str]]: the set of paths connecting the specified term to its root terms, each of which contains a
            sequence of terms ids
    """
    if previous_path is None:
        previous_path = []
    new_path = previous_path[:]
    if not nodeids_blacklist or node_id not in nodeids_blacklist:
        new_path.append(node_id)
    parents = [parent for parent in ontology.parents(node=node_id, relations=relations)
               if ontology.node(parent)["depth"] >= min_distance_from_root]
    if root_node:
        # keep only parents belonging to the same namespace as the requested root
        parents_same_root = []
        for parent in parents:
            # fix: the namespace lives in the parent's node properties, not in the id
            # string itself — the node must be fetched before inspecting "meta"
            parent_props = ontology.node(parent)
            parent_root = None
            if "meta" in parent_props and "basicPropertyValues" in parent_props["meta"]:
                for basic_prop_val in parent_props["meta"]["basicPropertyValues"]:
                    if basic_prop_val["pred"] == "OIO:hasOBONamespace":
                        parent_root = basic_prop_val["val"]
            if parent_root and parent_root == root_node:
                parents_same_root.append(parent)
        parents = parents_same_root
    if len(parents) > 0:
        # go up the tree, following a depth first visit
        paths_to_return = set()
        for parent in parents:
            for path in get_all_paths_to_root(node_id=parent, ontology=ontology, previous_path=new_path,
                                              min_distance_from_root=min_distance_from_root, relations=relations,
                                              nodeids_blacklist=nodeids_blacklist, root_node=root_node):
                paths_to_return.add(path)
        return paths_to_return
    if len(new_path) == 0:
        return {(node_id,)}
    else:
        return {tuple(new_path)}
def rename_ontology_terms(ontology: Ontology, terms_replacement_regex: Dict[str, str] = None) -> None:
    """rename ontology terms based on regular expression matching

    Args:
        ontology (Ontology): the ontology containing the terms to be renamed
        terms_replacement_regex (Dict[str, str]): a dictionary containing the regular expression to be applied for
            renaming terms. Each key must be a regular expression to search for terms and the associated value
            another regular expression that defines the final result
    """
    logger.info("Renaming ontology terms")
    if not terms_replacement_regex:
        return
    for search_pattern, replacement in terms_replacement_regex.items():
        # ontology.search finds the nodes whose label matches the pattern
        for matching_node in ontology.search(search_pattern, is_regex=True):
            node_props = ontology.node(matching_node)
            node_props["label"] = re.sub(search_pattern, replacement, node_props["label"])
def set_ic_annot_freq(ontology: Ontology, annotations: AssociationSet):
    """Compute and store IC values for every term, based on annotation frequency.

    Previously computed annotation sets and IC values are cleared first. Each node then
    receives the set of genes directly annotated to it ("rel_annot_genes"), the set of
    genes annotated to it or any descendant ("tot_annot_genes"), and an "IC" value of
    -log(p), where p is the fraction of all annotated genes covered by the node. Terms
    with no annotations get the IC of the least-annotated term against a +1 smoothed total.

    Args:
        ontology (Ontology): the ontology to annotate
        annotations (AssociationSet): the gene annotations to derive frequencies from
    """
    logger.info("Setting information content values based on annotation frequency")
    # clear stale values from a previous run
    for node_id in ontology.nodes():
        node_prop = ontology.node(node_id)
        for prop_name in ("rel_annot_genes", "tot_annot_genes", "IC"):
            if prop_name in node_prop:
                del node_prop[prop_name]
    for root_id in ontology.get_roots():
        if "depth" not in ontology.node(root_id) and ("type" not in ontology.node(root_id) or
                                                      ontology.node_type(root_id) == "CLASS"):
            set_all_depths_in_subgraph(ontology=ontology, root_id=root_id)
    # genes directly annotated to each term
    node_gene_map = defaultdict(set)
    for subj, obj in annotations.associations_by_subj_obj.keys():
        node_gene_map[obj].add(subj)
    for node_id in ontology.nodes():
        ontology.node(node_id)["rel_annot_genes"] = node_gene_map[node_id]
    for root_id in ontology.get_roots():
        _set_tot_annots_in_subgraph(ontology, root_id)
    # nodes not reached from any root never got a total: give them an empty set
    for node_prop in ontology.nodes().values():
        if "tot_annot_genes" not in node_prop:
            node_prop["tot_annot_genes"] = set()
    tot_annots = len({gene for gene_set in node_gene_map.values() for gene in gene_set})
    # fix: min() over an empty sequence raises ValueError when no term carries any
    # annotation; default=1 keeps the original fallback value in that case
    min_annots = min((len(node["tot_annot_genes"]) for node in ontology.nodes().values()
                      if "tot_annot_genes" in node and len(node["tot_annot_genes"]) > 0), default=1)
    for node_prop in ontology.nodes().values():
        node_prop["IC"] = -math.log(len(node_prop["tot_annot_genes"]) / tot_annots) if \
            len(node_prop["tot_annot_genes"]) > 0 else -math.log(min_annots / (tot_annots + 1))
    logger.info("Finished setting information content values")
def _set_num_leaves_in_subgraph(ontology: Ontology, root_id: str, relations: List[str] = None): if "set_leaves" in ontology.node(root_id): return ontology.node(root_id)["set_leaves"] children = set(ontology.children(node=root_id)) children.discard(root_id) children = list(children) if not children: leaves = {root_id} num_leaves = 0 else: leaves = { leaf for child_id in children for leaf in _set_num_leaves_in_subgraph( ontology=ontology, root_id=child_id, relations=relations) } num_leaves = len(leaves) ontology.node(root_id)["num_leaves"] = num_leaves ontology.node(root_id)["set_leaves"] = leaves return leaves
def _set_information_content_in_subgraph(ontology: Ontology, root_id: str, maxleaves: int, relations: List[str] = None): node = ontology.node(root_id) node["IC"] = -math.log( (float(node["num_leaves"]) / node["num_subsumers"] + 1) / (maxleaves + 1)) for child_id in ontology.children(node=root_id, relations=relations): _set_information_content_in_subgraph(ontology=ontology, root_id=child_id, maxleaves=maxleaves, relations=relations)
def set_all_depths(ontology: Ontology, relations: List[str] = None, comparison_func=max):
    """Set the "depth" property on every term of the ontology.

    Depths are computed starting from every CLASS root; nodes not reached from any root
    get a default depth of 0.

    Args:
        ontology (Ontology): the ontology to annotate
        relations (List[str]): list of relations to follow in the traversal
        comparison_func: how to combine depths when several paths reach a node
            (``max`` for the longest path, ``min`` for the shortest)
    """
    for root_id in ontology.get_roots():
        # nodes without an explicit "type" are treated as CLASS terms
        if "type" not in ontology.node(root_id) or ontology.node_type(root_id) == "CLASS":
            set_all_depths_in_subgraph(ontology=ontology, root_id=root_id, relations=relations,
                                       comparison_func=comparison_func)
    # disconnected or skipped nodes never received a depth: default them to 0
    for node_content in ontology.nodes().values():
        node_content.setdefault("depth", 0)
def get_all_common_ancestors(node_ids: List[str], ontology: Ontology, min_distance_from_root: int = 0,
                             nodeids_blacklist: List[str] = None):
    """
    Retrieve all common ancestors for the provided list of nodes

    Args:
        node_ids (List[str]): list of starting nodes
        ontology (Ontology): the ontology to which the provided nodes belong
        min_distance_from_root (int): minimum distance from root node
        nodeids_blacklist (List[str]): node ids to be excluded from the result

    Returns:
        List[CommonAncestor]: list of common ancestors

    Raises:
        ValueError: when the provided nodes are not all connected to the same root
    """
    common_root = nodes_have_same_root(node_ids=node_ids, ontology=ontology)
    if common_root is False:
        raise ValueError(
            "Cannot get common ancestors of nodes connected to different roots"
        )
    covered_by = defaultdict(list)
    for start_id in node_ids:
        for anc_id in ontology.ancestors(node=start_id, reflexive=True):
            anc_props = ontology.node(anc_id)
            anc_root = None
            if "meta" in anc_props and "basicPropertyValues" in anc_props["meta"]:
                for prop_val in anc_props["meta"]["basicPropertyValues"]:
                    if prop_val["pred"] == "OIO:hasOBONamespace":
                        anc_root = prop_val["val"]
            # starting nodes are always eligible; other ancestors must be deep enough
            deep_enough = anc_id in node_ids or anc_props["depth"] >= min_distance_from_root
            same_root = not anc_root or anc_root == common_root
            allowed = not nodeids_blacklist or anc_id not in nodeids_blacklist
            if deep_enough and same_root and allowed:
                covered_by[anc_id].append(start_id)
    # keep ancestors covering several starting nodes, plus the starting nodes themselves
    return [CommonAncestor(node_id=anc_id, node_label=ontology.label(anc_id),
                           covered_starting_nodes=set(covered))
            for anc_id, covered in covered_by.items() if len(covered) > 1 or anc_id == covered[0]]
def _set_information_content_in_subgraph(ontology: Ontology, root_id: str, maxleaves: int, relations: List[str] = None): node = ontology.node(root_id) if str(root_id) == root_id and "ARTIFICIAL_NODE:" in root_id: node["IC"] = 0 else: if "num_leaves" in node and "num_subsumers" in node: node["IC"] = -math.log( (float(node["num_leaves"]) / node["num_subsumers"] + 1) / (maxleaves + 1)) else: logger.warning("Disconnected node: " + root_id) node["IC"] = 0 children = set(ontology.children(node=root_id, relations=relations)) children.discard(root_id) children = list(children) for child_id in children: _set_information_content_in_subgraph(ontology=ontology, root_id=child_id, maxleaves=maxleaves, relations=relations)
def nodes_have_same_root(node_ids: List[str], ontology: Ontology) -> Union[bool, str]:
    """
    Check whether all provided nodes are connected to the same root only

    Args:
        node_ids (List[str]): List of nodes to be checked
        ontology (Ontology): the ontology to which the provided nodes belong

    Returns:
        Union[bool, str]: the ID of the common root if all nodes are connected to the same
            and only root, False otherwise
    """
    shared_namespace = None
    for current_id in node_ids:
        node_props = ontology.node(current_id)
        if "meta" not in node_props or "basicPropertyValues" not in node_props["meta"]:
            continue
        for prop_val in node_props["meta"]["basicPropertyValues"]:
            if prop_val["pred"] != "OIO:hasOBONamespace":
                continue
            # a second, different namespace means the nodes span multiple roots
            if shared_namespace and shared_namespace != prop_val["val"]:
                return False
            shared_namespace = prop_val["val"]
    return shared_namespace
def get_best_nodes_lca(node_ids: List[str], ontology: Ontology, min_distance_from_root: int = 3,
                       max_num_nodes: int = 3,
                       nodeids_blacklist: List[str] = None) -> Tuple[bool, List[Tuple[str, Set[str]]]]:
    """Trim the list of terms by greedily replacing groups of terms with common ancestors.

    Candidates are the common ancestors of the starting nodes. For each candidate still to
    be processed, the candidates covering a superset of its covered nodes are compared,
    and the one(s) with the widest coverage (ties broken by greatest depth) are selected;
    candidates made redundant by a selection are dropped. If more candidates survive than
    ``max_num_nodes``, a set covering step picks the final ones.

    Args:
        node_ids (List[str]): the list of nodes to merge by common ancestor
        ontology (Ontology): the ontology
        min_distance_from_root (int): minimum depth for an ancestor to be a candidate
        max_num_nodes (int): maximum number of terms to return
        nodeids_blacklist (List[str]): node ids to be excluded from common ancestors list

    Returns:
        Tuple[bool, List[Tuple[str, Set[str]]]]: whether the selected terms fail to cover
            all the initial nodes, together with the selected terms and the original
            terms each of them covers
    """
    # candidate id -> (label, set of starting nodes it covers)
    candidates = {node_id: (node_label, covered_nodes) for node_id, node_label, covered_nodes in
                  get_all_common_ancestors(node_ids=node_ids, ontology=ontology,
                                           min_distance_from_root=min_distance_from_root,
                                           nodeids_blacklist=nodeids_blacklist)}
    cands_ids_to_process = set(candidates.keys())
    selected_cands_ids = []
    # reverse index: starting node -> candidates covering it
    node_to_cands_map = defaultdict(list)
    for cand in cands_ids_to_process:
        for node in candidates[cand][1]:
            node_to_cands_map[node].append(cand)
    while len(cands_ids_to_process) > 0:
        cand_id = cands_ids_to_process.pop()
        # other candidates whose coverage is a superset of this candidate's coverage
        comparable_cands = [(cid, cval[1]) for cid, cval in candidates.items() if cid != cand_id and all(
            [child_id in cval[1] for child_id in candidates[cand_id][1]])]
        if len(comparable_cands) > 0:
            # keep the superset candidates with the widest coverage
            max_len = max(map(lambda x: len(x[1]), comparable_cands))
            best_cands = [candidate for candidate in comparable_cands if len(candidate[1]) == max_len]
            if len(best_cands) > 1:
                # break ties by depth, keeping only the deepest candidates
                weighted_best_cands = sorted([(ontology.node(cand[0])["depth"], cand) for cand in best_cands],
                                             key=lambda x: x[0], reverse=True)
                max_weight = max(map(lambda x: x[0], weighted_best_cands))
                best_cands = [wcand[1] for wcand in weighted_best_cands if wcand[0] == max_weight]
            else:
                max_weight = ontology.node(best_cands[0][0])["depth"]
            # NOTE(review): both sides of this `or` test the same strict inequality, so the
            # second clause is unreachable; the depth comparison suggests the second was
            # meant to use `==` on the lengths — confirm intent before changing
            if len(candidates[cand_id][1]) > len(best_cands[0][1]) or \
                    (len(candidates[cand_id][1]) > len(best_cands[0][1]) and
                     ontology.node(cand_id)["depth"] > max_weight):
                best_cands = [(cand_id, candidates[cand_id][1])]
            for best_cand in best_cands:
                selected_cands_ids.append(best_cand[0])
                # drop every candidate covering a node already covered by the selection
                for node_id in candidates[best_cand[0]][1]:
                    cands_ids_to_process -= set(node_to_cands_map[node_id])
        else:
            # nothing covers a superset of this candidate: keep it as-is
            selected_cands_ids.append(cand_id)
    if len(selected_cands_ids) <= max_num_nodes:
        return False, [(node_id, candidates[node_id][1]) for node_id in selected_cands_ids]
    else:
        # too many survivors: let set covering pick the best max_num_nodes of them
        best_terms = find_set_covering(
            [(node_id, ontology.label(node_id, id_if_null=True), candidates[node_id][1])
             for node_id in selected_cands_ids], max_num_subsets=max_num_nodes)
        covered_terms = set([e for best_term_label, covered_terms in best_terms for e in covered_terms])
        return covered_terms != set(node_ids), best_terms
def get_best_nodes_naive(node_ids: List[str], ontology: Ontology, min_distance_from_root: int = 3,
                         max_num_nodes: int = 3,
                         nodeids_blacklist: List[str] = None) -> Tuple[bool, List[Tuple[str, Set[str]]]]:
    """remove terms with common ancestor and keep the ancestor term instead

    Args:
        node_ids (List[str]): the list of nodes to merge by common ancestor
        min_distance_from_root (int): set a minimum distance from root terms for ancestors that can group children
            terms
        max_num_nodes (int): maximum number of nodes to be returned by the trimming algorithm
        nodeids_blacklist (List[str]): a list of node ids to be excluded from common ancestors list
        ontology (Ontology): the ontology

    Returns:
        Tuple[bool, List[Tuple[str, Set[str]]]]: whether the merged terms fail to cover all the initial nodes,
            together with the merged terms and the original terms each of them covers
    """
    logger.debug("applying trimming through naive algorithm")
    final_terms_set = {}
    # ancestor id -> every path (from any starting node) ending at that ancestor
    ancestor_paths = defaultdict(list)
    # starting node -> its set of root paths
    term_paths = defaultdict(set)
    # step 1: get all path for each term and populate data structures
    for node_id in node_ids:
        node_root = None
        node_ont = ontology.node(node_id)
        if "meta" in node_ont and "basicPropertyValues" in node_ont["meta"]:
            for basic_prop_val in node_ont["meta"]["basicPropertyValues"]:
                if basic_prop_val["pred"] == "OIO:hasOBONamespace":
                    node_root = basic_prop_val["val"]
        paths = get_all_paths_to_root(node_id=node_id, ontology=ontology,
                                      min_distance_from_root=min_distance_from_root, relations=None,
                                      nodeids_blacklist=nodeids_blacklist, root_node=node_root)
        for path in paths:
            term_paths[node_id].add(path)
            # paths are ordered from the starting node upwards, so path[-1] is the
            # highest ancestor reached and path[0] the starting node
            ancestor_paths[path[-1]].append(path)
    # step 2: merge terms and keep common ancestors
    for node_id in sorted(node_ids):
        # shortest paths first; pop() therefore processes the longest path next
        term_paths_copy = sorted(term_paths[node_id].copy(), key=lambda x: len(x))
        while len(term_paths_copy) > 0:
            curr_path = list(term_paths_copy.pop())
            selected_highest_ancestor = curr_path.pop()
            related_paths = ancestor_paths[selected_highest_ancestor]
            if not related_paths:
                break
            covered_nodes_set = set([related_path[0] for related_path in related_paths])
            del ancestor_paths[selected_highest_ancestor]
            if curr_path:
                if all(map(lambda x: x[0] == curr_path[0], related_paths)):
                    # every related path starts at the same node: keep that node itself
                    selected_highest_ancestor = curr_path[0]
                else:
                    # walk down from the top of the current path, keeping the lowest
                    # ancestor still shared by all related paths (i indexes from the end)
                    i = -1
                    while len(curr_path) > 1:
                        i -= 1
                        curr_highest_ancestor = curr_path.pop()
                        if not all(map(lambda x: len(x) >= -i, related_paths)):
                            break
                        if all(map(lambda x: x[i] == curr_highest_ancestor, related_paths)):
                            selected_highest_ancestor = curr_highest_ancestor
                    if selected_highest_ancestor in ancestor_paths:
                        del ancestor_paths[selected_highest_ancestor]
                    for path in related_paths:
                        term_paths[path[0]].discard(path)
            final_terms_set[selected_highest_ancestor] = covered_nodes_set
            # NOTE(review): this discard loop repeats the one just above when the else
            # branch ran — presumably harmless (discard is idempotent), but confirm
            for path in related_paths:
                term_paths[path[0]].discard(path)
            if len(term_paths[node_id]) > 0:
                term_paths_copy = term_paths[node_id].copy()
            else:
                break
    if len(list(final_terms_set.keys())) <= max_num_nodes:
        return False, [(term_label, covered_terms) for term_label, covered_terms in final_terms_set.items()]
    else:
        # too many merged terms: let set covering pick the best max_num_nodes of them
        best_terms = find_set_covering(
            [(k, ontology.label(k, id_if_null=True), v) for k, v in final_terms_set.items()],
            max_num_subsets=max_num_nodes)
        covered_terms = set([e for best_term_label, covered_terms in best_terms for e in covered_terms])
        return covered_terms != set(node_ids), best_terms