Пример #1
0
def add_all_edges(g1: BaseGraph, g2: BaseGraph, preserve: bool = True) -> int:
    """
    Copy every edge of the source graph (``g2``) into the target graph (``g1``).

    An edge whose ``(subject, object, key)`` triple is already present in
    ``g1`` is handed to ``merge_edge``; every other edge is added to ``g1``
    as a new edge.

    Parameters
    ----------
    g1: kgx.graph.base_graph.BaseGraph
        Target graph (modified in place)
    g2: kgx.graph.base_graph.BaseGraph
        Source graph
    preserve: bool
        Whether or not to preserve conflicting properties; forwarded
        to ``merge_edge``

    Returns
    -------
    int
        Number of edges that already existed in ``g1`` and were merged

    """
    log.info(f"Adding {g2.number_of_edges()} edges from {g2} to {g1}")
    merged = 0
    for subject_id, object_id, edge_key, edge_props in g2.edges(keys=True,
                                                                data=True):
        if not g1.has_edge(subject_id, object_id, edge_key):
            # Unknown edge: nothing to reconcile, add it as-is.
            g1.add_edge(subject_id, object_id, edge_key=edge_key, **edge_props)
            continue
        merge_edge(g1, subject_id, object_id, edge_key, edge_props, preserve)
        merged += 1
    return merged
Пример #2
0
def remap_node_property(
    graph: BaseGraph, category: str, old_property: str, new_property: str
) -> None:
    """
    Remap the value in node ``old_property`` attribute with value
    from node ``new_property`` attribute.

    Only nodes that carry ``new_property`` are touched. A node that has a
    ``category`` attribute which does not contain ``category`` is skipped;
    a node without any ``category`` attribute is not filtered out.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph
    category: string
        Category referring to nodes whose property needs to be remapped
    old_property: string
        old property name whose value needs to be replaced
    new_property: string
        new property name from which the value is pulled from

    Raises
    ------
    AttributeError
        If ``old_property`` is one of ``CORE_NODE_PROPERTIES``

    """
    if old_property in CORE_NODE_PROPERTIES:
        raise AttributeError(
            f"node property {old_property} cannot be modified as it is a core property."
        )

    mapping = {}
    for nid, node_data in graph.nodes(data=True):
        # The guard must look for the "category" *key*; testing the category
        # value as a key (as the previous code did) made the filter a no-op.
        if "category" in node_data and category not in node_data["category"]:
            continue
        if new_property in node_data:
            mapping[nid] = {old_property: node_data[new_property]}
    graph.set_node_attributes(graph, attributes=mapping)
Пример #3
0
def merge_node(g: BaseGraph,
               n: str,
               data: dict,
               preserve: bool = True) -> dict:
    """
    Merge the properties in ``data`` into the existing node ``n`` of graph ``g``.

    Deep copies of the current node data and of ``data`` are combined by
    ``prepare_data_dict`` and the result is written back with ``g.add_node``.

    Parameters
    ----------
    g: kgx.graph.base_graph.BaseGraph
        The target graph
    n: str
        Node id (looked up in ``g.nodes()``)
    data: dict
        Incoming node properties
    preserve: bool
        Whether or not to preserve conflicting properties; forwarded to
        ``prepare_data_dict``

    Returns
    -------
    dict
        The node's data dict as looked up in ``g`` (fetched before the
        merged properties are written back)

    """
    current = g.nodes()[n]
    # Work on copies so neither the stored node data nor the caller's dict
    # is handed to prepare_data_dict directly.
    current_copy = copy.deepcopy(current)
    incoming_copy = copy.deepcopy(data)
    merged = prepare_data_dict(current_copy, incoming_copy, preserve)
    g.add_node(n, **merged)
    return current
Пример #4
0
def add_all_nodes(g1: BaseGraph, g2: BaseGraph, preserve: bool = True) -> int:
    """
    Copy every node of the source graph (``g2``) into the target graph (``g1``).

    A node id that is already present in ``g1`` is handed to ``merge_node``;
    every other node is added to ``g1`` with its properties.

    Parameters
    ----------
    g1: kgx.graph.base_graph.BaseGraph
        Target graph (modified in place)
    g2: kgx.graph.base_graph.BaseGraph
        Source graph
    preserve: bool
        Whether or not to preserve conflicting properties; forwarded
        to ``merge_node``

    Returns
    -------
    int
        Number of nodes that already existed in ``g1`` and were merged

    """
    log.info(
        f"Adding {g2.number_of_nodes()} nodes from {g2.name} to {g1.name}")
    merged = 0
    for node_id, node_props in g2.nodes(data=True):
        if node_id not in g1.nodes():
            g1.add_node(node_id, **node_props)
        else:
            merge_node(g1, node_id, node_props, preserve)
            merged += 1
    return merged
Пример #5
0
def merge_edge(g: BaseGraph,
               u: str,
               v: str,
               key: str,
               data: dict,
               preserve: bool = True) -> dict:
    """
    Merge the properties in ``data`` into the existing edge ``u`` -> ``v``
    (identified by ``key``) of graph ``g``.

    Deep copies of the current edge data and of ``data`` are combined by
    ``prepare_data_dict`` and the result is written back with ``g.add_edge``.

    Parameters
    ----------
    g: kgx.graph.base_graph.BaseGraph
        The target graph
    u: str
        Subject node id
    v: str
        Object node id
    key: str
        Edge key
    data: dict
        Incoming edge properties
    preserve: bool
        Whether or not to preserve conflicting properties; forwarded to
        ``prepare_data_dict``

    Returns
    -------
    dict
        The edge data as returned by ``g.get_edge`` (fetched before the
        merged properties are written back)

    """
    current = g.get_edge(u, v, key)
    # Work on copies so neither the stored edge data nor the caller's dict
    # is handed to prepare_data_dict directly.
    current_copy = copy.deepcopy(current)
    incoming_copy = copy.deepcopy(data)
    merged = prepare_data_dict(current_copy, incoming_copy, preserve)
    g.add_edge(u, v, edge_key=key, **merged)
    return current
Пример #6
0
def get_parents(graph: BaseGraph,
                node: str,
                relations: List[str] = None) -> List[str]:
    """
    Return the direct `parents` of ``node``: the targets of its outgoing
    edges, optionally restricted to edges whose ``predicate`` is listed
    in ``relations``.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        Graph to traverse
    node: str
        node identifier
    relations: List[str]
       list of relations; ``None`` disables the predicate filter

    Returns
    -------
    List[str]
        A list of parent node(s); empty when ``node`` is not in the graph

    """
    if not graph.has_node(node):
        return []

    outgoing = list(graph.out_edges(node, keys=False, data=True))
    if relations is None:
        return [edge[1] for edge in outgoing]
    # edge[1] is the target node, edge[2] the edge data.
    return [
        edge[1] for edge in outgoing if edge[2]['predicate'] in relations
    ]
Пример #7
0
def apply_node_filters(graph: BaseGraph,
                       node_filters: Dict[str, Union[str, Set]]) -> None:
    """
    Apply filters to graph and remove nodes that do not pass given filters.

    Only the ``category`` filter is evaluated: a node passes when at least
    one of the requested categories is contained in its ``category``
    attribute. Any other filter key is ignored.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph (modified in place)
    node_filters: Dict[str, Union[str, Set]]
        Node filters. The ``category`` value may be a single category
        string or a collection of categories.

    Raises
    ------
    KeyError
        If a ``category`` filter is given and a node has no ``category``
        attribute

    """
    if "category" not in node_filters:
        # No supported filter present: nothing can fail, nothing to remove.
        return

    allowed = node_filters["category"]
    if isinstance(allowed, str):
        # A bare string must be treated as one category; iterating it would
        # compare individual characters against the node categories.
        allowed = {allowed}

    # Collect first, remove afterwards, so the graph is not mutated while
    # its nodes are being iterated.
    nodes_to_remove = [
        node for node, node_data in graph.nodes(data=True)
        if not any(x in node_data["category"] for x in allowed)
    ]

    for node in nodes_to_remove:
        # removing node that fails category filter
        log.debug(f"Removing node {node}")
        graph.remove_node(node)
Пример #8
0
def apply_edge_filters(graph: BaseGraph,
                       edge_filters: Dict[str, Union[str, Set]]) -> None:
    """
    Apply filters to graph and remove edges that do not pass given filters.

    Only the ``predicate`` and ``relation`` filters are evaluated: an edge
    passes when, for each of these filters that is present, the edge's value
    for that property is one of the allowed values. Any other filter key is
    ignored.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph (modified in place)
    edge_filters: Dict[str, Union[str, Set]]
        Edge filters. Each value may be a single string or a collection of
        allowed values.

    Raises
    ------
    KeyError
        If an evaluated filter refers to a property that an edge lacks

    """
    active_filters = {}
    for k, v in edge_filters.items():
        if k in ("predicate", "relation"):
            # A bare string must match exactly; ``in`` on a str would be a
            # substring test and let partial predicates through.
            active_filters[k] = {v} if isinstance(v, str) else v

    # Collect first, remove afterwards, so the graph is not mutated while
    # its edges are being iterated.
    edges_to_remove = []
    for subject_node, object_node, key, data in graph.edges(keys=True,
                                                            data=True):
        if any(data[k] not in allowed
               for k, allowed in active_filters.items()):
            edges_to_remove.append((subject_node, object_node, key))

    for edge in edges_to_remove:
        # removing edge that fails edge filters
        log.debug(f"Removing edge {edge}")
        graph.remove_edge(edge[0], edge[1], edge[2])
Пример #9
0
def fold_predicate(
    graph: BaseGraph, predicate: str, remove_prefix: bool = False
) -> None:
    """
    Fold every edge carrying ``predicate`` into a property on the edge's
    subject node, then remove the edge from the graph.

    The property name is ``predicate`` itself or, with ``remove_prefix``,
    the part after the first ``:``; the property value is the edge's
    object node id.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph (modified in place)
    predicate: str
        The predicate to fold
    remove_prefix: bool
        Whether or not to remove prefix from the predicate (``False``, by default)

    """
    pending_attributes = []
    pending_removals = []
    start = current_time_in_millis()
    if remove_prefix:
        p = predicate.split(":", 1)[1]
    else:
        p = predicate

    # First pass only records what to do, so the graph is not changed while
    # its edges are being iterated.
    for subject_id, object_id, edge_key, edge_data in graph.edges(
        keys=True, data=True
    ):
        if edge_data["predicate"] != predicate:
            continue
        pending_attributes.append((subject_id, p, object_id))
        pending_removals.append((subject_id, object_id, edge_key))

    # Apply in last-recorded-first order.
    for attribute in reversed(pending_attributes):
        graph.add_node_attribute(*attribute)
    for edge in reversed(pending_removals):
        graph.remove_edge(*edge)

    end = current_time_in_millis()
    log.info(f"Time taken: {end - start} ms")
Пример #10
0
def remap_edge_property(
    graph: BaseGraph, edge_predicate: str, old_property: str, new_property: str
) -> None:
    """
    Remap the value in an edge ``old_property`` attribute with value
    from edge ``new_property`` attribute.

    Only edges whose ``predicate`` equals ``edge_predicate`` and that carry
    ``new_property`` are touched.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph
    edge_predicate: string
        edge_predicate referring to edges whose property needs to be remapped
    old_property: string
        Old property name whose value needs to be replaced
    new_property: string
        New property name from which the value is pulled from

    Raises
    ------
    AttributeError
        If ``old_property`` is one of ``CORE_EDGE_PROPERTIES``

    """
    if old_property in CORE_EDGE_PROPERTIES:
        raise AttributeError(
            f"edge property {old_property} cannot be modified as it is a core property."
        )

    mapping = {}
    for u, v, k, edge_data in graph.edges(data=True, keys=True):
        # Compare by value: ``is not`` tests object identity, which is not
        # guaranteed for equal strings and would skip matching edges.
        if edge_predicate != edge_data["predicate"]:
            continue
        if new_property in edge_data:
            mapping[(u, v, k)] = {old_property: edge_data[new_property]}
    graph.set_edge_attributes(graph, attributes=mapping)
Пример #11
0
def get_category_via_superclass(graph: BaseGraph,
                                curie: str,
                                load_ontology: bool = True) -> Set[str]:
    """
    Derive categories for a CURIE by walking its ``subclass_of`` ancestors
    until one of them has a mapping known to the toolkit.

    When such an ancestor is found, the result contains the ``name`` of
    every ancestor visited up to and including it (where the graph node has
    a ``name``) plus the toolkit ancestors of the mapped element.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        Graph to traverse
    curie: str
        Input CURIE
    load_ontology: bool
        When ``True`` and no ancestors are found in ``graph``, the lookup is
        repeated once against the ontology graph of the CURIE lookup service

    Returns
    -------
    Set[str]
        A set containing zero or more categories for the given CURIE

    """
    log.debug("curie: {}".format(curie))
    categories = []
    toolkit = get_toolkit()
    if not PrefixManager.is_curie(curie):
        return set(categories)

    ancestors = get_ancestors(graph, curie, relations=['subclass_of'])
    if load_ontology and len(ancestors) == 0:
        # Nothing in this graph: retry against the ontology graph, with
        # load_ontology switched off so the retry happens only once.
        lookup_service = get_curie_lookup_service()
        categories.extend(
            get_category_via_superclass(lookup_service.ontology_graph, curie,
                                        False))
    log.debug("Ancestors for CURIE {} via subClassOf: {}".format(
        curie, ancestors))

    visited = []
    for ancestor in ancestors:
        mapping = toolkit.get_by_mapping(ancestor)
        visited.append(ancestor)
        if not mapping:
            continue
        # there is direct mapping to BioLink Model
        log.debug("Ancestor {} mapped to {}".format(ancestor, mapping))
        for visited_id in visited:
            visited_node = graph.nodes()[visited_id]
            if 'name' in visited_node:
                categories.append(visited_node['name'])
        categories.extend(toolkit.ancestors(mapping))
        break
    return set(categories)
Пример #12
0
    def validate_edges(self, graph: BaseGraph) -> list:
        """
        Run ``analyse_edge`` over every edge of a graph and collect the errors.

        Edges are read without their keys, so ``None`` is passed as the edge
        key. Progress is reported through a ``click`` progress bar.

        Parameters
        ----------
        graph: kgx.graph.base_graph.BaseGraph
            The graph to validate

        Returns
        -------
        list
            A list of errors for a given graph

        """
        collected = []
        progress = click.progressbar(graph.edges(data=True),
                                     label='Validate edges in graph')
        with progress as bar:
            for subject_id, object_id, edge_data in bar:
                collected.extend(
                    self.analyse_edge(subject_id, object_id, None, edge_data))
        return collected
Пример #13
0
    def validate_nodes(self, graph: BaseGraph) -> list:
        """
        Run ``analyse_node`` over every node of a graph and collect the errors.

        Progress is reported through a ``click`` progress bar.

        Parameters
        ----------
        graph: kgx.graph.base_graph.BaseGraph
            The graph to validate

        Returns
        -------
        list
            A list of errors for a given graph

        """
        collected = []
        progress = click.progressbar(graph.nodes(data=True),
                                     label='Validating nodes in graph')
        with progress as bar:
            for node_id, node_data in bar:
                collected.extend(self.analyse_node(node_id, node_data))
        return collected
Пример #14
0
def build_cliques(target_graph: BaseGraph) -> nx.MultiDiGraph:
    """
    Builds a clique graph from ``same_as`` edges in ``target_graph``.

    Equivalences come from two places: the ``same_as`` property on nodes and
    edges whose ``predicate`` is ``SAME_AS``. Every equivalence is added to
    the clique graph in both directions.

    Parameters
    ----------
    target_graph: kgx.graph.base_graph.BaseGraph
        An instance of BaseGraph that contains nodes and edges

    Returns
    -------
    networkx.MultiDiGraph
        The clique graph with only ``same_as`` edges

    Raises
    ------
    KeyError
        If a ``SAME_AS`` edge in ``target_graph`` has no ``relation`` property

    """
    clique_graph = nx.MultiDiGraph()
    for n, data in target_graph.nodes(data=True):
        if "same_as" not in data:
            continue
        # The node is copied without its same_as list; the list is turned
        # into edges below instead.
        new_data = copy.deepcopy(data)
        del new_data["same_as"]
        clique_graph.add_node(n, **new_data)
        for s in data["same_as"]:
            for subject_id, object_id in ((n, s), (s, n)):
                edge_data = {
                    "subject": subject_id,
                    "predicate": SAME_AS,
                    "object": object_id,
                }
                if "provided_by" in data:
                    edge_data["provided_by"] = data["provided_by"]
                clique_graph.add_edge(subject_id, object_id, **edge_data)
    for u, v, data in target_graph.edges(data=True):
        if "predicate" in data and data["predicate"] == SAME_AS:
            # load all biolink:same_as edges to clique_graph
            clique_graph.add_node(u, **target_graph.nodes()[u])
            clique_graph.add_node(v, **target_graph.nodes()[v])
            clique_graph.add_edge(u, v, **data)
            # Reverse edge v -> u: its object is u (it was wrongly recorded
            # as v, which described a self-loop).
            clique_graph.add_edge(
                v,
                u,
                **{
                    "subject": v,
                    "predicate": data["predicate"],
                    "object": u,
                    "relation": data["relation"],
                },
            )
    return clique_graph
Пример #15
0
def build_cliques(target_graph: BaseGraph) -> nx.MultiDiGraph:
    """
    Builds a clique graph from ``same_as`` edges in ``target_graph``.

    Equivalences come from two places: the ``same_as`` property on nodes and
    edges whose ``predicate`` is ``SAME_AS``. Every equivalence is added to
    the clique graph in both directions.

    Parameters
    ----------
    target_graph: kgx.graph.base_graph.BaseGraph
        An instance of BaseGraph that contains nodes and edges

    Returns
    -------
    networkx.MultiDiGraph
        The clique graph with only ``same_as`` edges

    Raises
    ------
    KeyError
        If a ``SAME_AS`` edge in ``target_graph`` has no ``relation`` property

    """
    clique_graph = nx.MultiDiGraph()
    for n, data in target_graph.nodes(data=True):
        if 'same_as' not in data:
            continue
        # The node is copied without its same_as list; the list is turned
        # into edges below instead.
        new_data = copy.deepcopy(data)
        del new_data['same_as']
        clique_graph.add_node(n, **new_data)
        for s in data['same_as']:
            for subject_id, object_id in ((n, s), (s, n)):
                edge_data = {
                    'subject': subject_id,
                    'predicate': SAME_AS,
                    'object': object_id
                }
                if 'provided_by' in data:
                    edge_data['provided_by'] = data['provided_by']
                clique_graph.add_edge(subject_id, object_id, **edge_data)
    for u, v, data in target_graph.edges(data=True):
        if 'predicate' in data and data['predicate'] == SAME_AS:
            # load all biolink:same_as edges to clique_graph
            clique_graph.add_node(u, **target_graph.nodes()[u])
            clique_graph.add_node(v, **target_graph.nodes()[v])
            clique_graph.add_edge(u, v, **data)
            # Reverse edge v -> u: its object is u (it was wrongly recorded
            # as v, which described a self-loop).
            clique_graph.add_edge(
                v, u, **{
                    'subject': v,
                    'predicate': data['predicate'],
                    'object': u,
                    'relation': data['relation']
                })
    return clique_graph
Пример #16
0
def unfold_node_property(graph: BaseGraph,
                         node_property: str,
                         prefix: Optional[str] = None) -> None:
    """
    Unfold a node property into edges: for every node that has
    ``node_property``, add an edge from the node to the property's value and
    delete the property from the node.

    The edge key, ``predicate`` and ``relation`` are all ``node_property``,
    prefixed with ``prefix:`` when a prefix is given.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph (modified in place)
    node_property: str
        The node property to unfold
    prefix: Optional[str]
        The prefix to use

    """
    pending_edges = []
    pending_deletions = []
    start = current_time_in_millis()
    p = node_property
    if prefix:
        p = f"{prefix}:{node_property}"

    # First pass only records what to do, so the graph is not changed while
    # its nodes are being iterated.
    for node_id, node_data in graph.nodes(data=True):
        if node_property not in node_data:
            continue
        pending_edges.append((node_id, node_data[node_property], p))
        pending_deletions.append((node_id, node_property))

    # Apply in last-recorded-first order.
    for subject_id, object_id, edge_predicate in reversed(pending_edges):
        graph.add_edge(
            subject_id,
            object_id,
            edge_predicate,
            subject=subject_id,
            object=object_id,
            predicate=edge_predicate,
            relation=edge_predicate)
    for node_id, property_name in reversed(pending_deletions):
        del graph.nodes()[node_id][property_name]

    end = current_time_in_millis()
    log.info(f"Time taken: {end - start} ms")
Пример #17
0
    def summarize_graph_nodes(self, graph: BaseGraph) -> Dict:
        """
        Record the total node count, run ``analyse_node`` over every node of
        the graph and return the accumulated node stats.

        Parameters
        ----------
        graph: kgx.graph.base_graph.BaseGraph
            The graph

        Returns
        -------
        Dict
            The node stats, as returned by ``get_node_stats``
        """
        # TODO: count TOTAL_NODES somewhere else?
        total_nodes = len(graph.nodes())
        self.add_node_stat(TOTAL_NODES, total_nodes)

        for node_id, node_data in graph.nodes(data=True):
            self.analyse_node(node_id, node_data)

        node_stats = self.get_node_stats()
        return node_stats
Пример #18
0
def remove_singleton_nodes(graph: BaseGraph) -> None:
    """
    Drop every singleton node (a node whose degree is 0) from the graph.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph (modified in place)

    """
    start = current_time_in_millis()
    # Collect first so the graph is not changed while its degrees are read.
    isolated = [node_id for node_id, degree in graph.degree() if degree == 0]
    # Remove in last-collected-first order.
    for node_id in reversed(isolated):
        log.debug(f"Removing singleton node {node_id}")
        graph.remove_node(node_id)
    end = current_time_in_millis()
    log.info(f"Time taken: {end - start} ms")
Пример #19
0
def generate_edge_identifiers(graph: BaseGraph):
    """
    Assign a freshly generated identifier to every edge of the graph whose
    data has no ``id`` field. Edges that already have an ``id`` are left
    untouched.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph whose edge data is updated in place

    """
    for _subject_id, _object_id, edge_data in graph.edges(data=True):
        if "id" in edge_data:
            continue
        edge_data["id"] = generate_uuid()
Пример #20
0
    def summarize_graph_edges(self, graph: BaseGraph) -> Dict:
        """
        Record the total edge count, run ``analyse_edge`` over every edge of
        the graph, flag the edges as processed and return the accumulated
        edge stats.

        Parameters
        ----------
        graph: kgx.graph.base_graph.BaseGraph
            The graph

        Returns
        -------
        Dict
            The edge stats, as returned by ``get_edge_stats``
        """
        # TODO: count TOTAL_EDGES somewhere else?
        total_edges = len(graph.edges())
        self.edge_stats[TOTAL_EDGES] = total_edges

        for subject_id, object_id, edge_key, edge_data in graph.edges(
                keys=True, data=True):
            self.analyse_edge(subject_id, object_id, edge_key, edge_data)

        self.edges_processed = True

        edge_stats = self.get_edge_stats()
        return edge_stats
Пример #21
0
    def summarize_graph_nodes(self, graph: BaseGraph) -> Dict:
        """
        Run ``analyse_node`` over every node of the graph and return the
        accumulated node stats.

        Parameters
        ----------
        graph: kgx.graph.base_graph.BaseGraph
            The graph

        Returns
        -------
        Dict
            The node stats, as returned by ``get_node_stats``
        """
        for node_id, node_data in graph.nodes(data=True):
            self.analyse_node(node_id, node_data)

        node_stats = self.get_node_stats()
        return node_stats
Пример #22
0
    def summarize_graph_edges(self, graph: BaseGraph) -> Dict:
        """
        Run ``analyse_edge`` over every edge of the graph and return the
        accumulated edge stats.

        Parameters
        ----------
        graph: kgx.graph.base_graph.BaseGraph
            The graph

        Returns
        -------
        Dict
            The edge stats, as returned by ``get_edge_stats``
        """
        for subject_id, object_id, edge_key, edge_data in graph.edges(
                keys=True, data=True):
            self.analyse_edge(subject_id, object_id, edge_key, edge_data)

        edge_stats = self.get_edge_stats()
        return edge_stats
Пример #23
0
    def validate_nodes(self, graph: BaseGraph):
        """
        Run ``analyse_node`` over every node of a graph.

        Nothing is returned; whatever ``analyse_node`` records is kept by
        that method. Progress is reported through a ``click`` progress bar.

        Parameters
        ----------
        graph: kgx.graph.base_graph.BaseGraph
            The graph to validate

        """
        progress = click.progressbar(graph.nodes(data=True),
                                     label="Validating nodes in graph")
        with progress as bar:
            for node_id, node_data in bar:
                self.analyse_node(node_id, node_data)
Пример #24
0
    def validate_edges(self, graph: BaseGraph):
        """
        Run ``analyse_edge`` over every edge of a graph.

        Edges are read without their keys, so ``None`` is passed as the edge
        key. Nothing is returned; whatever ``analyse_edge`` records is kept
        by that method. Progress is reported through a ``click`` progress bar.

        Parameters
        ----------
        graph: kgx.graph.base_graph.BaseGraph
            The graph to validate

        """
        progress = click.progressbar(graph.edges(data=True),
                                     label="Validate edges in graph")
        with progress as bar:
            for subject_id, object_id, edge_data in bar:
                self.analyse_edge(subject_id, object_id, None, edge_data)
Пример #25
0
def elect_leader(
    target_graph: BaseGraph,
    clique_graph: nx.MultiDiGraph,
    leader_annotation: str,
    prefix_prioritization_map: Optional[Dict[str, List[str]]],
    category_mapping: Optional[Dict[str, str]],
    strict: bool = True,
) -> BaseGraph:
    """
    Elect leader for each clique in a graph.

    A clique is a strongly connected component of ``clique_graph``. For each
    clique, nodes flagged ``_excluded_from_clique`` or whose first category
    is not among the clique category ancestors are removed from the clique
    graph. A leader is then chosen among the remaining nodes by trying, in
    order: the leader annotation, prefix prioritization for the clique
    category, and a sort-based fallback. The elected leaders are marked on
    both the clique graph and the target graph.

    Parameters
    ----------
    target_graph: kgx.graph.base_graph.BaseGraph
        The original graph
    clique_graph: networkx.Graph
        The clique graph (invalid nodes are removed from it)
    leader_annotation: str
        The field on a node that signifies that the node is the leader of a clique
    prefix_prioritization_map: Optional[Dict[str, List[str]]]
        A map that gives a prefix priority for one or more categories
    category_mapping: Optional[Dict[str, str]]
        Mapping for non-Biolink Model categories to Biolink Model categories
    strict: bool
        Whether or not to merge nodes in a clique that have conflicting node categories

    Returns
    -------
    kgx.graph.base_graph.BaseGraph
        The updated target graph

    """
    cliques = list(nx.strongly_connected_components(clique_graph))
    log.info(f"Total cliques in clique graph: {len(cliques)}")
    count = 0
    update_dict = {}
    for clique in cliques:
        log.info(
            f"Processing clique: {clique} with {[clique_graph.nodes()[x]['category'] if 'category' in clique_graph.nodes()[x] else None for x in clique]}"
        )
        update_node_categories(
            target_graph, clique_graph, clique, category_mapping, strict
        )
        clique_category, clique_category_ancestors = get_clique_category(
            clique_graph, clique
        )
        log.debug(f"Clique category: {clique_category}")
        invalid_nodes = set()
        for n in clique:
            data = clique_graph.nodes()[n]
            if "_excluded_from_clique" in data and data["_excluded_from_clique"]:
                log.info(
                    f"Removing invalid node {n} from clique graph; node marked to be excluded"
                )
                clique_graph.remove_node(n)
                invalid_nodes.add(n)
            # ``elif``: a node must be removed at most once. Removing an
            # already-removed node makes networkx raise NetworkXError, which
            # happened when an excluded node also had a mismatching category.
            elif data["category"][0] not in clique_category_ancestors:
                log.info(
                    f"Removing invalid node {n} from the clique graph; node category {data['category'][0]} not in CCA: {clique_category_ancestors}"
                )
                clique_graph.remove_node(n)
                invalid_nodes.add(n)

        filtered_clique = [x for x in clique if x not in invalid_nodes]
        if filtered_clique and clique_category:
            # First check for LEADER_ANNOTATION property
            leader, election_strategy = get_leader_by_annotation(
                target_graph, clique_graph, filtered_clique, leader_annotation
            )
            if not leader:
                # Leader is None; use prefix prioritization strategy
                log.debug(
                    "Could not elect clique leader by looking for LEADER_ANNOTATION property; "
                    "Using prefix prioritization instead"
                )
                if (
                    prefix_prioritization_map
                    and clique_category in prefix_prioritization_map.keys()
                ):
                    leader, election_strategy = get_leader_by_prefix_priority(
                        target_graph,
                        clique_graph,
                        filtered_clique,
                        prefix_prioritization_map[clique_category],
                    )
                else:
                    log.debug(
                        f"No prefix order found for category '{clique_category}' in PREFIX_PRIORITIZATION_MAP"
                    )

            if not leader:
                # Leader is None; fall back to alphabetical sort on prefixes
                log.debug(
                    "Could not elect clique leader by PREFIX_PRIORITIZATION; Using alphabetical sort on prefixes"
                )
                leader, election_strategy = get_leader_by_sort(
                    target_graph, clique_graph, filtered_clique
                )

            log.debug(
                f"Elected {leader} as leader via {election_strategy} for clique {filtered_clique}"
            )
            # NOTE(review): the leader is flagged under the module-level
            # LEADER_ANNOTATION key, not the ``leader_annotation`` argument;
            # confirm the two are always meant to coincide.
            update_dict[leader] = {
                LEADER_ANNOTATION: True,
                "election_strategy": election_strategy,
            }
            count += 1

    nx.set_node_attributes(clique_graph, update_dict)
    target_graph.set_node_attributes(target_graph, update_dict)
    log.info(f"Total merged cliques: {count}")
    return target_graph
Пример #26
0
def consolidate_edges(
    target_graph: BaseGraph, clique_graph: nx.MultiDiGraph, leader_annotation: str
) -> BaseGraph:
    """
    Move all edges from nodes in a clique to the clique leader.

    A clique is a strongly connected component of ``clique_graph``; its
    leader is the first node in it with a truthy ``leader_annotation``.
    Cliques without a leader are skipped.

    For every non-leader node of a clique, its non-``SAME_AS`` in-edges and
    out-edges in ``target_graph`` are removed and re-added with the leader in
    place of the node. The original subject and object of each moved edge are
    preserved via ``ORIGINAL_SUBJECT_PROPERTY`` and ``ORIGINAL_OBJECT_PROPERTY``.
    ``SAME_AS`` edges are removed and their endpoints are recorded on the
    leader's ``same_as`` property; those equivalent nodes are then removed
    from ``target_graph``.

    Parameters
    ----------
    target_graph: kgx.graph.base_graph.BaseGraph
        The original graph (modified in place)
    clique_graph: networkx.MultiDiGraph
        The clique graph
    leader_annotation: str
        The field on a node that signifies that the node is the leader of a clique

    Returns
    -------
    kgx.graph.base_graph.BaseGraph
        The target graph where all edges from nodes in a clique are moved to clique leader

    """
    cliques = list(nx.strongly_connected_components(clique_graph))
    log.info(f"Consolidating edges in {len(cliques)} cliques")
    for clique in cliques:
        log.debug(f"Processing clique: {clique}")
        # Nodes of this clique whose leader annotation is present and truthy.
        leaders: List = [
            x
            for x in clique
            if leader_annotation in clique_graph.nodes()[x]
            and clique_graph.nodes()[x][leader_annotation]
        ]
        if len(leaders) == 0:
            log.debug("No leader elected for clique {}; skipping".format(clique))
            continue
        # If several nodes are flagged, the first one found is used.
        leader: str = leaders[0]
        # Copy the leader annotation and election strategy from the clique
        # graph onto the leader node in the target graph.
        target_graph.set_node_attributes(
            target_graph,
            {
                leader: {
                    leader_annotation: clique_graph.nodes()[leader].get(
                        leader_annotation
                    ),
                    "election_strategy": clique_graph.nodes()[leader].get(
                        "election_strategy"
                    ),
                }
            },
        )
        # Seed the leader's equivalents with its neighbors in the clique graph.
        leader_equivalent_identifiers = set([x for x in clique_graph.neighbors(leader)])
        for node in clique:
            if node == leader:
                continue
            log.debug(f"Looking for in_edges for {node}")
            # NOTE(review): in_edges is iterated twice and passed to len(), so
            # this assumes in_edges() returns a sized, re-iterable collection
            # - confirm against BaseGraph.
            in_edges = target_graph.in_edges(node, keys=False, data=True)
            filtered_in_edges = [x for x in in_edges if x[2]["predicate"] != SAME_AS]
            equiv_in_edges = [x for x in in_edges if x[2]["predicate"] == SAME_AS]
            log.debug(f"Moving {len(in_edges)} in-edges from {node} to {leader}")
            for u, v, edge_data in filtered_in_edges:
                # Edges are fetched without keys, so the key of the existing
                # edge is regenerated from (subject, predicate, object).
                key = generate_edge_key(u, edge_data["predicate"], v)
                target_graph.remove_edge(u, v, edge_key=key)
                edge_data[ORIGINAL_SUBJECT_PROPERTY] = edge_data["subject"]
                edge_data[ORIGINAL_OBJECT_PROPERTY] = edge_data["object"]
                # Re-point the edge's object at the leader.
                edge_data["object"] = leader
                key = generate_edge_key(u, edge_data["predicate"], leader)
                # A subclass_of edge that would become a self-loop is dropped
                # (it was already removed above and is not re-added).
                if (
                    edge_data["subject"] == edge_data["object"]
                    and edge_data["predicate"] == SUBCLASS_OF
                ):
                    continue
                target_graph.add_edge(
                    edge_data["subject"], edge_data["object"], key, **edge_data
                )

            log.debug(f"Looking for out_edges for {node}")
            # NOTE(review): same sized / re-iterable assumption as for in_edges.
            out_edges = target_graph.out_edges(node, keys=False, data=True)
            filtered_out_edges = [x for x in out_edges if x[2]["predicate"] != SAME_AS]
            equiv_out_edges = [x for x in out_edges if x[2]["predicate"] == SAME_AS]
            log.debug(f"Moving {len(out_edges)} out-edges from {node} to {leader}")
            for u, v, edge_data in filtered_out_edges:
                key = generate_edge_key(u, edge_data["predicate"], v)
                target_graph.remove_edge(u, v, edge_key=key)
                edge_data[ORIGINAL_SUBJECT_PROPERTY] = edge_data["subject"]
                edge_data[ORIGINAL_OBJECT_PROPERTY] = edge_data["object"]
                # Re-point the edge's subject at the leader.
                edge_data["subject"] = leader
                key = generate_edge_key(leader, edge_data["predicate"], v)
                # A subclass_of edge that would become a self-loop is dropped
                # (it was already removed above and is not re-added).
                if (
                    edge_data["subject"] == edge_data["object"]
                    and edge_data["predicate"] == SUBCLASS_OF
                ):
                    continue
                target_graph.add_edge(
                    edge_data["subject"], edge_data["object"], key, **edge_data
                )

            # NOTE(review): this message prints equiv_out_edges although the
            # loop that follows processes equiv_in_edges - looks like a
            # copy/paste slip; confirm the intended message.
            log.debug(f"equiv out edges: {equiv_out_edges}")
            equivalent_identifiers = set()
            # Remove incoming same_as edges, remembering every endpoint other
            # than the leader as an equivalent identifier.
            for u, v, edge_data in equiv_in_edges:
                if u != leader:
                    equivalent_identifiers.add(u)
                if v != leader:
                    equivalent_identifiers.add(v)
                target_graph.remove_edge(
                    u, v, edge_key=generate_edge_key(u, SAME_AS, v)
                )

            log.debug(f"equiv out edges: {equiv_out_edges}")
            # Same treatment for outgoing same_as edges.
            for u, v, edge_data in equiv_out_edges:
                if u != leader:
                    log.debug(f"{u} is an equivalent identifier of leader {leader}")
                    equivalent_identifiers.add(u)
                if v != leader:
                    log.debug(f"{v} is an equivalent identifier of leader {leader}")
                    equivalent_identifiers.add(v)
                target_graph.remove_edge(
                    u, v, edge_key=generate_edge_key(u, SAME_AS, v)
                )

            leader_equivalent_identifiers.update(equivalent_identifiers)

        log.debug(
            f"setting same_as property to leader node with {leader_equivalent_identifiers}"
        )
        # Record all equivalents on the leader before the nodes are removed.
        target_graph.set_node_attributes(
            target_graph, {leader: {"same_as": list(leader_equivalent_identifiers)}}
        )
        log.debug(
            f"removing equivalent nodes of leader: {leader_equivalent_identifiers}"
        )
        for n in leader_equivalent_identifiers:
            target_graph.remove_node(n)
    return target_graph
Пример #27
0
def update_node_categories(
    target_graph: BaseGraph,
    clique_graph: nx.MultiDiGraph,
    clique: List,
    category_mapping: Optional[Dict[str, str]],
    strict: bool = True,
) -> List:
    """
    Recompute the ``category`` of every node in a clique and record any
    categories that failed validation.

    For each node the categories are read from the clique graph (or derived
    via ``get_category_from_equivalence`` when the node has no ``category``
    property), classified with ``check_all_categories``, and the valid ones
    are expanded with ``get_biolink_ancestors``. The resulting properties are
    written to both the clique graph and the target graph.

    Parameters
    ----------
    target_graph: kgx.graph.base_graph.BaseGraph
        The original graph; receives only the validation markers
    clique_graph: networkx.MultiDiGraph
        The clique graph; receives the new ``category`` and the validation markers
    clique: List
        A list of nodes from a clique
    category_mapping: Optional[Dict[str, str]]
        Mapping for non-Biolink Model categories to Biolink Model categories
        (accepted for interface compatibility; not read by this function)
    strict: bool
        When true, a node with invalid Biolink categories is flagged with
        ``_excluded_from_clique``

    Returns
    -------
    List
        The clique, as passed in

    """
    clique_graph_updates: Dict = {}
    target_graph_updates: Dict = {}

    for member in clique:
        attrs = clique_graph.nodes()[member]
        # Prefer the node's own category; otherwise derive one from its equivalences
        categories = (
            attrs["category"]
            if "category" in attrs
            else get_category_from_equivalence(
                target_graph, clique_graph, member, attrs
            )
        )

        # Split the categories into valid Biolink, invalid Biolink and non-Biolink groups
        (
            valid_biolink_categories,
            invalid_biolink_categories,
            invalid_categories,
        ) = check_all_categories(categories)
        log.debug(
            f"valid biolink categories: {valid_biolink_categories} invalid biolink categories: {invalid_biolink_categories} invalid_categories: {invalid_categories}"
        )

        # Grow the category list whenever a category has more ancestors than
        # have been collected so far.
        # NOTE(review): ancestors are appended rather than substituted, so the
        # list may hold repeated entries -- confirm this is intended.
        extended_categories: List = []
        for biolink_category in valid_biolink_categories:
            lineage = get_biolink_ancestors(biolink_category)
            if len(extended_categories) < len(lineage):
                extended_categories.extend(lineage)
        log.debug(f"Extended categories: {extended_categories}")

        # Validation markers are identical for both graphs
        markers: Dict = {}
        if invalid_biolink_categories:
            if strict:
                markers["_excluded_from_clique"] = True
            markers["invalid_biolink_category"] = invalid_biolink_categories
        if invalid_categories:
            markers["_invalid_category"] = invalid_categories

        clique_graph_updates[member] = {
            "category": extended_categories.copy(),
            **markers,
        }
        target_graph_updates[member] = dict(markers)

    nx.set_node_attributes(clique_graph, clique_graph_updates)
    target_graph.set_node_attributes(target_graph, target_graph_updates)
    return clique
Пример #28
0
def remap_node_identifier(
    graph: BaseGraph, category: str, alternative_property: str, prefix=None
) -> BaseGraph:
    """
    Remap a node's 'id' attribute with value from a node's ``alternative_property`` attribute.

    Nodes are relabelled in the graph, and every edge whose endpoints no
    longer match its ``subject``/``object`` properties gets those properties
    and its ``edge_key`` refreshed.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph
    category: string
        category referring to nodes whose 'id' needs to be remapped.
        Nodes that have a ``category`` property not containing this value
        are skipped; nodes without a ``category`` property are considered.
    alternative_property: string
        property name from which the new value is pulled from
    prefix: string
        Optional filter on the candidate value(s). When the property holds a
        list, set or tuple, the first value containing ``prefix`` is used;
        when it holds a single string, the string is used only if it starts
        with ``prefix``. Without a prefix the first value of a collection
        (or the string itself) is used.

    Returns
    -------
    kgx.graph.base_graph.BaseGraph
        The modified graph

    """
    mapping: Dict = {}
    for nid, node_data in graph.nodes(data=True):
        if "category" in node_data and category not in node_data["category"]:
            continue
        if alternative_property not in node_data:
            continue

        alternative_values = node_data[alternative_property]
        if isinstance(alternative_values, (list, set, tuple)):
            if prefix:
                for v in alternative_values:
                    if prefix in v:
                        # take the first occurring value that contains the given prefix
                        mapping[nid] = {"id": v}
                        break
            elif alternative_values:
                # no prefix defined; pick the 1st one from the collection.
                # An empty collection offers no candidate, so the node keeps its id.
                mapping[nid] = {"id": next(iter(alternative_values))}
        elif isinstance(alternative_values, str):
            if not prefix or alternative_values.startswith(prefix):
                mapping[nid] = {"id": alternative_values}
        else:
            log.error(
                f"Cannot use {alternative_values} from alternative_property {alternative_property}"
            )

    graph.set_node_attributes(graph, attributes=mapping)
    graph.relabel_nodes(graph, {old: new["id"] for old, new in mapping.items()})

    # After relabelling, bring 'subject', 'object' and 'edge_key' of affected
    # edges back in sync with their (possibly new) endpoints.
    update_edge_keys = {}
    updated_subject_values = {}
    updated_object_values = {}
    for u, v, k, edge_data in graph.edges(data=True, keys=True):
        # Compare by value: identity ('is') is unreliable for strings and
        # would flag equal-but-distinct objects as changed.
        subject_changed = u != edge_data["subject"]
        object_changed = v != edge_data["object"]
        if subject_changed:
            updated_subject_values[(u, v, k)] = {"subject": u}
        if object_changed:
            updated_object_values[(u, v, k)] = {"object": v}
        if subject_changed or object_changed:
            update_edge_keys[(u, v, k)] = {
                "edge_key": generate_edge_key(u, edge_data["predicate"], v)
            }

    graph.set_edge_attributes(graph, attributes=updated_subject_values)
    graph.set_edge_attributes(graph, attributes=updated_object_values)
    graph.set_edge_attributes(graph, attributes=update_edge_keys)
    return graph