def add_all_edges(g1: BaseGraph, g2: BaseGraph, preserve: bool = True) -> int:
    """
    Copy every edge of the source graph (``g2``) into the target graph (``g1``).

    An edge whose ``(subject, object, key)`` already exists in ``g1`` is merged
    through ``merge_edge``; any other edge is added to ``g1`` unchanged.

    Parameters
    ----------
    g1: kgx.graph.base_graph.BaseGraph
        Target graph
    g2: kgx.graph.base_graph.BaseGraph
        Source graph
    preserve: bool
        Whether or not to preserve conflicting properties

    Returns
    -------
    int
        Number of edges merged during this operation (newly added edges
        are not counted)

    """
    log.info(f"Adding {g2.number_of_edges()} edges from {g2} to {g1}")
    merged = 0
    for subject, obj, edge_key, edge_data in g2.edges(keys=True, data=True):
        if not g1.has_edge(subject, obj, edge_key):
            # Unknown edge: nothing to reconcile, add it directly.
            g1.add_edge(subject, obj, edge_key=edge_key, **edge_data)
            continue
        merge_edge(g1, subject, obj, edge_key, edge_data, preserve)
        merged += 1
    return merged
def remap_node_property(
    graph: BaseGraph, category: str, old_property: str, new_property: str
) -> None:
    """
    Remap the value in node ``old_property`` attribute with value
    from node ``new_property`` attribute.

    Only nodes that belong to ``category`` are touched. A node that has no
    ``category`` attribute at all is not filtered out.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph
    category: string
        Category referring to nodes whose property needs to be remapped
    old_property: string
        old property name whose value needs to be replaced
    new_property: string
        new property name from which the value is pulled from

    Raises
    ------
    AttributeError
        If ``old_property`` is one of ``CORE_NODE_PROPERTIES``

    """
    if old_property in CORE_NODE_PROPERTIES:
        raise AttributeError(
            f"node property {old_property} cannot be modified as it is a core property."
        )

    mapping = {}
    for nid, node_data in graph.nodes(data=True):
        # The membership test must look up the literal "category" attribute;
        # testing the category *value* as a key never matches a real node.
        if "category" in node_data and category not in node_data["category"]:
            continue
        if new_property in node_data:
            mapping[nid] = {old_property: node_data[new_property]}
    graph.set_node_attributes(graph, attributes=mapping)
def merge_node(g: BaseGraph, n: str, data: dict, preserve: bool = True) -> dict:
    """
    Merge the properties in ``data`` into the existing node ``n`` of graph ``g``.

    Both the stored node record and ``data`` are deep-copied before being
    combined by ``prepare_data_dict``, and the result is written back with
    ``g.add_node``.

    Parameters
    ----------
    g: kgx.graph.base_graph.BaseGraph
        The target graph
    n: str
        Node id
    data: dict
        Node properties
    preserve: bool
        Whether or not to preserve conflicting properties

    Returns
    -------
    dict
        The node record for ``n`` as obtained from ``g.nodes()`` before the
        write-back (NOTE(review): whether it reflects the merged values
        depends on how ``add_node`` updates existing records)

    """
    existing_node = g.nodes()[n]
    current_copy = copy.deepcopy(existing_node)
    incoming_copy = copy.deepcopy(data)
    merged = prepare_data_dict(current_copy, incoming_copy, preserve)
    g.add_node(n, **merged)
    return existing_node
def add_all_nodes(g1: BaseGraph, g2: BaseGraph, preserve: bool = True) -> int:
    """
    Copy every node of the source graph (``g2``) into the target graph (``g1``).

    A node id already present in ``g1`` is merged through ``merge_node``;
    any other node is added to ``g1`` unchanged.

    Parameters
    ----------
    g1: kgx.graph.base_graph.BaseGraph
        Target graph
    g2: kgx.graph.base_graph.BaseGraph
        Source graph
    preserve: bool
        Whether or not to preserve conflicting properties

    Returns
    -------
    int
        Number of nodes merged during this operation (newly added nodes
        are not counted)

    """
    log.info(f"Adding {g2.number_of_nodes()} nodes from {g2.name} to {g1.name}")
    merged = 0
    for node_id, node_data in g2.nodes(data=True):
        if node_id not in g1.nodes():
            # Unknown node: nothing to reconcile, add it directly.
            g1.add_node(node_id, **node_data)
            continue
        merge_node(g1, node_id, node_data, preserve)
        merged += 1
    return merged
def merge_edge(
    g: BaseGraph, u: str, v: str, key: str, data: dict, preserve: bool = True
) -> dict:
    """
    Merge the properties in ``data`` into the existing edge ``u`` -> ``v``
    (identified by ``key``) of graph ``g``.

    Both the stored edge record and ``data`` are deep-copied before being
    combined by ``prepare_data_dict``, and the result is written back with
    ``g.add_edge``.

    Parameters
    ----------
    g: kgx.graph.base_graph.BaseGraph
        The target graph
    u: str
        Subject node id
    v: str
        Object node id
    key: str
        Edge key
    data: dict
        Edge properties
    preserve: bool
        Whether or not to preserve conflicting properties

    Returns
    -------
    dict
        The edge record as obtained from ``g.get_edge`` before the
        write-back (NOTE(review): whether it reflects the merged values
        depends on how ``add_edge`` updates existing records)

    """
    existing_edge = g.get_edge(u, v, key)
    current_copy = copy.deepcopy(existing_edge)
    incoming_copy = copy.deepcopy(data)
    merged = prepare_data_dict(current_copy, incoming_copy, preserve)
    g.add_edge(u, v, edge_key=key, **merged)
    return existing_edge
def get_parents(graph: BaseGraph, node: str, relations: List[str] = None) -> List[str]:
    """
    Return the direct `parents` of ``node``: the targets of its outgoing
    edges, optionally restricted to edges whose ``predicate`` is listed in
    ``relations``.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        Graph to traverse
    node: str
        node identifier
    relations: List[str]
        list of relations; ``None`` disables predicate filtering

    Returns
    -------
    List[str]
        A list of parent node(s); empty when ``node`` is not in the graph

    """
    if not graph.has_node(node):
        return []

    # Each entry is a (subject, object, edge_data) triple.
    outgoing = list(graph.out_edges(node, keys=False, data=True))
    if relations is None:
        return [edge[1] for edge in outgoing]
    return [edge[1] for edge in outgoing if edge[2]['predicate'] in relations]
def apply_node_filters(graph: BaseGraph, node_filters: Dict[str, Union[str, Set]]) -> None:
    """
    Remove from ``graph`` every node that fails the given filters.

    Only the ``category`` filter is evaluated: a node passes when at least
    one of the filter values occurs in the node's ``category`` property.
    Any other filter key is ignored.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph
    node_filters: Dict[str, Union[str, Set]]
        Node filters

    """
    rejected = []
    for node, node_data in graph.nodes(data=True):
        keep = True
        for name, allowed in node_filters.items():
            if name != "category":
                continue
            if not any(value in node_data[name] for value in allowed):
                keep = False
        if not keep:
            rejected.append(node)

    # Removal happens after the scan so the node iteration is not disturbed.
    for node in rejected:
        log.debug(f"Removing node {node}")
        graph.remove_node(node)
def apply_edge_filters(graph: BaseGraph, edge_filters: Dict[str, Union[str, Set]]) -> None:
    """
    Remove from ``graph`` every edge that fails the given filters.

    Only the ``predicate`` and ``relation`` filters are evaluated: an edge
    fails when its value for that property is not contained in the filter
    value. Any other filter key is ignored.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph
    edge_filters: Dict[str, Union[str, Set]]
        Edge filters

    """
    rejected = []
    for subject_node, object_node, key, data in graph.edges(keys=True, data=True):
        keep = True
        for name, allowed in edge_filters.items():
            if name in ("predicate", "relation") and data[name] not in allowed:
                keep = False
        if not keep:
            rejected.append((subject_node, object_node, key))

    # Removal happens after the scan so the edge iteration is not disturbed.
    for edge in rejected:
        log.debug(f"Removing edge {edge}")
        graph.remove_edge(*edge)
def fold_predicate(
    graph: BaseGraph, predicate: str, remove_prefix: bool = False
) -> None:
    """
    Fold every edge carrying ``predicate`` into a property on its subject
    node: the property is named after the predicate and holds the edge's
    object. The folded edges are then removed from the graph.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph
    predicate: str
        The predicate to fold
    remove_prefix: bool
        Whether or not to remove prefix from the predicate (``False``, by default)

    """
    started = current_time_in_millis()
    # With remove_prefix, only the part after the first ':' names the property.
    property_name = predicate.split(":", 1)[1] if remove_prefix else predicate

    matching_edges = [
        (u, v, k)
        for u, v, k, data in graph.edges(keys=True, data=True)
        if data["predicate"] == predicate
    ]

    # Processed last-to-first; all node attributes are written before any
    # edge is removed.
    for u, v, _ in reversed(matching_edges):
        graph.add_node_attribute(u, property_name, v)
    for edge in reversed(matching_edges):
        graph.remove_edge(*edge)

    finished = current_time_in_millis()
    log.info(f"Time taken: {finished - started} ms")
def remap_edge_property(
    graph: BaseGraph, edge_predicate: str, old_property: str, new_property: str
) -> None:
    """
    Remap the value in an edge ``old_property`` attribute with value
    from edge ``new_property`` attribute.

    Only edges whose ``predicate`` equals ``edge_predicate`` are touched.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph
    edge_predicate: string
        edge_predicate referring to edges whose property needs to be remapped
    old_property: string
        Old property name whose value needs to be replaced
    new_property: string
        New property name from which the value is pulled from

    Raises
    ------
    AttributeError
        If ``old_property`` is one of ``CORE_EDGE_PROPERTIES``

    """
    if old_property in CORE_EDGE_PROPERTIES:
        raise AttributeError(
            f"edge property {old_property} cannot be modified as it is a core property."
        )

    mapping = {}
    for u, v, k, edge_data in graph.edges(data=True, keys=True):
        # Compare by value: two equal predicate strings are not guaranteed
        # to be the same object, so an identity test would skip real matches.
        if edge_predicate != edge_data["predicate"]:
            continue
        if new_property in edge_data:
            mapping[(u, v, k)] = {old_property: edge_data[new_property]}
    graph.set_edge_attributes(graph, attributes=mapping)
def get_category_via_superclass(
    graph: BaseGraph, curie: str, load_ontology: bool = True
) -> Set[str]:
    """
    Derive categories for a CURIE by walking its ``subclass_of`` ancestors
    until one of them maps to the Biolink Model.

    When the graph yields no ancestors and ``load_ontology`` is set, the
    lookup is repeated once against the ontology graph provided by the
    CURIE lookup service.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        Graph to traverse
    curie: str
        Input CURIE
    load_ontology: bool
        Determines whether to load ontology, based on CURIE prefix, or to simply
        rely on ``subclass_of`` hierarchy from graph

    Returns
    -------
    Set[str]
        A set containing one (or more) category for the given CURIE;
        empty when ``curie`` is not a CURIE or nothing could be mapped

    """
    log.debug("curie: {}".format(curie))
    categories = []
    toolkit = get_toolkit()
    if not PrefixManager.is_curie(curie):
        return set(categories)

    ancestors = get_ancestors(graph, curie, relations=['subclass_of'])
    if len(ancestors) == 0 and load_ontology:
        # Nothing in this graph: retry against the ontology graph, with
        # load_ontology disabled so the recursion stops after one level.
        lookup_service = get_curie_lookup_service()
        ontology_graph = lookup_service.ontology_graph
        categories.extend(get_category_via_superclass(ontology_graph, curie, False))

    log.debug("Ancestors for CURIE {} via subClassOf: {}".format(curie, ancestors))
    visited = []
    for ancestor in ancestors:
        mapping = toolkit.get_by_mapping(ancestor)
        visited.append(ancestor)
        if not mapping:
            continue
        # there is direct mapping to BioLink Model
        log.debug("Ancestor {} mapped to {}".format(ancestor, mapping))
        # Names of every ancestor visited so far are reported as well.
        visited_labels = [
            graph.nodes()[x]['name'] for x in visited if 'name' in graph.nodes()[x]
        ]
        categories.extend(visited_labels)
        categories.extend(toolkit.ancestors(mapping))
        break
    return set(categories)
def validate_edges(self, graph: BaseGraph) -> list:
    """
    Validate all the edges in a graph and collect the reported errors.

    Every edge is handed to ``self.analyse_edge`` (with no edge key) while a
    progress bar is displayed. According to the original documentation the
    validation covers:
    - Edge properties
    - Edge property type
    - Edge property value type
    - Edge label

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph to validate

    Returns
    -------
    list
        A list of errors for a given graph

    """
    collected = []
    progress = click.progressbar(graph.edges(data=True), label='Validate edges in graph')
    with progress as bar:
        for subject, obj, edge_data in bar:
            collected.extend(self.analyse_edge(subject, obj, None, edge_data))
    return collected
def validate_nodes(self, graph: BaseGraph) -> list:
    """
    Validate all the nodes in a graph and collect the reported errors.

    Every node is handed to ``self.analyse_node`` while a progress bar is
    displayed. According to the original documentation the validation covers:
    - Node properties
    - Node property type
    - Node property value type
    - Node categories

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph to validate

    Returns
    -------
    list
        A list of errors for a given graph

    """
    collected = []
    progress = click.progressbar(graph.nodes(data=True), label='Validating nodes in graph')
    with progress as bar:
        for node_id, node_data in bar:
            collected.extend(self.analyse_node(node_id, node_data))
    return collected
def build_cliques(target_graph: BaseGraph) -> nx.MultiDiGraph:
    """
    Builds a clique graph from ``same_as`` edges in ``target_graph``.

    Equivalence is collected from two places: the ``same_as`` node property
    and edges whose predicate is ``SAME_AS``. Every equivalence is recorded
    in both directions so that equivalent nodes end up in the same strongly
    connected component.

    Parameters
    ----------
    target_graph: kgx.graph.base_graph.BaseGraph
        An instance of BaseGraph that contains nodes and edges

    Returns
    -------
    networkx.MultiDiGraph
        The clique graph with only ``same_as`` edges

    """
    clique_graph = nx.MultiDiGraph()

    # Pass 1: equivalences declared through the ``same_as`` node property.
    for n, data in target_graph.nodes(data=True):
        if "same_as" not in data:
            continue
        new_data = copy.deepcopy(data)
        del new_data["same_as"]
        clique_graph.add_node(n, **new_data)
        for s in data["same_as"]:
            forward = {"subject": n, "predicate": SAME_AS, "object": s}
            backward = {"subject": s, "predicate": SAME_AS, "object": n}
            if "provided_by" in data:
                forward["provided_by"] = data["provided_by"]
                backward["provided_by"] = data["provided_by"]
            clique_graph.add_edge(n, s, **forward)
            clique_graph.add_edge(s, n, **backward)

    # Pass 2: equivalences declared through explicit SAME_AS edges.
    for u, v, data in target_graph.edges(data=True):
        if "predicate" in data and data["predicate"] == SAME_AS:
            # load all biolink:same_as edges to clique_graph
            clique_graph.add_node(u, **target_graph.nodes()[u])
            clique_graph.add_node(v, **target_graph.nodes()[v])
            clique_graph.add_edge(u, v, **data)
            # The reverse edge runs v -> u, so its object must be ``u``.
            reverse = {"subject": v, "predicate": data["predicate"], "object": u}
            # ``relation`` is copied only when the source edge carries one.
            if "relation" in data:
                reverse["relation"] = data["relation"]
            clique_graph.add_edge(v, u, **reverse)
    return clique_graph
def build_cliques(target_graph: BaseGraph) -> nx.MultiDiGraph:
    """
    Builds a clique graph from ``same_as`` edges in ``target_graph``.

    Nodes that list equivalents in their ``same_as`` property, and edges
    whose predicate is ``SAME_AS``, are both loaded. Each equivalence is
    stored as a pair of opposite edges.

    Parameters
    ----------
    target_graph: kgx.graph.base_graph.BaseGraph
        An instance of BaseGraph that contains nodes and edges

    Returns
    -------
    networkx.MultiDiGraph
        The clique graph with only ``same_as`` edges

    """
    clique_graph = nx.MultiDiGraph()
    for n, data in target_graph.nodes(data=True):
        if 'same_as' in data:
            # The clique node keeps every property except ``same_as`` itself.
            new_data = copy.deepcopy(data)
            del new_data['same_as']
            clique_graph.add_node(n, **new_data)
            for s in data['same_as']:
                edge_data1 = {'subject': n, 'predicate': SAME_AS, 'object': s}
                if 'provided_by' in data:
                    edge_data1['provided_by'] = data['provided_by']
                clique_graph.add_edge(n, s, **edge_data1)
                edge_data2 = {'subject': s, 'predicate': SAME_AS, 'object': n}
                if 'provided_by' in data:
                    edge_data2['provided_by'] = data['provided_by']
                clique_graph.add_edge(s, n, **edge_data2)

    for u, v, data in target_graph.edges(data=True):
        if 'predicate' in data and data['predicate'] == SAME_AS:
            # load all biolink:same_as edges to clique_graph
            clique_graph.add_node(u, **target_graph.nodes()[u])
            clique_graph.add_node(v, **target_graph.nodes()[v])
            clique_graph.add_edge(u, v, **data)
            # Mirror edge v -> u: subject is ``v`` and object is ``u``.
            mirrored = {
                'subject': v,
                'predicate': data['predicate'],
                'object': u,
            }
            # Copy ``relation`` only if the source edge actually has one.
            if 'relation' in data:
                mirrored['relation'] = data['relation']
            clique_graph.add_edge(v, u, **mirrored)
    return clique_graph
def unfold_node_property(
    graph: BaseGraph, node_property: str, prefix: Optional[str] = None
) -> None:
    """
    Unfold ``node_property`` into edges: for every node carrying the
    property, an edge from the node to the property value is added and the
    property is then deleted from the node.

    The edge's key, ``predicate`` and ``relation`` are all the property
    name, prefixed with ``prefix:`` when a prefix is given.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph
    node_property: str
        The node property to unfold
    prefix: Optional[str]
        The prefix to use

    """
    started = current_time_in_millis()
    edge_predicate = f"{prefix}:{node_property}" if prefix else node_property

    carriers = [
        (n, data[node_property])
        for n, data in graph.nodes(data=True)
        if node_property in data
    ]

    # Processed last-to-first; all edges are added before any property is
    # deleted.
    for subject, obj in reversed(carriers):
        graph.add_edge(
            subject,
            obj,
            edge_predicate,
            subject=subject,
            object=obj,
            predicate=edge_predicate,
            relation=edge_predicate,
        )
    for n, _ in reversed(carriers):
        del graph.nodes()[n][node_property]

    finished = current_time_in_millis()
    log.info(f"Time taken: {finished - started} ms")
def summarize_graph_nodes(self, graph: BaseGraph) -> Dict:
    """
    Summarize the nodes in a graph.

    Records the total node count under ``TOTAL_NODES`` and then passes
    every node to ``self.analyse_node``.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph

    Returns
    -------
    Dict
        The node stats, as returned by ``self.get_node_stats()``

    """
    # TODO: count TOTAL_NODES somewhere else?
    node_total = len(graph.nodes())
    self.add_node_stat(TOTAL_NODES, node_total)
    for node_id, node_data in graph.nodes(data=True):
        self.analyse_node(node_id, node_data)
    stats = self.get_node_stats()
    return stats
def remove_singleton_nodes(graph: BaseGraph) -> None:
    """
    Remove singleton nodes (nodes that have a degree of 0) from the graph.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph

    """
    started = current_time_in_millis()
    isolated = [node for node, degree in graph.degree() if degree == 0]
    # Nodes are removed last-to-first, after the degree scan has completed.
    for node in reversed(isolated):
        log.debug(f"Removing singleton node {node}")
        graph.remove_node(node)
    finished = current_time_in_millis()
    log.info(f"Time taken: {finished - started} ms")
def generate_edge_identifiers(graph: BaseGraph):
    """
    Assign a generated identifier to every edge that has no ``id`` field.

    The edge data dictionaries yielded by ``graph.edges`` are modified in
    place; edges that already have an ``id`` are left untouched.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph whose edges are updated

    """
    for _subject, _object, edge_data in graph.edges(data=True):
        if "id" in edge_data:
            continue
        edge_data["id"] = generate_uuid()
def summarize_graph_edges(self, graph: BaseGraph) -> Dict:
    """
    Summarize the edges in a graph.

    Stores the total edge count under ``TOTAL_EDGES``, passes every edge
    (with its key) to ``self.analyse_edge`` and finally flags the edges as
    processed.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph

    Returns
    -------
    Dict
        The edge stats, as returned by ``self.get_edge_stats()``

    """
    # TODO: count TOTAL_EDGES somewhere else?
    edge_total = len(graph.edges())
    self.edge_stats[TOTAL_EDGES] = edge_total
    for subject, obj, edge_key, edge_data in graph.edges(keys=True, data=True):
        self.analyse_edge(subject, obj, edge_key, edge_data)
    self.edges_processed = True
    stats = self.get_edge_stats()
    return stats
def summarize_graph_nodes(self, graph: BaseGraph) -> Dict:
    """
    Summarize the nodes in a graph by passing each one to
    ``self.analyse_node``.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph

    Returns
    -------
    Dict
        The node stats, as returned by ``self.get_node_stats()``

    """
    for node_id, node_data in graph.nodes(data=True):
        self.analyse_node(node_id, node_data)
    stats = self.get_node_stats()
    return stats
def summarize_graph_edges(self, graph: BaseGraph) -> Dict:
    """
    Summarize the edges in a graph by passing each one (with its key) to
    ``self.analyse_edge``.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph

    Returns
    -------
    Dict
        The edge stats, as returned by ``self.get_edge_stats()``

    """
    for subject, obj, edge_key, edge_data in graph.edges(keys=True, data=True):
        self.analyse_edge(subject, obj, edge_key, edge_data)
    stats = self.get_edge_stats()
    return stats
def validate_nodes(self, graph: BaseGraph):
    """
    Validate all the nodes in a graph.

    Every node is handed to ``self.analyse_node`` while a progress bar is
    displayed; nothing is returned. According to the original documentation
    the validation covers:
    - Node properties
    - Node property type
    - Node property value type
    - Node categories

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph to validate

    """
    progress = click.progressbar(graph.nodes(data=True), label="Validating nodes in graph")
    with progress as bar:
        for node_id, node_data in bar:
            self.analyse_node(node_id, node_data)
def validate_edges(self, graph: BaseGraph):
    """
    Validate all the edges in a graph.

    Every edge is handed to ``self.analyse_edge`` (with no edge key) while a
    progress bar is displayed; nothing is returned. According to the
    original documentation the validation covers:
    - Edge properties
    - Edge property type
    - Edge property value type
    - Edge predicate

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph to validate

    """
    progress = click.progressbar(graph.edges(data=True), label="Validate edges in graph")
    with progress as bar:
        for subject, obj, edge_data in bar:
            self.analyse_edge(subject, obj, None, edge_data)
def elect_leader(
    target_graph: BaseGraph,
    clique_graph: nx.MultiDiGraph,
    leader_annotation: str,
    prefix_prioritization_map: Optional[Dict[str, List[str]]],
    category_mapping: Optional[Dict[str, str]],
    strict: bool = True,
) -> BaseGraph:
    """
    Elect leader for each clique in a graph.

    For every strongly connected component of ``clique_graph`` the node
    categories are refreshed, invalid nodes are dropped from the clique
    graph, and a leader is chosen by trying, in order: the leader
    annotation, the prefix prioritization for the clique category, and
    finally a sort-based fallback. The elected leaders are then annotated
    on both graphs.

    Parameters
    ----------
    target_graph: kgx.graph.base_graph.BaseGraph
        The original graph
    clique_graph: networkx.Graph
        The clique graph
    leader_annotation: str
        The field on a node that signifies that the node is the leader of a clique
    prefix_prioritization_map: Optional[Dict[str, List[str]]]
        A map that gives a prefix priority for one or more categories
    category_mapping: Optional[Dict[str, str]]
        Mapping for non-Biolink Model categories to Biolink Model categories
    strict: bool
        Whether or not to merge nodes in a clique that have conflicting node categories

    Returns
    -------
    kgx.graph.base_graph.BaseGraph
        The updated target graph

    """
    cliques = list(nx.strongly_connected_components(clique_graph))
    log.info(f"Total cliques in clique graph: {len(cliques)}")
    count = 0
    update_dict = {}
    for clique in cliques:
        clique_node_categories = [
            clique_graph.nodes()[x].get("category") for x in clique
        ]
        log.info(f"Processing clique: {clique} with {clique_node_categories}")
        update_node_categories(
            target_graph, clique_graph, clique, category_mapping, strict
        )
        clique_category, clique_category_ancestors = get_clique_category(
            clique_graph, clique
        )
        log.debug(f"Clique category: {clique_category}")

        invalid_nodes = set()
        for n in clique:
            data = clique_graph.nodes()[n]
            # The two checks are mutually exclusive: a node must be removed
            # from the clique graph at most once (networkx raises when asked
            # to remove a node that is already gone), and an excluded node
            # may have an empty category list that cannot be indexed.
            if "_excluded_from_clique" in data and data["_excluded_from_clique"]:
                log.info(
                    f"Removing invalid node {n} from clique graph; node marked to be excluded"
                )
                clique_graph.remove_node(n)
                invalid_nodes.add(n)
            elif data["category"][0] not in clique_category_ancestors:
                log.info(
                    f"Removing invalid node {n} from the clique graph; node category {data['category'][0]} not in CCA: {clique_category_ancestors}"
                )
                clique_graph.remove_node(n)
                invalid_nodes.add(n)

        filtered_clique = [x for x in clique if x not in invalid_nodes]
        if not filtered_clique or not clique_category:
            # Nothing left to elect from, or no category to elect for.
            continue

        # First check for LEADER_ANNOTATION property
        leader, election_strategy = get_leader_by_annotation(
            target_graph, clique_graph, filtered_clique, leader_annotation
        )
        if not leader:
            # Leader is None; use prefix prioritization strategy
            log.debug(
                "Could not elect clique leader by looking for LEADER_ANNOTATION property; "
                "Using prefix prioritization instead"
            )
            if (
                prefix_prioritization_map
                and clique_category in prefix_prioritization_map
            ):
                leader, election_strategy = get_leader_by_prefix_priority(
                    target_graph,
                    clique_graph,
                    filtered_clique,
                    prefix_prioritization_map[clique_category],
                )
            else:
                log.debug(
                    f"No prefix order found for category '{clique_category}' in PREFIX_PRIORITIZATION_MAP"
                )

        if not leader:
            # Leader is None; fall back to alphabetical sort on prefixes
            log.debug(
                "Could not elect clique leader by PREFIX_PRIORITIZATION; Using alphabetical sort on prefixes"
            )
            leader, election_strategy = get_leader_by_sort(
                target_graph, clique_graph, filtered_clique
            )

        log.debug(
            f"Elected {leader} as leader via {election_strategy} for clique {filtered_clique}"
        )
        # NOTE(review): the annotation is written under the module-level
        # LEADER_ANNOTATION key, not under the ``leader_annotation``
        # argument — confirm the two are always the same value.
        update_dict[leader] = {
            LEADER_ANNOTATION: True,
            "election_strategy": election_strategy,
        }
        count += 1

    nx.set_node_attributes(clique_graph, update_dict)
    target_graph.set_node_attributes(target_graph, update_dict)
    log.info(f"Total merged cliques: {count}")
    return target_graph
def consolidate_edges(
    target_graph: BaseGraph, clique_graph: nx.MultiDiGraph, leader_annotation: str
) -> BaseGraph:
    """
    Move all edges from nodes in a clique to the clique leader.

    For every clique that has a leader, the non-``SAME_AS`` edges of each
    other member are re-attached to the leader, the ``SAME_AS`` edges are
    removed, the collected equivalent identifiers are stored on the leader
    as ``same_as``, and the equivalent nodes are removed from the graph.

    The original subject and object of a moved edge are preserved via
    ``ORIGINAL_SUBJECT_PROPERTY`` and ``ORIGINAL_OBJECT_PROPERTY``.

    Parameters
    ----------
    target_graph: kgx.graph.base_graph.BaseGraph
        The original graph
    clique_graph: networkx.MultiDiGraph
        The clique graph
    leader_annotation: str
        The field on a node that signifies that the node is the leader of a clique

    Returns
    -------
    kgx.graph.base_graph.BaseGraph
        The target graph where all edges from nodes in a clique are moved to clique leader

    """
    cliques = list(nx.strongly_connected_components(clique_graph))
    log.info(f"Consolidating edges in {len(cliques)} cliques")
    for clique in cliques:
        log.debug(f"Processing clique: {clique}")
        # A leader is any clique member with a truthy leader annotation.
        leaders: List = [
            x
            for x in clique
            if leader_annotation in clique_graph.nodes()[x]
            and clique_graph.nodes()[x][leader_annotation]
        ]
        if len(leaders) == 0:
            log.debug("No leader elected for clique {}; skipping".format(clique))
            continue
        # If several members are annotated, the first one found is used.
        leader: str = leaders[0]
        # Copy the leader annotation and election strategy from the clique
        # graph onto the leader node in the target graph.
        target_graph.set_node_attributes(
            target_graph,
            {
                leader: {
                    leader_annotation: clique_graph.nodes()[leader].get(
                        leader_annotation
                    ),
                    "election_strategy": clique_graph.nodes()[leader].get(
                        "election_strategy"
                    ),
                }
            },
        )
        # Start from the leader's neighbours in the clique graph; identifiers
        # found on SAME_AS edges of the other members are added below.
        leader_equivalent_identifiers = set([x for x in clique_graph.neighbors(leader)])
        for node in clique:
            if node == leader:
                continue
            log.debug(f"Looking for in_edges for {node}")
            in_edges = target_graph.in_edges(node, keys=False, data=True)
            # Split incoming edges: SAME_AS edges are handled separately below.
            filtered_in_edges = [x for x in in_edges if x[2]["predicate"] != SAME_AS]
            equiv_in_edges = [x for x in in_edges if x[2]["predicate"] == SAME_AS]
            log.debug(f"Moving {len(in_edges)} in-edges from {node} to {leader}")
            for u, v, edge_data in filtered_in_edges:
                # The existing edge is removed under the key derived from its
                # current endpoints, then re-added pointing at the leader.
                key = generate_edge_key(u, edge_data["predicate"], v)
                target_graph.remove_edge(u, v, edge_key=key)
                edge_data[ORIGINAL_SUBJECT_PROPERTY] = edge_data["subject"]
                edge_data[ORIGINAL_OBJECT_PROPERTY] = edge_data["object"]
                edge_data["object"] = leader
                key = generate_edge_key(u, edge_data["predicate"], leader)
                # A SUBCLASS_OF edge that would become a self-loop is dropped
                # (it has already been removed above and is not re-added).
                if (
                    edge_data["subject"] == edge_data["object"]
                    and edge_data["predicate"] == SUBCLASS_OF
                ):
                    continue
                target_graph.add_edge(
                    edge_data["subject"], edge_data["object"], key, **edge_data
                )

            log.debug(f"Looking for out_edges for {node}")
            out_edges = target_graph.out_edges(node, keys=False, data=True)
            # Same split for outgoing edges.
            filtered_out_edges = [x for x in out_edges if x[2]["predicate"] != SAME_AS]
            equiv_out_edges = [x for x in out_edges if x[2]["predicate"] == SAME_AS]
            log.debug(f"Moving {len(out_edges)} out-edges from {node} to {leader}")
            for u, v, edge_data in filtered_out_edges:
                # Mirror of the in-edge handling, with the subject redirected.
                key = generate_edge_key(u, edge_data["predicate"], v)
                target_graph.remove_edge(u, v, edge_key=key)
                edge_data[ORIGINAL_SUBJECT_PROPERTY] = edge_data["subject"]
                edge_data[ORIGINAL_OBJECT_PROPERTY] = edge_data["object"]
                edge_data["subject"] = leader
                key = generate_edge_key(leader, edge_data["predicate"], v)
                if (
                    edge_data["subject"] == edge_data["object"]
                    and edge_data["predicate"] == SUBCLASS_OF
                ):
                    continue
                target_graph.add_edge(
                    edge_data["subject"], edge_data["object"], key, **edge_data
                )

            # NOTE(review): this message precedes the in-edge loop but
            # reports equiv_out_edges; the same message is logged again
            # before the out-edge loop.
            log.debug(f"equiv out edges: {equiv_out_edges}")
            equivalent_identifiers = set()
            # Every non-leader endpoint of a SAME_AS edge is an equivalent
            # identifier; the SAME_AS edge itself is removed.
            for u, v, edge_data in equiv_in_edges:
                if u != leader:
                    equivalent_identifiers.add(u)
                if v != leader:
                    equivalent_identifiers.add(v)
                target_graph.remove_edge(
                    u, v, edge_key=generate_edge_key(u, SAME_AS, v)
                )

            log.debug(f"equiv out edges: {equiv_out_edges}")
            for u, v, edge_data in equiv_out_edges:
                if u != leader:
                    log.debug(f"{u} is an equivalent identifier of leader {leader}")
                    equivalent_identifiers.add(u)
                if v != leader:
                    log.debug(f"{v} is an equivalent identifier of leader {leader}")
                    equivalent_identifiers.add(v)
                target_graph.remove_edge(
                    u, v, edge_key=generate_edge_key(u, SAME_AS, v)
                )

            leader_equivalent_identifiers.update(equivalent_identifiers)

        # Record the equivalents on the leader, then drop the equivalent
        # nodes themselves from the target graph.
        log.debug(
            f"setting same_as property to leader node with {leader_equivalent_identifiers}"
        )
        target_graph.set_node_attributes(
            target_graph, {leader: {"same_as": list(leader_equivalent_identifiers)}}
        )
        log.debug(
            f"removing equivalent nodes of leader: {leader_equivalent_identifiers}"
        )
        for n in leader_equivalent_identifiers:
            target_graph.remove_node(n)
    return target_graph
def update_node_categories(
    target_graph: BaseGraph,
    clique_graph: nx.MultiDiGraph,
    clique: List,
    category_mapping: Optional[Dict[str, str]],
    strict: bool = True,
) -> List:
    """
    Refresh the category information of every node in a clique.

    Each node's categories are split by ``check_all_categories`` into valid
    Biolink, invalid Biolink and otherwise invalid ones. The valid ones are
    expanded with their Biolink ancestors and stored as the node's
    ``category`` in the clique graph. Invalid categories are recorded on the
    node in both graphs; with ``strict`` set, a node with invalid Biolink
    categories is additionally flagged ``_excluded_from_clique``.

    Parameters
    ----------
    target_graph: kgx.graph.base_graph.BaseGraph
        The original graph
    clique_graph: networkx.Graph
        The clique graph
    clique: List
        A list of nodes from a clique
    category_mapping: Optional[Dict[str, str]]
        Mapping for non-Biolink Model categories to Biolink Model categories
        (not consulted by this function)
    strict: bool
        Whether or not to merge nodes in a clique that have conflicting node categories

    Returns
    -------
    List
        The clique, unchanged

    """
    clique_graph_updates: Dict = {}
    target_graph_updates: Dict = {}

    for node in clique:
        # Use the node's own categories, or infer them from its equivalents.
        data = clique_graph.nodes()[node]
        categories = (
            data["category"]
            if "category" in data
            else get_category_from_equivalence(target_graph, clique_graph, node, data)
        )

        # differentiate between valid and invalid categories
        checked = check_all_categories(categories)
        valid_biolink_categories, invalid_biolink_categories, invalid_categories = checked
        log.debug(
            f"valid biolink categories: {valid_biolink_categories} invalid biolink categories: {invalid_biolink_categories} invalid_categories: {invalid_categories}"
        )

        # An ancestor list is appended only when it is longer than what has
        # been accumulated so far.
        extended_categories: List = []
        for valid_category in valid_biolink_categories:
            ancestors = get_biolink_ancestors(valid_category)
            if len(ancestors) > len(extended_categories):
                extended_categories.extend(ancestors)
        log.debug(f"Extended categories: {extended_categories}")

        # Diagnostic properties shared by both graphs.
        flags: Dict = {}
        if invalid_biolink_categories:
            if strict:
                flags["_excluded_from_clique"] = True
            flags["invalid_biolink_category"] = invalid_biolink_categories
        if invalid_categories:
            flags["_invalid_category"] = invalid_categories

        # Only the clique graph receives the extended category list.
        clique_graph_updates[node] = {"category": list(extended_categories), **flags}
        target_graph_updates[node] = dict(flags)

    nx.set_node_attributes(clique_graph, clique_graph_updates)
    target_graph.set_node_attributes(target_graph, target_graph_updates)
    return clique
def remap_node_identifier(
    graph: BaseGraph, category: str, alternative_property: str, prefix=None
) -> BaseGraph:
    """
    Remap a node's 'id' attribute with value from a node's ``alternative_property`` attribute.

    After the nodes are relabelled, every edge whose stored ``subject`` or
    ``object`` no longer matches its endpoints is updated, together with
    its ``edge_key`` attribute.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph
    category: string
        category referring to nodes whose 'id' needs to be remapped
    alternative_property: string
        property name from which the new value is pulled from
    prefix: string
        signifies that the value for ``alternative_property`` is a list
        and the ``prefix`` indicates which value to pick from the list

    Returns
    -------
    kgx.graph.base_graph.BaseGraph
        The modified graph

    """
    mapping: Dict = {}
    for nid, node_data in graph.nodes(data=True):
        if "category" in node_data and category not in node_data["category"]:
            continue
        if alternative_property not in node_data:
            continue

        alternative_values = node_data[alternative_property]
        if isinstance(alternative_values, (list, set, tuple)):
            if prefix:
                for value in alternative_values:
                    if prefix in value:
                        # take the first occurring value that contains the given prefix
                        mapping[nid] = {"id": value}
                        break
            else:
                # no prefix defined; pick the 1st one from list
                mapping[nid] = {"id": next(iter(alternative_values))}
        elif isinstance(alternative_values, str):
            if prefix:
                if alternative_values.startswith(prefix):
                    mapping[nid] = {"id": alternative_values}
            else:
                # no prefix defined
                mapping[nid] = {"id": alternative_values}
        else:
            log.error(
                f"Cannot use {alternative_values} from alternative_property {alternative_property}"
            )

    graph.set_node_attributes(graph, attributes=mapping)
    graph.relabel_nodes(graph, {k: list(v.values())[0] for k, v in mapping.items()})

    # Bring the 'subject' / 'object' stored on each edge back in line with
    # the (possibly relabelled) endpoints.
    update_edge_keys = {}
    updated_subject_values = {}
    updated_object_values = {}
    for u, v, k, edge_data in graph.edges(data=True, keys=True):
        edge_id = (u, v, k)
        # Compare by value: equal identifier strings are not guaranteed to
        # be the same object, so an identity test would flag unchanged edges.
        if u != edge_data["subject"]:
            updated_subject_values[edge_id] = {"subject": u}
            update_edge_keys[edge_id] = {
                "edge_key": generate_edge_key(u, edge_data["predicate"], v)
            }
        if v != edge_data["object"]:
            updated_object_values[edge_id] = {"object": v}
            update_edge_keys[edge_id] = {
                "edge_key": generate_edge_key(u, edge_data["predicate"], v)
            }

    graph.set_edge_attributes(graph, attributes=updated_subject_values)
    graph.set_edge_attributes(graph, attributes=updated_object_values)
    graph.set_edge_attributes(graph, attributes=update_edge_keys)
    return graph