def fold_predicate(graph: BaseGraph, predicate: str, remove_prefix: bool = False) -> None:
    """
    Fold predicate as a node property, where every edge with ``predicate``
    is folded into a property on the edge's subject node.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph
    predicate: str
        The predicate to fold
    remove_prefix: bool
        Whether or not to remove prefix from the predicate (``False``, by default)

    """
    node_cache = []
    edge_cache = []
    start = current_time_in_millis()
    # Optionally strip the CURIE prefix (text before the first ':') to get the property name
    p = predicate.split(":", 1)[1] if remove_prefix else predicate
    for u, v, k, data in graph.edges(keys=True, data=True):
        # Use .get() so edges lacking a 'predicate' key are skipped instead of raising KeyError
        if data.get("predicate") == predicate:
            node_cache.append((u, p, v))
            edge_cache.append((u, v, k))
    # Apply mutations only after iteration, to avoid modifying the graph while iterating it
    while node_cache:
        n = node_cache.pop()
        graph.add_node_attribute(*n)
    while edge_cache:
        e = edge_cache.pop()
        graph.remove_edge(*e)
    end = current_time_in_millis()
    log.info(f"Time taken: {end - start} ms")
def parse(
    self,
    filename: str,
    format: str = "owl",
    compression: Optional[str] = None,
    **kwargs: Any,
) -> Generator:
    """
    This method reads from an OWL and yields records.

    Parameters
    ----------
    filename: str
        The filename to parse
    format: str
        The format (``owl``)
    compression: Optional[str]
        The compression type (``gz``)
    kwargs: Any
        Any additional arguments

    Returns
    -------
    Generator
        A generator for node and edge records read from the file

    """
    rdfgraph = rdflib.Graph()
    if compression:
        log.warning(f"compression mode '{compression}' not supported by OwlSource")
    if format is None:
        format = rdflib.util.guess_format(filename)
    if format == "owl":
        # rdflib has no 'owl' parser; OWL ontologies are parsed as RDF/XML
        format = "xml"

    log.info("Parsing {} with '{}' format".format(filename, format))
    rdfgraph.parse(filename, format=format)
    log.info("{} parsed with {} triples".format(filename, len(rdfgraph)))

    self.set_provenance_map(kwargs)

    self.start = current_time_in_millis()
    # Bug fix: previously logged the literal placeholder '(unknown)' instead of the filename
    log.info(f"Done parsing {filename}")

    # Load all owl:imports closures before processing the main graph.
    # The triple pattern already filters on OWL.imports, so no re-check of p is needed.
    for s, p, o in rdfgraph.triples((None, OWL.imports, None)):
        if o not in self.imported:
            input_format = rdflib.util.guess_format(o)
            imported_rdfgraph = rdflib.Graph()
            log.info(f"Parsing OWL import: {o}")
            self.imported.add(o)
            imported_rdfgraph.parse(o, format=input_format)
            self.load_graph(imported_rdfgraph)
        else:
            log.warning(f"Trying to import {o} but its already done")
    yield from self.load_graph(rdfgraph)
def parse(self, filename: str = None, input_format: str = None,
          provided_by: str = None, predicates: Set[URIRef] = None) -> None:
    """
    Parse an n-triple file into networkx.MultiDiGraph

    The file must be a *.nt formatted file.

    Parameters
    ----------
    filename : str
        File to read from.
    input_format : str
        The input file format. Must be one of ``['nt', 'nt.gz']``
    provided_by : str
        Define the source providing the input file.
    predicates : Set[URIRef]
        A set of predicates (not used directly in this method)

    """
    # Bug fix: was 'p = p = NTriplesParser(self)' (redundant double assignment)
    parser = NTriplesParser(self)
    self.start = current_time_in_millis()
    if input_format == INPUT_FORMATS[0]:
        # Context managers ensure the file handle is closed even if parsing fails
        with open(filename, 'rb') as f:
            parser.parse(f)
    elif input_format == INPUT_FORMATS[1]:
        with gzip.open(filename, 'rb') as f:
            parser.parse(f)
    else:
        raise NameError(
            f"input_format: {input_format} not supported. Must be one of {INPUT_FORMATS}"
        )
    print("Done parsing NT file")
    self.dereify(self.assocs)
def unfold_node_property(graph: BaseGraph, node_property: str, prefix: Optional[str] = None) -> None:
    """
    Unfold node property as a predicate, where every node carrying
    ``node_property`` gets a corresponding edge and the property is
    then removed from the node.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph
    node_property: str
        The node property to unfold
    prefix: Optional[str]
        The prefix to use

    """
    started_at = current_time_in_millis()
    predicate = f"{prefix}:{node_property}" if prefix else node_property
    pending_edges = []
    pending_removals = []
    # First pass: collect the edges to add and the properties to delete,
    # deferring graph mutation until after iteration.
    for node_id, node_data in graph.nodes(data=True):
        if node_property in node_data:
            pending_edges.append((node_id, node_data[node_property], predicate))
            pending_removals.append((node_id, node_property))
    while pending_edges:
        subject_id, object_id, pred = pending_edges.pop()
        graph.add_edge(
            subject_id,
            object_id,
            pred,
            **{
                'subject': subject_id,
                'object': object_id,
                'predicate': pred,
                'relation': pred,
            },
        )
    while pending_removals:
        node_id, prop = pending_removals.pop()
        del graph.nodes()[node_id][prop]
    finished_at = current_time_in_millis()
    log.info(f"Time taken: {finished_at - started_at} ms")
def remove_singleton_nodes(graph: BaseGraph) -> None:
    """
    Remove singleton nodes (nodes that have a degree of 0) from the graph.

    Parameters
    ----------
    graph: kgx.graph.base_graph.BaseGraph
        The graph

    """
    started_at = current_time_in_millis()
    # Collect first, then remove, to avoid mutating the graph while iterating its degree view
    isolated = [node for node, degree in graph.degree() if degree == 0]
    while isolated:
        n = isolated.pop()
        log.debug(f"Removing singleton node {n}")
        graph.remove_node(n)
    finished_at = current_time_in_millis()
    log.info(f"Time taken: {finished_at - started_at} ms")
def clique_merge(
    target_graph: BaseGraph,
    leader_annotation: str = None,
    prefix_prioritization_map: Optional[Dict[str, List[str]]] = None,
    category_mapping: Optional[Dict[str, str]] = None,
    strict: bool = True,
) -> Tuple[BaseGraph, nx.MultiDiGraph]:
    """
    Merge cliques in a graph: build cliques, elect a leader node for each,
    and consolidate edges onto the leaders.

    Parameters
    ----------
    target_graph: kgx.graph.base_graph.BaseGraph
        The original graph
    leader_annotation: str
        The field on a node that signifies that the node is the leader of a clique
    prefix_prioritization_map: Optional[Dict[str, List[str]]]
        A map that gives a prefix priority for one or more categories
    category_mapping: Optional[Dict[str, str]]
        Mapping for non-Biolink Model categories to Biolink Model categories
    strict: bool
        Whether or not to merge nodes in a clique that have conflicting node categories

    Returns
    -------
    Tuple[kgx.graph.base_graph.BaseGraph, networkx.MultiDiGraph]
        A tuple containing the updated target graph, and the clique graph

    """
    # Layer any caller-supplied priorities on top of the defaults
    merged_ppm = get_prefix_prioritization_map()
    if prefix_prioritization_map:
        merged_ppm.update(prefix_prioritization_map)
    prefix_prioritization_map = merged_ppm
    if not leader_annotation:
        leader_annotation = LEADER_ANNOTATION

    t0 = current_time_in_millis()
    clique_graph = build_cliques(target_graph)
    t1 = current_time_in_millis()
    log.info(f"Total time taken to build cliques: {t1 - t0} ms")

    t0 = current_time_in_millis()
    elect_leader(
        target_graph,
        clique_graph,
        leader_annotation,
        prefix_prioritization_map,
        category_mapping,
        strict,
    )
    t1 = current_time_in_millis()
    log.info(f"Total time taken to elect leaders for all cliques: {t1 - t0} ms")

    t0 = current_time_in_millis()
    graph = consolidate_edges(target_graph, clique_graph, leader_annotation)
    t1 = current_time_in_millis()
    log.info(f"Total time taken to consolidate edges in target graph: {t1 - t0} ms")
    return graph, clique_graph
def triple(self, s: URIRef, p: URIRef, o: URIRef) -> None:
    """
    Hook for rdflib.plugins.parsers.ntriples.NTriplesParser

    This method will be called by NTriplesParser when reading from a file.

    Parameters
    ----------
    s: rdflib.term.URIRef
        The subject of a triple.
    p: rdflib.term.URIRef
        The predicate of a triple.
    o: rdflib.term.URIRef
        The object of a triple.

    """
    cached = self.cache.get(p)
    if cached is not None:
        # already processed this predicate before; pull from cache
        element = cached['element']
        predicate = cached['predicate']
        property_name = cached['property_name']
    else:
        # haven't seen this predicate before; map to element and memoize
        predicate = self.prefix_manager.contract(str(p))
        property_name = self.prefix_manager.get_reference(predicate)
        element = self.get_biolink_element(predicate, property_name)
        self.cache[p] = {
            'element': element,
            'predicate': predicate,
            'property_name': property_name,
        }

    if element:
        if element.is_a == 'association slot' or predicate in self.edge_properties:
            logging.debug(
                f"property {property_name} is an edge property but belongs to a reified node"
            )
            node_id = self.add_node(s)
            self.add_node_attribute(node_id, p, o)
            self.assocs.add(node_id)
        elif element.is_a == 'node property' or predicate in self.node_properties:
            logging.debug(f"property {property_name} is a node property")
            node_id = self.add_node(s)
            self.add_node_attribute(node_id, p, o)
        else:
            logging.debug(f"property {property_name} is a related_to property")
            self.add_edge(s, o, p)
    else:
        logging.debug(f"property {property_name} is not a biolink model element")
        if predicate in self.node_properties:
            logging.debug(f"treating {predicate} as node property")
            node_id = self.add_node(s)
            self.add_node_attribute(node_id, p, o)
        else:
            # treating as an edge
            logging.debug(f"treating {predicate} as edge property")
            self.add_edge(s, o, p)

    self.count += 1
    if self.count % 1000 == 0:
        logging.info(
            f"Parsed {self.count} triples; time taken: {current_time_in_millis() - self.start} ms"
        )
        self.start = current_time_in_millis()