def read_edge(self, edge: Dict) -> Optional[Tuple]:
    """
    Load an edge into an instance of BaseGraph.

    The edge is passed through ``validate_edge`` and ``sanitize_import``,
    assigned a generated ``id`` when it has none, and given the graph-level
    ``provided_by`` when it does not carry its own.

    Parameters
    ----------
    edge: Dict
        An edge

    Returns
    -------
    Optional[Tuple]
        A tuple that contains subject id, object id, edge key, and edge data,
        or ``None`` when the edge is rejected by ``check_edge_filter``

    """
    edge = validate_edge(edge)
    # Work on a copy so the validated record is not altered by sanitization.
    edge_data = sanitize_import(edge.copy())
    if 'id' not in edge_data:
        edge_data['id'] = generate_uuid()
    s = edge_data['subject']
    o = edge_data['object']
    if 'provided_by' in self.graph_metadata and 'provided_by' not in edge_data:
        edge_data['provided_by'] = self.graph_metadata['provided_by']
    key = generate_edge_key(s, edge_data['predicate'], o)
    # Edge property names are recorded for every edge read, before the
    # filter is applied. They must never be added to node_properties.
    self.edge_properties.update(edge_data.keys())
    if not self.check_edge_filter(edge_data):
        return None
    return s, o, key, edge_data
def load_edge(self, edge_record: List) -> Tuple:
    """
    Load an edge into an instance of BaseGraph

    Only the first three positions of the record are read: the subject
    node, the edge, and the object node, in that order. The edge dict taken
    from the record is updated in place with ``provided_by`` and ``id``
    when those keys are absent.

    Parameters
    ----------
    edge_record: List
        A 4-tuple edge record

    Returns
    -------
    Tuple
        A tuple with subject ID, object ID, edge key, and edge data

    """
    source, raw_edge, target = edge_record[0], edge_record[1], edge_record[2]

    # Fill in defaults on the incoming edge before the key is generated.
    if 'provided_by' in self.graph_metadata and 'provided_by' not in raw_edge:
        raw_edge['provided_by'] = self.graph_metadata['provided_by']
    if 'id' not in raw_edge:
        raw_edge['id'] = generate_uuid()

    source_id = source['id']
    edge_predicate = raw_edge['predicate']
    target_id = target['id']
    key = generate_edge_key(source_id, edge_predicate, target_id)

    # Sanitize a copy of the validated edge and record its property names.
    cleaned = sanitize_import(validate_edge(raw_edge).copy())
    self.edge_properties.update(cleaned.keys())
    return source_id, target_id, key, cleaned
def read_edges(self) -> Generator:
    """
    Read edges as records from the graph.

    Each edge's data is passed through ``validate_edge`` and
    ``sanitize_import`` (on a copy), given the graph-level ``provided_by``
    when it has none, and yielded only if it passes ``check_edge_filter``.

    Returns
    -------
    Generator
        A generator for edges, each a tuple of subject, object, key,
        and edge data

    """
    for u, v, k, data in self.graph.edges(keys=True, data=True):
        edge_data = validate_edge(data)
        edge_data = sanitize_import(edge_data.copy())
        if 'provided_by' in self.graph_metadata and 'provided_by' not in edge_data:
            edge_data['provided_by'] = self.graph_metadata['provided_by']
        if self.check_edge_filter(edge_data):
            # These are edge keys, so they belong in edge_properties
            # (previously they were added to node_properties by mistake).
            self.edge_properties.update(edge_data.keys())
            yield u, v, k, edge_data
def triple(self, s: URIRef, p: URIRef, o: URIRef) -> None:
    """
    Parse a triple.

    The triple is routed to a node attribute or to an edge depending on
    its subject, predicate and object. Once the edge cache reaches
    ``CACHE_SIZE`` entries, reified nodes are dereified and the cached
    edges are flushed.

    Note that, despite the ``None`` annotation, this is a generator: it
    yields ``(subject, object, key, data)`` for each flushed edge that
    passes the edge filter, and always finishes by yielding ``None``.

    Parameters
    ----------
    s: URIRef
        Subject
    p: URIRef
        Predicate
    o: URIRef
        Object

    """
    self.count += 1
    element_uri, canonical_uri, predicate, property_name = self.process_predicate(p)
    # First truthy of: element URI, predicate, property name.
    prop_uri = element_uri or predicate or property_name
    s_curie = self.prefix_manager.contract(s)

    if s_curie.startswith(('biolink', 'OBAN')):
        log.warning(f"Skipping {s} {p} {o}")
    elif s_curie in self.reified_nodes:
        # subject is already known to be a reified node
        self.add_node_attribute(s, key=prop_uri, value=o)
    elif (
        p in self.reification_predicates
        or property_name in {'subject', 'predicate', 'object', 'relation'}
        or o in self.reification_types
    ):
        # subject is a reified node seen for the first time
        self.reified_nodes.add(s_curie)
        self.add_node_attribute(s, key=prop_uri, value=o)
    elif (
        (element_uri and element_uri in self.node_property_predicates)
        or p in self.node_property_predicates
        or predicate in self.node_property_predicates
        or property_name in self.node_property_predicates
        or isinstance(o, rdflib.term.Literal)
    ):
        # treating predicate as a node property
        self.add_node_attribute(s, key=prop_uri, value=o)
    else:
        # treating predicate as an edge
        self.add_edge(s, o, p)

    if len(self.edge_cache) >= self.CACHE_SIZE:
        # Try to dereify every pending reified node; the ones that fail
        # are parked and put back so they can be retried later.
        while self.reified_nodes:
            node_id = self.reified_nodes.pop()
            node_data = self.node_cache.pop(node_id)
            try:
                self.dereify(node_id, node_data)
            except ValueError as e:
                log.info(e)
                self._incomplete_nodes[node_id] = node_data

        for node_id, node_data in self._incomplete_nodes.items():
            self.node_cache[node_id] = node_data
            self.reified_nodes.add(node_id)
        self._incomplete_nodes.clear()

        # Flush the cached edges.
        for k, cached in self.edge_cache.items():
            if 'id' not in cached and 'association_id' not in cached:
                cached['id'] = generate_edge_key(
                    cached['subject'],
                    cached['predicate'],
                    cached['object'],
                )
            data = sanitize_import(validate_edge(cached))
            if 'provided_by' in self.graph_metadata and 'provided_by' not in data:
                data['provided_by'] = self.graph_metadata['provided_by']
            if self.check_edge_filter(data):
                self.edge_properties.update(data.keys())
                yield k[0], k[1], k[2], data
        self.edge_cache.clear()
    yield None
def parse(
    self,
    filename: str,
    format: str = 'nt',
    compression: Optional[str] = None,
    provided_by: Optional[str] = None,
    **kwargs: Any,
) -> Generator:
    """
    This method reads from RDF N-Triples and yields records.

    .. note::
        To ensure proper parsing of N-Triples and a relatively low memory footprint,
        it is recommended that the N-Triples be sorted based on the subject IRIs.

        ```sort -k 1,2 -t ' ' data.nt > data_sorted.nt```

    The input file is opened in binary mode and is closed as soon as the
    parser has consumed it (or when this generator is closed early).

    Parameters
    ----------
    filename: str
        The filename to parse
    format: str
        The format (``nt``)
    compression: Optional[str]
        The compression type (``gz``)
    provided_by: Optional[str]
        The name of the source providing the input file
    kwargs: Any
        Any additional arguments

    Returns
    -------
    Generator
        A generator for records

    """
    p = CustomNTriplesParser(self)
    if provided_by:
        self.graph_metadata['provided_by'] = [provided_by]

    # Use a context manager so the file handle is not leaked.
    opener = gzip.open if compression == 'gz' else open
    with opener(filename, 'rb') as stream:
        yield from p.parse(stream)
    log.info(f"Done parsing {filename}")

    # Resolve whatever reified nodes are still pending.
    for n in self.reified_nodes:
        data = self.node_cache.pop(n)
        self.dereify(n, data)

    # Emit the cached nodes; every node carries biolink:NamedThing.
    for k, data in self.node_cache.items():
        if 'category' in data:
            if 'biolink:NamedThing' not in set(data['category']):
                data['category'].append('biolink:NamedThing')
        else:
            data['category'] = ["biolink:NamedThing"]
        data = validate_node(data)
        data = sanitize_import(data)
        if 'provided_by' in self.graph_metadata and 'provided_by' not in data:
            data['provided_by'] = self.graph_metadata['provided_by']
        if self.check_node_filter(data):
            self.node_properties.update(data.keys())
            yield k, data
    self.node_cache.clear()

    # Emit the cached edges.
    for k, data in self.edge_cache.items():
        data = validate_edge(data)
        data = sanitize_import(data)
        if 'provided_by' in self.graph_metadata and 'provided_by' not in data:
            data['provided_by'] = self.graph_metadata['provided_by']
        if self.check_edge_filter(data):
            self.edge_properties.update(data.keys())
            yield k[0], k[1], k[2], data
    self.edge_cache.clear()
def load_edge(self, edge: Dict) -> Generator:
    """
    Load an edge into an instance of BaseGraph

    The ``predicate_id`` entry is removed from ``edge`` in place. Entries
    whose key is in ``SSSOM_NODE_PROPERTY_MAPPING`` are routed to the
    subject or object node; all other entries become edge properties.

    Parameters
    ----------
    edge : Dict
        An edge

    Returns
    -------
    Generator
        A generator for node and edge records: the results of ``load_node``
        for the subject and the object, followed by the edge record when
        it has both a subject and an object

    """
    element_uri, canonical_uri, predicate, property_name = process_predicate(
        self.prefix_manager, edge['predicate_id'], self.predicate_mapping
    )
    # First truthy of: element URI, predicate, property name.
    edge_predicate = element_uri or predicate or property_name
    if canonical_uri:
        edge_predicate = element_uri

    data = {
        'subject': edge['subject_id'],
        'predicate': edge_predicate,
        'object': edge['object_id'],
    }
    edge.pop('predicate_id')
    data = validate_edge(data)

    subject_node = {}
    object_node = {}
    for k, v in edge.items():
        if k not in SSSOM_NODE_PROPERTY_MAPPING:
            data[k] = v
            continue
        if k.startswith('subject'):
            node = subject_node
        elif k.startswith('object'):
            node = object_node
        else:
            log.info(f"Ignoring {k} {v}")
            continue
        mapped_k = SSSOM_NODE_PROPERTY_MAPPING[k]
        # A category that is not a CURIE is replaced by a fixed class.
        if mapped_k == 'category' and not PrefixManager.is_curie(v):
            v = "biolink:OntologyClass"
        node[mapped_k] = v

    objs = [self.load_node(subject_node), self.load_node(object_node)]

    # Graph-level metadata (except the CURIE map) is copied onto the edge.
    for k, v in self.graph_metadata.items():
        if k != 'curie_map':
            data[k] = v

    edge_data = sanitize_import(data.copy())
    if 'subject' not in edge_data or 'object' not in edge_data:
        log.info(
            "Ignoring edge with either a missing 'subject' or 'object': {}".format(edge_data)
        )
    else:
        if 'id' not in edge_data:
            edge_data['id'] = generate_uuid()
        s = edge_data['subject']
        o = edge_data['object']
        if 'provided_by' in self.graph_metadata and 'provided_by' not in edge_data:
            edge_data['provided_by'] = self.graph_metadata['provided_by']
        key = generate_edge_key(s, edge_data['predicate'], o)
        self.edge_properties.update(list(edge_data.keys()))
        objs.append((s, o, key, edge_data))

    yield from objs
def load_graph(self, rdfgraph: rdflib.Graph, **kwargs: Any) -> None:
    """
    Walk through the rdflib.Graph and load all triples into kgx.graph.base_graph.BaseGraph

    Triples are handled in four passes: ``rdfs:subClassOf``,
    ``owl:equivalentClass``, triples whose subject is an
    ``owl:ObjectProperty``, and finally every remaining triple. Afterwards
    the cached nodes and edges are emitted.

    Note that, despite the ``None`` annotation, this is a generator: it
    yields whatever ``self.triple`` yields, then ``(key, node_data)`` for
    each cached node and ``(subject, object, key, edge_data)`` for each
    cached edge that passes the corresponding filter.

    Parameters
    ----------
    rdfgraph: rdflib.Graph
        Graph containing nodes and edges
    kwargs: Any
        Any additional arguments

    """
    blank_node = rdflib.term.BNode
    seen = {RDFS.subClassOf}

    for s, p, o in rdfgraph.triples((None, RDFS.subClassOf, None)):
        # ignoring blank nodes
        if isinstance(s, blank_node):
            continue
        os_interpretation = None
        if isinstance(o, blank_node):
            # C SubClassOf R some D
            pred = None
            parent = None
            for restricted_property in rdfgraph.objects(o, OWL.onProperty):
                pred = restricted_property
            # owl:someValuesFrom
            for filler in rdfgraph.objects(o, OWL.someValuesFrom):
                os_interpretation = self.OWLSTAR.term('AllSomeInterpretation')
                parent = filler
            # owl:allValuesFrom
            for filler in rdfgraph.objects(o, OWL.allValuesFrom):
                os_interpretation = self.OWLSTAR.term("AllOnlyInterpretation")
                parent = filler
            if pred is None or parent is None:
                log.warning(
                    f"{s} {p} {o} has OWL.onProperty {pred} and OWL.someValuesFrom {parent}"
                )
                log.warning("Do not know how to handle BNode: {}".format(o))
                continue
        else:
            # C rdfs:subClassOf D (where C and D are named classes)
            pred = p
            parent = o

        if not os_interpretation:
            yield from self.triple(s, pred, parent)
            continue

        # reify edges that have logical interpretation
        eid = generate_uuid()
        self.reified_nodes.add(eid)
        reified_statements = (
            ('category', self.BIOLINK.Association),
            ('subject', s),
            ('predicate', pred),
            ('object', parent),
            ('logical_interpretation', os_interpretation),
        )
        for slot, value in reified_statements:
            yield from self.triple(URIRef(eid), self.BIOLINK.term(slot), value)

    seen.add(OWL.equivalentClass)
    for s, p, o in rdfgraph.triples((None, OWL.equivalentClass, None)):
        # A owl:equivalentClass B (where A and B are named classes)
        if isinstance(o, blank_node):
            continue
        yield from self.triple(s, p, o)

    for relation in rdfgraph.subjects(RDF.type, OWL.ObjectProperty):
        seen.add(relation)
        for s, p, o in rdfgraph.triples((relation, None, None)):
            if isinstance(o, blank_node) or p in self.excluded_predicates:
                continue
            yield from self.triple(s, p, o)

    # Everything that was not covered by the passes above.
    for s, p, o in rdfgraph.triples((None, None, None)):
        if isinstance(s, blank_node) or isinstance(o, blank_node):
            continue
        if p in seen or p in self.excluded_predicates:
            continue
        yield from self.triple(s, p, o)

    # Resolve whatever reified nodes are still pending.
    for n in self.reified_nodes:
        pending = self.node_cache.pop(n)
        self.dereify(n, pending)

    for k, cached_node in self.node_cache.items():
        node_data = sanitize_import(validate_node(cached_node))
        if 'provided_by' in self.graph_metadata and 'provided_by' not in node_data:
            node_data['provided_by'] = self.graph_metadata['provided_by']
        if self.check_node_filter(node_data):
            yield k, node_data
    self.node_cache.clear()

    for k, cached_edge in self.edge_cache.items():
        edge_data = sanitize_import(validate_edge(cached_edge))
        if 'provided_by' in self.graph_metadata and 'provided_by' not in edge_data:
            edge_data['provided_by'] = self.graph_metadata['provided_by']
        if self.check_edge_filter(edge_data):
            yield k[0], k[1], k[2], edge_data
    self.edge_cache.clear()
def test_validate_correct_edge(edge):
    """
    Check that ``validate_edge`` returns a non-None result for a valid edge.
    """
    assert validate_edge(edge) is not None
def test_validate_incorrect_edge(edge):
    """
    Check that ``validate_edge`` raises ``KeyError`` for an invalid edge.
    """
    expected_error = KeyError
    with pytest.raises(expected_error):
        validate_edge(edge)