def get_ontology(dataset: str) -> Ontology:
    """Get ontology of a given dataset"""
    global _data_io_vars
    # per-dataset memoization table shared across calls
    cached = _data_io_vars["ont"]
    if dataset in cached:
        return cached[dataset]
    # not memoized yet: try the on-disk pickle cache first
    cache_file = get_cache_dir(dataset) / 'ont.pkl'
    cache_file.parent.mkdir(exist_ok=True, parents=True)
    if cache_file.exists():
        ont = deserialize(cache_file)
    else:
        # cache miss: build from the dataset definition and persist for next time
        ont = Ontology.from_dataset(dataset)
        serialize(ont, cache_file)
    cached[dataset] = ont
    return ont
def build_ont_graph(dataset: str) -> OntGraph:
    """Build the ontology graph for *dataset*.

    Loads every ontology file declared in the dataset configuration with
    ontospy, registers class and property nodes, merges predicates that
    appear in several ontology files, and finally wires up the
    parent/child links implied by subClassOf references.

    Raises:
        Whatever ``Predicate`` construction raises — the offending property
        is printed first to ease debugging, then the exception is re-raised.
    """
    ont = Ontology.from_dataset(dataset)
    ont_graph: OntGraph = OntGraph(dataset)
    predicates: Dict[str, Predicate] = {}

    for ont_name, ont_conf in config.datasets[dataset].ontology.items():
        # an ontology entry declares either one file ('fpath') or several ('fpaths')
        fpaths = []
        if 'fpath' in ont_conf:
            fpaths = [ont_conf.fpath]
        elif 'fpaths' in ont_conf:
            # BUG FIX: 'fpaths' is already a list of paths; the previous code
            # wrapped it in another list ([ont_conf.fpaths]), which made the
            # later `fpath.as_path()` call operate on a list and fail.
            fpaths = ont_conf.fpaths

        for fpath in fpaths:
            g = ontospy.Ontospy(str(fpath.as_path()))
            # set to True when at least one property in this file carries a
            # usable OWL property type; propagated to this ontology's
            # predicates after the property loop below
            is_rdf_type_reliable = False

            for cls in g.classes:
                add_node(ont, ont_graph, cls)

            for prop in g.properties:
                # make sure every domain/range class is present as a node
                for rg in prop.ranges:
                    add_node(ont, ont_graph, rg)
                for domain in prop.domains:
                    add_node(ont, ont_graph, domain)

                try:
                    predicate = Predicate(
                        str(prop.uri), [str(x.uri) for x in prop.domains],
                        [str(x.uri) for x in prop.ranges],
                        ont.simplify_uri(str(prop.rdftype)), False, {ont_name})
                    # merge with an earlier definition of the same predicate URI
                    if str(prop.uri) in predicates:
                        predicates[str(prop.uri)].merge(predicate)
                    else:
                        predicates[str(prop.uri)] = predicate

                    if predicate.rdf_type in {
                            PredicateType.OWL_DATA_PROP,
                            PredicateType.OWL_OBJECT_PROP
                    }:
                        is_rdf_type_reliable = True
                except Exception:
                    # dump the offending property before re-raising so the
                    # failing ontology/property pair is visible in the log
                    print(ont_name, prop)
                    print(prop.__dict__)
                    raise

            for uri, predicate in predicates.items():
                if ont_name in predicate.defined_in_onts:
                    predicate.is_rdf_type_reliable = is_rdf_type_reliable

    ont_graph.set_predicates(list(predicates.values()))

    # update parent & children between nodes
    for node in ont_graph.iter_nodes():
        for node_uri in node.parents_uris.union(node.children_uris):
            if not ont_graph.has_node_with_uri(node_uri):
                # node is referred by subClassOf but never been defined before
                ont_graph.add_new_node(
                    GraphNodeType.CLASS_NODE,
                    ont.simplify_uri(node_uri).encode('utf-8'), node_uri,
                    set(), set())

    # make the parent/child relation symmetric on both endpoints
    for node in ont_graph.iter_nodes():
        for parent_uri in node.parents_uris:
            ont_graph.get_node_by_uri(parent_uri).children_uris.add(node.uri)
        for child_uri in node.children_uris:
            ont_graph.get_node_by_uri(child_uri).parents_uris.add(node.uri)

    return ont_graph
    # NOTE(review): this chunk begins inside a function whose header is not
    # visible here; `raw_tables`, `mapping_dir`, `cache_file` and `dataset`
    # are presumably defined earlier in that (unseen) function — confirm the
    # reconstructed indentation against the full file.
    semantic_models = []
    tables = []
    for i, raw_tbl in enumerate(raw_tables):
        # each raw table has a companion R2RML mapping file "<table id>-model.yml"
        r2rml_file = mapping_dir / f"{raw_tbl.id}-model.yml"
        tbl, sm = R2RML.load_from_file(r2rml_file).apply_build(raw_tbl)
        semantic_models.append(sm)
        tables.append(tbl)
    # persist the freshly built models, then memoize both results in the
    # module-level cache before returning
    serializeJSON(semantic_models, cache_file)
    _data_io_vars["data_tables"][dataset] = tables
    _data_io_vars["semantic_models"][dataset] = semantic_models
    return _data_io_vars["semantic_models"][dataset]


if __name__ == '__main__':
    # Render every semantic model (as PDF) and every data table (as text)
    # of the museum_crm dataset into sibling *-viz directories for inspection.
    dataset = 'museum_crm'
    ont = Ontology.from_dataset(dataset)
    data_dir = Path(config.datasets[dataset].as_path())
    (data_dir / "models-viz").mkdir(exist_ok=True, parents=True)
    (data_dir / "tables-viz").mkdir(exist_ok=True, parents=True)

    for sm in get_semantic_models(dataset):
        sm.graph.render2pdf(data_dir / f"models-viz/{sm.id}.pdf")

    for tbl in get_data_tables(dataset):
        # write bytes explicitly to keep the output encoding UTF-8 everywhere
        with open(data_dir / "tables-viz" / f"{tbl.id}.txt", "wb") as f:
            f.write(tbl.to_string().encode("utf-8"))