def from_json(obj: dict, ont: Ontology) -> 'SSD':
    """Deserialize an SSD (semantic source description) from its JSON form.

    Rebuilds the internal Graph from the JSON semantic model's nodes/links and
    collects the SSD attributes that the data nodes are mapped to.

    :param obj: JSON dict with 'mappings', 'attributes', 'semanticModel' and 'name' keys
    :param ont: ontology used to shorten full URIs into prefixed labels
    :return: the reconstructed SSD
    """
    g = Graph(True, True, True)
    # semantic-model node id -> attribute id, from the explicit mapping section
    node2attr = {x['node']: x['attribute'] for x in obj['mappings']}
    idmap = {}  # JSON node id -> id of the corresponding node added to `g`
    raw_attributes = {}
    for raw_attr in obj['attributes']:
        # each attribute is expected to cover exactly one column whose id equals the attribute id
        assert len(raw_attr['columnIds']
                   ) == 1 and raw_attr['columnIds'][0] == raw_attr['id']
        raw_attributes[raw_attr['id']] = raw_attr
    attrs = []
    for n in obj['semanticModel']['nodes']:
        if n['type'] == 'DataNode':
            node_type = GraphNodeType.DATA_NODE
            # data nodes are labeled with the name of the attribute they map to
            attr = raw_attributes[node2attr[n['id']]]
            n_lbl = attr['name']
            attrs.append(SSDAttribute(n['id'], n_lbl))
        else:
            node_type = GraphNodeType.CLASS_NODE
            # class nodes carry a (prefix, label) pair; re-assemble and simplify the URI
            n_lbl = n['prefix'] + n['label']
            n_lbl = ont.simplify_uri(n_lbl)
        idmap[n['id']] = g.add_new_node(node_type, n_lbl.encode()).id
    for e in obj['semanticModel']['links']:
        e_lbl = e['prefix'] + e['label']
        e_lbl = ont.simplify_uri(e_lbl)
        g.add_new_link(GraphLinkType.UNSPECIFIED, e_lbl.encode(),
                       idmap[e['source']], idmap[e['target']])
    return SSD(obj['name'], attrs, g, ont)
def from_karma_model(
        node: dict, ont: Ontology,
        id2columns: Dict[str, 'KarmaSourceColumn']
) -> Tuple['KarmaGraphNode', int, bytes]:
    """Convert one node of a Karma model JSON into a KarmaGraphNode triple.

    Fix: the local previously named ``type`` shadowed the builtin; renamed to
    ``node_type`` (no behavior change).

    :param node: Karma JSON node; must be 'ColumnNode', 'InternalNode' or 'LiteralNode'
    :param ont: ontology used to simplify full URIs
    :param id2columns: map from column-node id to its KarmaSourceColumn
    :return: (node payload, graph node type, utf-8 encoded label)
    """
    assert node['type'] in {'ColumnNode', 'InternalNode', 'LiteralNode'
                            }, "Not recognized type: %s" % node['type']
    if node['type'] in {'ColumnNode', 'LiteralNode'}:
        node_type = GraphNodeType.DATA_NODE
    else:
        assert node['type'] == "InternalNode", node['type']
        node_type = GraphNodeType.CLASS_NODE

    is_literal_node = False
    if node_type == GraphNodeType.DATA_NODE:
        # IMPORTANT: this related to the SourceColumn::get_unique_column_name
        if node['type'] == 'LiteralNode':
            # trying to make short & readable label using heuristic
            label = node['value']
            is_literal_node = True
        else:
            label = id2columns[node['id']].column_name
    else:
        label = ont.simplify_uri(node['label']['uri'])

    user_semantic_types = []
    if 'user_semantic_types' in node:
        for x in node['user_semantic_types']:
            x = _dict_camel_to_snake(x)
            x['domain'] = ont.simplify_uri(x['domain']['uri'])
            x['type'] = ont.simplify_uri(x['type']['uri'])
            user_semantic_types.append(KarmaSemanticType(**x))
        # because there is duplication in data sources, now we filter out duplicated semantic types
        user_semantic_types = unique_values(
            user_semantic_types, key=lambda n: n.get_hashing_id())

    learned_semantic_types = []
    if 'learned_semantic_types' in node and node[
            'learned_semantic_types'] is not None:
        for x in node['learned_semantic_types']:
            x = _dict_camel_to_snake(x)
            x['domain'] = ont.simplify_uri(x['domain']['uri'])
            x['type'] = ont.simplify_uri(x['type']['uri'])
            x['h_node_id'] = node['h_node_id']
            learned_semantic_types.append(KarmaSemanticType(**x))

    # double check data
    assert node['model_ids'] is None
    if 'rdf_literal_type' not in node or node['rdf_literal_type'] is None:
        literal_type = None
    else:
        literal_type = node['rdf_literal_type']['uri']

    return KarmaGraphNode(user_semantic_types, learned_semantic_types,
                          literal_type,
                          is_literal_node), node_type, label.encode('utf-8')
def add_ont_paths(graph: IntGraph, ont: Ontology, ont_graph: OntGraph) -> None:
    """For every ordered pair of class nodes in `graph`, add any predicate the
    ontology allows between their classes that is not already present as a link.

    Links added here are tagged with Tag.ONT_GRAPH_SOURCE so they can later be
    distinguished from links that came from the data itself.
    """
    for u in graph.iter_class_nodes():
        for v in graph.iter_class_nodes():
            if u == v:
                continue
            # look up the ontology-graph nodes carrying the same labels
            # (assumes each label matches at least one node — next() raises StopIteration otherwise)
            c1 = next(ont_graph.iter_nodes_by_label(u.label))
            c2 = next(ont_graph.iter_nodes_by_label(v.label))
            possible_predicates = ont_graph.get_possible_predicates(
                ont.full_uri(c1.label.decode('utf-8')),
                ont.full_uri(c2.label.decode('utf-8')))
            for p in possible_predicates:
                p_lbl = ont.simplify_uri(p.uri).encode('utf-8')
                # skip predicates that already exist as a u -> v link with this label
                e = next((e for e in v.iter_incoming_links()
                          if e.source_id == u.id and e.label == p_lbl), None)
                if e is None:
                    e = IntGraphLink({Tag.ONT_GRAPH_SOURCE})
                    graph.real_add_new_link(e, GraphLinkType.UNSPECIFIED,
                                            p_lbl, u.id, v.id)
def from_karma_model(
        link: dict, ont: Ontology) -> Tuple['KarmaGraphLink', int, bytes, int, int]:
    """Convert one link of a Karma model JSON into its graph representation.

    :param link: Karma JSON link with 'type', 'weight', 'label', 'source_id', 'target_id'
    :param ont: ontology used to simplify the link's full URI
    :return: (link payload, link type, utf-8 encoded label, source id, target id)
    """
    kind = link['type']
    if kind == 'ObjectPropertyLink':
        link_type = GraphLinkType.OBJECT_PROPERTY
    elif kind == 'DataPropertyLink':
        link_type = GraphLinkType.DATA_PROPERTY
    else:
        # any other value is a data error
        assert kind == 'ClassInstanceLink'
        link_type = GraphLinkType.URI_PROPERTY

    label = ont.simplify_uri(link['label']['uri']).encode('utf-8')
    return (KarmaGraphLink(link['weight']), link_type, label,
            link['source_id'], link['target_id'])
def get_ontology(dataset: str) -> Ontology:
    """Get ontology of a given dataset"""
    global _data_io_vars
    ont_cache = _data_io_vars["ont"]
    if dataset not in ont_cache:
        # not memoized yet: load from the on-disk pickle cache, or build and cache it
        cache_file = get_cache_dir(dataset) / 'ont.pkl'
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        if cache_file.exists():
            ont = deserialize(cache_file)
        else:
            ont = Ontology.from_dataset(dataset)
            serialize(ont, cache_file)
        ont_cache[dataset] = ont
    return ont_cache[dataset]
def create_node_args(
        ont: Ontology, cls: OntoClass
) -> Optional[Tuple[int, bytes, str, Set[str], Set[str]]]:
    """Build the argument tuple for adding an ontology class as a graph node.

    Returns None when the class URI is filtered out. Children are always
    empty here; they are filled in later from the parents of other nodes.

    :return: (node type, simplified utf-8 label, full URI, parent URIs, child URIs) or None
    """
    uri = str(cls.uri)
    if not filter_uri(uri):
        return None

    node_type = (GraphNodeType.DATA_NODE
                 if is_data_node(uri) else GraphNodeType.CLASS_NODE)

    helper = cls.sparqlHelper
    parents: Set[str] = set()
    if helper is not None:
        parents = {str(row[0]) for row in helper.getClassAllSupers(cls.uri)}

    return node_type, ont.simplify_uri(uri).encode('utf-8'), uri, parents, set()
def build_ont_graph(dataset: str) -> OntGraph:
    """Build the ontology graph of a dataset from its configured ontology files.

    Parses each ontology file with ontospy, adds every class / property
    domain / property range as a node, collects and merges predicates, then
    materializes parent/child relations between the nodes.

    Fix: the 'fpaths' config value is already a list of paths; it was
    previously wrapped in another list ([ont_conf.fpaths]), which would make
    `fpath.as_path()` fail on the inner list.
    """
    ont = Ontology.from_dataset(dataset)
    ont_graph: OntGraph = OntGraph(dataset)
    predicates: Dict[str, Predicate] = {}
    for ont_name, ont_conf in config.datasets[dataset].ontology.items():
        fpaths = []
        if 'fpath' in ont_conf:
            fpaths = [ont_conf.fpath]
        elif 'fpaths' in ont_conf:
            # FIX: was `[ont_conf.fpaths]`, which nested the list of paths
            fpaths = ont_conf.fpaths
        for fpath in fpaths:
            g = ontospy.Ontospy(str(fpath.as_path()))
            # an ontology's rdf:type info is considered reliable only if it declares
            # at least one owl Data/Object property explicitly
            is_rdf_type_reliable = False
            for cls in g.classes:
                add_node(ont, ont_graph, cls)
            for prop in g.properties:
                for rg in prop.ranges:
                    add_node(ont, ont_graph, rg)
                for domain in prop.domains:
                    add_node(ont, ont_graph, domain)
                try:
                    predicate = Predicate(str(prop.uri),
                                          [str(x.uri) for x in prop.domains],
                                          [str(x.uri) for x in prop.ranges],
                                          ont.simplify_uri(str(prop.rdftype)),
                                          False, {ont_name})
                    if str(prop.uri) in predicates:
                        # same predicate seen in another ontology file: merge domains/ranges
                        predicates[str(prop.uri)].merge(predicate)
                    else:
                        predicates[str(prop.uri)] = predicate
                    if predicate.rdf_type in {
                            PredicateType.OWL_DATA_PROP,
                            PredicateType.OWL_OBJECT_PROP
                    }:
                        is_rdf_type_reliable = True
                except Exception:
                    # dump the offending property before re-raising for easier debugging
                    print(ont_name, prop)
                    print(prop.__dict__)
                    raise
            # propagate the reliability flag to every predicate defined by this ontology
            for uri, predicate in predicates.items():
                if ont_name in predicate.defined_in_onts:
                    predicate.is_rdf_type_reliable = is_rdf_type_reliable
    ont_graph.set_predicates(list(predicates.values()))
    # update parent & children between nodes
    # NOTE(review): this adds nodes while iterating iter_nodes() — assumes the
    # iterator tolerates concurrent insertion; confirm against OntGraph impl.
    for node in ont_graph.iter_nodes():
        for node_uri in node.parents_uris.union(node.children_uris):
            if not ont_graph.has_node_with_uri(node_uri):
                # node is referred by subClassOf but never been defined before
                ont_graph.add_new_node(
                    GraphNodeType.CLASS_NODE,
                    ont.simplify_uri(node_uri).encode('utf-8'), node_uri,
                    set(), set())
    for node in ont_graph.iter_nodes():
        for parent_uri in node.parents_uris:
            ont_graph.get_node_by_uri(parent_uri).children_uris.add(node.uri)
        for child_uri in node.children_uris:
            ont_graph.get_node_by_uri(child_uri).parents_uris.add(node.uri)
    return ont_graph
def to_kr2rml(self, ont: Ontology, tbl: DataTable, fpath: Union[str, Path]):
    """Serialize this model's commands as a Karma KR2RML mapping (n3 format).

    Builds the km-dev worksheet history (python transforms, semantic types,
    internal links) plus the input/output column declarations, and writes the
    resulting RDF graph to `fpath`.

    :param ont: ontology used to expand simplified URIs back to full form
    :param tbl: the data table the model describes (source of schema paths)
    :param fpath: output file path for the n3 serialization
    """
    g = RDFGraph()
    km_dev = Namespace("http://isi.edu/integration/karma/dev#")
    g.namespace_manager.bind("km-dev", km_dev)
    kr2rml = BNode()
    g.add((kr2rml, RDF.type, km_dev.R2RMLMapping))
    g.add((kr2rml, km_dev.sourceName, Literal(tbl.id)))
    # timestamp and version, doesn't need to be precise
    g.add((kr2rml, km_dev.modelPublicationTime, Literal(1414133381264)))
    g.add((kr2rml, km_dev.modelVersion, Literal("1.7")))

    input_columns = []
    output_columns = []
    # mapping from Schema attribute path OR Command to KarmaColumns
    attr2hnodes: Dict[Union[str, PyTransformNewColumnCmd],
                      List[Dict[str, str]]] = {}

    for attr_path in tbl.schema.get_attr_paths():
        input_columns.append([{
            "columnName": x
        } for x in attr_path.split(Schema.PATH_DELIMITER)])
        if tbl.schema.get_attr_type(attr_path) == Schema.LIST_VALUE:
            # default karma behaviour, you cannot set semantic type for higher level, but only "values"
            input_columns[-1].append({"columnName": "values"})
        output_columns.append(input_columns[-1])
        attr2hnodes[attr_path] = input_columns[-1]

    for cmd in self.commands:
        if isinstance(cmd, PyTransformNewColumnCmd):
            # new column lives next to the first input column
            new_attr_path = cmd.input_attr_paths[0].split(
                Schema.PATH_DELIMITER)[:-1]
            new_attr_path.append(cmd.new_attr_name)
            new_attr_path = Schema.PATH_DELIMITER.join(new_attr_path)
            # when you create a new column from a list, karma convert to a list of objects
            # e.g: birth_death_date.values, create col death date from that,
            # Karma create => birth_death_date.death_date
            # that's why we have this code below
            new_hnode = attr2hnodes[cmd.input_attr_paths[0]][:-1]
            new_hnode.append({"columnName": cmd.new_attr_name})
            output_columns.append(new_hnode)
            attr2hnodes[cmd] = output_columns[-1]
            attr2hnodes[new_attr_path] = output_columns[-1]

    worksheet_history = []
    # re-arrange commands to fit the issue of node id = Concept2 (Karma will convert Concept2 to Concept1)
    commands = [
        cmd for cmd in self.commands
        if isinstance(cmd, PyTransformNewColumnCmd)
    ]
    for cmd in sorted(
            [c for c in self.commands if isinstance(c, SetSemanticTypeCmd)],
            key=lambda c: c.node_id):
        commands.append(cmd)
    for cmd in sorted(
            [c for c in self.commands if isinstance(c, SetInternalLinkCmd)],
            key=lambda c: c.target_uri or c.source_uri or ""):
        commands.append(cmd)

    # sometime the model use incorrect node id like: node id = Concept7 (no Concept1..6), will result as an error in Karma
    # need to re-arrange the node_id
    node_id_old2new: Dict[str, str] = {}
    node_id_domain_count: Dict[str, int] = {}
    for cmd in commands:
        if isinstance(cmd, PyTransformNewColumnCmd):
            pass
        elif isinstance(cmd, SetSemanticTypeCmd):
            if cmd.node_id not in node_id_old2new:
                node_id_domain_count[
                    cmd.domain] = node_id_domain_count.get(cmd.domain, 0) + 1
                node_id_old2new[
                    cmd.
                    node_id] = f"{cmd.domain}{node_id_domain_count[cmd.domain]}"
        elif isinstance(cmd, SetInternalLinkCmd):
            if cmd.source_id not in node_id_old2new:
                assert cmd.source_uri is not None
                node_id_domain_count[
                    cmd.source_uri] = node_id_domain_count.get(
                        cmd.source_uri, 0) + 1
                node_id_old2new[
                    cmd.
                    source_id] = f"{cmd.source_uri}{node_id_domain_count[cmd.source_uri]}"
            if cmd.target_id not in node_id_old2new:
                assert cmd.target_uri is not None
                node_id_domain_count[
                    cmd.target_uri] = node_id_domain_count.get(
                        cmd.target_uri, 0) + 1
                node_id_old2new[
                    cmd.
                    target_id] = f"{cmd.target_uri}{node_id_domain_count[cmd.target_uri]}"

    for cmd in commands:
        if isinstance(cmd, PyTransformNewColumnCmd):
            pytransform_code = cmd.code
            # recover pytransform_code from our code
            pytransform_code = pytransform_code.replace(
                "__return__ = ", "return ")
            # rewrite getValue(<full path>) calls to use Karma's last-column names;
            # iterate matches in reverse so earlier spans stay valid while splicing
            for match in reversed(
                    list(
                        re.finditer("getValue\(([^)]+)\)",
                                    pytransform_code))):
                start, end = match.span(1)
                field = pytransform_code[start:end].replace(
                    "'", "").replace('"""', "").replace('"', '')
                # convert full name to last column name since Karma use last column name instead
                for input_attr_path in cmd.input_attr_paths:
                    if input_attr_path == field:
                        # TODO: will Karma always use last column name?
                        field = attr2hnodes[input_attr_path][-1][
                            'columnName']
                        break
                else:
                    assert False, f"Cannot find any field {field} in the input columns"
                pytransform_code = pytransform_code[:
                                                    start] + f'"{field}"' + pytransform_code[
                                                        end:]

            worksheet_history.append({
                "tags": ["Transformation"],
                "commandName":
                "SubmitPythonTransformationCommand",
                "inputParameters": [{
                    "name": "hNodeId",
                    "value": attr2hnodes[cmd.input_attr_paths[0]],
                    "type": "hNodeId"
                }, {
                    "name": "worksheetId",
                    "value": "W",
                    "type": "worksheetId"
                }, {
                    "name": "selectionName",
                    "value": "DEFAULT_TEST",
                    "type": "other"
                }, {
                    "name": "newColumnName",
                    "value": cmd.new_attr_name,
                    "type": "other"
                }, {
                    "name": "transformationCode",
                    "value": pytransform_code,
                    "type": "other"
                }, {
                    "name": "errorDefaultValue",
                    "value": cmd.default_error_value,
                    "type": "other"
                }, {
                    "name":
                    "inputColumns",
                    "type":
                    "hNodeIdList",
                    "value":
                    ujson.dumps([{
                        "value": attr2hnodes[iap]
                    } for iap in cmd.input_attr_paths])
                }, {
                    "name":
                    "outputColumns",
                    "type":
                    "hNodeIdList",
                    "value":
                    ujson.dumps([{
                        "value":
                        attr2hnodes[cmd]
                        if attr2hnodes[cmd][-1]['columnName'] != "values"
                        else attr2hnodes[cmd][:-1]
                    }])
                }]
            })
        elif isinstance(cmd, SetSemanticTypeCmd):
            if cmd.type != "karma:classLink":
                worksheet_history.append({
                    "commandName":
                    "SetSemanticTypeCommand",
                    "tags": ["Modeling"],
                    "inputParameters": [
                        {
                            "name": "hNodeId",
                            "value": attr2hnodes[cmd.input_attr_path],
                            "type": "hNodeId"
                        },
                        {
                            "name": "worksheetId",
                            "value": "W",
                            "type": "worksheetId"
                        },
                        {
                            "name": "selectionName",
                            "value": "DEFAULT_TEST",
                            "type": "other"
                        },
                        {
                            "name":
                            "SemanticTypesArray",
                            "type":
                            "other",
                            "value": [{
                                "FullType":
                                ont.full_uri(cmd.type),
                                "isPrimary":
                                True,
                                "DomainLabel":
                                ont.simplify_uri(
                                    node_id_old2new[cmd.node_id]),
                                "DomainId":
                                ont.full_uri(node_id_old2new[cmd.node_id]),
                                "DomainUri":
                                ont.full_uri(cmd.domain)
                            }]
                        },
                        {
                            "name": "trainAndShowUpdates",
                            "value": False,
                            "type": "other"
                        },
                        {
                            "name": "rdfLiteralType",
                            "value": "",
                            "type": "other"
                        },  # TODO: update correct RDF-Literal-Type
                        {
                            "name":
                            "inputColumns",
                            "type":
                            "hNodeIdList",
                            "value":
                            ujson.dumps([{
                                "value":
                                attr2hnodes[cmd.input_attr_path]
                            }])
                        },
                        {
                            "name":
                            "outputColumns",
                            "type":
                            "hNodeIdList",
                            "value":
                            ujson.dumps([{
                                "value":
                                attr2hnodes[cmd.input_attr_path]
                            }])
                        }
                    ]
                })
            else:
                # class-instance ("uri of class") semantic types use a different Karma command
                worksheet_history.append({
                    "commandName":
                    "SetMetaPropertyCommand",
                    "tags": ["Modeling"],
                    "inputParameters": [
                        {
                            "name": "hNodeId",
                            "value": attr2hnodes[cmd.input_attr_path],
                            "type": "hNodeId"
                        },
                        {
                            "name": "worksheetId",
                            "value": "W",
                            "type": "worksheetId"
                        },
                        {
                            "name": "selectionName",
                            "value": "DEFAULT_TEST",
                            "type": "other"
                        },
                        {
                            "name": "metaPropertyName",
                            "value": "isUriOfClass",
                            "type": "other"
                        },
                        {
                            "name": "metaPropertyUri",
                            "value": ont.full_uri(cmd.domain),
                            "type": "other"
                        },
                        {
                            "name": "metaPropertyId",
                            "value":
                            ont.full_uri(node_id_old2new[cmd.node_id]),
                            "type": "other"
                        },
                        {
                            "name":
                            "SemanticTypesArray",
                            "type":
                            "other",
                            "value": [{
                                "FullType":
                                ont.full_uri(cmd.type),
                                "isPrimary":
                                True,
                                "DomainLabel":
                                ont.simplify_uri(
                                    node_id_old2new[cmd.node_id]),
                                "DomainId":
                                ont.full_uri(node_id_old2new[cmd.node_id]),
                                "DomainUri":
                                ont.full_uri(cmd.domain)
                            }]
                        },
                        {
                            "name": "trainAndShowUpdates",
                            "value": False,
                            "type": "other"
                        },
                        {
                            "name": "rdfLiteralType",
                            "value": "",
                            "type": "other"
                        },  # TODO: update correct RDF-Literal-Type
                        {
                            "name":
                            "inputColumns",
                            "type":
                            "hNodeIdList",
                            "value":
                            ujson.dumps([{
                                "value":
                                attr2hnodes[cmd.input_attr_path]
                            }])
                        },
                        {
                            "name":
                            "outputColumns",
                            "type":
                            "hNodeIdList",
                            "value":
                            ujson.dumps([{
                                "value":
                                attr2hnodes[cmd.input_attr_path]
                            }])
                        }
                    ]
                })
        elif isinstance(cmd, SetInternalLinkCmd):
            # TODO: comment out because old KARMA doesn't recognize this!
            # if cmd.target_uri is not None or cmd.source_uri is not None:
            #     worksheet_history.append({
            #         "commandName": "AddLinkCommand",
            #         "tags": ["Modeling"],
            #         "inputParameters": [
            #             {"name": "worksheetId", "value": "W", "type": "worksheetId"},
            #             {
            #                 "name": "edge",
            #                 "type": "other",
            #                 "value": {
            #                     "edgeId": ont.full_uri(cmd.link_lbl),
            #                     "edgeTargetId": ont.full_uri(node_id_old2new[cmd.target_id]),
            #                     "edgeTargetUri": ont.full_uri(cmd.target_uri or cmd.target_id[:-1]),
            #                     "edgeSourceId": ont.full_uri(node_id_old2new[cmd.source_id]),
            #                     "edgeSourceUri": ont.full_uri(cmd.source_uri or cmd.source_id[:-1])
            #                 }
            #             },
            #             {"name": "inputColumns", "type": "hNodeIdList", "value": []},
            #             {"name": "outputColumns", "type": "hNodeIdList", "value": []}
            #         ]
            #     })
            # else:
            worksheet_history.append({
                "commandName":
                "ChangeInternalNodeLinksCommand",
                "tags": ["Modeling"],
                "inputParameters": [{
                    "name": "worksheetId",
                    "value": "W",
                    "type": "worksheetId"
                }, {
                    "name":
                    "initialEdges",
                    "type":
                    "other",
                    "value": [{
                        "edgeId":
                        ont.full_uri(cmd.link_lbl),
                        "edgeTargetId":
                        ont.full_uri(node_id_old2new[cmd.target_id]),
                        "edgeSourceId":
                        ont.full_uri(node_id_old2new[cmd.source_id])
                    }]
                }, {
                    "name":
                    "newEdges",
                    "type":
                    "other",
                    "value": [{
                        "edgeId":
                        ont.full_uri(cmd.link_lbl),
                        "edgeTargetId":
                        ont.full_uri(node_id_old2new[cmd.target_id]),
                        "edgeSourceId":
                        ont.full_uri(node_id_old2new[cmd.source_id]),
                        "edgeTargetUri":
                        ont.full_uri(
                            cmd.target_uri
                            or node_id_old2new[cmd.target_id][:-1]),
                        "edgeSourceUri":
                        ont.full_uri(
                            cmd.source_uri
                            or node_id_old2new[cmd.source_id][:-1])
                    }]
                }, {
                    "name": "inputColumns",
                    "type": "hNodeIdList",
                    "value": []
                }, {
                    "name": "outputColumns",
                    "type": "hNodeIdList",
                    "value": []
                }]
            })

    g.add((kr2rml, km_dev.hasInputColumns,
           Literal(ujson.dumps(input_columns))))
    g.add((kr2rml, km_dev.hasOutputColumns,
           Literal(ujson.dumps(output_columns))))
    g.add((kr2rml, km_dev.hasModelLabel, Literal(tbl.id)))
    g.add((kr2rml, km_dev.hasBaseURI,
           Literal("http://localhost:8080/source/")))
    g.add((kr2rml, km_dev.hasWorksheetHistory,
           Literal(ujson.dumps(worksheet_history, indent=4))))
    g.serialize(str(fpath), format='n3')
def __init__(self, ont: Ontology) -> None:
    """Seed the completer with the ontology's classes; extended choices start empty."""
    self.extended_choices = []
    self.choices = ont.get_classes()
def get_predicate_completer(ont: Ontology) -> 'StringCompleter':
    """Build a StringCompleter over all predicates of the given ontology."""
    predicates = ont.get_predicates()
    return StringCompleter(predicates)
def __init__(self, ont: Ontology, tbl: DataTable, kr2rml_file: Path) -> None:
    """Parse a Karma KR2RML mapping file back into our command list.

    Reads the km-dev worksheet history out of the n3 file, reconstructs the
    mapping between kr2rml column paths and the table's attribute paths, then
    replays each worksheet command into PyTransform / SetSemanticType /
    SetInternalLink / AddLiteralColumn commands. Conflicting modeling
    commands are resolved by keeping the last one.

    :param ont: ontology used to simplify full URIs
    :param tbl: the data table this mapping applies to
    :param kr2rml_file: path to the Karma-exported .ttl/.n3 model file
    """
    g = rdflib.Graph(store=IOMemory())
    g.parse(location=str(kr2rml_file), format="n3")

    worksheet_history = list(
        g.triples(
            (None,
             URIRef(
                 "http://isi.edu/integration/karma/dev#hasWorksheetHistory"
             ), None)))
    assert len(worksheet_history) == 1
    worksheet_history = ujson.loads(worksheet_history[0][-1])

    input_columns = list(
        g.triples((
            None,
            URIRef("http://isi.edu/integration/karma/dev#hasInputColumns"),
            None)))
    assert len(input_columns) == 1
    input_columns = ujson.loads(input_columns[0][-1])

    # construct mapping between kr2rml attribute paths to tbl_attr_paths
    tbl_attr_paths = tbl.schema.get_attr_paths()
    n_attr_paths = len(tbl_attr_paths)
    # Karma strips '@' from attribute names; the de-@'d path must stay unique
    tbl_attr_paths = {
        apath.replace("@", ""): apath
        for apath in tbl_attr_paths
    }
    assert len(tbl_attr_paths) == n_attr_paths

    # find the index where kr2rml column paths start matching the table's
    # attribute paths (kr2rml paths may have extra leading components)
    start_idx = 0
    for i, cname in enumerate(input_columns[0]):
        cpath = Schema.PATH_DELIMITER.join(
            cname['columnName'] for cname in input_columns[0][i:])
        # cname = Schema.PATH_DELIMITERinput_columns[i:]) cname['columnName'] + Schema.PATH_DELIMITER
        # NOTE(review): the commented line above was garbled in the original
        # source; it may have appended a trailing PATH_DELIMITER to cpath — verify.
        found_attr = False
        for attr_path in tbl_attr_paths:
            if (attr_path + Schema.PATH_DELIMITER).startswith(cpath):
                found_attr = True
                break
        if found_attr:
            start_idx = i
            break

    literal_nodes = {}
    # kr2rml full column path -> table attribute path
    col2col = {}
    for col in input_columns:
        attr_path = Schema.PATH_DELIMITER.join(
            cname['columnName'] for cname in col[start_idx:])
        if attr_path not in tbl_attr_paths:
            # last component is a Karma artifact ('Values' / 'content'); drop it
            attr_path = Schema.PATH_DELIMITER.join(
                cname['columnName'] for cname in col[start_idx:-1])
            if col[-1]['columnName'] == 'Values':
                assert attr_path in tbl_attr_paths
            elif col[-1]['columnName'] == 'content':
                attr_path += Schema.PATH_DELIMITER + "#text"
                assert attr_path in tbl_attr_paths
            else:
                raise ValueError(
                    f"Invalid column type: {col[-1]['columnName']}")
        col2col[Schema.PATH_DELIMITER.join(
            cname['columnName']
            for cname in col)] = tbl_attr_paths[attr_path]
    assert len(set(
        col2col.values())) == len(input_columns), "No duplication"

    # extracting commands
    commands = []
    for command in worksheet_history:
        if command['commandName'] == "SubmitPythonTransformationCommand":
            cmd_start_col = command['inputParameters'][0]
            cmd_input_parent_col = Schema.PATH_DELIMITER.join(
                [col['columnName'] for col in cmd_start_col['value'][:-1]])
            cmd_input_col = command['inputParameters'][-2]
            cmd_output_col = command['inputParameters'][-1]
            # parameter layout differs depending on whether isJSONOutput is present
            if command['inputParameters'][-3]['name'] == 'isJSONOutput':
                cmd_code = command['inputParameters'][-5]
                default_error_value = command['inputParameters'][-4]
                assert command['inputParameters'][-3]['value'] == "false"
            else:
                default_error_value = command['inputParameters'][-3]
                cmd_code = command['inputParameters'][-4]

            assert cmd_input_col['name'] == "inputColumns" and cmd_output_col[
                "name"] == "outputColumns" and cmd_code[
                    'name'] == 'transformationCode' and default_error_value[
                        'name'] == 'errorDefaultValue'

            cmd_input_cols = [[
                cname['columnName'] for cname in o['value']
            ] for o in ujson.loads(cmd_input_col['value'])]
            karma_input_attr_paths = [
                col2col[Schema.PATH_DELIMITER.join(cmd_input_col)]
                for cmd_input_col in cmd_input_cols
            ]

            # update col2col because of new columns
            new_attr_name = ujson.loads(
                cmd_output_col['value'])[0]['value'][-1]['columnName']
            new_attr_path = new_attr_name if cmd_input_parent_col == "" else (
                cmd_input_parent_col + Schema.PATH_DELIMITER +
                new_attr_name)
            cmd_output_col = Schema.PATH_DELIMITER.join(
                cname['columnName'] for cname in ujson.loads(
                    cmd_output_col['value'])[0]['value'])
            col2col[cmd_output_col] = new_attr_path

            cmd_code = cmd_code['value'].replace("return ",
                                                 "__return__ = ")
            input_attr_paths = []
            # rewrite getValue(<last column name>) back to full attribute paths;
            # iterate in reverse so earlier match spans stay valid while splicing
            for match in reversed(
                    list(re.finditer("getValue\(([^)]+)\)", cmd_code))):
                start, end = match.span(1)
                field = cmd_code[start:end].replace("'", "").replace(
                    '"""', "").replace('"', '')
                # it seems that Karma use last column name, we need to recover full name
                # using the provided input first
                for cmd_input_col, input_attr_path in zip(
                        cmd_input_cols, karma_input_attr_paths):
                    if field == cmd_input_col[-1]:
                        field = input_attr_path
                        break
                else:
                    # otherwise construct from the start columns
                    full_field = field if cmd_input_parent_col == "" else (
                        cmd_input_parent_col + Schema.PATH_DELIMITER +
                        field)
                    field = col2col[full_field]
                cmd_code = cmd_code[:start] + f'"{field}"' + cmd_code[end:]
                input_attr_paths.append(field)

            default_error_value = default_error_value['value']
            commands.append(
                PyTransformNewColumnCmd(input_attr_paths, new_attr_name,
                                        cmd_code, default_error_value))
        elif command["commandName"] == "SetSemanticTypeCommand" or command[
                "commandName"] == "SetMetaPropertyCommand":
            cmd_input_col = command['inputParameters'][-2]
            if command["inputParameters"][-5][
                    'name'] == 'SemanticTypesArray':
                cmd_stype = command['inputParameters'][-5]
            else:
                cmd_stype = command['inputParameters'][-6]
            if cmd_stype['name'] == 'SemanticTypesArray':
                assert cmd_input_col['name'] == "inputColumns" and len(
                    cmd_stype['value']
                ) == 1 and cmd_stype['value'][0]['isPrimary']
                cmd_input_col = col2col[Schema.PATH_DELIMITER.join(
                    cname['columnName'] for cname in ujson.loads(
                        cmd_input_col['value'])[0]['value'])]
                cmd_stype = cmd_stype['value'][0]
                commands.append(
                    SetSemanticTypeCmd(
                        cmd_input_col,
                        domain=ont.simplify_uri(cmd_stype['DomainUri']),
                        type=ont.simplify_uri(cmd_stype['FullType']),
                        node_id=ont.simplify_uri(
                            cmd_stype['DomainId'].replace(" (add)", ""))))
            else:
                # SetMetaPropertyCommand: the column is the URI of a class instance
                cmd_stype_domain = command['inputParameters'][-7]
                cmd_stype_id = command['inputParameters'][-6]
                assert cmd_input_col['name'] == "inputColumns" and cmd_stype_domain['name'] == 'metaPropertyUri' \
                    and cmd_stype_id['name'] == 'metaPropertyId'
                cmd_input_col = col2col[Schema.PATH_DELIMITER.join(
                    cname['columnName'] for cname in ujson.loads(
                        cmd_input_col['value'])[0]['value'])]
                commands.append(
                    SetSemanticTypeCmd(
                        cmd_input_col,
                        domain=ont.simplify_uri(cmd_stype_domain['value']),
                        type="karma:classLink",
                        node_id=ont.simplify_uri(cmd_stype_id['value'])))
        elif command['commandName'] == 'UnassignSemanticTypeCommand':
            cmd_input_col = command['inputParameters'][-2]
            assert cmd_input_col['name'] == "inputColumns"
            cmd_input_col = col2col[Schema.PATH_DELIMITER.join(
                cname['columnName'] for cname in ujson.loads(
                    cmd_input_col['value'])[0]['value'])]
            # drop every previously-collected semantic type on that column
            delete_cmds = []
            for i, cmd in enumerate(commands):
                if isinstance(cmd, SetSemanticTypeCmd
                              ) and cmd.input_attr_path == cmd_input_col:
                    delete_cmds.append(i)
            for i in reversed(delete_cmds):
                commands.pop(i)
        elif command["commandName"] == "ChangeInternalNodeLinksCommand":
            cmd_edges = command['inputParameters'][-3]
            assert cmd_edges['name'] == 'newEdges'
            # cmd_initial_edges = command['inputParameters'][-4]
            # if cmd_initial_edges['name'] == 'initialEdges' and len(cmd_initial_edges['value']) > 0:
            #     delete_cmds = []
            #     for cmd_edge in cmd_initial_edges['value']:
            #         edge_lbl = ont.simplify_uri(cmd_edge['edgeId'])
            #         source_id = ont.simplify_uri(cmd_edge['edgeSourceId'])
            #
            #         if cmd_edge['edgeTargetId'] in literal_nodes:
            #             for i, cmd in enumerate(commands):
            #                 if isinstance(cmd, SetSemanticTypeCmd) and cmd.type == edge_lbl and cmd.node_id == source_id:
            #                     delete_cmds.append(i)
            #         else:
            #             target_id = ont.simplify_uri(cmd_edge['edgeTargetId'])
            #             for i, cmd in enumerate(commands):
            #                 if isinstance(cmd, SetInternalLinkCmd) and cmd.link_lbl == edge_lbl and cmd.target_id == target_id and cmd.source_id == source_id:
            #                     delete_cmds.append(i)
            #
            #     for idx in sorted(delete_cmds, reverse=True):
            #         commands.pop(idx)
            for cmd_edge in cmd_edges['value']:
                source_uri = cmd_edge.get('edgeSourceUri', None)
                target_uri = cmd_edge.get('edgeTargetUri', None)
                # keep the *Uri fields only when they differ from the node ids
                if source_uri is not None and source_uri != cmd_edge[
                        'edgeSourceId']:
                    source_uri = ont.simplify_uri(source_uri)
                else:
                    source_uri = None
                if target_uri is not None and target_uri != cmd_edge[
                        'edgeTargetId']:
                    target_uri = ont.simplify_uri(target_uri)
                else:
                    target_uri = None
                if cmd_edge['edgeTargetId'] in literal_nodes:
                    # convert this command to SetSemanticType
                    commands.append(
                        SetSemanticTypeCmd(
                            literal_nodes[cmd_edge['edgeTargetId']],
                            domain=ont.simplify_uri(source_uri),
                            type=ont.simplify_uri(cmd_edge['edgeId']),
                            node_id=ont.simplify_uri(
                                cmd_edge['edgeSourceId'])))
                else:
                    commands.append(
                        SetInternalLinkCmd(
                            ont.simplify_uri(cmd_edge['edgeSourceId']),
                            ont.simplify_uri(cmd_edge['edgeTargetId']),
                            ont.simplify_uri(cmd_edge['edgeId']),
                            source_uri, target_uri))
        elif command['commandName'] == "AddLinkCommand":
            cmd_edges = command['inputParameters'][-3]
            assert cmd_edges['name'] == 'edge'
            cmd_edge = cmd_edges['value']
            source_uri = cmd_edge.get('edgeSourceUri', None)
            target_uri = cmd_edge.get('edgeTargetUri', None)
            if source_uri is not None:
                source_uri = ont.simplify_uri(source_uri)
            else:
                source_uri = None
            if cmd_edge['edgeTargetId'] in literal_nodes:
                # convert this command to SetSemanticType
                commands.append(
                    SetSemanticTypeCmd(
                        literal_nodes[cmd_edge['edgeTargetId']],
                        domain=ont.simplify_uri(source_uri),
                        type=ont.simplify_uri(cmd_edge['edgeId']),
                        node_id=ont.simplify_uri(
                            cmd_edge['edgeSourceId'])))
            else:
                if target_uri is not None:
                    target_uri = ont.simplify_uri(target_uri)
                else:
                    target_uri = None
                commands.append(
                    SetInternalLinkCmd(
                        ont.simplify_uri(cmd_edge['edgeSourceId']),
                        ont.simplify_uri(cmd_edge['edgeTargetId']),
                        ont.simplify_uri(cmd_edge['edgeId']), source_uri,
                        target_uri))
        elif command['commandName'] == 'DeleteLinkCommand':
            cmd_edge = command['inputParameters'][-3]
            assert cmd_edge['name'] == 'edge'
            cmd_edge = cmd_edge['value']
            # remove the first matching internal link (if any)
            for i, cmd in enumerate(commands):
                if isinstance(cmd, SetInternalLinkCmd):
                    if cmd.source_id == cmd_edge[
                            'edgeSourceId'] and cmd.target_id == cmd_edge[
                                'edgeTargetId'] and cmd.link_lbl == ont.simplify_uri(
                                    cmd_edge['edgeId']):
                        commands.pop(i)
                        break
        elif command["commandName"] == "AddLiteralNodeCommand":
            cmd_literal_value = command["inputParameters"][0]
            assert cmd_literal_value['name'] == 'literalValue'
            cmd_literal_value = cmd_literal_value['value']
            # they may re-use literal_values, let's user fix it manually
            if cmd_literal_value.startswith("http"):
                new_attr_path = f"literal:{ont.simplify_uri(cmd_literal_value)}"
            else:
                new_attr_path = f"literal:{cmd_literal_value}"
            # support at most three re-uses of the same literal value
            if cmd_literal_value + "1" not in literal_nodes:
                new_attr_path += ":1"
                literal_nodes[cmd_literal_value + "1"] = new_attr_path
            elif cmd_literal_value + "2" not in literal_nodes:
                new_attr_path += ":2"
                literal_nodes[cmd_literal_value + "2"] = new_attr_path
            elif cmd_literal_value + "3" not in literal_nodes:
                new_attr_path += ":3"
                literal_nodes[cmd_literal_value + "3"] = new_attr_path
            else:
                assert False
            col2col[new_attr_path] = new_attr_path
            commands.append(
                AddLiteralColumnCmd(new_attr_path, cmd_literal_value))
        elif command["commandName"] == "OperateSelectionCommand":
            # no way to see it in the KARMA UI
            continue
        elif command["commandName"] == "OrganizeColumnsCommand":
            continue
        elif command["commandName"] == "SetWorksheetPropertiesCommand":
            # this command doesn't affect the model
            continue
        # elif command["commandName"] == "UnfoldCommand":
        #     cmd_input_col = command["inputParameters"][-2]
        #     cmd_output_col = command["inputParameters"][-1]
        #     assert cmd_input_col['name'] == "inputColumns" and cmd_output_col['name'] == 'outputColumns'
        #     cmd_input_cols = [
        #         [cname['columnName'] for cname in o['value']] for o in ujson.loads(cmd_input_col['value'])
        #     ]
        #     input_attr_paths = [col2col[Schema.PATH_DELIMITER.join(cmd_input_col)] for cmd_input_col in cmd_input_cols]
        #     cmd_output_cols = [
        #         [cname['columnName'] for cname in o['value']] for o in ujson.loads(cmd_output_col['value'])
        #     ]
        #
        #     output_attr_paths = []
        #     # update columns mapping
        #     for cmd_output_col in cmd_output_cols:
        #         attr_path = Schema.PATH_DELIMITER.join(cmd_output_col[start_idx:])
        #         col2col[Schema.PATH_DELIMITER.join(cmd_output_col)] = attr_path
        #         output_attr_paths.append(attr_path)
        #
        #     commands.append(UnrollCmd(input_attr_paths, output_attr_paths))
        # elif command["commandName"] == "GlueCommand":
        #     cmd_input_col = command["inputParameters"][-2]
        #     cmd_output_col = command["inputParameters"][-1]
        else:
            assert False, "Source: %s. Doesn't handle command %s" % (
                tbl.id, command["commandName"])

    # fixing conflict modeling command
    conflicts = defaultdict(lambda: [])
    for i, cmd in enumerate(commands):
        if isinstance(cmd, SetSemanticTypeCmd):
            conflicts[cmd.input_attr_path].append((i, cmd))
        if isinstance(cmd, SetInternalLinkCmd):
            conflicts[(cmd.source_id, cmd.target_id)].append((i, cmd))

    delete_commands = []
    for cmds in conflicts.values():
        if len(cmds) > 1:
            display_warn = False
            for idx, cmd in cmds[1:]:
                if cmd != cmds[0][1]:
                    if not display_warn:
                        display_warn = True
                        KR2RML.logger.warning(
                            "Table: %s. Conflict between command: \n\t+ %s \n\t+ %s",
                            tbl.id, cmds[0][1], cmd)
                    else:
                        print("\t+", cmd)
            # only keep final commands
            for idx, cmd in cmds[:-1]:
                delete_commands.append(idx)
            if isinstance(cmds[0][1], SetInternalLinkCmd):
                # need to update source_uri & target_uri first (for duplicate commands, source_uri, target_uri = None)
                key = (cmds[-1][1].source_id, cmds[-1][1].link_lbl,
                       cmds[-1][1].target_id)
                for idx, cmd in cmds[:-1]:
                    if (cmd.source_id, cmd.link_lbl,
                            cmd.target_id) == key:
                        cmds[-1][1].source_uri = cmd.source_uri
                        cmds[-1][1].target_uri = cmd.target_uri
                        break
    delete_commands.sort(reverse=True)
    for idx in delete_commands:
        commands.pop(idx)

    super().__init__(commands)
semantic_models = [] tables = [] for i, raw_tbl in enumerate(raw_tables): r2rml_file = mapping_dir / f"{raw_tbl.id}-model.yml" tbl, sm = R2RML.load_from_file(r2rml_file).apply_build(raw_tbl) semantic_models.append(sm) tables.append(tbl) serializeJSON(semantic_models, cache_file) _data_io_vars["data_tables"][dataset] = tables _data_io_vars["semantic_models"][dataset] = semantic_models return _data_io_vars["semantic_models"][dataset] if __name__ == '__main__': dataset = 'museum_crm' ont = Ontology.from_dataset(dataset) data_dir = Path(config.datasets[dataset].as_path()) (data_dir / "models-viz").mkdir(exist_ok=True, parents=True) (data_dir / "tables-viz").mkdir(exist_ok=True, parents=True) for sm in get_semantic_models(dataset): sm.graph.render2pdf(data_dir / f"models-viz/{sm.id}.pdf") for tbl in get_data_tables(dataset): with open(data_dir / "tables-viz" / f"{tbl.id}.txt", "wb") as f: f.write(tbl.to_string().encode("utf-8"))
def to_normalized_json_model(self, ont: Ontology = None) -> dict:
    """Dump the normalized/changed model back to karma model JSON format

    Few changes:
        + All id are converted from int to str so that it's compatible with
          source_id of link (str type due to split("---"))
        + LiteralNodes is converted to ColumnNode (we are going to treat
          LiteralNode as a column contains only one value) and a new column
          name will be generated for LiteralNodes

    :param ont: optional ontology to restore URI from simplified version
        (e.g: crm:E39_Actor) to full version
        (http://www.cidoc-crm.org/cidoc-crm/E39_Actor); when None, a
        UselessOntology (identity mapping) is used.
    :return: the karma-model JSON dict
    """
    nodes = []
    links = []
    if ont is None:
        ont = UselessOntology()

    # add literal nodes to source_columns
    source_columns = [{
        "id": str(col.id),
        "hNodeId": str(col.h_node_id),
        "columnName": col.column_name
    } for col in self.source_columns]
    count = len(self.source_columns)
    for node in self.karma_graph.iter_data_nodes():
        if node.is_literal_node:
            # synthesize a column name from the literal's label
            source_columns.append({
                "id":
                str(node.id),
                "hNodeId":
                str(node.id),
                "columnName":
                "A%d__literal_val_%s" %
                (count, node.label.decode('utf-8').lower().replace(
                    " ", "-"))
            })
            count += 1
    colid2name: Dict[int, str] = {
        int(col['id']): col["columnName"]
        for col in source_columns
    }

    for node in self.karma_graph.iter_nodes():
        onode = {
            "id": str(node.id),
            "modelIds": None,
            "type":
            "InternalNode" if node.is_class_node() else "ColumnNode",
            "label": {
                "uri": node.label.decode("utf-8")
            }
        }
        if node.is_data_node():
            onode["hNodeId"] = str(node.id)
            onode["columnName"] = colid2name[node.id]
            if node.literal_type is None:
                onode["rdfLiteralType"] = None
            else:
                onode["rdfLiteralType"] = {"uri": node.literal_type}
            if node.is_literal_node:
                # literal nodes get a single user semantic type derived from
                # their (assumed unique) incoming link
                parent_link = node.get_first_incoming_link()
                onode["userSemanticTypes"] = [{
                    "hNodeId":
                    str(node.id),
                    "domain": {
                        "uri":
                        ont.full_uri(
                            parent_link.get_source_node().label.decode(
                                "utf-8")),
                        "rdfsLabel":
                        None
                    },
                    "type": {
                        "uri":
                        ont.full_uri(parent_link.label.decode("utf-8")),
                        "rdfsLabel": None
                    },
                    "origin":
                    "User",
                    "confidenceScore":
                    1.0
                }]
                onode["learnedSemanticTypes"] = []
            else:
                onode["userSemanticTypes"] = [{
                    "hNodeId": str(node.id),
                    "domain": {
                        "uri": ont.full_uri(st.domain),
                        "rdfsLabel": None
                    },
                    "type": {
                        "uri": ont.full_uri(st.type),
                        "rdfsLabel": None
                    },
                    "origin": st.origin,
                    "confidenceScore": st.confidence_score
                } for st in node.user_semantic_types]
                onode["learnedSemanticTypes"] = [{
                    "hNodeId":
                    str(node.id),
                    "domain": {
                        "uri": ont.full_uri(st.domain),
                        "rdfsLabel": None
                    },
                    "type": {
                        "uri": ont.full_uri(st.type),
                        "rdfsLabel": None
                    },
                    "origin":
                    "AutoModel",
                    "confidenceScore":
                    st.confidence_score
                } for st in node.learned_semantic_types]
        else:
            onode["label"]["uri"] = ont.full_uri(onode["label"]["uri"])
        nodes.append(onode)

    for link in self.karma_graph.iter_links():
        if link.type == GraphLinkType.OBJECT_PROPERTY:
            link_type = 'ObjectPropertyLink'
        elif link.type == GraphLinkType.DATA_PROPERTY:
            link_type = 'DataPropertyLink'
        elif link.label == 'karma:dev':
            # NOTE(review): link.label is decoded as bytes elsewhere in this
            # method, so comparing it to the str 'karma:dev' can never be
            # True in Python 3 — likely should be b'...' (and possibly
            # 'karma:classLink'); confirm intended value.
            link_type = 'ClassInstanceLink'
        elif link.get_target_node().is_data_node():
            link_type = "DataPropertyLink"
        elif link.get_target_node().is_class_node():
            link_type = "ObjectPropertyLink"
        # NOTE(review): if none of the branches above match, link_type is
        # unbound here and raises UnboundLocalError — verify all link targets
        # are data or class nodes.
        olink = {
            "id":
            "%s---%s---%s" % (link.source_id, link.label.decode("utf-8"),
                              link.target_id),
            "weight":
            None,
            "type":
            link_type,
            "label": {
                "uri": ont.full_uri(link.label.decode("utf-8"))
            },
            "objectPropertyType":
            "Indirect",
            "status":
            "Normal",
            "keyInfo":
            "None",
            "modelIds":
            None
        }
        links.append(olink)

    model_json = {
        "id":
        self.id,
        "name":
        self.id,
        "description":
        self.description,
        "sourceColumns":
        source_columns,
        "mappingToSourceColumns": [{
            "id": col["id"],
            "sourceColumnId": col["id"]
        } for col in source_columns],
        "graph": {
            "nodes": nodes,
            "links": links
        }
    }
    return model_json