def add_ont_paths(graph: IntGraph, ont: Ontology, ont_graph: OntGraph) -> None:
    """Add to `graph` the links that the ontology allows between its class nodes.

    For every ordered pair of distinct class nodes (source, target) in `graph`,
    the possible predicates between their ontology classes are fetched from
    `ont_graph`. For each predicate that does not already link source to
    target, a new UNSPECIFIED link tagged with `Tag.ONT_GRAPH_SOURCE` is added.

    :param graph: graph whose class nodes are paired; modified in place
    :param ont: used to convert between simplified and full URIs
    :param ont_graph: supplies the ontology nodes and the possible predicates
    """
    for source in graph.iter_class_nodes():
        for target in graph.iter_class_nodes():
            if source == target:
                continue

            # first ontology node carrying the same label as each class node
            source_ont_node = next(ont_graph.iter_nodes_by_label(source.label))
            target_ont_node = next(ont_graph.iter_nodes_by_label(target.label))

            source_uri = ont.full_uri(source_ont_node.label.decode('utf-8'))
            target_uri = ont.full_uri(target_ont_node.label.decode('utf-8'))

            for predicate in ont_graph.get_possible_predicates(source_uri, target_uri):
                link_label = ont.simplify_uri(predicate.uri).encode('utf-8')

                already_linked = any(
                    link.source_id == source.id and link.label == link_label
                    for link in target.iter_incoming_links())
                if already_linked:
                    continue

                new_link = IntGraphLink({Tag.ONT_GRAPH_SOURCE})
                graph.real_add_new_link(new_link, GraphLinkType.UNSPECIFIED,
                                        link_label, source.id, target.id)
def to_kr2rml(self, ont: Ontology, tbl: DataTable, fpath: Union[str, Path]):
    """Write this model to `fpath` as a Karma KR2RML mapping serialized in N3.

    The mapping records the input/output columns derived from the schema of
    `tbl` and a worksheet history built from `self.commands`: python
    transformations first, then semantic types (sorted by node id), then
    internal links (sorted by target/source URI).

    :param ont: used to convert between simplified and full URIs
    :param tbl: data table providing the source id and the schema
    :param fpath: path of the file the N3 serialization is written to
    :raises AssertionError: if a `getValue(...)` field in a transformation is
        not one of the command's input attribute paths, or if an internal link
        introduces a new node without a source/target URI
    """
    g = RDFGraph()
    km_dev = Namespace("http://isi.edu/integration/karma/dev#")
    g.namespace_manager.bind("km-dev", km_dev)
    kr2rml = BNode()

    g.add((kr2rml, RDF.type, km_dev.R2RMLMapping))
    g.add((kr2rml, km_dev.sourceName, Literal(tbl.id)))
    # timestamp and version, doesn't need to be precise
    g.add((kr2rml, km_dev.modelPublicationTime, Literal(1414133381264)))
    g.add((kr2rml, km_dev.modelVersion, Literal("1.7")))

    input_columns = []
    output_columns = []
    # mapping from Schema attribute path OR Command to KarmaColumns
    attr2hnodes: Dict[Union[str, PyTransformNewColumnCmd], List[Dict[str, str]]] = {}

    for attr_path in tbl.schema.get_attr_paths():
        input_columns.append([{
            "columnName": x
        } for x in attr_path.split(Schema.PATH_DELIMITER)])
        if tbl.schema.get_attr_type(attr_path) == Schema.LIST_VALUE:
            # default karma behaviour, you cannot set semantic type for higher level, but only "values"
            input_columns[-1].append({"columnName": "values"})
        output_columns.append(input_columns[-1])
        attr2hnodes[attr_path] = input_columns[-1]

    for cmd in self.commands:
        if isinstance(cmd, PyTransformNewColumnCmd):
            new_attr_path = cmd.input_attr_paths[0].split(Schema.PATH_DELIMITER)[:-1]
            new_attr_path.append(cmd.new_attr_name)
            new_attr_path = Schema.PATH_DELIMITER.join(new_attr_path)

            # when you create a new column from a list, karma convert to a list of objects
            # e.g: birth_death_date.values, create col death date from that,
            # Karma create => birth_death_date.death_date
            # that's why we have this code below
            new_hnode = attr2hnodes[cmd.input_attr_paths[0]][:-1]
            new_hnode.append({"columnName": cmd.new_attr_name})

            output_columns.append(new_hnode)
            attr2hnodes[cmd] = output_columns[-1]
            attr2hnodes[new_attr_path] = output_columns[-1]

    worksheet_history = []
    # re-arrange commands to fit the issue of node id = Concept2 (Karma will convert Concept2 to Concept1)
    commands = [cmd for cmd in self.commands if isinstance(cmd, PyTransformNewColumnCmd)]
    commands.extend(
        sorted([c for c in self.commands if isinstance(c, SetSemanticTypeCmd)],
               key=lambda c: c.node_id))
    commands.extend(
        sorted([c for c in self.commands if isinstance(c, SetInternalLinkCmd)],
               key=lambda c: c.target_uri or c.source_uri or ""))

    # sometime the model use incorrect node id like: node id = Concept7 (no Concept1..6),
    # which results in an error in Karma, so the node ids need to be re-numbered
    node_id_old2new: Dict[str, str] = {}
    node_id_domain_count: Dict[str, int] = {}
    # old node id => domain URI it was numbered under; kept explicitly because the
    # domain cannot be recovered from the new id once the counter has several digits
    node_id_domain: Dict[str, str] = {}

    def register_node(old_id: str, domain: str) -> None:
        # assign the next free "<domain><n>" id to a node seen for the first time
        node_id_domain_count[domain] = node_id_domain_count.get(domain, 0) + 1
        node_id_old2new[old_id] = f"{domain}{node_id_domain_count[domain]}"
        node_id_domain[old_id] = domain

    for cmd in commands:
        if isinstance(cmd, SetSemanticTypeCmd):
            if cmd.node_id not in node_id_old2new:
                register_node(cmd.node_id, cmd.domain)
        elif isinstance(cmd, SetInternalLinkCmd):
            if cmd.source_id not in node_id_old2new:
                if cmd.source_uri is None:
                    raise AssertionError(
                        f"Missing source_uri for new node {cmd.source_id}")
                register_node(cmd.source_id, cmd.source_uri)
            if cmd.target_id not in node_id_old2new:
                if cmd.target_uri is None:
                    raise AssertionError(
                        f"Missing target_uri for new node {cmd.target_id}")
                register_node(cmd.target_id, cmd.target_uri)

    for cmd in commands:
        if isinstance(cmd, PyTransformNewColumnCmd):
            pytransform_code = cmd.code
            # recover pytransform_code from our code
            pytransform_code = pytransform_code.replace("__return__ = ", "return ")
            # go through the matches backward so earlier spans stay valid while rewriting
            for match in reversed(
                    list(re.finditer(r"getValue\(([^)]+)\)", pytransform_code))):
                start, end = match.span(1)
                field = pytransform_code[start:end].replace("'", "").replace(
                    '"""', "").replace('"', '')
                # convert full name to last column name since Karma use last column name instead
                for input_attr_path in cmd.input_attr_paths:
                    if input_attr_path == field:
                        # TODO: will Karma always use last column name?
                        field = attr2hnodes[input_attr_path][-1]['columnName']
                        break
                else:
                    # explicit raise (instead of `assert False`) so the check survives `python -O`
                    raise AssertionError(
                        f"Cannot find any field {field} in the input columns")
                pytransform_code = pytransform_code[:start] + f'"{field}"' + pytransform_code[end:]

            worksheet_history.append({
                "tags": ["Transformation"],
                "commandName": "SubmitPythonTransformationCommand",
                "inputParameters": [{
                    "name": "hNodeId",
                    "value": attr2hnodes[cmd.input_attr_paths[0]],
                    "type": "hNodeId"
                }, {
                    "name": "worksheetId",
                    "value": "W",
                    "type": "worksheetId"
                }, {
                    "name": "selectionName",
                    "value": "DEFAULT_TEST",
                    "type": "other"
                }, {
                    "name": "newColumnName",
                    "value": cmd.new_attr_name,
                    "type": "other"
                }, {
                    "name": "transformationCode",
                    "value": pytransform_code,
                    "type": "other"
                }, {
                    "name": "errorDefaultValue",
                    "value": cmd.default_error_value,
                    "type": "other"
                }, {
                    "name": "inputColumns",
                    "type": "hNodeIdList",
                    "value": ujson.dumps([{
                        "value": attr2hnodes[iap]
                    } for iap in cmd.input_attr_paths])
                }, {
                    "name": "outputColumns",
                    "type": "hNodeIdList",
                    "value": ujson.dumps([{
                        "value": attr2hnodes[cmd]
                        if attr2hnodes[cmd][-1]['columnName'] != "values" else attr2hnodes[cmd][:-1]
                    }])
                }]
            })
        elif isinstance(cmd, SetSemanticTypeCmd):
            hnode = attr2hnodes[cmd.input_attr_path]
            new_node_id = node_id_old2new[cmd.node_id]
            columns_json = ujson.dumps([{"value": hnode}])

            if cmd.type != "karma:classLink":
                command_name = "SetSemanticTypeCommand"
                meta_params = []
            else:
                # a class link is emitted as a meta property command,
                # which carries three extra parameters
                command_name = "SetMetaPropertyCommand"
                meta_params = [{
                    "name": "metaPropertyName",
                    "value": "isUriOfClass",
                    "type": "other"
                }, {
                    "name": "metaPropertyUri",
                    "value": ont.full_uri(cmd.domain),
                    "type": "other"
                }, {
                    "name": "metaPropertyId",
                    "value": ont.full_uri(new_node_id),
                    "type": "other"
                }]

            worksheet_history.append({
                "commandName": command_name,
                "tags": ["Modeling"],
                "inputParameters": [
                    {
                        "name": "hNodeId",
                        "value": hnode,
                        "type": "hNodeId"
                    },
                    {
                        "name": "worksheetId",
                        "value": "W",
                        "type": "worksheetId"
                    },
                    {
                        "name": "selectionName",
                        "value": "DEFAULT_TEST",
                        "type": "other"
                    },
                    *meta_params,
                    {
                        "name": "SemanticTypesArray",
                        "type": "other",
                        "value": [{
                            "FullType": ont.full_uri(cmd.type),
                            "isPrimary": True,
                            "DomainLabel": ont.simplify_uri(new_node_id),
                            "DomainId": ont.full_uri(new_node_id),
                            "DomainUri": ont.full_uri(cmd.domain)
                        }]
                    },
                    {
                        "name": "trainAndShowUpdates",
                        "value": False,
                        "type": "other"
                    },
                    {
                        "name": "rdfLiteralType",
                        "value": "",
                        "type": "other"
                    },  # TODO: update correct RDF-Literal-Type
                    {
                        "name": "inputColumns",
                        "type": "hNodeIdList",
                        "value": columns_json
                    },
                    {
                        "name": "outputColumns",
                        "type": "hNodeIdList",
                        "value": columns_json
                    }
                ]
            })
        elif isinstance(cmd, SetInternalLinkCmd):
            # TODO: comment out because old KARMA doesn't recognize this!
            # if cmd.target_uri is not None or cmd.source_uri is not None:
            #     worksheet_history.append({
            #         "commandName": "AddLinkCommand",
            #         "tags": ["Modeling"],
            #         "inputParameters": [
            #             {"name": "worksheetId", "value": "W", "type": "worksheetId"},
            #             {
            #                 "name": "edge",
            #                 "type": "other",
            #                 "value": {
            #                     "edgeId": ont.full_uri(cmd.link_lbl),
            #                     "edgeTargetId": ont.full_uri(node_id_old2new[cmd.target_id]),
            #                     "edgeTargetUri": ont.full_uri(cmd.target_uri or cmd.target_id[:-1]),
            #                     "edgeSourceId": ont.full_uri(node_id_old2new[cmd.source_id]),
            #                     "edgeSourceUri": ont.full_uri(cmd.source_uri or cmd.source_id[:-1])
            #                 }
            #             },
            #             {"name": "inputColumns", "type": "hNodeIdList", "value": []},
            #             {"name": "outputColumns", "type": "hNodeIdList", "value": []}
            #         ]
            #     })
            # else:
            worksheet_history.append({
                "commandName": "ChangeInternalNodeLinksCommand",
                "tags": ["Modeling"],
                "inputParameters": [{
                    "name": "worksheetId",
                    "value": "W",
                    "type": "worksheetId"
                }, {
                    "name": "initialEdges",
                    "type": "other",
                    "value": [{
                        "edgeId": ont.full_uri(cmd.link_lbl),
                        "edgeTargetId": ont.full_uri(node_id_old2new[cmd.target_id]),
                        "edgeSourceId": ont.full_uri(node_id_old2new[cmd.source_id])
                    }]
                }, {
                    "name": "newEdges",
                    "type": "other",
                    "value": [{
                        "edgeId": ont.full_uri(cmd.link_lbl),
                        "edgeTargetId": ont.full_uri(node_id_old2new[cmd.target_id]),
                        "edgeSourceId": ont.full_uri(node_id_old2new[cmd.source_id]),
                        # fall back to the domain the node was numbered under when
                        # the command itself carries no URI
                        "edgeTargetUri": ont.full_uri(
                            cmd.target_uri or node_id_domain[cmd.target_id]),
                        "edgeSourceUri": ont.full_uri(
                            cmd.source_uri or node_id_domain[cmd.source_id])
                    }]
                }, {
                    "name": "inputColumns",
                    "type": "hNodeIdList",
                    "value": []
                }, {
                    "name": "outputColumns",
                    "type": "hNodeIdList",
                    "value": []
                }]
            })

    g.add((kr2rml, km_dev.hasInputColumns, Literal(ujson.dumps(input_columns))))
    g.add((kr2rml, km_dev.hasOutputColumns, Literal(ujson.dumps(output_columns))))
    g.add((kr2rml, km_dev.hasModelLabel, Literal(tbl.id)))
    g.add((kr2rml, km_dev.hasBaseURI, Literal("http://localhost:8080/source/")))
    g.add((kr2rml, km_dev.hasWorksheetHistory,
           Literal(ujson.dumps(worksheet_history, indent=4))))

    g.serialize(str(fpath), format='n3')
def to_normalized_json_model(self, ont: Ontology = None) -> dict:
    """Dump the normalized/changed model back to the Karma model JSON format.

    Few changes compared to the original model:
        + All ids are converted from int to str so that they are compatible
          with the source id of a link (str, because link ids are built as
          "source---label---target").
        + Literal nodes are converted to column nodes (a literal node is
          treated as a column that contains only one value) and a new column
          name is generated for each of them.

    :param ont: optional ontology used to restore full URIs from their
        simplified version (e.g: crm:E39_Actor to
        http://www.cidoc-crm.org/cidoc-crm/E39_Actor). When omitted, a
        `UselessOntology` is used instead.
    :return: dict with the keys "id", "name", "description", "sourceColumns",
        "mappingToSourceColumns" and "graph" (holding "nodes" and "links")
    :raises ValueError: if the type of a link cannot be determined from its
        type, its label or its target node
    """
    nodes = []
    links = []

    if ont is None:
        ont = UselessOntology()

    # add literal nodes to source_columns
    source_columns = [{
        "id": str(col.id),
        "hNodeId": str(col.h_node_id),
        "columnName": col.column_name
    } for col in self.source_columns]
    count = len(self.source_columns)
    for node in self.karma_graph.iter_data_nodes():
        if node.is_literal_node:
            source_columns.append({
                "id": str(node.id),
                "hNodeId": str(node.id),
                "columnName": "A%d__literal_val_%s" %
                (count, node.label.decode('utf-8').lower().replace(" ", "-"))
            })
            count += 1

    colid2name: Dict[int, str] = {
        int(col['id']): col["columnName"]
        for col in source_columns
    }

    for node in self.karma_graph.iter_nodes():
        onode = {
            "id": str(node.id),
            "modelIds": None,
            "type": "InternalNode" if node.is_class_node() else "ColumnNode",
            "label": {
                "uri": node.label.decode("utf-8")
            }
        }

        if node.is_data_node():
            onode["hNodeId"] = str(node.id)
            onode["columnName"] = colid2name[node.id]
            if node.literal_type is None:
                onode["rdfLiteralType"] = None
            else:
                onode["rdfLiteralType"] = {"uri": node.literal_type}

            if node.is_literal_node:
                # a literal node has no semantic types of its own: build one
                # from its first incoming link and that link's source node
                parent_link = node.get_first_incoming_link()
                onode["userSemanticTypes"] = [{
                    "hNodeId": str(node.id),
                    "domain": {
                        "uri": ont.full_uri(
                            parent_link.get_source_node().label.decode("utf-8")),
                        "rdfsLabel": None
                    },
                    "type": {
                        "uri": ont.full_uri(parent_link.label.decode("utf-8")),
                        "rdfsLabel": None
                    },
                    "origin": "User",
                    "confidenceScore": 1.0
                }]
                onode["learnedSemanticTypes"] = []
            else:
                onode["userSemanticTypes"] = [{
                    "hNodeId": str(node.id),
                    "domain": {
                        "uri": ont.full_uri(st.domain),
                        "rdfsLabel": None
                    },
                    "type": {
                        "uri": ont.full_uri(st.type),
                        "rdfsLabel": None
                    },
                    "origin": st.origin,
                    "confidenceScore": st.confidence_score
                } for st in node.user_semantic_types]
                onode["learnedSemanticTypes"] = [{
                    "hNodeId": str(node.id),
                    "domain": {
                        "uri": ont.full_uri(st.domain),
                        "rdfsLabel": None
                    },
                    "type": {
                        "uri": ont.full_uri(st.type),
                        "rdfsLabel": None
                    },
                    "origin": "AutoModel",
                    "confidenceScore": st.confidence_score
                } for st in node.learned_semantic_types]
        else:
            onode["label"]["uri"] = ont.full_uri(onode["label"]["uri"])

        nodes.append(onode)

    for link in self.karma_graph.iter_links():
        # labels are stored as bytes; decode once and compare as text,
        # because a bytes label never equals a str literal
        link_label = link.label.decode("utf-8")

        if link.type == GraphLinkType.OBJECT_PROPERTY:
            link_type = 'ObjectPropertyLink'
        elif link.type == GraphLinkType.DATA_PROPERTY:
            link_type = 'DataPropertyLink'
        elif link_label == 'karma:dev':
            link_type = 'ClassInstanceLink'
        elif link.get_target_node().is_data_node():
            link_type = "DataPropertyLink"
        elif link.get_target_node().is_class_node():
            link_type = "ObjectPropertyLink"
        else:
            # fail loudly rather than reuse the type of the previous link
            raise ValueError(
                "Cannot determine the type of link %s---%s---%s" %
                (link.source_id, link_label, link.target_id))

        olink = {
            "id": "%s---%s---%s" % (link.source_id, link_label, link.target_id),
            "weight": None,
            "type": link_type,
            "label": {
                "uri": ont.full_uri(link_label)
            },
            "objectPropertyType": "Indirect",
            "status": "Normal",
            "keyInfo": "None",
            "modelIds": None
        }
        links.append(olink)

    model_json = {
        "id": self.id,
        "name": self.id,
        "description": self.description,
        "sourceColumns": source_columns,
        "mappingToSourceColumns": [{
            "id": col["id"],
            "sourceColumnId": col["id"]
        } for col in source_columns],
        "graph": {
            "nodes": nodes,
            "links": links
        }
    }
    return model_json