def __init__(self, dataset: str, dir: Path) -> None:
    self.ont = get_ontology(dataset)
    self.sms = get_semantic_models(dataset)
    self.sm_prefix_index = {sm.id[:3]: sm for sm in self.sms}
    self.sm_attr2stypes: Dict[str, Dict[str, List[SemanticType]]] = {}
    assert len(self.sm_prefix_index) == len(self.sms), "No duplicated prefix"

    # collect the class URIs and predicates used in the gold semantic models, so that the
    # domains/predicates reported by Serene can be recovered to the URIs used in the models
    class_uris = set()
    predicates = set()
    for sm in self.sms:
        for n in sm.graph.iter_data_nodes():
            e = n.get_first_incoming_link()
            class_uri = e.get_source_node().label.decode()
            predicate = e.label.decode()
            class_uris.add(class_uri)
            predicates.add(predicate)

    # read Serene's predicted semantic types (one *.df.csv file per source) and normalize them
    for file in dir.iterdir():
        if file.name.endswith(".df.csv"):
            prefix = file.name[:3]
            self.sm_attr2stypes[prefix] = self.read_serene_stypes(file)
            for attr_lbl, stypes in self.sm_attr2stypes[prefix].items():
                for stype in stypes:
                    stype.domain = self.recover_class_uris(stype.domain, class_uris)
                    stype.type = self.recover_predicates(stype.type, predicates)
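
# Hedged usage sketch (not part of the original source). The owning class is not shown in
# this snippet, so `SereneStypeReader` is a hypothetical stand-in name and the directory
# path is illustrative only. The constructor takes the dataset name and a directory holding
# Serene's per-source *.df.csv prediction files, and indexes the recovered semantic types
# by the 3-character source prefix:
#
#   reader = SereneStypeReader("museum_crm", Path("/workspace/tmp/serene-output"))
#   stypes_of_s01 = reader.sm_attr2stypes["s01"]  # Dict[attr label, List[SemanticType]]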
def __init__(self,
             dataset: str,
             max_n_records: float = float('inf'),
             is_sampling: bool = False,
             exec_dir: Optional[Union[Path, str]] = None) -> None:
    self.dataset: str = dataset
    self.ont: Ontology = get_ontology(dataset)
    self.max_n_records: float = max_n_records
    self.is_sampling: bool = is_sampling
    assert not is_sampling, "Not implemented"

    # ids of the data sources in this dataset (one file per source, ignoring hidden files)
    self.source_ids: Set[str] = {
        file.stem
        for file in Path(config.datasets[dataset].data.as_path()).iterdir()
        if file.is_file() and not file.name.startswith(".")
    }

    if exec_dir is None:
        exec_dir = Path(config.fsys.debug.as_path()) / dataset / "minhptx_iswc2016"
    self.exec_dir: Path = Path(exec_dir)
    self.meta_file: Path = self.exec_dir / "execution-meta.json"
    self.input_dir: Path = self.exec_dir / "input"
    self.input_dir.mkdir(parents=True, exist_ok=True)
    self.output_dir: Path = self.exec_dir / "output"
    self.output_dir.mkdir(parents=True, exist_ok=True)
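
# Hedged usage sketch (not part of the original source). The owning class is not shown in
# this snippet, so `MinhptxISWC2016` is a hypothetical stand-in name. By default the
# executor works under <debug dir>/<dataset>/minhptx_iswc2016 with input/ and output/
# subfolders and an execution-meta.json file:
#
#   labeler = MinhptxISWC2016("museum_edm")
#   print(labeler.input_dir, labeler.output_dir, labeler.meta_file)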
def __init__(self,
             dataset: str,
             use_correct_type: bool,
             use_old_semantic_typer: bool,
             train_sm_ids: List[str],
             exec_dir: Optional[Union[str, Path]] = None,
             sm_type_dir: Optional[Union[str, Path]] = None):
    self.dataset: str = dataset
    self.train_sm_ids = train_sm_ids
    self.ont = get_ontology(dataset)
    self.karma_models: Dict[str, KarmaModel] = {km.id: km for km in get_karma_models(dataset)}

    # modeling can only run once; invoking it again will raise an error
    self.__has_run_modeling = False

    if exec_dir is None:
        exec_dir = get_cache_dir(dataset, train_sm_ids) / "mohsen_jws2015"
    self.exec_dir: Path = Path(exec_dir)
    self.sm_type_dir = sm_type_dir

    # parameters for Mohsen's algorithm
    self.use_old_semantic_typer = use_old_semantic_typer
    self.use_correct_type = use_correct_type
    assert Settings.get_instance().semantic_labeling_top_n_stypes <= 4
    self.num_candidate_semantic_type = 4
    self.multiple_same_property_per_node = True
    self.coherence = 1.0
    self.confidence = 1.0
    self.size_reduction = 0.5
    self.num_candidate_mappings = 50
    self.mapping_branching_factor = 50
    self.topk_steiner_tree = 10
    # take everything, i.e. effectively no cut-off
    self.cut_off = int(1e6)
    self.our_and_karma_sm_alignments = {}
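
# Hedged usage sketch (not part of the original source). This constructor appears to belong
# to MohsenSemanticModeling, matching how run_evaluation_workflow builds it elsewhere in the
# repo; the dataset name and train ids below are illustrative only:
#
#   modeler = MohsenSemanticModeling(
#       dataset="museum_edm",
#       use_correct_type=False,
#       use_old_semantic_typer=False,
#       train_sm_ids=[sm.id for sm in train_sms])
#   pred_sms = modeler.sm_prediction(train_sms, test_sms)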
def generate_candidate_sm(dataset: str, test_sm: SemanticModel, stat: Statistic, model_bundle, train_source_ids):
    # generate candidates
    ont = get_ontology(dataset)
    ont_graph = get_ont_graph(dataset)
    settings = Settings.get_instance()

    dnodes: Dict[bytes, List[KarmaSemanticType]] = {
        attr.label.encode('utf-8'): attr.semantic_types
        for attr in test_sm.attrs
    }
    ota = EmpiricalTripleAdviser(ont_graph, ont, stat.p_triple,
                                 settings.searching_triple_adviser_max_candidate)
    graph_explorer_builder = GraphExplorerBuilder(
        ota,
        max_data_node_hop=settings.searching_max_data_node_hop,
        max_class_node_hop=settings.searching_max_class_node_hop)
    for attr, semantic_types in dnodes.items():
        ota.add_data_node(attr, semantic_types)

    model = Model(*model_bundle)
    args = PGMBeamSearchArgs(
        test_sm.id,
        custom_search_discovery,
        Tracker(track_search_nodes=False),
        partial(model.predict_sm_probs, test_sm.id, train_source_ids),
        graph_explorer_builder,
        early_terminate_func=None,
        beam_width=settings.training_beam_width,
        gold_sm=test_sm.graph,
        source_attributes=test_sm.attrs,
        pre_filter_func=filter_unlikely_graph,
    )
    started_node = PGMStartSearchNode(args.get_and_increment_id(), args,
                                      [a.label.encode('utf-8') for a in test_sm.attrs])
    args._tmp_random_state = numpy.random.RandomState(Settings.get_instance().random_seed)

    results: List[PGMSearchNode] = beam_search(
        [started_node],
        beam_width=settings.training_beam_width,
        n_results=settings.searching_n_explore_result,
        args=args)

    # collect unique candidate graphs from the discovery tracker and the final beam results
    candidate_sms = {}
    for search_node in args._tmp_tracker_for_storing_search_discovery_nodes:
        g = search_node.get_value().graph
        candidate_sms[graph_to_hashable_string(g)] = g
    for search_node in results:
        g = search_node.get_value().graph
        candidate_sms[graph_to_hashable_string(g)] = g

    return candidate_sms
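
# Hedged usage sketch (not part of the original source): one way this candidate generator
# might be driven for a single test source. It assumes Statistic.get_instance builds the
# statistics from the training models (as done elsewhere in the repo) and that model_bundle
# comes from a previously trained model; the helper name is hypothetical.
def _example_generate_candidates(dataset: str, train_sms, test_sm, model_bundle):
    stat = Statistic.get_instance(train_sms)
    train_source_ids = {sm.id for sm in train_sms}
    # returns {hashable string of candidate graph -> candidate graph}
    return generate_candidate_sm(dataset, test_sm, stat, model_bundle, train_source_ids)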
def get_stype_assistant_model(dataset: str, train_sms: List[SemanticModel]):
    global _instance
    if _instance is None:
        cache_file = get_cache_dir(dataset, train_sms) / "weak_models" / "stype_assistant.pkl"
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        need_rebuilt = True

        if cache_file.exists():
            SemanticTypeAssistant.logger.debug("Try to load previous run...")
            model, cache_dataset, cache_train_sm_ids = deserialize(cache_file)
            if cache_dataset == dataset and cache_train_sm_ids == {sm.id for sm in train_sms}:
                need_rebuilt = False
                # the triple adviser is detached before serialization, so rebuild and re-attach it
                ont_graph = get_ont_graph(dataset)
                ont = get_ontology(dataset)
                stat = Statistic.get_instance(train_sms)
                ota = EmpiricalTripleAdviser(ont_graph, ont, stat.p_triple, 15)
                model.triple_adviser = ota

        if need_rebuilt:
            ont_graph = get_ont_graph(dataset)
            ont = get_ontology(dataset)
            stat = Statistic.get_instance(train_sms)
            ota = EmpiricalTripleAdviser(ont_graph, ont, stat.p_triple, 15)

            typer = SemanticTyper.get_instance(dataset, train_sms)
            try:
                typer.load_model()
            except Exception:
                # fall back to running semantic labeling when no pre-trained model can be loaded
                sms = get_semantic_models(dataset)
                train_ids = {sm.id for sm in train_sms}
                typer.semantic_labeling(train_sms, [sm for sm in sms if sm.id not in train_ids], 4)

            model = SemanticTypeAssistant(train_sms, typer, ota)
            # detach the triple adviser while pickling, then re-attach it
            model.triple_adviser = None
            serialize((model, dataset, {sm.id for sm in train_sms}), cache_file)
            model.triple_adviser = ota

        _instance = model
    return _instance
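
# Hedged usage sketch (not part of the original source): the assistant is cached per
# (dataset, training set), so repeated calls with the same arguments reuse the pickled
# model. The train split and helper name below are assumptions for illustration.
def _example_get_stype_assistant(dataset: str = "museum_edm"):
    train_sms = get_semantic_models(dataset)[:14]
    return get_stype_assistant_model(dataset, train_sms)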
def serialize_ont_graph(dataset: str):
    def rdf_type_to_rust_str(rdf_type: PredicateType):
        if rdf_type == PredicateType.OWL_DATA_PROP:
            return "OwlDataProp"
        if rdf_type == PredicateType.OWL_OBJECT_PROP:
            return "OwlObjectProp"
        if rdf_type == PredicateType.OWL_ANNOTATION_PROP:
            return "OwlAnnotationProp"
        if rdf_type == PredicateType.RDF_PROP:
            return "RdfProp"

    ont = get_ontology(dataset)
    ont_graph = get_ont_graph(dataset)

    return {
        "predicates": [{
            "uri": ont.simplify_uri(predicate.uri),
            "domains": [ont.simplify_uri(uri) for uri in predicate.domains],
            "ranges": [ont.simplify_uri(uri) for uri in predicate.ranges],
            "rdf_type": rdf_type_to_rust_str(predicate.rdf_type),
            "is_rdf_type_reliable": predicate.is_rdf_type_reliable
        } for predicate in ont_graph.predicates],
        "class_uris": {
            ont.simplify_uri(node.uri): {
                "uri": ont.simplify_uri(node.uri),
                "parents_uris": [ont.simplify_uri(uri) for uri in node.parents_uris],
                "children_uris": [ont.simplify_uri(uri) for uri in node.children_uris],
            }
            for node in ont_graph.iter_nodes()
        }
    }
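
# Hedged usage sketch (not part of the original source): the returned dict contains only
# plain strings, lists, and dicts, so it can be dumped straight to JSON for the Rust side
# to read. The output handling and helper name below are assumptions for illustration.
def _example_dump_ont_graph(dataset: str, out_file) -> None:
    import ujson
    from pathlib import Path

    Path(out_file).write_text(ujson.dumps(serialize_ont_graph(dataset)))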
    ssd = ssds[0]
    # ssd.graph.render()
    result = smodel_eval.f1_precision_recall(gold_graph, ssd.graph, DataNodeMode.NO_TOUCH)
    eval_results[chuffed_idx]['precision'] = result['precision']
    eval_results[chuffed_idx]['recall'] = result['recall']
    eval_results[chuffed_idx]['f1'] = result['f1']

    return eval_results


if __name__ == '__main__':
    dataset = "museum_crm"
    sms = get_semantic_models(dataset)
    sms_index = {sm.id[:3]: sm for sm in sms}
    ont = get_ontology(dataset)
    ont.register_namespace("serene", "http://au.csiro.data61/serene/dev#")

    # get serene output by sms
    kfold_results = []
    stype = "ReImplMinhISWC_False_pat"
    for kfold in ["kfold-s01-s14", "kfold-s15-s28", "kfold-s08-s21"]:
        kfold_sms_prefix = {
            sm[:3]
            for sm in get_sm_ids_by_name_range(
                *kfold.replace("kfold-", "").split("-"), [sm.id for sm in sms])
        }
        print("==== KFOLD:", kfold, "====")
        serene_output_dir = Path(
            "/workspace/tmp/serene-python-client/datasets/%s/" %
def generate_candidate_sm(dataset: str, test_sm: SemanticModel, stat: Statistic, model_bundle, train_source_ids):
    # generate candidates
    ont = get_ontology(dataset)
    ont_graph = get_ont_graph(dataset)
    settings = Settings.get_instance()

    dnodes: Dict[bytes, List[KarmaSemanticType]] = {
        attr.label.encode('utf-8'): attr.semantic_types
        for attr in test_sm.attrs
    }
    ota = EmpiricalTripleAdviser(ont_graph, ont, stat.p_triple,
                                 settings.searching_triple_adviser_max_candidate)
    graph_explorer_builder = GraphExplorerBuilder(
        ota,
        max_data_node_hop=settings.searching_max_data_node_hop,
        max_class_node_hop=settings.searching_max_class_node_hop)
    for attr, semantic_types in dnodes.items():
        ota.add_data_node(attr, semantic_types)

    early_stopping = EarlyStopping()
    model = Model(*model_bundle)
    args = PGMBeamSearchArgs(
        test_sm.id,
        discovering_func,
        Tracker(track_search_nodes=True),
        partial(model.predict_sm_probs, test_sm.id, train_source_ids),
        graph_explorer_builder,
        # early_terminate_func=early_stopping.early_stopping,
        early_terminate_func=None,
        beam_width=settings.searching_beam_width,
        gold_sm=test_sm.graph,
        source_attributes=test_sm.attrs,
        pre_filter_func=filter_unlikely_graph,
    )
    started_nodes = [
        PGMStartSearchNode(args.get_and_increment_id(), args,
                           [a.label.encode('utf-8') for a in test_sm.attrs])
    ]
    results: List[PGMSearchNode] = beam_search(
        started_nodes,
        beam_width=settings.searching_beam_width,
        n_results=settings.searching_n_explore_result,
        args=args)

    # *****************************************************************************************************************
    # DEBUG CODE
    output_dir = Path(config.fsys.debug.as_path() + "/tmp/final/")
    # for search_node in args.tracker.list_search_nodes:
    #     search_node.beam_search_args = None
    # serialize(args.tracker.list_search_nodes, output_dir / "search_nodes2.pkl")
    # for file in output_dir.iterdir():
    #     if file.is_dir():
    #         shutil.rmtree(file)
    #     else:
    #         os.remove(file)
    #
    # for i, search_nodes in enumerate(args.tracker.list_search_nodes):
    #     if len(search_nodes) == 0:
    #         continue
    #
    #     sub_output_dir = output_dir / str(i)
    #     sub_output_dir.mkdir(exist_ok=True, parents=True)
    #
    #     for j, r in enumerate(search_nodes[:30]):
    #         pred_sm = r.get_value().graph
    #         pred_sm.set_name(str(r.get_score()).encode('utf-8'))
    #
    #         g = colorize_prediction(pred_sm, AutoLabel.auto_label_max_f1(test_sm.graph, pred_sm, False)[0])
    #         g.render2img(sub_output_dir / f"{j}.png")
    #         serialize(pred_sm, sub_output_dir / f"{j}.pkl")
    #
    # sub_output_dir = output_dir / "result"
    # sub_output_dir.mkdir(exist_ok=True, parents=True)
    #
    # for i, r in enumerate(results):
    #     pred_sm = r.get_value().graph
    #     pred_sm.set_name(str(r.get_score()).encode('utf-8'))
    #
    #     g = colorize_prediction(pred_sm, AutoLabel.auto_label_max_f1(test_sm.graph, pred_sm, False)[0])
    #     g.render2img(sub_output_dir / f"{i}.png")
    #     serialize(pred_sm, sub_output_dir / f"{i}.pkl")

    # STEP 4: report performance
    print(
        f"{test_sm.id}: Performance at prev iter:",
        smodel_eval.f1_precision_recall(
            test_sm.graph,
            args.tracker.list_search_nodes[-1][0].get_value().graph,
            DataNodeMode.NO_TOUCH))
    print(
        f"{test_sm.id}: Performance at final iter:",
        smodel_eval.f1_precision_recall(test_sm.graph, results[0].get_value().graph,
                                        DataNodeMode.NO_TOUCH))
    # *****************************************************************************************************************

    # collect the performance of the top-ranked graph at each search iteration
    performances = []
    for iter_no, search_nodes in enumerate(args.tracker.list_search_nodes):
        if len(search_nodes) == 0:
            continue
        x = smodel_eval.f1_precision_recall(test_sm.graph,
                                            search_nodes[0].get_value().graph,
                                            DataNodeMode.NO_TOUCH)
        performances.append((iter_no, search_nodes[0].get_score(), x['precision'], x['recall'], x['f1']))

    x = smodel_eval.f1_precision_recall(test_sm.graph, results[0].get_value().graph,
                                        DataNodeMode.NO_TOUCH)
    performances.append((len(performances), results[0].get_score(), x['precision'], x['recall'], x['f1']))

    pred_sms = [(search_node.get_score(), search_node.get_value().graph) for search_node in results]
    search_history = [[n.get_value().graph for n in search_nodes]
                      for search_nodes in args.tracker.list_search_nodes]
    search_history.append([n.get_value().graph for n in results])

    return pred_sms, performances, search_history
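
# Hedged usage sketch (not part of the original source): unpacking the three values returned
# above. Statistic.get_instance is assumed to build the statistics from the training models,
# and the helper name is hypothetical.
def _example_search_one_source(dataset: str, train_sms, test_sm, model_bundle):
    stat = Statistic.get_instance(train_sms)
    pred_sms, performances, search_history = generate_candidate_sm(
        dataset, test_sm, stat, model_bundle, {sm.id for sm in train_sms})
    top_score, top_graph = pred_sms[0]  # top-ranked prediction from the final beam
    return top_score, top_graph, performances, search_history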
              Literal(ujson.dumps(input_columns))))
    g.add((kr2rml, km_dev.hasOutputColumns, Literal(ujson.dumps(output_columns))))
    g.add((kr2rml, km_dev.hasModelLabel, Literal(tbl.id)))
    g.add((kr2rml, km_dev.hasBaseURI, Literal("http://localhost:8080/source/")))
    g.add((kr2rml, km_dev.hasWorksheetHistory, Literal(ujson.dumps(worksheet_history, indent=4))))

    g.serialize(str(fpath), format='n3')


if __name__ == '__main__':
    from semantic_modeling.data_io import get_ontology

    ont = get_ontology("museum_edm")
    r2rml = R2RML.load_from_file(
        Path("/home/rook/workspace/DataIntegration/SourceModeling/data/museum-edm/models-y2rml/s01-cb-model.yml"))
    tbl = DataTable.load_from_file(
        Path("/home/rook/workspace/DataIntegration/SourceModeling/data/museum-edm/sources/s01-cb.csv"))
    r2rml.to_kr2rml(
        ont, tbl,
        Path("/home/rook/workspace/DataIntegration/SourceModeling/data/museum-edm/karma-version/models-r2rml/s01-cb-model.ttl"))
    # r2rml = R2RML.load_from_file(Path("/home/rook/workspace/DataIntegration/SourceModeling/data/museum-edm/models-y2rml/s04-ima-artworks-model.yml"))
               cmds[-1][1].target_id)
        for idx, cmd in cmds[:-1]:
            if (cmd.source_id, cmd.link_lbl, cmd.target_id) == key:
                cmds[-1][1].source_uri = cmd.source_uri
                cmds[-1][1].target_uri = cmd.target_uri
                break

        delete_commands.sort(reverse=True)
        for idx in delete_commands:
            commands.pop(idx)

        super().__init__(commands)

    def to_yaml(self, fpath: Path):
        with open(fpath, "w") as f:
            yaml.dump(self.to_dict(), f, default_flow_style=False, indent=4)


if __name__ == '__main__':
    path = '/home/rook/workspace/DataIntegration/SourceModeling/data/mohsen-data/museum/cleaned-edm/sources/s04-ima-artworks.xml'
    kr2rml_file = '/home/rook/workspace/DataIntegration/SourceModeling/data/mohsen-data/museum/edm/models-r2rml/s04-ima-artworks-model.ttl'
    dataset = "museum_edm"

    tbl = DataTable.load_from_file(Path(path))
    # print(tbl.head(5).to_string())
    transformer = KR2RML(get_ontology(dataset), tbl, Path(kr2rml_file))
    transformer.apply_build(tbl)
    print(tbl.head(5).to_string())
    transformer.to_yaml("test.yml")
def run_evaluation_workflow(dataset: str, scenario: Scenario, train_sms, test_sms):
    ont: Ontology = get_ontology(dataset)
    karma_models: List[KarmaModel] = get_karma_models(dataset)
    semantic_models: List[SemanticModel] = get_semantic_models(dataset)
    train_sm_ids = [sm.id for sm in train_sms]

    sdesc_args = dict(
        dataset=dataset,
        train_sm_ids=train_sm_ids,
        # we always put semantic types to learnedSemanticTypes, even for userSetSemanticTypes
        use_correct_type=False,
        use_old_semantic_typer=False,
        exec_dir=get_cache_dir(dataset, train_sms) / "mohsen_jws2015",
        sm_type_dir=Path(config.fsys.debug.as_path()) / "tmp" / "models-json-temp")

    # STEP 1: run semantic typing and put the results into a temporary folder
    if sdesc_args['sm_type_dir'].exists():
        shutil.rmtree(sdesc_args['sm_type_dir'])
    sdesc_args['sm_type_dir'].mkdir(exist_ok=True, parents=True)

    top_k_types = Settings.get_instance().semantic_labeling_top_n_stypes
    typer = create_semantic_typer(dataset, train_sms)
    typer.semantic_labeling(train_sms, test_sms, top_k_types, eval_train=True)

    for sm, ksm in zip(semantic_models, karma_models):
        # assign the predicted semantic types to learnedSemanticTypes
        sm_alignment = SemanticModelAlignment(sm, ksm)
        for col in ksm.source_columns:
            attr = sm.get_attr_by_label(
                sm.graph.get_node_by_id(sm_alignment.alignment[col.id]).label.decode('utf-8'))
            node = ksm.karma_graph.get_node_by_id(col.id)
            link = node.get_first_incoming_link()
            node.learned_semantic_types = [
                KarmaSemanticType(node.id, stype.domain, stype.type, typer.__class__.__name__,
                                  stype.confidence_score) for stype in attr.semantic_types
            ]
            node.user_semantic_types = [
                KarmaSemanticType(node.id, link.get_source_node().label.decode(),
                                  link.label.decode(), "User", 1.0)
            ]
        serializeJSON(ksm.to_normalized_json_model(ont),
                      sdesc_args['sm_type_dir'] / f"{ksm.id}-model.json",
                      indent=4)

    # STEP 2: invoke the semantic modeling
    modeler = MohsenSemanticModeling(**sdesc_args)
    pred_sms = modeler.sm_prediction(train_sms, test_sms)

    # STEP 3: evaluate the predicted semantic models
    eval_hist = [["source", "precision", "recall", "f1", "stype-acc"]]
    if scenario == Scenario.SCENARIO_1:
        data_node_mode = DataNodeMode.IGNORE_DATA_NODE
    else:
        data_node_mode = DataNodeMode.NO_TOUCH

    for sm, pred_sm in zip(test_sms, pred_sms):
        eval_result = smodel_eval.f1_precision_recall(sm.graph, pred_sm.graph, data_node_mode)
        eval_hist.append([
            sm.id, eval_result["precision"], eval_result["recall"], eval_result["f1"],
            smodel_eval.stype_acc(sm.graph, pred_sm.graph)
        ])
    eval_hist.append([
        'average',
        np.average([float(x[1]) for x in eval_hist[1:]]),
        np.average([float(x[2]) for x in eval_hist[1:]]),
        np.average([float(x[3]) for x in eval_hist[1:]]),
        np.average([float(x[4]) for x in eval_hist[1:]])
    ])
    serializeCSV(eval_hist, sdesc_args["exec_dir"] / f"evaluation_result_{scenario.value}.csv")

    # STEP 4: evaluate the predicted semantic labels
    pred_stypes = modeler.semantic_labeling(train_sms, test_sms)
    for pred_stype, sm in zip(pred_stypes, test_sms):
        for attr in sm.attrs:
            if attr.label not in pred_stype:
                attr.semantic_types = []
            else:
                attr.semantic_types = pred_stype[attr.label]
    eval_sources(test_sms,
                 sdesc_args["exec_dir"] / f"evaluation_result_{scenario.value}_stype.csv")

    # STEP 5: visualize the predictions
    (sdesc_args['exec_dir'] / "prediction-viz").mkdir(exist_ok=True)
    need_render_graphs = [
        (colorize_prediction(pred_sm.graph,
                             AutoLabel.auto_label_max_f1(sm.graph, pred_sm.graph, False)[0]),
         sdesc_args['exec_dir'] / "prediction-viz" / f"{sm.id}.png")
        for sm, pred_sm in zip(test_sms, pred_sms)
    ]
    with ThreadPool(32) as p:
        p.map(render_graph, need_render_graphs)

    return eval_hist
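
# Hedged usage sketch (not part of the original source): running the workflow end-to-end on
# an assumed train/test split; the split, dataset name, and helper name are illustrative only.
def _example_run_evaluation(dataset: str = "museum_edm"):
    sms = get_semantic_models(dataset)
    train_sms, test_sms = sms[:14], sms[14:]
    return run_evaluation_workflow(dataset, Scenario.SCENARIO_1, train_sms, test_sms)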