def __init__(self, dataset: str, use_correct_type: bool, use_old_semantic_typer: bool,
             train_sm_ids: List[str], exec_dir: Optional[Union[str, Path]] = None,
             sm_type_dir: Optional[Union[str, Path]] = None):
    self.dataset: str = dataset
    self.train_sm_ids = train_sm_ids
    self.ont = get_ontology(dataset)
    self.karma_models: Dict[str, KarmaModel] = {km.id: km for km in get_karma_models(dataset)}

    # modeling can only run once; re-invoking it raises an error
    self.__has_run_modeling = False

    if exec_dir is None:
        exec_dir = get_cache_dir(dataset, train_sm_ids) / "mohsen_jws2015"
    self.exec_dir: Path = Path(exec_dir)
    self.sm_type_dir = sm_type_dir

    # parameters for Mohsen's algorithm
    self.use_old_semantic_typer = use_old_semantic_typer
    self.use_correct_type = use_correct_type
    assert Settings.get_instance().semantic_labeling_top_n_stypes <= 4
    self.num_candidate_semantic_type = 4
    self.multiple_same_property_per_node = True
    self.coherence = 1.0
    self.confidence = 1.0
    self.size_reduction = 0.5
    self.num_candidate_mappings = 50
    self.mapping_branching_factor = 50
    self.topk_steiner_tree = 10
    # effectively no cut-off: keep all candidates
    self.cut_off = int(1e6)
    self.our_and_karma_sm_alignments = {}
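# Illustrative usage (a sketch, not part of the original source). The dataset name
# "museum_edm" is an assumption for the example; when exec_dir is omitted, the modeler
# falls back to the shared cache directory, and sm_type_dir may stay None.
#
#     train_sms = get_semantic_models("museum_edm")
#     modeler = MohsenSemanticModeling(
#         dataset="museum_edm",
#         use_correct_type=False,
#         use_old_semantic_typer=False,
#         train_sm_ids=[sm.id for sm in train_sms],
#     )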
def get_instance(dataset: str, train_sms: List[SemanticModel]):
    if PrimaryKey.instance is None:
        cache_file = get_cache_dir(dataset, train_sms) / "weak_models" / "primary_keys.json"
        if not cache_file.exists():
            train_sm_ids = {sm.id for sm in train_sms}
            train_tbls = {tbl.id: tbl for tbl in get_data_tables(dataset) if tbl.id in train_sm_ids}
            predictions: Dict[str, List[dict]] = defaultdict(list)
            pseudo_primary_keys = {}
            for sm in train_sms:
                jsonld_objects = jsonld_generator(sm, train_tbls[sm.id])
                for n in sm.graph.iter_class_nodes():
                    fields = [
                        e.label.decode("utf-8") for e in n.iter_outgoing_links()
                        if e.get_target_node().is_data_node()
                    ]
                    if len(fields) == 0:
                        continue
                    if 'karma:classLink' in fields:
                        pseudo_primary_keys[n.label] = 'karma:classLink'
                        continue
                    results = extract_node_data(n, jsonld_objects)
                    views = create_unique_views(results, fields)
                    predictions[n.label].append(predict_pesudo_keys(fields, views))

            for class_lbl, preds in predictions.items():
                # sum the votes for each candidate link across all predictions,
                # then pick the link with the highest total as the pseudo key
                total = defaultdict(lambda: 0)
                for pred in preds:
                    for link_lbl in pred:
                        total[link_lbl] += pred[link_lbl]
                pseudo_primary_keys[class_lbl] = max(total.items(), key=lambda x: x[1])[0]

            PrimaryKey.instance = PrimaryKey(
                {k: v.encode('utf-8') for k, v in pseudo_primary_keys.items()})
            cache_file.parent.mkdir(exist_ok=True, parents=True)
            serializeJSON(PrimaryKey.instance, cache_file, indent=4)
        else:
            PrimaryKey.instance = deserializeJSON(cache_file, Class=PrimaryKey)
    return PrimaryKey.instance
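# Sketch of consuming the cached singleton (names reused from this repo; the dataset
# name is an assumption). The returned object maps each class label to the outgoing
# link that acts as its pseudo primary key:
#
#     pk = PrimaryKey.get_instance("museum_edm", train_sms)
#     # e.g. a class label may map to b"karma:classLink"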
def __init__(self, dataset: str, train_sms: List[SemanticModel],
             exec_dir: Optional[Path] = None) -> None:
    self.dataset = dataset
    self.train_source_ids = {sm.id for sm in train_sms}
    if exec_dir is None:
        exec_dir = get_cache_dir(dataset, train_sms) / "semantic-labeling"
    self.exec_dir = Path(exec_dir)
    self.exec_dir.mkdir(exist_ok=True, parents=True)
    self.model = None
    self.stype_db = SemanticTypeDB.get_stype_db(dataset, [sm.id for sm in train_sms],
                                                self.exec_dir)
def get_data_constraint_model(
        dataset: str,
        train_sms: List[SemanticModel],
) -> DataConstraint:
    global _instance
    if _instance is None:
        cache_file = get_cache_dir(dataset, train_sms) / "weak_models" / "data_constraint.pkl"
        cache_file.parent.mkdir(exist_ok=True, parents=True)

        need_rebuilt = True
        settings = Settings.get_instance()
        valid_threshold = settings.data_constraint_valid_threshold
        guess_datetime_threshold = settings.data_constraint_guess_datetime_threshold
        n_comparison_samples = settings.data_constraint_n_comparison_samples
        random_seed = settings.random_seed
        n_sample = settings.n_samples

        if cache_file.exists():
            DataConstraint.logger.debug("Trying to load previous run...")
            model, cached_dataset, cached_train_sm_ids, extra_args = deserialize(cache_file)
            # the cache is only valid if it was built for the same dataset, the same
            # training sources, and the same hyper-parameters
            if cached_dataset == dataset \
                    and cached_train_sm_ids == {sm.id for sm in train_sms} \
                    and extra_args == (valid_threshold, guess_datetime_threshold,
                                       n_comparison_samples, random_seed, n_sample):
                need_rebuilt = False

        if need_rebuilt:
            DataConstraint.logger.debug("Rebuilding data-constraint model...")
            data_tables = [
                ColumnBasedTable.from_table(tbl) for tbl in get_sampled_data_tables(dataset)
            ]
            model = DataConstraint(train_sms, data_tables, valid_threshold,
                                   guess_datetime_threshold, n_comparison_samples)
            serialize((model, dataset, {sm.id for sm in train_sms},
                       (valid_threshold, guess_datetime_threshold, n_comparison_samples,
                        random_seed, n_sample)), cache_file)

        _instance = model
    return _instance
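# The function above follows a rebuild-on-key-change caching pattern: the model is
# serialized together with everything that determines its validity (dataset, training
# ids, hyper-parameters) and rebuilt whenever any of them changes. A minimal standalone
# sketch of the same idea, with a hypothetical helper name:
#
#     def cached_build(cache_file, key, build_fn):
#         if cache_file.exists():
#             model, cached_key = deserialize(cache_file)
#             if cached_key == key:
#                 return model
#         model = build_fn()
#         serialize((model, key), cache_file)
#         return model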
def get_stype_assistant_model(dataset: str, train_sms: List[SemanticModel]):
    global _instance
    if _instance is None:
        cache_file = get_cache_dir(dataset, train_sms) / "weak_models" / "stype_assistant.pkl"
        cache_file.parent.mkdir(exist_ok=True, parents=True)

        need_rebuilt = True
        if cache_file.exists():
            SemanticTypeAssistant.logger.debug("Trying to load previous run...")
            model, cache_dataset, cache_train_sm_ids = deserialize(cache_file)
            if cache_dataset == dataset and cache_train_sm_ids == {sm.id for sm in train_sms}:
                need_rebuilt = False
                # the triple adviser is not serialized with the model; rebuild and re-attach it
                ont_graph = get_ont_graph(dataset)
                ont = get_ontology(dataset)
                stat = Statistic.get_instance(train_sms)
                ota = EmpiricalTripleAdviser(ont_graph, ont, stat.p_triple, 15)
                model.triple_adviser = ota

        if need_rebuilt:
            ont_graph = get_ont_graph(dataset)
            ont = get_ontology(dataset)
            stat = Statistic.get_instance(train_sms)
            ota = EmpiricalTripleAdviser(ont_graph, ont, stat.p_triple, 15)

            typer = SemanticTyper.get_instance(dataset, train_sms)
            try:
                typer.load_model()
            except Exception:
                # no trained model yet: run semantic labeling on the remaining sources
                sms = get_semantic_models(dataset)
                train_ids = {sm.id for sm in train_sms}
                typer.semantic_labeling(train_sms,
                                        [sm for sm in sms if sm.id not in train_ids], 4)

            model = SemanticTypeAssistant(train_sms, typer, ota)
            # detach the adviser before serializing, then re-attach it for in-memory use
            model.triple_adviser = None
            serialize((model, dataset, {sm.id for sm in train_sms}), cache_file)
            model.triple_adviser = ota

        _instance = model
    return _instance
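# Note on the detach/re-attach around serialize(): the adviser is rebuilt from the
# ontology and statistics on every load, which suggests it is not (cheaply)
# serializable. Usage sketch (dataset name assumed):
#
#     assistant = get_stype_assistant_model("museum_edm", train_sms)
#     # assistant.triple_adviser is an EmpiricalTripleAdviser, reattached after caching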
def create_rust_input(dataset: str, scenario: Scenario, train_sms, test_sms):
    train_sm_ids = [sm.id for sm in train_sms]
    exec_dir = get_cache_dir(dataset, train_sms) / "mohsen_jws2015"
    modeler = MohsenSemanticModeling(
        dataset, False, False, train_sm_ids,
        exec_dir=exec_dir,
        sm_type_dir=Path(config.fsys.debug.as_path()) / "tmp" / "models-json-temp")
    candidate_smss = modeler.sm_candidate_generation(train_sms, test_sms)

    if scenario == Scenario.SCENARIO_1:
        data_node_mode = DataNodeMode.IGNORE_DATA_NODE
    else:
        data_node_mode = DataNodeMode.NO_TOUCH

    train_sm_ids = {sm.id for sm in train_sms}
    real_test_sm_ids = {sm.id for sm in test_sms if sm.id not in train_sm_ids}
    train_eval_hist = get_eval_hist(train_sm_ids, test_sms, candidate_smss, data_node_mode)
    test_eval_hist = get_eval_hist(real_test_sm_ids, test_sms, candidate_smss, data_node_mode)
    serializeCSV(train_eval_hist,
                 exec_dir / f"evaluation_result_{scenario.value}.train.oracle.csv")
    serializeCSV(test_eval_hist,
                 exec_dir / f"evaluation_result_{scenario.value}.test.oracle.csv")

    # now create the input for the Rust bridge
    obj = {}
    for gold_sm, candidate_sms in zip(test_sms, candidate_smss):
        obj[gold_sm.id] = [c.graph.to_dict() for c in candidate_sms]
    serializeJSON(obj, exec_dir / "rust-karma-pred-input.json")
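# Shape of rust-karma-pred-input.json (illustrative; the exact graph fields depend on
# graph.to_dict() and are an assumption here):
#
#     {
#       "<gold_sm_id>": [<candidate graph dict>, ...],
#       ...
#     }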
def get_classifier(self, retrain: bool, train_examples: List[Example]):
    # TODO: implement this properly; currently we have to train and save manually
    cached_file = get_cache_dir(
        self.example_annotator.dataset,
        list(self.example_annotator.train_source_ids)
    ) / "weak_models" / "node_prob_classifier.pkl"

    if not cached_file.exists() or retrain:
        self.logger.debug("Retrain new model")
        raw_X_train = make_data(self, train_examples)
        classifier = LogisticRegression(fit_intercept=True)

        # skip the first value of each feature dict; the last column holds the label
        X_train = numpy.asarray([list(features.values())[1:] for features in raw_X_train])
        X_train, y_train = X_train[:, :-1], [int(x) for x in X_train[:, -1]]
        scaler = StandardScaler().fit(X_train)
        # transform() returns a new array; the result must be kept (assigning it fixes
        # a bug where the scaled features were silently discarded)
        X_train = scaler.transform(X_train)

        try:
            classifier.fit(X_train, y_train)
        except ValueError as e:
            assert str(e).startswith(
                "This solver needs samples of at least 2 classes in the data")
            # this happens in the starter phase, when we have no real data and build the
            # examples from ground truth: pad with an all-zero negative example so both
            # classes are present (zeros(...) keeps the row width in sync with X_train)
            X_train = numpy.vstack([X_train, numpy.zeros(X_train.shape[1])])
            y_train.append(0)
            classifier.fit(X_train, y_train)

        cached_file.parent.mkdir(exist_ok=True, parents=True)
        serialize((scaler, classifier), cached_file)
        return scaler, classifier

    return deserialize(cached_file)
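# Sketch of consuming the returned pair at prediction time (the feature matrix X is a
# placeholder; the same scaling used at training time must be applied before predicting):
#
#     scaler, classifier = self.get_classifier(retrain=False, train_examples=examples)
#     X = scaler.transform(X)
#     node_probs = classifier.predict_proba(X)[:, 1]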
def clear_cache(dataset: str) -> None:
    # only clear caches that were generated for particular sets of training models
    cache_dir = get_cache_dir(dataset)
    for item in cache_dir.iterdir():
        if item.is_dir():
            shutil.rmtree(item)
def run_evaluation_workflow(dataset: str, scenario: Scenario, train_sms, test_sms):
    ont: Ontology = get_ontology(dataset)
    karma_models: List[KarmaModel] = get_karma_models(dataset)
    semantic_models: List[SemanticModel] = get_semantic_models(dataset)
    train_sm_ids = [sm.id for sm in train_sms]

    sdesc_args = dict(
        dataset=dataset,
        train_sm_ids=train_sm_ids,
        # we always put semantic types into learnedSemanticTypes, even for userSetSemanticTypes
        use_correct_type=False,
        use_old_semantic_typer=False,
        exec_dir=get_cache_dir(dataset, train_sms) / "mohsen_jws2015",
        sm_type_dir=Path(config.fsys.debug.as_path()) / "tmp" / "models-json-temp")

    # STEP 1: run semantic typing and write the results to a temporary folder
    if sdesc_args['sm_type_dir'].exists():
        shutil.rmtree(sdesc_args['sm_type_dir'])
    sdesc_args['sm_type_dir'].mkdir(exist_ok=True, parents=True)

    top_k_types = Settings.get_instance().semantic_labeling_top_n_stypes
    typer = create_semantic_typer(dataset, train_sms)
    typer.semantic_labeling(train_sms, test_sms, top_k_types, eval_train=True)

    for sm, ksm in zip(semantic_models, karma_models):
        # assign semantic types to learnedSemanticTypes
        sm_alignment = SemanticModelAlignment(sm, ksm)
        for col in ksm.source_columns:
            attr = sm.get_attr_by_label(
                sm.graph.get_node_by_id(
                    sm_alignment.alignment[col.id]).label.decode('utf-8'))
            node = ksm.karma_graph.get_node_by_id(col.id)
            link = node.get_first_incoming_link()
            node.learned_semantic_types = [
                KarmaSemanticType(node.id, stype.domain, stype.type,
                                  typer.__class__.__name__, stype.confidence_score)
                for stype in attr.semantic_types
            ]
            node.user_semantic_types = [
                KarmaSemanticType(node.id, link.get_source_node().label.decode(),
                                  link.label.decode(), "User", 1.0)
            ]
        serializeJSON(ksm.to_normalized_json_model(ont),
                      sdesc_args['sm_type_dir'] / f"{ksm.id}-model.json",
                      indent=4)

    # STEP 2: invoke semantic modeling
    modeler = MohsenSemanticModeling(**sdesc_args)
    pred_sms = modeler.sm_prediction(train_sms, test_sms)

    # STEP 3: evaluate the semantic mapping predictions
    eval_hist = [["source", "precision", "recall", "f1", "stype-acc"]]
    if scenario == Scenario.SCENARIO_1:
        data_node_mode = DataNodeMode.IGNORE_DATA_NODE
    else:
        data_node_mode = DataNodeMode.NO_TOUCH

    for sm, pred_sm in zip(test_sms, pred_sms):
        eval_result = smodel_eval.f1_precision_recall(sm.graph, pred_sm.graph,
                                                      data_node_mode)
        eval_hist.append([
            sm.id, eval_result["precision"], eval_result["recall"], eval_result["f1"],
            smodel_eval.stype_acc(sm.graph, pred_sm.graph)
        ])
    eval_hist.append([
        'average',
        np.average([float(x[1]) for x in eval_hist[1:]]),
        np.average([float(x[2]) for x in eval_hist[1:]]),
        np.average([float(x[3]) for x in eval_hist[1:]]),
        np.average([float(x[4]) for x in eval_hist[1:]])
    ])
    serializeCSV(eval_hist,
                 sdesc_args["exec_dir"] / f"evaluation_result_{scenario.value}.csv")

    # STEP 4: evaluate the semantic labeling predictions
    pred_stypes = modeler.semantic_labeling(train_sms, test_sms)
    for pred_stype, sm in zip(pred_stypes, test_sms):
        for attr in sm.attrs:
            if attr.label not in pred_stype:
                attr.semantic_types = []
            else:
                attr.semantic_types = pred_stype[attr.label]
    eval_sources(test_sms,
                 sdesc_args["exec_dir"] / f"evaluation_result_{scenario.value}_stype.csv")

    # STEP 5: visualize the predictions
    (sdesc_args['exec_dir'] / "prediction-viz").mkdir(exist_ok=True)
    need_render_graphs = [
        (colorize_prediction(
            pred_sm.graph,
            AutoLabel.auto_label_max_f1(sm.graph, pred_sm.graph, False)[0]),
         sdesc_args['exec_dir'] / "prediction-viz" / f"{sm.id}.png")
        for sm, pred_sm in zip(test_sms, pred_sms)
    ]
    with ThreadPool(32) as p:
        p.map(render_graph, need_render_graphs)

    return eval_hist
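# Usage sketch (dataset and splits assumed): the returned table is CSV-ready, with a
# header row, one row per test source, and a trailing 'average' row.
#
#     eval_hist = run_evaluation_workflow("museum_edm", Scenario.SCENARIO_2,
#                                         train_sms, test_sms)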
if __name__ == '__main__':
    # HYPER-ARGS
    args = get_shell_args()

    Settings.get_instance(False).semantic_labeling_top_n_stypes = \
        args.semantic_labeling_top_n_stypes
    Settings.get_instance().semantic_labeling_method = args.semantic_typer
    Settings.get_instance().log_current_settings()

    exp_dir = Path(args.exp_dir)
    assert exp_dir.exists()

    source_models = {sm.id: sm for sm in get_semantic_models(args.dataset)}
    train_sms = [source_models[sid] for sid in args.kfold['train_sm_ids']]
    test_sms = [source_models[sid] for sid in args.kfold['test_sm_ids']]

    eval_hist = run_evaluation_workflow(args.dataset, Scenario.SCENARIO_2,
                                        train_sms, test_sms)
    serializeCSV(eval_hist,
                 exp_dir / f"kfold-{get_short_train_name(train_sms)}.test.csv")
    serializeJSON(args,
                  exp_dir / f"kfold-{get_short_train_name(train_sms)}.meta.json",
                  indent=4)
    shutil.move(
        get_cache_dir(args.dataset, train_sms) / "mohsen_jws2015",
        exp_dir / f"kfold-{get_short_train_name(train_sms)}")
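# Hypothetical invocation (the script name and flag spellings are inferred from the
# attributes read above; get_shell_args itself is defined elsewhere in this repo):
#
#     python run_experiment.py \
#         --dataset museum_edm \
#         --exp_dir debug/experiments \
#         --semantic_typer SemanticTyper \
#         --semantic_labeling_top_n_stypes 4 \
#         --kfold '{"train_sm_ids": [...], "test_sm_ids": [...]}'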