def __init__(self, dataset: str, train_sms: List[SemanticModel]) -> None:
    input_file = Path(config.datasets[dataset].karma_version.as_path()) / "semantic-types" / f"{get_short_train_name(train_sms)}.json"
    if not input_file.exists():
        compute_mohsen_stypes(dataset, train_sms)

    self.stypes = deserializeJSON(input_file)
    self.train_source_ids = {sm.id for sm in train_sms}
def get_semantic_models(dataset: str) -> List[SemanticModel]:
    """Get the list of semantic models of a given dataset"""
    global _data_io_vars

    if dataset not in _data_io_vars["semantic_models"]:
        # not cached in memory yet; try the on-disk cache first
        cache_file = get_cache_dir(dataset) / 'semantic_models.json'
        if cache_file.exists():
            semantic_models = deserializeJSON(cache_file, Class=SemanticModel)
        else:
            # build the models from the raw tables and their R2RML mappings
            mapping_dir = Path(config.datasets[dataset].models_y2rml.as_path())
            R2RML.load_python_scripts(Path(config.datasets[dataset].python_code.as_path()))
            raw_tables = get_raw_data_tables(dataset)

            semantic_models = []
            tables = []
            for i, raw_tbl in enumerate(raw_tables):
                r2rml_file = mapping_dir / f"{raw_tbl.id}-model.yml"
                tbl, sm = R2RML.load_from_file(r2rml_file).apply_build(raw_tbl)
                semantic_models.append(sm)
                tables.append(tbl)

            serializeJSON(semantic_models, cache_file)
            _data_io_vars["data_tables"][dataset] = tables

        _data_io_vars["semantic_models"][dataset] = semantic_models

    return _data_io_vars["semantic_models"][dataset]
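# A hedged usage sketch of the loader above; "museum_edm" is an assumed dataset key,
# substitute any key defined in config.datasets. The first call builds the models from
# the R2RML mappings (or the on-disk cache); later calls hit the in-memory cache.
if __name__ == "__main__":
    sms = get_semantic_models("museum_edm")
    print("loaded %d semantic models, e.g. %s" % (len(sms), sms[0].id))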
def load_data(dataset):
    data_dir = Path(config.fsys.debug.as_path()) / dataset / "training_workflow" / "examples_generator" / "i0"

    # train_examples: List[Example] = deserializeJSON(data_dir / "train.small.json", Class=Example)
    # test_examples: List[Example] = deserializeJSON(data_dir / "test.small.json", Class=Example)
    train_examples: List[Example] = deserializeJSON(data_dir / "train.json", Class=Example)
    test_examples: List[Example] = deserializeJSON(data_dir / "test.json", Class=Example)

    # TODO: uncomment below to create a small dataset for debugging
    # train_examples = [e for e in train_examples if e.model_id.startswith('s03')]
    # train_examples = train_examples[:100]
    # test_examples = test_examples[:100]
    # serializeJSON(train_examples, data_dir / "train.small.json")
    # serializeJSON(test_examples, data_dir / "test.small.json")

    return train_examples, test_examples
def get_instance(dataset: str, train_sms: List[SemanticModel]):
    if PrimaryKey.instance is None:
        cache_file = get_cache_dir(dataset, train_sms) / "weak_models" / "primary_keys.json"
        if not cache_file.exists():
            train_sm_ids = {sm.id for sm in train_sms}
            train_tbls = {tbl.id: tbl for tbl in get_data_tables(dataset) if tbl.id in train_sm_ids}

            predictions: Dict[str, List[dict]] = defaultdict(lambda: [])
            pseudo_primary_keys = {}
            for sm in train_sms:
                jsonld_objects = jsonld_generator(sm, train_tbls[sm.id])
                for n in sm.graph.iter_class_nodes():
                    fields = [
                        e.label.decode("utf-8") for e in n.iter_outgoing_links()
                        if e.get_target_node().is_data_node()
                    ]
                    if len(fields) == 0:
                        continue
                    if 'karma:classLink' in fields:
                        pseudo_primary_keys[n.label] = 'karma:classLink'
                        continue

                    results = extract_node_data(n, jsonld_objects)
                    views = create_unique_views(results, fields)
                    predictions[n.label].append(predict_pesudo_keys(fields, views))

            # vote across training sources: sum each link's score per class and
            # keep the highest-scoring link as that class's pseudo primary key
            for class_lbl, preds in predictions.items():
                total = defaultdict(lambda: 0)
                for pred in preds:
                    for link_lbl in pred:
                        total[link_lbl] += pred[link_lbl]
                pseudo_primary_keys[class_lbl] = max(total.items(), key=lambda x: x[1])[0]

            PrimaryKey.instance = PrimaryKey({k: v.encode('utf-8') for k, v in pseudo_primary_keys.items()})
            cache_file.parent.mkdir(exist_ok=True, parents=True)
            serializeJSON(PrimaryKey.instance, cache_file, indent=4)
        else:
            PrimaryKey.instance: PrimaryKey = deserializeJSON(cache_file, Class=PrimaryKey)

    return PrimaryKey.instance
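# A standalone sketch of the voting step above, using made-up prediction scores (the
# class/link labels and numbers below are illustrative only): each training source
# contributes a score per outgoing link, scores are summed per class, and the link with
# the highest total becomes that class's pseudo primary key.
from collections import defaultdict
from typing import Dict, List


def vote_pseudo_primary_keys(predictions: Dict[str, List[dict]]) -> Dict[str, str]:
    pseudo_keys = {}
    for class_lbl, preds in predictions.items():
        total = defaultdict(float)
        for pred in preds:
            for link_lbl, score in pred.items():
                total[link_lbl] += score
        # keep the link with the highest accumulated score
        pseudo_keys[class_lbl] = max(total.items(), key=lambda x: x[1])[0]
    return pseudo_keys


# e.g. two training sources voting for the crm:E21_Person class:
# vote_pseudo_primary_keys({"crm:E21_Person": [{"rdfs:label": 0.9, "crm:P3_has_note": 0.4},
#                                              {"rdfs:label": 0.7, "crm:P3_has_note": 0.8}]})
# => {"crm:E21_Person": "rdfs:label"}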
def get_ont_graph(dataset: str) -> OntGraph:
    global _ont_graph_vars

    if dataset not in _ont_graph_vars:
        # if it hasn't been cached in memory
        cache_file = Path(config.fsys.debug.as_path() + f'/{dataset}/cached/ont_graph.json')
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        if cache_file.exists():
            ont_graph = deserializeJSON(cache_file, Class=OntGraph)
        else:
            ont_graph: OntGraph = build_ont_graph(dataset)
            serializeJSON(ont_graph, cache_file)

        _ont_graph_vars[dataset] = ont_graph

    return _ont_graph_vars[dataset]
def make_test_from_prediction(train_sms: List[SemanticModel], evaluate_sms: List[SemanticModel],
                              workdir: Path, model_dir: Path):
    search_history: Dict[str, List[List[dict]]] = deserializeJSON(model_dir / "search_history.json")
    evaluate_sms = {sm.id: sm for sm in evaluate_sms}
    train_sm_ids = [sm.id for sm in train_sms]

    test_examples = []
    for sid in search_history:
        for i, gs in enumerate(search_history[sid]):
            for j, g in enumerate(gs):
                eid = Example.generate_example_id(sid, j, i)
                example = make_example(evaluate_sms[sid], Graph.from_dict(g), eid, train_sm_ids)
                test_examples.append(example)

    serializeJSON(test_examples, workdir / "examples" / "test.json")
    return test_examples
def evaluate_serene_outputs(files: List[Path],
                            ont: Ontology,
                            gold_sm: Optional[SemanticModel] = None) -> Union[dict, None]:
    try:
        cor_ssd_file = [file for file in files if file.name.endswith(".cor_ssd.json")][0]
        ssd_file = [file for file in files if file.name.endswith(".ssd.json")][0]
    except Exception as e:
        raise Exception("Invalid input files: %s" % files[0], e)

    cor_ssd = SSD.from_file(cor_ssd_file, ont).clear_serene_footprint()
    ssd = SSD.from_file(ssd_file, ont)

    chuffed_ssds = []
    for file in files:
        if file.name.find(".chuffed") != -1:
            objs = deserializeJSON(file)
            chuffed_ssds.append([SSD.from_json(obj, ont) for obj in objs])

    if gold_sm is None:
        # SERENE can filter the cor_ssd graph to remove new semantic types
        gold_graph = cor_ssd.graph
    else:
        gold_graph = gold_sm.graph

    eval_results = {}
    for chuffed_idx, ssds in enumerate(chuffed_ssds):
        eval_results[chuffed_idx] = {}
        if len(ssds) == 0:
            eval_results[chuffed_idx] = {'precision': 0, 'recall': 0, 'f1': 0}
        else:
            ssd = ssds[0]
            # ssd.graph.render()
            result = smodel_eval.f1_precision_recall(gold_graph, ssd.graph, DataNodeMode.NO_TOUCH)
            eval_results[chuffed_idx]['precision'] = result['precision']
            eval_results[chuffed_idx]['recall'] = result['recall']
            eval_results[chuffed_idx]['f1'] = result['f1']

    return eval_results
def get_karma_models(dataset: str) -> List[KarmaModel]:
    """Get the list of Karma JSON models of a given dataset"""
    global _data_io_vars

    if dataset not in _data_io_vars["karma_models"]:
        # not cached in memory yet; try the on-disk cache first
        cache_file = get_cache_dir(dataset) / 'karma_models.json'
        if cache_file.exists():
            karma_models = deserializeJSON(cache_file, Class=KarmaModel)
        else:
            karma_models = []
            model_dir = Path(config.datasets[dataset].karma_version.as_path()) / "models-json"
            ont = get_ontology(dataset)
            for file in sorted(model_dir.iterdir()):
                if file.name.endswith(".json"):
                    karma_models.append(KarmaModel.load_from_file(ont, file))
            serializeJSON(karma_models, cache_file)

        _data_io_vars["karma_models"][dataset] = karma_models

    return _data_io_vars["karma_models"][dataset]
def draw_graph(same_dir: bool):
    finput = Path("/tmp/sm_debugging/draw_graphs.json")
    graphs_input = deserializeJSON(finput)

    # find the largest existing draw_graph_<N> directory index
    new_id = -1
    for item in finput.parent.iterdir():
        match = re.match(r"draw_graph_(\d+)$", item.name)
        if match is not None:
            if int(match.groups()[0]) > new_id:
                new_id = int(match.groups()[0])

    if not same_dir:
        new_id += 1

    output = finput.parent / f"draw_graph_{new_id}"
    output.mkdir(exist_ok=True)
    n_graphs = len(list(output.iterdir()))

    graphs = [Graph.from_dict(o) for o in graphs_input["graphs"]]
    with ThreadPool() as p:
        p.map(lambda ig: ig[1].render2img(output / f"graph_{ig[0]}.png"),
              enumerate(graphs, start=n_graphs))
with ThreadPool() as p:
    p.map(lambda igs: igs[1].render2pdf(output_dir / igs[2] / f"example_no_{igs[0]}.pdf"),
          render_graphs)


if __name__ == "__main__":
    # Load all inputs
    dataset = sys.argv[1]
    workdir = Path(sys.argv[2])
    train_or_test_file = workdir / sys.argv[3]
    assert workdir.exists()
    assert train_or_test_file.exists()

    semantic_models = {sm.id: sm for sm in get_semantic_models(dataset)}
    timer = pyutils.progress.Timer().start()
    examples = deserializeJSON(train_or_test_file)
    # for example, map_example in zip(train_examples, train_map_examples):
    #     example["map_link2label"] = map_example

    if 'train' in str(train_or_test_file).lower():
        render_examples(examples, train_or_test_file.parent / "train_viz")
    elif 'test' in str(train_or_test_file).lower():
        render_examples(examples, train_or_test_file.parent / "test_viz")
    else:
        print("Cannot detect whether the input file is train or test. Exit!!")
        exit(0)

    print("Render examples: %s" % timer.lap().get_total_time(), flush=True)
args = get_shell_args()
dataset = args.dataset

settings = Settings.get_instance(False)
settings.n_samples = args.n_samples
settings.random_seed = args.seed
settings.log_current_settings()

ont = get_ontology(dataset)
source_dir = Path(config.datasets[dataset].as_path()) / "karma-version" / "sources"
source_dir.mkdir(exist_ok=True, parents=True)

meta_file = source_dir / ".meta"
if meta_file.exists():
    meta = deserializeJSON(meta_file)
    if meta['n_samples'] == settings.n_samples and meta['random_seed'] == settings.random_seed:
        print("No need to prepare karma sources: they were already generated with the same configuration. Terminating...!")
        exit(0)

print(f"Generate karma sources for dataset: {dataset}")
serializeJSON(
    {
        'n_samples': settings.n_samples,
        'random_seed': settings.random_seed
    },
    meta_file,
def from_file(file: Union[str, Path], ont: Ontology) -> 'SSD':
    content = deserializeJSON(file)
    return SSD.from_json(content, ont)
# if frame.f_code.co_filename.startswith("/Users/rook/workspace/DataIntegration/SourceModeling/"):
#     print("%s, %s:%d" % (event, frame.f_code.co_filename, frame.f_lineno))
# else:
#     print(".", end="")
# return trace
#
# args = {'children_uris': set(),
#         'parents_uris': set(),
#         'uri': 'http://www.w3.org/2000/01/rdf-schema#Resource'}
#
# OntGraphNode("haha", set(), set())
# a = OntGraphNode(**args)
# print("==========")
# print(a.uri)
# sys.settrace(trace)

ont_graph: OntGraph = deserializeJSON(
    config.fsys.debug.as_path() + '/%s/cached/ont_graph.json' % dataset, OntGraph)
ont: Ontology = deserialize(config.fsys.debug.as_path() + '/%s/cached/ont.pkl' % dataset)

# print(a.uri)
# print("========SIGSEGV IN DEBUG MODE==")
# ont = Ontology.from_data_source(data_source)
# ont_graph = build_ont_graph(data_source)

# %%
# ont_graph.render2txt(config.fsys.debug.as_path() + '/%s/ont_graph.txt' % data_source)
# %%

s1 = ont.full_uri('crm:E63_Beginning_of_Existence')
def _semantic_labeling(
        self, train_source_ids: Set[str],
        test_source_ids: Set[str]) -> Dict[str, MinhptxSemanticLabelingResult]:
    """Generate semantic labeling for test_sources using train_sources"""
    need_reexec = True
    if Path(self.meta_file).exists():
        # read the meta file and check whether the previous run is compatible with the current one
        self.logger.debug("Load information from previous run...")
        meta = deserializeJSON(self.meta_file)
        meta["training_sources"] = set(meta["training_sources"])
        meta["testing_sources"] = set(meta["testing_sources"])
        meta["source_ids"] = set(meta['source_ids'])

        new_meta = self.get_meta(train_source_ids, test_source_ids)
        if len(new_meta.pop("testing_sources").difference(meta.pop("testing_sources"))) == 0:
            if new_meta == meta:
                need_reexec = False

    if need_reexec:
        self.logger.debug("Re-execute semantic labeling...")
        try:
            # prepare the data; we want to compute semantic models for all sources in the dataset
            data_dir = Path(config.datasets[self.dataset].data.as_path())
            model_dir = Path(config.datasets[self.dataset].models_json.as_path())

            shutil.rmtree(str(self.input_dir))
            for fpath in self.output_dir.iterdir():
                os.remove(fpath)

            [(self.input_dir / x / y).mkdir(parents=True, exist_ok=True)
             for x in ["%s_train" % self.dataset, "%s_test" % self.dataset]
             for y in ["data", "model"]]

            input_train_dir = self.input_dir / ("%s_train" % self.dataset)
            input_test_dir = self.input_dir / ("%s_test" % self.dataset)

            for fpath in sorted(data_dir.iterdir()):
                model_fname = fpath.stem + "-model.json"
                if fpath.stem in train_source_ids:
                    self._copy_data(fpath, input_train_dir / "data" / fpath.name)
                    # serialize the model instead of copying it, because we want to convert the URIs
                    # to simplified URIs (e.g. karma:classLink); full URIs don't work in this app
                    serializeJSON(KarmaModel.load_from_file(self.ont, model_dir / model_fname).to_normalized_json_model(),
                                  input_train_dir / "model" / f"{fpath.name}.model.json",
                                  indent=4)
                if fpath.stem in test_source_ids:
                    self._copy_data(fpath, input_test_dir / "data" / fpath.name)
                    # same reason as above
                    serializeJSON(KarmaModel.load_from_file(self.ont, model_dir / model_fname).to_normalized_json_model(),
                                  input_test_dir / "model" / f"{fpath.name}.model.json",
                                  indent=4)

            invoke_command(" ".join([
                config.previous_works.minhptx_iswc2016.cli.as_path(),
                str(self.input_dir),
                str(self.output_dir),
                "--train_dataset", "%s_train" % self.dataset,
                "--test_dataset", "%s_test" % self.dataset,
                "--evaluate_train_set", "True",
                "--reuse_rf_model", "False"
            ]), output2file=self.exec_dir / "execution.log")
        except Exception:
            sys.stdout.flush()
            self.logger.exception("Error while preparing and invoking the semantic labeling api...")
            raise

        serializeJSON(self.get_meta(train_source_ids, test_source_ids), self.meta_file, indent=4)

    # load the result
    self.logger.debug("Load previous result...")
    output_files = [fpath for fpath in self.output_dir.iterdir() if fpath.suffix == ".json"]
    assert len(output_files) == 2

    app_result: Dict[str, MinhptxSemanticLabelingResult] = deserializeJSON(
        output_files[0], Class=MinhptxSemanticLabelingResult)
    app_result.update(deserializeJSON(output_files[1], Class=MinhptxSemanticLabelingResult))

    return {
        source_id: app_result[source_id]
        for source_id in chain(test_source_ids, train_source_ids)
    }
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
from pathlib import Path
from typing import Dict, Tuple, List, Set, Union, Optional, Any

from semantic_modeling.config import config
from semantic_modeling.utilities.serializable import deserializeJSON, serializeJSON

"""Usually run after generating the R2RML models and copying them from KARMA_HOME"""

dataset = "museum_edm"
model_dir = Path(config.datasets[dataset].karma_version.as_path()) / "models-json"

for file in sorted(model_dir.iterdir()):
    sm = deserializeJSON(file)
    sm['id'] = Path(sm['id']).stem
    sm['name'] = sm['id']

    serializeJSON(sm, model_dir / f"{sm['id']}-model.json", indent=4)
    os.remove(file)
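# A hedged illustration of the renaming above (the id value is hypothetical): a Karma
# model exported with an id that is a file path, such as "sources/s01-cb.csv", ends up
# with id == name == "s01-cb" and is rewritten to "s01-cb-model.json".
assert Path("sources/s01-cb.csv").stem == "s01-cb"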
Settings.get_instance().parallel_gmtk_n_threads = 6
Settings.get_instance().parallel_n_process = 2

training_args = TrainingArgs.parse_shell_args()
# model = create_default_model(dataset, train_sms, training_args, workdir / "models")
# model = Model.from_file(dataset, workdir / "models" / "exp_no_0")
# model = online_learning(model, dataset, train_sms, train_sms, workdir, training_args, iter_range=(1, 3))
# model = Model.from_file(dataset, workdir / "models" / "exp_no_2")
# build_test_data(model, dataset, train_sms, test_sms, workdir, 2)
# predictions = predict_sm(model, dataset, [sm.id for sm in train_sms], test_sms, model_dir)
# evaluate(test_sms, predictions, model_dir)

# train_examples = deserializeJSON(workdir / "examples" / f"train.2.json", Class=Example)
test_examples = deserializeJSON(workdir / "examples" / "test.json", Class=Example)
# test_examples = train_examples

args = TrainingArgs.parse_shell_args()
args.parallel_training = True
args.n_switch = 19
args.n_epoch = 22
args.mini_batch_size = 200
args.shuffle_mini_batch = True
# args.n_iter_eval = 50
# args.optparams = {"lr": 0.005, "amsgrad": True}
# args.optimizer = 'LBFGS'
# args.optparams = {"lr": 0.1}
args.optparams = {"lr": 0.1, "amsgrad": True}

model_bundle = train_model(dataset, [sm.id for sm in train_sms], 120,
numpy.average([x['f1'] for x in average_result])))


if __name__ == "__main__":
    if len(sys.argv) > 1:
        workdir = Path(sys.argv[1])
    else:
        workdir = Path("/workspace/semantic-modeling/debug/museum_crm/run3/")

    kfold_dirs = [dpath for dpath in workdir.iterdir() if dpath.name.startswith("kfold")]
    for kfold_dir in kfold_dirs:
        if not kfold_dir.is_dir():
            continue

        rust_input = deserializeJSON(kfold_dir / "rust-input.json")
        dataset = rust_input['dataset']
        semantic_models = get_semantic_models(dataset)
        train_sms = [semantic_models[i] for i in rust_input['train_sm_idxs']]
        test_sms = [semantic_models[i] for i in rust_input['test_sm_idxs']]

        ranker = Ranking(train_sms, test_sms)
        predictions = [
            Prediction(obj)
            for obj in deserializeJSON(kfold_dir / "rust" / "prediction.json")
        ]

        print(kfold_dir.name)
        ranker.rank(predictions)
def print_cooccurrence(features_file_content: dict, output_file: Path):
    serializeJSON(features_file_content['cooccurrence'], output_file, indent=4)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        workdir = Path(sys.argv[1])
    else:
        workdir = Path("/workspace/semantic-modeling/debug/museum_crm/run")

    for kfold_dir in workdir.iterdir():
        if kfold_dir.name.startswith("kfold") and kfold_dir.is_dir():
            input_file = kfold_dir / "rust-input.json"
            output_dir = kfold_dir / "features"
            output_dir.mkdir(exist_ok=True)

            with open(input_file, "r") as f:
                rust_input = ujson.load(f)

            print_primary_keys(rust_input, output_dir / "pk.txt")
            print_stypes(rust_input, output_dir / "stypes.txt")

            if (kfold_dir / "rust" / "examples.debug.features.json").exists():
                features = deserializeJSON(kfold_dir / "rust" / "examples.debug.features.json")
                print_triple_features(features, output_dir / "triple_features.train.csv",
                                      output_dir / "triple_features.test.csv")
                print_cooccurrence(features, output_dir / "cooccurrence.json")