def compute_mohsen_stypes(dataset: str, train_sms: List[SemanticModel]):
    """Compute semantic types for every semantic model of `dataset` and store them as JSON.

    One `worker_get_stype` task is submitted per semantic model. A model that is
    part of `train_sms` is typed with itself removed from the training set.

    :param dataset: name of the dataset whose semantic models are processed
    :param train_sms: semantic models used for training
    :return: dict mapping semantic model id to the result of `worker_get_stype`
    """
    sms = get_semantic_models(dataset)
    # a set, because membership is tested once per semantic model below
    train_sm_ids = {sm.id for sm in train_sms}
    train_name = get_short_train_name(train_sms)

    # always start from an empty working directory
    exec_dir = Path(config.fsys.debug.as_path()) / "tmp" / f"mohsen-styper-{train_name}"
    if exec_dir.exists():
        shutil.rmtree(exec_dir)
    exec_dir.mkdir(exist_ok=True, parents=True)

    semantic_types = {}

    # now we parallel to save time
    # with ThreadPool(os.cpu_count() // 2) as pool:
    with ThreadPool(6) as pool:
        results = {}
        # because karma re-learn semantic types for every data source, we parallel for every data source
        for sm in sms:
            if sm.id in train_sm_ids:
                # leave the source itself out of its own training set
                local_train_sms = [s for s in train_sms if s.id != sm.id]
            else:
                local_train_sms = train_sms

            local_exec_dir = exec_dir / sm.id
            local_exec_dir.mkdir(exist_ok=True)

            results[sm.id] = pool.apply_async(
                worker_get_stype, (dataset, local_train_sms, sm, local_exec_dir))

        # results must be collected while the pool is still alive
        for sid, result in results.items():
            semantic_types[sid] = result.get()

    output_dir = Path(config.datasets[dataset].karma_version.as_path()) / "semantic-types"
    # parents=True: do not fail when the karma-version folder has not been created yet
    output_dir.mkdir(exist_ok=True, parents=True)
    serializeJSON(semantic_types, output_dir / f"{train_name}.json", indent=4)
    return semantic_types
def save_evaluation_result(map_and_nll_examples: Iterable[Tuple[
        MAPAssignmentExample, NegativeLogLikelihoodExample]],
                           fpath: Union[Path, str]) -> None:
    """Serialize the MAP prediction and log-probability of each example to `fpath`.

    For every (MAP example, NLL example) pair, one `OutputExample` is written with:
    the id of the underlying example, the MAP value of each link (keyed by link id),
    and the log-probability of the assignment in which every variable is encoded as
    `True` (sum of factor scores minus `logZ`).

    :param map_and_nll_examples: aligned pairs of MAP and NLL examples
    :param fpath: destination JSON file
    """
    outputs = []
    for map_example, nll_example in map_and_nll_examples:
        # every variable of one example points back to the same source example
        real_example: Example = nll_example.variables[0].triple.example
        map_assignment: Dict[TripleLabel, BinaryVectorValue[bool]] = \
            map_example.get_map_assignment()

        link2labels = {
            var.triple.link.id: val.val
            for var, val in map_assignment.items()
        }

        desired_assignment = {
            var: var.domain.encode_value(True)
            for var in nll_example.variables
        }
        log_prob = sum(
            f.score_assignment(desired_assignment)
            for f in nll_example.factors) - nll_example.inference.logZ()

        outputs.append(
            OutputExample(real_example.example_id, link2labels, log_prob))

    serializeJSON(outputs, fpath)
def build_test_data(model: Model, dataset: str, train_sms: List[SemanticModel],
                    discover_sources: List[SemanticModel], output_dir: Path,
                    n_iter):
    """Build the test examples of `discover_sources` and write them to `test.{n_iter}.json`.

    Each source contributes its ground-truth graph plus every distinct candidate
    returned by `generate_data`; candidates are de-duplicated by their hashable
    graph key. The examples are sorted by `example_id` before being serialized
    into `output_dir / "examples"`.
    """
    data: Dict[str, Dict[bytes, Example]] = {}
    for source in discover_sources:
        data[source.id] = {}
    discover_sids = {source.id for source in discover_sources}

    examples_dir = output_dir / "examples"
    examples_dir.mkdir(exist_ok=True, parents=True)

    # default should have ground-truth
    for source in discover_sources:
        gold_key = graph_to_hashable_string(source.graph)
        gold_id = Example.generate_example_id(source.id, 0, 0)
        data[source.id][gold_key] = make_example(
            source, source.graph, gold_id, [s.id for s in train_sms])

    new_data = generate_data(model, dataset, train_sms, discover_sources, 1)
    for source in discover_sources:
        known = data[source.id]
        candidates = new_data[source.id]
        # only keep candidates whose graph has not been seen yet
        for key in candidates:
            if key not in known:
                known[key] = candidates[key]

    test_examples = []
    for sid in discover_sids:
        test_examples.extend(data[sid].values())
    test_examples.sort(key=lambda e: e.example_id)

    serializeJSON(test_examples, examples_dir / f"test.{n_iter}.json")
def make_dataset(sm: SemanticModel, tbl: DataTable, ont: Ontology,
                 serene_data_dir: Path, serene_sm_dir: Path):
    """Export a data table as a flat CSV file and its semantic model as an `.ssd` file.

    Nested rows are expanded first: list-valued fields are combined through a
    cross product, then nested dictionaries are flattened into
    `Schema.PATH_DELIMITER`-joined column names. The CSV header is taken from
    the first flattened row.
    """

    def cross_products(row: dict) -> Union[List, Dict]:
        """Return `row` itself (as a dict) when it has no list-valued field,
        otherwise one dict per combination of the list-valued fields."""
        scalars = {}
        repeated = {}
        for name, value in row.items():
            if isinstance(value, list):
                repeated[name] = value
            elif isinstance(value, dict):
                expanded = cross_products(value)
                if isinstance(expanded, dict):
                    scalars[name] = expanded
                elif isinstance(expanded, list):
                    repeated[name] = expanded
                else:
                    raise Exception("Invalid result type: %s" % type(expanded))
            else:
                scalars[name] = value

        if len(repeated) == 0:
            return scalars

        names, choices = list(zip(*repeated.items()))
        combined_rows = []
        for combination in itertools.product(*choices):
            combined = copy(scalars)
            for name, value in zip(names, combination):
                combined[name] = value
            combined_rows.append(combined)
        return combined_rows

    def flatten_row(row: dict) -> dict:
        """Collapse nested dictionaries into a single level of path-like keys."""
        flat = {}
        for name, value in row.items():
            if not isinstance(value, dict):
                flat[name] = value
                continue
            for sub_name, sub_value in flatten_row(value).items():
                flat[f"{name}{Schema.PATH_DELIMITER}{sub_name}"] = sub_value
        return flat

    # flatten a data table
    flatten_rows = []
    for raw_row in tbl.rows:
        expanded = cross_products(raw_row)
        if isinstance(expanded, dict):
            expanded = [expanded]
        flatten_rows.extend(flatten_row(r) for r in expanded)

    # print(DataTable.load_from_rows("", flatten_rows).to_string())
    keys = list(flatten_rows[0])
    body = [[flat[k] for k in keys] for flat in flatten_rows]
    serializeCSV([keys] + body, serene_data_dir / f"{sm.id}.csv")

    # create ssds
    ssd = make_ssd(sm, set(keys), ont)
    serializeJSON(ssd.to_dict(), serene_sm_dir / f"{sm.id}.ssd", indent=4)
def get_semantic_models(dataset: str) -> List[SemanticModel]:
    """Get list of semantic models of a given dataset.

    Lookup order: the in-memory registry `_data_io_vars`, then the
    `semantic_models.json` file in the dataset cache directory, and finally a
    rebuild from the R2RML mappings applied to the raw data tables. A rebuild
    also registers the resulting data tables and writes the cache file.
    """
    loaded = _data_io_vars["semantic_models"]
    if dataset in loaded:
        return loaded[dataset]

    cache_file = get_cache_dir(dataset) / 'semantic_models.json'
    if cache_file.exists():
        semantic_models = deserializeJSON(cache_file, Class=SemanticModel)
    else:
        mapping_dir = Path(config.datasets[dataset].models_y2rml.as_path())
        R2RML.load_python_scripts(
            Path(config.datasets[dataset].python_code.as_path()))
        raw_tables = get_raw_data_tables(dataset)

        semantic_models = []
        tables = []
        for raw_tbl in raw_tables:
            mapping = R2RML.load_from_file(
                mapping_dir / f"{raw_tbl.id}-model.yml")
            tbl, sm = mapping.apply_build(raw_tbl)
            semantic_models.append(sm)
            tables.append(tbl)

        serializeJSON(semantic_models, cache_file)
        # the tables come for free when the models are rebuilt
        _data_io_vars["data_tables"][dataset] = tables

    loaded[dataset] = semantic_models
    return loaded[dataset]
def online_learning(model: Model,
                    dataset: str,
                    train_sms: List[SemanticModel],
                    discover_sources: List[SemanticModel],
                    output_dir: Path,
                    training_args,
                    iter_range=(1, 3)):
    """Alternate between generating candidate examples and re-training the model.

    In every iteration of `range(*iter_range)`, `generate_data` is called with
    the current model, unseen candidates are added to the example pool, the
    pooled examples of the training sources are written to
    `examples/train.{n_iter}.json` (and copied to `examples/train.json`), and a
    new model is trained on them.

    :return: the model produced by the last iteration (the input model when the
        range is empty)
    """
    logger = get_logger("app")
    examples_dir = output_dir / "examples"

    data: Dict[str, Dict[bytes, Example]] = {}
    for source in discover_sources:
        data[source.id] = {}
    discover_sids = {source.id for source in discover_sources}
    # those should not include in the discovery_helper process because of no new sources
    ignore_sids = set()

    examples_dir.mkdir(exist_ok=True, parents=True)

    # default should have ground-truth
    for source in discover_sources:
        gold_key = graph_to_hashable_string(source.graph)
        gold_id = Example.generate_example_id(source.id, 0, 0)
        data[source.id][gold_key] = make_example(
            source, source.graph, gold_id, [s.id for s in train_sms])

    for n_iter in range(*iter_range):
        logger.info("==================================> Iter: %s", n_iter)
        new_data = generate_data(model, dataset, train_sms, discover_sources,
                                 n_iter)

        for source in discover_sources:
            if source.id in ignore_sids:
                continue

            known = data[source.id]
            fresh_keys = [
                key for key in new_data[source.id] if key not in known
            ]
            if len(fresh_keys) > 0:
                for key in fresh_keys:
                    known[key] = new_data[source.id][key]
            else:
                # no new candidate sms
                logger.info("No new candidate for source: %s", source.id)
                ignore_sids.add(source.id)

        train_examples = []
        for train_sm in train_sms:
            if train_sm.id in discover_sids:
                train_examples.extend(data[train_sm.id].values())
        train_examples.sort(key=lambda e: e.example_id)

        iter_file = examples_dir / f"train.{n_iter}.json"
        serializeJSON(train_examples, iter_file)
        shutil.copyfile(iter_file, examples_dir / f"train.json")

        raw_model, tf_domain, pairwise_domain, __ = train_model(
            dataset, [s.id for s in train_sms], 120, train_examples, [],
            training_args, output_dir / "models")
        model = Model(dataset, raw_model, tf_domain, pairwise_domain)

    return model
def get_instance(dataset: str, train_sms: List[SemanticModel]):
    """Return the shared `PrimaryKey` instance, building or loading it on first use.

    The instance is read from `weak_models/primary_keys.json` in the cache
    directory when that file exists. Otherwise a pseudo primary key is chosen
    for every class label found in `train_sms` and the result is cached:

    * a class node with a `karma:classLink` data link uses that link;
    * any other class uses the link label with the highest score summed over
      all predictions made for that class.

    :param dataset: name of the dataset
    :param train_sms: semantic models used to derive the keys
    :return: `PrimaryKey.instance`
    """
    if PrimaryKey.instance is None:
        cache_file = get_cache_dir(
            dataset, train_sms) / "weak_models" / "primary_keys.json"
        if not cache_file.exists():
            train_sm_ids = {sm.id for sm in train_sms}
            train_tbls = {
                tbl.id: tbl
                for tbl in get_data_tables(dataset) if tbl.id in train_sm_ids
            }
            predictions: Dict[str, List[dict]] = defaultdict(list)
            pesudo_primary_keys = {}

            for sm in train_sms:
                jsonld_objects = jsonld_generator(sm, train_tbls[sm.id])
                for n in sm.graph.iter_class_nodes():
                    # labels of the links that point to data nodes
                    fields = [
                        e.label.decode("utf-8")
                        for e in n.iter_outgoing_links()
                        if e.get_target_node().is_data_node()
                    ]
                    if len(fields) == 0:
                        continue
                    if 'karma:classLink' in fields:
                        pesudo_primary_keys[n.label] = 'karma:classLink'
                        continue

                    results = extract_node_data(n, jsonld_objects)
                    views = create_unique_views(results, fields)
                    predictions[n.label].append(
                        predict_pesudo_keys(fields, views))

            for class_lbl, preds in predictions.items():
                # sum the per-link scores over all predictions of this class
                total = defaultdict(int)
                for pred in preds:
                    for link_lbl in pred:
                        total[link_lbl] += pred[link_lbl]

                pesudo_primary_keys[class_lbl] = max(total.items(),
                                                     key=lambda x: x[1])[0]

            PrimaryKey.instance = PrimaryKey({
                k: v.encode('utf-8')
                for k, v in pesudo_primary_keys.items()
            })
            cache_file.parent.mkdir(exist_ok=True, parents=True)
            serializeJSON(PrimaryKey.instance, cache_file, indent=4)
        else:
            PrimaryKey.instance: PrimaryKey = deserializeJSON(
                cache_file, Class=PrimaryKey)

    return PrimaryKey.instance
def get_ont_graph(dataset: str) -> OntGraph:
    """Return the ontology graph of `dataset`.

    The graph is taken from the in-memory registry `_ont_graph_vars` when
    present, otherwise from `<debug>/<dataset>/cached/ont_graph.json`, and is
    built with `build_ont_graph` (and written to that file) as a last resort.
    """
    if dataset in _ont_graph_vars:
        return _ont_graph_vars[dataset]

    cache_file = Path(config.fsys.debug.as_path() +
                      f'/{dataset}/cached/ont_graph.json')
    cache_file.parent.mkdir(exist_ok=True, parents=True)

    if not cache_file.exists():
        ont_graph: OntGraph = build_ont_graph(dataset)
        serializeJSON(ont_graph, cache_file)
    else:
        ont_graph = deserializeJSON(cache_file, Class=OntGraph)

    _ont_graph_vars[dataset] = ont_graph
    return _ont_graph_vars[dataset]
def make_test_from_prediction(train_sms: List[SemanticModel],
                              evaluate_sms: List[SemanticModel], workdir: Path,
                              model_dir: Path):
    """Turn the graphs stored in `search_history.json` into test examples.

    One example is made for every graph recorded under every source id of
    `model_dir / "search_history.json"`. The examples are written to
    `workdir / "examples" / "test.json"` and returned.
    """
    search_history: Dict[str, List[List[dict]]] = deserializeJSON(
        model_dir / "search_history.json")
    sm_by_id = {sm.id: sm for sm in evaluate_sms}
    train_sm_ids = [sm.id for sm in train_sms]

    test_examples = []
    for sid, steps in search_history.items():
        for step_no, graphs in enumerate(steps):
            for graph_no, raw_graph in enumerate(graphs):
                # note the argument order: graph position first, then step position
                eid = Example.generate_example_id(sid, graph_no, step_no)
                test_examples.append(
                    make_example(sm_by_id[sid], Graph.from_dict(raw_graph),
                                 eid, train_sm_ids))

    serializeJSON(test_examples, workdir / "examples" / "test.json")
    return test_examples
def get_karma_models(dataset: str) -> List[KarmaModel]:
    """Get list of json models of a given dataset.

    Lookup order: the in-memory registry `_data_io_vars`, then the
    `karma_models.json` cache file, and finally the `*.json` files of the
    dataset's `models-json` folder (loaded in sorted order and then cached).
    """
    loaded = _data_io_vars["karma_models"]
    if dataset in loaded:
        return loaded[dataset]

    cache_file = get_cache_dir(dataset) / 'karma_models.json'
    if not cache_file.exists():
        model_dir = Path(
            config.datasets[dataset].karma_version.as_path()) / "models-json"
        ont = get_ontology(dataset)
        karma_models = [
            KarmaModel.load_from_file(ont, file)
            for file in sorted(model_dir.iterdir())
            if file.name.endswith(".json")
        ]
        serializeJSON(karma_models, cache_file)
    else:
        karma_models = deserializeJSON(cache_file, Class=KarmaModel)

    loaded[dataset] = karma_models
    return loaded[dataset]
def create_rust_input(dataset: str, scenario: Scenario, train_sms, test_sms):
    """Generate candidate semantic models and export them for the rust side.

    Writes into the `mohsen_jws2015` cache folder: the oracle evaluation of the
    candidates for the training sources and for the test-only sources (two CSV
    files), and `rust-karma-pred-input.json`, which maps every test source id
    to the serialized graphs of its candidates.
    """
    exec_dir = get_cache_dir(dataset, train_sms) / "mohsen_jws2015"
    modeler = MohsenSemanticModeling(
        dataset,
        False,
        False, [sm.id for sm in train_sms],
        exec_dir=exec_dir,
        sm_type_dir=Path(config.fsys.debug.as_path()) / "tmp" /
        "models-json-temp")
    candidate_smss = modeler.sm_candidate_generation(train_sms, test_sms)

    data_node_mode = (DataNodeMode.IGNORE_DATA_NODE
                      if scenario == Scenario.SCENARIO_1 else
                      DataNodeMode.NO_TOUCH)

    train_sm_ids = {sm.id for sm in train_sms}
    # test sources that were not part of the training set
    real_test_sm_ids = {
        sm.id
        for sm in test_sms if sm.id not in train_sm_ids
    }

    train_eval_hist = get_eval_hist(train_sm_ids, test_sms, candidate_smss,
                                    data_node_mode)
    test_eval_hist = get_eval_hist(real_test_sm_ids, test_sms, candidate_smss,
                                   data_node_mode)

    serializeCSV(
        train_eval_hist,
        exec_dir / f"evaluation_result_{scenario.value}.train.oracle.csv")
    serializeCSV(
        test_eval_hist,
        exec_dir / f"evaluation_result_{scenario.value}.test.oracle.csv")

    # now create rust bridge
    obj = {
        gold_sm.id: [c.graph.to_dict() for c in candidate_sms]
        for gold_sm, candidate_sms in zip(test_sms, candidate_smss)
    }
    serializeJSON(obj, exec_dir / "rust-karma-pred-input.json")
def serialize_rust_input(dataset: str, workdir: str,
                         train_sms: List[SemanticModel],
                         test_sms: List[SemanticModel], foutput: Path):
    """Collect everything the rust program needs into one JSON file.

    The file at `foutput` holds the dataset name, the working directory, all
    semantic models of the dataset, the positions of the train/test models in
    that list, the predicted parent semantic types, the primary keys, the
    cardinality matrices and the serialized ontology graph.
    """
    primary_key = PrimaryKey.get_instance(dataset, train_sms)

    sms = get_semantic_models(dataset)
    sm_index = {}
    for position, sm in enumerate(sms):
        sm_index[sm.id] = position
    train_sm_idxs = [sm_index[sm.id] for sm in train_sms]
    test_sm_idxs = [sm_index[sm.id] for sm in test_sms]

    predicted_parent_stypes = serialize_stype_assistant(
        dataset, sms, train_sms, test_sms)
    cardinality = CardinalityFeatures.get_instance(dataset)
    # invoked before the models are dumped below; its return value is not used here
    semantic_labeling(dataset, train_sms, test_sms)

    cardinality_features = {}
    for sm_id, matrix in cardinality.cardinality_matrices.items():
        cardinality_features[sm_id] = {
            "columns": matrix.columns,
            "matrix": matrix.matrix
        }

    data = {
        "dataset": dataset,
        "workdir": str(workdir),
        "semantic_models": [sm.to_dict() for sm in sms],
        "predicted_parent_stypes": {
            "stype_details": predicted_parent_stypes
        },
        "train_sm_idxs": train_sm_idxs,
        "test_sm_idxs": test_sm_idxs,
        "feature_primary_keys": primary_key.to_dict(),
        "feature_cardinality_features": cardinality_features,
        "ont_graph": serialize_ont_graph(dataset)
    }

    serializeJSON(data, foutput, indent=4)
def predict_sm(model: Model, dataset: str, train_sms: List[SemanticModel],
               evaluate_sms: List[SemanticModel], workdir):
    """Predict a semantic model for every source in `evaluate_sms`.

    `generate_candidate_sm` is run for each source in a process pool. Each
    result is indexed as: [0] scored candidate graphs, [1] search performance
    history, [2] search history (lists of graphs). The second element of the
    first scored candidate is taken as the prediction.

    Three files are written to `workdir`: `predicted_sms.json`,
    `search_performance_history.json` and `search_history.json`.

    :return: dict mapping source id to its predicted graph
    """
    train_sids = [sm.id for sm in train_sms]
    stat = Statistic.get_instance(train_sms)
    model_bundle = (model.dataset, model.model, model.tf_domain,
                    model.pairwise_domain)

    predictions: Dict[str, Graph] = {}
    search_performance_history = {}
    search_history = {}

    with get_pool(Settings.get_instance().parallel_n_process) as pool:
        pending = [
            pool.apply_async(generate_candidate_sm,
                             (dataset, sm, stat, model_bundle, train_sids))
            for sm in evaluate_sms
        ]

        # collect in submission order so results line up with evaluate_sms
        for sm, job in zip(evaluate_sms, pending):
            outcome = job.get()
            predictions[sm.id] = outcome[0][0][1]
            search_performance_history[sm.id] = outcome[1]
            search_history[sm.id] = outcome[2]

    serialized_predictions = {}
    for sid, graph in predictions.items():
        serialized_predictions[sid] = graph.to_dict()
    serializeJSON(serialized_predictions, workdir / "predicted_sms.json")

    serializeJSON(search_performance_history,
                  workdir / "search_performance_history.json",
                  indent=4)

    serialized_history = {}
    for sid, steps in search_history.items():
        serialized_history[sid] = [[g.to_dict() for g in graphs]
                                   for graphs in steps]
    serializeJSON(serialized_history, workdir / "search_history.json")

    return predictions
from semantic_modeling.utilities.serializable import serializeJSON
from transformation.r2rml.commands.modeling import SetInternalLinkCmd, SetSemanticTypeCmd
from transformation.r2rml.r2rml import R2RML

# Export, for every data table of the dataset, the KR2RML mapping (.ttl) and the
# karma JSON model derived from its R2RML (.yml) description.
dataset = "museum_crm"
ont = get_ontology(dataset)

r2rml_dir = Path(
    config.datasets[dataset].as_path()) / "karma-version" / "models-r2rml"
r2rml_dir.mkdir(exist_ok=True, parents=True)

model_dir = Path(config.datasets[dataset].models_y2rml.as_path())

model_json_dir = Path(
    config.datasets[dataset].as_path()) / "karma-version" / "models-json"
model_json_dir.mkdir(exist_ok=True, parents=True)

for tbl in get_data_tables(dataset):
    r2rml = R2RML.load_from_file(model_dir / f"{tbl.id}-model.yml")
    r2rml_file = r2rml_dir / f"{tbl.id}-model.ttl"

    # note that we use a cleaned data table, whatever columns need to create/transform have been done.
    # therefore, we will remove all command that aren't SetSemanticType or SetInternalLink
    modeling_cmds = []
    for cmd in r2rml.commands:
        if isinstance(cmd, (SetSemanticTypeCmd, SetInternalLinkCmd)):
            modeling_cmds.append(cmd)
    r2rml.commands = modeling_cmds

    sm = r2rml.apply_cmds(tbl)
    r2rml.to_kr2rml(ont, tbl, r2rml_file)

    karma_json_file = model_json_dir / f"{tbl.id}-model.json"
    serializeJSON(sm.to_karma_json_model(ont), karma_json_file, indent=4)
def train_model(dataset: str, train_sids: List[str], manual_seed: int,
                train_examples: List[Example], test_examples: List[Example],
                args: TrainingArgs, basedir: Path):
    """Train a `TemplateLogLinearModel` on `train_examples` and store it under `basedir`.

    Steps: annotate the examples and build their triple features, create the
    factor templates, run the optimizer selected by `args.optimizer` for up to
    `args.n_epoch` epochs (mini-batches before epoch `args.n_switch`, the full
    batch afterwards), restore the parameters with the lowest loss among the
    last (up to) three recorded epochs, then serialize the model, the evaluation
    outputs and the input examples.

    :param dataset: name of the dataset
    :param train_sids: ids of the training sources, passed to `ExampleAnnotator`
    :param manual_seed: seed given to `DenseTensorFunc.manual_seed`
    :param train_examples: examples used for training
    :param test_examples: examples only used for evaluation (may be empty, in which
        case the train confusion matrix is reported as the test one)
    :param args: optimizer name, hyper-parameters and schedule
    :param basedir: folder receiving the model and all reports
    :return: tuple `(model, tf_domain, pairwise_domain, optimizer state dict)`
    """
    DenseTensorFunc.manual_seed(manual_seed)

    tf_domain = GrowableBinaryVectorDomain()

    timer = pyutils.progress.Timer().start()
    # keep the raw inputs: the names below are rebound to the annotated versions
    input_train_examples = train_examples
    input_test_examples = test_examples

    # BUILDING VARIABLES NEEDED FOR THE TRAINING
    example_annotator = ExampleAnnotator(dataset,
                                         train_sids,
                                         training_examples=train_examples)
    train_examples = sequential_map(example_annotator.annotate, train_examples)
    train_examples = _(train_examples) \
        .imap(example_annotator.example2vars) \
        .submap(partial(example_annotator.build_triple_features, domain=tf_domain))

    pairwise_domain = example_annotator.build_pairwise_domain()
    # Freeze domain now, we've added all feature values observed in training data
    tf_domain.freeze()

    test_examples = sequential_map(example_annotator.annotate, test_examples)
    test_examples = _(test_examples) \
        .imap(example_annotator.example2vars) \
        .submap(partial(example_annotator.build_triple_features, domain=tf_domain))

    # print domain to debug
    logger.info("Preprocessing take %s" % timer.lap().get_total_time())

    # build random variables
    train_graphs = _(train_examples).submap(lambda t: t.label)
    test_graphs = _(test_examples).submap(lambda t: t.label)

    # build models, select inference method
    model = TemplateLogLinearModel([
        TripleFactorTemplate(
            *TripleFactorTemplate.get_default_args(tf_domain)),
        SubstructureFactorTemplate(
            *SubstructureFactorTemplate.get_default_args(
                pairwise_domain, example_annotator.get_obj_props())),
        # ExternalModelFactorTemplate(*ExternalModelFactorTemplate.get_default_weights())
    ])
    # or load previous training
    # model_dir = config.fsys.debug.as_path() + "/%s/models/exp_no_2" % dataset
    # model, ___, state_dict = deserialize(model_dir + '/gmtk_model.bin')

    inference = BeliefPropagation.get_constructor(InferProb.MARGINAL)
    map_inference = BeliefPropagation.get_constructor(InferProb.MAP)

    train_nll_examples = _(
        train_graphs).map(lambda vars: NegativeLogLikelihoodExample(
            vars, model.get_factors(vars), inference))
    train_map_examples = _(train_nll_examples).map(
        lambda example: MAPAssignmentExample.from_nll_example(
            example, map_inference))
    test_nll_examples = _(
        test_graphs).map(lambda vars: NegativeLogLikelihoodExample(
            vars, model.get_factors(vars), inference))
    test_map_examples = _(test_nll_examples).map(
        lambda example: MAPAssignmentExample.from_nll_example(
            example, map_inference))

    # select training method/parameters, and evaluation
    n_epoch = args.n_epoch
    params = args.optparams
    mini_batch_size = args.mini_batch_size
    n_switch = args.n_switch

    global_step = 0
    # LBFGS re-evaluates the loss itself, so it is driven through a closure
    require_closure = False
    if args.optimizer == 'SGD':
        optimizer = PyTorchOptimizer.SGD(parameters=model.get_parameters(),
                                         **params)
    elif args.optimizer == 'ADAM':
        optimizer = PyTorchOptimizer.Adam(parameters=model.get_parameters(),
                                          **params)
    elif args.optimizer == 'LBFGS':
        optimizer = PyTorchOptimizer.LBFGS(parameters=model.get_parameters(),
                                           **params)
        require_closure = True
    else:
        assert False

    # optimizer.optimizer.load_state_dict(state_dict)

    for template in model.templates:
        if hasattr(template, 'after_update_weights'):
            optimizer.register_on_step(template.after_update_weights)

    logger.info(args.to_string())
    # one tab-indented line per template
    logger.info("Template info: \n%s" % "\n".join(
        ["\t" + template.get_info() for template in model.templates]))
    logger.info("Train size: %s, Test size: %s", len(train_nll_examples),
                len(test_nll_examples))

    reporter = TensorBoard(log_dir=basedir)
    # cast to list to keep train_map_examples & train_nll_examples aligned with each other (batch example may shuffle)
    if args.parallel_training:
        batch_nll_example = ParallelBatchExample(list(train_nll_examples), 0)
    else:
        batch_nll_example = BatchExample(list(train_nll_examples), 0)

    # *********************************************** DEBUG CODE
    # for i, triples in enumerate(train_examples):
    #     example = triples[0].example
    #     if example.model_id.startswith("s03") and example.no_sample == 29:
    #         example.pred_sm.render()
    #         render_factor_graph(model.get_factors(train_graphs[i]), train_graphs[i],
    #                 config.fsys.debug.as_path() + "/tmp/factor_graph.pdf")
    #         exit(0)
    #
    # render_factor_graph(train_nll_examples[0].factors, train_nll_examples[0].variables,
    #                     config.fsys.debug.as_path() + "/tmp/factor_graph.pdf")
    #
    # loss_val_accum = ValueAccumulator()
    # gradient_accum = Tensor1AccumulatorDict()
    # for weights in model.get_parameters():
    #     gradient_accum.track_obj(weights, DenseTensorFunc.zeros_like(weights.val))
    # **********************************************************

    progress = pyutils.progress.Progress(n_epoch)
    progress.start()

    if n_switch > 0:
        examples = list(batch_nll_example.split_random(mini_batch_size))
    else:
        examples = [batch_nll_example]

    cm_train, cm_test = None, None
    loss_history = []
    # parameters of the most recent (at most three) recorded epochs
    param_hists = []

    for i in range(n_epoch):
        logger.info("Iter %s" % i)

        # from epoch n_switch on, train on the full batch
        if i >= n_switch:
            examples = [batch_nll_example]

        if args.shuffle_mini_batch and 0 < i < n_switch:
            examples = batch_nll_example.split_random(mini_batch_size)

        average_loss_val = []
        if not require_closure:
            for example in examples:
                optimizer.zero_grad()
                example.accumulate_value_and_gradient(
                    optimizer.get_value_accumulator(),
                    optimizer.get_gradient_accumulator())
                optimizer.average(example.size())

                logger.info("Accum loss: %.10f" %
                            optimizer.get_value_accumulator().get_value())
                average_loss_val.append(
                    optimizer.get_value_accumulator().get_value())

                # *********************************************** DEBUG GRADIENT
                # numerical_gradient = NumericalGradient(1e-5)
                # for j, e in enumerate(example.examples):
                #     print(f"\rExample {j}/{len(example.examples)}", end="", flush=True)
                #     gradient_accum.clear()
                #     loss_val_accum.clear()
                #     e.accumulate_value_and_gradient(loss_val_accum, gradient_accum)
                #     for template in model.templates:
                #         for weights in template.get_weights():
                #             gradient = gradient_accum.get_value(weights)
                #             approx_gradients = numerical_gradient.compute_gradient(weights, lambda: nll_func(e))
                #             try:
                #                 np.testing.assert_almost_equal(gradient.numpy(), approx_gradients.numpy(), 6)
                #             except Exception:
                #                 logger.exception("Incorrect gradient...")
                #                 print(template, weights.val.tolist())
                #                 print(["%11.8f" % x for x in gradient.tolist()])
                #                 print(["%11.8f" % x for x in approx_gradients.tolist()])
                #                 print(["%11d" % int(np.isclose(x, y, rtol=0, atol=1e-6)) for x, y in zip(gradient, approx_gradients)])
                #
                #                 raise
                # print("\n")
                # **************************************************************

                optimizer.step()
                reporter.loss_val(
                    optimizer.get_value_accumulator().get_value(),
                    global_step)
                global_step += 1
        else:
            for example in examples:

                def closure():
                    optimizer.zero_grad()
                    example.accumulate_value_and_gradient(
                        optimizer.get_value_accumulator(),
                        optimizer.get_gradient_accumulator())
                    optimizer.average(example.size())
                    optimizer.copy_grad()
                    return optimizer.get_value_accumulator().get_value()

                optimizer.step(closure)
                logger.info("Accum loss: %.10f" %
                            optimizer.get_value_accumulator().get_value())
                average_loss_val.append(
                    optimizer.get_value_accumulator().get_value())
                reporter.loss_val(
                    optimizer.get_value_accumulator().get_value(),
                    global_step)
                global_step += 1

        if len(average_loss_val) > 1:
            logger.info("Average accum loss: %.10f" %
                        np.average(average_loss_val))

        # stop as soon as the last accumulated loss is negative
        if optimizer.get_value_accumulator().get_value() < 0:
            break

        if i % args.n_iter_eval == 0 or i == n_epoch - 1:
            cm_train = evaluate(train_map_examples)
            # falls back to the train matrix when evaluate() returns a falsy value
            cm_test = evaluate(test_map_examples) or cm_train
            logger.info('train (class_idx=0): %s',
                        cm_train.precision_recall_fbeta(class_idx=0))
            logger.info('train (class_idx=1): %s',
                        cm_train.precision_recall_fbeta(class_idx=1))
            logger.info('test (class_idx=0): %s',
                        cm_test.precision_recall_fbeta(class_idx=0))
            logger.info('test (class_idx=1): %s',
                        cm_test.precision_recall_fbeta(class_idx=1))

            reporter.precision_recall_fbeta(cm_train,
                                            global_step,
                                            group='train')
            reporter.precision_recall_fbeta(cm_test,
                                            global_step,
                                            group='test')

        loss_history.append(np.average(average_loss_val))
        param_hists.append(model.clone_parameters())
        if len(param_hists) > 3:
            param_hists.pop(0)

        # early stopping for ADAM: the loss went up in each of the last three epochs
        if args.optimizer == "ADAM" and len(loss_history) > 4 and all(
                x - y > 0
                for x, y in zip(loss_history[-3:], loss_history[-4:-1])):
            logger.info("Loss increase after 3 epoches. Stop training!")
            break

        progress.finish_one()

    if args.report_final_loss:
        loss_val_accum = ValueAccumulator()
        batch_nll_example.accumulate_value_and_gradient(loss_val_accum, None)
        logger.info("Average accum loss: %.10f" %
                    (loss_val_accum.get_value() / batch_nll_example.size()))

    logger.info("\n\r%s" % progress.summary())
    cm_train.pretty_print("** TRAIN **",
                          precision_recall_fbeta=True,
                          output_stream=logger.info)
    cm_test.pretty_print("** TEST **",
                         precision_recall_fbeta=True,
                         output_stream=logger.info)

    # save model and move everything into another folder for storage
    reporter.close()
    reporter.export(basedir / 'tensorboard_raw.json')

    # clear all cache
    for template in model.templates:
        if isinstance(template, CachedTemplateFactorConstructor):
            template.clear_cache()

    recent_losses = loss_history[-3:]
    assert len(param_hists) == len(recent_losses)
    # negative offsets into loss_history; derived from the real length so the
    # logged index stays correct when fewer than three epochs were recorded
    recent_idxs = range(-len(recent_losses), 0)
    min_loss, min_params, min_idx = min(zip(recent_losses, param_hists,
                                            recent_idxs),
                                        key=lambda x: x[0])
    logger.info("Select parameters at index: %d. Loss = %s", min_idx,
                min_loss)
    model.update_parameters(min_params)

    serialize(
        (model, tf_domain, pairwise_domain, optimizer.optimizer.state_dict()),
        basedir / 'gmtk_model.bin')
    save_evaluation_result(zip(train_map_examples, train_nll_examples),
                           basedir / 'train.output.json')
    save_evaluation_result(zip(test_map_examples, test_nll_examples),
                           basedir / 'test.output.json')
    serializeJSON(input_train_examples, basedir / "train.json")
    serializeJSON(input_test_examples, basedir / "test.json")

    # attempt to copy log file
    try:
        logger.handlers[1].flush()
        shutil.copy(logger.handlers[1].file_handler.baseFilename,
                    str(basedir / "train.log"))
    except Exception:
        # best effort only: a missing handler or log file must not fail the training
        logger.exception("Cannot backup log...")

    model_id = get_latest_model_id(basedir) + 1
    move_current_files(basedir, model_id)
    logger.info("Save model %s", model_id)
    return model, tf_domain, pairwise_domain, optimizer.optimizer.state_dict()
def _semantic_labeling(
        self, train_source_ids: Set[str], test_source_ids: Set[str]
) -> Dict[str, MinhptxSemanticLabelingResult]:
    """Generate semantic labeling for test_sources using train_sources.

    The external application is only invoked again when the stored meta file
    does not match the current run: the requested test sources must all be
    covered by the previous run and the remaining meta fields must be equal.
    Afterwards the two JSON result files of the output folder are merged and
    the entries of the requested train and test sources are returned.
    """
    need_reexec = True

    if Path(self.meta_file).exists():
        # read meta and compare if previous run is compatible with current run
        self.logger.debug("Load information from previous run...")
        meta = deserializeJSON(self.meta_file)
        for field in ("training_sources", "testing_sources", "source_ids"):
            meta[field] = set(meta[field])

        new_meta = self.get_meta(train_source_ids, test_source_ids)
        requested_tests = new_meta.pop("testing_sources")
        previous_tests = meta.pop("testing_sources")
        # a previous run that labeled a superset of the test sources is still usable
        if len(requested_tests.difference(previous_tests)) == 0 \
                and new_meta == meta:
            need_reexec = False

    if need_reexec:
        self.logger.debug("Re-execute semantic labeling...")

        try:
            # preparing data, want to compute semantic models for all sources in dataset
            data_dir = Path(config.datasets[self.dataset].data.as_path())
            model_dir = Path(
                config.datasets[self.dataset].models_json.as_path())

            shutil.rmtree(str(self.input_dir))
            for stale_file in self.output_dir.iterdir():
                os.remove(stale_file)

            train_name = "%s_train" % self.dataset
            test_name = "%s_test" % self.dataset
            for split_name in [train_name, test_name]:
                for sub_dir in ["data", "model"]:
                    (self.input_dir / split_name / sub_dir).mkdir(
                        parents=True, exist_ok=True)

            input_train_dir = self.input_dir / train_name
            input_test_dir = self.input_dir / test_name
            splits = ((train_source_ids, input_train_dir),
                      (test_source_ids, input_test_dir))

            for fpath in sorted(data_dir.iterdir()):
                model_fname = fpath.stem + "-model.json"
                for source_ids, split_dir in splits:
                    if fpath.stem not in source_ids:
                        continue

                    self._copy_data(fpath, split_dir / "data" / fpath.name)
                    # serialize the model instead of copying it, because we want the simplified uri
                    # instead of the full uri (e.g karma:classLink). Full URI doesn't work in this app
                    karma_model = KarmaModel.load_from_file(
                        self.ont, model_dir / model_fname)
                    serializeJSON(karma_model.to_normalized_json_model(),
                                  split_dir / "model" /
                                  f"{fpath.name}.model.json",
                                  indent=4)

            command = " ".join([
                config.previous_works.minhptx_iswc2016.cli.as_path(),
                str(self.input_dir),
                str(self.output_dir), "--train_dataset", train_name,
                "--test_dataset", test_name, "--evaluate_train_set", "True",
                "--reuse_rf_model", "False"
            ])
            invoke_command(command,
                           output2file=self.exec_dir / "execution.log")
        except Exception:
            sys.stdout.flush()
            self.logger.exception(
                "Error while preparing and invoking semantic labeling api..."
            )
            raise

        # only reached when the run succeeded
        serializeJSON(self.get_meta(train_source_ids, test_source_ids),
                      self.meta_file,
                      indent=4)

    # load result
    self.logger.debug("Load previous result...")
    output_files = []
    for fpath in self.output_dir.iterdir():
        if fpath.suffix == ".json":
            output_files.append(fpath)
    assert len(output_files) == 2

    app_result: Dict[str, MinhptxSemanticLabelingResult] = deserializeJSON(
        output_files[0], Class=MinhptxSemanticLabelingResult)
    app_result.update(
        deserializeJSON(output_files[1], Class=MinhptxSemanticLabelingResult))

    selected = {}
    for source_id in chain(test_source_ids, train_source_ids):
        selected[source_id] = app_result[source_id]
    return selected
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Usually run after generate r2rml and copied from KARMA_HOME.

Every file of the dataset's `models-json` folder is loaded, its `id` is reduced
to the stem of the original id, `name` is set to that id, and the model is
stored as `<id>-model.json`. The source file is removed afterwards unless it
already has that name.
"""
import os
from pathlib import Path
from typing import Dict, Tuple, List, Set, Union, Optional, Any

from semantic_modeling.config import config
from semantic_modeling.utilities.serializable import deserializeJSON, serializeJSON

dataset = "museum_edm"
model_dir = Path(
    config.datasets[dataset].karma_version.as_path()) / "models-json"

for file in sorted(model_dir.iterdir()):
    sm = deserializeJSON(file)
    sm['id'] = Path(sm['id']).stem
    sm['name'] = sm['id']

    target = model_dir / f"{sm['id']}-model.json"
    serializeJSON(sm, target, indent=4)
    # A file that already has its final name was just rewritten in place;
    # removing it would delete the model (e.g. when the script is run twice).
    if file != target:
        os.remove(file)
#!/usr/bin/python
# -*- coding: utf-8 -*-
import ujson
from pathlib import Path
from typing import Dict, Tuple, List, Set, Union, Optional, Any

from semantic_modeling.config import config
from semantic_modeling.data_io import get_data_tables, get_raw_data_tables, get_semantic_models, get_ontology, \
    get_sampled_data_tables
from semantic_modeling.utilities.serializable import serializeJSON
from transformation.r2rml.commands.modeling import SetInternalLinkCmd, SetSemanticTypeCmd
from transformation.r2rml.r2rml import R2RML

# Write the rows of every sampled data table of the dataset to
# <dataset>/karma-version/sources/<table id>.json
dataset = "museum_crm"
ont = get_ontology(dataset)

dataset_dir = Path(config.datasets[dataset].as_path())
source_dir = dataset_dir / "karma-version" / "sources"
source_dir.mkdir(exist_ok=True, parents=True)

for tbl in get_sampled_data_tables(dataset):
    source_file = source_dir / f"{tbl.id}.json"
    serializeJSON(tbl.rows, source_file, indent=4)
config.datasets[dataset].as_path()) / "karma-version" / "sources" source_dir.mkdir(exist_ok=True, parents=True) meta_file = source_dir / ".meta" if meta_file.exists(): meta = deserializeJSON(meta_file) if meta['n_samples'] == settings.n_samples and meta[ 'random_seed'] == settings.random_seed: print( "Don't need to prepare karma sources because it has been generated with same configuration before. Terminating...!" ) exit(0) print(f"Generate karma sources for dataset: {dataset}") serializeJSON( { 'n_samples': settings.n_samples, 'random_seed': settings.random_seed }, meta_file, indent=4) model_dir = Path(config.datasets[dataset].models_y2rml.as_path()) # clear cache file clear_sampled_data_tables(dataset) for tbl, sm in zip(get_sampled_data_tables(dataset), get_semantic_models(dataset)): serializeJSON(tbl.rows, source_dir / f"{tbl.id}.json", indent=4)
def print_cooccurrence(features_file_content: dict, output_file: Path):
    """Write the 'cooccurrence' entry of `features_file_content` to `output_file` as indented JSON."""
    cooccurrence = features_file_content['cooccurrence']
    serializeJSON(cooccurrence, output_file, indent=4)
def run_evaluation_workflow(dataset: str, scenario: Scenario, train_sms, test_sms):
    """Run semantic typing, semantic modeling and evaluation for one train/test split.

    The workflow has five steps: (1) run the semantic typer and dump the typed
    karma models into a temporary folder, (2) predict semantic models with
    ``MohsenSemanticModeling``, (3) score the predicted models, (4) score the
    predicted semantic types, and (5) render the colorized predictions.

    :param dataset: name of the dataset
    :param scenario: evaluation scenario; ``Scenario.SCENARIO_1`` ignores data nodes
        when scoring, any other value leaves them untouched
    :param train_sms: semantic models used for training
    :param test_sms: semantic models used for testing
    :return: list of rows: a header row, one row per test source
        (``source, precision, recall, f1, stype-acc``) and a final ``'average'`` row
    """
    ont: Ontology = get_ontology(dataset)
    karma_models: List[KarmaModel] = get_karma_models(dataset)
    semantic_models: List[SemanticModel] = get_semantic_models(dataset)
    train_sm_ids = [sm.id for sm in train_sms]

    exec_dir = get_cache_dir(dataset, train_sms) / "mohsen_jws2015"
    sm_type_dir = Path(config.fsys.debug.as_path()) / "tmp" / "models-json-temp"
    sdesc_args = dict(
        dataset=dataset,
        train_sm_ids=train_sm_ids,
        # we always put semantic types to learnedSemanticTypes, even for userSetSemanticTypes
        use_correct_type=False,
        use_old_semantic_typer=False,
        exec_dir=exec_dir,
        sm_type_dir=sm_type_dir)

    # STEP 1: run semantic typing and put the result into a (freshly emptied) temporary folder
    if sm_type_dir.exists():
        shutil.rmtree(sm_type_dir)
    sm_type_dir.mkdir(exist_ok=True, parents=True)

    top_k_types = Settings.get_instance().semantic_labeling_top_n_stypes
    typer = create_semantic_typer(dataset, train_sms)
    typer.semantic_labeling(train_sms, test_sms, top_k_types, eval_train=True)
    typer_name = typer.__class__.__name__

    for sm, ksm in zip(semantic_models, karma_models):
        # assign semantic types to learnedSemanticTypes
        sm_alignment = SemanticModelAlignment(sm, ksm)
        for col in ksm.source_columns:
            aligned_node = sm.graph.get_node_by_id(sm_alignment.alignment[col.id])
            attr = sm.get_attr_by_label(aligned_node.label.decode('utf-8'))

            node = ksm.karma_graph.get_node_by_id(col.id)
            link = node.get_first_incoming_link()

            learned_types = []
            for stype in attr.semantic_types:
                learned_types.append(
                    KarmaSemanticType(node.id, stype.domain, stype.type,
                                      typer_name, stype.confidence_score))
            node.learned_semantic_types = learned_types

            # the user-set type is read off the column's first incoming link
            source_label = link.get_source_node().label.decode()
            node.user_semantic_types = [
                KarmaSemanticType(node.id, source_label, link.label.decode(),
                                  "User", 1.0)
            ]

        model_file = sm_type_dir / f"{ksm.id}-model.json"
        serializeJSON(ksm.to_normalized_json_model(ont), model_file, indent=4)

    # STEP 2: invoking semantic modeling
    modeler = MohsenSemanticModeling(**sdesc_args)
    pred_sms = modeler.sm_prediction(train_sms, test_sms)

    # STEP 3: evaluate the predicted semantic models
    eval_hist = [["source", "precision", "recall", "f1", "stype-acc"]]
    data_node_mode = (DataNodeMode.IGNORE_DATA_NODE
                      if scenario == Scenario.SCENARIO_1 else
                      DataNodeMode.NO_TOUCH)

    for sm, pred_sm in zip(test_sms, pred_sms):
        eval_result = smodel_eval.f1_precision_recall(sm.graph, pred_sm.graph,
                                                      data_node_mode)
        stype_acc = smodel_eval.stype_acc(sm.graph, pred_sm.graph)
        eval_hist.append([
            sm.id, eval_result["precision"], eval_result["recall"],
            eval_result["f1"], stype_acc
        ])

    # average every metric column over the per-source rows (header excluded)
    score_rows = eval_hist[1:]
    eval_hist.append(['average'] + [
        np.average([float(row[col_idx]) for row in score_rows])
        for col_idx in range(1, 5)
    ])

    serializeCSV(eval_hist,
                 exec_dir / f"evaluation_result_{scenario.value}.csv")

    # STEP 4: evaluate the predicted semantic labeling
    pred_stypes = modeler.semantic_labeling(train_sms, test_sms)
    for pred_stype, sm in zip(pred_stypes, test_sms):
        for attr in sm.attrs:
            # attributes without a prediction get an empty list of semantic types
            attr.semantic_types = (pred_stype[attr.label]
                                   if attr.label in pred_stype else [])

    eval_sources(test_sms,
                 exec_dir / f"evaluation_result_{scenario.value}_stype.csv")

    # STEP 5: visualize the prediction
    viz_dir = exec_dir / "prediction-viz"
    viz_dir.mkdir(exist_ok=True)

    need_render_graphs = []
    for sm, pred_sm in zip(test_sms, pred_sms):
        auto_label = AutoLabel.auto_label_max_f1(sm.graph, pred_sm.graph, False)[0]
        need_render_graphs.append(
            (colorize_prediction(pred_sm.graph, auto_label), viz_dir / f"{sm.id}.png"))

    with ThreadPool(32) as p:
        p.map(render_graph, need_render_graphs)

    return eval_hist
if __name__ == '__main__':
    # Entry point: evaluate one k-fold split described by the shell arguments and
    # store the scores, the arguments and the working folder in the experiment dir.

    # HYPER-ARGS
    args = get_shell_args()

    Settings.get_instance(
        False
    ).semantic_labeling_top_n_stypes = args.semantic_labeling_top_n_stypes
    Settings.get_instance().semantic_labeling_method = args.semantic_typer
    Settings.get_instance().log_current_settings()

    exp_dir = Path(args.exp_dir)
    # Explicit check instead of `assert`: assertions are stripped under `python -O`,
    # which would let the run proceed and only fail when writing the results.
    if not exp_dir.exists():
        raise FileNotFoundError(
            f"Experiment directory does not exist: {exp_dir}")

    source_models = {sm.id: sm for sm in get_semantic_models(args.dataset)}
    train_sms = [source_models[sid] for sid in args.kfold['train_sm_ids']]
    test_sms = [source_models[sid] for sid in args.kfold['test_sm_ids']]

    eval_hist = run_evaluation_workflow(args.dataset, Scenario.SCENARIO_2,
                                        train_sms, test_sms)

    # All outputs of this fold share the same "kfold-<train name>" prefix.
    fold_name = f"kfold-{get_short_train_name(train_sms)}"

    serializeCSV(eval_hist, exp_dir / f"{fold_name}.test.csv")
    serializeJSON(args, exp_dir / f"{fold_name}.meta.json", indent=4)

    # shutil.move only accepts path-like objects for both arguments since
    # Python 3.9; plain strings work on every version.
    shutil.move(
        str(get_cache_dir(args.dataset, train_sms) / "mohsen_jws2015"),
        str(exp_dir / fold_name))