Example #1
    def __init__(self, dataset: str, dir: Path) -> None:
        self.ont = get_ontology(dataset)
        self.sms = get_semantic_models(dataset)
        self.sm_prefix_index = {sm.id[:3]: sm for sm in self.sms}
        self.sm_attr2stypes: Dict[str, Dict[str, List[SemanticType]]] = {}
        assert len(self.sm_prefix_index) == len(
            self.sms), "semantic model id prefixes must be unique"

        class_uris = set()
        predicates = set()
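        # walk the data nodes of every gold semantic model and record the class URI and
        # predicate of each incoming link; these sets are used below to recover full URIs
        # for the stypes read from the serene output files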
        for sm in self.sms:
            for n in sm.graph.iter_data_nodes():
                e = n.get_first_incoming_link()
                class_uri = e.get_source_node().label.decode()
                predicate = e.label.decode()

                class_uris.add(class_uri)
                predicates.add(predicate)

        for file in dir.iterdir():
            if file.name.endswith(".df.csv"):
                prefix = file.name[:3]
                self.sm_attr2stypes[prefix] = self.read_serene_stypes(file)
                for attr_lbl, stypes in self.sm_attr2stypes[prefix].items():
                    for stype in stypes:
                        stype.domain = self.recover_class_uris(
                            stype.domain, class_uris)
                        stype.type = self.recover_predicates(
                            stype.type, predicates)
Example #2
    def __init__(self,
                 dataset: str,
                 max_n_records: float = float('inf'),
                 is_sampling: bool = False,
                 exec_dir: Optional[Union[Path, str]] = None) -> None:
        self.dataset: str = dataset
        self.ont: Ontology = get_ontology(dataset)
        self.max_n_records: float = max_n_records
        self.is_sampling: bool = is_sampling
        assert not is_sampling, "Not implemented"

        self.source_ids: Set[str] = {
            file.stem
            for file in Path(
                config.datasets[dataset].data.as_path()).iterdir()
            if file.is_file() and not file.name.startswith(".")
        }

        if exec_dir is None:
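            # default to a per-dataset folder under the debug directory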
            exec_dir = Path(
                config.fsys.debug.as_path()) / dataset / "minhptx_iswc2016"
        self.exec_dir: Path = Path(exec_dir)

        self.meta_file: Path = self.exec_dir / "execution-meta.json"
        self.input_dir: Path = self.exec_dir / "input"
        self.input_dir.mkdir(parents=True, exist_ok=True)
        self.output_dir: Path = self.exec_dir / "output"
        self.output_dir.mkdir(parents=True, exist_ok=True)
Example #3
    def __init__(self, dataset: str, use_correct_type: bool, use_old_semantic_typer: bool, train_sm_ids: List[str],
                 exec_dir: Optional[Union[str, Path]] = None, sm_type_dir: Optional[Union[str, Path]] = None):
        self.dataset: str = dataset
        self.train_sm_ids = train_sm_ids
        self.ont = get_ontology(dataset)
        self.karma_models: Dict[str, KarmaModel] = {km.id: km for km in get_karma_models(dataset)}

        # can only be run once; re-invoking will raise an error
        self.__has_run_modeling = False
        if exec_dir is None:
            exec_dir = get_cache_dir(dataset, train_sm_ids) / "mohsen_jws2015"
        self.exec_dir: Path = Path(exec_dir)
        self.sm_type_dir = sm_type_dir

        # parameters for mohsen's algorithm
        self.use_old_semantic_typer = use_old_semantic_typer
        self.use_correct_type = use_correct_type
        assert Settings.get_instance().semantic_labeling_top_n_stypes <= 4
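        # the modeler is run with 4 candidate semantic types, so the configured top-n must not exceed it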
        self.num_candidate_semantic_type = 4
        self.multiple_same_property_per_node = True

        self.coherence = 1.0
        self.confidence = 1.0
        self.size_reduction = 0.5

        self.num_candidate_mappings = 50
        self.mapping_branching_factor = 50
        self.topk_steiner_tree = 10

        # take everything, i.e. effectively no cut-off
        self.cut_off = int(1e6)
        self.our_and_karma_sm_alignments = {}
Example #4
def generate_candidate_sm(dataset: str, test_sm: SemanticModel,
                          stat: Statistic, model_bundle, train_source_ids):
    # generate candidate
    ont = get_ontology(dataset)
    ont_graph = get_ont_graph(dataset)
    settings = Settings.get_instance()

    dnodes: Dict[bytes, List[KarmaSemanticType]] = {
        attr.label.encode('utf-8'): attr.semantic_types
        for attr in test_sm.attrs
    }

    ota = EmpiricalTripleAdviser(
        ont_graph, ont, stat.p_triple,
        settings.searching_triple_adviser_max_candidate)
    graph_explorer_builder = GraphExplorerBuilder(
        ota,
        max_data_node_hop=settings.searching_max_data_node_hop,
        max_class_node_hop=settings.searching_max_class_node_hop)

    for attr, semantic_types in dnodes.items():
        ota.add_data_node(attr, semantic_types)

    model = Model(*model_bundle)
    args = PGMBeamSearchArgs(
        test_sm.id,
        custom_search_discovery,
        Tracker(track_search_nodes=False),
        partial(model.predict_sm_probs, test_sm.id, train_source_ids),
        graph_explorer_builder,
        early_terminate_func=None,
        beam_width=settings.training_beam_width,
        gold_sm=test_sm.graph,
        source_attributes=test_sm.attrs,
        pre_filter_func=filter_unlikely_graph,
    )
    started_node = PGMStartSearchNode(
        args.get_and_increment_id(), args,
        [a.label.encode('utf-8') for a in test_sm.attrs])

    args._tmp_random_state = numpy.random.RandomState(
        Settings.get_instance().random_seed)

    results: List[PGMSearchNode] = beam_search(
        [started_node],
        beam_width=settings.training_beam_width,
        n_results=settings.searching_n_explore_result,
        args=args)

    candidate_sms = {}
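    # collect every graph seen during the search (intermediate discoveries and final results),
    # de-duplicated by its hashable string representation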
    for search_node in args._tmp_tracker_for_storing_search_discovery_nodes:
        g = search_node.get_value().graph
        candidate_sms[graph_to_hashable_string(g)] = g

    for search_node in results:
        g = search_node.get_value().graph
        candidate_sms[graph_to_hashable_string(g)] = g

    return candidate_sms
Example #5
def get_stype_assistant_model(dataset: str, train_sms: List[SemanticModel]):
    global _instance
    if _instance is None:
        cache_file = get_cache_dir(dataset, train_sms) / "weak_models" / "stype_assistant.pkl"
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        need_rebuilt = True

        if cache_file.exists():
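            # reuse the cached model only if it was built for the same dataset and training models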
            SemanticTypeAssistant.logger.debug("Try to load previous run...")
            model, cache_dataset, cache_train_sm_ids = deserialize(cache_file)
            if cache_dataset == dataset and cache_train_sm_ids == {sm.id for sm in train_sms}:
                need_rebuilt = False

            ont_graph = get_ont_graph(dataset)
            ont = get_ontology(dataset)
            stat = Statistic.get_instance(train_sms)
            ota = EmpiricalTripleAdviser(ont_graph, ont, stat.p_triple, 15)
            model.triple_adviser = ota

        if need_rebuilt:
            ont_graph = get_ont_graph(dataset)
            ont = get_ontology(dataset)
            stat = Statistic.get_instance(train_sms)
            ota = EmpiricalTripleAdviser(ont_graph, ont, stat.p_triple, 15)

            typer = SemanticTyper.get_instance(dataset, train_sms)
            try:
                typer.load_model()
            except Exception:
                # no saved model is available; fall back to running semantic labeling
                sms = get_semantic_models(dataset)
                train_ids = {sm.id for sm in train_sms}
                typer.semantic_labeling(train_sms, [sm for sm in sms if sm.id not in train_ids], 4)

            model = SemanticTypeAssistant(train_sms, typer, ota)
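            # detach the triple adviser before pickling and re-attach it afterwards
            # (a fresh adviser is also attached when loading from the cache above)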
            model.triple_adviser = None
            serialize((model, dataset, {sm.id for sm in train_sms}), cache_file)
            model.triple_adviser = ota

        _instance = model

    return _instance
Example #6
def serialize_ont_graph(dataset: str):
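    # dump the ontology graph (predicates and class hierarchy) as plain dicts with simplified URIs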
    def rdf_type_to_rust_str(rdf_type: PredicateType):
        if rdf_type == PredicateType.OWL_DATA_PROP:
            return "OwlDataProp"
        if rdf_type == PredicateType.OWL_OBJECT_PROP:
            return "OwlObjectProp"
        if rdf_type == PredicateType.OWL_ANNOTATION_PROP:
            return "OwlAnnotationProp"
        if rdf_type == PredicateType.RDF_PROP:
            return "RdfProp"
        raise ValueError("unknown predicate type: %s" % rdf_type)

    ont = get_ontology(dataset)
    ont_graph = get_ont_graph(dataset)

    return {
        "predicates": [{
            "uri": ont.simplify_uri(predicate.uri),
            "domains": [ont.simplify_uri(uri) for uri in predicate.domains],
            "ranges": [ont.simplify_uri(uri) for uri in predicate.ranges],
            "rdf_type": rdf_type_to_rust_str(predicate.rdf_type),
            "is_rdf_type_reliable": predicate.is_rdf_type_reliable,
        } for predicate in ont_graph.predicates],
        "class_uris": {
            ont.simplify_uri(node.uri): {
                "uri": ont.simplify_uri(node.uri),
                "parents_uris": [ont.simplify_uri(uri) for uri in node.parents_uris],
                "children_uris": [ont.simplify_uri(uri) for uri in node.children_uris],
            }
            for node in ont_graph.iter_nodes()
        }
    }
Example #7
            ssd = ssds[0]
            # ssd.graph.render()
            result = smodel_eval.f1_precision_recall(gold_graph, ssd.graph,
                                                     DataNodeMode.NO_TOUCH)
            eval_results[chuffed_idx]['precision'] = result['precision']
            eval_results[chuffed_idx]['recall'] = result['recall']
            eval_results[chuffed_idx]['f1'] = result['f1']

    return eval_results


if __name__ == '__main__':
    dataset = "museum_crm"
    sms = get_semantic_models(dataset)
    sms_index = {sm.id[:3]: sm for sm in sms}
    ont = get_ontology(dataset)
    ont.register_namespace("serene", "http://au.csiro.data61/serene/dev#")

    # collect serene outputs for each semantic model
    kfold_results = []
    stype = "ReImplMinhISWC_False_pat"
    for kfold in ["kfold-s01-s14", "kfold-s15-s28", "kfold-s08-s21"]:
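        # each fold name encodes a range of source names (e.g. s01..s14);
        # keep the 3-character prefixes of the semantic models in that range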
        kfold_sms_prefix = {
            sm[:3]
            for sm in get_sm_ids_by_name_range(
                *kfold.replace("kfold-", "").split("-"), [sm.id for sm in sms])
        }

        print("==== KFOLD:", kfold, "====")
        serene_output_dir = Path(
            "/workspace/tmp/serene-python-client/datasets/%s/" %
Example #8
def generate_candidate_sm(dataset: str, test_sm: SemanticModel,
                          stat: Statistic, model_bundle, train_source_ids):
    # generate candidate
    ont = get_ontology(dataset)
    ont_graph = get_ont_graph(dataset)
    settings = Settings.get_instance()

    dnodes: Dict[bytes, List[KarmaSemanticType]] = {
        attr.label.encode('utf-8'): attr.semantic_types
        for attr in test_sm.attrs
    }

    ota = EmpiricalTripleAdviser(
        ont_graph, ont, stat.p_triple,
        settings.searching_triple_adviser_max_candidate)
    graph_explorer_builder = GraphExplorerBuilder(
        ota,
        max_data_node_hop=settings.searching_max_data_node_hop,
        max_class_node_hop=settings.searching_max_class_node_hop)

    for attr, semantic_types in dnodes.items():
        ota.add_data_node(attr, semantic_types)

    early_stopping = EarlyStopping()
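    # note: early stopping is currently disabled (early_terminate_func is None below)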
    model = Model(*model_bundle)
    args = PGMBeamSearchArgs(
        test_sm.id,
        discovering_func,
        Tracker(track_search_nodes=True),
        partial(model.predict_sm_probs, test_sm.id, train_source_ids),
        graph_explorer_builder,
        # early_terminate_func=early_stopping.early_stopping,
        early_terminate_func=None,
        beam_width=settings.searching_beam_width,
        gold_sm=test_sm.graph,
        source_attributes=test_sm.attrs,
        pre_filter_func=filter_unlikely_graph,
    )
    started_nodes = [
        PGMStartSearchNode(args.get_and_increment_id(), args,
                           [a.label.encode('utf-8') for a in test_sm.attrs])
    ]

    results: List[PGMSearchNode] = beam_search(
        started_nodes,
        beam_width=settings.searching_beam_width,
        n_results=settings.searching_n_explore_result,
        args=args)

    # *****************************************************************************************************************
    # DEBUG CODE
    output_dir = Path(config.fsys.debug.as_path() + "/tmp/final/")
    # for search_node in args.tracker.list_search_nodes:
    #     search_node.beam_search_args = None
    # serialize(args.tracker.list_search_nodes, output_dir / "search_nodes2.pkl")

    # for file in output_dir.iterdir():
    #     if file.is_dir():
    #         shutil.rmtree(file)
    #     else:
    #         os.remove(file)
    #
    # for i, search_nodes in enumerate(args.tracker.list_search_nodes):
    #     if len(search_nodes) == 0:
    #         continue
    #
    #     sub_output_dir = output_dir / str(i)
    #     sub_output_dir.mkdir(exist_ok=True, parents=True)
    #
    #     for j, r in enumerate(search_nodes[:30]):
    #         pred_sm = r.get_value().graph
    #         pred_sm.set_name(str(r.get_score()).encode('utf-8'))
    #
    #         g = colorize_prediction(pred_sm, AutoLabel.auto_label_max_f1(test_sm.graph, pred_sm, False)[0])
    #         g.render2img(sub_output_dir / f"{j}.png")
    #         serialize(pred_sm, sub_output_dir / f"{j}.pkl")
    #
    # sub_output_dir = output_dir / "result"
    # sub_output_dir.mkdir(exist_ok=True, parents=True)
    #
    # for i, r in enumerate(results):
    #     pred_sm = r.get_value().graph
    #     pred_sm.set_name(str(r.get_score()).encode('utf-8'))
    #
    #     g = colorize_prediction(pred_sm, AutoLabel.auto_label_max_f1(test_sm.graph, pred_sm, False)[0])
    #     g.render2img(sub_output_dir / f"{i}.png")
    #     serialize(pred_sm, sub_output_dir / f"{i}.pkl")
    #
    # # STEP 4: report performance
    print(
        f"{test_sm.id}: Performance at prev iter:",
        smodel_eval.f1_precision_recall(
            test_sm.graph,
            args.tracker.list_search_nodes[-1][0].get_value().graph,
            DataNodeMode.NO_TOUCH))
    print(
        f"{test_sm.id}: Performance at final iter:",
        smodel_eval.f1_precision_recall(test_sm.graph,
                                        results[0].get_value().graph,
                                        DataNodeMode.NO_TOUCH))
    # *****************************************************************************************************************
    performances = []
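    # record (iteration, score, precision, recall, f1) of the top-ranked graph at each
    # search iteration, followed by the final result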
    for iter_no, search_nodes in enumerate(args.tracker.list_search_nodes):
        if len(search_nodes) == 0:
            continue

        x = smodel_eval.f1_precision_recall(test_sm.graph,
                                            search_nodes[0].get_value().graph,
                                            DataNodeMode.NO_TOUCH)
        performances.append((iter_no, search_nodes[0].get_score(),
                             x['precision'], x['recall'], x['f1']))

    x = smodel_eval.f1_precision_recall(test_sm.graph,
                                        results[0].get_value().graph,
                                        DataNodeMode.NO_TOUCH)
    performances.append((len(performances), results[0].get_score(),
                         x['precision'], x['recall'], x['f1']))

    pred_sms = [(search_node.get_score(), search_node.get_value().graph)
                for search_node in results]
    search_history = [[n.get_value().graph for n in search_nodes]
                      for search_nodes in args.tracker.list_search_nodes]
    search_history.append([n.get_value().graph for n in results])

    return pred_sms, performances, search_history
Example #9
               Literal(ujson.dumps(input_columns))))
        g.add((kr2rml, km_dev.hasOutputColumns,
               Literal(ujson.dumps(output_columns))))
        g.add((kr2rml, km_dev.hasModelLabel, Literal(tbl.id)))
        g.add((kr2rml, km_dev.hasBaseURI,
               Literal("http://localhost:8080/source/")))
        g.add((kr2rml, km_dev.hasWorksheetHistory,
               Literal(ujson.dumps(worksheet_history, indent=4))))

        g.serialize(str(fpath), format='n3')


if __name__ == '__main__':
    from semantic_modeling.data_io import get_ontology

    ont = get_ontology("museum_edm")
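    # convert the YAML (y2rml) model of source s01-cb into Karma's KR2RML (Turtle) format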
    r2rml = R2RML.load_from_file(
        Path(
            "/home/rook/workspace/DataIntegration/SourceModeling/data/museum-edm/models-y2rml/s01-cb-model.yml"
        ))
    tbl = DataTable.load_from_file(
        Path(
            "/home/rook/workspace/DataIntegration/SourceModeling/data/museum-edm/sources/s01-cb.csv"
        ))
    r2rml.to_kr2rml(
        ont, tbl,
        Path(
            "/home/rook/workspace/DataIntegration/SourceModeling/data/museum-edm/karma-version/models-r2rml/s01-cb-model.ttl"
        ))

    # r2rml = R2RML.load_from_file(Path("/home/rook/workspace/DataIntegration/SourceModeling/data/museum-edm/models-y2rml/s04-ima-artworks-model.yml"))
Example #10
                           cmds[-1][1].target_id)
                    for idx, cmd in cmds[:-1]:
                        if (cmd.source_id, cmd.link_lbl, cmd.target_id) == key:
                            cmds[-1][1].source_uri = cmd.source_uri
                            cmds[-1][1].target_uri = cmd.target_uri
                            break

        delete_commands.sort(reverse=True)
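        # pop from the highest index first so the remaining indices stay valid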
        for idx in delete_commands:
            commands.pop(idx)

        super().__init__(commands)

    def to_yaml(self, fpath: Path):
        with open(fpath, "w") as f:
            yaml.dump(self.to_dict(), f, default_flow_style=False, indent=4)


if __name__ == '__main__':
    path = '/home/rook/workspace/DataIntegration/SourceModeling/data/mohsen-data/museum/cleaned-edm/sources/s04-ima-artworks.xml'
    kr2rml_file = '/home/rook/workspace/DataIntegration/SourceModeling/data/mohsen-data/museum/edm/models-r2rml/s04-ima-artworks-model.ttl'
    dataset = "museum_edm"

    tbl = DataTable.load_from_file(Path(path))
    # print(tbl.head(5).to_string())
    transformer = KR2RML(get_ontology(dataset), tbl, Path(kr2rml_file))

    transformer.apply_build(tbl)
    print(tbl.head(5).to_string())
    transformer.to_yaml(Path("test.yml"))
Example #11
def run_evaluation_workflow(dataset: str, scenario: Scenario, train_sms,
                            test_sms):
    ont: Ontology = get_ontology(dataset)
    karma_models: List[KarmaModel] = get_karma_models(dataset)
    semantic_models: List[SemanticModel] = get_semantic_models(dataset)
    train_sm_ids = [sm.id for sm in train_sms]

    sdesc_args = dict(
        dataset=dataset,
        train_sm_ids=train_sm_ids,
        use_correct_type=False,  # we always put semantic types into learnedSemanticTypes, even for userSetSemanticTypes
        use_old_semantic_typer=False,
        exec_dir=get_cache_dir(dataset, train_sms) / "mohsen_jws2015",
        sm_type_dir=Path(config.fsys.debug.as_path()) / "tmp" /
        "models-json-temp")
    # STEP 1: run semantic typing and write the predicted types to a temporary folder
    if sdesc_args['sm_type_dir'].exists():
        shutil.rmtree(sdesc_args['sm_type_dir'])
    sdesc_args['sm_type_dir'].mkdir(exist_ok=True, parents=True)

    top_k_types = Settings.get_instance().semantic_labeling_top_n_stypes
    typer = create_semantic_typer(dataset, train_sms)
    typer.semantic_labeling(train_sms, test_sms, top_k_types, eval_train=True)

    for sm, ksm in zip(semantic_models, karma_models):
        # assign semantic types to learnedSemanticTypes
        sm_alignment = SemanticModelAlignment(sm, ksm)
        for col in ksm.source_columns:
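            # find the attribute in our semantic model that aligns with this Karma source column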
            attr = sm.get_attr_by_label(
                sm.graph.get_node_by_id(
                    sm_alignment.alignment[col.id]).label.decode('utf-8'))
            node = ksm.karma_graph.get_node_by_id(col.id)
            link = node.get_first_incoming_link()

            node.learned_semantic_types = [
                KarmaSemanticType(node.id, stype.domain, stype.type,
                                  typer.__class__.__name__,
                                  stype.confidence_score)
                for stype in attr.semantic_types
            ]
            node.user_semantic_types = [
                KarmaSemanticType(node.id,
                                  link.get_source_node().label.decode(),
                                  link.label.decode(), "User", 1.0)
            ]

        serializeJSON(ksm.to_normalized_json_model(ont),
                      sdesc_args['sm_type_dir'] / f"{ksm.id}-model.json",
                      indent=4)

    # STEP 2: invoke semantic modeling
    modeler = MohsenSemanticModeling(**sdesc_args)
    pred_sms = modeler.sm_prediction(train_sms, test_sms)

    # STEP 3: evaluate the predicted semantic mappings
    eval_hist = [["source", "precision", "recall", "f1", "stype-acc"]]
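    # scenario 1 ignores data nodes when scoring; otherwise data nodes are compared as-is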
    if scenario == Scenario.SCENARIO_1:
        data_node_mode = DataNodeMode.IGNORE_DATA_NODE
    else:
        data_node_mode = DataNodeMode.NO_TOUCH

    for sm, pred_sm in zip(test_sms, pred_sms):
        eval_result = smodel_eval.f1_precision_recall(sm.graph, pred_sm.graph,
                                                      data_node_mode)
        eval_hist.append([
            sm.id, eval_result["precision"], eval_result["recall"],
            eval_result["f1"],
            smodel_eval.stype_acc(sm.graph, pred_sm.graph)
        ])

    eval_hist.append([
        'average',
        np.average([float(x[1]) for x in eval_hist[1:]]),
        np.average([float(x[2]) for x in eval_hist[1:]]),
        np.average([float(x[3]) for x in eval_hist[1:]]),
        np.average([float(x[4]) for x in eval_hist[1:]])
    ])
    serializeCSV(
        eval_hist,
        sdesc_args["exec_dir"] / f"evaluation_result_{scenario.value}.csv")

    # STEP 4: evaluate the predicted semantic labels
    pred_stypes = modeler.semantic_labeling(train_sms, test_sms)
    for pred_stype, sm in zip(pred_stypes, test_sms):
        for attr in sm.attrs:
            if attr.label not in pred_stype:
                attr.semantic_types = []
            else:
                attr.semantic_types = pred_stype[attr.label]
    eval_sources(
        test_sms, sdesc_args["exec_dir"] /
        f"evaluation_result_{scenario.value}_stype.csv")

    # STEP 5: visualize the prediction
    (sdesc_args['exec_dir'] / "prediction-viz").mkdir(exist_ok=True)
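    # colorize each prediction against the gold model and render the graphs in parallel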
    need_render_graphs = [
        (colorize_prediction(
            pred_sm.graph,
            AutoLabel.auto_label_max_f1(sm.graph, pred_sm.graph, False)[0]),
         sdesc_args['exec_dir'] / "prediction-viz" / f"{sm.id}.png")
        for sm, pred_sm in zip(test_sms, pred_sms)
    ]
    with ThreadPool(32) as p:
        p.map(render_graph, need_render_graphs)

    return eval_hist