Пример #1
0
def online_learning(model: Model,
                    dataset: str,
                    train_sms: List[SemanticModel],
                    discover_sources: List[SemanticModel],
                    output_dir: Path,
                    training_args,
                    iter_range=(1, 3)):
    """Grow a pool of examples per source over several rounds and retrain each round.

    Every source in ``discover_sources`` starts with one example built from its
    own graph. In each round ``generate_data`` is called with the current
    model; candidates whose key is not yet in a source's pool are added to it.
    A source that yields nothing new is skipped in the merge step of all later
    rounds (it is still passed to ``generate_data``). The pooled examples of
    the training sources that are also discovery sources are then written to
    ``<output_dir>/examples/train.<n_iter>.json``, copied to ``train.json``,
    and used to train the model for the next round.

    :param model: model handed to ``generate_data`` in the first round
    :param dataset: dataset name forwarded to ``generate_data`` / ``train_model``
    :param train_sms: semantic models whose ids are used as training source ids
    :param discover_sources: semantic models for which candidates are generated
    :param output_dir: directory receiving ``examples/`` and ``models/``
    :param training_args: forwarded unchanged to ``train_model``
    :param iter_range: arguments unpacked into ``range`` to produce round numbers
    :return: the model trained in the last round, or ``model`` itself when
        ``iter_range`` yields no rounds
    """
    pool: Dict[str, Dict[bytes, Example]] = {}
    for source in discover_sources:
        pool[source.id] = {}
    discover_sids = {source.id for source in discover_sources}
    # sources that stopped producing new candidates; no longer merged
    exhausted_sids = set()
    logger = get_logger("app")

    examples_dir = output_dir / "examples"
    examples_dir.mkdir(exist_ok=True, parents=True)

    # seed each pool with the example derived from the source's own graph
    for source in discover_sources:
        seed_example = make_example(
            source, source.graph,
            Example.generate_example_id(source.id, 0, 0),
            [train_sm.id for train_sm in train_sms])
        pool[source.id][graph_to_hashable_string(source.graph)] = seed_example

    for n_iter in range(*iter_range):
        logger.info("==================================> Iter: %s", n_iter)
        generated = generate_data(model, dataset, train_sms, discover_sources,
                                  n_iter)

        for source in discover_sources:
            if source.id in exhausted_sids:
                continue

            known = pool[source.id]
            candidates = generated[source.id]
            unseen_keys = [key for key in candidates if key not in known]
            if not unseen_keys:
                logger.info("No new candidate for source: %s", source.id)
                exhausted_sids.add(source.id)
                continue

            for key in unseen_keys:
                known[key] = candidates[key]

        train_examples = []
        for train_sm in train_sms:
            if train_sm.id in discover_sids:
                train_examples.extend(pool[train_sm.id].values())
        train_examples.sort(key=lambda example: example.example_id)

        snapshot_file = examples_dir / f"train.{n_iter}.json"
        serializeJSON(train_examples, snapshot_file)
        shutil.copyfile(snapshot_file, examples_dir / "train.json")

        trained = train_model(dataset, [train_sm.id for train_sm in train_sms],
                              120, train_examples, [], training_args,
                              output_dir / "models")
        raw_model, tf_domain, pairwise_domain, __ = trained
        model = Model(dataset, raw_model, tf_domain, pairwise_domain)

    return model
Пример #2
0
class EarlyStopping(object):
    """Stateful stopping rule based on the score of the first search node."""

    logger = get_logger("app.assembling.training_workflow.early_stopping")

    def __init__(self) -> None:
        # score remembered from the last call that did not stop; starts at 1
        self.prev_max_score = 1

    def early_stopping(self, n_iter, search_nodes: Iterable[PGMSearchNode]):
        """Decide whether the search should stop.

        Only the first element of ``search_nodes`` is inspected; ``n_iter`` is
        accepted but not used.

        :return: ``True`` when ``search_nodes`` is empty, or when the first
            node's score is below 0.3 and more than 0.3 lower than the
            remembered score; ``False`` otherwise (the remembered score is
            then replaced by the current one).
        """
        try:
            first_node = next(iter(search_nodes))
        except StopIteration:
            # nothing to look at
            return True

        score = first_node.get_score()
        if not (self.prev_max_score - score > 0.3 and score < 0.3):
            self.prev_max_score = score
            return False
        return True
Пример #3
0
class NodeProb(object):
    """Weak model scoring the class nodes of a graph with a logistic-regression classifier.

    Two features are extracted per class node (see ``feature_extraction``) and
    fed, after standard scaling, to a ``LogisticRegression`` model that is
    either trained here or loaded from the cache directory.
    """

    logger = get_logger("app.assembling.weak_models.node_prob")

    def __init__(self,
                 example_annotator: 'ExampleAnnotator',
                 load_classifier: bool = False):
        """
        :param example_annotator: provides ``multival_predicate``,
            ``training_examples``, ``dataset`` and ``train_source_ids``
        :param load_classifier: when true, obtain ``(scaler, classifier)`` via
            ``get_classifier`` (retraining if the annotator has training
            examples); otherwise both are left as ``None`` and
            ``compute_prob`` cannot be used
        """
        self.example_annotator = example_annotator
        self.multival_predicate = example_annotator.multival_predicate

        if load_classifier:
            retrain = example_annotator.training_examples is not None
            self.scaler, self.classifier = self.get_classifier(
                retrain=retrain,
                train_examples=example_annotator.training_examples)
        else:
            self.scaler, self.classifier = None, None

    def feature_extraction(self, graph: Graph,
                           stype_score: Dict[int, Optional[float]]):
        """Build the feature list of every class node in ``graph``.

        :param stype_score: score per data-node id; ``None`` counts as 0
        :return: ``{class_node_id: [('prob_data_nodes', sum of the scores of
            the data nodes the class node links to), ('minimum_merged_cost',
            smallest ``get_merged_cost`` against the nodes sharing its label)]}``
        """
        node2features = {}
        for node in graph.iter_class_nodes():
            prob_data_nodes = _(node.iter_outgoing_links()) \
                .imap(lambda x: x.get_target_node()) \
                .ifilter(lambda x: x.is_data_node()) \
                .reduce(lambda a, b: a + (stype_score[b.id] or 0), 0)

            similar_nodes = graph.iter_nodes_by_label(node.label)
            minimum_merged_cost = min((get_merged_cost(node, similar_node,
                                                       self.multival_predicate)
                                       for similar_node in similar_nodes))

            node2features[node.id] = [('prob_data_nodes', prob_data_nodes),
                                      ('minimum_merged_cost',
                                       minimum_merged_cost)]
        return node2features

    def compute_prob(self, node2features):
        """Return ``{node_id: probability}`` for the output of ``feature_extraction``.

        The probability is column 1 of ``predict_proba`` (presumably the
        positive label ``1`` - confirm against ``classifier.classes_``).
        An empty ``node2features`` yields an empty dict.
        """
        if not node2features:
            # nothing to score; an empty matrix would be rejected by the scaler
            return {}

        X = numpy.asarray([[p[1] for p in features]
                           for features in node2features.values()])

        # `transform` returns a scaled copy and leaves its input untouched, so
        # the result has to be kept (it used to be thrown away).
        X = self.scaler.transform(X)
        y_pred = self.classifier.predict_proba(X)[:, 1]
        return {nid: y_pred[i] for i, nid in enumerate(node2features)}

    def get_classifier(self, retrain: bool, train_examples: List[Example]):
        """Return ``(scaler, classifier)``, training and caching them if needed.

        A model is trained when ``retrain`` is true or no cached file exists;
        otherwise the cached pair is deserialized.

        NOTE: features are now actually scaled before fitting and before
        prediction. A cache file written by the previous version (classifier
        fitted on unscaled features) must be regenerated.

        :raises ValueError: propagated from ``LogisticRegression.fit`` unless it
            is the "at least 2 classes" error, which triggers the fallback below
        """
        # TODO: implement this properly, currently, we have to train and save manually
        cached_file = get_cache_dir(
            self.example_annotator.dataset,
            list(self.example_annotator.train_source_ids)
        ) / "weak_models" / "node_prob_classifier.pkl"

        if cached_file.exists() and not retrain:
            # NOTE(review): presumably a pickle; only load cache files you trust.
            return deserialize(cached_file)

        self.logger.debug("Retrain new model")
        raw_X_train = make_data(self, train_examples)
        classifier = LogisticRegression(fit_intercept=True)

        # per row: the first value is skipped, the last one is the label and
        # everything in between is a feature
        rows = numpy.asarray(
            [list(features.values())[1:] for features in raw_X_train])
        X_train, y_train = rows[:, :-1], [int(x) for x in rows[:, -1]]

        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)

        try:
            classifier.fit(X_train, y_train)
        except ValueError as e:
            if not str(e).startswith(
                    "This solver needs samples of at least 2 classes in the data"
            ):
                # a different problem: surface it instead of masking it
                raise
            # this should be at a starter phase when we don't have any data but
            # use ground-truth to build: add one all-zero raw sample labelled 0,
            # expressed in the same scaled space as the rest of the matrix
            zero_row = scaler.transform(numpy.zeros((1, X_train.shape[1])))
            X_train = numpy.vstack([X_train, zero_row])
            y_train.append(0)
            classifier.fit(X_train, y_train)

        cached_file.parent.mkdir(exist_ok=True, parents=True)
        serialize((scaler, classifier), cached_file)
        return scaler, classifier
Пример #4
0
class Settings(object):

    logger = get_logger("app.assembling.settings")
    instance = None

    # ####################################################################
    # Semantic Labeling constant
    ReImplMinhISWC = "ReImplMinhISWC"
    MohsenJWS = "MohsenJWS"
    OracleSL = "OracleSL"

    # Searching constant
    ALGO_ES_DISABLE = "NoEarlyStopping"
    ALGO_ES_MIN_PROB = "MinProb"

    # Auto-labeling constant
    ALGO_AUTO_LBL_MAX_F1 = "AUTO_LBL_MAX_F1"
    ALGO_AUTO_LBL_PRESERVED_STRUCTURE = "AUTO_LBL_PRESERVED_STRUCTURE"

    # ####################################################################

    def __init__(self):
        """Create a settings object holding the built-in default of every option."""
        # ####################################################################
        # General arguments
        self.random_seed: int = 120
        self.n_samples: int = 1000

        # ####################################################################
        # semantic labeling arguments
        self.semantic_labeling_method: str = Settings.ReImplMinhISWC
        self.semantic_labeling_top_n_stypes: int = 4
        self.semantic_labeling_simulate_testing: bool = False

        # ####################################################################
        # auto labeling arguments
        self.auto_labeling_method: str = Settings.ALGO_AUTO_LBL_MAX_F1

        # ####################################################################
        # weak models arguments
        self.data_constraint_guess_datetime_threshold: float = 0.5
        self.data_constraint_valid_threshold: float = 0.95
        self.data_constraint_n_comparison_samples: int = 150

        # ####################################################################
        # graphical model arguments
        self.mrf_max_n_props: int = 10
        self.mrf_max_n_duplications: int = 5
        self.mrf_max_n_duplication_types: int = 4

        # ####################################################################
        # searching arguments
        self.training_beam_width: int = 10
        self.searching_beam_width: int = 10
        self.searching_max_data_node_hop: int = 2
        self.searching_max_class_node_hop: int = 2
        self.searching_n_explore_result: int = 5
        self.searching_triple_adviser_max_candidate: int = 15

        self.searching_early_stopping_method: str = Settings.ALGO_ES_DISABLE
        # NOTE(review): left unannotated; presumably a ratio, confirm whether int or float is intended
        self.searching_early_stopping_minimum_expected_accuracy = 0
        self.searching_early_stopping_min_prob_args: Tuple[float] = (0.01, )

        # ####################################################################
        # parallels
        self.parallel_gmtk_n_threads: int = 8
        self.parallel_n_process: int = 4
        self.parallel_n_annotators: int = 8
        self.max_n_tasks: int = 80  # tune this parameter if it consumes lots of memory

    def log_current_settings(self):
        """Write the text returned by ``to_string()`` to the class logger at INFO level."""
        rendered = self.to_string()
        self.logger.info("Current settings: %s", rendered)

    def set_setting(self, key: str, value, log_change: bool = True):
        """Overwrite one existing setting.

        :param key: name of an attribute already present on this instance
        :param value: new value, stored as-is
        :param log_change: when true, log all settings after the change
        :raises AssertionError: if ``key`` is not an existing setting
        """
        if key not in self.__dict__:
            # Raised explicitly (rather than via `assert`) so the check is not
            # stripped by `python -O`; the exception type callers see is unchanged.
            raise AssertionError(f"Unknown setting: {key!r}")

        self.__dict__[key] = value
        if log_change:
            self.log_current_settings()

    @staticmethod
    def get_instance(print_settings: bool = True) -> 'Settings':
        """Return the shared ``Settings`` object, creating it on first use.

        :param print_settings: log the settings, but only on the call that
            actually creates the instance
        """
        if Settings.instance is not None:
            return Settings.instance

        Settings.instance = Settings()
        if print_settings:
            Settings.instance.log_current_settings()
        return Settings.instance

    @staticmethod
    def parse_shell_args(print_settings: bool = True):
        """Parse the process command line and copy every option onto the shared settings.

        Each option is named after the ``Settings`` attribute it sets. The
        ``mrf_*`` settings are not exposed on the command line and keep their
        current values. ``--searching_early_stopping_min_prob_args`` is given
        as a JSON string and stored decoded.

        Invalid values (unknown method names, a boolean other than
        ``true``/``false``, non-numeric numbers) are reported through argparse,
        which prints a usage message and exits.

        :param print_settings: log the resulting settings when true
        :return: the shared ``Settings`` instance
        """
        def str2bool(v):
            lowered = v.lower()
            if lowered not in {"true", "false"}:
                raise argparse.ArgumentTypeError(
                    f"expected 'true' or 'false', got {v!r}")
            return lowered == "true"

        # (attribute name, argparse type, default, help text, allowed values)
        options = [
            ('random_seed', int, 120, 'default 120', None),
            ('n_samples', int, 1000, 'default 1000', None),
            ('semantic_labeling_method', str, Settings.ReImplMinhISWC,
             'can be OracleSL, ReImplMinhISWC and MohsenJWS, default ReImplMinhISWC',
             (Settings.OracleSL, Settings.ReImplMinhISWC,
              Settings.MohsenJWS)),
            ('semantic_labeling_top_n_stypes', int, 4, 'Default is top 4',
             None),
            ('semantic_labeling_simulate_testing', 'boolean', False,
             'Default is False', None),
            ('auto_labeling_method', str, Settings.ALGO_AUTO_LBL_MAX_F1,
             'can be AUTO_LBL_MAX_F1 and AUTO_LBL_PRESERVED_STRUCTURE (default AUTO_LBL_MAX_F1)',
             (Settings.ALGO_AUTO_LBL_MAX_F1,
              Settings.ALGO_AUTO_LBL_PRESERVED_STRUCTURE)),
            # thresholds are fractions: they must be parsed as float, not int
            ('data_constraint_guess_datetime_threshold', float, 0.5,
             'default 0.5', None),
            ('data_constraint_valid_threshold', float, 0.95,
             'default is 0.95', None),
            ('data_constraint_n_comparison_samples', int, 150,
             'default is 150', None),
            ('training_beam_width', int, 10, 'default 10', None),
            ('searching_beam_width', int, 10, 'default 10', None),
            ('searching_max_data_node_hop', int, 2, 'default 2', None),
            ('searching_max_class_node_hop', int, 2, 'default 2', None),
            ('searching_n_explore_result', int, 5, 'default 5', None),
            ('searching_triple_adviser_max_candidate', int, 15, 'default 15',
             None),
            ('searching_early_stopping_method', str, Settings.ALGO_ES_DISABLE,
             'can be NoEarlyStopping or MinProb (default NoEarlyStopping)',
             (Settings.ALGO_ES_DISABLE, Settings.ALGO_ES_MIN_PROB)),
            # NOTE(review): kept as int to preserve behavior; confirm whether
            # fractional accuracies should be accepted here
            ('searching_early_stopping_minimum_expected_accuracy', int, 0,
             'default 0', None),
            ('searching_early_stopping_min_prob_args', str, "[0.01]",
             'default is [0.01]', None),
            ('parallel_gmtk_n_threads', int, 8, 'default is 8 threads', None),
            ('parallel_n_process', int, 4, 'default is 4 processes', None),
            ('parallel_n_annotators', int, 8, 'default is 8', None),
            ('max_n_tasks', int, 80, 'default is 80', None),
        ]

        parser = argparse.ArgumentParser('Settings')
        parser.register("type", "boolean", str2bool)
        for name, arg_type, default, help_text, choices in options:
            parser.add_argument(f'--{name}',
                                type=arg_type,
                                default=default,
                                choices=choices,
                                help=help_text)

        args = parser.parse_args()
        args.searching_early_stopping_min_prob_args = ujson.loads(
            args.searching_early_stopping_min_prob_args)

        settings = Settings.get_instance(False)
        # Driving the copy from the option table guarantees that every parsed
        # option is applied (training_beam_width used to be silently dropped).
        for name, *_ in options:
            setattr(settings, name, getattr(args, name))

        if print_settings:
            settings.log_current_settings()

        return settings

    def to_string(self):
        return f"""
Пример #5
0
    delete_worksheet = driver.find_elements_by_css_selector(
        "#WorksheetOptionsDiv > ul.dropdown-menu > li")[-3]
    assert delete_worksheet.text.strip() == "Delete Worksheet"
    delete_worksheet.click()
    short_delay()

    alert = driver.switch_to.alert
    alert.accept()
    delay()

    remove_all_noti()


# SETUP hyper-parameters
# Module-level state used by the rest of this script: a logger, the name of
# the dataset being processed and the ontology loaded for that dataset.
logger = get_logger("app.preprocessing.generate_r2rml")
dataset = "museum_edm"
ont = get_ontology(dataset)

#%% INIT SELENIUM

# Start a Firefox session and open the local web application on port 8080.
driver = webdriver.Firefox()
driver.get("http://localhost:8080")
# fixed 5-second pause, presumably to let the page finish loading
time.sleep(5)

#%% LOAD FILES
# Directories derived from the dataset configuration.
# NOTE(review): `models_y2rml` is the attribute name used here; confirm it is
# the intended configuration key.
model_dir = Path(config.datasets[dataset].models_y2rml.as_path())
r2rml_dir = Path(
    config.datasets[dataset].as_path()) / "karma-version" / "models-r2rml"
karma_source_dir = Path(
    config.datasets[dataset].as_path()) / "karma-version" / "sources"
Пример #6
0
class KR2RML(R2RML):
    """Load KR2RML and produce default command history"""
    logger = get_logger("app.transformation.kr2rml")

    def __init__(self, ont: Ontology, tbl: DataTable,
                 kr2rml_file: Path) -> None:
        g = rdflib.Graph(store=IOMemory())
        g.parse(location=str(kr2rml_file), format="n3")

        worksheet_history = list(
            g.triples(
                (None,
                 URIRef(
                     "http://isi.edu/integration/karma/dev#hasWorksheetHistory"
                 ), None)))
        assert len(worksheet_history) == 1
        worksheet_history = ujson.loads(worksheet_history[0][-1])

        input_columns = list(
            g.triples((
                None,
                URIRef("http://isi.edu/integration/karma/dev#hasInputColumns"),
                None)))
        assert len(input_columns) == 1
        input_columns = ujson.loads(input_columns[0][-1])

        # construct mapping between kr2rml attribute paths to tbl_attr_paths
        tbl_attr_paths = tbl.schema.get_attr_paths()
        n_attr_paths = len(tbl_attr_paths)
        tbl_attr_paths = {
            apath.replace("@", ""): apath
            for apath in tbl_attr_paths
        }
        assert len(tbl_attr_paths) == n_attr_paths

        start_idx = 0
        for i, cname in enumerate(input_columns[0]):
            cpath = Schema.PATH_DELIMITER.join(
                cname['columnName'] for cname in input_columns[0][i:])
            # cname = Schema.PATH_DELIMITERinput_columns[i:]) cname['columnName'] + Schema.PATH_DELIMITER
            found_attr = False
            for attr_path in tbl_attr_paths:
                if (attr_path + Schema.PATH_DELIMITER).startswith(cpath):
                    found_attr = True
                    break
            if found_attr:
                start_idx = i
                break

        literal_nodes = {}
        col2col = {}
        for col in input_columns:
            attr_path = Schema.PATH_DELIMITER.join(
                cname['columnName'] for cname in col[start_idx:])
            if attr_path not in tbl_attr_paths:
                attr_path = Schema.PATH_DELIMITER.join(
                    cname['columnName'] for cname in col[start_idx:-1])
                if col[-1]['columnName'] == 'Values':
                    assert attr_path in tbl_attr_paths
                elif col[-1]['columnName'] == 'content':
                    attr_path += Schema.PATH_DELIMITER + "#text"
                    assert attr_path in tbl_attr_paths
                else:
                    raise ValueError(
                        f"Invalid column type: {col[-1]['columnName']}")

            col2col[Schema.PATH_DELIMITER.join(
                cname['columnName']
                for cname in col)] = tbl_attr_paths[attr_path]
        assert len(set(
            col2col.values())) == len(input_columns), "No duplication"

        # extracting commands
        commands = []
        for command in worksheet_history:
            if command['commandName'] == "SubmitPythonTransformationCommand":
                cmd_start_col = command['inputParameters'][0]
                cmd_input_parent_col = Schema.PATH_DELIMITER.join(
                    [col['columnName'] for col in cmd_start_col['value'][:-1]])
                cmd_input_col = command['inputParameters'][-2]
                cmd_output_col = command['inputParameters'][-1]

                if command['inputParameters'][-3]['name'] == 'isJSONOutput':
                    cmd_code = command['inputParameters'][-5]
                    default_error_value = command['inputParameters'][-4]
                    assert command['inputParameters'][-3]['value'] == "false"
                else:
                    default_error_value = command['inputParameters'][-3]
                    cmd_code = command['inputParameters'][-4]

                assert cmd_input_col['name'] == "inputColumns" and cmd_output_col[
                    "name"] == "outputColumns" and cmd_code[
                        'name'] == 'transformationCode' and default_error_value[
                            'name'] == 'errorDefaultValue'
                cmd_input_cols = [[
                    cname['columnName'] for cname in o['value']
                ] for o in ujson.loads(cmd_input_col['value'])]
                karma_input_attr_paths = [
                    col2col[Schema.PATH_DELIMITER.join(cmd_input_col)]
                    for cmd_input_col in cmd_input_cols
                ]

                # update col2col because of new columns
                new_attr_name = ujson.loads(
                    cmd_output_col['value'])[0]['value'][-1]['columnName']
                new_attr_path = new_attr_name if cmd_input_parent_col == "" else (
                    cmd_input_parent_col + Schema.PATH_DELIMITER +
                    new_attr_name)
                cmd_output_col = Schema.PATH_DELIMITER.join(
                    cname['columnName'] for cname in ujson.loads(
                        cmd_output_col['value'])[0]['value'])
                col2col[cmd_output_col] = new_attr_path

                cmd_code = cmd_code['value'].replace("return ",
                                                     "__return__ = ")
                input_attr_paths = []
                for match in reversed(
                        list(re.finditer("getValue\(([^)]+)\)", cmd_code))):
                    start, end = match.span(1)
                    field = cmd_code[start:end].replace("'", "").replace(
                        '"""', "").replace('"', '')
                    # it seems that Karma use last column name, we need to recover full name
                    # using the provided input first
                    for cmd_input_col, input_attr_path in zip(
                            cmd_input_cols, karma_input_attr_paths):
                        if field == cmd_input_col[-1]:
                            field = input_attr_path
                            break
                    else:
                        # otherwise construct from the start columns
                        full_field = field if cmd_input_parent_col == "" else (
                            cmd_input_parent_col + Schema.PATH_DELIMITER +
                            field)
                        field = col2col[full_field]
                    cmd_code = cmd_code[:start] + f'"{field}"' + cmd_code[end:]

                    input_attr_paths.append(field)

                default_error_value = default_error_value['value']
                commands.append(
                    PyTransformNewColumnCmd(input_attr_paths, new_attr_name,
                                            cmd_code, default_error_value))
            elif command["commandName"] == "SetSemanticTypeCommand" or command[
                    "commandName"] == "SetMetaPropertyCommand":
                cmd_input_col = command['inputParameters'][-2]
                if command["inputParameters"][-5][
                        'name'] == 'SemanticTypesArray':
                    cmd_stype = command['inputParameters'][-5]
                else:
                    cmd_stype = command['inputParameters'][-6]

                if cmd_stype['name'] == 'SemanticTypesArray':
                    assert cmd_input_col['name'] == "inputColumns" and len(
                        cmd_stype['value']
                    ) == 1 and cmd_stype['value'][0]['isPrimary']
                    cmd_input_col = col2col[Schema.PATH_DELIMITER.join(
                        cname['columnName'] for cname in ujson.loads(
                            cmd_input_col['value'])[0]['value'])]
                    cmd_stype = cmd_stype['value'][0]

                    commands.append(
                        SetSemanticTypeCmd(
                            cmd_input_col,
                            domain=ont.simplify_uri(cmd_stype['DomainUri']),
                            type=ont.simplify_uri(cmd_stype['FullType']),
                            node_id=ont.simplify_uri(
                                cmd_stype['DomainId'].replace(" (add)", ""))))
                else:
                    cmd_stype_domain = command['inputParameters'][-7]
                    cmd_stype_id = command['inputParameters'][-6]
                    assert cmd_input_col['name'] == "inputColumns" and cmd_stype_domain['name'] == 'metaPropertyUri' \
                           and cmd_stype_id['name'] == 'metaPropertyId'
                    cmd_input_col = col2col[Schema.PATH_DELIMITER.join(
                        cname['columnName'] for cname in ujson.loads(
                            cmd_input_col['value'])[0]['value'])]

                    commands.append(
                        SetSemanticTypeCmd(
                            cmd_input_col,
                            domain=ont.simplify_uri(cmd_stype_domain['value']),
                            type="karma:classLink",
                            node_id=ont.simplify_uri(cmd_stype_id['value'])))
            elif command['commandName'] == 'UnassignSemanticTypeCommand':
                cmd_input_col = command['inputParameters'][-2]
                assert cmd_input_col['name'] == "inputColumns"
                cmd_input_col = col2col[Schema.PATH_DELIMITER.join(
                    cname['columnName'] for cname in ujson.loads(
                        cmd_input_col['value'])[0]['value'])]

                delete_cmds = []
                for i, cmd in enumerate(commands):
                    if isinstance(cmd, SetSemanticTypeCmd
                                  ) and cmd.input_attr_path == cmd_input_col:
                        delete_cmds.append(i)

                for i in reversed(delete_cmds):
                    commands.pop(i)
            elif command["commandName"] == "ChangeInternalNodeLinksCommand":
                cmd_edges = command['inputParameters'][-3]
                assert cmd_edges['name'] == 'newEdges'
                # cmd_initial_edges = command['inputParameters'][-4]
                # if cmd_initial_edges['name'] == 'initialEdges' and len(cmd_initial_edges['value']) > 0:
                #     delete_cmds = []
                #     for cmd_edge in cmd_initial_edges['value']:
                #         edge_lbl = ont.simplify_uri(cmd_edge['edgeId'])
                #         source_id = ont.simplify_uri(cmd_edge['edgeSourceId'])
                #
                #         if cmd_edge['edgeTargetId'] in literal_nodes:
                #             for i, cmd in enumerate(commands):
                #                 if isinstance(cmd, SetSemanticTypeCmd) and cmd.type == edge_lbl and cmd.node_id == source_id:
                #                         delete_cmds.append(i)
                #         else:
                #             target_id = ont.simplify_uri(cmd_edge['edgeTargetId'])
                #             for i, cmd in enumerate(commands):
                #                 if isinstance(cmd, SetInternalLinkCmd) and cmd.link_lbl == edge_lbl and cmd.target_id == target_id and cmd.source_id == source_id:
                #                     delete_cmds.append(i)
                #
                #     for idx in sorted(delete_cmds, reverse=True):
                #         commands.pop(idx)

                for cmd_edge in cmd_edges['value']:
                    source_uri = cmd_edge.get('edgeSourceUri', None)
                    target_uri = cmd_edge.get('edgeTargetUri', None)

                    if source_uri is not None and source_uri != cmd_edge[
                            'edgeSourceId']:
                        source_uri = ont.simplify_uri(source_uri)
                    else:
                        source_uri = None

                    if target_uri is not None and target_uri != cmd_edge[
                            'edgeTargetId']:
                        target_uri = ont.simplify_uri(target_uri)
                    else:
                        target_uri = None

                    if cmd_edge['edgeTargetId'] in literal_nodes:
                        # convert this command to SetSemanticType
                        commands.append(
                            SetSemanticTypeCmd(
                                literal_nodes[cmd_edge['edgeTargetId']],
                                domain=ont.simplify_uri(source_uri),
                                type=ont.simplify_uri(cmd_edge['edgeId']),
                                node_id=ont.simplify_uri(
                                    cmd_edge['edgeSourceId'])))
                    else:
                        commands.append(
                            SetInternalLinkCmd(
                                ont.simplify_uri(cmd_edge['edgeSourceId']),
                                ont.simplify_uri(cmd_edge['edgeTargetId']),
                                ont.simplify_uri(cmd_edge['edgeId']),
                                source_uri, target_uri))
            elif command['commandName'] == "AddLinkCommand":
                cmd_edges = command['inputParameters'][-3]
                assert cmd_edges['name'] == 'edge'
                cmd_edge = cmd_edges['value']
                source_uri = cmd_edge.get('edgeSourceUri', None)
                target_uri = cmd_edge.get('edgeTargetUri', None)
                if source_uri is not None:
                    source_uri = ont.simplify_uri(source_uri)
                else:
                    source_uri = None

                if cmd_edge['edgeTargetId'] in literal_nodes:
                    # convert this command to SetSemanticType
                    commands.append(
                        SetSemanticTypeCmd(
                            literal_nodes[cmd_edge['edgeTargetId']],
                            domain=ont.simplify_uri(source_uri),
                            type=ont.simplify_uri(cmd_edge['edgeId']),
                            node_id=ont.simplify_uri(
                                cmd_edge['edgeSourceId'])))
                else:
                    if target_uri is not None:
                        target_uri = ont.simplify_uri(target_uri)
                    else:
                        target_uri = None

                    commands.append(
                        SetInternalLinkCmd(
                            ont.simplify_uri(cmd_edge['edgeSourceId']),
                            ont.simplify_uri(cmd_edge['edgeTargetId']),
                            ont.simplify_uri(cmd_edge['edgeId']), source_uri,
                            target_uri))
            elif command['commandName'] == 'DeleteLinkCommand':
                cmd_edge = command['inputParameters'][-3]
                assert cmd_edge['name'] == 'edge'
                cmd_edge = cmd_edge['value']
                for i, cmd in enumerate(commands):
                    if isinstance(cmd, SetInternalLinkCmd):
                        if cmd.source_id == cmd_edge[
                                'edgeSourceId'] and cmd.target_id == cmd_edge[
                                    'edgeTargetId'] and cmd.link_lbl == ont.simplify_uri(
                                        cmd_edge['edgeId']):
                            commands.pop(i)
                            break
            elif command["commandName"] == "AddLiteralNodeCommand":
                cmd_literal_value = command["inputParameters"][0]
                assert cmd_literal_value['name'] == 'literalValue'
                cmd_literal_value = cmd_literal_value['value']

                # they may re-use literal_values, let's user fix it manually
                if cmd_literal_value.startswith("http"):
                    new_attr_path = f"literal:{ont.simplify_uri(cmd_literal_value)}"
                else:
                    new_attr_path = f"literal:{cmd_literal_value}"

                if cmd_literal_value + "1" not in literal_nodes:
                    new_attr_path += ":1"
                    literal_nodes[cmd_literal_value + "1"] = new_attr_path
                elif cmd_literal_value + "2" not in literal_nodes:
                    new_attr_path += ":2"
                    literal_nodes[cmd_literal_value + "2"] = new_attr_path
                elif cmd_literal_value + "3" not in literal_nodes:
                    new_attr_path += ":3"
                    literal_nodes[cmd_literal_value + "3"] = new_attr_path
                else:
                    assert False

                col2col[new_attr_path] = new_attr_path
                commands.append(
                    AddLiteralColumnCmd(new_attr_path, cmd_literal_value))
            elif command["commandName"] == "OperateSelectionCommand":
                # no way to see it in the KARMA UI
                continue
            elif command["commandName"] == "OrganizeColumnsCommand":
                continue
            elif command["commandName"] == "SetWorksheetPropertiesCommand":
                # this command doesn't affect the model
                continue
            # elif command["commandName"] == "UnfoldCommand":
            #     cmd_input_col = command["inputParameters"][-2]
            #     cmd_output_col = command["inputParameters"][-1]
            #     assert cmd_input_col['name'] == "inputColumns" and cmd_output_col['name'] == 'outputColumns'
            #     cmd_input_cols = [
            #         [cname['columnName'] for cname in o['value']] for o in ujson.loads(cmd_input_col['value'])
            #     ]
            #     input_attr_paths = [col2col[Schema.PATH_DELIMITER.join(cmd_input_col)] for cmd_input_col in cmd_input_cols]
            #     cmd_output_cols = [
            #         [cname['columnName'] for cname in o['value']] for o in ujson.loads(cmd_output_col['value'])
            #     ]
            #
            #     output_attr_paths = []
            #     # update columns mapping
            #     for cmd_output_col in cmd_output_cols:
            #         attr_path = Schema.PATH_DELIMITER.join(cmd_output_col[start_idx:])
            #         col2col[Schema.PATH_DELIMITER.join(cmd_output_col)] = attr_path
            #         output_attr_paths.append(attr_path)
            #
            #     commands.append(UnrollCmd(input_attr_paths, output_attr_paths))
            # elif command["commandName"] == "GlueCommand":
            #     cmd_input_col = command["inputParameters"][-2]
            #     cmd_output_col = command["inputParameters"][-1]
            else:
                assert False, "Source: %s. Doesn't handle command %s" % (
                    tbl.id, command["commandName"])

        # fixing conflict modeling command
        conflicts = defaultdict(lambda: [])
        for i, cmd in enumerate(commands):
            if isinstance(cmd, SetSemanticTypeCmd):
                conflicts[cmd.input_attr_path].append((i, cmd))
            if isinstance(cmd, SetInternalLinkCmd):
                conflicts[(cmd.source_id, cmd.target_id)].append((i, cmd))

        delete_commands = []
        for cmds in conflicts.values():
            if len(cmds) > 1:
                display_warn = False
                for idx, cmd in cmds[1:]:
                    if cmd != cmds[0][1]:
                        if not display_warn:
                            display_warn = True
                            KR2RML.logger.warning(
                                "Table: %s. Conflict between command: \n\t+ %s \n\t+ %s",
                                tbl.id, cmds[0][1], cmd)
                        else:
                            print("\t+", cmd)

                # only keep final commands
                for idx, cmd in cmds[:-1]:
                    delete_commands.append(idx)

                if isinstance(cmds[0][1], SetInternalLinkCmd):
                    # need to update source_uri & target_uri first (for duplicate commands, source_uri, target_uri = None)
                    key = (cmds[-1][1].source_id, cmds[-1][1].link_lbl,
                           cmds[-1][1].target_id)
                    for idx, cmd in cmds[:-1]:
                        if (cmd.source_id, cmd.link_lbl, cmd.target_id) == key:
                            cmds[-1][1].source_uri = cmd.source_uri
                            cmds[-1][1].target_uri = cmd.target_uri
                            break

        delete_commands.sort(reverse=True)
        for idx in delete_commands:
            commands.pop(idx)

        super().__init__(commands)

    def to_yaml(self, fpath: Path):
        """Write the dictionary form of this object to ``fpath`` as YAML.

        The content comes from ``self.to_dict()`` and is dumped in block style
        (``default_flow_style=False``) with a 4-space indent. The target file
        is opened in text write mode and therefore truncated if it exists.
        """
        with open(fpath, "w") as stream:
            content = self.to_dict()
            yaml.dump(content, stream, default_flow_style=False, indent=4)
Пример #7
0
#!/usr/bin/python
# -*- coding: utf-8 -*-
from multiprocessing.pool import Pool
from typing import Dict, Tuple, List, Set, Union, Optional, Callable, Generic, TypeVar

from multiprocessing import Process, Queue, get_start_method, set_start_method

import time

import os
from nose.tools import eq_
from pyutils.progress_utils import Timer

from semantic_modeling.config import get_logger
# NOTE: the string below follows the imports, so Python evaluates it as a plain
# expression statement; it is not picked up as the module docstring.
"""Provide an easy and quick way to test if parallel is worth to do (overhead cost of serialize/deserialize arguments)"""
# Module-level logger obtained from the project's ``get_logger`` helper.
logger = get_logger("default")


def get_args_size(*args) -> int:
    """Return a rough element count for the given positional arguments.

    A ``list``, ``dict`` or ``tuple`` argument contributes its length; every
    other argument (including strings and sets) counts as a single element.
    Calling it without arguments yields ``0``.
    """
    sized_types = (list, dict, tuple)
    return sum(
        len(item) if isinstance(item, sized_types) else 1 for item in args)


def minimal_computing_func(queue, *args):
    """Near no-op worker used to measure the overhead cost of multiprocessing.

    The only work done is counting the received arguments with
    ``get_args_size`` and putting that count on ``queue``.
    """
    n_elements = get_args_size(*args)
    queue.put(n_elements)
Пример #8
0
class SemanticTyper(object):
    """Predict semantic types of columns by comparing them with training columns.

    A classifier is applied to rows taken from ``stype_db.similarity_matrix``
    (one row per pair of target column / training column). The value in
    column 1 of ``predict_proba`` is used as the score of the semantic type
    attached to that training column. The trained classifier is cached as
    ``model.pkl`` inside ``exec_dir``.
    """

    logger = get_logger("app.semantic_labeling.typer")
    # process-wide singleton managed by `get_instance`
    instance = None

    def __init__(self,
                 dataset: str,
                 train_sms: List[SemanticModel],
                 exec_dir: Optional[Path] = None) -> None:
        """Create a typer for ``dataset`` trained on ``train_sms``.

        :param dataset: name of the dataset
        :param train_sms: semantic models used as training sources
        :param exec_dir: working directory for cached artifacts; when omitted,
            a ``semantic-labeling`` folder under ``get_cache_dir(dataset,
            train_sms)`` is used. The directory is created if missing.
        """
        self.dataset = dataset
        self.train_source_ids = {sm.id for sm in train_sms}
        if exec_dir is None:
            exec_dir = get_cache_dir(dataset, train_sms) / "semantic-labeling"
        self.exec_dir = Path(exec_dir)
        self.exec_dir.mkdir(exist_ok=True, parents=True)

        # the classifier is loaded/trained lazily
        self.model = None
        self.stype_db = SemanticTypeDB.get_stype_db(
            dataset, [sm.id for sm in train_sms], self.exec_dir)

    def load_model(self):
        """Load a previously trained model; never train a new one.

        Does nothing when a model is already in memory.

        :raises Exception: if ``model.pkl`` does not exist in ``exec_dir``
        """
        if self.model is not None:
            return

        model_file = self.exec_dir / 'model.pkl'
        if model_file.exists():
            self.logger.debug("Load previous trained model...")
            self.model = deserialize(model_file)
        else:
            self.logger.error("Cannot load model...")
            raise Exception("Model doesn't exist..")

    @staticmethod
    def get_instance(dataset: str,
                     train_sms: List[SemanticModel],
                     exec_dir: Optional[Path] = None) -> 'SemanticTyper':
        """Return the shared instance, creating it on the first call.

        ``exec_dir`` is only used when the instance is created. Every call
        must use the same dataset and the same set of training source ids as
        the existing instance; otherwise the assertion below fails.
        """
        if SemanticTyper.instance is None:
            SemanticTyper.instance = SemanticTyper(dataset, train_sms,
                                                   exec_dir)

        assert SemanticTyper.instance.dataset == dataset and \
               SemanticTyper.instance.train_source_ids == {sm.id for sm in train_sms}

        return SemanticTyper.instance

    def _ensure_model(self) -> None:
        """Make sure ``self.model`` is available.

        Order of preference: model already in memory, cached ``model.pkl`` in
        ``exec_dir``, otherwise train a new random forest on the data from
        ``generate_training_data`` and cache it.
        """
        if self.model is not None:
            return

        model_file = self.exec_dir / 'model.pkl'
        if model_file.exists():
            self.logger.debug("Load previous trained model...")
            # NOTE(review): `deserialize` presumably unpickles the file, so
            # `exec_dir` should only contain trusted content -- confirm.
            self.model = deserialize(model_file)
            return

        self.logger.debug("Train new model...")
        # the test split is returned by the generator but not needed here
        x_train, y_train, _x_test, _y_test = generate_training_data(
            self.stype_db)
        # clf = LogisticRegression(class_weight="balanced")
        clf = RandomForestClassifier(n_estimators=200,
                                     max_depth=10,
                                     class_weight="balanced",
                                     random_state=120)
        clf = clf.fit(x_train, y_train)
        self.logger.debug("Save model...")
        serialize(clf, model_file)
        self.model = clf

    @staticmethod
    def _pair_columns_with_attrs(
            columns: List[Column],
            sources: Dict[str, SemanticModel]) -> List[tuple]:
        """Pair each column whose table is in ``sources`` with its attribute.

        Columns of tables that are not in ``sources`` are skipped.
        """
        return [(col, sources[col.table_name].get_attr_by_label(col.name))
                for col in columns if col.table_name in sources]

    def _assign_semantic_types(self, col_attrs: List[tuple],
                               top_n: int) -> None:
        """Store predicted types with a positive score on each attribute."""
        for col, attr in col_attrs:
            attr.semantic_types = [
                SemanticType(stype[0].decode("utf-8"),
                             stype[1].decode("utf-8"), score)
                for stype, score in self.pred_type(col, top_n) if score > 0
            ]

    def _score_against_train_columns(self, col: Column) -> tuple:
        """Score ``col`` against every training column except itself.

        :return: ``(refcols, scores)`` where ``scores[i]`` is the value in
            column 1 of ``predict_proba`` for the pair (col, refcols[i])
        """
        refcols = [
            refcol for refcol in self.stype_db.train_columns
            if refcol.id != col.id
        ]
        j = self.stype_db.col2idx[col.id]
        X = [
            self.stype_db.similarity_matrix[j, self.stype_db.col2idx[refcol.id]]
            for refcol in refcols
        ]
        return refcols, self.model.predict_proba(X)[:, 1]

    def semantic_labeling_v2(self, sms: List[SemanticModel],
                             top_n: int) -> None:
        """Generate semantic labels and store them in the attributes of ``sms``.

        Both training and test columns of ``stype_db`` are labeled, as long as
        their table is one of ``sms``.
        """
        sms: Dict[str, SemanticModel] = {s.id: s for s in sms}

        self._ensure_model()

        col_attrs = self._pair_columns_with_attrs(self.stype_db.train_columns,
                                                  sms)
        col_attrs.extend(
            self._pair_columns_with_attrs(self.stype_db.test_columns, sms))

        self._assign_semantic_types(col_attrs, top_n)

    def semantic_labeling(self,
                          train_sources: List[SemanticModel],
                          test_sources: List[SemanticModel],
                          top_n: int,
                          eval_train: bool = False) -> None:
        """Generate semantic labels and store them in the test sources.

        :param train_sources: must have exactly the ids this typer was built with
        :param test_sources: sources whose attributes receive predictions
        :param top_n: number of distinct semantic types requested per column
        :param eval_train: when true, training sources are labeled as well
        """
        train_sources: Dict[str,
                            SemanticModel] = {s.id: s
                                              for s in train_sources}
        test_sources: Dict[str,
                           SemanticModel] = {s.id: s
                                             for s in test_sources}
        assert set(train_sources.keys()) == self.train_source_ids

        self._ensure_model()

        col_attrs = []
        if eval_train:
            col_attrs.extend(
                self._pair_columns_with_attrs(self.stype_db.train_columns,
                                              train_sources))
        col_attrs.extend(
            self._pair_columns_with_attrs(self.stype_db.test_columns,
                                          test_sources))

        self._assign_semantic_types(col_attrs, top_n)

    def pred_type(self, col: Column,
                  top_n: int) -> List[Tuple[Tuple[bytes, bytes], float]]:
        """Return up to ``top_n`` semantic types for ``col``, best score first.

        The score of a semantic type is the highest score among the training
        columns carrying that type. ``self.model`` must already be set.
        """
        refcols, scores = self._score_against_train_columns(col)
        result = _(zip(scores, (self.stype_db.col2types[rc.id] for rc in refcols))) \
            .sort(key=lambda x: x[0], reverse=True)

        # results are sorted by score, so the first hit of a type is its best
        top_k_st = {}
        for score, stype in result:
            if stype not in top_k_st:
                top_k_st[stype] = score
                if len(top_k_st) == top_n:
                    break

        return sorted(top_k_st.items(), reverse=True, key=lambda x: x[1])

    def semantic_labeling_parent(
        self,
        train_sources: List[SemanticModel],
        test_sources: List[SemanticModel],
        top_n: int,
        eval_train: bool = False
    ) -> Dict[str, Dict[int, List[Tuple[Tuple[bytes, bytes], float, List[Tuple[
            Tuple[bytes, bytes], float]]]]]]:
        """Generate semantic labels together with the types of their parents.

        Semantic types are stored in the attributes like in
        ``semantic_labeling``. In addition, the method returns a mapping
        ``source id -> attribute id -> [(stype, score, parent_stypes)]`` where
        ``parent_stypes`` is a list of ``(parent stype, score)`` sorted by
        decreasing score. Only predictions with a positive score are kept.
        """
        train_sources: Dict[str,
                            SemanticModel] = {s.id: s
                                              for s in train_sources}
        test_sources: Dict[str,
                           SemanticModel] = {s.id: s
                                             for s in test_sources}
        assert set(train_sources.keys()) == self.train_source_ids

        self._ensure_model()

        col_attrs = []
        pred_parent_stypes = {}

        if eval_train:
            for sid in train_sources:
                pred_parent_stypes[sid] = {}
            col_attrs.extend(
                self._pair_columns_with_attrs(self.stype_db.train_columns,
                                              train_sources))

        for sid in test_sources:
            pred_parent_stypes[sid] = {}
        col_attrs.extend(
            self._pair_columns_with_attrs(self.stype_db.test_columns,
                                          test_sources))

        for col, attr in col_attrs:
            pred_full_stypes = self.pred_full_stype(col, top_n)
            attr.semantic_types = [
                SemanticType(stype[0].decode("utf-8"),
                             stype[1].decode("utf-8"), score)
                for stype, score, parent_stypes in pred_full_stypes
                if score > 0
            ]

            source_preds = pred_parent_stypes[col.table_name]
            for stype, score, parent_stypes in pred_full_stypes:
                if score > 0:
                    source_preds.setdefault(attr.id, []).append(
                        (stype, score,
                         sorted(parent_stypes.items(),
                                key=lambda x: x[1],
                                reverse=True)))

        return pred_parent_stypes

    def pred_full_stype(
        self, col: Column, top_n: int
    ) -> List[Tuple[Tuple[bytes, bytes], float, Dict[Tuple[bytes, bytes],
                                                     float]]]:
        """Like ``pred_type`` but also collect the parent type of each match.

        :return: list of ``(stype, score, parent_stypes)`` sorted by decreasing
            score, where ``parent_stypes`` maps a parent semantic type (or
            ``None`` when the class node has no incoming link) to its best
            score. ``self.model`` must already be set.
        """
        refcols, scores = self._score_against_train_columns(col)
        result = _(zip(scores, (self.stype_db.col2dnodes[rc.id] for rc in refcols))) \
            .sort(key=lambda x: x[0], reverse=True)

        # each top_k_st is map between stype, its score, and list of parent stypes with score
        top_k_st: Dict[Tuple[bytes, bytes],
                       Tuple[float, Dict[Optional[Tuple[bytes, bytes]],
                                         float]]] = {}
        for score, dnode in result:
            link = dnode.get_first_incoming_link()
            parent = link.get_source_node()
            parent_link = parent.get_first_incoming_link()
            if parent_link is None:
                parent_stype = None
            else:
                parent_stype = (parent_link.get_source_node().label,
                                parent_link.label)

            stype = (parent.label, link.label)
            if stype not in top_k_st:
                if len(top_k_st) == top_n:
                    # ignore stype which doesn't make itself into top k
                    continue

                top_k_st[stype] = (score, {parent_stype: score})
            elif parent_stype not in top_k_st[stype][1]:
                # a parent_stype seen before already holds its greatest score
                # (results are sorted), so only new parents are recorded
                top_k_st[stype][1][parent_stype] = score

        return sorted([(stype, score, parent_stypes)
                       for stype, (score, parent_stypes) in top_k_st.items()],
                      reverse=True,
                      key=lambda x: x[1])
Пример #9
0
class DataConstraint(object):
    """This model tries to answer a question whether a mapping of a column follow some constraints inferred from data

    For example: we have 2 columns DoB & DoD
        1. if they are linked to same class by different predicate (local constraint)
        2. if they are linked to same class by same predicate, which class should we choose to link? (look at the parent)
        + how about: the case of same predicate, different class (not handled yet, let's semantic labeling does it)

    Given a list of known sources, we extract possible columns order =, >=, <= (consider only columns that are in the above scope).
    The we count cases the semantic types/relationship follow the order, and cases it doesn't as prob.

    Given a new example, we also look for columns that match, and produce the prediction
    """

    logger = get_logger("app.weak_models.data_constraint")

    def __init__(self, train_sms: List[SemanticModel],
                 data_tables: List[ColumnBasedTable], valid_threshold: float,
                 guess_datetime_threshold: float,
                 n_comparison_sample: int) -> None:
        self.guess_datetime_threshold = guess_datetime_threshold
        self.valid_threshold = valid_threshold
        self.n_comparison_sample = n_comparison_sample
        self.cached_compared_cols: Dict[str, Dict[Tuple[bytes, bytes],
                                                  Optional[float]]] = {}
        self.prob_count_scope1: Dict[Tuple[bytes, bytes],
                                     Dict[Tuple[bytes, bytes], int]] = {}
        self.prob_count_scope2: Dict[Tuple[bytes, bytes],
                                     Dict[Tuple[bytes, bytes], int]] = {}

        # keep a list of columns that can have data constraint (i.e: its value is comparable with other columns)
        col2useful_type: Dict[Column, ColumnType] = {}
        data_tables: Dict[str, ColumnBasedTable] = {
            tbl.id: tbl
            for tbl in data_tables
        }
        for tbl in data_tables.values():
            for col in tbl.columns:
                type = self._guess_detail_type(col)
                if type is not None and type.is_comparable():
                    col2useful_type[col] = type

        # now we build the constraint from training sources
        for sm in train_sms:
            stypes: Dict[Tuple[bytes, bytes], List[GraphLink]] = {}
            node_group: Dict[GraphNode, List[GraphLink]] = {}
            table = data_tables[sm.id]
            name2col: Dict[bytes, Column] = {
                col.name.encode("utf-8"): col
                for col in table.columns
            }
            col2idx: Dict[Column, int] = {
                col: i
                for i, col in enumerate(table.columns)
            }

            for attr in sm.attrs:
                dnode = sm.graph.get_node_by_id(attr.id)
                dlink = dnode.get_first_incoming_link()
                pnode = dlink.get_source_node()
                stype = (pnode.label, dlink.label)
                if stype not in stypes:
                    stypes[stype] = []
                stypes[stype].append(dlink)
                # group node by their parents
                if pnode not in node_group:
                    node_group[pnode] = []
                node_group[pnode].append(dlink)

            # first scope, infer constraint inside class nodes
            for pnode, dlinks in node_group.items():
                # before filter out data nodes that are not comparable, we double check if the data node we
                # have to ignore has its semantic types comparable
                # for e in dlinks:
                #     if name2col[e.get_target_node().label] not in col2useful_type and (pnode.label, e.label) in self.prob_count_scope1:
                #         self.logger.warning("Column's semantic types was detected to be comparable. But, now it can't: %s: %s", sm.id, e.get_target_node().label)
                #         self.prob_count_scope1[(pnode.label, e.label)] = None

                dlinks = [
                    e for e in dlinks
                    if name2col[e.get_target_node().label] in col2useful_type
                ]
                dnodes = [e.get_target_node() for e in dlinks]
                if len(dnodes) < 2:
                    continue

                if len({
                        col2useful_type[name2col[dnode.label]]
                        for dnode in dnodes
                }) != 1:
                    # doesn't support mixed-type
                    print({
                        col2useful_type[name2col[dnode.label]]
                        for dnode in dnodes
                    })
                    continue

                if len(dnodes) > 2:
                    self.logger.warning("Only handle max-2 now... %s: %s",
                                        sm.id, [e.label for e in dlinks])
                    continue

                cols = [name2col[dnode.label] for dnode in dnodes]
                compare_result = self._compare_col(table, col2idx, cols[0],
                                                   cols[1])

                dtypes = [(pnode.label, dlink.label) for dlink in dlinks]
                # if we cannot compare 2 columns, then we ignore them
                if compare_result is None:
                    # however, if this type is already register in the counter, then instead of ignore them
                    # we should delete set it to None to prevent re-add in the future
                    if (dtypes[0] in self.prob_count_scope1
                            and self.prob_count_scope1[dtypes[0]] is not None
                        ) or (dtypes[1] in self.prob_count_scope1 and
                              self.prob_count_scope1[dtypes[1]] is not None):
                        self.logger.warning(
                            "Inferred constraint for 2 columns %s doesn't hold for source: %s. (Column: %s, %s)",
                            dtypes, sm.id, cols[0].name, cols[1].name)
                        self.prob_count_scope1[dtypes[0]] = None
                        self.prob_count_scope1[dtypes[1]] = None
                    continue

                for dtype in dtypes:
                    if dtype not in self.prob_count_scope1:
                        self.prob_count_scope1[dtype] = {}

                if self.prob_count_scope1[
                        dtypes[0]] is None or self.prob_count_scope1[
                            dtypes[1]] is None:
                    # inferred constraint doesn't hold, so we should ignore this column
                    continue

                if compare_result:
                    # col0 > col1
                    if len(self.prob_count_scope1[dtypes[0]]) != 0:
                        if dtypes[0] not in self.prob_count_scope1[dtypes[
                                0]] or dtypes[1] not in self.prob_count_scope1[
                                    dtypes[0]]:
                            self.prob_count_scope1[dtypes[0]] = None
                            self.prob_count_scope1[dtypes[1]] = None
                        else:
                            assert self.prob_count_scope1[dtypes[0]][
                                dtypes[0]] == 1 and self.prob_count_scope1[
                                    dtypes[0]][dtypes[1]] == 0
                    if len(self.prob_count_scope1[dtypes[1]]) != 0:
                        if dtypes[0] not in self.prob_count_scope1[dtypes[
                                1]] or dtypes[1] not in self.prob_count_scope1[
                                    dtypes[1]]:
                            self.prob_count_scope1[dtypes[0]] = None
                            self.prob_count_scope1[dtypes[1]] = None
                        else:
                            assert self.prob_count_scope1[dtypes[1]][
                                dtypes[0]] == 1 and self.prob_count_scope1[
                                    dtypes[1]][dtypes[1]] == 0
                    self.prob_count_scope1[dtypes[0]] = {
                        dtypes[0]: 1,
                        dtypes[1]: 0
                    }
                    self.prob_count_scope1[dtypes[1]] = {
                        dtypes[0]: 1,
                        dtypes[1]: 0
                    }
                else:
                    if len(self.prob_count_scope1[dtypes[0]]) != 0:
                        assert self.prob_count_scope1[dtypes[0]][
                            dtypes[0]] == 0 and self.prob_count_scope1[
                                dtypes[0]][dtypes[1]] == 1
                    if len(self.prob_count_scope1[dtypes[1]]) != 0:
                        assert self.prob_count_scope1[dtypes[1]][
                            dtypes[0]] == 0 and self.prob_count_scope1[
                                dtypes[1]][dtypes[1]] == 1
                    self.prob_count_scope1[dtypes[0]] = {
                        dtypes[0]: 0,
                        dtypes[1]: 1
                    }
                    self.prob_count_scope1[dtypes[1]] = {
                        dtypes[0]: 0,
                        dtypes[1]: 1
                    }

            # second scope
            for stype, dlinks in stypes.items():
                if len(dlinks) == 1:
                    continue

                # now filter data nodes that is not comparable
                dnodes = [e.get_target_node() for e in dlinks]
                if any(name2col[dnode.label] not in col2useful_type
                       for dnode in dnodes):
                    continue

                if len({
                        col2useful_type[name2col[dnode.label]]
                        for dnode in dnodes
                }) != 1:
                    # doesn't support mixed-type
                    print({
                        col2useful_type[name2col[dnode.label]]
                        for dnode in dnodes
                    })
                    continue

                if len(dlinks) > 2:
                    self.logger.warning("Only handle max-2 now... %s: %s",
                                        sm.id, stype)
                    continue

                snodes = [e.get_source_node() for e in dlinks]
                slinks = [
                    n.get_first_incoming_link() for n in snodes
                    if n.get_first_incoming_link() is not None
                ]
                if len(slinks) == 0:
                    continue

                # now we need to build some constraints to help distinguish between those semantic types
                # we assume parents of those types are different ...
                parent_types = [(se.get_source_node().label, se.label)
                                for se in slinks]
                if len(set(parent_types)) != len(snodes):
                    self.logger.warning(
                        "Doesn't handle a case when parents are same: %s: %s",
                        sm.id, stype)
                    continue

                cols = [name2col[dnode.label] for dnode in dnodes]
                compare_result = self._compare_col(table, col2idx, cols[0],
                                                   cols[1])
                # if we cannot compare 2 columns, then we ignore them
                if compare_result is None:
                    # however, if this type is already register in the counter, then instead of ignore them
                    # we should delete set it to None to prevent re-add in the future
                    if stype in self.prob_count_scope2 and self.prob_count_scope2[
                            stype] is not None:
                        self.logger.warning(
                            "Inferred constraint for type %s doesn't hold for source: %s. (Column: %s, %s)",
                            stype, sm.id, cols[0].name, cols[1].name)
                        self.prob_count_scope2[stype] = None
                    continue

                if stype not in self.prob_count_scope2:
                    self.prob_count_scope2[stype] = {}

                if self.prob_count_scope2[stype] is None:
                    # inferred constraint doesn't hold, so we should ignore this column
                    continue

                if compare_result:
                    # col0 > col1
                    if len(self.prob_count_scope2[stype]) != 0:
                        assert self.prob_count_scope2[stype][parent_types[
                            0]] == 1 and self.prob_count_scope2[stype][
                                parent_types[1]] == 0
                    self.prob_count_scope2[stype] = {
                        parent_types[0]: 1,
                        parent_types[1]: 0
                    }
                else:
                    if len(self.prob_count_scope2[stype]) != 0:
                        assert self.prob_count_scope2[stype][parent_types[
                            0]] == 0 and self.prob_count_scope2[stype][
                                parent_types[1]] == 1
                    self.prob_count_scope2[stype] = {
                        parent_types[0]: 0,
                        parent_types[1]: 1
                    }

        for key in list(self.prob_count_scope1.keys()):
            if self.prob_count_scope1[key] is None:
                del self.prob_count_scope1[key]

        for key in list(self.prob_count_scope2.keys()):
            if self.prob_count_scope2[key] is None:
                del self.prob_count_scope2[key]

        # we also cache column comparison (to speed to evaluation time)
        for tbl in data_tables.values():
            useful_cols = [
                col for col in tbl.columns if col in col2useful_type
            ]
            tbl_comparison: Dict[Tuple[bytes, bytes], Optional[float]] = {}
            col2idx: Dict[Column,
                          int] = {col: i
                                  for i, col in enumerate(tbl.columns)}
            # TODO: can speed up by half
            for col in useful_cols:
                col_name = col.name.encode("utf-8")
                for col2 in useful_cols:
                    if col2 != col:
                        if col2useful_type[col] != col2useful_type[col2]:
                            tbl_comparison[(col_name,
                                            col2.name.encode("utf-8"))] = None
                        else:
                            tbl_comparison[(col_name, col2.name.encode("utf-8")
                                            )] = self._compare_col(
                                                tbl, col2idx, col, col2)

            self.cached_compared_cols[tbl.name] = tbl_comparison

    def extract_feature(self,
                        sm_id: str,
                        g: Graph,
                        attr_id: int,
                        link2label: Optional[Dict[int, bool]] = None) -> dict:
        """Compute both statistical-constraint features of one attribute.

        All arguments are forwarded unchanged to the two scoring methods.

        :return: a dict with two entries: ``"local"`` is the result of
            ``compute_prob_scope1`` and ``"global"`` is the result of
            ``compute_prob_scope2``.
        """
        local_prob = self.compute_prob_scope1(sm_id, g, attr_id, link2label)
        global_prob = self.compute_prob_scope2(sm_id, g, attr_id, link2label)
        return {"local": local_prob, "global": global_prob}

    def compute_prob_scope1(
            self,
            sm_id: str,
            g: Graph,
            attr_id: int,
            link2label: Optional[Dict[int, bool]] = None) -> Optional[float]:
        """Score an attribute against the ordering constraint between two
        semantic types that hang off the same class node.

        :param sm_id: key into ``self.cached_compared_cols``
        :param g: graph that contains the attribute node
        :param attr_id: id of the attribute (data) node to score
        :param link2label: optional map from link id to a boolean; a link
            missing from the map is treated as ``True``
        :return: ``None`` when no constraint is registered for the attribute's
            semantic type, when its own link is labelled false, or when the
            class node has no usable sibling attribute; otherwise
            ``self.valid_threshold`` when the cached column comparison agrees
            with the constraint and ``1 - self.valid_threshold`` when it
            disagrees or when no sibling is comparable.
        """
        # an empty dict lets every lookup below fall back to the default
        labels = {} if link2label is None else link2label

        dnode = g.get_node_by_id(attr_id)
        dlink = dnode.get_first_incoming_link()
        pnode = dlink.get_source_node()
        stype = (pnode.label, dlink.label)

        if stype not in self.prob_count_scope1:
            return None
        if not labels.get(dlink.id, True):
            return None

        constraint = self.prob_count_scope1[stype]
        assert len(constraint) == 2
        # the constraint holds exactly two semantic types: ours and the other
        other_stype = [key for key in constraint.keys() if key != stype][0]

        siblings = []
        for edge in pnode.iter_outgoing_links():
            if edge.label == other_stype[1] and labels.get(edge.id, True):
                siblings.append(edge.get_target_node())
        if not siblings:
            return None

        own_rank = constraint[stype]
        other_rank = constraint[other_stype]
        comparisons = self.cached_compared_cols[sm_id]

        for sibling in siblings:
            # a missing pair and an undecidable pair (None) are both skipped
            outcome = comparisons.get((dnode.label, sibling.label))
            if outcome is None:
                continue

            # truthy outcome means attr > sibling; that supports the
            # constraint exactly when the attr's type also ranks higher
            if bool(outcome) == (own_rank > other_rank):
                return self.valid_threshold
            return 1 - self.valid_threshold

        # the constraint said that we should be able to compare, but we
        # cannot, so the mapping gets a low probability
        return 1 - self.valid_threshold

    def compute_prob_scope2(
            self,
            sm_id: str,
            g: Graph,
            attr_id: int,
            link2label: Optional[Dict[int, bool]] = None) -> Optional[float]:
        """Score an attribute's mapping against the ordering constraint between
        attributes of the same semantic type that sit under different parents.

        ``self.prob_count_scope2`` maps a semantic type ``(class label,
        predicate label)`` to a dict from parent type ``(parent class label,
        parent predicate label)`` to an index. The attribute is compared with
        the attribute of another class node that carries the same semantic
        type, using the cached column comparisons of ``sm_id``.

        :param sm_id: key into ``self.cached_compared_cols``
        :param g: graph that contains the attribute node
        :param attr_id: id of the attribute (data) node to score
        :param link2label: optional map from link id to a boolean; when given,
            class nodes whose incoming link is not labelled true are ignored
        :return: ``None`` when the constraint does not apply (no constraint for
            the type, root class node, unknown parent type, nothing to compare
            with, or more attributes than the constraint knows about);
            otherwise ``self.valid_threshold`` when the comparison agrees with
            the constraint and ``1 - self.valid_threshold`` when it disagrees
            or the columns are not comparable.
        """
        dnode = g.get_node_by_id(attr_id)
        dlink = dnode.get_first_incoming_link()
        stype = (dlink.get_source_node().label, dlink.label)
        if stype not in self.prob_count_scope2:
            return None

        slink = dlink.get_source_node().get_first_incoming_link()
        if slink is None:
            # root nodes
            return None

        dnode_parent_type = (slink.get_source_node().label, slink.label)
        if dnode_parent_type not in self.prob_count_scope2[stype] or (
                link2label is not None and not link2label[slink.id]):
            return None
        dnode_stype_idx = self.prob_count_scope2[stype][dnode_parent_type]

        # get other class nodes in the graph that an attr can be mapped to (same semantic type).
        # notice that the constraint is represented as a binary function, so we only keep the
        # class nodes that have another attribute mapped with the same semantic type
        snodes = [
            node for node in g.iter_nodes_by_label(stype[0])
            if node.id != dlink.source_id
        ]
        if len(snodes) == 0:
            # if we don't have any other source nodes (i.e: only one possible mapping)
            return None

        tbl_comparison = self.cached_compared_cols[sm_id]
        another_dnodes = []
        another_dnodes_stype_idx = []
        for snode in snodes:
            # first attribute of this class node that is mapped by the same predicate, if any
            another_dnode = next(
                (link.get_target_node()
                 for link in snode.iter_outgoing_links()
                 if link.label == dlink.label), None)
            if another_dnode is None or (
                    dnode.label, another_dnode.label) not in tbl_comparison:
                continue

            parent_link = snode.get_first_incoming_link()
            if parent_link is None:
                # a root class node has no parent type, so it cannot take part
                # in the constraint (previously this raised AttributeError)
                continue

            parent_type = (parent_link.get_source_node().label,
                           parent_link.label)
            if parent_type in self.prob_count_scope2[stype] and (
                    link2label is None or link2label[parent_link.id] is True):
                # if its parent_type is not in the constraint or its link is false, then we ignore it
                another_dnodes.append(another_dnode)
                another_dnodes_stype_idx.append(
                    self.prob_count_scope2[stype][parent_type])

        # do compare between attr and another_attrs
        if len(another_dnodes) + 1 > len(self.prob_count_scope2[stype]):
            self.logger.warning(
                "There is a model that have more attributes than the inferred constraint.. trace: %s -- %s",
                sm_id, stype)
            return None

        # let's see if we can compare the given attribute with other attributes
        if len(another_dnodes
               ) == 0 or dnode_stype_idx in another_dnodes_stype_idx:
            # nothing to compare with, or another attribute shares the same index
            return None

        assert len(self.prob_count_scope2[stype]
                   ) == 2, "Doesn't handle > 2 attributes now..."

        # now we can compare with other attributes
        another_dnode, another_dnode_stype_idx = another_dnodes[
            0], another_dnodes_stype_idx[0]
        result = tbl_comparison[(dnode.label, another_dnode.label)]
        if result is None:
            # the constraint said that we should be able to compare, but we cannot, it should have low probability
            return 1 - self.valid_threshold

        if result:
            # attr > another_attr, attr_stype_idx should > another_attr_stype_idx with high prob.
            if dnode_stype_idx > another_dnode_stype_idx:
                return self.valid_threshold
            return 1 - self.valid_threshold

        # opposite case of above
        if dnode_stype_idx > another_dnode_stype_idx:
            return 1 - self.valid_threshold
        return self.valid_threshold

    def _compare_col(self, tbl: ColumnBasedTable, col2idx, col1: Column,
                     col2: Column) -> Optional[bool]:
        """Decide, row by row, whether ``col1`` is consistently >= or <= ``col2``.

        When ``col1.type`` is ``ColumnType.NUMBER`` the raw cell values are
        compared and at most ``self.n_comparison_sample`` usable rows are
        examined; otherwise both cells are parsed with ``parse_date`` and at
        most 50 usable rows are examined. Rows whose cells have the wrong
        type, or fail to parse, are skipped.

        :return: ``True`` if col1 > col2 in at least one row and the share of
            rows with col1 >= col2 reaches ``self.valid_threshold``; ``False``
            for the mirrored case; ``None`` when undecidable (including the
            all-equal case).
        """
        # any mixed-type should be handled before..

        def bucket(left, right) -> int:
            # slot in `tally`: 0 = greater, 1 = equal, 2 = less
            if left == right:
                return 1
            return 0 if left > right else 2

        tally = [0, 0, 0]
        n_used = 0
        # lazily pair up the two cells of each row
        cell_pairs = ((row[col2idx[col1]], row[col2idx[col2]])
                      for row in tbl.rows)

        if col1.type == ColumnType.NUMBER:
            for left, right in cell_pairs:
                if not (isinstance(left, (int, float))
                        and isinstance(right, (int, float))):
                    continue

                tally[bucket(left, right)] += 1
                n_used += 1
                if n_used == self.n_comparison_sample:
                    break
        else:
            for left, right in cell_pairs:
                if not (isinstance(left, (str, bytes))
                        and isinstance(right, (str, bytes))):
                    continue

                try:
                    # TODO: need to detect the date format
                    left = parse_date(left, dayfirst=False, yearfirst=False)
                    right = parse_date(right, dayfirst=False, yearfirst=False)
                except ValueError:
                    continue

                tally[bucket(left, right)] += 1
                n_used += 1
                if n_used == 50:
                    break

        n_gt, n_eq, n_lt = tally
        if n_gt > 0 and ((n_gt + n_eq) / n_used) >= self.valid_threshold:
            return True
        if n_lt > 0 and ((n_lt + n_eq) / n_used) >= self.valid_threshold:
            return False

        # not decidable (also for equal-case)
        return None

    def _guess_detail_type(self, col: Column):
        """Guess a finer-grained type for a column.

        :return: ``ColumnType.NUMBER`` for number columns;
            ``ColumnType.DATETIME`` when the share of sampled values that
            ``parse_date`` accepts is above ``self.guess_datetime_threshold``;
            ``None`` otherwise (NULL columns, columns without any non-blank
            value, and columns that do not look like dates).
        """
        if col.type == ColumnType.NUMBER:
            return ColumnType.NUMBER
        if col.type == ColumnType.NULL:
            return None

        # trying to guess if this is DateTime
        # only the first 50 non-blank values are examined to bound the cost;
        # the truthiness test drops blank values whether they are str or bytes
        values = [val for val in col.get_textual_data() if val.strip()][:50]
        if not values:
            # nothing to sample: no evidence for a datetime column, and the
            # ratio below would otherwise divide by zero
            return None

        n_success = 0
        for val in values:
            try:
                parse_date(val)
            except (ValueError, OverflowError):
                # NOTE(review): OverflowError is what dateutil's parser raises
                # for out-of-range numbers -- assumes parse_date is dateutil's
                continue
            n_success += 1

        if (n_success / len(values)) > self.guess_datetime_threshold:
            # consider this is a datetime column
            return ColumnType.DATETIME
        return None
Пример #10
0
        #     if self.value is not None else b"")

    def is_terminal(self, args: BeamSearchArgs) -> bool:
        """Return True when ``self.remained_terminals`` is empty.

        ``args`` is accepted but not consulted.
        """
        n_remaining = len(self.remained_terminals)
        return n_remaining == 0

    def get_value(self) -> PGMSearchNodeValue:
        """Return the value object stored on this node (``self.value``)."""
        node_value = self.value
        return node_value

    def get_score(self) -> float:
        """Return the entry of ``self.G_scored`` for ``self.working_terminal``."""
        terminal = self.working_terminal
        return self.G_scored[terminal]

    def get_hashing_id(self) -> bytes:
        """Return the identifier stored in ``self.hashing_id``."""
        identifier = self.hashing_id
        return identifier


# Module-level logger, obtained from get_logger under the name
# 'app.assembling.search_discovery'.
_logger = get_logger('app.assembling.search_discovery')


def filter_unlikely_graph(g: MergeGraph) -> bool:
    settings = Settings.get_instance()
    max_n_duplications = settings.mrf_max_n_duplications
    max_n_duplication_types = settings.mrf_max_n_duplication_types

    for n in g.iter_class_nodes():
        # FILTER middle nodes
        if n.n_incoming_links == 1 and n.n_outgoing_links == 1:
            link = next(iter(n.iter_outgoing_links()))
            if link.get_target_node().is_class_node():
                return False

        # FILTER: max_size_duplication_group <= 7 and max_n_duplications <= 4
Пример #11
0
class MohsenSemanticModeling(object):
    """Driver around the Karma semantic-modeling code.

    It runs the external Karma code (through ``setup_karma`` and
    ``execute_karma_code``) inside ``exec_dir``, remembers the parameters of
    the last successful run in ``execution-meta.json`` so that an identical
    request can reuse the previous output, and loads the predicted models
    from ``exec_dir / "output"``.
    """
    logger = get_logger("app.mohsen_jws2015")

    def __init__(self, dataset: str, use_correct_type: bool, use_old_semantic_typer: bool, train_sm_ids: List[str],
                 exec_dir: Optional[Union[str, Path]] = None, sm_type_dir: Optional[Union[str, Path]] = None):
        """
        :param dataset: name of the dataset to work on
        :param use_correct_type: forwarded to ``execute_karma_code``
        :param use_old_semantic_typer: forwarded to ``execute_karma_code``; also selects which
            Karma model file is loaded for alignment in ``karma_model_candidate_generation``
        :param train_sm_ids: ids of the training sources; ``init`` asserts that it is called
            with exactly these ids (in sorted order)
        :param exec_dir: working directory; defaults to a ``mohsen_jws2015`` folder inside the
            cache dir of (dataset, train_sm_ids)
        :param sm_type_dir: forwarded to ``setup_karma``
        """
        self.dataset: str = dataset
        self.train_sm_ids = train_sm_ids
        self.ont = get_ontology(dataset)
        self.karma_models: Dict[str, KarmaModel] = {km.id: km for km in get_karma_models(dataset)}

        # can only run once time, trying re-invoke will generate an error
        self.__has_run_modeling = False
        if exec_dir is None:
            exec_dir = get_cache_dir(dataset, train_sm_ids) / "mohsen_jws2015"
        self.exec_dir: Path = Path(exec_dir)
        self.sm_type_dir = sm_type_dir

        # parameters for mohsen's algorithm
        self.use_old_semantic_typer = use_old_semantic_typer
        self.use_correct_type = use_correct_type
        assert Settings.get_instance().semantic_labeling_top_n_stypes <= 4
        self.num_candidate_semantic_type = 4
        self.multiple_same_property_per_node = True

        self.coherence = 1.0
        self.confidence = 1.0
        self.size_reduction = 0.5

        self.num_candidate_mappings = 50
        self.mapping_branching_factor = 50
        self.topk_steiner_tree = 10

        # take all, not cut off everything
        self.cut_off = int(1e6)
        self.our_and_karma_sm_alignments = {}

    def get_meta(self, train_source_names: List[str], test_source_names: List[str]) -> Dict:
        """Return the parameters that identify one execution of the Karma code.

        The dict is what gets written to ``execution-meta.json`` and compared against it.
        """
        return {
            "dataset": self.dataset,
            "use_correct_type": self.use_correct_type,
            "use_old_semantic_typer": self.use_old_semantic_typer,
            "num_candidate_semantic_type": self.num_candidate_semantic_type,
            "multiple_same_property_per_node": self.multiple_same_property_per_node,
            "coherence": self.coherence,
            "confidence": self.confidence,
            "size_reduction": self.size_reduction,
            "num_candidate_mappings": self.num_candidate_mappings,
            "mapping_branching_factor": self.mapping_branching_factor,
            "topk_steiner_tree": self.topk_steiner_tree,
            "train_source_names": train_source_names,
            "test_source_names": test_source_names,
            "cut_off": self.cut_off
        }

    def _needs_reexecution(self, execution_meta_file: Path, train_source_names: List[str],
                           test_source_names: List[str]) -> bool:
        """Tell whether the cached run (if any) cannot serve the requested configuration."""
        if not execution_meta_file.exists():
            return True

        # only have this file when previous execution is success!
        self.logger.debug("Load information from previous run...")
        with open(execution_meta_file, 'r') as f:
            try:
                meta = ujson.load(f)
            except ValueError:
                # unreadable meta file: treat as if there was no previous run
                return True

        # test sources are compared as sets (order-insensitive). The cached names are kept in
        # their own variable so that the requested names are never overwritten by them.
        previous_test_names = set(meta.pop("test_source_names"))
        new_meta = self.get_meta(train_source_names, test_source_names)
        requested_test_names = set(new_meta.pop("test_source_names"))
        if previous_test_names != requested_test_names:
            return True
        return meta != new_meta

    def init(self, train_source_names: List[str], test_source_names: List[str]):
        """Run the Karma code for the given sources, unless an identical run is cached.

        :raises Exception: when called a second time, or when ``lock.pid`` exists in ``exec_dir``
        """
        if self.__has_run_modeling:
            raise Exception("Cannot call init twice!!")

        train_source_names = sorted(train_source_names)
        test_source_names = sorted(test_source_names)
        assert self.train_sm_ids == train_source_names

        execution_meta_file = self.exec_dir / "execution-meta.json"
        lock_file = self.exec_dir / "lock.pid"

        if lock_file.exists():
            raise Exception("Cannot run mohsen method because another process is running")

        if self._needs_reexecution(execution_meta_file, train_source_names, test_source_names):
            self.logger.info("Going to re-execute karma code")
            self.exec_dir.mkdir(exist_ok=True)
            with open(lock_file, 'w') as f:
                f.write(str(os.getpid()))

            try:
                setup_karma(self.dataset, self.exec_dir, self.sm_type_dir)
                execute_karma_code(self.dataset, self.exec_dir, self.use_correct_type, self.use_old_semantic_typer,
                                   self.num_candidate_semantic_type,
                                   self.multiple_same_property_per_node, self.coherence, self.confidence,
                                   self.size_reduction, self.num_candidate_mappings, self.mapping_branching_factor,
                                   self.topk_steiner_tree, self.cut_off, train_source_names, test_source_names)

                # only have this file when previous execution is success!
                with open(execution_meta_file, 'w') as f:
                    ujson.dump(self.get_meta(train_source_names, test_source_names), f, indent=4)
            finally:
                # release the lock whether the run succeeded or not; otherwise every later
                # call would be rejected as "another process is running"
                if lock_file.exists():
                    lock_file.unlink()

        self.__has_run_modeling = True

    def karma_model_candidate_generation(self,
                                         train_sms: List[SemanticModel],
                                         test_sms: List[SemanticModel],
                                         n_candidate: int = 1000) -> List[List[KarmaModel]]:
        """Load the predicted Karma models of every test source.

        Calls ``init`` first if it has not run yet. For each test source, lines of
        ``output/source--<id>.json`` are loaded and aligned one by one, stopping once
        ``n_candidate`` models were read. When the file yields no model, a single model that
        contains only the data nodes of the reference Karma model is returned instead.

        :return: one list of predicted models per test source, in the order of ``test_sms``
        """
        if not self.__has_run_modeling:
            self.init([s.id for s in train_sms], [s.id for s in test_sms])

        self.logger.debug("Load previous result...")
        results = []
        karma_models_dir = Path(config.datasets[self.dataset].karma_version.as_path()) / "models-json"
        for test_sm in test_sms:
            file_name = "source--%s.json" % test_sm.id
            predicted_models: List[KarmaModel] = []

            if self.use_old_semantic_typer:
                karma_sm = KarmaModel.load_from_file(self.ont,
                                                     self.exec_dir / "output" / f"source--{test_sm.id}.original.json")
            else:
                karma_sm = KarmaModel.load_from_file(self.ont, karma_models_dir / f"{test_sm.id}-model.json")
            sm_alignment: SemanticModelAlignment = SemanticModelAlignment(test_sm, karma_sm)

            with open(self.exec_dir / "output" / file_name, 'r') as f:
                for i, serialized_sm in enumerate(f):
                    pred_sm = sm_alignment.load_and_align(self.ont, serialized_sm)
                    pred_sm.id = f"{test_sm.id}:::{i}"
                    predicted_models.append(pred_sm)
                    if (i + 1) >= n_candidate:
                        break

            if len(predicted_models) == 0:
                # no prediction at all: fall back to a graph that has only the data nodes
                karma_graph = KarmaGraph(True, True, True)
                for dnode in karma_sm.karma_graph.iter_data_nodes():
                    karma_graph.real_add_new_node(KarmaGraphNode([], [], dnode.literal_type, dnode.is_literal_node),
                                                  GraphNodeType.DATA_NODE, dnode.label)
                karma_model = KarmaModel(karma_sm.id, karma_sm.description, karma_sm.source_columns,
                                         karma_sm.mapping_to_source_columns, karma_graph)
                predicted_models = [karma_model]

            assert len(predicted_models) == len({m.id for m in predicted_models}), "No id duplication"
            results.append(predicted_models)

        return results

    def sm_candidate_generation(self, training_sources: List[SemanticModel],
                                testing_sources: List[SemanticModel]) -> List[List[SemanticModel]]:
        """Return, per test source, the semantic models of all predicted Karma models."""
        results = self.karma_model_candidate_generation(training_sources, testing_sources)
        return [[m.get_semantic_model() for m in predicted_models] for predicted_models in results]

    def semantic_labeling(self, training_sources: List[SemanticModel], testing_sources: List[SemanticModel]) -> List[
        Dict[str, List[KarmaSemanticType]]]:
        """Return the learned semantic types of every data node of each test source.

        Only the first predicted model of each source is used. The result has one dict per
        test source, mapping the utf-8 decoded node label to its ``learned_semantic_types``.
        """
        results = self.karma_model_candidate_generation(training_sources, testing_sources, n_candidate=1)
        node2stypes = []

        for _, predicted_models in zip(testing_sources, results):
            node2stypes.append({node.label.decode("utf-8"): node.learned_semantic_types for node in
                                predicted_models[0].karma_graph.iter_data_nodes()})

        return node2stypes

    def sm_prediction(self, training_sources: List[SemanticModel], testing_sources: List[SemanticModel]) -> List[
        SemanticModel]:
        """Return the first candidate semantic model of each test source."""
        return [pred_sms[0] for pred_sms in self.sm_candidate_generation(training_sources, testing_sources)]
Пример #12
0
class SemanticTypeDB(object):
    """Database of column-to-column similarity features used for semantic labeling.

    ``similarity_matrix`` has shape (n_train + n_test columns, n_train columns,
    number of ``SIMILARITY_METRICS``); the row of a column is given by
    ``col2idx`` (train columns first, then test columns).
    """

    logger = get_logger('app.semantic_labeling.stype_db')
    SIMILARITY_METRICS = [
        "label_jaccard", "stype_jaccard", "num_ks_test",
        "num_mann_whitney_u_test", "num_jaccard", "text_jaccard", "text_tf-idf"
    ]
    # process-wide instance handed out by get_stype_db
    instance = None

    def __init__(self, dataset: str, train_tables: List[ColumnBasedTable],
                 test_tables: List[ColumnBasedTable]):
        self.dataset = dataset
        self.train_tables = train_tables
        self.test_tables = test_tables

        # both are filled by _build_db
        self.similarity_matrix: numpy.ndarray = None
        self.tfidf_db: TfidfDatabase = None
        self._init()

    def _init(self):
        """Build the lookup structures derived from the dataset and the tables."""
        self.source_mappings: Dict[str, SemanticModel] = {
            s.id: s
            for s in get_semantic_models(self.dataset)
        }
        self.train_columns = [
            col for tbl in self.train_tables for col in tbl.columns
        ]
        # predicate label (decoded) of every train column, aligned with train_columns
        self.train_column_stypes: List[str] = []
        for tbl in self.train_tables:
            sm = self.source_mappings[tbl.id]
            for col in tbl.columns:
                attr = sm.get_attr_by_label(col.name)
                dlink = sm.graph.get_node_by_id(
                    attr.id).get_first_incoming_link()
                self.train_column_stypes.append(dlink.label.decode("utf-8"))

        self.test_columns = [
            col for tbl in self.test_tables for col in tbl.columns
        ]
        self.name2table: Dict[str, ColumnBasedTable] = {
            tbl.id: tbl
            for tbl in chain(self.train_tables, self.test_tables)
        }
        self.col2idx: Dict[str, int] = {
            col.id: i
            for i, col in enumerate(
                chain(self.train_columns, self.test_columns))
        }
        self.col2types: Dict[str, Tuple[str, str]] = {}
        self.col2dnodes: Dict[str, GraphNode] = {}

        for col in chain(self.train_columns, self.test_columns):
            sm = self.source_mappings[col.table_name]
            dnode = sm.graph.get_node_by_id(sm.get_attr_by_label(col.name).id)
            link = dnode.get_first_incoming_link()
            self.col2types[col.id] = (link.get_source_node().label, link.label)
            self.col2dnodes[col.id] = dnode

        assert len(self.col2types) == len(self.train_columns) + len(
            self.test_columns), "column name must be unique"

    @staticmethod
    def create(dataset: str, train_source_ids: List[str]) -> 'SemanticTypeDB':
        """Create a database (not built yet) from the sampled data tables of ``dataset``.

        Tables whose id is in ``train_source_ids`` become train tables, all others test tables.
        """
        tables = get_sampled_data_tables(dataset)
        train_source_ids = set(train_source_ids)

        train_tables = []
        test_tables = []
        for tbl in tables:
            target = train_tables if tbl.id in train_source_ids else test_tables
            target.append(ColumnBasedTable.from_table(tbl))

        return SemanticTypeDB(dataset, train_tables, test_tables)

    @staticmethod
    def get_stype_db(dataset: str, train_source_ids: List[str],
                     cache_dir: Path) -> 'SemanticTypeDB':
        """Return the shared instance, loading it from ``cache_dir`` or building it if needed.

        The cached file is reused only when it was built for the same dataset and the same
        set of train sources. Once an instance exists, it is returned regardless of the
        arguments.
        """
        if SemanticTypeDB.instance is None:
            cache_file = cache_dir / 'stype_db.pkl'
            stype_db = None
            if cache_file.exists():
                SemanticTypeDB.logger.debug(
                    "Load SemanticTypeDB from cache file...")
                cached: SemanticTypeDB = deserialize(cache_file)
                cached_train_ids = {tbl.id for tbl in cached.train_tables}
                if set(train_source_ids
                       ) == cached_train_ids and cached.dataset == dataset:
                    stype_db = cached

            if stype_db is None:
                SemanticTypeDB.logger.debug(
                    "Have to re-create SemanticTypeDB...")
                stype_db = SemanticTypeDB.create(dataset, train_source_ids)
                stype_db._build_db()
                serialize(stype_db, cache_file)

            SemanticTypeDB.instance = stype_db

        return SemanticTypeDB.instance

    def get_table_by_name(self, name: str) -> ColumnBasedTable:
        """Return the train or test table registered under ``name`` (its id)."""
        return self.name2table[name]

    def _build_db(self) -> None:
        """Build semantic types database from scratch"""
        n_train_columns = len(self.train_columns)

        self.logger.debug("Build tfidf database...")
        self.similarity_matrix = numpy.zeros(
            (n_train_columns + len(self.test_columns), n_train_columns,
             len(self.SIMILARITY_METRICS)),
            dtype=float)
        self.tfidf_db = TfidfDatabase.create(textual.get_tokenizer(),
                                             self.train_columns)

        self.logger.debug("Pre-build tf-idf for all columns")
        self.tfidf_db.cache_tfidf(self.test_columns)
        self.logger.debug("Computing similarity matrix...")

        # rows follow col2idx: train columns first, then test columns; every column is
        # compared against all train columns
        for idx, col in enumerate(
                chain(self.train_columns, self.test_columns)):
            self.logger.trace("   + working on col: %s", col.id)
            sim_features = self._compute_feature_vectors(
                col, self.train_columns, self.train_column_stypes)
            self.similarity_matrix[idx, :, :] = numpy.asarray(
                sim_features).reshape((n_train_columns, -1))

    def _compute_feature_vectors(self, col: Column, refcols: List[Column],
                                 refcol_stypes: List[str]):
        """Return one feature list (ordered as SIMILARITY_METRICS) per reference column."""
        return [
            [
                # name features
                column_name.jaccard_sim_test(refcol.name, col.name,
                                             lower=True),
                column_name.jaccard_sim_test(refcol_stype,
                                             col.name,
                                             lower=True),
                # numeric features
                numeric.ks_test(refcol, col),
                numeric.mann_whitney_u_test(refcol, col),
                numeric.jaccard_sim_test(refcol, col),
                # text features
                textual.jaccard_sim_test(refcol, col),
                textual.cosine_similarity(self.tfidf_db.compute_tfidf(refcol),
                                          self.tfidf_db.compute_tfidf(col)),
            ] for refcol, refcol_stype in zip(refcols, refcol_stypes)
        ]

    # implement pickling
    def __getstate__(self):
        # tfidf_db is deliberately left out of the pickled state
        return self.dataset, self.train_tables, self.test_tables, self.similarity_matrix

    def __setstate__(self, state):
        self.dataset, self.train_tables, self.test_tables, self.similarity_matrix = state[:4]
        # __init__ is bypassed on unpickling and tfidf_db is not part of the state; define
        # it anyway so that the attribute always exists, exactly as after __init__
        self.tfidf_db = None
        self._init()
Пример #13
0
class TfidfDatabase(object):
    """TF-IDF vectors of columns, computed over the tokens of their textual data.

    ``vocab`` maps a token to its position in the vector and ``invert_token_idx`` maps a
    token to the number of columns that contain it. Vectors are cached per column id in
    ``cache_col2tfidf``.
    """

    logger = get_logger('app.semantic_labeling.tfidf_db')

    def __init__(self, tokenizer, vocab: Dict[str, int],
                 invert_token_idx: Dict[str, int],
                 col2tfidf: Dict[str, numpy.ndarray]) -> None:
        self.vocab = vocab
        self.invert_token_idx = invert_token_idx
        self.tokenizer = tokenizer
        # number of columns the database was created from; fixed at construction time
        self.n_docs = len(col2tfidf)
        self.cache_col2tfidf = col2tfidf

    @staticmethod
    def _to_tfidf_vector(tf_col: Dict[str, float], vocab: Dict[str, int],
                         invert_token_idx: Dict[str, int],
                         n_docs: int) -> numpy.ndarray:
        """Turn the term frequencies of one column into a tf-idf vector over ``vocab``.

        Tokens outside the vocabulary are ignored; idf is ``log(n_docs / (1 + df))``.
        """
        tfidf = numpy.zeros(len(vocab))
        for w, tf in tf_col.items():
            if w in vocab:
                tfidf[vocab[w]] = tf * numpy.log(n_docs /
                                                 (1 + invert_token_idx[w]))
        return tfidf

    @staticmethod
    def create(tokenizer, columns: List[Column]) -> 'TfidfDatabase':
        """Build the vocabulary, document frequencies and tf-idf vectors of ``columns``."""
        n_docs = len(columns)

        # compute tf first
        with Pool() as p:
            tf_cols = p.map(TfidfDatabase._compute_tf,
                            [(tokenizer, col) for col in columns])

        # then count in how many columns each token appears (document frequency)
        invert_token_idx: Dict[str, int] = defaultdict(int)
        for tf_col in tf_cols:
            for w in tf_col:
                invert_token_idx[w] += 1

        # reduce vocab size: drop digit-only tokens that appear in a single column
        vocab: Dict[str, int] = {}
        for w in list(invert_token_idx):
            if invert_token_idx[w] < 2 and w.isdigit():
                del invert_token_idx[w]
            else:
                vocab[w] = len(vocab)

        # revisit the columns and make tfidf
        col2tfidf = {
            col.id: TfidfDatabase._to_tfidf_vector(tf_col, vocab,
                                                   invert_token_idx, n_docs)
            for col, tf_col in zip(columns, tf_cols)
        }

        return TfidfDatabase(tokenizer, vocab, invert_token_idx, col2tfidf)

    def compute_tfidf(self, col: Column):
        """Return the tf-idf vector of ``col``, taken from the cache when available.

        A vector computed here is not added to the cache.
        """
        if col.id in self.cache_col2tfidf:
            return self.cache_col2tfidf[col.id]

        return self._to_tfidf_vector(
            self._compute_tf((self.tokenizer, col)), self.vocab,
            self.invert_token_idx, self.n_docs)

    def cache_tfidf(self, cols: List[Column]):
        """Compute (in a process pool) and cache the tf-idf vectors of the uncached ``cols``."""
        cols = [col for col in cols if col.id not in self.cache_col2tfidf]

        with Pool() as p:
            tf_cols = p.map(TfidfDatabase._compute_tf,
                            [(self.tokenizer, col) for col in cols])

        for col, tf_col in zip(cols, tf_cols):
            self.cache_col2tfidf[col.id] = self._to_tfidf_vector(
                tf_col, self.vocab, self.invert_token_idx, self.n_docs)

    @staticmethod
    def _compute_tf(args):
        """Return the relative frequency of every token in a column.

        ``args`` is a ``(tokenizer, column)`` tuple so that the function can be used with
        ``Pool.map``. Each textual value is utf-8 decoded and split on "/" before being
        tokenized.
        """
        tokenizer, col = args
        counter = Counter()
        sents = (subsent for sent in col.get_textual_data()
                 for subsent in sent.decode('utf-8').split("/"))
        for doc in tokenizer.pipe(sents, batch_size=50, n_threads=4):
            counter.update((str(w) for w in doc))

        # normalize counts in place; the loop is a no-op for an empty counter
        number_of_token = sum(counter.values())
        for token in counter:
            counter[token] /= number_of_token
        return counter
Пример #14
0
    ParallelBatchExample, Tensor1AccumulatorDict
from gmtk.optimize.numerical_gradient import NumericalGradient
from gmtk.optimize.optimizer import PyTorchOptimizer
from gmtk.tensors import DenseTensorFunc
from semantic_modeling.assembling.learning.shared_models import Example, TrainingArgs
from semantic_modeling.assembling.undirected_graphical_model.model_core import ExampleAnnotator
from semantic_modeling.assembling.undirected_graphical_model.templates.triple_template import TripleFactorTemplate
from semantic_modeling.assembling.undirected_graphical_model.model_extra import TensorBoard, evaluate, \
    get_latest_model_id, move_current_files, save_evaluation_result
from semantic_modeling.assembling.undirected_graphical_model.templates.substructure_template import \
    SubstructureFactorTemplate
from semantic_modeling.config import get_logger
from semantic_modeling.utilities.parallel_util import sequential_map
from semantic_modeling.utilities.serializable import serialize, serializeJSON

# Module-level logger, registered under 'app.persistent.assembling.train_model'.
logger = get_logger('app.persistent.assembling.train_model')


def nll_func(example: NegativeLogLikelihoodExample):
    """Return the value `example` accumulates into a fresh `ValueAccumulator`.

    `None` is passed in the gradient slot, so only the value is read back.
    """
    accumulator = ValueAccumulator()
    example.accumulate_value_and_gradient(accumulator, None)
    return accumulator.get_value()


def train_model(dataset: str, train_sids: List[str], manual_seed: int,
                train_examples: List[Example], test_examples: List[Example],
                args: TrainingArgs, basedir: Path):
    """Entry point of model training (only its first statements are visible here).

    The visible part seeds `DenseTensorFunc` with `manual_seed` and
    instantiates a `GrowableBinaryVectorDomain`; none of the other parameters
    is used in the lines shown.
    """
    # seed before anything else is created
    DenseTensorFunc.manual_seed(manual_seed)

    # NOTE(review): presumably the feature domain used by the factor templates -- confirm
    tf_domain = GrowableBinaryVectorDomain()
Пример #15
0
class MinhptxSemanticLabeling(object):
    """Run the external Minhptx ISWC-2016 semantic labeling program on a dataset.

    The class prepares train/test input folders under `exec_dir`, invokes the
    command-line tool configured at `config.previous_works.minhptx_iswc2016.cli`
    and loads the JSON files found in the output folder. The settings of the
    last run are kept in `execution-meta.json` so that a compatible earlier
    run can be reused instead of executing the tool again.
    """

    logger = get_logger("app.minhptx_iswc2016")

    def __init__(self,
                 dataset: str,
                 max_n_records: float = float('inf'),
                 is_sampling: bool = False,
                 exec_dir: Optional[Union[Path, str]] = None) -> None:
        """
        :param dataset: dataset name, used to look up paths in `config`
        :param max_n_records: line limit applied when copying a source file;
            the default (infinity) copies every file unchanged
        :param is_sampling: must be False, sampling is not implemented
        :param exec_dir: working directory; defaults to
            `<config.fsys.debug>/<dataset>/minhptx_iswc2016`
        """
        self.dataset: str = dataset
        self.ont: Ontology = get_ontology(dataset)
        self.max_n_records: float = max_n_records
        self.is_sampling: bool = is_sampling
        assert not is_sampling, "Not implemented"

        # every visible (non-dot) file of the dataset folder is a source
        self.source_ids: Set[str] = {
            file.stem
            for file in Path(
                config.datasets[dataset].data.as_path()).iterdir()
            if file.is_file() and not file.name.startswith(".")
        }

        if exec_dir is None:
            exec_dir = Path(
                config.fsys.debug.as_path()) / dataset / "minhptx_iswc2016"
        self.exec_dir: Path = Path(exec_dir)

        self.meta_file: Path = self.exec_dir / "execution-meta.json"
        self.input_dir: Path = self.exec_dir / "input"
        self.input_dir.mkdir(parents=True, exist_ok=True)
        self.output_dir: Path = self.exec_dir / "output"
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def get_meta(self, train_source_ids: Set[str], test_source_ids: Set[str]):
        """Return the settings that identify one execution of the tool."""
        return {
            "dataset": self.dataset,
            "max_n_records": self.max_n_records,
            "is_sampling": self.is_sampling,
            "source_ids": self.source_ids,
            "input_dir": str(self.input_dir),
            "output_dir": str(self.output_dir),
            "training_sources": train_source_ids,
            "testing_sources": test_source_ids
        }

    def _semantic_labeling(
            self, train_source_ids: Set[str], test_source_ids: Set[str]
    ) -> Dict[str, MinhptxSemanticLabelingResult]:
        """Generate semantic labeling for test_sources using train_sources

        The tool is executed again unless the stored metadata shows that the
        previous run used the same settings and covered every requested test
        source.

        :return: result per source id, for both test and train sources
        """
        need_reexec = True

        if self.meta_file.exists():
            # read meta and compare if previous run is compatible with current run
            self.logger.debug("Load information from previous run...")

            meta = deserializeJSON(self.meta_file)
            meta["training_sources"] = set(meta["training_sources"])
            meta["testing_sources"] = set(meta["testing_sources"])
            meta["source_ids"] = set(meta['source_ids'])

            new_meta = self.get_meta(train_source_ids, test_source_ids)
            # test sources only need to be covered by the previous run,
            # every other setting has to be identical
            requested_tests = new_meta.pop("testing_sources")
            previous_tests = meta.pop("testing_sources")
            if requested_tests.issubset(previous_tests) and new_meta == meta:
                need_reexec = False

        if need_reexec:
            self.logger.debug("Re-execute semantic labeling...")

            try:
                # preparing data, want to compute semantic models for all sources in dataset
                data_dir = Path(config.datasets[self.dataset].data.as_path())
                model_dir = Path(
                    config.datasets[self.dataset].models_json.as_path())

                # start from clean input/output folders
                shutil.rmtree(str(self.input_dir))
                for fpath in self.output_dir.iterdir():
                    os.remove(fpath)

                train_name = "%s_train" % self.dataset
                test_name = "%s_test" % self.dataset
                for dataset_name in (train_name, test_name):
                    for sub_dir in ("data", "model"):
                        (self.input_dir / dataset_name / sub_dir).mkdir(
                            parents=True, exist_ok=True)

                input_train_dir = self.input_dir / train_name
                input_test_dir = self.input_dir / test_name
                targets = ((train_source_ids, input_train_dir),
                           (test_source_ids, input_test_dir))

                for fpath in sorted(data_dir.iterdir()):
                    model_fname = fpath.stem + "-model.json"
                    for source_ids, target_dir in targets:
                        if fpath.stem not in source_ids:
                            continue

                        self._copy_data(fpath,
                                        target_dir / "data" / fpath.name)
                        # serialize the model instead of copying it because we want to convert uri to simplified uri
                        # instead of full uri (e.g karma:classLink). Full URI doesn't work in this app
                        serializeJSON(KarmaModel.load_from_file(
                            self.ont, model_dir /
                            model_fname).to_normalized_json_model(),
                                      target_dir / "model" /
                                      f"{fpath.name}.model.json",
                                      indent=4)

                invoke_command(" ".join([
                    config.previous_works.minhptx_iswc2016.cli.as_path(),
                    str(self.input_dir),
                    str(self.output_dir), "--train_dataset", train_name,
                    "--test_dataset", test_name, "--evaluate_train_set",
                    "True", "--reuse_rf_model", "False"
                ]),
                               output2file=self.exec_dir / "execution.log")
            except Exception:
                sys.stdout.flush()
                self.logger.exception(
                    "Error while preparing and invoking semantic labeling api..."
                )
                raise

            # only written after a successful run, so a failed run is never reused
            serializeJSON(self.get_meta(train_source_ids, test_source_ids),
                          self.meta_file,
                          indent=4)

        # load result
        self.logger.debug("Load previous result...")
        output_files = [
            fpath for fpath in self.output_dir.iterdir()
            if fpath.suffix == ".json"
        ]
        assert len(output_files) == 2
        app_result: Dict[str, MinhptxSemanticLabelingResult] = deserializeJSON(
            output_files[0], Class=MinhptxSemanticLabelingResult)
        app_result.update(
            deserializeJSON(output_files[1],
                            Class=MinhptxSemanticLabelingResult))

        return {
            source_id: app_result[source_id]
            for source_id in chain(test_source_ids, train_source_ids)
        }

    def _copy_data(self, fsource: Path, fdest: Path) -> None:
        """Copy `fsource` to `fdest`, truncated when `max_n_records` is finite.

        Only `.csv` files can be truncated. Lines with index 0..max_n_records
        are kept, i.e. one line more than `max_n_records`
        (NOTE(review): presumably to account for a header line -- confirm).

        :raises AssertionError: for a non-csv file when truncation is requested
        """
        if self.max_n_records == float('inf'):
            shutil.copyfile(str(fsource), str(fdest))
            return

        if fsource.suffix != ".csv":
            # raised explicitly (instead of `assert False`) so that the check
            # is not stripped when python runs with -O
            raise AssertionError("Not support file type: %s" % fsource.suffix)

        with open(fsource, "r") as f, open(fdest, "w") as g:
            for i, line in enumerate(f):
                if i > self.max_n_records:
                    break

                g.write(line)

    def semantic_labeling(self, train_sources: List[SemanticModel],
                          test_sources: List[SemanticModel],
                          top_n: int) -> None:
        """Generate semantic labeling, and store it in the given sources

        `col.semantic_types` of every attribute of both train and test sources
        is replaced by at most `top_n` predicted types; a column that is
        missing from the result gets an empty list.

        :param train_sources: sources used for training; must not share an id
            with `test_sources`
        :param test_sources: sources to label
        :param top_n: maximum number of semantic types kept per column
        """
        train_source_ids = {s.id for s in train_sources}
        test_source_ids = {s.id for s in test_sources}
        assert len(train_source_ids.intersection(test_source_ids)) == 0
        result = self._semantic_labeling(train_source_ids, test_source_ids)

        # dump result into train and test sources
        for source in chain(train_sources, test_sources):
            for col in source.attrs:
                try:
                    if col.label not in result[source.id].columns:
                        # this column is ignored
                        stypes = []
                    else:
                        stypes = result[source.id].columns[col.label]

                    col.semantic_types = [
                        KarmaSemanticType(col.id, stype.domain, stype.type,
                                          "Minhptx-ISWC2016-SemanticLabeling",
                                          stype.weight) for stype in stypes
                    ][:top_n]
                except Exception:
                    # use the same `source.id` attribute as above so that the
                    # handler itself cannot fail and hide the original error
                    self.logger.exception(
                        "Hit exception for source: %s, col: %s",
                        source.id, col.id)
                    raise
Пример #16
0
class SemanticTypeAssistant(object):
    """We use semantic type to help justify if class C (not data node) should link to class A or class B
    Score is a potential gain if switching to another class (for example: potential gain if C link to B instead of A (currently C link to A))
    """

    logger = get_logger("app.weak_models.stype_assistant")

    def __init__(self, train_sms: List[SemanticModel], typer: SemanticTyper, triple_adviser: TripleAdviser):
        """Build the indexes used by `compute_prob`.

        :param train_sms: training semantic models
        :param typer: provides `stype_db` (tables, column indexes, similarity
            matrix) and the `model` used to turn similarities into probabilities
        :param triple_adviser: asked for the possible (subject, predicate)
            mounts of every class label seen in `train_sms`
        """
        self.train_sms = {sm.id: sm for sm in train_sms}
        self.stype_db = typer.stype_db
        self.triple_adviser = triple_adviser

        # contain a mapping from (semantic types & parent stypes (s, p, o) to columns
        self.column_stype_index: Dict[bytes, Dict[Tuple[bytes, bytes, bytes], List[Column]]] = {}
        for train_sm in train_sms:
            table = self.stype_db.get_table_by_name(train_sm.id)

            for dnode in train_sm.graph.iter_data_nodes():
                dlink = dnode.get_first_incoming_link()
                pnode = dlink.get_source_node()
                plink = pnode.get_first_incoming_link()
                if plink is None:
                    # this is a root node
                    continue

                parent_stype = (plink.get_source_node().label, plink.label, pnode.label)
                column = table.get_column_by_name(dnode.label.decode("utf-8"))
                self.column_stype_index \
                    .setdefault(pnode.label, {}) \
                    .setdefault(parent_stype, []) \
                    .append(column)

        # possible_mount of a node
        self.possible_mounts: Dict[bytes, List[Tuple[bytes, bytes]]] = {}
        for train_sm in train_sms:
            for n in train_sm.graph.iter_class_nodes():
                if n.label not in self.possible_mounts:
                    self.possible_mounts[n.label] = self.triple_adviser.get_subj_preds(n.label)

        # contains the likelihood between 2 columns: the last axis of the stored
        # similarity matrix is flattened into rows for `predict_proba`, and the
        # second column of its output is reshaped back to the leading axes
        X = self.stype_db.similarity_matrix.reshape((-1, self.stype_db.similarity_matrix.shape[-1]))
        similarity_matrix = typer.model.predict_proba(X)[:, 1]
        self.similarity_matrix = similarity_matrix.reshape(self.stype_db.similarity_matrix.shape[:-1])

        # mapping from table id => column's name (utf-8 encoded) => column's index
        self.name2cols: Dict[str, Dict[bytes, int]] = {}
        tbl: ColumnBasedTable
        for tbl in chain(self.stype_db.train_tables, self.stype_db.test_tables):
            self.name2cols[tbl.id] = {}
            for col in tbl.columns:
                self.name2cols[tbl.id][col.name.encode('utf-8')] = self.stype_db.col2idx[col.id]

        self.logger.debug("Finish building index for semantic type assistant...")

    def compute_prob(self, sm_id: str, g: Graph) -> Dict[int, float]:
        """Score the incoming link of every class node that owns data nodes.

        For such a link the score is the aggregated column similarity of its
        current mount minus the best aggregated similarity among the other
        possible mounts.

        :param sm_id: id of the table whose columns are the data nodes of `g`
        :param g: graph to score
        :return: link id => score; the value is None when there is no
            alternative mount to compare with or the current mount is not one
            of the candidates
        """
        link2features = {}
        graph_observed_mounts = set()
        name2col_idx = self.name2cols[sm_id]

        # id of a class node => (node, its incoming link, (source label, predicate)
        # of that link, links to its data nodes, column indexes of those data nodes)
        parent_nodes: Dict[int, tuple] = {}
        for dnode in g.iter_data_nodes():
            dlink = dnode.get_first_incoming_link()
            col_idx = name2col_idx[dnode.label]

            if dlink.source_id not in parent_nodes:
                pnode = dlink.get_source_node()
                plink = pnode.get_first_incoming_link()
                if plink is None:
                    continue

                pstype = (plink.get_source_node().label, plink.label)

                # add pstype to observed mounts
                graph_observed_mounts.add(pstype)
                parent_nodes[dlink.source_id] = (pnode, plink, pstype, [dlink], [col_idx])
            else:
                parent_nodes[dlink.source_id][-2].append(dlink)
                parent_nodes[dlink.source_id][-1].append(col_idx)

        graph_observed_class_lbls = {pnode.label for pnode in g.iter_class_nodes()}

        for pnode, plink, pstype, _dlinks, col_idxs in parent_nodes.values():
            # map from possible mount => scores of each columns
            parent_stype_score: Dict[Tuple[bytes, bytes], List[float]] = {}

            # filter out all possible mounts that present in the graph (except the current one),
            # but the domain of the mounts are not in the graph
            possible_mounts = [
                possible_mount for possible_mount in self.possible_mounts.get(pnode.label, [])
                if not ((possible_mount in graph_observed_mounts and possible_mount != pstype)
                        or possible_mount[0] not in graph_observed_class_lbls)
            ]

            if len(possible_mounts) > 1:
                # the number only make sense if there are another place to mount this object to
                for possible_mount in possible_mounts:
                    spo = (possible_mount[0], possible_mount[1], pnode.label)
                    # the reference columns do not depend on the column being
                    # scored, so they are resolved once per mount
                    ref_idxs = [
                        self.stype_db.col2idx[refcol.id]
                        for refcol in self.column_stype_index[pnode.label][spo]
                    ]
                    parent_stype_score[possible_mount] = [
                        max(self.similarity_matrix[col_idx, ref_idx] for ref_idx in ref_idxs)
                        for col_idx in col_idxs
                    ]

                aggregation_score = {mount: sum(scores) / len(scores) for mount, scores in parent_stype_score.items()}
            else:
                aggregation_score = {}

            if pstype not in aggregation_score:
                link2features[plink.id] = None
            else:
                link2features[plink.id] = aggregation_score.pop(pstype) - max(aggregation_score.values())

        return link2features
Пример #17
0
class ColumnBasedTable(object):
    """A table stored as a list of `Column` objects that can be looked up by name."""

    logger = get_logger('app.semantic_labeling.data_table')

    def __init__(self, id: str, columns: List[Column]) -> None:
        self.id = id
        self.columns: List[Column] = columns
        # column name => position of the column in `self.columns`
        self.name2colidx: Dict[str, int] = {
            col.name: idx
            for idx, col in enumerate(columns)
        }

    def get_column_by_name(self, name: str):
        """Return the column called `name`; raises KeyError for an unknown name."""
        return self.columns[self.name2colidx[name]]

    @staticmethod
    def from_table(tbl: DataTable) -> 'ColumnBasedTable':
        """Build a column-based table from the rows of `tbl`.

        One column is created per attribute path of the schema. Its values are
        normalized with `norm_val` and its type is chosen from the ratio of
        NUMBER / STRING / NULL values.

        :raises Exception: when the type of a column cannot be decided
        """
        columns = []
        for cname in tbl.schema.get_attr_paths():
            type_stats = {
                ctype: 0.0
                for ctype in
                [ColumnType.NUMBER, ColumnType.STRING, ColumnType.NULL]
            }
            col_values = []
            for row in tbl.rows:
                get_col_values(cname.split(Schema.PATH_DELIMITER), row,
                               col_values)

            col_values = [
                norm_val(val, empty_as_null=True) for val in col_values
            ]
            for val in col_values:
                type_stats[get_type(val)] += 1

            # turn the counts into ratios
            # NOTE(review): a column without any value raises ZeroDivisionError here
            for key, val in type_stats.items():
                type_stats[key] = val / len(col_values)

            number_ratio = type_stats[ColumnType.NUMBER]
            string_ratio = type_stats[ColumnType.STRING]
            null_ratio = type_stats[ColumnType.NULL]

            # now we have to decide what type of this column using some heuristic!!
            if string_ratio > number_ratio:
                col_type = ColumnType.STRING
            elif null_ratio < 0.7 and (number_ratio + null_ratio) < 0.9:
                col_type = ColumnType.STRING
            elif number_ratio > string_ratio and (number_ratio +
                                                  null_ratio) > 0.9:
                col_type = ColumnType.NUMBER
            elif all(val is None for val in col_values):
                col_type = ColumnType.NULL
            else:
                ColumnBasedTable.logger.error(
                    "Cannot decide type with the stats: %s",
                    ujson.dumps(type_stats, indent=4))
                # the message has to use `cname`, the name of the current column
                raise Exception(
                    f"Cannot decide type of column: {cname} in {tbl.id}")

            column = Column(tbl.id, cname, col_type, len(col_values),
                            type_stats)
            column.value = ColumnData(col_values)
            columns.append(column)

        col_based_tbl = ColumnBasedTable(tbl.id, columns)
        return col_based_tbl

    def to_dict(self):
        """Return a dict with the table id and the dict form of every column."""
        return {
            "id": self.id,
            "columns": [col.to_dict() for col in self.columns]
        }

    @staticmethod
    def from_dict(val) -> 'ColumnBasedTable':
        """Rebuild a table from the output of `to_dict`."""
        tbl = ColumnBasedTable(
            val["id"], [Column.from_dict(col) for col in val["columns"]])
        return tbl

    # implement pickling
    def __getstate__(self):
        return self.to_dict()

    def __setstate__(self, state):
        # rebuild through `from_dict` so that `name2colidx` is recomputed
        obj = ColumnBasedTable.from_dict(state)
        self.__dict__ = obj.__dict__
Пример #18
0
from semantic_modeling.karma.karma import KarmaModel
from semantic_modeling.karma.semantic_model import SemanticModel
from semantic_modeling.utilities.ontology import Ontology
from semantic_modeling.utilities.serializable import deserialize, deserializeJSON, serialize, serializeJSON
from transformation.models.data_table import DataTable
from transformation.r2rml.r2rml import R2RML

# Module-level holder of per-category dictionaries.
# "ont" maps a dataset name to its Ontology (filled by `get_ontology`).
# NOTE(review): the other categories presumably cache their objects per
# dataset in the same way -- confirm against the functions that fill them.
_data_io_vars = {
    "ont": {},
    "karma_models": {},
    "semantic_models": {},
    "data_tables": {},
    "raw_data_tables": {},
    "sampled_data_tables": {}
}
# Module-level logger, registered under "app.data_io".
_logger = get_logger("app.data_io")


def get_ontology(dataset: str) -> Ontology:
    """Get ontology of a given dataset

    The ontology is looked up in `_data_io_vars["ont"]` first. On a miss it is
    read from `ont.pkl` in the dataset's cache directory when that file exists,
    otherwise it is built with `Ontology.from_dataset` and written to that
    file. Either way it is then stored in `_data_io_vars["ont"]`.

    :param dataset: name of the dataset
    :return: the ontology of `dataset`
    """
    global _data_io_vars
    if dataset not in _data_io_vars["ont"]:
        # not in memory yet: reuse the serialized copy on disk if there is one
        cache_file = get_cache_dir(dataset) / 'ont.pkl'
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        if cache_file.exists():
            ont = deserialize(cache_file)
        else:
            ont = Ontology.from_dataset(dataset)
            serialize(ont, cache_file)
        _data_io_vars["ont"][dataset] = ont

    # the function is declared to return an Ontology: hand the stored object back
    return _data_io_vars["ont"][dataset]
Пример #19
0
        self.source_id = source_id
        self.n_attrs = n_attrs
        self.discovering_func = discovering_func
        self.tracker: Tracker = tracker
        self.early_terminate_func: Callable[[int, Iterable[SearchNode]],
                                            bool] = early_terminate_func

    def should_stop(self, n_iter: int,
                    current_nodes: Iterable[SearchNode]) -> bool:
        """Tell whether the search should terminate early.

        Always False when no `early_terminate_func` was given; otherwise the
        answer of that function for `(n_iter, current_nodes)`.
        """
        terminate = self.early_terminate_func
        return False if terminate is None else terminate(n_iter, current_nodes)


# Module-level logger, registered under 'app.assembling.beam_search'.
_logger = get_logger('app.assembling.beam_search')


# @profile
def beam_search(starts: List[SearchNode], beam_width: int, n_results: int,
                args: BeamSearchArgs) -> List[SearchNode]:
    global _logger

    assert beam_width >= len(starts)
    # store the search result, a map from id of node's value => node to eliminate duplicated result
    results: Dict[str, SearchNode] = {}

    # ##############################################
    # Add very first nodes to kick off BEAM SEARCH
    current_exploring_nodes: Dict[str, SearchNode] = OrderedDict()
    for n in starts: