# Example #1
# 0
    def get_stype_score(self, example: Example) -> Dict[int, float]:
        """Map each data node id to the confidence score of the semantic
        type that matches its incoming link (``None`` when no semantic type
        of the attribute matches the link's source label and type)."""
        source_desc = self.source_models[example.get_model_id()]
        scores = {}
        for data_node in example.pred_sm.iter_data_nodes():
            incoming = data_node.get_first_incoming_link()
            parent = incoming.get_source_node()
            attr = source_desc.get_attr_by_label(
                data_node.label.decode("utf-8"))
            # First semantic type whose (domain, type) matches the link's
            # (source label, link label); None when none matches.
            scores[data_node.id] = next(
                (stype.confidence_score for stype in attr.semantic_types
                 if stype.domain.encode("utf-8") == parent.label
                 and stype.type.encode("utf-8") == incoming.label), None)
        return scores
# Example #2
# 0
    def annotate(self, example: Example) -> Example:
        """Compute node- and link-level features for `example`.

        Fills `example.node2features` (node id => feature dict, including
        'node_prob') and `example.link2features` (link id => feature dict)
        in place, marks this object as the annotator, and returns the same
        `example` instance.
        """
        # STEP 1: semantic types are assumed to be present already, because
        # the example must be either in train or test.
        sm_id: str = example.get_model_id()
        assert sm_id in self.source_models
        example.annotator = self

        is_train_example: bool = sm_id in self.train_source_ids
        source: SemanticModel = self.source_models[sm_id]

        # id2attrs: Dict[int, Attribute] = {attr.id: attr for attr in sources[sm_id].attrs}
        example.node2features = {}
        example.link2features = {}
        # data node id => confidence of the matching semantic type (or None)
        stype_score = self.get_stype_score(example)

        # add node features from node_prob weak model
        node_prob_features = self.node_prob.feature_extraction(
            example.pred_sm, stype_score)
        node_probs = self.node_prob.compute_prob(node_prob_features)
        for nid, prob in node_probs.items():
            example.node2features[nid] = dict(node_prob_features[nid])
            example.node2features[nid]['node_prob'] = prob

        # link id => probability from the stype assistant (may miss links)
        stype_assistant = self.stype_assistant.compute_prob(
            sm_id, example.pred_sm)

        # add link features
        for node in example.pred_sm.iter_class_nodes():
            outgoing_links = list(node.iter_outgoing_links())
            # per-link numbering among siblings sharing the same label,
            # used below by the multi-value predicate model
            numbered_links = numbering_link_labels(outgoing_links)

            for link in outgoing_links:
                target = link.get_target_node()
                # defaults: a feature stays None when it does not apply
                total_stype_score = None
                delta_stype_score = None
                ratio_stype_score = None
                p_link_given_so = None
                p_triple = None
                stype_order = None
                data_constraint_features = {}

                if target.is_class_node():
                    # class -> class link: use co-occurrence statistics
                    p_link_given_so = self.statistic.p_l_given_so(
                        node.label, link.label, target.label,
                        default=0.5)  # half half
                    # joint score of (source prob, link prob, target prob)
                    p_triple = p_link_given_so * example.node2features[
                        link.source_id]['node_prob'] * example.node2features[
                            link.target_id]['node_prob']
                else:
                    # class -> data link: use the attribute's semantic types
                    target_stypes = source.get_attr_by_label(
                        target.label.decode("utf-8")).semantic_types
                    n_target_stypes = len(target_stypes)
                    total_stype_score = sum(stype.confidence_score
                                            for stype in target_stypes)

                    # find the semantic type matching this link's
                    # (source label, link label) pair, if any
                    for i, stype in enumerate(target_stypes):
                        if stype.domain.encode(
                                "utf-8") == node.label and stype.type.encode(
                                    "utf-8") == link.label:
                            # data node, p_link = score of semantic type
                            p_link_given_so = stype.confidence_score
                            # delta compares against the runner-up when this
                            # is the top stype, otherwise against the top one
                            if i == 0 and n_target_stypes > 1:
                                delta_stype_score = stype.confidence_score - target_stypes[
                                    1].confidence_score
                            else:
                                delta_stype_score = stype.confidence_score - target_stypes[
                                    0].confidence_score

                            ratio_stype_score = stype.confidence_score / target_stypes[
                                0].confidence_score
                            stype_order = i
                            break

                    if p_link_given_so is not None:
                        # data node has no node_prob of its own; only the
                        # source class node's probability enters the product
                        p_triple = p_link_given_so * example.node2features[
                            link.source_id]['node_prob']

                    # add data constraint
                    # if is_train_example:
                    #     # we can use link2label, because of known models
                    #     data_constraint_features = self.data_constraint.extract_feature(sm_id, example.pred_sm, target.id,
                    #                                                                     example.link2label)
                    # else:
                    #     data_constraint_features = self.data_constraint.extract_feature(sm_id, example.pred_sm, target.id)

                example.link2features[link.id] = {
                    'p_triple': p_triple,
                    'p_link_given_so': p_link_given_so,
                    'total_stype_score': total_stype_score,
                    'stype_order': stype_order,
                    'delta_stype_score': delta_stype_score,
                    'ratio_stype_score': ratio_stype_score,
                    # 'local_constraint': data_constraint_features.get("local", None),
                    # 'global_constraint': data_constraint_features.get("global", None),
                    'stype_prob': stype_assistant.get(link.id, None)
                }

                # multi-value feature only added when the model has an
                # opinion for this label/ordinal pair
                multi_val_prob = self.multival_predicate.compute_prob(
                    link.label, numbered_links[link.id])
                if multi_val_prob is not None:
                    example.link2features[
                        link.id]["multi_val_prob"] = multi_val_prob

        return example