def get_stype_score(self, example: Example) -> Dict[int, float]:
    """Compute the semantic-type confidence for every data node.

    For each data node in ``example.pred_sm``, look up the attribute with the
    same label in the source description and take the confidence score of the
    first semantic type whose (domain, type) matches the node's incoming
    (source label, link label). Returns a map: data node id => score, with
    ``None`` when no semantic type matches.
    """
    scores: Dict[int, float] = {}
    sm_desc = self.source_models[example.get_model_id()]

    for data_node in example.pred_sm.iter_data_nodes():
        inc_link = data_node.get_first_incoming_link()
        src_node = inc_link.get_source_node()
        attr = sm_desc.get_attr_by_label(data_node.label.decode("utf-8"))

        # First semantic type whose (domain, type) pair matches the incoming
        # edge; None if none of them do. Labels on the graph side are bytes,
        # semantic-type fields are str, hence the encode() before comparing.
        scores[data_node.id] = next(
            (stype.confidence_score
             for stype in attr.semantic_types
             if stype.domain.encode("utf-8") == src_node.label
             and stype.type.encode("utf-8") == inc_link.label),
            None)

    return scores
def annotate(self, example: Example) -> Example:
    """Populate ``example.node2features`` and ``example.link2features``.

    Node features come from the ``node_prob`` weak model; link features
    combine link statistics, semantic-type scores from the source
    description, and the stype-assistant probabilities. Mutates and
    returns the same ``example`` object.

    NOTE(review): this source was recovered from a whitespace-mangled
    dump; indentation below is a reconstruction of the original tokens.
    """
    # STEP 1: add semantic types... dont' need to do, because example must be either in train or test...
    sm_id: str = example.get_model_id()
    assert sm_id in self.source_models
    example.annotator = self
    # Computed but only consumed by the commented-out data-constraint
    # extraction below; kept for when that code path is restored.
    is_train_example: bool = sm_id in self.train_source_ids
    source: SemanticModel = self.source_models[sm_id]
    # id2attrs: Dict[int, Attribute] = {attr.id: attr for attr in sources[sm_id].attrs}
    example.node2features = {}
    example.link2features = {}
    stype_score = self.get_stype_score(example)

    # add node features from node_prob weak model
    node_prob_features = self.node_prob.feature_extraction(
        example.pred_sm, stype_score)
    node_probs = self.node_prob.compute_prob(node_prob_features)
    for nid, prob in node_probs.items():
        # Copy so 'node_prob' doesn't leak back into the extractor's dict.
        example.node2features[nid] = dict(node_prob_features[nid])
        example.node2features[nid]['node_prob'] = prob

    stype_assistant = self.stype_assistant.compute_prob(
        sm_id, example.pred_sm)

    # add link features
    for node in example.pred_sm.iter_class_nodes():
        outgoing_links = list(node.iter_outgoing_links())
        # presumably assigns an occurrence index per duplicated link label;
        # consumed by multival_predicate below — TODO confirm helper semantics
        numbered_links = numbering_link_labels(outgoing_links)
        for link in outgoing_links:
            target = link.get_target_node()
            # Defaults: features stay None when not applicable to this link.
            total_stype_score = None
            delta_stype_score = None
            ratio_stype_score = None
            p_link_given_so = None
            p_triple = None
            stype_order = None
            data_constraint_features = {}
            if target.is_class_node():
                # Class->class link: probability from corpus statistics.
                p_link_given_so = self.statistic.p_l_given_so(
                    node.label, link.label, target.label,
                    default=0.5)  # half half
                # Joint score of (source node, link, target node).
                p_triple = p_link_given_so * example.node2features[
                    link.source_id]['node_prob'] * example.node2features[
                    link.target_id]['node_prob']
            else:
                # Class->data link: score from the attribute's semantic types.
                target_stypes = source.get_attr_by_label(
                    target.label.decode("utf-8")).semantic_types
                n_target_stypes = len(target_stypes)
                total_stype_score = sum(stype.confidence_score
                                        for stype in target_stypes)
                # Find the first semantic type matching this (domain, type);
                # target_stypes appears ordered by confidence — TODO confirm.
                for i, stype in enumerate(target_stypes):
                    if stype.domain.encode(
                            "utf-8") == node.label and stype.type.encode(
                            "utf-8") == link.label:
                        # data node, p_link = score of semantic type
                        p_link_given_so = stype.confidence_score
                        if i == 0 and n_target_stypes > 1:
                            # Best type: margin over the runner-up.
                            delta_stype_score = stype.confidence_score - target_stypes[
                                1].confidence_score
                        else:
                            # Non-best type: (negative) margin vs. the best.
                            delta_stype_score = stype.confidence_score - target_stypes[
                                0].confidence_score
                        # NOTE(review): reconstructed at this level (applies
                        # in both branches, giving ratio 1.0 when i == 0) —
                        # confirm against the original layout.
                        ratio_stype_score = stype.confidence_score / target_stypes[
                            0].confidence_score
                        stype_order = i
                        break
                if p_link_given_so is not None:
                    p_triple = p_link_given_so * example.node2features[
                        link.source_id]['node_prob']
            # add data constraint
            # if is_train_example:
            #     # we can use link2label, because of known models
            #     data_constraint_features = self.data_constraint.extract_feature(sm_id, example.pred_sm, target.id,
            #                                                                     example.link2label)
            # else:
            #     data_constraint_features = self.data_constraint.extract_feature(sm_id, example.pred_sm, target.id)
            example.link2features[link.id] = {
                'p_triple': p_triple,
                'p_link_given_so': p_link_given_so,
                'total_stype_score': total_stype_score,
                'stype_order': stype_order,
                'delta_stype_score': delta_stype_score,
                'ratio_stype_score': ratio_stype_score,
                # 'local_constraint': data_constraint_features.get("local", None),
                # 'global_constraint': data_constraint_features.get("global", None),
                'stype_prob': stype_assistant.get(link.id, None)
            }
            # Optional feature: only present when the predicate label is
            # known to the multi-value model.
            multi_val_prob = self.multival_predicate.compute_prob(
                link.label, numbered_links[link.id])
            if multi_val_prob is not None:
                example.link2features[
                    link.id]["multi_val_prob"] = multi_val_prob
    return example