def extract_textual_features(
    candidates: Union[Candidate, List[Candidate]],
) -> Iterator[Tuple[int, str, int]]:
    """Extract textual features.

    Lazily yields one ``(candidate.id, feature_name, DEF_VALUE)`` tuple per
    feature. Feature names are prefixed by the helper that produced them:

    * ``DDL_`` -- from ``_get_ddlib_feats``
    * ``TDL_`` -- from the compiled TreeDLib feature generator
    * ``BASIC_`` -- from ``_get_word_feats``

    For candidates with more than one mention, the ``DDL_`` and ``BASIC_``
    prefixes additionally carry the zero-based mention position (``e0``,
    ``e1``, ...).

    ``DDL_`` and ``TDL_`` features are only produced when every mention's
    sentence reports ``is_lingual()`` and every mention covers at least one
    word index; ``BASIC_`` features are always produced.

    :param candidates: A candidate or a list of candidates to extract
        features from
    :raises ValueError: If the context of a candidate's first mention is
        neither a ``SpanMention`` nor an ``ImplicitSpanMention``.
    :return: A generator of ``(candidate id, feature name, feature value)``
        tuples.
    """
    candidates = candidates if isinstance(candidates, list) else [candidates]
    for candidate in candidates:
        args = tuple(m.context for m in candidate.get_mentions())
        # Only the first mention's context is type-checked.
        if not isinstance(args[0], (SpanMention, ImplicitSpanMention)):
            raise ValueError(
                f"Accepts Span/ImplicitSpan-type mentions, {type(args[0])}-type found."
            )

        # Unary candidates
        if len(args) == 1:
            span: Union[SpanMention, ImplicitSpanMention] = args[0]
            if span.sentence.is_lingual():
                sidxs = list(
                    range(span.get_word_start_index(), span.get_word_end_index() + 1)
                )
                if sidxs:
                    # Add DDLIB entity features
                    for f in _get_ddlib_feats(span, get_as_dict(span.sentence), sidxs):
                        yield candidate.id, f"DDL_{f}", DEF_VALUE
                    # Add TreeDLib entity features
                    # unary_tdl_feats (defined outside this block) is used as
                    # a cache keyed by the span's stable id.
                    if span.stable_id not in unary_tdl_feats:
                        # The generator and the tree are only needed on a
                        # cache miss, so build them here rather than for
                        # every candidate.
                        get_tdl_feats = _compile_entity_feature_generator()
                        xmltree = corenlp_to_xmltree(span.sentence)
                        # Store the set only once it is complete, so a
                        # failure part-way through never leaves a truncated
                        # entry in the cache.
                        unary_tdl_feats[span.stable_id] = set(
                            get_tdl_feats(xmltree.root, sidxs)
                        )
                    for f in unary_tdl_feats[span.stable_id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            for f in _get_word_feats(span):
                yield candidate.id, f"BASIC_{f}", DEF_VALUE

        # Multinary candidates
        else:
            spans = args
            if all(span.sentence.is_lingual() for span in spans):
                s_idxs = [
                    list(
                        range(
                            span.get_word_start_index(),
                            span.get_word_end_index() + 1,
                        )
                    )
                    for span in spans
                ]
                # Every mention must cover at least one word index.
                if all(s_idxs):
                    # Add DDLIB entity features for relation
                    for i, (span, s_idx) in enumerate(zip(spans, s_idxs)):
                        sent = get_as_dict(span.sentence)
                        for f in _get_ddlib_feats(span, sent, s_idx):
                            yield candidate.id, f"DDL_e{i}_{f}", DEF_VALUE
                    # Add TreeDLib relation features
                    # multinary_tdl_feats (defined outside this block) is
                    # used as a cache keyed by the candidate id.
                    if candidate.id not in multinary_tdl_feats:
                        get_tdl_feats = compile_relation_feature_generator(
                            is_multary=True
                        )
                        # The tree is built from the first mention's sentence
                        # only.
                        xmltree = corenlp_to_xmltree(spans[0].sentence)
                        # Assign the finished set in one step (see the unary
                        # branch for the rationale).
                        multinary_tdl_feats[candidate.id] = set(
                            get_tdl_feats(xmltree.root, s_idxs)
                        )
                    for f in multinary_tdl_feats[candidate.id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            for i, span in enumerate(spans):
                for f in _get_word_feats(span):
                    yield candidate.id, f"BASIC_e{i}_{f}", DEF_VALUE
def get_content_feats(candidates):
    """Yield content-based features for unary and binary candidates.

    Lazily yields one ``(candidate.id, feature_name, DEF_VALUE)`` tuple per
    feature. Feature names are prefixed by the helper that produced them:

    * ``DDL_`` -- from ``get_ddlib_feats``
    * ``TDL_`` -- from the compiled TreeDLib feature generator
    * ``BASIC_`` -- from ``get_word_feats``

    For binary candidates the ``DDL_`` and ``BASIC_`` prefixes additionally
    carry the mention position (``e1`` / ``e2``).

    ``DDL_`` and ``TDL_`` features are produced when the mention sentences
    report ``is_lingual()``; ``BASIC_`` features are produced only when they
    do not.

    :param candidates: A candidate or a list of candidates to extract
        features from
    :raises ValueError: If the context of a candidate's first mention is not
        a ``TemporarySpanMention``.
    :raises NotImplementedError: If a candidate has a number of mentions
        other than one or two.
    :return: A generator of ``(candidate id, feature name, feature value)``
        tuples.
    """
    candidates = candidates if isinstance(candidates, list) else [candidates]
    for candidate in candidates:
        args = tuple(m.context for m in candidate.get_mentions())
        # Only the first mention's context is type-checked.
        if not isinstance(args[0], TemporarySpanMention):
            # Report the type of the object that was actually checked.
            raise ValueError(
                f"Accepts Span-type arguments, {type(args[0])}-type found."
            )

        # Unary candidates
        if len(args) == 1:
            span = args[0]
            if span.sentence.is_lingual():
                get_tdl_feats = compile_entity_feature_generator()
                sent = get_as_dict(span.sentence)
                xmltree = corenlp_to_xmltree(sent)
                sidxs = list(
                    range(span.get_word_start_index(), span.get_word_end_index() + 1)
                )
                if sidxs:
                    # Add DDLIB entity features
                    for f in get_ddlib_feats(span, sent, sidxs):
                        yield candidate.id, f"DDL_{f}", DEF_VALUE
                    # Add TreeDLib entity features
                    # unary_tdl_feats (defined outside this block) is used as
                    # a cache keyed by the span's stable id.
                    if span.stable_id not in unary_tdl_feats:
                        # Store the set only once it is complete, so a
                        # failure part-way through never leaves a truncated
                        # entry in the cache.
                        unary_tdl_feats[span.stable_id] = set(
                            get_tdl_feats(xmltree.root, sidxs)
                        )
                    for f in unary_tdl_feats[span.stable_id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            else:
                # Non-lingual sentences only get the basic word features.
                for f in get_word_feats(span):
                    yield candidate.id, f"BASIC_{f}", DEF_VALUE

        # Binary candidates
        elif len(args) == 2:
            span1, span2 = args
            if span1.sentence.is_lingual() and span2.sentence.is_lingual():
                get_tdl_feats = compile_relation_feature_generator()
                sent1 = get_as_dict(span1.sentence)
                sent2 = get_as_dict(span2.sentence)
                # Reuse sent1 rather than converting span1.sentence a second
                # time; the tree is built from the first mention's sentence
                # only.
                xmltree = corenlp_to_xmltree(sent1)
                s1_idxs = list(
                    range(span1.get_word_start_index(), span1.get_word_end_index() + 1)
                )
                s2_idxs = list(
                    range(span2.get_word_start_index(), span2.get_word_end_index() + 1)
                )
                if s1_idxs and s2_idxs:
                    # Add DDLIB entity features for relation
                    for f in get_ddlib_feats(span1, sent1, s1_idxs):
                        yield candidate.id, f"DDL_e1_{f}", DEF_VALUE
                    for f in get_ddlib_feats(span2, sent2, s2_idxs):
                        yield candidate.id, f"DDL_e2_{f}", DEF_VALUE
                    # Add TreeDLib relation features
                    # binary_tdl_feats (defined outside this block) is used
                    # as a cache keyed by the candidate id.
                    if candidate.id not in binary_tdl_feats:
                        binary_tdl_feats[candidate.id] = set(
                            get_tdl_feats(xmltree.root, s1_idxs, s2_idxs)
                        )
                    for f in binary_tdl_feats[candidate.id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            else:
                # Non-lingual sentences only get the basic word features.
                for f in get_word_feats(span1):
                    yield candidate.id, f"BASIC_e1_{f}", DEF_VALUE
                for f in get_word_feats(span2):
                    yield candidate.id, f"BASIC_e2_{f}", DEF_VALUE
        else:
            raise NotImplementedError(
                "Only handles unary and binary candidates currently"
            )
def extract_textual_features(
    candidates: Union[Candidate, List[Candidate]],
) -> Iterator[Tuple[int, str, int]]:
    """Extract textual features.

    Lazily yields one ``(candidate.id, feature_name, DEF_VALUE)`` tuple per
    feature. Feature names are prefixed by the helper that produced them:

    * ``DDL_`` -- from ``_get_ddlib_feats``
    * ``TDL_`` -- from the compiled TreeDLib feature generator
    * ``BASIC_`` -- from ``_get_word_feats``

    For binary candidates the ``DDL_`` and ``BASIC_`` prefixes additionally
    carry the mention position (``e1`` / ``e2``).

    ``DDL_`` and ``TDL_`` features are only produced when every mention's
    sentence reports ``is_lingual()`` and every mention covers at least one
    word index; ``BASIC_`` features are always produced.

    :param candidates: A candidate or a list of candidates to extract
        features from
    :raises ValueError: If the context of a candidate's first mention is
        neither a ``SpanMention`` nor an ``ImplicitSpanMention``.
    :raises NotImplementedError: If a candidate has a number of mentions
        other than one or two.
    :return: A generator of ``(candidate id, feature name, feature value)``
        tuples.
    """
    candidates = candidates if isinstance(candidates, list) else [candidates]
    for candidate in candidates:
        args = tuple(m.context for m in candidate.get_mentions())
        # Only the first mention's context is type-checked.
        if not isinstance(args[0], (SpanMention, ImplicitSpanMention)):
            raise ValueError(
                f"Accepts Span/ImplicitSpan-type mentions, {type(args[0])}-type found."
            )

        # Unary candidates
        if len(args) == 1:
            span: Union[SpanMention, ImplicitSpanMention] = args[0]
            if span.sentence.is_lingual():
                sidxs = list(
                    range(span.get_word_start_index(), span.get_word_end_index() + 1)
                )
                if sidxs:
                    # Add DDLIB entity features
                    for f in _get_ddlib_feats(span, get_as_dict(span.sentence), sidxs):
                        yield candidate.id, f"DDL_{f}", DEF_VALUE
                    # Add TreeDLib entity features
                    # unary_tdl_feats (defined outside this block) is used as
                    # a cache keyed by the span's stable id.
                    if span.stable_id not in unary_tdl_feats:
                        # The generator and the tree are only needed on a
                        # cache miss, so build them here rather than for
                        # every candidate.
                        get_tdl_feats = _compile_entity_feature_generator()
                        xmltree = corenlp_to_xmltree(span.sentence)
                        # Store the set only once it is complete, so a
                        # failure part-way through never leaves a truncated
                        # entry in the cache.
                        unary_tdl_feats[span.stable_id] = set(
                            get_tdl_feats(xmltree.root, sidxs)
                        )
                    for f in unary_tdl_feats[span.stable_id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            for f in _get_word_feats(span):
                yield candidate.id, f"BASIC_{f}", DEF_VALUE

        # Binary candidates
        elif len(args) == 2:
            span1, span2 = args
            if span1.sentence.is_lingual() and span2.sentence.is_lingual():
                s1_idxs = list(
                    range(span1.get_word_start_index(), span1.get_word_end_index() + 1)
                )
                s2_idxs = list(
                    range(span2.get_word_start_index(), span2.get_word_end_index() + 1)
                )
                if s1_idxs and s2_idxs:
                    # Add DDLIB entity features for relation
                    sent1 = get_as_dict(span1.sentence)
                    sent2 = get_as_dict(span2.sentence)
                    for f in _get_ddlib_feats(span1, sent1, s1_idxs):
                        yield candidate.id, f"DDL_e1_{f}", DEF_VALUE
                    for f in _get_ddlib_feats(span2, sent2, s2_idxs):
                        yield candidate.id, f"DDL_e2_{f}", DEF_VALUE
                    # Add TreeDLib relation features
                    # binary_tdl_feats (defined outside this block) is used
                    # as a cache keyed by the candidate id.
                    if candidate.id not in binary_tdl_feats:
                        get_tdl_feats = compile_relation_feature_generator()
                        # The tree is built from the first mention's sentence
                        # only.
                        xmltree = corenlp_to_xmltree(span1.sentence)
                        # Assign the finished set in one step (see the unary
                        # branch for the rationale).
                        binary_tdl_feats[candidate.id] = set(
                            get_tdl_feats(xmltree.root, s1_idxs, s2_idxs)
                        )
                    for f in binary_tdl_feats[candidate.id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            for f in _get_word_feats(span1):
                yield candidate.id, f"BASIC_e1_{f}", DEF_VALUE
            for f in _get_word_feats(span2):
                yield candidate.id, f"BASIC_e2_{f}", DEF_VALUE
        else:
            raise NotImplementedError(
                "Only handles unary and binary candidates currently"
            )