示例#1
0
def extract_textual_features(
    candidates: Union[Candidate, List[Candidate]],
) -> Iterator[Tuple[int, str, int]]:
    """Extract textual features.

    Yields one ``(candidate.id, feature_name, DEF_VALUE)`` tuple per feature.
    The feature name is prefixed by its source: ``DDL_`` for
    ``_get_ddlib_feats``, ``TDL_`` for the TreeDLib feature generator and
    ``BASIC_`` for ``_get_word_feats``. ``DDL_`` and ``TDL_`` features are only
    produced when the mention sentence(s) report ``is_lingual()`` and the
    word-index range of every mention is non-empty; ``BASIC_`` features are
    always produced. For candidates with several mentions, ``DDL_`` and
    ``BASIC_`` names also carry ``e<i>_``, ``i`` being the zero-based position
    of the mention within the candidate.

    TreeDLib features are memoized in ``unary_tdl_feats`` (keyed by
    ``span.stable_id``) and ``multinary_tdl_feats`` (keyed by
    ``candidate.id``).

    :param candidates: A candidate or a list of candidates to extract
        features from
    :return: A generator of ``(candidate_id, feature_name, value)`` tuples
    :raises ValueError: If the context of the first mention is neither a
        ``SpanMention`` nor an ``ImplicitSpanMention``
    """
    candidates = candidates if isinstance(candidates, list) else [candidates]
    for candidate in candidates:
        args = tuple(m.context for m in candidate.get_mentions())
        # Only the first mention is type-checked.
        if not isinstance(args[0], (SpanMention, ImplicitSpanMention)):
            raise ValueError(
                f"Accepts Span/ImplicitSpan-type mentions, {type(args[0])}-type found."
            )

        # Unary candidates
        if len(args) == 1:
            span: Union[SpanMention, ImplicitSpanMention] = args[0]
            if span.sentence.is_lingual():
                sidxs = list(
                    range(span.get_word_start_index(), span.get_word_end_index() + 1)
                )
                if sidxs:
                    # Add DDLIB entity features
                    for f in _get_ddlib_feats(span, get_as_dict(span.sentence), sidxs):
                        yield candidate.id, f"DDL_{f}", DEF_VALUE
                    # Add TreeDLib entity features
                    if span.stable_id not in unary_tdl_feats:
                        # Compile the generator and build the tree only on a
                        # cache miss, and store the finished set in one step so
                        # that a failure cannot leave a partial cache entry.
                        get_tdl_feats = _compile_entity_feature_generator()
                        xmltree = corenlp_to_xmltree(span.sentence)
                        unary_tdl_feats[span.stable_id] = set(
                            get_tdl_feats(xmltree.root, sidxs)
                        )
                    for f in unary_tdl_feats[span.stable_id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            for f in _get_word_feats(span):
                yield candidate.id, f"BASIC_{f}", DEF_VALUE

        # Multinary candidates
        else:
            spans = args
            if all(span.sentence.is_lingual() for span in spans):
                sents = [get_as_dict(span.sentence) for span in spans]
                s_idxs = [
                    list(
                        range(
                            span.get_word_start_index(),
                            span.get_word_end_index() + 1,
                        )
                    )
                    for span in spans
                ]
                # An empty index list is falsy, so this requires every mention
                # to have a non-empty word-index range.
                if all(s_idxs):
                    # Add DDLIB entity features for relation
                    for i, (span_i, sent_i, idxs_i) in enumerate(
                        zip(spans, sents, s_idxs)
                    ):
                        for f in _get_ddlib_feats(span_i, sent_i, idxs_i):
                            yield candidate.id, f"DDL_e{i}_{f}", DEF_VALUE

                    # Add TreeDLib relation features
                    if candidate.id not in multinary_tdl_feats:
                        get_tdl_feats = compile_relation_feature_generator(
                            is_multary=True
                        )
                        # NOTE(review): the tree is built from the first
                        # mention's sentence only; presumably all mentions are
                        # expected to share a sentence -- confirm.
                        xmltree = corenlp_to_xmltree(spans[0].sentence)
                        multinary_tdl_feats[candidate.id] = set(
                            get_tdl_feats(xmltree.root, s_idxs)
                        )
                    for f in multinary_tdl_feats[candidate.id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            for i, span in enumerate(spans):
                for f in _get_word_feats(span):
                    yield candidate.id, f"BASIC_e{i}_{f}", DEF_VALUE
示例#2
0
def get_content_feats(candidates):
    """Extract content (textual) features from unary or binary candidates.

    Yields one ``(candidate.id, feature_name, DEF_VALUE)`` tuple per feature.
    When the mention sentence(s) report ``is_lingual()`` and the word-index
    range of every mention is non-empty, ``DDL_`` features (from
    ``get_ddlib_feats``) and ``TDL_`` features (from the TreeDLib feature
    generator) are produced. When the sentence(s) are not lingual, ``BASIC_``
    features (from ``get_word_feats``) are produced instead. For binary
    candidates, ``DDL_`` and ``BASIC_`` names carry ``e1_`` / ``e2_`` for the
    first / second argument.

    TreeDLib features are memoized in ``unary_tdl_feats`` (keyed by
    ``span.stable_id``) and ``binary_tdl_feats`` (keyed by ``candidate.id``).

    :param candidates: A candidate or a list of candidates to extract
        features from
    :return: A generator of ``(candidate_id, feature_name, value)`` tuples
    :raises ValueError: If the context of the first mention is not a
        ``TemporarySpanMention``
    :raises NotImplementedError: If a candidate has more than two mentions
    """
    candidates = candidates if isinstance(candidates, list) else [candidates]
    for candidate in candidates:
        args = tuple(m.context for m in candidate.get_mentions())
        # Only the first argument is type-checked.
        if not isinstance(args[0], TemporarySpanMention):
            # Report the type of the offending argument, not of the candidate.
            raise ValueError(
                f"Accepts Span-type arguments, {type(args[0])}-type found.")

        # Unary candidates
        if len(args) == 1:
            span = args[0]
            if span.sentence.is_lingual():
                sent = get_as_dict(span.sentence)
                sidxs = list(
                    range(span.get_word_start_index(),
                          span.get_word_end_index() + 1))
                if sidxs:
                    # Add DDLIB entity features
                    for f in get_ddlib_feats(span, sent, sidxs):
                        yield candidate.id, f"DDL_{f}", DEF_VALUE
                    # Add TreeDLib entity features
                    if span.stable_id not in unary_tdl_feats:
                        # Compile the generator and build the tree only on a
                        # cache miss, and store the finished set in one step so
                        # that a failure cannot leave a partial cache entry.
                        get_tdl_feats = compile_entity_feature_generator()
                        xmltree = corenlp_to_xmltree(sent)
                        unary_tdl_feats[span.stable_id] = set(
                            get_tdl_feats(xmltree.root, sidxs))
                    for f in unary_tdl_feats[span.stable_id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            else:
                for f in get_word_feats(span):
                    yield candidate.id, f"BASIC_{f}", DEF_VALUE

        # Binary candidates
        elif len(args) == 2:
            span1, span2 = args
            if span1.sentence.is_lingual() and span2.sentence.is_lingual():
                sent1 = get_as_dict(span1.sentence)
                sent2 = get_as_dict(span2.sentence)
                s1_idxs = list(
                    range(span1.get_word_start_index(),
                          span1.get_word_end_index() + 1))
                s2_idxs = list(
                    range(span2.get_word_start_index(),
                          span2.get_word_end_index() + 1))
                if s1_idxs and s2_idxs:

                    # Add DDLIB entity features for relation
                    for f in get_ddlib_feats(span1, sent1, s1_idxs):
                        yield candidate.id, f"DDL_e1_{f}", DEF_VALUE

                    for f in get_ddlib_feats(span2, sent2, s2_idxs):
                        yield candidate.id, f"DDL_e2_{f}", DEF_VALUE

                    # Add TreeDLib relation features
                    if candidate.id not in binary_tdl_feats:
                        get_tdl_feats = compile_relation_feature_generator()
                        # The tree is built from the first argument's sentence
                        # only; reuse its already-computed dict form.
                        # NOTE(review): presumably both arguments are expected
                        # to share a sentence -- confirm.
                        xmltree = corenlp_to_xmltree(sent1)
                        binary_tdl_feats[candidate.id] = set(
                            get_tdl_feats(xmltree.root, s1_idxs, s2_idxs))
                    for f in binary_tdl_feats[candidate.id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            else:
                for f in get_word_feats(span1):
                    yield candidate.id, f"BASIC_e1_{f}", DEF_VALUE

                for f in get_word_feats(span2):
                    yield candidate.id, f"BASIC_e2_{f}", DEF_VALUE

        else:
            raise NotImplementedError(
                "Only handles unary and binary candidates currently")
示例#3
0
def extract_textual_features(
    candidates: Union[Candidate, List[Candidate]],
) -> Iterator[Tuple[int, str, int]]:
    """Extract textual features.

    Yields one ``(candidate.id, feature_name, DEF_VALUE)`` tuple per feature.
    The feature name is prefixed by its source: ``DDL_`` for
    ``_get_ddlib_feats``, ``TDL_`` for the TreeDLib feature generator and
    ``BASIC_`` for ``_get_word_feats``. ``DDL_`` and ``TDL_`` features are only
    produced when the mention sentence(s) report ``is_lingual()`` and the
    word-index range of every mention is non-empty; ``BASIC_`` features are
    always produced. For binary candidates, ``DDL_`` and ``BASIC_`` names carry
    ``e1_`` / ``e2_`` for the first / second mention.

    TreeDLib features are memoized in ``unary_tdl_feats`` (keyed by
    ``span.stable_id``) and ``binary_tdl_feats`` (keyed by ``candidate.id``).

    :param candidates: A candidate or a list of candidates to extract
        features from
    :return: A generator of ``(candidate_id, feature_name, value)`` tuples
    :raises ValueError: If the context of the first mention is neither a
        ``SpanMention`` nor an ``ImplicitSpanMention``
    :raises NotImplementedError: If a candidate has more than two mentions
    """
    candidates = candidates if isinstance(candidates, list) else [candidates]
    for candidate in candidates:
        args = tuple(m.context for m in candidate.get_mentions())
        # Only the first mention is type-checked.
        if not isinstance(args[0], (SpanMention, ImplicitSpanMention)):
            raise ValueError(
                f"Accepts Span/ImplicitSpan-type mentions, {type(args[0])}-type found."
            )

        # Unary candidates
        if len(args) == 1:
            span: Union[SpanMention, ImplicitSpanMention] = args[0]
            if span.sentence.is_lingual():
                sidxs = list(
                    range(span.get_word_start_index(), span.get_word_end_index() + 1)
                )
                if sidxs:
                    # Add DDLIB entity features
                    for f in _get_ddlib_feats(span, get_as_dict(span.sentence), sidxs):
                        yield candidate.id, f"DDL_{f}", DEF_VALUE
                    # Add TreeDLib entity features
                    if span.stable_id not in unary_tdl_feats:
                        # Compile the generator and build the tree only on a
                        # cache miss, and store the finished set in one step so
                        # that a failure cannot leave a partial cache entry.
                        get_tdl_feats = _compile_entity_feature_generator()
                        xmltree = corenlp_to_xmltree(span.sentence)
                        unary_tdl_feats[span.stable_id] = set(
                            get_tdl_feats(xmltree.root, sidxs)
                        )
                    for f in unary_tdl_feats[span.stable_id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            for f in _get_word_feats(span):
                yield candidate.id, f"BASIC_{f}", DEF_VALUE

        # Binary candidates
        elif len(args) == 2:
            span1, span2 = args
            if span1.sentence.is_lingual() and span2.sentence.is_lingual():
                sent1 = get_as_dict(span1.sentence)
                sent2 = get_as_dict(span2.sentence)
                s1_idxs = list(
                    range(span1.get_word_start_index(), span1.get_word_end_index() + 1)
                )
                s2_idxs = list(
                    range(span2.get_word_start_index(), span2.get_word_end_index() + 1)
                )
                if s1_idxs and s2_idxs:

                    # Add DDLIB entity features for relation
                    for f in _get_ddlib_feats(span1, sent1, s1_idxs):
                        yield candidate.id, f"DDL_e1_{f}", DEF_VALUE

                    for f in _get_ddlib_feats(span2, sent2, s2_idxs):
                        yield candidate.id, f"DDL_e2_{f}", DEF_VALUE

                    # Add TreeDLib relation features
                    if candidate.id not in binary_tdl_feats:
                        get_tdl_feats = compile_relation_feature_generator()
                        # NOTE(review): the tree is built from the first
                        # mention's sentence only; presumably both mentions are
                        # expected to share a sentence -- confirm.
                        xmltree = corenlp_to_xmltree(span1.sentence)
                        binary_tdl_feats[candidate.id] = set(
                            get_tdl_feats(xmltree.root, s1_idxs, s2_idxs)
                        )
                    for f in binary_tdl_feats[candidate.id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            for f in _get_word_feats(span1):
                yield candidate.id, f"BASIC_e1_{f}", DEF_VALUE

            for f in _get_word_feats(span2):
                yield candidate.id, f"BASIC_e2_{f}", DEF_VALUE

        else:
            raise NotImplementedError(
                "Only handles unary and binary candidates currently"
            )