Пример #1
0
def corenlp_to_xmltree(obj: Union[Dict, Sentence],
                       prune_root: bool = True) -> XMLTree:
    """
    Transform an object carrying CoreNLP dependency-parse attributes into an
    XMLTree.

    The input is converted to a dictionary with ``get_as_dict``; its
    ``dep_parents`` entry guides the recursive construction performed by
    ``corenlp_to_xmltree_sub``. Per the original documentation, arrays with
    the same dimension as ``dep_*`` are included as node attributes and a
    special ``word_idx`` attribute records the original word order; that is
    done by the helper, which is not shown here.

    :param obj: A dictionary or Sentence exposing a ``dep_parents`` list and
        a ``words`` entry.
    :param prune_root: If True, drop childless direct children of the root
        and, if exactly one child then remains, use that child as the root.
    :return: An XMLTree built from the resulting root element and
        ``words``.
    :raises ValueError: If ``dep_parents`` is missing, is not a list, or
        holds values that cannot be converted to ``int``.
    """
    # Convert input object to dictionary
    s: Dict = get_as_dict(obj)

    # Use the dep_parents array as a guide: ensure it is present and a list of
    # ints
    if not ("dep_parents" in s and isinstance(s["dep_parents"], list)):
        raise ValueError(
            "Input CoreNLP object must have a 'dep_parents' attribute which is a list"
        )
    try:
        dep_parents = list(map(int, s["dep_parents"]))
    except Exception as err:
        # Keep the original failure attached for easier debugging.
        raise ValueError(
            "'dep_parents' attribute must be a list of ints") from err

    # Also ensure that we are using CoreNLP-native indexing
    # (root=0, 1-base word indexes)! Shift so the smallest parent index is 0.
    b = min(dep_parents)
    if b != 0:
        dep_parents = [j - b for j in dep_parents]

    # Parse recursively
    root = corenlp_to_xmltree_sub(s, dep_parents, 0)

    # Often the return tree will have several roots, where one is the actual
    # root and the rest are just singletons not included in the dep tree
    # parse...
    # We optionally remove these singletons and then collapse the root if only
    # one child left.
    if prune_root:
        # Iterate over a snapshot: removing children from the element that is
        # being iterated can skip the sibling that follows each removal.
        for c in list(root):
            if len(c) == 0:
                root.remove(c)
        if len(root) == 1:
            root = root.findall("./*")[0]
    return XMLTree(root, words=s["words"])
Пример #2
0
def extract_textual_features(
    candidates: Union[Candidate, List[Candidate]],
) -> Iterator[Tuple[int, str, int]]:
    """Yield textual features for unary and multinary candidates.

    Every yielded item is a ``(candidate.id, feature_name, DEF_VALUE)``
    triple. Feature names start with ``DDL_``, ``TDL_`` or ``BASIC_``; for
    multinary candidates the ``DDL_`` and ``BASIC_`` names also carry the
    position of the mention (``e0``, ``e1``, ...).

    :param candidates: A single candidate or a list of candidates to extract
        features from
    :raises ValueError: If the first mention context of a candidate is
        neither a SpanMention nor an ImplicitSpanMention
    """

    def word_indices(mention):
        # Inclusive run of word positions between the mention's start and end.
        first = mention.get_word_start_index()
        last = mention.get_word_end_index()
        return list(range(first, last + 1))

    if not isinstance(candidates, list):
        candidates = [candidates]

    for candidate in candidates:
        args = tuple(m.context for m in candidate.get_mentions())
        if not isinstance(args[0], (SpanMention, ImplicitSpanMention)):
            raise ValueError(
                f"Accepts Span/ImplicitSpan-type mentions, {type(args[0])}-type found."
            )

        # Unary candidates
        if len(args) == 1:
            mention: Union[SpanMention, ImplicitSpanMention] = args[0]
            if mention.sentence.is_lingual():
                entity_tdl = _compile_entity_feature_generator()
                tree = corenlp_to_xmltree(mention.sentence)
                positions = word_indices(mention)
                if positions:
                    # DDLIB entity features
                    sent_dict = get_as_dict(mention.sentence)
                    for f in _get_ddlib_feats(mention, sent_dict, positions):
                        yield candidate.id, f"DDL_{f}", DEF_VALUE
                    # TreeDLib entity features, computed once per stable id
                    key = mention.stable_id
                    if key not in unary_tdl_feats:
                        unary_tdl_feats[key] = set()
                        unary_tdl_feats[key].update(
                            entity_tdl(tree.root, positions))
                    for f in unary_tdl_feats[key]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            # Word-level features are emitted whether or not the sentence
            # is lingual.
            for f in _get_word_feats(mention):
                yield candidate.id, f"BASIC_{f}", DEF_VALUE
            continue

        # Multinary candidates
        spans = args
        lingual_flags = [sp.sentence.is_lingual() for sp in spans]
        if all(lingual_flags):
            relation_tdl = compile_relation_feature_generator(is_multary=True)
            sent_dicts = [get_as_dict(sp.sentence) for sp in spans]
            # The tree is built from the first mention's sentence only.
            tree = corenlp_to_xmltree(spans[0].sentence)
            positions_per_span = [word_indices(sp) for sp in spans]
            if all(positions_per_span):
                # DDLIB entity features for each mention of the relation
                triples = zip(spans, sent_dicts, positions_per_span)
                for i, (sp, sent, idxs) in enumerate(triples):
                    for f in _get_ddlib_feats(sp, sent, idxs):
                        yield candidate.id, f"DDL_e{i}_{f}", DEF_VALUE

                # TreeDLib relation features, computed once per candidate id
                if candidate.id not in multinary_tdl_feats:
                    collected = multinary_tdl_feats[candidate.id] = set()
                    collected.update(
                        relation_tdl(tree.root, positions_per_span))
                for f in multinary_tdl_feats[candidate.id]:
                    yield candidate.id, f"TDL_{f}", DEF_VALUE
        for i, sp in enumerate(spans):
            for f in _get_word_feats(sp):
                yield candidate.id, f"BASIC_e{i}_{f}", DEF_VALUE
Пример #3
0
def get_content_feats(candidates):
    """Yield content features for unary and binary candidates.

    Every yielded item is a ``(candidate.id, feature_name, DEF_VALUE)``
    triple. When the mention sentences are lingual, ``DDL_``- and
    ``TDL_``-prefixed features are produced (only if the word index ranges
    are non-empty); otherwise ``BASIC_``-prefixed word features are produced
    instead. TreeDLib features are cached in ``unary_tdl_feats`` (keyed by
    the span's ``stable_id``) and ``binary_tdl_feats`` (keyed by
    ``candidate.id``).

    :param candidates: A single candidate or a list of candidates.
    :raises ValueError: If the first mention context of a candidate is not a
        TemporarySpanMention.
    :raises NotImplementedError: If a candidate has more than two mentions.
    """
    candidates = candidates if isinstance(candidates, list) else [candidates]
    for candidate in candidates:
        args = tuple(m.context for m in candidate.get_mentions())
        if not isinstance(args[0], TemporarySpanMention):
            # Report the type of the object that was actually checked (the
            # mention context), not the type of the candidate wrapping it.
            raise ValueError(
                f"Accepts Span-type arguments, {type(args[0])}-type found.")

        # Unary candidates
        if len(args) == 1:
            span = args[0]
            if span.sentence.is_lingual():
                get_tdl_feats = compile_entity_feature_generator()
                sent = get_as_dict(span.sentence)
                xmltree = corenlp_to_xmltree(sent)
                sidxs = list(
                    range(span.get_word_start_index(),
                          span.get_word_end_index() + 1))
                if sidxs:
                    # Add DDLIB entity features
                    for f in get_ddlib_feats(span, sent, sidxs):
                        yield candidate.id, f"DDL_{f}", DEF_VALUE
                    # Add TreeDLib entity features (computed once per span)
                    if span.stable_id not in unary_tdl_feats:
                        unary_tdl_feats[span.stable_id] = set()
                        for f in get_tdl_feats(xmltree.root, sidxs):
                            unary_tdl_feats[span.stable_id].add(f)
                    for f in unary_tdl_feats[span.stable_id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            else:
                # Non-lingual sentence: fall back to word features only
                for f in get_word_feats(span):
                    yield candidate.id, f"BASIC_{f}", DEF_VALUE

        # Binary candidates
        elif len(args) == 2:
            span1, span2 = args
            if span1.sentence.is_lingual() and span2.sentence.is_lingual():
                get_tdl_feats = compile_relation_feature_generator()
                sent1 = get_as_dict(span1.sentence)
                sent2 = get_as_dict(span2.sentence)
                # The tree is built from the first span's sentence only.
                xmltree = corenlp_to_xmltree(get_as_dict(span1.sentence))
                s1_idxs = list(
                    range(span1.get_word_start_index(),
                          span1.get_word_end_index() + 1))
                s2_idxs = list(
                    range(span2.get_word_start_index(),
                          span2.get_word_end_index() + 1))
                if s1_idxs and s2_idxs:

                    # Add DDLIB entity features for relation
                    for f in get_ddlib_feats(span1, sent1, s1_idxs):
                        yield candidate.id, f"DDL_e1_{f}", DEF_VALUE

                    for f in get_ddlib_feats(span2, sent2, s2_idxs):
                        yield candidate.id, f"DDL_e2_{f}", DEF_VALUE

                    # Add TreeDLib relation features (computed once per
                    # candidate)
                    if candidate.id not in binary_tdl_feats:
                        binary_tdl_feats[candidate.id] = set()
                        for f in get_tdl_feats(xmltree.root, s1_idxs, s2_idxs):
                            binary_tdl_feats[candidate.id].add(f)
                    for f in binary_tdl_feats[candidate.id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            else:
                # At least one sentence is non-lingual: word features only
                for f in get_word_feats(span1):
                    yield candidate.id, f"BASIC_e1_{f}", DEF_VALUE

                for f in get_word_feats(span2):
                    yield candidate.id, f"BASIC_e2_{f}", DEF_VALUE

        else:
            raise NotImplementedError(
                "Only handles unary and binary candidates currently")
Пример #4
0
def extract_textual_features(
    candidates: Union[Candidate, List[Candidate]],
) -> Iterator[Tuple[int, str, int]]:
    """Generate textual features for unary and binary candidates.

    Every yielded item is a ``(candidate.id, feature_name, DEF_VALUE)``
    triple whose feature name starts with ``DDL_``, ``TDL_`` or ``BASIC_``.

    :param candidates: A single candidate or a list of candidates to extract
        features from
    :type candidates: list
    :raises ValueError: If the first mention context of a candidate is
        neither a SpanMention nor an ImplicitSpanMention
    :raises NotImplementedError: If a candidate has more than two mentions
    """
    if not isinstance(candidates, list):
        candidates = [candidates]

    for candidate in candidates:
        args = tuple(m.context for m in candidate.get_mentions())
        if not isinstance(args[0], (SpanMention, ImplicitSpanMention)):
            raise ValueError(
                f"Accepts Span/ImplicitSpan-type mentions, {type(args[0])}-type found."
            )

        # Reject unsupported arities before producing anything.
        arity = len(args)
        if arity not in (1, 2):
            raise NotImplementedError(
                "Only handles unary and binary candidates currently"
            )

        # Unary candidates
        if arity == 1:
            mention = args[0]
            if mention.sentence.is_lingual():
                entity_tdl = _compile_entity_feature_generator()
                tree = corenlp_to_xmltree(mention.sentence)
                start = mention.get_word_start_index()
                stop = mention.get_word_end_index() + 1
                positions = list(range(start, stop))
                if positions:
                    # DDLIB entity features
                    sent_dict = get_as_dict(mention.sentence)
                    for f in _get_ddlib_feats(mention, sent_dict, positions):
                        yield candidate.id, f"DDL_{f}", DEF_VALUE
                    # TreeDLib entity features, computed once per stable id
                    key = mention.stable_id
                    if key not in unary_tdl_feats:
                        cached = unary_tdl_feats[key] = set()
                        cached.update(entity_tdl(tree.root, positions))
                    for f in unary_tdl_feats[key]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            # Word-level features are emitted regardless of linguality.
            for f in _get_word_feats(mention):
                yield candidate.id, f"BASIC_{f}", DEF_VALUE
            continue

        # Binary candidates
        first, second = args
        if first.sentence.is_lingual() and second.sentence.is_lingual():
            relation_tdl = compile_relation_feature_generator()
            first_sent = get_as_dict(first.sentence)
            second_sent = get_as_dict(second.sentence)
            # The tree is built from the first mention's sentence only.
            tree = corenlp_to_xmltree(first.sentence)
            first_idxs = list(
                range(first.get_word_start_index(),
                      first.get_word_end_index() + 1)
            )
            second_idxs = list(
                range(second.get_word_start_index(),
                      second.get_word_end_index() + 1)
            )
            if first_idxs and second_idxs:
                # DDLIB entity features for both sides of the relation
                for f in _get_ddlib_feats(first, first_sent, first_idxs):
                    yield candidate.id, f"DDL_e1_{f}", DEF_VALUE
                for f in _get_ddlib_feats(second, second_sent, second_idxs):
                    yield candidate.id, f"DDL_e2_{f}", DEF_VALUE

                # TreeDLib relation features, computed once per candidate id
                if candidate.id not in binary_tdl_feats:
                    cached = binary_tdl_feats[candidate.id] = set()
                    cached.update(
                        relation_tdl(tree.root, first_idxs, second_idxs))
                for f in binary_tdl_feats[candidate.id]:
                    yield candidate.id, f"TDL_{f}", DEF_VALUE

        for f in _get_word_feats(first):
            yield candidate.id, f"BASIC_e1_{f}", DEF_VALUE
        for f in _get_word_feats(second):
            yield candidate.id, f"BASIC_e2_{f}", DEF_VALUE