Example #1
def extract_single_syntax(doc, edu_info, para_info):
    """syntactic features for the EDU"""
    try:
        tree_idx = edu_info['tkd_tree_idx']
    except KeyError:
        return

    if tree_idx is None:
        return

    ptree = doc.tkd_trees[tree_idx]
    # pheads = doc.lex_heads[tree_idx]

    edu = edu_info['edu']
    tokens = edu_info['tokens']  # WIP

    # WIP 2016-06-02: type of sentence, hopefully informative for non-S
    yield ('SYN_sent_type', ptree.label())

    # spanning nodes for the EDU
    syn_nodes = syntactic_node_seq(ptree, tokens)
    if syn_nodes:
        yield ('SYN_nodes',
               tuple(x.label() for x in syn_nodes))
    # variant, stripped of leading and trailing punctuation
    tokens_strip_punc = strip_punctuation(tokens)
    syn_nodes_nopunc = syntactic_node_seq(ptree, tokens_strip_punc)
    if syn_nodes_nopunc:
        yield ('SYN_nodes_nopunc',
               tuple(x.label() for x in syn_nodes_nopunc))

    # currently deactivated
    if False:
        # find EDU head
        edu_head = edu_info['edu_head']
        if edu_head is not None:
            treepos_hn, treepos_hw = edu_head
            hlabel = ptree[treepos_hn].label()
            hword = ptree[treepos_hw].word

            if False:
                # DEBUG
                print('edu: ', edu.text())
                print('hlabel: ', hlabel)
                print('hword: ', hword)
                print('======')

            yield ('SYN_hlabel', hlabel)
            yield ('SYN_hword', hword)
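
A minimal consumer sketch (hypothetical, not part of educe's API): extract_single_syntax is a generator of (feat_name, feat_val) pairs, so a feature vector per EDU can be built with dict(). It assumes edu_infos and para_infos as returned by preprocess below, plus base.py's module-level helpers (syntactic_node_seq, strip_punctuation).

def single_syntax_vectors(doc, edu_infos, para_infos):
    """Hypothetical driver: one feature dict per EDU."""
    feat_vecs = []
    for edu_info in edu_infos:
        # look up the paragraph record for this EDU, if any
        para_idx = edu_info.get('para_idx')
        para_info = (para_infos[para_idx]
                     if para_infos is not None and para_idx is not None
                     else None)
        # collect the generated (feat_name, feat_val) pairs
        feat_vecs.append(dict(extract_single_syntax(doc, edu_info, para_info)))
    return feat_vecs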
Example #2
File: base.py Project: eipiplusun/educe
    def preprocess(self, doc, strict=False):
        """Preprocess a document and output basic features for each EDU.

        Parameters
        ----------
        doc: DocumentPlus
            Document to be processed.
        strict: boolean, optional
            If True, raise an exception when an EDU has no token
            (defaults to False).

        Returns
        -------
        edu_infos: list of dict of features
            List of basic features for each EDU; each feature is a
            pair (basic_feat_name, basic_feat_val).
        para_infos: list of dict of features
            List of basic features for each paragraph; each feature is
            a pair (basic_feat_name, basic_feat_val).

        TODO
        ----
        * [ ] explicitly impute missing values, e.g. for idxes_in_*
        """
        token_filter = self.token_filter
        word2clust = self.word2clust

        edus = doc.edus
        raw_words = doc.raw_words  # TEMPORARY
        tokens = doc.tkd_tokens
        trees = doc.tkd_trees
        paragraphs = doc.paragraphs  # NEW
        # mappings from EDU to other annotations
        edu2raw_sent = doc.edu2raw_sent
        edu2para = doc.edu2para
        edu2sent = doc.edu2sent
        edu2tokens = doc.edu2tokens
        lex_heads = doc.lex_heads  # EXPERIMENTAL

        # pre-compute relative indices (in sent, para) in one iteration
        # NB: moved to document_plus itself
        idxes_in_sent = doc.edu2idx_in_sent
        rev_idxes_in_sent = doc.edu2rev_idx_in_sent

        idxes_in_para = doc.edu2idx_in_para
        rev_idxes_in_para = doc.edu2rev_idx_in_para

        # paragraphs
        if paragraphs is None:
            para_infos = None
        else:
            para_infos = []

            # special case for the left padding paragraph
            pfeats = dict()
            pfeats['tokens'] = [tokens[0]]  # left padding token
            pfeats['syn_nodes'] = None
            para_infos.append(pfeats)

            # regular paragraphs
            for para_idx, para in enumerate(paragraphs[1:], start=1):
                pfeats = dict()
                para_beg = para.sentences[0].span.char_start
                para_end = para.sentences[-1].span.char_end
                trees_beg = doc.trees_beg
                trees_end = doc.trees_end
                toks_beg = doc.toks_beg
                toks_end = doc.toks_end

                # * token characterization of the paragraph
                encltoks_idc = np.where(
                    np.logical_and(toks_beg >= para_beg,
                                   toks_end <= para_end)
                )[0]
                encltoks = [tokens[i] for i in encltoks_idc]
                pfeats['tokens'] = encltoks

                # * syntactic characterization of the paragraph
                # find the syntactic trees that span this paragraph
                enclosed_idc = np.intersect1d(
                    np.where(trees_beg >= para_beg),
                    np.where(trees_end <= para_end))
                overlapd_idc = np.intersect1d(
                    np.where(trees_beg < para_end),
                    np.where(trees_end > para_beg))
                if np.array_equal(enclosed_idc, overlapd_idc):
                    # sentence seg and paragraph seg are compatible
                    syn_nodes = [trees[tree_idx]
                                 for tree_idx in overlapd_idc]
                else:
                    # mismatch between the sentence segmentation from the
                    # PTB and paragraph segmentation from the RST-WSJ
                    strad_idc = np.setdiff1d(overlapd_idc, enclosed_idc)
                    syn_nodes = []
                    for tree_idx in overlapd_idc:
                        syn_tree = trees[tree_idx]
                        if tree_idx not in strad_idc:
                            syn_nodes.append(syn_tree)
                            continue
                        # find the list of tokens that overlap this
                        # paragraph, and belong to this straddling
                        # tree
                        tree_beg = trees_beg[tree_idx]
                        tree_end = trees_end[tree_idx]
                        # here, reduce(np.logical_and(...)) was 2x
                        # faster than np.logical_and.reduce(...)
                        overtoks_idc = np.where(
                            reduce(np.logical_and,
                                   (toks_beg < para_end,
                                    toks_end > para_beg,
                                    toks_beg >= tree_beg,
                                    toks_end <= tree_end)
                            )
                        )[0]
                        overtoks = [tokens[i] for i in overtoks_idc]
                        syn_node_seq = syntactic_node_seq(
                            syn_tree, overtoks)
                        syn_nodes.extend(syn_node_seq)
                # add basic feature
                pfeats['syn_nodes'] = syn_nodes
                # store
                para_infos.append(pfeats)
        # EDUs
        edu_infos = []
        # special case: left padding EDU
        edu = edus[0]
        res = dict()
        res['edu'] = edu
        # raw words (temporary)
        res['raw_words'] = []
        # tokens
        res['tokens'] = []  # TODO: __START__ / __START__ ?
        res['tags'] = []  # TODO: __START__ ?
        res['words'] = []  # TODO: __START__ ?
        res['tok_beg'] = 0  # EXPERIMENTAL
        res['tok_end'] = 0  # EXPERIMENTAL
        # EXPERIMENTAL: Brown clusters
        res['brown_clusters'] = []
        # end Brown clusters
        # sentence
        res['edu_idx_in_sent'] = idxes_in_sent[0]
        res['edu_rev_idx_in_sent'] = rev_idxes_in_sent[0]
        res['sent_idx'] = 0
        res['sent_rev_idx'] = len(trees) - 1  # NEW
        # para
        res['edu_rev_idx_in_para'] = rev_idxes_in_para[0]
        # aka paragraphID
        res['para_idx'] = 0
        res['para_rev_idx'] = (len(paragraphs) - 1 if paragraphs is not None
                               else None)  # NEW
        # raw sent
        res['raw_sent_idx'] = edu2raw_sent[0]
        edu_infos.append(res)

        # regular EDUs
        for edu_idx, edu in enumerate(edus[1:], start=1):
            res = dict()
            res['edu'] = edu

            # raw words (temporary)
            res['raw_words'] = raw_words[edu_idx]

            # tokens
            if tokens is not None:
                tok_idcs = edu2tokens[edu_idx]
                toks = [tokens[tok_idx] for tok_idx in tok_idcs]
                # special case: no tokens
                if strict and not toks:
                    emsg = 'No token for EDU'
                    print(list(enumerate(tokens)))
                    print(tok_idcs)
                    print(edu.text())
                    raise ValueError(emsg)
                # filter tokens if relevant
                if token_filter is not None:
                    toks = [tt for tt in toks if token_filter(tt)]
                # store information
                res['tokens'] = toks
                res['tags'] = [tok.tag for tok in toks]
                res['words'] = [tok.word for tok in toks]
                # EXPERIMENTAL: Brown clusters
                if word2clust is not None:
                    res['brown_clusters'] = [word2clust[w]
                                             for w in res['words']
                                             if w in word2clust]
                # end Brown clusters

            # doc structure

            # position of sentence containing EDU in doc
            # aka sentence_id
            sent_idx = edu2sent[edu_idx]
            res['sent_idx'] = sent_idx
            res['sent_rev_idx'] = (len(trees) - 1 - sent_idx
                                   if sent_idx is not None
                                   else None)  # NEW
            # position of EDU in sentence
            # aka num_edus_from_sent_start aka offset
            res['edu_idx_in_sent'] = idxes_in_sent[edu_idx]
            # aka num_edus_to_sent_end aka revOffset
            res['edu_rev_idx_in_sent'] = rev_idxes_in_sent[edu_idx]

            # position of paragraph containing EDU in doc
            # aka paragraphID
            para_idx = edu2para[edu_idx]
            res['para_idx'] = para_idx
            res['para_rev_idx'] = (len(paragraphs) - 1 - para_idx
                                   if (paragraphs is not None and
                                       para_idx is not None)
                                   else None)  # NEW
            # position of raw sentence
            res['raw_sent_idx'] = edu2raw_sent[edu_idx]

            # position of EDU in paragraph
            # aka num_edus_to_para_end aka revSentenceID (?!)
            # TODO: check for the 10th time if this is a bug in Li et al.'s
            # parser
            res['edu_rev_idx_in_para'] = rev_idxes_in_para[edu_idx]

            # syntax
            if len(trees) > 1:
                tree_idx = edu2sent[edu_idx]
                res['tkd_tree_idx'] = tree_idx
                if tree_idx is not None:
                    # head node of the EDU (for DS-LST features)
                    ptree = trees[tree_idx]
                    pheads = lex_heads[tree_idx]
                    # tree positions (in the syn tree) of the words of
                    # the EDU
                    tpos_leaves_edu = [x for x
                                       in ptree.treepositions('leaves')
                                       if ptree[x].overlaps(edu)]
                    tpos_words = set(tpos_leaves_edu)
                    res['tpos_words'] = tpos_words
                    edu_head = find_edu_head(ptree, pheads, tpos_words)
                    res['edu_head'] = edu_head

            edu_infos.append(res)

        return edu_infos, para_infos
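
A usage sketch, under the assumption that `extractor` is an instance of the class defining preprocess (with token_filter and word2clust already set) and `doc` is a populated DocumentPlus with tokens and trees; the inspected keys are among those filled in above.

edu_infos, para_infos = extractor.preprocess(doc, strict=False)
# index 0 is the left-padding EDU; regular EDUs start at 1
first_edu = edu_infos[1]
print(first_edu['words'])         # word forms of the EDU's tokens
print(first_edu['sent_idx'])      # index of the enclosing sentence
print(first_edu.get('edu_head'))  # (head node, head word) tree positions, if set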
Example #3
def extract_pair_syntax(doc, edu_info1, edu_info2, edu_info_bwn):
    """syntactic features for the pair of EDUs"""
    try:
        tree_idx1 = edu_info1['tkd_tree_idx']
        tree_idx2 = edu_info2['tkd_tree_idx']
    except KeyError:
        return

    if tree_idx1 is None or tree_idx2 is None:
        return

    edu1 = edu_info1['edu']
    edu2 = edu_info2['edu']

    # determine the linear order of {EDU_1, EDU_2}
    if edu1.num < edu2.num:
        edu_info_l = edu_info1
        edu_info_r = edu_info2
    else:
        edu_info_l = edu_info2
        edu_info_r = edu_info1

    if tree_idx1 == tree_idx2:
        # intra-sentential
        tree_idx = tree_idx1
        ptree = doc.tkd_trees[tree_idx]
        pheads = doc.lex_heads[tree_idx]

        # * DS-LST features
        # find the head node of EDU1
        tpos_words1 = edu_info1['tpos_words']
        edu1_head = edu_info1['edu_head']
        if edu1_head is not None:
            treepos_hn1, treepos_hw1 = edu1_head
            hlabel1 = ptree[treepos_hn1].label()
            hword1 = ptree[treepos_hw1].word
            # if the head node is not the root of the syn tree,
            # there is an attachment node
            if treepos_hn1 != ():
                treepos_an1 = treepos_hn1[:-1]
                treepos_aw1 = pheads[treepos_an1]
                alabel1 = ptree[treepos_an1].label()
                aword1 = ptree[treepos_aw1].word

        # find the head node of EDU2
        tpos_words2 = edu_info2['tpos_words']
        edu2_head = edu_info2['edu_head']
        if edu2_head is not None:
            treepos_hn2, treepos_hw2 = edu2_head
            hlabel2 = ptree[treepos_hn2].label()
            hword2 = ptree[treepos_hw2].word
            # if the head node is not the root of the syn tree,
            # there is an attachment node
            if treepos_hn2 != ():
                treepos_an2 = treepos_hn2[:-1]
                treepos_aw2 = pheads[treepos_an2]
                alabel2 = ptree[treepos_an2].label()
                aword2 = ptree[treepos_aw2].word

        # EXPERIMENTAL
        #
        # EDU 2 > EDU 1
        # NB: guard on edu1_head so the names bound above are defined
        if (edu1_head is not None and treepos_hn1 != ()
                and treepos_aw1 in tpos_words2):
            # dominance relationship: 2 > 1
            yield ('SYN_dom_2', True)
            # attachment label and word
            yield ('SYN_alabel', alabel1)
            yield ('SYN_aword', aword1)
            # head label and word
            yield ('SYN_hlabel', hlabel1)
            yield ('SYN_hword', hword1)

        # EDU 1 > EDU 2
        # NB: same guard on edu2_head
        if (edu2_head is not None and treepos_hn2 != ()
                and treepos_aw2 in tpos_words1):
            # dominance relationship: 1 > 2
            yield ('SYN_dom_1', True)
            # attachment label and word
            yield ('SYN_alabel', alabel2)
            yield ('SYN_aword', aword2)
            # head label and word
            yield ('SYN_hlabel', hlabel2)
            yield ('SYN_hword', hword2)

        # TODO assert that 1 > 2 and 2 > 1 cannot happen together

        # TODO fire a feature if the head nodes of EDU1 and EDU2
        # have the same attachment node ?

        # * syntactic nodes (WIP as of 2016-05-25)
        #   - interval between edu1 and edu2
        if edu_info_bwn:
            # bwn_edus = [x['edu'] for x in edu_info_bwn]
            bwn_tokens = list(
                itertools.chain.from_iterable(x['tokens']
                                              for x in edu_info_bwn))
            # 1. EDUs_bwn
            # spanning nodes for the interval
            syn_nodes = syntactic_node_seq(ptree, bwn_tokens)
            if syn_nodes:
                yield ('SYN_nodes_bwn', tuple(x.label() for x in syn_nodes))
            # variant: strip leading and trailing punctuation
            bwn_tokens_strip_punc = strip_punctuation(bwn_tokens)
            syn_nodes_strip = syntactic_node_seq(ptree, bwn_tokens_strip_punc)
            if syn_nodes_strip:
                yield ('SYN_nodes_bwn_nopunc',
                       tuple(x.label() for x in syn_nodes_strip))

            # 2. EDU_L + EDUs_bwn + EDU_R
            # lbwnr_edus = [edu_l] + bwn_edus + [edu_r]
            lbwnr_tokens = (edu_info_l['tokens'] + bwn_tokens +
                            edu_info_r['tokens'])
            # spanning nodes
            syn_nodes = syntactic_node_seq(ptree, lbwnr_tokens)
            if syn_nodes:
                yield ('SYN_nodes_lbwnr', tuple(x.label() for x in syn_nodes))
            # variant: strip leading and trailing punctuation
            lbwnr_tokens_strip_punc = strip_punctuation(lbwnr_tokens)
            syn_nodes_strip = syntactic_node_seq(ptree,
                                                 lbwnr_tokens_strip_punc)
            if syn_nodes_strip:
                yield ('SYN_nodes_lbwnr_nopunc',
                       tuple(x.label() for x in syn_nodes_strip))

            # 3. EDU_L + EDUs_bwn
            # lbwn_edus = [edu_l] + bwn_edus
            lbwn_tokens = (edu_info_l['tokens'] + bwn_tokens)
            # spanning nodes
            syn_nodes = syntactic_node_seq(ptree, lbwn_tokens)
            if syn_nodes:
                yield ('SYN_nodes_lbwn', tuple(x.label() for x in syn_nodes))
            # variant: strip leading and trailing punctuation
            lbwn_tokens_strip_punc = strip_punctuation(lbwn_tokens)
            syn_nodes_strip = syntactic_node_seq(ptree, lbwn_tokens_strip_punc)
            if syn_nodes_strip:
                yield ('SYN_nodes_lbwn_nopunc',
                       tuple(x.label() for x in syn_nodes_strip))

            # 4. EDUs_bwn + EDU_R
            # bwnr_edus = bwn_edus + [edu_r]
            bwnr_tokens = (bwn_tokens + edu_info_r['tokens'])
            # spanning nodes
            syn_nodes = syntactic_node_seq(ptree, bwnr_tokens)
            if syn_nodes:
                yield ('SYN_nodes_bwnr', tuple(x.label() for x in syn_nodes))
            # variant: strip leading and trailing punctuation
            bwnr_tokens_strip_punc = strip_punctuation(bwnr_tokens)
            syn_nodes_strip = syntactic_node_seq(ptree, bwnr_tokens_strip_punc)
            if syn_nodes_strip:
                yield ('SYN_nodes_bwnr_nopunc',
                       tuple(x.label() for x in syn_nodes_strip))

            # TODO EDU_L + EDUs_bwn[:i], EDUs_bwn[i:] + EDUs_R ?
            # where i should correspond to the split point of the (2nd
            # order variant of the) Eisner decoder

            # TODO specifically handle interval PRNs that start with a
            # comma trailing the preceding EDU?

    # TODO fire a feature with the pair of labels of the head nodes of EDU1
    # and EDU2 ?
    else:
        ptree1 = doc.tkd_trees[tree_idx1]
        # pheads1 = doc.lex_heads[tree_idx1]

        ptree2 = doc.tkd_trees[tree_idx2]
        # pheads2 = doc.lex_heads[tree_idx2]

        # pair of sentence types, hopefully informative esp. for non-S
        yield ('SYN_sent_type_pair', (ptree1.label(), ptree2.label()))
        # sentence types in between
        # NB: 'tkd_tree_idx' values are indices into doc.tkd_trees
        tree_idx_l = edu_info_l['tkd_tree_idx']
        tree_idx_r = edu_info_r['tkd_tree_idx']
        try:
            tree_idxes_lbwnr = ([tree_idx_l] +
                                [x['tkd_tree_idx']
                                 for x in edu_info_bwn] + [tree_idx_r])
        except KeyError:
            pass
        else:
            ptrees_lbwnr = [
                doc.tkd_trees[x]
                for x, _ in itertools.groupby(tree_idxes_lbwnr)
            ]
            stypes_lbwnr = [x.label() for x in ptrees_lbwnr]
            yield ('SYN_sent_type_lbwnr', tuple(stypes_lbwnr))
            yield ('SYN_sent_type_bwn', tuple(stypes_lbwnr[1:-1]))
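
A corresponding sketch for pair features (hypothetical helper, not in educe): extract_pair_syntax also yields (feat_name, feat_val) pairs, here for a candidate attachment between two EDUs, with edu_info_bwn covering the EDUs strictly between them.

def pair_syntax_vector(doc, edu_infos, i, j):
    """Hypothetical consumer for the candidate pair (EDU i, EDU j)."""
    lo, hi = min(i, j), max(i, j)
    # EDU records strictly between the two members of the pair
    edu_info_bwn = edu_infos[lo + 1:hi]
    return dict(extract_pair_syntax(doc, edu_infos[i], edu_infos[j],
                                    edu_info_bwn))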