Example #1
def extract_single_syntax(edu_info):
    """syntactic features for the EDU"""
    try:
        ptree = edu_info['ptree']
        pheads = edu_info['pheads']
    except KeyError:
        return

    edu = edu_info['edu']

    # tree positions (in the syn tree) of the words that are in the EDU
    tpos_leaves_edu = [
        tpos_leaf for tpos_leaf in ptree.treepositions('leaves')
        if ptree[tpos_leaf].overlaps(edu)
    ]
    wanted = set(tpos_leaves_edu)
    edu_head = find_edu_head(ptree, pheads, wanted)
    if edu_head is not None:
        treepos_hn, treepos_hw = edu_head
        hlabel = ptree[treepos_hn].label()
        hword = ptree[treepos_hw].word

        # DEBUG (disabled)
        # print('edu: ', edu.text())
        # print('hlabel: ', hlabel)
        # print('hword: ', hword)
        # print('======')

        yield ('SYN_hlabel', hlabel)
        yield ('SYN_hword', hword)
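
A minimal usage sketch (not from the educe source): since extract_single_syntax is a generator of (feature_name, feature_value) pairs, its output can be collected into a dict. The edu_info argument is assumed to be a per-EDU dict such as the ones built by preprocess in Example #2 below; when it lacks the 'ptree' or 'pheads' keys, the generator yields nothing and the dict stays empty.

def edu_syntax_features(edu_info):
    """Collect the single-EDU syntactic features into a dict (illustrative)."""
    return dict(extract_single_syntax(edu_info))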
Example #2
File: base.py Project: eipiplusun/educe
    def preprocess(self, doc, strict=False):
        """Preprocess a document and output basic features for each EDU.

        Parameters
        ----------
        doc: DocumentPlus
            Document to be processed.
        strict: boolean, optional
            If True, raise an exception when an EDU is mapped to no
            token (defaults to False).

        Returns
        -------
        edu_infos: list of dict of features
            List of basic features for each EDU; each feature is a
            pair (basic_feat_name, basic_feat_val).
        para_infos: list of dict of features
            List of basic features for each paragraph; each feature is
            a pair (basic_feat_name, basic_feat_val).

        TODO
        ----
        * [ ] explicitly impute missing values, e.g. for idxes_in_*
        """
        token_filter = self.token_filter
        word2clust = self.word2clust

        edus = doc.edus
        raw_words = doc.raw_words  # TEMPORARY
        tokens = doc.tkd_tokens
        trees = doc.tkd_trees
        paragraphs = doc.paragraphs  # NEW
        # mappings from EDU to other annotations
        edu2raw_sent = doc.edu2raw_sent
        edu2para = doc.edu2para
        edu2sent = doc.edu2sent
        edu2tokens = doc.edu2tokens
        lex_heads = doc.lex_heads  # EXPERIMENTAL

        # pre-compute relative indices (in sent, para) in one iteration
        # NB: moved to document_plus itself
        idxes_in_sent = doc.edu2idx_in_sent
        rev_idxes_in_sent = doc.edu2rev_idx_in_sent

        idxes_in_para = doc.edu2idx_in_para
        rev_idxes_in_para = doc.edu2rev_idx_in_para

        # paragraphs
        if paragraphs is None:
            para_infos = None
        else:
            para_infos = []

            # special case for the left padding paragraph
            pfeats = dict()
            pfeats['tokens'] = [tokens[0]]  # left padding token
            pfeats['syn_nodes'] = None
            para_infos.append(pfeats)

            # regular paragraphs
            for para_idx, para in enumerate(paragraphs[1:], start=1):
                pfeats = dict()
                para_beg = para.sentences[0].span.char_start
                para_end = para.sentences[-1].span.char_end
                trees_beg = doc.trees_beg
                trees_end = doc.trees_end
                toks_beg = doc.toks_beg
                toks_end = doc.toks_end

                # * token characterization of the paragraph
                encltoks_idc = np.where(
                    np.logical_and(toks_beg >= para_beg,
                                   toks_end <= para_end)
                )[0]
                encltoks = [tokens[i] for i in encltoks_idc]
                pfeats['tokens'] = encltoks

                # * syntactic characterization of the paragraph
                # find the syntactic trees that span this paragraph
                enclosed_idc = np.intersect1d(
                    np.where(trees_beg >= para_beg),
                    np.where(trees_end <= para_end))
                overlapd_idc = np.intersect1d(
                    np.where(trees_beg < para_end),
                    np.where(trees_end > para_beg))
                if np.array_equal(enclosed_idc, overlapd_idc):
                    # sentence seg and paragraph seg are compatible
                    syn_nodes = [trees[tree_idx]
                                 for tree_idx in overlapd_idc]
                else:
                    # mismatch between the sentence segmentation from the
                    # PTB and paragraph segmentation from the RST-WSJ
                    strad_idc = np.setdiff1d(overlapd_idc, enclosed_idc)
                    syn_nodes = []
                    for tree_idx in overlapd_idc:
                        syn_tree = trees[tree_idx]
                        if tree_idx not in strad_idc:
                            syn_nodes.append(syn_tree)
                            continue
                        # find the list of tokens that overlap this
                        # paragraph, and belong to this straddling
                        # tree
                        tree_beg = trees_beg[tree_idx]
                        tree_end = trees_end[tree_idx]
                        # here, reduce(np.logical_and(...)) was 2x
                        # faster than np.logical_and.reduce(...)
                        overtoks_idc = np.where(
                            reduce(np.logical_and,
                                   (toks_beg < para_end,
                                    toks_end > para_beg,
                                    toks_beg >= tree_beg,
                                    toks_end <= tree_end)
                            )
                        )[0]
                        overtoks = [tokens[i] for i in overtoks_idc]
                        syn_node_seq = syntactic_node_seq(
                            syn_tree, overtoks)
                        syn_nodes.extend(syn_node_seq)
                # add basic feature
                pfeats['syn_nodes'] = syn_nodes
                # store
                para_infos.append(pfeats)
        # EDUs
        edu_infos = []
        # special case: left padding EDU
        edu = edus[0]
        res = dict()
        res['edu'] = edu
        # raw words (temporary)
        res['raw_words'] = []
        # tokens
        res['tokens'] = []  # TODO: __START__ / __START__ ?
        res['tags'] = []  # TODO: __START__ ?
        res['words'] = []  # TODO: __START__ ?
        res['tok_beg'] = 0  # EXPERIMENTAL
        res['tok_end'] = 0  # EXPERIMENTAL
        # EXPERIMENTAL: Brown clusters
        res['brown_clusters'] = []
        # end Brown clusters
        # sentence
        res['edu_idx_in_sent'] = idxes_in_sent[0]
        res['edu_rev_idx_in_sent'] = rev_idxes_in_sent[0]
        res['sent_idx'] = 0
        res['sent_rev_idx'] = len(trees) - 1  # NEW
        # para
        res['edu_rev_idx_in_para'] = rev_idxes_in_para[0]
        # aka paragraphID
        res['para_idx'] = 0
        res['para_rev_idx'] = (len(paragraphs) - 1 if paragraphs is not None
                               else None)  # NEW
        # raw sent
        res['raw_sent_idx'] = edu2raw_sent[0]
        edu_infos.append(res)

        # regular EDUs
        for edu_idx, edu in enumerate(edus[1:], start=1):
            res = dict()
            res['edu'] = edu

            # raw words (temporary)
            res['raw_words'] = raw_words[edu_idx]

            # tokens
            if tokens is not None:
                tok_idcs = edu2tokens[edu_idx]
                toks = [tokens[tok_idx] for tok_idx in tok_idcs]
                # special case: no tokens
                if strict and not toks:
                    emsg = 'No token for EDU'
                    print(list(enumerate(tokens)))
                    print(tok_idcs)
                    print(edu.text())
                    raise ValueError(emsg)
                # filter tokens if relevant
                if token_filter is not None:
                    toks = [tt for tt in toks if token_filter(tt)]
                # store information
                res['tokens'] = toks
                res['tags'] = [tok.tag for tok in toks]
                res['words'] = [tok.word for tok in toks]
                # EXPERIMENTAL: Brown clusters
                if word2clust is not None:
                    res['brown_clusters'] = [word2clust[w]
                                             for w in res['words']
                                             if w in word2clust]
                # end Brown clusters

            # doc structure

            # position of sentence containing EDU in doc
            # aka sentence_id
            sent_idx = edu2sent[edu_idx]
            res['sent_idx'] = sent_idx
            res['sent_rev_idx'] = (len(trees) - 1 - sent_idx
                                   if sent_idx is not None
                                   else None)  # NEW
            # position of EDU in sentence
            # aka num_edus_from_sent_start aka offset
            res['edu_idx_in_sent'] = idxes_in_sent[edu_idx]
            # aka num_edus_to_sent_end aka revOffset
            res['edu_rev_idx_in_sent'] = rev_idxes_in_sent[edu_idx]

            # position of paragraph containing EDU in doc
            # aka paragraphID
            para_idx = edu2para[edu_idx]
            res['para_idx'] = para_idx
            res['para_rev_idx'] = (len(paragraphs) - 1 - para_idx
                                   if (paragraphs is not None and
                                       para_idx is not None)
                                   else None)  # NEW
            # position of raw sentence
            res['raw_sent_idx'] = edu2raw_sent[edu_idx]

            # position of EDU in paragraph
            # aka num_edus_to_para_end aka revSentenceID (?!)
            # TODO: check for the 10th time if this is a bug in Li et al.'s
            # parser
            res['edu_rev_idx_in_para'] = rev_idxes_in_para[edu_idx]

            # syntax
            if len(trees) > 1:
                tree_idx = edu2sent[edu_idx]
                res['tkd_tree_idx'] = tree_idx
                if tree_idx is not None:
                    # head node of the EDU (for DS-LST features)
                    ptree = trees[tree_idx]
                    pheads = lex_heads[tree_idx]
                    # tree positions (in the syn tree) of the words of
                    # the EDU
                    tpos_leaves_edu = [x for x
                                       in ptree.treepositions('leaves')
                                       if ptree[x].overlaps(edu)]
                    tpos_words = set(tpos_leaves_edu)
                    res['tpos_words'] = tpos_words
                    edu_head = find_edu_head(ptree, pheads, tpos_words)
                    res['edu_head'] = edu_head

            edu_infos.append(res)

        return edu_infos, para_infos
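
The paragraph characterization above relies on two NumPy span tests: a token (or tree) is enclosed by a paragraph if it both begins and ends inside it, and it overlaps the paragraph if the two character spans intersect at all. The following self-contained sketch uses made-up character offsets (not from any real document) to show both tests:

import numpy as np

# made-up character spans for four tokens and one paragraph
toks_beg = np.array([0, 10, 25, 40])
toks_end = np.array([8, 20, 35, 50])
para_beg, para_end = 10, 36

# tokens fully enclosed by the paragraph
enclosed = np.where(np.logical_and(toks_beg >= para_beg,
                                   toks_end <= para_end))[0]
# tokens whose span merely intersects the paragraph
overlapping = np.where(np.logical_and(toks_beg < para_end,
                                      toks_end > para_beg))[0]

print(enclosed)     # [1 2]
print(overlapping)  # [1 2]

When the two index sets differ for trees, a tree straddles a paragraph boundary, which is exactly the PTB/RST-WSJ segmentation mismatch that the else branch above handles.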
Example #3
def extract_pair_syntax(edu_info1, edu_info2):
    """syntactic features for the pair of EDUs"""
    try:
        ptree1 = edu_info1['ptree']
        pheads1 = edu_info1['pheads']

        ptree2 = edu_info2['ptree']
        pheads2 = edu_info2['pheads']
    except KeyError:
        return

    edu1 = edu_info1['edu']
    edu2 = edu_info2['edu']

    # generate DS-LST features for intra-sentential
    if ptree1 == ptree2:
        ptree = ptree1
        pheads = pheads1

        # find the head node of EDU1
        # tree positions (in the syn tree) of the words that are in EDU1
        tpos_leaves_edu1 = [
            tpos_leaf for tpos_leaf in ptree.treepositions('leaves')
            if ptree[tpos_leaf].overlaps(edu1)
        ]
        tpos_words1 = set(tpos_leaves_edu1)
        edu1_head = find_edu_head(ptree, pheads, tpos_words1)
        if edu1_head is not None:
            treepos_hn1, treepos_hw1 = edu1_head
            hlabel1 = ptree[treepos_hn1].label()
            hword1 = ptree[treepos_hw1].word
            # if the head node is not the root of the syn tree,
            # there is an attachment node
            if treepos_hn1 != ():
                treepos_an1 = treepos_hn1[:-1]
                treepos_aw1 = pheads[treepos_an1]
                alabel1 = ptree[treepos_an1].label()
                aword1 = ptree[treepos_aw1].word

        # find the head node of EDU2
        # tree positions (in the syn tree) of the words that are in EDU2
        tpos_leaves_edu2 = [
            tpos_leaf for tpos_leaf in ptree.treepositions('leaves')
            if ptree[tpos_leaf].overlaps(edu2)
        ]
        tpos_words2 = set(tpos_leaves_edu2)
        edu2_head = find_edu_head(ptree, pheads, tpos_words2)
        if edu2_head is not None:
            treepos_hn2, treepos_hw2 = edu2_head
            hlabel2 = ptree[treepos_hn2].label()
            hword2 = ptree[treepos_hw2].word
            # if the head node is not the root of the syn tree,
            # there is an attachment node
            if treepos_hn2 != ():
                treepos_an2 = treepos_hn2[:-1]
                treepos_aw2 = pheads[treepos_an2]
                alabel2 = ptree[treepos_an2].label()
                aword2 = ptree[treepos_aw2].word

        # EXPERIMENTAL
        #
        # EDU 2 > EDU 1
        # NB: if EDU1 has no head (edu1_head is None), treepos_hn1 and
        # treepos_aw1 are unbound, so guard on edu1_head first
        if (edu1_head is not None and treepos_hn1 != ()
                and treepos_aw1 in tpos_words2):
            # dominance relationship: 2 > 1
            yield ('SYN_dom_2', True)
            # attachment label and word
            yield ('SYN_alabel', alabel1)
            yield ('SYN_aword', aword1)
            # head label and word
            yield ('SYN_hlabel', hlabel1)
            yield ('SYN_hword', hword1)

        # EDU 1 > EDU 2
        # same guard as above, for EDU2's head
        if (edu2_head is not None and treepos_hn2 != ()
                and treepos_aw2 in tpos_words1):
            # dominance relationship: 1 > 2
            yield ('SYN_dom_1', True)
            # attachment label and word
            yield ('SYN_alabel', alabel2)
            yield ('SYN_aword', aword2)
            # head label and word
            yield ('SYN_hlabel', hlabel2)
            yield ('SYN_hword', hword2)
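
A usage sketch for the pair extractor (not from the educe source), mirroring the single-EDU case: the yielded pairs can be gathered into a dict, and the SYN_dom_* features encode the direction of syntactic dominance between the two EDUs.

def pair_syntax_features(edu_info1, edu_info2):
    """Collect the DS-LST pair features into a dict (illustrative)."""
    feats = dict(extract_pair_syntax(edu_info1, edu_info2))
    # 'SYN_dom_2' means the head of EDU1 attaches to a word inside EDU2
    # (EDU2 dominates EDU1); 'SYN_dom_1' is the symmetric case
    return feats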