from functools import reduce
import itertools

import numpy as np

# NB: `syntactic_node_seq`, `strip_punctuation` and `find_edu_head` are
# helpers assumed to be in scope, defined elsewhere in this module or
# imported from a sibling module.


def extract_single_syntax(doc, edu_info, para_info):
    """syntactic features for the EDU"""
    try:
        tree_idx = edu_info['tkd_tree_idx']
    except KeyError:
        return
    if tree_idx is None:
        return

    ptree = doc.tkd_trees[tree_idx]
    # pheads = doc.lex_heads[tree_idx]

    edu = edu_info['edu']
    tokens = edu_info['tokens']

    # WIP 2016-06-02: type of sentence, hopefully informative for non-S
    yield ('SYN_sent_type', ptree.label())

    # spanning nodes for the EDU
    syn_nodes = syntactic_node_seq(ptree, tokens)
    if syn_nodes:
        yield ('SYN_nodes', tuple(x.label() for x in syn_nodes))
    # variant, stripped of leading and trailing punctuation
    tokens_strip_punc = strip_punctuation(tokens)
    syn_nodes_nopunc = syntactic_node_seq(ptree, tokens_strip_punc)
    if syn_nodes_nopunc:
        yield ('SYN_nodes_nopunc',
               tuple(x.label() for x in syn_nodes_nopunc))

    # currently de-activated: head label and word of the EDU
    if False:
        # find EDU head
        edu_head = edu_info['edu_head']
        if edu_head is not None:
            treepos_hn, treepos_hw = edu_head
            hlabel = ptree[treepos_hn].label()
            hword = ptree[treepos_hw].word
            if False:  # DEBUG
                print('edu: ', edu.text())
                print('hlabel: ', hlabel)
                print('hword: ', hword)
                print('======')
            yield ('SYN_hlabel', hlabel)
            yield ('SYN_hword', hword)
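
# The sketch below illustrates what `syntactic_node_seq` is assumed to
# compute here: the left-to-right sequence of maximal constituents whose
# leaves all fall inside a given span. It is an illustrative
# re-implementation over a plain `nltk.Tree` with integer leaf offsets,
# NOT the actual helper used above (which matches Token objects); it
# assumes each leaf sits under a preterminal (POS) node, as in PTB trees.


def _leaf_spans(tree):
    """Map each tree position of `tree` to its (start, end) leaf offsets."""
    spans = {}

    def rec(pos, off):
        subtree = tree[pos]
        if isinstance(subtree, str):  # leaf
            spans[pos] = (off, off + 1)
            return off + 1
        beg = off
        for i in range(len(subtree)):
            off = rec(pos + (i,), off)
        spans[pos] = (beg, off)
        return off

    rec((), 0)
    return spans


def _spanning_node_seq(tree, beg, end):
    """Maximal subtrees covering, in order, the leaves in [beg, end)."""
    spans = _leaf_spans(tree)
    nodes = []
    i = beg
    while i < end:
        pos = tree.leaf_treeposition(i)[:-1]  # preterminal above leaf i
        # climb while the parent still starts at leaf i and fits the span
        while pos:
            pbeg, pend = spans[pos[:-1]]
            if pbeg == i and pend <= end:
                pos = pos[:-1]
            else:
                break
        nodes.append(tree[pos])
        i = spans[pos][1]
    return nodes


# e.g. with t = nltk.Tree.fromstring(
#     "(S (NP (DT the) (NN cat)) "
#     "(VP (VBD sat) (PP (IN on) (NP (DT the) (NN mat)))))")
# "sat on the mat" is leaves 2..6, exactly covered by one constituent:
# [n.label() for n in _spanning_node_seq(t, 2, 6)]  # -> ['VP']
# while leaves 1..4 ("cat sat on") yield ['NN', 'VBD', 'IN'].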
def preprocess(self, doc, strict=False):
    """Preprocess a document and output basic features for each EDU.

    Parameters
    ----------
    doc : DocumentPlus
        Document to be processed.
    strict : boolean, optional
        If True, raise an exception when an EDU has no token.

    Returns
    -------
    edu_infos : list of dict of features
        List of basic features for each EDU; each feature is a couple
        (basic_feat_name, basic_feat_val).
    para_infos : list of dict of features
        List of basic features for each paragraph; each feature is a
        couple (basic_feat_name, basic_feat_val).

    TODO
    ----
    * [ ] explicitly impute missing values, e.g. for idxes_in_*
    """
    token_filter = self.token_filter
    word2clust = self.word2clust

    edus = doc.edus
    raw_words = doc.raw_words  # TEMPORARY
    tokens = doc.tkd_tokens
    trees = doc.tkd_trees
    paragraphs = doc.paragraphs  # NEW
    # mappings from EDU to other annotations
    edu2raw_sent = doc.edu2raw_sent
    edu2para = doc.edu2para
    edu2sent = doc.edu2sent
    edu2tokens = doc.edu2tokens
    lex_heads = doc.lex_heads  # EXPERIMENTAL

    # pre-computed relative indices (in sent, para)
    # NB: moved to document_plus itself
    idxes_in_sent = doc.edu2idx_in_sent
    rev_idxes_in_sent = doc.edu2rev_idx_in_sent
    idxes_in_para = doc.edu2idx_in_para
    rev_idxes_in_para = doc.edu2rev_idx_in_para

    # paragraphs
    if paragraphs is None:
        para_infos = None
    else:
        para_infos = []
        # special case for the left padding paragraph
        pfeats = dict()
        pfeats['tokens'] = [tokens[0]]  # left padding token
        pfeats['syn_nodes'] = None
        para_infos.append(pfeats)
        # regular paragraphs
        # (these offset arrays are loop-invariant, so fetch them once)
        trees_beg = doc.trees_beg
        trees_end = doc.trees_end
        toks_beg = doc.toks_beg
        toks_end = doc.toks_end
        for para_idx, para in enumerate(paragraphs[1:], start=1):
            pfeats = dict()
            para_beg = para.sentences[0].span.char_start
            para_end = para.sentences[-1].span.char_end

            # * token characterization of the paragraph
            encltoks_idc = np.where(
                np.logical_and(toks_beg >= para_beg,
                               toks_end <= para_end))[0]
            encltoks = [tokens[i] for i in encltoks_idc]
            pfeats['tokens'] = encltoks

            # * syntactic characterization of the paragraph:
            # find the syntactic trees that span this paragraph
            enclosed_idc = np.intersect1d(
                np.where(trees_beg >= para_beg),
                np.where(trees_end <= para_end))
            overlapd_idc = np.intersect1d(
                np.where(trees_beg < para_end),
                np.where(trees_end > para_beg))
            if np.array_equal(enclosed_idc, overlapd_idc):
                # sentence and paragraph segmentations are compatible
                syn_nodes = [trees[tree_idx]
                             for tree_idx in overlapd_idc]
            else:
                # mismatch between the sentence segmentation from the
                # PTB and the paragraph segmentation from the RST-WSJ
                strad_idc = np.setdiff1d(overlapd_idc, enclosed_idc)
                syn_nodes = []
                for tree_idx in overlapd_idc:
                    syn_tree = trees[tree_idx]
                    if tree_idx not in strad_idc:
                        syn_nodes.append(syn_tree)
                        continue
                    # find the list of tokens that overlap this
                    # paragraph and belong to this straddling tree
                    tree_beg = trees_beg[tree_idx]
                    tree_end = trees_end[tree_idx]
                    # here, reduce(np.logical_and(...)) was 2x faster
                    # than np.logical_and.reduce(...)
                    overtoks_idc = np.where(
                        reduce(np.logical_and,
                               (toks_beg < para_end,
                                toks_end > para_beg,
                                toks_beg >= tree_beg,
                                toks_end <= tree_end)))[0]
                    overtoks = [tokens[i] for i in overtoks_idc]
                    syn_node_seq = syntactic_node_seq(syn_tree, overtoks)
                    syn_nodes.extend(syn_node_seq)
            # add basic feature and store
            pfeats['syn_nodes'] = syn_nodes
            para_infos.append(pfeats)

    # EDUs
    edu_infos = []
    # special case: left padding EDU
    edu = edus[0]
    res = dict()
    res['edu'] = edu
    # raw words (temporary)
    res['raw_words'] = []
    # tokens
    res['tokens'] = []  # TODO: __START__ / __START__ ?
    res['tags'] = []  # TODO: __START__ ?
    res['words'] = []  # TODO: __START__ ?
    res['tok_beg'] = 0  # EXPERIMENTAL
    res['tok_end'] = 0  # EXPERIMENTAL
    # EXPERIMENTAL: Brown clusters
    res['brown_clusters'] = []
    # end Brown clusters
    # sentence
    res['edu_idx_in_sent'] = idxes_in_sent[0]
    res['edu_rev_idx_in_sent'] = rev_idxes_in_sent[0]
    res['sent_idx'] = 0
    res['sent_rev_idx'] = len(trees) - 1  # NEW
    # para
    res['edu_rev_idx_in_para'] = rev_idxes_in_para[0]
    # aka paragraphID
    res['para_idx'] = 0
    res['para_rev_idx'] = (len(paragraphs) - 1
                           if paragraphs is not None else None)  # NEW
    # raw sent
    res['raw_sent_idx'] = edu2raw_sent[0]
    edu_infos.append(res)

    # regular EDUs
    for edu_idx, edu in enumerate(edus[1:], start=1):
        res = dict()
        res['edu'] = edu
        # raw words (temporary)
        res['raw_words'] = raw_words[edu_idx]

        # tokens
        if tokens is not None:
            tok_idcs = edu2tokens[edu_idx]
            toks = [tokens[tok_idx] for tok_idx in tok_idcs]
            # special case: no tokens
            if strict and not toks:
                emsg = 'No token for EDU'
                print(list(enumerate(tokens)))
                print(tok_idcs)
                print(edu.text())
                raise ValueError(emsg)
            # filter tokens if relevant
            if token_filter is not None:
                toks = [tt for tt in toks if token_filter(tt)]
            # store information
            res['tokens'] = toks
            res['tags'] = [tok.tag for tok in toks]
            res['words'] = [tok.word for tok in toks]
            # EXPERIMENTAL: Brown clusters
            if word2clust is not None:
                res['brown_clusters'] = [word2clust[w]
                                         for w in res['words']
                                         if w in word2clust]
            # end Brown clusters

        # doc structure
        # position of the sentence containing the EDU in the doc,
        # aka sentence_id
        sent_idx = edu2sent[edu_idx]
        res['sent_idx'] = sent_idx
        res['sent_rev_idx'] = (len(trees) - 1 - sent_idx
                               if sent_idx is not None else None)  # NEW
        # position of the EDU in the sentence,
        # aka num_edus_from_sent_start aka offset
        res['edu_idx_in_sent'] = idxes_in_sent[edu_idx]
        # aka num_edus_to_sent_end aka revOffset
        res['edu_rev_idx_in_sent'] = rev_idxes_in_sent[edu_idx]
        # position of the paragraph containing the EDU in the doc,
        # aka paragraphID
        para_idx = edu2para[edu_idx]
        res['para_idx'] = para_idx
        res['para_rev_idx'] = (len(paragraphs) - 1 - para_idx
                               if (paragraphs is not None
                                   and para_idx is not None)
                               else None)  # NEW
        # position of the raw sentence
        res['raw_sent_idx'] = edu2raw_sent[edu_idx]
        # position of the EDU in the paragraph,
        # aka num_edus_to_para_end aka revSentenceID (?!)
        # TODO: check for the 10th time if this is a bug in Li et al.'s
        # parser
        res['edu_rev_idx_in_para'] = rev_idxes_in_para[edu_idx]

        # syntax
        if len(trees) > 1:
            tree_idx = edu2sent[edu_idx]
            res['tkd_tree_idx'] = tree_idx
            if tree_idx is not None:
                # head node of the EDU (for DS-LST features)
                ptree = trees[tree_idx]
                pheads = lex_heads[tree_idx]
                # tree positions (in the syn tree) of the words of the EDU
                tpos_leaves_edu = [x for x in ptree.treepositions('leaves')
                                   if ptree[x].overlaps(edu)]
                tpos_words = set(tpos_leaves_edu)
                res['tpos_words'] = tpos_words
                edu_head = find_edu_head(ptree, pheads, tpos_words)
                res['edu_head'] = edu_head

        edu_infos.append(res)

    return edu_infos, para_infos
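
# Worked example (illustrative, made-up offsets) of the interval tests
# used in `preprocess` to align syntactic trees with a paragraph:
# `enclosed` keeps the trees wholly inside the paragraph, `overlapping`
# also catches a tree that straddles a paragraph boundary, and their
# difference is what triggers the token-level fallback above.
def _demo_paragraph_tree_alignment():
    """Print which trees are enclosed by vs. straddle a paragraph."""
    trees_beg = np.array([0, 40, 90])
    trees_end = np.array([39, 89, 150])
    para_beg, para_end = 0, 100  # the paragraph ends inside tree 2
    enclosed = np.intersect1d(np.where(trees_beg >= para_beg),
                              np.where(trees_end <= para_end))
    overlapping = np.intersect1d(np.where(trees_beg < para_end),
                                 np.where(trees_end > para_beg))
    print(enclosed)                             # [0 1]
    print(overlapping)                          # [0 1 2]
    print(np.setdiff1d(overlapping, enclosed))  # [2]: the straddler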
def extract_pair_syntax(doc, edu_info1, edu_info2, edu_info_bwn):
    """syntactic features for the pair of EDUs"""
    try:
        tree_idx1 = edu_info1['tkd_tree_idx']
        tree_idx2 = edu_info2['tkd_tree_idx']
    except KeyError:
        return
    if tree_idx1 is None or tree_idx2 is None:
        return

    edu1 = edu_info1['edu']
    edu2 = edu_info2['edu']

    # determine the linear order of {EDU_1, EDU_2}
    if edu1.num < edu2.num:
        edu_info_l = edu_info1
        edu_info_r = edu_info2
    else:
        edu_info_l = edu_info2
        edu_info_r = edu_info1

    if tree_idx1 == tree_idx2:
        # intra-sentential
        tree_idx = tree_idx1
        ptree = doc.tkd_trees[tree_idx]
        pheads = doc.lex_heads[tree_idx]

        # * DS-LST features
        # find the head node of EDU1
        tpos_words1 = edu_info1['tpos_words']
        edu1_head = edu_info1['edu_head']
        if edu1_head is not None:
            treepos_hn1, treepos_hw1 = edu1_head
            hlabel1 = ptree[treepos_hn1].label()
            hword1 = ptree[treepos_hw1].word
            # if the head node is not the root of the syn tree,
            # there is an attachment node
            if treepos_hn1 != ():
                treepos_an1 = treepos_hn1[:-1]
                treepos_aw1 = pheads[treepos_an1]
                alabel1 = ptree[treepos_an1].label()
                aword1 = ptree[treepos_aw1].word

        # find the head node of EDU2
        tpos_words2 = edu_info2['tpos_words']
        edu2_head = edu_info2['edu_head']
        if edu2_head is not None:
            treepos_hn2, treepos_hw2 = edu2_head
            hlabel2 = ptree[treepos_hn2].label()
            hword2 = ptree[treepos_hw2].word
            # if the head node is not the root of the syn tree,
            # there is an attachment node
            if treepos_hn2 != ():
                treepos_an2 = treepos_hn2[:-1]
                treepos_aw2 = pheads[treepos_an2]
                alabel2 = ptree[treepos_an2].label()
                aword2 = ptree[treepos_aw2].word

        # EXPERIMENTAL
        #
        # EDU 2 > EDU 1
        # (the guard on edu{1,2}_head ensures the names bound above are
        # defined, avoiding a NameError when an EDU has no head)
        if (edu1_head is not None and treepos_hn1 != ()
                and treepos_aw1 in tpos_words2):
            # dominance relationship: 2 > 1
            yield ('SYN_dom_2', True)
            # attachment label and word
            yield ('SYN_alabel', alabel1)
            yield ('SYN_aword', aword1)
            # head label and word
            yield ('SYN_hlabel', hlabel1)
            yield ('SYN_hword', hword1)

        # EDU 1 > EDU 2
        if (edu2_head is not None and treepos_hn2 != ()
                and treepos_aw2 in tpos_words1):
            # dominance relationship: 1 > 2
            yield ('SYN_dom_1', True)
            # attachment label and word
            yield ('SYN_alabel', alabel2)
            yield ('SYN_aword', aword2)
            # head label and word
            yield ('SYN_hlabel', hlabel2)
            yield ('SYN_hword', hword2)

        # TODO assert that 1 > 2 and 2 > 1 cannot happen together
        # TODO fire a feature if the head nodes of EDU1 and EDU2
        # have the same attachment node ?

        # * syntactic nodes (WIP as of 2016-05-25)
        # - interval between edu1 and edu2
        if edu_info_bwn:
            bwn_tokens = list(itertools.chain.from_iterable(
                x['tokens'] for x in edu_info_bwn))

            # 1. EDUs_bwn
            # spanning nodes for the interval
            syn_nodes = syntactic_node_seq(ptree, bwn_tokens)
            if syn_nodes:
                yield ('SYN_nodes_bwn',
                       tuple(x.label() for x in syn_nodes))
            # variant: strip leading and trailing punctuation
            bwn_tokens_strip_punc = strip_punctuation(bwn_tokens)
            syn_nodes_strip = syntactic_node_seq(
                ptree, bwn_tokens_strip_punc)
            if syn_nodes_strip:
                yield ('SYN_nodes_bwn_nopunc',
                       tuple(x.label() for x in syn_nodes_strip))

            # 2. EDU_L + EDUs_bwn + EDU_R
            lbwnr_tokens = (edu_info_l['tokens']
                            + bwn_tokens
                            + edu_info_r['tokens'])
            # spanning nodes
            syn_nodes = syntactic_node_seq(ptree, lbwnr_tokens)
            if syn_nodes:
                yield ('SYN_nodes_lbwnr',
                       tuple(x.label() for x in syn_nodes))
            # variant: strip leading and trailing punctuation
            lbwnr_tokens_strip_punc = strip_punctuation(lbwnr_tokens)
            syn_nodes_strip = syntactic_node_seq(
                ptree, lbwnr_tokens_strip_punc)
            if syn_nodes_strip:
                yield ('SYN_nodes_lbwnr_nopunc',
                       tuple(x.label() for x in syn_nodes_strip))

            # 3. EDU_L + EDUs_bwn
            lbwn_tokens = edu_info_l['tokens'] + bwn_tokens
            # spanning nodes
            syn_nodes = syntactic_node_seq(ptree, lbwn_tokens)
            if syn_nodes:
                yield ('SYN_nodes_lbwn',
                       tuple(x.label() for x in syn_nodes))
            # variant: strip leading and trailing punctuation
            lbwn_tokens_strip_punc = strip_punctuation(lbwn_tokens)
            syn_nodes_strip = syntactic_node_seq(
                ptree, lbwn_tokens_strip_punc)
            if syn_nodes_strip:
                yield ('SYN_nodes_lbwn_nopunc',
                       tuple(x.label() for x in syn_nodes_strip))

            # 4. EDUs_bwn + EDU_R
            bwnr_tokens = bwn_tokens + edu_info_r['tokens']
            # spanning nodes
            syn_nodes = syntactic_node_seq(ptree, bwnr_tokens)
            if syn_nodes:
                yield ('SYN_nodes_bwnr',
                       tuple(x.label() for x in syn_nodes))
            # variant: strip leading and trailing punctuation
            bwnr_tokens_strip_punc = strip_punctuation(bwnr_tokens)
            syn_nodes_strip = syntactic_node_seq(
                ptree, bwnr_tokens_strip_punc)
            if syn_nodes_strip:
                yield ('SYN_nodes_bwnr_nopunc',
                       tuple(x.label() for x in syn_nodes_strip))

            # TODO EDU_L + EDUs_bwn[:i], EDUs_bwn[i:] + EDU_R ?
            # where i should correspond to the split point of the (2nd
            # order variant of the) Eisner decoder
            # TODO specifically handle an interval PRN that starts with
            # a comma trailing the preceding EDU ?

        # TODO fire a feature with the pair of labels of the head nodes
        # of EDU1 and EDU2 ?
    else:
        # inter-sentential
        ptree1 = doc.tkd_trees[tree_idx1]
        ptree2 = doc.tkd_trees[tree_idx2]

        # pair of sentence types, hopefully informative esp. for non-S
        yield ('SYN_sent_type_pair', (ptree1.label(), ptree2.label()))

        # sentence types in between; NB: these are tree *indices*,
        # mapped to trees only after deduplication below
        tree_idx_l = edu_info_l['tkd_tree_idx']
        tree_idx_r = edu_info_r['tkd_tree_idx']
        try:
            tree_idcs_lbwnr = ([tree_idx_l]
                               + [x['tkd_tree_idx'] for x in edu_info_bwn]
                               + [tree_idx_r])
        except KeyError:
            pass
        else:
            # collapse runs of identical tree indices, then fetch trees
            ptrees_lbwnr = [doc.tkd_trees[x]
                            for x, _ in itertools.groupby(tree_idcs_lbwnr)]
            stypes_lbwnr = [x.label() for x in ptrees_lbwnr]
            yield ('SYN_sent_type_lbwnr', tuple(stypes_lbwnr))
            yield ('SYN_sent_type_bwn', tuple(stypes_lbwnr[1:-1]))
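
# Illustrative note on the `itertools.groupby` idiom used above for
# 'SYN_sent_type_lbwnr': consecutive EDUs frequently map to the same
# syntactic tree, so grouping collapses runs of identical tree indices
# before the sentence-type labels are collected (indices are made up):
#
#   tree_idcs = [3, 4, 4, 4, 5, 5, 6]  # EDU_L, EDUs in between, EDU_R
#   [idx for idx, _ in itertools.groupby(tree_idcs)]
#   # -> [3, 4, 5, 6]: one sentence type per distinct consecutive tree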