def extract_single_syntax(edu_info):
    """Generate syntactic features for a single EDU.

    Parameters
    ----------
    edu_info : dict
        Basic features of the EDU ; must contain 'edu', and the
        syntactic entries 'ptree' (syntactic tree) and 'pheads'
        (lexical heads), otherwise no feature is emitted.

    Yields
    ------
    feat : (str, str)
        ('SYN_hlabel', label) and ('SYN_hword', word) for the head
        node of the EDU, when one is found.
    """
    try:
        ptree = edu_info['ptree']
        pheads = edu_info['pheads']
    except KeyError:
        # no syntactic annotation for this EDU: emit nothing
        return

    edu = edu_info['edu']

    # tree positions (in the syn tree) of the words that are in the EDU
    tpos_leaves_edu = [tpos_leaf
                       for tpos_leaf in ptree.treepositions('leaves')
                       if ptree[tpos_leaf].overlaps(edu)]
    wanted = set(tpos_leaves_edu)
    edu_head = find_edu_head(ptree, pheads, wanted)
    if edu_head is not None:
        treepos_hn, treepos_hw = edu_head
        hlabel = ptree[treepos_hn].label()
        hword = ptree[treepos_hw].word
        # NOTE(review): removed an unreachable `if False:  # DEBUG`
        # block of print statements that was dead code
        yield ('SYN_hlabel', hlabel)
        yield ('SYN_hword', hword)
def extract_single_syntax(edu_info):
    """Generate syntactic features for a single EDU.

    Parameters
    ----------
    edu_info : dict
        Basic features of the EDU ; must contain 'edu', and the
        syntactic entries 'ptree' (syntactic tree) and 'pheads'
        (lexical heads), otherwise no feature is emitted.

    Yields
    ------
    feat : (str, str)
        ('SYN_hlabel', label) and ('SYN_hword', word) for the head
        node of the EDU, when one is found.
    """
    try:
        ptree = edu_info['ptree']
        pheads = edu_info['pheads']
    except KeyError:
        # no syntactic annotation for this EDU: emit nothing
        return

    edu = edu_info['edu']

    # tree positions (in the syn tree) of the words that are in the EDU
    tpos_leaves_edu = [tpos_leaf
                       for tpos_leaf in ptree.treepositions('leaves')
                       if ptree[tpos_leaf].overlaps(edu)]
    wanted = set(tpos_leaves_edu)
    edu_head = find_edu_head(ptree, pheads, wanted)
    if edu_head is not None:
        treepos_hn, treepos_hw = edu_head
        hlabel = ptree[treepos_hn].label()
        hword = ptree[treepos_hw].word
        # NOTE(review): removed an unreachable `if False:  # DEBUG`
        # block of print statements that was dead code
        yield ('SYN_hlabel', hlabel)
        yield ('SYN_hword', hword)
def preprocess(self, doc, strict=False):
    """Preprocess a document and output basic features for each EDU.

    Parameters
    ----------
    doc: DocumentPlus
        Document to be processed.
    strict: boolean, defaults to False
        If True, raise ValueError when an EDU maps to no token.

    Returns
    -------
    edu_infos: list of dict of features
        List of basic features for each EDU ; each feature is a couple
        (basic_feat_name, basic_feat_val).
    para_infos: list of dict of features
        List of basic features for each paragraph ; each feature is a
        couple (basic_feat_name, basic_feat_val).

    TODO
    ----
    * [ ] explicitly impute missing values, e.g. for idxes_in_*
    """
    token_filter = self.token_filter
    word2clust = self.word2clust

    edus = doc.edus
    raw_words = doc.raw_words  # TEMPORARY
    tokens = doc.tkd_tokens
    trees = doc.tkd_trees
    paragraphs = doc.paragraphs  # NEW
    # mappings from EDU to other annotations
    edu2raw_sent = doc.edu2raw_sent
    edu2para = doc.edu2para
    edu2sent = doc.edu2sent
    edu2tokens = doc.edu2tokens
    lex_heads = doc.lex_heads  # EXPERIMENTAL

    # pre-compute relative indices (in sent, para) in one iteration
    # NB: moved to document_plus itself
    idxes_in_sent = doc.edu2idx_in_sent
    rev_idxes_in_sent = doc.edu2rev_idx_in_sent
    # NOTE(review): idxes_in_para is bound but never used below ;
    # kept as-is, see TODO on imputing idxes_in_* values
    idxes_in_para = doc.edu2idx_in_para
    rev_idxes_in_para = doc.edu2rev_idx_in_para

    # paragraphs
    if paragraphs is None:
        para_infos = None
    else:
        para_infos = []
        # special case for the left padding paragraph
        # (index 0 of tokens/trees/paragraphs is assumed to be a
        # padding element — consistent with usage throughout)
        pfeats = dict()
        pfeats['tokens'] = [tokens[0]]  # left padding token
        pfeats['syn_nodes'] = None
        para_infos.append(pfeats)
        # regular paragraphs
        for para_idx, para in enumerate(paragraphs[1:], start=1):
            pfeats = dict()
            # paragraph extent, in character offsets
            para_beg = para.sentences[0].span.char_start
            para_end = para.sentences[-1].span.char_end
            trees_beg = doc.trees_beg
            trees_end = doc.trees_end
            toks_beg = doc.toks_beg
            toks_end = doc.toks_end
            # * token characterization of the paragraph:
            # tokens entirely enclosed in the paragraph span
            encltoks_idc = np.where(
                np.logical_and(toks_beg >= para_beg,
                               toks_end <= para_end))[0]
            encltoks = [tokens[i] for i in encltoks_idc]
            pfeats['tokens'] = encltoks
            # * syntactic characterization of the paragraph
            # find the syntactic trees that span this paragraph
            enclosed_idc = np.intersect1d(
                np.where(trees_beg >= para_beg),
                np.where(trees_end <= para_end))
            overlapd_idc = np.intersect1d(
                np.where(trees_beg < para_end),
                np.where(trees_end > para_beg))
            if np.array_equal(enclosed_idc, overlapd_idc):
                # sentence seg and paragraph seg are compatible
                syn_nodes = [trees[tree_idx]
                             for tree_idx in overlapd_idc]
            else:
                # mismatch between the sentence segmentation from the
                # PTB and paragraph segmentation from the RST-WSJ:
                # trees that straddle a paragraph boundary
                strad_idc = np.setdiff1d(overlapd_idc, enclosed_idc)
                syn_nodes = []
                for tree_idx in overlapd_idc:
                    syn_tree = trees[tree_idx]
                    if tree_idx not in strad_idc:
                        # tree fully enclosed: keep whole tree
                        syn_nodes.append(syn_tree)
                        continue
                    # find the list of tokens that overlap this
                    # paragraph, and belong to this straddling
                    # tree
                    tree_beg = trees_beg[tree_idx]
                    tree_end = trees_end[tree_idx]
                    # here, reduce(np.logical_and(...)) was 2x
                    # faster than np.logical_and.reduce(...)
                    overtoks_idc = np.where(
                        reduce(np.logical_and,
                               (toks_beg < para_end,
                                toks_end > para_beg,
                                toks_beg >= tree_beg,
                                toks_end <= tree_end)))[0]
                    overtoks = [tokens[i] for i in overtoks_idc]
                    # cover the overlapping tokens with a sequence of
                    # syntactic nodes from the straddling tree
                    syn_node_seq = syntactic_node_seq(
                        syn_tree, overtoks)
                    syn_nodes.extend(syn_node_seq)
            # add basic feature
            pfeats['syn_nodes'] = syn_nodes
            # store
            para_infos.append(pfeats)

    # EDUs
    edu_infos = []
    # special case: left padding EDU
    edu = edus[0]
    res = dict()
    res['edu'] = edu
    # raw words (temporary)
    res['raw_words'] = []
    # tokens
    res['tokens'] = []  # TODO: __START__ / __START__ ?
    res['tags'] = []  # TODO: __START__ ?
    res['words'] = []  # TODO: __START__ ?
    res['tok_beg'] = 0  # EXPERIMENTAL
    res['tok_end'] = 0  # EXPERIMENTAL
    # EXPERIMENTAL: Brown clusters
    res['brown_clusters'] = []
    # end Brown clusters
    # sentence
    res['edu_idx_in_sent'] = idxes_in_sent[0]
    res['edu_rev_idx_in_sent'] = rev_idxes_in_sent[0]
    res['sent_idx'] = 0
    res['sent_rev_idx'] = len(trees) - 1  # NEW
    # para
    res['edu_rev_idx_in_para'] = rev_idxes_in_para[0]
    # aka paragraphID
    res['para_idx'] = 0
    res['para_rev_idx'] = (len(paragraphs) - 1 if paragraphs is not None
                           else None)  # NEW
    # raw sent
    res['raw_sent_idx'] = edu2raw_sent[0]
    edu_infos.append(res)

    # regular EDUs
    for edu_idx, edu in enumerate(edus[1:], start=1):
        res = dict()
        res['edu'] = edu

        # raw words (temporary)
        res['raw_words'] = raw_words[edu_idx]

        # tokens
        if tokens is not None:
            tok_idcs = edu2tokens[edu_idx]
            toks = [tokens[tok_idx] for tok_idx in tok_idcs]
            # special case: no tokens
            if strict and not toks:
                emsg = 'No token for EDU'
                print(list(enumerate(tokens)))
                print(tok_idcs)
                print(edu.text())
                raise ValueError(emsg)
            # filter tokens if relevant
            if token_filter is not None:
                toks = [tt for tt in toks if token_filter(tt)]
            # store information
            res['tokens'] = toks
            res['tags'] = [tok.tag for tok in toks]
            res['words'] = [tok.word for tok in toks]

            # EXPERIMENTAL: Brown clusters
            # NB: words absent from the mapping are silently dropped
            if word2clust is not None:
                res['brown_clusters'] = [word2clust[w]
                                         for w in res['words']
                                         if w in word2clust]
            # end Brown clusters

        # doc structure

        # position of sentence containing EDU in doc
        # aka sentence_id
        sent_idx = edu2sent[edu_idx]
        res['sent_idx'] = sent_idx
        res['sent_rev_idx'] = (len(trees) - 1 - sent_idx
                               if sent_idx is not None
                               else None)  # NEW
        # position of EDU in sentence
        # aka num_edus_from_sent_start aka offset
        res['edu_idx_in_sent'] = idxes_in_sent[edu_idx]
        # aka num_edus_to_sent_end aka revOffset
        res['edu_rev_idx_in_sent'] = rev_idxes_in_sent[edu_idx]

        # position of paragraph containing EDU in doc
        # aka paragraphID
        para_idx = edu2para[edu_idx]
        res['para_idx'] = para_idx
        res['para_rev_idx'] = (len(paragraphs) - 1 - para_idx
                               if (paragraphs is not None
                                   and para_idx is not None)
                               else None)  # NEW
        # position of raw sentence
        res['raw_sent_idx'] = edu2raw_sent[edu_idx]
        # position of EDU in paragraph
        # aka num_edus_to_para_end aka revSentenceID (?!)
        # TODO: check for the 10th time if this is a bug in Li et al.'s
        # parser
        res['edu_rev_idx_in_para'] = rev_idxes_in_para[edu_idx]

        # syntax
        # (len(trees) > 1 presumably means there is at least one real
        # tree besides the left-padding one — TODO confirm)
        if len(trees) > 1:
            tree_idx = edu2sent[edu_idx]
            res['tkd_tree_idx'] = tree_idx
            if tree_idx is not None:
                # head node of the EDU (for DS-LST features)
                ptree = trees[tree_idx]
                pheads = lex_heads[tree_idx]
                # tree positions (in the syn tree) of the words of
                # the EDU
                tpos_leaves_edu = [x for x
                                   in ptree.treepositions('leaves')
                                   if ptree[x].overlaps(edu)]
                tpos_words = set(tpos_leaves_edu)
                res['tpos_words'] = tpos_words
                edu_head = find_edu_head(ptree, pheads, tpos_words)
                res['edu_head'] = edu_head
        edu_infos.append(res)

    return edu_infos, para_infos
def extract_pair_syntax(edu_info1, edu_info2):
    """Generate syntactic features for a pair of EDUs.

    DS-LST features are generated only when both EDUs belong to the
    same syntactic tree (intra-sentential pair).

    Parameters
    ----------
    edu_info1 : dict
        Basic features of the first EDU ; must contain 'edu', 'ptree'
        and 'pheads', otherwise no feature is emitted.
    edu_info2 : dict
        Basic features of the second EDU ; same requirements.

    Yields
    ------
    feat : (str, object)
        Dominance ('SYN_dom_1' / 'SYN_dom_2'), attachment
        ('SYN_alabel', 'SYN_aword') and head ('SYN_hlabel',
        'SYN_hword') features.
    """
    try:
        ptree1 = edu_info1['ptree']
        pheads1 = edu_info1['pheads']
        ptree2 = edu_info2['ptree']
        pheads2 = edu_info2['pheads']
    except KeyError:
        # missing syntactic annotation: emit nothing
        return

    edu1 = edu_info1['edu']
    edu2 = edu_info2['edu']

    # generate DS-LST features for intra-sentential
    if ptree1 == ptree2:
        ptree = ptree1
        pheads = pheads1

        # find the head node of EDU1
        # tree positions (in the syn tree) of the words that are in EDU1
        tpos_leaves_edu1 = [tpos_leaf
                            for tpos_leaf in ptree.treepositions('leaves')
                            if ptree[tpos_leaf].overlaps(edu1)]
        tpos_words1 = set(tpos_leaves_edu1)
        edu1_head = find_edu_head(ptree, pheads, tpos_words1)
        if edu1_head is not None:
            treepos_hn1, treepos_hw1 = edu1_head
            hlabel1 = ptree[treepos_hn1].label()
            hword1 = ptree[treepos_hw1].word
            # if the head node is not the root of the syn tree,
            # there is an attachment node
            if treepos_hn1 != ():
                treepos_an1 = treepos_hn1[:-1]
                treepos_aw1 = pheads[treepos_an1]
                alabel1 = ptree[treepos_an1].label()
                aword1 = ptree[treepos_aw1].word

        # find the head node of EDU2
        # tree positions (in the syn tree) of the words that are in EDU2
        tpos_leaves_edu2 = [tpos_leaf
                            for tpos_leaf in ptree.treepositions('leaves')
                            if ptree[tpos_leaf].overlaps(edu2)]
        tpos_words2 = set(tpos_leaves_edu2)
        edu2_head = find_edu_head(ptree, pheads, tpos_words2)
        if edu2_head is not None:
            treepos_hn2, treepos_hw2 = edu2_head
            hlabel2 = ptree[treepos_hn2].label()
            hword2 = ptree[treepos_hw2].word
            # if the head node is not the root of the syn tree,
            # there is an attachment node
            if treepos_hn2 != ():
                treepos_an2 = treepos_hn2[:-1]
                treepos_aw2 = pheads[treepos_an2]
                alabel2 = ptree[treepos_an2].label()
                aword2 = ptree[treepos_aw2].word

        # EXPERIMENTAL
        #
        # EDU 2 > EDU 1
        # FIX: guard on edu1_head, otherwise treepos_hn1 is unbound
        # (NameError) when no head was found for EDU1
        if (edu1_head is not None
                and treepos_hn1 != ()
                and treepos_aw1 in tpos_words2):
            # dominance relationship: 2 > 1
            yield ('SYN_dom_2', True)
            # attachment label and word
            yield ('SYN_alabel', alabel1)
            yield ('SYN_aword', aword1)
            # head label and word
            yield ('SYN_hlabel', hlabel1)
            yield ('SYN_hword', hword1)

        # EDU 1 > EDU 2
        # FIX: same guard for EDU2's head
        if (edu2_head is not None
                and treepos_hn2 != ()
                and treepos_aw2 in tpos_words1):
            # dominance relationship: 1 > 2
            yield ('SYN_dom_1', True)
            # attachment label and word
            yield ('SYN_alabel', alabel2)
            yield ('SYN_aword', aword2)
            # head label and word
            yield ('SYN_hlabel', hlabel2)
            yield ('SYN_hword', hword2)
def extract_pair_syntax(edu_info1, edu_info2):
    """Generate syntactic features for a pair of EDUs.

    DS-LST features are generated only when both EDUs belong to the
    same syntactic tree (intra-sentential pair).

    Parameters
    ----------
    edu_info1 : dict
        Basic features of the first EDU ; must contain 'edu', 'ptree'
        and 'pheads', otherwise no feature is emitted.
    edu_info2 : dict
        Basic features of the second EDU ; same requirements.

    Yields
    ------
    feat : (str, object)
        Dominance ('SYN_dom_1' / 'SYN_dom_2'), attachment
        ('SYN_alabel', 'SYN_aword') and head ('SYN_hlabel',
        'SYN_hword') features.
    """
    try:
        ptree1 = edu_info1['ptree']
        pheads1 = edu_info1['pheads']
        ptree2 = edu_info2['ptree']
        pheads2 = edu_info2['pheads']
    except KeyError:
        # missing syntactic annotation: emit nothing
        return

    edu1 = edu_info1['edu']
    edu2 = edu_info2['edu']

    # generate DS-LST features for intra-sentential
    if ptree1 == ptree2:
        ptree = ptree1
        pheads = pheads1

        # find the head node of EDU1
        # tree positions (in the syn tree) of the words that are in EDU1
        tpos_leaves_edu1 = [tpos_leaf
                            for tpos_leaf in ptree.treepositions('leaves')
                            if ptree[tpos_leaf].overlaps(edu1)]
        tpos_words1 = set(tpos_leaves_edu1)
        edu1_head = find_edu_head(ptree, pheads, tpos_words1)
        if edu1_head is not None:
            treepos_hn1, treepos_hw1 = edu1_head
            hlabel1 = ptree[treepos_hn1].label()
            hword1 = ptree[treepos_hw1].word
            # if the head node is not the root of the syn tree,
            # there is an attachment node
            if treepos_hn1 != ():
                treepos_an1 = treepos_hn1[:-1]
                treepos_aw1 = pheads[treepos_an1]
                alabel1 = ptree[treepos_an1].label()
                aword1 = ptree[treepos_aw1].word

        # find the head node of EDU2
        # tree positions (in the syn tree) of the words that are in EDU2
        tpos_leaves_edu2 = [tpos_leaf
                            for tpos_leaf in ptree.treepositions('leaves')
                            if ptree[tpos_leaf].overlaps(edu2)]
        tpos_words2 = set(tpos_leaves_edu2)
        edu2_head = find_edu_head(ptree, pheads, tpos_words2)
        if edu2_head is not None:
            treepos_hn2, treepos_hw2 = edu2_head
            hlabel2 = ptree[treepos_hn2].label()
            hword2 = ptree[treepos_hw2].word
            # if the head node is not the root of the syn tree,
            # there is an attachment node
            if treepos_hn2 != ():
                treepos_an2 = treepos_hn2[:-1]
                treepos_aw2 = pheads[treepos_an2]
                alabel2 = ptree[treepos_an2].label()
                aword2 = ptree[treepos_aw2].word

        # EXPERIMENTAL
        #
        # EDU 2 > EDU 1
        # FIX: guard on edu1_head, otherwise treepos_hn1 is unbound
        # (NameError) when no head was found for EDU1
        if (edu1_head is not None
                and treepos_hn1 != ()
                and treepos_aw1 in tpos_words2):
            # dominance relationship: 2 > 1
            yield ('SYN_dom_2', True)
            # attachment label and word
            yield ('SYN_alabel', alabel1)
            yield ('SYN_aword', aword1)
            # head label and word
            yield ('SYN_hlabel', hlabel1)
            yield ('SYN_hword', hword1)

        # EDU 1 > EDU 2
        # FIX: same guard for EDU2's head
        if (edu2_head is not None
                and treepos_hn2 != ()
                and treepos_aw2 in tpos_words1):
            # dominance relationship: 1 > 2
            yield ('SYN_dom_1', True)
            # attachment label and word
            yield ('SYN_alabel', alabel2)
            yield ('SYN_aword', aword2)
            # head label and word
            yield ('SYN_hlabel', hlabel2)
            yield ('SYN_hword', hword2)
def preprocess(self, doc, strict=False):
    """Preprocess a document and output basic features for each EDU.

    Parameters
    ----------
    doc: DocumentPlus
        Document to be processed.
    strict: boolean, defaults to False
        If True, raise ValueError when an EDU maps to no token.

    Returns
    -------
    edu_infos: list of dict of features
        List of basic features for each EDU ; each feature is a couple
        (basic_feat_name, basic_feat_val).
    para_infos: list of dict of features
        List of basic features for each paragraph ; each feature is a
        couple (basic_feat_name, basic_feat_val).

    TODO
    ----
    * [ ] explicitly impute missing values, e.g. for idxes_in_*
    """
    token_filter = self.token_filter
    word2clust = self.word2clust

    edus = doc.edus
    raw_words = doc.raw_words  # TEMPORARY
    tokens = doc.tkd_tokens
    trees = doc.tkd_trees
    paragraphs = doc.paragraphs  # NEW
    # mappings from EDU to other annotations
    edu2raw_sent = doc.edu2raw_sent
    edu2para = doc.edu2para
    edu2sent = doc.edu2sent
    edu2tokens = doc.edu2tokens
    lex_heads = doc.lex_heads  # EXPERIMENTAL

    # pre-compute relative indices (in sent, para) in one iteration
    # NB: moved to document_plus itself
    idxes_in_sent = doc.edu2idx_in_sent
    rev_idxes_in_sent = doc.edu2rev_idx_in_sent
    # NOTE(review): idxes_in_para is bound but never used below ;
    # kept as-is, see TODO on imputing idxes_in_* values
    idxes_in_para = doc.edu2idx_in_para
    rev_idxes_in_para = doc.edu2rev_idx_in_para

    # paragraphs
    if paragraphs is None:
        para_infos = None
    else:
        para_infos = []
        # special case for the left padding paragraph
        # (index 0 of tokens/trees/paragraphs is assumed to be a
        # padding element — consistent with usage throughout)
        pfeats = dict()
        pfeats['tokens'] = [tokens[0]]  # left padding token
        pfeats['syn_nodes'] = None
        para_infos.append(pfeats)
        # regular paragraphs
        for para_idx, para in enumerate(paragraphs[1:], start=1):
            pfeats = dict()
            # paragraph extent, in character offsets
            para_beg = para.sentences[0].span.char_start
            para_end = para.sentences[-1].span.char_end
            trees_beg = doc.trees_beg
            trees_end = doc.trees_end
            toks_beg = doc.toks_beg
            toks_end = doc.toks_end
            # * token characterization of the paragraph:
            # tokens entirely enclosed in the paragraph span
            encltoks_idc = np.where(
                np.logical_and(toks_beg >= para_beg,
                               toks_end <= para_end))[0]
            encltoks = [tokens[i] for i in encltoks_idc]
            pfeats['tokens'] = encltoks
            # * syntactic characterization of the paragraph
            # find the syntactic trees that span this paragraph
            enclosed_idc = np.intersect1d(
                np.where(trees_beg >= para_beg),
                np.where(trees_end <= para_end))
            overlapd_idc = np.intersect1d(
                np.where(trees_beg < para_end),
                np.where(trees_end > para_beg))
            if np.array_equal(enclosed_idc, overlapd_idc):
                # sentence seg and paragraph seg are compatible
                syn_nodes = [trees[tree_idx]
                             for tree_idx in overlapd_idc]
            else:
                # mismatch between the sentence segmentation from the
                # PTB and paragraph segmentation from the RST-WSJ:
                # trees that straddle a paragraph boundary
                strad_idc = np.setdiff1d(overlapd_idc, enclosed_idc)
                syn_nodes = []
                for tree_idx in overlapd_idc:
                    syn_tree = trees[tree_idx]
                    if tree_idx not in strad_idc:
                        # tree fully enclosed: keep whole tree
                        syn_nodes.append(syn_tree)
                        continue
                    # find the list of tokens that overlap this
                    # paragraph, and belong to this straddling
                    # tree
                    tree_beg = trees_beg[tree_idx]
                    tree_end = trees_end[tree_idx]
                    # here, reduce(np.logical_and(...)) was 2x
                    # faster than np.logical_and.reduce(...)
                    overtoks_idc = np.where(
                        reduce(np.logical_and,
                               (toks_beg < para_end,
                                toks_end > para_beg,
                                toks_beg >= tree_beg,
                                toks_end <= tree_end)))[0]
                    overtoks = [tokens[i] for i in overtoks_idc]
                    # cover the overlapping tokens with a sequence of
                    # syntactic nodes from the straddling tree
                    syn_node_seq = syntactic_node_seq(syn_tree, overtoks)
                    syn_nodes.extend(syn_node_seq)
            # add basic feature
            pfeats['syn_nodes'] = syn_nodes
            # store
            para_infos.append(pfeats)

    # EDUs
    edu_infos = []
    # special case: left padding EDU
    edu = edus[0]
    res = dict()
    res['edu'] = edu
    # raw words (temporary)
    res['raw_words'] = []
    # tokens
    res['tokens'] = []  # TODO: __START__ / __START__ ?
    res['tags'] = []  # TODO: __START__ ?
    res['words'] = []  # TODO: __START__ ?
    res['tok_beg'] = 0  # EXPERIMENTAL
    res['tok_end'] = 0  # EXPERIMENTAL
    # EXPERIMENTAL: Brown clusters
    res['brown_clusters'] = []
    # end Brown clusters
    # sentence
    res['edu_idx_in_sent'] = idxes_in_sent[0]
    res['edu_rev_idx_in_sent'] = rev_idxes_in_sent[0]
    res['sent_idx'] = 0
    res['sent_rev_idx'] = len(trees) - 1  # NEW
    # para
    res['edu_rev_idx_in_para'] = rev_idxes_in_para[0]
    # aka paragraphID
    res['para_idx'] = 0
    res['para_rev_idx'] = (len(paragraphs) - 1 if paragraphs is not None
                           else None)  # NEW
    # raw sent
    res['raw_sent_idx'] = edu2raw_sent[0]
    edu_infos.append(res)

    # regular EDUs
    for edu_idx, edu in enumerate(edus[1:], start=1):
        res = dict()
        res['edu'] = edu

        # raw words (temporary)
        res['raw_words'] = raw_words[edu_idx]

        # tokens
        if tokens is not None:
            tok_idcs = edu2tokens[edu_idx]
            toks = [tokens[tok_idx] for tok_idx in tok_idcs]
            # special case: no tokens
            if strict and not toks:
                emsg = 'No token for EDU'
                print(list(enumerate(tokens)))
                print(tok_idcs)
                print(edu.text())
                raise ValueError(emsg)
            # filter tokens if relevant
            if token_filter is not None:
                toks = [tt for tt in toks if token_filter(tt)]
            # store information
            res['tokens'] = toks
            res['tags'] = [tok.tag for tok in toks]
            res['words'] = [tok.word for tok in toks]

            # EXPERIMENTAL: Brown clusters
            # NB: words absent from the mapping are silently dropped
            if word2clust is not None:
                res['brown_clusters'] = [
                    word2clust[w] for w in res['words']
                    if w in word2clust
                ]
            # end Brown clusters

        # doc structure

        # position of sentence containing EDU in doc
        # aka sentence_id
        sent_idx = edu2sent[edu_idx]
        res['sent_idx'] = sent_idx
        res['sent_rev_idx'] = (len(trees) - 1 - sent_idx
                               if sent_idx is not None
                               else None)  # NEW
        # position of EDU in sentence
        # aka num_edus_from_sent_start aka offset
        res['edu_idx_in_sent'] = idxes_in_sent[edu_idx]
        # aka num_edus_to_sent_end aka revOffset
        res['edu_rev_idx_in_sent'] = rev_idxes_in_sent[edu_idx]

        # position of paragraph containing EDU in doc
        # aka paragraphID
        para_idx = edu2para[edu_idx]
        res['para_idx'] = para_idx
        res['para_rev_idx'] = (
            len(paragraphs) - 1 - para_idx
            if (paragraphs is not None and para_idx is not None)
            else None)  # NEW
        # position of raw sentence
        res['raw_sent_idx'] = edu2raw_sent[edu_idx]
        # position of EDU in paragraph
        # aka num_edus_to_para_end aka revSentenceID (?!)
        # TODO: check for the 10th time if this is a bug in Li et al.'s
        # parser
        res['edu_rev_idx_in_para'] = rev_idxes_in_para[edu_idx]

        # syntax
        # (len(trees) > 1 presumably means there is at least one real
        # tree besides the left-padding one — TODO confirm)
        if len(trees) > 1:
            tree_idx = edu2sent[edu_idx]
            res['tkd_tree_idx'] = tree_idx
            if tree_idx is not None:
                # head node of the EDU (for DS-LST features)
                ptree = trees[tree_idx]
                pheads = lex_heads[tree_idx]
                # tree positions (in the syn tree) of the words of
                # the EDU
                tpos_leaves_edu = [
                    x for x in ptree.treepositions('leaves')
                    if ptree[x].overlaps(edu)
                ]
                tpos_words = set(tpos_leaves_edu)
                res['tpos_words'] = tpos_words
                edu_head = find_edu_head(ptree, pheads, tpos_words)
                res['edu_head'] = edu_head
        edu_infos.append(res)

    return edu_infos, para_infos