def align_with_trees(self, strict=False):
    """Compute for each EDU the overlapping trees"""
    edus = self.edus
    syn_trees = self.tkd_trees

    # if there is no sentence segmentation from syntax,
    # use the raw (bad) one from the .out
    if len(syn_trees) == 1:  # only lpad
        self.edu2sent = self.edu2raw_sent
        return self

    # compute edu2sent, prepend 0 for lpad, shift all indices by 1
    assert edus[0].is_left_padding()
    edu2sent = align_edus_with_sentences(self.edus[1:], syn_trees[1:],
                                         strict=strict)
    edu2sent = [0] + [(i + 1 if i is not None else i) for i in edu2sent]
    self.edu2sent = edu2sent

    # compute the relative index of each EDU from the beginning (resp. from
    # the end) of the surrounding sentence
    idxes_in_sent = relative_indices(edu2sent)
    rev_idxes_in_sent = relative_indices(edu2sent, reverse=True)
    self.edu2idx_in_sent = idxes_in_sent
    self.edu2rev_idx_in_sent = rev_idxes_in_sent

    return self
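

# NOTE: `align_edus_with_sentences` is not part of this excerpt. The sketch
# below is a hypothetical reconstruction, assuming it mirrors the inline logic
# of the second `align_with_trees` variant further down: each EDU is mapped to
# the index of the unique overlapping tree, to None when no tree overlaps
# (unless `strict`), and to the tree with maximal overlap when several do.
def align_edus_with_sentences(edus, syn_trees, strict=False):
    edu2sent = []
    for edu in edus:
        tree_idcs = [i for i, tree in enumerate(syn_trees)
                     if tree is not None and tree.overlaps(edu)]
        if len(tree_idcs) == 1:
            edu2sent.append(tree_idcs[0])
        elif not tree_idcs:
            if strict:
                raise ValueError('No syntactic tree for EDU: {}'.format(edu))
            edu2sent.append(None)
        else:
            if strict:
                raise ValueError(
                    'One EDU, more than one tree: {}'.format(edu))
            # keep the tree whose overlap with the EDU span is largest
            ovlaps = [syn_trees[i].overlaps(edu).length() for i in tree_idcs]
            edu2sent.append(tree_idcs[ovlaps.index(max(ovlaps))])
    return edu2sent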


def test_relative_indices():
    """Test for relative_indices"""
    # first example: common case
    example1 = [0, 1, 1, 1, 2, 3, 3]
    rel_exa1 = [0, 0, 1, 2, 0, 0, 1]
    assert relative_indices(example1) == rel_exa1
    inv_exa1 = [0, 2, 1, 0, 0, 1, 0]
    assert relative_indices(example1, reverse=True) == inv_exa1

    # second example: case with None
    # each None has relative index 'valna',
    # so if valna=0, each None is considered to be a distinct subgroup
    example2 = [None, 0, 1, 1, None, None]
    rel_exa2 = [0, 0, 0, 1, 0, 0]
    assert relative_indices(example2, valna=0) == rel_exa2
    inv_exa2 = [0, 0, 1, 0, 0, 0]
    assert relative_indices(example2, reverse=True, valna=0) == inv_exa2
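

# NOTE: `relative_indices` itself is not shown in this excerpt. The following
# is a minimal sketch that satisfies the test above; the real implementation
# may differ. Each element gets its position within its run of consecutive
# identical group indices (counted from the end if `reverse`); None entries
# get the placeholder value `valna`.
from itertools import groupby


def relative_indices(group_indices, reverse=False, valna=None):
    result = []
    for key, grp in groupby(group_indices):
        grp = list(grp)
        if key is None:
            # None entries do not belong to any group
            result.extend([valna] * len(grp))
        else:
            idxes = range(len(grp))
            result.extend(reversed(idxes) if reverse else idxes)
    return result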


def align_with_doc_structure(self):
    """Align EDUs with the document structure (paragraph and sentence).

    Determine which paragraph and sentence (if any) surrounds this EDU.

    Try to accommodate the occasional off-by-a-smidgen error by folks
    marking these EDU boundaries, e.g. original text:

        Para1: "Magazines are not providing us in-depth information on
        circulation," said Edgar Bronfman Jr., .. "How do readers feel
        about the magazine?... Research doesn't tell us whether people
        actually do read the magazines they subscribe to."

        Para2: Reuben Mark, chief executive of Colgate-Palmolive, said...

    Marked up EDU is wide to the left by three characters:

        " Reuben Mark, chief executive of Colgate-Palmolive, said...
    """
    text = self.text
    edus = self.edus

    # align EDUs with paragraphs
    paragraphs = self.paragraphs  # dirty extraction
    if paragraphs is None:
        edu2para = None
    else:
        edu2para = align_edus_with_paragraphs(edus, paragraphs, text)
    if edu2para is None:
        edu2para = [None for edu in edus]
    self.edu2para = edu2para

    # compute the relative index of each EDU from the beginning (resp. from
    # the end) of the surrounding paragraph
    idxes_in_para = relative_indices(edu2para)
    rev_idxes_in_para = relative_indices(edu2para, reverse=True)
    self.edu2idx_in_para = idxes_in_para
    self.edu2rev_idx_in_para = rev_idxes_in_para

    # align EDUs with raw sentences
    # NB: this usually fails due to bad sentence segmentation, e.g.
    # ... Prof.\nHarold ... in wsj_##.out files,
    # or end of sentence missing in file## files.
    raw_sentences = self.raw_sentences
    if raw_sentences is None:
        edu2raw_sent = [None for edu in edus]
    else:
        edu2raw_sent = []
        edu2raw_sent.append(0)  # left padding
        # align the other EDUs
        for edu in edus[1:]:
            espan = edu.text_span()
            # find the enclosing raw sentence
            sent = _filter0(containing(espan), raw_sentences)
            # sloppy EDUs happen; try shaving off some characters
            # if we can't find a sentence
            if sent is None:
                # DEBUG
                if False:
                    print('WP ({}) : {}'.format(self.grouping, edu))
                # end DEBUG
                espan = copy.copy(espan)
                espan.char_start += 1
                espan.char_end -= 1
                etext = text[espan.char_start:espan.char_end]
                # kill left whitespace
                espan.char_start += len(etext) - len(etext.lstrip())
                etext = etext.lstrip()
                # kill right whitespace
                espan.char_end -= len(etext) - len(etext.rstrip())
                etext = etext.rstrip()
                # try again
                sent = _filter0(containing(espan), raw_sentences)
                # DEBUG
                if False:
                    if sent is None:
                        print('EP ({}): {}'.format(self.grouping, edu))
                # end DEBUG
            # update the EDU-to-sentence mapping
            raw_sent_idx = (raw_sentences.index(sent) if sent is not None
                            else None)  # TODO or -1 or ... ?
            edu2raw_sent.append(raw_sent_idx)

    self.edu2raw_sent = edu2raw_sent

    return self
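

# NOTE: the two helpers used above, `containing` and `_filter0`, are not shown
# in this excerpt (nor is the module-level `import copy` the method relies on).
# The sketch below only captures the behavior assumed here: `containing(span)`
# builds a predicate selecting annotations whose text span encloses `span`,
# and `_filter0(pred, iterable)` returns the first matching element or None.
def containing(span):
    """Return a predicate that is True for annotations whose text span
    encloses `span` (assumed behavior)."""
    return lambda anno: anno.text_span().encloses(span)


def _filter0(pred, iterable):
    """Return the first element of `iterable` satisfying `pred`, else None."""
    for item in iterable:
        if pred(item):
            return item
    return None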


def align_with_trees(self, strict=False):
    """Compute for each EDU the overlapping trees"""
    syn_trees = self.tkd_trees

    # if there is no sentence segmentation from syntax,
    # use the raw (bad) one from the .out
    if len(syn_trees) == 1:  # only lpad
        self.edu2sent = self.edu2raw_sent
        return self

    edu2sent = []
    edus = self.edus

    # left padding EDU
    assert edus[0].is_left_padding()
    edu2sent.append(0)

    # regular EDUs
    for edu in edus[1:]:
        tree_idcs = [tree_idx
                     for tree_idx, tree in enumerate(syn_trees[1:], start=1)
                     if tree is not None and tree.overlaps(edu)]
        if len(tree_idcs) == 1:
            tree_idx = tree_idcs[0]
        elif len(tree_idcs) == 0:
            # "no tree at all" can happen when the EDU text is totally
            # absent from the list of sentences of this doc in the PTB
            # ex: wsj_0696.out, last sentence
            if strict:
                print(edu)
                emsg = 'No PTB tree for this EDU'
                raise ValueError(emsg)
            tree_idx = None
        else:
            # more than one PTB tree overlaps with this EDU
            if strict:
                emsg = ('Segmentation mismatch: '
                        'one EDU, more than one PTB tree')
                print(edu)
                ptrees = [syn_trees[tree_idx] for tree_idx in tree_idcs]
                for ptree in ptrees:
                    print('    ', [str(leaf) for leaf in ptree.leaves()])
                raise ValueError(emsg)
            # heuristics: pick the PTB tree with maximal overlap
            # with the EDU span
            len_espan = edu.span.length()
            ovlaps = [syn_trees[tree_idx].overlaps(edu).length()
                      for tree_idx in tree_idcs]
            ovlap_ratios = [float(ovlap) / len_espan for ovlap in ovlaps]
            # find the argmax
            max_idx = ovlap_ratios.index(max(ovlap_ratios))
            tree_idx = tree_idcs[max_idx]
        edu2sent.append(tree_idx)
    self.edu2sent = edu2sent

    # compute the relative index of each EDU from the beginning (resp. from
    # the end) of the surrounding sentence
    idxes_in_sent = relative_indices(edu2sent)
    rev_idxes_in_sent = relative_indices(edu2sent, reverse=True)
    self.edu2idx_in_sent = idxes_in_sent
    self.edu2rev_idx_in_sent = rev_idxes_in_sent

    return self
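

# Standalone illustration of the max-overlap heuristic used above, on plain
# character spans instead of EDU/tree objects; the helper name and the spans
# are hypothetical, for exposition only.
def pick_max_overlap(edu_span, tree_spans):
    """Index of the span in `tree_spans` with the largest character overlap
    with `edu_span`, or None if no span overlaps it."""
    e_start, e_end = edu_span
    ovlaps = [max(0, min(e_end, t_end) - max(e_start, t_start))
              for t_start, t_end in tree_spans]
    best = max(ovlaps)
    return ovlaps.index(best) if best > 0 else None


# the EDU (10, 30) straddles two sentence spans; the second covers more of it
assert pick_max_overlap((10, 30), [(0, 18), (18, 45)]) == 1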