def _postprocess(tree, start=0, edu_start=1):
    """
    Helper function: convert the NLTK-parsed representation of an RST
    tree to one using educe-style Standoff objects
    """
    if isinstance(tree, Tree):
        children = []
        position = start - 1  # compensate for virtual whitespace added below
        node = _parse_node(treenode(tree), Span(-1, -1))
        edu_start2 = node.edu_span[0]
        for child_ in tree:
            # (NB: +1 to add virtual whitespace between EDUs)
            child = _postprocess(child_, position + 1, edu_start2)
            children.append(child)
            # pylint: disable=E1101
            child_sp = _tree_span(child)
            # pylint: enable=E1101
            position = child_sp.char_end
        node.span = Span(start, position)
        return RSTTree(node, children)
    else:
        if tree.startswith("["):
            return _parse_edu(tree[1:-1], edu_start, start)
        else:
            # NB: the offending leaf is `tree` itself (`child` is not
            # defined in this branch)
            raise RSTTreeException("ERROR in rst tree format for leaf : ",
                                   tree)
def walk(subtree, posinfo=PosInfo(text=0, edu=0)):
    """
    walk down first-cut tree, counting span info and returning a
    fancier tree along the way
    """
    if isinstance(subtree, Tree):
        start = copy.copy(posinfo)
        children = []
        nuc_kids = []
        for kid in subtree:
            tree, posinfo, nuc_kid = walk(kid, posinfo)
            children.append(tree)
            nuc_kids.append(nuc_kid)
        nuclearity = ''.join(nuc_kids)
        match = _lw_type_re.match(treenode(subtree))
        if not match:
            raise RSTTreeException("Missing nuclearity annotation in " +
                                   str(subtree))
        nuc = match.group("nuc")
        rel = match.group("rel") or "leaf"
        edu_span = (start.edu, posinfo.edu - 1)
        span = Span(start.text, posinfo.text)
        node = Node(nuclearity, edu_span, span, rel)
        return SimpleRSTTree(node, children), posinfo, nuc
    else:
        text = subtree
        start = posinfo.text
        end = start + len(text)
        posinfo2 = PosInfo(text=end, edu=posinfo.edu + 1)
        return EDU(posinfo.edu, Span(start, end), text), posinfo2, "leaf"
def assertOverlap(self, expected, pair1, pair2, **kwargs):
    "assert that `pair1.overlaps(pair2) == expected` (modulo boxing)"
    (x1, y1) = pair1
    (x2, y2) = pair2
    (rx, ry) = expected
    o = Span(x1, y1).overlaps(Span(x2, y2), **kwargs)
    self.assertTrue(o)
    self.assertEqual(Span(rx, ry), o)
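# Sketch of the Span.overlaps contract the helper above relies on: it is
# assumed to return the overlapping region (a Span, or None) rather than
# a boolean, so a truthy result both passes assertTrue and can be
# compared against an expected Span. Values are illustrative.
assert Span(0, 5).overlaps(Span(3, 8)) == Span(3, 5)
assert Span(0, 3).overlaps(Span(5, 8)) is None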
def split_doc(doc, middle):
    """
    Given a split point, break a document into two pieces.
    If the split point is None, we take the whole document
    (this is slightly different from having -1 as a split point).

    Raise an exception if there are any annotations that span the point.

    Parameters
    ----------
    doc : Document
        The document we want to split.
    middle : int
        Split point.

    Returns
    -------
    doc_prefix : Document
        Deep copy of `doc` restricted to span [:middle].
    doc_suffix : Document
        Deep copy of `doc` restricted to span [middle:]; the span of
        each annotation is shifted to match the new text.
    """
    doc_len = doc.text_span().char_end
    if middle < 0:
        middle = doc_len + 1 + middle

    def straddles(point, span):
        """
        True if the point is somewhere in the middle of the span
        (sitting at the right edge doesn't count).

        Note that this is not the same as checking for enclosure
        because we do not include the rightward edge.
        """
        if span is None:
            return False
        return span.char_start < point and span.char_end > point

    leftovers = [x for x in doc.annotations()
                 if straddles(middle, x.text_span())]
    if leftovers:
        oops = ("Can't split document [{origin}] at {middle} because it is "
                "straddled by the following annotations:\n"
                "{annotations}\n"
                "Either split at a different place or remove the annotations")
        leftovers = [' * %s %s' % (x.text_span(), x) for x in leftovers]
        raise StacDocException(
            oops.format(origin=doc.origin,
                        middle=middle,
                        annotations='\n'.join(leftovers)))

    prefix = Span(0, middle)
    suffix = Span(middle, doc_len)
    return narrow_to_span(doc, prefix), narrow_to_span(doc, suffix)
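# A quick check of the straddling rule in split_doc: only spans that
# properly contain the split point block a split; touching an edge is
# fine. Bare (start, end) pairs stand in for Span here.
def _toy_straddles(point, start, end):
    return start < point < end

assert _toy_straddles(5, 3, 7)        # 5 is strictly inside (3, 7)
assert not _toy_straddles(5, 5, 9)    # span starts at the split point
assert not _toy_straddles(5, 1, 5)    # sitting at the right edge is fine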
def test_simple_align(self):
    "trivial token realignment"
    tokens = ["a", "bb", "ccc"]
    text = "a bb    ccc"  # 4 spaces before "ccc", as the expected spans require
    spans = list(generic_token_spans(text, tokens))
    expected = [Span(0, 1), Span(2, 4), Span(8, 11)]
    self.assertEqual(expected, spans)
def test_messy_align(self):
    "ignore whitespace in token"
    tokens = ["a", "b b", "c c c"]
    text = "a bb    ccc"  # same text as test_simple_align
    spans = list(generic_token_spans(text, tokens))
    expected = [Span(0, 1), Span(2, 4), Span(8, 11)]
    self.assertEqual(expected, spans)
def _actually_split(tcache, doc, dialogue, turn):
    """Split the dialogue before the given turn."""
    dspan = dialogue.text_span()
    tspan = turn.text_span()
    span1 = Span(dspan.char_start, tspan.char_start - 1)
    span2 = Span(tspan.char_start - 1, dspan.char_end)
    dialogue1 = dialogue
    dialogue2 = copy.deepcopy(dialogue)
    _set(tcache, span1, dialogue1)
    _set(tcache, span2, dialogue2)
    doc.units.append(dialogue2)
    dialogue2.features = {}
def generic_token_spans(text, tokens, offset=0, txtfn=None):
    """
    Given a string and a sequence of substrings within that string,
    infer a span for each of the substrings.

    We infer these spans by walking the text as we consume the
    substrings, skipping over any whitespace (including whitespace
    within the tokens). For this to work, the substring sequence
    must be identical to the text modulo whitespace.

    Spans are relative to the start of the string itself, but can be
    shifted by passing an offset (the start of the original string's
    span). Empty tokens are accepted but have a zero-length span.

    Note: this function is lazy, so you can use it incrementally
    provided you can generate the tokens lazily too.

    You probably want `token_spans` instead; this function is meant
    to be used for similar tasks outside of pos tagging.

    :param txtfn: function to extract text from a token
                  (default None, treated as the identity function)
    """
    txt_iter = ifilterfalse(lambda x: x[1].isspace(), enumerate(text))
    txtfn = txtfn or (lambda x: x)
    last = offset  # for the corner case of empty tokens
    for token in tokens:
        tok_chars = list(ifilterfalse(lambda x: x.isspace(), txtfn(token)))
        if not tok_chars:
            yield Span(last, last)
            continue
        prefix = list(islice(txt_iter, len(tok_chars)))
        if not prefix:
            msg = "Too many tokens (current: %s)" % txtfn(token)
            raise EducePosTagException(msg)
        last = prefix[-1][0] + 1 + offset
        span = Span(prefix[0][0] + offset, last)
        # undo the offset: `text` is the unshifted local string
        pretty_prefix = text[span.char_start - offset:span.char_end - offset]
        # check the text prefix to make sure we have the same
        # non-whitespace characters
        for txt_pair, tok_char in zip(prefix, tok_chars):
            idx, txt_char = txt_pair
            if txt_char != tok_char:
                msg = ("token mismatch at char %d (%s vs %s)\n"
                       % (idx, txt_char, tok_char)
                       + " token: [%s]\n" % token
                       + " text: [%s]" % pretty_prefix)
                raise EducePosTagException(msg)
        yield span
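# Hypothetical usage of generic_token_spans: realigning tokens whose
# internal whitespace differs from the text. Both sides are reduced to
# their non-whitespace characters before matching, so the spans below
# follow directly from the walk.
text = "Pro-  cras-  tination"
tokens = ["Pro- cras-", "tination"]
print(list(generic_token_spans(text, tokens)))
# spans: (0, 11) and (13, 21)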
def _load_rst_wsj_corpus_text_file_wsj(f):
    """Actually do load"""
    text = f.read()
    start = 0
    sent_id = 0
    output_sents = []
    output_paras = []
    for para_id, paragraph in enumerate(text.split(WSJ_SEP_PARA)):
        para_sents = []
        for sentence in paragraph.split(WSJ_SEP_SENT):
            end = start + len(sentence)
            # NEW: remove trailing whitespace
            rws = len(sentence) - len(sentence.rstrip())
            if rws:
                end -= rws
            # end NEW
            if end > start:
                para_sents.append(Sentence(sent_id, Span(start, end)))
                sent_id += 1
            start = end + rws + 1  # +1 for len(WSJ_SEP_SENT)
        output_paras.append(Paragraph(para_id, para_sents))
        output_sents.extend(para_sents)
        start += 2  # whitespace and second newline
    return text, output_sents, output_paras
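# Minimal sketch of the span arithmetic used above, with a toy one-char
# separator standing in for WSJ_SEP_SENT (whose real value is defined
# elsewhere). Trailing whitespace is excluded from each span but still
# counted when advancing to the next sentence.
SEP = "\n"
demo = "One fish.\nTwo fish.  \nRed fish."
start = 0
for chunk in demo.split(SEP):
    end = start + len(chunk)
    end -= len(chunk) - len(chunk.rstrip())  # drop trailing whitespace
    print((start, end), repr(demo[start:end]))
    start += len(chunk) + len(SEP)
# (0, 9) 'One fish.' / (10, 19) 'Two fish.' / (22, 31) 'Red fish.'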
def shift_span(span, updates, stretch_right=False):
    """
    Given a span and an updates tuple, return a Span that is shifted
    over to reflect the updates.

    Parameters
    ----------
    span : Span
    updates : Updates
    stretch_right : boolean, optional
        If True, stretch the right boundary of an annotation that
        butts up against the left of a new annotation. This is
        recommended for annotations that should fully cover a given
        span, like dialogues for documents.

    Returns
    -------
    span : Span

    See also
    --------
    shift_char : for details on how this works
    """
    start = shift_char(span.char_start, updates)
    if stretch_right:
        end = shift_char(span.char_end, updates)
    else:
        # this is to avoid spurious overstretching of the right
        # boundary of an annotation that butts up against the
        # left of a new annotation
        end = 1 + shift_char(span.char_end - 1, updates)
    return Span(start, end)
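# Toy illustration of the stretch_right distinction. shift_char and
# Updates are internals defined elsewhere; the stand-in below shifts
# every position at or beyond 5 rightwards by 3, as if 3 characters
# had been inserted at position 5.
def _toy_shift_char(pos):
    return pos + 3 if pos >= 5 else pos

def _toy_shift_span(start, end, stretch_right=False):
    if stretch_right:
        return _toy_shift_char(start), _toy_shift_char(end)
    return _toy_shift_char(start), 1 + _toy_shift_char(end - 1)

print(_toy_shift_span(2, 5))                      # (2, 5): right edge stays put
print(_toy_shift_span(2, 5, stretch_right=True))  # (2, 8): absorbs the insertion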
def _dialogues_in_turns(corpus, turn1, turn2):
    """
    Given a pair of turns, return ids for the dialogues that are
    covered by the span running from the first turn to the second.
    """
    # grab a document from the set (assumption here is that
    # they are all morally the same doc)
    if not corpus.values():
        sys.exit("No documents selected")
    doc = corpus.values()[0]

    starting_turn = get_turn(turn1, doc)
    ending_turn = get_turn(turn2, doc)
    # there's a bit of fuzz for whitespace before/after the turns
    span = Span(starting_turn.text_span().char_start - 1,
                ending_turn.text_span().char_end + 1)

    def is_in_range(anno):
        """
        True if the annotation is a dialogue that is covered by the
        turns in question
        """
        return is_dialogue(anno) and span.encloses(anno.span)

    return [anno_id_to_tuple(x.local_id())
            for x in doc.annotations() if is_in_range(x)]
class Token(RawToken, Standoff):
    """
    A token with a part of speech tag and some character offsets
    associated with it.
    """
    def __init__(self, tok, span):
        RawToken.__init__(self, tok.word, tok.tag)
        Standoff.__init__(self)
        self.span = span

    def __str__(self):
        return '%s\t%s' % (RawToken.__str__(self), self.span)

    def __unicode__(self):
        return '%s\t%s' % (RawToken.__unicode__(self), self.span)

    # left padding Token
    _lpad_word = '__START__'
    _lpad_tag = '__START__'
    _lpad_span = Span(0, 0)

    @classmethod
    def left_padding(cls):
        "Return a special Token for left padding"
        return Token(RawToken(cls._lpad_word, cls._lpad_tag),
                     cls._lpad_span)
def token_spans(text, tokens, offset=0):
    """
    Given a string and a sequence of RawToken representing tokens
    in that string, infer the span for each token.  Return the
    results as a sequence of Token objects.

    We infer these spans by walking the text as we consume tokens,
    and skipping over any whitespace in between.  For this to work,
    the raw token text must be identical to the text modulo
    whitespace.

    Spans are relative to the start of the string itself, but can be
    shifted by passing an offset (the start of the original string's
    span).
    """
    token_words = [tok.word for tok in tokens]
    spans = generic_token_spans(text, token_words, offset)
    res = [Token(tok, span) for tok, span in zip(tokens, spans)]

    # sanity checks that should be moved to tests
    for orig_tok, new_tok in zip(tokens, res):
        span = Span(new_tok.span.char_start - offset,
                    new_tok.span.char_end - offset)
        snippet = text[span.char_start:span.char_end]
        assert snippet == new_tok.word
        assert orig_tok.word == new_tok.word
        assert orig_tok.tag == new_tok.tag
    return res
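# Hypothetical usage of token_spans, assuming RawToken is constructed as
# RawToken(word, tag) as suggested by the Token class above:
text = "the  cat sat"
toks = [RawToken("the", "DT"), RawToken("cat", "NN"), RawToken("sat", "VB")]
print([tok.span for tok in token_spans(text, toks)])
# spans: (0, 3), (5, 8), (9, 12)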
def approximate_cover(elts, tgt):
    """Return True if elts covers tgt's span.

    This is approximate because we only check that:

    * the first and last elements respectively begin and end at the
      extremities of tgt.span,
    * consecutive elements don't overlap.

    Because of the second item, we assume that elts has been sorted
    by span.

    Parameters
    ----------
    elts : sorted list of Annotation
        Sequence of elements
    tgt : Annotation
        Target annotation

    Returns
    -------
    res : boolean
        True if elts approximately cover tgt.span
    """
    span_seq = Span(elts[0].span.char_start, elts[-1].span.char_end)
    res = (span_eq(span_seq, tgt.text_span(), eps=1)
           and all(elt_cur.overlaps(elt_nxt) is None
                   for elt_cur, elt_nxt in zip(elts[:-1], elts[1:])))
    return res
def _mk_token(ttoken, span):
    """
    Convert a tweaked token and the span it's been aligned with into
    a proper Token object.
    """
    if ttoken.offset != 0:
        span = Span(span.char_start + ttoken.offset, span.char_end)
    return Token(ttoken, span)
def anno(doc, prefix, tspan):
    "pad text segment as needed"
    # NB: `info` comes from the enclosing scope
    prefix_t = ("..."
                if tspan.char_start + len(prefix) < info.span.char_start
                else "")
    myspan = Span(info.span.char_start, tspan.char_end)
    return "".join([prefix, prefix_t, annotate_doc(doc, span=myspan)])
def compute_updates(src_doc, tgt_doc, matches):
    """Return updates that would need to be made on the target document.

    Given matches between the source and target document, return span
    updates along with any source annotations that do not have an
    equivalent in the target document (the latter may indicate that
    resegmentation has taken place, or that there is some kind of
    problem).

    Parameters
    ----------
    src_doc : Document
    tgt_doc : Document
    matches : [Match]

    Returns
    -------
    updates : Updates
    """
    res = Updates()

    # case 2 and 5 (to be pruned below)
    res.expected_src_only.extend(src_doc.units)
    res.abnormal_tgt_only.extend(tgt_doc.units)

    # case 1, 2 and 4
    for src, tgt, size in matches:
        tgt_to_src = src - tgt
        res.shift_if_ge[tgt] = tgt_to_src  # case 1 and 2
        src_annos = enclosed(Span(src, src + size), src_doc.units)
        tgt_annos = enclosed(Span(tgt, tgt + size), tgt_doc.units)
        for src_anno in src_annos:
            res.expected_src_only.remove(src_anno)  # prune from case 5
            src_span = src_anno.text_span()
            tgt_equiv = [x for x in tgt_annos
                         if x.text_span().shift(tgt_to_src) == src_span]
            if not tgt_equiv:  # case 4
                res.abnormal_src_only.append(src_anno)
            for tgt_anno in tgt_equiv:  # prune from case 2
                if tgt_anno in res.abnormal_tgt_only:
                    res.abnormal_tgt_only.remove(tgt_anno)

    return res
def __init__(self, node, children, origin=None):
    SearchableTree.__init__(self, node, children)
    Standoff.__init__(self, origin)
    if not children:
        raise Exception("Can't create a tree with no children")
    self.children = children
    start = min(x.span.char_start for x in children)
    end = max(x.span.char_end for x in children)
    self.span = Span(start, end)
def __init__(self, node, children, link, origin=None):
    SearchableTree.__init__(self, node, children)
    Standoff.__init__(self, origin)
    # copy so that appending the label doesn't mutate the caller's list
    nodes = list(children)
    if not self.is_root():
        nodes.append(self.label())
    start = min(x.span.char_start for x in nodes)
    end = max(x.span.char_end for x in nodes)
    self.link = link
    self.span = Span(start, end)
    self.origin = origin
def main(args):
    """Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`.
    """
    output_dir = get_output_dir(args, default_overwrite=True)
    # locate insertion site: target document
    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)
    # TODO mark units with FIXME, optionally delete in/out relations

    span = args.span
    sub_text = args.sub_text
    minor = args.minor
    # store before/after annos
    annos_before = []
    annos_after = []
    for tgt_k, tgt_doc in tgt_corpus.items():
        annos_before.append(annotate_doc(tgt_doc, span=span))
        # process
        new_tgt_doc = replace_text_at_span(tgt_doc, span, sub_text,
                                           minor=minor)
        # WIP new_span, depends on the offset
        offset = len(sub_text) - (span.char_end - span.char_start)
        new_span = Span(span.char_start, span.char_end + offset)
        # end WIP
        annos_after.append(annotate_doc(new_tgt_doc, span=new_span))
        # show diff and save doc
        diffs = ["======= REPLACE TEXT IN %s ========" % tgt_k,
                 show_diff(tgt_doc, new_tgt_doc)]
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)
    announce_output_dir(output_dir)

    # commit message
    tgt_k, tgt_doc = list(tgt_corpus.items())[0]
    anno_str_before = annos_before[0]
    anno_str_after = annos_after[0]
    if tgt_k and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(tgt_k, anno_str_before, anno_str_after))
def shift_anno(anno, offset, point):
    """Get a shifted copy of an annotation"""
    anno2 = copy.deepcopy(anno)
    if not isinstance(anno, Unit):
        return anno2
    anno_span = anno2.text_span()
    if anno_span.char_start >= point:
        # if the annotation is entirely after the deletion site,
        # shift the whole span
        anno2.span = anno_span.shift(offset)
    elif anno_span.char_end >= point:
        # if the annotation straddles the deletion site,
        # stretch (shift its end)
        anno2.span = Span(anno_span.char_start,
                          anno_span.char_end + offset)
    return anno2
def __init__(self, t, offset, origin=None):
    """
    Parameters
    ----------
    t : dict
        Token from corenlp's XML output.
    offset : int
        Offset from the span of the corenlp token to the document.
    origin : FileId, optional
        Identifier for the document.
    """
    extent = t['extent']
    word = t['word']
    tag = t['POS']
    span = Span(extent[0], extent[1] + 1).shift(offset)
    postag.Token.__init__(self, postag.RawToken(word, tag), span)
    self.features = copy.copy(t)
    for k in ['s_id', 'word', 'extent', 'POS']:
        del self.features[k]
def shift_anno(anno, span_old, text_new):
    """Get a shifted copy of an annotation"""
    anno2 = copy.deepcopy(anno)
    if not isinstance(anno, Unit):
        return anno2
    offset = len(text_new) - (span_old.char_end - span_old.char_start)
    anno_span = anno2.text_span()
    if anno_span.char_start >= span_old.char_end:
        # if the annotation is entirely after the substitution site,
        # shift the whole span
        anno2.span = anno_span.shift(offset)
    elif anno_span.char_end >= span_old.char_end:
        # if the annotation straddles the substitution site,
        # stretch (shift its end)
        anno2.span = Span(anno_span.char_start,
                          anno_span.char_end + offset)
    return anno2
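# Worked example of the offset arithmetic in the shift_anno variants
# above, with bare (start, end) pairs: replacing "cat" at (10, 13) with
# "kitten" gives offset = 6 - 3 = +3.
def _toy_shift(span, span_old, text_new):
    start, end = span
    offset = len(text_new) - (span_old[1] - span_old[0])
    if start >= span_old[1]:
        return (start + offset, end + offset)  # entirely after: shift
    elif end >= span_old[1]:
        return (start, end + offset)           # straddles: stretch the end
    return span                                # entirely before: unchanged

print(_toy_shift((20, 25), (10, 13), "kitten"))  # (23, 28)
print(_toy_shift((8, 14), (10, 13), "kitten"))   # (8, 17)
print(_toy_shift((0, 5), (10, 13), "kitten"))    # (0, 5)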
def _parse_edu(descr, edu_start, start=0):
    """
    Parse an RST DT leaf string
    """
    sdesc = descr.strip()
    if sdesc.startswith("<s>"):
        sdesc = sdesc[3:]
    if sdesc.endswith("</s>"):
        sdesc = sdesc[:-4]

    if sdesc.startswith("<EDU>") and sdesc.endswith("</EDU>"):
        text = sdesc[5:-6]  # remove <EDU></EDU> marks
    elif sdesc.startswith("_!") and sdesc.endswith("_!"):
        text = sdesc[2:-2]
    else:
        text = sdesc

    end = start + len(text)
    span = Span(start, end)  # text span (not the same as the EDU span)
    return EDU(edu_start, span, text)
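# Example input in the "_!" leaf convention handled by _parse_edu
# (values illustrative; assumes the EDU constructor records its text
# span as .span):
edu = _parse_edu("_!Maybe so._!", edu_start=3, start=40)
print(edu.span)  # the 9 characters of "Maybe so." from 40, i.e. (40, 49)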
def syntactic_node_seq(ptree, tokens):
    """Find the sequence of syntactic nodes covering a sequence of tokens.

    Parameters
    ----------
    ptree : `nltk.tree.Tree`
        Syntactic tree.
    tokens : sequence of `Token`
        Sequence of tokens under scrutiny.

    Returns
    -------
    syn_nodes : list of `nltk.tree.Tree`
        Spanning sequence of nodes of the syntactic tree.
    """
    txt_span = Span(tokens[0].text_span().char_start,
                    tokens[-1].text_span().char_end)
    for node in ptree.subtrees(lambda t: t.text_span().encloses(txt_span)):
        # * spanning node
        if node.text_span() == txt_span:
            return [node]
        # * otherwise: spanning subsequence of kid nodes
        txt_span_start = txt_span.char_start
        txt_span_end = txt_span.char_end
        kids_start = [x.text_span().char_start for x in node]
        kids_end = [x.text_span().char_end for x in node]
        try:
            idx_left = kids_start.index(txt_span_start)
        except ValueError:
            continue
        try:
            idx_right = kids_end.index(txt_span_end)
        except ValueError:
            continue
        if idx_left == idx_right:
            continue
        return [x for x in node[idx_left:idx_right + 1]]
    return []
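# The kid-selection step of syntactic_node_seq, with bare numbers:
# children covering (0, 3), (3, 7), (7, 9), (9, 12) and a token span of
# (3, 9) select children 1 through 2 via the start/end index lookups.
kids_start = [0, 3, 7, 9]
kids_end = [3, 7, 9, 12]
idx_left = kids_start.index(3)   # 1
idx_right = kids_end.index(9)    # 2
print(list(range(4))[idx_left:idx_right + 1])  # [1, 2]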
def fill(self, current, edu1, edu2, target=None):
    """Fill the vector (or `target`) with feature values for the
    EDU pair"""
    vec = self if target is None else target
    doc = current.doc
    big_span = edu1.text_span().merge(edu2.text_span())

    # spans for the turns that come between the two edus
    turns_between_span = Span(edu1.turn.text_span().char_end,
                              edu2.turn.text_span().char_start)
    turns_between = turns_in_span(doc, turns_between_span)
    inner_edus = edus_in_span(doc, big_span)
    if edu1.identifier() != ROOT:  # not present anyway
        inner_edus.remove(edu1)
    if edu2.identifier() != ROOT:
        inner_edus.remove(edu2)

    gap = EduGap(inner_edus=inner_edus,
                 turns_between=turns_between,
                 sf_cache=self.sf_cache)
    for key in self.keys:
        vec[key.name] = key.function(current, gap, edu1, edu2)
class Sentence(Standoff):
    """
    Just a text span really
    """
    def __init__(self, num, span):
        super(Sentence, self).__init__()
        self.span = span
        self.num = num  # sentence ID in document

    def text_span(self):
        return self.span

    # left padding
    _lpad_num = -1
    _lpad_span = Span(0, 0)

    @classmethod
    def left_padding(cls):
        """Return a left padding Sentence"""
        return cls(cls._lpad_num, cls._lpad_span)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`.
    """
    _screen_args(args)
    corpus = read_corpus(args, verbose=True)
    output_dir = get_output_dir(args, default_overwrite=True)
    old_span = args.span
    new_span = Span(old_span.char_start + args.nudge_start,
                    old_span.char_end + args.nudge_end)
    for k in corpus:
        old_doc = corpus[k]
        new_doc = copy.deepcopy(old_doc)
        found = False
        for anno in new_doc.units:
            if anno.span == old_span:
                anno.span = copy.deepcopy(new_span)
                found = True
        if found:
            diffs = _mini_diff(k, (old_doc, old_span), (new_doc, new_span))
            print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        else:
            print("WARNING: No annotations found for %s in %s"
                  % (old_span, k), file=sys.stderr)
        save_document(output_dir, k, new_doc)

    # for commit message generation
    span = old_span.merge(new_span)
    commit_info = CommitInfo(key=k, before=old_doc, after=new_doc, span=span)
    if commit_info and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(commit_info))
    announce_output_dir(output_dir)
def _load_rst_wsj_corpus_text_file_file(f):
    """Actually do load"""
    text = f.read()
    start = 0
    sent_id = 0
    output_sents = []
    output_paras = None  # paragraph markings are missing from these documents
    for sentence in text.split(FIL_SEP_SENT):
        end = start + len(sentence)
        # NEW: remove leading whitespace
        lws = len(sentence) - len(sentence.lstrip())
        if lws:
            start += lws
        # end NEW
        if end > start:
            output_sents.append(Sentence(sent_id, Span(start, end)))
            sent_id += 1
        start = end + 3  # +3 for len(FIL_SEP_SENT)
    # TODO remove trailing '\n' of last sentence
    return text, output_sents, output_paras