def tokenize(self, doc):
    """Tokenize the document text using the PTB gold annotation.

    Return a tokenized document.
    """
    # get tokens from PTB
    ptb_name = _guess_ptb_name(doc.key)
    if ptb_name is None:
        return doc

    # get doc text
    # here we cheat and get it from the RST-DT tree
    # was: rst_text = doc.orig_rsttree.text()
    rst_text = doc.text
    tagged_tokens = self.reader.tagged_words(ptb_name)
    # tweak tokens THEN filter empty nodes
    tweaked1, tweaked2 = \
        itertools.tee(_tweak_token(ptb_name)(i, tok)
                      for i, tok in enumerate(tagged_tokens)
                      if not is_empty_category(tok[1]))
    spans = generic_token_spans(rst_text, tweaked1,
                                txtfn=lambda x: x.tweaked_word)
    result = [_mk_token(t, s) for t, s in izip(tweaked2, spans)]

    # store in doc
    doc.tkd_tokens.extend(result)
    return doc
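# Why itertools.tee above: the tweaked tokens come from a one-shot
# generator but are needed twice -- once for generic_token_spans to
# compute character spans, once to pair each token with its span
# (izip is Python 2's lazy zip from itertools). A minimal,
# self-contained sketch of that consume-twice pattern; the demo
# function below is hypothetical, not part of this module:
def _demo_tee_pattern():
    import itertools
    gen = (word.upper() for word in ["a", "bb", "ccc"])  # one-shot generator
    first, second = itertools.tee(gen)
    lengths = [len(word) for word in first]   # first consumer
    paired = list(zip(second, lengths))       # second consumer
    assert paired == [('A', 1), ('BB', 2), ('CCC', 3)]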
def _tweak_token(ptb_name):
    """Return a function that normalises a PTB token to match the
    RST-DT text; `ptb_name` is captured by the inner function, which
    matches the call sites above (``_tweak_token(ptb_name)(i, tok)``).
    """
    def _norm(toknum, tagged_token):
        "tweak a token to match RST_DT text"
        word, tag = tagged_token
        if (ptb_name, toknum) in _PTB_SUBSTS:
            prefix, tweak = _PTB_SUBSTS[(ptb_name, toknum)]
            return TweakedToken(word, tag, tweak, prefix)
        elif is_empty_category(tag) and is_nonword_token(word):
            return TweakedToken(word, tag, "")

        tweak = PTB_TO_TEXT.get(word, word)
        tweak = slash_re.sub('/', tweak)
        tweak = star_re.sub('*', tweak)
        tweak = None if tweak == word else tweak
        return TweakedToken(word, tag, tweak)

    return _norm
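# The PTB_TO_TEXT lookup above undoes the PTB's token escapes so that
# tokens can be matched against the raw RST-DT text. An illustrative
# subset of such a mapping (the real table lives elsewhere in this
# module and may be larger; the demo function is hypothetical):
def _demo_ptb_to_text():
    ptb_to_text_demo = {
        "``": '"',     # PTB opening double quote
        "''": '"',     # PTB closing double quote
        "-LRB-": "(",  # PTB escape for a left round bracket
        "-RRB-": ")",  # PTB escape for a right round bracket
    }
    for word in ["``", "-LRB-", "cat"]:
        surface = ptb_to_text_demo.get(word, word)  # unknown words pass through
        print(word, "->", surface)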
def tokenize(self, doc):
    """Tokenize the document text using the PTB gold annotation.

    Parameters
    ----------
    doc: DocumentPlus
        Rich representation of the document.

    Returns
    -------
    doc: DocumentPlus
        Rich representation of the document, with tokenization.
    """
    # get tokens from PTB
    ptb_name = _guess_ptb_name(doc.key)
    if ptb_name is None:
        return doc

    # get doc text
    # here we cheat and get it from the RST-DT tree
    # was: rst_text = doc.orig_rsttree.text()
    rst_text = doc.text
    tagged_tokens = self.reader.tagged_words(ptb_name)
    # tweak tokens THEN filter empty nodes
    tweaked1, tweaked2 = \
        itertools.tee(_tweak_token(ptb_name)(i, tok)
                      for i, tok in enumerate(tagged_tokens)
                      if not is_empty_category(tok[1]))
    spans = generic_token_spans(rst_text, tweaked1,
                                txtfn=lambda x: x.tweaked_word)
    result = [_mk_token(t, s) for t, s in izip(tweaked2, spans)]

    # store in doc
    doc.set_tokens(result)
    return doc
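# generic_token_spans matches each token's surface form (here the
# tweaked_word) against the raw text and yields character spans. A
# self-contained sketch of that left-to-right alignment idea -- an
# illustrative stand-in, not educe's implementation:
def _demo_align_spans(text, token_strings):
    """Yield (start, end) character offsets for each token string."""
    offset = 0
    for tok in token_strings:
        start = text.index(tok, offset)  # ValueError if token not in text
        end = start + len(tok)
        yield (start, end)
        offset = end

# e.g. list(_demo_align_spans("Mr. Smith smiled.",
#                             ["Mr.", "Smith", "smiled", "."]))
# -> [(0, 3), (4, 9), (10, 16), (16, 17)]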