def _transform(self, document):
    if self.raw:
        # Raw mode: join the document pieces as-is, then split into sentences
        return [
            word_tokenize(s)
            for s in sentence_tokenize(self._join(document))
        ]
    else:
        # Cleaned mode: normalize the document before sentence splitting
        return [
            word_tokenize(s)
            for s in sentence_tokenize(self._clean(document))
        ]
def _transform(self, document):
    # Keep only the lines that fall under the section matched by self.section_regex
    lines_from_section = section_extract(self.section_regex, document['description'])
    return [
        word_tokenize(clean_str(strip_bullets_from_line(line.text)))
        for line in lines_from_section
    ]
def word_tokenizer_gen(sent_gen):
    # Lazily word-tokenize each sentence from the incoming sentence generator
    for sent in sent_gen:
        yield word_tokenize(sent)
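A minimal usage sketch of the generator, assuming NLTK's word_tokenize and using nltk's sent_tokenize as a stand-in for the sentence splitter used above; sentence_gen and the sample text are illustrative only:

# Usage sketch only; sent_tokenize stands in for the codebase's own sentence splitter.
from nltk.tokenize import sent_tokenize, word_tokenize  # requires NLTK punkt data

def sentence_gen(text):
    # Yield sentences lazily so downstream tokenization stays streaming
    for sent in sent_tokenize(text):
        yield sent

sample = "Tokenizers split text. Each sentence becomes a list of words."
for tokens in word_tokenizer_gen(sentence_gen(sample)):
    print(tokens)
# ['Tokenizers', 'split', 'text', '.']
# ['Each', 'sentence', 'becomes', 'a', 'list', 'of', 'words', '.']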