def tokenize_to_words(self, text):
    """Get a sequence of strings corresponding to the words in a text.

    Sentence boundaries are only implicitly preserved (e.g. through
    punctuation). Note that words are returned as tokenized, without
    any case normalization.
    """
    text = unicode_utils.to_unicode(text)
    words = self.word_tokenizer.tokenize(text)
    return words
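# A hedged usage sketch, not part of the class: assuming `tokenizer` is an
# instance whose `word_tokenizer` behaves like NLTK's TreebankWordTokenizer,
# something like the following is expected:
#
#     tokenizer.tokenize_to_words("Hello, world!")
#     # -> ['Hello', ',', 'world', '!']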
def replace(self, text):
    """Replaces all matches for the patterns in
    :py:attr:`compiled_patterns` with their corresponding replacements.

    :param text: The text that will be scanned for replacements.
    """
    text = unicode_utils.to_unicode(text)
    for pattern, repl in self.compiled_patterns:
        # Patterns are already compiled, so call sub() on them directly.
        text = pattern.sub(repl, text)
    return text
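# A hedged usage sketch: assuming `compiled_patterns` holds (compiled regex,
# replacement) pairs such as [(re.compile(r'\s+'), ' ')], one would expect:
#
#     tokenizer.replace("too   much\twhitespace")
#     # -> 'too much whitespace'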
def tokenize_text(self, text):
    """Returns a sequence of sequences: each sentence in a text, and
    then each word within each sentence.
    """
    text = unicode_utils.to_unicode(text)
    sents = self.sent_tokenizer.tokenize(text)
    word_tokenized_sents = (
        self.word_tokenizer.tokenize(sent) for sent in sents
    )
    return word_tokenized_sents
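# A hedged usage sketch: the return value is a generator, so it is consumed
# on first iteration; materialize it if it is needed more than once:
#
#     sents = [list(words) for words in tokenizer.tokenize_text(text)]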
def get_word_count(self, text):
    """Gets the word count for a text by tokenizing it and returning
    the length of the resulting sequence.

    *Note*: This is an expensive operation that should not be called
    if we already have a tokenized version of the text.
    """
    text = unicode_utils.to_unicode(text)
    words = self.tokenize_to_words(text)
    return len(list(words))
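# A hedged usage note: per the docstring above, when a tokenized sequence is
# already at hand, counting it directly avoids re-tokenizing, e.g.
#
#     count = len(list(words))   # instead of tokenizer.get_word_count(text)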
def tokenize_to_sentences(self, text):
    """Returns a sequence containing each sentence in a text, where
    each sentence is itself a sequence of its words, converted to
    lowercase and stripped of surrounding whitespace.
    """
    text = unicode_utils.to_unicode(text)
    word_tokenized_sents = self.tokenize_text(text)
    lowercase_sents = (
        (word.lower().strip() for word in words)
        for words in word_tokenized_sents
    )
    return lowercase_sents
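# A hedged usage sketch: both levels are generators, so flatten them to
# inspect the normalized words; with the tokenizers assumed above, a text
# like "Hello, world! Goodbye." might yield:
#
#     [list(words) for words in tokenizer.tokenize_to_sentences(text)]
#     # -> [['hello', ',', 'world', '!'], ['goodbye', '.']]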