def tokenize(self, text):
    """Tokenize a chunk of text.

    Pulled mostly verbatim from the SpamBayes code.
    """
    maxword = 20
    # Expand numeric character entities (e.g. "&#97;" for the letter 'a').
    text = numeric_entity_re.sub(numeric_entity_replacer, text)
    # Let each cracker pull structured tokens (URLs, ...) out of the text;
    # a cracker returns the possibly-rewritten text plus extracted tokens.
    for cracker in (crack_urls,):
        text, extracted = cracker(text)
        for token in extracted:
            yield token
    # Breaking tags (<br>, <p>) act as whitespace; every other HTML/XML
    # tag is stripped entirely.
    text = breaking_entity_re.sub(' ', text)
    text = html_re.sub('', text)
    # Split on whitespace; hand overlong words to tokenize_word() and
    # silently drop anything shorter than three characters.
    for word in text.split():
        length = len(word)
        if length < 3:
            continue
        if length <= maxword:
            yield word
        else:
            for token in tokenize_word(word):
                yield token
def tokenize(self, text):
    """Tokenize a chunk of text.

    Pulled mostly verbatim from the SpamBayes code.
    """
    maxword = 20

    # Replace numeric character entities (like &#97; for the letter 'a').
    text = numeric_entity_re.sub(numeric_entity_replacer, text)

    # Crack open URLs and extract useful bits of marrow.  Each cracker
    # returns the (possibly rewritten) text plus any tokens it found.
    for cracker in (crack_urls,):
        text, found = cracker(text)
        for tok in found:
            yield tok

    # <br> and <p> break text in a browser, so turn them into spaces...
    text = breaking_entity_re.sub(' ', text)
    # ...but eliminate all other HTML/XML tags outright rather than
    # blanking them; replacing with a blank would let tricks like
    #     Wr<!$FS|i|R3$s80sA >inkle Reduc<!$FS|i|R3$s80sA >tion
    # disguise words.  <br>/<p> were special-cased above because browsers
    # break text on them, so they can't be used to hide words effectively.
    text = html_re.sub('', text)

    # Tokenize everything in the body.  Keep this length range in sync
    # with the one used in tokenize_word().
    for tok in text.split():
        size = len(tok)
        if size < 3:
            continue
        elif size > maxword:
            for sub in tokenize_word(tok):
                yield sub
        else:
            yield tok
def tokenize(self, text):
    """Tokenize a chunk of text.

    Pulled mostly verbatim from the SpamBayes code.
    """
    maxword = 20
    # Numeric character entities (&#97; == 'a') become their characters.
    text = numeric_entity_re.sub(numeric_entity_replacer, text)
    # Crackers mine the text for structured tokens (URLs) and may
    # rewrite the text as they go.
    crackers = (crack_urls,)
    for crack in crackers:
        text, mined = crack(text)
        for piece in mined:
            yield piece
    # Browsers break text on <br>/<p>, so those become spaces; all other
    # HTML/XML tags are deleted (not blanked) so that tags split across a
    # word can't be used to disguise it.
    text = breaking_entity_re.sub(' ', text)
    text = html_re.sub('', text)
    # Emit words of acceptable length; defer long ones to tokenize_word().
    # NOTE: this 3..maxword range must match the one in tokenize_word().
    for chunk in text.split():
        n = len(chunk)
        if 3 <= n <= maxword:
            yield chunk
        elif n > maxword:
            for sub in tokenize_word(chunk):
                yield sub