Example #1
    def tokenize(self, text):
        """Tokenize a chunk of text.

        Pulled mostly verbatim from the SpamBayes code.
        """
        maxword = 20
        text = numeric_entity_re.sub(numeric_entity_replacer, text)
        for cracker in (crack_urls,):
            text, tokens = cracker(text)
            for t in tokens:
                yield t
        text = breaking_entity_re.sub(' ', text)
        text = html_re.sub('', text)
        for w in text.split():
            n = len(w)
            if 3 <= n <= maxword:
                yield w
            elif n >= 3:
                for t in tokenize_word(w):
                    yield t
Example #2
    def tokenize(self, text):
        """Tokenize a chunk of text.

        Pulled mostly verbatim from the SpamBayes code.
        """

        maxword = 20
        # Replace numeric character entities (like &#97; for the letter
        # 'a').
        text = numeric_entity_re.sub(numeric_entity_replacer, text)

        # Crack open URLs and extract useful bits of marrow...
        for cracker in (crack_urls,):
            text, tokens = cracker(text)
            for t in tokens:
                yield t

        # Remove HTML/XML tags.  Also &nbsp;.  <br> and <p> tags should
        # create a space too.
        text = breaking_entity_re.sub(' ', text)
        # It's important to eliminate HTML tags rather than, e.g.,
        # replace them with a blank (as this code used to do), else
        # simple tricks like
        #    Wr<!$FS|i|R3$s80sA >inkle Reduc<!$FS|i|R3$s80sA >tion
        # can be used to disguise words.  <br> and <p> were special-
        # cased just above (because browsers break text on those,
        # they can't be used to hide words effectively).
        text = html_re.sub('', text)

        # Tokenize everything in the body.
        for w in text.split():
            n = len(w)
            # Make sure this range matches in tokenize_word().
            if 3 <= n <= maxword:
                yield w

            elif n >= 3:
                for t in tokenize_word(w):
                    yield t
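
For a quick smoke test of the anti-disguise behavior the comments describe, here is a hypothetical driver. It assumes the method above is bound to a Tokenizer class (not shown in the excerpt) and that the stand-in helpers sketched under Example #1 are in scope; the expected output holds only under those stand-ins.

    # Hypothetical driver for the tokenize() method above.
    sample = ('Wr<!$FS|i|R3$s80sA >inkle Reduc<!$FS|i|R3$s80sA >tion<br>'
              'visit http://example.com/now')
    print(list(Tokenizer().tokenize(sample)))
    # With the stand-in helpers this prints:
    # ['proto:http', 'url:example', 'url:com', 'Wrinkle', 'Reduction', 'visit']

Deleting tags outright (rather than replacing them with a blank) is what lets the two halves of 'Wrinkle' rejoin into a single token, which is exactly the evasion the comment block warns about; <br> still produces a word break because breaking_entity_re runs first and turns it into a space.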