Example #1
    def lang_independent_sub(self, text):
        """Performs the language-independent string substitutions."""
        # The order of the regexes is unusual; it would be better to
        # unescape after STRIP_EOL_HYPHEN, but this stays close to the
        # original NIST implementation.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        # In the NLTK source, xml_unescape comes from nltk.tokenize.util.
        text = xml_unescape(text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        return text
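
A minimal usage sketch, assuming this method belongs to NLTK's
NISTTokenizer (nltk.tokenize.nist) and that the perluniprops corpus
the tokenizer depends on has been downloaded:

    import nltk
    nltk.download('perluniprops')  # data package used by NISTTokenizer

    from nltk.tokenize.nist import NISTTokenizer

    nist = NISTTokenizer()
    # The '<skipped>' tag is stripped and the XML entity '&amp;' is
    # unescaped to '&', per the substitutions above.
    print(nist.lang_independent_sub('the <skipped> quick &amp; brown fox'))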
Example #2
    def international_tokenize(self, text, lowercase=False,
                               split_non_ascii=True,
                               return_str=False):
        # text_type: py2/py3 unicode alias (six.text_type in the NLTK source).
        text = text_type(text)
        # Unlike the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied
        # before unescaping.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)

        if lowercase:
            text = text.lower()

        for regexp, substitution in self.INTERNATIONAL_REGEXES:
            text = regexp.sub(substitution, text)

        # Make sure there is only one space between words, and strip
        # leading and trailing spaces.
        text = ' '.join(text.strip().split())
        return text if return_str else text.split()
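
A short usage sketch with the same assumptions as above (NLTK's
NISTTokenizer with the perluniprops corpus installed); only parameters
shown in the snippet itself are used:

    from nltk.tokenize.nist import NISTTokenizer

    nist = NISTTokenizer()
    text = u'Doch &amp; dennoch -- ein Beispiel.'
    # Returns a list of tokens by default; return_str=True yields the
    # normalized, space-joined string instead.
    print(nist.international_tokenize(text, lowercase=True))
    print(nist.international_tokenize(text, return_str=True))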