def lang_independent_sub(self, text):
    """Perform the language-independent string substitutions.

    The regex order looks odd: it would be cleaner to XML-unescape
    *after* STRIP_EOL_HYPHEN, but this deliberately mirrors the
    original NIST implementation, so the order is:
    STRIP_SKIP -> unescape -> STRIP_EOL_HYPHEN.

    :param text: the input text to normalize.
    :return: the text after all three substitution steps.
    """
    skip_re, skip_repl = self.STRIP_SKIP
    text = skip_re.sub(skip_repl, text)
    text = xml_unescape(text)
    hyphen_re, hyphen_repl = self.STRIP_EOL_HYPHEN
    text = hyphen_re.sub(hyphen_repl, text)
    return text
def international_tokenize(
    self, text, lowercase=False, split_non_ascii=True, return_str=False
):
    """Tokenize *text* with the international (Unicode-aware) regexes.

    Unlike the 'normal' ``tokenize()``, STRIP_EOL_HYPHEN is applied
    *before* XML-unescaping here.

    :param text: the input text; coerced to a text string first.
    :param lowercase: if True, lowercase before applying the regexes.
    :param split_non_ascii: accepted for interface compatibility; not
        consulted in this body — presumably used elsewhere in the
        class (TODO confirm against the full source).
    :param return_str: if True, return the normalized string itself;
        otherwise return the list of whitespace-split tokens.
    :return: a string or a list of tokens, depending on *return_str*.
    """
    text = text_type(text)

    # Strip <skipped> markers, then end-of-line hyphens, THEN unescape.
    for pattern, replacement in (self.STRIP_SKIP, self.STRIP_EOL_HYPHEN):
        text = pattern.sub(replacement, text)
    text = xml_unescape(text)

    if lowercase:
        text = text.lower()

    for pattern, replacement in self.INTERNATIONAL_REGEXES:
        text = pattern.sub(replacement, text)

    # Collapse runs of whitespace to single spaces and trim both ends
    # (no-arg str.split already discards leading/trailing whitespace).
    text = ' '.join(text.split())
    return text if return_str else text.split()