def _tokenize(self, text):
    # Pre-clean
    text = text.strip()

    # Apply pre-processors
    for pp in self.pre_processor_funcs:
        log.debug("pre-processing: %s", pp)
        text = pp(text)

    # Short-circuit: text that already fits the API limit needs no tokenizing
    if _len(text) <= self.GOOGLE_TTS_MAX_CHARS:
        return _clean_tokens([text])

    # Tokenize
    log.debug("tokenizing: %s", self.tokenizer_func)
    tokens = self.tokenizer_func(text)

    # Clean
    tokens = _clean_tokens(tokens)

    # Minimize: split any token longer than the API limit on spaces
    min_tokens = []
    for t in tokens:
        min_tokens += _minimize(t, ' ', self.GOOGLE_TTS_MAX_CHARS)

    # Filter empty tokens, post-minimize
    # (assign back to min_tokens; the original assigned to a discarded
    # `tokens` variable and returned the unfiltered list)
    min_tokens = [t for t in min_tokens if t]

    return min_tokens
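
# A minimal sketch of the helper the "Minimize" step above relies on,
# assuming _minimize recursively splits an over-long token on the last
# delimiter that still fits within max_size. Illustrative only, not the
# module's actual implementation; plain len() stands in for the _len()
# helper used by _tokenize above.
def _minimize(the_string, delim, max_size):
    # Drop a leading delimiter left over from a previous split
    if the_string.startswith(delim):
        the_string = the_string[len(delim):]

    if len(the_string) > max_size:
        try:
            # Split at the last delimiter that fits within max_size
            idx = the_string.rindex(delim, 0, max_size)
        except ValueError:
            # No delimiter in range: hard-cut at max_size
            idx = max_size
        return [the_string[:idx]] + _minimize(the_string[idx:], delim, max_size)
    return [the_string]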

def test_strip():
    _in = [" Bacon ", "& ", "ipsum\r", "."]
    _out = ["Bacon", "&", "ipsum"]
    assert _clean_tokens(_in) == _out


def test_only_space_and_punc():
    _in = [",(:)?", "\t ", "\n"]
    _out = []
    assert _clean_tokens(_in) == _out
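
# A minimal sketch consistent with the two tests above, assuming
# _clean_tokens strips surrounding whitespace and drops tokens made up
# entirely of whitespace and/or sentence punctuation. PUNC is an
# illustrative set (it must exclude "&" for test_strip to pass); the
# real module defines its own punctuation constant.
PUNC = "?!.,¡()[]¿…;:—"

def _clean_tokens(tokens):
    # Keep a token only if something remains after trimming punctuation
    # and whitespace from both ends; return it whitespace-stripped.
    return [t.strip() for t in tokens if t.strip(PUNC + " \t\r\n")]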