def _tokenize(self, text):
    # Pre-clean
    text = text.strip()

    # Apply pre-processors
    for pp in self.pre_processor_funcs:
        log.debug("pre-processing: %s", pp)
        text = pp(text)

    if _len(text) <= self.GOOGLE_TTS_MAX_CHARS:
        return _clean_tokens([text])

    # Tokenize
    log.debug("tokenizing: %s", self.tokenizer_func)
    tokens = self.tokenizer_func(text)

    # Clean
    tokens = _clean_tokens(tokens)

    # Minimize
    min_tokens = []
    for t in tokens:
        min_tokens += _minimize(t, ' ', self.GOOGLE_TTS_MAX_CHARS)

    # Filter empty tokens, post-minimize
    min_tokens = [t for t in min_tokens if t]

    return min_tokens
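# `_tokenize` delegates the size-capping step to `_minimize`, whose source is
# not shown here. The tests below pin down its contract, so what follows is a
# minimal sketch consistent with those tests (the real gtts.utils
# implementation may differ in details): strip a leading delimiter, then
# recursively split at the highest delimiter index within the size cap,
# hard-cutting at `max_size` when no delimiter is found in the window.
def _minimize(the_string, delim, max_size):
    # Drop a leading delimiter so no chunk starts with it (this also
    # prevents infinite recursion on the_string[0:0]).
    if the_string.startswith(delim):
        the_string = the_string[len(delim):]

    if len(the_string) > max_size:
        try:
            # Cut on the highest delimiter index within the cap.
            idx = the_string.rindex(delim, 0, max_size)
        except ValueError:
            # No delimiter in the window: cut arbitrarily at max_size.
            idx = max_size
        return [the_string[:idx]] + _minimize(the_string[idx:], delim, max_size)
    return [the_string]


# Module-level fixtures assumed by the test functions below; the values
# (a single space, 10) are inferred from the expected outputs.
delim = " "
Lmax = 10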
def test_ascii():
    _in = "Bacon ipsum dolor sit amet"
    _out = ["Bacon", "ipsum", "dolor sit", "amet"]
    assert _minimize(_in, delim, Lmax) == _out


def test_startwith_delim():
    _in = delim + "test"
    _out = ["test"]
    assert _minimize(_in, delim, Lmax) == _out


def test_unicode():
    _in = u"这是一个三岁的小孩在讲述他从一系列照片里看到的东西。"
    _out = [u"这是一个三岁的小孩在", u"讲述他从一系列照片里", u"看到的东西。"]
    assert _minimize(_in, delim, Lmax) == _out


def test_ascii_no_delim():
    _in = "Baconipsumdolorsitametflankcornedbee"
    _out = ["Baconipsum", "dolorsitam", "etflankcor", "nedbee"]
    assert _minimize(_in, delim, Lmax) == _out
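# Note the design choice the last two tests verify: when no delimiter occurs
# within the cap (run-on ASCII, or CJK text without spaces), `_minimize`
# falls back to a hard cut at `max_size` instead of raising. A quick sanity
# run of the sketch above (hypothetical driver, not part of the gTTS suite):
if __name__ == "__main__":
    print(_minimize("Bacon ipsum dolor sit amet", delim, Lmax))
    # ['Bacon', 'ipsum', 'dolor sit', 'amet']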