Пример #1
0
    def _tokenize(self, text):
        """Pre-process ``text`` and split it into tokens.

        The text is stripped, then passed through every callable in
        ``self.pre_processor_funcs`` in order. If the resulting text is not
        longer (as measured by ``_len``) than ``self.GOOGLE_TTS_MAX_CHARS``,
        it is returned as a single cleaned token. Otherwise it is split with
        ``self.tokenizer_func``, cleaned with ``_clean_tokens``, and each
        token is passed to ``_minimize`` with a space delimiter and
        ``self.GOOGLE_TTS_MAX_CHARS`` as the size argument.

        Args:
            text: The string to tokenize.

        Returns:
            For short text, whatever ``_clean_tokens([text])`` returns;
            otherwise a list of the non-empty tokens produced by
            ``_minimize``.
        """
        # Pre-clean
        text = text.strip()

        # Apply pre-processors
        for pp in self.pre_processor_funcs:
            log.debug("pre-processing: %s", pp)
            text = pp(text)

        # Short text needs no splitting at all.
        if _len(text) <= self.GOOGLE_TTS_MAX_CHARS:
            return _clean_tokens([text])

        # Tokenize
        log.debug("tokenizing: %s", self.tokenizer_func)
        tokens = self.tokenizer_func(text)

        # Clean
        tokens = _clean_tokens(tokens)

        # Minimize
        min_tokens = []
        for t in tokens:
            min_tokens.extend(_minimize(t, ' ', self.GOOGLE_TTS_MAX_CHARS))

        # Filter empty tokens, post-minimize. The filtered list is what gets
        # returned; previously the unfiltered list was returned, which made
        # this step a no-op.
        return [t for t in min_tokens if t]
Пример #2
0
def test_ascii():
    """``_minimize`` on a space-separated ASCII sentence yields the expected chunks."""
    sentence = "Bacon ipsum dolor sit amet"
    expected = ["Bacon", "ipsum", "dolor sit", "amet"]
    actual = _minimize(sentence, delim, Lmax)
    assert actual == expected
Пример #3
0
def test_startwith_delim():
    """``_minimize`` on input that begins with the delimiter yields only the word."""
    prefixed = delim + "test"
    expected = ["test"]
    actual = _minimize(prefixed, delim, Lmax)
    assert actual == expected
Пример #4
0
def test_unicode():
    """``_minimize`` on a delimiter-free Chinese sentence yields the expected chunks."""
    sentence = u"这是一个三岁的小孩在讲述他从一系列照片里看到的东西。"
    expected = [
        u"这是一个三岁的小孩在",
        u"讲述他从一系列照片里",
        u"看到的东西。",
    ]
    actual = _minimize(sentence, delim, Lmax)
    assert actual == expected
Пример #5
0
def test_ascii_no_delim():
    """``_minimize`` on ASCII text with no spaces yields the expected chunks."""
    unbroken = "Baconipsumdolorsitametflankcornedbee"
    expected = ["Baconipsum", "dolorsitam", "etflankcor", "nedbee"]
    actual = _minimize(unbroken, delim, Lmax)
    assert actual == expected
Пример #6
0
 def test_startwith_delim(self):
     """``_minimize`` on input that begins with the delimiter yields only the word."""
     prefixed = self.delim + "test"
     expected = ["test"]
     actual = _minimize(prefixed, self.delim, self.max)
     self.assertEqual(actual, expected)
Пример #7
0
 def test_unicode(self):
     """``_minimize`` on a delimiter-free Chinese sentence yields the expected chunks."""
     sentence = u"这是一个三岁的小孩在讲述他从一系列照片里看到的东西。"
     expected = [
         u"这是一个三岁的小孩在",
         u"讲述他从一系列照片里",
         u"看到的东西。",
     ]
     actual = _minimize(sentence, self.delim, self.max)
     self.assertEqual(actual, expected)
Пример #8
0
 def test_ascii_no_delim(self):
     """``_minimize`` on ASCII text with no spaces yields the expected chunks."""
     unbroken = "Baconipsumdolorsitametflankcornedbee"
     expected = ["Baconipsum", "dolorsitam", "etflankcor", "nedbee"]
     actual = _minimize(unbroken, self.delim, self.max)
     self.assertEqual(actual, expected)
Пример #9
0
 def test_ascii(self):
     """``_minimize`` on a space-separated ASCII sentence yields the expected chunks."""
     sentence = "Bacon ipsum dolor sit amet"
     expected = ["Bacon", "ipsum", "dolor sit", "amet"]
     actual = _minimize(sentence, self.delim, self.max)
     self.assertEqual(actual, expected)