def syllable_tokenize(text):
    """Tokenize a Thai string into syllables.

    :param str text: input string to be tokenized
    :return: list of syllable strings (empty list for falsy input)
    """
    # NOTE(review): relies on module-level helpers (word_tokenize,
    # create_custom_dict_trie, dict_word_tokenize, syllable_dict) —
    # presumably supplied by the pythainlp imports elsewhere in this
    # file; confirm they are in scope.
    if not text:
        return []
    # Build the syllable-dictionary trie once, then re-tokenize each word.
    trie = create_custom_dict_trie(custom_dict_source=syllable_dict())
    syllables = []
    for token in word_tokenize(text):
        syllables += dict_word_tokenize(text=token, custom_dict_trie=trie)
    return syllables
# Standard-library imports: os and string were referenced below
# (templates_file, invalidChars) but never imported in the original source.
import os
import string

import sklearn_crfsuite

from pythainlp.tokenize import syllable_tokenize as word_tokenize
from pythainlp.spell.pn import NorvigSpellChecker

try:
    # Older pythainlp API (< 2.0).
    from pythainlp.corpus.thaisyllable import get_data as syllable_dict
    from pythainlp.corpus import stopwords

    stopwords = stopwords.words('thai')
except ImportError:  # was a bare `except:`; narrow it to the import failure
    # Newer pythainlp API (>= 2.0) — alias to the same names the old
    # branch provides so the rest of the module is version-agnostic.
    from pythainlp.corpus.common import thai_syllables, thai_stopwords

    stopwords = list(thai_stopwords())
    syllable_dict = thai_syllables

# Path of the CRF model shipped alongside the pythaispell package.
# NOTE(review): `pythaispell` is used here but never imported in the visible
# source — TODO confirm it is imported elsewhere in this module.
templates_file = os.path.join(os.path.dirname(pythaispell.__file__), "sp.model")

# Every ASCII punctuation character except underscore.
invalidChars = set(string.punctuation.replace("_", ""))

# Deduplicated list of known Thai syllables.
dict_s = list(set(syllable_dict()))


def c(word):
    """Return True if *word* contains at least one of these Thai consonants."""
    return any(ch in word for ch in 'กขฃคฆงจชซญฎฏฐฑฒณดตถทธนบปพฟภมยรลวศษสฬอ')


def n(word):
    """Return True if *word* contains at least one of these Thai characters."""
    return any(ch in word for ch in 'ฅฉผฟฌหฮ')