def test_corpus(self):
    self.assertIsInstance(thai_negations(), frozenset)
    self.assertIsInstance(thai_stopwords(), frozenset)
    self.assertIsInstance(thai_syllables(), frozenset)
    self.assertIsInstance(thai_words(), frozenset)
    self.assertIsInstance(countries(), frozenset)
    self.assertIsInstance(provinces(), frozenset)
    self.assertIsInstance(thai_female_names(), frozenset)
    self.assertIsInstance(thai_male_names(), frozenset)

    self.assertEqual(
        get_corpus_db_detail("XXX"), {}
    )  # corpus does not exist
    self.assertTrue(download("test"))  # download for the first time
    self.assertTrue(download(name="test", force=True))  # force download
    self.assertTrue(download(name="test"))  # download existing corpus again
    self.assertFalse(
        download(name="test", url="wrongurl")
    )  # URL does not exist
    self.assertFalse(
        download(name="XxxXXxxx817d37sf")
    )  # corpus name does not exist
    self.assertIsNotNone(get_corpus_db_detail("test"))  # corpus exists
    self.assertTrue(remove("test"))  # remove existing corpus
    self.assertFalse(remove("test"))  # remove non-existing corpus
    self.assertTrue(download(name="test", version="0.1"))
    self.assertTrue(remove("test"))
def test_corpus(self):
    self.assertIsInstance(thai_negations(), frozenset)
    self.assertIsInstance(thai_stopwords(), frozenset)
    self.assertIsInstance(thai_syllables(), frozenset)
    self.assertIsInstance(thai_words(), frozenset)
    self.assertIsInstance(countries(), frozenset)
    self.assertIsInstance(provinces(), frozenset)
    self.assertIsInstance(provinces(details=True), list)
    self.assertEqual(
        len(provinces(details=False)), len(provinces(details=True))
    )
    self.assertIsInstance(thai_family_names(), frozenset)
    self.assertIsInstance(list(thai_family_names())[0], str)
    self.assertIsInstance(thai_female_names(), frozenset)
    self.assertIsInstance(thai_male_names(), frozenset)

    self.assertIsInstance(
        get_corpus_db("https://example.com/XXXXXX0lkjasd/SXfmskdjKKXXX"),
        Response,
    )  # URL does not exist; expect a 404 response
    self.assertIsNone(get_corpus_db("XXXlkja3sfdXX"))  # invalid URL
    self.assertEqual(
        get_corpus_db_detail("XXXmx3KSXX"), {}
    )  # corpus does not exist
    self.assertEqual(
        get_corpus_db_detail("XXXmx3KSXX", version="0.2"), {}
    )  # corpus does not exist

    self.assertTrue(download("test"))  # download for the first time
    self.assertTrue(download(name="test", force=True))  # force download
    self.assertTrue(download(name="test"))  # download existing corpus again
    self.assertFalse(
        download(name="test", url="wrongurl")
    )  # URL does not exist
    self.assertFalse(
        download(name="XxxXXxxx817d37sf")
    )  # corpus name does not exist
    self.assertIsNotNone(get_corpus_db_detail("test"))  # corpus exists
    self.assertIsNotNone(get_corpus_path("test"))  # corpus exists
    self.assertTrue(remove("test"))  # remove existing corpus
    self.assertFalse(remove("test"))  # remove non-existing corpus
    self.assertIsNone(get_corpus_path("XXXkdjfBzc"))  # query non-existing corpus

    self.assertFalse(download(name="test", version="0.0"))
    self.assertFalse(download(name="test", version="0.0.0"))
    self.assertFalse(download(name="test", version="0.0.1"))
    self.assertFalse(download(name="test", version="0.0.2"))
    self.assertFalse(download(name="test", version="0.0.3"))
    self.assertFalse(download(name="test", version="0.0.4"))
    self.assertIsNotNone(download(name="test", version="0.0.5"))
    self.assertTrue(download("test"))
    self.assertIsNotNone(remove("test"))  # remove existing corpus
    self.assertIsNotNone(download(name="test", version="0.0.6"))
    self.assertIsNotNone(download(name="test", version="0.0.7"))
    self.assertIsNotNone(download(name="test", version="0.0.8"))
    self.assertIsNotNone(download(name="test", version="0.0.9"))
    self.assertIsNotNone(download(name="test", version="0.0.10"))
    with self.assertRaises(Exception) as context:
        self.assertIsNotNone(download(name="test", version="0.0.11"))
    self.assertTrue(
        "Hash does not match expected." in str(context.exception)
    )
    self.assertIsNotNone(download(name="test", version="0.1"))
    self.assertIsNotNone(remove("test"))
def test_corpus(self):
    self.assertIsNotNone(countries())
    self.assertIsNotNone(provinces())
    self.assertIsNotNone(thai_negations())
    self.assertIsNotNone(thai_stopwords())
    self.assertIsNotNone(thai_syllables())
    self.assertIsNotNone(thai_words())
    download("test")
    self.assertIsNotNone(remove("test"))
    self.assertIsNotNone(remove("tnc_freq"))
def test_corpus(self):
    self.assertIsNotNone(countries())
    self.assertIsNotNone(provinces())
    self.assertIsNotNone(thai_negations())
    self.assertIsNotNone(thai_stopwords())
    self.assertIsNotNone(thai_syllables())
    self.assertIsNotNone(thai_words())
    self.assertIsNotNone(thai_female_names())
    self.assertIsNotNone(thai_male_names())
    self.assertEqual(get_corpus_db_detail("XXX"), {})
    self.assertIsNone(download("test"))
    self.assertIsNone(download("test", force=True))
    self.assertIsNotNone(get_corpus_db_detail("test"))
    self.assertIsNotNone(remove("test"))
    self.assertFalse(remove("test"))
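# A hedged sketch of the corpus-management API that the tests above exercise
# (pythainlp.corpus). Note that the return value of download() differs across
# the versions shown (True vs. None); this sketch only assumes the call
# signatures, not a particular return convention.
from pythainlp.corpus import download, get_corpus_path, remove

download("test")                # fetch the "test" corpus from the catalog
print(get_corpus_path("test"))  # local path of the installed corpus
remove("test")                  # delete it again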
def syllable_tokenize(text: str, engine: str = "default") -> List[str]:
    """
    This function tokenizes text into syllables (Thai: พยางค์),
    a unit of pronunciation having one vowel sound. For example,
    the word 'รถไฟ' contains two syllables: 'รถ' and 'ไฟ'.

    Under the hood, this function uses
    :func:`pythainlp.tokenize.word_tokenize` with *newmm* as a tokenizer.
    It tokenizes the text with the dictionary of Thai words from
    :func:`pythainlp.corpus.common.thai_words` and then with the dictionary
    of Thai syllables from :func:`pythainlp.corpus.common.thai_syllables`.
    As a result, only syllables are obtained.

    :param str text: input string to be tokenized
    :param str engine: name of the syllable tokenizer
    :return: list of syllables where whitespaces in the text **are included**
    :rtype: list[str]

    **Options for engine**
        * *default*
        * *ssg* - CRF syllable segmenter for Thai

    :Example:
    ::

        from pythainlp.tokenize import syllable_tokenize

        text = 'รถไฟสมัยใหม่จะใช้กำลังจากหัวรถจักรดีเซล หรือจากไฟฟ้า'
        syllable_tokenize(text)
        # ['รถ', 'ไฟ', 'สมัย', 'ใหม่', 'ใช้', 'กำ', 'ลัง', 'จาก', 'หัว',
        #  'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า']
    """
    if not text or not isinstance(text, str):
        return []

    tokens = []
    if engine == "default":
        # First cut into words, then re-cut each word into syllables
        # using a trie built from the Thai syllable dictionary.
        words = word_tokenize(text)
        trie = dict_trie(dict_source=thai_syllables())
        for word in words:
            tokens.extend(word_tokenize(text=word, custom_dict=trie))
    else:
        from .ssg import segment

        tokens = segment(text)

    return tokens
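# A minimal usage sketch for syllable_tokenize above, assuming PyThaiNLP is
# installed (and the optional ssg package for the "ssg" engine). The printed
# outputs are illustrative, not guaranteed.
from pythainlp.tokenize import syllable_tokenize

print(syllable_tokenize("รถไฟ"))                # default engine, e.g. ['รถ', 'ไฟ']
print(syllable_tokenize("รถไฟ", engine="ssg"))  # CRF syllable segmenter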
def syllable_tokenize(text: str) -> List[str]:
    """
    :param str text: input string to be tokenized
    :return: list of syllables
    """
    if not text or not isinstance(text, str):
        return []

    tokens = []
    words = word_tokenize(text)
    trie = dict_trie(dict_source=thai_syllables())
    for word in words:
        tokens.extend(word_tokenize(text=word, custom_dict=trie))

    return tokens
def syllable_tokenize(text: str) -> List[str]:
    """
    :param str text: input string to be tokenized
    :return: list of syllables as strings
    """
    if not text or not isinstance(text, str):
        return []

    tokens = []
    words = word_tokenize(text)
    trie = dict_trie(dict_source=thai_syllables())
    for word in words:
        tokens.extend(word_tokenize(text=word, custom_dict=trie))

    return tokens
"sent_tokenize", "subword_tokenize", "syllable_tokenize", "word_tokenize", ] from pythainlp.corpus import thai_syllables, thai_words from pythainlp.util.trie import Trie DEFAULT_WORD_TOKENIZE_ENGINE = "newmm" DEFAULT_SENT_TOKENIZE_ENGINE = "crfcut" DEFAULT_SUBWORD_TOKENIZE_ENGINE = "tcc" DEFAULT_SYLLABLE_TOKENIZE_ENGINE = "dict" DEFAULT_WORD_DICT_TRIE = Trie(thai_words()) DEFAULT_SYLLABLE_DICT_TRIE = Trie(thai_syllables()) DEFAULT_DICT_TRIE = DEFAULT_WORD_DICT_TRIE from pythainlp.tokenize.core import ( Tokenizer, sent_tokenize, subword_tokenize, syllable_tokenize, word_tokenize, ) from pythainlp.corpus import get_corpus as _get_corpus THAI2FIT_TOKENIZER = Tokenizer( custom_dict=_get_corpus("words_th_thai2fit_201810.txt"), engine="newmm")
ADDITIONAL_SPECIAL_TOKENS_EXCLUDE_SPACE_TOKEN = [
    e for e in ADDITIONAL_SPECIAL_TOKENS if e != SPACE_TOKEN
]
SET_ADDITIONAL_SPECIAL_TOKENS = frozenset(ADDITIONAL_SPECIAL_TOKENS)

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "th-roberta-base": 514,
}

# Pre-tokenizer functions (text cutters), keyed by engine name
PRE_TOKENIZERS_MAP = {
    "newmm": partial(
        word_tokenize,
        custom_dict=Trie(
            frozenset(set(thai_words()).union(set(ADDITIONAL_SPECIAL_TOKENS)))
        ),
    ),
    "syllable": partial(
        word_tokenize,
        custom_dict=Trie(
            frozenset(
                set(thai_syllables()).union(set(ADDITIONAL_SPECIAL_TOKENS))
            )
        ),
    ),
}

_nb_cores = multiprocessing.cpu_count()


def split_additional_special_token(texts):
    """
    Split a list of texts by the additional special tokens,
    excluding the space token.

    Args:
        texts: list of texts.

    Returns:
        list_of_pre_cut_texts: list of lists of pre-cut texts.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "th-roberta-base": 514,
}

# Pre-tokenizer functions (text cutters), keyed by engine name
PRE_TOKENIZERS_MAP = {
    "newmm": partial(
        word_tokenize,
        custom_dict=Trie(
            frozenset(set(thai_words()).union(set(ADDITIONAL_SPECIAL_TOKENS)))
        ),
    ),
    "syllable": partial(
        word_tokenize,
        custom_dict=Trie(
            frozenset(
                set(thai_syllables()).union(set(ADDITIONAL_SPECIAL_TOKENS))
            )
        ),
    ),
}

_nb_cores = multiprocessing.cpu_count()


def split_additional_special_token(texts):
    """
    Split a list of texts by the additional special tokens,
    excluding the space token.

    Args:
        texts: list of texts.

    Returns:
        list_of_pre_cut_texts: list of lists of pre-cut texts.
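# A minimal sketch of dispatching through PRE_TOKENIZERS_MAP defined above,
# assuming the surrounding module has been imported. Both entries are partials
# of word_tokenize whose custom tries also match the additional special
# tokens; the sample text is an illustrative assumption.
pre_tokenize = PRE_TOKENIZERS_MAP["newmm"]
print(pre_tokenize("ภาษาไทยง่ายนิดเดียว"))  # word-level pre-cut

pre_tokenize = PRE_TOKENIZERS_MAP["syllable"]
print(pre_tokenize("ภาษาไทยง่ายนิดเดียว"))  # syllable-level pre-cut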