Example #1
    def test_corpus(self):
        self.assertIsInstance(thai_negations(), frozenset)
        self.assertIsInstance(thai_stopwords(), frozenset)
        self.assertIsInstance(thai_syllables(), frozenset)
        self.assertIsInstance(thai_words(), frozenset)

        self.assertIsInstance(countries(), frozenset)
        self.assertIsInstance(provinces(), frozenset)
        self.assertIsInstance(thai_female_names(), frozenset)
        self.assertIsInstance(thai_male_names(), frozenset)

        self.assertEqual(get_corpus_db_detail("XXX"),
                         {})  # corpus does not exist
        self.assertTrue(download("test"))  # download the first time
        self.assertTrue(download(name="test", force=True))  # force download
        self.assertTrue(download(name="test"))  # try download existing
        self.assertFalse(download(name="test",
                                  url="wrongurl"))  # URL not exist
        self.assertFalse(
            download(name="XxxXXxxx817d37sf"))  # corpus name not exist
        self.assertIsNotNone(get_corpus_db_detail("test"))  # corpus exists
        self.assertTrue(remove("test"))  # remove existing
        self.assertFalse(remove("test"))  # remove non-existing
        self.assertTrue(download(name="test", version="0.1"))
        self.assertTrue(remove("test"))
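
The same download/remove lifecycle can be exercised outside the test suite. A minimal sketch, assuming the "test" corpus entry exists in the PyThaiNLP corpus catalog:

from pythainlp.corpus import download, get_corpus_db_detail, remove

# first download succeeds; downloading again is a no-op unless force=True
download("test")
download(name="test", force=True)

# catalog metadata becomes available once the corpus is installed
print(get_corpus_db_detail("test"))

# remove() returns True on success and False if nothing is installed
remove("test")
remove("test")  # False the second time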
Example #2
    def test_corpus(self):
        self.assertIsInstance(thai_negations(), frozenset)
        self.assertIsInstance(thai_stopwords(), frozenset)
        self.assertIsInstance(thai_syllables(), frozenset)
        self.assertIsInstance(thai_words(), frozenset)

        self.assertIsInstance(countries(), frozenset)
        self.assertIsInstance(provinces(), frozenset)
        self.assertIsInstance(provinces(details=True), list)
        self.assertEqual(len(provinces(details=False)),
                         len(provinces(details=True)))
        self.assertIsInstance(thai_family_names(), frozenset)
        self.assertIsInstance(list(thai_family_names())[0], str)
        self.assertIsInstance(thai_female_names(), frozenset)
        self.assertIsInstance(thai_male_names(), frozenset)

        self.assertIsInstance(
            get_corpus_db("https://example.com/XXXXXX0lkjasd/SXfmskdjKKXXX"),
            Response,
        )  # URL does not exist, should get 404 response
        self.assertIsNone(get_corpus_db("XXXlkja3sfdXX"))  # Invalid URL

        self.assertEqual(get_corpus_db_detail("XXXmx3KSXX"),
                         {})  # corpus does not exist
        self.assertEqual(get_corpus_db_detail("XXXmx3KSXX", version="0.2"),
                         {})  # corpus does not exist

        self.assertTrue(download("test"))  # download the first time
        self.assertTrue(download(name="test", force=True))  # force download
        self.assertTrue(download(name="test"))  # try download existing
        self.assertFalse(download(name="test",
                                  url="wrongurl"))  # URL not exist
        self.assertFalse(
            download(name="XxxXXxxx817d37sf"))  # corpus name not exist
        self.assertIsNotNone(get_corpus_db_detail("test"))  # corpus exists
        self.assertIsNotNone(get_corpus_path("test"))  # corpus exists
        self.assertTrue(remove("test"))  # remove existing
        self.assertFalse(remove("test"))  # remove non-existing
        self.assertIsNone(get_corpus_path("XXXkdjfBzc"))  # query non-existing
        self.assertFalse(download(name="test", version="0.0"))
        self.assertFalse(download(name="test", version="0.0.0"))
        self.assertFalse(download(name="test", version="0.0.1"))
        self.assertFalse(download(name="test", version="0.0.2"))
        self.assertFalse(download(name="test", version="0.0.3"))
        self.assertFalse(download(name="test", version="0.0.4"))
        self.assertIsNotNone(download(name="test", version="0.0.5"))
        self.assertTrue(download("test"))
        self.assertIsNotNone(remove("test"))  # remove existing
        self.assertIsNotNone(download(name="test", version="0.0.6"))
        self.assertIsNotNone(download(name="test", version="0.0.7"))
        self.assertIsNotNone(download(name="test", version="0.0.8"))
        self.assertIsNotNone(download(name="test", version="0.0.9"))
        self.assertIsNotNone(download(name="test", version="0.0.10"))
        with self.assertRaises(Exception) as context:
            self.assertIsNotNone(download(name="test", version="0.0.11"))
        self.assertTrue(
            "Hash does not match expected." in str(context.exception))
        self.assertIsNotNone(download(name="test", version="0.1"))
        self.assertIsNotNone(remove("test"))
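
As the final assertions show, a version whose downloaded file fails the checksum raises an exception rather than returning False. A minimal sketch of handling that case, reusing the "test" corpus from the suite:

from pythainlp.corpus import download

try:
    download(name="test", version="0.0.11")  # version with a bad hash
except Exception as exc:
    # the suite above expects exactly this message on a checksum failure
    assert "Hash does not match expected." in str(exc)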
Example #3
    def test_corpus(self):
        self.assertIsNotNone(countries())
        self.assertIsNotNone(provinces())
        self.assertIsNotNone(thai_negations())
        self.assertIsNotNone(thai_stopwords())
        self.assertIsNotNone(thai_syllables())
        self.assertIsNotNone(thai_words())
        download("test")
        self.assertIsNotNone(remove("test"))
        self.assertIsNotNone(remove("tnc_freq"))
Example #4
    def test_corpus(self):
        self.assertIsNotNone(countries())
        self.assertIsNotNone(provinces())
        self.assertIsNotNone(thai_negations())
        self.assertIsNotNone(thai_stopwords())
        self.assertIsNotNone(thai_syllables())
        self.assertIsNotNone(thai_words())
        self.assertIsNotNone(thai_female_names())
        self.assertIsNotNone(thai_male_names())
        self.assertEqual(get_corpus_db_detail("XXX"), {})
        self.assertIsNone(download("test"))
        self.assertIsNone(download("test", force=True))
        self.assertIsNotNone(get_corpus_db_detail("test"))
        self.assertIsNotNone(remove("test"))
        self.assertFalse(remove("test"))
Example #5
from typing import List

from pythainlp.corpus import thai_syllables
from pythainlp.tokenize import word_tokenize
from pythainlp.util.trie import dict_trie


def syllable_tokenize(text: str, engine: str = "default") -> List[str]:
    """
    This function tokenizes text into syllables (Thai: พยางค์), units of
    pronunciation with a single vowel sound. For example, the word 'รถไฟ'
    contains two syllables: 'รถ' and 'ไฟ'.
    Under the hood, this function uses :func:`pythainlp.tokenize.word_tokenize`
    with *newmm* as the tokenizer. It first tokenizes the text with the
    dictionary of Thai words from :func:`pythainlp.corpus.common.thai_words`,
    then tokenizes each word with the dictionary of Thai syllables from
    :func:`pythainlp.corpus.common.thai_syllables`. As a result, only
    syllables are obtained.

    :param str text: input string to be tokenized
    :param str engine: name of the syllable tokenizer
    :return: list of syllables, with whitespace in the text **included**
    :rtype: list[str]
    **Options for engine**
        * *default* - dictionary-based tokenizer (described above)
        * *ssg* - CRF syllable segmenter for Thai

    :Example:
    ::

        from pythainlp.tokenize import syllable_tokenize

        text = 'รถไฟสมัยใหม่จะใช้กำลังจากหัวรถจักรดีเซล หรือจากไฟฟ้า'
        syllable_tokenize(text)
        ['รถ', 'ไฟ', 'สมัย', 'ใหม่', 'จะ', 'ใช้', 'กำ', 'ลัง', 'จาก', 'หัว',
        'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า']
    """

    if not text or not isinstance(text, str):
        return []

    tokens = []
    if engine == "default":
        words = word_tokenize(text)
        trie = dict_trie(dict_source=thai_syllables())
        for word in words:
            tokens.extend(word_tokenize(text=word, custom_dict=trie))
    else:
        from pythainlp.tokenize.ssg import segment

        tokens = segment(text)

    return tokens
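
For reference, a short usage sketch covering both engines; the *ssg* branch assumes the optional ssg dependency is installed:

from pythainlp.tokenize import syllable_tokenize

print(syllable_tokenize("รถไฟ"))                # dictionary-based default
print(syllable_tokenize("รถไฟ", engine="ssg"))  # CRF-based, optional extra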
Example #6
def syllable_tokenize(text: str) -> List[str]:
    """
    :param str text: input string to be tokenized
    :return: list of syllables
    """

    if not text or not isinstance(text, str):
        return []

    # the early return above guarantees text is a non-empty str here
    tokens = []
    words = word_tokenize(text)
    trie = dict_trie(dict_source=thai_syllables())
    for word in words:
        tokens.extend(word_tokenize(text=word, custom_dict=trie))

    return tokens
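
The two building blocks used here can also be exercised directly. A minimal sketch with a two-word dictionary that is illustrative only:

from pythainlp.tokenize import word_tokenize
from pythainlp.util.trie import dict_trie

trie = dict_trie(dict_source={"รถ", "ไฟ"})  # illustrative custom dictionary
print(word_tokenize("รถไฟ", custom_dict=trie))  # ['รถ', 'ไฟ']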
Example #7
def syllable_tokenize(text: str) -> List[str]:
    """
    :param str text: input string to be tokenized

    :return: returns list of strings of syllables
    """

    if not text or not isinstance(text, str):
        return []

    # the early return above guarantees text is a non-empty str here
    tokens = []
    words = word_tokenize(text)
    trie = dict_trie(dict_source=thai_syllables())
    for word in words:
        tokens.extend(word_tokenize(text=word, custom_dict=trie))

    return tokens
Example #8
    "sent_tokenize",
    "subword_tokenize",
    "syllable_tokenize",
    "word_tokenize",
]

from pythainlp.corpus import thai_syllables, thai_words
from pythainlp.util.trie import Trie

DEFAULT_WORD_TOKENIZE_ENGINE = "newmm"
DEFAULT_SENT_TOKENIZE_ENGINE = "crfcut"
DEFAULT_SUBWORD_TOKENIZE_ENGINE = "tcc"
DEFAULT_SYLLABLE_TOKENIZE_ENGINE = "dict"

DEFAULT_WORD_DICT_TRIE = Trie(thai_words())
DEFAULT_SYLLABLE_DICT_TRIE = Trie(thai_syllables())
DEFAULT_DICT_TRIE = DEFAULT_WORD_DICT_TRIE

from pythainlp.tokenize.core import (
    Tokenizer,
    sent_tokenize,
    subword_tokenize,
    syllable_tokenize,
    word_tokenize,
)

from pythainlp.corpus import get_corpus as _get_corpus

THAI2FIT_TOKENIZER = Tokenizer(
    custom_dict=_get_corpus("words_th_thai2fit_201810.txt"), engine="newmm")
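
THAI2FIT_TOKENIZER above is simply a Tokenizer bound to a fixed word list. A minimal sketch of the same pattern, with a made-up two-word dictionary:

from pythainlp.tokenize import Tokenizer

tokenizer = Tokenizer(custom_dict=["รถ", "ไฟ"], engine="newmm")  # illustrative dictionary
print(tokenizer.word_tokenize("รถไฟ"))  # ['รถ', 'ไฟ']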
Example #9
ADDITIONAL_SPECIAL_TOKENS_EXCLUDE_SPACE_TOKEN = \
    [e for e in ADDITIONAL_SPECIAL_TOKENS if e != SPACE_TOKEN]
SET_ADDITIONAL_SPECIAL_TOKENS = frozenset(ADDITIONAL_SPECIAL_TOKENS)

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "th-roberta-base": 514,
}

# Pre-tokenizer functions (text cutters), keyed by engine name
PRE_TOKENIZERS_MAP = {
    'newmm': partial(
        word_tokenize,
        custom_dict=Trie(frozenset(set(thai_words()).union(set(ADDITIONAL_SPECIAL_TOKENS)))),
    ),
    'syllable': partial(
        word_tokenize,
        custom_dict=Trie(frozenset(set(thai_syllables()).union(set(ADDITIONAL_SPECIAL_TOKENS)))),
    ),
}

_nb_cores = multiprocessing.cpu_count()


def split_additional_special_token(texts):
    """
    Split list of text by additional special exclude space token.

    Args:
        texts: list of text.

    Returns:
        list_of_pre_cut_texts: list of list of pre cut text.
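
A condensed sketch of the same pattern: functools.partial binds the custom dictionary once, so each map entry can be called with just the text. The special-token list here is a made-up placeholder:

from functools import partial

from pythainlp.corpus import thai_words
from pythainlp.tokenize import word_tokenize
from pythainlp.util.trie import Trie

SPECIAL_TOKENS = ["<s>", "</s>"]  # placeholder special tokens

newmm_cut = partial(
    word_tokenize,
    custom_dict=Trie(set(thai_words()) | set(SPECIAL_TOKENS)),
)
print(newmm_cut("รถไฟ<s>"))  # special tokens come out as single units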
Example #10
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "th-roberta-base": 514,
}

# Pre-tokenizer functions (text cutters), keyed by engine name
PRE_TOKENIZERS_MAP = {
    'newmm': partial(
        word_tokenize,
        custom_dict=Trie(frozenset(set(thai_words()).union(set(ADDITIONAL_SPECIAL_TOKENS)))),
    ),
    'syllable': partial(
        word_tokenize,
        custom_dict=Trie(frozenset(set(thai_syllables()).union(set(ADDITIONAL_SPECIAL_TOKENS)))),
    ),
}

_nb_cores = multiprocessing.cpu_count()


def split_additional_special_token(texts):
    """
    Split list of text by additional special exclude space token.

    Args:
        texts: list of text.

    Returns:
        list_of_pre_cut_texts: list of list of pre cut text.