Example #1
    def test_word_tokenize(self):
        self.assertEqual(word_tokenize(""), [])
        self.assertEqual(
            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"),
            ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
        )

        self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="newmm"))
        self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="mm"))
        self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="longest"))
        self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="ulmfit"))
        self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="icu"))
        self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="deepcut"))
        self.assertIsNotNone(word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX"))

        self.assertIsNotNone(dict_trie(()))
        self.assertIsNotNone(dict_trie(("ทดสอบ", "สร้าง", "Trie")))
        self.assertIsNotNone(dict_trie(["ทดสอบ", "สร้าง", "Trie"]))
        self.assertIsNotNone(dict_trie(thai_words()))
        self.assertIsNotNone(dict_trie(FROZEN_DICT_TRIE))

        self.assertIsNotNone(word_tokenize("รถไฟฟ้าBTS", custom_dict=DEFAULT_DICT_TRIE))
        self.assertIsNotNone(
            word_tokenize("ทดสอบ", engine="deepcut", custom_dict=FROZEN_DICT_TRIE)
        )
        self.assertIsNotNone(
            word_tokenize("ทดสอบ", engine="XX", custom_dict=FROZEN_DICT_TRIE)
        )
Example #2
    def test_word_tokenize(self):
        self.assertEqual(word_tokenize(""), [])
        self.assertEqual(
            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"),
            ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
        )

        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="newmm")
        )
        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="mm")
        )
        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="longest")
        )
        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="icu")
        )
        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="deepcut")
        )
        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="attacut")
        )
        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX")
        )  # "XX" is not an existing engine; this version falls back to the default

        self.assertIsNotNone(dict_trie(()))
        self.assertIsNotNone(dict_trie(("ทดสอบ", "สร้าง", "Trie")))
        self.assertIsNotNone(dict_trie(["ทดสอบ", "สร้าง", "Trie"]))
        self.assertIsNotNone(dict_trie({"ทดสอบ", "สร้าง", "Trie"}))
        self.assertIsNotNone(dict_trie(thai_words()))
        self.assertIsNotNone(dict_trie(DEFAULT_DICT_TRIE))
        self.assertIsNotNone(
            dict_trie(os.path.join(_CORPUS_PATH, _THAI_WORDS_FILENAME))
        )

        self.assertTrue(
            "ไฟ" in word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"]))
        )

        # Commented out until this unittest bug get fixed:
        # https://bugs.python.org/issue29620
        # with self.assertWarns(DeprecationWarning):
        #     dict_word_tokenize("เลิกใช้แล้ว", custom_dict=DEFAULT_DICT_TRIE)
        self.assertEqual(
            word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])),
            dict_word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])),
        )
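
The tests above all follow the same pattern: build a Trie from a word list and pass it to word_tokenize as custom_dict. Below is a minimal, self-contained sketch of that pattern (assuming PyThaiNLP 2.x import paths; dict_trie is also re-exported from pythainlp.tokenize in some versions).

from pythainlp.corpus import thai_words
from pythainlp.tokenize import word_tokenize
from pythainlp.util import dict_trie

# Start from the bundled Thai word list and add a domain-specific word
custom_words = set(thai_words())
custom_words.add("รถไฟฟ้า")
trie = dict_trie(dict_source=custom_words)

# The custom dictionary steers how the newmm engine segments the text
print(word_tokenize("รถไฟฟ้ากรุงเทพ", custom_dict=trie, engine="newmm"))
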
Example #3
def createBOW(ls_txt, corpus):
    # Extend the default Thai word list with domain-specific words
    custom_dict = set(thai_words())
    extra_words = ['ราเมง', 'อิเกีย', 'คาปูชิโน่', 'น้ำมัน', 'หอยลาย', 'ปุ้มปุ้ย']
    for w in extra_words:
        custom_dict.add(w)
    trie = dict_trie(dict_source=custom_dict)

    # Build a bag-of-words vector for each text in ls_txt,
    # with a trailing count of tokens not found in the corpus
    BOW_t = [list() for _ in range(len(ls_txt))]
    idx = 0
    for text in ls_txt:
        tmp = word_tokenize(text, engine='dict', custom_dict=trie)
        for token in corpus:
            if token in tmp:
                BOW_t[idx].append(tmp.count(token))
                tmp.remove(token)
            else:
                BOW_t[idx].append(0)

        # Any tokens left over are counted as "other"
        BOW_t[idx].append(len(tmp))
        idx += 1

    # corpus_t = corpus.append('Other')
    # ch = pd.DataFrame({
    #     'train':corpus,
    #     'target':BOW_t[0]
    # })
    # ch
    # predictiontree = dtree.predict(BOW_t)
    return list(BOW_t)
Example #4
def tokenize_text_list_test(ls):
    """Tokenize every string in ls with several engines and time each run."""
    print("working on tokenization benchmark")
    engines = [
        'cfcut', 'deepcut', 'etcc', 'longest', 'multi_cut', 'newmm', 'ssg',
        'tcc', 'trie'
    ]
    # engines = ['cfcut', 'newmm']
    custom_dict = set(thai_words())
    trie = dict_trie(dict_source=custom_dict)
    results, timings = [], []
    for engine in engines:
        start = time.process_time()
        if engine == 'deepcut':
            # deepcut is run without the custom dictionary here
            tokens = list(
                chain.from_iterable(
                    pythainlp.tokenize.word_tokenize(line, engine=engine)
                    for line in ls))
        else:
            tokens = list(
                chain.from_iterable(
                    pythainlp.tokenize.word_tokenize(
                        line, engine=engine, custom_dict=trie)
                    for line in ls))
        results.append(tokens)
        # print(tokens)
        timings.append(time.process_time() - start)
    return results, timings
Example #5
    def test_dict_word_tokenize(self):
        self.assertEqual(dict_word_tokenize("", custom_dict=FROZEN_DICT_TRIE), [])
        self.assertIsNotNone(
            dict_word_tokenize("รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE)
        )
        self.assertIsNotNone(dict_trie(()))
        self.assertIsNotNone(
            dict_word_tokenize(
                "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="newmm"
            )
        )
        self.assertIsNotNone(
            dict_word_tokenize(
                "รถไฟฟ้ากรุงเทพBTSหูว์ค์",
                custom_dict=FROZEN_DICT_TRIE,
                engine="longest",
            )
        )
        self.assertIsNotNone(
            dict_word_tokenize(
                "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="mm"
            )
        )
        self.assertIsNotNone(
            dict_word_tokenize(
                "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="XX"
            )
        )
Example #6
    def __init__(self):
        self.menus = []
        # self.menu = {}
        self.normwords = []
        self.lang_convert_th_eng = {
            "ร้อน": "Hot",
            "เย็น": "Ice",
            "ปั่น": "Frappe",
        }
        self.word_filter = [
            "ร้อน", "เย็น", "ปั่น", "แก้ว", "ใหญ่", "หวาน", "น้ำตาล", "นม",
            "นมข้น", "วิปครีม", "วิป", "ไม่", "ใส่", "น้อย", "ปกติ", "กลาง",
            "มาก", "เยอะ", "เพิ่ม"
        ]
        with open("other_module/menu.csv", encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)
            for row in reader:  # each row is a list
                self.lang_convert_th_eng[row[0]] = row[1]
                # self.menu.append([row[0], row[1]])
                self.menus.append(row[0])
        self.keyword = [item for item in self.menus if item != "ชา"]
        self.keyword.append("วิป")
        self.keyword.append("วิปครีม")
        self.keyword.append("ร้อน")
        self.keyword_dict = dict_trie(self.keyword)
        with open("other_module/normwords.csv", encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)
            for row in reader:  # each row is a list
                self.normwords.append(row)
Example #7
def main():
    engineOption = ["newmm", "longest-matching", "dict", "ulmfit"]
    f = codecs.open('input.txt', encoding='utf-8')
    fsort = open("output-sort.csv", "w", encoding="utf-8")

    text = ""
    for line in f:
        # print (line)
        text = text + line

    custom_words_list = set(thai_words())
    custom_words_list.add('รีเทนเนอร์')
    custom_words_list.add('จัดฟัน')
    custom_words_list.add('ฟันชิด')
    trie = dict_trie(dict_source=custom_words_list)
    _tokenizer = Tokenizer(custom_dict=trie, engine='newmm')

    print('------ Starting to tokenize words ------')
    # words = word_tokenize(text, engine=engineOption[0])
    words = _tokenizer.word_tokenize(text)
    i = 0
    wordsNew = ""
    for word in words:
        if (word and not word.isspace() and word != '-' and word != '/'
                and not word.isnumeric()):
            i = i + 1
            # print(i, ': ', word.strip())
            wordsNew = wordsNew + word.strip() + " "
    f.close()

    print('------ Starting to count words ------')
    wordlist = wordsNew.split()
    # Build the frequency dictionary once, then sort it by count
    dictionary = wordListToFreqDict(wordlist)
    sorteddict = sortFreqDict(dictionary)

    print('------ Starting to sort words and write to file ------')
    for s in sorteddict:
        print(s[1], "|", s[0])
        fsort.write(s[1] + "|" + str(s[0]))
        fsort.write('\n')
    fsort.close()
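
Example #7 binds the custom dictionary to a Tokenizer object so the Trie is built once and reused for every call. A short sketch of that pattern (assuming PyThaiNLP 2.x):

from pythainlp.corpus import thai_words
from pythainlp.tokenize import Tokenizer
from pythainlp.util import dict_trie

# Build the dictionary once, then reuse the tokenizer object
custom_words = set(thai_words())
custom_words.update({"รีเทนเนอร์", "จัดฟัน", "ฟันชิด"})
trie = dict_trie(dict_source=custom_words)

tokenizer = Tokenizer(custom_dict=trie, engine="newmm")
print(tokenizer.word_tokenize("อยากจัดฟันและใส่รีเทนเนอร์"))
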
Example #8
def train():
    df = pd.read_csv("Data/Expenses.csv")

    # Extend the default Thai word list with domain-specific words
    custom_dict = set(thai_words())
    extra_words = ['ราเมง', 'อิเกีย', 'คาปูชิโน่', 'น้ำมัน', 'หอยลาย', 'ปุ้มปุ้ย']
    for w in extra_words:
        custom_dict.add(w)
    trie = dict_trie(dict_source=custom_dict)

    # Build the vocabulary (corpus) from every unique token in the data
    corpus = []
    for text in df.text:
        for token in word_tokenize(text, engine='dict', custom_dict=trie):
            if token not in corpus:
                corpus.append(token)

    # Build a bag-of-words vector per row, with a trailing count of
    # tokens not found in the corpus
    BOW = [list() for _ in range(len(df.text))]
    idx = 0
    for text in df.text:
        tmp = word_tokenize(text, engine='dict', custom_dict=trie)
        for token in corpus:
            if token in tmp:
                BOW[idx].append(tmp.count(token))
                tmp.remove(token)
            else:
                BOW[idx].append(0)

        # Any tokens left over are counted as "other"
        BOW[idx].append(len(tmp))
        idx += 1

    ytarget = df.cate
    xtrain = BOW

    dtree = DecisionTreeClassifier()
    dtree.fit(X=xtrain, y=ytarget)

    return corpus, dtree
Example #9
    def ProcessText(self, text):

        dataBase = database()
        streetDf, addressDf = dataBase.ReadStreetName()

        streetList = dataBase.DataframeToList(streetDf)
        addressList = dataBase.DataframeToList(addressDf)
        districtList = dataBase.districtName_
        wordList = districtList + streetList + addressList

        custom_words_list = set(thai_words())
        custom_words_list.update(wordList)
        custom_words_list.update(self.specWord)

        trie = dict_trie(dict_source=custom_words_list)

        custom_tokenizer = Tokenizer(custom_dict=trie, engine=self.engineSel)
        proc = custom_tokenizer.word_tokenize(text)

        # Strip punctuation, then digits, from each token
        # (note: cleanList is built but not used below; the filter runs on proc)
        cleanList_1 = []
        cleanList = []
        for token in proc:
            cleanList_1.append(
                token.translate(str.maketrans('', '', string.punctuation)))
        for token in cleanList_1:
            cleanList.append(
                token.translate(str.maketrans('', '', '1234567890')))

        # Drop empty and whitespace-only tokens
        procText = [token for token in proc if token not in ("", " ", "  ")]
        #procText = list(filter(lambda x: len(x)>2, procText))
        joinText = ' '.join(procText)
        #print(joinText)
        return joinText
Example #10
def syllable_tokenize_lu(text: str) -> List[str]:
    """Reference https://thainlp.org/pythainlp/docs/2.0/_modules/pythainlp/tokenize.html#syllable_tokenize"""
    if not text or not isinstance(text, str):
        return []

    tokens = []
    # Read the Lu syllable list (the file contains Thai text, so read it as UTF-8)
    with open(LU_SYLLABLE_FILENAME, 'r', encoding='utf-8') as f:
        syllable_lu_dict = json.load(f)

    # Create a custom dict trie for Lu syllables
    lu_syllable = syllable_lu_dict['data']
    dict_source = frozenset(lu_syllable)
    trie = dict_trie(dict_source)

    # text is guaranteed non-empty by the check above
    words = word_tokenize(text, custom_dict=trie)
    # print("lu", words)
    # dict_source = frozenset(set(lu_syllable).union(set(thai_syllables())))
    for word in words:
        tokens.extend(word_tokenize(text=word, custom_dict=trie))

    return tokens
Example #11
def word_sylleble(text):
    # listtext and onecut are expected to be defined elsewhere in this module
    tokens = []
    if text:
        trie = dict_trie(dict_source=listtext)
        tokens.extend(onecut(trie, text=text))
    return [tokens, text]
Example #12
# -*- coding: utf-8 -*-
from pythainlp.tokenize import word_tokenize, dict_trie
from pythainlp.corpus import thai_stopwords, thai_words, tnc
from pythainlp.util import normalize
import data
stopwords = list(thai_stopwords())
thaiword = list(thai_words())
#tnc1=[word for word,i in tnc.word_freqs()]
thaiword.remove("กินข้าว")
datadict = dict_trie(
    list(set(data.ccc + thaiword + stopwords + data.conjunctions)))  #+tnc1)))


def wordcut(word):
    global datadict
    return word_tokenize(word, custom_dict=datadict)
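
A brief, hypothetical usage of wordcut (the exact output depends on the word lists provided by the data module, which is not shown here):

# Because "กินข้าว" was removed from the dictionary above, newmm is expected
# to segment it into smaller dictionary words such as "กิน" and "ข้าว".
print(wordcut("วันนี้ไปกินข้าวกับเพื่อน"))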