Пример #1
0
# import stopwords
from pythainlp import sent_tokenize, word_tokenize, thai_characters
from pythainlp.corpus import remove, get_corpus_path, get_corpus
from pythainlp.corpus.common import thai_stopwords
from pylexto import LexTo
from pythainlp.tokenize import etcc
from pythainlp.tag import pos_tag
from pythainlp.util import collate




# Sample Thai sentence used as the input for this demo script.
text = "ค่าจ้าง ค่าเช่า ค่าตอบแทน ค่าบริการ จำเป็นต้องหัก ณ ที่จ่ายส่งสรรพากร"

# Split the sentence into tokens using the "deepcut" engine.
text_cutting = word_tokenize(text, engine="deepcut")
# Run the token list through collate() before tagging.
# NOTE(review): collate() presumably sorts the tokens, which would discard the
# original word order before POS tagging -- confirm this is intended.
text_collate = collate(text_cutting)
print("deepcut  :", text_cutting)  # The engine best suited to our task


# Tag the collated tokens; the loop below reads each entry as [0] and [1].
text_tag_list = pos_tag(text_collate)
print(text_tag_list)

# Accumulators for the loop below: text_pos receives one two-item list per
# tagged entry. How pos_vact and pos are used is not visible in this excerpt.
text_pos = []
pos_vact = []
pos = ""

for itr in range(len(text_tag_list)):
    text_pos.append([text_tag_list[itr][0],text_tag_list[itr][1]])

         
    if text_pos[itr][1] == None:
Пример #2
0
def isThai(chr):
    """Report whether every character of ``chr`` is Thai, a space, or a period.

    A character counts as Thai when its code point lies in the inclusive
    range 3584..3711. Space and "." are accepted as exceptions. An empty
    input yields True because no character fails the check.
    """
    return all(
        3584 <= ord(symbol) <= 3711 or symbol in (" ", ".")
        for symbol in chr
    )


def clean(word):
    """Decide whether ``word`` is acceptable for the word list.

    Returns True only when all of the following hold, otherwise False:

    * the word is longer than one character and passes ``isThai``;
    * it contains no tab character and no two consecutive periods;
    * it does not start with the mark "์";
    * it is not made up solely of decimal/numeric/digit characters
      or solely of whitespace;
    * the regular expression below finds at least one matching character.
    """
    # Too short, or contains something other than Thai / space / period.
    if len(word) <= 1 or not isThai(word):
        return False

    # Unwanted substrings or an unwanted leading mark.
    if "\t" in word or ".." in word or word.startswith("์"):
        return False

    # Purely numeric or purely whitespace entries are rejected.
    if (word.isdecimal() or word.isnumeric()
            or word.isspace() or word.isdigit()):
        return False

    # Accept only if the pattern matches somewhere in the word.
    return re.search(r'[^0-9a-zA-Z|^\d+?\.\d+?$]', word) is not None


# Collect every line of every input file, trimmed and with square brackets
# removed. `files` and `word_all` are defined outside this excerpt.
for path in files:
    with open(path, "r", encoding="utf-8-sig", errors='ignore') as src:
        for raw_line in src:
            word_all.append(raw_line.strip().replace("[", "").replace("]", ""))

# Drop duplicates, keep only entries accepted by clean(), then collate.
unique_words = set(word_all)
accepted_words = [entry for entry in unique_words if clean(entry)]
word_all = collate(accepted_words)
print(len(word_all))

# Write the final list, one entry per line.
with open("wordlist.txt", "w", encoding="utf-8") as out:
    out.write("\n".join(word_all))
Пример #3
0
 def test_collate(self):
     """Check ``collate`` against fixed expected orderings of Thai words."""
     # Each case pairs an input list with the exact list collate must return.
     cases = [
         (["ไก่", "กก"], ["กก", "ไก่"]),
         (["ไก่", "เป็ด", "หมู", "วัว"], ["ไก่", "เป็ด", "วัว", "หมู"]),
     ]
     for words, expected in cases:
         self.assertEqual(collate(words), expected)
Пример #4
0
 def sort(words):
     """Return the result of passing ``words`` to ``collate``."""
     ordered = collate(words)
     return ordered
Пример #5
0
 def sortReverse(words):
     """Return the result of ``collate`` on ``words`` with ``reverse=True``."""
     ordered_descending = collate(words, reverse=True)
     return ordered_descending
Пример #6
0
 def test_collate(self):
     """Check ``collate`` output for a two-word and a four-word Thai list."""
     pair_result = collate(["ไก่", "กก"])
     self.assertEqual(pair_result, ["กก", "ไก่"])

     quad_result = collate(["ไก่", "เป็ด", "หมู", "วัว"])
     self.assertEqual(quad_result, ["ไก่", "เป็ด", "วัว", "หมู"])