# import stopwords from pythainlp import sent_tokenize, word_tokenize, thai_characters from pythainlp.corpus import remove, get_corpus_path, get_corpus from pythainlp.corpus.common import thai_stopwords from pylexto import LexTo from pythainlp.tokenize import etcc from pythainlp.tag import pos_tag from pythainlp.util import collate text = "ค่าจ้าง ค่าเช่า ค่าตอบแทน ค่าบริการ จำเป็นต้องหัก ณ ที่จ่ายส่งสรรพากร" text_cutting = word_tokenize(text, engine="deepcut") text_collate = collate(text_cutting) print("deepcut :", text_cutting) #Engine ที่เหมาะกับงานเราที่สุด text_tag_list = pos_tag(text_collate) print(text_tag_list) text_pos = [] pos_vact = [] pos = "" for itr in range(len(text_tag_list)): text_pos.append([text_tag_list[itr][0],text_tag_list[itr][1]]) if text_pos[itr][1] == None:
def isThai(chr):
    """Return True when every character of *chr* is acceptable.

    A character is acceptable when its code point lies in 3584-3711
    inclusive (U+0E00-U+0E7F, the Thai Unicode block) or when it is a
    space or a full stop. An empty input yields True because there is
    nothing to reject.
    """
    # Space and "." are the tolerated exceptions to the code-point range.
    return all(
        3584 <= ord(symbol) <= 3711 or symbol in (" ", ".")
        for symbol in chr
    )


def clean(word):
    """Return True when *word* should be kept in the word list.

    A word is kept only if it is longer than one character, passes
    ``isThai``, contains neither a tab nor "..", does not start with the
    mark tested below, is not purely decimal/numeric/digit/whitespace,
    and the regular-expression search finds at least one character.
    """
    if len(word) <= 1 or not isThai(word):
        return False
    if "\t" in word or ".." in word:
        return False
    # Reject entries that begin with this mark (a Thai combining sign).
    if word.startswith("์"):
        return False
    if (
        word.isdecimal()
        or word.isnumeric()
        or word.isspace()
        or word.isdigit()
    ):
        return False
    # The pattern is a negated character class: it matches any character
    # that is not an ASCII alphanumeric, a digit, or one of | ^ + ? . $
    return bool(re.search(r'[^0-9a-zA-Z|^\d+?\.\d+?$]', word))


# Gather every line of every input file, trimmed and with square brackets
# removed, into the shared word_all list.
for source_path in files:
    with open(source_path, "r", encoding="utf-8-sig", errors='ignore') as handle:
        word_all.extend(
            raw_line.strip().replace("[", "").replace("]", "")
            for raw_line in handle
        )

# De-duplicate, keep only the words accepted by clean(), and order them
# with collate().
unique_words = set(word_all)
word_all = collate([candidate for candidate in unique_words if clean(candidate)])
print(len(word_all))

# Write the result one word per line.
with open("wordlist.txt", "w", encoding="utf-8") as out_file:
    out_file.write("\n".join(word_all))
def test_collate(self):
    """Check that ``collate`` returns each sample list in the expected order."""
    # Each pair is (input words, expected result of collate).
    cases = (
        (["ไก่", "กก"], ["กก", "ไก่"]),
        (["ไก่", "เป็ด", "หมู", "วัว"], ["ไก่", "เป็ด", "วัว", "หมู"]),
    )
    for unsorted_words, expected_order in cases:
        self.assertEqual(collate(unsorted_words), expected_order)
def sort(words):
    """Return *words* ordered by ``collate`` with its default settings."""
    ordered = collate(words)
    return ordered
def sortReverse(words):
    """Return *words* ordered by ``collate`` with ``reverse=True``."""
    descending = collate(words, reverse=True)
    return descending
def test_collate(self):
    """Verify the ordering ``collate`` produces for two sample word lists."""
    # Two-word sample: "กก" is expected to come before "ไก่".
    pair_input = ["ไก่", "กก"]
    pair_expected = ["กก", "ไก่"]
    self.assertEqual(collate(pair_input), pair_expected)

    # Four-word sample: the expected result swaps only the last two entries.
    animals_input = ["ไก่", "เป็ด", "หมู", "วัว"]
    animals_expected = ["ไก่", "เป็ด", "วัว", "หมู"]
    self.assertEqual(collate(animals_input), animals_expected)