def text_to_item(self, raw_text):
    res = {}
    text = self.replace_word(raw_text)
    # First pass: tokenize with the custom keyword dictionary.
    words = dict_word_tokenize(text, self.keyword_dict)
    new_w = []
    for w in words:
        if w in self.keyword:
            new_w.append(w)  # known keyword: keep it whole
        else:
            # Unknown chunk: re-tokenize with the default dictionary.
            new_w.extend(word_tokenize(w))
    words = new_w
    print(words)  # debug output
    items_translated_2 = self.translate2(words)
    # Kept for comparison against the older translate1() pipeline:
    # items = self.split_item(words)
    # items_translated_1 = [self.translate1(item) for item in items]
    # if items_translated_2 != items_translated_1:
    #     print(raw_text)
    #     print(items_translated_1)
    #     print(items_translated_2)
    if not items_translated_2:
        res["status"] = "failed to process \"{}\".".format(raw_text)
    else:
        res["status"] = "processed \"{}\" successfully.".format(raw_text)
    res["item"] = items_translated_2
    return res
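The method interleaves two tokenizers: the custom-dictionary pass keeps known keywords whole, and everything else is re-split with the default dictionary. A standalone sketch of that two-pass pattern, assuming the PyThaiNLP 2.x API where word_tokenize accepts a custom_dict trie built with dict_trie (the keyword list here is made up for illustration):

from pythainlp.tokenize import word_tokenize, dict_trie

keywords = ["ภาษาไทย"]  # hypothetical keyword list
keyword_trie = dict_trie(keywords)

def two_pass_tokenize(text):
    tokens = []
    for tok in word_tokenize(text, custom_dict=keyword_trie):
        if tok in keywords:
            tokens.append(tok)  # known keyword: keep whole
        else:
            tokens.extend(word_tokenize(tok))  # re-split with the default dictionary
    return tokens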
Example #2
def word_tokenize_to_g2p(text):
    # "data" is a module-level word -> pronunciation mapping defined elsewhere.
    wordall = dict_word_tokenize(text, custom_dict_trie=DEFAULT_DICT_TRIE)
    pronunciations = []
    for word in wordall:
        try:
            pronunciations.append(data[word])
        except KeyError:
            # Unknown word: split it with ICU and romanize each piece instead.
            for piece in word_tokenize(word, engine="icu"):
                pronunciations.append(romanization(piece, engine="pyicu"))
    return "|".join(pronunciations)
Example #3
def word_tokenize_to_g2p(text):
    # "data" is a module-level word -> pronunciation mapping defined elsewhere.
    wordall = dict_word_tokenize(text, custom_dict=DEFAULT_DICT_TRIE)
    pronunciations = []
    for word in wordall:
        try:
            pronunciations.append(data[word])
        except KeyError:
            # Unknown word: tokenize and romanize each piece instead.
            for piece in word_tokenize(word):
                pronunciations.append(romanization(piece))
    # Also split syllables on "-" so the result is a flat list.
    return "|".join(pronunciations).replace("-", "|").split("|")
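Both versions follow the same dictionary-first idea: look each token up in a pronunciation dictionary and romanize only the misses, then flatten syllables on "-". A dependency-free sketch of that pattern; g2p_dict, tokenize, and romanize below are hypothetical stand-ins, not PyThaiNLP APIs:

g2p_dict = {"กรุงเทพ": "krung-thep"}  # hypothetical pronunciation lexicon

def tokenize(text):
    return text.split()  # stand-in tokenizer

def romanize(word):
    return word.lower()  # stand-in romanizer

def to_g2p(text):
    out = []
    for word in tokenize(text):
        out.append(g2p_dict.get(word) or romanize(word))
    return "|".join(out).replace("-", "|").split("|")

print(to_g2p("กรุงเทพ hello"))  # ['krung', 'thep', 'hello']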
Example #4
    def test_dict_word_tokenize(self):
        self.assertEqual(dict_word_tokenize("", custom_dict=FROZEN_DICT_TRIE), [])
        self.assertIsNotNone(
            dict_word_tokenize("รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE)
        )
        self.assertIsNotNone(dict_trie(()))
        self.assertIsNotNone(
            dict_word_tokenize(
                "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="newmm"
            )
        )
        self.assertIsNotNone(
            dict_word_tokenize(
                "รถไฟฟ้ากรุงเทพBTSหูว์ค์",
                custom_dict=FROZEN_DICT_TRIE,
                engine="longest",
            )
        )
        self.assertIsNotNone(
            dict_word_tokenize(
                "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="mm"
            )
        )
        self.assertIsNotNone(
            dict_word_tokenize(
                "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="XX"
            )
        )
Example #5
    def test_word_tokenize(self):
        self.assertEqual(word_tokenize(""), [])
        self.assertEqual(
            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"),
            ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
        )

        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="newmm")
        )
        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="mm")
        )
        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="longest")
        )
        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="icu")
        )
        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="deepcut")
        )
        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="attacut")
        )
        self.assertIsNotNone(
            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX")
        )  # the "XX" engine does not exist

        self.assertIsNotNone(dict_trie(()))
        self.assertIsNotNone(dict_trie(("ทดสอบ", "สร้าง", "Trie")))
        self.assertIsNotNone(dict_trie(["ทดสอบ", "สร้าง", "Trie"]))
        self.assertIsNotNone(dict_trie({"ทดสอบ", "สร้าง", "Trie"}))
        self.assertIsNotNone(dict_trie(thai_words()))
        self.assertIsNotNone(dict_trie(DEFAULT_DICT_TRIE))
        self.assertIsNotNone(
            dict_trie(os.path.join(_CORPUS_PATH, _THAI_WORDS_FILENAME))
        )

        self.assertTrue(
            "ไฟ" in word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"]))
        )

        # Commented out until this unittest bug get fixed:
        # https://bugs.python.org/issue29620
        # with self.assertWarns(DeprecationWarning):
        #     dict_word_tokenize("เลิกใช้แล้ว", custom_dict=DEFAULT_DICT_TRIE)
        self.assertEqual(
            word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])),
            dict_word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])),
        )
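The final assertion documents that dict_word_tokenize now behaves as word_tokenize with custom_dict. A minimal usage sketch under that assumption, with the same imports these tests rely on:

from pythainlp.tokenize import word_tokenize, dict_trie

trie = dict_trie(["ไฟ"])  # single-entry custom dictionary
default_tokens = word_tokenize("รถไฟฟ้า")                  # default dictionary
custom_tokens = word_tokenize("รถไฟฟ้า", custom_dict=trie)  # "ไฟ" appears as a token
print(default_tokens, custom_tokens)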
Example #6
def run(self, text):
    self.sound = None
    self.text_cut = dict_word_tokenize(text, Trie(self.word_list))
    self.i = 0  # loop index
    self.num_sound = 0  # number of clips merged so far
    while self.i < len(self.text_cut):
        word = self.text_cut[self.i]
        if word in self.word_list and self.num_sound == 0:
            # First known word: start from its recorded clip.
            self.sound = AudioSegment.from_wav(
                "data/" + self.data_file[word] + ".wav")
            self.num_sound += 1
        elif word in self.word_list:
            # Later known words: append their clips.
            self.sound += AudioSegment.from_wav(
                "data/" + self.data_file[word] + ".wav")
            self.num_sound += 1
        self.i += 1
    if self.sound is not None:  # nothing to export if no word was recognized
        self.sound.export(self.file, format="wav")
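The stitching relies on pydub, where AudioSegment objects concatenate with + and export writes the result to disk. A minimal sketch of just that step, with hypothetical clip paths:

from pydub import AudioSegment

clips = ["data/hello.wav", "data/world.wav"]  # hypothetical per-word clips
combined = AudioSegment.from_wav(clips[0])
for path in clips[1:]:
    combined += AudioSegment.from_wav(path)  # "+" appends audio
combined.export("output.wav", format="wav")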
Example #7
# -*- coding: utf-8 -*-
from pythainlp.tokenize import dict_word_tokenize, create_custom_dict_trie

data = create_custom_dict_trie("wordlist.txt")
while True:
    text = input("text : ")
    print(dict_word_tokenize(text, custom_dict_trie=data, engine="newmm"))
    print("\r\n")
Example #8
อย่างหนึ่ง""".split("\n")  # page 64, http://www.arts.chula.ac.th/~ling/thesis/2556MA-LING-Nalinee.pdf
with codecs.open("corpus.txt", "r", encoding="utf8") as f:
    lines1 = list(set(normalize(f.read()).splitlines()))
test = True  # enable/disable testing
with codecs.open("thai.txt", "r", encoding="utf8") as f:
    lines2 = f.read().splitlines()
# Alternative word list from the PyThaiNLP corpus:
# from pythainlp.corpus.thaiword import get_data
# lines2 = get_data()
data_all = []
thaiword = create_custom_dict_trie(list(set(ccc + lines2 + stopwords + conjunctions)))
print("Number of sentences: " + str(len(lines1)))
for line in lines1:
    tokenized = dict_word_tokenize(line, thaiword)
    # tokenized = word_tokenize(line, thai_tokenize)
    data_all.append(tokenized)
sents = data_all
tokens = []
boundaries = set()
offset = 0

def check_punctuation(text):
    # True if the text contains any punctuation character.
    for ch in text:
        if ch in punctuation:
            return True
    return False

def num_there(s):
    return any(ch.isdigit() for ch in s)

for sent in sents:
    tokens.extend(sent)
Example #9
def segment(txt):
    # Builds the dictionary from two word-list files on every call.
    return dict_word_tokenize(text=txt,
                              data=get_data(filename) + get_data(filename2),
                              data_type="list",
                              engine="newmm")
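This helper passes word lists to dict_word_tokenize directly via data and data_type. A sketch of the rough PyThaiNLP 2.x equivalent, with a hypothetical word list standing in for get_data(filename) + get_data(filename2):

from pythainlp.tokenize import word_tokenize, dict_trie

words = ["ทดสอบ", "คำ"]  # hypothetical merged word list

def segment(txt):
    return word_tokenize(txt, custom_dict=dict_trie(words), engine="newmm")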
Example #10
# -*- coding: utf-8 -*-
import sqlite3

from pythainlp.tokenize import dict_word_tokenize, create_custom_dict_trie, word_tokenize

connection = sqlite3.connect("db.sqlite3")
cursor = connection.execute("select word from word")
wordlist = [row[0] for row in cursor.fetchall()]
# print('\n'.join(wordlist))
print("Word count: " + str(len(wordlist)))
connection.close()

dictthai = create_custom_dict_trie(wordlist)
while True:
    text = input("text : ")
    if text == "exit":
        break
    print("Result from dict : \t" + "|".join(dict_word_tokenize(text, dictthai)))
    print("Result from PyThaiNLP : \t" + "|".join(word_tokenize(text)))
Example #11
File: __init__.py  Project: xemoe/pythainlp
    def test_dict_word_tokenize(self):
        self.assertEqual(dict_word_tokenize(""), [])
Example #12
อย่างหนึ่ง""".split("\n")  # page 64, http://www.arts.chula.ac.th/~ling/thesis/2556MA-LING-Nalinee.pdf
with codecs.open("corpus.txt", "r", encoding="utf8") as f:
    lines1 = list(set(normalize(f.read()).splitlines()))
test = False  # enable/disable testing
with codecs.open("thai.txt", "r", encoding="utf8") as f:
    lines2 = f.read().splitlines()
# Alternative word list from the PyThaiNLP corpus:
# from pythainlp.corpus.thaiword import get_data
# lines2 = get_data()
data_all = []
thaiword = create_custom_dict_trie(list(set(ccc + lines2 + stopwords + conjunctions)))
print("Number of sentences: " + str(len(lines1)))
for line in lines1:
    tokenized = dict_word_tokenize(line, thaiword)
    # tokenized = word_tokenize(line, thai_tokenize)
    data_all.append(tokenized)
sents = data_all
tokens = []
boundaries = set()
offset = 0

def check_punctuation(text):
    # True if the text contains any punctuation character.
    for ch in text:
        if ch in punctuation:
            return True
    return False

def num_there(s):
    return any(ch.isdigit() for ch in s)

for sent in sents:
    tokens.extend(sent)
Example #13
    def test_dict_word_tokenize(self):
        self.assertEqual(dict_word_tokenize(""), [])