Example #1
from pythainlp.util import normalize
from transformers import CamembertTokenizerFast


def proc_ques(ques,
              pretrain_name='airesearch/wangchanberta-base-att-spm-uncased',
              maxlen=416):
    tokenizer = CamembertTokenizerFast.from_pretrained(pretrain_name,
                                                       model_max_length=maxlen)
    q = ques['question']
    q = q.lower()
    q = normalize(q)
    return tokenizer(q, return_tensors="pt", padding='max_length')
    # ques_ix = np.zeros(max_token, np.int64)

    # words = re.sub(
    #     r"([.,'!?\"()*#:;])",
    #     '',
    #     ques['question'].lower()
    # ).replace('-', ' ').replace('/', ' ').split()

    # for ix, word in enumerate(words):
    #     if word in token_to_ix:
    #         ques_ix[ix] = token_to_ix[word]
    #     else:
    #         ques_ix[ix] = token_to_ix['UNK']

    #     if ix + 1 == max_token:
    #         break

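A minimal usage sketch for proc_ques (not part of the original example): the input record and the expected shape are illustrative, and it assumes transformers and pythainlp are installed.

sample = {"question": "ภาษาไทยปกติหรือไม่"}  # hypothetical record in a SQuAD-like format
encoded = proc_ques(sample)
# with padding='max_length' and model_max_length=416, input_ids should have shape (1, 416)
print(encoded["input_ids"].shape)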
Example #2
import string

from pythainlp.util import normalize


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return normalize(remove_punc(lower(s.strip()))).replace('\xa0', ' ')
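A quick usage sketch with made-up strings; normalize_answer is typically used for exact-match comparison of a predicted answer against a reference.

pred = "  กรุงเทพฯ "
gold = "กรุงเทพฯ"
print(normalize_answer(pred) == normalize_answer(gold))  # True after stripping, lowercasing and normalization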
Example #3
from pythainlp.util import normalize
from transformers import CamembertTokenizerFast


def tokenize(stat_ques_list,
             pretrain_name='airesearch/wangchanberta-base-att-spm-uncased',
             maxlen=416):
    tokenizer = CamembertTokenizerFast.from_pretrained(pretrain_name)
    tokenized_dataset = []
    for q in stat_ques_list:
        q = q['question']
        q = q.lower()
        q = normalize(q)
        tokenized_dataset.append(tokenizer(q, padding='max_length'))

    return tokenized_dataset
Example #4
 def listen(self, text, play=True):
     text = normalize(text)
     self.list = word_tokenize(text)
     if play and self.engine == "thaitts":
         try:
             thaitts(" ".join(self.list), self.thaitts, "./t.wav")
             playsound('./t.wav')
         except:
             print("ไม่สามารถพูดได้ : " + str(text))  # "unable to speak: <text>"
     elif play and self.engine == "g":
         gTTS1(text, "t.mp3")
         playsound('t.mp3')
Example #5
import numpy as np
from pythainlp import tokenize
from pythainlp.util import isthai, normalize


def process_corpus(corpus, number_token="<NUM>", oov_token="<OOV>"):
    # Create an empty dictionary and token list
    dictionary = {oov_token: 1}
    tokenized_corpus = []

    corpus = corpus[1:1000]

    for entry in corpus:
        # Normalize the entry
        entry = normalize(entry)

        # Tokenize each entry
        tokens = np.array(
            tokenize.word_tokenize(entry,
                                   engine='newmm',
                                   keep_whitespace=False))

        # Remove non-Thai words
        tokens = tokens[[
            isthai(t, ignore_chars="0123456789") and t != "" for t in tokens
        ]]

        # Replace numbers with text
        #tokens = [re.sub("^\d*$",num2words(t, lang = 'th'),t) for t in tokens]

        # Add the tokens to the tokenized corpus
        tokenized_corpus.append(tokens)

        # Add the tokens to the dictionary and increment counts
        for t in tokens:
            if t in dictionary:
                dictionary[t] = dictionary[t] + 1
            else:
                dictionary[t] = 1

    return tokenized_corpus, dictionary
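A small usage sketch with a toy corpus (note that corpus[1:1000] inside the function skips the first entry), followed by a typical next step of turning the count dictionary into a token-to-index map. The names here are hypothetical.

toy_corpus = ["header row", "ผมไปโรงเรียนทุกวัน", "ฉันกินข้าวแล้ว"]
tokenized, counts = process_corpus(toy_corpus)
token_to_ix = {tok: ix for ix, tok in enumerate(counts)}  # e.g. for an embedding lookup table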
Example #6
# -*- coding: utf-8 -*-

from pythainlp.util import normalize

print(normalize("เเปลก") == "แปลก")  # เ เ ป ล ก กับ แปลก
Example #7
	def test_normalize(self):
		self.assertEqual(normalize("เเปลก"),"แปลก")
Example #8
หรืออีก
อย่างหนึ่ง
หากแต่ว่า
เหตุดังนั้น
เหตุ
นี้
เหมือนดังว่า
อย่างไรก็ดี
อย่างไรก็
ตาม
อนึ่งคือว่า
อีกประการหนึ่ง
อีก
อย่างหนึ่ง""".split("\n") # หน้า 64 http://www.arts.chula.ac.th/~ling/thesis/2556MA-LING-Nalinee.pdf
with codecs.open("corpus.txt", 'r',encoding='utf8') as f:
	lines1 = list(set(normalize(f.read()).splitlines()))
test = True  # False  # toggle the test on/off
#'''
with codecs.open("thai.txt", "r",encoding="utf8") as f:
	lines2 = f.read().splitlines()#'''
'''
from pythainlp.corpus.thaiword import get_data	
lines2 =get_data()'''
data_all=[]
thaiword=create_custom_dict_trie(list(set(ccc+lines2+stopwords+conjunctions)))
print("จำนวนประโยค : "+str(len(lines1)))
for lines in lines1:
	text=dict_word_tokenize(lines,thaiword)
	#text=word_tokenize(lines,thai_tokenize)
	data_all.append(text)
Example #9
def segment_sentences(words):
    start = 0
    sents = []
    num_true = 0.0
    num_all = 0
    for i, word in enumerate(words):
        dist = classifier.prob_classify(punct_features(words, i))
        for label in dist.samples():
            if label == True:
                num_true += dist.prob(label)
        if classifier.classify(punct_features(words,
                                              i)) == True and num_true > 0.60:
            sents.append(words[start:i + 1])
            start = i + 1
    if start < len(words):
        sents.append(words[start:])
    #print(num_true/num_all)
    return sents


while True:
    thai_sent = normalize(input("Text : "))
    #thai_word=word_tokenize(thai_sent,thai_tokenize)#
    text_all = dict_word_tokenize(thai_sent, thaiword)  #[]
    """temp=thai_sent.split(' ')
	for data in temp:
		thai_word=dict_word_tokenize(data,thaiword)
		text_all.extend(thai_word)"""
    #print(text_all)
    thai_sents = segment_sentences(text_all)
    print('sent : ' + '/'.join([''.join(i) for i in thai_sents]))
Example #11
หรืออีก
อย่างหนึ่ง
หากแต่ว่า
เหตุดังนั้น
เหตุ
นี้
เหมือนดังว่า
อย่างไรก็ดี
อย่างไรก็
ตาม
อนึ่งคือว่า
อีกประการหนึ่ง
อีก
อย่างหนึ่ง""".split("\n") # หน้า 64 http://www.arts.chula.ac.th/~ling/thesis/2556MA-LING-Nalinee.pdf
with codecs.open("corpus.txt", 'r',encoding='utf8') as f:
	lines1 = list(set(normalize(f.read()).splitlines()))
test = True  # False  # toggle the test on/off
#'''
with codecs.open("thai.txt", "r",encoding="utf8") as f:
	lines2 = f.read().splitlines()#'''
'''
from pythainlp.corpus.thaiword import get_data	
lines2 =get_data()'''
data_all=[]
thaiword=create_custom_dict_trie(list(set(ccc+lines2+stopwords+conjunctions)))
# print("จำนวนประโยค : "+str(len(lines1)))
for lines in lines1:
	text=dict_word_tokenize(lines,thaiword)
	#text=word_tokenize(lines,thai_tokenize)
	data_all.append(text)
Example #12
import glob
import os

from pythainlp.util import normalize

p = os.path.join(".", "text")
listfile = glob.glob(p + "/*.txt")


def readfile(path):
    with open(path, "r", encoding="utf-8-sig") as f:
        return f.read()


def writefile(path, data):
    with open(path, "w", encoding="utf-8") as f:
        f.write(data)


def clean(data):
    rule = [("", "์"), ("", "่"), ("", "้"), ("", "ี"), ("", "็"),
            ("", "้"), ("", "่"), ("", "ิ"), ("", "ื"), ("", "ั"),
            ("", "๊"), (" ่", "่"), (" ้", "้"), (" ๋", "๋"), (" ๊", "๊"),
            (" ็", "็"), (" ั", "ั"), (" ู้", " ู้".replace(" ", "")),
            (" ื้", " ื้".replace(" ", "")), (" ์", "์"), (" ิ", "ิ"),
            (" ื", "ื"), (" ี่", " ี่".replace(" ", ""))]
    for i in rule:
        data = data.replace(i[0], i[1])
    return data


listdata = [normalize(clean(normalize(readfile(i)))) for i in listfile]

for i, file in enumerate(listfile):
    writefile(file, listdata[i])
Example #13
หรืออีก
อย่างหนึ่ง
หากแต่ว่า
เหตุดังนั้น
เหตุ
นี้
เหมือนดังว่า
อย่างไรก็ดี
อย่างไรก็
ตาม
อนึ่งคือว่า
อีกประการหนึ่ง
อีก
อย่างหนึ่ง""".split("\n") # หน้า 64 http://www.arts.chula.ac.th/~ling/thesis/2556MA-LING-Nalinee.pdf
with codecs.open("corpus.txt", 'r',encoding='utf8') as f:
	lines1 = list(set(normalize(f.read()).splitlines()))
test = False  # True  # toggle the test on/off
#'''
with codecs.open("thai.txt", "r",encoding="utf8") as f:
	lines2 = f.read().splitlines()#'''
'''
from pythainlp.corpus.thaiword import get_data	
lines2 =get_data()'''
data_all=[]
thaiword=create_custom_dict_trie(list(set(ccc+lines2+stopwords+conjunctions)))
print("จำนวนประโยค : "+str(len(lines1)))
for lines in lines1:
	text=dict_word_tokenize(lines,thaiword)
	#text=word_tokenize(lines,thai_tokenize)
	data_all.append(text)
Example #14
 def test_normalize(self):
     self.assertEqual(normalize('เเปลก'), 'แปลก')
Example #15
from pythainlp import word_tokenize
from pythainlp.util import normalize


def word_tokenization(text):
    tokenized = " ".join(word_tokenize(normalize(text), keep_whitespace=False))
    return tokenized
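A one-line usage sketch; the exact tokens depend on pythainlp's default tokenizer, so the comment is only indicative.

print(word_tokenization("เเปลกมาก"))  # e.g. "แปลก มาก" once normalize() fixes the double sara e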
Example #17
 def test_normalize(self):
     self.assertEqual(normalize("เเปลก"), "แปลก")
     self.assertIsNotNone(normalize("พรรค์จันทร์ab์"))
Example #18
import re

from pythainlp import word_tokenize
from pythainlp.corpus import thai_stopwords
from pythainlp.util import normalize


def text_cleaning(texts):
    clean_word = []
    stop_words = thai_stopwords()
    for text in texts:
        #emoji list
        pos_emoji = re.compile(
            u'[\U0001F600\U0001F603\U0001F604\U0001F601\U0001F606\U0001F60A\U0000263A\U0000FE0F\U0001F923\U0001F642\U0001F609\U0001F60C\U0001F619\U0001F617\U0001F618\U0001F970\U0001F60D\U0001F61A\U0001F60B\U0001F61B\U0001F61D\U0001F61C\U0001F973\U0001F60F\U0001F633\U0001F638\U0001F63A\U0001F63D\U0001F63B\U0001F63C\U0001F44D\U0001F3FB\U0001F91F\U0001F3FB\U0001F918\U0001F3FB\U0001F48B\U00002764\U0000FE0F\U0001F9E1\U0001F49B\U0001F49A\U0001F499\U0001F49C\U00002763\U0000FE0F\U0001F495\U0001F49E\U0001F493\U0001F497\U0001F496\U0001F498\U0001F49D]',
            flags=re.UNICODE)
        neg_emoji = re.compile(
            u'[\U0001F494\U0001F642\U0001F643\U0001F61E\U0001F612\U0001F60F\U0001F614\U0001F61F\U0001F615\U0001F641\U00002639\U0000FE0F\U0001F623\U0001F616\U0001F62B\U0001F629\U0001F97A\U0001F622\U0001F62D\U0001F60F\U0001F624\U0001F620\U0001F621\U0001F92C\U0001F92F\U0001F975\U0001F628\U0001F630\U0001F625\U0001F613\U0001F925\U0001F636\U0001F610\U0001F611\U0001F644\U0001F626\U0001F640\U0001F63E\U0001F63C\U0001F595\U0001F3FB\U0001F44E\U0001F3FB\U0001F9B6\U0001F3FB\U0001F448\U0001F3FB\U0001F91E\U0001F3FB\U0001F44B\U0001F3FB\U0001F47F\U0001F47A\U0001F921\U0001F92E\U0001F974\U0001F463]',
            flags=re.UNICODE)
        pos_count = len(re.findall(pos_emoji, text))
        neg_count = len(re.findall(neg_emoji, text))
        #text.replace('☺️', 'posemo')
        #for emo in pos_emoji: text = text.replace(emo,'posemo')

        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002500-\U00002BEF"  # chinese char
            u"\U00002702-\U000027B0"
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            u"\U0001f926-\U0001f937"
            u"\U00010000-\U0010ffff"
            u"\u2640-\u2642"
            u"\u2600-\u2B55"
            u"\u200d"
            u"\u23cf"
            u"\u23e9"
            u"\u231a"
            u"\ufe0f"  # dingbats
            u"\u3030"
            "]+",
            flags=re.UNICODE)
        text = emoji_pattern.sub(r"", text)
        # delete links, hashtags and mentions
        text = re.sub(r"(?:@\S*|#\S*|http(?=.*://)\S*)", "", text)
        text = re.sub(r"^https://t.co/[A-Za-z0-9]*\s", "", text)
        text = re.sub(r"\s+https://t.co/[a-zA-Z0-9]*\s", "", text)
        text = re.sub(r"\s+https://t.co/[a-zA-Z0-9]*$", "", text)
        #find and delete laugh
        laugh_count = len(re.findall(r'(5)\1{2,}(6?){3,}', text))
        text = re.sub(r'(5)\1{2,}(6?){3,}', '', text)
        #delete symbol
        text = re.sub(r'[!-@[-`{-~]', "", text)
        #text = re.sub("\d+", "", text) #number
        text = normalize(text)

        #Tokenization

        tokens = word_tokenize(text)

        #deletion of whitespace & one letter text

        i = 0
        for token in list(tokens):
            if (len(token) == 1 or len(token) == token.count(token[0]) or token
                    in ['xxrep', 'xxwrep', '', 'ชา', 'นนท์', 'ปอนด์', 'ป้อม']):
                tokens.pop(i)
                i = i - 1
            i = i + 1

        # add thailaugh / posemoji / negemoji tags

        for a in range(laugh_count):
            tokens.append('thailaugh')
        for a in range(pos_count):
            tokens.append('posemoji')
        for a in range(neg_count):
            tokens.append('negemoji')


        # POS tag filtering (optional, commented out)
        # from pythainlp.tag import pos_tag
        # pos = pos_tag(tokens, corpus='orchid_ud')
        # keep_tag = ['VERB', 'ADJ', 'ADV', 'INTJ', 'AUX']
        # keep_tag = ['VACT', 'VATT', 'ADVN', 'ADVI', 'ADVP', 'ADVS', 'FIXV', 'NEG', 'ADJ', '']
        # pos_tags = [t[0] for t in pos if (t[1] in keep_tag) or (t[0] == "thailaugh")
        #             or (t[0] == "posemoji") or (t[0] == "negemoji")]
        # tokens = pos_tags

        # delete stop words

        filtered_sentence = []
        for t in tokens:
            if t not in stop_words:
                #t = ''.join(c[0] for c in itertools.groupby(t))
                filtered_sentence.append(t)
        clean_word.append(','.join(filtered_sentence))
    return clean_word
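A usage sketch with made-up tweets; each returned element is a comma-joined string of the surviving tokens plus any thailaugh/posemoji/negemoji tags.

tweets = [
    "วันนี้อากาศดีมาก 555555 😊",
    "งานเยอะจริง ๆ 😢 #เหนื่อย https://t.co/abc123 ",
]
print(text_cleaning(tweets))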
Example #19
    def test_normalize(self):
        self.assertIsNotNone(normalize("พรรค์จันทร์ab์"))

        # normalize sara e + sara e
        self.assertEqual(normalize("เเปลก"), "แปลก")

        # normalize consonant + nikhahit + sara aa
        self.assertEqual(normalize("นํา"), "นำ")
        self.assertEqual(normalize("\u0e01\u0e4d\u0e32"), "\u0e01\u0e33")

        # normalize consonant + tone mark + nikhahit + sara aa
        self.assertEqual(normalize("\u0e01\u0e48\u0e4d\u0e32"),
                         "\u0e01\u0e48\u0e33")

        # reorder consonant + follow vowel + tone mark
        self.assertEqual(normalize("\u0e01\u0e30\u0e48"), "\u0e01\u0e48\u0e30")

        # reorder consonant + nikhahit + tone mark + sara aa
        self.assertEqual(normalize("\u0e01\u0e4d\u0e48\u0e32"),
                         "\u0e01\u0e48\u0e33")

        # reorder consonant + follow vowel + tone mark
        self.assertEqual(normalize("\u0e01\u0e32\u0e48"), "\u0e01\u0e48\u0e32")

        # remove repeating following vowels
        self.assertEqual(normalize("กาา"), "กา")
        self.assertEqual(normalize("กา า  า  า"), "กา")
        self.assertEqual(normalize("กา าาะา"), "กาะา")

        # remove repeating tone marks
        self.assertEqual(normalize("\u0e01\u0e48\u0e48"), "\u0e01\u0e48")

        # remove repeating different tone marks
        self.assertEqual(normalize("\u0e01\u0e48\u0e49"), "\u0e01\u0e49")
        self.assertEqual(normalize("\u0e01\u0e48\u0e49\u0e48\u0e49"),
                         "\u0e01\u0e49")

        # remove tone mark at the beginning of text
        self.assertEqual(remove_dangling("\u0e48\u0e01"), "\u0e01")
        self.assertEqual(remove_dangling("\u0e48\u0e48\u0e01"), "\u0e01")
        self.assertEqual(remove_dangling("\u0e48\u0e49\u0e01"), "\u0e01")
        self.assertEqual(remove_dangling("\u0e48\u0e01\u0e48"), "\u0e01\u0e48")

        # remove duplicate spaces
        self.assertEqual(remove_dup_spaces("  ab  c d  "), "ab c d")
        self.assertEqual(remove_dup_spaces("\nab  c   \n d \n"), "ab c\nd")

        # remove tone marks
        self.assertEqual(remove_tonemark("จิ้น"), "จิน")
        self.assertEqual(remove_tonemark("เก๋า"), "เกา")
        self.assertEqual(delete_tone("เจ๋งเป้ง"), remove_tonemark("เจ๋งเป้ง"))
        with self.assertWarns(DeprecationWarning):
            delete_tone("ค้าบ")

        # remove zero width chars
        self.assertEqual(remove_zw("กา\u200b"), "กา")
        self.assertEqual(remove_zw("ก\u200cา"), "กา")
        self.assertEqual(remove_zw("\u200bกา"), "กา")
        self.assertEqual(remove_zw("กา\u200b\u200c\u200b"), "กา")
Example #20
from pythainlp.util import normalize


def normalize_word(text):
    return normalize(text)