Example #1
from nltk.stem.porter import PorterStemmer
from pythainlp.corpus import thai_stopwords, wordnet
from pythainlp.tokenize import word_tokenize
from stop_words import get_stop_words


def split_word(text):
    th_stop = tuple(thai_stopwords())
    en_stop = tuple(get_stop_words('en'))
    p_stemmer = PorterStemmer()

    tokens = word_tokenize(text, engine='newmm')

    # Remove Thai and English stop words
    tokens = [i for i in tokens if i not in th_stop and i not in en_stop]

    # Stem English words
    tokens = [p_stemmer.stem(i) for i in tokens]

    # Map Thai words to the first lemma of their first WordNet synset
    tokens_temp = []
    for i in tokens:
        w_syn = wordnet.synsets(i)
        if len(w_syn) > 0 and len(w_syn[0].lemma_names('tha')) > 0:
            tokens_temp.append(w_syn[0].lemma_names('tha')[0])
        else:
            tokens_temp.append(i)

    tokens = tokens_temp

    # Remove numbers
    tokens = [i for i in tokens if not i.isnumeric()]

    # Remove tokens containing spaces
    tokens = [i for i in tokens if ' ' not in i]

    return tokens
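A minimal usage sketch, assuming the PyThaiNLP, stop-words, and NLTK packages imported above are installed (the exact tokens returned depend on the dictionary versions):

tokens = split_word("ผมชอบเรียน programming มาก ๆ 123")
print(tokens)  # stop words, numbers, and space-containing tokens are removed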
Example #2
from pythainlp.corpus import thai_stopwords


def remove_stopwords(tokenized_ls):
    # Build the stop-word set once instead of querying it per token
    stop_words = set(thai_stopwords())
    removed_stopwords = []
    for text in tokenized_ls:
        removed_stopwords.append([word for word in text if word not in stop_words])
    return removed_stopwords
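`remove_stopwords` operates on a list of token lists, e.g. the output of a tokenizer run over several documents:

docs = [["ผม", "ชอบ", "กิน", "ข้าว"], ["วันนี้", "อากาศ", "ดี", "มาก"]]
cleaned = remove_stopwords(docs)  # same nested shape, Thai stop words dropped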
Example #3
from pythainlp.corpus import thai_stopwords
from transformers import (AutoModelForMaskedLM, AutoTokenizer,
                          MT5ForConditionalGeneration, MT5Tokenizer, pipeline)


class ThaiModelLoader:  # enclosing class name not shown in the source snippet
    def __init__(self):
        # mT5 model fine-tuned for Thai question answering / question generation
        self.mt5_tokenizer = MT5Tokenizer.from_pretrained(
            "Pollawat/mt5-small-thai-qa-qg")
        self.mt5_model = MT5ForConditionalGeneration.from_pretrained(
            "Pollawat/mt5-small-thai-qa-qg")

        # WangchanBERTa masked LM, wrapped in a fill-mask pipeline
        self.wangchanberta_tokenizer = AutoTokenizer.from_pretrained(
            "airesearch/wangchanberta-base-att-spm-uncased")
        self.wangchanberta_model = AutoModelForMaskedLM.from_pretrained(
            "airesearch/wangchanberta-base-att-spm-uncased")
        self.wangchanberta_pipeline = pipeline(
            task='fill-mask',
            tokenizer=self.wangchanberta_tokenizer,
            model=self.wangchanberta_model)
        self.stopwords = thai_stopwords()
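A hedged usage sketch of the fill-mask pipeline loaded above; `ThaiModelLoader` is the placeholder class name introduced in this rewrite, and the mask token is taken from the tokenizer itself:

loader = ThaiModelLoader()
mask = loader.wangchanberta_tokenizer.mask_token
# Rank the most likely fillers for the masked position
for pred in loader.wangchanberta_pipeline(f"ผมชอบกิน{mask}มาก"):
    print(pred["token_str"], pred["score"])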
Example #4
import numpy as np
import pandas as pd
from pythainlp.corpus import thai_stopwords
from pythainlp.tokenize import word_tokenize


def manageData(text):
    stop_words = set(thai_stopwords())
    word_tokens = word_tokenize(text, engine='newmm')
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    senten = " ".join(filtered_sentence).split()
    # Feature columns are the headers of dataTest.csv (everything except 'class')
    dataList = pd.read_csv('dataTest.csv',
                           usecols=lambda column: column not in ['class'])
    header = dataList.columns.values
    # Count how often each feature word occurs in the input text
    count_all = [senten.count(w) for w in header]
    # NOTE: `replace` is a helper defined elsewhere in the original project;
    # a possible stand-in is sketched after this example.
    x_train = replace(dataList.values, 1)
    y_train = pd.read_csv('dataTest.csv', usecols=['class']).values
    x_test = np.column_stack(replace(count_all, 1))
    return x_train, y_train, x_test
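The snippet calls a `replace` helper that the example page does not show. A minimal stand-in, assuming it caps term counts at a given value (an assumption, not the original implementation):

import numpy as np

def replace(counts, cap):
    # Hypothetical helper: clip counts above `cap` down to `cap`,
    # turning raw term frequencies into presence-style features.
    return np.minimum(np.asarray(counts), cap)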
Example #5
import re

from pythainlp.corpus import thai_stopwords
from pythainlp.tokenize import word_tokenize
from pythainlp.util import normalize


def text_cleaning(texts):
    clean_word = []
    stop_words = thai_stopwords()
    for text in texts:
        # Emoji character classes used to count positive / negative emoji
        pos_emoji = re.compile(
            u'[\U0001F600\U0001F603\U0001F604\U0001F601\U0001F606\U0001F60A\U0000263A\U0000FE0F\U0001F923\U0001F642\U0001F609\U0001F60C\U0001F619\U0001F617\U0001F618\U0001F970\U0001F60D\U0001F61A\U0001F60B\U0001F61B\U0001F61D\U0001F61C\U0001F973\U0001F60F\U0001F633\U0001F638\U0001F63A\U0001F63D\U0001F63B\U0001F63C\U0001F44D\U0001F3FB\U0001F91F\U0001F3FB\U0001F918\U0001F3FB\U0001F48B\U00002764\U0000FE0F\U0001F9E1\U0001F49B\U0001F49A\U0001F499\U0001F49C\U00002763\U0000FE0F\U0001F495\U0001F49E\U0001F493\U0001F497\U0001F496\U0001F498\U0001F49D]',
            flags=re.UNICODE)
        neg_emoji = re.compile(
            u'[\U0001F494\U0001F642\U0001F643\U0001F61E\U0001F612\U0001F60F\U0001F614\U0001F61F\U0001F615\U0001F641\U00002639\U0000FE0F\U0001F623\U0001F616\U0001F62B\U0001F629\U0001F97A\U0001F622\U0001F62D\U0001F60F\U0001F624\U0001F620\U0001F621\U0001F92C\U0001F92F\U0001F975\U0001F628\U0001F630\U0001F625\U0001F613\U0001F925\U0001F636\U0001F610\U0001F611\U0001F644\U0001F626\U0001F640\U0001F63E\U0001F63C\U0001F595\U0001F3FB\U0001F44E\U0001F3FB\U0001F9B6\U0001F3FB\U0001F448\U0001F3FB\U0001F91E\U0001F3FB\U0001F44B\U0001F3FB\U0001F47F\U0001F47A\U0001F921\U0001F92E\U0001F974\U0001F463]',
            flags=re.UNICODE)
        pos_count = len(re.findall(pos_emoji, text))
        neg_count = len(re.findall(neg_emoji, text))
        #text.replace('☺️', 'posemo')
        #for emo in pos_emoji: text = text.replace(emo,'posemo')

        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002500-\U00002BEF"  # chinese char
            u"\U00002702-\U000027B0"
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            u"\U0001f926-\U0001f937"
            u"\U00010000-\U0010ffff"
            u"\u2640-\u2642"
            u"\u2600-\u2B55"
            u"\u200d"
            u"\u23cf"
            u"\u23e9"
            u"\u231a"
            u"\ufe0f"  # dingbats
            u"\u3030"
            "]+",
            flags=re.UNICODE)
        text = emoji_pattern.sub(r"", text)
        # Delete links, hashtags, and mentions
        text = re.sub(r"(?:@\S*|#\S*|http(?=.*://)\S*)", "", text)
        text = re.sub(r"^https://t.co/[A-Za-z0-9]*\s", "", text)
        text = re.sub(r"\s+https://t.co/[a-zA-Z0-9]*\s", "", text)
        text = re.sub(r"\s+https://t.co/[a-zA-Z0-9]*$", "", text)
        # Count Thai laughter ("555...") before deleting it
        laugh_count = len(re.findall(r'(5)\1{2,}(6?){3,}', text))
        text = re.sub(r'(5)\1{2,}(6?){3,}', '', text)
        # Delete ASCII punctuation and digits
        text = re.sub(r'[!-@[-`{-~]', "", text)
        #text = re.sub("\d+", "", text) #number
        text = normalize(text)

        # Tokenization
        tokens = word_tokenize(text)

        # Drop empty tokens, one-letter tokens, single-character repeats,
        # and known noise words
        noise = ['xxrep', 'xxwrep', '', 'ชา', 'นนท์', 'ปอนด์', 'ป้อม']
        tokens = [t for t in tokens
                  if t and not (len(t) == 1
                                or t.count(t[0]) == len(t)
                                or t in noise)]

        # Add thailaugh / posemoji / negemoji tags
        tokens.extend(['thailaugh'] * laugh_count)
        tokens.extend(['posemoji'] * pos_count)
        tokens.extend(['negemoji'] * neg_count)


        # POS tag filtering (disabled)
        # from pythainlp.tag import pos_tag
        # pos = pos_tag(tokens, corpus='orchid_ud')
        # keep_tag = ['VERB', 'ADJ', 'ADV', 'INTJ', 'AUX']
        # keep_tag = ['VACT','VATT','ADVN','ADVI','ADVP','ADVS','FIXV','NEG','ADJ','']
        # pos_tags = [t[0] for t in pos if (t[1] in keep_tag) or (t[0] == "thailaugh")
        #             or (t[0] == "posemoji") or (t[0] == "negemoji")]
        # tokens = pos_tags

        # Delete stop words

        filtered_sentence = []
        for t in tokens:
            if t not in stop_words:
                #t = ''.join(c[0] for c in itertools.groupby(t))
                filtered_sentence.append(t)
        clean_word.append(','.join(filtered_sentence))
    return clean_word
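A quick usage sketch over two raw posts; the exact tokens depend on PyThaiNLP's dictionary, but laughter and listed emoji are folded into thailaugh / posemoji / negemoji tags:

posts = ["วันนี้อากาศดีมาก 555 #เชียงใหม่", "ไม่ชอบเลย 😡 https://t.co/abc123"]
for cleaned in text_cleaning(posts):
    print(cleaned)  # comma-joined tokens, e.g. ending in thailaugh or negemoji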
Example #6
import re

from nltk.corpus import stopwords
from pythainlp.corpus import common, tnc
from pythainlp.spell import NorvigSpellChecker
from pythainlp.tokenize import word_tokenize
from pythainlp.util import isthai
from spellchecker import SpellChecker


class Tokenizer:  # enclosing class name not shown in the source snippet
    def Tokenize_word(self, text):

        ######## Thai word segment ######## ver1
        '''sent = text[0].replace("'","")
        word = word_tokenize(sent, engine='deepcut') # use this method
        wword = [x.replace('.',' ').replace('=',' ').replace('-',' ').replace("("," ").replace(")"," ").replace("/"," ").replace('สำหรับ',' ').replace('%',' ').strip(' ') for x in word]
        words =[]
        for w in wword:
            if w not in common.thai_stopwords():
                words = [str for str in words if str]
                words.append(w)
        return words'''

        ######## Thai word segment ######## ver2 -> stop words, word type, spell check (Eng & Thai)
        sent = text[0].replace("'", "")
        word = word_tokenize(sent, engine='deepcut')  # use this method
        th_no_stopwords = []
        all_no_stopwords = []
        th_correct_words = []
        eng_correct_words = []
        mix_correct_words = []

        # Drop Thai stop words, then English stop words
        for tw in word:
            if tw not in common.thai_stopwords():
                th_no_stopwords.append(tw)
        for ew in th_no_stopwords:
            if ew not in stopwords.words('english'):
                all_no_stopwords.append(ew)

        # Build both spell checkers once, outside the loop
        spell = SpellChecker()
        checker = NorvigSpellChecker(custom_dict=tnc.word_freqs())
        for c in all_no_stopwords:
            if isthai(c):
                # Thai token: spell-correct against TNC word frequencies
                th_correct_words.append(checker.correct(c))
            elif c.isalpha():
                # Pure-alphabet (English) token: spell-correct it
                eng_correct_words.append(spell.correction(c))
            elif re.findall(r'\D', c):
                # Mixed token with at least one non-digit character: keep as-is
                mix_correct_words.append(c)
            # else: pure number, dropped

        all_correct_words = th_correct_words + eng_correct_words + mix_correct_words
        all_correct_words = [x.replace('น.', '').replace(':', ' ').replace('=', ' ').replace('–', ' ').replace("(", " ").replace(")", " ").replace("/", " ").strip(" ") for x in all_correct_words]
        return list(filter(None, all_correct_words))
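A hedged usage sketch, assuming the placeholder `Tokenizer` class name above and that the `deepcut` engine is installed:

t = Tokenizer()
# The method expects a one-element list and returns cleaned, spell-corrected tokens
print(t.Tokenize_word(["ราคา 100 บาท สำหรับวันนี้ เวลา 10.00 น."]))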
 
     
Example #7
import codecs
import re
import string
import os
import pythaispell
from pythainlp.tokenize import tcc
from pythainlp.tokenize import syllable_tokenize as word_tokenize
import sklearn_crfsuite
from pythainlp.spell.pn import NorvigSpellChecker
# Version-compatibility shim: these corpus paths moved between PyThaiNLP releases
try:
    from pythainlp.corpus.thaisyllable import get_data as syllable_dict
    from pythainlp.corpus import stopwords
    stopwords = stopwords.words('thai')
except ImportError:
    from pythainlp.corpus.common import thai_syllables, thai_stopwords
    stopwords = list(thai_stopwords())
    syllable_dict = thai_syllables

templates_file = os.path.join(os.path.dirname(pythaispell.__file__),
                              "sp.model")
invalidChars = set(string.punctuation.replace("_", ""))
dict_s = list(set(syllable_dict()))


def c(word):
    # True if the word contains at least one Thai consonant
    for i in list('กขฃคฆงจชซญฎฏฐฑฒณดตถทธนบปพฟภมยรลวศษสฬอ'):
        if i in word:
            return True
    return False
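
`c` is a plain membership test over a fixed list of Thai consonants:

print(c("กิน"))  # True: contains the Thai consonant "ก"
print(c("abc"))  # False: no Thai consonants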

Example #8
from pythainlp.corpus import thai_stopwords


def get_th_stop():
    # Thai stop words as an immutable tuple
    return tuple(thai_stopwords())
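Usage is a one-liner; the tuple form allows indexing and reuse as a constant:

th_stop = get_th_stop()
print(len(th_stop), th_stop[:5])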
Example #9
import re

from nltk.corpus import stopwords
from pythainlp.corpus import common
from pythainlp.tokenize import word_tokenize
from pythainlp.util import isthai


class Tokenizer:  # enclosing class name not shown in the source snippet
    def Tokenize_word(self, text):

        ######## Thai word segment ########
        '''sent = text[0].replace("'","")
        word = word_tokenize(sent, engine='deepcut') # use this method
        wword = [x.replace('.',' ').replace('=',' ').replace('-',' ').replace("("," ").replace(")"," ").replace("/"," ").replace('สำหรับ',' ').replace('%',' ').strip(' ') for x in word]
        words =[]
        for w in wword:
            if w not in common.thai_stopwords():
                words = [str for str in words if str]
                words.append(w)
        return words'''

        ######## Thai word segment ######## ver.2 -> stop words, word type
        sent = text[0].replace("'", "")
        word = word_tokenize(sent, engine='deepcut')  # use this method
        wword = [x.replace('.', ' ').replace('%', ' ').replace('=', ' ').replace('-', ' ').replace("(", " ").replace(")", " ").replace("/", " ").strip(' ') for x in word]
        th_no_stopwords = []
        eng_no_stopwords = []
        mix_correct_words = []

        for w in wword:
            if isthai(w):
                # Thai token: keep it unless it is a Thai stop word
                if w not in common.thai_stopwords():
                    th_no_stopwords.append(w)
            elif w not in stopwords.words('english'):
                if w.isalpha():
                    # Pure-alphabet (English) token
                    eng_no_stopwords.append(w)
                elif re.findall(r'\D', w):
                    # Mixed token with at least one non-digit character
                    mix_correct_words.append(w)
                # else: pure number, dropped

        all_correct_words = th_no_stopwords + eng_no_stopwords + mix_correct_words
        all_correct_words = [x.replace('น.', '').replace(':', ' ').replace('=', ' ').replace('–', ' ').replace("(", " ").replace(")", " ").replace("/", " ").strip(" ") for x in all_correct_words]
        return list(filter(None, all_correct_words))
    
        