def split_word(text):
    """Tokenize mixed Thai/English text and normalize the tokens.

    Steps: newmm word tokenization, Thai+English stopword removal,
    Porter stemming (a no-op for Thai strings), mapping each token to
    its first Thai WordNet lemma when one exists, then dropping numeric
    tokens and tokens containing spaces.

    Args:
        text: raw input string.

    Returns:
        list[str]: cleaned tokens.
    """
    # Sets give O(1) membership tests (the original used tuples).
    th_stop = set(thai_stopwords())
    en_stop = set(get_stop_words('en'))
    p_stemmer = PorterStemmer()

    tokens = word_tokenize(text, engine='newmm')

    # Remove Thai and English stop words.
    tokens = [t for t in tokens if t not in th_stop and t not in en_stop]

    # English stemming (leaves Thai tokens unchanged).
    tokens = [p_stemmer.stem(t) for t in tokens]

    # Replace each token with its first Thai WordNet lemma when available.
    normalized = []
    for t in tokens:
        synsets = wordnet.synsets(t)
        # Compute lemma_names('tha') once instead of twice per token.
        lemmas = synsets[0].lemma_names('tha') if synsets else []
        normalized.append(lemmas[0] if lemmas else t)
    tokens = normalized

    # Remove numeric tokens.
    tokens = [t for t in tokens if not t.isnumeric()]
    # Remove tokens that contain a space.
    tokens = [t for t in tokens if ' ' not in t]
    return tokens
def remove_stopwords(tokenized_ls):
    """Remove Thai stopwords from a list of tokenized documents.

    Args:
        tokenized_ls: iterable of token lists, one per document.

    Returns:
        list[list[str]]: the documents with stopwords filtered out,
        preserving order.
    """
    # Build the stopword set once: the original called thai_stopwords()
    # for every single word, which is needlessly expensive.
    stop_set = set(thai_stopwords())
    return [[w for w in text if w not in stop_set] for text in tokenized_ls]
def __init__(self):
    """Load the mT5 Thai QA/QG model, the WangchanBERTa fill-mask
    pipeline, and the Thai stopword collection used downstream.
    """
    mt5_repo = "Pollawat/mt5-small-thai-qa-qg"
    berta_repo = "airesearch/wangchanberta-base-att-spm-uncased"

    # Question-generation / question-answering model.
    self.mt5_tokenizer = MT5Tokenizer.from_pretrained(mt5_repo)
    self.mt5_model = MT5ForConditionalGeneration.from_pretrained(mt5_repo)

    # Masked-LM model wrapped in a fill-mask pipeline.
    self.wangchanberta_tokenizer = AutoTokenizer.from_pretrained(berta_repo)
    self.wangchanberta_model = AutoModelForMaskedLM.from_pretrained(berta_repo)
    self.wangchanberta_pipeline = pipeline(
        task='fill-mask',
        tokenizer=self.wangchanberta_tokenizer,
        model=self.wangchanberta_model)

    self.stopwords = thai_stopwords()
def manageData(text, csv_path='dataTest.csv'):
    """Build train/test matrices from a labelled CSV and one input text.

    The text is tokenized (newmm), stopword-filtered, and each feature
    column name of the CSV is counted among the remaining tokens to form
    the test feature vector.

    Args:
        text: raw input string to vectorize.
        csv_path: path to the labelled data file. Defaults to
            'dataTest.csv' (parameterized for reuse; previous callers
            are unaffected).

    Returns:
        tuple: (x_train, y_train, x_test). x_train/x_test are passed
        through the external `replace` helper, y_train is the 'class'
        column as a 2-D array.
    """
    stop_words = set(thai_stopwords())
    word_tokens = word_tokenize(text, engine='newmm')
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    senten = " ".join(filtered_sentence).split()

    # Read the CSV once and split features/labels — the original parsed
    # the same file twice.
    full_data = pd.read_csv(csv_path)
    y_train = full_data[['class']].values
    dataList = full_data.drop(columns=['class'])

    header = dataList.columns.values
    count_all = [senten.count(w) for w in header]

    x_train = replace(dataList.values, 1)
    x_test = np.column_stack(replace(count_all, 1))
    return x_train, y_train, x_test
# Compiled once at module import: the original recompiled all three large
# patterns for every input text.
_POS_EMOJI = re.compile(
    u'[\U0001F600\U0001F603\U0001F604\U0001F601\U0001F606\U0001F60A\U0000263A\U0000FE0F\U0001F923\U0001F642\U0001F609\U0001F60C\U0001F619\U0001F617\U0001F618\U0001F970\U0001F60D\U0001F61A\U0001F60B\U0001F61B\U0001F61D\U0001F61C\U0001F973\U0001F60F\U0001F633\U0001F638\U0001F63A\U0001F63D\U0001F63B\U0001F63C\U0001F44D\U0001F3FB\U0001F91F\U0001F3FB\U0001F918\U0001F3FB\U0001F48B\U00002764\U0000FE0F\U0001F9E1\U0001F49B\U0001F49A\U0001F499\U0001F49C\U00002763\U0000FE0F\U0001F495\U0001F49E\U0001F493\U0001F497\U0001F496\U0001F498\U0001F49D]',
    flags=re.UNICODE)
_NEG_EMOJI = re.compile(
    u'[\U0001F494\U0001F642\U0001F643\U0001F61E\U0001F612\U0001F60F\U0001F614\U0001F61F\U0001F615\U0001F641\U00002639\U0000FE0F\U0001F623\U0001F616\U0001F62B\U0001F629\U0001F97A\U0001F622\U0001F62D\U0001F60F\U0001F624\U0001F620\U0001F621\U0001F92C\U0001F92F\U0001F975\U0001F628\U0001F630\U0001F625\U0001F613\U0001F925\U0001F636\U0001F610\U0001F611\U0001F644\U0001F626\U0001F640\U0001F63E\U0001F63C\U0001F595\U0001F3FB\U0001F44E\U0001F3FB\U0001F9B6\U0001F3FB\U0001F448\U0001F3FB\U0001F91E\U0001F3FB\U0001F44B\U0001F3FB\U0001F47F\U0001F47A\U0001F921\U0001F92E\U0001F974\U0001F463]',
    flags=re.UNICODE)
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002500-\U00002BEF"  # chinese char
    u"\U00002702-\U000027B0"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u200d"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\ufe0f"  # dingbats
    u"\u3030"
    "]+", flags=re.UNICODE)

def text_cleaning(texts):
    """Clean a list of Thai social-media texts.

    Per text: count positive/negative emojis and '555...' laughs, strip
    emojis / links / mentions / hashtags / ASCII symbols, normalize and
    tokenize, drop noise tokens and Thai stopwords, then append marker
    tokens ('thailaugh', 'posemoji', 'negemoji') for each counted hit.

    Args:
        texts: iterable of raw strings.

    Returns:
        list[str]: one comma-joined token string per input text.
    """
    clean_word = []
    stop_words = thai_stopwords()
    for text in texts:
        # Count sentiment emojis before stripping them.
        pos_count = len(_POS_EMOJI.findall(text))
        neg_count = len(_NEG_EMOJI.findall(text))

        text = _EMOJI_PATTERN.sub(r"", text)

        # Delete links, hashtags and mentions.
        text = re.sub(r"(?:@\S*|#\S*|http(?=.*://)\S*)", "", text)
        text = re.sub(r"^https://t.co/[A-Za-z0-9]*\s", "", text)
        text = re.sub(r"\s+https://t.co/[a-zA-Z0-9]*\s", "", text)
        text = re.sub(r"\s+https://t.co/[a-zA-Z0-9]*$", "", text)

        # Count and delete Thai laugh markers ('555...').
        laugh_count = len(re.findall(r'(5)\1{2,}(6?){3,}', text))
        text = re.sub(r'(5)\1{2,}(6?){3,}', '', text)

        # Delete ASCII symbols and punctuation.
        text = re.sub(r'[!-@[-`{-~]', "", text)
        text = normalize(text)

        # Tokenization.
        tokens = word_tokenize(text)

        # Drop one-letter tokens, tokens made of a single repeated
        # character, and known noise words (equivalent to the original
        # index-juggling pop-while-iterating loop).
        tokens = [t for t in tokens
                  if not (len(t) == 1
                          or len(t) == t.count(t[0])
                          or t in ['xxrep', 'xxwrep', '', 'ชา', 'นนท์', 'ปอนด์', 'ป้อม'])]

        # Append one marker token per counted occurrence.
        tokens.extend(['thailaugh'] * laugh_count)
        tokens.extend(['posemoji'] * pos_count)
        tokens.extend(['negemoji'] * neg_count)

        # Remove stopwords.
        filtered_sentence = [t for t in tokens if t not in stop_words]
        clean_word.append(','.join(filtered_sentence))
    return clean_word
def Tokenize_word(self, text):
    """Tokenize a Thai/English sentence, remove stopwords and
    spell-correct the remaining tokens.

    Thai tokens are corrected with pythainlp's NorvigSpellChecker (TNC
    word frequencies), pure-alphabetic Latin tokens with pyspellchecker,
    mixed alphanumeric tokens are kept as-is, and pure numbers are
    dropped.

    Args:
        text: sequence whose first element is the raw sentence.

    Returns:
        list[str]: cleaned, corrected tokens with empty strings removed.
    """
    sent = text[0].replace("'", "")
    words = word_tokenize(sent, engine='deepcut')  # deepcut segmentation

    # Hoist the expensive work out of the loops: the original rebuilt
    # the stopword collections for every token and constructed a
    # NorvigSpellChecker (loading TNC frequencies) per Thai word.
    th_stops = set(common.thai_stopwords())
    en_stops = set(stopwords.words('english'))
    th_checker = NorvigSpellChecker(custom_dict=tnc.word_freqs())
    en_checker = SpellChecker()

    # Sequential Thai-then-English stopword filtering, combined.
    no_stopwords = [w for w in words if w not in th_stops and w not in en_stops]

    th_correct_words = []
    eng_correct_words = []
    mix_correct_words = []
    for token in no_stopwords:
        if isthai(token):
            th_correct_words.append(th_checker.correct(token))
        elif token.isalpha():
            # Pure Latin letters: English spell correction.
            eng_correct_words.append(en_checker.correction(token))
        elif re.findall(r'\D', token):
            # Mixed alphanumeric: keep unchanged.
            mix_correct_words.append(token)
        # else: pure number -> dropped.

    all_correct_words = th_correct_words + eng_correct_words + mix_correct_words
    all_correct_words = [
        x.replace('น.', '').replace(':', ' ').replace('=', ' ')
         .replace('–', ' ').replace("(", " ").replace(")", " ")
         .replace("/", " ").strip(" ")
        for x in all_correct_words
    ]
    # Drop empty strings produced by the cleanup above (this also covers
    # the original's redundant per-iteration falsy filtering).
    return list(filter(None, all_correct_words))
import codecs
import re
import string
import os

import pythaispell
import sklearn_crfsuite
from pythainlp.tokenize import tcc
from pythainlp.tokenize import syllable_tokenize as word_tokenize
from pythainlp.spell.pn import NorvigSpellChecker

# pythainlp moved these APIs between versions; fall back to the newer
# locations when the legacy ones are unavailable. The original used a
# bare `except:` — narrowed to the failures this fallback is meant for.
try:
    from pythainlp.corpus.thaisyllable import get_data as syllable_dict
    from pythainlp.corpus import stopwords
    stopwords = stopwords.words('thai')
except (ImportError, AttributeError):
    from pythainlp.corpus.common import thai_syllables, thai_stopwords
    stopwords = list(thai_stopwords())
    syllable_dict = thai_syllables

# CRF model file shipped alongside the pythaispell package.
templates_file = os.path.join(os.path.dirname(pythaispell.__file__), "sp.model")
# Punctuation characters considered invalid (underscore excluded).
invalidChars = set(string.punctuation.replace("_", ""))
# Deduplicated syllable dictionary.
dict_s = list(set(syllable_dict()))

# Thai consonants used by c() below.
THAI_CONSONANTS = 'กขฃคฆงจชซญฎฏฐฑฒณดตถทธนบปพฟภมยรลวศษสฬอ'

def c(word):
    """Return True if *word* contains at least one Thai consonant."""
    return any(ch in word for ch in THAI_CONSONANTS)
def get_th_stop():
    """Return the Thai stopword collection as an immutable tuple."""
    stop_words = thai_stopwords()
    return tuple(stop_words)
def Tokenize_word(self, text):
    """Tokenize a Thai/English sentence with deepcut, strip punctuation,
    remove Thai/English stopwords and classify the remaining tokens.

    Thai tokens and pure-Latin tokens are kept (no spell correction in
    this variant), mixed alphanumeric tokens are kept as-is, and pure
    numbers are dropped.

    Args:
        text: sequence whose first element is the raw sentence.

    Returns:
        list[str]: cleaned tokens with empty strings removed.
    """
    sent = text[0].replace("'", "")
    raw_tokens = word_tokenize(sent, engine='deepcut')  # deepcut segmentation
    cleaned = [
        x.replace('.', ' ').replace('%', ' ').replace('=', ' ')
         .replace('-', ' ').replace("(", " ").replace(")", " ")
         .replace("/", " ").strip(' ')
        for x in raw_tokens
    ]

    # Hoist the stopword collections out of the loop: the original
    # called common.thai_stopwords() / stopwords.words('english') once
    # per token.
    th_stops = set(common.thai_stopwords())
    en_stops = set(stopwords.words('english'))

    th_no_stopwords = []
    eng_no_stopwords = []
    mix_words = []
    for tok in cleaned:
        if isthai(tok):
            if tok not in th_stops:
                th_no_stopwords.append(tok)
        elif tok not in en_stops:
            if tok.isalpha():
                # Pure Latin letters.
                eng_no_stopwords.append(tok)
            elif re.findall(r'\D', tok):
                # Mixed alphanumeric: keep unchanged.
                mix_words.append(tok)
            # else: pure number -> dropped.

    all_words = th_no_stopwords + eng_no_stopwords + mix_words
    all_words = [
        x.replace('น.', '').replace(':', ' ').replace('=', ' ')
         .replace('–', ' ').replace("(", " ").replace(")", " ")
         .replace("/", " ").strip(" ")
        for x in all_words
    ]
    # Drop empty strings produced by the cleanup above.
    return list(filter(None, all_words))