def test_wordnet(self):
    self.assertIsInstance(wordnet.langs(), list)
    self.assertIn("tha", wordnet.langs())

    self.assertEqual(
        wordnet.synset("spy.n.01").lemma_names("tha"), ["สปาย", "สายลับ"]
    )
    self.assertIsNotNone(wordnet.synsets("นก"))
    self.assertIsNotNone(wordnet.all_synsets(pos=wn.ADJ))

    self.assertIsNotNone(wordnet.lemmas("นก"))
    self.assertIsNotNone(wordnet.all_lemma_names(pos=wn.ADV))
    self.assertIsNotNone(wordnet.lemma("cat.n.01.cat"))

    self.assertEqual(wordnet.morphy("dogs"), "dog")

    bird = wordnet.synset("bird.n.01")
    mouse = wordnet.synset("mouse.n.01")
    self.assertEqual(
        wordnet.path_similarity(bird, mouse), bird.path_similarity(mouse)
    )
    self.assertEqual(
        wordnet.wup_similarity(bird, mouse), bird.wup_similarity(mouse)
    )
    self.assertEqual(
        wordnet.lch_similarity(bird, mouse), bird.lch_similarity(mouse)
    )

    cat_key = wordnet.synsets("แมว")[0].lemmas()[0].key()
    self.assertIsNotNone(wordnet.lemma_from_key(cat_key))
def find_synonyms(
    self, word: str, pos: str = None, postag_corpus: str = "lst20"
) -> List[str]:
    """
    Find synonyms from WordNet

    :param str word: word
    :param str pos: part-of-speech type
    :param str postag_corpus: POS tag corpus name
    :return: list of synonyms
    :rtype: List[str]
    """
    self.synonyms = []
    if pos is None:
        self.list_synsets = wordnet.synsets(word)
    else:
        self.p2w_pos = postype2wordnet(pos, postag_corpus)
        if self.p2w_pos != '':
            self.list_synsets = wordnet.synsets(word, pos=self.p2w_pos)
        else:
            self.list_synsets = wordnet.synsets(word)

    # Iterate over the synsets selected above so the POS filter is respected.
    for self.synset in self.list_synsets:
        for self.syn in self.synset.lemma_names(lang='tha'):
            self.synonyms.append(self.syn)

    self.synonyms_without_duplicates = list(OrderedDict.fromkeys(self.synonyms))
    return self.synonyms_without_duplicates
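# Hedged illustration (not from the original source): a minimal standalone sketch of
# the same synonym lookup using pythainlp.corpus.wordnet directly, without the
# surrounding augmenter class. The helper name find_thai_synonyms is hypothetical.
from collections import OrderedDict

from pythainlp.corpus import wordnet


def find_thai_synonyms(word: str):
    # Collect Thai lemma names from every synset of the word,
    # keeping first-seen order and dropping duplicates.
    synonyms = []
    for synset in wordnet.synsets(word):
        synonyms.extend(synset.lemma_names(lang="tha"))
    return list(OrderedDict.fromkeys(synonyms))


# Example (requires the NLTK WordNet/OMW data used by pythainlp):
# find_thai_synonyms("นก") returns the Thai lemmas of the synsets of "นก".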
def test_wordnet(self):
    self.assertIsNotNone(wordnet.langs())

    self.assertEqual(
        wordnet.synset("spy.n.01").lemma_names("tha"), ["สปาย", "สายลับ"]
    )
    self.assertIsNotNone(wordnet.synsets("นก"))
    self.assertIsNotNone(wordnet.all_synsets(pos=wn.ADJ))

    self.assertIsNotNone(wordnet.lemmas("นก"))
    self.assertIsNotNone(wordnet.all_lemma_names(pos=wn.ADV))
    self.assertIsNotNone(wordnet.lemma("cat.n.01.cat"))

    self.assertEqual(wordnet.morphy("dogs"), "dog")

    bird = wordnet.synset("bird.n.01")
    mouse = wordnet.synset("mouse.n.01")
    self.assertEqual(
        wordnet.path_similarity(bird, mouse), bird.path_similarity(mouse)
    )
    self.assertEqual(
        wordnet.wup_similarity(bird, mouse), bird.wup_similarity(mouse)
    )

    cat_key = wordnet.synsets("แมว")[0].lemmas()[0].key()
    self.assertIsNotNone(wordnet.lemma_from_key(cat_key))
def split_word(text):
    tokens = word_tokenize(text, engine='newmm')

    # # Remove Thai and English stop words
    # tokens = [i for i in tokens if not i in th_stop and not i in en_stop]

    # Find Thai and English stems
    # English
    tokens = [p_stemmer.stem(i) for i in tokens]

    # Thai
    tokens_temp = []
    for i in tokens:
        w_syn = wordnet.synsets(i)
        if (len(w_syn) > 0) and (len(w_syn[0].lemma_names('tha')) > 0):
            tokens_temp.append(w_syn[0].lemma_names('tha')[0])
        else:
            tokens_temp.append(i)

    tokens = tokens_temp

    # Remove numbers
    tokens = [i for i in tokens if not i.isnumeric()]

    # Remove spaces
    tokens = [i for i in tokens if not ' ' in i]

    # tokens_list = [split_word(txt) for txt in text_list]

    return tokens
def split_word(text):
    # Tokenize with the dictionary in the corpus I edited; it only segments
    # the food menu items I added to words.th.txt
    tokens = word_tokenize(text, engine='dict')

    # Remove Thai and English stop words
    tokens = [i for i in tokens if not i in th_stop and not i in en_stop]

    # Find Thai and English stems
    # English
    tokens = [p_stemmer.stem(i) for i in tokens]

    # Thai
    tokens_temp = []
    for i in tokens:
        w_syn = wordnet.synsets(i)
        if (len(w_syn) > 0) and (len(w_syn[0].lemma_names('tha')) > 0):
            tokens_temp.append(w_syn[0].lemma_names('tha')[0])
        else:
            tokens_temp.append(i)

    tokens = tokens_temp

    # Remove numbers
    tokens = [i for i in tokens if not i.isnumeric()]

    # Remove spaces
    tokens = [i for i in tokens if not ' ' in i]

    return tokens
def split_word(text):
    th_stop = tuple(thai_stopwords())
    en_stop = tuple(get_stop_words('en'))
    p_stemmer = PorterStemmer()

    tokens = word_tokenize(text, engine='newmm')

    # Remove Thai and English stop words
    tokens = [i for i in tokens if not i in th_stop and not i in en_stop]

    # Find Thai and English stem words
    # English
    tokens = [p_stemmer.stem(i) for i in tokens]

    # Thai
    tokens_temp = []
    for i in tokens:
        w_syn = wordnet.synsets(i)
        if (len(w_syn) > 0) and (len(w_syn[0].lemma_names('tha')) > 0):
            tokens_temp.append(w_syn[0].lemma_names('tha')[0])
        else:
            tokens_temp.append(i)

    tokens = tokens_temp

    # Remove numbers
    tokens = [i for i in tokens if not i.isnumeric()]

    # Remove spaces
    tokens = [i for i in tokens if not ' ' in i]

    return tokens
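# Hedged usage sketch (not part of the original snippet): the imports the split_word
# variant above appears to rely on, plus a note on what a call would do. The sample
# sentence is made up for illustration.
from nltk.stem.porter import PorterStemmer
from stop_words import get_stop_words

from pythainlp.corpus import thai_stopwords, wordnet
from pythainlp.tokenize import word_tokenize

# With those names in scope, split_word("แมวชอบกินปลา 2 ตัว") tokenizes with newmm,
# drops Thai/English stop words, stems English tokens, swaps each token for the first
# Thai lemma of its first synset when available, and removes numbers and spaces.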
def compute_wordnet_path_scores(pairs):
    """
    Compute WordNet path similarity for a list of input word pairs

    Note: Thai WordNet has 3 methods to compute a similarity value:
    wordnet.path_similarity, wordnet.lch_similarity, wordnet.wup_similarity
    We can't use lch_similarity; path_similarity seems to give better results
    than wup_similarity.
    If we don't find a path between the two words, we add "None" to the result list.

    @returns: the list of similarity scores, and the number of OOV word pairs
    """
    structed_oov_pairs = 0
    wn_scores = []

    for index, pair in enumerate(pairs):
        w1 = wordnet.synsets(pair[0])
        w2 = wordnet.synsets(pair[1])

        if len(w1) > 0 and len(w2) > 0:
            # just use the first synset of each term
            if WORDNET_PATH_SIMILARITY_TYPE == 'first_synset':
                path = wordnet.path_similarity(w1[0], w2[0])
            # return the highest similarity over all synset combinations
            elif WORDNET_PATH_SIMILARITY_TYPE == 'most_similar':
                path = -1
                for syn1 in w1:
                    for syn2 in w2:
                        tmppath = wordnet.path_similarity(syn1, syn2)
                        if tmppath and tmppath > path:
                            path = tmppath
                if path == -1:
                    # if no path found, set back to None
                    path = None
            else:
                raise RuntimeError(
                    'WORDNET_PATH_SIMILARITY_TYPE is not set in config!')
            wn_scores.append(path)
        else:
            wn_scores.append(None)
            structed_oov_pairs += 1

    return wn_scores, structed_oov_pairs
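# Hedged standalone example (not from the original source) of the underlying WordNet
# calls used above; the Thai words are arbitrary and the first synset of each is taken,
# mirroring the 'first_synset' mode.
from pythainlp.corpus import wordnet

w1 = wordnet.synsets("นก")   # "bird"
w2 = wordnet.synsets("แมว")  # "cat"
if w1 and w2:
    # path_similarity returns a score in (0, 1], or None when no path connects the synsets
    print(wordnet.path_similarity(w1[0], w2[0]))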
def compute_wordnet_path_scores(pairs):
    """
    Compute WordNet path similarity for a list of input word pairs

    Note: Thai WordNet has 3 methods to compute a similarity value:
    wordnet.path_similarity, wordnet.lch_similarity, wordnet.wup_similarity
    We can't use lch_similarity; path_similarity seems to give better results
    than wup_similarity.
    If we don't find a path between the two words, we add "None" to the result list.

    @returns: the list of similarity scores, and the number of OOV word pairs
    """
    print("DEBUG: starting compute_wordnet_path_scores")
    from pythainlp.corpus import wordnet

    structed_oov_pairs = 0  # wohlg: we count word pairs for which we have no path
    wn_scores = []

    for index, pair in enumerate(pairs):
        w1 = wordnet.synsets(pair[0])
        w2 = wordnet.synsets(pair[1])

        if len(w1) > 0 and len(w2) > 0:
            if WORDNET_PATH_SIMILARITY_TYPE == 'first_synset':
                # just use the first synset of each term
                path = wordnet.path_similarity(w1[0], w2[0])
                # path = wordnet.lch_similarity(w1[0], w2[0])  # we can't use it, requires the same part-of-speech for both words
                # path = wordnet.wup_similarity(w1[0], w2[0])
            elif WORDNET_PATH_SIMILARITY_TYPE == 'most_similar':
                # return the highest similarity over all synset combinations
                path = -1
                for syn1 in w1:
                    for syn2 in w2:
                        tmppath = wordnet.path_similarity(syn1, syn2)
                        if tmppath and tmppath > path:
                            path = tmppath
                if path == -1:
                    path = None  # if no path found, set back to None
            wn_scores.append(path)
        else:
            wn_scores.append(None)
            structed_oov_pairs += 1

    return wn_scores, structed_oov_pairs
def split_word(text):
    th_stop = tuple(stopwords.words('thai'))
    en_stop = tuple(get_stop_words('en'))
    p_stemmer = PorterStemmer()

    tokens = word_tokenize(text)

    # Remove Thai and English stop words
    tokens = [i for i in tokens if not i in th_stop and not i in en_stop]

    # Stem English tokens
    tokens = [p_stemmer.stem(i) for i in tokens]

    # Replace each token with its first Thai WordNet lemma when available
    tokens_temp = []
    for i in tokens:
        w_syn = wordnet.synsets(i)
        if (len(w_syn) > 0) and (len(w_syn[0].lemma_names('tha')) > 0):
            tokens_temp.append(w_syn[0].lemma_names('tha')[0])
        else:
            tokens_temp.append(i)

    tokens = tokens_temp

    # Remove numbers and tokens containing spaces
    tokens = [i for i in tokens if not i.isnumeric()]
    tokens = [i for i in tokens if not ' ' in i]

    return tokens
# 'alphabet' is assumed to be defined earlier in the script (presumably the Thai consonants).
vowel = 'เแโใไ'
e_alphabet = 'abcdefghijklmnopqrstuvwxyz'
be_alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
num = '0123456789'
al = [alphabet, vowel, e_alphabet, be_alphabet, num]

for a in al:
    for index in a:
        t_word = []
        for word in dict[index]:
            t_word.append(word)
        for word in t_word:
            tmp = dict[index][word]
            synonyms = []
            for syn in wordnet.synsets(word):
                for s in syn.lemma_names('tha'):
                    synonyms.append(s)
            synonyms = list(set(synonyms))
            print(synonyms)
            # Copy the index entry to each synonym that is not already indexed.
            for i in synonyms:
                try:
                    dict[i[0]][i]
                except KeyError:
                    try:
                        dict[i[0]][i] = tmp
                    except KeyError:
                        continue

start = time.time()
def findDocuments():
    # initialize database
    start = time.time()
    dict = SqliteDict('E:\\CPE#Y4\\databaseTF\\lastest_db\\new-db.sqlite', autocommit=True)
    dict = dict['doc']
    end = time.time()
    print("Time to initial db", end - start)

    # initialize data and test set
    q = open('test_set\\new_sample_questions_tokenize.json', mode='r', encoding="utf-8-sig")
    data = json.load(q)
    # validate = json.load(open("test_set\\new_sample_questions_answer.json", mode='r', encoding="utf-8-sig"))
    doc = 0
    data = data[doc:]
    print(data.__len__())

    string = ''
    question_words = stopwords.words('thai')
    question_words.append('กี่')
    question_words.append('ใด')
    test_output = []
    no_word = []

    for s in data:
        string += "question " + str(doc)
        print("question", doc, s)

        # segment until no space and do rule-based
        suffix = ['คือ', 'กี่', 'ใด']
        r = []
        for i in s:
            if ' ' in i:
                for j in i.split():
                    s.append(j)
                r.append(i)
                continue
            for j in suffix:
                if i.endswith(j) or i.startswith(j):
                    s.append(rreplace(i, j, ' ', 1))
                    r.append(i)
                    break
        for i in r:
            s.remove(i)

        ########################################################################
        s.sort()
        s = list(set(s))
        search = []
        cantfind = []

        # find by sqlitedict
        for f in range(s.__len__()):
            if (s[f].isspace()) or (s[f] in question_words):
                continue
            if (s[f][0] == ' ') or (s[f][-1] == ' '):
                s[f] = s[f].strip()
            try:
                tmp = dict[s[f][0]][s[f]]
                search.append((s[f], tmp))
            except KeyError:
                # if not indexed, look up synonyms instead
                cantfind.append(s[f])
                synonyms = []
                for syn in wordnet.synsets(s[f]):
                    for i in syn.lemma_names('tha'):
                        synonyms.append(i)
                if s[f] in synonyms:
                    synonyms.remove(s[f])
                for i in synonyms:
                    try:
                        tmp = dict[i[0]][i]
                        search.append((i, tmp))
                        break
                    except KeyError:
                        cantfind.append(i)
        no_word.append(cantfind)

        ########################################################################
        # remove words with the lowest mean tf-idf
        word = []
        pool = []
        search.sort(key=lambda s: s[1][0][0], reverse=True)
        for i in range(0):
            if (search.__len__() > 2):
                search.pop()
            else:
                break
        search.sort(key=lambda s: len(s[1]))
        for i in range(search.__len__()):
            try:
                word.append(search[i][0])
                pool.append(search[i][1][1:])
            except IndexError:
                break

        ########################################################################
        answer_index = []
        count = []

        # rank answers in the answer pool
        c = {}
        weight = [5, 1]
        for i in range(pool.__len__()):
            for k, v in pool[i]:
                try:
                    if i < weight.__len__():
                        c[k] += v * weight[i]
                    else:
                        c[k] += v
                except KeyError:
                    if i < weight.__len__():
                        c[k] = v * weight[i]
        for key, value in c.items():
            answer_index.append(key)
            count.append(value)

        ########################################################################
        answer_n = nlargest(count.__len__(), count)
        answer = []
        for i in answer_n:
            index = count.index(i)
            answer.append(answer_index[index])
            answer_index.pop(index)
            count.pop(index)

        print(answer.__len__(), answer[:6])
        test_output.append(answer)  # return this
        doc += 1

    return test_output, no_word
def compute_mahtab_scores(pairs):
    """
    Based on https://aclweb.org/anthology/S17-2040 Section: 3.2

    # TODO Alexey: maybe there is an implementation of this and we don't have to implement it ourselves!
    # See https://aclweb.org/anthology/S17-2040 and https://www.aclweb.org/anthology/S16-1091
    """
    from pythainlp.corpus import wordnet

    structed_oov_pairs = 0  # wohlg: we count word pairs for which we have no path
    mahtab_scores = []
    current_score = None

    for index, pair in enumerate(pairs):
        w1 = wordnet.synsets(pair[0])
        w2 = wordnet.synsets(pair[1])

        if len(w1) > 0 and len(w2) > 0:
            # remark Gerhard: we check for current_score == -1 just to check that we didn't set the score

            # *** Step 1: "If two words are exactly the same or are two different writing forms of one word
            # or belong to the same synset, the distance will be zero (D(x,y)=0)." ***

            ## words are the same
            if pair[0] == pair[1]:
                current_score = 0
                continue

            ## "are two different writing forms of one word" -- Gerhard: don't know how to handle this -> skip?!

            ## "belong to the same synset"
            s1 = wordnet.synsets(pair[0])
            s2 = wordnet.synsets(pair[1])
            # TODO: check that there is an overlapping synset

            # *** Step 2: "If two words have more than four common senses in their corresponding synsets,
            # the distance will be one (D(x, y) =1)" ***
            # TODO: compute the sets of senses of the synsets of both words, then see if the intersection has more than 4 elements

            # *** Step 3: "If there is a direct or two-level hypernym relation between the corresponding synsets of words,
            # the distance will be two (D(x, y) =2)." ***

            # *** Step 4: "If two words share any common sense, the distance will be three (D(x, y) =3)" ***

            # *** Step 5: "If two words are derivationally related, the distance will be four (D(x, y) =4)." ***
            ## What does that mean???
            ## maybe explained in https://www.aclweb.org/anthology/S16-1091

            ## Additional less strict rules
            # *** Step 6: "1. If there is any relation except hypernym between synsets of two words, the distance will be three (D(x, y) =3)." ***
            # *** Step 7: "2. If there is any two-links relation except hypernym between synsets of two words, the distance will be four (D(x, y) =4)." ***
            # *** Step 8: "3. If there is any three-links relation between synsets of two words, the distance will be five (D(x, y) =5)." ***

            # *** Step 9: "After all, if no relation is found between a pair of words to measure the distance between them,
            # the distance will be set to -1" ***
            current_score = -1

            # *** Step 10: "the distance will be set to -1 and then we calculate the similarity score using
            # equation 1 introduced by (Rychalska et al., 2016):" ***
            # see Equation (1) in the paper; we set alpha to 0.25 and beta to 1 as these values seemed to yield the best results
            if current_score < 0:
                mahtab_scores.append(0)
            else:
                s = math.exp(-0.25 * current_score)
                mahtab_scores.append(s)
            # TODO: test if the formula works correctly
            # if Alexey is ambitious he can have a look at BabelNet as well, but I think it's not necessary
        else:
            mahtab_scores.append(None)
            structed_oov_pairs += 1

    return mahtab_scores, structed_oov_pairs
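# Hedged sketch (not from the original source) of the distance-to-similarity conversion
# described in Step 10 above, with alpha assumed to be 0.25 as in the comments; the
# helper name distance_to_similarity is hypothetical.
import math


def distance_to_similarity(distance, alpha=0.25):
    # An unknown distance (-1) maps to similarity 0; otherwise apply exp(-alpha * D).
    if distance < 0:
        return 0.0
    return math.exp(-alpha * distance)


# distance_to_similarity(0) == 1.0; distance_to_similarity(2) is roughly 0.61.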
def findDocuments(start_idx=0, end_idx=0):
    # initialize database
    q = open('./data/final/final_tokenized_question.json', mode='r', encoding="utf-8-sig")  # change path
    start = time.time()
    dict = SqliteDict('./data/sqlite_db/doc_add_missing.sqlite', autocommit=True)  # change path
    dict = dict['doc']
    end = time.time()
    print("Time to initial db", end - start)

    # initialize data and test set
    # q = open('./ThaiQACorpus-EvaluationDataset-tokenize.json', mode='r', encoding="utf-8-sig")  # change path
    # validate = json.load(open("./../new_sample_questions_answer.json", mode='r', encoding="utf-8-sig"))  # change path
    doc = 0
    data = json.load(q)
    data = data[start_idx:end_idx]
    print(data.__len__())

    save = 0
    string = ''
    question_words = stopwords.words('thai')
    question_words.append('กี่')
    question_words.append('ใด')
    test_output = []

    for s in data:
        start = time.time()
        string += "question " + str(doc)
        print("question", doc, s)

        # segment until no space and do rule-based
        suffix = ['คือ', 'กี่', 'ใด']
        r = []
        for i in s:
            if ' ' in i:
                for j in i.split():
                    s.append(j)
                r.append(i)
                continue
            for j in suffix:
                if i.endswith(j) or i.startswith(j):
                    s.append(rreplace(i, j, ' ', 1))
                    r.append(i)
                    break
        for i in r:
            s.remove(i)

        ########################################################################
        s.sort()
        s = list(set(s))
        search = []
        cantfind = []

        # find by sqlitedict
        for f in range(s.__len__()):
            if (s[f].isspace()) or (s[f] in question_words):
                continue
            if (s[f][0] == ' ') or (s[f][-1] == ' '):
                s[f] = s[f].strip()
            try:
                tmp = dict[s[f][0]][s[f]]
                search.append((s[f], tmp))
            except KeyError:
                # if not indexed, look up synonyms instead
                cantfind.append(s[f])
                synonyms = []
                for syn in wordnet.synsets(s[f]):
                    for i in syn.lemma_names('tha'):
                        synonyms.append(i)
                # if synonyms.__len__() == 0:
                #     if s[f].endswith('คือ'):
                #         synonyms.append(rreplace(s[f], 'คือ', '', 1))
                #     elif s[f].endswith('กี่'):
                #         synonyms.append(rreplace(s[f], 'กี่', '', 1))
                #     elif s[f].endswith('ใด'):
                #         synonyms.append(rreplace(s[f], 'ใด', '', 1))
                #     synonyms = deepcut.tokenize(s[f])
                if s[f] in synonyms:
                    synonyms.remove(s[f])
                for i in synonyms:
                    try:
                        tmp = dict[i[0]][i]
                        search.append((i, tmp))
                        break
                    except KeyError:
                        cantfind.append(i)

        ########################################################################
        # remove words with the lowest mean tf-idf
        word = []
        pool = []
        search.sort(key=lambda s: s[1][0][0], reverse=True)
        for i in range(0):
            if (search.__len__() > 2):
                search.pop()
            else:
                break
        search.sort(key=lambda s: len(s[1]))
        for i in range(search.__len__()):
            try:
                word.append(search[i][0])
                pool.append(search[i][1][1:])
            except IndexError:
                break

        # weight shortest in case shortest + best tf-idf
        # for i in range(pool[0].__len__()):
        #     pool[0][i][1] *= 3

        ########################################################################
        answer_index = []
        count = []

        # rank answers in the answer pool
        c = {}
        weight = [5, 1]
        for i in range(pool.__len__()):
            for k, v in pool[i]:
                try:
                    if i < weight.__len__():
                        c[k] += v * weight[i]
                    else:
                        c[k] += v
                except KeyError:
                    if i < weight.__len__():
                        c[k] = v * weight[i]
        for key, value in c.items():
            answer_index.append(key)
            count.append(value)

        ########################################################################
        answer_n = nlargest(count.__len__(), count)
        answer = []
        for i in answer_n:
            index = count.index(i)
            answer.append(answer_index[index])
            answer_index.pop(index)
            count.pop(index)

        print(answer.__len__(), answer[:6])
        test_output.append(answer[:50])  # return this
        doc += 1

    return test_output
def Processing(E1):
    p_stemmer = PorterStemmer()
    ThaiWord = list(thaisw.words('thai'))
    #print(' Thaiwords : ', ThaiWord)
    EngWord = list(set(engsw.words('english')))
    #print(' ew : ', EngWord, ' : ', type(EngWord))
    Morewords = [
        u'การ', u'การทำงาน', u'ทำงาน', u'เสมอ', u'krub', u'Test', u'nan',
        u' ', u'test', u'.', u',', u'ทำ', u'-', u'/'
    ]
    All_Stop_Word = ThaiWord + EngWord + Morewords
    #print(' ALL : ', All_Stop_Word)

    EntryList = []
    for n in E1:
        # check = detect(n[0])  # th or en
        #print(' text : ', n[0], ' :: ', check)
        EntryList.append(n[0])
    #print(' EntryList : ', EntryList)

    Outcome = []
    for r in EntryList:
        Dummy = []
        tokens = []
        tokens = list(eng_tokens(r))
        lowered = [t.lower() for t in tokens]
        #print(' Dummy : ', lowered)
        lowered = " ".join(lowered)
        #Dummy = list(thai_tokens(lowered, engine='newmm'))
        words = set(thai_words())
        words.add(u'ไทยเบฟ')
        words.add(u'ผสานพลัง')
        words.add(u'โอกาส')
        words.add(u'ถังไม้โอ๊ค')
        custom_tokenizer = Tokenizer(words)
        Dummy = list(custom_tokenizer.word_tokenize(lowered))
        #print(' Dummy 2 : ', Dummy)
        Outcome.append(Dummy)
    #print(' Outcome : ', Outcome, ' : ', len(Outcome))

    NoStop = []
    for n in Outcome:
        Dummy = []
        Dummy = [word for word in n if word not in All_Stop_Word]
        NoStop.append(Dummy)
    print(' No stop : ', NoStop, ' len: ', len(NoStop))

    Lemma = []
    for n in NoStop:
        Dummy = []
        Dummy = [p_stemmer.stem(word) for word in n]
        Lemma.append(Dummy)
    print(' Lemma : ', Lemma, ' len: ', len(Lemma))

    '''
    # Instantiate the WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()

    # Lemmatize all tokens into a new list: lemmatized
    Lemma = []
    for n in NoStop:
        Dummy = []
        Dummy = [wordnet_lemmatizer.lemmatize(t) for t in n]
        Lemma.append(Dummy)
    #print(' lemma : ', Lemma, ' :: ', type(Lemma))
    '''

    # Replace tokens with their first Thai WordNet lemma when available
    Lemma_temp = []
    for n in Lemma:
        Dummy = []
        for i in n:
            w_syn = wordnet.synsets(i)
            if (len(w_syn) > 0) and (len(w_syn[0].lemma_names('tha')) > 0):
                Dummy.append(w_syn[0].lemma_names('tha')[0])
            else:
                Dummy.append(i)
        Lemma_temp.append(Dummy)
    Lemma = Lemma_temp

    # Remove numeric tokens
    Lemma_temp = []
    for n in Lemma:
        Dummy = []
        Dummy = [i for i in n if not i.isnumeric()]
        Lemma_temp.append(Dummy)
    Lemma = Lemma_temp

    # Remove tokens containing spaces
    Lemma_temp = []
    for n in Lemma:
        Dummy = []
        Dummy = [i for i in n if not ' ' in i]
        Lemma_temp.append(Dummy)
    Lemma = Lemma_temp
    #print(' lemma : ', Lemma, ' :: ', type(Lemma))

    return Lemma
# In[ ]:


wn.synset("object.n.01").lemma_names(lang="jpn")


# In[ ]:


x = list(wn.all_synsets("n"))


# In[ ]:


x[0].lemma_names(lang="tha")


# In[ ]:


wn.synsets("親", lang="jpn")


# In[ ]:


wn.synset("gray.a.01").lemma_names(lang="eng")