def getCandFromDict(self, word, max_dist=3):
    """Generate frequency-ranked correction candidates for ``word``.

    The search radius starts at 1 and widens one step at a time until at
    least one entry of ``self.small_corpus`` lies within that radius of
    ``word`` or ``max_dist`` has been tried. The entries found at the first
    successful radius are then ordered by descending frequency.

    Args:
        word: The token to find candidates for.
        max_dist: Largest edit distance that is tried (default 3, the limit
            stated by the original documentation).

    Returns:
        A ``(sort_words, sort_freq)`` pair of parallel lists. When at least
        one candidate has a positive frequency, only positive-frequency
        candidates are kept, sorted from most to least frequent. Otherwise
        every candidate is returned in corpus order with a frequency of 0.
        Both lists are empty when nothing lies within ``max_dist``.
    """
    # Measure each corpus entry once; widening the radius below only
    # re-filters these cached values. ``dist`` is defined elsewhere in the
    # file (an earlier comment describes it as Damerau-Levenshtein with a
    # transposition cost of 1 -- confirm against its definition).
    scored = [
        (pos, key, dist(key, word))
        for pos, key in enumerate(self.small_corpus)
    ]

    hits = []
    for radius in range(1, max_dist + 1):
        hits = [(pos, key) for pos, key, gap in scored if gap <= radius]
        if hits:
            break

    cand_words = [key for _, key in hits]
    # The frequency is read at the word's own position in the corpus, which
    # avoids a linear ``.index`` scan per candidate and stays correct if the
    # corpus contains a repeated word.
    freq_list = [self.freq_small_corpus[pos] for pos, _ in hits]

    if any(freq > 0 for freq in freq_list):
        # Drop candidates that have no positive frequency.
        kept = [
            (key, freq) for key, freq in zip(cand_words, freq_list) if freq > 0
        ]
        cand_words = [key for key, _ in kept]
        freq_list = [freq for _, freq in kept]

        # Most frequent first.
        sort_inds = np.argsort(freq_list)[::-1]
        sort_words = [cand_words[ind] for ind in sort_inds]
        sort_freq = [freq_list[ind] for ind in sort_inds]
    else:
        # No usable frequency information: keep corpus order.
        sort_words = cand_words[:]
        sort_freq = [0] * len(cand_words)

    return sort_words, sort_freq
def getCandFromDict(word, refined_corpus):
    """Return the entries of ``refined_corpus`` closest to ``word``.

    The search radius starts at 1 and grows by one until at least one entry
    lies within it; all entries within that radius are returned.

    Args:
        word: The token to find candidates for.
        refined_corpus: Iterable of candidate words.

    Returns:
        A list of candidate words in corpus order, or an empty list when
        ``refined_corpus`` is empty.
    """
    # Measure every entry once; this also makes a one-shot iterator safe to
    # pass in, since the radius loop below never re-reads the corpus.
    scored = [(key, dist(key, word)) for key in refined_corpus]

    # With nothing to match, widening the radius could never succeed and the
    # loop below would spin forever.
    if not scored:
        return []

    cand_words = []
    cur_dist = 0
    while not cand_words:
        cur_dist += 1
        cand_words = [key for key, gap in scored if gap <= cur_dist]
    return cand_words
def generateTrueCandCorrection(orig_sent_list, error_sent_list, correction_list):
    """Pair each correction with the misspelled word found in its sentence.

    For every index, the error sentence is scanned for the first word that
    does not occur in the original sentence and is within distance 1 of the
    correction word.

    Args:
        orig_sent_list: Original sentences (whitespace-separated strings).
        error_sent_list: Sentences containing the error, index-aligned with
            ``orig_sent_list``.
        correction_list: Correction word for each sentence, index-aligned
            with ``orig_sent_list``.

    Returns:
        A list with one single-element list per sentence, each holding a
        ``(correct_word, wrong_word)`` tuple. ``wrong_word`` is ``""`` when
        no word in the error sentence qualifies.

    Raises:
        IndexError: If ``error_sent_list`` or ``correction_list`` is shorter
            than ``orig_sent_list``.
    """
    gold_corrections = []
    for ind, orig_sent in enumerate(orig_sent_list):
        # A set makes the "not in the original sentence" test O(1) per word.
        known_words = set(orig_sent.strip().split())
        error_sent_seq = error_sent_list[ind].strip().split()
        correct_word = correction_list[ind]

        # First unseen word close enough to the correction; ``dist`` is only
        # evaluated for words that are absent from the original sentence.
        wrong_word = next(
            (
                word
                for word in error_sent_seq
                if word not in known_words and dist(word, correct_word) <= 1
            ),
            "",
        )
        gold_corrections.append([(correct_word, wrong_word)])
    return gold_corrections
def getCandFromDict_regularCheck(self, word, refined_corpus):
    """Return the entries of ``refined_corpus`` closest to ``word``.

    Finds the words most similar to the (misspelled) input at the smallest
    edit distance that yields any match: the search radius starts at 1 and
    grows by one until at least one entry lies within it.

    Args:
        word: The token to find candidates for.
        refined_corpus: Iterable of candidate words.

    Returns:
        A list of candidate words in corpus order, or an empty list when
        ``refined_corpus`` is empty.
    """
    # Measure every entry once; the radius loop below only re-filters these
    # cached distances instead of calling ``dist`` again.
    scored = [(key, dist(key, word)) for key in refined_corpus]

    # An empty corpus can never produce a match, so stop here rather than
    # widening the radius forever.
    if not scored:
        return []

    cand_words = []
    cur_dist = 0
    while not cand_words:
        cur_dist += 1
        cand_words = [key for key, gap in scored if gap <= cur_dist]
    return cand_words
def getCandFromDict(word):
    """Return the entries of the module-level ``small_corpus`` closest to ``word``.

    The search radius starts at 1 and grows by one until at least one entry
    lies within it; all entries within that radius are returned.

    Args:
        word: The token to find candidates for.

    Returns:
        A list of candidate words in corpus order, or an empty list when
        ``small_corpus`` is empty.
    """
    # Measure every entry once. ``dist`` is defined elsewhere in the file (an
    # earlier comment describes it as Damerau-Levenshtein with a
    # transposition cost of 1 -- confirm against its definition).
    scored = [(key, dist(key, word)) for key in small_corpus]

    # An empty corpus can never produce a match, so stop here rather than
    # widening the radius forever.
    if not scored:
        return []

    cand_words = []
    cur_dist = 0
    while not cand_words:
        cur_dist += 1
        cand_words = [key for key, gap in scored if gap <= cur_dist]
    return cand_words
def getCandFromDict(word, max_dist=3):
    """Generate frequency-ranked correction candidates for ``word``.

    The search radius starts at 1 and widens one step at a time until at
    least one entry of the module-level ``small_corpus`` lies within that
    radius of ``word`` or ``max_dist`` has been tried. The entries found at
    the first successful radius are then ordered by descending frequency,
    read from the module-level ``train_corpus`` mapping.

    Args:
        word: The token to find candidates for.
        max_dist: Largest edit distance that is tried (default 3, the limit
            stated by the original documentation).

    Returns:
        A ``(sort_words, sort_freq)`` pair of parallel lists. When at least
        one candidate has a positive frequency, only positive-frequency
        candidates are kept, sorted from most to least frequent. Otherwise
        every candidate is returned in corpus order with a frequency of 0.
        Both lists are empty when nothing lies within ``max_dist``.
    """
    # Measure each corpus entry once; widening the radius below only
    # re-filters these cached values. ``dist`` is defined elsewhere in the
    # file (an earlier comment describes it as Damerau-Levenshtein with a
    # transposition cost of 1 -- confirm against its definition).
    scored = [(key, dist(key, word)) for key in small_corpus]

    cand_words = []
    for radius in range(1, max_dist + 1):
        cand_words = [key for key, gap in scored if gap <= radius]
        if cand_words:
            break

    freq_list = [train_corpus[key] for key in cand_words]

    if any(freq > 0 for freq in freq_list):
        # Drop candidates that have no positive frequency.
        kept = [
            (key, freq) for key, freq in zip(cand_words, freq_list) if freq > 0
        ]
        cand_words = [key for key, _ in kept]
        freq_list = [freq for _, freq in kept]

        # Most frequent first.
        sort_inds = np.argsort(freq_list)[::-1]
        sort_words = [cand_words[ind] for ind in sort_inds]
        sort_freq = [freq_list[ind] for ind in sort_inds]
    else:
        # No usable frequency information: keep corpus order.
        sort_words = cand_words[:]
        sort_freq = [0] * len(cand_words)

    return sort_words, sort_freq