def getCandFromDict(self, word):
        """
        use edit distance to generate candidates
        input: word
        output: sort_words - words within edit distance 3 of `word`, sorted by frequency (descending)
                sort_freq  - the corresponding frequencies, in the same order
        """
        # word = loaikitudau(word)
        cand_words = []
        freq_list = []
        cur_dist = 0

        # Widen the distance threshold one step at a time (capped at 3, as the
        # docstring states) until at least one candidate turns up.
        while (cand_words == [] and cur_dist < 3):
            cur_dist += 1
            # self.small_corpus holds the vocabulary of the target domain
            for key in self.small_corpus:  # smaller corpus
                # dist() is a Damerau-Levenshtein distance, so a transposition
                # costs 1 (plain Levenshtein would charge 2 for it).
                if (dist(key, word) <= cur_dist):
                    cand_words.append(key)
                    freq_list.append(
                        self.freq_small_corpus[self.small_corpus.index(key)])
            # at this point cand_words holds the words with dist <= cur_dist

        # keep only candidates that actually occur in the corpus (frequency > 0)
        if (freq_list != [] and max(freq_list) > 0):
            cand_words = [
                cand_words[ind] for ind in range(len(freq_list))
                if freq_list[ind] > 0
            ]
            freq_list = [freq for freq in freq_list if freq > 0]
            # sort candidates by frequency, highest first
            sort_inds = np.argsort(freq_list)[::-1]
            sort_words = [cand_words[ind] for ind in sort_inds]
            sort_freq = [freq_list[ind] for ind in sort_inds]
        else:
            # no candidate has a positive frequency: return them unsorted with zero counts
            sort_words = cand_words[:]
            sort_freq = [0] * len(cand_words)
        return sort_words, sort_freq
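
# Illustrative use of the method above, a sketch only: it assumes the method lives
# on a corrector object whose small_corpus and freq_small_corpus are parallel lists
# of target-domain words and their frequencies (the object name and the toy values
# below are hypothetical, not from the source):
#
#   corrector.small_corpus = ["hello", "help", "held"]
#   corrector.freq_small_corpus = [120, 45, 3]
#   corrector.getCandFromDict("helo")
#   # -> (["hello", "help", "held"], [120, 45, 3])
#   #    all three words are within edit distance 1 of "helo", sorted by frequency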
Example #2
def getCandFromDict(word, refined_corpus):
    """
    use edit distance to generate candidates
    input: word
    output: a list of candidate words
    """
    cand_words = []
    cur_dist = 0
    while (cand_words == []):
        cur_dist += 1
        for key in refined_corpus: # smaller corpus
            if (dist(key, word) <= cur_dist):
                cand_words.append(key)
    return cand_words
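
# The snippets here call dist() without defining it; the comments above describe it
# as a Damerau-Levenshtein distance, i.e. an adjacent transposition costs 1. Below is
# a minimal pure-Python sketch of such a function (the name damerau_levenshtein is
# illustrative; the original code may instead bind dist to a library implementation
# such as pyxdameraulevenshtein):
def damerau_levenshtein(a, b):
    """Optimal-string-alignment distance: insertions, deletions and substitutions
    cost 1, and swapping two adjacent characters also costs 1."""
    d = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i in range(len(a) + 1):
        d[i][0] = i
    for j in range(len(b) + 1):
        d[0][j] = j
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            cost = 0 if a[i - 1] == b[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
            if i > 1 and j > 1 and a[i - 1] == b[j - 2] and a[i - 2] == b[j - 1]:
                d[i][j] = min(d[i][j], d[i - 2][j - 2] + 1)  # adjacent transposition
    return d[len(a)][len(b)]

# e.g. damerau_levenshtein("sta", "sat") == 1, whereas plain Levenshtein gives 2.
# Note also that getCandFromDict above keeps growing cur_dist until something
# matches, so it would loop forever on an empty refined_corpus.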
Example #3
def generateTrueCandCorrection(orig_sent_list, error_sent_list, correction_list):
    """
    Build the gold (correct_word, wrong_word) pairs: for each sentence, the wrong
    word is the token that appears in the error sentence but not in the original
    one and lies within edit distance 1 of the known correction.
    """
    gold_corrections = []
    for ind in range(len(orig_sent_list)):
        orig_sent_seq = orig_sent_list[ind].strip().split()
        error_sent_seq = error_sent_list[ind].strip().split()
        correct_word = correction_list[ind]
        wrong_word = ""
        for word in error_sent_seq:
            if (word not in orig_sent_seq and dist(word, correct_word) <= 1):
                wrong_word = word
                break
        gold_corrections.append([(correct_word, wrong_word)])
    return gold_corrections
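
# Illustrative call (the sentences and correction below are made up; dist must be
# in scope, e.g. the damerau_levenshtein sketch above):
#
#   orig  = ["the cat sat on the mat"]
#   wrong = ["the cat sta on the mat"]
#   generateTrueCandCorrection(orig, wrong, ["sat"])
#   # -> [[("sat", "sta")]]   "sta" is the only token absent from the original
#   #    sentence and within edit distance 1 of the correction "sat"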
def getCandFromDict_regularCheck(self, word, refined_corpus):
    """
    use edit distance to generate candidates
    input: word
    output: a list of candidate words
    """
    # word = loaikitudau(word)
    # find the words closest to the misspelled word, i.e. those at the smallest
    # edit distance that yields at least one match
    cand_words = []
    cur_dist = 0
    while (cand_words == []):
        cur_dist += 1
        for key in refined_corpus:  # smaller corpus
            if (dist(key, word) <= cur_dist):
                cand_words.append(key)
    return cand_words
Example #5
def getCandFromDict(word):
    """
    use edit distance to generate candidates
    input: word
    output: a list of candidate words
    """
    cand_words = []
    cur_dist = 0

    # Grow the distance threshold until at least one candidate is found.
    while (cand_words == []):
        cur_dist += 1
        for key in small_corpus:  # smaller corpus
            # dist() is a Damerau-Levenshtein distance, so a transposition
            # costs 1 (plain Levenshtein would charge 2 for it).
            if (dist(key, word) <= cur_dist):
                cand_words.append(key)
    return cand_words
def getCandFromDict(word):
    """
    use edit distance to generate candidates
    input: word
    output: sort_words - words within edit distance 3 of `word`, sorted by frequency (descending)
            sort_freq  - the corresponding frequencies, in the same order
    """
    cand_words = []
    freq_list = []
    cur_dist = 0

    # Widen the distance threshold one step at a time (capped at 3, as the
    # docstring states) until at least one candidate turns up.
    while (cand_words == [] and cur_dist < 3):
        cur_dist += 1
        for key in small_corpus:  # smaller corpus
            # dist() is a Damerau-Levenshtein distance, so a transposition
            # costs 1 (plain Levenshtein would charge 2 for it).
            if (dist(key, word) <= cur_dist):
                cand_words.append(key)
                freq_list.append(train_corpus[key])

    # keep only candidates that actually occur in the training corpus (frequency > 0)
    if (freq_list != [] and max(freq_list) > 0):
        cand_words = [
            cand_words[ind] for ind in range(len(freq_list))
            if freq_list[ind] > 0
        ]
        freq_list = [freq for freq in freq_list if freq > 0]
        # sort candidates by frequency, highest first
        sort_inds = np.argsort(freq_list)[::-1]
        sort_words = [cand_words[ind] for ind in sort_inds]
        sort_freq = [freq_list[ind] for ind in sort_inds]
    else:
        # no candidate has a positive frequency: return them unsorted with zero counts
        sort_words = cand_words[:]
        sort_freq = [0] * len(cand_words)
    return sort_words, sort_freq
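
# A minimal driver sketch for the frequency-sorted variant above, assuming numpy is
# imported as np, dist is a Damerau-Levenshtein function (e.g. the
# damerau_levenshtein sketch earlier), and small_corpus / train_corpus are
# module-level globals; the toy vocabulary and counts are hypothetical:
if __name__ == "__main__":
    small_corpus = ["hello", "help", "world"]                # target-domain vocabulary
    train_corpus = {"hello": 120, "help": 45, "world": 80}   # word -> frequency
    print(getCandFromDict("helo"))
    # expected: (['hello', 'help'], [120, 45]); "world" is more than one edit away,
    # so it never enters the candidate list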