def spell_score(misspelling, candidates, method=1):
    """Score every candidate correction against a misspelling.

    :param misspelling: the misspelled token
    :param candidates: list of candidate corrections
    :param method: scoring variant, one of [1, 2, 3, 4]
    :return: list of scores, one per candidate
    :raises ValueError: if method is not in [1, 2, 3, 4]
    """
    lexical = [damerau_levenshtein_distance(misspelling, word)
               for word in candidates]
    if method == 1:
        return lexical
    # Phonetic distance on the primary Double Metaphone codes.
    phonetic = [damerau_levenshtein_distance(dm(misspelling)[0], dm(word)[0])
                for word in candidates]
    if method == 2:
        # Clamp zero phonetic distances up to 1 (distances are non-negative
        # ints, so `p or 1` substitutes 1 exactly when p == 0).
        return [p or 1 for p in phonetic]
    if method == 3:
        # Plain average of lexical and phonetic distance.
        return [0.5 * (lex + phon) for lex, phon in zip(lexical, phonetic)]
    if method == 4:
        # Squared weighted sum, lexical distance counted twice.
        return [(2 * lex + phon) ** 2 for lex, phon in zip(lexical, phonetic)]
    raise ValueError('Method must be element from [1, 2, 3, 4]')
def homepage():
    """Handle the comparison form: score two inputs and render the result.

    Reads 'input1' and 'input2' from the submitted form, computes a fuzzy
    ratio on the raw strings and on their primary Double Metaphone codes,
    checks whether the inputs are homophones, and delegates to correct().
    """
    value1 = str(request.form['input1'])
    value2 = str(request.form['input2'])
    fuzzyNoDM = fuzz.ratio(value1, value2)
    # BUG FIX: compare the *primary* metaphone code of both inputs; the
    # original compared value1's primary code ([0]) against value2's
    # secondary code ([1]), inconsistent with homophone().
    fuzzyDM = fuzz.ratio(dm(value1)[0], dm(value2)[0])
    ptr = homophone(value1, value2)
    print(ptr)
    # NOTE(review): this redirect() return value is discarded, so it has no
    # effect in Flask; kept for parity — confirm whether it was meant to be
    # returned instead of correct(...).
    redirect('/results')
    return correct(value1, value2, ptr, fuzzyDM, fuzzyNoDM)
def candidates(misspellings, language='en'):
    """Generate correction candidates for each misspelling.

    Unions Damerau-Levenshtein candidates (edit distance 2 on the surface
    form) with sound-alike candidates (edit distance 1 on the primary
    Double Metaphone code).

    :param misspellings: list of misspelled tokens
    :param language: language code selecting the lexicon file
    :return: list of candidate lists, one per misspelling
    """
    # Context manager so the lexicon file handle is closed deterministically
    # (the original json.load(open(...)) leaked the handle).
    with open("lexicon_" + language + ".json", 'r') as lexicon_file:
        vocab = json.load(lexicon_file)
    vocab_dict = load_vocab(vocab)
    print(str(len(misspellings)) + ' misspellings to generate candidates for')
    print("Generating Damerau-Levenshtein candidates")
    candidates_list = [
        levenshtein_candidates(misspelling, vocab_dict, editdistance=2)
        for misspelling in misspellings
    ]
    print("Generating Double Metaphone candidates edit distance 1")
    metaphone_dict = load_metaphones(vocab)
    vocab_dict = load_vocab(list(metaphone_dict.keys()))
    metaphone_candidates = [
        levenshtein_candidates(dm(misspelling)[0], vocab_dict, editdistance=1)
        for misspelling in misspellings
    ]
    soundslike_candidates = [
        convert_candidates(cands, detection, metaphone_dict)
        for cands, detection in zip(metaphone_candidates, misspellings)
    ]
    # Per-misspelling union of lexical and phonetic candidates.
    candidates_list = [
        list(set(lexical + phonetic))
        for lexical, phonetic in zip(candidates_list, soundslike_candidates)
    ]
    return candidates_list
def homophone(str1, str2):
    """Return 1 if the two strings share a Double Metaphone code.

    Matches on the primary codes, or on the secondary codes when both
    secondaries are non-empty. Falls through (returning None, which is
    falsy) when neither matches — preserved from the original.
    """
    # Hoist: the original recomputed dm() up to four times per string.
    codes1 = dm(str1)
    codes2 = dm(str2)
    print(codes1)
    print(codes2)
    if fuzz.ratio(codes1[0], codes2[0]) == 100:
        return 1
    elif (fuzz.ratio(codes1[1], codes2[1]) == 100
          and codes1[1] != '' and codes2[1] != ''):
        return 1
def noisychannel_ranking(self, candidates_list):
    """
    An approximate implementation of the ranking method described in
    Lai et al. (2015), 'Automated Misspelling Detection and Correction
    in Clinical Free-Text Records'
    :param candidates_list: list of candidate list per misspelling
    :return: list with corrections or k-best corrections
    """
    correction_list = []
    for misspelling, candidates in zip(self.misspellings, candidates_list):
        # No candidates: emit an empty correction and move on.
        if not candidates:
            correction_list.append('')
            continue
        score_list = []
        for candidate in candidates:
            orthographic_edit_distance = damerau_levenshtein_distance(
                misspelling, candidate)
            phonetic_edit_distance = damerau_levenshtein_distance(
                dm(misspelling)[0], dm(candidate)[0])
            # P(m|c): squared weighted sum, orthography counted twice
            spell_score = (2 * orthographic_edit_distance +
                           phonetic_edit_distance) ** 2
            # P(c): corpus frequency, defaulting to 1 for unseen candidates
            # (dict.get replaces the original try/except KeyError)
            frequency = self.frequency_dict.get(candidate, 1)
            frequency_score = 1 / (1 + log(frequency))
            # P(c|m) = P(m|c) * P(c); lower is better
            score_list.append(spell_score * frequency_score)
        scores = np.array(score_list)
        if self.k == 1:
            # The empty-candidates guard above makes scores non-empty here,
            # so the original try/except ValueError around argmin was dead.
            correction_list.append(candidates[np.argmin(scores)])
        else:
            correction_list.append(
                [candidates[i] for i in np.argsort(scores)[:self.k]])
    return correction_list
def noisychannel_ranking(self, detection_list, candidates_list):
    """
    An approximate implementation of the ranking method described in
    (Lai et al. 2015)
    :param detection_list: list of misspellings
    :param candidates_list: list of candidate list per misspelling
    :return: list with corrections or k-best corrections; also stores a
        per-misspelling confidence (relative gap between the two best
        scores) on self.confidences
    :raises ValueError: if self.k is not a positive natural number
    """
    correction_list = []
    confidences = []
    for misspelling, candidates in zip(detection_list, candidates_list):
        score_list = []
        for candidate in candidates:
            orthographic_edit_distance = damerau_levenshtein_distance(
                misspelling, candidate)
            phonetic_edit_distance = damerau_levenshtein_distance(
                dm(misspelling)[0], dm(candidate)[0])
            # P(m|c): squared weighted sum, orthography counted twice
            spell_score = (2 * orthographic_edit_distance +
                           phonetic_edit_distance) ** 2
            # P(c): corpus frequency, defaulting to 1 for unseen candidates
            frequency = self.frequency_dict.get(candidate, 1)
            frequency_score = 1 / (1 + log(frequency))
            # P(c|m) = P(m|c) * P(c); lower is better
            score_list.append(spell_score * frequency_score)
        score_list = np.array(score_list)
        if len(score_list) > 1:
            sorted_scores = np.sort(score_list)
            top1 = sorted_scores[0]
            top2 = sorted_scores[1]
            # BUG FIX: top1 can be 0 when the best candidate equals the
            # misspelling (both edit distances 0); guard the division.
            confidence = abs(top1 - top2) / top1 if top1 else 0
            confidences.append(confidence)
        else:
            confidences.append(0)
        if self.k == 1:
            try:
                correction_list.append(candidates[np.argmin(score_list)])
            except ValueError:
                # Empty candidate list: emit an empty correction.
                correction_list.append('')
        elif self.k > 1:
            correction_list.append(
                [candidates[i] for i in np.argsort(score_list)[:self.k]])
        else:
            raise ValueError('k must be positive natural number')
    self.confidences = confidences
    return correction_list
def load_metaphones(vocab):
    """
    :param vocab: either a list containing the vocabulary, or a text file
        which contains one lexical item per line
    :return: dictionary with mappings between Double Metaphone
        representations and corresponding lexical items
    """
    # MAKE METAPHONE-LEXICAL MAPPING
    metaphone_dict = {}
    for item in vocab:
        # dm() yields (primary, secondary) codes; the secondary may be ''.
        for metaphone in dm(item):
            if metaphone:
                # setdefault replaces the original try/except-KeyError
                # insert-then-append dance.
                metaphone_dict.setdefault(metaphone, []).append(item)
    return metaphone_dict
candidates_list.append( levenshtein_candidates(misspelling, vocab_dict, editdistance=1)) else: print("Generating Damerau-Levenshtein candidates edit distance 2") for i, misspelling in enumerate(detection_list): print(i) candidates_list.append( levenshtein_candidates(misspelling, vocab_dict, editdistance=2)) if sys.argv[2] == "all": print("Generating Double Metaphone candidates edit distance 1") metaphone_dict = load_metaphones(vocab) vocab_dict = load_vocab(list(metaphone_dict.keys())) metaphone_candidates = [ levenshtein_candidates(dm(misspelling)[0], vocab_dict, editdistance=1) for misspelling in detection_list ] soundslike_candidates = [ convert_candidates(candidates, detection, metaphone_dict) for candidates, detection in zip(metaphone_candidates, detection_list) ] candidates_list = [ list(set(candidates1 + candidates2)) for candidates1, candidates2 in zip(candidates_list, soundslike_candidates) ] with open(sys.argv[3], 'w') as f: