# NOTE: request.param is only available on a parametrized fixture; the
# params list below is reconstructed from the keys of comparer_dict.
@pytest.fixture(params=["damerau_osa", "levenshtein", "damerau_osa_fast",
                        "levenshtein_fast"])
def get_edit_distance(request):
    comparer_dict = {
        "damerau_osa": {
            "actual": EditDistance(DistanceAlgorithm.DAMERAU_OSA),
            "expected": DamerauOsa,
        },
        "levenshtein": {
            "actual": EditDistance(DistanceAlgorithm.LEVENSHTEIN),
            "expected": Levenshtein,
        },
        "damerau_osa_fast": {
            "actual": EditDistance(DistanceAlgorithm.DAMERAU_OSA_FAST),
            "expected": DamerauOsaFast,
        },
        "levenshtein_fast": {
            "actual": EditDistance(DistanceAlgorithm.LEVENSHTEIN_FAST),
            "expected": LevenshteinFast,
        },
    }
    yield (comparer_dict[request.param]["actual"],
           comparer_dict[request.param]["expected"])
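# A minimal sketch of a test consuming the fixture above. It assumes,
# as in symspellpy's editdistance module, that EditDistance exposes
# compare(string_1, string_2, max_distance) and that the expected
# comparer classes expose distance(string_1, string_2, max_distance);
# the test name and word pairs are illustrative.
def test_compare_matches_expected_comparer(get_edit_distance):
    comparer, expected_cls = get_edit_distance
    expected = expected_cls()
    for string_1, string_2 in [("flame", "blame"), ("kitten", "sitting")]:
        assert (comparer.compare(string_1, string_2, 10) ==
                expected.distance(string_1, string_2, 10))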
def lookup_compound(self, phrase, max_edit_distance,
                    ignore_non_words=False):
    """lookup_compound supports compound aware automatic spelling
    correction of multi-word input strings with three cases:
    1. mistakenly inserted space into a correct word led to two
       incorrect terms
    2. mistakenly omitted space between two correct words led to one
       incorrect combined term
    3. multiple independent input terms with/without spelling errors

    Find suggested spellings for a multi-word input string (supports
    word splitting/merging).

    Keyword arguments:
    phrase -- The string being spell checked.
    max_edit_distance -- The maximum edit distance between input and
        suggested words.
    ignore_non_words -- A flag to determine whether numbers and
        acronyms are left alone during the spell checking process.

    Return:
    A list of SuggestItem objects representing suggested correct
    spellings for the input string.
    """
    # Parse input string into single terms
    term_list_1 = helpers.parse_words(phrase)
    # Second list of single terms with preserved cases so we can ignore
    # acronyms (all cap words)
    if ignore_non_words:
        term_list_2 = helpers.parse_words(phrase, True)
    suggestions = list()
    suggestion_parts = list()
    distance_comparer = EditDistance(self._distance_algorithm)

    # translate every item to its best suggestion, otherwise it remains
    # unchanged
    is_last_combi = False
    for i, __ in enumerate(term_list_1):
        if ignore_non_words:
            if helpers.try_parse_int64(term_list_1[i]) is not None:
                suggestion_parts.append(SuggestItem(term_list_1[i], 0, 0))
                continue
            # if re.match(r"\b[A-Z]{2,}\b", term_list_2[i]):
            if helpers.is_acronym(term_list_2[i]):
                suggestion_parts.append(SuggestItem(term_list_2[i], 0, 0))
                continue
        suggestions = self.lookup(term_list_1[i], Verbosity.TOP,
                                  max_edit_distance)
        # combi check, always before split
        if i > 0 and not is_last_combi:
            suggestions_combi = self.lookup(
                term_list_1[i - 1] + term_list_1[i], Verbosity.TOP,
                max_edit_distance)
            if suggestions_combi:
                best_1 = suggestion_parts[-1]
                if suggestions:
                    best_2 = suggestions[0]
                else:
                    best_2 = SuggestItem(term_list_1[i],
                                         max_edit_distance + 1, 0)
                # make sure we're comparing with the lowercase form of
                # the previous word
                distance_1 = distance_comparer.compare(
                    term_list_1[i - 1] + " " + term_list_1[i],
                    best_1.term.lower() + " " + best_2.term,
                    max_edit_distance)
                if (distance_1 >= 0
                        and suggestions_combi[0].distance + 1 < distance_1):
                    suggestions_combi[0].distance += 1
                    suggestion_parts[-1] = suggestions_combi[0]
                    is_last_combi = True
                    continue
        is_last_combi = False

        # always split terms without suggestion / never split terms with
        # suggestion ed=0 / never split single char terms
        if (suggestions and (suggestions[0].distance == 0
                             or len(term_list_1[i]) == 1)):
            # choose best suggestion
            suggestion_parts.append(suggestions[0])
        else:
            # if no perfect suggestion, split word into pairs
            suggestions_split = list()
            # add original term
            if suggestions:
                suggestions_split.append(suggestions[0])
            if len(term_list_1[i]) > 1:
                for j in range(1, len(term_list_1[i])):
                    part_1 = term_list_1[i][:j]
                    part_2 = term_list_1[i][j:]
                    suggestions_1 = self.lookup(part_1, Verbosity.TOP,
                                                max_edit_distance)
                    if suggestions_1:
                        # if split correction 1 equals the single-word
                        # correction
                        if (suggestions and
                                suggestions[0].term == suggestions_1[0].term):
                            break
                        suggestions_2 = self.lookup(part_2, Verbosity.TOP,
                                                    max_edit_distance)
                        if suggestions_2:
                            # if split correction 2 equals the
                            # single-word correction
                            if (suggestions and
                                    suggestions[0].term == suggestions_2[0].term):
                                break
                            # select best suggestion for split pair
                            tmp_term = (suggestions_1[0].term + " " +
                                        suggestions_2[0].term)
                            tmp_distance = distance_comparer.compare(
                                term_list_1[i], tmp_term,
                                max_edit_distance)
                            if tmp_distance < 0:
                                tmp_distance = max_edit_distance + 1
                            tmp_count = min(suggestions_1[0].count,
                                            suggestions_2[0].count)
                            suggestion_split = SuggestItem(
                                tmp_term, tmp_distance, tmp_count)
                            suggestions_split.append(suggestion_split)
                            # early termination of split
                            if suggestion_split.distance == 1:
                                break
                if suggestions_split:
                    # select best suggestion for split pair
                    suggestions_split.sort()
                    suggestion_parts.append(suggestions_split[0])
                else:
                    si = SuggestItem(term_list_1[i],
                                     max_edit_distance + 1, 0)
                    suggestion_parts.append(si)
            else:
                si = SuggestItem(term_list_1[i], max_edit_distance + 1, 0)
                suggestion_parts.append(si)
    joined_term = ""
    joined_count = sys.maxsize
    for si in suggestion_parts:
        joined_term += si.term + " "
        joined_count = min(joined_count, si.count)
    suggestion = SuggestItem(
        joined_term.rstrip(),
        distance_comparer.compare(phrase, joined_term, 2**31 - 1),
        joined_count)
    suggestions_line = list()
    suggestions_line.append(suggestion)
    return suggestions_line
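# Usage sketch (hypothetical set-up; the file name is the English
# frequency dictionary bundled with symspellpy, and the misspelled
# input is illustrative). "whereis th elove" exercises all three cases
# the docstring lists: a merged pair, a split word, and a plain
# misspelling.
from symspellpy import SymSpell

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt",
                          term_index=0, count_index=1)
for suggestion in sym_spell.lookup_compound("whereis th elove",
                                            max_edit_distance=2):
    print(suggestion.term, suggestion.distance, suggestion.count)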
def test_unknown_distance_algorithm(self):
    with pytest.raises(ValueError) as excinfo:
        _ = EditDistance(2)
    assert "unknown distance algorithm" == str(excinfo.value)
def lookup(self, phrase, verbosity, max_edit_distance=None,
           include_unknown=False):
    """Find suggested spellings for a given phrase word.

    Keyword arguments:
    phrase -- The word being spell checked.
    verbosity -- The value controlling the quantity/closeness of the
        returned suggestions.
    max_edit_distance -- The maximum edit distance between phrase and
        suggested words.
    include_unknown -- Include phrase word in suggestions, if no words
        within edit distance found.

    Return:
    A list of SuggestItem objects representing suggested correct
    spellings for the phrase word, sorted by edit distance, and
    secondarily by count frequency.
    """
    if max_edit_distance is None:
        max_edit_distance = self._max_dictionary_edit_distance
    if max_edit_distance > self._max_dictionary_edit_distance:
        raise ValueError("Distance too large")
    suggestions = list()
    phrase_len = len(phrase)

    def early_exit():
        if include_unknown and not suggestions:
            suggestions.append(SuggestItem(phrase,
                                           max_edit_distance + 1, 0))
        return suggestions

    # early exit - word is too big to possibly match any words
    if phrase_len - max_edit_distance > self._max_length:
        return early_exit()

    # quick look for exact match
    suggestion_count = 0
    if phrase in self._words:
        suggestion_count = self._words[phrase]
        suggestions.append(SuggestItem(phrase, 0, suggestion_count))
        # early exit - return exact match, unless caller wants all
        # matches
        if verbosity != Verbosity.ALL:
            return early_exit()
    # early termination, if we only want to check if word in dictionary
    # or get its frequency e.g. for word segmentation
    if max_edit_distance == 0:
        return early_exit()

    considered_deletes = set()
    considered_suggestions = set()
    # we considered the phrase already in the 'phrase in self._words'
    # above
    considered_suggestions.add(phrase)

    max_edit_distance_2 = max_edit_distance
    candidate_pointer = 0
    candidates = list()

    # add original prefix
    phrase_prefix_len = phrase_len
    if phrase_prefix_len > self._prefix_length:
        phrase_prefix_len = self._prefix_length
        candidates.append(phrase[:phrase_prefix_len])
    else:
        candidates.append(phrase)
    distance_comparer = EditDistance(self._distance_algorithm)
    while candidate_pointer < len(candidates):
        candidate = candidates[candidate_pointer]
        candidate_pointer += 1
        candidate_len = len(candidate)
        len_diff = phrase_prefix_len - candidate_len

        # early termination: if candidate distance is already higher
        # than suggestion distance, then there are no better
        # suggestions to be expected
        if len_diff > max_edit_distance_2:
            # skip to next candidate if Verbosity.ALL, look no further
            # if Verbosity.TOP or CLOSEST (candidates are ordered by
            # delete distance, so none are closer than current)
            if verbosity == Verbosity.ALL:
                continue
            break

        # compute the hash once per candidate
        candidate_hash = self.get_str_hash(candidate)
        if candidate_hash in self._deletes:
            dict_suggestions = self._deletes[candidate_hash]
            for suggestion in dict_suggestions:
                if suggestion == phrase:
                    continue
                suggestion_len = len(suggestion)
                # phrase and suggestion lengths diff > allowed/current
                # best distance
                if (abs(suggestion_len - phrase_len) > max_edit_distance_2
                        # suggestion must be for a different delete
                        # string, in same bin only because of hash
                        # collision
                        or suggestion_len < candidate_len
                        # if suggestion len = delete len, then it either
                        # equals delete or is in same bin only because
                        # of hash collision
                        or (suggestion_len == candidate_len
                            and suggestion != candidate)):
                    continue
                suggestion_prefix_len = min(suggestion_len,
                                            self._prefix_length)
                if (suggestion_prefix_len > phrase_prefix_len
                        and suggestion_prefix_len - candidate_len > max_edit_distance_2):
                    continue
                # True Damerau-Levenshtein Edit Distance: adjust
                # distance, if both distances > 0
                # We allow simultaneous edits (deletes) of
                # max_edit_distance on both the dictionary and the
                # phrase term. For replaces and adjacent transposes the
                # resulting edit distance stays <= max_edit_distance.
                # For inserts and deletes the resulting edit distance
                # might exceed max_edit_distance. To prevent suggestions
                # of a higher edit distance, we need to calculate the
                # resulting edit distance, if there are simultaneous
                # edits on both sides.
                # Example: (bank==bnak and bank==bink, but bank!=kanb
                # and bank!=xban and bank!=baxn for max_edit_distance=1)
                # Two deletes on each side of a pair makes them all
                # equal, but the first two pairs have edit distance=1,
                # the others edit distance=2.
                distance = 0
                min_distance = 0
                if candidate_len == 0:
                    # suggestions which have no common chars with phrase
                    # (phrase_len<=max_edit_distance &&
                    # suggestion_len<=max_edit_distance)
                    distance = max(phrase_len, suggestion_len)
                    if (distance > max_edit_distance_2
                            or suggestion in considered_suggestions):
                        continue
                elif suggestion_len == 1:
                    # str.index() raises ValueError when the character
                    # is missing (the C# IndexOf this was ported from
                    # returned -1), so test membership instead
                    distance = (phrase_len
                                if suggestion[0] not in phrase
                                else phrase_len - 1)
                    if (distance > max_edit_distance_2
                            or suggestion in considered_suggestions):
                        continue
                # number of edits in prefix == max_edit_distance AND no
                # identical suffix, then edit distance >
                # max_edit_distance and no need for Levenshtein
                # calculation
                # (phrase_len >= prefix_length) &&
                # (suggestion_len >= prefix_length)
                else:
                    # handles the short-circuit of min_distance
                    # assignment when first boolean expression evaluates
                    # to False
                    if self._prefix_length - max_edit_distance == candidate_len:
                        min_distance = (min(phrase_len, suggestion_len) -
                                        self._prefix_length)
                    else:
                        min_distance = 0
                    # pylint: disable=C0301,R0916
                    if (self._prefix_length - max_edit_distance == candidate_len
                            and (min_distance > 1
                                 and phrase[phrase_len + 1 - min_distance:] !=
                                 suggestion[suggestion_len + 1 - min_distance:])
                            or (min_distance > 0
                                and phrase[phrase_len - min_distance] !=
                                suggestion[suggestion_len - min_distance]
                                and (phrase[phrase_len - min_distance - 1] !=
                                     suggestion[suggestion_len - min_distance]
                                     or phrase[phrase_len - min_distance] !=
                                     suggestion[suggestion_len - min_distance - 1]))):
                        continue
                    else:
                        # delete_in_suggestion_prefix is somewhat
                        # expensive, and only pays off when verbosity is
                        # TOP or CLOSEST
                        if ((verbosity != Verbosity.ALL
                             and not self.delete_in_suggestion_prefix(
                                 candidate, candidate_len, suggestion,
                                 suggestion_len))
                                or suggestion in considered_suggestions):
                            continue
                        considered_suggestions.add(suggestion)
                        distance = distance_comparer.compare(
                            phrase, suggestion, max_edit_distance_2)
                        if distance < 0:
                            continue
                # do not process higher distances than those already
                # found, if verbosity<ALL (note: max_edit_distance_2
                # will always equal max_edit_distance when
                # Verbosity.ALL)
                if distance <= max_edit_distance_2:
                    suggestion_count = self._words[suggestion]
                    si = SuggestItem(suggestion, distance,
                                     suggestion_count)
                    if suggestions:
                        if verbosity == Verbosity.CLOSEST:
                            # we will calculate DamLev distance only to
                            # the smallest found distance so far
                            if distance < max_edit_distance_2:
                                suggestions = list()
                        elif verbosity == Verbosity.TOP:
                            if (distance < max_edit_distance_2
                                    or suggestion_count > suggestions[0].count):
                                max_edit_distance_2 = distance
                                suggestions[0] = si
                            continue
                    if verbosity != Verbosity.ALL:
                        max_edit_distance_2 = distance
                    suggestions.append(si)
        # add edits: derive edits (deletes) from candidate (phrase) and
        # add them to candidates list. this is a recursive process until
        # the maximum edit distance has been reached
        if (len_diff < max_edit_distance
                and candidate_len <= self._prefix_length):
            # do not create edits with edit distance smaller than
            # suggestions already found
            if (verbosity != Verbosity.ALL
                    and len_diff >= max_edit_distance_2):
                continue
            for i in range(candidate_len):
                delete = candidate[:i] + candidate[i + 1:]
                if delete not in considered_deletes:
                    considered_deletes.add(delete)
                    candidates.append(delete)
    if len(suggestions) > 1:
        suggestions.sort()
    return suggestions
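# Usage sketch for lookup (hypothetical set-up; the file name is the
# dictionary bundled with symspellpy and the misspelling is
# illustrative). Verbosity.TOP returns the single best suggestion,
# CLOSEST all suggestions at the smallest found distance, ALL every
# suggestion within max_edit_distance.
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt",
                          term_index=0, count_index=1)
for item in sym_spell.lookup("memebers", Verbosity.CLOSEST,
                             max_edit_distance=2, include_unknown=True):
    print(item.term, item.distance, item.count)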
def lookup_compound(self, phrase, max_edit_distance,
                    ignore_non_words=False, transfer_casing=False):
    """`lookup_compound` supports compound aware automatic spelling
    correction of multi-word input strings with three cases:

    1. mistakenly inserted space into a correct word led to two
       incorrect terms
    2. mistakenly omitted space between two correct words led to one
       incorrect combined term
    3. multiple independent input terms with/without spelling errors

    Find suggested spellings for a multi-word input string (supports
    word splitting/merging).

    **Args**:

    * phrase (str): The string being spell checked.
    * max_edit_distance (int): The maximum edit distance between\
        input and suggested words.
    * ignore_non_words (bool): A flag to determine whether numbers\
        and acronyms are left alone during the spell checking\
        process.
    * transfer_casing (bool): A flag to determine whether the casing\
        (e.g. upper- vs lowercase) should be carried over from the\
        phrase.

    **Returns**:
    A list of :class:`SuggestItem` objects representing suggested\
    correct spellings for the input string.
    """
    # Parse input string into single terms
    term_list_1 = helpers.parse_words(phrase)
    # Second list of single terms with preserved cases so we can
    # ignore acronyms (all cap words)
    if ignore_non_words:
        term_list_2 = helpers.parse_words(phrase, True)
    suggestions = list()
    suggestion_parts = list()
    distance_comparer = EditDistance(self._distance_algorithm)

    # translate every item to its best suggestion, otherwise it
    # remains unchanged
    is_last_combi = False
    for i, __ in enumerate(term_list_1):
        if ignore_non_words:
            if helpers.try_parse_int64(term_list_1[i]) is not None:
                suggestion_parts.append(SuggestItem(term_list_1[i], 0, 0))
                continue
            if helpers.is_acronym(term_list_2[i]):
                suggestion_parts.append(SuggestItem(term_list_2[i], 0, 0))
                continue
        suggestions = self.lookup(term_list_1[i], Verbosity.TOP,
                                  max_edit_distance)
        # combi check, always before split
        if i > 0 and not is_last_combi:
            suggestions_combi = self.lookup(
                term_list_1[i - 1] + term_list_1[i], Verbosity.TOP,
                max_edit_distance)
            if suggestions_combi:
                best_1 = suggestion_parts[-1]
                if suggestions:
                    best_2 = suggestions[0]
                else:
                    # estimated word occurrence probability
                    # P = 10 / (N * 10^word_length)
                    best_2 = SuggestItem(term_list_1[i],
                                         max_edit_distance + 1,
                                         10 // 10**len(term_list_1[i]))
                # distance_1 = edit distance between the two split
                # terms and their best corrections, used as a
                # comparative value for the combination
                distance_1 = best_1.distance + best_2.distance
                if (distance_1 >= 0
                        and (suggestions_combi[0].distance + 1 < distance_1
                             or (suggestions_combi[0].distance + 1 == distance_1
                                 and (suggestions_combi[0].count >
                                      best_1.count / self.N * best_2.count)))):
                    suggestions_combi[0].distance += 1
                    suggestion_parts[-1] = suggestions_combi[0]
                    is_last_combi = True
                    continue
        is_last_combi = False

        # always split terms without suggestion / never split terms
        # with suggestion ed=0 / never split single char terms
        if suggestions and (suggestions[0].distance == 0
                            or len(term_list_1[i]) == 1):
            # choose best suggestion
            suggestion_parts.append(suggestions[0])
        else:
            # if no perfect suggestion, split word into pairs
            suggestion_split_best = None
            # add original term
            if suggestions:
                suggestion_split_best = suggestions[0]
            if len(term_list_1[i]) > 1:
                for j in range(1, len(term_list_1[i])):
                    part_1 = term_list_1[i][:j]
                    part_2 = term_list_1[i][j:]
                    suggestions_1 = self.lookup(part_1, Verbosity.TOP,
                                                max_edit_distance)
                    if suggestions_1:
                        suggestions_2 = self.lookup(part_2, Verbosity.TOP,
                                                    max_edit_distance)
                        if suggestions_2:
                            # select best suggestion for split pair
                            tmp_term = (suggestions_1[0].term + " " +
                                        suggestions_2[0].term)
                            tmp_distance = distance_comparer.compare(
                                term_list_1[i], tmp_term,
                                max_edit_distance)
                            if tmp_distance < 0:
                                tmp_distance = max_edit_distance + 1
                            if suggestion_split_best is not None:
                                if tmp_distance > suggestion_split_best.distance:
                                    continue
                                if tmp_distance < suggestion_split_best.distance:
                                    suggestion_split_best = None
                            if tmp_term in self._bigrams:
                                tmp_count = self._bigrams[tmp_term]
                                # increase count, if split corrections
                                # are part of or identical to input and
                                # a single term correction exists
                                if suggestions:
                                    best_si = suggestions[0]
                                    # alternatively remove the single
                                    # term from suggestion_split, but
                                    # then other splittings could win
                                    if (suggestions_1[0].term +
                                            suggestions_2[0].term ==
                                            term_list_1[i]):
                                        # make count bigger than count
                                        # of single term correction
                                        tmp_count = max(
                                            tmp_count, best_si.count + 2)
                                    elif (suggestions_1[0].term == best_si.term
                                          or suggestions_2[0].term == best_si.term):
                                        # make count bigger than count
                                        # of single term correction
                                        tmp_count = max(
                                            tmp_count, best_si.count + 1)
                                # no single term correction exists
                                elif (suggestions_1[0].term +
                                      suggestions_2[0].term ==
                                      term_list_1[i]):
                                    tmp_count = max(
                                        tmp_count,
                                        max(suggestions_1[0].count,
                                            suggestions_2[0].count) + 2)
                            else:
                                # The Naive Bayes probability of the
                                # word combination is the product of
                                # the two word probabilities:
                                # P(AB) = P(A) * P(B)
                                # use it to estimate the frequency
                                # count of the combination, which then
                                # is used to rank/select the best
                                # splitting variant
                                tmp_count = min(
                                    self.bigram_count_min,
                                    int(suggestions_1[0].count / self.N *
                                        suggestions_2[0].count))
                            suggestion_split = SuggestItem(
                                tmp_term, tmp_distance, tmp_count)
                            if (suggestion_split_best is None
                                    or suggestion_split.count >
                                    suggestion_split_best.count):
                                suggestion_split_best = suggestion_split
                if suggestion_split_best is not None:
                    # select best suggestion for split pair
                    suggestion_parts.append(suggestion_split_best)
                    self._replaced_words[
                        term_list_1[i]] = suggestion_split_best
                else:
                    si = SuggestItem(term_list_1[i],
                                     max_edit_distance + 1,
                                     int(10 / 10**len(term_list_1[i])))
                    suggestion_parts.append(si)
                    self._replaced_words[term_list_1[i]] = si
            else:
                # estimated word occurrence probability
                # P = 10 / (N * 10^word_length)
                si = SuggestItem(term_list_1[i], max_edit_distance + 1,
                                 int(10 / 10**len(term_list_1[i])))
                suggestion_parts.append(si)
                self._replaced_words[term_list_1[i]] = si
    joined_term = ""
    joined_count = self.N
    for si in suggestion_parts:
        joined_term += si.term + " "
        joined_count *= si.count / self.N
    joined_term = joined_term.rstrip()
    if transfer_casing:
        joined_term = helpers.transfer_casing_for_similar_text(
            phrase, joined_term)
    suggestion = SuggestItem(
        joined_term,
        distance_comparer.compare(phrase, joined_term, 2**31 - 1),
        int(joined_count))
    suggestions_line = list()
    suggestions_line.append(suggestion)
    return suggestions_line
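# Usage sketch for the bigram-aware lookup_compound (hypothetical
# set-up; both file names are the dictionaries bundled with
# symspellpy). load_bigram_dictionary populates self._bigrams, which
# the method above uses to rank split candidates, and transfer_casing
# carries the input casing over to the corrected string.
from symspellpy import SymSpell

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt",
                          term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(
    "frequency_bigramdictionary_en_243_342.txt",
    term_index=0, count_index=2)
result = sym_spell.lookup_compound("Whereis th elove",
                                   max_edit_distance=2,
                                   transfer_casing=True)
print(result[0].term)  # leading uppercase of "Whereis" is preserved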
def lookup(self, phrase, verbosity, max_edit_distance=None,
           include_unknown=False, ignore_token=None,
           transfer_casing=False):
    """Find suggested spellings for a given phrase word.

    **Args**:

    * phrase (str): The word being spell checked.
    * verbosity (:class:`Verbosity`): The value controlling the\
        quantity/closeness of the returned suggestions.
    * max_edit_distance (int): The maximum edit distance between\
        phrase and suggested words.
    * include_unknown (bool): A flag to determine whether to\
        include phrase word in suggestions, if no words within\
        edit distance found.
    * ignore_token (regex pattern): A regex pattern describing\
        what words/phrases to ignore and leave unchanged.
    * transfer_casing (bool): A flag to determine whether the\
        casing (e.g. upper- vs lowercase) should be carried over\
        from the phrase.

    **Returns**:
    A list of :class:`SuggestItem` objects representing suggested\
    correct spellings for the phrase word, sorted by edit\
    distance, and secondarily by count frequency.

    **Raises**:

    * ValueError: `max_edit_distance` is greater than\
        `_max_dictionary_edit_distance`
    """
    if max_edit_distance is None:
        max_edit_distance = self._max_dictionary_edit_distance
    if max_edit_distance > self._max_dictionary_edit_distance:
        raise ValueError("Distance too large")
    suggestions = list()
    phrase_len = len(phrase)

    if transfer_casing:
        original_phrase = phrase
        phrase = phrase.lower()

    def early_exit():
        if include_unknown and not suggestions:
            suggestions.append(SuggestItem(phrase,
                                           max_edit_distance + 1, 0))
        return suggestions

    # early exit - word is too big to possibly match any words
    if phrase_len - max_edit_distance > self._max_length:
        return early_exit()

    # quick look for exact match
    suggestion_count = 0
    if phrase in self._words:
        suggestion_count = self._words[phrase]
        suggestions.append(SuggestItem(phrase, 0, suggestion_count))
        # early exit - return exact match, unless caller wants all
        # matches
        if verbosity != Verbosity.ALL:
            return early_exit()
    if (ignore_token is not None
            and re.match(ignore_token, phrase) is not None):
        suggestion_count = 1
        suggestions.append(SuggestItem(phrase, 0, suggestion_count))
        # early exit - return exact match, unless caller wants all
        # matches
        if verbosity != Verbosity.ALL:
            return early_exit()
    # early termination, if we only want to check if word in
    # dictionary or get its frequency e.g. for word segmentation
    if max_edit_distance == 0:
        return early_exit()

    considered_deletes = set()
    considered_suggestions = set()
    # we considered the phrase already in the
    # 'phrase in self._words' above
    considered_suggestions.add(phrase)

    max_edit_distance_2 = max_edit_distance
    candidate_pointer = 0
    candidates = list()

    # add original prefix
    phrase_prefix_len = phrase_len
    if phrase_prefix_len > self._prefix_length:
        phrase_prefix_len = self._prefix_length
        candidates.append(phrase[:phrase_prefix_len])
    else:
        candidates.append(phrase)
    distance_comparer = EditDistance(self._distance_algorithm)
    while candidate_pointer < len(candidates):
        candidate = candidates[candidate_pointer]
        candidate_pointer += 1
        candidate_len = len(candidate)
        len_diff = phrase_prefix_len - candidate_len

        # early termination: if candidate distance is already higher
        # than suggestion distance, then there are no better
        # suggestions to be expected
        if len_diff > max_edit_distance_2:
            # skip to next candidate if Verbosity.ALL, look no
            # further if Verbosity.TOP or CLOSEST (candidates are
            # ordered by delete distance, so none are closer than
            # current)
            if verbosity == Verbosity.ALL:
                continue
            break

        if candidate in self._deletes:
            dict_suggestions = self._deletes[candidate]
            for suggestion in dict_suggestions:
                if suggestion == phrase:
                    continue
                suggestion_len = len(suggestion)
                # phrase and suggestion lengths
                # diff > allowed/current best distance
                if (abs(suggestion_len - phrase_len) > max_edit_distance_2
                        # suggestion must be for a different delete
                        # string, in same bin only because of hash
                        # collision
                        or suggestion_len < candidate_len
                        # if suggestion len = delete len, then it
                        # either equals delete or is in same bin
                        # only because of hash collision
                        or (suggestion_len == candidate_len
                            and suggestion != candidate)):
                    continue
                suggestion_prefix_len = min(suggestion_len,
                                            self._prefix_length)
                if (suggestion_prefix_len > phrase_prefix_len
                        and suggestion_prefix_len - candidate_len > max_edit_distance_2):
                    continue
                # True Damerau-Levenshtein Edit Distance: adjust
                # distance, if both distances > 0
                # We allow simultaneous edits (deletes) of
                # max_edit_distance on both the dictionary and the
                # phrase term. For replaces and adjacent transposes
                # the resulting edit distance stays
                # <= max_edit_distance. For inserts and deletes the
                # resulting edit distance might exceed
                # max_edit_distance. To prevent suggestions of a
                # higher edit distance, we need to calculate the
                # resulting edit distance, if there are simultaneous
                # edits on both sides.
                # Example: (bank==bnak and bank==bink, but
                # bank!=kanb and bank!=xban and bank!=baxn for
                # max_edit_distance=1). Two deletes on each side of
                # a pair makes them all equal, but the first two
                # pairs have edit distance=1, the others edit
                # distance=2.
                distance = 0
                min_distance = 0
                if candidate_len == 0:
                    # suggestions which have no common chars with
                    # phrase (phrase_len<=max_edit_distance &&
                    # suggestion_len<=max_edit_distance)
                    distance = max(phrase_len, suggestion_len)
                    if (distance > max_edit_distance_2
                            or suggestion in considered_suggestions):
                        continue
                elif suggestion_len == 1:
                    # str.index() raises ValueError when the
                    # character is missing (the C# IndexOf this was
                    # ported from returned -1), so test membership
                    # instead
                    distance = (phrase_len
                                if suggestion[0] not in phrase
                                else phrase_len - 1)
                    if (distance > max_edit_distance_2
                            or suggestion in considered_suggestions):
                        continue
                # number of edits in prefix == max_edit_distance AND
                # no identical suffix, then
                # edit distance > max_edit_distance and no need for
                # Levenshtein calculation
                # (phrase_len >= prefix_length) &&
                # (suggestion_len >= prefix_length)
                else:
                    # handles the short-circuit of min_distance
                    # assignment when first boolean expression
                    # evaluates to False
                    if self._prefix_length - max_edit_distance == candidate_len:
                        min_distance = (min(phrase_len, suggestion_len) -
                                        self._prefix_length)
                    else:
                        min_distance = 0
                    # pylint: disable=C0301,R0916
                    if (self._prefix_length - max_edit_distance == candidate_len
                            and (min_distance > 1
                                 and phrase[phrase_len + 1 - min_distance:] !=
                                 suggestion[suggestion_len + 1 - min_distance:])
                            or (min_distance > 0
                                and phrase[phrase_len - min_distance] !=
                                suggestion[suggestion_len - min_distance]
                                and (phrase[phrase_len - min_distance - 1] !=
                                     suggestion[suggestion_len - min_distance]
                                     or phrase[phrase_len - min_distance] !=
                                     suggestion[suggestion_len - min_distance - 1]))):
                        continue
                    else:
                        # delete_in_suggestion_prefix is somewhat
                        # expensive, and only pays off when
                        # verbosity is TOP or CLOSEST
                        if ((verbosity != Verbosity.ALL
                             and not self._delete_in_suggestion_prefix(
                                 candidate, candidate_len, suggestion,
                                 suggestion_len))
                                or suggestion in considered_suggestions):
                            continue
                        considered_suggestions.add(suggestion)
                        distance = distance_comparer.compare(
                            phrase, suggestion, max_edit_distance_2)
                        if distance < 0:
                            continue
                # do not process higher distances than those already
                # found, if verbosity<ALL (note: max_edit_distance_2
                # will always equal max_edit_distance when
                # Verbosity.ALL)
                if distance <= max_edit_distance_2:
                    suggestion_count = self._words[suggestion]
                    si = SuggestItem(suggestion, distance,
                                     suggestion_count)
                    if suggestions:
                        if verbosity == Verbosity.CLOSEST:
                            # we will calculate DamLev distance only
                            # to the smallest found distance so far
                            if distance < max_edit_distance_2:
                                suggestions = list()
                        elif verbosity == Verbosity.TOP:
                            if (distance < max_edit_distance_2
                                    or suggestion_count > suggestions[0].count):
                                max_edit_distance_2 = distance
                                suggestions[0] = si
                            continue
                    if verbosity != Verbosity.ALL:
                        max_edit_distance_2 = distance
                    suggestions.append(si)
        # add edits: derive edits (deletes) from candidate (phrase)
        # and add them to candidates list. this is a recursive
        # process until the maximum edit distance has been reached
        if (len_diff < max_edit_distance
                and candidate_len <= self._prefix_length):
            # do not create edits with edit distance smaller than
            # suggestions already found
            if (verbosity != Verbosity.ALL
                    and len_diff >= max_edit_distance_2):
                continue
            for i in range(candidate_len):
                delete = candidate[:i] + candidate[i + 1:]
                if delete not in considered_deletes:
                    considered_deletes.add(delete)
                    candidates.append(delete)
    if len(suggestions) > 1:
        suggestions.sort()
    if transfer_casing:
        suggestions = [SuggestItem(
            helpers.transfer_casing_for_similar_text(original_phrase,
                                                     s.term),
            s.distance, s.count) for s in suggestions]
    # early_exit() mutates suggestions in place when include_unknown
    # is set and nothing was found
    early_exit()
    return suggestions
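# Usage sketch for ignore_token (hypothetical set-up; the file name is
# the dictionary bundled with symspellpy). Any phrase matching the
# pattern is returned unchanged with distance 0 instead of being
# corrected, which protects numerals or markup placeholders.
import re

from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt",
                          term_index=0, count_index=1)
result = sym_spell.lookup("2022", Verbosity.TOP, max_edit_distance=2,
                          ignore_token=re.compile(r"\d{2,}"))
print(result[0].term)  # "2022" comes back unchanged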
def test_unknown_distance_algorithm(self):
    with pytest.raises(ValueError) as excinfo:
        # a plain int is not a DistanceAlgorithm member, so the
        # EditDistance constructor is expected to raise
        __ = EditDistance(2)
    self.assertEqual("Unknown distance algorithm", str(excinfo.value))
def test_unknown_distance_algorithm(self):
    print(' - %s' % inspect.stack()[0][3])
    with pytest.raises(ValueError) as excinfo:
        # a plain int is not a DistanceAlgorithm member, so the
        # EditDistance constructor is expected to raise
        __ = EditDistance(2)
    self.assertEqual("Unknown distance algorithm", str(excinfo.value))