Exemplo n.º 1
0
def get_edit_distance(request):
    """Parametrized fixture yielding an (actual, expected) comparer pair.

    ``request.param`` selects the algorithm; yields the constructed
    ``EditDistance`` wrapper together with the concrete comparer class it is
    expected to wrap.

    Improvement over the original: only the requested ``EditDistance``
    instance is constructed, instead of eagerly building all four on every
    fixture invocation.
    """
    algorithm_map = {
        "damerau_osa": (DistanceAlgorithm.DAMERAU_OSA, DamerauOsa),
        "levenshtein": (DistanceAlgorithm.LEVENSHTEIN, Levenshtein),
        "damerau_osa_fast": (DistanceAlgorithm.DAMERAU_OSA_FAST,
                             DamerauOsaFast),
        "levenshtein_fast": (DistanceAlgorithm.LEVENSHTEIN_FAST,
                             LevenshteinFast),
    }
    algorithm, expected = algorithm_map[request.param]
    yield EditDistance(algorithm), expected
Exemplo n.º 2
0
    def lookup_compound(self,
                        phrase,
                        max_edit_distance,
                        ignore_non_words=False):
        """lookup_compound supports compound aware automatic spelling
        correction of multi-word input strings with three cases:
        1. mistakenly inserted space into a correct word led to two incorrect
           terms
        2. mistakenly omitted space between two correct words led to one
           incorrect combined term
        3. multiple independent input terms with/without spelling errors

        Find suggested spellings for a multi-word input string (supports word
        splitting/merging).

        Keyword arguments:
        phrase -- The string being spell checked.
        max_edit_distance -- The maximum edit distance between input and
            suggested words.
        ignore_non_words -- When True, integers and acronyms (all-caps
            words) are passed through unchanged instead of being corrected.

        Return:
        A List of SuggestItem object representing suggested correct spellings
        for the input string.
        """
        # Parse input string into single terms
        term_list_1 = helpers.parse_words(phrase)
        # Second list of single terms with preserved cases so we can ignore
        # acronyms (all cap words)
        if ignore_non_words:
            term_list_2 = helpers.parse_words(phrase, True)
        suggestions = list()
        suggestion_parts = list()
        distance_comparer = EditDistance(self._distance_algorithm)

        # translate every item to its best suggestion, otherwise it remains
        # unchanged
        is_last_combi = False
        for i, __ in enumerate(term_list_1):
            if ignore_non_words:
                # numbers are kept verbatim
                if helpers.try_parse_int64(term_list_1[i]) is not None:
                    suggestion_parts.append(SuggestItem(term_list_1[i], 0, 0))
                    continue
                # acronyms are kept verbatim, using the case-preserved list
                if helpers.is_acronym(term_list_2[i]):
                    suggestion_parts.append(SuggestItem(term_list_2[i], 0, 0))
                    continue
            suggestions = self.lookup(term_list_1[i], Verbosity.TOP,
                                      max_edit_distance)
            # combi check, always before split: try merging the previous and
            # current term into one word (case 1 above)
            if i > 0 and not is_last_combi:
                suggestions_combi = self.lookup(
                    term_list_1[i - 1] + term_list_1[i], Verbosity.TOP,
                    max_edit_distance)
                if suggestions_combi:
                    best_1 = suggestion_parts[-1]
                    if suggestions:
                        best_2 = suggestions[0]
                    else:
                        # current term has no correction: treat it as unknown
                        # with an impossible (max+1) distance
                        best_2 = SuggestItem(term_list_1[i],
                                             max_edit_distance + 1, 0)
                    # make sure we're comparing with the lowercase form of the
                    # previous word
                    distance_1 = distance_comparer.compare(
                        term_list_1[i - 1] + " " + term_list_1[i],
                        best_1.term.lower() + " " + best_2.term,
                        max_edit_distance)
                    if (distance_1 >= 0 and
                            suggestions_combi[0].distance + 1 < distance_1):
                        suggestions_combi[0].distance += 1
                        suggestion_parts[-1] = suggestions_combi[0]
                        is_last_combi = True
                        continue
            is_last_combi = False

            # alway split terms without suggestion / never split terms with
            # suggestion ed=0 / never split single char terms
            if (suggestions and
                (suggestions[0].distance == 0 or len(term_list_1[i]) == 1)):
                # choose best suggestion
                suggestion_parts.append(suggestions[0])
            else:
                # if no perfect suggestion, split word into pairs (case 2)
                suggestions_split = list()
                # add original term
                if suggestions:
                    suggestions_split.append(suggestions[0])
                if len(term_list_1[i]) > 1:
                    # try every possible split position inside the term
                    for j in range(1, len(term_list_1[i])):
                        part_1 = term_list_1[i][:j]
                        part_2 = term_list_1[i][j:]
                        suggestions_1 = self.lookup(part_1, Verbosity.TOP,
                                                    max_edit_distance)
                        if suggestions_1:
                            # if split correction1 == single-word correction
                            if (suggestions and suggestions[0].term
                                    == suggestions_1[0].term):
                                break
                            suggestions_2 = self.lookup(
                                part_2, Verbosity.TOP, max_edit_distance)
                            if suggestions_2:
                                # if split correction2 == single-word correction
                                if (suggestions and suggestions[0].term
                                        == suggestions_2[0].term):
                                    break
                                # select best suggestion for split pair
                                tmp_term = (suggestions_1[0].term + " " +
                                            suggestions_2[0].term)
                                tmp_distance = distance_comparer.compare(
                                    term_list_1[i], tmp_term,
                                    max_edit_distance)
                                if tmp_distance < 0:
                                    tmp_distance = max_edit_distance + 1
                                tmp_count = min(suggestions_1[0].count,
                                                suggestions_2[0].count)
                                suggestion_split = SuggestItem(
                                    tmp_term, tmp_distance, tmp_count)
                                suggestions_split.append(suggestion_split)
                                # early termination of split
                                if suggestion_split.distance == 1:
                                    break

                    if suggestions_split:
                        # select best suggestion for split pair
                        suggestions_split.sort()
                        suggestion_parts.append(suggestions_split[0])
                    else:
                        # nothing worked: keep the original term, flagged with
                        # an impossible distance
                        si = SuggestItem(term_list_1[i], max_edit_distance + 1,
                                         0)
                        suggestion_parts.append(si)
                else:
                    # single-character term with no correction
                    si = SuggestItem(term_list_1[i], max_edit_distance + 1, 0)
                    suggestion_parts.append(si)
        # re-join corrected parts; joined count is the minimum part count
        joined_term = ""
        joined_count = sys.maxsize
        for si in suggestion_parts:
            joined_term += si.term + " "
            joined_count = min(joined_count, si.count)
        suggestion = SuggestItem(
            joined_term.rstrip(),
            distance_comparer.compare(phrase, joined_term, 2**31 - 1),
            joined_count)
        suggestions_line = list()
        suggestions_line.append(suggestion)
        return suggestions_line
Exemplo n.º 3
0
 def test_unknown_distance_algorithm(self):
     """Constructing EditDistance with a non-algorithm value must raise."""
     with pytest.raises(ValueError) as excinfo:
         _ = EditDistance(2)
     assert "unknown distance algorithm" == str(excinfo.value)
Exemplo n.º 4
0
    def lookup(self,
               phrase,
               verbosity,
               max_edit_distance=None,
               include_unknown=False):
        """Find suggested spellings for a given phrase word.

        Keyword arguments:
        phrase -- The word being spell checked.
        verbosity -- The value controlling the quantity/closeness of the
            returned suggestions.
        max_edit_distance -- The maximum edit distance between phrase and
            suggested words.
        include_unknown -- Include phrase word in suggestions, if no words
            within edit distance found.

        Return:
        A list of SuggestItem object representing suggested correct spellings
        for the phrase word, sorted by edit distance, and secondarily by count
        frequency.

        Raises:
        ValueError -- If max_edit_distance is greater than the maximum edit
            distance the dictionary was built with.
        """
        if max_edit_distance is None:
            max_edit_distance = self._max_dictionary_edit_distance
        if max_edit_distance > self._max_dictionary_edit_distance:
            raise ValueError("Distance too large")
        suggestions = list()
        phrase_len = len(phrase)

        def early_exit():
            # optionally return the unknown input itself, flagged with an
            # impossible (max+1) distance, instead of an empty result
            if include_unknown and not suggestions:
                suggestions.append(
                    SuggestItem(phrase, max_edit_distance + 1, 0))
            return suggestions

        # early exit - word is too big to possibly match any words
        if phrase_len - max_edit_distance > self._max_length:
            return early_exit()

        # quick look for exact match
        suggestion_count = 0
        if phrase in self._words:
            suggestion_count = self._words[phrase]
            suggestions.append(SuggestItem(phrase, 0, suggestion_count))
            # early exit - return exact match, unless caller wants all matches
            if verbosity != Verbosity.ALL:
                return early_exit()

        # early termination, if we only want to check if word in dictionary or
        # get its frequency e.g. for word segmentation
        if max_edit_distance == 0:
            return early_exit()

        considered_deletes = set()
        considered_suggestions = set()
        # we considered the phrase already in the 'phrase in self._words' above
        considered_suggestions.add(phrase)

        max_edit_distance_2 = max_edit_distance
        candidate_pointer = 0
        candidates = list()

        # add original prefix
        phrase_prefix_len = phrase_len
        if phrase_prefix_len > self._prefix_length:
            phrase_prefix_len = self._prefix_length
            candidates.append(phrase[:phrase_prefix_len])
        else:
            candidates.append(phrase)
        distance_comparer = EditDistance(self._distance_algorithm)
        while candidate_pointer < len(candidates):
            candidate = candidates[candidate_pointer]
            candidate_pointer += 1
            candidate_len = len(candidate)
            len_diff = phrase_prefix_len - candidate_len

            # early termination: if candidate distance is already higher than
            # suggestion distance, then there are no better suggestions to be
            # expected
            if len_diff > max_edit_distance_2:
                # skip to next candidate if Verbosity.ALL, look no
                # further if Verbosity.TOP or CLOSEST (candidates are
                # ordered by delete distance, so none are closer than current)
                if verbosity == Verbosity.ALL:
                    continue
                break

            if self.get_str_hash(candidate) in self._deletes:
                dict_suggestions = self._deletes[self.get_str_hash(candidate)]
                for suggestion in dict_suggestions:
                    if suggestion == phrase:
                        continue
                    suggestion_len = len(suggestion)
                    # phrase and suggestion lengths diff > allowed/current best
                    # distance
                    if (abs(suggestion_len - phrase_len) > max_edit_distance_2
                            # suggestion must be for a different delete string,
                            # in same bin only because of hash collision
                            or suggestion_len < candidate_len
                            # if suggestion len = delete len, then it either
                            # equals delete or is in same bin only because of
                            # hash collision
                            or (suggestion_len == candidate_len
                                and suggestion != candidate)):
                        continue
                    suggestion_prefix_len = min(suggestion_len,
                                                self._prefix_length)
                    if (suggestion_prefix_len > phrase_prefix_len
                            and suggestion_prefix_len - candidate_len >
                            max_edit_distance_2):
                        continue
                    # True Damerau-Levenshtein Edit Distance: adjust distance,
                    # if both distances>0
                    # We allow simultaneous edits (deletes) of max_edit_distance
                    # on both the dictionary and the phrase term.
                    # For replaces and adjacent transposes the resulting edit
                    # distance stays <= max_edit_distance.
                    # For inserts and deletes the resulting edit distance might
                    # exceed max_edit_distance.
                    # To prevent suggestions of a higher edit distance, we need
                    # to calculate the resulting edit distance, if there are
                    # simultaneous edits on both sides.
                    # Example: (bank==bnak and bank==bink, but bank!=kanb and
                    # bank!=xban and bank!=baxn for max_edit_distance=1)
                    # Two deletes on each side of a pair makes them all equal,
                    # but the first two pairs have edit distance=1, the others
                    # edit distance=2.
                    distance = 0
                    min_distance = 0
                    if candidate_len == 0:
                        # suggestions which have no common chars with phrase
                        # (phrase_len<=max_edit_distance &&
                        # suggestion_len<=max_edit_distance)
                        distance = max(phrase_len, suggestion_len)
                        if (distance > max_edit_distance_2
                                or suggestion in considered_suggestions):
                            continue
                    elif suggestion_len == 1:
                        # BUG FIX: str.index() raises ValueError when the
                        # character is absent and never returns a negative
                        # value, so the `< 0` branch was unreachable and a
                        # missing char crashed; str.find() returns -1 as this
                        # test expects.
                        distance = (phrase_len
                                    if phrase.find(suggestion[0]) < 0 else
                                    phrase_len - 1)
                        if (distance > max_edit_distance_2
                                or suggestion in considered_suggestions):
                            continue
                    # number of edits in prefix ==maxediddistance AND no
                    # identical suffix, then editdistance>max_edit_distance and
                    # no need for Levenshtein calculation
                    # (phraseLen >= prefixLength) &&
                    # (suggestionLen >= prefixLength)
                    else:
                        # handles the shortcircuit of min_distance assignment
                        # when first boolean expression evaluates to False
                        if self._prefix_length - max_edit_distance == candidate_len:
                            min_distance = (min(phrase_len, suggestion_len) -
                                            self._prefix_length)
                        else:
                            min_distance = 0
                        # pylint: disable=C0301,R0916
                        if (self._prefix_length - max_edit_distance
                                == candidate_len and
                            (min_distance > 1
                             and phrase[phrase_len + 1 - min_distance:] !=
                             suggestion[suggestion_len + 1 - min_distance:]) or
                            (min_distance > 0
                             and phrase[phrase_len - min_distance] !=
                             suggestion[suggestion_len - min_distance] and
                             (phrase[phrase_len - min_distance - 1] !=
                              suggestion[suggestion_len - min_distance]
                              or phrase[phrase_len - min_distance] !=
                              suggestion[suggestion_len - min_distance - 1]))):
                            continue
                        else:
                            # delete_in_suggestion_prefix is somewhat expensive,
                            # and only pays off when verbosity is TOP or CLOSEST
                            if ((verbosity != Verbosity.ALL
                                 and not self.delete_in_suggestion_prefix(
                                     candidate, candidate_len, suggestion,
                                     suggestion_len))
                                    or suggestion in considered_suggestions):
                                continue
                            considered_suggestions.add(suggestion)
                            distance = distance_comparer.compare(
                                phrase, suggestion, max_edit_distance_2)
                            if distance < 0:
                                continue
                    # do not process higher distances than those already found,
                    # if verbosity<ALL (note: max_edit_distance_2 will always
                    # equal max_edit_distance when Verbosity.ALL)
                    if distance <= max_edit_distance_2:
                        suggestion_count = self._words[suggestion]
                        si = SuggestItem(suggestion, distance,
                                         suggestion_count)
                        if suggestions:
                            if verbosity == Verbosity.CLOSEST:
                                # we will calculate DamLev distance only to the
                                # smallest found distance so far
                                if distance < max_edit_distance_2:
                                    suggestions = list()
                            elif verbosity == Verbosity.TOP:
                                if (distance < max_edit_distance_2
                                        or suggestion_count >
                                        suggestions[0].count):
                                    max_edit_distance_2 = distance
                                    suggestions[0] = si
                                continue
                        if verbosity != Verbosity.ALL:
                            max_edit_distance_2 = distance
                        suggestions.append(si)
            # add edits: derive edits (deletes) from candidate (phrase) and
            # add them to candidates list. this is a recursive process until
            # the maximum edit distance has been reached
            if (len_diff < max_edit_distance
                    and candidate_len <= self._prefix_length):
                # do not create edits with edit distance smaller than
                # suggestions already found
                if (verbosity != Verbosity.ALL
                        and len_diff >= max_edit_distance_2):
                    continue
                for i in range(candidate_len):
                    delete = candidate[:i] + candidate[i + 1:]
                    if delete not in considered_deletes:
                        considered_deletes.add(delete)
                        candidates.append(delete)
        if len(suggestions) > 1:
            suggestions.sort()
        return suggestions
    def lookup_compound(self,
                        phrase,
                        max_edit_distance,
                        ignore_non_words=False,
                        transfer_casing=False):
        """`lookup_compound` supports compound aware automatic spelling
        correction of multi-word input strings with three cases:

        1. mistakenly inserted space into a correct word led to two
           incorrect terms
        2. mistakenly omitted space between two correct words led to
           one incorrect combined term
        3. multiple independent input terms with/without spelling
           errors

        Find suggested spellings for a multi-word input string
        (supports word splitting/merging).

        **Args**:

        * phrase (str): The string being spell checked.
        * max_edit_distance (int): The maximum edit distance between\
            input and suggested words.
        * ignore_non_words (bool): A flag to determine whether numbers\
            and acronyms are left unchanged instead of being spell\
            checked
        * transfer_casing (bool): A flag to determine whether the
            casing (eg upper- vs lowercase) should be carried over\
            from the phrase

        **Returns**:
        A list of :class:`SuggestItem` object representing suggested\
            correct spellings for the input string.
        """
        # Parse input string into single terms
        term_list_1 = helpers.parse_words(phrase)
        # Second list of single terms with preserved cases so we can
        # ignore acronyms (all cap words)
        if ignore_non_words:
            term_list_2 = helpers.parse_words(phrase, True)
        suggestions = list()
        suggestion_parts = list()
        distance_comparer = EditDistance(self._distance_algorithm)

        # translate every item to its best suggestion, otherwise it
        # remains unchanged
        is_last_combi = False
        for i, __ in enumerate(term_list_1):
            if ignore_non_words:
                # numbers are kept verbatim
                if helpers.try_parse_int64(term_list_1[i]) is not None:
                    suggestion_parts.append(SuggestItem(term_list_1[i], 0, 0))
                    continue
                # acronyms are kept verbatim, using the case-preserved list
                if helpers.is_acronym(term_list_2[i]):
                    suggestion_parts.append(SuggestItem(term_list_2[i], 0, 0))
                    continue
            suggestions = self.lookup(term_list_1[i], Verbosity.TOP,
                                      max_edit_distance)
            # combi check, always before split: try merging the previous
            # and current term into one word (case 1 above)
            if i > 0 and not is_last_combi:
                suggestions_combi = self.lookup(
                    term_list_1[i - 1] + term_list_1[i], Verbosity.TOP,
                    max_edit_distance)
                if suggestions_combi:
                    best_1 = suggestion_parts[-1]
                    if suggestions:
                        best_2 = suggestions[0]
                    else:
                        # estimated word occurrence probability
                        # P=10 / (N * 10^word length l)
                        best_2 = SuggestItem(term_list_1[i],
                                             max_edit_distance + 1,
                                             10 // 10**len(term_list_1[i]))
                    # distance_1=edit distance between 2 split terms and
                    # their best corrections : as a comparative value
                    # for the combination
                    distance_1 = best_1.distance + best_2.distance
                    if (distance_1 >= 0 and
                        (suggestions_combi[0].distance + 1 < distance_1 or
                         (suggestions_combi[0].distance + 1 == distance_1 and
                          (suggestions_combi[0].count >
                           best_1.count / self.N * best_2.count)))):
                        suggestions_combi[0].distance += 1
                        suggestion_parts[-1] = suggestions_combi[0]
                        is_last_combi = True
                        continue
            is_last_combi = False

            # alway split terms without suggestion / never split terms
            # with suggestion ed=0 / never split single char terms
            if suggestions and (suggestions[0].distance == 0
                                or len(term_list_1[i]) == 1):
                # choose best suggestion
                suggestion_parts.append(suggestions[0])
            else:
                # if no perfect suggestion, split word into pairs (case 2)
                suggestion_split_best = None
                # add original term
                if suggestions:
                    suggestion_split_best = suggestions[0]
                if len(term_list_1[i]) > 1:
                    # try every possible split position inside the term
                    for j in range(1, len(term_list_1[i])):
                        part_1 = term_list_1[i][:j]
                        part_2 = term_list_1[i][j:]
                        suggestions_1 = self.lookup(part_1, Verbosity.TOP,
                                                    max_edit_distance)
                        if suggestions_1:
                            suggestions_2 = self.lookup(
                                part_2, Verbosity.TOP, max_edit_distance)
                            if suggestions_2:
                                # select best suggestion for split pair
                                tmp_term = (suggestions_1[0].term + " " +
                                            suggestions_2[0].term)
                                tmp_distance = distance_comparer.compare(
                                    term_list_1[i], tmp_term,
                                    max_edit_distance)
                                if tmp_distance < 0:
                                    tmp_distance = max_edit_distance + 1
                                if suggestion_split_best is not None:
                                    if tmp_distance > suggestion_split_best.distance:
                                        continue
                                    if tmp_distance < suggestion_split_best.distance:
                                        suggestion_split_best = None
                                if tmp_term in self._bigrams:
                                    tmp_count = self._bigrams[tmp_term]
                                    # increase count, if split
                                    # corrections are part of or
                                    # identical to input single term
                                    # correction exists
                                    if suggestions:
                                        best_si = suggestions[0]
                                        # alternatively remove the
                                        # single term from
                                        # suggestion_split, but then
                                        # other splittings could win
                                        if suggestions_1[
                                                0].term + suggestions_2[
                                                    0].term == term_list_1[i]:
                                            # make count bigger than
                                            # count of single term
                                            # correction
                                            tmp_count = max(
                                                tmp_count, best_si.count + 2)
                                        elif (suggestions_1[0].term
                                              == best_si.term
                                              or suggestions_2[0].term
                                              == best_si.term):
                                            # make count bigger than
                                            # count of single term
                                            # correction
                                            tmp_count = max(
                                                tmp_count, best_si.count + 1)
                                    # no single term correction exists
                                    elif suggestions_1[0].term + suggestions_2[
                                            0].term == term_list_1[i]:
                                        tmp_count = max(
                                            tmp_count,
                                            max(suggestions_1[0].count,
                                                suggestions_2[0].count) + 2)
                                else:
                                    # The Naive Bayes probability of
                                    # the word combination is the
                                    # product of the two word
                                    # probabilities: P(AB)=P(A)*P(B)
                                    # use it to estimate the frequency
                                    # count of the combination, which
                                    # then is used to rank/select the
                                    # best splitting variant
                                    tmp_count = min(
                                        self.bigram_count_min,
                                        int(suggestions_1[0].count / self.N *
                                            suggestions_2[0].count))
                                suggestion_split = SuggestItem(
                                    tmp_term, tmp_distance, tmp_count)
                                if (suggestion_split_best is None
                                        or suggestion_split.count >
                                        suggestion_split_best.count):
                                    suggestion_split_best = suggestion_split

                    if suggestion_split_best is not None:
                        # select best suggestion for split pair
                        suggestion_parts.append(suggestion_split_best)
                        self._replaced_words[
                            term_list_1[i]] = suggestion_split_best
                    else:
                        # estimated word occurrence probability
                        # P=10 / (N * 10^word length l)
                        si = SuggestItem(term_list_1[i], max_edit_distance + 1,
                                         int(10 / 10**len(term_list_1[i])))
                        suggestion_parts.append(si)
                        self._replaced_words[term_list_1[i]] = si
                else:
                    # estimated word occurrence probability
                    # P=10 / (N * 10^word length l)
                    si = SuggestItem(term_list_1[i], max_edit_distance + 1,
                                     int(10 / 10**len(term_list_1[i])))
                    suggestion_parts.append(si)
                    self._replaced_words[term_list_1[i]] = si
        # re-join corrected parts; joined count is the product of the
        # parts' occurrence probabilities scaled back by N
        joined_term = ""
        joined_count = self.N
        for si in suggestion_parts:
            joined_term += si.term + " "
            joined_count *= si.count / self.N
        joined_term = joined_term.rstrip()
        if transfer_casing:
            # restore the input's upper/lower casing onto the corrected text
            joined_term = helpers.transfer_casing_for_similar_text(
                phrase, joined_term)
        suggestion = SuggestItem(
            joined_term,
            distance_comparer.compare(phrase, joined_term, 2**31 - 1),
            int(joined_count))
        suggestions_line = list()
        suggestions_line.append(suggestion)

        return suggestions_line
    def lookup(self,
               phrase,
               verbosity,
               max_edit_distance=None,
               include_unknown=False,
               ignore_token=None,
               transfer_casing=False):
        """Find suggested spellings for a given phrase word.

        **Args**:

        * phrase (str): The word being spell checked.
        * verbosity (:class:`Verbosity`): The value controlling the\
            quantity/closeness of the returned suggestions.
        * max_edit_distance (int): The maximum edit distance between\
            phrase and suggested words. Defaults to the dictionary's\
            maximum edit distance.
        * include_unknown (bool): A flag to determine whether to\
            include phrase word in suggestions, if no words within\
            edit distance found.
        * ignore_token (regex pattern): A regex pattern describing\
            what words/phrases to ignore and leave unchanged
        * transfer_casing (bool): A flag to determine whether the\
            casing (eg upper- vs lowercase) should be carried over\
            from the phrase

        **Returns**:
        A list of :class:`SuggestItem` object representing suggested\
            correct spellings for the phrase word, sorted by edit\
            distance, and secondarily by count frequency.

        **Raises**:

        * ValueError: `max_edit_distance` is greater than\
            `_max_dictionary_edit_distance`
        """
        if max_edit_distance is None:
            max_edit_distance = self._max_dictionary_edit_distance
        if max_edit_distance > self._max_dictionary_edit_distance:
            raise ValueError("Distance too large")
        suggestions = list()
        phrase_len = len(phrase)

        # lookup is done lowercased; the original casing is restored
        # onto the suggestions at the end if requested
        if transfer_casing:
            original_phrase = phrase
            phrase = phrase.lower()

        def early_exit():
            if include_unknown and not suggestions:
                suggestions.append(
                    SuggestItem(phrase, max_edit_distance + 1, 0))
            return suggestions

        # early exit - word is too big to possibly match any words
        if phrase_len - max_edit_distance > self._max_length:
            return early_exit()

        # quick look for exact match
        suggestion_count = 0
        if phrase in self._words:
            suggestion_count = self._words[phrase]
            suggestions.append(SuggestItem(phrase, 0, suggestion_count))
            # early exit - return exact match, unless caller wants all
            # matches
            if verbosity != Verbosity.ALL:
                return early_exit()

        if (ignore_token is not None
                and re.match(ignore_token, phrase) is not None):
            suggestion_count = 1
            suggestions.append(SuggestItem(phrase, 0, suggestion_count))
            # early exit - return exact match, unless caller wants all
            # matches
            if verbosity != Verbosity.ALL:
                return early_exit()

        # early termination, if we only want to check if word in
        # dictionary or get its frequency e.g. for word segmentation
        if max_edit_distance == 0:
            return early_exit()

        considered_deletes = set()
        considered_suggestions = set()
        # we considered the phrase already in the
        # 'phrase in self._words' above
        considered_suggestions.add(phrase)

        max_edit_distance_2 = max_edit_distance
        candidate_pointer = 0
        candidates = list()

        # add original prefix
        phrase_prefix_len = phrase_len
        if phrase_prefix_len > self._prefix_length:
            phrase_prefix_len = self._prefix_length
            candidates.append(phrase[:phrase_prefix_len])
        else:
            candidates.append(phrase)
        distance_comparer = EditDistance(self._distance_algorithm)
        while candidate_pointer < len(candidates):
            candidate = candidates[candidate_pointer]
            candidate_pointer += 1
            candidate_len = len(candidate)
            len_diff = phrase_prefix_len - candidate_len

            # early termination: if candidate distance is already
            # higher than suggestion distance, than there are no better
            # suggestions to be expected
            if len_diff > max_edit_distance_2:
                # skip to next candidate if Verbosity.ALL, look no
                # further if Verbosity.TOP or CLOSEST (candidates are
                # ordered by delete distance, so none are closer than
                # current)
                if verbosity == Verbosity.ALL:
                    continue
                break

            if candidate in self._deletes:
                dict_suggestions = self._deletes[candidate]
                for suggestion in dict_suggestions:
                    if suggestion == phrase:
                        continue
                    suggestion_len = len(suggestion)
                    # phrase and suggestion lengths
                    # diff > allowed/current best distance
                    if (abs(suggestion_len - phrase_len) > max_edit_distance_2
                            # suggestion must be for a different delete
                            # string, in same bin only because of hash
                            # collision
                            or suggestion_len < candidate_len
                            # if suggestion len = delete len, then it
                            # either equals delete or is in same bin
                            # only because of hash collision
                            or (suggestion_len == candidate_len
                                and suggestion != candidate)):
                        continue
                    suggestion_prefix_len = min(suggestion_len,
                                                self._prefix_length)
                    if (suggestion_prefix_len > phrase_prefix_len
                            and suggestion_prefix_len - candidate_len >
                            max_edit_distance_2):
                        continue
                    # True Damerau-Levenshtein Edit Distance: adjust
                    # distance, if both distances>0
                    # We allow simultaneous edits (deletes) of
                    # max_edit_distance on on both the dictionary and
                    # the phrase term. For replaces and adjacent
                    # transposes the resulting edit distance stays
                    # <= max_edit_distance. For inserts and deletes the
                    # resulting edit distance might exceed
                    # max_edit_distance. To prevent suggestions of a
                    # higher edit distance, we need to calculate the
                    # resulting edit distance, if there are
                    # simultaneous edits on both sides.
                    # Example: (bank==bnak and bank==bink, but
                    # bank!=kanb and bank!=xban and bank!=baxn for
                    # max_edit_distance=1). Two deletes on each side of
                    # a pair makes them all equal, but the first two
                    # pairs have edit distance=1, the others edit
                    # distance=2.
                    distance = 0
                    min_distance = 0
                    if candidate_len == 0:
                        # suggestions which have no common chars with
                        # phrase (phrase_len<=max_edit_distance &&
                        # suggestion_len<=max_edit_distance)
                        distance = max(phrase_len, suggestion_len)
                        if (distance > max_edit_distance_2
                                or suggestion in considered_suggestions):
                            continue
                    elif suggestion_len == 1:
                        # BUG FIX: the previous `phrase.index(...) < 0`
                        # test was dead logic -- str.index never returns
                        # a negative value, it raises ValueError when
                        # the char is absent. Use a membership test
                        # (the intent of the original C# IndexOf < 0).
                        distance = (phrase_len
                                    if suggestion[0] not in phrase else
                                    phrase_len - 1)
                        if (distance > max_edit_distance_2
                                or suggestion in considered_suggestions):
                            continue
                    # number of edits in prefix ==maxediddistance AND
                    # no identical suffix, then
                    # editdistance>max_edit_distance and no need for
                    # Levenshtein calculation
                    # (phraseLen >= prefixLength) &&
                    # (suggestionLen >= prefixLength)
                    else:
                        # handles the shortcircuit of min_distance
                        # assignment when first boolean expression
                        # evaluates to False
                        if self._prefix_length - max_edit_distance == candidate_len:
                            min_distance = (min(phrase_len, suggestion_len) -
                                            self._prefix_length)
                        else:
                            min_distance = 0
                        # pylint: disable=C0301,R0916
                        if (self._prefix_length - max_edit_distance
                                == candidate_len and
                            (min_distance > 1
                             and phrase[phrase_len + 1 - min_distance:] !=
                             suggestion[suggestion_len + 1 - min_distance:]) or
                            (min_distance > 0
                             and phrase[phrase_len - min_distance] !=
                             suggestion[suggestion_len - min_distance] and
                             (phrase[phrase_len - min_distance - 1] !=
                              suggestion[suggestion_len - min_distance]
                              or phrase[phrase_len - min_distance] !=
                              suggestion[suggestion_len - min_distance - 1]))):
                            continue
                        else:
                            # delete_in_suggestion_prefix is somewhat
                            # expensive, and only pays off when
                            # verbosity is TOP or CLOSEST
                            if ((verbosity != Verbosity.ALL
                                 and not self._delete_in_suggestion_prefix(
                                     candidate, candidate_len, suggestion,
                                     suggestion_len))
                                    or suggestion in considered_suggestions):
                                continue
                            considered_suggestions.add(suggestion)
                            distance = distance_comparer.compare(
                                phrase, suggestion, max_edit_distance_2)
                            if distance < 0:
                                continue
                    # do not process higher distances than those
                    # already found, if verbosity<ALL (note:
                    # max_edit_distance_2 will always equal
                    # max_edit_distance when Verbosity.ALL)
                    if distance <= max_edit_distance_2:
                        suggestion_count = self._words[suggestion]
                        si = SuggestItem(suggestion, distance,
                                         suggestion_count)
                        if suggestions:
                            if verbosity == Verbosity.CLOSEST:
                                # we will calculate DamLev distance
                                # only to the smallest found distance
                                # so far
                                if distance < max_edit_distance_2:
                                    suggestions = list()
                            elif verbosity == Verbosity.TOP:
                                if (distance < max_edit_distance_2
                                        or suggestion_count >
                                        suggestions[0].count):
                                    max_edit_distance_2 = distance
                                    suggestions[0] = si
                                continue
                        if verbosity != Verbosity.ALL:
                            max_edit_distance_2 = distance
                        suggestions.append(si)
            # add edits: derive edits (deletes) from candidate (phrase)
            # and add them to candidates list. this is a recursive
            # process until the maximum edit distance has been reached
            if (len_diff < max_edit_distance
                    and candidate_len <= self._prefix_length):
                # do not create edits with edit distance smaller than
                # suggestions already found
                if (verbosity != Verbosity.ALL
                        and len_diff >= max_edit_distance_2):
                    continue
                for i in range(candidate_len):
                    delete = candidate[:i] + candidate[i + 1:]
                    if delete not in considered_deletes:
                        considered_deletes.add(delete)
                        candidates.append(delete)
        if len(suggestions) > 1:
            suggestions.sort()

        if transfer_casing:
            suggestions = [
                SuggestItem(
                    helpers.transfer_casing_for_similar_text(
                        original_phrase, s.term), s.distance, s.count)
                for s in suggestions
            ]

        # may append the unknown-word placeholder when include_unknown
        # is set and nothing was found
        early_exit()
        return suggestions
Exemplo n.º 7
0
 def test_unknown_distance_algorithm(self):
     """EditDistance must reject a value that is not a supported
     DistanceAlgorithm member with ValueError."""
     with pytest.raises(ValueError) as excinfo:
         # BUG FIX: LEVENSHTEIN is a *supported* algorithm, so it never
         # raised and pytest.raises failed. Pass a non-member instead.
         __ = EditDistance(2)
     self.assertEqual("Unknown distance algorithm", str(excinfo.value))
Exemplo n.º 8
0
 def test_unknown_distance_algorithm(self):
     """EditDistance must reject a value that is not a supported
     DistanceAlgorithm member with ValueError."""
     # log the running test's name for console tracing
     print('  - %s' % inspect.stack()[0][3])
     with pytest.raises(ValueError) as excinfo:
         # BUG FIX: LEVENSHTEIN is a *supported* algorithm, so it never
         # raised and pytest.raises failed. Pass a non-member instead.
         __ = EditDistance(2)
     self.assertEqual("Unknown distance algorithm", str(excinfo.value))