Example #1
def precise_affix_score(word1: str, word2: str, diff_factor: float, *,
                        base: float) -> float:
    """
    Scoring, stage 3: Hardcore final score for affixed forms!

    It actually produces a score in one of 3 groups:

    * > 1000: if the words are actually the same word with different casing (shouldn't happen when
      called from suggest, which should've already handled that!)
    * < -100: if the word difference is too large (what counts as "too much" is defined by
      ``diff_factor``); only one of those questionable suggestions will be returned
    * -100...1000: just a normal suggestion score, defining its sorting position

    See also :meth:`filter_guesses` below which uses this separation into "groups" to drop some results.

    Args:
        word1: misspelled word
        word2: possible suggestion
        diff_factor: factor changing amount of suggestions (:attr:`Aff.MAXDIFF <spylls.hunspell.data.aff.Aff.MAXDIFF>`)
        base: initial score of word1 against word2
    """

    lcs = sm.lcslen(word1, word2)

    # same characters with different casing -- "very good" suggestion class
    if len(word1) == len(word2) and len(word1) == lcs:
        return base + 2000

    # Score is: length of the longest common subsequence minus the length difference...
    result = 2 * lcs - abs(len(word1) - len(word2))

    # increase score by length of common start substring
    result += sm.leftcommonsubstring(word1, word2)

    cps, is_swap = sm.commoncharacterpositions(word1, word2.lower())
    # Add 1 if there was _any_ occurrence of "same chars in same positions" in the two words
    if cps:
        result += 1
    # Add 10 if the only difference of two words is "exactly two characters swapped"
    if is_swap:
        result += 10

    # Add regular four-gram weight
    result += sm.ngram(4, word1, word2, any_mismatch=True)

    # Sum of weighted bigrams used to estimate result quality
    bigrams = (sm.ngram(2, word1, word2, any_mismatch=True, weighted=True) +
               sm.ngram(2, word2, word1, any_mismatch=True, weighted=True))

    result += bigrams

    # diff_factor ranges from 0 to 2 (depending on aff.MAXDIFF=0..10, with 10 meaning "give me all
    # possible ngrams" and 0 meaning "avoid most of the questionable ngrams"); with MAXDIFF=10 the
    # factor would be 0 and this branch will be avoided; with MAXDIFF=0 the factor would be 2, and
    # lots of "slightly similar" words would be dropped into the "questionable" bag.
    if bigrams < (len(word1) + len(word2)) * diff_factor:
        result -= 1000

    return result
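
# A minimal, self-contained sketch of how the three score "groups" above could be used to drop
# results, in the spirit of the filter_guesses mentioned in the docstring (a hypothetical helper,
# NOT spylls' actual implementation):
def filter_by_group(scored):
    # scored: list of (score, word) pairs, sorted by score descending
    result = []
    seen_questionable = False
    for score, word in scored:
        if score > 1000:
            result.append(word)      # same word with different casing: always keep
        elif score < -100:
            if not seen_questionable:
                result.append(word)  # keep at most one "questionable" guess
                seen_questionable = True
        else:
            result.append(word)      # normal suggestion
    return result

# filter_by_group([(2010.0, 'UNESCO'), (15.0, 'word'), (-990.0, 'ward'), (-995.0, 'weird')])
# => ['UNESCO', 'word', 'ward']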
Example #2
def rough_affix_score(word1: str, word2: str) -> float:
    """
    Scoring, stage 2: first (rough and quick) score of affixed forms: n-gram score (with n = length
    of the misspelled word) plus the longest common start substring.

    Args:
        word1: misspelled word
        word2: possible suggestion
    """

    return (sm.ngram(len(word1), word1, word2, any_mismatch=True) +
            sm.leftcommonsubstring(word1, word2))
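
# For orientation, a self-contained sketch of what leftcommonsubstring plausibly computes (length
# of the common prefix); the real helper lives in spylls.hunspell.algo.string_metrics and may
# differ in details:
def left_common_substring(word1: str, word2: str) -> int:
    length = 0
    for ch1, ch2 in zip(word1, word2):
        if ch1 != ch2:
            break
        length += 1
    return length

assert left_common_substring("grateful", "gratuitous") == 4  # common prefix "grat"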
Example #3
def root_score(word1: str, word2: str) -> float:
    """
    Scoring, stage 1: simple score for the initial selection of dictionary words: 3-gram score plus
    the longest common start substring.

    Args:
        word1: misspelled word
        word2: possible suggestion
    """

    return (sm.ngram(3, word1, word2.lower(), longer_worse=True) +
            sm.leftcommonsubstring(word1, word2.lower()))
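
# A sketch of how the three scoring stages above plausibly fit together, assuming the functions
# from Examples #1-#3 (and their ``sm`` string-metrics module) are in scope; the shortlist size,
# the `> 2` threshold and the diff_factor value are illustrative assumptions, not spylls' exact
# pipeline:
def suggest_sketch(misspelling, stems):
    # Stage 1: cheap score to pre-select dictionary roots
    roots = [s for s in stems if root_score(misspelling, s) > 2]
    # Stage 2: rough score of affixed forms, used to shortlist candidates
    shortlist = sorted(roots, key=lambda s: rough_affix_score(misspelling, s),
                       reverse=True)[:10]
    # Stage 3: precise score defines the final order
    return sorted(
        shortlist,
        key=lambda s: precise_affix_score(misspelling, s, diff_factor=1.0,
                                          base=rough_affix_score(misspelling, s)),
        reverse=True)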
Example #4
def detect_threshold(word: str) -> float:
    """
    Find minimum threshold for a passable suggestion

    Mangle the original word three different ways (by replacing every 4th character with "*",
    starting from the 1st, 2nd or 3rd), and score them to generate a minimum acceptable score.

    Args:
        word: misspelled word
    """

    thresh = 0.0

    for start_pos in range(1, 4):
        mangled = list(word)
        for pos in range(start_pos, len(word), 4):
            mangled[pos] = '*'

        mangled_word = ''.join(mangled)

        thresh += sm.ngram(len(word), word, mangled_word, any_mismatch=True)

    # Take the (floored) average of the three scores, minus 1
    return thresh // 3 - 1
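
# The mangling pattern itself, demonstrated standalone (pure Python, no spylls needed); each pass
# replaces every 4th character starting at a different offset:
word = "grateful"
for start_pos in range(1, 4):
    mangled = list(word)
    for pos in range(start_pos, len(word), 4):
        mangled[pos] = '*'
    print(''.join(mangled))
# g*ate*ul
# gr*tef*l
# gra*efu*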
Example #5
def phonet_suggest(misspelling: str, *, dictionary_words: List[dic.Word],
                   table: aff.PhonetTable) -> Iterator[str]:
    """
    The phonetic suggestion algorithm provides suggestions based on phonetic (pronunciation) similarity.
    It requires the .aff file to define a :attr:`PHONE <spylls.hunspell.data.aff.Aff.PHONE>` table --
    which, we should add, is *extremely* rare in known dictionaries.

    Internally:

    * selects words from dictionary similarly to
      :meth:`ngram_suggest <spylls.hunspell.algo.ngram_suggest.ngram_suggest>`
      (and even reuses its :meth:`root_score <spylls.hunspell.algo.ngram_suggest.root_score>`)
    * and scores their phonetic representations (calculated with :meth:`metaphone`) with phonetic
      representation of misspelling
    * then chooses the most similar ones with :meth:`final_score` (ngram-based comparison)

    Note that, as both this method and :meth:`ngram_suggest <spylls.hunspell.algo.ngram_suggest.ngram_suggest>`
    iterate through the whole dictionary, Hunspell optimizes suggestion search by doing it all
    in one module/one loop. Spylls splits them for clarity.

    Args:
        misspelling: Misspelled word
        dictionary_words: All words from dictionary (only stems are used)
        table: Table for producing metaphones
    """

    misspelling = misspelling.lower()
    misspelling_ph = metaphone(table, misspelling)

    scores: List[Tuple[float, str]] = []

    # First, select words from the dictionary whose stems are similar to the misspelling we are
    # trying to fix.
    #
    # This cycle is exactly the same as the first cycle in ngram_suggest. In fact, in original Hunspell
    # both ngram and phonetical suggestion are done in one pass inside ngram_suggest, which is
    # more efficient (one iteration through the whole dictionary instead of two) but much harder to
    # understand and debug.
    #
    # Considering the extreme rarity of metaphone-enabled dictionaries, and the "educational" goal of
    # spylls, we split it out.
    for word in dictionary_words:
        if abs(len(word.stem) - len(misspelling)) > 3:
            continue

        # First, we calculate the "regular" similarity score, just like in ngram_suggest
        nscore = ng.root_score(misspelling, word.stem)

        if word.alt_spellings:
            for variant in word.alt_spellings:
                nscore = max(nscore, ng.root_score(misspelling, variant))

        if nscore <= 2:
            continue

        # ...and if it shows the words are somewhat close, we calculate the metaphone score
        score = 2 * sm.ngram(
            3, misspelling_ph, metaphone(table, word.stem), longer_worse=True)

        if len(scores) > MAX_ROOTS:
            heapq.heappushpop(scores, (score, word.stem))
        else:
            heapq.heappush(scores, (score, word.stem))

    guesses = heapq.nlargest(MAX_ROOTS, scores)

    # Finally, we sort suggestions by a simplistic string similarity metric (comparing the misspelling
    # and the dictionary word's stem)
    guesses2 = [(score + final_score(misspelling, word.lower()), word)
                for (score, word) in guesses]
    # (NB: actually, we might not need ``key`` here, but it is
    # added for sorting stability; it doesn't change the objective quality of suggestions, but passes
    # hunspell's ``phone.sug`` test!)
    guesses2 = sorted(guesses2, key=itemgetter(0), reverse=True)

    for (_, sug) in guesses2:
        yield sug
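
# The bounded top-N selection used above, demonstrated standalone; MAX_ROOTS' real value is
# defined elsewhere in spylls (100 here is an assumption):
import heapq

MAX_ROOTS = 100
scores = []
for pair in [(3.0, 'cat'), (7.0, 'bat'), (5.0, 'rat')]:
    if len(scores) > MAX_ROOTS:
        # heap is full: push the new pair, then drop the lowest-scored one
        heapq.heappushpop(scores, pair)
    else:
        heapq.heappush(scores, pair)
print(heapq.nlargest(MAX_ROOTS, scores))  # best-scored pairs, highest first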
Example #6
def precise_affix_score(word1: str, word2: str, diff_factor: float, *,
                        base: float, has_phonetic: bool) -> float:
    """
    Scoring, stage 3: Hardcore final score for affixed forms!

    It actually produces a score in one of 3 groups:

    * > 1000: if the words are actually the same word with different casing (surprisingly enough, not
      all of those are caught in the "editing" suggest; example: "unesco's" => "UNESCO's")
    * < -100: if the word difference is too large (what counts as "too much" is defined by
      ``diff_factor``); only one of those questionable suggestions will be returned
    * -100...1000: just a normal suggestion score, defining its sorting position

    See also :meth:`filter_guesses` below which uses this separation into "groups" to drop some results.

    Args:
        word1: misspelled word
        word2: possible suggestion
        diff_factor: factor changing amount of suggestions (:attr:`Aff.MAXDIFF <spylls.hunspell.data.aff.Aff.MAXDIFF>`)
        base: initial score of word1 against word2
        has_phonetic: whether there are :attr:`Aff.PHONE <spylls.hunspell.data.aff.Aff.PHONE>`
                      definitions present (it changes ngram thresholds a bit, in order to produce
                      fewer ngram suggestions)
    """

    lcs = sm.lcslen(word1, word2)

    # same characters with different casing -- "very good" suggestion class
    if len(word1) == len(word2) and len(word1) == lcs:
        return base + 2000

    # Score is: length of the longest common subsequence minus the length difference...
    result = 2 * lcs - abs(len(word1) - len(word2))

    # increase score by length of common start substring
    result += sm.leftcommonsubstring(word1, word2)

    # Add 1 if there was _any_ occurrence of "same chars in same positions" in the two words
    if sm.commoncharacters(word1, word2.lower()):
        result += 1

    # Add regular four-gram weight
    result += sm.ngram(4, word1, word2, any_mismatch=True)

    # Sum of weighted bigrams used to estimate result quality
    bigrams = (sm.ngram(2, word1, word2, any_mismatch=True, weighted=True) +
               sm.ngram(2, word2, word1, any_mismatch=True, weighted=True))

    result += bigrams

    # diff_factor ranges from 0 to 2 (depending on aff.MAXDIFF=0..10, with 10 meaning "give me all
    # possible ngrams" and 0 meaning "avoid most of the questionable ngrams"); with MAXDIFF=10 the
    # factor would be 0 and this branch will be avoided; with MAXDIFF=0 the factor would be 2, and
    # lots of "slightly similar" words would be dropped into the "questionable" bag.
    #
    # In the presence of a ``PHONE`` definitions table in the aff-file (used for phonetic similarity
    # search), the threshold is a bit different. NB: I (zverok) believe it is a bug in Hunspell, because
    # this threshold difference was probably (?) meant to produce _less_ questionable ngram suggestions
    # in the presence of phonetic ones, but actually produces more (branches confused?)
    if has_phonetic:
        questionable_limit = len(word2) * diff_factor
    else:
        questionable_limit = (len(word1) + len(word2)) * diff_factor
    if bigrams < questionable_limit:
        result -= 1000

    return result
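
# How diff_factor plausibly maps from MAXDIFF, consistent with the comment above
# (MAXDIFF=10 -> factor 0, MAXDIFF=0 -> factor 2); the exact formula used by spylls may differ:
def diff_factor_from_maxdiff(maxdiff: int) -> float:
    return (10 - maxdiff) / 5

assert diff_factor_from_maxdiff(10) == 0.0  # keep all ngram suggestions
assert diff_factor_from_maxdiff(0) == 2.0   # drop most questionable ones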