예제 #1
0
 def _get_candidate_tokens(self, typo_info: TypoInfo) -> Set[Tuple[str, int]]:
     """
     Collect correction candidates for a typo as (token, distance) pairs.

     Candidates are gathered from up to three sources, each guarded by a config entry:
     the suggestions returned by `self.checker.lookup`, the `self._closest` neighbours of
     the typo's vector, and the `self._closest` neighbours of the vector built from the
     surrounding context. The typo itself is always included with distance 0.

     :param typo_info: Typo description; its `typo`, `before` and `after` attributes are read.
     :return: Set of (candidate token, distance) tuples, where the distance is the one \
              reported by the lookup suggestion or by the `EditDistance` calculator.
     """
     distance_to_typo = EditDistance(typo_info.typo, "damerau")
     found = set()

     def with_distances(neighbors):
         # Keep only neighbours present in self.tokens and pair each one with the value
         # the calculator reports for the configured "radius".
         return {
             (token, distance_to_typo.damerau_levenshtein_distance(token, self.config["radius"]))
             for token in neighbors if token in self.tokens}

     if self.config["edit_dist_number"] > 0:
         kept_in_run = 0
         run_distance = -1
         # NOTE(review): the literal 2 is passed through unchanged; presumably it is the
         # lookup verbosity - confirm against the checker's API.
         hits = self.checker.lookup(typo_info.typo, 2, self.config["max_distance"])
         for hit in hits:
             # The counter restarts whenever the suggestion distance changes, so at most
             # "edit_dist_number" suggestions are kept from each run of equal distances.
             if hit.distance != run_distance:
                 run_distance = hit.distance
                 kept_in_run = 0
             if kept_in_run < self.config["edit_dist_number"]:
                 found.add((hit.term, hit.distance))
                 kept_in_run += 1

     if self.config["neighbors_number"] > 0:
         near_typo = self._closest(self._vec(typo_info.typo), self.config["neighbors_number"])
         found |= with_distances(near_typo)
         # Context neighbours are only queried when there is some context around the typo.
         if len(typo_info.before + typo_info.after) > 0:
             context = "%s %s" % (typo_info.before, typo_info.after)
             near_context = self._closest(self._compound_vec(context),
                                          self.config["neighbors_number"])
             found |= with_distances(near_context)

     found.add((typo_info.typo, 0))
     return found
예제 #2
0
 def test_corruptions(self):
     """
     Check that each random corruption is at distance 1 and changes length as expected.
     """
     original = "abcdefgh"
     calculator = EditDistance(original, "damerau")
     # Every corruption function is paired with the length change it must produce.
     length_changes = [(rand_delete, -1), (rand_insert, 1),
                       (rand_substitution, 0), (rand_swap, 0)]
     for corrupt, delta in length_changes:
         for _ in range(100):
             mutated = corrupt(original)
             self.assertEqual(calculator.compare(mutated, 1), 1)
             self.assertEqual(len(mutated), len(original) + delta)
     # NOTE(review): 1.0 and 0.0 are passed through as-is; presumably probabilities that
     # force exactly one typo per call - confirm against _rand_typo.
     for _ in range(100):
         mutated, _ignored = _rand_typo((original, original), 1.0, 0.0)
         self.assertEqual(calculator.compare(mutated, 1), 1)
예제 #3
0
    def _lookup_corrections_for_token(self, typo_info: TypoInfo) -> List[Features]:
        """
        Build one feature entry per usable correction candidate of a typo.

        Every distinct candidate returned by `_get_candidate_tokens` is looked up in
        `self.wv` and measured against the typo; candidates whose reported distance is
        negative are dropped.

        :param typo_info: Typo description; its `typo` attribute is read here.
        :return: List of `_generate_features` results for the candidates that were kept.
        """
        features = []
        raw_candidates = self._get_candidate_tokens(typo_info)
        typo_vector = self._vec(typo_info.typo)
        distance_to_typo = EditDistance(typo_info.typo, "damerau")
        # set() removes duplicate candidates before features are generated.
        for token in set(raw_candidates):
            token_vector = self.wv[token]
            distance = distance_to_typo.damerau_levenshtein_distance(token, self.radius)
            if distance < 0:
                # A negative value marks a candidate that must not be used.
                continue
            entry = self._generate_features(typo_info, distance, typo_vector,
                                            token, token_vector)
            features.append(entry)
        return features
예제 #4
0
 def test_compare(self):
     """
     Check `EditDistance.compare` against the base string "token" on three fixed cases.
     """
     calculator = EditDistance("token", "damerau")
     # Each case is (other string, second argument of compare, expected result).
     # NOTE(review): the second argument looks like a distance limit, with -1 returned
     # when it is exceeded - confirm against EditDistance.
     cases = [
         ("tokem", 1, 1),
         ("tokems", 1, -1),
         ("tokems", 2, 2),
     ]
     for other, limit, expected in cases:
         self.assertEqual(calculator.compare(other, limit), expected)