def _get_candidate_tokens(self, typo_info: TypoInfo) -> Set[Tuple[str, int]]:
    """
    Gather candidate corrections for a typo as (token, distance) pairs.

    Depending on `self.config`, up to three sources contribute:

    1. Suggestions from `self.checker.lookup()`, keeping at most \
       `edit_dist_number` suggestions per run of consecutive equal distances.
    2. The `neighbors_number` closest tokens to the typo's own vector.
    3. The `neighbors_number` closest tokens to the compound vector of the \
       surrounding context, when neighbour search is enabled and the context \
       is not empty.

    Neighbours are kept only if they are present in `self.tokens`. The typo \
    itself is always added with distance 0.

    :param typo_info: Typo description; its `typo`, `before` and `after` attributes are read.
    :return: Set of (candidate token, distance to the typo) tuples.
    """
    typo = typo_info.typo
    pairs = set()
    dist_calc = EditDistance(typo, "damerau")

    def with_distance(neighbors):
        # Pair every known neighbour with its distance to the typo, as reported by
        # `dist_calc` under the configured radius.
        return {
            (token, dist_calc.damerau_levenshtein_distance(token, self.config["radius"]))
            for token in neighbors
            if token in self.tokens
        }

    per_distance_limit = self.config["edit_dist_number"]
    if per_distance_limit > 0:
        previous_distance = -1
        taken = 0
        # NOTE(review): the literal 2 is the second positional argument of lookup();
        # presumably a verbosity mode - confirm against the checker's API.
        for suggestion in self.checker.lookup(typo, 2, self.config["max_distance"]):
            distance = suggestion.distance
            if distance != previous_distance:
                # The distance changed: restart the per-distance quota.
                previous_distance = distance
                taken = 0
            if taken < per_distance_limit:
                pairs.add((suggestion.term, distance))
                taken += 1

    neighbors_number = self.config["neighbors_number"]
    if neighbors_number > 0:
        pairs |= with_distance(self._closest(self._vec(typo), neighbors_number))
        if len(typo_info.before + typo_info.after) > 0:
            context_vec = self._compound_vec("%s %s" % (typo_info.before, typo_info.after))
            pairs |= with_distance(self._closest(context_vec, neighbors_number))

    pairs.add((typo, 0))
    return pairs
def test_corruptions(self):
    """
    Check that every corruption helper yields a string at distance 1 from the token.

    For each of `rand_delete`, `rand_insert`, `rand_substitution` and `rand_swap` the \
    corrupted string must also have the expected length change (-1, +1, 0, 0). \
    `_rand_typo` is then called with arguments `(token, token), 1.0, 0.0` and the first \
    element of its result must also be at distance 1.
    """
    token = "abcdefgh"
    dist_calculator = EditDistance(token, "damerau")
    attempts = 100
    corruptions = (
        (rand_delete, -1),
        (rand_insert, 1),
        (rand_substitution, 0),
        (rand_swap, 0),
    )
    for corrupt, length_delta in corruptions:
        expected_length = len(token) + length_delta
        for _ in range(attempts):
            corrupted = corrupt(token)
            self.assertEqual(dist_calculator.compare(corrupted, 1), 1)
            self.assertEqual(len(corrupted), expected_length)

    for _ in range(attempts):
        # Only the first element of the returned pair is checked.
        corrupted, _unused = _rand_typo((token, token), 1.0, 0.0)
        self.assertEqual(dist_calculator.compare(corrupted, 1), 1)
def _lookup_corrections_for_token(self, typo_info: TypoInfo) -> List[Features]:
    """
    Build features for every acceptable correction candidate of a typo.

    Candidates come from `self._get_candidate_tokens()`. Each one is looked up in \
    `self.wv` and measured against the typo with `self.radius` as the distance bound. \
    Candidates whose distance is negative are dropped.

    :param typo_info: Typo description; its `typo` attribute is read here.
    :return: List with the result of `self._generate_features()` for each kept candidate.
    """
    tokens = self._get_candidate_tokens(typo_info)
    typo_vec = self._vec(typo_info.typo)
    dist_calc = EditDistance(typo_info.typo, "damerau")
    features = []
    # NOTE(review): each element is used directly as a `self.wv` key and as the string
    # to measure, i.e. as a bare token - confirm this matches what
    # `_get_candidate_tokens()` returns in this revision.
    for token in set(tokens):
        token_vec = self.wv[token]
        distance = dist_calc.damerau_levenshtein_distance(token, self.radius)
        # A negative distance presumably marks a candidate beyond the radius - confirm.
        if distance >= 0:
            features.append(
                self._generate_features(typo_info, distance, typo_vec, token, token_vec))
    return features
def test_compare(self):
    """
    Check `EditDistance.compare()` results against the base string "token".

    Each case is (other string, max distance, expected result); the second case \
    expects -1 for "tokems" with a max distance of 1.
    """
    dist_calculator = EditDistance("token", "damerau")
    cases = (
        ("tokem", 1, 1),
        ("tokems", 1, -1),
        ("tokems", 2, 2),
    )
    for other, max_distance, expected in cases:
        self.assertEqual(dist_calculator.compare(other, max_distance), expected)