class TestEditFinder(unittest.TestCase):
    """Tests for ``EditFinder``: finding, building and applying edits.

    Throughout these tests an edit is a ``(correct_subsequence,
    error_subsequence)`` tuple and ``find``/``build_edits`` return a list of
    such tuples.  ``'^'`` shows up in the expected edits that touch the
    first character of a word.
    """

    def setUp(self):
        # A fresh finder per test so no state leaks between tests.
        self.finder = EditFinder()

    def _assert_apply_recovers(self, word, error):
        """Assert that applying ``find(word, error)`` to ``word`` yields ``error``."""
        edits = self.finder.find(word, error)
        self.assertEqual(error, self.finder.apply(word, edits))

    def test_deletion(self):
        """``find`` reports a dropped final letter as a single edit."""
        word = "throne"
        error = "thron"
        edits = self.finder.find(word, error)
        self.assertEqual([('ne', 'n')], edits)

    def test_transposition(self):
        """``find`` reports two swapped adjacent letters as a single edit."""
        word = "their"
        error = "thier"
        edits = self.finder.find(word, error)
        self.assertEqual([('ei', 'ie')], edits)

    def test_substitution(self):
        """``find`` reports a replaced letter together with its left neighbour."""
        word = "scar"
        error = "scax"
        edits = self.finder.find(word, error)
        self.assertEqual([('ar', 'ax')], edits)

    def test_build_edits_rotation(self):
        """A rotation in the aligned pair is detected and built as one edit."""
        word = "tragedy"
        error = "tradegy"
        first, second = self.finder.align(word, error)
        start = 3
        end = start + 2
        self.assertTrue(
            self.finder.edit_is_rotation(first, second, start, end))
        edits = self.finder.build_edits(first, second)
        expected = [('aged', 'adeg')]
        self.assertEqual(expected, edits)

    def test_build_edits_transposition(self):
        """A transposition in the aligned pair is detected and built as one edit."""
        word = "their"
        error = "thier"
        first, second = self.finder.align(word, error)
        # The words are aligned like this:
        #   "th-eir"
        #   "thei-r"
        # So a transposition spans three characters.
        start = 2
        end = start + 2
        self.assertTrue(
            self.finder.edit_is_transposition(first, second, start, end))
        expected = [('ei', 'ie')]
        edits = self.finder.build_edits(first, second)
        self.assertEqual(expected, edits)

    def test_build_edits_insertion(self):
        """Insertions are detected and built, including at the start of a word."""
        tests = [
            {
                'word': 'the',
                'error': 'thre',
                'start': 2,
                'expected': ('th', 'thr'),
            },
            {
                'word': 'car',
                'error': 'pcar',
                'start': 0,
                'expected': ('^', '^p'),
            },
        ]
        for test in tests:
            word = test['word']
            error = test['error']
            start = test['start']
            end = start
            expected = test['expected']
            first, second = self.finder.align(word, error)
            self.assertTrue(
                self.finder.edit_is_insertion(first, second, start, end))
            edits = self.finder.build_insertion(first, second, start, end)
            self.assertEqual(expected, edits)
            edits = self.finder.build_edits(first, second)
            self.assertEqual([expected], edits)

    def test_build_edits_deletion(self):
        """Deletions are detected and built, including at the start of a word."""
        tests = [
            {
                'word': 'three',
                'error': 'thre',
                'start': 4,
                'expected': ('ee', 'e'),
            },
            {
                'word': 'three',
                'error': 'hree',
                'start': 0,
                'expected': ('^t', '^'),
            },
        ]
        for test in tests:
            word = test['word']
            error = test['error']
            start = test['start']
            end = start
            expected = test['expected']
            first, second = self.finder.align(word, error)
            self.assertTrue(
                self.finder.edit_is_deletion(first, second, start, end))
            edits = self.finder.build_deletion(first, second, start, end)
            self.assertEqual(expected, edits)
            edits = self.finder.build_edits(first, second)
            self.assertEqual([expected], edits)

    def test_build_edits_substitution(self):
        """A substitution in the aligned pair is detected and built as one edit."""
        word = "scar"
        error = "scax"
        expected = ("ar", "ax")
        first, second = self.finder.align(word, error)
        # The words are aligned like this:
        #   "scar"
        #   "scax"
        start = 3
        end = start
        self.assertTrue(
            self.finder.edit_is_substitution(first, second, start, end))
        edits = self.finder.build_substitution(first, second, start, end)
        self.assertEqual(expected, edits)
        edits = self.finder.build_edits(first, second)
        self.assertEqual([expected], edits)

    def test_no_edits(self):
        """``build_edits`` returns several edits for a multiply-misspelled word.

        NOTE(review): despite its name this test expects three edits; the
        name is kept so existing test ids do not change.
        """
        word = "replacement"
        error = "replasments"
        # The words are aligned like this:
        #   "replacement-"
        #   "replas-ments"
        # This should be a substitution, a deletion, and an insertion:
        #   ('ac', 'as'), ('ce', 'c'), ('nt', 'nts')
        first, second = self.finder.align(word, error)
        expected = [('ac', 'as'), ('ce', 'c'), ('nt', 'nts')]
        edits = self.finder.build_edits(first, second)
        self.assertEqual(expected, edits)

    def test_apply_straight(self):
        """Round trip: the edits found for straight -> strait reproduce the error."""
        self._assert_apply_recovers("straight", "strait")

    def test_apply_generally(self):
        """Round trip: the edits found for generally -> geneology reproduce the error."""
        self._assert_apply_recovers("generally", "geneology")

    def test_apply_critics(self):
        """Round trip: the edits found for critics -> criticists reproduce the error."""
        self._assert_apply_recovers("critics", "criticists")

    def test_apply_professor(self):
        """Round trip: the edits found for professor -> proffesor reproduce the error."""
        self._assert_apply_recovers("professor", "proffesor")

    def test_apply_one(self):
        """Round trip: the edits found for one -> noone reproduce the error."""
        self._assert_apply_recovers("one", "noone")

    def test_apply_throughout(self):
        """Round trip: the edits found for throughout -> throught reproduce the error."""
        self._assert_apply_recovers("throughout", "throught")

    def test_remove_dashes(self):
        """``remove_dashes`` strips one dash and returns (adjusted index, string)."""
        word = "crit-cs"
        self.assertEqual("critcs", self.finder.remove_dashes(5, word)[1])
        self.assertEqual(4, self.finder.remove_dashes(5, word)[0])
        self.assertEqual(2, self.finder.remove_dashes(2, word)[0])
        self.assertEqual(4, self.finder.remove_dashes(4, word)[0])

    def test_remove_double_dashes(self):
        """``remove_dashes`` strips two dashes and adjusts the index for each."""
        word = "cr-t-cs"
        self.assertEqual("crtcs", self.finder.remove_dashes(5, word)[1])
        self.assertEqual(3, self.finder.remove_dashes(5, word)[0])
        self.assertEqual(1, self.finder.remove_dashes(1, word)[0])
        self.assertEqual(3, self.finder.remove_dashes(4, word)[0])

    def test_remove_no_dashes(self):
        """``remove_dashes`` leaves a dash-free string and its index untouched."""
        word = "critics"
        self.assertEqual("critics", self.finder.remove_dashes(5, word)[1])
        self.assertEqual(5, self.finder.remove_dashes(5, word)[0])

    def test_apply_on_wiki(self):
        """Report round-trip mismatches over the pairs in data/wikipedia.dat.

        This is a report-only check: a pair whose recovered error differs
        from the recorded one is printed rather than failing the test, so
        one bad pair does not hide the rest.
        """
        words = spelling.mitton.load_mitton_words('data/wikipedia.dat')
        pairs = spelling.mitton.build_mitton_pairs(words)
        for incorrect, correct in pairs:
            edits = self.finder.find(correct, incorrect)
            # Reset per pair so the report below never reads an unbound
            # name or a value left over from the previous pair when
            # ``apply`` itself raises.
            recovered_error = None
            try:
                recovered_error = self.finder.apply(correct, edits)
                self.assertEqual(incorrect, recovered_error)
            except AssertionError as e:
                print(incorrect, correct, edits, recovered_error, e)
def run(self):
    """Generate artificial errors for every word in ``self.real_words``.

    For each word, the candidate edits are the ``(subsequence,
    error_subsequence)`` pairs that ``self.edit_db`` reports with a
    positive count for any subsequence of the word.  Edits are then
    sampled with probability proportional to their count and applied to
    the word until ``self.enough_errors_for_word`` is satisfied or the
    success rate drops too low.

    Returns:
        A list of ``(word, number_of_candidate_edits, edits, error)``
        tuples, where ``edits`` is the list of edits that were applied
        and ``error`` is the value returned by ``EditFinder.apply``.
    """
    errors = []
    pbar = build_progressbar(self.real_words)
    finder = EditFinder()

    # The distribution over "how many edits per error" does not depend on
    # the word, so build it once.  Sizes 1..max_edits_per_error are
    # weighted by 1/size, i.e. smaller edit counts are more likely.
    edit_sizes = np.arange(1, self.max_edits_per_error + 1)
    edit_size_probs = 1. / edit_sizes
    edit_size_probs /= edit_size_probs.sum()

    for word_idx, word in enumerate(self.real_words):
        pbar.update(word_idx + 1)

        # Find all the edits we can make to this word.
        possible_edits = []
        counts = []
        for subseq in subsequences(word):
            for _, error_subseq, count in self.edit_db.edits(subseq):
                if count > 0:
                    possible_edits.append((subseq, error_subseq))
                    counts.append(count)

        if not possible_edits:
            continue

        # Normalise the counts into sampling probabilities.
        probs = np.array(counts)
        probs = probs / float(probs.sum())

        seen_edits = set()
        errors_for_word = []
        # Float so the success ratio below is a true division.
        attempts = 0.

        # Try to generate up to the requested number of errors per word.
        while True:
            try:
                attempts += 1.
                if self.enough_errors_for_word(word, errors_for_word):
                    # Generated enough errors for this word.
                    break
                elif attempts > 10 and len(errors_for_word) / attempts < 0.1:
                    # Not finding many errors to apply.  Break out.
                    break

                # Sample the number of edits.
                size = self.random_state.choice(
                    edit_sizes, size=1, replace=False, p=edit_size_probs)[0]
                # Sampling without replacement cannot draw more edits
                # than this word has candidates; without the clamp the
                # draw below raises ValueError for words with few
                # candidates (assuming a NumPy RandomState).
                size = min(size, len(possible_edits))

                # Sample edits with probability proportional to the
                # edit's frequency.
                edit_idx = self.random_state.choice(
                    len(probs), size=size, replace=False, p=probs)

                # Keep only edits not tried before for this word; an edit
                # is marked as seen even if it is later rejected.
                edit = []
                for idx in edit_idx:
                    candidate = possible_edits[idx]
                    if candidate in seen_edits:
                        continue
                    seen_edits.add(candidate)
                    edit.append(candidate)

                if not edit:
                    continue

                # Avoid applying edits that result in unlikely errors.
                for constraint in self.constraints:
                    for candidate in edit:
                        if constraint(word, candidate):
                            raise EditConstraintError(
                                "can't apply edit %s=>%s to word '%s'" %
                                (candidate[0], candidate[1], word))

                error = finder.apply(word, edit)
                if error in self.blacklist:
                    # Skip blacklisted words (i.e. non-words in a corpus
                    # used to generate the edit patterns in the edit
                    # database).
                    continue

                errors_for_word.append(
                    (word, len(possible_edits), edit, error))
            except EditConstraintError as exc:
                # A vetoed edit only costs one attempt; keep sampling.
                if self.verbose:
                    print(exc)

        errors.extend(errors_for_word)

    pbar.finish()
    return errors