示例#1
0
class TestEditFinder(unittest.TestCase):
    """Tests for EditFinder: aligning a word with a misspelling and
    extracting/applying the edits that turn one into the other.

    Edits are (original, changed) string pairs. As the expected values
    below show, an edit carries a neighbouring context character (or the
    start-of-word marker '^' when the edit is at position 0).
    """

    def setUp(self):
        """Create a fresh EditFinder for every test."""
        self.finder = EditFinder()

    #@unittest.skip('')
    def test_deletion(self):
        """find() reports a dropped final letter as a deletion edit."""
        word = "throne"
        error = "thron"
        edits = self.finder.find(word, error)
        self.assertEqual([('ne', 'n')], edits)

    #@unittest.skip('')
    def test_transposition(self):
        """find() reports two swapped adjacent letters as one edit."""
        word = "their"
        error = "thier"
        edits = self.finder.find(word, error)
        self.assertEqual([('ei', 'ie')], edits)

    #@unittest.skip('')
    def test_substitution(self):
        """find() reports a replaced letter together with its left context."""
        word = "scar"
        error = "scax"
        edits = self.finder.find(word, error)
        self.assertEqual([('ar', 'ax')], edits)

    #@unittest.skip('')
    def test_build_edits_rotation(self):
        """build_edits() collapses a rotation ("tragedy" -> "tradegy")
        into a single edit."""
        word = "tragedy"
        error = "tradegy"
        first, second = self.finder.align(word, error)
        start = 3
        end = start + 2
        self.assertTrue(self.finder.edit_is_rotation(first, second, start, end))
        edits = self.finder.build_edits(first, second)
        expected = [('aged', 'adeg')]
        self.assertEqual(expected, edits)

    #@unittest.skip('')
    def test_build_edits_transposition(self):
        """build_edits() recognises a transposition in the aligned strings."""
        word = "their"
        error = "thier"
        first, second = self.finder.align(word, error)
        # The words are aligned like this:
        #     "th-eir"
        #     "thei-r"
        # So a transposition spans three characters.
        start = 2
        end = start + 2
        self.assertTrue(self.finder.edit_is_transposition(first, second, start, end))
        expected = [('ei', 'ie')]
        edits = self.finder.build_edits(first, second)
        self.assertEqual(expected, edits)

    #@unittest.skip('')
    def test_build_edits_insertion(self):
        """Insertions are detected and built both mid-word and at the
        start of the word (where '^' is used as context)."""
        tests = [{
                    'word': 'the',
                    'error': 'thre',
                    'start': 2,
                    'expected': ('th', 'thr')
                },
                {
                    'word': 'car',
                    'error': 'pcar',
                    'start': 0,
                    'expected': ('^', '^p')
                }]
        for test in tests:
            word = test['word']
            error = test['error']
            start = test['start']
            # A single-character edit starts and ends at the same index.
            end = start
            expected = test['expected']

            first, second = self.finder.align(word, error)
            self.assertTrue(self.finder.edit_is_insertion(first, second, start, end))
            edits = self.finder.build_insertion(first, second, start, end)
            self.assertEqual(expected, edits)
            edits = self.finder.build_edits(first, second)
            self.assertEqual([expected], edits)

    #@unittest.skip('')
    def test_build_edits_deletion(self):
        """Deletions are detected and built both at the end and at the
        start of the word (where '^' is used as context)."""
        tests = [{
                    'word': 'three',
                    'error': 'thre',
                    'start': 4,
                    'expected': ('ee', 'e')
                },
                {
                    'word': 'three',
                    'error': 'hree',
                    'start': 0,
                    'expected': ('^t', '^')
                }]
        for test in tests:
            word = test['word']
            error = test['error']
            start = test['start']
            # A single-character edit starts and ends at the same index.
            end = start
            expected = test['expected']
            first, second = self.finder.align(word, error)
            self.assertTrue(self.finder.edit_is_deletion(first, second, start, end))
            edits = self.finder.build_deletion(first, second, start, end)
            self.assertEqual(expected, edits)
            edits = self.finder.build_edits(first, second)
            self.assertEqual([expected], edits)

    #@unittest.skip('')
    def test_build_edits_substitution(self):
        """A substitution is detected and built from the aligned strings."""
        word = "scar"
        error = "scax"
        expected = ("ar", "ax")
        first, second = self.finder.align(word, error)
        # The words are aligned like this:
        #     "scar"
        #     "scax"
        start = 3
        end = start
        self.assertTrue(self.finder.edit_is_substitution(first, second, start, end))
        edits = self.finder.build_substitution(first, second, start, end)
        self.assertEqual(expected, edits)
        edits = self.finder.build_edits(first, second)
        self.assertEqual([expected], edits)

    #@unittest.skip('')
    def test_no_edits(self):
        """build_edits() returns several edits for one word pair.

        Despite the method name, this pair needs three edits.
        """
        word =  "replacement"
        error = "replasments"
        #
        # The words are aligned like this:
        #     "replacement-"
        #     "replas-ments"
        # This should be a substitution, a deletion, and an insertion,
        # each reported with one character of left context:
        #     ('ac', 'as'), ('ce', 'c'), ('nt', 'nts')
        first, second = self.finder.align(word, error)
        expected = [('ac','as'), ('ce', 'c'), ('nt', 'nts')]
        edits = self.finder.build_edits(first, second)
        self.assertEqual(expected, edits)

    #@unittest.skip('')
    def test_apply_straight(self):
        """apply() replays the edits found for "straight" -> "strait"."""
        word =  "straight"
        error = "strait"
        edits = self.finder.find(word, error)
        self.assertEqual(error, self.finder.apply(word, edits))

    #@unittest.skip('')
    def test_apply_generally(self):
        """apply() replays the edits found for "generally" -> "geneology"."""
        word =  "generally"
        error = "geneology"
        edits = self.finder.find(word, error)
        self.assertEqual(error, self.finder.apply(word, edits))

    #@unittest.skip('')
    def test_apply_critics(self):
        """apply() replays the edits found for "critics" -> "criticists"."""
        word =  "critics"
        error = "criticists"
        edits = self.finder.find(word, error)
        self.assertEqual(error, self.finder.apply(word, edits))

    #@unittest.skip('')
    def test_apply_professor(self):
        """apply() replays the edits found for "professor" -> "proffesor"."""
        word =  "professor"
        error = "proffesor"
        edits = self.finder.find(word, error)
        self.assertEqual(error, self.finder.apply(word, edits))

    #@unittest.skip('')
    def test_apply_one(self):
        """apply() replays the edits found for "one" -> "noone"."""
        word =  "one"
        error = "noone"
        edits = self.finder.find(word, error)
        self.assertEqual(error, self.finder.apply(word, edits))

    #@unittest.skip('')
    def test_apply_throughout(self):
        """apply() replays the edits found for "throughout" -> "throught"."""
        word =  "throughout"
        error = "throught"
        edits = self.finder.find(word, error)
        self.assertEqual(error, self.finder.apply(word, edits))

    #@unittest.skip('')
    def test_remove_dashes(self):
        """remove_dashes(index, word) returns (index, word): the word
        without its dash, and the index shifted only when the dash
        precedes it."""
        word =  "crit-cs"
        self.assertEqual("critcs", self.finder.remove_dashes(5, word)[1])
        self.assertEqual(4, self.finder.remove_dashes(5, word)[0])
        self.assertEqual(2, self.finder.remove_dashes(2, word)[0])
        self.assertEqual(4, self.finder.remove_dashes(4, word)[0])

    #@unittest.skip('')
    def test_remove_double_dashes(self):
        """remove_dashes() handles a word containing two dashes."""
        word =  "cr-t-cs"
        self.assertEqual("crtcs", self.finder.remove_dashes(5, word)[1])
        self.assertEqual(3, self.finder.remove_dashes(5, word)[0])
        self.assertEqual(1, self.finder.remove_dashes(1, word)[0])
        self.assertEqual(3, self.finder.remove_dashes(4, word)[0])

    #@unittest.skip('')
    def test_remove_no_dashes(self):
        """remove_dashes() leaves a dash-free word and its index unchanged."""
        word =  "critics"
        self.assertEqual("critics", self.finder.remove_dashes(5, word)[1])
        self.assertEqual(5, self.finder.remove_dashes(5, word)[0])

    #@unittest.skip('')
    def test_apply_on_wiki(self):
        """Round-trip find()/apply() over the pairs loaded from
        data/wikipedia.dat.

        NOTE(review): mismatches are only printed, never raised, so this
        test cannot fail on them -- confirm that is intended.
        """
        words = spelling.mitton.load_mitton_words('data/wikipedia.dat')
        pairs = spelling.mitton.build_mitton_pairs(words)
        for incorrect,correct in pairs:
            edits = self.finder.find(correct, incorrect)
            # Reset per pair so the report below never shows a value left
            # over from a previous pair (or an unbound name) when apply()
            # itself raises AssertionError.
            recovered_error = None
            try:
                recovered_error = self.finder.apply(correct, edits)
                self.assertEqual(incorrect, recovered_error)
            except AssertionError as e:
                print(incorrect, correct, edits, recovered_error, e)
def build_operation_corpus(distance, operation, words, n=3, random_state=17):
    """Build a corpus of non-word errors made by one kind of edit operation.

    For every word, `distance` rounds of `editor.edit(..., operation)` are
    applied. After each round, candidates that are real words (members of
    `words`) are discarded and at most `n` of the remaining candidates are
    sampled to seed the next round.

    Parameters
    ----------
    distance : int
        Number of edit rounds to apply to each word.
    operation :
        Passed unchanged to `Editor.edit` as the operation to perform.
    words :
        Collection of real words; iterated more than once, so it must not
        be a one-shot iterator.
    n : int
        Maximum number of candidates kept per word after each round.
    random_state : int, None, or object with a `choice` method
        An integer (or None) seeds a new `np.random.RandomState`; any
        other object is used as-is for sampling.

    Returns
    -------
    The object returned by `init_corpus()`, with one entry appended per
    generated error to its 'word', 'marked_word', 'real_word',
    'binary_target', 'multiclass_target', 'orig_pattern' and
    'changed_pattern' lists, plus constant 'distance' and 'operation'
    lists of the same length.
    """
    # Accept a seed (or None) as well as a ready-made random state.
    if random_state is None or isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(seed=random_state)

    editor = Editor()
    edit_finder = EditFinder()
    pbar = build_progressbar(words)

    corpus = init_corpus()

    words_set = set(words)

    for position, w in enumerate(words, 1):
        pbar.update(position)
        edits = set([w])
        for _ in range(distance):
            new_edits = set()
            for edit in edits:
                new_edits.update(editor.edit(edit, operation))

            # Remove the word itself and every other real word.
            new_edits.discard(w)
            new_edits = new_edits - words_set

            # Break out if we can't make any new edits; the edits from the
            # previous round are kept.
            if not new_edits:
                break

            n_choice = min(n, len(new_edits))

            # Sample from a sorted list: set iteration order depends on
            # string hashing, which would make the result differ between
            # runs even with a fixed seed.
            edits = random_state.choice(
                sorted(new_edits), size=n_choice, replace=False)

        try:
            edits = random_state.choice(list(edits), size=n, replace=False)
        except ValueError:
            # Raised when fewer than n edits are available (sampling
            # without replacement); keep whatever we have.
            pass

        for edit in edits:
            corpus['word'].append(unicode(edit))
            # Use start-of-word and end-of-word markers as in http://arxiv.org/abs/1602.02410.
            corpus['marked_word'].append('^' + edit + '$')
            corpus['real_word'].append(w)
            corpus['binary_target'].append(0)
            corpus['multiclass_target'].append(0)

            # Record what changed as dash-joined original/changed fragments.
            found = list(edit_finder.find(w, edit))
            corpus['orig_pattern'].append(
                '-'.join([orig for orig, _ in found]))
            corpus['changed_pattern'].append(
                '-'.join([changed for _, changed in found]))

    pbar.finish()

    n_rows = len(corpus['word'])
    corpus['distance'] = [distance] * n_rows
    corpus['operation'] = [operation] * n_rows

    return corpus