示例#1
0
 def setUp(self):
     """Create a fresh ``EditFinder`` and store it on ``self.finder``."""
     self.finder = EditFinder()
def build_operation_corpus(distance, operation, words, n=3, random_state=17):
    """Build a corpus of artificial non-words produced by one edit operation.

    For each word in ``words`` the function applies ``operation`` (via
    ``Editor.edit``) up to ``distance`` times.  After every round the source
    word and any string that is itself in ``words`` are dropped, and at most
    ``n`` of the remaining edits are sampled (without replacement) to seed
    the next round.  The surviving edits are appended to the corpus together
    with the edit patterns reported by ``EditFinder.find``.

    Parameters
    ----------
    distance : int
        Number of edit rounds to attempt per word; also stored in the
        ``'distance'`` column of every row.
    operation
        Passed through unchanged to ``Editor.edit`` and stored in the
        ``'operation'`` column of every row.
    words : sequence of str
        The real words to corrupt.  It is iterated more than once.
    n : int, optional
        Maximum number of edits kept per word per round.
    random_state : int or numpy.random.RandomState, optional
        An int is used as the seed of a new ``RandomState``; anything else is
        used as-is and must provide ``choice``.

    Returns
    -------
    The object returned by ``init_corpus()`` (presumably a dict of lists --
    confirm against ``init_corpus``) with rows appended under the keys
    ``'word'``, ``'marked_word'``, ``'real_word'``, ``'binary_target'``,
    ``'multiclass_target'``, ``'orig_pattern'`` and ``'changed_pattern'``,
    plus ``'distance'`` and ``'operation'`` columns of matching length.
    """
    if isinstance(random_state, int):
        random_state = np.random.RandomState(seed=random_state)

    editor = Editor()
    edit_finder = EditFinder()
    pbar = build_progressbar(words)

    corpus = init_corpus()

    words_set = set(words)

    for i, w in enumerate(words):
        pbar.update(i+1)
        edits = set([w])

        # The round counter is unused; "_" avoids shadowing the progress
        # index ``i`` of the enclosing loop.
        for _ in range(distance):
            new_edits = set()
            for edit in edits:
                new_edits.update(editor.edit(edit, operation))

            # Remove the word itself from new edits.
            new_edits.discard(w)

            # Remove real words from the edits (in place, so the set's
            # iteration order -- and hence the sampling below -- is unchanged).
            new_edits.difference_update(words_set)

            # Break out if we can't make any new edits; ``edits`` keeps the
            # result of the previous round.
            if not new_edits:
                break

            n_choice = min(n, len(new_edits))
            edits = random_state.choice(
                list(new_edits), size=n_choice, replace=False)

        # Draw exactly ``n`` edits when enough are available.  With fewer
        # than ``n`` candidates ``choice`` raises ValueError and every
        # candidate is kept as-is.
        try:
            edits = random_state.choice(list(edits), size=n, replace=False)
        except ValueError:
            pass

        for edit in edits:
            # NOTE(review): ``unicode`` is the Python 2 text type; on
            # Python 3 the name has to be provided elsewhere in the module.
            corpus['word'].append(unicode(edit))
            # Use start-of-word and end-of-word markers as in http://arxiv.org/abs/1602.02410.
            corpus['marked_word'].append('^' + edit + '$')
            corpus['real_word'].append(w)
            corpus['binary_target'].append(0)
            corpus['multiclass_target'].append(0)

            # Record the (original, changed) substring pairs that turn the
            # real word into this edit, joined with '-'.
            orig_chars = []
            changed_chars = []
            for orig, changed in edit_finder.find(w, edit):
                orig_chars.append(orig)
                changed_chars.append(changed)
            corpus['orig_pattern'].append('-'.join(orig_chars))
            corpus['changed_pattern'].append('-'.join(changed_chars))

    pbar.finish()

    n_rows = len(corpus['word'])
    corpus['distance'] = [distance] * n_rows
    corpus['operation'] = [operation] * n_rows

    return corpus
示例#3
0
class TestEditFinder(unittest.TestCase):
    """Unit tests for ``EditFinder``: alignment, edit extraction and replay.

    Edits are ``(original_substring, changed_substring)`` tuples.  As the
    expected values below show, an edit carries one character of left
    context, and ``'^'`` stands in for that context at the start of a word.
    """

    def setUp(self):
        self.finder = EditFinder()

    def test_deletion(self):
        word = "throne"
        error = "thron"
        edits = self.finder.find(word, error)
        self.assertEqual([('ne', 'n')], edits)

    def test_transposition(self):
        word = "their"
        error = "thier"
        edits = self.finder.find(word, error)
        self.assertEqual([('ei', 'ie')], edits)

    def test_substitution(self):
        word = "scar"
        error = "scax"
        edits = self.finder.find(word, error)
        print(word, error, edits)
        self.assertEqual([('ar', 'ax')], edits)

    def test_build_edits_rotation(self):
        word = "tragedy"
        error = "tradegy"
        first, second = self.finder.align(word, error)
        start = 3
        end = start + 2
        self.assertTrue(self.finder.edit_is_rotation(first, second, start, end))
        edits = self.finder.build_edits(first, second)
        expected = [('aged', 'adeg')]
        self.assertEqual(expected, edits)

    def test_build_edits_transposition(self):
        word = "their"
        error = "thier"
        first, second = self.finder.align(word, error)
        # The words are aligned like this:
        #     "th-eir"
        #     "thei-r"
        # So a transposition spans three characters.
        start = 2
        end = start + 2
        self.assertTrue(self.finder.edit_is_transposition(first, second, start, end))
        expected = [('ei', 'ie')]
        edits = self.finder.build_edits(first, second)
        self.assertEqual(expected, edits)

    def test_build_edits_insertion(self):
        tests = [{
                    'word': 'the',
                    'error': 'thre',
                    'start': 2,
                    'expected': ('th', 'thr')
                },
                {
                    'word': 'car',
                    'error': 'pcar',
                    'start': 0,
                    'expected': ('^', '^p')
                }]
        for test in tests:
            word = test['word']
            error = test['error']
            start = test['start']
            end = start
            expected = test['expected']

            first, second = self.finder.align(word, error)
            self.assertTrue(self.finder.edit_is_insertion(first, second, start, end))
            edits = self.finder.build_insertion(first, second, start, end)
            self.assertEqual(expected, edits)
            edits = self.finder.build_edits(first, second)
            self.assertEqual([expected], edits)

    def test_build_edits_deletion(self):
        tests = [{
                    'word': 'three',
                    'error': 'thre',
                    'start': 4,
                    'expected': ('ee', 'e')
                },
                {
                    'word': 'three',
                    'error': 'hree',
                    'start': 0,
                    'expected': ('^t', '^')
                }]
        for test in tests:
            word = test['word']
            error = test['error']
            start = test['start']
            end = start
            expected = test['expected']
            first, second = self.finder.align(word, error)
            self.assertTrue(self.finder.edit_is_deletion(first, second, start, end))
            edits = self.finder.build_deletion(first, second, start, end)
            self.assertEqual(expected, edits)
            edits = self.finder.build_edits(first, second)
            self.assertEqual([expected], edits)

    def test_build_edits_substitution(self):
        word = "scar"
        error = "scax"
        expected = ("ar", "ax")
        first, second = self.finder.align(word, error)
        # The words are aligned like this:
        #     "scar"
        #     "scax"
        start = 3
        end = start
        self.assertTrue(self.finder.edit_is_substitution(first, second, start, end))
        edits = self.finder.build_substitution(first, second, start, end)
        self.assertEqual(expected, edits)
        edits = self.finder.build_edits(first, second)
        self.assertEqual([expected], edits)

    def test_no_edits(self):
        # NOTE: despite its name, this test covers a pair that needs three
        # edits.
        word = "replacement"
        error = "replasments"
        #
        # The words are aligned like this:
        #     "replacement-"
        #     "replas-ments"
        # This should be a substitution, a deletion, and an insertion, each
        # reported with one character of left context:
        #     ('ac', 'as'), ('ce', 'c'), ('nt', 'nts')
        first, second = self.finder.align(word, error)
        expected = [('ac', 'as'), ('ce', 'c'), ('nt', 'nts')]
        edits = self.finder.build_edits(first, second)
        self.assertEqual(expected, edits)

    def test_apply_straight(self):
        word = "straight"
        error = "strait"
        edits = self.finder.find(word, error)
        self.assertEqual(error, self.finder.apply(word, edits))

    def test_apply_generally(self):
        word = "generally"
        error = "geneology"
        edits = self.finder.find(word, error)
        self.assertEqual(error, self.finder.apply(word, edits))

    def test_apply_critics(self):
        word = "critics"
        error = "criticists"
        edits = self.finder.find(word, error)
        self.assertEqual(error, self.finder.apply(word, edits))

    def test_apply_professor(self):
        word = "professor"
        error = "proffesor"
        edits = self.finder.find(word, error)
        self.assertEqual(error, self.finder.apply(word, edits))

    def test_apply_one(self):
        word = "one"
        error = "noone"
        edits = self.finder.find(word, error)
        self.assertEqual(error, self.finder.apply(word, edits))

    def test_apply_throughout(self):
        word = "throughout"
        error = "throught"
        edits = self.finder.find(word, error)
        self.assertEqual(error, self.finder.apply(word, edits))

    def test_remove_dashes(self):
        # remove_dashes(index, word) returns a pair: element 0 is the index
        # into the dash-free word, element 1 is the word without dashes.
        word = "crit-cs"
        self.assertEqual("critcs", self.finder.remove_dashes(5, word)[1])
        self.assertEqual(4, self.finder.remove_dashes(5, word)[0])
        self.assertEqual(2, self.finder.remove_dashes(2, word)[0])
        self.assertEqual(4, self.finder.remove_dashes(4, word)[0])

    def test_remove_double_dashes(self):
        word = "cr-t-cs"
        self.assertEqual("crtcs", self.finder.remove_dashes(5, word)[1])
        self.assertEqual(3, self.finder.remove_dashes(5, word)[0])
        self.assertEqual(1, self.finder.remove_dashes(1, word)[0])
        self.assertEqual(3, self.finder.remove_dashes(4, word)[0])

    def test_remove_no_dashes(self):
        word = "critics"
        self.assertEqual("critics", self.finder.remove_dashes(5, word)[1])
        self.assertEqual(5, self.finder.remove_dashes(5, word)[0])

    def test_apply_on_wiki(self):
        # Round-trip every (incorrect, correct) pair loaded from
        # data/wikipedia.dat: find the edits, then re-apply them.
        #
        # NOTE(review): mismatches are printed rather than raised, so this
        # test reports failing pairs but never fails itself.
        words = spelling.mitton.load_mitton_words('data/wikipedia.dat')
        pairs = spelling.mitton.build_mitton_pairs(words)
        for incorrect, correct in pairs:
            edits = self.finder.find(correct, incorrect)
            # Reset per pair so the report below never shows a value left
            # over from an earlier pair (or hits an unbound name) when
            # apply() itself raises the AssertionError.
            recovered_error = None
            try:
                recovered_error = self.finder.apply(correct, edits)
                self.assertEqual(incorrect, recovered_error)
            except AssertionError as e:
                print(incorrect, correct, edits, recovered_error, e)
示例#4
0
文件: jobs.py 项目: ndronen/spelling
    def run(self):
        """Generate artificial spelling errors for every word in ``self.real_words``.

        For each word, every subsequence is looked up in ``self.edit_db`` and
        the edits with a positive count become candidates, weighted by that
        count.  Edits are then sampled and applied with ``EditFinder.apply``
        until ``self.enough_errors_for_word`` is satisfied or the success
        rate drops below 10% after more than ten attempts.

        Returns
        -------
        list of tuple
            One ``(word, number_of_candidate_edits, edit, error)`` tuple per
            generated error, where ``edit`` is the list of
            ``(subsequence, error_subsequence)`` pairs that were applied and
            ``error`` is the value returned by ``EditFinder.apply``.
        """
        errors = []
        pbar = build_progressbar(self.real_words)

        finder = EditFinder()

        # Distribution over the number of edits per error: P(k) is
        # proportional to 1/k for k in 1..max_edits_per_error.  It does not
        # depend on the word, so it is computed once.
        edit_sizes = np.arange(1, self.max_edits_per_error+1)
        edit_size_probs = 1. / edit_sizes
        edit_size_probs /= edit_size_probs.sum()

        for i, word in enumerate(self.real_words):
            pbar.update(i+1)

            # Find all the edits we can make to this word.
            possible_edits = list()
            probs = list()
            for subseq in subsequences(word):
                for e in self.edit_db.edits(subseq):
                    _, error_subseq, count = e
                    # Only edits with a positive count are candidates.
                    if count > 0:
                        possible_edits.append((subseq, error_subseq))
                        probs.append(count)

            if len(possible_edits) == 0:
                continue

            # Normalize the counts into sampling probabilities.
            probs = np.array(probs)
            probs = probs / float(probs.sum())

            seen_edits = set()
            errors_for_word = []
            # Kept as a float so the success-rate division below is true
            # division on Python 2 as well.
            attempts = 0.

            # Try to generate up to the requested number of errors per word.
            while True:
                try:
                    attempts += 1.

                    if self.enough_errors_for_word(word, errors_for_word):
                        # Generated enough errors for this word.
                        break
                    elif attempts > 10 and len(errors_for_word) / attempts < 0.1:
                        # Not finding many errors to apply.  Break out.
                        break

                    # Sample the number of edits.
                    size = self.random_state.choice(edit_sizes, size=1, replace=False,
                            p=edit_size_probs)[0]
                    # Sampling without replacement cannot draw more edits
                    # than there are candidates; without this clamp numpy
                    # raises ValueError for words with few candidate edits.
                    size = min(size, len(probs))

                    # Sample edits with probability proportional to the edit's frequency.
                    edit_idx = self.random_state.choice(len(probs), size=size, replace=False, p=probs)

                    # Keep only edits not used in an earlier attempt for
                    # this word.
                    edit = []
                    for idx in edit_idx:
                        pe = possible_edits[idx]
                        if pe in seen_edits:
                            continue
                        seen_edits.add(pe)
                        edit.append(pe)

                    if len(edit) == 0:
                        continue

                    # Avoid applying edits that result in unlikely errors.
                    for constraint in self.constraints:
                        for e in edit:
                            if constraint(word, e):
                                raise EditConstraintError("can't apply edit %s=>%s to word '%s'" % \
                                        (e[0], e[1], word))

                    error = finder.apply(word, edit)
                    if error in self.blacklist:
                        # Skip blacklisted words (i.e. non-words in a corpus used to generate the
                        # edit patterns in the edit database).
                        continue

                    errors_for_word.append((word, len(possible_edits), edit, error))

                except EditConstraintError as e:
                    # A constraint rejected this sample; report it when
                    # verbose and try again.
                    if self.verbose:
                        print(e)

            errors.extend(errors_for_word)

        pbar.finish()

        return errors