Example #1
File: jobs.py  Project: ndronen/spelling
 def run(self):
     """
     words : list
         The words for which to find the nearest suggestions.
     distance : callable
         Function taking two words and returning a distance.
     dictionary : spelling.dictionary 
         Instance of a class in spelling.dictionary.
     """
     nearest = []
     pbar = build_progressbar(self.words)
     for i,word in enumerate(self.words):
         pbar.update(i+1)
         suggestions = self.suggest(word)
         if len(suggestions) == 0:
             nearest.append((word, "", 100))
         else:
             distances = [(word, s, self.distance(word, s))
                 for s in suggestions]
             sorted_distances = sorted(distances,
                 key=operator.itemgetter(2))
             nearest.append(sorted_distances[0])
     pbar.finish()
     return pd.DataFrame(data=nearest,
         columns=['word', 'suggestion', 'distance'])
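A minimal, self-contained sketch of the selection step above. The nearest_suggestion helper and the toy distance function are hypothetical; the job itself relies on self.suggest and self.distance.

import operator

def nearest_suggestion(word, suggestions, distance):
    # Mirror the sentinel used above when there are no suggestions.
    if len(suggestions) == 0:
        return (word, "", 100)
    distances = [(word, s, distance(word, s)) for s in suggestions]
    return sorted(distances, key=operator.itemgetter(2))[0]

# Toy distance: positions at which the words differ, plus the length gap.
def toy_distance(a, b):
    return sum(x != y for x, y in zip(a, b)) + abs(len(a) - len(b))

print(nearest_suggestion("teh", ["the", "ten", "tech"], toy_distance))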
Example #2
File: jobs.py  Project: ndronen/spelling
 def run(self):
     df = pd.read_csv(self.input_csv, sep='\t', encoding='utf8')
     unique_words = df.word.unique()
     pbar = build_progressbar(unique_words)
     for i, word in enumerate(unique_words):
         pbar.update(i+1)
         df_tmp = df[df.word == word]
         df_tmp.to_csv(self.output_csv % i,
                 sep='\t', index=False, encoding='utf8')
     pbar.finish()
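The same per-word split can be expressed with a pandas groupby; a sketch, assuming output_csv is a format string containing %d as in the job above.

import pandas as pd

def split_by_word(input_csv, output_csv):
    df = pd.read_csv(input_csv, sep='\t', encoding='utf8')
    # One output file per unique word, numbered in order of first appearance.
    for i, (word, df_tmp) in enumerate(df.groupby('word', sort=False)):
        df_tmp.to_csv(output_csv % i, sep='\t', index=False, encoding='utf8')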
Example #3
File: train.py  Project: ndronen/spelling
def fit_cv(estimator, df_train, df_valid, feature_names, target_name, scale=False, correct_word_is_in_suggestions=False, random_state=17, verbose=True):

    df_valid = df_valid.copy()

    X_train = df_train[feature_names]
    X_valid = df_valid[feature_names]
    
    y_train = df_train[target_name]
    y_valid = df_valid[target_name]

    if verbose:
        print('train %d valid %d' % (len(X_train), len(X_valid)))

    scaler = StandardScaler()
    if scale:
        X_train = scaler.fit_transform(X_train)
        X_valid = scaler.transform(X_valid)

    estimator.fit(X_train, y_train)
    if verbose:
        print(estimator)

    # Some errors are not unique to a single correct word
    # (e.g. ther -> their, ther -> there), so when getting the
    # validation set predictions, partition the examples by
    # correct word and error.
    correct_words = df_valid.correct_word.unique()
    pbar = build_progressbar(correct_words)
    y_hat_valid_proba = estimator.predict_proba(X_valid)
    for i, correct_word in enumerate(correct_words):
        cw_mask = df_valid.correct_word == correct_word
        errors = df_valid[cw_mask].error.unique()
        pbar.update(i+1)
        for j,error in enumerate(errors):
            mask = (cw_mask & (df_valid.error == error)).values
            y_valid_tmp = y_valid[mask].values
            y_valid_tmp_proba = y_hat_valid_proba[mask]

            # Rank the candidates for this (correct_word, error) pair by the
            # predicted probability of being correct: the highest-probability
            # candidate gets suggestion_index 0, the lowest the largest index.
            ranks = np.argsort(y_valid_tmp_proba[:, 1])
            y_valid_tmp_pred = np.zeros_like(y_valid_tmp)
            y_valid_tmp_pred[ranks] = np.arange(len(y_valid_tmp) - 1, -1, -1)

            df_valid.loc[mask, 'suggestion_index'] = y_valid_tmp_pred

    pbar.finish()

    return df_valid
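A standalone sketch of the ranking step above: the candidate with the highest predicted probability of being correct receives suggestion_index 0, the next highest 1, and so on.

import numpy as np

proba = np.array([0.1, 0.7, 0.2])                  # P(correct) for three candidates
ranks = np.argsort(proba)                          # candidate indices, least to most probable
suggestion_index = np.zeros(len(proba), dtype=int)
suggestion_index[ranks] = np.arange(len(proba) - 1, -1, -1)
print(suggestion_index)                            # [2 0 1]: candidate 1 is ranked first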
Example #4
File: train.py  Project: ndronen/spelling
def add_features_from_vectorizer(df, vectorizer, column, feature_name_prefix=None):
    if feature_name_prefix is None:
        feature_name_prefix = column + '_'
    count_features = np.array(vectorizer.transform(df[column]).todense())
    rindex = [(v,k) for k,v in vectorizer.vocabulary_.items()]
    sorted_rindex = sorted(rindex,
            key=operator.itemgetter(0))
    feature_names = [feature_name_prefix+t[1] for t in sorted_rindex]

    pbar = build_progressbar(feature_names)
    for i,feature_name in enumerate(feature_names):
        pbar.update(i+1)
        df.loc[:, feature_name] = count_features[:, i]
    pbar.finish()
    return feature_names
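A hypothetical usage sketch for add_features_from_vectorizer, assuming a fitted scikit-learn CountVectorizer over character bigrams and the project's build_progressbar helper on the path.

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

df = pd.DataFrame({'word': ['their', 'there', 'ther']})
vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 2))
vectorizer.fit(df.word)
# Adds one count column per bigram (e.g. 'word_th') and returns the new names.
new_columns = add_features_from_vectorizer(df, vectorizer, 'word')
print(new_columns)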
Example #5
File: jobs.py  Project: ndronen/spelling
 def run(self):
     corpus = []
     pbar = build_progressbar(self.words)
     for i,word in enumerate(self.words):
         pbar.update(i+1)
         for d in self.distances:
             # Make this a set, because typo_generator doesn't
             # guarantee uniqueness.
             typos = set()
             for t in typo_generator(word, d):
                 if t == word:
                     continue
                 if self.sample(word, t, d):
                     typos.add(t)
                 if len(typos) == self.max_examples_per_word:
                     break
             for t in typos:
                 corpus.append((word,t,d))
     pbar.finish()
     print("generated %d errors for %d words" %
             (len(corpus), len(self.words)))
     return pd.DataFrame(data=corpus, columns=['word', 'typo', 'distance'])
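The corpus returned above is a flat word/typo/distance table; a toy illustration of its shape (rows are hypothetical).

import pandas as pd

corpus = [('their', 'thier', 1), ('their', 'theri', 1), ('there', 'theer', 1)]
df = pd.DataFrame(data=corpus, columns=['word', 'typo', 'distance'])
print(df.groupby('word').size())   # number of generated typos per word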
def build_operation_corpus(distance, operation, words, n=3, random_state=17):
    if isinstance(random_state, int):
        random_state = np.random.RandomState(seed=random_state)

    editor = Editor()
    edit_finder = EditFinder()
    pbar = build_progressbar(words)

    corpus = init_corpus()

    words_set = set(words)

    for i,w in enumerate(words):
        pbar.update(i+1)
        edits = set([w])
        #print('initial edits', edits)
        for dist_round in range(distance):
            #print(w, dist_round)
            new_edits = set()
            for edit in edits:
                #print('getting edits for %s' % edit)
                edits_for = editor.edit(edit, operation)
                new_edits.update(edits_for)
                #print('edits for %s %s' % (edit, str(new_edits)))

            # Remove the word itself from new edits.
            try:
                new_edits.remove(w)
            except KeyError:
                pass

            # Remove real words from the edits.
            for edit in new_edits.copy():
                if edit in words_set:
                    new_edits.remove(edit)

            # Break out if we can't make any new edits.
            if len(new_edits) == 0:
                new_edits = edits
                break

            #print('new edits for %s %s (after removing %s)' % (edit, str(new_edits), w))

            n_choice = min(n, len(new_edits))

            try:
                edits = random_state.choice(list(new_edits), size=n_choice, replace=False)
            except ValueError as e:
                #print(w, new_edits, e)
                raise e

            #print('%d edits for %s %s (after sampling %d)' % (n_choice, edit, str(edits), n))

        try:
            edits = random_state.choice(list(edits), size=n, replace=False)
        except ValueError:
            pass

        for edit in edits:
            corpus['word'].append(unicode(edit))
            # Use start-of-word and end-of-word markers as in http://arxiv.org/abs/1602.02410.
            corpus['marked_word'].append('^' + edit + '$')
            corpus['real_word'].append(w)
            corpus['binary_target'].append(0)
            corpus['multiclass_target'].append(0)

            orig_chars = []
            changed_chars = []
            for orig,changed in edit_finder.find(w, edit):
                orig_chars.append(orig)
                changed_chars.append(changed)
            corpus['orig_pattern'].append('-'.join(orig_chars))
            corpus['changed_pattern'].append('-'.join(changed_chars))

    pbar.finish()

    corpus['distance'] = [distance] * len(corpus['word'])
    corpus['operation'] = [operation] * len(corpus['word'])

    return corpus
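The corpus here is a dict of parallel lists keyed by the columns used above; a plausible init_corpus, reconstructed as a guess from those keys (the project's actual helper may differ).

def init_corpus():
    keys = ('word', 'marked_word', 'real_word', 'binary_target',
            'multiclass_target', 'orig_pattern', 'changed_pattern')
    return {key: [] for key in keys}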
Example #7
    def on_epoch_end(self, epoch, logs={}):
        correct = []
        y = []
        y_hat = []
        y_hat_dictionary = []

        counter = 0
        pbar = build_progressbar(self.n_samples)
        print('\n')
        g = self.generator.generate(exhaustive=True, train=False)
        n_failed = 0
        while True:
            pbar.update(counter)
            try:
                next_batch = next(g)
            except StopIteration:
                break

            assert isinstance(next_batch, dict)

            # The dictionary's predictions.  Get these first, so we can
            # skip any that the dictionary doesn't have suggestions for.
            # This is to ensure that the evaluation occurs on even ground.
            non_words = next_batch['non_word']
            correct_words = next_batch['correct_word']
            failed = []
            for i,non_word in enumerate(non_words):
                suggestions = self.dictionary[str(non_word)]
                try:
                    suggestion = suggestions[0]
                    target = self.target_map[suggestion]
                    if target is None:
                        raise ValueError('target is None for %s => %s' % (non_word, suggestion))
                    y_hat_dictionary.append(target)
                except IndexError:
                    # I don't know what to do if the dictionary doesn't
                    # offer any suggestions.
                    failed.append(True)
                except KeyError as e:
                    # Or if we don't have a target for the suggested replacement.
                    failed.append(True)

            if any(failed):
                n_failed += len(failed)
                continue

            # The gold standard.
            targets = next_batch[self.config.target_name]
            y.extend(np.argmax(targets, axis=1))

            # The model's predictions.
            pred = self.model.predict(next_batch, verbose=0)[self.config.target_name]
            y_hat.extend(np.argmax(pred, axis=1))

            counter += len(targets)
            #if counter >= self.n_samples:
            #    print('%d >= %d - stopping loop' % (counter, self.n_samples))
            #    break

        pbar.finish()

        self.config.logger('\n%d dictionary lookups failed; reporting results for %d examples\n' %
                    (n_failed, len(y)))

        self.config.logger('\n')
        self.config.logger('Dictionary')
        self.config.logger('accuracy %.04f F1 %0.4f' %
            (accuracy_score(y, y_hat_dictionary), f1_score(y, y_hat_dictionary, average='weighted')))

        self.config.logger('\n')
        self.config.logger('ConvNet')
        self.config.logger('accuracy %.04f F1 %0.4f\n' %
            (accuracy_score(y, y_hat), f1_score(y, y_hat, average='weighted')))
        self.config.logger('\n')
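What the two logger blocks report, in isolation: the dictionary baseline and the ConvNet are scored against the same gold labels y, so batches the dictionary cannot handle are excluded for both. A toy sketch with made-up class indices:

from sklearn.metrics import accuracy_score, f1_score

y = [3, 1, 2, 1]                      # gold class indices
y_hat_dictionary = [3, 2, 2, 1]       # dictionary's top suggestion per example
y_hat_convnet = [3, 1, 2, 2]          # model's argmax per example
for name, y_hat in [('Dictionary', y_hat_dictionary), ('ConvNet', y_hat_convnet)]:
    print('%s accuracy %.04f F1 %0.4f' %
          (name, accuracy_score(y, y_hat), f1_score(y, y_hat, average='weighted')))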
Example #8
File: jobs.py  Project: ndronen/spelling
    def run(self):
        errors = []
        pbar = build_progressbar(self.real_words)

        finder = EditFinder()

        for i,word in enumerate(self.real_words):
            pbar.update(i+1)

            # Find all the edits we can make to this word.
            possible_edits = list()
            probs = list()
            for subseq in subsequences(word):
                # Probably delete this if statement as redundant.
                for e in self.edit_db.edits(subseq):
                    _, error_subseq, count = e
                    possible_edit = (subseq, error_subseq)
                    if count > 0:
                        possible_edits.append(possible_edit)
                        probs.append(count)

            if len(possible_edits) == 0:
                continue

            probs = np.array(probs)
            probs = probs / float(probs.sum())

            seen_edits = set()
            errors_for_word = []
            attempts = 0.

            # Try to generate up to the requested number of errors per word.
            while True:
                try:
                    attempts += 1.

                    if self.enough_errors_for_word(word, errors_for_word):
                        # Generated enough errors for this word.
                        break
                    elif attempts > 10 and len(errors_for_word) / attempts < 0.1:
                        # Not finding many errors to apply.  Break out.
                        break

                    # Sample the number of edits.
                    edit_sizes = np.arange(1, self.max_edits_per_error+1)
                    edit_size_probs = 1. / edit_sizes
                    edit_size_probs /= edit_size_probs.sum()
                    size = self.random_state.choice(edit_sizes, size=1, replace=False,
                            p=edit_size_probs)[0]

                    # Sample edits with probability proportional to the edit's frequency.
                    edit_idx = self.random_state.choice(len(probs), size=size, replace=False, p=probs)

                    edit = []
                    for idx in edit_idx:
                        pe = possible_edits[idx]
                        if pe in seen_edits:
                            continue
                        seen_edits.add(pe)
                        edit.append(pe)

                    if len(edit) == 0:
                        continue
    
                    # Avoid applying edits that result in unlikely errors.
                    for constraint in self.constraints:
                        for e in edit:
                            if constraint(word, e):
                                raise EditConstraintError("can't apply edit %s=>%s to word '%s'" % \
                                        (e[0], e[1], word))

                    error = finder.apply(word, edit)
                    if error in self.blacklist:
                        # Skip blacklisted words (i.e. non-words in a corpus used to generate the
                        # edit patterns in the edit database).
                        continue

                    errors_for_word.append((word, len(possible_edits), edit, error))

                except EditConstraintError as e:
                    if self.verbose:
                        print(e)

            errors.extend(errors_for_word)

        pbar.finish()
    
        return errors
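The two sampling choices in the loop above, shown standalone: the number of edits k is drawn with probability proportional to 1/k, and individual edit patterns are drawn without replacement with probability proportional to their observed frequency. A sketch with hypothetical counts:

import numpy as np

random_state = np.random.RandomState(17)
max_edits_per_error = 3

# P(k edits) is proportional to 1/k, so single-edit errors dominate.
edit_sizes = np.arange(1, max_edits_per_error + 1)
edit_size_probs = 1. / edit_sizes
edit_size_probs /= edit_size_probs.sum()
size = random_state.choice(edit_sizes, size=1, replace=False, p=edit_size_probs)[0]

# P(edit) is proportional to how often that edit pattern was observed.
counts = np.array([10., 3., 1.])
probs = counts / counts.sum()
edit_idx = random_state.choice(len(probs), size=size, replace=False, p=probs)
print(size, edit_idx)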
Example #9
    def compute_metrics(self, generator, name, exhaustive, epoch, logs={}, do_callbacks=False):
        correct = []
        y = []
        y_hat = []
        y_hat_binary = []
        y_hat_dictionary = []
        y_hat_dictionary_binary = []
        counter = 0
        pbar = build_progressbar(self.n_samples)
        print('\n%s\n' % name)

        g = generator.generate(exhaustive=exhaustive)

        while True:
            pbar.update(counter)
            # Each call to next results in a batch of possible
            # corrections, only one of which is correct.
            try:
                next_batch = next(g)
            except StopIteration:
                break

            if isinstance(next_batch, (tuple, list)):
                d, sample_weight = next_batch
            else:
                assert isinstance(next_batch, dict)
                d = next_batch
                sample_weight = None

            targets = d[self.config.target_name]
            pred = self.model.predict(d, verbose=0)[self.config.target_name]

            y.extend(targets[:, 1].tolist())

            y_hat_tmp = [0] * len(targets)
            y_hat_tmp[np.argmax(pred[:, 1])] = 1
            y_hat.extend(y_hat_tmp)
            if targets[:, 1][np.argmax(pred[:, 1])] == 1:
                y_hat_binary.append(1)
            else:
                y_hat_binary.append(0)

            correct_word = d['correct_word'][0]

            y_hat_dictionary_tmp = []
            if d['candidate_word'][0] == correct_word:
                y_hat_dictionary_binary.append(1)
            else:
                y_hat_dictionary_binary.append(0)

            for i,c in enumerate(d['candidate_word']):
                # The first word in the results returned by the dictionary
                # is the dictionary's highest-scoring candidate for
                # replacing the non-word.
                if i == 0:
                    y_hat_dictionary_tmp.append(1)
                else:
                    y_hat_dictionary_tmp.append(0)
            y_hat_dictionary.extend(y_hat_dictionary_tmp)

            if len(y_hat_dictionary_tmp) != len(targets):
                raise ValueError('non_word %s correct_word %s dictlen %d targetslen %d' %
                        (d['non_word'][0], d['correct_word'][0],
                            len(y_hat_dictionary_tmp),
                            len(targets)))

            counter += 1
            if counter >= self.n_samples:
                break

        pbar.finish()

        self.config.logger('\n')
        self.config.logger('Dictionary %s binary accuracy %.04f accuracy %.04f F1 %0.4f' % 
                (
                    name,
                    sum(y_hat_dictionary_binary) / float(len(y_hat_dictionary_binary)),
                    accuracy_score(y, y_hat_dictionary),
                    f1_score(y, y_hat_dictionary)
                ))
        self.config.logger('Dictionary confusion matrix')
        self.config.logger(confusion_matrix(y, y_hat_dictionary))

        model_binary_accuracy = sum(y_hat_binary) / float(len(y_hat_binary))
        model_accuracy = accuracy_score(y, y_hat)
        model_f1 = f1_score(y, y_hat)

        self.config.logger('\n')
        self.config.logger('ConvNet %s binary accuracy %.04f accuracy %.04f F1 %0.4f' % 
                (name, model_binary_accuracy, model_accuracy, model_f1))
        self.config.logger('ConvNet confusion matrix')
        self.config.logger(confusion_matrix(y, y_hat))

        self.config.logger('\n')

        if do_callbacks:
            logs['f1'] = model_f1
            for cb in self.callbacks:
                cb.on_epoch_end(epoch, logs)
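A sketch of the per-batch prediction step used above: each batch holds every candidate correction for one non-word, and exactly one candidate, the one the model scores highest, is marked 1.

import numpy as np

pred = np.array([[0.9, 0.1],      # toy model outputs, column 1 = P(candidate is correct)
                 [0.2, 0.8],
                 [0.6, 0.4]])
y_hat_tmp = [0] * len(pred)
y_hat_tmp[np.argmax(pred[:, 1])] = 1
print(y_hat_tmp)                  # [0, 1, 0]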