def run(self):
    """
    Find the closest suggestion for every word in ``self.words``.

    For each word, ``self.suggest`` is asked for candidate replacements
    and ``self.distance`` scores each candidate against the word.  The
    candidate with the smallest distance is kept; on ties the candidate
    that ``self.suggest`` listed first wins.  A word with no candidates
    is recorded with an empty suggestion and a distance of 100.

    Returns
    -------
    pandas.DataFrame
        One row per word, with columns 'word', 'suggestion' and
        'distance'.
    """
    rows = []
    pbar = build_progressbar(self.words)
    for position, word in enumerate(self.words, 1):
        pbar.update(position)
        candidates = self.suggest(word)
        if len(candidates) == 0:
            # Placeholder row for words without any suggestion.
            rows.append((word, "", 100))
            continue
        scored = []
        for candidate in candidates:
            scored.append((word, candidate, self.distance(word, candidate)))
        # The sort is stable, so the earliest candidate wins a tie.
        scored.sort(key=operator.itemgetter(2))
        rows.append(scored[0])
    pbar.finish()
    return pd.DataFrame(data=rows, columns=['word', 'suggestion', 'distance'])
def run(self):
    """
    Split the input CSV into one output file per distinct word.

    Reads the tab-separated, UTF-8 file at ``self.input_csv`` and, for
    each distinct value of its 'word' column, writes the rows having
    that value to ``self.output_csv % k``, where ``k`` is the zero-based
    position of the value in the array returned by ``unique()``.
    Output files are tab-separated, UTF-8, and written without the index.
    """
    frame = pd.read_csv(self.input_csv, sep='\t', encoding='utf8')
    distinct_words = frame['word'].unique()
    pbar = build_progressbar(distinct_words)
    for k, value in enumerate(distinct_words):
        pbar.update(k + 1)
        matching_rows = frame[frame['word'] == value]
        matching_rows.to_csv(
            self.output_csv % k, sep='\t', index=False, encoding='utf8')
    pbar.finish()
def fit_cv(estimator, df_train, df_valid, feature_names, target_name, scale=False, correct_word_is_in_suggestions=False, random_state=17, verbose=True):
    """
    Fit an estimator and re-rank the validation suggestions by its scores.

    The estimator is fit on ``df_train`` and used to score every row of
    ``df_valid``.  Rows of ``df_valid`` are then grouped by
    (correct_word, error) and, within each group, the column
    'suggestion_index' is overwritten with the row's rank by predicted
    probability of class 1: the highest-scoring row gets 0, the next 1,
    and so on.

    Parameters
    ----------
    estimator : object
        Must provide ``fit(X, y)`` and ``predict_proba(X)``; column 1 of
        the predicted probabilities is used as the score.
    df_train, df_valid : pandas.DataFrame
        Must contain the columns in ``feature_names``; ``df_train`` must
        also contain ``target_name`` and ``df_valid`` the columns
        'correct_word' and 'error'.
    feature_names : list
        Columns used as model inputs.
    target_name : str
        Column of ``df_train`` used as the training target.
    scale : bool
        If true, standardize the features with a ``StandardScaler`` fit
        on the training features.
    correct_word_is_in_suggestions, random_state
        Accepted for interface compatibility; not used by this function.
    verbose : bool
        If true, print the set sizes and the fitted estimator.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_valid`` with 'suggestion_index' replaced; the
        caller's frame is left untouched.
    """
    df_valid = df_valid.copy()

    X_train = df_train[feature_names]
    X_valid = df_valid[feature_names]
    y_train = df_train[target_name]

    if verbose:
        print('train %d valid %d' % (len(X_train), len(X_valid)))

    if scale:
        # Only build the scaler when it is actually going to be used.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_valid = scaler.transform(X_valid)

    estimator.fit(X_train, y_train)

    if verbose:
        print(estimator)

    # Some errors might not be unique for a given correct word
    # (i.e. ther -> their, ther -> there), so when getting the
    # validation set predictions, partition the examples by
    # correct word and error.
    correct_words = df_valid.correct_word.unique()
    pbar = build_progressbar(correct_words)

    positive_proba = estimator.predict_proba(X_valid)[:, 1]

    for i, correct_word in enumerate(correct_words):
        pbar.update(i+1)
        cw_mask = df_valid.correct_word == correct_word
        for error in df_valid[cw_mask].error.unique():
            mask = (cw_mask & (df_valid.error == error)).values
            group_proba = positive_proba[mask]
            n_group = len(group_proba)

            # argsort is ascending, so pairing it with a descending
            # range gives rank 0 to the most probable suggestion.
            ascending = np.argsort(group_proba)
            # Ranks are always integers, whatever the dtype of the
            # target column; argsort is a permutation, so every slot
            # of the uninitialized array is written.
            suggestion_index = np.empty(n_group, dtype=int)
            suggestion_index[ascending] = np.arange(n_group - 1, -1, -1)

            df_valid.loc[mask, 'suggestion_index'] = suggestion_index

    pbar.finish()

    return df_valid
def add_features_from_vectorizer(df, vectorizer, column, feature_name_prefix=None):
    """
    Add one column to ``df`` per term in a fitted vectorizer's vocabulary.

    ``df[column]`` is transformed by ``vectorizer`` and the result is
    densified.  Each column of the dense matrix is then stored in ``df``
    (in place) under the name ``feature_name_prefix + term``, where the
    terms are ordered by their index in ``vectorizer.vocabulary_``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read ``column`` from and to add the features to.
    vectorizer : object
        Must provide ``transform`` and a ``vocabulary_`` mapping of
        term to column index.
    column : str
        Name of the column to vectorize.
    feature_name_prefix : str or None
        Prefix for the new column names; defaults to ``column + '_'``.

    Returns
    -------
    list
        The names of the added columns, in column-index order.
    """
    if feature_name_prefix is None:
        prefix = column + '_'
    else:
        prefix = feature_name_prefix

    transformed = vectorizer.transform(df[column])
    dense = np.array(transformed.todense())

    # Order the vocabulary terms by the matrix column they map to.
    by_index = sorted(vectorizer.vocabulary_.items(), key=operator.itemgetter(1))
    feature_names = [prefix + term for term, _ in by_index]

    pbar = build_progressbar(feature_names)
    for col, name in enumerate(feature_names):
        pbar.update(col + 1)
        df.loc[:, name] = dense[:, col]
    pbar.finish()

    return feature_names
def run(self):
    """
    Generate a corpus of typos for the words in ``self.words``.

    For every word and every distance in ``self.distances``, candidates
    from ``typo_generator(word, distance)`` that differ from the word
    and are accepted by ``self.sample(word, typo, distance)`` are
    collected until ``self.max_examples_per_word`` distinct typos have
    been kept or the generator is exhausted.

    Returns
    -------
    pandas.DataFrame
        One row per kept typo, with columns 'word', 'typo' and
        'distance'.
    """
    rows = []
    pbar = build_progressbar(self.words)
    for n_done, word in enumerate(self.words, 1):
        pbar.update(n_done)
        for distance in self.distances:
            # A set, because typo_generator doesn't guarantee
            # uniqueness.
            kept = set()
            for typo in typo_generator(word, distance):
                if typo != word and self.sample(word, typo, distance):
                    kept.add(typo)
                    if len(kept) == self.max_examples_per_word:
                        break
            rows.extend((word, typo, distance) for typo in kept)
    pbar.finish()

    print("generated %d errors for %d words" % (len(rows), len(self.words)))

    return pd.DataFrame(data=rows, columns=['word', 'typo', 'distance'])
def build_operation_corpus(distance, operation, words, n=3, random_state=17):
    """
    Build a corpus of edited words by repeatedly applying one operation.

    Starting from each word, ``distance`` rounds of edits are made.  In
    a round, ``Editor().edit(source, operation)`` is applied to every
    string kept from the previous round; the word itself and any string
    found in ``words`` are discarded, and at most ``n`` of the remaining
    strings are sampled without replacement for the next round.  If a
    round yields nothing, the strings from the previous round are kept
    and the rounds stop early.

    Parameters
    ----------
    distance : int
        Number of edit rounds; also stored in the corpus.
    operation : object
        Passed through to ``Editor.edit``; also stored in the corpus.
    words : sequence
        The real words to edit.
    n : int
        Maximum number of edits kept per round.
    random_state : int or numpy.random.RandomState
        An int is used as the seed of a new ``RandomState``.

    Returns
    -------
    The object returned by ``init_corpus()`` (presumably a dict of
    lists -- confirm against its definition), with one entry appended
    per kept edit under 'word', 'marked_word', 'real_word',
    'binary_target', 'multiclass_target', 'orig_pattern' and
    'changed_pattern', plus 'distance' and 'operation' lists of the
    same length as 'word'.
    """
    if isinstance(random_state, int):
        random_state = np.random.RandomState(seed=random_state)

    editor = Editor()
    edit_finder = EditFinder()
    pbar = build_progressbar(words)
    corpus = init_corpus()
    real_words = set(words)

    for position, real_word in enumerate(words, 1):
        pbar.update(position)

        frontier = set([real_word])
        for _ in range(distance):
            candidates = set()
            for source in frontier:
                candidates.update(editor.edit(source, operation))

            # Remove the word itself from the candidates.
            candidates.discard(real_word)

            # Remove real words from the candidates.
            for candidate in list(candidates):
                if candidate in real_words:
                    candidates.remove(candidate)

            # Stop if no new edits can be made; the previous round's
            # strings stay in `frontier`.
            if not candidates:
                break

            sample_size = min(n, len(candidates))
            frontier = random_state.choice(
                list(candidates), size=sample_size, replace=False)

        try:
            frontier = random_state.choice(list(frontier), size=n, replace=False)
        except ValueError:
            # Fewer than n strings are available; keep them all.
            pass

        for edit in frontier:
            corpus['word'].append(unicode(edit))
            # Use start-of-word and end-of-word markers as in
            # http://arxiv.org/abs/1602.02410.
            corpus['marked_word'].append('^' + edit + '$')
            corpus['real_word'].append(real_word)
            corpus['binary_target'].append(0)
            corpus['multiclass_target'].append(0)

            pairs = list(edit_finder.find(real_word, edit))
            corpus['orig_pattern'].append(
                '-'.join([before for before, _ in pairs]))
            corpus['changed_pattern'].append(
                '-'.join([after for _, after in pairs]))

    pbar.finish()

    n_rows = len(corpus['word'])
    corpus['distance'] = [distance] * n_rows
    corpus['operation'] = [operation] * n_rows

    return corpus
def on_epoch_end(self, epoch, logs=None):
    """
    Compare the model with the dictionary baseline at the end of an epoch.

    Batches are drawn from ``self.generator.generate(exhaustive=True,
    train=False)`` until it is exhausted.  For each batch, the first
    suggestion found in ``self.dictionary`` for every non-word is mapped
    to a target through ``self.target_map``.  A batch in which any
    lookup fails is skipped entirely -- for the dictionary, the gold
    standard and the model alike -- so that both systems are evaluated
    on exactly the same examples.  Accuracy and weighted F1 of both
    systems are then reported through ``self.config.logger``.

    Parameters
    ----------
    epoch : int
        Not used by this method.
    logs : dict or None
        Not used by this method.

    Raises
    ------
    ValueError
        If ``self.target_map`` holds ``None`` for a suggestion.
    """
    y = []
    y_hat = []
    y_hat_dictionary = []
    n_failed = 0
    counter = 0

    pbar = build_progressbar(self.n_samples)
    print('\n')

    g = self.generator.generate(exhaustive=True, train=False)

    while True:
        pbar.update(counter)

        try:
            next_batch = next(g)
        except StopIteration:
            break

        assert isinstance(next_batch, dict)

        # The dictionary's predictions.  Get these first, so we can
        # skip any batch the dictionary can't fully answer.  They are
        # gathered in a per-batch list and only committed once the
        # whole batch has succeeded; otherwise the dictionary's
        # predictions would drift out of step with y and y_hat.
        batch_dictionary_targets = []
        n_batch_failed = 0
        for non_word in next_batch['non_word']:
            suggestions = self.dictionary[str(non_word)]
            try:
                suggestion = suggestions[0]
                target = self.target_map[suggestion]
            except (IndexError, KeyError):
                # Either the dictionary offered no suggestion, or there
                # is no target for the suggested replacement.
                n_batch_failed += 1
                continue
            if target is None:
                raise ValueError('target is None for %s => %s' % (non_word, suggestion))
            batch_dictionary_targets.append(target)

        if n_batch_failed:
            n_failed += n_batch_failed
            continue

        y_hat_dictionary.extend(batch_dictionary_targets)

        # The gold standard.  One entry per example, so the three lists
        # stay the same length whatever the batch size.
        targets = next_batch[self.config.target_name]
        y.extend(np.argmax(targets, axis=1).tolist())

        # The model's predictions.
        pred = self.model.predict(next_batch, verbose=0)[self.config.target_name]
        y_hat.extend(np.argmax(pred, axis=1).tolist())

        counter += len(targets)

    pbar.finish()

    self.config.logger('\n%d dictionary lookups failed reporting results for %d examples\n' % (n_failed, len(y)))

    self.config.logger('\n')
    self.config.logger('Dictionary')
    self.config.logger('accuracy %.04f F1 %0.4f' % (
        accuracy_score(y, y_hat_dictionary),
        f1_score(y, y_hat_dictionary, average='weighted')))

    self.config.logger('\n')
    self.config.logger('ConvNet')
    self.config.logger('accuracy %.04f F1 %0.4f\n' % (
        accuracy_score(y, y_hat),
        f1_score(y, y_hat, average='weighted')))

    self.config.logger('\n')
def run(self):
    """
    Generate spelling errors for the words in ``self.real_words``.

    For each word, every (subsequence, replacement) edit that
    ``self.edit_db`` reports with a positive count is a candidate.
    Errors are then generated in a retry loop: a number of edits is
    sampled (smaller numbers are more likely), that many candidate
    edits are sampled with probability proportional to their counts,
    edits already used for this word are dropped, and the rest are
    applied with ``EditFinder.apply``.  An attempt is discarded if any
    constraint in ``self.constraints`` rejects one of its edits or if
    the resulting error is in ``self.blacklist``.  The loop ends when
    ``self.enough_errors_for_word`` says so, or when, after more than
    ten attempts, fewer than one attempt in ten has produced an error.

    Returns
    -------
    list of tuple
        ``(word, number of candidate edits, applied edits, error)`` for
        every generated error.
    """
    errors = []
    pbar = build_progressbar(self.real_words)
    finder = EditFinder()

    for i, word in enumerate(self.real_words):
        pbar.update(i+1)

        # Find all the edits we can make to this word.
        possible_edits = []
        counts = []
        for subseq in subsequences(word):
            for _, error_subseq, count in self.edit_db.edits(subseq):
                # The original author suspected this check is redundant.
                if count > 0:
                    possible_edits.append((subseq, error_subseq))
                    counts.append(count)

        if len(possible_edits) == 0:
            continue

        probs = np.array(counts)
        probs = probs / float(probs.sum())

        # Distribution over the number of edits per error.  It does not
        # change between attempts, so build it once per word.  The
        # maximum is capped at the number of candidate edits, because
        # sampling more items than exist without replacement raises
        # ValueError.
        max_edits = min(self.max_edits_per_error, len(possible_edits))
        edit_sizes = np.arange(1, max_edits+1)
        edit_size_probs = 1. / edit_sizes
        edit_size_probs /= edit_size_probs.sum()

        seen_edits = set()
        errors_for_word = []
        attempts = 0.

        # Try to generate up to the requested number of errors per word.
        while True:
            try:
                attempts += 1.

                if self.enough_errors_for_word(word, errors_for_word):
                    # Generated enough errors for this word.
                    break
                elif attempts > 10 and len(errors_for_word) / attempts < 0.1:
                    # Not finding many errors to apply.  Break out.
                    break

                # Sample the number of edits.
                size = self.random_state.choice(
                    edit_sizes, size=1, replace=False, p=edit_size_probs)[0]

                # Sample edits with probability proportional to the
                # edit's frequency.
                edit_idx = self.random_state.choice(
                    len(probs), size=size, replace=False, p=probs)

                edit = []
                for idx in edit_idx:
                    pe = possible_edits[idx]
                    if pe in seen_edits:
                        continue
                    seen_edits.add(pe)
                    edit.append(pe)

                if len(edit) == 0:
                    continue

                # Avoid applying edits that result in unlikely errors.
                for constraint in self.constraints:
                    for pe in edit:
                        if constraint(word, pe):
                            raise EditConstraintError("can't apply edit %s=>%s to word '%s'" % \
                                    (pe[0], pe[1], word))

                error = finder.apply(word, edit)
                if error in self.blacklist:
                    # Skip blacklisted words (i.e. non-words in a corpus
                    # used to generate the edit patterns in the edit
                    # database).
                    continue

                errors_for_word.append((word, len(possible_edits), edit, error))
            except EditConstraintError as e:
                # A rejected attempt still counts towards `attempts`.
                if self.verbose:
                    print(e)

        errors.extend(errors_for_word)

    pbar.finish()

    return errors
def compute_metrics(self, generator, name, exhaustive, epoch, logs=None, do_callbacks=False):
    """
    Evaluate the model and the dictionary baseline on ranked candidates.

    Each batch from ``generator.generate(exhaustive=exhaustive)`` holds
    the candidate corrections for one non-word, only one of which is
    correct.  The model's choice is the candidate with the highest
    value in column 1 of its prediction; the dictionary's choice is the
    first candidate.  At most ``self.n_samples`` batches are evaluated.
    Binary accuracy (was the chosen candidate the correct one),
    per-candidate accuracy, F1 and confusion matrices are reported
    through ``self.config.logger``.

    Parameters
    ----------
    generator : object
        Must provide ``generate(exhaustive=...)`` yielding either a
        dict or a ``(dict, sample_weight)`` pair.
    name : str
        Label used in the printed and logged output.
    exhaustive : bool
        Passed through to ``generator.generate``.
    epoch : int
        Passed to the callbacks when ``do_callbacks`` is true.
    logs : dict or None
        When ``do_callbacks`` is true, the model's F1 is stored in it
        under 'f1' before it is handed to the callbacks.  A fresh dict
        is created when omitted, so results never leak between calls.
    do_callbacks : bool
        If true, call ``on_epoch_end(epoch, logs)`` on every callback
        in ``self.callbacks``.

    Raises
    ------
    ValueError
        If the number of candidate words in a batch differs from the
        number of targets.
    ZeroDivisionError
        If no batch was evaluated.
    """
    if logs is None:
        logs = {}

    y = []
    y_hat = []
    y_hat_binary = []
    y_hat_dictionary = []
    y_hat_dictionary_binary = []
    counter = 0

    pbar = build_progressbar(self.n_samples)
    print('\n%s\n' % name)

    g = generator.generate(exhaustive=exhaustive)

    while True:
        pbar.update(counter)

        # Each call to next results in a batch of possible
        # corrections, only one of which is correct.
        try:
            next_batch = next(g)
        except StopIteration:
            break

        if isinstance(next_batch, (tuple, list)):
            # The sample weights are not needed for evaluation.
            d, _ = next_batch
        else:
            assert isinstance(next_batch, dict)
            d = next_batch

        targets = d[self.config.target_name]
        pred = self.model.predict(d, verbose=0)[self.config.target_name]

        is_correct = targets[:, 1]
        model_choice = np.argmax(pred[:, 1])

        y.extend(is_correct.tolist())

        y_hat_tmp = [0] * len(targets)
        y_hat_tmp[model_choice] = 1
        y_hat.extend(y_hat_tmp)

        y_hat_binary.append(1 if is_correct[model_choice] == 1 else 0)

        candidates = d['candidate_word']
        correct_word = d['correct_word'][0]

        # The first word in the results returned by the dictionary
        # is the dictionary's highest-scoring candidate for
        # replacing the non-word.
        y_hat_dictionary_binary.append(1 if candidates[0] == correct_word else 0)
        y_hat_dictionary_tmp = [1] + [0] * (len(candidates) - 1)

        if len(y_hat_dictionary_tmp) != len(targets):
            raise ValueError('non_word %s correct_word %s dictlen %d targetslen %d' % (d['non_word'][0], d['correct_word'][0], len(y_hat_dictionary_tmp), len(targets)))

        y_hat_dictionary.extend(y_hat_dictionary_tmp)

        counter += 1
        if counter >= self.n_samples:
            break

    pbar.finish()

    self.config.logger('\n')
    self.config.logger('Dictionary %s binary accuracy %.04f accuracy %.04f F1 %0.4f' % (
        name,
        sum(y_hat_dictionary_binary) / float(len(y_hat_dictionary_binary)),
        accuracy_score(y, y_hat_dictionary),
        f1_score(y, y_hat_dictionary)
    ))
    self.config.logger('Dictionary confusion matrix')
    self.config.logger(confusion_matrix(y, y_hat_dictionary))

    model_binary_accuracy = sum(y_hat_binary) / float(len(y_hat_binary))
    model_accuracy = accuracy_score(y, y_hat)
    model_f1 = f1_score(y, y_hat)

    self.config.logger('\n')
    self.config.logger('ConvNet %s binary accuracy %.04f accuracy %.04f F1 %0.4f' % (name, model_binary_accuracy, model_accuracy, model_f1))
    self.config.logger('ConvNet confusion matrix')
    self.config.logger(confusion_matrix(y, y_hat))

    self.config.logger('\n')

    if do_callbacks:
        logs['f1'] = model_f1
        for cb in self.callbacks:
            cb.on_epoch_end(epoch, logs)