def add_hunspell_pipe(model):
    """Attach a spaCyHunSpell spell-checking pipe to *model*, once.

    Chooses the platform dictionary ('linux'/'mac'); on other platforms
    (Windows) it tries explicit dictionary paths and falls back to a
    passthrough pipe when the files cannot be located.

    Args:
        model: a loaded spaCy ``Language`` pipeline.

    Returns:
        The same ``model``, with the hunspell pipe added, or unchanged if
        the extension/pipe was already registered.
    """
    # BUGFIX: the original wrapped Token.get_extension in try/except
    # ValueError, but get_extension never raises — it returns the extension
    # tuple or None — so the guard logged "already has" and returned early
    # on *every* call, and the pipe was never added. has_extension is the
    # documented boolean check.
    if spacy.tokens.Token.has_extension('hunspell_spell'):
        # Plain string: the original f-string had no placeholders.
        log.warning('SpaCy Token already has a hunspell Pipe section . . .')
        return model
    if sys.platform in ('linux', 'linux2'):
        hunspell = spaCyHunSpell(model, 'linux')
    elif sys.platform == 'darwin':
        hunspell = spaCyHunSpell(model, 'mac')
    else:  # sys.platform == 'win32':
        try:
            # TODO determine paths for en_US.dic and en_US.aff on windows
            hunspell = spaCyHunSpell(model, ('en_US.dic', 'en_US.aff'))
        except Exception:
            log.warning(
                'Failed to locate en_US.dic and en_US.aff files. Substituting with fake . . .'
            )
            hunspell = passthroughSpaCyPipe()
    try:
        model.add_pipe(hunspell)
    except ValueError:
        # add_pipe raises ValueError when a component with this name exists.
        log.warning(
            f'SpaCy parser {model} already has a hunspell Pipe section...')
    return model
def add_hunspell_pipe(model):
    """Attach a spaCyHunSpell spell-checking pipe to *model*.

    Chooses the platform dictionary ('linux'/'mac'); on other platforms
    (Windows) it tries explicit dictionary paths and falls back to a
    passthrough pipe when the files cannot be located.

    Args:
        model: a loaded spaCy ``Language`` pipeline.

    Returns:
        The same ``model`` with the hunspell pipe appended.
    """
    if sys.platform in ('linux', 'linux2'):
        hunspell = spaCyHunSpell(model, 'linux')
    elif sys.platform == 'darwin':
        hunspell = spaCyHunSpell(model, 'mac')
    else:  # sys.platform == 'win32':
        try:
            # TODO determine paths for en_US.dic and en_US.aff on windows
            hunspell = spaCyHunSpell(model, ('en_US.dic', 'en_US.aff'))
        except Exception:
            # BUGFIX: Logger.warn is deprecated since Python 3.3; use
            # Logger.warning.
            log.warning(
                'Failed to locate en_US.dic and en_US.aff files. Substituting with fake . . .'
            )
            hunspell = passthroughSpaCyPipe()
    try:
        # BUGFIX: add_pipe raises ValueError if the component is already
        # present; the original crashed on a second call with the same model.
        model.add_pipe(hunspell)
    except ValueError:
        log.warning('Model already has a hunspell Pipe section . . .')
    return model
def __init__(self) -> None:
    """Load the spaCy pipeline and initialise spell-check/date state.

    Pipe order matters: hunspell is added first, then the instance's
    ``merge_entities`` component (registered under the name
    'merge_entities').
    """
    # 'en' shortcut model — assumes the link/model is installed locally.
    self.nlp = spacy.load('en')
    # Explicit dictionary paths relative to the project root — TODO confirm
    # the working directory when this class is constructed.
    self.hunspell = spaCyHunSpell(
        self.nlp,
        ('./src/hunspell/en_US.dic', './src/hunspell/en_US.aff'))
    self.nlp.add_pipe(self.hunspell)
    self.nlp.add_pipe(self.merge_entities, name='merge_entities')
    # Characters ignored during spell checking.
    self.exclude = set(string.punctuation)
    # Accumulators for date classification results; populated elsewhere.
    self.date_pos = []
    self.date_neg = []
    self.relation_date = []
def spell_check(query, indices_to_ignore):
    """Return *query* with misspelled tokens replaced by their best suggestion.

    The best suggestion is the hunspell candidate whose vocabulary vector is
    most similar to the misspelled token. Tokens whose positions appear in
    *indices_to_ignore* are left untouched.

    Args:
        query: the input text to correct.
        indices_to_ignore: token indices that must not be altered.

    Returns:
        The corrected text, tokens joined by single spaces.
    """
    # BUGFIX: the original added the pipe unconditionally, so a second call
    # raised ValueError ("'hunspell' already exists in pipeline").
    if 'hunspell' not in nlp.pipe_names:
        hunspell = spaCyHunSpell(nlp, 'linux')
        nlp.add_pipe(hunspell, name="hunspell")
    doc = nlp(query)
    original_data = [token.text for token in doc]
    for i, data in enumerate(doc):
        if i in indices_to_ignore:
            continue
        if data._.hunspell_spell:
            continue
        suggestions = data._.hunspell_suggest
        # BUGFIX: guard against an empty suggestion list — max() on an
        # empty sequence raised ValueError in the original.
        if not suggestions:
            continue
        suggestions_vocab = [nlp.vocab[suggestion] for suggestion in suggestions]
        result = [data.similarity(ind) for ind in suggestions_vocab]
        # Pick the suggestion with the highest vector similarity.
        max_word_index = result.index(max(result))
        original_data[i] = suggestions[max_word_index]
    response = " ".join(original_data)
    return response
def spellCheck():
    """HTTP handler: spell-correct the 'query' request argument.

    Replaces each misspelled token with the hunspell suggestion whose
    vocabulary vector is most similar to the token.

    Returns:
        A JSON string ``{"data": <corrected text>}``.
    """
    # NOTE(review): request.args.get returns None when 'query' is absent,
    # which makes larger_nlp(None) raise — confirm callers always send it.
    query = request.args.get('query')
    # BUGFIX: the original added the pipe unconditionally, so the second
    # request raised ValueError from add_pipe. spaCyHunSpell registers
    # under the name 'hunspell'; we pass it explicitly to be safe.
    if 'hunspell' not in larger_nlp.pipe_names:
        hunspell = spaCyHunSpell(larger_nlp, 'linux')
        larger_nlp.add_pipe(hunspell, name='hunspell')
    doc = larger_nlp(query)
    original_data = [token.text for token in doc]
    for i, data in enumerate(doc):
        if data._.hunspell_spell:
            continue
        suggestions = data._.hunspell_suggest
        # BUGFIX: an empty suggestion list made max() raise ValueError.
        if not suggestions:
            continue
        suggestions_vocab = [
            larger_nlp.vocab[suggestion] for suggestion in suggestions
        ]
        result = [data.similarity(ind) for ind in suggestions_vocab]
        max_word_index = result.index(max(result))
        original_data[i] = suggestions[max_word_index]
    response = " ".join(original_data)
    return json.dumps({"data": response})
import spacy
from spacy_hunspell import spaCyHunSpell


def convertToWordVectors(suggestions):
    """Look up each suggested spelling in the model vocabulary."""
    return [nlp.vocab[s] for s in suggestions]


nlp = spacy.load('en_core_web_lg')
nlp.add_pipe(spaCyHunSpell(nlp, 'linux'))

doc = nlp('hwo to read data')
print(doc)

# Start from the raw token texts, then patch in corrections.
original_data = [tok.text for tok in doc]
for idx, tok in enumerate(doc):
    if tok._.hunspell_spell:
        continue
    candidates = tok._.hunspell_suggest
    # Score every candidate by vector similarity to the misspelled token
    # and keep the best one.
    scores = [tok.similarity(vec) for vec in convertToWordVectors(candidates)]
    original_data[idx] = candidates[scores.index(max(scores))]

print(" ".join(original_data))
# -*- coding: utf-8 -*- from __future__ import unicode_literals import os if __name__ == '__main__': # -- spacy pipeline test import spacy from spacy_hunspell import spaCyHunSpell nlp = spacy.load('en_core_web_sm') hunspell = spaCyHunSpell(nlp) nlp.add_pipe(hunspell) # cheeseburger test doc = nlp('I can haz cheezeburger.') haz = doc[2] assert haz._.hunspell_spell == False assert haz._.hunspell_suggest[:5] == \ ['ha', 'haze', 'hazy', 'has', 'hat'] # unicode test doc = nlp('I have a \u2033 bottle') doc = nlp('עקבת אחריו בכל רחבי המדינה.') doc = nlp(' ł (for example) ') doc = nlp('He said \u201CHello\u201D')