def tense_of_word(word):
    tense = None
    tag = word.tag
    word = word.string
    # Use the top spelling suggestion if the word is misspelled.
    word = word if suggest(word)[0][1] == 1.0 else suggest(word)[0][0]
    # Check the defined indicator words first.
    tense = FUTURE if word in rules.FUTURE_INDICATORS else tense
    tense = PAST if word in rules.PAST_INDICATORS else tense
    if tense is not None:
        return tense
    # Check the pre-defined POS-tag rules.
    tense = PAST if tag in rules.PAST else tense
    tense = PRESENT if tag in rules.PRESENT else tense
    tense = FUTURE if tag in rules.FUTURE else tense
    if tense is not None:
        return tense
    # Now check the tense according to pattern.en.
    tense = PAST if tense_of_word_h(word, PAST) and not tense_of_word_h(word, PRESENT) and not tense_of_word_h(word, FUTURE) else tense
    tense = PRESENT if not tense_of_word_h(word, PAST) and tense_of_word_h(word, PRESENT) and not tense_of_word_h(word, FUTURE) else tense
    tense = FUTURE if not tense_of_word_h(word, PAST) and not tense_of_word_h(word, PRESENT) and tense_of_word_h(word, FUTURE) else tense
    return tense
def test_spelling(self):
    # Assert case-sensitivity + numbers.
    for a, b in (
            (".", "."),
            ("?", "?"),
            ("!", "!"),
            ("I", "I"),
            ("a", "a"),
            ("42", "42"),
            ("3.14", "3.14"),
            ("The", "The"),
            ("the", "the")):
        self.assertEqual(en.suggest(a)[0][0], b)
    # Assert spelling suggestion accuracy.
    # Note: simply training on more text will not improve accuracy.
    i = j = 0.0
    from pattern.db import Datasheet
    for correct, wrong in Datasheet.load(os.path.join(PATH, "corpora", "spelling-birkbeck.csv")):
        for w in wrong.split(" "):
            if en.suggest(w)[0][0] == correct:
                i += 1
            else:
                j += 1
    self.assertTrue(i / (i + j) > 0.70)
    print("pattern.en.suggest()")
def spelling_correct(token_list):
    res_correct = []
    for word in token_list:
        if not check_english(word):
            # Pick the suggestion with the highest confidence.
            suggestions = dict(suggest(word))
            word_correct = max(suggestions, key=suggestions.get)
            res_correct.append(word_correct)
        else:
            res_correct.append(word)
    return res_correct
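# check_english() is external to the snippet above; the stand-in below is a minimal,
# hypothetical sketch (an assumption, not the original helper) that treats a token as
# correctly spelled when pattern's top suggestion is the token itself with full confidence.
def check_english(word):
    # suggest() returns (candidate, confidence) pairs; a known word comes back unchanged with 1.0.
    candidate, confidence = suggest(word)[0]
    return candidate == word and confidence == 1.0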
def spell():
    if request.method == 'POST':
        # This block is only entered when the form is submitted.
        word = request.form.get('word')
        return jsonify({'data': json.dumps(suggest(word))})
    return '''<form method="POST">
def data_preprocessing(X):
    ''' text pre-processing '''
    # convert all characters to lower case
    X = [x.lower() for x in X]
    # remove characters outside ASCII from the text
    X = [x.encode("ascii", "ignore") for x in X]
    X = [x.decode() for x in X]
    # remove the meaningless "_U" in the text
    X = [re.sub('_u', ' ', x) for x in X]
    # replace @username mentions with "username"
    X = [re.sub('@\w+', 'username', x) for x in X]
    # remove website links
    X = [re.sub(r'(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?', '', x + ' ') for x in X]
    # remove symbols
    X = [re.sub('[/(){}\[\]\|@,;]', ' ', x) for x in X]
    X = [re.sub('[^0-9a-z ]', ' ', x) for x in X]
    # consolidate multiple spaces
    X = [re.sub(' +', ' ', x) for x in X]
    # spell correction
    for i, x in enumerate(X):
        print("[INFO] this is {}/{} tweet!".format(i, len(X)))
        words = x.split()
        for j, word in enumerate(words):
            if word not in DRUGLIST:
                word = reduce_lengthening(word)
                try:
                    suggestion = suggest(word)[0]
                except:
                    suggestion = suggest(word)[0]
                if suggestion[1] > 0.8:
                    words[j] = suggestion[0]
                else:
                    # do not change words with low confidence
                    pass
                # print(word, suggestion)
            else:
                # replace drug names with the placeholder "drugname"
                words[j] = 'drugname'
        X[i] = ' '.join(words)
    # remove stop words
    STOPWORDS = set(stopwords.words('english'))
    for i, x in enumerate(X):
        X[i] = ' '.join([word for word in x.split() if word not in STOPWORDS])
    return X
def get_english_suggestions(word):
    suggestions = []
    for cur in suggest(word):
        cur_word = cur[0]
        if cur[1] != 0:
            suggestions.append(cur_word)
    if len(suggestions) < 5:
        return suggestions
    else:
        return suggestions[0:5]
def test_spelling(self):
    # Assert spelling suggestion accuracy.
    # Note: simply training on more text will not improve accuracy.
    i = j = 0.0
    from pattern.db import Datasheet
    for correct, wrong in Datasheet.load(os.path.join(PATH, "corpora", "spelling-birkbeck.csv")):
        for w in wrong.split(" "):
            if en.suggest(w)[0][0] == correct:
                i += 1
            else:
                j += 1
    self.assertTrue(i / (i + j) > 0.70)
    print "pattern.en.suggest()"
def apply_spell_correction(self, x):
    """
    Collapse characters repeated three or more times to two, then replace each
    token with its top spelling suggestion.
    :param x: input string
    :return: spell-corrected string
    """
    try:
        pattern = re.compile(r"(.)\1{2,}")
        list_of_elem = x.split(" ")
        clean_x = [pattern.sub(r"\1\1", i) for i in list_of_elem]
        suggest_val = [suggest(i)[0][0] for i in clean_x]
        return " ".join(suggest_val)
    except Exception as e:
        logger.error(e)
def checkSpellingError(text, nlp, correctFlag=False):
    '''
    Purpose: To check if text has errors due to wrong spellings.
             Additionally, it returns the corrected sentence.
    Parameters:
        text: string
            A string of text - a single sentence or a paragraph.
        correctFlag: boolean
            True or False
    Returns:
        count: integer
        text: Corrected sentence. (If correctFlag is True)
    '''
    doc = nlp(text)
    count = 0
    text = ""
    for sen in doc.sentences:
        for word in sen.words:
            # print(word.text.lower())
            l = ["'s", "n't", "'ll"]
            try:
                sugList = suggest(word.text.lower())
            except:
                sugList = []
                l.append(word.text.lower())
            for k in sugList:
                l.append(k[0])
            if (word.text.lower() in l) or (word.lemma in l):
                text += word.text
                text += " "
                continue
            else:
                count += 1
                text += sugList[0][0]
                text += " "
    if correctFlag == True:
        return count, text
    else:
        return count
def spellCheck(uIn):
    uIn = shortenWords(uIn)
    unchecked = uIn.split(' ')
    checked = ""
    end = ""
    # preserves punctuation at end of user's input
    if any(p in uIn[-1:] for p in punctuation):
        end = uIn[-1:]
    first = True  # check for first run to prevent leading space
    for w in unchecked:
        suggestion = suggest(w)
        word = suggestion[0][0]
        for i in suggestion:
            if (w == i[0] or w[0].isupper()):
                word = w
        if not first:
            checked += ' '
        else:
            first = False
        checked += word
    return checked + end
    for word in wrong:
        print bcolors.Red + " " + word + " :: " + subprocess.check_output(["espeak", "-q", "--ipa", '-v', 'en-us', word]).decode('utf-8') + bcolors.White
    print bcolors.White + "Completed spell learning"
    sys.exit()

if (sup_spellbee):
    for word in word_list.split():
        if len(wn.synsets(word)) != 0:
            tcount = 0
            learnt = 0
            entity = (word, tcount)
            wrong.append(entity)
        else:
            print suggest(word)
    rspellbee()
    correct = sorted(correct, key=lambda x: x[1], reverse=True)
    for entity in correct:
        if (entity[1] != 0):
            print "%20s : %d" % (entity[0], entity[1])
    print bcolors.White + "Completed spell bee"
    sys.exit()

for word in word_list.split():
    if len(wn.synsets(word)) != 0:
        # rlemma = l.lemmatize(word)
        iterator = iterator + 1
        if (len(sys.argv) == 5):
            if (iterator < int(sys.argv[3])):
                continue
def sp(text):
    print("sp")
    sp_chk.append(suggest(text))
    return ()
print 'p' in tenses('purred')  # By alias.
print PAST in tenses('purred')
print (PAST, 1, PL) in tenses('purred')

# rule-based conjugation
print 'google' in verbs.infinitives
print 'googled' in verbs.inflections
print conjugate('googled', tense=PARTICIPLE, parse=False)
print conjugate('googled', tense=PARTICIPLE, parse=True)

# quantification
print number("seventy-five point two")  # "seventy-five point two" => 75.2
print numerals(2.245, round=2)          # 2.245 => "two point twenty-five"
print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken'])
print quantify({'carrot': 100, 'parrot': 20})
print quantify('carrot', amount=1000)

# spelling
print suggest("parot")

# n-grams
print ngrams("I am eating pizza.", n=2)  # bigrams
print ngrams("I am eating pizza.", n=3, punctuation=".,;:!?()[]{}`''\"@#$^&*+-|=~_", continuous=False)

# parser
print parse('I eat pizza with a fork.',
    tokenize=True,     # Split punctuation marks from words?
    tags=True,         # Parse part-of-speech tags? (NN, JJ, ...)
    chunks=True,       # Parse chunks? (NP, VP, PNP, ...)
    relations=False,   # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata=False,     # Parse lemmata? (ate => eat)
    encoding='utf-8',  # Input string encoding.
class SpellingReplacer(object):

    def __init__(self, dict_name='en', max_dist=2):
        self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = max_dist

    def replace(self, word):
        if self.spell_dict.check(word):
            return word
        suggestions = self.spell_dict.suggest(word)
        if not suggestions:
            return word
        return min(suggestions, key=lambda sugg: edit_distance(word, sugg))
        # for sugg in suggestions:
        #     print sugg, edit_distance(word, sugg)


if __name__ == "__main__":
    SENTENCE = 'Yesteday I wrnt to the pqrk!'.split(' ')
    print " ".join([suggest(word)[0][0] for word in SENTENCE])
    sr = SpellingReplacer()
    print " ".join([sr.replace(word) for word in SENTENCE])
    # print " ".join([d.suggest(word) for word in SENTENCE])
def test_spelling_numbers(self):
    self.assertEqual(en.suggest("42"), [("42", 1.0)])
    self.assertEqual(en.suggest("3.1415"), [("3.1415", 1.0)])
from pattern.en import suggest
from aspell import Speller

# The algorithm manually implemented at spelling_correction_manual.py is available to be
# used out of the box in the pattern library: https://www.clips.uantwerpen.be/pattern.
# (Compatible with Python 2.7 only.)
#
# "Pattern is a web mining module for the Python programming language.
# It has tools for data mining (Google, Twitter and Wikipedia API, a web crawler, a HTML DOM parser),
# natural language processing (part-of-speech taggers, n-gram search, sentiment analysis, WordNet),
# machine learning (vector space model, clustering, SVM), network analysis and <canvas> visualization."
#
# Other libraries are:
# PyEnchant: http://pythonhosted.org/pyenchant/
# AspellPython, wrapper around GNU Aspell: https://github.com/WojciechMula/aspell-python (requires libaspell-dev)
#   sudo apt install libaspell-dev
#   pip install aspell-python-py3

print(suggest("fianlly"))
print(suggest("flaot"))

sp = Speller()
print(sp.suggest("fianlly"))
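# Hedged sketch (not part of the original snippet): the PyEnchant alternative listed above
# can be exercised the same way, assuming an installed "en_US" dictionary; enchant.Dict
# exposes check() for validation and suggest() for ranked replacement candidates.
import enchant

d = enchant.Dict("en_US")
print(d.check("fianlly"))        # False: the word is misspelled
print(d.suggest("fianlly")[:3])  # ranked candidates, e.g. ['finally', ...]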
    word = match.group()

    def case_of(text):
        """
        Return the case-function appropriate for text:
        upper, lower, title, or just str.
        """
        return (str.upper if text.isupper() else
                str.lower if text.islower() else
                str.title if text.istitle() else
                str)

    return case_of(word)(correct(word.lower()))


def correct_text_generic(text):
    """
    Correct all the words within a text, returning the corrected text.
    """
    return re.sub('[a-zA-Z]+', correct_match, text)


print(correct_text_generic('fianlly'))

from pattern.en import suggest
print(suggest('fianlly'))
print(suggest('flaot'))
    word = match.group()

    def case_of(text):
        """
        Return the case-function appropriate for text:
        upper, lower, title, or just str.
        """
        return (str.upper if text.isupper() else
                str.lower if text.islower() else
                str.title if text.istitle() else
                str)

    return case_of(word)(correct(word.lower()))


def correct_text_generic(text):
    """
    Correct all the words within a text, returning the corrected text.
    """
    return re.sub('[a-zA-Z]+', correct_match, text)


print correct_text_generic('fianlly')

from pattern.en import suggest
print suggest('fianlly')
print suggest('flaot')
def spell_checker():
    print(suggest('poblem'))
def clean_text(self, text, short=True, length=True, contra=True,
               remove_stopwords=True, lemmatize=True, english=False,
               ngrams=False, spelling=True, spCy=False):
    """
    Remove unwanted characters and stopwords, and format the text to create
    fewer nulls in the word embeddings; check spelling, lemmatize and compare
    with the WordNet corpus to keep only English words.

    Params:
        text: text data to clean
        remove_stopwords: if true, remove stop words from text to reduce noise
        lemmatize: if true, lemmatize words
        english: if true, compare with the WordNet corpus to keep only English words
        ngrams: if true, create ngrams
    Returns:
        text: cleaned text data
    """
    print("cleantoo")
    # print("T:", text)
    if contra:
        print("CLEAN")
        text = [self.remove_contractions(word) for word in sent_tokenize(text.lower())]
        text = " ".join(text)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = re.sub(r'[^a-zA-Z]', " ", text)
    if length:
        print("LENGTH")
        text = self.reduce_lengthening(text)
    if spelling:
        print("SPELLING")
        # Replace each token with its top-ranked spelling suggestion.
        text = " ".join(suggest(w)[0][0] for w in text.split())
    if remove_stopwords:
        print("STOP")
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)
    if short:
        print("SHORT")
        text = ' '.join([w for w in text.split() if len(w) > 3])
        print("S: ", text)
    if lemmatize:
        print("LEM")
        text_sent = nltk.word_tokenize(text)
        text = [WordNetLemmatizer().lemmatize(w, self.getWordnetPos(w)) for w in text_sent]
        text = " ".join(text)
    # spaCy lemmatization
    if spCy:
        text = " ".join(self.spCy(text))
    if english:
        print("ENGLISH")
        text = ' '.join([w for w in text.split() if wordnet.synsets(w)])
        print("P: ", text)
    if ngrams:
        print("NGRAM")
        text = text.split()
        text = [self.nGrams(text)]
    return text
def noise_generator(original_sentence_list, i, process_dict):
    noised_sentence_list = []
    for target_sentence in original_sentence_list:
        generated_source = []
        # x = x.lower()  # this causes some errors to be ignored (Mec)
        x_split = nltk.word_tokenize(target_sentence)
        x_pos = nltk.pos_tag(x_split)
        # avoid too much error creation
        similar_flag = False
        replace_flag = False
        proposition_flag = False
        plural_flag = False
        tense_flag = False
        modal_flag = False
        incorrect_comparative_flag = False
        for token, pos in x_pos:
            similar_token = (pos in NOUN_TAGS and random.random() < 0.3 and not similar_flag)
            dropout_token = (token in DROPOUT_TOKENS and random.random() < 0.3)
            incorrect_comparative_token = (pos in ADJECTIVE_TAGS and random.random() < 0.3 and not incorrect_comparative_flag)
            preposition_token = (pos in PREPOSITION_TAGS and random.random() < 0.3 and not proposition_flag)
            replace_token = (token in REPLACEMENTS and random.random() < 0.3 and not replace_flag)
            pos_plural_token = (pos in NOUN_TAGS and random.random() < 0.3 and not plural_flag)
            pos_tense_token = (pos in VERBS_TAGS and random.random() < 0.3 and not tense_flag)
            pos_modal_token = (token in MODAL and random.random() < 0.3 and not modal_flag)
            if replace_token:
                generated_source.append(REPLACEMENTS[token])
                replace_flag = True
            elif similar_token:
                temp = token[:-1] + "_"
                cand_list = suggest(temp)
                cand = random.choice(cand_list)[0]
                generated_source.append(cand)
                similar_flag = True
            elif preposition_token:
                generated_source.append(random.choice(PREPOSITIONS))
                proposition_flag = True
            elif incorrect_comparative_token:
                generated_source.append(random.choice(COMPARATIVES) + " " + token)
                incorrect_comparative_flag = True
            elif pos_plural_token:
                token = change_pluralization(token)
                generated_source.append(token)
                plural_flag = True
            elif pos_tense_token:
                token = change_tense(token)
                generated_source.append(token)
                tense_flag = True
            elif not dropout_token:
                generated_source.append(token)
            elif pos_modal_token:
                generated_source.append(MODAL[token])
                modal_flag = True
        noised_sentence_list.append(" ".join(generated_source))
    process_dict[i] = noised_sentence_list
sent = parse(text, lemmata=True)
sent = Sentence(sent)
print(modality(sent))

text = "I think we can complete this task"
sent = parse(text, lemmata=True)
sent = Sentence(sent)
print(modality(sent))

# ### Spelling Corrections

from pattern.en import suggest
print(suggest("Whitle"))

from pattern.en import suggest
print(suggest("Fracture"))

# ### Working with Numbers

from pattern.en import number, numerals
print(number("one hundred and twenty two"))
print(numerals(256.390, round=2))

from pattern.en import quantify
print(quantify([
def _suggest_aux(text, threshold=0.9):
    suggestions = suggest(text)
    if suggestions[0][1] > threshold:
        return suggestions[0][0]
    return text
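# Usage sketch (illustrative, not part of the original snippet): suggest() yields
# (candidate, confidence) pairs, and _suggest_aux only accepts the top candidate
# when its confidence clears the 0.9 threshold; otherwise the input is returned unchanged.
from pattern.en import suggest

print(suggest("amazng"))       # e.g. [("amazing", 1.0)]
print(_suggest_aux("amazng"))  # "amazing" if the top confidence exceeds 0.9, else "amazng"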
def test_spelling_punctuation(self): self.assertEqual(en.suggest("!"), [("!", 1.0)]) self.assertEqual(en.suggest("?"), [("?", 1.0)]) self.assertEqual(en.suggest("."), [(".", 1.0)])
def test_spelling_oneletter_words(self): self.assertEqual(en.suggest("I"), [("I", 1.0)]) self.assertEqual(en.suggest("a"), [("a", 1.0)])
def correctSpelling(s):
    # Take the top-ranked suggestion for each word.
    words = [en.suggest(w)[0][0] for w in s.split()]
    return ' '.join(words)
def clean_text(self, text, short=True, length=True, contra=True,
               remove_stopwords=True, lemmatize=True, english=True,
               ngrams=False, spelling=False, spCy=False, stem=False):
    """
    Remove unwanted characters and stopwords, and format the text to create
    fewer nulls in the word embeddings; check spelling, lemmatize and compare
    with the WordNet corpus to keep only English words.

    Params:
        text: text data to clean
        remove_stopwords: if true, remove stop words from text to reduce noise
        lemmatize: if true, lemmatize words
        english: if true, compare with the WordNet corpus to keep only English words
        ngrams: if true, create ngrams
    Returns:
        text: cleaned text data
    """
    if contra:
        text = [self.remove_contractions(word) for word in sent_tokenize(text.lower())]
        text = " ".join(text)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = re.sub(r'[^a-zA-Z]', " ", text)
    if length:
        text = self.reduce_lengthening(text)
    if spelling:
        # Replace each token with its top-ranked spelling suggestion.
        text = " ".join(suggest(w)[0][0] for w in text.split())
    if remove_stopwords:
        text = text.split()
        stops = stopwords.words("english")
        newStopWords = ['please', "name", "thank"]
        stops.extend(newStopWords)
        text = [w for w in text if not w in stops]
        text = " ".join(text)
    if short:
        text = ' '.join([w for w in text.split() if len(w) > 3])
    if lemmatize:
        text_sent = nltk.word_tokenize(text)
        text = [WordNetLemmatizer().lemmatize(w, self.getWordnetPos(w)) for w in text_sent]
        text = " ".join(text)
    if stem:
        text = self.portStem(text.split())
    ##### Attempt at using spaCy lemmatization #####
    # if spCy:
    #     text = " ".join(self.spCy(text))
    ##### Use of WordNet #####
    if english:
        text = ' '.join([w for w in text.split() if wordnet.synsets(w)])
    #### Creation of n-grams ####
    if ngrams:
        text = text.split()
        text = self.nGrams(text)
    return text
# a/an via pattern.en doesn't work well...
# only switches from "an" to "a", not vice versa...?
from pattern.en import article, suggest
import helpers as h

stories = []
with open('compareresults', 'r') as f:
    for line in f:
        line = line.strip()
        if line and line[-1] in "1234567890":
            stories.append(h.strip(line[:line.rfind(' ')]).split(' '))

sps = []
for s in stories:
    for w in s:
        sp = suggest(w)
        if len(sp) > 1:
            sps.append([w] + sp)


def fixaan(l):
    for i in range(len(l)):
        if l[i] == 'a' or l[i] == 'an':
            newa = article(l[i + 1])
            if newa == 'an':
                print l, i
            elif newa == 'a' and newa != l[i]:
                print l, i
                l[i] = newa
def corr(self, text):
    # Return the suggestion with the highest confidence.
    word, confidence = max(suggest(text), key=lambda s: s[1])
    return word