def process_question(question):
    # normalize the question: lowercase, strip the trailing '?', tokenize
    question = question.lower()
    question = question.strip('?').split()
    # check for misspelled words and offer interactive corrections
    for word in question:
        correction = spelling.correction(word)
        if correction != word:
            print("Would you like to replace " + word + " with " + correction + "? (Y/N)")
            while True:
                answer = input(">> ").lower()
                if answer in ('y', 'yes'):
                    question = [q.replace(word, correction) for q in question]
                    break
                elif answer in ('n', 'no'):
                    break
                else:
                    # re-prompt until we get a yes/no answer
                    print("Please enter Yes or No.")
    question = " ".join(question)
    # extract keywords from the corrected question
    r = Rake()
    r.extract_keywords_from_text(question)
    return r.get_ranked_phrases()
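# Hypothetical usage sketch for process_question: the snippet does not show its
# imports, so Rake is assumed to come from rake_nltk and 'spelling' is assumed
# to be a Norvig-style corrector module (see the sketch after corrections()
# below). Both imports are assumptions.
from rake_nltk import Rake  # assumed source of Rake
import spelling             # assumed corrector module

phrases = process_question("Whta is the captial of France?")
# answering 'y' to both replacement prompts yields ranked phrases such as
print(phrases)  # e.g. ['capital', 'france']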
def spellcheck(word):
    suggestion = spelling.correction(word)
    # if the corrector has a suggestion, offer it; otherwise just flag the word
    if suggestion != word:
        return _('Meowth! "{0}" is not a Pokemon! Did you mean "{1}"?').format(word, suggestion)
    else:
        return _('Meowth! "{0}" is not a Pokemon! Check your spelling!').format(word)
def correct_spelling_errors(self):
    correctedWords = []
    for word in self.words:
        # remember whether the word was capitalized, correct it in lowercase,
        # then restore the capitalization on the corrected form
        cap = word[0].isupper()
        correction = spelling.correction(word.lower())
        if cap:
            correction = correction[0].upper() + correction[1:]
        correctedWords.append(correction)
        if word != correction:
            self.spelling_err_count += 1
    self.words = correctedWords
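# Hypothetical harness for correct_spelling_errors: the method only expects an
# object with a 'words' list and a 'spelling_err_count' counter, so a minimal
# container (invented here for illustration) is enough to exercise it.
class Document:
    correct_spelling_errors = correct_spelling_errors  # reuse the function above as a method

    def __init__(self, text):
        self.words = text.split()
        self.spelling_err_count = 0

doc = Document("Teh quick brown fox")
doc.correct_spelling_errors()
print(doc.words)               # e.g. ['The', 'quick', 'brown', 'fox']
print(doc.spelling_err_count)  # e.g. 1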
def corrections(text):
    "Spell-correct all words in text."
    return re.sub('[a-zA-Z]+', lambda m: spelling.correction(m.group(0)), text)
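# None of the snippets above define the 'spelling' module they call. A minimal
# Norvig-style spelling.py that satisfies the one-argument correction() calls
# could look like this -- a sketch under that assumption: 'big.txt' is a
# placeholder training corpus, and the two-argument correction(word, depth)
# used by the clean_data variants below is not reproduced here.
import re
from collections import Counter

def _words(text):
    return re.findall(r'[a-z]+', text.lower())

# WORDS maps each known word to its frequency in the training corpus
WORDS = Counter(_words(open('big.txt').read()))

def _P(word, n=sum(WORDS.values())):
    "Relative frequency of word in the corpus."
    return WORDS[word] / n

def correction(word):
    "Most probable spelling correction for word."
    return max(candidates(word), key=_P)

def candidates(word):
    "Known word, else known words one or two edits away, else the word itself."
    return known([word]) or known(edits1(word)) or known(edits2(word)) or [word]

def known(words):
    "Subset of words that appear in the dictionary."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All strings one edit (delete, transpose, replace, insert) away from word."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    "All strings two edits away from word."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))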
def spellingCorrectionques(text):
    # correct each word of the question, then rejoin into a single string
    question = " ".join([spelling.correction(item) for item in text['question'].split()])
    return {"question": question}
def spellingCorrection(text):
    # correct each word of the model answer and the student answer
    model_answer = " ".join([spelling.correction(item) for item in text['model_answer'].split()])
    answer = " ".join([spelling.correction(item) for item in text['answer'].split()])
    return {"model_answer": model_answer, "answer": answer}
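# Quick check of the two dict-based helpers (inputs invented for illustration;
# exact output depends on the corrector's word list, hence the 'e.g.' results).
print(spellingCorrectionques({"question": "waht is recursion"}))
# e.g. {'question': 'what is recursion'}
print(spellingCorrection({"model_answer": "a functoin that calls itself",
                          "answer": "a function that calls itslef"}))
# e.g. {'model_answer': 'a function that calls itself',
#       'answer': 'a function that calls itself'}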
def clean_data(data, stop_words=stopwords, stemmer=stemmer, lemmatiser=lemmatiser):
    '''
    return ret_list, ret_pos_list, ret_index_list, ret_orig_pos_list
    RETURNS:
    1. list of cleaned words
    2. list of POS tags of the words
    3. index of each word of (1) in the original sentence
    4. original POS list of ret_list
    '''
    # apostrophes (it's => its) are handled by the tokenizer, so no need to delete them here
    data = data.lower()
    # separate sentences at full stops, question marks and exclamation marks
    data = re.sub(r'[?!.]', '\n', data)
    # expand common contractions (str.replace returns a new string, so reassign)
    data = data.replace("n't", ' not')
    data = data.replace("'s", ' is')
    data = data.replace("'d", ' had')
    data = data.replace("'ll", ' will')
    data = data.replace("'ve", ' have')
    data = data.replace("'er", ' never')
    data = data.replace("'re", ' are')
    # potential clause breakers, conjunctions
    # data = re.sub(r'(but|except|aside from|apart from|other than|besides)', '\n', data)
    # delete punctuation marks including ' and "
    data = re.sub(r'[^a-z\s]', '', data)
    # split into sentences
    data = data.split('\n')

    ret_list = []
    ret_pos_list = []
    ret_orig_pos_list = []
    ret_index_list = []
    for i in range(len(data)):
        # words kept from this review sentence
        ret_list.append([])
        # POS tag of each word in the final word list of the sentence
        ret_pos_list.append([])
        # index of each kept word in the original sentence
        ret_index_list.append([])
        sentence = data[i]
        # replace any remaining punctuation with spaces
        sentence = re.sub(r"[^a-z\s']", ' ', sentence)
        # collapse runs of repeated letters down to two, e.g. 'cooool' -> 'cool'
        sentence = delete_duplicate_letters.sub(r'\1\1', sentence)
        if sym_spell_check == 1:
            input_term = sentence  # the whole sentence
            suggestions = sym_spell.lookup_compound(input_term, max_edit_distance_lookup)
            if sentence != suggestions[0].term:
                # take the first suggestion
                sentence = suggestions[0].term
        words = word_tokenize(sentence)
        if spell_check == 1:
            corrected_sentence = []
            for word in words:
                if word == '':
                    continue
                # blindly correct each word at depth spell_check_depth if not in dictionary
                cor_word = spelling.correction(word, spell_check_depth)
                corrected_sentence.append(cor_word)
            words = corrected_sentence
        # part-of-speech tagging
        words_pos = pos_tag(words)
        ret_orig_pos_list.append(words_pos)
        for j in range(len(words)):
            word = words[j]
            word_postag = words_pos[j][1]
            # lemmatize, trying each POS until the word changes
            if lemmatize_words == 1:
                for tag in ['v', 'a', 's', 'r', 'n']:
                    word_ = lemmatiser.lemmatize(word, pos=tag)
                    if word_ != word:
                        break
                word = word_
            # stem, keeping a reverse dictionary from stem to original word(s);
            # both may be the same
            if stem_words == 1:
                stemmed_word = stemmer.stem(word)
                if stemmed_word not in stemmed_words:
                    stemmed_words[stemmed_word] = []
                if word not in stemmed_words[stemmed_word]:
                    stemmed_words[stemmed_word].append(word)
                word = stemmed_word
            if word not in string.punctuation and not word.isnumeric() and len(word) > 2 and word not in stop_words:
                ret_list[i].append(word)
                ret_pos_list[i].append(word_postag)
                ret_index_list[i].append(j)
    return ret_list, ret_pos_list, ret_index_list, ret_orig_pos_list
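# The sym_spell_check branch above presumes a preconfigured symspellpy SymSpell
# instance at module level. A sketch of the setup it implies; the dictionary
# file and edit-distance values are assumptions.
from symspellpy import SymSpell

max_edit_distance_lookup = 2
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
# frequency_dictionary_en_82_765.txt ships with symspellpy; the path is assumed
sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt",
                          term_index=0, count_index=1)

suggestions = sym_spell.lookup_compound("whereis th elove", max_edit_distance_lookup)
print(suggestions[0].term)  # e.g. 'where is the love'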
def clean_data(data, stop_words, stemmer=stemmer, lemmatiser=lemmatiser):
    data = data.lower()
    # replace punctuation marks ('?', '!', ...) with spaces, keeping apostrophes
    data = re.sub(r"[^a-z\s']", ' ', data)
    # collapse runs of repeated letters down to two, e.g. 'cooool' -> 'cool'
    data = delete_duplicate_letters.sub(r'\1\1', data)
    # if sym_spell_check == 1:
    #     suggestions = sym_spell.lookup_compound(data, max_edit_distance_lookup)
    #     if data != suggestions[0].term:
    #         data = suggestions[0].term  # take the first suggestion
    words = word_tokenize(data)
    if spell_check == 1:
        corrected_sentence = []
        for word in words:
            if word == '':
                continue
            # blindly correct each word at depth spell_check_depth if not in dictionary
            cor_word = spelling.correction(word, spell_check_depth)
            corrected_sentence.append(cor_word)
        words = corrected_sentence
    # part-of-speech tagging is disabled in this variant
    # words_pos = pos_tag(words)
    ret_list = []
    for word in words:
        if word not in string.punctuation and not word.isnumeric() and len(word) > 2 and word not in stop_words:
            # lemmatize, trying each POS until the word changes
            if lemmatize_words == 1:
                for postag in ['v', 'a', 's', 'r', 'n']:
                    word_ = lemmatiser.lemmatize(word, pos=postag)
                    if word_ != word:
                        break
                word = word_
            # stem, keeping a reverse dictionary from stem to original word(s);
            # both may be the same
            if stem_words == 1:
                stemmed_word = stemmer.stem(word)
                if stemmed_word not in stemmed_words:
                    stemmed_words[stemmed_word] = []
                if word not in stemmed_words[stemmed_word]:
                    stemmed_words[stemmed_word].append(word)
                word = stemmed_word
            ret_list.append(word)
    return ret_list
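# Both clean_data variants read module-level configuration instead of taking it
# as parameters. A sketch of the globals they expect: the flag values and the
# duplicate-letter regex are inferred from how they are used, and spell_check
# is left off here because the two-argument corrector is not reproduced above.
import re
import string
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

spell_check = 0        # assumed flag; enables spelling.correction(word, depth)
sym_spell_check = 0    # assumed flag; enables the SymSpell pass
lemmatize_words = 1    # assumed flag
stem_words = 1         # assumed flag
spell_check_depth = 2  # assumed edit depth for the corrector

stopwords = set(nltk_stopwords.words('english'))
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
stemmed_words = {}  # reverse map: stem -> original word(s)

# collapse three or more repeated letters down to two, matching sub(r'\1\1')
delete_duplicate_letters = re.compile(r'(\w)\1{2,}')

tokens = clean_data("The movies were great! I'd watch them again.", stopwords)
print(tokens)  # e.g. ['movi', 'great', 'watch'] (Porter stems; no spell pass)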