Example #1
from rake_nltk import Rake  # keyword extraction (rake-nltk package)


def process_question(question):
    # normalise the question and split it into words
    question = question.lower()
    question = question.strip('?').split()

    # check for any misspelled words
    for word in question:
        correction = spelling.correction(word)
        if correction != word:
            print("Would you like to replace " + word + " with " +
                  correction + "? (Y/N)")
            answer = input(">> ").lower()
            if answer in ('y', 'yes'):
                question = [q.replace(word, correction) for q in question]
            elif answer in ('n', 'no'):
                continue
            else:
                print("Please enter Yes or No.")

    question = " ".join(question)

    # extract keywords from the corrected question
    r = Rake()
    r.extract_keywords_from_text(question)
    return r.get_ranked_phrases()
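
A hedged usage sketch for the example above: it assumes the rake-nltk package is installed and that `spelling` is a project-local, Norvig-style module exposing `correction(word)`. The Y/N prompt reads from stdin:

process_question("Whta is the fastest animal?")
# Would you like to replace whta with what? (Y/N)
# >> y
# returns something like ['fastest animal'] (RAKE ranks multi-word phrases first)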
Example #2
def spellcheck(word):
    suggestion = spelling.correction(word)

    # If we have a spellcheck suggestion
    if suggestion != word:
        return _(
            "Meowth! \"{0}\" is not a Pokemon! Did you mean \"{1}\"?").format(
                word, suggestion)
    else:
        return _("Meowth! \"{0}\" is not a Pokemon! Check your spelling!"
                 ).format(word)
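
The `_` wrapper is presumably a gettext-style translation hook. A minimal stand-in for trying the function outside the bot (an assumption, not the project's actual setup):

_ = lambda s: s  # identity stand-in for the translation hook

print(spellcheck("Pikuchu"))
# Meowth! "Pikuchu" is not a Pokemon! Did you mean "Pikachu"?
# (assuming the spelling dictionary was built from Pokemon names)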
Example #3
	def correct_spelling_errors(self):
		# correct each word in self.words, preserving original capitalisation
		correctedWords = []
		for word in self.words:
			cap = word[0].isupper()
			correction = spelling.correction(word.lower())
			if cap:
				correction = correction[0].upper() + correction[1:]
			correctedWords.append(correction)
			if word != correction:
				self.spelling_err_count += 1
		self.words = correctedWords
Example #4
import re


def corrections(text):
    "Spell-correct all words in text."
    return re.sub('[a-zA-Z]+', lambda m: spelling.correction(m.group(0)), text)
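
Assuming `spelling.correction` behaves like Norvig's classic corrector (returning the most probable word within a small edit distance), the regex applies it to every alphabetic run while leaving spacing and punctuation untouched:

print(corrections("speling errurs, korrected."))
# -> "spelling errors, corrected."  (with a Norvig-style corrector)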
Example #5
def spellingCorrectionques(text):
  question = " ".join([spelling.correction(item) for item in text['question'].split()])
  return {"question": question}
Example #6
def spellingCorrection(text):
  model_answer = " ".join([spelling.correction(item) for item in text['model_answer'].split()])
  answer = " ".join([spelling.correction(item) for item in text['answer'].split()])
  return {"model_answer": model_answer, "answer": answer}
Example #7
def clean_data(data, stop_words=stopwords, stemmer=stemmer, lemmatizer=lemmatiser):

	'''
	RETURNS: ret_list, ret_pos_list, ret_index_list, ret_orig_pos_list
		1. list of cleaned words, one sub-list per sentence
		2. list of POS tags of the words in (1)
		3. index of each word of (1) in its original tokenized sentence
		4. original POS list of each tokenized sentence
	'''

	# # delete apostrophes, it's => its
	# data = data.replace('\'', '')	# done by tokenize
	# data = re.sub('[\']', '', data)

	data = data.lower()

	# separate sentences: turn full stops, question and exclamation marks into newlines
	data = re.sub(r'[?!.]', '\n', data)

	# expand common contractions (str.replace returns a new string,
	# so the result must be assigned back)
	data = data.replace('n\'t', ' not')
	data = data.replace('\'s', ' is')
	data = data.replace('\'d', ' had')
	data = data.replace('\'ll', ' will')
	data = data.replace('\'ve', ' have')
	data = data.replace('\'er', ' never')
	data = data.replace('\'re', ' are')
	# potential clause breakers, conjunctions
	# data = re.sub('[(but|except|aside from|apart from|other than|besides)]', '\n', data)

	# delete punctuation marks including ' and "
	data = re.sub(r'[^a-z\s]', '', data)

	# split each sentence
	data = data.split('\n')

	ret_list = []
	ret_pos_list = []
	ret_orig_pos_list = []
	ret_index_list = []

	for i in range(len(data)):

		# words from the review sentence
		ret_list.append([])

		# POS tag of each word in the final word list of the review sentence
		ret_pos_list.append([])

		# index of the word in the original review sentence
		ret_index_list.append([])

		sentence = data[i]

		# replace punctuation marks other than apostrophes with spaces (again, per sentence)
		sentence = re.sub(r'[^a-z\s\']', ' ', sentence)
		sentence = delete_duplicate_letters.sub(r"\1\1", sentence)

		if sym_spell_check == 1:

			input_term = sentence	# the whole sentence
			suggestions = sym_spell.lookup_compound(input_term, max_edit_distance_lookup)
			if sentence != suggestions[0].term:
				# print(sentence, '\nCorrection:', suggestions[0].term, '\n')
				sentence = suggestions[0].term  # take the first (best) suggestion

		words = word_tokenize(sentence)

		if spell_check == 1:
			corrected_sentence = []
			for word in words:
				if word == '':
					continue
				# blindly correct each word at depth spell_check_depth if not in dictionary
				cor_word = spelling.correction(word, spell_check_depth)
				corrected_sentence.append(cor_word)
			words = corrected_sentence

		# part of speech tagging
		words_pos = pos_tag(words)
		ret_orig_pos_list.append(words_pos)

		for j in range(len(words)):

			word = words[j]
			word_postag = words_pos[j][1]

			# lemmatize
			if lemmatize_words == 1:
				# postag=get_wordnet_pos(postag)
				for tag in ['v','a','s','r','n']:
					word_ = lemmatizer.lemmatize(word, pos=tag)
					if word_ != word:
						break
				word = word_

			# stem the (possibly lemmatized) word
			if stem_words == 1:
				stemmed_word = stemmer.stem(word)
				# keep a reverse dictionary stem -> original words (both may be the same);
				# record the pre-stem word before overwriting it
				if stemmed_word not in stemmed_words:
					stemmed_words[stemmed_word] = []
				if word not in stemmed_words[stemmed_word]:
					stemmed_words[stemmed_word].append(word)
				word = stemmed_word

			if word not in string.punctuation and not word.isnumeric() and len(word) > 2 and word not in stop_words:

				ret_list[i].append(word)
				ret_pos_list[i].append(word_postag)
				ret_index_list[i].append(j)

	return ret_list, ret_pos_list, ret_index_list, ret_orig_pos_list
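
The function above leans on module-level names that the snippet does not define. A hedged sketch of the configuration it appears to expect; only the names come from the code, the values and imports are illustrative assumptions:

import re
import string
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

spell_check = 1          # run the word-level corrector
spell_check_depth = 2    # search depth passed to spelling.correction
sym_spell_check = 0      # toggle SymSpell compound correction (symspellpy)
max_edit_distance_lookup = 2
lemmatize_words = 1
stem_words = 0
stemmed_words = {}       # reverse map: stem -> original words
stopwords = set(nltk_stopwords.words('english'))
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
# collapse runs of 3+ repeated letters to 2, e.g. "sooo" -> "soo"
delete_duplicate_letters = re.compile(r'(\w)\1{2,}')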
Example #8
def clean_data(data, stop_words, stemmer=stemmer, lemmatizer=lemmatiser):

    data = data.lower()
    # replace punctuation marks other than apostrophes with spaces
    data = re.sub(r'[^a-z\s\']', ' ', data)
    data = delete_duplicate_letters.sub(r"\1\1", data)

    # if sym_spell_check == 1:

    # 	input_term = (data)	# whole review
    # 	suggestions = sym_spell.lookup_compound(input_term, max_edit_distance_lookup)
    # 	#  for suggestion in suggestions:
    # 	if data != suggestions[0].term:
    # 		# print(data, '\nCorrection:', suggestions[0].term,'\n')
    # 		data = suggestions[0].term  # take the first suggestion, split at spaces

    words = word_tokenize(data)

    if spell_check == 1:
        corrected_sentence = []
        for word in words:
            if word == '':
                continue
            # blindly correct each word at spell_check_depth if not in dictionary
            cor_word = spelling.correction(word, spell_check_depth)
            corrected_sentence.append(cor_word)
        words = corrected_sentence

    # part of speech tagging
    # words_pos = pos_tag(words)

    ret_list = []

    # for j in range(len(words_pos)):
    for j in range(len(words)):
        # word, postag = words_pos[j]
        word = words[j]

        if word not in string.punctuation and not word.isnumeric() and len(
                word) > 2 and word not in stop_words:

            # lemmatize
            if lemmatize_words == 1:
                # postag=get_wordnet_pos(postag)
                for postag in ['v', 'a', 's', 'r', 'n']:
                    word_ = lemmatizer.lemmatize(word, pos=postag)
                    if word_ != word:
                        break
                word = word_

            # stem the (possibly lemmatized) word
            if stem_words == 1:
                stemmed_word = stemmer.stem(word)
                # keep a reverse dictionary stem -> original words (both may be the same);
                # record the pre-stem word before overwriting it
                if stemmed_word not in stemmed_words:
                    stemmed_words[stemmed_word] = []
                if word not in stemmed_words[stemmed_word]:
                    stemmed_words[stemmed_word].append(word)
                word = stemmed_word

            ret_list.append(word)

    return ret_list
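
An illustrative call under the same assumed configuration (output depends on the corrector's dictionary):

tokens = clean_data("The moovie was sooo goood!!!", stop_words=stopwords)
print(tokens)  # e.g. ['movie', 'good']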