# Example #1
def process_question(question):
    """Normalize a question, offer spelling fixes, and extract its keywords.

    The question is lower-cased, stripped of leading/trailing ``?`` and split
    on whitespace. For every distinct word that ``spelling.correction`` would
    change, the user is asked on stdin whether to accept the suggestion; the
    prompt repeats until a yes/no answer is given. The resulting text is then
    handed to RAKE for keyword extraction.

    Args:
        question: The raw question string.

    Returns:
        The value of ``Rake.get_ranked_phrases()`` for the cleaned question.
    """
    # process the question
    words = question.lower().strip('?').split()

    # check for any misspelled words
    asked = set()
    # Iterate over a snapshot: ``words`` is rebound when a fix is accepted.
    for word in list(words):
        if word in asked:
            continue  # do not prompt twice for the same token
        asked.add(word)

        suggestion = spelling.correction(word)
        if not suggestion or suggestion == word:
            continue

        print("Would you like to replace " + word + " with " +
              suggestion + "? (Y/N)")
        while True:
            answer = input(">> ").strip().lower()
            if answer in ('y', 'yes'):
                # Replace whole tokens only; a substring replace could
                # corrupt unrelated words that merely contain ``word``.
                words = [
                    suggestion if token == word else token
                    for token in words
                ]
                break
            if answer in ('n', 'no'):
                break
            print("Please Enter Yes or No.")

    text = " ".join(words)

    # extract keywords from question
    r = Rake()
    # The ranked phrases only exist after the text has been processed.
    r.extract_keywords_from_text(text)
    return r.get_ranked_phrases()
# Example #2
def spellcheck(word):
    """Build the "not a Pokemon" reply for *word*.

    Args:
        word: The name the user typed.

    Returns:
        A translated message. When ``spelling.correction`` proposes a
        different spelling the message includes that suggestion; otherwise
        it just asks the user to check their spelling.
    """
    suggestion = spelling.correction(word)

    # If we have a spellcheck suggestion
    if suggestion != word:
        return _(
            "Meowth! \"{0}\" is not a Pokemon! Did you mean \"{1}\"?").format(
                word, suggestion)

    # No better spelling is known; fall back to the generic hint.
    return _("Meowth! \"{0}\" is not a Pokemon! Check your spelling!"
             ).format(word)
# Example #3
	def correct_spelling_errors(self):
		"""Replace ``self.words`` with spell-corrected words and count the changes.

		Each word is corrected in lower case via ``spelling.correction``; if the
		original word started with an upper-case letter, the correction is
		re-capitalized. ``self.spelling_err_count`` is incremented once for every
		word whose corrected form differs from the original.
		"""
		correctedWords = []
		for word in self.words:
			if not word:
				# Nothing to correct, and word[0] would raise IndexError.
				correctedWords.append(word)
				continue
			cap = word[0].isupper()
			correction = spelling.correction(word.lower())
			if cap and correction:
				correction = correction[0].upper() + correction[1:]
			if word != correction:
				self.spelling_err_count += 1
			# Keep the corrected word; without this self.words ended up empty.
			correctedWords.append(correction)
		self.words = correctedWords
# Example #4
def corrections(text):
    """Return *text* with every run of ASCII letters spell-corrected.

    Characters that are not ASCII letters are left untouched.
    """
    def _fix(match):
        # Substitute each matched word with its suggested correction.
        return spelling.correction(match.group(0))

    return re.sub('[a-zA-Z]+', _fix, text)
# Example #5
def spellingCorrectionques(text):
  """Return ``{"question": ...}`` with the question text spell-corrected.

  ``text['question']`` is split on whitespace, every token is passed through
  ``spelling.correction``, and the tokens are re-joined with single spaces.
  """
  fixed_tokens = []
  for token in text['question'].split():
    fixed_tokens.append(spelling.correction(token))
  return {"question": " ".join(fixed_tokens)}
# Example #6
def spellingCorrection(text):
  """Return spell-corrected copies of the model answer and the given answer.

  Both ``text['model_answer']`` and ``text['answer']`` are split on
  whitespace, corrected token by token with ``spelling.correction``, and
  re-joined with single spaces.
  """
  def _correct_all(passage):
    # Correct each whitespace-separated token and glue them back together.
    return " ".join(spelling.correction(token) for token in passage.split())

  return {
    "model_answer": _correct_all(text['model_answer']),
    "answer": _correct_all(text['answer']),
  }
def clean_data(data, stop_words=stopwords, stemmer=stemmer, lematizer=lemmatiser):
	"""Split review text into sentences and clean, tag and filter their words.

	The text is lower-cased, split into sentences on '.', '?' and '!',
	contractions are expanded, and everything except letters and whitespace
	is removed. Depending on the module-level flags ``sym_spell_check``,
	``spell_check``, ``lemmatize_words`` and ``stem_words`` each sentence is
	then spell-corrected, POS-tagged, lemmatized and stemmed. Words that are
	numeric, at most two characters long, or contained in ``stop_words`` are
	dropped.

	Side effect: when stemming is enabled, the module-level ``stemmed_words``
	dict is extended with a ``stem -> [words]`` entry for every stemmed word.

	NOTE(review): several statements of this function were missing from the
	source (empty ``if`` bodies, result lists never filled). They were
	reconstructed from the surrounding comments; in particular the
	one-sub-list-per-sentence layout of the results is an assumption and
	should be confirmed against the callers.

	Args:
		data: raw review text.
		stop_words: collection of words to drop from the result.
		stemmer: object providing ``stem(word)``.
		lematizer: object providing ``lemmatize(word, pos=...)``.

	Returns:
		``(ret_list, ret_pos_list, ret_index_list, ret_orig_pos_list)``, each
		with one entry per sentence:
		1. list of cleaned words
		2. list of POS tags of the words
		3. index of each word of (1) in the original list
		4. original pos list of the ret_list
	"""
	data = data.lower()

	# separate sentences with full stops, question and exclamation marks
	data = re.sub('[?!.]', '\n', data)

	# Expand contractions. str.replace returns a new string, so the result
	# has to be assigned back; this must happen before apostrophes are
	# deleted below.
	contractions = (
		('n\'t', ' not'),
		('\'s', ' is'),
		('\'d', ' had'),
		('\'ll', ' will'),
		('\'ve', ' have'),
		('\'er', ' never'),
		('\'re', ' are'),
	)
	for short_form, expansion in contractions:
		data = data.replace(short_form, expansion)

	# delete punctuation marks including ' and "
	data = re.sub(r'[^a-z\s]', '', data)

	ret_list = []
	ret_pos_list = []
	ret_orig_pos_list = []
	ret_index_list = []

	# split each sentence
	for sentence in data.split('\n'):

		# words from the review sentence
		sentence_words = []

		# POS tag of each word in the final word list of the review sentence
		sentence_pos = []

		# index of the word in the original review sentence
		sentence_index = []

		# replace any remaining non-letter characters with spaces
		sentence = re.sub(r"[^a-z\s']", ' ', sentence)
		sentence = delete_duplicate_letters.sub(r"\1\1", sentence)

		if sym_spell_check == 1:
			suggestions = sym_spell.lookup_compound(sentence, max_edit_distance_lookup)
			# take the first suggestion, if the lookup produced any
			if suggestions:
				sentence = suggestions[0].term
		words = word_tokenize(sentence)

		if spell_check == 1:
			corrected_sentence = []
			for word in words:
				if word == '':
					continue
				# blindly correct each word at depth spell_check_depth if not in dictionary
				cor_word = spelling.correction(word, spell_check_depth)
				corrected_sentence.append(cor_word)
			words = corrected_sentence

		# part of speech tagging
		words_pos = pos_tag(words)

		for j, word in enumerate(words):
			word_postag = words_pos[j][1]

			# lemmatize: stop at the first POS for which the word changes
			if lemmatize_words == 1:
				for tag in ['v', 'a', 's', 'r', 'n']:
					word_ = lematizer.lemmatize(word, pos=tag)
					if word_ != word:
						break
				word = word_

			# stem () if lemmatize, stem words with same
			if stem_words == 1:
				stemmed_word = stemmer.stem(word)
				# keep a reverse dictionary of stemmed words, both may be the same
				# NOTE(review): records the pre-stem word; confirm intent.
				originals = stemmed_words.setdefault(stemmed_word, [])
				if word not in originals:
					originals.append(word)
				word = stemmed_word

			if word not in string.punctuation and not word.isnumeric() and len(word) > 2 and word not in stop_words:
				sentence_words.append(word)
				sentence_pos.append(word_postag)
				sentence_index.append(j)

		ret_list.append(sentence_words)
		ret_pos_list.append(sentence_pos)
		ret_index_list.append(sentence_index)
		ret_orig_pos_list.append(words_pos)

	return ret_list, ret_pos_list, ret_index_list, ret_orig_pos_list
def clean_data(data, stop_words, stemmer=stemmer, lematizer=lemmatiser):
    """Clean a piece of text into a flat list of normalized words.

    The text is lower-cased, every character other than a letter, whitespace
    or apostrophe is replaced by a space, and ``delete_duplicate_letters`` is
    applied. The text is then tokenized and, depending on the module-level
    flags ``spell_check``, ``lemmatize_words`` and ``stem_words``,
    spell-corrected, lemmatized and stemmed. Tokens that are numeric, at most
    two characters long, or contained in ``stop_words`` are dropped.

    Side effect: when stemming is enabled, the module-level ``stemmed_words``
    dict is extended with a ``stem -> [words]`` entry for every stemmed word.

    NOTE(review): the bodies of several ``if`` statements and the statement
    filling ``ret_list`` were missing from the source and have been
    reconstructed from the surrounding comments; confirm against callers.

    Args:
        data: raw text.
        stop_words: collection of words to drop from the result.
        stemmer: object providing ``stem(word)``.
        lematizer: object providing ``lemmatize(word, pos=...)``.

    Returns:
        The list of cleaned words, in their original order.
    """
    data = data.lower()
    # replace punctuation marks such as '?', '!' with spaces
    data = re.sub(r"[^a-z\s']", ' ', data)
    data = delete_duplicate_letters.sub(r"\1\1", data)

    words = word_tokenize(data)

    if spell_check == 1:
        corrected_sentence = []
        for word in words:
            if word == '':
                continue
            # blindly correct each word at spell_check_depth if not in dictionary
            cor_word = spelling.correction(word, spell_check_depth)
            corrected_sentence.append(cor_word)
        words = corrected_sentence

    ret_list = []

    for word in words:
        if word not in string.punctuation and not word.isnumeric() and len(
                word) > 2 and word not in stop_words:

            # lemmatize: stop at the first POS for which the word changes
            if lemmatize_words == 1:
                for postag in ['v', 'a', 's', 'r', 'n']:
                    word_ = lematizer.lemmatize(word, pos=postag)
                    if word_ != word:
                        break
                word = word_

            # stem () if lemmatize, stem words with same
            if stem_words == 1:
                stemmed_word = stemmer.stem(word)
                # keep a reverse dictionary of stemmed words, both may be the same
                # NOTE(review): records the pre-stem word; confirm intent.
                originals = stemmed_words.setdefault(stemmed_word, [])
                if word not in originals:
                    originals.append(word)
                word = stemmed_word

            ret_list.append(word)

    return ret_list