示例#1
0
def correct(x):
	"""Spell-correct every alphabetic word found in ``x``.

	Runs of ASCII letters delimited by word boundaries are extracted from
	``x``; everything else (digits, punctuation, whitespace) is dropped.
	Each word is passed through ``spell.correct`` and the results are
	concatenated, each one followed by a single space, so a non-empty
	result always ends with a trailing space.

	Args:
		x: The text to correct.

	Returns:
		The corrected words, each followed by a space, or ``''`` when
		``x`` contains no alphabetic word.
	"""
	words = re.findall(r"\b([a-zA-Z]+)\b", x)
	# Only the per-word correction contributes to the result; correcting
	# the whole input again on every iteration was wasted work.
	return ''.join(spell.correct(word) + ' ' for word in words)
示例#2
0
def correctedWord(word, d):
	"""Return one randomly chosen element of ``spell.correct(word)``.

	Args:
		word: The word to correct.
		d: Unused; kept so existing callers keep working.

	Returns:
		A single element drawn with ``random.sample`` from whatever
		``spell.correct(word)`` returns.
	"""
	candidates = spell.correct(word)
	picked = random.sample(candidates, 1)
	return picked[0]
示例#3
0
def correct_answer(NWORDS, all_words,student_answers):
	"""Spell-correct the text part of every student answer.

	Args:
		NWORDS: Passed through as the second argument of ``spell.correct``.
		all_words: Unused by this function.
		student_answers: Iterable of indexable records; item 0 is the
			answer text and item 1 is carried over unchanged.

	Returns:
		A list of ``(corrected_text, record[1])`` tuples, where the
		corrected text is the whitespace-split words of the answer, each
		corrected, re-joined with single spaces.
	"""
	corrected = []
	for answer in student_answers:
		fixed_words = [spell.correct(token, NWORDS) for token in answer[0].split()]
		corrected.append((" ".join(fixed_words), answer[1]))
	return corrected
def spell_correct(tweet):
    """Spell-correct a tweet word by word.

    The tweet is split on whitespace, every word is passed through
    ``correct`` and the results are concatenated.

    Returns:
        The corrected words, each followed by one space (so a non-empty
        result ends with a trailing space); ``""`` for a blank tweet.
    """
    pieces = [correct(token) + " " for token in tweet.split()]
    return "".join(pieces)
示例#5
0
def sanitize(text):
	"""Normalise each item of ``text`` and drop stop words.

	Every item is run through ``tolower``, ``goodify`` and ``correct`` in
	that order; items whose final form is in ``stop_words`` are discarded.

	Args:
		text: Iterable of words.

	Returns:
		A list of the normalised words that are not stop words, in their
		original order.
	"""
	kept = []
	for raw in text:
		cleaned = correct(goodify(tolower(raw)))
		if cleaned in stop_words:
			continue
		kept.append(cleaned)
	return kept
示例#6
0
def getScore(n):
    text = nltk.corpus.brown.sents()
    ss = []
    wss = []
    for i in xrange(n):
        s = []
        while len(s) <= 2 or s[0][0] > "a" or s[-1] != ".":
            s = text[int(random.uniform(0, len(text)))]
        ss.append(s)

        ws = list(s)
        w = []
        while len(w) <= 1 or "." in w:
            j = int(random.uniform(0, len(ws) - 1))
            w = list(ws[j])
        k = int(random.uniform(0, len(w)))
        c = chr(int(random.uniform(ord("a"), ord("z") + 1)))
        w[k] = c
        ws[j] = "".join(w)
        # print c, s[j], ws[j]
        assert len(s[j]) == len(ws[j])
        wss.append(ws)

    ss = unsentences(ss)
    wss = unsentences(wss)
    css = spell.correct(wss)
    ss = splitSentence(ss)
    wss = splitSentence(wss)
    css = splitSentence(css)
    dd = 0
    for (s, ws, cs) in map(None, ss, wss, css):
        print unwords(s)
        print unwords(ws)
        print unwords(cs)
        d = distance.distance(s, cs)
        print d
        print
        if d > 0:
            dd += 1
    return float(dd) / n
def normal_srch(query):
	start_time=time.time()				
	wrong_words=index_search(query)
	split_query=str(query).split()
	corrected_word=""
	flag=0					#if wrong_words is not empty then generate corrected string
	if wrong_words:
		
		for x in split_query:
			if x in wrong_words:
				corrected_word = corrected_word + " " + spell.correct(x)				
			else:
				corrected_word=corrected_word + " " + x
		
		final_result=index_search(corrected_word)
		flag=1

	query_time=time.time()-start_time
	
	if wrong_words and not final_result:
		print "Did you mean... "
		for x in corrected_word.split():
			
			if x in split_query:
				print "%s"%x
			else:
				print "<font color=blue><i>%s</i></font>"%x
	
	still_words_remaining = not(len(wrong_words) == len(split_query))
	
	if not wrong_words or not final_result or still_words_remaining:
		if flag==1 :					##case when input string has been corrected..
			evaluate(corrected_word)
		else:
			evaluate(query)
		print "<br>Your search query took <b><font color=blue>%s</font></b> seconds.<br><hr>"%query_time
		print_results()
		
	else:
		print "<br>Your search <b>%s</b> did not match any document"%(query)		
	def get_stats(self, line2, label, badwords, negword_list, posword_list):
		"""Clean one raw text line and return its bucketed feature counts.

		Python 2 only: the cleaning step relies on
		``str.decode('string-escape')``.

		Processing order: count uppercase letters in the raw line; lowercase,
		strip and drop non-ASCII characters; drop the first and last
		character (the surrounding double quotes); unescape; replace a
		leading ``@name``; count and collapse characters repeated three or
		more times; mask bad words and runs of three or more symbols;
		expand ``u`` / ``u r``; spell-correct every token; count negative
		and positive words; finally bucket all counts.

		Args:
			line2: Raw input line, expected to be wrapped in double quotes.
			label: Passed through unchanged as the first result item.
			badwords: Iterable of words to mask with ``xxbdWrdxx``.
			negword_list: Iterable of negative words (stripped before use).
			posword_list: Iterable of positive words (stripped before use).

		Returns:
			``[label, badword_count, rep_count, negword_count,
			posword_count, '"' + cleaned_line + '"', caps_count]`` where the
			counts are bucketed as follows: bad words are capped at 3;
			repetitions map 2 -> 1 and >= 3 -> 2; negative words map
			2 -> 1, 3..7 -> 2, >= 8 -> 3; positive words map 2 -> 1,
			3..6 -> 2, >= 7 -> 3; capitals map 2 -> 1, 3..5 -> 2, >= 6 -> 3.
			Values of 0 and 1 are left as they are.
		"""
		codecs.register_error('replace_with_space', self.handler)

		# count uppercase letters
		caps_count = sum(x.isupper() for x in line2)

		# remove garbage, lowercase & strip
		line = ''.join(filter(lambda x: ord(x)<128,line2.lower().strip()))

		# remove double quotes
		line = line[1:-1]

		# decode to ascii
		line = line.decode('string-escape').decode('utf-8','replace_with_space').encode('ascii','ignore').decode('unicode-escape').encode('iso-8859-1','replace_with_space')

		# remove @name
		line = re.sub(r'^@\w{2,}', r'NameOfPerson', line)

		# count words with unwanted repetitions
		rep_count = len(re.findall(r'(.)\1\1+',line))

		# remove unwanted repetitions
		line = re.sub(r'(.)\1\1+', r'\1', line)

		# replace badwords, and any run of three or more symbol characters.
		# The symbol class had been mangled into "[email protected]" by
		# e-mail obfuscation, which left the pattern invalid; "!@" is the
		# restored form.
		for badword in badwords:
			line = re.sub(r"\b"+re.escape(badword)+r"\b|[!@#$%^&*+?~`]{3,}", r'xxbdWrdxx', line)

		# replace 'u' with 'you' & 'ur' with 'you are'
		line = re.sub(r"\bu\b", r'you', line)
		line = re.sub(r"\bu\s*r\b", r'you are', line)

		# correct spelling
		tmp_line = []
		for word in re.split(r"[^\w\,\'\.\-\?\!]+", line):
			tmp_line.append(spell.correct(word))
		line = ' '.join(tmp_line)

		# count negative words (substring occurrences)
		negword_count = 0
		for negword in negword_list:
			negword_count += line.count(negword.strip())

		# count positive words (substring occurrences)
		posword_count = 0
		for posword in posword_list:
			posword_count += line.count(posword.strip())

		#---categorize counts---
		# categorize badword_count
		# NOTE(review): counted after spell correction, so this assumes
		# spell.correct leaves the "xxbdWrdxx" marker intact -- confirm.
		badword_count = line.count("xxbdWrdxx")
		if badword_count >=3:
			badword_count = 3
		# categorize rep_count
		if rep_count == 2:
			rep_count = 1
		elif rep_count >=3:
			rep_count = 2
		# categorize negword_count
		if negword_count == 2:
			negword_count = 1
		elif negword_count >= 3 and negword_count <= 7:
			negword_count = 2
		elif negword_count >= 8:
			negword_count = 3
		# categorize posword_count
		if posword_count == 2:
			posword_count = 1
		elif posword_count >= 3 and posword_count <= 6:
			posword_count = 2
		elif posword_count >= 7:
			posword_count = 3
		# categorize caps_count
		if caps_count == 2:
			caps_count = 1
		elif caps_count >=3 and caps_count <=5:
			caps_count = 2
		elif caps_count >=6:
			caps_count = 3

		# return the processed line together with its stats
		return [label, badword_count, rep_count, negword_count, posword_count, "\""+line+"\"", caps_count]
示例#9
0
    # Body of an event loop whose header is outside this excerpt (see the
    # break/continue below). Fetch the next event from the watched window.
    # NOTE(review): the (c1, c2, q0, q1, flag, nr, text) layout resembles the
    # acme editor's event protocol -- confirm against the window wrapper.
    (c1, c2, q0, q2, flag, nr, r) = win.getevent()
    if c2 in "xX":
        # "x"/"X" events: consume the follow-up events announced by the flag
        # bits (one for bit 2, two for bit 8); their contents are discarded.
        if flag & 2:
            win.getevent()
        if flag & 8:
            win.getevent()
            win.getevent()
        # Hand the event back to the window.
        win.writeevent(c1, c2, q0, q2)
        if c2 == "x" and r == "Del":
            # "Del" closes the output window and ends the loop.
            outwin.delete()
            break
    if c1 == "K" and c2 == "I":
        # "K"/"I" events: presumably keyboard insertions -- TODO confirm.
        ch = r[0]
        if ch in " \t\r\n":
            # Whitespace was typed: blank the suggestion and wait for the
            # next event.
            outwin.replace(",", "")
            continue
        # Walk backwards from q0 one character at a time until whitespace is
        # read or the start of the text is passed.
        while q0 >= 0 and not (ch in " \t\r\n"):
            sss = win.read(q0, q0+1)
            if not sss:
                # print("empty sss %d" % q0)
                # Treat an empty read as whitespace so the scan stops.
                sss = " "
            ch = sss[0]
            q0 -= 1
        if q0 < 0 and not(ch in " \t\r\n"):
            # Ran off the beginning without meeting whitespace: the word
            # starts at offset 0.
            q0 = 0
        else:
            # The loop stepped one past the whitespace it found, so the word
            # starts two positions after the current q0.
            q0 += 2
        # Read the word between q0 and q2 and show its correction in the
        # output window.
        ss = win.read(q0,q2)
        lastcorrect = spell.correct(ss)
        outwin.replace(",", lastcorrect)
示例#10
0
文件: main.py 项目: ccxu/spellcheck
 def post(self):
     """Handle a POST: spell-correct the submitted text and render the page.

     The ``text`` request parameter is HTML-escaped, corrected with
     ``spell.correct``, and both the escaped input (``word``) and the
     correction (``correct``) are passed to the ``main.html`` template,
     whose rendering is written to the response.
     """
     escaped_text = cgi.escape(self.request.get('text'))
     context = {
         'word': escaped_text,
         'correct': spell.correct(escaped_text),
     }
     template = JINJA_ENVIRONMENT.get_template('main.html')
     rendered = template.render(context)
     self.response.write(rendered)
 def spellCorrector(self, text):
     """Spell-correct ``text`` token by token.

     The text is split on runs of characters other than word characters
     and ``, ' . - ? !``; every resulting token is passed through
     ``spell.correct`` and the results are joined with single spaces.

     Returns:
         The corrected tokens joined by spaces.
     """
     tokens = re.split(r"[^\w\,\'\.\-\?\!]+", text)
     corrected = [spell.correct(token) for token in tokens]
     return ' '.join(corrected)
示例#12
0
def stem(word):
	"""Return ``word`` after spell correction.

	Despite the name, no stemming is performed; the word is only passed
	through ``correct``.
	"""
	return correct(word)