# Example #1
# 0
def train(tests):
	#files = os.listdir(c_dir)
	#print files
	#for f in files:
	#c_dir = "/home/such/Documents/NLP/Programming_Assignment_resources/"
	fpath = 'big.txt'
	doc = open(fpath,'r')
	words = re.findall('[a-z]+' , doc.read().lower())
	
	#define n-gram model
	#corp = brown.words()
	good_turing = lambda fdist,bins:GoodTuringProbDist(fdist,56707)
	model = NgramModel(2,words,good_turing)
	
	
	for s in tests:
		# Finding the possible misspelled words
		misspelled = []
		sentences = s.split('.')
		for sentence in sentences:
			word = sentence.split(' ')
			for w in word:
				if w.lower() not in words and w.lower() != '':
					misspelled.append(w)
	
	
		#finding the candidate words for words in the misspelled array
		candidates = {}
		for wrong in misspelled:
			pos = s.index(wrong)
			candidates[wrong] = (list(edit_distances.correct(wrong)))
		#print misspelled
		#print candidates
	
		
		# Find the n-gram probabilities for each correction
		corrections = {} # the dictionar which keeps the MLE for each correction for all misspelled
		for k in candidates.keys():
			MLEs = []
			for cand in candidates[k]:
				estimates = find_context.find_contexts(cand, s, model)
				MLEs.append((cand,estimates))
				#print MLEs
			corrections[k] = MLEs
	
		#Suggest the corrections
		m = lambda x: max(x,key = lambda y:y[1])
		for k in corrections.keys():		
			final_list = [c for c in corrections[k] if c[1] >0 and c[1] <=1]
			print "misspelled :" + k +"\n"
			print "correction :"  + m(final_list)[0] + str(m(final_list)[1])+ "\n\n"
def train(tests):

	#files = os.listdir(c_dir)
	#print files
	#for f in files:
	c_dir = "/home/such/Documents/NLP/Programming_Assignment_resources/"
	fpath = c_dir+'big.txt'
        #stemmer  = WordNetStemmer()
        #stem = lambda x:stemmer.stem(x)
	
	stops = stopwords.words('english')
	doc = open(fpath,'r')
	words = re.findall('[a-z]+' , doc.read().lower())	
	#words2 = [ w for w in words if w not in stops]
	#define n-gram model from the file stored using pickel
	
	#corp = pickle.load(open('corpfile'))
	corp = map(lambda x :x.lower(), brown.words()) 
	ispresent = lambda x : wn.words(x) != []
	#corp = filter(ispresent, corp)
	#corp_words = re.findall('[a-z]+' , corp)
	#corp = [ w for w in corp if w not in stops]
	corp_dict = offset_dict(corp)
	for s in tests:
		print s
		test_words = re.findall('[a-z]+' , s.lower())
		# Finding the possible misspelled words
		misspelled = []
		sentences = s.split('.')
		#mispos = {}
		#sentences = [s for s in sentences if s not in stops]
		for t_word in test_words :
                    if t_word.lower() not in words and t_word.lower() != '':
					misspelled.append(t_word.lower())
					#mispos[w.lower()] = s.index(w)
		#print mispos
			
		#finding the candidate words for words in the misspelled array
		candidates = {}
		for wrong in misspelled:
			#pos = s.index(wrong)
			candidates[wrong] = (list(edit_distances.correct(wrong)))
				
		#find the context words for the test sentences and the corpus
		corrections = {}
                for miss in misspelled:
			print test_words
			#find the context words for the mispelled words
			error_dict = offset_dict(test_words)
			error_context = list(set(concord(error_dict,test_words,miss)))
			error_context = [e for e in error_context if e not in stops]
			errcont = []
			for errc in error_context:
				errcont += list(set(concord(corp_dict,corp,errc)))
			errcont = filter(ispresent,errcont)
			errcont = [e for e in errcont if e not in stops]
                        errcont += error_context
			#
			#print "error context"
			#print error_context

			#print errcont
			#for each context word find how often they co-occur with each of the corrections
			counts = {}
			can_list = candidates[miss]
			#print can_list
			for c in can_list:
				cand_cooccur = list(set(concord(corp_dict,corp,c)))   #change the corpus here
				#cand_cooccur = filter(lambda x: edit_dist(c,miss) < 2, cand_cooccur)
				cand_cooccur = filter(ispresent,cand_cooccur)
				cand_cooccur = [ca for ca in cand_cooccur if ca not in stops]
				#print "printing candidate context for" + c +".....................\n\n\n\n"
				#print "candidate contexts for "+c
				#print cand_cooccur
				count = sum([cand_cooccur.count(i) for i in errcont])
				counts[c] = count,sim(errcont,c)
		
                        print counts
			corrections[miss] = max(counts,key = lambda a:counts.get(a))
			p = test_words.index(miss)
                        test_words[p] = max(counts,key = lambda a:counts.get(a))
                       
	
		#Suggest the corrections
		
			
			print "misspelled :" + miss +"\n"
			try:
				print "correction :"  + corrections[miss] + "\n\n"
			except ValueError:
				pass