def train(tests, corpus_path='big.txt'):
    """Print n-gram based spelling suggestions for every string in *tests*.

    The corpus file is lower-cased and split into ``[a-z]+`` tokens. Those
    tokens serve both as the vocabulary (a token missing from it is treated
    as misspelled) and as training text for an ``NgramModel`` of order 2
    using a Good-Turing estimator.

    For each misspelled token the candidates returned by
    ``edit_distances.correct`` are scored with
    ``find_context.find_contexts(candidate, sentence, model)``; the
    highest-scoring candidate whose score lies in ``(0, 1]`` is printed.

    NOTE(review): a later ``def train`` in this file rebinds the same name,
    so this version is only reachable if that one is removed or renamed.

    Args:
        tests: iterable of strings to check.
        corpus_path: path of the corpus text file. Defaults to ``'big.txt'``
            in the current working directory, as before.

    Returns:
        None. Results are written to standard output.
    """
    # Read the corpus and release the file handle immediately.
    with open(corpus_path, 'r') as doc:
        words = re.findall('[a-z]+', doc.read().lower())

    # The model needs the ordered token list; membership tests use a set so
    # each lookup is O(1) instead of a scan over the whole corpus.
    vocabulary = set(words)

    def good_turing(fdist, bins):
        # The bin count supplied by the caller is deliberately ignored in
        # favour of the fixed value the original code used.
        return GoodTuringProbDist(fdist, 56707)

    model = NgramModel(2, words, good_turing)

    for s in tests:
        # Collect tokens that are not in the corpus vocabulary.
        # NOTE(review): tokens are split on '.' and ' ' only, so any other
        # attached punctuation (e.g. "word,") makes a token count as
        # misspelled, because the vocabulary holds [a-z]+ tokens only.
        misspelled = []
        for sentence in s.split('.'):
            for w in sentence.split(' '):
                lowered = w.lower()
                if lowered != '' and lowered not in vocabulary:
                    misspelled.append(w)

        # Candidate corrections for every misspelled token.
        candidates = {}
        for wrong in misspelled:
            candidates[wrong] = list(edit_distances.correct(wrong))

        # Score every candidate in the context of the full test string.
        corrections = {}
        for wrong in candidates:
            scored = []
            for cand in candidates[wrong]:
                estimate = find_context.find_contexts(cand, s, model)
                scored.append((cand, estimate))
            corrections[wrong] = scored

        # Suggest the best correction per misspelled token.
        for wrong in corrections:
            # Only scores inside (0, 1] are accepted as probabilities.
            final_list = [c for c in corrections[wrong] if 0 < c[1] <= 1]
            print("misspelled :" + wrong + "\n")
            if not final_list:
                # max() on an empty list would raise ValueError and abort the
                # remaining words and sentences; report and carry on instead.
                print("correction : (no candidate found)\n\n")
                continue
            best = max(final_list, key=lambda pair: pair[1])
            print("correction :" + best[0] + str(best[1]) + "\n\n")
def train(tests,
          corpus_path="/home/such/Documents/NLP/"
                      "Programming_Assignment_resources/big.txt"):
    """Print context-based spelling suggestions for every string in *tests*.

    The corpus file supplies the vocabulary: any ``[a-z]+`` token of a test
    string that is missing from it is treated as misspelled. Candidates
    for a misspelled token come from ``edit_distances.correct``. Each
    candidate is ranked by the pair ``(count, sim(context, candidate))``:

    * ``count`` is how many entries of the misspelled token's expanded
      context also appear among the candidate's Brown-corpus context words
      (both obtained via ``concord``, with stopwords and words unknown to
      ``wn.words`` removed);
    * the ``sim`` value only matters as a tie-breaker, because tuples
      compare element by element.

    The winning candidate replaces the misspelled token in the working
    token list, so later tokens of the same string see corrected context.

    Args:
        tests: iterable of strings to check.
        corpus_path: path of the corpus text file. The default is the
            location that was previously hard-coded.

    Returns:
        None. Progress and results are written to standard output.
    """
    # Sets give O(1) membership tests for the many lookups below.
    stops = set(stopwords.words('english'))

    # Read the corpus and release the file handle immediately.
    with open(corpus_path, 'r') as doc:
        vocabulary = set(re.findall('[a-z]+', doc.read().lower()))

    # A real list (not a lazy map object) because it is handed to
    # offset_dict/concord repeatedly.
    corp = [w.lower() for w in brown.words()]
    corp_dict = offset_dict(corp)

    def ispresent(word):
        # True when wn.words() yields a non-empty result for the word.
        return wn.words(word) != []

    def context_words(index, tokens, target):
        # Distinct concordance words of `target`, minus unknown words and
        # stopwords.
        return [w for w in set(concord(index, tokens, target))
                if ispresent(w) and w not in stops]

    for s in tests:
        print(s)
        # Tokens are already lower-case and non-empty thanks to the regex.
        test_words = re.findall('[a-z]+', s.lower())

        misspelled = [t for t in test_words if t not in vocabulary]

        # Candidate corrections for every misspelled token.
        candidates = {}
        for wrong in misspelled:
            candidates[wrong] = list(edit_distances.correct(wrong))

        for miss in misspelled:
            print(test_words)

            # Context of the misspelled token inside the test string. The
            # index is rebuilt each time because test_words is edited below.
            error_dict = offset_dict(test_words)
            error_context = [
                e for e in set(concord(error_dict, test_words, miss))
                if e not in stops
            ]

            # Expand that context with the corpus context of each of its
            # words, then append the direct context itself.
            errcont = []
            for errc in error_context:
                errcont += context_words(corp_dict, corp, errc)
            errcont += error_context

            # Rank each candidate by context overlap, then by similarity.
            counts = {}
            for c in candidates[miss]:
                cand_cooccur = set(context_words(corp_dict, corp, c))
                # Candidate context words are distinct, so a membership test
                # counts exactly what list.count() did, without the
                # quadratic scan.
                count = sum(1 for i in errcont if i in cand_cooccur)
                counts[c] = (count, sim(errcont, c))
            print(counts)

            # max() raises ValueError on an empty dict; the original guarded
            # the wrong statement, so a word with no candidates crashed the
            # run. Skip the correction for such words instead.
            best = max(counts, key=counts.get) if counts else None
            if best is not None:
                # Feed the correction back so later words get clean context.
                test_words[test_words.index(miss)] = best

            # Suggest the correction.
            print("misspelled :" + miss + "\n")
            if best is not None:
                print("correction :" + best + "\n\n")