def bayesianCheck(line):
    """Return ranked suggestions for ``line``.

    The line is lower-cased, stripped of newlines and of every character
    other than ASCII letters, digits and spaces, then split on single
    spaces.  Each token missing from ``dicti`` is passed to
    ``text.correctWord``; the token is replaced by its highest-scoring
    candidate, and the candidate mapping of the token whose best candidate
    scored highest overall is kept.

    One sentence variant is built per kept candidate scoring above 0.2 (a
    single unmodified variant when no candidate mapping was kept) and the
    variants are handed to ``context`` and ``collocations``.

    Returns:
        A list of ``(key, score)`` pairs sorted by descending score.  If
        any token was missing from ``dicti`` the pairs are the kept
        spell-correction candidates; otherwise they are the summed
        ``context`` and ``collocations`` scores divided by their grand
        total (left unnormalised when that total is 0).
    """
    line = line.lower().replace('\n', '')
    line = re.sub('[^0-9a-zA-Z ]+', '', line)
    words_ = line.split(' ')

    used_spell = False
    target_idx = 0      # position of the token whose candidates are kept
    best_prob = 0
    poss_list = {}
    for i, word in enumerate(words_):
        if word in dicti:
            continue
        used_spell = True
        poss = text.correctWord(word)

        # Highest-scoring candidate; the first one wins on ties.
        top = ''
        top_prob = 0
        for cand in poss:
            if poss[cand] > top_prob:
                top_prob = poss[cand]
                top = cand
        if len(top) == 0:
            # No usable correction: leave the token untouched.
            continue

        words_[i] = top
        if best_prob < top_prob:
            target_idx = i
            best_prob = top_prob
            poss_list = poss

    if not poss_list:
        set_of_words = [list(words_)]
    else:
        # NOTE: when no candidate exceeds 0.2 this list stays empty.
        set_of_words = []
        for cand in poss_list:
            if poss_list[cand] > 0.2:
                variant = list(words_)
                # Substitute at the corrected token's own position.  The
                # previous code always wrote to index 0 because the
                # tracked index was stored under a name never read back.
                variant[target_idx] = cand
                set_of_words.append(variant)

    # Both helpers are always called, as before, in case they have side
    # effects.  Their results are used as mappings of key -> score.
    # NOTE(review): a `context` that takes a string and returns nothing
    # would not fit this call -- confirm which `context` is in scope.
    con = context(set_of_words)
    col = collocations(set_of_words)

    if used_spell:
        return sorted(poss_list.items(), key=operator.itemgetter(1),
                      reverse=True)

    total = {}
    final = 0
    for scores in (con, col):
        for key in scores:
            total[key] = total.get(key, 0) + scores[key]
            final += scores[key]

    if final:
        # float() keeps the division exact under Python 2 even when the
        # helper scores are integers.
        norm = float(final)
        for key in total:
            total[key] = total[key] / norm

    return sorted(total.items(), key=operator.itemgetter(1), reverse=True)
def context(line):
    """Print context-corrected variants of ``line``; returns ``None``.

    The line is lower-cased, stripped of newlines and of every character
    other than ASCII letters, digits and spaces, then split on single
    spaces.  Tokens missing from ``dicti`` are replaced by the
    highest-scoring candidate from ``text.correctWord``; the candidate
    mapping of the least confident replacement (best score below 1) is
    kept, and one sentence variant is built per kept candidate scoring
    above 0.2 (a single unmodified variant when nothing was kept).

    In every variant, each word that is a key of ``cWords`` is rescored
    against the words within three positions of it: every alternative
    listed in ``cWords`` starts from ``priorConf`` and is multiplied by a
    factor derived from ``contextWords`` and ``confCounts`` for each window
    word found in ``contextWords``.  The best alternative with a score
    above 0 replaces the word.  Each variant is then written to standard
    output on its own line, every word followed by two spaces (one after
    the last word).
    """
    import sys  # local import: the file's import block is not visible here

    line = line.lower().replace('\n', '')
    line = re.sub('[^0-9a-zA-Z ]+', '', line)
    words_ = line.split(' ')

    minprob = 1
    minidx = 0
    poss_list = {}
    for i, word in enumerate(words_):
        if word in dicti:
            # Known words count as probability 1 and are never tracked.
            continue
        poss = text.correctWord(word)

        best = None
        best_val = 0
        for cand in poss:
            if poss[cand] > best_val:
                best_val = poss[cand]
                best = cand
        if best is None:
            # No positive-scoring candidate.  Keep the token instead of
            # blanking it and failing on a missing-key lookup.
            continue

        words_[i] = best
        if minprob > best_val:
            minidx = i
            minprob = best_val
            poss_list = poss

    if not poss_list:
        set_of_words = [list(words_)]
    else:
        # NOTE: when no candidate exceeds 0.2 nothing is printed.
        set_of_words = []
        for cand in poss_list:
            if poss_list[cand] > 0.2:
                variant = list(words_)
                variant[minidx] = cand
                set_of_words.append(variant)

    for words in set_of_words:
        words[:] = [w.lower() for w in words]

        # Words are rewritten in place, so a later window sees the
        # replacements already made to its left.
        for i in range(len(words)):
            if words[i] not in cWords:
                continue

            # Up to three words either side, the word itself included.
            window = set(words[max(0, i - 3):i + 4])

            prob = {}
            for alt in cWords[words[i]]:
                seen = contextWords[alt]
                score = priorConf[alt]
                for tok in window:
                    if tok in seen:
                        score *= (float(seen[tok]) + 1) / (
                            1.0 * (confCounts[alt] + len(seen)))
                prob[alt] = score

            choice = None
            top = 0
            for alt in prob:
                if prob[alt] > top:
                    top = prob[alt]
                    choice = alt
            # Keep the current word when no alternative scores above 0,
            # rather than overwriting it with an empty string.
            if choice is not None:
                words[i] = choice

        # Same bytes as the former Python 2 `print w+' ',` loop followed
        # by a bare `print`, but valid on Python 2 and 3.
        sys.stdout.write(' '.join(w + ' ' for w in words) + '\n')
words_ = line.split(' ') set_of_words = [] minprob = 1 minidx = 0 poss_list = {} end = time.time() print "1 " + str(end - start) start = time.time() for i in range(0, len(words_)): temp = '' maxval = 0 poss = {} start_cor = time.time() if words_[i] not in dicti: print words_[i] poss = text.correctWord(words_[i]) else: poss[words_[i]] = 1 continue end_cor = time.time() print "cor " + str(end_cor - start_cor) for p in poss: if poss[p] > maxval: maxval = poss[p] temp = p words_[i] = temp if minprob > poss[temp]: minidx = i minprob = poss[temp] poss_list = poss kter = 0