def TokeniseAll(): all = {} for post in corpus: try: h = unicode(post.htmlBlock, "utf-8", errors='ignore') t = post.title.encode("utf-8", 'ignore') words = stripTags(h.encode("utf-8")) + ", " + t countTokens(words, all) except: print "F**K! encoding problem!!!" return all
def toke(catName, post, catTokens):
    """If `post` belongs to category `catName`, tally its tokens into
    `catTokens` (mutated in place) and return True; otherwise return False.

    Encoding failures degrade to an empty word string so a single bad
    post never aborts training (best-effort, as before).
    """
    try:
        h = unicode(post.htmlBlock, "utf-8", errors='ignore')
        t = post.title.encode("utf-8", 'ignore')
        words = stripTags(h.encode("utf-8")) + ", " + t
    except (UnicodeError, AttributeError, TypeError):
        # Unparseable body/title: count nothing for this post.
        words = ""
    if post.HasCategory(catName):
        countTokens(words, catTokens)
        return True
    return False
def GetCategoryProbabilities(post, tempCats):
    """Score `post` against each category with a naive-Bayes-style ratio.

    Args:
        post: object with `htmlBlock` (HTML string) and `title` attributes.
        tempCats: dict of category name -> token-count dict; if empty,
            refreshed via getTempCats().

    Returns:
        dict mapping category name -> (P(cat) * P(words|cat)) / P(words),
        where the probabilities are products of the 15 rarest token counts.
    """
    results = {}
    # Load the global token counts; close the handle even if unpickling fails
    # (the original leaked this file object).
    f = open(cwd + "/bayesCats/allTokens.txt", "r")
    try:
        allTokens = cPickle.load(f)
    finally:
        f.close()

    if len(tempCats) == 0:
        tempCats = getTempCats()

    # The tokenised word list does not depend on the category, so build it
    # once instead of once per category (hoisted loop-invariant work).
    words = stripTags(post.htmlBlock) + ", " + post.title.encode("ascii", "ignore")
    wordsList = [w.strip().lower() for w in words.split(" ")]

    for category, catTokens in tempCats.items():
        instancesCat = {}
        instancesAll = {}
        for token in wordsList:
            if not token.isalnum():
                # strip punctuation so "word," and "word" share a count
                token = reToken.sub("", token)
            # Unseen tokens default to 1 (a crude smoothing floor).
            instancesCat[token] = catTokens.get(token, 1)
            instancesAll[token] = allTokens.get(token, 1)

        valuesCat = sorted(instancesCat.values())
        valuesAll = sorted(instancesAll.values())

        pcat = 1.0   # uniform category prior
        pwcat = 1.0
        pw = 1.0
        # Multiply the 15 smallest counts: rare tokens are the most
        # discriminative evidence.
        for v in valuesCat[:15]:
            pwcat *= v
        for v in valuesAll[:15]:
            pw *= v

        if pw == 0:
            results[category] = 0
        else:
            results[category] = (pcat * pwcat) / pw
    return results