예제 #1
0
파일: bayes.py 프로젝트: cdave1/blogbot
def TokeniseAll():
    """Run every post in the module-level ``corpus`` through ``countTokens``.

    For each post the HTML body is decoded as UTF-8 (undecodable bytes are
    dropped), re-encoded, passed through ``stripTags`` and joined to the
    UTF-8 encoded title with ", ".  That text is handed to ``countTokens``
    together with one dictionary shared by all posts.

    Returns:
        The dictionary that was passed to ``countTokens`` for every post.
        A post that fails anywhere in this pipeline is reported on stdout
        and skipped.
    """
    tokenCounts = {}  # previously named ``all``, which hid the builtin
    for post in corpus:
        try:
            body = unicode(post.htmlBlock, "utf-8", errors='ignore')
            title = post.title.encode("utf-8", 'ignore')
            words = stripTags(body.encode("utf-8")) + ", " + title
            countTokens(words, tokenCounts)
        except Exception:
            # Best-effort: one bad post must not abort the whole run.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate so the run can still be interrupted.
            print("F**K! encoding problem!!!")
    return tokenCounts
예제 #2
0
파일: bayes.py 프로젝트: cdave1/blogbot
def toke(catName, post, catTokens):
    """Feed a post's text to ``countTokens`` if the post is in ``catName``.

    Args:
        catName: category name passed to ``post.HasCategory``.
        post: object exposing ``htmlBlock``, ``title`` and ``HasCategory``.
        catTokens: passed through to ``countTokens`` as its second argument.

    Returns:
        True when the post has the category (``countTokens`` was called),
        False otherwise.
    """
    # Check membership first so posts outside the category are not decoded
    # and tag-stripped only to have the result thrown away.
    if not post.HasCategory(catName):
        return False

    try:
        body = unicode(post.htmlBlock, "utf-8", errors='ignore')
        title = post.title.encode("utf-8", 'ignore')
        words = stripTags(body.encode("utf-8")) + ", " + title
    except Exception:
        # Best-effort: an unreadable post is counted as empty text instead
        # of aborting.  KeyboardInterrupt/SystemExit are no longer swallowed.
        words = ""

    countTokens(words, catTokens)
    return True
예제 #3
0
def _productOfSmallest(counts, limit):
    """Return, as a float, the product of the ``limit`` smallest values."""
    product = 1.0
    for value in sorted(counts)[:limit]:
        product = value * product
    return product


def GetCategoryProbabilities(post, tempCats):
    """Score ``post`` against every category in ``tempCats``.

    The post text (``stripTags`` of the HTML body, plus the ASCII-encoded
    title) is split on single spaces; each piece is stripped, lower-cased
    and, when not purely alphanumeric, filtered through ``reToken``.  Every
    distinct token is looked up in a count table; a token missing from the
    table counts as 1.  A category's score is the product of the (up to) 15
    smallest counts in that category's table divided by the same product
    taken over the global table loaded from ``bayesCats/allTokens.txt``.

    Args:
        post: object exposing ``htmlBlock`` and ``title``.
        tempCats: mapping of category -> token-count table.  When empty,
            the result of ``getTempCats()`` is used instead.

    Returns:
        dict mapping each category to its score (0 if the global product
        is zero).  Empty when there are no categories at all.
    """
    limit = 15  # only the rarest tokens take part in each product
    pcat = 1.0  # the prior is fixed at 1.0 for every category

    results = {}
    # NOTE(review): unpickling runs arbitrary code if the file has been
    # tampered with; only load token files this program wrote itself.
    # The open mode is unchanged; the handle is now closed deterministically.
    with open(cwd + "/bayesCats/allTokens.txt", "r") as tokenFile:
        allTokens = cPickle.load(tokenFile)

    if not tempCats:
        tempCats = getTempCats()
    if not tempCats:
        # Nothing to score; skip tokenising just as the loop would have.
        return results

    # Everything below up to the loop depends only on the post, so it is
    # computed once instead of once per category.
    words = stripTags(post.htmlBlock) + ", " + post.title.encode("ascii", "ignore")

    tokens = set()  # counts are keyed by token, so duplicates never mattered
    for rawToken in words.split(" "):
        token = rawToken.strip().lower()
        if not token.isalnum():
            token = reToken.sub("", token)
        tokens.add(token)

    pw = _productOfSmallest([allTokens.get(token, 1) for token in tokens], limit)

    for category in tempCats:
        catTokens = tempCats[category]
        pwcat = _productOfSmallest(
            [catTokens.get(token, 1) for token in tokens], limit)

        if pw == 0:
            results[category] = 0
        else:
            results[category] = (pcat * pwcat) / pw

    return results