Exemplo n.º 1
0
def checkPage(link):
	checkPage = page.Page(link)

	print 'Getting distribution for page over categories...'
	allCatsLinks = unpack('allCatsLinks.p')
	occurMatrix = unpack('occurMatrix.p')
	totals = unpack('totals.p')
	keyDict = unpack('keyDict.p')
	useLinks = unpack('useLinks.p')
	distribution = classify.naiveBayes(checkPage,allCatsLinks,occurMatrix,totals,keyDict,useLinks)
	for result in distribution:
		print link,'is a subpage of', result[0], 'with probability', round(result[1]*100,5),'%'

	return True
Exemplo n.º 2
0
def main():
	already_made = False
	while True:
		made = raw_input("Have you already initialized a prior probability?(Y/N) ")
		if made == "Y":
			already_made = True
			break
		elif made == "N":
			already_made = False
			break

	if not already_made:
		while True:
			catCount = raw_input("How many categories will you be using? ")
			try:
				count = int(catCount)
				break
			except:
				print "not a valid integer"

		while True:
			inpDepth = raw_input("How deep do you want to traverse (increases exponentially)? ")
			try:
				depth = int(inpDepth)
				break
			except:
				print "not a valid integer"

		while True:
			bagOfWords = raw_input("Use link approach or bag of words?(enter 1/0 respectively): ")
			try:
				BOWval = int(bagOfWords)
				useLinks = not not BOWval
				break
			except:
				print "not a valid integer"

		allCategoriesLinks = []
		print 'Please give the input in the form \"/wiki/Category:Example_category\".'
		print 'If a category is not valid input or not recognized, it will be dropped by the classifier.'
		inc = 0
		while inc < count:
			catLink = raw_input("enter your category: ")
			if categories.isCategory(catLink):
				allCategoriesLinks.append(catLink)
				inc += 1
			else:
				print "invalid category format"

		print 'Creating prior probilities for naive bayes clssification...'
		allCatsLinks,occurMatrix,totals,keyDict,useLinks = classify.createClassifier(allCategoriesLinks,useLinks,depth)

		serialize(allCatsLinks,'allCatsLinks.p')
		serialize(occurMatrix,'occurMatrix.p')
		serialize(totals,'totals.p')
		serialize(keyDict,'keyDict.p')
		serialize(useLinks,'useLinks.p')
		print 'Prior probabilities are now stored in serialized files.'

	while True:
		checkPageLink = raw_input("In similar format, give URL suffix of page you would like to classify: ")
		if page.isPage(checkPageLink):
			break
		else:
			print "There was an error connecting to the given page."

	checkPage = page.Page(checkPageLink)

	print 'Getting distribution for page over categories...'
	allCatsLinks = unpack('allCatsLinks.p')
	occurMatrix = unpack('occurMatrix.p')
	totals = unpack('totals.p')
	keyDict = unpack('keyDict.p')
	useLinks = unpack('useLinks.p')
	distribution = classify.naiveBayes(checkPage,allCatsLinks,occurMatrix,totals,keyDict,useLinks)
	for result in distribution:
		print checkPageLink,'is a subpage of', result[0], 'with probability', round(result[1]*100,5),'%'

	return None