def main(): already_made = False while True: made = raw_input("Have you already initialized a prior probability?(Y/N) ") if made == "Y": already_made = True break elif made == "N": already_made = False break if not already_made: while True: catCount = raw_input("How many categories will you be using? ") try: count = int(catCount) break except: print "not a valid integer" while True: inpDepth = raw_input("How deep do you want to traverse (increases exponentially)? ") try: depth = int(inpDepth) break except: print "not a valid integer" while True: bagOfWords = raw_input("Use link approach or bag of words?(enter 1/0 respectively): ") try: BOWval = int(bagOfWords) useLinks = not not BOWval break except: print "not a valid integer" allCategoriesLinks = [] print 'Please give the input in the form \"/wiki/Category:Example_category\".' print 'If a category is not valid input or not recognized, it will be dropped by the classifier.' inc = 0 while inc < count: catLink = raw_input("enter your category: ") if categories.isCategory(catLink): allCategoriesLinks.append(catLink) inc += 1 else: print "invalid category format" print 'Creating prior probilities for naive bayes clssification...' allCatsLinks,occurMatrix,totals,keyDict,useLinks = classify.createClassifier(allCategoriesLinks,useLinks,depth) serialize(allCatsLinks,'allCatsLinks.p') serialize(occurMatrix,'occurMatrix.p') serialize(totals,'totals.p') serialize(keyDict,'keyDict.p') serialize(useLinks,'useLinks.p') print 'Prior probabilities are now stored in serialized files.' while True: checkPageLink = raw_input("In similar format, give URL suffix of page you would like to classify: ") if page.isPage(checkPageLink): break else: print "There was an error connecting to the given page." checkPage = page.Page(checkPageLink) print 'Getting distribution for page over categories...' allCatsLinks = unpack('allCatsLinks.p') occurMatrix = unpack('occurMatrix.p') totals = unpack('totals.p') keyDict = unpack('keyDict.p') useLinks = unpack('useLinks.p') distribution = classify.naiveBayes(checkPage,allCatsLinks,occurMatrix,totals,keyDict,useLinks) for result in distribution: print checkPageLink,'is a subpage of', result[0], 'with probability', round(result[1]*100,5),'%' return None