def naiveBayesMC(): binSize = 10; maxPrice = 100; dataSplit = 0.70; print "Loading items... " allItems = getItems(db,filteritems=True,complete=True,sold=True,genre=ROCK); [trainItems,testItems] = splitItemSet(allItems,dataSplit); bins = generateBinArray(binSize,maxPrice); [actualFinalPrice,actualFinalPriceBinned] = getFinalPrices(testItems,bins); ##Only have to make these onece orderedWordList = generateOrderedWordList(allItems,lengthcutoff=5,frequencycutoff=1); [testMatrix,testCategory] = generateMatrixData(orderedWordList,generateItemTitleList(testItems,orderedWordList)); predictedFinalPrices = [-1]*len(testItems); for priceCutOff in bins: print "Price cut off: ", priceCutOff [phi_k_unsold,phi_k_sold,p_y0,p_y1] = trainOnData(trainItems,orderedWordList,priceCutOff); [testingSetPredictions,prob_sell,prob_wontSell] = makePredictions(testMatrix,phi_k_sold,phi_k_unsold,p_y0,p_y1); #predicted category for testItems [0,1] predictedFinalPrices = updatePredictedFinalPrice(testItems,predictedFinalPrices,testingSetPredictions,priceCutOff,binSize,bins); for i in range(len(predictedFinalPrices)): if predictedFinalPrices[i] == -1: predictedFinalPrices[i] = bins[-1]; for i in range(len(predictedFinalPrices)): print i,testItems[i][0],testItems[i][START_PRICE],testItems[i][END_PRICE],getBinOf(bins,float(testItems[i][START_PRICE])), actualFinalPriceBinned[i],predictedFinalPrices[i],"\t\t",testItems[i][TITLE] print "Classification error on testing set is: ", classificationError(predictedFinalPrices,actualFinalPriceBinned);
def runClassificationErrorTest(): fail = False; p2 = [0,1,0]; a2 = [0,1,1]; if (m.classificationError([0],[0]) != 0): fail = True; if (m.classificationError([0],[1]) != 1): fail = True; if (m.classificationError([0],[-1]) != 1): fail = True; if (m.classificationError([0,0],[0,0]) != 0): fail = True; if (m.classificationError([0,0],[0,1]) != 0.5): fail = True; if (m.classificationError([0,0],[1,1]) != 1): fail = True; if (fail): print "Classification Error Function has failed!";
print "Items in testing set: ",len(testItems); orderedWordList = generateOrderedWordList(allItems,frequencycutoff = 4); print "Training on data set ..." [phi_k_unsold,phi_k_sold,p_y0,p_y1] = trainOnData(trainItems,orderedWordList,0) printTopWords(phi_k_sold,phi_k_unsold,orderedWordList) print "Preparing test data ... " itemTitleList = generateItemTitleList(testItems,orderedWordList); [testMatrix,testCategory] = generateMatrixData(orderedWordList,itemTitleList); print "Making predictions ... " [testingSetPredictions,prob_sell,prob_wontSell] = makePredictions(testMatrix,phi_k_sold,phi_k_unsold,p_y0,p_y1,uniformPrior=False); print "Classification error on testing set is: ", classificationError(testingSetPredictions,testCategory); def printTopWords(phi_k_sold,phi_k_unsold,orderedWordList): rating = []; for i in range(len(phi_k_sold)): rating.append(np.log(phi_k_sold[i] / phi_k_unsold[i]) ); ratingsSorted = [orderedWordList[i[0]] for i in sorted(enumerate(rating), key=lambda x:x[1])] reverseratingsSorted = [orderedWordList[i[0]] for i in sorted(enumerate(rating), key=lambda x:x[1])] reverseratingsSorted.reverse(); print print "Highest rated words: ", ratingsSorted[0:30] print "Lowest rated: ",reverseratingsSorted[0:30] ##