def generateTrainCategory(items,priceCutOff):
    """Label every row 1 or 0 against a price threshold.

    A row is labelled 1 when ``float(row[END_PRICE])`` is at least
    ``priceCutOff`` and ``didItemSell(row)`` is truthy; every other row
    is labelled 0.

    Args:
        items: iterable of rows that can be indexed with END_PRICE.
        priceCutOff: threshold the row's end price is compared against.

    Returns:
        A list of 0/1 labels, one per row, in input order.
    """
    def reached_cutoff_and_sold(row):
        # The price test runs first, so didItemSell is only consulted for
        # rows whose end price already meets the cutoff.
        return float(row[END_PRICE]) >= priceCutOff and didItemSell(row)

    return [1 if reached_cutoff_and_sold(row) else 0 for row in items]
def getFinalPrices(items,bins,default=0):
    """Collect the final price of each row, raw and binned.

    For a row where ``didItemSell(row)`` is truthy the price is
    ``float(row[END_PRICE])``; for every other row it is ``default``.

    Args:
        items: iterable of rows that can be indexed with END_PRICE.
        bins: bin description forwarded unchanged to binnedFinalPrice.
        default: price recorded for rows that did not sell.

    Returns:
        A ``(prices, binned_prices)`` tuple, where ``binned_prices`` is
        whatever ``binnedFinalPrice(bins, prices)`` returns.
    """
    raw_prices = [
        float(row[END_PRICE]) if didItemSell(row) else default
        for row in items
    ]
    return raw_prices, binnedFinalPrice(bins, raw_prices)
def generateItemTitleList(items,orderedWordList):
    """Encode each item's title as vocabulary indices, led by its sold flag.

    For every row the result holds one list whose first element is
    ``didItemSell(row)``. It is followed, in title order, by the position
    in ``orderedWordList`` of each title word that is longer than two
    characters and whose lowercase form appears in ``orderedWordList``.
    Titles are split on single spaces; repeated words are kept.

    Args:
        items: iterable of rows; ``row[TITLE]`` must be a string.
        orderedWordList: sequence of vocabulary words. Entries must be
            hashable (presumably lowercase strings -- confirm against
            generateOrderedWordList).

    Returns:
        A list with one ``[sold_flag, index, index, ...]`` list per row.
    """
    # Build the word -> position lookup once. The previous implementation
    # scanned orderedWordList twice (count() then index()) for every title
    # word. setdefault keeps the earliest position, matching list.index().
    first_index_of = {}
    for position, known_word in enumerate(orderedWordList):
        first_index_of.setdefault(known_word, position)

    item_title_words = []
    for row in items:
        words = [didItemSell(row)]
        for word in row[TITLE].split(" "):
            # Words of one or two characters are ignored.
            if len(word) <= 2:
                continue
            index = first_index_of.get(word.lower())
            if index is not None:
                words.append(index)
        item_title_words.append(words)
    return item_title_words
def crossValidate(binSize, num):
    """Run one train/test pass and score the binned price predictions.

    Steps, as performed by this function:
      1. Load items with ``getItems(complete=True, sold=True)``, filter
         them, and split them 70/30 via ``splitItemSet``; the training
         part is then reduced with ``selectNumItems(trainItems, num)``.
      2. Build price bins with ``generateBinArray(binSize, 100)`` and the
         actual (raw and binned) prices of the test items.
      3. For each bin value used as a price cutoff, train with
         ``trainOnData`` and classify the test matrix with
         ``makePredictions``. The first cutoff at which an item is
         predicted as class 0 fixes that item's predicted price.
      4. Items never predicted as class 0 receive the last bin value.
      5. Write "predicted, actual" lines to ``predictedactualprices.csv``
         and return ``classificationError(predicted, actual_binned)``.

    Args:
        binSize: width passed to generateBinArray; also subtracted from
            each cutoff to get the lower edge used for the prediction.
        num: forwarded to selectNumItems to limit the training items.

    Returns:
        Whatever ``classificationError`` returns for the predicted prices
        versus the binned actual prices.
    """
    maxPrice = 100
    dataSplit = 0.70

    # No genre argument is passed here; per the original note, omitting it
    # searches all genres (behaviour of getItems not visible in this file).
    allItems = filterItems(getItems(complete=True, sold=True))
    trainItems, testItems = splitItemSet(allItems, dataSplit)
    trainItems = selectNumItems(trainItems, num)

    bins = generateBinArray(binSize, maxPrice)
    # Unsold test items count as price 0 (getFinalPrices' default).
    actualFinalPrice, actualFinalPriceBinned = getFinalPrices(testItems, bins)

    # The vocabulary and the test matrix do not depend on the cutoff, so
    # they are built once. The category returned with the matrix is unused.
    orderedWordList = generateOrderedWordList(allItems)
    testMatrix, _ = generateMatrixData(
        orderedWordList, generateItemTitleList(testItems, orderedWordList))

    # -1 marks "no prediction yet"; real predictions are clamped to >= 0.
    predictedFinalPrice = [-1] * len(testItems)
    for priceCutOff in bins:
        # The model has to be retrained for every cutoff.
        phi_k_unsold, phi_k_sold = trainOnData(
            trainItems, orderedWordList, priceCutOff)
        testingSetPredictions = makePredictions(
            testMatrix, phi_k_sold, phi_k_unsold)

        lower_edge = priceCutOff - binSize
        for i, row in enumerate(testItems):
            # Only the first class-0 prediction for an item is used.
            if testingSetPredictions[i] != 0 or predictedFinalPrice[i] != -1:
                continue
            # NOTE(review): the prediction below reads the test item's own
            # END_PRICE -- confirm this is intended for the evaluation.
            end_price = float(row[END_PRICE])
            if lower_edge > end_price:
                predictedFinalPrice[i] = lower_edge
            elif lower_edge <= end_price:
                predictedFinalPrice[i] = getBinOf(bins, end_price)
            if predictedFinalPrice[i] < 0:
                predictedFinalPrice[i] = 0

    # Items that were never predicted as class 0 get the top bin value.
    for i, predicted in enumerate(predictedFinalPrice):
        if predicted == -1:
            predictedFinalPrice[i] = bins[-1]

    # The context manager closes the file even if a write fails.
    with open('predictedactualprices.csv', 'w') as output:
        for predicted, actual in zip(predictedFinalPrice, actualFinalPrice):
            output.write(str(predicted) + ", " + str(actual) + "\n")

    return classificationError(predictedFinalPrice, actualFinalPriceBinned)