Python YelpPredictor примеры использования

Язык программирования: Python

Класс/Тип: YelpPredictor

Примеров на hotexamples.com: 3

Python YelpPredictor - 3 примера найдено. Это лучшие примеры Python кода для YelpPredictor, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

getAverageBusinessStars(2)

getAverageUserStars(2)

yelpColdStartSplitCV(1)

Пример #1

Показать файл

Файл: YelpTester.py Проект: johncallery/YelpRatingPredictor

def splitColdStart(reviews, minRatings, bizDict=None, userDict=None):
    """Partition reviews into the four cold-start scenarios.

    A user (or business) is treated as "known" when it has an entry in the
    matching lookup table AND element [1] of that entry is >= minRatings.
    Element [1] is presumably the number of ratings behind the average --
    confirm against YelpPredictor.

    Scenario index used throughout:
        0 = both user and business known
        1 = only user known
        2 = only business known
        3 = neither known

    Args:
        reviews: sized iterable of entries; entry[0] is the user id and
            entry[1] is the business id.  Entries are stored untouched.
        minRatings: threshold compared against element [1] of a table entry.
        bizDict: optional business lookup table.  When None (the default)
            it is fetched from YelpPredictor.getAverageBusinessStars().
        userDict: optional user lookup table.  When None (the default)
            it is fetched from YelpPredictor.getAverageUserStars().

    Returns:
        [userBizSplit, userOnlySplit, bizOnlySplit, coldSplit], each a list
        of the input entries in their original relative order.
    """
    userBizSplit = []
    userOnlySplit = []
    bizOnlySplit = []
    coldSplit = []
    outputSplitter = [userBizSplit, userOnlySplit, bizOnlySplit, coldSplit]

    # "is None" rather than truthiness: an explicitly passed empty table
    # must be honoured, not replaced by the YelpPredictor lookup.
    if bizDict is None:
        bizDict = YelpPredictor.getAverageBusinessStars()
    if userDict is None:
        userDict = YelpPredictor.getAverageUserStars()

    for entry in reviews:
        user_id = entry[0]
        business_id = entry[1]

        # A missing id and an id below the threshold are handled the same way.
        userKnown = user_id in userDict and userDict[user_id][1] >= minRatings
        bizKnown = business_id in bizDict and bizDict[business_id][1] >= minRatings

        if userKnown:
            scenario = 0 if bizKnown else 1
        else:
            scenario = 2 if bizKnown else 3
        outputSplitter[scenario].append(entry)
    # End FOR each entry in the review set

    coldStartCalc = [len(bucket) for bucket in outputSplitter]
    total = sum(coldStartCalc)
    labels = ('both user and biz: ', 'only user: ', 'only biz: ', 'neither: ')
    for label, count in zip(labels, coldStartCalc):
        # Guard the ratio so an empty review list reports 0.0 instead of
        # raising ZeroDivisionError.
        share = float(count) / float(total) if total else 0.0
        print(label + str(count) + ', ' + str(share))

    print(str(len(reviews)) + ' input reviews')
    print(str(len(userBizSplit)) + ' userBizSplit')
    print(str(len(userOnlySplit)) + ' userOnlySplit')
    print(str(len(bizOnlySplit)) + ' bizOnlySplit')
    print(str(len(coldSplit)) + ' coldSplit')

    return outputSplitter

Пример #2

Показать файл

Файл: YelpTester.py Проект: johncallery/YelpRatingPredictor

def yelpCVTester(minRatings):
    """Evaluate the cold-start predictor on random CV groups and print RMSE stats.

    Steps, in order:
      1. Load the known ratings and derive the user/business tuples and the
         actual-rating map via the sibling helpers.
      2. Build the four cold-start scenario lists with simulateKaggleMix.
      3. Draw cvNum groups; each group holds one randomly drawn,
         equal-sized sample per scenario (drawn without replacement).
      4. Run YelpPredictor.yelpColdStartSplitCV on every group and score
         each scenario, plus all four merged, with getRMSE.
      5. Print best / worst / mean / standard deviation of the scores.

    Args:
        minRatings: threshold forwarded to simulateKaggleMix and to
            YelpPredictor.yelpColdStartSplitCV.

    Returns:
        None; all results are printed.
    """
    print('start yelp tester')

    cvNum = 10
    scenarioIdx = range(4)

    # Map of known user -> business -> review/stars ratings.
    userBizRatings = getUserBusinessRatings()

    # User/business pairs that have ratings, so the sets can be randomized.
    userBiz = getUserBizReviewTuple(userBizRatings)

    # Ratings used as ground truth by getRMSE.
    actualRatings = getReviewStarMap(userBizRatings)

    # Balanced cold-start scenarios (four lists).
    coldStartScenarios = simulateKaggleMix(userBiz, minRatings)

    # MODIFY HERE FOR TESTING A DIFFERENT COLD START COMBO.
    # Number of entries each CV group takes from each scenario.
    cvGroupSize = [math.trunc(len(coldStartScenarios[i]) / cvNum) for i in scenarioIdx]

    # cvSet: cvNum groups -> one list per scenario -> many entries.
    cvSet = []
    for _ in range(0, cvNum):
        groupScenarios = []
        for idx, wanted in enumerate(cvGroupSize):
            pool = coldStartScenarios[idx]
            # Pop a random entry each time; the upper bound is recomputed
            # from the shrinking pool on every draw.
            drawn = [pool.pop(random.randint(0, len(pool) - 1)) for _ in range(0, wanted)]
            groupScenarios.append(drawn)
        cvSet.append(groupScenarios)
    # End FOR creating each CV group

    # rmse is collected per group but not read again in this function.
    rmse = []
    combinedRMSE = []
    splitRMSE = [[], [], [], []]

    for testSet in cvSet:
        # Alternative predictors that can be swapped in here:
        #   YelpPredictor.yelpTedWeights(testSet)
        #   YelpPredictor.yelpUserCategoryAvg(testSet, 1)
        #   YelpPredictor.yelpBizAvg(testSet, 1)
        #   YelpPredictor.yelpUserAvg(testSet, 1)
        #   YelpPredictor.yelpRandom(testSet)
        predictions = YelpPredictor.yelpColdStartSplitCV(testSet, minRatings)

        # One RMSE per cold-start scenario, in scenario order.
        groupRMSE = [getRMSE(predictions[i], actualRatings) for i in scenarioIdx]
        for i in scenarioIdx:
            splitRMSE[i].append(groupRMSE[i])

        # Merge the four per-scenario prediction maps for an overall RMSE.
        merged = predictions[0].copy()
        for i in (1, 2, 3):
            merged.update(predictions[i])
        groupCombinedRMSE = getRMSE(merged, actualRatings)

        print(str(groupRMSE))
        print(str(groupCombinedRMSE))

        rmse.append(groupRMSE)
        combinedRMSE.append(groupCombinedRMSE)
    # End FOR testing each CV group

    # Best, worst, mean and standard deviation per scenario.
    bestScenario = [min(scores, key=float) for scores in splitRMSE]
    worstScenario = [max(scores, key=float) for scores in splitRMSE]
    meanScenario = [sum(scores) / float(len(scores)) for scores in splitRMSE]
    sdScenario = [numpy.std(scores) for scores in splitRMSE]

    bestCombined = min(combinedRMSE, key=float)
    worstCombined = max(combinedRMSE, key=float)

    print('num of CV tests: ' + str(cvNum))
    print('size of each CV set: ' + str(cvGroupSize))
    print('best scenarios: ' + str(bestScenario))
    print('worst scenarios: ' + str(worstScenario))
    print('mean scenarios: ' + str(meanScenario))
    print('sd scenarios: ' + str(sdScenario))
    print('best combined: ' + str(bestCombined))
    print('worst combined: ' + str(worstCombined))
    print('mean: ' + str(sum(combinedRMSE) / float(len(combinedRMSE))))
    print('sd: ' + str(numpy.std(combinedRMSE)))

    print('end yelp tester')

Пример #3

Показать файл

Файл: YelpTester.py Проект: johncallery/YelpRatingPredictor

def simulateKaggleMix(reviews, minRatings, bizDict=None, userDict=None):
    """Split reviews into four cold-start scenarios sized to a fixed target mix.

    Each entry is first classified exactly as in a plain cold-start split:
    a user (or business) is "known" when it is present in the matching
    lookup table AND element [1] of its entry is >= minRatings (element [1]
    is presumably the rating count -- confirm against YelpPredictor).

    Scenario index used throughout:
        0 = both user and business known   (target 33% of the input)
        1 = only user known                (target 11%)
        2 = only business known            (target 41%)
        3 = neither known                  (target 15%)

    An entry whose natural scenario is already at its target goes to an
    overflow list.  Afterwards scenarios 1, 2 and 3 (in that order) are
    topped up from the END of the overflow list until they reach their
    target or the overflow runs out.  Scenario 0 is never topped up, and
    overflow entries left over at the end are not returned.

    Args:
        reviews: sized iterable of entries; entry[0] is the user id and
            entry[1] is the business id.  Entries are stored untouched.
        minRatings: threshold compared against element [1] of a table entry.
        bizDict: optional business lookup table.  When None (the default)
            it is fetched from YelpPredictor.getAverageBusinessStars().
        userDict: optional user lookup table.  When None (the default)
            it is fetched from YelpPredictor.getAverageUserStars().

    Returns:
        [userBizSplit, userOnlySplit, bizOnlySplit, coldSplit].
    """
    userBizSplit = []
    userOnlySplit = []
    bizOnlySplit = []
    coldSplit = []
    outputSplitter = [userBizSplit, userOnlySplit, bizOnlySplit, coldSplit]

    # Target share per scenario, paired with the label printed in the report.
    targetMix = ((0.33, '33%'), (0.11, '11%'), (0.41, '41%'), (0.15, '15%'))
    totalReviews = len(reviews)
    targetCalc = [math.trunc(fraction * totalReviews) for fraction, _ in targetMix]

    # Entries that exceeded their natural scenario's target.
    rebalance = []

    # "is None" rather than truthiness: an explicitly passed empty table
    # must be honoured, not replaced by the YelpPredictor lookup.
    if bizDict is None:
        bizDict = YelpPredictor.getAverageBusinessStars()
    if userDict is None:
        userDict = YelpPredictor.getAverageUserStars()

    for entry in reviews:
        user_id = entry[0]
        business_id = entry[1]

        # A missing id and an id below the threshold are handled the same way.
        userKnown = user_id in userDict and userDict[user_id][1] >= minRatings
        bizKnown = business_id in bizDict and bizDict[business_id][1] >= minRatings

        if userKnown:
            scenario = 0 if bizKnown else 1
        else:
            scenario = 2 if bizKnown else 3

        bucket = outputSplitter[scenario]
        if len(bucket) < targetCalc[scenario]:
            bucket.append(entry)
        else:
            rebalance.append(entry)
    # End FOR each entry in the review set

    # Top up the scenarios that were not naturally filled.  Scenario 0 is
    # deliberately left out, matching the original behaviour.
    for scenario in (1, 2, 3):
        bucket = outputSplitter[scenario]
        while len(bucket) < targetCalc[scenario] and rebalance:
            bucket.append(rebalance.pop())

    coldStartCalc = [len(bucket) for bucket in outputSplitter]
    total = sum(coldStartCalc)
    labels = ('both user and biz: ', 'only user: ', 'only biz: ', 'neither: ')
    for idx, label in enumerate(labels):
        # Guard the ratio so an empty review list reports 0.0 instead of
        # raising ZeroDivisionError.
        share = float(coldStartCalc[idx]) / float(total) if total else 0.0
        print(label + str(coldStartCalc[idx]) + ', ' + str(share)
              + '; Target: ' + str(targetCalc[idx]) + ', ' + targetMix[idx][1])

    print(str(len(reviews)) + ' input reviews')
    print(str(len(userBizSplit)) + ' userBizSplit')
    print(str(len(userOnlySplit)) + ' userOnlySplit')
    print(str(len(bizOnlySplit)) + ' bizOnlySplit')
    print(str(len(coldSplit)) + ' coldSplit')

    return outputSplitter