# Example #1
# 0
def splitColdStart(reviews,minRatings):
    """Partition reviews into the four cold-start scenarios.

    A user (or business) is treated as "known" only when it has an entry in
    the dictionary returned by YelpPredictor.getAverageUserStars() (or
    getAverageBusinessStars()) AND the value at index 1 of that entry is at
    least ``minRatings``.  Index 1 is presumably the number of ratings behind
    the stored average -- TODO confirm against YelpPredictor.

    Args:
        reviews: sized iterable of entries; entry[0] is read as the user id
            and entry[1] as the business id.  Entries are passed through
            unchanged.
        minRatings: minimum value required at index 1 of a user/business
            record for it to count as known.

    Returns:
        A list of four lists, in this order:
            [0] user and business both known
            [1] only the user known
            [2] only the business known
            [3] neither known (true cold start)

    Side effects:
        Prints the count and share of each scenario and the size of each
        split to stdout.
    """
    bizDict = YelpPredictor.getAverageBusinessStars()
    userDict = YelpPredictor.getAverageUserStars()

    def meetsThreshold(statsDict, key):
        # "Known" requires both presence and enough ratings.
        return key in statsDict and statsDict[key][1] >= minRatings

    scenarioLabels = ['both user and biz: ', 'only user: ', 'only biz: ', 'neither: ']
    splitNames = [' userBizSplit', ' userOnlySplit', ' bizOnlySplit', ' coldSplit']

    # outputSplitter[i] collects the entries that fall into scenario i.
    outputSplitter = [[], [], [], []]

    for entry in reviews:
        userKnown = meetsThreshold(userDict, entry[0])
        bizKnown = meetsThreshold(bizDict, entry[1])

        if userKnown and bizKnown:
            scenario = 0
        elif userKnown:
            scenario = 1
        elif bizKnown:
            scenario = 2
        else:
            scenario = 3

        outputSplitter[scenario].append(entry)

    coldStartCalc = [len(split) for split in outputSplitter]
    totalAssigned = sum(coldStartCalc)

    for label, count in zip(scenarioLabels, coldStartCalc):
        # Guard against empty input: report a share of 0.0 instead of
        # raising ZeroDivisionError.
        share = float(count) / float(totalAssigned) if totalAssigned else 0.0
        print(label+str(count)+', '+str(share))

    print(str(len(reviews))+' input reviews')
    for name, split in zip(splitNames, outputSplitter):
        print(str(len(split))+name)

    return outputSplitter
# Example #2
# 0
def yelpCVTester(minRatings):
    """Cross-validate the cold-start predictor and print RMSE statistics.

    Steps:
      1. Load the known ratings and derive the user/business/review entries
         and the actual-rating lookup from them.
      2. Split the entries into the four cold-start scenarios with
         simulateKaggleMix().
      3. Randomly deal each scenario into ``cvNum`` equal-sized,
         non-overlapping CV groups (entries left over after the integer
         division are not used).
      4. Run YelpPredictor.yelpColdStartSplitCV() on every CV group and
         compute the RMSE per scenario and for the four prediction sets
         combined.
      5. Print best / worst / mean / standard deviation of those RMSEs.

    Args:
        minRatings: threshold forwarded to simulateKaggleMix() and
            YelpPredictor.yelpColdStartSplitCV().

    Returns:
        None.  All results are printed to stdout.
    """
    print('start yelp tester')

    cvNum = 10
    # Scenario order: user+biz, user only, biz only, neither.
    numScenarios = 4

    # Get map of known user->business->review/stars ratings
    userBizRatings = getUserBusinessRatings()

    # Get set of user/business pairs that have ratings
    # so that we can randomize the sets for
    # RMSE testing of different methods
    userBiz = getUserBizReviewTuple(userBizRatings)

    # Get a mapping of review_id to rating
    # to use in RMSE calculation
    # format based on final submission requirements
    actualRatings = getReviewStarMap(userBizRatings)

    # get balanced cold start scenarios
    coldStartScenarios = simulateKaggleMix(userBiz,minRatings)

    ####################################
    # MODIFY FOR TESTING COLD START COMBO
    ####################################

    # Number of entries each CV group receives from each scenario.
    cvGroupSize = [len(coldStartScenarios[y]) // cvNum for y in range(numScenarios)]

    # Shuffle every scenario once, then cut consecutive slices.  This yields
    # the same kind of random, non-overlapping groups as repeatedly popping a
    # random index, without the quadratic cost of list.pop(i).
    for y in range(numScenarios):
        random.shuffle(coldStartScenarios[y])

    # cvSet holds cvNum groups; each group is a list of numScenarios lists,
    # each of those holding the entries drawn for that scenario.
    cvSet = []
    for x in range(cvNum):
        tmpCVSet = []
        for y in range(numScenarios):
            start = x * cvGroupSize[y]
            tmpCVSet.append(coldStartScenarios[y][start:start + cvGroupSize[y]])
        cvSet.append(tmpCVSet)

    # splitRMSE[y] collects the RMSE of scenario y across all CV groups;
    # combinedRMSE collects the RMSE over all four scenarios together.
    splitRMSE = [[] for _ in range(numScenarios)]
    combinedRMSE = []

    # loop through each CV in set and pass to the desired predictor method
    for testSet in cvSet:
        # Alternative predictors that can be swapped in here:
        #predictions = YelpPredictor.yelpTedWeights(testSet)
        #predictions = YelpPredictor.yelpUserCategoryAvg(testSet, 1)
        #predictions = YelpPredictor.yelpBizAvg(testSet, 1)
        #predictions = YelpPredictor.yelpUserAvg(testSet, 1)
        #predictions = YelpPredictor.yelpRandom(testSet)
        predictions = YelpPredictor.yelpColdStartSplitCV(testSet, minRatings)

        # separate RMSE for each cold start scenario
        tmpRMSE = [getRMSE(predictions[y], actualRatings) for y in range(numScenarios)]
        for y in range(numScenarios):
            splitRMSE[y].append(tmpRMSE[y])

        # Merge the per-scenario predictions (dict-like: they provide
        # copy()/update()) to score the whole CV group at once.
        tmpCombinedPredictions = predictions[0].copy()
        for y in range(1, numScenarios):
            tmpCombinedPredictions.update(predictions[y])
        tmpCombinedRMSE = getRMSE(tmpCombinedPredictions,actualRatings)

        print(str(tmpRMSE))
        print(str(tmpCombinedRMSE))

        combinedRMSE.append(tmpCombinedRMSE)
    # End FOR testing each cv set

    # Best, worst, mean and standard deviation of the RMSE per scenario
    bestScenario = [min(scores, key=float) for scores in splitRMSE]
    worstScenario = [max(scores, key=float) for scores in splitRMSE]
    meanScenario = [sum(scores)/float(len(scores)) for scores in splitRMSE]
    sdScenario = [numpy.std(scores) for scores in splitRMSE]

    bestCombined = min(combinedRMSE, key=float)
    worstCombined = max(combinedRMSE, key=float)

    print('num of CV tests: '+str(cvNum))
    print('size of each CV set: '+str(cvGroupSize))
    print('best scenarios: '+str(bestScenario))
    print('worst scenarios: '+str(worstScenario))
    print('mean scenarios: '+str(meanScenario))
    print('sd scenarios: '+str(sdScenario))
    print('best combined: '+str(bestCombined))
    print('worst combined: '+str(worstCombined))
    print('mean: '+str(sum(combinedRMSE)/float(len(combinedRMSE))))
    print('sd: '+str(numpy.std(combinedRMSE)))

    print('end yelp tester')
# Example #3
# 0
def simulateKaggleMix(reviews,minRatings):
    """Split reviews into cold-start scenarios sized like the Kaggle test set.

    Each entry is first classified by whether its user and its business are
    "known": present in the dictionary returned by
    YelpPredictor.getAverageUserStars() / getAverageBusinessStars() with a
    value at index 1 of at least ``minRatings``.  Index 1 is presumably the
    number of ratings behind the stored average -- TODO confirm against
    YelpPredictor.

    Every scenario has a target size (a fixed fraction of len(reviews),
    truncated).  An entry goes to its natural scenario while that scenario is
    below target; otherwise it is set aside.  Afterwards the set-aside
    entries are used, last-in first-out, to top up the "only user", "only
    business" and "neither" scenarios, in that order.  The "both" scenario is
    never topped up, and set-aside entries still left over are dropped.

    Args:
        reviews: sized iterable of entries; entry[0] is read as the user id
            and entry[1] as the business id.
        minRatings: minimum value required at index 1 of a user/business
            record for it to count as known.

    Returns:
        A list of four lists, in this order:
            [0] user and business both known
            [1] only the user known
            [2] only the business known
            [3] neither known (true cold start)

    Side effects:
        Prints the achieved count/share and the target of each scenario, and
        the size of each split, to stdout.
    """
    # (printed label, target fraction, printed target percentage)
    # Fractions are based on analysis of the final Kaggle test set.
    scenarioMix = [
        ('both user and biz: ', 0.33, '33%'),
        ('only user: ', 0.11, '11%'),
        ('only biz: ', 0.41, '41%'),
        ('neither: ', 0.15, '15%'),
    ]
    splitNames = [' userBizSplit', ' userOnlySplit', ' bizOnlySplit', ' coldSplit']

    totalReviews = len(reviews)
    targetCalc = [math.trunc(fraction*totalReviews) for _, fraction, _ in scenarioMix]

    # outputSplitter[i] collects the entries assigned to scenario i.
    outputSplitter = [[], [], [], []]
    # Entries whose natural scenario had already reached its target.
    rebalance = []

    bizDict = YelpPredictor.getAverageBusinessStars()
    userDict = YelpPredictor.getAverageUserStars()

    def meetsThreshold(statsDict, key):
        # "Known" requires both presence and enough ratings.
        return key in statsDict and statsDict[key][1] >= minRatings

    for entry in reviews:
        userKnown = meetsThreshold(userDict, entry[0])
        bizKnown = meetsThreshold(bizDict, entry[1])

        if userKnown and bizKnown:
            scenario = 0
        elif userKnown:
            scenario = 1
        elif bizKnown:
            scenario = 2
        else:
            scenario = 3

        if len(outputSplitter[scenario]) < targetCalc[scenario]:
            outputSplitter[scenario].append(entry)
        else:
            rebalance.append(entry)

    # Top up the scenarios that did not reach their target naturally:
    # userOnly first, then bizOnly, then true cold start.
    for scenario in (1, 2, 3):
        split = outputSplitter[scenario]
        while len(split) < targetCalc[scenario] and rebalance:
            split.append(rebalance.pop())

    coldStartCalc = [len(split) for split in outputSplitter]
    totalAssigned = sum(coldStartCalc)

    for (label, _, percent), count, target in zip(scenarioMix, coldStartCalc, targetCalc):
        # Guard against nothing being assigned (empty or very small input):
        # report a share of 0.0 instead of raising ZeroDivisionError.
        share = float(count) / float(totalAssigned) if totalAssigned else 0.0
        print(label+str(count)+', '+str(share)+'; Target: '+str(target)+', '+percent)

    print(str(len(reviews))+' input reviews')
    for name, split in zip(splitNames, outputSplitter):
        print(str(len(split))+name)

    return outputSplitter