def parseReviewToTrainingSet(reviewRDD, testReviewRDD):
    """Build the normalized training RDD and the matching, filtered test RDD.

    Relies on module-level names defined elsewhere in this file: ``sc``
    (the SparkContext), ``restaurantListBC``, ``Review``, ``assignNum`` and
    ``parseUserBusiLst``.

    Args:
        reviewRDD: RDD of raw training reviews; each element is passed
            through ``Review.toString``.
        testReviewRDD: RDD of raw test reviews, converted the same way.

    Returns:
        A tuple ``(userReviewRestaNormRDD, testReviewRestRDD)``:
        the parallelized output of ``parseUserBusiLst`` for the training
        data, and the test records restricted to users and restaurants that
        occur in the training data, passed through ``Review.normalizeStar``
        and ``Review.replaceIDwithNum``.
    """
    # Keep only the reviews accepted by Review.is_res for the broadcast
    # restaurant list.
    reviewRestaRDD = reviewRDD.map(Review.toString).filter(
        lambda line: Review.is_res(line, restaurantListBC))

    userList = reviewRestaRDD.map(Review.getuserid).sortByKey().collect()
    restList = reviewRestaRDD.map(Review.getbusiid).sortByKey().collect()

    # Broadcast the IDs as sets: the test filter below does one membership
    # check per record, which is a linear scan on a list.
    # NOTE(review): assumes the IDs are hashable (they appear to be used as
    # dict keys by assignNum / replaceIDwithNum) -- confirm.
    userSetBC = sc.broadcast(frozenset(userList))
    restSetBC = sc.broadcast(frozenset(restList))

    # Debug output; guarded so a training set with fewer than 11 entries
    # does not abort the whole job with an IndexError.
    if len(userList) > 10:
        print(userList[10])

    # Lookup tables for users in the training set. Only the first value
    # returned by assignNum is needed here.
    userIdToNumDict, _ = assignNum(userList)
    userIdToNumDictBC = sc.broadcast(userIdToNumDict)

    # Lookup tables for restaurants in the training set.
    restIdToNumDict, _ = assignNum(restList)
    restIdToNumDictBC = sc.broadcast(restIdToNumDict)

    userReviewRestaRDD = (reviewRestaRDD
                          .map(Review.mapper)
                          .reduceByKey(Review.reducer)
                          .map(Review.reshape))
    # Original author's note: "Subtract average values".
    userReviewRestaCollNormRDD = userReviewRestaRDD.map(Review.normalize)
    userReviewRestaNormLst = userReviewRestaCollNormRDD.collect()

    # Build the lookup from the rows already collected instead of running a
    # second Spark job over the same lineage.
    # Presumably maps user ID -> that user's average rating (x[0] -> x[1]).
    userAvgDict = dict((row[0], row[1]) for row in userReviewRestaNormLst)

    userReviewRestaLst = parseUserBusiLst(
        userReviewRestaNormLst, userIdToNumDict, restIdToNumDict)
    userReviewRestaNormRDD = sc.parallelize(userReviewRestaLst)
    print(userReviewRestaNormRDD.take(10))

    # Test set: drop records whose user or restaurant is unknown to the
    # training data, then apply the same normalization and ID replacement.
    testReviewRestRDD = (
        testReviewRDD
        .map(Review.toString)
        .map(Review.getUsrResStar)
        .filter(lambda x: x[0] in userSetBC.value and x[1] in restSetBC.value)
        .map(lambda x: Review.normalizeStar(userAvgDict, x))
        .map(lambda x: Review.replaceIDwithNum(
            x, userIdToNumDictBC, restIdToNumDictBC))
    )
    print(testReviewRestRDD.take(10))

    return userReviewRestaNormRDD, testReviewRestRDD
# NOTE(review): `ratings`, `numRatings`, `numUsers`, `sc`, `Review` and
# `assignNum` are defined outside this excerpt. From the indexing below,
# `ratings` looks like an RDD of (key, (user, restaurant, rating)) pairs --
# confirm against the code that builds it.
numRestaurants = ratings.values().map(lambda r: r[1]).distinct().count()
print("Got %d ratings from %d users on %d restaurants." % (numRatings, numUsers, numRestaurants))

# Distinct user / restaurant IDs and lookup tables built from them.
userList = ratings.values().map(lambda r: r[0]).distinct().collect()
restList = ratings.values().map(lambda r: r[1]).distinct().collect()
getUserIndex, getUserID = assignNum(userList)
getRestIndex, getRestID = assignNum(restList)
getUserIDBC = sc.broadcast(getUserID)
getUserIndexBC = sc.broadcast(getUserIndex)
getRestIndexBC = sc.broadcast(getRestIndex)
getRestIDBC = sc.broadcast(getRestID)

# Index-based lambdas replace `lambda (x, y): ...`: tuple-parameter unpacking
# was removed in Python 3 (PEP 3113); this form runs on Python 2 and 3.
ratings = ratings.map(
    lambda kv: (kv[0], Review.replaceIDwithNum(kv[1], getUserIndexBC, getRestIndexBC)))

# Mean of the third field per first field (presumably rating per user).
# Entries whose grouped value has fewer than 3 elements are dropped before
# averaging.
usrRatingAvg = (
    ratings.values()
    .map(lambda x: (x[0], x[2]))
    .reduceByKey(Review.reducer)
    .map(Review.reshape)
    .filter(lambda x: len(x[1]) >= 3)
    .map(Review.reshapeList)
    .map(lambda x: (x[0], sum(x[1]) / float(len(x[1]))))
)
usrRatingAvgBC = sc.broadcast(dict(usrRatingAvg.collect()))

# Keep only records whose user has an average, then pass each record through
# Review.subtractAvg.
ratings = ratings.filter(lambda x: x[1][0] in usrRatingAvgBC.value).map(
    lambda kv: (kv[0], Review.subtractAvg(kv[1], usrRatingAvgBC)))

numOfPartitions = 4
# Records whose key is <= 6 form the training split.
trainingVal = ratings.filter(lambda x: x[0] <= 6).values()
trainingMean = trainingVal.map(lambda x: x[2]).mean()
training = trainingVal.repartition(numOfPartitions).cache()