示例#1
0
def parseReviewToTrainingSet(reviewRDD, testReviewRDD):
    """Turn raw review RDDs into a training RDD and a matching test RDD.

    Relies on names defined outside this function: the Spark context ``sc``,
    the broadcast ``restaurantListBC``, the ``Review`` helpers, ``assignNum``
    and ``parseUserBusiLst``.

    Args:
        reviewRDD: RDD of training review records; each record is passed
            through ``Review.toString``.
        testReviewRDD: RDD of test review records; each record is passed
            through ``Review.toString`` and ``Review.getUsrResStar``.

    Returns:
        A ``(userReviewRestaNormRDD, testReviewRestRDD)`` tuple. The first is
        the output of ``parseUserBusiLst`` re-parallelized as an RDD. The
        second holds the test records whose first two fields both occur in
        the training data, after ``Review.normalizeStar`` and
        ``Review.replaceIDwithNum`` have been applied.
    """
    # Keep only the reviews accepted by Review.is_res.
    reviewRestaRDD = reviewRDD.map(Review.toString).filter(
        lambda line: Review.is_res(line, restaurantListBC))
    userList = reviewRestaRDD.map(Review.getuserid).sortByKey().collect()
    restList = reviewRestaRDD.map(Review.getbusiid).sortByKey().collect()

    # Broadcast the ids as sets: the test filter below only needs membership,
    # and a set lookup avoids scanning the whole list for every test record.
    # NOTE(review): assumes the collected ids are hashable -- confirm against
    # Review.getuserid / Review.getbusiid.
    userSetBC = sc.broadcast(frozenset(userList))
    restSetBC = sc.broadcast(frozenset(restList))

    # Debug output; guarded so a small training set does not raise IndexError.
    if len(userList) > 10:
        print(userList[10])

    # Id -> number lookup tables for users and restaurants. The reverse
    # (number -> id) tables returned by assignNum are not needed here.
    userIdToNumDict, _ = assignNum(userList)
    restIdToNumDict, _ = assignNum(restList)
    userIdToNumDictBC = sc.broadcast(userIdToNumDict)
    restIdToNumDictBC = sc.broadcast(restIdToNumDict)

    userReviewRestaRDD = (reviewRestaRDD.map(Review.mapper)
                          .reduceByKey(Review.reducer)
                          .map(Review.reshape))
    # Original comment on Review.normalize: "Subtract average values".
    userReviewRestaNormLst = userReviewRestaRDD.map(Review.normalize).collect()

    # Build the lookup from the rows already collected instead of running the
    # same Spark lineage a second time. Maps row[0] to row[1]; presumably
    # user key -> user average, going by the variable name -- TODO confirm.
    userAvgDict = dict((row[0], row[1]) for row in userReviewRestaNormLst)
    userAvgDictBC = sc.broadcast(userAvgDict)

    userReviewRestaLst = parseUserBusiLst(
        userReviewRestaNormLst, userIdToNumDict, restIdToNumDict)
    userReviewRestaNormRDD = sc.parallelize(userReviewRestaLst)
    print(userReviewRestaNormRDD.take(10))

    # Drop test records whose user or restaurant never appears in training,
    # then normalize them and replace ids with their assigned numbers.
    testReviewRestRDD = (
        testReviewRDD.map(Review.toString)
        .map(Review.getUsrResStar)
        .filter(lambda x: x[0] in userSetBC.value and x[1] in restSetBC.value)
        .map(lambda x: Review.normalizeStar(userAvgDictBC.value, x))
        .map(lambda x: Review.replaceIDwithNum(x, userIdToNumDictBC, restIdToNumDictBC))
    )
    print(testReviewRestRDD.take(10))

    return userReviewRestaNormRDD, testReviewRestRDD
示例#2
0
    numRestaurants = ratings.values().map(lambda r: r[1]).distinct().count()

    print("Got %d ratings from %d users on %d restaurants." % (numRatings, numUsers, numRestaurants))

    userList = ratings.values().map(lambda r: r[0]).distinct().collect()
    restList = ratings.values().map(lambda r: r[1]).distinct().collect()

    getUserIndex, getUserID = assignNum(userList)
    getRestIndex, getRestID = assignNum(restList)

    getUserIDBC = sc.broadcast(getUserID)
    getUserIndexBC = sc.broadcast(getUserIndex)
    getRestIndexBC = sc.broadcast(getRestIndex)
    getRestIDBC = sc.broadcast(getRestID)

    ratings = ratings.map(lambda (x, y): (x, Review.replaceIDwithNum(y, getUserIndexBC, getRestIndexBC)))
    usrRatingAvg = ratings.values().map(lambda x: (x[0], x[2])).reduceByKey(Review.reducer).map(Review.reshape)\
                    .filter(lambda x: len(x[1]) >= 3).map(Review.reshapeList)\
                    .map(lambda x: (x[0], sum(x[1])/float(len(x[1]))))
    usrRatingAvgBC = sc.broadcast(dict(usrRatingAvg.collect()))
    ratings = ratings.filter(lambda x: x[1][0] in usrRatingAvgBC.value).map(lambda (x, y): (x, Review.subtractAvg(y, usrRatingAvgBC)))

    numOfPartitions = 4

    trainingVal = ratings.filter(lambda x: x[0] <= 6) \
        .values()

    trainingMean = trainingVal.map(lambda x: x[2]).mean()

    training = trainingVal.repartition(numOfPartitions).cache()