Example #1
import numpy as np

# similarity(), calc_item_adjust() and rechelp.squish_preds() are
# project-local helpers; assumed sketches of them appear after the examples
def calc_user_user_cf2(training_data, num_partitions=20):
    """
    A very simple user-user CF algorithm in PySpark. This method is more
    stable than calc_user_user_cf.

    Method derived from the Coursera course Recommender Systems, taught by
    Prof. Joseph Konstan (University of Minnesota) and Prof. Michael Ekstrand
    (Texas State University).

    Args:
        training_data: the data used to train the RecSys algorithm, an RDD of [ (userId, itemId, actualRating) ]

    Returns:
        predicted: predicted ratings, an RDD of [ (userId, itemId, predictedRating) ]

    """

    # Group each user's ratings together: (userId, iterable of (user, item, rating))
    user_groups = training_data.groupBy(lambda (user, item, rating): user)

    # Similarity between every pair of users; the third argument (1) tells
    # similarity() to match rows on the item field
    user_groups_sim = user_groups.cartesian(user_groups).map(lambda ((user1_id, user1_rows), (user2_id, user2_rows)):\
        (user1_id, user2_id, similarity(user1_rows, user2_rows, 1))).coalesce(num_partitions)

    # Each user's mean rating
    user_averages = training_data.map(lambda (user, item, rating): (user, rating)).groupByKey().\
        map(lambda (user, ratings): (user, np.mean(list(ratings))))

    # Residuals: each rating minus the rater's mean
    user_resids = training_data.map(lambda (user, item, rating): (user, (item, rating))).join(user_averages)\
        .map(lambda (user, ((item, rating), avg_rating)): (user, (item, rating - avg_rating)))

    # Weight each residual by the similarity between the two users, key by the
    # (neighbour, item) pair, then aggregate per pair
    item_adjustments = user_resids.join(user_groups_sim.map(lambda (u1, u2, sim): (u1, (u2, sim))))\
        .map(lambda (u1, ((item, resid), (u2, sim))): ((u2, item), (resid * sim, sim))).\
        groupByKey().map(lambda ((user, item), sim_list): (user, item, calc_item_adjust(sim_list)))

    # Final prediction: the user's mean rating plus the weighted adjustment
    predictions = item_adjustments.map(lambda (user, item, item_adj): (user, (item, item_adj))).join(user_averages)\
        .map(lambda (user, ((item, item_adj), avg_rate)): (user, item, avg_rate + item_adj))

    # Predicted values can fall anywhere, and the content-based algorithms in
    # this project are normalized, so the predictions are normalized here too
    max_rating = training_data.map(lambda (user, item, rating): rating).max()
    min_rating = training_data.map(lambda (user, item, rating): rating).min()

    # Guard against a degenerate range when every training rating is identical
    if max_rating == min_rating:
        min_rating = 0

    norm_predictions = predictions.map(lambda (user, item, pred): (
        user, item, rechelp.squish_preds(pred, min_rating, max_rating)))

    return norm_predictions
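
The snippet relies on project-local helpers that are not shown: similarity(), calc_item_adjust(), and rechelp.squish_preds(). As a rough guide, here is a minimal sketch of the first two, assuming similarity() is a cosine similarity over the co-rated entries and calc_item_adjust() is a similarity-weighted mean of the residuals. Both bodies are assumptions, not the project's originals.

import numpy as np

def similarity(rows1, rows2, key_idx):
    # Assumed implementation: cosine similarity over the entries two users
    # (or items) have in common. key_idx picks the matching field of each
    # (user, item, rating) row: 1 = item for user-user CF, 0 = user for
    # item-item CF; the rating always sits at index 2.
    d1 = dict((r[key_idx], r[2]) for r in rows1)
    d2 = dict((r[key_idx], r[2]) for r in rows2)
    common = set(d1) & set(d2)
    if not common:
        return 0.0
    v1 = np.array([d1[k] for k in common])
    v2 = np.array([d2[k] for k in common])
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / denom) if denom else 0.0

def calc_item_adjust(sim_list):
    # Assumed implementation: similarity-weighted average of the residuals.
    # Each element of sim_list is (resid * sim, sim), as built above.
    weighted = sum(ws for ws, s in sim_list)
    total = sum(abs(s) for ws, s in sim_list)
    return weighted / total if total else 0.0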
Example #2
# similarity() is the helper sketched under Example #1; get_item_prob() is
# another project-local helper that is not shown
def calc_item_item_cf(training_data, num_partitions):
    """
    A very simple item-item CF algorithm in PySpark.

    Method derived from the Coursera course Recommender Systems, taught by
    Prof. Joseph Konstan (University of Minnesota) and Prof. Michael Ekstrand
    (Texas State University).

    Args:
        training_data: the data used to train the RecSys algorithm, an RDD of [ (userId, itemId, actualRating) ]

    Returns:
        predicted: predicted ratings, an RDD of [ (userId, itemId, predictedRating) ]

    """

    # Group each item's ratings together, then compute pairwise item
    # similarity; the third argument (0) tells similarity() to match rows on
    # the user field
    item_groups = training_data.groupBy(lambda (user, item, rating): item)
    item_similarity = item_groups.cartesian(item_groups).map(lambda ((item1_id, item1_rows), (item2_id, item2_rows)):\
                       (item1_id, item2_id, similarity(item1_rows, item2_rows, 0))).coalesce(num_partitions)

    # Join each rating with the similarities of its item, key by the
    # (user, similar item) pair, and drop the trivial self-pairing
    user_item_sim = training_data.keyBy(lambda (user, item, rating): item)\
        .join(item_similarity.keyBy(lambda (item1, item2, sim): item1))\
        .map(lambda (item_id, ((user, item, rating), (item1, item2, sim))): ((user, item2), (item, rating, sim)))\
        .filter(lambda ((user, item2), (item, rating, sim)): item2 != item)

    # Aggregate the weighted evidence for each (user, item) pair into a prediction
    predictions = user_item_sim.groupByKey()\
        .map(lambda ((user, item), rows): (user, item, get_item_prob(rows)))

    # Predicted values can fall anywhere, and the content-based algorithms in
    # this project are normalized, so the predictions are normalized here too
    max_rating = training_data.map(lambda (user, item, rating): rating).max()
    min_rating = training_data.map(lambda (user, item, rating): rating).min()

    # Guard against a degenerate range when every training rating is identical
    if max_rating == min_rating:
        min_rating = 0

    norm_predictions = predictions.map(lambda (user, item, pred): (
        user, item, rechelp.squish_preds(pred, min_rating, max_rating)))

    return norm_predictions
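
A minimal usage sketch, assuming a live SparkContext named sc; the toy ratings and the partition count below are made up for illustration:

# Toy ratings: (userId, itemId, actualRating) on an assumed 1-5 scale
ratings = sc.parallelize([
    (1, 10, 4.0), (1, 11, 2.0),
    (2, 10, 5.0), (2, 12, 3.0),
    (3, 11, 1.0), (3, 12, 4.0),
])

preds = calc_item_item_cf(ratings, num_partitions=4)
for user, item, pred in preds.collect():
    print(user, item, pred)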
Example #3
from pyspark.mllib.recommendation import ALS

def calc_cf_mllib(y_training_data, num_partitions=20):
    """
    Uses the ALS collaborative filtering algorithm in MLlib to compute the predicted ratings

    Args:
        y_training_data: the data used to train the RecSys algorithm, an RDD of [ (userId, itemId, actualRating) ]

    Returns:
        predicted: predicted ratings, an RDD of [ (userId, itemId, predictedRating) ]

    """

    # Predicted values can fall anywhere, and the content-based algorithms in
    # this project are normalized, so the predictions are normalized here too
    max_rating = y_training_data.map(lambda (user, item, rating): rating).max()
    min_rating = y_training_data.map(lambda (user, item, rating): rating).min()

    # Guard against a degenerate range when every training rating is identical
    if max_rating == min_rating:
        min_rating = 0

    # MLlib exposes two training methods, train() and trainImplicit();
    # implicit-feedback data falls between 0 and 1, so use trainImplicit() there
    if min_rating == 0 and max_rating == 1:
        model = ALS.trainImplicit(y_training_data, rank=10, iterations=5)
    else:
        model = ALS.train(y_training_data, rank=10, iterations=5)

    # Predict every (user, item) combination seen in training
    item_ids = y_training_data.map(lambda (u, i, r): i).distinct()
    user_ids = y_training_data.map(lambda (u, i, r): u).distinct()
    user_item_combo = user_ids.cartesian(item_ids).coalesce(num_partitions)

    predicted = model.predictAll(user_item_combo.map(lambda x: (x[0], x[1])))

    norm_predictions = predicted.map(lambda (user, item, pred): (
        user, item, rechelp.squish_preds(pred, min_rating, max_rating)))

    return norm_predictions
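
Each of the three functions ends by passing predictions through rechelp.squish_preds(), a project-local helper that is not shown. One plausible minimal reading, assuming it simply clamps each raw prediction into the observed rating range (the real helper may rescale instead):

def squish_preds(pred, min_rating, max_rating):
    # Assumed behavior of rechelp.squish_preds: clamp the prediction into
    # [min_rating, max_rating]
    return max(min_rating, min(max_rating, pred))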