예제 #1
    def _evaluate(self, predicted_df):
        """Return NDCG@k of the predicted ranking against the starred-item ranking.

        For every user the top-k items ordered by ``prediction`` form the
        predicted ranking and the top-k items ordered by ``starred_at`` form
        the ground truth. Both lists are joined per user and handed to
        ``RankingMetrics``.

        :param predicted_df: DataFrame with ``user``, ``item``, ``prediction``
            and ``starred_at`` columns.
        :return: ``ndcgAt(k)`` as a float, or ``0.0`` when the joined RDD is empty.
        """
        k = self.get_k()
        top_k_filter = 'rank <= {0}'.format(k)

        # NOTE(review): both ``orderBy`` arguments were cut off in the source;
        # they are restored from the column each select ranks on (descending,
        # so rank 1 is the highest prediction / latest star) -- confirm.
        window_spec = Window.partitionBy('user').orderBy(F.col('prediction').desc())
        per_user_predicted_items_df = predicted_df \
            .select('user', 'item', 'prediction', F.rank().over(window_spec).alias('rank')) \
            .where(top_k_filter) \
            .groupBy('user') \
            .agg(expr('collect_list(item) as items'))

        window_spec = Window.partitionBy('user').orderBy(F.col('starred_at').desc())
        per_user_actual_items_df = predicted_df \
            .select('user', 'item', 'starred_at', F.rank().over(window_spec).alias('rank')) \
            .where(top_k_filter) \
            .groupBy('user') \
            .agg(expr('collect_list(item) as items'))

        # After the join the row is (user, predicted items, actual items);
        # RankingMetrics wants (predicted, actual) pairs.
        per_user_items_rdd = per_user_predicted_items_df \
            .join(F.broadcast(per_user_actual_items_df), 'user', 'inner') \
            .rdd \
            .map(lambda row: (row[1], row[2]))

        # RankingMetrics cannot score an empty RDD; report zero instead.
        if per_user_items_rdd.isEmpty():
            return 0.0

        return RankingMetrics(per_user_items_rdd).ndcgAt(k)
예제 #2
def Tester(spark, model, df_test, rank, regParam, alpha, K=500):
    """Print ranking metrics for ``model``'s top-``K`` recommendations.

    Recommendations are requested for the distinct ``user_id_numeric`` values
    of ``df_test``, joined to a per-user ``ActualRanking`` column and passed
    to ``RankingMetrics``. The hyper-parameters ``rank``, ``regParam`` and
    ``alpha`` are only echoed in the output. Nothing is returned.

    NOTE(review): three statements below end inside an open bracket, so the
    function does not parse as shown; their tails appear to be missing and
    must be restored before use.
    """
    #df_test = spark.read.parquet(formatted_test_address)
    targetUsers = df_test.select("user_id_numeric").distinct()
    userRecs = model.recommendForUserSubset(targetUsers, K)
    # NOTE(review): this select is cut off after its first column.
    userRecs = userRecs.select("user_id_numeric",

    # need to get ordered list of track_id based on counts groupby individual users.
    # reference:https://stackoverflow.com/questions/46580253/collect-list-by-preserving-order-based-on-another-variable
    # NOTE(review): ``df_val`` is neither a parameter nor a local of this
    # function -- presumably ``df_test`` was meant, or it is a module-level
    # name; confirm.
    w = Window.partitionBy("user_id_numeric").orderBy(df_val['count'].desc())
    # NOTE(review): the next two statements are cut off mid-call.
    labels = df_val.withColumn('ActualRanking',
    labels = labels.select(['user_id_numeric', 'ActualRanking'

    # Get the metrics
    # predictionsAndlabels should be an RDD of (predicted ranking, ground truth set) pairs.
    # reference: https://spark.apache.org/docs/2.2.0/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics
    # Left join: every recommended user is kept even without a row in ``labels``.
    predictionsAndlabels = userRecs.join(
        labels, [labels.user_id_numeric == userRecs.user_id_numeric],
        'left').select('track_id_numeric', 'ActualRanking')
    metricsRank = RankingMetrics(predictionsAndlabels.rdd)

    print("Params: Rank %f | regParam %f | alpha = %f" %
          (rank, regParam, alpha))
    # Precision cut-offs 15 and 500 are fixed; only nDCG uses ``K``.
    print("p(15)   %.8f" % metricsRank.precisionAt(15))
    print("p(500)   %.8f" % metricsRank.precisionAt(500))
    print("MAP  %.8f" % metricsRank.meanAveragePrecision)
    print("nDCG %.8f" % metricsRank.ndcgAt(K))
예제 #3
def main(spark, txt):
    """Score a saved ALS model's top-500 recommendations and write the metrics to ``txt``.

    Loads the model and parquet inputs from HDFS, joins the per-user
    recommendations to ``labels`` on ``user`` and feeds the resulting pairs
    to ``RankingMetrics``. A progress line is printed after each step.

    NOTE(review): ``args``, ``timer`` and ``start`` are not defined in this
    function -- presumably module-level names; confirm.
    """
    model = ALSModel.load('hdfs:/user/jm7955/' + args.model)
    # NOTE(review): this call is cut off after the ``%``; its argument is missing.
    distinct_users = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' %

    print('finished writing in %d seconds' % int(timer() - start))
    labels = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' % args.labels)
    print('finished writing in %d seconds' % int(timer() - start))

    # Flatten the recommendation structs to a plain list of item ids per user.
    predictions = model.recommendForUserSubset(distinct_users, 500)\
        .select('user', F.col('recommendations.item').alias('item'))
    print('finished writing in %d seconds' % int(timer() - start))
    # Positional access after the join: index 1 is the recommended item list;
    # index 2 is the first non-key column of ``labels`` -- presumably the
    # ground-truth list; confirm against the labels schema.
    predictionsAndLabels = predictions.join(
        labels, ["user"], "inner").rdd.map(lambda tup: (tup[1], tup[2]))
    print('finished writing in %d seconds' % int(timer() - start))

    metrics = RankingMetrics(predictionsAndLabels)
    print('finished writing in %d seconds' % int(timer() - start))

    # NOTE(review): no matching close() is visible for this handle.
    file = open(txt, 'w')

    # NOTE(review): the first write is cut off after the ``%``.
    file.write('metrics.meanAveragePrecision: %s\n' %
    file.write('metrics.precisionAt(500) %s\n' % metrics.precisionAt(500))
    file.write('metrics.ndcgAt(500) %s\n' % metrics.ndcgAt(500))
def main(spark, model_file, test_file):
    """Print MAP, precision@500 and NDCG@500 for a saved ALS model on a test set.

    Reads ``test_file`` as parquet, loads the model from ``model_file``,
    generates 500 recommendations for every distinct test user and scores
    them with ``RankingMetrics``. Nothing is returned.
    """
    # NOTE(review): the three lines below look like the body of a docstring
    # whose delimiters were lost; as written they are not valid statements.
    # They also describe ``data_file`` although the parameter is ``test_file``.
    spark : SparkSession object
    data_file : string, path to the parquet file to load
    model_file : string, path to store the serialized model file

    # Load the parquet file
    test = spark.read.parquet(test_file)
    test = test.sort('user', ascending=False)
    model = ALSModel.load(model_file)

    user_subset = test.select("user").distinct()
    user_subset = model.recommendForUserSubset(user_subset, 500)

    # NOTE(review): this select is cut off after its first column.
    user_subset = user_subset.select("user",
    user_subset = user_subset.sort('user', ascending=False)
    print("sort user")
    # Positional access after the join: index 1 is the second column of
    # ``user_subset`` and index 2 the first non-key column of ``test`` --
    # presumably predictions and ground truth; confirm against both schemas.
    predictionAndLabels = user_subset.join(
        test, ["user"], "inner").rdd.map(lambda tup: (tup[1], tup[2]))
    print("joined predictions and counts")

    metrics = RankingMetrics(predictionAndLabels)
    print("made metrics")
    MAP = metrics.meanAveragePrecision
    precision = metrics.precisionAt(500)
    ndcg = metrics.ndcgAt(500)

    print('MAP: %f' % MAP)
    print('Precision: %f' % precision)
    print('NDCG: %f' % ndcg)
예제 #5
def evaluateTopk(model,data,top_k=500):
    """Report NDCG, MAP and precision for ``model``'s top-``top_k`` recommendations.

    Returns the tuple ``(res1, res2, res3)``.

    NOTE(review): the statements that build the joined data and assign
    ``res1``/``res2``/``res3`` are not present in this snippet, so the
    function cannot run as shown.
    """
    # NOTE(review): the next two lines look like leftover docstring text
    # (they describe a ``validation`` RDD, while the parameter is ``data``).
    validation: RDD
        - user, product (book_id), rating
    print("Getting Predictions...")
    # One [user, [product ids]] entry per user, taken from the top_k recommendations.
    tmp1=model.recommendProductsForUsers(top_k).map(lambda r: [r[0],[k.product for k in r[1]]])

    print("Predictions and Labels...")
    # NOTE(review): ``k`` is not defined at this scope (it is only the
    # comprehension variable above); the assignment that created it is missing.
    final=k.rdd.map(lambda r: [r[1],r[3]])

    print("\nCalculate NDCG at {}...".format(top_k))
    print("NDCG at {}: {}".format(top_k,res1))

    print("\nCalculate MAP...")
    print("MAP: {}".format(res2))

    print("\nCalculate Precision at {}...".format(top_k))
    # NOTE(review): this prints ``res1`` under the Precision label;
    # presumably ``res3`` was intended -- confirm.
    print("Precision at {}: {}".format(top_k,res1))

    return res1,res2,res3
예제 #6
def evaluation(df, model, ks):
	"""Build (predicted items, ground-truth items) pairs and return ``eval_results``.

	Predictions are the 500 recommendations per user from ``model``; the
	ground truth is the set of items each user has in ``df``. Both sides are
	collected to the driver, sorted by user and paired by list position.
	"""
	# NOTE(review): the next two lines look like docstring text that lost its
	# delimiters; as written they are not valid statements.
	Evaluate the model.
	ks: a list of parameter k used in precision at k and NDCG at k.

	print(' Make predictions...')
	predictions = model.recommendForUserSubset(df, 500)

	print(' Prepare ground truth set and predicted set...')
	labels = df.groupBy('user').agg(F.collect_set('item')).collect()
	user_pred = predictions.select('user','recommendations.item').rdd.flatMap(lambda x:[x]).collect()
	# Sort both lists by user so that equal indices refer to the same user.
	labels = sorted(labels, key = lambda x: x.user)
	user_pred = sorted(user_pred, key = lambda x: x.user)
	print(' Combine ground truth set and predicted set...')
	predictionAndLabels = []
	# NOTE(review): pairing by index assumes both lists hold exactly the same
	# users; a user missing from the predictions would misalign every later
	# pair -- confirm.
	for i in range(len(user_pred)):
		predictionAndLabels.append((user_pred[i].item, labels[i][1]))
	print(' Parallelize...')
	# NOTE(review): ``sc`` is not defined in this function -- presumably a
	# module-level SparkContext; confirm.
	predictionAndLabels = sc.parallelize(predictionAndLabels, numSlices=2000)
	print(' Calculate metrics...')
	metrics = RankingMetrics(predictionAndLabels)
	eval_results = []
	# NOTE(review): the loop body is missing, so ``eval_results`` is never
	# filled in the code shown.
	for k in ks:

	return eval_results
예제 #7
def get_rankMetrics(spark, df, trained_model, approx=False, k=500):
    """Evaluate a trained model on a dataset with ranking metrics and print them.

    An item counts as relevant for a user when its rating is >= 3. The
    recommended lists come either from the approximate-nearest-neighbour
    helper (``approx=True``) or from the model's own
    ``recommendForUserSubset``.

    Args:
        spark: SparkSession used to build DataFrames and the final RDD.
        df: DataFrame to evaluate on; must provide ``user_id``, ``book_id``
            and ``rating`` columns.
        trained_model: trained model to evaluate.
        approx: use ANN (approximate nearest neighbours) when True.
        k: cut-off for NDCG and precision, and the number of
            recommendations requested on the ANN path.

    Returns:
        None. The metrics are printed; the result of ``print`` is returned.
    """
    import datetime
    import nmslib_recommend2
    import pyspark.sql.functions as F
    from pyspark.mllib.evaluation import RankingMetrics

    time_format = "%Y-%m-%d %H:%M:%S"

    # change column names
    df = df.select(['user_id', 'book_id',
                    'rating']).toDF('user', 'item', 'rating')

    # an item is relevant when its rating is at least 3
    fn = F.udf(lambda x: 1.0 if x >= 3 else 0.0)
    df = df.withColumn('rating', fn(df.rating))
    relevant = df[df.rating == 1.0].groupBy('user').agg(F.collect_list('item'))

    # recommend k items for each user
    print("recommendation time comparison start: ",
          datetime.datetime.now().strftime(time_format))
    if approx:
        recommend = nmslib_recommend2.nmslib_recommend(spark, df,
                                                       trained_model, k)
        recommend = spark.createDataFrame(recommend, ["user", "recommend"])
        joined = recommend.join(relevant, on='user')
        rec_and_rel = [(rec, rel) for _, rec, rel in joined.collect()]
    else:
        # The ``else`` was missing: without it ``rec_and_rel`` was undefined
        # for approx=False and the ANN result was overwritten for approx=True.
        userSubset = relevant.select('user')
        # NOTE(review): the request size is a fixed 500 rather than ``k``;
        # kept as-is because MAP is computed over the whole list.
        recommend = trained_model.recommendForUserSubset(userSubset, 500)
        joined = recommend.join(relevant, on='user')
        # Each recommendation is an (item, rating) struct; keep only the ids.
        rec_and_rel = [([i.item for i in rec], rel)
                       for _, rec, rel in joined.collect()]
    print("recommendation time comparison end: ",
          datetime.datetime.now().strftime(time_format))

    # Compute metrics
    rec_and_rel_rdd = spark.sparkContext.parallelize(rec_and_rel)
    metric_class = RankingMetrics(rec_and_rel_rdd)

    ndcg = metric_class.ndcgAt(k)
    map_ = metric_class.meanAveragePrecision
    pk = metric_class.precisionAt(k)

    return print("NDCG:", ndcg, "\nMAP:", map_, "\nPrecision:", pk)
예제 #8
def evaluate(model, test_data):
    """Return NDCG@TopK of ``model``'s predicted ordering on ``test_data``.

    ``test_data`` is an RDD of ``(user, product, label)`` records. Every
    (user, product) pair is scored by the model, each user's products are
    ordered by descending score, and the ordering is compared against each
    record whose label equals 1.0.
    """
    def ranked_products(scored):
        # Highest predicted rating first, then keep only the product ids.
        ordered = sorted(scored, key=lambda pair: pair[1], reverse=True)
        return [pair[0] for pair in ordered]

    user_product_pairs = test_data.map(lambda rec: (rec[0], rec[1]))
    scored_by_user = model.predictAll(user_product_pairs) \
        .map(lambda pred: (pred.user, (pred.product, pred.rating))) \
        .groupByKey()
    ranking_by_user = scored_by_user.mapValues(ranked_products)

    # One (user, [product]) entry per positively labelled record.
    positives = test_data \
        .filter(lambda rec: rec[2] == 1.0) \
        .map(lambda rec: (rec[0], [rec[1]]))

    joined = ranking_by_user.join(positives)
    predictionAndLabels = joined.map(lambda kv: (kv[1][0], list(kv[1][1])))
    return RankingMetrics(predictionAndLabels).ndcgAt(TopK)
예제 #9
def top_k_rankingmetrics(dataset=None,
# NOTE(review): the signature is cut off after ``dataset=None,``. The body
# also reads ``k``, ``ranking_metrics``, ``user``, ``item``, ``rating`` and
# ``prediction``, which were presumably the remaining parameters; their
# defaults are not visible here.
# NOTE(review): the following six lines look like docstring text that lost
# its delimiters.
	This function is to compute the ranking metrics from predictions.
	1. k: only evaluate the performance of the top k items
	2. ranking_metrics: precisionAt, meanAveragePrecision, ndcgAt
	3. user, item, prediction: column names; string type

	refer to https://vinta.ws/code/spark-ml-cookbook-pyspark.html
    # NOTE(review): a missing dataset is only reported; execution continues
    # and would fail on the first use of ``dataset`` below.
    if dataset == None:
        print("Error! Please specify a dataset.")
    # prediction table
    # Top-k items per user, highest predicted score first.
    windowSpec = Window.partitionBy(user).orderBy(col(prediction).desc())
    perUserPredictedItemsDF = dataset \
     .select(user, item, prediction, F.rank().over(windowSpec).alias('rank')) \
     .where('rank <= {}'.format(k)) \
     .groupBy(user) \
     .agg(expr('collect_list({}) as items'.format(item)))
    # actual target table
    # Top-k items per user, highest actual rating first.
    windowSpec = Window.partitionBy(user).orderBy(col(rating).desc())
    perUserActualItemsDF = dataset \
     .select(user, item, rating, F.rank().over(windowSpec).alias('rank')) \
     .where('rank <= {}'.format(k)) \
     .groupBy(user) \
     .agg(expr('collect_list({}) as items'.format(item)))
    # join
    # Rows after the join are (user, predicted items, actual items).
    perUserItemsRDD = perUserPredictedItemsDF \
     .join(F.broadcast(perUserActualItemsDF), user, 'inner') \
     .rdd \
     .map(lambda row: (row[1], row[2]))
    ranking_metrics_evaluator = RankingMetrics(perUserItemsRDD)
    # get the result of the metric
    # Any other ``ranking_metrics`` value falls through and returns None.
    if ranking_metrics == "precisionAt":
        precision_at_k = ranking_metrics_evaluator.precisionAt(k)
        #print("precisionAt: {}".format(round(precision_at_k, 4)))
        return precision_at_k
    elif ranking_metrics == "meanAveragePrecision":
        # NOTE(review): in pyspark ``RankingMetrics.meanAveragePrecision`` is
        # a property, so calling it with ``(k)`` would presumably raise
        # TypeError -- confirm against the installed Spark version.
        mean_avg_precision = ranking_metrics_evaluator.meanAveragePrecision(k)
        #print("meanAveragePrecision: {}".format(round(mean_avg_precision, 4)))
        return mean_avg_precision
    elif ranking_metrics == "ndcgAt":
        ndcg_at_k = ranking_metrics_evaluator.ndcgAt(k)
        #print("meanAveragePrecision: {}".format(round(ndcg_at_k, 4)))
        return ndcg_at_k
def dummy_run(spark):
    """Fit ALS on a five-row toy dataset and print MAP, NDCG@2 and precision@2.

    NOTE(review): the statements that wrap the two literal row lists below
    (presumably building ``train`` and ``val``) are missing, so the function
    does not parse as shown.
    """

    from pyspark.ml.recommendation import ALS
    from pyspark.mllib.evaluation import RankingMetrics
    import pyspark.sql.functions as F
    from pyspark.sql.functions import expr

    # NOTE(review): orphaned rows and column list -- presumably the training data.
        (82, 124, 5.0),
        (64, 123, 4.0),
        (27, 122, 3.0),
        (25, 122, 1.0),
        (12, 124, 2.0)
    ['user_id', 'book_id', 'rating'] 

    # NOTE(review): orphaned rows and column list -- presumably the validation data.
        (82, 123, 5.0),
        (64, 122, 4.0),
        (27, 124, 3.0),
        (64, 123, 2.0),
        (12, 122, 4.0)
    ['user_id', 'book_id', 'rating'] 

    user_id = val.select('user_id').distinct()
    # NOTE(review): there is no groupBy before ``agg``, so this yields a
    # single row without a ``user_id`` column, yet ``true_label`` is joined
    # on ``user_id`` below -- a ``.groupBy('user_id')`` step looks lost.
    true_label = val.select('user_id', 'book_id')\
                .agg(expr('collect_list(book_id) as true_item'))

    als = ALS(rank = 3 , regParam=0.1, 
                userCol="user_id", itemCol="book_id", ratingCol='rating', 
                implicitPrefs=False, coldStartStrategy="drop")
    model = als.fit(train)

    # Two recommendations per validation user, flattened to a list of book ids.
    recs = model.recommendForUserSubset(user_id, 2)
    pred_labels = recs.select('user_id','recommendations.book_id')
    # Rows after the join are read positionally as (predicted, true) pairs.
    pred_true_rdd = pred_labels.join(F.broadcast(true_label), 'user_id', 'inner') \
                .rdd \
                .map(lambda row: (row[1], row[2]))
    metrics = RankingMetrics(pred_true_rdd)
    mean_ap = metrics.meanAveragePrecision
    ndcg_at_k = metrics.ndcgAt(2)
    p_at_k= metrics.precisionAt(2)
    print('MAP: ', mean_ap , 'NDCG: ', ndcg_at_k, 'Precision at k: ', p_at_k)
예제 #11
    def __evaluate_ranking(self, rnk_inf: SparkDF):
        """Score a per-user ranking against the test set's ground truth.

        The businesses each user has in the test data are collected into a
        ``business_gt`` list, joined to ``rnk_inf`` on ``user_id`` and passed
        (without the key column) to ``RankingMetrics``.

        :param rnk_inf: DataFrame holding ``user_id`` plus the ranking column.
        :return: dict mapping ``"<metric>@<top_k>"`` to its value for every
            configured metric that contains ``"ndcg"`` or equals
            ``"precision"``; other metric names are skipped.
        """
        ground_truth = (
            self.__test
            .groupBy("user_id")
            .agg(collect_list("business_id").alias("business_gt"))
        )
        joined = rnk_inf.join(ground_truth, on="user_id")
        metrics = RankingMetrics(joined.drop("user_id").rdd)

        results = {}
        for m in self.ranking_metrics:
            key = "{}@{}".format(m, self.top_k)
            if "ndcg" in m:
                results[key] = metrics.ndcgAt(self.top_k)
                continue
            if m == "precision":
                results[key] = metrics.precisionAt(self.top_k)

        return results
예제 #12
def recsys(spark):
    """Evaluate the saved ALS model on the test set and print MAP, precision@k and NDCG@k.

    Reads ``val_set.parquet`` and ``test_set.parquet``, loads the model from
    HDFS, generates ``k = 10`` recommendations for every test user and
    compares them with the books each user has in the test set.

    :param spark: SparkSession used to read the parquet files.
    """
    # Load data from parquet
    val = spark.read.parquet("val_set.parquet")
    test = spark.read.parquet("test_set.parquet")
    cols_to_drop = ['is_read', 'is_reviewed']
    test = test.drop(*cols_to_drop)
    val = val.drop(*cols_to_drop)

    # Load model from path
    model_path = "hdfs:/user/ago265/best_model"
    best_model = ALSModel.load(model_path)

    # Compile a list of all the books each user read
    books_per_user = expr('collect_list(book_id) as books')

    val_users = val.select("user_id").distinct()
    # The per-user grouping was missing here, which collapsed the whole
    # validation set into one row without ``user_id``; now matches test_books.
    val_books = val.select("user_id", "book_id").groupBy("user_id").agg(books_per_user)

    test_users = test.select("user_id").distinct()
    test_books = test.select("user_id", "book_id").groupBy("user_id").agg(books_per_user)

    # # Recommender System for all users at k=500
    # k = 500
    # print('Making top 500 recommendations for all users')
    # rec = best_model.recommendForAllUsers(k)

    # Recommender System for subset of users at k=10
    k = 10
    print('Making top {} recommendations for a subset of users'.format(k))
    rec = best_model.recommendForUserSubset(test_users, k)
    # Flatten the recommendation structs to a list of book ids per user.
    pred_label = rec.select('user_id', 'recommendations.book_id')

    # Create an RDD of (predicted books, actual books) for Ranking Metrics
    final_df = pred_label.join(test_books, ['user_id'], 'inner').select('book_id', 'books')
    final_rdd = final_df.rdd.map(lambda x: (x.book_id, x.books))
    metrics = RankingMetrics(final_rdd)
    result1 = metrics.meanAveragePrecision
    result2 = metrics.precisionAt(k)
    result3 = metrics.ndcgAt(k)
    print("MAP = ", result1)
    print("Precision at k = ", result2)
    print("NDCG at k = ", result3)
예제 #13
def get_val_metrics(model, val):
    """Return RMSE and ranking metrics of ``model`` on the validation set.

    Relevant items for a user are those rated >= 3 in ``val``; they are
    compared with the model's 500 recommendations for that user.

    :param model: fitted model exposing ``transform`` and
        ``recommendForUserSubset``.
    :param val: validation DataFrame with ``user``, ``item`` and ``rating``.
    :return: tuple ``(rmse, precision@500, mean average precision, ndcg@500)``.
    """
    preds = model.transform(val)
    recs = model.recommendForUserSubset(val, 500)
    top_items = recs.selectExpr('user as user', 'recommendations.item as top_items')
    true_items = val.where(val.rating >= 3) \
        .groupby('user') \
        .agg(collect_list('item').alias('true_item_list'))

    # RankingMetrics expects (predicted ranking, ground truth) pairs; the
    # columns were previously selected in the opposite order, which scored
    # the ground truth as if it were the prediction.
    predictions_and_labels_rankings = top_items.join(true_items, how='inner', on='user') \
        .select('top_items', 'true_item_list')
    ranking_metrics = RankingMetrics(predictions_and_labels_rankings.cache().rdd)
    prec_at = ranking_metrics.precisionAt(500)
    mean_avg_prec = ranking_metrics.meanAveragePrecision
    ndcg = ranking_metrics.ndcgAt(500)

    # A separate RegressionMetrics pass used to compute RMSE here as well,
    # but its result was immediately overwritten; only the evaluator remains.
    evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='rating', metricName='rmse')
    rmse = evaluator.evaluate(preds)
    return rmse, prec_at, mean_avg_prec, ndcg
예제 #14
def main(spark):
    """Print ranking metrics for a popularity baseline.

    The 500 most frequent items of ``val_df`` are used as the predicted
    ranking for every row of ``labels``.

    NOTE(review): ``args``, ``timer`` and ``start`` are not defined in this
    function -- presumably module-level names; confirm.
    """
    # NOTE(review): this call is cut off; its path argument is missing.
    val_df = spark.read.parquet(
    labels = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' % args.labels)

    # Count interactions per item and keep the 500 most frequent ones.
    predictions = val_df.groupBy("item").count().orderBy(
        "count", ascending=False).limit(500).collect()
    predictions = [row.item for row in predictions]
    print('finished writing in %d seconds' % int(timer() - start))
    # The same popularity list is paired with the second column of every
    # ``labels`` row -- presumably the user's ground-truth items; confirm.
    predictionsAndLabels = labels.rdd.map(lambda tup: (predictions, tup[1]))
    print('finished writing in %d seconds' % int(timer() - start))

    metrics = RankingMetrics(predictionsAndLabels)
    print('finished writing in %d seconds' % int(timer() - start))

    print('metrics.meanAveragePrecision: %s\n' % metrics.meanAveragePrecision)
    print('metrics.precisionAt(500) %s\n' % metrics.precisionAt(500))
    print('metrics.ndcgAt(500) %s\n' % metrics.ndcgAt(500))
예제 #15
def main(spark, model_file, data_file, K):
    '''Main routine for Collaborative Filtering Model testing

    Loads a saved ALS model, recommends the top K tracks for every user in
    the test data and prints MAP, precision@K and NDCG@K.

        spark: SparkSession object

        model_file: string, path of the saved model to load

        data_file: string, path to the parquet file to load

        K: int, evaluations are based on predictions of the top K items for each user
    '''
    # Local import: the restored rank() call needs the functions module.
    from pyspark.sql import functions as F

    testIdx = spark.read.parquet(data_file)
    model = ALSModel.load(model_file)

    users_val = testIdx.select("user_idx").distinct()

    # Flatten the recommendation structs to one list of track ids per user.
    # NOTE(review): the target name of the second rename was cut off in the
    # source; the join result is read by position, so the name is not
    # significant.
    perUserPredictedItemsDF = model.recommendForUserSubset(users_val, K).select(
        col('user_idx').alias('user'),
        col('recommendations.track_idx').alias('predicted_items'))

    # Ground truth: each user's K most-played tracks, highest count first.
    # NOTE(review): the rank()/where step was cut off in the source and is
    # restored from the surviving 'rank <= K' filter argument.
    w2 = Window.partitionBy('user_idx').orderBy(col('count').desc())
    perUserActualItemsDF = testIdx.select(
        'user_idx', 'track_idx', 'count',
        F.rank().over(w2).alias('rank')).where(
            'rank <= {0}'.format(K)).groupBy('user_idx').agg(
                expr('collect_list(track_idx) as items')).withColumnRenamed(
                    'user_idx', 'user')

    # Rows after the join are (user, predicted items, actual items).
    perUserItemsRDD = perUserPredictedItemsDF.join(
        perUserActualItemsDF, 'user').rdd.map(lambda row: (row[1], row[2]))
    rankingMetrics = RankingMetrics(perUserItemsRDD)

    print("meanAveragePrecision = %.8f" % rankingMetrics.meanAveragePrecision)
    print("precisionAt(K) = %.8f" % rankingMetrics.precisionAt(K))
    print("ndcgAt(K) = %.8f" % rankingMetrics.ndcgAt(K))
def main(spark, model_file, test_file):
    """Print precision@500, MAP and NDCG@500 of a saved ALS model on a test set.

    NOTE(review): three statements below end inside an open bracket; their
    tails appear to be missing, so the function does not parse as shown.
    """
    test_data = spark.read.parquet(test_file)
    als_model_tuned = ALSModel.load(model_file)

    print("Imported trained model and test data sets")

    #generating true values of book_id for each user_id
    # NOTE(review): the aggregate expression is cut off.
    groundTruth_test = test_data.groupby("user_id").agg(
    print("Created ground truth df for test set")

    # user_test_list=spark.sql('select distinct user_id from groundTruth_val where user_id=14')
    # rec = als_model_normal.recommendForUserSubset(user_test_list,500)

    #generating recs
    # Recommendations are produced for every user known to the model, not
    # only for users present in the test data.
    rec = als_model_tuned.recommendForAllUsers(500)
    print("500 recommendations for all users generated")

    #creating dataframe to have both true values and predicted values
    # NOTE(review): the join type and closing bracket are cut off.
    predictions_test = rec.join(groundTruth_test,
                                rec.user_id == groundTruth_test.user_id,

    #coverting to rdd for RankingMetrics()
    # NOTE(review): the second selected column and the conversion to an RDD
    # are cut off.
    predAndLabels_test = predictions_test.select('recommendations.book_id',

    print("starting ranking metrics for test data")
    metrics_test = RankingMetrics(predAndLabels_test)

    #calculating metrics
    precision_test = metrics_test.precisionAt(500)
    map_test = metrics_test.meanAveragePrecision
    ndcg_test = metrics_test.ndcgAt(500)

    print('Test set , Precision at 500: {}'.format(precision_test))
    print('Test set , Mean Average Precision : {}'.format(map_test))
    print('Test set, ndcgAt500 : {}'.format(ndcg_test))
def main(spark, rank, regParam, path, fraction):
    """Print RMSE and ranking metrics for the ALS model saved under ``path``.

    The model directory name is derived from ``rank``, ``regParam`` and
    ``fraction``; the test data file name is derived from ``fraction``.

    NOTE(review): three statements below end inside an open bracket; their
    tails appear to be missing, so the function does not parse as shown.
    """
    TEMP_PATH = "/models/ALS_{}_{}_{}".format(rank, regParam, fraction)
    ALS_PATH = TEMP_PATH + "/als"
    MODEL_PATH = TEMP_PATH + "/als_model"
    print("Loading model...")
    # NOTE(review): ``als`` is loaded but not used in the code shown.
    als = ALS.load(path + ALS_PATH)
    model = ALSModel.load(path + MODEL_PATH)
    print("Loading data...")
    testing = spark.read.parquet("{}/data/processed/testing_{}.parquet".format(
        path, fraction))

    # RMSE
    predictions = model.transform(testing)
    # NOTE(review): the evaluator's remaining arguments are cut off.
    evaluator = RegressionEvaluator(metricName="rmse",
    rmse = evaluator.evaluate(predictions)
    print("RSME:", rmse)
    # ``predictions`` is reused: from here on it holds the top-500
    # recommendations for every user.
    predictions = model.recommendForAllUsers(500)
    # NOTE(review): the aggregate expression is cut off.
    groundtruth = testing.groupby("user_id").agg(
    # NOTE(review): the query reads tables named ``groundtruth`` and
    # ``predictions``; the calls that register the DataFrames under those
    # names are not visible here. The call's closing bracket is also missing.
    total = spark.sql(
        "SELECT g.user_id, g.groundtruth AS groundtruth, p.recommendations AS predictions FROM groundtruth g JOIN predictions p ON g.user_id = p.user_id"

    # Keep only the recommended book ids next to the ground-truth column.
    data = total.selectExpr("predictions.book_id", "groundtruth")
    print("df to rdd...")
    rdd = data.rdd.map(tuple)
    print("creating metrics...")
    metrics = RankingMetrics(rdd)
    print("meanAveragePrecision:", metrics.meanAveragePrecision)
    print("precision at 500:", metrics.precisionAt(500))
    print("ndcgAt 500:", metrics.ndcgAt(500))
예제 #18
def Ranking_evaluator(spark, model, val, metric_type):
    """Score ``model``'s top-500 recommendations against ``val``.

    :param spark: SparkSession; a table or view named ``val`` must be
        queryable through it.
    :param model: fitted model exposing ``recommendForUserSubset``.
    :param val: validation DataFrame with ``user_id``, ``book_id`` and
        ``rating`` columns.
    :param metric_type: ``'Precision'``, ``'MAP'`` or ``'NDCG'`` -- the
        metric returned first.
    :return: ``(selected metric, dict of the two other metrics)``, or
        ``None`` for any other ``metric_type``.
    """
    val_user = spark.sql('SELECT DISTINCT user_id FROM val')
    #val_user = val.select('user_id').distinct()
    val_rec = model.recommendForUserSubset(val_user, 500)
    # Explode the recommendation array into one (user, book, rating) row each.
    val_rec = val_rec.select('user_id', 'recommendations', f.posexplode('recommendations')).drop('pos').drop('recommendations')
    val_rec = val_rec.select('user_id', f.expr('col.book_id'), f.expr('col.rating'))
    w = Window.partitionBy('user_id')
    # NOTE(review): ``val_recrank`` was used below but never defined (the
    # statement appears to have been lost). Rebuilt to mirror
    # ``val_truerank``; confirm against the original intent.
    val_recrank = val_rec.select('user_id', f.collect_list('book_id').over(w).alias('rec_rank')).sort('user_id').distinct()
    val = val.sort(f.desc('rating'))
    val_truerank = val.select('user_id', f.collect_list('book_id').over(w).alias('true_rank')).sort('user_id').distinct()

    scoreAndLabels = val_recrank.join(val_truerank, on=['user_id'], how='inner')
    rankLists = scoreAndLabels.select("rec_rank", "true_rank").rdd.map(lambda x: (x[0], x[1])).collect()
    ranks = spark.sparkContext.parallelize(rankLists)
    metrics = RankingMetrics(ranks)

    # Key order matters: removing the selected metric leaves the other two
    # in the same order the original branches returned them.
    scores = {
        'MAP': metrics.meanAveragePrecision,
        'Precision': metrics.precisionAt(500),
        'NDCG': metrics.ndcgAt(500),
    }
    # Previously ``return None`` sat unreachable behind the last ``elif``.
    if metric_type not in scores:
        return None
    selected = scores.pop(metric_type)
    return selected, scores
예제 #19
def main(spark, log_comp=False, drop_low=False, drop_thr=0):
    """Grid-search ALS hyper-parameters and print ranking metrics for each combination.

    The training set is down-sampled by user, ids are string-indexed, and for
    every (rank, regParam, alpha) combination an implicit-feedback ALS model
    is fitted and its top-500 recommendations for the validation users are
    scored with ``RankingMetrics``.

    NOTE(review): several statements below end inside an open bracket and an
    ``else:`` appears to be missing, so the function does not run as shown.
    """

    # NOTE(review): the next seven lines look like docstring text that lost
    # its delimiters; the three paths are locals here, not parameters.
    spark : SparkSession object

    train_path : string, path to the training parquet file to load

    val_path : string, path to the validation parquet file to load

    test_path : string, path to the validation parquet file to load
    ## Load in datasets
    train_path = 'hdfs:/user/bm106/pub/project/cf_train.parquet'
    val_path = 'hdfs:/user/bm106/pub/project/cf_validation.parquet'
    test_path = 'hdfs:/user/bm106/pub/project/cf_test.parquet'

    train = spark.read.parquet(train_path)
    val = spark.read.parquet(val_path)
    test = spark.read.parquet(test_path)

    ## Downsample the data
    # Pick out user list in training set
    user_train = set(row['user_id']
                     for row in train.select('user_id').distinct().collect())
    # Pick out user list in validation set
    user_val = set(row['user_id']
                   for row in val.select('user_id').distinct().collect())
    # Get the previous 1M users
    # (users that appear in training but not in validation)
    user_prev = list(user_train - user_val)
    # Random sampling to get 20%
    k = int(0.2 * len(user_prev))
    user_prev_filtered = random.sample(user_prev, k)
    # NOTE(review): the filter expression is cut off after the ``+``.
    train = train.where(train.user_id.isin(user_prev_filtered +

    ## Create StringIndexer
    # NOTE(review): both StringIndexer calls are cut off after ``inputCol``.
    indexer_user = StringIndexer(inputCol="user_id",
    indexer_user_model = indexer_user.fit(train)
    indexer_track = StringIndexer(inputCol="track_id",
    indexer_track_model = indexer_track.fit(train)

    # Both indexers are fitted on the training data and applied to all splits.
    train = indexer_user_model.transform(train)
    train = indexer_track_model.transform(train)

    val = indexer_user_model.transform(val)
    val = indexer_track_model.transform(val)

    test = indexer_user_model.transform(test)
    test = indexer_track_model.transform(test)

    ## ALS model
    rank_ = [5, 10, 20]
    regParam_ = [0.1, 1, 10]
    alpha_ = [1, 5, 10]
    param_grid = it.product(rank_, regParam_, alpha_)

    ## Pick out users from validation set
    user_id = val.select('user_id_indexed').distinct()
    # NOTE(review): there is no groupBy before ``agg``, so this yields a
    # single row without ``user_id_indexed``, yet ``true_label`` is joined on
    # that column below -- a ``.groupBy('user_id_indexed')`` step looks lost.
    true_label = val.select('user_id_indexed', 'track_id_indexed')\
                    .agg(expr('collect_list(track_id_indexed) as true_item'))

    ## Log-Compression
    ## count -> log(1+count)
    if log_comp == True:
        train = train.select('*', F.log1p('count').alias('count_log1p'))
        val = val.select('*', F.log1p('count').alias('count_log1p'))
        rateCol = "count_log1p"
        # NOTE(review): an ``else:`` appears to be missing before the next
        # line; as written ``rateCol`` always ends up "count" when log_comp
        # is true and is undefined otherwise.
        rateCol = "count"

    ## Drop interactions that have counts lower than specified threhold
    if drop_low == True:
        train = train.filter(train['count'] > drop_thr)
        val = val.filter(val['count'] > drop_thr)

    # Each ``i`` is a (rank, regParam, alpha) tuple.
    for i in param_grid:
        print('Start Training for {}'.format(i))
        als = ALS(rank = i[0], maxIter=10, regParam=i[1], userCol="user_id_indexed", itemCol="track_id_indexed", ratingCol=rateCol, implicitPrefs=True, \
            alpha=i[2], nonnegative=True, coldStartStrategy="drop")
        model = als.fit(train)
        print('Finish Training for {}'.format(i))

        # Make top 500 recommendations for users in validation test
        res = model.recommendForUserSubset(user_id, 500)
        # NOTE(review): this select is cut off after its first column.
        pred_label = res.select('user_id_indexed',

        # Rows after the join are read positionally as (predicted, true) pairs.
        pred_true_rdd = pred_label.join(F.broadcast(true_label), 'user_id_indexed', 'inner') \
                    .rdd \
                    .map(lambda row: (row[1], row[2]))

        print('Start Evaluating for {}'.format(i))
        metrics = RankingMetrics(pred_true_rdd)
        map_ = metrics.meanAveragePrecision
        ndcg = metrics.ndcgAt(500)
        mpa = metrics.precisionAt(500)
        # NOTE(review): the third label repeats 'map score: ' although the
        # value printed is precisionAt(500).
        print(i, 'map score: ', map_, 'ndcg score: ', ndcg, 'map score: ', mpa)

# Pair each user's recommended songs with their relevant songs and compute
# NDCG at k.
# NOTE(review): the opening bracket is never closed, and ``.map`` is applied
# directly to the join result -- an ``.rdd`` step and the closing ``)``
# appear to be missing.
combined = (
    recommended_songs.join(relevant_songs, on='user_id_encoded', how='inner')
    .map(lambda row: (row[1], row[2]))

# 929537


# ([107048, 127769, 129688, 113295, 145331], [43243, 32053, 32958, 25699, 33861])

# NOTE(review): ``k`` is not defined in this fragment.
rankingMetrics = RankingMetrics(combined)
ndcgAtK = rankingMetrics.ndcgAt(k)

# 1.8102832147923323e-05
#only showing top 5 rows

from pyspark.mllib.evaluation import RankingMetrics

# (c)
# calculate ranking metricss
# Build RankingMetrics from (recommended songs, relevant songs) pairs.
# NOTE(review): the closing bracket of this call is missing.
metrics = RankingMetrics(
    recommendation_and_relevant \
    .select(['song_recommended', 'song_relevant']) \
    .rdd \
    .map(lambda row: (row[0], row[1]))

# NOTE(review): ``N`` is not defined in this fragment -- presumably 10, going
# by the labels. ``meanAveragePrecision`` takes no cut-off, so the "MAP @ 10"
# and "MAP @ 5" lines report the same value (as the recorded outputs show).
print("PRECISION @ 10: ", metrics.precisionAt(N))
print("MAP @ 10: ", metrics.meanAveragePrecision)
print("NDCG @10: ", metrics.ndcgAt(N))

#PRECISION @ 10:  0.7214197272224288
#MAP @ 10:  0.727511046690355
#NDCG @10:  0.8838562387037185

K = 5
print("PRECISION @ 5: ", metrics.precisionAt(K))
print("MAP @ 5: ", metrics.meanAveragePrecision)
print("NDCG @5: ", metrics.ndcgAt(K))

#PRECISION @ 5:  0.8564546973286272
#MAP @ 5:  0.727511046690355
#NDCG @5:  0.8868546173633727
# Attach ``test_array`` to ``temp`` per user; the left join keeps every row
# of ``temp`` even when ``test_array`` has no match for it.
test_metric = temp.join(test_array, on="user_label_int", how="left")

# Drop the key so only the remaining columns reach RankingMetrics.
# NOTE(review): presumably exactly two columns remain, in (predicted, actual)
# order -- confirm against the schemas of ``temp`` and ``test_array``.
test_metric = test_metric.drop("user_label_int")
test_rdd = test_metric.rdd

from pyspark.mllib.evaluation import RankingMetrics

metrics_2 = RankingMetrics(test_rdd)

예제 #23
# Pair every user's recommendations with their ground truth. The left join
# keeps users that have no ground-truth row.
compare = recommends.join(ground_truths, on='user', how='left')
# Collect to the driver and keep only the two list columns as tuples.
compare = [(r.__getattr__('recommends'), r.__getattr__('ground_truths'))
           for r in compare.collect()]
compare = sc.parallelize(compare)

#Alternative method
# NOTE(review): at this point ``compare`` is already an RDD, so this
# re-collects and re-parallelizes it; presumably the alternative was meant to
# replace the block above rather than follow it.
compare = sc.parallelize(compare.collect(
))  # this take longer time than the above '_getattr_()' method.

# print metrics
metrics = RankingMetrics(compare)
# predict test and rmse
predict = model.transform(test)
# NOTE(review): a ``!=`` comparison against NaN may not drop NaN predictions
# the way it reads; ``isnan`` is the usual test -- confirm.
predict = predict.filter(F.col('prediction') != float('nan'))
# NOTE(review): this call is cut off after its first argument.
reg_eval = RegressionEvaluator(predictionCol='prediction',
# testing NDCG metric on bad documents
set1 = sc.parallelize([([1, 2, 3], [1, 2])])
예제 #24
def main(spark, val_pq, model_file_path):
    """Print RMSE, MAP, NDCG@500 and precision@500 of a saved pipeline model.

    ``val_pq`` is the parquet path of the validation data and
    ``model_file_path`` the path of the saved ``PipelineModel``, whose last
    stage is used to generate recommendations.
    """
    # NOTE(review): the next two lines look like docstring text that lost its
    # delimiters; as written they are not valid statements.
        validation data
        path to the pipeline(stringIndexers + als) model

    # Read data
    val = spark.read.parquet(val_pq)

    print('load trained model')
    # Load the trained pipeline model
    model = PipelineModel.load(model_file_path)

    # evaluation

    print("Run prediction")
    # Run the model to create prediction against a validation set
    preds = model.transform(val)
    print("Run evaluation")

    # model evaluation using rmse on val data
    print("Start evaluation using rmse")
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    rmse = evaluator.evaluate(preds)

    # Generate top 500 book recommendations for each user in validation data.
    # Returns a DataFrame of (userCol, recommendations), 
    # where recommendations are stored as an array of (itemCol, rating) Rows.
    #user_id = preds.select("user_id_idx").distinct()
    #res = model.stages[-1].recommendForUserSubset(user_id, 500)

    print("generate top 500 book recommendations for val users")
    # Recommendations are generated for all users of the last pipeline stage.
    res = model.stages[-1].recommendForAllUsers(500)
    preds_per_user = res.selectExpr("user_id_idx", "recommendations.book_id_idx as preds_books")
    # preds_pe_user.show(5)

    # NOTE(review): the select keeps only the two id columns before filtering
    # on ``rating``, and there is no groupBy before ``agg`` although the
    # result is joined on ``user_id_idx`` below -- the original chain looks
    # partly lost; confirm.
    true_per_user = preds.select("user_id_idx","book_id_idx").filter("rating>=3")\
                        .agg(expr("collect_set(book_id_idx) as books"))
    # true_per_user.show(5)

    print("Start join")
    # true_per_user: an RDD of (predicted ranking, ground
    # truth set) pairs
    # true_vs_preds_per_user = preds_per_user.join(true_per_user, ["userId"]).rdd\
    #                 .map(lambda row: (row.items_pred, row.items)).cache()

    # NOTE(review): this statement ends in a line continuation with nothing
    # after it; the conversion to (predicted, true) pairs is missing.
    true_vs_preds_per_user = preds_per_user.join(true_per_user, ["user_id_idx"])\

    # print(*true_vs_preds_per_user.take(5),sep="\n")

    # Evaluate using RMSE
    #evaluator = RegressionEvaluator(metricName="rmse",labelCol="rating",predictionCol="??")
    #rmse = evaluator.evaluate(preds)
    #print(f'The out-of-sample RMSE of the current model is: {rmse:.2f}')

    # Evaluate using MAP
    print("Start evaluation using MAP")
    metrics = RankingMetrics(true_vs_preds_per_user)
    map_ = metrics.meanAveragePrecision

    #Evaluate using ndcg
    print("Start evaluation using ndcg")
    ndcg = metrics.ndcgAt(500)
    #Evaluate using precision
    mpa = metrics.precisionAt(500)

    print('rmse score: ', rmse, 'map score: ', map_, 'ndcg score: ', ndcg, 'mpa score: ', mpa)
# Join the song recommendations and actual songs played together for each user.
# NOTE(review): the join expression that defines ``user_songs`` is missing;
# only the opening bracket survives.
user_songs = (

# Print 20 rows, truncating each cell to 100 characters.
user_songs.show(20, 100)

# Select only the recommended and actual song columns and convert the dataframe to an rdd. 
user_songs_rdd = user_songs.select(F.col('recommended_songs'), F.col('relevant_songs')).rdd

# Compute the ranking metrics for the collaborative filtering model.
rank_metrics = RankingMetrics(user_songs_rdd)

# Precision @ 5: 0.907 (3dp)
precision_at_5 = rank_metrics.precisionAt(5)

# NDCG @ 10: 0.906 (3dp)
ndcg_at_10 = rank_metrics.ndcgAt(10)

# MAP: 0.699 (3dp)
mean_average_precision = rank_metrics.meanAveragePrecision
예제 #26
             .config("spark.executor.memory", "8g")
             .config("spark.driver.memory", "8g")
    # NOTE(review): the start and end of the SparkSession builder chain above
    # are missing from this fragment.

    # The three splits are read from the directory given as the first CLI argument.
    train = spark.read.parquet(f'{sys.argv[1]}/train.parquet')
    test = spark.read.parquet(f'{sys.argv[1]}/test.parquet')
    validation = spark.read.parquet(f'{sys.argv[1]}/validation.parquet')

    # NOTE(review): the ALS constructor call is cut off after ``seed=0,``.
    als = ALS(rank=15, maxIter=5, regParam=0.001,userCol="user_id", itemCol="book_id", ratingCol="rating", seed=0,
    # Time only the model fit.
    StartT = time.time()
    model = als.fit(train)
    EndT = time.time()
    T = EndT - StartT
    print(f"The running time is: {T}")
    predictions = model.transform(test)

    # Predicted ranking: books per user, ordered by predicted score.
    # NOTE(review): a global orderBy followed by groupBy/collect_list is
    # presumably relied on to keep that order inside each list; Spark does
    # not document such a guarantee -- confirm.
    predictions = predictions.orderBy(predictions.prediction.desc())
    final_prediction = predictions.filter(predictions.prediction >= 0).groupBy("user_id").agg(F.collect_list("book_id").alias("prediction"))
    # Ground truth: the same books per user, ordered by actual rating.
    predictions = predictions.orderBy(predictions.rating.desc())
    final_rating = predictions.groupBy("user_id").agg(F.collect_list("book_id").alias("rating"))
    final = final_prediction.join(final_rating, final_prediction.user_id == final_rating.user_id, 'inner')\
        .select(final_prediction.user_id, final_prediction.prediction, final_rating.rating)
    metrics = RankingMetrics(final.select('prediction', 'rating').rdd.map(tuple))
    res = metrics.ndcgAt(500)
    Precision = metrics.precisionAt(500)
    print(f"The NDCG evaluation result is: {res}")
    print(f"The PrecisionAtK evaluation result is: {Precision}")

예제 #27
# RMSE of the rating predictions on the test set.
rmse = evaluator.evaluate(predictions_test)

print("Root-mean-square error = " + str(rmse))

# Ground truth: books each user rated above 2.
# NOTE(review): the query sorts by rating, presumably so the collected lists
# are ordered; that order is not documented to survive groupby/collect_list
# -- confirm.
test_true = spark.sql(
    'select user, book from test where rating > 2 sort by rating desc')
labels = test_true.groupby('user').agg(collect_list('book'))

# 500 recommendations for every user that has at least one label.
test_recommendations = model.recommendForUserSubset(labels.select('user'), 500)
# NOTE(review): this statement is cut off inside the ``select``; the steps
# that produce the 'collect_list(item)' column used below are missing.
preds = test_recommendations.withColumn(
    'recommendations', explode('recommendations')).select(

preds_and_labels = preds.join(labels, on='user')

# The columns keep the default names Spark gives un-aliased aggregates.
metrics = RankingMetrics(
    preds_and_labels.select('collect_list(item)', 'collect_list(book)').rdd)
map_metric = metrics.meanAveragePrecision
pA = metrics.precisionAt(500)
ndcgA = metrics.ndcgAt(500)

# NOTE(review): ``results``, ``rank`` and ``reg`` are not defined in this
# fragment -- presumably set by an enclosing hyper-parameter loop; confirm.
results.append((rank, reg, rmse, map_metric, pA, ndcgA))

print('MAP = ', map_metric, ' pA = ', pA, ' ndcgA = ', ndcgA, '\n')

# Turn the accumulated result tuples into a single-partition DataFrame.
res_rdd = spark.sparkContext.parallelize(results)
res_df = spark.createDataFrame(res_rdd).repartition(1)
예제 #28
def main(spark, train_pq, val_pq):
    # Grid-searches ALS over rank x regParam: for every combination it fits a
    # StringIndexer + ALS pipeline on the training parquet, scores the
    # validation parquet, and prints RMSE, MAP, NDCG@500 and precision@500.
    #
    # spark    : session used to read the parquet inputs
    # train_pq : path passed to spark.read.parquet for the training data
    # val_pq   : path passed to spark.read.parquet for the validation data
    #
    # NOTE(review): the next two lines look like leftovers of a docstring whose
    # quotes and remaining lines are missing from this excerpt.
        validation data
        path to the pipeline(stringIndexers + als) model
    import itertools

    # Read train and val data
    print("load train and validation data")
    train = spark.read.parquet(train_pq)
    val = spark.read.parquet(val_pq)

    # Increase partition size of train data to reduce task load
    # NOTE(review): no repartition statement follows this comment in the excerpt.

    # Pipeline
    # StringIndexers
    # NOTE(review): both StringIndexer calls below are cut off after inputCol;
    # the output column names are presumably user_id_idx / book_id_idx, as used
    # by ALS further down — confirm.
    print("build stringIndexer")
    indexer_user = StringIndexer(inputCol="user_id",
    indexer_book = StringIndexer(inputCol="book_id",

    # Hyper-parameter tuning: 3 ranks x 5 regularisation values = 15 combinations.
    # Each grid element `i` is a (rank, regParam) tuple.
    rank_ = [10, 15, 20]
    regParam_ = [0.01, 0.05, 0.1, 0.3, 1]
    param_grid = itertools.product(rank_, regParam_)

    # ALS model hyperparameter tuning
    for i in param_grid:
        print('training for {} start'.format(i))

        # NOTE(review): the ALS(...) call is cut off after ratingCol; any
        # remaining arguments are not visible here.
        als = ALS(maxIter=10, rank=i[0], regParam=i[1],\
                userCol="user_id_idx", itemCol="book_id_idx", ratingCol="rating",\

        # Combine into the pipeline
        pipeline = Pipeline(stages=[indexer_user, indexer_book, als])

        model = pipeline.fit(train)
        print('training for {} complete'.format(i))

        # prediction against validation data
        preds = model.transform(val)

        # model evaluation using rmse on val data
        # NOTE(review): the RegressionEvaluator(...) call is cut off after metricName.
        print("Start evaluation using rmse for {}".format(i))
        evaluator = RegressionEvaluator(metricName="rmse",
        rmse = evaluator.evaluate(preds)

        # Make top 500 recommendations for users in validation test
        print('evaluation for {} start'.format(i))

        # stages[-1] is the last pipeline stage, i.e. the fitted ALS model.
        res = model.stages[-1].recommendForAllUsers(500)

        # Keep only the recommended book indices per user.
        preds_per_user = res.selectExpr(
            "user_id_idx", "recommendations.book_id_idx as preds_books")
        # Ground truth: books with rating >= 3 in the scored validation data.
        # NOTE(review): no groupBy is visible before .agg, so as written this
        # aggregates over the whole frame rather than per user; a
        # groupBy("user_id_idx") line appears to be missing — confirm.
        true_per_user = preds.select("user_id_idx","book_id_idx").filter("rating>=3")\
                            .agg(expr("collect_set(book_id_idx) as books"))

        # NOTE(review): the join statement ends in a line continuation whose
        # next line is missing; as written a DataFrame is handed to
        # RankingMetrics below. Presumably the lost part converted it to an RDD
        # of (predicted, actual) pairs — confirm.
        print("Start join for {}".format(i))
        true_vs_preds_per_user = preds_per_user.join(true_per_user, ["user_id_idx"])\

        # Evaluate using MAP
        print("Start evaluation using MAP for {}".format(i))
        metrics = RankingMetrics(true_vs_preds_per_user)
        map_ = metrics.meanAveragePrecision

        # Evaluate using ndcg
        print("Start evaluation using ndcg for {}".format(i))
        ndcg = metrics.ndcgAt(500)

        # Evaluate using precision
        print("Start evaluation using precisionAtK for {}".format(i))
        mpa = metrics.precisionAt(500)

        # Report all four metrics for this (rank, regParam) pair.
        print(i, 'rmse score: ', rmse, 'map score: ', map_, 'ndcg score: ',
              ndcg, 'mpa score: ', mpa)
# Score the held-out data and key every prediction by its (user, product) pair.
predictions = model.predictAll(testData).map(
    lambda rec: ((rec.user, rec.product), rec.rating)
)

# Key the observed ratings the same way so both RDDs share a join key.
ratingsTuple = ratings.map(
    lambda rec: ((rec.user, rec.product), rec.rating)
)
# Join on the shared key and keep element 1 of each joined record
# (the paired values, without the key).
predictionAndLabels = predictions.join(ratingsTuple).map(
    lambda joined: joined[1]
)

예제 #30
def main(spark, data_file):
    # Overview of the visible steps: load the interactions file, keep a 0.1%
    # sample of users, drop users with 10 or fewer rows, index user/book ids,
    # split users into train/validation/test, grid-search ALS, report ranking
    # metrics and RMSE, then embed and pickle the latent factors.
    # Names such as F, math, plt, pkl and StructType come from outside this
    # excerpt.
    # NOTE(review): the docstring below has no closing quotes in this excerpt,
    # and it says "parquet" although the code reads the file with read.csv.
    '''Main routine for supervised evaluation

    spark : SparkSession object

    data_file : string, path to the parquet file to load

    # Load the dataframe from data file
    #input_data = spark.read.parquet(data_file)
    df = spark.read.csv(data_file, header=True)

    #df = df.limit(500000)
    # Drop rows without a user id.
    df = df.filter(df.user_id.isNotNull())

    # Distinct users, renamed to `uid` so the join below has unambiguous columns.
    df_unique_user = df.select("user_id").distinct()
    df_unique_user = df_unique_user.selectExpr("user_id as uid")

    # Number of users to keep: floor(0.1% of the distinct-user count).
    percent = df_unique_user.count()
    percent = 0.001 * percent
    percent = math.floor(percent)

    # Keep only the interactions of the sampled users.
    df_unique_user = df_unique_user.limit(percent)
    print('df unique user')
    df_final = df.join(df_unique_user, df.user_id == df_unique_user.uid,
                       "inner").select(df.user_id, df.book_id, df.is_read,
                                       df.rating, df.is_reviewed)
    df = df_final
    # Keep only users with more than 10 rows (the filter below is count > 10).
    counts = df.groupBy('user_id').count().selectExpr("user_id as uid",
                                                      "count as count")
    df = df.join(counts,
                 df.user_id == counts.uid).filter(F.col("count") > 10).drop(
                     'uid', 'count')

    from pyspark.sql.types import DoubleType

    # Drop rows without a rating and cast the rating column to double.
    df = df.filter(df.rating.isNotNull())
    df = df.withColumn("rating", df["rating"].cast(DoubleType()))
    from pyspark.ml.feature import StringIndexer

    # Add an index column for user_id (user_id_index).
    stage_1 = StringIndexer(inputCol='user_id', outputCol='user_id_index')
    #df = stage_1.setHandleInvalid("keep").fit(df).transform(df)
    df = stage_1.fit(df).transform(df)

    # Add an index column for book_id (book_id_index).
    stage_2 = StringIndexer(inputCol='book_id', outputCol='book_id_index')
    #transformed = stage_2.setHandleInvalid("keep").fit(df).transform(df)
    df = stage_2.fit(df).transform(df)

    # NOTE(review): the `uid = df.select(` statement is cut off in this excerpt;
    # the code below slices `uid` and calls len() on it, so it is presumably a
    # Python list of user ids — confirm.
    user_id = df.select("user_id").distinct()
    uid = df.select(

    #get the count of each user_id
    # NOTE(review): this renamed `counts` frame is not referenced again in the
    # visible code.
    counts = df.groupBy('user_id').count()  #Show count of each user_id
    counts = counts.selectExpr('user_id as user_id',
                               'count as n')  #Rename count as n

    #Create Train Test and Validation sets 60-20-20
    # train_size and vali_size are slice boundaries into `uid`; test_size is
    # computed but not used in the visible code (the test slice runs to the end).
    train_size = int(0.6 * len(uid))
    vali_size = train_size + int(0.2 * len(uid))
    test_size = vali_size + int(0.2 * len(uid))

    train_set = uid[:train_size]
    vali_set = uid[train_size:vali_size]
    test_set = uid[vali_size:]

    # Replace each id slice with the rows of `df` belonging to those users.
    train_set = df.filter(df.user_id.isin(train_set))
    vali_set = df.filter(df.user_id.isin(vali_set))
    test_set = df.filter(df.user_id.isin(test_set))

    # NOTE(review): both select(...) statements below are cut off; vali_uid and
    # test_uid are iterated as dictionary keys just after, so they are
    # presumably collections of user ids — confirm.
    vali_uid = vali_set.select(
    test_uid = test_set.select(

    #For each validation user, use half of their interactions for training,
    # Stratified sample: fraction 0.5 per user, fixed seed 40.
    validict = {i: 0.5 for i in vali_uid}
    new_vali = vali_set.sampleBy("user_id", fractions=validict, seed=40)

    testdict = {i: 0.5 for i in test_uid}
    new_test = test_set.sampleBy("user_id", fractions=testdict, seed=40)

    # Move the sampled validation rows out of the validation set and into train.
    vali_set = vali_set.exceptAll(new_vali)
    train_set = train_set.union(new_vali)

    # Same for the sampled test rows.
    test_set = test_set.exceptAll(new_test)
    train_set = train_set.union(new_test)

    # # ALS

    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.recommendation import ALS
    from pyspark.mllib.evaluation import RankingMetrics
    from pyspark.sql.functions import expr
    import itertools as it

    # Grid of 3 x 3 x 3 = 27 combinations; each element `i` is
    # (rank, regParam, alpha).
    rank_ = [5, 10, 15]
    regParam_ = [0.01, 0.1, 1.0]
    alpha_ = [1, 2, 5]
    param_grid = it.product(rank_, regParam_, alpha_)
    vals_list = []
    stats = []
    rmse_list = []
    # NOTE(review): best_map starts very high and the loop keeps a model when
    # its MAP is *lower* (map_ < best_map). MAP is a higher-is-better metric, so
    # this selection looks inverted — confirm intent.
    best_map = 999999999
    best_model = None
    for i in param_grid:
        print('Start Training for {}'.format(i))
        # NOTE(review): the ALS(...) call is cut off after rank; the remaining
        # arguments are not visible here.
        als = ALS(rank=i[0],
        model = als.fit(train_set)
        # Top-500 recommendations for every distinct validation user.
        user_subset = vali_set.select("user_id_index").distinct()
        userRecs = model.recommendForUserSubset(user_subset, 500)
        from pyspark.sql.functions import expr
        # NOTE(review): the true labels are built from test_set although the
        # recommendations above are for vali_set users, and no groupBy is
        # visible before .agg — confirm both.
        true_label = test_set.select('user_id_index', 'book_id_index')\
                .agg(expr('collect_list(book_id_index) as true_item'))
        # NOTE(review): the select(...) below is cut off in this excerpt.
        pred_label = userRecs.select('user_id_index',
        # Join predictions with labels per user and keep columns 1 and 2 of each
        # joined row as the pair handed to RankingMetrics.
        pred_true_rdd = pred_label.join(F.broadcast(true_label), 'user_id_index', 'inner') \
                .rdd \
                .map(lambda row: (row[1], row[2]))
        metrics = RankingMetrics(pred_true_rdd)
        map_ = metrics.meanAveragePrecision
        ndcg = metrics.ndcgAt(500)
        mpa = metrics.precisionAt(500)
        # NOTE(review): the RegressionEvaluator(...) call and the rating filter
        # below are both cut off in this excerpt.
        evaluator = RegressionEvaluator(metricName="rmse",
        predictions = model.transform(vali_set)
        predsDF = predictions.filter(predictions.rating.between(3,
        predsDF = spark.createDataFrame(predsDF)

        rmse = evaluator.evaluate(predsDF)
        # Track the "best" model (see the note on best_map above); stats only
        # gets a row when a new best is recorded.
        if map_ < best_map:
            best_model = model
            best_map = map_
            #print('New best model')
            stats.append([i[0], i[1], i[2], rmse])

        # NOTE(review): the `columns` list literal is never closed in this excerpt.
        columns = [
            'Alpha', 'Rank', 'RegParam', 'MAP', 'Precision', 'NDGC', 'RMSE'
        # Result row order: alpha, rank, regParam, MAP, precision, NDCG, RMSE.
        vals_list.append((i[2], i[0], i[1], map_, mpa, ndcg, rmse))
        print('MAP: %f' % map_)
        print('Precision: %f' % mpa)
        print('NDCG: %f' % ndcg)
        print('rmse %f:' % rmse)
        # One scatter point per combination: rank on x, RMSE on y.
        plt.scatter(i[0], rmse)



    # NOTE(review): `double` is not defined in the visible code, the lambda
    # ignores its `lines` argument, and it is not called in the visible code.
    convertToFloat = lambda lines: [double(x) for x in vals_list]
    #from pyspark.sql.types import *

    # Schema for the result rows, in the same order as the tuples appended to
    # vals_list. The field name "MAP " carries a trailing space.
    mySchema = StructType([ StructField("Alpha", IntegerType(), True)\

                           ,StructField("Rank", IntegerType(), True)\

                           ,StructField("Reg_Param", DoubleType())\

                           ,StructField("MAP ", DoubleType(), True)\

                           ,StructField("Precision", DoubleType(), True)\

                           ,StructField("NDGC", DoubleType(), True)\

                           ,StructField("RMSE", DoubleType(), True)])
    df = spark.createDataFrame(vals_list, schema=mySchema)

    #Evaluation of test set
    #print('Finish Training for {}'.format(i))
    # Top-500 recommendations from the selected model for every test user.
    user_subset = test_set.select("user_id_index").distinct()
    userRecs = best_model.recommendForUserSubset(user_subset, 500)

    # NOTE(review): no groupBy is visible before .agg here either, and the
    # pred_label select(...) below is cut off.
    true_label = test_set.select('user_id_index', 'book_id_index')\
                .agg(expr('collect_list(book_id_index) as true_item'))
    pred_label = userRecs.select('user_id_index',

    pred_true_rdd = pred_label.join(F.broadcast(true_label), 'user_id_index', 'inner') \
                .rdd \
                .map(lambda row: (row[1], row[2]))
    metrics = RankingMetrics(pred_true_rdd)
    map_ = metrics.meanAveragePrecision
    ndcg = metrics.ndcgAt(500)
    mpa = metrics.precisionAt(500)
    # NOTE(review): the RegressionEvaluator(...) call is cut off, and the
    # transform below uses `model` (the last model fitted in the loop) rather
    # than `best_model`, unlike the recommendations above — confirm.
    evaluator = RegressionEvaluator(metricName="rmse",
    predictions = model.transform(test_set)
    # Rows with rating between 3 and 5 are collected to the driver and turned
    # back into a DataFrame before scoring.
    predsDF = predictions.filter(predictions.rating.between(3, 5)).collect()
    predsDF = spark.createDataFrame(predsDF)

    rmse = evaluator.evaluate(predsDF)
    print('Test Metrics:')
    print('MAP: %f' % map_)
    print('Precision: %f' % mpa)
    print('NDCG: %f' % ndcg)
    print('rmse %f:' % rmse)

    #Latent Factors
    # Pull the user and item factor tables into pandas and stack each
    # `features` column into a 2-D array.
    # NOTE(review): `np` is imported further down inside this same function,
    # which makes it a local name; using it here, before that import runs,
    # raises UnboundLocalError.
    ufac_df = best_model.userFactors.toPandas()
    ifac_df = best_model.itemFactors.toPandas()
    ufac_matrix = np.vstack(ufac_df.features.values)
    ifac_matrix = np.vstack(ifac_df.features.values)

    import seaborn as sns
    sns.set(rc={'figure.figsize': (11.7, 8.27)})
    palette = sns.color_palette("hls", 10)
    import numpy as np
    import pandas as pd
    import sklearn
    from sklearn.manifold import TSNE

    # Embed user and item factors with the same TSNE object, one fit per matrix.
    X = ufac_matrix
    Y = ifac_matrix
    tsne = TSNE()
    X_embedded = tsne.fit_transform(X)
    Y_embedded = tsne.fit_transform(Y)
    # NOTE(review): both scatterplot(...) calls are cut off in this excerpt.
    plot_users = sns.scatterplot(X_embedded[:, 0],
                                 X_embedded[:, 1],
    plot_items = sns.scatterplot(Y_embedded[:, 0],
                                 Y_embedded[:, 1],

    # Persist both factor matrices as pickle files in the working directory.
    # NOTE(review): the files returned by open() are never closed explicitly.
    pkl.dump(ufac_matrix, open('ufac_matrix.pkl', 'wb'))
    pkl.dump(ifac_matrix, open('ifac_matrix.pkl', 'wb'))