Example #1
    def _evaluate(self, predicted_df):
        k = self.get_k()

        window_spec = Window.partitionBy('user').orderBy(
            col('prediction').desc())
        per_user_predicted_items_df = predicted_df \
            .select('user', 'item', 'prediction', F.rank().over(window_spec).alias('rank')) \
            .where('rank <= {0}'.format(k)) \
            .groupBy('user') \
            .agg(expr('collect_list(item) as items'))

        window_spec = Window.partitionBy('user').orderBy(
            col('starred_at').desc())
        per_user_actual_items_df = predicted_df \
            .select('user', 'item', 'starred_at', F.rank().over(window_spec).alias('rank')) \
            .where('rank <= {0}'.format(k)) \
            .groupBy('user') \
            .agg(expr('collect_list(item) as items'))

        per_user_items_rdd = per_user_predicted_items_df.join(F.broadcast(per_user_actual_items_df), 'user', 'inner') \
            .rdd \
            .map(lambda row: (row[1], row[2]))

        if per_user_items_rdd.isEmpty():
            return 0.0

        ranking_metrics = RankingMetrics(per_user_items_rdd)
        metric = ranking_metrics.ndcgAt(k)
        return metric
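
The final .map(lambda row: (row[1], row[2])) step above relies on the positional order of columns after the join. As a reference point, here is a minimal, self-contained sketch (toy IDs, not this project's data) of the pair format RankingMetrics consumes; note that meanAveragePrecision is a property, while precisionAt and ndcgAt are methods:

from pyspark.sql import SparkSession
from pyspark.mllib.evaluation import RankingMetrics

spark = SparkSession.builder.getOrCreate()

# RankingMetrics expects an RDD of (predicted item list, ground-truth item list) pairs.
pairs = spark.sparkContext.parallelize([
    ([1, 2, 3, 4, 5], [1, 3, 6]),  # user A: two of the five predictions are relevant
    ([7, 8, 9], [9]),              # user B: one of the three predictions is relevant
])
metrics = RankingMetrics(pairs)
print(metrics.precisionAt(5), metrics.ndcgAt(5), metrics.meanAveragePrecision)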
Example #2
def Tester(spark, model, df_test, rank, regParam, alpha, K=500):
    #df_test = spark.read.parquet(formatted_test_address)
    targetUsers = df_test.select("user_id_numeric").distinct()
    userRecs = model.recommendForUserSubset(targetUsers, K)
    userRecs = userRecs.select("user_id_numeric",
                               "recommendations.track_id_numeric",
                               "recommendations.rating")

    # need an ordered list of track_id per user, sorted by count (see the window/collect_list sketch after this example).
    # reference: https://stackoverflow.com/questions/46580253/collect-list-by-preserving-order-based-on-another-variable
    w = Window.partitionBy("user_id_numeric").orderBy(df_test['count'].desc())
    labels = df_test.withColumn('ActualRanking',
                                F.collect_list("track_id_numeric").over(w))
    labels = labels.select(['user_id_numeric', 'ActualRanking'
                            ]).dropDuplicates(['user_id_numeric'])

    # Get the metrics
    # predictionsAndlabels should be an RDD of (predicted ranking, ground truth set) pairs.
    # reference: https://spark.apache.org/docs/2.2.0/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics
    predictionsAndlabels = userRecs.join(
        labels, [labels.user_id_numeric == userRecs.user_id_numeric],
        'left').select('track_id_numeric', 'ActualRanking')
    metricsRank = RankingMetrics(predictionsAndlabels.rdd)

    print("------------------------------------------")
    print("Params: Rank %f | regParam %f | alpha = %f" %
          (rank, regParam, alpha))
    print("p(15)   %.8f" % metricsRank.precisionAt(15))
    print("p(500)   %.8f" % metricsRank.precisionAt(500))
    print("MAP  %.8f" % metricsRank.meanAveragePrecision)
    print("nDCG %.8f" % metricsRank.ndcgAt(K))
    return
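
The window/collect_list trick referenced in the comment above has a subtlety: with an ordered window, Spark's default frame runs only up to the current row, so each row collects just the items ranked at or above it, and dropDuplicates then keeps an arbitrary (possibly partial) list. A small sketch under toy column names, with an explicit full-partition frame that makes the intent unambiguous:

from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, 10, 5), (1, 11, 3), (1, 12, 9), (2, 10, 2), (2, 13, 7)],
    ['user_id_numeric', 'track_id_numeric', 'count'])

# Order by count within each user and collect over the whole partition,
# so every row carries the complete ordered list before dropDuplicates.
w = (Window.partitionBy('user_id_numeric')
           .orderBy(F.desc('count'))
           .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing))
labels = (df.withColumn('ActualRanking',
                        F.collect_list('track_id_numeric').over(w))
            .select('user_id_numeric', 'ActualRanking')
            .dropDuplicates(['user_id_numeric']))
labels.show(truncate=False)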
Example #3
def main(spark, txt):
    model = ALSModel.load('hdfs:/user/jm7955/' + args.model)
    distinct_users = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' %
                                        args.distinct)

    print("distinct_users")
    print('finished writing in %d seconds' % int(timer() - start))
    #distinct_users.show()
    labels = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' % args.labels)
    print("labels")
    #labels.show()
    print('finished writing in %d seconds' % int(timer() - start))

    predictions = model.recommendForUserSubset(distinct_users, 500)\
        .select('user', F.col('recommendations.item').alias('item'))
    print("predictions")
    #predictions.show()
    print('finished writing in %d seconds' % int(timer() - start))
    predictionsAndLabels = predictions.join(
        labels, ["user"], "inner").rdd.map(lambda tup: (tup[1], tup[2]))
    print("predictionsAndLabels")
    print('finished writing in %d seconds' % int(timer() - start))

    metrics = RankingMetrics(predictionsAndLabels)
    print('finished writing in %d seconds' % int(timer() - start))

    file = open(txt, 'w')

    file.write('metrics.meanAveragePrecision: %s\n' %
               metrics.meanAveragePrecision)
    file.write('metrics.precisionAt(500) %s\n' % metrics.precisionAt(500))
    file.write('metrics.ndcgAt(500) %s\n' % metrics.ndcgAt(500))
    file.close()
def main(spark, model_file, test_file):
    '''
    Parameters
    ----------
    spark : SparkSession object
    model_file : string, path to the serialized model to load
    test_file : string, path to the parquet file to load
    '''

    # Load the parquet file
    test = spark.read.parquet(test_file)
    test = test.sort('user', ascending=False)
    test.createOrReplaceTempView('test_table')
    model = ALSModel.load(model_file)

    user_subset = test.select("user").distinct()
    user_subset = model.recommendForUserSubset(user_subset, 500)

    user_subset = user_subset.select("user",
                                     col("recommendations.item").alias("item"))
    user_subset = user_subset.sort('user', ascending=False)
    print("sort user")
    predictionAndLabels = user_subset.join(
        test, ["user"], "inner").rdd.map(lambda tup: (tup[1], tup[2]))
    print("joined predictions and counts")

    metrics = RankingMetrics(predictionAndLabels)
    print("made metrics")
    MAP = metrics.meanAveragePrecision
    precision = metrics.precisionAt(500)
    ndcg = metrics.ndcgAt(500)

    print('MAP: %f' % MAP)
    print('Precision: %f' % precision)
    print('NDCG: %f' % ndcg)
Example #5
def evaluateTopk(model,data,top_k=500):
    '''
    Input:
    data: RDD of (user, product (book_id), rating) triples
    (a toy end-to-end sketch of this flow follows the example)
    '''
    truth=spark.createDataFrame(data).groupby("user").agg(F.collect_set("product"))
    print("Getting Predictions...")
    tmp1=model.recommendProductsForUsers(top_k).map(lambda r: [r[0],[k.product for k in r[1]]])
    predictions=spark.createDataFrame(tmp1,["user","predictions"])


    print("Predictions and Labels...")
    k=predictions.join(truth,truth.user==predictions.user)
    final=k.rdd.map(lambda r: [r[1],r[3]])
    metrics=RankingMetrics(final)

    print("\nCalculate NDCG at {}...".format(top_k))
    res1=metrics.ndcgAt(top_k)
    print("NDCG at {}: {}".format(top_k,res1))

    print("\nCalculate MAP...")
    res2=metrics.meanAveragePrecision
    print("MAP: {}".format(res2))

    print("\nCalculate Precision at {}...".format(top_k))
    res3=metrics.precisionAt(top_k)
    print("Precision at {}: {}".format(top_k,res1))

    return res1,res2,res3
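
A toy end-to-end sketch of the flow evaluateTopk expects, using the RDD-based pyspark.mllib ALS API that provides recommendProductsForUsers; the ratings and hyperparameters below are made up:

from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS, Rating
from pyspark.mllib.evaluation import RankingMetrics

sc = SparkContext.getOrCreate()
ratings = sc.parallelize([Rating(1, 10, 4.0), Rating(1, 11, 2.0),
                          Rating(2, 10, 5.0), Rating(2, 12, 3.0)])
model = ALS.train(ratings, rank=2, iterations=5, seed=0)

top_k = 2
# (user, [Rating, ...]) -> (user, [product, ...]) ordered by predicted score
preds = model.recommendProductsForUsers(top_k) \
             .map(lambda r: (r[0], [rec.product for rec in r[1]]))
truth = ratings.map(lambda r: (r.user, r.product)).groupByKey().mapValues(list)
metrics = RankingMetrics(preds.join(truth).values())
print(metrics.precisionAt(top_k), metrics.ndcgAt(top_k), metrics.meanAveragePrecision)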
Example #6
def evaluation(df, model, ks):
	'''
	Evaluate the model.
	ks: a list of parameter k used in precision at k and NDCG at k.
	'''

	print(' Make predictions...')
	predictions = model.recommendForUserSubset(df, 500)

	print(' Prepare ground truth set and predicted set...')
	labels = df.groupBy('user').agg(F.collect_set('item')).collect()
	user_pred = predictions.select('user','recommendations.item').rdd.flatMap(lambda x:[x]).collect()
	labels = sorted(labels, key = lambda x: x.user)
	user_pred = sorted(user_pred, key = lambda x: x.user)
	print(' Combine ground truth set and predicted set...')
	predictionAndLabels = []
	for i in range(len(user_pred)):
		predictionAndLabels.append((user_pred[i].item, labels[i][1]))
	print(' Parallelize...')
	predictionAndLabels = sc.parallelize(predictionAndLabels, numSlices=2000)
	print(' Calculate metrics...')
	metrics = RankingMetrics(predictionAndLabels)
	eval_results = []
	eval_results.append(metrics.meanAveragePrecision)
	for k in ks:
		eval_results.append(metrics.precisionAt(k))
		eval_results.append(metrics.ndcgAt(k))

	return eval_results
Example #7
def get_rankMetrics(spark, df, trained_model, approx=False, k=500):
    """
    This function evaluates the performance of a given model on a given dataset using Ranking Metrics,
    and returns the final performance metrics.

    Parameters
    ----------
    df: DataFrame to evaluate on
    trained_model: trained model to evaluate
    approx: boolean; use ANN(approximate nearest neighbors) when True
    k: number of recommendations per user
    ----------
    """
    import datetime
    import nmslib_recommend2
    import pyspark.sql.functions as F
    from pyspark.mllib.evaluation import RankingMetrics

    # change column names
    df = df.select(['user_id', 'book_id',
                    'rating']).toDF('user', 'item', 'rating')

    # an item is relevant if its rating is at least 3
    fn = F.udf(lambda x: 1.0 if x >= 3 else 0.0, 'double')
    df = df.withColumn('rating', fn(df.rating))
    relevant = df[df.rating == 1.0].groupBy('user').agg(F.collect_list('item'))

    # recommend k items for each user
    print("recommendation time comparison start: ",
          datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    if approx:
        recommend = nmslib_recommend2.nmslib_recommend(spark, df,
                                                       trained_model, k)
        recommend = spark.createDataFrame(recommend, ["user", "recommend"])
        joined = recommend.join(relevant, on='user')
        rec_and_rel = []
        for user, rec, rel in joined.collect():
            rec_and_rel.append((rec, rel))
    else:
        userSubset = relevant.select('user')
        recommend = trained_model.recommendForUserSubset(userSubset, k)
        joined = recommend.join(relevant, on='user')
        rec_and_rel = []
        for user, rec, rel in joined.collect():
            predict_items = [i.item for i in rec]
            rec_and_rel.append((predict_items, rel))
    print("recommendation time comparison end: ",
          datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    # Compute metrics
    rec_and_rel_rdd = spark.sparkContext.parallelize(rec_and_rel)
    metric_class = RankingMetrics(rec_and_rel_rdd)

    ndcg = metric_class.ndcgAt(k)
    map_ = metric_class.meanAveragePrecision
    pk = metric_class.precisionAt(k)

    return print("NDCG:", ndcg, "\nMAP:", map_, "\nPrecision:", pk)
Example #8
def evaluate(model, test_data):
    test = test_data.map(lambda p: (p[0], p[1]))
    ret = model.predictAll(test) \
        .map(lambda r: (r.user, (r.product, r.rating))) \
        .groupByKey() \
        .mapValues(lambda l: sorted(l, key=lambda x: x[1], reverse=True)) \
        .mapValues(lambda l: [x[0] for x in l])
    gt_items = test_data.filter(lambda p: p[2] == 1.0).map(lambda r: (r[0], [r[1]]))
    predictionAndLabels = ret.join(gt_items).map(lambda r: (r[1][0], list(r[1][1])))
    metrics = RankingMetrics(predictionAndLabels)
    return metrics.ndcgAt(TopK)
Example #9
def top_k_rankingmetrics(dataset=None,
                         k=10,
                         ranking_metrics="precisionAt",
                         user="******",
                         item="book_id",
                         rating="rating",
                         prediction="prediction"):
    '''
	Compute a ranking metric (precisionAt, meanAveragePrecision, or ndcgAt) from a predictions DataFrame.
	Input:
	1. k: only evaluate the performance of the top k items
	2. ranking_metrics: one of precisionAt, meanAveragePrecision, ndcgAt
	3. user, item, rating, prediction: column names; string type
	(a hypothetical call is shown after this function)

	refer to https://vinta.ws/code/spark-ml-cookbook-pyspark.html
	'''
    if dataset is None:
        print("Error! Please specify a dataset.")
        return
    # prediction table
    windowSpec = Window.partitionBy(user).orderBy(col(prediction).desc())
    perUserPredictedItemsDF = dataset \
     .select(user, item, prediction, F.rank().over(windowSpec).alias('rank')) \
     .where('rank <= {}'.format(k)) \
     .groupBy(user) \
     .agg(expr('collect_list({}) as items'.format(item)))
    # actual target table
    windowSpec = Window.partitionBy(user).orderBy(col(rating).desc())
    perUserActualItemsDF = dataset \
     .select(user, item, rating, F.rank().over(windowSpec).alias('rank')) \
     .where('rank <= {}'.format(k)) \
     .groupBy(user) \
     .agg(expr('collect_list({}) as items'.format(item)))
    # join
    perUserItemsRDD = perUserPredictedItemsDF \
     .join(F.broadcast(perUserActualItemsDF), user, 'inner') \
     .rdd \
     .map(lambda row: (row[1], row[2]))
    ranking_metrics_evaluator = RankingMetrics(perUserItemsRDD)
    # get the result of the metric
    if ranking_metrics == "precisionAt":
        precision_at_k = ranking_metrics_evaluator.precisionAt(k)
        #print("precisionAt: {}".format(round(precision_at_k, 4)))
        return precision_at_k
    elif ranking_metrics == "meanAveragePrecision":
        mean_avg_precision = ranking_metrics_evaluator.meanAveragePrecision
        #print("meanAveragePrecision: {}".format(round(mean_avg_precision, 4)))
        return mean_avg_precision
    elif ranking_metrics == "ndcgAt":
        ndcg_at_k = ranking_metrics_evaluator.ndcgAt(k)
        #print("meanAveragePrecision: {}".format(round(ndcg_at_k, 4)))
        return ndcg_at_k
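
A hypothetical call to top_k_rankingmetrics, assuming the names it uses internally (Window, col, expr, F, RankingMetrics) are already imported; the toy DataFrame and column names are illustrative, and the masked user="******" default is overridden explicitly:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
predicted_df = spark.createDataFrame(
    [(1, 101, 5.0, 4.8), (1, 102, 3.0, 4.1),
     (2, 101, 4.0, 3.9), (2, 103, 2.0, 4.5)],
    ['user_id', 'book_id', 'rating', 'prediction'])

ndcg_at_2 = top_k_rankingmetrics(dataset=predicted_df, k=2,
                                 ranking_metrics="ndcgAt",
                                 user="user_id", item="book_id",
                                 rating="rating", prediction="prediction")
print("ndcgAt(2):", ndcg_at_2)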
def dummy_run(spark):

    from pyspark.ml.recommendation import ALS
    from pyspark.mllib.evaluation import RankingMetrics
    import pyspark.sql.functions as F
    from pyspark.sql.functions import expr

    train=spark.createDataFrame(
    [
        (82, 124, 5.0),
        (64, 123, 4.0),
        (27, 122, 3.0),
        (25, 122, 1.0),
        (12, 124, 2.0)
    ],
    ['user_id', 'book_id', 'rating'] 
    )

    val=spark.createDataFrame(
    [
        (82, 123, 5.0),
        (64, 122, 4.0),
        (27, 124, 3.0),
        (64, 123, 2.0),
        (12, 122, 4.0)
    ],
    ['user_id', 'book_id', 'rating'] 
    )

    user_id = val.select('user_id').distinct()
    true_label = val.select('user_id', 'book_id')\
                .groupBy('user_id')\
                .agg(expr('collect_list(book_id) as true_item'))

    als = ALS(rank = 3 , regParam=0.1, 
                userCol="user_id", itemCol="book_id", ratingCol='rating', 
                implicitPrefs=False, coldStartStrategy="drop")
    model = als.fit(train)

    recs = model.recommendForUserSubset(user_id, 2)
    pred_labels = recs.select('user_id','recommendations.book_id')
    pred_true_rdd = pred_labels.join(F.broadcast(true_label), 'user_id', 'inner') \
                .rdd \
                .map(lambda row: (row[1], row[2]))
    
    metrics = RankingMetrics(pred_true_rdd)
    mean_ap = metrics.meanAveragePrecision
    ndcg_at_k = metrics.ndcgAt(2)
    p_at_k= metrics.precisionAt(2)
    print('MAP: ', mean_ap , 'NDCG: ', ndcg_at_k, 'Precision at k: ', p_at_k)
    return 
Example #11
    def __evaluate_ranking(self, rnk_inf: SparkDF):
        test_ground_truth = self.__test.groupBy("user_id").agg(collect_list("business_id").alias("business_gt"))

        pred_with_labels = rnk_inf.join(test_ground_truth, on="user_id").drop("user_id")

        metrics = RankingMetrics(pred_with_labels.rdd)

        results = {}

        for m in self.ranking_metrics:
            metric_name = "{}@{}".format(m, self.top_k)
            if "ndcg" in m:
                results[metric_name] = metrics.ndcgAt(self.top_k)
            elif m == "precision":
                results[metric_name] = metrics.precisionAt(self.top_k)

        return results
Example #12
def recsys(spark):
    # Load data from parquet
    val = spark.read.parquet("val_set.parquet")
    test = spark.read.parquet("test_set.parquet")
    cols_to_drop = ['is_read', 'is_reviewed']
    test = test.drop(*cols_to_drop)
    val = val.drop(*cols_to_drop)

    # Load model from path
    model_path = "hdfs:/user/ago265/best_model"
    best_model = ALSModel.load(model_path)

    # Compile a list of all the books each user read
    val_users = val.select("user_id").distinct()

    val_books = val.select("user_id", "book_id")\
                                .groupBy("user_id")\
                                .agg(expr('collect_list(book_id) as books'))

    test_users = test.select("user_id").distinct()
    test_books = test.select("user_id", "book_id").groupBy("user_id").agg(expr('collect_list(book_id) as books'))


    # # Recommender System for all users at k=500
    # k = 500
    # print('Making top 500 recommendations for all users')
    # rec = best_model.recommendForAllUsers(k)

    # Recommender System for subset of users at k=10
    k = 10
    print('Making top {} recommendations for a subset of users'.format(k))
    rec = best_model.recommendForUserSubset(test_users, k)
    pred_label = rec.select('user_id','recommendations.book_id')

    # Create an RDD to evaluate with Ranking Metrics
    final_df = pred_label.join(test_books,['user_id'],'inner').select('book_id','books')
    final_rdd = final_df.rdd.map(lambda x: (x.book_id, x.books))
    
    metrics = RankingMetrics(final_rdd)
    result1 = metrics.meanAveragePrecision
    result2 = metrics.precisionAt(k)
    result3 = metrics.ndcgAt(k)
    print("MAP = ", result1)
    print("Precision at k = ", result2)
    print("NDCG at k = ", result3)
Example #13
def get_val_metrics(model, val):
    preds = model.transform(val)
    recs = model.recommendForUserSubset(val, 500)
    
    top_items = recs.selectExpr('user as user', 'recommendations.item as top_items')
    true_items = val.where(val.rating >= 3).groupby('user').agg(collect_list('item').alias('true_item_list'))
    predictions_and_labels_rankings = top_items.join(true_items, how = 'inner', on = 'user')\
        .select('true_item_list', 'top_items')
    
    predictions_and_labels_rankings.write.json('val_recs.json')
    
    ranking_metrics = RankingMetrics(predictions_and_labels_rankings.cache().rdd)
    prec_at = ranking_metrics.precisionAt(500)
    mean_avg_prec = ranking_metrics.meanAveragePrecision
    ndcg = ranking_metrics.ndcgAt(500)
    
    evaluator = RegressionEvaluator(predictionCol = 'prediction', labelCol = 'rating', metricName = 'rmse')
    rmse = evaluator.evaluate(preds)
    return rmse, prec_at, mean_avg_prec, ndcg
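
For the RMSE part above, a minimal stand-alone sketch of RegressionEvaluator on a made-up predictions frame, independent of any ALS model:

from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder.getOrCreate()
toy_preds = spark.createDataFrame(
    [(3.0, 2.7), (5.0, 4.4), (1.0, 2.1)], ['rating', 'prediction'])

evaluator = RegressionEvaluator(predictionCol='prediction',
                                labelCol='rating', metricName='rmse')
print(evaluator.evaluate(toy_preds))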
Example #14
def main(spark):
    val_df = spark.read.parquet(
        'hdfs:/user/jm7955/test_full_indexed.parquet').drop('count')
    labels = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' % args.labels)

    predictions = val_df.groupBy("item").count().orderBy(
        "count", ascending=False).limit(500).collect()
    predictions = [row.item for row in predictions]
    print("predictions")
    #predictions.show()
    print('finished writing in %d seconds' % int(timer() - start))
    predictionsAndLabels = labels.rdd.map(lambda tup: (predictions, tup[1]))
    print("predictionsAndLabels")
    print('finished writing in %d seconds' % int(timer() - start))

    metrics = RankingMetrics(predictionsAndLabels)
    print('finished writing in %d seconds' % int(timer() - start))

    print('metrics.meanAveragePrecision: %s\n' % metrics.meanAveragePrecision)
    print('metrics.precisionAt(500) %s\n' % metrics.precisionAt(500))
    print('metrics.ndcgAt(500) %s\n' % metrics.ndcgAt(500))
Example #15
def main(spark, model_file, data_file, K):
    '''Main routine for Collaborative Filtering Model testing

        Parameters
        ----------
        spark: SparkSession object

        model_file: string, path to store the model

        data_file: string, path to the parquet file to load

        K: int, evaluations are based on predictions of the top K items for each user
        '''
    testIdx = spark.read.parquet(data_file)
    model = ALSModel.load(model_file)

    users_val = testIdx.select("user_idx").distinct()

    perUserPredictedItemsDF = model.recommendForUserSubset(users_val, K)
    perUserPredictedItemsDF = perUserPredictedItemsDF.select(
        "user_idx", "recommendations.track_idx").withColumnRenamed(
            'user_idx', 'user').withColumnRenamed('track_idx', 'items')

    w2 = Window.partitionBy('user_idx').orderBy(col('count').desc())
    perUserActualItemsDF = testIdx.select(
        'user_idx', 'track_idx', 'count',
        F.rank().over(w2).alias('rank')).where(
            'rank <= {0}'.format(K)).groupBy('user_idx').agg(
                expr('collect_list(track_idx) as items')).withColumnRenamed(
                    'user_idx', 'user')

    perUserItemsRDD = perUserPredictedItemsDF.join(
        perUserActualItemsDF, 'user').rdd.map(lambda row: (row[1], row[2]))
    rankingMetrics = RankingMetrics(perUserItemsRDD)

    print("============================================")
    print("meanAveragePrecision = %.8f" % rankingMetrics.meanAveragePrecision)
    print("precisionAt(K) = %.8f" % rankingMetrics.precisionAt(K))
    print("ndcgAt(K) = %.8f" % rankingMetrics.ndcgAt(K))
def main(spark, model_file, test_file):
    test_data = spark.read.parquet(test_file)
    als_model_tuned = ALSModel.load(model_file)

    print("Imported trained model and test data sets")

    #generating true values of book_id for each user_id
    groundTruth_test = test_data.groupby("user_id").agg(
        F.collect_list("book_id").alias("test_truth"))
    print("Created ground truth df for test set")

    # user_test_list=spark.sql('select distinct user_id from groundTruth_val where user_id=14')
    # rec = als_model_normal.recommendForUserSubset(user_test_list,500)

    #generating recs
    rec = als_model_tuned.recommendForAllUsers(500)
    print("500 recommendations for all users generated")

    #creating dataframe to have both true values and predicted values
    predictions_test = rec.join(groundTruth_test,
                                rec.user_id == groundTruth_test.user_id,
                                'inner')

    #converting to rdd for RankingMetrics()
    predAndLabels_test = predictions_test.select('recommendations.book_id',
                                                 'test_truth').rdd.map(tuple)

    print("starting ranking metrics for test data")
    metrics_test = RankingMetrics(predAndLabels_test)

    #calculating metrics
    precision_test = metrics_test.precisionAt(500)
    map_test = metrics_test.meanAveragePrecision
    ndcg_test = metrics_test.ndcgAt(500)

    print('Test set , Precision at 500: {}'.format(precision_test))
    print('Test set , Mean Average Precision : {}'.format(map_test))
    print('Test set, ndcgAt500 : {}'.format(ndcg_test))
def main(spark, rank, regParam, path, fraction):
    TEMP_PATH = "/models/ALS_{}_{}_{}".format(rank, regParam, fraction)
    ALS_PATH = TEMP_PATH + "/als"
    MODEL_PATH = TEMP_PATH + "/als_model"
    print("Loading model...")
    als = ALS.load(path + ALS_PATH)
    model = ALSModel.load(path + MODEL_PATH)
    print("Loading data...")
    testing = spark.read.parquet("{}/data/processed/testing_{}.parquet".format(
        path, fraction))
    testing.createOrReplaceTempView("testing")

    # RMSE
    predictions = model.transform(testing)
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("RSME:", rmse)
    predictions = model.recommendForAllUsers(500)
    predictions.createOrReplaceTempView("predictions")
    groundtruth = testing.groupby("user_id").agg(
        F.collect_set("book_id").alias('groundtruth'))
    groundtruth.createOrReplaceTempView("groundtruth")
    total = spark.sql(
        "SELECT g.user_id, g.groundtruth AS groundtruth, p.recommendations AS predictions FROM groundtruth g JOIN predictions p ON g.user_id = p.user_id"
    )
    total.createOrReplaceTempView("total")

    data = total.selectExpr("predictions.book_id", "groundtruth")
    print("df to rdd...")
    rdd = data.rdd.map(tuple)
    print("creating metrics...")
    metrics = RankingMetrics(rdd)
    print("meanAveragePrecision:", metrics.meanAveragePrecision)
    print("precision at 500:", metrics.precisionAt(500))
    print("ndcgAt 500:", metrics.ndcgAt(500))
Example #18
def Ranking_evaluator (spark,model, val, metric_type):
    
    val.createOrReplaceTempView('val')                        
    val_user = spark.sql('SELECT DISTINCT user_id FROM val')  
    #val_user = val.select('user_id').distinct()
    val_rec = model.recommendForUserSubset(val_user,500)
    #val_rec.printSchema()
    
    val_rec = val_rec.select('user_id','recommendations',f.posexplode('recommendations')).drop('pos').drop('recommendations')
    val_rec = val_rec.select('user_id',f.expr('col.book_id'),f.expr('col.rating'))
    
    w= Window.partitionBy('user_id')
    val_recrank=val_rec.select('user_id',f.collect_list('book_id').over(w).alias('rec_rank')).sort('user_id').distinct()
   
    val = val.sort(f.desc('rating'))
    val_truerank=val.select('user_id', f.collect_list('book_id').over(w).alias('true_rank')).sort('user_id').distinct()
    
    scoreAndLabels = val_recrank.join(val_truerank,on=['user_id'],how='inner')
    
    rankLists=scoreAndLabels.select("rec_rank", "true_rank").rdd.map(lambda x: tuple([x[0],x[1]])).collect()
    ranks = spark.sparkContext.parallelize(rankLists)
    
    metrics = RankingMetrics(ranks)
    
    MAP = metrics.meanAveragePrecision
    Precision = metrics.precisionAt(500)
    NDCG = metrics.ndcgAt(500)
    
    if metric_type == 'Precision':
        return Precision, {'MAP': MAP,'NDCG': NDCG}
    elif metric_type == 'MAP':
        return MAP, {'Precision': Precision,'NDCG': NDCG}
    elif metric_type == 'NDCG':
        return NDCG, {'MAP': MAP, 'Precision': Precision}
    else:
        return None
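
A toy illustration of the posexplode step used in Ranking_evaluator above, on a hand-built recommendations column (an array of structs; the struct fields here are _1 and _2 rather than book_id and rating), independent of any ALS model:

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()
recs = spark.createDataFrame(
    [(1, [(101, 4.9), (102, 4.1)]), (2, [(103, 3.8)])],
    ['user_id', 'recommendations'])

# posexplode turns each array element into its own row (pos, col);
# the struct fields are then pulled out of 'col'.
flat = (recs.select('user_id', f.posexplode('recommendations'))
            .select('user_id', f.col('col._1').alias('book_id'),
                    f.col('col._2').alias('rating')))
flat.show()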
Example #19
def main(spark, log_comp=False, drop_low=False, drop_thr=0):
    '''

    Parameters
    ----------
    spark : SparkSession object

    log_comp : bool, whether to apply log-compression to the interaction counts

    drop_low : bool, whether to drop interactions with counts at or below drop_thr

    drop_thr : int, count threshold used when drop_low is True
    '''
    ## Load in datasets
    train_path = 'hdfs:/user/bm106/pub/project/cf_train.parquet'
    val_path = 'hdfs:/user/bm106/pub/project/cf_validation.parquet'
    test_path = 'hdfs:/user/bm106/pub/project/cf_test.parquet'

    train = spark.read.parquet(train_path)
    val = spark.read.parquet(val_path)
    test = spark.read.parquet(test_path)

    ## Downsample the data
    # Pick out user list in training set
    user_train = set(row['user_id']
                     for row in train.select('user_id').distinct().collect())
    # Pick out user list in validation set
    user_val = set(row['user_id']
                   for row in val.select('user_id').distinct().collect())
    # Get the previous 1M users
    user_prev = list(user_train - user_val)
    # Random sampling to get 20%
    k = int(0.2 * len(user_prev))
    user_prev_filtered = random.sample(user_prev, k)
    train = train.where(train.user_id.isin(user_prev_filtered +
                                           list(user_val)))

    ## Create StringIndexer
    indexer_user = StringIndexer(inputCol="user_id",
                                 outputCol="user_id_indexed",
                                 handleInvalid='skip')
    indexer_user_model = indexer_user.fit(train)
    indexer_track = StringIndexer(inputCol="track_id",
                                  outputCol="track_id_indexed",
                                  handleInvalid='skip')
    indexer_track_model = indexer_track.fit(train)

    train = indexer_user_model.transform(train)
    train = indexer_track_model.transform(train)

    val = indexer_user_model.transform(val)
    val = indexer_track_model.transform(val)

    test = indexer_user_model.transform(test)
    test = indexer_track_model.transform(test)

    ## ALS model
    rank_ = [5, 10, 20]
    regParam_ = [0.1, 1, 10]
    alpha_ = [1, 5, 10]
    param_grid = it.product(rank_, regParam_, alpha_)

    ## Pick out users from validation set
    user_id = val.select('user_id_indexed').distinct()
    true_label = val.select('user_id_indexed', 'track_id_indexed')\
                    .groupBy('user_id_indexed')\
                    .agg(expr('collect_list(track_id_indexed) as true_item'))

    ## Log-Compression
    ## count -> log(1+count)
    if log_comp == True:
        train = train.select('*', F.log1p('count').alias('count_log1p'))
        val = val.select('*', F.log1p('count').alias('count_log1p'))
        rateCol = "count_log1p"
    else:
        rateCol = "count"

    ## Drop interactions whose counts are at or below the specified threshold
    if drop_low == True:
        train = train.filter(train['count'] > drop_thr)
        val = val.filter(val['count'] > drop_thr)

    for i in param_grid:
        print('Start Training for {}'.format(i))
        als = ALS(rank = i[0], maxIter=10, regParam=i[1], userCol="user_id_indexed", itemCol="track_id_indexed", ratingCol=rateCol, implicitPrefs=True, \
            alpha=i[2], nonnegative=True, coldStartStrategy="drop")
        model = als.fit(train)
        print('Finish Training for {}'.format(i))

        # Make top 500 recommendations for users in validation test
        res = model.recommendForUserSubset(user_id, 500)
        pred_label = res.select('user_id_indexed',
                                'recommendations.track_id_indexed')

        pred_true_rdd = pred_label.join(F.broadcast(true_label), 'user_id_indexed', 'inner') \
                    .rdd \
                    .map(lambda row: (row[1], row[2]))

        print('Start Evaluating for {}'.format(i))
        metrics = RankingMetrics(pred_true_rdd)
        map_ = metrics.meanAveragePrecision
        ndcg = metrics.ndcgAt(500)
        mpa = metrics.precisionAt(500)
        print(i, 'map score: ', map_, 'ndcg score: ', ndcg, 'precision at 500: ', mpa)

    pass
# |273            |[41816, 27149, 34678, 7667, 44085] |
# |300            |[252, 273, 249, 70526, 19087]      |
# |412            |[28731, 8672, 377, 3113, 12806]    |
# |434            |[8641, 4373, 59438, 9138, 3075]    |
# |475            |[341, 3367, 52732, 5522, 376]      |
# |585            |[10539, 1093, 92301, 1118, 4265]   |
# |600            |[249, 399, 239, 1329, 398]         |
# |611            |[147361, 16719, 4348, 13235, 5355] |
# |619            |[5434, 9311, 20623, 32116, 9872]   |
# +---------------+-----------------------------------+

combined = (
    recommended_songs.join(relevant_songs, on='user_id_encoded', how='inner')
    .rdd
    .map(lambda row: (row[1], row[2]))
)
combined.cache()
combined.count()

# 929537

combined.take(1)

# ([107048, 127769, 129688, 113295, 145331], [43243, 32053, 32958, 25699, 33861])

rankingMetrics = RankingMetrics(combined)
ndcgAtK = rankingMetrics.ndcgAt(k)
print(ndcgAtK)

# 1.8102832147923323e-05
# (df.show() output omitted: per-user recommended and relevant song ID lists; only showing top 5 rows)

from pyspark.mllib.evaluation import RankingMetrics

# (c)
# calculate ranking metricss
metrics = RankingMetrics(
    recommendation_and_relevant \
    .select(['song_recommended', 'song_relevant']) \
    .rdd \
    .map(lambda row: (row[0], row[1]))
    )

print("PRECISION @ 10: ", metrics.precisionAt(N))
print("MAP @ 10: ", metrics.meanAveragePrecision)
print("NDCG @10: ", metrics.ndcgAt(N))

#PRECISION @ 10:  0.7214197272224288
#MAP @ 10:  0.727511046690355
#NDCG @10:  0.8838562387037185

K = 5
print("PRECISION @ 5: ", metrics.precisionAt(K))
print("MAP @ 5: ", metrics.meanAveragePrecision)
print("NDCG @5: ", metrics.ndcgAt(K))

#PRECISION @ 5:  0.8564546973286272
#MAP @ 5:  0.727511046690355
#NDCG @5:  0.8868546173633727
# (df.show() output omitted: per-user collect_list of relevant song IDs, one very long row per user)

test_metric = temp.join(test_array, on="user_label_int", how="left")

test_metric = test_metric.drop("user_label_int")
# (df.show(1) output omitted: columns user_label_int, recommends, collect_list(song_label_int))
# only showing top 1 row

test_rdd = test_metric.rdd

from pyspark.mllib.evaluation import RankingMetrics

metrics_2 = RankingMetrics(test_rdd)

metrics_2.precisionAt(5)
#  0.06123845908946199

metrics_2.ndcgAt(10)
# 0.05557657159203472

metrics_2.meanAveragePrecision
# 0.00965199857704829
Example #23

compare = recommends.join(ground_truths, on='user', how='left')
compare = [(r.__getattr__('recommends'), r.__getattr__('ground_truths'))
           for r in compare.collect()]
compare = sc.parallelize(compare)

# Alternative method
compare = sc.parallelize(compare.collect())  # this takes longer than the '__getattr__()' approach above

# print metrics
metrics = RankingMetrics(compare)
print(metrics.precisionAt(5))
# 0.06666666666666667
print(metrics.ndcgAt(10))
# 0.06506334166535027
print(metrics.meanAveragePrecision)
# 0.027777777777777776

# predict test and rmse
predict = model.transform(test)
predict = predict.filter(F.col('prediction') != float('nan'))
reg_eval = RegressionEvaluator(predictionCol='prediction',
                               labelCol='rating',
                               metricName='rmse')
reg_eval.evaluate(predict)
# 4.856047802562721

# testing NDCG metric on bad documents
set1 = sc.parallelize([([1, 2, 3], [1, 2])])
Example #24
def main(spark, val_pq, model_file_path):
    '''
    Args
    -------
    val_pq:
        validation data
    
    model_file_path:
        path to the pipeline(stringIndexers + als) model
    '''

    # Read data
    val = spark.read.parquet(val_pq)

    print('load trained model')
    # Load the trained pipeline model
    model = PipelineModel.load(model_file_path)

    # evaluation

    print("Run prediction")
    # Run the model to create prediction against a validation set
    preds = model.transform(val)
    
    print("Run evaluation")

    # model evaluation using rmse on val data
    print("Start evaluation using rmse")
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    rmse = evaluator.evaluate(preds)

    # Generate top 500 book recommendations for each user in validation data.
    # Returns a DataFrame of (userCol, recommendations), 
    # where recommendations are stored as an array of (itemCol, rating) Rows.
    #user_id = preds.select("user_id_idx").distinct()
    #res = model.stages[-1].recommendForUserSubset(user_id, 500)

    print("generate top 500 book recommendations for val users")
    res = model.stages[-1].recommendForAllUsers(500)
    preds_per_user = res.selectExpr("user_id_idx", "recommendations.book_id_idx as preds_books")
    # preds_per_user.show(5)

    true_per_user = preds.select("user_id_idx","book_id_idx").filter("rating>=3")\
                        .groupBy("user_id_idx")\
                        .agg(expr("collect_set(book_id_idx) as books"))
    # true_per_user.show(5)

    
    print("Start join")
    # true_vs_preds_per_user: an RDD of (predicted ranking, ground
    # truth set) pairs
    # true_vs_preds_per_user = preds_per_user.join(true_per_user, ["userId"]).rdd\
    #                 .map(lambda row: (row.items_pred, row.items)).cache()

    true_vs_preds_per_user = preds_per_user.join(true_per_user, ["user_id_idx"])\
                    .select("preds_books","books").rdd

    # print(*true_vs_preds_per_user.take(5),sep="\n")

    # Evaluate using RMSE
    #evaluator = RegressionEvaluator(metricName="rmse",labelCol="rating",predictionCol="??")
    #rmse = evaluator.evaluate(preds)
    #print(f'The out-of-sample RMSE of the current model is: {rmse:.2f}')

    # Evaluate using MAP
    print("Start evaluation using MAP")
    metrics = RankingMetrics(true_vs_preds_per_user)
    map_ = metrics.meanAveragePrecision

    #Evaluate using ndcg
    print("Start evaluation using ndcg")
    ndcg = metrics.ndcgAt(500)
    
    #Evaluate using precision
    mpa = metrics.precisionAt(500)

    print('rmse score: ', rmse, 'map score: ', map_, 'ndcg score: ', ndcg, 'mpa score: ', mpa)
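
The "recommendations.book_id_idx" projection above pulls a single field out of an array-of-structs column and yields an array. A toy illustration with made-up names (struct fields here are _1 and _2):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
res = spark.createDataFrame(
    [(1, [(101, 4.9), (102, 4.1)])], ['user_id_idx', 'recommendations'])

# Field access on an array of structs returns an array of that field's values.
res.selectExpr('user_id_idx', 'recommendations._1 as preds_books').show(truncate=False)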
# Join the song recommendations and actual songs played together for each user.
user_songs = (
    predicted_songs_per_user
    .join(
        actual_songs_per_user,
        on='User_ID_encoded',
        how='inner'
        )
    )

user_songs.show(20, 100)

# Select only the recommended and actual song columns and convert the dataframe to an rdd. 
user_songs_rdd = user_songs.select(F.col('recommended_songs'), F.col('relevant_songs')).rdd
user_songs_rdd.cache()

# Compute the ranking metrics for the collaborative filtering model.
rank_metrics = RankingMetrics(user_songs_rdd)

# Precision @ 5: 0.907 (3dp)
precision_at_5 = rank_metrics.precisionAt(5)
print(precision_at_5)

# NDCG @ 10: 0.906 (3dp)
ndcg_at_10 = rank_metrics.ndcgAt(10)
print(ndcg_at_10)

# MAP: 0.699 (3dp)
mean_average_precision = rank_metrics.meanAveragePrecision
print(mean_average_precision)
Example #26
             .config("spark.executor.memory", "8g")
             .config("spark.driver.memory", "8g")
             .getOrCreate())
    spark.sparkContext.setLogLevel("ERROR")

    train = spark.read.parquet(f'{sys.argv[1]}/train.parquet')
    test = spark.read.parquet(f'{sys.argv[1]}/test.parquet')
    validation = spark.read.parquet(f'{sys.argv[1]}/validation.parquet')

    als = ALS(rank=15, maxIter=5, regParam=0.001,userCol="user_id", itemCol="book_id", ratingCol="rating", seed=0,
              coldStartStrategy="drop")
    StartT = time.time()
    model = als.fit(train)
    EndT = time.time()
    T = EndT - StartT
    print(f"The running time is: {T}")
    predictions = model.transform(test)

    predictions = predictions.orderBy(predictions.prediction.desc())
    final_prediction = predictions.filter(predictions.prediction >= 0).groupBy("user_id").agg(F.collect_list("book_id").alias("prediction"))
    predictions = predictions.orderBy(predictions.rating.desc())
    final_rating = predictions.groupBy("user_id").agg(F.collect_list("book_id").alias("rating"))
    final = final_prediction.join(final_rating, final_prediction.user_id == final_rating.user_id, 'inner')\
        .select(final_prediction.user_id, final_prediction.prediction, final_rating.rating)
    metrics = RankingMetrics(final.select('prediction', 'rating').rdd.map(tuple))
    res = metrics.ndcgAt(500)
    Precision = metrics.precisionAt(500)
    print(f"The NDCG evaluation result is: {res}")
    print(f"The PrecisionAtK evaluation result is: {Precision}")

Example #27
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions_test)

print("Root-mean-square error = " + str(rmse))

test.createOrReplaceTempView('test')
test_true = spark.sql(
    'select user, book from test where rating > 2 sort by rating desc')
labels = test_true.groupby('user').agg(collect_list('book'))

test_recommendations = model.recommendForUserSubset(labels.select('user'), 500)
preds = test_recommendations.withColumn(
    'recommendations', explode('recommendations')).select(
        'user',
        'recommendations.item').groupBy('user').agg(collect_list('item'))

preds_and_labels = preds.join(labels, on='user')

metrics = RankingMetrics(
    preds_and_labels.select('collect_list(item)', 'collect_list(book)').rdd)
map_metric = metrics.meanAveragePrecision
pA = metrics.precisionAt(500)
ndcgA = metrics.ndcgAt(500)

results.append((rank, reg, rmse, map_metric, pA, ndcgA))

print('MAP = ', map_metric, ' pA = ', pA, ' ndcgA = ', ndcgA, '\n')

res_rdd = spark.sparkContext.parallelize(results)
res_df = spark.createDataFrame(res_rdd).repartition(1)
res_df.write.csv('test_results.csv')
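A hedged variant of the aggregation above: giving the collected lists explicit aliases avoids relying on the auto-generated 'collect_list(...)' column names when assembling the RankingMetrics input (reuses `test_true`, `test_recommendations`, and the column layout from the snippet):

from pyspark.sql.functions import collect_list, explode
from pyspark.mllib.evaluation import RankingMetrics

# Ground-truth books per user, with an explicit column name.
labels = test_true.groupby('user').agg(collect_list('book').alias('true_books'))

# Recommended items per user, also explicitly named.
preds = (test_recommendations
         .withColumn('rec', explode('recommendations'))
         .select('user', 'rec.item')
         .groupBy('user')
         .agg(collect_list('item').alias('pred_items')))

metrics = RankingMetrics(preds.join(labels, on='user')
                              .select('pred_items', 'true_books').rdd)
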
Example #28
0
def main(spark, train_pq, val_pq):
    '''
    Args
    -------
    train_pq:
        path to the training data parquet file

    val_pq:
        path to the validation data parquet file
    '''
    import itertools

    # Read train and val data
    print("load train and validation data")
    train = spark.read.parquet(train_pq)
    val = spark.read.parquet(val_pq)

    # Optionally increase the number of partitions of the train data to spread task load
    # (repartition returns a new DataFrame, so the result must be reassigned)
    #train = train.repartition(200)

    # Pipeline
    # StringIndexers
    print("build stringIndexer")
    indexer_user = StringIndexer(inputCol="user_id",
                                 outputCol="user_id_idx",
                                 handleInvalid='skip')
    indexer_book = StringIndexer(inputCol="book_id",
                                 outputCol="book_id_idx",
                                 handleInvalid='skip')

    # Hyper-parameter tuning
    rank_ = [10, 15, 20]
    regParam_ = [0.01, 0.05, 0.1, 0.3, 1]
    param_grid = itertools.product(rank_, regParam_)

    # ALS model hyperparameter tuning
    for i in param_grid:
        print('training for {} start'.format(i))

        als = ALS(maxIter=10, rank=i[0], regParam=i[1],\
                userCol="user_id_idx", itemCol="book_id_idx", ratingCol="rating",\
                coldStartStrategy="drop").setSeed(42)

        # Combine into the pipeline
        pipeline = Pipeline(stages=[indexer_user, indexer_book, als])

        model = pipeline.fit(train)
        print('training for {} complete'.format(i))

        # prediction against validation data
        preds = model.transform(val)

        # model evaluation using rmse on val data
        print("Start evaluation using rmse for {}".format(i))
        evaluator = RegressionEvaluator(metricName="rmse",
                                        labelCol="rating",
                                        predictionCol="prediction")
        rmse = evaluator.evaluate(preds)

        # Make top 500 recommendations for users in validation test
        print('evaluation for {} start'.format(i))

        res = model.stages[-1].recommendForAllUsers(500)

        preds_per_user = res.selectExpr(
            "user_id_idx", "recommendations.book_id_idx as preds_books")
        true_per_user = preds.select("user_id_idx","book_id_idx").filter("rating>=3")\
                            .groupBy("user_id_idx")\
                            .agg(expr("collect_set(book_id_idx) as books"))

        print("Start join for {}".format(i))
        true_vs_preds_per_user = preds_per_user.join(true_per_user, ["user_id_idx"])\
                        .select("preds_books","books").rdd

        # Evaluate using MAP
        print("Start evaluation using MAP for {}".format(i))
        metrics = RankingMetrics(true_vs_preds_per_user)
        map_ = metrics.meanAveragePrecision

        #Evaluate using ndcg
        print("Start evaluation using ndcg for {}".format(i))
        ndcg = metrics.ndcgAt(500)

        #Evaluate using precision
        print("Start evaluation using precisionAtK for {}".format(i))
        mpa = metrics.precisionAt(500)

        print(i, 'rmse score: ', rmse, 'map score: ', map_, 'ndcg score: ',
              ndcg, 'mpa score: ', mpa)
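The loop above only needs recommendations for users that appear in the validation split, so a hedged sketch (reusing the fitted pipeline `model` and the transformed `preds` from inside the loop) of restricting the candidates with recommendForUserSubset instead of recommendForAllUsers:

# Recommend only for users present in the indexed validation data.
val_users = preds.select("user_id_idx").distinct()
res = model.stages[-1].recommendForUserSubset(val_users, 500)
preds_per_user = res.selectExpr(
    "user_id_idx", "recommendations.book_id_idx as preds_books")
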
# Score the held-out (user, product) pairs with the mllib ALS model and rank them per user.
predictions = model.predictAll(testData).map(lambda r: (r.user, (r.product, r.rating)))
predicted_per_user = predictions.groupByKey().mapValues(
    lambda scored: [p for p, _ in sorted(scored, key=lambda x: -x[1])])

# Ground truth: the products each user actually rated.
actual_per_user = ratings.map(lambda r: (r.user, r.product)).groupByKey().mapValues(list)

# RankingMetrics expects (predicted ranking, ground truth set) pairs, one per user.
predictionAndLabels = predicted_per_user.join(actual_per_user).values()

# Calculate the ranking metrics.
metrics = RankingMetrics(predictionAndLabels)

metrics.precisionAt(5)

metrics.ndcgAt(10)

metrics.meanAveragePrecision

Example #30
0
def main(spark, data_file):
    '''Main routine for supervised evaluation

    Parameters
    ----------
    spark : SparkSession object

    data_file : string, path to the parquet file to load
    '''

    # Load the dataframe from data file
    #input_data = spark.read.parquet(data_file)
    df = spark.read.csv(data_file, header=True)

    #df = df.limit(500000)
    df = df.filter(df.user_id.isNotNull())
    print('1')
    df.show()

    df_unique_user = df.select("user_id").distinct()
    df_unique_user = df_unique_user.selectExpr("user_id as uid")

    # Keep roughly 0.1% of the distinct users to keep the job tractable.
    n_users = math.floor(0.001 * df_unique_user.count())
    print(n_users)

    df_unique_user = df_unique_user.limit(n_users)
    print('df unique user')
    df_unique_user.show()
    df_final = df.join(df_unique_user, df.user_id == df_unique_user.uid,
                       "inner").select(df.user_id, df.book_id, df.is_read,
                                       df.rating, df.is_reviewed)
    df = df_final
    print('2')
    df.show()
    # Keep only users with more than 10 interactions
    counts = df.groupBy('user_id').count().selectExpr("user_id as uid",
                                                      "count as count")
    df = df.join(counts,
                 df.user_id == counts.uid).filter(F.col("count") > 10).drop(
                     'uid', 'count')
    print('3')
    df.show()

    from pyspark.sql.types import DoubleType

    df = df.filter(df.rating.isNotNull())
    df = df.withColumn("rating", df["rating"].cast(DoubleType()))
    from pyspark.ml.feature import StringIndexer

    stage_1 = StringIndexer(inputCol='user_id', outputCol='user_id_index')
    #df = stage_1.setHandleInvalid("keep").fit(df).transform(df)
    df = stage_1.fit(df).transform(df)

    stage_2 = StringIndexer(inputCol='book_id', outputCol='book_id_index')
    #transformed = stage_2.setHandleInvalid("keep").fit(df).transform(df)
    df = stage_2.fit(df).transform(df)

    user_id = df.select("user_id").distinct()
    uid = df.select(
        F.collect_set('user_id').alias('user_id')).first()['user_id']

    #get the count of each user_id
    counts = df.groupBy('user_id').count()  #Show count of each user_id
    counts = counts.selectExpr('user_id as user_id',
                               'count as n')  #Rename count as n

    #Create Train Test and Validation sets 60-20-20
    train_size = int(0.6 * len(uid))
    vali_size = train_size + int(0.2 * len(uid))
    test_size = vali_size + int(0.2 * len(uid))

    train_set = uid[:train_size]
    vali_set = uid[train_size:vali_size]
    test_set = uid[vali_size:]

    train_set = df.filter(df.user_id.isin(train_set))
    vali_set = df.filter(df.user_id.isin(vali_set))
    test_set = df.filter(df.user_id.isin(test_set))
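    # A hedged alternative (not in the original): the slice-based split above relies
    # on the order of collect_set, which is not guaranteed, so a seeded
    # DataFrame.randomSplit over the distinct users would be reproducible, e.g.:
    #   users = df.select('user_id').distinct()
    #   train_u, vali_u, test_u = users.randomSplit([0.6, 0.2, 0.2], seed=40)
    #   train_set = df.join(train_u, 'user_id', 'inner')
    #   vali_set = df.join(vali_u, 'user_id', 'inner')
    #   test_set = df.join(test_u, 'user_id', 'inner')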
    #-----------------------------------------------

    # In[8]:

    vali_uid = vali_set.select(
        F.collect_set('user_id').alias('user_id')).first()['user_id']
    test_uid = test_set.select(
        F.collect_set('user_id').alias('user_id')).first()['user_id']

    #For each validation user, use half of their interactions for training and hold out the rest for evaluation.
    validict = {i: 0.5 for i in vali_uid}
    new_vali = vali_set.sampleBy("user_id", fractions=validict, seed=40)

    testdict = {i: 0.5 for i in test_uid}
    new_test = test_set.sampleBy("user_id", fractions=testdict, seed=40)

    vali_set = vali_set.exceptAll(new_vali)
    train_set = train_set.union(new_vali)

    test_set = test_set.exceptAll(new_test)
    train_set = train_set.union(new_test)
    train_set.show()
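    # A hedged note (not in the original): train_set is reused for every
    # hyper-parameter combination in the grid search below, so caching it once
    # can avoid recomputing the joins and unions above, e.g.:
    #   train_set = train_set.cache()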

    # # ALS

    # In[40]:

    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.recommendation import ALS
    from pyspark.mllib.evaluation import RankingMetrics
    from pyspark.sql.functions import expr
    import itertools as it

    rank_ = [5, 10, 15]
    regParam_ = [0.01, 0.1, 1.0]
    alpha_ = [1, 2, 5]
    param_grid = it.product(rank_, regParam_, alpha_)
    vals_list = []
    stats = []
    rmse_list = []
    best_map = -1.0  # track the best (highest) MAP seen so far
    best_model = None
    for i in param_grid:
        print('Start Training for {}'.format(i))
        als = ALS(rank=i[0],
                  maxIter=10,
                  regParam=i[1],
                  alpha=i[2],
                  userCol="user_id_index",
                  itemCol="book_id_index",
                  ratingCol='rating',
                  nonnegative=True,
                  coldStartStrategy="drop")
        model = als.fit(train_set)
        user_subset = vali_set.select("user_id_index").distinct()
        userRecs = model.recommendForUserSubset(user_subset, 500)
        from pyspark.sql.functions import expr
        print('Recommended')
        # Ground truth for tuning: held-out interactions from the validation split.
        true_label = vali_set.select('user_id_index', 'book_id_index')\
                .groupBy('user_id_index')\
                .agg(expr('collect_list(book_id_index) as true_item'))
        pred_label = userRecs.select('user_id_index',
                                     'recommendations.book_id_index')
        print('pred_label')
        pred_true_rdd = pred_label.join(F.broadcast(true_label), 'user_id_index', 'inner') \
                .rdd \
                .map(lambda row: (row[1], row[2]))
        print('pred_true_rdd')
        metrics = RankingMetrics(pred_true_rdd)
        map_ = metrics.meanAveragePrecision
        ndcg = metrics.ndcgAt(500)
        mpa = metrics.precisionAt(500)
        evaluator = RegressionEvaluator(metricName="rmse",
                                        labelCol="rating",
                                        predictionCol="prediction")
        predictions = model.transform(vali_set)
        # Keep ratings in [3, 5]; no need to collect() and rebuild a DataFrame.
        predsDF = predictions.filter(predictions.rating.between(3, 5))

        rmse = evaluator.evaluate(predsDF)
        rmse_list.append(rmse)
        if map_ > best_map:  # keep the model with the highest MAP
            best_model = model
            best_map = map_
            #print('New best model')
            stats.append([i[0], i[1], i[2], rmse])

        columns = [
            'Alpha', 'Rank', 'RegParam', 'MAP', 'Precision', 'NDCG', 'RMSE'
        ]
        vals_list.append((i[2], i[0], i[1], map_, mpa, ndcg, rmse))
        print('MAP: %f' % map_)
        print('Precision: %f' % mpa)
        print('NDCG: %f' % ndcg)
        print('rmse %f:' % rmse)
        plt.scatter(i[0], rmse)
        #plt.pause(0.05)
    plt.show()

    #als=ALS(maxIter=5,regParam=0.09,rank=200,userCol="user_id_index",itemCol="book_id_index",ratingCol="rating",coldStartStrategy="drop",nonnegative=True)
    #model=als.fit(train_set)

    #evaluator=RegressionEvaluator(metricName="rmse",labelCol="rating",predictionCol="prediction")
    #predictions=model.transform(vali_set)

    predictions.show()
    # Collect the per-combination tuning results into a DataFrame for inspection.
    from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

    mySchema = StructType([StructField("Alpha", IntegerType(), True),
                           StructField("Rank", IntegerType(), True),
                           StructField("Reg_Param", DoubleType(), True),
                           StructField("MAP", DoubleType(), True),
                           StructField("Precision", DoubleType(), True),
                           StructField("NDCG", DoubleType(), True),
                           StructField("RMSE", DoubleType(), True)])
    df = spark.createDataFrame(vals_list, schema=mySchema)
    df.show()

    #Evaluation of test set
    #print('Finish Training for {}'.format(i))
    user_subset = test_set.select("user_id_index").distinct()
    userRecs = best_model.recommendForUserSubset(user_subset, 500)

    true_label = test_set.select('user_id_index', 'book_id_index')\
                .groupBy('user_id_index')\
                .agg(expr('collect_list(book_id_index) as true_item'))
    pred_label = userRecs.select('user_id_index',
                                 'recommendations.book_id_index')

    pred_true_rdd = pred_label.join(F.broadcast(true_label), 'user_id_index', 'inner') \
                .rdd \
                .map(lambda row: (row[1], row[2]))
    metrics = RankingMetrics(pred_true_rdd)
    map_ = metrics.meanAveragePrecision
    ndcg = metrics.ndcgAt(500)
    mpa = metrics.precisionAt(500)
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")
    # Evaluate the best model's rating predictions on the test set.
    predictions = best_model.transform(test_set)
    predsDF = predictions.filter(predictions.rating.between(3, 5))

    rmse = evaluator.evaluate(predsDF)
    print('Test Metrics:')
    print('MAP: %f' % map_)
    print('Precision: %f' % mpa)
    print('NDCG: %f' % ndcg)
    print('rmse %f:' % rmse)

    #Latent Factors
    ufac_df = best_model.userFactors.toPandas()
    ifac_df = best_model.itemFactors.toPandas()
    ufac_matrix = np.vstack(ufac_df.features.values)
    ifac_matrix = np.vstack(ifac_df.features.values)

    import seaborn as sns
    sns.set(rc={'figure.figsize': (11.7, 8.27)})
    palette = sns.color_palette("hls", 10)
    import numpy as np
    import pandas as pd
    import sklearn
    from sklearn.manifold import TSNE

    X = ufac_matrix
    Y = ifac_matrix
    tsne = TSNE()
    X_embedded = tsne.fit_transform(X)
    Y_embedded = tsne.fit_transform(Y)
    plot_users = sns.scatterplot(x=X_embedded[:, 0],
                                 y=X_embedded[:, 1],
                                 legend='full',
                                 palette=palette)
    plot_items = sns.scatterplot(x=Y_embedded[:, 0],
                                 y=Y_embedded[:, 1],
                                 legend='full')

    pkl.dump(ufac_matrix, open('ufac_matrix.pkl', 'wb'))
    pkl.dump(ifac_matrix, open('ifac_matrix.pkl', 'wb'))
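
A hedged follow-up sketch (not in the original): on a cluster without a display it may be more convenient to save the t-SNE figure to a file, and the pickled factor matrices can be reloaded later for offline analysis; the output file name here is an assumption.

import pickle as pkl
import matplotlib.pyplot as plt

# Persist the current figure produced by the scatter plots above (hypothetical file name).
plt.savefig('latent_factors_tsne.png', dpi=150, bbox_inches='tight')

# Reload the dumped factor matrices for further analysis.
with open('ufac_matrix.pkl', 'rb') as f:
    ufac_matrix = pkl.load(f)
with open('ifac_matrix.pkl', 'rb') as f:
    ifac_matrix = pkl.load(f)
print(ufac_matrix.shape, ifac_matrix.shape)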