def _evaluate(self, predicted_df):
    k = self.get_k()

    window_spec = Window.partitionBy('user').orderBy(col('prediction').desc())
    per_user_predicted_items_df = predicted_df \
        .select('user', 'item', 'prediction', F.rank().over(window_spec).alias('rank')) \
        .where('rank <= {0}'.format(k)) \
        .groupBy('user') \
        .agg(expr('collect_list(item) as items'))

    window_spec = Window.partitionBy('user').orderBy(col('starred_at').desc())
    per_user_actual_items_df = predicted_df \
        .select('user', 'item', 'starred_at', F.rank().over(window_spec).alias('rank')) \
        .where('rank <= {0}'.format(k)) \
        .groupBy('user') \
        .agg(expr('collect_list(item) as items'))

    per_user_items_rdd = per_user_predicted_items_df \
        .join(F.broadcast(per_user_actual_items_df), 'user', 'inner') \
        .rdd \
        .map(lambda row: (row[1], row[2]))
    if per_user_items_rdd.isEmpty():
        return 0.0

    ranking_metrics = RankingMetrics(per_user_items_rdd)
    metric = ranking_metrics.ndcgAt(k)
    return metric
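# Illustration (not part of the original code): the join above yields an RDD of
# (predicted item list, actual item list) pairs, which is exactly the shape
# RankingMetrics consumes. A minimal standalone sketch with hypothetical item ids,
# assuming an active SparkContext named `sc`:
from pyspark.mllib.evaluation import RankingMetrics

pairs = sc.parallelize([
    ([1, 2, 3], [2, 3, 4]),   # (top-k predictions, ground truth) for one user
    ([5, 6], [6, 7]),
])
print(RankingMetrics(pairs).ndcgAt(3))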
def Tester(spark, model, df_test, rank, regParam, alpha, K=500):
    #df_test = spark.read.parquet(formatted_test_address)
    targetUsers = df_test.select("user_id_numeric").distinct()
    userRecs = model.recommendForUserSubset(targetUsers, K)
    userRecs = userRecs.select("user_id_numeric",
                               "recommendations.track_id_numeric",
                               "recommendations.rating")

    # need to get ordered list of track_id based on counts, grouped by individual users.
    # reference: https://stackoverflow.com/questions/46580253/collect-list-by-preserving-order-based-on-another-variable
    w = Window.partitionBy("user_id_numeric").orderBy(df_test['count'].desc())
    labels = df_test.withColumn('ActualRanking',
                                F.collect_list("track_id_numeric").over(w))
    labels = labels.select(['user_id_numeric', 'ActualRanking']) \
        .dropDuplicates(['user_id_numeric'])

    # Get the metrics
    # predictionsAndlabels should be an RDD of (predicted ranking, ground truth set) pairs.
    # reference: https://spark.apache.org/docs/2.2.0/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics
    predictionsAndlabels = userRecs.join(
        labels, [labels.user_id_numeric == userRecs.user_id_numeric],
        'left').select('track_id_numeric', 'ActualRanking')
    metricsRank = RankingMetrics(predictionsAndlabels.rdd)

    print("------------------------------------------")
    print("Params: Rank %f | regParam %f | alpha = %f" % (rank, regParam, alpha))
    print("p(15) %.8f" % metricsRank.precisionAt(15))
    print("p(500) %.8f" % metricsRank.precisionAt(500))
    print("MAP %.8f" % metricsRank.meanAveragePrecision)
    print("nDCG %.8f" % metricsRank.ndcgAt(K))
    return
def main(spark, txt):
    model = ALSModel.load('hdfs:/user/jm7955/' + args.model)

    distinct_users = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' % args.distinct)
    print("distinct_users")
    print('elapsed %d seconds' % int(timer() - start))
    #distinct_users.show()

    labels = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' % args.labels)
    print("labels")
    #labels.show()
    print('elapsed %d seconds' % int(timer() - start))

    predictions = model.recommendForUserSubset(distinct_users, 500)\
        .select('user', F.col('recommendations.item').alias('item'))
    print("predictions")
    #predictions.show()
    print('elapsed %d seconds' % int(timer() - start))

    predictionsAndLabels = predictions.join(
        labels, ["user"], "inner").rdd.map(lambda tup: (tup[1], tup[2]))
    print("predictionsAndLabels")
    print('elapsed %d seconds' % int(timer() - start))

    metrics = RankingMetrics(predictionsAndLabels)
    print('elapsed %d seconds' % int(timer() - start))

    file = open(txt, 'w')
    file.write('metrics.meanAveragePrecision: %s\n' % metrics.meanAveragePrecision)
    file.write('metrics.precisionAt(500) %s\n' % metrics.precisionAt(500))
    file.write('metrics.ndcgAt(500) %s\n' % metrics.ndcgAt(500))
    file.close()
def main(spark, model_file, test_file):
    '''
    Parameters
    ----------
    spark : SparkSession object
    model_file : string, path to the serialized model to load
    test_file : string, path to the test parquet file to load
    '''
    # Load the parquet file
    test = spark.read.parquet(test_file)
    test = test.sort('user', ascending=False)
    test.createOrReplaceTempView('test_table')

    model = ALSModel.load(model_file)

    user_subset = test.select("user").distinct()
    user_subset = model.recommendForUserSubset(user_subset, 500)
    user_subset = user_subset.select("user", col("recommendations.item").alias("item"))
    user_subset = user_subset.sort('user', ascending=False)
    print("sort user")

    predictionAndLabels = user_subset.join(
        test, ["user"], "inner").rdd.map(lambda tup: (tup[1], tup[2]))
    print("joined predictions and counts")

    metrics = RankingMetrics(predictionAndLabels)
    print("made metrics")

    MAP = metrics.meanAveragePrecision
    precision = metrics.precisionAt(500)
    ndcg = metrics.ndcgAt(500)
    print('MAP: %f' % MAP)
    print('Precision: %f' % precision)
    print('NDCG: %f' % ndcg)
def evaluateTopk(model, data, top_k=500):
    '''
    Input: data: RDD - user, product (book_id), rating
    '''
    truth = spark.createDataFrame(data).groupby("user").agg(F.collect_set("product"))

    print("Getting Predictions...")
    tmp1 = model.recommendProductsForUsers(top_k).map(
        lambda r: [r[0], [k.product for k in r[1]]])
    predictions = spark.createDataFrame(tmp1, ["user", "predictions"])

    print("Predictions and Labels...")
    k = predictions.join(truth, truth.user == predictions.user)
    final = k.rdd.map(lambda r: [r[1], r[3]])
    metrics = RankingMetrics(final)

    print("\nCalculate NDCG at {}...".format(top_k))
    res1 = metrics.ndcgAt(top_k)
    print("NDCG at {}: {}".format(top_k, res1))

    print("\nCalculate MAP...")
    res2 = metrics.meanAveragePrecision
    print("MAP: {}".format(res2))

    print("\nCalculate Precision at {}...".format(top_k))
    res3 = metrics.precisionAt(top_k)
    print("Precision at {}: {}".format(top_k, res3))
    return res1, res2, res3
def evaluation(df, model, ks):
    '''
    Evaluate the model.
    ks: a list of parameter k used in precision at k and NDCG at k.
    '''
    print(' Make predictions...')
    predictions = model.recommendForUserSubset(df, 500)

    print(' Prepare ground truth set and predicted set...')
    labels = df.groupBy('user').agg(F.collect_set('item')).collect()
    user_pred = predictions.select('user', 'recommendations.item').rdd.flatMap(lambda x: [x]).collect()
    labels = sorted(labels, key=lambda x: x.user)
    user_pred = sorted(user_pred, key=lambda x: x.user)

    print(' Combine ground truth set and predicted set...')
    predictionAndLabels = []
    for i in range(len(user_pred)):
        predictionAndLabels.append((user_pred[i].item, labels[i][1]))

    print(' Parallelize...')
    predictionAndLabels = sc.parallelize(predictionAndLabels, numSlices=2000)

    print(' Calculate metrics...')
    metrics = RankingMetrics(predictionAndLabels)
    eval_results = []
    eval_results.append(metrics.meanAveragePrecision)
    for k in ks:
        eval_results.append(metrics.precisionAt(k))
        eval_results.append(metrics.ndcgAt(k))
    return eval_results
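# Note (not from the original code): evaluation() above collects both the ground-truth
# and the predicted sets to the driver and pairs them by sort order, which only works
# if both lists cover exactly the same users. A minimal sketch of keeping the pairing
# as a distributed join instead, assuming the same 'user'/'item' column names and the
# F / RankingMetrics imports already used above:
def evaluation_joined(df, model, k=500):
    # ground truth per user
    labels_df = df.groupBy('user').agg(F.collect_set('item').alias('label_items'))
    # top-500 recommendations per user, flattened to an item list
    pred_df = model.recommendForUserSubset(df, 500) \
        .select('user', F.col('recommendations.item').alias('pred_items'))
    # (predicted ranking, ground truth set) pairs, paired by the join key
    pairs_rdd = pred_df.join(labels_df, 'user').rdd \
        .map(lambda r: (r.pred_items, r.label_items))
    metrics = RankingMetrics(pairs_rdd)
    return metrics.meanAveragePrecision, metrics.precisionAt(k), metrics.ndcgAt(k)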
def get_rankMetrics(spark, df, trained_model, approx=False, k=500):
    """
    This function evaluates the performance of a given model on a given dataset
    using Ranking Metrics, and returns the final performance metrics.

    Parameters
    ----------
    df: DataFrame to evaluate on
    trained_model: trained model to evaluate
    approx: boolean; use ANN (approximate nearest neighbors) when True
    k: number of recommendations
    ----------
    """
    import datetime
    import nmslib_recommend2
    import pyspark.sql.functions as F
    from pyspark.mllib.evaluation import RankingMetrics

    # change column names
    df = df.select(['user_id', 'book_id', 'rating']).toDF('user', 'item', 'rating')

    # an item is relevant if its rating >= 3 (i.e. its centered rating > 0)
    fn = F.udf(lambda x: 1.0 if x >= 3 else 0.0, 'double')
    df = df.withColumn('rating', fn(df.rating))
    relevant = df[df.rating == 1.0].groupBy('user').agg(F.collect_list('item'))

    # recommend k items for each user
    print("recommendation time comparison start: ",
          datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    if approx:
        recommend = nmslib_recommend2.nmslib_recommend(spark, df, trained_model, k)
        recommend = spark.createDataFrame(recommend, ["user", "recommend"])
        joined = recommend.join(relevant, on='user')
        rec_and_rel = []
        for user, rec, rel in joined.collect():
            rec_and_rel.append((rec, rel))
    else:
        userSubset = relevant.select('user')
        recommend = trained_model.recommendForUserSubset(userSubset, k)
        joined = recommend.join(relevant, on='user')
        rec_and_rel = []
        for user, rec, rel in joined.collect():
            predict_items = [i.item for i in rec]
            rec_and_rel.append((predict_items, rel))
    print("recommendation time comparison end: ",
          datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    # Compute metrics
    rec_and_rel_rdd = spark.sparkContext.parallelize(rec_and_rel)
    metric_class = RankingMetrics(rec_and_rel_rdd)
    ndcg = metric_class.ndcgAt(k)
    map_ = metric_class.meanAveragePrecision
    pk = metric_class.precisionAt(k)
    return print("NDCG:", ndcg, "\nMAP:", map_, "\nPrecision:", pk)
def evaluate(model, test_data):
    test = test_data.map(lambda p: (p[0], p[1]))
    ret = model.predictAll(test) \
        .map(lambda r: (r.user, (r.product, r.rating))) \
        .groupByKey() \
        .mapValues(lambda l: sorted(l, key=lambda x: x[1], reverse=True)) \
        .mapValues(lambda l: [x[0] for x in l])
    gt_items = test_data.filter(lambda p: p[2] == 1.0).map(lambda r: (r[0], [r[1]]))
    predictionAndLabels = ret.join(gt_items).map(lambda r: (r[1][0], list(r[1][1])))
    metrics = RankingMetrics(predictionAndLabels)
    return metrics.ndcgAt(TopK)
def top_k_rankingmetrics(dataset=None, k=10, ranking_metrics="precisionAt",
                         user="******", item="book_id", rating="rating",
                         prediction="prediction"):
    '''
    This function is to compute the ranking metrics from predictions.
    Input:
    1. k: only evaluate the performance of the top k items
    2. ranking_metrics: precisionAt, meanAveragePrecision, ndcgAt
    3. user, item, prediction: column names; string type
    refer to https://vinta.ws/code/spark-ml-cookbook-pyspark.html
    '''
    if dataset is None:
        print("Error! Please specify a dataset.")
        return

    # prediction table
    windowSpec = Window.partitionBy(user).orderBy(col(prediction).desc())
    perUserPredictedItemsDF = dataset \
        .select(user, item, prediction, F.rank().over(windowSpec).alias('rank')) \
        .where('rank <= {}'.format(k)) \
        .groupBy(user) \
        .agg(expr('collect_list({}) as items'.format(item)))

    # actual target table
    windowSpec = Window.partitionBy(user).orderBy(col(rating).desc())
    perUserActualItemsDF = dataset \
        .select(user, item, rating, F.rank().over(windowSpec).alias('rank')) \
        .where('rank <= {}'.format(k)) \
        .groupBy(user) \
        .agg(expr('collect_list({}) as items'.format(item)))

    # join
    perUserItemsRDD = perUserPredictedItemsDF \
        .join(F.broadcast(perUserActualItemsDF), user, 'inner') \
        .rdd \
        .map(lambda row: (row[1], row[2]))
    ranking_metrics_evaluator = RankingMetrics(perUserItemsRDD)

    # get the result of the metric
    if ranking_metrics == "precisionAt":
        precision_at_k = ranking_metrics_evaluator.precisionAt(k)
        #print("precisionAt: {}".format(round(precision_at_k, 4)))
        return precision_at_k
    elif ranking_metrics == "meanAveragePrecision":
        # meanAveragePrecision is a property, not a method, so it takes no k argument
        mean_avg_precision = ranking_metrics_evaluator.meanAveragePrecision
        #print("meanAveragePrecision: {}".format(round(mean_avg_precision, 4)))
        return mean_avg_precision
    elif ranking_metrics == "ndcgAt":
        ndcg_at_k = ranking_metrics_evaluator.ndcgAt(k)
        #print("ndcgAt: {}".format(round(ndcg_at_k, 4)))
        return ndcg_at_k
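# A minimal usage sketch for top_k_rankingmetrics (not from the original code): the tiny
# DataFrame and its values are illustrative assumptions, and it relies on the same
# SparkSession `spark` and imports (Window, col, F, expr, RankingMetrics) used above.
example_preds = spark.createDataFrame(
    [(1, 10, 5.0, 4.8), (1, 11, 3.0, 4.1),
     (2, 10, 4.0, 3.9), (2, 12, 2.0, 4.5)],
    ['uid', 'book_id', 'rating', 'prediction'])
p_at_2 = top_k_rankingmetrics(dataset=example_preds, k=2, ranking_metrics="precisionAt",
                              user='uid', item='book_id',
                              rating='rating', prediction='prediction')
print(p_at_2)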
def dummy_run(spark):
    from pyspark.ml.recommendation import ALS
    from pyspark.mllib.evaluation import RankingMetrics
    import pyspark.sql.functions as F
    from pyspark.sql.functions import expr

    train = spark.createDataFrame(
        [
            (82, 124, 5.0),
            (64, 123, 4.0),
            (27, 122, 3.0),
            (25, 122, 1.0),
            (12, 124, 2.0)
        ],
        ['user_id', 'book_id', 'rating']
    )
    val = spark.createDataFrame(
        [
            (82, 123, 5.0),
            (64, 122, 4.0),
            (27, 124, 3.0),
            (64, 123, 2.0),
            (12, 122, 4.0)
        ],
        ['user_id', 'book_id', 'rating']
    )

    user_id = val.select('user_id').distinct()
    true_label = val.select('user_id', 'book_id')\
        .groupBy('user_id')\
        .agg(expr('collect_list(book_id) as true_item'))

    als = ALS(rank=3, regParam=0.1, userCol="user_id", itemCol="book_id",
              ratingCol='rating', implicitPrefs=False, coldStartStrategy="drop")
    model = als.fit(train)

    recs = model.recommendForUserSubset(user_id, 2)
    pred_labels = recs.select('user_id', 'recommendations.book_id')
    pred_true_rdd = pred_labels.join(F.broadcast(true_label), 'user_id', 'inner') \
        .rdd \
        .map(lambda row: (row[1], row[2]))

    metrics = RankingMetrics(pred_true_rdd)
    mean_ap = metrics.meanAveragePrecision
    ndcg_at_k = metrics.ndcgAt(2)
    p_at_k = metrics.precisionAt(2)
    print('MAP: ', mean_ap, 'NDCG: ', ndcg_at_k, 'Precision at k: ', p_at_k)
    return
def __evaluate_ranking(self, rnk_inf: SparkDF):
    test_ground_truth = self.__test.groupBy("user_id").agg(
        collect_list("business_id").alias("business_gt"))
    pred_with_labels = rnk_inf.join(test_ground_truth, on="user_id").drop("user_id")
    metrics = RankingMetrics(pred_with_labels.rdd)

    results = {}
    for m in self.ranking_metrics:
        metric_name = "{}@{}".format(m, self.top_k)
        if "ndcg" in m:
            results[metric_name] = metrics.ndcgAt(self.top_k)
        elif m == "precision":
            results[metric_name] = metrics.precisionAt(self.top_k)
    return results
def recsys(spark):
    # Load data from parquet
    val = spark.read.parquet("val_set.parquet")
    test = spark.read.parquet("test_set.parquet")
    cols_to_drop = ['is_read', 'is_reviewed']
    test = test.drop(*cols_to_drop)
    val = val.drop(*cols_to_drop)

    # Load model from path
    model_path = "hdfs:/user/ago265/best_model"
    best_model = ALSModel.load(model_path)

    # Compile a list of all the books each user read
    val_users = val.select("user_id").distinct()
    val_books = val.select("user_id", "book_id")\
        .groupBy("user_id")\
        .agg(expr('collect_list(book_id) as books'))
    test_users = test.select("user_id").distinct()
    test_books = test.select("user_id", "book_id").groupBy("user_id").agg(
        expr('collect_list(book_id) as books'))

    # # Recommender System for all users at k=500
    # k = 500
    # print('Making top 500 recommendations for all users')
    # rec = best_model.recommendForAllUsers(k)

    # Recommender System for subset of users at k=10
    k = 10
    print('Making top {} recommendations for a subset of users'.format(k))
    rec = best_model.recommendForUserSubset(test_users, k)
    pred_label = rec.select('user_id', 'recommendations.book_id')

    # Create an RDD to evaluate with Ranking Metrics
    final_df = pred_label.join(test_books, ['user_id'], 'inner').select('book_id', 'books')
    final_rdd = final_df.rdd.map(lambda x: (x.book_id, x.books))
    metrics = RankingMetrics(final_rdd)
    result1 = metrics.meanAveragePrecision
    result2 = metrics.precisionAt(k)
    result3 = metrics.ndcgAt(k)
    print("MAP = ", result1)
    print("Precision at k = ", result2)
    print("NDCG at k = ", result3)
def get_val_metrics(model, val):
    preds = model.transform(val)
    recs = model.recommendForUserSubset(val, 500)
    top_items = recs.selectExpr('user as user', 'recommendations.item as top_items')
    true_items = val.where(val.rating >= 3).groupby('user').agg(
        collect_list('item').alias('true_item_list'))
    # RankingMetrics expects (predicted ranking, ground truth set) pairs, so the
    # predicted list comes first
    predictions_and_labels_rankings = top_items.join(true_items, how='inner', on='user')\
        .select('top_items', 'true_item_list')
    predictions_and_labels_rankings.write.json('val_recs.json')

    ranking_metrics = RankingMetrics(predictions_and_labels_rankings.cache().rdd)
    prec_at = ranking_metrics.precisionAt(500)
    mean_avg_prec = ranking_metrics.meanAveragePrecision
    ndcg = ranking_metrics.ndcgAt(500)

    evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='rating',
                                    metricName='rmse')
    rmse = evaluator.evaluate(preds)
    return rmse, prec_at, mean_avg_prec, ndcg
def main(spark):
    val_df = spark.read.parquet(
        'hdfs:/user/jm7955/test_full_indexed.parquet').drop('count')
    labels = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' % args.labels)

    # popularity baseline: recommend the 500 most frequent items to every user
    predictions = val_df.groupBy("item").count().orderBy(
        "count", ascending=False).limit(500).collect()
    predictions = [row.item for row in predictions]
    print("predictions")
    print('elapsed %d seconds' % int(timer() - start))

    predictionsAndLabels = labels.rdd.map(lambda tup: (predictions, tup[1]))
    print("predictionsAndLabels")
    print('elapsed %d seconds' % int(timer() - start))

    metrics = RankingMetrics(predictionsAndLabels)
    print('elapsed %d seconds' % int(timer() - start))
    print('metrics.meanAveragePrecision: %s\n' % metrics.meanAveragePrecision)
    print('metrics.precisionAt(500) %s\n' % metrics.precisionAt(500))
    print('metrics.ndcgAt(500) %s\n' % metrics.ndcgAt(500))
def main(spark, model_file, data_file, K):
    '''Main routine for Collaborative Filtering Model testing

    Parameters
    ----------
    spark: SparkSession object
    model_file: string, path to the stored model to load
    data_file: string, path to the parquet file to load
    K: int, evaluations are based on predictions of the top K items for each user
    '''
    testIdx = spark.read.parquet(data_file)
    model = ALSModel.load(model_file)

    users_val = testIdx.select("user_idx").distinct()
    perUserPredictedItemsDF = model.recommendForUserSubset(users_val, K)
    perUserPredictedItemsDF = perUserPredictedItemsDF.select(
        "user_idx", "recommendations.track_idx").withColumnRenamed(
            'user_idx', 'user').withColumnRenamed('track_idx', 'items')

    w2 = Window.partitionBy('user_idx').orderBy(col('count').desc())
    perUserActualItemsDF = testIdx.select(
        'user_idx', 'track_idx', 'count', F.rank().over(w2).alias('rank')).where(
            'rank <= {0}'.format(K)).groupBy('user_idx').agg(
                expr('collect_list(track_idx) as items')).withColumnRenamed(
                    'user_idx', 'user')

    perUserItemsRDD = perUserPredictedItemsDF.join(
        perUserActualItemsDF, 'user').rdd.map(lambda row: (row[1], row[2]))
    rankingMetrics = RankingMetrics(perUserItemsRDD)

    print("============================================")
    print("meanAveragePrecision = %.8f" % rankingMetrics.meanAveragePrecision)
    print("precisionAt(K) = %.8f" % rankingMetrics.precisionAt(K))
    print("ndcgAt(K) = %.8f" % rankingMetrics.ndcgAt(K))
def main(spark, model_file, test_file):
    test_data = spark.read.parquet(test_file)
    als_model_tuned = ALSModel.load(model_file)
    print("Imported trained model and test data sets")

    # generating true values of book_id for each user_id
    groundTruth_test = test_data.groupby("user_id").agg(
        F.collect_list("book_id").alias("test_truth"))
    print("Created ground truth df for test set")

    # user_test_list = spark.sql('select distinct user_id from groundTruth_val where user_id=14')
    # rec = als_model_normal.recommendForUserSubset(user_test_list, 500)

    # generating recs
    rec = als_model_tuned.recommendForAllUsers(500)
    print("500 recommendations for all users generated")

    # creating dataframe to have both true values and predicted values
    predictions_test = rec.join(groundTruth_test,
                                rec.user_id == groundTruth_test.user_id, 'inner')

    # converting to rdd for RankingMetrics()
    predAndLabels_test = predictions_test.select('recommendations.book_id',
                                                 'test_truth').rdd.map(tuple)
    print("starting ranking metrics for test data")
    metrics_test = RankingMetrics(predAndLabels_test)

    # calculating metrics
    precision_test = metrics_test.precisionAt(500)
    map_test = metrics_test.meanAveragePrecision
    ndcg_test = metrics_test.ndcgAt(500)
    print('Test set , Precision at 500: {}'.format(precision_test))
    print('Test set , Mean Average Precision : {}'.format(map_test))
    print('Test set, ndcgAt500 : {}'.format(ndcg_test))
def main(spark, rank, regParam, path, fraction):
    TEMP_PATH = "/models/ALS_{}_{}_{}".format(rank, regParam, fraction)
    ALS_PATH = TEMP_PATH + "/als"
    MODEL_PATH = TEMP_PATH + "/als_model"

    print("Loading model...")
    als = ALS.load(path + ALS_PATH)
    model = ALSModel.load(path + MODEL_PATH)

    print("Loading data...")
    testing = spark.read.parquet("{}/data/processed/testing_{}.parquet".format(
        path, fraction))
    testing.createOrReplaceTempView("testing")

    # RMSE
    predictions = model.transform(testing)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("RMSE:", rmse)

    predictions = model.recommendForAllUsers(500)
    predictions.createOrReplaceTempView("predictions")
    groundtruth = testing.groupby("user_id").agg(
        F.collect_set("book_id").alias('groundtruth'))
    groundtruth.createOrReplaceTempView("groundtruth")

    total = spark.sql(
        "SELECT g.user_id, g.groundtruth AS groundtruth, p.recommendations AS predictions "
        "FROM groundtruth g JOIN predictions p ON g.user_id = p.user_id"
    )
    total.createOrReplaceTempView("total")
    data = total.selectExpr("predictions.book_id", "groundtruth")

    print("df to rdd...")
    rdd = data.rdd.map(tuple)
    print("creating metrics...")
    metrics = RankingMetrics(rdd)
    print("meanAveragePrecision:", metrics.meanAveragePrecision)
    print("precision at 500:", metrics.precisionAt(500))
    print("ndcgAt 500:", metrics.ndcgAt(500))
def Ranking_evaluator(spark, model, val, metric_type):
    val.createOrReplaceTempView('val')
    val_user = spark.sql('SELECT DISTINCT user_id FROM val')
    #val_user = val.select('user_id').distinct()

    val_rec = model.recommendForUserSubset(val_user, 500)
    #val_rec.printSchema()
    val_rec = val_rec.select('user_id', 'recommendations',
                             f.posexplode('recommendations')).drop('pos').drop('recommendations')
    val_rec = val_rec.select('user_id', f.expr('col.book_id'), f.expr('col.rating'))

    w = Window.partitionBy('user_id')
    val_recrank = val_rec.select(
        'user_id', f.collect_list('book_id').over(w).alias('rec_rank')).sort('user_id').distinct()

    val = val.sort(f.desc('rating'))
    val_truerank = val.select(
        'user_id', f.collect_list('book_id').over(w).alias('true_rank')).sort('user_id').distinct()

    scoreAndLabels = val_recrank.join(val_truerank, on=['user_id'], how='inner')
    rankLists = scoreAndLabels.select("rec_rank", "true_rank").rdd.map(
        lambda x: tuple([x[0], x[1]])).collect()
    ranks = spark.sparkContext.parallelize(rankLists)

    metrics = RankingMetrics(ranks)
    MAP = metrics.meanAveragePrecision
    Precision = metrics.precisionAt(500)
    NDCG = metrics.ndcgAt(500)

    if metric_type == 'Precision':
        return Precision, {'MAP': MAP, 'NDCG': NDCG}
    elif metric_type == 'MAP':
        return MAP, {'Precision': Precision, 'NDCG': NDCG}
    elif metric_type == 'NDCG':
        return NDCG, {'MAP': MAP, 'Precision': Precision}
    else:
        return None
def main(spark, log_comp=False, drop_low=False, drop_thr=0):
    '''
    Parameters
    ----------
    spark : SparkSession object
    log_comp : boolean, whether to apply log-compression to the interaction counts
    drop_low : boolean, whether to drop interactions with low counts
    drop_thr : int, count threshold below which interactions are dropped
    '''
    ## Load in datasets
    train_path = 'hdfs:/user/bm106/pub/project/cf_train.parquet'
    val_path = 'hdfs:/user/bm106/pub/project/cf_validation.parquet'
    test_path = 'hdfs:/user/bm106/pub/project/cf_test.parquet'

    train = spark.read.parquet(train_path)
    val = spark.read.parquet(val_path)
    test = spark.read.parquet(test_path)

    ## Downsample the data
    # Pick out user list in training set
    user_train = set(row['user_id'] for row in train.select('user_id').distinct().collect())
    # Pick out user list in validation set
    user_val = set(row['user_id'] for row in val.select('user_id').distinct().collect())
    # Get the previous 1M users
    user_prev = list(user_train - user_val)
    # Random sampling to get 20%
    k = int(0.2 * len(user_prev))
    user_prev_filtered = random.sample(user_prev, k)
    train = train.where(train.user_id.isin(user_prev_filtered + list(user_val)))

    ## Create StringIndexer
    indexer_user = StringIndexer(inputCol="user_id", outputCol="user_id_indexed",
                                 handleInvalid='skip')
    indexer_user_model = indexer_user.fit(train)
    indexer_track = StringIndexer(inputCol="track_id", outputCol="track_id_indexed",
                                  handleInvalid='skip')
    indexer_track_model = indexer_track.fit(train)

    train = indexer_user_model.transform(train)
    train = indexer_track_model.transform(train)
    val = indexer_user_model.transform(val)
    val = indexer_track_model.transform(val)
    test = indexer_user_model.transform(test)
    test = indexer_track_model.transform(test)

    ## ALS model
    rank_ = [5, 10, 20]
    regParam_ = [0.1, 1, 10]
    alpha_ = [1, 5, 10]
    param_grid = it.product(rank_, regParam_, alpha_)

    ## Pick out users from validation set
    user_id = val.select('user_id_indexed').distinct()
    true_label = val.select('user_id_indexed', 'track_id_indexed')\
        .groupBy('user_id_indexed')\
        .agg(expr('collect_list(track_id_indexed) as true_item'))

    ## Log-Compression
    ## count -> log(1+count)
    if log_comp == True:
        train = train.select('*', F.log1p('count').alias('count_log1p'))
        val = val.select('*', F.log1p('count').alias('count_log1p'))
        rateCol = "count_log1p"
    else:
        rateCol = "count"

    ## Drop interactions that have counts lower than the specified threshold
    if drop_low == True:
        train = train.filter(train['count'] > drop_thr)
        val = val.filter(val['count'] > drop_thr)

    for i in param_grid:
        print('Start Training for {}'.format(i))
        als = ALS(rank=i[0], maxIter=10, regParam=i[1], userCol="user_id_indexed",
                  itemCol="track_id_indexed", ratingCol=rateCol, implicitPrefs=True,
                  alpha=i[2], nonnegative=True, coldStartStrategy="drop")
        model = als.fit(train)
        print('Finish Training for {}'.format(i))

        # Make top 500 recommendations for users in validation set
        res = model.recommendForUserSubset(user_id, 500)
        pred_label = res.select('user_id_indexed', 'recommendations.track_id_indexed')
        pred_true_rdd = pred_label.join(F.broadcast(true_label), 'user_id_indexed', 'inner') \
            .rdd \
            .map(lambda row: (row[1], row[2]))

        print('Start Evaluating for {}'.format(i))
        metrics = RankingMetrics(pred_true_rdd)
        map_ = metrics.meanAveragePrecision
        ndcg = metrics.ndcgAt(500)
        mpa = metrics.precisionAt(500)
        print(i, 'map score: ', map_, 'ndcg score: ', ndcg, 'precision score: ', mpa)
# (truncated .show() output of per-user song-id lists, omitted here)

combined = (
    recommended_songs.join(relevant_songs, on='user_id_encoded', how='inner')
    .rdd
    .map(lambda row: (row[1], row[2]))
)
combined.cache()
combined.count()  # 929537
combined.take(1)
# ([107048, 127769, 129688, 113295, 145331], [43243, 32053, 32958, 25699, 33861])

rankingMetrics = RankingMetrics(combined)
ndcgAtK = rankingMetrics.ndcgAt(k)
print(ndcgAtK)  # 1.8102832147923323e-05
# (truncated .show() output, only showing top 5 rows)

from pyspark.mllib.evaluation import RankingMetrics

# (c)
# calculate ranking metrics
metrics = RankingMetrics(
    recommendation_and_relevant \
        .select(['song_recommended', 'song_relevant']) \
        .rdd \
        .map(lambda row: (row[0], row[1]))
)

print("PRECISION @ 10: ", metrics.precisionAt(N))
print("MAP @ 10: ", metrics.meanAveragePrecision)
print("NDCG @10: ", metrics.ndcgAt(N))
# PRECISION @ 10:  0.7214197272224288
# MAP @ 10:  0.727511046690355
# NDCG @10:  0.8838562387037185

K = 5
print("PRECISION @ 5: ", metrics.precisionAt(K))
# meanAveragePrecision takes no cutoff, so the "MAP @ 5" value is identical to "MAP @ 10"
print("MAP @ 5: ", metrics.meanAveragePrecision)
print("NDCG @5: ", metrics.ndcgAt(K))
# PRECISION @ 5:  0.8564546973286272
# MAP @ 5:  0.727511046690355
# NDCG @5:  0.8868546173633727
# (truncated .show() output: per-user lists of song_label_int ids, omitted here)

test_metric = temp.join(test_array, on="user_label_int", how="left")
test_metric = test_metric.drop("user_label_int")
# (truncated .show() output, top 1 row: user_label_int with its `recommends` list and
#  the collect_list(song_label_int) ground truth)

test_rdd = test_metric.rdd

from pyspark.mllib.evaluation import RankingMetrics
metrics_2 = RankingMetrics(test_rdd)
metrics_2.precisionAt(5)        # 0.06123845908946199
metrics_2.ndcgAt(10)            # 0.05557657159203472
metrics_2.meanAveragePrecision  # 0.00965199857704829
# (truncated .show() output omitted)

compare = recommends.join(ground_truths, on='user', how='left')
compare = [(r.__getattr__('recommends'), r.__getattr__('ground_truths'))
           for r in compare.collect()]
compare = sc.parallelize(compare)

# Alternative method
compare = sc.parallelize(compare.collect())  # this takes longer than the '__getattr__()' method above

# print metrics
metrics = RankingMetrics(compare)
print(metrics.precisionAt(5))          # 0.06666666666666667
print(metrics.ndcgAt(10))              # 0.06506334166535027
print(metrics.meanAveragePrecision)    # 0.027777777777777776

# predict test and rmse
predict = model.transform(test)
predict = predict.filter(F.col('prediction') != float('nan'))
reg_eval = RegressionEvaluator(predictionCol='prediction', labelCol='rating',
                               metricName='rmse')
reg_eval.evaluate(predict)  # 4.856047802562721

# testing NDCG metric on bad documents
set1 = sc.parallelize([([1, 2, 3], [1, 2])])
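# The snippet above is cut off after building `set1`; a plausible completion of the
# check (an assumption, not the original author's code) would simply evaluate it:
metrics_bad = RankingMetrics(set1)
print(metrics_bad.ndcgAt(10))  # expected to be 1.0: both relevant items sit in the top
                               # two positions, despite the extra irrelevant item 3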
def main(spark, val_pq, model_file_path):
    '''
    Args
    -------
    val_pq: validation data
    model_file_path: path to the pipeline (stringIndexers + als) model
    '''
    # Read data
    val = spark.read.parquet(val_pq)

    print('load trained model')
    # Load the trained pipeline model
    model = PipelineModel.load(model_file_path)

    # evaluation
    print("Run prediction")
    # Run the model to create prediction against a validation set
    preds = model.transform(val)

    print("Run evaluation")
    # model evaluation using rmse on val data
    print("Start evaluation using rmse")
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(preds)

    # Generate top 500 book recommendations for each user in validation data.
    # Returns a DataFrame of (userCol, recommendations),
    # where recommendations are stored as an array of (itemCol, rating) Rows.
    #user_id = preds.select("user_id_idx").distinct()
    #res = model.stages[-1].recommendForUserSubset(user_id, 500)
    print("generate top 500 book recommendations for val users")
    res = model.stages[-1].recommendForAllUsers(500)
    preds_per_user = res.selectExpr("user_id_idx",
                                    "recommendations.book_id_idx as preds_books")
    # preds_per_user.show(5)

    true_per_user = preds.select("user_id_idx", "book_id_idx").filter("rating>=3")\
        .groupBy("user_id_idx")\
        .agg(expr("collect_set(book_id_idx) as books"))
    # true_per_user.show(5)

    print("Start join")
    # true_vs_preds_per_user: an RDD of (predicted ranking, ground truth set) pairs
    # true_vs_preds_per_user = preds_per_user.join(true_per_user, ["userId"]).rdd\
    #     .map(lambda row: (row.items_pred, row.items)).cache()
    true_vs_preds_per_user = preds_per_user.join(true_per_user, ["user_id_idx"])\
        .select("preds_books", "books").rdd
    # print(*true_vs_preds_per_user.take(5), sep="\n")

    # Evaluate using RMSE
    #evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="??")
    #rmse = evaluator.evaluate(preds)
    #print(f'The out-of-sample RMSE of the current model is: {rmse:.2f}')

    # Evaluate using MAP
    print("Start evaluation using MAP")
    metrics = RankingMetrics(true_vs_preds_per_user)
    map_ = metrics.meanAveragePrecision

    # Evaluate using ndcg
    print("Start evaluation using ndcg")
    ndcg = metrics.ndcgAt(500)

    # Evaluate using precision
    mpa = metrics.precisionAt(500)

    print('rmse score: ', rmse, 'map score: ', map_, 'ndcg score: ', ndcg,
          'mpa score: ', mpa)
# Join the song recommendations and actual songs played together for each user.
user_songs = (
    predicted_songs_per_user
    .join(
        actual_songs_per_user,
        on='User_ID_encoded',
        how='inner'
    )
)
user_songs.show(20, 100)

# Select only the recommended and actual song columns and convert the dataframe to an rdd.
user_songs_rdd = user_songs.select(F.col('recommended_songs'), F.col('relevant_songs')).rdd
user_songs_rdd.cache()

# Compute the ranking metrics for the collaborative filtering model.
rank_metrics = RankingMetrics(user_songs_rdd)

# Precision @ 5: 0.907 (3dp)
precision_at_5 = rank_metrics.precisionAt(5)
print(precision_at_5)

# NDCG @ 10: 0.906 (3dp)
ndcg_at_10 = rank_metrics.ndcgAt(10)
print(ndcg_at_10)

# MAP: 0.699 (3dp)
mean_average_precision = rank_metrics.meanAveragePrecision
print(mean_average_precision)
.config("spark.executor.memory", "8g") .config("spark.driver.memory", "8g") .getOrCreate()) spark.sparkContext.setLogLevel("ERROR") train = spark.read.parquet(f'{sys.argv[1]}/train.parquet') test = spark.read.parquet(f'{sys.argv[1]}/test.parquet') validation = spark.read.parquet(f'{sys.argv[1]}/validation.parquet') als = ALS(rank=15, maxIter=5, regParam=0.001,userCol="user_id", itemCol="book_id", ratingCol="rating", seed=0, coldStartStrategy="drop") StartT = time.time() model = als.fit(train) EndT = time.time() T = EndT - StartT print(f"The running time is: {T}") predictions = model.transform(test) predictions = predictions.orderBy(predictions.prediction.desc()) final_prediction = predictions.filter(predictions.prediction >= 0).groupBy("user_id").agg(F.collect_list("book_id").alias("prediction")) predictions = predictions.orderBy(predictions.rating.desc()) final_rating = predictions.groupBy("user_id").agg(F.collect_list("book_id").alias("rating")) final = final_prediction.join(final_rating, final_prediction.user_id == final_rating.user_id, 'inner')\ .select(final_prediction.user_id, final_prediction.prediction, final_rating.rating) metrics = RankingMetrics(final.select('prediction', 'rating').rdd.map(tuple)) res = metrics.ndcgAt(500) Precision = metrics.precisionAt(500) print(f"The NDCG evaluation result is: {res}") print(f"The PrecisionAtK evaluation result is: {Precision}")
predictionCol="prediction") rmse = evaluator.evaluate(predictions_test) print("Root-mean-square error = " + str(rmse)) test.createOrReplaceTempView('test') test_true = spark.sql( 'select user, book from test where rating > 2 sort by rating desc') labels = test_true.groupby('user').agg(collect_list('book')) test_recommendations = model.recommendForUserSubset(labels.select('user'), 500) preds = test_recommendations.withColumn( 'recommendations', explode('recommendations')).select( 'user', 'recommendations.item').groupBy('user').agg(collect_list('item')) preds_and_labels = preds.join(labels, on='user') metrics = RankingMetrics( preds_and_labels.select('collect_list(item)', 'collect_list(book)').rdd) map_metric = metrics.meanAveragePrecision pA = metrics.precisionAt(500) ndcgA = metrics.ndcgAt(500) results.append((rank, reg, rmse, map_metric, pA, ndcgA)) print('MAP = ', map_metric, ' pA = ', pA, ' ndcgA = ', ndcgA, '\n') res_rdd = spark.sparkContext.parallelize(results) res_df = spark.createDataFrame(res_rdd).repartition(1) res_df.write.csv('test_results.csv')
def main(spark, train_pq, val_pq):
    '''
    Args
    -------
    train_pq: training data
    val_pq: validation data
    '''
    import itertools

    # Read train and val data
    print("load train and validation data")
    train = spark.read.parquet(train_pq)
    val = spark.read.parquet(val_pq)

    # Increase partition size of train data to reduce task load
    #train.repartition(200)

    # Pipeline
    # StringIndexers
    print("build stringIndexer")
    indexer_user = StringIndexer(inputCol="user_id", outputCol="user_id_idx",
                                 handleInvalid='skip')
    indexer_book = StringIndexer(inputCol="book_id", outputCol="book_id_idx",
                                 handleInvalid='skip')

    # Hyper-parameter tuning
    rank_ = [10, 15, 20]
    regParam_ = [0.01, 0.05, 0.1, 0.3, 1]
    param_grid = itertools.product(rank_, regParam_)

    # ALS model hyperparameter tuning
    for i in param_grid:
        print('training for {} start'.format(i))
        als = ALS(maxIter=10, rank=i[0], regParam=i[1],
                  userCol="user_id_idx", itemCol="book_id_idx", ratingCol="rating",
                  coldStartStrategy="drop").setSeed(42)

        # Combine into the pipeline
        pipeline = Pipeline(stages=[indexer_user, indexer_book, als])
        model = pipeline.fit(train)
        print('training for {} complete'.format(i))

        # prediction against validation data
        preds = model.transform(val)

        # model evaluation using rmse on val data
        print("Start evaluation using rmse for {}".format(i))
        evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                        predictionCol="prediction")
        rmse = evaluator.evaluate(preds)

        # Make top 500 recommendations for users in validation set
        print('evaluation for {} start'.format(i))
        res = model.stages[-1].recommendForAllUsers(500)
        preds_per_user = res.selectExpr(
            "user_id_idx", "recommendations.book_id_idx as preds_books")
        true_per_user = preds.select("user_id_idx", "book_id_idx").filter("rating>=3")\
            .groupBy("user_id_idx")\
            .agg(expr("collect_set(book_id_idx) as books"))

        print("Start join for {}".format(i))
        true_vs_preds_per_user = preds_per_user.join(true_per_user, ["user_id_idx"])\
            .select("preds_books", "books").rdd

        # Evaluate using MAP
        print("Start evaluation using MAP for {}".format(i))
        metrics = RankingMetrics(true_vs_preds_per_user)
        map_ = metrics.meanAveragePrecision

        # Evaluate using ndcg
        print("Start evaluation using ndcg for {}".format(i))
        ndcg = metrics.ndcgAt(500)

        # Evaluate using precision
        print("Start evaluation using precisionAtK for {}".format(i))
        mpa = metrics.precisionAt(500)

        print(i, 'rmse score: ', rmse, 'map score: ', map_, 'ndcg score: ', ndcg,
              'mpa score: ', mpa)
# NOTE: RankingMetrics expects an RDD of (predicted item list, ground-truth item list)
# pairs per user, so the per-user predictions are first collected into ranked lists.
predictions = model.predictAll(testData) \
    .map(lambda r: (r.user, (r.product, r.rating))) \
    .groupByKey() \
    .mapValues(lambda recs: [p for p, _ in sorted(recs, key=lambda x: x[1], reverse=True)])
groundTruth = ratings.map(lambda r: (r.user, r.product)).groupByKey().mapValues(list)
predictionAndLabels = predictions.join(groundTruth).map(lambda tup: tup[1])

# calculate
metrics = RankingMetrics(predictionAndLabels)
metrics.precisionAt(5)
metrics.ndcgAt(10)
metrics.meanAveragePrecision
def main(spark, data_file):
    '''Main routine for supervised evaluation

    Parameters
    ----------
    spark : SparkSession object
    data_file : string, path to the parquet file to load
    '''
    # Load the dataframe from data file
    #input_data = spark.read.parquet(data_file)
    df = spark.read.csv(data_file, header=True)
    #df = df.limit(500000)
    df = df.filter(df.user_id.isNotNull())
    print('1')
    df.show()

    df_unique_user = df.select("user_id").distinct()
    df_unique_user = df_unique_user.selectExpr("user_id as uid")
    percent = df_unique_user.count()
    percent = 0.001 * percent
    percent = math.floor(percent)
    print(percent)
    df_unique_user = df_unique_user.limit(percent)
    print('df unique user')
    df_unique_user.show()

    df_final = df.join(df_unique_user, df.user_id == df_unique_user.uid,
                       "inner").select(df.user_id, df.book_id, df.is_read,
                                       df.rating, df.is_reviewed)
    df = df_final
    print('2')
    df.show()

    # FILTER USERS < 10
    counts = df.groupBy('user_id').count().selectExpr("user_id as uid", "count as count")
    df = df.join(counts, df.user_id == counts.uid).filter(F.col("count") > 10).drop(
        'uid', 'count')
    print('3')
    df.show()

    from pyspark.sql.types import DoubleType
    df = df.filter(df.rating.isNotNull())
    df = df.withColumn("rating", df["rating"].cast(DoubleType()))

    from pyspark.ml.feature import StringIndexer
    stage_1 = StringIndexer(inputCol='user_id', outputCol='user_id_index')
    #df = stage_1.setHandleInvalid("keep").fit(df).transform(df)
    df = stage_1.fit(df).transform(df)
    stage_2 = StringIndexer(inputCol='book_id', outputCol='book_id_index')
    #transformed = stage_2.setHandleInvalid("keep").fit(df).transform(df)
    df = stage_2.fit(df).transform(df)

    user_id = df.select("user_id").distinct()
    uid = df.select(F.collect_set('user_id').alias('user_id')).first()['user_id']

    # get the count of each user_id
    counts = df.groupBy('user_id').count()
    # Show count of each user_id
    counts = counts.selectExpr('user_id as user_id', 'count as n')  # Rename count as n

    # Create Train, Test and Validation sets 60-20-20
    train_size = int(0.6 * len(uid))
    vali_size = train_size + int(0.2 * len(uid))
    test_size = vali_size + int(0.2 * len(uid))
    train_set = uid[:train_size]
    vali_set = uid[train_size:vali_size]
    test_set = uid[vali_size:]
    train_set = df.filter(df.user_id.isin(train_set))
    vali_set = df.filter(df.user_id.isin(vali_set))
    test_set = df.filter(df.user_id.isin(test_set))

    vali_uid = vali_set.select(F.collect_set('user_id').alias('user_id')).first()['user_id']
    test_uid = test_set.select(F.collect_set('user_id').alias('user_id')).first()['user_id']

    # For each validation user, use half of their interactions for training
    validict = {i: 0.5 for i in vali_uid}
    new_vali = vali_set.sampleBy("user_id", fractions=validict, seed=40)
    testdict = {i: 0.5 for i in test_uid}
    new_test = test_set.sampleBy("user_id", fractions=testdict, seed=40)
    vali_set = vali_set.exceptAll(new_vali)
    train_set = train_set.union(new_vali)
    test_set = test_set.exceptAll(new_test)
    train_set = train_set.union(new_test)
    train_set.show()

    # ALS
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.recommendation import ALS
    from pyspark.mllib.evaluation import RankingMetrics
    from pyspark.sql.functions import expr
    import itertools as it

    rank_ = [5, 10, 15]
    regParam_ = [0.01, 0.1, 1.0]
    alpha_ = [1, 2, 5]
    param_grid = it.product(rank_, regParam_, alpha_)
    vals_list = []
    stats = []
    rmse_list = []
    best_map = -1
    best_model = None

    for i in param_grid:
        print('Start Training for {}'.format(i))
        als = ALS(rank=i[0], maxIter=10,
                  regParam=i[1], alpha=i[2], userCol="user_id_index",
                  itemCol="book_id_index", ratingCol='rating',
                  nonnegative=True, coldStartStrategy="drop")
        model = als.fit(train_set)

        user_subset = vali_set.select("user_id_index").distinct()
        userRecs = model.recommendForUserSubset(user_subset, 500)
        from pyspark.sql.functions import expr
        print('Recommended')

        true_label = test_set.select('user_id_index', 'book_id_index')\
            .groupBy('user_id_index')\
            .agg(expr('collect_list(book_id_index) as true_item'))
        pred_label = userRecs.select('user_id_index', 'recommendations.book_id_index')
        print('pred_label')
        pred_true_rdd = pred_label.join(F.broadcast(true_label), 'user_id_index', 'inner') \
            .rdd \
            .map(lambda row: (row[1], row[2]))
        print('pred_true_rdd')

        metrics = RankingMetrics(pred_true_rdd)
        map_ = metrics.meanAveragePrecision
        ndcg = metrics.ndcgAt(500)
        mpa = metrics.precisionAt(500)

        evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                        predictionCol="prediction")
        predictions = model.transform(vali_set)
        predsDF = predictions.filter(predictions.rating.between(3, 5)).collect()
        predsDF = spark.createDataFrame(predsDF)
        rmse = evaluator.evaluate(predsDF)
        rmse_list.append(rmse)

        # keep the model with the highest MAP
        if map_ > best_map:
            best_model = model
            best_map = map_
            #print('New best model')

        stats.append([i[0], i[1], i[2], rmse])
        columns = ['Alpha', 'Rank', 'RegParam', 'MAP', 'Precision', 'NDCG', 'RMSE']
        vals_list.append((i[2], i[0], i[1], map_, mpa, ndcg, rmse))
        print('MAP: %f' % map_)
        print('Precision: %f' % mpa)
        print('NDCG: %f' % ndcg)
        print('rmse %f:' % rmse)
        plt.scatter(i[0], rmse)
        #plt.pause(0.05)
        plt.show()

    #als = ALS(maxIter=5, regParam=0.09, rank=200, userCol="user_id_index", itemCol="book_id_index", ratingCol="rating", coldStartStrategy="drop", nonnegative=True)
    #model = als.fit(train_set)
    #evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    #predictions = model.transform(vali_set)
    predictions.show()

    convertToFloat = lambda lines: [float(x) for x in vals_list]
    #from pyspark.sql.types import *
    mySchema = StructType([
        StructField("Alpha", IntegerType(), True),
        StructField("Rank", IntegerType(), True),
        StructField("Reg_Param", DoubleType()),
        StructField("MAP", DoubleType(), True),
        StructField("Precision", DoubleType(), True),
        StructField("NDCG", DoubleType(), True),
        StructField("RMSE", DoubleType(), True)])
    df = spark.createDataFrame(vals_list, schema=mySchema)
    df.show()

    # Evaluation of test set
    #print('Finish Training for {}'.format(i))
    user_subset = test_set.select("user_id_index").distinct()
    userRecs = best_model.recommendForUserSubset(user_subset, 500)
    true_label = test_set.select('user_id_index', 'book_id_index')\
        .groupBy('user_id_index')\
        .agg(expr('collect_list(book_id_index) as true_item'))
    pred_label = userRecs.select('user_id_index', 'recommendations.book_id_index')
    pred_true_rdd = pred_label.join(F.broadcast(true_label), 'user_id_index', 'inner') \
        .rdd \
        .map(lambda row: (row[1], row[2]))
    metrics = RankingMetrics(pred_true_rdd)
    map_ = metrics.meanAveragePrecision
    ndcg = metrics.ndcgAt(500)
    mpa = metrics.precisionAt(500)

    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    predictions = model.transform(test_set)
    predsDF = predictions.filter(predictions.rating.between(3, 5)).collect()
    predsDF = spark.createDataFrame(predsDF)
    rmse = evaluator.evaluate(predsDF)

    print('Test Metrics:')
    print('MAP: %f' % map_)
    print('Precision: %f' % mpa)
    print('NDCG: %f' % ndcg)
    print('rmse %f:' % rmse)

    # Latent Factors
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from sklearn.manifold import TSNE

    ufac_df = best_model.userFactors.toPandas()
    ifac_df = best_model.itemFactors.toPandas()
    ufac_matrix = np.vstack(ufac_df.features.values)
    ifac_matrix = np.vstack(ifac_df.features.values)

    sns.set(rc={'figure.figsize': (11.7, 8.27)})
    palette = sns.color_palette("hls", 10)

    X = ufac_matrix
    Y = ifac_matrix
    tsne = TSNE()
    X_embedded = tsne.fit_transform(X)
    Y_embedded = tsne.fit_transform(Y)
    plot_users = sns.scatterplot(X_embedded[:, 0], X_embedded[:, 1],
                                 legend='full', palette=palette)
    plot_items = sns.scatterplot(Y_embedded[:, 0], Y_embedded[:, 1], legend='full')

    pkl.dump(ufac_matrix, open('ufac_matrix.pkl', 'wb'))
    pkl.dump(ifac_matrix, open('ifac_matrix.pkl', 'wb'))