def main():
    """Train an ALS recommender on the MovieLens 'small' ratings file,
    print the test RMSE, and show several top-10 recommendation views.

    Relies on module-level names: `SparkSession`, `ALS`,
    `RegressionEvaluator`, and `extract` (the per-row CSV parser).
    """
    spark = SparkSession \
        .builder \
        .appName("ALSExample") \
        .getOrCreate()

    # $example on$
    # NOTE(review): hard-coded local path — parameterize before reuse.
    lines = spark.read.text(
        "/Users/asapehrsson/dev/learn/hadoop_spark_jupyter/data/ml-latest-small/ratings.csv"
    ).rdd
    parts = lines.map(lambda row: row.value.split(","))
    # `extract` returns None for unparseable rows (presumably the CSV
    # header) — filter those out before building the DataFrame.
    ratingsRDD = parts.map(extract).filter(lambda x: x is not None)
    ratings = spark.createDataFrame(ratingsRDD)
    (training, test) = ratings.randomSplit([0.8, 0.2])

    # Build the recommendation model using ALS on the training data.
    # Cold start strategy 'drop' ensures we don't get NaN evaluation metrics.
    als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId",
              ratingCol="rating", coldStartStrategy="drop")
    model = als.fit(training)

    # Evaluate the model by computing the RMSE on the test data.
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("Root-mean-square error = " + str(rmse))

    # Generate top 10 movie recommendations for each user.
    userRecs = model.recommendForAllUsers(10)
    # Generate top 10 user recommendations for each movie.
    movieRecs = model.recommendForAllItems(10)
    # Generate top 10 movie recommendations for a specified set of users.
    users = ratings.select(als.getUserCol()).distinct().limit(3)
    userSubsetRecs = model.recommendForUserSubset(users, 10)
    # Generate top 10 user recommendations for a specified set of movies.
    movies = ratings.select(als.getItemCol()).distinct().limit(3)
    movieSubSetRecs = model.recommendForItemSubset(movies, 10)
    # $example off$

    userRecs.show()
    movieRecs.show()
    userSubsetRecs.show()
    movieSubSetRecs.show()

    # FIX: removed `p = movieRecs.toPandas()` — it pulled the whole result
    # to the driver and was never read.
    spark.stop()
def run_spark_als(file_path):
    """Train ALS models on implicit-style event data and report RMSE.

    Events are weighted into a numeric 'rate' column (view=1, addtocart=5,
    transaction=10). Relies on module-level `sql_context`, `read_data`,
    `ALS`, and `RegressionEvaluator`.

    NOTE(review): `file_path` is unused — `read_data(sql_context)` appears
    to load the data itself; confirm and either use or remove the param.
    """
    read_data(sql_context)
    als_data_frame = sql_context.sql("""
        select visitorid, itemid,
               case when event = 'view' then 1
                    when event = 'addtocart' then 5
                    when event = 'transaction' then 10
                    else 0
               end as rate
        from event_table
    """)
    print(als_data_frame.count())
    als_data_frame.show()
    (training, test) = als_data_frame.randomSplit([0.7, 0.3])

    # Degenerate 1x1 grid search (kept as in the original); widen the
    # ranges to actually sweep hyper-parameters.
    # BUG FIX: the original always trained with regParam=0.3 and
    # maxIter=iterNum+1 while *printing* base_reg and iterNum — the
    # reported values did not match the trained model. They now agree.
    base_reg = 0.01
    for iterNum in range(1):
        for _ in range(1):
            als = ALS(maxIter=iterNum + 1, regParam=base_reg,
                      implicitPrefs=False,
                      userCol="visitorid", itemCol="itemid",
                      ratingCol="rate", coldStartStrategy="drop")
            model = als.fit(training)
            # Evaluate the model by computing the RMSE on the test data.
            predictions = model.transform(test)
            evaluator = RegressionEvaluator(metricName="rmse",
                                            labelCol="rate",
                                            predictionCol="prediction")
            rmse = evaluator.evaluate(predictions)
            print("iterNum: %s, regParam: %s, Root-mean-square error = %s"
                  % (iterNum + 1, base_reg, str(rmse)))
            base_reg += 0.1

    model.itemFactors.show()
    model.userFactors.show()

    # Generate top 10 recommendations in both directions.
    userRecs = model.recommendForAllUsers(10)
    movieRecs = model.recommendForAllItems(10)
    # Top 10 recommendations for a small subset of users / items.
    users = als_data_frame.select(als.getUserCol()).distinct().limit(3)
    userSubsetRecs = model.recommendForUserSubset(users, 10)
    movies = als_data_frame.select(als.getItemCol()).distinct().limit(3)
    movieSubSetRecs = model.recommendForItemSubset(movies, 10)
    # $example off$

    userRecs.show(20, False)
    movieRecs.show(20, False)
    userSubsetRecs.show(20, False)
    movieSubSetRecs.show(20, False)
def find_personal_recommendation_als(train_data, my_fav, my_least_fav=None,
                                     reg_param=0.3, rank=16, maxiter=17):
    """Give personal restaurant recommendations to a synthesized user.

    A new user is added to the data with their favourite restaurants rated 5
    and least favourite rated 1 (via `add_new_user_data`), an ALS model is
    trained on the augmented set, and the top-10 recommendations for that
    user are printed and returned.

    Parameters
    ----------
    train_data : Spark DataFrame
        Initial data of user_id/business_id/rating combinations.
    my_fav : list of str
        Favourite restaurants; assumed to be rated 5.
    my_least_fav : list of str, optional
        Least favourite restaurants; assumed to be rated 1.
    reg_param : float
        ALS regularization (lambda), the overfitting penalty.
    rank : int
        ALS rank, controlling model complexity.
    maxiter : int
        Maximum number of ALS iterations.

    Returns
    -------
    Spark DataFrame
        The top-10 recommendations for the new user.
    """
    new_user_id, new_set = add_new_user_data(train_data, my_fav, my_least_fav)
    best_als = ALS(maxIter=maxiter,
                   regParam=reg_param,
                   userCol="user_id_int",
                   itemCol="business_id_int",
                   ratingCol="stars_review_int",
                   rank=rank,
                   coldStartStrategy="drop",
                   checkpointInterval=2)
    model = best_als.fit(new_set)
    users = new_set.select(
        best_als.getUserCol()).where(col('user_id_int') == new_user_id)
    userSubsetRecs = model.recommendForUserSubset(users, 10)
    # FIX: corrected typo in the user-facing message ("recommandations").
    print(
        "Here are the top 10 recommendations for you given your favourite and least favourite food"
    )
    for restaurant in userSubsetRecs.select("recommendations").collect()[0][0]:
        # IDIOM: index the Row directly instead of calling __getattr__.
        restaurant_id = restaurant["business_id_int"]
        restaurant_name = train_data.where(
            col('business_id_int') == restaurant_id).select(
                'name').distinct().collect()
        print(restaurant_name[0]["name"])
    return userSubsetRecs
class Recommendation:
    """Movie recommender backed by a Spark ALS model."""

    def __init__(self, spark, filename):
        """Read ratings data, train an ALS model, and log its test RMSE."""
        # TO DO
        # Read data
        #
        # self.ratings = ...
        # NOTE(review): `ratings` is undefined until the TODO above is
        # completed; recommend_for_setusers/setmovies also require
        # self.ratings to be set here.
        (training, test) = ratings.randomSplit([0.8, 0.2])

        # Build the recommendation model using ALS on the training data.
        # Cold start strategy 'drop' ensures we don't get NaN metrics.
        self.als = ALS(maxIter=5,
                       regParam=0.01,
                       userCol="userId",
                       itemCol="movieId",
                       ratingCol="rating",
                       coldStartStrategy="drop")
        self.model = self.als.fit(training)

        # Evaluate the model by computing the RMSE on the test data.
        # BUG FIX: was `model.transform(test)` — `model` is undefined in
        # this scope; the fitted model lives in self.model.
        predictions = self.model.transform(test)
        evaluator = RegressionEvaluator(metricName="rmse",
                                        labelCol="rating",
                                        predictionCol="prediction")
        rmse = evaluator.evaluate(predictions)
        logger.info("Root-mean-square error = " + str(rmse))

    def recommend_for_users(self, num_movies):
        """Top `num_movies` movie recommendations for every user."""
        return self.model.recommendForAllUsers(num_movies)

    def recommend_for_movies(self, num_recommendations):
        """Top `num_recommendations` user recommendations for every movie."""
        return self.model.recommendForAllItems(num_recommendations)

    def recommend_for_setusers(self, num_users):
        """Top `num_users` movie recommendations for 3 arbitrary users."""
        users = self.ratings.select(self.als.getUserCol()).distinct().limit(3)
        return self.model.recommendForUserSubset(users, num_users)

    def recommend_for_setmovies(self, num_movies):
        """Top `num_movies` user recommendations for 3 arbitrary movies."""
        movies = self.ratings.select(self.als.getItemCol()).distinct().limit(3)
        return self.model.recommendForItemSubset(movies, num_movies)
# NOTE(review): fragment of a Databricks notebook (CS100-style lab),
# collapsed onto one line and cut at the top — the `for` loop that defines
# `error`, `err`, `ranks`, and `models` lives in an earlier, unseen cell.
# Uses Python 2 `print` statement syntax. `best_rank = err` stores the loop
# *index* (consistent with the later `ranks[best_rank]` lookups, so this
# looks intended, not a bug). Kept byte-identical; only comments added.
if error < min_error: min_error = error best_rank = err err += 1 als.setRank(ranks[best_rank]) print 'The best model was trained with rank %s' % ranks[best_rank] my_model = models[best_rank] # COMMAND ---------- # TEST Test.assertEquals(round(min_error, 2), 0.81, "Unexpected value for best RMSE. Expected rounded value to be 0.81. Got {0}".format(round(min_error, 2))) Test.assertEquals(ranks[best_rank], 12, "Unexpected value for best rank. Expected 12. Got {0}".format(ranks[best_rank])) Test.assertEqualsHashed(als.getItemCol(), "18f0e2357f8829fe809b2d95bc1753000dd925a6", "Incorrect choice of {0} for ALS item column.".format(als.getItemCol())) Test.assertEqualsHashed(als.getUserCol(), "db36668fa9a19fde5c9676518f9e86c17cabf65a", "Incorrect choice of {0} for ALS user column.".format(als.getUserCol())) Test.assertEqualsHashed(als.getRatingCol(), "3c2d687ef032e625aa4a2b1cfca9751d2080322c", "Incorrect choice of {0} for ALS rating column.".format(als.getRatingCol())) # COMMAND ---------- # MAGIC %md # MAGIC ### (2c) Testing Your Model # MAGIC # MAGIC So far, we used the `training_df` and `validation_df` datasets to select the best model. Since we used these two datasets to determine what model is best, we cannot use them to test how good the model is; otherwise, we would be very vulnerable to [overfitting](https://en.wikipedia.org/wiki/Overfitting). To decide how good our model is, we need to use the `test_df` dataset. We will use the `best_rank` you determined in part (2b) to create a model for predicting the ratings for the test dataset and then we will compute the RMSE. # MAGIC # MAGIC The steps you should perform are: # MAGIC * Run a prediction, using `my_model` as created above, on the test dataset (`test_df`), producing a new `predict_df` DataFrame. # MAGIC * Filter out unwanted NaN values (necessary because of [a bug in Spark](https://issues.apache.org/jira/browse/SPARK-14489)). We've supplied this piece of code for you. 
# MAGIC * Use the previously created RMSE evaluator, `reg_eval`, to evaluate the filtered DataFrame. # COMMAND ----------
def to_float(col):
    """Cast a single value to a Python float (used as a Spark UDF below)."""
    return float(col)


udf_to_float = udf(to_float, DoubleType())

# Cast the count and prediction columns to float so the regression
# evaluator can consume them as doubles.
predictions = rawPredictions\
    .withColumn("count", udf_to_float(rawPredictions["count"]))\
    .withColumn("prediction", udf_to_float(rawPredictions["prediction"]))

evaluator = RegressionEvaluator(metricName="rmse", labelCol="play_count",
                                predictionCol="prediction")
# rmse = evaluator.evaluate(predictions)
# print("Root-mean-square error = " + str(rmse))

# Question 2 B.
# Predicting songs for a certain user.
# NOTE(review): despite the name, `movies` holds *users* whose encoded id
# matches the regex 14[0-5] — recommendForUserSubset expects a user subset,
# so the logic is right but the variable name is misleading.
movies = als_dataset.select(als.getUserCol()).filter(
    als_dataset["user_label_int"].rlike("14[0-5]")).distinct()
movieSubSetRecs = model.recommendForUserSubset(movies, 10)
movieSubSetRecs.show(10, False)
# Sample output (one matching user):
# |user_label_int|recommendations                                           |
# |143           |[[10, 0.7991416], [7, 0.77103287], [0, 0.7623659], ...]  |
class RecommendationEngine:
    """A movie recommendation engine."""

    def __train_model(self):
        """Train the ALS model with the current dataset."""
        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5, regParam=0.01, userCol="userId",
                       itemCol="movieId", ratingCol="rating",
                       coldStartStrategy="drop")
        self.model = self.als.fit(self.ratingsdf)
        logger.info("ALS model built!")

    def get_top_ratings(self, user_id, movies_count):
        """Return up to `movies_count` movie recommendations for `user_id`
        as a JSON string (movieId joined with movie metadata)."""
        users = self.ratingsdf.select(self.als.getUserCol())
        users = users.filter(users.userId == user_id)
        userSubsetRecs = self.model.recommendForUserSubset(users, movies_count)
        userSubsetRecs = userSubsetRecs.withColumn("recommendations",
                                                   explode("recommendations"))
        # FIX: the predicted rating was previously selected, aliased, and
        # immediately dropped — select only the columns that survive.
        userSubsetRecs = userSubsetRecs.select(
            func.col('userId'),
            func.col('recommendations')['movieId'].alias('movieId'))
        userSubsetRecs = userSubsetRecs.join(self.moviesdf, ("movieId"),
                                             'inner')
        return userSubsetRecs.toPandas().to_json()

    def get_top_movie_recommend(self, movie_id, user_count):
        """Return up to `user_count` user recommendations for `movie_id`
        as a JSON string."""
        movies = self.ratingsdf.select(self.als.getItemCol())
        movies = movies.filter(movies.movieId == movie_id)
        movieSubsetRecs = self.model.recommendForItemSubset(movies, user_count)
        movieSubsetRecs = movieSubsetRecs.withColumn(
            "recommendations", explode("recommendations"))
        # FIX: same as get_top_ratings — drop the unused rating up front.
        movieSubsetRecs = movieSubsetRecs.select(
            func.col('movieId'),
            func.col('recommendations')['userId'].alias('userId'))
        movieSubsetRecs = movieSubsetRecs.join(self.moviesdf, ("movieId"),
                                               'inner')
        return movieSubsetRecs.toPandas().to_json()

    def get_movie_rating(self, user_id, movie_id):
        """Predict the rating `user_id` would give `movie_id`;
        returns collected prediction rows."""
        request = self.spark_session.createDataFrame([(user_id, movie_id)],
                                                     ["userId", "movieId"])
        ratings = self.model.transform(request).collect()
        return ratings

    def __init__(self, spark_session, dataset_path):
        """Init the engine from a Spark session and a dataset directory
        containing ratings.csv and movies.csv, then train the model."""
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session
        # Load ratings data for later use.
        logger.info("Loading Ratings data...")
        ratings_file_path = os.path.join(dataset_path, 'ratings.csv')
        self.ratingsdf = spark_session.read.csv(ratings_file_path,
                                                header=True,
                                                inferSchema=True).na.drop()
        self.ratingsdf = self.ratingsdf.drop("timestamp")
        # Load movies data for later use.
        logger.info("Loading Movies data...")
        movies_file_path = os.path.join(dataset_path, 'movies.csv')
        self.moviesdf = spark_session.read.csv(movies_file_path,
                                               header=True,
                                               inferSchema=True).na.drop()
        self.moviesdf = self.moviesdf.drop("genres")
        # Train the model.
        self.__train_model()
def main():
    """Recommend subreddits to reddit users with ALS on implicit
    comment-count data; writes RMSE and per-user top-10 lists to
    als-output.txt.

    Relies on module-level `SparkSession`, `ALS`, `RegressionEvaluator`,
    `udf`, `hash`, `get_author_from_id`, and `get_subreddit_from_id`.
    """
    spark = SparkSession \
        .builder \
        .appName("RedditRecommender") \
        .getOrCreate()

    data = spark.read.json('./sample_data.json')
    data = data.select(data.author, data.id, data.subreddit)
    data = data.filter(data.author != "[deleted]")

    @udf("boolean")
    def isNotDefault(x):
        # PERF: frozenset gives O(1) membership per row instead of
        # scanning a list.
        defaultSubs = frozenset([
            "Art", "AskReddit", "DIY", "Documentaries", "EarthPorn",
            "Futurology", "GetMotivated", "IAmA", "InternetIsBeautiful",
            "Jokes", "LifeProTips", "Music", "OldSchoolCool",
            "Showerthoughts", "UpliftingNews", "announcements", "askscience",
            "aww", "blog", "books", "creepy", "dataisbeautiful",
            "explainlikeimfive", "food", "funny", "gadgets", "gaming",
            "gifs", "history", "listentothis", "mildlyinteresting", "movies",
            "news", "nosleep", "nottheonion", "personalfinance", "philosophy",
            "photoshopbattles", "pics", "science", "space", "sports",
            "television", "tifu", "todayilearned", "videos", "worldnews"
        ])
        return x not in defaultSubs

    data = data.filter(isNotDefault(data.subreddit))
    data = data.groupBy([data.author, data.subreddit]).count().orderBy(data.author)
    # NOTE(review): `hash` here is presumably pyspark.sql.functions.hash
    # (a Column function) imported elsewhere — the builtin would fail on a
    # Column; confirm the import.
    data = data.withColumn('author_id', hash(data.author))
    data = data.withColumn('subreddit_id', hash(data.subreddit))

    (training, test) = data.randomSplit([0.8, 0.2])
    als = ALS(maxIter=5, rank=70, regParam=0.01, userCol="author_id",
              itemCol="subreddit_id", ratingCol="count",
              coldStartStrategy="drop", implicitPrefs=True)
    model = als.fit(training)
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="count",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)

    users = data.select(als.getUserCol()).distinct().limit(30)
    user_subset_recs = model.recommendForUserSubset(users, 10)

    # Map hashed ids back to readable names for the report.
    subreddit_recs = {}
    for row in user_subset_recs.collect():
        author = get_author_from_id(data, row['author_id'])
        subreddit_recs[author] = [
            get_subreddit_from_id(data, rec[0])
            for rec in row['recommendations']
        ]

    # BUG FIX: the output file was opened at the top and never closed;
    # `with` guarantees the buffer is flushed even on error.
    with open('als-output.txt', 'w') as output_file:
        output_file.write('Root mean squared error: ' + str(rmse) + '\n\n')
        for author in subreddit_recs:
            output_file.write('Top 10 recommendations for user ' + author + ':\n')
            for rec in subreddit_recs[author]:
                output_file.write(rec)
                output_file.write('\n')
            output_file.write('\n')
class RecommendationEngine:
    """A movie recommendation engine."""

    def __train_model(self):
        """Train the ALS model with the current dataset."""
        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5, regParam=0.01, userCol="userId",
                       itemCol="movieId", ratingCol="rating",
                       coldStartStrategy="drop")
        self.model = self.als.fit(self.ratingsdf)
        logger.info("ALS model built!")

    def get_top_ratings(self, user_id, movies_count):
        """Recommend up to `movies_count` top unrated movies to `user_id`;
        returns a JSON string of movieId/title rows."""
        users = self.ratingsdf.select(self.als.getUserCol())
        users = users.filter(users.userId == user_id)
        userSubsetRecs = self.model.recommendForUserSubset(users, movies_count)
        userSubsetRecs = userSubsetRecs.withColumn("recommendations",
                                                   explode("recommendations"))
        # FIX: the predicted rating was previously selected, aliased, and
        # immediately dropped — select only the surviving columns.
        userSubsetRecs = userSubsetRecs.select(
            func.col('userId'),
            func.col('recommendations')['movieId'].alias('movieId'))
        userSubsetRecs = userSubsetRecs.join(self.moviesdf, ("movieId"),
                                             'inner')
        return userSubsetRecs.toPandas().to_json()

    def get_top_movie_recommend(self, movie_id, user_count):
        """Recommend up to `user_count` users for `movie_id`; returns JSON.

        FIX: the original docstring was copy-pasted from get_top_ratings
        and described the wrong direction.
        """
        movies = self.ratingsdf.select(self.als.getItemCol())
        movies = movies.filter(movies.movieId == movie_id)
        movieSubsetRecs = self.model.recommendForItemSubset(movies, user_count)
        movieSubsetRecs = movieSubsetRecs.withColumn(
            "recommendations", explode("recommendations"))
        movieSubsetRecs = movieSubsetRecs.select(
            func.col('movieId'),
            func.col('recommendations')['userId'].alias('userId'))
        movieSubsetRecs = movieSubsetRecs.join(self.moviesdf, ("movieId"),
                                               'inner')
        return movieSubsetRecs.toPandas().to_json()

    def get_ratings_for_movie_ids(self, user_id, movie_id):
        """Given a user_id and a movie_id, predict the rating;
        returns collected prediction rows."""
        request = self.spark_session.createDataFrame([(user_id, movie_id)],
                                                     ["userId", "movieId"])
        ratings = self.model.transform(request).collect()
        return ratings

    def add_ratings(self, user_id, movie_id, ratings_given):
        """Add a rating (user_id, movie_id, rating), retrain the model,
        and return the new rating as JSON."""
        new_ratings = self.spark_session.createDataFrame(
            [(user_id, movie_id, ratings_given)],
            ["userId", "movieId", "rating"])
        # Add new ratings to the existing ones.
        self.ratingsdf = self.ratingsdf.union(new_ratings)
        # Re-train the ALS model with the new ratings.
        self.__train_model()
        return new_ratings.toPandas().to_json()

    def get_history(self, user_id):
        """Get rating history for a user, joined with movie titles."""
        # SECURITY FIX: the original interpolated user_id straight into a
        # SQL string with %-formatting (injection-prone); filter with the
        # DataFrame API instead.
        user_history = self.ratingsdf.select('userId', 'movieId', 'rating') \
            .filter(self.ratingsdf.userId == user_id)
        user_history = user_history.join(self.moviesdf, ("movieId"), 'inner')
        return user_history.toPandas().to_json()

    def __init__(self, spark_session, dataset_path):
        """Init the engine from a Spark session and a dataset path, then
        train the model."""
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session
        # Load ratings data for later use.
        logger.info("Loading Ratings data...")
        ratings_file_path = os.path.join(dataset_path,
                                         '../../datasets/ratings.csv')
        self.ratingsdf = spark_session.read.csv(ratings_file_path,
                                                header=True,
                                                inferSchema=True).na.drop()
        self.ratingsdf = self.ratingsdf.drop("timestamp")
        # Load movies data for later use.
        logger.info("Loading Movies data...")
        movies_file_path = os.path.join(dataset_path,
                                        '../../datasets/movies.csv')
        self.moviesdf = spark_session.read.csv(movies_file_path,
                                               header=True,
                                               inferSchema=True).na.drop()
        self.moviesdf = self.moviesdf.drop("genres")
        # Train the model.
        self.__train_model()
# NOTE(review): Databricks notebook cell collapsed onto one line —
# `min_error`, `best_rank`, `ranks`, `als`, and `Test` are defined in
# earlier cells not visible here, and the `# COMMAND` / `# MAGIC` markers
# encode cell boundaries, so the text is kept byte-identical; only these
# comments were added.
# COMMAND ---------- # TEST Test.assertEquals( round(min_error, 2), 0.81, "Unexpected value for best RMSE. Expected rounded value to be 0.81. Got {0}" .format(round(min_error, 2))) Test.assertEquals( ranks[best_rank], 12, "Unexpected value for best rank. Expected 12. Got {0}".format( ranks[best_rank])) Test.assertEqualsHashed( als.getItemCol(), "18f0e2357f8829fe809b2d95bc1753000dd925a6", "Incorrect choice of {0} for ALS item column.".format(als.getItemCol())) Test.assertEqualsHashed( als.getUserCol(), "db36668fa9a19fde5c9676518f9e86c17cabf65a", "Incorrect choice of {0} for ALS user column.".format(als.getUserCol())) Test.assertEqualsHashed( als.getRatingCol(), "3c2d687ef032e625aa4a2b1cfca9751d2080322c", "Incorrect choice of {0} for ALS rating column.".format( als.getRatingCol())) # COMMAND ---------- # MAGIC %md # MAGIC ### (2c) Testing Your Model # MAGIC # MAGIC So far, we used the `training_df` and `validation_df` datasets to select the best model. Since we used these two datasets to determine what model is best, we cannot use them to test how good the model is; otherwise, we would be very vulnerable to [overfitting](https://en.wikipedia.org/wiki/Overfitting). To decide how good our model is, we need to use the `test_df` dataset. We will use the `best_rank` you determined in part (2b) to create a model for predicting the ratings for the test dataset and then we will compute the RMSE. # MAGIC # MAGIC The steps you should perform are: # MAGIC * Run a prediction, using `my_model` as created above, on the test dataset (`test_df`), producing a new `predict_df` DataFrame.
class RecommendationEngine:
    """A book recommendation engine."""

    def __init__(self, spark_session, dataset_path):
        """Init the engine from a Spark session and a dataset directory
        containing ratings.csv and books.csv, then train the model."""
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session
        # Load ratings data for later use.
        logger.info("Loading Ratings dataset...")
        ratings_file_path = os.path.join(dataset_path, 'ratings.csv')
        self.ratings_df = spark_session.read.csv(ratings_file_path,
                                                 header="true",
                                                 inferSchema="true").na.drop()
        # Load books data for later use.
        logger.info("Loading Books dataset...")
        books_file_path = os.path.join(dataset_path, 'books.csv')
        self.books_df = spark_session.read.csv(books_file_path,
                                               header="true",
                                               inferSchema="true").na.drop()
        self.books_df.createOrReplaceTempView("books")
        # Only id/title are needed for joining onto recommendations.
        self.books_df_selected = self.spark_session.sql(
            "SELECT `book_id`, `title` \
             FROM books")
        # Train the model.
        self.__train_model()

    def __train_model(self):
        """Train the ALS model with the current dataset."""
        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5, regParam=0.01, userCol="user_id",
                       itemCol="book_id", ratingCol="rating",
                       coldStartStrategy="drop")
        self.model = self.als.fit(self.ratings_df)
        logger.info("ALS model built!")

    def get_top_ratings(self, user_id, book_count):
        """Recommend up to `book_count` top unrated books to `user_id`;
        returns a JSON string of book_id/title rows."""
        users = self.ratings_df.select(self.als.getUserCol())
        users = users.filter(users.user_id == user_id)
        userSubsetRecs = self.model.recommendForUserSubset(users, book_count)
        userSubsetRecs = userSubsetRecs.withColumn("recommendations",
                                                   explode("recommendations"))
        # FIX: the predicted rating was previously selected, aliased, and
        # immediately dropped — select only the surviving columns.
        userSubsetRecs = userSubsetRecs.select(
            func.col('user_id'),
            func.col('recommendations')['book_id'].alias('book_id'))
        userSubsetRecs = userSubsetRecs.join(self.books_df_selected,
                                             ("book_id"), 'inner')
        return userSubsetRecs.toPandas().to_json()

    def get_top_book_recommend(self, book_id, user_count):
        """Recommend up to `user_count` users for `book_id`; returns JSON."""
        books = self.ratings_df.select(self.als.getItemCol())
        books = books.filter(books.book_id == book_id)
        bookSubsetRecs = self.model.recommendForItemSubset(books, user_count)
        bookSubsetRecs = bookSubsetRecs.withColumn("recommendations",
                                                   explode("recommendations"))
        bookSubsetRecs = bookSubsetRecs.select(
            func.col('book_id'),
            func.col('recommendations')['user_id'].alias('user_id'))
        bookSubsetRecs = bookSubsetRecs.join(self.books_df_selected,
                                             ("book_id"), 'inner')
        return bookSubsetRecs.toPandas().to_json()

    def get_ratings_for_book_ids(self, user_id, book_id):
        """Given a user_id and a book_id, predict the rating;
        returns collected prediction rows."""
        request = self.spark_session.createDataFrame([(user_id, book_id)],
                                                     ["user_id", "book_id"])
        ratings = self.model.transform(request).collect()
        return ratings

    def add_ratings(self, user_id, book_id, ratings_given):
        """Add a rating (user_id, book_id, rating), retrain the model,
        and return the new rating as JSON."""
        new_ratings = self.spark_session.createDataFrame(
            [(user_id, book_id, ratings_given)],
            ["user_id", "book_id", "rating"])
        # Add new ratings to the existing ones.
        self.ratings_df = self.ratings_df.union(new_ratings)
        # Re-train the ALS model with the new ratings.
        self.__train_model()
        return new_ratings.toPandas().to_json()

    def get_history(self, user_id):
        """Get rating history for a user, joined with book titles."""
        # SECURITY FIX: the original interpolated user_id into a SQL string
        # with %-formatting (injection-prone); filter with the DataFrame
        # API instead.
        user_history = self.ratings_df.select('user_id', 'book_id', 'rating') \
            .filter(self.ratings_df.user_id == user_id)
        user_history = user_history.join(self.books_df_selected,
                                         ("book_id"), 'inner')
        return user_history.toPandas().to_json()
class ALSModelSpark(ALSModel):
    """Spark-backed implementation of the ALS recommender wrapper."""

    def __init__(self, params):
        """params: dict of ALS hyper-parameters forwarded to setParams."""
        super().__init__(params)
        self.sqlContext = SQLContext(sc)
        self._als = ALS(userCol='UID', itemCol='IID', ratingCol='stars',
                        coldStartStrategy="drop")
        self._als.setParams(**self.params)

    def parse_data(self, path_ratings, nrows):
        """Load the first `nrows` ratings from CSV and index the raw string
        ids into integer UID/IID columns (Spark ALS requires integer ids)."""
        df_ratings = self.sqlContext.read.csv(path_ratings, header=True,
                                              quote='"').limit(nrows)
        raw_to_uid = StringIndexer(inputCol="user_id",
                                   outputCol="UID").fit(df_ratings)
        self.data = raw_to_uid.transform(df_ratings)
        raw_to_iid = StringIndexer(inputCol="business_id",
                                   outputCol="IID").fit(df_ratings)
        self.data = raw_to_iid.transform(self.data)
        # UID and IID must be integers for Spark ALS.
        self.data = self.data.rdd.map(
            lambda r: (int(r['UID']), int(r['IID']), float(r['stars']))) \
            .toDF(("UID", "IID", "stars"))

    def update_parameters(self):
        """Push the current self.params into the underlying ALS estimator."""
        self._als.setParams(**self.params)

    def fit(self):
        """Fit the ALS model on the full parsed dataset."""
        self._model = self._als.fit(self.data)

    def predict(self, uid, iid):
        """Predict the rating of user `uid` for item `iid`.

        BUG FIX: the original transformed an undefined global `test` and
        ignored both arguments; build a one-row request frame instead.
        """
        request = self.sqlContext.createDataFrame([(uid, iid)],
                                                  ("UID", "IID"))
        return self._model.transform(request)

    def top_n_recommendations(self, uid, n=5):
        """Obtain the top-n recommendations for user `uid`.

        BUG FIX: the original requested a hard-coded 5 regardless of `n`.
        Returns (item_ids, predicted_ratings).
        """
        users = self.data.select(self._als.getUserCol())
        user = users.filter(users['UID'] == uid)
        topN = self._model.recommendForUserSubset(user, n).collect()
        top_n_iid, predictions = [], []
        for row in topN[0].recommendations:
            top_n_iid.append(row.IID)
            predictions.append(row.rating)
        return top_n_iid, predictions

    def cross_validate(self, train=0.8):
        """Return the RMSE on a random held-out split.

        Parameters
        ----------
        train : float
            Fraction of the data used for training. Default: 0.8
        """
        (trainset, testset) = self.data.randomSplit([train, 1. - train])
        _model = self._als.fit(trainset)
        _pred = _model.transform(testset)
        # NOTE: with coldStartStrategy="drop", _pred.count() may be small.
        _eval = RegressionEvaluator(metricName='rmse', labelCol='stars',
                                    predictionCol='prediction')
        rmse = _eval.evaluate(_pred)
        return rmse
# NOTE(review): fragment cut at both ends — the opening `def
# extract_songs_top_k(x, k):` lies before this chunk and the final
# triple-quoted output block is truncated. `k`, `test`, `als`, and
# `als_model` come from earlier, unseen code. The visible logic sorts
# (song, score) pairs by descending score and keeps the song ids.
# Kept byte-identical; only comments added.
x = sorted(x, key=lambda x: -x[1]) return [x[0] for x in x][0:k] extract_songs_top_k_udf = udf(lambda x: extract_songs_top_k(x, k), ArrayType(IntegerType())) def extract_songs(x): x = sorted(x, key=lambda x: -x[1]) return [x[0] for x in x] extract_songs_udf = udf(lambda x: extract_songs(x), ArrayType(IntegerType())) users = test.select(als.getUserCol()).distinct().limit(10) users.cache() userSubsetRecs = als_model.recommendForUserSubset(users, k) recommended_songs = (userSubsetRecs.withColumn( "recommended_songs", extract_songs_top_k_udf(col("recommendations"))).select( "user_id_encoded", "recommended_songs")) recommended_songs.cache() recommended_songs.count() ''' Output 10 ''' recommended_songs.show(10, 100) ''' Output
class RecommendationEngine:
    """A product recommendation engine.

    Maintains three independent ALS models (``model1``/``model2``/``model3``),
    each trained on its own ratings file (model-1.txt .. model-3.txt).
    Every public method takes a ``model`` index in {0, 1, 2} selecting the
    dataset/model pair to use; any other index yields ``None``, matching the
    original fall-through behaviour.
    """

    def _dataframe_for(self, model):
        """Ratings DataFrame backing model index 0, 1 or 2 (else None)."""
        return {0: self.df0, 1: self.df1, 2: self.df2}.get(model)

    def _model_for(self, model):
        """Fitted ALS model for index 0, 1 or 2 (else None)."""
        return {0: self.model1, 1: self.model2, 2: self.model3}.get(model)

    def __train_all_model(self):
        """Train all three ALS models.

        Consolidates three copy-pasted training blocks into one loop
        delegating to ``__train_model``.
        """
        for idx in range(3):
            logger.info("Training the ALS model %d", idx + 1)
            self.__train_model(idx)
            logger.info("ALS model %d built!", idx + 1)

    def __train_model(self, model):
        """Train the ALS model with the current dataset for index ``model``."""
        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5, regParam=0.01, userCol="UserId",
                       itemCol="ProductId", ratingCol="Rating",
                       coldStartStrategy="drop")
        if model == 0:
            self.model1 = self.als.fit(self.df0)
        elif model == 1:
            self.model2 = self.als.fit(self.df1)
        elif model == 2:
            self.model3 = self.als.fit(self.df2)
        logger.info("ALS model built!")

    def get_top_ratings(self, model, user_id, products_count):
        """Top ``products_count`` product recommendations for ``user_id``
        as a pandas-style JSON string (columns UserId, ProductId)."""
        if model not in (0, 1, 2):
            return None
        users = self._dataframe_for(model).select(self.als.getUserCol())
        users = users.filter(users.UserId == user_id)
        recs = self._model_for(model).recommendForUserSubset(
            users, products_count)
        # Flatten the recommendations array into one row per product.
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(
            func.col('UserId'),
            func.col('recommendations')['ProductId'].alias('ProductId'),
            func.col('recommendations')['Rating'].alias('Rating')).\
            drop('recommendations')
        # Callers only need the product ids, not the predicted score.
        recs = recs.drop('Rating')
        return recs.toPandas().to_json()

    def get_top_product_recommend(self, model, product_id, user_count):
        """Top ``user_count`` users recommended for ``product_id``
        as a pandas-style JSON string (columns ProductId, UserId)."""
        if model not in (0, 1, 2):
            return None
        products = self._dataframe_for(model).select(self.als.getItemCol())
        products = products.filter(products.ProductId == product_id)
        recs = self._model_for(model).recommendForItemSubset(
            products, user_count)
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(
            func.col('ProductId'),
            func.col('recommendations')['UserId'].alias('UserId'),
            func.col('recommendations')['Rating'].alias('Rating')).\
            drop('recommendations')
        recs = recs.drop('Rating')
        return recs.toPandas().to_json()

    def get_ratings_for_product_ids(self, model, user_id, product_id):
        """Predicted-rating Rows for a single (user_id, product_id) pair."""
        if model not in (0, 1, 2):
            return None
        request = self.spark_session.createDataFrame(
            [(user_id, product_id)], ["UserId", "ProductId"])
        return self._model_for(model).transform(request).collect()

    def __init__(self, spark_session, dataset_path):
        """Init the recommendation engine given a Spark session and a
        dataset path containing model-1.txt .. model-3.txt."""
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session
        # Load Amazon data for later use
        logger.info("Loading Amazon data...")
        for idx, attr in enumerate(("df0", "df1", "df2")):
            dataset_file_path = os.path.join(dataset_path,
                                             'model-%d.txt' % (idx + 1))
            if os.path.isfile(dataset_file_path):
                df = spark_session.read.csv(dataset_file_path, header=None,
                                            inferSchema=True)
                setattr(self, attr,
                        df.selectExpr("_c0 as UserId", "_c1 as ProductId",
                                      "_c2 as Rating"))
        # Train the three models
        self.__train_all_model()
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop") model = als.fit(training) # Evaluate the model by computing the RMSE on the test data predictions = model.transform(test) evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") rmse = evaluator.evaluate(predictions) print("Root-mean-square error = " + str(rmse)) # Generate top 10 movie recommendations for each user userRecs = model.recommendForAllUsers(10) # Generate top 10 user recommendations for each movie movieRecs = model.recommendForAllItems(10) # Generate top 10 movie recommendations for a specified set of users users = ratings.select(als.getUserCol()).distinct().limit(3) userSubsetRecs = model.recommendForUserSubset(users, 10) # Generate top 10 user recommendations for a specified set of movies movies = ratings.select(als.getItemCol()).distinct().limit(3) movieSubSetRecs = model.recommendForItemSubset(movies, 10) # $example off$ userRecs.show() movieRecs.show() userSubsetRecs.show() movieSubSetRecs.show() spark.stop()
# Bar chart of cross-validation RMSE per ALS rank.  `ranks` and `results`
# are computed earlier in the file, above this chunk.
fig, ax = plt.subplots()
rects = ax.bar([str(r) for r in ranks], results, label = "rmse")
ax.set_ylabel('RMSE result')
ax.set_title("RMSE by ranks")
ax.yaxis.set_data_interval(min(results), max(results),True)
# Annotate each bar with its RMSE value, just above the bar top.
for rect in rects:
    height = rect.get_height()
    ax.annotate(f'{height:.4f}',
                xy=(rect.get_x() + rect.get_width() / 2, height),
                xytext=(0, 3),
                textcoords="offset points",
                ha='center', va='bottom')
plt.savefig("./Output/Lab3_plot.png")

# select a user (sampling is deterministic thanks to `myseed`)
users = ratings.select(als.getUserCol()).distinct().sample(
    withReplacement = False, fraction = 0.1, seed = myseed).limit(1)
users.show()

# get recomendations from model
userSubsetRecs = model.recommendForUserSubset(users, 5)
userSubsetRecs.show(1, False)

# get movie_id — pull the recommended movie ids out of the single result row
movies = userSubsetRecs.collect()[0].recommendations
movies = [row.movieId for row in movies]
print(movies)

# loading movies.csv
movie_data = spark.read.load(
    '/home/lip20ps/com6012/ScalableML/Data/ml-latest-small/movies.csv',
    format = 'csv', inferSchema = "true", header = "true").cache()
movie_data.show(20, False)

# find movie according to movie_id — loop body continues past this chunk
for movie_id in movies:
class RecommendationEngine:
    """An anime recommendation engine holding one ALS model per ratings CSV.

    ``ratings_list[i]`` and ``model_list[i]`` are parallel lists: model ``i``
    was trained on dataset ``i``.
    """

    def __train_model(self, ratings):
        """Train an ALS model on ``ratings`` and return it.

        Bug fix: the original always fitted ``self.ratings_list[0]``, so
        every entry of ``model_list`` was trained on the first dataset.
        """
        logger.info("Training the ALS model...")
        self.als = ALS(rank=12, maxIter=21, regParam=0.16, userCol="userId",
                       itemCol="movieId", ratingCol="rating",
                       coldStartStrategy="drop", nonnegative=True)
        self.model = self.als.fit(ratings)
        logger.info("ALS model built!")
        return self.model

    def get_ratings_for_movie_ids(self, userId, movieId, model):
        """Predict the rating ``userId`` would give ``movieId`` using model
        index ``model``; returns a pandas-style JSON string."""
        dataframe = self.spark.createDataFrame([(userId, movieId)],
                                               ["userId", "movieId"])
        predictions = self.model_list[model].transform(dataframe)
        return predictions.toPandas().to_json()

    def get_top_ratings(self, userId, movies_count, model):
        """Recommend up to ``movies_count`` top movies to ``userId`` as JSON.

        Bug fix: recommendations now come from ``model_list[model]`` instead
        of ``self.model`` (whichever model happened to be trained last).
        """
        users = self.ratings_list[model].select(
            self.als.getUserCol()).distinct()
        users = users.filter(users.userId == userId)
        top_ratings = self.model_list[model].recommendForUserSubset(
            users, movies_count)
        self.json_top = top_ratings.toPandas().to_json()
        return self.json_top

    def __init__(self, spark, dataset_path):
        """Load every CSV under ``dataset_path`` (sorted by file name,
        capped at 1,000,000 rows each) and train one ALS model per file."""
        self.ratings_list = []
        self.model_list = []
        logger.info("Starting up the Recommendation Engine: ")
        self.spark = spark
        # Load ratings data for later use
        logger.info("Loading Ratings data...")
        listCsv = sorted(os.listdir(dataset_path))
        for name in listCsv:
            print(name)
            ratings_file_path = os.path.join(dataset_path, name)
            ratings = spark.read.csv(ratings_file_path, header=True,
                                     inferSchema=True).limit(1000000)
            self.ratings_list.append(ratings)
            self.model_list.append(self.__train_model(ratings))
def main():
    """Train an ALS recommender on '::'-delimited ratings stored in S3 and
    print RMSE plus sample recommendations."""
    cli = iter(sys.argv[1:])
    args = dict(zip(cli, cli))

    # Spark reads S3 through the s3a:// connector, so rewrite the scheme.
    s3_input_data = args['s3_input_data'].replace('s3://', 's3a://')
    print(s3_input_data)
    s3_output_data = args['s3_output_data'].replace('s3://', 's3a://')
    print(s3_output_data)

    spark = SparkSession.builder \
        .appName("Spark_ALS") \
        .getOrCreate()

    def to_row(fields):
        # fields: [userId, movieId, rating, timestamp] as strings.
        return Row(userId=int(fields[0]), movieId=int(fields[1]),
                   rating=float(fields[2]), timestamp=int(fields[3]))

    ratingsRDD = spark.read.text(s3_input_data).rdd \
        .map(lambda row: row.value.split("::")) \
        .map(to_row)
    ratings = spark.createDataFrame(ratingsRDD)
    training, test = ratings.randomSplit([0.8, 0.2])

    # Fit ALS on the training split; 'drop' avoids NaN evaluation metrics.
    als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId",
              ratingCol="rating", coldStartStrategy="drop")
    model = als.fit(training)

    # Score the hold-out split with RMSE.
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(model.transform(test))
    print("\nrmse: " + str(rmse))

    # Top-10 movie recommendations for every user.
    userRecs = model.recommendForAllUsers(10)
    userRecs.show()

    # Writing userRecs as CSV is disabled until fixed: the CSV data source
    # does not support the array<struct<movieId:int,rating:float>> column.
    # userRecs.repartition(1).write.mode("overwrite").option("header", True).option("delimiter", "\t").csv(f"{s3_output_data}/recommendations")

    # Top-10 recommendations for a small subset of users.
    # TODO: Just select user_id "42"
    users = ratings.select(als.getUserCol()).distinct().limit(3)
    userSubsetRecs = model.recommendForUserSubset(users, 10)
    userSubsetRecs.show(truncate=False)

    # Item-side recommendations (recommendForAllItems / recommendForItemSubset)
    # remain disabled, as in the original run.

    spark.stop()
class RecommendationEngine:
    """Anime recommendation engine built on a single Spark ALS model."""

    def __train_model(self):
        """Fit the ALS model on the currently loaded ratings."""
        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5, regParam=0.01, userCol="userId",
                       itemCol="movieId", ratingCol="rating",
                       coldStartStrategy="drop")
        self.model = self.als.fit(self.ratings)
        logger.info("ALS model built!")

    def get_ratings_for_movie_ids(self, userId, movieId):
        """Predict the rating ``userId`` would give ``movieId``; returns a
        pandas-style JSON string."""
        request = self.spark.createDataFrame([(userId, movieId)],
                                             ["userId", "movieId"])
        predictions = self.model.transform(request)
        return predictions.toPandas().to_json()

    def get_top_ratings(self, userId, animes_count):
        """Return up to ``animes_count`` recommendations for ``userId`` as a
        pandas-style JSON string (also cached on ``self.json_top``)."""
        candidates = self.ratings.select(self.als.getUserCol()).distinct()
        candidates = candidates.filter(candidates.userId == userId)
        recs = self.model.recommendForUserSubset(candidates, animes_count)
        self.json_top = recs.toPandas().to_json()
        return self.json_top

    def __init__(self, spark, dataset_path):
        """Load ratings.csv from ``dataset_path`` and train the model."""
        logger.info("Starting up the Recommendation Engine: ")
        self.spark = spark
        logger.info("Loading Ratings data...")
        self.ratings = spark.read.csv(
            os.path.join(dataset_path, 'ratings.csv'),
            header=True, inferSchema=True)
        # (anime.csv metadata loading is intentionally left out, as before.)
        self.__train_model()
class RecommendationEngine:
    """A movie recommendation engine.

    The full ratings set is split into three nested prefixes (1/3, 2/3, all),
    each backing its own ALS model (``model1``/``model2``/``model3``).  Every
    public method takes a ``model`` index in {0, 1, 2}; any other index
    yields ``None``, matching the original fall-through behaviour.
    """

    def _df_for(self, model):
        """Ratings DataFrame backing model index 0, 1 or 2 (else None)."""
        return {0: self.df0, 1: self.df1, 2: self.df2}.get(model)

    def _model_for(self, model):
        """Fitted ALS model for index 0, 1 or 2 (else None)."""
        return {0: self.model1, 1: self.model2, 2: self.model3}.get(model)

    def __train_all_model(self):
        """Split the dataset into three nested prefixes and train a model
        on each (was three copy-pasted training blocks)."""
        logger.info("Splitting dataset")
        self.df0 = self.df.limit(int(self.dataset_count / 3))
        self.df1 = self.df.limit(int(self.dataset_count * 2 / 3))
        self.df2 = self.df
        print('df 0 count = ' + str(self.df0.count()))
        print('df 1 count = ' + str(self.df1.count()))
        print('df 2 count = ' + str(self.df2.count()))
        logger.info("Dataset Splitted !")
        for idx in range(3):
            logger.info("Training the ALS model %d", idx + 1)
            self.__train_model(idx)
            logger.info("ALS model %d built!", idx + 1)

    def __train_model(self, model):
        """Train the ALS model for index ``model`` on its dataset."""
        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5, regParam=0.01, userCol="userId",
                       itemCol="movieId", ratingCol="rating",
                       coldStartStrategy="drop")
        if model == 0:
            self.model1 = self.als.fit(self.df0)
        elif model == 1:
            self.model2 = self.als.fit(self.df1)
        elif model == 2:
            self.model3 = self.als.fit(self.df2)
        logger.info("ALS model built!")

    def get_top_ratings(self, model, user_id, movies_count):
        """Recommends up to ``movies_count`` top unrated movies to
        ``user_id``; returns a pandas-style JSON string with movie titles."""
        if model not in (0, 1, 2):
            return None
        users = self._df_for(model).select(self.als.getUserCol())
        users = users.filter(users.userId == user_id)
        recs = self._model_for(model).recommendForUserSubset(
            users, movies_count)
        # Flatten the recommendations array into one row per movie.
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(
            func.col('userId'),
            func.col('recommendations')['movieId'].alias('movieId'),
            func.col('recommendations')['Rating'].alias('Rating')).\
            drop('recommendations')
        # Callers only need the movie ids/titles, not the predicted score.
        recs = recs.drop('Rating')
        recs = recs.join(self.moviesdf, ("movieId"), 'inner')
        return recs.toPandas().to_json()

    def get_top_movie_recommend(self, model, movie_id, user_count):
        """Recommends up to ``user_count`` users for ``movie_id``; returns a
        pandas-style JSON string with movie titles joined in."""
        if model not in (0, 1, 2):
            return None
        movies = self._df_for(model).select(self.als.getItemCol())
        movies = movies.filter(movies.movieId == movie_id)
        recs = self._model_for(model).recommendForItemSubset(
            movies, user_count)
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(
            func.col('movieId'),
            func.col('recommendations')['userId'].alias('userId'),
            func.col('recommendations')['Rating'].alias('Rating')).\
            drop('recommendations')
        recs = recs.drop('Rating')
        recs = recs.join(self.moviesdf, ("movieId"), 'inner')
        return recs.toPandas().to_json()

    def get_ratings_for_movie_ids(self, model, user_id, movie_id):
        """Given a user_id and a movie_id, predict the rating (list of Rows)."""
        if model not in (0, 1, 2):
            return None
        request = self.spark_session.createDataFrame(
            [(user_id, movie_id)], ["userId", "movieId"])
        return self._model_for(model).transform(request).collect()

    def add_ratings(self, model, user_id, movie_id, ratings_given):
        """Add one rating (user_id, rating, movie_id) to the selected
        dataset, retrain that model, and return the new row as JSON."""
        if model not in (0, 1, 2):
            return None
        new_ratings = self.spark_session.createDataFrame(
            [(user_id, ratings_given, movie_id)],
            ["userId", "rating", "movieId"])
        # Add new ratings to the existing ones, then re-train that model.
        if model == 0:
            self.df0 = self.df0.union(new_ratings)
        elif model == 1:
            self.df1 = self.df1.union(new_ratings)
        else:
            self.df2 = self.df2.union(new_ratings)
        self.__train_model(model)
        return new_ratings.toPandas().to_json()

    def get_history(self, model, user_id):
        """Get rating history for a user as a pandas-style JSON string."""
        if model not in (0, 1, 2):
            return None
        self._df_for(model).createOrReplaceTempView("ratingsdata")
        user_history = self.spark_session.sql(
            'SELECT userId, movieId, rating from ratingsdata where userId = "%s"'
            % user_id)
        user_history = user_history.join(self.moviesdf, ("movieId"), 'inner')
        return user_history.toPandas().to_json()

    def __init__(self, spark_session, dataset_path):
        """Init the recommendation engine given a Spark session and a
        dataset path with data_part_N.txt files plus movie_titles.csv."""
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session
        # Load ratings data for later use
        logger.info("Loading Ratings data...")
        file_counter = 0
        while True:
            file_name = 'data_part_' + str(file_counter) + '.txt'
            dataset_file_path = os.path.join(dataset_path, file_name)
            if not os.path.isfile(dataset_file_path):
                break
            df_new = spark_session.read.csv(dataset_file_path, header=None,
                                            inferSchema=True)
            self.df = df_new if file_counter == 0 else self.df.union(df_new)
            self.dataset_count = self.df.count()
            print('Data loaded = ' + str(self.dataset_count))
            print(file_name + 'Loaded !')
            file_counter += 1
        self.df = self.df.selectExpr("_c0 as userId", "_c1 as rating",
                                     "_c2 as movieId")
        self.df.show()
        # Load movie data for later use
        logger.info("Loading Movie data...")
        movies_file_path = os.path.join(dataset_path, 'movie_titles.csv')
        self.moviesdf = spark_session.read.csv(movies_file_path, header=None,
                                               inferSchema=True)
        self.moviesdf = self.moviesdf.selectExpr(
            "_c0 as movieId", "_c1 as Year", "_c2 as movie_title")
        # Train the model
        self.__train_all_model()
test = test.na.fill(0.0) # Evaluate the model by computing the RMSE on the test data predictions = model.transform(test) evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating_score", predictionCol="prediction") rmse = evaluator.evaluate(predictions) print("Root Mean Square Error = " + str(rmse)) # Generate top 10 movie recommendations for each user userRecs = model.recommendForAllUsers(10) # Generate top 10 movie recommendations for a specified set of users users = df_rec.select(als.getUserCol()).distinct().limit(3) users_subset = model.recommendForUserSubset(users, 10) #users_subset = users_subset.withColumn("rec_exp", explode("recommendations")).select('user_id', col("rec_exp.movie_id"), col("rec_exp.rating")) #users_subset.limit(10).show() #userRecs = userRecs.withColumn("rec_exp", explode("recommendations")).select('user_id', col("rec_exp.movie_id")) users_subset = users_subset.withColumn("rec_exp", explode("recommendations")).select( 'user_id', col("rec_exp.movie_id")) details(users_subset) #details(userRecs) print("^^^^^ Recommendations ^^^^^") spark.stop()
# NOTE(review): script fragment — `evaluator`, `predictions`, `indexedDf`
# and `ratings` are defined earlier in the file, above this chunk.
rmse = evaluator.evaluate(predictions)

# model: ALS over Amazon-style review data; coldStartStrategy="nan" keeps
# unseen user/item predictions as NaN instead of dropping the rows.
als = ALS(rank=8, maxIter=4, regParam=0.04, userCol="reviewerID",
          itemCol="asin", ratingCol="overall", coldStartStrategy="nan")
mymodel = als.fit(indexedDf)

userRecs = mymodel.recommendForAllUsers(10)
ProductRecs = mymodel.recommendForAllItems(10)
userRecs.show()
ProductRecs.show()

# Generate top 5 movie recommendations for a specified set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = mymodel.recommendForUserSubset(users, 5)

# Generate top 5 user recommendations for a specified set of products
products = ratings.select(als.getItemCol()).distinct().limit(3)
productSubSetRecs = mymodel.recommendForItemSubset(products, 5)

userSubsetRecs.show()
# Bug fix: was `ProductSubSetRecs.show()` — a NameError, since the variable
# assigned above is `productSubSetRecs` (lower-case p).
productSubSetRecs.show()

#pred_rdd = predictions.rdd
#pred_rdd.repartition(1).saveAsTextFile("preds")
rdd1 = userRecs.rdd
rdd2 = ProductRecs.rdd
class RecommendationEngine:
    """ Recommendation engine over user -> artist listening counts.

    Holds three cumulative datasets (batch0, batch0+1, batch0+1+2) in
    ``listening_count_df`` and one fitted ALS model per dataset in
    ``model``; public methods take a ``model_id`` index in {0, 1, 2}.
    """

    def __init__(self, spark_session, dataset_path):
        """Load the batch files and artist metadata, then train the models."""
        self.spark_session = spark_session
        logger.info("Starting up the Spark Session: {}".format(
            self.spark_session))

        # Load listening count data.  Each entry is cumulative: dataset i
        # is the union of batches 0..i (preserves the original behaviour).
        logger.info("Loading listening count dataset...")
        self.listening_count_df = []
        for i in range(0, 3):
            lc_file_path = os.path.join(dataset_path,
                                        'batch/batch' + str(i) + '.txt')
            new_df = spark_session.read.csv(lc_file_path, header=None,
                                            inferSchema=True).na.drop()
            new_df = new_df.selectExpr("_c0 as user_id", "_c1 as artist_id",
                                       "_c2 as weight")
            if self.listening_count_df:
                new_df = self.listening_count_df[i - 1].union(new_df)
            self.listening_count_df.append(new_df)
            logger.info("{} loaded".format('batch' + str(i) + '.txt'))
        logger.info("Loading listening count dataset done!")

        # Load artist data
        logger.info("Loading artist dataset...")
        artist_file_path = os.path.join(dataset_path, 'csv/artists.csv')
        self.artist_df = spark_session.read.csv(artist_file_path,
                                                header="true",
                                                inferSchema="true").na.drop()
        self.artist_df.createOrReplaceTempView("artists")
        self.artist_df_selected = self.spark_session.sql(
            "SELECT `id` as artist_id, `name`, `url` \
            FROM artists")
        logger.info("Loading artist dataset done...")

        # Train the model
        self.__train_model()

    @staticmethod
    def _collect_json(dataframe):
        """Collect a DataFrame into {'result': [row-dict, ...]} (extracted
        from three identical copies of this boilerplate)."""
        data = {'result': []}
        for row in dataframe.toJSON().collect():
            data['result'].append(json.loads(row))
        return data

    def __train_model(self):
        """Train one ALS model per cumulative listening-count dataset."""
        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5, regParam=0.01, userCol="user_id",
                       itemCol="artist_id", ratingCol="weight",
                       coldStartStrategy="drop")
        self.model = []
        for i in range(0, 3):
            self.model.append(self.als.fit(self.listening_count_df[i]))
            logger.info("Model {} done : {}".format(
                i, self.listening_count_df[i].count()))
        logger.info("ALS model built!")

    def get_top_ratings(self, model_id, user_id, num_of_books):
        """Recommend up to ``num_of_books`` artists to ``user_id``."""
        user = self.listening_count_df[model_id].select(self.als.getUserCol())
        user = user.filter(user.user_id == user_id)
        recs = self.model[model_id].recommendForUserSubset(user, num_of_books)
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(
            func.col('recommendations')['artist_id'].alias('artist_id'),
            func.col('recommendations')['Rating'].alias('Rating')).\
            drop('recommendations')
        # Callers only need artist metadata, not the predicted weight.
        recs = recs.drop('Rating')
        recs = recs.join(self.artist_df_selected, ('artist_id'), 'inner')
        return self._collect_json(recs)

    def get_top_music_recommend(self, model_id, artist_id, num_of_users):
        """Recommend up to ``num_of_users`` users for ``artist_id``."""
        artist = self.listening_count_df[model_id].select(
            self.als.getItemCol())
        artist = artist.filter(artist.artist_id == artist_id)
        recs = self.model[model_id].recommendForItemSubset(artist,
                                                           num_of_users)
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(
            func.col('recommendations')['user_id'].alias('user_id'),
            func.col('recommendations')['Rating'].alias('Rating')).\
            drop('recommendations')
        recs = recs.drop('Rating')
        return self._collect_json(recs)

    def get_listening_count_for_artist_ids(self, model_id, user_id,
                                           artist_id):
        """Predict the listening count of ``user_id`` for ``artist_id``."""
        request = self.spark_session.createDataFrame(
            [(user_id, artist_id)], ['user_id', 'artist_id'])
        weight = self.model[model_id].transform(request).collect()
        # NOTE(review): coldStartStrategy="drop" can leave `weight` empty,
        # making weight[0] raise IndexError — confirm callers guard for that.
        data = {}
        data['result'] = weight[0][2]
        return data

    def get_listening_count(self, model_id, user_id):
        """Get listening-count history for a user."""
        self.listening_count_df[model_id].createOrReplaceTempView(
            "listeningcount")
        user_history = self.spark_session.sql(
            'SELECT `artist_id`, `weight` from listeningcount \
            WHERE `user_id` = "%s"' % user_id)
        user_history = user_history.join(self.artist_df_selected,
                                         ('artist_id'), 'inner')
        return self._collect_json(user_history)
class RecommendationEngine:
    """A Yelp recommendation engine.

    Holds three independently trained ALS models (model1..model3), one per
    data shard (df0..df2, loaded from data_part_1..3.txt). Every public
    method takes a `model` index in {0, 1, 2} selecting the shard/model.
    """

    def _new_als(self):
        """Build an ALS estimator with the shared hyper-parameters."""
        return ALS(maxIter=5, regParam=0.01, userCol="userId",
                   itemCol="businessId", ratingCol="Stars",
                   coldStartStrategy="drop")

    def _dataset(self, model):
        """Return the ratings DataFrame for model index 0, 1 or 2."""
        return (self.df0, self.df1, self.df2)[model]

    def _model(self, model):
        """Return the fitted ALS model for model index 0, 1 or 2."""
        return (self.model1, self.model2, self.model3)[model]

    def __train_all_model(self):
        """Train the ALS models for all three data shards."""
        for idx, df in enumerate((self.df0, self.df1, self.df2), start=1):
            logger.info("Training the ALS model %d", idx)
            self.als = self._new_als()
            setattr(self, "model%d" % idx, self.als.fit(df))
            logger.info("ALS model %d built!", idx)

    def __train_model(self, model):
        """(Re)train the ALS model for the shard selected by `model` (0-2)."""
        logger.info("Training the ALS model...")
        self.als = self._new_als()
        if model == 0:
            self.model1 = self.als.fit(self.df0)
        elif model == 1:
            self.model2 = self.als.fit(self.df1)
        elif model == 2:
            self.model3 = self.als.fit(self.df2)
        logger.info("ALS model built!")

    def get_top_stars(self, model, userId, business_count):
        """Recommend up to business_count top unrated businesses to userId.

        Returns a JSON string with columns userId / businessId / rating.
        The three per-model branches of the original were identical, so
        they are collapsed into one code path.
        """
        df = self._dataset(model)
        users = df.select(self.als.getUserCol())
        users = users.filter(users.userId == userId)
        userSubsetRecs = self._model(model).recommendForUserSubset(
            users, business_count)
        # One row per (user, recommendation) pair instead of an array column.
        userSubsetRecs = userSubsetRecs.withColumn(
            "recommendations", explode("recommendations"))
        userSubsetRecs = userSubsetRecs.select(
            func.col('userId'),
            func.col('recommendations')['businessId'].alias('businessId'),
            func.col('recommendations')['rating'].alias('rating')).drop(
                'recommendations')
        return userSubsetRecs.toPandas().to_json()

    def get_top_business_recommend(self, model, businessId, user_count):
        """Recommend up to user_count top users for the given businessId.

        Returns a JSON string with columns businessId / userId / rating.

        Bug fixes vs. the original: the model==2 branch passed an undefined
        `business` variable (NameError), the model==1 branch joined the
        never-defined attribute `self.businesssdf` (AttributeError), and the
        model==0 branch aliased the score column 'stars' while the others
        used 'rating'. All three branches now share one consistent path.
        """
        df = self._dataset(model)
        business = df.select(self.als.getItemCol())
        business = business.filter(business.businessId == businessId)
        businessSubsetRecs = self._model(model).recommendForItemSubset(
            business, user_count)
        businessSubsetRecs = businessSubsetRecs.withColumn(
            "recommendations", explode("recommendations"))
        businessSubsetRecs = businessSubsetRecs.select(
            func.col('businessId'),
            func.col('recommendations')['userId'].alias('userId'),
            func.col('recommendations')['rating'].alias('rating')).drop(
                'recommendations')
        return businessSubsetRecs.toPandas().to_json()

    def get_stars_for_business_ids(self, model, userId, businessId):
        """Predict Stars for one (userId, businessId) pair.

        Returns the collected rows from the model's transform (each row has
        userId, businessId and a 'prediction' column).
        """
        request = self.spark_session.createDataFrame(
            [(userId, businessId)], ["userId", "businessId"])
        return self._model(model).transform(request).collect()

    def __init__(self, spark_session, dataset_path):
        """Init the recommendation engine given a Spark session and a
        dataset path containing data_part_1..3.txt.
        """
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session
        # Load Stars data for later use.
        logger.info("Loading Stars data...")
        # Each shard is a headerless CSV: userId, businessId, Stars.
        for idx, file_name in enumerate(
                ('data_part_1.txt', 'data_part_2.txt', 'data_part_3.txt')):
            dataset_file_path = os.path.join(dataset_path, file_name)
            if os.path.isfile(dataset_file_path):
                df = spark_session.read.csv(dataset_file_path, header=None,
                                            inferSchema=True)
                df = df.selectExpr("_c0 as userId", "_c1 as businessId",
                                   "_c2 as Stars")
                setattr(self, "df%d" % idx, df)
        # Train the models.
        self.__train_all_model()
class RecommendationEngine:
    """A movie recommendation engine backed by a Spark ALS model."""

    def __train_model(self):
        """Train the ALS model with the current dataset."""
        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5, regParam=0.01, userCol="userId",
                       itemCol="movieId", ratingCol="rating",
                       coldStartStrategy="drop")
        self.model = self.als.fit(self.ratings)
        logger.info("ALS model built!")

    def add_ratings(self, ratings):
        """Add additional movie ratings in the format
        (user_id, movie_id, rating).

        `ratings` is an iterable of (user_id, movie_id, rating) tuples.
        Re-trains the model and returns the added rows as a JSON string.

        Bug fix: the original referenced undefined names
        user_id/movie_id/rating instead of using the `ratings` argument.
        """
        new_ratings = self.spark.createDataFrame(
            list(ratings), ["userId", "movieId", "rating"])
        # Add new ratings to the existing ones.
        self.ratings = self.ratings.union(new_ratings)
        # Re-train the ALS model with the new ratings.
        self.__train_model()
        return new_ratings.toPandas().to_json()

    def get_ratings_for_movie_ids(self, user_id, movie_ids):
        """Given a user_id and a list of movie_ids, predict ratings for them.

        Bug fix: the original referenced a single undefined `movie_id`;
        one (user, movie) row is now built per requested movie id.
        """
        requested_movies = self.spark.createDataFrame(
            [(user_id, movie_id) for movie_id in movie_ids],
            ["userId", "movieId"])
        # Get predicted ratings.
        return self.model.transform(requested_movies).collect()

    def get_top_ratings(self, user_id, movies_count):
        """Recommends up to movies_count top unrated movies to user_id,
        returned as a JSON string.
        """
        users = self.ratings.select(self.als.getUserCol())
        users = users.filter(users.userId == user_id)
        top_ratings = self.model.recommendForUserSubset(users, movies_count)
        # NOTE(review): 'recommendations' is not exploded here, so movieId
        # comes back as an array of ids per user — confirm this is intended.
        top_ratings = top_ratings.select(
            F.col('userId'),
            F.col('recommendations')['movieId'].alias('movieId'))
        return top_ratings.toPandas().to_json()

    def __init__(self, spark, dataset_path):
        """Init the recommendation engine given a Spark session and a
        ratings CSV path.
        """
        logger.info("Starting up the Recommendation Engine: ")
        self.spark = spark
        # Load ratings data for later use.
        logger.info("Loading Ratings data...")
        self.ratings = self.spark.read.csv(dataset_path, header=True,
                                           inferSchema=True)
        # Remap the raw userId to a dense integer id via ROW_NUMBER so ALS
        # receives integer user ids.
        self.ratings.createOrReplaceTempView("movies")
        self.new_id = self.spark.sql("SELECT DISTINCT userId FROM movies")
        self.new_id.createOrReplaceTempView("newId")
        self.new_user_id = self.spark.sql(
            "SELECT userId, ROW_NUMBER() OVER (ORDER BY userId) AS user_id FROM newId"
        )
        self.ratings = self.ratings.join(
            self.new_user_id, self.ratings.userId == self.new_user_id.userId)
        self.ratings = self.ratings.select('user_id', 'movieId', 'rating')
        self.ratings = self.ratings.withColumnRenamed('user_id', 'userId')
        # Train the model.
        self.__train_model()
def CollaborativeFiltering(spark, sampleDataPath):
    """Train and evaluate an ALS collaborative-filtering model on the
    ratings CSV at sampleDataPath, then print recommendations and run a
    cross-validated evaluation.
    """
    ratingSamples = spark.read.format('csv').option('header', 'true').load(sampleDataPath) \
        .withColumn("userIdInt", F.col("userId").cast(IntegerType())) \
        .withColumn("movieIdInt", F.col("movieId").cast(IntegerType())) \
        .withColumn("ratingFloat", F.col("rating").cast(FloatType()))
    # Randomly split the samples 0.8 : 0.2 into training and test sets.
    training_data, test_data = ratingSamples.randomSplit((0.8, 0.2))
    # Build the matrix-factorization model on the training set.
    '''Parameter notes:
    regParam: L2 regularization coefficient (lambda)
    maxIter: number of alternating iterations computing the user/item latent factors
    userCol: name of the user column in the DataFrame
    itemCol: name of the item column in the DataFrame
    ratingCol: name of the rating column in the DataFrame
    coldStartStrategy: set to 'drop' so that predicting a user or item unseen
    during training drops the row instead of returning NaN
    '''
    als = ALS(regParam=0.01, maxIter=5, userCol='userIdInt',
              itemCol='movieIdInt', ratingCol='ratingFloat',
              coldStartStrategy='drop')
    # Train the model.
    model = als.fit(training_data)
    # Evaluate the model by computing RMSE (root mean squared error) on the
    # test set.
    predictions = model.transform(test_data)
    # Show the ALS item and user latent-factor vectors; these can be used as
    # item embeddings and user embeddings.
    model.itemFactors.show(10, truncate=False)
    model.userFactors.show(10, truncate=False)
    # Evaluate with Spark's regression evaluator, metricName 'rmse'.
    evaluator = RegressionEvaluator(predictionCol="prediction",
                                    labelCol="ratingFloat",
                                    metricName='rmse')
    rmse = evaluator.evaluate(predictions)
    # Print the result.
    print("RMSE = {}".format(rmse))
    # Generate a top-10 movie recommendation list for every user.
    recListForUser = model.recommendForAllUsers(10)
    # Generate a top-10 user recommendation list for every movie.
    recListForMovie = model.recommendForAllItems(10)
    # Top-10 movie recommendations for each user in a given user subset.
    userSubset = ratingSamples.select(als.getUserCol()).distinct().limit(3)
    recListForUserSubset = model.recommendForUserSubset(userSubset, 10)
    # Top-10 user recommendations for each movie in a given movie subset.
    movieSubset = ratingSamples.select(als.getItemCol()).distinct().limit(3)
    recListForMovieSubset = model.recommendForItemSubset(movieSubset, 10)
    # Show the recommendation results.
    recListForUser.show(5, truncate=False)
    recListForMovie.show(5, truncate=False)
    recListForUserSubset.show(5, truncate=False)
    recListForMovieSubset.show(5, truncate=False)
    paramGrid = ParamGridBuilder().addGrid(als.regParam, [0.01]).build()
    # k-fold cross-validation for offline evaluation: split the data into k
    # equal subsets, use each in turn as the validation set with the rest as
    # training data, and average the k metrics (k is typically 10).
    cv = CrossValidator(estimator=als, estimatorParamMaps=paramGrid,
                        evaluator=evaluator, numFolds=10)
    # NOTE(review): cross-validation is fitted on the 20% test split only,
    # while the comment above describes CV over the full sample set —
    # confirm whether `test_data` should be `ratingSamples`/`training_data`.
    cvModel = cv.fit(test_data)
    avgMetrics = cvModel.avgMetrics
# Evaluate the fitted ALS model. `model`, `test`, `als` and
# `user_artist_data_df` are defined earlier in this script (above this
# excerpt).
prediction = model.transform(test)
# RMSE of predicted vs. observed values in the 'count' column (play counts).
evaluator = RegressionEvaluator(metricName="rmse", labelCol="count",
                                predictionCol="prediction")
rmse = evaluator.evaluate(prediction)
print("Root-mean-square error = " + str(rmse))
# Top-10 artist recommendations per user, and top-10 user recommendations
# per artist, over the full user/item sets.
userRecs = model.recommendForAllUsers(10)
artistRecs = model.recommendForAllItems(10)
# Top-10 artist recommendations for a sample of 3 users.
users = user_artist_data_df.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
# Top-10 user recommendations for a sample of 3 artists.
artist = user_artist_data_df.select(als.getItemCol()).distinct().limit(3)
artistSubSetRecs = model.recommendForItemSubset(artist, 10)
print("Top 10 Recomendation : ")
userRecs.show(10)
print("============================")
artistRecs.show(10)
print("============================")
userSubsetRecs.show(10)
print("============================")