Example #1
0
def main():
    """Train an ALS recommender on the MovieLens ratings file, report the
    test-set RMSE, and print several kinds of top-10 recommendations.

    Relies on module-level imports (SparkSession, ALS, RegressionEvaluator)
    and an ``extract`` helper that turns a split CSV row into a rating
    record, returning None for rows that should be skipped.
    """
    spark = SparkSession \
        .builder \
        .appName("ALSExample") \
        .getOrCreate()

    # $example on$
    # Read the raw CSV as text and split manually; extract() is expected to
    # drop unparseable rows (e.g. the header) by returning None.
    lines = spark.read.text(
        "/Users/asapehrsson/dev/learn/hadoop_spark_jupyter/data/ml-latest-small/ratings.csv"
    ).rdd
    parts = lines.map(lambda row: row.value.split(","))
    ratingsRDD = parts.map(extract).filter(lambda x: x is not None)

    ratings = spark.createDataFrame(ratingsRDD)
    (training, test) = ratings.randomSplit([0.8, 0.2])

    # Build the recommendation model using ALS on the training data.
    # coldStartStrategy='drop' ensures we don't get NaN evaluation metrics
    # for users/items unseen during training.
    als = ALS(maxIter=5,
              regParam=0.01,
              userCol="userId",
              itemCol="movieId",
              ratingCol="rating",
              coldStartStrategy="drop")
    model = als.fit(training)

    # Evaluate the model by computing the RMSE on the test data.
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("Root-mean-square error = " + str(rmse))

    # Generate top 10 movie recommendations for each user.
    userRecs = model.recommendForAllUsers(10)
    # Generate top 10 user recommendations for each movie.
    movieRecs = model.recommendForAllItems(10)

    # Generate top 10 movie recommendations for a specified set of users.
    users = ratings.select(als.getUserCol()).distinct().limit(3)
    userSubsetRecs = model.recommendForUserSubset(users, 10)
    # Generate top 10 user recommendations for a specified set of movies.
    movies = ratings.select(als.getItemCol()).distinct().limit(3)
    movieSubSetRecs = model.recommendForItemSubset(movies, 10)
    # $example off$
    userRecs.show()
    movieRecs.show()
    userSubsetRecs.show()
    movieSubSetRecs.show()

    # Removed: an unused `p = movieRecs.toPandas()` that collected the whole
    # recommendation DataFrame to the driver for nothing.
    spark.stop()
Example #2
0
File: my_als.py  Project: happy-lu/spark
def run_spark_als(file_path):
    """Load event data, derive implicit ratings, grid-search ALS settings,
    and print recommendations.

    NOTE(review): ``file_path`` is currently unused, and ``read_data`` /
    ``sql_context`` are expected to exist at module level — confirm against
    the caller.
    """
    read_data(sql_context)

    # Map behavioural events to a numeric "rate": view=1, addtocart=5,
    # transaction=10, anything else 0.
    als_data_frame = sql_context.sql("""
        select  visitorid,itemid, case when event = 'view' then 1
                                when event = 'addtocart' then 5
                                when event = 'transaction' then 10
                                else 0 end as rate from event_table
       """)
    print(als_data_frame.count())
    als_data_frame.show()
    (training, test) = als_data_frame.randomSplit([0.7, 0.3])

    # Grid search over maxIter / regParam (both ranges are currently one
    # element wide, so a single model is trained).
    # coldStartStrategy='drop' avoids NaN metrics for unseen users/items.
    reg_params = [0.3]
    for iter_idx in range(1):
        for reg_param in reg_params:
            als = ALS(maxIter=iter_idx + 1,
                      regParam=reg_param,
                      implicitPrefs=False,
                      userCol="visitorid",
                      itemCol="itemid",
                      ratingCol="rate",
                      coldStartStrategy="drop")
            model = als.fit(training)

            # Evaluate the model by computing the RMSE on the test data.
            predictions = model.transform(test)
            evaluator = RegressionEvaluator(metricName="rmse",
                                            labelCol="rate",
                                            predictionCol="prediction")
            rmse = evaluator.evaluate(predictions)
            # Bug fix: the original printed an unrelated `base_reg` counter
            # (0.01, 0.11, ...) while always training with regParam=0.3,
            # and printed the 0-based loop index instead of the maxIter
            # actually used. Report the real hyper-parameters.
            print("iterNum: %s, regParam: %s, Root-mean-square error = %s" %
                  (iter_idx + 1, reg_param, str(rmse)))

    model.itemFactors.show()
    model.userFactors.show()

    # Generate top 10 movie recommendations for each user.
    userRecs = model.recommendForAllUsers(10)
    # Generate top 10 user recommendations for each movie.
    movieRecs = model.recommendForAllItems(10)

    # Generate top 10 movie recommendations for a specified set of users.
    users = als_data_frame.select(als.getUserCol()).distinct().limit(3)
    userSubsetRecs = model.recommendForUserSubset(users, 10)
    # Generate top 10 user recommendations for a specified set of movies.
    movies = als_data_frame.select(als.getItemCol()).distinct().limit(3)
    movieSubSetRecs = model.recommendForItemSubset(movies, 10)
    # $example off$
    userRecs.show(20, False)
    movieRecs.show(20, False)
    userSubsetRecs.show(20, False)
    movieSubSetRecs.show(20, False)
Example #3
0
class Recommendation:
    """Wraps an ALS model trained on a ratings DataFrame and exposes
    recommendation helpers.

    NOTE(review): data loading is still a TODO — ``self.ratings`` must be
    populated before the constructor body below can run.
    """

    def __init__(self, spark, filename):
        # TO DO: read the ratings data, e.g.
        # self.ratings = spark.read.csv(filename, header=True, inferSchema=True)
        #
        # Bug fix: the original split an undefined local `ratings`; the
        # other methods all read `self.ratings`, so use that consistently.
        (training, test) = self.ratings.randomSplit([0.8, 0.2])

        # Build the recommendation model using ALS on the training data.
        # coldStartStrategy='drop' ensures we don't get NaN evaluation
        # metrics for users/items unseen during training.
        self.als = ALS(maxIter=5, regParam=0.01, userCol="userId",
                       itemCol="movieId", ratingCol="rating",
                       coldStartStrategy="drop")
        self.model = self.als.fit(training)

        # Evaluate the model by computing the RMSE on the test data.
        # Bug fix: the original referenced an undefined local `model`.
        predictions = self.model.transform(test)
        evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                        predictionCol="prediction")
        rmse = evaluator.evaluate(predictions)
        logger.info("Root-mean-square error = " + str(rmse))

    def recommend_for_users(self, num_movies):
        """Top ``num_movies`` movie recommendations for every user."""
        return self.model.recommendForAllUsers(num_movies)

    def recommend_for_movies(self, num_recommendations):
        """Top ``num_recommendations`` user recommendations for every movie."""
        return self.model.recommendForAllItems(num_recommendations)

    def recommend_for_setusers(self, num_users):
        """Top ``num_users`` movie recommendations for 3 arbitrary users."""
        users = self.ratings.select(self.als.getUserCol()).distinct().limit(3)
        return self.model.recommendForUserSubset(users, num_users)

    def recommend_for_setmovies(self, num_movies):
        """Top ``num_movies`` user recommendations for 3 arbitrary movies."""
        movies = self.ratings.select(self.als.getItemCol()).distinct().limit(3)
        return self.model.recommendForItemSubset(movies, num_movies)
Example #4
0
          checkpointInterval=1000,
          intermediateStorageLevel='MEMORY_AND_DISK',
          finalStorageLevel='MEMORY_AND_DISK',
          coldStartStrategy='drop')

model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
# print("Root-mean-square error = " + str(rmse))

# Restrict to the single item `baoming` and ask the model for its top
# `num_user` user recommendations.
# NOTE(review): `parts`, `baoming`, `num_user`, `p_user`, `sqlContext`,
# `F` and `Window` are defined elsewhere in this script — verify upstream.
item_rec = parts.select(als.getItemCol()).distinct()
item_rec = item_rec.where(item_rec.itemId == baoming)
item_rec = model.recommendForItemSubset(item_rec, num_user)

# Pull the single `recommendations` array back to the driver and rebuild a
# DataFrame with one row per recommended user.
data = item_rec.select("recommendations")
data_list = data.rdd.map(lambda x: x[0]).take(1)
data = [data_list[0][i] for i in range(len(data_list[0]))]
data1 = sqlContext.createDataFrame(data)

# Rename to avoid an ambiguous `userId` column in the join below.
p_user = p_user.withColumnRenamed("userId", "userId_1")

# Join recommendations onto user attributes, keep user/rating, and sort by
# predicted rating descending.
data2 = data1.join(p_user, data1["userId"] == p_user["userId_1"])
data3 = data2.select("user1", "rating")
data3 = data3.sort("rating", ascending=False)

# Row number ordered by `user1`; note this Window has no partitioning, so
# all rows go through a single partition.
data3 = data3.withColumn("rn", F.row_number().over(Window.orderBy("user1")))
Example #5
0
class RecommendationEngine:
    """A movie recommendation engine

    Wraps a single ALS model trained on the full ratings DataFrame and
    exposes JSON-producing recommendation and query helpers.
    """
    def __train_model(self):
        """Train the ALS model with the current dataset
        """
        logger.info("Training the ALS model...")
        # coldStartStrategy='drop' avoids NaN predictions for users/items
        # unseen during training.
        self.als = ALS(maxIter=5,
                       regParam=0.01,
                       userCol="userId",
                       itemCol="movieId",
                       ratingCol="rating",
                       coldStartStrategy="drop")
        self.model = self.als.fit(self.ratingsdf)
        logger.info("ALS model built!")

    def get_top_ratings(self, user_id, movies_count):
        """Recommends up to movies_count top movies to user_id, returned
        as a pandas-serialized JSON string.
        """
        users = self.ratingsdf.select(self.als.getUserCol())
        users = users.filter(users.userId == user_id)
        userSubsetRecs = self.model.recommendForUserSubset(users, movies_count)
        # Explode the recommendations array: one row per recommended movie.
        userSubsetRecs = userSubsetRecs.withColumn("recommendations",
                                                   explode("recommendations"))
        userSubsetRecs = userSubsetRecs.select(func.col('userId'),
                                               func.col('recommendations')['movieId'].alias('movieId'),
                                               func.col('recommendations')['Rating'].alias('Rating')).\
                                                                                    drop('recommendations')
        # The predicted score is not part of the API response.
        userSubsetRecs = userSubsetRecs.drop('Rating')
        # Attach movie titles.
        userSubsetRecs = userSubsetRecs.join(self.moviesdf, ("movieId"),
                                             'inner')
        # userSubsetRecs.show()
        # userSubsetRecs.printSchema()
        userSubsetRecs = userSubsetRecs.toPandas()
        userSubsetRecs = userSubsetRecs.to_json()
        return userSubsetRecs

    def get_top_movie_recommend(self, movie_id, user_count):
        """Recommends up to user_count top users for movie_id, returned
        as a pandas-serialized JSON string.
        """
        movies = self.ratingsdf.select(self.als.getItemCol())
        movies = movies.filter(movies.movieId == movie_id)
        movieSubsetRecs = self.model.recommendForItemSubset(movies, user_count)
        # Explode the recommendations array: one row per recommended user.
        movieSubsetRecs = movieSubsetRecs.withColumn(
            "recommendations", explode("recommendations"))
        movieSubsetRecs = movieSubsetRecs.select(func.col('movieId'),
                                                 func.col('recommendations')['userId'].alias('userId'),
                                                 func.col('recommendations')['Rating'].alias('Rating')).\
                                                                                        drop('recommendations')
        # The predicted score is not part of the API response.
        movieSubsetRecs = movieSubsetRecs.drop('Rating')
        movieSubsetRecs = movieSubsetRecs.join(self.moviesdf, ("movieId"),
                                               'inner')
        # userSubsetRecs.show()
        # userSubsetRecs.printSchema()
        movieSubsetRecs = movieSubsetRecs.toPandas()
        movieSubsetRecs = movieSubsetRecs.to_json()
        return movieSubsetRecs

    def get_ratings_for_movie_ids(self, user_id, movie_id):
        """Given a user_id and a list of movie_ids, predict ratings for them
        """
        # NOTE(review): despite the docstring, a single movie_id is placed
        # in the request DataFrame here — confirm intended usage.
        request = self.spark_session.createDataFrame([(user_id, movie_id)],
                                                     ["userId", "movieId"])
        ratings = self.model.transform(request).collect()
        return ratings

    def add_ratings(self, user_id, movie_id, ratings_given):
        """Add additional movie ratings in the format (user_id, movie_id, rating)
        and retrain the model; returns the new row as JSON.
        """
        # Convert ratings to a single-row DataFrame.
        new_ratings = self.spark_session.createDataFrame(
            [(user_id, movie_id, ratings_given)],
            ["userId", "movieId", "rating"])
        # Add new ratings to the existing ones
        self.ratingsdf = self.ratingsdf.union(new_ratings)
        # Re-train the ALS model with the new ratings
        self.__train_model()
        new_ratings = new_ratings.toPandas()
        new_ratings = new_ratings.to_json()
        return new_ratings

    def get_history(self, user_id):
        """Get rating history for a user, joined with movie titles, as a
        pandas-serialized JSON string.
        """
        self.ratingsdf.createOrReplaceTempView("ratingsdata")
        user_history = self.spark_session.sql(
            'SELECT userId, movieId, rating from ratingsdata where userId = "%s"'
            % user_id)
        user_history = user_history.join(self.moviesdf, ("movieId"), 'inner')
        user_history = user_history.toPandas()
        user_history = user_history.to_json()
        return user_history

    def __init__(self, spark_session, dataset_path):
        """Init the recommendation engine given a Spark context and a dataset path
        """
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session
        # Load ratings data for later use; drop rows with nulls and the
        # unused timestamp column.
        logger.info("Loading Ratings data...")
        ratings_file_path = os.path.join(dataset_path,
                                         '../../datasets/ratings.csv')
        self.ratingsdf = spark_session.read.csv(ratings_file_path,
                                                header=True,
                                                inferSchema=True).na.drop()
        self.ratingsdf = self.ratingsdf.drop("timestamp")
        # Load movies data for later use; genres are not needed.
        logger.info("Loading Movies data...")
        movies_file_path = os.path.join(dataset_path,
                                        '../../datasets/movies.csv')
        self.moviesdf = spark_session.read.csv(movies_file_path,
                                               header=True,
                                               inferSchema=True).na.drop()
        self.moviesdf = self.moviesdf.drop("genres")
        # Train the model
        self.__train_model()
Example #6
0
File: engine.py  Project: darke-f/BigData
class RecommendationEngine:
    """A movie recommendation engine

    Keeps three ALS models, each trained on a progressively larger slice
    of the ratings dataset (first 1/3, first 2/3, full set), selectable by
    a 0-based ``model`` index on every public method.  The original
    implementation duplicated each method body three times (one per model
    index); this version dispatches through small private helpers instead.
    """

    # Attribute names of the dataset slices / fitted models, indexed by the
    # public ``model`` parameter (0, 1 or 2).
    _DF_ATTRS = ("df0", "df1", "df2")
    _MODEL_ATTRS = ("model1", "model2", "model3")

    def _df_for(self, model):
        """Return the dataset slice DataFrame for model index 0-2."""
        return getattr(self, self._DF_ATTRS[model])

    def _model_for(self, model):
        """Return the fitted ALS model for model index 0-2."""
        return getattr(self, self._MODEL_ATTRS[model])

    def _fit(self, model):
        """Build a fresh ALS estimator and fit it on slice ``model`` (0-2),
        storing the result on the matching model attribute."""
        # coldStartStrategy='drop' avoids NaN predictions for users/items
        # unseen during training.
        self.als = ALS(maxIter=5,
                       regParam=0.01,
                       userCol="userId",
                       itemCol="movieId",
                       ratingCol="rating",
                       coldStartStrategy="drop")
        setattr(self, self._MODEL_ATTRS[model],
                self.als.fit(self._df_for(model)))

    def __train_all_model(self):
        """Split the dataset into three nested slices and train one ALS
        model per slice.
        """
        logger.info("Splitting dataset")

        self.df0 = self.df.limit(int(self.dataset_count / 3))
        self.df1 = self.df.limit(int(self.dataset_count * 2 / 3))
        self.df2 = self.df

        print('df 0 count = ' + str(self.df0.count()))
        print('df 1 count = ' + str(self.df1.count()))
        print('df 2 count = ' + str(self.df2.count()))
        logger.info("Dataset Splitted !")

        for m in range(3):
            logger.info("Training the ALS model %d" % (m + 1))
            self._fit(m)
            logger.info("ALS model %d built!" % (m + 1))

    def __train_model(self, model):
        """(Re)train the ALS model for slice ``model`` (0-2).  Indices
        outside that range train nothing, matching the original behaviour.
        """
        logger.info("Training the ALS model...")
        if model in (0, 1, 2):
            self._fit(model)
        logger.info("ALS model built!")

    def get_top_ratings(self, model, user_id, movies_count):
        """Recommend up to ``movies_count`` top movies for ``user_id``
        using model ``model``; returns a pandas-serialized JSON string,
        or None for an unknown model index.
        """
        if model not in (0, 1, 2):
            return None
        users = self._df_for(model).select(self.als.getUserCol())
        users = users.filter(users.userId == user_id)
        recs = self._model_for(model).recommendForUserSubset(
            users, movies_count)
        # Explode the recommendations array: one row per recommended movie.
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(func.col('userId'),
                           func.col('recommendations')['movieId'].alias('movieId'),
                           func.col('recommendations')['Rating'].alias('Rating'))
        # The predicted score is not part of the API response.
        recs = recs.drop('Rating')
        # Attach movie titles.
        recs = recs.join(self.moviesdf, ("movieId"), 'inner')
        return recs.toPandas().to_json()

    def get_top_movie_recommend(self, model, movie_id, user_count):
        """Recommend up to ``user_count`` top users for ``movie_id`` using
        model ``model``; returns a pandas-serialized JSON string, or None
        for an unknown model index.
        """
        if model not in (0, 1, 2):
            return None
        movies = self._df_for(model).select(self.als.getItemCol())
        movies = movies.filter(movies.movieId == movie_id)
        recs = self._model_for(model).recommendForItemSubset(
            movies, user_count)
        # Explode the recommendations array: one row per recommended user.
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(func.col('movieId'),
                           func.col('recommendations')['userId'].alias('userId'),
                           func.col('recommendations')['Rating'].alias('Rating'))
        # The predicted score is not part of the API response.
        recs = recs.drop('Rating')
        recs = recs.join(self.moviesdf, ("movieId"), 'inner')
        return recs.toPandas().to_json()

    def get_ratings_for_movie_ids(self, model, user_id, movie_id):
        """Predict the rating ``user_id`` would give ``movie_id`` using
        model ``model``; returns the collected prediction Rows, or None
        for an unknown model index.
        """
        if model not in (0, 1, 2):
            return None
        request = self.spark_session.createDataFrame([(user_id, movie_id)],
                                                     ["userId", "movieId"])
        return self._model_for(model).transform(request).collect()

    def add_ratings(self, model, user_id, movie_id, ratings_given):
        """Add one movie rating in the format (user_id, rating, movie_id)
        to slice ``model``, retrain that model, and return the new row as
        JSON, or None for an unknown model index.
        """
        if model not in (0, 1, 2):
            return None
        # Convert the rating to a single-row DataFrame.
        new_ratings = self.spark_session.createDataFrame(
            [(user_id, ratings_given, movie_id)],
            ["userId", "rating", "movieId"])
        # Add new ratings to the existing ones
        attr = self._DF_ATTRS[model]
        setattr(self, attr, getattr(self, attr).union(new_ratings))
        # Re-train the ALS model with the new ratings
        self.__train_model(model)
        return new_ratings.toPandas().to_json()

    def get_history(self, model, user_id):
        """Return ``user_id``'s rating history from slice ``model`` joined
        with movie titles, as a pandas-serialized JSON string, or None for
        an unknown model index.
        """
        if model not in (0, 1, 2):
            return None
        self._df_for(model).createOrReplaceTempView("ratingsdata")
        user_history = self.spark_session.sql(
            'SELECT userId, movieId, rating from ratingsdata where userId = "%s"'
            % user_id)
        user_history = user_history.join(self.moviesdf, ("movieId"), 'inner')
        return user_history.toPandas().to_json()

    def __init__(self, spark_session, dataset_path):
        """Init the recommendation engine given a Spark session and a
        dataset path containing data_part_<n>.txt files and
        movie_titles.csv.
        """
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session
        # Load ratings data for later use
        logger.info("Loading Ratings data...")

        # Concatenate every consecutive data_part_<n>.txt file found.
        # NOTE(review): if data_part_0.txt is missing, self.df is never set
        # and the selectExpr below raises AttributeError — confirm the
        # deployment always ships at least one part file.
        file_counter = 0
        while True:
            file_name = 'data_part_' + str(file_counter) + '.txt'
            dataset_file_path = os.path.join(dataset_path, file_name)
            if not os.path.isfile(dataset_file_path):
                break
            df_new = spark_session.read.csv(dataset_file_path,
                                            header=None,
                                            inferSchema=True)
            self.df = df_new if file_counter == 0 else self.df.union(df_new)
            self.dataset_count = self.df.count()
            print('Data loaded = ' + str(self.dataset_count))
            print(file_name + 'Loaded !')
            file_counter += 1
        self.df = self.df.selectExpr("_c0 as userId", "_c1 as rating",
                                     "_c2 as movieId")
        self.df.show()

        # Load movie data for later use
        logger.info("Loading Movie data...")
        movies_file_path = os.path.join(dataset_path, 'movie_titles.csv')
        self.moviesdf = spark_session.read.csv(movies_file_path,
                                               header=None,
                                               inferSchema=True)
        self.moviesdf = self.moviesdf.selectExpr("_c0 as movieId",
                                                 "_c1 as Year",
                                                 "_c2 as movie_title")
        # Train the model
        self.__train_all_model()
# Report the winning rank and keep the corresponding model.
# Bug fix: the original used the Python 2 print statement, which is a
# SyntaxError under Python 3 (the rest of this file uses print()).
print('The best model was trained with rank %s' % ranks[best_rank])
my_model = models[best_rank]

# COMMAND ----------

# TEST
Test.assertEquals(
    round(min_error, 2), 0.81,
    "Unexpected value for best RMSE. Expected rounded value to be 0.81. Got {0}"
    .format(round(min_error, 2)))
Test.assertEquals(
    ranks[best_rank], 12,
    "Unexpected value for best rank. Expected 12. Got {0}".format(
        ranks[best_rank]))
Test.assertEqualsHashed(
    als.getItemCol(), "18f0e2357f8829fe809b2d95bc1753000dd925a6",
    "Incorrect choice of {0} for ALS item column.".format(als.getItemCol()))
Test.assertEqualsHashed(
    als.getUserCol(), "db36668fa9a19fde5c9676518f9e86c17cabf65a",
    "Incorrect choice of {0} for ALS user column.".format(als.getUserCol()))
Test.assertEqualsHashed(
    als.getRatingCol(), "3c2d687ef032e625aa4a2b1cfca9751d2080322c",
    "Incorrect choice of {0} for ALS rating column.".format(
        als.getRatingCol()))

# COMMAND ----------

# MAGIC %md
# MAGIC ### (2c) Testing Your Model
# MAGIC
# MAGIC So far, we used the `training_df` and `validation_df` datasets to select the best model.  Since we used these two datasets to determine what model is best, we cannot use them to test how good the model is; otherwise, we would be very vulnerable to [overfitting](https://en.wikipedia.org/wiki/Overfitting).  To decide how good our model is, we need to use the `test_df` dataset.  We will use the `best_rank` you determined in part (2b) to create a model for predicting the ratings for the test dataset and then we will compute the RMSE.
    (training, test) = ratings.randomSplit([0.8, 0.2])

    # Build the recommendation model using ALS on the training data
    # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
    # Create ALS model
    als = ALS(maxIter=5,
              regParam=0.01,
              userCol="invoice_num",
              itemCol="product_code",
              ratingCol="rating",
              coldStartStrategy="drop")
    model = als.fit(training)
    model.write().overwrite().save("models/recommendModel")
    invoiceRecs = model.recommendForAllUsers(10)
    invoiceRecs.show(truncate=False)
    print(als.getItemCol())
    # Evaluate the model by computing the RMSE on the test data
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("Root-mean-square error = " + str(rmse))
"""
    # Generate top 10 movie recommendations for each user
    invoiceRecs = model.recommendForAllUsers(10)

    # Generate top 10 user recommendations for each movie
    productRecs = model.recommendForAllItems(10)

    # Generate top 10 movie recommendations for 3 users
class RecommendationEngine:
    """A movie recommendation engine backed by a Spark ALS model.

    Loads MovieLens-style ``ratings.csv`` and ``movies.csv`` from a dataset
    directory, trains an ALS collaborative-filtering model, and serves
    user/movie recommendations and rating predictions as JSON strings.
    """
    def __train_model(self):
        """Fit an ALS model on the current ratings DataFrame."""
        logger.info("Training the ALS model...")
        als_params = dict(maxIter=5,
                          regParam=0.01,
                          userCol="userId",
                          itemCol="movieId",
                          ratingCol="rating",
                          coldStartStrategy="drop")
        self.als = ALS(**als_params)
        self.model = self.als.fit(self.ratingsdf)
        logger.info("ALS model built!")

    def get_top_ratings(self, user_id, movies_count):
        """Return up to *movies_count* movie recommendations for *user_id* as JSON."""
        user_rows = self.ratingsdf.select(self.als.getUserCol())
        user_rows = user_rows.filter(user_rows.userId == user_id)
        recs = self.model.recommendForUserSubset(user_rows, movies_count)
        # One row per recommendation instead of one array per user.
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(
            func.col('userId'),
            func.col('recommendations')['movieId'].alias('movieId'),
            func.col('recommendations')['Rating'].alias('Rating'),
        ).drop('recommendations').drop('Rating')
        # Attach movie titles before serializing.
        recs = recs.join(self.moviesdf, ("movieId"), 'inner')
        return recs.toPandas().to_json()

    def get_top_movie_recommend(self, movie_id, user_count):
        """Return up to *user_count* recommended users for *movie_id* as JSON."""
        movie_rows = self.ratingsdf.select(self.als.getItemCol())
        movie_rows = movie_rows.filter(movie_rows.movieId == movie_id)
        recs = self.model.recommendForItemSubset(movie_rows, user_count)
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(
            func.col('movieId'),
            func.col('recommendations')['userId'].alias('userId'),
            func.col('recommendations')['Rating'].alias('Rating'),
        ).drop('recommendations').drop('Rating')
        recs = recs.join(self.moviesdf, ("movieId"), 'inner')
        return recs.toPandas().to_json()

    def get_movie_rating(self, user_id, movie_id):
        """Predict the rating *user_id* would give *movie_id* (list of Rows)."""
        request = self.spark_session.createDataFrame([(user_id, movie_id)],
                                                     ["userId", "movieId"])
        return self.model.transform(request).collect()

    def __init__(self, spark_session, dataset_path):
        """Init the recommendation engine given a Spark context and a dataset path"""
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session
        # Ratings: drop rows with nulls and the unused timestamp column.
        logger.info("Loading Ratings data...")
        ratings_file_path = os.path.join(dataset_path, 'ratings.csv')
        self.ratingsdf = (spark_session.read
                          .csv(ratings_file_path, header=True, inferSchema=True)
                          .na.drop()
                          .drop("timestamp"))
        # Movies: only id/title are needed for joining onto recommendations.
        logger.info("Loading Movies data...")
        movies_file_path = os.path.join(dataset_path, 'movies.csv')
        self.moviesdf = (spark_session.read
                         .csv(movies_file_path, header=True, inferSchema=True)
                         .na.drop()
                         .drop("genres"))
        self.__train_model()
  print 'For rank %s the RMSE is %s' % (rank, error)
  if error < min_error:
    min_error = error
    best_rank = err
  err += 1

# Fix the estimator's rank to the winner of the grid search above.
# NOTE(review): `best_rank` appears to hold an *index* into `ranks`
# (it is indexed as ranks[best_rank] below) — confirm against the loop
# that sets it, which is outside this fragment.
als.setRank(ranks[best_rank])
# NOTE(review): Python 2 print statement — this lab code predates Python 3.
print 'The best model was trained with rank %s' % ranks[best_rank]
my_model = models[best_rank]

# COMMAND ----------

# TEST
# Databricks lab self-checks: `Test` is the course's grading harness.
# The hashed assertions verify the column-name choices without revealing them.
Test.assertEquals(round(min_error, 2), 0.81, "Unexpected value for best RMSE. Expected rounded value to be 0.81. Got {0}".format(round(min_error, 2)))
Test.assertEquals(ranks[best_rank], 12, "Unexpected value for best rank. Expected 12. Got {0}".format(ranks[best_rank]))
Test.assertEqualsHashed(als.getItemCol(), "18f0e2357f8829fe809b2d95bc1753000dd925a6", "Incorrect choice of {0} for ALS item column.".format(als.getItemCol()))
Test.assertEqualsHashed(als.getUserCol(), "db36668fa9a19fde5c9676518f9e86c17cabf65a", "Incorrect choice of {0} for ALS user column.".format(als.getUserCol()))
Test.assertEqualsHashed(als.getRatingCol(), "3c2d687ef032e625aa4a2b1cfca9751d2080322c", "Incorrect choice of {0} for ALS rating column.".format(als.getRatingCol()))

# COMMAND ----------

# MAGIC %md
# MAGIC ### (2c) Testing Your Model
# MAGIC 
# MAGIC So far, we used the `training_df` and `validation_df` datasets to select the best model.  Since we used these two datasets to determine what model is best, we cannot use them to test how good the model is; otherwise, we would be very vulnerable to [overfitting](https://en.wikipedia.org/wiki/Overfitting).  To decide how good our model is, we need to use the `test_df` dataset.  We will use the `best_rank` you determined in part (2b) to create a model for predicting the ratings for the test dataset and then we will compute the RMSE.
# MAGIC 
# MAGIC The steps you should perform are:
# MAGIC * Run a prediction, using `my_model` as created above, on the test dataset (`test_df`), producing a new `predict_df` DataFrame.
# MAGIC * Filter out unwanted NaN values (necessary because of [a bug in Spark](https://issues.apache.org/jira/browse/SPARK-14489)). We've supplied this piece of code for you.
# MAGIC * Use the previously created RMSE evaluator, `reg_eval` to evaluate the filtered DataFrame.
示例#11
0
    # Fragment of a larger script: `model`, `test`, `als`, `ratingSamples`
    # and `spark` are defined above this excerpt.
    predictions = model.transform(test)
    # Inspect a sample of the learned latent-factor vectors.
    model.itemFactors.show(10, truncate=False)
    model.userFactors.show(10, truncate=False)
    # NOTE(review): labelCol is 'ratingFloat' here, not 'rating' — confirm
    # the upstream pipeline actually produces that column.
    evaluator = RegressionEvaluator(predictionCol="prediction",
                                    labelCol='ratingFloat',
                                    metricName='rmse')
    rmse = evaluator.evaluate(predictions)
    print("Root-mean-square error = {}".format(rmse))
    # Generate top 10 movie recommendations for each user
    userRecs = model.recommendForAllUsers(10)
    # Generate top 10 user recommendations for each movie
    movieRecs = model.recommendForAllItems(10)
    # Generate top 10 movie recommendations for a specified set of users
    users = ratingSamples.select(als.getUserCol()).distinct().limit(3)
    userSubsetRecs = model.recommendForUserSubset(users, 10)
    # Generate top 10 user recommendations for a specified set of movies
    movies = ratingSamples.select(als.getItemCol()).distinct().limit(3)
    movieSubSetRecs = model.recommendForItemSubset(movies, 10)
    userRecs.show(5, False)
    movieRecs.show(5, False)
    userSubsetRecs.show(5, False)
    movieSubSetRecs.show(5, False)
    # Single-point "grid" search over regParam via 10-fold cross-validation.
    paramGrid = ParamGridBuilder().addGrid(als.regParam, [0.01]).build()
    cv = CrossValidator(estimator=als,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=10)
    # NOTE(review): cross-validation is fit on the *test* split here —
    # this almost certainly should be the training split; confirm intent.
    cvModel = cv.fit(test)
    avgMetrics = cvModel.avgMetrics
    spark.stop()
示例#12
0
class RecommendationEngine:
    """
    Music recommendation engine backed by per-batch Spark ALS models.

    Three cumulative listening-count batches are loaded (entry i unions
    batches 0..i), one ALS model is trained per entry, and results are
    returned as ``{'result': ...}`` dicts ready for JSON serialization.
    """
    def __init__(self, spark_session, dataset_path):
        """Init the engine from a Spark session and a dataset directory."""
        self.spark_session = spark_session
        logger.info("Starting up the Spark Session: {}".format(
            self.spark_session))

        # Load listening count data
        logger.info("Loading listening count dataset...")
        self.listening_count_df = []
        for i in range(0, 3):
            lc_file_path = os.path.join(dataset_path,
                                        'batch/batch' + str(i) + '.txt')
            new_df = spark_session.read.csv(lc_file_path,
                                            header=None,
                                            inferSchema=True).na.drop()
            new_df = new_df.selectExpr("_c0 as user_id", "_c1 as artist_id",
                                       "_c2 as weight")
            # Entry i accumulates everything loaded so far (explicit check
            # instead of relying on IndexError from negative indexing into
            # an empty list, which the original try/except abused).
            if self.listening_count_df:
                new_df = self.listening_count_df[-1].union(new_df)
            self.listening_count_df.append(new_df)
            logger.info("{} loaded".format('batch' + str(i) + '.txt'))
        logger.info("Loading listening count dataset done!")

        # Load artist data
        logger.info("Loading artist dataset...")
        artist_file_path = os.path.join(dataset_path, 'csv/artists.csv')
        self.artist_df = spark_session.read.csv(artist_file_path,
                                                header="true",
                                                inferSchema="true").na.drop()
        self.artist_df.createOrReplaceTempView("artists")
        self.artist_df_selected = self.spark_session.sql(
            "SELECT `id` as artist_id, `name`, `url` \
                            FROM artists")
        logger.info("Loading artist dataset done...")

        # Train the model
        self.__train_model()

    def __train_model(self):
        """Train one ALS model per cumulative listening-count batch."""
        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5,
                       regParam=0.01,
                       userCol="user_id",
                       itemCol="artist_id",
                       ratingCol="weight",
                       coldStartStrategy="drop")
        self.model = []
        for i in range(0, 3):
            self.model.append(self.als.fit(self.listening_count_df[i]))
            logger.info("Model {} done : {}".format(
                i, self.listening_count_df[i].count()))
        logger.info("ALS model built!")

    def _collect_json(self, df):
        """Collect *df* into a ``{'result': [row_dict, ...]}`` payload."""
        return {'result': [json.loads(row) for row in df.toJSON().collect()]}

    def get_top_ratings(self, model_id, user_id, num_of_books):
        # Recommends up to num_of_books top unheard artists to user_id,
        # using the model trained on cumulative batch model_id.
        user = self.listening_count_df[model_id].select(self.als.getUserCol())
        user = user.filter(user.user_id == user_id)
        userSubsetRecs = self.model[model_id].recommendForUserSubset(
            user, num_of_books)
        userSubsetRecs = userSubsetRecs.withColumn("recommendations",
                                                   explode("recommendations"))
        userSubsetRecs = userSubsetRecs.select(func.col('recommendations')['artist_id'].alias('artist_id'), \
                             func.col('recommendations')['Rating'].alias('Rating')).drop('recommendations')
        userSubsetRecs = userSubsetRecs.drop('Rating')
        # Attach artist name/url before serializing.
        userSubsetRecs = userSubsetRecs.join(self.artist_df_selected,
                                             ('artist_id'), 'inner')
        return self._collect_json(userSubsetRecs)

    def get_top_music_recommend(self, model_id, artist_id, num_of_users):
        # Recommends up to num_of_users users most likely to listen to artist_id.
        artist = self.listening_count_df[model_id].select(
            self.als.getItemCol())
        artist = artist.filter(artist.artist_id == artist_id)
        artistSubsetRecs = self.model[model_id].recommendForItemSubset(
            artist, num_of_users)
        artistSubsetRecs = artistSubsetRecs.withColumn(
            "recommendations", explode("recommendations"))
        artistSubsetRecs = artistSubsetRecs.select(func.col('recommendations')['user_id'].alias('user_id'), \
                            func.col('recommendations')['Rating'].alias('Rating')).drop('recommendations')
        artistSubsetRecs = artistSubsetRecs.drop('Rating')
        return self._collect_json(artistSubsetRecs)

    def get_listening_count_for_artist_ids(self, model_id, user_id, artist_id):
        # Given a user_id and an artist_id, predict the listening count.
        request = self.spark_session.createDataFrame([(user_id, artist_id)],
                                                     ['user_id', 'artist_id'])
        weight = self.model[model_id].transform(request).collect()
        # NOTE(review): with coldStartStrategy="drop" an unseen pair yields
        # an empty result and weight[0] raises IndexError — confirm callers
        # only pass known ids.
        return {'result': weight[0][2]}

    def get_listening_count(self, model_id, user_id):
        # Get listening count history for a user.
        self.listening_count_df[model_id].createOrReplaceTempView(
            "listeningcount")
        user_history = self.spark_session.sql(
            'SELECT `artist_id`, `weight` from listeningcount \
                        WHERE `user_id` = "%s"' % user_id)
        user_history = user_history.join(self.artist_df_selected,
                                         ('artist_id'), 'inner')
        return self._collect_json(user_history)
示例#13
0
class RecommendationEngine:
    """
    A book recommendation engine backed by a Spark ALS model.

    Loads ratings.csv and books.csv from a dataset directory, trains an
    ALS collaborative-filtering model, and serves recommendations,
    predictions and rating history as JSON strings.
    """
    
    def __init__(self, spark_session, dataset_path):
        """
        Init the recommendation engine given a Spark context and a dataset path
        """
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session
        # Load ratings data for later use
        logger.info("Loading Ratings dataset...")
        ratings_file_path = os.path.join(dataset_path, 'ratings.csv')
        self.ratings_df = spark_session.read.csv(ratings_file_path, header="true", inferSchema="true").na.drop()
        # Load books data for later use
        logger.info("Loading Books dataset...")
        books_file_path = os.path.join(dataset_path, 'books.csv')
        self.books_df = spark_session.read.csv(books_file_path, header="true", inferSchema="true").na.drop()
        self.books_df.createOrReplaceTempView("books")
        # Keep only id/title for joining onto recommendation output.
        self.books_df_selected = self.spark_session.sql("SELECT `book_id`, `title` \
                                                        FROM books")
        # Train the model
        self.__train_model()

    def __train_model(self):
        """
        Train the ALS model with the current dataset
        """
        logger.info("Training the ALS model...")
        # coldStartStrategy="drop" removes NaN predictions for unseen ids.
        self.als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="book_id", ratingCol="rating", coldStartStrategy="drop")
        self.model = self.als.fit(self.ratings_df)
        logger.info("ALS model built!")

    def get_top_ratings(self, user_id, book_count):
        """
        Recommends up to book_count top unrated books to user_id,
        returned as a pandas-style JSON string.
        """
        users = self.ratings_df.select(self.als.getUserCol())
        users = users.filter(users.user_id == user_id)
        userSubsetRecs = self.model.recommendForUserSubset(users, book_count)
        # One row per recommendation instead of one array per user.
        userSubsetRecs = userSubsetRecs.withColumn("recommendations", explode("recommendations"))
        # NOTE(review): 'Rating' is selected and then immediately dropped
        # below — only user_id/book_id survive into the output.
        userSubsetRecs = userSubsetRecs.select(func.col('user_id'), \
                                               func.col('recommendations')['book_id'].alias('book_id'), \
                                               func.col('recommendations')['Rating'].alias('Rating')).drop('recommendations')
        userSubsetRecs = userSubsetRecs.drop('Rating')
        userSubsetRecs = userSubsetRecs.join(self.books_df_selected, ("book_id"), 'inner')
        userSubsetRecs = userSubsetRecs.toPandas()
        userSubsetRecs = userSubsetRecs.to_json()
        return userSubsetRecs

    def get_top_book_recommend(self, book_id, user_count):
        """
        Recommends up to user_count users predicted to rate book_id
        highest, returned as a pandas-style JSON string.
        """
        books = self.ratings_df.select(self.als.getItemCol())
        books = books.filter(books.book_id == book_id)
        bookSubsetRecs = self.model.recommendForItemSubset(books, user_count)
        bookSubsetRecs = bookSubsetRecs.withColumn("recommendations", explode("recommendations"))
        bookSubsetRecs = bookSubsetRecs.select(func.col('book_id'), \
                                                func.col('recommendations')['user_id'].alias('user_id'), \
                                                func.col('recommendations')['Rating'].alias('Rating')).drop('recommendations')
        bookSubsetRecs = bookSubsetRecs.drop('Rating')
        bookSubsetRecs = bookSubsetRecs.join(self.books_df_selected, ("book_id"), 'inner')
        bookSubsetRecs = bookSubsetRecs.toPandas()
        bookSubsetRecs = bookSubsetRecs.to_json()
        return bookSubsetRecs

    def get_ratings_for_book_ids(self, user_id, book_id):
        """
        Given a user_id and a list of book_ids, predict ratings for them
        """
        request = self.spark_session.createDataFrame([(user_id, book_id)], ["user_id", "book_id"])
        ratings = self.model.transform(request).collect()
        return ratings

    def add_ratings(self, user_id, book_id, ratings_given):
        """
        Add an additional book rating in the format (user_id, book_id, rating)
        and retrain the model on the extended dataset.
        """
        # Convert ratings to an RDD
        new_ratings = self.spark_session.createDataFrame([(user_id, book_id, ratings_given)],
                                                         ["user_id", "book_id", "rating"])
        # Add new ratings to the existing ones
        self.ratings_df = self.ratings_df.union(new_ratings)
        # Re-train the ALS model with the new ratings
        self.__train_model()
        new_ratings = new_ratings.toPandas()
        new_ratings = new_ratings.to_json()
        return new_ratings

    def get_history(self, user_id):
        """
        Get rating history for a user
        """
        self.ratings_df.createOrReplaceTempView("ratingsdata")
        user_history = self.spark_session.sql('SELECT `user_id`, `book_id`, `rating` from ratingsdata \
                                                WHERE `user_id` = "%s"' %user_id)
        user_history = user_history.join(self.books_df_selected, ("book_id"), 'inner')
        user_history = user_history.toPandas()
        user_history = user_history.to_json()
        return user_history
class RecommendationEngine:
    """A product recommendation engine backed by up to three Spark ALS models.

    Each ``model-N.txt`` file found in the dataset directory becomes the
    DataFrame for slot N-1 (``df0``..``df2``) and is used to train the
    corresponding ALS model (``model1``..``model3``).  Slot-selection
    branching is centralized in private helpers instead of the original
    three near-identical copies of every method body.
    """

    def _new_als(self):
        """Return a fresh ALS estimator with the engine's fixed hyper-parameters."""
        return ALS(maxIter=5,
                   regParam=0.01,
                   userCol="UserId",
                   itemCol="ProductId",
                   ratingCol="Rating",
                   coldStartStrategy="drop")

    def _dataset(self, model):
        """Return the ratings DataFrame for slot *model* (0, 1 or 2).

        Raises AttributeError if the corresponding file was absent at
        init time (same behavior as the original direct attribute access).
        """
        return getattr(self, 'df' + str(model))

    def _fitted_model(self, model):
        """Return the trained ALS model for slot *model* (0, 1 or 2)."""
        return getattr(self, 'model' + str(model + 1))

    def __train_all_model(self):
        """Train the ALS models for all three dataset slots."""
        for slot in range(3):
            logger.info("Training the ALS model %d", slot + 1)
            self.als = self._new_als()
            setattr(self, 'model' + str(slot + 1),
                    self.als.fit(self._dataset(slot)))
            logger.info("ALS model %d built!", slot + 1)

    def __train_model(self, model):
        """Re-train the ALS model for slot *model* on its current dataset."""
        logger.info("Training the ALS model...")
        self.als = self._new_als()
        if model in (0, 1, 2):
            setattr(self, 'model' + str(model + 1),
                    self.als.fit(self._dataset(model)))
        logger.info("ALS model built!")

    def get_top_ratings(self, model, user_id, products_count):
        """Recommend up to *products_count* products to *user_id* using slot *model*.

        Returns a pandas-style JSON string, or None for an unknown slot
        (preserving the original implicit-None behavior).
        """
        if model not in (0, 1, 2):
            return None
        users = self._dataset(model).select(self.als.getUserCol())
        users = users.filter(users.UserId == user_id)
        recs = self._fitted_model(model).recommendForUserSubset(
            users, products_count)
        # One row per recommendation instead of one array per user.
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(
            func.col('UserId'),
            func.col('recommendations')['ProductId'].alias('ProductId'),
            func.col('recommendations')['Rating'].alias('Rating'),
        ).drop('recommendations')
        recs = recs.drop('Rating')
        return recs.toPandas().to_json()

    def get_top_product_recommend(self, model, product_id, user_count):
        """Recommend up to *user_count* users for *product_id* using slot *model*.

        Returns a pandas-style JSON string, or None for an unknown slot.
        """
        if model not in (0, 1, 2):
            return None
        products = self._dataset(model).select(self.als.getItemCol())
        products = products.filter(products.ProductId == product_id)
        recs = self._fitted_model(model).recommendForItemSubset(
            products, user_count)
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(
            func.col('ProductId'),
            func.col('recommendations')['UserId'].alias('UserId'),
            func.col('recommendations')['Rating'].alias('Rating'),
        ).drop('recommendations')
        recs = recs.drop('Rating')
        return recs.toPandas().to_json()

    def get_ratings_for_product_ids(self, model, user_id, product_id):
        """Predict the rating *user_id* would give *product_id* with slot *model*.

        Returns a list of Rows, or None for an unknown slot.
        """
        if model not in (0, 1, 2):
            return None
        request = self.spark_session.createDataFrame(
            [(user_id, product_id)], ["UserId", "ProductId"])
        return self._fitted_model(model).transform(request).collect()

    def __init__(self, spark_session, dataset_path):
        """Init the recommendation engine given a Spark context and a dataset path."""
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session

        # Load Amazon data for later use: each model-N.txt file, when
        # present, becomes the DataFrame for slot N-1 (df0..df2).
        logger.info("Loading Amazon data...")
        for slot in range(3):
            dataset_file_path = os.path.join(
                dataset_path, 'model-' + str(slot + 1) + '.txt')
            if os.path.isfile(dataset_file_path):
                df = spark_session.read.csv(dataset_file_path,
                                            header=None,
                                            inferSchema=True)
                df = df.selectExpr("_c0 as UserId", "_c1 as ProductId",
                                   "_c2 as Rating")
                setattr(self, 'df' + str(slot), df)

        # Train the model
        self.__train_all_model()
示例#15
0
class RecommendationEngine:
    """An anime recommendation engine backed by a Spark ALS model.

    Holds the full ratings DataFrame in ``self.ratings`` and a fitted ALS
    model in ``self.model``; retrains whenever new ratings are added.
    """

    def __train_model(self):
        """(Re)train the ALS model on the current ``self.ratings`` dataset."""
        logger.info("Training the ALS model...")
        # coldStartStrategy="drop" avoids NaN predictions for users/items
        # absent from the training data.
        self.als = ALS(maxIter=5,
                       regParam=0.01,
                       userCol="user_id",
                       itemCol="anime_id",
                       ratingCol="rating",
                       coldStartStrategy="drop")
        self.model = self.als.fit(self.ratings)
        logger.info("ALS model built!")

    def add_ratings(self, user_id, anime_id, ratings):
        """Add one (user_id, anime_id, rating) row, retrain, and return it.

        Returns the newly added rating rendered to JSON via pandas.
        """
        row = [(user_id, anime_id, ratings)]
        addition = self.spark.createDataFrame(row,
                                              ["user_id", "anime_id", "rating"])
        # Merge the new rating into the dataset and rebuild the model.
        self.ratings = self.ratings.union(addition)
        self.__train_model()
        return addition.toPandas().to_json()

    def get_ratings_for_anime_ids(self, user_id, anime_id):
        """Predict the rating ``user_id`` would give ``anime_id`` (as JSON)."""
        request = self.spark.createDataFrame([(user_id, anime_id)],
                                             ["user_id", "anime_id"])
        return self.model.transform(request).toPandas().to_json()

    def get_top_ratings(self, user_id, animes_count):
        """Recommend up to ``animes_count`` top animes for ``user_id`` as JSON."""
        subset = self.ratings.select(self.als.getUserCol()).distinct()
        subset = subset.filter(subset.user_id == user_id)
        recommendations = self.model.recommendForUserSubset(subset, animes_count)
        self.json_top = recommendations.toPandas().to_json()
        return self.json_top

    def get_anime_top_ratings(self, anime_id, users_count):
        """Recommend up to ``users_count`` top users for ``anime_id`` as JSON."""
        subset = self.ratings.select(self.als.getItemCol()).distinct()
        subset = subset.filter(subset.anime_id == anime_id)
        recommendations = self.model.recommendForItemSubset(subset, users_count)
        self.json_top = recommendations.toPandas().to_json()
        return self.json_top

    def __init__(self, spark, dataset_path):
        """Init the recommendation engine given a Spark session and a dataset path."""
        logger.info("Starting up the Recommendation Engine: ")
        self.spark = spark

        # Load ratings data for later use
        logger.info("Loading Ratings data...")
        ratings_file_path = os.path.join(dataset_path, 'rating.csv')
        self.ratings = spark.read.csv(ratings_file_path,
                                      header=True,
                                      inferSchema=True)

        self.__train_model()
# Fit the ALS model on the indexed ratings dataframe.
mymodel = als.fit(indexedDf)

# Generate top 10 recommendations for every user and every product.
userRecs = mymodel.recommendForAllUsers(10)

ProductRecs = mymodel.recommendForAllItems(10)


userRecs.show()
ProductRecs.show()

# Generate top 5 movie recommendations for a specified set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = mymodel.recommendForUserSubset(users, 5)
# Generate top 5 user recommendations for a specified set of products
products = ratings.select(als.getItemCol()).distinct().limit(3)
productSubSetRecs = mymodel.recommendForItemSubset(products, 5)

userSubsetRecs.show()
# BUG FIX: the original called ProductSubSetRecs.show() — a NameError;
# the variable assigned above is productSubSetRecs.
productSubSetRecs.show()


#pred_rdd = predictions.rdd
#pred_rdd.repartition(1).saveAsTextFile("preds")


# Persist the per-user recommendations as a single text file.
rdd1 = userRecs.rdd
rdd2 = ProductRecs.rdd

rdd1.repartition(1).saveAsTextFile("userRecs")
# NOTE(review): rdd2 (ProductRecs) is never saved — presumably a matching
# saveAsTextFile("productRecs") call was intended; confirm with the author.
示例#17
0
class RecommendationEngine:
    """A Yelp recommendation engine.

    Maintains three independent ALS models (``model1``..``model3``), each
    trained on one shard of the Stars data (``df0``..``df2``).  Every public
    method takes a ``model`` index in {0, 1, 2} selecting which shard/model
    pair to use; unknown indices return None, mirroring the original
    fall-through behaviour.
    """

    def __new_als(self):
        """Return a fresh ALS estimator with the engine's fixed hyper-parameters."""
        # coldStartStrategy="drop" avoids NaN predictions for users/items
        # absent from the training data.
        return ALS(maxIter=5,
                   regParam=0.01,
                   userCol="userId",
                   itemCol="businessId",
                   ratingCol="Stars",
                   coldStartStrategy="drop")

    def __pick(self, model):
        """Map a model index (0-2) to its (dataframe, fitted model) pair.

        Returns (None, None) for an unknown index.
        """
        if model == 0:
            return self.df0, self.model1
        if model == 1:
            return self.df1, self.model2
        if model == 2:
            return self.df2, self.model3
        return None, None

    @staticmethod
    def __flatten_recs(recs, key_col, rec_col):
        """Explode the nested ALS `recommendations` column into flat rows.

        Produces one row per recommendation with columns
        (key_col, rec_col, 'rating').
        """
        recs = recs.withColumn("recommendations", explode("recommendations"))
        return recs.select(
            func.col(key_col),
            func.col('recommendations')[rec_col].alias(rec_col),
            func.col('recommendations')['rating'].alias('rating'))

    def __train_all_model(self):
        """Train all three ALS models, one per data shard."""
        for index in range(3):
            self.__train_model(index)

    def __train_model(self, model):
        """Train the ALS model for a single shard (0, 1 or 2)."""
        logger.info("Training the ALS model %d", model + 1)
        self.als = self.__new_als()
        if model == 0:
            self.model1 = self.als.fit(self.df0)
        elif model == 1:
            self.model2 = self.als.fit(self.df1)
        elif model == 2:
            self.model3 = self.als.fit(self.df2)
        logger.info("ALS model %d built!", model + 1)

    def get_top_stars(self, model, userId, business_count):
        """Recommend up to business_count top businesses to userId, as JSON."""
        df, fitted = self.__pick(model)
        if fitted is None:
            return None
        users = df.select(self.als.getUserCol())
        users = users.filter(users.userId == userId)
        userSubsetRecs = fitted.recommendForUserSubset(users, business_count)
        userSubsetRecs = self.__flatten_recs(userSubsetRecs,
                                             'userId', 'businessId')
        return userSubsetRecs.toPandas().to_json()

    def get_top_business_recommend(self, model, businessId, user_count):
        """Recommend up to user_count top users for businessId, as JSON."""
        df, fitted = self.__pick(model)
        if fitted is None:
            return None
        business = df.select(self.als.getItemCol())
        business = business.filter(business.businessId == businessId)
        businessSubsetRecs = fitted.recommendForItemSubset(business, user_count)
        # BUG FIX: the original model==2 branch passed an undefined name to
        # recommendForItemSubset (NameError: the local was `businesss`), and
        # the model==1 branch joined against self.businesssdf, an attribute
        # never assigned anywhere in the class (AttributeError).  The
        # model==0 branch also aliased the score column 'stars' while the
        # other branches used 'rating'; all shards now share one flow and
        # emit 'rating'.
        businessSubsetRecs = self.__flatten_recs(businessSubsetRecs,
                                                 'businessId', 'userId')
        return businessSubsetRecs.toPandas().to_json()

    def get_stars_for_business_ids(self, model, userId, businessId):
        """Given a userId and a businessId, predict Stars for the pair.

        Returns the collected prediction rows from the selected model.
        """
        _, fitted = self.__pick(model)
        if fitted is None:
            return None
        request = self.spark_session.createDataFrame(
            [(userId, businessId)], ["userId", "businessId"])
        return fitted.transform(request).collect()

    def __init__(self, spark_session, dataset_path):
        """Init the recommendation engine given a Spark session and a dataset path."""
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session
        # Load Stars data for later use
        logger.info("Loading Stars data...")

        # Each shard file, when present, becomes one of df0/df1/df2 with the
        # raw CSV columns renamed to (userId, businessId, Stars).  As in the
        # original, a missing shard leaves its attribute unset.
        for attr, file_name in (('df0', 'data_part_1.txt'),
                                ('df1', 'data_part_2.txt'),
                                ('df2', 'data_part_3.txt')):
            file_path = os.path.join(dataset_path, file_name)
            if os.path.isfile(file_path):
                df = spark_session.read.csv(file_path,
                                            header=None,
                                            inferSchema=True)
                setattr(self, attr, df.selectExpr("_c0 as userId",
                                                  "_c1 as businessId",
                                                  "_c2 as Stars"))
        # Train the model
        self.__train_all_model()
示例#18
0
    # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
    als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
              coldStartStrategy="drop")
    model = als.fit(training)

    # Evaluate the model by computing the RMSE on the test data
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("Root-mean-square error = " + str(rmse))

    # Generate top 10 movie recommendations for each user
    userRecs = model.recommendForAllUsers(10)
    # Generate top 10 user recommendations for each movie
    movieRecs = model.recommendForAllItems(10)

    # Generate top 10 movie recommendations for a specified set of users
    # (only 3 distinct users are sampled, to keep the demo output small)
    users = ratings.select(als.getUserCol()).distinct().limit(3)
    userSubsetRecs = model.recommendForUserSubset(users, 10)
    # Generate top 10 user recommendations for a specified set of movies
    movies = ratings.select(als.getItemCol()).distinct().limit(3)
    movieSubSetRecs = model.recommendForItemSubset(movies, 10)
    # $example off$
    # Display all four recommendation tables, then shut the session down.
    userRecs.show()
    movieRecs.show()
    userSubsetRecs.show()
    movieSubSetRecs.show()

    spark.stop()
def CollaborativeFiltering(spark, sampleDataPath):
    """Train and evaluate an ALS collaborative-filtering model on a ratings CSV.

    Loads the rating samples, fits ALS on an 80/20 random split, reports the
    test RMSE, prints several top-10 recommendation lists, and finally runs a
    10-fold cross-validation over a single-point parameter grid.
    """
    # Cast the raw CSV string columns to the numeric types ALS expects.
    ratingSamples = spark.read.format('csv').option('header', 'true').load(sampleDataPath) \
        .withColumn("userIdInt", F.col("userId").cast(IntegerType())) \
        .withColumn("movieIdInt", F.col("movieId").cast(IntegerType())) \
        .withColumn("ratingFloat", F.col("rating").cast(FloatType()))

    # Randomly split the samples 0.8:0.2 into training and test sets.
    training_data, test_data = ratingSamples.randomSplit((0.8, 0.2))

    # Build the matrix-factorization model on the training set.
    # (The string literal below documents the ALS parameters in Chinese:
    #  regParam = L2 regularization lambda; maxIter = number of alternating
    #  iterations over the user/item latent factors; userCol/itemCol/ratingCol
    #  = DataFrame column names; coldStartStrategy='drop' ignores users/items
    #  unseen during training instead of predicting NaN.)
    '''参数详解
    regParam:L2正则的系数lambda
    maxIter:交替计算User与Item的latent factors的迭代次数
    userCol:DataFrame中用户列的名字
    itemCol:DataFrame中物品列的名字
    ratingCol:DataFrame中评分列的名字
    coldStateStrategy,冷启动策略:设定为'drop'以确保模型在预测时遇到未知user或者item时(即没有在训练集中出现过)不会返回NaN,而是直接忽略
    '''
    als = ALS(regParam=0.01,
              maxIter=5,
              userCol='userIdInt',
              itemCol='movieIdInt',
              ratingCol='ratingFloat',
              coldStartStrategy='drop')

    # Train the model.
    model = als.fit(training_data)
    # Evaluate the model by computing the RMSE on the test set.
    predictions = model.transform(test_data)
    # Show the item and user latent-factor vectors; these can be used as
    # Item Embeddings and User Embeddings respectively.
    model.itemFactors.show(10, truncate=False)
    model.userFactors.show(10, truncate=False)
    # Use Spark's regression evaluator with metricName 'rmse'
    # (Root Mean Square Error).
    evaluator = RegressionEvaluator(predictionCol="prediction",
                                    labelCol="ratingFloat",
                                    metricName='rmse')
    rmse = evaluator.evaluate(predictions)
    # Print the result.
    print("RMSE = {}".format(rmse))
    # Generate a top-10 item (movie) recommendation list for every user.
    recListForUser = model.recommendForAllUsers(10)
    # Generate a top-10 user recommendation list for every movie.
    recListForMovie = model.recommendForAllItems(10)
    # Top-10 movie recommendations for each user in a given 3-user subset.
    userSubset = ratingSamples.select(als.getUserCol()).distinct().limit(3)
    recListForUserSubset = model.recommendForUserSubset(userSubset, 10)
    # Top-10 user recommendations for each movie in a given 3-movie subset.
    movieSubset = ratingSamples.select(als.getItemCol()).distinct().limit(3)
    recListForMovieSubset = model.recommendForItemSubset(movieSubset, 10)
    # Display the recommendation results.
    recListForUser.show(5, truncate=False)
    recListForMovie.show(5, truncate=False)
    recListForUserSubset.show(5, truncate=False)
    recListForMovieSubset.show(5, truncate=False)

    paramGrid = ParamGridBuilder().addGrid(als.regParam, [0.01]).build()
    # Cross-validation with an offline evaluation strategy:
    # split the samples into k equal folds, use each fold in turn as the
    # validation set with the remaining folds as training data, run k
    # train/evaluate rounds (k is typically 10), and report the mean of the
    # k evaluation metrics as the final score.
    cv = CrossValidator(estimator=als,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=10)
    # NOTE(review): cross-validation is fitted on test_data here — this looks
    # like it should be training_data (or the full ratingSamples); confirm.
    cvModel = cv.fit(test_data)
    avgMetrics = cvModel.avgMetrics
示例#20
0
File: main.py  Project: vksmgr/music
# Evaluate the trained ALS model: RMSE of predicted vs. actual play counts.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="count", predictionCol="prediction")

rmse = evaluator.evaluate(prediction)
print("Root-mean-square error = " + str(rmse))

# Top-10 artist recommendations for every user.
userRecs = model.recommendForAllUsers(10)

# Top-10 user recommendations for every artist.
artistRecs = model.recommendForAllItems(10)



# Top 10 artist recommendations for a sample of 3 distinct users.
users = user_artist_data_df.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)


# Top 10 user recommendations for a sample of 3 distinct artists.
artist = user_artist_data_df.select(als.getItemCol()).distinct().limit(3)
artistSubSetRecs = model.recommendForItemSubset(artist, 10)

# Fixed typo in the banner: "Recomendation" -> "Recommendation".
print("Top 10 Recommendation : ")
userRecs.show(10)
print("============================")
artistRecs.show(10)
print("============================")
userSubsetRecs.show(10)
print("============================")
artistSubSetRecs.show(10)

spark.stop()