# Number of ALS iterations to run.
MaxIter = 10
# Regularization strength; guards against overfitting.
RegParam = 0.01

# NOTE(review): Alpha, Rank, implicitPrefs, training and test are not assigned
# in this snippet; they are presumably defined in an earlier cell.
# RegParam is set exactly once (the original chain set it twice).
als = (
    ALS()
    .setMaxIter(MaxIter)
    .setAlpha(Alpha)
    .setRegParam(RegParam)
    .setRank(Rank)
    .setImplicitPrefs(implicitPrefs)
    .setUserCol("userId")
    .setItemCol("movieId")
    .setRatingCol("rating")
)
# explainParams() only returns a string; print it so the parameter summary is
# actually shown instead of being discarded.
print(als.explainParams())

alsModel = als.fit(training)
predictions = alsModel.transform(test)

# COMMAND ----------

# Weight factors between users and items.
# They vary with the configured rank.
alsModel.userFactors.show(10, False)

# Top 10 recommendations for every user.
user_recs = alsModel.recommendForAllUsers(10)
user_recs.show()

# Recommended movie ids and predicted ratings for the user with userId 0.
(
    user_recs.where(user_recs.userId == 0)
    .select("recommendations.movieId", "recommendations.rating")
    .collect()
)
# Load the sample ratings. Each text line is split on "::" into four fields,
# which are cast to userId, movieId, rating and timestamp.
# spark.read.text() already returns a DataFrame, so the former .rdd.toDF()
# round-trip (which also fails on an empty file) is not needed.
# NOTE(review): `spark` and `ALS` are not defined in this snippet; they are
# presumably provided by an earlier cell.
ratings = (
    spark.read.text("/data/sample_movielens_ratings.txt")
    .selectExpr("split(value , '::') as col")
    .selectExpr(
        "cast(col[0] as int) as userId",
        "cast(col[1] as int) as movieId",
        "cast(col[2] as float) as rating",
        "cast(col[3] as long) as timestamp",
    )
)

# 80/20 train/test split. No seed is passed, so the split is not reproducible
# between runs.
training, test = ratings.randomSplit([0.8, 0.2])

als = (
    ALS()
    .setMaxIter(5)
    .setRegParam(0.01)
    .setUserCol("userId")
    .setItemCol("movieId")
    .setRatingCol("rating")
)
# Function-call form of print: valid on Python 3 and, with a single argument,
# on Python 2 as well (the original print statement was Python 2 only).
print(als.explainParams())

alsModel = als.fit(training)
predictions = alsModel.transform(test)

# COMMAND ----------

# Top 10 recommendations per user and per item, one row per recommendation.
alsModel.recommendForAllUsers(10)\
  .selectExpr("userId", "explode(recommendations)").show()
alsModel.recommendForAllItems(10)\
  .selectExpr("movieId", "explode(recommendations)").show()

# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator
recommendation_data.show()

# Persist the modeling data, replacing any earlier output:
recommendation_data.write.mode("overwrite").parquet("data/recommendation_data/")


# ## Split into train and test datasets

# 75% / 25% split with a fixed seed.
train, test = recommendation_data.randomSplit([0.75, 0.25], seed=12345)


# ## Configure and fit an ALS model

from pyspark.ml.recommendation import ALS

# Implicit-feedback ALS, with `playcount` as the rating column.
als = (
    ALS()
    .setUserCol("user")
    .setItemCol("artist")
    .setRatingCol("playcount")
    .setImplicitPrefs(True)
    .setSeed(23456)
)
print(als.explainParams())
als_model = als.fit(train)


# ## Inspect the ALS model

# First five rows of the learned user and item factor tables.
als_model.userFactors.head(5)
als_model.itemFactors.head(5)

# **Note:** Some artists are not represented in the training data:
als_model.userFactors.count()
als_model.itemFactors.count()


# ## Apply the model