import redis
from pyspark import sql
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

# Config, print_info and the udf_* / build_* helpers referenced below are
# project-local names assumed to be defined or imported elsewhere in this module.


def process_item_sequence(rating_data):
    # Keep only positive ratings (>= 3.5), then collect each user's movieIds
    # in chronological order via udf_combine_movies_by_timeline.
    sequences = rating_data.filter(F.col("rating") >= 3.5) \
        .groupBy("userId") \
        .agg(udf_combine_movies_by_timeline(
            F.collect_list("movieId"), F.collect_list("timestamp")).alias("movieIds")) \
        .withColumn("movieIdStr", F.array_join(F.col("movieIds"), " "))
    print_info(sequences, message="after build movieIdStr: ")
    # Only the ordered movieId arrays are needed downstream; movieIdStr is for inspection.
    sequences = sequences.select("movieIds")
    return sequences
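# The udf_combine_movies_by_timeline helper used above is defined elsewhere in the
# project. Judging only from the call site, it appears to sort each user's movieIds by
# timestamp and return them as an ordered array of strings. A minimal sketch under that
# assumption (named _example_* here so it does not clash with the real helper):
from pyspark.sql.types import ArrayType, StringType


def _example_combine_movies_by_timeline(movie_ids, timestamps):
    # Pair each movieId with its timestamp, sort chronologically, keep only the ids.
    ordered = sorted(zip(movie_ids, timestamps), key=lambda pair: pair[1])
    return [str(movie_id) for movie_id, _ in ordered]


_example_udf_combine_movies_by_timeline = F.udf(_example_combine_movies_by_timeline,
                                                ArrayType(StringType()))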
def extract_and_save_movie_features_to_redis(features: DataFrame):
    print_info(features)
    # Keep only the most recent feature row per movie (movieRowNum == 1).
    samples = features.withColumn("movieRowNum", F.row_number().over(
        sql.Window.partitionBy("movieId").orderBy(F.col("timestamp").desc()))) \
        .filter(F.col("movieRowNum") == 1) \
        .select("movieId", "releaseYear", "movieGenre1", "movieGenre2", "movieGenre3",
                "movieRatingCount", "movieAvgRating", "movieRatingStddev") \
        .na.fill("")
    pool = redis.ConnectionPool(host='localhost', port=6379, decode_responses=True)
    r = redis.Redis(connection_pool=pool)
    sample_array = samples.collect()
    print("total movie size: %d" % len(sample_array))
    insert_movie_number = 0
    movie_count = len(sample_array)
    for sample in sample_array:
        movie_key = Config.movie_feature_prefix + sample["movieId"]
        value_map = dict()
        value_map["releaseYear"] = sample["releaseYear"]
        value_map["movieGenre1"] = sample["movieGenre1"]
        value_map["movieGenre2"] = sample["movieGenre2"]
        value_map["movieGenre3"] = sample["movieGenre3"]
        value_map["movieRatingCount"] = sample["movieRatingCount"]
        value_map["movieAvgRating"] = sample["movieAvgRating"]
        value_map["movieRatingStddev"] = sample["movieRatingStddev"]
        # hmset is deprecated in redis-py; hset with mapping= writes the whole hash at once.
        r.hset(movie_key, mapping=value_map)
        insert_movie_number += 1
        if insert_movie_number % 100 == 0:
            print("%d/%d..." % (insert_movie_number, movie_count))
    r.close()
    return samples
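# A quick way to verify the write path above is to read one movie hash back from Redis.
# This is an illustrative sketch only; the default movie_id "1" is an arbitrary example.
def _example_read_movie_features(movie_id="1"):
    pool = redis.ConnectionPool(host='localhost', port=6379, decode_responses=True)
    r = redis.Redis(connection_pool=pool)
    # hgetall returns the whole hash as a dict of field -> value strings.
    features = r.hgetall(Config.movie_feature_prefix + movie_id)
    r.close()
    return features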
def add_sample_label(data):
    print_info(data)
    # Show the rating distribution before binarizing.
    sample_count = data.count()
    data.groupBy("rating").count().orderBy("rating") \
        .withColumn("percentage", F.col("count") / sample_count) \
        .show()
    # A rating of 3.5 or higher is treated as a positive sample.
    data = data.withColumn("label", F.when(F.col("rating") >= 3.5, 1).otherwise(0))
    print_info(data)
    return data
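# Minimal usage sketch for add_sample_label on a toy dataset. It assumes a local
# SparkSession is available; the rows are made up for illustration and mirror the
# MovieLens rating schema used throughout this module.
def _example_add_sample_label(spark):
    toy = spark.createDataFrame(
        [("1", "31", 2.5, 1260759144),
         ("1", "1029", 3.0, 1260759179),
         ("2", "10", 4.0, 835355493)],
        ["userId", "movieId", "rating", "timestamp"])
    # The 4.0 rating becomes label 1, the two ratings below 3.5 become label 0.
    return add_sample_label(toy)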
def generate_transition_matrix(movie_sequences: DataFrame):
    print_info(movie_sequences)
    # Turn each ordered movieId sequence into (previous, next) transition pairs.
    pair_data = movie_sequences.rdd.flatMap(build_pair_data)
    print(pair_data.take(20))
    # Group the pairs by source movieId and compute per-source transition weights.
    pair_data_with_weight = pair_data.groupBy(lambda x: x[0]) \
        .map(build_movieId_weight) \
        .collectAsMap()
    # Weight each source movie by its number of distinct successors,
    # normalized by the number of user sequences.
    item2weight = {}
    total_length = movie_sequences.count()
    for movieId, id2weight in pair_data_with_weight.items():
        item2weight[movieId] = len(id2weight) * 1.0 / total_length
    return pair_data_with_weight, item2weight
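# build_pair_data and build_movieId_weight are project helpers not shown here. From the
# way they are used above, build_pair_data seems to turn one sequence row into adjacent
# (previous, next) movieId pairs, and build_movieId_weight seems to turn one grouped
# entry (movieId, pairs) into (movieId, {nextId: transition probability}). Minimal
# sketches under those assumptions:
def _example_build_pair_data(row):
    movie_ids = row["movieIds"]
    # Adjacent items in the watch sequence form a directed transition pair.
    return [(movie_ids[i], movie_ids[i + 1]) for i in range(len(movie_ids) - 1)]


def _example_build_movieId_weight(grouped):
    movie_id, pairs = grouped
    next_counts = {}
    for _, next_id in pairs:
        next_counts[next_id] = next_counts.get(next_id, 0) + 1
    total = sum(next_counts.values())
    # Normalize successor counts into transition probabilities out of movie_id.
    return movie_id, {next_id: count / total for next_id, count in next_counts.items()}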
def add_user_features(data):
    # Window over each user's previous (up to 100) ratings, ordered by time and
    # excluding the current row, so features never leak the current sample.
    recent_window = sql.Window.partitionBy("userId") \
        .orderBy(F.col("timestamp")) \
        .rowsBetween(-100, -1)
    # Collect each user's positive rating history and derive user-level features from it.
    features = data \
        .withColumn("userPositiveHistory",
                    F.collect_list(F.when(F.col("label") == 1, F.col("movieId")).otherwise(F.lit(None)))
                    .over(recent_window)) \
        .withColumn("userPositiveHistory", F.reverse(F.col("userPositiveHistory"))) \
        .withColumn("userRatedMovie1", F.col("userPositiveHistory").getItem(0)) \
        .withColumn("userRatedMovie2", F.col("userPositiveHistory").getItem(1)) \
        .withColumn("userRatedMovie3", F.col("userPositiveHistory").getItem(2)) \
        .withColumn("userRatedMovie4", F.col("userPositiveHistory").getItem(3)) \
        .withColumn("userRatedMovie5", F.col("userPositiveHistory").getItem(4)) \
        .withColumn("userRatingCount", F.count(F.lit(1)).over(recent_window)) \
        .withColumn("userAvgReleaseYear", F.avg(F.col("releaseYear")).over(recent_window).cast("integer")) \
        .withColumn("userReleaseYearStddev", F.stddev(F.col("releaseYear")).over(recent_window).cast("integer")) \
        .withColumn("userAvgRating",
                    F.format_number(F.avg(F.col("rating")).over(recent_window), Config.NUMBER_PRECISION)) \
        .withColumn("userRatingStddev",
                    F.format_number(F.stddev(F.col("rating")).over(recent_window), Config.NUMBER_PRECISION)) \
        .withColumn("userGenres",
                    udf_extract_genres(
                        F.collect_list(F.when(F.col("label") == 1, F.col("genres")).otherwise(F.lit(None)))
                        .over(recent_window))) \
        .na.fill(0) \
        .withColumn("userReleaseYearStddev",
                    F.format_number(F.col("userReleaseYearStddev"), Config.NUMBER_PRECISION)) \
        .withColumn("userGenre1", F.col("userGenres").getItem(0)) \
        .withColumn("userGenre2", F.col("userGenres").getItem(1)) \
        .withColumn("userGenre3", F.col("userGenres").getItem(2)) \
        .withColumn("userGenre4", F.col("userGenres").getItem(3)) \
        .withColumn("userGenre5", F.col("userGenres").getItem(4)) \
        .drop("genres", "userGenres", "userPositiveHistory") \
        .filter(F.col("userRatingCount") > 1)
    print_info(features, topN=20)
    return features
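# udf_extract_genres is defined elsewhere in the project. From the call above it takes
# the list of pipe-separated genre strings from a user's positive history and appears to
# return the genres ordered by how often they occur. A minimal sketch under that
# assumption:
def _example_extract_genres(genre_strings):
    counts = {}
    for genres in genre_strings:
        for genre in genres.split("|"):
            counts[genre] = counts.get(genre, 0) + 1
    # Most frequent genres first.
    return [genre for genre, _ in sorted(counts.items(), key=lambda kv: kv[1], reverse=True)]


_example_udf_extract_genres = F.udf(_example_extract_genres, ArrayType(StringType()))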
def add_movie_features(data, rating_with_label):
    # combine movie data and label data
    data = rating_with_label.join(data, on=['movieId'], how="left")
    # get release year from the title, then drop the raw title
    data = data.withColumn("releaseYear", udf_get_year_from_title(F.col("title"))) \
        .withColumn("title", udf_get_title_from_title(F.col("title"))) \
        .drop("title")
    # split genres into 3 columns; names must match what
    # extract_and_save_movie_features_to_redis selects later
    data = data.withColumn("movieGenre1", F.split(F.col("genres"), "\\|").getItem(0)) \
        .withColumn("movieGenre2", F.split(F.col("genres"), "\\|").getItem(1)) \
        .withColumn("movieGenre3", F.split(F.col("genres"), "\\|").getItem(2))
    # get each movie's rating count, average and standard deviation
    rating_features = data.groupBy("movieId").agg(
        F.count(F.lit(1)).alias("movieRatingCount"),
        F.format_number(F.avg(F.col("rating")), Config.NUMBER_PRECISION).alias("movieAvgRating"),
        F.stddev(F.col("rating")).alias("movieRatingStddev")).na.fill(0) \
        .withColumn("movieRatingStddev", F.format_number(F.col("movieRatingStddev"), Config.NUMBER_PRECISION))
    data = data.join(rating_features, on=["movieId"], how="left")
    print_info(data)
    return data
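# udf_get_year_from_title and udf_get_title_from_title are project helpers not shown
# here. MovieLens titles usually end with the release year in parentheses, e.g.
# "Toy Story (1995)", so a reasonable sketch (an assumption, not the project's actual
# implementation, including the fallback year) is:
from pyspark.sql.types import IntegerType


def _example_get_year_from_title(title):
    # Parse the trailing "(1995)" block; fall back to a default year when it is absent.
    title = title.strip()
    if len(title) >= 6 and title[-1] == ")" and title[-6] == "(":
        try:
            return int(title[-5:-1])
        except ValueError:
            pass
    return 1990


def _example_get_title_from_title(title):
    # Strip the trailing year block to keep only the clean title text.
    return title.strip()[:-6].strip()


_example_udf_get_year_from_title = F.udf(_example_get_year_from_title, IntegerType())
_example_udf_get_title_from_title = F.udf(_example_get_title_from_title, StringType())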