예제 #1
0
def process_item_sequence(rating_data):
    """Build one movie-id sequence per user from ratings of at least 3.5.

    The ratings are filtered to ``rating >= 3.5`` and grouped by ``userId``;
    the collected ``movieId`` and ``timestamp`` lists of each user are passed
    to ``udf_combine_movies_by_timeline`` (defined elsewhere; presumably it
    orders the ids by timestamp -- TODO confirm).

    Returns:
        A DataFrame holding only the ``movieIds`` column.
    """
    liked = rating_data.filter(F.col("rating") >= 3.5)
    combined_ids = udf_combine_movies_by_timeline(
        F.collect_list("movieId"), F.collect_list("timestamp")
    ).alias("movieIds")
    per_user = liked.groupBy("userId").agg(combined_ids)

    # The space-joined string column is only shown by the debug print below;
    # it is not part of the returned frame.
    with_joined_ids = per_user.withColumn(
        "movieIdStr", F.array_join(F.col("movieIds"), " ")
    )
    print_info(with_joined_ids, message="after build movieIdStr: ")
    return with_joined_ids.select("movieIds")
예제 #2
0
def extract_and_save_movie_features_to_redis(features: DataFrame):
    """Store the most recent feature row of every movie in Redis as a hash.

    For each ``movieId`` the row with the highest ``timestamp`` is kept, the
    movie feature columns are selected, and each row is written to the Redis
    server at ``localhost:6379`` under the key
    ``Config.movie_feature_prefix + movieId``.

    Args:
        features: DataFrame containing ``movieId``, ``timestamp`` and the
            movie feature columns listed in ``feature_fields`` below.

    Returns:
        The DataFrame of per-movie feature rows that were written.
    """
    feature_fields = ("releaseYear", "movieGenre1", "movieGenre2", "movieGenre3",
                      "movieRatingCount", "movieAvgRating", "movieRatingStddev")

    print_info(features)
    newest_first = sql.Window.partitionBy("movieId").orderBy(F.col("timestamp").desc())
    # NOTE(review): na.fill("") only replaces nulls in string columns; a null
    # in a non-string feature column would still reach Redis as None -- confirm
    # the upstream pipeline rules that out.
    samples = features.withColumn("movieRowNum", F.row_number().over(newest_first)) \
        .filter(F.col("movieRowNum") == 1) \
        .select("movieId", *feature_fields).na.fill("")

    pool = redis.ConnectionPool(host='localhost',
                                port=6379,
                                decode_responses=True)
    r = redis.Redis(connection_pool=pool)
    try:
        sample_array = samples.collect()
        movie_count = len(sample_array)
        # These rows are movies, not users; the message now says so.
        print("total movie size: %d" % movie_count)
        for insert_movie_number, sample in enumerate(sample_array, start=1):
            # str() keeps the concatenation valid when movieId is numeric.
            movie_key = Config.movie_feature_prefix + str(sample["movieId"])
            value_map = {field: sample[field] for field in feature_fields}
            # NOTE(review): hmset is deprecated in newer redis-py releases in
            # favour of hset(name, mapping=...); kept so older clients still work.
            r.hmset(movie_key, value_map)
            if insert_movie_number % 100 == 0:
                print("%d/%d..." % (insert_movie_number, movie_count))
    finally:
        # Release the client and the sockets of the explicitly created pool
        # even when collecting or writing fails part-way through.
        r.close()
        pool.disconnect()
    return samples
예제 #3
0
def add_sample_label(data):
    """Print the rating distribution and add a binary ``label`` column.

    The distribution table shows, per distinct ``rating``, its row count and
    that count divided by the total number of rows. ``label`` is 1 where
    ``rating >= 3.5`` and 0 otherwise.

    Returns:
        The input DataFrame with the extra ``label`` column.
    """
    print_info(data)

    total_rows = data.count()
    per_rating = data.groupBy("rating").count().orderBy("rating")
    per_rating.withColumn("percentage", F.col("count") / total_rows).show()

    is_positive = F.col("rating") >= 3.5
    labelled = data.withColumn("label", F.when(is_positive, 1).otherwise(0))
    print_info(labelled)
    return labelled
예제 #4
0
def generate_transition_matrix(movie_sequences: DataFrame):
    """Derive per-movie transition data and item weights from movie sequences.

    Every row of ``movie_sequences`` is expanded by ``build_pair_data`` into
    pairs, the pairs are grouped by their first element, and each group is
    converted by ``build_movieId_weight`` (both helpers are defined
    elsewhere) before being collected to the driver as a dict.

    Returns:
        A tuple ``(pair_data_with_weight, item2weight)``: the collected dict,
        and a dict mapping each of its keys to ``len(value)`` divided by the
        number of rows in ``movie_sequences``.
    """
    print_info(movie_sequences)

    pairs = movie_sequences.rdd.flatMap(build_pair_data)
    print(pairs.take(20))

    grouped_by_source = pairs.groupBy(lambda pair: pair[0])
    transitions = grouped_by_source.map(build_movieId_weight).collectAsMap()

    sequence_count = movie_sequences.count()
    item2weight = {
        movie_id: len(targets) * 1.0 / sequence_count
        for movie_id, targets in transitions.items()
    }
    return transitions, item2weight
예제 #5
0
def add_user_features(data):
    """Add per-user history features computed from each user's earlier ratings.

    Every feature is evaluated over the same window: the rows of the same
    ``userId`` ordered by ``timestamp``, limited to the 100 rows before the
    current one (the current row is excluded). Rows whose
    ``userRatingCount`` is not greater than 1 are removed, and the helper
    columns ``genres``, ``userGenres`` and ``userPositiveHistory`` are
    dropped from the result.

    Returns:
        The DataFrame extended with the ``user*`` feature columns.
    """
    history = sql.Window.partitionBy("userId").orderBy("timestamp").rowsBetween(-100, -1)
    precision = Config.NUMBER_PRECISION

    def positive_only(column_name):
        # Value of the column on positively labelled rows, null otherwise.
        return F.when(F.col("label") == 1, F.col(column_name)).otherwise(F.lit(None))

    # Positively rated movie ids, reversed relative to the window order.
    features = data.withColumn(
        "userPositiveHistory",
        F.reverse(F.collect_list(positive_only("movieId")).over(history)),
    )
    rated_columns = ("userRatedMovie1", "userRatedMovie2", "userRatedMovie3",
                     "userRatedMovie4", "userRatedMovie5")
    for position, column_name in enumerate(rated_columns):
        features = features.withColumn(
            column_name, F.col("userPositiveHistory").getItem(position)
        )

    # Aggregates over the same preceding-rows window.
    release_year = F.col("releaseYear")
    rating = F.col("rating")
    features = features.withColumn("userRatingCount", F.count(F.lit(1)).over(history))
    features = features.withColumn(
        "userAvgReleaseYear", F.avg(release_year).over(history).cast("integer")
    )
    features = features.withColumn(
        "userReleaseYearStddev", F.stddev(release_year).over(history).cast("integer")
    )
    features = features.withColumn(
        "userAvgRating", F.format_number(F.avg(rating).over(history), precision)
    )
    features = features.withColumn(
        "userRatingStddev", F.format_number(F.stddev(rating).over(history), precision)
    )
    features = features.withColumn(
        "userGenres",
        udf_extract_genres(F.collect_list(positive_only("genres")).over(history)),
    )

    # Nulls are zero-filled before the release-year stddev is formatted.
    features = features.na.fill(0)
    features = features.withColumn(
        "userReleaseYearStddev",
        F.format_number(F.col("userReleaseYearStddev"), precision),
    )

    genre_columns = ("userGenre1", "userGenre2", "userGenre3",
                     "userGenre4", "userGenre5")
    for position, column_name in enumerate(genre_columns):
        features = features.withColumn(
            column_name, F.col("userGenres").getItem(position)
        )

    features = features.drop("genres", "userGenres", "userPositiveHistory")
    features = features.filter(F.col("userRatingCount") > 1)

    print_info(features, topN=20)
    return features
예제 #6
0
def add_movie_features(data, rating_with_label):
    """Join ratings with movie data and add movie-level feature columns.

    Args:
        data: DataFrame joined on ``movieId``; the code reads its ``title``
            and ``genres`` columns (presumably the movie table -- confirm).
        rating_with_label: DataFrame of ratings; it is the left side of the
            join and supplies the ``rating`` column.

    Returns:
        The joined DataFrame without ``title`` and with ``releaseYear``,
        ``genre1``..``genre3``, ``movieRatingCount``, ``AvgMovieRating`` and
        ``StdMovieRating`` added.
    """
    precision = Config.NUMBER_PRECISION

    # Left join keeps every rating row even without a matching movie row.
    merged = rating_with_label.join(data, on=['movieId'], how="left")

    # Release year is derived from the title text by the project UDF.
    merged = merged.withColumn("releaseYear", udf_get_year_from_title(F.col("title")))
    # NOTE(review): the cleaned title is dropped right away, so this step
    # leaves no column behind in the output.
    merged = merged.withColumn("title", udf_get_title_from_title(F.col("title")))
    merged = merged.drop("title")

    # First three "|"-separated entries of the genres string.
    genre_parts = F.split(F.col("genres"), "\\|")
    for position, column_name in enumerate(("genre1", "genre2", "genre3")):
        merged = merged.withColumn(column_name, genre_parts.getItem(position))

    # Per-movie rating count, mean and standard deviation.
    rating = F.col("rating")
    per_movie = merged.groupBy("movieId").agg(
        F.count(F.lit(1)).alias("movieRatingCount"),
        F.format_number(F.avg(rating), precision).alias("AvgMovieRating"),
        F.stddev(rating).alias("StdMovieRating"),
    )
    # Null stddev is zero-filled before being formatted.
    per_movie = per_movie.na.fill(0)
    per_movie = per_movie.withColumn(
        "StdMovieRating", F.format_number(F.col("StdMovieRating"), precision)
    )

    merged = merged.join(per_movie, on=["movieId"], how="left")
    print_info(merged)
    return merged