Exemplo n.º 1
0
def popularity_based_metrics(ratings, tips):
    """Score user pairs by the popularity of the businesses they share.

    For every pair of users that rated a common business, each shared
    business adds ``1 / log(n)`` to the pair's score, where ``n`` is the
    number of rating rows of that business (``aa_pop_ratings``) or the
    number of its tip rows (``aa_pop_tips``).

    :param ratings: DataFrame providing ``user_id`` and ``business_id``.
    :param tips: DataFrame providing ``user_id`` and ``business_id``.
    :return: full outer join of both scores on ``user_id``/``user_id_2``.
    """
    pair_key = ["user_id", "user_id_2"]

    def co_rating_pairs():
        # Self-join on the business; "<" keeps one row per unordered pair.
        # NOTE(review): ``ren`` is assumed to expose the second user as
        # ``user_id_2`` while leaving ``business_id`` untouched.
        other_side = ren(ratings, ["business_id"])
        same_business = ratings.join(other_side, "business_id")
        return same_business.filter(col("user_id") < col("user_id_2"))

    def adamic_adar(popularity, count_col, score_col):
        # Attach the per-business count, then sum 1/log(count) per pair.
        weighted = co_rating_pairs().join(popularity, "business_id")
        return weighted.groupBy(*pair_key).agg(
            F.sum(1 / F.log(count_col)).cast("float").alias(score_col))

    reviews_per_business = ratings.groupBy("business_id").agg(
        F.count(F.lit(1)).alias("total_reviews"))
    adamic_ratings = adamic_adar(
        reviews_per_business, "total_reviews", "aa_pop_ratings")

    # The right join keeps every user that appears in ``ratings``; users
    # without tips end up as rows whose ``business_id`` is null.
    rating_users = ratings.select("user_id").distinct()
    fold_tips = tips.join(rating_users, "user_id", "right")
    tips_per_business = fold_tips.groupBy("business_id").agg(
        F.count(F.lit(1)).alias("total_tips"))
    adamic_tips = adamic_adar(tips_per_business, "total_tips", "aa_pop_tips")

    return adamic_ratings.join(adamic_tips, pair_key, "outer")
Exemplo n.º 2
0
def socialBasedMetrics(ratings, user_friend):
    """Compute friendship-graph similarities for pairs of users.

    Two users are paired when they share at least one friend. For each
    pair the function derives an Adamic-Adar score, a Jaccard score, a
    cosine score and the preferential-attachment product from the ``nf``
    values of both users and the number of shared friends.

    :param ratings: DataFrame providing ``user_id``; only its users are
        kept on the ``user_id`` side (right join).
    :param user_friend: DataFrame with ``user_id``, ``friend`` and ``nf``
        (presumably the user's number of friends -- confirm with caller).
    :return: DataFrame with ``user_id``, ``user_id_2``,
        ``adamic_adar_graph``, ``jaccard_graph``, ``cosine_graph`` and
        ``preferential_attachment``; rows where none of the first three
        scores is positive are dropped.
    """
    fold_users = ratings.select("user_id").distinct()
    fold_edges = user_friend.join(fold_users, "user_id", "right")

    # Look up the ``nf`` value of the friend itself.
    friend_sizes = fold_edges.select(
        col("user_id").alias("friend"),
        col("nf").alias("nf_friend")).distinct()
    edges = fold_edges.join(friend_sizes, "friend").select(
        "user_id", "nf", "friend", "nf_friend")

    # Pair up users through a shared friend, one row per unordered pair.
    mirrored_edges = ren(edges, ["friend"])
    shared_friend = edges.join(mirrored_edges, "friend").filter(
        col("user_id") < col("user_id_2"))

    overlap = shared_friend.groupBy(
        "user_id", "user_id_2", "nf", "nf_2").agg(
            F.count(F.lit(1)).alias("intersection"),
            F.sum(1 / F.log("nf_friend")).cast("float").alias(
                "adamic_adar_graph"))

    jaccard_expr = (
        col("intersection") /
        (col("nf") + col("nf_2") - col("intersection"))).cast("float")
    cosine_expr = (
        col("intersection") /
        (F.sqrt(col("nf") * col("nf_2")))).cast("float")

    scored = overlap.withColumn("jaccard_graph", jaccard_expr)
    scored = scored.withColumn("cosine_graph", cosine_expr)
    scored = scored.withColumn(
        "preferential_attachment", col("nf") * col("nf_2"))

    graph = scored.select(
        "user_id", "user_id_2", "adamic_adar_graph", "jaccard_graph",
        "cosine_graph", "preferential_attachment")
    has_signal = ((col("adamic_adar_graph") > 0)
                  | (col("jaccard_graph") > 0)
                  | (col("cosine_graph") > 0))
    return graph.filter(has_signal)
Exemplo n.º 3
0
def categoryAndTemporalBasedMetrics(ratings, business_data):
    """Compute the Jaccard similarity of user pairs over rated categories.

    Despite its name, this function only produces the category-based
    score ``jaccard_cat``; no temporal feature is computed here.

    :param ratings: ratings DataFrame, forwarded to
        ``getFoldCategoriesAndRatings``.
    :param business_data: business DataFrame, forwarded to
        ``getFoldCategoriesAndRatings``.
    :return: DataFrame with ``user_id``, ``user_id_2`` and ``jaccard_cat``.
    """
    # Only the second element (ratings with a ``category`` column) is used.
    _, cat_rating = getFoldCategoriesAndRatings(ratings, business_data)

    # One row per (user, category) combination.
    ucs = cat_rating.select("user_id", "category").distinct()

    # Keep only the categories that appear for at most 2000 distinct users.
    # No sort is needed: the joins below do not preserve row order.
    kept_categories = ucs.groupBy("category").agg(
        F.count(F.lit(1)).alias("nc")).filter(col("nc") <= 2000).drop("nc")
    ucs = ucs.join(kept_categories, "category", "right")

    # Pair users sharing a category; "<" keeps one row per unordered pair.
    other_user = ucs.select("category", col("user_id").alias("user_id_2"))
    ucs_all_pairs = ucs.join(other_user, "category").filter(
        col("user_id") < col("user_id_2"))
    intersection = ucs_all_pairs.groupBy("user_id", "user_id_2").agg(
        F.count(F.lit(1)).alias("intersection"))

    # Number of kept categories per user, attached for both pair members.
    # NOTE(review): ``ren`` is assumed to yield ``user_id_2`` / ``nr_2``.
    ucs_grouped = ucs.select("user_id").groupBy("user_id").agg(
        F.count(F.lit(1)).alias("nr"))
    intersection = intersection.join(ucs_grouped, "user_id")
    intersection = intersection.join(ren(ucs_grouped), "user_id_2")

    # |A n B| / (|A| + |B| - |A n B|)
    jaccard_expr = (
        col("intersection") /
        (col("nr") + col("nr_2") - col("intersection"))).cast("float")
    return intersection.withColumn("jaccard_cat", jaccard_expr).select(
        "user_id", "user_id_2", "jaccard_cat")
Exemplo n.º 4
0
def jaccard_distance(ratingdf, distancedf, outputcol):
    """Jaccard-style score of user pairs linked through related businesses.

    A rating of ``business_id`` by one user is matched with a rating of
    ``business_id_2`` by another user whenever ``distancedf`` contains the
    row (``business_id``, ``business_id_2``). The number of such matches
    is the intersection; each user's rating-row count is the set size.

    :param ratingdf: DataFrame providing ``user_id`` and ``business_id``.
    :param distancedf: DataFrame providing ``business_id`` and
        ``business_id_2``.
    :param outputcol: name of the produced score column.
    :return: DataFrame with ``user_id``, ``user_id_2`` and ``outputcol``.
    """
    visits = ratingdf.select("user_id", "business_id")

    # Rating rows per user.
    per_user = visits.groupBy("user_id").agg(
        F.count(F.lit(1)).alias("nr")).alias("users_nr")

    # Users that rated the first business of each related pair ...
    first_side = visits.join(distancedf, "business_id").select(
        "user_id", "business_id", "business_id_2")
    # ... matched with the users that rated the second business.
    # NOTE(review): ``ren`` is assumed to suffix every column with ``_2``.
    both_sides = first_side.join(ren(visits), "business_id_2")
    ordered = both_sides.filter(col("user_id") < col("user_id_2"))
    with_sizes = ordered.join(per_user, "user_id").join(
        ren(per_user), "user_id_2")

    overlap = with_sizes.groupBy("user_id", "user_id_2", "nr", "nr_2").agg(
        F.count(F.lit(1)).alias("intersection"))

    score = (
        col("intersection") /
        (col("nr") + col("nr_2") - col("intersection"))).cast("float")
    return overlap.withColumn(outputcol, score).select(
        "user_id", "user_id_2", outputcol)
Exemplo n.º 5
0
def jaccard(ratingdf, columns, outputcol):
    """Jaccard similarity of user pairs over the distinct values of columns.

    Each user is described by the set of distinct ``columns`` value
    combinations found in ``ratingdf``; the score of a pair is
    ``|A n B| / (|A| + |B| - |A n B|)``.

    :param ratingdf: DataFrame providing ``user_id`` and ``columns``.
    :param columns: column name(s) to compare on; normalised through
        ``checkList`` before being concatenated to a list.
    :param outputcol: name of the produced score column.
    :return: DataFrame with ``user_id``, ``user_id_2`` and ``outputcol``,
        one row per unordered pair (``user_id < user_id_2``).
    """
    key_cols = checkList(columns)
    reduced_df = ratingdf.select(["user_id"] + key_cols).distinct()

    # Size of each user's set.
    users_nr = reduced_df.groupBy("user_id").agg(
        F.count(F.lit(1)).alias("nr")
    ) \
        .alias("users_nr")

    # Keep each unordered pair once. This must be the only ordering filter:
    # combining a ">" filter here with a "<" filter after the aggregation
    # can never be satisfied and yields an empty result.
    ratings_join = reduced_df.join(
        ren(reduced_df, key_cols),
        columns).filter(col("user_id") < col("user_id_2"))

    intersection = ratings_join.groupBy("user_id", "user_id_2").agg(
        F.count(F.lit(1)).alias("intersection"))

    # NOTE(review): ``ren`` is assumed to yield ``user_id_2`` / ``nr_2``.
    intersection = intersection.join(users_nr, "user_id").join(
        ren(users_nr), "user_id_2")

    jaccard_df = intersection.withColumn(
        outputcol,
        (col("intersection") /
         (col("nr") + col("nr_2") - col("intersection"))).cast("float"))
    return jaccard_df.select("user_id", "user_id_2", outputcol)
Exemplo n.º 6
0
def jaccard_category(ratingdf, filterCol, outputcol):
    """Jaccard-style score of user pairs over (category, filterCol) matches.

    Two rating rows match when they agree on both ``category`` and
    ``filterCol``. The number of matches of a pair is the intersection;
    each user's row count in ``ratingdf`` is the set size.

    :param ratingdf: DataFrame providing ``user_id``, ``category`` and
        ``filterCol``.
    :param filterCol: name of the additional column rows must agree on.
    :param outputcol: name of the produced score column.
    :return: DataFrame with ``user_id``, ``user_id_2`` and ``outputcol``.
    """
    match_cols = ["category", filterCol]

    # Rows per user, counted on the unreduced input.
    per_user = ratingdf.groupBy("user_id").agg(
        F.count(F.lit(1)).alias("nr")).alias("users_nr")

    slim = ratingdf.select("user_id", "category", filterCol)

    # Self-join on the matching columns, one row per unordered pair.
    mirrored = ren(slim, list(match_cols))
    pairs = slim.join(mirrored, match_cols).filter(
        col("user_id") < col("user_id_2"))

    overlap = pairs.groupBy("user_id", "user_id_2").agg(
        F.count(F.lit(1)).alias("intersection"))

    # NOTE(review): ``ren`` is assumed to yield ``user_id_2`` / ``nr_2``.
    with_sizes = overlap.join(per_user, "user_id")
    with_sizes = with_sizes.join(ren(per_user), "user_id_2")

    score = (
        col("intersection") /
        (col("nr") + col("nr_2") - col("intersection"))).cast("float")
    return with_sizes.withColumn(outputcol, score).select(
        "user_id", "user_id_2", outputcol)
Exemplo n.º 7
0
def geo_distance_metrics(ratings, business_data, business_distance, b_categories):
    """Combine three location-aware Jaccard scores for user pairs.

    * ``jaccard_business_distance`` -- pairs linked through businesses
      kept by ``filter_business_distance(..., 0.2)``;
    * ``jaccard_distance_cat`` -- same, additionally requiring both
      businesses to have an equal ``category``;
    * ``jaccard_city_cat`` -- overlap of the (``category``, ``city``)
      combinations rated by each user.

    :param ratings: ratings DataFrame providing ``business_id``.
    :param business_data: DataFrame providing ``business_id`` and ``city``.
    :param business_distance: DataFrame of business pairs keyed by
        ``business_id`` / ``business_id_2``.
    :param b_categories: DataFrame providing ``business_id`` and
        ``category``.
    :return: full outer join of the three scores on
        ``user_id``/``user_id_2``.
    """
    pair_key = ["user_id", "user_id_2"]
    # NOTE(review): presumably a distance limit; unit not visible here.
    threshold = 0.2

    cat_rating = getFoldCategoriesAndRatings(
        ratings, business_data, b_categories)

    # Restrict the distance table to businesses rated in this fold, on
    # both sides of the pair (right joins keep every rated business).
    rated_first = ratings.select("business_id").distinct()
    rated_second = ratings.select(
        col("business_id").alias("business_id_2")).distinct()
    fold_business_distance = business_distance.join(
        rated_first, "business_id", "right")
    fold_business_distance = fold_business_distance.join(
        rated_second, "business_id_2", "right")

    nearby = filter_business_distance(fold_business_distance, threshold)
    jaccard_business_distance = jaccard_distance(
        ratings, nearby, "jaccard_business_distance")

    # Same pairs, but only when both businesses share a category.
    business_with_cat = fold_business_distance.join(
        b_categories, "business_id")
    business_with_cat = business_with_cat.join(
        ren(b_categories), "business_id_2")
    nearby_same_cat = filter_business_distance(
        business_with_cat, threshold).filter(
            col("category") == col("category_2"))
    jaccard_distance_cat = jaccard_distance(
        ratings, nearby_same_cat, "jaccard_distance_cat")

    business_city = business_data.select("business_id", "city")
    ratings_with_business = cat_rating.join(business_city, "business_id")
    jaccard_city_cat = jaccard(
        ratings_with_business, ["category", "city"], "jaccard_city_cat")

    combined = jaccard_business_distance.join(
        jaccard_distance_cat, pair_key, "outer")
    return combined.join(jaccard_city_cat, pair_key, "outer")
Exemplo n.º 8
0
def jaccard_user_reviews(ratingdf, filterCol, outputcol):
    """Jaccard-style score of user pairs over (business, filterCol) matches.

    Two rating rows match when they agree on both ``business_id`` and
    ``filterCol``. The number of matches of a pair is the intersection;
    each user's row count in ``ratingdf`` is the set size.

    :param ratingdf: DataFrame providing ``user_id``, ``business_id`` and
        ``filterCol``.
    :param filterCol: name of the additional column rows must agree on.
    :param outputcol: name of the produced score column.
    :return: DataFrame with ``user_id``, ``user_id_2`` and ``outputcol``.
    """
    match_cols = ["business_id", filterCol]

    per_user = ratingdf.groupBy("user_id").agg(
        F.count(F.lit(1)).alias("nr")).alias("users_nr")

    # Carry each user's row count along so it survives the self-join.
    sized = ratingdf.join(per_user, "user_id")
    week_ratings = sized.select("user_id", "business_id", filterCol, "nr")

    # NOTE(review): ``ren`` is assumed to yield ``user_id_2`` / ``nr_2``.
    mirrored = ren(week_ratings, list(match_cols))
    matched = week_ratings.join(mirrored, match_cols)
    pairs = matched.filter(col("user_id") < col("user_id_2"))

    overlap = pairs.groupBy("user_id", "user_id_2", "nr", "nr_2").agg(
        F.count(F.lit(1)).alias("intersection"))

    score = (
        col("intersection") /
        (col("nr") + col("nr_2") - col("intersection"))).cast("float")
    return overlap.withColumn(outputcol, score).select(
        "user_id", "user_id_2", outputcol)
Exemplo n.º 9
0
    def build(self):
        """Write the rating folds (optional) and the pair features per fold.

        When ``self._create_folds`` is set, the ratings are split into
        ``self._folds`` train/test folds which are written as parquet.
        Afterwards, for every fold, the rating, social, geographic,
        popularity and home-distance features of user pairs are computed
        from the stored train ratings, outer-joined on
        ``user_id``/``user_id_2``, enriched with per-user usefulness and
        liked-tips values and written to ``/fold_<n>/train_features``.

        All folds are processed; the loop must not return early, otherwise
        only fold 0 would ever get its features written.

        :return: the feature DataFrame of fold 0, or ``None`` when
            ``self._folds`` is 0.
        """
        pair_key = ["user_id", "user_id_2"]

        # Disabled extras kept for reference:
        #  * filtering by the number of train ratings;
        #  * precomputed connected components / PageRank features, stored at
        #    hdfs://192.168.240.10/yelp-graph/connected_components and
        #    hdfs://192.168.240.10/yelp-graph/user_pagerank;
        #  * labelling each pair with an ``are_friends`` flag derived from
        #    the social graph.

        ###########################################################
        # Create the folds from the original ratings
        ###########################################################
        if self._create_folds:
            for fold in range(self._folds):
                print("Building Fold {}".format(fold))
                train_ratings, test_ratings = train_test_split_randomized(
                    self._dataset.ratings, fold, self._folds, self._split)
                self._conn.write_parquet(train_ratings,
                                         "fold_" + str(fold) + "/train")
                self._conn.write_parquet(test_ratings,
                                         "fold_" + str(fold) + "/test")
                self._conn.clear()

        first_features = None
        for fold in range(self._folds):
            print("Creating features for fold {}".format(fold))
            train_ratings = self._conn.read_parquet(
                "fold_{}/train".format(fold))
            rating_based = rating_based_metrics(train_ratings)
            # Social features are restricted to the users of this fold.
            social_based = socialBasedMetrics(train_ratings,
                                              self._dataset.social_graph)
            popularity_based = popularity_based_metrics(
                train_ratings, self._dataset.tips)

            user_home_df = user_home(train_ratings,
                                     self._dataset.users,
                                     self._dataset.business,
                                     self._dataset.business_distance,
                                     filter_best_ratings=False)

            # Compare every home location with every other one and keep the
            # pairs whose distance is below 1 (presumably km -- confirm).
            # NOTE(review): unlike the other features there is no
            # ``user_id < user_id_2`` filter here, so self pairs and both
            # orderings are produced -- confirm this is intended.
            home_pairs = user_home_df.crossJoin(broadcast(ren(user_home_df)))
            home_pairs = home_pairs.withColumn(
                "home_distance",
                km(col("latitude"), col("longitude"),
                   col("latitude_2"), col("longitude_2")))
            user_home_distance = home_pairs \
                .where(col("home_distance") < 1) \
                .select("user_id", "user_id_2",
                        (lit(1) / (lit(1) + col("home_distance"))).alias(
                            "home_sim"))

            # Materialise and read back before joining with the rest.
            home_path = "tmp/fold_{}_home_distance".format(fold)
            self._conn.write_parquet(user_home_distance, home_path)
            user_home_distance = self._conn.read_parquet(home_path)

            fold_business_data = self._dataset.business.join(
                train_ratings.select("business_id").distinct(), "business_id",
                "right")
            # categoryAndTemporalBasedMetrics is currently not part of the
            # feature set.
            geo_based = geo_distance_metrics(train_ratings, fold_business_data,
                                             self._dataset.business_distance,
                                             self._dataset.business_categories)
            dfs = [
                rating_based,
                social_based,
                geo_based,
                popularity_based,
                user_home_distance
            ]

            # Outer-join the feature groups one at a time, writing each
            # intermediate result to parquet and continuing from the stored
            # copy (presumably to keep the query plan small).
            current_df = None
            for df_count, df in enumerate(dfs):
                if current_df is None:
                    current_df = df
                else:
                    current_df = current_df.join(df, pair_key, "outer")

                step_path = "tmp/fold_{}_{}".format(fold, df_count)
                self._conn.write_parquet(current_df, step_path)
                current_df = self._conn.read_parquet(step_path)

            # Per-user values are attached twice: once for ``user_id`` and
            # once, renamed with a ``_2`` suffix, for ``user_id_2``.
            usefulness_df = usefulness(train_ratings)
            all_features = current_df.join(
                usefulness_df, "user_id", "leftouter").join(
                    usefulness_df.select(
                        col("user_id").alias("user_id_2"),
                        col("usefulness").alias("usefulness_2")), "user_id_2",
                    "leftouter")

            liked_tips_df = liked_tips(train_ratings, self._dataset.tips)
            all_features = all_features.join(
                liked_tips_df, "user_id", "leftouter").join(
                    liked_tips_df.select(
                        col("user_id").alias("user_id_2"),
                        col("liked_tips").alias("liked_tips_2")), "user_id_2",
                    "leftouter")

            # NOTE(review): this path starts with "/" while the fold inputs
            # ("fold_<n>/train") do not -- confirm both resolve to the same
            # base directory.
            self._conn.write_parquet(all_features,
                                     "/fold_{}/train_features".format(fold))

            all_features.printSchema()

            if first_features is None:
                # Callers keep receiving the features of fold 0.
                first_features = all_features

            print("Finished building fold " + str(fold))

        return first_features
Exemplo n.º 10
0
    def create_files(self, conn, create_distances, filter_city):
        """Convert the raw Yelp JSON files into id-based parquet datasets.

        Reads business, user, check-in, tip and review JSON files through
        ``conn``, restricts them to the selected businesses and their
        reviewers, replaces the original identifiers through
        ``convert_column_to_id`` / ``replace_ids`` / ``replace_all`` and
        writes the reduced tables as parquet.

        :param conn: object offering ``read_json(path)`` and
            ``write_parquet(df, path)``.
        :param create_distances: when truthy, also write the pairs of
            businesses whose ``km`` distance is at most 1 to
            ``dataset/business_distance/``.
        :param filter_city: suffix appended to every output name. When it
            is ``None`` no business filter is applied and the names carry
            no suffix (plain concatenation would raise ``TypeError``).
        """
        business_df = conn.read_json("dataset/business.json")
        users_df = conn.read_json("dataset/user.json")
        checkins_df = conn.read_json("dataset/checkin.json")
        tips_df = conn.read_json("dataset/tip.json")
        ratings_df = conn.read_json("dataset/review.json")

        if filter_city is not None:
            # NOTE(review): the filter is hard-coded to state "AZ" no matter
            # which value ``filter_city`` holds -- confirm this is intended.
            business_df = business_df.filter(col("state") == "AZ")
        # Keep only rows of the selected businesses, then only the users
        # that reviewed them.
        checkins_df = filter_dataframe(checkins_df, business_df, "business_id")
        tips_df = filter_dataframe(tips_df, business_df, "business_id")
        ratings_df = filter_dataframe(ratings_df, business_df, "business_id")
        users_df = filter_dataframe(users_df, ratings_df, "user_id")

        # ``user_id`` / ``business_id`` are the lookup tables between the
        # original name columns and the new ids.
        user_id, users = convert_column_to_id(users_df, "user_id", "user_name")
        business_id, business = convert_column_to_id(business_df, "business_id", "business_name")

        checkins = replace_ids(checkins_df, "business_id", business_id, "business_name")
        ratings = replace_all(ratings_df, user_id, business_id)
        tips = replace_all(tips_df, user_id, business_id)

        # One row per (user, friend); ``nf`` is the size of the user's
        # friends array. The friend reference is translated to its new id.
        friend_lookup = user_id.select(col("user_name").alias("friend"),
                                       col("user_id").alias("friend_id"))
        social_graph = users.select("user_id", "friends") \
            .select("user_id",
                    f.explode("friends").alias("friend"),
                    f.size("friends").alias("nf")) \
            .join(friend_lookup, "friend") \
            .select("user_id", col("friend_id").alias("friend"), "nf")

        # One row per (business, category); ``nc`` is the size of the
        # business' categories array.
        business_categories = business.select(col("business_id"), "categories") \
            .select("business_id", f.explode("categories").alias("category"),
                    f.size("categories").alias("nc"))
        # Give every distinct category name an integer index.
        category_id = business_categories.select("category").distinct().rdd.map(lambda x: x[0]).zipWithIndex() \
            .toDF(["category_name", "category"]).select("category_name", col("category").cast("integer"))
        business_categories = business_categories.select("business_id",
                                                         col("category").alias("category_name"),
                                                         "nc").join(category_id, "category_name") \
            .drop("category_name")

        # Reduce every table to the columns that are written out.
        ratings = ratings.select("user_id", "business_id", "useful", "stars")
        users = users.select("user_id", "average_stars")
        business = business.select("business_id", "city", "latitude", "longitude")
        social_graph = social_graph.select("user_id", "friend", "nf")
        tips = tips.select("user_id", "business_id", "likes")

        # "<name>_converted_<filter_city>", or "<name>_converted" when no
        # filter was requested.
        suffix = "" if filter_city is None else "_" + filter_city
        outputs = [
            (business, "dataset/yelp_business_converted"),
            (business_categories, "dataset/yelp_business_categories_converted"),
            (users, "dataset/yelp_user_converted"),
            (checkins, "dataset/yelp_checkin_converted"),
            (tips, "dataset/yelp_tip_converted"),
            (ratings, "dataset/yelp_review_converted"),
            (social_graph, "dataset/social_graph_converted"),
        ]
        for frame, base_name in outputs:
            conn.write_parquet(frame, base_name + suffix)

        if create_distances:
            # All-pairs distance through a broadcast cross join; only pairs
            # at a distance of at most 1 are kept.
            lat_and_long = business.select("business_id", "longitude", "latitude")
            business_distance = lat_and_long.crossJoin(f.broadcast(ren(lat_and_long))) \
                .withColumn("distance",
                            km(col("latitude"),
                               col("longitude"),
                               col("latitude_2"),
                               col("longitude_2"))) \
                .where(col("distance") <= 1).select("business_id", "business_id_2", "distance")
            conn.write_parquet(business_distance, "dataset/business_distance/")
Exemplo n.º 11
0
def rating_based_metrics(ratings):
    """Compute rating-based similarities for pairs of users.

    Users are paired when they rated the same business. For each pair the
    cosine, Jaccard, euclidean and Pearson similarities of their ``stars``
    are derived; every score keeps only its strictly positive rows.

    Calling this function triggers a Spark action (a distinct count of
    ``business_id``), which is needed to scale the euclidean distance.

    :param ratings: DataFrame providing ``user_id``, ``business_id`` and
        ``stars``.
    :return: full outer join on ``user_id``/``user_id_2`` of
        ``cosine_rating``, ``jaccard_rating``, ``euclidean_rating`` and
        ``pearson_rating``.
    """
    pair_key = ["user_id", "user_id_2"]

    # ---- shared inputs -------------------------------------------------
    ratings_quad = ratings.select("user_id", "business_id", "stars") \
        .withColumn("stars_quad", col("stars") * col("stars")) \
        .alias("user_business_rating")

    # Per user: sum of squared stars and number of rating rows.
    per_user = ratings_quad.groupBy("user_id").agg(
        f.sum("stars_quad").alias("sum_quad_stars"),
        f.count(f.lit(1)).alias("nr")).alias("user_business_stars_quad")

    ratings_sum = ratings_quad.join(per_user, "user_id").select(
        "business_id", "user_id", "stars", "stars_quad",
        "sum_quad_stars", "nr")

    # One row per co-rated business and unordered user pair.
    # NOTE(review): ``ren`` is assumed to add a ``_2`` suffix to every
    # column except ``business_id``.
    mirrored_ratings = ren(ratings_sum, ["business_id"])
    all_pairs = ratings_sum.join(mirrored_ratings, "business_id").filter(
        col("user_id") < col("user_id_2"))

    star_gap = col("stars") - col("stars_2")

    # ---- cosine --------------------------------------------------------
    cosine_data = all_pairs.groupBy(
        "user_id", "user_id_2", "sum_quad_stars", "sum_quad_stars_2").agg(
            f.sum("stars").alias("sum_stars"),
            f.sum("stars_2").alias("sum_stars_2"),
            f.sum(col("stars") * col("stars_2")).alias("sum_xy"),
            f.sum(star_gap * star_gap).alias("sumxy_diff_quad"))
    cosine_expr = (
        (col("sum_xy")) /
        (f.sqrt("sum_quad_stars") * f.sqrt("sum_quad_stars_2"))).cast("float")
    cosine_rating = cosine_data.withColumn("cosine_rating", cosine_expr) \
        .select(pair_key + ["cosine_rating"]) \
        .filter(col("cosine_rating") > 0)

    # ---- euclidean -----------------------------------------------------
    item_count = ratings.select("business_id").distinct().count()
    item_count_sqrt = math.sqrt(item_count)

    # On co-rated items (x - y)^2 - x^2 - y^2 is summed; adding both users'
    # total squared stars afterwards gives the squared distance over all
    # items, with an unrated item counting as 0.
    dfDiff = all_pairs.withColumn(
        "diff",
        star_gap * star_gap - col("stars_quad") - col("stars_quad_2"))
    euclidean = dfDiff.groupBy(
        "user_id", "user_id_2", "sum_quad_stars", "sum_quad_stars_2").agg(
            f.sum("diff").alias("sum_diff"))
    euclidean = euclidean.withColumn(
        "diff_quad",
        col("sum_diff") + col("sum_quad_stars") + col("sum_quad_stars_2"))
    euclidean_expr = (
        1 / (1 + f.sqrt("diff_quad") / item_count_sqrt)).cast("float")
    euclidean_rating = euclidean.withColumn(
        "euclidean_rating", euclidean_expr) \
        .select(pair_key + ["euclidean_rating"]) \
        .filter(col("euclidean_rating") > 0)

    # ---- jaccard -------------------------------------------------------
    intersection = all_pairs.groupBy(
        "user_id", "user_id_2", "nr", "nr_2").agg(
            f.count(f.lit(1)).alias("intersection"))
    jaccard_expr = (
        col("intersection") /
        (col("nr") + col("nr_2") - col("intersection"))).cast("float")
    jaccard_rating = intersection.withColumn("jaccard_rating", jaccard_expr) \
        .select(pair_key + ["jaccard_rating"]) \
        .filter(col("jaccard_rating") > 0)

    # ---- pearson -------------------------------------------------------
    mean_ratings = ratings_quad.groupBy("user_id").agg(
        f.mean("stars").alias("mean_stars")).alias("mean_ratings")

    # Stars centred on each user's own mean, plus their squares.
    centered_stars = ratings_quad.join(mean_ratings, "user_id")
    centered_stars = centered_stars.withColumn(
        "centered_stars", col("stars") - col("mean_stars"))
    centered_stars = centered_stars.withColumn(
        "centered_quad_stars", col("centered_stars") * col("centered_stars"))

    centered_stars_sums = centered_stars.groupBy("user_id").agg(
        f.sum("centered_stars").alias("sum_centered_stars"),
        f.sum("centered_quad_stars").alias("sum_centered_quad_stars")) \
        .alias("centered_stars_sums")

    centered_stars = centered_stars.join(centered_stars_sums, "user_id")
    centered_pairs = centered_stars.join(
        ren(centered_stars, ["business_id"]),
        "business_id").filter(col("user_id") < col("user_id_2"))

    centered_grouped = centered_pairs.groupBy(
        "user_id", "user_id_2", "sum_centered_quad_stars",
        "sum_centered_quad_stars_2").agg(
            f.sum(col("centered_stars") * col("centered_stars_2")).alias(
                "sum_xy_centered")) \
        .alias("centered_sum_quad")

    pearson_expr = (
        (col("sum_xy_centered")) /
        (f.sqrt("sum_centered_quad_stars") *
         f.sqrt("sum_centered_quad_stars_2"))).cast("float")
    pearson_rating = centered_grouped.withColumn(
        "pearson_rating", pearson_expr) \
        .select(pair_key + ["pearson_rating"]) \
        .filter(col("pearson_rating") > 0)

    # ---- combine -------------------------------------------------------
    combined = cosine_rating.join(jaccard_rating, pair_key, "outer")
    combined = combined.join(euclidean_rating, pair_key, "outer")
    return combined.join(pearson_rating, pair_key, "outer")