示例#1
0
    def _calculateSimilarity(self, threshold=0):
        """Compute the pairwise cosine similarity between users.

        Every user is turned into a binary vector over movies: position
        ``entry.j`` is set to 1 for each entry of ``self.utility`` whose row
        index ``entry.i`` equals the user id; the entry's stored value is
        ignored.  ``self.utility`` is presumably a PySpark coordinate matrix
        (it must provide ``numRows()``, ``numCols()`` and an ``entries`` RDD
        whose items expose ``.i`` and ``.j``) -- confirm against the caller.

        Args:
            threshold: Accepted for interface compatibility.  It is not
                applied: every other user id is listed as a neighbour.

        Returns:
            A ``(sims, sims_set)`` tuple.  ``sims`` maps each ordered pair
            ``(i, j)`` with ``i != j`` to the similarity of the two users;
            ``sims_set`` maps each user id to the list of all other user ids
            in ascending order.  User ids run from 1 to
            ``self.utility.numRows()`` inclusive.  A pair involving a user
            that has no entries gets similarity ``0.0``.
        """
        user_cnt = self.utility.numRows()
        movie_cnt = self.utility.numCols()
        user_ids = range(1, user_cnt + 1)

        # Index the grouped rows by user id once, instead of scanning the
        # whole collected list for every (i, j) pair.
        grouped = dict(self.utility.entries.groupBy(lambda x: x.i).collect())

        # Build each user's vector and its norm a single time.
        vectors = {}
        norms = {}
        for user_id in user_ids:
            rated = grouped.get(user_id)
            if rated is None:
                # No entries for this id; handled as similarity 0.0 below.
                continue
            vec = SparseVector(movie_cnt, [(entry.j, 1) for entry in rated])
            vectors[user_id] = vec
            norms[user_id] = vec.norm(2)

        sims = {}
        sims_set = {user_id: [] for user_id in user_ids}
        for i in user_ids:
            for j in user_ids:
                if i == j:
                    continue

                if j < i:
                    # Cosine similarity is symmetric and (j, i) was stored
                    # during an earlier outer iteration.
                    sim = sims[(j, i)]
                elif i in vectors and j in vectors:
                    sim = vectors[i].dot(vectors[j]) / (norms[i] * norms[j])
                else:
                    # A user without entries has a zero vector; define the
                    # similarity as 0.0 rather than reusing stale data or
                    # dividing by zero.
                    sim = 0.0

                sims[(i, j)] = sim
                sims_set[i].append(j)
        return (sims, sims_set)