def _calculateSimilarity(self, threshold=0):
    """Compute pairwise cosine similarity between the rows of ``self.utility``.

    Each row id ``i`` in ``1..numRows()`` (named "user" here, with columns
    named "movie") is turned into a binary sparse vector of length
    ``numCols()`` that holds a 1 at every column index ``j`` for which the
    row has an entry. The stored entry values themselves are ignored; only
    the presence of an entry matters.

    Args:
        threshold: Currently unused; kept so existing callers keep working.

    Returns:
        A tuple ``(sims, sims_set)`` where ``sims`` maps every ordered pair
        ``(i, j)`` with ``i != j`` to the cosine similarity of the two rows,
        and ``sims_set`` maps every ``i`` to the list of all other ids ``j``
        in ascending order. A pair involving an id that has no entries gets
        similarity ``0.0``.
    """
    user_cnt = self.utility.numRows()
    movie_cnt = self.utility.numCols()

    # Build every row's vector and norm exactly once. Looking rows up by id
    # (instead of rescanning the collected groups for each pair) also means a
    # row without entries can no longer silently reuse the vector of whichever
    # row was matched on an earlier iteration.
    vectors = dict()
    norms = dict()
    grouped = self.utility.entries.groupBy(lambda x: x.i).collect()
    for user_id, entries in grouped:
        vector = SparseVector(movie_cnt, [(entry.j, 1) for entry in entries])
        vectors[user_id] = vector
        norms[user_id] = vector.norm(2)

    # Ids are treated as 1-based: id 0 is never considered.
    user_ids = range(1, user_cnt + 1)
    sims = dict()
    sims_set = dict((i, list()) for i in user_ids)
    for i in user_ids:
        for j in user_ids:
            if i == j:
                continue
            denominator = norms.get(i, 0) * norms.get(j, 0)
            if denominator:
                sim = vectors[i].dot(vectors[j]) / denominator
            else:
                # At least one of the two ids has no entries, so the cosine
                # is undefined; report "no similarity" rather than failing.
                sim = 0.0
            sims[(i, j)] = sim
            sims_set[i].append(j)
    return (sims, sims_set)