def consume(self, f="jaccard"):
    """Fold the buffered points in ``self.temp`` into this cluster and
    re-elect the clusteroid.

    Each buffered point is scored by its similarity to the current
    clusteroid plus its total similarity to every buffered point; the
    highest-scoring point replaces the clusteroid if it beats the current
    clusteroid's own total similarity.  Buffered points are always added
    to ``self.membership`` and ``self.temp`` is cleared.

    Args:
        f: similarity function name — ``"jaccard"`` (default) uses
           ``get_jaccard``; any other value uses ``cosine_similarity``.
    """
    if len(self.temp) == 0:
        return
    starting_tm = time.time()
    ex_roid_sum = 0  # clusteroid sum to each point
    sum_dists = []
    for p1 in self.temp:
        if f == "jaccard":
            _dist = get_jaccard(p1[1], self.clusteroid)
        else:
            # BUG FIX: cosine_similarity returns a 2-D array; extract the
            # scalar exactly as the inner loop below does, otherwise
            # ex_roid_sum and sum_dists accumulate arrays and the final
            # comparison is no longer a scalar comparison.
            _dist = cosine_similarity(p1[1], self.clusteroid)[0][0]
        # sum of the current clusteroid
        ex_roid_sum += _dist
        # total similarity for this point: clusteroid + all buffered points
        # (the loop includes p1 itself, which adds the same self-similarity
        # constant to every entry and so does not change the argmax)
        total = _dist
        for p2 in self.temp:
            if f == "jaccard":
                total += get_jaccard(p1[1], p2[1])
            else:
                total += cosine_similarity(p1[1], p2[1])[0][0]
        sum_dists.append(total)
        # also update membership
        self.membership.append(p1[0])
    # find the point with the highest total similarity
    max_dist_idx = sum_dists.index(max(sum_dists))
    # declare the new clusteroid only if it beats the current one
    if sum_dists[max_dist_idx] > ex_roid_sum:
        self.clusteroid = self.temp[max_dist_idx][1]
    self.temp = []
    print("Consuming took {:.3f}".format(time.time() - starting_tm),
          ":: Key ~>", self.key, "membership ", len(self.membership))
def consume(self):
    """Merge the buffered points into the cluster and, when a buffered
    point is more central than the current clusteroid, promote it.

    Similarity is a fixed weighted blend of genre Jaccard, tag Jaccard
    and ratings cosine similarity.
    """
    if len(self.temp) == 0:
        return
    starting_tm = time.time()

    def _blend(genres, tags, ratings, point):
        # weighted combination of the three per-feature similarities
        g = get_jaccard(genres, point['genres'])
        t = get_jaccard(tags, point['tags'])
        r = cosine_similarity(ratings, point['ratings'])[0][0]
        return 0.33 * g + 0.25 * t + 0.45 * r

    ex_roid_sum = 0
    sum_dists = []
    for candidate in self.temp:
        # similarity of this candidate to the current clusteroid
        to_roid = _blend(self.clusteroid_genres, self.clusteroid_tags,
                         self.clusteroid_ratings, candidate)
        ex_roid_sum += to_roid
        # plus its similarity to every buffered point (self included)
        total = to_roid
        for other in self.temp:
            total += _blend(other['genres'], other['tags'],
                            other['ratings'], candidate)
        sum_dists.append(total)
        self.membership.append(candidate['movie_id'])
    best = sum_dists.index(max(sum_dists))
    # promote the most central buffered point if it beats the clusteroid
    if sum_dists[best] > ex_roid_sum:
        chosen = self.temp[best]
        self.clusteroid_genres = chosen['genres']
        self.clusteroid_tags = chosen['tags']
        self.clusteroid_ratings = chosen['ratings']
    self.temp = []
    print("Consuming took {:.3f}".format(time.time() - starting_tm),
          ":: Key ~>", self.key, "membership ", len(self.membership))
def fit_with_all(self):
    """Cluster the CSV at ``self.data_path`` in one streaming pass using
    genres, tags and per-user rating vectors.

    The first chunk seeds ``self.k`` ComplexClusters from random rows;
    every other row is assigned to the most similar cluster when its best
    similarity reaches ``self.threshold``, otherwise it is appended to
    ``self.remaining``.  After each chunk every cluster consumes its
    buffered points.
    """
    starting_tm = time.time()
    random_clusters_ids = []
    iteration = 0
    # ratings source for building one vector per movie id
    user_ratings = MoviesRatings(self.ratings_path)
    for chunk in pd.read_csv(self.data_path, chunksize=self.chunk_size):
        loop_tm = time.time()
        chunk_ids = chunk['movieId'].tolist()
        chunk_vectors = {}  # user_ratings.get_many_vectors(chunk_ids)
        # fetch a ratings vector per movie in this chunk, one at a time
        for movie_id in chunk_ids:
            chunk_vectors[movie_id] = user_ratings.get_vector(movie_id)
        print("All vector created in: {:.3f}".format(time.time() - loop_tm))
        if iteration == 0:
            # seed the k discard clusters from random rows of the first
            # chunk (label-based indexing: first-chunk labels equal row
            # positions 0..chunk_size-1)
            rows_id = random.sample(range(self.chunk_size), self.k)
            random_clusters_ids = [chunk['movieId'][row_id] for row_id in rows_id]
            self.discard = [
                ComplexCluster(
                    i, movie_id, chunk['genres'][row_id], chunk['tags'][row_id],
                    chunk_vectors[movie_id]
                )
                for i, (row_id, movie_id) in enumerate(zip(rows_id, random_clusters_ids))
            ]
        clustering_tm = time.time()
        for movie_id, genres, tags in zip(chunk['movieId'], chunk['genres'], chunk['tags']):
            # skip the rows that became the initial clusteroids
            if movie_id in random_clusters_ids:
                continue
            # weighted similarity of this movie to every cluster's clusteroid
            dists = []
            for cluster in self.discard:
                d1 = get_jaccard(cluster.clusteroid_genres, genres)
                d2 = get_jaccard(cluster.clusteroid_tags, tags)
                d3 = cosine_similarity(cluster.clusteroid_ratings, chunk_vectors[movie_id])[0][0]
                distance = 0.33*d1 + 0.25*d2 + 0.45*d3
                dists.append(distance)
            point = {
                "movie_id": movie_id,
                "genres": genres,
                "tags": tags,
                "ratings": chunk_vectors[movie_id]
            }
            # close enough to some cluster -> buffer it there; else retain
            if max(dists) >= self.threshold:
                self.discard[dists.index(max(dists))].add_temp_point(point)
            else:
                self.remaining.append(point)
        print("Clustering part took {:.3f}".format(time.time() - clustering_tm))
        # fold buffered points in and re-elect each clusteroid
        for cluster in self.discard:
            cluster.consume()
        print("chunk ", iteration, " in: {:.3f}".format(time.time() - loop_tm))
        iteration += 1
    print("Total Iterations:", iteration, " Chunk Size: ", self.chunk_size)
    print("Fit duration(s): {:.3f}".format(time.time() - starting_tm))
def complex_absorb(self):
    """Assign every retained point to the most similar discard cluster,
    using the same genre/tag/ratings weighted similarity as fitting."""
    print("Absorb starts")
    starting_tm = time.time()
    for remain in self.remaining:
        # score this retained point against every cluster's clusteroid
        scores = []
        for cluster in self.discard:
            g = get_jaccard(cluster.clusteroid_genres, remain['genres'])
            t = get_jaccard(cluster.clusteroid_tags, remain['tags'])
            r = cosine_similarity(cluster.clusteroid_ratings, remain['ratings'])[0][0]
            scores.append(0.33 * g + 0.25 * t + 0.45 * r)
        # the highest-scoring cluster absorbs the point
        winner = self.discard[scores.index(max(scores))]
        winner.membership.append(remain['movie_id'])
    print("Absorb duration(s): {:.3f}".format(time.time()-starting_tm))
def refresh(self):
    """Re-elect the clusteroid from ``self.members``.

    Picks the member whose summed Jaccard score against all members is
    smallest.  NOTE(review): the other clusteroid elections in this file
    maximize similarity; ``min`` here looks inconsistent — confirm intent.
    """
    totals = [
        sum(get_jaccard(member[1], other[1]) for other in self.members)
        for member in self.members
    ]
    self.clusteroid = self.members[totals.index(min(totals))][1]
def simple_absorb(self):
    """Merge each retained mini-cluster into its most similar discard
    cluster, transferring all of its member ids."""
    print("Absorb starts")
    starting_tm = time.time()
    # "d3" selects cosine similarity; everything else uses Jaccard
    use_cosine = self.distance_f == "d3"
    for remain in self.remaining:
        scores = []
        for cluster in self.discard:
            if use_cosine:
                scores.append(cosine_similarity(remain.clusteroid, cluster.clusteroid)[0][0])
            else:
                scores.append(get_jaccard(remain.clusteroid, cluster.clusteroid))
        # the most similar cluster inherits every member id
        winner = self.discard[scores.index(max(scores))]
        winner.membership.extend([member[0] for member in remain.members])
    print("Absorb duration(s): {:.3f}".format(time.time()-starting_tm))
def fit_with_new(self):
    """Stream the CSV in chunks and cluster rows by Jaccard similarity of
    the ``self.target`` column.

    The first chunk seeds ``self.k`` SimpleClusters from random rows.
    Every other row joins the most similar cluster when its best score
    reaches ``self.threshold``; otherwise it is kept in ``self.remaining``
    as a RemainEntity.  Clusters consume their buffers after each chunk.
    """
    starting_tm = time.time()
    random_clusters_ids = []
    iteration = 0
    for chunk in pd.read_csv(self.data_path, chunksize=self.chunk_size):
        loop_tm = time.time()
        # Seed the k discard clusters from random rows of the first chunk
        # (label indexing matches row positions there).
        if iteration == 0:
            rows_id = random.sample(range(self.chunk_size), self.k)
            random_clusters_ids = [chunk['movieId'][row_id] for row_id in rows_id]
            self.discard = [
                SimpleCluster(i, movie_id, chunk[self.target][row_id])
                for i, (row_id, movie_id) in enumerate(zip(rows_id, random_clusters_ids))
            ]
        for movie_id, record in zip(chunk['movieId'], chunk[self.target]):
            # the seed rows are already clusteroids — skip them
            if movie_id in random_clusters_ids:
                continue
            # similarity of this record to every cluster's clusteroid
            scores = [get_jaccard(cluster.clusteroid, record) for cluster in self.discard]
            best = max(scores)
            if best >= self.threshold:
                # close enough: buffer the point on the winning cluster
                self.discard[scores.index(best)].add_temp_point(movie_id, record)
            else:
                # too far from every cluster: push to the retained set
                self.remaining.append(RemainEntity((movie_id, record)))
        # calculate the new clusteroids in the discard set
        for c in self.discard:
            c.consume()
        # handle retain set --TOO SLOW--
        # self.remaining = hierarchical_cluster(self.remaining, self.threshold)
        print("chunk ", iteration, " in: {:.3f}".format(time.time() - loop_tm))
        iteration += 1
    # end of dataset parse
    print("Total Iterations:", iteration, " Chunk Size: ", self.chunk_size)
    print("Fit duration(s): {:.3f}".format(time.time()-starting_tm))