def update(self, data):
    """Replace the cluster's tweets, recompute TF/DT and the centroid.

    Args:
        data: the new list of tweets for this cluster.

    Returns:
        The normalised similarity between the old and the new centroid
        (tweet_distance divided by the shorter of the two token lists),
        or 0.0 when either centroid has no tokens.
    """
    # Save the old centroid and recompute TF_DT
    old_centroid = self.centroid
    self.tweets = data
    self.tf, self.dt = computeTF_DT(self.tweets)
    self.centroid = self.calculateCentroid()
    # Guard: calculateCentroid returns '' for an empty cluster, and
    # tokenise('') presumably yields no tokens -- the original code then
    # divided by zero here.  Report a zero shift instead.
    denom = min(len(tokenise(old_centroid)), len(tokenise(self.centroid)))
    if denom == 0:
        return 0.0
    return float(tweet_distance(old_centroid, self.centroid)) / denom
def calculateCentroid(self):
    """Return the tweet whose distance to the current centroid is
    closest to the cluster's mean distance ('' for an empty cluster)."""
    if not self.tweets:
        return ''
    # Distance of every member tweet to the current centroid.
    distances = [tweet_distance(t, self.centroid) for t in self.tweets]
    # Rounded mean distance across the cluster.
    mean_dist = round(float(sum(distances)) / len(self.tweets))
    # Index of the tweet whose distance deviates least from the mean
    # (ties resolve to the first occurrence, as before).
    best_idx = min(range(len(distances)), key=lambda i: abs(distances[i] - mean_dist))
    return self.tweets[best_idx]
def kmeans(tweets, k, maxRound, cutoff): init = random.sample(tweets, k) # randomly sample k tweets clusters = [cluster.Cluster(t) for t in init] # Use the init set as k separate clusters round = 0 while round < maxRound: #print 'Round #%s<br>' % round lists = [[] for c in clusters] # Create an empty list for each cluster for t in tweets: # Compute distances to each of the cluster dist = [ float(tweet_distance(t, clusters[i].centroid)) / min(len(tokenise(t)), len(tokenise(clusters[i].centroid))) for i in range(len(clusters)) ] # Find the max, which indicate the most similarity maxDist = max(dist) idx = dist.index(maxDist) # If the tweet doesn't fit into any cluster (below a threshold), randomly assign it to a cluster, otherwise, assign it to the cluster with maximum distance if maxDist < cutoff: lists[random.sample(range(k), 1)[0]].append(t) else: lists[idx].append(t) # Update the clusters biggest_shift = 0.0 for i in range(len(clusters)): shift = clusters[i].update(lists[i]) biggest_shift = max(biggest_shift, shift) # If the clusters aren't shifting much (i.e. twitter distance remain high), break and return the results if biggest_shift > cutoff: break round = round + 1 #print "Done clustering...<br>" return clusters
def kmeans(tweets, k, maxRound, cutoff):
    """Cluster tweets into k groups by iteratively assigning each tweet to its
    most-similar centroid and recomputing centroids, for at most maxRound
    iterations or until the centroids stop shifting past cutoff.

    Args:
        tweets: iterable of tweets to cluster (at least k of them).
        k: number of clusters to build.
        maxRound: maximum number of assignment/update iterations.
        cutoff: similarity threshold used both for random reassignment of
            poorly-fitting tweets and for the convergence test.

    Returns:
        The list of k Cluster objects.
    """
    init = random.sample(tweets, k)  # randomly sample k tweets as seeds
    clusters = [cluster.Cluster(t) for t in init]  # use the seeds as k separate clusters
    round_num = 0  # renamed from 'round', which shadowed the builtin
    while round_num < maxRound:
        lists = [[] for _ in clusters]  # an empty member list per cluster
        for t in tweets:
            t_len = len(tokenise(t))  # hoisted: invariant across clusters
            # Similarity of t to each centroid (higher = more alike),
            # normalised by the shorter of the two token lists.
            dist = [float(tweet_distance(t, c.centroid)) / min(t_len, len(tokenise(c.centroid)))
                    for c in clusters]
            # The maximum indicates the most similar cluster.
            maxDist = max(dist)
            idx = dist.index(maxDist)
            # Below the threshold the tweet fits no cluster: assign it to a
            # random cluster; otherwise assign to the most similar one.
            if maxDist < cutoff:
                lists[random.randrange(k)].append(t)
            else:
                lists[idx].append(t)
        # Recompute each cluster from its new member list, tracking the
        # largest centroid shift (a similarity: high = barely moved).
        biggest_shift = 0.0
        for c, members in zip(clusters, lists):
            biggest_shift = max(biggest_shift, c.update(members))
        # NOTE(review): breaking on the *max* shift stops as soon as ONE
        # cluster is stable; testing the min may have been intended -- confirm
        # with the author.  Behavior preserved as-is.
        if biggest_shift > cutoff:
            break
        round_num = round_num + 1
    return clusters