def update(self, data):
    """Swap in a new tweet list and report the normalised centroid shift.

    The cluster's tweets are replaced by ``data``; the term statistics
    (``self.tf``, ``self.dt``) and the centroid are then recomputed from
    the new tweets.

    Returns:
        The distance between the previous and the new centroid (as given
        by ``tweet_distance``), divided by the token count of whichever
        of the two centroids tokenises to fewer tokens.

    Raises:
        ZeroDivisionError: if either centroid yields no tokens.
    """
    # Remember where the centroid was before anything is overwritten.
    previous_centroid = self.centroid

    self.tweets = data
    term_stats = computeTF_DT(self.tweets)
    self.tf, self.dt = term_stats
    self.centroid = self.calculateCentroid()

    # Normalise the raw shift by the shorter centroid's token count.
    raw_shift = float(tweet_distance(previous_centroid, self.centroid))
    previous_length = len(tokenise(previous_centroid))
    current_length = len(tokenise(self.centroid))
    return raw_shift / min(previous_length, current_length)
def __init__(self, data):
    """Create a cluster seeded with a single tweet.

    Args:
        data: the seed tweet; it must have non-zero length.

    Raises:
        Exception: if ``data`` has length zero.
    """
    if not len(data):
        raise Exception("ILLEGAL: empty cluster")

    # With only one member, the seed tweet is also the centroid.
    self.centroid = data
    # Members are kept as a list of tweets, starting with just the seed.
    self.tweets = [data]
    # tf: term frequency, dt: dominant term.
    term_stats = computeTF_DT(self.tweets)
    self.tf, self.dt = term_stats