def get_topic_similarity(self, user_data):
    """ Returns cosine similarity between topic scores for two users

    user_data is iterated once; each element is handed to utils.get_tweets
    and the result to self.format_tweets, then self.get_topics. Only the
    similarity between the first and second entries is reported, so at
    least two entries are required (fewer raises IndexError).

    self.get_topics is expected to yield indexable (topic_id, score)
    entries -- presumably sparse LDA-style output; confirm against its
    definition.

    Returns the similarity as a whole-number percentage (int): the raw
    value is rounded to three decimals and then truncated to whole percent.
    """
    # Local import keeps this method self-contained; Decimal is used only
    # for the final percentage conversion.
    from decimal import Decimal

    #=====[ Extracts tweet text from user data ]=====
    tweets = [utils.get_tweets(data) for data in user_data]
    tweets = [self.format_tweets(tweet_set) for tweet_set in tweets]

    #=====[ Gets topics for list of tweets ]=====
    # Materialised as lists because each topic set is walked twice below.
    all_tweet_topics = [list(self.get_topics(tweet_set)) for tweet_set in tweets]

    #=====[ Sizes the dense vector: at least 100 slots, more if a topic id needs it ]=====
    # Zero-padding every vector to a common length leaves cosine similarity unchanged.
    vector_size = max([100] + [score[0] + 1 for topics in all_tweet_topics for score in topics])

    scores = []

    #=====[ Rehydrates a vector in order to take cosine similarity ]=====
    for topics in all_tweet_topics:
        topic_scores = [0] * vector_size
        for score in topics:
            topic_scores[score[0]] = score[1]
        scores.append(topic_scores)

    # Row 0 holds the first user's similarity to every user; column 1 is the second user.
    similarity = cosine_similarity(scores[0:1], scores)

    #=====[ Converts to a whole percent using exact decimal arithmetic ]=====
    # A binary float multiply can undershoot (0.29 * 100 == 28.999999999999996)
    # and truncate one percent low; Decimal keeps the three-decimal value exact.
    similarity = int(Decimal("%.3f" % similarity[0][1]) * 100)

    return similarity
def distill_top_topics(self, user_data, topn):
    """ Returns the names of the highest-scoring topics for one user

    user_data is handed to utils.get_tweets, the result to
    self.format_tweets, and that to self.get_topics, which is expected to
    yield indexable (topic_id, score) entries. Only topics scoring above
    0.15 are considered.

    topn caps the number of names returned; passing None returns every
    qualifying name.

    Returns a list of distinct names looked up in self.topic_names, ordered
    from highest score to lowest. When several topic ids share a name, the
    name appears once, at the position of its best-scoring id.
    """
    #=====[ Extracts tweet text from user data ]=====
    tweets = utils.get_tweets(user_data)
    tweets = self.format_tweets(tweets)

    #=====[ Gets topics for list of tweets ]=====
    all_tweet_topics = self.get_topics(tweets)

    topic_scores = {}

    #=====[ Keeps the score of each topic that clears the 0.15 cutoff ]=====
    for topic in all_tweet_topics:
        if topic[1] > 0.15:
            topic_scores[topic[0]] = topic[1]

    #=====[ Sorts topics best-first and returns #topn of them ]=====
    sorted_topics = sorted(topic_scores.items(), key=operator.itemgetter(1), reverse=True)

    top_topics = []

    for topic in sorted_topics:
        # Checked before appending so that topn == 0 yields an empty list.
        if topn is not None and len(top_topics) >= topn:
            break
        topic_name = self.topic_names[topic[0]]
        # Distinct topic ids may map to the same display name; keep the first (best) one.
        if topic_name not in top_topics:
            top_topics.append(topic_name)

    return top_topics