def fromLocalNeighbourhood(local_neighbourhood: LocalNeighbourhood, params=None, remove_unconnected_nodes=True):
    """Build the "union" social graph of a local neighbourhood.

    Each listed user other than the seed is added as a node, and a directed
    edge ``user -> str(friend)`` is added for every friend recorded for that
    user; no reciprocity is required.  ``add_edge`` also creates any friend
    node that was not already present.

    Args:
        local_neighbourhood: Supplies the user ids, the per-user friend
            lists, the seed id and the base params.
        params: Accepted for interface compatibility only; the graph params
            are always rebuilt from ``local_neighbourhood.params``.
        remove_unconnected_nodes: When true, the node list is taken from
            ``SocialGraph.remove_unconnected_nodes`` instead of the full
            user id list.

    Returns:
        A ``SocialGraph`` wrapping the directed graph, with
        ``params["graph_type"]`` set to ``"union"``.
    """
    seed = str(local_neighbourhood.seed_id)
    graph = nx.DiGraph()

    user_list = local_neighbourhood.get_user_id_list()
    # The seed may legitimately be missing from the list; list.remove would
    # raise ValueError in that case, so only remove it when present.
    if seed in user_list:
        user_list.remove(seed)
    log.info("Length of list " + str(len(user_list)))

    if remove_unconnected_nodes:
        user_list = SocialGraph.remove_unconnected_nodes(
            local_neighbourhood)
        log.info("Length of list after removing unconnected nodes " + str(len(user_list)))
        if seed in user_list:
            user_list.remove(seed)

    # Nodes are added before any edge so that users without friends still
    # appear in the graph.
    for user in user_list:
        graph.add_node(user)

    for user in user_list:
        for friend in local_neighbourhood.get_user_friends(user):
            graph.add_edge(user, str(friend))

    log.info(graph.order())

    params = deepcopy(local_neighbourhood.params)
    if params is None:
        params = {}
    params["graph_type"] = "union"

    return SocialGraph(graph, local_neighbourhood.seed_id, params)
def fromLocalNeighbourhood(local_neighbourhood: LocalNeighbourhood, params=None, remove_unconnected_nodes=True):
    """Build the "intersection" social graph of a local neighbourhood.

    A directed edge ``user -> friend`` is only added when ``user`` also
    appears in the friend list of ``friend``.

    Args:
        local_neighbourhood: Supplies the user ids, the per-user friend
            lists, the seed id and the base params.
        params: Not read; the graph params are rebuilt from
            ``local_neighbourhood.params``.
        remove_unconnected_nodes: When true, the node list is taken from
            ``SocialGraph.remove_unconnected_nodes`` instead of the full
            user id list.

    Returns:
        A ``SocialGraph`` whose params carry
        ``graph_type == "intersection"``.
    """
    digraph = nx.DiGraph()

    if remove_unconnected_nodes:
        # The full id list is still fetched first, as the original does.
        local_neighbourhood.get_user_id_list()
        members = SocialGraph.remove_unconnected_nodes(local_neighbourhood)
    else:
        members = local_neighbourhood.get_user_id_list()

    # First pass: register every member as a node.
    for member in members:
        digraph.add_node(member)

    # Second pass: keep only reciprocated friendships.
    for member in members:
        for candidate in local_neighbourhood.get_user_friends(member):
            if member not in local_neighbourhood.get_user_friends(candidate):
                continue
            digraph.add_edge(member, candidate)

    copied = deepcopy(local_neighbourhood.params)
    graph_params = {} if copied is None else copied
    graph_params["graph_type"] = "intersection"

    return SocialGraph(digraph, local_neighbourhood.seed_id, graph_params)
def produce_plots(user_name: str, thresh, i, path=DEFAULT_PATH):
    """Clean a user's friend list, build its social graph and cluster it.

    Args:
        user_name: Screen name used to look up the seed user.
        thresh: Passed to the local cleaning step as ``local_following``.
        i: Iteration number; only used in the final log message.
        path: Configuration file handed to ``Injector.get_injector_from_file``.
    """
    injector = Injector.get_injector_from_file(path)
    processes = injector.get_process_module()
    daos = injector.get_dao_module()

    friend_getter = daos.get_user_friend_getter()
    cleaner = processes.get_extended_friends_cleaner()
    graph_constructor = processes.get_social_graph_constructor()
    clusterer = processes.get_clusterer()
    user_getter = daos.get_user_getter()

    seed_id = user_getter.get_user_by_screen_name(user_name).id

    # Full, uncleaned friend list of the seed user.
    all_friends = friend_getter.get_user_friends_ids(seed_id)

    # Global cleaning with fixed thresholds, then local cleaning by `thresh`.
    globally_cleaned = cleaner.clean_friends_global(
        seed_id,
        all_friends,
        tweet_threshold=50,
        follower_threshold=50,
        friend_threshold=0,
        bot_threshold=0,
    )
    kept, removed_list = cleaner.clean_friends_local(
        seed_id, globally_cleaned, local_following=thresh)
    kept_ids = [str(friend_id) for friend_id in kept]

    users = get_local_neighbourhood_user_dict(seed_id, kept_ids, friend_getter)
    neighbourhood = LocalNeighbourhood(seed_id=seed_id, params=None, users=users)
    graph = graph_constructor.construct_social_graph_from_local_neighbourhood(
        seed_id, neighbourhood, remove_unconnected_nodes=True)

    # The clustering result is computed but not used further here.
    clusters = clusterer.cluster_by_social_graph(seed_id, graph, {})

    log.info("Iteration: " + f"{i!s}")
def produce_plots(seed_id: str, user_name: str, threshold: int, i, type, path=DEFAULT_PATH):
    """Cluster a hard-thresholded friend list and write the clusters to file.

    Args:
        seed_id: Id of the seed user.
        user_name: Name passed through to ``write_clusters_to_file``.
        threshold: Threshold value; coerced with ``int()``.
        i: Iteration number passed through to ``write_clusters_to_file``.
        type: Selects which of the two thresholds receives ``threshold``:
            ``0`` ("default") sets both, ``1`` ("follower_only") sets only
            the second, ``2`` ("tweet_only") sets only the first.
        path: Configuration file handed to ``Injector.get_injector_from_file``.

    Raises:
        ValueError: If ``type`` is not 0, 1 or 2.  Previously this surfaced
            later as an ``UnboundLocalError``, after the injector work.
    """
    threshold = int(threshold)

    if type == 0:
        type_str, t1, t2 = "default", threshold, threshold
    elif type == 1:
        type_str, t1, t2 = "follower_only", 0, threshold
    elif type == 2:
        type_str, t1, t2 = "tweet_only", threshold, 0
    else:
        # Fail fast, before any injector or DAO work is done.
        raise ValueError(f"Unsupported type {type!r}; expected 0, 1 or 2")

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_friends_cleaner()
    social_graph_constructor = process_module.get_social_graph_constructor()
    clusterer = process_module.get_clusterer()

    # Full user friend list
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)

    clean_list = friends_cleaner.clean_friends_from_list_hard_thresh(
        seed_id, init_user_friends, t1, t2)
    clean_list = [str(friend_id) for friend_id in clean_list]

    init_user_dict = get_local_neighbourhood_user_dict(seed_id, clean_list, user_friend_getter)
    local_neighbourhood = LocalNeighbourhood(seed_id=seed_id, params=None, users=init_user_dict)
    social_graph = social_graph_constructor.construct_social_graph_from_local_neighbourhood(
        seed_id, local_neighbourhood, remove_unconnected_nodes=False)

    clusters = clusterer.cluster_by_social_graph(seed_id, social_graph, {})
    write_clusters_to_file(user_name, clusters, i, threshold, type_str)
def download_local_neighbourhood_by_id(self, user_id: str, params=None):
    """Assemble and store the local neighbourhood of ``user_id``.

    The seed's friend list is read from the cleaned friend getter; if it is
    missing, the friends are downloaded and read from the regular getter.
    For every friend, that friend's own list is fetched (downloading it
    when not stored) and restricted to ids that are also friends of the
    seed.  The result is stored through ``self.local_neighbourhood_setter``.

    Args:
        user_id: Id of the seed user.
        params: Stored unchanged on the resulting ``LocalNeighbourhood``.
    """
    user_friends_ids = self.cleaned_user_friend_getter.get_user_friends_ids(user_id)
    if user_friends_ids is None:
        log.info("Could not find user_friend list")
        self.user_friends_downloader.download_friends_ids_by_id(user_id)
        user_friends_ids = self.user_friend_getter.get_user_friends_ids(user_id)

    # Built once: the membership test below runs for every friend of every
    # friend, so a list scan there would be quadratic.
    seed_friend_set = set(user_friends_ids)

    user_dict = {str(user_id): user_friends_ids}
    num_ids = len(user_friends_ids)

    for i, friend_id in enumerate(user_friends_ids):
        user_friends = self.user_friend_getter.get_user_friends_ids(friend_id)
        if user_friends is None:
            self.user_friends_downloader.download_friends_ids_by_id(friend_id)
            user_friends = self.user_friend_getter.get_user_friends_ids(friend_id)
            log.info("Downloaded " + str(len(user_friends)) + " user friends for " + str(friend_id))
        else:
            log.info("Already stored " + str(len(user_friends)) + " user friends for " + str(friend_id))

        assert user_friends is not None

        # Keep only friends-of-friend that are themselves friends of the
        # seed; the order of `user_friends` is preserved.
        user_dict[str(friend_id)] = [
            str(other_id) for other_id in user_friends
            if other_id in seed_friend_set
        ]

        log.log_progress(log, i, num_ids)

    local_neighbourhood = LocalNeighbourhood(seed_id=user_id, params=params, users=user_dict)
    self.local_neighbourhood_setter.store_local_neighbourhood(local_neighbourhood)

    log.info("Done downloading local neighbourhood")
def produce_plots(seed_id: str, user_name: str, path=DEFAULT_PATH):
    """Cluster a seed user's cleaned friends and plot utility scores per cluster.

    For every cluster with at least five users, the production, consumption
    and follower rankers are run, the cluster's ten most frequent words are
    collected, six scatter plots are produced (each score pair in linear and
    log-log scale) and the scores are written to file.

    Args:
        seed_id: Id of the seed user.
        user_name: Name used in plot titles and output file prefixes.
        path: Configuration file handed to ``Injector.get_injector_from_file``.
    """
    threshold = 60
    # Words dropped from the "top words" list of every cluster.
    ignored_words = ("rt", "like")

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_friends_cleaner()
    social_graph_constructor = process_module.get_social_graph_constructor()
    clusterer = process_module.get_clusterer()
    cluster_word_frequency_processor = process_module.get_cluster_word_frequency_processor()
    tweet_processor = process_module.get_tweet_processor()

    production_ranker = process_module.get_ranker()
    consumption_ranker = process_module.get_ranker(type="Consumption")
    follower_ranker = process_module.get_ranker(type="Follower")

    # Full user friend list
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)

    clean_list = friends_cleaner.clean_friends_from_list(
        seed_id, init_user_friends, percent_threshold=threshold)
    clean_list = [str(friend_id) for friend_id in clean_list]

    init_user_dict = get_local_neighbourhood_user_dict(seed_id, clean_list, user_friend_getter)
    local_neighbourhood = LocalNeighbourhood(seed_id=seed_id, params=None, users=init_user_dict)
    social_graph = social_graph_constructor.construct_social_graph_from_local_neighbourhood(
        seed_id, local_neighbourhood)
    clusters = clusterer.cluster_by_social_graph(seed_id, social_graph, {})

    # `count` numbers only the clusters that are actually plotted.
    count = 1
    for cluster in clusters:
        if len(cluster.users) < 5:
            continue

        prod_ranking, prod_scores = production_ranker.rank(seed_id, cluster)
        cons_ranking, cons_scores = consumption_ranker.rank(seed_id, cluster)
        foll_ranking, foll_scores = follower_ranker.rank(seed_id, cluster)

        cluster_wf_vector = cluster_word_frequency_processor.process_cluster_word_frequency_vector(cluster.users)
        wf_dict = cluster_wf_vector.get_words_dict()

        # Filter instead of list.remove(): remove() raises ValueError when a
        # cluster's vocabulary does not contain "rt" or "like".
        sorted_words = [
            word for word in sorted(wf_dict, key=wf_dict.get, reverse=True)
            if word not in ignored_words
        ]
        top_words = sorted_words[:10]

        file_prefix = user_name + '_' + str(count)

        scatter_plot_from_scores(user_name, prod_scores, cons_scores, count, top_words,
                                 file_prefix + "prod_cons")
        scatter_plot_from_scores(user_name, prod_scores, cons_scores, count, top_words,
                                 file_prefix + "prod_cons", use_log_log_scale=True)

        scatter_plot_from_scores(user_name, prod_scores, foll_scores, count, top_words,
                                 file_prefix + "prod_foll",
                                 type1='Production Utility', type2='Follower Utility')
        scatter_plot_from_scores(user_name, prod_scores, foll_scores, count, top_words,
                                 file_prefix + "prod_foll", use_log_log_scale=True,
                                 type1='Production Utility', type2='Follower Utility')

        scatter_plot_from_scores(user_name, cons_scores, foll_scores, count, top_words,
                                 file_prefix + "cons_foll",
                                 type1='Consumption Utility', type2='Follower Utility')
        scatter_plot_from_scores(user_name, cons_scores, foll_scores, count, top_words,
                                 file_prefix + "cons_foll", use_log_log_scale=True,
                                 type1='Consumption Utility', type2='Follower Utility')

        write_scores_to_file(
            {"production": prod_scores, "consumption": cons_scores, "follower": foll_scores},
            user_name, count)

        count += 1
def fromLocalNeighbourhood(local_neighbourhood: LocalNeighbourhood, params=None, remove_unconnected_nodes=True):
    """Build the "intersection" social graph, excluding the seed user.

    A directed edge ``user -> friend`` is added only when ``friend`` is not
    the seed and ``user`` also appears in the friend list of ``friend``.
    Nodes left without any outgoing neighbour are removed afterwards.

    Args:
        local_neighbourhood: Supplies the user ids, the per-user friend
            lists, the seed id and the base params.
        params: Accepted for interface compatibility only; the graph params
            are always rebuilt from ``local_neighbourhood.params``.
        remove_unconnected_nodes: When true, the node list is taken from
            ``SocialGraph.remove_unconnected_nodes`` instead of the full
            user id list.

    Returns:
        A ``SocialGraph`` whose params carry
        ``graph_type == "intersection"``.
    """
    seed = str(local_neighbourhood.seed_id)
    graph = nx.DiGraph()

    user_list = local_neighbourhood.get_user_id_list()
    # list.remove raises ValueError for a missing element, so the seed is
    # only removed when it is actually present.
    if seed in user_list:
        user_list.remove(seed)
    log.info("Length of list " + str(len(user_list)))

    if remove_unconnected_nodes:
        user_list = SocialGraph.remove_unconnected_nodes(
            local_neighbourhood)
        log.info("Length of list after removing unconnected nodes " + str(len(user_list)))
        if seed in user_list:
            user_list.remove(seed)

    for user in user_list:
        graph.add_node(user)

    for user in user_list:
        for friend in local_neighbourhood.get_user_friends(user):
            if friend == seed:
                continue
            # Keep the edge only when the friendship is reciprocated.
            if user in local_neighbourhood.get_user_friends(str(friend)):
                graph.add_edge(user, friend)

    # Remove unconnected nodes.  They are collected first because the graph
    # must not be modified while it is being iterated.
    isolated = [node for node in graph if not list(graph.neighbors(node))]
    for node in isolated:
        graph.remove_node(node)

    params = deepcopy(local_neighbourhood.params)
    if params is None:
        params = {}
    params["graph_type"] = "intersection"

    return SocialGraph(graph, local_neighbourhood.seed_id, params)
def get_local_neighbourhood(self, seed_id: str, params=None):
    """Look up a stored local neighbourhood by seed id.

    Args:
        seed_id: Seed user id; converted to ``bson.int64.Int64`` for the query.
        params: When not ``None``, the stored document must also match
            these params.

    Returns:
        The result of ``LocalNeighbourhood.fromDict`` applied to whatever
        ``find_one`` returned.
    """
    query = {"seed_id": bson.int64.Int64(seed_id)}
    if params is not None:
        query["params"] = params

    return LocalNeighbourhood.fromDict(self.collection.find_one(query))
def produce_plots(user_name: str, thresh, i, path=DEFAULT_PATH):
    """Re-cluster the largest cluster from a previously written cluster file.

    The cluster file for ``user_name``, ``thresh`` and iteration ``i`` is
    read from ``./dc2_exp/local_and_global/``.  Its largest cluster (the
    first one on ties) is turned into a local neighbourhood without the
    seed user, clustered again, and the result is written with the label
    ``"local_and_global_of_cluster"``.

    Args:
        user_name: Screen name of the seed user.
        thresh: Local threshold; part of the input path and passed to
            ``write_clusters_to_file``.
        i: Iteration number; part of the input file name.
        path: Configuration file handed to ``Injector.get_injector_from_file``.

    Raises:
        ValueError: If the cluster file contains no clusters (previously an
            ``IndexError`` on ``user_lists[0]``).
    """
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_extended_friends_cleaner()
    social_graph_constructor = process_module.get_social_graph_constructor()
    clusterer = process_module.get_clusterer()
    user_getter = dao_module.get_user_getter()

    seed_id = user_getter.get_user_by_screen_name(user_name).id

    # Full user friend list
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)

    experiment = 'local_and_global'
    filename = (
        "./dc2_exp/" + experiment
        + '/clusters_local_' + str(thresh) + '_global_50/'
        + str(user_name) + '_clusters_' + str(i) + '.json'
    )
    with open(filename, 'r') as file:
        user_lists = json.load(file)

    if not user_lists:
        raise ValueError("No clusters found in " + filename)

    # max() returns the first maximal element, matching a strict ">" scan.
    max_cluster = max(user_lists, key=len)

    # The seed is not guaranteed to be in the largest cluster; an unguarded
    # list.remove would raise ValueError in that case.
    seed = str(seed_id)
    if seed in max_cluster:
        max_cluster.remove(seed)

    log.info("Num users in max cluster is " + str(len(max_cluster)))

    init_user_dict = get_local_neighbourhood_user_dict(seed_id, max_cluster, user_friend_getter)
    local_neighbourhood = LocalNeighbourhood(seed_id=seed_id, params=None, users=init_user_dict)
    social_graph = social_graph_constructor.construct_social_graph_from_local_neighbourhood(
        seed_id, local_neighbourhood, remove_unconnected_nodes=True)

    clusters = clusterer.cluster_by_social_graph(seed_id, social_graph, {})
    log.info("Iteration: " + str(i))

    write_clusters_to_file(user_name, clusters, i, thresh, "local_and_global_of_cluster")
def process_tweets_by_local_neighbourhood(
        self, local_neighbourhood: LocalNeighbourhood):
    """Process the tweets of every user id listed in ``local_neighbourhood``.

    Delegates to ``self.process_tweets_by_user_list`` with the ids returned
    by ``local_neighbourhood.get_user_id_list()``.
    """
    self.process_tweets_by_user_list(local_neighbourhood.get_user_id_list())
def produce_plots(user_name: str, thresh, iteration, path=DEFAULT_PATH):
    """Refine a stored local neighbourhood by Jaccard similarity and rank clusters.

    For ``k`` in 1..6 the stored local neighbourhood of the seed user is
    reduced to reciprocated friendships, each user's friends are pruned to
    those whose Jaccard similarity reaches ``0.1 * k`` times the mean of the
    user's top (at most ten) similarities, the refined neighbourhood is
    clustered, and the five largest clusters are ranked by production and
    consumption with the ranked screen names logged.

    Args:
        user_name: Screen name used to look up the seed user.
        thresh: Not read by this function.
        iteration: Not read by this function.
        path: Configuration file handed to ``Injector.get_injector_from_file``.
    """
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_extended_friends_cleaner()
    social_graph_constructor = process_module.get_social_graph_constructor()
    clusterer = process_module.get_clusterer()
    user_getter = dao_module.get_user_getter()
    user_tweet_getter = dao_module.get_user_tweet_getter()
    clean_user_friend_getter = dao_module.get_cleaned_user_friend_getter()
    local_neighbourhood_getter = dao_module.get_local_neighbourhood_getter()
    prod_ranker = process_module.get_ranker()
    con_ranker = process_module.get_ranker("Consumption")

    seed_id = user_getter.get_user_by_screen_name(user_name).id
    seed = str(seed_id)

    # Full user friend list
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)

    # An earlier variant derived the clean list by global/local cleaning with
    # thresholds scaled to the seed user; the stored cleaned list is used
    # instead.
    clean_list = clean_user_friend_getter.get_user_friends_ids(seed)

    local_neighbourhood = local_neighbourhood_getter.get_local_neighbourhood(seed_id)

    # Refined Friends Method
    for k in range(1, 7):
        log.info("Refining Friends List:")
        user_list = local_neighbourhood.get_user_id_list()
        friends_map = {}
        # NOTE(review): debug output for a hard-coded user id.
        print('1012256833816363008' in user_list)

        for user in user_list:
            friends_list = []
            for friend in local_neighbourhood.get_user_friends(user):
                if user in local_neighbourhood.get_user_friends(str(friend)):
                    friends_list.append(str(friend))
                # For the seed, reciprocity is additionally checked against
                # the full stored friend list of the friend.
                if user == seed:
                    if int(user) in user_friend_getter.get_user_friends_ids(str(friend)):
                        friends_list.append(str(friend))
            friends_map[str(user)] = friends_list
            # NOTE(review): debug output for a hard-coded user id.
            if user == "254201259":
                print(len(friends_list))

        log.info("Refining by Jaccard Similarity:")
        for user in [str(user_id) for user_id in user_list]:
            friends_list = friends_map[user]
            similarities = {}
            for friend in friends_list:
                similarities[friend] = jaccard_similarity(
                    friends_list, friends_map[str(friend)])

            sorted_users = sorted(similarities, key=similarities.get, reverse=True)

            top_sum = 0
            for top_user in sorted_users[:10]:
                top_sum += similarities[top_user]

            # A separate name keeps the `thresh` parameter intact.
            if len(sorted_users) >= 10:
                sim_thresh = 0.1 * k * (top_sum / 10)
            elif len(sorted_users) == 0:
                sim_thresh = 0
            else:
                sim_thresh = 0.1 * k * (top_sum / len(sorted_users))

            # Position of the first friend below the threshold.  The scan
            # uses its own variable: rebinding `user` here made the
            # assignment below overwrite a friend's entry instead of the
            # current user's.
            # Can do more efficiently using binary search
            index = len(sorted_users)
            for position, candidate in enumerate(sorted_users):
                if similarities[candidate] < sim_thresh:
                    index = position
                    break

            friends_map[user] = sorted_users[:index]

        log.info("Thresh: " + str(0.1*k))
        log.info("Setting Local Neighborhood:")
        refined_local_neighborhood = LocalNeighbourhood(seed, None, friends_map)
        social_graph = social_graph_constructor.construct_social_graph_from_local_neighbourhood(
            seed_id, refined_local_neighborhood, is_union=False)

        log.info("Clustering:")
        clusters = clusterer.cluster_by_social_graph(seed_id, social_graph, None)
        log.info(len(clusters))

        # Indices of the clusters, largest first.
        cluster_sizes = {
            cluster_index: len(cluster.users)
            for cluster_index, cluster in enumerate(clusters)
        }
        sorted_indices = sorted(cluster_sizes, key=cluster_sizes.get, reverse=True)

        for cluster_index in sorted_indices[:5]:
            cluster = clusters[cluster_index]
            prod_ranking, prod = prod_ranker.rank(seed, cluster)
            con_ranking, con = con_ranker.rank(seed, cluster)
            ranked_prod = prod_ranking.get_all_ranked_user_ids()
            ranked_con = con_ranking.get_all_ranked_user_ids()

            log.info("Cluster Size: " + str(len(cluster.users)))
            log.info("Ranked by Production: ")
            log.info([user_getter.get_user_by_id(str(ranked_id)).screen_name for ranked_id in ranked_prod])
            log.info("Ranked by Consumption: ")
            log.info([user_getter.get_user_by_id(str(ranked_id)).screen_name for ranked_id in ranked_con])