Exemplo n.º 1
0
def produce_plots(seed_id: str, user_name: str, threshold: int, i, type, path=DEFAULT_PATH):
    """Clean the seed user's friend list, cluster it, and write the clusters to file.

    The friend list of ``seed_id`` is filtered with
    ``clean_friends_from_list_hard_thresh`` using two hard thresholds chosen
    by ``type``, a local neighbourhood and social graph are built from the
    surviving friends, and the resulting clusters are handed to
    ``write_clusters_to_file``.

    Args:
        seed_id: Id of the seed user whose friends are clustered.
        user_name: Forwarded to ``write_clusters_to_file``.
        threshold: Threshold value; coerced with ``int()``.
        i: Forwarded to ``write_clusters_to_file`` (presumably a run or
            iteration index -- confirm against the caller).
        type: Threshold mode. ``0`` applies ``threshold`` to both hard
            thresholds ("default"), ``1`` only to the second
            ("follower_only"), ``2`` only to the first ("tweet_only").
        path: Configuration file given to ``Injector.get_injector_from_file``.

    Raises:
        ValueError: If ``type`` is not 0, 1 or 2.
    """
    # NOTE: the parameter name `type` shadows the builtin; it is kept because
    # callers may pass it by keyword.
    threshold = int(threshold)
    if type == 0:
        type_str, t1, t2 = "default", threshold, threshold
    elif type == 1:
        type_str, t1, t2 = "follower_only", 0, threshold
    elif type == 2:
        type_str, t1, t2 = "tweet_only", threshold, 0
    else:
        # Fail before any injector/DAO work instead of hitting an
        # UnboundLocalError once the thresholds are first used.
        raise ValueError("Unknown type " + repr(type) + "; expected 0, 1 or 2")

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_friends_cleaner()
    social_graph_constructor = process_module.get_social_graph_constructor()
    clusterer = process_module.get_clusterer()

    # Full, uncleaned friend list of the seed user.
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)
    clean_list = friends_cleaner.clean_friends_from_list_hard_thresh(seed_id, init_user_friends, t1, t2)
    clean_list = [str(friend_id) for friend_id in clean_list]

    init_user_dict = get_local_neighbourhood_user_dict(seed_id, clean_list, user_friend_getter)
    local_neighbourhood = LocalNeighbourhood(seed_id=seed_id, params=None, users=init_user_dict)
    social_graph = social_graph_constructor.construct_social_graph_from_local_neighbourhood(
        seed_id, local_neighbourhood, remove_unconnected_nodes=False)

    clusters = clusterer.cluster_by_social_graph(seed_id, social_graph, {})
    write_clusters_to_file(user_name, clusters, i, threshold, type_str)
Exemplo n.º 2
0
    def download_local_neighbourhood_by_id(self, user_id: str, params=None):
        """Build and store the local neighbourhood of ``user_id``.

        The seed's friend list is read from the cleaned friend getter; if
        none is stored, the friends are downloaded and read back through the
        plain friend getter. For every friend of the seed, that friend's own
        friend list is fetched (downloading it when missing) and restricted
        to ids that are also friends of the seed. The resulting mapping is
        wrapped in a ``LocalNeighbourhood`` and passed to
        ``local_neighbourhood_setter.store_local_neighbourhood``.

        Args:
            user_id: Id of the seed user.
            params: Forwarded unchanged to ``LocalNeighbourhood``.

        Raises:
            RuntimeError: If a friend list is still unavailable after the
                download attempt.
        """
        user_friends_ids = self.cleaned_user_friend_getter.get_user_friends_ids(user_id)
        if user_friends_ids is None:
            log.info("Could not find user_friend list")
            self.user_friends_downloader.download_friends_ids_by_id(user_id)
            user_friends_ids = self.user_friend_getter.get_user_friends_ids(user_id)
        if user_friends_ids is None:
            raise RuntimeError("No friend list available for seed user " + str(user_id))

        # The seed's entry keeps the ids exactly as returned by the getter.
        user_dict = {str(user_id): user_friends_ids}

        # Built once so the per-friend filter below is a constant-time lookup.
        seed_friend_set = set(user_friends_ids)
        num_ids = len(user_friends_ids)

        for i, friend_id in enumerate(user_friends_ids):
            user_friends = self.user_friend_getter.get_user_friends_ids(friend_id)
            was_stored = user_friends is not None
            if not was_stored:
                self.user_friends_downloader.download_friends_ids_by_id(friend_id)
                user_friends = self.user_friend_getter.get_user_friends_ids(friend_id)

            # Check before len() is called, so a failed download is reported
            # as such instead of as a TypeError from the log line.
            if user_friends is None:
                raise RuntimeError("No friend list available for user " + str(friend_id))

            if was_stored:
                log.info("Already stored " + str(len(user_friends)) + " user friends for " + str(friend_id))
            else:
                log.info("Downloaded " + str(len(user_friends)) + " user friends for " + str(friend_id))

            # Keep only friends that are also in the seed's friend list.
            user_dict[str(friend_id)] = [
                str(other_id) for other_id in user_friends if other_id in seed_friend_set
            ]

            log.log_progress(log, i, num_ids)

        local_neighbourhood = LocalNeighbourhood(seed_id=user_id, params=params, users=user_dict)
        self.local_neighbourhood_setter.store_local_neighbourhood(local_neighbourhood)

        log.info("Done downloading local neighbourhood")
Exemplo n.º 3
0
def produce_plots(seed_id: str, user_name: str, path=DEFAULT_PATH, *, threshold=60):
    """Cluster the seed user's cleaned friends and plot utility scores per cluster.

    The seed's friend list is cleaned with ``clean_friends_from_list``, a
    social graph is built from the resulting local neighbourhood and
    clustered. Every cluster with at least five users is ranked by the
    production, consumption and follower rankers; the three pairwise score
    combinations are each plotted twice (default scale and log-log scale),
    and the scores are written with ``write_scores_to_file``.

    Args:
        seed_id: Id of the seed user.
        user_name: Used in plot calls, output file prefixes and the scores file.
        path: Configuration file given to ``Injector.get_injector_from_file``.
        threshold: Value passed as ``percent_threshold`` to the friends
            cleaner. Defaults to 60.
    """
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_friends_cleaner()
    social_graph_constructor = process_module.get_social_graph_constructor()
    clusterer = process_module.get_clusterer()
    cluster_word_frequency_processor = process_module.get_cluster_word_frequency_processor()

    production_ranker = process_module.get_ranker()
    consumption_ranker = process_module.get_ranker(type="Consumption")
    follower_ranker = process_module.get_ranker(type="Follower")

    # Full, uncleaned friend list of the seed user.
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)
    clean_list = friends_cleaner.clean_friends_from_list(seed_id, init_user_friends, percent_threshold=threshold)
    clean_list = [str(friend_id) for friend_id in clean_list]
    init_user_dict = get_local_neighbourhood_user_dict(seed_id, clean_list, user_friend_getter)
    local_neighbourhood = LocalNeighbourhood(seed_id=seed_id, params=None, users=init_user_dict)
    social_graph = social_graph_constructor.construct_social_graph_from_local_neighbourhood(seed_id, local_neighbourhood)
    clusters = clusterer.cluster_by_social_graph(seed_id, social_graph, {})

    # Words excluded from the per-cluster top-word list.
    ignored_words = ("rt", "like")

    # `count` numbers only the clusters that are actually plotted.
    count = 1
    for cluster in clusters:
        if len(cluster.users) < 5:
            continue

        _, prod_scores = production_ranker.rank(seed_id, cluster)
        _, cons_scores = consumption_ranker.rank(seed_id, cluster)
        _, foll_scores = follower_ranker.rank(seed_id, cluster)

        cluster_wf_vector = cluster_word_frequency_processor.process_cluster_word_frequency_vector(cluster.users)
        wf_dict = cluster_wf_vector.get_words_dict()

        # Filter instead of list.remove(): a cluster whose vocabulary lacks
        # one of the ignored words must not abort the whole run.
        sorted_words = [
            word for word in sorted(wf_dict, key=wf_dict.get, reverse=True)
            if word not in ignored_words
        ]
        top_words = sorted_words[:10]

        file_prefix = user_name + '_' + str(count)

        # (x scores, y scores, file suffix, axis-label keyword arguments).
        # The production/consumption pair relies on the plot function's defaults.
        plot_specs = [
            (prod_scores, cons_scores, "prod_cons", {}),
            (prod_scores, foll_scores, "prod_foll",
             {"type1": 'Production Utility', "type2": 'Follower Utility'}),
            (cons_scores, foll_scores, "cons_foll",
             {"type1": 'Consumption Utility', "type2": 'Follower Utility'}),
        ]
        for x_scores, y_scores, suffix, labels in plot_specs:
            # NOTE(review): both scale variants receive the same file name;
            # presumably scatter_plot_from_scores tells them apart -- confirm.
            scatter_plot_from_scores(user_name, x_scores, y_scores, count, top_words,
                                     file_prefix + suffix, **labels)
            scatter_plot_from_scores(user_name, x_scores, y_scores, count, top_words,
                                     file_prefix + suffix, use_log_log_scale=True, **labels)

        write_scores_to_file({"production": prod_scores, "consumption": cons_scores, "follower": foll_scores}, user_name, count)
        count += 1
Exemplo n.º 4
0
def produce_plots(user_name: str, thresh, i, path=DEFAULT_PATH):
    """Re-cluster the largest cluster from a previously written clusters file.

    Reads the JSON clusters file for ``user_name``, ``thresh`` and run ``i``
    from the ``./dc2_exp/local_and_global`` tree, picks the largest cluster
    (the first one on ties), drops the seed user from it, builds a local
    neighbourhood and social graph from the remaining users, clusters that
    graph and writes the result with ``write_clusters_to_file``.

    Args:
        user_name: Screen name of the seed user; also part of the file name.
        thresh: Local threshold value used in the input directory name and
            forwarded to ``write_clusters_to_file``.
        i: Run index used in the input file name and forwarded to
            ``write_clusters_to_file``.
        path: Configuration file given to ``Injector.get_injector_from_file``.

    Raises:
        ValueError: If the clusters file contains no clusters.
    """
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    social_graph_constructor = process_module.get_social_graph_constructor()
    clusterer = process_module.get_clusterer()
    user_getter = dao_module.get_user_getter()

    seed_id = user_getter.get_user_by_screen_name(user_name).id

    experiment_type = 'local_and_global'
    filename = (
        "./dc2_exp/" + experiment_type
        + '/clusters_local_' + str(thresh) + '_global_50/'
        + str(user_name) + '_clusters_' + str(i) + '.json'
    )
    with open(filename, 'r') as file:
        user_lists = json.load(file)

    if not user_lists:
        raise ValueError("No clusters found in " + filename)

    # max() returns the first maximal element, i.e. the earliest cluster on ties.
    largest_cluster = max(user_lists, key=len)

    # The seed is not guaranteed to be in the largest cluster, so filter
    # instead of list.remove(), which raises when the value is missing.
    seed_key = str(seed_id)
    max_cluster = [user for user in largest_cluster if user != seed_key]

    log.info("Num users in max cluster is " + str(len(max_cluster)))
    init_user_dict = get_local_neighbourhood_user_dict(seed_id, max_cluster, user_friend_getter)
    local_neighbourhood = LocalNeighbourhood(seed_id=seed_id, params=None, users=init_user_dict)
    social_graph = social_graph_constructor.construct_social_graph_from_local_neighbourhood(
        seed_id, local_neighbourhood, remove_unconnected_nodes=True)
    clusters = clusterer.cluster_by_social_graph(seed_id, social_graph, {})

    log.info("Iteration: " + str(i))
    write_clusters_to_file(user_name, clusters, i, thresh, "local_and_global_of_cluster")
Exemplo n.º 5
0
def produce_plots(user_name: str, thresh, iteration, path=DEFAULT_PATH):
    """Refine the seed's local neighbourhood by Jaccard similarity and log cluster rankings.

    For each ``k`` in 1..6 the stored local neighbourhood of the seed user is
    refined in two steps:

    1. Every user's friend list is reduced to friends whose own stored friend
       list contains that user. For the seed user a friend also qualifies if
       the seed's id appears in the friend's list from the user friend getter.
    2. Every user's reduced list is ranked by Jaccard similarity between the
       user's list and each friend's list. Friends are kept, in rank order,
       up to the first one whose similarity falls below ``0.1 * k`` times the
       mean similarity of the (at most) ten most similar friends.

    The refined map is turned into a social graph and clustered, and for the
    five largest clusters the production and consumption rankings are logged
    as screen names.

    Args:
        user_name: Screen name of the seed user.
        thresh: Not used by this function; kept so existing callers still work.
        iteration: Not used by this function; kept so existing callers still work.
        path: Configuration file given to ``Injector.get_injector_from_file``.
    """
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    social_graph_constructor = process_module.get_social_graph_constructor()
    clusterer = process_module.get_clusterer()
    user_getter = dao_module.get_user_getter()
    local_neighbourhood_getter = dao_module.get_local_neighbourhood_getter()
    prod_ranker = process_module.get_ranker()
    con_ranker = process_module.get_ranker("Consumption")

    seed_id = user_getter.get_user_by_screen_name(user_name).id
    seed_key = str(seed_id)

    local_neighbourhood = local_neighbourhood_getter.get_local_neighbourhood(seed_id)

    # Refined Friends Method
    for k in range(1, 7):
        log.info("Refining Friends List:")
        user_list = local_neighbourhood.get_user_id_list()
        friends_map = {}
        for user in user_list:
            mutual_friends = []
            for friend in local_neighbourhood.get_user_friends(user):
                follows_back = user in local_neighbourhood.get_user_friends(str(friend))
                if not follows_back and user == seed_key:
                    # Fallback for the seed only: consult the friend's list
                    # from the user friend getter, which is matched by int id.
                    follows_back = int(user) in user_friend_getter.get_user_friends_ids(str(friend))
                if follows_back:
                    # Appended at most once, even when both checks would match.
                    mutual_friends.append(str(friend))
            friends_map[str(user)] = mutual_friends

        log.info("Refining by Jaccard Similarity:")
        # NOTE: friends_map is updated in place, so users processed later are
        # compared against the already refined lists of earlier users.
        for user in [str(user_id) for user_id in user_list]:
            friends_list = friends_map[user]
            similarities = {
                friend: jaccard_similarity(friends_list, friends_map[str(friend)])
                for friend in friends_list
            }
            ranked_friends = sorted(similarities, key=similarities.get, reverse=True)

            # Cut-off: 0.1 * k times the mean similarity of the top (<= 10) friends.
            top_friends = ranked_friends[:10]
            if top_friends:
                top_mean = sum(similarities[friend] for friend in top_friends) / len(top_friends)
                sim_thresh = 0.1 * k * top_mean
            else:
                sim_thresh = 0

            # Position of the first friend below the cut-off; everything
            # before it is kept. Can do more efficiently using binary search.
            cutoff = next(
                (pos for pos, friend in enumerate(ranked_friends)
                 if similarities[friend] < sim_thresh),
                len(ranked_friends),
            )
            # Stored under the user being refined; the scan above uses its own
            # names so the outer loop variable is not overwritten.
            friends_map[user] = ranked_friends[:cutoff]

        log.info("Thresh: " + str(0.1*k))
        log.info("Setting Local Neighborhood:")
        refined_local_neighborhood = LocalNeighbourhood(str(seed_id), None, friends_map)
        social_graph = social_graph_constructor.construct_social_graph_from_local_neighbourhood(
            seed_id, refined_local_neighborhood, is_union=False)
        log.info("Clustering:")
        clusters = clusterer.cluster_by_social_graph(seed_id, social_graph, None)
        log.info(len(clusters))

        # Five largest clusters; the stable sort keeps original order on ties.
        largest_clusters = sorted(clusters, key=lambda c: len(c.users), reverse=True)[:5]
        for cluster in largest_clusters:
            prod_ranking, _ = prod_ranker.rank(str(seed_id), cluster)
            con_ranking, _ = con_ranker.rank(str(seed_id), cluster)
            ranked_prod = prod_ranking.get_all_ranked_user_ids()
            ranked_con = con_ranking.get_all_ranked_user_ids()

            log.info("Cluster Size: " + str(len(cluster.users)))
            log.info("Ranked by Production: ")
            log.info([user_getter.get_user_by_id(str(ranked_id)).screen_name for ranked_id in ranked_prod])
            log.info("Ranked by Consumption: ")
            log.info([user_getter.get_user_by_id(str(ranked_id)).screen_name for ranked_id in ranked_con])