def worker_function(file_name_list,
                    source_folder,
                    target_folder):
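    """Turn a batch of pickled Twitter-list corpora into per-user keyword annotations.

    For every file in file_name_list, the user's Twitter lists are read from source_folder,
    reduced to a bag of lemmas and a lemma-to-keyword bag, and written as JSON to target_folder.
    """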
    source_path_list = (source_folder + "/" + file_name for file_name in file_name_list)
    target_path_list = (target_folder + "/" + file_name[:-4] + ".json" for file_name in file_name_list)

    # Initialize the NLP resources once per call so they are reused for every file in file_name_list.
    sent_tokenize, _treebank_word_tokenize = get_tokenizer()
    tagger = get_braupt_tagger()
    lemmatizer, lemmatize = get_lemmatizer("wordnet")
    stopset = get_stopset()
    first_cap_re, all_cap_re = get_camel_case_regexes()
    digits_punctuation_whitespace_re = get_digits_punctuation_whitespace_regex()
    pos_set = get_pos_set()

    # Pair each source pickle with its target JSON path and process the user's lists.
    for source_path, target_path in zip(source_path_list, target_path_list):
        twitter_lists_corpus = load_pickle(source_path)
        if "lists" in twitter_lists_corpus:
            twitter_lists_corpus = twitter_lists_corpus["lists"]
        else:
            continue

        bag_of_lemmas, lemma_to_keywordbag = user_twitter_list_bag_of_words(twitter_lists_corpus,
                                                                            sent_tokenize, _treebank_word_tokenize,
                                                                            tagger, lemmatizer, lemmatize, stopset,
                                                                            first_cap_re, all_cap_re, digits_punctuation_whitespace_re,
                                                                            pos_set)

        user_annotation = dict()
        user_annotation["bag_of_lemmas"] = bag_of_lemmas
        user_annotation["lemma_to_keywordbag"] = lemma_to_keywordbag

        with open(target_path, "w", encoding="utf-8") as fp:
            json.dump(user_annotation, fp)


def worker_function(file_name_list, lemmatizing, source_folder, target_folder):
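    """Variant of the worker that delegates all text preprocessing to user_twitter_list_bag_of_words.

    It shares its name with the definition above and therefore shadows it; this is the version
    invoked by make_annotation, which forwards only the lemmatizing scheme.
    """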
    source_path_list = (source_folder + "/" + file_name for file_name in file_name_list)
    target_path_list = (target_folder + "/" + file_name[:-4] + ".json" for file_name in file_name_list)

    # Pair each source pickle with its target JSON path and process the user's lists.
    for source_path, target_path in zip(source_path_list, target_path_list):
        twitter_lists_corpus = load_pickle(source_path)
        if "lists" in twitter_lists_corpus:
            twitter_lists_corpus = twitter_lists_corpus["lists"]
        else:
            continue

        bag_of_lemmas, lemma_to_keywordbag = user_twitter_list_bag_of_words(twitter_lists_corpus, lemmatizing)

        user_annotation = dict()
        user_annotation["bag_of_lemmas"] = bag_of_lemmas
        user_annotation["lemma_to_keywordbag"] = lemma_to_keywordbag

        with open(target_path, "w", encoding="utf-8") as fp:
            json.dump(user_annotation, fp)


def make_annotation(
    twitter_lists_folder,
    twitter_lists_keywords_folder,
    weakly_connected_graph_folder,
    weakly_connected_label_folder,
    full_graph_folder,
):
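    """Annotate users in the weakly connected component with topic labels derived from their Twitter lists.

    Pending list files are preprocessed in parallel into per-user keyword bags, which are then matched
    against the topic keyword dictionary to build, filter and store the user-label matrices.
    """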
    # TODO: Move keywords from Mongo to the folder.
    # Read set of users.
    weakly_connected_user_id_set = load_pickle(weakly_connected_graph_folder + "/user_id_set" + ".pkl")
    weakly_connected_node_to_id = load_pickle(weakly_connected_graph_folder + "/node_to_id" + ".pkl")
    id_to_name = load_pickle(full_graph_folder + "/id_to_name" + ".pkl")

    # Read the set of available twitter lists (one "<user_id>.pkl" file per user).
    twitter_list_file_list = os.listdir(twitter_lists_folder)
    twitter_list_file_list = [int(file_name[:-4]) for file_name in twitter_list_file_list]

    # Read which users are already annotated (one "<user_id>.json" file per user).
    user_keywords_file_list = os.listdir(twitter_lists_keywords_folder)
    user_keywords_file_list = [int(file_name[:-5]) for file_name in user_keywords_file_list]

    # Find which twitter lists still need to be preprocessed.
    user_twitter_id_list = [
        user_id for user_id in twitter_list_file_list if user_id in weakly_connected_user_id_set
    ]
    user_twitter_id_list = [user_id for user_id in user_twitter_id_list if user_id not in user_keywords_file_list]

    twitter_list_file_list = [str(user_id) + ".pkl" for user_id in user_twitter_id_list]

    pool = Pool(processes=get_threads_number() * 2)
    user_chunks = chunks(twitter_list_file_list, get_threads_number() * 2)
    pool.map(
        partial(
            worker_function,
            lemmatizing="wordnet",
            source_folder=twitter_lists_folder,
            target_folder=twitter_lists_keywords_folder,
        ),
        user_chunks,
    )
    pool.close()
    pool.join()

    # Make user-label matrix.
    user_keywords_file_list = [str(file_name) for file_name in user_keywords_file_list]
    user_twitter_list_keywords_gen = read_local_user_annotations(twitter_lists_keywords_folder, user_keywords_file_list)
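    # Invert the node -> twitter id map so users can be looked up by twitter id.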
    weakly_connected_id_to_node = {twitter_id: node for node, twitter_id in weakly_connected_node_to_id.items()}

    # # twitter_id_to_weakly_connected_node = {int(twitter_id): weakly_connected_id_to_node[int(twitter_id)] for twitter_id in user_keywords_file_list if int(twitter_id) in weakly_connected_id_to_node.keys()}
    # node_twitter_list_keywords_gen = ((weakly_connected_id_to_node[int(user_twitter_id)], twitter_list_keywords) for user_twitter_id, twitter_list_keywords in user_twitter_list_keywords_gen if int(user_twitter_id) in weakly_connected_id_to_node.keys())
    # for node, j in user_twitter_list_keywords_gen:
    #     print(node, j)

    implicated_user_twitter_list_keywords_gen = (
        (int(user_twitter_id), twitter_list_keywords)
        for user_twitter_id, twitter_list_keywords in user_twitter_list_keywords_gen
        if int(user_twitter_id) in weakly_connected_id_to_node.keys()
    )

    ####################################################################################################################
    # Semi-automatic user annotation.
    ####################################################################################################################
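    # reveal_set holds the topics of interest; topic_keyword_dict maps each topic to its keywords.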
    reveal_set = get_reveal_set()
    topic_keyword_dict = get_topic_keyword_dictionary()

    available_topics = set(topic_keyword_dict.keys())

    keyword_list = list()
    for topic in reveal_set:
        if topic in available_topics:
            keyword_list.extend(topic_keyword_dict[topic])

    lemma_set = set()
    for keyword in keyword_list:
        lemma = clean_single_word(keyword, lemmatizing="wordnet")
        lemma_set.add(lemma)

    # Invert the topic -> keywords map so each keyword can be resolved back to its topic.
    keyword_topic_dict = dict()
    for topic, keyword_set in topic_keyword_dict.items():
        for keyword in keyword_set:
            keyword_topic_dict[keyword] = topic

    user_label_matrix, annotated_nodes, label_to_lemma, node_to_lemma_tokeywordbag = form_user_term_matrix(
        implicated_user_twitter_list_keywords_gen,
        weakly_connected_id_to_node,
        lemma_set=lemma_set,
        keyword_to_topic_manual=keyword_topic_dict,
    )

    scipy_sparse_to_csv(
        weakly_connected_label_folder + "/unfiltered_user_label_matrix" + ".tsv", user_label_matrix, "\t", directed=True
    )
    store_pickle(weakly_connected_label_folder + "/unfiltered_annotated_nodes" + ".pkl", annotated_nodes)
    store_pickle(weakly_connected_label_folder + "/unfiltered_label_to_lemma" + ".pkl", label_to_lemma)
    store_pickle(
        weakly_connected_label_folder + "/unfiltered_node_to_lemma_tokeywordbag" + ".pkl", node_to_lemma_tokeywordbag
    )

    user_label_matrix, annotated_user_ids, label_to_lemma = filter_user_term_matrix(
        user_label_matrix, annotated_nodes, label_to_lemma, max_number_of_labels=None
    )

    lemma_to_keyword = form_lemma_tokeyword_map(annotated_nodes, node_to_lemma_tokeywordbag)

    # user_label_matrix, annotated_user_ids, label_to_lemma, lemma_to_keyword = semi_automatic_user_annotation(implicated_user_twitter_list_keywords_gen, weakly_connected_id_to_node)

    # Store user-label binary matrix.
    scipy_sparse_to_csv(
        weakly_connected_label_folder + "/user_label_matrix" + ".tsv", user_label_matrix, "\t", directed=True
    )

    # Store user-label keyword matrix.
    write_screen_name_to_topics(
        weakly_connected_label_folder + "/user_name_to_topics" + ".tsv",
        user_label_matrix,
        weakly_connected_node_to_id,
        id_to_name,
        label_to_lemma,
        lemma_to_keyword,
        separator="\t",
    )
    return twitter_lists_folder


def weakly_connected_graph(full_graph_folder, weakly_connected_graph_folder):
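    """Restrict the mention graph, retweet graph and user-lemma matrix to the weakly connected component.

    The component is computed on the combined mention + retweet graph; the induced submatrices and
    id mappings are stored in weakly_connected_graph_folder as edge lists and pickles.
    """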
    # Read relevant data.
    mention_graph = load_pickle(full_graph_folder + "/mention_graph" + ".pkl")
    mention_graph = spsp.coo_matrix(spsp.csr_matrix(mention_graph))
    retweet_graph = load_pickle(full_graph_folder + "/retweet_graph" + ".pkl")
    retweet_graph = spsp.coo_matrix(spsp.csr_matrix(retweet_graph))
    user_lemma_matrix = load_pickle(full_graph_folder + "/user_lemma_matrix" + ".pkl")
    user_lemma_matrix = spsp.coo_matrix(spsp.csr_matrix(user_lemma_matrix))
    user_id_set = load_pickle(full_graph_folder + "/user_id_set" + ".pkl")
    node_to_id = load_pickle(full_graph_folder + "/node_to_id" + ".pkl")

    # Extract the weakly connected component of the combined mention + retweet graph.
    weakly_connected_men_ret_graph, weakly_connected_node_to_id, old_node_list = extract_connected_components(
        spsp.coo_matrix(spsp.csr_matrix(mention_graph + retweet_graph)), "weak", node_to_id
    )

    # Calculate the user twitter id set for the weakly connected component.
    weakly_connected_user_id_set = set(weakly_connected_node_to_id.values())

    node_array = np.array(old_node_list, dtype=np.int64)

    # Extract corresponding retweet graph and user lemma matrix.
    weakly_connected_mention_graph = submatrix_pull_via_networkx(
        spsp.coo_matrix(mention_graph), node_array, directed=True
    )

    weakly_connected_retweet_graph = submatrix_pull_via_networkx(
        spsp.coo_matrix(retweet_graph), node_array, directed=True
    )

    # CSR format supports efficient row indexing, so slice out the component's user rows.
    user_lemma_matrix = spsp.csr_matrix(user_lemma_matrix)
    weakly_connected_user_lemma_matrix = user_lemma_matrix[node_array, :]

    # Change sparse matrices to coordinate format in order to save as an edge list.
    weakly_connected_mention_graph = spsp.coo_matrix(weakly_connected_mention_graph)
    weakly_connected_retweet_graph = spsp.coo_matrix(weakly_connected_retweet_graph)
    weakly_connected_user_lemma_matrix = spsp.coo_matrix(weakly_connected_user_lemma_matrix)

    # Store weakly connected data.
    scipy_sparse_to_csv(
        weakly_connected_graph_folder + "/mention_graph.tsv",
        weakly_connected_mention_graph,
        separator="\t",
        directed=True,
    )

    scipy_sparse_to_csv(
        weakly_connected_graph_folder + "/retweet_graph.tsv",
        weakly_connected_retweet_graph,
        separator="\t",
        directed=True,
    )

    scipy_sparse_to_csv(
        weakly_connected_graph_folder + "/user_lemma_matrix.tsv",
        weakly_connected_user_lemma_matrix,
        separator="\t",
        directed=True,
    )

    store_pickle(weakly_connected_graph_folder + "/user_id_set" + ".pkl", weakly_connected_user_id_set)
    store_pickle(weakly_connected_graph_folder + "/node_to_id" + ".pkl", weakly_connected_node_to_id)