def weakly_connected_graph(full_graph_folder, weakly_connected_graph_folder):
    """Restrict the full graphs to their weakly connected component.

    Loads the mention graph, retweet graph, user-lemma matrix and the
    node/id mappings (pickles) from ``full_graph_folder``, extracts the
    weakly connected component of the combined mention + retweet graph,
    pulls the matching sub-matrices, and stores the results under
    ``weakly_connected_graph_folder`` as TSV edge lists plus pickles.
    """
    # Load the full-graph artifacts; the CSR round-trip canonicalizes each
    # matrix (merges duplicate entries) before converting back to COO.
    mention_graph = spsp.coo_matrix(
        spsp.csr_matrix(load_pickle(full_graph_folder + "/mention_graph.pkl"))
    )
    retweet_graph = spsp.coo_matrix(
        spsp.csr_matrix(load_pickle(full_graph_folder + "/retweet_graph.pkl"))
    )
    user_lemma_matrix = spsp.coo_matrix(
        spsp.csr_matrix(load_pickle(full_graph_folder + "/user_lemma_matrix.pkl"))
    )
    user_id_set = load_pickle(full_graph_folder + "/user_id_set.pkl")
    node_to_id = load_pickle(full_graph_folder + "/node_to_id.pkl")

    # Weakly connected component of the combined mention + retweet graph.
    combined_graph = spsp.coo_matrix(spsp.csr_matrix(mention_graph + retweet_graph))
    _component_graph, wc_node_to_id, old_node_list = extract_connected_components(
        combined_graph, "weak", node_to_id
    )

    # Twitter ids of the users that survive in the component.
    wc_user_id_set = set(wc_node_to_id.values())

    kept_nodes = np.array(old_node_list, dtype=np.int64)

    # Pull the mention/retweet sub-graphs and the matching user-lemma rows.
    wc_mention_graph = submatrix_pull_via_networkx(
        spsp.coo_matrix(mention_graph), kept_nodes, directed=True
    )
    wc_retweet_graph = submatrix_pull_via_networkx(
        spsp.coo_matrix(retweet_graph), kept_nodes, directed=True
    )
    wc_user_lemma_matrix = spsp.csr_matrix(user_lemma_matrix)[kept_nodes, :]

    # COO format so the matrices can be written out as edge lists.
    wc_mention_graph = spsp.coo_matrix(wc_mention_graph)
    wc_retweet_graph = spsp.coo_matrix(wc_retweet_graph)
    wc_user_lemma_matrix = spsp.coo_matrix(wc_user_lemma_matrix)

    # Persist the component's graphs and mappings.
    scipy_sparse_to_csv(
        weakly_connected_graph_folder + "/mention_graph.tsv",
        wc_mention_graph,
        separator="\t",
        directed=True,
    )
    scipy_sparse_to_csv(
        weakly_connected_graph_folder + "/retweet_graph.tsv",
        wc_retweet_graph,
        separator="\t",
        directed=True,
    )
    scipy_sparse_to_csv(
        weakly_connected_graph_folder + "/user_lemma_matrix.tsv",
        wc_user_lemma_matrix,
        separator="\t",
        directed=True,
    )
    store_pickle(weakly_connected_graph_folder + "/user_id_set.pkl", wc_user_id_set)
    store_pickle(weakly_connected_graph_folder + "/node_to_id.pkl", wc_node_to_id)
def make_annotation(
    twitter_lists_folder,
    twitter_lists_keywords_folder,
    weakly_connected_graph_folder,
    weakly_connected_label_folder,
    full_graph_folder,
):
    """Annotate the users of the weakly connected component with topic labels.

    Pipeline:
      1. Preprocess in parallel the Twitter lists of component users that do
         not yet have extracted keywords.
      2. Build a user-label matrix from the extracted keywords, restricted to
         a curated topic/keyword vocabulary (semi-automatic annotation).
      3. Filter the matrix and store both the binary user-label matrix and a
         human-readable screen-name-to-topics report.

    Returns ``twitter_lists_folder`` (kept for backward compatibility with
    existing callers).
    """
    # TODO: Move keywords from Mongo to the folder.
    # Read set of users.
    weakly_connected_user_id_set = load_pickle(weakly_connected_graph_folder + "/user_id_set.pkl")
    weakly_connected_node_to_id = load_pickle(weakly_connected_graph_folder + "/node_to_id.pkl")
    id_to_name = load_pickle(full_graph_folder + "/id_to_name.pkl")

    # Twitter ids for which raw lists exist (file names are "<id>" + a
    # 4-character extension, presumably ".pkl").
    twitter_list_file_list = [int(file_name[:-4]) for file_name in os.listdir(twitter_lists_folder)]

    # Twitter ids that already have extracted keywords (file names carry a
    # 5-character extension — presumably ".json"; TODO confirm).
    user_keywords_file_list = [
        int(file_name[:-5]) for file_name in os.listdir(twitter_lists_keywords_folder)
    ]
    # Set for O(1) membership tests (the original scanned a list per user).
    already_annotated = set(user_keywords_file_list)

    # Lists that still need preprocessing: component users not yet annotated.
    user_twitter_id_list = [
        twitter_id
        for twitter_id in twitter_list_file_list
        if twitter_id in weakly_connected_user_id_set and twitter_id not in already_annotated
    ]
    twitter_list_file_list = [str(twitter_id) + ".pkl" for twitter_id in user_twitter_id_list]

    # Preprocess the pending lists in parallel. The context manager ensures
    # the worker pool is terminated (the original created it and leaked it).
    process_count = get_threads_number() * 2
    user_chunks = chunks(twitter_list_file_list, process_count)
    with Pool(processes=process_count) as pool:
        pool.map(
            partial(
                worker_function,
                lemmatizing="wordnet",
                source_folder=twitter_lists_folder,
                target_folder=twitter_lists_keywords_folder,
            ),
            user_chunks,
        )

    # Make user-label matrix from the extracted keywords.
    user_keywords_file_list = [str(twitter_id) for twitter_id in user_keywords_file_list]
    user_twitter_list_keywords_gen = read_local_user_annotations(
        twitter_lists_keywords_folder, user_keywords_file_list
    )

    weakly_connected_id_to_node = dict(
        zip(weakly_connected_node_to_id.values(), weakly_connected_node_to_id.keys())
    )

    # Keep only annotated users that belong to the component.
    implicated_user_twitter_list_keywords_gen = (
        (int(user_twitter_id), twitter_list_keywords)
        for user_twitter_id, twitter_list_keywords in user_twitter_list_keywords_gen
        if int(user_twitter_id) in weakly_connected_id_to_node
    )

    ####################################################################################################################
    # Semi-automatic user annotation.
    ####################################################################################################################
    reveal_set = get_reveal_set()
    topic_keyword_dict = get_topic_keyword_dictionary()
    available_topics = set(topic_keyword_dict.keys())

    # Collect the keywords of every requested topic that is available.
    keyword_list = []
    for topic in reveal_set:
        if topic in available_topics:
            keyword_list.extend(topic_keyword_dict[topic])

    # Lemmatize the keywords; duplicates collapse in the set.
    lemma_set = {clean_single_word(keyword, lemmatizing="wordnet") for keyword in keyword_list}

    # Reverse map: keyword -> topic (later topics overwrite earlier ones,
    # matching the original loop's behavior).
    keyword_topic_dict = {
        keyword: topic
        for topic, keyword_set in topic_keyword_dict.items()
        for keyword in keyword_set
    }

    user_label_matrix, annotated_nodes, label_to_lemma, node_to_lemma_tokeywordbag = form_user_term_matrix(
        implicated_user_twitter_list_keywords_gen,
        weakly_connected_id_to_node,
        lemma_set=lemma_set,
        keyword_to_topic_manual=keyword_topic_dict,
    )

    # Store the unfiltered annotation artifacts.
    scipy_sparse_to_csv(
        weakly_connected_label_folder + "/unfiltered_user_label_matrix.tsv",
        user_label_matrix,
        "\t",
        directed=True,
    )
    store_pickle(weakly_connected_label_folder + "/unfiltered_annotated_nodes.pkl", annotated_nodes)
    store_pickle(weakly_connected_label_folder + "/unfiltered_label_to_lemma.pkl", label_to_lemma)
    store_pickle(
        weakly_connected_label_folder + "/unfiltered_node_to_lemma_tokeywordbag.pkl",
        node_to_lemma_tokeywordbag,
    )

    user_label_matrix, annotated_user_ids, label_to_lemma = filter_user_term_matrix(
        user_label_matrix, annotated_nodes, label_to_lemma, max_number_of_labels=None
    )

    lemma_to_keyword = form_lemma_tokeyword_map(annotated_nodes, node_to_lemma_tokeywordbag)

    # Store user-label binary matrix.
    scipy_sparse_to_csv(
        weakly_connected_label_folder + "/user_label_matrix.tsv", user_label_matrix, "\t", directed=True
    )

    # Store user-label keyword matrix (human-readable report).
    write_screen_name_to_topics(
        weakly_connected_label_folder + "/user_name_to_topics.tsv",
        user_label_matrix,
        weakly_connected_node_to_id,
        id_to_name,
        label_to_lemma,
        lemma_to_keyword,
        separator="\t",
    )

    return twitter_lists_folder
def process_tweet_collection(tweet_generator, full_graph_folder):
    """Build the full interaction graphs/matrices from a tweet stream and
    persist every artifact under ``full_graph_folder``.

    Sparse matrices are stored twice (pickle + TSV edge list); the id sets
    and id/name mappings are stored as pickles only.
    """
    (
        mention_graph,
        retweet_graph,
        user_lemma_matrix,
        tweet_id_set,
        user_id_set,
        node_to_id,
        lemma_to_attribute,
        id_to_name,
    ) = extract_graphs_and_lemmas_from_tweets(tweet_generator)

    # Store full graph data in the corresponding folder. Each matrix gets
    # both a pickle and a TSV edge-list representation, in the same order
    # as before (pkl then tsv, per matrix).
    for artifact_name, matrix in (
        ("mention_graph", mention_graph),
        ("retweet_graph", retweet_graph),
        ("user_lemma_matrix", user_lemma_matrix),
    ):
        store_pickle(full_graph_folder + "/" + artifact_name + ".pkl", matrix)
        scipy_sparse_to_csv(full_graph_folder + "/" + artifact_name + ".tsv", matrix, "\t", directed=True)

    # Plain containers and mappings are pickled only.
    for artifact_name, payload in (
        ("tweet_id_set", tweet_id_set),
        ("user_id_set", user_id_set),
        ("node_to_id", node_to_id),
        ("lemma_to_attribute", lemma_to_attribute),
        ("id_to_name", id_to_name),
    ):
        store_pickle(full_graph_folder + "/" + artifact_name + ".pkl", payload)