import argparse
import os
from collections import defaultdict
from functools import partial
from multiprocessing import Pool
def extract_bag_of_words_from_corpus_parallel(corpus, lemmatizing="wordnet"):
    """
    Extracts a single bag-of-words from a list of strings. The documents are mapped to parallel processes.

    Inputs:  - corpus: A list of strings.
             - lemmatizing: A string containing one of the following: "porter", "snowball" or "wordnet".

    Outputs: - bag_of_words: A bag-of-words in python dictionary format.
             - lemma_to_keywordbag_total: Aggregated python dictionary that maps stems/lemmas to original topic keywords.
    """
    ####################################################################################################################
    # Map and reduce document cleaning.
    ####################################################################################################################
    # Build a pool of processes.
    pool = Pool(processes=get_threads_number()*2)

    # Partition the tweets into chunks. Use integer division and guard against a zero
    # chunk size for corpora smaller than the number of threads.
    partitioned_corpus = chunks(corpus, max(1, len(corpus) // get_threads_number()))

    # Map the cleaning of the tweet corpus to a pool of processes. pool.map returns a
    # list of (bag_of_words, lemma_to_keywordbag) tuples; zip(*...) transposes it into
    # the two sequences needed below.
    list_of_bags_of_words,\
    list_of_lemma_to_keywordset_maps = zip(*pool.map(partial(clean_corpus_serial, lemmatizing=lemmatizing),
                                                     partitioned_corpus))

    # Reduce the bags-of-words to a single dictionary serially.
    bag_of_words = reduce_list_of_bags_of_words(list_of_bags_of_words)

    # Reduce the lemma-to-keyword maps to a single dictionary.
    lemma_to_keywordbag_total = defaultdict(lambda: defaultdict(int))
    for lemma_to_keywordbag in list_of_lemma_to_keywordset_maps:
        for lemma, keywordbag in lemma_to_keywordbag.items():
            for keyword, multiplicity in keywordbag.items():
                lemma_to_keywordbag_total[lemma][keyword] += multiplicity

    return bag_of_words, lemma_to_keywordbag_total
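# The chunks() and reduce_list_of_bags_of_words() helpers are called throughout this
# module but defined elsewhere in the project. The sketch below is a minimal,
# hypothetical reconstruction consistent with how they are called here, assuming the
# second argument of chunks() is a chunk size and that bags-of-words are plain
# {word: count} dictionaries; the project's own implementations may differ.
from collections import Counter


def chunks(item_list, chunk_size):
    # Yield successive chunk_size-sized slices of item_list.
    for i in range(0, len(item_list), chunk_size):
        yield item_list[i:i + chunk_size]


def reduce_list_of_bags_of_words(list_of_bags_of_words):
    # Merge per-chunk bags-of-words by summing the word counts.
    total_bag_of_words = Counter()
    for bag_of_words in list_of_bags_of_words:
        total_bag_of_words.update(bag_of_words)
    return dict(total_bag_of_words)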
def main():
    # Parse arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--source", dest="source_folder",
                        help="This is the folder with the pickled Twitter lists.",
                        type=str, required=True)
    parser.add_argument("-t", "--target", dest="target_folder",
                        help="This is the folder where the extracted keyword JSON files will be stored.",
                        type=str, required=True)

    args = parser.parse_args()

    source_folder = args.source_folder
    target_folder = args.target_folder

    # Get the names of the files where the Twitter lists for certain users are stored.
    file_name_list = os.listdir(source_folder)

    # Build a pool of processes.
    pool = Pool(processes=get_threads_number()*2)

    # Partition the dataset into chunks.
    user_chunks = chunks(file_name_list, get_threads_number()*2)

    # Extract bags-of-words in parallel, then serialize and store them in JSON format.
    pool.map(partial(worker_function,
                     lemmatizing="wordnet",
                     source_folder=source_folder,
                     target_folder=target_folder),
             user_chunks)
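# worker_function() is the unit of work mapped over the pool above but is not defined
# in this section. The following is a hypothetical sketch of it, assuming each pickled
# file holds one user's Twitter lists, that each list carries "name" and "description"
# text fields, and that the output JSON layout below matches what the rest of the
# pipeline reads; all of these are assumptions, not the project's confirmed format.
import json
import pickle


def worker_function(file_name_chunk, lemmatizing, source_folder, target_folder):
    for file_name in file_name_chunk:
        # Load the pickled Twitter lists for one user.
        with open(os.path.join(source_folder, file_name), "rb") as fp:
            twitter_lists = pickle.load(fp)

        # Assumed corpus: the name and description of each Twitter list.
        corpus = [twitter_list["name"] + " " + twitter_list["description"]
                  for twitter_list in twitter_lists]

        # Clean serially; the surrounding pool already provides the parallelism.
        bag_of_words, lemma_to_keywordbag = clean_corpus_serial(corpus, lemmatizing=lemmatizing)

        # Store the extracted keywords as JSON, named after the user's Twitter id.
        user_twitter_id = file_name[:-4]  # Strip the ".pkl" extension.
        with open(os.path.join(target_folder, user_twitter_id + ".json"), "w") as fp:
            json.dump({"bag_of_words": bag_of_words,
                       "lemma_to_keywordbag": lemma_to_keywordbag}, fp)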
def make_annotation(twitter_lists_folder,
                    twitter_lists_keywords_folder,
                    weakly_connected_graph_folder,
                    weakly_connected_label_folder,
                    full_graph_folder):
    # TODO: Move keywords from Mongo to the folder.

    # Read the set of users.
    weakly_connected_user_id_set = load_pickle(weakly_connected_graph_folder + "/user_id_set" + ".pkl")
    weakly_connected_node_to_id = load_pickle(weakly_connected_graph_folder + "/node_to_id" + ".pkl")
    id_to_name = load_pickle(full_graph_folder + "/id_to_name" + ".pkl")

    # Read the set of Twitter lists.
    twitter_list_file_list = os.listdir(twitter_lists_folder)
    twitter_list_file_list = [int(file_name[:-4]) for file_name in twitter_list_file_list]  # Strip ".pkl".

    # Read which users are already annotated.
    user_keywords_file_list = os.listdir(twitter_lists_keywords_folder)
    user_keywords_file_list = [int(file_name[:-5]) for file_name in user_keywords_file_list]  # Strip ".json".

    # Find which Twitter lists need to be preprocessed.
    user_twitter_id_list = [user_twitter_id for user_twitter_id in twitter_list_file_list
                            if user_twitter_id in weakly_connected_user_id_set]
    user_twitter_id_list = [user_twitter_id for user_twitter_id in user_twitter_id_list
                            if user_twitter_id not in user_keywords_file_list]
    twitter_list_file_list = [str(user_twitter_id) + ".pkl" for user_twitter_id in user_twitter_id_list]

    pool = Pool(processes=get_threads_number()*2)
    user_chunks = chunks(twitter_list_file_list, get_threads_number()*2)
    pool.map(partial(worker_function,
                     lemmatizing="wordnet",
                     source_folder=twitter_lists_folder,
                     target_folder=twitter_lists_keywords_folder),
             user_chunks)

    # Make the user-label matrix.
    user_keywords_file_list = [str(user_twitter_id) for user_twitter_id in user_keywords_file_list]
    user_twitter_list_keywords_gen = read_local_user_annotations(twitter_lists_keywords_folder,
                                                                 user_keywords_file_list)

    # Invert the node-to-Twitter-id map.
    weakly_connected_id_to_node = dict(zip(weakly_connected_node_to_id.values(),
                                           weakly_connected_node_to_id.keys()))

    # Keep only the annotated users that appear in the weakly connected graph.
    implicated_user_twitter_list_keywords_gen = ((int(user_twitter_id), twitter_list_keywords)
                                                 for user_twitter_id, twitter_list_keywords in user_twitter_list_keywords_gen
                                                 if int(user_twitter_id) in weakly_connected_id_to_node.keys())

    ####################################################################################################################
    # Semi-automatic user annotation.
    ####################################################################################################################
    reveal_set = get_reveal_set()
    topic_keyword_dict = get_topic_keyword_dictionary()
    available_topics = set(topic_keyword_dict.keys())

    # Gather the keywords of the available topics of interest.
    keyword_list = list()
    for topic in reveal_set:
        if topic in available_topics:
            keyword_list.extend(topic_keyword_dict[topic])

    # Lemmatize the keywords.
    lemma_set = set(clean_single_word(keyword, lemmatizing="wordnet") for keyword in keyword_list)

    # Invert the topic-to-keywords map.
    keyword_topic_dict = dict()
    for topic, keyword_set in topic_keyword_dict.items():
        for keyword in keyword_set:
            keyword_topic_dict[keyword] = topic

    user_label_matrix,\
    annotated_nodes,\
    label_to_lemma,\
    node_to_lemma_tokeywordbag = form_user_term_matrix(implicated_user_twitter_list_keywords_gen,
                                                       weakly_connected_id_to_node,
                                                       lemma_set=lemma_set,
                                                       keyword_to_topic_manual=keyword_topic_dict)

    scipy_sparse_to_csv(weakly_connected_label_folder + "/unfiltered_user_label_matrix" + ".tsv",
                        user_label_matrix,
                        "\t",
                        directed=True)
    store_pickle(weakly_connected_label_folder + "/unfiltered_annotated_nodes" + ".pkl", annotated_nodes)
    store_pickle(weakly_connected_label_folder + "/unfiltered_label_to_lemma" + ".pkl", label_to_lemma)
    store_pickle(weakly_connected_label_folder + "/unfiltered_node_to_lemma_tokeywordbag" + ".pkl",
                 node_to_lemma_tokeywordbag)

    user_label_matrix,\
    annotated_user_ids,\
    label_to_lemma = filter_user_term_matrix(user_label_matrix,
                                             annotated_nodes,
                                             label_to_lemma,
                                             max_number_of_labels=None)

    lemma_to_keyword = form_lemma_tokeyword_map(annotated_nodes, node_to_lemma_tokeywordbag)

    # Store the user-label binary matrix.
    scipy_sparse_to_csv(weakly_connected_label_folder + "/user_label_matrix" + ".tsv",
                        user_label_matrix,
                        "\t",
                        directed=True)

    # Store the user-label keyword matrix.
    write_screen_name_to_topics(weakly_connected_label_folder + "/user_name_to_topics" + ".tsv",
                                user_label_matrix,
                                weakly_connected_node_to_id,
                                id_to_name,
                                label_to_lemma,
                                lemma_to_keyword,
                                separator="\t")

    return twitter_lists_folder
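# read_local_user_annotations() is used above to stream per-user keyword annotations
# back from disk, but its definition is not part of this section. Below is a minimal,
# hypothetical sketch, assuming one JSON file per user Twitter id as written by the
# worker_function() sketch earlier; the project's own reader may differ.
import json


def read_local_user_annotations(keywords_folder, user_twitter_id_list):
    # Yield (user_twitter_id, keywords) pairs, one per locally stored JSON file.
    for user_twitter_id in user_twitter_id_list:
        file_path = os.path.join(keywords_folder, user_twitter_id + ".json")
        with open(file_path, "r") as fp:
            yield user_twitter_id, json.load(fp)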
def user_network_profile_classifier(mongo_uri,
                                    assessment_id,
                                    twitter_app_key,
                                    twitter_app_secret,
                                    rabbitmq_uri,
                                    rabbitmq_queue,
                                    rabbitmq_exchange,
                                    rabbitmq_routing_key,
                                    pserver_host_name,
                                    pserver_client_name,
                                    pserver_client_pass,
                                    latest_n,
                                    lower_timestamp,
                                    upper_timestamp,
                                    restart_probability,
                                    number_of_threads,
                                    number_of_users_to_annotate,
                                    max_number_of_labels,
                                    user_network_profile_classifier_db,
                                    local_resources_folder):
    """
    Performs Online Social Network user classification. Specifically:
           - Establishes a connection with a Mongo database and gets the newest tweets.
           - Forms graphs and a text-based vector representation for the users involved.
           - Fetches Twitter lists for influential users.
           - Extracts keywords from the Twitter lists and thus annotates these users as experts in the corresponding topics.
           - Extracts graph-based features using the ARCTE algorithm.
           - Performs user classification for the rest of the users.
           - Stores the results in PServer.

    Inputs: - mongo_uri: A Mongo client URI.
            - assessment_id:
            - twitter_app_key:
            - twitter_app_secret:
            - rabbitmq_uri:
            - rabbitmq_queue:
            - rabbitmq_exchange:
            - rabbitmq_routing_key:
            - pserver_host_name:
            - pserver_client_name:
            - pserver_client_pass:
            - latest_n: Get only the N most recent documents.
            - lower_timestamp: Get only documents created after this UNIX timestamp.
            - upper_timestamp: Get only documents created before this UNIX timestamp.
            - restart_probability:
            - number_of_threads:
            - number_of_users_to_annotate:
            - max_number_of_labels:
            - user_network_profile_classifier_db:
            - local_resources_folder: The preprocessed Twitter lists for a number of users are stored here.
    """
    ####################################################################################################################
    # Manage argument input.
    ####################################################################################################################
    spec = make_time_window_filter(lower_timestamp, upper_timestamp)

    if number_of_threads is None:
        number_of_threads = get_threads_number()

    ####################################################################################################################
    # Establish MongoDB connection.
    ####################################################################################################################
    client,\
    tweet_input_database_name,\
    tweet_input_collection_name = safe_establish_mongo_connection(mongo_uri, assessment_id)

    ####################################################################################################################
    # Preprocess tweets.
    ####################################################################################################################
    mention_graph,\
    retweet_graph,\
    user_id_set,\
    node_to_id = get_graphs_and_lemma_matrix(client,
                                             tweet_input_database_name,
                                             tweet_input_collection_name,
                                             spec,
                                             latest_n)

    adjacency_matrix,\
    node_to_id,\
    features,\
    node_importances = integrate_graphs(mention_graph,
                                        retweet_graph,
                                        node_to_id,
                                        restart_probability,
                                        number_of_threads)

    ####################################################################################################################
    # Annotate users.
    ####################################################################################################################
    twitter_lists_gen,\
    user_ids_to_annotate,\
    user_twitter_ids_mongo,\
    user_twitter_ids_local = fetch_twitter_lists(client,
                                                 twitter_app_key,
                                                 twitter_app_secret,
                                                 user_network_profile_classifier_db,
                                                 local_resources_folder,
                                                 node_importances,
                                                 number_of_users_to_annotate,
                                                 node_to_id)

    user_label_matrix,\
    annotated_user_ids,\
    label_to_lemma,\
    lemma_to_keyword = annotate_users(client,
                                      twitter_lists_gen,
                                      user_ids_to_annotate,
                                      user_twitter_ids_mongo,
                                      user_twitter_ids_local,
                                      local_resources_folder,
                                      user_network_profile_classifier_db,
                                      node_to_id,
                                      max_number_of_labels)

    ####################################################################################################################
    # Perform user classification.
    ####################################################################################################################
    prediction = user_classification(features, user_label_matrix, annotated_user_ids, node_to_id, number_of_threads)

    ####################################################################################################################
    # Write to Mongo, PServer and/or RabbitMQ.
    ####################################################################################################################
    # Write data to Mongo.
    write_results_to_mongo(client,
                           user_network_profile_classifier_db,
                           get_user_topic_generator(prediction, node_to_id, label_to_lemma, lemma_to_keyword))

    # Write data to PServer.
    if pserver_host_name is not None:
        topic_list = list(lemma_to_keyword.values())
        try:
            write_topics_to_pserver(pserver_host_name,
                                    pserver_client_name,
                                    pserver_client_pass,
                                    get_user_topic_generator(prediction, node_to_id, label_to_lemma, lemma_to_keyword),
                                    topic_list)
        except Exception:
            print("Unable to write results to PServer.")

    # Publish results and a success message on RabbitMQ.
    rabbitmq_server_service("restart")
    rabbitmq_connection = establish_rabbitmq_connection(rabbitmq_uri)
    publish_results_via_rabbitmq(rabbitmq_connection,
                                 rabbitmq_queue,
                                 rabbitmq_exchange,
                                 rabbitmq_routing_key,
                                 get_user_topic_generator(prediction, node_to_id, label_to_lemma, lemma_to_keyword))
    simple_notification(rabbitmq_connection, rabbitmq_queue, rabbitmq_exchange, rabbitmq_routing_key, "SUCCESS")
    rabbitmq_connection.close()
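# get_user_topic_generator() is called three times above but defined elsewhere in the
# project. The sketch below is a hypothetical reconstruction, assuming prediction is a
# sparse node-by-label indicator matrix and that lemma_to_keyword maps each lemma to a
# representative topic keyword; the actual implementation may differ.
import scipy.sparse as spsp


def get_user_topic_generator(prediction, node_to_id, label_to_lemma, lemma_to_keyword):
    # Yield (user_twitter_id, topic_keywords) pairs for users with predicted labels.
    prediction = spsp.csr_matrix(prediction)
    for node in range(prediction.shape[0]):
        label_indices = prediction.getrow(node).indices
        topic_keywords = [lemma_to_keyword[label_to_lemma[label]] for label in label_indices]
        if topic_keywords:
            yield node_to_id[node], topic_keywords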
def user_network_profile_classifier(mongo_uri,
                                    assessment_id,
                                    twitter_app_key,
                                    twitter_app_secret,
                                    rabbitmq_uri,
                                    rabbitmq_queue,
                                    rabbitmq_exchange,
                                    rabbitmq_routing_key,
                                    wp5_rabbitmq_uri,
                                    wp5_rabbitmq_queue,
                                    wp5_rabbitmq_exchange,
                                    wp5_rabbitmq_routing_key,
                                    pserver_host_name,
                                    pserver_client_name,
                                    pserver_client_pass,
                                    latest_n,
                                    lower_timestamp,
                                    upper_timestamp,
                                    timestamp_sort_key,
                                    restart_probability,
                                    number_of_threads,
                                    number_of_users_to_annotate,
                                    max_number_of_labels,
                                    user_network_profile_classifier_db,
                                    local_resources_folder,
                                    twitter_credentials):
    """
    Performs Online Social Network user classification. Specifically:
           - Establishes a connection with a Mongo database and gets the newest tweets.
           - Forms graphs and a text-based vector representation for the users involved.
           - Fetches Twitter lists for influential users.
           - Extracts keywords from the Twitter lists and thus annotates these users as experts in the corresponding topics.
           - Extracts graph-based features using the ARCTE algorithm.
           - Performs user classification for the rest of the users.
           - Stores the results in PServer.

    Inputs: - mongo_uri: A Mongo client URI.
            - assessment_id:
            - twitter_app_key:
            - twitter_app_secret:
            - rabbitmq_uri:
            - rabbitmq_queue:
            - rabbitmq_exchange:
            - rabbitmq_routing_key:
            - wp5_rabbitmq_uri:
            - wp5_rabbitmq_queue:
            - wp5_rabbitmq_exchange:
            - wp5_rabbitmq_routing_key:
            - pserver_host_name:
            - pserver_client_name:
            - pserver_client_pass:
            - latest_n: Get only the N most recent documents.
            - lower_timestamp: Get only documents created after this UNIX timestamp.
            - upper_timestamp: Get only documents created before this UNIX timestamp.
            - timestamp_sort_key:
            - restart_probability:
            - number_of_threads:
            - number_of_users_to_annotate:
            - max_number_of_labels:
            - user_network_profile_classifier_db:
            - local_resources_folder: The preprocessed Twitter lists for a number of users are stored here.
            - twitter_credentials:
    """
    good_labels_path = local_resources_folder + "good_labels.txt"
    bad_labels_path = local_resources_folder + "bad_labels.txt"

    ####################################################################################################################
    # Manage argument input.
    ####################################################################################################################
    spec = make_time_window_filter(lower_timestamp, upper_timestamp)

    if number_of_threads is None:
        number_of_threads = get_threads_number()

    ####################################################################################################################
    # Establish MongoDB connection.
    ####################################################################################################################
    client,\
    tweet_input_database_name,\
    tweet_input_collection_name = safe_establish_mongo_connection(mongo_uri, assessment_id)
    print("MongoDB connection established.")

    ####################################################################################################################
    # Preprocess tweets.
    ####################################################################################################################
    mention_graph,\
    retweet_graph,\
    user_lemma_matrix,\
    user_id_set,\
    node_to_id,\
    lemma_to_attribute,\
    id_to_name,\
    id_to_username,\
    id_to_listedcount = get_graphs_and_lemma_matrix(client,
                                                    tweet_input_database_name,
                                                    tweet_input_collection_name,
                                                    spec,
                                                    latest_n,
                                                    timestamp_sort_key)
    print("Users and user interactions extracted.")

    adjacency_matrix,\
    node_to_id,\
    features,\
    node_importances,\
    old_node_list = integrate_graphs(mention_graph,
                                     retweet_graph,
                                     user_lemma_matrix,
                                     node_to_id,
                                     lemma_to_attribute,
                                     restart_probability,
                                     number_of_threads)
    number_of_users = adjacency_matrix.shape[0]
    print("Number of users in fused graph: ", number_of_users)

    # Abort early if the fused graph is too small for classification.
    if number_of_users < 2:
        rabbitmq_server_service("restart")
        rabbitmq_connection = establish_rabbitmq_connection(rabbitmq_uri)
        simple_notification(rabbitmq_connection, rabbitmq_queue, rabbitmq_exchange, rabbitmq_routing_key, "NOT_ENOUGH_CONNECTIONS")
        rabbitmq_connection.close()
        print("Failure message published via RabbitMQ.")
        return

    ####################################################################################################################
    # Annotate users.
    ####################################################################################################################
    twitter_lists_gen,\
    user_ids_to_annotate,\
    user_twitter_ids_mongo,\
    user_twitter_ids_local = fetch_twitter_lists(client,
                                                 twitter_app_key,
                                                 twitter_app_secret,
                                                 tweet_input_database_name,
                                                 local_resources_folder,
                                                 id_to_listedcount,
                                                 node_importances,
                                                 number_of_users_to_annotate,
                                                 node_to_id,
                                                 twitter_credentials)
    print("Annotating users with Twitter ids: ", user_ids_to_annotate)

    user_label_matrix,\
    annotated_user_ids,\
    label_to_lemma,\
    lemma_to_keyword = annotate_users(client,
                                      twitter_lists_gen,
                                      user_ids_to_annotate,
                                      user_twitter_ids_mongo,
                                      user_twitter_ids_local,
                                      local_resources_folder,
                                      tweet_input_database_name,
                                      node_to_id,
                                      max_number_of_labels,
                                      good_labels_path,
                                      bad_labels_path,
                                      user_lemma_matrix,
                                      old_node_list,
                                      lemma_to_attribute)
    number_of_labels = user_label_matrix.shape[1]
    print("Number of labels for classification: ", number_of_labels)

    # Abort early if too few labels were extracted for classification.
    if number_of_labels < 2:
        rabbitmq_server_service("restart")
        rabbitmq_connection = establish_rabbitmq_connection(rabbitmq_uri)
        simple_notification(rabbitmq_connection, rabbitmq_queue, rabbitmq_exchange, rabbitmq_routing_key, "NOT_ENOUGH_KEYWORDS")
        rabbitmq_connection.close()
        print("Failure message published via RabbitMQ.")
        return

    ####################################################################################################################
    # Perform user classification.
    ####################################################################################################################
    prediction = user_classification(features, user_label_matrix, annotated_user_ids, node_to_id, number_of_threads)
    print("User classification complete.")

    ####################################################################################################################
    # Write to Mongo, PServer and/or RabbitMQ.
    ####################################################################################################################
    # Write data to Mongo.
    write_results_to_mongo(client,
                           tweet_input_database_name,
                           get_user_topic_generator(prediction, node_to_id, label_to_lemma, lemma_to_keyword),
                           id_to_name,
                           id_to_username)
    print("Results written in MongoDB.")

    # Write data to PServer.
    if pserver_host_name is not None:
        topic_list = list(lemma_to_keyword.values())
        try:
            write_topics_to_pserver(pserver_host_name,
                                    pserver_client_name,
                                    pserver_client_pass,
                                    get_user_topic_generator(prediction, node_to_id, label_to_lemma, lemma_to_keyword),
                                    topic_list)
            print("Results written in PServer.")
        except Exception:
            print("Unable to write results to PServer.")

    # Publish the results and a success message via RabbitMQ.
    wp5_rabbitmq_uri,\
    wp5_rabbitmq_queue,\
    wp5_rabbitmq_exchange,\
    wp5_rabbitmq_routing_key = check_wp5_rabbitmq_connection(wp5_rabbitmq_uri,
                                                             wp5_rabbitmq_queue,
                                                             wp5_rabbitmq_exchange,
                                                             wp5_rabbitmq_routing_key,
                                                             rabbitmq_uri,
                                                             rabbitmq_queue,
                                                             rabbitmq_exchange,
                                                             rabbitmq_routing_key,
                                                             tweet_input_database_name)

    rabbitmq_server_service("restart")
    wp5_rabbitmq_connection = establish_rabbitmq_connection(wp5_rabbitmq_uri)
    publish_results_via_rabbitmq(rabbitmq_connection=wp5_rabbitmq_connection,
                                 rabbitmq_queue=wp5_rabbitmq_queue,
                                 rabbitmq_exchange=wp5_rabbitmq_exchange,
                                 rabbitmq_routing_key=wp5_rabbitmq_routing_key,
                                 user_topic_gen=get_user_topic_generator(prediction, node_to_id, label_to_lemma, lemma_to_keyword),
                                 id_to_name=id_to_name)
    print("Results published via RabbitMQ.")
    wp5_rabbitmq_connection.close()

    rabbitmq_server_service("restart")
    rabbitmq_connection = establish_rabbitmq_connection(rabbitmq_uri)
    simple_notification(rabbitmq_connection, rabbitmq_queue, rabbitmq_exchange, rabbitmq_routing_key, "SUCCESS")
    print("Success message published via RabbitMQ.")
    rabbitmq_connection.close()
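# establish_rabbitmq_connection() and simple_notification() handle all of the status
# messaging above. The sketch below is a minimal, hypothetical version assuming the
# pika AMQP client and a direct exchange; the project's own implementations and
# message conventions may differ.
import pika


def establish_rabbitmq_connection(rabbitmq_uri):
    # Open a blocking connection from an AMQP URI.
    return pika.BlockingConnection(pika.URLParameters(rabbitmq_uri))


def simple_notification(rabbitmq_connection, rabbitmq_queue, rabbitmq_exchange, rabbitmq_routing_key, message):
    # Declare the queue and exchange, bind them, and publish a short status string.
    channel = rabbitmq_connection.channel()
    channel.queue_declare(queue=rabbitmq_queue)
    channel.exchange_declare(exchange=rabbitmq_exchange, exchange_type="direct")
    channel.queue_bind(queue=rabbitmq_queue, exchange=rabbitmq_exchange, routing_key=rabbitmq_routing_key)
    channel.basic_publish(exchange=rabbitmq_exchange, routing_key=rabbitmq_routing_key, body=message)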