def make_dataset_json(output_file_path, raw_data_file_path, features_folder, comparison_lifetimes_path, anonymous_coward_name):
    """Write the selected discussions to *output_file_path* as a JSON array.

    NOTE(review): an identical definition of this function appears again later
    in this file; the later one is the binding Python actually uses.

    Parameters
    ----------
    output_file_path : str
        Path of the JSON file to create (overwritten if present).
    raw_data_file_path : str
        Path of the raw Reddit data file fed to ``document_generator``.
    features_folder : str
        Folder scanned by ``get_h5_stores_and_keys`` for HDF5 feature stores.
    comparison_lifetimes_path : str
        Path read by ``get_comparison_lifetimes``.
    anonymous_coward_name : str
        User name that marks anonymous posters.
    """
    # Read raw data.
    document_gen = document_generator([raw_data_file_path])

    # Read features.
    h5_stores_and_keys = get_h5_stores_and_keys(features_folder, "reddit")

    # Read comparison lifetimes.
    lifetime_list = get_comparison_lifetimes(comparison_lifetimes_path)

    post_ids_to_keep = decide_posts_to_keep(raw_data_file_path, anonymous_coward_name)

    with open(output_file_path, "w") as fp:
        fp.write("[\n")
        # Write the separator BEFORE each element except the first, so the
        # array has no trailing comma and the output is valid JSON.
        first_element = True
        for document in document_gen:
            if document["post_id"] not in post_ids_to_keep:
                continue

            timestamp_df,\
            handcrafted_df = get_features_df(document, h5_stores_and_keys)

            discussion_json = make_discussion_json(document, timestamp_df, handcrafted_df, lifetime_list, anonymous_coward_name)
            if discussion_json is None:
                continue

            if not first_element:
                fp.write(",\n\n")
            first_element = False

            json.dump(discussion_json, fp)
        fp.write("\n]\n")
def make_dataset_json(output_file_path, raw_data_file_path, features_folder, comparison_lifetimes_path, anonymous_coward_name):
    """Write the selected discussions to *output_file_path* as a JSON array.

    NOTE(review): this is a verbatim duplicate of an earlier definition in this
    file; being the later ``def``, it is the one Python binds. The duplicate
    should eventually be removed.

    Parameters
    ----------
    output_file_path : str
        Path of the JSON file to create (overwritten if present).
    raw_data_file_path : str
        Path of the raw Reddit data file fed to ``document_generator``.
    features_folder : str
        Folder scanned by ``get_h5_stores_and_keys`` for HDF5 feature stores.
    comparison_lifetimes_path : str
        Path read by ``get_comparison_lifetimes``.
    anonymous_coward_name : str
        User name that marks anonymous posters.
    """
    # Read raw data.
    document_gen = document_generator([raw_data_file_path])

    # Read features.
    h5_stores_and_keys = get_h5_stores_and_keys(features_folder, "reddit")

    # Read comparison lifetimes.
    lifetime_list = get_comparison_lifetimes(comparison_lifetimes_path)

    post_ids_to_keep = decide_posts_to_keep(raw_data_file_path, anonymous_coward_name)

    with open(output_file_path, "w") as fp:
        fp.write("[\n")
        # Write the separator BEFORE each element except the first, so the
        # array has no trailing comma and the output is valid JSON.
        first_element = True
        for document in document_gen:
            if document["post_id"] not in post_ids_to_keep:
                continue

            timestamp_df,\
            handcrafted_df = get_features_df(document, h5_stores_and_keys)

            discussion_json = make_discussion_json(document, timestamp_df, handcrafted_df, lifetime_list, anonymous_coward_name)
            if discussion_json is None:
                continue

            if not first_element:
                fp.write(",\n\n")
            first_element = False

            json.dump(discussion_json, fp)
        fp.write("\n]\n")
def decide_posts_to_keep(raw_data_file_path, anonymous_coward_name):
    """Select a sample of post ids, spread across the popularity spectrum.

    Posts with more than one comment are ranked on four targets (comments,
    users, Wilson score, Wilson controversiality); the four per-target ranks
    are summed, the posts are ordered by that combined rank, and one post is
    taken from each of 500 chunks of the ordering (via ``split_list``).

    NOTE(review): an identical definition of this function appears again later
    in this file; the later one is the binding Python actually uses.

    Parameters
    ----------
    raw_data_file_path : str
        Path of the raw data file fed to ``document_generator``.
    anonymous_coward_name : str
        User name that marks anonymous posters.

    Returns
    -------
    set
        Post ids of the selected discussions.
    """
    # Read raw data.
    document_gen = document_generator([raw_data_file_path])

    post_to_targets = dict()
    for document in document_gen:
        comment_gen = comment_generator(document=document)

        ################################################################################################################
        # Within-discussion comment and user anonymization.
        ################################################################################################################
        comment_name_set,\
        user_name_set,\
        within_discussion_comment_anonymize,\
        within_discussion_user_anonymize,\
        within_discussion_anonymous_coward = within_discussion_comment_and_user_anonymization(comment_gen=comment_gen,
                                                                                             extract_comment_name=extract_comment_name,
                                                                                             extract_user_name=extract_user_name,
                                                                                             anonymous_coward_name=anonymous_coward_name)

        ################################################################################################################
        # Calculate prediction targets.
        ################################################################################################################
        try:
            targets = calculate_targets(document,
                                        comment_name_set,
                                        user_name_set,
                                        within_discussion_anonymous_coward)
        except KeyError:
            # Documents with missing fields are skipped.
            continue

        # Keep only discussions that actually received replies.
        if targets["comments"] > 1:
            post_to_targets[document["post_id"]] = targets

    # Parallel arrays over the kept posts (dict preserves insertion order).
    post_id_list = np.array(list(post_to_targets.keys()))
    comments_list = np.array([t["comments"] for t in post_to_targets.values()])
    users_list = np.array([t["users"] for t in post_to_targets.values()])
    score_list = np.array([t["score_wilson"] for t in post_to_targets.values()])
    controversiality_list = np.array([t["controversiality_wilson"] for t in post_to_targets.values()])

    # Rank each target descending (largest value gets rank 1) and log its max.
    # np.max is order-independent, so no per-target re-sorting is needed.
    comments_rank = rankdata(-comments_list)
    print(np.max(comments_list))

    users_rank = rankdata(-users_list)
    print(np.max(users_list))

    score_rank = rankdata(-score_list)
    print(np.max(score_list))

    controversiality_rank = rankdata(-controversiality_list)
    print(np.max(controversiality_list))

    # Combine the four rankings and order the posts by the summed rank.
    all_rank = comments_rank + users_rank + score_rank + controversiality_rank
    i = np.argsort(all_rank)
    post_id_list_new = post_id_list[i][::-1]

    # Select 500 posts: one representative (the last element) from each chunk.
    post_id_chunk_list = [chunk[-1] for chunk in split_list(list(post_id_list_new), 500)]

    for post_id in post_id_chunk_list:
        print(post_to_targets[post_id])

    return set(post_id_chunk_list)
def decide_posts_to_keep(raw_data_file_path, anonymous_coward_name):
    """Select a sample of post ids, spread across the popularity spectrum.

    Posts with more than one comment are ranked on four targets (comments,
    users, Wilson score, Wilson controversiality); the four per-target ranks
    are summed, the posts are ordered by that combined rank, and one post is
    taken from each of 500 chunks of the ordering (via ``split_list``).

    NOTE(review): this is a verbatim duplicate of an earlier definition in this
    file; being the later ``def``, it is the one Python binds. The duplicate
    should eventually be removed.

    Parameters
    ----------
    raw_data_file_path : str
        Path of the raw data file fed to ``document_generator``.
    anonymous_coward_name : str
        User name that marks anonymous posters.

    Returns
    -------
    set
        Post ids of the selected discussions.
    """
    # Read raw data.
    document_gen = document_generator([raw_data_file_path])

    post_to_targets = dict()
    for document in document_gen:
        comment_gen = comment_generator(document=document)

        ################################################################################################################
        # Within-discussion comment and user anonymization.
        ################################################################################################################
        comment_name_set,\
        user_name_set,\
        within_discussion_comment_anonymize,\
        within_discussion_user_anonymize,\
        within_discussion_anonymous_coward = within_discussion_comment_and_user_anonymization(comment_gen=comment_gen,
                                                                                             extract_comment_name=extract_comment_name,
                                                                                             extract_user_name=extract_user_name,
                                                                                             anonymous_coward_name=anonymous_coward_name)

        ################################################################################################################
        # Calculate prediction targets.
        ################################################################################################################
        try:
            targets = calculate_targets(document,
                                        comment_name_set,
                                        user_name_set,
                                        within_discussion_anonymous_coward)
        except KeyError:
            # Documents with missing fields are skipped.
            continue

        # Keep only discussions that actually received replies.
        if targets["comments"] > 1:
            post_to_targets[document["post_id"]] = targets

    # Parallel arrays over the kept posts (dict preserves insertion order).
    post_id_list = np.array(list(post_to_targets.keys()))
    comments_list = np.array([t["comments"] for t in post_to_targets.values()])
    users_list = np.array([t["users"] for t in post_to_targets.values()])
    score_list = np.array([t["score_wilson"] for t in post_to_targets.values()])
    controversiality_list = np.array([t["controversiality_wilson"] for t in post_to_targets.values()])

    # Rank each target descending (largest value gets rank 1) and log its max.
    # np.max is order-independent, so no per-target re-sorting is needed.
    comments_rank = rankdata(-comments_list)
    print(np.max(comments_list))

    users_rank = rankdata(-users_list)
    print(np.max(users_list))

    score_rank = rankdata(-score_list)
    print(np.max(score_list))

    controversiality_rank = rankdata(-controversiality_list)
    print(np.max(controversiality_list))

    # Combine the four rankings and order the posts by the summed rank.
    all_rank = comments_rank + users_rank + score_rank + controversiality_rank
    i = np.argsort(all_rank)
    post_id_list_new = post_id_list[i][::-1]

    # Select 500 posts: one representative (the last element) from each chunk.
    post_id_chunk_list = [chunk[-1] for chunk in split_list(list(post_id_list_new), 500)]

    for post_id in post_id_chunk_list:
        print(post_to_targets[post_id])

    return set(post_id_chunk_list)