def decide_posts_to_keep(raw_data_file_path, anonymous_coward_name):
    """Select a spread of ~500 discussion posts, ranked by engagement targets.

    Reads all discussions from the raw data file, computes prediction targets
    for each, ranks the posts on comments, users, Wilson score and Wilson
    controversiality, and keeps one representative post from each of 500
    chunks of the aggregate ranking, so the kept posts span the whole range.

    Cleanup vs. the previous version: the unused sorted copies of each metric
    array (and their argsort indices, and the unused length `n`) are removed —
    `np.max` does not require sorted input, so the printed output is unchanged.

    :param raw_data_file_path: Path to the raw data file to read.
    :param anonymous_coward_name: User name that denotes an anonymous poster.
    :return: Set of post ids to keep.
    """
    # Read raw data.
    document_gen = document_generator([raw_data_file_path])

    post_to_targets = dict()
    for document in document_gen:
        comment_gen = comment_generator(document=document)

        ################################################################################################################
        # Within-discussion comment and user anonymization.
        ################################################################################################################
        comment_name_set,\
        user_name_set,\
        within_discussion_comment_anonymize,\
        within_discussion_user_anonymize,\
        within_discussion_anonymous_coward = within_discussion_comment_and_user_anonymization(
            comment_gen=comment_gen,
            extract_comment_name=extract_comment_name,
            extract_user_name=extract_user_name,
            anonymous_coward_name=anonymous_coward_name)

        ################################################################################################################
        # Calculate prediction targets.
        ################################################################################################################
        # Discussions for which targets cannot be computed are skipped.
        try:
            targets = calculate_targets(document,
                                        comment_name_set,
                                        user_name_set,
                                        within_discussion_anonymous_coward)
        except KeyError:
            continue

        # Keep only discussions with more than one comment.
        if targets["comments"] > 1:
            post_to_targets[document["post_id"]] = targets

    # Gather the per-post target values in parallel lists.
    post_id_list = list()
    comments_list = list()
    users_list = list()
    score_list = list()
    controversiality_list = list()
    for post_id, targets in post_to_targets.items():
        post_id_list.append(post_id)
        comments_list.append(targets["comments"])
        users_list.append(targets["users"])
        score_list.append(targets["score_wilson"])
        controversiality_list.append(targets["controversiality_wilson"])

    post_id_list = np.array(post_id_list)
    comments_list = np.array(comments_list)
    users_list = np.array(users_list)
    score_list = np.array(score_list)
    controversiality_list = np.array(controversiality_list)

    # Rank according to comments (rank 1 == largest value).
    comments_rank = rankdata(-comments_list)
    print(np.max(comments_list))

    # Rank according to users.
    users_rank = rankdata(-users_list)
    print(np.max(users_list))

    # Rank according to score_wilson.
    score_rank = rankdata(-score_list)
    print(np.max(score_list))

    # Rank according to controversiality_wilson.
    controversiality_rank = rankdata(-controversiality_list)
    print(np.max(controversiality_list))

    # Rank according to all targets combined (lower aggregate rank == better).
    all_rank = comments_rank + users_rank + score_rank + controversiality_rank
    i = np.argsort(all_rank)
    post_id_list_new = post_id_list[i][::-1]

    # Select 500 posts: the last post of each chunk of the ordering.
    post_id_chunk_list = [chunk[-1]
                          for chunk in split_list(list(post_id_list_new), 500)]
    for post_id in post_id_chunk_list:
        print(post_to_targets[post_id])

    return set(post_id_chunk_list)
def make_discussion_json(document, timestamp_df, handcrafted_df, lifetime_list, anonymous_coward_name):
    """Build a json with graph snapshots of a discussion at given lifetimes.

    Replays the discussion's comments in a safe order and, every time the
    discussion's lifetime reaches the next threshold in lifetime_list
    (presumably seconds since the initial post — TODO confirm units), records
    a snapshot of the comment tree, the user graph and a row of handcrafted
    features.

    Returns None whenever the discussion is malformed: missing prediction
    targets (KeyError), an invalid comment tree or unparsable timestamps
    (TypeError from the safe generator / timestamp extraction), or a graph
    update failure (RuntimeError).

    NOTE(review): the timestamp_df parameter is not used in this body.
    """
    discussion_json = dict()
    discussion_json["post_url"] = get_post_url(document)
    discussion_json["post_title"] = get_post_title(document)
    # discussion_json["snapshot_timestamps"] = [repr(float(snapshot_timestamp)) for snapshot_timestamp in lifetime_list]
    discussion_json["graph_snapshots"] = list()
    comment_gen = comment_generator(document=document)
    # Within-discussion comment and user anonymization.
    comment_name_set,\
    user_name_set,\
    within_discussion_comment_anonymize,\
    within_discussion_user_anonymize,\
    within_discussion_anonymous_coward = within_discussion_comment_and_user_anonymization(comment_gen=comment_gen,
                                                                                          extract_comment_name=extract_comment_name,
                                                                                          extract_user_name=extract_user_name,
                                                                                          anonymous_coward_name=anonymous_coward_name)
    # Prediction targets; a KeyError marks the discussion as unusable.
    try:
        discussion_json["prediction_targets"] = calculate_targets(document,
                                                                  comment_name_set,
                                                                  user_name_set,
                                                                  within_discussion_anonymous_coward)
    except KeyError as e:
        return None
    # Safe iteration over comments; TypeError marks an invalid comment tree.
    try:
        safe_comment_gen = safe_comment_generator(document=document,
                                                  comment_generator=comment_generator,
                                                  within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                                                  extract_comment_name=extract_comment_name,
                                                  extract_parent_comment_name=extract_parent_comment_name,
                                                  extract_timestamp=extract_timestamp,
                                                  safe=True)
    except TypeError:
        return None
    # First item from the safe generator is the initial post itself.
    try:
        initial_post = next(safe_comment_gen)
    except TypeError:
        return None
    try:
        timestamp = extract_timestamp(initial_post)
    except TypeError:
        return None
    # Determine whether the original poster is the anonymous user.
    op_raw_id = extract_user_name(initial_post)
    op_id = within_discussion_user_anonymize[op_raw_id]
    if op_id == within_discussion_anonymous_coward:
        op_is_anonymous = True
    else:
        op_is_anonymous = False
    comment_counter = 0
    # One timestamp row per comment plus one for the initial post.
    timestamp_column_names_list,\
    timestamp_array = initialize_timestamp_array(discussion_json["prediction_targets"]["comments"] + 1,
                                                 cascade_source_timestamp=timestamp)
    intermediate_dict = initialize_intermediate(comment_name_set,
                                                user_name_set,
                                                timestamp,
                                                within_discussion_anonymous_coward,
                                                op_is_anonymous=op_is_anonymous)
    # Sparse adjacency structures sized by the number of comments / users.
    comment_tree = spsp.dok_matrix((len(comment_name_set), len(comment_name_set)), dtype=np.int8)
    user_graph = spsp.dok_matrix((len(user_name_set), len(user_name_set)), dtype=np.int32)
    current_lifetime = 0.0
    # lifetime_list.append(np.inf)
    for lifetime_counter, lifetime in enumerate(lifetime_list):
        # Consume comments until the discussion lifetime crosses this
        # threshold (or the discussion ends), then snapshot the graphs.
        while True:
            try:
                comment = next(safe_comment_gen)
            except TypeError:
                return None
            except StopIteration:
                # Discussion exhausted: snapshot the final state.
                handcrafted_df_row = handcrafted_df.iloc[comment_counter]
                time_step_json = make_time_step_json(current_lifetime,
                                                     comment_tree,
                                                     user_graph,
                                                     timestamp_array[comment_counter, 1],
                                                     handcrafted_df_row)
                discussion_json["graph_snapshots"].append(time_step_json)
                break
            if comment is None:
                return None
            comment_counter += 1
            commenter_name = extract_user_name(comment)
            if commenter_name is None:
                commenter_is_anonymous = True
            else:
                commenter_is_anonymous = False
            # Add the comment to the tree and (possibly) a user-graph edge.
            try:
                discussion_tree,\
                user_graph,\
                comment_id,\
                parent_comment_id,\
                commenter_id,\
                parent_commenter_id,\
                user_graph_modified,\
                parent_commenter_is_anonymous,\
                comment_id_to_user_id = update_discussion_and_user_graphs(comment=comment,
                                                                          extract_comment_name=extract_comment_name,
                                                                          extract_parent_comment_name=extract_parent_comment_name,
                                                                          extract_user_name=extract_user_name,
                                                                          discussion_tree=comment_tree,
                                                                          user_graph=user_graph,
                                                                          within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                                                                          within_discussion_user_anonymize=within_discussion_user_anonymize,
                                                                          within_discussion_anonymous_coward=within_discussion_anonymous_coward,
                                                                          comment_id_to_user_id=intermediate_dict["comment_id_to_user_id"])
                intermediate_dict["comment_id_to_user_id"] = comment_id_to_user_id
            except RuntimeError:
                return None
            try:
                timestamp = extract_timestamp(comment)
            except TypeError:
                return None
            update_timestamp_array(timestamp_column_names_list, timestamp_array, timestamp, comment_counter)
            # Time elapsed since the previous comment (column 1 appears to
            # hold the comment timestamps — TODO confirm).
            timestamp_difference = timestamp_array[comment_counter, 1] - timestamp_array[comment_counter - 1, 1]
            try:
                intermediate_dict,\
                comment_depth = update_intermediate(discussion_tree,
                                                    user_graph,
                                                    intermediate_dict,
                                                    commenter_is_anonymous,
                                                    parent_commenter_is_anonymous,
                                                    comment_id,
                                                    parent_comment_id,
                                                    commenter_id,
                                                    parent_commenter_id,
                                                    user_graph_modified,
                                                    timestamp,
                                                    timestamp_difference)
            except RuntimeError:
                return None
            current_lifetime = timestamp_array[comment_counter, 1] - timestamp_array[0, 1]
            if current_lifetime >= lifetime:
                # Read features.
                # handcrafted_df_row = handcrafted_df[feature_list]
                handcrafted_df_row = handcrafted_df.iloc[comment_counter]
                time_step_json = make_time_step_json(current_lifetime,
                                                     comment_tree,
                                                     user_graph,
                                                     timestamp_array[comment_counter, 1],
                                                     handcrafted_df_row)
                discussion_json["graph_snapshots"].append(time_step_json)
                break
    discussion_json["post_timestamp"] = timestamp_array[0, 1]
    # discussion_json["final_comment_tree_size"] = discussion_json["prediction_targets"]["comments"] + 1
    # discussion_json["final_user_graph_size"] = discussion_json["prediction_targets"]["users"]
    return discussion_json
def anonymize_static_dataset(dataset_name, input_data_folder):
    """Yield anonymized, uniform-format discussion jsons for a static dataset.

    Generator over the dataset's source files. For every discussion with
    computable prediction targets and a valid comment tree it yields a dict
    of the form {"uniform_json": {...}} holding the anonymized initial post,
    its prediction targets and the list of anonymized comments.

    Fixes vs. the previous version: the source file list is built (and
    sorted) once, so dataset-wide user ids are assigned deterministically
    instead of following os.listdir order on the first pass; the dead
    `invalid_tree = True` before a `continue` and the unused
    `extract_document_post_name` alias are removed; `next(safe_comment_gen)`
    is guarded against TypeError like every other use of the safe generator.

    :param dataset_name: Either "slashdot" or "barrapunto"; selects the
        user name that denotes anonymous posters.
    :param input_data_folder: Folder containing the raw data files.
    :raises RuntimeError: If dataset_name is not recognized.
    """
    # Dataset-specific accessors; both supported datasets use the slashdot
    # format.
    document_generator = slashdot.document_generator
    comment_generator = slashdot.comment_generator
    extract_user_name = slashdot.extract_user_name
    extract_comment_name = slashdot.extract_comment_name
    calculate_targets = slashdot.calculate_targets
    extract_timestamp = slashdot.extract_timestamp
    extract_parent_comment_name = slashdot.extract_parent_comment_name

    if dataset_name == "slashdot":
        anonymous_coward_name = "Anonymous Coward"
    elif dataset_name == "barrapunto":
        anonymous_coward_name = "pobrecito hablador"  # "Pendejo Sin Nombre"
    else:
        print("Invalid dataset name.")
        raise RuntimeError

    ####################################################################################################################
    # Dataset-wide user anonymization.
    ####################################################################################################################
    # List the source files once, skipping editor backup files ("~" suffix),
    # and sort so that user-id assignment does not depend on listing order.
    file_name_list = os.listdir(input_data_folder)
    source_file_path_list = sorted(input_data_folder + "/" + file_name
                                   for file_name in file_name_list
                                   if not file_name[-1] == "~")

    document_gen = document_generator(source_file_path_list)
    user_name_set,\
    within_dataset_user_anonymize = calculate_within_dataset_user_anonymization(document_gen,
                                                                                comment_generator,
                                                                                extract_user_name)

    ####################################################################################################################
    # Iterate over files and incrementally calculate features.
    ####################################################################################################################
    for document in document_generator(source_file_path_list):
        comment_gen = comment_generator(document=document)

        ################################################################################################################
        # Within-discussion comment and user anonymization.
        ################################################################################################################
        comment_name_set,\
        user_name_set,\
        within_discussion_comment_anonymize,\
        within_discussion_user_anonymize,\
        within_discussion_anonymous_coward = within_discussion_comment_and_user_anonymization(
            comment_gen=comment_gen,
            extract_comment_name=extract_comment_name,
            extract_user_name=extract_user_name,
            anonymous_coward_name=anonymous_coward_name)

        ################################################################################################################
        # Calculate prediction targets.
        ################################################################################################################
        # Discussions without computable targets are skipped.
        try:
            target_dict = calculate_targets(document,
                                            comment_name_set,
                                            user_name_set,
                                            within_discussion_anonymous_coward)
        except KeyError:
            continue

        ################################################################################################################
        # Initiate a smart/safe iteration over all comments.
        ################################################################################################################
        try:
            safe_comment_gen = safe_comment_generator(
                document=document,
                comment_generator=comment_generator,
                within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                extract_comment_name=extract_comment_name,
                extract_parent_comment_name=extract_parent_comment_name,
                extract_timestamp=extract_timestamp,
                safe=True)
        except TypeError:
            continue

        ################################################################################################################
        # Make initial post json.
        ################################################################################################################
        # The safe generator raises TypeError on invalid comment trees; skip
        # the discussion in that case, consistent with the other call sites.
        try:
            initial_post = next(safe_comment_gen)
        except TypeError:
            continue
        uniform_json = dict()
        uniform_json["initial_post"] = dict()
        uniform_json["comments"] = list()
        uniform_json["initial_post"]["user_id"] = within_dataset_user_anonymize[extract_user_name(initial_post)]
        uniform_json["initial_post"]["comment_id"] = within_discussion_comment_anonymize[extract_comment_name(initial_post)]
        try:
            uniform_json["initial_post"]["timestamp"] = extract_timestamp(initial_post)
        except TypeError:
            continue
        uniform_json["initial_post"]["targets"] = target_dict

        ################################################################################################################
        # Make comment json list.
        ################################################################################################################
        invalid_tree = False
        while True:
            try:
                comment = next(safe_comment_gen)
            except TypeError:
                invalid_tree = True
                break
            except StopIteration:
                break
            if comment is None:
                invalid_tree = True
                break
            comment_json = dict()
            comment_json["user_id"] = within_dataset_user_anonymize[extract_user_name(comment)]
            comment_json["comment_id"] = within_discussion_comment_anonymize[extract_comment_name(comment)]
            try:
                comment_json["timestamp"] = extract_timestamp(comment)
            except TypeError:
                invalid_tree = True
                break
            # Comments whose parent is not a known comment are attached to
            # the initial post.
            try:
                parent_comment_id = within_discussion_comment_anonymize[extract_parent_comment_name(comment)]
            except KeyError:
                parent_comment_id = uniform_json["initial_post"]["comment_id"]
            comment_json["parent_comment_id"] = parent_comment_id
            uniform_json["comments"].append(comment_json)
        if invalid_tree:
            continue

        json_to_store = dict()
        json_to_store["uniform_json"] = uniform_json
        yield json_to_store
def decide_posts_to_keep(raw_data_file_path, anonymous_coward_name):
    """Choose a representative set of post ids from one raw data file.

    Every discussion with more than one comment is scored on four prediction
    targets (comments, users, Wilson score, Wilson controversiality). Posts
    are ordered by their summed per-target rank and one post id is taken from
    each of 500 chunks of that ordering, so the kept posts span the whole
    popularity range.

    :param raw_data_file_path: Path of the raw data file to read.
    :param anonymous_coward_name: User name denoting anonymous posters.
    :return: Set of post ids to keep.
    """
    post_to_targets = dict()
    for document in document_generator([raw_data_file_path]):
        comment_gen = comment_generator(document=document)

        # Within-discussion comment and user anonymization.
        (comment_name_set,
         user_name_set,
         within_discussion_comment_anonymize,
         within_discussion_user_anonymize,
         within_discussion_anonymous_coward) = within_discussion_comment_and_user_anonymization(
            comment_gen=comment_gen,
            extract_comment_name=extract_comment_name,
            extract_user_name=extract_user_name,
            anonymous_coward_name=anonymous_coward_name)

        # Prediction targets; discussions that fail are dropped.
        try:
            targets = calculate_targets(document,
                                        comment_name_set,
                                        user_name_set,
                                        within_discussion_anonymous_coward)
        except KeyError:
            continue

        # Only multi-comment discussions are candidates.
        if targets["comments"] > 1:
            post_to_targets[document["post_id"]] = targets

    id_list = list(post_to_targets.keys())
    post_ids = np.array(id_list)

    # Rank on each target in turn; rank 1 is the largest value, and summing
    # the per-target ranks combines the four orderings. The maximum of each
    # target is printed for inspection.
    target_keys = ("comments", "users", "score_wilson", "controversiality_wilson")
    aggregate_rank = 0.0
    for key in target_keys:
        values = np.array([post_to_targets[post_id][key] for post_id in id_list])
        aggregate_rank = aggregate_rank + rankdata(-values)
        print(np.max(values))

    # Order post ids from worst aggregate rank to best.
    ordered_post_ids = post_ids[np.argsort(aggregate_rank)][::-1]

    # Select 500 posts: the last post of each chunk of the ordering.
    kept_post_ids = [chunk[-1] for chunk in split_list(list(ordered_post_ids), 500)]
    for post_id in kept_post_ids:
        print(post_to_targets[post_id])
    return set(kept_post_ids)
def make_discussion_json(document, timestamp_df, handcrafted_df, lifetime_list, anonymous_coward_name):
    """Build a json of discussion graph snapshots at the given lifetimes.

    Consumes the discussion's comments via the safe generator and, whenever
    the elapsed discussion lifetime reaches the next value in lifetime_list,
    appends a snapshot of the comment tree, user graph and a handcrafted
    feature row to discussion_json["graph_snapshots"].

    Returns None on any malformed input: missing targets (KeyError), invalid
    comment tree or timestamps (TypeError), or a graph update failure
    (RuntimeError).

    NOTE(review): timestamp_df is not referenced in this body.
    """
    discussion_json = dict()
    discussion_json["post_url"] = get_post_url(document)
    discussion_json["post_title"] = get_post_title(document)
    # discussion_json["snapshot_timestamps"] = [repr(float(snapshot_timestamp)) for snapshot_timestamp in lifetime_list]
    discussion_json["graph_snapshots"] = list()
    comment_gen = comment_generator(document=document)
    # Within-discussion comment and user anonymization.
    comment_name_set,\
    user_name_set,\
    within_discussion_comment_anonymize,\
    within_discussion_user_anonymize,\
    within_discussion_anonymous_coward = within_discussion_comment_and_user_anonymization(comment_gen=comment_gen,
                                                                                          extract_comment_name=extract_comment_name,
                                                                                          extract_user_name=extract_user_name,
                                                                                          anonymous_coward_name=anonymous_coward_name)
    # Prediction targets; KeyError marks the discussion as unusable.
    try:
        discussion_json["prediction_targets"] = calculate_targets(document,
                                                                  comment_name_set,
                                                                  user_name_set,
                                                                  within_discussion_anonymous_coward)
    except KeyError as e:
        return None
    # Safe comment iteration; TypeError marks an invalid comment tree.
    try:
        safe_comment_gen = safe_comment_generator(document=document,
                                                  comment_generator=comment_generator,
                                                  within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                                                  extract_comment_name=extract_comment_name,
                                                  extract_parent_comment_name=extract_parent_comment_name,
                                                  extract_timestamp=extract_timestamp,
                                                  safe=True)
    except TypeError:
        return None
    # The first yielded item is the initial post itself.
    try:
        initial_post = next(safe_comment_gen)
    except TypeError:
        return None
    try:
        timestamp = extract_timestamp(initial_post)
    except TypeError:
        return None
    # Flag whether the original poster is the anonymous user.
    op_raw_id = extract_user_name(initial_post)
    op_id = within_discussion_user_anonymize[op_raw_id]
    if op_id == within_discussion_anonymous_coward:
        op_is_anonymous = True
    else:
        op_is_anonymous = False
    comment_counter = 0
    # One timestamp row per comment plus one for the initial post.
    timestamp_column_names_list,\
    timestamp_array = initialize_timestamp_array(discussion_json["prediction_targets"]["comments"] + 1,
                                                 cascade_source_timestamp=timestamp)
    intermediate_dict = initialize_intermediate(comment_name_set,
                                                user_name_set,
                                                timestamp,
                                                within_discussion_anonymous_coward,
                                                op_is_anonymous=op_is_anonymous)
    # Sparse adjacency matrices sized by comment / user counts.
    comment_tree = spsp.dok_matrix((len(comment_name_set), len(comment_name_set)), dtype=np.int8)
    user_graph = spsp.dok_matrix((len(user_name_set), len(user_name_set)), dtype=np.int32)
    current_lifetime = 0.0
    # lifetime_list.append(np.inf)
    for lifetime_counter, lifetime in enumerate(lifetime_list):
        # Consume comments until this lifetime threshold is crossed (or the
        # discussion ends), then snapshot the graphs.
        while True:
            try:
                comment = next(safe_comment_gen)
            except TypeError:
                return None
            except StopIteration:
                # Discussion exhausted: snapshot the final state.
                handcrafted_df_row = handcrafted_df.iloc[comment_counter]
                time_step_json = make_time_step_json(current_lifetime,
                                                     comment_tree,
                                                     user_graph,
                                                     timestamp_array[comment_counter, 1],
                                                     handcrafted_df_row)
                discussion_json["graph_snapshots"].append(time_step_json)
                break
            if comment is None:
                return None
            comment_counter += 1
            commenter_name = extract_user_name(comment)
            if commenter_name is None:
                commenter_is_anonymous = True
            else:
                commenter_is_anonymous = False
            # Grow the comment tree and possibly the user graph.
            try:
                discussion_tree,\
                user_graph,\
                comment_id,\
                parent_comment_id,\
                commenter_id,\
                parent_commenter_id,\
                user_graph_modified,\
                parent_commenter_is_anonymous,\
                comment_id_to_user_id = update_discussion_and_user_graphs(comment=comment,
                                                                          extract_comment_name=extract_comment_name,
                                                                          extract_parent_comment_name=extract_parent_comment_name,
                                                                          extract_user_name=extract_user_name,
                                                                          discussion_tree=comment_tree,
                                                                          user_graph=user_graph,
                                                                          within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                                                                          within_discussion_user_anonymize=within_discussion_user_anonymize,
                                                                          within_discussion_anonymous_coward=within_discussion_anonymous_coward,
                                                                          comment_id_to_user_id=intermediate_dict["comment_id_to_user_id"])
                intermediate_dict["comment_id_to_user_id"] = comment_id_to_user_id
            except RuntimeError:
                return None
            try:
                timestamp = extract_timestamp(comment)
            except TypeError:
                return None
            update_timestamp_array(timestamp_column_names_list, timestamp_array, timestamp, comment_counter)
            # Delta since the previous comment (column 1 appears to hold the
            # comment timestamps — TODO confirm).
            timestamp_difference = timestamp_array[comment_counter, 1] - timestamp_array[comment_counter - 1, 1]
            try:
                intermediate_dict,\
                comment_depth = update_intermediate(discussion_tree,
                                                    user_graph,
                                                    intermediate_dict,
                                                    commenter_is_anonymous,
                                                    parent_commenter_is_anonymous,
                                                    comment_id,
                                                    parent_comment_id,
                                                    commenter_id,
                                                    parent_commenter_id,
                                                    user_graph_modified,
                                                    timestamp,
                                                    timestamp_difference)
            except RuntimeError:
                return None
            current_lifetime = timestamp_array[comment_counter, 1] - timestamp_array[0, 1]
            if current_lifetime >= lifetime:
                # Read features.
                # handcrafted_df_row = handcrafted_df[feature_list]
                handcrafted_df_row = handcrafted_df.iloc[comment_counter]
                time_step_json = make_time_step_json(current_lifetime,
                                                     comment_tree,
                                                     user_graph,
                                                     timestamp_array[comment_counter, 1],
                                                     handcrafted_df_row)
                discussion_json["graph_snapshots"].append(time_step_json)
                break
    discussion_json["post_timestamp"] = timestamp_array[0, 1]
    # discussion_json["final_comment_tree_size"] = discussion_json["prediction_targets"]["comments"] + 1
    # discussion_json["final_user_graph_size"] = discussion_json["prediction_targets"]["users"]
    return discussion_json
def anonymize_static_dataset(dataset_name, input_data_folder):
    """Yield anonymized, uniform-format discussion jsons for a static dataset.

    Generator over the dataset's source files. For every discussion with
    computable prediction targets and a valid comment tree it yields a dict
    of the form {"uniform_json": {...}} holding the anonymized initial post,
    its prediction targets, and the list of anonymized comments.

    :param dataset_name: Either "slashdot" or "barrapunto"; selects the
        user name that denotes anonymous posters.
    :param input_data_folder: Folder containing the raw data files.
    :raises RuntimeError: If dataset_name is not recognized.
    """
    # Dataset-specific accessors; both supported datasets use the slashdot
    # format.
    document_generator = slashdot.document_generator
    comment_generator = slashdot.comment_generator
    extract_document_post_name = slashdot.extract_document_post_name
    extract_user_name = slashdot.extract_user_name
    extract_comment_name = slashdot.extract_comment_name
    calculate_targets = slashdot.calculate_targets
    extract_timestamp = slashdot.extract_timestamp
    extract_parent_comment_name = slashdot.extract_parent_comment_name
    if dataset_name == "slashdot":
        anonymous_coward_name = "Anonymous Coward"
    elif dataset_name == "barrapunto":
        anonymous_coward_name = "pobrecito hablador"  # "Pendejo Sin Nombre"
    else:
        print("Invalid dataset name.")
        raise RuntimeError
    ####################################################################################################################
    # Dataset-wide user anonymization.
    ####################################################################################################################
    # Skip editor backup files ("~" suffix). NOTE(review): this first file
    # list is unsorted, so dataset-wide user ids follow os.listdir order.
    file_name_list = os.listdir(input_data_folder)
    source_file_path_list = [input_data_folder + "/" + file_name for file_name in file_name_list if not file_name[-1] == "~"]
    document_gen = document_generator(source_file_path_list)
    user_name_set,\
    within_dataset_user_anonymize = calculate_within_dataset_user_anonymization(document_gen,
                                                                                comment_generator,
                                                                                extract_user_name)
    # Rebuild the file list, sorted, for the feature-calculation pass.
    file_name_list = os.listdir(input_data_folder)
    source_file_path_list = sorted([input_data_folder + "/" + file_name for file_name in file_name_list if not file_name[-1] == "~"])
    ####################################################################################################################
    # Iterate over files and incrementally calculate features.
    ####################################################################################################################
    for document in document_generator(source_file_path_list):
        comment_gen = comment_generator(document=document)
        ################################################################################################################
        # Within-discussion comment and user anonymization.
        ################################################################################################################
        comment_name_set,\
        user_name_set,\
        within_discussion_comment_anonymize,\
        within_discussion_user_anonymize,\
        within_discussion_anonymous_coward = within_discussion_comment_and_user_anonymization(comment_gen=comment_gen,
                                                                                              extract_comment_name=extract_comment_name,
                                                                                              extract_user_name=extract_user_name,
                                                                                              anonymous_coward_name=anonymous_coward_name)
        ################################################################################################################
        # Calculate prediction targets.
        ################################################################################################################
        # Discussions without computable targets are skipped.
        try:
            target_dict = calculate_targets(document,
                                            comment_name_set,
                                            user_name_set,
                                            within_discussion_anonymous_coward)
        except KeyError as e:
            continue
        ################################################################################################################
        # Initiate a smart/safe iteration over all comments.
        ################################################################################################################
        # TypeError from the safe generator marks an invalid comment tree.
        try:
            safe_comment_gen = safe_comment_generator(document=document,
                                                      comment_generator=comment_generator,
                                                      within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                                                      extract_comment_name=extract_comment_name,
                                                      extract_parent_comment_name=extract_parent_comment_name,
                                                      extract_timestamp=extract_timestamp,
                                                      safe=True)
        except TypeError:
            invalid_tree = True
            continue
        ################################################################################################################
        # Make initial post json.
        ################################################################################################################
        # The first item yielded by the safe generator is the initial post.
        initial_post = next(safe_comment_gen)
        uniform_json = dict()
        uniform_json["initial_post"] = dict()
        uniform_json["comments"] = list()
        uniform_json["initial_post"]["user_id"] = within_dataset_user_anonymize[extract_user_name(initial_post)]
        uniform_json["initial_post"]["comment_id"] = within_discussion_comment_anonymize[extract_comment_name(initial_post)]
        try:
            uniform_json["initial_post"]["timestamp"] = extract_timestamp(initial_post)
        except TypeError:
            continue
        uniform_json["initial_post"]["targets"] = target_dict
        ################################################################################################################
        # Make comment json list.
        ################################################################################################################
        invalid_tree = False
        while True:
            try:
                comment = next(safe_comment_gen)
            except TypeError:
                invalid_tree = True
                break
            except StopIteration:
                break
            if comment is None:
                invalid_tree = True
                break
            comment_json = dict()
            comment_json["user_id"] = within_dataset_user_anonymize[extract_user_name(comment)]
            comment_json["comment_id"] = within_discussion_comment_anonymize[extract_comment_name(comment)]
            try:
                comment_json["timestamp"] = extract_timestamp(comment)
            except TypeError:
                invalid_tree = True
                break
            # Comments whose parent is not a known comment are attached to
            # the initial post.
            try:
                parent_comment_id = within_discussion_comment_anonymize[extract_parent_comment_name(comment)]
            except KeyError:
                parent_comment_id = uniform_json["initial_post"]["comment_id"]
            comment_json["parent_comment_id"] = parent_comment_id
            uniform_json["comments"].append(comment_json)
        # Discussions whose tree turned out invalid are dropped entirely.
        if invalid_tree:
            continue
        json_to_store = dict()
        json_to_store["uniform_json"] = uniform_json
        yield json_to_store