def make_discussion_json(document, timestamp_df, handcrafted_df, lifetime_list, anonymous_coward_name):
    """Build a JSON-serializable dict of discussion-graph snapshots.

    Replays the discussion's comments in a "safe" order (as produced by
    ``safe_comment_generator``), incrementally growing a comment tree and a
    user graph, and appends one snapshot to ``graph_snapshots`` each time the
    elapsed discussion lifetime reaches the next threshold in
    ``lifetime_list``.

    NOTE(review): this function is defined twice in this file with identical
    behavior; at import time the later definition shadows this one.

    :param document: Raw discussion document, consumed via the module-level
        ``comment_generator`` and the ``extract_*`` callbacks.
    :param timestamp_df: Unused in this function body — presumably kept for
        caller-interface compatibility; confirm before removing.
    :param handcrafted_df: Precomputed per-comment feature rows, indexed
        positionally via ``.iloc[comment_counter]``.
    :param lifetime_list: Lifetime thresholds at which snapshots are taken.
        Assumes ascending order — TODO confirm with callers.
    :param anonymous_coward_name: User name that marks anonymous posters.
    :return: The populated discussion dict, or ``None`` whenever the input is
        malformed (helpers raise KeyError/TypeError/RuntimeError, or the safe
        generator yields ``None``).
    """
    discussion_json = dict()
    discussion_json["post_url"] = get_post_url(document)
    discussion_json["post_title"] = get_post_title(document)
    # discussion_json["snapshot_timestamps"] = [repr(float(snapshot_timestamp)) for snapshot_timestamp in lifetime_list]
    discussion_json["graph_snapshots"] = list()

    comment_gen = comment_generator(document=document)

    # Map raw comment/user names to within-discussion integer ids; also find
    # which user id (if any) represents the anonymous poster.
    comment_name_set,\
    user_name_set,\
    within_discussion_comment_anonymize,\
    within_discussion_user_anonymize,\
    within_discussion_anonymous_coward = within_discussion_comment_and_user_anonymization(comment_gen=comment_gen,
                                                                                         extract_comment_name=extract_comment_name,
                                                                                         extract_user_name=extract_user_name,
                                                                                         anonymous_coward_name=anonymous_coward_name)

    try:
        discussion_json["prediction_targets"] = calculate_targets(document,
                                                                  comment_name_set,
                                                                  user_name_set,
                                                                  within_discussion_anonymous_coward)
    except KeyError as e:
        # Malformed document: no targets can be computed, skip discussion.
        return None

    try:
        safe_comment_gen = safe_comment_generator(document=document,
                                                  comment_generator=comment_generator,
                                                  within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                                                  extract_comment_name=extract_comment_name,
                                                  extract_parent_comment_name=extract_parent_comment_name,
                                                  extract_timestamp=extract_timestamp,
                                                  safe=True)
    except TypeError:
        return None

    # First item from the safe generator is the initial (root) post.
    try:
        initial_post = next(safe_comment_gen)
    except TypeError:
        return None
    try:
        timestamp = extract_timestamp(initial_post)
    except TypeError:
        return None

    op_raw_id = extract_user_name(initial_post)
    op_id = within_discussion_user_anonymize[op_raw_id]
    if op_id == within_discussion_anonymous_coward:
        op_is_anonymous = True
    else:
        op_is_anonymous = False

    comment_counter = 0

    # One timestamp row per comment plus one for the initial post.
    timestamp_column_names_list,\
    timestamp_array = initialize_timestamp_array(discussion_json["prediction_targets"]["comments"] + 1,
                                                 cascade_source_timestamp=timestamp)

    intermediate_dict = initialize_intermediate(comment_name_set,
                                                user_name_set,
                                                timestamp,
                                                within_discussion_anonymous_coward,
                                                op_is_anonymous=op_is_anonymous)

    # Sparse adjacency structures sized by the number of distinct
    # comments/users in this discussion.
    comment_tree = spsp.dok_matrix((len(comment_name_set), len(comment_name_set)), dtype=np.int8)
    user_graph = spsp.dok_matrix((len(user_name_set), len(user_name_set)), dtype=np.int32)

    current_lifetime = 0.0
    # lifetime_list.append(np.inf)
    for lifetime_counter, lifetime in enumerate(lifetime_list):
        # Consume comments until the current lifetime threshold is crossed.
        # NOTE(review): after the generator is exhausted, each remaining
        # lifetime re-raises StopIteration and appends another (final-state)
        # snapshot — presumably intentional padding to len(lifetime_list);
        # confirm.
        while True:
            try:
                comment = next(safe_comment_gen)
            except TypeError:
                return None
            except StopIteration:
                # No more comments: snapshot the final state for this
                # lifetime and move on.
                handcrafted_df_row = handcrafted_df.iloc[comment_counter]
                time_step_json = make_time_step_json(current_lifetime,
                                                     comment_tree,
                                                     user_graph,
                                                     timestamp_array[comment_counter, 1],
                                                     handcrafted_df_row)
                discussion_json["graph_snapshots"].append(time_step_json)
                break
            if comment is None:
                # Safe generator signals a broken tree.
                return None
            comment_counter += 1

            commenter_name = extract_user_name(comment)
            if commenter_name is None:
                commenter_is_anonymous = True
            else:
                commenter_is_anonymous = False

            # Grow both graphs with this comment's edges; the helper also
            # maintains the comment-id -> user-id mapping.
            try:
                discussion_tree,\
                user_graph,\
                comment_id,\
                parent_comment_id,\
                commenter_id,\
                parent_commenter_id,\
                user_graph_modified,\
                parent_commenter_is_anonymous,\
                comment_id_to_user_id = update_discussion_and_user_graphs(comment=comment,
                                                                          extract_comment_name=extract_comment_name,
                                                                          extract_parent_comment_name=extract_parent_comment_name,
                                                                          extract_user_name=extract_user_name,
                                                                          discussion_tree=comment_tree,
                                                                          user_graph=user_graph,
                                                                          within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                                                                          within_discussion_user_anonymize=within_discussion_user_anonymize,
                                                                          within_discussion_anonymous_coward=within_discussion_anonymous_coward,
                                                                          comment_id_to_user_id=intermediate_dict["comment_id_to_user_id"])
                intermediate_dict["comment_id_to_user_id"] = comment_id_to_user_id
            except RuntimeError:
                return None

            try:
                timestamp = extract_timestamp(comment)
            except TypeError:
                return None
            update_timestamp_array(timestamp_column_names_list,
                                   timestamp_array,
                                   timestamp,
                                   comment_counter)
            # Time gap between this comment and the previous one
            # (column 1 of timestamp_array holds the comparable timestamps).
            timestamp_difference = timestamp_array[comment_counter, 1] - timestamp_array[comment_counter - 1, 1]

            try:
                intermediate_dict,\
                comment_depth = update_intermediate(discussion_tree,
                                                    user_graph,
                                                    intermediate_dict,
                                                    commenter_is_anonymous,
                                                    parent_commenter_is_anonymous,
                                                    comment_id,
                                                    parent_comment_id,
                                                    commenter_id,
                                                    parent_commenter_id,
                                                    user_graph_modified,
                                                    timestamp,
                                                    timestamp_difference)
            except RuntimeError:
                return None

            # Elapsed time since the initial post.
            current_lifetime = timestamp_array[comment_counter, 1] - timestamp_array[0, 1]
            if current_lifetime >= lifetime:
                # Read features.
                # handcrafted_df_row = handcrafted_df[feature_list]
                handcrafted_df_row = handcrafted_df.iloc[comment_counter]
                time_step_json = make_time_step_json(current_lifetime,
                                                     comment_tree,
                                                     user_graph,
                                                     timestamp_array[comment_counter, 1],
                                                     handcrafted_df_row)
                discussion_json["graph_snapshots"].append(time_step_json)
                break

    discussion_json["post_timestamp"] = timestamp_array[0, 1]
    # discussion_json["final_comment_tree_size"] = discussion_json["prediction_targets"]["comments"] + 1
    # discussion_json["final_user_graph_size"] = discussion_json["prediction_targets"]["users"]
    return discussion_json
def make_discussion_json(document, timestamp_df, handcrafted_df, lifetime_list, anonymous_coward_name):
    """Build a JSON-serializable dict of discussion-graph snapshots.

    Replays the discussion's comments in a "safe" order (as produced by
    ``safe_comment_generator``), incrementally growing a comment tree and a
    user graph, and appends one snapshot to ``graph_snapshots`` each time the
    elapsed discussion lifetime reaches the next threshold in
    ``lifetime_list``.

    NOTE(review): an earlier, token-identical definition of this function
    exists in this file; this later one shadows it at import time — the
    duplicate should probably be removed.

    :param document: Raw discussion document, consumed via the module-level
        ``comment_generator`` and the ``extract_*`` callbacks.
    :param timestamp_df: Unused in this function body — presumably kept for
        caller-interface compatibility; confirm before removing.
    :param handcrafted_df: Precomputed per-comment feature rows, indexed
        positionally via ``.iloc[comment_counter]``.
    :param lifetime_list: Lifetime thresholds at which snapshots are taken.
        Assumes ascending order — TODO confirm with callers.
    :param anonymous_coward_name: User name that marks anonymous posters.
    :return: The populated discussion dict, or ``None`` whenever the input is
        malformed (helpers raise KeyError/TypeError/RuntimeError, or the safe
        generator yields ``None``).
    """
    discussion_json = dict()
    discussion_json["post_url"] = get_post_url(document)
    discussion_json["post_title"] = get_post_title(document)
    # discussion_json["snapshot_timestamps"] = [repr(float(snapshot_timestamp)) for snapshot_timestamp in lifetime_list]
    discussion_json["graph_snapshots"] = list()

    comment_gen = comment_generator(document=document)

    # Map raw comment/user names to within-discussion integer ids; also find
    # which user id (if any) represents the anonymous poster.
    comment_name_set,\
    user_name_set,\
    within_discussion_comment_anonymize,\
    within_discussion_user_anonymize,\
    within_discussion_anonymous_coward = within_discussion_comment_and_user_anonymization(comment_gen=comment_gen,
                                                                                         extract_comment_name=extract_comment_name,
                                                                                         extract_user_name=extract_user_name,
                                                                                         anonymous_coward_name=anonymous_coward_name)

    try:
        discussion_json["prediction_targets"] = calculate_targets(document,
                                                                  comment_name_set,
                                                                  user_name_set,
                                                                  within_discussion_anonymous_coward)
    except KeyError as e:
        # Malformed document: no targets can be computed, skip discussion.
        return None

    try:
        safe_comment_gen = safe_comment_generator(document=document,
                                                  comment_generator=comment_generator,
                                                  within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                                                  extract_comment_name=extract_comment_name,
                                                  extract_parent_comment_name=extract_parent_comment_name,
                                                  extract_timestamp=extract_timestamp,
                                                  safe=True)
    except TypeError:
        return None

    # First item from the safe generator is the initial (root) post.
    try:
        initial_post = next(safe_comment_gen)
    except TypeError:
        return None
    try:
        timestamp = extract_timestamp(initial_post)
    except TypeError:
        return None

    op_raw_id = extract_user_name(initial_post)
    op_id = within_discussion_user_anonymize[op_raw_id]
    if op_id == within_discussion_anonymous_coward:
        op_is_anonymous = True
    else:
        op_is_anonymous = False

    comment_counter = 0

    # One timestamp row per comment plus one for the initial post.
    timestamp_column_names_list,\
    timestamp_array = initialize_timestamp_array(discussion_json["prediction_targets"]["comments"] + 1,
                                                 cascade_source_timestamp=timestamp)

    intermediate_dict = initialize_intermediate(comment_name_set,
                                                user_name_set,
                                                timestamp,
                                                within_discussion_anonymous_coward,
                                                op_is_anonymous=op_is_anonymous)

    # Sparse adjacency structures sized by the number of distinct
    # comments/users in this discussion.
    comment_tree = spsp.dok_matrix((len(comment_name_set), len(comment_name_set)), dtype=np.int8)
    user_graph = spsp.dok_matrix((len(user_name_set), len(user_name_set)), dtype=np.int32)

    current_lifetime = 0.0
    # lifetime_list.append(np.inf)
    for lifetime_counter, lifetime in enumerate(lifetime_list):
        # Consume comments until the current lifetime threshold is crossed.
        # NOTE(review): after the generator is exhausted, each remaining
        # lifetime re-raises StopIteration and appends another (final-state)
        # snapshot — presumably intentional padding to len(lifetime_list);
        # confirm.
        while True:
            try:
                comment = next(safe_comment_gen)
            except TypeError:
                return None
            except StopIteration:
                # No more comments: snapshot the final state for this
                # lifetime and move on.
                handcrafted_df_row = handcrafted_df.iloc[comment_counter]
                time_step_json = make_time_step_json(current_lifetime,
                                                     comment_tree,
                                                     user_graph,
                                                     timestamp_array[comment_counter, 1],
                                                     handcrafted_df_row)
                discussion_json["graph_snapshots"].append(time_step_json)
                break
            if comment is None:
                # Safe generator signals a broken tree.
                return None
            comment_counter += 1

            commenter_name = extract_user_name(comment)
            if commenter_name is None:
                commenter_is_anonymous = True
            else:
                commenter_is_anonymous = False

            # Grow both graphs with this comment's edges; the helper also
            # maintains the comment-id -> user-id mapping.
            try:
                discussion_tree,\
                user_graph,\
                comment_id,\
                parent_comment_id,\
                commenter_id,\
                parent_commenter_id,\
                user_graph_modified,\
                parent_commenter_is_anonymous,\
                comment_id_to_user_id = update_discussion_and_user_graphs(comment=comment,
                                                                          extract_comment_name=extract_comment_name,
                                                                          extract_parent_comment_name=extract_parent_comment_name,
                                                                          extract_user_name=extract_user_name,
                                                                          discussion_tree=comment_tree,
                                                                          user_graph=user_graph,
                                                                          within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                                                                          within_discussion_user_anonymize=within_discussion_user_anonymize,
                                                                          within_discussion_anonymous_coward=within_discussion_anonymous_coward,
                                                                          comment_id_to_user_id=intermediate_dict["comment_id_to_user_id"])
                intermediate_dict["comment_id_to_user_id"] = comment_id_to_user_id
            except RuntimeError:
                return None

            try:
                timestamp = extract_timestamp(comment)
            except TypeError:
                return None
            update_timestamp_array(timestamp_column_names_list,
                                   timestamp_array,
                                   timestamp,
                                   comment_counter)
            # Time gap between this comment and the previous one
            # (column 1 of timestamp_array holds the comparable timestamps).
            timestamp_difference = timestamp_array[comment_counter, 1] - timestamp_array[comment_counter - 1, 1]

            try:
                intermediate_dict,\
                comment_depth = update_intermediate(discussion_tree,
                                                    user_graph,
                                                    intermediate_dict,
                                                    commenter_is_anonymous,
                                                    parent_commenter_is_anonymous,
                                                    comment_id,
                                                    parent_comment_id,
                                                    commenter_id,
                                                    parent_commenter_id,
                                                    user_graph_modified,
                                                    timestamp,
                                                    timestamp_difference)
            except RuntimeError:
                return None

            # Elapsed time since the initial post.
            current_lifetime = timestamp_array[comment_counter, 1] - timestamp_array[0, 1]
            if current_lifetime >= lifetime:
                # Read features.
                # handcrafted_df_row = handcrafted_df[feature_list]
                handcrafted_df_row = handcrafted_df.iloc[comment_counter]
                time_step_json = make_time_step_json(current_lifetime,
                                                     comment_tree,
                                                     user_graph,
                                                     timestamp_array[comment_counter, 1],
                                                     handcrafted_df_row)
                discussion_json["graph_snapshots"].append(time_step_json)
                break

    discussion_json["post_timestamp"] = timestamp_array[0, 1]
    # discussion_json["final_comment_tree_size"] = discussion_json["prediction_targets"]["comments"] + 1
    # discussion_json["final_user_graph_size"] = discussion_json["prediction_targets"]["users"]
    return discussion_json
def anonymize_static_dataset(dataset_name, input_data_folder):
    """Generator: yield anonymized, uniform-format JSON for each discussion.

    Reads every non-backup file in ``input_data_folder``, builds a
    dataset-wide user anonymization map, then for each discussion emits a
    dict with an ``initial_post`` entry (user/comment ids, timestamp,
    prediction targets) and a ``comments`` list. Discussions with a broken
    comment tree or unparsable timestamps are skipped.

    NOTE(review): this function is defined twice in this file with identical
    behavior; at import time the later definition shadows this one.

    :param dataset_name: Either "slashdot" or "barrapunto"; anything else
        raises RuntimeError.
    :param input_data_folder: Folder whose files (except ``*~`` backups) are
        the source documents.
    :yield: ``{"uniform_json": {...}}`` per valid discussion.
    """
    # All extractors come from the slashdot module regardless of dataset;
    # presumably barrapunto shares the same file format — confirm.
    document_generator = slashdot.document_generator
    comment_generator = slashdot.comment_generator
    extract_document_post_name = slashdot.extract_document_post_name  # NOTE(review): unused below.
    extract_user_name = slashdot.extract_user_name
    extract_comment_name = slashdot.extract_comment_name
    calculate_targets = slashdot.calculate_targets
    extract_timestamp = slashdot.extract_timestamp
    extract_parent_comment_name = slashdot.extract_parent_comment_name

    if dataset_name == "slashdot":
        anonymous_coward_name = "Anonymous Coward"
    elif dataset_name == "barrapunto":
        anonymous_coward_name = "pobrecito hablador"  # "Pendejo Sin Nombre"
    else:
        print("Invalid dataset name.")
        raise RuntimeError

    ####################################################################################################################
    # Dataset-wide user anonymization.
    ####################################################################################################################
    file_name_list = os.listdir(input_data_folder)
    # Skip editor backup files ("name~").
    file_name_list = os.listdir(input_data_folder)
    source_file_path_list = [input_data_folder + "/" + file_name for file_name in file_name_list if not file_name[-1] == "~"]

    document_gen = document_generator(source_file_path_list)

    # One user-id space shared across the whole dataset.
    user_name_set,\
    within_dataset_user_anonymize = calculate_within_dataset_user_anonymization(document_gen,
                                                                                comment_generator,
                                                                                extract_user_name)

    # Re-list the folder, this time sorted, for the main pass.
    file_name_list = os.listdir(input_data_folder)
    source_file_path_list = sorted([input_data_folder + "/" + file_name for file_name in file_name_list if not file_name[-1] == "~"])

    ####################################################################################################################
    # Iterate over files and incrementally calculate features.
    ####################################################################################################################
    for document in document_generator(source_file_path_list):
        comment_gen = comment_generator(document=document)

        ################################################################################################################
        # Within-discussion comment and user anonymization.
        ################################################################################################################
        comment_name_set,\
        user_name_set,\
        within_discussion_comment_anonymize,\
        within_discussion_user_anonymize,\
        within_discussion_anonymous_coward = within_discussion_comment_and_user_anonymization(comment_gen=comment_gen,
                                                                                             extract_comment_name=extract_comment_name,
                                                                                             extract_user_name=extract_user_name,
                                                                                             anonymous_coward_name=anonymous_coward_name)

        ################################################################################################################
        # Calculate prediction targets.
        ################################################################################################################
        try:
            target_dict = calculate_targets(document,
                                            comment_name_set,
                                            user_name_set,
                                            within_discussion_anonymous_coward)
        except KeyError as e:
            # Malformed document; skip this discussion.
            continue

        ################################################################################################################
        # Initiate a smart/safe iteration over all comments.
        ################################################################################################################
        try:
            safe_comment_gen = safe_comment_generator(document=document,
                                                      comment_generator=comment_generator,
                                                      within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                                                      extract_comment_name=extract_comment_name,
                                                      extract_parent_comment_name=extract_parent_comment_name,
                                                      extract_timestamp=extract_timestamp,
                                                      safe=True)
        except TypeError:
            # NOTE(review): this assignment is a dead store — invalid_tree is
            # never read before being reset below; the `continue` alone skips
            # the discussion.
            invalid_tree = True
            continue

        ################################################################################################################
        # Make initial post json.
        ################################################################################################################
        # NOTE(review): unlike the comment loop below, this next() is not
        # guarded; a TypeError from the generator would propagate — confirm
        # intended.
        initial_post = next(safe_comment_gen)

        uniform_json = dict()
        uniform_json["initial_post"] = dict()
        uniform_json["comments"] = list()

        uniform_json["initial_post"]["user_id"] = within_dataset_user_anonymize[extract_user_name(initial_post)]
        uniform_json["initial_post"]["comment_id"] = within_discussion_comment_anonymize[extract_comment_name(initial_post)]
        try:
            uniform_json["initial_post"]["timestamp"] = extract_timestamp(initial_post)
        except TypeError:
            continue
        uniform_json["initial_post"]["targets"] = target_dict

        ################################################################################################################
        # Make comment json list.
        ################################################################################################################
        invalid_tree = False
        while True:
            try:
                comment = next(safe_comment_gen)
            except TypeError:
                invalid_tree = True
                break
            except StopIteration:
                # All comments consumed; the discussion is complete.
                break
            if comment is None:
                # Safe generator signals a broken tree.
                invalid_tree = True
                break

            comment_json = dict()
            # Dataset-wide user id, within-discussion comment id.
            comment_json["user_id"] = within_dataset_user_anonymize[extract_user_name(comment)]
            comment_json["comment_id"] = within_discussion_comment_anonymize[extract_comment_name(comment)]
            try:
                comment_json["timestamp"] = extract_timestamp(comment)
            except TypeError:
                invalid_tree = True
                break
            try:
                parent_comment_id = within_discussion_comment_anonymize[extract_parent_comment_name(comment)]
            except KeyError:
                # Parent not found among the comments: treat the initial
                # post as the parent.
                parent_comment_id = uniform_json["initial_post"]["comment_id"]
            comment_json["parent_comment_id"] = parent_comment_id

            uniform_json["comments"].append(comment_json)

        if invalid_tree:
            continue

        json_to_store = dict()
        json_to_store["uniform_json"] = uniform_json
        yield json_to_store
def anonymize_static_dataset(dataset_name, input_data_folder):
    """Generator: yield anonymized, uniform-format JSON for each discussion.

    Reads every non-backup file in ``input_data_folder``, builds a
    dataset-wide user anonymization map, then for each discussion emits a
    dict with an ``initial_post`` entry (user/comment ids, timestamp,
    prediction targets) and a ``comments`` list. Discussions with a broken
    comment tree or unparsable timestamps are skipped.

    NOTE(review): an earlier, token-identical definition of this function
    exists in this file; this later one shadows it at import time — the
    duplicate should probably be removed.

    :param dataset_name: Either "slashdot" or "barrapunto"; anything else
        raises RuntimeError.
    :param input_data_folder: Folder whose files (except ``*~`` backups) are
        the source documents.
    :yield: ``{"uniform_json": {...}}`` per valid discussion.
    """
    # All extractors come from the slashdot module regardless of dataset;
    # presumably barrapunto shares the same file format — confirm.
    document_generator = slashdot.document_generator
    comment_generator = slashdot.comment_generator
    extract_document_post_name = slashdot.extract_document_post_name  # NOTE(review): unused below.
    extract_user_name = slashdot.extract_user_name
    extract_comment_name = slashdot.extract_comment_name
    calculate_targets = slashdot.calculate_targets
    extract_timestamp = slashdot.extract_timestamp
    extract_parent_comment_name = slashdot.extract_parent_comment_name

    if dataset_name == "slashdot":
        anonymous_coward_name = "Anonymous Coward"
    elif dataset_name == "barrapunto":
        anonymous_coward_name = "pobrecito hablador"  # "Pendejo Sin Nombre"
    else:
        print("Invalid dataset name.")
        raise RuntimeError

    ####################################################################################################################
    # Dataset-wide user anonymization.
    ####################################################################################################################
    # Skip editor backup files ("name~").
    file_name_list = os.listdir(input_data_folder)
    source_file_path_list = [input_data_folder + "/" + file_name for file_name in file_name_list if not file_name[-1] == "~"]

    document_gen = document_generator(source_file_path_list)

    # One user-id space shared across the whole dataset.
    user_name_set,\
    within_dataset_user_anonymize = calculate_within_dataset_user_anonymization(document_gen,
                                                                                comment_generator,
                                                                                extract_user_name)

    # Re-list the folder, this time sorted, for the main pass.
    file_name_list = os.listdir(input_data_folder)
    source_file_path_list = sorted([input_data_folder + "/" + file_name for file_name in file_name_list if not file_name[-1] == "~"])

    ####################################################################################################################
    # Iterate over files and incrementally calculate features.
    ####################################################################################################################
    for document in document_generator(source_file_path_list):
        comment_gen = comment_generator(document=document)

        ################################################################################################################
        # Within-discussion comment and user anonymization.
        ################################################################################################################
        comment_name_set,\
        user_name_set,\
        within_discussion_comment_anonymize,\
        within_discussion_user_anonymize,\
        within_discussion_anonymous_coward = within_discussion_comment_and_user_anonymization(comment_gen=comment_gen,
                                                                                             extract_comment_name=extract_comment_name,
                                                                                             extract_user_name=extract_user_name,
                                                                                             anonymous_coward_name=anonymous_coward_name)

        ################################################################################################################
        # Calculate prediction targets.
        ################################################################################################################
        try:
            target_dict = calculate_targets(document,
                                            comment_name_set,
                                            user_name_set,
                                            within_discussion_anonymous_coward)
        except KeyError as e:
            # Malformed document; skip this discussion.
            continue

        ################################################################################################################
        # Initiate a smart/safe iteration over all comments.
        ################################################################################################################
        try:
            safe_comment_gen = safe_comment_generator(document=document,
                                                      comment_generator=comment_generator,
                                                      within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                                                      extract_comment_name=extract_comment_name,
                                                      extract_parent_comment_name=extract_parent_comment_name,
                                                      extract_timestamp=extract_timestamp,
                                                      safe=True)
        except TypeError:
            # NOTE(review): this assignment is a dead store — invalid_tree is
            # never read before being reset below; the `continue` alone skips
            # the discussion.
            invalid_tree = True
            continue

        ################################################################################################################
        # Make initial post json.
        ################################################################################################################
        # NOTE(review): unlike the comment loop below, this next() is not
        # guarded; a TypeError from the generator would propagate — confirm
        # intended.
        initial_post = next(safe_comment_gen)

        uniform_json = dict()
        uniform_json["initial_post"] = dict()
        uniform_json["comments"] = list()

        uniform_json["initial_post"]["user_id"] = within_dataset_user_anonymize[extract_user_name(initial_post)]
        uniform_json["initial_post"]["comment_id"] = within_discussion_comment_anonymize[extract_comment_name(initial_post)]
        try:
            uniform_json["initial_post"]["timestamp"] = extract_timestamp(initial_post)
        except TypeError:
            continue
        uniform_json["initial_post"]["targets"] = target_dict

        ################################################################################################################
        # Make comment json list.
        ################################################################################################################
        invalid_tree = False
        while True:
            try:
                comment = next(safe_comment_gen)
            except TypeError:
                invalid_tree = True
                break
            except StopIteration:
                # All comments consumed; the discussion is complete.
                break
            if comment is None:
                # Safe generator signals a broken tree.
                invalid_tree = True
                break

            comment_json = dict()
            # Dataset-wide user id, within-discussion comment id.
            comment_json["user_id"] = within_dataset_user_anonymize[extract_user_name(comment)]
            comment_json["comment_id"] = within_discussion_comment_anonymize[extract_comment_name(comment)]
            try:
                comment_json["timestamp"] = extract_timestamp(comment)
            except TypeError:
                invalid_tree = True
                break
            try:
                parent_comment_id = within_discussion_comment_anonymize[extract_parent_comment_name(comment)]
            except KeyError:
                # Parent not found among the comments: treat the initial
                # post as the parent.
                parent_comment_id = uniform_json["initial_post"]["comment_id"]
            comment_json["parent_comment_id"] = parent_comment_id

            uniform_json["comments"].append(comment_json)

        if invalid_tree:
            continue

        json_to_store = dict()
        json_to_store["uniform_json"] = uniform_json
        yield json_to_store