Exemplo n.º 1
0
def make_discussion_json(document, timestamp_df, handcrafted_df, lifetime_list,
                         anonymous_coward_name):
    """Build a dict of graph snapshots describing how one discussion grows.

    The comments of ``document`` are replayed in the order produced by
    ``safe_comment_generator``. The comment tree and the user graph are
    updated one comment at a time, and exactly one snapshot is appended to
    ``"graph_snapshots"`` for every entry of ``lifetime_list``.

    Args:
        document: Raw discussion document; passed through to the extraction
            helpers.
        timestamp_df: Not used by this function; kept so existing callers
            keep working.
        handcrafted_df: Object indexed positionally with
            ``.iloc[comment_counter]`` to obtain the feature row stored with
            a snapshot (presumably a pandas DataFrame with one row per
            comment -- TODO confirm against the caller).
        lifetime_list: Iterable of lifetime thresholds. A snapshot is taken
            as soon as the elapsed time since the initial post reaches the
            current threshold, or immediately if no comments are left.
        anonymous_coward_name: Display name the dataset uses for anonymous
            users; forwarded to the within-discussion anonymization step.

    Returns:
        A dict with the keys ``"post_url"``, ``"post_title"``,
        ``"graph_snapshots"``, ``"prediction_targets"`` and
        ``"post_timestamp"``, or ``None`` when the discussion cannot be
        processed: target calculation raises ``KeyError``, the safe comment
        generator raises ``TypeError`` or yields ``None``, the discussion has
        no initial post, a timestamp cannot be extracted, or a graph /
        intermediate update raises ``RuntimeError``.
    """
    discussion_json = {
        "post_url": get_post_url(document),
        "post_title": get_post_title(document),
        "graph_snapshots": [],
    }

    comment_gen = comment_generator(document=document)

    (comment_name_set,
     user_name_set,
     within_discussion_comment_anonymize,
     within_discussion_user_anonymize,
     within_discussion_anonymous_coward) = within_discussion_comment_and_user_anonymization(
         comment_gen=comment_gen,
         extract_comment_name=extract_comment_name,
         extract_user_name=extract_user_name,
         anonymous_coward_name=anonymous_coward_name)

    try:
        discussion_json["prediction_targets"] = calculate_targets(
            document, comment_name_set, user_name_set,
            within_discussion_anonymous_coward)
    except KeyError:
        return None

    try:
        safe_comment_gen = safe_comment_generator(
            document=document,
            comment_generator=comment_generator,
            within_discussion_comment_anonymize=within_discussion_comment_anonymize,
            extract_comment_name=extract_comment_name,
            extract_parent_comment_name=extract_parent_comment_name,
            extract_timestamp=extract_timestamp,
            safe=True)
    except TypeError:
        return None

    # The first item is treated as the initial post. An exhausted generator
    # (StopIteration) means there is nothing to process, so it is reported
    # like every other malformed-discussion case instead of leaking out.
    try:
        initial_post = next(safe_comment_gen)
    except (TypeError, StopIteration):
        return None
    try:
        timestamp = extract_timestamp(initial_post)
    except TypeError:
        return None

    op_raw_id = extract_user_name(initial_post)
    op_id = within_discussion_user_anonymize[op_raw_id]
    op_is_anonymous = bool(op_id == within_discussion_anonymous_coward)

    # Index 0 stands for the initial post; comments are numbered from 1.
    comment_counter = 0

    (timestamp_column_names_list,
     timestamp_array) = initialize_timestamp_array(
         discussion_json["prediction_targets"]["comments"] + 1,
         cascade_source_timestamp=timestamp)

    intermediate_dict = initialize_intermediate(
        comment_name_set,
        user_name_set,
        timestamp,
        within_discussion_anonymous_coward,
        op_is_anonymous=op_is_anonymous)

    comment_tree = spsp.dok_matrix(
        (len(comment_name_set), len(comment_name_set)), dtype=np.int8)
    user_graph = spsp.dok_matrix(
        (len(user_name_set), len(user_name_set)), dtype=np.int32)

    current_lifetime = 0.0

    for lifetime in lifetime_list:
        # Consume comments until the elapsed time reaches this threshold or
        # the comments run out; either way one snapshot follows.
        while True:
            try:
                comment = next(safe_comment_gen)
            except TypeError:
                return None
            except StopIteration:
                break
            if comment is None:
                return None

            comment_counter += 1

            commenter_is_anonymous = extract_user_name(comment) is None

            try:
                (discussion_tree,
                 user_graph,
                 comment_id,
                 parent_comment_id,
                 commenter_id,
                 parent_commenter_id,
                 user_graph_modified,
                 parent_commenter_is_anonymous,
                 comment_id_to_user_id) = update_discussion_and_user_graphs(
                     comment=comment,
                     extract_comment_name=extract_comment_name,
                     extract_parent_comment_name=extract_parent_comment_name,
                     extract_user_name=extract_user_name,
                     discussion_tree=comment_tree,
                     user_graph=user_graph,
                     within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                     within_discussion_user_anonymize=within_discussion_user_anonymize,
                     within_discussion_anonymous_coward=within_discussion_anonymous_coward,
                     comment_id_to_user_id=intermediate_dict["comment_id_to_user_id"])
            except RuntimeError:
                return None
            intermediate_dict["comment_id_to_user_id"] = comment_id_to_user_id

            try:
                timestamp = extract_timestamp(comment)
            except TypeError:
                return None

            update_timestamp_array(timestamp_column_names_list,
                                   timestamp_array, timestamp, comment_counter)
            timestamp_difference = (timestamp_array[comment_counter, 1]
                                    - timestamp_array[comment_counter - 1, 1])

            try:
                intermediate_dict, _comment_depth = update_intermediate(
                    discussion_tree,
                    user_graph,
                    intermediate_dict,
                    commenter_is_anonymous,
                    parent_commenter_is_anonymous,
                    comment_id,
                    parent_comment_id,
                    commenter_id,
                    parent_commenter_id,
                    user_graph_modified,
                    timestamp,
                    timestamp_difference)
            except RuntimeError:
                return None

            # Elapsed time between the initial post (row 0) and this comment.
            current_lifetime = (timestamp_array[comment_counter, 1]
                                - timestamp_array[0, 1])
            if current_lifetime >= lifetime:
                break

        # NOTE(review): the snapshot reads ``comment_tree`` while the update
        # step returns ``discussion_tree``; presumably the same matrix is
        # modified in place -- confirm.
        handcrafted_df_row = handcrafted_df.iloc[comment_counter]
        time_step_json = make_time_step_json(
            current_lifetime, comment_tree, user_graph,
            timestamp_array[comment_counter, 1], handcrafted_df_row)
        discussion_json["graph_snapshots"].append(time_step_json)

    discussion_json["post_timestamp"] = timestamp_array[0, 1]

    return discussion_json
def make_discussion_json(document, timestamp_df, handcrafted_df, lifetime_list,
                         anonymous_coward_name):
    """Build a dict of graph snapshots describing how one discussion grows.

    The comments of ``document`` are replayed in the order produced by
    ``safe_comment_generator``. The comment tree and the user graph are
    updated one comment at a time, and exactly one snapshot is appended to
    ``"graph_snapshots"`` for every entry of ``lifetime_list``.

    Args:
        document: Raw discussion document; passed through to the extraction
            helpers.
        timestamp_df: Not used by this function; kept so existing callers
            keep working.
        handcrafted_df: Object indexed positionally with
            ``.iloc[comment_counter]`` to obtain the feature row stored with
            a snapshot (presumably a pandas DataFrame with one row per
            comment -- TODO confirm against the caller).
        lifetime_list: Iterable of lifetime thresholds. A snapshot is taken
            as soon as the elapsed time since the initial post reaches the
            current threshold, or immediately if no comments are left.
        anonymous_coward_name: Display name the dataset uses for anonymous
            users; forwarded to the within-discussion anonymization step.

    Returns:
        A dict with the keys ``"post_url"``, ``"post_title"``,
        ``"graph_snapshots"``, ``"prediction_targets"`` and
        ``"post_timestamp"``, or ``None`` when the discussion cannot be
        processed: target calculation raises ``KeyError``, the safe comment
        generator raises ``TypeError`` or yields ``None``, the discussion has
        no initial post, a timestamp cannot be extracted, or a graph /
        intermediate update raises ``RuntimeError``.
    """
    discussion_json = {
        "post_url": get_post_url(document),
        "post_title": get_post_title(document),
        "graph_snapshots": [],
    }

    comment_gen = comment_generator(document=document)

    (comment_name_set,
     user_name_set,
     within_discussion_comment_anonymize,
     within_discussion_user_anonymize,
     within_discussion_anonymous_coward) = within_discussion_comment_and_user_anonymization(
         comment_gen=comment_gen,
         extract_comment_name=extract_comment_name,
         extract_user_name=extract_user_name,
         anonymous_coward_name=anonymous_coward_name)

    try:
        discussion_json["prediction_targets"] = calculate_targets(
            document, comment_name_set, user_name_set,
            within_discussion_anonymous_coward)
    except KeyError:
        return None

    try:
        safe_comment_gen = safe_comment_generator(
            document=document,
            comment_generator=comment_generator,
            within_discussion_comment_anonymize=within_discussion_comment_anonymize,
            extract_comment_name=extract_comment_name,
            extract_parent_comment_name=extract_parent_comment_name,
            extract_timestamp=extract_timestamp,
            safe=True)
    except TypeError:
        return None

    # The first item is treated as the initial post. An exhausted generator
    # (StopIteration) means there is nothing to process, so it is reported
    # like every other malformed-discussion case instead of leaking out.
    try:
        initial_post = next(safe_comment_gen)
    except (TypeError, StopIteration):
        return None
    try:
        timestamp = extract_timestamp(initial_post)
    except TypeError:
        return None

    op_raw_id = extract_user_name(initial_post)
    op_id = within_discussion_user_anonymize[op_raw_id]
    op_is_anonymous = bool(op_id == within_discussion_anonymous_coward)

    # Index 0 stands for the initial post; comments are numbered from 1.
    comment_counter = 0

    (timestamp_column_names_list,
     timestamp_array) = initialize_timestamp_array(
         discussion_json["prediction_targets"]["comments"] + 1,
         cascade_source_timestamp=timestamp)

    intermediate_dict = initialize_intermediate(
        comment_name_set,
        user_name_set,
        timestamp,
        within_discussion_anonymous_coward,
        op_is_anonymous=op_is_anonymous)

    comment_tree = spsp.dok_matrix(
        (len(comment_name_set), len(comment_name_set)), dtype=np.int8)
    user_graph = spsp.dok_matrix(
        (len(user_name_set), len(user_name_set)), dtype=np.int32)

    current_lifetime = 0.0

    for lifetime in lifetime_list:
        # Consume comments until the elapsed time reaches this threshold or
        # the comments run out; either way one snapshot follows.
        while True:
            try:
                comment = next(safe_comment_gen)
            except TypeError:
                return None
            except StopIteration:
                break
            if comment is None:
                return None

            comment_counter += 1

            commenter_is_anonymous = extract_user_name(comment) is None

            try:
                (discussion_tree,
                 user_graph,
                 comment_id,
                 parent_comment_id,
                 commenter_id,
                 parent_commenter_id,
                 user_graph_modified,
                 parent_commenter_is_anonymous,
                 comment_id_to_user_id) = update_discussion_and_user_graphs(
                     comment=comment,
                     extract_comment_name=extract_comment_name,
                     extract_parent_comment_name=extract_parent_comment_name,
                     extract_user_name=extract_user_name,
                     discussion_tree=comment_tree,
                     user_graph=user_graph,
                     within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                     within_discussion_user_anonymize=within_discussion_user_anonymize,
                     within_discussion_anonymous_coward=within_discussion_anonymous_coward,
                     comment_id_to_user_id=intermediate_dict["comment_id_to_user_id"])
            except RuntimeError:
                return None
            intermediate_dict["comment_id_to_user_id"] = comment_id_to_user_id

            try:
                timestamp = extract_timestamp(comment)
            except TypeError:
                return None

            update_timestamp_array(timestamp_column_names_list,
                                   timestamp_array, timestamp, comment_counter)
            timestamp_difference = (timestamp_array[comment_counter, 1]
                                    - timestamp_array[comment_counter - 1, 1])

            try:
                intermediate_dict, _comment_depth = update_intermediate(
                    discussion_tree,
                    user_graph,
                    intermediate_dict,
                    commenter_is_anonymous,
                    parent_commenter_is_anonymous,
                    comment_id,
                    parent_comment_id,
                    commenter_id,
                    parent_commenter_id,
                    user_graph_modified,
                    timestamp,
                    timestamp_difference)
            except RuntimeError:
                return None

            # Elapsed time between the initial post (row 0) and this comment.
            current_lifetime = (timestamp_array[comment_counter, 1]
                                - timestamp_array[0, 1])
            if current_lifetime >= lifetime:
                break

        # NOTE(review): the snapshot reads ``comment_tree`` while the update
        # step returns ``discussion_tree``; presumably the same matrix is
        # modified in place -- confirm.
        handcrafted_df_row = handcrafted_df.iloc[comment_counter]
        time_step_json = make_time_step_json(
            current_lifetime, comment_tree, user_graph,
            timestamp_array[comment_counter, 1], handcrafted_df_row)
        discussion_json["graph_snapshots"].append(time_step_json)

    discussion_json["post_timestamp"] = timestamp_array[0, 1]

    return discussion_json
Exemplo n.º 3
0
def anonymize_static_dataset(dataset_name, input_data_folder):
    """Yield one anonymized, uniformly structured dict per valid discussion.

    Every file in ``input_data_folder`` whose name does not end in ``"~"`` is
    read twice: a first pass builds the dataset-wide user anonymization
    mapping, and a second pass (over the sorted file paths) converts each
    discussion. Discussions that cannot be processed are skipped silently.

    Args:
        dataset_name: ``"slashdot"`` or ``"barrapunto"``; selects the name
            the dataset uses for anonymous users.
        input_data_folder: Directory containing the raw discussion files.

    Yields:
        dict: ``{"uniform_json": uniform_json}`` where ``uniform_json`` has
        an ``"initial_post"`` dict (``"user_id"``, ``"comment_id"``,
        ``"timestamp"``, ``"targets"``) and a ``"comments"`` list of dicts
        (``"user_id"``, ``"comment_id"``, ``"timestamp"``,
        ``"parent_comment_id"``). User ids come from the dataset-wide
        mapping; comment ids come from the within-discussion mapping.

    Raises:
        RuntimeError: If ``dataset_name`` is not recognized. Because this is
            a generator function, the error surfaces on the first iteration,
            not at call time.
    """
    if dataset_name == "slashdot":
        anonymous_coward_name = "Anonymous Coward"
    elif dataset_name == "barrapunto":
        anonymous_coward_name = "pobrecito hablador"  # "Pendejo Sin Nombre"
    else:
        print("Invalid dataset name.")
        raise RuntimeError("Invalid dataset name.")

    # NOTE(review): both datasets are parsed with the ``slashdot`` helpers;
    # presumably barrapunto shares the same document format -- confirm.
    document_generator = slashdot.document_generator
    comment_generator = slashdot.comment_generator
    extract_user_name = slashdot.extract_user_name
    extract_comment_name = slashdot.extract_comment_name
    calculate_targets = slashdot.calculate_targets
    extract_timestamp = slashdot.extract_timestamp
    extract_parent_comment_name = slashdot.extract_parent_comment_name

    ####################################################################################################################
    # Dataset-wide user anonymization.
    ####################################################################################################################
    # Names ending in "~" are skipped (editor backup files, presumably).
    source_file_path_list = [
        input_data_folder + "/" + file_name
        for file_name in os.listdir(input_data_folder)
        if not file_name.endswith("~")
    ]
    # The first pass keeps directory-listing order; the second pass is sorted.
    sorted_source_file_path_list = sorted(source_file_path_list)

    _, within_dataset_user_anonymize = calculate_within_dataset_user_anonymization(
        document_generator(source_file_path_list),
        comment_generator,
        extract_user_name)

    ####################################################################################################################
    # Iterate over files and convert each discussion.
    ####################################################################################################################
    for document in document_generator(sorted_source_file_path_list):
        comment_gen = comment_generator(document=document)

        # Within-discussion comment and user anonymization.
        (comment_name_set,
         user_name_set,
         within_discussion_comment_anonymize,
         _within_discussion_user_anonymize,
         within_discussion_anonymous_coward) = within_discussion_comment_and_user_anonymization(
             comment_gen=comment_gen,
             extract_comment_name=extract_comment_name,
             extract_user_name=extract_user_name,
             anonymous_coward_name=anonymous_coward_name)

        # Calculate prediction targets.
        try:
            target_dict = calculate_targets(
                document, comment_name_set, user_name_set,
                within_discussion_anonymous_coward)
        except KeyError:
            continue

        # Initiate a safe iteration over all comments.
        try:
            safe_comment_gen = safe_comment_generator(
                document=document,
                comment_generator=comment_generator,
                within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                extract_comment_name=extract_comment_name,
                extract_parent_comment_name=extract_parent_comment_name,
                extract_timestamp=extract_timestamp,
                safe=True)
        except TypeError:
            continue

        # The first item is treated as the initial post. A StopIteration
        # escaping from next() inside a generator is converted to
        # RuntimeError (PEP 479) and would abort the whole dataset, so an
        # empty or malformed discussion is skipped like the other failures.
        try:
            initial_post = next(safe_comment_gen)
        except (TypeError, StopIteration):
            continue

        # Make initial post json.
        initial_post_json = {
            "user_id": within_dataset_user_anonymize[
                extract_user_name(initial_post)],
            "comment_id": within_discussion_comment_anonymize[
                extract_comment_name(initial_post)],
        }
        try:
            initial_post_json["timestamp"] = extract_timestamp(initial_post)
        except TypeError:
            continue
        initial_post_json["targets"] = target_dict

        uniform_json = {"initial_post": initial_post_json, "comments": []}

        # Make comment json list.
        invalid_tree = False
        while True:
            try:
                comment = next(safe_comment_gen)
            except StopIteration:
                break
            except TypeError:
                invalid_tree = True
                break
            if comment is None:
                invalid_tree = True
                break

            comment_json = {
                "user_id": within_dataset_user_anonymize[
                    extract_user_name(comment)],
                "comment_id": within_discussion_comment_anonymize[
                    extract_comment_name(comment)],
            }
            try:
                comment_json["timestamp"] = extract_timestamp(comment)
            except TypeError:
                invalid_tree = True
                break

            # A parent name missing from the mapping falls back to the
            # initial post's id.
            try:
                parent_comment_id = within_discussion_comment_anonymize[
                    extract_parent_comment_name(comment)]
            except KeyError:
                parent_comment_id = initial_post_json["comment_id"]
            comment_json["parent_comment_id"] = parent_comment_id

            uniform_json["comments"].append(comment_json)

        if invalid_tree:
            continue

        yield {"uniform_json": uniform_json}
def anonymize_static_dataset(dataset_name, input_data_folder):
    """Yield one anonymized, uniformly structured dict per valid discussion.

    Every file in ``input_data_folder`` whose name does not end in ``"~"`` is
    read twice: a first pass builds the dataset-wide user anonymization
    mapping, and a second pass (over the sorted file paths) converts each
    discussion. Discussions that cannot be processed are skipped silently.

    Args:
        dataset_name: ``"slashdot"`` or ``"barrapunto"``; selects the name
            the dataset uses for anonymous users.
        input_data_folder: Directory containing the raw discussion files.

    Yields:
        dict: ``{"uniform_json": uniform_json}`` where ``uniform_json`` has
        an ``"initial_post"`` dict (``"user_id"``, ``"comment_id"``,
        ``"timestamp"``, ``"targets"``) and a ``"comments"`` list of dicts
        (``"user_id"``, ``"comment_id"``, ``"timestamp"``,
        ``"parent_comment_id"``). User ids come from the dataset-wide
        mapping; comment ids come from the within-discussion mapping.

    Raises:
        RuntimeError: If ``dataset_name`` is not recognized. Because this is
            a generator function, the error surfaces on the first iteration,
            not at call time.
    """
    if dataset_name == "slashdot":
        anonymous_coward_name = "Anonymous Coward"
    elif dataset_name == "barrapunto":
        anonymous_coward_name = "pobrecito hablador"  # "Pendejo Sin Nombre"
    else:
        print("Invalid dataset name.")
        raise RuntimeError("Invalid dataset name.")

    # NOTE(review): both datasets are parsed with the ``slashdot`` helpers;
    # presumably barrapunto shares the same document format -- confirm.
    document_generator = slashdot.document_generator
    comment_generator = slashdot.comment_generator
    extract_user_name = slashdot.extract_user_name
    extract_comment_name = slashdot.extract_comment_name
    calculate_targets = slashdot.calculate_targets
    extract_timestamp = slashdot.extract_timestamp
    extract_parent_comment_name = slashdot.extract_parent_comment_name

    ####################################################################################################################
    # Dataset-wide user anonymization.
    ####################################################################################################################
    # Names ending in "~" are skipped (editor backup files, presumably).
    source_file_path_list = [
        input_data_folder + "/" + file_name
        for file_name in os.listdir(input_data_folder)
        if not file_name.endswith("~")
    ]
    # The first pass keeps directory-listing order; the second pass is sorted.
    sorted_source_file_path_list = sorted(source_file_path_list)

    _, within_dataset_user_anonymize = calculate_within_dataset_user_anonymization(
        document_generator(source_file_path_list),
        comment_generator,
        extract_user_name)

    ####################################################################################################################
    # Iterate over files and convert each discussion.
    ####################################################################################################################
    for document in document_generator(sorted_source_file_path_list):
        comment_gen = comment_generator(document=document)

        # Within-discussion comment and user anonymization.
        (comment_name_set,
         user_name_set,
         within_discussion_comment_anonymize,
         _within_discussion_user_anonymize,
         within_discussion_anonymous_coward) = within_discussion_comment_and_user_anonymization(
             comment_gen=comment_gen,
             extract_comment_name=extract_comment_name,
             extract_user_name=extract_user_name,
             anonymous_coward_name=anonymous_coward_name)

        # Calculate prediction targets.
        try:
            target_dict = calculate_targets(
                document, comment_name_set, user_name_set,
                within_discussion_anonymous_coward)
        except KeyError:
            continue

        # Initiate a safe iteration over all comments.
        try:
            safe_comment_gen = safe_comment_generator(
                document=document,
                comment_generator=comment_generator,
                within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                extract_comment_name=extract_comment_name,
                extract_parent_comment_name=extract_parent_comment_name,
                extract_timestamp=extract_timestamp,
                safe=True)
        except TypeError:
            continue

        # The first item is treated as the initial post. A StopIteration
        # escaping from next() inside a generator is converted to
        # RuntimeError (PEP 479) and would abort the whole dataset, so an
        # empty or malformed discussion is skipped like the other failures.
        try:
            initial_post = next(safe_comment_gen)
        except (TypeError, StopIteration):
            continue

        # Make initial post json.
        initial_post_json = {
            "user_id": within_dataset_user_anonymize[
                extract_user_name(initial_post)],
            "comment_id": within_discussion_comment_anonymize[
                extract_comment_name(initial_post)],
        }
        try:
            initial_post_json["timestamp"] = extract_timestamp(initial_post)
        except TypeError:
            continue
        initial_post_json["targets"] = target_dict

        uniform_json = {"initial_post": initial_post_json, "comments": []}

        # Make comment json list.
        invalid_tree = False
        while True:
            try:
                comment = next(safe_comment_gen)
            except StopIteration:
                break
            except TypeError:
                invalid_tree = True
                break
            if comment is None:
                invalid_tree = True
                break

            comment_json = {
                "user_id": within_dataset_user_anonymize[
                    extract_user_name(comment)],
                "comment_id": within_discussion_comment_anonymize[
                    extract_comment_name(comment)],
            }
            try:
                comment_json["timestamp"] = extract_timestamp(comment)
            except TypeError:
                invalid_tree = True
                break

            # A parent name missing from the mapping falls back to the
            # initial post's id.
            try:
                parent_comment_id = within_discussion_comment_anonymize[
                    extract_parent_comment_name(comment)]
            except KeyError:
                parent_comment_id = initial_post_json["comment_id"]
            comment_json["parent_comment_id"] = parent_comment_id

            uniform_json["comments"].append(comment_json)

        if invalid_tree:
            continue

        yield {"uniform_json": uniform_json}