# Example 1
# 0
def decide_posts_to_keep(raw_data_file_path, anonymous_coward_name):
    """Choose which posts from a raw data file to keep for later processing.

    Every discussion with more than one comment is scored on four prediction
    targets (``comments``, ``users``, ``score_wilson``,
    ``controversiality_wilson``).  Posts are ranked on each target separately
    (rank 1 = largest value), the four ranks are summed, and the combined
    ordering is chunked via ``split_list``; the last post id of each chunk is
    kept.

    :param raw_data_file_path: Path to the raw data file to read.
    :param anonymous_coward_name: User name that marks anonymous posters.
    :return: Set of selected post ids (empty set when no post qualifies).
    """
    # Read raw data.
    document_gen = document_generator([raw_data_file_path])

    post_to_targets = dict()
    for document in document_gen:
        comment_gen = comment_generator(document=document)

        # Within-discussion comment and user anonymization.  Only the name
        # sets and the anonymous-coward id are needed in this function.
        comment_name_set,\
        user_name_set,\
        within_discussion_comment_anonymize,\
        within_discussion_user_anonymize,\
        within_discussion_anonymous_coward = within_discussion_comment_and_user_anonymization(
            comment_gen=comment_gen,
            extract_comment_name=extract_comment_name,
            extract_user_name=extract_user_name,
            anonymous_coward_name=anonymous_coward_name)

        # Calculate prediction targets; skip documents with missing keys.
        try:
            targets = calculate_targets(document,
                                        comment_name_set,
                                        user_name_set,
                                        within_discussion_anonymous_coward)
        except KeyError:
            continue

        # Keep only discussions that attracted more than one comment.
        if targets["comments"] > 1:
            post_to_targets[document["post_id"]] = targets

    # Robustness: nothing qualified, so there is nothing to rank or keep
    # (the np.max calls below would raise on empty arrays).
    if not post_to_targets:
        return set()

    # Materialize parallel arrays; dict preservation of insertion order keeps
    # the four target arrays aligned with post_id_list.
    post_id_list = np.array(list(post_to_targets.keys()))
    target_values = list(post_to_targets.values())
    comments_list = np.array([t["comments"] for t in target_values])
    users_list = np.array([t["users"] for t in target_values])
    score_list = np.array([t["score_wilson"] for t in target_values])
    controversiality_list = np.array([t["controversiality_wilson"]
                                      for t in target_values])

    # Rank according to comments (negation makes rank 1 the largest value).
    comments_rank = rankdata(-comments_list)
    print(np.max(comments_list))

    # Rank according to users.
    users_rank = rankdata(-users_list)
    print(np.max(users_list))

    # Rank according to score_wilson.
    score_rank = rankdata(-score_list)
    print(np.max(score_list))

    # Rank according to controversiality_wilson.
    controversiality_rank = rankdata(-controversiality_list)
    print(np.max(controversiality_list))

    # Rank according to all: a smaller rank sum means a better overall rank.
    all_rank = comments_rank + users_rank + score_rank + controversiality_rank
    i = np.argsort(all_rank)
    # NOTE(review): the reversal puts the posts with the WORST combined rank
    # first — confirm this ordering is intended before relying on it.
    post_id_list_new = post_id_list[i][::-1]

    # Select representatives: split the ordered ids with split_list(..., 500)
    # and keep the last post id of every chunk.
    post_id_chunk_list = [
        chunk[-1] for chunk in split_list(list(post_id_list_new), 500)
    ]

    for post_id in post_id_chunk_list:
        print(post_to_targets[post_id])

    return set(post_id_chunk_list)
def decide_posts_to_keep(raw_data_file_path, anonymous_coward_name):
    """Choose which posts from a raw data file to keep for later processing.

    Discussions with more than one comment are scored on four prediction
    targets (``comments``, ``users``, ``score_wilson``,
    ``controversiality_wilson``).  Each target yields a ranking (rank 1 =
    largest value); the four ranks are summed, the combined ordering is split
    into chunks with ``split_list``, and the last post id of each chunk is
    kept.

    NOTE(review): this file defines ``decide_posts_to_keep`` twice; the later
    definition shadows the earlier one at import time — consider removing one.

    :param raw_data_file_path: Path to the raw data file to read.
    :param anonymous_coward_name: User name that marks anonymous posters.
    :return: Set of selected post ids (empty set when no post qualifies).
    """
    # Read raw data.
    document_gen = document_generator([raw_data_file_path])

    post_to_targets = dict()
    for document in document_gen:
        comment_gen = comment_generator(document=document)

        # Within-discussion comment and user anonymization; only the name
        # sets and the anonymous-coward id are used below.
        comment_name_set,\
        user_name_set,\
        within_discussion_comment_anonymize,\
        within_discussion_user_anonymize,\
        within_discussion_anonymous_coward = within_discussion_comment_and_user_anonymization(
            comment_gen=comment_gen,
            extract_comment_name=extract_comment_name,
            extract_user_name=extract_user_name,
            anonymous_coward_name=anonymous_coward_name)

        # Calculate prediction targets; documents with missing keys are
        # silently skipped.
        try:
            targets = calculate_targets(document,
                                        comment_name_set,
                                        user_name_set,
                                        within_discussion_anonymous_coward)
        except KeyError:
            continue

        # Only discussions with more than one comment are eligible.
        if targets["comments"] > 1:
            post_to_targets[document["post_id"]] = targets

    # Guard against an empty result set; np.max on empty arrays would raise.
    if not post_to_targets:
        return set()

    # Build the id array and one aligned value array per target (dict
    # insertion order keeps them parallel).
    post_id_list = np.array(list(post_to_targets.keys()))
    all_targets = list(post_to_targets.values())
    comments_list = np.array([t["comments"] for t in all_targets])
    users_list = np.array([t["users"] for t in all_targets])
    score_list = np.array([t["score_wilson"] for t in all_targets])
    controversiality_list = np.array([t["controversiality_wilson"]
                                      for t in all_targets])

    # Per-target rankings; negating makes the largest value rank first.
    comments_rank = rankdata(-comments_list)
    print(np.max(comments_list))

    users_rank = rankdata(-users_list)
    print(np.max(users_list))

    score_rank = rankdata(-score_list)
    print(np.max(score_list))

    controversiality_rank = rankdata(-controversiality_list)
    print(np.max(controversiality_list))

    # Combined ranking: a smaller rank sum is better overall.
    all_rank = comments_rank + users_rank + score_rank + controversiality_rank
    i = np.argsort(all_rank)
    # NOTE(review): reversing puts the worst combined ranks first — confirm
    # this ordering is intended.
    post_id_list_new = post_id_list[i][::-1]

    # Keep one representative (the last id) from each split_list chunk.
    post_id_chunk_list = [chunk[-1]
                          for chunk in split_list(list(post_id_list_new), 500)]

    for post_id in post_id_chunk_list:
        print(post_to_targets[post_id])

    return set(post_id_chunk_list)
# Example 3
# 0
def make_discussion_json(document, timestamp_df, handcrafted_df, lifetime_list,
                         anonymous_coward_name):
    """Build a JSON-serializable record of one discussion's growth over time.

    Replays the discussion comment-by-comment and, each time the elapsed
    lifetime reaches the next threshold in ``lifetime_list``, appends a graph
    snapshot (comment tree + user graph + handcrafted feature row) to
    ``discussion_json["graph_snapshots"]``.

    Returns ``None`` whenever the underlying data is malformed: a KeyError
    from ``calculate_targets``, a TypeError from the comment generator or
    timestamp extraction, a ``None`` comment, or a RuntimeError from the
    graph/intermediate updates.

    NOTE(review): ``timestamp_df`` is never read in this body — confirm it is
    still required by the signature.
    """
    discussion_json = dict()

    discussion_json["post_url"] = get_post_url(document)
    discussion_json["post_title"] = get_post_title(document)
    # discussion_json["snapshot_timestamps"] = [repr(float(snapshot_timestamp)) for snapshot_timestamp in lifetime_list]
    discussion_json["graph_snapshots"] = list()

    comment_gen = comment_generator(document=document)

    # Anonymize comment and user names within this discussion.
    comment_name_set,\
    user_name_set,\
    within_discussion_comment_anonymize,\
    within_discussion_user_anonymize,\
    within_discussion_anonymous_coward = within_discussion_comment_and_user_anonymization(comment_gen=comment_gen,
                                                                                          extract_comment_name=extract_comment_name,
                                                                                          extract_user_name=extract_user_name,
                                                                                          anonymous_coward_name=anonymous_coward_name)

    # Prediction targets; a KeyError marks a malformed document.
    try:
        discussion_json["prediction_targets"] = calculate_targets(
            document, comment_name_set, user_name_set,
            within_discussion_anonymous_coward)
    except KeyError as e:
        return None

    # Generator that yields the initial post first, then the comments in a
    # "safe" order (semantics defined by safe_comment_generator; safe=True).
    try:
        safe_comment_gen = safe_comment_generator(
            document=document,
            comment_generator=comment_generator,
            within_discussion_comment_anonymize=
            within_discussion_comment_anonymize,
            extract_comment_name=extract_comment_name,
            extract_parent_comment_name=extract_parent_comment_name,
            extract_timestamp=extract_timestamp,
            safe=True)
    except TypeError:
        return None

    # The first yielded item is the original post itself.
    try:
        initial_post = next(safe_comment_gen)
    except TypeError:
        return None
    try:
        timestamp = extract_timestamp(initial_post)
    except TypeError:
        return None
    op_raw_id = extract_user_name(initial_post)
    op_id = within_discussion_user_anonymize[op_raw_id]
    # The original poster is anonymous when their anonymized id equals the
    # within-discussion anonymous-coward id.
    if op_id == within_discussion_anonymous_coward:
        op_is_anonymous = True
    else:
        op_is_anonymous = False

    comment_counter = 0

    # One timestamp row per comment plus one for the initial post.
    timestamp_column_names_list,\
    timestamp_array = initialize_timestamp_array(discussion_json["prediction_targets"]["comments"] + 1,
                                                 cascade_source_timestamp=timestamp)

    intermediate_dict = initialize_intermediate(
        comment_name_set,
        user_name_set,
        timestamp,
        within_discussion_anonymous_coward,
        op_is_anonymous=op_is_anonymous)

    # Sparse adjacency structures sized to the full discussion.
    comment_tree = spsp.dok_matrix(
        (len(comment_name_set), len(comment_name_set)), dtype=np.int8)

    user_graph = spsp.dok_matrix((len(user_name_set), len(user_name_set)),
                                 dtype=np.int32)

    current_lifetime = 0.0

    # lifetime_list.append(np.inf)
    for lifetime_counter, lifetime in enumerate(lifetime_list):
        # Consume comments until the elapsed lifetime reaches the current
        # threshold, then snapshot the graphs.
        while True:
            try:
                comment = next(safe_comment_gen)
            except TypeError:
                return None
            except StopIteration:
                # Discussion exhausted: snapshot the final state.  Any
                # remaining lifetime thresholds hit this branch again and
                # append the same final snapshot.
                handcrafted_df_row = handcrafted_df.iloc[comment_counter]

                time_step_json = make_time_step_json(
                    current_lifetime, comment_tree, user_graph,
                    timestamp_array[comment_counter, 1], handcrafted_df_row)
                discussion_json["graph_snapshots"].append(time_step_json)
                break
            if comment is None:
                return None

            comment_counter += 1

            commenter_name = extract_user_name(comment)
            if commenter_name is None:
                commenter_is_anonymous = True
            else:
                commenter_is_anonymous = False

            # Grow the comment tree and user graph with this comment.
            try:
                discussion_tree,\
                user_graph,\
                comment_id,\
                parent_comment_id,\
                commenter_id,\
                parent_commenter_id,\
                user_graph_modified,\
                parent_commenter_is_anonymous,\
                comment_id_to_user_id = update_discussion_and_user_graphs(comment=comment,
                                                                          extract_comment_name=extract_comment_name,
                                                                          extract_parent_comment_name=extract_parent_comment_name,
                                                                          extract_user_name=extract_user_name,
                                                                          discussion_tree=comment_tree,
                                                                          user_graph=user_graph,
                                                                          within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                                                                          within_discussion_user_anonymize=within_discussion_user_anonymize,
                                                                          within_discussion_anonymous_coward=within_discussion_anonymous_coward,
                                                                          comment_id_to_user_id=intermediate_dict["comment_id_to_user_id"])
                intermediate_dict[
                    "comment_id_to_user_id"] = comment_id_to_user_id
            except RuntimeError:
                return None

            try:
                timestamp = extract_timestamp(comment)
            except TypeError:
                return None

            # Record the comment's timestamp; column 1 holds the value used
            # for the lifetime arithmetic below.
            update_timestamp_array(timestamp_column_names_list,
                                   timestamp_array, timestamp, comment_counter)
            timestamp_difference = timestamp_array[
                comment_counter, 1] - timestamp_array[comment_counter - 1, 1]

            try:
                intermediate_dict,\
                comment_depth = update_intermediate(discussion_tree,
                                                    user_graph,
                                                    intermediate_dict,
                                                    commenter_is_anonymous,
                                                    parent_commenter_is_anonymous,
                                                    comment_id,
                                                    parent_comment_id,
                                                    commenter_id,
                                                    parent_commenter_id,
                                                    user_graph_modified,
                                                    timestamp,
                                                    timestamp_difference)
            except RuntimeError:
                return None

            # Elapsed time since the initial post.
            current_lifetime = timestamp_array[comment_counter,
                                               1] - timestamp_array[0, 1]
            if current_lifetime >= lifetime:
                # Read features.
                # handcrafted_df_row = handcrafted_df[feature_list]
                handcrafted_df_row = handcrafted_df.iloc[comment_counter]

                time_step_json = make_time_step_json(
                    current_lifetime, comment_tree, user_graph,
                    timestamp_array[comment_counter, 1], handcrafted_df_row)
                discussion_json["graph_snapshots"].append(time_step_json)
                break

    discussion_json["post_timestamp"] = timestamp_array[0, 1]
    # discussion_json["final_comment_tree_size"] = discussion_json["prediction_targets"]["comments"] + 1
    # discussion_json["final_user_graph_size"] = discussion_json["prediction_targets"]["users"]

    return discussion_json
def make_discussion_json(document, timestamp_df, handcrafted_df, lifetime_list, anonymous_coward_name):
    """Build a JSON-serializable record of one discussion's growth over time.

    Replays the discussion comment-by-comment and, each time the elapsed
    lifetime reaches the next threshold in ``lifetime_list``, appends a graph
    snapshot (comment tree + user graph + handcrafted feature row) to
    ``discussion_json["graph_snapshots"]``.

    Returns ``None`` whenever the underlying data is malformed: a KeyError
    from ``calculate_targets``, a TypeError from the comment generator or
    timestamp extraction, a ``None`` comment, or a RuntimeError from the
    graph/intermediate updates.

    NOTE(review): ``timestamp_df`` is never read in this body — confirm it is
    still required by the signature.  This file also defines the function
    twice; the later definition shadows the earlier one at import time.
    """
    discussion_json = dict()

    discussion_json["post_url"] = get_post_url(document)
    discussion_json["post_title"] = get_post_title(document)
    # discussion_json["snapshot_timestamps"] = [repr(float(snapshot_timestamp)) for snapshot_timestamp in lifetime_list]
    discussion_json["graph_snapshots"] = list()

    comment_gen = comment_generator(document=document)

    # Anonymize comment and user names within this discussion.
    comment_name_set,\
    user_name_set,\
    within_discussion_comment_anonymize,\
    within_discussion_user_anonymize,\
    within_discussion_anonymous_coward = within_discussion_comment_and_user_anonymization(comment_gen=comment_gen,
                                                                                          extract_comment_name=extract_comment_name,
                                                                                          extract_user_name=extract_user_name,
                                                                                          anonymous_coward_name=anonymous_coward_name)

    # Prediction targets; a KeyError marks a malformed document.
    try:
        discussion_json["prediction_targets"] = calculate_targets(document,
                                                                  comment_name_set,
                                                                  user_name_set,
                                                                  within_discussion_anonymous_coward)
    except KeyError as e:
        return None

    # Generator that yields the initial post first, then the comments in a
    # "safe" order (semantics defined by safe_comment_generator; safe=True).
    try:
        safe_comment_gen = safe_comment_generator(document=document,
                                                  comment_generator=comment_generator,
                                                  within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                                                  extract_comment_name=extract_comment_name,
                                                  extract_parent_comment_name=extract_parent_comment_name,
                                                  extract_timestamp=extract_timestamp,
                                                  safe=True)
    except TypeError:
        return None

    # The first yielded item is the original post itself.
    try:
        initial_post = next(safe_comment_gen)
    except TypeError:
        return None
    try:
        timestamp = extract_timestamp(initial_post)
    except TypeError:
        return None
    op_raw_id = extract_user_name(initial_post)
    op_id = within_discussion_user_anonymize[op_raw_id]
    # The original poster is anonymous when their anonymized id equals the
    # within-discussion anonymous-coward id.
    if op_id == within_discussion_anonymous_coward:
        op_is_anonymous = True
    else:
        op_is_anonymous = False

    comment_counter = 0

    # One timestamp row per comment plus one for the initial post.
    timestamp_column_names_list,\
    timestamp_array = initialize_timestamp_array(discussion_json["prediction_targets"]["comments"] + 1,
                                                 cascade_source_timestamp=timestamp)

    intermediate_dict = initialize_intermediate(comment_name_set,
                                                user_name_set,
                                                timestamp,
                                                within_discussion_anonymous_coward,
                                                op_is_anonymous=op_is_anonymous)

    # Sparse adjacency structures sized to the full discussion.
    comment_tree = spsp.dok_matrix((len(comment_name_set),
                                    len(comment_name_set)),
                                   dtype=np.int8)

    user_graph = spsp.dok_matrix((len(user_name_set),
                                  len(user_name_set)),
                                 dtype=np.int32)

    current_lifetime = 0.0

    # lifetime_list.append(np.inf)
    for lifetime_counter, lifetime in enumerate(lifetime_list):
        # Consume comments until the elapsed lifetime reaches the current
        # threshold, then snapshot the graphs.
        while True:
            try:
                comment = next(safe_comment_gen)
            except TypeError:
                return None
            except StopIteration:
                # Discussion exhausted: snapshot the final state.  Any
                # remaining lifetime thresholds hit this branch again and
                # append the same final snapshot.
                handcrafted_df_row = handcrafted_df.iloc[comment_counter]

                time_step_json = make_time_step_json(current_lifetime,
                                                     comment_tree,
                                                     user_graph,
                                                     timestamp_array[comment_counter, 1],
                                                     handcrafted_df_row)
                discussion_json["graph_snapshots"].append(time_step_json)
                break
            if comment is None:
                return None

            comment_counter += 1

            commenter_name = extract_user_name(comment)
            if commenter_name is None:
                commenter_is_anonymous = True
            else:
                commenter_is_anonymous = False

            # Grow the comment tree and user graph with this comment.
            try:
                discussion_tree,\
                user_graph,\
                comment_id,\
                parent_comment_id,\
                commenter_id,\
                parent_commenter_id,\
                user_graph_modified,\
                parent_commenter_is_anonymous,\
                comment_id_to_user_id = update_discussion_and_user_graphs(comment=comment,
                                                                          extract_comment_name=extract_comment_name,
                                                                          extract_parent_comment_name=extract_parent_comment_name,
                                                                          extract_user_name=extract_user_name,
                                                                          discussion_tree=comment_tree,
                                                                          user_graph=user_graph,
                                                                          within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                                                                          within_discussion_user_anonymize=within_discussion_user_anonymize,
                                                                          within_discussion_anonymous_coward=within_discussion_anonymous_coward,
                                                                          comment_id_to_user_id=intermediate_dict["comment_id_to_user_id"])
                intermediate_dict["comment_id_to_user_id"] = comment_id_to_user_id
            except RuntimeError:
                return None

            try:
                timestamp = extract_timestamp(comment)
            except TypeError:
                return None

            # Record the comment's timestamp; column 1 holds the value used
            # for the lifetime arithmetic below.
            update_timestamp_array(timestamp_column_names_list,
                                   timestamp_array,
                                   timestamp,
                                   comment_counter)
            timestamp_difference = timestamp_array[comment_counter, 1] - timestamp_array[comment_counter-1, 1]

            try:
                intermediate_dict,\
                comment_depth = update_intermediate(discussion_tree,
                                                    user_graph,
                                                    intermediate_dict,
                                                    commenter_is_anonymous,
                                                    parent_commenter_is_anonymous,
                                                    comment_id,
                                                    parent_comment_id,
                                                    commenter_id,
                                                    parent_commenter_id,
                                                    user_graph_modified,
                                                    timestamp,
                                                    timestamp_difference)
            except RuntimeError:
                return None

            # Elapsed time since the initial post.
            current_lifetime = timestamp_array[comment_counter, 1] - timestamp_array[0, 1]
            if current_lifetime >= lifetime:
                # Read features.
                # handcrafted_df_row = handcrafted_df[feature_list]
                handcrafted_df_row = handcrafted_df.iloc[comment_counter]

                time_step_json = make_time_step_json(current_lifetime,
                                                     comment_tree,
                                                     user_graph,
                                                     timestamp_array[comment_counter, 1],
                                                     handcrafted_df_row)
                discussion_json["graph_snapshots"].append(time_step_json)
                break

    discussion_json["post_timestamp"] = timestamp_array[0, 1]
    # discussion_json["final_comment_tree_size"] = discussion_json["prediction_targets"]["comments"] + 1
    # discussion_json["final_user_graph_size"] = discussion_json["prediction_targets"]["users"]

    return discussion_json