def load_dataset_full(dataset_full_path, target_osn_name,
                      feature_osn_name_list, target_name_list,
                      branching_feature_names_list_dict,
                      usergraph_feature_names_list_dict,
                      temporal_feature_names_list_dict):
    """Load the full feature matrices and raw targets from an HDF5 store.

    Parameters
    ----------
    dataset_full_path : str
        Path to the HDF5 file produced by ``store_dataset_full``.
    target_osn_name : str
        OSN whose raw targets (``y_raw``) are loaded.
    feature_osn_name_list : list
        OSNs whose feature matrices are loaded.
    target_name_list : list
        Names of the target columns to extract from the ``y_raw`` frame.
    branching_feature_names_list_dict : dict
        Per-OSN list selecting (and ordering) the branching feature columns.
    usergraph_feature_names_list_dict : dict
        Per-OSN list selecting the user-graph feature columns.
    temporal_feature_names_list_dict : dict
        Per-OSN list selecting the temporal feature columns.

    Returns
    -------
    dataset_full : dict
        Per-OSN dict with "X_branching", "X_usergraph", "X_temporal" arrays,
        plus ``dataset_full[target_osn_name]["y_raw"]`` mapping each target
        name to a 1-D array.
    index : dict
        Per-OSN list of row labels taken from the branching frame.
    """
    dataset_full = dict()
    dataset_full[target_osn_name] = dict()

    index = dict()

    h5_store = h5_open(dataset_full_path)

    for osn_name in feature_osn_name_list:
        # Fix: initialize the per-OSN sub-dict. The original only created it
        # for target_osn_name, raising KeyError for every other feature OSN.
        dataset_full.setdefault(osn_name, dict())

        df = h5load_from(
            h5_store, "/data/" + osn_name +
            "/X_branching")[branching_feature_names_list_dict[osn_name]]
        index[osn_name] = list(df.index)
        dataset_full[osn_name]["X_branching"] = df.values
        dataset_full[osn_name]["X_usergraph"] = h5load_from(
            h5_store, "/data/" + osn_name +
            "/X_usergraph")[usergraph_feature_names_list_dict[osn_name]].values
        dataset_full[osn_name]["X_temporal"] = h5load_from(
            h5_store, "/data/" + osn_name +
            "/X_temporal")[temporal_feature_names_list_dict[osn_name]].values

    data_frame = h5load_from(h5_store, "/data/" + target_osn_name + "/y_raw")
    dataset_full[target_osn_name]["y_raw"] = dict()
    for target_name in target_name_list:
        dataset_full[target_osn_name]["y_raw"][target_name] = data_frame[
            target_name].values

    h5_close(h5_store)

    return dataset_full, index
def store_dataset_k(dataset_k_path, dataset_k, X_k_min_dict, X_t_next_dict,
                    index):
    """Persist the per-k feature matrices and utility arrays to an HDF5 store.

    Each OSN in ``dataset_k`` gets its three feature matrices written under
    ``/data/<osn>/X_*`` with sorted feature-name columns, plus a
    ``utility_arrays`` frame holding the k-min and t-next arrays.

    NOTE(review): ``index`` is accepted but never used here — presumably kept
    for signature symmetry with the loader; confirm before removing.
    """
    h5_store = h5_open(dataset_k_path)

    # Pair each stored matrix with the getter that yields its column names.
    feature_getters = (("X_branching", get_branching_feature_names),
                       ("X_usergraph", get_usergraph_feature_names),
                       ("X_temporal", get_temporal_feature_names))

    for osn_name in dataset_k.keys():
        osn_group = "/data/" + osn_name

        for matrix_key, names_getter in feature_getters:
            column_names = sorted(list(names_getter(osn_name)))
            frame = pd.DataFrame(dataset_k[osn_name][matrix_key],
                                 columns=column_names)
            h5store_at(h5_store, osn_group + "/" + matrix_key, frame)

        utility_frame = pd.DataFrame({
            "X_k_min_array": X_k_min_dict[osn_name],
            "X_t_next_array": X_t_next_dict[osn_name],
        })
        h5store_at(h5_store, osn_group + "/utility_arrays", utility_frame)

    h5_close(h5_store)
def store_dataset_full(dataset_full_path, dataset_full, index,
                       branching_feature_names_list_dict,
                       usergraph_feature_names_list_dict,
                       temporal_feature_names_list_dict):
    """Write the full per-OSN feature matrices and raw targets to HDF5.

    Matrices are stored under ``/data/<osn>/X_*`` with the caller-supplied
    column lists; raw targets go to ``/data/<osn>/y_raw`` as one frame.

    NOTE(review): ``index`` is accepted but never used in this function;
    confirm with callers before removing it from the signature.
    """
    h5_store = h5_open(dataset_full_path)

    # Associate each matrix key with its per-OSN column-name dictionary.
    column_dicts = (("X_branching", branching_feature_names_list_dict),
                    ("X_usergraph", usergraph_feature_names_list_dict),
                    ("X_temporal", temporal_feature_names_list_dict))

    for osn_name in dataset_full.keys():
        osn_group = "/data/" + osn_name

        for matrix_key, names_dict in column_dicts:
            h5store_at(h5_store,
                       osn_group + "/" + matrix_key,
                       pd.DataFrame(dataset_full[osn_name][matrix_key],
                                    columns=names_dict[osn_name]))

        # Shallow-copy the raw-target mapping into a fresh dict, as before.
        y_raw_dict = dict(dataset_full[osn_name]["y_raw"])

        h5store_at(h5_store, osn_group + "/y_raw", pd.DataFrame(y_raw_dict))

    h5_close(h5_store)
def store_k_evaluation_measures(store_path,
                                k_list,
                                k_evaluation_measures,
                                feature_column_names):
    """Store per-fold k-evaluation measures into ``<store_path>results.h5``.

    ``k_evaluation_measures`` is indexable with five arrays: kendall tau,
    p-value, MSE and top-k Jaccard (each k x folds), and feature importances
    (k x folds x features). Each measure is written per fold under
    ``/data/<measure>/fold<i>`` with ``k_list`` as the frame index.
    """
    number_of_folds = k_evaluation_measures[0].shape[1]

    h5_store = h5_open(store_path + "results.h5")

    # (measure position, HDF5 group name, DataFrame columns, is 3-D?)
    measure_specs = ((0, "kendall_tau", ["kendall_tau"], False),
                     (1, "p_value", ["p_value"], False),
                     (2, "mse", ["mse"], False),
                     (3, "top_k_jaccard", ["jaccard"], False),
                     (4, "feature_importances", feature_column_names, True))

    for fold_index in range(number_of_folds):
        for position, group_name, columns, is_3d in measure_specs:
            measure = k_evaluation_measures[position]
            if is_3d:
                fold_slice = measure[:, fold_index, :]
            else:
                fold_slice = measure[:, fold_index]
            h5store_at(h5_store,
                       "/data/" + group_name + "/fold" + str(fold_index),
                       pd.DataFrame(fold_slice, columns=columns, index=k_list))

    h5_close(h5_store)
def store_dataset_k(dataset_k_path,
                    dataset_k,
                    X_k_min_dict,
                    X_t_next_dict,
                    index):
    """Store per-k feature matrices and utility arrays under ``/data/<osn>``.

    Feature columns are the sorted feature-name sets returned by the
    ``get_*_feature_names`` helpers.

    NOTE(review): the ``index`` argument is never used in this function —
    verify against callers before dropping it.
    """
    store = h5_open(dataset_k_path)

    for osn in dataset_k:
        branching_columns = sorted(get_branching_feature_names(osn))
        h5store_at(store,
                   "/data/" + osn + "/X_branching",
                   pd.DataFrame(dataset_k[osn]["X_branching"],
                                columns=branching_columns))

        usergraph_columns = sorted(get_usergraph_feature_names(osn))
        h5store_at(store,
                   "/data/" + osn + "/X_usergraph",
                   pd.DataFrame(dataset_k[osn]["X_usergraph"],
                                columns=usergraph_columns))

        temporal_columns = sorted(get_temporal_feature_names(osn))
        h5store_at(store,
                   "/data/" + osn + "/X_temporal",
                   pd.DataFrame(dataset_k[osn]["X_temporal"],
                                columns=temporal_columns))

        h5store_at(store,
                   "/data/" + osn + "/utility_arrays",
                   pd.DataFrame({"X_k_min_array": X_k_min_dict[osn],
                                 "X_t_next_array": X_t_next_dict[osn]}))

    h5_close(store)
def load_dataset_k(dataset_k_path,
                   feature_osn_name_list,
                   branching_feature_names_list_dict,
                   usergraph_feature_names_list_dict,
                   temporal_feature_names_list_dict):
    """Load the per-k feature matrices and utility arrays from an HDF5 store.

    Returns ``(dataset_k, X_k_min_dict, X_t_next_dict, index)`` where
    ``index`` maps each OSN to the row labels of its branching frame.
    """
    dataset_k = dict()
    X_k_min_dict = dict()
    X_t_next_dict = dict()
    index = dict()

    store = h5_open(dataset_k_path)

    for osn in feature_osn_name_list:
        group = "/data/" + osn

        branching_frame = h5load_from(store, group + "/X_branching")
        branching_frame = branching_frame[branching_feature_names_list_dict[osn]]
        index[osn] = list(branching_frame.index)

        usergraph_frame = h5load_from(store, group + "/X_usergraph")
        temporal_frame = h5load_from(store, group + "/X_temporal")

        dataset_k[osn] = {
            "X_branching": branching_frame.values,
            "X_usergraph":
                usergraph_frame[usergraph_feature_names_list_dict[osn]].values,
            "X_temporal":
                temporal_frame[temporal_feature_names_list_dict[osn]].values,
        }

        utility_frame = h5load_from(store, group + "/utility_arrays")
        X_k_min_dict[osn] = utility_frame["X_k_min_array"].values
        X_t_next_dict[osn] = utility_frame["X_t_next_array"].values

    h5_close(store)

    return dataset_k, X_k_min_dict, X_t_next_dict, index
def store_dataset_full(dataset_full_path,
                       dataset_full,
                       index,
                       branching_feature_names_list_dict,
                       usergraph_feature_names_list_dict,
                       temporal_feature_names_list_dict):
    """Persist the full dataset (feature matrices + raw targets) to HDF5.

    NOTE(review): ``index`` is unused here — presumably kept to mirror the
    loader's return signature; confirm before removing.
    """
    store = h5_open(dataset_full_path)

    for osn in dataset_full:
        h5store_at(store,
                   "/data/" + osn + "/X_branching",
                   pd.DataFrame(dataset_full[osn]["X_branching"],
                                columns=branching_feature_names_list_dict[osn]))
        h5store_at(store,
                   "/data/" + osn + "/X_usergraph",
                   pd.DataFrame(dataset_full[osn]["X_usergraph"],
                                columns=usergraph_feature_names_list_dict[osn]))
        h5store_at(store,
                   "/data/" + osn + "/X_temporal",
                   pd.DataFrame(dataset_full[osn]["X_temporal"],
                                columns=temporal_feature_names_list_dict[osn]))

        # Copy the target mapping into a fresh dict before framing it.
        targets = {name: values
                   for name, values in dataset_full[osn]["y_raw"].items()}
        h5store_at(store, "/data/" + osn + "/y_raw", pd.DataFrame(targets))

    h5_close(store)
def load_dataset_full(dataset_full_path,
                      target_osn_name,
                      feature_osn_name_list,
                      target_name_list,
                      branching_feature_names_list_dict,
                      usergraph_feature_names_list_dict,
                      temporal_feature_names_list_dict):
    """Load feature matrices for each OSN and raw targets for the target OSN.

    Returns
    -------
    dataset_full : dict
        ``dataset_full[osn]["X_branching"|"X_usergraph"|"X_temporal"]``
        arrays, plus ``dataset_full[target_osn_name]["y_raw"][target_name]``
        arrays for every name in ``target_name_list``.
    index : dict
        Per-OSN row labels of the branching frame.
    """
    dataset_full = dict()
    dataset_full[target_osn_name] = dict()

    index = dict()

    h5_store = h5_open(dataset_full_path)

    for osn_name in feature_osn_name_list:
        # Fix: create the per-OSN sub-dict. The original only initialized the
        # target OSN's entry, so any other feature OSN raised KeyError below.
        if osn_name not in dataset_full:
            dataset_full[osn_name] = dict()

        df = h5load_from(h5_store, "/data/" + osn_name + "/X_branching")[branching_feature_names_list_dict[osn_name]]
        index[osn_name] = list(df.index)
        dataset_full[osn_name]["X_branching"] = df.values
        dataset_full[osn_name]["X_usergraph"] = h5load_from(h5_store, "/data/" + osn_name + "/X_usergraph")[usergraph_feature_names_list_dict[osn_name]].values
        dataset_full[osn_name]["X_temporal"] = h5load_from(h5_store, "/data/" + osn_name + "/X_temporal")[temporal_feature_names_list_dict[osn_name]].values

    data_frame = h5load_from(h5_store, "/data/" + target_osn_name + "/y_raw")
    dataset_full[target_osn_name]["y_raw"] = dict()
    for target_name in target_name_list:
        dataset_full[target_osn_name]["y_raw"][target_name] = data_frame[target_name].values

    h5_close(h5_store)

    return dataset_full, index
# Example #9 (vote count: 0) — separator left over from the scraped source.
def store_k_evaluation_measures(store_path, k_list, k_evaluation_measures,
                                feature_column_names):
    """Store per-fold k-evaluation measures into ``<store_path>results.h5``.

    ``k_evaluation_measures`` is an indexable of five arrays:
    [0] kendall tau, [1] p-value, [2] MSE, [3] top-k Jaccard (each
    k x folds), and [4] feature importances (k x folds x features).

    NOTE(review): ``store_path`` is concatenated directly with "results.h5",
    so it is presumably expected to end with a path separator — confirm.
    """
    number_of_folds = k_evaluation_measures[0].shape[1]

    h5_store = h5_open(store_path + "results.h5")

    for fold_index in range(number_of_folds):
        _store_fold_measures(h5_store, k_evaluation_measures, fold_index,
                             k_list, feature_column_names)

    h5_close(h5_store)


def _store_fold_measures(h5_store, k_evaluation_measures, fold_index, k_list,
                         feature_column_names):
    """Store all five measures of one fold under ``/data/<name>/fold<i>``."""
    # The first four measures are 2-D (k x folds) and share one store shape.
    two_dimensional = (("kendall_tau", 0, ["kendall_tau"]),
                       ("p_value", 1, ["p_value"]),
                       ("mse", 2, ["mse"]),
                       ("top_k_jaccard", 3, ["jaccard"]))
    for group_name, position, columns in two_dimensional:
        data_frame = pd.DataFrame(
            k_evaluation_measures[position][:, fold_index],
            columns=columns,
            index=k_list)
        h5store_at(h5_store,
                   "/data/" + group_name + "/fold" + str(fold_index),
                   data_frame)

    # Feature importances are 3-D: one column per feature for this fold.
    data_frame = pd.DataFrame(k_evaluation_measures[4][:, fold_index, :],
                              columns=feature_column_names,
                              index=k_list)
    h5store_at(h5_store,
               "/data/feature_importances/fold" + str(fold_index),
               data_frame)
def load_dataset_k(dataset_k_path, feature_osn_name_list,
                   branching_feature_names_list_dict,
                   usergraph_feature_names_list_dict,
                   temporal_feature_names_list_dict):
    """Read back the per-k dataset written by ``store_dataset_k``.

    Returns ``(dataset_k, X_k_min_dict, X_t_next_dict, index)``; ``index``
    holds each OSN's row labels taken from the branching frame.
    """
    dataset_k = dict()
    X_k_min_dict = dict()
    X_t_next_dict = dict()
    index = dict()

    h5_store = h5_open(dataset_k_path)

    # Map each matrix key to the dict that selects its feature columns.
    selectors = {"X_branching": branching_feature_names_list_dict,
                 "X_usergraph": usergraph_feature_names_list_dict,
                 "X_temporal": temporal_feature_names_list_dict}

    for osn_name in feature_osn_name_list:
        matrices = dict()
        for matrix_key, names_dict in selectors.items():
            frame = h5load_from(h5_store,
                                "/data/" + osn_name + "/" + matrix_key)
            frame = frame[names_dict[osn_name]]
            if matrix_key == "X_branching":
                # Row labels come from the branching frame, as in the writer.
                index[osn_name] = list(frame.index)
            matrices[matrix_key] = frame.values
        dataset_k[osn_name] = matrices

        utilities = h5load_from(h5_store,
                                "/data/" + osn_name + "/utility_arrays")
        X_k_min_dict[osn_name] = utilities["X_k_min_array"].values
        X_t_next_dict[osn_name] = utilities["X_t_next_array"].values

    h5_close(h5_store)

    return dataset_k, X_k_min_dict, X_t_next_dict, index
# Example #11 (vote count: 0) — separator left over from the scraped source.
def extract_features_static_dataset(dataset_name,
                                    input_data_folder,
                                    output_data_folder):
    """Extract per-comment timestamp and handcrafted features for every
    discussion in a static dataset, writing them to rotating H5 store files.

    Parameters
    ----------
    dataset_name : str
        One of "reddit_news", "slashdot" or "barrapunto"; selects the
        ``anonymized`` accessor functions and the dataset-specific alias
        used for anonymous posters.
    input_data_folder : str
        Folder with the raw source files (names ending in "~" are skipped).
    output_data_folder : str
        Destination for the user-anonymizer pickle and the H5 store files.

    Returns
    -------
    int
        Always 0. Discussions whose comment trees fail validation (broken
        iteration, missing timestamps, graph-update errors) are skipped
        rather than raising.
    """
    if dataset_name in ["reddit_news", "slashdot", "barrapunto"]:
        # All three supported datasets share the "anonymized" accessor suite.
        document_generator = anonymized.document_generator
        comment_generator = anonymized.comment_generator
        extract_document_post_name = anonymized.extract_document_post_name
        extract_user_name = anonymized.extract_user_name
        extract_comment_name = anonymized.extract_comment_name
        calculate_targets = anonymized.calculate_targets
        extract_timestamp = anonymized.extract_timestamp
        extract_parent_comment_name = anonymized.extract_parent_comment_name

        # Each dataset labels anonymous posters differently; reddit_news has
        # no such placeholder account.
        if dataset_name == "reddit_news":
            anonymous_coward_name = None
        elif dataset_name == "slashdot":
            anonymous_coward_name = "Anonymous Coward"
        elif dataset_name == "barrapunto":
            anonymous_coward_name = "pobrecito hablador"  # "Pendejo Sin Nombre"
        else:
            print("Invalid dataset name.")
            raise RuntimeError
    else:
        print("Invalid dataset name.")
        raise RuntimeError

    ####################################################################################################################
    # Dataset-wide user anonymization.
    ####################################################################################################################
    within_dataset_user_anonymizer_filepath = output_data_folder + "/datasetwide/user_anonymizer" + ".pkl"

    # Skip editor backup files (names ending in "~").
    file_name_list = os.listdir(input_data_folder)
    source_file_path_list = [input_data_folder + "/" + file_name for file_name in file_name_list if not file_name[-1] == "~"]
    document_gen = document_generator(source_file_path_list)

    within_dataset_user_anonymize = get_within_dataset_user_anonymization(within_dataset_user_anonymizer_filepath,
                                                                          document_gen,
                                                                          comment_generator,
                                                                          extract_user_name)

    # Re-list and sort; the generator above may have consumed the first pass.
    file_name_list = os.listdir(input_data_folder)
    source_file_path_list = sorted([input_data_folder + "/" + file_name for file_name in file_name_list if not file_name[-1] == "~"])

    ####################################################################################################################
    # Initialize the H5 store files.
    ####################################################################################################################
    # total_counter counts valid discussions; discussion_counter counts those
    # written to the *current* pair of H5 files (reset when files rotate).
    total_counter = 0
    store_file_counter_gen = store_file_counter_generator(0, 1)
    store_file_counter = next(store_file_counter_gen)
    discussion_counter = 0

    timestamp_h5_store_file = h5_open(output_data_folder + "/timestamp_h5_store_file_" + str(store_file_counter) + ".h5")
    handcrafted_features_h5_store_file = h5_open(output_data_folder + "/handcrafted_features_h5_store_file_" + str(store_file_counter) + ".h5")

    ####################################################################################################################
    # Iterate over files and incrementally calculate features.
    ####################################################################################################################
    document_counter = 0
    actual_document_counter = 0
    for document in document_generator(source_file_path_list):
        document_counter += 1
        actual_document_counter += 1
        if actual_document_counter % 500 == 0:
            # Lightweight progress report.
            print("Document no: ", actual_document_counter)

        # Set when any stage below detects a malformed discussion tree; the
        # document is then skipped without storing anything.
        invalid_tree = False

        comment_gen = comment_generator(document=document)

        ################################################################################################################
        # Within-discussion comment and user anonymization.
        ################################################################################################################
        comment_name_set,\
        user_name_set,\
        within_discussion_comment_anonymize,\
        within_discussion_user_anonymize,\
        within_discussion_anonymous_coward = within_discussion_comment_and_user_anonymization(comment_gen=comment_gen,
                                                                                              extract_comment_name=extract_comment_name,
                                                                                              extract_user_name=extract_user_name,
                                                                                              anonymous_coward_name=anonymous_coward_name)

        ################################################################################################################
        # Calculate prediction targets.
        ################################################################################################################
        # Documents lacking the fields needed for target calculation are
        # skipped entirely.
        try:
            target_dict = calculate_targets(document)
        except KeyError as e:
            continue

        ################################################################################################################
        # Initiate a smart/safe iteration over all comments.
        ################################################################################################################
        try:
            safe_comment_gen = safe_comment_generator(document=document,
                                                      comment_generator=comment_generator,
                                                      within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                                                      extract_comment_name=extract_comment_name,
                                                      extract_parent_comment_name=extract_parent_comment_name,
                                                      extract_timestamp=extract_timestamp,
                                                      safe=True)
        except TypeError:
            invalid_tree = True
            continue

        ################################################################################################################
        # Initialize features and intermediate information and structures for incrementally calculating features.
        ################################################################################################################
        # Just get the set.
        handcrafted_feature_names_set = get_handcrafted_feature_names(dataset_name)

        # The first yielded item is the initial post (the discussion root).
        try:
            initial_post = next(safe_comment_gen)
        except TypeError:
            invalid_tree = True
            continue
        try:
            timestamp = extract_timestamp(initial_post)
        except TypeError:
            invalid_tree = True
            continue
        op_raw_id = extract_user_name(initial_post)
        op_id = within_discussion_user_anonymize[op_raw_id]
        if op_id == within_discussion_anonymous_coward:
            op_is_anonymous = True
        else:
            op_is_anonymous = False

        comment_counter = 0

        # Arrays are sized for the root post plus all expected comments.
        timestamp_column_names_list,\
        timestamp_array = initialize_timestamp_array(target_dict["comments"] + 1,
                                                     cascade_source_timestamp=timestamp)

        handcrafted_feature_names_list,\
        replicate_feature_if_anonymous_set,\
        handcrafted_function_list,\
        handcrafted_feature_array = initialize_handcrafted_features(target_dict["comments"] + 1,
                                                                    handcrafted_feature_names_set=handcrafted_feature_names_set,
                                                                    op_is_anonymous=op_is_anonymous)

        intermediate_dict = initialize_intermediate(comment_name_set,
                                                    user_name_set,
                                                    timestamp,
                                                    within_discussion_anonymous_coward,
                                                    op_is_anonymous=op_is_anonymous)

        # Sparse adjacency structures: the reply tree over comments and the
        # interaction graph over users.
        discussion_tree = spsp.dok_matrix((len(comment_name_set),
                                           len(comment_name_set)),
                                          dtype=np.int8)

        user_graph = spsp.dok_matrix((len(user_name_set),
                                      len(user_name_set)),
                                     dtype=np.int32)

        while True:
            try:
                comment = next(safe_comment_gen)
            except TypeError:
                invalid_tree = True
                break
            except StopIteration:
                break
            if comment is None:
                invalid_tree = True
                break

            comment_counter += 1

            ############################################################################################################
            # Update discussion radial tree and user graph.
            ############################################################################################################
            commenter_name = extract_user_name(comment)
            if commenter_name is None:
                commenter_is_anonymous = True
            else:
                commenter_is_anonymous = False

            try:
                discussion_tree,\
                user_graph,\
                comment_id,\
                parent_comment_id,\
                commenter_id,\
                parent_commenter_id,\
                user_graph_modified,\
                parent_commenter_is_anonymous,\
                comment_id_to_user_id = update_discussion_and_user_graphs(comment=comment,
                                                                          extract_comment_name=extract_comment_name,
                                                                          extract_parent_comment_name=extract_parent_comment_name,
                                                                          extract_user_name=extract_user_name,
                                                                          discussion_tree=discussion_tree,
                                                                          user_graph=user_graph,
                                                                          within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                                                                          within_discussion_user_anonymize=within_discussion_user_anonymize,
                                                                          within_discussion_anonymous_coward=within_discussion_anonymous_coward,
                                                                          comment_id_to_user_id=intermediate_dict["comment_id_to_user_id"])
                intermediate_dict["comment_id_to_user_id"] = comment_id_to_user_id
            except RuntimeError:
                invalid_tree = True
                break

            ############################################################################################################
            # Update intermediate information and structures for incrementally calculating features.
            ############################################################################################################
            try:
                timestamp = extract_timestamp(comment)
            except TypeError:
                invalid_tree = True
                break
            update_timestamp_array(timestamp_column_names_list,
                                   timestamp_array,
                                   timestamp,
                                   comment_counter)
            # NOTE(review): column 1 appears to hold the numeric timestamp
            # written by update_timestamp_array — confirm in its definition.
            timestamp_difference = timestamp_array[comment_counter, 1] - timestamp_array[comment_counter-1, 1]

            try:
                intermediate_dict,\
                comment_depth = update_intermediate(discussion_tree,
                                                    user_graph,
                                                    intermediate_dict,
                                                    commenter_is_anonymous,
                                                    parent_commenter_is_anonymous,
                                                    comment_id,
                                                    parent_comment_id,
                                                    commenter_id,
                                                    parent_commenter_id,
                                                    user_graph_modified,
                                                    timestamp,
                                                    timestamp_difference)
            except RuntimeError:
                invalid_tree = True
                break

            ############################################################################################################
            # Incrementally calculate discussion features.
            ############################################################################################################
            update_handcrafted_features(handcrafted_feature_names_list,
                                        replicate_feature_if_anonymous_set,
                                        handcrafted_function_list,
                                        handcrafted_feature_array,
                                        comment_counter,
                                        intermediate_dict,
                                        commenter_is_anonymous)

        if invalid_tree:
            continue
        else:
            total_counter += 1
            # Rotate to a fresh pair of H5 files once the current pair holds
            # the maximum recommended number of children.
            if discussion_counter == get_h5_max_recommended_number_of_children():
                h5_close(timestamp_h5_store_file)
                h5_close(handcrafted_features_h5_store_file)

                store_file_counter = next(store_file_counter_gen)
                discussion_counter = 0

                timestamp_h5_store_file = h5_open(output_data_folder + "/timestamp_h5_store_file_" + str(store_file_counter) + ".h5")
                handcrafted_features_h5_store_file = h5_open(output_data_folder + "/handcrafted_features_h5_store_file_" + str(store_file_counter) + ".h5")

                # timestamp_h5_store_file_keys = set(timestamp_h5_store_file.keys())
                # handcrafted_features_h5_store_file_keys = set(handcrafted_features_h5_store_file.keys())

            store_features(timestamp_h5_store_file,
                           handcrafted_features_h5_store_file,
                           document,
                           extract_document_post_name(document),
                           target_dict,
                           comment_counter,
                           timestamp_array,
                           timestamp_column_names_list,
                           handcrafted_feature_array,
                           handcrafted_feature_names_list)
            discussion_counter += 1
    print(total_counter)
    h5_close(timestamp_h5_store_file)
    h5_close(handcrafted_features_h5_store_file)

    return 0