def store_dataset_k(dataset_k_path, dataset_k, X_k_min_dict, X_t_next_dict,
                    index):
    """Write the per-OSN feature matrices and utility arrays of one dataset.

    For every OSN name found in ``dataset_k`` the entries ``"X_branching"``,
    ``"X_usergraph"`` and ``"X_temporal"`` are each wrapped in a
    ``pandas.DataFrame`` whose columns are the sorted feature names reported
    for that OSN, and stored under ``/data/<osn_name>/<entry name>``.  The
    matching entries of ``X_k_min_dict`` and ``X_t_next_dict`` are stored
    together, as columns ``X_k_min_array`` and ``X_t_next_array``, under
    ``/data/<osn_name>/utility_arrays``.

    Args:
        dataset_k_path: Path handed to ``h5_open``.
        dataset_k: Mapping ``osn_name -> {"X_branching": ..., "X_usergraph":
            ..., "X_temporal": ...}``.
        X_k_min_dict: Mapping ``osn_name -> array`` stored as
            ``X_k_min_array``.
        X_t_next_dict: Mapping ``osn_name -> array`` stored as
            ``X_t_next_array``.
        index: Not used by this function; kept so existing callers that pass
            it keep working.
    """
    # (entry name in dataset_k / key suffix in the store, feature-name getter)
    feature_blocks = (
        ("X_branching", get_branching_feature_names),
        ("X_usergraph", get_usergraph_feature_names),
        ("X_temporal", get_temporal_feature_names),
    )

    h5_store = h5_open(dataset_k_path)
    try:
        for osn_name in dataset_k:
            osn_key = "/data/" + osn_name

            for block_name, get_feature_names in feature_blocks:
                h5store_at(
                    h5_store, osn_key + "/" + block_name,
                    pd.DataFrame(dataset_k[osn_name][block_name],
                                 columns=sorted(get_feature_names(osn_name))))

            utility_arrays = {
                "X_k_min_array": X_k_min_dict[osn_name],
                "X_t_next_array": X_t_next_dict[osn_name],
            }
            h5store_at(h5_store, osn_key + "/utility_arrays",
                       pd.DataFrame(utility_arrays))
    finally:
        # Release the store even when a write or a lookup above fails, so a
        # failed run does not leave the handle open.
        h5_close(h5_store)
def store_dataset_k(dataset_k_path,
                    dataset_k,
                    X_k_min_dict,
                    X_t_next_dict,
                    index):
    """Store one dataset's feature matrices and utility arrays per OSN.

    Each OSN name in ``dataset_k`` contributes four entries to the store
    opened from ``dataset_k_path``:

    * ``/data/<osn_name>/X_branching``, ``/data/<osn_name>/X_usergraph`` and
      ``/data/<osn_name>/X_temporal``: the corresponding ``dataset_k``
      entries as ``pandas.DataFrame`` objects whose columns are the sorted
      feature names returned by the matching ``get_*_feature_names`` helper.
    * ``/data/<osn_name>/utility_arrays``: a ``pandas.DataFrame`` with the
      columns ``X_k_min_array`` and ``X_t_next_array`` taken from
      ``X_k_min_dict[osn_name]`` and ``X_t_next_dict[osn_name]``.

    Args:
        dataset_k_path: Path handed to ``h5_open``.
        dataset_k: Mapping from OSN name to a mapping holding the
            ``"X_branching"``, ``"X_usergraph"`` and ``"X_temporal"`` data.
        X_k_min_dict: Mapping from OSN name to the ``X_k_min_array`` values.
        X_t_next_dict: Mapping from OSN name to the ``X_t_next_array`` values.
        index: Unused here; retained for backward compatibility with callers.
    """
    # Pairs of (dataset_k entry / store key suffix, feature-name getter),
    # written in this order for every OSN.
    matrix_specs = (
        ("X_branching", get_branching_feature_names),
        ("X_usergraph", get_usergraph_feature_names),
        ("X_temporal", get_temporal_feature_names),
    )

    h5_store = h5_open(dataset_k_path)
    try:
        for osn_name in dataset_k:
            key_prefix = "/data/" + osn_name
            osn_data = dataset_k[osn_name]

            for matrix_name, name_getter in matrix_specs:
                frame = pd.DataFrame(osn_data[matrix_name],
                                     columns=sorted(name_getter(osn_name)))
                h5store_at(h5_store, key_prefix + "/" + matrix_name, frame)

            utility_frame = pd.DataFrame({
                "X_k_min_array": X_k_min_dict[osn_name],
                "X_t_next_array": X_t_next_dict[osn_name],
            })
            h5store_at(h5_store, key_prefix + "/utility_arrays", utility_frame)
    finally:
        # Always close the store: an exception while writing must not leak
        # the open handle.
        h5_close(h5_store)
def fill_X_handcrafted_k(dataset_k, h5_store_files, h5_keys, offset, k,
                         X_k_min_dict, X_t_next_dict, osn_name):
    """Forward to ``concatenate_features.fill_X_handcrafted_k_actual``.

    The sorted branching, user-graph and temporal feature names for
    ``osn_name`` are looked up here and passed on together with the
    caller's arguments, followed by ``osn_name`` itself.  Whatever the
    delegate returns is discarded; this function returns ``None``.
    """
    branching_names = sorted(get_branching_feature_names(osn_name))
    usergraph_names = sorted(get_usergraph_feature_names(osn_name))
    temporal_names = sorted(get_temporal_feature_names(osn_name))

    concatenate_features.fill_X_handcrafted_k_actual(
        dataset_k,
        h5_store_files,
        h5_keys,
        offset,
        k,
        X_k_min_dict,
        X_t_next_dict,
        branching_names,
        usergraph_names,
        temporal_names,
        osn_name,
    )
def fill_X_handcrafted_k(dataset_k,
                         h5_store_files,
                         h5_keys,
                         offset,
                         k,
                         X_k_min_dict,
                         X_t_next_dict,
                         osn_name):
    """Call ``fill_X_handcrafted_k_actual`` with the feature names of an OSN.

    Resolves the three sorted feature-name lists (branching, user graph,
    temporal) for ``osn_name`` and hands them, after the caller-supplied
    arguments and before ``osn_name``, to
    ``concatenate_features.fill_X_handcrafted_k_actual``.  The delegate's
    return value is not propagated.
    """
    sorted_feature_names = [
        sorted(name_getter(osn_name))
        for name_getter in (get_branching_feature_names,
                            get_usergraph_feature_names,
                            get_temporal_feature_names)
    ]

    concatenate_features.fill_X_handcrafted_k_actual(
        dataset_k, h5_store_files, h5_keys, offset, k,
        X_k_min_dict, X_t_next_dict,
        sorted_feature_names[0],
        sorted_feature_names[1],
        sorted_feature_names[2],
        osn_name)
    def select_dataset(self,
                       feature_osn_name_list,
                       target_osn_name):
        """Pick the working folders and the feature names used per OSN.

        For each name in ``feature_osn_name_list`` a feature family
        (branching, user graph, temporal) gets the names returned by its
        ``get_*_feature_names`` helper when the matching ``self.add_*``
        flag is truthy, and an empty set otherwise.  If
        ``self.experiment_construction_dict`` has a ``"baseline"`` entry
        equal to one of the known baselines, the affected families are then
        replaced by that baseline's fixed feature names.

        ``target_osn_name`` is accepted but not used.

        Returns:
            Tuple ``(uniform_folder, results_folder, branching_feature_dict,
            usergraph_feature_dict, temporal_feature_dict)``.
        """
        uniform_folder = self.data_folder + "/features"
        use_branching = self.add_branching_features
        use_usergraph = self.add_usergraph_features
        use_temporal = self.add_temporal_features
        results_folder = self.data_folder + "/results/" + self.osn_name_focus

        branching_feature_dict = {}
        usergraph_feature_dict = {}
        temporal_feature_dict = {}
        for osn in feature_osn_name_list:
            branching_feature_dict[osn] = (
                get_branching_feature_names(osn_name=osn)
                if use_branching else set())
            usergraph_feature_dict[osn] = (
                get_usergraph_feature_names(osn_name=osn)
                if use_usergraph else set())
            temporal_feature_dict[osn] = (
                get_temporal_feature_names(osn_name=osn)
                if use_temporal else set())

        # (baseline name, branching override, user-graph override); ``None``
        # means that family is left as computed above.
        baseline_overrides = (
            ("comments", ("basic_comment_count",), None),
            ("users", None, ("user_graph_user_count",)),
            ("comments_users",
             ("basic_comment_count",),
             ("user_graph_user_count",)),
            ("simple graph",
             ("basic_comment_count", "basic_max_depth",
              "basic_max_width", "basic_ave_width"),
             ("user_graph_user_count",)),
        )

        construction = self.experiment_construction_dict
        if "baseline" in construction.keys():
            requested = construction["baseline"]
            for name, branching_names, usergraph_names in baseline_overrides:
                if name == requested:
                    for osn in feature_osn_name_list:
                        # A fresh set per OSN so the entries never alias.
                        if branching_names is not None:
                            branching_feature_dict[osn] = set(branching_names)
                        if usergraph_names is not None:
                            usergraph_feature_dict[osn] = set(usergraph_names)
                    break

        return (uniform_folder,
                results_folder,
                branching_feature_dict,
                usergraph_feature_dict,
                temporal_feature_dict)
def make_feature_matrices(features_folder, osn_focus):
    """Build and store the full dataset and one dataset per lifetime ``k``.

    Steps, in order:

    1. Load the list of lifetimes from
       ``<features_folder>/k_list/focus_post.txt``.
    2. Collect the sorted branching, user-graph and temporal feature names
       (and their counts) for ``osn_focus``.
    3. Form the full dataset with ``form_dataset_full`` and store it at
       ``<features_folder>/dataset_full/dataset_full.h5``.
    4. For every ``k`` in the list, form the dataset with ``form_dataset_k``
       (passing ``float(k)``) and store it at
       ``<features_folder>/dataset_k/<osn_focus>_lifetime_<k>_dataset_k.h5``.
       The ``X_k_min`` / ``X_t_next`` dictionaries returned by one iteration
       are the ones fed into the next.

    Args:
        features_folder: Base folder, used as a string path prefix.
        osn_focus: OSN name used as the key of every per-OSN dictionary and
            in the per-``k`` file names.
    """
    # Read comparison lifetimes.
    k_list = load_valid_k_list(
        features_folder + "/k_list/focus_" + "post" + ".txt")

    # Get feature names.
    raw_branching_names = get_branching_feature_names(osn_name=osn_focus)
    raw_usergraph_names = get_usergraph_feature_names(osn_name=osn_focus)
    raw_temporal_names = get_temporal_feature_names(osn_name=osn_focus)

    branching_names_by_osn = {osn_focus: sorted(raw_branching_names)}
    usergraph_names_by_osn = {osn_focus: sorted(raw_usergraph_names)}
    temporal_names_by_osn = {osn_focus: sorted(raw_temporal_names)}

    branching_count_by_osn = {
        osn_focus: len(branching_names_by_osn[osn_focus])}
    usergraph_count_by_osn = {
        osn_focus: len(usergraph_names_by_osn[osn_focus])}
    temporal_count_by_osn = {
        osn_focus: len(temporal_names_by_osn[osn_focus])}

    # Make dataset matrix at time t_{\infty}.
    dataset_full_path = features_folder + "/dataset_full/dataset_full.h5"
    h5_stores_and_keys = get_h5_stores_and_keys(features_folder, "post")
    dataset_size = get_dataset_size(h5_stores_and_keys, "post")

    dataset_full, index = form_dataset_full(
        dataset_size,
        h5_stores_and_keys,
        osn_focus,
        branching_names_by_osn,
        usergraph_names_by_osn,
        temporal_names_by_osn,
        branching_count_by_osn,
        usergraph_count_by_osn,
        temporal_count_by_osn)

    store_dataset_full(dataset_full_path, dataset_full, index,
                       branching_names_by_osn,
                       usergraph_names_by_osn,
                       temporal_names_by_osn)

    # Running state that form_dataset_k receives and hands back each round.
    k_min_by_osn = {osn_focus: np.zeros(dataset_size, dtype=int)}
    t_next_by_osn = {osn_focus: np.zeros(dataset_size, dtype=float)}
    for k in k_list:
        dataset_k, k_min_by_osn, t_next_by_osn, index = form_dataset_k(
            dataset_size,
            h5_stores_and_keys,
            float(k),
            k_min_by_osn,
            t_next_by_osn,
            feature_osn_name_list=[osn_focus])

        path_prefix = features_folder + "/dataset_k/" + osn_focus + "_lifetime_"
        try:
            dataset_k_path = path_prefix + k + "_dataset_k.h5"
        except TypeError:
            # k cannot be concatenated to a string as-is; use its repr.
            dataset_k_path = path_prefix + repr(k) + "_dataset_k.h5"

        store_dataset_k(dataset_k_path, dataset_k, k_min_by_osn,
                        t_next_by_osn, index)
def make_feature_matrices(features_folder,
                          osn_focus):
    """Create and persist the feature matrices for ``osn_focus``.

    The lifetimes ``k`` are loaded from
    ``<features_folder>/k_list/focus_post.txt``.  The full dataset is formed
    by ``form_dataset_full`` from the sorted feature names of ``osn_focus``
    and stored at ``<features_folder>/dataset_full/dataset_full.h5``.  Then,
    for each ``k``, ``form_dataset_k`` is called with ``float(k)`` and its
    dataset is stored at
    ``<features_folder>/dataset_k/<osn_focus>_lifetime_<k>_dataset_k.h5``;
    the ``X_k_min`` and ``X_t_next`` dictionaries it returns replace the
    ones passed to the following iteration.

    Args:
        features_folder: Base folder, concatenated as a string prefix.
        osn_focus: OSN name keying all per-OSN dictionaries built here.
    """
    # Read comparison lifetimes.
    k_list_file_path = features_folder + "/k_list/focus_" + "post" + ".txt"
    k_list = load_valid_k_list(k_list_file_path)

    # Get feature names: raw collections first, then sorted lists and sizes.
    branching_features = get_branching_feature_names(osn_name=osn_focus)
    usergraph_features = get_usergraph_feature_names(osn_name=osn_focus)
    temporal_features = get_temporal_feature_names(osn_name=osn_focus)

    branching_feature_names_list_dict = {osn_focus: sorted(branching_features)}
    usergraph_feature_names_list_dict = {osn_focus: sorted(usergraph_features)}
    temporal_feature_names_list_dict = {osn_focus: sorted(temporal_features)}

    number_of_branching_features_dict = {
        osn_focus: len(branching_feature_names_list_dict[osn_focus])}
    number_of_usergraph_features_dict = {
        osn_focus: len(usergraph_feature_names_list_dict[osn_focus])}
    number_of_temporal_features_dict = {
        osn_focus: len(temporal_feature_names_list_dict[osn_focus])}

    # Make dataset matrix at time t_{\infty}.
    dataset_full_path = features_folder + "/dataset_full/dataset_full.h5"
    h5_stores_and_keys = get_h5_stores_and_keys(features_folder, "post")
    dataset_size = get_dataset_size(h5_stores_and_keys, "post")

    full_result = form_dataset_full(dataset_size,
                                    h5_stores_and_keys,
                                    osn_focus,
                                    branching_feature_names_list_dict,
                                    usergraph_feature_names_list_dict,
                                    temporal_feature_names_list_dict,
                                    number_of_branching_features_dict,
                                    number_of_usergraph_features_dict,
                                    number_of_temporal_features_dict)
    dataset_full, index = full_result

    store_dataset_full(dataset_full_path,
                       dataset_full,
                       index,
                       branching_feature_names_list_dict,
                       usergraph_feature_names_list_dict,
                       temporal_feature_names_list_dict)

    # State carried from one lifetime to the next.
    X_k_min_dict = {osn_focus: np.zeros(dataset_size, dtype=int)}
    X_t_next_dict = {osn_focus: np.zeros(dataset_size, dtype=float)}
    for k in k_list:
        k_result = form_dataset_k(dataset_size,
                                  h5_stores_and_keys,
                                  float(k),
                                  X_k_min_dict,
                                  X_t_next_dict,
                                  feature_osn_name_list=[osn_focus])
        dataset_k, X_k_min_dict, X_t_next_dict, index = k_result

        file_stem = features_folder + "/dataset_k/" + osn_focus + "_lifetime_"
        try:
            dataset_k_path = file_stem + k + "_dataset_k.h5"
        except TypeError:
            # Fall back to repr(k) when k cannot be appended to a string.
            dataset_k_path = file_stem + repr(k) + "_dataset_k.h5"

        store_dataset_k(dataset_k_path,
                        dataset_k,
                        X_k_min_dict,
                        X_t_next_dict,
                        index)
    def select_dataset(self, feature_osn_name_list, target_osn_name):
        """Return the data folders and per-OSN feature-name dictionaries.

        Every OSN in ``feature_osn_name_list`` is mapped, per feature family,
        to the result of the family's ``get_*_feature_names`` helper if the
        corresponding ``self.add_*_features`` flag is truthy, else to an
        empty set.  When ``self.experiment_construction_dict`` contains a
        ``"baseline"`` entry matching ``"comments"``, ``"users"``,
        ``"comments_users"`` or ``"simple graph"``, the branching and/or
        user-graph entries are overwritten with that baseline's fixed names.

        ``target_osn_name`` is not read by this method.

        Returns:
            ``(uniform_folder, results_folder, branching_feature_dict,
            usergraph_feature_dict, temporal_feature_dict)``.
        """
        data_folder = self.data_folder
        with_branching = self.add_branching_features
        with_usergraph = self.add_usergraph_features
        with_temporal = self.add_temporal_features

        uniform_folder = data_folder + "/features"
        results_folder = data_folder + "/results/" + self.osn_name_focus

        branching_feature_dict = {}
        usergraph_feature_dict = {}
        temporal_feature_dict = {}
        for osn in feature_osn_name_list:
            branching_feature_dict[osn] = (
                get_branching_feature_names(osn_name=osn)
                if with_branching else set())
            usergraph_feature_dict[osn] = (
                get_usergraph_feature_names(osn_name=osn)
                if with_usergraph else set())
            temporal_feature_dict[osn] = (
                get_temporal_feature_names(osn_name=osn)
                if with_temporal else set())

        comment_count = ("basic_comment_count",)
        user_count = ("user_graph_user_count",)
        simple_graph = ("basic_comment_count", "basic_max_depth",
                        "basic_max_width", "basic_ave_width")
        # Each row: baseline name, branching names, user-graph names.
        # ``None`` leaves that family untouched.
        baseline_table = (
            ("comments", comment_count, None),
            ("users", None, user_count),
            ("comments_users", comment_count, user_count),
            ("simple graph", simple_graph, user_count),
        )

        if "baseline" in self.experiment_construction_dict.keys():
            chosen = self.experiment_construction_dict["baseline"]
            for label, branching_names, usergraph_names in baseline_table:
                if label == chosen:
                    for osn in feature_osn_name_list:
                        # Build a new set per OSN; entries must not be shared.
                        if branching_names is not None:
                            branching_feature_dict[osn] = set(branching_names)
                        if usergraph_names is not None:
                            usergraph_feature_dict[osn] = set(usergraph_names)
                    break

        return (uniform_folder, results_folder, branching_feature_dict,
                usergraph_feature_dict, temporal_feature_dict)