def load_dataset_k(dataset_k_path, feature_osn_name_list, branching_feature_names_list_dict, usergraph_feature_names_list_dict, temporal_feature_names_list_dict):
    """Load the k-based dataset from an HDF5 store.

    For every OSN in `feature_osn_name_list`, reads the branching, usergraph
    and temporal feature matrices (restricted to the requested feature-name
    columns) plus the utility arrays `X_k_min_array` / `X_t_next_array`.

    Returns:
        tuple: (dataset_k, X_k_min_dict, X_t_next_dict, index) where
            dataset_k maps osn_name -> {"X_branching", "X_usergraph",
            "X_temporal"} numpy arrays, the two *_dict map osn_name to the
            utility arrays, and index maps osn_name to the row index labels.
    """
    dataset_k = dict()
    X_k_min_dict = dict()
    X_t_next_dict = dict()
    index = dict()

    h5_store = h5_open(dataset_k_path)
    for osn_name in feature_osn_name_list:
        osn_data = dict()
        dataset_k[osn_name] = osn_data

        branching_frame = h5load_from(h5_store, "/data/" + osn_name + "/X_branching")[branching_feature_names_list_dict[osn_name]]
        index[osn_name] = list(branching_frame.index)
        osn_data["X_branching"] = branching_frame.values

        usergraph_frame = h5load_from(h5_store, "/data/" + osn_name + "/X_usergraph")
        osn_data["X_usergraph"] = usergraph_frame[usergraph_feature_names_list_dict[osn_name]].values

        temporal_frame = h5load_from(h5_store, "/data/" + osn_name + "/X_temporal")
        osn_data["X_temporal"] = temporal_frame[temporal_feature_names_list_dict[osn_name]].values

        utility_frame = h5load_from(h5_store, "/data/" + osn_name + "/utility_arrays")
        X_k_min_dict[osn_name] = utility_frame["X_k_min_array"].values
        X_t_next_dict[osn_name] = utility_frame["X_t_next_array"].values
    h5_close(h5_store)

    return dataset_k, X_k_min_dict, X_t_next_dict, index
def load_dataset_full(dataset_full_path, target_osn_name, feature_osn_name_list, target_name_list, branching_feature_names_list_dict, usergraph_feature_names_list_dict, temporal_feature_names_list_dict):
    """Load the full dataset (features for all OSNs, targets for the target OSN).

    Args:
        dataset_full_path: Path to the HDF5 store.
        target_osn_name: OSN whose raw targets are loaded under "y_raw".
        feature_osn_name_list: OSNs whose feature matrices are loaded.
        target_name_list: Target column names to extract from "y_raw".
        *_feature_names_list_dict: osn_name -> list of feature columns to keep.

    Returns:
        tuple: (dataset_full, index) where dataset_full maps osn_name to its
        feature matrices (and target_osn_name additionally to "y_raw"), and
        index maps osn_name to the branching frame's row index labels.
    """
    dataset_full = dict()
    dataset_full[target_osn_name] = dict()
    index = dict()

    h5_store = h5_open(dataset_full_path)
    for osn_name in feature_osn_name_list:
        # BUG FIX: initialize the per-OSN sub-dict. Previously only
        # dataset_full[target_osn_name] was created, so any other OSN in
        # feature_osn_name_list raised KeyError on the assignments below
        # (cf. load_dataset_k, which initializes inside the loop).
        dataset_full.setdefault(osn_name, dict())

        df = h5load_from(h5_store, "/data/" + osn_name + "/X_branching")[branching_feature_names_list_dict[osn_name]]
        index[osn_name] = list(df.index)
        dataset_full[osn_name]["X_branching"] = df.values
        dataset_full[osn_name]["X_usergraph"] = h5load_from(h5_store, "/data/" + osn_name + "/X_usergraph")[usergraph_feature_names_list_dict[osn_name]].values
        dataset_full[osn_name]["X_temporal"] = h5load_from(h5_store, "/data/" + osn_name + "/X_temporal")[temporal_feature_names_list_dict[osn_name]].values

    # Raw targets are read only for the target OSN.
    data_frame = h5load_from(h5_store, "/data/" + target_osn_name + "/y_raw")
    dataset_full[target_osn_name]["y_raw"] = dict()
    for target_name in target_name_list:
        dataset_full[target_osn_name]["y_raw"][target_name] = data_frame[target_name].values
    h5_close(h5_store)

    return dataset_full, index
def load_dataset_full(dataset_full_path, target_osn_name, feature_osn_name_list, target_name_list, branching_feature_names_list_dict, usergraph_feature_names_list_dict, temporal_feature_names_list_dict):
    """Load the full dataset (features for all OSNs, targets for the target OSN).

    Args:
        dataset_full_path: Path to the HDF5 store.
        target_osn_name: OSN whose raw targets are loaded under "y_raw".
        feature_osn_name_list: OSNs whose feature matrices are loaded.
        target_name_list: Target column names to extract from "y_raw".
        *_feature_names_list_dict: osn_name -> list of feature columns to keep.

    Returns:
        tuple: (dataset_full, index) where dataset_full maps osn_name to its
        feature matrices (and target_osn_name additionally to "y_raw"), and
        index maps osn_name to the branching frame's row index labels.
    """
    dataset_full = dict()
    dataset_full[target_osn_name] = dict()
    index = dict()

    h5_store = h5_open(dataset_full_path)
    for osn_name in feature_osn_name_list:
        # BUG FIX: initialize the per-OSN sub-dict. Previously only
        # dataset_full[target_osn_name] was created, so any other OSN in
        # feature_osn_name_list raised KeyError on the assignments below
        # (cf. load_dataset_k, which initializes inside the loop).
        dataset_full.setdefault(osn_name, dict())

        df = h5load_from(h5_store, "/data/" + osn_name + "/X_branching")[branching_feature_names_list_dict[osn_name]]
        index[osn_name] = list(df.index)
        dataset_full[osn_name]["X_branching"] = df.values
        dataset_full[osn_name]["X_usergraph"] = h5load_from(h5_store, "/data/" + osn_name + "/X_usergraph")[usergraph_feature_names_list_dict[osn_name]].values
        dataset_full[osn_name]["X_temporal"] = h5load_from(h5_store, "/data/" + osn_name + "/X_temporal")[temporal_feature_names_list_dict[osn_name]].values

    # Raw targets are read only for the target OSN.
    data_frame = h5load_from(h5_store, "/data/" + target_osn_name + "/y_raw")
    dataset_full[target_osn_name]["y_raw"] = dict()
    for target_name in target_name_list:
        dataset_full[target_osn_name]["y_raw"][target_name] = data_frame[target_name].values
    h5_close(h5_store)

    return dataset_full, index
def fill_X_handcrafted_full_and_y_raw(dataset_full, h5_store_files, h5_keys, offset, osn_name, target_list, branching_feature_names_list_dict, usergraph_feature_names_list_dict, temporal_feature_names_list_dict, number_of_branching_features_dict, number_of_usergraph_features_dict, number_of_temporal_features_dict):
    """Fill the full-dataset feature rows and raw target values in place.

    For each handcrafted-features key, takes the last snapshot (k index -1)
    of the branching / usergraph / temporal features and writes it into row
    `offset + d` of the corresponding matrix, then records each target value.
    Mutates `dataset_full`; returns None.
    """
    # (matrix key, feature-name dict, feature-count dict) triples drive the
    # three identical fill operations below.
    feature_specs = (
        ("X_branching", branching_feature_names_list_dict, number_of_branching_features_dict),
        ("X_usergraph", usergraph_feature_names_list_dict, number_of_usergraph_features_dict),
        ("X_temporal", temporal_feature_names_list_dict, number_of_temporal_features_dict),
    )

    for d, h5_key in enumerate(h5_keys):
        row = offset + d
        handcrafted_frame = h5load_from(h5_store_files[1], h5_key)

        for matrix_key, names_dict, count_dict in feature_specs:
            last_row = get_kth_row(handcrafted_frame, -1, names_dict[osn_name])
            dataset_full[osn_name][matrix_key][row, :count_dict[osn_name]] = last_row

        for target_name in target_list:
            dataset_full[osn_name]["y_raw"][target_name][row] = get_target_value(handcrafted_frame, target_name)
def fill_X_handcrafted_full_and_y_raw(dataset_full, h5_store_files, h5_keys, offset, osn_name, target_list, branching_feature_names_list_dict, usergraph_feature_names_list_dict, temporal_feature_names_list_dict, number_of_branching_features_dict, number_of_usergraph_features_dict, number_of_temporal_features_dict):
    """Populate full-dataset feature matrices and raw targets, row by row.

    Uses the final snapshot (k index -1) of each handcrafted-features frame.
    Row `offset + d` of every matrix is written in place; returns None.
    """
    branching_names = branching_feature_names_list_dict[osn_name]
    usergraph_names = usergraph_feature_names_list_dict[osn_name]
    temporal_names = temporal_feature_names_list_dict[osn_name]
    n_branching = number_of_branching_features_dict[osn_name]
    n_usergraph = number_of_usergraph_features_dict[osn_name]
    n_temporal = number_of_temporal_features_dict[osn_name]
    osn_data = dataset_full[osn_name]

    for d, h5_key in enumerate(h5_keys):
        row = offset + d
        frame = h5load_from(h5_store_files[1], h5_key)

        osn_data["X_branching"][row, :n_branching] = get_kth_row(frame, -1, branching_names)
        osn_data["X_usergraph"][row, :n_usergraph] = get_kth_row(frame, -1, usergraph_names)
        osn_data["X_temporal"][row, :n_temporal] = get_kth_row(frame, -1, temporal_names)

        for target_name in target_list:
            osn_data["y_raw"][target_name][row] = get_target_value(frame, target_name)
def get_all_comment_lifetimes(h5_stores_and_keys, osn_focus):
    """Collect comment lifetimes across all stores for the focus OSN.

    A comment's lifetime here is its timestamp minus the first (post)
    timestamp in the same frame; the first entry itself is excluded.

    Returns:
        list: all comment lifetimes, concatenated over every store and key.
    """
    lifetimes = []
    for h5_store_files, h5_keys in h5_stores_and_keys:
        for h5_key in h5_keys[osn_focus]:
            timestamps = h5load_from(h5_store_files[0], h5_key)["timestamp"]
            # Offsets of every comment relative to the initial timestamp.
            lifetimes.extend(timestamps.iloc[1:] - timestamps.iloc[0])
    return lifetimes
def load_dataset_k(dataset_k_path, feature_osn_name_list, branching_feature_names_list_dict, usergraph_feature_names_list_dict, temporal_feature_names_list_dict):
    """Read the k-based dataset from the HDF5 store at `dataset_k_path`.

    Returns:
        tuple: (dataset_k, X_k_min_dict, X_t_next_dict, index); see the
        per-OSN keys "X_branching" / "X_usergraph" / "X_temporal" in
        dataset_k, and the utility arrays in the two dicts.
    """
    dataset_k = {}
    X_k_min_dict = {}
    X_t_next_dict = {}
    index = {}

    h5_store = h5_open(dataset_k_path)
    for osn_name in feature_osn_name_list:
        base_key = "/data/" + osn_name
        dataset_k[osn_name] = {}

        branching = h5load_from(h5_store, base_key + "/X_branching")
        branching = branching[branching_feature_names_list_dict[osn_name]]
        index[osn_name] = list(branching.index)
        dataset_k[osn_name]["X_branching"] = branching.values

        usergraph = h5load_from(h5_store, base_key + "/X_usergraph")
        dataset_k[osn_name]["X_usergraph"] = usergraph[usergraph_feature_names_list_dict[osn_name]].values

        temporal = h5load_from(h5_store, base_key + "/X_temporal")
        dataset_k[osn_name]["X_temporal"] = temporal[temporal_feature_names_list_dict[osn_name]].values

        utility = h5load_from(h5_store, base_key + "/utility_arrays")
        X_k_min_dict[osn_name] = utility["X_k_min_array"].values
        X_t_next_dict[osn_name] = utility["X_t_next_array"].values
    h5_close(h5_store)

    return dataset_k, X_k_min_dict, X_t_next_dict, index
def get_all_post_lifetimes(h5_stores_and_keys, osn_focus):
    """Collect one lifetime per post for the focus OSN.

    A post's lifetime is the span from its first timestamp to the timestamp
    at the 99th-percentile position (rounded up) of its comment sequence.

    Returns:
        list: one lifetime value per (store, key) pair.
    """
    lifetimes = []
    for h5_store_files, h5_keys in h5_stores_and_keys:
        for h5_key in h5_keys[osn_focus]:
            timestamps = h5load_from(h5_store_files[0], h5_key)["timestamp"]
            # Single-timestamp posts have zero lifetime; otherwise take the
            # ceil of the 99th-percentile index.
            cutoff = 0 if timestamps.size == 1 else int(np.ceil(0.99 * (timestamps.size - 1)))
            lifetimes.append(timestamps.iloc[cutoff] - timestamps.iloc[0])
    return lifetimes
def calculate_k_based_on_lifetime(dataset_k, h5_store_files, h5_keys, offset, k, X_k_min_dict, X_t_next_dict, osn_name):
    """Advance the per-post observed-comment counters for lifetime threshold `k`.

    For each post key, updates `X_k_min_dict` (observed comment count) and
    `X_t_next_dict` (next lifetime) in place via get_k_based_on_lifetime.
    Posts whose next-lifetime entry is already NaN are skipped. Returns None.
    NOTE(review): `dataset_k` is accepted for signature parity but unused here.
    """
    post_keys = h5_keys["post"]
    k_min_array = X_k_min_dict[osn_name]
    t_next_array = X_t_next_dict[osn_name]

    for d in range(len(post_keys)):
        row = offset + d
        timestamps_frame = h5load_from(h5_store_files[0], post_keys[d])
        if np.isnan(t_next_array[row]):
            # Nothing further to observe for this post.
            continue
        observed_comments, next_lifetime = get_k_based_on_lifetime(
            timestamps_frame, k, min_k=k_min_array[row], max_k=-1)
        k_min_array[row] = observed_comments
        t_next_array[row] = next_lifetime
def fill_X_handcrafted_k_actual(dataset_k, h5_store_files, h5_keys, offset, k, X_k_min_dict, X_t_next_dict, branching_feature_names_list, usergraph_feature_names_list, temporal_feature_names_list, osn_name):
    """Fill the k-based feature matrices at each post's observed-comment index.

    For each key, reads the handcrafted-features frame and writes the row at
    index `X_k_min_dict[osn_name][offset + d]` into the branching, usergraph
    and temporal matrices. Posts whose observed count is -1 get NaN rows.
    Mutates `dataset_k` in place; returns None.
    NOTE(review): `k` and `X_t_next_dict` are accepted but unused here.
    """
    osn_data = dataset_k[osn_name]
    k_min_array = X_k_min_dict[osn_name]

    for d, h5_key in enumerate(h5_keys):
        row = offset + d
        k_min = k_min_array[row]

        if k_min == -1:
            # No valid observation point for this post; blank out every matrix row.
            osn_data["X_branching"][row, :] = np.nan
            osn_data["X_usergraph"][row, :] = np.nan
            osn_data["X_temporal"][row, :] = np.nan
            continue

        frame = h5load_from(h5_store_files[1], h5_key)
        osn_data["X_branching"][row, :] = get_kth_row(frame, k_min, branching_feature_names_list)
        osn_data["X_usergraph"][row, :] = get_kth_row(frame, k_min, usergraph_feature_names_list)
        osn_data["X_temporal"][row, :] = get_kth_row(frame, k_min, temporal_feature_names_list)
def load_k_evaluation_measures(store_path, number_of_folds=10):
    """Load per-fold k-evaluation measures from ``<store_path>results.h5``.

    Args:
        store_path: Directory-style prefix; "results.h5" is appended directly.
        number_of_folds: Number of cross-validation folds to read.

    Returns:
        tuple: (k_list, k_evaluation_measures, feature_names_list) where
        k_evaluation_measures is (kendall_tau_array, p_value_array,
        mean_square_error, top_k_jaccard, feature_importances_array),
        each shaped (number_of_samples, number_of_folds[, features]).

    Raises:
        RuntimeError: if the per-measure key lists disagree in length.
    """
    h5_store = h5_open(store_path + "results.h5")
    try:
        def fold_keys(measure):
            # One HDF5 key per fold, e.g. "/data/kendall_tau/fold0".
            return ["/data/" + measure + "/fold" + str(fold_index)
                    for fold_index in range(number_of_folds)]

        kendall_tau_keys = fold_keys("kendall_tau")
        p_value_keys = fold_keys("p_value")
        mse_keys = fold_keys("mse")
        jaccard_keys = fold_keys("top_k_jaccard")
        feature_importances_keys = fold_keys("feature_importances")

        if (len(kendall_tau_keys) != len(p_value_keys)) or\
                (len(kendall_tau_keys) != len(feature_importances_keys)):
            # BUG FIX: raise an instantiated exception with a message instead
            # of the bare RuntimeError class (previously only printed).
            raise RuntimeError("Fold number different for evaluation measures load.")
        number_of_folds = len(feature_importances_keys)

        # The first feature-importances frame fixes the k values (rows) and
        # feature names (columns) for all preallocated arrays.
        data_frame = h5load_from(h5_store, feature_importances_keys[0])
        k_list = data_frame.index
        number_of_samples = k_list.size
        feature_names_list = data_frame.columns
        number_of_features = len(feature_names_list)

        kendall_tau_array = np.empty((number_of_samples, number_of_folds), dtype=np.float64)
        p_value_array = np.empty((number_of_samples, number_of_folds), dtype=np.float64)
        mean_square_error = np.empty((number_of_samples, number_of_folds), dtype=np.float64)
        top_k_jaccard = np.empty((number_of_samples, number_of_folds), dtype=np.float64)
        feature_importances_array = np.empty(
            (number_of_samples, number_of_folds, number_of_features), dtype=np.float64)

        for f in range(number_of_folds):
            kendall_tau_array[:, f] = np.squeeze(h5load_from(h5_store, kendall_tau_keys[f]).values)
            p_value_array[:, f] = np.squeeze(h5load_from(h5_store, p_value_keys[f]).values)
            mean_square_error[:, f] = np.squeeze(h5load_from(h5_store, mse_keys[f]).values)
            top_k_jaccard[:, f] = np.squeeze(h5load_from(h5_store, jaccard_keys[f]).values)

            feature_importances_values = h5load_from(h5_store, feature_importances_keys[f]).values
            try:
                feature_importances_array[:, f, :] = np.squeeze(feature_importances_values)
            except ValueError:
                # np.squeeze can drop a needed axis when a dimension is 1;
                # fall back to the raw 2-D values in that case.
                feature_importances_array[:, f, :] = feature_importances_values
    finally:
        # BUG FIX: the store handle was previously leaked (never closed);
        # every other loader in this module calls h5_close.
        h5_close(h5_store)

    k_evaluation_measures = (kendall_tau_array,
                             p_value_array,
                             mean_square_error,
                             top_k_jaccard,
                             feature_importances_array)
    return k_list, k_evaluation_measures, feature_names_list
def load_k_evaluation_measures(store_path, number_of_folds=10):
    """Load per-fold k-evaluation measures from ``<store_path>results.h5``.

    Args:
        store_path: Directory-style prefix; "results.h5" is appended directly.
        number_of_folds: Number of cross-validation folds to read.

    Returns:
        tuple: (k_list, k_evaluation_measures, feature_names_list) where
        k_evaluation_measures is (kendall_tau_array, p_value_array,
        mean_square_error, top_k_jaccard, feature_importances_array),
        each shaped (number_of_samples, number_of_folds[, features]).

    Raises:
        RuntimeError: if the per-measure key lists disagree in length.
    """
    h5_store = h5_open(store_path + "results.h5")
    try:
        def fold_keys(measure):
            # One HDF5 key per fold, e.g. "/data/kendall_tau/fold0".
            return ["/data/" + measure + "/fold" + str(fold_index)
                    for fold_index in range(number_of_folds)]

        kendall_tau_keys = fold_keys("kendall_tau")
        p_value_keys = fold_keys("p_value")
        mse_keys = fold_keys("mse")
        jaccard_keys = fold_keys("top_k_jaccard")
        feature_importances_keys = fold_keys("feature_importances")

        if (len(kendall_tau_keys) != len(p_value_keys)) or\
                (len(kendall_tau_keys) != len(feature_importances_keys)):
            # BUG FIX: raise an instantiated exception with a message instead
            # of the bare RuntimeError class (previously only printed).
            raise RuntimeError("Fold number different for evaluation measures load.")
        number_of_folds = len(feature_importances_keys)

        # The first feature-importances frame fixes the k values (rows) and
        # feature names (columns) for all preallocated arrays.
        data_frame = h5load_from(h5_store, feature_importances_keys[0])
        k_list = data_frame.index
        number_of_samples = k_list.size
        feature_names_list = data_frame.columns
        number_of_features = len(feature_names_list)

        kendall_tau_array = np.empty((number_of_samples, number_of_folds), dtype=np.float64)
        p_value_array = np.empty((number_of_samples, number_of_folds), dtype=np.float64)
        mean_square_error = np.empty((number_of_samples, number_of_folds), dtype=np.float64)
        top_k_jaccard = np.empty((number_of_samples, number_of_folds), dtype=np.float64)
        feature_importances_array = np.empty(
            (number_of_samples, number_of_folds, number_of_features), dtype=np.float64)

        for f in range(number_of_folds):
            kendall_tau_array[:, f] = np.squeeze(h5load_from(h5_store, kendall_tau_keys[f]).values)
            p_value_array[:, f] = np.squeeze(h5load_from(h5_store, p_value_keys[f]).values)
            mean_square_error[:, f] = np.squeeze(h5load_from(h5_store, mse_keys[f]).values)
            top_k_jaccard[:, f] = np.squeeze(h5load_from(h5_store, jaccard_keys[f]).values)

            feature_importances_values = h5load_from(h5_store, feature_importances_keys[f]).values
            try:
                feature_importances_array[:, f, :] = np.squeeze(feature_importances_values)
            except ValueError:
                # np.squeeze can drop a needed axis when a dimension is 1;
                # fall back to the raw 2-D values in that case.
                feature_importances_array[:, f, :] = feature_importances_values
    finally:
        # BUG FIX: the store handle was previously leaked (never closed);
        # every other loader in this module calls h5_close.
        h5_close(h5_store)

    k_evaluation_measures = (kendall_tau_array,
                             p_value_array,
                             mean_square_error,
                             top_k_jaccard,
                             feature_importances_array)
    return k_list, k_evaluation_measures, feature_names_list