import os

import numpy as np
import pandas as pd
import scipy.sparse as spsp

# h5_open, h5_close, h5store_at, h5load_from, the get_*_feature_names helpers and
# the `anonymized` reader functions used below are provided by this repository's
# own modules and are assumed to be imported at module level.


def load_dataset_full(dataset_full_path,
                      target_osn_name,
                      feature_osn_name_list,
                      target_name_list,
                      branching_feature_names_list_dict,
                      usergraph_feature_names_list_dict,
                      temporal_feature_names_list_dict):
    dataset_full = dict()
    dataset_full[target_osn_name] = dict()
    index = dict()

    h5_store = h5_open(dataset_full_path)

    for osn_name in feature_osn_name_list:
        # Make sure a sub-dictionary exists for every feature OSN, not only the target one.
        if osn_name not in dataset_full:
            dataset_full[osn_name] = dict()

        df = h5load_from(h5_store,
                         "/data/" + osn_name + "/X_branching")[branching_feature_names_list_dict[osn_name]]
        index[osn_name] = list(df.index)
        dataset_full[osn_name]["X_branching"] = df.values
        dataset_full[osn_name]["X_usergraph"] = h5load_from(
            h5_store,
            "/data/" + osn_name + "/X_usergraph")[usergraph_feature_names_list_dict[osn_name]].values
        dataset_full[osn_name]["X_temporal"] = h5load_from(
            h5_store,
            "/data/" + osn_name + "/X_temporal")[temporal_feature_names_list_dict[osn_name]].values

    data_frame = h5load_from(h5_store, "/data/" + target_osn_name + "/y_raw")
    dataset_full[target_osn_name]["y_raw"] = dict()
    for target_name in target_name_list:
        dataset_full[target_osn_name]["y_raw"][target_name] = data_frame[target_name].values

    h5_close(h5_store)

    return dataset_full, index
def store_dataset_k(dataset_k_path, dataset_k, X_k_min_dict, X_t_next_dict, index):
    h5_store = h5_open(dataset_k_path)

    for osn_name in dataset_k.keys():
        h5store_at(h5_store,
                   "/data/" + osn_name + "/X_branching",
                   pd.DataFrame(dataset_k[osn_name]["X_branching"],
                                columns=sorted(list(get_branching_feature_names(osn_name)))))
        h5store_at(h5_store,
                   "/data/" + osn_name + "/X_usergraph",
                   pd.DataFrame(dataset_k[osn_name]["X_usergraph"],
                                columns=sorted(list(get_usergraph_feature_names(osn_name)))))
        h5store_at(h5_store,
                   "/data/" + osn_name + "/X_temporal",
                   pd.DataFrame(dataset_k[osn_name]["X_temporal"],
                                columns=sorted(list(get_temporal_feature_names(osn_name)))))

        utility_arrays = dict()
        utility_arrays["X_k_min_array"] = X_k_min_dict[osn_name]
        utility_arrays["X_t_next_array"] = X_t_next_dict[osn_name]

        h5store_at(h5_store,
                   "/data/" + osn_name + "/utility_arrays",
                   pd.DataFrame(utility_arrays))

    h5_close(h5_store)
def store_dataset_full(dataset_full_path,
                       dataset_full,
                       index,
                       branching_feature_names_list_dict,
                       usergraph_feature_names_list_dict,
                       temporal_feature_names_list_dict):
    h5_store = h5_open(dataset_full_path)

    for osn_name in dataset_full.keys():
        h5store_at(h5_store,
                   "/data/" + osn_name + "/X_branching",
                   pd.DataFrame(dataset_full[osn_name]["X_branching"],
                                columns=branching_feature_names_list_dict[osn_name]))
        h5store_at(h5_store,
                   "/data/" + osn_name + "/X_usergraph",
                   pd.DataFrame(dataset_full[osn_name]["X_usergraph"],
                                columns=usergraph_feature_names_list_dict[osn_name]))
        h5store_at(h5_store,
                   "/data/" + osn_name + "/X_temporal",
                   pd.DataFrame(dataset_full[osn_name]["X_temporal"],
                                columns=temporal_feature_names_list_dict[osn_name]))

        y_raw_dict = dict()
        for target_name in dataset_full[osn_name]["y_raw"].keys():
            y_raw_dict[target_name] = dataset_full[osn_name]["y_raw"][target_name]

        h5store_at(h5_store,
                   "/data/" + osn_name + "/y_raw",
                   pd.DataFrame(y_raw_dict))

    h5_close(h5_store)
def store_k_evaluation_measures(store_path, k_list, k_evaluation_measures, feature_column_names):
    number_of_folds = k_evaluation_measures[0].shape[1]

    h5_store = h5_open(store_path + "results.h5")

    for fold_index in range(number_of_folds):
        data_frame = pd.DataFrame(k_evaluation_measures[0][:, fold_index],
                                  columns=["kendall_tau"],
                                  index=k_list)
        h5store_at(h5_store, "/data/kendall_tau/fold" + str(fold_index), data_frame)

        data_frame = pd.DataFrame(k_evaluation_measures[1][:, fold_index],
                                  columns=["p_value"],
                                  index=k_list)
        h5store_at(h5_store, "/data/p_value/fold" + str(fold_index), data_frame)

        data_frame = pd.DataFrame(k_evaluation_measures[2][:, fold_index],
                                  columns=["mse"],
                                  index=k_list)
        h5store_at(h5_store, "/data/mse/fold" + str(fold_index), data_frame)

        data_frame = pd.DataFrame(k_evaluation_measures[3][:, fold_index],
                                  columns=["jaccard"],
                                  index=k_list)
        h5store_at(h5_store, "/data/top_k_jaccard/fold" + str(fold_index), data_frame)

        data_frame = pd.DataFrame(k_evaluation_measures[4][:, fold_index, :],
                                  columns=feature_column_names,
                                  index=k_list)
        h5store_at(h5_store, "/data/feature_importances/fold" + str(fold_index), data_frame)

    h5_close(h5_store)
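# Illustrative sketch (not part of the original module): reading back one of the
# per-fold evaluation measures written by store_k_evaluation_measures() above.
# It assumes the same "/data/<measure>/fold<i>" layout and reuses the project's
# h5_open/h5load_from/h5_close helpers; number_of_folds must match what was stored.
def example_load_kendall_tau_per_fold(store_path, number_of_folds):
    h5_store = h5_open(store_path + "results.h5")
    per_fold = dict()
    for fold_index in range(number_of_folds):
        # Each fold was stored as a DataFrame indexed by k with a "kendall_tau" column.
        df = h5load_from(h5_store, "/data/kendall_tau/fold" + str(fold_index))
        per_fold["fold" + str(fold_index)] = df["kendall_tau"]
    h5_close(h5_store)

    # Columns are folds; the index holds the k values supplied at storage time.
    return pd.DataFrame(per_fold)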
def load_dataset_k(dataset_k_path,
                   feature_osn_name_list,
                   branching_feature_names_list_dict,
                   usergraph_feature_names_list_dict,
                   temporal_feature_names_list_dict):
    dataset_k = dict()
    X_k_min_dict = dict()
    X_t_next_dict = dict()
    index = dict()

    h5_store = h5_open(dataset_k_path)

    for osn_name in feature_osn_name_list:
        dataset_k[osn_name] = dict()

        df = h5load_from(h5_store,
                         "/data/" + osn_name + "/X_branching")[branching_feature_names_list_dict[osn_name]]
        index[osn_name] = list(df.index)
        dataset_k[osn_name]["X_branching"] = df.values
        dataset_k[osn_name]["X_usergraph"] = h5load_from(
            h5_store,
            "/data/" + osn_name + "/X_usergraph")[usergraph_feature_names_list_dict[osn_name]].values
        dataset_k[osn_name]["X_temporal"] = h5load_from(
            h5_store,
            "/data/" + osn_name + "/X_temporal")[temporal_feature_names_list_dict[osn_name]].values

        data_frame = h5load_from(h5_store, "/data/" + osn_name + "/utility_arrays")
        X_k_min_dict[osn_name] = data_frame["X_k_min_array"].values
        X_t_next_dict[osn_name] = data_frame["X_t_next_array"].values

    h5_close(h5_store)

    return dataset_k, X_k_min_dict, X_t_next_dict, index
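# Illustrative usage sketch (not part of the original module): a store/load round
# trip for the "first k comments" dataset. The path, the "reddit" OSN name and the
# dataset_k/X_k_min_dict/X_t_next_dict/feature-name dictionaries are placeholders
# assumed to already exist with the shapes these functions expect.
#
#   store_dataset_k("/tmp/dataset_k.h5", dataset_k, X_k_min_dict, X_t_next_dict, index)
#
#   dataset_k, X_k_min_dict, X_t_next_dict, index = load_dataset_k(
#       "/tmp/dataset_k.h5",
#       feature_osn_name_list=["reddit"],
#       branching_feature_names_list_dict=branching_feature_names_list_dict,
#       usergraph_feature_names_list_dict=usergraph_feature_names_list_dict,
#       temporal_feature_names_list_dict=temporal_feature_names_list_dict)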
def extract_features_static_dataset(dataset_name, input_data_folder, output_data_folder):
    if dataset_name in ["reddit_news", "slashdot", "barrapunto"]:
        document_generator = anonymized.document_generator
        comment_generator = anonymized.comment_generator
        extract_document_post_name = anonymized.extract_document_post_name
        extract_user_name = anonymized.extract_user_name
        extract_comment_name = anonymized.extract_comment_name
        calculate_targets = anonymized.calculate_targets
        extract_timestamp = anonymized.extract_timestamp
        extract_parent_comment_name = anonymized.extract_parent_comment_name

        if dataset_name == "reddit_news":
            anonymous_coward_name = None
        elif dataset_name == "slashdot":
            anonymous_coward_name = "Anonymous Coward"
        elif dataset_name == "barrapunto":
            anonymous_coward_name = "pobrecito hablador"  # "Pendejo Sin Nombre"
        else:
            print("Invalid dataset name.")
            raise RuntimeError
    else:
        print("Invalid dataset name.")
        raise RuntimeError

    ####################################################################
    # Dataset-wide user anonymization.
    ####################################################################
    within_dataset_user_anonymizer_filepath = output_data_folder + "/datasetwide/user_anonymizer" + ".pkl"

    file_name_list = os.listdir(input_data_folder)
    source_file_path_list = [input_data_folder + "/" + file_name
                             for file_name in file_name_list
                             if not file_name[-1] == "~"]

    document_gen = document_generator(source_file_path_list)

    within_dataset_user_anonymize = get_within_dataset_user_anonymization(
        within_dataset_user_anonymizer_filepath,
        document_gen,
        comment_generator,
        extract_user_name)

    file_name_list = os.listdir(input_data_folder)
    source_file_path_list = sorted([input_data_folder + "/" + file_name
                                    for file_name in file_name_list
                                    if not file_name[-1] == "~"])

    ####################################################################
    # Initialize the H5 store files.
    ####################################################################
    total_counter = 0

    store_file_counter_gen = store_file_counter_generator(0, 1)
    store_file_counter = next(store_file_counter_gen)
    discussion_counter = 0

    timestamp_h5_store_file = h5_open(
        output_data_folder + "/timestamp_h5_store_file_" + str(store_file_counter) + ".h5")
    handcrafted_features_h5_store_file = h5_open(
        output_data_folder + "/handcrafted_features_h5_store_file_" + str(store_file_counter) + ".h5")

    ####################################################################
    # Iterate over files and incrementally calculate features.
    ####################################################################
    document_counter = 0
    actual_document_counter = 0
    for document in document_generator(source_file_path_list):
        document_counter += 1
        actual_document_counter += 1
        if actual_document_counter % 500 == 0:
            print("Document no: ", actual_document_counter)

        invalid_tree = False

        comment_gen = comment_generator(document=document)

        ################################################################
        # Within-discussion comment and user anonymization.
        ################################################################
        (comment_name_set,
         user_name_set,
         within_discussion_comment_anonymize,
         within_discussion_user_anonymize,
         within_discussion_anonymous_coward) = within_discussion_comment_and_user_anonymization(
            comment_gen=comment_gen,
            extract_comment_name=extract_comment_name,
            extract_user_name=extract_user_name,
            anonymous_coward_name=anonymous_coward_name)

        ################################################################
        # Calculate prediction targets.
        ################################################################
        try:
            target_dict = calculate_targets(document)
        except KeyError:
            continue

        ################################################################
        # Initiate a smart/safe iteration over all comments.
        ################################################################
        try:
            safe_comment_gen = safe_comment_generator(
                document=document,
                comment_generator=comment_generator,
                within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                extract_comment_name=extract_comment_name,
                extract_parent_comment_name=extract_parent_comment_name,
                extract_timestamp=extract_timestamp,
                safe=True)
        except TypeError:
            invalid_tree = True
            continue

        ################################################################
        # Initialize features and intermediate information and
        # structures for incrementally calculating features.
        ################################################################
        # Just get the set.
        handcrafted_feature_names_set = get_handcrafted_feature_names(dataset_name)

        try:
            initial_post = next(safe_comment_gen)
        except TypeError:
            invalid_tree = True
            continue

        try:
            timestamp = extract_timestamp(initial_post)
        except TypeError:
            invalid_tree = True
            continue

        op_raw_id = extract_user_name(initial_post)
        op_id = within_discussion_user_anonymize[op_raw_id]
        if op_id == within_discussion_anonymous_coward:
            op_is_anonymous = True
        else:
            op_is_anonymous = False

        comment_counter = 0

        (timestamp_column_names_list,
         timestamp_array) = initialize_timestamp_array(target_dict["comments"] + 1,
                                                       cascade_source_timestamp=timestamp)

        (handcrafted_feature_names_list,
         replicate_feature_if_anonymous_set,
         handcrafted_function_list,
         handcrafted_feature_array) = initialize_handcrafted_features(
            target_dict["comments"] + 1,
            handcrafted_feature_names_set=handcrafted_feature_names_set,
            op_is_anonymous=op_is_anonymous)

        intermediate_dict = initialize_intermediate(comment_name_set,
                                                    user_name_set,
                                                    timestamp,
                                                    within_discussion_anonymous_coward,
                                                    op_is_anonymous=op_is_anonymous)

        discussion_tree = spsp.dok_matrix((len(comment_name_set), len(comment_name_set)),
                                          dtype=np.int8)
        user_graph = spsp.dok_matrix((len(user_name_set), len(user_name_set)),
                                     dtype=np.int32)

        while True:
            try:
                comment = next(safe_comment_gen)
            except TypeError:
                invalid_tree = True
                break
            except StopIteration:
                break
            if comment is None:
                invalid_tree = True
                break
            comment_counter += 1

            ############################################################
            # Update discussion radial tree and user graph.
            ############################################################
            commenter_name = extract_user_name(comment)
            if commenter_name is None:
                commenter_is_anonymous = True
            else:
                commenter_is_anonymous = False

            try:
                (discussion_tree,
                 user_graph,
                 comment_id,
                 parent_comment_id,
                 commenter_id,
                 parent_commenter_id,
                 user_graph_modified,
                 parent_commenter_is_anonymous,
                 comment_id_to_user_id) = update_discussion_and_user_graphs(
                    comment=comment,
                    extract_comment_name=extract_comment_name,
                    extract_parent_comment_name=extract_parent_comment_name,
                    extract_user_name=extract_user_name,
                    discussion_tree=discussion_tree,
                    user_graph=user_graph,
                    within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                    within_discussion_user_anonymize=within_discussion_user_anonymize,
                    within_discussion_anonymous_coward=within_discussion_anonymous_coward,
                    comment_id_to_user_id=intermediate_dict["comment_id_to_user_id"])
                intermediate_dict["comment_id_to_user_id"] = comment_id_to_user_id
            except RuntimeError:
                invalid_tree = True
                break

            ############################################################
            # Update intermediate information and structures for
            # incrementally calculating features.
            ############################################################
            try:
                timestamp = extract_timestamp(comment)
            except TypeError:
                invalid_tree = True
                break

            update_timestamp_array(timestamp_column_names_list,
                                   timestamp_array,
                                   timestamp,
                                   comment_counter)
            timestamp_difference = timestamp_array[comment_counter, 1] - timestamp_array[comment_counter - 1, 1]

            try:
                (intermediate_dict,
                 comment_depth) = update_intermediate(discussion_tree,
                                                      user_graph,
                                                      intermediate_dict,
                                                      commenter_is_anonymous,
                                                      parent_commenter_is_anonymous,
                                                      comment_id,
                                                      parent_comment_id,
                                                      commenter_id,
                                                      parent_commenter_id,
                                                      user_graph_modified,
                                                      timestamp,
                                                      timestamp_difference)
            except RuntimeError:
                invalid_tree = True
                break

            ############################################################
            # Incrementally calculate discussion features.
            ############################################################
            update_handcrafted_features(handcrafted_feature_names_list,
                                        replicate_feature_if_anonymous_set,
                                        handcrafted_function_list,
                                        handcrafted_feature_array,
                                        comment_counter,
                                        intermediate_dict,
                                        commenter_is_anonymous)

        if invalid_tree:
            continue
        else:
            total_counter += 1

        if discussion_counter == get_h5_max_recommended_number_of_children():
            h5_close(timestamp_h5_store_file)
            h5_close(handcrafted_features_h5_store_file)

            store_file_counter = next(store_file_counter_gen)
            discussion_counter = 0

            timestamp_h5_store_file = h5_open(
                output_data_folder + "/timestamp_h5_store_file_" + str(store_file_counter) + ".h5")
            handcrafted_features_h5_store_file = h5_open(
                output_data_folder + "/handcrafted_features_h5_store_file_" + str(store_file_counter) + ".h5")

        # timestamp_h5_store_file_keys = set(timestamp_h5_store_file.keys())
        # handcrafted_features_h5_store_file_keys = set(handcrafted_features_h5_store_file.keys())

        store_features(timestamp_h5_store_file,
                       handcrafted_features_h5_store_file,
                       document,
                       extract_document_post_name(document),
                       target_dict,
                       comment_counter,
                       timestamp_array,
                       timestamp_column_names_list,
                       handcrafted_feature_array,
                       handcrafted_feature_names_list)
        discussion_counter += 1

    print(total_counter)

    h5_close(timestamp_h5_store_file)
    h5_close(handcrafted_features_h5_store_file)

    return 0
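# Illustrative usage sketch (not part of the original module): running the static
# feature extraction over one of the supported datasets. The folder paths are
# placeholders; the input folder is expected to contain the raw discussion files,
# and the output folder receives the numbered timestamp/handcrafted-feature HDF5
# stores created above.
#
#   extract_features_static_dataset("slashdot",
#                                   input_data_folder="/path/to/slashdot/raw",
#                                   output_data_folder="/path/to/slashdot/features")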