def preprocess_feature_data(dataset, raw_data_folder, preprocessed_data_folder, feature_raw_data_file, number_of_nodes):
    """
    Convert one raw feature matrix file into a tab-separated file.

    The input is read from "<raw_data_folder>/<dataset>-<feature_raw_data_file>.mtx" and the
    result is written to "<preprocessed_data_folder>/<feature_raw_data_file>.tsv".

    Inputs: - dataset: Dataset name; used as the prefix of the raw file name.
            - raw_data_folder: Folder that holds the raw ".mtx" file.
            - preprocessed_data_folder: Folder that receives the ".tsv" file.
            - feature_raw_data_file: Base name (no extension) shared by the raw and preprocessed files.
            - number_of_nodes: Forwarded unchanged to read_feature_raw_data_file.
    """
    # Input: <raw_data_folder>/<dataset>-<feature_raw_data_file>.mtx
    mtx_path = "/".join((raw_data_folder, dataset + "-" + feature_raw_data_file + ".mtx"))

    # Output: <preprocessed_data_folder>/<feature_raw_data_file>.tsv
    tsv_path = "/".join((preprocessed_data_folder, feature_raw_data_file + ".tsv"))

    feature_matrix = read_feature_raw_data_file(mtx_path, number_of_nodes)

    # NOTE(review): numbering="matlab" presumably selects 1-based indices - confirm in scipy_sparse_to_csv.
    scipy_sparse_to_csv(tsv_path, feature_matrix, separator="\t", directed=True, numbering="matlab")
def make_labelling(dataset, raw_data_folder, preprocessed_data_folder):
    """
    Build a node-by-category label matrix from a ".communities" file and store it as a tsv file.

    Node indices come from "<raw_data_folder>/<dataset>.ids": the first field of every row is an
    integer id, and an id's node index is its position in that file. Each row of
    "<raw_data_folder>/<dataset>.communities" defines one category, numbered in file order.

    The matrix is written to "<preprocessed_data_folder>/node_label_matrix.tsv".

    Inputs: - dataset: Dataset name; used as the base name of the raw files.
            - raw_data_folder: Folder that holds the ".ids" and ".communities" files.
            - preprocessed_data_folder: Folder that receives "node_label_matrix.tsv".
    """
    # ------------------------------------------------------------------
    # Map every id to a node index.
    # ------------------------------------------------------------------
    ids_path = raw_data_folder + "/" + dataset + ".ids"

    twitter_ids = []
    for fields in get_file_row_generator(ids_path, " "):
        # A row with an empty first field terminates the id listing.
        if fields[0] == "":
            break
        twitter_ids.append(int(fields[0]))

    # Position in the file is the node index; a repeated id keeps its last position.
    node_of = {twitter_id: node for node, twitter_id in enumerate(twitter_ids)}
    number_of_nodes = len(set(twitter_ids))

    # ------------------------------------------------------------------
    # Turn every community row into (node, category) pairs.
    # ------------------------------------------------------------------
    communities_path = raw_data_folder + "/" + dataset + ".communities"

    member_nodes = []
    member_categories = []
    number_of_categories = 0
    for fields in get_file_row_generator(communities_path, ","):
        # The first comma-separated field holds two space-separated tokens;
        # only the second one is read, as the first member id.
        head_tokens = fields[0].strip().split(" ")
        members = [node_of[int(head_tokens[1])]]
        members += [node_of[int(token)] for token in fields[1:]]

        member_nodes.extend(members)
        member_categories.extend([number_of_categories] * len(members))
        number_of_categories += 1

    row = np.array(member_nodes, dtype=np.int32)
    col = np.array(member_categories, dtype=np.int32)
    data = np.ones_like(row, dtype=np.int8)

    node_label_matrix = spsp.coo_matrix((data, (row, col)),
                                        shape=(number_of_nodes, number_of_categories))

    target_path = preprocessed_data_folder + "/" + "node_label_matrix" + ".tsv"
    scipy_sparse_to_csv(target_path, node_label_matrix, separator="\t", directed=True, numbering="matlab")
def make_labelling(dataset, raw_data_folder, preprocessed_data_folder):
    """
    Build a two-column (non-core / core) node label matrix and store it as a tsv file.

    Node indices come from "<raw_data_folder>/<dataset>.ids": the first field of every row is an
    integer id, and an id's node index is its position in that file. Ids that also appear in
    "<raw_data_folder>/<dataset>.core" get an entry in column 1; all remaining ids get an entry
    in column 0.

    The matrix is written to "<preprocessed_data_folder>/node_label_matrix.tsv".

    Inputs: - dataset: Dataset name; used as the base name of the raw files.
            - raw_data_folder: Folder that holds the ".ids" and ".core" files.
            - preprocessed_data_folder: Folder that receives "node_label_matrix.tsv".
    """
    # ------------------------------------------------------------------
    # Map every id to a node index.
    # ------------------------------------------------------------------
    ids_path = raw_data_folder + "/" + dataset + ".ids"

    twitter_ids = []
    for fields in get_file_row_generator(ids_path, " "):
        # A row with an empty first field terminates the id listing.
        if fields[0] == "":
            break
        twitter_ids.append(int(fields[0]))

    # Position in the file is the node index; a repeated id keeps its last position.
    node_of = {twitter_id: node for node, twitter_id in enumerate(twitter_ids)}
    all_ids = set(twitter_ids)

    # ------------------------------------------------------------------
    # Split the ids into core and non-core.
    # ------------------------------------------------------------------
    core_path = raw_data_folder + "/" + dataset + ".core"

    listed_core_ids = []
    for fields in get_file_row_generator(core_path, " "):
        if fields[0] == "":
            break
        listed_core_ids.append(int(fields[0]))

    # Core ids that are missing from the ".ids" file are dropped by the intersection.
    core_ids = all_ids.intersection(listed_core_ids)
    non_core_ids = all_ids.difference(core_ids)

    # ------------------------------------------------------------------
    # Assemble the matrix: core nodes first (column 1), then non-core nodes (column 0).
    # ------------------------------------------------------------------
    row = np.array([node_of[twitter_id] for group in (core_ids, non_core_ids) for twitter_id in group],
                   dtype=np.int32)
    col = np.array([1] * len(core_ids) + [0] * len(non_core_ids), dtype=np.int32)
    data = np.ones(len(all_ids), dtype=np.int8)

    node_label_matrix = spsp.coo_matrix((data, (row, col)), shape=(len(all_ids), 2))

    target_path = preprocessed_data_folder + "/" + "node_label_matrix" + ".tsv"
    scipy_sparse_to_csv(target_path, node_label_matrix, separator="\t", directed=True, numbering="matlab")
def make_implicit_graphs(preprocessed_data_folder, simple_undirected_implicit_graph_folder):
    """
    Derive symmetrized single-view and multiview graphs from the preprocessed interaction files.

    Reads "followedby.tsv", "mentionedby.tsv" and "retweetedby.tsv" from preprocessed_data_folder,
    transposes each one, and writes seven averaged, symmetrized combinations as tsv files into
    simple_undirected_implicit_graph_folder.

    Inputs: - preprocessed_data_folder: Folder that holds the three "*by.tsv" adjacency files.
            - simple_undirected_implicit_graph_folder: Folder that receives the output tsv files.
    """
    def load_transposed(file_stem):
        # Read "<preprocessed_data_folder>/<file_stem>.tsv" and return its transpose.
        source_path = preprocessed_data_folder + "/" + file_stem + ".tsv"
        return read_adjacency_matrix(source_path, "\t", "matlab").transpose()

    def export(views):
        # Write every (file_stem, builder) pair in order; each matrix is built right before it is written.
        for file_stem, build_matrix in views:
            target_path = simple_undirected_implicit_graph_folder + "/" + file_stem + ".tsv"
            scipy_sparse_to_csv(target_path, build_matrix(), separator="\t", directed=False, numbering="matlab")

    ####################################################################################################################
    # Read graphs.
    ####################################################################################################################
    follow_graph = load_transposed("followedby")
    mention_graph = load_transposed("mentionedby")
    retweet_graph = load_transposed("retweetedby")

    ####################################################################################################################
    # Simple undirected implicit graphs: average of a graph and its transpose.
    ####################################################################################################################
    export((
        ("follow_graph", lambda: (follow_graph + follow_graph.transpose())/2),
        ("mention_graph", lambda: (mention_graph + mention_graph.transpose())/2),
        ("retweet_graph", lambda: (retweet_graph + retweet_graph.transpose())/2),
    ))

    gc.collect()

    ####################################################################################################################
    # Multiview graphs: average over the symmetrized versions of two or three views.
    ####################################################################################################################
    export((
        ("fol_men_graph",
         lambda: (follow_graph + follow_graph.transpose() +
                  mention_graph + mention_graph.transpose())/4),
        ("men_ret_graph",
         lambda: (mention_graph + mention_graph.transpose() +
                  retweet_graph + retweet_graph.transpose())/4),
        ("fol_ret_graph",
         lambda: (follow_graph + follow_graph.transpose() +
                  retweet_graph + retweet_graph.transpose())/4),
        ("fol_men_ret_graph",
         lambda: (follow_graph + follow_graph.transpose() +
                  mention_graph + mention_graph.transpose() +
                  retweet_graph + retweet_graph.transpose())/6),
    ))