def read_performance_measures(file_path, number=10):
    """
    Read mean/std F1 performance rows from a tab-separated results file.

    Inputs:  - file_path: Path to the tab-separated performance file.
             - number: Expected number of scores per row (default 10).
    Outputs: - F1_macro_mean, F1_macro_std, F1_micro_mean, F1_micro_std:
               NumPy float64 arrays of length `number`.

    NOTE(review): the row layout below (advance 18 rows before the macro-F1
    pair, then 3 more before the micro-F1 pair) is inferred from the loop
    counts; confirm against the writer of this results file. Each row must
    contain exactly `number` tab-separated scores or the slice assignment
    will raise.
    """
    file_row_gen = get_file_row_generator(file_path, "\t")
    # Pre-allocate the four output vectors.
    F1_macro_mean = np.zeros(number, dtype=np.float64)
    F1_macro_std = np.zeros(number, dtype=np.float64)
    F1_micro_mean = np.zeros(number, dtype=np.float64)
    F1_micro_std = np.zeros(number, dtype=np.float64)
    # Advance to the macro-F1 mean row; the 18th row read is the one parsed.
    for r in range(18):
        file_row = next(file_row_gen)
    file_row = [float(score) for score in file_row]
    F1_macro_mean[:] = file_row
    # The row immediately after holds the macro-F1 standard deviations.
    file_row = next(file_row_gen)
    file_row = [float(score) for score in file_row]
    F1_macro_std[:] = file_row
    # Advance 3 more rows to the micro-F1 mean row.
    for r in range(3):
        file_row = next(file_row_gen)
    file_row = [float(score) for score in file_row]
    F1_micro_mean[:] = file_row
    # Followed by the micro-F1 standard deviations.
    file_row = next(file_row_gen)
    file_row = [float(score) for score in file_row]
    F1_micro_std[:] = file_row
    return F1_macro_mean, F1_macro_std, F1_micro_mean, F1_micro_std
def read_graph_raw_data_file(filepath, number_of_nodes):
    """
    Read a space-separated, Matrix-Market-style edge list and return the
    adjacency matrix in SciPy sparse COOrdinate format.

    Inputs:  - filepath: Path to the raw edge-list file. Leading rows whose
               first field starts with "%" are treated as comments; the first
               non-comment row is a header whose third field is the edge count.
             - number_of_nodes: Dimension of the (square) adjacency matrix.
    Outputs: - matrix: The adjacency matrix as a SciPy COO matrix with
               duplicate edges summed (via an intermediate CSR conversion).
    """
    field_gen = get_file_row_generator(filepath, " ")

    # Skip "%"-prefixed comment rows; the first real row is the header.
    header = next(field_gen)
    while header[0][0] == "%":
        header = next(field_gen)
    number_of_edges = int(header[2])

    # Pre-allocate edge storage from the header's edge count.
    sources = np.empty(number_of_edges, dtype=np.int32)
    targets = np.empty(number_of_edges, dtype=np.int32)
    weights = np.empty(number_of_edges, dtype=np.float64)

    edge_index = 0
    for fields in field_gen:
        if fields[0] == "":
            break  # An empty row marks the end of the edge list.
        sources[edge_index] = int(fields[0])
        targets[edge_index] = int(fields[1])
        weights[edge_index] = float(fields[2])
        edge_index += 1

    # Node ids in the file are 1-based; shift to 0-based array indices.
    sources = sources - 1
    targets = targets - 1

    matrix = spsp.coo_matrix((weights, (sources, targets)),
                             shape=(number_of_nodes, number_of_nodes))
    # Round-trip through CSR to sum any duplicate entries.
    matrix = spsp.coo_matrix(spsp.csr_matrix(matrix))
    return matrix
def read_node_label_matrix(file_path, separator, numbering="matlab"):
    """
    Reads node-label pairs in csv format and returns a node-label matrix.

    The first row of the file is a metadata header: field 1 holds the number
    of rows (nodes) and field 3 the number of categories.

    Inputs:  - file_path: The path where the node-label matrix is stored.
             - separator: The delimiter among values (e.g. ",", "\t", " ")
             - numbering: Array numbering style: "matlab" (1-based, shifted
               to 0-based here) or "c" (already 0-based).
    Outputs: - node_label_matrix: Node-label associations as a SciPy CSR matrix.
             - number_of_categories: Number of categories/classes.
             - labelled_node_indices: NumPy array of labelled node indices.
    Raises:  - RuntimeError: On an invalid numbering style.
    """
    # Open file and parse the metadata header.
    file_row_generator = get_file_row_generator(file_path, separator)
    file_row = next(file_row_generator)
    # BUG FIX: number_of_rows was previously left as a string, which is not a
    # valid sparse-matrix shape dimension (number_of_categories on the same
    # header row was already cast).
    number_of_rows = int(file_row[1])
    number_of_categories = int(file_row[3])

    # Initialize lists for row and column sparse matrix arguments.
    row = list()
    col = list()
    append_row = row.append
    append_col = col.append

    # Populate the arrays: one (node, label) pair per file row.
    for file_row in file_row_generator:
        node = np.int64(file_row[0])
        label = np.int64(file_row[1])
        append_row(node)
        append_col(label)

    labelled_node_indices = np.array(list(set(row)))
    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    data = np.ones_like(row, dtype=np.float64)

    if numbering == "matlab":
        # Shift 1-based ids to 0-based array indices.
        row -= 1
        col -= 1
        labelled_node_indices -= 1
    elif numbering == "c":
        pass  # Already 0-based.
    else:
        raise RuntimeError("Invalid numbering style.")

    # Form sparse node-label matrix.
    node_label_matrix = spsp.coo_matrix(
        (data, (row, col)), shape=(number_of_rows, number_of_categories))
    node_label_matrix = node_label_matrix.tocsr()
    return node_label_matrix, number_of_categories, labelled_node_indices
def make_labelling(dataset, raw_data_folder, preprocessed_data_folder):
    """
    Build a node-label matrix from a dataset's .ids and .communities files
    and write it to the preprocessed folder as a TSV in MATLAB numbering.

    Inputs:  - dataset: Dataset name (basename of the raw files).
             - raw_data_folder: Folder holding <dataset>.ids and
               <dataset>.communities.
             - preprocessed_data_folder: Destination folder for the TSV.
    """
    # Read the twitter ids in file order; their position defines the node id.
    node_file_path = raw_data_folder + "/" + dataset + ".ids"
    file_row_gen = get_file_row_generator(node_file_path, " ")
    twitter_ids = list()
    for fields in file_row_gen:
        if fields[0] == "":
            break  # Empty row terminates the id list.
        twitter_ids.append(int(fields[0]))
    id_to_node = {twitter_id: node for node, twitter_id in enumerate(twitter_ids)}
    twitter_id_set = set(twitter_ids)

    # Each row of the .communities file is one community; its members become
    # a column (category) of the node-label matrix.
    core_file_path = raw_data_folder + "/" + dataset + ".communities"
    file_row_gen = get_file_row_generator(core_file_path, ",")
    row = list()
    col = list()
    category_counter = 0
    for fields in file_row_gen:
        members = list()
        # The first field carries a prefix token; the twitter id is its
        # second space-separated component.
        head = fields[0].strip().split(" ")
        members.append(id_to_node[int(head[1])])
        for member_id in fields[1:]:
            members.append(id_to_node[int(member_id)])
        row.extend(members)
        col.extend(category_counter * np.ones(len(members), dtype=np.int32))
        category_counter += 1

    row = np.array(row, dtype=np.int32)
    col = np.array(col, dtype=np.int32)
    data = np.ones_like(row, dtype=np.int8)
    node_label_matrix = spsp.coo_matrix(
        (data, (row, col)),
        shape=(len(twitter_id_set), category_counter))

    target_path = preprocessed_data_folder + "/" + "node_label_matrix" + ".tsv"
    scipy_sparse_to_csv(target_path, node_label_matrix, separator="\t",
                        directed=True, numbering="matlab")
def read_node_label_matrix(file_path, separator, numbering="matlab"):
    """
    Reads node-label pairs in csv format and returns a node-label matrix.

    The first row of the file is a metadata header: field 1 holds the number
    of rows (nodes) and field 3 the number of categories.

    Inputs:  - file_path: The path where the node-label matrix is stored.
             - separator: The delimiter among values (e.g. ",", "\t", " ")
             - numbering: Array numbering style: "matlab" (1-based, shifted
               to 0-based here) or "c" (already 0-based).
    Outputs: - node_label_matrix: Node-label associations as a SciPy CSR matrix.
             - number_of_categories: Number of categories/classes.
             - labelled_node_indices: NumPy array of labelled node indices.
    Raises:  - RuntimeError: On an invalid numbering style.
    """
    # Open file and parse the metadata header.
    file_row_generator = get_file_row_generator(file_path, separator)
    file_row = next(file_row_generator)
    # BUG FIX: number_of_rows was previously left as a string, which is not a
    # valid sparse-matrix shape dimension (number_of_categories on the same
    # header row was already cast).
    number_of_rows = int(file_row[1])
    number_of_categories = int(file_row[3])

    # Initialize lists for row and column sparse matrix arguments.
    row = list()
    col = list()
    append_row = row.append
    append_col = col.append

    # Populate the arrays: one (node, label) pair per file row.
    for file_row in file_row_generator:
        node = np.int64(file_row[0])
        label = np.int64(file_row[1])
        append_row(node)
        append_col(label)

    labelled_node_indices = np.array(list(set(row)))
    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    data = np.ones_like(row, dtype=np.float64)

    if numbering == "matlab":
        # Shift 1-based ids to 0-based array indices.
        row -= 1
        col -= 1
        labelled_node_indices -= 1
    elif numbering == "c":
        pass  # Already 0-based.
    else:
        raise RuntimeError("Invalid numbering style.")

    # Form sparse node-label matrix.
    node_label_matrix = spsp.coo_matrix(
        (data, (row, col)), shape=(number_of_rows, number_of_categories))
    node_label_matrix = node_label_matrix.tocsr()
    return node_label_matrix, number_of_categories, labelled_node_indices
def make_labelling(dataset, raw_data_folder, preprocessed_data_folder):
    """
    Build a two-column (core / non-core) node-label matrix from a dataset's
    .ids and .core files and write it as a TSV in MATLAB numbering.

    Inputs:  - dataset: Dataset name (basename of the raw files).
             - raw_data_folder: Folder holding <dataset>.ids and <dataset>.core.
             - preprocessed_data_folder: Destination folder for the TSV.
    """
    # Read all twitter ids; their file position defines the node index.
    node_file_path = raw_data_folder + "/" + dataset + ".ids"
    file_row_gen = get_file_row_generator(node_file_path, " ")
    twitter_ids = list()
    for fields in file_row_gen:
        if fields[0] == "":
            break  # Empty row terminates the id list.
        twitter_ids.append(int(fields[0]))
    id_to_node = {twitter_id: node for node, twitter_id in enumerate(twitter_ids)}
    all_ids = set(twitter_ids)

    # Read the core-user ids and keep only those present in the .ids file.
    core_file_path = raw_data_folder + "/" + dataset + ".core"
    file_row_gen = get_file_row_generator(core_file_path, " ")
    core_candidates = list()
    for fields in file_row_gen:
        if fields[0] == "":
            break
        core_candidates.append(int(fields[0]))
    core_ids = all_ids.intersection(core_candidates)
    non_core_ids = all_ids.difference(core_ids)

    # Label column 1 for core users, column 0 for everyone else.
    row = [id_to_node[twitter_id] for twitter_id in core_ids] + \
          [id_to_node[twitter_id] for twitter_id in non_core_ids]
    row = np.array(row, dtype=np.int32)
    col = [1 for _ in core_ids] + [0 for _ in non_core_ids]
    col = np.array(col, dtype=np.int32)
    data = np.ones(len(all_ids), dtype=np.int8)
    node_label_matrix = spsp.coo_matrix((data, (row, col)),
                                        shape=(len(all_ids), 2))

    target_path = preprocessed_data_folder + "/" + "node_label_matrix" + ".tsv"
    scipy_sparse_to_csv(target_path, node_label_matrix, separator="\t",
                        directed=True, numbering="matlab")
def read_node_label_matrix(file_path, separator, number_of_nodes):
    """
    Reads node-label pairs in csv format and returns a node-label matrix.

    Inputs:  - file_path: The path where the node-label matrix is stored.
             - separator: The delimiter among values (e.g. ",", "\t", " ")
             - number_of_nodes: The number of nodes of the full graph. It is
               possible that not all nodes are labelled.
    Outputs: - node_label_matrix: Node-label associations as a SciPy CSR matrix.
             - number_of_categories: Number of categories/classes.
             - labelled_node_indices: NumPy array of labelled node indices.
    """
    # Collect one (node, label) pair per file row.
    node_ids = list()
    label_ids = list()
    for fields in get_file_row_generator(file_path, separator):
        node_ids.append(np.int64(fields[0]))
        label_ids.append(np.int64(fields[1]))

    # Distinct labels define the category count; the file is assumed to have
    # no missing labels (nodes may be missing).
    number_of_categories = len(set(label_ids))
    labelled_node_indices = np.array(list(set(node_ids)))

    node_ids = np.array(node_ids, dtype=np.int64)
    label_ids = np.array(label_ids, dtype=np.int64)
    weights = np.ones_like(node_ids, dtype=np.float64)

    # File ids are 1-based; array count should start from 0.
    node_ids -= 1
    label_ids -= 1
    labelled_node_indices -= 1

    # Form the sparse node-label matrix.
    node_label_matrix = sparse.coo_matrix(
        (weights, (node_ids, label_ids)),
        shape=(number_of_nodes, number_of_categories)).tocsr()
    return node_label_matrix, number_of_categories, labelled_node_indices
def get_number_of_nodes(raw_data_folder, dataset):
    """
    Count the node ids listed in <dataset>.ids.

    Inputs:  - raw_data_folder: Folder holding the raw dataset files.
             - dataset: Dataset name (basename of the .ids file).
    Outputs: - number_of_nodes: Number of rows before the first empty row.
    """
    node_file_path = raw_data_folder + "/" + dataset + ".ids"
    count = 0
    for fields in get_file_row_generator(node_file_path, " "):
        if fields[0] == "":
            break  # Empty row terminates the id list.
        count += 1
    return count
def read_adjacency_matrix(file_path, separator):
    """
    Reads an edge list in csv format and returns the adjacency matrix in
    SciPy Sparse COOrdinate format.

    Inputs:  - file_path: The path where the adjacency matrix is stored.
             - separator: The delimiter among values (e.g. ",", "\t", " ")
    Outputs: - adjacency_matrix: The adjacency matrix in SciPy Sparse
               COOrdinate format.
    """
    # Open file
    file_row_generator = get_file_row_generator(file_path, separator)

    # Initialize lists for row and column sparse matrix arguments
    row = list()
    col = list()
    append_row = row.append
    append_col = col.append

    # Read all file rows
    for file_row in file_row_generator:
        source_node = np.int64(file_row[0])
        target_node = np.int64(file_row[1])
        # Add edge
        append_row(source_node)
        append_col(target_node)
        # Since this is an undirected network also add the reciprocal edge.
        # BUG FIX: skip self-loops here — the reciprocal of a self-loop is the
        # same (i, i) entry, and duplicate COO entries are summed, which
        # previously gave self-loops weight 2 instead of 1.
        if source_node != target_node:
            append_row(target_node)
            append_col(source_node)

    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    data = np.ones_like(row, dtype=np.float64)
    # Largest 1-based id is the node count; assumes no missing nodes at the end.
    number_of_nodes = np.max(row)

    # Array count should start from 0.
    row -= 1
    col -= 1

    # Form sparse adjacency matrix
    adjacency_matrix = sparse.coo_matrix(
        (data, (row, col)), shape=(number_of_nodes, number_of_nodes))
    return adjacency_matrix
def read_node_label_matrix(file_path, separator, number_of_nodes):
    """
    Reads node-label pairs in csv format and returns a node-label matrix.

    Inputs:  - file_path: The path where the node-label matrix is stored.
             - separator: The delimiter among values (e.g. ",", "\t", " ")
             - number_of_nodes: The number of nodes of the full graph. It is
               possible that not all nodes are labelled.
    Outputs: - node_label_matrix: Node-label associations as a SciPy CSR matrix.
             - number_of_categories: Number of categories/classes.
             - labelled_node_indices: NumPy array of labelled node indices.
    """
    file_row_generator = get_file_row_generator(file_path, separator)

    # One (node, label) pair per file row.
    nodes = list()
    labels = list()
    for fields in file_row_generator:
        nodes.append(np.int64(fields[0]))
        labels.append(np.int64(fields[1]))

    # Assumes no missing labels; nodes may be missing.
    number_of_categories = len(set(labels))
    labelled_node_indices = np.array(list(set(nodes)))

    nodes = np.array(nodes, dtype=np.int64)
    labels = np.array(labels, dtype=np.int64)
    entries = np.ones_like(nodes, dtype=np.float64)

    # File ids are 1-based; shift so array count starts from 0.
    nodes -= 1
    labels -= 1
    labelled_node_indices -= 1

    # Form and compress the sparse node-label matrix.
    node_label_matrix = sparse.coo_matrix(
        (entries, (nodes, labels)),
        shape=(number_of_nodes, number_of_categories))
    node_label_matrix = node_label_matrix.tocsr()
    return node_label_matrix, number_of_categories, labelled_node_indices
def read_adjacency_matrix(file_path, separator):
    """
    Reads an edge list in csv format and returns the adjacency matrix in
    SciPy Sparse COOrdinate format.

    Inputs:  - file_path: The path where the adjacency matrix is stored.
             - separator: The delimiter among values (e.g. ",", "\t", " ")
    Outputs: - adjacency_matrix: The adjacency matrix in SciPy Sparse
               COOrdinate format.
    """
    # Open file
    file_row_generator = get_file_row_generator(file_path, separator)

    # Initialize lists for row and column sparse matrix arguments
    row = list()
    col = list()
    append_row = row.append
    append_col = col.append

    # Read all file rows
    for file_row in file_row_generator:
        source_node = np.int64(file_row[0])
        target_node = np.int64(file_row[1])
        # Add edge
        append_row(source_node)
        append_col(target_node)
        # Since this is an undirected network also add the reciprocal edge.
        # BUG FIX: skip self-loops here — the reciprocal of a self-loop is the
        # same (i, i) entry, and duplicate COO entries are summed, which
        # previously gave self-loops weight 2 instead of 1.
        if source_node != target_node:
            append_row(target_node)
            append_col(source_node)

    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    data = np.ones_like(row, dtype=np.float64)
    # Largest 1-based id is the node count; assumes no missing nodes at the end.
    number_of_nodes = np.max(row)

    # Array count should start from 0.
    row -= 1
    col -= 1

    # Form sparse adjacency matrix
    adjacency_matrix = sparse.coo_matrix(
        (data, (row, col)), shape=(number_of_nodes, number_of_nodes))
    return adjacency_matrix
def get_number_of_nodes(raw_data_folder, dataset):
    """
    Count the node ids in <dataset>.ids and map each twitter id to its
    0-based position in the file.

    Inputs:  - raw_data_folder: Folder holding the raw dataset files.
             - dataset: Dataset name (basename of the .ids file).
    Outputs: - number_of_nodes: Number of ids read before the first empty row.
             - id_to_node: Dict mapping twitter id -> node index.
    """
    node_file_path = raw_data_folder + "/" + dataset + ".ids"
    twitter_ids = list()
    for fields in get_file_row_generator(node_file_path, " "):
        if fields[0] == "":
            break  # Empty row terminates the id list.
        twitter_ids.append(int(fields[0]))
    id_to_node = {twitter_id: node for node, twitter_id in enumerate(twitter_ids)}
    return len(twitter_ids), id_to_node
def read_adjacency_matrix(file_path, separator, numbering="matlab"):
    """
    Reads an edge list in csv format and returns the adjacency matrix in
    SciPy Sparse COOrdinate format.

    The first row of the file is a metadata header: field 1 is the row count,
    field 3 the column count, and field 7 the "True"/"False" directedness flag.

    Inputs:  - file_path: The path where the adjacency matrix is stored.
             - separator: The delimiter among values (e.g. ",", "\t", " ")
             - numbering: Array numbering style: "matlab" (1-based, shifted
               to 0-based here) or "c" (already 0-based).
    Outputs: - adjacency_matrix: The adjacency matrix in SciPy Sparse
               COOrdinate format.
    Raises:  - RuntimeError: On invalid metadata or numbering style.
    """
    # Open file and parse the metadata header.
    file_row_generator = get_file_row_generator(file_path, separator)
    file_row = next(file_row_generator)
    # BUG FIX: the row/column counts were previously left as strings, which
    # are not valid sparse-matrix shape dimensions.
    number_of_rows = int(file_row[1])
    number_of_columns = int(file_row[3])
    directed = file_row[7]
    if directed == "True":
        directed = True
    elif directed == "False":
        directed = False
    else:
        raise RuntimeError("Invalid metadata.")

    # Initialize lists for row, column and weight sparse matrix arguments.
    row = list()
    col = list()
    data = list()
    append_row = row.append
    append_col = col.append
    append_data = data.append

    # Read all edge rows.
    for file_row in file_row_generator:
        source_node = np.int64(file_row[0])
        target_node = np.int64(file_row[1])
        edge_weight = np.float64(file_row[2])
        # Add edge
        append_row(source_node)
        append_col(target_node)
        append_data(edge_weight)
        # For an undirected network also add the reciprocal edge,
        # avoiding duplicate self-loop entries.
        if not directed:
            if source_node != target_node:
                append_row(target_node)
                append_col(source_node)
                append_data(edge_weight)

    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    data = np.array(data, dtype=np.float64)

    if numbering == "matlab":
        # Shift 1-based ids to 0-based array indices.
        row -= 1
        col -= 1
    elif numbering == "c":
        pass  # Already 0-based.
    else:
        raise RuntimeError("Invalid numbering style.")

    # Form sparse adjacency matrix.
    adjacency_matrix = spsp.coo_matrix(
        (data, (row, col)), shape=(number_of_rows, number_of_columns))
    return adjacency_matrix
def read_adjacency_matrix(file_path, separator, numbering="matlab"):
    """
    Reads an edge list in csv format and returns the adjacency matrix in
    SciPy Sparse COOrdinate format.

    The first row of the file is a metadata header: field 1 is the row count,
    field 3 the column count, and field 7 the "True"/"False" directedness flag.

    Inputs:  - file_path: The path where the adjacency matrix is stored.
             - separator: The delimiter among values (e.g. ",", "\t", " ")
             - numbering: Array numbering style: "matlab" (1-based, shifted
               to 0-based here) or "c" (already 0-based).
    Outputs: - adjacency_matrix: The adjacency matrix in SciPy Sparse
               COOrdinate format.
    Raises:  - RuntimeError: On invalid metadata or numbering style.
    """
    # Open file and parse the metadata header.
    file_row_generator = get_file_row_generator(file_path, separator)
    file_row = next(file_row_generator)
    # BUG FIX: the row/column counts were previously left as strings, which
    # are not valid sparse-matrix shape dimensions.
    number_of_rows = int(file_row[1])
    number_of_columns = int(file_row[3])
    directed = file_row[7]
    if directed == "True":
        directed = True
    elif directed == "False":
        directed = False
    else:
        raise RuntimeError("Invalid metadata.")

    # Initialize lists for row, column and weight sparse matrix arguments.
    row = list()
    col = list()
    data = list()
    append_row = row.append
    append_col = col.append
    append_data = data.append

    # Read all edge rows.
    for file_row in file_row_generator:
        source_node = np.int64(file_row[0])
        target_node = np.int64(file_row[1])
        edge_weight = np.float64(file_row[2])
        # Add edge
        append_row(source_node)
        append_col(target_node)
        append_data(edge_weight)
        # For an undirected network also add the reciprocal edge,
        # avoiding duplicate self-loop entries.
        if not directed:
            if source_node != target_node:
                append_row(target_node)
                append_col(source_node)
                append_data(edge_weight)

    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    data = np.array(data, dtype=np.float64)

    if numbering == "matlab":
        # Shift 1-based ids to 0-based array indices.
        row -= 1
        col -= 1
    elif numbering == "c":
        pass  # Already 0-based.
    else:
        raise RuntimeError("Invalid numbering style.")

    # Form sparse adjacency matrix.
    adjacency_matrix = spsp.coo_matrix(
        (data, (row, col)), shape=(number_of_rows, number_of_columns))
    return adjacency_matrix
def get_folds_generator(node_label_matrix, labelled_node_indices, number_of_categories, dataset_memory_folder, percentage, number_of_folds=10):
    """
    Read or form and store the seed nodes for training and testing.

    Folds are cached on disk at <dataset_memory_folder>/folds/<percentage>_folds.txt;
    each trial is stored as two tab-separated rows (test nodes, then train nodes).
    If the file exists it is read back instead of regenerating the folds.

    Inputs:  - node_label_matrix: The node-label ground truth in a SciPy sparse
               matrix format.
             - labelled_node_indices: A NumPy array containing the labelled
               node indices.
             - number_of_categories: The number of categories/classes in the
               learning.
             - dataset_memory_folder: The folder where the fold files are stored.
               NOTE(review): the "folds" subfolder must already exist — open()
               below does not create directories.
             - percentage: The percentage of labelled samples that will be
               used for training.
             - number_of_folds: Number of train/test folds to produce (default 10).
    Output:  - folds: A generator yielding (train, test) NumPy index arrays.
    """
    number_of_labeled_nodes = labelled_node_indices.size
    # Round up so at least the requested percentage is used for training.
    training_set_size = int(np.ceil(percentage*number_of_labeled_nodes/100))
    ####################################################################################################################
    # Read or generate folds
    ####################################################################################################################
    fold_file_path = dataset_memory_folder + "/folds/" + str(percentage) + "_folds.txt"
    train_list = list()
    test_list = list()
    if not os.path.exists(fold_file_path):
        # No cache: generate the folds and persist them as we go.
        with open(fold_file_path, "w") as fp:
            for trial in np.arange(number_of_folds):
                # valid_train_test is a project helper; presumably it returns
                # positional train/test indices into the labelled subset — verify.
                train, test = valid_train_test(node_label_matrix[labelled_node_indices, :],
                                               training_set_size,
                                               number_of_categories,
                                               trial)
                # Map positions back to actual node indices.
                train = labelled_node_indices[train]
                test = labelled_node_indices[test]
                # Write test nodes
                row = [str(node) for node in test]
                row = "\t".join(row) + "\n"
                fp.write(row)
                # Write train nodes
                row = [str(node) for node in train]
                row = "\t".join(row) + "\n"
                fp.write(row)
                train_list.append(train)
                test_list.append(test)
    else:
        # Cache hit: read the folds back in the same (test, train) row order.
        file_row_gen = get_file_row_generator(fold_file_path, "\t")
        for trial in np.arange(number_of_folds):
            # Read test nodes
            test = next(file_row_gen)
            test = [int(node) for node in test]
            test = np.array(test)
            # Read train nodes
            train = next(file_row_gen)
            train = [int(node) for node in train]
            train = np.array(train)
            train_list.append(train)
            test_list.append(test)
    # Lazily yield the (train, test) pairs.
    folds = ((train, test) for train, test in zip(train_list, test_list))
    return folds