def load_gntk_matrices(source="TU_DO", dataset="MUTAG", min_scale_mat=0):
    config = cfg.Config()
    matrix_dir = f"{config.matrix_path}/GNTK_{source}/{dataset}"
    data_dir = f"{config.data_path}/{source}/{dataset}"
    kernel_matrices = {}
    matrix_list = [
        mat_name
        for mat_name in os.listdir(matrix_dir)
        if os.path.isdir(f"{matrix_dir}/{mat_name}")
    ]
    for mat_name in matrix_list:
        with open(f"{matrix_dir}/{mat_name}/gram.pkl", "rb") as f:
            mat = pickle.load(f)
        # If min_scale_mat is set, scale each matrix by its minimum entry,
        # as done by Du et al. Off by default because scaling by a statistic
        # of the full matrix can leak test data.
        if min_scale_mat:
            mat = mat / mat.min()
        kernel_matrices["_".join(mat_name.split("_")[2:])] = mat
    labels = np.loadtxt(f"{data_dir}/{dataset}_graph_labels.txt")
    return kernel_matrices, labels
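# Usage sketch (illustrative, not part of the original module): evaluate one
# of the loaded GNTK Gram matrices with a precomputed-kernel SVM. The naive
# half/half split below is an assumption purely for demonstration; the
# experiments themselves use cross-validation elsewhere in the repo.
def _example_svm_on_gram():
    from sklearn.svm import SVC

    kernel_matrices, labels = load_gntk_matrices(source="TU_DO", dataset="MUTAG")
    K = next(iter(kernel_matrices.values()))  # any Gram matrix from the dict
    n = K.shape[0]
    train, test = np.arange(n // 2), np.arange(n // 2, n)
    clf = SVC(kernel="precomputed")
    clf.fit(K[np.ix_(train, train)], labels[train])
    return clf.score(K[np.ix_(test, train)], labels[test])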
def __init__(self):
    super(num_blocks_results_1, self).__init__()
    self.config = cfg.Config()
    self.data = {}
    self._get_data()
    self._get_plot_dims()

def __init__(self):
    super(bias_var_tradeoff, self).__init__()
    self.config = cfg.Config()
    self.data = {}
    self._get_data()
    self._get_plot_dims()

def __init__(self):
    super(diagonal_dominance, self).__init__()
    self.config = cfg.Config()
    self.data = {}
    self._get_data()
    self._get_plot_dims()

def __init__(self):
    super(data_stats, self).__init__()
    self.config = cfg.Config()
    self.data = {}
    self._get_data()
    self._get_plot_dims()

def __init__(self):
    super(profiling_results_1, self).__init__()
    self.config = cfg.Config()
    self.data = {}
    self._get_data()
    self._get_plot_dims()

def __init__(self):
    super(jk_results_2, self).__init__()
    self.config = cfg.Config()
    self.data = {}
    self._get_data()
    self._get_plot_dims()

def __init__(self):
    super(exp_a_evaluation_2, self).__init__()
    self.config = cfg.Config()
    self.data = {}
    self._get_data()
    self._get_plot_dims()

def __init__(self):
    super(kernal_normalization_results_2, self).__init__()
    self.config = cfg.Config()
    self.data = {}
    self._get_data()
    self._get_plot_dims()

def __init__(self):
    super(time_profiling, self).__init__()
    self.config = cfg.Config()
    self.data = {}
    self._get_data()
    self._get_plot_dims()

def __init__(self):
    super(activation_functions, self).__init__()
    self.config = cfg.Config()
    self.data = {}
    self._get_data()
    self._get_plot_dims()

def __init__(self):
    super(gntk_expressivity_mds_2, self).__init__()
    self.config = cfg.Config()
    self.data = {}
    self._get_data()
    self._get_plot_dims()

def __init__(self):
    super(time_complexity, self).__init__()
    self.config = cfg.Config()
    self.data = {}
    self._get_data()
    self._get_plot_dims()
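# The twelve constructors above share one pattern: load a Config, prepare an
# empty data dict, then call self._get_data() and self._get_plot_dims(). A
# minimal sketch of the base class they presumably extend (names of the
# hooks are taken from the calls above; everything else is an assumption,
# since the real base class lives elsewhere in the repo):
class _PlotBaseSketch:
    def _get_data(self):
        # each subclass fills self.data from its experiment's result files
        raise NotImplementedError

    def _get_plot_dims(self):
        # each subclass derives its figure dimensions from the loaded data
        raise NotImplementedError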
def load_graphs_duetal(dataset):
    logger.info(f"loading dataset {dataset} from Du et al. data.")
    if dataset in ["IMDBBINARY", "COLLAB", "IMDBMULTI"]:
        degree_as_label = True
    elif dataset in ["MUTAG", "PROTEINS", "PTC", "NCI1"]:
        degree_as_label = False
    config = cfg.Config()
    data_dir = f"{config.data_path_duetal}/{dataset}"
    g_list = []
    g_labels = []
    label_dict = {}
    feat_dict = {}
    with open(f"{data_dir}/{dataset}.txt", "r") as f:
        n_g = int(f.readline().strip())
        for i in tqdm(range(n_g)):
            row = f.readline().strip().split()
            n, l = [int(w) for w in row]
            if l not in label_dict:
                mapped = len(label_dict)
                label_dict[l] = mapped
            g = nx.Graph()
            n_edges = 0
            for j in range(n):
                row = f.readline().strip().split()
                tmp = int(row[1]) + 2
                if tmp == len(row):
                    # no node attributes
                    row = [int(w) for w in row]
                    # attr = None
                else:
                    row = [int(w) for w in row[:tmp]]
                    # attr = np.array([float(w) for w in row[tmp:]])
                if row[0] not in feat_dict:
                    mapped = len(feat_dict)
                    feat_dict[row[0]] = mapped
                g.add_node(j, lab=feat_dict[row[0]])
                n_edges += row[1]
                for k in range(2, len(row)):
                    g.add_edge(j, row[k])
            if degree_as_label:
                nx.set_node_attributes(g, dict(g.degree()), "lab")
            assert len(g) == n
            g_list.append(g)
            g_labels.append(label_dict[l])
    logger.info(f"# classes -- {len(label_dict)}")
    logger.info(f"# data -- {len(g_list)}")
    return g_list, g_labels
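# Quick check (sketch, not part of the original module): load a dataset in
# the Du et al. format and inspect the first graph. Assumes the Du et al.
# files are present under config.data_path_duetal.
def _example_inspect_duetal():
    graphs, labels = load_graphs_duetal("MUTAG")
    g = graphs[0]
    return len(g), g.number_of_edges(), nx.get_node_attributes(g, "lab")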
from src.data import graph_utils
from timeit import default_timer as timer

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-n",
        "--n_samples",
        default=50,
        type=int,
        help="Number of samples to calculate the gram matrices for.",
    )
    args = parser.parse_args()
    config = cfg.Config()
    exp_config = cfg.TimingExpConfig()
    out_dir = f"{config.exp_path}/timing"
    utils.make_dirs_checked(out_dir)
    datasets = ["IMDBBINARY", "IMDBMULTI", "MUTAG", "NCI1", "PROTEINS", "PTC"]
    kernels = ["GNTK", "VH", "EH", "SP", "WL"]
    gram_time = {dataset: {} for dataset in datasets}
    for dataset in tqdm(datasets, desc="Datasets"):
        # load data
        graphs, labels = data_loaders.load_graphs_tudortmund(dataset)
        n_graphs = len(graphs)
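        # The script is truncated here. A hedged sketch of how one kernel's
        # Gram computation might be timed and recorded, consistent with the
        # gram_time dict and the `timer` import above; compute_gram is a
        # hypothetical stand-in for the repo's actual per-kernel routine:
        #
        #     sample = graphs[: min(args.n_samples, n_graphs)]
        #     for kernel in kernels:
        #         start = timer()
        #         _ = compute_gram(kernel, sample)  # hypothetical helper
        #         gram_time[dataset][kernel] = timer() - start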
def load_data_tudo(dataset):
    """Load a TU Dortmund dataset into GIN-style S2VGraph objects.

    dataset: name of the dataset
    """
    config = cfg.Config()
    data_dir = config.data_path_tudo
    if dataset in ["IMDBBINARY", "COLLAB", "IMDBMULTI"]:
        degree_as_tag = True
    elif dataset in ["MUTAG", "PROTEINS", "PTC", "NCI1"]:
        degree_as_tag = False
    logger.info("Loading data")
    g_list = []
    label_dict = {}
    feat_dict = {}
    files = [
        file.replace("{}_".format(dataset), "").replace(".txt", "")
        for file in os.listdir(os.path.join(data_dir, dataset))
        if file.split("_")[0] == dataset
    ]
    g_indicator = np.loadtxt(
        os.path.join(data_dir, dataset, "{}_graph_indicator.txt".format(dataset)),
        delimiter=",",
    )
    g_labels = np.loadtxt(
        os.path.join(data_dir, dataset, "{}_graph_labels.txt".format(dataset)),
        delimiter=",",
    ).tolist()
    # create helpers
    n_g = np.max(g_indicator).astype(int)
    n_nodes = g_indicator.shape[0]
    n2g_dict = {i: int(g_ind) - 1 for i, g_ind in enumerate(g_indicator.tolist())}
    edge_labels_bool = "edge_labels" in files
    node_labels_bool = "node_labels" in files
    if node_labels_bool:
        node_labels = open(
            os.path.join(data_dir, dataset, "{}_node_labels.txt".format(dataset)), "r"
        )
    if edge_labels_bool:
        edge_labels = open(
            os.path.join(data_dir, dataset, "{}_edge_labels.txt".format(dataset)), "r"
        )
    A = open(os.path.join(data_dir, dataset, "{}_A.txt".format(dataset)), "r")
    node_idx = 0
    for g_idx in tqdm(range(n_g)):
        if g_labels[g_idx] not in label_dict:
            mapped = len(label_dict)
            label_dict[g_labels[g_idx]] = mapped
        g = nx.Graph()
        g_node_idx = 0
        node_dict = {}
        node_tags = []
        while n2g_dict[node_idx] == g_idx:
            node_dict[node_idx] = g_node_idx
            if node_labels_bool:
                l = int(node_labels.readline().strip())
                if l not in feat_dict:
                    mapped = len(feat_dict)
                    feat_dict[l] = mapped
                g.add_node(g_node_idx)
                node_tags.append(feat_dict[l])
            node_idx += 1
            g_node_idx += 1
            if node_idx == n_nodes:
                break
        edge = A.readline().strip().replace(" ", "").split(",")
        # Guard EOF first with a short-circuit `and`; the original
        # `(edge != "")` compared a list to a string and was always True.
        while edge[0] != "" and n2g_dict[int(edge[0]) - 1] == g_idx:
            v1 = int(edge[0]) - 1
            v2 = int(edge[1]) - 1
            g.add_edge(node_dict[v1], node_dict[v2])
            edge = A.readline().strip().replace(" ", "").split(",")
            if edge[0] == "":
                break
        g_list.append(S2VGraph(g, label_dict[g_labels[g_idx]], node_tags))
    inverse_label_dict = {v: k for k, v in label_dict.items()}

    # add labels and edge_mat
    for g in g_list:
        g.neighbors = [[] for _ in range(len(g.g))]
        for i, j in g.g.edges():
            g.neighbors[i].append(j)
            g.neighbors[j].append(i)
        degree_list = []
        for i in range(len(g.g)):
            degree_list.append(len(g.neighbors[i]))
        g.max_neighbor = max(degree_list)
        # g.label = label_dict[g.label]
        edges = [list(pair) for pair in g.g.edges()]
        edges.extend([[i, j] for j, i in edges])
        deg_list = list(dict(g.g.degree(range(len(g.g)))).values())
        g.edge_mat = torch.LongTensor(edges).transpose(0, 1)

    if degree_as_tag:
        for g in g_list:
            g.node_tags = list(dict(g.g.degree).values())

    # Extract the set of unique tag labels
    tagset = set()
    for g in g_list:
        tagset = tagset.union(set(g.node_tags))
    tagset = list(tagset)
    tag2index = {tagset[i]: i for i in range(len(tagset))}

    # one-hot node features over the tag vocabulary
    for g in g_list:
        g.node_features = torch.zeros(len(g.node_tags), len(tagset))
        g.node_features[
            range(len(g.node_tags)),
            [tag2index[tag] for tag in g.node_tags],
        ] = 1

    logger.info("# classes: %d" % len(label_dict))
    logger.info("# maximum node tag: %d" % len(tagset))
    logger.info("# data: %d" % len(g_list))
    return g_list, len(label_dict), g_labels, inverse_label_dict
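# Usage sketch (illustrative): load the GIN-style S2VGraph list and check
# the one-hot node features and edge index tensor. Dataset name and the
# choice of the first graph are assumptions for demonstration.
def _example_inspect_s2v():
    graphs, num_classes, g_labels, inv_label_dict = load_data_tudo("MUTAG")
    g = graphs[0]
    return num_classes, g.node_features.shape, g.edge_mat.shape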
def load_graphs_tudortmund(dataset):
    config = cfg.Config()
    data_dir = f"{config.data_path_tudo}/{dataset}"
    logger.info(f"loading dataset {dataset} from TU Dortmund data.")
    files = [
        file.replace(f"{dataset}_", "").replace(".txt", "")
        for file in os.listdir(data_dir)
        if file.split("_")[0] == dataset
    ]
    g_indicator = np.loadtxt(f"{data_dir}/{dataset}_graph_indicator.txt", delimiter=",")
    g_labels = np.loadtxt(
        f"{data_dir}/{dataset}_graph_labels.txt", delimiter=","
    ).tolist()
    # create helpers
    N = np.max(g_indicator).astype(int)
    n_nodes = g_indicator.shape[0]
    n2g_dict = {i: int(g_ind) - 1 for i, g_ind in enumerate(g_indicator.tolist())}
    edge_labels_bool = "edge_labels" in files
    node_labels_bool = "node_labels" in files
    if node_labels_bool:
        node_labels = open(f"{data_dir}/{dataset}_node_labels.txt", "r")
    if edge_labels_bool:
        edge_labels = open(f"{data_dir}/{dataset}_edge_labels.txt", "r")
    A = open(f"{data_dir}/{dataset}_A.txt", "r")
    node_idx = 0
    g_list = []
    for g_idx in tqdm(range(N)):
        g = nx.Graph()
        while n2g_dict[node_idx] == g_idx:
            if node_labels_bool:
                g.add_node(node_idx, lab=int(node_labels.readline().strip()))
            else:
                g.add_node(node_idx)
            node_idx += 1
            if node_idx == n_nodes:
                break
        # Edges in {dataset}_A.txt appear in both directions, so the
        # lookahead line consumed at each graph boundary is recovered when
        # its reverse direction appears later in the file.
        edge = A.readline().strip().replace(" ", "").split(",")
        # Guard EOF first with a short-circuit `and`; the original
        # `(edge != "")` compared a list to a string and was always True.
        while edge[0] != "" and n2g_dict[int(edge[0]) - 1] == g_idx:
            if edge_labels_bool:
                g.add_edge(
                    int(edge[0]) - 1,
                    int(edge[1]) - 1,
                    lab=int(edge_labels.readline().strip()),
                )
            else:
                g.add_edge(int(edge[0]) - 1, int(edge[1]) - 1)
            edge = A.readline().strip().replace(" ", "").split(",")
            if edge[0] == "":
                break
        if not node_labels_bool:
            nx.set_node_attributes(g, dict(g.degree()), "lab")
        g_list.append(g)
    logger.info(f"# graphs -- {len(g_list)}")
    return g_list, g_labels
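# Usage sketch (illustrative): load the raw TU Dortmund graphs and
# summarize the class balance of the dataset.
def _example_label_counts():
    from collections import Counter

    graphs, g_labels = load_graphs_tudortmund("MUTAG")
    return len(graphs), Counter(g_labels)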
def main():
    # Training settings
    # Note: hyper-parameters need to be tuned to obtain the results reported
    # in the paper.
    parser = argparse.ArgumentParser(
        description="PyTorch graph convolutional neural net for whole-graph classification"
    )
    parser.add_argument(
        "--dataset", type=str, default="MUTAG", help="name of dataset (default: MUTAG)"
    )
    # --device was referenced below but never defined; restored here.
    parser.add_argument(
        "--device", type=int, default=0, help="which gpu to use if any (default: 0)"
    )
    parser.add_argument(
        "--rep_idx",
        type=int,
        default=0,
        help="the index of the cv iteration. Should be less than 10.",
    )
    parser.add_argument(
        "--fold_idx",
        type=int,
        default=0,
        help="the index of the fold in 10-fold validation. Should be less than 10.",
    )
    parser.add_argument(
        "--learn_eps",
        action="store_true",
        help="Whether to learn the epsilon weighting for the center nodes. "
        "Does not affect training accuracy though.",
    )
    args = parser.parse_args()

    config = cfg.Config()
    gin_config = cfg.GINConfig(args.dataset)
    seed = 42 + args.rep_idx
    architecture = (
        f"L{gin_config.num_layers}_R{gin_config.num_mlp_layers}"
        f"_scale{gin_config.neighbor_pooling_type}"
    )
    fold_name = f"rep{args.rep_idx}_fold{args.fold_idx}"
    out_dir = f"{config.exp_path}/GIN/{args.dataset}/{architecture}"
    utils.make_dirs_checked(out_dir)

    # set up seeds and gpu device
    torch.manual_seed(0)
    np.random.seed(0)
    device = (
        torch.device("cuda:" + str(args.device))
        if torch.cuda.is_available()
        else torch.device("cpu")
    )
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(0)

    graphs, num_classes, g_labels, inv_label_dict = load_data_tudo(args.dataset)

    # 10-fold cross-validation: run the experiment on the fold given by args.fold_idx.
    train_graphs, test_graphs, train_idx, test_idx = separate_data(
        args.dataset, graphs, seed, args.fold_idx, g_labels
    )
    # np.savetxt(f'{out_dir}/{file}_train_indices.txt', train_idx, delimiter=",")
    np.savetxt(f"{out_dir}/{fold_name}_test_indices.txt", test_idx, delimiter=",")

    model = GraphCNN(
        gin_config.num_layers,
        gin_config.num_mlp_layers,
        train_graphs[0].node_features.shape[1],
        gin_config.hidden_dim,
        num_classes,
        gin_config.final_dropout,
        args.learn_eps,
        gin_config.graph_pooling_type,
        gin_config.neighbor_pooling_type,
        device,
    ).to(device)
    optimizer = optim.Adam(model.parameters(), lr=gin_config.lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

    for epoch in range(1, gin_config.epochs + 1):
        scheduler.step()
        avg_loss = train(gin_config, model, device, train_graphs, optimizer, epoch)
        acc_train, acc_test, _ = test(model, device, train_graphs, test_graphs, epoch)
        with open(f"{out_dir}/{fold_name}.txt", "a") as f:
            f.write("%f %f %f\n" % (avg_loss, acc_train, acc_test))
        if epoch == gin_config.epochs:
            _, _, predictions = test(model, device, train_graphs, test_graphs, epoch)
            predictions = predictions.data.cpu().numpy().flatten().tolist()
            predictions = [inv_label_dict[pred] for pred in predictions]
            np.savetxt(
                f"{out_dir}/{fold_name}_test_predictions.txt",
                predictions,
                delimiter=",",
            )
    print("")
    print(model.eps)
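# Invocation sketch (the script filename is an assumption; substitute the
# actual module name in this repo). One process trains a single (rep, fold)
# cell; per the rep_idx/fold_idx help text, the full 10 reps x 10 folds grid
# is assembled by launching the script once per combination:
#
#     python gin_main.py --dataset MUTAG --rep_idx 0 --fold_idx 3 --learn_eps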