def __init__(self, log_path):
    # the logger to use
    self.__logger = colorlog.getLogger()
    self.__has_filehandler = False
    self.__has_streamhandler = False

    # the format for logging to console
    console_formatter = colorlog.ColoredFormatter(
        "%(log_color)s%(message)s",
        log_colors={
            "DEBUG": "bold_cyan",
            "INFO": "bold_green",
            "WARNING": "bold_yellow",
            "ERROR": "bold_red",
            "CRITICAL": "white,bg_red",
        },
    )
    self.__console_handler = logging.StreamHandler()
    self.__console_handler.setFormatter(console_formatter)
    self.__logger.addHandler(self.__console_handler)
    self.__has_streamhandler = True

    if log_path is not None:
        utils.make_dirs_checked(log_path)
        ts = datetime.datetime.now().strftime("%Y%m%d-T%H:%M:%S")
        # derive the log file name from the calling script's path below "src"
        path_split = sys.argv[0].split("/")
        src_idx = path_split.index("src")
        log_file = f"{'.'.join(path_split[src_idx:])}-{ts}.log"
        file_handler = logging.FileHandler(f"{log_path}/{log_file}",
                                           mode="a",
                                           encoding=None,
                                           delay=False)
        file_handler.setFormatter(
            logging.Formatter(
                "%(asctime)s [%(threadName)s] [%(levelname)s] %(message)s"))
        self.__logger.addHandler(file_handler)
        self.__has_filehandler = True

    # matplotlib adds annoying debug logs as soon as it is imported -> disable debug logging for matplotlib
    mpl_logger = logging.getLogger("matplotlib")
    mpl_logger.setLevel(logging.WARNING)

    # the logging level to consider
    self.__logger.setLevel(logging.DEBUG)
def add_filehandler(self, log_path):
    if not self.__has_filehandler:
        utils.make_dirs_checked(log_path)
        ts = datetime.datetime.now().strftime("%Y%m%d-T%H:%M:%S")
        path_split = sys.argv[0].split("/")
        src_idx = path_split.index("src")
        log_file = f"{'.'.join(path_split[src_idx:])}-{ts}.log"
        file_handler = logging.FileHandler(f"{log_path}/{log_file}",
                                           mode="a",
                                           encoding=None,
                                           delay=False)
        file_handler.setFormatter(
            logging.Formatter(
                "%(asctime)s [%(threadName)s] [%(levelname)s] %(message)s"))
        self.__logger.addHandler(file_handler)
        self.__has_filehandler = True
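# Usage sketch (kept as a comment so it does not run at import time). The enclosing
# class name is not visible above, so "Logger" and the "./logs" path are assumptions:
#
#     log = Logger(log_path=None)      # console-only: installs the colored stream handler
#     log.add_filehandler("./logs")    # attach a timestamped file handler later, at most once
#
# With log_path=None the constructor skips the file handler entirely; add_filehandler
# is a no-op if a file handler has already been attached.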
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-n",
        "--n_samples",
        default=50,
        type=int,
        help="Number of samples to calculate the gram matrices for.",
    )
    args = parser.parse_args()

    config = cfg.Config()
    exp_config = cfg.TimingExpConfig()
    out_dir = f"{config.exp_path}/timing"
    utils.make_dirs_checked(out_dir)

    datasets = ["IMDBBINARY", "IMDBMULTI", "MUTAG", "NCI1", "PROTEINS", "PTC"]
    kernels = ["GNTK", "VH", "EH", "SP", "WL"]
    gram_time = {dataset: {} for dataset in datasets}

    for dataset in tqdm(datasets, desc="Datasets"):
        # load data
        graphs, labels = data_loaders.load_graphs_tudortmund(dataset)
        n_graphs = len(graphs)
        np.random.seed(42)
        sample_indices = np.random.choice(range(n_graphs), args.n_samples,
def main():
    # Training settings
    # Note: Hyper-parameters need to be tuned in order to obtain the results reported in the paper.
    parser = argparse.ArgumentParser(
        description=
        "PyTorch graph convolutional neural net for whole-graph classification")
    parser.add_argument("--dataset",
                        type=str,
                        default="MUTAG",
                        help="name of dataset (default: MUTAG)")
    parser.add_argument(
        "--rep_idx",
        type=int,
        default=0,
        help="the index of the cv iteration. Should be less than 10.",
    )
    parser.add_argument(
        "--fold_idx",
        type=int,
        default=0,
        help="the index of fold in 10-fold validation. Should be less than 10.",
    )
    parser.add_argument(
        "--learn_eps",
        action="store_true",
        help=
        "Whether to learn the epsilon weighting for the center nodes. Does not affect training accuracy though.",
    )
    # args.device is referenced below, so the flag has to be declared here
    parser.add_argument("--device",
                        type=int,
                        default=0,
                        help="which gpu to use if any (default: 0)")
    args = parser.parse_args()

    config = cfg.Config()
    gin_config = cfg.GINConfig(args.dataset)
    seed = 42 + args.rep_idx
    architecture = f"L{gin_config.num_layers}_R{gin_config.num_mlp_layers}_scale{gin_config.neighbor_pooling_type}"
    fold_name = f"rep{args.rep_idx}_fold{args.fold_idx}"
    out_dir = f"{config.exp_path}/GIN/{args.dataset}/{architecture}"
    utils.make_dirs_checked(out_dir)

    # set up seeds and gpu device
    torch.manual_seed(0)
    np.random.seed(0)
    device = (torch.device("cuda:" + str(args.device))
              if torch.cuda.is_available() else torch.device("cpu"))
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(0)

    graphs, num_classes, g_labels, inv_label_dict = load_data_tudo(args.dataset)

    # 10-fold cross validation. Conduct an experiment on the fold specified by args.fold_idx.
    train_graphs, test_graphs, train_idx, test_idx = separate_data(
        args.dataset, graphs, seed, args.fold_idx, g_labels)
    # np.savetxt(f'{out_dir}/{file}_train_indices.txt', train_idx, delimiter=",")
    np.savetxt(f"{out_dir}/{fold_name}_test_indices.txt",
               test_idx,
               delimiter=",")

    model = GraphCNN(
        gin_config.num_layers,
        gin_config.num_mlp_layers,
        train_graphs[0].node_features.shape[1],
        gin_config.hidden_dim,
        num_classes,
        gin_config.final_dropout,
        args.learn_eps,
        gin_config.graph_pooling_type,
        gin_config.neighbor_pooling_type,
        device,
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=gin_config.lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

    for epoch in range(1, gin_config.epochs + 1):
        avg_loss = train(gin_config, model, device, train_graphs, optimizer,
                         epoch)
        # step the scheduler after this epoch's optimizer updates (PyTorch >= 1.1 convention)
        scheduler.step()
        acc_train, acc_test, _ = test(model, device, train_graphs, test_graphs,
                                      epoch)

        with open(f"{out_dir}/{fold_name}.txt", "a") as f:
            f.write("%f %f %f" % (avg_loss, acc_train, acc_test))
            f.write("\n")

        if epoch == gin_config.epochs:
            # store the test predictions of the final epoch, mapped back to the original labels
            _, _, predictions = test(model, device, train_graphs, test_graphs,
                                     epoch)
            predictions = predictions.data.cpu().numpy().flatten().tolist()
            predictions = [inv_label_dict[pred] for pred in predictions]
            np.savetxt(
                f"{out_dir}/{fold_name}_test_predictions.txt",
                predictions,
                delimiter=",",
            )

        print("")
        print(model.eps)
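# Example invocation (sketch; the script name "train_gin.py" is an assumption, the
# flags are the ones defined in main() above):
#
#     python train_gin.py --dataset MUTAG --rep_idx 0 --fold_idx 3 --learn_eps
#
# This trains repetition 0 on fold 3 of the 10-fold split and writes
# rep0_fold3.txt, rep0_fold3_test_indices.txt and rep0_fold3_test_predictions.txt
# into {config.exp_path}/GIN/MUTAG/<architecture>.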
def evaluate_gntk_cv_run(config, results_dir):
    dataset = results_dir.split("/")[-1]
    exp_detail_path = results_dir.replace(f"{config.exp_path}/", "")
    out_dir = f"{config.eval_path}/{exp_detail_path}"
    utils.make_dirs_checked(out_dir)

    # collect the per-repetition result directories (iteration0, iteration1, ...)
    iteration_dirs = []
    for dir_name in os.listdir(results_dir):
        if dir_name.startswith("iteration"):
            iteration_dirs.append(dir_name)
    iteration_dirs.sort()

    cv_res = {
        "iteration_accuracy": [],
        "overall_accuracy_mean": 0.0,
        "overall_accuracy_std": 0.0,
        "iterations": {},
    }
    for i, d in enumerate(iteration_dirs):
        with open(f"{results_dir}/{d}/cv_results.txt", "r") as f:
            res = json.load(f)
        cv_res["iterations"][str(i)] = res
        cv_res["iteration_accuracy"].append(res["iteration_accuracy_mean"])
    cv_res["overall_accuracy_mean"] = np.mean(cv_res["iteration_accuracy"])
    cv_res["overall_accuracy_std"] = np.std(cv_res["iteration_accuracy"])

    # out_dir = os.path.join(results_dir, 'analysis')
    # if not os.path.isdir(out_dir):
    #     os.makedirs(out_dir, exist_ok=True)
    with open(f"{out_dir}/cv_results.txt", "w") as f:
        json.dump(cv_res, f, indent=4)

    # count how often each parameter combination was selected as best
    best_params = []
    for iteration in cv_res["iterations"].keys():
        for fold in cv_res["iterations"][iteration]["folds"].keys():
            best_params_tmp = cv_res["iterations"][iteration]["folds"][fold][
                "best_parameters"]
            best_params.append(
                f"{best_params_tmp['K']}_normalize{best_params_tmp['normalize']}")
    best_params_count = dict(Counter(best_params))
    with open(f"{out_dir}/best_param_count.txt", "w") as f:
        json.dump(best_params_count, f, indent=4)

    test_indices = {}
    test_predictions = {}
    K_validation_accuracy = {}
    for K in cv_res["iterations"]["0"]["folds"]["0"]["inner_res"][
            "K_results"].keys():
        for norm in ["True", "False"]:
            K_validation_accuracy[f"{K}_normalize{norm}"] = []
    for iteration, iteration_dict in cv_res["iterations"].items():
        test_indices[iteration] = {}
        test_predictions[iteration] = {}
        for fold, fold_dict in iteration_dict["folds"].items():
            test_indices[iteration][fold] = fold_dict["test_indices"]
            test_predictions[iteration][fold] = fold_dict["y_pred"]
            for K, res in fold_dict["inner_res"]["K_results"].items():
                for norm in ["True", "False"]:
                    key = f"{K}_normalize{norm}"
                    val = res["normalization"][norm]["best_accuracy_mean"]
                    K_validation_accuracy[key].append(val)

    with open(f"{out_dir}/test_indices.txt", "w") as f:
        json.dump(test_indices, f, indent=4)
    with open(f"{out_dir}/test_predictions.txt", "w") as f:
        json.dump(test_predictions, f, indent=4)

    score = f'{np.round(cv_res["overall_accuracy_mean"] * 100, 2)} ± {np.round(cv_res["overall_accuracy_std"] * 100, 2)}'
    with open(f"{out_dir}/{score}", "w") as f:
        f.write(score)

    # aggregate the validation accuracies over the full GNTK parameter grid
    dict_list = []
    best_non_norm = {"K_param": None, "accuracy": 0.0}
    params_dict = {
        "L": range(1, 15),
        "R": range(1, 4),
        "scale": ["uniform", "degree"],
        "jk": [0, 1],
        "norm": ["True", "False"],
    }
    param_grid = ParameterGrid(params_dict)
    for params in param_grid:
        L = params["L"]
        R = params["R"]
        scale = params["scale"]
        jk = params["jk"]
        norm = params["norm"]
        key = f"L{L}_R{R}_scale{scale}_jk{jk}_normalize{norm}"
        res = K_validation_accuracy[key]
        count = float(best_params_count.get(key, 0))
        res_acc_mean = np.mean(res)
        res_acc_std = np.std(res)
        dict_list.append({
            "L": L,
            "R": R,
            "scale": scale,
            "jk": jk,
            "norm": norm,
            "acc_mean": res_acc_mean,
            "acc_std": res_acc_std,
            "best_count": count,
        })
        if norm == "False" and res_acc_mean > best_non_norm["accuracy"]:
            best_non_norm["K_param"] = key
            best_non_norm["accuracy"] = res_acc_mean

    K_validation_df = pd.DataFrame(dict_list)
    K_validation_df.to_csv(f"{out_dir}/K_validation_df.csv",
                           index=False,
                           sep=",")
    with open(f"{out_dir}/best_non_norm_params.txt", "w") as f:
        json.dump(best_non_norm, f)
def evaluate_gin_run(config, results_dir):
    dataset = results_dir.split("/")[-1]
    exp_detail_path = results_dir.replace(f"{config.exp_path}/", "")
    out_dir = f"{config.eval_path}/{exp_detail_path}"
    utils.make_dirs_checked(out_dir)

    iteration_accuracy = []
    test_indices = {}
    test_predictions = {}
    for iteration in range(10):
        fold_accuracy = []
        test_indices[iteration] = {}
        test_predictions[iteration] = {}
        for fold in range(10):
            # per-epoch "loss train_acc test_acc" lines written during training
            with open(f"{results_dir}/rep{iteration}_fold{fold}.txt", "r") as f:
                fold_file = f.read().split("\n")[:-1]
            fold_acc_curve = [float(item.split(" ")[2]) for item in fold_file]
            fold_accuracy.append(fold_acc_curve)

            fold_indices = np.loadtxt(
                f"{results_dir}/rep{iteration}_fold{fold}_test_indices.txt",
                delimiter=",",
                dtype=float).round().astype(int)
            test_indices[iteration][fold] = fold_indices.tolist()

            fold_predictions = np.loadtxt(
                f"{results_dir}/rep{iteration}_fold{fold}_test_predictions.txt",
                delimiter=",",
                dtype=float).round().astype(int)
            test_predictions[iteration][fold] = fold_predictions.tolist()

        # average the test-accuracy curves over the 10 folds and pick the best epoch
        iteration_mean_accuracy = np.array(fold_accuracy).mean(axis=0)
        best_epoch = np.argmax(iteration_mean_accuracy)
        iteration_accuracy.append(iteration_mean_accuracy[best_epoch])

    overall_accuracy_mean = np.mean(iteration_accuracy)
    overall_accuracy_std = np.std(iteration_accuracy)
    score = f"{np.round(overall_accuracy_mean * 100, 2)} ± {np.round(overall_accuracy_std * 100, 2)}"
    with open(f"{out_dir}/{score}", "w") as f:
        f.write(score)
    np.savetxt(os.path.join(out_dir, "iteration_accuracies.txt"),
               iteration_accuracy,
               delimiter=",")
    with open(f"{out_dir}/test_indices.txt", "w") as f:
        json.dump(test_indices, f)
    with open(f"{out_dir}/test_predictions.txt", "w") as f:
        json.dump(test_predictions, f)
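# Usage sketch (the architecture directory "L5_R2_scalesum" is hypothetical; the
# rep{i}_fold{j}* file names match what the GIN training script above writes):
#
#     config = cfg.Config()
#     evaluate_gin_run(config, f"{config.exp_path}/GIN/MUTAG/L5_R2_scalesum")
#
# The results directory must hold, for every repetition i and fold j in 0..9,
# rep{i}_fold{j}.txt, rep{i}_fold{j}_test_indices.txt and
# rep{i}_fold{j}_test_predictions.txt.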
    acc_mean = np.round(acc_mean * 100, 2)
    acc_std = np.round(acc_std * 100, 2)
    score = f"{acc_mean:.2f} ± {acc_std:.2f}"
    score_latex = f"${acc_mean:.2f} \\pm {acc_std:.2f}$"
    return score, score_latex


if __name__ == "__main__":
    config = cfg.Config()
    logger.info("-------------------------------------")
    logger.info("Evaluating experiment (a)")
    logger.info("-------------------------------------")
    datasets = ["IMDBBINARY", "IMDBMULTI", "MUTAG", "NCI1", "PROTEINS", "PTC"]
    utils.make_dirs_checked(config.reporting_path)

    replication_df = pd.DataFrame(np.zeros((3, 6)))
    # originally reported accuracies per dataset, in the order of `datasets`
    orig_results = [
        "76.9 ± 3.6", "52.8 ± 4.6", "90.0 ± 8.5", "84.2 ± 1.5", "75.6 ± 4.2",
        "67.9 ± 6.9"
    ]
    replication_df.iloc[0, :] = orig_results

    for i, dataset in tqdm(enumerate(datasets)):
        # (a.1)
        res_dir = f"{config.exp_path}/GNTK/a.1/{dataset}/iteration0"
        with open(f"{res_dir}/cv_results.txt", "r") as f:
            cv_res = json.load(f)