Example #1
        def __init__(self, log_path):

            # the logger to use
            self.__logger = colorlog.getLogger()
            self.__has_filehandler = False
            self.__has_streamhandler = False

            # the format for logging to console
            console_formatter = colorlog.ColoredFormatter(
                "%(log_color)s%(message)s",
                log_colors={
                    "DEBUG": "bold_cyan",
                    "INFO": "bold_green",
                    "WARNING": "bold_yellow",
                    "ERROR": "bold_red",
                    "CRITICAL": "white,bg_red",
                },
            )
            self.__console_handler = logging.StreamHandler()
            self.__console_handler.setFormatter(console_formatter)
            self.__logger.addHandler(self.__console_handler)
            self.__has_streamhandler = True

            if log_path is not None:
                utils.make_dirs_checked(log_path)

                ts = datetime.datetime.now().strftime("%Y%m%d-T%H:%M:%S")
                path_split = sys.argv[0].split("/")
                src_idx = path_split.index("src")
                log_file = f"{'.'.join(path_split[src_idx:])}-{ts}.log"
                file_handler = logging.FileHandler(f"{log_path}/{log_file}",
                                                   mode="a",
                                                   encoding=None,
                                                   delay=False)
                file_handler.setFormatter(
                    logging.Formatter(
                        "%(asctime)s [%(threadName)s] [%(levelname)s] %(message)s"
                    ))
                self.__logger.addHandler(file_handler)
                self.__has_filehandler = True

            # matplotlib emits noisy DEBUG messages as soon as it is imported, so raise its log level to WARNING
            mpl_logger = logging.getLogger("matplotlib")
            mpl_logger.setLevel(logging.WARNING)

            # the logging level to consider
            self.__logger.setLevel(logging.DEBUG)
Example #2
        def add_filehandler(self, log_path):
            if not self.__has_filehandler:
                utils.make_dirs_checked(log_path)

                ts = datetime.datetime.now().strftime("%Y%m%d-T%H:%M:%S")
                path_split = sys.argv[0].split("/")
                src_idx = path_split.index("src")
                log_file = f"{'.'.join(path_split[src_idx:])}-{ts}.log"
                file_handler = logging.FileHandler(f"{log_path}/{log_file}",
                                                   mode="a",
                                                   encoding=None,
                                                   delay=False)
                file_handler.setFormatter(
                    logging.Formatter(
                        "%(asctime)s [%(threadName)s] [%(levelname)s] %(message)s"
                    ))
                self.__logger.addHandler(file_handler)
                self.__has_filehandler = True
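
A self-contained sketch of the same handler setup used in Examples #1 and #2 (a colored console handler plus an optional plain-text file handler on one logger), assuming the colorlog package is installed; the file name and messages below are illustrative only:

import logging

import colorlog

logger = colorlog.getLogger()
logger.setLevel(logging.DEBUG)

# colored console output, mirroring the ColoredFormatter configured above
console_handler = logging.StreamHandler()
console_handler.setFormatter(colorlog.ColoredFormatter("%(log_color)s%(message)s"))
logger.addHandler(console_handler)

# optional plain-text file output, attached only once an output path is known
file_handler = logging.FileHandler("run.log", mode="a")
file_handler.setFormatter(
    logging.Formatter("%(asctime)s [%(threadName)s] [%(levelname)s] %(message)s"))
logger.addHandler(file_handler)

logger.info("console and file handlers are both active")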
Example #3
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-n",
        "--n_samples",
        default=50,
        type=int,
        help="Number of samples to calculate the gram matrices for.",
    )

    args = parser.parse_args()
    config = cfg.Config()
    exp_config = cfg.TimingExpConfig()

    out_dir = f"{config.exp_path}/timing"
    utils.make_dirs_checked(out_dir)

    datasets = ["IMDBBINARY", "IMDBMULTI", "MUTAG", "NCI1", "PROTEINS", "PTC"]
    kernels = ["GNTK", "VH", "EH", "SP", "WL"]

    gram_time = {dataset: {} for dataset in datasets}
    for dataset in tqdm(datasets, desc="Datasets"):

        # load data
        graphs, labels = data_loaders.load_graphs_tudortmund(dataset)

        n_graphs = len(graphs)

        np.random.seed(42)
        sample_indices = np.random.choice(range(n_graphs),
                                          args.n_samples,
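
The excerpt above is cut off mid-call; purely as a hypothetical illustration of the sampling-plus-timing pattern such a script follows (the kernel function below is a placeholder, not the project's API):

import time

import numpy as np

np.random.seed(42)
n_graphs = 200
# draw a fixed-size sample of graph indices, as in the excerpt above
sample_indices = np.random.choice(range(n_graphs), 50, replace=False)

def placeholder_kernel(indices):
    # stand-in for an actual gram-matrix computation over the sampled graphs
    return np.ones((len(indices), len(indices)))

start = time.time()
gram = placeholder_kernel(sample_indices)
print(f"gram matrix {gram.shape} computed in {time.time() - start:.4f}s")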
Example #4
File: main.py Project: PhN01/thesis_gntk
def main():
    # Training settings
    # Note: Hyper-parameters need to be tuned in order to obtain results reported in the paper.
    parser = argparse.ArgumentParser(
        description=
        "PyTorch graph convolutional neural net for whole-graph classification"
    )
    parser.add_argument("--dataset",
                        type=str,
                        default="MUTAG",
                        help="name of dataset (default: MUTAG)")
    parser.add_argument(
        "--rep_idx",
        type=int,
        default=0,
        help="the index of the cv iteration. Should be less then 10.",
    )
    parser.add_argument(
        "--fold_idx",
        type=int,
        default=0,
        help="the index of fold in 10-fold validation. Should be less then 10.",
    )
    parser.add_argument(
        "--learn_eps",
        action="store_true",
        help=
        "Whether to learn the epsilon weighting for the center nodes. Does not affect training accuracy though.",
    )
    # args.device is used below when selecting the CUDA device
    parser.add_argument("--device",
                        type=int,
                        default=0,
                        help="index of the GPU to use (default: 0)")
    args = parser.parse_args()

    config = cfg.Config()
    gin_config = cfg.GINConfig(args.dataset)

    seed = 42 + args.rep_idx

    architecture = f"L{gin_config.num_layers}_R{gin_config.num_mlp_layers}_scale{gin_config.neighbor_pooling_type}"
    fold_name = f"rep{args.rep_idx}_fold{args.fold_idx}"

    out_dir = f"{config.exp_path}/GIN/{args.dataset}/{architecture}"
    utils.make_dirs_checked(out_dir)

    # set up seeds and gpu device
    torch.manual_seed(0)
    np.random.seed(0)
    device = (torch.device("cuda:" + str(args.device))
              if torch.cuda.is_available() else torch.device("cpu"))

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(0)

    graphs, num_classes, g_labels, inv_label_dict = load_data_tudo(
        args.dataset)

    # 10-fold cross-validation. Conduct an experiment on the fold specified by args.fold_idx.
    train_graphs, test_graphs, train_idx, test_idx = separate_data(
        args.dataset, graphs, seed, args.fold_idx, g_labels)
    # np.savetxt(f'{out_dir}/{file}_train_indices.txt', train_idx, delimiter=",")
    np.savetxt(f"{out_dir}/{fold_name}_test_indices.txt",
               test_idx,
               delimiter=",")

    model = GraphCNN(
        gin_config.num_layers,
        gin_config.num_mlp_layers,
        train_graphs[0].node_features.shape[1],
        gin_config.hidden_dim,
        num_classes,
        gin_config.final_dropout,
        args.learn_eps,
        gin_config.graph_pooling_type,
        gin_config.neighbor_pooling_type,
        device,
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=gin_config.lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

    for epoch in range(1, gin_config.epochs + 1):
        avg_loss = train(gin_config, model, device, train_graphs, optimizer,
                         epoch)
        # step the LR scheduler after the epoch's optimizer updates
        # (PyTorch >= 1.1 expects scheduler.step() after optimizer.step())
        scheduler.step()
        acc_train, acc_test, _ = test(model, device, train_graphs, test_graphs,
                                      epoch)

        with open(f"{out_dir}/{fold_name}.txt", "a") as f:
            f.write("%f %f %f" % (avg_loss, acc_train, acc_test))
            f.write("\n")

        if epoch == gin_config.epochs:
            _, _, predictions = test(model, device, train_graphs, test_graphs,
                                     epoch)
            predictions = predictions.data.cpu().numpy().flatten().tolist()
            predictions = [inv_label_dict[pred] for pred in predictions]
            np.savetxt(
                f"{out_dir}/{fold_name}_test_predictions.txt",
                predictions,
                delimiter=",",
            )

        print("")

        print(model.eps)
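
The script trains a single (rep_idx, fold_idx) combination, so the full 10×10 cross-validation grid has to be launched externally; a hypothetical launcher sketch using only the arguments defined above:

import itertools
import subprocess
import sys

# sketch: run main.py once per (repetition, fold) pair of the 10x10 CV grid
for rep_idx, fold_idx in itertools.product(range(10), range(10)):
    subprocess.run(
        [sys.executable, "main.py",
         "--dataset", "MUTAG",
         "--rep_idx", str(rep_idx),
         "--fold_idx", str(fold_idx)],
        check=True,
    )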
Example #5
def evaluate_gntk_cv_run(config, results_dir):

    dataset = results_dir.split("/")[-1]
    exp_detail_path = results_dir.replace(f"{config.exp_path}/", "")
    out_dir = f"{config.eval_path}/{exp_detail_path}"

    utils.make_dirs_checked(out_dir)

    iteration_dirs = []
    for entry in os.listdir(results_dir):
        # collect the per-repetition result directories, named like "iteration<idx>"
        if entry.split("_")[0][:-1] == "iteration":
            iteration_dirs.append(entry)
    iteration_dirs.sort()

    cv_res = {
        "iteration_accuracy": [],
        "overall_accuracy_mean": 0.0,
        "overall_accuracy_std": 0.0,
        "iterations": {},
    }
    for i, d in enumerate(iteration_dirs):
        with open(f"{results_dir}/{d}/cv_results.txt", "r") as f:
            res = json.load(f)
        cv_res["iterations"][str(i)] = res
        cv_res["iteration_accuracy"].append(res["iteration_accuracy_mean"])
    cv_res["overall_accuracy_mean"] = np.mean(cv_res["iteration_accuracy"])
    cv_res["overall_accuracy_std"] = np.std(cv_res["iteration_accuracy"])

    # out_dir = os.path.join(results_dir, 'analysis')
    # if not os.path.isdir(out_dir):
    #     os.makedirs(out_dir, exist_ok=True)

    with open(f"{out_dir}/cv_results.txt", "w") as f:
        json.dump(cv_res, f, indent=4)

    best_params = []
    for iteration in cv_res["iterations"].keys():
        for fold in cv_res["iterations"][iteration]["folds"].keys():
            best_params_tmp = cv_res["iterations"][iteration]["folds"][fold][
                "best_parameters"]
            best_params.append(
                f"{best_params_tmp['K']}_normalize{best_params_tmp['normalize']}"
            )
    best_params_count = dict(Counter(best_params))

    with open(f"{out_dir}/best_param_count.txt", "w") as f:
        json.dump(best_params_count, f, indent=4)

    test_indices = {}
    test_predictions = {}
    K_validation_accuracy = {}
    for K in cv_res["iterations"]["0"]["folds"]["0"]["inner_res"][
            "K_results"].keys():
        for norm in ["True", "False"]:
            K_validation_accuracy[f"{K}_normalize{norm}"] = []

    for iteration, iteration_dict in cv_res["iterations"].items():
        test_indices[iteration] = {}
        test_predictions[iteration] = {}
        for fold, fold_dict in iteration_dict["folds"].items():

            test_indices[iteration][fold] = fold_dict["test_indices"]
            test_predictions[iteration][fold] = fold_dict["y_pred"]

            for K, res in fold_dict["inner_res"]["K_results"].items():
                for norm in ["True", "False"]:
                    key = f"{K}_normalize{norm}"
                    val = res["normalization"][norm]["best_accuracy_mean"]
                    K_validation_accuracy[key].append(val)

    with open(f"{out_dir}/test_indices.txt", "w") as f:
        json.dump(test_indices, f, indent=4)

    with open(f"{out_dir}/test_predictions.txt", "w") as f:
        json.dump(test_predictions, f, indent=4)

    score = f'{np.round(cv_res["overall_accuracy_mean"]*100,2)} ± {np.round(cv_res["overall_accuracy_std"]*100,2)}'

    with open(f"{out_dir}/{score}", "w") as f:
        f.write(score)

    dict_list = []
    best_non_norm = {"K_param": None, "accuracy": 0.0}

    params_dict = {
        "L": range(1, 15),
        "R": range(1, 4),
        "scale": ["uniform", "degree"],
        "jk": [0, 1],
        "norm": ["True", "False"],
    }
    param_grid = ParameterGrid(params_dict)

    for params in param_grid:
        L = params["L"]
        R = params["R"]
        scale = params["scale"]
        jk = params["jk"]
        norm = params["norm"]

        key = f"L{L}_R{R}_scale{scale}_jk{jk}_normalize{norm}"
        res = K_validation_accuracy[key]
        # how often this configuration was selected as best across all CV folds
        count = best_params_count.get(key, 0)

        res_acc_mean = np.mean(res)
        res_acc_std = np.std(res)
        dict_list.append({
            "L": L,
            "R": R,
            "scale": scale,
            "jk": jk,
            "norm": norm,
            "acc_mean": res_acc_mean,
            "acc_std": res_acc_std,
            "best_count": count,
        })
        if norm == "False":
            if res_acc_mean > best_non_norm["accuracy"]:
                best_non_norm["K_param"] = key
                best_non_norm["accuracy"] = res_acc_mean

    K_validation_df = pd.DataFrame(dict_list)
    K_validation_df.to_csv(f"{out_dir}/K_validation_df.csv",
                           index=False,
                           sep=",")

    with open(f"{out_dir}/best_non_norm_params.txt", "w") as f:
        json.dump(best_non_norm, f)
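
The hyper-parameter keys built in the loop above must match the keys of `K_validation_accuracy` exactly; a small self-contained sketch of how `sklearn.model_selection.ParameterGrid` expands a parameter dictionary into those combinations:

from sklearn.model_selection import ParameterGrid

# sketch: ParameterGrid yields one dict per combination of the listed values
params_dict = {"L": [1, 2], "scale": ["uniform", "degree"], "norm": ["True", "False"]}
for params in ParameterGrid(params_dict):
    key = f"L{params['L']}_scale{params['scale']}_normalize{params['norm']}"
    print(key)  # e.g. L1_scaleuniform_normalizeTrue  (8 combinations in total)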
Example #6
def evaluate_gin_run(config, results_dir):

    dataset = results_dir.split("/")[-1]
    exp_detail_path = results_dir.replace(f"{config.exp_path}/", "")
    out_dir = f"{config.eval_path}/{exp_detail_path}"

    utils.make_dirs_checked(out_dir)

    iteration_accuracy = []
    test_indices = {}
    test_predictions = {}

    for iteration in range(10):

        fold_accuracy = []
        test_indices[iteration] = {}
        test_predictions[iteration] = {}

        for fold in range(10):

            with open(f'{results_dir}/rep{iteration}_fold{fold}.txt',
                      'r') as f:
                fold_file = f.read().split('\n')[:-1]

                fold_acc_curve = [
                    float(item.split(' ')[2]) for item in fold_file
                ]
                fold_accuracy.append(fold_acc_curve)

            fold_indices = np.loadtxt(
                f'{results_dir}/rep{iteration}_fold{fold}_test_indices.txt',
                delimiter=",",
                dtype=float).round().astype(int)
            test_indices[iteration][fold] = fold_indices.tolist()

            fold_predictions = np.loadtxt(
                f'{results_dir}/rep{iteration}_fold{fold}_test_predictions.txt',
                delimiter=",",
                dtype=float).round().astype(int)
            test_predictions[iteration][fold] = fold_predictions.tolist()

        iteration_mean_accuracy = np.array(fold_accuracy).mean(axis=0)
        best_epoch = np.argmax(iteration_mean_accuracy)

        iteration_accuracy.append(iteration_mean_accuracy[best_epoch])

    overall_accuracy_mean = np.mean(iteration_accuracy)
    overall_accuracy_std = np.std(iteration_accuracy)
    score = f'{np.round(overall_accuracy_mean * 100, 2)} ± {np.round(overall_accuracy_std * 100, 2)}'

    with open(f'{out_dir}/{score}', 'w') as f:
        f.write(score)

    np.savetxt(os.path.join(out_dir, 'iteration_accuracies.txt'),
               iteration_accuracy,
               delimiter=",")

    with open(f'{out_dir}/test_indices.txt', 'w') as f:
        json.dump(test_indices, f)

    with open(f'{out_dir}/test_predictions.txt', 'w') as f:
        json.dump(test_predictions, f)
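
evaluate_gin_run first averages the per-epoch test accuracies over the 10 folds of a repetition and then takes the best epoch of that averaged curve; a tiny numpy sketch of that reduction with toy values (two folds, three epochs):

import numpy as np

# rows = folds, columns = epochs (toy accuracies for illustration only)
fold_accuracy = np.array([[0.60, 0.72, 0.70],
                          [0.58, 0.70, 0.74]])
iteration_mean_accuracy = fold_accuracy.mean(axis=0)    # per-epoch mean over folds
best_epoch = np.argmax(iteration_mean_accuracy)         # epoch with the highest mean
print(best_epoch, iteration_mean_accuracy[best_epoch])  # -> 2 0.72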
Example #7
    acc_mean = np.round(acc_mean * 100, 2)
    acc_std = np.round(acc_std * 100, 2)
    score = f'{acc_mean:.2f} ± {acc_std:.2f}'
    score_latex = f'${acc_mean:.2f} \\pm {acc_std:.2f}$'
    return score, score_latex
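
A quick usage sketch of the score-formatting helper above; its signature is not shown in the excerpt, so `format_score(acc_mean, acc_std)` is a hypothetical stand-in wrapping the same body:

import numpy as np

def format_score(acc_mean, acc_std):
    # hypothetical wrapper around the body shown above
    acc_mean = np.round(acc_mean * 100, 2)
    acc_std = np.round(acc_std * 100, 2)
    score = f"{acc_mean:.2f} ± {acc_std:.2f}"
    score_latex = f"${acc_mean:.2f} \\pm {acc_std:.2f}$"
    return score, score_latex

print(format_score(0.769, 0.036))  # -> ('76.90 ± 3.60', '$76.90 \\pm 3.60$')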


if __name__ == "__main__":
    config = cfg.Config()
    logger.info("-------------------------------------")
    logger.info("Evaluating experiment (a)")
    logger.info("-------------------------------------")

    datasets = ['IMDBBINARY', 'IMDBMULTI', 'MUTAG', 'NCI1', 'PROTEINS', 'PTC']

    utils.make_dirs_checked(config.reporting_path)

    # object dtype so that the "mean ± std" score strings can be stored
    replication_df = pd.DataFrame(np.zeros((3, 6)), dtype=object)
    orig_results = [
        '76.9 ± 3.6', '52.8 ± 4.6', '90.0 ± 8.5', '84.2 ± 1.5', '75.6 ± 4.2',
        '67.9 ± 6.9'
    ]
    replication_df.iloc[0, :] = orig_results

    for i, dataset in tqdm(enumerate(datasets)):

        # (a.1)
        res_dir = f'{config.exp_path}/GNTK/a.1/{dataset}/iteration0'

        with open(f'{res_dir}/cv_results.txt', 'r') as f:
            cv_res = json.load(f)