Example #1
def tst_classify_synthetic():
    print(f"{time.time() - start_time:.4f} tst_classify_synthetic")
    dataset_name = "MNIST"
    set_proxy()
    train_dataset = GNNBenchmarkDataset(root="tst/gnn_benchmark_datasets",
                                        name=dataset_name)
    test_dataset = GNNBenchmarkDataset(root="tst/gnn_benchmark_datasets",
                                       name=dataset_name,
                                       split="test")
    dim_nodes = train_dataset.data.x.shape[1]
    num_classes = train_dataset.num_classes

    print(
        f"{time.time() - start_time:.4f} Finished Loading the dataset: {dataset_name}"
    )
    print(f"Number of classes: {num_classes}. Node feature shape: {dim_nodes}")

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
    model = GCN(hidden_channels=60, in_size=dim_nodes, out_size=num_classes)

    test_acc = func_test(model, test_loader)
    print(
        f'{time.time() - start_time:.4f} Test Acc (Initial): {test_acc:.4f}')

    for epoch in range(10):
        train(model, train_loader)
        train_acc = func_test(model, train_loader)
        test_acc = func_test(model, test_loader)
        print(
            f'{time.time() - start_time:.4f} Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}'
        )
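
The GCN(hidden_channels, in_size, out_size) graph classifier called in these examples is defined elsewhere in the project. A minimal sketch of a compatible model, assuming torch_geometric's GCNConv layers with mean pooling (the actual architecture may differ):

import torch
from torch_geometric.nn import GCNConv, global_mean_pool

class GCN(torch.nn.Module):
    # Sketch only: two GCNConv layers, mean pooling, linear classifier head.
    def __init__(self, hidden_channels, in_size, out_size):
        super().__init__()
        self.conv1 = GCNConv(in_size, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.lin = torch.nn.Linear(hidden_channels, out_size)

    def forward(self, x, edge_index, batch):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index).relu()
        x = global_mean_pool(x, batch)  # aggregate node embeddings per graph
        return self.lin(x)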
Example #2
def tst_classify_synthetic():
    print(f"{time.time() - start_time:.4f} tst_classify_synthetic")
    num_samples = 1000
    num_classes = 2
    min_nodes = 10
    max_nodes = 10
    dim_nodes = 4
    noise_nodes = 1
    connectivity_rate = 0.2
    connectivity_rate_noise = 0.05
    symmetric_flag = True
    random = np.random.RandomState(0)
    noise_remove_node = 0.1
    noise_add_node = 0.1

    graph_dataset = generate_graphs_dataset(
        num_samples=num_samples,
        num_classes=num_classes,
        min_nodes=min_nodes,
        max_nodes=max_nodes,
        dim_nodes=dim_nodes,
        noise_nodes=noise_nodes,
        connectivity_rate=connectivity_rate,
        connectivity_rate_noise=connectivity_rate_noise,
        noise_remove_node=noise_remove_node,
        noise_add_node=noise_add_node,
        symmetric_flag=symmetric_flag,
        random=random)
    print(f"{time.time() - start_time:.4f} Finished generating dataset")

    # print("")
    # print(graph_dataset)
    tg_dataset = transform_dataset_to_torch_geometric_dataset(
        graph_dataset.samples, graph_dataset.labels)
    train_loader = DataLoader(tg_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(tg_dataset, batch_size=64, shuffle=False)
    model = GCN(hidden_channels=60, in_size=dim_nodes, out_size=num_classes)

    test_acc = func_test(model, test_loader)
    print(f'{time.time() - start_time:.4f} Test Acc (Initial): {test_acc:.4f}')

    for epoch in range(10):
        train(model, train_loader)
        train_acc = func_test(model, train_loader)
        test_acc = func_test(model, test_loader)
        print(
            f'{time.time() - start_time:.4f} Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}'
        )
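
The train and func_test helpers are likewise external. A plausible sketch, assuming a standard cross-entropy loop over torch_geometric DataLoader batches and a model with the forward(x, edge_index, batch) signature sketched above (the project's helpers may differ; in Example #4, for instance, func_test also returns an average time):

import torch
import torch.nn.functional as F

def train(model, loader, lr=0.01):
    # One pass over the training set with Adam and cross-entropy loss.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.train()
    for data in loader:
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, data.batch)
        loss = F.cross_entropy(out, data.y)
        loss.backward()
        optimizer.step()

def func_test(model, loader):
    # Accuracy over all graphs in the loader.
    model.eval()
    correct = 0
    for data in loader:
        with torch.no_grad():
            pred = model(data.x, data.edge_index, data.batch).argmax(dim=1)
        correct += int((pred == data.y).sum())
    return correct / len(loader.dataset)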
Example #3
def tst_minhash_lsh_vs_random(random=np.random.RandomState(0)):
    """
    This tst shows how to
    (1) Generate Dataset
    (2) Do the prunning
    (3) Do classification.
    :return:
    """
    print(f"{time.time() - start_time:.4f} tst_classify_synthetic")

    # (1) Generate synthetic dataset
    print("(1) Generate synthetic dataset")
    # Dataset parameters
    num_samples = 1000
    num_classes = 10
    min_nodes = 20
    max_nodes = 30
    dim_nodes = 1
    dim_edges = 2
    connectivity_rate = 0.2
    connectivity_rate_noise = 0.15
    symmetric_flag = True
    noise_remove_node = 0.0
    hidden_channels = 10
    nodes_order_scramble_flag = True
    centers_nodes_std = 1
    centers_edges_std = 1
    node_additive_noise_std = 0.1
    edge_additive_noise_std = 0.1

    epoch_times = 30

    graph_dataset = generate_graphs_dataset(num_samples=num_samples,
                                            num_classes=num_classes,
                                            min_nodes=min_nodes,
                                            max_nodes=max_nodes,
                                            dim_nodes=dim_nodes,
                                            dim_edges=dim_edges,
                                            connectivity_rate=connectivity_rate,
                                            connectivity_rate_noise=connectivity_rate_noise,
                                            noise_remove_node=noise_remove_node,
                                            node_additive_noise_std=node_additive_noise_std,
                                            edge_additive_noise_std=edge_additive_noise_std,
                                            symmetric_flag=symmetric_flag,
                                            centers_nodes_std=centers_nodes_std,
                                            centers_edges_std=centers_edges_std,
                                            nodes_order_scramble_flag=nodes_order_scramble_flag,
                                            random=random)

    print("Show centers")
    for idx, center in enumerate(graph_dataset.centers):
        print(f"center {idx}")
        print(f"{center.__str__()}")


    # Next, we define the MinHash and LSH functions used for pruning.
    # MinHash parameters
    num_minhash_funcs = 1
    minhash = MinHash(num_minhash_funcs, random, prime=2147483647)
    print(f"minhash:\n{minhash}")

    # LSH parameters
    lsh_num_funcs = 2
    sparsity = 3
    std_of_threshold = 1
    lsh = LSH(dim_nodes,
              num_functions=lsh_num_funcs,
              sparsity=sparsity,
              std_of_threshold=std_of_threshold,
              random=random)
    print(f"lsh:\n{lsh}")

    # (2) Create the dataset
    print("(2) Create the dataset")
    # Transform the dataset
    tg_dataset_original = transform_dataset_to_torch_geometric_dataset(graph_dataset.samples, graph_dataset.labels)


    # (3) Do the pruning
    print("(3) Do the pruning")
    # Make copies so we can verify that the pruning actually modified the data.
    tg_dataset_minhash_lsh = copy.deepcopy(tg_dataset_original)
    tg_dataset_random = copy.deepcopy(tg_dataset_original)

    # Do the pruning according to the two methods:
    pruning_ratio = tg_dataset_prune(tg_dataset_minhash_lsh, "minhash_lsh", minhash=minhash, lsh=lsh)
    print(f"pruning_ratio = {pruning_ratio}")
    tg_dataset_prune(tg_dataset_random, "random", p=pruning_ratio, random=random)

    # Show some samples:
    print("")
    for i in range(min(1, len(graph_dataset.samples))):
        print(f"{i}) Original=\n{tg_dataset_original[i].edge_index.shape}")
        print(f"{i}) Pruned minhash_lsh=\n{tg_dataset_minhash_lsh[i].edge_index.shape}")
        print(f"{i}) Pruned random=\n{tg_dataset_random[i].edge_index.shape}")

    print(f"{time.time() - start_time:.4f} Finished generating dataset")

    # (4) Train on the original dataset.
    print("(4) Train on the original dataset")
    train_loader = DataLoader(tg_dataset_original, batch_size=64, shuffle=True)
    test_loader = DataLoader(tg_dataset_original, batch_size=64, shuffle=False)
    model = GCN(hidden_channels=hidden_channels, in_size=dim_nodes, out_size=num_classes)

    test_acc = func_test(model, test_loader)
    print(f'{time.time() - start_time:.4f} Test Acc: {test_acc:.4f}')

    epoch_times_original = list()
    for epoch in range(epoch_times):
        start_epoch = time.time()
        train(model, train_loader, lr=0.01)
        train_acc = func_test(model, train_loader)
        test_acc = func_test(model, test_loader)
        print(f'{time.time() - start_time:.4f} Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')
        epoch_times_original.append(time.time() - start_epoch)
    test_acc_original = test_acc

    # (5) Train on the minhash_lsh-pruned dataset.
    print("(5) Train on the minhash_lsh-pruned dataset")
    train_loader = DataLoader(tg_dataset_minhash_lsh, batch_size=64, shuffle=True)
    test_loader = DataLoader(tg_dataset_minhash_lsh, batch_size=64, shuffle=False)
    model = GCN(hidden_channels=hidden_channels, in_size=dim_nodes, out_size=num_classes)

    test_acc = func_test(model, test_loader)
    print(f'{time.time() - start_time:.4f} Test Acc: {test_acc:.4f}')

    epoch_times_minhash_lsh = list()
    for epoch in range(epoch_times):
        start_epoch = time.time()
        train(model, train_loader)
        train_acc = func_test(model, train_loader)
        test_acc = func_test(model, test_loader)
        print(f'{time.time() - start_time:.4f} Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')
        epoch_times_minhash_lsh.append(time.time() - start_epoch)
    test_acc_minhash_lsh = test_acc

    # (6) Train on the randomly pruned dataset.
    print("(6) Train on the randomly pruned dataset")
    train_loader = DataLoader(tg_dataset_random, batch_size=64, shuffle=True)
    test_loader = DataLoader(tg_dataset_random, batch_size=64, shuffle=False)
    model = GCN(hidden_channels=hidden_channels, in_size=dim_nodes, out_size=num_classes)

    test_acc = func_test(model, test_loader)
    print(f'{time.time() - start_time:.4f} Test Acc: {test_acc:.4f}')

    epoch_times_random = list()
    for epoch in range(epoch_times):
        start_epoch = time.time()
        train(model, train_loader)
        train_acc = func_test(model, train_loader)
        test_acc = func_test(model, test_loader)
        print(f'{time.time() - start_time:.4f} Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')
        epoch_times_random.append(time.time() - start_epoch)
    test_acc_random = test_acc

    print(f"Summary")
    print(f"original epochs time mean: {np.mean(epoch_times_original)}")
    print(f"original epochs time minhash_lsh: {np.mean(epoch_times_minhash_lsh)}")
    print(f"original epochs time random: {np.mean(epoch_times_random)}")
    print(f"final test_acc_original: {test_acc_original}")
    print(f"final test_acc_minhash_lsh: {test_acc_minhash_lsh}")
    print(f"final test_acc_random: {test_acc_random}")
Example #4
def tst_classify_networkx_synthetic_tg(
        args,
        num_samples=1000,
        num_classes=2,
        min_nodes=10,
        max_nodes=10,
        dim_nodes=4,
        dim_edges=4,
        centers_nodes_std=0.1,
        centers_edges_std=0.1,
        connectivity_rate=0.2,
        connectivity_rate_noise=0.05,
        symmetric_flag=True,
        nodes_order_scramble_flag=True,
        node_additive_noise_std=0.1,
        edge_additive_noise_std=0.1,
        random=np.random.RandomState(0),
        noise_remove_node=0.1,
        noise_add_node=0.1,
        tb_writer=None,
        graph_dataset=None,
        **kwargs,
):
    print(f"{time.time() - start_time:.4f} tst_classify_synthetic")

    if graph_dataset is None:
        graph_dataset = rgd.generate_graphs_dataset(num_samples=num_samples,
                                                    num_classes=num_classes,
                                                    min_nodes=min_nodes,
                                                    max_nodes=max_nodes,
                                                    dim_nodes=dim_nodes,
                                                    dim_edges=dim_edges,
                                                    centers_nodes_std=centers_nodes_std,
                                                    centers_edges_std=centers_edges_std,
                                                    connectivity_rate=connectivity_rate,
                                                    connectivity_rate_noise=connectivity_rate_noise,
                                                    noise_remove_node=noise_remove_node,
                                                    node_additive_noise_std=node_additive_noise_std,
                                                    edge_additive_noise_std=edge_additive_noise_std,
                                                    noise_add_node=noise_add_node,
                                                    nodes_order_scramble_flag=nodes_order_scramble_flag,
                                                    symmetric_flag=symmetric_flag,
                                                    random=random)

    tg_dataset = su.transform_dataset_to_torch_geometric_dataset(graph_dataset.samples, graph_dataset.labels)
    print(f"{time.time() - start_time:.4f} Finished generating dataset")

    pruning_params, pruning_ratio = prune_dataset(tg_dataset, args)

    tg_dataset_train, tg_dataset_test = train_test_split(tg_dataset, test_size=0.25)

    train_loader = DataLoader(tg_dataset_train, batch_size=args.batch_size, shuffle=True)
    test_loader = DataLoader(tg_dataset_test, batch_size=args.batch_size, shuffle=False)

    model = get_model(arch=args.gnn, dim_nodes=dim_nodes, num_classes=num_classes).to(args.device)
    test_acc, _ = func_test(args, model, test_loader)

    print(f'{time.time() - start_time:.4f} Test Acc: {test_acc:.4f}')

    best_train = 0
    best_test = 0
    train_times = []
    test_times = []
    for epoch in range(args.epochs):
        avg_time_train = train(args, model, train_loader)
        train_times.append(avg_time_train)
        train_acc, _ = func_test(args, model, train_loader)
        test_acc, avg_time_test = func_test(args, model, test_loader)

        test_times.append(avg_time_test)
        best_train = max(best_train, train_acc)
        best_test = max(best_test, test_acc)

        if tb_writer is not None:
            tb_writer.add_scalars('Accuracy',
                                  {'Train': train_acc,
                                   'Test': test_acc, },
                                  epoch)

        print(
            f'{time.time() - start_time:.4f} Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

    return graph_dataset, pruning_ratio, best_train, best_test, np.mean(train_times), np.mean(test_times)
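
This variant reads its configuration from an argparse-style args object. A hypothetical invocation, with the field names taken from the attribute accesses above and purely illustrative values (prune_dataset presumably reads additional, project-specific pruning settings from args that are not visible here):

from types import SimpleNamespace

# gnn/device/batch_size/epochs are the fields accessed in the function body.
args = SimpleNamespace(gnn="gcn", device="cpu", batch_size=64, epochs=10)
results = tst_classify_networkx_synthetic_tg(args, num_samples=200)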
Example #5
def single_runner_type02(params):
    print(f"{time.time() - start_time:.4f} tst_classify_synthetic")

    print(params)

    random = np.random.RandomState(params["seed"])

    graph_dataset = generate_graphs_dataset(num_samples=params["dataset_params.num_samples"],
                                            num_classes=params["dataset_params.num_classes"],
                                            min_nodes=params["dataset_params.min_nodes"],
                                            max_nodes=params["dataset_params.max_nodes"],
                                            dim_nodes=params["common_params.dim_nodes"],
                                            noise_nodes=params["dataset_params.noise_nodes"],
                                            connectivity_rate=params["dataset_params.connectivity_rate"],
                                            connectivity_rate_noise=params["dataset_params.connectivity_rate_noise"],
                                            noise_remove_node=params["dataset_params.noise_remove_node"],
                                            noise_add_node=params["dataset_params.noise_add_node"],
                                            symmetric_flag=params["dataset_params.symmetric_flag"],
                                            random=random)
    # MinHash parameters
    minhash = MinHash(params["minhash_params.num_minhash_funcs"],
                      random,
                      prime=2147483647)
    print(f"minhash:\n{minhash}")

    # LSH parameters
    lsh = LSH(din=params["common_params.dim_nodes"],
              num_functions=params["lsh_params.lsh_num_funcs"],
              sparsity=params["lsh_params.sparsity"],
              std_of_threshold=params["lsh_params.std_of_threshold"],
              random=random)
    print(f"lsh:\n{lsh}")

    # Prune
    dataset_prune_edges_by_minhash_lsh(graph_dataset, minhash, lsh)

    print(f"{time.time() - start_time:.4f} Finished generating dataset")

    # print("")
    # print(graph_dataset)
    tg_dataset = transform_dataset_to_torch_geometric_dataset(graph_dataset.samples,
                                                              graph_dataset.labels,
                                                              params["dataset_params.num_classes"])
    train_loader = DataLoader(tg_dataset,
                              batch_size=64,
                              shuffle=True)
    test_loader = DataLoader(tg_dataset,
                             batch_size=64,
                             shuffle=False)
    model = GCN(hidden_channels=params["model_params.hidden_channels"],
                in_size=params["common_params.dim_nodes"],
                out_size=params["dataset_params.num_classes"])
    train(model, train_loader)

    test_acc = func_test(model, test_loader)
    print(f'{time.time() - start_time:.4f} Test Acc: {test_acc:.4f}')

    for epoch in range(params["model_params.num_episodes"]):
        train(model, train_loader)
        train_acc = func_test(model, train_loader)
        test_acc = func_test(model, test_loader)
        print(f'{time.time() - start_time:.4f} Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

    results = dict()
    results["test_acc"] = test_acc
    return results
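
The params dict is flat, with dotted key names. An example dict covering every key the function reads; the values here are illustrative, borrowed from the defaults used in the other examples:

params = {
    "seed": 0,
    "dataset_params.num_samples": 1000,
    "dataset_params.num_classes": 2,
    "dataset_params.min_nodes": 10,
    "dataset_params.max_nodes": 10,
    "dataset_params.noise_nodes": 1,
    "dataset_params.connectivity_rate": 0.2,
    "dataset_params.connectivity_rate_noise": 0.05,
    "dataset_params.noise_remove_node": 0.1,
    "dataset_params.noise_add_node": 0.1,
    "dataset_params.symmetric_flag": True,
    "common_params.dim_nodes": 4,
    "minhash_params.num_minhash_funcs": 2,
    "lsh_params.lsh_num_funcs": 2,
    "lsh_params.sparsity": 3,
    "lsh_params.std_of_threshold": 1,
    "model_params.hidden_channels": 60,
    "model_params.num_episodes": 10,
}
results = single_runner_type02(params)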
Example #6
def tst_classify_synthetic(random=np.random.RandomState(0)):
    """
    This tst shows how to
    (1) Generate Dataset
    (2) Do the prunning
    (3) Do classification.
    :return:
    """
    print(f"{time.time() - start_time:.4f} tst_classify_synthetic")

    # (1) Generate synthetic dataset
    # Dataset parameters
    num_samples = 1000
    num_classes = 2
    min_nodes = 10
    max_nodes = 10
    dim_nodes = 4
    noise_nodes = 1
    connectivity_rate = 0.2
    connectivity_rate_noise = 0.05
    symmetric_flag = True
    noise_remove_node = 0.1
    noise_add_node = 0.1

    edge_attr_dim = 4

    graph_dataset = generate_graphs_dataset(
        num_samples=num_samples,
        num_classes=num_classes,
        min_nodes=min_nodes,
        max_nodes=max_nodes,
        dim_nodes=dim_nodes,
        noise_nodes=noise_nodes,
        connectivity_rate=connectivity_rate,
        connectivity_rate_noise=connectivity_rate_noise,
        noise_remove_node=noise_remove_node,
        noise_add_node=noise_add_node,
        symmetric_flag=symmetric_flag,
        random=random)
    # Next, we define the MinHash and LSH functions used for pruning.
    # MinHash parameters
    num_minhash_funcs = 2
    minhash = MinHash(num_minhash_funcs, random, prime=2147483647)
    print(f"minhash:\n{minhash}")

    # LSH parameters
    lsh_num_funcs = 2
    sparsity = 3
    std_of_threshold = 1
    lsh = LSH(dim_nodes,
              num_functions=lsh_num_funcs,
              sparsity=sparsity,
              std_of_threshold=std_of_threshold,
              random=random)
    print(f"lsh:\n{lsh}")

    # (2) Do the pruning
    # Transform the dataset
    tg_dataset = transform_dataset_to_torch_geometric_dataset(
        graph_dataset.samples, graph_dataset.labels)

    # Add random edge_attr
    add_random_gaussian_edge_attr(tg_dataset, edge_attr_dim, random)

    # Keep a copy so we can verify that the pruning actually modified the data.
    original_tg_dataset = copy.deepcopy(tg_dataset)

    # Do the pruning.
    # tg_dataset_prune(tg_dataset, "minhash_lsh", minhash=minhash, lsh=lsh)
    tg_dataset_prune(tg_dataset, "random", p=0.3, random=random)

    for i in range(min(10, len(graph_dataset.samples))):
        print(
            f"{i}\npruned=\n{tg_dataset[i].edge_index}\noriginal=\n{original_tg_dataset[i].edge_index}"
        )

    print(f"{time.time() - start_time:.4f} Finished generating dataset")

    # (3) Do the training.
    train_loader = DataLoader(tg_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(tg_dataset, batch_size=64, shuffle=False)
    model = GCN(hidden_channels=60, in_size=dim_nodes, out_size=num_classes)

    test_acc = func_test(model, test_loader)
    print(f'{time.time() - start_time:.4f} Test Acc: {test_acc:.4f}')

    for epoch in range(10):
        train(model, train_loader)
        train_acc = func_test(model, train_loader)
        test_acc = func_test(model, test_loader)
        print(
            f'{time.time() - start_time:.4f} Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}'
        )
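
For the "random" pruning used above, a minimal sketch of what dropping edges with probability p plausibly looks like for one torch_geometric Data object (the project's tg_dataset_prune may differ in detail, e.g. by dropping symmetric edge pairs together):

import numpy as np
import torch

def random_prune_sketch(data, p, random):
    # Drop each directed edge independently with probability p.
    num_edges = data.edge_index.shape[1]
    keep = torch.from_numpy(random.rand(num_edges) >= p)
    data.edge_index = data.edge_index[:, keep]
    if getattr(data, "edge_attr", None) is not None:
        data.edge_attr = data.edge_attr[keep]  # keep edge attributes aligned
    return data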