Example #1
def main():
    args = arg_parse()

    edge_train_mode = args.mode
    print('edge train mode: {}'.format(edge_train_mode))

    G = nx.read_gpickle(args.data_path)
    print(G.number_of_edges())
    print('Each node has node ID (n_id). Example: ', G.nodes[0])
    print(
        'Each edge has edge ID (id) and categorical label (e_label). Example: ',
        G[0][5871])

    # find the number of edge types
    labels = []
    for u, v, edge_key in G.edges:
        label = G[u][v][edge_key]['e_label']
        if label not in labels:
            labels.append(label)
    # labels are consecutive (0-17)
    num_edge_types = len(labels)

    H = WN_transform(G, num_edge_types)
    # The nodes in the graph have the features: node_feature and node_type (just one node type "n1" here)
    for node in H.nodes(data=True):
        print(node)
        break
    # The edges in the graph have the features: edge_feature and edge_type ("0" - "17" here)
    for edge in H.edges(data=True):
        print(edge)
        break

    hete = HeteroGraph(H)

    dataset = GraphDataset([hete], task='link_pred')
    dataset_train, dataset_val, dataset_test = dataset.split(
        transductive=True, split_ratio=[0.8, 0.1, 0.1])
    train_loader = DataLoader(dataset_train,
                              collate_fn=Batch.collate(),
                              batch_size=1)
    val_loader = DataLoader(dataset_val,
                            collate_fn=Batch.collate(),
                            batch_size=1)
    test_loader = DataLoader(dataset_test,
                             collate_fn=Batch.collate(),
                             batch_size=1)
    dataloaders = {
        'train': train_loader,
        'val': val_loader,
        'test': test_loader
    }

    hidden_size = 32
    model = HeteroNet(hete, hidden_size, 0.2).to(args.device)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=0.001,
                                 weight_decay=5e-4)

    train(model, dataloaders, optimizer, args)
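All the examples on this page share one core pattern: wrap one or more graphs in a deepsnap GraphDataset, split it, and hand each split to a torch DataLoader with Batch.collate() as the collate function. Below is a minimal self-contained sketch of that pattern; the random graph and dummy features are illustrative assumptions, not part of the example above.

import networkx as nx
import torch
from torch.utils.data import DataLoader
from deepsnap.batch import Batch
from deepsnap.dataset import GraphDataset
from deepsnap.graph import Graph

# toy graph with dummy node features (assumption for illustration)
G = nx.gnp_random_graph(50, 0.1)
for node in G.nodes:
    G.nodes[node]['node_feature'] = torch.ones(4)

dataset = GraphDataset([Graph(G)], task='link_pred')
ds_train, ds_val, ds_test = dataset.split(
    transductive=True, split_ratio=[0.8, 0.1, 0.1])
dataloaders = {
    split: DataLoader(ds, collate_fn=Batch.collate(), batch_size=1)
    for split, ds in
    [('train', ds_train), ('val', ds_val), ('test', ds_test)]
}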
Example #2
def run(proc_id, n_gpus, devices):
    # dataset_train, dataset_val and dataset_test are assumed to be defined at
    # module scope; assigning to dataset_train inside the function would make
    # the name local (UnboundLocalError), so the shard gets its own name
    shard_train = torch.split(dataset_train,
                              len(dataset_train) // n_gpus)[proc_id]

    train_loader = DataLoader(shard_train,
                              collate_fn=Batch.collate(),
                              batch_size=16)  # basic data loader
    val_loader = DataLoader(dataset_val,
                            collate_fn=Batch.collate(),
                            batch_size=16)  # basic data loader
    test_loader = DataLoader(dataset_test,
                             collate_fn=Batch.collate(),
                             batch_size=16)  # basic data loader

    dev_id = devices[proc_id]
    torch.cuda.set_device(dev_id)
    if n_gpus > 1:
        # the process group must be initialized before the model is wrapped
        # in DistributedDataParallel
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12346')
        torch.distributed.init_process_group(backend="nccl",
                                             init_method=dist_init_method,
                                             world_size=n_gpus,
                                             rank=proc_id)
    model = Net().to(dev_id)
    model.reset_parameters()
    if n_gpus > 1:
        model = DistributedDataParallel(model,
                                        device_ids=[dev_id],
                                        output_device=dev_id)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=0.01,
                                 weight_decay=5e-3)

    val_max = -math.inf
    best_model = model
    for epoch in range(1, 201):
        train()
        log = 'Epoch: {:03d}, Train: {:.4f}, Val: {:.4f}, Test: {:.4f}'
        train_acc, val_acc, test_acc = test()
        print(log.format(epoch, train_acc, val_acc, test_acc))
        if val_max < val_acc:
            val_max = val_acc
            # best_model = copy.deepcopy(model)

    # model = best_model
    log = 'Best, Train: {:.4f}, Val: {:.4f}, Test: {:.4f}'
    train_acc, val_acc, test_acc = test()
    print(log.format(train_acc, val_acc, test_acc))
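run is written as a per-process entry point: each process takes one shard of the training set and one GPU. It is presumably launched with torch.multiprocessing.spawn, roughly as below (the two-GPU device list is a hypothetical example):

import torch.multiprocessing as mp

if __name__ == '__main__':
    devices = [0, 1]  # hypothetical: two visible GPUs
    n_gpus = len(devices)
    # spawn calls run(proc_id, n_gpus, devices) once per process,
    # passing the process index as the first argument
    mp.spawn(run, args=(n_gpus, devices), nprocs=n_gpus)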
Example #3
    def test_torch_dataloader_collate(self):
        # graph classification example
        pyg_dataset = TUDataset('./enzymes', 'ENZYMES')
        graphs = GraphDataset.pyg_to_graphs(pyg_dataset)
        dataset = GraphDataset(graphs, task="graph")
        train_batch_num = math.ceil(len(dataset) * 0.8 / 32)
        val_batch_num = math.ceil(len(dataset) * 0.1 / 32)
        test_batch_num = math.ceil(len(dataset) * 0.1 / 32)
        datasets = {}
        datasets['train'], datasets['val'], datasets['test'] = \
            dataset.split(transductive=False, split_ratio=[0.8, 0.1, 0.1])
        dataloaders = {
            split: DataLoader(dataset,
                              collate_fn=Batch.collate(),
                              batch_size=32,
                              shuffle=True)
            for split, dataset in datasets.items()
        }
        self.assertEqual(len(dataloaders['train']), train_batch_num)
        self.assertEqual(len(dataloaders['val']), val_batch_num)
        self.assertEqual(len(dataloaders['test']), test_batch_num)
        # every batch except possibly the last one should be full
        for i, data in enumerate(dataloaders['train']):
            if i != len(dataloaders['train']) - 1:
                self.assertEqual(data.num_graphs, 32)
        for i, data in enumerate(dataloaders['val']):
            if i != len(dataloaders['val']) - 1:
                self.assertEqual(data.num_graphs, 32)
        for i, data in enumerate(dataloaders['test']):
            if i != len(dataloaders['test']) - 1:
                self.assertEqual(data.num_graphs, 32)
    def test_hetero_graph_batch(self):
        G = generate_simple_hete_graph()
        hete = HeteroGraph(G)
        hete = HeteroGraph(
            node_feature=hete.node_feature,
            node_label=hete.node_label,
            edge_feature=hete.edge_feature,
            edge_label=hete.edge_label,
            edge_index=hete.edge_index,
            directed=True
        )

        heteGraphDataset = []
        for _ in range(30):
            heteGraphDataset.append(hete.clone())
        dataloader = DataLoader(
            heteGraphDataset,
            collate_fn=Batch.collate(),
            batch_size=3,
            shuffle=True,
        )

        self.assertEqual(len(dataloader), math.ceil(30 / 3))
        for data in dataloader:
            self.assertEqual(data.num_graphs, 3)
Example #5
    def gen_data_loaders(self, batch_size, train=True):
        return [
            TorchDataLoader(self.train if train else self.test,
                            collate_fn=Batch.collate([]),
                            batch_size=batch_size // 2,
                            shuffle=True) for _ in range(3)
        ]
Example #6
def deepsnap_ego(args, pyg_dataset):
    avg_time = 0
    task = "graph"
    for i in range(args.num_runs):
        if args.print_run:
            print("Run {}".format(i + 1))
        graphs = GraphDataset.pyg_to_graphs(pyg_dataset,
                                            verbose=True,
                                            netlib=netlib)
        dataset = GraphDataset(graphs, task=task)
        datasets = {}
        datasets['train'], datasets['val'], datasets['test'] = dataset.split(
            transductive=False, split_ratio=[0.8, 0.1, 0.1], shuffle=False)
        dataloaders = {
            split: DataLoader(dataset,
                              collate_fn=Batch.collate(),
                              batch_size=1,
                              shuffle=False)
            for split, dataset in datasets.items()
        }
        s = time.time()
        for batch in dataloaders['train']:
            batch = batch.apply_transform(ego_nets, update_tensor=True)
        avg_time += (time.time() - s)
    print("DeepSNAP has average time: {}".format(avg_time / args.num_runs))
Example #7
    def test_pyg_to_graphs_global(self):
        import deepsnap
        deepsnap.use(nx)

        pyg_dataset = Planetoid('./planetoid', "Cora")
        graphs = GraphDataset.pyg_to_graphs(pyg_dataset)
        self.assertTrue(isinstance(graphs[0].G, nx.Graph))
        dataset = GraphDataset(graphs, task='node')
        num_nodes = dataset.num_nodes[0]
        node_0 = int(0.8 * num_nodes)
        node_1 = int(0.1 * num_nodes)
        node_2 = num_nodes - node_0 - node_1
        train, val, test = dataset.split()
        self.assertTrue(isinstance(train[0].G, nx.Graph))
        self.assertTrue(isinstance(val[0].G, nx.Graph))
        self.assertTrue(isinstance(test[0].G, nx.Graph))
        self.assertEqual(train[0].node_label_index.shape[0], node_0)
        self.assertEqual(val[0].node_label_index.shape[0], node_1)
        self.assertEqual(test[0].node_label_index.shape[0], node_2)

        train_loader = DataLoader(train,
                                  collate_fn=Batch.collate(),
                                  batch_size=1)
        for batch in train_loader:
            self.assertTrue(isinstance(batch.G[0], nx.Graph))

        deepsnap.use(sx)
        graphs = GraphDataset.pyg_to_graphs(pyg_dataset)
        self.assertTrue(isinstance(graphs[0].G, sx.Graph))
        dataset = GraphDataset(graphs, task='node')
        num_nodes = dataset.num_nodes[0]
        node_0 = int(0.8 * num_nodes)
        node_1 = int(0.1 * num_nodes)
        node_2 = num_nodes - node_0 - node_1
        train, val, test = dataset.split()
        self.assertTrue(isinstance(train[0].G, sx.Graph))
        self.assertTrue(isinstance(val[0].G, sx.Graph))
        self.assertTrue(isinstance(test[0].G, sx.Graph))
        self.assertEqual(train[0].node_label_index.shape[0], node_0)
        self.assertEqual(val[0].node_label_index.shape[0], node_1)
        self.assertEqual(test[0].node_label_index.shape[0], node_2)

        train_loader = DataLoader(train,
                                  collate_fn=Batch.collate(),
                                  batch_size=1)
        for batch in train_loader:
            self.assertTrue(isinstance(batch.G[0], sx.Graph))
Example #8
def main():
    args = arg_parse()

    pyg_dataset = Planetoid('./cora', 'Cora', transform=T.TargetIndegree())
    
    # the input that we assume users have
    edge_train_mode = args.mode
    print('edge train mode: {}'.format(edge_train_mode))

    graphs = GraphDataset.pyg_to_graphs(pyg_dataset, tensor_backend=True)
    if args.multigraph:
        graphs = [copy.deepcopy(graphs[0]) for _ in range(10)]

    dataset = GraphDataset(graphs, 
                           task='link_pred', 
                           edge_message_ratio=args.edge_message_ratio, 
                           edge_train_mode=edge_train_mode)
    print('Initial dataset: {}'.format(dataset))

    # split dataset
    datasets = {}
    datasets['train'], datasets['val'], datasets['test'] = dataset.split(
        transductive=not args.multigraph, split_ratio=[0.85, 0.05, 0.1])

    print('After split:')
    print('Train message-passing graph: {} nodes; {} edges.'.format(
            datasets['train'][0].num_nodes,
            datasets['train'][0].num_edges))
    print('Val message-passing graph: {} nodes; {} edges.'.format(
            datasets['val'][0].num_nodes,
            datasets['val'][0].num_edges))
    print('Test message-passing graph: {} nodes; {} edges.'.format(
            datasets['test'][0].num_nodes,
            datasets['test'][0].num_edges))


    # node feature dimension
    input_dim = datasets['train'].num_node_features
    # link prediction needs 2 classes (0, 1)
    num_classes = datasets['train'].num_edge_labels

    model = Net(input_dim, num_classes, args).to(args.device)
    #optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-3)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs)
    follow_batch = [] # e.g., follow_batch = ['edge_index']

    dataloaders = {split: DataLoader(
            ds, collate_fn=Batch.collate(follow_batch), 
            batch_size=args.batch_size, shuffle=(split=='train'))
            for split, ds in datasets.items()}
    print('Graphs after split: ')
    for key, dataloader in dataloaders.items():
        for batch in dataloader:
            print(key, ': ', batch)

    train(model, dataloaders, optimizer, args, scheduler=scheduler)
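The empty follow_batch list handed to Batch.collate(follow_batch) presumably mirrors PyTorch Geometric's follow_batch argument: each attribute named there gets its own per-graph assignment vector in the collated batch. Every example on this page leaves it empty, so only the default node-level batch vector is created.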
Example #9
def create_loader(datasets):
    loader_train = DataLoader(datasets[0],
                              collate_fn=Batch.collate(),
                              batch_size=cfg.train.batch_size,
                              shuffle=True,
                              num_workers=cfg.num_workers,
                              pin_memory=False)

    loaders = [loader_train]
    for i in range(1, len(datasets)):
        loaders.append(
            DataLoader(datasets[i],
                       collate_fn=Batch.collate(),
                       batch_size=cfg.train.batch_size,
                       shuffle=False,
                       num_workers=cfg.num_workers,
                       pin_memory=False))

    return loaders
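create_loader assumes a module-level cfg (GraphGym-style configuration) providing cfg.train.batch_size and cfg.num_workers; by convention the first dataset is the training split and is the only one shuffled. A hypothetical call: loaders = create_loader([dataset_train, dataset_val, dataset_test]).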
Example #10
    def test_hetero_graph_batch(self):
        G = generate_simple_hete_graph()
        hete = HeteroGraph(G)

        heteGraphDataset = []
        for i in range(30):
            heteGraphDataset.append(hete.clone())
        dataloader = DataLoader(heteGraphDataset,
                                collate_fn=Batch.collate(),
                                batch_size=3,
                                shuffle=True)

        self.assertEqual(len(dataloader), math.ceil(30 / 3))
        for data in dataloader:
            self.assertEqual(data.num_graphs, 3)
Example #11
    def gen_data_loaders(self,
                         size,
                         batch_size,
                         train=True,
                         use_distributed_sampling=False):
        loaders = []
        for i in range(2):
            dataset = combined_syn.get_dataset(
                "graph", size // 2,
                np.arange(self.min_size + 1, self.max_size + 1))
            sampler = torch.utils.data.distributed.DistributedSampler(
                dataset, num_replicas=hvd.size(), rank=hvd.rank()) if \
                use_distributed_sampling else None
            # both loaders use half of the overall batch size
            loaders.append(
                TorchDataLoader(dataset,
                                collate_fn=Batch.collate([]),
                                batch_size=batch_size // 2,
                                sampler=sampler,
                                shuffle=False))
        loaders.append([None] * (size // batch_size))
        return loaders
Example #12
    def gen_data_loaders(self,
                         size,
                         batch_size,
                         train=True,
                         use_distributed_sampling=False):
        loaders = []
        for i in range(2):
            neighs = []
            for j in range(size // 2):
                graph, neigh = utils.sample_neigh(
                    self.train_set if train else self.test_set,
                    random.randint(self.min_size, self.max_size))
                neighs.append(graph.subgraph(neigh))
            dataset = GraphDataset(GraphDataset.list_to_graphs(neighs))
            # both loaders use half of the overall batch size
            loaders.append(
                TorchDataLoader(dataset,
                                collate_fn=Batch.collate([]),
                                batch_size=batch_size // 2,
                                sampler=None,
                                shuffle=False))
        loaders.append([None] * (size // batch_size))
        return loaders
Example #13
name = 'BioSNAP-Function-Function'
f = datadir + 'minerff.tsv'
f2 = datadir + 'minerf.tsv'
d = readFilePD(f, ['relation'])
d2 = readFilePD(f2, ['namespace'])
# label node feature as 'node feature'
nxg = pdToNx2(d, d2, 'GO_id0', 'GO_id2', 'relation', 'GO_id1', 'namespace')
dg = deepsnap.graph.Graph(nxg)
graphs = [dg]  # GraphDataset expects a list of graphs

dataset = GraphDataset(graphs, task='node')  # node, edge, link_pred, graph
dataset_train, dataset_val, dataset_test = dataset.split(
    transductive=True,
    split_ratio=[0.8, 0.1, 0.1])  # transductive split, inductive split
train_loader = DataLoader(dataset_train,
                          collate_fn=Batch.collate(),
                          batch_size=16)  # basic data loader
val_loader = DataLoader(dataset_val, collate_fn=Batch.collate(),
                        batch_size=16)  # basic data loader
test_loader = DataLoader(dataset_test,
                         collate_fn=Batch.collate(),
                         batch_size=16)  # basic data loader


class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # self.conv1 = GCNConv(dataset.num_node_features, 1)
        # self.conv2 = GCNConv(16, dataset.num_node_labels)
        self.conv1 = SplineConv(1, 16, dim=1, kernel_size=2)
        self.conv2 = SplineConv(16, 4, dim=1, kernel_size=2)
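The class is truncated before its forward method. A plausible completion, assuming the deepsnap batch attributes (node_feature, edge_index, edge_feature) used elsewhere on this page and a standard two-layer SplineConv pipeline, is sketched below:

import torch
import torch.nn.functional as F
from torch_geometric.nn import SplineConv

class Net(torch.nn.Module):  # hypothetical completion of the class above
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = SplineConv(1, 16, dim=1, kernel_size=2)
        self.conv2 = SplineConv(16, 4, dim=1, kernel_size=2)

    def forward(self, batch):
        # SplineConv consumes node features, connectivity and
        # 1-dimensional edge pseudo-coordinates
        x, edge_index, edge_attr = (batch.node_feature, batch.edge_index,
                                    batch.edge_feature)
        x = F.elu(self.conv1(x, edge_index, edge_attr))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index, edge_attr)
        return F.log_softmax(x, dim=1)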
Example #14
    if accs[1] > best_val:
        best_val = accs[1]
        best_model = copy.deepcopy(model)
    return accs

if __name__ == "__main__":
    cora_pyg = Planetoid('./cora', 'Cora')
    citeseer_pyg = Planetoid('./citeseer', 'CiteSeer')
    G = concatenate_citeseer_cora(cora_pyg[0], citeseer_pyg[0])
    hete = HeteroGraph(G)
    print("Heterogeneous graph {} nodes, {} edges".format(hete.num_nodes, hete.num_edges))

    dataset = GraphDataset([hete], task='node')
    dataset_train, dataset_val, dataset_test = dataset.split(
        transductive=True, split_ratio=[0.8, 0.1, 0.1])
    train_loader = DataLoader(dataset_train, collate_fn=Batch.collate(),
                              batch_size=16)
    val_loader = DataLoader(dataset_val, collate_fn=Batch.collate(),
                            batch_size=16)
    test_loader = DataLoader(dataset_test, collate_fn=Batch.collate(),
                             batch_size=16)
    loaders = [train_loader, val_loader, test_loader]

    hidden_size = 32
    model = HeteroNet(hete, hidden_size, 0.5).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-3)
    num_epochs = 100

    train_accs, valid_accs, test_accs = [], [], []

    for epoch in range(num_epochs):
Example #15
def main():
    args = arg_parse()

    edge_train_mode = args.mode
    print('edge train mode: {}'.format(edge_train_mode))

    G = nx.read_gpickle(args.data_path)
    print(G.number_of_edges())
    print('Each node has node ID (n_id). Example: ', G.nodes[0])
    print(
        'Each edge has edge ID (id) and categorical label (e_label). Example: ',
        G[0][5871])

    # find the number of edge types
    labels = []
    for u, v, edge_key in G.edges:
        label = G[u][v][edge_key]['e_label']
        if label not in labels:
            labels.append(label)
    # labels are consecutive (0-17)
    num_edge_types = len(labels)

    H = WN_transform(G, num_edge_types)
    # The nodes in the graph have the features: node_feature and node_type (just one node type "n1" here)
    for node in H.nodes(data=True):
        print(node)
        break
    # The edges in the graph have the features: edge_feature and edge_type ("0" - "17" here)
    for edge in H.edges(data=True):
        print(edge)
        break

    hetero = HeteroGraph(H)
    hetero = HeteroGraph(edge_index=hetero.edge_index,
                         edge_feature=hetero.edge_feature,
                         node_feature=hetero.node_feature,
                         directed=hetero.is_directed())

    if edge_train_mode == "disjoint":
        dataset = GraphDataset([hetero],
                               task='link_pred',
                               edge_train_mode=edge_train_mode,
                               edge_message_ratio=args.edge_message_ratio)
    else:
        dataset = GraphDataset(
            [hetero],
            task='link_pred',
            edge_train_mode=edge_train_mode,
        )

    dataset_train, dataset_val, dataset_test = dataset.split(
        transductive=True, split_ratio=[0.8, 0.1, 0.1])
    train_loader = DataLoader(dataset_train,
                              collate_fn=Batch.collate(),
                              batch_size=1)
    val_loader = DataLoader(dataset_val,
                            collate_fn=Batch.collate(),
                            batch_size=1)
    test_loader = DataLoader(dataset_test,
                             collate_fn=Batch.collate(),
                             batch_size=1)
    dataloaders = {
        'train': train_loader,
        'val': val_loader,
        'test': test_loader
    }

    hidden_size = args.hidden_dim
    conv1, conv2 = generate_2convs_link_pred_layers(hetero, HeteroSAGEConv,
                                                    hidden_size)
    model = HeteroGNN(conv1, conv2, hetero, hidden_size).to(args.device)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    t_accu, v_accu, e_accu = train(model, dataloaders, optimizer, args)
Example #16
    # The edges in the graph have the features: edge_type ("cora_edge" or "citeseer_edge")
    print("The edges in the concatenated heterogeneous graph have the following features:")
    for edge in G.edges(data=True):
        print(edge[2])
        break

    hete = HeteroGraph(G)
    print(f"Heterogeneous graph {hete.num_nodes()} nodes, {hete.num_edges()} edges")

    dataset = GraphDataset([hete], task='node')
    dataset_train, dataset_val, dataset_test = dataset.split(
        transductive=True,
        split_ratio=[0.8, 0.1, 0.1]
    )
    train_loader = DataLoader(
        dataset_train, collate_fn=Batch.collate(), batch_size=16
    )
    val_loader = DataLoader(
        dataset_val, collate_fn=Batch.collate(), batch_size=16
    )
    test_loader = DataLoader(
        dataset_test, collate_fn=Batch.collate(), batch_size=16
    )
    loaders = [train_loader, val_loader, test_loader]

    hidden_size = 32
    model = HeteroNet(hete, hidden_size, 0.5).to(device)
    optimizer = torch.optim.Adam(
        model.parameters(), lr=0.01, weight_decay=5e-3
    )
    num_epochs = 100
Example #17
                                            netlib=netlib)
        dataset = GraphDataset(graphs,
                               task='node')  # node, edge, link_pred, graph
        dataset_train, dataset_val, dataset_test = dataset.split(
            transductive=True,
            split_ratio=[0.8, 0.1, 0.1])  # transductive split, inductive split
    else:
        graphs_train, graphs_val, graphs_test = \
            GraphDataset.pyg_to_graphs(pyg_dataset, verbose=True,
                                       fixed_split=True, netlib=netlib)

        dataset_train, dataset_val, dataset_test = \
            GraphDataset(graphs_train, task='node'), \
            GraphDataset(graphs_val, task='node'), \
            GraphDataset(graphs_test, task='node')

    train_loader = DataLoader(dataset_train,
                              collate_fn=Batch.collate(),
                              batch_size=16)  # basic data loader
    val_loader = DataLoader(dataset_val,
                            collate_fn=Batch.collate(),
                            batch_size=16)  # basic data loader
    test_loader = DataLoader(dataset_test,
                             collate_fn=Batch.collate(),
                             batch_size=16)  # basic data loader

    num_node_features = dataset_train.num_node_features
    num_classes = dataset_train.num_node_labels

    train(train_loader, val_loader, test_loader, args, num_node_features,
          num_classes, args.device)
Example #18
def main():
    writer = SummaryWriter()
    args = arg_parse()

    edge_train_mode = args.mode
    print('edge train mode: {}'.format(edge_train_mode))

    ppi_graph = read_ppi_data(args.ppi_path)

    mode = 'mixed'
    if mode == 'ppi':
        message_passing_graph = ppi_graph
        cmap_graph, knockout_nodes = read_cmap_data(args.data_path)
    elif mode == 'mixed':
        message_passing_graph, knockout_nodes = (
            read_cmap_data(args.data_path, ppi_graph)
        )

    print('Each node has gene ID. Example: ', message_passing_graph.nodes['ADPGK'])
    print('Each edge has a DE direction. Example: ', message_passing_graph['ADPGK']['IL1B'])
    print('Total num edges: ', message_passing_graph.number_of_edges())

    # disjoint edge label
    disjoint_split_ratio = 0.1
    val_ratio = 0.1
    disjoint_edge_label_index = []
    val_edges = []

    # training supervision edges
    train_edges = []

    def cmap_out_edges(u):
        # all outgoing CMap edges of u (edge_type == 1); CMap is not a multigraph
        return [
            (u, v, edge_key)
            for v in message_passing_graph.successors(u)
            for edge_key in message_passing_graph[u][v]
            if message_passing_graph[u][v][edge_key]['edge_type'] == 1
        ]

    for u in knockout_nodes:
        rand_num = np.random.rand()
        if rand_num < disjoint_split_ratio:
            # add all CMap edges of u to the supervision (edge label) index;
            # they also remain training edges
            disjoint_edge_label_index.extend(cmap_out_edges(u))
            train_edges.extend(cmap_out_edges(u))
        elif rand_num < disjoint_split_ratio + val_ratio:
            val_edges.extend(cmap_out_edges(u))
        else:
            train_edges.extend(cmap_out_edges(u))
    # add default node types for message_passing_graph
    for node in message_passing_graph.nodes:
        message_passing_graph.nodes[node]['node_type'] = 0

    print('Num edges to predict: ', len(disjoint_edge_label_index))
    print('Num edges in val: ', len(val_edges))
    print('Num edges in train: ', len(train_edges))

    graph = HeteroGraph(
        message_passing_graph,
        custom={
            "general_splits": [
                train_edges,
                val_edges
            ],
            "disjoint_split": disjoint_edge_label_index,
            "task": "link_pred"
        }
    )

    graphs = [graph]
    graphDataset = GraphDataset(
        graphs,
        task="link_pred",
        edge_train_mode="disjoint"
    )

    # Transform dataset
    # DE direction (currently using a homogeneous graph)
    num_edge_types = 2

    graphDataset = graphDataset.apply_transform(
        cmap_transform, num_edge_types=num_edge_types, deep_copy=False
    )
    print('Number of node features: ', graphDataset.num_node_features())

    # split dataset
    dataset = {}
    dataset['train'], dataset['val'] = graphDataset.split(transductive=True)

    # sanity check
    print(f"dataset['train'][0].edge_label_index.keys(): {dataset['train'][0].edge_label_index.keys()}")
    print(f"dataset['train'][0].edge_label_index[(0, 1, 0)].shape[1]: {dataset['train'][0].edge_label_index[(0, 1, 0)].shape[1]}")
    print(f"dataset['val'][0].edge_label_index.keys(): {dataset['val'][0].edge_label_index.keys()}")
    print(f"dataset['val'][0].edge_label_index[(0, 1, 0)].shape[1]: {dataset['val'][0].edge_label_index[(0, 1, 0)].shape[1]}")
    print(f"len(list(dataset['train'][0].G.edges)): {len(list(dataset['train'][0].G.edges))}")
    print(f"len(list(dataset['val'][0].G.edges)): {len(list(dataset['val'][0].G.edges))}")
    print(f"list(dataset['train'][0].G.edges)[:10]: {list(dataset['train'][0].G.edges)[:10]}")
    print(f"list(dataset['val'][0].G.edges)[:10]: {list(dataset['val'][0].G.edges)[:10]}")


    # node feature dimension
    input_dim = dataset['train'].num_node_features()
    edge_feat_dim = dataset['train'].num_edge_features()
    num_classes = dataset['train'].num_edge_labels()
    print(
        'Node feature dim: {}; edge feature dim: {}; num classes: {}.'.format(
            input_dim, edge_feat_dim, num_classes
        )
    )

    # relation type is both used for edge features and edge labels
    model = Net(input_dim, edge_feat_dim, num_classes, args).to(args.device)
    optimizer = torch.optim.Adam(
        model.parameters(), lr=0.001, weight_decay=5e-3
    )
    follow_batch = []  # e.g., follow_batch = ['edge_index']

    dataloaders = {
        split: DataLoader(
            ds, collate_fn=Batch.collate(follow_batch),
            batch_size=1, shuffle=(split == 'train')
        )
        for split, ds in dataset.items()
    }
    print('Graphs after split: ')
    for key, dataloader in dataloaders.items():
        for batch in dataloader:
            print(key, ': ', batch)

    train(model, dataloaders, optimizer, args, writer=writer)
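Unlike the other examples, which sample splits with split_ratio, this one supplies explicit edge lists through HeteroGraph's custom argument: general_splits holds the train and val edges and disjoint_split the supervision edges for disjoint training, so graphDataset.split(transductive=True) reuses those lists rather than sampling its own.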
Example #19
def main():
    args = arg_parse()

    name = 'BioSNAP-FF'
    f = 'minerff.tsv'
    f2 = 'minerf.tsv'
    d = readFilePD(f, ['relation'])
    d2 = readFilePD(f2, ['namespace'])
    nxg = pdToNx3(d, d2, 'GO_id0', 'GO_id2', 'relation', 'GO_id1', 'namespace')
    dg = Graph(nxg)
    graphs = [dg]

    # the input that we assume users have
    edge_train_mode = args.mode
    print('edge train mode: {}'.format(edge_train_mode))

    if args.multigraph:
        graphs = [copy.deepcopy(graphs[0]) for _ in range(10)]

    dataset = GraphDataset(graphs,
                           task='link_pred',
                           edge_message_ratio=args.edge_message_ratio,
                           edge_train_mode=edge_train_mode)
    print('Initial dataset: {}'.format(dataset))

    # split dataset
    datasets = {}
    datasets['train'], datasets['val'], datasets['test'] = dataset.split(
        transductive=not args.multigraph, split_ratio=[0.85, 0.05, 0.1])

    print('after split')
    print('Train message-passing graph: {} nodes; {} edges.'.format(
        datasets['train'][0].G.number_of_nodes(),
        datasets['train'][0].G.number_of_edges()))
    print('Val message-passing graph: {} nodes; {} edges.'.format(
        datasets['val'][0].G.number_of_nodes(),
        datasets['val'][0].G.number_of_edges()))
    print('Test message-passing graph: {} nodes; {} edges.'.format(
        datasets['test'][0].G.number_of_nodes(),
        datasets['test'][0].G.number_of_edges()))

    # node feature dimension (hard-coded; equals datasets['train'].num_node_features)
    input_dim = 47410
    # link prediction needs 2 classes (0, 1)
    num_classes = datasets['train'].num_edge_labels

    model = Net(input_dim, num_classes, args).to(args.device)
    #optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-3)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.1,
                                momentum=0.9,
                                weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                           T_max=args.epochs)
    follow_batch = []  # e.g., follow_batch = ['edge_index']

    dataloaders = {
        split: DataLoader(ds,
                          collate_fn=Batch.collate(follow_batch),
                          batch_size=args.batch_size,
                          shuffle=(split == 'train'))
        for split, ds in datasets.items()
    }
    print('Graphs after split: ')
    for key, dataloader in dataloaders.items():
        for batch in dataloader:
            print(key, ': ', batch)

    train(model, dataloaders, optimizer, args, scheduler=scheduler)
Example #20
def main():
    args = arg_parse()

    edge_train_mode = args.mode
    print('edge train mode: {}'.format(edge_train_mode))

    WN_graph = nx.read_gpickle(args.data_path)
    print('Each node has node ID (n_id). Example: ', WN_graph.nodes[0])
    print(
        'Each edge has edge ID (id) and categorical label (e_label). Example: ',
        WN_graph[0][5871])

    # Since both feature and label are relation types,
    # Only the disjoint mode would make sense
    dataset = GraphDataset(
        [WN_graph],
        task='link_pred',
        edge_train_mode=edge_train_mode,
        edge_message_ratio=args.edge_message_ratio,
        edge_negative_sampling_ratio=args.neg_sampling_ratio)

    # find the number of edge types
    labels = []
    for u, v, edge_key in WN_graph.edges:
        label = WN_graph[u][v][edge_key]['e_label']
        if label not in labels:
            labels.append(label)
    # labels are consecutive (0-17)
    num_edge_types = len(labels)

    print('Pre-transform: ', dataset[0])
    dataset = dataset.apply_transform(WN_transform,
                                      num_edge_types=num_edge_types,
                                      deep_copy=False)
    print('Post-transform: ', dataset[0])
    print('Initial data: {} nodes; {} edges.'.format(
        dataset[0].G.number_of_nodes(), dataset[0].G.number_of_edges()))
    print('Number of node features: {}'.format(dataset.num_node_features))

    # split dataset
    datasets = {}
    datasets['train'], datasets['val'], datasets['test'] = dataset.split(
        transductive=True, split_ratio=[0.8, 0.1, 0.1])

    print('After split:')
    print('Train message-passing graph: {} nodes; {} edges.'.format(
        datasets['train'][0].G.number_of_nodes(),
        datasets['train'][0].G.number_of_edges()))
    print('Val message-passing graph: {} nodes; {} edges.'.format(
        datasets['val'][0].G.number_of_nodes(),
        datasets['val'][0].G.number_of_edges()))
    print('Test message-passing graph: {} nodes; {} edges.'.format(
        datasets['test'][0].G.number_of_nodes(),
        datasets['test'][0].G.number_of_edges()))

    # node feature dimension
    input_dim = datasets['train'].num_node_features
    edge_feat_dim = datasets['train'].num_edge_features
    num_classes = datasets['train'].num_edge_labels
    print(
        'Node feature dim: {}; edge feature dim: {}; num classes: {}.'.format(
            input_dim, edge_feat_dim, num_classes))

    # relation type is both used for edge features and edge labels
    model = Net(input_dim, edge_feat_dim, num_classes, args).to(args.device)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=0.001,
                                 weight_decay=5e-3)
    follow_batch = []  # e.g., follow_batch = ['edge_index']

    dataloaders = {
        split: DataLoader(ds,
                          collate_fn=Batch.collate(follow_batch),
                          batch_size=1,
                          shuffle=(split == 'train'))
        for split, ds in datasets.items()
    }
    print('Graphs after split: ')
    for key, dataloader in dataloaders.items():
        for batch in dataloader:
            print(key, ': ', batch)

    train(model, dataloaders, optimizer, args)
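WN_transform comes from the accompanying WordNet example. A rough sketch, assuming it gives every node a constant feature and every edge a one-hot edge_feature derived from its categorical e_label (consistent with the comments in Example #1), could be:

import torch

def WN_transform(graph, num_edge_types, input_dim=5):
    # hypothetical sketch: apply_transform hands each deepsnap Graph to
    # this function; we annotate its backend networkx multigraph in place
    G = graph.G
    for node in G.nodes():
        G.nodes[node]['node_feature'] = torch.ones(input_dim)
    for u, v, edge_key in G.edges:
        label = G[u][v][edge_key]['e_label']
        feature = torch.zeros(num_edge_types)
        feature[label] = 1.0
        G[u][v][edge_key]['edge_feature'] = feature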
Example #21
        print("Use SnapX as the backend network library.")
    else:
        raise ValueError("{} network library is not supported.".format(args.netlib))

    args.netlib = netlib

    graphs = GraphDataset.pyg_to_graphs(pyg_dataset, netlib=args.netlib)

    dataset = GraphDataset(graphs, task="graph")
    datasets = {}
    datasets['train'], datasets['val'], datasets['test'] = dataset.split(
        transductive=False, split_ratio=[0.8, 0.1, 0.1])

    if args.transform_dataset is not None:
        trans_func = get_transform(args.transform_dataset)
        for _, dataset in datasets.items():
            dataset.apply_transform(trans_func, radius=args.radius, netlib=args.netlib)

    dataloaders = {
        split: DataLoader(
            dataset, collate_fn=Batch.collate(), 
            batch_size=args.batch_size, shuffle=True
        ) for split, dataset in datasets.items()
    }

    num_classes = datasets['train'].num_graph_labels
    num_node_features = datasets['train'].num_node_features

    train(dataloaders['train'], dataloaders['val'], dataloaders['test'], 
            args, num_node_features, num_classes, args.device)