def test_dataset_property(self):
    """Check GraphDataset's label/feature count properties across tasks
    for tensor-backend graphs."""
    # Build a tensor-backend Graph directly from the fixture's tensors.
    _, x, y, edge_x, edge_y, edge_index, graph_x, graph_y = (
        simple_networkx_graph())
    G = Graph(node_feature=x, node_label=y, edge_index=edge_index,
              edge_feature=edge_x, edge_label=edge_y,
              graph_feature=graph_x, graph_label=graph_y, directed=True)
    # Second graph identical except for its graph label.
    H = deepcopy(G)
    H.graph_label = torch.tensor([1])
    graphs = [G, H]
    dataset = GraphDataset(graphs)
    self.assertEqual(dataset.num_node_labels, 5)
    self.assertEqual(dataset.num_node_features, 2)
    self.assertEqual(dataset.num_edge_labels, 4)
    self.assertEqual(dataset.num_edge_features, 2)
    self.assertEqual(dataset.num_graph_labels, 1)
    self.assertEqual(dataset.num_graph_features, 2)
    self.assertEqual(dataset.num_labels, 5)  # node task
    # num_labels follows the dataset's configured task.
    dataset = GraphDataset(graphs, task="edge")
    self.assertEqual(dataset.num_labels, 4)
    dataset = GraphDataset(graphs, task="link_pred")
    self.assertEqual(dataset.num_labels, 5)
    dataset = GraphDataset(graphs, task="graph")
    self.assertEqual(dataset.num_labels, 1)
def test_torch_dataloader_collate(self):
    """Graph classification: 80/10/10 inductive split of ENZYMES, batched
    through the torch DataLoader with Batch.collate().

    Checks batch counts per split and that every non-final batch is full.
    """
    pyg_dataset = TUDataset('./enzymes', 'ENZYMES')
    graphs = GraphDataset.pyg_to_graphs(pyg_dataset)
    dataset = GraphDataset(graphs, task="graph")
    # Expected number of batches per split at batch_size=32.
    train_batch_num = math.ceil(len(dataset) * 0.8 / 32)
    val_batch_num = math.ceil(len(dataset) * 0.1 / 32)
    test_batch_num = math.ceil(len(dataset) * 0.1 / 32)
    datasets = {}
    datasets['train'], datasets['val'], datasets['test'] = \
        dataset.split(transductive=False, split_ratio=[0.8, 0.1, 0.1])
    dataloaders = {
        split: DataLoader(dataset, collate_fn=Batch.collate(),
                          batch_size=32, shuffle=True)
        for split, dataset in datasets.items()
    }
    self.assertEqual(len(dataloaders['train']), train_batch_num)
    # Fixed: the original compared val against test_batch_num and test
    # against val_batch_num (harmless only because both ratios are 0.1).
    self.assertEqual(len(dataloaders['val']), val_batch_num)
    self.assertEqual(len(dataloaders['test']), test_batch_num)
    # Every batch except possibly the last must contain exactly 32 graphs.
    for i, data in enumerate(dataloaders['train']):
        if i != len(dataloaders['train']) - 1:
            self.assertEqual(data.num_graphs, 32)
    for i, data in enumerate(dataloaders['val']):
        if i != len(dataloaders['val']) - 1:
            self.assertEqual(data.num_graphs, 32)
    for i, data in enumerate(dataloaders['test']):
        if i != len(dataloaders['test']) - 1:
            self.assertEqual(data.num_graphs, 32)
def test_dataset_property(self):
    """Networkx-backend variant: attach attributes via Graph.add_*_attr
    and verify the dataset's label/feature count properties."""
    G, x, y, edge_x, edge_y, edge_index, graph_x, graph_y = (
        simple_networkx_graph()
    )
    Graph.add_edge_attr(G, "edge_feature", edge_x)
    Graph.add_edge_attr(G, "edge_label", edge_y)
    Graph.add_node_attr(G, "node_feature", x)
    Graph.add_node_attr(G, "node_label", y)
    Graph.add_graph_attr(G, "graph_feature", graph_x)
    Graph.add_graph_attr(G, "graph_label", graph_y)
    # Second graph identical except for its graph label.
    H = G.copy()
    Graph.add_graph_attr(H, "graph_label", torch.tensor([1]))
    graphs = GraphDataset.list_to_graphs([G, H])
    dataset = GraphDataset(graphs)
    self.assertEqual(dataset.num_node_labels, 5)
    self.assertEqual(dataset.num_node_features, 2)
    self.assertEqual(dataset.num_edge_labels, 4)
    self.assertEqual(dataset.num_edge_features, 2)
    self.assertEqual(dataset.num_graph_labels, 2)
    self.assertEqual(dataset.num_graph_features, 2)
    self.assertEqual(dataset.num_labels, 5)  # node task
    # num_labels follows the dataset's configured task.
    dataset = GraphDataset(graphs, task="edge")
    self.assertEqual(dataset.num_labels, 4)
    dataset = GraphDataset(graphs, task="link_pred")
    self.assertEqual(dataset.num_labels, 4)
    dataset = GraphDataset(graphs, task="graph")
    self.assertEqual(dataset.num_labels, 2)
def test_resample_disjoint_heterogeneous(self):
    """Disjoint link_pred on a heterogeneous graph: with
    resample_disjoint_period=1, two reads of the same training element
    must still agree on per-message-type label sizes."""
    G = generate_dense_hete_dataset()
    hete = HeteroGraph(G)
    # Rebuild a tensor-backend HeteroGraph from the extracted tensors.
    hete = HeteroGraph(node_feature=hete.node_feature,
                       node_label=hete.node_label,
                       edge_feature=hete.edge_feature,
                       edge_label=hete.edge_label,
                       edge_index=hete.edge_index, directed=True)
    graphs = [hete]
    dataset = GraphDataset(graphs, task="link_pred",
                           edge_train_mode="disjoint",
                           edge_message_ratio=0.8,
                           resample_disjoint=True,
                           resample_disjoint_period=1)
    dataset_train, _, _ = dataset.split(split_ratio=[0.5, 0.2, 0.3])
    # Index the same element twice; each access may resample objectives.
    graph_train_first = dataset_train[0]
    graph_train_second = dataset_train[0]
    for message_type in graph_train_first.edge_index:
        self.assertEqual(
            graph_train_first.edge_label_index[message_type].shape[1],
            graph_train_second.edge_label_index[message_type].shape[1])
        self.assertEqual(graph_train_first.edge_label[message_type].shape,
                         graph_train_second.edge_label[message_type].shape)
def train(rank, pygds, args, num_node_features, num_classes):
    """Per-process distributed training entry point.

    rank: this process's index; also used as its CUDA device id.
    pygds: the shared PyG dataset; each rank keeps only its shard.
    """
    # Pick the model class from the CLI flags.
    if args.skip is not None:
        model_cls = skip_models.SkipLastGNN
    elif args.model == "GIN":
        model_cls = GIN
    else:
        model_cls = GNN
    # Fixed: `device` was read before it was assigned. Each rank trains
    # on the device matching its index.
    device = rank
    model = model_cls(num_node_features, args.hidden_dim, num_classes,
                      args).to(device)
    opt = build_optimizer(args, model.parameters())
    # DISTRIBUTED TRAINING - can be replaced with 1 call to model_parallelize()
    world_size = torch.cuda.device_count()
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # Fixed: the body referenced an undefined `pyg_dataset` while the
    # parameter is named `pygds`; bind it explicitly.
    pyg_dataset = pygds
    # Keep only this rank's shard of the data.
    pyg_dataset[0] = pyg_dataset[0].split(
        pyg_dataset[0].size(0) // world_size)[rank]
    # NOTE(review): no dist.init_process_group() is visible here despite
    # the MASTER_ADDR/PORT setup — confirm it happens in the caller.
    model = DistributedDataParallel(model)
    graphs = GraphDataset.pyg_to_graphs(pyg_dataset)
    dataset = GraphDataset(graphs, task="graph")
    datasets = {}
    datasets['train'], datasets['val'], datasets['test'] = dataset.split(
        transductive=False, split_ratio=[0.8, 0.1, 0.1])
def test_resample_disjoint(self):
    """With resample_disjoint_period=1, two reads of the same training
    graph must agree on edge_label_index width and edge_label content."""
    cora = Planetoid("./cora", "Cora")
    base = GraphDataset.pyg_to_graphs(cora)[0]
    # Rebuild as a tensor-backend Graph from the converted tensors.
    tensor_graph = Graph(node_label=base.node_label,
                         node_feature=base.node_feature,
                         edge_index=base.edge_index,
                         edge_feature=base.edge_feature,
                         directed=False)
    dataset = GraphDataset([tensor_graph],
                           task="link_pred",
                           edge_train_mode="disjoint",
                           edge_message_ratio=0.8,
                           resample_disjoint=True,
                           resample_disjoint_period=1)
    train_split, _, _ = dataset.split(split_ratio=[0.5, 0.2, 0.3])
    # Each indexing of the split may trigger a disjoint resample.
    first_read = train_split[0]
    second_read = train_split[0]
    self.assertEqual(first_read.edge_label_index.shape[1],
                     second_read.edge_label_index.shape[1])
    self.assertTrue(torch.equal(first_read.edge_label,
                                second_read.edge_label))
def load_dataset(format, name, dataset_dir):
    """Custom loader for the 'NetlistOmitted' format.

    Returns None for any other format so the loader registry falls
    through to the next registered loader.
    """
    if format != 'NetlistOmitted':
        return None
    dataset_dir = '{}/{}'.format(dataset_dir, name)
    netlists = find_netlists(dataset_dir)
    # cfg.dataset.mean doubles as an on/off flag and a .npy path —
    # TODO confirm; cfg.dataset.stddev is only read when mean is set.
    if cfg.dataset.mean:
        mean = np.load(cfg.dataset.mean)
        stddev = np.load(cfg.dataset.stddev)
        dataset = datasets.omitted(netlists, min_edge_count=5,
                                   resample=cfg.dataset.resample,
                                   mean=mean, std=stddev)
    else:
        dataset = datasets.omitted(netlists, min_edge_count=5,
                                   resample=cfg.dataset.resample)
    graphs = h.to_deepsnap(dataset)
    dataset = GraphDataset(
        graphs,
        task=cfg.dataset.task,
        edge_train_mode=cfg.dataset.edge_train_mode,
        edge_message_ratio=cfg.dataset.edge_message_ratio,
        edge_negative_sampling_ratio=cfg.dataset.edge_negative_sampling_ratio,
        resample_disjoint=cfg.dataset.resample_disjoint,
        minimum_node_per_graph=0)
    # Override the inferred graph-label count with the known number of
    # component types.
    dataset._num_graph_labels = len(datasets.helpers.component_types)
    return dataset
def deepsnap_ego(args, pyg_dataset):
    """Benchmark the ego-net transform in DeepSNAP, averaged over
    args.num_runs runs; only the transform loop itself is timed."""
    avg_time = 0
    task = "graph"
    for i in range(args.num_runs):
        if args.print_run:
            print("Run {}".format(i + 1))
        graphs = GraphDataset.pyg_to_graphs(pyg_dataset, verbose=True,
                                            netlib=netlib)
        dataset = GraphDataset(graphs, task=task)
        datasets = {}
        datasets['train'], datasets['val'], datasets['test'] = dataset.split(
            transductive=False, split_ratio=[0.8, 0.1, 0.1], shuffle=False)
        dataloaders = {
            split: DataLoader(dataset, collate_fn=Batch.collate(),
                              batch_size=1, shuffle=False)
            for split, dataset in datasets.items()
        }
        # Time only the per-batch ego-net transform.
        s = time.time()
        for batch in dataloaders['train']:
            batch = batch.apply_transform(ego_nets, update_tensor=True)
        avg_time += (time.time() - s)
    print("DeepSNAP has average time: {}".format(avg_time / args.num_runs))
def main():
    """Heterogeneous link prediction on the WordNet graph loaded from a
    gpickle file; builds a HeteroGraph and trains HeteroNet."""
    args = arg_parse()
    edge_train_mode = args.mode
    print('edge train mode: {}'.format(edge_train_mode))
    G = nx.read_gpickle(args.data_path)
    print(G.number_of_edges())
    print('Each node has node ID (n_id). Example: ', G.nodes[0])
    print(
        'Each edge has edge ID (id) and categorical label (e_label). Example: ',
        G[0][5871])
    # Collect the distinct edge labels to count relation types.
    # (Removed the unused `max_label` accumulator.)
    labels = []
    for u, v, edge_key in G.edges:
        label = G[u][v][edge_key]['e_label']
        if label not in labels:
            labels.append(label)
    # labels are consecutive (0-17)
    num_edge_types = len(labels)
    H = WN_transform(G, num_edge_types)
    # The nodes in the graph have the features: node_feature and node_type (just one node type "n1" here)
    for node in H.nodes(data=True):
        print(node)
        break
    # The edges in the graph have the features: edge_feature and edge_type ("0" - "17" here)
    for edge in H.edges(data=True):
        print(edge)
        break
    hete = HeteroGraph(H)
    dataset = GraphDataset([hete], task='link_pred')
    dataset_train, dataset_val, dataset_test = dataset.split(
        transductive=True, split_ratio=[0.8, 0.1, 0.1])
    train_loader = DataLoader(dataset_train, collate_fn=Batch.collate(),
                              batch_size=1)
    val_loader = DataLoader(dataset_val, collate_fn=Batch.collate(),
                            batch_size=1)
    test_loader = DataLoader(dataset_test, collate_fn=Batch.collate(),
                             batch_size=1)
    dataloaders = {
        'train': train_loader,
        'val': val_loader,
        'test': test_loader
    }
    hidden_size = 32
    model = HeteroNet(hete, hidden_size, 0.2).to(args.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001,
                                 weight_decay=5e-4)
    train(model, dataloaders, optimizer, args)
def main():
    """Link prediction on Cora; with --multigraph, trains inductively on
    10 deep copies of the graph instead of one transductive split."""
    args = arg_parse()
    pyg_dataset = Planetoid('./cora', 'Cora', transform=T.TargetIndegree())
    # the input that we assume users have
    edge_train_mode = args.mode
    print('edge train mode: {}'.format(edge_train_mode))
    graphs = GraphDataset.pyg_to_graphs(pyg_dataset, tensor_backend=True)
    if args.multigraph:
        graphs = [copy.deepcopy(graphs[0]) for _ in range(10)]
    dataset = GraphDataset(graphs,
                           task='link_pred',
                           edge_message_ratio=args.edge_message_ratio,
                           edge_train_mode=edge_train_mode)
    print('Initial dataset: {}'.format(dataset))
    # split dataset (transductive only in the single-graph setting)
    datasets = {}
    datasets['train'], datasets['val'], datasets['test'] = dataset.split(
        transductive=not args.multigraph, split_ratio=[0.85, 0.05, 0.1])
    print('after split')
    print('Train message-passing graph: {} nodes; {} edges.'.format(
        datasets['train'][0].num_nodes,
        datasets['train'][0].num_edges))
    print('Val message-passing graph: {} nodes; {} edges.'.format(
        datasets['val'][0].num_nodes,
        datasets['val'][0].num_edges))
    print('Test message-passing graph: {} nodes; {} edges.'.format(
        datasets['test'][0].num_nodes,
        datasets['test'][0].num_edges))
    # node feature dimension
    input_dim = datasets['train'].num_node_features
    # link prediction needs 2 classes (0, 1)
    num_classes = datasets['train'].num_edge_labels
    model = Net(input_dim, num_classes, args).to(args.device)
    #optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-3)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9,
                                weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=args.epochs)
    follow_batch = []  # e.g., follow_batch = ['edge_index']
    dataloaders = {split: DataLoader(
        ds, collate_fn=Batch.collate(follow_batch),
        batch_size=args.batch_size, shuffle=(split == 'train'))
        for split, ds in datasets.items()}
    print('Graphs after split: ')
    for key, dataloader in dataloaders.items():
        for batch in dataloader:
            print(key, ': ', batch)
    train(model, dataloaders, optimizer, args, scheduler=scheduler)
def create_dataset():
    """Build and split the DeepSNAP dataset according to the global cfg.

    Returns a list [train, val, test]. OGB datasets use their bundled
    split indices; everything else uses DeepSNAP's random split. Timing
    for each phase is logged at the end.
    """
    ## Load dataset
    time1 = time.time()
    if cfg.dataset.format == 'OGB':
        graphs, splits = load_dataset()
    else:
        graphs = load_dataset()
    ## Filter graphs
    time2 = time.time()
    min_node = filter_graphs()
    ## Create whole dataset
    if type(graphs) is GraphDataset:
        # A custom loader may already return a fully-built GraphDataset.
        dataset = graphs
    else:
        dataset = GraphDataset(
            graphs,
            task=cfg.dataset.task,
            edge_train_mode=cfg.dataset.edge_train_mode,
            edge_message_ratio=cfg.dataset.edge_message_ratio,
            edge_negative_sampling_ratio=cfg.dataset.
            edge_negative_sampling_ratio,
            resample_disjoint=cfg.dataset.resample_disjoint,
            minimum_node_per_graph=min_node)
    ## Transform the whole dataset
    dataset = transform_before_split(dataset)
    ## Split dataset
    time3 = time.time()
    # Use custom data splits
    if cfg.dataset.format == 'OGB':
        datasets = []
        datasets.append(dataset[splits['train']])
        datasets.append(dataset[splits['valid']])
        datasets.append(dataset[splits['test']])
    # Use random split, supported by DeepSNAP
    else:
        datasets = dataset.split(transductive=cfg.dataset.transductive,
                                 split_ratio=cfg.dataset.split)
    # We only change the training negative sampling ratio: reset the
    # val/test splits to 1. (Fixed: the original assigned to the parent
    # `dataset` object instead of each split `datasets[i]`.)
    for i in range(1, len(datasets)):
        datasets[i].edge_negative_sampling_ratio = 1
    ## Transform each split dataset
    time4 = time.time()
    datasets = transform_after_split(datasets)
    time5 = time.time()
    logging.info('Load: {:.4}s, Before split: {:.4}s, '
                 'Split: {:.4}s, After split: {:.4}s'.format(
                     time2 - time1, time3 - time2,
                     time4 - time3, time5 - time4))
    return datasets
def test_dataset_basic(self):
    """A dataset built from two copies of the fixture graph has length 2."""
    G, x, y, edge_x, edge_y, edge_index, graph_x, graph_y = \
        simple_networkx_graph()
    # Attach all attributes in the same order the fixture expects.
    for setter, key, value in (
            (Graph.add_edge_attr, "edge_feature", edge_x),
            (Graph.add_edge_attr, "edge_label", edge_y),
            (Graph.add_node_attr, "node_feature", x),
            (Graph.add_node_attr, "node_label", y),
            (Graph.add_graph_attr, "graph_feature", graph_x),
            (Graph.add_graph_attr, "graph_label", graph_y)):
        setter(G, key, value)
    twin = deepcopy(G)
    dataset = GraphDataset(GraphDataset.list_to_graphs([G, twin]))
    self.assertEqual(len(dataset), 2)
def deepsnap_pagerank(args, pyg_dataset):
    """Benchmark the PageRank transform in DeepSNAP, averaged over
    args.num_runs runs; only apply_transform itself is timed."""
    avg_time = 0
    task = 'graph'
    for i in range(args.num_runs):
        if args.print_run:
            print("Run {}".format(i + 1))
        graphs = GraphDataset.pyg_to_graphs(pyg_dataset, verbose=True,
                                            fixed_split=False,
                                            netlib=netlib)
        dataset = GraphDataset(graphs, task=task)
        # Time only the transform.
        s = time.time()
        dataset.apply_transform(page_fun, update_tensor=False,
                                lib=args.netlib)
        avg_time += (time.time() - s)
    print("DeepSNAP has average time: {}".format(avg_time / args.num_runs))
def load_dataset():
    '''
    load raw datasets.
    :return: a list of networkx/deepsnap graphs, plus additional info if needed
    '''
    format = cfg.dataset.format
    name = cfg.dataset.name
    # dataset_dir = '{}/{}'.format(cfg.dataset.dir, name)
    dataset_dir = cfg.dataset.dir
    # Try to load customized data format
    for func in register.loader_dict.values():
        graphs = func(format, name, dataset_dir)
        if graphs is not None:
            return graphs
    # Load from Pytorch Geometric dataset
    if format == 'PyG':
        graphs = load_pyg(name, dataset_dir)
    # Load from networkx formatted data
    # todo: clean nx dataloader
    elif format == 'nx':
        graphs = load_nx(name, dataset_dir)
    # Load from OGB formatted data
    elif cfg.dataset.format == 'OGB':
        if cfg.dataset.name == 'ogbg-molhiv':
            dataset = PygGraphPropPredDataset(name=cfg.dataset.name)
            graphs = GraphDataset.pyg_to_graphs(dataset)
            # Note this is only used for custom splits from OGB
            split_idx = dataset.get_idx_split()
            return graphs, split_idx
        # NOTE(review): OGB names other than ogbg-molhiv fall through to
        # `return graphs` with `graphs` unbound — confirm intended scope.
    else:
        raise ValueError('Unknown data format: {}'.format(cfg.dataset.format))
    return graphs
def test_ensemble_generator(self):
    """An EnsembleGenerator combining two neighbor generators backs a
    GraphDataset whose generated graphs have `num_nodes` nodes."""
    pyg_dataset = Planetoid("./cora", "Cora")
    dg = Graph.pyg_to_graph(pyg_dataset[0])
    num_nodes = 500
    sizes = [2, 3]

    class NeighborGenerator1(Generator):
        # NOTE(review): __len__ returns the `sizes` list, not an int —
        # this breaks len() if it is ever called; confirm how deepsnap's
        # Generator actually uses this before changing it.
        def __len__(self):
            return sizes

        def generate(self):
            graph = Graph(gen_graph(num_nodes, dg.G))
            return graph

    class NeighborGenerator2(Generator):
        def __len__(self):
            return sizes

        def generate(self):
            graph = Graph(gen_graph(num_nodes, dg.G))
            return graph

    ensemble_generator = (
        EnsembleGenerator(
            [
                NeighborGenerator1(sizes),
                NeighborGenerator2(sizes),
            ]
        )
    )
    # No stored graphs: the dataset is backed purely by the generator.
    dataset = GraphDataset(None, generator=ensemble_generator)
    self.assertTrue(dataset[0].node_feature.shape[0] == num_nodes)
def batch_nx_graphs(graphs, anchors=None):
    """Convert networkx graphs into an augmented DeepSNAP Batch on the
    active device.

    When `anchors` is given (one node per graph), each node gets a 0/1
    anchor-indicator node_feature. Edge types are lifted into an integer
    edge_feature tensor.
    """
    augmenter = feature_preprocess.FeatureAugment()
    if anchors is not None:
        for anchor, g in zip(anchors, graphs):
            for v in g.nodes:
                g.nodes[v]["node_feature"] = torch.tensor([float(v == anchor)])
    # Fixed: this was guarded by the always-true constant condition
    # `'aifb' == 'aifb' or 'wn18' == 'wn18'` (90 edge types for both
    # supported datasets) — made unconditional.
    for g in graphs:
        for e in g.edges:
            g.edges[e]["edge_feature"] = torch.tensor(
                [g.edges[e]['edge_type']], dtype=torch.long)
    batch = Batch.from_data_list(GraphDataset.list_to_graphs(graphs))
    batch = augmenter.augment(batch)
    batch = batch.to(get_device())
    return batch
def load_dataset(name):
    """Load a TU benchmark as a DeepSNAP graph-classification dataset.

    Keeps each graph's largest connected component, drops graphs with
    fewer than 6 nodes, and returns (train, test, task).
    """
    task = "graph"
    if name == "enzymes":
        dataset = TUDataset(root="/tmp/ENZYMES", name="ENZYMES")
    elif name == "cox2":
        dataset = TUDataset(root="/tmp/cox2", name="COX2")
    elif name == "imdb-binary":
        dataset = TUDataset(root="/tmp/IMDB-BINARY", name="IMDB-BINARY")
    else:
        # Fixed: an unknown name previously fell through to a NameError
        # on `dataset`; fail fast with a clear message instead.
        raise ValueError("Unknown dataset name: {}".format(name))
    # (The original wrapped the rest in `if task == "graph":`, which is
    # always true here since task is a literal.)
    dataset = GraphDataset(GraphDataset.pyg_to_graphs(dataset))
    # Keep only the largest connected component of each graph.
    dataset = dataset.apply_transform(
        lambda g: g.G.subgraph(max(nx.connected_components(g.G), key=len)))
    dataset = dataset.filter(lambda g: len(g.G) >= 6)
    train, test = dataset.split(split_ratio=[0.8, 0.2])
    return train, test, task
def load_dataset_example(format, name, dataset_dir):
    """Example custom loader: handles only the PyG QM7b dataset and
    implicitly returns None otherwise so other loaders can try."""
    target_dir = '{}/{}'.format(dataset_dir, name)
    if format != 'PyG':
        return None
    if name == 'QM7b':
        raw = QM7b(target_dir)
        return GraphDataset.pyg_to_graphs(raw)
def test_filter(self):
    """Filtering by node count removes exactly the large graphs."""
    pyg_dataset = TUDataset('./enzymes', 'ENZYMES')
    dataset = GraphDataset(GraphDataset.pyg_to_graphs(pyg_dataset),
                           task="graph")
    thresh = 90
    size_before = len(dataset)
    # Count graphs at or above the threshold before filtering.
    large_count = sum(1 for graph in dataset if len(graph.G) >= thresh)
    dataset = dataset.filter(lambda graph: len(graph.G) < thresh,
                             deep_copy=False)
    self.assertEqual(size_before - len(dataset), large_count)
def test_filter(self):
    """Tensor-backend variant: filter() by num_nodes drops exactly the
    graphs at or above the threshold."""
    pyg_dataset = TUDataset("./enzymes", "ENZYMES")
    graph_dicts = pyg_to_dicts(pyg_dataset)
    dataset = GraphDataset([Graph(**d) for d in graph_dicts], task="graph")
    thresh = 90
    size_before = len(dataset)
    big = sum(1 for g in dataset if g.num_nodes >= thresh)
    dataset = dataset.filter(lambda g: g.num_nodes < thresh,
                             deep_copy=False)
    self.assertEqual(size_before - len(dataset), big)
def load_pyg(name, dataset_dir):
    '''
    load pyg format dataset
    :param name: dataset name
    :param dataset_dir: data directory
    :return: a list of networkx/deepsnap graphs
    '''
    dataset_dir = '{}/{}'.format(dataset_dir, name)
    if name in ['Cora', 'CiteSeer', 'PubMed']:
        dataset_raw = Planetoid(dataset_dir, name)
    elif name[:3] == 'TU_':
        # TU_IMDB doesn't have node features
        if name[3:] == 'IMDB':
            name = 'IMDB-MULTI'
            dataset_raw = TUDataset(dataset_dir, name, transform=T.Constant())
        else:
            dataset_raw = TUDataset(dataset_dir, name[3:])
        # TU_dataset only has graph-level label
        # The goal is to have synthetic tasks
        # that select smallest 100 graphs that have more than 200 edges
        if cfg.dataset.tu_simple and cfg.dataset.task != 'graph':
            size = []
            for data in dataset_raw:
                edge_num = data.edge_index.shape[1]
                # Graphs with < 200 edges sort to the end (sentinel 9999).
                edge_num = 9999 if edge_num < 200 else edge_num
                size.append(edge_num)
            size = torch.tensor(size)
            order = torch.argsort(size)[:100]
            dataset_raw = dataset_raw[order]
    elif name == 'Karate':
        dataset_raw = KarateClub()
    elif 'Coauthor' in name:
        if 'CS' in name:
            dataset_raw = Coauthor(dataset_dir, name='CS')
        else:
            dataset_raw = Coauthor(dataset_dir, name='Physics')
    elif 'Amazon' in name:
        if 'Computers' in name:
            dataset_raw = Amazon(dataset_dir, name='Computers')
        else:
            dataset_raw = Amazon(dataset_dir, name='Photo')
    elif name == 'MNIST':
        dataset_raw = MNISTSuperpixels(dataset_dir)
    elif name == 'PPI':
        dataset_raw = PPI(dataset_dir)
    elif name == 'QM7b':
        dataset_raw = QM7b(dataset_dir)
    else:
        raise ValueError('{} not support'.format(name))
    graphs = GraphDataset.pyg_to_graphs(dataset_raw)
    return graphs
def gen_data_loaders(self, size, batch_size, train=True,
                     use_distributed_sampling=False):
    """Build two loaders of sampled neighborhood subgraphs plus a
    placeholder list of batch slots.

    NOTE(review): `use_distributed_sampling` is accepted but unused here
    (sampler is always None) — kept for interface compatibility.
    """
    loaders = []
    # Fixed: the original computed `batch_size // 2 if i == 0 else
    # batch_size // 2` — both branches identical; hoisted the constant.
    half_batch = batch_size // 2
    for i in range(2):
        neighs = []
        for _ in range(size // 2):
            # Sample a random-size neighborhood from train or test set.
            graph, neigh = utils.sample_neigh(
                self.train_set if train else self.test_set,
                random.randint(self.min_size, self.max_size))
            neighs.append(graph.subgraph(neigh))
        dataset = GraphDataset(GraphDataset.list_to_graphs(neighs))
        loaders.append(
            TorchDataLoader(dataset, collate_fn=Batch.collate([]),
                            batch_size=half_batch,
                            sampler=None, shuffle=False))
    loaders.append([None] * (size // batch_size))
    return loaders
def test_pyg_to_graphs_global(self):
    """deepsnap.use() swaps the global backend (networkx vs snapx);
    conversion, split sizes, and batching must work under both."""
    import deepsnap
    deepsnap.use(nx)
    pyg_dataset = Planetoid('./planetoid', "Cora")
    graphs = GraphDataset.pyg_to_graphs(pyg_dataset)
    self.assertTrue(isinstance(graphs[0].G, nx.Graph))
    dataset = GraphDataset(graphs, task='node')
    # Expected node counts for the default 0.8/0.1/0.1 split.
    num_nodes = dataset.num_nodes[0]
    node_0 = int(0.8 * num_nodes)
    node_1 = int(0.1 * num_nodes)
    node_2 = num_nodes - node_0 - node_1
    train, val, test = dataset.split()
    self.assertTrue(isinstance(train[0].G, nx.Graph))
    self.assertTrue(isinstance(val[0].G, nx.Graph))
    self.assertTrue(isinstance(test[0].G, nx.Graph))
    self.assertEqual(train[0].node_label_index.shape[0], node_0)
    self.assertEqual(val[0].node_label_index.shape[0], node_1)
    self.assertEqual(test[0].node_label_index.shape[0], node_2)
    train_loader = DataLoader(train, collate_fn=Batch.collate(),
                              batch_size=1)
    for batch in train_loader:
        self.assertTrue(isinstance(batch.G[0], nx.Graph))
    # Repeat the same checks under the snapx backend.
    deepsnap.use(sx)
    graphs = GraphDataset.pyg_to_graphs(pyg_dataset)
    self.assertTrue(isinstance(graphs[0].G, sx.Graph))
    dataset = GraphDataset(graphs, task='node')
    num_nodes = dataset.num_nodes[0]
    node_0 = int(0.8 * num_nodes)
    node_1 = int(0.1 * num_nodes)
    node_2 = num_nodes - node_0 - node_1
    train, val, test = dataset.split()
    self.assertTrue(isinstance(train[0].G, sx.Graph))
    self.assertTrue(isinstance(val[0].G, sx.classes.graph.Graph))
    self.assertTrue(isinstance(test[0].G, sx.classes.graph.Graph))
    self.assertEqual(train[0].node_label_index.shape[0], node_0)
    self.assertEqual(val[0].node_label_index.shape[0], node_1)
    self.assertEqual(test[0].node_label_index.shape[0], node_2)
    train_loader = DataLoader(train, collate_fn=Batch.collate(),
                              batch_size=1)
    for batch in train_loader:
        self.assertTrue(isinstance(batch.G[0], sx.Graph))
def test_batch_basic(self):
    """Batching two copies of the fixture graph doubles node_feature."""
    G, x, y, edge_x, edge_y, edge_index, graph_x, graph_y = \
        simple_networkx_graph()
    # Attach all attributes in the same order as the fixture expects.
    for setter, key, value in (
            (Graph.add_edge_attr, "edge_feature", edge_x),
            (Graph.add_edge_attr, "edge_label", edge_y),
            (Graph.add_node_attr, "node_feature", x),
            (Graph.add_node_attr, "node_label", y),
            (Graph.add_graph_attr, "graph_feature", graph_x),
            (Graph.add_graph_attr, "graph_label", graph_y)):
        setter(G, key, value)
    twin = deepcopy(G)
    graph_list = GraphDataset.list_to_graphs([G, twin])
    batch = Batch.from_data_list(graph_list)
    self.assertEqual(batch.num_graphs, 2)
    self.assertEqual(len(batch.node_feature),
                     2 * len(graph_list[0].node_feature))
def load_dataset(name):
    """Load a TU benchmark as a DeepSNAP graph-classification dataset,
    adding constant node features for IMDB-BINARY (which has none).

    Returns (train, test, task) after keeping each graph's largest
    connected component and dropping graphs with fewer than 6 nodes.
    """
    def add_feats(graph):
        # Constant node feature for datasets without node labels.
        for v in graph.G.nodes:
            graph.G.nodes[v]["node_feature"] = torch.ones(1)
        return graph

    task = "graph"
    if name == "enzymes":
        dataset = TUDataset(root="/tmp/ENZYMES", name="ENZYMES")
    elif name == "cox2":
        dataset = TUDataset(root="/tmp/cox2", name="COX2")
    elif name == "imdb-binary":
        dataset = TUDataset(root="/tmp/IMDB-BINARY", name="IMDB-BINARY")
    else:
        # Fixed: an unknown name previously fell through to a NameError
        # on `dataset`; fail fast with a clear message instead.
        raise ValueError("Unknown dataset name: {}".format(name))
    # (The original wrapped the rest in `if task == "graph":`, which is
    # always true here since task is a literal.)
    dataset = GraphDataset(GraphDataset.pyg_to_graphs(dataset))
    # add blank features for imdb-binary, which doesn't have node labels
    if name == "imdb-binary":
        dataset = dataset.apply_transform(add_feats)
    dataset = dataset.apply_transform(
        lambda g: g.G.subgraph(max(nx.connected_components(g.G), key=len)))
    dataset = dataset.filter(lambda g: len(g.G) >= 6)
    train, test = dataset.split(split_ratio=[0.8, 0.2])
    return train, test, task
def batch_nx_graphs_multi(graphs, anchors=None):
    """Build an augmented DeepSNAP Batch from networkx graphs and move
    it to the active device; anchor nodes get an indicator feature."""
    augmenter = feature_preprocess.FeatureAugment()
    if anchors is not None:
        for anchor_node, nx_graph in zip(anchors, graphs):
            for node in nx_graph.nodes:
                indicator = float(node == anchor_node)
                nx_graph.nodes[node]["node_feature"] = torch.tensor(
                    [indicator])
    deepsnap_graphs = GraphDataset.list_to_graphs(graphs)
    batch = augmenter.augment(Batch.from_data_list(deepsnap_graphs))
    return batch.to(get_device())
def test_dataset_basic(self):
    """A tensor-backend dataset of two identical graphs has length 2."""
    _, x, y, edge_x, edge_y, edge_index, graph_x, graph_y = (
        simple_networkx_graph())
    first = Graph(node_feature=x, node_label=y, edge_index=edge_index,
                  edge_feature=edge_x, edge_label=edge_y,
                  graph_feature=graph_x, graph_label=graph_y,
                  directed=True)
    second = deepcopy(first)
    self.assertEqual(len(GraphDataset([first, second])), 2)
def test_generator(self):
    """A GraphDataset backed only by a Generator yields generated
    graphs of the requested size."""
    pyg_dataset = Planetoid('./cora', 'Cora')
    dg = Graph.pyg_to_graph(pyg_dataset[0])
    num_nodes = 500
    sizes = [2, 3]

    class NeighborGenerator(Generator):
        # NOTE(review): __len__ returns the `sizes` list rather than an
        # int; this breaks len() if it is ever called on the generator —
        # confirm deepsnap's Generator contract before changing.
        def __len__(self):
            return sizes

        def generate(self):
            graph = Graph(gen_graph(num_nodes, dg.G))
            return graph

    # No stored graphs: the dataset is backed purely by the generator.
    dataset = GraphDataset(None, generator=NeighborGenerator(sizes))
    self.assertTrue(dataset[0].node_feature.shape[0] == num_nodes)
def main():
    """Link prediction on the WordNet graph with relation types used as
    both edge features and edge labels (hence disjoint train mode)."""
    args = arg_parse()
    edge_train_mode = args.mode
    print('edge train mode: {}'.format(edge_train_mode))
    WN_graph = nx.read_gpickle(args.data_path)
    print('Each node has node ID (n_id). Example: ', WN_graph.nodes[0])
    print(
        'Each edge has edge ID (id) and categorical label (e_label). Example: ',
        WN_graph[0][5871])
    # Since both feature and label are relation types,
    # Only the disjoint mode would make sense
    dataset = GraphDataset(
        [WN_graph],
        task='link_pred',
        edge_train_mode=edge_train_mode,
        edge_message_ratio=args.edge_message_ratio,
        edge_negative_sampling_ratio=args.neg_sampling_ratio)
    # find num edge types (removed the unused `max_label` accumulator)
    labels = []
    for u, v, edge_key in WN_graph.edges:
        label = WN_graph[u][v][edge_key]['e_label']
        if label not in labels:
            labels.append(label)
    # labels are consecutive (0-17)
    num_edge_types = len(labels)
    print('Pre-transform: ', dataset[0])
    dataset = dataset.apply_transform(WN_transform,
                                      num_edge_types=num_edge_types,
                                      deep_copy=False)
    print('Post-transform: ', dataset[0])
    print('Initial data: {} nodes; {} edges.'.format(
        dataset[0].G.number_of_nodes(), dataset[0].G.number_of_edges()))
    print('Number of node features: {}'.format(dataset.num_node_features))
    # split dataset
    datasets = {}
    datasets['train'], datasets['val'], datasets['test'] = dataset.split(
        transductive=True, split_ratio=[0.8, 0.1, 0.1])
    print('After split:')
    print('Train message-passing graph: {} nodes; {} edges.'.format(
        datasets['train'][0].G.number_of_nodes(),
        datasets['train'][0].G.number_of_edges()))
    print('Val message-passing graph: {} nodes; {} edges.'.format(
        datasets['val'][0].G.number_of_nodes(),
        datasets['val'][0].G.number_of_edges()))
    print('Test message-passing graph: {} nodes; {} edges.'.format(
        datasets['test'][0].G.number_of_nodes(),
        datasets['test'][0].G.number_of_edges()))
    # node feature dimension
    input_dim = datasets['train'].num_node_features
    edge_feat_dim = datasets['train'].num_edge_features
    num_classes = datasets['train'].num_edge_labels
    # Repaired: this format-string literal was broken mid-token in the
    # source ("…; edge " / "feature dim: …").
    print('Node feature dim: {}; edge feature dim: {}; num classes: {}.'.format(
        input_dim, edge_feat_dim, num_classes))
    # relation type is both used for edge features and edge labels
    model = Net(input_dim, edge_feat_dim, num_classes, args).to(args.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001,
                                 weight_decay=5e-3)
    follow_batch = []  # e.g., follow_batch = ['edge_index']
    dataloaders = {
        split: DataLoader(ds, collate_fn=Batch.collate(follow_batch),
                          batch_size=1, shuffle=(split == 'train'))
        for split, ds in datasets.items()
    }
    print('Graphs after split: ')
    for key, dataloader in dataloaders.items():
        for batch in dataloader:
            print(key, ': ', batch)
    train(model, dataloaders, optimizer, args)
# NOTE(review): this `raise` is the tail of a dataset-selection branch whose
# start is outside this view — confirm its enclosing construct.
raise ValueError("Unsupported dataset.")

# Choose the backend network library used for DeepSNAP graphs.
if args.netlib == "nx":
    import networkx as netlib
    print("Use NetworkX as the backend network library.")
elif args.netlib == "sx":
    import snap
    import snapx as netlib
    print("Use SnapX as the backend network library.")
else:
    raise ValueError("{} network library is not supported.".format(
        args.netlib))

if args.split == 'random':
    # Random transductive node split.
    graphs = GraphDataset.pyg_to_graphs(pyg_dataset, verbose=True,
                                        fixed_split=False, netlib=netlib)
    dataset = GraphDataset(graphs, task='node')  # node, edge, link_pred, graph
    dataset_train, dataset_val, dataset_test = dataset.split(
        transductive=True,
        split_ratio=[0.8, 0.1, 0.1])  # transductive split, inductive split
else:
    # Use the dataset's bundled (fixed) split.
    graphs_train, graphs_val, graphs_test = \
        GraphDataset.pyg_to_graphs(pyg_dataset, verbose=True,
                                   fixed_split=True, netlib=netlib)
    dataset_train, dataset_val, dataset_test = \
        GraphDataset(graphs_train, task='node'), \
        GraphDataset(graphs_val, task='node'), \
        GraphDataset(graphs_test, task='node')