def load_ogb_mag():
    """Load ogbn-mag, mirror every relation with a 'rev-*' edge type, and
    attach paper features, labels and boolean split masks.

    Returns:
        OGBDataset wrapping the augmented heterograph; the prediction
        target node type is 'paper'.
    """
    name = 'ogbn-mag'
    from ogb.nodeproppred import DglNodePropPredDataset
    # The OGB loader reads from ./dataset; link it to the shared cache.
    # BUG FIX: the unguarded os.symlink raised FileExistsError on any
    # second call (or when a link was left over from a previous run).
    link = os.path.join(os.getcwd(), 'dataset')
    if not os.path.lexists(link):
        os.symlink('/tmp/dataset/', link)
    print('load', name)
    dataset = DglNodePropPredDataset(name=name)
    print('finish loading', name)
    split_idx = dataset.get_idx_split()
    train_idx = split_idx["train"]['paper']
    val_idx = split_idx["valid"]['paper']
    test_idx = split_idx["test"]['paper']
    hg_orig, labels = dataset[0]
    # Rebuild the heterograph with a reversed copy of every edge type.
    subgs = {}
    for etype in hg_orig.canonical_etypes:
        u, v = hg_orig.all_edges(etype=etype)
        subgs[etype] = (u, v)
        subgs[(etype[2], 'rev-' + etype[1], etype[0])] = (v, u)
    hg = dgl.heterograph(subgs)
    hg.nodes['paper'].data['feat'] = hg_orig.nodes['paper'].data['feat']
    hg.nodes['paper'].data['labels'] = labels['paper'].squeeze()
    # Boolean masks marking the train/val/test papers.
    train_mask = torch.zeros((hg.number_of_nodes('paper'), ), dtype=torch.bool)
    train_mask[train_idx] = True
    val_mask = torch.zeros((hg.number_of_nodes('paper'), ), dtype=torch.bool)
    val_mask[val_idx] = True
    test_mask = torch.zeros((hg.number_of_nodes('paper'), ), dtype=torch.bool)
    test_mask[test_idx] = True
    hg.nodes['paper'].data['train_mask'] = train_mask
    hg.nodes['paper'].data['val_mask'] = val_mask
    hg.nodes['paper'].data['test_mask'] = test_mask
    num_classes = dataset.num_classes
    return OGBDataset(hg, num_classes, 'paper')
def load_ogb(name):
    """Load an OGB node-property dataset as a homogeneous DGL graph.

    Attaches 'features', 'labels' and int64 train/val/test masks to the
    graph's ndata.

    Returns:
        (graph, num_labels) where num_labels is the number of distinct
        label values.
    """
    from ogb.nodeproppred import DglNodePropPredDataset
    data = DglNodePropPredDataset(name=name)
    splitted_idx = data.get_idx_split()
    graph, labels = data[0]
    labels = labels[:, 0]  # (N, 1) -> (N,)
    graph.ndata['features'] = graph.ndata['feat']
    graph.ndata['labels'] = labels
    num_labels = len(th.unique(labels))
    # Find the node IDs in the training, validation, and test set.
    train_nid, val_nid, test_nid = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
    train_mask = th.zeros((graph.number_of_nodes(),), dtype=th.int64)
    train_mask[train_nid] = 1
    val_mask = th.zeros((graph.number_of_nodes(),), dtype=th.int64)
    val_mask[val_nid] = 1
    test_mask = th.zeros((graph.number_of_nodes(),), dtype=th.int64)
    test_mask[test_nid] = 1
    graph.ndata['train_mask'] = train_mask
    graph.ndata['val_mask'] = val_mask
    graph.ndata['test_mask'] = test_mask
    # CLEANUP: the original recomputed len(th.unique(graph.ndata['labels']))
    # here, which is exactly num_labels (labels were stored unchanged), and
    # it computed an unused local in_feats; both redundancies removed.
    return graph, num_labels
def __init__(self, dataset_name='ogbn-arxiv', k=5):
    """Load an OGB node dataset, precompute a length-``k`` random-walk
    context for every node, and read the precomputed WL labelling from
    ``<dataset_path>/WL.pkl``."""
    super(Ogbn, self).__init__()
    print("Loading dataset {}".format(dataset_name))
    self.dataset_name = dataset_name
    self.dataset_path = os.path.join('./data', dataset_name.replace('-', '_'))
    ogbn_dataset = DglNodePropPredDataset(dataset_name, root='./data')
    self.graph, self.label = ogbn_dataset[0]
    self.graph = self.graph.add_self_loop()
    self.label = self.label.flatten()
    self.split = ogbn_dataset.get_idx_split()
    self.length = self.graph.num_nodes()
    self.nodes = self.graph.nodes()
    self.edges = self.graph.edges()
    self.k = k
    print("Generate Context...")
    t_begin = time.time()
    # random_walk returns (traces, types); only the traces are kept.
    walks, _ = dgl.sampling.random_walk(self.graph, self.nodes, length=self.k)
    self.context = np.squeeze(walks)
    t_end = time.time()
    print("Context shape: {}, time: {}s".format(self.context.shape, t_end - t_begin))
    print("Loading WL...")
    self.path_WL = os.path.join(self.dataset_path, 'WL.pkl')
    self.WL = self.load_WL(self.path_WL)
    # Largest WL identifier seen (0 when the mapping is empty).
    max_wl = 0
    for wl_id in self.WL.values():
        max_wl = max(max_wl, wl_id)
    print("Max WL id: ", max_wl)
def load_data(name, ogb_root, seed, device):
    """Load a node-classification dataset by name.

    Returns (g, labels, num_classes, train_idx, val_idx, test_idx), with
    the graph, labels and index tensors moved onto ``device``.
    """
    # ogbn-arxiv ships its own split, so it is handled separately.
    if name == 'ogbn-arxiv':
        data = DglNodePropPredDataset('ogbn-arxiv', ogb_root)
        g, labels = data[0]
        split = data.get_idx_split()
        return (g.to(device), labels.squeeze(dim=-1).to(device), data.num_classes,
                split['train'].to(device), split['valid'].to(device), split['test'].to(device))
    if name in ('cora', 'citeseer', 'pubmed'):
        data = load_citation_dataset(name)
    elif name == 'cora_full':
        data = gnn_benckmark.CoraFullDataset()
    elif name in ('cs', 'physics'):
        data = gnn_benckmark.Coauthor(name)
    elif name in ('photo', 'computers'):
        data = gnn_benckmark.AmazonCoBuy(name)
    else:
        raise ValueError('Unknown dataset:', name)

    g = data[0].to(device)
    # https://github.com/dmlc/dgl/issues/2479
    num_classes = data.num_classes
    if name in ('photo', 'computers'):
        num_classes = g.ndata['label'].max().item() + 1
    if 'train_mask' in g.ndata:
        # Datasets with predefined masks: convert each mask to index form.
        train_idx, val_idx, test_idx = (
            g.ndata[key].nonzero(as_tuple=True)[0]
            for key in ('train_mask', 'val_mask', 'test_mask')
        )
    else:
        # No predefined split: draw a seeded random one.
        train_idx, val_idx, test_idx = split_idx(torch.arange(g.num_nodes()), 0.2, 0.3, seed)
    return (g, g.ndata['label'], num_classes,
            train_idx.to(device), val_idx.to(device), test_idx.to(device))
def process_DglNodeDataset_hetero(self, dataset: DglNodePropPredDataset):
    """Ingest a heterogeneous DglNodePropPredDataset: record node counts,
    features and labels per node type, pick the head (prediction) node
    type, and store the split indices and the graph on ``self``."""
    graph, labels = dataset[0]
    self._name = dataset.name
    if self.node_types is None:
        self.node_types = graph.ntypes
    self.num_nodes_dict = {nt: graph.num_nodes(nt) for nt in self.node_types}
    self.y_dict = labels
    self.x_dict = graph.ndata["feat"]
    # Flatten (N, 1) label matrices to (N,) and stash them on the graph.
    for ntype, y in self.y_dict.items():
        if y.dim() == 2 and y.shape[1] == 1:
            y = y.squeeze(1)
        graph.nodes[ntype].data["labels"] = y
    if self.head_node_type is None:
        # Default to the first labelled node type, else the first type.
        if self.y_dict is not None:
            self.head_node_type = list(self.y_dict.keys())[0]
        else:
            self.head_node_type = self.node_types[0]
    self.metapaths = graph.canonical_etypes
    split_idx = dataset.get_idx_split()
    self.training_idx = split_idx["train"][self.head_node_type]
    self.validation_idx = split_idx["valid"][self.head_node_type]
    self.testing_idx = split_idx["test"][self.head_node_type]
    self.G = graph
def load_ogb_product(name):
    """Load an OGB node-property dataset (e.g. ogbn-products) as a
    homogeneous graph with 'label' and boolean split masks in ndata.

    Returns:
        OGBDataset wrapping the graph and the number of label classes.
    """
    from ogb.nodeproppred import DglNodePropPredDataset
    # BUG FIX: the unguarded os.symlink raised FileExistsError on any
    # second call (or when a link was left over from a previous run).
    link = os.path.join(os.getcwd(), 'dataset')
    if not os.path.lexists(link):
        os.symlink('/tmp/dataset/', link)
    print('load', name)
    data = DglNodePropPredDataset(name=name)
    print('finish loading', name)
    splitted_idx = data.get_idx_split()
    graph, labels = data[0]
    labels = labels[:, 0]  # (N, 1) -> (N,)
    graph.ndata['label'] = labels
    # Count classes over valid (non-NaN) labels only.
    num_labels = len(
        torch.unique(labels[torch.logical_not(torch.isnan(labels))]))
    # Find the node IDs in the training, validation, and test set.
    train_nid, val_nid, test_nid = splitted_idx['train'], splitted_idx[
        'valid'], splitted_idx['test']
    train_mask = torch.zeros((graph.number_of_nodes(), ), dtype=torch.bool)
    train_mask[train_nid] = True
    val_mask = torch.zeros((graph.number_of_nodes(), ), dtype=torch.bool)
    val_mask[val_nid] = True
    test_mask = torch.zeros((graph.number_of_nodes(), ), dtype=torch.bool)
    test_mask[test_nid] = True
    graph.ndata['train_mask'] = train_mask
    graph.ndata['val_mask'] = val_mask
    graph.ndata['test_mask'] = test_mask
    # CLEANUP: removed the unused local in_feats.
    return OGBDataset(graph, num_labels)
def load_ogb(name):
    """Load an OGB node dataset; rename the feature field to 'features',
    store flattened labels, and attach boolean train/val/test masks.

    Returns:
        (graph, num_labels)
    """
    from ogb.nodeproppred import DglNodePropPredDataset
    print('load', name)
    data = DglNodePropPredDataset(name=name)
    print('finish loading', name)
    splitted_idx = data.get_idx_split()
    graph, labels = data[0]
    labels = labels[:, 0]
    # Rename 'feat' -> 'features' and drop the old key.
    graph.ndata['features'] = graph.ndata['feat']
    del graph.ndata['feat']
    graph.ndata['labels'] = labels
    in_feats = graph.ndata['features'].shape[1]  # kept for parity with original
    # Classes are counted over valid (non-NaN) labels only.
    num_labels = len(th.unique(labels[th.logical_not(th.isnan(labels))]))

    def _bool_mask(node_ids):
        # Dense boolean mask over all nodes with node_ids set to True.
        mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
        mask[node_ids] = True
        return mask

    # Find the node IDs in the training, validation, and test set.
    graph.ndata['train_mask'] = _bool_mask(splitted_idx['train'])
    graph.ndata['val_mask'] = _bool_mask(splitted_idx['valid'])
    graph.ndata['test_mask'] = _bool_mask(splitted_idx['test'])
    print('finish constructing', name)
    return graph, num_labels
def load_ogb(name):
    """Load an OGB node-property dataset as a homogeneous DGL graph with
    'features', 'labels' and boolean train/val/test masks in ndata.

    Returns:
        (graph, num_labels)
    """
    from ogb.nodeproppred import DglNodePropPredDataset
    data = DglNodePropPredDataset(name=name)
    splitted_idx = data.get_idx_split()
    graph, labels = data[0]
    labels = labels[:, 0]  # (N, 1) -> (N,)
    graph.ndata["features"] = graph.ndata["feat"]
    graph.ndata["labels"] = labels
    num_labels = len(th.unique(labels))
    # Find the node IDs in the training, validation, and test set.
    train_nid, val_nid, test_nid = (
        splitted_idx["train"],
        splitted_idx["valid"],
        splitted_idx["test"],
    )
    train_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    train_mask[train_nid] = True
    val_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    val_mask[val_nid] = True
    test_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    test_mask[test_nid] = True
    graph.ndata["train_mask"] = train_mask
    graph.ndata["val_mask"] = val_mask
    graph.ndata["test_mask"] = test_mask
    # CLEANUP: the original recomputed len(th.unique(graph.ndata["labels"]))
    # here, which equals num_labels (labels were stored unchanged), and it
    # computed an unused local in_feats; both redundancies removed.
    return graph, num_labels
def __init__(self, batch_size: int):
    """Load ogbn-arxiv once; keep the self-looped graph, the official
    split indices and the batch size for later loaders."""
    super().__init__()
    arxiv = DglNodePropPredDataset(name='ogbn-arxiv')
    self.split_idx = arxiv.get_idx_split()
    graph, node_labels = arxiv[0]
    # Flatten (N, 1) labels and store them on the graph; add_self_loop
    # copies node data into the returned graph.
    graph.ndata["label"] = node_labels.squeeze()
    self.g = add_self_loop(graph)
    self.batch_size = batch_size
def load_dataset():
    """Return (graph, labels, split_idx) for ogbn-arxiv with self-loops
    added to the graph."""
    dataset = DglNodePropPredDataset(name='ogbn-arxiv')
    split_idx = dataset.get_idx_split()
    # Node Property Prediction datasets contain exactly one graph.
    graph, labels = dataset[0]
    graph = dgl.add_self_loop(graph)
    return graph, labels, split_idx
def main():
    """Full-batch training entry point for OGBN-Arxiv."""
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--wd', type=float, default=0)
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--model', type=str, default='AFFN')
    args = parser.parse_args()
    print(args)

    device = torch.device(
        f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu')

    dataset = DglNodePropPredDataset(name='ogbn-arxiv')
    split_idx = dataset.get_idx_split()
    graph, label = dataset[0]

    # Make the directed graph undirected by adding every edge's reverse.
    src, dst = graph.edges()[0], graph.edges()[1]
    g = dgl.DGLGraph((src, dst))
    g.add_edges(dst, src)

    x = graph.ndata['feat'].to(device)
    y_true = label.to(device)
    train_idx = split_idx['train'].to(device)

    model = Net(x.size(-1), args.hidden_channels, dataset.num_classes,
                args.model, args.dropout).to(device)
    print(model)

    evaluator = Evaluator(name='ogbn-arxiv')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(
            model.parameters(), lr=args.lr, weight_decay=args.wd)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, g, x, y_true, train_idx, optimizer)
            result = test(model, g, x, y_true, split_idx, evaluator)
            logger.add_result(run, result)
            train_acc, valid_acc, test_acc = result
            print(f'Run: {run + 1:02d}, '
                  f'Epoch: {epoch:02d}, '
                  f'Loss: {loss:.4f}, '
                  f'Train: {100 * train_acc:.2f}%, '
                  f'Valid: {100 * valid_acc:.2f}% '
                  f'Test: {100 * test_acc:.2f}%')
        logger.print_statistics(run)
    logger.print_statistics()
def load_data(path, device):
    """Load ogbn-mag (with reverse edges added); labels and split indices
    for the 'paper' node type are moved onto ``device``.

    Returns (g, labels, num_classes, train_idx, val_idx, test_idx,
    evaluator)."""
    data = DglNodePropPredDataset('ogbn-mag', path)
    g, labels = data[0]
    g = add_reverse_edges(g)
    labels = labels['paper'].to(device)
    split = data.get_idx_split()
    train_idx, val_idx, test_idx = (
        split[part]['paper'].to(device) for part in ('train', 'valid', 'test'))
    return g, labels, data.num_classes, train_idx, val_idx, test_idx, Evaluator(data.name)
def load_data(dataset):
    """Fetch an OGB node dataset plus its official evaluator and splits.

    Returns (graph, labels, train_idx, val_idx, test_idx, evaluator);
    labels are also stored in graph.ndata['labels']."""
    data = DglNodePropPredDataset(name=dataset)
    evaluator = Evaluator(name=dataset)
    idx = data.get_idx_split()
    graph, labels = data[0]
    graph.ndata["labels"] = labels
    return graph, labels, idx["train"], idx["valid"], idx["test"], evaluator
def __init__(self, dataset_name):
    """Wrap ogbn-mag as a node-classification dataset whose target node
    type is 'paper'; any other name raises ValueError."""
    super(NodeClassificationDataset, self).__init__()
    if dataset_name != 'ogb-mag':
        raise ValueError
    dataset = DglNodePropPredDataset(name='ogbn-mag')
    split_idx = dataset.get_idx_split()
    self.num_classes = dataset.num_classes
    self.train_idx = split_idx["train"]
    self.valid_idx = split_idx["valid"]
    self.test_idx = split_idx["test"]
    # graph: dgl graph object, label: torch tensor of shape (num_nodes, num_tasks)
    self.g, self.label = dataset[0]
    self.category = 'paper'
def load_dataset(dataset_type, **kwargs):
    """
    Load dataset.

    Args:
        dataset_type: str, support 'proteins', 'cora', 'citeseer',
            'pubmed', 'amazon', 'reddit'.

    Returns:
        For 'proteins': (graph, labels, train_idx, val_idx, test_idx,
        evaluator).  For all others: (graph, features, labels,
        num_classes, train_mask, val_mask, test_mask).
    """
    if dataset_type == 'proteins':
        data = DglNodePropPredDataset(name='ogbn-proteins',
                                      root=kwargs['root'])
        evaluator = Evaluator(name='ogbn-proteins')
        splitted_idx = data.get_idx_split()
        train_idx, val_idx, test_idx = splitted_idx["train"], splitted_idx[
            "valid"], splitted_idx["test"]
        graph, labels = data[0]
        # ogbn-proteins has no input features; one-hot encode the species.
        species = graph.ndata['species']
        features = one_hot_encoder(species)
        graph.ndata['feat'] = features
        graph.ndata['label'] = labels
        return graph, labels, train_idx, val_idx, test_idx, evaluator
    if dataset_type == 'cora':
        dataset = dgl.data.CoraGraphDataset()
    elif dataset_type == 'citeseer':
        dataset = dgl.data.CiteseerGraphDataset()
    elif dataset_type == 'pubmed':
        dataset = dgl.data.PubmedGraphDataset()
    elif dataset_type == 'amazon':
        dataset = dgl.data.AmazonCoBuyComputerDataset()
    elif dataset_type == 'reddit':
        dataset = dgl.data.RedditDataset()
    else:
        raise KeyError(
            'Dataset type {} not recognized.'.format(dataset_type))
    # CLEANUP: the original had `if dataset_type == 'amazon': ... else: ...`
    # with byte-identical branches; collapsed into one path.
    num_classes = dataset.num_classes
    graph = dataset[0]
    features = th.FloatTensor(graph.ndata['feat'])
    labels = th.LongTensor(graph.ndata['label'])
    # NOTE(review): AmazonCoBuyComputerDataset graphs carry no
    # train/val/test masks, so the 'amazon' path presumably fails on the
    # lookups below — same as the original behavior; confirm with callers.
    train_mask = th.BoolTensor(graph.ndata['train_mask'])
    val_mask = th.BoolTensor(graph.ndata['val_mask'])
    test_mask = th.BoolTensor(graph.ndata['test_mask'])
    return graph, features, labels, num_classes, train_mask, val_mask, test_mask
def load_ogb(name):
    """Load an OGB node dataset with verbose timing/memory instrumentation
    (via the external get_memory/ttt helpers).

    Returns (graph, num_labels); the graph carries 'features', 'labels'
    and boolean train/val/test masks in its ndata.
    """
    tic_step = time.time()
    get_memory("-" * 40 + "---------------------from ogb.nodeproppred import DglNodePropPredDataset***************************")
    print('load', name)
    data = DglNodePropPredDataset(name=name)
    t1 = ttt(tic_step, "-" * 40 + "---------------------data = DglNodePropPredDataset(name=name)***************************")
    print('finish loading', name)
    splitted_idx = data.get_idx_split()
    t2 = ttt(t1, "-" * 40 + "---------------------splitted_idx = data.get_idx_split()***************************")
    graph, labels = data[0]
    t3 = ttt(t2, "-" * 40 + "---------------------graph, labels = data[0]***************************")
    print(labels)
    print(data[0])
    print(graph)
    labels = labels[:, 0]
    t4 = ttt(t3, "-" * 40 + "---------------------labels = labels[:, 0]***************************")
    graph.ndata['features'] = graph.ndata['feat']
    t5 = ttt(t4, "-" * 40 + "---------------------graph.ndata['features'] = graph.ndata['feat']***************************")
    graph.ndata['labels'] = labels
    t6 = ttt(t5, "-" * 40 + "---------graph.ndata['labels'] = labels******************")
    in_feats = graph.ndata['features'].shape[1]  # kept for parity with original
    # Classes are counted over valid (non-NaN) labels only.
    num_labels = len(th.unique(labels[th.logical_not(th.isnan(labels))]))
    # Find the node IDs in the training, validation, and test set.
    train_nid, val_nid, test_nid = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
    t7 = ttt(t6, "-" * 40 + "---------train_nid, val_nid, test_nid = splitted_idx******************")

    def _bool_mask(node_ids):
        # Dense boolean mask over all nodes with node_ids set to True.
        mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
        mask[node_ids] = True
        return mask

    graph.ndata['train_mask'] = _bool_mask(train_nid)
    graph.ndata['val_mask'] = _bool_mask(val_nid)
    graph.ndata['test_mask'] = _bool_mask(test_nid)
    t8 = ttt(t7, "-" * 40 + "---------end of load ogb******************")
    print('finish constructing', name)
    print('load ogb-products time total: ' + str(time.time() - tic_step))
    return graph, num_labels
def load_data(dataset):
    """Load an OGB node dataset and its evaluator; also sets the
    module-level globals n_node_feats and n_classes as a side effect.

    Returns (graph, labels, train_idx, val_idx, test_idx, evaluator)."""
    global n_node_feats, n_classes
    data = DglNodePropPredDataset(name=dataset)
    evaluator = Evaluator(name=dataset)
    split = data.get_idx_split()
    train_idx, val_idx, test_idx = split["train"], split["valid"], split["test"]
    graph, labels = data[0]
    n_node_feats = graph.ndata["feat"].shape[1]
    # Labels are assumed contiguous from 0, so max+1 counts the classes.
    n_classes = (labels.max() + 1).item()
    return graph, labels, train_idx, val_idx, test_idx, evaluator
def __init__(self, dataset_name):
    """Dispatch dataset loading by name: RDF datasets, HIN datasets, or
    ogbn-mag (target node type 'paper').

    Raises:
        ValueError: if the name matches none of the supported datasets.
    """
    super(NodeClassificationDataset, self).__init__()
    if dataset_name in ['aifb', 'mutag', 'bgs', 'am']:
        self.g, self.category, self.num_classes = self.load_RDF_dgl(
            dataset_name)
    elif dataset_name in ['acm', 'imdb', 'acm1', 'academic']:
        self.g, self.category, self.num_classes = self.load_HIN(
            dataset_name)
    elif dataset_name == 'ogbn-mag':
        # BUG FIX: was `dataset_name in 'ogbn-mag'`, a substring test that
        # also matched names such as 'mag' or 'ogb'; use equality instead.
        dataset = DglNodePropPredDataset(name='ogbn-mag')
        split_idx = dataset.get_idx_split()
        self.num_classes = dataset.num_classes
        self.train_idx, self.valid_idx, self.test_idx = split_idx[
            "train"], split_idx["valid"], split_idx["test"]
        # graph: dgl graph object, label: torch tensor of shape (num_nodes, num_tasks)
        self.g, self.label = dataset[0]
        self.category = 'paper'
    else:
        # Previously an unknown name fell through silently, leaving the
        # object half-initialized; fail fast instead.
        raise ValueError('Unsupported dataset: {}'.format(dataset_name))
def load_ogb(dataset):
    """Load ogbn-mag, mirror every relation with a 'rev-*' edge type, and
    attach paper features, labels and boolean split masks.

    Returns:
        The augmented heterograph.

    Raises:
        ValueError: for any dataset name other than 'ogbn-mag'.
    """
    if dataset == 'ogbn-mag':
        dataset = DglNodePropPredDataset(name=dataset)
        split_idx = dataset.get_idx_split()
        train_idx = split_idx["train"]['paper']
        val_idx = split_idx["valid"]['paper']
        test_idx = split_idx["test"]['paper']
        hg_orig, labels = dataset[0]
        # Rebuild the heterograph with a reversed copy of every edge type.
        subgs = {}
        for etype in hg_orig.canonical_etypes:
            u, v = hg_orig.all_edges(etype=etype)
            subgs[etype] = (u, v)
            subgs[(etype[2], 'rev-' + etype[1], etype[0])] = (v, u)
        hg = dgl.heterograph(subgs)
        hg.nodes['paper'].data['feat'] = hg_orig.nodes['paper'].data['feat']
        paper_labels = labels['paper'].squeeze()
        num_rels = len(hg.canonical_etypes)
        num_classes = dataset.num_classes
        print('Number of relations: {}'.format(num_rels))
        print('Number of class: {}'.format(num_classes))
        print('Number of train: {}'.format(len(train_idx)))
        print('Number of valid: {}'.format(len(val_idx)))
        print('Number of test: {}'.format(len(test_idx)))
        # CLEANUP: the original computed num_of_ntype and a category_id
        # lookup loop here; both were unused and have been removed.
        train_mask = th.zeros((hg.number_of_nodes('paper'), ), dtype=th.bool)
        train_mask[train_idx] = True
        val_mask = th.zeros((hg.number_of_nodes('paper'), ), dtype=th.bool)
        val_mask[val_idx] = True
        test_mask = th.zeros((hg.number_of_nodes('paper'), ), dtype=th.bool)
        test_mask[test_idx] = True
        hg.nodes['paper'].data['train_mask'] = train_mask
        hg.nodes['paper'].data['val_mask'] = val_mask
        hg.nodes['paper'].data['test_mask'] = test_mask
        hg.nodes['paper'].data['labels'] = paper_labels
        return hg
    else:
        # BUG FIX: `raise ("...")` raised a bare str, which is itself a
        # TypeError ("exceptions must derive from BaseException"); raise a
        # proper exception type instead.
        raise ValueError("Do not support other ogbn datasets.")
def load_ogb_data(dataset, device):
    """Load an ogbn-* dataset, symmetrize it, normalize self-loops, and
    move graph/features/labels/split indices onto ``device``.

    Returns (graph, features, labels, idx_train, idx_val, idx_test)."""
    from ogb.nodeproppred import DglNodePropPredDataset
    data = DglNodePropPredDataset(name="ogbn-" + dataset, root='data')
    split = data.get_idx_split()
    idx_train, idx_val, idx_test = split["train"], split["valid"], split["test"]
    graph, labels = data[0]
    labels = labels.squeeze()
    # Symmetrize: add the reverse of every edge, then exactly one
    # self-loop per node.
    srcs, dsts = graph.all_edges()
    graph.add_edges(dsts, srcs)
    graph = graph.remove_self_loop().add_self_loop()
    features = graph.ndata['feat']
    # Move everything onto the requested device.
    graph = graph.to(device)
    features = features.to(device)
    labels = labels.to(device)
    idx_train = idx_train.to(device)
    idx_val = idx_val.to(device)
    idx_test = idx_test.to(device)
    return graph, features, labels, idx_train, idx_val, idx_test
def load_ogbn_mag(root: str = None) -> OGBDataset:
    """Load ogbn-mag, mirror every relation with a 'rev-*' edge type, and
    attach paper features, labels and boolean split masks.

    Returns an OGBDataset whose target node type is 'paper'."""
    dataset = DglNodePropPredDataset(name='ogbn-mag', root=root)
    split_idx = dataset.get_idx_split()
    train_idx = split_idx['train']['paper']
    valid_idx = split_idx['valid']['paper']
    test_idx = split_idx['test']['paper']
    hg_original, labels = dataset[0]
    labels = labels['paper'].squeeze()
    num_labels = dataset.num_classes
    # Rebuild the heterograph with a reversed copy of every edge type.
    subgraphs = {}
    for src_type, rel, dst_type in hg_original.canonical_etypes:
        src, dst = hg_original.all_edges(etype=(src_type, rel, dst_type))
        subgraphs[(src_type, rel, dst_type)] = (src, dst)
        subgraphs[(dst_type, f'rev-{rel}', src_type)] = (dst, src)
    hg = dgl.heterograph(subgraphs)
    hg.nodes['paper'].data['feat'] = hg_original.nodes['paper'].data['feat']
    hg.nodes['paper'].data['labels'] = labels
    # Boolean masks marking the train/valid/test papers.
    n_paper = hg.num_nodes('paper')
    for key, idx in (('train_mask', train_idx),
                     ('valid_mask', valid_idx),
                     ('test_mask', test_idx)):
        mask = torch.zeros((n_paper, ), dtype=torch.bool)
        mask[idx] = True
        hg.nodes['paper'].data[key] = mask
    return OGBDataset(hg, num_labels, 'paper')
def load_ogbn_mag(device, add_reverse_edge, reverse_self):
    """Load the ogbn-mag dataset.

    :param device: torch.device — move the graph and tensors here
    :param add_reverse_edge: bool — whether to add reverse edges
    :param reverse_self: bool — whether to also reverse relations whose
        source and destination node types are identical
    :return: (data, g, features, labels, predict_ntype, train_idx,
        val_idx, test_idx, evaluator) with predict_ntype fixed to 'paper'
    """
    data = DglNodePropPredDataset('ogbn-mag', DATA_DIR)
    g, labels = data[0]
    if add_reverse_edge:
        g = add_reverse_edges(g, reverse_self)
    g = g.to(device)
    features = g.nodes['paper'].data['feat']
    labels = labels['paper'].squeeze(dim=1).to(device)
    split = data.get_idx_split()
    train_idx = split['train']['paper'].to(device)
    val_idx = split['valid']['paper'].to(device)
    test_idx = split['test']['paper'].to(device)
    return (data, g, features, labels, 'paper',
            train_idx, val_idx, test_idx, Evaluator(data.name))
def load_dataset(name, device):
    """
    Load dataset and move graph and features to device
    """
    supported = ["ogbn-products", "ogbn-arxiv", "ogbn-mag"]
    if name not in supported:
        raise RuntimeError("Dataset {} is not supported".format(name))
    dataset = DglNodePropPredDataset(name=name)
    split = dataset.get_idx_split()
    train_nid, val_nid, test_nid = split["train"], split["valid"], split["test"]
    g, labels = dataset[0]
    g = g.to(device)
    if name == "ogbn-arxiv":
        # Symmetrize, add self-loops, and force float features.
        g = dgl.add_reverse_edges(g, copy_ndata=True)
        g = dgl.add_self_loop(g)
        g.ndata['feat'] = g.ndata['feat'].float()
    elif name == "ogbn-mag":
        # MAG is a heterogeneous graph. The task is to make prediction for
        # paper nodes
        labels = labels["paper"]
        train_nid = train_nid["paper"]
        val_nid = val_nid["paper"]
        test_nid = test_nid["paper"]
        g = convert_mag_to_homograph(g, device)
    else:
        g.ndata['feat'] = g.ndata['feat'].float()
    n_classes = dataset.num_classes
    labels = labels.squeeze()
    evaluator = get_ogb_evaluator(name)
    print(f"# Nodes: {g.number_of_nodes()}\n"
          f"# Edges: {g.number_of_edges()}\n"
          f"# Train: {len(train_nid)}\n"
          f"# Val: {len(val_nid)}\n"
          f"# Test: {len(test_nid)}\n"
          f"# Classes: {n_classes}")
    return g, labels, n_classes, train_nid, val_nid, test_nid, evaluator
def load_mag(device, args):
    """Load ogbn-mag and attach precomputed embeddings (from
    ``args.use_emb``) for the non-paper node types; paper features are
    randomly projected if their width differs from the author embeddings.

    Returns (g, labels, n_classes, train_nid, val_nid, test_nid)."""
    from ogb.nodeproppred import DglNodePropPredDataset
    path = args.use_emb
    home_dir = os.getenv("HOME")
    dataset = DglNodePropPredDataset(
        name="ogbn-mag",
        root=os.path.join(home_dir, ".ogb", "dataset"))
    g, labels = dataset[0]
    split = dataset.get_idx_split()
    train_nid = split["train"]['paper']
    val_nid = split["valid"]['paper']
    test_nid = split["test"]['paper']
    features = g.nodes['paper'].data['feat']
    # Precomputed embeddings for the node types without input features.
    author_emb = torch.load(os.path.join(path, "author.pt")).float()
    topic_emb = torch.load(os.path.join(path, "field_of_study.pt")).float()
    institution_emb = torch.load(os.path.join(path, "institution.pt")).float()
    g.nodes["author"].data["feat"] = author_emb.to(device)
    g.nodes["institution"].data["feat"] = institution_emb.to(device)
    g.nodes["field_of_study"].data["feat"] = topic_emb.to(device)
    g.nodes["paper"].data["feat"] = features.to(device)
    paper_dim = g.nodes["paper"].data["feat"].shape[1]
    author_dim = g.nodes["author"].data["feat"].shape[1]
    if paper_dim != author_dim:
        # Random linear projection so all node types share one width.
        paper_feat = g.nodes["paper"].data.pop("feat")
        rand_weight = torch.Tensor(paper_dim, author_dim).uniform_(-0.5, 0.5)
        g.nodes["paper"].data["feat"] = torch.matmul(paper_feat,
                                                     rand_weight.to(device))
        print(
            f"Randomly project paper feature from dimension {paper_dim} to {author_dim}"
        )
    labels = labels['paper'].to(device).squeeze()
    n_classes = int(labels.max() - labels.min()) + 1
    train_nid = np.array(train_nid)
    val_nid = np.array(val_nid)
    test_nid = np.array(test_nid)
    return g, labels, n_classes, train_nid, val_nid, test_nid
def load_data(name, ogb_root, device):
    """Load an OGB (products/arxiv) or citation dataset.

    Returns (g, labels, num_classes, train_idx, val_idx, test_idx), all
    on ``device``."""
    if name not in ('ogbn-products', 'ogbn-arxiv'):
        # Citation datasets ship mask tensors; convert them to indices.
        data = load_citation_dataset(name)
        g = data[0].to(device)
        train_idx = g.ndata['train_mask'].nonzero(as_tuple=True)[0]
        val_idx = g.ndata['val_mask'].nonzero(as_tuple=True)[0]
        test_idx = g.ndata['test_mask'].nonzero(as_tuple=True)[0]
        return g, g.ndata['label'], data.num_classes, train_idx, val_idx, test_idx
    data = DglNodePropPredDataset(name, ogb_root)
    g, labels = data[0]
    if name == 'ogbn-arxiv':
        # Symmetrize and standardize the node features for arxiv.
        g = dgl.to_bidirected(g, copy_ndata=True)
        feat = g.ndata['feat']
        g.ndata['feat'] = (feat - feat.mean(dim=0)) / feat.std(dim=0)
    g = g.to(device)
    labels = labels.squeeze(dim=1).to(device)
    split = data.get_idx_split()
    return (g, labels, data.num_classes,
            split['train'].to(device), split['valid'].to(device),
            split['test'].to(device))
argparser.add_argument('--lr', type=float, default=0.001)
argparser.add_argument('--num-workers', type=int, default=8,
                       help="Number of sampling processes. Use 0 for no extra process.")
argparser.add_argument('--save-pred', type=str, default='')
argparser.add_argument('--head', type=int, default=4)
argparser.add_argument('--wd', type=float, default=0)
args = argparser.parse_args()

# Select the training device from the --gpu flag (negative means CPU).
device = th.device('cuda:%d' % args.gpu) if args.gpu >= 0 else th.device('cpu')

# load data
data = DglNodePropPredDataset(name='ogbn-products')
splitted_idx = data.get_idx_split()
train_idx, val_idx, test_idx = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
graph, labels = data[0]
nfeat = graph.ndata.pop('feat').to(device)
labels = labels[:, 0].to(device)

# Normalize self-loops: remove any existing ones, then add exactly one
# per node.
print('Total edges before adding self-loop {}'.format(graph.num_edges()))
graph = graph.remove_self_loop().add_self_loop()
print('Total edges after adding self-loop {}'.format(graph.num_edges()))

in_feats = nfeat.shape[1]
n_classes = (labels.max() + 1).item()

# Create csr/coo/csc formats before launching sampling processes.
# This avoids creating certain formats in each data loader process,
# which saves memory and CPU.
graph.create_formats_()
'field_of_study'] # add types of edges, not used in this work for src_type, etype, dst_type in original_graph.canonical_etypes: graph.edges[(src_type, etype, dst_type)].data['reltype'] = \ original_graph.edges[(src_type, etype, dst_type)].data['reltype'] graph.edges[(dst_type, f'rev_{etype}', src_type)].data['reltype'] = \ original_graph.edges[(src_type, etype, dst_type)].data['reltype'] + len(original_graph.etypes) graph_output_path = '../dataset/OGB_MAG/OGB_MAG.pkl' save_graphs(graph_output_path, graph, labels) print(f"{graph_output_path} writes successfully.") split_idx = dataset.get_idx_split() split_idx = { 'train': { 'paper': split_idx['train']['paper'] }, 'valid': { 'paper': split_idx['valid']['paper'] }, 'test': { 'paper': split_idx['test']['paper'] } } split_idx_output_path = '../dataset/OGB_MAG/OGB_MAG_split_idx.pkl' torch.save(split_idx, split_idx_output_path) print(f"{split_idx_output_path} writes successfully.")
def main():
    """Full-batch GraphSAGE training entry point for OGBN-Arxiv."""
    parser = argparse.ArgumentParser(
        description='OGBN-Arxiv (GraphSAGE Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument("--eval", action='store_true',
                        help='If not set, we will only do the training part.')
    args = parser.parse_args()
    print(args)

    device = torch.device(
        f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu')

    dataset = DglNodePropPredDataset(name='ogbn-arxiv')
    split_idx = dataset.get_idx_split()
    g, labels = dataset[0]
    feats = g.ndata['feat']
    # Symmetrize the citation graph before training.
    g = dgl.to_bidirected(g)
    g = g.int().to(device)
    feats, labels = feats.to(device), labels.to(device)
    train_idx = split_idx['train'].to(device)

    model = GraphSAGE(in_feats=feats.size(-1),
                      hidden_feats=args.hidden_channels,
                      out_feats=dataset.num_classes,
                      num_layers=args.num_layers,
                      dropout=args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-arxiv')
    logger = Logger(args.runs, args)
    dur = []
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            t0 = time.time()
            loss = train(model, g, feats, labels, train_idx, optimizer)
            # Skip the first epochs when averaging the epoch time.
            if epoch >= 3:
                dur.append(time.time() - t0)
                print('Training time/epoch {}'.format(np.mean(dur)))
            if not args.eval:
                continue
            result = test(model, g, feats, labels, split_idx, evaluator)
            logger.add_result(run, result)
            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')
        if args.eval:
            logger.print_statistics(run)
    if args.eval:
        logger.print_statistics()
def main():
    """Entry point: train a base predictor (MLP or linear) on an OGB dataset,
    or — with ``args.pretrain`` — load a saved base model and apply Correct &
    Smooth post-processing to its predictions.

    NOTE(review): this function reads a module-level ``args`` namespace that
    is neither a parameter nor defined here — confirm it is parsed (e.g. under
    ``if __name__ == '__main__':``) before main() is called.
    """
    # check cuda
    device = f'cuda:{args.gpu}' if torch.cuda.is_available() and args.gpu >= 0 else 'cpu'

    # load data
    dataset = DglNodePropPredDataset(name=args.dataset)
    evaluator = Evaluator(name=args.dataset)

    split_idx = dataset.get_idx_split()
    g, labels = dataset[0]  # graph: DGLGraph object, label: torch tensor of shape (num_nodes, num_tasks)

    if args.dataset == 'ogbn-arxiv':
        # arxiv is directed; make it bidirected while keeping node data, then
        # standardize features to zero mean / unit variance per dimension.
        g = dgl.to_bidirected(g, copy_ndata=True)

        feat = g.ndata['feat']
        feat = (feat - feat.mean(0)) / feat.std(0)
        g.ndata['feat'] = feat

    g = g.to(device)
    feats = g.ndata['feat']
    labels = labels.to(device)

    # load masks for train / validation / test
    train_idx = split_idx["train"].to(device)
    valid_idx = split_idx["valid"].to(device)
    test_idx = split_idx["test"].to(device)

    n_features = feats.size()[-1]
    n_classes = dataset.num_classes

    # load model
    if args.model == 'mlp':
        model = MLP(n_features, args.hid_dim, n_classes, args.num_layers, args.dropout)
    elif args.model == 'linear':
        model = MLPLinear(n_features, n_classes)
    else:
        raise NotImplementedError(f'Model {args.model} is not supported.')

    model = model.to(device)
    print(f'Model parameters: {sum(p.numel() for p in model.parameters())}')

    if args.pretrain:
        # Post-processing path: evaluate the saved base model, then run
        # Correct & Smooth on its soft predictions.
        print('---------- Before ----------')
        model.load_state_dict(torch.load(f'base/{args.dataset}-{args.model}.pt'))
        model.eval()

        # .exp() converts the model output to probabilities — assumes the
        # model returns log-softmax scores; TODO confirm against MLP/MLPLinear.
        y_soft = model(feats).exp()

        y_pred = y_soft.argmax(dim=-1, keepdim=True)
        valid_acc = evaluate(y_pred, labels, valid_idx, evaluator)
        test_acc = evaluate(y_pred, labels, test_idx, evaluator)
        print(f'Valid acc: {valid_acc:.4f} | Test acc: {test_acc:.4f}')

        print('---------- Correct & Smoothing ----------')
        cs = CorrectAndSmooth(num_correction_layers=args.num_correction_layers,
                              correction_alpha=args.correction_alpha,
                              correction_adj=args.correction_adj,
                              num_smoothing_layers=args.num_smoothing_layers,
                              smoothing_alpha=args.smoothing_alpha,
                              smoothing_adj=args.smoothing_adj,
                              autoscale=args.autoscale,
                              scale=args.scale)

        # Ground-truth labels from train+valid nodes drive the correction
        # and smoothing steps.
        mask_idx = torch.cat([train_idx, valid_idx])
        y_soft = cs.correct(g, y_soft, labels[mask_idx], mask_idx)
        y_soft = cs.smooth(g, y_soft, labels[mask_idx], mask_idx)

        y_pred = y_soft.argmax(dim=-1, keepdim=True)
        valid_acc = evaluate(y_pred, labels, valid_idx, evaluator)
        test_acc = evaluate(y_pred, labels, test_idx, evaluator)
        print(f'Valid acc: {valid_acc:.4f} | Test acc: {test_acc:.4f}')
    else:
        opt = optim.Adam(model.parameters(), lr=args.lr)

        best_acc = 0
        # Fallback copy in case validation accuracy never improves.
        best_model = copy.deepcopy(model)

        # training
        print('---------- Training ----------')
        for i in range(args.epochs):
            model.train()
            opt.zero_grad()

            logits = model(feats)
            train_loss = F.nll_loss(logits[train_idx], labels.squeeze(1)[train_idx])
            train_loss.backward()

            opt.step()

            model.eval()
            with torch.no_grad():
                logits = model(feats)

                y_pred = logits.argmax(dim=-1, keepdim=True)
                train_acc = evaluate(y_pred, labels, train_idx, evaluator)
                valid_acc = evaluate(y_pred, labels, valid_idx, evaluator)
                print(f'Epoch {i} | Train loss: {train_loss.item():.4f} | Train acc: {train_acc:.4f} | Valid acc {valid_acc:.4f}')

                # Keep a snapshot of the best model by validation accuracy.
                if valid_acc > best_acc:
                    best_acc = valid_acc
                    best_model = copy.deepcopy(model)

        # testing & saving model
        print('---------- Testing ----------')
        best_model.eval()

        logits = best_model(feats)
        y_pred = logits.argmax(dim=-1, keepdim=True)
        test_acc = evaluate(y_pred, labels, test_idx, evaluator)
        print(f'Test acc: {test_acc:.4f}')

        if not os.path.exists('base'):
            os.makedirs('base')

        torch.save(best_model.state_dict(), f'base/{args.dataset}-{args.model}.pt')
def _load_official_data():
    """Load ogbn-arxiv through OGB and return (graph, labels, train/val/test idx)."""
    data = DglNodePropPredDataset(name="ogbn-arxiv")
    splitted_idx = data.get_idx_split()
    graph, labels = data[0]
    return graph, labels, splitted_idx["train"], splitted_idx["valid"], splitted_idx["test"]


def _load_competition_data():
    """Build (graph, labels, train/val/test idx) from the PGL competition CSV dumps."""
    edges = pd.read_csv("dataset/ogbn_arxiv/pgl/edges.csv", header=None, names=["src", "dst"]).values
    graph = dgl.graph((edges[:, 0], edges[:, 1]))

    node_feat = np.load("dataset/ogbn_arxiv/pgl/feat.npy")
    graph.ndata['feat'] = th.from_numpy(node_feat)

    df = pd.read_csv("dataset/ogbn_arxiv/pgl/train.csv")
    node_index = df["nid"].values
    # Vectorized label fill (was a Python loop with per-element pandas
    # indexing); unlabeled nodes keep label 0. Equivalent because read_csv
    # yields a default RangeIndex, so positional and label indexing agree.
    labels = np.zeros(node_feat.shape[0], dtype=int)
    labels[node_index] = df["label"].values
    labels = th.from_numpy(labels).reshape((len(labels), 1))

    # All labeled nodes are used for training; the last 20% double as the
    # validation (and test) set. The held-out-train variant is kept below,
    # commented out, as in the original.
    train_part = int(len(node_index) * 0.8)
    #train_idx = th.from_numpy(node_index[:train_part])
    train_idx = th.from_numpy(node_index)
    val_idx = th.from_numpy(node_index[train_part:])
    test_idx = val_idx
    # test_idx = th.from_numpy(pd.read_csv("dataset/ogbn_arxiv/pgl/test.csv")["nid"].values)
    return graph, labels, train_idx, val_idx, test_idx


def main():
    """Entry point: train GAT on ogbn-arxiv for ``--n-runs`` runs.

    Loads either the official OGB split or the competition CSV data
    (``--competition``), symmetrizes the graph, adds self-loops, then runs
    training via the external ``run`` helper, saving per-run softmax outputs
    under ``../models/arxiv_gat``.
    """
    global device, in_feats, n_classes, epsilon
    argparser = argparse.ArgumentParser("GAT on OGBN-Arxiv", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    argparser.add_argument("--cpu", action="store_true", help="CPU mode. This option overrides --gpu.")
    argparser.add_argument("--gpu", type=int, default=0, help="GPU device ID.")
    argparser.add_argument("--n-runs", type=int, default=10)
    argparser.add_argument("--n-epochs", type=int, default=2000)
    argparser.add_argument(
        "--use-labels", action="store_true", help="Use labels in the training set as input features."
    )
    argparser.add_argument("--use-norm", action="store_true", help="Use symmetrically normalized adjacency matrix.")
    argparser.add_argument("--lr", type=float, default=0.002)
    argparser.add_argument("--n-layers", type=int, default=3)
    argparser.add_argument("--n-heads", type=int, default=3)
    argparser.add_argument("--n-hidden", type=int, default=256)
    argparser.add_argument("--dropout", type=float, default=0.75)
    argparser.add_argument("--attn_drop", type=float, default=0.05)
    argparser.add_argument("--wd", type=float, default=0)
    argparser.add_argument("--log-every", type=int, default=20)
    argparser.add_argument("--plot-curves", action="store_true")
    argparser.add_argument("--competition", action="store_true")
    args = argparser.parse_args()

    if args.cpu:
        device = th.device("cpu")
    else:
        device = th.device("cuda:%d" % args.gpu)

    # load data (the evaluator is identical in both branches, so build it once)
    evaluator = Evaluator(name="ogbn-arxiv")
    if not args.competition:
        graph, labels, train_idx, val_idx, test_idx = _load_official_data()
    else:
        graph, labels, train_idx, val_idx, test_idx = _load_competition_data()

    # add reverse edges (symmetrize the citation graph)
    srcs, dsts = graph.all_edges()
    graph.add_edges(dsts, srcs)

    # add self-loop
    print(f"Total edges before adding self-loop {graph.number_of_edges()}")
    graph = graph.remove_self_loop().add_self_loop()
    print(f"Total edges after adding self-loop {graph.number_of_edges()}")

    in_feats = graph.ndata["feat"].shape[1]
    n_classes = (labels.max() + 1).item()
    # graph.create_format_()

    train_idx = train_idx.to(device)
    val_idx = val_idx.to(device)
    test_idx = test_idx.to(device)
    labels = labels.to(device)
    graph = graph.to(device)

    # run
    val_accs = []
    test_accs = []

    # Fresh output directory per invocation; previous results are discarded.
    model_dir = '../models/arxiv_gat'
    if os.path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.makedirs(model_dir)

    with open(f'{model_dir}/metadata', 'w') as f:
        f.write(f'# of params: {sum(p.numel() for p in gen_model(args).parameters())}\n')

    for i in range(1, args.n_runs + 1):
        val_acc, test_acc, out = run(args, graph, labels, train_idx, val_idx, test_idx, evaluator, i)
        val_accs.append(val_acc)
        test_accs.append(test_acc)
        # Persist this run's class probabilities for later ensembling.
        th.save(F.softmax(out, dim=1), f'{model_dir}/{i-1}.pt')

    print(f"Ran {args.n_runs} times")
    print("Val Accs:", val_accs)
    print("Test Accs:", test_accs)
    print(f"Average val accuracy: {np.mean(val_accs)} ± {np.std(val_accs)}")
    print(f"Average test accuracy: {np.mean(test_accs)} ± {np.std(test_accs)}")
    print(f"Number of params: {count_parameters(args)}")