def load_ogbl(name, device=th.device('cpu'), root='/home/eva_share_users/zhuyu'):
    """Load an ogbl-* dataset and normalise its edge split to (N, 2) tensors.

    For ogbl-citation2 the raw split stores flat ``source_node`` /
    ``target_node`` vectors plus an (N, K) matrix of negative targets; this
    repacks them into the two-column ``'edge'`` / ``'neg_edge'`` layout used
    by the other ogbl datasets.

    Args:
        name: OGB dataset name (e.g. 'ogbl-citation2').
        device: currently unused; kept for backward compatibility with callers.
        root: directory where the dataset is downloaded/cached.

    Returns:
        (graph, splitted_idx): the training DGL graph and the split dict.
    """
    from ogb.linkproppred import DglLinkPropPredDataset

    print('load', name)
    data = DglLinkPropPredDataset(name=name, root=root)
    print('finish loading', name)
    splitted_idx = data.get_edge_split()

    if name == 'ogbl-citation2':
        def _pack(src, dst):
            # Stack two 1-D node-id vectors into an (N, 2) edge tensor.
            return th.cat((src.unsqueeze(1), dst.unsqueeze(1)), dim=1)

        for split in ('train', 'valid', 'test'):
            part = splitted_idx[split]
            part['edge'] = _pack(part['source_node'], part['target_node'])
            # Only valid/test carry negatives; train has none.
            if 'target_node_neg' in part:
                neg = part['target_node_neg']
                n_neg = neg.size(1)  # was hard-coded to 1000
                # BUG FIX: the original used source_node.repeat(n_neg), which
                # tiles the whole vector ([s0..sN, s0..sN, ...]) and therefore
                # pairs source i with the negatives of a *different* source
                # after the row-major flatten of target_node_neg.
                # repeat_interleave keeps each source aligned with its own
                # K negative targets.
                part['neg_edge'] = _pack(
                    part['source_node'].repeat_interleave(n_neg),
                    neg.reshape(-1))

    graph = data[0]
    return graph, splitted_idx
def __init__(self, name):
        """Load ogbl-collab, build edge features, and cache split + evaluator."""
        t0 = time.time()
        print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglLinkPropPredDataset(name='ogbl-collab')
        # The dataset wraps exactly one DGL graph.
        self.graph = self.dataset[0]
        # Edge features are [weight, year] concatenated along the feature axis.
        self.graph.edata['feat'] = torch.cat(
            [self.graph.edata['edge_weight'], self.graph.edata['edge_year']],
            dim=1,
        )
        split = self.dataset.get_edge_split()
        self.split_edge = split
        # Positive edges for each split, plus the negatives for valid/test.
        self.train_edges = split['train']['edge']
        self.val_edges = split['valid']['edge']
        self.val_edges_neg = split['valid']['edge_neg']
        self.test_edges = split['test']['edge']
        self.test_edges_neg = split['test']['edge_neg']
        self.evaluator = Evaluator(name='ogbl-collab')
        print("[I] Finished loading.")
        print("[I] Data load time: {:.4f}s".format(time.time()-t0))
def load_ogb_dataset(dataset):
    """
    Load OGB dataset

    Args:
        dataset(str): name of dataset (ogbl-collab, ogbl-ddi, ogbl-citation)

    Returns:
        graph(DGLGraph): graph
        split_edge(dict): split edge
    """
    ds = DglLinkPropPredDataset(name=dataset)
    edge_split = ds.get_edge_split()
    g = ds[0]
    return g, edge_split
class DDIDataset(Dataset):
    """ogbl-ddi wrapper: caches the graph, the edge split and an OGB evaluator."""

    def __init__(self, name):
        t_start = time.time()
        print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglLinkPropPredDataset(name='ogbl-ddi')
        # The dataset wraps a single DGL graph.
        self.graph = self.dataset[0]
        self.split_edge = self.dataset.get_edge_split()
        # Positive edges for each split; negatives only exist for valid/test.
        self.train_edges = self.split_edge['train']['edge']
        self.val_edges = self.split_edge['valid']['edge']
        self.val_edges_neg = self.split_edge['valid']['edge_neg']
        self.test_edges = self.split_edge['test']['edge']
        self.test_edges_neg = self.split_edge['test']['edge_neg']
        self.evaluator = Evaluator(name='ogbl-ddi')
        print("[I] Finished loading.")
        print("[I] Data load time: {:.4f}s".format(time.time()-t_start))

    def _add_positional_encodings(self, pos_enc_dim):
        # Graph positional encoding via Laplacian eigenvectors.
        self.graph = positional_encoding(self.graph, pos_enc_dim)
def prepare_train_labels() -> Tuple[dgl.DGLHeteroGraph, Tensor, Tensor, Tensor]:
    """Build a symmetrised ogbl-collab training graph plus labelled edge sets.

    Returns:
        (train_graph, train_labels, valid_labels, test_labels) where
        train_labels is the positive training edges with a constant column of
        ones appended, and valid/test labels come from get_label_from_split.
    """
    dataset = DglLinkPropPredDataset(name="ogbl-collab")
    split_edge = dataset.get_edge_split()
    train_edge, valid_edge, test_edge = (
        split_edge["train"], split_edge["valid"], split_edge["test"])
    graph: dgl.DGLGraph = dataset[0]

    pos = train_edge["edge"]
    # Insert every training edge in both directions so the graph is undirected.
    src = torch.cat([pos[:, 0], pos[:, 1]], dim=0)
    dst = torch.cat([pos[:, 1], pos[:, 0]], dim=0)
    train_graph = dgl.graph(data=(src, dst), num_nodes=graph.number_of_nodes())
    train_graph.ndata["feat"] = graph.ndata["feat"]

    # Positive training edges get an all-ones label column appended.
    train_labels = torch.cat([pos, torch.ones(len(pos), 1).long()], dim=1)
    valid_labels = get_label_from_split(valid_edge)
    test_labels = get_label_from_split(test_edge)
    return train_graph, train_labels, valid_labels, test_labels
def prepare_ogb(name):
    """Load an OGB link dataset and sample train/valid/test triples.

    Returns:
        (train_data, valid_data, test_data, num_nodes, num_rels)
    """
    dataset = DglLinkPropPredDataset(name)
    split_edge = dataset.get_edge_split()
    train_edge = split_edge["train"]
    valid_edge = split_edge["valid"]
    test_edge = split_edge["test"]
    # dgl graph object containing only training edges
    g = dataset[0]

    train_data = sample_data(1500000, train_edge, sampling=True).numpy()
    valid_data = sample_data(50000, valid_edge)
    test_data = sample_data(50000, test_edge)

    num_nodes = g.number_of_nodes()
    num_rels = len(torch.unique(train_edge['relation']))
    del g  # free the graph before returning the (large) sampled arrays
    return train_data, valid_data, test_data, num_nodes, num_rels
def __init__(self, name):
        """Load ogbl-ddi and cache graph, edge split and evaluator."""
        t0 = time.time()
        print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglLinkPropPredDataset(name='ogbl-ddi')
        # Single DGL graph wrapped by the dataset.
        self.graph = self.dataset[0]
        split = self.dataset.get_edge_split()
        self.split_edge = split
        # Positive edges per split; negatives exist only for valid/test.
        self.train_edges = split['train']['edge']
        self.val_edges = split['valid']['edge']
        self.val_edges_neg = split['valid']['edge_neg']
        self.test_edges = split['test']['edge']
        self.test_edges_neg = split['test']['edge_neg']
        self.evaluator = Evaluator(name='ogbl-ddi')
        print("[I] Finished loading.")
        print("[I] Data load time: {:.4f}s".format(time.time()-t0))
def load_OGB(dataset):
    """Load an OGB dataset by short name ('mag', 'biokg' or 'wikikg').

    'mag' returns the node-property dataset object; for the link datasets the
    split and training graph are computed locally.
    """
    if dataset == 'mag':
        return DglNodePropPredDataset(name='ogbn-mag')
    elif dataset in ['biokg', 'wikikg']:
        d_name = 'ogbl-' + dataset
        dataset = DglLinkPropPredDataset(name=d_name)
        split_edge = dataset.get_edge_split()
        train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
        # dgl graph object containing only training edges
        graph = dataset[0]
        # NOTE(review): this branch builds graph/splits but falls off the end
        # and returns None — confirm whether a return statement was lost.
# NOTE(review): this chunk begins mid-statement (inside a call whose opening
# lies outside this view) and is truncated mid-expression at the end; tokens
# are preserved as-is, only layout and comments were added.
        os.path.join(log_dir, "%s.log" % (dataset_name + "_" + setting_name))
    ])
# Script-level setup for an ogbl-citation run: snapshot directory, source
# snapshotting, evaluator and dataset loading, then graph augmentation.
snapshot_dir = os.path.join(log_dir, "snapshot")
if not os.path.isdir(snapshot_dir):
    os.makedirs(snapshot_dir)
print("Process Id:", os.getpid())
print(os.path.join(log_dir, sys.argv[0]))
print(args)
# Copy the running script and the model file into the log dir for reproducibility.
shutil.copyfile(__file__, os.path.join(log_dir, "train.py"))
shutil.copyfile(model_file + ".py", os.path.join(log_dir, model_file + ".py"))
evaluator = Evaluator(name="ogbl-citation")
print(evaluator.expected_input_format)
print(evaluator.expected_output_format)
dataset = DglLinkPropPredDataset(name="ogbl-citation")
split_edge = dataset.get_edge_split()
num_worker = 16
train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
graph = dataset[0]
# Keep an untouched copy before mutating the graph in place.
origin_graph = copy.deepcopy(graph)
graph.readonly(False)
# Add reverse edges (making the citation graph undirected), then self-loops.
graph.add_edges(graph.edges()[1], graph.edges()[0])
graph.add_edges(
    torch.arange(0, graph.number_of_nodes()).long(),
    torch.arange(0, graph.number_of_nodes()).long())
graph.edata["etype"] = torch.cat([
    torch.ones((graph.number_of_edges() - graph.number_of_nodes()) // 2).long(),
    (torch.ones(
# NOTE(review): chunk is truncated here, mid-expression.
def main():
    """Train and evaluate a Cluster-GCN link predictor on an OGB link dataset.

    Parses CLI arguments, partitions the graph into clusters, and runs
    `args.runs` independent training runs, reporting MRR via `test()` every
    `args.eval_steps` epochs along with per-phase timing statistics.
    """
    parser = argparse.ArgumentParser(
        description='Link prediction (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--dataset', type=str, default='ogbl-citation')
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--negs', type=int, default=1)
    parser.add_argument('--gnn_type', type=str, default='gcn')
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = DglLinkPropPredDataset(name=args.dataset)
    # BUG FIX: split_edge was never assigned in the original but is passed to
    # test() below, which would raise NameError at the first evaluation.
    split_edge = dataset.get_edge_split()

    # Manually add self-loop link since GCN will wash out the feature of
    # isolated nodes.
    g_data = dgl.add_self_loop(dataset[0])
    g_data = dgl.to_bidirected(g_data)
    # to_bidirected drops node data; copy the attributes back over.
    for k in dataset[0].node_attr_schemes().keys():
        g_data.ndata[k] = dataset[0].ndata[k]
    print(g_data.number_of_nodes(), g_data.number_of_edges())
    g_data.create_formats_()

    cluster_dataset = ClusterIterDataset(
        args.dataset, g_data, args.num_partitions, use_pp=False)
    cluster_iterator = DataLoader(
        cluster_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=args.num_workers,
        collate_fn=partial(subgraph_collate_fn, g_data, negs=args.negs))

    model = GCN(g_data.ndata['feat'].size(-1), args.hidden_channels,
                args.hidden_channels, args.num_layers, args.dropout,
                gnn_type=args.gnn_type).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)
    evaluator = Evaluator(name=args.dataset)
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(predictor.parameters()),
            lr=args.lr)
        # Per-phase timing accumulators, reset after every report.
        epoch_time, to_device_time, ff_time, pred_loss_time, bp_time, io_time, memory, part_1 = 0, 0, 0, 0, 0, 0, 0, 0
        for epoch in range(1, 1 + args.epochs):
            loss, c_epoch_time, c_to_device_time, c_ff_time, c_pred_loss_time, c_bp_time, c_io_time, c_memory, c_part1 = train(
                model, predictor, cluster_iterator, optimizer, device)
            print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}')
            epoch_time += c_epoch_time
            to_device_time += c_to_device_time
            ff_time += c_ff_time
            pred_loss_time += c_pred_loss_time
            bp_time += c_bp_time
            io_time += c_io_time
            part_1 += c_part1
            memory = max(memory, c_memory[0])
            if epoch % args.eval_steps == 0:
                print('Ave')
                print('epoch time: ', epoch_time / args.eval_steps)
                print('to_device_time: ', to_device_time / args.eval_steps)
                print('ff_time: ', ff_time / args.eval_steps)
                print('part1_time: ', part_1 / args.eval_steps)
                print('pred_loss_time: ', pred_loss_time / args.eval_steps)
                print('bp_time: ', bp_time / args.eval_steps)
                print('io_time: ', io_time / args.eval_steps)
                print('max memory', memory)
                print('\n')
                epoch_time, to_device_time, ff_time, pred_loss_time, bp_time, io_time, memory, part_1 = 0, 0, 0, 0, 0, 0, 0, 0
                result = test(model, predictor, g_data, split_edge, evaluator,
                              64 * 4 * args.batch_size, device)
                logger.add_result(run, result)
                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {train_mrr:.4f}, '
                          f'Valid: {valid_mrr:.4f}, '
                          f'Test: {test_mrr:.4f}')
        logger.print_statistics(run)
    logger.print_statistics()
def load_from_ogbl_with_name(name):
    """Return the (single) DGL graph of a supported ogbl dataset.

    Raises:
        AssertionError: if ``name`` is not one of the supported datasets.
    """
    choices = ['ogbl-collab', 'ogbl-ddi', 'ogbl-ppa', 'ogbl-citation']
    assert name in choices, "name must be selected from " + str(choices)
    graph = DglLinkPropPredDataset(name)[0]
    return graph
# NOTE(review): notebook-style fragment; it ends inside an unterminated
# triple-quoted narrative string. Tokens preserved, only layout/comments added.
# Random train/test split of positive edges (first 50 edge ids held out).
eids = np.arange(graph.number_of_edges())
eids = np.random.permutation(eids)
test_pos_u, test_pos_v = u[eids[:50]], v[eids[:50]]
train_pos_u, train_pos_v = u[eids[50:]], v[eids[50:]]
# Find all negative edges and split them for training and testing.
adj = sp.coo_matrix((np.ones(len(u)), (u, v)))
# 34 here presumably matches this toy graph's node count — TODO confirm.
adj_neg = 1 - adj.todense() - np.eye(34)
neg_u, neg_v = np.where(adj_neg != 0)
neg_eids = np.random.choice(len(neg_u), 200)
test_neg_u, test_neg_v = neg_u[neg_eids[:50]], neg_v[neg_eids[:50]]
train_neg_u, train_neg_v = neg_u[neg_eids[50:]], neg_v[neg_eids[50:]]
# Code from OGB instructions
# You may need to use DGL backend, otherwise PyTorch will be used.
dataset = DglLinkPropPredDataset(name='ogbl-collab')
graph = dataset[0]
split_edge = dataset.get_edge_split()
train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], \
    split_edge["test"]
"""
Building models

I stack different layers together, here is the full list of supported layers
in Tensorflow https://docs.dgl.ai/api/python/nn.tensorflow.html,
PyTorch https://docs.dgl.ai/api/python/nn.pytorch.html, and
MXNet https://docs.dgl.ai/api/python/nn.mxnet.html backends

Below I'm using GraphSage (SAmple and aggreGatE) -- this aggregate feature
information from a node's local neighborhood (e.g., the degrees or text
attributes of nearby nodes)

Paper: https://arxiv.org/pdf/1706.02216.pdf
Blog: https://towardsdatascience.com/an-intuitive-explanation-of-graphsage-6df9437ee64f
class COLLABDataset(Dataset):
    """ogbl-collab wrapper that also attaches Laplacian eigenvector features.

    NOTE(review): reconstructed from whitespace-collapsed source; statement
    grouping (e.g. which prints sit under ``verbose``) is a best-effort guess.
    """

    def __init__(self, name, norm='none', verbose=True):
        start = time.time()
        if verbose:
            print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglLinkPropPredDataset(name='ogbl-collab')
        self.graph = self.dataset[0]  # single DGL graph
        #self._add_positional_encodings(10, norm)
        self._add_eig(norm=norm, number=6)
        # Create edge feat by concatenating weight and year
        self.graph.edata['feat'] = torch.cat(
            [self.graph.edata['edge_weight'], self.graph.edata['edge_year']],
            dim=1
        )
        self.split_edge = self.dataset.get_edge_split()
        self.train_edges = self.split_edge['train']['edge']  # positive train edges
        self.val_edges = self.split_edge['valid']['edge']  # positive val edges
        self.val_edges_neg = self.split_edge['valid']['edge_neg']  # negative val edges
        self.test_edges = self.split_edge['test']['edge']  # positive test edges
        self.test_edges_neg = self.split_edge['test']['edge_neg']  # negative test edges
        self.evaluator = Evaluator(name='ogbl-collab')
        if verbose:
            print("[I] Finished loading.")
            print("[I] Data load time: {:.4f}s".format(time.time() - start))

    def _add_positional_encodings(self, pos_enc_dim, norm):
        # Graph positional encoding via Laplacian eigenvectors.
        self.graph = positional_encoding(self.graph, pos_enc_dim, norm)

    def _add_eig(self, norm='none', number=6):
        """Attach per-node eigenvector features ('eig'), computed separately
        on each connected component.

        235868 is hard-coded here; presumably the ogbl-collab node count —
        TODO confirm against the dataset.
        """
        dataset = LinkPropPredDataset(name='ogbl-collab')
        graph = dataset[0]
        # Rebuild the graph in networkx to enumerate connected components.
        G = nx.Graph()
        G.add_nodes_from([i for i in range(235868)])
        for nod1, nod2 in zip(graph['edge_index'][0], graph['edge_index'][1]):
            G.add_edge(nod1, nod2)
        components = list(nx.connected_components(G))
        list_G = []
        list_nodes = []
        for component in components:
            G_new = nx.Graph()
            G_new.add_nodes_from(list(component))
            list_G.append(G_new)
            list_nodes.append(list(component))
        # Copy each component's edges into its own subgraph.
        for i in range(len(list_G)):
            for nod1, nod2 in list(G.edges(list_nodes[i])):
                list_G[i].add_edge(nod1, nod2)
        # Default feature is all-ones for components too small to decompose.
        EigVec_global = np.ones((235868, number))
        for g in list_G:
            node_list = list(g.nodes)
            A = nx.adjacency_matrix(g, nodelist=node_list).astype(float)
            # Unnormalised, symmetric, or random-walk Laplacian.
            if norm == 'none':
                D = sp.diags(list(map(lambda x: x[1], g.degree())))
                L = D - A
            elif norm == 'sym':
                D_norm = sp.diags(list(map(lambda x: x[1]**(-0.5), g.degree())))
                D = sp.diags(list(map(lambda x: x[1], g.degree())))
                L = D_norm * (D - A) * D_norm
            elif norm == 'walk':
                D_norm = sp.diags(list(map(lambda x: x[1]**(-1), g.degree())))
                D = sp.diags(list(map(lambda x: x[1], g.degree())))
                L = D_norm * (D - A)
            if len(node_list) > 2:
                # Smallest-real eigenpairs of the component Laplacian, then
                # sorted by eigenvalue and column-normalised by the max entry.
                EigVal, EigVec = sp.linalg.eigs(
                    L, k=min(len(node_list) - 2, number), which='SR', tol=0)
                EigVec = EigVec[:, EigVal.argsort()] / np.max(EigVec[:, EigVal.argsort()], 0)
                EigVec_global[node_list, : min(len(node_list) - 2, number)] = EigVec[:, :]
            elif len(node_list) == 2:
                EigVec_global[node_list[0], :number] = np.zeros((1, number))
        self.graph.ndata['eig'] = torch.from_numpy(EigVec_global).float()
        print(sorted(self.graph.ndata['eig'][1]))
def test_datasetsaver():
    """Round-trip test for DatasetSaver: load an OGB dataset, save it through
    DatasetSaver, then reload it from the saved meta_dict with every backend.

    NOTE(review): reconstructed from whitespace-collapsed source; layout only,
    tokens unchanged.
    """
    # test on graph classification
    # ogbg-molhiv
    test_task = 'link'

    # testing all the dataset objects are working.
    if test_task == 'graph':
        from ogb.graphproppred import PygGraphPropPredDataset, DglGraphPropPredDataset, GraphPropPredDataset
        dataset_name = 'ogbg-molhiv'
        dataset = PygGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = GraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'node':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-arxiv'  # test ogbn-proteins
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'link':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-collab'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    elif test_task == 'heteronode':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-mag'
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'heterolink':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-biokg'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    else:
        raise ValueError('Invalid task category')

    print(dataset[0])
    if 'link' in test_task:
        print(dataset.get_edge_split())
    else:
        print(dataset.get_idx_split())

    if 'graph' in test_task:
        graph_list = dataset.graphs
    else:
        graph_list = [dataset.graph]

    # Link tasks have no labels to save.
    if 'link' not in test_task:
        labels = dataset.labels

    is_hetero = 'hetero' in test_task
    version = 2 if dataset_name == 'ogbn-mag' else 1
    saver = DatasetSaver(dataset_name, is_hetero, version=version)

    # saving graph objects
    saver.save_graph_list(graph_list)
    # saving target labels
    if 'link' not in test_task:
        saver.save_target_labels(labels)
    # saving split
    if 'link' in test_task:
        split_idx = dataset.get_edge_split()
    else:
        split_idx = dataset.get_idx_split()
    # second argument must be the name of the split
    saver.save_split(split_idx, dataset.meta_info['split'])
    # copying mapping dir
    # saver.copy_mapping_dir(f"dataset/{'_'.join(dataset_name.split('-'))}/mapping/")
    saver.copy_mapping_dir("dataset/{}/mapping/".format('_'.join(
        dataset_name.split('-'))))
    saver.save_task_info(
        dataset.task_type, dataset.eval_metric,
        dataset.num_classes if hasattr(dataset, 'num_classes') else None)
    meta_dict = saver.get_meta_dict()
    print(meta_dict)

    # Reload everything from the saved meta_dict with each backend.
    # (Each dataset is constructed twice in the original — kept as-is.)
    print('Now testing.')
    if 'graph' in test_task:
        print('library agnostic')
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
    elif 'node' in test_task:
        print('library agnostic')
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
    elif 'link' in test_task:
        print('library agnostic')
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('Pytorch Geometric')
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('DGL')
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
    else:
        raise ValueError('Invalid task category')

    # zip
    saver.zip()
    print('Finished zipping!')
    saver.cleanup()
def main():
    """Train a GNN link predictor on ogbl-collab with TensorBoard logging.

    NOTE(review): reconstructed from whitespace-collapsed source; the nesting
    of the logging / scheduler.step block inside the eval branch is a
    best-effort guess. Tokens unchanged.
    """
    parser = argparse.ArgumentParser(description='OGBL-COLLAB (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=1)
    parser.add_argument('--gnn_type', type=str, default='gated-gcn')
    parser.add_argument('--num_layer', type=int, default=3)
    parser.add_argument('--emb_dim', type=int, default=64)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=32*1024)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=200)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)
    print(device)

    dataset = DglLinkPropPredDataset(name='ogbl-collab')
    split_edge = dataset.get_edge_split()
    data = dataset[0]
    print(data)

    model = GNN(gnn_type=args.gnn_type,
                emb_dim=args.emb_dim,
                num_layer=args.num_layer,
                dropout=args.dropout).to(device)
    print(model)
    # Count trainable parameters of the encoder.
    total_param = 0
    for param in model.parameters():
        total_param += np.prod(list(param.data.size()))
    print(f'Model parameters: {total_param}')

    predictor = LinkPredictor(emb_dim=args.emb_dim).to(device)
    print(predictor)
    total_param = 0
    for param in predictor.parameters():
        total_param += np.prod(list(param.data.size()))
    print(f'Predictor parameters: {total_param}')

    evaluator = Evaluator(name='ogbl-collab')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }
    tb_logger = SummaryWriter(
        os.path.join(
            "logs",
            f"{args.gnn_type}-L{args.num_layer}-h{args.emb_dim}-d{args.dropout}-LR{args.lr}",
            time.strftime("%Y%m%dT%H%M%S")
        )
    )

    for run in range(args.runs):
        # Only a single run is supported (parameters are not re-initialised).
        assert args.runs == 1
        # model.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(predictor.parameters()),
            lr=args.lr)
        # LR decays on plateau of validation Hits@10; training stops once the
        # learning rate bottoms out (see break below).
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', factor=0.5, patience=5, min_lr=1e-5,
            verbose=True)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, split_edge, optimizer,
                         args.batch_size, device)
            if epoch % args.eval_steps == 0:
                results = test(model, predictor, data, split_edge, evaluator,
                               args.batch_size, device)
                for key, result in results.items():
                    loggers[key].add_result(run, result)
                if epoch % args.log_steps == 0:
                    tb_logger.add_scalar('loss', loss, epoch)
                    tb_logger.add_scalar('lr', optimizer.param_groups[0]['lr'], epoch)
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                        tb_logger.add_scalar(f'{key}/train_hits', 100 * train_hits, epoch)
                        tb_logger.add_scalar(f'{key}/valid_hits', 100 * valid_hits, epoch)
                        tb_logger.add_scalar(f'{key}/test_hits', 100 * test_hits, epoch)
                    print('---')
                # Step on the validation Hits@10 (index 1 = valid).
                scheduler.step(100 * results["Hits@10"][1])
                if optimizer.param_groups[0]['lr'] < 1e-5:
                    break
        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)
def evaluate(model, edge_split, device, num_workers):
    """Compute MRR on the valid and test splits of ogbl-citation2.

    Runs full-graph layer-wise inference (embeddings kept on CPU), then
    scores each split's (source, target, negative-targets) triples.
    Returns [valid_mrr_result, test_mrr_result].
    """
    with torch.no_grad():
        # inference(...) arguments: graph, device, batch size 4096, workers,
        # and 'cpu' as the storage device for the produced embeddings.
        node_emb = model.inference(graph, device, 4096, num_workers, 'cpu')
        results = []
        for split in ['valid', 'test']:
            src = edge_split[split]['source_node'].to(device)
            dst = edge_split[split]['target_node'].to(device)
            neg_dst = edge_split[split]['target_node_neg'].to(device)
            results.append(
                compute_mrr(model, node_emb, src, dst, neg_dst, device))
        return results

# Script-level setup: load ogbl-citation2, symmetrise the graph, build the
# GraphSAGE model and the edge-prediction sampler.
# NOTE(review): this chunk is truncated mid-call at the end; tokens preserved.
dataset = DglLinkPropPredDataset('ogbl-citation2')
graph = dataset[0]
graph, reverse_eids = to_bidirected_with_reverse_mapping(graph)
graph = graph.to('cuda' if args.pure_gpu else 'cpu')
reverse_eids = reverse_eids.to(device)
seed_edges = torch.arange(graph.num_edges()).to(device)
edge_split = dataset.get_edge_split()
model = SAGE(graph.ndata['feat'].shape[1], 256).to(device)
opt = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
sampler = dgl.dataloading.NeighborSampler([15, 10, 5],
                                          prefetch_node_feats=['feat'])
sampler = dgl.dataloading.as_edge_prediction_sampler(
    sampler, exclude='reverse_id',
# -*- coding: utf-8 -*- # @Time : 2020-10-20 15:39 # @Author : xiaorui su # @Email : [email protected] # @File : load_kg.py # @Software : PyCharm d_name = "ogbl-biokg" import dgl import numpy as np import torch as th from ogb.linkproppred import DglLinkPropPredDataset import random dataset = DglLinkPropPredDataset(name=d_name) print(dataset[0]) split_edge = dataset.get_edge_split() #print(split_edge) #five kinds of nodes: drug, protein,function,disease train_edge, valid_edge, test_edge = split_edge["train"], split_edge[ "valid"], split_edge["test"] graph = dataset[0] # dgl graph object containing only training edges #head_type,head,relation,tail_type,tail #ids: head,relation,tail def construct_entity(): ##construct entity2id.txt for biokg # 10687 disease_entity = "dataset/ogbl_biokg/mapping/disease_entidx2name.csv"
def main():
    """Full-batch training of a GCN/GraphSAGE link predictor on ogbl-ppa.

    NOTE(review): reconstructed from whitespace-collapsed source; layout only,
    tokens unchanged.
    """
    parser = argparse.ArgumentParser(description='OGBL-PPA (Full-Batch)')
    parser.add_argument('--use_gpu', action='store_true',
                        help='Use gpu for computation (default: False)')
    parser.add_argument(
        '--log_steps', type=int, default=1,
        help='Print training progress every {log_steps} epochs (default: 1)')
    parser.add_argument('--use_sage', action='store_true',
                        help='Use GraphSAGE rather than GCN (default: False)')
    parser.add_argument(
        '--num_layers', type=int, default=3,
        help='Number of GNN layers to use as well as '
        'linear layers to use for final link prediction (default: 3)')
    parser.add_argument('--hidden_feats', type=int, default=256,
                        help='Size for hidden representations (default: 256)')
    parser.add_argument('--dropout', type=float, default=0.0,
                        help='Dropout (default: 0.0)')
    parser.add_argument(
        '--batch_size', type=int, default=64 * 1024,
        help='Batch size to use for link prediction (default: 64 * 1024)')
    parser.add_argument('--lr', type=float, default=0.01,
                        help='Learning rate (default: 0.01)')
    parser.add_argument('--epochs', type=int, default=20,
                        help='Number of epochs for training (default: 20)')
    parser.add_argument(
        '--eval_steps', type=int, default=1,
        help='Evaluate hits@100 every {eval_steps} epochs (default: 1)')
    parser.add_argument(
        '--runs', type=int, default=10,
        help='Number of random experiments to perform (default: 10)')
    args = parser.parse_args()
    print(args)

    if args.use_gpu and torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    dataset = DglLinkPropPredDataset(name='ogbl-ppa')
    # Get DGLGraph
    data = dataset[0]
    data.readonly(False)
    # Add a self-loop on every node.
    data.add_edges(data.nodes(), data.nodes())
    data = data.to(device)
    splitted_edge = dataset.get_edge_split()
    x = data.ndata['feat'].float().to(device)

    if args.use_sage:
        model = GraphSAGE(
            in_feats=x.size(-1),
            hidden_feats=[args.hidden_feats for _ in range(args.num_layers)],
            activation=[F.relu for _ in range(args.num_layers - 1)] + [None],
            dropout=[0] + [args.dropout for _ in
                           range(args.num_layers - 1)]).to(device)
    else:
        model = GCN(
            in_feats=x.size(-1),
            hidden_feats=[args.hidden_feats for _ in range(args.num_layers)],
            activation=[F.relu for _ in range(args.num_layers - 1)] + [None],
            residual=[False for _ in range(args.num_layers)],
            batchnorm=[False for _ in range(args.num_layers)],
            dropout=[args.dropout for _ in range(args.num_layers - 1)] + [0]).to(device)

    predictor = HadamardLinkPredictor(in_feats=args.hidden_feats,
                                      hidden_feats=args.hidden_feats,
                                      num_layers=args.num_layers,
                                      n_tasks=1,
                                      dropout=args.dropout).to(device)
    evaluator = Evaluator(name='ogbl-ppa')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(predictor.parameters()),
            lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, x, splitted_edge, optimizer,
                         args.batch_size)
            if epoch % args.eval_steps == 0:
                results = test(model, predictor, data, x, splitted_edge,
                               evaluator, args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)
                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
        # Per-run statistics.
        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    # Aggregate statistics across all runs.
    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()