class COLLABDataset(Dataset):
    """Wrapper around the OGB ogbl-collab link-prediction dataset.

    Loads the single DGL graph, builds an edge feature by concatenating
    the edge weight and edge year, and exposes the positive/negative
    edge splits plus the official evaluator.
    """

    def __init__(self, name):
        t_start = time.time()
        print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglLinkPropPredDataset(name='ogbl-collab')
        self.graph = self.dataset[0]  # the single DGL graph

        # Edge feature = [weight, year], concatenated along the feature axis.
        weight = self.graph.edata['edge_weight']
        year = self.graph.edata['edge_year']
        self.graph.edata['feat'] = torch.cat([weight, year], dim=1)

        split = self.dataset.get_edge_split()
        self.split_edge = split
        self.train_edges = split['train']['edge']        # positive train edges
        self.val_edges = split['valid']['edge']          # positive val edges
        self.val_edges_neg = split['valid']['edge_neg']  # negative val edges
        self.test_edges = split['test']['edge']          # positive test edges
        self.test_edges_neg = split['test']['edge_neg']  # negative test edges

        self.evaluator = Evaluator(name='ogbl-collab')
        print("[I] Finished loading.")
        print("[I] Data load time: {:.4f}s".format(time.time()-t_start))

    def _add_positional_encodings(self, pos_enc_dim):
        # Graph positional encoding v/ Laplacian eigenvectors
        self.graph = positional_encoding(self.graph, pos_enc_dim)
class DDIDataset(Dataset):
    """Wrapper around the OGB ogbl-ddi link-prediction dataset.

    Loads the single DGL graph and exposes the positive/negative edge
    splits plus the official evaluator.
    """

    def __init__(self, name):
        t_start = time.time()
        print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglLinkPropPredDataset(name='ogbl-ddi')
        self.graph = self.dataset[0]  # the single DGL graph

        split = self.dataset.get_edge_split()
        self.split_edge = split
        self.train_edges = split['train']['edge']        # positive train edges
        self.val_edges = split['valid']['edge']          # positive val edges
        self.val_edges_neg = split['valid']['edge_neg']  # negative val edges
        self.test_edges = split['test']['edge']          # positive test edges
        self.test_edges_neg = split['test']['edge_neg']  # negative test edges

        self.evaluator = Evaluator(name='ogbl-ddi')
        print("[I] Finished loading.")
        print("[I] Data load time: {:.4f}s".format(time.time()-t_start))

    def _add_positional_encodings(self, pos_enc_dim):
        # Graph positional encoding v/ Laplacian eigenvectors
        self.graph = positional_encoding(self.graph, pos_enc_dim)
def load_ogbl(name, device=th.device('cpu'), root='/home/eva_share_users/zhuyu'):
    """Load an OGB link-property-prediction dataset and its edge split.

    Args:
        name (str): OGB dataset name, e.g. 'ogbl-collab' or 'ogbl-citation2'.
        device: unused here; kept for interface compatibility with callers.
        root (str): download/cache directory for the dataset.

    Returns:
        tuple: (graph, splitted_idx) — the DGL graph and the split dict.
        For 'ogbl-citation2' the source/target node columns are packed into
        (E, 2) 'edge' tensors (and 'neg_edge' for valid/test) so the dict
        matches the layout of the other ogbl datasets.
    """
    from ogb.linkproppred import DglLinkPropPredDataset
    print('load', name)
    data = DglLinkPropPredDataset(name=name, root=root)
    print('finish loading', name)
    splitted_idx = data.get_edge_split()
    if name == 'ogbl-citation2':
        # citation2 ships 'source_node'/'target_node' columns instead of a
        # packed 'edge' tensor; build (E, 2) tensors for all three splits.
        for split in ('train', 'valid', 'test'):
            splitted_idx[split]['edge'] = th.stack(
                (splitted_idx[split]['source_node'],
                 splitted_idx[split]['target_node']), dim=1)
        for split in ('valid', 'test'):
            # target_node_neg has shape (N, 1000): row i holds the 1000
            # negative targets of source i. Flattening row-major keeps each
            # source's negatives contiguous, so the sources must be expanded
            # with repeat_interleave. BUGFIX: the previous .repeat(1000)
            # tiled the whole source vector and mis-paired sources with
            # other sources' negatives.
            src = splitted_idx[split]['source_node'].repeat_interleave(1000)
            neg = splitted_idx[split]['target_node_neg'].reshape(-1)
            splitted_idx[split]['neg_edge'] = th.stack((src, neg), dim=1)
    graph = data[0]
    return graph, splitted_idx
def load_ogb_dataset(dataset):
    """Load an OGB link-prediction dataset.

    Args:
        dataset (str): name of dataset (ogbl-collab, ogbl-ddi, ogbl-citation)

    Returns:
        graph (DGLGraph): the dataset's graph
        split_edge (dict): the train/valid/test edge split
    """
    ogb_data = DglLinkPropPredDataset(name=dataset)
    return ogb_data[0], ogb_data.get_edge_split()
def prepare_train_labels() -> Tuple[dgl.DGLHeteroGraph, Tensor, Tensor, Tensor]:
    """Build an undirected training graph and labelled edge sets for ogbl-collab.

    Returns:
        (train_graph, train_labels, valid_labels, test_labels) — the
        symmetrised training graph with node features attached, the positive
        training edges tagged with label 1 in a third column, and the
        valid/test labels produced by get_label_from_split.
    """
    dataset = DglLinkPropPredDataset(name="ogbl-collab")
    split_edge = dataset.get_edge_split()
    train_edge = split_edge["train"]
    valid_edge = split_edge["valid"]
    test_edge = split_edge["test"]
    graph: dgl.DGLGraph = dataset[0]

    # Symmetrise the positive training edges: each (u, v) also appears as (v, u).
    pos_edges = train_edge["edge"]
    src = torch.cat([pos_edges[:, 0], pos_edges[:, 1]], dim=0)
    dst = torch.cat([pos_edges[:, 1], pos_edges[:, 0]], dim=0)
    train_graph = dgl.graph(data=(src, dst), num_nodes=graph.number_of_nodes())
    train_graph.ndata["feat"] = graph.ndata["feat"]

    # Positive training pairs with a constant label 1 appended as a column.
    ones = torch.ones(len(pos_edges), 1).long()
    train_labels = torch.cat([pos_edges, ones], dim=1)

    valid_labels = get_label_from_split(valid_edge)
    test_labels = get_label_from_split(test_edge)
    return train_graph, train_labels, valid_labels, test_labels
def prepare_ogb(name):
    """Load an OGB link dataset and sample train/valid/test triples.

    Args:
        name (str): OGB dataset name.

    Returns:
        (train_data, valid_data, test_data, num_nodes, num_rels) — sampled
        edge arrays plus the node count and distinct relation count.
    """
    dataset = DglLinkPropPredDataset(name)
    split_edge = dataset.get_edge_split()
    train_edge = split_edge["train"]
    valid_edge = split_edge["valid"]
    test_edge = split_edge["test"]
    graph = dataset[0]  # DGL graph containing only training edges

    train_data = sample_data(1500000, train_edge, sampling=True).numpy()
    valid_data = sample_data(50000, valid_edge)
    test_data = sample_data(50000, test_edge)

    num_nodes = graph.number_of_nodes()
    num_rels = len(torch.unique(train_edge['relation']))
    del graph  # only the counts are needed past this point; free the graph
    return train_data, valid_data, test_data, num_nodes, num_rels
class COLLABDataset(Dataset):
    """ogbl-collab wrapper that attaches per-component Laplacian eigenvector
    features ('eig') to the nodes of the DGL graph.

    Edge features are the concatenation of edge weight and edge year; the
    positive/negative edge splits and the official evaluator are exposed
    as attributes.
    """

    def __init__(self, name, norm='none', verbose=True):
        start = time.time()
        if verbose:
            print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglLinkPropPredDataset(name='ogbl-collab')
        self.graph = self.dataset[0]  # single DGL graph
        self._add_eig(norm=norm, number=6)
        # Edge feature = [weight, year], concatenated along the feature axis.
        self.graph.edata['feat'] = torch.cat(
            [self.graph.edata['edge_weight'], self.graph.edata['edge_year']],
            dim=1
        )
        self.split_edge = self.dataset.get_edge_split()
        self.train_edges = self.split_edge['train']['edge']        # positive train edges
        self.val_edges = self.split_edge['valid']['edge']          # positive val edges
        self.val_edges_neg = self.split_edge['valid']['edge_neg']  # negative val edges
        self.test_edges = self.split_edge['test']['edge']          # positive test edges
        self.test_edges_neg = self.split_edge['test']['edge_neg']  # negative test edges
        self.evaluator = Evaluator(name='ogbl-collab')
        if verbose:
            print("[I] Finished loading.")
            print("[I] Data load time: {:.4f}s".format(time.time() - start))

    def _add_positional_encodings(self, pos_enc_dim, norm):
        # Graph positional encoding v/ Laplacian eigenvectors
        self.graph = positional_encoding(self.graph, pos_enc_dim, norm)

    def _add_eig(self, norm='none', number=6):
        """Compute the `number` smallest Laplacian eigenvectors of every
        connected component and store them as node feature 'eig'.

        Args:
            norm (str): Laplacian normalization — 'none' (combinatorial),
                'sym' (symmetric), or 'walk' (random-walk).
            number (int): number of eigenvector columns to keep.
        """
        dataset = LinkPropPredDataset(name='ogbl-collab')
        graph = dataset[0]
        # Use the graph's actual node count rather than the previously
        # hard-coded 235868 (that constant only holds for ogbl-collab).
        num_nodes = self.graph.number_of_nodes()

        G = nx.Graph()
        G.add_nodes_from(range(num_nodes))
        G.add_edges_from(zip(graph['edge_index'][0], graph['edge_index'][1]))

        # Nodes whose component is too small to solve keep the default 1.0.
        EigVec_global = np.ones((num_nodes, number))
        for component in nx.connected_components(G):
            g = G.subgraph(component)
            node_list = list(g.nodes)
            n = len(node_list)
            # Adjacency rows follow node_list order; degrees below iterate
            # g.nodes in the same order, so the matrices are aligned.
            A = nx.adjacency_matrix(g, nodelist=node_list).astype(float)
            degrees = [d for _, d in g.degree()]
            D = sp.diags(degrees)
            if norm == 'none':
                L = D - A
            elif norm == 'sym':
                D_norm = sp.diags([d ** (-0.5) for d in degrees])
                L = D_norm * (D - A) * D_norm
            elif norm == 'walk':
                D_norm = sp.diags([d ** (-1) for d in degrees])
                L = D_norm * (D - A)
            if n > 2:
                # eigs needs k < n - 1; sort ascending by eigenvalue and
                # normalize each column by its maximum entry.
                k = min(n - 2, number)
                EigVal, EigVec = sp.linalg.eigs(L, k=k, which='SR', tol=0)
                order = EigVal.argsort()
                EigVec = EigVec[:, order] / np.max(EigVec[:, order], 0)
                # eigs returns complex vectors; the imaginary part is ~0 for
                # these Laplacians and was previously discarded implicitly
                # (with a ComplexWarning) — take .real explicitly.
                EigVec_global[node_list, :k] = EigVec.real
            elif n == 2:
                # NOTE(review): only the FIRST node of a 2-node component is
                # zeroed (the second keeps the default 1.0) — preserved as-is,
                # but confirm this asymmetry is intended.
                EigVec_global[node_list[0], :number] = np.zeros((1, number))
        self.graph.ndata['eig'] = torch.from_numpy(EigVec_global).float()
])
# --- experiment bookkeeping -------------------------------------------------
snapshot_dir = os.path.join(log_dir, "snapshot")
if not os.path.isdir(snapshot_dir):
    os.makedirs(snapshot_dir)
print("Process Id:", os.getpid())
print(os.path.join(log_dir, sys.argv[0]))
print(args)
# Snapshot the training script and the model definition next to the logs so
# this exact run can be reproduced later.
shutil.copyfile(__file__, os.path.join(log_dir, "train.py"))
shutil.copyfile(model_file + ".py", os.path.join(log_dir, model_file + ".py"))

# NOTE(review): 'ogbl-citation' is the deprecated name of this dataset
# (superseded by 'ogbl-citation2' in OGB) — confirm which version is intended.
evaluator = Evaluator(name="ogbl-citation")
print(evaluator.expected_input_format)
print(evaluator.expected_output_format)
dataset = DglLinkPropPredDataset(name="ogbl-citation")
split_edge = dataset.get_edge_split()
num_worker = 16
train_edge, valid_edge, test_edge = split_edge["train"], split_edge[
    "valid"], split_edge["test"]
graph = dataset[0]
# Keep an untouched copy before mutating the graph in place below.
origin_graph = copy.deepcopy(graph)
graph.readonly(False)
# Add the reverse of every edge, then a self-loop on every node.
graph.add_edges(graph.edges()[1], graph.edges()[0])
graph.add_edges(
    torch.arange(0, graph.number_of_nodes()).long(),
    torch.arange(0, graph.number_of_nodes()).long())
# Tag edge types: the two halves of the non-self-loop edges get types 1 and 2
# (presumably original vs. reverse edges — confirm against the model); the
# torch.cat continues past the end of this chunk (self-loops likely follow).
graph.edata["etype"] = torch.cat([
    torch.ones(
        (graph.number_of_edges() - graph.number_of_nodes()) // 2).long(),
    (torch.ones(
        (graph.number_of_edges() - graph.number_of_nodes()) // 2) * 2).long(),
def _parse_args():
    """Parse command-line options for the full-batch OGBL-PPA run."""
    parser = argparse.ArgumentParser(description='OGBL-PPA (Full-Batch)')
    parser.add_argument('--use_gpu', action='store_true',
                        help='Use gpu for computation (default: False)')
    parser.add_argument(
        '--log_steps', type=int, default=1,
        help='Print training progress every {log_steps} epochs (default: 1)')
    parser.add_argument('--use_sage', action='store_true',
                        help='Use GraphSAGE rather than GCN (default: False)')
    parser.add_argument(
        '--num_layers', type=int, default=3,
        help='Number of GNN layers to use as well as '
             'linear layers to use for final link prediction (default: 3)')
    parser.add_argument('--hidden_feats', type=int, default=256,
                        help='Size for hidden representations (default: 256)')
    parser.add_argument('--dropout', type=float, default=0.0,
                        help='Dropout (default: 0.0)')
    parser.add_argument(
        '--batch_size', type=int, default=64 * 1024,
        help='Batch size to use for link prediction (default: 64 * 1024)')
    parser.add_argument('--lr', type=float, default=0.01,
                        help='Learning rate (default: 0.01)')
    parser.add_argument('--epochs', type=int, default=20,
                        help='Number of epochs for training (default: 20)')
    parser.add_argument(
        '--eval_steps', type=int, default=1,
        help='Evaluate hits@100 every {eval_steps} epochs (default: 1)')
    parser.add_argument(
        '--runs', type=int, default=10,
        help='Number of random experiments to perform (default: 10)')
    return parser.parse_args()


def main():
    """Train and evaluate a full-batch GNN link predictor on ogbl-ppa."""
    args = _parse_args()
    print(args)

    use_cuda = args.use_gpu and torch.cuda.is_available()
    device = torch.device('cuda:0' if use_cuda else 'cpu')

    dataset = DglLinkPropPredDataset(name='ogbl-ppa')
    data = dataset[0]  # the single DGLGraph
    data.readonly(False)
    data.add_edges(data.nodes(), data.nodes())  # add a self-loop per node
    data = data.to(device)
    splitted_edge = dataset.get_edge_split()
    x = data.ndata['feat'].float().to(device)

    n_layers = args.num_layers
    hidden = [args.hidden_feats for _ in range(n_layers)]
    acts = [F.relu for _ in range(n_layers - 1)] + [None]
    if args.use_sage:
        model = GraphSAGE(
            in_feats=x.size(-1),
            hidden_feats=hidden,
            activation=acts,
            dropout=[0] + [args.dropout for _ in range(n_layers - 1)]).to(device)
    else:
        model = GCN(
            in_feats=x.size(-1),
            hidden_feats=hidden,
            activation=acts,
            residual=[False] * n_layers,
            batchnorm=[False] * n_layers,
            dropout=[args.dropout for _ in range(n_layers - 1)] + [0]).to(device)

    predictor = HadamardLinkPredictor(in_feats=args.hidden_feats,
                                      hidden_feats=args.hidden_feats,
                                      num_layers=args.num_layers,
                                      n_tasks=1,
                                      dropout=args.dropout).to(device)
    evaluator = Evaluator(name='ogbl-ppa')
    loggers = {key: Logger(args.runs, args)
               for key in ('Hits@10', 'Hits@50', 'Hits@100')}

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        params = list(model.parameters()) + list(predictor.parameters())
        optimizer = torch.optim.Adam(params, lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, x, splitted_edge,
                         optimizer, args.batch_size)
            if epoch % args.eval_steps != 0:
                continue  # evaluate only every eval_steps epochs
            results = test(model, predictor, data, x, splitted_edge,
                           evaluator, args.batch_size)
            for key, result in results.items():
                loggers[key].add_result(run, result)
            if epoch % args.log_steps == 0:
                for key, result in results.items():
                    train_hits, valid_hits, test_hits = result
                    print(key)
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {100 * train_hits:.2f}%, '
                          f'Valid: {100 * valid_hits:.2f}%, '
                          f'Test: {100 * test_hits:.2f}%')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
def main():
    """Train and evaluate a GNN link predictor on ogbl-collab with
    TensorBoard logging and plateau-based learning-rate decay."""
    parser = argparse.ArgumentParser(description='OGBL-COLLAB (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=1)
    parser.add_argument('--gnn_type', type=str, default='gated-gcn')
    parser.add_argument('--num_layer', type=int, default=3)
    parser.add_argument('--emb_dim', type=int, default=64)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=32*1024)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=200)
    args = parser.parse_args()
    print(args)

    device_str = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device_str)
    print(device)

    dataset = DglLinkPropPredDataset(name='ogbl-collab')
    split_edge = dataset.get_edge_split()
    data = dataset[0]
    print(data)

    model = GNN(gnn_type=args.gnn_type,
                emb_dim=args.emb_dim,
                num_layer=args.num_layer,
                dropout=args.dropout).to(device)
    print(model)
    total_param = sum(np.prod(list(p.data.size())) for p in model.parameters())
    print(f'Model parameters: {total_param}')

    predictor = LinkPredictor(emb_dim=args.emb_dim).to(device)
    print(predictor)
    total_param = sum(np.prod(list(p.data.size())) for p in predictor.parameters())
    print(f'Predictor parameters: {total_param}')

    evaluator = Evaluator(name='ogbl-collab')
    loggers = {key: Logger(args.runs, args)
               for key in ('Hits@10', 'Hits@50', 'Hits@100')}

    run_name = f"{args.gnn_type}-L{args.num_layer}-h{args.emb_dim}-d{args.dropout}-LR{args.lr}"
    tb_logger = SummaryWriter(
        os.path.join("logs", run_name, time.strftime("%Y%m%dT%H%M%S")))

    for run in range(args.runs):
        assert args.runs == 1  # parameter resetting between runs is not wired up
        # model.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(predictor.parameters()), lr=args.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', factor=0.5, patience=5,
            min_lr=1e-5, verbose=True)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, split_edge, optimizer,
                         args.batch_size, device)
            if epoch % args.eval_steps != 0:
                continue  # evaluate (and schedule) only on eval epochs
            results = test(model, predictor, data, split_edge, evaluator,
                           args.batch_size, device)
            for key, result in results.items():
                loggers[key].add_result(run, result)
            if epoch % args.log_steps == 0:
                tb_logger.add_scalar('loss', loss, epoch)
                tb_logger.add_scalar('lr', optimizer.param_groups[0]['lr'], epoch)
                for key, result in results.items():
                    train_hits, valid_hits, test_hits = result
                    print(key)
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {100 * train_hits:.2f}%, '
                          f'Valid: {100 * valid_hits:.2f}%, '
                          f'Test: {100 * test_hits:.2f}%')
                    tb_logger.add_scalar(f'{key}/train_hits', 100 * train_hits, epoch)
                    tb_logger.add_scalar(f'{key}/valid_hits', 100 * valid_hits, epoch)
                    tb_logger.add_scalar(f'{key}/test_hits', 100 * test_hits, epoch)
                print('---')
            # Drive LR decay from the validation Hits@10 score.
            scheduler.step(100 * results["Hits@10"][1])
            if optimizer.param_groups[0]['lr'] < 1e-5:
                break

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)