def main(args_test):
    """Embed an entire dataset with a frozen, pretrained graph encoder.

    Loads a checkpoint from ``args_test.load_path``, rebuilds the encoder with
    the hyper-parameters stored in the checkpoint (``checkpoint["opt"]``),
    runs one full pass over the dataset via ``test_moco`` and saves the
    resulting embeddings as a ``.npy`` file under ``args.model_folder``.

    Raises:
        FileNotFoundError: if ``args_test.load_path`` is not a file.
    """
    if os.path.isfile(args_test.load_path):
        print("=> loading checkpoint '{}'".format(args_test.load_path))
        checkpoint = torch.load(args_test.load_path, map_location="cpu")
        print(
            "=> loaded successfully '{}' (epoch {})".format(
                args_test.load_path, checkpoint["epoch"]
            )
        )
    else:
        print("=> no checkpoint found at '{}'".format(args_test.load_path))
        # Fix: the original fell through here and crashed with a NameError on
        # `checkpoint` below; fail fast with a clear error instead.
        raise FileNotFoundError(
            "no checkpoint found at '{}'".format(args_test.load_path)
        )

    # Reuse the training-time hyper-parameters saved in the checkpoint, but
    # override the device/GPU choice with the test-time arguments.
    args = checkpoint["opt"]

    assert args_test.gpu is None or torch.cuda.is_available()
    print("Use GPU: {} for generation".format(args_test.gpu))
    args.gpu = args_test.gpu
    args.device = torch.device("cpu") if args.gpu is None else torch.device(args.gpu)

    if args_test.dataset in GRAPH_CLASSIFICATION_DSETS:
        train_dataset = GraphClassificationDataset(
            dataset=args_test.dataset,
            rw_hops=args.rw_hops,
            subgraph_size=args.subgraph_size,
            restart_prob=args.restart_prob,
            positional_embedding_size=args.positional_embedding_size,
        )
    else:
        train_dataset = NodeClassificationDataset(
            dataset=args_test.dataset,
            rw_hops=args.rw_hops,
            subgraph_size=args.subgraph_size,
            restart_prob=args.restart_prob,
            positional_embedding_size=args.positional_embedding_size,
        )

    # One big batch: embed the whole dataset in a single forward pass.
    args.batch_size = len(train_dataset)
    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=args.batch_size,
        collate_fn=batcher(),
        shuffle=False,
        num_workers=args.num_workers,
    )

    # Rebuild the encoder exactly as it was configured at training time.
    model = GraphEncoder(
        positional_embedding_size=args.positional_embedding_size,
        max_node_freq=args.max_node_freq,
        max_edge_freq=args.max_edge_freq,
        max_degree=args.max_degree,
        freq_embedding_size=args.freq_embedding_size,
        degree_embedding_size=args.degree_embedding_size,
        output_dim=args.hidden_size,
        node_hidden_dim=args.hidden_size,
        edge_hidden_dim=args.hidden_size,
        num_layers=args.num_layer,
        num_step_set2set=args.set2set_iter,
        num_layer_set2set=args.set2set_lstm_layer,
        gnn_model=args.model,
        norm=args.norm,
        degree_input=True,
    )

    model = model.to(args.device)
    model.load_state_dict(checkpoint["model"])
    del checkpoint  # release the (potentially large) state dict early

    emb = test_moco(train_loader, model, args)
    print(os.path.join(args.model_folder, args_test.dataset))
    np.save(os.path.join(args.model_folder, args_test.dataset), emb.numpy())
if __name__ == "__main__":
    # Smoke-test script: build a LoadBalanceGraphDataset + DataLoader and walk
    # a few batches, printing process memory usage (GiB) at each stage.
    num_workers = 1
    import psutil

    def _log_mem_gb():
        # Print current used system memory in GiB.
        print(psutil.virtual_memory().used / 1024 ** 3)

    _log_mem_gb()
    graph_dataset = LoadBalanceGraphDataset(
        num_workers=num_workers, aug="ns", rw_hops=4, num_neighbors=5
    )
    _log_mem_gb()

    graph_loader = torch.utils.data.DataLoader(
        graph_dataset,
        batch_size=1,
        collate_fn=data_util.batcher(),
        num_workers=num_workers,
        worker_init_fn=worker_init_fn,
    )
    _log_mem_gb()

    for _, batch in enumerate(graph_loader):
        # Report batch shape/size, then the memory footprint after loading it.
        print("bs", batch[0].batch_size)
        print("n=", batch[0].number_of_nodes())
        print("m=", batch[0].number_of_edges())
        _log_mem_gb()
        print(batch[0].ndata["pos_undirected"])
def main(args):
    """Train the GCC graph encoder: MoCo-style pretraining or finetuning.

    When ``args.resume`` points to a checkpoint, its stored options replace
    the current ones (a handful of run-specific fields are carried over) and
    model/contrast weights are restored before training. With
    ``args.finetune`` set, a linear output layer is trained on top of the
    encoder using a stratified 10-fold split; otherwise contrastive
    pretraining runs via ``train_moco``.

    Returns:
        The validation F1 after the last epoch when finetuning, else None.
    """
    dgl.random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location="cpu")
            # Adopt the pretraining hyper-parameters, but keep the fields that
            # belong to *this* run (fold, gpu, dataset, epochs, ...).
            pretrain_args = checkpoint["opt"]
            pretrain_args.fold_idx = args.fold_idx
            pretrain_args.gpu = args.gpu
            pretrain_args.finetune = args.finetune
            pretrain_args.resume = args.resume
            pretrain_args.cv = args.cv
            pretrain_args.dataset = args.dataset
            pretrain_args.epochs = args.epochs
            pretrain_args.num_workers = args.num_workers
            if args.dataset in GRAPH_CLASSIFICATION_DSETS:
                # HACK for speeding up finetuning on graph classification tasks
                pretrain_args.num_workers = 0
            pretrain_args.batch_size = args.batch_size
            args = pretrain_args
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    args = option_update(args)
    print(args)
    assert args.gpu is not None and torch.cuda.is_available()
    print("Use GPU: {} for training".format(args.gpu))

    # Positional embeddings are built from sin/cos pairs, hence must be even.
    assert args.positional_embedding_size % 2 == 0
    print("setting random seeds")

    mem = psutil.virtual_memory()
    print("before construct dataset", mem.used / 1024**3)

    if args.finetune:
        if args.dataset in GRAPH_CLASSIFICATION_DSETS:
            dataset = GraphClassificationDatasetLabeled(
                dataset=args.dataset,
                rw_hops=args.rw_hops,
                subgraph_size=args.subgraph_size,
                restart_prob=args.restart_prob,
                positional_embedding_size=args.positional_embedding_size,
            )
            labels = dataset.dataset.data.y.tolist()
        else:
            dataset = NodeClassificationDatasetLabeled(
                dataset=args.dataset,
                rw_hops=args.rw_hops,
                subgraph_size=args.subgraph_size,
                restart_prob=args.restart_prob,
                positional_embedding_size=args.positional_embedding_size,
            )
            labels = dataset.data.y.argmax(dim=1).tolist()

        # Stratified 10-fold CV; pick the fold requested by args.fold_idx.
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=args.seed)
        idx_list = []
        for idx in skf.split(np.zeros(len(labels)), labels):
            idx_list.append(idx)
        assert (
            0 <= args.fold_idx and args.fold_idx < 10
        ), "fold_idx must be from 0 to 9."
        train_idx, test_idx = idx_list[args.fold_idx]
        train_dataset = torch.utils.data.Subset(dataset, train_idx)
        valid_dataset = torch.utils.data.Subset(dataset, test_idx)
    elif args.dataset == "dgl":
        train_dataset = LoadBalanceGraphDataset(
            rw_hops=args.rw_hops,
            restart_prob=args.restart_prob,
            positional_embedding_size=args.positional_embedding_size,
            num_workers=args.num_workers,
            num_samples=args.num_samples,
            dgl_graphs_file="./data/small.bin",
            num_copies=args.num_copies,
        )
    else:
        if args.dataset in GRAPH_CLASSIFICATION_DSETS:
            train_dataset = GraphClassificationDataset(
                dataset=args.dataset,
                rw_hops=args.rw_hops,
                subgraph_size=args.subgraph_size,
                restart_prob=args.restart_prob,
                positional_embedding_size=args.positional_embedding_size,
            )
        else:
            train_dataset = NodeClassificationDataset(
                dataset=args.dataset,
                rw_hops=args.rw_hops,
                subgraph_size=args.subgraph_size,
                restart_prob=args.restart_prob,
                positional_embedding_size=args.positional_embedding_size,
            )

    mem = psutil.virtual_memory()
    print("before construct dataloader", mem.used / 1024**3)
    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=args.batch_size,
        collate_fn=labeled_batcher() if args.finetune else batcher(),
        shuffle=True if args.finetune else False,
        num_workers=args.num_workers,
        worker_init_fn=None
        if args.finetune or args.dataset != "dgl"
        else worker_init_fn,
    )
    if args.finetune:
        valid_loader = torch.utils.data.DataLoader(
            dataset=valid_dataset,
            batch_size=args.batch_size,
            collate_fn=labeled_batcher(),
            num_workers=args.num_workers,
        )
    mem = psutil.virtual_memory()
    print("before training", mem.used / 1024**3)

    # create model and optimizer
    # n_data = train_dataset.total
    n_data = None

    # Two identical encoders: the online model and its momentum (EMA) copy.
    model, model_ema = [
        GraphEncoder(
            positional_embedding_size=args.positional_embedding_size,
            max_node_freq=args.max_node_freq,
            max_edge_freq=args.max_edge_freq,
            max_degree=args.max_degree,
            freq_embedding_size=args.freq_embedding_size,
            degree_embedding_size=args.degree_embedding_size,
            output_dim=args.hidden_size,
            node_hidden_dim=args.hidden_size,
            edge_hidden_dim=args.hidden_size,
            num_layers=args.num_layer,
            num_step_set2set=args.set2set_iter,
            num_layer_set2set=args.set2set_lstm_layer,
            norm=args.norm,
            gnn_model=args.model,
            degree_input=True,
        )
        for _ in range(2)
    ]

    # copy weights from `model' to `model_ema'
    if args.moco:
        moment_update(model, model_ema, 0)

    # set the contrast memory and criterion
    contrast = MemoryMoCo(
        args.hidden_size, n_data, args.nce_k, args.nce_t, use_softmax=True
    ).cuda(args.gpu)

    if args.finetune:
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = NCESoftmaxLoss() if args.moco else NCESoftmaxLossNS()
    criterion = criterion.cuda(args.gpu)

    model = model.cuda(args.gpu)
    model_ema = model_ema.cuda(args.gpu)

    if args.finetune:
        output_layer = nn.Linear(
            in_features=args.hidden_size, out_features=dataset.num_classes
        )
        output_layer = output_layer.cuda(args.gpu)
        output_layer_optimizer = torch.optim.Adam(
            output_layer.parameters(),
            lr=args.learning_rate,
            betas=(args.beta1, args.beta2),
            weight_decay=args.weight_decay,
        )

        def clear_bn(m):
            # Reset BatchNorm running statistics so finetuning re-estimates
            # them on the downstream dataset.
            classname = m.__class__.__name__
            if classname.find("BatchNorm") != -1:
                m.reset_running_stats()

        model.apply(clear_bn)

    if args.optimizer == "sgd":
        optimizer = torch.optim.SGD(
            model.parameters(),
            lr=args.learning_rate,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
        )
    elif args.optimizer == "adam":
        optimizer = torch.optim.Adam(
            model.parameters(),
            lr=args.learning_rate,
            betas=(args.beta1, args.beta2),
            weight_decay=args.weight_decay,
        )
    elif args.optimizer == "adagrad":
        optimizer = torch.optim.Adagrad(
            model.parameters(),
            lr=args.learning_rate,
            lr_decay=args.lr_decay_rate,
            weight_decay=args.weight_decay,
        )
    else:
        raise NotImplementedError

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        # NOTE(review): `checkpoint` is only defined when args.resume was a
        # valid file above — an invalid path would raise NameError here.
        model.load_state_dict(checkpoint["model"])
        contrast.load_state_dict(checkpoint["contrast"])
        if args.moco:
            model_ema.load_state_dict(checkpoint["model_ema"])
        print(
            "=> loaded successfully '{}' (epoch {})".format(
                args.resume, checkpoint["epoch"]
            )
        )
        del checkpoint
        torch.cuda.empty_cache()

    sw = SummaryWriter(args.tb_folder)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):
        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        if args.finetune:
            loss, _ = train_finetune(
                epoch,
                train_loader,
                model,
                output_layer,
                criterion,
                optimizer,
                output_layer_optimizer,
                sw,
                args,
            )
        else:
            loss = train_moco(
                epoch,
                train_loader,
                model,
                model_ema,
                contrast,
                criterion,
                optimizer,
                sw,
                args,
            )
        time2 = time.time()
        print("epoch {}, total time {:.2f}".format(epoch, time2 - time1))

        # saving the model
        # Fix: the original built `state` twice per epoch and wrote
        # ckpt_epoch_{epoch}.pth twice whenever epoch % save_freq == 0.
        # Build the state once, always refresh current.pth, and write the
        # per-epoch checkpoint a single time when it is due.
        print("==> Saving...")
        state = {
            "opt": args,
            "model": model.state_dict(),
            "contrast": contrast.state_dict(),
            "optimizer": optimizer.state_dict(),
            "epoch": epoch,
        }
        if args.moco:
            state["model_ema"] = model_ema.state_dict()
        save_file = os.path.join(args.model_folder, "current.pth")
        torch.save(state, save_file)
        if epoch % args.save_freq == 0:
            save_file = os.path.join(
                args.model_folder, "ckpt_epoch_{epoch}.pth".format(epoch=epoch)
            )
            torch.save(state, save_file)
        # help release GPU memory
        del state
        torch.cuda.empty_cache()

    if args.finetune:
        valid_loss, valid_f1 = test_finetune(
            epoch, valid_loader, model, output_layer, criterion, sw, args
        )
        return valid_f1
def main(args):
    """Parity-check a Paddle port of GCC against a reference PyTorch run.

    Builds the dataset/encoder exactly like the training entry point, loads a
    hard-coded PyTorch checkpoint (``torch_models/ckpt_epoch_100.pth``) plus a
    dump of reference activations/gradients, converts the weights with
    ``paddorch``, runs one contrastive step on two fixed samples and prints
    the max absolute differences for features, logits, loss and per-parameter
    gradients. Supports CPU-only runs via ``args.gpu < 0``.
    """
    dgl.random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.gpu >= 0:
        torch.cuda.manual_seed(args.seed)

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location="cpu")
            # Adopt pretraining options, keeping this run's own settings.
            pretrain_args = checkpoint["opt"]
            pretrain_args.fold_idx = args.fold_idx
            pretrain_args.gpu = args.gpu
            pretrain_args.finetune = args.finetune
            pretrain_args.resume = args.resume
            pretrain_args.cv = args.cv
            pretrain_args.dataset = args.dataset
            pretrain_args.epochs = args.epochs
            pretrain_args.num_workers = args.num_workers
            if args.dataset in GRAPH_CLASSIFICATION_DSETS:
                # HACK for speeding up finetuning on graph classification tasks
                pretrain_args.num_workers = 0
            pretrain_args.batch_size = args.batch_size
            args = pretrain_args
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    args = option_update(args)
    print(args)
    if args.gpu >= 0:
        assert args.gpu is not None and torch.cuda.is_available()
        print("Use GPU: {} for training".format(args.gpu))

    # Positional embeddings come in sin/cos pairs, hence must be even.
    assert args.positional_embedding_size % 2 == 0
    print("setting random seeds")

    mem = psutil.virtual_memory()
    print("before construct dataset", mem.used / 1024**3)

    if args.finetune:
        if args.dataset in GRAPH_CLASSIFICATION_DSETS:
            dataset = GraphClassificationDatasetLabeled(
                dataset=args.dataset,
                rw_hops=args.rw_hops,
                subgraph_size=args.subgraph_size,
                restart_prob=args.restart_prob,
                positional_embedding_size=args.positional_embedding_size,
            )
            labels = dataset.dataset.data.y.tolist()
        else:
            dataset = NodeClassificationDatasetLabeled(
                dataset=args.dataset,
                rw_hops=args.rw_hops,
                subgraph_size=args.subgraph_size,
                restart_prob=args.restart_prob,
                positional_embedding_size=args.positional_embedding_size,
            )
            labels = dataset.data.y.argmax(dim=1).tolist()

        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=args.seed)
        idx_list = []
        for idx in skf.split(np.zeros(len(labels)), labels):
            idx_list.append(idx)
        assert (
            0 <= args.fold_idx and args.fold_idx < 10
        ), "fold_idx must be from 0 to 9."
        train_idx, test_idx = idx_list[args.fold_idx]
        train_dataset = torch.utils.data.Subset(dataset, train_idx)
        valid_dataset = torch.utils.data.Subset(dataset, test_idx)
    elif args.dataset == "dgl":
        train_dataset = LoadBalanceGraphDataset(
            rw_hops=args.rw_hops,
            restart_prob=args.restart_prob,
            positional_embedding_size=args.positional_embedding_size,
            num_workers=args.num_workers,
            num_samples=args.num_samples,
            dgl_graphs_file="./data/small.bin",
            num_copies=args.num_copies,
        )
    else:
        if args.dataset in GRAPH_CLASSIFICATION_DSETS:
            train_dataset = GraphClassificationDataset(
                dataset=args.dataset,
                rw_hops=args.rw_hops,
                subgraph_size=args.subgraph_size,
                restart_prob=args.restart_prob,
                positional_embedding_size=args.positional_embedding_size,
            )
        else:
            train_dataset = NodeClassificationDataset(
                dataset=args.dataset,
                rw_hops=args.rw_hops,
                subgraph_size=args.subgraph_size,
                restart_prob=args.restart_prob,
                positional_embedding_size=args.positional_embedding_size,
            )

    mem = psutil.virtual_memory()
    print("before construct dataloader", mem.used / 1024**3)
    # Fix: `torch.utils.data.graph.Dataloader` does not exist (AttributeError
    # at runtime); the correct class is `torch.utils.data.DataLoader`.
    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=args.batch_size,
        collate_fn=labeled_batcher() if args.finetune else batcher(),
        shuffle=True if args.finetune else False,
        num_workers=args.num_workers,
        worker_init_fn=None
        if args.finetune or args.dataset != "dgl"
        else worker_init_fn,
    )
    if args.finetune:
        valid_loader = torch.utils.data.DataLoader(
            dataset=valid_dataset,
            batch_size=args.batch_size,
            collate_fn=labeled_batcher(),
            num_workers=args.num_workers,
        )
    mem = psutil.virtual_memory()
    print("before training", mem.used / 1024**3)

    # create model and optimizer
    # n_data = train_dataset.total
    n_data = None

    import gcc.models.graph_encoder

    gcc.models.graph_encoder.final_dropout = 0  ##disable dropout

    # Two identical encoders: the online model and its momentum (EMA) copy.
    model, model_ema = [
        GraphEncoder(
            positional_embedding_size=args.positional_embedding_size,
            max_node_freq=args.max_node_freq,
            max_edge_freq=args.max_edge_freq,
            max_degree=args.max_degree,
            freq_embedding_size=args.freq_embedding_size,
            degree_embedding_size=args.degree_embedding_size,
            output_dim=args.hidden_size,
            node_hidden_dim=args.hidden_size,
            edge_hidden_dim=args.hidden_size,
            num_layers=args.num_layer,
            num_step_set2set=args.set2set_iter,
            num_layer_set2set=args.set2set_lstm_layer,
            norm=args.norm,
            gnn_model=args.model,
            degree_input=True,
        )
        for _ in range(2)
    ]

    # copy weights from `model' to `model_ema'
    if args.moco:
        moment_update(model, model_ema, 0)

    # set the contrast memory and criterion
    contrast = MemoryMoCo(
        args.hidden_size, n_data, args.nce_k, args.nce_t, use_softmax=True
    )
    if args.gpu >= 0:
        contrast = contrast.cuda(args.gpu)

    if args.finetune:
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = NCESoftmaxLoss() if args.moco else NCESoftmaxLossNS()
    if args.gpu >= 0:
        criterion = criterion.cuda(args.gpu)

    if args.gpu >= 0:
        model = model.cuda(args.gpu)
        model_ema = model_ema.cuda(args.gpu)

    if args.finetune:
        output_layer = nn.Linear(
            in_features=args.hidden_size, out_features=dataset.num_classes
        )
        if args.gpu >= 0:
            output_layer = output_layer.cuda(args.gpu)
        output_layer_optimizer = torch.optim.Adam(
            output_layer.parameters(),
            lr=args.learning_rate,
            betas=(args.beta1, args.beta2),
            weight_decay=args.weight_decay,
        )

        def clear_bn(m):
            # Reset BatchNorm running stats for re-estimation on the
            # downstream dataset.
            classname = m.__class__.__name__
            if classname.find("BatchNorm") != -1:
                m.reset_running_stats()

        model.apply(clear_bn)

    if args.optimizer == "sgd":
        optimizer = torch.optim.SGD(
            model.parameters(),
            lr=args.learning_rate,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
        )
    elif args.optimizer == "adam":
        optimizer = torch.optim.Adam(
            model.parameters(),
            lr=args.learning_rate,
            betas=(args.beta1, args.beta2),
            weight_decay=args.weight_decay,
        )
    elif args.optimizer == "adagrad":
        optimizer = torch.optim.Adagrad(
            model.parameters(),
            lr=args.learning_rate,
            lr_decay=args.lr_decay_rate,
            weight_decay=args.weight_decay,
        )
    else:
        raise NotImplementedError

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if True:
        # print("=> loading checkpoint '{}'".format(args.resume))
        # checkpoint = torch.load(args.resume, map_location="cpu")
        # Load the reference PyTorch checkpoint and the dumped reference
        # activations/gradients for the parity comparison below.
        import torch as th

        checkpoint = th.load(
            "torch_models/ckpt_epoch_100.pth", map_location=th.device('cpu')
        )
        torch_input_output_grad = th.load(
            "torch_models/torch_input_output_grad.pt",
            map_location=th.device('cpu'),
        )
        from paddorch.convert_pretrain_model import load_pytorch_pretrain_model

        print("loading.............. model")
        paddle_state_dict = load_pytorch_pretrain_model(
            model, checkpoint["model"]
        )
        model.load_state_dict(paddle_state_dict)

        print("loading.............. contrast")
        paddle_state_dict2 = load_pytorch_pretrain_model(
            contrast, checkpoint["contrast"]
        )
        contrast.load_state_dict(paddle_state_dict2)

        print("loading.............. model_ema")
        paddle_state_dict3 = load_pytorch_pretrain_model(
            model_ema, checkpoint["model_ema"]
        )
        if args.moco:
            model_ema.load_state_dict(paddle_state_dict3)
        print(
            "=> loaded successfully '{}' (epoch {})".format(
                args.resume, checkpoint["epoch"]
            )
        )
        del checkpoint
        if args.gpu >= 0:
            torch.cuda.empty_cache()

    # Lowered LR for the single comparison step; replaces the optimizer built
    # above on purpose.
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=args.learning_rate * 0.1,
        betas=(args.beta1, args.beta2),
        weight_decay=args.weight_decay,
    )

    for _ in range(1):
        # Fixed two-sample batch so the run is comparable to the reference.
        graph_q, graph_k = train_dataset[0]
        graph_q2, graph_k2 = train_dataset[1]
        graph_q, graph_k = (
            dgl.batch([graph_q, graph_q2]),
            dgl.batch([graph_k, graph_k2]),
        )

        input_output_grad = []
        input_output_grad.append([graph_q, graph_k])

        model.train()
        model_ema.eval()

        feat_q = model(graph_q)
        with torch.no_grad():
            feat_k = model_ema(graph_k)

        out = contrast(feat_q, feat_k)
        loss = criterion(out)
        optimizer.zero_grad()
        loss.backward()
        input_output_grad.append([feat_q, out, loss])
        print("loss:", loss.numpy())
        optimizer.step()
        moment_update(model, model_ema, args.alpha)

        # Compare forward activations against the PyTorch reference dump.
        print(
            "max diff feat_q:",
            np.max(
                np.abs(
                    torch_input_output_grad[1][0].detach().numpy()
                    - feat_q.numpy()
                )
            ),
        )
        print(
            "max diff out:",
            np.max(
                np.abs(
                    torch_input_output_grad[1][1].detach().numpy()
                    - out.numpy()
                )
            ),
        )
        print(
            "max diff loss:",
            np.max(
                np.abs(
                    torch_input_output_grad[1][2].detach().numpy()
                    - loss.numpy()
                )
            ),
        )

        # Compare per-parameter gradients; linear weights are transposed
        # between the two frameworks' layouts.
        name2grad = dict()
        for name, p in dict(model.named_parameters()).items():
            if p.grad is not None:
                name2grad[name] = p.grad
                torch_grad = torch_input_output_grad[2][name].numpy()
                if "linear" in name and "weight" in name:
                    torch_grad = torch_grad.T
                max_grad_diff = np.max(np.abs(p.grad - torch_grad))
                print("max grad diff:", name, max_grad_diff)
        input_output_grad.append(name2grad)