def train_finetune(
    epoch,
    train_loader,
    model,
    output_layer,
    criterion,
    optimizer,
    output_layer_optimizer,
    sw,
    opt,
):
    """one epoch of supervised fine-tuning with a classification output layer"""
    n_batch = len(train_loader)
    model.train()
    output_layer.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    loss_meter = AverageMeter()
    f1_meter = AverageMeter()
    epoch_loss_meter = AverageMeter()
    epoch_f1_meter = AverageMeter()
    prob_meter = AverageMeter()
    graph_size = AverageMeter()
    max_num_nodes = 0
    max_num_edges = 0

    end = time.time()
    for idx, batch in enumerate(train_loader):
        data_time.update(time.time() - end)
        graph_q, y = batch
        graph_q.to(torch.device(opt.gpu))
        y = y.to(torch.device(opt.gpu))

        bsz = graph_q.batch_size

        # ===================forward=====================
        feat_q = model(graph_q)

        assert feat_q.shape == (graph_q.batch_size, opt.hidden_size)
        out = output_layer(feat_q)

        loss = criterion(out, y)

        # ===================backward=====================
        optimizer.zero_grad()
        output_layer_optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_value_(model.parameters(), 1)
        torch.nn.utils.clip_grad_value_(output_layer.parameters(), 1)

        # linear warmup / decay schedule applied to both optimizers
        global_step = epoch * n_batch + idx
        lr_this_step = opt.learning_rate * warmup_linear(
            global_step / (opt.epochs * n_batch), 0.1
        )
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr_this_step
        for param_group in output_layer_optimizer.param_groups:
            param_group["lr"] = lr_this_step
        optimizer.step()
        output_layer_optimizer.step()

        preds = out.argmax(dim=1)
        f1 = f1_score(y.cpu().numpy(), preds.cpu().numpy(), average="micro")

        # ===================meters=====================
        f1_meter.update(f1, bsz)
        epoch_f1_meter.update(f1, bsz)
        loss_meter.update(loss.item(), bsz)
        epoch_loss_meter.update(loss.item(), bsz)
        graph_size.update(graph_q.number_of_nodes() / bsz, bsz)
        max_num_nodes = max(max_num_nodes, graph_q.number_of_nodes())
        max_num_edges = max(max_num_edges, graph_q.number_of_edges())

        torch.cuda.synchronize()
        batch_time.update(time.time() - end)
        end = time.time()

        # print info
        if (idx + 1) % opt.print_freq == 0:
            mem = psutil.virtual_memory()
            # print(f'{idx:8} - {mem.percent:5} - {mem.free/1024**3:10.2f} - {mem.available/1024**3:10.2f} - {mem.used/1024**3:10.2f}')
            # mem_used.append(mem.used/1024**3)
            print(
                "Train: [{0}][{1}/{2}]\t"
                "BT {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
                "DT {data_time.val:.3f} ({data_time.avg:.3f})\t"
                "loss {loss.val:.3f} ({loss.avg:.3f})\t"
                "f1 {f1.val:.3f} ({f1.avg:.3f})\t"
                "GS {graph_size.val:.3f} ({graph_size.avg:.3f})\t"
                "mem {mem:.3f}".format(
                    epoch,
                    idx + 1,
                    n_batch,
                    batch_time=batch_time,
                    data_time=data_time,
                    loss=loss_meter,
                    f1=f1_meter,
                    graph_size=graph_size,
                    mem=mem.used / 1024**3,
                )
            )
            # print(out[0].abs().max())

        # tensorboard logger
        if (idx + 1) % opt.tb_freq == 0:
            sw.add_scalar("ft_loss", loss_meter.avg, global_step)
            sw.add_scalar("ft_f1", f1_meter.avg, global_step)
            sw.add_scalar("graph_size", graph_size.avg, global_step)
            sw.add_scalar("lr", lr_this_step, global_step)
            sw.add_scalar("graph_size/max", max_num_nodes, global_step)
            sw.add_scalar("graph_size/max_edges", max_num_edges, global_step)
            # sw.add_scalar(
            #     "learning_rate", optimizer.param_groups[0]["lr"], global_step
            # )
            loss_meter.reset()
            f1_meter.reset()
            graph_size.reset()
            max_num_nodes, max_num_edges = 0, 0
    return epoch_loss_meter.avg, epoch_f1_meter.avg
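# The training loops in this file rely on two helpers that are not defined
# here: `AverageMeter` (a running-average tracker) and `warmup_linear` (the
# learning-rate multiplier used above). The definitions below are a minimal
# sketch of what the call sites appear to assume, not the repository's
# definitive implementation; the exact decay shape of `warmup_linear` is an
# assumption based only on how it is called.
class AverageMeter(object):
    """Tracks the most recent value and a count-weighted running average."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.avg = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def warmup_linear(x, warmup=0.1):
    """Assumed schedule: linear warmup to 1.0 over the first `warmup` fraction
    of training progress x in [0, 1], then linear decay back to 0."""
    if x < warmup:
        return x / warmup
    return max((1.0 - x) / (1.0 - warmup), 0.0)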
def train_moco(epoch, train_loader, model, model_ema, contrast, criterion, optimizer, sw, opt):
    """one epoch training for moco"""
    n_batch = train_loader.dataset.total // opt.batch_size
    model.train()
    model_ema.eval()

    def set_bn_train(m):
        classname = m.__class__.__name__
        if classname.find("BatchNorm") != -1:
            m.train()

    model_ema.apply(set_bn_train)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    loss_meter = AverageMeter()
    epoch_loss_meter = AverageMeter()
    prob_meter = AverageMeter()
    graph_size = AverageMeter()
    gnorm_meter = AverageMeter()
    max_num_nodes = 0
    max_num_edges = 0

    end = time.time()
    for idx, batch in enumerate(train_loader):
        data_time.update(time.time() - end)
        graph_q, graph_k = batch

        graph_q.to(torch.device(opt.gpu))
        graph_k.to(torch.device(opt.gpu))

        bsz = graph_q.batch_size

        if opt.moco:
            # ===================Moco forward=====================
            feat_q = model(graph_q)
            with torch.no_grad():
                feat_k = model_ema(graph_k)
            out = contrast(feat_q, feat_k)
            prob = out[:, 0].mean()
        else:
            # ===================Negative sampling forward=====================
            feat_q = model(graph_q)
            feat_k = model(graph_k)
            out = torch.matmul(feat_k, feat_q.t()) / opt.nce_t
            prob = out[range(graph_q.batch_size), range(graph_q.batch_size)].mean()

        assert feat_q.shape == (graph_q.batch_size, opt.hidden_size)

        # ===================backward=====================
        optimizer.zero_grad()
        loss = criterion(out)
        loss.backward()
        grad_norm = clip_grad_norm(model.parameters(), opt.clip_norm)

        global_step = epoch * n_batch + idx
        lr_this_step = opt.learning_rate * warmup_linear(
            global_step / (opt.epochs * n_batch), 0.1
        )
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr_this_step
        optimizer.step()

        # ===================meters=====================
        loss_meter.update(loss.item(), bsz)
        epoch_loss_meter.update(loss.item(), bsz)
        prob_meter.update(prob.item(), bsz)
        graph_size.update(
            (graph_q.number_of_nodes() + graph_k.number_of_nodes()) / 2.0 / bsz,
            2 * bsz,
        )
        gnorm_meter.update(grad_norm, 1)
        max_num_nodes = max(max_num_nodes, graph_q.number_of_nodes())
        max_num_edges = max(max_num_edges, graph_q.number_of_edges())

        if opt.moco:
            moment_update(model, model_ema, opt.alpha)

        torch.cuda.synchronize()
        batch_time.update(time.time() - end)
        end = time.time()

        # print info
        if (idx + 1) % opt.print_freq == 0:
            mem = psutil.virtual_memory()
            # print(f'{idx:8} - {mem.percent:5} - {mem.free/1024**3:10.2f} - {mem.available/1024**3:10.2f} - {mem.used/1024**3:10.2f}')
            # mem_used.append(mem.used/1024**3)
            print(
                "Train: [{0}][{1}/{2}]\t"
                "BT {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
                "DT {data_time.val:.3f} ({data_time.avg:.3f})\t"
                "loss {loss.val:.3f} ({loss.avg:.3f})\t"
                "prob {prob.val:.3f} ({prob.avg:.3f})\t"
                "GS {graph_size.val:.3f} ({graph_size.avg:.3f})\t"
                "mem {mem:.3f}".format(
                    epoch,
                    idx + 1,
                    n_batch,
                    batch_time=batch_time,
                    data_time=data_time,
                    loss=loss_meter,
                    prob=prob_meter,
                    graph_size=graph_size,
                    mem=mem.used / 1024**3,
                )
            )
            # print(out[0].abs().max())

        # tensorboard logger
        if (idx + 1) % opt.tb_freq == 0:
            global_step = epoch * n_batch + idx
            sw.add_scalar("moco_loss", loss_meter.avg, global_step)
            sw.add_scalar("moco_prob", prob_meter.avg, global_step)
            sw.add_scalar("graph_size", graph_size.avg, global_step)
            sw.add_scalar("graph_size/max", max_num_nodes, global_step)
            sw.add_scalar("graph_size/max_edges", max_num_edges, global_step)
            sw.add_scalar("gnorm", gnorm_meter.avg, global_step)
            sw.add_scalar("learning_rate", optimizer.param_groups[0]["lr"], global_step)
            loss_meter.reset()
            prob_meter.reset()
            graph_size.reset()
            gnorm_meter.reset()
            max_num_nodes, max_num_edges = 0, 0
    return epoch_loss_meter.avg
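# `moment_update(model, model_ema, opt.alpha)` above performs the MoCo-style
# momentum (EMA) update of the key encoder. The sketch below is a minimal,
# assumed implementation: it mirrors the standard MoCo update rule and assumes
# `alpha` is a momentum coefficient close to 1.0 and that both models share
# the same parameter ordering; it is not necessarily the exact helper shipped
# with this repository.
import torch


def moment_update(model, model_ema, alpha):
    """In-place EMA update: ema_param = alpha * ema_param + (1 - alpha) * param."""
    with torch.no_grad():
        for p, p_ema in zip(model.parameters(), model_ema.parameters()):
            p_ema.data.mul_(alpha).add_(p.detach().data, alpha=1.0 - alpha)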
def train_moco(
    epoch,
    model_name,
    train_loader,
    model,
    model_ema,
    contrast,
    criterion,
    optimizer,
    sw,
    opt,
    output_layer,
    output_layer_optimizer,
    global_output_layer,
    global_output_layer_optimizer,
):
    """one epoch of pretraining with two supervised heads (graph label and degree label)"""
    n_batch = train_loader.dataset.total // opt.batch_size
    model.train()
    model_ema.eval()
    print("pretrain")

    def set_bn_train(m):
        classname = m.__class__.__name__
        if classname.find("BatchNorm") != -1:
            m.train()

    model_ema.apply(set_bn_train)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    loss_meter = AverageMeter()
    epoch_loss_meter = AverageMeter()
    global_loss_meter = AverageMeter()
    epoch_global_loss_meter = AverageMeter()
    prob_meter = AverageMeter()
    graph_size = AverageMeter()
    gnorm_meter = AverageMeter()
    max_num_nodes = 0
    max_num_edges = 0

    end = time.time()

    # read global label: accumulate the motif counts of all graphs and
    # normalize them into a 15-dimensional distribution
    graph_list = np.zeros(15)
    with open("./motifs/" + model_name + "-counts.out") as f:
        for line in f:
            nums = [int(x) for x in line.split()]
            graph_list += np.array(nums)
    global_label = torch.FloatTensor(
        np.array([x * 1.0 / sum(graph_list) for x in graph_list])
    )

    for idx, batch in enumerate(train_loader):
        data_time.update(time.time() - end)
        graph_q, label, degree_label = batch
        graph_q.to(torch.device(opt.gpu))
        # move the targets to the same device as the model outputs
        label = label.to(torch.device(opt.gpu))
        degree_label = degree_label.to(torch.device(opt.gpu))

        bsz = graph_q.batch_size

        # ===================forward=====================
        feat_q = model(graph_q)
        out = output_layer(feat_q)
        # global_feature.append(feat_q.detach().cpu())
        # mean_t = torch.mean(torch.cat(global_feature), dim=0, keepdim=True).squeeze()
        # print(len(global_feature), mean_t.shape)
        # continue
        # print(global_feature[0].shape)
        # print(global_feature, len(global_feature))
        # mean_t = mean_t.to(torch.device(opt.gpu))
        # global_out = global_output_layer(mean_t)
        degree_out = global_output_layer(feat_q)
        # print(feat_q.size(), feat_k.size())

        assert feat_q.shape == (graph_q.batch_size, opt.hidden_size)

        # ===================backward=====================
        optimizer.zero_grad()
        loss = criterion(out, label)
        global_loss = criterion(degree_out, degree_label)
        output_layer_optimizer.zero_grad()
        loss = loss + global_loss
        loss.backward(retain_graph=True)
        global_output_layer_optimizer.zero_grad()
        # global_loss.backward() runs after the combined backward: the encoder
        # therefore accumulates the degree-head gradient a second time, while
        # the degree head itself (zeroed just above) keeps only this pass.
        global_loss.backward()
        torch.nn.utils.clip_grad_value_(output_layer.parameters(), 1)
        torch.nn.utils.clip_grad_value_(global_output_layer.parameters(), 1)
        grad_norm = clip_grad_norm(model.parameters(), opt.clip_norm)

        global_step = epoch * n_batch + idx
        lr_this_step = opt.learning_rate * warmup_linear(
            global_step / (opt.epochs * n_batch), 0.1
        )
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr_this_step
        for param_group in output_layer_optimizer.param_groups:
            param_group["lr"] = lr_this_step
        for param_group in global_output_layer_optimizer.param_groups:
            param_group["lr"] = lr_this_step
        optimizer.step()
        output_layer_optimizer.step()
        global_output_layer_optimizer.step()

        # ===================meters=====================
        loss_meter.update(loss.item(), bsz)
        epoch_loss_meter.update(loss.item(), bsz)
        global_loss_meter.update(global_loss.item(), bsz)
        epoch_global_loss_meter.update(global_loss.item(), bsz)
        gnorm_meter.update(grad_norm, 1)
        max_num_nodes = max(max_num_nodes, graph_q.number_of_nodes())
        max_num_edges = max(max_num_edges, graph_q.number_of_edges())

        torch.cuda.synchronize()
        batch_time.update(time.time() - end)
        end = time.time()

        # print info
        if (idx + 1) % opt.print_freq == 0:
            mem = psutil.virtual_memory()
            # print(f'{idx:8} - {mem.percent:5} - {mem.free/1024**3:10.2f} - {mem.available/1024**3:10.2f} - {mem.used/1024**3:10.2f}')
            # mem_used.append(mem.used/1024**3)
            # prob_meter and graph_size are never updated in this variant, so they print as 0
            print(
                "Train: [{0}][{1}/{2}]\t"
                "BT {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
                "DT {data_time.val:.3f} ({data_time.avg:.3f})\t"
                "loss {loss.val:.3f} ({loss.avg:.3f})\t"
                "prob {prob.val:.3f} ({prob.avg:.3f})\t"
                "GS {graph_size.val:.3f} ({graph_size.avg:.3f})\t"
                "mem {mem:.3f}".format(
                    epoch,
                    idx + 1,
                    n_batch,
                    batch_time=batch_time,
                    data_time=data_time,
                    loss=loss_meter,
                    prob=prob_meter,
                    graph_size=graph_size,
                    mem=mem.used / 1024**3,
                )
            )

        # tensorboard logger
        if (idx + 1) % opt.tb_freq == 0:
            global_step = epoch * n_batch + idx
            sw.add_scalar("moco_loss", loss_meter.avg, global_step)
            sw.add_scalar("global_moco_loss", global_loss_meter.avg, global_step)
            sw.add_scalar("moco_prob", prob_meter.avg, global_step)
            sw.add_scalar("graph_size", graph_size.avg, global_step)
            sw.add_scalar("graph_size/max", max_num_nodes, global_step)
            sw.add_scalar("graph_size/max_edges", max_num_edges, global_step)
            sw.add_scalar("gnorm", gnorm_meter.avg, global_step)
            sw.add_scalar("learning_rate", optimizer.param_groups[0]["lr"], global_step)
            loss_meter.reset()
            global_loss_meter.reset()
            prob_meter.reset()
            graph_size.reset()
            gnorm_meter.reset()
            max_num_nodes, max_num_edges = 0, 0
    return epoch_loss_meter.avg, epoch_global_loss_meter.avg