def train(train_dataloader, model, optimizer, lr_scheduler):
    def is_valid_number(x):
        return not (math.isnan(x) or math.isinf(x) or x > 1e4)

    logger.info("model\n{}".format(describe(model)))
    tb_writer = SummaryWriter(cfg.PRUNING.FINETUNE.LOG_DIR)
    average_meter = AverageMeter()
    start_epoch = cfg.PRUNING.FINETUNE.START_EPOCH
    num_per_epoch = len(train_dataloader.dataset) // (cfg.PRUNING.FINETUNE.BATCH_SIZE)
    iter = 0
    if not os.path.exists(cfg.PRUNING.FINETUNE.SNAPSHOT_DIR):
        os.makedirs(cfg.PRUNING.FINETUNE.SNAPSHOT_DIR)
    for epoch in range(cfg.PRUNING.FINETUNE.START_EPOCH,
                       cfg.PRUNING.FINETUNE.EPOCHS):
        train_dataloader.dataset.shuffle()
        lr_scheduler.step(epoch)
        # log lr
        for idx, pg in enumerate(optimizer.param_groups):
            tb_writer.add_scalar('lr/group{}'.format(idx + 1), pg['lr'], iter)
        cur_lr = lr_scheduler.get_cur_lr()
        for data in train_dataloader:
            begin = time.time()
            examplar_img = data['examplar_img'].cuda()
            search_img = data['search_img'].cuda()
            gt_cls = data['gt_cls'].cuda()
            gt_delta = data['gt_delta'].cuda()
            delta_weight = data['delta_weight'].cuda()
            data_time = time.time() - begin
            losses = model.forward(examplar_img, search_img, gt_cls, gt_delta,
                                   delta_weight)
            cls_loss = losses['cls_loss']
            loc_loss = losses['loc_loss']
            loss = losses['total_loss']
            if is_valid_number(loss.item()):
                optimizer.zero_grad()
                loss.backward()
                if cfg.PRUNING.FINETUNE.LOG_GRAD:
                    log_grads(model.module, tb_writer, iter)
                clip_grad_norm_(model.parameters(), cfg.PRUNING.FINETUNE.GRAD_CLIP)
                optimizer.step()
            batch_time = time.time() - begin
            batch_info = {}
            batch_info['data_time'] = data_time
            batch_info['batch_time'] = batch_time
            for k, v in losses.items():
                batch_info[k] = v
            average_meter.update(**batch_info)
            for k, v in batch_info.items():
                tb_writer.add_scalar(k, v, iter)
            if iter % cfg.TRAIN.PRINT_EVERY == 0:
                logger.info(
                    'epoch: {}, iter: {}, cur_lr: {}, cls_loss: {}, loc_loss: {}, loss: {}'
                    .format(epoch + 1, iter, cur_lr, cls_loss.item(),
                            loc_loss.item(), loss.item()))
                print_speed(iter + 1 + start_epoch * num_per_epoch,
                            average_meter.batch_time.avg,
                            cfg.PRUNING.FINETUNE.EPOCHS * num_per_epoch)
            iter += 1
        # save model
        state = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch + 1,
            'mask': model.mask,
            'mask_scores': model.mask_scores
        }
        logger.info('save snapshot to {}/checkpoint_e{}.pth'.format(
            cfg.PRUNING.FINETUNE.SNAPSHOT_DIR, epoch + 1))
        torch.save(state, '{}/checkpoint_e{}.pth'.format(
            cfg.PRUNING.FINETUNE.SNAPSHOT_DIR, epoch + 1))
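# The AverageMeter used throughout these loops is not defined in this section.
# Below is a minimal sketch consistent with how it is called above --
# update(**kwargs) followed by attribute access such as .batch_time.avg and
# .batch_time.val, plus the '{meter:s}' format specs in the log lines. This is
# an assumption about the interface, not the repository's implementation.
class Meter(object):
    def __init__(self):
        self.val, self.sum, self.count, self.avg = 0.0, 0.0, 0, 0.0

    def update(self, val):
        # track the latest value and the running average
        self.val = float(val)  # float() also accepts 0-dim tensors
        self.sum += self.val
        self.count += 1
        self.avg = self.sum / self.count

    def __format__(self, spec):
        # supports the '{meter:s}' specs used in the log format strings
        return '{:.4f} ({:.4f})'.format(self.val, self.avg)


class AverageMeter(object):
    def update(self, **kwargs):
        # lazily create one Meter per named statistic
        for k, v in kwargs.items():
            if not hasattr(self, k):
                setattr(self, k, Meter())
            getattr(self, k).update(v)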
def train(train_loader, model, optimizer, lr_scheduler, epoch, cfg):
    """Train the model.

    :param train_loader: training data
    :param model:
    :param optimizer:
    :param lr_scheduler:
    :param epoch:
    :param cfg:
    :return:
    """
    global tb_index, best_acc, cur_lr, logger
    # get the current learning rate
    cur_lr = lr_scheduler.get_cur_lr()
    logger = logging.getLogger('global')
    avg = AverageMeter()
    model.train()
    # GPU
    # model = model.cuda()
    end = time.time()

    def is_valid_number(x):
        return not (math.isnan(x) or math.isinf(x) or x > 1e4)

    num_per_epoch = len(train_loader.dataset) // args.epochs // args.batch
    print("num_per_epoch", num_per_epoch)
    start_epoch = epoch
    # iterate over each batch's input
    for iter, input in enumerate(train_loader):
        if epoch != iter // num_per_epoch + start_epoch:  # next epoch
            epoch = iter // num_per_epoch + start_epoch
            # create the save directory
            if not os.path.exists(args.save_dir):  # makedir/save model
                os.makedirs(args.save_dir)
            # save a training checkpoint
            save_checkpoint(
                {
                    'epoch': epoch,
                    'arch': args.arch,
                    'state_dict': model.module.state_dict(),
                    'best_acc': best_acc,
                    'optimizer': optimizer.state_dict(),
                    'anchor_cfg': cfg['anchors']
                }, False,
                os.path.join(args.save_dir, 'checkpoint_e%d.pth' % (epoch)),
                os.path.join(args.save_dir, 'best.pth'))
            if epoch == args.epochs:
                return
            # rebuild the optimizer and lr scheduler when part of the model is unfixed
            if model.module.features.unfix(epoch / args.epochs):
                logger.info('unfix part model.')
                optimizer, lr_scheduler = build_opt_lr(model.module, cfg, args, epoch)
            # refresh the current learning rate
            lr_scheduler.step(epoch)
            cur_lr = lr_scheduler.get_cur_lr()
            logger.info('epoch:{}'.format(epoch))

        # update logs
        tb_index = iter
        if iter % num_per_epoch == 0 and iter != 0:
            for idx, pg in enumerate(optimizer.param_groups):
                logger.info("epoch {} lr {}".format(epoch, pg['lr']))
                tb_writer.add_scalar('lr/group%d' % (idx + 1), pg['lr'], tb_index)

        data_time = time.time() - end
        avg.update(data_time=data_time)
        # model inputs (the GPU variant wraps each entry with .cuda())
        x = {
            'cfg': cfg,
            'template': torch.autograd.Variable(input[0]),
            'search': torch.autograd.Variable(input[1]),
            'label_cls': torch.autograd.Variable(input[2]),
            'label_loc': torch.autograd.Variable(input[3]),
            'label_loc_weight': torch.autograd.Variable(input[4]),
            'label_mask': torch.autograd.Variable(input[6]),
            'label_mask_weight': torch.autograd.Variable(input[7]),
        }
        # forward pass
        outputs = model(x)
        # individual losses
        rpn_cls_loss, rpn_loc_loss, rpn_mask_loss = (
            torch.mean(outputs['losses'][0]),
            torch.mean(outputs['losses'][1]),
            torch.mean(outputs['losses'][2]))
        # accuracies
        mask_iou_mean, mask_iou_at_5, mask_iou_at_7 = (
            torch.mean(outputs['accuracy'][0]),
            torch.mean(outputs['accuracy'][1]),
            torch.mean(outputs['accuracy'][2]))
        # weights of the classification, regression and segmentation terms
        cls_weight, reg_weight, mask_weight = cfg['loss']['weight']
        # total loss
        loss = rpn_cls_loss * cls_weight + rpn_loc_loss * reg_weight + rpn_mask_loss * mask_weight
        # zero gradients and backpropagate
        optimizer.zero_grad()
        loss.backward()
        # gradient clip
        if cfg['clip']['split']:
            torch.nn.utils.clip_grad_norm_(model.module.features.parameters(), cfg['clip']['feature'])
            torch.nn.utils.clip_grad_norm_(model.module.rpn_model.parameters(), cfg['clip']['rpn'])
            torch.nn.utils.clip_grad_norm_(model.module.mask_model.parameters(), cfg['clip']['mask'])
        else:
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        if is_valid_number(loss.item()):
            optimizer.step()

        siammask_loss = loss.item()
        batch_time = time.time() - end
        # update running averages
        avg.update(batch_time=batch_time,
                   rpn_cls_loss=rpn_cls_loss,
                   rpn_loc_loss=rpn_loc_loss,
                   rpn_mask_loss=rpn_mask_loss,
                   siammask_loss=siammask_loss,
                   mask_iou_mean=mask_iou_mean,
                   mask_iou_at_5=mask_iou_at_5,
                   mask_iou_at_7=mask_iou_at_7)
        # write to tensorboard
        tb_writer.add_scalar('loss/cls', rpn_cls_loss, tb_index)
        tb_writer.add_scalar('loss/loc', rpn_loc_loss, tb_index)
        tb_writer.add_scalar('loss/mask', rpn_mask_loss, tb_index)
        tb_writer.add_scalar('mask/mIoU', mask_iou_mean, tb_index)
        tb_writer.add_scalar('mask/[email protected]', mask_iou_at_5, tb_index)
        tb_writer.add_scalar('mask/[email protected]', mask_iou_at_7, tb_index)
        end = time.time()

        # periodic log output
        if (iter + 1) % args.print_freq == 0:
            logger.info(
                'Epoch: [{0}][{1}/{2}] lr: {lr:.6f}\t{batch_time:s}\t{data_time:s}'
                '\t{rpn_cls_loss:s}\t{rpn_loc_loss:s}\t{rpn_mask_loss:s}\t{siammask_loss:s}'
                '\t{mask_iou_mean:s}\t{mask_iou_at_5:s}\t{mask_iou_at_7:s}'.format(
                    epoch + 1, (iter + 1) % num_per_epoch, num_per_epoch,
                    lr=cur_lr,
                    batch_time=avg.batch_time,
                    data_time=avg.data_time,
                    rpn_cls_loss=avg.rpn_cls_loss,
                    rpn_loc_loss=avg.rpn_loc_loss,
                    rpn_mask_loss=avg.rpn_mask_loss,
                    siammask_loss=avg.siammask_loss,
                    mask_iou_mean=avg.mask_iou_mean,
                    mask_iou_at_5=avg.mask_iou_at_5,
                    mask_iou_at_7=avg.mask_iou_at_7))
            print_speed(iter + 1, avg.batch_time.avg, args.epochs * num_per_epoch)
def train(train_loader, model, optimizer, lr_scheduler, epoch, cfg):
    global tb_index, best_acc, cur_lr, logger
    cur_lr = lr_scheduler.get_cur_lr()
    logger = logging.getLogger('global')
    avg = AverageMeter()
    model.module.features.eval()
    model.module.rpn_model.eval()
    model.module.mask_model.eval()
    # model.train()
    model = model.cuda()
    end = time.time()

    def is_valid_number(x):
        return not (math.isnan(x) or math.isinf(x) or x > 1e4)

    num_per_epoch = len(train_loader.dataset) // args.epochs // args.batch
    start_epoch = epoch
    for iter, input in enumerate(train_loader):
        if epoch != iter // num_per_epoch + start_epoch:  # next epoch
            epoch = iter // num_per_epoch + start_epoch
            if not os.path.exists(args.save_dir):  # makedir/save model
                os.makedirs(args.save_dir)
            save_checkpoint(
                {
                    'epoch': epoch,
                    'arch': args.arch,
                    'state_dict': model.module.state_dict(),
                    'best_acc': best_acc,
                    'optimizer': optimizer.state_dict(),
                    'anchor_cfg': cfg['anchors']
                }, False,
                os.path.join(args.save_dir, 'checkpoint_e%d.pth' % (epoch)),
                os.path.join(args.save_dir, 'best.pth'))
            if epoch == args.epochs:
                return
            if model.module.features.unfix(epoch / args.epochs):
                logger.info('unfix part model.')
                optimizer, lr_scheduler = build_opt_lr(model.module, cfg, args, epoch)
            lr_scheduler.step(epoch)
            cur_lr = lr_scheduler.get_cur_lr()
            logger.info('epoch:{}'.format(epoch))

        tb_index = iter
        if iter % num_per_epoch == 0 and iter != 0:
            for idx, pg in enumerate(optimizer.param_groups):
                logger.info("epoch {} lr {}".format(epoch, pg['lr']))
                tb_writer.add_scalar('lr/group%d' % (idx + 1), pg['lr'], tb_index)

        data_time = time.time() - end
        avg.update(data_time=data_time)
        x = {
            'cfg': cfg,
            'template': torch.autograd.Variable(input[0]).cuda(),
            'search': torch.autograd.Variable(input[1]).cuda(),
            'label_cls': torch.autograd.Variable(input[2]).cuda(),
            'label_loc': torch.autograd.Variable(input[3]).cuda(),
            'label_loc_weight': torch.autograd.Variable(input[4]).cuda(),
            'label_mask': torch.autograd.Variable(input[6]).cuda(),
            'label_kp_weight': torch.autograd.Variable(input[7]).cuda(),
            'label_mask_weight': torch.autograd.Variable(input[8]).cuda(),
            'label_kp': torch.autograd.Variable(input[9]).cuda()
        }
        outputs = model(x)
        rpn_cls_loss, rpn_loc_loss, rpn_kp_loss, rpn_heatmap_loss = (
            torch.mean(outputs['losses'][0]),
            torch.mean(outputs['losses'][1]),
            torch.mean(outputs['losses'][2]),
            torch.mean(outputs['losses'][3]))
        # mask_iou_mean, mask_iou_at_5, mask_iou_at_7 = torch.mean(outputs['accuracy'][0]), torch.mean(outputs['accuracy'][1]), torch.mean(outputs['accuracy'][2])

        # debug visualization of the predicted heatmaps
        htmap_pred = outputs['predict'][-1].squeeze(1)
        kp_pred = outputs['predict'][-2].squeeze(1)
        # htmap_pred = htmap_pred.permute(0, 2, 3, 1)
        # kp_pred = kp_pred.permute(0, 2, 3, 1)
        # detach before converting to numpy (the predictions still carry grad)
        htmap_pred = htmap_pred.cpu().detach().numpy()
        kp_pred = kp_pred.cpu().detach().numpy()
        f, (ax1, ax2) = plt.subplots(1, 2)
        print(htmap_pred[0].shape)
        print(kp_pred[0].shape)
        ax1.imshow(htmap_pred[0])
        ax2.imshow(kp_pred[0])
        plt.show()

        cls_weight, reg_weight, kp_weight, heatmap_weight = cfg['loss']['weight']
        loss = (rpn_cls_loss * cls_weight + rpn_loc_loss * reg_weight +
                rpn_kp_loss * kp_weight + rpn_heatmap_loss * heatmap_weight)
        optimizer.zero_grad()
        loss.backward()
        # gradient clip
        if cfg['clip']['split']:
            torch.nn.utils.clip_grad_norm_(model.module.features.parameters(), cfg['clip']['feature'])
            torch.nn.utils.clip_grad_norm_(model.module.rpn_model.parameters(), cfg['clip']['rpn'])
            torch.nn.utils.clip_grad_norm_(model.module.mask_model.parameters(), cfg['clip']['mask'])
        else:
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        if is_valid_number(loss.item()):
            optimizer.step()

        siammask_loss = loss.item()
        batch_time = time.time() - end
        avg.update(batch_time=batch_time,
                   rpn_cls_loss=rpn_cls_loss,
                   rpn_loc_loss=rpn_loc_loss,
                   rpn_kp_loss=rpn_kp_loss * kp_weight,
                   rpn_heatmap_loss=rpn_heatmap_loss * heatmap_weight,
                   siammask_loss=siammask_loss)
        tb_writer.add_scalar('loss/cls', rpn_cls_loss, tb_index)
        tb_writer.add_scalar('loss/loc', rpn_loc_loss, tb_index)
        tb_writer.add_scalar('loss/kp_reg', rpn_kp_loss * kp_weight, tb_index)
        tb_writer.add_scalar('loss/heatmap', rpn_heatmap_loss * heatmap_weight, tb_index)
        end = time.time()

        if (iter + 1) % args.print_freq == 0:
            logger.info(
                'Epoch: [{0}][{1}/{2}] lr: {lr:.6f}\t{batch_time:s}\t{data_time:s}'
                '\t{rpn_cls_loss:s}\t{rpn_loc_loss:s}\t{rpn_kp_loss:s}\t{rpn_heatmap_loss:s}\t{siammask_loss:s}'
                .format(epoch + 1, (iter + 1) % num_per_epoch, num_per_epoch,
                        lr=cur_lr,
                        batch_time=avg.batch_time,
                        data_time=avg.data_time,
                        rpn_cls_loss=avg.rpn_cls_loss,
                        rpn_loc_loss=avg.rpn_loc_loss,
                        rpn_kp_loss=avg.rpn_kp_loss,
                        rpn_heatmap_loss=avg.rpn_heatmap_loss,
                        siammask_loss=avg.siammask_loss))
            print_speed(iter + 1, avg.batch_time.avg, args.epochs * num_per_epoch)
            epoch * train_lenth + step + 1)
        writer.add_scalars(
            'loss/merge', {
                "train_loss": train_loss,
                "test_loss": test_loss,
                "train_metric": train_metric,
                "test_metric": test_metric
            }, epoch * train_lenth + step + 1)
        # update the averager (running means)
        avg.update(step_time=step_time,
                   train_loss=train_loss,
                   test_loss=test_loss,
                   train_metric=train_metric)
        # print results
        if (step + 1) % print_freq == 0:
            global_logger.info(
                'Epoch: [{0}][{1}/{2}] {step_time:s}\t{train_loss:s}\t{test_loss:s}\t{train_metric:s}'
                .format(epoch + 1, (step + 1) % train_lenth, train_lenth,
                        step_time=avg.step_time,
                        train_loss=avg.train_loss,
                        test_loss=avg.test_loss,
                        train_metric=avg.train_metric))
            print_speed(epoch * train_lenth + step + 1, avg.step_time.avg,
                        epoches * train_lenth)
    # update the lr scheduler
    scheduler.step()
def train(train_loader, model, optimizer, lr_scheduler, epoch, cfg):
    global tb_index, best_acc, cur_lr, logger
    cur_lr = lr_scheduler.get_cur_lr()
    logger = logging.getLogger('global')
    avg = AverageMeter()
    model.train()
    model.module.features.eval()
    model.module.rpn_model.eval()
    model.module.features.apply(BNtoFixed)
    model.module.rpn_model.apply(BNtoFixed)
    model.module.mask_model.train()
    model.module.refine_model.train()
    model = model.cuda()
    end = time.time()

    def is_valid_number(x):
        return not (math.isnan(x) or math.isinf(x) or x > 1e4)

    num_per_epoch = len(train_loader.dataset) // args.epochs // args.batch
    start_epoch = epoch
    for iter, input in enumerate(train_loader):
        if epoch != iter // num_per_epoch + start_epoch:  # next epoch
            epoch = iter // num_per_epoch + start_epoch
            if not os.path.exists(args.save_dir):  # makedir/save model
                os.makedirs(args.save_dir)
            save_checkpoint(
                {
                    'epoch': epoch,
                    'arch': args.arch,
                    'state_dict': model.module.state_dict(),
                    'best_acc': best_acc,
                    'optimizer': optimizer.state_dict(),
                    'anchor_cfg': cfg['anchors']
                }, False,
                os.path.join(args.save_dir, 'checkpoint_e%d.pth' % (epoch)),
                os.path.join(args.save_dir, 'best.pth'))
            if epoch == args.epochs:
                return
            optimizer, lr_scheduler = build_opt_lr(model.module, cfg, args, epoch)
            lr_scheduler.step(epoch)
            cur_lr = lr_scheduler.get_cur_lr()
            logger.info('epoch:{}'.format(epoch))

        tb_index = iter
        if iter % num_per_epoch == 0 and iter != 0:
            for idx, pg in enumerate(optimizer.param_groups):
                logger.info("epoch {} lr {}".format(epoch, pg['lr']))
                tb_writer.add_scalar('lr/group%d' % (idx + 1), pg['lr'], tb_index)

        data_time = time.time() - end
        avg.update(data_time=data_time)
        track12 = {
            'cfg': cfg,
            'template': torch.autograd.Variable(input[0][0]).cuda(),
            'search': torch.autograd.Variable(input[0][1]).cuda(),
            'label_cls': torch.autograd.Variable(input[0][2]).cuda(),
            'label_loc': torch.autograd.Variable(input[0][3]).cuda(),
            'label_loc_weight': torch.autograd.Variable(input[0][4]).cuda(),
            'template_bbox': torch.autograd.Variable(input[0][5]).cuda(),
            'label_mask': torch.autograd.Variable(input[0][6]).cuda(),
            'label_mask_weight': torch.autograd.Variable(input[0][7]).cuda(),
        }
        track21 = {
            'cfg': cfg,
            'template': torch.autograd.Variable(input[1][0]).cuda(),
            'search': torch.autograd.Variable(input[1][1]).cuda(),
            'label_cls': torch.autograd.Variable(input[1][2]).cuda(),
            'label_loc': torch.autograd.Variable(input[1][3]).cuda(),
            'label_loc_weight': torch.autograd.Variable(input[1][4]).cuda(),
            'template_bbox': torch.autograd.Variable(input[1][5]).cuda(),
            'label_mask': torch.autograd.Variable(input[1][6]).cuda(),
            'label_mask_weight': torch.autograd.Variable(input[1][7]).cuda(),
        }
        # ========================== cycle forward frame1 -> frame2 ===========================
        outputs12 = model(track12, softmax=False)
        out_patch12 = trackres(cfg, outputs12, track12)
        track21['template'] = torch.autograd.Variable(
            torch.from_numpy(out_patch12).float()).cuda()
        # ========================== cycle backward frame2 -> frame1 ==========================
        outputs = model(track21, softmax=True)
        rpn_cls_loss, rpn_loc_loss, rpn_mask_loss = (
            torch.mean(outputs['losses'][0]),
            torch.mean(outputs['losses'][1]),
            torch.mean(outputs['losses'][2]))
        mask_iou_mean, mask_iou_at_5, mask_iou_at_7 = (
            torch.mean(outputs['accuracy'][0]),
            torch.mean(outputs['accuracy'][1]),
            torch.mean(outputs['accuracy'][2]))
        cls_weight, reg_weight, mask_weight = cfg['loss']['weight']
        loss = rpn_cls_loss * cls_weight + rpn_loc_loss * reg_weight + rpn_mask_loss * mask_weight
        optimizer.zero_grad()
        loss.backward()
        # gradient clip
        if cfg['clip']['split']:
            torch.nn.utils.clip_grad_norm_(model.module.features.parameters(), cfg['clip']['feature'])
            torch.nn.utils.clip_grad_norm_(model.module.rpn_model.parameters(), cfg['clip']['rpn'])
            torch.nn.utils.clip_grad_norm_(model.module.mask_model.parameters(), cfg['clip']['mask'])
            torch.nn.utils.clip_grad_norm_(model.module.refine_model.parameters(), cfg['clip']['mask'])
        else:
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        if is_valid_number(loss.item()):
            optimizer.step()

        siammask_loss = loss.item()
        batch_time = time.time() - end
        # clamp logged losses and extract python floats
        rpn_cls_loss = min(rpn_cls_loss.item(), 1)
        rpn_loc_loss = min(rpn_loc_loss.item(), 1)
        rpn_mask_loss = min(rpn_mask_loss.item(), 1)
        mask_iou_mean = mask_iou_mean.item()
        mask_iou_at_5 = mask_iou_at_5.item()
        mask_iou_at_7 = mask_iou_at_7.item()
        avg.update(batch_time=batch_time,
                   rpn_cls_loss=rpn_cls_loss,
                   rpn_loc_loss=rpn_loc_loss,
                   rpn_mask_loss=rpn_mask_loss,
                   siammask_loss=siammask_loss,
                   mask_iou_mean=mask_iou_mean,
                   mask_iou_at_5=mask_iou_at_5,
                   mask_iou_at_7=mask_iou_at_7)
        tb_writer.add_scalar('loss/cls', rpn_cls_loss, tb_index)
        tb_writer.add_scalar('loss/loc', rpn_loc_loss, tb_index)
        tb_writer.add_scalar('loss/mask', rpn_mask_loss, tb_index)
        tb_writer.add_scalar('mask/mIoU', mask_iou_mean, tb_index)
        tb_writer.add_scalar('mask/[email protected]', mask_iou_at_5, tb_index)
        tb_writer.add_scalar('mask/[email protected]', mask_iou_at_7, tb_index)
        end = time.time()

        if (iter + 1) % args.print_freq == 0:
            logger.info(
                'Epoch: [{0}][{1}/{2}] lr: {lr:.6f}\t{batch_time:s}\t{data_time:s}'
                '\t{rpn_cls_loss:s}\t{rpn_loc_loss:s}\t{rpn_mask_loss:s}\t{siammask_loss:s}'
                '\t{mask_iou_mean:s}\t{mask_iou_at_5:s}\t{mask_iou_at_7:s}'.format(
                    epoch + 1, (iter + 1) % num_per_epoch, num_per_epoch,
                    lr=cur_lr,
                    batch_time=avg.batch_time,
                    data_time=avg.data_time,
                    rpn_cls_loss=avg.rpn_cls_loss,
                    rpn_loc_loss=avg.rpn_loc_loss,
                    rpn_mask_loss=avg.rpn_mask_loss,
                    siammask_loss=avg.siammask_loss,
                    mask_iou_mean=avg.mask_iou_mean,
                    mask_iou_at_5=avg.mask_iou_at_5,
                    mask_iou_at_7=avg.mask_iou_at_7))
            print_speed(iter + 1, avg.batch_time.avg, args.epochs * num_per_epoch)
def train_one_epoch(train_loader, model, optimizer, device, epoch):
    logger = logging.getLogger('global')
    model.train()
    lossitem, abs_diff, abs_rel, sq_rel, a1, a2, a3 = 0, 0, 0, 0, 0, 0, 0
    num_batches = 0.0
    for i_batch, sample_batched in enumerate(train_loader):
        t0 = time.time()
        rgb = sample_batched['rgb'].type(torch.FloatTensor)
        depth = sample_batched['depth']
        s1 = sample_batched['s1'].type(torch.FloatTensor)
        s2 = sample_batched['s2'].type(torch.FloatTensor)
        b = rgb.shape[0]
        rgb = rgb.to(device)
        depth = depth.to(device)
        s1 = s1.to(device)
        s2 = s2.to(device)
        s1.unsqueeze_(-1)
        s2.unsqueeze_(-1)
        s1s2 = torch.cat((s1, s2), 3)
        rgb = rgb.permute(0, 3, 1, 2)
        s1s2 = s1s2.permute(0, 3, 1, 2)
        # zero the parameter gradients
        optimizer.zero_grad()
        depth_predict = model(rgb, s1s2)
        depth_predict.squeeze_(1)
        loss = l2_loss(depth, depth_predict)
        # backward + optimize
        loss.backward()
        optimizer.step()
        lossitem0 = loss.item()
        abs_diff0, abs_rel0, sq_rel0, a10, a20, a30 = compute_errors(depth, depth_predict)
        num_batches += 1
        lossitem += lossitem0
        abs_diff += abs_diff0
        abs_rel += abs_rel0
        sq_rel += sq_rel0
        a1 += a10
        a2 += a20
        a3 += a30
        t1 = time.time()
        print_speed(i_batch, t1 - t0, train_loader.__len__())
    # epoch averages
    lossitem = lossitem / num_batches
    abs_diff = abs_diff / num_batches
    abs_rel = abs_rel / num_batches
    sq_rel = sq_rel / num_batches
    a1 = a1 / num_batches
    a2 = a2 / num_batches
    a3 = a3 / num_batches
    logger.info('Train Loss: {:.4f},'.format(lossitem))
    logger.info(' abs_diff: {:.4f}'.format(abs_diff))
    logger.info(' abs_rel: {:.4f}'.format(abs_rel))
    logger.info(' sq_rel: {:.4f}'.format(sq_rel))
    logger.info(' a1: {:.4f}'.format(a1))
    logger.info(' a2: {:.4f}'.format(a2))
    logger.info(' a3: {:.4f}'.format(a3))
    writer.add_scalar('train/loss', lossitem, epoch)
    writer.add_scalar('train/abs_diff', abs_diff, epoch)
    writer.add_scalar('train/abs_rel', abs_rel, epoch)
    writer.add_scalar('train/sq_rel', sq_rel, epoch)
    writer.add_scalar('train/a1', a1, epoch)
    writer.add_scalar('train/a2', a2, epoch)
    writer.add_scalar('train/a3', a3, epoch)
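# compute_errors is not defined in this section. Below is a minimal sketch of
# the conventional monocular-depth error metrics its return values suggest
# (abs_diff, abs_rel, sq_rel, and the delta < 1.25 / 1.25^2 / 1.25^3
# accuracies a1/a2/a3). This is an assumption based on the standard
# definitions, not necessarily this repository's implementation; it assumes
# gt > 0 everywhere (no invalid-depth masking).
import torch

def compute_errors_sketch(gt, pred):
    thresh = torch.max(gt / pred, pred / gt)
    a1 = (thresh < 1.25).float().mean().item()
    a2 = (thresh < 1.25 ** 2).float().mean().item()
    a3 = (thresh < 1.25 ** 3).float().mean().item()
    abs_diff = torch.mean(torch.abs(gt - pred)).item()
    abs_rel = torch.mean(torch.abs(gt - pred) / gt).item()
    sq_rel = torch.mean((gt - pred) ** 2 / gt).item()
    return abs_diff, abs_rel, sq_rel, a1, a2, a3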
def validate(val_loader, model, cfg):
    logger = logging.getLogger('global')
    try:
        rank = dist.get_rank()
        world_size = dist.get_world_size()
    except Exception as e:
        print(e)
        rank, world_size = 0, 1
    # switch to evaluate mode
    model.eval()
    total_rc = 0
    total_gt = 0
    logger.info('start validate')
    if not os.path.exists(args.results_dir):
        try:
            os.makedirs(args.results_dir)
        except Exception as e:
            print(e)
    fout = open(os.path.join(args.results_dir, 'results.json.rank%d' % rank), 'w')
    for iter, input in enumerate(val_loader):
        img = torch.autograd.Variable(input[0]).cuda()
        img_info = input[1]
        gt_boxes = input[2]
        filenames = input[-1]
        x = {
            'cfg': cfg,
            'image': img,
            'image_info': img_info[:, :3],
            'ground_truth_bboxes': gt_boxes,
            'ignore_regions': None,
            'ground_truth_keypoints': None,
            'ground_truth_masks': None
        }
        batch_size = img.shape[0]
        t0 = time.time()
        outputs = model(x)['predict']
        t2 = time.time()
        proposals = outputs[0].data.cpu().numpy()
        bboxes = outputs[1].data.cpu().numpy()
        # keypoints = outputs[2].data.cpu().numpy()
        if isinstance(outputs[2], torch.autograd.Variable):
            keypoints = outputs[2].data.cpu().numpy()
            masks = None
        else:
            keypoints = None
            masks = outputs[2]
        # heatmap = outputs[3].data.cpu().numpy()
        if torch.is_tensor(gt_boxes):
            gt_boxes = gt_boxes.cpu().numpy()
        image_info = img_info
        img_ids = [_.split('/')[-1].split('_')[-1].split('.')[0] for _ in filenames]
        image_info = [list(x) + [y] for x, y in zip(image_info, img_ids)]
        # visualize results
        # vis_helper.vis_results(args.results_dir, image_info, bboxes, keypoints, masks, heatmap, cfg['shared']['class_names'])
        write_results_to_file(fout, image_info, bboxes, keypoints, masks,
                              mask_thresh=0.5, keep_num=100)
        # rpn recall
        for b_ix in range(batch_size):
            rois_per_image = proposals[proposals[:, 0] == b_ix]
            gts_per_image = gt_boxes[b_ix]
            num_rc, num_gt = bbox_helper.compute_recall(rois_per_image[:, 1:1 + 4],
                                                        gts_per_image)
            total_gt += num_gt
            total_rc += num_rc
        logger.info('Test: [%d/%d] Time: %.3f %d/%d' %
                    (iter, len(val_loader), t2 - t0, total_rc, total_gt))
        print_speed(iter + 1, t2 - t0, len(val_loader))
    logger.info('rpn300 recall=%f' % (total_rc / total_gt))
    fout.close()
    return total_rc / total_gt
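# bbox_helper.compute_recall is not shown in this section. Below is a minimal
# hypothetical sketch of an IoU-based proposal recall (how many ground-truth
# boxes are covered by at least one proposal at IoU >= 0.5), consistent with
# the (num_recalled, num_gt) return used above. This is an assumption, not the
# repository's actual helper.
import numpy as np

def compute_recall_sketch(rois, gts, iou_thresh=0.5):
    if len(gts) == 0:
        return 0, 0
    if len(rois) == 0:
        return 0, len(gts)
    recalled = 0
    area_r = (rois[:, 2] - rois[:, 0] + 1) * (rois[:, 3] - rois[:, 1] + 1)
    for gt in gts:
        # intersection of every roi with this ground-truth box (x1, y1, x2, y2)
        iw = np.maximum(np.minimum(rois[:, 2], gt[2]) - np.maximum(rois[:, 0], gt[0]) + 1, 0)
        ih = np.maximum(np.minimum(rois[:, 3], gt[3]) - np.maximum(rois[:, 1], gt[1]) + 1, 0)
        inter = iw * ih
        area_g = (gt[2] - gt[0] + 1) * (gt[3] - gt[1] + 1)
        iou = inter / (area_r + area_g - inter)
        if iou.max() >= iou_thresh:
            recalled += 1
    return recalled, len(gts)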
def validation(epoch, log_interval, test_dataloader, model, loss, writer, device):
    """Validate on the test dataset.

    Current validation covers only loss and pos|neg distance. In the future we
    will add more validation such as MAP@5|10|50|100 (maybe in another file).

    Args:
        epoch: Current epoch index, used as the tensorboard global step.
        log_interval: How often the logger will log.
        test_dataloader: It should not be None! A triplet dataloader to validate data.
        model: The model to test on the dataset.
        loss: Loss metric.
        writer: Tensorboard writer.
        device: Device the model computes on.
    Return:
        epoch average values: triplet_loss, pos_dists, neg_dists
    """
    logger.info("\n------------------------- Start validation -------------------------\n")
    # epoch average meter
    avg_test = AverageMeter()
    # get test batch count
    current_test_batch = 0
    total_test_batch = len(test_dataloader)
    # check dataloader is not None
    assert test_dataloader is not None, "test_dataloader should not be None."
    for batch_idx, batch_sample in enumerate(test_dataloader):
        # Skip the last iteration to avoid the problem of having different numbers
        # of tensors while calculating averages (sizes of tensors must be the same
        # for pairwise distance calculation)
        if batch_idx + 1 == len(test_dataloader):
            continue
        # switch to evaluation mode
        for param in model.parameters():
            param.requires_grad = False
        model.eval()
        # start time counting
        batch_start_time_test = time.time()
        # forward pass - compute embeddings
        anc_imgs = batch_sample['anchor_img']
        pos_imgs = batch_sample['pos_img']
        neg_imgs = batch_sample['neg_img']
        pos_cls = batch_sample['pos_cls']
        neg_cls = batch_sample['neg_cls']
        # move to device
        anc_imgs = anc_imgs.to(device)
        pos_imgs = pos_imgs.to(device)
        neg_imgs = neg_imgs.to(device)
        pos_cls = pos_cls.to(device)
        neg_cls = neg_cls.to(device)
        # forward
        output = model.forward_triplet(anc_imgs, pos_imgs, neg_imgs)
        # get output
        anc_emb = output['anchor_map']
        pos_emb = output['pos_map']
        neg_emb = output['neg_map']
        pos_dists = torch.mean(output['dist_pos'])
        neg_dists = torch.mean(output['dist_neg'])
        # compute loss
        loss_value = loss(anc_emb, pos_emb, neg_emb)
        # batch time & batch count
        current_test_batch += 1
        batch_time = time.time() - batch_start_time_test
        # update running averages
        avg_test.update(time=batch_time,
                        triplet_loss=loss_value,
                        pos_dists=pos_dists,
                        neg_dists=neg_dists)
        if current_test_batch % log_interval == 0:
            print_speed(current_test_batch, batch_time, total_test_batch, "global")
            logger.info(
                "\n current global average information:\n batch_time {0:.5f} | triplet_loss: {1:.5f} | pos_dists: {2:.5f} | neg_dists: {3:.5f} \n"
                .format(avg_test.time.avg, avg_test.triplet_loss.avg,
                        avg_test.pos_dists.avg, avg_test.neg_dists.avg))
    else:
        # for-else: runs once after the loop finishes; log the epoch averages
        writer.add_scalar("Validate/Loss/train", avg_test.triplet_loss.avg,
                          global_step=epoch)
        writer.add_scalar("Validate/Other/pos_dists", avg_test.pos_dists.avg,
                          global_step=epoch)
        writer.add_scalar("Validate/Other/neg_dists", avg_test.neg_dists.avg,
                          global_step=epoch)
    return avg_test.triplet_loss.avg, avg_test.pos_dists.avg, avg_test.neg_dists.avg
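# The `loss` callable passed into validation() is not shown here. One common
# choice consistent with the loss(anchor, positive, negative) call is
# PyTorch's built-in triplet margin loss,
# L = max(d(a, p) - d(a, n) + margin, 0). The margin value below is
# illustrative, not taken from this repository.
import torch.nn as nn

triplet_loss = nn.TripletMarginLoss(margin=0.2, p=2)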
writer.add_scalar("Train_Batch/Distance/neg_dists", neg_dists, global_step=current_batch) writer.add_scalar("Train_Batch_Global_AVG/loss", avg.triplet_loss.avg, global_step=current_batch) writer.add_scalar("Train_Batch_Global_AVG/pos_dists", avg.pos_dists.avg, global_step=current_batch) writer.add_scalar("Train_Batch_Global_AVG/neg_dists", avg.neg_dists.avg, global_step=current_batch) # log to logger if current_batch % log_interval == 0: print_speed(current_batch, batch_time, total_batch, "global") logger.info( "\n current batch information:\n epoch: {0} | batch_time {1:5f} | triplet_loss: {2:.5f} | pos_dists: {3:.5f} | neg_dists: {4:.5f} \n" .format(epoch + 1, avg.time.val, avg.triplet_loss.val, avg.pos_dists.val, avg.neg_dists.val)) logger.info( "\n current global average information:\n epoch: {0} | batch_time {1:5f} | triplet_loss: {2:.5f} | pos_dists: {3:.5f} | neg_dists: {4:.5f} \n" .format(epoch + 1, avg.time.avg, avg.triplet_loss.avg, avg.pos_dists.avg, avg.neg_dists.avg)) else: # add epoch avg writer.add_scalar("Train/Loss/train", avg.triplet_loss.avg, global_step=epoch) writer.add_scalar("Train/Other/train_pos_dists", avg.pos_dists.avg,
def train(train_loader, model, lr_scheduler, epoch, cfg, warmup=False):
    logger = logging.getLogger('global')
    model.cuda()
    model.train()
    world_size = 1
    rank = 0
    if args.dist:
        rank = dist.get_rank()
        world_size = dist.get_world_size()

    def freeze_bn(m):
        classname = m.__class__.__name__
        if classname.find('BatchNorm') != -1:
            m.eval()

    model.apply(freeze_bn)
    logger.info('freeze bn')
    if args.dist:
        # update random seed
        train_loader.sampler.set_epoch(epoch)
    t0 = time.time()
    for iter, input in enumerate(train_loader):
        # torch.cuda.empty_cache()
        if warmup:
            # update lr for each iteration
            lr_scheduler.step()
        x = {
            'cfg': cfg,
            'image': torch.autograd.Variable(input[0]).cuda(),
            'image_info': input[1][:, :3],
            'ground_truth_bboxes': input[2],
            'ignore_regions': None,  # input[3],
            'ground_truth_keypoints': input[4],
            'ground_truth_masks': input[5]
        }
        # for debug
        # debugger.store_tensor_as_image(input[0])
        # debugger.store_filenames(input[-1])
        t1 = time.time()
        outputs = model(x)
        t11 = time.time()
        rpn_cls_loss, rpn_loc_loss, rcnn_cls_loss, rcnn_loc_loss, keypoint_loss = outputs['losses']
        # gradient is averaged by normalizing the loss with world_size
        # loss = (rpn_cls_loss + rpn_loc_loss + rcnn_cls_loss + rcnn_loc_loss + keypoint_loss) / world_size
        loss = sum(outputs['losses']) / world_size
        # (debug) optional: render the computation graph with
        # vis_helper.make_dot(loss, dict(model.named_parameters()))
        t12 = time.time()
        lr_scheduler.optimizer.zero_grad()
        loss.backward()
        t13 = time.time()
        if args.dist:
            average_gradients(model)
        t14 = time.time()
        lr_scheduler.optimizer.step()
        t15 = time.time()
        rpn_accuracy = outputs['accuracy'][0][0] / 100.
        rcnn_accuracy = outputs['accuracy'][1][0] / 100.
        # legacy PyTorch 0.3-style scalar extraction (use .item() on newer versions)
        loss = loss.data.cpu()[0]
        rpn_cls_loss = rpn_cls_loss.data.cpu()[0]
        rpn_loc_loss = rpn_loc_loss.data.cpu()[0]
        rcnn_cls_loss = rcnn_cls_loss.data.cpu()[0]
        rcnn_loc_loss = rcnn_loc_loss.data.cpu()[0]
        if keypoint_loss is not None:
            keypoint_loss = keypoint_loss.data.cpu()[0]
        t2 = time.time()
        lr = lr_scheduler.get_lr()[0]
        logger.info(
            'Epoch: [%d][%d/%d] LR:%f Time: %.3f Loss: %.5f (rpn_cls: %.5f rpn_loc: %.5f rpn_acc: %.5f'
            ' rcnn_cls: %.5f, rcnn_loc: %.5f rcnn_acc:%.5f kpt:%.5f)'
            % (epoch, iter, len(train_loader), lr, t2 - t0, loss * world_size,
               rpn_cls_loss, rpn_loc_loss, rpn_accuracy,
               rcnn_cls_loss, rcnn_loc_loss, rcnn_accuracy, keypoint_loss))
        t3 = time.time()
        # (debug) the per-stage timing breakdown (data/forward/bp/sync/upd/loss/prt)
        # collected in t0..t3 above was logged here during profiling
        print_speed((epoch - 1) * len(train_loader) + iter + 1, t2 - t0,
                    args.epochs * len(train_loader))
        t0 = t2
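# average_gradients is not defined in this section. Below is a minimal sketch
# of the usual implementation: all-reduce (sum) every parameter gradient
# across workers. Because the loss above is already divided by world_size,
# the summed gradients come out averaged. This is an assumption, not the
# repository's actual helper.
import torch.distributed as dist

def average_gradients_sketch(model):
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)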
def train(train_loader, target_loader, val_loader, model, dec_model, dis_model,
          dis_model_patch, lr_scheduler, lr_scheduler_dec, lr_scheduler_dis,
          lr_scheduler_dis_patch, epoch, cfg, warmup=False):
    logger = logging.getLogger('global')
    model.cuda()
    model.train()
    dis_model.cuda()
    dis_model.train()
    dec_model.cuda()
    dec_model.train()
    dis_model_patch.cuda()
    dis_model_patch.train()
    if args.dist:
        rank = dist.get_rank()
        world_size = dist.get_world_size()
    else:
        world_size = 1
        rank = 0

    def freeze_bn(m):
        classname = m.__class__.__name__
        if classname.find('Norm') != -1:
            m.eval()

    model.apply(freeze_bn)
    # set the first fix_num conv layers to eval mode
    fix_num = args.fix_num
    count = 1
    for mm in model.modules():
        if count > fix_num:
            break
        if isinstance(mm, torch.nn.Conv2d) and count <= fix_num:
            mm.eval()
            count += 1
    # dec_model.apply(freeze_bn)
    logger.info('freeze bn')
    end = time.time()
    t0 = time.time()
    l1_loss = torch.nn.L1Loss()
    if args.dist:
        # update random seed
        train_loader.sampler.set_epoch(epoch)
        target_loader.sampler.set_epoch(epoch)
    for iter, (input, target) in enumerate(zip(train_loader, target_loader)):
        # torch.cuda.empty_cache()
        if warmup:
            # update lr for each iteration
            lr_scheduler.step()
            lr_scheduler_dis.step()
            lr_scheduler_dec.step()
            lr_scheduler_dis_patch.step()
        x = {
            'cfg': cfg,
            'image': (input[0]).cuda(),
            'image_info': input[1],
            'ground_truth_bboxes': input[2],
            'ignore_regions': None,
            'cluster_num': args.cluster_num,
            'threshold': args.threshold
            # 'ignore_regions': input[3] if args.dataset == 'coco' else None
        }
        target = (target).cuda()
        outputs = model(x, target)
        centers_source, centers_target = outputs['cluster_centers']
        corners_source = get_corner_from_center(centers_source)
        corners_target = get_corner_from_center(centers_target)
        # crop recon_size patches around the cluster centers
        x_small = []
        target_small = []
        for corners_idx in range(0, len(corners_source)):
            x1 = corners_source[corners_idx][0]
            y1 = corners_source[corners_idx][1]
            x2 = corners_source[corners_idx][2]
            y2 = corners_source[corners_idx][3]
            assert (x2 - x1 == args.recon_size), "x size does not match 256 in source"
            assert (y2 - y1 == args.recon_size), "y size does not match 256 in source"
            x_small_tmp = x['image'][:, :, y1:y2, x1:x2]
            x_small.append(x_small_tmp)
        x_small = torch.cat(x_small, 0)
        for corners_idx in range(0, len(corners_target)):
            x1 = corners_target[corners_idx][0]
            y1 = corners_target[corners_idx][1]
            x2 = corners_target[corners_idx][2]
            y2 = corners_target[corners_idx][3]
            assert (x2 - x1 == args.recon_size), "x size does not match 256 in target"
            assert (y2 - y1 == args.recon_size), "y size does not match 256 in target"
            target_small_tmp = target[:, :, y1:y2, x1:x2]
            target_small.append(target_small_tmp)
        target_small = torch.cat(target_small, 0)  # Size(4, 3, 256, 256)
        x_source_patch, x_target_patch = outputs['cluster_features']  # Size(4, 128, 4096)
        x_source_recon, x_target_recon = dec_model(x_source_patch, x_target_patch)  # Size(4, 3, 256, 256)

        ##########################################################################
        ######################### (1): start dis_update ##########################
        ##########################################################################
        lr_scheduler_dis.optimizer.zero_grad()
        x_source_dis, x_target_dis = dis_model(x_source_recon, x_target_recon)  # (4, 256)
        x_source_real, x_target_real = dis_model(x_small, target_small)  # (4, 256)
        x_source_dis = torch.sigmoid(x_source_dis)  # (4, dim)
        x_target_dis = torch.sigmoid(x_target_dis)  # (4, dim)
        x_source_real = torch.sigmoid(x_source_real)  # (4, dim)
        x_target_real = torch.sigmoid(x_target_real)
        x_source_dis_cluster = torch.split(x_source_dis, 1, dim=0)
        x_source_real_cluster = torch.split(x_source_real, 1, dim=0)
        score_1_cluster = generate_soft_label(1, x_source_real_cluster[0])
        score_0_cluster = generate_soft_label(0, x_source_dis_cluster[0])
        adloss_source = 0.0
        #################### (1.1): for source clusters ##########################
        for clu_idx in range(0, len(x_source_dis_cluster)):
            adloss_source += (
                F.binary_cross_entropy(x_source_dis_cluster[clu_idx], score_1_cluster) +
                F.binary_cross_entropy(x_source_real_cluster[clu_idx], score_0_cluster))
        #################### (1.2): for target clusters ##########################
        x_target_patch_pro = dis_model_patch(x_target_patch)
        x_target_patch_pro_mean = torch.mean(x_target_patch_pro, 1)  # Size(4,1)
        x_source_patch_pro = dis_model_patch(x_source_patch)  # (4, 512)
        x_target_dis_cluster = torch.split(x_target_dis, 1, dim=0)
        x_target_real_cluster = torch.split(x_target_real, 1, dim=0)
        adloss_target = 0.0
        for clu_idx in range(0, len(x_target_dis_cluster)):
            adloss_target += (
                x_target_patch_pro_mean[clu_idx] *
                F.binary_cross_entropy(x_target_dis_cluster[clu_idx], score_0_cluster) +
                F.binary_cross_entropy(x_target_real_cluster[clu_idx], score_1_cluster))
        adloss = (adloss_source + adloss_target) / world_size
        adloss.backward(retain_graph=True)
        max_grad3 = 0.0
        for pp in dis_model.parameters():
            tmp = torch.max(pp.grad.data)
            if max_grad3 < tmp:
                max_grad3 = tmp
        if args.dist:
            average_gradients(dis_model)
        lr_scheduler_dis.optimizer.step()

        ##########################################################################
        ####################### (2): start dis_patch_update ######################
        ##########################################################################
        lr_scheduler_dis_patch.optimizer.zero_grad()
        score_0_patch = generate_soft_label(0, x_target_patch_pro)
        score_1_patch = generate_soft_label(1, x_source_patch_pro)
        patch_loss_target = F.binary_cross_entropy(x_target_patch_pro, score_0_patch)
        patch_loss_source = F.binary_cross_entropy(x_source_patch_pro, score_1_patch)
        dis_patch_loss = (patch_loss_source + patch_loss_target) / world_size
        dis_patch_loss.backward(retain_graph=True)
        if args.dist:
            average_gradients(dis_model_patch)
        lr_scheduler_dis_patch.optimizer.step()

        ##########################################################################
        ########################## (3): start decoder_update #####################
        ##########################################################################
        lr_scheduler_dec.optimizer.zero_grad()
        # x_source_recon, x_target_recon = dec_model(x_target_patch, x_source_patch)
        x_source_dis, x_target_dis = dis_model(x_source_recon, x_target_recon)
        x_source_dis = torch.sigmoid(x_source_dis)  # (4, dim)
        x_source_real, x_target_real = dis_model(x_small, target_small)  # (4, 256)
        x_source_real = torch.sigmoid(x_source_real)
        x_target_real = torch.sigmoid(x_target_real)
        # patch loss of the target image
        x_target_patch_pro = dis_model_patch(x_target_patch)  # size(4, dim)
        gtav_dis_sigmoid_target = torch.sigmoid(x_target_dis)
        # weighting factor of the target patches and the target loss
        x_target_patch_pro_mean2 = torch.mean(x_target_patch_pro, 1)  # Size(4,1)
        fake_loss1_target = 0.0
        gtav_dis_sigmoid_target = torch.split(gtav_dis_sigmoid_target, 1, dim=0)
        # allone_target_1 = (torch.ones(gtav_dis_sigmoid_target[0].size()).float().cuda())
        all_target_1 = generate_hard_label(1, gtav_dis_sigmoid_target[0])
        gtav_real_sigmoid_target = torch.split(x_target_real, 1, dim=0)
        all_target_0 = generate_hard_label(0, gtav_real_sigmoid_target[0])
        for clu_idx in range(0, len(gtav_dis_sigmoid_target)):
            fake_loss1_target += x_target_patch_pro_mean2[clu_idx] * (
                F.binary_cross_entropy(gtav_dis_sigmoid_target[clu_idx], all_target_1) +
                F.binary_cross_entropy(gtav_real_sigmoid_target[clu_idx], all_target_0))
        fake_loss1_source = 0.0
        x_source_fake_cluster2 = torch.split(x_source_dis, 1, dim=0)
        all_source_1 = generate_hard_label(1, x_source_fake_cluster2[0])
        x_source_real_cluster2 = torch.split(x_source_real, 1, dim=0)
        all_source_0 = generate_hard_label(0, x_source_real_cluster2[0])
        for clu_idx in range(0, len(x_source_fake_cluster2)):
            fake_loss1_source += (
                F.binary_cross_entropy(x_source_fake_cluster2[clu_idx], all_source_1) +
                F.binary_cross_entropy(x_source_real_cluster2[clu_idx], all_source_0))
        recon_loss = (fake_loss1_source + fake_loss1_target) / world_size  # no discriminator in the decoder
        recon_loss.backward(retain_graph=True)
        max_grad2 = 0.0
        for pp in dec_model.parameters():
            tmp = torch.max(pp.grad.data)
            if max_grad2 < tmp:
                max_grad2 = tmp
        if args.dist:
            average_gradients(dec_model)
        # torch.nn.utils.clip_grad_norm(dec_model.parameters(), 10.0)
        lr_scheduler_dec.optimizer.step()

        ##########################################################################
        ########################### (4): start detection_update ##################
        ##########################################################################
        # target feature maps -> source reconstruction for cross-domain alignment
        x_source_recon, x_target_recon = dec_model(x_target_patch, x_source_patch)
        x_source_dis, x_target_dis = dis_model(x_source_recon, x_target_recon)  # (4, dim)
        # weight of target patches
        x_fake_dis_sigmoid = torch.sigmoid(x_target_dis)
        # this label was commented out in the original, but it is used on the next line
        allone_11 = generate_hard_label(1, x_fake_dis_sigmoid)
        fake_loss_source = F.binary_cross_entropy(x_fake_dis_sigmoid, allone_11)  # no discriminator in detection
        x_fake_dis_sigmoid2 = torch.sigmoid(x_source_dis)
        x_fake_dis_sigmoid2_cluster = torch.split(x_fake_dis_sigmoid2, 1, dim=0)
        allone_11_cluster = (torch.ones(x_fake_dis_sigmoid2_cluster[0].size()).float().cuda())
        fake_loss_target = 0.0
        for clu_idx in range(0, len(x_fake_dis_sigmoid2_cluster)):
            fake_loss_target += x_target_patch_pro_mean2[clu_idx] * (
                F.binary_cross_entropy(x_fake_dis_sigmoid2_cluster[clu_idx], allone_11_cluster))
        rpn_cls_loss, rpn_loc_loss, rcnn_cls_loss, rcnn_loc_loss = outputs['losses']
        # gradient is averaged by normalizing the loss with world_size
        loss = (rpn_cls_loss + rpn_loc_loss + rcnn_cls_loss + rcnn_loc_loss +
                0.1 * (fake_loss_source + fake_loss_target)) / world_size
        lr_scheduler.optimizer.zero_grad()
        loss.backward()
        max_grad1 = 0.0
        for pp in model.parameters():
            tmp = torch.max(pp.grad.data)
            if max_grad1 < tmp:
                max_grad1 = tmp
        if args.dist:
            average_gradients(model)
        # torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)
        lr_scheduler.optimizer.step()

        ##########################################################################
        ################################ Output information ######################
        ##########################################################################
        rpn_accuracy = outputs['accuracy'][0][0] / 100.
        rcnn_accuracy = outputs['accuracy'][1][0] / 100.
        t2 = time.time()
        lr = lr_scheduler.get_lr()[0]
        logger.info(
            'Epoch: [%d][%d/%d] LR:%f Time: %.3f Loss: %.5f (rpn_cls: %.5f rpn_loc: %.5f rpn_acc: %.5f'
            ' rcnn_cls: %.5f, rcnn_loc: %.5f rcnn_acc:%.5f fake_loss: %.5f dec_loss: %.5f dis_loss: %.5f fake_loss1: %.5f)'
            % (epoch, iter, len(train_loader), lr, t2 - t0, loss.item() * world_size,
               rpn_cls_loss.item(), rpn_loc_loss.item(), rpn_accuracy,
               rcnn_cls_loss.item(), rcnn_loc_loss.item(), rcnn_accuracy,
               fake_loss_target.item(), recon_loss.item(), adloss.item(),
               fake_loss1_source.item()))
        print_speed((epoch - 1) * len(train_loader) + iter + 1, t2 - t0,
                    args.epochs * len(train_loader))
        t0 = t2
        logger.info("Max Grad, Det: %5f, Dec: %5f, Dis: %5f" %
                    (max_grad1, max_grad2, max_grad3))
def validate_single(val_loader, model, cfg):
    global best_map
    logger = logging.getLogger('global')
    rank, world_size = 0, 1
    # switch to evaluate mode
    model.eval()
    total_rc = 0
    total_gt = 0
    logger.info('start validate')
    if not os.path.exists(args.results_dir):
        try:
            os.makedirs(args.results_dir)
        except Exception as e:
            print(e)
    # remove the original results files
    # if rank == 0:
    for f in os.listdir(args.results_dir):
        if 'results.txt.rank' in f and int(f.split('k')[-1]) >= world_size:
            logger.info("remove %s" % f)
            os.remove(os.path.join(args.results_dir, f))
    fout = open(os.path.join(args.results_dir, 'results.txt.rank%d' % rank), 'w')
    for iter, input in enumerate(val_loader):
        img = (input[0]).cuda()
        img_info = input[1]
        gt_boxes = input[2]
        filenames = input[-1]
        x = {
            'cfg': cfg,
            'image': img,
            'image_info': img_info,
            'ground_truth_bboxes': gt_boxes,
            'ignore_regions': None
        }
        batch_size = img.shape[0]
        t0 = time.time()
        outputs = model(x)['predict']
        t2 = time.time()
        proposals = outputs[0].data.cpu().numpy()
        bboxes = outputs[1].data.cpu().numpy()
        if torch.is_tensor(gt_boxes):
            gt_boxes = gt_boxes.cpu().numpy()
        for b_ix in range(batch_size):
            img_id = filenames[b_ix].rsplit('/', 1)[-1].rsplit('.', 1)[0]
            img_resize_scale = img_info[b_ix, -1]
            if args.dataset == 'coco':
                img_resize_scale = img_info[b_ix, 2]
            rois_per_image = proposals[proposals[:, 0] == b_ix]
            dts_per_image = bboxes[bboxes[:, 0] == b_ix]
            gts_per_image = gt_boxes[b_ix]
            # rpn recall
            num_rc, num_gt = bbox_helper.compute_recall(rois_per_image[:, 1:1 + 4],
                                                        gts_per_image)
            total_gt += num_gt
            total_rc += num_rc
            order = dts_per_image[:, -2].argsort()[::-1][:100]
            dts_per_image = dts_per_image[order]
            # faster-rcnn eval
            for cls in range(1, cfg['shared']['num_classes']):
                dts_per_cls = dts_per_image[dts_per_image[:, -1] == cls]
                gts_per_cls = gts_per_image[gts_per_image[:, -1] == cls]
                dts_per_cls = dts_per_cls[:, 1:-1]
                # dts_per_cls = dts_per_cls[dts_per_cls[:, -1] > 0.05]
                gts_per_cls = gts_per_cls[:, :-1]
                dts_per_cls = bbox_helper.clip_bbox(dts_per_cls, img_info[b_ix, :2])
                if len(dts_per_cls) > 0:
                    dts_per_cls[:, :4] = dts_per_cls[:, :4] / img_resize_scale
                if len(gts_per_cls) > 0:
                    gts_per_cls[:, :4] = gts_per_cls[:, :4] / img_resize_scale
                for bx in dts_per_cls:
                    if args.dataset == 'coco':
                        fout.write('val2017/{0}.jpg {1} {2}\n'.format(
                            img_id, ' '.join(map(str, bx)), cls))
                    else:
                        fout.write('{0} {1} {2}\n'.format(
                            img_id, ' '.join(map(str, bx)), cls))
                fout.flush()
        logger.info('Test: [%d/%d] Time: %.3f %d/%d' %
                    (iter, len(val_loader), t2 - t0, total_rc, total_gt))
        print_speed(iter + 1, t2 - t0, len(val_loader))
    logger.info('rpn300 recall=%f' % (total_rc / total_gt))
    fout.close()
    # evaluate to get the mAP: official coco api for coco, Cal_MAP for cityscapes
    if args.dataset == 'coco':
        eval_coco_ap(args.results_dir, 'bbox', args.val_meta_file)
    else:
        Cal_MAP(args.results_dir, args.val_meta_file, int(cfg['shared']['num_classes']))
    return total_rc / total_gt
def test_model(model, test_dataloader, log_interval, device):
    """Test and return the feature vector of every sample in the dataset with its index.

    Args:
        model: (nn.Module) loaded model
        test_dataloader: (torch.utils.data.DataLoader) It should not be None!
            A non-triplet dataloader to validate data. Its sample protocol is:
            {
                "img": target image,
                "cls": target class,
                "other": {
                    "index": index,
                }
            }
        log_interval: how often the logger will log
        device: cuda or cpu
    Return:
        a list of dicts: [
            {
                "cls": class label of the sample,
                "feature": feature vector of the result,
                "other": {
                    "index": index of the sample in the dataset,
                }
            },
            ...
        ]
    """
    logger.info("\n------------------------- Start Forwarding Dataset -------------------------\n")
    # epoch average meter
    avg_test = AverageMeter()
    # get test batch count
    current_test_batch = 0
    total_test_batch = len(test_dataloader)
    # list to return
    out_sample_list = []
    for batch_idx, batch_sample in enumerate(test_dataloader):
        # Skip the last iteration to avoid the problem of having different numbers
        # of tensors while calculating averages (sizes of tensors must be the same
        # for pairwise distance calculation)
        if batch_idx + 1 == len(test_dataloader):
            continue
        batch_size = test_dataloader.batch_size
        # switch to evaluation mode
        for param in model.parameters():
            param.requires_grad = False
        model.eval()
        # start time counting
        batch_start_time_test = time.time()
        # forward pass - compute embeddings
        imgs = batch_sample["img"]
        cls = batch_sample["cls"]
        indexs = batch_sample["other"]["index"]
        imgs = imgs.to(device)
        out_put = model(imgs)
        out_put = out_put.to("cpu")  # .to() is not in-place; keep the returned tensor
        for i in range(batch_size):
            out_dict = {
                "cls": cls[i],
                "feature": out_put[i],
                "other": {
                    "index": indexs[i]
                },
            }
            out_sample_list.append(out_dict)
        # batch time & batch count
        current_test_batch += 1
        batch_time = time.time() - batch_start_time_test
        if current_test_batch % log_interval == 0:
            print_speed(current_test_batch, batch_time, total_test_batch, "global")
    else:
        # for-else: runs once after the loop finishes
        logger.info("\n------------------------- End Forwarding Dataset -------------------------\n")
    return out_sample_list
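# A hypothetical usage sketch for test_model()'s return value: stack the
# per-sample feature vectors and run a nearest-neighbor lookup with
# torch.cdist. The variable names below are illustrative, not from this
# repository, and the sketch assumes each "feature" is a 1-D vector.
import torch

samples = test_model(model, test_dataloader, log_interval=10, device="cuda")
feats = torch.stack([s["feature"] for s in samples])  # (N, D)
query = feats[0:1]                                    # (1, D)
dists = torch.cdist(query, feats)                     # (1, N)
nearest = dists.squeeze(0).argsort()[1]               # index 0 is the self-match
print(samples[int(nearest)]["cls"], samples[int(nearest)]["other"]["index"])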
def train(train_loader, model, optimizer, lr_scheduler, epoch, cfg):
    global tb_index, best_acc, cur_lr, logger
    cur_lr = lr_scheduler.get_cur_lr()
    logger = logging.getLogger('global')
    avg = AverageMeter()
    model.train()
    # model.module.features.eval()
    # model.module.rpn_model.eval()
    # model.module.features.apply(BNtoFixed)
    # model.module.rpn_model.apply(BNtoFixed)
    # model.module.mask_model.train()
    # model.module.refine_model.train()
    model = model.cuda()
    end = time.time()

    def is_valid_number(x):
        return not (math.isnan(x) or math.isinf(x) or x > 1e4)

    num_per_epoch = len(train_loader.dataset) // args.epochs // args.batch
    start_epoch = epoch
    # NOTE: the loop below runs entirely under torch.no_grad() and stops after
    # 100 iterations, so this variant only saves heatmap visualizations; the
    # loss.backward()/optimizer.step() calls further down cannot update the
    # weights in this mode (backward() raises when no gradients are tracked).
    with torch.no_grad():
        for iter, input in enumerate(train_loader):
            if iter > 100:
                break
            if epoch != iter // num_per_epoch + start_epoch:  # next epoch
                epoch = iter // num_per_epoch + start_epoch
                if epoch == args.epochs:
                    return
                if model.module.features.unfix(epoch / args.epochs):
                    logger.info('unfix part model.')
                    optimizer, lr_scheduler = build_opt_lr(model.module, cfg, args, epoch)
                lr_scheduler.step(epoch)
                cur_lr = lr_scheduler.get_cur_lr()
                logger.info('epoch:{}'.format(epoch))

            tb_index = iter
            if iter % num_per_epoch == 0 and iter != 0:
                for idx, pg in enumerate(optimizer.param_groups):
                    logger.info("epoch {} lr {}".format(epoch, pg['lr']))
                    tb_writer.add_scalar('lr/group%d' % (idx + 1), pg['lr'], tb_index)

            data_time = time.time() - end
            avg.update(data_time=data_time)
            x_rpn = {
                'cfg': cfg,
                'template': torch.autograd.Variable(input[0]).cuda(),
                'search': torch.autograd.Variable(input[1]).cuda(),
                'label_cls': torch.autograd.Variable(input[2]).cuda(),
                'label_loc': torch.autograd.Variable(input[3]).cuda(),
                'label_loc_weight': torch.autograd.Variable(input[4]).cuda(),
                'label_mask': torch.autograd.Variable(input[6]).cuda()
            }
            x_kp = input[7]
            x_kp = {x: torch.autograd.Variable(y).cuda() for x, y in x_kp.items()}
            x_rpn['anchors'] = train_loader.dataset.anchors.all_anchors[0]
            outputs = model(x_rpn, x_kp)
            roi_box = outputs['predict'][-1]
            pred_kp = outputs['predict'][2]['hm_hp']
            batch_img = x_rpn['search'].expand(x_kp['hm_hp'].size(0), -1, -1, -1)
            gt_img, pred_img = save_gt_pred_heatmaps(
                batch_img, x_kp['hm_hp'], pred_kp,
                'test_imgs/test_{}.jpg'.format(iter))
            # rpn_pred_cls, rpn_pred_loc = outputs['predict'][:2]
            # rpn_pred_cls = outputs['predict'][-1]
            # anchors = train_loader.dataset.anchors.all_anchors[0]
            # normalized_boxes = proposal_layer([rpn_pred_cls, rpn_pred_loc], anchors, config=cfg)
            # print('rpn_pred_cls: ', rpn_pred_cls.shape)
            rpn_cls_loss, rpn_loc_loss, kp_losses = (
                torch.mean(outputs['losses'][0]),
                torch.mean(outputs['losses'][1]),
                outputs['losses'][3])
            kp_loss = torch.mean(kp_losses['loss'])
            kp_hp_loss = torch.mean(kp_losses['hp_loss'])
            kp_hm_hp_loss = torch.mean(kp_losses['hm_hp_loss'])
            kp_hp_offset_loss = torch.mean(kp_losses['hp_offset_loss'])
            cls_weight, reg_weight, kp_weight = cfg['loss']['weight']
            loss = rpn_cls_loss * cls_weight + rpn_loc_loss * reg_weight + kp_loss * kp_weight
            optimizer.zero_grad()
            loss.backward()
            # gradient clip
            if cfg['clip']['split']:
                torch.nn.utils.clip_grad_norm_(model.module.features.parameters(), cfg['clip']['feature'])
                torch.nn.utils.clip_grad_norm_(model.module.rpn_model.parameters(), cfg['clip']['rpn'])
                torch.nn.utils.clip_grad_norm_(model.module.mask_model.parameters(), cfg['clip']['mask'])
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            if is_valid_number(loss.item()):
                optimizer.step()

            siammask_loss = loss.item()
            batch_time = time.time() - end
            avg.update(batch_time=batch_time,
                       rpn_cls_loss=rpn_cls_loss,
                       rpn_loc_loss=rpn_loc_loss,
                       kp_hp_loss=kp_hp_loss,
                       kp_hm_hp_loss=kp_hm_hp_loss,
                       kp_hp_offset_loss=kp_hp_offset_loss,
                       kp_loss=kp_loss,
                       siammask_loss=siammask_loss)
            tb_writer.add_scalar('loss/cls', rpn_cls_loss, tb_index)
            tb_writer.add_scalar('loss/loc', rpn_loc_loss, tb_index)
            tb_writer.add_scalar('loss/kp_hp_loss', kp_hp_loss, tb_index)
            tb_writer.add_scalar('loss/kp_hm_hp_loss', kp_hm_hp_loss, tb_index)
            tb_writer.add_scalar('loss/kp_hp_offset_loss', kp_hp_offset_loss, tb_index)
            # tb_writer.add_scalar('loss/kp', kp_loss, tb_index)
            end = time.time()

            if (iter + 1) % args.print_freq == 0:
                logger.info(
                    'Epoch: [{0}][{1}/{2}] lr: {lr:.6f}\t{batch_time:s}\t{data_time:s}'
                    '\t{rpn_cls_loss:s}\t{rpn_loc_loss:s}'
                    '\t{kp_hp_loss:s}\t{kp_hm_hp_loss:s}\t{kp_hp_offset_loss:s}'
                    '\t{kp_loss:s}\t{siammask_loss:s}'.format(
                        epoch + 1, (iter + 1) % num_per_epoch, num_per_epoch,
                        lr=cur_lr,
                        batch_time=avg.batch_time,
                        data_time=avg.data_time,
                        rpn_cls_loss=avg.rpn_cls_loss,
                        rpn_loc_loss=avg.rpn_loc_loss,
                        kp_hp_loss=avg.kp_hp_loss,
                        kp_hm_hp_loss=avg.kp_hm_hp_loss,
                        kp_hp_offset_loss=avg.kp_hp_offset_loss,
                        kp_loss=avg.kp_loss,
                        siammask_loss=avg.siammask_loss))
                print_speed(iter + 1, avg.batch_time.avg, args.epochs * num_per_epoch)
def train(train_loader, model, optimizer, lr_scheduler, epoch, cfg):
    global tb_index, best_acc, cur_lr, logger
    cur_lr = lr_scheduler.get_cur_lr()
    logger = logging.getLogger('global')
    avg = AverageMeter()
    model.train()
    model = model.cuda()
    end = time.time()

    def is_valid_number(x):
        return not (math.isnan(x) or math.isinf(x) or x > 1e4)

    num_per_epoch = len(train_loader.dataset) // args.epochs // args.batch
    start_epoch = epoch
    for iter, input in enumerate(train_loader):
        if epoch != iter // num_per_epoch + start_epoch:  # next epoch
            epoch = iter // num_per_epoch + start_epoch
            if not os.path.exists(args.save_dir):  # makedir/save model
                os.makedirs(args.save_dir)
            save_checkpoint(
                {
                    'epoch': epoch,
                    'arch': args.arch,
                    'state_dict': model.module.state_dict(),
                    'best_acc': best_acc,
                    'optimizer': optimizer.state_dict(),
                    'anchor_cfg': cfg['anchors']
                }, False,
                os.path.join(args.save_dir, 'checkpoint_e%d.pth' % (epoch)),
                os.path.join(args.save_dir, 'best.pth'))
            if epoch == args.epochs:
                return
            if model.module.features.unfix(epoch / args.epochs):
                logger.info('unfix part model.')
                optimizer, lr_scheduler = build_opt_lr(model.module, cfg, args, epoch)
            lr_scheduler.step(epoch)
            cur_lr = lr_scheduler.get_cur_lr()
            logger.info('epoch:{}'.format(epoch))

        tb_index = iter
        if iter % num_per_epoch == 0 and iter != 0:
            for idx, pg in enumerate(optimizer.param_groups):
                logger.info("epoch {} lr {}".format(epoch, pg['lr']))
                tb_writer.add_scalar('lr/group%d' % (idx + 1), pg['lr'], tb_index)

        data_time = time.time() - end
        avg.update(data_time=data_time)
        x = {
            'cfg': cfg,
            'template': torch.autograd.Variable(input[0]).cuda(),
            'search': torch.autograd.Variable(input[1]).cuda(),
            'label_cls': torch.autograd.Variable(input[2]).cuda(),
            'label_loc': torch.autograd.Variable(input[3]).cuda(),
            'label_loc_weight': torch.autograd.Variable(input[4]).cuda(),
            'label_mask': torch.autograd.Variable(input[6]).cuda(),
            'label_kp_weight': torch.autograd.Variable(input[7]).cuda(),
            'label_mask_weight': torch.autograd.Variable(input[8]).cuda(),
        }
        outputs = model(x)
        # print(x['search'].shape)
        pred_mask = outputs['predict'][2]
        # rpn_pred_mask, shape (bs, 17, 127, 127)
        pred_mask = select_pred_heatmap(pred_mask, x['label_mask_weight'])
        true_search = select_gt_img(x['search'], x['label_mask_weight'])
        if true_search.shape:
            save_batch_heatmaps(true_search, pred_mask,
                                vis_outpath + '{}.jpg'.format(iter),
                                normalize=True)
        # (debug) an alternative matplotlib visualization was kept here,
        # commented out: project all 17 keypoint heatmaps onto one black
        # image next to the original search crop and plt.show() them.
        rpn_cls_loss, rpn_loc_loss, rpn_mask_loss = (
            torch.mean(outputs['losses'][0]),
            torch.mean(outputs['losses'][1]),
            torch.mean(outputs['losses'][2]))
        # mask_iou_mean, mask_iou_at_5, mask_iou_at_7 = torch.mean(outputs['accuracy'][0]), torch.mean(outputs['accuracy'][1]), torch.mean(outputs['accuracy'][2])
        cls_weight, reg_weight, mask_weight = cfg['loss']['weight']
        loss = rpn_cls_loss * cls_weight + rpn_loc_loss * reg_weight + rpn_mask_loss * mask_weight
        optimizer.zero_grad()
        loss.backward()
        # gradient clip
        if cfg['clip']['split']:
            torch.nn.utils.clip_grad_norm_(model.module.features.parameters(), cfg['clip']['feature'])
            torch.nn.utils.clip_grad_norm_(model.module.rpn_model.parameters(), cfg['clip']['rpn'])
            torch.nn.utils.clip_grad_norm_(model.module.mask_model.parameters(), cfg['clip']['mask'])
        else:
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        if is_valid_number(loss.item()):
            optimizer.step()

        siammask_loss = loss.item()
        batch_time = time.time() - end
        avg.update(batch_time=batch_time,
                   rpn_cls_loss=rpn_cls_loss,
                   rpn_loc_loss=rpn_loc_loss,
                   rpn_mask_loss=rpn_mask_loss * mask_weight,
                   siammask_loss=siammask_loss)
        tb_writer.add_scalar('loss/cls', rpn_cls_loss, tb_index)
        tb_writer.add_scalar('loss/loc', rpn_loc_loss, tb_index)
        tb_writer.add_scalar('loss/mask', rpn_mask_loss * mask_weight, tb_index)
        end = time.time()

        if (iter + 1) % args.print_freq == 0:
            logger.info(
                'Epoch: [{0}][{1}/{2}] lr: {lr:.6f}\t{batch_time:s}\t{data_time:s}'
                '\t{rpn_cls_loss:s}\t{rpn_loc_loss:s}\t{rpn_mask_loss:s}\t{siammask_loss:s}'
                .format(epoch + 1, (iter + 1) % num_per_epoch, num_per_epoch,
                        lr=cur_lr,
                        batch_time=avg.batch_time,
                        data_time=avg.data_time,
                        rpn_cls_loss=avg.rpn_cls_loss,
                        rpn_loc_loss=avg.rpn_loc_loss,
                        rpn_mask_loss=avg.rpn_mask_loss,
                        siammask_loss=avg.siammask_loss))
            print_speed(iter + 1, avg.batch_time.avg, args.epochs * num_per_epoch)
def train(dataloader, optimizer, model):
    iter = 0
    begin_time = 0.0
    average_meter = AverageMeter()
    num_per_epoch = len(dataloader.dataset) // (cfg.GRAD.BATCH_SIZE)
    tb_writer = SummaryWriter(cfg.GRAD.LOG_DIR)
    for epoch in range(cfg.GRAD.EPOCHS):
        dataloader.dataset.shuffle()
        begin_time = time.time()
        for data in dataloader:
            examplar_img = data['examplar_img'].cuda()
            train_search_img = data['train_search_img'].cuda()
            train_gt_cls = data['train_gt_cls'].cuda()
            train_gt_delta = data['train_gt_delta'].cuda()
            train_delta_weight = data['train_delta_weight'].cuda()
            test_search_img = data['test_search_img'].cuda()
            test_gt_cls = data['test_gt_cls'].cuda()
            test_gt_delta = data['test_gt_delta'].cuda()
            test_delta_weight = data['test_delta_weight'].cuda()
            data_time = time.time() - begin_time

            losses = model.forward(examplar_img, train_search_img,
                                   train_gt_cls, train_gt_delta,
                                   train_delta_weight, test_search_img,
                                   test_gt_cls, test_gt_delta,
                                   test_delta_weight)
            cls_loss = losses['cls_loss']
            loc_loss = losses['loc_loss']
            loss = losses['total_loss']

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_time = time.time() - begin_time
            batch_info = {}
            batch_info['data_time'] = data_time
            batch_info['batch_time'] = batch_time
            average_meter.update(**batch_info)

            # add summary writer
            for k, v in losses.items():
                if k.startswith('examplar'):
                    tb_writer.add_histogram(k, v, iter)
                else:
                    tb_writer.add_scalar(k, v, iter)

            if iter % cfg.TRAIN.PRINT_EVERY == 0:
                logger.info(
                    'epoch: {}, iter: {}, init_cls_loss: {}, init_loc_loss: {}, init_loss: {}'
                    .format(epoch + 1, iter, losses['init_cls_loss'].item(),
                            losses['init_loc_loss'].item(),
                            losses['init_total_loss'].item()))
                logger.info(
                    'epoch: {}, iter: {}, cls_loss: {}, loc_loss: {}, loss: {}'
                    .format(epoch + 1, iter, cls_loss.item(), loc_loss.item(),
                            loss.item()))
                print_speed(iter + 1, average_meter.batch_time.avg,
                            cfg.GRAD.EPOCHS * num_per_epoch)
            begin_time = time.time()
            iter += 1

        # save train_state
        if not os.path.exists(cfg.GRAD.SNAPSHOT_DIR):
            os.makedirs(cfg.GRAD.SNAPSHOT_DIR)
        # put the update to the rpn state
        state = {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "epoch": epoch,
        }
        save_path = "{}/checkpoint_e{}.pth".format(cfg.GRAD.SNAPSHOT_DIR, epoch)
        logger.info("save state to {}".format(save_path))
        torch.save(state, save_path)
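# AverageMeter is used throughout but never defined here. A minimal sketch
# compatible with the kwargs-style call sites above (meter.update(batch_time=...),
# then meter.batch_time.avg and '{batch_time:s}' formatting). This is an
# assumption modeled on pysot-style meters, not this project's actual class;
# note the DeepMask-style train() further below uses the classic
# update(val, n) meter instead.
class Meter(object):
    def __init__(self):
        self.val, self.sum, self.count = 0.0, 0.0, 0

    def update(self, val):
        self.val = val
        self.sum += val
        self.count += 1

    @property
    def avg(self):
        return self.sum / max(self.count, 1)

    def __str__(self):
        return '{:.4f} ({:.4f})'.format(self.val, self.avg)

    def __format__(self, spec):
        # allow '{meter:s}' in logging format strings
        return format(str(self), spec)


class AverageMeter(object):
    def update(self, **kwargs):
        # lazily create one Meter per tracked quantity
        for k, v in kwargs.items():
            if not hasattr(self, k):
                setattr(self, k, Meter())
            getattr(self, k).update(float(v))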
def main():
    # init logger
    init_log('global', args.save_dir, logging.INFO)
    logger = logging.getLogger('global')
    # print arguments
    for arg in vars(args):
        logger.info("{}: {}".format(arg, getattr(args, arg)))

    # get device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # build dataloader and model
    train_loader, test_loader = build_nyu_dataloader(args.dataset_dir)
    opts = {"L": 5, "k": 12, "bn": True}
    model = D3(opts)

    # check GPU numbers and deploy parallel
    # parallel = False
    # if torch.cuda.device_count() > 1:
    #     parallel = True
    #     logger.info("Let's use {:d} GPUs!".format(torch.cuda.device_count()))
    #     # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    #     model = nn.DataParallel(model)
    model.to(device)

    logger.info("*" * 40)
    logger.info(model)
    logger.info("*" * 40)

    # optimizer settings
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # optionally resume from a checkpoint
    # if args.resume:
    #     if os.path.isfile(args.resume):
    #         model, _, args.start_epoch = restore_from(model, optimizer, args.resume)

    # track the best model; abs_rel is an error metric, so lower is better
    # and the running best must start at infinity, not 0.0
    best_model_wts = copy.deepcopy(model.state_dict())
    best_abs_rel = float('inf')

    logger.info("Start training...")
    # epoches = args.batches // train_loader.__len__()
    for epoch in range(args.epoches):
        # step learning-rate decay
        for g in optimizer.param_groups:
            g['lr'] = args.lr * (1 - args.lr_decay)**(epoch // args.lr_decay_step)
        writer.add_scalar('lr', optimizer.param_groups[0]['lr'], epoch)

        t0 = time.time()
        train_one_epoch(train_loader, model, optimizer, device, epoch)
        t1 = time.time()

        if epoch % args.test_rate == 0:
            test_abs_rel = test_one_epoch(test_loader, model, device, epoch)
            if test_abs_rel < best_abs_rel:
                best_abs_rel = test_abs_rel
                best_model_wts = copy.deepcopy(model.state_dict())
            torch.cuda.empty_cache()

            filename = os.path.join(args.save_dir,
                                    'checkpoint_e%d.pth' % (epoch + 1))
            save_checkpoint(
                {
                    'epoch': epoch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()
                },
                is_best=False,
                filename=filename)
            logger.info("Saved model : {}".format(filename))

        print_speed(epoch, t1 - t0, args.epoches)

    save_checkpoint(
        {
            'batch_num': epoch,
            'state_dict': best_model_wts,
            'optimizer': optimizer.state_dict()
        },
        is_best=True,
        filename=os.path.join(args.save_dir, 'model_best.pth'))
    writer.close()
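# save_checkpoint is called both positionally (state, is_best, filename,
# best_path) and with keywords (is_best=..., filename=...). A hypothetical
# implementation consistent with those call sites; the project's real helper
# may differ:
import shutil


def save_checkpoint(state, is_best, filename, best_path='model_best.pth'):
    # always persist the latest state; copy it aside when it is the best so far
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, best_path)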
def train(train_loader, model, optimizer, lr_scheduler, epoch, cfg):
    global tb_index, best_acc, cur_lr, logger
    cur_lr = lr_scheduler.get_cur_lr()
    logger = logging.getLogger('global')
    avg = AverageMeter()
    model.train()
    model = model.cuda()
    end = time.time()

    def is_valid_number(x):
        return not (math.isnan(x) or math.isinf(x) or x > 1e4)

    num_per_epoch = len(train_loader.dataset) // args.epochs // args.batch
    start_epoch = epoch
    for iter, input in enumerate(train_loader):
        if epoch != iter // num_per_epoch + start_epoch:  # next epoch
            epoch = iter // num_per_epoch + start_epoch

            if not os.path.exists(args.save_dir):  # makedir/save model
                os.makedirs(args.save_dir)

            save_checkpoint(
                {
                    'epoch': epoch,
                    'arch': args.arch,
                    'state_dict': model.module.state_dict(),
                    'best_acc': best_acc,
                    'optimizer': optimizer.state_dict(),
                    'anchor_cfg': cfg['anchors']
                }, False,
                os.path.join(args.save_dir, 'checkpoint_e%d.pth' % (epoch)),
                os.path.join(args.save_dir, 'best.pth'))

            if epoch == args.epochs:
                return

            if model.module.features.unfix(epoch / args.epochs):
                logger.info('unfix part model.')
                optimizer, lr_scheduler = build_opt_lr(model.module, cfg, args, epoch)

            lr_scheduler.step(epoch)
            cur_lr = lr_scheduler.get_cur_lr()
            logger.info('epoch:{}'.format(epoch))

        tb_index = iter
        if iter % num_per_epoch == 0 and iter != 0:
            for idx, pg in enumerate(optimizer.param_groups):
                logger.info("epoch {} lr {}".format(epoch, pg['lr']))
                tb_writer.add_scalar('lr/group%d' % (idx + 1), pg['lr'], tb_index)

        data_time = time.time() - end
        avg.update(data_time=data_time)
        x = {
            'cfg': cfg,
            'template': torch.autograd.Variable(input[0]).cuda(),
            'search': torch.autograd.Variable(input[1]).cuda(),
            'label_cls': torch.autograd.Variable(input[2]).cuda(),
            'label_loc': torch.autograd.Variable(input[3]).cuda(),
            'label_loc_weight': torch.autograd.Variable(input[4]).cuda(),
            'label_mask': torch.autograd.Variable(input[6]).cuda(),
            'label_mask_weight': torch.autograd.Variable(input[7]).cuda(),
        }

        outputs = model(x)

        gt_mask = x['label_mask']
        gt_mask = select_gt_img(gt_mask, x['label_mask_weight'])
        pred_mask = outputs['predict'][2]
        pred_mask = select_pred_heatmap(pred_mask, x['label_mask_weight'])
        # pred_mask shape: (bs, channel, 127, 127)

        rpn_cls_loss, rpn_loc_loss, rpn_mask_loss = \
            torch.mean(outputs['losses'][0]), \
            torch.mean(outputs['losses'][1]), \
            torch.mean(outputs['losses'][2])
        mask_iou_mean, mask_iou_at_5, mask_iou_at_7 = \
            torch.mean(outputs['accuracy'][0]), \
            torch.mean(outputs['accuracy'][1]), \
            torch.mean(outputs['accuracy'][2])

        cls_weight, reg_weight, mask_weight = cfg['loss']['weight']
        loss = rpn_cls_loss * cls_weight + rpn_loc_loss * reg_weight \
            + rpn_mask_loss * mask_weight

        optimizer.zero_grad()
        loss.backward()

        if cfg['clip']['split']:
            torch.nn.utils.clip_grad_norm_(model.module.features.parameters(),
                                           cfg['clip']['feature'])
            torch.nn.utils.clip_grad_norm_(model.module.rpn_model.parameters(),
                                           cfg['clip']['rpn'])
            torch.nn.utils.clip_grad_norm_(model.module.mask_model.parameters(),
                                           cfg['clip']['mask'])
        else:
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)  # gradient clip

        if is_valid_number(loss.item()):
            optimizer.step()

        siammask_loss = loss.item()

        batch_time = time.time() - end
        avg.update(batch_time=batch_time,
                   rpn_cls_loss=rpn_cls_loss,
                   rpn_loc_loss=rpn_loc_loss,
                   rpn_mask_loss=rpn_mask_loss,
                   siammask_loss=siammask_loss,
                   mask_iou_mean=mask_iou_mean,
                   mask_iou_at_5=mask_iou_at_5,
                   mask_iou_at_7=mask_iou_at_7)

        tb_writer.add_scalar('loss/cls', rpn_cls_loss, tb_index)
        tb_writer.add_scalar('loss/loc', rpn_loc_loss, tb_index)
        tb_writer.add_scalar('loss/mask', rpn_mask_loss, tb_index)
        tb_writer.add_scalar('mask/mIoU', mask_iou_mean, tb_index)
        tb_writer.add_scalar('mask/mIoU@0.5', mask_iou_at_5, tb_index)
        tb_writer.add_scalar('mask/mIoU@0.7', mask_iou_at_7, tb_index)

        if tb_index % 200 == 0:
            tb_writer.add_image('gt_img', gt_mask[0, :, :, :], tb_index)
            tb_writer.add_image('pred_img', pred_mask[0, :, :, :], tb_index)

        end = time.time()

        if (iter + 1) % args.print_freq == 0:
            logger.info(
                'Epoch: [{0}][{1}/{2}] lr: {lr:.6f}\t{batch_time:s}\t{data_time:s}'
                '\t{rpn_cls_loss:s}\t{rpn_loc_loss:s}\t{rpn_mask_loss:s}\t{siammask_loss:s}'
                '\t{mask_iou_mean:s}\t{mask_iou_at_5:s}\t{mask_iou_at_7:s}'
                .format(epoch + 1, (iter + 1) % num_per_epoch, num_per_epoch,
                        lr=cur_lr,
                        batch_time=avg.batch_time,
                        data_time=avg.data_time,
                        rpn_cls_loss=avg.rpn_cls_loss,
                        rpn_loc_loss=avg.rpn_loc_loss,
                        rpn_mask_loss=avg.rpn_mask_loss,
                        siammask_loss=avg.siammask_loss,
                        mask_iou_mean=avg.mask_iou_mean,
                        mask_iou_at_5=avg.mask_iou_at_5,
                        mask_iou_at_7=avg.mask_iou_at_7))
            print_speed(iter + 1, avg.batch_time.avg, args.epochs * num_per_epoch)
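# print_speed(current_iter, seconds_per_iter, total_iters) is called by every
# loop in this file but not defined here. A sketch of a compatible
# implementation (modeled on the SiamMask-style utility; the actual helper
# may format its output differently):
def print_speed(cur, avg_time, total):
    logger = logging.getLogger('global')
    remaining = max(total - cur, 0) * avg_time  # seconds left at current speed
    days, rem = divmod(remaining, 86400)
    hours, rem = divmod(rem, 3600)
    minutes = rem // 60
    logger.info('Progress: %d / %d [%d%%], %.3f s/iter, remaining %d:%02d:%02d (D:H:M)' %
                (cur, total, int(100 * cur / total), avg_time,
                 int(days), int(hours), int(minutes)))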
def train(train_loader, model, criterion, optimizer, epoch):
    logger = logging.getLogger('global')
    batch_time = AverageMeter()
    data_time = AverageMeter()
    mask_losses = AverageMeter()
    score_losses = AverageMeter()

    # switch to train mode
    model.train()
    if args.freeze_bn:
        model.apply(BNtoFixed)

    train_loader.dataset.shuffle()
    end = time.time()
    for i, (img, target, head_status) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        img = img.to(device)
        target = target.to(device)

        # compute output
        output = model(img)
        loss = criterion(output[head_status[0]], target)

        # measure and record loss
        if head_status[0] == 0:
            mask_losses.update(loss.item(), img.size(0))
            loss.mul_(img.size(0))  # gradOutputs:mul(self.inputs:size(1))
        else:
            score_losses.update(loss.item(), img.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 10)  # REMOVE?
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if args.visualize and head_status[0] == 0:
            visual_batch(img, output[0].sigmoid(), target)

        if i % args.print_freq == 0:
            logger.info(
                'Epoch: [{0}][{1}/{2}]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Data {data_time.val:.3f} ({data_time.avg:.3f})\n'
                'LR {lr:.1e} \t Mask Loss {mask_loss.val:.4f} ({mask_loss.avg:.4f})\t'
                'Score Loss {score_loss.val:.3f} ({score_loss.avg:.3f})'
                .format(epoch, i, len(train_loader),
                        batch_time=batch_time,
                        lr=optimizer.param_groups[0]['lr'],
                        data_time=data_time,
                        mask_loss=mask_losses,
                        score_loss=score_losses))
            print_speed(epoch * len(train_loader) + i + 1, batch_time.avg,
                        args.maxepoch * len(train_loader))

            step = epoch * len(train_loader) + i + 1
            if head_status[0] == 0:
                # resize targets/predictions to iSz and pad to the full crop
                mask_true_iSz = torch.nn.functional.interpolate(
                    target, size=(args.iSz, args.iSz))
                mask_true = torch.nn.functional.pad(mask_true_iSz,
                                                    (16, 16, 16, 16))
                mask_pred_iSz = torch.nn.functional.interpolate(
                    output[0], size=(args.iSz, args.iSz))
                mask_pred = torch.nn.functional.pad(mask_pred_iSz,
                                                    (16, 16, 16, 16))
                writer.add_images('train/img', img,
                                  global_step=step, dataformats='NCHW')
                writer.add_images('train/mask_true', mask_true,
                                  global_step=step, dataformats='NCHW')
                writer.add_images('train/mask_pred', mask_pred,
                                  global_step=step, dataformats='NCHW')

    writer.add_scalar('train_loss/mask_loss', mask_losses.avg, epoch)
    writer.add_scalar('train_loss/score_losses', score_losses.avg, epoch)
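# BNtoFixed (applied via model.apply above when args.freeze_bn is set) is not
# defined in this file. A plausible sketch: put BatchNorm layers in eval mode
# so their running statistics stop updating, and freeze their affine
# parameters. This is an assumption about its behavior, not the repo's code:
import torch.nn as nn


def BNtoFixed(m):
    if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
        m.eval()
        for p in m.parameters():
            p.requires_grad = False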