import os

import torch
import visdom
from mmcv import Config
from mmcv.parallel import MMDataParallel
from mmcv.runner import load_checkpoint, obj_from_dict, weights_to_cpu

from mmdet import __version__
from mmdet.apis import get_root_logger, init_dist, set_random_seed
from mmdet.datasets import build_dataloader, build_dataset
from mmdet.models import build_detector


def main():
    args = parse_args()
    cfg = Config.fromfile(args.config)

    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # update configs according to CLI args
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    cfg.gpus = args.gpus
    if args.autoscale_lr:
        # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
        cfg.optimizer['lr'] = cfg.optimizer['lr'] * cfg.gpus / 8

    # init the distributed env first, since the logger depends on the dist info
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # init the logger before other steps
    logger = get_root_logger(cfg.log_level)
    logger.info('Distributed training: {}'.format(distributed))

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}'.format(args.seed))
        set_random_seed(args.seed)

    datasets = [build_dataset(cfg.data.train)]
    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)
    if cfg.load_from:
        checkpoint = load_checkpoint(model, cfg.load_from, map_location='cpu')
        model.CLASSES = datasets[0].CLASSES
    if cfg.checkpoint_config is not None:
        # save the mmdet version, config file content and class names in
        # checkpoints as meta data
        cfg.checkpoint_config.meta = dict(mmdet_version=__version__,
                                          config=cfg.text,
                                          CLASSES=datasets[0].CLASSES)

    data_loader = build_dataloader(datasets[0],
                                   imgs_per_gpu=cfg.data.imgs_per_gpu,
                                   workers_per_gpu=cfg.data.workers_per_gpu,
                                   num_gpus=cfg.gpus,
                                   dist=False,
                                   shuffle=False)

    # put the model on GPUs
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    model.train()
    model_load = model.module if hasattr(model, 'module') else model

    # two optimizers: `optimizer` updates only the aggregation module
    # (`model.agg`) during warm-up, `optimizer_all` updates the whole detector
    optimizer_all = obj_from_dict(cfg.optimizer, torch.optim,
                                  dict(params=model_load.parameters()))
    optimizer = obj_from_dict(cfg.optimizer, torch.optim,
                              dict(params=model_load.agg.parameters()))

    start_epoch = 0
    meta = None
    epoch = start_epoch

    # one visdom window per tracked loss term
    vis = visdom.Visdom(env='fuse_c')

    def loss_window(title):
        return vis.line(X=torch.zeros((1, )).cpu(),
                        Y=torch.zeros(1).cpu(),
                        opts=dict(xlabel='minibatches',
                                  ylabel=title,
                                  title=title,
                                  legend=[title]))

    loss_cls_window = loss_window('Loss of classification')
    loss_init_window = loss_window('Loss of init reppoint')
    loss_refine_window = loss_window('Loss of refine reppoint')
    loss_total_window = loss_window('Loss all')
    # reserved for the transformation loss, which is disabled below
    loss_trans_window = loss_window('Loss trans')

    training_sample = 0
    for e in range(cfg.total_epochs):
        i = 0
        # save a checkpoint at the start of every epoch
        # (the original `epoch % 1 == 0` guard was always true)
        if meta is None:
            meta = dict(epoch=epoch + 1, iter=i)
        else:
            meta.update(epoch=epoch + 1, iter=i)
        checkpoint = {
            'meta': meta,
            'state_dict': weights_to_cpu(model.state_dict())
        }
        if optimizer_all is not None:
            checkpoint['optimizer'] = optimizer_all.state_dict()
        if not os.path.exists(cfg.work_dir):
            os.makedirs(cfg.work_dir)
        filename = os.path.join(cfg.work_dir, 'epoch_{}.pth'.format(epoch))
        torch.save(checkpoint, filename)

        for i, data in enumerate(data_loader):
            optimizer.zero_grad()
            optimizer_all.zero_grad()

            reference_id = (data['img_meta'].data[0][0]['filename']
                            .split('/')[-1]).split('.')[0]
            video_id = data['img_meta'].data[0][0]['filename'].split('/')[-2]
            print('start image:', data['img_meta'].data[0][0]['filename'])
            print('end image:', data['img_meta'].data[-1][-1]['filename'])

            # if a support frame was sampled from a different video than the
            # reference frame, replace it with the reference frame so that
            # aggregation never mixes frames across video boundaries
            for m in range(len(data['img_meta'].data)):
                start_name = data['img_meta'].data[m][0]['filename'].split(
                    '/')[-2]
                for n in range(len(data['img_meta'].data[m])):
                    check_name = data['img_meta'].data[m][n][
                        'filename'].split('/')[-2]
                    if start_name != check_name:
                        print('end of video')
                        data['img_meta'].data[m][n] = data['img_meta'].data[m][0]
                        data['gt_bboxes'].data[m][n] = data['gt_bboxes'].data[m][0]
                        data['gt_labels'].data[m][n] = data['gt_labels'].data[m][0]
                        data['img'].data[m][n] = data['img'].data[m][0]

            # an auxiliary transformation loss (`loss_trans`) returned by an
            # earlier version of the forward pass is disabled here
            losses = model(return_loss=True, **data)

            if isinstance(losses, list):
                # one loss dict per branch: aggregated, reference and support
                loss_all = []
                log = []
                for p in range(len(losses)):
                    loss, log_var = parse_losses(losses[p])
                    loss_all.append(loss)
                    log.append(log_var)
                # weight the aggregated branch by 1 and the three auxiliary
                # branches by 0.5 each, then divide by the total weight (2.5)
                losses = (loss_all[0] + 0.5 * loss_all[1] +
                          0.5 * loss_all[2] + 0.5 * loss_all[3]) / 2.5
            else:
                losses, log_vars = parse_losses(losses)
                # single-branch case: keep the same logging structure so the
                # code below can index `log` either way
                log = [log_vars]

            losses.backward()
            # warm-up schedule: update only the aggregation module for the
            # first 10 epochs, then update the whole detector
            if epoch < 10:
                optimizer.step()
            else:
                optimizer_all.step()

            log_vars = log[0]
            vis.line(X=torch.ones(1).cpu() * training_sample,
                     Y=log_vars['loss_cls'] * torch.ones(1).cpu(),
                     win=loss_cls_window,
                     update='append')
            vis.line(X=torch.ones(1).cpu() * training_sample,
                     Y=log_vars['loss_pts_init'] * torch.ones(1).cpu(),
                     win=loss_init_window,
                     update='append')
            vis.line(X=torch.ones(1).cpu() * training_sample,
                     Y=log_vars['loss_pts_refine'] * torch.ones(1).cpu(),
                     win=loss_refine_window,
                     update='append')
            vis.line(X=torch.ones(1).cpu() * training_sample,
                     Y=losses.item() * torch.ones(1).cpu(),
                     win=loss_total_window,
                     update='append')

            # per-branch console log
            for branch, branch_log in zip(('agg', 'refer', 'support'), log):
                print(branch)
                print('epoch:', epoch, 'index:', i,
                      'video_id:', video_id,
                      'reference_id:', reference_id,
                      'loss_cls:', branch_log['loss_cls'],
                      'loss_init_box:', branch_log['loss_pts_init'],
                      'loss_refine_box:', branch_log['loss_pts_refine'])

            training_sample += 1
        # (an intra-epoch checkpoint every 300 iterations was here, commented
        # out; the variant below saves every 1000 iterations instead)
        epoch += 1
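# `parse_losses` is used above but not defined in this file. Below is a
# minimal sketch of the helper the training loops assume; it mirrors the
# behaviour of the `parse_losses` helper shipped with mmdet 1.x (sum every
# loss term into one scalar, collect plain floats for logging), but its exact
# form in this project is an assumption.
from collections import OrderedDict


def parse_losses(losses):
    """Sum all loss terms into one scalar and collect values for logging."""
    log_vars = OrderedDict()
    for loss_name, loss_value in losses.items():
        if isinstance(loss_value, torch.Tensor):
            log_vars[loss_name] = loss_value.mean()
        elif isinstance(loss_value, list):
            log_vars[loss_name] = sum(v.mean() for v in loss_value)
        else:
            raise TypeError(
                '{} is not a tensor or list of tensors'.format(loss_name))

    # the total loss is the sum of every entry whose name contains 'loss'
    loss = sum(v for k, v in log_vars.items() if 'loss' in k)
    log_vars['loss'] = loss
    # store plain floats so the values can be printed and plotted directly
    for name in log_vars:
        log_vars[name] = log_vars[name].item()
    return loss, log_vars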
# A simpler single-optimizer variant of the loop above: no visdom plots, no
# per-branch loss weighting, and it checkpoints every 1000 iterations as well
# as at the end of each epoch. Renamed from `main` so it does not shadow the
# function above.
def main_simple():
    args = parse_args()
    cfg = Config.fromfile(args.config)

    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # update configs according to CLI args; note that this variant applies
    # the linear lr scaling rule unconditionally
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    cfg.gpus = args.gpus
    cfg.optimizer['lr'] = cfg.optimizer['lr'] * cfg.gpus / 8

    # init the distributed env first, since the logger depends on the dist info
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # init the logger before other steps
    logger = get_root_logger(cfg.log_level)
    logger.info('Distributed training: {}'.format(distributed))

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}'.format(args.seed))
        set_random_seed(args.seed)

    datasets = [build_dataset(cfg.data.train)]
    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)
    if cfg.load_from:
        checkpoint = load_checkpoint(model, cfg.load_from, map_location='cpu')
        model.CLASSES = datasets[0].CLASSES
    if cfg.checkpoint_config is not None:
        # save the mmdet version, config file content and class names in
        # checkpoints as meta data
        cfg.checkpoint_config.meta = dict(mmdet_version=__version__,
                                          config=cfg.text,
                                          CLASSES=datasets[0].CLASSES)

    data_loader = build_dataloader(datasets[0],
                                   imgs_per_gpu=cfg.data.imgs_per_gpu,
                                   workers_per_gpu=cfg.data.workers_per_gpu,
                                   num_gpus=cfg.gpus,
                                   dist=False,
                                   shuffle=False)

    # put the model on GPUs
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    model.train()
    model_load = model.module if hasattr(model, 'module') else model
    optimizer = obj_from_dict(cfg.optimizer, torch.optim,
                              dict(params=model_load.parameters()))

    start_epoch = 0
    meta = None
    epoch = start_epoch
    for e in range(cfg.total_epochs):
        i = 0
        print(len(data_loader))
        for i, data in enumerate(data_loader):
            reference_id = (data['img_meta'].data[0][0]['filename']
                            .split('/')[-1]).split('.')[0]
            video_id = data['img_meta'].data[0][0]['filename'].split('/')[-2]

            losses = model(return_loss=True, **data)
            losses, log_vars = parse_losses(losses)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            print('epoch:', epoch, 'i:', i,
                  'video_id:', video_id,
                  'reference_id:', reference_id,
                  'loss_cls:', log_vars['loss_cls'],
                  'loss_init_box:', log_vars['loss_pts_init'],
                  'loss_refine_box:', log_vars['loss_pts_refine'])

            # intra-epoch checkpoint every 1000 iterations
            if i % 1000 == 0:
                if meta is None:
                    meta = dict(epoch=epoch + 1, iter=i)
                else:
                    meta.update(epoch=epoch + 1, iter=i)
                checkpoint = {
                    'meta': meta,
                    'state_dict': weights_to_cpu(model.state_dict())
                }
                if optimizer is not None:
                    checkpoint['optimizer'] = optimizer.state_dict()
                if not os.path.exists(cfg.work_dir):
                    os.makedirs(cfg.work_dir)
                filename = os.path.join(
                    cfg.work_dir, 'epoch_{}_{}.pth'.format(epoch, i))
                torch.save(checkpoint, filename)

        # end-of-epoch checkpoint (the original `epoch % 1 == 0` guard was
        # always true)
        if meta is None:
            meta = dict(epoch=epoch + 1, iter=i)
        else:
            meta.update(epoch=epoch + 1, iter=i)
        checkpoint = {
            'meta': meta,
            'state_dict': weights_to_cpu(model.state_dict())
        }
        if optimizer is not None:
            checkpoint['optimizer'] = optimizer.state_dict()
        if not os.path.exists(cfg.work_dir):
            os.makedirs(cfg.work_dir)
        filename = os.path.join(cfg.work_dir, 'epoch_{}.pth'.format(epoch))
        torch.save(checkpoint, filename)
        epoch += 1
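# `parse_args` is also not defined in this file. The sketch below covers
# exactly the CLI flags read above (`config`, `--resume_from`, `--gpus`,
# `--autoscale_lr`, `--launcher`, `--seed`); the defaults and help strings
# are assumptions modeled on mmdet's stock tools/train.py.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description='Train a video detector')
    parser.add_argument('config', help='train config file path')
    parser.add_argument('--resume_from',
                        help='checkpoint file to resume from')
    parser.add_argument('--gpus',
                        type=int,
                        default=1,
                        help='number of GPUs to use')
    parser.add_argument('--autoscale_lr',
                        action='store_true',
                        help='scale lr linearly with the number of GPUs')
    parser.add_argument('--launcher',
                        choices=['none', 'pytorch', 'slurm', 'mpi'],
                        default='none',
                        help='job launcher')
    parser.add_argument('--seed', type=int, default=None, help='random seed')
    return parser.parse_args()


if __name__ == '__main__':
    # run the two-optimizer training loop by default; call main_simple() for
    # the single-optimizer variant
    main()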