def check_termination(epoch):
    """Report whether the job was asked to terminate after ``epoch``.

    If the module-level ``AutoResume`` object is truthy and reports a pending
    termination request, global rank 0 files a resume request carrying the
    checkpoint file name, the result directory and the current epoch.

    Args:
        epoch: Index of the epoch that has just finished.

    Returns:
        1 when termination was requested (on every rank), otherwise 0.
    """
    # Guard clauses: no AutoResume support, or nothing requested.
    if not AutoResume:
        return 0
    if not AutoResume.termination_requested():
        return 0

    # Only the global rank-0 process registers the resume request.
    if args.global_rank == 0:
        percent_done = epoch * 100 / args.max_epoch
        progress = "Progress %d%% (epoch %d of %d)" % (
            percent_done, epoch, args.max_epoch)
        resume_info = {
            "RESUME_FILE": logx.save_ckpt_fn,
            "TENSORBOARD_DIR": args.result_dir,
            "EPOCH": str(epoch),
        }
        AutoResume.request_resume(user_dict=resume_info, message=progress)

    return 1
def train(train_ds_path,
          val_ds_path,
          pths_path,
          results_path,
          batch_size,
          lr,
          num_workers,
          train_iter,
          interval,
          opt_level=0,
          checkpoint_path=None,
          val_freq=10):
    """Train an EAST model with Apex AMP, periodic validation and checkpoints.

    The function relies on the module-level names ``rank``, ``world_size``,
    ``barrier`` and ``logger``.

    Args:
        train_ds_path: Directory holding the ``images`` and ``gt``
            sub-directories of the training set.
        val_ds_path: Directory holding the ``images`` and ``gt``
            sub-directories of the validation set.
        pths_path: Not used by this function; kept for interface
            compatibility.
        results_path: Output directory; ``logs`` (Tensorboard) and
            ``checkpoints`` are created beneath it by rank 0.
        batch_size: Batch size handed to both data loaders.
        lr: Learning rate for the Adam optimizer.
        num_workers: Number of worker processes per data loader.
        train_iter: Total number of optimizer steps to run. The learning rate
            is multiplied by 0.1 at ``train_iter // 2``.
        interval: A checkpoint is written every ``interval`` epochs.
        opt_level: Digit of the AMP optimization level (passed as
            ``f'O{opt_level}'``).
        checkpoint_path: Optional checkpoint to restore from. It is
            overridden by the path supplied by AutoResume when the job is a
            resumption.
        val_freq: Validation runs every ``val_freq`` epochs and once more
            when the last step has been reached.
    """
    torch.cuda.set_device(rank)

    tensorboard_dir = os.path.join(results_path, 'logs')
    checkpoints_dir = os.path.join(results_path, 'checkpoints')
    if rank == 0:
        os.makedirs(tensorboard_dir, exist_ok=True)
        os.makedirs(checkpoints_dir, exist_ok=True)
    barrier()

    # AutoResume is optional: fall back to None when it cannot be set up.
    # ``except Exception`` (not a bare except) keeps KeyboardInterrupt and
    # SystemExit propagating.
    try:
        logger.info('Importing AutoResume lib...')
        from userlib.auto_resume import AutoResume as auto_resume
        auto_resume.init()
        logger.info('Success!')
    except Exception:
        logger.info('Failed!')
        auto_resume = None

    trainset = custom_dataset(
        os.path.join(train_ds_path, 'images'),
        os.path.join(train_ds_path, 'gt'),
    )
    valset = custom_dataset(os.path.join(val_ds_path, 'images'),
                            os.path.join(val_ds_path, 'gt'),
                            is_val=True)

    logger.info(f'World Size: {world_size}, Rank: {rank}')

    if world_size > 1:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            trainset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            valset, shuffle=False)
    else:
        train_sampler = None
        val_sampler = None

    worker_init = LoaderWorkerProcessInit(rank, 43)
    train_loader = DataLoader(trainset,
                              batch_size=batch_size,
                              shuffle=train_sampler is None,
                              sampler=train_sampler,
                              num_workers=num_workers,
                              pin_memory=True,
                              drop_last=True,
                              worker_init_fn=worker_init)
    val_loader = DataLoader(valset,
                            batch_size=batch_size,
                            shuffle=False,
                            sampler=val_sampler,
                            num_workers=num_workers,
                            pin_memory=True,
                            drop_last=True,
                            worker_init_fn=worker_init)

    criterion = Loss()
    device = torch.device(
        f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    model = EAST()
    model.to(device)
    model = apex.parallel.convert_syncbn_model(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=f'O{opt_level}')

    start_iter = 0
    if auto_resume is not None:
        auto_resume_details = auto_resume.get_resume_details()
        if auto_resume_details is not None:
            logger.info(
                'Detected that this is a resumption of a previous job!')
            checkpoint_path = auto_resume_details['CHECKPOINT_PATH']

    if checkpoint_path:
        logger.info(f'Loading checkpoint at path "{checkpoint_path}"...')
        checkpoint = torch.load(checkpoint_path,
                                map_location=f'cuda:{rank}')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        amp.load_state_dict(checkpoint['amp_state'])
        start_iter = checkpoint['iter']
        logger.info('Done')

    # Keep a handle on the unwrapped model so checkpoints do not contain the
    # DDP wrapper's parameter names.
    main_model = model
    if torch.distributed.is_initialized():
        logger.info(
            f'DataParallel: Using {torch.cuda.device_count()} devices!')
        model = DDP(model)

    for param_group in optimizer.param_groups:
        param_group.setdefault('initial_lr', lr)
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=[train_iter // 2],
                                         gamma=0.1,
                                         last_epoch=start_iter)

    # This allows us to change dataset size without affecting things such as
    # validation frequency. Clamp to 1 so a large world_size * batch_size
    # cannot produce zero steps (and a division by zero below).
    steps_per_epoch = max(1, 1000 // (world_size * batch_size))

    step = start_iter
    start_epoch = step // steps_per_epoch
    epoch_iter = int(math.ceil(train_iter / steps_per_epoch))

    if rank == 0:
        logger.info('Initializing Tensorboard')
        writer = SummaryWriter(tensorboard_dir, purge_step=step)

    loss_meters = MeterDict(reset_on_value=True)
    val_loss_meters = MeterDict(reset_on_value=True)
    time_meters = MeterDict(reset_on_value=True)

    logger.info('Training')
    model.train()

    train_start_time = time.time()
    best_loss = 100

    # NOTE: the batch iterator gets its own name. The original reused
    # ``train_iter`` here, which shadowed the integer step budget and made
    # every ``step == train_iter`` comparison silently False.
    batch_iter = iter(train_loader)

    def get_batch():
        """Return the next training batch, restarting the loader once."""
        nonlocal batch_iter
        try:
            return next(batch_iter)
        except StopIteration:
            batch_iter = iter(train_loader)
            # An empty loader raises StopIteration here instead of recursing
            # without bound.
            return next(batch_iter)

    def checkpoint_state(epoch_number, step_number):
        """Collect everything needed to resume training."""
        return {
            'model': main_model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'amp_state': amp.state_dict(),
            'epoch': epoch_number,
            'iter': step_number
        }

    for epoch in range(start_epoch, epoch_iter):
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)

        epoch_time = time.time()
        start_time = time.time()
        model.train()

        for i in range(steps_per_epoch):
            batch = get_batch()

            optimizer.zero_grad()

            batch = [b.cuda(rank, non_blocking=True) for b in batch]
            img, gt_score, gt_geo, ignored_map = batch
            barrier()
            time_meters['batch_time'].add_sample(time.time() - start_time)

            pred_score, pred_geo = model(img)

            loss, details = criterion(gt_score, pred_score, gt_geo, pred_geo,
                                      ignored_map)

            with amp.scale_loss(loss, optimizer) as loss_scaled:
                loss_scaled.backward()
            optimizer.step()

            barrier()
            time_meters['step_time'].add_sample(time.time() - start_time)

            details['global'] = loss.detach().item()
            for k, v in details.items():
                loss_meters[k].add_sample(v)

            if i % 10 == 0:
                logger.info(f'\tStep [{i+1}/{steps_per_epoch}]')

            start_time = time.time()
            step += 1
            scheduler.step()

            # Stop as soon as the requested number of steps has been run.
            if step >= train_iter:
                break

        term_requested = (auto_resume is not None
                          and auto_resume.termination_requested())

        # Path of the checkpoint written this epoch (rank 0 only), if any.
        saved_checkpoint = None
        if rank == 0:
            times = {k: m.value() for k, m in time_meters.items()}
            losses = {k: m.value() for k, m in loss_meters.items()}

            times['epoch'] = time.time() - epoch_time

            logger.info(
                f'Epoch is [{epoch+1}/{epoch_iter}], time consumption is {times}, batch_loss is {losses}'
            )

            for k, v in times.items():
                writer.add_scalar(f'performance/{k}', v, step)
            for k, v in losses.items():
                writer.add_scalar(f'loss/{k}', v, step)
            writer.add_scalar('learning_rate',
                              optimizer.param_groups[0]['lr'], step)

            if term_requested or (epoch + 1) % interval == 0:
                saved_checkpoint = os.path.join(
                    checkpoints_dir, 'model_epoch_{}.pth'.format(epoch + 1))
                logger.info(f'Saving checkpoint to "{saved_checkpoint}"...')
                torch.save(checkpoint_state(epoch + 1, step),
                           saved_checkpoint)
                logger.info('Done')

        if (epoch + 1) % val_freq == 0 or step >= train_iter:
            logger.info(f'Validating epoch {epoch+1}...')
            model.eval()
            val_loader.dataset.reset_random()
            with torch.no_grad():
                for batch in val_loader:
                    batch = [b.cuda(rank, non_blocking=True) for b in batch]

                    img, gt_score, gt_geo, ignored_map = batch
                    barrier()

                    pred_score, pred_geo = model(img)

                    loss, details = criterion(gt_score, pred_score, gt_geo,
                                              pred_geo, ignored_map)
                    details['global'] = loss.detach().item()

                    barrier()

                    for k, v in details.items():
                        val_loss_meters[k].add_sample(v)

            print_dict = dict()
            for k, m in val_loss_meters.items():
                t = torch.tensor(m.value(),
                                 device=f'cuda:{rank}',
                                 dtype=torch.float32)
                if world_size > 1:
                    # Sum onto rank 0, then average over the processes.
                    torch.distributed.reduce(t, 0)
                    t /= world_size
                if rank == 0:
                    writer.add_scalar(f'val/loss/{k}', t.item(), step)
                print_dict[k] = t.item()
            logger.info(f'\tLoss: {print_dict}')
            val_loss = print_dict['global']
            if rank == 0 and val_loss < best_loss:
                logger.info(
                    f'This is the best model so far. New loss: {val_loss}, previous: {best_loss}'
                )
                best_loss = val_loss
                best_path = os.path.join(checkpoints_dir, 'best.pth')
                if saved_checkpoint is not None:
                    shutil.copyfile(saved_checkpoint, best_path)
                else:
                    # No checkpoint was written this epoch (validation and
                    # checkpoint cadences differ), so save the state directly
                    # instead of copying from a None path.
                    torch.save(checkpoint_state(epoch + 1, step), best_path)
            logger.info('Training')

        if term_requested:
            logger.warning('Termination requested! Exiting...')
            if rank == 0:
                # Rank 0 always saved a checkpoint above when termination was
                # requested, so ``saved_checkpoint`` is set here.
                auto_resume.request_resume(user_dict={
                    'CHECKPOINT_PATH': saved_checkpoint,
                    'EPOCH': epoch
                })
            break

    logger.info(
        f'Finished training!!! Took {time.time()-train_start_time:0.3f} seconds!'
    )
def main():
    """
    Main Function

    Sets up logging, data loaders, losses, the network and its optimizer,
    optionally restores state from an auto-resume request, ``args.resume`` or
    ``args.snapshot``, then either runs one of the evaluation modes selected
    by ``args.eval`` or the epoch training loop.

    Returns:
        0 after an evaluation run or when termination was requested;
        ``None`` when the training loop runs to completion.

    Raises:
        ValueError: If ``args.eval`` is set to an unknown option.
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = \
        datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir,
                            args.start_epoch))
    elif args.resume:
        checkpoint = torch.load(args.resume,
                                map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    # define the NASA optimizer parameter
    # iter_tot is only referenced by the commented-out tau formula below.
    iter_tot = len(train_loader) * args.max_epoch
    # tau = args.tau_factor/sqrt(iter_tot)
    tau = 1
    net = network.get_net(args, criterion)
    k = 1
    # optim, scheduler = get_optimizer(args, net)
    optim, scheduler = get_optimizer(args, net, tau, k)

    # Visualize feature maps
    #activation = {}
    #def get_activation(name):
        #def hook(model, input, output):
            #activation[name] = output.detach()
        #return hook
    #net.layer[0].register_forward_hook(get_activation('conv1'))
    #data, _ = dataset[0]
    #data.unsqueeze_(0)
    #output = model(data)
    #act = activation['conv1'].squeeze()
    #fig, axarr = plt.subplots(act.size(0))
    #for idx in range(act.size(0)):
        #axarr[idx].imshow(act[idx])

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.summary:
        # Report MACs / parameter count for a fixed 640x640 input and exit.
        from thop import profile
        img = torch.randn(1, 3, 640, 640).cuda()
        mask = torch.randn(1, 1, 640, 640).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets
    if args.eval == 'test':
        # NOTE(review): ``city`` is not defined in this function; presumably
        # a module-level name -- confirm.
        validate(val_loader, net, criterion=None, optim=None, epoch=0,
                 calc_metrics=False, dump_assets=args.dump_assets,
                 dump_all_images=True, testing=True, grid=city)
        return 0

    if args.eval == 'val':
        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader, net, criterion=criterion_val, optim=optim,
                     epoch=0,
                     dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader, net, criterion=criterion_val, optim=optim,
                 epoch=0,
                 calc_metrics=False,
                 dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        # Raising a plain string is itself a TypeError in Python 3; raise a
        # real exception so the message reaches the user.
        raise ValueError('unknown eval option {}'.format(args.eval))

    for epoch in range(args.start_epoch, args.max_epoch):
        update_epoch(epoch)

        if args.only_coarse:
            train_obj.only_coarse()
            train_obj.build_epoch()
            if args.apex:
                train_loader.sampler.set_num_samples()
        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
                if args.apex:
                    train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()

        train(train_loader, net, optim, epoch)

        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)

        if epoch % args.val_freq == 0:
            validate(val_loader, net, criterion_val, optim, epoch)

        scheduler.step()

        if check_termination(epoch):
            return 0
def main():
    """
    Main Function

    Sets up logging, data loaders, losses, the network and its optimizer,
    optionally restores state from an auto-resume request, ``args.resume`` or
    ``args.snapshot``, then either runs the evaluation mode selected by
    ``args.eval`` or the epoch training loop.

    Returns:
        0 after an evaluation run or when termination was requested;
        ``None`` when the training loop runs to completion.

    Raises:
        ValueError: If ``args.eval`` is set to an unknown option.
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = \
        datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir,
                            args.start_epoch))
    elif args.resume:
        checkpoint = torch.load(args.resume,
                                map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    net = network.get_net(args, criterion)
    optim, scheduler = get_optimizer(args, net)

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.summary:
        # Print the model, report MACs / parameter count for a fixed
        # 1024x2048 input and exit.
        print(str(net))
        from pytorchOpCounter.thop import profile
        img = torch.randn(1, 3, 1024, 2048).cuda()
        mask = torch.randn(1, 1, 1024, 2048).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets
    if args.eval == 'val':
        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader, net, criterion=criterion_val, optim=optim,
                     epoch=0,
                     dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader, net, criterion=None, optim=None, epoch=0,
                 calc_metrics=False,
                 dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        # Raising a plain string is itself a TypeError in Python 3; raise a
        # real exception so the message reaches the user.
        raise ValueError('unknown eval option {}'.format(args.eval))

    for epoch in range(args.start_epoch, args.max_epoch):
        update_epoch(epoch)

        if args.only_coarse:
            train_obj.only_coarse()
            train_obj.build_epoch()
            if args.apex:
                train_loader.sampler.set_num_samples()
        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
                if args.apex:
                    train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()

        train(train_loader, net, optim, epoch)

        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)

        if epoch % args.val_freq == 0:
            validate(val_loader, net, criterion_val, optim, epoch)

        scheduler.step()

        if check_termination(epoch):
            return 0
def main():
    """
    Main Function

    Sets up logging, data loaders, losses, the network and its optimizer,
    optionally restores state from an auto-resume request, ``args.resume`` or
    ``args.snapshot``, and, when ``args.eval == 'folder'``, runs
    ``eval_minibatch`` over the ``image_2/`` and ``image_3/`` folders.

    Returns:
        0 after the folder evaluation; ``None`` when ``args.eval`` is None.

    Raises:
        ValueError: If ``args.eval`` is set to an unknown option.
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=False,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir,
                            args.start_epoch))
    elif args.resume:
        checkpoint = torch.load(args.resume,
                                map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    net = network.get_net(args, criterion)
    optim, scheduler = get_optimizer(args, net)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        scheduler.step(args.start_epoch)

    if args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        # validate(val_loader, net, criterion=None, optim=None, epoch=0,
        #          calc_metrics=False, dump_assets=args.dump_assets,
        #          dump_all_images=True)

        # The folder name is appended to result_dir by plain concatenation,
        # exactly as before, so the paths are unchanged. exist_ok avoids the
        # check-then-create race of exists() + mkdir().
        for folder in ('image_2/', 'image_3/'):
            os.makedirs(args.result_dir + folder, exist_ok=True)

        # NOTE(review): 7481 is hard-coded; presumably the number of samples
        # in the evaluated dataset -- confirm.
        num_image = 7481
        for idx in tqdm(range(num_image)):
            sample_idx = "%06d" % idx
            eval_minibatch(sample_idx, "image_2/", net, args)
            eval_minibatch(sample_idx, "image_3/", net, args)

        return 0
    elif args.eval is not None:
        # Raising a plain string is itself a TypeError in Python 3; raise a
        # real exception so the message reaches the user.
        raise ValueError('unknown eval option {}'.format(args.eval))