# Main optimisation loop: one pass over the remaining epochs, validating,
# logging, and checkpointing after every epoch.
for epoch in range(start_epoch, args['n_epochs']):
    print(f'Starting epoch {epoch}')

    # NOTE(review): stepping the scheduler at the top of the epoch (and passing
    # `epoch`) is the legacy PyTorch ordering; kept as-is to preserve the
    # original learning-rate schedule.
    scheduler.step(epoch)

    train_loss = train(epoch)
    val_loss, val_iou = val(epoch)
    print(f'===> train loss: {train_loss:.2f}')
    print(f'===> val loss: {val_loss:.2f}, val iou: {val_iou:.2f}')

    # Record this epoch's metrics and refresh the live plot.
    for metric_name, metric_value in (('train', train_loss),
                                      ('val', val_loss),
                                      ('iou', val_iou)):
        logger.add(metric_name, metric_value)
    logger.plot(save=args['save'], save_dir=args['save_dir'])

    # Compare against the running best BEFORE updating it, so `is_best`
    # reflects whether this epoch improved on all previous ones.
    is_best = val_iou > best_iou
    if is_best:
        best_iou = val_iou

    if args['save']:
        save_checkpoint(
            {
                'epoch': epoch,
                'best_iou': best_iou,
                'model_state_dict': model.state_dict(),
                'optim_state_dict': optimizer.state_dict(),
                'logger_data': logger.data,
            },
            is_best,
        )
def main():
    """Entry point: build data, model, optimizer, then run the training loop.

    Reads CLI arguments, overlays them onto the config loaded from
    ``args.config``, constructs the UCF101 dataloaders and the S3DG model,
    and trains for ``cfg.num_epochs`` epochs with optional validation and
    periodic checkpointing under ``cfg.work_dir``.
    """
    args = parse_args()
    cfg = from_file(args.config)
    # CLI arguments override the corresponding config-file entries.
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    if args.load_from is not None:
        cfg.load_from = args.load_from
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    if args.seed is not None:
        cfg.seed = args.seed
    if args.gpus is not None:
        cfg.gpus = args.gpus

    # Set random seeds for reproducibility.
    if cfg.seed is not None:
        print('Set random seed to {}'.format(cfg.seed))
        set_random_seed(cfg.seed)

    if not os.path.exists(cfg.work_dir):
        os.makedirs(cfg.work_dir)

    ################ 1 DATA ###################
    print('Training model on {} dataset...'.format(cfg.data['dataset']))
    # Effective batch size scales with the number of GPUs in use.
    batch_size = cfg.data['batch_size'] * cfg.gpus
    train_dataset = UCF101Dataset(data_file=cfg.data['train_file'],
                                  img_tmpl=cfg.data['train_img_tmp'],
                                  clip_len=cfg.data['train_clip_len'],
                                  size=cfg.data['size'],
                                  mode='train',
                                  shuffle=True)
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size,
                                  shuffle=True, num_workers=8)
    val_dataset = UCF101Dataset(data_file=cfg.data['val_file'],
                                img_tmpl=cfg.data['val_img_tmp'],
                                clip_len=cfg.data['val_clip_len'],
                                size=cfg.data['size'],
                                mode='val',
                                shuffle=False)
    val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size,
                                shuffle=False, num_workers=8)

    ################ 2 MODEL ##################
    if cfg.load_from is not None:
        print('Init the model from pretrained weight {}.'.format(cfg.load_from))
        model = S3DG(num_class=cfg.model['num_class'])
        load_pretrained_model(model, pretrained_path=cfg.load_from)
    else:
        print('Init the model from scratch.')
        model = S3DG(num_class=cfg.model['num_class'])

    # NOTE: train and resume-train must use the same number of GPUs, since the
    # 'module.' prefix added by nn.DataParallel is baked into checkpoint keys.
    if cfg.resume_from is not None:
        load_checkpoint_model(model, checkpoint_path=cfg.resume_from)
    if torch.cuda.device_count() > 1:
        print('use %d gpus' % (torch.cuda.device_count()))
        model = nn.DataParallel(model, device_ids=range(cfg.gpus))
    else:
        print('use 1 gpu')
    print('Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0))
    model.to(device)

    # ################### 3 CRITERION and OPTIMIZER #########################
    criterion = nn.CrossEntropyLoss().to(device)  # standard cross-entropy loss for classification
    # criterion = nn.BCEWithLogitsLoss().to(device)
    optimizer = optim.SGD(model.parameters(), lr=cfg.lr, momentum=0.9, weight_decay=5e-4)

    # Build the LR scheduler if one is configured.  Keep a None sentinel so the
    # resume / checkpoint code below is safe without one (the original raised
    # NameError on `scheduler` when cfg.lr_scheduler was None or its 'type'
    # was unrecognized).
    scheduler = None
    if cfg.lr_scheduler is not None:
        if cfg.lr_scheduler['type'] == 'step':
            scheduler = optim.lr_scheduler.StepLR(optimizer,
                                                  step_size=cfg.lr_scheduler['step'],
                                                  gamma=cfg.lr_scheduler['gamma'])
        elif cfg.lr_scheduler['type'] == 'multistep':
            scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                       milestones=cfg.lr_scheduler['step'],
                                                       gamma=cfg.lr_scheduler['gamma'])
        elif cfg.lr_scheduler['type'] == 'exponent':
            scheduler = optim.lr_scheduler.ExponentialLR(optimizer,
                                                         gamma=cfg.lr_scheduler['gamma'])

    log_path = cfg.work_dir

    # IF RESUME: restore optimizer/scheduler state and append to the old log;
    # otherwise start a fresh log with column headers.
    if cfg.resume_from is not None:
        checkpoint = torch.load(cfg.resume_from)
        print("Resume training from checkpoint: {}...".format(cfg.resume_from))
        optimizer.load_state_dict(checkpoint['opt_dict'])
        if scheduler is not None and checkpoint.get('lr_dict') is not None:
            scheduler.load_state_dict(checkpoint['lr_dict'])
        resume_epoch = checkpoint['epoch'] + 1
        logger = Logger(os.path.join(log_path, 'log.txt'), resume=True)
    else:
        print("Training model from start...")
        resume_epoch = 0
        logger = Logger(os.path.join(log_path, 'log.txt'))
        logger.set_names(['Learning Rate', 'Train Loss', 'Val Loss', 'Train Acc.', 'Val Acc.'])

    # TensorBoard: one timestamped run directory per launch.
    log_dir = os.path.join(cfg.work_dir, datetime.now().strftime('%b%d_%H-%M-%S'))
    writer = SummaryWriter(log_dir=log_dir)

    ################## 4 BEGIN TRAINING #########################
    num_epochs = cfg.num_epochs
    save_epoch = cfg.interval
    save_dir = cfg.work_dir
    display = cfg.display
    best_acc = 0.0
    best_epoch = 0
    for epoch in tqdm(range(resume_epoch, num_epochs)):
        print('\n----------------- Training -------------------')
        print('Epoch: {}/{}'.format(epoch, num_epochs - 1))
        train_loss, train_acc = train(train_dataloader, model, criterion,
                                      optimizer, epoch, writer, display)
        if args.validate:
            print('\n----------------- Validation -------------------')
            print('Epoch: {}/{}'.format(epoch, num_epochs - 1))
            val_loss, val_acc = validation(val_dataloader, model, criterion,
                                           optimizer, epoch, writer, display)
            if val_acc >= best_acc:
                best_acc = val_acc
                best_epoch = epoch
            print("\nThe best validation top1-accuracy: {:.3f}%, the best epoch: {}".format(best_acc, best_epoch))

        # EPOCH bookkeeping: log the LR actually used this epoch.
        lr = optimizer.state_dict()['param_groups'][0]['lr']
        if args.validate:
            logger.append([lr, train_loss, val_loss, train_acc, val_acc])
        else:
            logger.append([lr, train_loss, 0.0, train_acc, 0.0])  # no valid
        writer.add_scalar('train/learning_rate', lr, epoch)

        if scheduler is not None:
            scheduler.step()

        # Periodic checkpoint (epoch 0 included, so a restartable state always exists).
        if epoch % save_epoch == 0:
            torch.save({
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'opt_dict': optimizer.state_dict(),
                # None when no scheduler is configured; the resume branch above tolerates it.
                'lr_dict': scheduler.state_dict() if scheduler is not None else None,
            }, os.path.join(save_dir, 'epoch-' + str(epoch) + '.pth'))

    writer.close()
    logger.close()
    logger.plot()
    savefig(os.path.join(log_path, 'log.eps'))