    trf.RandomVerticalFlip(p=0.5),
    trf.RandomTranspose(p=0.5),
]))
train_loader = DataLoader(train_dataset, batch_size=cfg.batch_size, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=cfg.batch_size, shuffle=True)
print('Dataset loaded!')

# Set up model
model = UNet().to(device)

# Set up loss function
loss_func = nn.L1Loss()
# loss_func = perceptual_loss(perceptual_model='vgg16', dist_func=nn.MSELoss(), device=device)  # Perceptual loss

# Set up optimizer
optimizer = optim.Adam(model.parameters(), lr=cfg.initial_learning_rate)

# Experiment with 16-bit precision (Apex AMP): initialize returns the patched model and optimizer
model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

# Learning rate scheduling: decay the learning rate by 10x halfway through training
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=cfg.epochs // 2, gamma=0.1)

# Set up TensorBoard writer
writer = SummaryWriter('runs/' + cfg.run_name, flush_secs=1)

# Load model (if applicable) - by default load the latest
start_epoch = 0
if os.path.exists(cfg.checkpoint_to_load):
    checkpoint = torch.load(cfg.checkpoint_to_load)
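# Hedged sketch, not part of the original source: one common way to restore training
# state from the checkpoint loaded above. The helper name and the key names ('epoch',
# 'state_dict', 'optimizer', 'scheduler') are assumptions, since the original snippet is
# truncated right after torch.load().
def restore_from_checkpoint(checkpoint, model, optimizer, scheduler):
    """Load model/optimizer/scheduler state and return the epoch to resume from."""
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    scheduler.load_state_dict(checkpoint['scheduler'])
    return checkpoint['epoch']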
def main_worker(train_loader, val_loader, args):
    global best_loss

    # create model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'=> device used: {device}')

    norm_kwargs = {
        'mode': args.norm_mode,
        'alpha_fwd': args.afwd,
        'alpha_bkw': args.abkw,
        'ecm': args.ecm
    }

    print("=> creating model...")
    model = UNet(args.classes, norm_layer=norm_layer, norm_kwargs=norm_kwargs).to(device)
    print(model)

    print("=> creating optimizer...")
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)

    print("=> setting up learning rate scheduler...")
    scheduler = lr_scheduler.StepLR(optimizer, step_size=args.lr_milestone,
                                    gamma=args.lr_multiplier)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = False if args.seed else True

    if args.evaluate:
        validate(val_loader, model, args.start_epoch, device, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if epoch:
            scheduler.step()

        # train for one epoch
        train(train_loader, model, optimizer, epoch, device, args)

        # evaluate on validation set
        eval_loss = validate(val_loader, model, epoch, device, args)

        # remember best loss and save checkpoint
        is_best = eval_loss < best_loss
        best_loss = min(eval_loss, best_loss)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_loss': best_loss,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
            }, is_best, args)

    print('best val loss: {:4f}'.format(best_loss))

    # load best model weights
    model_best_file = os.path.join(args.model_dir, 'model_best.pth.tar')
    if os.path.isfile(model_best_file):
        print("=> loading checkpoint '{}'".format(model_best_file))
        checkpoint = torch.load(model_best_file)
        model.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            model_best_file, checkpoint['epoch']))

    return model
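# Hedged sketch of the save_checkpoint helper called in main_worker; this is not the
# original implementation. The 'model_best.pth.tar' name matches the file loaded at the
# end of main_worker, but the 'checkpoint.pth.tar' filename is an assumption.
import os
import shutil
import torch

def save_checkpoint(state, is_best, args, filename='checkpoint.pth.tar'):
    """Persist the latest training state and keep a copy of the best model so far."""
    path = os.path.join(args.model_dir, filename)
    torch.save(state, path)
    if is_best:
        shutil.copyfile(path, os.path.join(args.model_dir, 'model_best.pth.tar'))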
epochs = args.epoch
tag = 'Unet'

if args.model == 'Unet':
    model = UNet(start_fm=args.startfm).to(device)
else:
    tag = 'UnetRes'
    model = UNet_ResNet(dropout=args.dropout, start_fm=args.startfm).to(device)

run.tags = [tag]

criterion = nn.SmoothL1Loss()
# criterion = Weighted_Cross_Entropy_Loss()
optimizer = optim.Adam(model.parameters(), lr=args.lr)

# wandb watch
run.watch(models=model, criterion=criterion, log='all', log_freq=10)

# training
best_iou = -1
for epoch in range(epochs):
    t0 = time.time()
    train_loss, train_iou = train(model, device, trainloader, optimizer, criterion)
    t1 = time.time()
    print(
        f'Epoch: {epoch} | Train loss: {train_loss:.3f} | Train IoU: {train_iou:.3f} | Time: {(t1 - t0):.1f}s'
    )
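# Hedged sketch of a train() function matching the signature and return values used in
# the loop above; it is not the original implementation. The assumption that the loader
# yields (images, masks) pairs and that IoU is computed by thresholding sigmoid outputs
# at 0.5 for a binary mask is mine, not taken from the source.
import torch

def train(model, device, trainloader, optimizer, criterion, eps=1e-6):
    """Run one training epoch and return the mean loss and mean IoU over batches."""
    model.train()
    total_loss, total_iou, n_batches = 0.0, 0.0, 0
    for images, masks in trainloader:
        images, masks = images.to(device), masks.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, masks)
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            preds = (torch.sigmoid(outputs) > 0.5).float()
            intersection = (preds * masks).sum()
            union = preds.sum() + masks.sum() - intersection
            total_iou += (intersection / (union + eps)).item()

        total_loss += loss.item()
        n_batches += 1

    return total_loss / n_batches, total_iou / n_batches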