import torch
from torch import nn, optim
from torch.optim import lr_scheduler


def main():
    input_args = train_input()
    print_model(input_args)

    # Use the GPU only when one is available and the user asked for it
    device = torch.device(
        "cuda:0" if torch.cuda.is_available() and input_args.gpu else "cpu")

    model = create_model(input_args.arch, input_args.hidden_units)
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.classifier.parameters(), input_args.learning_rate)
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

    image_datasets, dataloaders = create_dataloaders(input_args.data_dir)
    train(model, dataloaders, image_datasets, criterion, optimizer,
          exp_lr_scheduler, device, input_args.epochs)

    if input_args.save_dir:
        # Move the model back to the CPU so the checkpoint loads anywhere
        model.cpu()
        save_checkpoint({
            'epoch': input_args.epochs,
            'arch': input_args.arch,
            'classifier': model.classifier,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'mapping': image_datasets['train'].class_to_idx
        }, input_args.save_dir)
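# For context, a minimal sketch of the save_checkpoint helper called above,
# assuming it simply wraps torch.save. The signature, default filename and
# directory handling here are assumptions; the project's real helper may differ.
import os


def save_checkpoint(state, save_dir, filename='checkpoint.pth'):
    """Persist the checkpoint dict built in main() (hypothetical helper).

    `state` carries the epoch, architecture name, classifier, model and
    optimizer state dicts, and the class-to-index mapping, so the model
    can later be rebuilt without re-running training.
    """
    os.makedirs(save_dir, exist_ok=True)
    torch.save(state, os.path.join(save_dir, filename))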
import copy
import math
import os

import torch
from torch.optim.lr_scheduler import CosineAnnealingLR

# Project-local modules
import cnn_utils
import helpers


def main(args, config, writer):
    best_loss = math.inf
    best_model, best_epoch = None, None
    cuda = cnn_utils.check_cuda(config)

    # Attempts to optimise - see
    # https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do
    torch.backends.cudnn.benchmark = True

    data_loaders = create_dataloaders(args, config)
    model, criterion, optimizer, lr_scheduler = setup_model(args)

    if cuda:
        # GPU support
        model = model.cuda()
        # The below is only needed if the loss fn has params
        # criterion = criterion.cuda()

    if args.checkpoint:
        # Resume from a checkpoint
        best_loss = cnn_utils.load_from_checkpoint(model, optimizer, args, config)

    if args.pretrained:
        # Directly copy weights from another model
        cnn_utils.load_weights(model, args, config, frozen=args.frozen)

    # Perform training and testing
    print("Beginning training loop")
    for epoch in range(args.start_epoch, args.start_epoch + args.nEpochs):
        epoch_loss_all = train(model=model, dset_loaders=data_loaders,
                               optimizer=optimizer, lr_scheduler=lr_scheduler,
                               criterion=criterion, epoch=epoch, cuda=cuda,
                               clip=args.clip, writer=writer)

        # Mean loss across the epoch's batches
        epoch_loss = sum(epoch_loss_all) / len(epoch_loss_all)

        if epoch_loss < best_loss:
            best_loss = epoch_loss
            best_epoch = epoch
            best_model = copy.deepcopy(model)

        # Update the scheduler - once a cosine cycle completes, restart the
        # learning rate and double the period (SGDR-style warm restart)
        if lr_scheduler.last_epoch == lr_scheduler.T_max:
            for group in optimizer.param_groups:
                group['lr'] = args.lr
            lr_scheduler = CosineAnnealingLR(optimizer, T_max=lr_scheduler.T_max * 2)

        # cnn_utils.log_all_layer_weights(model, writer, epoch)

        # Checkpoint every epoch after the first (% 1 keeps the interval
        # easy to change)
        if epoch % 1 == 0 and epoch != 0:
            cnn_utils.save_checkpoint(model, epoch, optimizer, best_loss,
                                      config['PATH']['checkpoint_dir'],
                                      args.tag + "{}.pth".format(epoch))

        if args.prompt:
            if not helpers.prompt_user(CONTINUE_MESSAGE):
                print("Ending training")
                break

    print("Best loss was {:.5f} at epoch {}".format(best_loss, best_epoch))

    save = True
    if args.prompt:
        if not helpers.prompt_user(SAVE_MESSAGE):
            print("Not saving the model")
            save = False

    # Save the best model
    if save:
        cnn_utils.save_checkpoint(
            best_model, best_epoch, optimizer, best_loss,
            config['PATH']['model_dir'],
            args.tag + "_best_at{}.pth".format(best_epoch))

    parent_dir = os.path.abspath(os.pardir)
    scalar_dir = os.path.join(parent_dir, "logs", args.tag)
    # if not os.path.isdir(scalar_dir):
    #     pathlib.Path(scalar_dir).mkdir(parents=True, exist_ok=True)
    # writer.export_scalars_to_json(
    #     os.path.join(scalar_dir, "all_scalars.json"))
    writer.close()
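# The restart logic in main() above resets the learning rate and doubles
# T_max whenever a cosine cycle completes, i.e. SGDR-style warm restarts.
# A minimal sketch of the same schedule using PyTorch's built-in
# CosineAnnealingWarmRestarts (T_mult=2 doubles the period at each restart);
# the optimizer, lr and epoch count below are placeholders, not the
# project's real configuration.
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

params = [torch.nn.Parameter(torch.zeros(1))]  # stand-in model parameters
sgd = torch.optim.SGD(params, lr=0.1)
scheduler = CosineAnnealingWarmRestarts(sgd, T_0=10, T_mult=2)

for epoch in range(70):
    # ... one epoch of training here, then step the schedule ...
    sgd.step()
    scheduler.step()  # resets the lr automatically at each restart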