def _callback(epoch_num, sym=None, arg=None, aux=None):
    # Save a checkpoint every `save_freq` epochs; `save_freq` and `prefix`
    # are expected to be defined in the enclosing scope.
    if epoch_num % save_freq == 0:
        misc.save_checkpoint(prefix, epoch_num, symbol=sym,
                             arg_params=arg, aux_params=aux)
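
# _callback's positional parameters line up with how MXNet's Module API
# invokes epoch_end_callback: callback(epoch, symbol, arg_params, aux_params).
# A minimal registration sketch, assuming net_symbol and train_iter are
# defined elsewhere (both are placeholders here):
import mxnet as mx

mod = mx.mod.Module(symbol=net_symbol, context=mx.gpu(0))
mod.fit(train_iter,
        num_epoch=100,
        epoch_end_callback=_callback)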
def trainer(dataloader, model, criterion, optimizer, args, num_i,
            epoch_num=10, checkpoint=0, device="cuda:0"):
    print('======= Start Training =======')
    best_epoch = 0
    best_acc = 0.0
    recorder = open('acc_result.txt', 'w')
    for epoch in range(epoch_num):
        time_start = time.time()
        print('Epoch {}/{}'.format(epoch, epoch_num))
        print('=' * 40)
        train_acc = train(dataloader['train'], model, criterion, optimizer,
                          device, recorder)
        valid_acc = validate(dataloader['val'], model, criterion, optimizer,
                             device, recorder)
        time_elapsed = time.time() - time_start
        print('-' * 10)
        print('complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        output = 'Epoch:{:3} Train Acc={:.3f}, Val Acc={:.3f}'.format(
            epoch, train_acc, valid_acc)
        print(output)
        recorder.write(output + '\n')
        print('-' * 10)
        # Track the best validation accuracy seen so far.
        if valid_acc > best_acc:
            best_acc = valid_acc
            best_epoch = epoch
            is_best = 1
        else:
            is_best = 0
        if checkpoint == 1:
            misc.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                },
                is_best,
                num=args.classnum,
                filename="checkpoint_v{}_{:02}_{:1}.pth.tar".format(
                    num_i, epoch, args.classnum))
    recorder.write(f'best epoch: {best_epoch}')
    recorder.close()
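
# A hypothetical invocation of trainer; the loaders, model, criterion,
# optimizer, and args are assumed to exist, and the keyword values are
# placeholders rather than recommended settings.
trainer({'train': train_loader, 'val': val_loader},
        model, criterion, optimizer, args,
        num_i=0, epoch_num=30, checkpoint=1, device="cuda:0")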
def main():
    torch.manual_seed(0)
    torch.random.manual_seed(0)

    # Create the results folder if it does not already exist.
    output_directory = misc.get_output_directory(args)
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    train_csv = os.path.join(output_directory, 'train.csv')
    test_csv = os.path.join(output_directory, 'test.csv')
    best_txt = os.path.join(output_directory, 'best.txt')

    print("=> creating data loaders ...")
    if args.data == 'MNIST':
        datadir = './data/'
        all_dataset = loader.MNIST(datadir)
        # 80% train, 10% test, remainder (10%) validation.
        train_size = len(all_dataset) // 5 * 4
        test_size = len(all_dataset) // 10
        val_size = len(all_dataset) - (train_size + test_size)
        train_dataset, test_dataset, val_dataset = torch.utils.data.random_split(
            all_dataset, [train_size, test_size, val_size])
    else:
        raise RuntimeError('Dataset not found. The only supported dataset is MNIST.')

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True, sampler=None)
    # Set the batch size to 1 for evaluation.
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=1, shuffle=False,
        num_workers=args.workers, pin_memory=True)
    print("=> data loaders created.")

    # Optionally resume from a checkpoint.
    if args.start_epoch != 0:
        assert os.path.isfile(args.resume), \
            "=> no checkpoint found at '{}'".format(args.resume)
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        args.start_epoch = checkpoint['epoch'] + 1
        best_result = checkpoint['best_result']
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']
        print("=> loaded checkpoint (epoch {})".format(checkpoint['epoch']))
    # Otherwise create a new model and optimizer.
    else:
        print("=> creating Model ({}) ...".format(args.arch))
        if args.arch == 'resnet50':
            model = models.ResNet(50)
        else:
            raise RuntimeError("model not found")
        print("=> model created.")
        best_result = None  # no validation result recorded yet

        if args.optimizer == 'Adam':
            optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                         weight_decay=args.weight_decay)
        elif args.optimizer == 'SGD':
            optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                        momentum=args.momentum,
                                        weight_decay=args.weight_decay)
        else:
            raise RuntimeError("optimizer not defined")

    # Define the loss function (criterion); it is needed in both branches,
    # so it lives outside the if/else.
    if args.criterion == 'cce':
        criterion = criteria.CrossEntropyLoss().cuda()
    else:
        raise RuntimeError("criterion not found")

    # Decay the learning rate three times over the run.
    optimizer_scheduler = lr_scheduler.StepLR(optimizer, args.epochs // 3)

    model = model.cuda()
    print(model)
    print("=> model transferred to GPU.")

    train_logger, test_logger = None, None
    for epoch in range(args.start_epoch, args.epochs):
        train_result = train.train(train_loader, model, criterion, optimizer)
        # Create the logger on the first pass (also covers resumed runs).
        if train_logger is None:
            train_logger = logger.Logger(train_result, output_directory, train_csv)
        else:
            train_logger.append(train_result)
        optimizer_scheduler.step()

        # Evaluate on the held-out set.
        test_result = test.validate(test_loader, model, criterion, optimizer)
        if test_logger is None:
            test_logger = logger.Logger(test_result, output_directory, test_csv)
        else:
            test_logger.append(test_result)

        # Track the best result so far (assumes lower is better; flip the
        # comparison if the metric is an accuracy).
        is_best = best_result is None or test_result < best_result
        if is_best:
            best_result = test_result

        misc.save_checkpoint({
            'args': args,
            'epoch': epoch,
            'arch': args.arch,
            'model': model,
            'best_result': best_result,
            'optimizer': optimizer,
        }, is_best, epoch, output_directory)

    train_logger.write_into_file('train')
    test_logger.write_into_file('test')
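
# The checkpoint above stores the live model and optimizer objects, so
# loading it requires the exact class definitions to be importable at load
# time. A hedged alternative sketch that persists only state_dicts; these
# helper names are illustrative, not part of the repo:
import torch

def save_state_dict_checkpoint(path, epoch, model, optimizer, best_result):
    # The file stays loadable even if the model's source code changes.
    torch.save({
        'epoch': epoch,
        'model_state': model.state_dict(),
        'optimizer_state': optimizer.state_dict(),
        'best_result': best_result,
    }, path)

def load_state_dict_checkpoint(path, model, optimizer):
    # Restores in place; the caller constructs model and optimizer first.
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state'])
    optimizer.load_state_dict(checkpoint['optimizer_state'])
    return checkpoint['epoch'] + 1, checkpoint['best_result']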
for epoch in range(N_EPOCH):
    t1 = time.time()
    # Train
    train_acc, train_loss = train(model, train_iter, optimizer, loss_func)
    # Validate
    val_acc, val_loss = val(model, val_iter, loss_func)
    diff = time.time() - t1
    print("Epoch [{}/{}] Train acc {:.4f} Train loss {:.4f} "
          "Val acc {:.4f} Val loss {:.4f} Time:{}".format(
              epoch + 1, N_EPOCH, train_acc, train_loss,
              val_acc, val_loss, int(diff)))
    # Save the best model whenever the validation loss improves.
    if val_loss < best_val_loss:
        is_best = True
        print('save model, loss decreasing {:.4f}->{:.4f}'.format(
            best_val_loss, val_loss))
        best_val_loss = val_loss
        save_checkpoint(
            {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'train_loss': train_loss,
                'val_loss': val_loss,
                'best_loss': best_val_loss,
                'optimizer': optimizer.state_dict()
            }, is_best)
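
# The save_checkpoint(state, is_best, ...) helper used in this loop (and,
# with extra keyword arguments, in the surrounding snippets) is not shown.
# A minimal sketch of the conventional torch.save-plus-copy pattern; the
# default filenames here are assumptions:
import os
import shutil
import torch

def save_checkpoint(state, is_best, checkpoint='.', filename='checkpoint.pth.tar'):
    # Always write the latest state, and duplicate it as the "best" file
    # whenever the caller flags an improvement.
    filepath = os.path.join(checkpoint, filename)
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint, 'model_best.pth.tar'))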
train_loss, train_acc = train(train_loader, model, criterion, optimizer, use_cuda)
test_loss, test_acc = validation(val_loader, model, criterion, use_cuda)

# Log this epoch's scalars
writer.add_scalar('lr', state['lr'], epoch + 1)
writer.add_scalar('train_loss', train_loss, epoch + 1)
writer.add_scalar('test_loss', test_loss, epoch + 1)
writer.add_scalar('train_acc', train_acc, epoch + 1)
writer.add_scalar('test_acc', test_acc, epoch + 1)

# Save model
is_best = test_acc > best_acc
best_acc = max(test_acc, best_acc)
save_checkpoint(
    {
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'acc': test_acc,
        'best_acc': best_acc,
        'optimizer': optimizer.state_dict(),
    }, is_best, checkpoint=args.checkpoint)

# Export scalar data to JSON for external processing
writer.export_scalars_to_json(
    os.path.join(args.checkpoint, 'logger' + str(start_epoch) + '.json'))
writer.close()
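
# export_scalars_to_json is a tensorboardX API (torch.utils.tensorboard does
# not provide it), so the writer was presumably created along these lines;
# reusing args.checkpoint as the log directory is an assumption:
from tensorboardX import SummaryWriter

writer = SummaryWriter(args.checkpoint)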
def setup_and_run_train(n_channels, n_classes, dir_img, dir_gt, dir_results,
                        load, val_perc, batch_size, epochs, lr, run,
                        optimizer, loss, evaluation, dir_weights):
    # Use GPU if available
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Create the model
    net = UNet(n_channels, n_classes).to(device)
    net = torch.nn.DataParallel(
        net, device_ids=list(range(torch.cuda.device_count()))).to(device)

    # Load previous weights, if requested
    if load:
        net.load_state_dict(torch.load(load))
        print('Model loaded from {}'.format(load))

    # Load the dataset (weighted cross-entropy also needs the weight maps)
    if loss != "WCE":
        train_loader, val_loader = get_dataloaders(dir_img, dir_gt,
                                                   val_perc, batch_size)
    else:
        train_loader, val_loader = get_dataloaders(dir_img, dir_gt,
                                                   val_perc, batch_size,
                                                   isWCE=True,
                                                   dir_weights=dir_weights)

    # Pretty-print the run configuration
    print('''\n    Starting training:
        Dataset: {}
        Num Channels: {}
        Groundtruth: {}
        Num Classes: {}
        Folder to save: {}
        Load previous: {}
        Training size: {}
        Validation size: {}
        Validation Percentage: {}
        Batch size: {}
        Epochs: {}
        Learning rate: {}
        Optimizer: {}
        Loss Function: {}
        Evaluation Function: {}
        CUDA: {}
    '''.format(dir_img, n_channels, dir_gt, n_classes, dir_results, load,
               len(train_loader) * batch_size, len(val_loader) * batch_size,
               val_perc, batch_size, epochs, lr, optimizer, loss,
               evaluation, use_cuda))

    # Definition of the optimizer (add more as needed)
    if optimizer == "Adam":
        optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    elif optimizer == "SGD":
        optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=0.9,
                                    weight_decay=0.0005)
    else:
        raise ValueError("Unknown optimizer: {}".format(optimizer))

    # Definition of the loss function (add more as needed)
    if loss == "Dice":
        criterion = DiceLoss()
    elif loss == "RMSE":
        criterion = RMSELoss()
    elif loss == "MSE":
        criterion = nn.MSELoss()
    elif loss == "MAE":
        criterion = nn.L1Loss()
    elif loss == "CE":
        criterion = CELoss()
    elif loss == "WCE":
        criterion = WCELoss()
    else:
        raise ValueError("Unknown loss function: {}".format(loss))

    # History is saved to CSV under this header
    header = ['epoch', 'train loss']
    best_loss = float('inf')
    time_start = time.time()

    # Run the training and validation
    for epoch in range(epochs):
        print('\nStarting epoch {}/{}.'.format(epoch + 1, epochs))
        train_loss = train_net(net, device, train_loader, optimizer,
                               criterion, batch_size, isWCE=(loss == "WCE"))
        # val_loss = val_net(net, device, val_loader, criterion_val, batch_size)
        values = [epoch + 1, train_loss]
        export_history(header, values, dir_results, "result" + run + ".csv")

        # Save the model whenever the training loss improves
        if train_loss < best_loss:
            best_loss = train_loss
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': net.state_dict(),
                'loss': train_loss,
                'optimizer': optimizer.state_dict(),
            }, path=dir_results, filename="weights" + run + ".pth")

    time_dif = time.time() - time_start
    print("It took %.4f seconds to finish the run." % time_dif)
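
# A hypothetical call; every path and hyperparameter below is a placeholder.
setup_and_run_train(n_channels=3, n_classes=1,
                    dir_img='data/imgs', dir_gt='data/masks',
                    dir_results='results', load=None,
                    val_perc=0.1, batch_size=4, epochs=50, lr=1e-3,
                    run='0', optimizer='Adam', loss='Dice',
                    evaluation='Dice', dir_weights='data/weights')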