Example #1
def _callback(epoch_num, sym=None, arg=None, aux=None):
    # `save_freq` and `prefix` come from the enclosing scope: this is the
    # inner function of a checkpoint-callback factory.
    if epoch_num % save_freq == 0:
        misc.save_checkpoint(prefix,
                             epoch_num,
                             symbol=sym,
                             arg_params=arg,
                             aux_params=aux)
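
In this snippet, `save_freq` and `prefix` are free variables, so the callback is presumably returned by a factory that closes over them. A minimal sketch of what such a factory could look like (the name `make_checkpoint_callback` is an assumption, not part of the original code, and `misc` is assumed to be imported):

def make_checkpoint_callback(prefix, save_freq):
    # Build an epoch-end callback that saves every `save_freq` epochs.
    def _callback(epoch_num, sym=None, arg=None, aux=None):
        if epoch_num % save_freq == 0:
            misc.save_checkpoint(prefix,
                                 epoch_num,
                                 symbol=sym,
                                 arg_params=arg,
                                 aux_params=aux)
    return _callback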
Example #2
def trainer(dataloader,
            model,
            criterion,
            optimizer,
            args,
            num_i,
            epoch_num=10,
            checkpoint=0,
            device="cuda:0"):
    print('======= Start Training =======')
    best_epoch = 0
    best_acc = 0.0
    recorder = open('acc_result.txt', 'w')
    for epoch in range(epoch_num):

        time_start = time.time()
        print('Epoch {}/{}'.format(epoch, epoch_num))
        print('=' * 40)
        train_acc = train(dataloader['train'], model, criterion, optimizer,
                          device, recorder)
        valid_acc = validate(dataloader['val'], model, criterion, optimizer,
                             device, recorder)
        time_elapsed = time.time() - time_start
        print('-' * 10)
        print('complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60,
                                                   time_elapsed % 60))
        output = 'Epoch:{:3}    Train Acc={:.3f}, Val Acc={:.3f}'.format(
            epoch, train_acc, valid_acc)
        print(output)
        recorder.write(output + '\n')
        print('-' * 10)

        if valid_acc > best_acc:
            best_acc = valid_acc
            best_epoch = epoch
            is_best = True
        else:
            is_best = False
        if checkpoint == 1:
            misc.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                },
                is_best,
                num=args.classnum,
                filename="checkpoint_v{}_{:02}_{:1}.pth.tar".format(
                    num_i, epoch, args.classnum))
    recorder.write(f'best epoch: {best_epoch}')
    recorder.close()
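
Examples #2, #4, and #5 all pass a state dict and an `is_best` flag to a `save_checkpoint` helper whose source is not shown (its exact signature varies between projects; compare Example #3). A minimal sketch of the common PyTorch pattern such a helper usually follows, with the file names as assumptions:

import os
import shutil

import torch


def save_checkpoint(state, is_best, checkpoint='.', filename='checkpoint.pth.tar'):
    # Serialize the full training state, then keep a copy of the best model.
    path = os.path.join(checkpoint, filename)
    torch.save(state, path)
    if is_best:
        shutil.copyfile(path, os.path.join(checkpoint, 'model_best.pth.tar'))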
Example #3
def main():
    torch.manual_seed(0)
    torch.random.manual_seed(0)

    # create results folder, if not already exists
    output_directory = misc.get_output_directory(args)
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    train_csv = os.path.join(output_directory, 'train.csv')
    test_csv = os.path.join(output_directory, 'test.csv')
    best_txt = os.path.join(output_directory, 'best.txt')
    
    print("=> creating data loaders ...")
    if args.data == 'MNIST':
        datadir = './data/'
        all_dataset = loader.MNIST(datadir)
        train_size = len(all_dataset) // 5 * 4
        test_size = len(all_dataset) // 10
        val_size = len(all_dataset) - (train_size + test_size)
        train_dataset, test_dataset, val_dataset = torch.utils.data.random_split(
            all_dataset, [train_size, test_size, val_size])
    else:
        raise RuntimeError('Dataset not found. The dataset must be MNIST.')

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True, sampler=None)
    # set batch size to 1 for evaluation
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1,
                                              shuffle=False,
                                              num_workers=args.workers,
                                              pin_memory=True)

    print("=> data loaders created.")

    # optionally resume from a checkpoint
    if args.start_epoch != 0:
        assert os.path.isfile(args.resume), \
            "=> no checkpoint found at '{}'".format(args.resume)
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        args.start_epoch = checkpoint['epoch'] + 1
        best_result = checkpoint['best_result']
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']
        print("=> loaded checkpoint (epoch {})".format(checkpoint['epoch']))

    # create new model
    else:
        # define model
        print("=> creating Model ({}) ...".format(args.arch))

        if args.arch == 'resnet50':
            model = models.ResNet(50)
        else:
            raise RuntimeError("model not found")

        best_result = None  # no validation result yet
        print("=> model created.")

    # define loss function (criterion) and optimizer
    if args.criterion == 'cce':
        criterion = criteria.CrossEntropyLoss().cuda()
    else:
        raise RuntimeError("criterion not found")

    # when resuming, keep the optimizer restored from the checkpoint;
    # otherwise build a fresh one from the arguments
    if args.start_epoch == 0:
        if args.optimizer == 'Adam':
            optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                         weight_decay=args.weight_decay)
        elif args.optimizer == 'SGD':
            optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                        momentum=args.momentum,
                                        weight_decay=args.weight_decay)
        else:
            raise RuntimeError("optimizer not defined")

    optimizer_scheduler = lr_scheduler.StepLR(optimizer, args.epochs//3)

    model = model.cuda()
    print(model)
    print("=> model transferred to GPU.")

    train_logger, test_logger = None, None

    for epoch in range(args.start_epoch, args.epochs):
        train_result = train.train(train_loader, model, criterion, optimizer)

        if train_logger is None:  # also holds when resuming past epoch 0
            train_logger = logger.Logger(train_result, output_directory, train_csv)
        else:
            train_logger.append(train_result)

        optimizer_scheduler.step()

        # evaluate on validation set
        test_result = test.validate(test_loader, model, criterion, optimizer)

        if test_logger is None:
            test_logger = logger.Logger(test_result, output_directory, test_csv)
        else:
            test_logger.append(test_result)

        # track the best validation result so far; assumes test_result is an
        # accuracy-like scalar where larger is better
        is_best = best_result is None or test_result > best_result
        if is_best:
            best_result = test_result

        misc.save_checkpoint({
            'args': args,
            'epoch': epoch,
            'arch': args.arch,
            'model': model,
            'best_result': best_result,
            'optimizer': optimizer,
        }, is_best, epoch, output_directory)

    train_logger.write_into_file('train')
    test_logger.write_into_file('test')
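
Unlike the other examples, this one stores the whole model and optimizer objects in the checkpoint rather than their state_dicts; saving state_dicts is generally the more portable choice. A sketch of the equivalent state_dict-based call, with keys chosen by analogy with Examples #2 and #5:

misc.save_checkpoint({
    'args': args,
    'epoch': epoch,
    'arch': args.arch,
    'state_dict': model.state_dict(),
    'best_result': best_result,
    'optimizer': optimizer.state_dict(),
}, is_best, epoch, output_directory)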
Example #4
    for epoch in range(N_EPOCH):
        t1 = time.time()

        # train
        train_acc, train_loss = train(model, train_iter, optimizer, loss_func)
        # val
        val_acc, val_loss = val(model, val_iter, loss_func)

        diff = (time.time() - t1)
        print("Epoch [{}/{}] Train acc {:.4f} Train loss {:.4f} "
              "Val acc {:.4f} Val loss {:.4f} Time:{}".format(
                  epoch + 1, N_EPOCH, train_acc, train_loss, val_acc, val_loss,
                  int(diff)))

        # save the best model
        if val_loss < best_val_loss:
            is_best = True
            print('save model: loss decreasing {:.4f}->{:.4f}'.format(
                best_val_loss, val_loss))
            best_val_loss = val_loss

            save_checkpoint(
                {
                    'epoch': epoch,
                    'state_dict': model.state_dict(),
                    'train_loss': train_loss,
                    'val_loss': val_loss,
                    'best_loss': best_val_loss,
                    'optimizer': optimizer.state_dict()
                }, is_best)
Example #5
    # per-epoch body (the enclosing epoch loop is omitted in this snippet)
    train_loss, train_acc = train(train_loader, model, criterion, optimizer,
                                  use_cuda)
    test_loss, test_acc = validation(val_loader, model, criterion, use_cuda)

    # Append logger file
    writer.add_scalar('lr', state['lr'], epoch + 1)
    writer.add_scalar('train_loss', train_loss, epoch + 1)
    writer.add_scalar('test_loss', test_loss, epoch + 1)
    writer.add_scalar('train_acc', train_acc, epoch + 1)
    writer.add_scalar('test_acc', test_acc, epoch + 1)

    # Save model
    is_best = test_acc > best_acc
    best_acc = max(test_acc, best_acc)
    save_checkpoint(
        {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'acc': test_acc,
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        },
        is_best,
        checkpoint=args.checkpoint)

# Export scalar data to JSON for external processing
writer.export_scalars_to_json(
    os.path.join(args.checkpoint, 'logger' + str(start_epoch) + '.json'))
writer.close()
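
A checkpoint written this way can be restored before resuming training. A minimal sketch of the matching load step (`load_checkpoint` is a hypothetical helper; the keys mirror the dict saved above, and `model` and `optimizer` must already be constructed):

import os

import torch


def load_checkpoint(model, optimizer, checkpoint_dir, filename='checkpoint.pth.tar'):
    # Restore model/optimizer state and return where to resume from.
    state = torch.load(os.path.join(checkpoint_dir, filename))
    model.load_state_dict(state['state_dict'])
    optimizer.load_state_dict(state['optimizer'])
    return state['epoch'], state['best_acc']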
Example #6
def setup_and_run_train(n_channels, n_classes, dir_img, dir_gt, dir_results, load,
                        val_perc, batch_size, epochs, lr, run, optimizer, loss,
                        evaluation, dir_weights):
    
    # Use GPU or not
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Create the model
    net = UNet(n_channels, n_classes).to(device)
    if use_cuda:
        # wrap in DataParallel only when at least one GPU is available
        net = torch.nn.DataParallel(net, device_ids=list(
            range(torch.cuda.device_count()))).to(device)

    # Load old weights
    if load:
        net.load_state_dict(torch.load(load))
        print('Model loaded from {}'.format(load))

    # Load the dataset
    if loss != "WCE":
        train_loader, val_loader = get_dataloaders(dir_img, dir_gt, val_perc, batch_size)
    else:
        train_loader, val_loader = get_dataloaders(dir_img, dir_gt, val_perc, batch_size, isWCE = True, dir_weights = dir_weights)

    # Pretty print of the run
    print('''\n
    Starting training:
        Dataset: {}
        Num Channels: {}
        Groundtruth: {}
        Num Classes: {}
        Folder to save: {}
        Load previous: {}
        Training size: {}
        Validation size: {}
        Validation Percentage: {}
        Batch size: {}
        Epochs: {}
        Learning rate: {}
        Optimizer: {}
        Loss Function: {}
        Evaluation Function: {}
        CUDA: {}
    '''.format(dir_img, n_channels, dir_gt, n_classes, dir_results, load, 
            len(train_loader)*batch_size, len(val_loader)*batch_size, 
            val_perc, batch_size, epochs, lr, optimizer, loss, evaluation, use_cuda))

    # Definition of the optimizer (add more if you want)
    if optimizer == "Adam":
        optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    elif optimizer == "SGD":
        optimizer = torch.optim.SGD(net.parameters(),
                                    lr=lr,
                                    momentum=0.9,
                                    weight_decay=0.0005)
    else:
        raise ValueError("unknown optimizer: {}".format(optimizer))

    # Definition of the loss function (add more if you want)
    if loss == "Dice":
        criterion = DiceLoss()
    elif loss == "RMSE":
        criterion = RMSELoss()
    elif loss == "MSE":
        criterion = nn.MSELoss()
    elif loss == "MAE":
        criterion = nn.L1Loss()
    elif loss == "CE":
        criterion = CELoss()
    elif loss == "WCE":
        criterion = WCELoss()
    else:
        raise ValueError("unknown loss: {}".format(loss))

    # Saving History to csv
    header = ['epoch', 'train loss']

    best_loss = float('inf')  # any first-epoch loss counts as an improvement
    time_start = time.time()
    # Run the training and validation
    for epoch in range(epochs):
        print('\nStarting epoch {}/{}.'.format(epoch + 1, epochs))

        train_loss = train_net(net, device, train_loader, optimizer, criterion,
                               batch_size, isWCE=(loss == "WCE"))
        #val_loss = val_net(net, device, val_loader, criterion_val, batch_size)
        
        values = [epoch+1, train_loss]
        export_history(header, values, dir_results, "result"+run+".csv")
        
        # save model
        if train_loss < best_loss:
            best_loss = train_loss
            save_checkpoint({
                    'epoch': epoch + 1,
                    'state_dict': net.state_dict(),
                    'loss': train_loss,
                    'optimizer' : optimizer.state_dict(),
                }, path=dir_results, filename="weights"+run+".pth")

    time_dif = time.time() - time_start
    print("It tooks %.4f seconds to finish the run." % (time_dif))