Example #1
import json
import os

import torch


def save_json(args, model, reglog, optimizer, loader):
    pred_label = []
    log_top1 = AverageMeter()

    for iter_epoch, (inp, target) in enumerate(loader):
        # decay the learning rate based on the global iteration
        learning_rate_decay(optimizer, len(loader) * args.epoch + iter_epoch, args.lr)

        # start at iter start_iter
        if iter_epoch < args.start_iter:
            continue

        # move to gpu
        inp = inp.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        if 'VOC2007' in args.data_path:
            target = target.float()

        # forward
        with torch.no_grad():
            output = model(inp)

        output = reglog(output)
        _, pred = output.topk(1, 1, True, True)
        pred = pred.t()

        # collect the top-1 predictions on the cpu
        pred_label.extend(pred.cpu().numpy().reshape(-1).tolist())

        prec1 = accuracy(args, output, target)
        log_top1.update(prec1.item(), output.size(0))


    # read the image names (json keys); they are assumed to follow the
    # same order as the loader's samples
    def load_json(file_path):
        assert os.path.exists(file_path), "{} does not exist".format(file_path)
        with open(file_path, 'r') as fp:
            data = json.load(fp)
        img_names = list(data.keys())
        return img_names
    
    json_predictions = {}
    img_names = load_json('./val_targets.json')

    for idx in range(len(pred_label)):
        json_predictions[img_names[idx]] = int(pred_label[idx])
    output_file = os.path.join(args.json_save_path, args.json_save_name)

    with open(output_file, 'w') as fp:
        json.dump(json_predictions, fp)

    return log_top1.avg
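
Both this example and the next lean on AverageMeter and accuracy helpers that the snippets never define. A minimal sketch consistent with how they are called (.update(val, n), .val, .avg; accuracy returning top-1 precision as a tensor) could look like the following; the exact project versions may differ.

import torch


class AverageMeter:
    """Tracks the latest value and the running average of a metric."""

    def __init__(self):
        self.val = 0.0
        self.avg = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def accuracy(args, output, target):
    """Top-1 precision (in %) of output logits against integer targets.

    args is kept only for signature compatibility with the examples.
    """
    with torch.no_grad():
        pred = output.argmax(dim=1)
        correct = pred.eq(target).float().sum()
        return correct.mul_(100.0 / target.size(0))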
Example #2
import os
import time

import torch
import torch.nn as nn


def train_network(args, model, reglog, optimizer, loader):
    """
    Train the models on the dataset.
    """
    # running statistics
    batch_time = AverageMeter()
    data_time = AverageMeter()

    # training statistics
    log_top1 = AverageMeter()
    log_loss = AverageMeter()
    end = time.perf_counter()

    # Pascal VOC is multi-label, so use per-element BCE; otherwise plain cross-entropy
    if 'pascal' in args.data_path:
        criterion = nn.BCEWithLogitsLoss(reduction='none')
    else:
        criterion = nn.CrossEntropyLoss().cuda()

    for iter_epoch, (inp, target) in enumerate(loader):
        # measure data loading time
        data_time.update(time.perf_counter() - end)

        learning_rate_decay(optimizer, len(loader) * args.epoch + iter_epoch, args.lr)

        # start at iter start_iter
        if iter_epoch < args.start_iter:
            continue

        # move to gpu
        inp = inp.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        if 'pascal' in args.data_path:
            target = target.float()

        # forward through the frozen backbone (no gradients), then the trainable head
        with torch.no_grad():
            output = model(inp)
        output = reglog(output)

        # compute the classification loss
        loss = criterion(output, target)

        if 'pascal' in args.data_path:
            # ignore targets flagged as 255 ('difficult' labels on Pascal VOC)
            mask = (target == 255)
            loss = torch.sum(loss.masked_fill_(mask, 0)) / target.size(0)

        optimizer.zero_grad()

        # compute the gradients
        loss.backward()

        # step
        optimizer.step()

        # preemption signal received: checkpoint, then relaunch the experiment
        if os.environ.get('SIGNAL_RECEIVED') == 'True':
            if not args.rank:
                torch.save({
                    'epoch': args.epoch,
                    'start_iter': iter_epoch + 1,
                    'state_dict': reglog.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, os.path.join(args.dump_path, 'checkpoint.pth.tar'))
                trigger_job_requeue(os.path.join(args.dump_path, 'checkpoint.pth.tar'))

        # update stats
        log_loss.update(loss.item(), output.size(0))
        if 'pascal' not in args.data_path:
            prec1 = accuracy(args, output, target)
            log_top1.update(prec1.item(), output.size(0))

        batch_time.update(time.perf_counter() - end)
        end = time.perf_counter()

        # verbose
        if iter_epoch % 100 == 0:
            logger.info('Epoch[{0}] - Iter: [{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec {log_top1.val:.3f} ({log_top1.avg:.3f})\t'
                  .format(args.epoch, iter_epoch, len(loader), batch_time=batch_time,
                   data_time=data_time, loss=log_loss, log_top1=log_top1))

    # end of epoch
    args.start_iter = 0
    args.epoch += 1

    # dump checkpoint
    if not args.rank:
        torch.save({
            'epoch': args.epoch,
            'start_iter': 0,
            'state_dict': reglog.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, os.path.join(args.dump_path, 'checkpoint.pth.tar'))

    return (args.epoch - 1, args.epoch * len(loader), log_top1.avg, log_loss.avg)
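
learning_rate_decay is called with the global iteration count and the base learning rate, but is never shown. A plausible minimal sketch, assuming a simple inverse-square-root schedule (the schedule actually used by the project may differ), is:

def learning_rate_decay(optimizer, t, lr_0):
    """Set every param group's lr to lr_0 / sqrt(1 + t).

    t is the global iteration; the schedule itself is an assumption,
    since the original helper is not part of these examples.
    """
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr_0 / (1 + t) ** 0.5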
Example #3
import os
import shutil
import time

import torch
import torch.nn as nn


def train_network(args, model, optimizer, dataset):
    """
    Train the models on the dataset.
    """
    # switch to train mode
    model.train()

    sampler = torch.utils.data.distributed.DistributedSampler(dataset)
    # reshuffle with a different seed every epoch; without set_epoch the
    # freshly built sampler would yield the same ordering each time
    sampler.set_epoch(args.epoch)

    loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        batch_size=args.batch_size,
        num_workers=args.workers,
        pin_memory=True,
    )

    # running statistics
    batch_time = AverageMeter()
    data_time = AverageMeter()

    # training statistics
    log_top1 = AverageMeter()
    log_loss = AverageMeter()
    end = time.perf_counter()

    cel = nn.CrossEntropyLoss().cuda()

    for iter_epoch, (inp, target) in enumerate(loader):
        # measure data loading time
        data_time.update(time.perf_counter() - end)

        # start at iter start_iter
        if iter_epoch < args.start_iter:
            continue

        # move to gpu
        inp = inp.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # forward
        output = model(inp)

        # compute cross entropy loss
        loss = cel(output, target)

        optimizer.zero_grad()

        # compute the gradients
        loss.backward()

        # step
        optimizer.step()

        # preemption signal received: checkpoint, then relaunch the experiment
        if os.environ.get('SIGNAL_RECEIVED') == 'True':
            if not args.rank:
                torch.save(
                    {
                        'epoch': args.epoch,
                        'start_iter': iter_epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                    }, os.path.join(args.dump_path, 'checkpoint.pth.tar'))
                trigger_job_requeue(
                    os.path.join(args.dump_path, 'checkpoint.pth.tar'))

        # update stats
        log_loss.update(loss.item(), output.size(0))
        prec1 = accuracy(args, output, target)
        log_top1.update(prec1.item(), output.size(0))

        batch_time.update(time.perf_counter() - end)
        end = time.perf_counter()

        # verbose
        if iter_epoch % 100 == 0:
            logger.info(
                'Epoch[{0}] - Iter: [{1}/{2}]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                'Prec {log_top1.val:.3f} ({log_top1.avg:.3f})\t'.format(
                    args.epoch,
                    iter_epoch,
                    len(loader),
                    batch_time=batch_time,
                    data_time=data_time,
                    loss=log_loss,
                    log_top1=log_top1))

    # end of epoch
    args.start_iter = 0
    args.epoch += 1

    # dump checkpoint
    if not args.rank:
        torch.save(
            {
                'epoch': args.epoch,
                'start_iter': 0,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, os.path.join(args.dump_path, 'checkpoint.pth.tar'))
        # keep a permanent copy every checkpoint_freq epochs
        if not (args.epoch - 1) % args.checkpoint_freq:
            shutil.copyfile(
                os.path.join(args.dump_path, 'checkpoint.pth.tar'),
                os.path.join(args.dump_checkpoints,
                             'checkpoint' + str(args.epoch - 1) + '.pth.tar'),
            )

    return (args.epoch - 1, args.epoch * len(loader), log_top1.avg,
            log_loss.avg)
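
For context, here is a hedged sketch of how this third example might be driven end to end. Everything below (the argument values, the single-process distributed group, the ResNet backbone) is illustrative scaffolding rather than part of the original code.

import argparse
import os

import torch
import torchvision
import torchvision.transforms as T

# single-process "distributed" group so DistributedSampler can be built
torch.distributed.init_process_group(
    'gloo', init_method='tcp://127.0.0.1:23456', rank=0, world_size=1)
os.environ['SIGNAL_RECEIVED'] = 'False'  # no preemption in this sketch

# all values below are illustrative only
args = argparse.Namespace(
    data_path='/datasets/imagenet',      # hypothetical dataset location
    lr=0.01, batch_size=64, workers=4,
    epoch=0, start_iter=0, rank=0,
    dump_path='./dump',
    dump_checkpoints='./dump/checkpoints',
    checkpoint_freq=5,
)
os.makedirs(args.dump_path, exist_ok=True)
os.makedirs(args.dump_checkpoints, exist_ok=True)

dataset = torchvision.datasets.ImageFolder(
    args.data_path,
    transform=T.Compose([T.Resize(256), T.CenterCrop(224), T.ToTensor()]),
)
model = torchvision.models.resnet18(num_classes=1000).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)

for _ in range(10):  # train for ten epochs
    epoch, n_iters, top1, loss = train_network(args, model, optimizer, dataset)
    print(f'epoch {epoch}: top1={top1:.2f} loss={loss:.4f}')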