Example #1
def check_termination(epoch):
    if AutoResume:
        should_terminate = AutoResume.termination_requested()
        if should_terminate:
            if args.global_rank == 0:
                progress = "Progress %d%% (epoch %d of %d)" % (
                    (epoch * 100 / args.max_epoch), epoch, args.max_epoch)
                AutoResume.request_resume(user_dict={
                    "RESUME_FILE": logx.save_ckpt_fn,
                    "TENSORBOARD_DIR": args.result_dir,
                    "EPOCH": str(epoch)
                }, message=progress)
            return 1
    return 0
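
# A minimal usage sketch: check_termination() above is meant to be polled once
# per epoch inside the training loop, as Examples #3 and #4 below do. The
# run_one_epoch() stub and the max_epoch argument are hypothetical placeholders,
# not part of the original examples.
def run_one_epoch(epoch):
    pass  # stand-in for the real per-epoch training work


def training_loop(max_epoch):
    for epoch in range(max_epoch):
        run_one_epoch(epoch)
        # A non-zero return means AutoResume asked the job to stop; exit
        # cleanly so the scheduler can requeue and later resume from the
        # checkpoint registered via request_resume().
        if check_termination(epoch):
            return 0
    return 0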
Example #2
def train(train_ds_path,
          val_ds_path,
          pths_path,
          results_path,
          batch_size,
          lr,
          num_workers,
          train_iter,
          interval,
          opt_level=0,
          checkpoint_path=None,
          val_freq=10):
    torch.cuda.set_device(rank)

    tensorboard_dir = os.path.join(results_path, 'logs')
    checkpoints_dir = os.path.join(results_path, 'checkpoints')
    if rank == 0:
        os.makedirs(tensorboard_dir, exist_ok=True)
        os.makedirs(checkpoints_dir, exist_ok=True)
    barrier()

    try:
        logger.info('Importing AutoResume lib...')
        from userlib.auto_resume import AutoResume as auto_resume
        auto_resume.init()
        logger.info('Success!')
    except Exception:
        logger.info('Failed!')
        auto_resume = None

    trainset = custom_dataset(
        os.path.join(train_ds_path, 'images'),
        os.path.join(train_ds_path, 'gt'),
    )

    valset = custom_dataset(os.path.join(val_ds_path, 'images'),
                            os.path.join(val_ds_path, 'gt'),
                            is_val=True)

    logger.info(f'World Size: {world_size}, Rank: {rank}')

    if world_size > 1:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            trainset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            valset, shuffle=False)
    else:
        train_sampler = None
        val_sampler = None

    worker_init = LoaderWorkerProcessInit(rank, 43)
    train_loader = DataLoader(trainset,
                              batch_size=batch_size,
                              shuffle=train_sampler is None,
                              sampler=train_sampler,
                              num_workers=num_workers,
                              pin_memory=True,
                              drop_last=True,
                              worker_init_fn=worker_init)
    val_loader = DataLoader(valset,
                            batch_size=batch_size,
                            shuffle=False,
                            sampler=val_sampler,
                            num_workers=num_workers,
                            pin_memory=True,
                            drop_last=True,
                            worker_init_fn=worker_init)

    criterion = Loss()

    device = torch.device(
        f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    model = EAST()
    model.to(device)

    model = apex.parallel.convert_syncbn_model(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=f'O{opt_level}')

    start_iter = 0
    if auto_resume is not None:
        auto_resume_details = auto_resume.get_resume_details()
        if auto_resume_details is not None:
            logger.info(
                'Detected that this is a resumption of a previous job!')
            checkpoint_path = auto_resume_details['CHECKPOINT_PATH']

    if checkpoint_path:
        logger.info(f'Loading checkpoint at path "{checkpoint_path}"...')
        checkpoint = torch.load(checkpoint_path, map_location=f'cuda:{rank}')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        amp.load_state_dict(checkpoint['amp_state'])
        start_iter = checkpoint['iter']
        logger.info('Done')

    data_parallel = False
    main_model = model
    if torch.distributed.is_initialized():
        logger.info(
            f'DistributedDataParallel: Using {torch.cuda.device_count()} devices!')
        model = DDP(model)
        data_parallel = True

    for param_group in optimizer.param_groups:
        param_group.setdefault('initial_lr', lr)
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=[train_iter // 2],
                                         gamma=0.1,
                                         last_epoch=start_iter)

    # This allows us to change dataset size without affecting things such as validation frequency
    steps_per_epoch = 1000 // (world_size * batch_size)

    step = start_iter
    start_epoch = step // steps_per_epoch
    epoch_iter = int(math.ceil(train_iter / steps_per_epoch))
    if rank == 0:
        logger.info('Initializing Tensorboard')
        writer = SummaryWriter(tensorboard_dir, purge_step=step)

    loss_meters = MeterDict(reset_on_value=True)
    val_loss_meters = MeterDict(reset_on_value=True)
    time_meters = MeterDict(reset_on_value=True)

    logger.info('Training')
    model.train()

    train_start_time = time.time()

    best_loss = float('inf')  # best validation loss seen so far

    # Keep the loader iterator in a list so get_batch() can re-create it when
    # the dataset is exhausted, without shadowing the train_iter step budget.
    train_batch_iter = [iter(train_loader)]

    def get_batch():
        try:
            return next(train_batch_iter[0])
        except StopIteration:
            train_batch_iter[0] = iter(train_loader)
            return get_batch()

    for epoch in range(start_epoch, epoch_iter):
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)

        epoch_loss = 0
        epoch_time = time.time()
        start_time = time.time()

        model.train()

        for i in range(steps_per_epoch):
            batch = get_batch()

            optimizer.zero_grad()

            batch = [b.cuda(rank, non_blocking=True) for b in batch]

            img, gt_score, gt_geo, ignored_map = batch
            barrier()
            time_meters['batch_time'].add_sample(time.time() - start_time)

            pred_score, pred_geo = model(img)

            loss, details = criterion(gt_score, pred_score, gt_geo, pred_geo,
                                      ignored_map)

            epoch_loss += loss.detach().item()

            with amp.scale_loss(loss, optimizer) as loss_scaled:
                loss_scaled.backward()
            optimizer.step()

            barrier()
            time_meters['step_time'].add_sample(time.time() - start_time)

            details['global'] = loss.detach().item()

            for k, v in details.items():
                loss_meters[k].add_sample(v)

            if i % 10 == 0:
                logger.info(f'\tStep [{i+1}/{steps_per_epoch}]')

            start_time = time.time()
            step += 1
            scheduler.step()

            if step == train_iter:
                break

        term_requested = (auto_resume is not None
                          and auto_resume.termination_requested())

        checkpoint_path = None
        if rank == 0:
            times = {k: m.value() for k, m in time_meters.items()}
            losses = {k: m.value() for k, m in loss_meters.items()}

            times['epoch'] = time.time() - epoch_time

            logger.info(
                f'Epoch is [{epoch+1}/{epoch_iter}], time consumption is {times}, batch_loss is {losses}'
            )

            for k, v in times.items():
                writer.add_scalar(f'performance/{k}', v, step)
            for k, v in losses.items():
                writer.add_scalar(f'loss/{k}', v, step)
            writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'],
                              step)

            if term_requested or (epoch + 1) % interval == 0:
                state_dict = main_model.state_dict()
                optim_state = optimizer.state_dict()

                checkpoint_path = os.path.join(
                    checkpoints_dir, 'model_epoch_{}.pth'.format(epoch + 1))
                logger.info(f'Saving checkpoint to "{checkpoint_path}"...')
                torch.save(
                    {
                        'model': state_dict,
                        'optimizer': optim_state,
                        'amp_state': amp.state_dict(),
                        'epoch': epoch + 1,
                        'iter': step
                    }, checkpoint_path)
                logger.info('Done')

        if (epoch + 1) % val_freq == 0 or step == train_iter:
            logger.info(f'Validating epoch {epoch+1}...')
            model.eval()
            val_loader.dataset.reset_random()
            with torch.no_grad():
                for i, batch in enumerate(val_loader):
                    batch = [b.cuda(rank, non_blocking=True) for b in batch]

                    img, gt_score, gt_geo, ignored_map = batch
                    barrier()

                    pred_score, pred_geo = model(img)

                    loss, details = criterion(gt_score, pred_score, gt_geo,
                                              pred_geo, ignored_map)
                    details['global'] = loss.detach().item()

                    barrier()

                    for k, v in details.items():
                        val_loss_meters[k].add_sample(v)

            print_dict = dict()
            for k, m in val_loss_meters.items():
                t = torch.tensor(m.value(),
                                 device=f'cuda:{rank}',
                                 dtype=torch.float32)
                if world_size > 1:
                    torch.distributed.reduce(t, 0)
                    t /= world_size
                if rank == 0:
                    writer.add_scalar(f'val/loss/{k}', t.item(), step)
                print_dict[k] = t.item()
            logger.info(f'\tLoss: {print_dict}')
            val_loss = print_dict['global']
            if rank == 0 and val_loss < best_loss:
                logger.info(
                    f'This is the best model so far. New loss: {val_loss}, previous: {best_loss}'
                )
                best_loss = val_loss
                # Checkpoints are only written on save intervals or on
                # termination, so guard against copying a missing file.
                if checkpoint_path is not None:
                    shutil.copyfile(checkpoint_path,
                                    os.path.join(checkpoints_dir, 'best.pth'))
            logger.info('Training')

        if term_requested:
            logger.warning('Termination requested! Exiting...')
            if rank == 0:
                auto_resume.request_resume(user_dict={
                    'CHECKPOINT_PATH': checkpoint_path,
                    'EPOCH': epoch
                })
            break

    logger.info(
        f'Finished training!!! Took {time.time()-train_start_time:0.3f} seconds!'
    )
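
# A compact sketch of the AutoResume lifecycle the example above relies on,
# assuming the same userlib.auto_resume API: init() once at startup,
# get_resume_details() to detect a restarted job, termination_requested()
# polled during training, and request_resume() to hand checkpoint metadata to
# the next run. maybe_resume() and maybe_stop_and_requeue() are hypothetical
# helper names used only for illustration.
try:
    from userlib.auto_resume import AutoResume
    AutoResume.init()
except Exception:
    AutoResume = None


def maybe_resume():
    # Return the checkpoint path stored by the previous run if this job was
    # restarted by the cluster, otherwise None.
    if AutoResume is None:
        return None
    details = AutoResume.get_resume_details()
    return details.get('CHECKPOINT_PATH') if details else None


def maybe_stop_and_requeue(checkpoint_path, epoch):
    # Poll for a termination request and, if one is pending, register the
    # metadata the resumed job will need before exiting.
    if AutoResume is not None and AutoResume.termination_requested():
        AutoResume.request_resume(
            user_dict={'CHECKPOINT_PATH': checkpoint_path, 'EPOCH': str(epoch)})
        return True
    return False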
Example #3
def main():
    """
    Main Function
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = \
        datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir, args.start_epoch))
    elif args.resume:
        checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    # Define the NASA optimizer parameters
    iter_tot = len(train_loader) * args.max_epoch
    # tau = args.tau_factor / sqrt(iter_tot)
    tau = 1
    net = network.get_net(args, criterion)
    k = 1
    # optim, scheduler = get_optimizer(args, net)
    optim, scheduler = get_optimizer(args, net, tau, k)
    # Visualize feature maps (kept disabled):
    # activation = {}
    # def get_activation(name):
    #     def hook(model, input, output):
    #         activation[name] = output.detach()
    #     return hook
    #
    # net.layer[0].register_forward_hook(get_activation('conv1'))
    # data, _ = dataset[0]
    # data.unsqueeze_(0)
    # output = model(data)
    #
    # act = activation['conv1'].squeeze()
    # fig, axarr = plt.subplots(act.size(0))
    # for idx in range(act.size(0)):
    #     axarr[idx].imshow(act[idx])

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.summary:
        from thop import profile
        img = torch.randn(1, 3, 640, 640).cuda()
        mask = torch.randn(1, 1, 640, 640).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets

    if args.eval == 'test':
        validate(val_loader,
                 net,
                 criterion=None,
                 optim=None,
                 epoch=0,
                 calc_metrics=False,
                 dump_assets=args.dump_assets,
                 dump_all_images=True,
                 testing=True,
                 grid=city)

        return 0

    if args.eval == 'val':

        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader,
                     net,
                     criterion=criterion_val,
                     optim=optim,
                     epoch=0,
                     dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader,
                 net,
                 criterion=criterion_val,
                 optim=optim,
                 epoch=0,
                 calc_metrics=False,
                 dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        raise ValueError('unknown eval option {}'.format(args.eval))

    for epoch in range(args.start_epoch, args.max_epoch):
        update_epoch(epoch)

        if args.only_coarse:
            train_obj.only_coarse()
            train_obj.build_epoch()
            if args.apex:
                train_loader.sampler.set_num_samples()

        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
                if args.apex:
                    train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
        else:
            pass

        train(train_loader, net, optim, epoch)

        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)

        if epoch % args.val_freq == 0:
            validate(val_loader, net, criterion_val, optim, epoch)

        scheduler.step()

        if check_termination(epoch):
            return 0
Example #4
def main():
    """
    Main Function
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True, hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = \
        datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir,
                            args.start_epoch))
    elif args.resume:
        checkpoint = torch.load(args.resume,
                                map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH', cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    net = network.get_net(args, criterion)
    optim, scheduler = get_optimizer(args, net)

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.summary:
        print(str(net))
        from pytorchOpCounter.thop import profile
        img = torch.randn(1, 3, 1024, 2048).cuda()
        mask = torch.randn(1, 1, 1024, 2048).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets
    if args.eval == 'val':

        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader, net, criterion=criterion_val, optim=optim, epoch=0,
                     dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader, net, criterion=None, optim=None, epoch=0,
                 calc_metrics=False, dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        raise ValueError('unknown eval option {}'.format(args.eval))

    for epoch in range(args.start_epoch, args.max_epoch):
        update_epoch(epoch)

        if args.only_coarse:
            train_obj.only_coarse()
            train_obj.build_epoch()
            if args.apex:
                train_loader.sampler.set_num_samples()

        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
                if args.apex:
                    train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
        else:
            pass

        train(train_loader, net, optim, epoch)

        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)

        if epoch % args.val_freq == 0:
            validate(val_loader, net, criterion_val, optim, epoch)

        scheduler.step()

        if check_termination(epoch):
            return 0
Example #5
def main():
    """
    Main Function
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=False,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir, args.start_epoch))
    elif args.resume:
        checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    net = network.get_net(args, criterion)
    optim, scheduler = get_optimizer(args, net)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        scheduler.step(args.start_epoch)

    if args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        # validate(val_loader, net, criterion=None, optim=None, epoch=0,
        #          calc_metrics=False, dump_assets=args.dump_assets,
        #          dump_all_images=True)
        os.makedirs(os.path.join(args.result_dir, 'image_2'), exist_ok=True)
        os.makedirs(os.path.join(args.result_dir, 'image_3'), exist_ok=True)

        num_image = 7481
        for idx in tqdm(range(num_image)):
            sample_idx = "%06d" % idx
            eval_minibatch(sample_idx, "image_2/", net, args)
            eval_minibatch(sample_idx, "image_3/", net, args)

        return 0
    elif args.eval is not None:
        raise ValueError('unknown eval option {}'.format(args.eval))