Example #1
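# Iteration-based training loop: steps the LR scheduler every iteration, accumulates running
# top-1/top-5 error, logs every display_interval iterations, and saves a checkpoint of the
# model state every save_interval iterations.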
def train(model,
          device,
          args,
          *,
          val_interval,
          bn_process=False,
          all_iters=None):

    optimizer = args.optimizer
    loss_function = args.loss_function
    scheduler = args.scheduler
    train_dataprovider = args.train_dataprovider

    t1 = time.time()
    Top1_err, Top5_err = 0.0, 0.0
    model.train()
    for iters in range(1, val_interval + 1):
        scheduler.step()
        if bn_process:
            adjust_bn_momentum(model, iters)

        all_iters += 1
        d_st = time.time()
        data, target = train_dataprovider.next()
        target = target.type(torch.LongTensor)
        data, target = data.to(device), target.to(device)
        data_time = time.time() - d_st

        output = model(data)
        loss = loss_function(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        prec1, prec5 = accuracy(output, target, topk=(1, 5))

        Top1_err += 1 - prec1.item() / 100
        Top5_err += 1 - prec5.item() / 100

        if all_iters % args.display_interval == 0:
            printInfo = 'TRAIN Iter {}: lr = {:.6f},\tloss = {:.6f},\t'.format(all_iters, scheduler.get_lr()[0], loss.item()) + \
                        'Top-1 err = {:.6f},\t'.format(Top1_err / args.display_interval) + \
                        'Top-5 err = {:.6f},\t'.format(Top5_err / args.display_interval) + \
                        'data_time = {:.6f},\ttrain_time = {:.6f}'.format(data_time, (time.time() - t1) / args.display_interval)
            logging.info(printInfo)
            t1 = time.time()
            Top1_err, Top5_err = 0.0, 0.0

        if all_iters % args.save_interval == 0:
            save_checkpoint({
                'state_dict': model.state_dict(),
            }, all_iters)

    return all_iters
Example #2
# Epoch-based fit loop: tracks the best validation loss and marks the saved checkpoint as best when it improves.
def fit(start_epoch, num_epochs, model, loss_func, opt, lr_scheduler,
        best_score, max_batches_per_iter_cnt, checkpoint_dir, train_loader,
        val_loader):
    for epoch in range(start_epoch, start_epoch + num_epochs):
        val_loss = train_one_epoch(model, loss_func, opt, lr_scheduler,
                                   max_batches_per_iter_cnt, train_loader,
                                   val_loader, epoch)
        if best_score > val_loss:
            best_score = val_loss
            save_as_best = True
        else:
            save_as_best = False
        save_checkpoint(epoch, model, opt, lr_scheduler, best_score,
                        checkpoint_dir, save_as_best)
Example #3
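# Distributed training worker: sets up the process group when distributed training is enabled,
# builds the model, optimizer and LR scheduler, optionally loads pretrained or resume checkpoints,
# then trains epoch by epoch with periodic validation and checkpointing on the master node.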
def main_worker(gpu_idx, configs):
    configs.gpu_idx = gpu_idx
    configs.device = torch.device('cpu' if configs.gpu_idx is None else 'cuda:{}'.format(configs.gpu_idx))

    if configs.distributed:
        if configs.dist_url == "env://" and configs.rank == -1:
            configs.rank = int(os.environ["RANK"])
        if configs.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            configs.rank = configs.rank * configs.ngpus_per_node + gpu_idx

        dist.init_process_group(backend=configs.dist_backend, init_method=configs.dist_url,
                                world_size=configs.world_size, rank=configs.rank)
        configs.subdivisions = int(64 / configs.batch_size / configs.ngpus_per_node)
    else:
        configs.subdivisions = int(64 / configs.batch_size)

    configs.is_master_node = (not configs.distributed) or (
            configs.distributed and (configs.rank % configs.ngpus_per_node == 0))

    if configs.is_master_node:
        logger = Logger(configs.logs_dir, configs.saved_fn)
        logger.info('>>> Created a new logger')
        logger.info('>>> configs: {}'.format(configs))
        tb_writer = SummaryWriter(log_dir=os.path.join(configs.logs_dir, 'tensorboard'))
    else:
        logger = None
        tb_writer = None

    # model
    model = create_model(configs)

    # load weight from a checkpoint
    if configs.pretrained_path is not None:
        assert os.path.isfile(configs.pretrained_path), "=> no checkpoint found at '{}'".format(configs.pretrained_path)
        model.load_state_dict(torch.load(configs.pretrained_path, map_location='cpu'))
        if logger is not None:
            logger.info('loaded pretrained model at {}'.format(configs.pretrained_path))

    # resume weights of model from a checkpoint
    if configs.resume_path is not None:
        assert os.path.isfile(configs.resume_path), "=> no checkpoint found at '{}'".format(configs.resume_path)
        model.load_state_dict(torch.load(configs.resume_path, map_location='cpu'))
        if logger is not None:
            logger.info('resume training model from checkpoint {}'.format(configs.resume_path))

    # Data Parallel
    model = make_data_parallel(model, configs)

    # Make sure to create optimizer after moving the model to cuda
    optimizer = create_optimizer(configs, model)
    lr_scheduler = create_lr_scheduler(optimizer, configs)
    configs.step_lr_in_epoch = configs.lr_type not in ['multi_step', 'cosin', 'one_cycle']

    # resume optimizer, lr_scheduler from a checkpoint
    if configs.resume_path is not None:
        utils_path = configs.resume_path.replace('Model_', 'Utils_')
        assert os.path.isfile(utils_path), "=> no checkpoint found at '{}'".format(utils_path)
        utils_state_dict = torch.load(utils_path, map_location='cuda:{}'.format(configs.gpu_idx))
        optimizer.load_state_dict(utils_state_dict['optimizer'])
        lr_scheduler.load_state_dict(utils_state_dict['lr_scheduler'])
        configs.start_epoch = utils_state_dict['epoch'] + 1

    if configs.is_master_node:
        num_parameters = get_num_parameters(model)
        logger.info('number of trained parameters of the model: {}'.format(num_parameters))

    if logger is not None:
        logger.info(">>> Loading dataset & getting dataloader...")
    # Create dataloader
    train_dataloader, train_sampler = create_train_dataloader(configs)
    if logger is not None:
        logger.info('number of batches in training set: {}'.format(len(train_dataloader)))

    if configs.evaluate:
        val_dataloader = create_val_dataloader(configs)
        val_loss = validate(val_dataloader, model, configs)
        print('val_loss: {:.4e}'.format(val_loss))
        return

    for epoch in range(configs.start_epoch, configs.num_epochs + 1):
        if logger is not None:
            logger.info('{}'.format('*-' * 40))
            logger.info('{} {}/{} {}'.format('=' * 35, epoch, configs.num_epochs, '=' * 35))
            logger.info('{}'.format('*-' * 40))
            logger.info('>>> Epoch: [{}/{}]'.format(epoch, configs.num_epochs))

        if configs.distributed:
            train_sampler.set_epoch(epoch)
        # train for one epoch
        train_one_epoch(train_dataloader, model, optimizer, lr_scheduler, epoch, configs, logger, tb_writer)
        if (not configs.no_val) and (epoch % configs.checkpoint_freq == 0):
            val_dataloader = create_val_dataloader(configs)
            print('number of batches in val_dataloader: {}'.format(len(val_dataloader)))
            val_loss = validate(val_dataloader, model, configs)
            print('val_loss: {:.4e}'.format(val_loss))
            if tb_writer is not None:
                tb_writer.add_scalar('Val_loss', val_loss, epoch)

        # Save checkpoint
        if configs.is_master_node and ((epoch % configs.checkpoint_freq) == 0):
            model_state_dict, utils_state_dict = get_saved_state(model, optimizer, lr_scheduler, epoch, configs)
            save_checkpoint(configs.checkpoints_dir, configs.saved_fn, model_state_dict, utils_state_dict, epoch)

        if not configs.step_lr_in_epoch:
            lr_scheduler.step()
            if tb_writer is not None:
                tb_writer.add_scalar('LR', lr_scheduler.get_lr()[0], epoch)

    if tb_writer is not None:
        tb_writer.close()
    if configs.distributed:
        cleanup()
Example #4
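# MoCo-style video pretraining worker: wraps a mask generator (netG) and a MoCo model (netD) in
# DistributedDataParallel, optionally resumes both networks from separate checkpoints, and saves
# each of them every 10 epochs on the master process.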
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu
    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))
    netG = moco.builder.MaskGenerator()
    netD = moco.builder.MoCo(models.__dict__[args.arch], args.moco_dim,
                             args.moco_k, args.moco_m, args.moco_t, args.mlp)
    print(netG)
    print(netD)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            netG.cuda(args.gpu)
            netD.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            netG = torch.nn.parallel.DistributedDataParallel(
                netG, device_ids=[args.gpu], find_unused_parameters=True)
            netD = torch.nn.parallel.DistributedDataParallel(
                netD, device_ids=[args.gpu], find_unused_parameters=True)
        else:
            netG.cuda()
            netD.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            netG = torch.nn.parallel.DistributedDataParallel(netG)
            netD = torch.nn.parallel.DistributedDataParallel(netD)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        netG = netG.cuda(args.gpu)
        netD = netD.cuda(args.gpu)
        # comment out the following line for debugging
        # raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        pass  # raise NotImplementedError("Only DistributedDataParallel is supported.") for debug on cpu
    # torch.cuda.synchronize()
    optimizer_g = torch.optim.SGD(netG.parameters(),
                                  args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    optimizer_d = torch.optim.SGD(netD.parameters(),
                                  args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    G_criterion = nn.L1Loss().cuda(args.gpu)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            netD.load_state_dict(checkpoint['state_dict'])
            #optimizer_d.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

        if os.path.isfile(args.resumeG):
            print("=> loading checkpoint '{}'".format(args.resumeG))
            if args.gpu is None:
                checkpoint = torch.load(args.resumeG)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resumeG, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            netG.load_state_dict(checkpoint['state_dict'])
            #optimizer_g.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resumeG, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resumeG))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    video_augmentation = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomResizedCropVideo(args.crop_size, (0.2, 1)),
    ])
    audio_augmentation = moco.loader.DummyAudioTransform()
    augmentation = {'video': video_augmentation, 'audio': audio_augmentation}

    augmentation_gpu = moco.loader.MoCoAugmentV2(
        args.crop_size) if args.aug_plus else moco.loader.MoCoAugment(
            args.crop_size)

    train_dataset = Kinetics400(traindir,
                                args.frame_per_clip,
                                args.step_between_clips,
                                extensions='mp4',
                                transform=augmentation,
                                num_workers=4)

    train_sampler = RandomClipSampler(train_dataset.video_clips, 1)

    if args.distributed:
        # train_sampler = torch.utils.data.distributed.DistributedSampler(train_sampler)
        train_sampler = DistributedSampler(train_sampler)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True,
                                               multiprocessing_context="fork")
    if args.multiprocessing_distributed and args.gpu == 0:
        log_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(args.log_dir,
                                                       args.batch_size,
                                                       args.lr, args.crop_size,
                                                       args.frame_per_clip)
        writer = SummaryWriter(log_dir)
    else:
        writer = None
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer_d, epoch, args)
        adjust_learning_rate(optimizer_g, epoch, args)

        # train for one epoch
        train(train_loader, augmentation_gpu, criterion, G_criterion, netG,
              netD, optimizer_g, optimizer_d, epoch, args, writer)

        if (epoch + 1) % 10 == 0 and (not args.multiprocessing_distributed or
                                      (args.multiprocessing_distributed
                                       and args.rank % ngpus_per_node == 0)):
            ckp_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(
                args.ckp_dir, args.batch_size, args.lr, args.crop_size,
                args.frame_per_clip)
            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': netG.state_dict(),
            },
                            ckp_dir + '/netG',
                            max_save=20,
                            is_best=False)

            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': netD.state_dict(),
            },
                            ckp_dir + '/netD',
                            max_save=20,
                            is_best=False)
Example #5
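    # Multi-task (NER / RE) training loop: runs epochs with optional validation, tracks the best
    # F1 score per task, saves current and best checkpoints, and early-stops on the criterion task.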
    def train_loop(self,
                   iterators,
                   optimizers,
                   run_dir,
                   task="joint",
                   epochs=100,
                   min_epochs=0,
                   patience=5,
                   epoch_start=0,
                   best_f1=None,
                   epochs_no_improv=None,
                   best_scores=None,
                   criterion="re",
                   mode="strict",
                   train_key="train",
                   dev_key="dev",
                   save_all_tasks=False,
                   gradient_accumulation=1,
                   tensorboard_summary=True,
                   **kwargs):

        # Validation or not
        if dev_key is not None:
            logging.info(
                "Starting train loop: {} epochs; {} min; {} patience".format(
                    epochs, min_epochs, patience))
        else:
            logging.info(
                "Starting train loop without validation for {} epochs".format(
                    epochs))
            patience = 0
            min_epochs = epochs

        #
        tasks = iterators.keys()
        if best_f1 is None:
            best_f1 = {t: 0 for t in tasks}

        if epochs_no_improv is None:
            epochs_no_improv = {t: 0 for t in tasks}

        if best_scores is None:
            best_scores = {t: 0 for t in tasks}

        # Check for early stopping already matched (when reloading a checkpoint)
        if patience and epoch_start > min_epochs and epochs_no_improv[
                criterion] >= patience:
            logging.info(
                "Early stopping after {} epochs without improvement.".format(
                    patience))
        else:
            writer = SummaryWriter(
                run_dir) if TENSORBOARD and tensorboard_summary else None
            # Training loop
            for epoch in range(epoch_start, epochs):
                logging.info("Epoch {}/{} :".format(epoch + 1, epochs))
                train_losses = self.run_epoch(
                    iterators,
                    epoch,
                    optimizers,
                    writer,
                    task=task,
                    train_key=train_key,
                    gradient_accumulation=gradient_accumulation)
                n_iter = (epoch + 1) * len(list(train_losses.values())[0])

                # Log train losses + evaluate on dev if not None
                if "ner" in tasks:
                    logging.info("Train NER Loss : {}".format(
                        np.mean(train_losses["ner"])))
                    if dev_key is not None:
                        ner_preds, _, ner_loss, ner_scores = self.evaluate_ner(
                            iterators["ner"][dev_key])
                        logging.info("Dev NER Loss : {}".format(ner_loss))

                if "re" in tasks:
                    logging.info("Train RE Loss : {}".format(
                        np.mean(train_losses["re"])))
                    if dev_key is not None:
                        re_preds, _, re_loss, re_scores = self.evaluate_re(
                            iterators["re"][dev_key], mode=mode)
                        logging.info("Dev RE Loss : {}".format(re_loss))

                # If validation : record current and best checkpoints + enable early stopping on dev score
                if dev_key is not None:
                    # save checkpoint and scores
                    scores = {}
                    f1 = {}
                    for t in tasks:
                        f1[t] = locals()["{}_scores".format(t)]["ALL"]["f1"]
                        scores[t] = locals()["{}_scores".format(t)]

                    for t in f1.keys():
                        if f1[t] > best_f1[t] or epoch == 0:
                            logging.info(
                                "New best {} F1 score on dev : {}".format(
                                    t, f1[t]))
                            if save_all_tasks or t == criterion:
                                logging.info("Saving model...")
                            best_f1[t] = f1[t]
                            epochs_no_improv[t] = 0
                            is_best = True

                        else:
                            epochs_no_improv[t] += 1
                            is_best = False

                        state = {
                            'epoch': epoch + 1,
                            'epochs_no_improv': epochs_no_improv,
                            'model': self.state_dict(),
                            'scores': scores,
                            'optimizers': {
                                k: optimizer.state_dict()
                                for k, optimizer in optimizers.items()
                            }
                        }

                        if save_all_tasks or t == criterion:
                            save_checkpoint(state,
                                            is_best,
                                            checkpoint=run_dir +
                                            '{}_checkpoint.pth.tar'.format(t),
                                            best=run_dir +
                                            '{}_best.pth.tar'.format(t))

                    if TENSORBOARD and tensorboard_summary:
                        if "ner" in iterators.keys():
                            writer.add_scalars("ner_loss", {"dev": ner_loss},
                                               n_iter)
                            add_score(writer, ner_scores, n_iter, task="ner")
                        if "re" in iterators.keys():
                            writer.add_scalars("re_loss", {"dev": re_loss},
                                               n_iter)
                            add_score(writer, re_scores, n_iter, task="re")

                    # early stopping
                    if patience and epoch > min_epochs and epochs_no_improv[
                            criterion] >= patience:
                        logging.info(
                            "Early stopping after {} epochs without improvement on {}."
                            .format(patience, criterion))
                        break

                # Else : record current checkpoint
                else:
                    state = {
                        'epoch': epoch + 1,
                        'epochs_no_improv': 0,
                        'model': self.state_dict(),
                        'optimizers': {
                            k: optimizer.state_dict()
                            for k, optimizer in optimizers.items()
                        }
                    }

                    save_checkpoint(state,
                                    is_best=epoch == epochs - 1,
                                    checkpoint=run_dir +
                                    '{}_checkpoint.pth.tar'.format(criterion),
                                    best=run_dir +
                                    '{}_best.pth.tar'.format(criterion))

            if TENSORBOARD and tensorboard_summary:
                writer.close()
Example #6
# Script-level resume-and-train loop: restores model and optimizer state from args.resume, saves an
# initial checkpoint, then trains with a manual learning-rate schedule and periodic checkpoints.
start_epoch = 1
if args.resume is not None:
    print('Resume training from %s' % args.resume)
    checkpoint = torch.load(args.resume)
    start_epoch = checkpoint['epoch'] + 1
    model.load_state_dict(checkpoint['model_state'])
    optimizer.load_state_dict(checkpoint['optimizer_state'])

columns = ['ep', 'lr', 'tr_loss', 'tr_acc', 'te_nll', 'te_acc', 'time']

train_utils.save_checkpoint(
    args.dir,
    start_epoch - 1,
    model_state=model.state_dict(),
    optimizer_state=optimizer.state_dict()
)

test_res = {'loss': None, 'accuracy': None, 'nll': None}
for epoch in range(start_epoch, args.epochs + 1):
    time_ep = time.time()

    lr = learning_rate_schedule(args.lr, epoch, args.epochs)
    train_utils.adjust_learning_rate(optimizer, lr)

    train_res = train_utils.train(loaders['train'], model, optimizer, criterion, regularizer, cuda=args.cuda)
    test_res = train_utils.test(loaders['test'], model, criterion, regularizer, cuda=args.cuda)

    if epoch % args.save_freq == 0:
        train_utils.save_checkpoint(
            args.dir,
            epoch,
            model_state=model.state_dict(),
            optimizer_state=optimizer.state_dict()
        )  # periodic checkpoint with the same arguments as the initial save above
Example #7
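# Panoptic-segmentation training entry point: sets up optional distributed training, supports
# test-only and val-only modes, trains and evaluates per epoch, and saves checkpoints whenever the
# before/after-merge IoU, PQ, or tracking loss improves.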
def PolarOffsetMain(args, cfg):
    if args.launcher is None:
        dist_train = False
    else:
        args.batch_size, cfg.LOCAL_RANK = getattr(
            common_utils, 'init_dist_%s' % args.launcher)(args.batch_size,
                                                          args.tcp_port,
                                                          args.local_rank,
                                                          backend='nccl')
        dist_train = True
    cfg['DIST_TRAIN'] = dist_train
    output_dir = os.path.join('./output', args.tag)
    ckpt_dir = os.path.join(output_dir, 'ckpt')
    tmp_dir = os.path.join(output_dir, 'tmp')
    summary_dir = os.path.join(output_dir, 'summary')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir, exist_ok=True)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir, exist_ok=True)
    if not os.path.exists(summary_dir):
        os.makedirs(summary_dir, exist_ok=True)

    if args.onlyval and args.saveval:
        results_dir = os.path.join(output_dir, 'test', 'sequences')
        if not os.path.exists(results_dir):
            os.makedirs(results_dir, exist_ok=True)
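        # sequence 08 is the SemanticKITTI validation split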
        for i in range(8, 9):
            sub_dir = os.path.join(results_dir, str(i).zfill(2), 'predictions')
            if not os.path.exists(sub_dir):
                os.makedirs(sub_dir, exist_ok=True)

    if args.onlytest:
        results_dir = os.path.join(output_dir, 'test', 'sequences')
        if not os.path.exists(results_dir):
            os.makedirs(results_dir, exist_ok=True)
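        # sequences 11-21 are the SemanticKITTI test split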
        for i in range(11, 22):
            sub_dir = os.path.join(results_dir, str(i).zfill(2), 'predictions')
            if not os.path.exists(sub_dir):
                os.makedirs(sub_dir, exist_ok=True)

    log_file = os.path.join(
        output_dir, ('log_train_%s.txt' %
                     datetime.datetime.now().strftime('%Y%m%d-%H%M%S')))
    logger = common_utils.create_logger(log_file, rank=cfg.LOCAL_RANK)

    logger.info('**********************Start logging**********************')
    gpu_list = os.environ.get('CUDA_VISIBLE_DEVICES', 'ALL')
    logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)

    if dist_train:
        total_gpus = dist.get_world_size()
        logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
    for key, val in vars(args).items():
        logger.info('{:16} {}'.format(key, val))
    log_config_to_file(cfg, logger=logger)
    if cfg.LOCAL_RANK == 0:
        os.system('cp %s %s' % (args.config, output_dir))

    ### create dataloader
    if (not args.onlytest) and (not args.onlyval):
        train_dataset_loader = build_dataloader(args,
                                                cfg,
                                                split='train',
                                                logger=logger)
        val_dataset_loader = build_dataloader(args,
                                              cfg,
                                              split='val',
                                              logger=logger,
                                              no_shuffle=True,
                                              no_aug=True)
    elif args.onlyval:
        val_dataset_loader = build_dataloader(args,
                                              cfg,
                                              split='val',
                                              logger=logger,
                                              no_shuffle=True,
                                              no_aug=True)
    else:
        test_dataset_loader = build_dataloader(args,
                                               cfg,
                                               split='test',
                                               logger=logger,
                                               no_shuffle=True,
                                               no_aug=True)

    ### create model
    model = build_network(cfg)
    model.cuda()

    ### create optimizer
    optimizer = train_utils.build_optimizer(model, cfg)

    ### load ckpt
    ckpt_fname = os.path.join(ckpt_dir, args.ckpt_name)
    epoch = -1

    other_state = {}
    if args.pretrained_ckpt is not None and os.path.exists(ckpt_fname):
        logger.info(
            "Now in pretrain mode and loading ckpt: {}".format(ckpt_fname))
        if not args.nofix:
            if args.fix_semantic_instance:
                logger.info(
                    "Freezing backbone, semantic and instance part of the model."
                )
                model.fix_semantic_instance_parameters()
            else:
                logger.info(
                    "Freezing semantic and backbone part of the model.")
                model.fix_semantic_parameters()
        optimizer = train_utils.build_optimizer(model, cfg)
        epoch, other_state = train_utils.load_params_with_optimizer_otherstate(
            model,
            ckpt_fname,
            to_cpu=dist_train,
            optimizer=optimizer,
            logger=logger)  # new feature
        logger.info("Loaded Epoch: {}".format(epoch))
    elif args.pretrained_ckpt is not None:
        train_utils.load_pretrained_model(model,
                                          args.pretrained_ckpt,
                                          to_cpu=dist_train,
                                          logger=logger)
        if not args.nofix:
            if args.fix_semantic_instance:
                logger.info(
                    "Freezing backbone, semantic and instance part of the model."
                )
                model.fix_semantic_instance_parameters()
            else:
                logger.info(
                    "Freezing semantic and backbone part of the model.")
                model.fix_semantic_parameters()
        else:
            logger.info("No Freeze.")
        optimizer = train_utils.build_optimizer(model, cfg)
    elif os.path.exists(ckpt_fname):
        epoch, other_state = train_utils.load_params_with_optimizer_otherstate(
            model,
            ckpt_fname,
            to_cpu=dist_train,
            optimizer=optimizer,
            logger=logger)  # new feature
        logger.info("Loaded Epoch: {}".format(epoch))
    if other_state is None:
        other_state = {}

    ### create optimizer and scheduler
    lr_scheduler = None
    if lr_scheduler is None:
        logger.info('Not using lr scheduler')

    model.train()  # set train mode before wrapping in DistributedDataParallel so that partially frozen parameters are supported
    if dist_train:
        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[cfg.LOCAL_RANK % torch.cuda.device_count()],
            find_unused_parameters=True)
    logger.info(model)

    if cfg.LOCAL_RANK == 0:
        writer = SummaryWriter(log_dir=summary_dir)

    logger.info('**********************Start Training**********************')
    rank = cfg.LOCAL_RANK
    best_before_iou = other_state.get('best_before_iou', -1)
    best_pq = other_state.get('best_pq', -1)
    best_after_iou = other_state.get('best_after_iou', -1)
    global_iter = other_state.get('global_iter', 0)
    val_global_iter = other_state.get('val_global_iter', 0)
    best_tracking_loss = other_state.get('best_tracking_loss', 10086)

    ### test
    if args.onlytest:
        logger.info('----EPOCH {} Testing----'.format(epoch))
        model.eval()
        if rank == 0:
            vbar = tqdm(total=len(test_dataset_loader), dynamic_ncols=True)
        for i_iter, inputs in enumerate(test_dataset_loader):
            with torch.no_grad():
                if cfg.MODEL.NAME.startswith(
                        'PolarOffsetSpconvPytorchMeanshiftTracking'
                ) or cfg.MODEL.NAME.startswith('PolarOffsetSpconvTracking'):
                    ret_dict = model(inputs,
                                     is_test=True,
                                     merge_evaluator_list=None,
                                     merge_evaluator_window_k_list=None,
                                     require_cluster=True)
                else:
                    ret_dict = model(inputs,
                                     is_test=True,
                                     require_cluster=True,
                                     require_merge=True)
                common_utils.save_test_results(ret_dict, results_dir, inputs)
            if rank == 0:
                vbar.set_postfix({
                    'fname':
                    '/'.join(inputs['pcd_fname'][0].split('/')[-3:])
                })
                vbar.update(1)
        if rank == 0:
            vbar.close()
        logger.info("----Testing Finished----")
        return

    ### evaluate
    if args.onlyval:
        logger.info('----EPOCH {} Evaluating----'.format(epoch))
        model.eval()
        min_points = 50  # according to SemanticKITTI official rule
        if cfg.MODEL.NAME.startswith(
                'PolarOffsetSpconvPytorchMeanshiftTracking'
        ) or cfg.MODEL.NAME.startswith('PolarOffsetSpconvTracking'):
            merge_evaluator_list = []
            merge_evaluator_window_k_list = []
            for k in [1, 5, 10, 15]:
                merge_evaluator_list.append(init_eval(min_points))
                merge_evaluator_window_k_list.append(k)
        else:
            before_merge_evaluator = init_eval(min_points=min_points)
            after_merge_evaluator = init_eval(min_points=min_points)
        if rank == 0:
            vbar = tqdm(total=len(val_dataset_loader), dynamic_ncols=True)
        for i_iter, inputs in enumerate(val_dataset_loader):
            inputs['i_iter'] = i_iter
            # torch.cuda.empty_cache()
            with torch.no_grad():
                if cfg.MODEL.NAME.startswith(
                        'PolarOffsetSpconvPytorchMeanshiftTracking'
                ) or cfg.MODEL.NAME.startswith('PolarOffsetSpconvTracking'):
                    ret_dict = model(inputs,
                                     is_test=True,
                                     merge_evaluator_list=merge_evaluator_list,
                                     merge_evaluator_window_k_list=
                                     merge_evaluator_window_k_list,
                                     require_cluster=True)
                else:
                    ret_dict = model(
                        inputs,
                        is_test=True,
                        before_merge_evaluator=before_merge_evaluator,
                        after_merge_evaluator=after_merge_evaluator,
                        require_cluster=True)
                #########################
                # with open('./ipnb/{}_matching_list.pkl'.format(i_iter), 'wb') as fd:
                #     pickle.dump(ret_dict['matching_list'], fd)
                #########################
                if args.saveval:
                    common_utils.save_test_results(ret_dict, results_dir,
                                                   inputs)
            if rank == 0:
                vbar.set_postfix({
                    'loss':
                    ret_dict['loss'].item(),
                    'fname':
                    '/'.join(inputs['pcd_fname'][0].split('/')[-3:]),
                    'ins_num':
                    -1 if 'ins_num' not in ret_dict else ret_dict['ins_num']
                })
                vbar.update(1)
        if dist_train:
            if cfg.MODEL.NAME.startswith(
                    'PolarOffsetSpconvPytorchMeanshiftTracking'
            ) or cfg.MODEL.NAME.startswith('PolarOffsetSpconvTracking'):
                pass
            else:
                before_merge_evaluator = common_utils.merge_evaluator(
                    before_merge_evaluator, tmp_dir)
                dist.barrier()
                after_merge_evaluator = common_utils.merge_evaluator(
                    after_merge_evaluator, tmp_dir)

        if rank == 0:
            vbar.close()
        if rank == 0:
            ## print results
            if cfg.MODEL.NAME.startswith(
                    'PolarOffsetSpconvPytorchMeanshiftTracking'
            ) or cfg.MODEL.NAME.startswith('PolarOffsetSpconvTracking'):
                for evaluate, window_k in zip(merge_evaluator_list,
                                              merge_evaluator_window_k_list):
                    logger.info("Current Window K: {}".format(window_k))
                    printResults(evaluate, logger=logger)
            else:
                logger.info("Before Merge Semantic Scores")
                before_merge_results = printResults(before_merge_evaluator,
                                                    logger=logger,
                                                    sem_only=True)
                logger.info("After Merge Panoptic Scores")
                after_merge_results = printResults(after_merge_evaluator,
                                                   logger=logger)

        logger.info("----Evaluating Finished----")
        return

    ### train
    while True:
        epoch += 1
        if 'MAX_EPOCH' in cfg.OPTIMIZE.keys():
            if epoch > cfg.OPTIMIZE.MAX_EPOCH:
                break

        ### train one epoch
        logger.info('----EPOCH {} Training----'.format(epoch))
        loss_acc = 0
        if rank == 0:
            pbar = tqdm(total=len(train_dataset_loader), dynamic_ncols=True)
        for i_iter, inputs in enumerate(train_dataset_loader):
            # torch.cuda.empty_cache()
            torch.autograd.set_detect_anomaly(True)
            model.train()
            optimizer.zero_grad()
            inputs['i_iter'] = i_iter
            inputs['rank'] = rank
            ret_dict = model(inputs)

            if args.pretrained_ckpt is not None and not args.fix_semantic_instance:  # training offset
                if args.nofix:
                    loss = ret_dict['loss']
                elif len(ret_dict['offset_loss_list']) > 0:
                    loss = sum(ret_dict['offset_loss_list'])
                else:
                    loss = torch.tensor(0.0, requires_grad=True)  #mock pbar
                    ret_dict['offset_loss_list'] = [loss]  #mock writer
            elif args.pretrained_ckpt is not None and args.fix_semantic_instance and cfg.MODEL.NAME == 'PolarOffsetSpconvPytorchMeanshift':  # training dynamic shifting
                loss = sum(ret_dict['meanshift_loss'])
            elif cfg.MODEL.NAME.startswith(
                    'PolarOffsetSpconvPytorchMeanshiftTracking'
            ) or cfg.MODEL.NAME.startswith('PolarOffsetSpconvTracking'):
                loss = sum(ret_dict['tracking_loss'])
                #########################
                # with open('./ipnb/{}_matching_list.pkl'.format(i_iter), 'wb') as fd:
                #     pickle.dump(ret_dict['matching_list'], fd)
                #########################
            else:
                loss = ret_dict['loss']
            loss.backward()
            optimizer.step()

            if rank == 0:
                try:
                    cur_lr = float(optimizer.lr)
                except:
                    cur_lr = optimizer.param_groups[0]['lr']
                loss_acc += loss.item()
                pbar.set_postfix({
                    'loss': loss.item(),
                    'lr': cur_lr,
                    'mean_loss': loss_acc / float(i_iter + 1)
                })
                pbar.update(1)
                writer.add_scalar('Train/01_Loss', ret_dict['loss'].item(),
                                  global_iter)
                writer.add_scalar('Train/02_SemLoss',
                                  ret_dict['sem_loss'].item(), global_iter)
                if 'offset_loss_list' in ret_dict and sum(
                        ret_dict['offset_loss_list']).item() > 0:
                    writer.add_scalar('Train/03_InsLoss',
                                      sum(ret_dict['offset_loss_list']).item(),
                                      global_iter)
                writer.add_scalar('Train/04_LR', cur_lr, global_iter)
                writer_acc = 5
                if 'meanshift_loss' in ret_dict:
                    writer.add_scalar('Train/05_DSLoss',
                                      sum(ret_dict['meanshift_loss']).item(),
                                      global_iter)
                    writer_acc += 1
                if 'tracking_loss' in ret_dict:
                    writer.add_scalar('Train/06_TRLoss',
                                      sum(ret_dict['tracking_loss']).item(),
                                      global_iter)
                    writer_acc += 1
                more_keys = []
                for k, _ in ret_dict.items():
                    if k.find('summary') != -1:
                        more_keys.append(k)
                for ki, k in enumerate(more_keys):
                    if k == 'bandwidth_weight_summary':
                        continue
                    ki += writer_acc
                    writer.add_scalar(
                        'Train/{}_{}'.format(str(ki).zfill(2), k), ret_dict[k],
                        global_iter)
                global_iter += 1
        if rank == 0:
            pbar.close()

        ### evaluate after each epoch
        logger.info('----EPOCH {} Evaluating----'.format(epoch))
        model.eval()
        min_points = 50
        before_merge_evaluator = init_eval(min_points=min_points)
        after_merge_evaluator = init_eval(min_points=min_points)
        tracking_loss = 0
        if rank == 0:
            vbar = tqdm(total=len(val_dataset_loader), dynamic_ncols=True)
        for i_iter, inputs in enumerate(val_dataset_loader):
            # torch.cuda.empty_cache()
            inputs['i_iter'] = i_iter
            inputs['rank'] = rank
            with torch.no_grad():
                if cfg.MODEL.NAME.startswith(
                        'PolarOffsetSpconvPytorchMeanshiftTracking'
                ) or cfg.MODEL.NAME.startswith('PolarOffsetSpconvTracking'):
                    ret_dict = model(inputs,
                                     is_test=True,
                                     merge_evaluator_list=None,
                                     merge_evaluator_window_k_list=None,
                                     require_cluster=True)
                else:
                    ret_dict = model(
                        inputs,
                        is_test=True,
                        before_merge_evaluator=before_merge_evaluator,
                        after_merge_evaluator=after_merge_evaluator,
                        require_cluster=True)
            if rank == 0:
                vbar.set_postfix({'loss': ret_dict['loss'].item()})
                vbar.update(1)
                writer.add_scalar('Val/01_Loss', ret_dict['loss'].item(),
                                  val_global_iter)
                writer.add_scalar('Val/02_SemLoss',
                                  ret_dict['sem_loss'].item(), val_global_iter)
                if 'offset_loss_list' in ret_dict and sum(
                        ret_dict['offset_loss_list']).item() > 0:
                    writer.add_scalar('Val/03_InsLoss',
                                      sum(ret_dict['offset_loss_list']).item(),
                                      val_global_iter)
                if 'tracking_loss' in ret_dict:
                    writer.add_scalar('Val/06_TRLoss',
                                      sum(ret_dict['tracking_loss']).item(),
                                      global_iter)
                    tracking_loss += sum(ret_dict['tracking_loss']).item()
                more_keys = []
                for k, _ in ret_dict.items():
                    if k.find('summary') != -1:
                        more_keys.append(k)
                for ki, k in enumerate(more_keys):
                    if k == 'bandwidth_weight_summary':
                        continue
                    ki += 4
                    writer.add_scalar('Val/{}_{}'.format(str(ki).zfill(2), k),
                                      ret_dict[k], val_global_iter)
                val_global_iter += 1
        tracking_loss /= len(val_dataset_loader)
        if dist_train:
            try:
                before_merge_evaluator = common_utils.merge_evaluator(
                    before_merge_evaluator, tmp_dir, prefix='before_')
                dist.barrier()
                after_merge_evaluator = common_utils.merge_evaluator(
                    after_merge_evaluator, tmp_dir, prefix='after_')
            except:
                print("Something went wrong when merging evaluator in rank {}".
                      format(rank))
        if rank == 0:
            vbar.close()
        if rank == 0:
            ## print results
            logger.info("Before Merge Semantic Scores")
            before_merge_results = printResults(before_merge_evaluator,
                                                logger=logger,
                                                sem_only=True)
            logger.info("After Merge Panoptic Scores")
            after_merge_results = printResults(after_merge_evaluator,
                                               logger=logger)
            ## save ckpt
            other_state = {
                'best_before_iou': best_before_iou,
                'best_pq': best_pq,
                'best_after_iou': best_after_iou,
                'global_iter': global_iter,
                'val_global_iter': val_global_iter,
                'best_tracking_loss': best_tracking_loss,
            }
            saved_flag = False
            # parentheses group the model-name checks so the loss comparison applies to both tracking variants
            if best_tracking_loss > tracking_loss and (
                    cfg.MODEL.NAME.startswith('PolarOffsetSpconvPytorchMeanshiftTracking')
                    or cfg.MODEL.NAME.startswith('PolarOffsetSpconvTracking')):
                best_tracking_loss = tracking_loss
                if not saved_flag:
                    states = train_utils.checkpoint_state(
                        model, optimizer, epoch, other_state)
                    train_utils.save_checkpoint(
                        states,
                        os.path.join(
                            ckpt_dir, 'checkpoint_epoch_{}_{}.pth'.format(
                                epoch,
                                str(tracking_loss)[:5])))
                    saved_flag = True
            if best_before_iou < before_merge_results['iou_mean']:
                best_before_iou = before_merge_results['iou_mean']
                if not saved_flag:
                    states = train_utils.checkpoint_state(
                        model, optimizer, epoch, other_state)
                    train_utils.save_checkpoint(
                        states,
                        os.path.join(
                            ckpt_dir,
                            'checkpoint_epoch_{}_{}_{}_{}.pth'.format(
                                epoch,
                                str(best_before_iou)[:5],
                                str(best_pq)[:5],
                                str(best_after_iou)[:5])))
                    saved_flag = True
            if best_pq < after_merge_results['pq_mean']:
                best_pq = after_merge_results['pq_mean']
                if not saved_flag:
                    states = train_utils.checkpoint_state(
                        model, optimizer, epoch, other_state)
                    train_utils.save_checkpoint(
                        states,
                        os.path.join(
                            ckpt_dir,
                            'checkpoint_epoch_{}_{}_{}_{}.pth'.format(
                                epoch,
                                str(best_before_iou)[:5],
                                str(best_pq)[:5],
                                str(best_after_iou)[:5])))
                    saved_flag = True
            if best_after_iou < after_merge_results['iou_mean']:
                best_after_iou = after_merge_results['iou_mean']
                if not saved_flag:
                    states = train_utils.checkpoint_state(
                        model, optimizer, epoch, other_state)
                    train_utils.save_checkpoint(
                        states,
                        os.path.join(
                            ckpt_dir,
                            'checkpoint_epoch_{}_{}_{}_{}.pth'.format(
                                epoch,
                                str(best_before_iou)[:5],
                                str(best_pq)[:5],
                                str(best_after_iou)[:5])))
                    saved_flag = True
            logger.info("Current best before IoU: {}".format(best_before_iou))
            logger.info("Current best after IoU: {}".format(best_after_iou))
            logger.info("Current best after PQ: {}".format(best_pq))
            logger.info(
                "Current best tracking loss: {}".format(best_tracking_loss))
        if lr_scheduler is not None:
            lr_scheduler.step(epoch)  # new feature
Example #8
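# Similar distributed training worker: loads or resumes weights via model.load_weights, trains per
# epoch, evaluates precision/recall/AP/F1 on the validation set, and checkpoints on the master node
# every checkpoint_freq epochs.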
def main_worker(gpu_idx, configs):
    configs.gpu_idx = gpu_idx

    if configs.gpu_idx is not None:
        print("Use GPU: {} for training".format(configs.gpu_idx))
        configs.device = torch.device('cuda:{}'.format(configs.gpu_idx))

    if configs.distributed:
        if configs.dist_url == "env://" and configs.rank == -1:
            configs.rank = int(os.environ["RANK"])
        if configs.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            configs.rank = configs.rank * configs.ngpus_per_node + gpu_idx

        dist.init_process_group(backend=configs.dist_backend,
                                init_method=configs.dist_url,
                                world_size=configs.world_size,
                                rank=configs.rank)

    configs.is_master_node = (not configs.distributed) or (
        configs.distributed and (configs.rank % configs.ngpus_per_node == 0))

    if configs.is_master_node:
        logger = Logger(configs.logs_dir, configs.saved_fn)
        logger.info('>>> Created a new logger')
        logger.info('>>> configs: {}'.format(configs))
        tb_writer = SummaryWriter(
            log_dir=os.path.join(configs.logs_dir, 'tensorboard'))
    else:
        logger = None
        tb_writer = None

    # model
    model = create_model(configs)

    # load weight from a checkpoint
    if configs.pretrained_path is not None:
        assert os.path.isfile(
            configs.pretrained_path), "=> no checkpoint found at '{}'".format(
                configs.pretrained_path)
        model.load_weights(weightfile=configs.pretrained_path)
        if logger is not None:
            logger.info('loaded pretrained model at {}'.format(
                configs.pretrained_path))

    # resume weights of model from a checkpoint
    if configs.resume_path is not None:
        assert os.path.isfile(
            configs.resume_path), "=> no checkpoint found at '{}'".format(
                configs.resume_path)
        model.load_weights(weightfile=configs.resume_path)
        if logger is not None:
            logger.info('resume training model from checkpoint {}'.format(
                configs.resume_path))

    # Data Parallel
    model = make_data_parallel(model, configs)

    # Make sure to create optimizer after moving the model to cuda
    optimizer = create_optimizer(configs, model)
    lr_scheduler = create_lr_scheduler(optimizer, configs)

    # resume optimizer, lr_scheduler from a checkpoint
    if configs.resume_path is not None:
        utils_path = configs.resume_path.replace('Model_', 'Utils_')
        assert os.path.isfile(
            utils_path), "=> no checkpoint found at '{}'".format(utils_path)
        utils_state_dict = torch.load(utils_path,
                                      map_location='cuda:{}'.format(
                                          configs.gpu_idx))
        optimizer.load_state_dict(utils_state_dict['optimizer'])
        lr_scheduler.load_state_dict(utils_state_dict['lr_scheduler'])
        configs.start_epoch = utils_state_dict['epoch'] + 1

    if configs.is_master_node:
        num_parameters = get_num_parameters(model)
        logger.info('number of trained parameters of the model: {}'.format(
            num_parameters))

    if logger is not None:
        logger.info(">>> Loading dataset & getting dataloader...")
    # Create dataloader
    train_loader, val_loader, train_sampler = create_train_val_dataloader(
        configs)
    if logger is not None:
        logger.info('number of batches in train set: {}'.format(
            len(train_loader)))
        if val_loader is not None:
            logger.info('number of batches in val set: {}'.format(
                len(val_loader)))

    if configs.evaluate:
        assert val_loader is not None, "The validation should not be None"
        eval_metrics = evaluate_one_epoch(val_loader, model,
                                          configs.start_epoch - 1, configs,
                                          logger)
        precision, recall, AP, f1, ap_class = eval_metrics
        print(
            'Evaluate - precision: {}, recall: {}, AP: {}, f1: {}, ap_class: {}'
            .format(precision, recall, AP, f1, ap_class))
        return

    for epoch in range(configs.start_epoch, configs.num_epochs + 1):
        if logger is not None:
            logger.info('{}'.format('*-' * 40))
            logger.info('{} {}/{} {}'.format('=' * 35, epoch,
                                             configs.num_epochs, '=' * 35))
            logger.info('{}'.format('*-' * 40))
            logger.info('>>> Epoch: [{}/{}]'.format(epoch, configs.num_epochs))

        if configs.distributed:
            train_sampler.set_epoch(epoch)
        # train for one epoch
        train_one_epoch(train_loader, model, optimizer, lr_scheduler, epoch,
                        configs, logger, tb_writer)
        if not configs.no_val:
            precision, recall, AP, f1, ap_class = evaluate_one_epoch(
                val_loader, model, epoch, configs, logger)
            val_metrics_dict = {
                'precision': precision,
                'recall': recall,
                'AP': AP,
                'f1': f1,
                'ap_class': ap_class
            }
            if tb_writer is not None:
                tb_writer.add_scalars('Validation', val_metrics_dict, epoch)

        # Save checkpoint
        if configs.is_master_node and ((epoch % configs.checkpoint_freq) == 0):
            model_state_dict, utils_state_dict = get_saved_state(
                model, optimizer, lr_scheduler, epoch, configs)
            save_checkpoint(configs.checkpoints_dir, configs.saved_fn,
                            model_state_dict, utils_state_dict, epoch)

    if tb_writer is not None:
        tb_writer.close()
    if configs.distributed:
        cleanup()
Example #9
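# Few-shot RelationNetwork training: separate Adam optimizers and MultiplicativeLR schedulers for
# the embedding and relation modules, periodic validation, and best-accuracy checkpointing.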
def main():
    global args, best_acc1, device

    # Init seed
    np.random.seed(args.manual_seed)
    torch.manual_seed(args.manual_seed)
    torch.cuda.manual_seed(args.manual_seed)

    if args.dataset == 'miniImageNet':
        train_loader, val_loader = get_dataloader(args, 'train', 'val')
        in_channel = 3
        feature_dim = 64 * 3 * 3
    elif args.dataset == 'omniglot':
        train_loader, val_loader = get_dataloader(args, 'trainval', 'test')
        in_channel = 1
        feature_dim = 64
    else:
        raise ValueError(f"Dataset {args.dataset} is not supported")

    embedding = Embedding(in_channel).to(device)
    model = RelationNetwork(feature_dim).to(device)

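    # the relation module regresses similarity scores, so MSE is used instead of cross-entropy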
    criterion = torch.nn.MSELoss()

    embed_optimizer = torch.optim.Adam(embedding.parameters(), args.lr)
    model_optimizer = torch.optim.Adam(model.parameters(), args.lr)

    cudnn.benchmark = True

    if args.resume:
        try:
            checkpoint = torch.load(
                sorted(glob(f'{args.log_dir}/checkpoint_*.pth'), key=len)[-1])
        except Exception:
            checkpoint = torch.load(args.log_dir + '/model_best.pth')
        model.load_state_dict(checkpoint['model_state_dict'])
        embedding.load_state_dict(checkpoint['embedding_state_dict'])
        model_optimizer.load_state_dict(
            checkpoint['model_optimizer_state_dict'])
        embed_optimizer.load_state_dict(
            checkpoint['embed_optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        best_acc1 = checkpoint['best_acc1']

        print(f"load checkpoint {args.exp_name}")
    else:
        start_epoch = 1

    embed_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(
        optimizer=embed_optimizer, lr_lambda=lambda epoch: 0.5)
    model_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(
        optimizer=model_optimizer, lr_lambda=lambda epoch: 0.5)

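    # fast-forward the LR schedulers to the resumed epoch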
    for _ in range(start_epoch):
        embed_scheduler.step()
        model_scheduler.step()

    print(
        f"model parameter : {sum(p.numel() for p in model.parameters() if p.requires_grad)}"
    )

    for epoch in range(start_epoch, args.epochs + 1):

        train_loss = train(train_loader, model, embedding, model_optimizer,
                           embed_optimizer, criterion, epoch)

        is_test = epoch % args.test_iter == 0
        if is_test or epoch == args.epochs or epoch == 1:

            val_loss, acc1 = validate(val_loader, model, embedding, criterion,
                                      epoch)

            if acc1 >= best_acc1:
                is_best = True
                best_acc1 = acc1
            else:
                is_best = False

            save_checkpoint(
                {
                    'model_state_dict': model.state_dict(),
                    'embedding_state_dict': embedding.state_dict(),
                    'model_optimizer_state_dict': model_optimizer.state_dict(),
                    'embed_optimizer_state_dict': embed_optimizer.state_dict(),
                    'best_acc1': best_acc1,
                    'epoch': epoch,
                }, is_best, args)

            if is_best:
                writer.add_scalar("BestAcc", acc1, epoch)

            print(
                f"[{epoch}/{args.epochs}] {train_loss:.3f}, {val_loss:.3f}, {acc1:.3f}, # {best_acc1:.3f}"
            )

        else:
            print(f"[{epoch}/{args.epochs}] {train_loss:.3f}")

        embed_scheduler.step()
        model_scheduler.step()

    writer.close()
Example #10
0
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

    print('{batch}/{size} | Loss:{loss:.4f} | top1:{tp1:.4f} | AUROC:{ac:.4f}'.format(
        batch=batch_idx + 1, size=len(val_loader), loss=losses.avg, tp1=top1.avg, ac=arc.avg))
    return (losses.avg, top1.avg, arc.avg)


for epoch in range(opt.start_epoch, opt.epochs):
    opt.lr = optimizer.state_dict()['param_groups'][0]['lr']
    adjust_learning_rate(optimizer, epoch, opt)
    print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, opt.epochs, opt.lr))
    
    train_loss, train_acc, train_auroc = train(opt, train_loader, model, criterion, optimizer, epoch, use_cuda)
    test_loss, test_acc, test_auroc = test(opt, val_loader, model, criterion, epoch, use_cuda)
    
    logger.append([opt.lr, train_loss, test_loss, train_acc, test_acc, train_auroc, test_auroc])
    scheduler_warmup.step()

    is_best = test_acc > best_acc
    best_acc = max(test_acc, best_acc)
    save_checkpoint({
        'epoch': epoch + 1,
        'state_dict' : model.state_dict(),
        'acc': test_acc,
        'best_acc': best_acc,
        'optimizer': optimizer.state_dict(),
    }, is_best, checkpoint=opt.checkpoint)
Example #11
0
def main():
    global args, best_iou, iterations
    args = parser.parse_args()

    if args.tensorboard:
        from tensorboard_logger import configure
        print("Using tensorboard")
        configure("%s" % (args.dir))

    offset_list = generate_offsets(args.num_offsets)

    # model configurations
    num_classes = args.num_classes
    num_offsets = args.num_offsets

    # model
    model = get_model(num_classes, num_offsets, args.arch, args.pretrain)
    model = model.cuda()

    # dataset
    trainset = COCODataset(args.train_img,
                           args.train_ann,
                           num_classes,
                           offset_list,
                           scale=args.scale,
                           size=(args.train_image_size, args.train_image_size),
                           limits=args.limits,
                           crop=args.crop)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              num_workers=4,
                                              batch_size=args.batch_size,
                                              shuffle=True)
    valset = COCODataset(args.val_img,
                         args.val_ann,
                         num_classes,
                         offset_list,
                         scale=args.scale,
                         limits=args.limits)
    valloader = torch.utils.data.DataLoader(valset,
                                            num_workers=4,
                                            batch_size=4)
    num_train = len(trainset)
    num_val = len(valset)
    print('Training samples: {0} \n'
          'Validation samples: {1}'.format(num_train, num_val))

    # define optimizer
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                nesterov=args.nesterov,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_iou = checkpoint['best_iou']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            offset_list = checkpoint['offset']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            raise ValueError("=> no checkpoint found at '{}'".format(
                args.resume))
    print("offsets are: {}".format(offset_list))

    # define loss functions
    if args.loss == 'bce':
        print('Using Binary Cross Entropy Loss')
        criterion_cls = torch.nn.BCEWithLogitsLoss().cuda()
    elif args.loss == 'mbce':
        print('Using Weighted Multiclass BCE Loss')
        criterion_cls = MultiBCEWithLogitsLoss().cuda()
    elif args.loss == 'dice':
        print('Using Soft Dice Loss')
        criterion_cls = SoftDiceLoss().cuda()
    else:
        print('Using Cross Entropy Loss')
        criterion_cls = CrossEntropyLossOneHot().cuda()

    criterion_ofs = torch.nn.BCEWithLogitsLoss().cuda()

    # define learning rate scheduler
    if not args.milestones:
        milestones = [args.epochs]
    else:
        milestones = args.milestones
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=milestones,
                                         gamma=0.1,
                                         last_epoch=args.start_epoch - 1)

    # start iteration count
    iterations = args.start_epoch * int(len(trainset) / args.batch_size)

    # define score metrics
    score_metrics_train = runningScore(num_classes, trainset.catNms)
    score_metrics = runningScore(num_classes, valset.catNms)
    offset_metrics_train = offsetIoU(offset_list)
    offset_metrics_val = offsetIoU(offset_list)

    # train
    for epoch in range(args.start_epoch, args.epochs):
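        # stepping the scheduler at the start of the epoch follows the pre-1.1.0 PyTorch convention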
        scheduler.step()
        iterations = train(trainloader,
                           model,
                           criterion_cls,
                           criterion_ofs,
                           optimizer,
                           num_classes,
                           args.batch_size,
                           epoch,
                           iterations,
                           print_freq=args.print_freq,
                           log_freq=args.log_freq,
                           tensorboard=args.tensorboard,
                           score_metrics=score_metrics_train,
                           offset_metrics=offset_metrics_train,
                           alpha=args.alpha)
        val_iou = validate(valloader,
                           model,
                           criterion_cls,
                           criterion_ofs,
                           num_classes,
                           args.batch_size,
                           epoch,
                           iterations,
                           print_freq=args.print_freq,
                           log_freq=args.log_freq,
                           tensorboard=args.tensorboard,
                           score_metrics=score_metrics,
                           offset_metrics=offset_metrics_val,
                           alpha=args.alpha)
        # visualize some example outputs after each epoch
        if args.visualize:
            outdir = '{}/imgs/{}'.format(args.dir, epoch + 1)
            if not os.path.exists(outdir):
                os.makedirs(outdir)
            sample(num_classes, num_offsets, model, valloader, outdir)

        is_best = val_iou > best_iou
        best_iou = max(val_iou, best_iou)
        save_checkpoint(
            args.dir, {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_iou': best_iou,
                'optimizer': optimizer.state_dict(),
                'offset': offset_list,
            }, is_best)
    print('Best validation mean iou: ', best_iou)
Example #12
0
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=============> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()
    print(model)
    # freeze all layers but the last fc
    #     for name, param in model.named_parameters():
    #         if name not in ['fc.weight', 'fc.bias']:
    #             param.requires_grad = False
    # init the fc layer
    model.fc = nn.Linear(512, args.num_class, bias=True)
    model.fc.weight.data.normal_(mean=0.0, std=0.01)
    model.fc.bias.data.zero_()

    # load from pre-trained, before DistributedDataParallel constructor
    if args.pretrained:
        if os.path.isfile(args.pretrained):
            print("=> loading checkpoint '{}'".format(args.pretrained))
            checkpoint = torch.load(args.pretrained, map_location="cpu")

            # rename moco pre-trained keys
            state_dict = checkpoint['state_dict']
            for k in list(state_dict.keys()):
                # retain only encoder_q up to before the embedding layer
                if k.startswith('module.encoder_q'
                                ) and not k.startswith('module.encoder_q.fc'):
                    # remove prefix
                    state_dict[k[len("module.encoder_q."):]] = state_dict[k]
                # delete renamed or unused k
                del state_dict[k]

            args.start_epoch = 0
            msg = model.load_state_dict(state_dict, strict=False)
            assert set(msg.missing_keys) == {"fc.weight", "fc.bias"}

            print("=> loaded pre-trained model '{}'".format(args.pretrained))
        else:
            print("=> no checkpoint found at '{}'".format(args.pretrained))

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model)  #.cuda() for debug on cpu
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # optimize only the linear classifier
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    # assert len(parameters) == 2  # fc.weight, fc.bias
    optimizer = torch.optim.SGD(parameters,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    normalize_video = transforms_video.NormalizeVideo(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    video_augmentation_train = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomResizedCropVideo(args.crop_size),
        transforms_video.RandomHorizontalFlipVideo(),
        normalize_video,
    ])
    video_augmentation_val = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.CenterCropVideo(args.crop_size),
        normalize_video,
    ])
    data_dir = os.path.join(args.data, 'data')
    anno_dir = os.path.join(args.data, 'anno')
    audio_augmentation = moco.loader.DummyAudioTransform()
    train_augmentation = {
        'video': video_augmentation_train,
        'audio': audio_augmentation
    }
    val_augmentation = {
        'video': video_augmentation_val,
        'audio': audio_augmentation
    }

    train_dataset = UCF101(data_dir,
                           anno_dir,
                           args.frame_per_clip,
                           args.step_between_clips,
                           fold=1,
                           train=True,
                           transform=train_augmentation,
                           num_workers=16)
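    # randomly sample at most 10 clips per video in each training epoch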
    train_sampler = RandomClipSampler(train_dataset.video_clips, 10)
    if args.distributed:
        train_sampler = DistributedSampler(train_sampler)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               multiprocessing_context="fork")

    val_dataset = UCF101(data_dir,
                         anno_dir,
                         args.frame_per_clip,
                         args.step_between_clips,
                         fold=1,
                         train=False,
                         transform=val_augmentation,
                         num_workers=16)
    # Do not use DistributedSampler since it will destroy the testing iteration process
    val_sampler = UniformClipSampler(val_dataset.video_clips,
                                     args.clip_per_video)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.clip_per_video,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             sampler=val_sampler,
                                             multiprocessing_context="fork")

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return
    if args.multiprocessing_distributed and args.gpu == 0:
        log_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(args.log_dir,
                                                       args.batch_size,
                                                       args.lr, args.crop_size,
                                                       args.frame_per_clip)
        writer = SummaryWriter(log_dir)
    else:
        writer = None
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, writer)

        # evaluate on validation set
        val_loss, acc1, acc5 = validate(val_loader, model, criterion, args)
        if writer is not None:
            writer.add_scalar('lincls_val/loss', val_loss, epoch)
            writer.add_scalar('lincls_val/acc1', acc1, epoch)
            writer.add_scalar('lincls_val/acc5', acc5, epoch)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            ckp_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(
                args.ckp_dir, args.batch_size, args.lr, args.crop_size,
                args.frame_per_clip)
            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            },
                            ckp_dir,
                            max_save=1,
                            is_best=is_best)
Example #13
0
    print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, opt.epochs, opt.lr))

    train_loss, train_auroc = train(opt, train_loader, teacher_model,
                                    student_model, criterion, optimizer, epoch,
                                    use_cuda)
    test_loss, test_auroc = test(opt, val_target_loader, student_model,
                                 criterion, epoch, use_cuda)
    source_loss, source_auroc = test(opt, val_source_loader, student_model,
                                     criterion, epoch, use_cuda)

    logger.append([
        opt.lr, train_loss, test_loss, source_loss, train_auroc, test_auroc,
        source_auroc
    ])
    is_best = test_auroc + source_auroc > best_acc
    best_acc = max(test_auroc + source_auroc, best_acc)
    save_checkpoint(
        {
            'epoch': epoch + 1,
            'state_dict': student_model.state_dict(),
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        },
        is_best,
        checkpoint=checkpoint)
    scheduler_cosine.step()
    scheduler_step.step()

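    # periodically refresh the teacher with the student's weights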
    if (epoch + 1) % 200 == 0:
        teacher_model.load_state_dict(student_model.state_dict())
Example #14
0
def main():
    global args, best_acc1, device

    # Init seed
    np.random.seed(args.manual_seed)
    torch.manual_seed(args.manual_seed)
    torch.cuda.manual_seed(args.manual_seed)

    if args.dataset.lower() == 'miniimagenet':
        train_loader, val_loader = get_dataloader(args, 'matching_train',
                                                  'test')
        in_channel = 3
        lstm_input_size = 1600
    elif args.dataset.lower() == 'omniglot':
        train_loader, val_loader = get_dataloader(args, 'trainval', 'test')
        in_channel = 1
        lstm_input_size = 64
    else:
        raise KeyError(f"Dataset {args.dataset} is not supported")

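    # fce=True enables full context embeddings over the support set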
    model = MatchingNetworks(args.classes_per_it_tr,
                             args.num_support_tr,
                             args.num_query_tr,
                             args.num_query_val,
                             in_channel,
                             args.lstm_layers,
                             lstm_input_size,
                             args.unrolling_steps,
                             fce=True,
                             distance_fn='cosine').to(device)
    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), args.lr)

    cudnn.benchmark = True

    if args.resume:
        try:
            checkpoint = torch.load(
                sorted(glob(f'{args.log_dir}/checkpoint_*.pth'), key=len)[-1])
        except Exception:
            checkpoint = torch.load(args.log_dir + '/model_best.pth')
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        best_acc1 = checkpoint['best_acc1']

        print(f"load checkpoint {args.exp_name}")
    else:
        start_epoch = 1

    print(
        f"model parameter : {sum(p.numel() for p in model.parameters() if p.requires_grad)}"
    )

    for epoch in range(start_epoch, args.epochs + 1):
        train_loss = train(train_loader, model, optimizer, criterion, epoch)

        is_test = epoch % args.test_iter == 0
        if is_test or epoch == args.epochs or epoch == 1:

            val_loss, acc1 = validate(val_loader, model, criterion, epoch)

            if acc1 >= best_acc1:
                is_best = True
                best_acc1 = acc1
            else:
                is_best = False

            save_checkpoint(
                {
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'best_acc1': best_acc1,
                    'epoch': epoch,
                }, is_best, args)

            if is_best:
                writer.add_scalar("Acc/BestAcc", acc1, epoch)

            print(
                f"[{epoch}/{args.epochs}] {train_loss:.3f}, {val_loss:.3f}, {acc1:.3f}, # {best_acc1:.3f}"
            )

        else:
            print(f"[{epoch}/{args.epochs}] {train_loss:.3f}")

    writer.close()
Example #15
0
def main(argv, configPath=None):
    # arguments
    args = getArgs_(argv, configPath)
    saveDir = savePath(args)
    logger = infoLogger(logdir=saveDir, name=args.model)

    logger.info(argv)
    logger.debug(cfgInfo(args))
    logger.info("CheckPoints path: {}".format(saveDir))
    logger.debug("Model Name: {}".format(args.model))

    train_dataset = BDD100K_Area_Seg(base_dir=args.dataPath,
                                     split='train',
                                     target_size=args.size)
    valid_dataset = BDD100K_Area_Seg(base_dir=args.dataPath,
                                     split='val',
                                     target_size=args.size)

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_worker,
                              pin_memory=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_worker,
                              pin_memory=True)

    args.num_gpus, args.device = deviceSetting(logger=logger,
                                               device=args.device)
    # model
    model = Deeplabv3plus_Mobilenet(args.output_channels,
                                    output_stride=args.output_stride)

    optimizer, scheduler = create_optimizer_(model, args)
    loss_fn = MultiClassCriterion(loss_type=args.loss_type,
                                  ignore_index=args.ignore_index)
    model, trainData = modelDeploy(args, model, optimizer, scheduler, logger)

    tensorLogger = SummaryWriter(log_dir=os.path.join(saveDir, 'runs'),
                                 filename_suffix=args.model)
    logger.info("Tensorboard event log saved in {}".format(
        tensorLogger.log_dir))

    logger.info('Start training...')
    # global_step = 0
    start_epoch = trainData['epoch']

    num_classes = args.output_channels
    extra_info_ckpt = '{}_{}_{}'.format(args.model, args.size[0], args.size[1])
    for i_epoch in range(start_epoch, args.max_epoch):
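        # hard-coded override: keep the learning rate at 1e-5 from epoch 29 onward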
        if i_epoch >= 29:
            optimizer.param_groups[0]["lr"] = np.float64(0.00001)
        trainData['epoch'] = i_epoch
        lossList, miouList = train_seg(model,
                                       train_loader,
                                       i_epoch,
                                       optimizer,
                                       loss_fn,
                                       num_classes,
                                       logger,
                                       tensorLogger,
                                       args=args)
        scheduler.step()
        trainData['loss'].extend(lossList)
        trainData['miou'].extend(miouList)

        valLoss, valMiou = val_seg(model,
                                   valid_loader,
                                   i_epoch,
                                   loss_fn,
                                   num_classes,
                                   logger,
                                   tensorLogger,
                                   args=args)
        trainData['val'].append([valLoss, valMiou])

        best = valMiou > trainData['bestMiou']
        if best:
            trainData['bestMiou'] = valMiou

        weights_dict = (model.module.state_dict()
                        if args.device == 'cuda' else model.state_dict())

        save_checkpoint(
            {
                'trainData': trainData,
                'model': weights_dict,
                'optimizer': optimizer.state_dict(),
            },
            is_best=best,
            dir=saveDir,
            extra_info=extra_info_ckpt,
            miou_val=valMiou,
            logger=logger)

    tensorLogger.close()
Example #16
0
def main():
    global args, best_acc1, device

    # Init seed
    np.random.seed(args.manual_seed)
    torch.manual_seed(args.manual_seed)
    torch.cuda.manual_seed(args.manual_seed)

    if args.dataset == 'omniglot':
        train_loader, val_loader = get_dataloader(args, 'trainval', 'test')
        input_dim = 1
    else:
        train_loader, val_loader = get_dataloader(args, 'train', 'val')
        input_dim = 3

    if args.model == 'protonet':
        model = ProtoNet(input_dim).to(device)
        print("ProtoNet loaded")
    else:
        model = ResNet(input_dim).to(device)
        print("ResNet loaded")

    criterion = PrototypicalLoss().to(device)

    optimizer = torch.optim.Adam(model.parameters(), args.lr)

    cudnn.benchmark = True

    if args.resume:
        try:
            checkpoint = torch.load(
                sorted(glob(f'{args.log_dir}/checkpoint_*.pth'), key=len)[-1])
        except Exception:
            checkpoint = torch.load(args.log_dir + '/model_best.pth')
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        best_acc1 = checkpoint['best_acc1']

        print(f"load checkpoint {args.exp_name}")
    else:
        start_epoch = 1

    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer=optimizer,
        gamma=args.lr_scheduler_gamma,
        step_size=args.lr_scheduler_step)

    print(
        f"model parameter : {sum(p.numel() for p in model.parameters() if p.requires_grad)}"
    )

    for epoch in range(start_epoch, args.epochs + 1):

        train_loss = train(train_loader, model, optimizer, criterion, epoch)

        is_test = epoch % args.test_iter == 0
        if is_test or epoch == args.epochs or epoch == 1:

            val_loss, acc1 = validate(val_loader, model, criterion, epoch)

            if acc1 >= best_acc1:
                is_best = True
                best_acc1 = acc1
            else:
                is_best = False

            save_checkpoint(
                {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer_state_dict': optimizer.state_dict(),
                }, is_best, args)

            if is_best:
                writer.add_scalar("BestAcc", acc1, epoch)

            print(
                f"[{epoch}/{args.epochs}] {train_loss:.3f}, {val_loss:.3f}, {acc1:.3f}, # {best_acc1:.3f}"
            )

        else:
            print(f"[{epoch}/{args.epochs}] {train_loss:.3f}")

        scheduler.step()

    writer.close()
Example #17
0
def main():
    global args, best_iou, iterations
    args = parser.parse_args()

    if args.tensorboard:
        from tensorboard_logger import configure
        print("Using tensorboard")
        configure("%s" % (args.dir))

    # model configurations
    num_classes = args.num_classes
    num_offsets = args.num_offsets
    if args.mode == 'offset':  # offset only
        num_classes = 0
    if args.mode == 'class':  # class only
        num_offsets = 0

    # model
    model = get_model(num_classes, num_offsets, args.arch, args.pretrain)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_iou = checkpoint['best_iou']
            model.load_state_dict(checkpoint['model_state'])
            if 'offset' in checkpoint:  # class mode doesn't have offset
                offset_list = checkpoint['offset']
                print("offsets are: {}".format(offset_list))
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            raise ValueError("=> no checkpoint found at '{}'".format(
                args.resume))

    # model distribution
    if args.gpu != -1:
        # DataParallel wrapper (synchronized batchnorm edition)
        if len(args.gpu) > 1:
            model = DataParallelWithCallback(model, device_ids=args.gpu)
        model.cuda()

    # dataset
    if args.mode == 'all':
        offset_list = generate_offsets(80 / args.scale, args.num_offsets)
        trainset = AllDataset(args.train_img,
                              args.train_ann,
                              num_classes,
                              offset_list,
                              scale=args.scale,
                              crop=args.crop,
                              crop_size=(args.crop_size, args.crop_size),
                              limits=args.limits)
        valset = AllDataset(args.val_img,
                            args.val_ann,
                            num_classes,
                            offset_list,
                            scale=args.scale,
                            limits=args.limits)
        class_nms = trainset.catNms
    elif args.mode == 'class':
        offset_list = None
        trainset = ClassDataset(args.train_img,
                                args.train_ann,
                                scale=args.scale,
                                crop=args.crop,
                                crop_size=(args.crop_size, args.crop_size),
                                limits=args.limits)
        valset = ClassDataset(args.val_img,
                              args.val_ann,
                              scale=args.scale,
                              limits=args.limits)
        class_nms = trainset.catNms
    elif args.mode == 'offset':
        offset_list = generate_offsets(80 / args.scale, args.num_offsets)
        print("offsets are: {}".format(offset_list))
        trainset = OffsetDataset(args.train_img,
                                 args.train_ann,
                                 offset_list,
                                 scale=args.scale,
                                 crop=args.crop,
                                 crop_size=args.crop_size,
                                 limits=args.limits)
        valset = OffsetDataset(args.val_img,
                               args.val_ann,
                               offset_list,
                               scale=args.scale,
                               limits=args.limits)
        class_nms = None

    trainloader = torch.utils.data.DataLoader(trainset,
                                              num_workers=4,
                                              batch_size=args.batch_size,
                                              shuffle=True)
    valloader = torch.utils.data.DataLoader(valset,
                                            num_workers=4,
                                            batch_size=4)
    num_train = len(trainset)
    num_val = len(valset)
    print('Training samples: {0} \n'
          'Validation samples: {1}'.format(num_train, num_val))

    # define optimizer
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                nesterov=args.nesterov,
                                weight_decay=args.weight_decay)
    if args.resume:
        optimizer.load_state_dict(checkpoint['optimizer'])

    # define loss functions
    if args.mode == 'all':
        criterion_cls = torch.nn.BCEWithLogitsLoss().cuda()
        criterion_ofs = torch.nn.BCEWithLogitsLoss().cuda()
    elif args.mode == 'class':
        criterion_cls = torch.nn.BCEWithLogitsLoss().cuda()
        criterion_ofs = None
    elif args.mode == 'offset':
        criterion_cls = None
        if args.loss == 'bce':
            print('Using Binary Cross Entropy Loss')
            criterion_ofs = torch.nn.BCEWithLogitsLoss().cuda()
        elif args.loss == 'mbce':
            print('Using Weighted Multiclass BCE Loss')
            criterion_ofs = MultiBCEWithLogitsLoss().cuda()
        elif args.loss == 'dice':
            print('Using Soft Dice Loss (0 mode)')
            criterion_ofs = SoftDiceLoss(mode='0').cuda()
        else:
            print('Using Cross Entropy Loss')
            criterion_ofs = CrossEntropyLossOneHot().cuda()

    # define learning rate scheduler
    if not args.milestones:
        milestones = [args.epochs]
    else:
        milestones = args.milestones
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=milestones,
                                         gamma=0.2,
                                         last_epoch=args.start_epoch - 1)

    # start iteration count
    iterations = args.start_epoch * int(len(trainset) / args.batch_size)

    # train
    for epoch in range(args.start_epoch, args.epochs):
        scheduler.step()
        iterations = train(trainloader,
                           model,
                           optimizer,
                           args.batch_size,
                           epoch,
                           iterations,
                           criterion_cls=criterion_cls,
                           class_nms=class_nms,
                           criterion_ofs=criterion_ofs,
                           offset_list=offset_list,
                           print_freq=args.print_freq,
                           log_freq=args.log_freq,
                           tensorboard=args.tensorboard,
                           score=args.score,
                           alpha=args.alpha)
        val_iou = validate(valloader,
                           model,
                           args.batch_size,
                           epoch,
                           iterations,
                           criterion_cls=criterion_cls,
                           class_nms=class_nms,
                           criterion_ofs=criterion_ofs,
                           offset_list=offset_list,
                           print_freq=args.print_freq,
                           log_freq=args.log_freq,
                           tensorboard=args.tensorboard,
                           score=args.score,
                           alpha=args.alpha)
        # visualize some example outputs after each epoch
        if args.visual_freq > 0 and epoch % args.visual_freq == 0:
            outdir = '{}/imgs/{}'.format(args.dir, epoch)
            if not os.path.exists(outdir):
                os.makedirs(outdir)
            sample(model, valloader, outdir, num_classes, num_offsets)

        # save checkpoint
        is_best = val_iou > best_iou
        best_iou = max(val_iou, best_iou)
        if args.gpu != -1 and len(args.gpu) > 1:
            state_dict = {
                'epoch': epoch + 1,
                'model_state':
                model.module.state_dict(),  # remove 'module' in checkpoint
                'best_iou': best_iou,
                'optimizer': optimizer.state_dict()
            }
        else:
            state_dict = {
                'epoch': epoch + 1,
                'model_state': model.state_dict(),
                'best_iou': best_iou,
                'optimizer': optimizer.state_dict()
            }
        if args.mode != 'class':
            state_dict['offset'] = offset_list
        save_checkpoint(args.dir, state_dict, is_best)

    print('Best validation mean iou: ', best_iou)
Example #18
0
def main_worker(gpu_idx, configs):
    configs.gpu_idx = gpu_idx

    if configs.gpu_idx is not None:
        print("Use GPU: {} for training".format(configs.gpu_idx))
        configs.device = torch.device('cuda:{}'.format(configs.gpu_idx))

    if configs.distributed:
        if configs.dist_url == "env://" and configs.rank == -1:
            configs.rank = int(os.environ["RANK"])
        if configs.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            configs.rank = configs.rank * configs.ngpus_per_node + gpu_idx

        dist.init_process_group(backend=configs.dist_backend,
                                init_method=configs.dist_url,
                                world_size=configs.world_size,
                                rank=configs.rank)

    configs.is_master_node = (not configs.distributed) or (
        configs.distributed and (configs.rank % configs.ngpus_per_node == 0))

    if configs.is_master_node:
        logger = Logger(configs.logs_dir, configs.saved_fn)
        logger.info('>>> Created a new logger')
        logger.info('>>> configs: {}'.format(configs))
        tb_writer = SummaryWriter(
            log_dir=os.path.join(configs.logs_dir, 'tensorboard'))
    else:
        logger = None
        tb_writer = None

    # model
    model = create_model(configs)

    # Data Parallel
    model = make_data_parallel(model, configs)

    # Freeze model
    model = freeze_model(model, configs.freeze_modules_list)

    if configs.is_master_node:
        num_parameters = get_num_parameters(model)
        logger.info('number of trained parameters of the model: {}'.format(
            num_parameters))

    optimizer = create_optimizer(configs, model)
    lr_scheduler = create_lr_scheduler(optimizer, configs)
    best_val_loss = np.inf
    earlystop_count = 0
    is_best = False

    # optionally load weight from a checkpoint
    if configs.pretrained_path is not None:
        model = load_pretrained_model(model, configs.pretrained_path, gpu_idx,
                                      configs.overwrite_global_2_local)
        if logger is not None:
            logger.info('loaded pretrained model at {}'.format(
                configs.pretrained_path))

    # optionally resume from a checkpoint
    if configs.resume_path is not None:
        checkpoint = resume_model(configs.resume_path, configs.arch,
                                  configs.gpu_idx)
        if hasattr(model, 'module'):
            model.module.load_state_dict(checkpoint['state_dict'])
        else:
            model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        best_val_loss = checkpoint['best_val_loss']
        earlystop_count = checkpoint['earlystop_count']
        configs.start_epoch = checkpoint['epoch'] + 1

    if logger is not None:
        logger.info(">>> Loading dataset & getting dataloader...")
    # Create dataloader
    train_loader, val_loader, train_sampler = create_train_val_dataloader(
        configs)
    test_loader = create_test_dataloader(configs)
    if logger is not None:
        logger.info('number of batches in train set: {}'.format(
            len(train_loader)))
        if val_loader is not None:
            logger.info('number of batches in val set: {}'.format(
                len(val_loader)))
        logger.info('number of batches in test set: {}'.format(
            len(test_loader)))

    if configs.evaluate:
        assert val_loader is not None, "The validation dataloader should not be None"
        val_loss = evaluate_one_epoch(val_loader, model,
                                      configs.start_epoch - 1, configs, logger)
        print('Evaluate, val_loss: {}'.format(val_loss))
        return

    for epoch in range(configs.start_epoch, configs.num_epochs + 1):
        # Get the current learning rate
        for param_group in optimizer.param_groups:
            lr = param_group['lr']
        if logger is not None:
            logger.info('{}'.format('*-' * 40))
            logger.info('{} {}/{} {}'.format('=' * 35, epoch,
                                             configs.num_epochs, '=' * 35))
            logger.info('{}'.format('*-' * 40))
            logger.info('>>> Epoch: [{}/{}] learning rate: {:.2e}'.format(
                epoch, configs.num_epochs, lr))

        if configs.distributed:
            train_sampler.set_epoch(epoch)
        # train for one epoch
        train_loss = train_one_epoch(train_loader, model, optimizer, epoch,
                                     configs, logger)
        loss_dict = {'train': train_loss}
        if not configs.no_val:
            val_loss = evaluate_one_epoch(val_loader, model, epoch, configs,
                                          logger)
            is_best = val_loss <= best_val_loss
            best_val_loss = min(val_loss, best_val_loss)
            loss_dict['val'] = val_loss

        if not configs.no_test:
            test_loss = evaluate_one_epoch(test_loader, model, epoch, configs,
                                           logger)
            loss_dict['test'] = test_loss
        # Write tensorboard
        if tb_writer is not None:
            tb_writer.add_scalars('Loss', loss_dict, epoch)
        # Save checkpoint
        if configs.is_master_node and (is_best or (
            (epoch % configs.checkpoint_freq) == 0)):
            saved_state = get_saved_state(model, optimizer, lr_scheduler,
                                          epoch, configs, best_val_loss,
                                          earlystop_count)
            save_checkpoint(configs.checkpoints_dir, configs.saved_fn,
                            saved_state, is_best, epoch)
        # Check early stop training
        if configs.earlystop_patience is not None:
            earlystop_count = 0 if is_best else (earlystop_count + 1)
            print_string = ' |||\t earlystop_count: {}'.format(earlystop_count)
            stop_training = configs.earlystop_patience <= earlystop_count
            if stop_training:
                print_string += '\n\t--- Early stopping!!!'
            else:
                print_string += '\n\t--- Continue training..., earlystop_count: {}'.format(
                    earlystop_count)
            # log the message before a potential break so the early-stop notice is not lost
            if logger is not None:
                logger.info(print_string)
            if stop_training:
                break
        # Adjust learning rate
        if configs.lr_type == 'plateau':
            assert (not configs.no_val
                    ), "Only use plateau when having validation set"
            lr_scheduler.step(val_loss)
        else:
            lr_scheduler.step()

    if tb_writer is not None:
        tb_writer.close()
    if configs.distributed:
        cleanup()
Example #19
0
def main():
    args = get_args()

    # Log
    log_format = '[%(asctime)s] %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%d %I:%M:%S')
    t = time.time()
    local_time = time.localtime(t)
    if not os.path.exists('./log'):
        os.mkdir('./log')
    fh = logging.FileHandler(
        os.path.join('log/train-{}{:02}{}'.format(local_time.tm_year % 2000,
                                                  local_time.tm_mon, t)))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)

    input_size = efficientnet_lite_params[args.model_name][2]

    use_gpu = False
    if torch.cuda.is_available():
        use_gpu = True

    assert os.path.exists(args.train_dir)
    train_dataset = datasets.ImageFolder(
        args.train_dir,
        transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            transforms.ColorJitter(brightness=0.4,
                                   contrast=0.4,
                                   saturation=0.4),
            transforms.RandomHorizontalFlip(0.5),
            transforms.ToTensor(),
            transforms.Normalize(MEAN_RGB, STDDEV_RGB)
        ]))
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_workers,
                                               pin_memory=use_gpu)
    train_dataprovider = DataIterator(train_loader)

    assert os.path.exists(args.val_dir)
    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        args.val_dir,
        transforms.Compose([
            transforms.Resize(input_size + CROP_PADDING,
                              interpolation=PIL.Image.BICUBIC),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            transforms.Normalize(MEAN_RGB, STDDEV_RGB)
        ])),
                                             batch_size=200,
                                             shuffle=False,
                                             num_workers=args.num_workers,
                                             pin_memory=use_gpu)
    val_dataprovider = DataIterator(val_loader)
    print('load data successfully')

    model = build_efficientnet_lite(args.model_name, args.num_classes)

    optimizer = torch.optim.SGD(get_parameters(model),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
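    # note: the label-smoothing loss is hard-coded for 1000 classes; adjust if args.num_classes differs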
    criterion_smooth = CrossEntropyLabelSmooth(1000, 0.1)

    if use_gpu:
        model = nn.DataParallel(model)
        loss_function = criterion_smooth.cuda()
        device = torch.device("cuda")
    else:
        loss_function = criterion_smooth
        device = torch.device("cpu")

    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lambda step: (1.0 - step / args.total_iters)
        if step <= args.total_iters else 0,
        last_epoch=-1)

    model = model.to(device)

    all_iters = 0
    if args.auto_continue:
        lastest_model, iters = get_lastest_model()
        if lastest_model is not None:
            all_iters = iters
            checkpoint = torch.load(lastest_model,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint['state_dict'], strict=True)
            print('load from checkpoint')
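            # fast-forward the LR scheduler to the restored iteration count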
            for i in range(iters):
                scheduler.step()

    args.optimizer = optimizer
    args.loss_function = loss_function
    args.scheduler = scheduler
    args.train_dataprovider = train_dataprovider
    args.val_dataprovider = val_dataprovider

    if args.eval:
        if args.eval_resume is not None:
            checkpoint = torch.load(args.eval_resume,
                                    map_location=None if use_gpu else 'cpu')
            load_checkpoint(model, checkpoint)
            validate(model, device, args, all_iters=all_iters)
        exit(0)

    while all_iters < args.total_iters:
        all_iters = train(model,
                          device,
                          args,
                          val_interval=args.val_interval,
                          bn_process=False,
                          all_iters=all_iters)
        validate(model, device, args, all_iters=all_iters)
    all_iters = train(model,
                      device,
                      args,
                      val_interval=int(1280000 / args.batch_size),
                      bn_process=True,
                      all_iters=all_iters)
    validate(model, device, args, all_iters=all_iters)
    save_checkpoint({
        'state_dict': model.state_dict(),
    },
                    args.total_iters,
                    tag='bnps-')