Пример #1
0
def _non_dist_train(model, dataset, cfg, validate=False):
    if validate:
        raise NotImplementedError('Built-in validation is not implemented '
                                  'yet in not-distributed training. Use '
                                  'distributed training or test.py and '
                                  '*eval.py scripts instead.')
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #2
0
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config

    if cfg.lr_config.get('customized', False):  ###

        cfg.lr_config = eval(cfg.lr_config['policy'])(**cfg.lr_config)

    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #3
0
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]   # 确认dataset是一个list或者tuple
    # template_dataset = template_dataset if isinstance(template_dataset, (list, tuple)) else [template_dataset]  # 确认dataset是一个list或者tuple

    # load the data for per batch
    data_loaders = [                                 # 调用该函数可实现每次返回minibatch张图片
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()   # 将模型送到GPU中,how to set data into gpus

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)                   # 建立优化器
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,    # 建立辅助Runner用来跑模型,进入后调用里面的train函数
                    cfg.log_level)
    # fp16 setting                                                      # 应当是关于FPN层的设置
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:                                                  # 选择恢复训练或者重新加载模型
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)              # 调用runner的run方法进行训练
Пример #4
0
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(
        cfg.gpus)).cuda()  #for multiple GPU
    # refer : torch.nn.DataParallel(model)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # runner : https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/runner.py
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    # print('fp16_cfg:',fp16_cfg) # None
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:  #default!
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #5
0
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    
    if cfg.model.type in ['TTFNet', 'CenterNet']:
        runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                    cfg.checkpoint_config, cfg.bbox_head_hist_config, cfg.log_config)

        model_type = cfg.bbox_head_hist_config.model_type # ['ConvModule', 'DeformConvPack']
        sub_module = cfg.bbox_head_hist_config.sub_modules  # ['bbox_head']
        save_steps = cfg.bbox_head_hist_config.save_every_n_steps

        class ActivationHook():
            def __init__(self, runner, every_steps=1):
                self.runner = runner
                self.every_steps = every_steps
                self.steps = 0
                self.activation = {}
        
            def hook(self, model, input, output):
                if (self.steps % self.every_steps) == 0:
                    self.runner.module_name.append(self)
                    self.runner.features_in_hook.append(input[0])
                    self.runner.features_out_hook.append(output)
                self.steps += 1


        runner_hook = ActivationHook(runner, every_steps=save_steps)

        def module_to_list(module, lst):
            # remove sequence_and_modulelist
            mod_name = type(module).__name__
            if isinstance(module, nn.ModuleList):
                return [module_to_list(item, lst) for item in module]
            else:
                if isinstance(module, nn.Sequential) and type(module[0]).__name__ in lst:
                    module.register_forward_hook(save_feature_maps)

        for module in sub_module:
            if getattr(model.module, module, None):
                mod = getattr(model.module, module, None)
                for k, v in mod._modules.items():
                    sub_mod = getattr(mod, k)
                    out = module_to_list(sub_mod, model_type)

    else:
        runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                    cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #6
0
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiments')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiments')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(model.cuda(cfg.gpu_ids[0]),
                               device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    meta=meta)

    # defrost backbone hook
    when_defrost = cfg.get('when_defrost')
    if when_defrost is not None:
        if when_defrost < 0:
            raise RuntimeError('when_defrost < 0')
        frozen_stages = cfg.get('frozen_stages', -1)
        defrost_backbone = DefrostBackbone(when_defrost, frozen_stages)
        runner.register_hook(defrost_backbone)
    # log hook
    custom_log = CustomLog(cfg.data.samples_per_gpu, when_defrost,
                           os.path.join(cfg.work_dir, 'log.txt'))
    runner.register_hook(custom_log)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(
            cfg.resume_from,
            map_location=lambda storage, loc: storage.cuda(cfg.gpu_ids[0]))
    elif cfg.load_from:
        runner.load_checkpoint(
            cfg.load_from,
            map_location=lambda storage, loc: storage.cuda(cfg.gpu_ids[0]))
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #7
0
def _dist_train(model,
                dataset,
                cfg,
                validate=False,
                logger=None,
                timestamp=None,
                meta=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True,
                         seed=cfg.seed) for ds in dataset
    ]
    # put model on gpus
    find_unused_parameters = cfg.get('find_unused_parameters', True)  # False
    # Sets the `find_unused_parameters` parameter in
    # torch.nn.parallel.DistributedDataParallel
    model = MMDistributedDataParallel(
        model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=find_unused_parameters)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    meta=meta)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg)
    else:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            imgs_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=True,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        runner.register_hook(DistEvalHook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #8
0
def _dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    if 'use_img_sampling' not in cfg.data:
        cfg.data.update({'use_img_sampling': False})
    if 'use_sample_out' not in cfg.data:
        cfg.data.update({'use_sample_out': False})
    print('--Dist-train--IS:{}--ISout:{}'.format(cfg.data.use_img_sampling,
                                                 cfg.data.use_sample_out))
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True,
                         use_img_sampling=cfg.data.use_img_sampling,
                         use_sample_out=cfg.data.use_sample_out)
        for ds in dataset
    ]
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda())

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)

    # Add for LVIS by LiYu
    import logging
    runner.logger.setLevel(logging.INFO)
    # ====================

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg)
    else:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        if isinstance(model.module, RPN):
            # TODO: implement recall hooks for other datasets
            runner.register_hook(
                CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg))
        else:
            dataset_type = DATASETS.get(val_dataset_cfg.type)
            if issubclass(dataset_type, datasets.CocoDataset):
                runner.register_hook(
                    CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg))
            else:
                runner.register_hook(
                    DistEvalmAPHook(val_dataset_cfg, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #9
0
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None,
                    meta=None):
    if validate:
        raise NotImplementedError('Built-in validation is not implemented '
                                  'yet in not-distributed training. Use '
                                  'distributed training or test.py and '
                                  '*eval.py scripts instead.')
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False,
                         seed=cfg.seed) for ds in dataset
    ]
    #print("im data loader ^^^^^^^*********$$$$$$, ",data_loaders)
    #print("data len is ", len(data_loaders[0]))
    #for i,d in enumerate(data_loaders[0]):
    #    if i > 20:
    #        print("ok break$$$$$$$$$")
    #        break
    #    print(d)
    #    print("current i " , i)
    #
    #print("OK ,fine")

    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    meta=meta)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #10
0
def _dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True) for ds in dataset
    ]
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda())

    # build runner
    finetune_htc = cfg.get('finetune_htc', False)
    freeze_htc = cfg.get('freeze_htc', False)
    freeze_backbone = cfg.get('freeze_backbone', False)
    optimizer = build_optimizer(model,
                                cfg.optimizer,
                                finetune_htc=finetune_htc,
                                freeze_htc=freeze_htc,
                                freeze_backbone=freeze_backbone)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)

    #for name, param in model.named_parameters():
    #    print(name, param.requires_grad)

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg)
    else:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        if isinstance(model.module, RPN):
            # TODO: implement recall hooks for other datasets
            runner.register_hook(
                CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg))
        else:
            dataset_type = DATASETS.get(val_dataset_cfg.type)
            if issubclass(dataset_type, datasets.CocoDataset):
                runner.register_hook(
                    CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg))
            else:
                runner.register_hook(
                    DistEvalmAPHook(val_dataset_cfg, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #11
0
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None,
                    runner_attr_dict=dict()):
    if validate:
        raise NotImplementedError('Built-in validation is not implemented '
                                  'yet in not-distributed training. Use '
                                  'distributed training or test.py and '
                                  '*eval.py scripts instead.')
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False) for ds in dataset
    ]
    # build runner
    runner_attr_dict.update({
        'imgs_per_gpu': cfg.data.imgs_per_gpu,
        'initial_lr': cfg.optimizer['lr']
    })
    if hasattr(dataset, 'CLASSES'):
        runner_attr_dict.update({'classes': dataset.CLASSES})
    optimizer = build_optimizer(model, cfg.optimizer)
    search_optimizer = getattr(getattr(cfg, 'search_config', {}),
                               'search_optimizer', None)
    assert search_optimizer is None, "Not support"
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    search_optimizer,
                    cfg.work_dir,
                    logger=logger,
                    runner_attr_dict=runner_attr_dict)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config)
    # register eval hooks
    if validate:
        if isinstance(model.module, RPN):
            runner.register_hook(CocoDistEvalRecallHook(cfg.data.val))
        else:
            if cfg.data.val.type == 'CocoDataset':
                runner.register_hook(CocoDistEvalmAPHook(cfg.data.val))
            else:
                runner.register_hook(DistEvalmAPHook(cfg.data.val))

    register_hooks(runner, cfg)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #12
0
def _dist_train(model,
                dataset,
                cfg,
                validate=False,
                logger=None,
                timestamp=None,
                runner_attr_dict=dict()):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True) for ds in dataset
    ]
    # build runner
    runner_attr_dict.update({
        'imgs_per_gpu': cfg.data.imgs_per_gpu,
        'initial_lr': cfg.optimizer['lr']
    })
    if hasattr(dataset, 'CLASSES'):
        runner_attr_dict.update({'classes': dataset.CLASSES})
    optimizer = build_optimizer(model, cfg.optimizer)
    search_optimizer = getattr(cfg, 'search_config',
                               {}).pop('search_optimizer', None)
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    search_optimizer,
                    cfg.work_dir,
                    logger=logger,
                    runner_attr_dict=runner_attr_dict)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg)
    else:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config)
    runner.register_hook(DistSamplerSeedHook())
    if search_optimizer is not None:
        runner.register_hook(DistSearchOptimizerHook())
        runner.register_hook(SearchHook(**cfg.search_config))
    # register eval hooks
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        if isinstance(model.module, RPN):
            # TODO: implement recall hooks for other datasets
            runner.register_hook(
                CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg))
        else:
            dataset_type = DATASETS.get(val_dataset_cfg.type)
            if issubclass(dataset_type, datasets.CocoDataset):
                runner.register_hook(
                    CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg))
            else:
                runner.register_hook(
                    DistEvalmAPHook(val_dataset_cfg, **eval_cfg))

    register_hooks(runner, cfg)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None,
                    save_random_weights=False):
    if validate:
        raise NotImplementedError('Built-in validation is not implemented '
                                  'yet in not-distributed training. Use '
                                  'distributed training or test.py and '
                                  '*eval.py scripts instead.')
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False) for ds in dataset
    ]

    # Load Weights with different Modes
    if 'pretrain_weights' in cfg:
        model = partial_load(model, cfg.pretrain_weights, ignore_clf=cfg.ignore_clf, samenclasses=cfg.same_nclasses,
                            convert_dict=cfg.convert_dict if 'convert_dict' in cfg else {})
    elif 'trained_weights' in cfg:
        model.load_state_dict(torch.load(cfg['trained_weights'])['state_dict'])

    if 'freeze_vars' in cfg:
        model = freeze_model_partially(model, cfg.freeze_vars)

    # Save Full config copy in checkpoint dir
    if not os.path.exists(cfg.work_dir):
        os.makedirs(cfg.work_dir)
    config_file = open(os.path.join(cfg.work_dir, 'config.txt'), 'w')
    for key, value in cfg.items():
        config_file.write('%s:%s\n'%(key, value))
    repo = Repo('./')
    config_file.write('\nRepo Branch: %s, Commit: %s\n'%(repo.active_branch, repo.head.commit.hexsha))
    config_file.close()

    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(
        model, batch_processor, optimizer, cfg.work_dir, logger=logger)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #14
0
def train_smpl_detector_fuse(model, datasets, cfg, **kwargs):
    # prepare data loaders
    data_loaders = [
        build_dataloader_fuse(dataset,
                              cfg.data.imgs_per_gpu,
                              cfg.data.workers_per_gpu,
                              cfg.gpus,
                              dist=False) for dataset in datasets
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    # TODO: Build up a logger here that inherit the hook class
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    val_dataset_cfg = cfg.data.val
    eval_cfg = cfg.get('evaluation', {})
    # No need to add hook for validation.
    # We shall return the losses inside the loss function.
    # I won't re-use the code for the Evaluation Hook.
    # The evaluation result will be passed to log only.

    # To crete a `latest.pth` file to run recursively
    if kwargs.get('create_dummy', False):
        print('Create a dummy checkpoint for recursive training')
        runner._epoch -= 1
        runner.save_checkpoint(cfg.work_dir, filename_tmpl='dummy_{}.pth')
        runner._epoch += 1
        return

    pretrain_path = kwargs.get('load_pretrain', None)
    if kwargs.get('load_pretrain', None):
        print(f"Load pretrained model from {pretrain_path}")
        runner._epoch -= 1
        runner.load_checkpoint(pretrain_path)
        runner.save_checkpoint(cfg.work_dir, filename_tmpl='pretrained_{}.pth')
        runner._epoch += 1
        return

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    val_every = cfg.data.train.get('val_every', None)
    val_loader = None
    if val_every and val_every > 0:
        val_loader = build_dataloader_fuse(kwargs.get('val_dataset'),
                                           cfg.data.imgs_per_gpu,
                                           cfg.data.workers_per_gpu,
                                           cfg.gpus,
                                           dist=False)
    if kwargs.get('debug', None):
        import ipdb
        ipdb.set_trace()
    runner.run(data_loaders,
               cfg.workflow,
               cfg.total_epochs,
               val_every=val_every,
               val_loader=val_loader,
               time_limit=getattr(cfg, 'time_limit', None))
Пример #15
0
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None):
    if validate:
        raise NotImplementedError('Built-in validation is not implemented '
                                  'yet in not-distributed training. Use '
                                  'distributed training or test.py and '
                                  '*eval.py scripts instead.')
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # # if model.module.bbox_head.freeze_solov2_and_train_combonly:
    # if model.module.bbox_head.optimize_list is not None:
    #     for (key, param) in model.named_parameters():
    #         # if 'kernel_convs_convcomb' not in key and 'context_fusion_convs' not in key and 'learned_weight' not in key:
    #         if not any(s in key for s in model.module.bbox_head.optimize_list):
    #             param.requires_grad=False
    #         else:
    #             # print('optimize {}'.format(key))
    #             logger.info('optimize {}'.format(key))

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(
        model, batch_processor, optimizer, cfg.work_dir, logger=logger)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

    ## add test after training
    if cfg.data.test.ann_file != 'data/lvis/lvis_v0.5_val_lvis_freqset.json': # if val set is lvis freq, only eval on lvis-freq val set
        cfg.data.test.test_mode = True
        dataset = build_dataset(cfg.data.test)
        data_loader = build_dataloader(
            dataset,
            imgs_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=False,
            shuffle=False)
        model_orig=model.module
        model = MMDataParallel(model, device_ids=[0]).cuda()
        data_loader.dataset.img_infos = data_loader.dataset.img_infos[:100]
        outputs = single_gpu_test(model, data_loader)

        print('\nwriting results to {}'.format('xxx'))
        # mmcv.dump(outputs, 'xxx')
        eval_types = ['segm']
        if eval_types:
            print('Starting evaluate {}'.format(' and '.join(eval_types)))
            if eval_types == ['proposal_fast']:
                result_file = 'xxx'
                coco_eval(result_file, eval_types, dataset.coco)
            else:
                if not isinstance(outputs[0], dict):
                    result_files = results2json_segm(dataset, outputs, 'xxx', dump=False)
                    coco_eval(result_files, eval_types, dataset.coco)
                else:
                    for name in outputs[0]:
                        print('\nEvaluating {}'.format(name))
                        outputs_ = [out[name] for out in outputs]
                        result_file = 'xxx' + '.{}'.format(name)
                        result_files = results2json(dataset, outputs_,
                                                    result_file, dump=False)
                        coco_eval(result_files, eval_types, dataset.coco)

        ##eval on lvis-77######
        cfg.data.test.ann_file = 'data/lvis/lvis_v0.5_val_cocofied.json'
        cfg.data.test.img_prefix = 'data/lvis/val2017/'
        cfg.data.test.test_mode = True
        dataset = build_dataset(cfg.data.test)
        data_loader = build_dataloader(
            dataset,
            imgs_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=False,
            shuffle=False)
        # model_orig=model.module
        # model = MMDataParallel(model, device_ids=[0]).cuda()
        data_loader.dataset.img_infos = data_loader.dataset.img_infos[:100]
        outputs = single_gpu_test(model, data_loader)

        print('\nwriting results to {}'.format('xxx'))
        # mmcv.dump(outputs, 'xxx')
        eval_types = ['segm']
        if eval_types:
            print('Starting evaluate {}'.format(' and '.join(eval_types)))
            if eval_types == ['proposal_fast']:
                result_file = 'xxx'
                coco_eval(result_file, eval_types, dataset.coco)
            else:
                if not isinstance(outputs[0], dict):
                    result_files = results2json_segm(dataset, outputs, 'xxx', dump=False)
                    from lvis import LVISEval
                    lvisEval = LVISEval('data/lvis/lvis_v0.5_val_cocofied.json', result_files, 'segm')
                    lvisEval.run()
                    lvisEval.print_results()
                    # fix lvis api eval iou_thr error, should be 0.9 but was 0.8999
                    lvisEval.params.iou_thrs[8] = 0.9
                    for iou in [0.5, 0.6, 0.7, 0.8, 0.9]:
                        print('AP at iou {}: {}'.format(iou, lvisEval._summarize('ap', iou_thr=iou)))
                else:
                    for name in outputs[0]:
                        print('\nEvaluating {}'.format(name))
                        outputs_ = [out[name] for out in outputs]
                        result_file = 'xxx' + '.{}'.format(name)
                        result_files = results2json(dataset, outputs_,
                                                    result_file, dump=False)
                        coco_eval(result_files, eval_types, dataset.coco)
    else:
        ##eval on lvis-freq######
        cfg.data.test.test_mode = True
        dataset = build_dataset(cfg.data.test)
        data_loader = build_dataloader(
            dataset,
            imgs_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=False,
            shuffle=False)
        # model_orig=model.module
        # model = MMDataParallel(model, device_ids=[0]).cuda()
        data_loader.dataset.img_infos = data_loader.dataset.img_infos[:100]
        outputs = single_gpu_test(model, data_loader)

        print('\nwriting results to {}'.format('xxx'))
        # mmcv.dump(outputs, 'xxx')
        eval_types = ['segm']
        if eval_types:
            print('Starting evaluate {}'.format(' and '.join(eval_types)))
            if eval_types == ['proposal_fast']:
                result_file = 'xxx'
                coco_eval(result_file, eval_types, dataset.coco)
            else:
                if not isinstance(outputs[0], dict):
                    result_files = results2json_segm(dataset, outputs, 'xxx', dump=False)
                    from lvis import LVISEval
                    lvisEval = LVISEval(cfg.data.test.ann_file, result_files, 'segm')
                    lvisEval.run()
                    lvisEval.print_results()
                    # fix lvis api eval iou_thr error, should be 0.9 but was 0.8999
                    lvisEval.params.iou_thrs[8] = 0.9
                    for iou in [0.5, 0.6, 0.7, 0.8, 0.9]:
                        print('AP at iou {}: {}'.format(iou, lvisEval._summarize('ap', iou_thr=iou)))
                else:
                    for name in outputs[0]:
                        print('\nEvaluating {}'.format(name))
                        outputs_ = [out[name] for out in outputs]
                        result_file = 'xxx' + '.{}'.format(name)
                        result_files = results2json(dataset, outputs_,
                                                    result_file, dump=False)
                        coco_eval(result_files, eval_types, dataset.coco)
Пример #16
0
def train_adv_smpl_detector(model, datasets, cfg, **kwargs):
    # prepare data loaders
    data_loaders = [
        build_dataloader_fuse(dataset,
                              cfg.data.imgs_per_gpu,
                              cfg.data.workers_per_gpu,
                              cfg.gpus,
                              dist=False) for dataset in datasets
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    adv_model = nn.DataParallel(Discriminator()).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    adv_optimizer = build_optimizer(adv_model, cfg.adv_optimizer)

    global runner
    runner = AdvRunner(adv_model, adv_optimizer, model, adv_batch_processor,
                       optimizer, cfg.work_dir, cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        assert NotImplementedError(
            "AdvOptimizer is not implemented for fp16 yet")
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    # TODO: Build up a logger here that inherit the hook class
    # import mmcv.runner.hooks.logger as mmcv_logger
    # mmcv_logger.LoggerHook
    runner.register_training_hooks(cfg.adv_optimizer_config, cfg.lr_config,
                                   optimizer_config, cfg.checkpoint_config,
                                   cfg.log_config)

    val_dataset_cfg = cfg.data.val
    eval_cfg = cfg.get('evaluation', {})
    # No need to add hook for validation.
    # We shall return the losses inside the loss function.
    # I won't re-use the code for the Evaluation Hook.
    # The evaluation result will be passed to log only.

    # To crete a `latest.pth` file to run recursively
    if kwargs.get('create_dummy', False):
        print('Create a dummy checkpoint for recursive training')
        runner._epoch -= 1
        runner.save_checkpoint(cfg.work_dir, filename_tmpl='dummy_{}.pth')
        runner._epoch += 1
        return

    pretrain_path = kwargs.get('load_pretrain', None)
    if kwargs.get('load_pretrain', None):
        print(f"Load pretrained model from {pretrain_path}")
        runner._epoch -= 1
        runner.load_checkpoint(pretrain_path)
        adv_pretrain_path = osp.join(*osp.split(pretrain_path)[:-1],
                                     'adv_' + osp.split(pretrain_path)[-1])
        if osp.isfile(adv_pretrain_path):
            runner.load_adv_checkpoint(adv_pretrain_path)
        else:
            print('No adversarial checkpoint is found.')
        runner.save_checkpoint(cfg.work_dir, filename_tmpl='pretrained_{}.pth')
        runner._epoch += 1
        return

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    val_every = cfg.data.train.get('val_every', None)
    val_loader = None
    if val_every and val_every > 0:
        val_loader = build_dataloader_fuse(kwargs.get('val_dataset'),
                                           cfg.data.imgs_per_gpu,
                                           cfg.data.workers_per_gpu,
                                           cfg.gpus,
                                           dist=False)
    if kwargs.get('debug', None):
        import ipdb
        ipdb.set_trace()
    re_weight = cfg.re_weight if hasattr(cfg, 're_weight') else dict()
    mosh_path = cfg.common_train_cfg.mosh_path
    mosh_data = np.load(mosh_path)
    mosh = {
        'shape': mosh_data['shape'].copy(),
        'pose': mosh_data['pose'].copy()
    }
    runner.run(data_loaders,
               cfg.workflow,
               cfg.total_epochs,
               val_every=val_every,
               val_loader=val_loader,
               time_limit=getattr(cfg, 'time_limit', None),
               re_weight=re_weight,
               mosh=mosh,
               log_grad=cfg.get('log_grad', False))
Пример #17
0
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiments')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiments')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    map_location = 'default'
    if torch.cuda.is_available():
        if distributed:
            # put model on gpus
            find_unused_parameters = cfg.get('find_unused_parameters', False)
            # Sets the `find_unused_parameters` parameter in
            # torch.nn.parallel.DistributedDataParallel
            model = MMDistributedDataParallel(
                model.cuda(),
                device_ids=[torch.cuda.current_device()],
                broadcast_buffers=False,
                find_unused_parameters=find_unused_parameters)
        else:
            model = MMDataParallel(
                model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
    else:
        model = MMDataCPU(model)
        map_location = 'cpu'

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    add_logging_on_first_and_last_iter(runner)

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from, map_location=map_location)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #18
0
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None,
                    meta=None):
    if validate:
        raise NotImplementedError('Built-in validation is not implemented '
                                  'yet in not-distributed training. Use '
                                  'distributed training or test.py and '
                                  '*eval.py scripts instead.')

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False,
                         seed=cfg.seed) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    # runner = Runner(
    #     model,
    #     batch_processor,
    #     optimizer,
    #     cfg.work_dir,
    #     logger=logger,
    #     meta=meta)
    runner = Runner(  # change for prune sparsity-regularization train
        model,
        batch_processor,
        optimizer,
        cfg,
        logger=logger,
        meta=meta)

    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(
        cfg.lr_config, optimizer_config, cfg.checkpoint_config,
        cfg.log_config)  # Register default hooks for training

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #19
0
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None,
                    meta=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False,
                         seed=cfg.seed) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    meta=meta)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if validate:
        cfg.data.test.test_mode = True
        distributed = False
        dataset = build_dataset(cfg.data.test)
        data_loader = build_dataloader(
            dataset,
            imgs_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)

        model = build_detector(cfg.model,
                               train_cfg=None,
                               test_cfg=cfg.test_cfg)
        checkpoint_path = os.path.join(cfg.work_dir, 'latest.pth')
        checkpoint = load_checkpoint(model,
                                     checkpoint_path,
                                     map_location='cpu')

        if 'CLASSES' in checkpoint['meta']:
            model.CLASSES = checkpoint['meta']['CLASSES']
        else:
            model.CLASSES = dataset.CLASSES

        model = MMDataParallel(model, device_ids=[0])
        outputs = single_gpu_test(model, data_loader)

        results = dataset.evaluate(outputs, 'keypoints')

        path = os.path.join(cfg.work_dir, 'keypoints.json')
        with open(path, 'a') as f:
            json.dump(results, f)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #20
0
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(model.cuda(cfg.gpu_ids[0]),
                               device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    meta=meta)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.wandb:
        runner.register_hook(
            WandbLoggerHook(init_kwargs=dict(
                project=cfg.project, name=cfg.exp_name, config=cfg)))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #21
0
def train_detector_m(model,
                     dataset,
                     cfg,
                     distributed=False,
                     validate=False,
                     timestamp=None,
                     meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiments')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiments')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    params_b = []
    """for key, value in dict(model.backbone.named_parameters()).items():
        if value.requires_grad:
            params_b += [{'params': [value]}]
    for key, value in dict(model.rpn_head.named_parameters()).items():
        if value.requires_grad:
            params_b += [{'params': [value]}]"""
    for key, value in dict(
            model.roi_head.shared_head.named_parameters()).items():
        if value.requires_grad:
            params_b += [{'params': [value]}]
    for key, value in dict(
            model.roi_head.bbox_head.named_parameters()).items():
        if value.requires_grad:
            params_b += [{'params': [value]}]
    optimizer_b = torch.optim.SGD(params_b,
                                  lr=cfg.optimizer.lr,
                                  momentum=cfg.optimizer.momentum,
                                  weight_decay=cfg.optimizer.weight_decay)
    optimizer_g = None
    optimizer_d = None
    if model.roi_head.with_fsr_generator:
        params_g = []
        for key, value in dict(
                model.roi_head.fsr_generator.named_parameters()).items():
            if value.requires_grad:
                params_g += [{'params': [value]}]
        for key, value in dict(
                model.roi_head.shared_head.named_parameters()).items():
            if value.requires_grad:
                params_g += [{'params': [value], 'lr': cfg.optimizer_b.lr}]
        for key, value in dict(
                model.roi_head.bbox_head.named_parameters()).items():
            if value.requires_grad:
                params_g += [{'params': [value], 'lr': cfg.optimizer_b.lr}]
        optimizer_g = torch.optim.SGD(
            params_g,
            lr=cfg.optimizer_g.lr,
            momentum=cfg.optimizer_g.momentum,
            weight_decay=cfg.optimizer_g.weight_decay)
    if model.roi_head.with_dis_head:
        params_d = []
        for key, value in dict(
                model.roi_head.dis_head.named_parameters()).items():
            if value.requires_grad:
                params_d += [{'params': [value]}]
        optimizer_d = torch.optim.SGD(
            params_d,
            lr=cfg.optimizer_d.lr,
            momentum=cfg.optimizer_d.momentum,
            weight_decay=cfg.optimizer_d.weight_decay)

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', True)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(model.cuda(cfg.gpu_ids[0]),
                               device_ids=cfg.gpu_ids)

    # build runner
    runner = MultiOptimRunner(model,
                              optimizer_b,
                              optimizer_g,
                              optimizer_d,
                              cfg.work_dir,
                              logger=logger,
                              meta=meta)

    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config_b = Fp16OptimizerHook(**cfg.optimizer_config_b,
                                               **fp16_cfg,
                                               distributed=distributed)
        optimizer_config_g = Fp16OptimizerHook(**cfg.optimizer_config_g,
                                               **fp16_cfg,
                                               distributed=distributed)
        optimizer_config_d = Fp16OptimizerHook(**cfg.optimizer_config_d,
                                               **fp16_cfg,
                                               distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config_b:
        optimizer_config_b = OptimHookB(**cfg.optimizer_config_b)
        optimizer_config_g = OptimHookG(**cfg.optimizer_config_g)
        optimizer_config_d = OptimHookD(**cfg.optimizer_config_d)
    else:
        optimizer_config_b = cfg.optimizer_config_b
        optimizer_config_g = cfg.optimizer_config_g
        optimizer_config_d = cfg.optimizer_config_d

    # register hooks
    runner.register_training_hooks(cfg.lr_config_b, cfg.lr_config_g,
                                   cfg.lr_config_d, optimizer_config_b,
                                   optimizer_config_g, optimizer_config_d,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #22
0
def _non_dist_train_runner(model,
                           trainDataset,
                           valDataset,
                           cfg,
                           validate=False) -> Runner:

    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config

    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    before = toSingleGPUModeBefore
    after = toSingleGPUModeAfter

    # register eval hooks
    if validate:
        val_dataset_cfg = valDataset
        eval_cfg = cfg.get('evaluation', {})
        if isinstance(model.module, RPN):
            # TODO: implement recall hooks for other datasets
            runner.register_hook(
                HookWrapper(
                    CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg),
                    before, after))
        else:
            dataset_type = getattr(mmdetDatasets, val_dataset_cfg.type)
            if issubclass(dataset_type, mmdetDatasets.CocoDataset):
                runner.register_hook(
                    HookWrapper(
                        CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg),
                        before, after))
            else:
                runner.register_hook(
                    HookWrapper(DistEvalmAPHook(val_dataset_cfg, **eval_cfg),
                                before, after))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        weightsPath = cfg.load_from
        if weightsPath is not None:
            if weightsPath.startswith("open-mmlab://"):
                cfgName = weightsPath[len("open-mmlab://"):]
                weightsPath = checkpoint_registry.getPath(cfgName)
            if cfg.resetHeads:
                torchHome = torch.hub._get_torch_home()
                chpName = os.path.basename(weightsPath)[0:(-1) * len(".pth")]
                noHeadWeightsPath = os.path.join(
                    torchHome, f"checkpoints/nohead/{chpName}_nohead.pth")
                if not os.path.exists(noHeadWeightsPath):
                    if isURL(weightsPath):
                        weights = model_zoo.load_url(weightsPath)
                    else:
                        weights = torch.load(weightsPath)
                    weights['state_dict'] = {
                        k: v
                        for k, v in weights['state_dict'].items()
                        if not k.startswith('bbox_head')
                        and not k.startswith('mask_head')
                    }
                    weightsDir = os.path.dirname(noHeadWeightsPath)
                    if not os.path.exists(weightsDir):
                        os.mkdir(weightsDir)
                    torch.save(weights, noHeadWeightsPath)
                weightsPath = noHeadWeightsPath

            runner.load_checkpoint(weightsPath)
    return runner
Пример #23
0
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None,
                    meta=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         len(cfg.gpu_ids),
                         dist=False,
                         seed=cfg.seed) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    if cfg.optimizer.type == 'SGD_GC':
        optimizer = SGD_GC(model.parameters(),
                           cfg.optimizer.lr,
                           momentum=cfg.optimizer.momentum,
                           weight_decay=cfg.optimizer.weight_decay)
    else:
        optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    meta=meta)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            imgs_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=False,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        runner.register_hook(EvalHook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #24
0
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiments')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiments')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(model.cuda(cfg.gpu_ids[0]),
                               device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(model,
                              optimizer=optimizer,
                              work_dir=cfg.work_dir,
                              logger=logger,
                              meta=meta)
    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            runner.register_hook(hook, priority=priority)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    classes_rearrange = cfg.get('classes_rearrange', False)
    if classes_rearrange:
        runner.model = rearrange_classes(runner.model, cfg.classes,
                                         cfg.dataset_type)

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #25
0
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None,
                    meta=None):
    # if validate:
    #     raise NotImplementedError('Built-in validation is not implemented '
    #                               'yet in not-distributed training. Use '
    #                               'distributed training or test.py and '
    #                               '*eval.py scripts instead.')
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False,
                         seed=cfg.seed) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    meta=meta)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            imgs_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=True,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        runner.register_hook(DistEvalHook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #26
0
def _dist_train(model,
                dataset,
                cfg,
                validate=False,
                logger=None,
                timestamp=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True) for ds in dataset
    ]
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda())

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg)
    else:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        if isinstance(model.module, RPN):
            # TODO: implement recall hooks for other datasets
            runner.register_hook(
                CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg))
        else:
            dataset_type = DATASETS.get(val_dataset_cfg.type)
            if issubclass(dataset_type, datasets.CocoDataset):
                runner.register_hook(
                    CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg))
            else:
                runner.register_hook(
                    DistEvalmAPHook(val_dataset_cfg, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Пример #27
0
def _dist_train(model,
                dataset,
                cfg,
                validate=False,
                multitask=False,
                vis=False):
    # prepare data loaders
    data_loaders = [[
        build_dataloader(d,
                         cfg.data.imgs_per_gpu //
                         2 if issubclass(d.__class__, datasets.BDDVideo) else
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu //
                         2 if issubclass(d.__class__, datasets.BDDVideo) else
                         cfg.data.workers_per_gpu,
                         dist=True) for d in dataset
    ]] if multitask else [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu //
                         2 if issubclass(dataset.__class__, datasets.BDDVideo)
                         else cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu //
                         2 if issubclass(dataset.__class__, datasets.BDDVideo)
                         else cfg.data.workers_per_gpu,
                         dist=True)
    ]
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda())

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    bp = batch_processor_with_vis if vis else batch_processor
    runner = MTLRunner(model, bp, optimizer, cfg.work_dir,
                       cfg.log_level) if multitask else \
             Runner(model, bp, optimizer, cfg.work_dir,
                       cfg.log_level)

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg)
    else:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        if isinstance(model.module, RPN):
            # TODO: implement recall hooks for other datasets
            runner.register_hook(
                CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg))
        else:
            if not type(val_dataset_cfg) == list:
                val_dataset_cfg = [val_dataset_cfg]
            for _cfg in val_dataset_cfg:
                dataset_type = getattr(datasets, _cfg.type)
                if issubclass(dataset_type, datasets.BddStreet) or \
                   issubclass(dataset_type, datasets.BddSemanticSeg):
                    runner.register_hook(BddSegEvalHook(_cfg, **eval_cfg))
                elif issubclass(dataset_type, datasets.CocoDataset):
                    runner.register_hook(CocoDistEvalmAPHook(_cfg, **eval_cfg))
                elif issubclass(dataset_type, datasets.BDDVideo):
                    runner.register_hook(BDDEvalHook(_cfg))
                else:
                    runner.register_hook(DistEvalmAPHook(_cfg, **eval_cfg))
            runner.register_logger_hooks(cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    if cfg.get('init_asso_head', False):
        ori_key = cfg.init_asso_head[0]
        new_key = cfg.init_asso_head[1]
        for _key in model.module.state_dict().keys():
            if 'asso' in _key:

                exist_key = _key.replace(ori_key, new_key)
                if exist_key in model.module.state_dict().keys():
                    if dist.get_rank() == 0:
                        print('Init "{}" with "{}"'.format(_key, exist_key))
                    model.module.state_dict()[_key].copy_(
                        model.module.state_dict()[exist_key])
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)