def main(cfg):
    ######################################################################################
    # Create Training Data
    ######################################################################################
    datasets = build_dataset(cfg.data.train)
    tf_datasets = [
        build_dataloader(datasets,
                         cfg.batch_size_per_device,
                         cfg.workers_per_gpu,
                         num_gpus=hvd.size(),
                         dist=True)
    ]
    ######################################################################################
    # Build Model
    ######################################################################################
    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)
    model.CLASSES = datasets.CLASSES
    # Pass an example batch through the model so tensor shapes are defined
    _ = model(next(iter(tf_datasets[0][0])))
    model.layers[0].layers[0].load_weights(cfg.weights_path, by_name=False)
    ######################################################################################
    # Create Model Runner
    ######################################################################################
    runner = sagemaker_runner.Runner(model,
                                     batch_processor,
                                     name=cfg.model_name,
                                     optimizer=cfg.optimizer,
                                     work_dir=cfg.work_dir,
                                     logger=get_root_logger(cfg.log_level),
                                     amp_enabled=cfg.fp16,
                                     loss_weights=cfg.loss_weights)
    runner.timestamp = int(time())
    ######################################################################################
    # Setup Training Hooks
    ######################################################################################
    runner.register_hook(
        checkpoint.CheckpointHook(interval=cfg.checkpoint_interval,
                                  out_dir=cfg.outputs_path,
                                  s3_dir=None))
    runner.register_hook(
        CocoDistEvalmAPHook(cfg.data.val, interval=cfg.evaluation_interval))
    runner.register_hook(iter_timer.IterTimerHook())
    runner.register_hook(text.TextLoggerHook())
    runner.register_hook(
        visualizer.Visualizer(cfg.data.val, interval=100, top_k=10))
    runner.register_hook(
        tensorboard.TensorboardLoggerHook(log_dir=cfg.outputs_path,
                                          interval=10,
                                          image_interval=100,
                                          s3_dir=None))
    ######################################################################################
    # Run Model
    ######################################################################################
    runner.run(tf_datasets, cfg.workflow, cfg.training_epochs)
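
# A minimal driver sketch for the `main(cfg)` entry point above, not part of the
# original example: it assumes `hvd` is horovod.tensorflow and that the config is
# an mmcv-style Config; the config path is hypothetical, and the imports may
# already exist at the top of the real module.
if __name__ == '__main__':
    import horovod.tensorflow as hvd
    import tensorflow as tf
    from mmcv import Config

    hvd.init()  # must happen before main() calls hvd.size()
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        # pin one GPU per Horovod process
        tf.config.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
    cfg = Config.fromfile('configs/mask_rcnn.py')  # hypothetical config path
    main(cfg)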
Example #2
def _dist_train(model,
                dataset,
                cfg,
                num_gpus=1,
                mixed_precision=False,
                validate=False,
                logger=None,
                timestamp=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    tf_datasets = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         1,
                         num_gpus=num_gpus,
                         dist=True) for ds in dataset
    ]

    # build runner
    optimizer = build_optimizer(cfg.optimizer)
    if mixed_precision:
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer, loss_scale='dynamic')

    optimizer_config = cfg.optimizer_config
    optimizer_config['amp_enabled'] = mixed_precision
    gradient_clip = optimizer_config.get('gradient_clip', 15.0)  # gradient clipping threshold, defaults to 15.0

    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    amp_enabled=mixed_precision,
                    gradient_clip=gradient_clip)
 
    runner.timestamp = timestamp
    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # register eval hooks
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        runner.register_hook(CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)

    runner.run(tf_datasets, cfg.workflow, cfg.total_epochs)
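
# In upstream MMDetection, _dist_train is reached through a train_detector()
# dispatcher. A minimal sketch under that assumption; the wrapper name and the
# non-distributed fallback are illustrative, not taken from this example.
def train_detector(model, dataset, cfg, distributed=True, validate=False,
                   logger=None, timestamp=None):
    if logger is None:
        logger = get_root_logger(cfg.log_level)
    if not distributed:
        raise NotImplementedError('only the distributed path is sketched here')
    _dist_train(model, dataset, cfg,
                num_gpus=hvd.size(),
                mixed_precision=cfg.get('fp16', False),
                validate=validate,
                logger=logger,
                timestamp=timestamp)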
Example #3
def _dist_train(model,
                dataset,
                cfg,
                num_gpus=1,
                mixed_precision=False,
                validate=False,
                logger=None,
                timestamp=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    tf_datasets = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         1,
                         num_gpus=num_gpus,
                         dist=True) for ds in dataset
    ]

    # build runner
    optimizer = build_optimizer(cfg.optimizer)
    if mixed_precision:
        # broken in TF 2.1
        # optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer, loss_scale='dynamic')

    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    amp_enabled=mixed_precision)
    # workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    optimizer_config = cfg.optimizer_config
    optimizer_config['amp_enabled'] = mixed_precision
    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # register eval hooks
    if validate and runner.rank < runner.local_size:  # register this dist eval hook only for Node 0
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        runner.register_hook(CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)

    runner.run(tf_datasets, cfg.workflow, cfg.total_epochs)
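
# The `runner.rank < runner.local_size` guard above keeps the COCO eval hook on
# the first node only. A sketch of the same check written directly against
# Horovod, assuming Runner.rank/Runner.local_size mirror hvd's topology and the
# usual contiguous rank layout:
def _on_first_node():
    # global ranks 0 .. local_size()-1 are the processes placed on node 0
    return hvd.rank() < hvd.local_size()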
Example #4
def main(cfg):
    decompress_data(cfg)
    ######################################################################################
    # Create Training Data
    ######################################################################################
    cfg.global_batch_size = cfg.batch_size_per_device * hvd.size()
    cfg.steps_per_epoch = cfg.coco_images // cfg.global_batch_size

    datasets = build_dataset(cfg.data.train)
    tf_datasets = [
        build_dataloader(datasets,
                         cfg.batch_size_per_device,
                         cfg.workers_per_gpu,
                         num_gpus=hvd.size(),
                         dist=True)
    ]
    ######################################################################################
    # Build Model
    ######################################################################################

    # Update any hyperparameters that were passed in via command-line arguments
    if cfg.ls > 0.0:
        cfg.model['bbox_head']['label_smoothing'] = cfg.ls
    if cfg.use_rcnn_bn:
        cfg.model['bbox_head']['use_bn'] = cfg.use_rcnn_bn
    if cfg.use_conv:
        cfg.model['bbox_head']['use_conv'] = cfg.use_conv

    # `args` is assumed to be parsed at module scope (e.g. via argparse) before main() is called
    cfg.schedule = args.schedule
    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)
    # Pass an example batch through the model so tensor shapes are defined
    _ = model(next(iter(tf_datasets[0][0])))
    model.layers[0].layers[0].load_weights(cfg.weights_path, by_name=False)

    ######################################################################################
    # Build optimizer and associate scheduler
    ######################################################################################

    # base learning rate is set for global batch size of 8, with linear scaling for larger batches
    base_learning_rate = cfg.base_learning_rate
    scaled_learning_rate = base_learning_rate * cfg.global_batch_size / 8
    steps_per_epoch = cfg.steps_per_epoch
    if cfg.schedule == '1x':
        # step decay: drop the LR by 10x after epoch 8 and again after epoch 10
        scheduler = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
            [steps_per_epoch * 8, steps_per_epoch * 10], [
                scaled_learning_rate, scaled_learning_rate * 0.1,
                scaled_learning_rate * 0.01
            ])
    elif cfg.schedule == 'cosine':
        scheduler = tf.keras.experimental.CosineDecayRestarts(
            initial_learning_rate=scaled_learning_rate,
            first_decay_steps=12 * steps_per_epoch,
            t_mul=1,
            m_mul=1)  #0-1-13
    else:
        raise NotImplementedError
    # warm up from scaled_learning_rate / warmup_init_lr_scale over cfg.warmup_steps steps
    warmup_init_lr = 1.0 / cfg.warmup_init_lr_scale * scaled_learning_rate
    scheduler = WarmupScheduler(scheduler, warmup_init_lr, cfg.warmup_steps)
    # FIXME: currently hardcoded to SGD
    optimizer = tf.keras.optimizers.SGD(scheduler,
                                        momentum=0.9,
                                        nesterov=False)
    if cfg.fp16:
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer, loss_scale='dynamic')

    ######################################################################################
    # Create Model Runner
    ######################################################################################
    runner = sagemaker_runner.Runner(model,
                                     batch_processor,
                                     name=cfg.model_name,
                                     optimizer=optimizer,
                                     work_dir=cfg.work_dir,
                                     logger=get_root_logger(cfg.log_level),
                                     amp_enabled=cfg.fp16,
                                     loss_weights=cfg.loss_weights)
    runner.timestamp = int(time())
    ######################################################################################
    # Setup Training Hooks
    ######################################################################################
    runner.register_hook(
        checkpoint.CheckpointHook(interval=cfg.checkpoint_interval,
                                  out_dir=cfg.outputs_path,
                                  s3_dir=cfg.s3_checkpoints,
                                  h5=True))
    runner.register_hook(
        CocoDistEvalmAPHook(cfg.data.val, interval=cfg.evaluation_interval))
    runner.register_hook(iter_timer.IterTimerHook())
    runner.register_hook(text.TextLoggerHook())
    runner.register_hook(
        visualizer.Visualizer(cfg.data.val, interval=100, top_k=10))
    runner.register_hook(
        tensorboard.TensorboardLoggerHook(log_dir=cfg.outputs_path,
                                          image_interval=100,
                                          s3_dir=cfg.s3_tensorboard))
    ######################################################################################
    # Run Model
    ######################################################################################
    runner.run(tf_datasets, cfg.workflow, cfg.training_epochs)
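
# WarmupScheduler is project-specific, not a tf.keras built-in. Below is a
# minimal sketch of a linear-warmup wrapper that matches the call signature
# used above (wrapped scheduler, warmup_init_lr, warmup_steps); the actual
# implementation in the repo may differ.
class WarmupSchedulerSketch(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, after_scheduler, warmup_init_lr, warmup_steps):
        super().__init__()
        self.after_scheduler = after_scheduler
        self.warmup_init_lr = warmup_init_lr
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        warmup_steps = tf.cast(self.warmup_steps, tf.float32)
        # ramp linearly from warmup_init_lr to the wrapped schedule's initial value
        target_lr = self.after_scheduler(0)
        warmup_lr = self.warmup_init_lr + (target_lr - self.warmup_init_lr) * (step / warmup_steps)
        return tf.cond(step < warmup_steps,
                       lambda: warmup_lr,
                       lambda: self.after_scheduler(step))

    def get_config(self):
        return {'warmup_init_lr': self.warmup_init_lr,
                'warmup_steps': self.warmup_steps}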