def main(cfg):
    """Build the training pipeline described by ``cfg`` and run it.

    Sequence: dataset/dataloader -> detector model (with pretrained
    backbone weights) -> runner -> training hooks -> training loop.
    """
    # ------------------------------------------------------------------
    # Training data
    # ------------------------------------------------------------------
    train_dataset = build_dataset(cfg.data.train)
    loaders = [
        build_dataloader(train_dataset,
                         cfg.batch_size_per_device,
                         cfg.workers_per_gpu,
                         num_gpus=hvd.size(),
                         dist=True)
    ]

    # ------------------------------------------------------------------
    # Model
    # ------------------------------------------------------------------
    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)
    model.CLASSES = train_dataset.CLASSES
    # Run a single example batch through the network so every tensor
    # shape is defined before the pretrained weights are loaded.
    _ = model(next(iter(loaders[0][0])))
    model.layers[0].layers[0].load_weights(cfg.weights_path, by_name=False)

    # ------------------------------------------------------------------
    # Runner
    # ------------------------------------------------------------------
    runner = sagemaker_runner.Runner(model,
                                     batch_processor,
                                     name=cfg.model_name,
                                     optimizer=cfg.optimizer,
                                     work_dir=cfg.work_dir,
                                     logger=get_root_logger(cfg.log_level),
                                     amp_enabled=cfg.fp16,
                                     loss_weights=cfg.loss_weights)
    runner.timestamp = int(time())

    # ------------------------------------------------------------------
    # Training hooks (registration order preserved deliberately)
    # ------------------------------------------------------------------
    runner.register_hook(
        checkpoint.CheckpointHook(interval=cfg.checkpoint_interval,
                                  out_dir=cfg.outputs_path,
                                  s3_dir=None))
    runner.register_hook(
        CocoDistEvalmAPHook(cfg.data.val, interval=cfg.evaluation_interval))
    runner.register_hook(iter_timer.IterTimerHook())
    runner.register_hook(text.TextLoggerHook())
    runner.register_hook(
        visualizer.Visualizer(cfg.data.val, interval=100, top_k=10))
    runner.register_hook(
        tensorboard.TensorboardLoggerHook(log_dir=cfg.outputs_path,
                                          interval=10,
                                          image_interval=100,
                                          s3_dir=None))

    # ------------------------------------------------------------------
    # Train
    # ------------------------------------------------------------------
    runner.run(loaders, cfg.workflow, cfg.training_epochs)
def _dist_train(model,
                dataset,
                cfg,
                num_gpus=1,
                mixed_precision=False,
                validate=False,
                logger=None,
                timestamp=None):
    """Distributed training entry point.

    Wraps each dataset in a distributed dataloader, builds the optimizer
    (optionally with AMP graph rewrite), constructs the runner, registers
    training/eval hooks, optionally resumes, and runs the workflow.
    """
    # Normalize to a sequence of datasets before building loaders.
    dataset_list = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    tf_datasets = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         1,
                         num_gpus=num_gpus,
                         dist=True)
        for ds in dataset_list
    ]

    # Optimizer, with the TF mixed-precision graph rewrite when requested.
    optimizer = build_optimizer(cfg.optimizer)
    if mixed_precision:
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer, loss_scale='dynamic')

    # NOTE: this mutates cfg.optimizer_config in place.
    optimizer_config = cfg.optimizer_config
    optimizer_config['amp_enabled'] = mixed_precision
    gradient_clip = optimizer_config.get('gradient_clip', 15.0)  # default is 15.0

    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    amp_enabled=mixed_precision,
                    gradient_clip=gradient_clip)
    runner.timestamp = timestamp

    # Standard training hooks (lr schedule, optimizer, checkpoint, logging).
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # Optional COCO mAP evaluation hook.
    if validate:
        runner.register_hook(
            CocoDistEvalmAPHook(cfg.data.val, **cfg.get('evaluation', {})))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    runner.run(tf_datasets, cfg.workflow, cfg.total_epochs)
def _dist_train(model,
                dataset,
                cfg,
                num_gpus=1,
                mixed_precision=False,
                validate=False,
                logger=None,
                timestamp=None):
    """Distributed training entry point (node-0-gated evaluation variant).

    Builds distributed dataloaders and the optimizer, creates the runner,
    registers hooks, optionally resumes from a checkpoint, then trains.
    """
    # Normalize the input to a sequence and build one loader per dataset.
    dataset_list = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    tf_datasets = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         1,
                         num_gpus=num_gpus,
                         dist=True)
        for ds in dataset_list
    ]

    optimizer = build_optimizer(cfg.optimizer)
    if mixed_precision:
        # broken in TF 2.1
        # optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer, loss_scale='dynamic')

    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    amp_enabled=mixed_precision)
    # workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # NOTE: mutates cfg.optimizer_config in place before hook registration.
    optimizer_config = cfg.optimizer_config
    optimizer_config['amp_enabled'] = mixed_precision

    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # register this dist eval hook only for Node 0
    if validate and runner.rank < runner.local_size:
        runner.register_hook(
            CocoDistEvalmAPHook(cfg.data.val, **cfg.get('evaluation', {})))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    runner.run(tf_datasets, cfg.workflow, cfg.total_epochs)
def main(cfg):
    """End-to-end training driver: data, model, optimizer/schedule, runner.

    Decompresses the input data, builds the training dataloader and the
    detector, constructs an SGD optimizer with a warmed-up LR schedule
    ('1x' step decay or cosine restarts), then registers hooks and trains.
    """
    decompress_data(cfg)

    # ------------------------------------------------------------------
    # Create training data
    # ------------------------------------------------------------------
    cfg.global_batch_size = cfg.batch_size_per_device * hvd.size()
    cfg.steps_per_epoch = cfg.coco_images // cfg.global_batch_size
    datasets = build_dataset(cfg.data.train)
    tf_datasets = [
        build_dataloader(datasets,
                         cfg.batch_size_per_device,
                         cfg.workers_per_gpu,
                         num_gpus=hvd.size(),
                         dist=True)
    ]

    # ------------------------------------------------------------------
    # Build model
    # ------------------------------------------------------------------
    # Apply any hyperparameters that were passed in via arguments.
    if cfg.ls > 0.0:
        cfg.model['bbox_head']['label_smoothing'] = cfg.ls
    if cfg.use_rcnn_bn:
        cfg.model['bbox_head']['use_bn'] = cfg.use_rcnn_bn
    if cfg.use_conv:
        cfg.model['bbox_head']['use_conv'] = cfg.use_conv
    # BUGFIX: the original did ``cfg.schedule = args.schedule`` but ``args``
    # is not a parameter of this function; unless a module-level ``args``
    # global happens to exist this raised NameError. Only apply the
    # override when such a global is actually present.
    if 'args' in globals() and hasattr(globals()['args'], 'schedule'):
        cfg.schedule = globals()['args'].schedule
    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)
    # Pass one example batch through so all tensor shapes are defined
    # before loading the pretrained backbone weights.
    _ = model(next(iter(tf_datasets[0][0])))
    model.layers[0].layers[0].load_weights(cfg.weights_path, by_name=False)

    # ------------------------------------------------------------------
    # Build optimizer and associated scheduler
    # ------------------------------------------------------------------
    # Base learning rate is set for a global batch size of 8, with linear
    # scaling for larger batches.
    scaled_learning_rate = cfg.base_learning_rate * cfg.global_batch_size / 8
    steps_per_epoch = cfg.steps_per_epoch
    if cfg.schedule == '1x':
        # Step decay: x0.1 at epoch 8 and x0.1 again at epoch 10.
        scheduler = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
            [steps_per_epoch * 8, steps_per_epoch * 10],
            [scaled_learning_rate,
             scaled_learning_rate * 0.1,
             scaled_learning_rate * 0.01])
    elif cfg.schedule == 'cosine':
        scheduler = tf.keras.experimental.CosineDecayRestarts(
            initial_learning_rate=scaled_learning_rate,
            first_decay_steps=12 * steps_per_epoch,
            t_mul=1,
            m_mul=1)  # 0-1-13
    else:
        raise NotImplementedError(
            'Unsupported schedule: {!r}'.format(cfg.schedule))
    warmup_init_lr = 1.0 / cfg.warmup_init_lr_scale * scaled_learning_rate
    scheduler = WarmupScheduler(scheduler, warmup_init_lr, cfg.warmup_steps)
    # FIXME: currently hardcoded to SGD
    optimizer = tf.keras.optimizers.SGD(scheduler, momentum=0.9,
                                        nesterov=False)
    if cfg.fp16:
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer, loss_scale='dynamic')

    # ------------------------------------------------------------------
    # Create model runner
    # ------------------------------------------------------------------
    runner = sagemaker_runner.Runner(model,
                                     batch_processor,
                                     name=cfg.model_name,
                                     optimizer=optimizer,
                                     work_dir=cfg.work_dir,
                                     logger=get_root_logger(cfg.log_level),
                                     amp_enabled=cfg.fp16,
                                     loss_weights=cfg.loss_weights)
    runner.timestamp = int(time())

    # ------------------------------------------------------------------
    # Setup training hooks (registration order preserved)
    # ------------------------------------------------------------------
    runner.register_hook(
        checkpoint.CheckpointHook(interval=cfg.checkpoint_interval,
                                  out_dir=cfg.outputs_path,
                                  s3_dir=cfg.s3_checkpoints,
                                  h5=True))
    runner.register_hook(
        CocoDistEvalmAPHook(cfg.data.val, interval=cfg.evaluation_interval))
    runner.register_hook(iter_timer.IterTimerHook())
    runner.register_hook(text.TextLoggerHook())
    runner.register_hook(
        visualizer.Visualizer(cfg.data.val, interval=100, top_k=10))
    runner.register_hook(
        tensorboard.TensorboardLoggerHook(log_dir=cfg.outputs_path,
                                          image_interval=100,
                                          s3_dir=cfg.s3_tensorboard))

    # ------------------------------------------------------------------
    # Run model
    # ------------------------------------------------------------------
    runner.run(tf_datasets, cfg.workflow, cfg.training_epochs)