def test_task(self, config_name):
  """Smoke-tests one train and one validation step for a classification config."""
  height, width = 224, 224
  record_path = os.path.join(self.get_temp_dir(), 'cls_test.tfrecord')
  serialized = tfexample_utils.create_classification_example(
      image_height=height, image_width=width)
  self._create_test_tfrecord(
      tfrecord_file=record_path,
      example=tf.train.Example.FromString(serialized),
      num_samples=10)

  params = exp_factory.get_exp_config(config_name)
  params.task.train_data.global_batch_size = 2
  params.task.train_data.input_path = record_path
  params.task.validation_data.input_path = record_path

  cls_task = img_cls_task.ImageClassificationTask(params.task)
  model = cls_task.build_model()
  metrics = cls_task.build_metrics()
  batches = iter(
      orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
                                           cls_task.build_inputs,
                                           params.task.train_data))
  factory = optimization.OptimizerFactory(params.trainer.optimizer_config)
  optimizer = factory.build_optimizer(factory.build_learning_rate())

  def _check(step_logs):
    # Fold current metric results into the step logs, then verify the keys
    # the trainer is expected to report.
    for metric in metrics:
      step_logs[metric.name] = metric.result()
    for key in ('loss', 'accuracy', 'top_5_accuracy'):
      self.assertIn(key, step_logs)

  _check(cls_task.train_step(next(batches), model, optimizer,
                             metrics=metrics))
  _check(cls_task.validation_step(next(batches), model, metrics=metrics))
def test_task(self, config_name):
  """Runs one train and one validation step for a deeplab edgetpu config."""
  backbone_by_config = {
      'deeplabv3plus_mobilenet_edgetpuv2_xs_ade20k_32':
          'mobilenet_edgetpu_v2_xs',
      'deeplabv3plus_mobilenet_edgetpuv2_s_ade20k_32':
          'mobilenet_edgetpu_v2_s',
      'deeplabv3plus_mobilenet_edgetpuv2_m_ade20k_32':
          'mobilenet_edgetpu_v2_m',
  }
  params = seg_cfg.seg_deeplabv3plus_ade20k_32(
      backbone_by_config[config_name], init_backbone=False)
  # Shrink batch sizes, shuffle buffers and output resolution so the test
  # runs quickly on a single host.
  for split in (params.task.train_data, params.task.validation_data):
    split.global_batch_size = 1
    split.shuffle_buffer_size = 2
    split.output_size = [32, 32]
  params.task.model.decoder.aspp.pool_kernel_size = None
  # NOTE(review): setting dilated_resnet fields while the config above uses a
  # mobilenet backbone looks like a leftover — kept as-is to preserve
  # behavior; verify whether these assignments have any effect.
  params.task.model.backbone.dilated_resnet.model_id = 50
  params.task.model.backbone.dilated_resnet.output_stride = 16

  seg_task = img_seg_task.CustomSemanticSegmentationTask(params.task)
  model = seg_task.build_model()
  train_metrics = seg_task.build_metrics()
  batches = iter(
      orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
                                           seg_task.build_inputs,
                                           params.task.train_data))
  factory = optimization.OptimizerFactory(params.trainer.optimizer_config)
  optimizer = factory.build_optimizer(factory.build_learning_rate())

  logs = seg_task.train_step(
      next(batches), model, optimizer, metrics=train_metrics)
  self.assertIn('loss', logs)
  logs = seg_task.validation_step(
      next(batches), model, metrics=seg_task.build_metrics(training=False))
  self.assertIn('loss', logs)
def test_task(self, config_name):
  """Runs one train and one validation step for an autoseg edgetpu config.

  Args:
    config_name: name of the registered experiment config to exercise.
  """
  # Maps experiment config name -> backbone name.
  # FIX: the 's' entry previously read 'autoseg_edgetpu_backone_s'
  # (missing a 'b'), inconsistent with the 'xs' entry's
  # 'autoseg_edgetpu_backbone_xs' spelling. TODO(review): confirm the
  # corrected string matches the registered backbone name.
  config_to_backbone_mapping = {
      'autoseg_edgetpu_xs': 'autoseg_edgetpu_backbone_xs',
      'autoseg_edgetpu_s': 'autoseg_edgetpu_backbone_s'
  }
  config = autoseg_cfg.autoseg_edgetpu_experiment_config(
      config_to_backbone_mapping[config_name], init_backbone=False)
  # Small batches and shuffle buffers keep the test fast on a single host.
  config.task.train_data.global_batch_size = 2
  config.task.train_data.shuffle_buffer_size = 2
  config.task.validation_data.shuffle_buffer_size = 2
  config.task.validation_data.global_batch_size = 2
  config.task.train_data.output_size = [512, 512]
  config.task.validation_data.output_size = [512, 512]
  task = img_seg_task.AutosegEdgeTPUTask(config.task)
  model = task.build_model()
  metrics = task.build_metrics()
  strategy = tf.distribute.get_strategy()
  dataset = orbit.utils.make_distributed_dataset(strategy, task.build_inputs,
                                                 config.task.train_data)
  iterator = iter(dataset)
  opt_factory = optimization.OptimizerFactory(config.trainer.optimizer_config)
  optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate())
  # EMA optimizers need their shadow variables created before the first step.
  if isinstance(optimizer, optimization.ExponentialMovingAverage
               ) and not optimizer.has_shadow_copy:
    optimizer.shadow_copy(model)
  logs = task.train_step(next(iterator), model, optimizer, metrics=metrics)
  self.assertIn('loss', logs)
  logs = task.validation_step(
      next(iterator), model, metrics=task.build_metrics(training=False))
  self.assertIn('loss', logs)
  model.summary()
def testTaskWithUnstructuredSparsity(self, config_name):
  """Runs pruned train/validation steps and checks pruning took effect."""
  params = exp_factory.get_exp_config(config_name)
  params.task.train_data.global_batch_size = 2

  cls_task = img_cls_task.ImageClassificationTask(params.task)
  model = cls_task.build_model()
  metrics = cls_task.build_metrics()
  batches = iter(
      orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
                                           cls_task.build_inputs,
                                           params.task.train_data))
  factory = optimization.OptimizerFactory(params.trainer.optimizer_config)
  optimizer = factory.build_optimizer(factory.build_learning_rate())
  # EMA optimizers require a shadow copy of the model weights up front.
  if isinstance(optimizer, optimization.ExponentialMovingAverage
               ) and not optimizer.has_shadow_copy:
    optimizer.shadow_copy(model)

  if params.task.pruning:
    # This is an auxiliary initialization required to prune a model which is
    # originally done in the train library.
    actions.PruningAction(
        export_dir=tempfile.gettempdir(), model=model, optimizer=optimizer)

  # Check all layers and target weights are successfully pruned.
  self._validate_model_pruned(model, config_name)

  train_logs = cls_task.train_step(
      next(batches), model, optimizer, metrics=metrics)
  self._validate_metrics(train_logs, metrics)
  eval_logs = cls_task.validation_step(next(batches), model, metrics=metrics)
  self._validate_metrics(eval_logs, metrics)
def test_retinanet_task(self, test_config, is_training):
  """RetinaNet task test for training and val using toy configs."""
  height, width = 384, 384
  record_path = os.path.join(self.get_temp_dir(), 'det_test.tfrecord')
  example = tfexample_utils.create_detection_test_example(
      image_height=height,
      image_width=width,
      image_channel=3,
      num_instances=10)
  self._create_test_tfrecord(
      tfrecord_file=record_path, example=example, num_samples=10)

  params = exp_factory.get_exp_config(test_config)
  # modify config to suit local testing
  params.task.model.input_size = [128, 128, 3]
  params.trainer.steps_per_loop = 1
  for split in (params.task.train_data, params.task.validation_data):
    split.global_batch_size = 1
    split.shuffle_buffer_size = 2
    split.input_path = record_path
  params.task.annotation_file = None
  params.train_steps = 1

  det_task = retinanet.RetinaNetTask(params.task)
  model = det_task.build_model()
  metrics = det_task.build_metrics(training=is_training)
  if is_training:
    split_config = params.task.train_data
  else:
    split_config = params.task.validation_data
  batches = iter(
      orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
                                           det_task.build_inputs,
                                           split_config))
  factory = optimization.OptimizerFactory(params.trainer.optimizer_config)
  optimizer = factory.build_optimizer(factory.build_learning_rate())
  if is_training:
    det_task.train_step(next(batches), model, optimizer, metrics=metrics)
  else:
    det_task.validation_step(next(batches), model, metrics=metrics)
def build_optimizer(self, config):
  """Creates the optimizer for the fused model.

  Args:
    config: an object carrying `decay_steps`, `initial_learning_rate`,
      `end_learning_rate` and `warmup_steps` for the schedule.

  Returns:
    An optimizer built from `self.optimizer_config` with a polynomial
    learning-rate decay and linear warmup.
  """
  learning_rate = {
      'polynomial': {
          'decay_steps': config.decay_steps,
          'initial_learning_rate': config.initial_learning_rate,
          'end_learning_rate': config.end_learning_rate,
      }
  }
  warmup = {
      'type': 'linear',
      'linear': {
          'warmup_steps': config.warmup_steps,
      }
  }
  fused_config = self.optimizer_config.replace(
      learning_rate=learning_rate, warmup=warmup)
  logging.info('The optimizer config is: %s', fused_config.as_dict())
  factory = optimization.OptimizerFactory(fused_config)
  return factory.build_optimizer(factory.build_learning_rate())
def get_optimizer(self, stage_id):
  """Builds the optimizer used by the given progressive stage."""
  # Each stage carries its own warmup length and starting learning rate.
  stage = self.task_config.stage_list[stage_id]
  stage_config = self._optimizer_config.replace(
      warmup={
          'linear': {
              'warmup_steps': stage.warmup_steps
          },
      },
      learning_rate={
          'power': {
              'initial_learning_rate': stage.initial_learning_rate
          },
      },
  )
  factory = optimization.OptimizerFactory(stage_config)
  return factory.build_optimizer(factory.build_learning_rate())
def create_optimizer(
    cls,
    optimizer_config: OptimizationConfig,
    runtime_config: Optional[RuntimeConfig] = None,
    dp_config: Optional[DifferentialPrivacyConfig] = None):
  """Creates a TF optimizer from configurations.

  Args:
    optimizer_config: the parameters of the Optimization settings.
    runtime_config: the parameters of the runtime.
    dp_config: the parameter of differential privacy.

  Returns:
    A tf.optimizers.Optimizer object.
  """
  if dp_config is None:
    gradient_transformers = None
  else:
    logging.info(
        "Adding differential privacy transform with config %s.",
        dp_config.as_dict())
    # Clip gradients to the configured L2 norm, then add noise with
    # stddev = clipping_norm * noise_multiplier.
    noise_stddev = dp_config.clipping_norm * dp_config.noise_multiplier
    gradient_transformers = [
        functools.partial(
            ops.clip_l2_norm, l2_norm_clip=dp_config.clipping_norm),
        functools.partial(ops.add_noise, noise_stddev=noise_stddev),
    ]
  factory = optimization.OptimizerFactory(optimizer_config)
  optimizer = factory.build_optimizer(
      factory.build_learning_rate(),
      gradient_transformers=gradient_transformers)
  # Configuring optimizer when loss_scale is set in runtime config. This
  # helps avoiding overflow/underflow for float16 computations.
  if runtime_config:
    optimizer = performance.configure_optimizer(
        optimizer,
        use_float16=runtime_config.mixed_precision_dtype == "float16",
        loss_scale=runtime_config.loss_scale)
  return optimizer
def create_optimizer(trainer_config: TrainerConfig,
                     runtime_config: Optional[RuntimeConfig] = None):
  """Creates a TF optimizer from configurations.

  Args:
    trainer_config: the parameters of the trainer.
    runtime_config: the parameters of the runtime.

  Returns:
    A tf.optimizers.Optimizer object.
  """
  factory = optimization.OptimizerFactory(trainer_config.optimizer_config)
  optimizer = factory.build_optimizer(factory.build_learning_rate())
  # Wrap the optimizer for loss scaling only when the runtime explicitly
  # sets a loss scale; this guards float16 against overflow/underflow.
  if runtime_config and runtime_config.loss_scale:
    optimizer = performance.configure_optimizer(
        optimizer,
        use_float16=runtime_config.mixed_precision_dtype == "float16",
        loss_scale=runtime_config.loss_scale)
  return optimizer
def test_task(self, config_name):
  """Runs train and validation steps for an EdgeTPU classification config."""
  params = exp_factory.get_exp_config(config_name)
  params.task.train_data.global_batch_size = 2

  edgetpu_task = image_classification.EdgeTPUTask(params.task)
  model = edgetpu_task.build_model()
  metrics = edgetpu_task.build_metrics()
  batches = iter(
      orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
                                           edgetpu_task.build_inputs,
                                           params.task.train_data))
  factory = optimization.OptimizerFactory(params.trainer.optimizer_config)
  optimizer = factory.build_optimizer(factory.build_learning_rate())
  # EMA optimizers need their shadow variables created before the first step.
  if isinstance(optimizer, optimization.ExponentialMovingAverage
               ) and not optimizer.has_shadow_copy:
    optimizer.shadow_copy(model)

  def _check(step_logs):
    # Fold current metric results into the logs and verify expected keys.
    for metric in metrics:
      step_logs[metric.name] = metric.result()
    for key in ('loss', 'accuracy', 'top_5_accuracy'):
      self.assertIn(key, step_logs)

  _check(edgetpu_task.train_step(next(batches), model, optimizer,
                                 metrics=metrics))
  _check(edgetpu_task.validation_step(next(batches), model, metrics=metrics))
def get_optimizer(self, stage_id):
  """Builds the optimizer for the given progressive distillation stage."""
  # The final stage uses the pretrain distillation schedule; every earlier
  # stage uses the layer-wise one.
  is_last_stage = stage_id + 1 >= self.num_stages()
  distill_config = (
      self._progressive_config.pretrain_distill_config if is_last_stage else
      self._progressive_config.layer_wise_distill_config)
  stage_params = self._optimizer_config.replace(
      learning_rate={
          'polynomial': {
              'decay_steps': distill_config.decay_steps,
              'initial_learning_rate': distill_config.initial_learning_rate,
              'end_learning_rate': distill_config.end_learning_rate,
          }
      },
      warmup={'linear': {
          'warmup_steps': distill_config.warmup_steps,
      }})
  factory = optimization.OptimizerFactory(stage_params)
  return factory.build_optimizer(factory.build_learning_rate())
def _build_and_run_model(self, config):
  """Trains five steps, runs one validation step; returns logs and weights."""
  task = image_classification.ImageClassificationTask(config.task)
  model = task.build_model()
  metrics = task.build_metrics()
  batches = iter(
      orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
                                           task.build_inputs,
                                           config.task.train_data))
  factory = optimization.OptimizerFactory(config.trainer.optimizer_config)
  optimizer = factory.build_optimizer(factory.build_learning_rate())
  # EMA optimizers need their shadow variables created before the first step.
  if isinstance(optimizer, optimization.ExponentialMovingAverage
               ) and not optimizer.has_shadow_copy:
    optimizer.shadow_copy(model)

  def _with_metrics(step_logs):
    # Merge current metric results into the step logs.
    for metric in metrics:
      step_logs[metric.name] = metric.result()
    return step_logs

  # Run training
  logs = None
  for _ in range(5):
    logs = _with_metrics(
        task.train_step(next(batches), model, optimizer, metrics=metrics))
  # Run validation
  validation_logs = _with_metrics(
      task.validation_step(next(batches), model, metrics=metrics))
  return logs, validation_logs, model.weights
def __init__(self,
             config: ExperimentConfig,
             task: base_task.Task,
             train: bool = True,
             evaluate: bool = True,
             model=None,
             optimizer=None,
             checkpoint_exporter=None):
  """Initialize common trainer for TensorFlow models.

  Args:
    config: An `ExperimentConfig` instance specifying experiment config.
    task: A base_task.Task instance.
    train: bool, whether or not this trainer will be used for training.
      default to True.
    evaluate: bool, whether or not this trainer will be used for evaluation.
      default to True.
    model: tf.keras.Model instance. If provided, it will be used instead of
      building model using task.build_model(). Default to None.
    optimizer: tf.keras.optimizers.Optimizer instance. If provided, it will
      be used instead of the optimizer from config. Default to None.
    checkpoint_exporter: an object that has the `maybe_export_checkpoint`
      interface.
  """
  # Gets the current distribution strategy. If not inside any strategy scope,
  # it gets a single-replica no-op strategy.
  self._strategy = tf.distribute.get_strategy()
  self._config = config
  self._task = task
  # Use the injected model when given; otherwise build it from the task.
  self._model = model or task.build_model()
  if optimizer is None:
    # No optimizer injected: build one from the trainer's optimizer config.
    opt_factory = optimization.OptimizerFactory(
        config.trainer.optimizer_config)
    self._optimizer = opt_factory.build_optimizer(
        opt_factory.build_learning_rate())
  else:
    self._optimizer = optimizer
  self._checkpoint_exporter = checkpoint_exporter
  # Configuring optimizer when loss_scale is set in runtime config. This helps
  # avoiding overflow/underflow for float16 computations.
  if config.runtime.loss_scale:
    self._optimizer = performance.configure_optimizer(
        self._optimizer,
        use_float16=config.runtime.mixed_precision_dtype == 'float16',
        loss_scale=config.runtime.loss_scale)
  # global_step increases by 1 after each training iteration.
  # We should have global_step.numpy() == self.optimizer.iterations.numpy()
  # when there is only 1 optimizer.
  self._global_step = orbit.utils.create_global_step()
  # Let the model contribute extra objects to the checkpoint when it exposes
  # `checkpoint_items` (e.g. sub-networks to restore separately).
  if hasattr(self.model, 'checkpoint_items'):
    checkpoint_items = self.model.checkpoint_items
  else:
    checkpoint_items = {}
  self._checkpoint = tf.train.Checkpoint(
      global_step=self.global_step,
      model=self.model,
      optimizer=self.optimizer,
      **checkpoint_items)
  # Scalar loss trackers, reported separately for train and validation.
  self._train_loss = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
  self._validation_loss = tf.keras.metrics.Mean(
      'validation_loss', dtype=tf.float32)
  # Task-defined metrics plus any metrics the Keras model itself tracks.
  self._train_metrics = self.task.build_metrics(
      training=True) + self.model.metrics
  self._validation_metrics = self.task.build_metrics(
      training=False) + self.model.metrics
  if train:
    # Cooperative init: wire the distributed training dataset into the
    # orbit.StandardTrainer base.
    train_dataset = orbit.utils.make_distributed_dataset(
        self.strategy, self.task.build_inputs, self.config.task.train_data)
    orbit.StandardTrainer.__init__(
        self,
        train_dataset,
        options=orbit.StandardTrainerOptions(
            use_tf_while_loop=config.trainer.train_tf_while_loop,
            use_tf_function=config.trainer.train_tf_function,
            use_tpu_summary_optimization=config.trainer.allow_tpu_summary))
  if evaluate:
    # Cooperative init: wire the distributed eval dataset into the
    # orbit.StandardEvaluator base.
    eval_dataset = orbit.utils.make_distributed_dataset(
        self.strategy, self.task.build_inputs,
        self.config.task.validation_data)
    orbit.StandardEvaluator.__init__(
        self,
        eval_dataset,
        options=orbit.StandardEvaluatorOptions(
            use_tf_function=config.trainer.eval_tf_function))