def testEmptyTrainingDataset(self):
    """Training on a dataset with zero batches should raise a RuntimeError."""
    tmp_dir = self.get_temp_dir()
    # An empty data file yields a dataset that produces no training steps.
    data_path = os.path.join(tmp_dir, "train.txt")
    open(data_path, "w").close()
    seq2seq = _make_seq2seq_model(tmp_dir)
    sgd = tf.keras.optimizers.SGD(1.0)
    runner = training.DistributionStrategyTrainer(seq2seq, sgd)
    empty_dataset = seq2seq.examples_inputter.make_training_dataset(
        data_path, data_path, 32)
    with self.assertRaisesRegex(RuntimeError, "No training steps"):
        runner(empty_dataset)
def train(self, num_devices=1, with_eval=False, checkpoint_path=None):
    """Runs the knowledge-distillation training loop.

    The teacher model weights are restored from the directory configured
    under ``model.teacher.teacher_checkpoint_path`` (required), and the
    student model weights are optionally warm-started from
    ``model.student.student_checkpoint_path``. Training batches are built
    from the student model's inputter.

    Args:
      num_devices: Number of devices to use for training.
      with_eval: Enable evaluation during training.
      checkpoint_path: The checkpoint path to load the model weights from it.

    Returns:
      The path to the final model directory.

    Raises:
      ValueError: If ``teacher_checkpoint_path`` is not configured.
    """
    checkpoint, config = self._init_run(num_devices=num_devices, training=True)

    # Load the teacher model weights (required for distillation).
    teacher_checkpoint_path = config["model"]["teacher"].get(
        "teacher_checkpoint_path", None)
    if not teacher_checkpoint_path:
        raise ValueError("teacher_checkpoint_path is None.")
    teacher_checkpoint = tf.train.Checkpoint(model=checkpoint.model.teacher_model)
    # NOTE(review): the restore status returned here is not checked; consider
    # calling assert_existing_objects_matched() to surface a bad checkpoint.
    teacher_checkpoint.restore(
        tf.train.latest_checkpoint(teacher_checkpoint_path))

    # Optionally warm-start the student model from an existing checkpoint.
    student_checkpoint_path = config["model"]["student"].get(
        "student_checkpoint_path", None)
    if student_checkpoint_path:
        student_checkpoint = tf.train.Checkpoint(
            model=checkpoint.model.student_model)
        student_checkpoint.restore(
            tf.train.latest_checkpoint(student_checkpoint_path))

    checkpoint.restore(
        checkpoint_path=checkpoint_path,
        weights_only=checkpoint_path is not None)
    model = checkpoint.model
    data_config = config["data"]
    train_config = config["train"]
    eval_config = config["eval"]
    batch_type = train_config["batch_type"]
    # Token batches padded to a multiple of 8 make better use of Tensor Cores
    # when mixed precision is enabled.
    if batch_type == "tokens" and self._mixed_precision:
        batch_size_multiple = 8
    else:
        batch_size_multiple = 1
    # The student inputter defines the training examples.
    dataset = model.student_model.examples_inputter.make_training_dataset(
        data_config["train_features_file"],
        data_config.get("train_labels_file"),
        train_config["batch_size"],
        batch_type=batch_type,
        batch_size_multiple=batch_size_multiple,
        shuffle_buffer_size=train_config["sample_buffer_size"],
        length_bucket_width=train_config["length_bucket_width"],
        maximum_features_length=train_config.get("maximum_features_length"),
        maximum_labels_length=train_config.get("maximum_labels_length"),
        single_pass=train_config.get("single_pass", False),
        prefetch_buffer_size=train_config.get("prefetch_buffer_size"))
    if with_eval:
        evaluator = evaluation.Evaluator.from_config(model, config)
    else:
        evaluator = None
    # Set gradients accumulation based on the requested effective batch size.
    if train_config.get("effective_batch_size") is not None:
        accum_steps = _count_batch_accum(
            train_config["batch_size"],
            train_config["effective_batch_size"],
            num_replicas=num_devices)
        tf.get_logger().info(
            "Accumulate gradients of %d iterations to reach effective batch size of %d",
            accum_steps,
            train_config["effective_batch_size"])
    else:
        accum_steps = 1
    trainer = training_util.DistributionStrategyTrainer(
        checkpoint, devices=misc.get_devices(count=num_devices))
    trainer(
        dataset,
        max_step=train_config.get("max_step"),
        accum_steps=accum_steps,
        report_steps=train_config.get("save_summary_steps", 100),
        save_steps=train_config.get("save_checkpoints_steps", 5000),
        evaluator=evaluator,
        eval_steps=eval_config.get("steps", 5000),
        export_on_best=eval_config.get("export_on_best"))
    average_last_checkpoints = train_config.get("average_last_checkpoints", 0)
    if average_last_checkpoints > 0:
        return self.average_checkpoints(
            os.path.join(checkpoint.model_dir, "avg"),
            max_count=average_last_checkpoints)
    return checkpoint.model_dir
def train(self, num_devices=1, with_eval=False, checkpoint_path=None):
    """Runs the training loop.

    Args:
      num_devices: Number of devices to use for training.
      with_eval: Enable evaluation during training.
      checkpoint_path: The checkpoint path to load the model weights from it.

    Returns:
      The path to the final model directory.
    """
    devices = misc.get_devices(count=num_devices)
    checkpoint, config = self._init_run(num_devices=num_devices, training=True)
    checkpoint.restore(
        checkpoint_path=checkpoint_path,
        weights_only=checkpoint_path is not None)
    model = checkpoint.model
    data_config = config["data"]
    train_config = config["train"]
    eval_config = config["eval"]
    batch_type = train_config["batch_type"]
    # Token batches padded to a multiple of 8 make better use of hardware
    # acceleration when mixed precision is enabled.
    batch_size_multiple = (
        8 if batch_type == "tokens" and self._mixed_precision else 1)

    def _dataset_fn(input_context):
        # Build this replica's shard of the training dataset.
        return model.examples_inputter.make_training_dataset(
            data_config["train_features_file"],
            data_config.get("train_labels_file"),
            train_config["batch_size"],
            batch_type=batch_type,
            batch_size_multiple=batch_size_multiple,
            shuffle_buffer_size=train_config["sample_buffer_size"],
            length_bucket_width=train_config["length_bucket_width"],
            maximum_features_length=train_config.get("maximum_features_length"),
            maximum_labels_length=train_config.get("maximum_labels_length"),
            single_pass=train_config.get("single_pass", False),
            num_shards=input_context.num_input_pipelines,
            shard_index=input_context.input_pipeline_id,
            prefetch_buffer_size=train_config.get("prefetch_buffer_size"),
            cardinality_multiple=input_context.num_replicas_in_sync,
            weights=data_config.get("train_files_weights"))

    evaluator = (
        evaluation.Evaluator.from_config(model, config) if with_eval else None)
    # Set gradients accumulation based on the requested effective batch size.
    effective_batch_size = train_config.get("effective_batch_size")
    if effective_batch_size is not None:
        accum_steps = _count_batch_accum(
            train_config["batch_size"],
            effective_batch_size,
            num_replicas=num_devices)
        tf.get_logger().info(
            "Accumulate gradients of %d iterations to reach effective batch size of %d",
            accum_steps,
            effective_batch_size)
    else:
        accum_steps = 1
    trainer = training_util.DistributionStrategyTrainer(
        checkpoint, devices=devices)
    trainer(
        _dataset_fn,
        max_step=train_config.get("max_step"),
        accum_steps=accum_steps,
        report_steps=train_config.get("save_summary_steps", 100),
        save_steps=train_config.get("save_checkpoints_steps", 5000),
        evaluator=evaluator,
        eval_steps=eval_config.get("steps", 5000),
        moving_average_decay=train_config.get("moving_average_decay"))
    average_last_checkpoints = train_config.get("average_last_checkpoints", 0)
    if average_last_checkpoints > 0:
        return self.average_checkpoints(
            os.path.join(checkpoint.model_dir, "avg"),
            max_count=average_last_checkpoints)
    return checkpoint.model_dir
def train(self, num_devices=1, with_eval=False, checkpoint_path=None, hvd=None):
    """Runs the training loop.

    Args:
      num_devices: Number of devices to use for training.
      with_eval: Enable evaluation during training.
      checkpoint_path: The checkpoint path to load the model weights from it.
      hvd: Optional Horovod module.

    Returns:
      The path to the final model directory.
    """
    if hvd is None:
        num_replicas = num_devices
        is_master = True
    else:
        # Under Horovod each process is one replica; only rank 0 manages
        # checkpoints and evaluation.
        num_replicas = hvd.size()
        is_master = hvd.rank() == 0
    config = self._finalize_config(
        training=True, num_replicas=num_replicas, num_devices=num_devices)
    model = self._init_model(config)
    optimizer = model.get_optimizer()
    data_config = config["data"]
    train_config = config["train"]
    eval_config = config["eval"]
    batch_type = train_config["batch_type"]
    # Token batches padded to a multiple of 8 make better use of hardware
    # acceleration when mixed precision is enabled.
    batch_size_multiple = (
        8 if batch_type == "tokens" and self._mixed_precision else 1)

    def _dataset_fn(input_context):
        # Build this replica's shard of the training dataset.
        return model.examples_inputter.make_training_dataset(
            data_config["train_features_file"],
            data_config.get("train_labels_file"),
            train_config["batch_size"],
            batch_type=batch_type,
            batch_size_multiple=batch_size_multiple,
            shuffle_buffer_size=train_config["sample_buffer_size"],
            length_bucket_width=train_config["length_bucket_width"],
            maximum_features_length=train_config.get("maximum_features_length"),
            maximum_labels_length=train_config.get("maximum_labels_length"),
            single_pass=train_config.get("single_pass", False),
            num_shards=input_context.num_input_pipelines,
            shard_index=input_context.input_pipeline_id,
            prefetch_buffer_size=train_config.get("prefetch_buffer_size"),
            cardinality_multiple=input_context.num_replicas_in_sync,
            weights=data_config.get("train_files_weights"))

    checkpoint = None
    evaluator = None
    if is_master:
        checkpoint = checkpoint_util.Checkpoint.from_config(
            config, model, optimizer=optimizer)
        checkpoint.restore(
            checkpoint_path=checkpoint_path,
            weights_only=checkpoint_path is not None)
        if with_eval:
            evaluator = evaluation.Evaluator.from_config(model, config)
    # Set gradients accumulation based on the requested effective batch size.
    effective_batch_size = train_config.get("effective_batch_size")
    if effective_batch_size is not None:
        accum_steps = _count_batch_accum(
            train_config["batch_size"],
            effective_batch_size,
            num_replicas=num_replicas)
        tf.get_logger().info(
            "Accumulate gradients of %d iterations to reach effective batch size of %d",
            accum_steps,
            effective_batch_size)
    else:
        accum_steps = 1
    if hvd is not None:
        if num_devices > 1:
            raise ValueError(
                "num_devices (or num_gpus) should be set to 1 when using Horovod")
        trainer = training_util.HorovodTrainer(
            model, optimizer, hvd, checkpoint=checkpoint)
    else:
        trainer = training_util.DistributionStrategyTrainer(
            model,
            optimizer,
            checkpoint=checkpoint,
            devices=misc.get_devices(count=num_devices))
    trainer(
        _dataset_fn,
        max_step=train_config.get("max_step"),
        accum_steps=accum_steps,
        report_steps=train_config.get("save_summary_steps", 100),
        save_steps=train_config.get("save_checkpoints_steps", 5000),
        evaluator=evaluator,
        eval_steps=eval_config.get("steps", 5000),
        moving_average_decay=train_config.get("moving_average_decay"))
    # Non-master workers keep no checkpoint and have no directory to report.
    if checkpoint is None:
        return None
    average_last_checkpoints = train_config.get("average_last_checkpoints", 0)
    if average_last_checkpoints > 0:
        return self.average_checkpoints(
            os.path.join(checkpoint.model_dir, "avg"),
            max_count=average_last_checkpoints)
    return checkpoint.model_dir