# Example 1
def get_callbacks(model_checkpoint: bool = True,
                  include_tensorboard: bool = True,
                  time_history: bool = True,
                  track_lr: bool = True,
                  write_model_weights: bool = True,
                  initial_step: int = 0,
                  batch_size: int = 0,
                  log_steps: int = 0,
                  model_dir: str = None) -> List[tf.keras.callbacks.Callback]:
    """Assembles the training callbacks selected via the boolean flags.

    Returns a list that may contain a checkpoint writer, a custom
    TensorBoard callback and a TimeHistory performance logger.
    """
    if model_dir is None:
        model_dir = ''
    result = []
    if model_checkpoint:
        result.append(
            tf.keras.callbacks.ModelCheckpoint(
                os.path.join(model_dir, 'model.ckpt-{epoch:04d}'),
                save_weights_only=True,
                verbose=1))
    if include_tensorboard:
        result.append(
            CustomTensorBoard(log_dir=model_dir,
                              track_lr=track_lr,
                              initial_step=initial_step,
                              write_images=write_model_weights))
    if time_history:
        # TimeHistory writes summaries only when TensorBoard is also on.
        time_logdir = model_dir if include_tensorboard else None
        result.append(
            keras_utils.TimeHistory(batch_size, log_steps,
                                    logdir=time_logdir))
    return result
# Example 2
def get_callbacks(steps_per_epoch, current_rank, cluster_size,
                  learning_rate_schedule_fn):
    """Returns common callbacks."""
    callbacks = [keras_utils.TimeHistory(FLAGS.batch_size, FLAGS.log_steps)]

    if learning_rate_schedule_fn and not FLAGS.use_tensor_lr:
        callbacks.append(
            LearningRateBatchScheduler(learning_rate_schedule_fn,
                                       batch_size=FLAGS.batch_size,
                                       steps_per_epoch=steps_per_epoch,
                                       cluster_size=cluster_size))

    # Only rank 0 writes TensorBoard summaries.
    if FLAGS.enable_tensorboard and current_rank == 0:
        callbacks.append(
            tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir))

    if FLAGS.profile_steps:
        callbacks.append(
            keras_utils.get_profiler_callback(FLAGS.model_dir,
                                              FLAGS.profile_steps,
                                              FLAGS.enable_tensorboard,
                                              steps_per_epoch))

    return callbacks
    def _run_and_report_benchmark(self,
                                  params,
                                  min_ap=0.325,
                                  max_ap=0.35,
                                  do_eval=True,
                                  warmup=1):
        """Starts RetinaNet accuracy benchmark test.

        Args:
            params: dict of model parameters; serialized into
                FLAGS.params_override before the run. Must contain
                params['train']['batch_size'] and
                params['train']['total_steps'].
            min_ap: lower bound on acceptable average precision.
            max_ap: upper bound on acceptable average precision.
            do_eval: if True, run an eval pass after training and merge its
                metrics into the reported summary.
            warmup: warmup value forwarded to _report_benchmark.
        """
        FLAGS.params_override = json.dumps(params)
        # Need timer callback to measure performance
        self.timer_callback = keras_utils.TimeHistory(
            batch_size=params['train']['batch_size'],
            log_steps=FLAGS.log_steps,
        )

        start_time_sec = time.time()
        FLAGS.mode = 'train'
        summary, _ = self._run_detection_main()
        # Wall time covers training only; the eval pass below is excluded.
        wall_time_sec = time.time() - start_time_sec

        if do_eval:
            FLAGS.mode = 'eval'
            # NOTE(review): train mode unpacks a 2-tuple above, but here the
            # raw return value is merged into `summary` — confirm eval mode
            # returns a plain metrics dict rather than a tuple.
            eval_metrics = self._run_detection_main()
            summary.update(eval_metrics)

        summary['total_steps'] = params['train']['total_steps']
        self._report_benchmark(summary, start_time_sec, wall_time_sec, min_ap,
                               max_ap, warmup)
# Example 4
def get_callbacks(pruning_method=None,
                  enable_checkpoint_and_export=False,
                  model_dir=None):
    """Returns common callbacks."""
    time_logdir = FLAGS.model_dir if FLAGS.enable_tensorboard else None
    callbacks = [
        keras_utils.TimeHistory(FLAGS.batch_size,
                                FLAGS.log_steps,
                                logdir=time_logdir)
    ]

    if FLAGS.enable_tensorboard:
        callbacks.append(
            tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir,
                                           profile_batch=FLAGS.profile_steps))

    if pruning_method is not None:
        # Pruning requires the update step; summaries need a log dir.
        callbacks.append(tfmot.sparsity.keras.UpdatePruningStep())
        if model_dir is not None:
            callbacks.append(
                tfmot.sparsity.keras.PruningSummaries(log_dir=model_dir,
                                                      profile_batch=0))

    if enable_checkpoint_and_export and model_dir is not None:
        callbacks.append(
            tf.keras.callbacks.ModelCheckpoint(
                os.path.join(model_dir, 'model.ckpt-{epoch:04d}'),
                save_weights_only=True))
    return callbacks
# Example 5
def get_callbacks(learning_rate_schedule_fn, num_images):
  """Returns common callbacks."""
  callbacks = [keras_utils.TimeHistory(FLAGS.batch_size, FLAGS.log_steps)]

  if not FLAGS.use_tensor_lr:
    callbacks.append(
        LearningRateBatchScheduler(learning_rate_schedule_fn,
                                   batch_size=FLAGS.batch_size,
                                   num_images=num_images))

  if FLAGS.enable_tensorboard:
    callbacks.append(
        tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir))

  if FLAGS.profile_steps:
    callbacks.append(
        keras_utils.get_profiler_callback(FLAGS.model_dir, FLAGS.profile_steps,
                                          FLAGS.enable_tensorboard))

  return callbacks
# Example 6
def run_ncf(_):
    """Run NCF training and eval with Keras.

    Args:
        _: unused; present for absl.app.run compatibility.

    Returns:
        A stats dict built from the training history, eval results and the
        TimeHistory callback.
    """
    # TODO(seemuch): Support different train and eval batch sizes
    if FLAGS.eval_batch_size != FLAGS.batch_size:
        tf.logging.warning(
            "The Keras implementation of NCF currently does not support batch_size "
            "!= eval_batch_size ({} vs. {}). Overriding eval_batch_size to match "
            "batch_size".format(FLAGS.eval_batch_size, FLAGS.batch_size))
        FLAGS.eval_batch_size = FLAGS.batch_size

    params = ncf_common.parse_flags(FLAGS)
    # NOTE(review): `batch_size` is captured *before* the eval_batch_size
    # override below, so TimeHistory sees the un-rounded value — confirm
    # this is intended.
    batch_size = params["batch_size"]

    # ncf_common rounds eval_batch_size (this is needed due to a reshape during
    # eval). This carries over that rounding to batch_size as well.
    params['batch_size'] = params['eval_batch_size']

    num_users, num_items, num_train_steps, num_eval_steps, producer = (
        ncf_common.get_inputs(params))

    params["num_users"], params["num_items"] = num_users, num_items
    producer.start()
    model_helpers.apply_clean(flags.FLAGS)

    batches_per_step = params["batches_per_step"]
    train_input_dataset, eval_input_dataset = _get_train_and_eval_data(
        producer, params)
    # It is required that for distributed training, the dataset must call
    # batch(). The parameter of batch() here is the number of replicas involed,
    # such that each replica evenly gets a slice of data.
    train_input_dataset = train_input_dataset.batch(batches_per_step)
    eval_input_dataset = eval_input_dataset.batch(batches_per_step)

    strategy = ncf_common.get_distribution_strategy(params)
    with distribution_utils.get_strategy_scope(strategy):
        keras_model = _get_keras_model(params)
        optimizer = ncf_common.get_optimizer(params)
        time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)

        keras_model.compile(loss=_keras_loss,
                            metrics=[_get_metric_fn(params)],
                            optimizer=optimizer)

        history = keras_model.fit(
            train_input_dataset,
            epochs=FLAGS.train_epochs,
            callbacks=[IncrementEpochCallback(producer), time_callback],
            verbose=2)

        tf.logging.info("Training done. Start evaluating")

        eval_results = keras_model.evaluate(eval_input_dataset,
                                            steps=num_eval_steps,
                                            verbose=2)

    tf.logging.info("Keras evaluation is done.")

    stats = build_stats(history, eval_results, time_callback)
    return stats
# Example 7
def main(_):
    """Runs SQuAD train/predict/eval according to FLAGS.mode.

    Args:
        _: unused; present for absl.app.run compatibility.
    """
    # Users should always run this script under TF 2.x

    with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
        input_meta_data = json.loads(reader.read().decode('utf-8'))

    # Export-only mode needs no distribution strategy or training setup.
    if FLAGS.mode == 'export_only':
        export_squad(FLAGS.model_export_path, input_meta_data)
        return

    # Configures cluster spec for multi-worker distribution strategy.
    if FLAGS.num_gpus > 0:
        _ = distribution_utils.configure_cluster(FLAGS.worker_hosts,
                                                 FLAGS.task_index)
    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus,
        all_reduce_alg=FLAGS.all_reduce_alg,
        tpu_address=FLAGS.tpu)

    # Substring checks allow combined modes (e.g. a mode string containing
    # both 'train' and 'predict').
    if 'train' in FLAGS.mode:
        # Only collect per-step timings when log_steps is set.
        if FLAGS.log_steps:
            custom_callbacks = [
                keras_utils.TimeHistory(
                    batch_size=FLAGS.train_batch_size,
                    log_steps=FLAGS.log_steps,
                    logdir=FLAGS.model_dir,
                )
            ]
        else:
            custom_callbacks = None

        train_squad(
            strategy,
            input_meta_data,
            custom_callbacks=custom_callbacks,
            run_eagerly=FLAGS.run_eagerly,
        )
    if 'predict' in FLAGS.mode:
        predict_squad(strategy, input_meta_data)
    if 'eval' in FLAGS.mode:
        if input_meta_data.get('version_2_with_negative', False):
            logging.error('SQuAD v2 eval is not supported. '
                          'Falling back to predict mode.')
            predict_squad(strategy, input_meta_data)
        else:
            eval_metrics = eval_squad(strategy, input_meta_data)
            f1_score = eval_metrics['f1']
            logging.info('SQuAD eval F1-score: %f', f1_score)
            # Only replicas allowed to save summaries write to model_dir;
            # the rest dump to a throwaway temp dir.
            if (not strategy) or strategy.extended.should_save_summary:
                summary_dir = os.path.join(FLAGS.model_dir, 'summaries')
            else:
                summary_dir = tempfile.mkdtemp()
            summary_writer = tf.summary.create_file_writer(
                os.path.join(summary_dir, 'eval'))
            with summary_writer.as_default():
                # TODO(lehou): write to the correct step number.
                tf.summary.scalar('F1-score', f1_score, step=0)
                summary_writer.flush()
# Example 8
def run_bert(strategy,
             input_meta_data,
             model_config,
             train_input_fn=None,
             eval_input_fn=None,
             init_checkpoint=None,
             custom_callbacks=None,
             custom_metrics=None):
  """Run BERT training.

  Args:
    strategy: a tf.distribute strategy; required.
    input_meta_data: dict providing 'train_data_size' and 'eval_data_size'.
    model_config: BERT model configuration forwarded to run_bert_classifier.
    train_input_fn: optional training input function.
    eval_input_fn: optional evaluation input function.
    init_checkpoint: optional checkpoint path; falls back to
      FLAGS.init_checkpoint when not given.
    custom_callbacks: optional list of Keras callbacks; a TimeHistory
      callback is appended when FLAGS.log_steps is set.
    custom_metrics: optional metrics forwarded to run_bert_classifier.

  Returns:
    The trained Keras model.

  Raises:
    ValueError: if no distribution strategy is provided.
  """
  # Enables XLA in Session Config. Should not be set for TPU.
  keras_utils.set_session_config(FLAGS.enable_xla)
  performance.set_mixed_precision_policy(common_flags.dtype(),
                                         use_experimental_api=False)

  # Each "epoch" here is a fraction of a real epoch: the train data size is
  # divided by num_eval_per_epoch so evaluation runs that many times per
  # pass over the training data.
  epochs = FLAGS.num_train_epochs * FLAGS.num_eval_per_epoch
  train_data_size = (
      input_meta_data['train_data_size'] // FLAGS.num_eval_per_epoch)
  if FLAGS.train_data_size:
    train_data_size = min(train_data_size, FLAGS.train_data_size)
    logging.info('Updated train_data_size: %s', train_data_size)
  steps_per_epoch = int(train_data_size / FLAGS.train_batch_size)
  # Warmup over the first 10% of total training steps.
  warmup_steps = int(epochs * train_data_size * 0.1 / FLAGS.train_batch_size)
  eval_steps = int(
      math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size))

  if not strategy:
    raise ValueError('Distribution strategy has not been specified.')

  if not custom_callbacks:
    custom_callbacks = []

  if FLAGS.log_steps:
    custom_callbacks.append(
        keras_utils.TimeHistory(
            batch_size=FLAGS.train_batch_size,
            log_steps=FLAGS.log_steps,
            logdir=FLAGS.model_dir))

  trained_model, _ = run_bert_classifier(
      strategy,
      model_config,
      input_meta_data,
      FLAGS.model_dir,
      epochs,
      steps_per_epoch,
      FLAGS.steps_per_loop,
      eval_steps,
      warmup_steps,
      FLAGS.learning_rate,
      init_checkpoint or FLAGS.init_checkpoint,
      train_input_fn,
      eval_input_fn,
      custom_callbacks=custom_callbacks,
      custom_metrics=custom_metrics)

  if FLAGS.model_export_path:
    model_saving_utils.export_bert_model(
        FLAGS.model_export_path, model=trained_model)
  return trained_model
  def train_and_eval(self):
    """Trains the model, evaluating every `epochs_between_evals` epochs.

    Returns:
      A stats dict combining training stats (built on the master rank) with
      the latest evaluation stats.
    """
    lr_schedule = optimizer.LearningRateSchedule(self.params["learning_rate"], self.params["hidden_size"],
                                                 self.params["learning_rate_warmup_steps"])
    opt = tf.keras.optimizers.Adam(lr_schedule,
                                   self.params["optimizer_adam_beta1"],
                                   self.params["optimizer_adam_beta2"],
                                   epsilon=self.params["optimizer_adam_epsilon"])
    self.train_model.compile(opt)
    self.train_model.summary()

    # create train dataset, sharded manually across Tarantella ranks
    train_ds = data_pipeline.train_input_fn(self.params,
                                            shuffle_seed = 42,
                                            num_ranks = tnt.get_size(),
                                            rank = tnt.get_rank())

    # enable global callbacks
    callbacks = []
    if self.flags_obj.enable_tensorboard and self.flags_obj.model_dir:
      callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=self.flags_obj.model_dir))

    # enable logging callbacks only on the master rank
    if self.flags_obj.enable_time_history:
      # NOTE(review): params["num_sentences"] is passed as TimeHistory's
      # log_steps argument — confirm this is intended.
      time_callback = keras_utils.TimeHistory(self.params["batch_size"],
                                              self.params["num_sentences"],
                                              logdir = None)
      tnt_time_callback = tnt.keras.callbacks.Callback(time_callback,
                                                       aggregate_logs = False,
                                                       run_on_all_ranks = False)
      callbacks.append(tnt_time_callback)

    # print messages only once
    if tnt.is_master_rank():
      logging.info("Start train")

    stats = {}
    # Train in chunks of epochs_between_evals epochs; min() clamps the last
    # chunk so we never exceed train_epochs.
    for epoch in range(0, self.params["train_epochs"], self.params["epochs_between_evals"]):
      # as our dataset is distributed manually, disable the automatic Tarantella distribution
      history = self.train_model.fit(train_ds,
                                     callbacks = callbacks,
                                     tnt_distribute_dataset = False,
                                     initial_epoch = epoch,
                                     epochs = epoch + min(self.params["epochs_between_evals"],
                                                          self.params["train_epochs"]-epoch),
                                     verbose = 2)

      if tnt.is_master_rank():
        logging.info("Train history: {}".format(history.history))
        stats = misc.build_stats(history, callbacks)

      if tnt.is_master_rank():
        eval_stats = self.eval()
        stats.update(eval_stats)

    return stats
# Example 10
def main(_):
    """Runs SQuAD train/predict/eval according to FLAGS.mode.

    Args:
        _: unused; present for absl.app.run compatibility.
    """
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)

    with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
        input_meta_data = json.loads(reader.read().decode('utf-8'))

    # Export-only mode needs no distribution strategy or training setup.
    if FLAGS.mode == 'export_only':
        export_squad(FLAGS.model_export_path, input_meta_data)
        return

    # Configures cluster spec for multi-worker distribution strategy.
    if FLAGS.num_gpus > 0:
        _ = distribute_utils.configure_cluster(FLAGS.worker_hosts,
                                               FLAGS.task_index)
    strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus,
        all_reduce_alg=FLAGS.all_reduce_alg,
        tpu_address=FLAGS.tpu)

    # Substring checks allow combined modes (e.g. a mode string containing
    # both 'train' and 'predict').
    if 'train' in FLAGS.mode:
        # Only collect per-step timings when log_steps is set.
        if FLAGS.log_steps:
            custom_callbacks = [
                keras_utils.TimeHistory(
                    batch_size=FLAGS.train_batch_size,
                    log_steps=FLAGS.log_steps,
                    logdir=FLAGS.model_dir,
                )
            ]
        else:
            custom_callbacks = None

        train_squad(
            strategy,
            input_meta_data,
            custom_callbacks=custom_callbacks,
            run_eagerly=FLAGS.run_eagerly,
            sub_model_export_name=FLAGS.sub_model_export_name,
        )
    if 'predict' in FLAGS.mode:
        predict_squad(strategy, input_meta_data)
    if 'eval' in FLAGS.mode:
        eval_metrics = eval_squad(strategy, input_meta_data)
        f1_score = eval_metrics['final_f1']
        logging.info('SQuAD eval F1-score: %f', f1_score)
        summary_dir = os.path.join(FLAGS.model_dir, 'summaries', 'eval')
        summary_writer = tf.summary.create_file_writer(summary_dir)
        with summary_writer.as_default():
            # TODO(lehou): write to the correct step number.
            tf.summary.scalar('F1-score', f1_score, step=0)
            summary_writer.flush()
        # Also write eval_metrics to json file.
        squad_lib_wp.write_to_json_files(
            eval_metrics, os.path.join(summary_dir, 'eval_metrics.json'))
        # NOTE(review): fixed sleep, presumably to let async file writes
        # settle before the job exits — confirm it is still needed.
        time.sleep(60)
# Example 11
def get_callbacks(learning_rate_schedule_fn, num_images):
    """Returns common callbacks.

    Returns:
        A (time_callback, tensorboard_callback, lr_callback) tuple.
    """
    return (
        keras_utils.TimeHistory(FLAGS.batch_size, FLAGS.log_steps),
        tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir),
        LearningRateBatchScheduler(learning_rate_schedule_fn,
                                   batch_size=FLAGS.batch_size,
                                   num_images=num_images),
    )
def train_model(flags_obj, dataset, vocab_size, strategy, checkpoint_dir=None):
    """Trains a Shakespeare model.

  Args:
    flags_obj: An object containing parsed flag values.
    dataset: the training data set.
    vocab_size: the number of unique character classes.
    strategy: distribution strategy to use.
    checkpoint_dir: if not None, the directory in which to make checkpoints.

  Returns:
    The training history and callbacks.
  """
    train_steps = (flags_obj.train_steps
                   if flags_obj.train_steps
                   else BATCHES_PER_EPOCH // flags_obj.batch_size)

    with distribution_utils.get_strategy_scope(strategy):
        model = build_model(vocab_size=vocab_size,
                            batch_size=flags_obj.batch_size,
                            use_cudnn=flags_obj.cudnn)

        # When keras_use_ctl is False, Model.fit() automatically applies
        # loss scaling so we don't need to create a LossScaleOptimizer.
        model.compile(
            optimizer=tf.keras.optimizers.Adam(),
            loss=tf.keras.losses.CategoricalCrossentropy(),
            metrics=[
                tf.keras.metrics.Recall(top_k=1, name='RecallAt1'),
                tf.keras.metrics.Recall(top_k=5, name='RecallAt5')
            ],
            run_eagerly=flags_obj.run_eagerly,
            experimental_run_tf_function=flags_obj.force_v2_in_keras_compile)

    callbacks = []
    if checkpoint_dir:
        callbacks.append(
            tf.keras.callbacks.ModelCheckpoint(
                filepath=os.path.join(checkpoint_dir, 'ckpt_{epoch}'),
                save_weights_only=True))
    callbacks.append(
        keras_utils.TimeHistory(flags_obj.batch_size, flags_obj.log_steps))

    history = model.fit(dataset,
                        epochs=flags_obj.train_epochs,
                        steps_per_epoch=train_steps,
                        callbacks=callbacks,
                        verbose=2)
    return history, callbacks
# Example 13
def get_callbacks(
    model_checkpoint: bool = True,
    include_tensorboard: bool = True,
    time_history: bool = True,
    track_lr: bool = True,
    write_model_weights: bool = True,
    apply_moving_average: bool = False,
    initial_step: int = 0,
    batch_size: int = 0,
    log_steps: int = 0,
    model_dir: Optional[str] = None,
    backup_and_restore: bool = False) -> List[tf.keras.callbacks.Callback]:
  """Builds the list of callbacks selected by the boolean flags."""
  if model_dir is None:
    model_dir = ''
  result = []
  if model_checkpoint:
    result.append(
        tf.keras.callbacks.ModelCheckpoint(
            os.path.join(model_dir, 'model.ckpt-{epoch:04d}'),
            save_weights_only=True,
            verbose=1))
  if backup_and_restore:
    result.append(
        tf.keras.callbacks.experimental.BackupAndRestore(
            os.path.join(model_dir, 'tmp')))
  if include_tensorboard:
    result.append(
        CustomTensorBoard(
            log_dir=model_dir,
            track_lr=track_lr,
            initial_step=initial_step,
            write_images=write_model_weights,
            profile_batch=0))
  if time_history:
    result.append(
        keras_utils.TimeHistory(
            batch_size,
            log_steps,
            logdir=model_dir if include_tensorboard else None))
  if apply_moving_average:
    # Save moving average model to a different file so that
    # we can resume training from a checkpoint
    avg_ckpt_path = os.path.join(model_dir, 'average',
                                 'model.ckpt-{epoch:04d}')
    result.append(
        AverageModelCheckpoint(
            update_weights=False,
            filepath=avg_ckpt_path,
            save_weights_only=True,
            verbose=1))
    result.append(MovingAverageCallback())
  return result
# Example 14
def get_callbacks():
    """Returns common callbacks."""
    callbacks = []
    if FLAGS.enable_time_history:
        # TimeHistory writes summaries only when TensorBoard is also on.
        time_logdir = FLAGS.model_dir if FLAGS.enable_tensorboard else None
        callbacks.append(
            keras_utils.TimeHistory(FLAGS.batch_size, FLAGS.log_steps,
                                    time_logdir))

    if FLAGS.enable_tensorboard:
        callbacks.append(
            tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir))

    return callbacks
# Example 15
def get_callbacks(steps_per_epoch,
                  learning_rate_schedule_fn=None,
                  pruning_method=None,
                  enable_checkpoint_and_export=False,
                  model_dir=None):
    """Returns common callbacks."""
    callbacks = [keras_utils.TimeHistory(FLAGS.batch_size, FLAGS.log_steps)]

    if learning_rate_schedule_fn and not FLAGS.use_tensor_lr:
        callbacks.append(
            LearningRateBatchScheduler(learning_rate_schedule_fn,
                                       batch_size=FLAGS.batch_size,
                                       steps_per_epoch=steps_per_epoch))

    if FLAGS.enable_tensorboard:
        callbacks.append(
            tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir))

    if FLAGS.profile_steps:
        callbacks.append(
            keras_utils.get_profiler_callback(FLAGS.model_dir,
                                              FLAGS.profile_steps,
                                              FLAGS.enable_tensorboard,
                                              steps_per_epoch))

    if pruning_method is not None:
        # Pruning requires the update step; summaries need a log dir.
        callbacks.append(tfmot.sparsity.keras.UpdatePruningStep())
        if model_dir is not None:
            callbacks.append(
                tfmot.sparsity.keras.PruningSummaries(log_dir=model_dir,
                                                      profile_batch=0))

    if enable_checkpoint_and_export and model_dir is not None:
        callbacks.append(
            tf.keras.callbacks.ModelCheckpoint(
                os.path.join(model_dir, 'model.ckpt-{epoch:04d}'),
                save_weights_only=True))
    return callbacks
def get_callbacks(steps_per_epoch):
    """Returns common callbacks."""
    callbacks = []
    if FLAGS.enable_time_history:
        callbacks.append(
            keras_utils.TimeHistory(FLAGS.batch_size, FLAGS.log_steps))

    if FLAGS.enable_tensorboard:
        callbacks.append(
            tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir))

    if FLAGS.profile_steps:
        callbacks.append(
            keras_utils.get_profiler_callback(FLAGS.model_dir,
                                              FLAGS.profile_steps,
                                              FLAGS.enable_tensorboard,
                                              steps_per_epoch))

    return callbacks
# Example 17
    def test_time_history(self):
        """Seven batches at log_steps=3 should record three log entries."""
        history = keras_utils.TimeHistory(batch_size=128, log_steps=3)

        history.on_train_begin()
        for step in range(7):
            history.on_batch_begin(step)
            history.on_batch_end(step)
        history.on_train_end()

        self.assertEqual(3, len(history.timestamp_log))
# Example 18
def main(_):
    """Runs SQuAD training and/or prediction according to FLAGS.mode.

    Args:
        _: unused; present for absl.app.run compatibility.
    """
    # Users should always run this script under TF 2.x
    assert tf.version.VERSION.startswith('2.')

    with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
        input_meta_data = json.loads(reader.read().decode('utf-8'))

    # Export-only mode needs no distribution strategy or training setup.
    if FLAGS.mode == 'export_only':
        export_squad(FLAGS.model_export_path, input_meta_data)
        return

    # Configures cluster spec for multi-worker distribution strategy.
    if FLAGS.num_gpus > 0:
        _ = distribution_utils.configure_cluster(FLAGS.worker_hosts,
                                                 FLAGS.task_index)
    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus,
        all_reduce_alg=FLAGS.all_reduce_alg,
        tpu_address=FLAGS.tpu)
    if FLAGS.mode in ('train', 'train_and_predict'):
        # Only collect per-step timings when log_steps is set.
        if FLAGS.log_steps:
            custom_callbacks = [
                keras_utils.TimeHistory(
                    batch_size=FLAGS.train_batch_size,
                    log_steps=FLAGS.log_steps,
                    logdir=FLAGS.model_dir,
                )
            ]
        else:
            custom_callbacks = None

        train_squad(
            strategy,
            input_meta_data,
            custom_callbacks=custom_callbacks,
            run_eagerly=FLAGS.run_eagerly,
        )
    if FLAGS.mode in ('predict', 'train_and_predict'):
        predict_squad(strategy, input_meta_data)
# Example 19
    def test_build_stats(self):
        """build_stats should merge history, eval output and timing info."""
        history = self._build_history(1.145, cat_accuracy=.99988)
        eval_output = self._build_eval_output(.56432111, 5.990)

        timer = keras_utils.TimeHistory(128, 100)
        timer.timestamp_log = [
            keras_utils.BatchTimestamp(step, step + 1) for step in range(3)
        ]
        timer.train_finish_time = 12345

        stats = common.build_stats(history, eval_output, [timer])

        self.assertEqual(1.145, stats['loss'])
        self.assertEqual(.99988, stats['training_accuracy_top_1'])

        self.assertEqual(.56432111, stats['accuracy_top_1'])
        self.assertEqual(5.990, stats['eval_loss'])

        self.assertEqual(3, stats['step_timestamp_log'][2].timestamp)
        self.assertEqual(12345, stats['train_finish_time'])
# Example 20
def run(args, strategy):
    """Pretrains model using TF2. Adapted from the tensorflow/models Github.

    Args:
        args: parsed argument namespace; must provide bucket/project names,
            model_class, data settings and training hyperparameters.
        strategy: tf.distribute strategy passed to the training loop.

    Raises:
        ValueError: if args.model_class has no PRETRAINED_MODELS entry.
    """
    # CONFIG
    # Use timestamp to generate a unique run name
    run_name = get_run_name(args)
    logger.info(f'*** Starting run {run_name} ***')
    output_dir = f'gs://{args.bucket_name}/{args.project_name}/pretrain/runs/{run_name}'

    # pretrained model path
    try:
        pretrained_model_path = PRETRAINED_MODELS[args.model_class]['bucket_location']
    except KeyError:
        raise ValueError(f'Could not find a pretrained model matching the model class {args.model_class}')
    pretrained_model_config_path = f'gs://{args.bucket_name}/{pretrained_model_path}/bert_config.json'
    # Resume from a previous run's checkpoint when init_checkpoint is given;
    # otherwise start from the stock pretrained checkpoint.
    if args.init_checkpoint is None:
        pretrained_model_checkpoint_path = f'gs://{args.bucket_name}/{pretrained_model_path}/bert_model.ckpt'
    else:
        pretrained_model_checkpoint_path = f'gs://{args.bucket_name}/{args.project_name}/pretrain/runs/{args.init_checkpoint}'

    # some logging
    logger.info(f'Running pretraining of model {args.model_class} on pretrain data {args.pretrain_data}')
    logger.info(f'Initializing model from checkpoint {pretrained_model_checkpoint_path}')

    # load model config based on model_class
    model_config = get_model_config(pretrained_model_config_path)

    # input data function
    train_input_fn = get_dataset_fn(args, _type='train')
    eval_input_fn = None
    eval_metric_fn = None
    if args.do_eval:
        logger.info(f'Setting up evaluation dataset')
        eval_metric_fn = get_eval_metric_fn
        eval_input_fn = get_dataset_fn(args, _type='dev')

    # model_fn
    def _get_pretrained_model(end_lr=0.0):
        """Gets a pretraining model."""
        pretrain_model, core_model = bert_models.pretrain_model(model_config, args.max_seq_length, args.max_predictions_per_seq)
        # warmup_steps is either given directly or derived from
        # warmup_proportion of the total number of training steps.
        if args.warmup_proportion is None:
            warmup_steps = args.warmup_steps
            warmup_proportion_perc = 100 * args.warmup_steps/(args.num_epochs * args.num_steps_per_epoch)
        else:
            warmup_steps = int(args.num_epochs * args.num_steps_per_epoch * args.warmup_proportion)
            warmup_proportion_perc = args.warmup_proportion * 100
        logger.info(f'Running {warmup_steps:,} warmup steps ({warmup_proportion_perc:.2f}% warmup)')
        optimizer = utils.optimizer.create_optimizer(
                args.learning_rate,
                args.num_steps_per_epoch * args.num_epochs,
                warmup_steps,
                args.end_lr,
                args.optimizer_type)
        pretrain_model.optimizer = configure_optimizer(optimizer, use_float16=args.dtype == 'fp16', use_graph_rewrite=False)
        return pretrain_model, core_model

    # custom callbacks
    summary_dir = os.path.join(output_dir, 'summaries')
    time_history_callback = keras_utils.TimeHistory(
        batch_size=args.train_batch_size,
        log_steps=args.time_history_log_steps,
        logdir=summary_dir)
    custom_callbacks = [time_history_callback]

    # Save an initial version of the log file
    data = {
            'created_at': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'run_name': run_name,
            'num_train_steps': args.num_steps_per_epoch * args.num_epochs,
            'eval_steps': args.eval_steps,
            'model_dir': output_dir,
            'output_dir': output_dir,
            **vars(args),
            }
    # write initial training log so a failed run still leaves a record
    f_path_training_log = os.path.join(output_dir, 'run_logs.json')
    logger.info(f'Writing training preliminary log to {f_path_training_log}...')
    save_to_json(data, f_path_training_log)

    # run training loop
    logger.info(f'Run training for {args.num_epochs:,} epochs, {args.num_steps_per_epoch:,} steps each, processing {args.num_epochs*args.num_steps_per_epoch*args.train_batch_size:,} training examples in total...')
    time_start = time.time()
    model_training_utils.run_customized_training_loop(
        strategy=strategy,
        model_fn=_get_pretrained_model,
        loss_fn=get_loss_fn(),
        scale_loss=True,
        model_dir=output_dir,
        train_input_fn=train_input_fn,
        steps_per_epoch=args.num_steps_per_epoch,
        steps_per_loop=args.steps_per_loop,
        epochs=args.num_epochs,
        eval_input_fn=eval_input_fn,
        eval_steps=args.eval_steps,
        metric_fn=eval_metric_fn,
        init_checkpoint=pretrained_model_checkpoint_path,
        load_mlm_nsp_weights = args.load_mlm_nsp_weights,
        set_trainstep = args.set_trainstep,
        custom_callbacks=custom_callbacks,
        run_eagerly=False,
        sub_model_export_name='pretrained/bert_model',
        explicit_allreduce=False,
        pre_allreduce_callbacks=None,
        post_allreduce_callbacks=None)
    time_end = time.time()
    training_time_min = (time_end-time_start)/60
    data['training_time_min'] = training_time_min
    logger.info(f'Finished training after {training_time_min:.1f} min')
    # Write to run directory
    logger.info(f'Writing final training log to {f_path_training_log}...')
    save_to_json(data, f_path_training_log)
    # Write bert config
    f_path_bert_config = os.path.join(output_dir, 'bert_config.json')
    logger.info(f'Writing BERT config to {f_path_bert_config}...')
    save_to_json(model_config.to_dict(), f_path_bert_config)
# Example 21
def run_ncf(_):
  """Run NCF training and eval with Keras.

  Args:
    _: Unused; absl's app runner passes the parsed argv here.

  Returns:
    A stats dictionary produced by `build_stats` from the final training
    loss, the eval results and the timing callback, or None when the
    requested configuration is unsupported (custom training loop on TF 1.x).
  """
  # TODO(seemuch): Support different train and eval batch sizes
  if FLAGS.eval_batch_size != FLAGS.batch_size:
    logging.warning(
        "The Keras implementation of NCF currently does not support batch_size "
        "!= eval_batch_size ({} vs. {}). Overriding eval_batch_size to match "
        "batch_size".format(FLAGS.eval_batch_size, FLAGS.batch_size)
        )
    FLAGS.eval_batch_size = FLAGS.batch_size

  params = ncf_common.parse_flags(FLAGS)

  # The custom training loop below relies on TF 2.x APIs.
  if params["keras_use_ctl"] and int(tf.__version__.split(".")[0]) == 1:
    logging.error(
        "Custom training loop only works with tensorflow 2.0 and above.")
    return

  # ncf_common rounds eval_batch_size (this is needed due to a reshape during
  # eval). This carries over that rounding to batch_size as well. This is the
  # per device batch size
  params["batch_size"] = params["eval_batch_size"]
  batch_size = params["batch_size"]

  num_users, num_items, num_train_steps, num_eval_steps, producer = (
      ncf_common.get_inputs(params))

  params["num_users"], params["num_items"] = num_users, num_items
  # The producer generates training/eval data on a background thread.
  producer.start()
  model_helpers.apply_clean(flags.FLAGS)

  batches_per_step = params["batches_per_step"]
  train_input_dataset, eval_input_dataset = _get_train_and_eval_data(producer,
                                                                     params)
  # It is required that for distributed training, the dataset must call
  # batch(). The parameter of batch() here is the number of replicas involed,
  # such that each replica evenly gets a slice of data.
  train_input_dataset = train_input_dataset.batch(batches_per_step)
  eval_input_dataset = eval_input_dataset.batch(batches_per_step)

  time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
  per_epoch_callback = IncrementEpochCallback(producer)
  callbacks = [per_epoch_callback, time_callback]

  if FLAGS.early_stopping:
    early_stopping_callback = CustomEarlyStopping(
        "val_metric_fn", desired_value=FLAGS.hr_threshold)
    callbacks.append(early_stopping_callback)

  strategy = ncf_common.get_distribution_strategy(params)
  with distribution_utils.get_strategy_scope(strategy):
    keras_model = _get_keras_model(params)
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=params["learning_rate"],
        beta_1=params["beta1"],
        beta_2=params["beta2"],
        epsilon=params["epsilon"])

  # Initialize up front so build_stats() below never sees an unbound name:
  # the Keras-fit branch only assigns train_loss when fit() produced a
  # non-empty history, and the CTL branch only assigns it inside the epoch
  # loop. (NOTE(review): assumes build_stats tolerates a None loss — confirm.)
  train_loss = None

  if params["keras_use_ctl"]:
    # --- Custom training loop (CTL) branch. ---
    loss_object = tf.losses.SparseCategoricalCrossentropy(
        reduction=tf.keras.losses.Reduction.SUM,
        from_logits=True)
    train_input_iterator = strategy.make_dataset_iterator(train_input_dataset)
    eval_input_iterator = strategy.make_dataset_iterator(eval_input_dataset)

    @tf.function
    def train_step():
      """Called once per step to train the model."""
      def step_fn(inputs):
        """Computes loss and applied gradient per replica."""
        features, labels = inputs
        with tf.GradientTape() as tape:
          softmax_logits = keras_model(features)
          loss = loss_object(labels, softmax_logits,
                             sample_weight=features[rconst.VALID_POINT_MASK])
          # Normalize the SUM-reduced loss by the global batch size.
          loss *= (1.0 / (batch_size*strategy.num_replicas_in_sync))

        grads = tape.gradient(loss, keras_model.trainable_variables)
        optimizer.apply_gradients(zip(grads,
                                      keras_model.trainable_variables))
        return loss

      per_replica_losses = strategy.experimental_run(step_fn,
                                                     train_input_iterator)
      mean_loss = strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
      return mean_loss

    @tf.function
    def eval_step():
      """Called once per eval step to compute eval metrics."""
      def step_fn(inputs):
        """Computes eval metrics per replica."""
        features, _ = inputs
        softmax_logits = keras_model(features)
        in_top_k, metric_weights = metric_fn(
            softmax_logits, features[rconst.DUPLICATE_MASK], params)
        hr_sum = tf.reduce_sum(in_top_k*metric_weights)
        hr_count = tf.reduce_sum(metric_weights)
        return hr_sum, hr_count

      per_replica_hr_sum, per_replica_hr_count = (
          strategy.experimental_run(step_fn, eval_input_iterator))
      hr_sum = strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_hr_sum, axis=None)
      hr_count = strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_hr_count, axis=None)
      return hr_sum, hr_count

    # Drive the timing callback manually since fit() is not used here.
    # NOTE(review): this branch assumes FLAGS.train_epochs >= 1; with zero
    # epochs, hr_sum/hr_count below would be unbound.
    time_callback.on_train_begin()
    for epoch in range(FLAGS.train_epochs):
      per_epoch_callback.on_epoch_begin(epoch)
      train_input_iterator.initialize()
      train_loss = 0
      for step in range(num_train_steps):
        time_callback.on_batch_begin(step+epoch*num_train_steps)
        train_loss += train_step()
        time_callback.on_batch_end(step+epoch*num_train_steps)
      train_loss /= num_train_steps
      logging.info("Done training epoch %s, epoch loss=%s.",
                   epoch+1, train_loss)
      eval_input_iterator.initialize()
      hr_sum = 0
      hr_count = 0
      for _ in range(num_eval_steps):
        step_hr_sum, step_hr_count = eval_step()
        hr_sum += step_hr_sum
        hr_count += step_hr_count
      logging.info("Done eval epoch %s, hr=%s.", epoch+1, hr_sum/hr_count)

      # Stop early once the hit rate clears the configured threshold.
      if (FLAGS.early_stopping and
          float(hr_sum/hr_count) > params["hr_threshold"]):
        break

    time_callback.on_train_end()
    eval_results = [None, hr_sum/hr_count]

  else:
    # --- Standard Keras compile/fit branch. ---
    with distribution_utils.get_strategy_scope(strategy):

      keras_model.compile(optimizer=optimizer)

      history = keras_model.fit(train_input_dataset,
                                steps_per_epoch=num_train_steps,
                                epochs=FLAGS.train_epochs,
                                callbacks=callbacks,
                                validation_data=eval_input_dataset,
                                validation_steps=num_eval_steps,
                                verbose=2)

      logging.info("Training done. Start evaluating")

      eval_results = keras_model.evaluate(
          eval_input_dataset,
          steps=num_eval_steps,
          verbose=2)

      logging.info("Keras evaluation is done.")

    if history and history.history:
      train_history = history.history
      train_loss = train_history["loss"][-1]

  stats = build_stats(train_loss, eval_results, time_callback)
  return stats
Exemplo n.º 22
0
def run_ncf(_):
    """Run NCF training and eval with Keras.

    Supports both a custom training loop (delegated to
    run_ncf_custom_training) and the standard compile/fit path, with
    optional TPU, fp16 graph-rewrite, and online input generation.
    Returns the stats dict from build_stats, or None when the flag
    combination is unsupported.
    """

    keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

    # Seed TF's RNG for reproducible runs when a seed flag is given.
    if FLAGS.seed is not None:
        print("Setting tf seed")
        tf.random.set_seed(FLAGS.seed)

    params = ncf_common.parse_flags(FLAGS)
    model_helpers.apply_clean(flags.FLAGS)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus,
        tpu_address=FLAGS.tpu)
    params["distribute_strategy"] = strategy

    # Reject flag combinations this implementation cannot run.
    if not keras_utils.is_v2_0() and strategy is not None:
        logging.error(
            "NCF Keras only works with distribution strategy in TF 2.0")
        return
    if (params["keras_use_ctl"]
            and (not keras_utils.is_v2_0() or strategy is None)):
        logging.error(
            "Custom training loop only works with tensorflow 2.0 and dist strat."
        )
        return
    if params["use_tpu"] and not params["keras_use_ctl"]:
        logging.error(
            "Custom training loop must be used when using TPUStrategy.")
        return

    batch_size = params["batch_size"]
    time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
    callbacks = [time_callback]

    producer, input_meta_data = None, None
    # No train dataset path means inputs are generated on the fly.
    generate_input_online = params["train_dataset_path"] is None

    if generate_input_online:
        # Start data producing thread.
        num_users, num_items, _, _, producer = ncf_common.get_inputs(params)
        producer.start()
        per_epoch_callback = IncrementEpochCallback(producer)
        callbacks.append(per_epoch_callback)
    else:
        # Pre-materialized datasets: read user/item counts from metadata.
        assert params["eval_dataset_path"] and params["input_meta_data_path"]
        with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
            input_meta_data = json.loads(reader.read().decode("utf-8"))
            num_users = input_meta_data["num_users"]
            num_items = input_meta_data["num_items"]

    params["num_users"], params["num_items"] = num_users, num_items

    if FLAGS.early_stopping:
        early_stopping_callback = CustomEarlyStopping(
            "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
        callbacks.append(early_stopping_callback)

    # For a remote TPU, host-side ops must be pinned to the primary CPU task.
    use_remote_tpu = params["use_tpu"] and FLAGS.tpu
    primary_cpu_task = tpu_lib.get_primary_cpu_task(use_remote_tpu)

    with tf.device(primary_cpu_task):
        (train_input_dataset, eval_input_dataset,
         num_train_steps, num_eval_steps) = \
          (ncf_input_pipeline.create_ncf_input_data(
              params, producer, input_meta_data, strategy))
        # Online generation yields indefinitely, so let fit() infer epoch size.
        steps_per_epoch = None if generate_input_online else num_train_steps

        with distribution_utils.get_strategy_scope(strategy):
            keras_model = _get_keras_model(params)
            optimizer = tf.keras.optimizers.Adam(
                learning_rate=params["learning_rate"],
                beta_1=params["beta1"],
                beta_2=params["beta2"],
                epsilon=params["epsilon"])
            if FLAGS.dtype == "fp16":
                # Wrap the optimizer for mixed precision via graph rewrite.
                optimizer = \
                  tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                      optimizer,
                      loss_scale=flags_core.get_loss_scale(FLAGS,
                                                           default_for_fp16="dynamic"))

            if params["keras_use_ctl"]:
                train_loss, eval_results = run_ncf_custom_training(
                    params,
                    strategy,
                    keras_model,
                    optimizer,
                    callbacks,
                    train_input_dataset,
                    eval_input_dataset,
                    num_train_steps,
                    num_eval_steps,
                    generate_input_online=generate_input_online)
            else:
                # TODO(b/138957587): Remove when force_v2_in_keras_compile is on longer
                # a valid arg for this model. Also remove as a valid flag.
                if FLAGS.force_v2_in_keras_compile is not None:
                    keras_model.compile(optimizer=optimizer,
                                        run_eagerly=FLAGS.run_eagerly,
                                        experimental_run_tf_function=FLAGS.
                                        force_v2_in_keras_compile)
                else:
                    keras_model.compile(optimizer=optimizer,
                                        run_eagerly=FLAGS.run_eagerly)

                history = keras_model.fit(train_input_dataset,
                                          epochs=FLAGS.train_epochs,
                                          steps_per_epoch=steps_per_epoch,
                                          callbacks=callbacks,
                                          validation_data=eval_input_dataset,
                                          validation_steps=num_eval_steps,
                                          verbose=2)

                logging.info("Training done. Start evaluating")

                eval_loss_and_metrics = keras_model.evaluate(
                    eval_input_dataset, steps=num_eval_steps, verbose=2)

                logging.info("Keras evaluation is done.")

                # Keras evaluate() API returns scalar loss and metric values from
                # evaluation as a list. Here, the returned list would contain
                # [evaluation loss, hr sum, hr count].
                eval_hit_rate = eval_loss_and_metrics[
                    1] / eval_loss_and_metrics[2]

                # Format evaluation result into [eval loss, eval hit accuracy].
                eval_results = [eval_loss_and_metrics[0], eval_hit_rate]

                # NOTE(review): train_loss is only bound when fit() produced a
                # non-empty history; build_stats below would raise NameError
                # otherwise — consider initializing train_loss before branching.
                if history and history.history:
                    train_history = history.history
                    train_loss = train_history["loss"][-1]

        stats = build_stats(train_loss, eval_results, time_callback)
        return stats
Exemplo n.º 23
0
def run(flags_obj):
    """Run ResNet ImageNet training and eval loop using custom training loops.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.

  Returns:
    Dictionary of training and eval stats.
  """
    keras_utils.set_session_config(enable_eager=flags_obj.enable_eager,
                                   enable_xla=flags_obj.enable_xla)

    # TODO(anj-s): Set data_format without using Keras.
    # Default the image layout from the build: channels_first on CUDA builds.
    data_format = flags_obj.data_format
    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')
    tf.keras.backend.set_image_data_format(data_format)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=flags_obj.num_gpus,
        num_workers=distribution_utils.configure_cluster(),
        all_reduce_alg=flags_obj.all_reduce_alg,
        num_packs=flags_obj.num_packs)

    train_ds, test_ds = get_input_dataset(flags_obj, strategy)
    train_steps, train_epochs, eval_steps = get_num_train_iterations(flags_obj)

    time_callback = keras_utils.TimeHistory(flags_obj.batch_size,
                                            flags_obj.log_steps)

    strategy_scope = distribution_utils.get_strategy_scope(strategy)
    with strategy_scope:
        model = resnet_model.resnet50(
            num_classes=imagenet_preprocessing.NUM_CLASSES,
            batch_size=flags_obj.batch_size,
            use_l2_regularizer=not flags_obj.single_l2_loss_op)

        optimizer = tf.keras.optimizers.SGD(
            learning_rate=common.BASE_LEARNING_RATE,
            momentum=0.9,
            nesterov=True)

        if flags_obj.fp16_implementation == "graph_rewrite":
            # The graph rewrite only takes effect inside tf.function graphs.
            if not flags_obj.use_tf_function:
                raise ValueError(
                    "--fp16_implementation=graph_rewrite requires "
                    "--use_tf_function to be true")
            loss_scale = flags_core.get_loss_scale(flags_obj,
                                                   default_for_fp16=128)
            optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
                optimizer, loss_scale)

        training_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            'training_accuracy', dtype=tf.float32)
        test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
        test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            'test_accuracy', dtype=tf.float32)

        trainable_variables = model.trainable_variables

        def train_step(train_ds_inputs):
            """Training StepFn."""
            def step_fn(inputs):
                """Per-Replica StepFn."""
                images, labels = inputs
                with tf.GradientTape() as tape:
                    logits = model(images, training=True)

                    # SUM-reduce then divide by the batch size flag
                    # (presumably the global batch size — confirm).
                    prediction_loss = tf.keras.losses.sparse_categorical_crossentropy(
                        labels, logits)
                    loss = tf.reduce_sum(prediction_loss) * (
                        1.0 / flags_obj.batch_size)
                    num_replicas = tf.distribute.get_strategy(
                    ).num_replicas_in_sync

                    if flags_obj.single_l2_loss_op:
                        # One fused L2 term over all non-batch-norm variables.
                        filtered_variables = [
                            tf.reshape(v, (-1, )) for v in trainable_variables
                            if 'bn' not in v.name
                        ]
                        l2_loss = resnet_model.L2_WEIGHT_DECAY * 2 * tf.nn.l2_loss(
                            tf.concat(filtered_variables, axis=0))
                        loss += (l2_loss / num_replicas)
                    else:
                        loss += (tf.reduce_sum(model.losses) / num_replicas)

                    # Scale the loss
                    if flags_obj.dtype == "fp16":
                        loss = optimizer.get_scaled_loss(loss)

                grads = tape.gradient(loss, trainable_variables)

                # Unscale the grads
                if flags_obj.dtype == "fp16":
                    grads = optimizer.get_unscaled_gradients(grads)

                optimizer.apply_gradients(zip(grads, trainable_variables))

                training_accuracy.update_state(labels, logits)
                return loss

            if strategy:
                per_replica_losses = strategy.experimental_run_v2(
                    step_fn, args=(train_ds_inputs, ))
                return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                       per_replica_losses,
                                       axis=None)
            else:
                return step_fn(train_ds_inputs)

        def test_step(test_ds_inputs):
            """Evaluation StepFn."""
            def step_fn(inputs):
                images, labels = inputs
                logits = model(images, training=False)
                loss = tf.keras.losses.sparse_categorical_crossentropy(
                    labels, logits)
                loss = tf.reduce_sum(loss) * (1.0 / flags_obj.batch_size)
                test_loss.update_state(loss)
                test_accuracy.update_state(labels, logits)

            if strategy:
                strategy.experimental_run_v2(step_fn, args=(test_ds_inputs, ))
            else:
                step_fn(test_ds_inputs)

        # Compile the step functions to graphs only when requested.
        if flags_obj.use_tf_function:
            train_step = tf.function(train_step)
            test_step = tf.function(test_step)

        time_callback.on_train_begin()
        for epoch in range(train_epochs):

            train_iter = iter(train_ds)
            total_loss = 0.0
            training_accuracy.reset_states()

            for step in range(train_steps):
                # Manually applied per-step learning-rate schedule.
                optimizer.lr = common.learning_rate_schedule(
                    epoch, step, train_steps, flags_obj.batch_size)

                time_callback.on_batch_begin(step + epoch * train_steps)
                total_loss += train_step(next(train_iter))
                time_callback.on_batch_end(step + epoch * train_steps)

            train_loss = total_loss / train_steps
            logging.info('Training loss: %s, accuracy: %s%% at epoch: %d',
                         train_loss.numpy(),
                         training_accuracy.result().numpy(), epoch)

            if (not flags_obj.skip_eval
                    and (epoch + 1) % flags_obj.epochs_between_evals == 0):
                test_loss.reset_states()
                test_accuracy.reset_states()

                test_iter = iter(test_ds)
                for _ in range(eval_steps):
                    test_step(next(test_iter))

                logging.info('Test loss: %s, accuracy: %s%% at epoch: %d',
                             test_loss.result().numpy(),
                             test_accuracy.result().numpy(), epoch)

        time_callback.on_train_end()

        # Final numbers come from the last epoch's metric state.
        eval_result = None
        train_result = None
        if not flags_obj.skip_eval:
            eval_result = [
                test_loss.result().numpy(),
                test_accuracy.result().numpy()
            ]
            train_result = [
                train_loss.numpy(),
                training_accuracy.result().numpy()
            ]

        stats = build_stats(train_result, eval_result, time_callback)
        return stats
Exemplo n.º 24
0
def run(callbacks=None):
    """Assemble params and input pipelines from FLAGS, then run the executor.

    Args:
        callbacks: Optional list of Keras callbacks. When FLAGS.log_steps is
            set, a TimeHistory callback is appended to it (the list is
            mutated in place, matching the caller-visible contract).

    Returns:
        The result of run_executor() for the requested FLAGS.mode.

    Raises:
        ValueError: If neither a training nor an eval file pattern is given.
    """
    keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

    # Start from the model's default config, then layer the file-based and
    # command-line overrides on top, in that order.
    params = config_factory.config_generator(FLAGS.model)
    for override_source in (FLAGS.config_file, FLAGS.params_override):
        params = params_dict.override_params_dict(
            params, override_source, is_strict=True)
    params.override(
        {
            'strategy_type': FLAGS.strategy_type,
            'model_dir': FLAGS.model_dir,
            'strategy_config': executor.strategy_flags_dict(),
        },
        is_strict=False)

    # Make sure use_tpu and strategy_type are in sync.
    params.use_tpu = (params.strategy_type == 'tpu')
    if not params.use_tpu:
        # bfloat16 and sync batch norm are TPU-only features here.
        params.override(
            {
                'architecture': {
                    'use_bfloat16': False,
                },
                'norm_activation': {
                    'use_sync_bn': False,
                },
            },
            is_strict=True)

    params.validate()
    params.lock()
    logging.info('Model Parameters: %s',
                 pprint.PrettyPrinter().pformat(params.as_dict()))

    # FLAGS take precedence over patterns baked into the config.
    train_pattern = FLAGS.training_file_pattern or params.train.train_file_pattern
    eval_pattern = FLAGS.eval_file_pattern or params.eval.eval_file_pattern
    if not (train_pattern or eval_pattern):
        raise ValueError(
            'Must provide at least one of training_file_pattern and '
            'eval_file_pattern.')

    train_input_fn = None
    eval_input_fn = None
    if train_pattern:
        # Use global batch size for single host.
        train_input_fn = input_reader.InputFn(
            file_pattern=train_pattern,
            params=params,
            mode=input_reader.ModeKeys.TRAIN,
            batch_size=params.train.batch_size)
    if eval_pattern:
        eval_input_fn = input_reader.InputFn(
            file_pattern=eval_pattern,
            params=params,
            mode=input_reader.ModeKeys.PREDICT_WITH_GT,
            batch_size=params.eval.batch_size,
            num_examples=params.eval.eval_samples)

    if callbacks is None:
        callbacks = []
    if FLAGS.log_steps:
        callbacks.append(
            keras_utils.TimeHistory(
                batch_size=params.train.batch_size,
                log_steps=FLAGS.log_steps,
            ))

    return run_executor(params,
                        FLAGS.mode,
                        checkpoint_path=FLAGS.checkpoint_path,
                        train_input_fn=train_input_fn,
                        eval_input_fn=eval_input_fn,
                        callbacks=callbacks)
Exemplo n.º 25
0
def run(flags_obj):
    """Run ResNet ImageNet training and eval loop using custom training loops.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.

  Returns:
    Dictionary of training and eval stats.
  """
    keras_utils.set_session_config(enable_eager=flags_obj.enable_eager,
                                   enable_xla=flags_obj.enable_xla)

    dtype = flags_core.get_tf_dtype(flags_obj)
    if dtype == tf.bfloat16:
        # bfloat16 runs through the Keras mixed-precision policy machinery.
        policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
            'mixed_bfloat16')
        tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)

    # TODO(anj-s): Set data_format without using Keras.
    # Default the image layout from the build: channels_first on CUDA builds.
    data_format = flags_obj.data_format
    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')
    tf.keras.backend.set_image_data_format(data_format)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=flags_obj.num_gpus,
        num_workers=distribution_utils.configure_cluster(),
        all_reduce_alg=flags_obj.all_reduce_alg,
        num_packs=flags_obj.num_packs,
        tpu_address=flags_obj.tpu)

    train_ds, test_ds = get_input_dataset(flags_obj, strategy)
    per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations(
        flags_obj)
    # Never run more steps per inner loop than one epoch contains.
    steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)
    logging.info(
        "Training %d epochs, each epoch has %d steps, "
        "total steps: %d; Eval %d steps", train_epochs, per_epoch_steps,
        train_epochs * per_epoch_steps, eval_steps)

    time_callback = keras_utils.TimeHistory(flags_obj.batch_size,
                                            flags_obj.log_steps)

    with distribution_utils.get_strategy_scope(strategy):
        model = resnet_model.resnet50(
            num_classes=imagenet_preprocessing.NUM_CLASSES,
            batch_size=flags_obj.batch_size,
            use_l2_regularizer=not flags_obj.single_l2_loss_op)

        lr_schedule = common.PiecewiseConstantDecayWithWarmup(
            batch_size=flags_obj.batch_size,
            epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
            warmup_epochs=common.LR_SCHEDULE[0][1],
            boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
            multipliers=list(p[0] for p in common.LR_SCHEDULE),
            compute_lr_on_cpu=True)
        optimizer = common.get_optimizer(lr_schedule)

        if flags_obj.fp16_implementation == 'graph_rewrite':
            # The graph rewrite only takes effect inside tf.function graphs.
            if not flags_obj.use_tf_function:
                raise ValueError(
                    '--fp16_implementation=graph_rewrite requires '
                    '--use_tf_function to be true')
            loss_scale = flags_core.get_loss_scale(flags_obj,
                                                   default_for_fp16=128)
            optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
                optimizer, loss_scale)

        train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
        training_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            'training_accuracy', dtype=tf.float32)
        test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
        test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            'test_accuracy', dtype=tf.float32)

        trainable_variables = model.trainable_variables

        def step_fn(inputs):
            """Per-Replica StepFn."""
            images, labels = inputs
            with tf.GradientTape() as tape:
                logits = model(images, training=True)

                # SUM-reduce then divide by the batch size flag
                # (presumably the global batch size — confirm).
                prediction_loss = tf.keras.losses.sparse_categorical_crossentropy(
                    labels, logits)
                loss = tf.reduce_sum(prediction_loss) * (1.0 /
                                                         flags_obj.batch_size)
                num_replicas = tf.distribute.get_strategy(
                ).num_replicas_in_sync

                if flags_obj.single_l2_loss_op:
                    # One fused L2 term over all non-batch-norm variables.
                    filtered_variables = [
                        tf.reshape(v, (-1, )) for v in trainable_variables
                        if 'bn' not in v.name
                    ]
                    l2_loss = resnet_model.L2_WEIGHT_DECAY * 2 * tf.nn.l2_loss(
                        tf.concat(filtered_variables, axis=0))
                    loss += (l2_loss / num_replicas)
                else:
                    loss += (tf.reduce_sum(model.losses) / num_replicas)

                # Scale the loss
                if flags_obj.dtype == "fp16":
                    loss = optimizer.get_scaled_loss(loss)

            grads = tape.gradient(loss, trainable_variables)

            # Unscale the grads
            if flags_obj.dtype == "fp16":
                grads = optimizer.get_unscaled_gradients(grads)

            optimizer.apply_gradients(zip(grads, trainable_variables))
            train_loss.update_state(loss)
            training_accuracy.update_state(labels, logits)

        @tf.function
        def train_steps(iterator, steps):
            """Performs distributed training steps in a loop."""
            for _ in tf.range(steps):
                strategy.experimental_run_v2(step_fn, args=(next(iterator), ))

        def train_single_step(iterator):
            # Fallback path: one step at a time (also used without a strategy).
            if strategy:
                strategy.experimental_run_v2(step_fn, args=(next(iterator), ))
            else:
                return step_fn(next(iterator))

        def test_step(iterator):
            """Evaluation StepFn."""
            def step_fn(inputs):
                images, labels = inputs
                logits = model(images, training=False)
                loss = tf.keras.losses.sparse_categorical_crossentropy(
                    labels, logits)
                loss = tf.reduce_sum(loss) * (1.0 / flags_obj.batch_size)
                test_loss.update_state(loss)
                test_accuracy.update_state(labels, logits)

            if strategy:
                strategy.experimental_run_v2(step_fn, args=(next(iterator), ))
            else:
                step_fn(next(iterator))

        if flags_obj.use_tf_function:
            train_single_step = tf.function(train_single_step)
            test_step = tf.function(test_step)

        # A single iterator spans all epochs; the dataset is expected to
        # yield enough batches for train_epochs * per_epoch_steps steps.
        train_iter = iter(train_ds)
        time_callback.on_train_begin()
        for epoch in range(train_epochs):
            train_loss.reset_states()
            training_accuracy.reset_states()

            steps_in_current_epoch = 0
            while steps_in_current_epoch < per_epoch_steps:
                time_callback.on_batch_begin(steps_in_current_epoch +
                                             epoch * per_epoch_steps)
                steps = _steps_to_run(steps_in_current_epoch, per_epoch_steps,
                                      steps_per_loop)
                if steps == 1:
                    train_single_step(train_iter)
                else:
                    # Converts steps to a Tensor to avoid tf.function retracing.
                    train_steps(train_iter,
                                tf.convert_to_tensor(steps, dtype=tf.int32))
                time_callback.on_batch_end(steps_in_current_epoch +
                                           epoch * per_epoch_steps)
                steps_in_current_epoch += steps

            logging.info('Training loss: %s, accuracy: %s at epoch %d',
                         train_loss.result().numpy(),
                         training_accuracy.result().numpy(), epoch + 1)

            if (not flags_obj.skip_eval
                    and (epoch + 1) % flags_obj.epochs_between_evals == 0):
                test_loss.reset_states()
                test_accuracy.reset_states()

                test_iter = iter(test_ds)
                for _ in range(eval_steps):
                    test_step(test_iter)

                logging.info('Test loss: %s, accuracy: %s%% at epoch: %d',
                             test_loss.result().numpy(),
                             test_accuracy.result().numpy(), epoch + 1)

        time_callback.on_train_end()

        # Final numbers come from the last epoch's metric state.
        eval_result = None
        train_result = None
        if not flags_obj.skip_eval:
            eval_result = [
                test_loss.result().numpy(),
                test_accuracy.result().numpy()
            ]
            train_result = [
                train_loss.result().numpy(),
                training_accuracy.result().numpy()
            ]

        stats = build_stats(train_result, eval_result, time_callback)
        return stats
Exemplo n.º 26
0
def run_bert(strategy,
             input_meta_data,
             model_config,
             train_input_fn=None,
             eval_input_fn=None):
    """Run BERT training.

    Either exports a previously trained classifier (mode == 'export_only')
    or trains and evaluates one via run_bert_classifier, optionally
    exporting the trained model afterwards.

    Args:
      strategy: A tf.distribute strategy; required for training.
      input_meta_data: Dict with 'train_data_size' and 'eval_data_size'.
      model_config: BERT model configuration.
      train_input_fn: Optional input fn producing the training dataset.
      eval_input_fn: Optional input fn producing the eval dataset.

    Returns:
      The trained Keras model (None when mode == 'export_only').

    Raises:
      ValueError: If the mode flag is unsupported or no strategy is given.
    """
    if FLAGS.mode == 'export_only':
        # As Keras ModelCheckpoint callback used with Keras compile/fit() API
        # internally uses model.save_weights() to save checkpoints, we must
        # use model.load_weights() when Keras compile/fit() is used.
        export_classifier(FLAGS.model_export_path, input_meta_data,
                          FLAGS.use_keras_compile_fit, model_config,
                          FLAGS.model_dir)
        return

    if FLAGS.mode != 'train_and_eval':
        raise ValueError('Unsupported mode is specified: %s' % FLAGS.mode)
    # Enables XLA in Session Config. Should not be set for TPU.
    keras_utils.set_config_v2(FLAGS.enable_xla)
    performance.set_mixed_precision_policy(common_flags.dtype())

    num_epochs = FLAGS.num_train_epochs
    num_train_examples = input_meta_data['train_data_size']
    train_steps_per_epoch = int(num_train_examples / FLAGS.train_batch_size)
    # Warm up over the first 10% of total training examples.
    num_warmup_steps = int(
        num_epochs * num_train_examples * 0.1 / FLAGS.train_batch_size)
    num_eval_steps = int(
        math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size))

    if not strategy:
        raise ValueError('Distribution strategy has not been specified.')

    # Attach a throughput logger only when step logging is requested.
    custom_callbacks = None
    if FLAGS.log_steps:
        throughput_callback = keras_utils.TimeHistory(
            batch_size=FLAGS.train_batch_size,
            log_steps=FLAGS.log_steps,
            logdir=FLAGS.model_dir,
        )
        custom_callbacks = [throughput_callback]

    trained_model = run_bert_classifier(
        strategy,
        model_config,
        input_meta_data,
        FLAGS.model_dir,
        num_epochs,
        train_steps_per_epoch,
        FLAGS.steps_per_loop,
        num_eval_steps,
        num_warmup_steps,
        FLAGS.learning_rate,
        FLAGS.init_checkpoint,
        train_input_fn,
        eval_input_fn,
        run_eagerly=FLAGS.run_eagerly,
        use_keras_compile_fit=FLAGS.use_keras_compile_fit,
        custom_callbacks=custom_callbacks)

    if FLAGS.model_export_path:
        # As Keras ModelCheckpoint callback used with Keras compile/fit() API
        # internally uses model.save_weights() to save checkpoints, we must
        # use model.load_weights() when Keras compile/fit() is used.
        model_saving_utils.export_bert_model(
            FLAGS.model_export_path,
            model=trained_model,
            restore_model_using_load_weights=FLAGS.use_keras_compile_fit)
    return trained_model
Exemplo n.º 27
0
def run_ncf(_):
    """Run NCF training and eval with Keras.

    Trains the Neural Collaborative Filtering model either with a custom
    training loop (when params["keras_use_ctl"] is set) or with Keras
    compile/fit, then evaluates and summarizes results via build_stats.

    Args:
      _: Unused positional argument (absl app.run passes argv here).

    Returns:
      The stats object produced by build_stats(train_loss, eval_results,
      time_callback), or None when the configuration is unsupported.
    """

    keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

    # Optional determinism: seed TF's global RNG when requested.
    if FLAGS.seed is not None:
        print("Setting tf seed")
        tf.random.set_seed(FLAGS.seed)

    # TODO(seemuch): Support different train and eval batch sizes
    if FLAGS.eval_batch_size != FLAGS.batch_size:
        logging.warning(
            "The Keras implementation of NCF currently does not support batch_size "
            "!= eval_batch_size ({} vs. {}). Overriding eval_batch_size to match "
            "batch_size".format(FLAGS.eval_batch_size, FLAGS.batch_size))
        FLAGS.eval_batch_size = FLAGS.batch_size

    params = ncf_common.parse_flags(FLAGS)
    model_helpers.apply_clean(flags.FLAGS)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus)
    params["distribute_strategy"] = strategy

    # Unsupported configurations bail out early with an error log.
    if not keras_utils.is_v2_0() and strategy is not None:
        logging.error(
            "NCF Keras only works with distribution strategy in TF 2.0")
        return

    if (params["keras_use_ctl"]
            and (not keras_utils.is_v2_0() or strategy is None)):
        logging.error(
            "Custom training loop only works with tensorflow 2.0 and dist strat."
        )
        return

    # ncf_common rounds eval_batch_size (this is needed due to a reshape during
    # eval). This carries over that rounding to batch_size as well. This is the
    # per device batch size
    params["batch_size"] = params["eval_batch_size"]
    batch_size = params["batch_size"]

    # Tracks step timing / examples-per-second; used by both training
    # paths and by build_stats at the end.
    time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
    callbacks = [time_callback]

    producer, input_meta_data = None, None
    # Online generation is used when no pre-generated TF record path is given.
    generate_input_online = params["train_dataset_path"] is None

    if generate_input_online:
        # Start data producing thread.
        num_users, num_items, num_train_steps, num_eval_steps, producer = (
            ncf_common.get_inputs(params))
        producer.start()
        per_epoch_callback = IncrementEpochCallback(producer)
        callbacks.append(per_epoch_callback)
    else:
        assert params["eval_dataset_path"] and params["input_meta_data_path"]
        with tf.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
            input_meta_data = json.loads(reader.read().decode("utf-8"))
            num_users = input_meta_data["num_users"]
            num_items = input_meta_data["num_items"]

    params["num_users"], params["num_items"] = num_users, num_items
    (train_input_dataset, eval_input_dataset, num_train_steps, num_eval_steps) = \
        (ncf_input_pipeline.create_ncf_input_data(
            params, producer, input_meta_data))
    # With online generation the dataset signals its own epoch boundary.
    steps_per_epoch = None if generate_input_online else num_train_steps

    if FLAGS.early_stopping:
        early_stopping_callback = CustomEarlyStopping(
            "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
        callbacks.append(early_stopping_callback)
    with distribution_utils.get_strategy_scope(strategy):
        keras_model = _get_keras_model(params)
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=params["learning_rate"],
            beta_1=params["beta1"],
            beta_2=params["beta2"],
            epsilon=params["epsilon"])

    if params["keras_use_ctl"]:
        # Custom training loop path: manual per-step loops over distributed
        # iterators, with the loss reduced across replicas by hand.
        loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
            reduction="sum", from_logits=True)
        train_input_iterator = strategy.make_dataset_iterator(
            train_input_dataset)
        eval_input_iterator = strategy.make_dataset_iterator(
            eval_input_dataset)

        @tf.function
        def train_step():
            """Called once per step to train the model."""
            def step_fn(features):
                """Computes loss and applied gradient per replica."""
                with tf.GradientTape() as tape:
                    softmax_logits = keras_model(features)
                    labels = features[rconst.TRAIN_LABEL_KEY]
                    loss = loss_object(
                        labels,
                        softmax_logits,
                        sample_weight=features[rconst.VALID_POINT_MASK])
                    # Normalize the summed loss by the global batch size
                    # (per-replica batch size * number of replicas).
                    loss *= (1.0 /
                             (batch_size * strategy.num_replicas_in_sync))

                grads = tape.gradient(loss, keras_model.trainable_variables)
                # Converting gradients to dense form helps in perf on GPU for NCF
                grads = neumf_model.sparse_to_dense_grads(
                    list(zip(grads, keras_model.trainable_variables)))
                optimizer.apply_gradients(grads)
                return loss

            per_replica_losses = strategy.experimental_run(
                step_fn, train_input_iterator)
            mean_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                        per_replica_losses,
                                        axis=None)
            return mean_loss

        @tf.function
        def eval_step():
            """Called once per eval step to compute eval metrics."""
            def step_fn(features):
                """Computes eval metrics per replica."""
                softmax_logits = keras_model(features)
                in_top_k, metric_weights = metric_fn(
                    softmax_logits, features[rconst.DUPLICATE_MASK], params)
                hr_sum = tf.reduce_sum(in_top_k * metric_weights)
                hr_count = tf.reduce_sum(metric_weights)
                return hr_sum, hr_count

            per_replica_hr_sum, per_replica_hr_count = (
                strategy.experimental_run(step_fn, eval_input_iterator))
            hr_sum = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                     per_replica_hr_sum,
                                     axis=None)
            hr_count = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                       per_replica_hr_count,
                                       axis=None)
            return hr_sum, hr_count

        time_callback.on_train_begin()
        for epoch in range(FLAGS.train_epochs):
            for cb in callbacks:
                cb.on_epoch_begin(epoch)

            # As NCF dataset is sampled with randomness, not repeating
            # data elements in each epoch has significant impact on
            # convergence. As so, offline-generated TF record files
            # contains all epoch worth of data. Thus we do not need
            # to initialize dataset when reading from tf record files.
            if generate_input_online:
                train_input_iterator.initialize()

            train_loss = 0
            for step in range(num_train_steps):
                time_callback.on_batch_begin(step + epoch * num_train_steps)
                train_loss += train_step()
                time_callback.on_batch_end(step + epoch * num_train_steps)
            train_loss /= num_train_steps
            logging.info("Done training epoch %s, epoch loss=%s.", epoch + 1,
                         train_loss)
            eval_input_iterator.initialize()
            hr_sum = 0
            hr_count = 0
            for _ in range(num_eval_steps):
                step_hr_sum, step_hr_count = eval_step()
                hr_sum += step_hr_sum
                hr_count += step_hr_count
            logging.info("Done eval epoch %s, hr=%s.", epoch + 1,
                         hr_sum / hr_count)

            # Stop once the hit rate clears the configured threshold.
            if (FLAGS.early_stopping
                    and float(hr_sum / hr_count) > params["hr_threshold"]):
                break

        time_callback.on_train_end()
        # The CTL path computes only the hit rate; no eval loss is produced.
        eval_results = [None, hr_sum / hr_count]

    else:
        with distribution_utils.get_strategy_scope(strategy):

            keras_model.compile(
                optimizer=optimizer,
                run_eagerly=FLAGS.run_eagerly,
                run_distributed=FLAGS.force_v2_in_keras_compile)

            history = keras_model.fit(train_input_dataset,
                                      epochs=FLAGS.train_epochs,
                                      steps_per_epoch=steps_per_epoch,
                                      callbacks=callbacks,
                                      validation_data=eval_input_dataset,
                                      validation_steps=num_eval_steps,
                                      verbose=2)

            logging.info("Training done. Start evaluating")

            eval_results = keras_model.evaluate(eval_input_dataset,
                                                steps=num_eval_steps,
                                                verbose=2)

            logging.info("Keras evaluation is done.")

        # NOTE(review): train_loss is only assigned when fit() returns a
        # non-empty history; an empty history would leave it unbound and
        # build_stats below would raise NameError — confirm build_stats'
        # expectations for this path.
        if history and history.history:
            train_history = history.history
            train_loss = train_history["loss"][-1]

    stats = build_stats(train_loss, eval_results, time_callback)
    return stats
Exemplo n.º 28
0
def run_ncf(_):
    """Run NCF training and eval with Keras.

    Supports fp16 mixed precision (Keras policy or graph rewrite),
    GPU/TPU distribution strategies, and either a custom training loop
    (params["keras_use_ctl"], required on TPU) or Keras compile/fit.

    Args:
      _: Unused positional argument (absl app.run passes argv here).

    Returns:
      The stats object produced by build_stats(train_loss, eval_results,
      time_callback), or None when the configuration is unsupported.
    """

    keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

    # Optional determinism: seed TF's global RNG when requested.
    if FLAGS.seed is not None:
        print("Setting tf seed")
        tf.random.set_seed(FLAGS.seed)

    model_helpers.apply_clean(FLAGS)

    # Keras-level mixed precision: set the global policy before the model
    # is built so layers pick it up.
    if FLAGS.dtype == "fp16" and FLAGS.fp16_implementation == "keras":
        tf.keras.mixed_precision.set_global_policy("mixed_float16")

    strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus,
        tpu_address=FLAGS.tpu)

    params = ncf_common.parse_flags(FLAGS)
    params["distribute_strategy"] = strategy
    params["use_tpu"] = (FLAGS.distribution_strategy == "tpu")

    # TPU requires the custom training loop path; bail out otherwise.
    if params["use_tpu"] and not params["keras_use_ctl"]:
        logging.error(
            "Custom training loop must be used when using TPUStrategy.")
        return

    batch_size = params["batch_size"]
    # Tracks step timing / examples-per-second; used by both training
    # paths and by build_stats at the end.
    time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
    callbacks = [time_callback]

    producer, input_meta_data = None, None
    # Online generation is used when no pre-generated TF record path is given.
    generate_input_online = params["train_dataset_path"] is None

    if generate_input_online:
        # Start data producing thread.
        num_users, num_items, _, _, producer = ncf_common.get_inputs(params)
        producer.start()
        per_epoch_callback = IncrementEpochCallback(producer)
        callbacks.append(per_epoch_callback)
    else:
        assert params["eval_dataset_path"] and params["input_meta_data_path"]
        with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
            input_meta_data = json.loads(reader.read().decode("utf-8"))
            num_users = input_meta_data["num_users"]
            num_items = input_meta_data["num_items"]

    params["num_users"], params["num_items"] = num_users, num_items

    if FLAGS.early_stopping:
        early_stopping_callback = CustomEarlyStopping(
            "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
        callbacks.append(early_stopping_callback)

    (train_input_dataset, eval_input_dataset,
     num_train_steps, num_eval_steps) = \
      (ncf_input_pipeline.create_ncf_input_data(
          params, producer, input_meta_data, strategy))
    # With online generation the dataset signals its own epoch boundary.
    steps_per_epoch = None if generate_input_online else num_train_steps

    with distribute_utils.get_strategy_scope(strategy):
        keras_model = _get_keras_model(params)
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=params["learning_rate"],
            beta_1=params["beta1"],
            beta_2=params["beta2"],
            epsilon=params["epsilon"])
        if FLAGS.fp16_implementation == "graph_rewrite":
            # Graph-rewrite mixed precision wraps the optimizer with
            # automatic loss scaling at the graph level.
            optimizer = \
              tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                  optimizer,
                  loss_scale=flags_core.get_loss_scale(FLAGS,
                                                       default_for_fp16="dynamic"))
        elif FLAGS.dtype == "fp16":
            loss_scale = flags_core.get_loss_scale(FLAGS,
                                                   default_for_fp16="dynamic")
            # Note Model.compile automatically wraps the optimizer with a
            # LossScaleOptimizer using dynamic loss scaling. We explicitly wrap it
            # here for the case where a custom training loop or fixed loss scale is
            # used.
            if loss_scale == "dynamic":
                optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
                    optimizer)
            else:
                optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
                    optimizer, dynamic=False, initial_scale=loss_scale)

        if params["keras_use_ctl"]:
            # Custom training loop path is delegated to a helper.
            train_loss, eval_results = run_ncf_custom_training(
                params,
                strategy,
                keras_model,
                optimizer,
                callbacks,
                train_input_dataset,
                eval_input_dataset,
                num_train_steps,
                num_eval_steps,
                generate_input_online=generate_input_online)
        else:
            keras_model.compile(optimizer=optimizer,
                                run_eagerly=FLAGS.run_eagerly)

            if not FLAGS.ml_perf:
                # Create Tensorboard summary and checkpoint callbacks.
                summary_dir = os.path.join(FLAGS.model_dir, "summaries")
                summary_callback = tf.keras.callbacks.TensorBoard(summary_dir)
                checkpoint_path = os.path.join(FLAGS.model_dir, "checkpoint")
                checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
                    checkpoint_path, save_weights_only=True)

                callbacks += [summary_callback, checkpoint_callback]

            history = keras_model.fit(train_input_dataset,
                                      epochs=FLAGS.train_epochs,
                                      steps_per_epoch=steps_per_epoch,
                                      callbacks=callbacks,
                                      validation_data=eval_input_dataset,
                                      validation_steps=num_eval_steps,
                                      verbose=2)

            logging.info("Training done. Start evaluating")

            eval_loss_and_metrics = keras_model.evaluate(eval_input_dataset,
                                                         steps=num_eval_steps,
                                                         verbose=2)

            logging.info("Keras evaluation is done.")

            # Keras evaluate() API returns scalar loss and metric values from
            # evaluation as a list. Here, the returned list would contain
            # [evaluation loss, hr sum, hr count].
            eval_hit_rate = eval_loss_and_metrics[1] / eval_loss_and_metrics[2]

            # Format evaluation result into [eval loss, eval hit accuracy].
            eval_results = [eval_loss_and_metrics[0], eval_hit_rate]

            # NOTE(review): train_loss is only assigned when fit() returns
            # a non-empty history; an empty history would leave it unbound
            # and build_stats below would raise NameError — confirm.
            if history and history.history:
                train_history = history.history
                train_loss = train_history["loss"][-1]

    stats = build_stats(train_loss, eval_results, time_callback)
    return stats
Exemplo n.º 29
0
def run(flags_obj):
    """Run ResNet ImageNet training and eval loop using custom training loops.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.

  Returns:
    Dictionary of training and eval stats.
  """
    keras_utils.set_session_config(enable_eager=flags_obj.enable_eager,
                                   enable_xla=flags_obj.enable_xla)
    performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj))

    # This only affects GPU.
    common.set_cudnn_batchnorm_mode()

    # TODO(anj-s): Set data_format without using Keras.
    # Default to channels_first on GPU, channels_last otherwise.
    image_format = flags_obj.data_format
    if image_format is None:
        has_gpu = bool(tf.config.list_physical_devices('GPU'))
        image_format = 'channels_first' if has_gpu else 'channels_last'
    tf.keras.backend.set_image_data_format(image_format)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=flags_obj.num_gpus,
        all_reduce_alg=flags_obj.all_reduce_alg,
        num_packs=flags_obj.num_packs,
        tpu_address=flags_obj.tpu)

    steps_per_epoch, num_epochs, num_eval_steps = get_num_train_iterations(
        flags_obj)
    # Never run a host loop past an epoch boundary.
    loop_steps = min(flags_obj.steps_per_loop, steps_per_epoch)
    total_train_steps = steps_per_epoch * num_epochs

    logging.info(
        'Training %d epochs, each epoch has %d steps, '
        'total steps: %d; Eval %d steps', num_epochs, steps_per_epoch,
        total_train_steps, num_eval_steps)

    # Throughput logger; also feeds build_stats at the end.
    timer = keras_utils.TimeHistory(
        flags_obj.batch_size,
        flags_obj.log_steps,
        logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None)
    with distribution_utils.get_strategy_scope(strategy):
        runnable = resnet_runnable.ResnetRunnable(flags_obj, timer,
                                                  steps_per_epoch)

    # Evaluate / checkpoint / summarize on epoch boundaries (when enabled).
    eval_interval = flags_obj.epochs_between_evals * steps_per_epoch
    checkpoint_interval = (steps_per_epoch
                           if flags_obj.enable_checkpoint_and_export else None)
    summary_interval = steps_per_epoch if flags_obj.enable_tensorboard else None

    checkpoint_manager = tf.train.CheckpointManager(
        runnable.checkpoint,
        directory=flags_obj.model_dir,
        max_to_keep=10,
        step_counter=runnable.global_step,
        checkpoint_interval=checkpoint_interval)

    resnet_controller = controller.Controller(
        strategy,
        runnable.train,
        runnable.evaluate,
        global_step=runnable.global_step,
        steps_per_loop=loop_steps,
        train_steps=total_train_steps,
        checkpoint_manager=checkpoint_manager,
        summary_interval=summary_interval,
        eval_steps=num_eval_steps,
        eval_interval=eval_interval)

    timer.on_train_begin()
    resnet_controller.train(evaluate=not flags_obj.skip_eval)
    timer.on_train_end()

    return build_stats(runnable, timer)
def run(flags_obj):
    """Run ResNet ImageNet training and eval loop using custom training loops.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.

  Returns:
    Dictionary of training and eval stats.
  """
    keras_utils.set_session_config()
    performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj))

    # GPU-only tuning: thread-mode knobs and the cuDNN batch-norm mode.
    if tf.config.list_physical_devices('GPU'):
        if flags_obj.tf_gpu_thread_mode:
            keras_utils.set_gpu_thread_mode_and_count(
                per_gpu_thread_count=flags_obj.per_gpu_thread_count,
                gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
                num_gpus=flags_obj.num_gpus,
                datasets_num_private_threads=flags_obj.
                datasets_num_private_threads)
        common.set_cudnn_batchnorm_mode()

    # Default to channels_first on GPU, channels_last otherwise.
    image_format = flags_obj.data_format
    if image_format is None:
        has_gpu = bool(tf.config.list_physical_devices('GPU'))
        image_format = 'channels_first' if has_gpu else 'channels_last'
    tf.keras.backend.set_image_data_format(image_format)

    strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=flags_obj.num_gpus,
        all_reduce_alg=flags_obj.all_reduce_alg,
        num_packs=flags_obj.num_packs,
        tpu_address=flags_obj.tpu)

    steps_per_epoch, num_epochs, num_eval_steps = get_num_train_iterations(
        flags_obj)
    # Clamp the host loop so it never crosses an epoch boundary; warn when
    # the requested value had to be reduced.
    requested_loop_steps = flags_obj.steps_per_loop
    if requested_loop_steps is None:
        loop_steps = steps_per_epoch
    else:
        loop_steps = min(requested_loop_steps, steps_per_epoch)
        if requested_loop_steps > steps_per_epoch:
            logging.warn(
                'Setting steps_per_loop to %d to respect epoch boundary.',
                loop_steps)
    total_train_steps = steps_per_epoch * num_epochs

    logging.info(
        'Training %d epochs, each epoch has %d steps, '
        'total steps: %d; Eval %d steps', num_epochs, steps_per_epoch,
        total_train_steps, num_eval_steps)

    # Throughput logger; also feeds build_stats at the end.
    timer = keras_utils.TimeHistory(
        flags_obj.batch_size,
        flags_obj.log_steps,
        logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None)
    with distribute_utils.get_strategy_scope(strategy):
        runnable = resnet_runnable.ResnetRunnable(flags_obj, timer,
                                                  steps_per_epoch)

    # Eval every N epochs; checkpoint/summarize per host loop (when enabled).
    eval_interval = flags_obj.epochs_between_evals * steps_per_epoch
    checkpoint_interval = (loop_steps * 5
                           if flags_obj.enable_checkpoint_and_export else None)
    summary_interval = loop_steps if flags_obj.enable_tensorboard else None

    checkpoint_manager = tf.train.CheckpointManager(
        runnable.checkpoint,
        directory=flags_obj.model_dir,
        max_to_keep=10,
        step_counter=runnable.global_step,
        checkpoint_interval=checkpoint_interval)

    resnet_controller = orbit.Controller(
        strategy=strategy,
        trainer=runnable,
        evaluator=runnable if not flags_obj.skip_eval else None,
        global_step=runnable.global_step,
        steps_per_loop=loop_steps,
        checkpoint_manager=checkpoint_manager,
        summary_interval=summary_interval,
        summary_dir=flags_obj.model_dir,
        eval_summary_dir=os.path.join(flags_obj.model_dir, 'eval'))

    timer.on_train_begin()
    if flags_obj.skip_eval:
        resnet_controller.train(steps=total_train_steps)
    else:
        resnet_controller.train_and_evaluate(
            train_steps=total_train_steps,
            eval_steps=num_eval_steps,
            eval_interval=eval_interval)
    timer.on_train_end()

    return build_stats(runnable, timer)