Exemplo n.º 1
0
    def _run_and_report_benchmark(self, force_gpu_memory_alloc=True):
        """Runs BERT pretraining once and reports wall time and throughput.

        Args:
          force_gpu_memory_alloc: When true, every physical GPU is configured
            with one virtual device created with `memory_limit=14700` before
            the distribution strategy is built.
        """
        if force_gpu_memory_alloc:
            # Pin the GPU memory footprint so it always matches what a run in
            # Cloud takes (see b/151435951).
            physical_gpus = tf.config.experimental.list_physical_devices("GPU")
            if physical_gpus:
                try:
                    for physical_gpu in physical_gpus:
                        virtual_device = (
                            tf.config.experimental.VirtualDeviceConfiguration(
                                memory_limit=14700))
                        tf.config.experimental.set_virtual_device_configuration(
                            physical_gpu, [virtual_device])
                    logical_gpus = tf.config.experimental.list_logical_devices(
                        "GPU")
                    print(len(physical_gpus), "Physical GPUs,",
                          len(logical_gpus), "Logical GPUs")
                except RuntimeError as err:
                    # Virtual devices must be set before GPUs have been
                    # initialized; report the failure and carry on.
                    print(err)

        # TPU runs and GPU/CPU runs pass different options to the strategy
        # factory.
        if FLAGS.tpu:
            strategy_kwargs = dict(
                distribution_strategy=FLAGS.distribution_strategy,
                tpu_address=FLAGS.tpu,
                tpu_zone="europe-west4-a")
        else:
            strategy_kwargs = dict(
                distribution_strategy=FLAGS.distribution_strategy,
                all_reduce_alg=FLAGS.all_reduce_alg,
                num_gpus=FLAGS.num_gpus)
        strategy = distribution_utils.get_distribution_strategy(
            **strategy_kwargs)

        # Only the pretraining call itself is timed.
        began_at = time.time()
        run_pretraining.run_bert_pretrain(strategy, [self.timer_callback])
        wall_time_sec = time.time() - began_at

        if self.timer_callback:
            examples_per_sec = self.timer_callback.get_examples_per_sec(
                FLAGS.train_batch_size * FLAGS.steps_per_loop)
        else:
            logging.error(
                "exp_per_second not calculated because timer_callback is missing"
            )
            # Still report the metric so the record keeps the same keys.
            examples_per_sec = 0.0
        metrics = [{"name": "exp_per_second", "value": examples_per_sec}]

        nondefault_flags = flags_core.get_nondefault_flags_as_str()
        self.report_benchmark(iters=-1,
                              wall_time=wall_time_sec,
                              metrics=metrics,
                              extras={"flags": nondefault_flags})
Exemplo n.º 2
0
    def _run_bert_classifier(self, callbacks=None, use_ds=True):
        """Configures and launches a BERT classification training run.

        Args:
          callbacks: Forwarded to `run_classifier.run_bert_classifier` as
            `custom_callbacks`.
          use_ds: When no TPU is set on the instance, selects the 'mirrored'
            distribution strategy if true and 'off' otherwise.
        """
        with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as meta_file:
            input_meta_data = json.loads(meta_file.read().decode('utf-8'))

        bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)

        # Instance-level settings, when truthy, override the flag- and
        # metadata-derived values.
        epochs = self.num_epochs or FLAGS.num_train_epochs
        steps_per_epoch = self.num_steps_per_epoch
        if not steps_per_epoch:
            steps_per_epoch = int(input_meta_data['train_data_size'] /
                                  FLAGS.train_batch_size)

        # Warm up over the first 10% of all training steps.
        warmup_steps = int(epochs * steps_per_epoch * 0.1)
        # Round up so a trailing partial eval batch still gets a step.
        eval_batches = input_meta_data['eval_data_size'] / FLAGS.eval_batch_size
        eval_steps = int(math.ceil(eval_batches))

        if self.tpu:
            strategy = distribution_utils.get_distribution_strategy(
                distribution_strategy='tpu', tpu_address=self.tpu)
        else:
            strategy_name = 'mirrored' if use_ds else 'off'
            strategy = distribution_utils.get_distribution_strategy(
                distribution_strategy=strategy_name, num_gpus=self.num_gpus)

        steps_per_loop = 50

        max_seq_length = input_meta_data['max_seq_length']
        train_input_fn = run_classifier.get_dataset_fn(
            FLAGS.train_data_path,
            max_seq_length,
            FLAGS.train_batch_size,
            is_training=True)
        eval_input_fn = run_classifier.get_dataset_fn(
            FLAGS.eval_data_path,
            max_seq_length,
            FLAGS.eval_batch_size,
            is_training=False)

        run_classifier.run_bert_classifier(
            strategy,
            bert_config,
            input_meta_data,
            FLAGS.model_dir,
            epochs,
            steps_per_epoch,
            steps_per_loop,
            eval_steps,
            warmup_steps,
            FLAGS.learning_rate,
            FLAGS.init_checkpoint,
            train_input_fn,
            eval_input_fn,
            custom_callbacks=callbacks)
Exemplo n.º 3
0
def main(_):
    """Entry point: loads metadata, builds the strategy and inputs, runs BERT.

    Args:
      _: Unused positional argument.
    """
    # Users should always run this script under TF 2.x
    tf.enable_v2_behavior()

    with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as meta_file:
        input_meta_data = json.loads(meta_file.read().decode('utf-8'))

    # Fall back to a fixed scratch directory when no model dir was given.
    if not FLAGS.model_dir:
        FLAGS.model_dir = '/tmp/bert20/'

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus,
        tpu_address=FLAGS.tpu)

    # Training and eval datasets share the sequence length from the metadata.
    seq_len = input_meta_data['max_seq_length']
    train_input_fn = get_dataset_fn(
        FLAGS.train_data_path,
        seq_len,
        FLAGS.train_batch_size,
        is_training=True)
    eval_input_fn = get_dataset_fn(
        FLAGS.eval_data_path,
        seq_len,
        FLAGS.eval_batch_size,
        is_training=False)

    bert_config = bert_configs.BertConfig.from_json_file(
        FLAGS.bert_config_file)
    run_bert(
        strategy,
        input_meta_data,
        bert_config,
        train_input_fn,
        eval_input_fn)
 def __init__(self, strategy_type=None, strategy_config=None):
     """Configures the cluster and stores the resulting distribution strategy.

     Args:
       strategy_type: Passed to the strategy factory as
         `distribution_strategy`.
       strategy_config: Object providing `worker_hosts`, `task_index`,
         `num_gpus`, `all_reduce_alg`, `num_packs` and `tpu` attributes.
     """
     # NOTE(review): `strategy_config` defaults to None but its attributes
     # are read unconditionally below -- confirm callers always supply it.
     # The return value of configure_cluster is deliberately discarded.
     distribution_utils.configure_cluster(strategy_config.worker_hosts,
                                          strategy_config.task_index)
     strategy_kwargs = dict(
         distribution_strategy=strategy_type,
         num_gpus=strategy_config.num_gpus,
         all_reduce_alg=strategy_config.all_reduce_alg,
         num_packs=strategy_config.num_packs,
         tpu_address=strategy_config.tpu)
     self._strategy = distribution_utils.get_distribution_strategy(
         **strategy_kwargs)
Exemplo n.º 5
0
    def _get_distribution_strategy(self, ds_type='mirrored'):
        """Builds the distribution strategy for this run.

        A TPU strategy is returned whenever `self.tpu` is set, regardless of
        `ds_type`.

        Args:
          ds_type: String, the distribution strategy type to be used. Can be
            'mirrored', 'multi_worker_mirrored', 'tpu' and 'off'.

        Returns:
          A `tf.distribute.DistributionStrategy` object.
        """
        wants_tpu = self.tpu or ds_type == 'tpu'
        if wants_tpu:
            return distribution_utils.get_distribution_strategy(
                distribution_strategy='tpu', tpu_address=self.tpu)

        if ds_type == 'multi_worker_mirrored':
            # The multi-worker strategy needs the cluster spec configured
            # first; the return value is not used.
            distribution_utils.configure_cluster(FLAGS.worker_hosts,
                                                 FLAGS.task_index)

        strategy_kwargs = dict(
            distribution_strategy=ds_type,
            num_gpus=self.num_gpus,
            all_reduce_alg=FLAGS.all_reduce_alg)
        return distribution_utils.get_distribution_strategy(**strategy_kwargs)
Exemplo n.º 6
0
def main(_):
    """Entry point: exports, trains and/or predicts according to FLAGS.mode.

    Args:
      _: Unused positional argument.
    """
    # Users should always run this script under TF 2.x
    tf.enable_v2_behavior()

    with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as meta_file:
        input_meta_data = json.loads(meta_file.read().decode('utf-8'))

    mode = FLAGS.mode
    if mode == 'export_only':
        # Export needs no distribution strategy, so finish early.
        export_squad(FLAGS.model_export_path, input_meta_data)
        return

    # Configures cluster spec for multi-worker distribution strategy.
    if FLAGS.num_gpus > 0:
        distribution_utils.configure_cluster(FLAGS.worker_hosts,
                                             FLAGS.task_index)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus,
        all_reduce_alg=FLAGS.all_reduce_alg,
        tpu_address=FLAGS.tpu)

    # 'train_and_predict' runs both phases, training first.
    should_train = mode in ('train', 'train_and_predict')
    should_predict = mode in ('predict', 'train_and_predict')
    if should_train:
        train_squad(strategy, input_meta_data, run_eagerly=FLAGS.run_eagerly)
    if should_predict:
        predict_squad(strategy, input_meta_data)
def run(flags_obj):
    """Run ResNet Cifar-10 training and eval loop using native Keras APIs.

    The function configures the session, builds the distribution strategy
    and input datasets, compiles a ResNet-56 model, fits it and (unless
    `flags_obj.skip_eval` is set) evaluates it.

    Args:
      flags_obj: An object containing parsed flag values.

    Raises:
      ValueError: If fp16 is passed as it is not currently supported.

    Returns:
      Dictionary of training and eval stats.
    """
    keras_utils.set_session_config(enable_eager=flags_obj.enable_eager,
                                   enable_xla=flags_obj.enable_xla)

    # Execute flag override logic for better model performance
    if flags_obj.tf_gpu_thread_mode:
        keras_utils.set_gpu_thread_mode_and_count(
            per_gpu_thread_count=flags_obj.per_gpu_thread_count,
            gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
            num_gpus=flags_obj.num_gpus,
            datasets_num_private_threads=flags_obj.datasets_num_private_threads
        )
    common.set_cudnn_batchnorm_mode()

    dtype = flags_core.get_tf_dtype(flags_obj)
    # NOTE(review): `dtype` is whatever flags_core.get_tf_dtype returns;
    # confirm it can actually compare equal to the string 'fp16', otherwise
    # this guard never fires.
    if dtype == 'fp16':
        raise ValueError(
            'dtype fp16 is not supported in Keras. Use the default '
            'value(fp32).')

    data_format = flags_obj.data_format
    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')
    tf.keras.backend.set_image_data_format(data_format)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=flags_obj.num_gpus,
        all_reduce_alg=flags_obj.all_reduce_alg,
        num_packs=flags_obj.num_packs)

    if strategy:
        # flags_obj.enable_get_next_as_optional controls whether enabling
        # get_next_as_optional behavior in DistributedIterator. If true, last
        # partial batch can be supported.
        strategy.extended.experimental_enable_get_next_as_optional = (
            flags_obj.enable_get_next_as_optional)

    strategy_scope = distribution_utils.get_strategy_scope(strategy)

    if flags_obj.use_synthetic_data:
        distribution_utils.set_up_synthetic_data()
        input_fn = common.get_synth_input_fn(
            height=cifar_preprocessing.HEIGHT,
            width=cifar_preprocessing.WIDTH,
            num_channels=cifar_preprocessing.NUM_CHANNELS,
            num_classes=cifar_preprocessing.NUM_CLASSES,
            dtype=flags_core.get_tf_dtype(flags_obj),
            drop_remainder=True)
    else:
        distribution_utils.undo_set_up_synthetic_data()
        input_fn = cifar_preprocessing.input_fn

    train_input_dataset = input_fn(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=flags_obj.batch_size,
        num_epochs=flags_obj.train_epochs,
        parse_record_fn=cifar_preprocessing.parse_record,
        datasets_num_private_threads=flags_obj.datasets_num_private_threads,
        dtype=dtype,
        # Setting drop_remainder to avoid the partial batch logic in normalization
        # layer, which triggers tf.where and leads to extra memory copy of input
        # sizes between host and GPU.
        drop_remainder=(not flags_obj.enable_get_next_as_optional))

    eval_input_dataset = None
    if not flags_obj.skip_eval:
        eval_input_dataset = input_fn(
            is_training=False,
            data_dir=flags_obj.data_dir,
            batch_size=flags_obj.batch_size,
            num_epochs=flags_obj.train_epochs,
            parse_record_fn=cifar_preprocessing.parse_record)

    steps_per_epoch = (cifar_preprocessing.NUM_IMAGES['train'] //
                       flags_obj.batch_size)
    # Constant learning rate by default; with use_tensor_lr it is replaced by
    # a piecewise-constant schedule built from LR_SCHEDULE.
    lr_schedule = 0.1
    if flags_obj.use_tensor_lr:
        initial_learning_rate = (
            common.BASE_LEARNING_RATE * flags_obj.batch_size / 128)
        lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
            boundaries=[p[1] * steps_per_epoch for p in LR_SCHEDULE],
            values=[initial_learning_rate] +
            [p[0] * initial_learning_rate for p in LR_SCHEDULE])

    with strategy_scope:
        optimizer = common.get_optimizer(lr_schedule)
        model = resnet_cifar_model.resnet56(
            classes=cifar_preprocessing.NUM_CLASSES)

        # TODO(b/138957587): Remove when force_v2_in_keras_compile is no longer
        # a valid arg for this model. Also remove as a valid flag.
        if flags_obj.force_v2_in_keras_compile is not None:
            model.compile(
                loss='sparse_categorical_crossentropy',
                optimizer=optimizer,
                metrics=(['sparse_categorical_accuracy']
                         if flags_obj.report_accuracy_metrics else None),
                run_eagerly=flags_obj.run_eagerly,
                experimental_run_tf_function=flags_obj.
                force_v2_in_keras_compile)
        else:
            model.compile(
                loss='sparse_categorical_crossentropy',
                optimizer=optimizer,
                metrics=(['sparse_categorical_accuracy']
                         if flags_obj.report_accuracy_metrics else None),
                run_eagerly=flags_obj.run_eagerly)

    train_epochs = flags_obj.train_epochs

    callbacks = common.get_callbacks(steps_per_epoch)

    # Without a tensor learning-rate schedule, the rate is driven by a
    # per-batch callback instead.
    if not flags_obj.use_tensor_lr:
        lr_callback = LearningRateBatchScheduler(
            schedule=learning_rate_schedule,
            batch_size=flags_obj.batch_size,
            steps_per_epoch=steps_per_epoch)
        callbacks.append(lr_callback)

    # if multiple epochs, ignore the train_steps flag.
    if train_epochs <= 1 and flags_obj.train_steps:
        steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
        train_epochs = 1

    num_eval_steps = (cifar_preprocessing.NUM_IMAGES['validation'] //
                      flags_obj.batch_size)

    validation_data = eval_input_dataset
    if flags_obj.skip_eval:
        if flags_obj.set_learning_phase_to_train:
            # TODO(haoyuzhang): Understand slowdown of setting learning phase when
            # not using distribution strategy.
            tf.keras.backend.set_learning_phase(1)
        num_eval_steps = None
        validation_data = None

    no_dist_strat_device = None
    if not strategy and flags_obj.explicit_gpu_placement:
        # TODO(b/135607227): Add device scope automatically in Keras training loop
        # when not using distribution strategy.
        no_dist_strat_device = tf.device('/device:GPU:0')
        no_dist_strat_device.__enter__()

    try:
        history = model.fit(train_input_dataset,
                            epochs=train_epochs,
                            steps_per_epoch=steps_per_epoch,
                            callbacks=callbacks,
                            validation_steps=num_eval_steps,
                            validation_data=validation_data,
                            validation_freq=flags_obj.epochs_between_evals,
                            verbose=2)
        eval_output = None
        if not flags_obj.skip_eval:
            eval_output = model.evaluate(eval_input_dataset,
                                         steps=num_eval_steps,
                                         verbose=2)
    finally:
        # Always leave the explicit device scope, even when training or
        # evaluation raises. The context-manager protocol requires the three
        # exception arguments; a bare __exit__() fails for context managers
        # that declare them as required parameters.
        if no_dist_strat_device is not None:
            no_dist_strat_device.__exit__(None, None, None)

    stats = common.build_stats(history, eval_output, callbacks)
    return stats