Exemplo n.º 1
0
def unwrap_output_dict(strategy, grouped_outputs, mode):
    """Unwrap the list of outputs contained in the PerReplica parameters."""
    # Predict outputs mirror the model's output structure, so a flatten is all
    # that is needed.
    if mode == ModeKeys.PREDICT:
        return flatten_per_replica_values(strategy, grouped_outputs)

    # For fit/eval, `grouped_outputs` is a dict: sum the scalar entries across
    # replicas and flatten the per-output lists.
    reduced_loss = strategy.reduce(reduce_util.ReduceOp.SUM,
                                   grouped_outputs['total_loss'][0],
                                   axis=None)
    per_output_losses = flatten_per_replica_values(
        strategy, grouped_outputs['output_losses'])
    metric_values = flatten_per_replica_values(strategy,
                                               grouped_outputs['metrics'])
    reduced_batch_size = strategy.reduce(reduce_util.ReduceOp.SUM,
                                         grouped_outputs['batch_size'],
                                         axis=None)
    if (backend.is_tpu_strategy(strategy)
            and ops.executing_eagerly_outside_functions()):
        # All TPU replicas produce the same output, so keep one value per
        # replica group. This is done only in eager mode for now since this
        # function is shared with the graph path, which does not use
        # experimental_run; the slicing goes away once the paths converge.
        stride = strategy.num_replicas_in_sync
        per_output_losses = per_output_losses[::stride]
        metric_values = metric_values[::stride]
    return {
        'total_loss': [reduced_loss],
        'output_losses': per_output_losses,
        'metrics': metric_values,
        'batch_size': reduced_batch_size
    }
 def test_dnn_correctness(self, distribution, use_numpy,
                          use_validation_data,
                          experimental_run_tf_function):
     """Runs the DNN correctness test, expecting errors where unsupported.

     Eager execution and the default strategy run the test directly; the
     remaining graph-mode strategy combinations are expected to reject the
     subclassed model used by the test.
     """
     if context.executing_eagerly() or is_default_strategy(distribution):
         self.run_correctness_test(distribution, use_numpy,
                                   use_validation_data,
                                   experimental_run_tf_function)
     elif K.is_tpu_strategy(
             distribution) and not context.executing_eagerly():
         # `assertRaisesRegexp` is a deprecated alias; use the modern
         # `assertRaisesRegex` spelling (consistent with sibling tests).
         with self.assertRaisesRegex(
                 ValueError,
                 'Expected `model` argument to be a functional `Model` instance, '
                 'but got a subclass model instead.'):
             self.run_correctness_test(distribution, use_numpy,
                                       use_validation_data,
                                       experimental_run_tf_function)
     else:
         with self.assertRaisesRegex(
                 ValueError,
                 'We currently do not support distribution strategy with a '
                 '`Sequential` model that is created without `input_shape`/'
                 '`input_dim` set in its first layer or a subclassed model.'
         ):
             self.run_correctness_test(distribution, use_numpy,
                                       use_validation_data,
                                       experimental_run_tf_function)
Exemplo n.º 3
0
def call_replica_local_fn(fn, *args, **kwargs):
  """Call a function that uses replica-local variables.

  This function correctly handles calling `fn` in a cross-replica
  context.

  Args:
    fn: The function to call.
    *args: Positional arguments to the `fn`.
    **kwargs: Keyword argument to `fn`.

  Returns:
    The result of calling `fn`.
  """
  # TODO(b/132666209): Remove this function when we support assign_*
  # for replica-local variables.
  # An explicit `strategy` kwarg (even None) wins; otherwise fall back to the
  # ambient strategy, if any.
  if 'strategy' in kwargs:
    strategy = kwargs.pop('strategy')
  elif ds_context.has_strategy():
    strategy = ds_context.get_strategy()
  else:
    strategy = None

  # TODO(b/120571621): TPUStrategy does not implement replica-local variables.
  on_tpu = backend.is_tpu_strategy(strategy)
  if not on_tpu and strategy and ds_context.in_cross_replica_context():
    with strategy.scope():
      return strategy.extended.call_for_each_replica(fn, args, kwargs)
  return fn(*args, **kwargs)
Exemplo n.º 4
0
 def predict(self,
             model,
             x,
             batch_size=None,
             verbose=0,
             steps=None,
             callbacks=None,
             **kwargs):
   """Predict loop for Distribution Strategies.

   Args:
     model: Keras model with `_distribution_strategy` attached.
     x: Input data to predict on.
     batch_size: Optional global batch size; validated/inferred when None.
     verbose: Verbosity mode forwarded to the prediction loop.
     steps: Optional number of prediction steps; must be given or inferable
       for graph-mode TPU execution.
     callbacks: Optional list of callbacks to run during prediction.
     **kwargs: Unused; accepted for call-site compatibility.

   Returns:
     The result of the selected prediction loop.
   """
   dist_utils.validate_inputs(x=x, y=None)
   # Resolve batch size and step count against the strategy configuration.
   batch_size, steps = dist_utils.process_batch_and_step_size(
       model._distribution_strategy, x, batch_size, steps, ModeKeys.PREDICT)
   batch_size = model._validate_or_infer_batch_size(batch_size, steps, x)
   dataset = model._distribution_standardize_user_data(
       x,
       batch_size=batch_size,
       allow_partial_batch=True)
   if K.is_tpu_strategy(model._distribution_strategy):
     steps = training_utils_v1.infer_steps_for_dataset(
         model, dataset, steps, steps_name='steps')
     if steps is None:
       raise ValueError('Number of steps could not be inferred from the data, '
                        'please pass the steps argument.')
     # Graph-mode TPU uses a dedicated predict loop; eager TPU falls through
     # to the generic array loop below.
     if not context.executing_eagerly():
       return experimental_tpu_predict_loop(
           model, dataset, verbose=verbose, steps=steps, callbacks=callbacks)
   return training_arrays_v1.predict_loop(
       model,
       dataset,
       batch_size=batch_size,
       verbose=verbose,
       steps=steps,
       callbacks=callbacks)
    def test_distribution_strategy_output_with_adapt(self, strategy):
        """Adapted TextVectorization in a strategy scope maps tokens as expected."""
        # TODO(b/180614455): remove this check when MLIR bridge is always enabled.
        if backend.is_tpu_strategy(strategy):
            self.skipTest("This test needs MLIR bridge on TPU.")

        adapt_words = [[
            "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and",
            "and", "fire"
        ]]
        adapt_ds = dataset_ops.Dataset.from_tensors(adapt_words)
        batch = np.array([["earth", "wind", "and", "fire"],
                          ["fire", "and", "earth", "michigan"]])
        predict_ds = dataset_ops.Dataset.from_tensor_slices(batch).batch(
            2, drop_remainder=True)

        # "michigan" is out-of-vocabulary and maps to the OOV index (1).
        expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

        config.set_soft_device_placement(True)

        with strategy.scope():
            inputs = keras.Input(shape=(None,), dtype=dtypes.string)
            vectorizer = text_vectorization.TextVectorization(
                max_tokens=None,
                standardize=None,
                split=None,
                output_mode=text_vectorization.INT)
            vectorizer.adapt(adapt_ds)
            outputs = vectorizer(inputs)
            model = keras.Model(inputs=inputs, outputs=outputs)

        self.assertAllEqual(expected_output, model.predict(predict_ds))
Exemplo n.º 6
0
  def test_strategy_with_file(self, strategy):
    """IndexLookup built from a vocabulary file works under a strategy scope."""
    # TODO(b/180614455): remove this check when MLIR bridge is always enabled.
    if backend.is_tpu_strategy(strategy):
      self.skipTest("This test needs MLIR bridge on TPU.")

    vocab_data = ["earth", "wind", "and", "fire"]
    vocab_file = self._write_to_temp_file("temp", vocab_data)

    input_array = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_array).batch(
        2, drop_remainder=True)
    # "michigan" is not in the vocabulary, so it maps to the OOV index (1).
    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

    config.set_soft_device_placement(True)

    # The layer (and the model using it) must be built inside the strategy
    # scope so its variables are created on the strategy's devices.
    with strategy.scope():
      input_data = keras.Input(shape=(None,), dtype=dtypes.string)
      layer = index_lookup.IndexLookup(
          max_tokens=None,
          num_oov_indices=1,
          mask_token="",
          oov_token="[OOV]",
          dtype=dtypes.string,
          vocabulary=vocab_file)
      int_data = layer(input_data)
      model = keras.Model(inputs=input_data, outputs=int_data)
    model.compile(loss="mse")
    output_dataset = model.predict(input_dataset)
    self.assertAllEqual(expected_output, output_dataset)
 def test_dnn_with_dynamic_learning_rate(self, distribution):
   """Dynamic-LR test; subclassed models are rejected on TPU / graph mode."""
   tpu = K.is_tpu_strategy(distribution)
   if (context.executing_eagerly() and not tpu) or is_default_strategy(
       distribution):
     self.run_dynamic_lr_test(distribution)
   elif tpu:
     with self.assertRaisesRegex(
         ValueError,
         'Expected `model` argument to be a functional `Model` instance, '
         'but got a subclass model instead.'):
       self.run_dynamic_lr_test(distribution)
   else:
     with self.assertRaisesRegex(
         ValueError,
         'We currently do not support distribution strategy with a '
         '`Sequential` model that is created without `input_shape`/'
         '`input_dim` set in its first layer or a subclassed model.'):
       self.run_dynamic_lr_test(distribution)
Exemplo n.º 8
0
def batch_wrapper(dataset, batch_size, distribution, repeat=None):
    """Batch `dataset`, dropping partial batches when running on TPU."""
    if repeat:
        dataset = dataset.repeat(repeat)
    # TPUs currently require fully defined input shapes; drop_remainder ensures
    # the input will have fully defined shapes. Elsewhere the default
    # (drop_remainder=False) behavior is kept.
    drop_remainder = backend.is_tpu_strategy(distribution)
    return dataset.batch(batch_size, drop_remainder=drop_remainder)
 def test_dnn_with_dynamic_learning_rate(self, distribution, cloning):
   """Dynamic-LR test; unsupported strategy/cloning combos raise ValueError."""
   if ((not cloning and context.executing_eagerly() and
        not K.is_tpu_strategy(distribution)) or
       is_default_strategy(distribution)):
     self.run_dynamic_lr_test(distribution, cloning)
   else:
     # `assertRaisesRegexp` is a deprecated alias; use `assertRaisesRegex`.
     with self.assertRaisesRegex(
         ValueError,
         'We currently do not support distribution strategy with a '
         '`Sequential` model that is created without `input_shape`/'
         '`input_dim` set in its first layer or a subclassed model.'):
       self.run_dynamic_lr_test(distribution, cloning)
 def test_dnn_with_dynamic_learning_rate(self, distribution, cloning):
   """Dynamic-LR test; unsupported strategy/cloning combos raise ValueError."""
   if ((not cloning and context.executing_eagerly() and
        not K.is_tpu_strategy(distribution)) or
       is_default_strategy(distribution)):
     self.run_dynamic_lr_test(distribution, cloning)
   else:
     # `assertRaisesRegexp` is a deprecated alias; use `assertRaisesRegex`.
     with self.assertRaisesRegex(
         ValueError,
         'We currently do not support distribution strategy with a '
         '`Sequential` model that is created without `input_shape`/'
         '`input_dim` set in its first layer or a subclassed model.'):
       self.run_dynamic_lr_test(distribution, cloning)
Exemplo n.º 11
0
def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
    """Prepare feed values to the model execution function.

  Args:
    model: Model to prepare feed values for.
    inputs: List or dict of model inputs.
    targets: Optional list of model targets.
    sample_weights: Optional list of sample weight arrays.
    mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.

  Returns:
    Feed values for the model in the given mode, as a tuple
    `(inputs, targets, sample_weights)`.

  Raises:
    ValueError: If sample weights are provided under TPUStrategy.
    NotImplementedError: If sample weights are used with eager cloning.
  """
    strategy = model._distribution_strategy
    inputs, targets, sample_weights = _get_input_from_iterator(inputs, model)
    if backend.is_tpu_strategy(strategy):
        if sample_weights is not None:
            raise ValueError('TPUStrategy does not support sample weights.')

    # When the inputs are dict, then we want to flatten it in the same order as
    # the input layers, such that the data are fed into the input layers in the
    # correct order.
    if isinstance(inputs, dict):
        inputs = [inputs[key] for key in model._feed_input_names]
    if is_distributing_by_cloning(model):
        # Cloning path: unwrap PerReplica values into flat per-device lists.
        inputs = flatten_per_replica_values(strategy, inputs)
        targets = flatten_per_replica_values(strategy, targets)
        # Expand 1-dimensional inputs.
        # TODO(b/124535720): Remove once this standarize data logic is shared with
        # main flow.
        inputs, targets = nest.map_structure(
            training_utils_v1.standardize_single_array, (inputs, targets))
    else:
        inputs = training_utils_v1.ModelInputs(inputs).as_list()

    # Prediction never feeds targets or sample weights.
    if mode == ModeKeys.PREDICT:
        sample_weights = []
        targets = []
    elif sample_weights is not None and is_distributing_by_cloning(model):
        if context.executing_eagerly() and not model._compile_distribution:
            raise NotImplementedError(
                '`sample_weight` is not supported when using '
                'tf.distribute.Strategy in eager mode and '
                'cloning=True.')
        sample_weights = flatten_per_replica_values(strategy, sample_weights)

    ins = [inputs, targets, sample_weights]
    return tuple(ins)
Exemplo n.º 12
0
def is_distributing_by_cloning(model):
  """Decide whether this model is going to be distributed via cloning.

  We are going to distribute the model by cloning if the user has signaled
  that intent by not setting `cloning=False` in `Model.compile()` unless we
  are in graph mode or running on TPU.

  Args:
    model: Keras model to distribute.

  Returns:
    True if the `model` is going to be distributed using cloning and False
    otherwise.
  """
  # Explicit request wins; evaluated first so the remaining checks are only
  # made when cloning was not asked for.
  if model._cloning:
    return True
  # Graph mode always clones.
  if not context.executing_eagerly():
    return True
  # TPU strategies clone even in eager mode.
  return K.is_tpu_strategy(model._distribution_strategy)
def is_distributing_by_cloning(model):
  """Decide whether this model is going to be distributed via cloning.

  We are going to distribute the model by cloning if the user has signaled
  that intent by not setting `cloning=False` in `Model.compile()` unless we
  are in graph mode or running on TPU.

  Args:
    model: Keras model to distribute.

  Returns:
    True if the `model` is going to be distributed using cloning and False
    otherwise.
  """
  # Explicit cloning request or a compile-time distribution both force the
  # cloning path; checked first to preserve lazy evaluation of the rest.
  if model._cloning:
    return True
  if model._compile_distribution:
    return True
  # Graph mode always clones.
  if not context.executing_eagerly():
    return True
  # TPU strategies clone even in eager mode.
  return K.is_tpu_strategy(model._distribution_strategy)
Exemplo n.º 14
0
def is_distributing_by_cloning(model):
    """Decide whether this model is going to be distributed via cloning.

  We are going to distribute the model by cloning in graph mode.

  Args:
    model: Keras model to distribute.

  Returns:
    True if the `model` is going to be distributed using cloning and False
    otherwise.
  """
    # NOTE(review): `context.executing_eagerly` is referenced without being
    # called, so it is a (always-truthy) function object and the condition
    # reduces to the TPU-strategy check alone. The b/137580852 tag suggests
    # this may be a deliberate workaround (never clone on TPU) -- confirm
    # against the bug before "fixing" it.
    if (backend.is_tpu_strategy(model._distribution_strategy)
            and context.executing_eagerly):  # b/137580852
        return False
    elif ops.executing_eagerly_outside_functions():
        # Outside graph functions, cloning is opt-in via compile().
        return bool(model._compile_distribution)
    return True
Exemplo n.º 15
0
  def decorated(metric_obj, *args, **kwargs):
    """Decorated function with `add_update()`."""
    strategy = distribution_strategy_context.get_strategy()

    # Updating a TPU metric variable from a replica context is only valid if
    # the variable was created inside the TPU strategy's scope.
    if backend.is_tpu_strategy(strategy):
      for metric_weight in metric_obj.weights:
        if (not strategy.extended.variable_created_in_scope(metric_weight) and
            not distribution_strategy_context.in_cross_replica_context()):
          raise ValueError(
              'Trying to run metric.update_state in replica context when '
              'the metric was not created in TPUStrategy scope. '
              'Make sure the keras Metric is created in TPUstrategy scope. ')

    with tf_utils.graph_context_for_symbolic_tensors(*args, **kwargs):
      update_op = update_state_fn(*args, **kwargs)
    if update_op is not None:  # update_op will be None in eager execution.
      metric_obj.add_update(update_op)
    return update_op
Exemplo n.º 16
0
def unwrap_outputs(distribution_strategy,
                   grouped_outputs,
                   with_loss_tensor=False):
    """Unwrap the list of outputs contained in the PerReplica parameters.

  This function calls `flatten_per_replica_values` to parse each of the input
  parameters into a list of outputs on the different devices. If we set
  `with_loss_tensor` to be True, we also call `reduce` on the list of losses on
  the different devices to give us one loss tensor.

  Args:
    distribution_strategy: DistributionStrategy used to distribute training and
        validation.
    grouped_outputs: PerReplica outputs returned from the train or test function
        that we ran on each device.
    with_loss_tensor: Boolean that indicates if we need to add the reduced loss
        tensor as one of the outputs.

  Returns:
    Values of each of the PerReplica outputs.

  """
    # Without a loss tensor, every output is just flattened across devices.
    if not with_loss_tensor:
        return flatten_per_replica_values(distribution_strategy,
                                          grouped_outputs)

    if not isinstance(grouped_outputs, list):
        grouped_outputs = [grouped_outputs]
    # The first entry is the loss; reduce it across replicas before adding it
    # to the list of fetches.
    reduced_loss = distribution_strategy.reduce(reduce_util.ReduceOp.SUM,
                                                grouped_outputs[0],
                                                axis=None)
    remaining = flatten_per_replica_values(distribution_strategy,
                                           grouped_outputs[1:])
    if (backend.is_tpu_strategy(distribution_strategy)
            and ops.executing_eagerly_outside_functions()):
        # All TPU replicas produce the same output, so keep one value per
        # replica group. Eager-only for now: this function is shared with the
        # graph path, which does not use experimental_run, and the slicing
        # goes away once the two code paths converge.
        remaining = remaining[::distribution_strategy.num_replicas_in_sync]
    return [reduced_loss] + remaining
Exemplo n.º 17
0
  def evaluate(self,
               model,
               x=None,
               y=None,
               batch_size=None,
               verbose=1,
               sample_weight=None,
               steps=None,
               callbacks=None,
               **kwargs):
    """Evaluate loop for Distribution Strategies.

    Args:
      model: Keras model with `_distribution_strategy` attached.
      x: Input data to evaluate on.
      y: Optional targets.
      batch_size: Optional global batch size; validated/inferred when None.
      verbose: Verbosity mode forwarded to the test loop.
      sample_weight: Optional sample weights.
      steps: Optional number of evaluation steps; must be given or inferable
        for graph-mode TPU execution.
      callbacks: Optional list of callbacks to run during evaluation.
      **kwargs: Unused; accepted for call-site compatibility.

    Returns:
      The result of the selected evaluation loop.
    """
    dist_utils.validate_inputs(x, y)
    # Resolve batch size and step count against the strategy configuration.
    batch_size, steps = dist_utils.process_batch_and_step_size(
        model._distribution_strategy, x, batch_size, steps, ModeKeys.TEST)
    batch_size = model._validate_or_infer_batch_size(batch_size, steps, x)
    dataset = model._distribution_standardize_user_data(
        x, y,
        sample_weight=sample_weight,
        batch_size=batch_size,
        allow_partial_batch=True)

    if K.is_tpu_strategy(model._distribution_strategy):
      steps = training_utils_v1.infer_steps_for_dataset(
          model, dataset, steps, steps_name='steps')
      if steps is None:
        raise ValueError('Number of steps could not be inferred from the data, '
                         'please pass the steps argument.')

      if not context.executing_eagerly():
        # Run TPU evaluation in a custom loop in graph mode.
        return experimental_tpu_test_loop(
            model, dataset, verbose=verbose, steps=steps, callbacks=callbacks)

    return training_arrays_v1.test_loop(
        model,
        inputs=dataset,
        batch_size=batch_size,
        verbose=verbose,
        steps=steps,
        callbacks=callbacks)
    def __init__(self,
                 is_training,
                 depth_multiplier,
                 min_depth,
                 pad_to_multiple,
                 conv_hyperparams,
                 freeze_batchnorm,
                 inplace_batchnorm_update,
                 bifpn_min_level,
                 bifpn_max_level,
                 bifpn_num_iterations,
                 bifpn_num_filters,
                 bifpn_combine_method,
                 efficientnet_version,
                 use_explicit_padding=None,
                 use_depthwise=None,
                 override_base_feature_extractor_hyperparams=None,
                 name=None):
        """SSD Keras-based EfficientNetBiFPN (EfficientDet) feature extractor.

    Args:
      is_training: whether the network is in training mode.
      depth_multiplier: unsupported by EfficientNetBiFPN. float, depth
        multiplier for the feature extractor.
      min_depth: minimum feature extractor depth.
      pad_to_multiple: the nearest multiple to zero pad the input height and
        width dimensions to.
      conv_hyperparams: a `hyperparams_builder.KerasLayerHyperparams` object
        containing convolution hyperparameters for the layers added on top of
        the base feature extractor.
      freeze_batchnorm: whether to freeze batch norm parameters during training
        or not. When training with a small batch size (e.g. 1), it is desirable
        to freeze batch norm update and use pretrained batch norm params.
      inplace_batchnorm_update: whether to update batch norm moving average
        values inplace. When this is false train op must add a control
        dependency on tf.graphkeys.UPDATE_OPS collection in order to update
        batch norm statistics.
      bifpn_min_level: the highest resolution feature map to use in BiFPN. The
        valid values are {2, 3, 4, 5} which map to Resnet blocks {1, 2, 3, 4}
        respectively.
      bifpn_max_level: the smallest resolution feature map to use in the BiFPN.
        BiFPN constructions uses features maps starting from bifpn_min_level
        upto the bifpn_max_level. In the case that there are not enough feature
        maps in the backbone network, additional feature maps are created by
        applying stride 2 convolutions until we get the desired number of BiFPN
        levels.
      bifpn_num_iterations: number of BiFPN iterations. Overrided if
        efficientdet_version is provided.
      bifpn_num_filters: number of filters (channels) in all BiFPN layers.
        Overrided if efficientdet_version is provided.
      bifpn_combine_method: the method used to combine BiFPN nodes.
      efficientnet_version: the EfficientNet version to use for this feature
        extractor's backbone.
      use_explicit_padding: unsupported by EfficientNetBiFPN. Whether to use
        explicit padding when extracting features.
      use_depthwise: unsupported by EfficientNetBiFPN, since BiFPN uses regular
        convolutions when inputs to a node have a differing number of channels,
        and use separable convolutions after combine operations.
      override_base_feature_extractor_hyperparams: Whether to override the
        efficientnet backbone's default weight decay with the weight decay
        defined by `conv_hyperparams`. Note, only overriding of weight decay is
        currently supported.
      name: a string name scope to assign to the model. If 'None', Keras will
        auto-generate one from the class name.
    """
        super(SSDEfficientNetBiFPNKerasFeatureExtractor,
              self).__init__(is_training=is_training,
                             depth_multiplier=depth_multiplier,
                             min_depth=min_depth,
                             pad_to_multiple=pad_to_multiple,
                             conv_hyperparams=conv_hyperparams,
                             freeze_batchnorm=freeze_batchnorm,
                             inplace_batchnorm_update=inplace_batchnorm_update,
                             use_explicit_padding=None,
                             use_depthwise=None,
                             override_base_feature_extractor_hyperparams=
                             override_base_feature_extractor_hyperparams,
                             name=name)
        # Fail fast on SSD options that EfficientNetBiFPN does not support.
        if depth_multiplier != 1.0:
            raise ValueError(
                'EfficientNetBiFPN does not support a non-default '
                'depth_multiplier.')
        if use_explicit_padding:
            raise ValueError(
                'EfficientNetBiFPN does not support explicit padding.')
        if use_depthwise:
            raise ValueError(
                'EfficientNetBiFPN does not support use_depthwise.')

        self._bifpn_min_level = bifpn_min_level
        self._bifpn_max_level = bifpn_max_level
        self._bifpn_num_iterations = bifpn_num_iterations
        # BiFPN filter count is floored at min_depth.
        self._bifpn_num_filters = max(bifpn_num_filters, min_depth)
        self._bifpn_node_params = {'combine_method': bifpn_combine_method}
        self._efficientnet_version = efficientnet_version

        logging.info('EfficientDet EfficientNet backbone version: %s',
                     self._efficientnet_version)
        logging.info('EfficientDet BiFPN num filters: %d',
                     self._bifpn_num_filters)
        logging.info('EfficientDet BiFPN num iterations: %d',
                     self._bifpn_num_iterations)

        # Clamp to the deepest endpoint the backbone actually provides; levels
        # beyond this are synthesized by the BiFPN stage.
        self._backbone_max_level = min(
            max(_EFFICIENTNET_LEVEL_ENDPOINTS.keys()), bifpn_max_level)
        self._output_layer_names = [
            _EFFICIENTNET_LEVEL_ENDPOINTS[i]
            for i in range(bifpn_min_level, self._backbone_max_level + 1)
        ]
        self._output_layer_alias = [
            'level_{}'.format(i)
            for i in range(bifpn_min_level, self._backbone_max_level + 1)
        ]

        # Initialize the EfficientNet backbone.
        # Note, this is currently done in the init method rather than in the build
        # method, since doing so introduces an error which is not well understood.
        efficientnet_overrides = {'rescale_input': False}
        if override_base_feature_extractor_hyperparams:
            efficientnet_overrides[
                'weight_decay'] = conv_hyperparams.get_regularizer_weight()
        # Request the TPU batch-norm variant when sync batch norm is configured
        # and we are running under a TPU strategy.
        if (conv_hyperparams.use_sync_batch_norm() and
                keras_backend.is_tpu_strategy(tf.distribute.get_strategy())):
            efficientnet_overrides['batch_norm'] = 'tpu'
        efficientnet_base = efficientnet_model.EfficientNet.from_name(
            model_name=self._efficientnet_version,
            overrides=efficientnet_overrides)
        outputs = [
            efficientnet_base.get_layer(output_layer_name).output
            for output_layer_name in self._output_layer_names
        ]
        self._efficientnet = tf.keras.Model(inputs=efficientnet_base.inputs,
                                            outputs=outputs)
        self.classification_backbone = efficientnet_base
        # The BiFPN stage is constructed lazily in build().
        self._bifpn_stage = None
Exemplo n.º 19
0
def get_input_params(distribution_strategy,
                     num_samples,
                     steps,
                     batch_size,
                     mode=None):
    """Calculate the number of batches and steps/steps_per_epoch.

  Args:
    distribution_strategy: The DistributionStrategy used to compile the model.
    num_samples: The number of samples from which we determine the batch size
      and steps.
    steps:  The specified number of steps.
    batch_size: The specified batch_size.
    mode: ModeKey representing whether input will be used for training,
      evaluation, or prediction. This is used to relax the constraints on
      consuming all the training samples to keep compatibility till we support
      partial batches. If none, then partial batches are not allowed.

  Returns:
    steps: The steps or steps_per_epoch argument depending on if a user is
        calling `fit`, `evaluate` or `predict`. If the is_training flag is set
        we don't require the number of samples to be used completely.
    batch_size: The batch size to be used in model iterations.

  Raises:
    ValueError: If the number of batches or steps evaluates to 0.

  """
    # TODO(b/118776054): Use global batch size for Keras/DS support.
    # Currently this is only supported in TPUStrategy and CoreMirroredStrategy.
    use_per_replica_batch = not dist_utils.global_batch_size_supported(
        distribution_strategy)

    # TODO(b/128995245): In eager mode, uneven batch sizes are allowed except
    # for `fit()` on TPUStrategy. In graph mode, the zero batch case in batch
    # norm is not handled due to XLA-GPU regression, so uneven batch sizes are
    # not allowed except for `test()` and `predict()` on TPUStrategy.
    on_tpu = backend.is_tpu_strategy(distribution_strategy)
    if context.executing_eagerly():
        allow_partial_batch = mode != ModeKeys.TRAIN or not on_tpu
    else:
        allow_partial_batch = (mode == ModeKeys.TRAIN or
                               (mode in (ModeKeys.PREDICT, ModeKeys.TEST) and
                                on_tpu))

    if steps is None:
        if batch_size is None:
            # Neither batch size nor steps were given: default the global batch
            # to min(num_samples, 32); 32 preserves backward compatibility.
            global_batch = min(num_samples, 32)
        else:
            # A user-provided batch size may be per-replica; scale it up to the
            # global batch size where the strategy requires it.
            global_batch = batch_size
            if use_per_replica_batch:
                global_batch *= distribution_strategy.num_replicas_in_sync
        if allow_partial_batch:
            steps = np.ceil(num_samples / global_batch).astype(int)
        elif num_samples % global_batch:
            raise ValueError('The number of samples %s is not divisible by '
                             'batch size %s.' % (num_samples, global_batch))
        else:
            steps = num_samples // global_batch
    else:
        if batch_size is None:
            # Derive the batch size from the requested number of steps.
            if num_samples % steps:
                raise ValueError('The number of samples %s is not divisible by '
                                 'steps %s. Please change the number of steps to a '
                                 'value that can consume all the samples' %
                                 (num_samples, steps))
            global_batch = num_samples // steps
        else:
            # A user-provided batch size may be per-replica; scale it up to the
            # global batch size where the strategy requires it.
            global_batch = batch_size
            if use_per_replica_batch:
                global_batch *= distribution_strategy.num_replicas_in_sync

            # With partial batches only the first steps-1 batches must be full.
            required_samples = global_batch * steps
            if allow_partial_batch:
                required_samples = global_batch * (
                    steps - 1) + 1 if steps > 1 else 0

            if num_samples < required_samples:
                raise ValueError('Number of samples %s is less than samples required '
                                 'for specified batch_size %s and steps %s' %
                                 (num_samples, global_batch, steps))

    # Return the per-replica or global batch size based on the strategy.
    if use_per_replica_batch:
        if global_batch % distribution_strategy.num_replicas_in_sync:
            raise ValueError(
                'The batch size (%s) could not be sharded evenly across the sync '
                'replicas (%s) in the distribution strategy.' %
                (global_batch,
                 distribution_strategy.num_replicas_in_sync))
        batch_size = global_batch // distribution_strategy.num_replicas_in_sync
    else:
        batch_size = global_batch

    return steps, batch_size
Exemplo n.º 20
0
def get_tolerance(save_distribution, restore_distribution):
    """Return the comparison tolerance, looser when either strategy is TPU."""
    strategies = (save_distribution, restore_distribution)
    if any(backend.is_tpu_strategy(s) for s in strategies):
        return _TPU_TOLERANCE
    return _TOLERANCE
Exemplo n.º 21
0
  def fit(self,
          model,
          x=None,
          y=None,
          batch_size=None,
          epochs=1,
          verbose=1,
          callbacks=None,
          validation_split=0.,
          validation_data=None,
          shuffle=True,
          class_weight=None,
          sample_weight=None,
          initial_epoch=0,
          steps_per_epoch=None,
          validation_steps=None,
          validation_freq=1,
          **kwargs):
    """Fit loop for Distribution Strategies.

    Validates inputs and callbacks, standardizes the training (and optional
    validation) data for the model's distribution strategy, then dispatches to
    either the graph-mode TPU fit loop or the generic array fit loop.

    Raises:
      ValueError: If `validation_split` is used with a distribution strategy,
        or TPU steps cannot be inferred.
    """
    dist_utils.validate_callbacks(input_callbacks=callbacks,
                                  optimizer=model.optimizer)
    dist_utils.validate_inputs(x, y)

    # Resolve batch size and steps against the strategy configuration.
    batch_size, steps_per_epoch = dist_utils.process_batch_and_step_size(
        model._distribution_strategy,
        x,
        batch_size,
        steps_per_epoch,
        ModeKeys.TRAIN,
        validation_split=validation_split)
    batch_size = model._validate_or_infer_batch_size(
        batch_size, steps_per_epoch, x)
    dataset = model._distribution_standardize_user_data(
        x, y,
        sample_weight=sample_weight,
        class_weight=class_weight,
        batch_size=batch_size,
        validation_split=validation_split,
        shuffle=shuffle,
        epochs=epochs)
    # The non-cloning path standardizes data inside the strategy scope.
    if not dist_utils.is_distributing_by_cloning(model):
      with model._distribution_strategy.scope():
        (dataset, _, _) = model._standardize_user_data(
            dataset,
            sample_weight=sample_weight,
            class_weight=class_weight,
            batch_size=batch_size,
            validation_split=validation_split,
            shuffle=shuffle)

    val_dataset = None
    if validation_data:
      val_x, val_y, val_sample_weights = (
          training_utils_v1.unpack_validation_data(validation_data))
      dist_utils.validate_inputs(val_x, val_y)
      _, validation_steps = dist_utils.process_batch_and_step_size(
          model._distribution_strategy, val_x, batch_size, validation_steps,
          ModeKeys.TEST)

      val_dataset = model._distribution_standardize_user_data(
          val_x, val_y,
          sample_weight=val_sample_weights,
          class_weight=None,
          batch_size=batch_size,
          validation_split=validation_split,
          shuffle=shuffle,
          allow_partial_batch=True)
    elif validation_split:
      raise ValueError('validation_split argument is not supported with '
                       'distribution strategies.')

    if K.is_tpu_strategy(model._distribution_strategy):
      steps_per_epoch = training_utils_v1.infer_steps_for_dataset(
          model, dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch')
      if steps_per_epoch is None:
        raise ValueError('Number of steps could not be inferred from the data, '
                         'please pass the steps_per_epoch argument.')

      if not context.executing_eagerly():
        # Run TPU training in a custom loop in graph mode.
        return experimental_tpu_fit_loop(
            model,
            dataset,
            epochs=epochs,
            verbose=verbose,
            callbacks=callbacks,
            val_dataset=val_dataset,
            initial_epoch=initial_epoch,
            steps_per_epoch=steps_per_epoch,
            validation_steps=validation_steps,
            validation_freq=validation_freq)

    return training_arrays_v1.fit_loop(
        model,
        dataset,
        batch_size=batch_size,
        epochs=epochs,
        verbose=verbose,
        callbacks=callbacks,
        val_inputs=val_dataset,
        shuffle=shuffle,
        initial_epoch=initial_epoch,
        steps_per_epoch=steps_per_epoch,
        validation_steps=validation_steps,
        validation_freq=validation_freq,
        steps_name='steps_per_epoch')