def set_weights(distribution_strategy, dist_model, weights):
  """Sets the weights of the replicated models.

  The weights of the replicated models are set to the weights of the original
  model. The weights of the replicated model are Mirrored variables and hence
  we need to use the `update` call within a DistributionStrategy scope.

  Args:
    distribution_strategy: DistributionStrategy used to distribute training
        and validation.
    dist_model: The replicated models on the different devices.
    weights: The weights of the original model.
  """
  assign_ops = []
  for layer in dist_model.layers:
    num_param = len(layer.weights)
    layer_weights = weights[:num_param]
    for sw, w in zip(layer.weights, layer_weights):
      if ops.executing_eagerly_outside_functions():
        sw.assign(w)
      else:
        assign_ops.append(distribution_strategy.unwrap(sw.assign(w)))
    weights = weights[num_param:]

  if not ops.executing_eagerly_outside_functions():
    K.get_session(assign_ops).run(assign_ops)
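
A minimal usage sketch of `set_weights`, assuming a compiled `model` whose `_grouped_model` has been populated under a `strategy` (the same pattern appears in `_experimental_fit_loop` below):

orig_model_weights = model.get_weights()
with strategy.scope():
    distributed_model = strategy.unwrap(model._grouped_model)[0]
    set_weights(strategy, distributed_model, orig_model_weights)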
Example #2
def setup_tpu_session(master):
  """Initializes and returns a Keras/TF session connected the TPU `master`."""
  session = tf_session.Session(
      target=master, config=config_pb2.ConfigProto(isolate_session_state=True))
  K.set_session(session)
  K.get_session().run(tpu.initialize_system())
  return session
def _get_var_for_numpy(distribution_strategy, input_array):
  """Creates a variable and assigns the value of the numpy array to it.

  Args:
    distribution_strategy: The DistributionStrategy used to compile the model.
    input_array: The input numpy array whose value will be assigned to the
      variable we create.

  Returns:
    The variable to which we will copy the value of the input numpy array.

  """
  with ops.device(get_cpu_device(distribution_strategy)):
    # Create and initialize a variable on the CPU device. This is the CPU
    # device of the host in the case of TPUDistributionStrategy.
    input_var = variables.VariableV1(array_ops.zeros(input_array.shape,
                                                     input_array.dtype),
                                     trainable=False, use_resource=True)
  K.get_session().run(input_var.initializer)

  # Create a placeholder for the numpy array input slices. We copy the value
  # of the input numpy array to the variable in slices of size 64 MB to avoid
  # running into memory issues or RPC message limits.
  start_placeholder = array_ops.placeholder(dtypes.int64, ())
  end_placeholder = array_ops.placeholder(dtypes.int64, ())
  slice_placeholder = array_ops.placeholder(input_var.dtype)
  assign_slice_op = input_var[start_placeholder:end_placeholder].assign(
      slice_placeholder)

  # If each batch element is > 64 MB, then we copy each batch element
  # individually. Otherwise, the slices will be < 128 MB. There might be padding
  # which might mean that the slices are 128 MB even if the size of the
  # tensor allocated is less than 128 MB.
  # This formula gives slices with size:
  # ceil(64 MB / byte size per batch element) bytes.
  # Using ceil() guarantees we get a number >= 1.

  # Calculate the size of each batch element.
  byte_size_per_batch_element = (
      np.prod(input_array.shape[1:]) * input_var.dtype.size)

  # Calculate number of elements we want to copy per slice.
  batch_size_per_slice = int(np.ceil((64 << 20) / byte_size_per_batch_element))

  # Copy slices of the above size starting at 0, except the last slice will be
  # smaller.
  start = 0
  limit = input_array.shape[0]
  while start < limit:
    end = min(start + batch_size_per_slice, limit)
    K.get_session().run(assign_slice_op, feed_dict={
        start_placeholder: start,
        end_placeholder: end,
        slice_placeholder: input_array[start:end]})
    start = end

  return input_var
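
A worked example of the slicing arithmetic above, with numpy's `itemsize` standing in for the TF `DType.size` attribute (shapes are illustrative):

import numpy as np

shape, itemsize = (1000, 512, 512), np.dtype(np.float32).itemsize
byte_size_per_batch_element = np.prod(shape[1:]) * itemsize  # 512 * 512 * 4 = 1 MiB
batch_size_per_slice = int(np.ceil((64 << 20) / byte_size_per_batch_element))
print(batch_size_per_slice)  # 64 -> slices [0:64), [64:128), ..., [960:1000)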
Example #4
 def _init_writer(self):
   """Sets file writer."""
   if context.executing_eagerly():
     self.writer = summary_ops_v2.create_file_writer(self.log_dir)
   elif self.write_graph:
     self.writer = tf_summary.FileWriter(self.log_dir, K.get_session().graph)
   else:
     self.writer = tf_summary.FileWriter(self.log_dir)
Example #5
  def set_model(self, model):
    """Sets Keras model and creates summary ops."""

    self.model = model
    self.sess = K.get_session()
    # only make histogram summary op if it hasn't already been made
    if self.histogram_freq and self.merged is None:
      for layer in self.model.layers:
        for weight in layer.weights:
          mapped_weight_name = weight.name.replace(':', '_')
          tf_summary.histogram(mapped_weight_name, weight)
          if self.write_images:
            w_img = array_ops.squeeze(weight)
            shape = K.int_shape(w_img)
            if len(shape) == 2:  # dense layer kernel case
              if shape[0] > shape[1]:
                w_img = array_ops.transpose(w_img)
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
            elif len(shape) == 3:  # convnet case
              if K.image_data_format() == 'channels_last':
                # switch to channels_first to display
                # every kernel as a separate image
                w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img,
                                        [shape[0], shape[1], shape[2], 1])
            elif len(shape) == 1:  # bias case
              w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
            else:
              # not possible to handle 3D convnets etc.
              continue

            shape = K.int_shape(w_img)
            assert len(shape) == 4 and shape[-1] in [1, 3, 4]
            tf_summary.image(mapped_weight_name, w_img)

        if self.write_grads:
          for weight in layer.trainable_weights:
            mapped_weight_name = weight.name.replace(':', '_')
            grads = model.optimizer.get_gradients(model.total_loss, weight)

            def is_indexed_slices(grad):
              return type(grad).__name__ == 'IndexedSlices'

            grads = [grad.values if is_indexed_slices(grad) else grad
                     for grad in grads]
            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)

        if hasattr(layer, 'output'):
          tf_summary.histogram('{}_out'.format(layer.name), layer.output)
    self.merged = tf_summary.merge_all()

    if self.write_graph:
      self.writer = self._writer_class(self.log_dir, self.sess.graph)
    else:
      self.writer = self._writer_class(self.log_dir)
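
The kernel-to-image reshaping above is easiest to follow on a concrete shape; a numpy sketch of the dense-kernel branch:

import numpy as np

w_img = np.zeros((10, 3))            # dense kernel with shape[0] > shape[1]
w_img = w_img.T                      # transpose so height <= width: (3, 10)
w_img = w_img.reshape(1, 3, 10, 1)   # [batch, height, width, channels]
assert w_img.ndim == 4 and w_img.shape[-1] in (1, 3, 4)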
Example #6
  def __call__(self, inputs):
    assert isinstance(inputs, list)

    # Strip sample weight from inputs
    if (self.execution_mode == model_fn_lib.ModeKeys.TRAIN or
        self.execution_mode == model_fn_lib.ModeKeys.EVAL):
      input_tensors = self.model._feed_inputs + self.model._feed_targets
      inputs = inputs[:len(input_tensors)]
    else:
      input_tensors = self.model._feed_inputs

    shard_inputs = self._split_tensors(inputs)
    del inputs  # To avoid accidental usage.

    # Compute an input specification (used to generate infeed enqueue and
    # dequeue operations).  We use the shape from our input array and the
    # dtype from our model.  A user may pass in a float64 for a float32
    # input: for model compatibility we still must generate a float32 infeed.
    input_specs = []

    # We use the shape and dtype from the first shard to compute the input
    # metadata (`input_specs`); all replicas have the same type and shape.
    for tensor, ary in zip(input_tensors, shard_inputs[0]):
      input_specs.append(
          tensor_spec.TensorSpec(ary.shape, tensor.dtype,
                                 _valid_name(tensor.name)))

    # XLA requires every operation in the graph has a fixed shape.  To
    # handle varying batch sizes we recompile a new sub-graph for each
    # unique input shape.
    shape_key = tuple([tuple(spec.shape.as_list()) for spec in input_specs])

    if shape_key not in self._compilation_cache:
      logging.info('New input shapes; (re-)compiling: mode=%s, %s',
                   self.execution_mode, input_specs)
      new_tpu_model_ops = self._specialize_model(input_specs)
      self._compilation_cache[shape_key] = new_tpu_model_ops
      self._test_model_compiles(new_tpu_model_ops)

    tpu_model_ops = self._compilation_cache[shape_key]

    infeed_dict = {}
    for infeed_tensors, inputs in zip(tpu_model_ops.infeed_tensors,
                                      shard_inputs):
      for tensor, value in zip(infeed_tensors, inputs):
        infeed_dict[tensor] = value

    session = K.get_session()
    _, _, outfeed_outputs = session.run([
        tpu_model_ops.infeed_op, tpu_model_ops.execute_op,
        tpu_model_ops.outfeed_op
    ], infeed_dict)

    # TODO(xiejw): Decide how to reduce outputs, or just discard all but first.
    return outfeed_outputs[:len(outfeed_outputs) // self.num_replicas]
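
The shape-keyed compilation cache above is a general memoization pattern; stripped of the TPU machinery it reduces to roughly this sketch (`specialize_fn` is a stand-in for `_specialize_model`):

_compilation_cache = {}

def _get_or_compile(input_specs, specialize_fn):
    key = tuple(tuple(spec.shape.as_list()) for spec in input_specs)
    if key not in _compilation_cache:
        _compilation_cache[key] = specialize_fn(input_specs)  # compile once per shape
    return _compilation_cache[key]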
Example #7
 def tpu_session(self):
   """Yields a TPU session and sets it as the default Keras session."""
   with self._graph.as_default():
     default_session = K.get_session()
     # N.B. We have to call `K.set_session()` AND set our session as the
     # TF default. `K.get_session()` surprisingly does not return the value
     # supplied by K.set_session otherwise.
     K.set_session(self._session)
     with self._session.as_default():
       yield self._session
     K.set_session(default_session)
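
`tpu_session` presumably carries a `contextlib.contextmanager` decorator upstream of this snippet; the save/set/restore dance in isolation, as a standalone sketch:

import contextlib
from tensorflow.python.keras import backend as K

@contextlib.contextmanager
def _as_default_keras_session(session, graph):
  """Temporarily installs `session` as both the Keras and TF default."""
  with graph.as_default():
    previous = K.get_session()
    K.set_session(session)
    with session.as_default():
      yield session
    K.set_session(previous)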
Example #8
def setup_tpu_session(tpu_name_or_address):
  """Initializes and returns a Keras/TF session connected the TPU `master`.

  Args:
    tpu_name_or_address: A string that is either the name of the Cloud TPU,
      the grpc address of the Cloud TPU, or (Googlers only) the BNS name of the
      Cloud TPU. If tpu_name_or_address is None, the TPUClusterResolver will
      examine the environment to determine a potential Cloud TPU to use.

  Returns:
    A `tf.Session`.
  """
  cluster_resolver = tpu_cluster_resolver.TPUClusterResolver(
      tpu_name_or_address)
  cluster_spec = cluster_resolver.cluster_spec()
  session = tf_session.Session(
      target=cluster_resolver.master(),
      config=config_pb2.ConfigProto(
          isolate_session_state=True))
  if cluster_spec:
    session.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
  K.set_session(session)
  K.get_session().run(tpu.initialize_system())
  return session
Example #9
  def _test_model_compiles(self, tpu_model_ops):
    """Verifies that the given TPUModelOp can be compiled via XLA."""
    logging.info('Started compiling')
    start_time = time.time()

    result = K.get_session().run(tpu_model_ops.compile_op)
    proto = tpu_compilation_result.CompilationResultProto()
    proto.ParseFromString(result)
    if proto.status_error_message:
      raise RuntimeError('Compilation failed: {}'.format(
          proto.status_error_message))

    end_time = time.time()
    logging.info('Finished compiling. Time elapsed: %s secs',
                 end_time - start_time)
Example #10
def shutdown_tpu_session(session=None):
  """Shutdown the TPU attached to session.

  This should be called to cleanly shut down the TPU system before the client
  exits.

  Args:
    session: Session to shut down, or None to use the default session.

  """
  if session is None:
    session = K.get_session()

  session.run(tpu.shutdown_system())
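
Taken together with `setup_tpu_session` above, the intended lifecycle is presumably (a sketch; the TPU name is illustrative):

session = setup_tpu_session('my-tpu')  # runs tpu.initialize_system()
try:
    pass  # ... compile and fit the Keras model here ...
finally:
    shutdown_tpu_session(session)      # runs tpu.shutdown_system()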
Example #11
def _experimental_fit_loop(
    model,
    iterator,
    epochs=100,
    initial_epoch=0,
    steps_per_epoch=None):
  """fit function when using TPU DistributionStrategy for training.

  Arguments:
      model: Keras Model instance.
      iterator: Iterator that returns inputs and targets
      epochs: Number of times to iterate over the data
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Ignored with the default value of `None`.

  Returns:
      Returns `None`.

  Raises:
      ValueError: in case of invalid arguments.
  """
  current_strategy = model._distribution_strategy

  # TODO(priyag): Add validation that shapes are fully defined for TPU case.

  # TODO(priyag, sourabhbajaj): This should be moved into a callback instead.
  K.get_session().run(current_strategy.initialize())

  def _per_device_train_function(model):
    model._make_train_function()
    return (model.train_function.inputs,
            model.train_function.outputs,
            model.train_function.updates_op,
            model.train_function.session_kwargs)

  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
  K.set_learning_phase(1)

  def step_fn(ctx, inputs, targets):
    """Clones the model and calls make_train_function."""
    # TODO(priyag, sourabhbajaj): Should cache this keyed on input shapes.
    clone_model_on_towers(
        model,
        current_strategy,
        make_callback_model=True,
        inputs=inputs,
        targets=targets)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.call_for_each_tower(
         _per_device_train_function, model._grouped_model)
    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs,
         grouped_updates, grouped_session_args, with_loss_tensor=True)
    combined_fn = K.Function(
        all_inputs, all_outputs,
        updates=all_updates,
        name='distributed_train_function',
        **all_session_args)

    # TODO(priyag, sourabhbajaj): Perhaps the aggregation type needs to be
    # something else for different outputs.
    out_labels = model.metrics_names or []
    for label, output in zip(out_labels, combined_fn.outputs):
      ctx.set_last_step_output(label, output,
                               aggregation=distribute_lib.get_loss_reduction())

    # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
    # feed_dict, session kwargs, run options, run_metadata for now. These should
    # be handled appropriately
    return combined_fn.updates_op

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name, tensor in zip(model.metrics_names[1:], model.metrics_tensors):
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  with current_strategy.scope():
    # TODO(priyag, sourabhbajaj): Adjust steps_per_run appropriately based on
    # steps_per_epoch and number of epochs.
    ctx = current_strategy.run_steps_on_dataset(
        step_fn, iterator, iterations=current_strategy.steps_per_run,
        initial_loop_values=initial_loop_values)

  train_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  # Copy the weights from the original model to each of the replicated models.
  orig_model_weights = model.get_weights()
  with current_strategy.scope():
    distributed_model = current_strategy.unwrap(model._grouped_model)[0]
    distributed_training_utils.set_weights(
        current_strategy, distributed_model, orig_model_weights)

  assert steps_per_epoch is not None

  # TODO(priyag, sourabhbajaj): Add callbacks support.
  # TODO(priyag, sourabhbajaj): Add validation.
  for epoch in range(initial_epoch, epochs):
    for step_index in range(
        0, steps_per_epoch, current_strategy.steps_per_run):
      try:
        _, outs = K.get_session().run([train_op, output_tensors])
        # TODO(priyag, sourabhbajaj): Remove this logging in favor of proper
        # summaries through callbacks.
        print('Epoch: {}, step_index: {}, loss: {}'.format(
            epoch, step_index, outs['loss']))
        for label, out in outs.items():
          print(label, ': ', out)
      except errors.OutOfRangeError:
        logging.warning('Your dataset iterator ran out of data; '
                        'interrupting training. Make sure that your dataset '
                        'can generate at least `steps_per_epoch * epochs` '
                        'batches (in this case, %d batches).' %
                        (steps_per_epoch * epochs))
        break

  # Copy the weights back from the replicated model to the original model.
  with current_strategy.scope():
    updated_weights = current_strategy.unwrap(
        model._grouped_model)[0].get_weights()
    model.set_weights(updated_weights)

  K.get_session().run(current_strategy.finalize())
Example #12
                    default="model_data/yolo_anchors.txt")
parser.add_argument('-m',
                    '--model',
                    type=str,
                    help='Load a pretrained model from h5 file',
                    default="model_data/yolo.h5")
parser.add_argument('-i',
                    '--image_file',
                    type=str,
                    help='Test image file',
                    default="images/test.jpg")

args = parser.parse_args()

if __name__ == '__main__':
    sess = K.get_session()

    class_names = read_classes(args.classes)
    anchors = read_anchors(args.anchors)
    image_shape = tf.constant([720., 1280.])

    yolo_model = load_model(args.model)

    yolo_outputs = yolo_head(yolo_model.output, anchors, len(class_names))
    boxes, scores, classes = yolo_eval(yolo_outputs, image_shape)
    image, image_data = preprocess_image(args.image_file,
                                         model_image_size=(608, 608))
    out_scores, out_boxes, out_classes = sess.run(
        [scores, boxes, classes],
        feed_dict={yolo_model.input: image_data,
                   K.learning_phase(): 0})  # learning phase 0: inference mode
Example #13
  def on_epoch_end(self, epoch, logs=None):
    """Checks if summary ops should run next epoch, logs scalar summaries."""

    # don't output batch_size and
    # batch number as TensorBoard summaries
    logs = {('epoch_' + k): v
            for k, v in logs.items()
            if k not in ['batch', 'size', 'num_steps']}
    if self.update_freq == 'epoch':
      step = epoch
    else:
      step = self._samples_seen
    self._write_custom_summaries(step, logs)

    # pop the histogram summary op after each epoch
    if self.histogram_freq:
      # pylint: disable=protected-access
      if self.merged in self.model.test_function.fetches:
        self.model.test_function.fetches.remove(self.merged)
      if self.merged in self.model.test_function.fetch_callbacks:
        self.model.test_function.fetch_callbacks.pop(self.merged)
      # pylint: enable=protected-access

    if self.embeddings_data is None and self.embeddings_freq:
      raise ValueError('To visualize embeddings, embeddings_data must '
                       'be provided.')

    if self.embeddings_freq and self.embeddings_data is not None:
      if epoch % self.embeddings_freq == 0:
        # We need a second forward-pass here because we're passing
        # the `embeddings_data` explicitly. This design allows to pass
        # arbitrary data as `embeddings_data` and results from the fact
        # that we need to know the size of the `tf.Variable`s which
        # hold the embeddings in `set_model`. At this point, however,
        # the `validation_data` is not yet set.

        embeddings_data = self.embeddings_data
        n_samples = embeddings_data[0].shape[0]
        i = 0
        sess = K.get_session()
        while i < n_samples:
          step = min(self.batch_size, n_samples - i)
          batch = slice(i, i + step)

          if isinstance(self.model.input, list):
            feed_dict = {
                model_input: embeddings_data[idx][batch]
                for idx, model_input in enumerate(self.model.input)
            }
          else:
            feed_dict = {self.model.input: embeddings_data[0][batch]}

          feed_dict.update({self.batch_id: i, self.step: step})

          if not isinstance(K.learning_phase(), int):
            feed_dict[K.learning_phase()] = False

          sess.run(self.assign_embeddings, feed_dict=feed_dict)
          self.saver.save(sess,
                          os.path.join(self.log_dir, 'keras_embedding.ckpt'),
                          epoch)

          i += self.batch_size
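
The embedding loop above walks `embeddings_data` in `batch_size` chunks with a shorter final step; the same slicing pattern in isolation:

def iter_batches(n_samples, batch_size):
    """Yields slices covering [0, n_samples) in chunks of batch_size."""
    i = 0
    while i < n_samples:
        step = min(batch_size, n_samples - i)
        yield slice(i, i + step)
        i += batch_size

assert [(s.start, s.stop) for s in iter_batches(10, 4)] == [(0, 4), (4, 8), (8, 10)]
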
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
    """Fit loop for training with TPU tf.distribute.Strategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Ignored with the default value of `None`.
      val_dataset: Dataset for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Ignored with the default value of `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
          `collections.abc.Container` instance (e.g. list, tuple, etc.). If an
          integer, specifies how many training epochs to run before a new
          validation run is performed, e.g. `validation_freq=2` runs
          validation every 2 epochs. If a Container, specifies the epochs on
          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
          validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
      Returns `None`.

  Raises:
      ValueError: in case of invalid arguments.
  """
    mode = ModeKeys.TRAIN

    current_strategy = model._distribution_strategy
    iteration_value = min(steps_per_epoch,
                          current_strategy.extended.steps_per_run)
    steps_per_run = K.variable(value=iteration_value,
                               dtype='int32',
                               name='steps_per_run')

    # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
    iterator = dist_utils.get_iterator(dataset, current_strategy)

    scope = dist_utils.distributed_scope(strategy=current_strategy,
                                         learning_phase=1)
    scope.__enter__()

    out_labels = model.metrics_names or []

    step_fn = _make_train_step_fn(model, ModeKeys.TRAIN, current_strategy,
                                  out_labels)

    # Add initial dummy values for loss and other metric tensors.
    initial_loop_values = {}
    initial_loop_values['loss'] = constant_op.constant(1e7)
    for m in model._get_training_eval_metrics():
        tensor = m.result()
        initial_loop_values[m.name] = array_ops.zeros(tensor.shape,
                                                      tensor.dtype)

    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn,
        iterator,
        iterations=steps_per_run,
        initial_loop_values=initial_loop_values)
    train_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    do_validation = bool(validation_steps)

    if model._compile_distribution:
        dist_utils._copy_weights_to_distributed_model(model, mode)

    callbacks = cbks.configure_callbacks(callbacks,
                                         model,
                                         do_validation=do_validation,
                                         epochs=epochs,
                                         steps_per_epoch=steps_per_epoch,
                                         verbose=verbose,
                                         count_mode='steps',
                                         mode=mode)

    # Calculate the steps each time on the device.
    steps_to_run = (
        [current_strategy.extended.steps_per_run] *
        (steps_per_epoch // current_strategy.extended.steps_per_run))
    if steps_per_epoch % current_strategy.extended.steps_per_run:
        steps_to_run.append(steps_per_epoch %
                            current_strategy.extended.steps_per_run)
    target_steps = len(steps_to_run)

    callbacks._call_begin_hook(mode)

    initial_epoch = model._maybe_load_initial_epoch_from_ckpt(
        initial_epoch, mode)

    for epoch in range(initial_epoch, epochs):
        dist_utils._reset_metrics(model)
        callbacks.on_epoch_begin(epoch)
        epoch_logs = {}
        step_index = 0
        prev_step_count = None
        current_step = 0
        while current_step < target_steps:
            step_count = steps_to_run[current_step]
            batch_logs = {
                'batch': step_index,
                'size': 1,
                'num_steps': step_count
            }
            callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
            if prev_step_count is None or step_count != prev_step_count:
                K.get_session().run(steps_per_run.assign(step_count))
                prev_step_count = step_count
            try:
                _, outputs = K.batch_get_value([train_op, output_tensors])
            except errors.OutOfRangeError:
                logging.warning(
                    'Your dataset iterator ran out of data; '
                    'interrupting training. Make sure that your dataset '
                    'can generate at least `steps_per_epoch * epochs` '
                    'batches (in this case, %d batches).' %
                    (steps_per_epoch * epochs))
                break

            batch_logs.update(outputs)
            callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
            step_index = step_index + step_count
            current_step += 1

            if callbacks.model.stop_training:
                break

        if (do_validation and training_utils.should_run_validation(
                validation_freq, epoch)):
            logging.info('Running validation at fit epoch: %s', epoch)

            if model._compile_distribution:
                # Since we create a new clone from the original model we need to copy
                # the weights back to the original model before we can run validation.
                dist_utils._copy_weights_to_original_model(
                    model, ModeKeys.TRAIN)

            val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
                model,
                val_dataset,
                steps=validation_steps,
                verbose=verbose,
                callbacks=callbacks)
            if not isinstance(val_outs, list):
                val_outs = [val_outs]
            # Same labels assumed.
            for label, val_out in zip(out_labels, val_outs):
                epoch_logs['val_' + label] = val_out

        callbacks.on_epoch_end(epoch, epoch_logs)
        if callbacks.model.stop_training:
            break
    callbacks._call_end_hook(mode)

    if model._compile_distribution:
        # Copy the weights back from the replicated model to the original model.
        dist_utils._copy_weights_to_original_model(model, ModeKeys.TRAIN)
    scope.__exit__(None, None, None)
    return model.history
Example #15
 def _set_weights_v1(self, weights):
     feed_dict = {}
     for idx, tensor in enumerate(weights):
         feed_dict[self._placeholder_tensors[idx]] = tensor
     backend.get_session().run(self._assign_op, feed_dict)
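
`_set_weights_v1` assumes placeholders and a grouped assign op built ahead of time; the same placeholder-feed pattern end-to-end, as a self-contained TF1-compat sketch:

import numpy as np
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
var = tf.Variable(np.zeros((2, 2), np.float32))
placeholder = tf.placeholder(tf.float32, shape=(2, 2))
assign_op = var.assign(placeholder)

sess = tf.Session()
sess.run(var.initializer)
sess.run(assign_op, feed_dict={placeholder: np.ones((2, 2), np.float32)})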
Example #16
 def auc(y_true, y_pred):
     auc = tf.metrics.auc(y_true, y_pred)[1]
     K.get_session().run(tf.local_variables_initializer())
     return auc
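
`tf.metrics.auc` creates hidden local accumulator variables, which is why the snippet runs `tf.local_variables_initializer()` when the metric tensor is built; note the accumulators are never reset afterwards, so the reported value is a running AUC over every batch seen. A hypothetical call site (graph mode, `model` assumed):

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[auc])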
Example #17
def experimental_predict_loop(model, iterator, verbose=0, steps=None):
  """Predict loop for predicting with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      iterator: Iterator for input data.
      verbose: Integer, Verbosity mode 0 or 1.
      steps: Total number of steps (batches of samples)
          before declaring `_predict_loop` finished.
          Ignored with the default value of `None`.

  Returns:
      Array of predictions (if the model has a single output)
      or list of arrays of predictions
      (if the model has multiple outputs).
  """
  current_strategy = model._distribution_strategy
  scope = current_strategy.scope()
  scope.__enter__()

  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
  K.set_learning_phase(0)

  def _per_device_predict_function(model):
    model._make_predict_function()
    return (model.predict_function.inputs,
            model.predict_function.outputs,
            model.predict_function.updates_op,
            model.predict_function.session_kwargs)

  def step_fn(ctx, inputs):
    """Clones the model and calls make_predict_function."""
    if model._compile_distribution:
      clone_model_on_replicas(model, current_strategy,
                              make_callback_model=False, inputs=inputs,
                              mode=ModeKeys.PREDICT)
    else:
      _build_distributed_network(model, current_strategy, inputs,
                                 mode=ModeKeys.PREDICT)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_predict_function, args=(model._distributed_model_predict,))

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.function(
        all_inputs, all_outputs,
        updates=all_updates,
        name='distributed_predict_function',
        **all_session_args)

    for label, output in zip(model.output_names, combined_fn.outputs):
      ctx.set_last_step_output(label, output)

    return combined_fn.updates_op

  # Add initial dummy values for outputs.
  initial_loop_values = {}
  batch_dimension = distributed_training_utils.get_batch_dimension(iterator)
  for name, tensor in zip(model.output_names, model.outputs):
    # TODO(priyag): This is a workaround as we do not know the batch dimension
    # of the model's output at this point.
    shape = tensor_shape.TensorShape(tensor.shape.dims)
    shape.dims = [batch_dimension] + shape.dims[1:]
    initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype)

  with current_strategy.scope():
    # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn, iterator, iterations=1,
        initial_loop_values=initial_loop_values)

  predict_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  if verbose == 1:
    progbar = Progbar(target=steps)

  if model._compile_distribution:
    with current_strategy.scope():
      _copy_weights_to_distributed_model(
          model, model._distributed_model_predict)
  with current_strategy.scope():
    _reset_metrics(model, model._distributed_model_predict)

  assert steps is not None
  # Since we do not know how many samples we will see, we cannot pre-allocate
  # the returned Numpy arrays. Instead, we store one array per batch seen
  # and concatenate them upon returning.
  unconcatenated_outs = [[] for _ in model.outputs]
  for step in range(steps):
    _, batch_outs = K.get_session().run([predict_op, output_tensors])
    # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy.
    for i, label in enumerate(model.output_names):
      unconcatenated_outs[i].extend(batch_outs[label])
    if verbose >= 1:
      progbar.update(step + 1)

  scope.__exit__(None, None, None)
  if len(unconcatenated_outs) == 1:
    return np.concatenate(unconcatenated_outs[0], axis=0)
  return [
      np.concatenate(unconcatenated_outs[i], axis=0)
      for i in range(len(unconcatenated_outs))
  ]
Example #18
 def _restore_updates(self):
   """Recreates a dict of updates from the layer's weights."""
   data_dict = {}
   for name, var in self.state_variables.items():
     data_dict[name] = K.get_session().run(var)
   return data_dict
Example #19
 def _eval(self, tensor):
     if self.use_v1_apis:
         return K.get_session().run(tensor)
     else:
         return tensor.numpy()
import tensorflow as tf
import tensorflow.python.keras.backend as K
from tensorflow.python.framework.graph_util import convert_variables_to_constants
from tensorflow.keras.models import load_model


def freeze_graph(session, keep_var_names=None, output_names=None, clear_devices=True):
    graph = session.graph
    with graph.as_default():
        freeze_var_names = list(set(v.op.name for v in tf.compat.v1.global_variables()).difference(keep_var_names or []))
        output_names = output_names or []
        output_names += [v.op.name for v in tf.compat.v1.global_variables()]
        input_graph_def = graph.as_graph_def()
        if clear_devices:
            for node in input_graph_def.node:
                node.device = ''
        frozen_graph = convert_variables_to_constants(sess=session, input_graph_def=input_graph_def,
                                                      output_node_names=output_names,
                                                      variable_names_whitelist=freeze_var_names)
        return frozen_graph


# For an inference graph, set the learning phase to 0 *before* loading the
# model, so dropout/batch-norm layers are frozen with test-time behavior.
K.set_learning_phase(0)

model = load_model('Image_Classifier.h5')

frozen_graph = freeze_graph(K.get_session(), output_names=[model.output.op.name])
tf.io.write_graph(frozen_graph, '.', 'Image_Classifier.pb', False)
Example #21
def _get_available_devices():
  return [x.name for x in backend.get_session().list_devices()]
Example #22
        overwrite=True,
        include_optimizer=True,
        save_format=None
    )
    logger.info("Save the history")
    with open(args.output_history_file, 'wb') as file:
        pickle.dump(training_history.history, file)

    test_inputs = [X_test, test_preds]
    logger.info("Fit the model")
    predictions = model.predict(test_inputs)
    logger.error("Compute uncertainty metrics")
    logger.error("Compute mu pred. entropy")
    error, mu_entropy, pred_y = sess.run(predict_cross_entropy(y_test, test_preds))
    logger.error("Compute variation ratios")
    voted_pred = K.get_session().run(voting(predictions))
    logger.error("Compute beta pred. entropy")
    sampling_entropy_gal = sess.run(predict_dirichlet_entropy_gal(predictions))
    logger.error("Compute rejection measures")
    rejection_measures = np.array(
        [list(get_rejection_measures(pred_y, np.argmax(y_test, axis=1),
                                     np.argsort(sampling_entropy_gal),
                                     rejection_point))
         for rejection_point in range(1, pred_y.shape[0] - 10)])
    rejection_measures_baseline = np.array(
        [list(get_rejection_measures(pred_y, np.argmax(y_test, axis=1), np.argsort(mu_entropy),
                                     rejection_point))
         for rejection_point in range(1, pred_y.shape[0] - 10)])
    rejection_measures_voting = np.array(
        [list(get_rejection_measures(pred_y, np.argmax(y_test, axis=1), np.argsort(voted_pred),
                                     rejection_point))
         for rejection_point in range(1, pred_y.shape[0] - 10)])
    model.save(current_output_local+'/my_checkpoint_test', overwrite=True)  # creates an HDF5 file at this path
    #del model  # deletes the existing model
    #model = load_model('output/my_model.h5')

    ######## save method two https://jovianlin.io/saving-loading-keras-models/
    # Save the weights
    model.save_weights(current_output_local+'model_weights.h5', overwrite=True)

    # Save the model architecture
    with open(current_output_local+'model_architecture.json', 'w') as f:
        f.write(model.to_json())
    #######
    ############save method three#################
    saver = tf.train.Saver()
    sess = backend.get_session()
    saver.save(sess, current_output_local)

    model.save(current_output_local+'my_model.h5')
    #print(model.get_weights())


    ############    
### NOTE: reopening the saved model afterwards does not work:
##    del model
##    del saver
##
##    model = load_model('keras_model.hdf5')
##
##    saver = tf.train.Saver()
##    sess = backend.get_session()
Example #24
    if data_augmentation:
        checkpoint_dir += '_aug/'

    # Necessary: disable eager execution so K.get_session() can be used below.
    tf.compat.v1.disable_eager_execution()

    last_model = os.listdir(checkpoint_dir)[-1]
    chosen_model = 'Epoch_439_model.hp5'
    # chosen_model = last_model
    save_pb = True
    if save_pb:
        h5_path = checkpoint_dir + chosen_model
        model = tf.keras.models.load_model(h5_path, compile=False)
        # save pb
        with K.get_session() as sess:
            output_names = [out.op.name for out in model.outputs]
            input_graph_def = sess.graph.as_graph_def()
            for node in input_graph_def.node:
                node.device = ""
            graph = graph_util.remove_training_nodes(input_graph_def)
            graph_frozen = graph_util.convert_variables_to_constants(
                sess, graph, output_names)
            tf.io.write_graph(graph_frozen,
                              checkpoint_dir,
                              'model.pb',
                              as_text=False)
        logging.info("save pb successfully!")

    # Load Frozen graph
    pb_file = checkpoint_dir + 'model.pb'
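    # Reading the frozen graph back presumably looks like this (a sketch using
    # TF1-compat APIs; `pb_file` comes from the line above):
    graph_def = tf.compat.v1.GraphDef()
    with tf.io.gfile.GFile(pb_file, 'rb') as f:
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as loaded_graph:
        tf.compat.v1.import_graph_def(graph_def, name='')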
def _experimental_test_loop(model, iterator, verbose=0, steps=None,
                            initialize_finalize_strategy=True):
  """Test loop for evaluating with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      iterator: Iterator for input data.
      verbose: Integer, Verbosity mode 0 or 1.
      steps: Total number of steps (batches of samples)
          before declaring predictions finished.
          Ignored with the default value of `None`.
      initialize_finalize_strategy: Should the strategy initialize and finalize
          functions be called.

  Returns:
      Scalar loss (if the model has a single output and no metrics)
      or list of scalars (if the model has multiple outputs
      and/or metrics). The attribute `model.metrics_names` will give you
      the display labels for the outputs.
  """
  current_strategy = model._distribution_strategy
  if initialize_finalize_strategy:
    K.get_session().run(current_strategy.initialize())

  def _per_device_eval_function(model):
    model._make_eval_function()
    return (model._eval_function.inputs, model._eval_function.outputs,
            model._eval_function.updates_op,
            model._eval_function.session_kwargs)

  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
  K.set_learning_phase(0)

  def step_fn(ctx, inputs, targets):
    """Clones the model and calls make_eval_function."""
    # TODO(priyag, sourabhbajaj): The model gets cloned every time
    # fit/test/predict is called. We should look into caching this keyed on
    # input shapes.
    clone_model_on_replicas(
        model,
        current_strategy,
        make_callback_model=False,
        inputs=inputs,
        targets=targets,
        mode=_Mode.TEST)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.call_for_each_replica(
         _per_device_eval_function, args=(model._grouped_model_test,))

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.function(
        all_inputs, all_outputs,
        updates=all_updates,
        name='distributed_test_function',
        **all_session_args)

    for label, output in zip(model.metrics_names, combined_fn.outputs):
      if label == 'loss':
        aggregation = distribute_lib.get_loss_reduction()
      else:
        # We aggregate all other metrics using mean for now. This is temporary
        # workaround until new metrics are in place.
        aggregation = variable_scope.VariableAggregation.MEAN
      ctx.set_last_step_output(label, output, aggregation)

    return combined_fn.updates_op

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name, tensor in zip(model.metrics_names[1:], model.metrics_tensors):
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  with current_strategy.scope():
    # TODO(priyag): Use steps_per_run when we use new metrics as they will
    # allow handling metric computation at each step using variables.
    ctx = current_strategy.run_steps_on_dataset(
        step_fn, iterator, iterations=1,
        initial_loop_values=initial_loop_values)

  test_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  if verbose == 1:
    progbar = Progbar(target=steps)

  # Copy the weights from the original model to each of the replicated models.
  orig_model_weights = model.get_weights()
  with current_strategy.scope():
    distributed_model = current_strategy.unwrap(model._grouped_model_test)[0]
    distributed_training_utils.set_weights(
        current_strategy, distributed_model, orig_model_weights)

  assert steps is not None
  outs = [0.] * len(model.metrics_names)
  for step in range(steps):
    _, batch_outs = K.get_session().run([test_op, output_tensors])
    for i, label in enumerate(model.metrics_names):
      outs[i] += batch_outs[label]
    if verbose >= 1:
      progbar.update(step + 1)
  for i in range(len(outs)):
    outs[i] /= (steps)

  if initialize_finalize_strategy:
    K.get_session().run(current_strategy.finalize())

  if len(outs) == 1:
    return outs[0]
  return outs
Example #26
def experimental_tpu_predict_loop(model,
                                  dataset,
                                  verbose=0,
                                  steps=None,
                                  callbacks=None):
  """Predict loop for predicting with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, Verbosity mode 0 or 1.
      steps: Total number of steps (batches of samples)
          before declaring `_predict_loop` finished.
          Ignored with the default value of `None`.
      callbacks: List of callbacks to be called during training

  Returns:
      Array of predictions (if the model has a single output)
      or list of arrays of predictions
      (if the model has multiple outputs).
  """
  dataset_fully_shaped = (distributed_training_utils.
                          is_dataset_shape_fully_defined(dataset))
  padding_handler = None
  if not dataset_fully_shaped:
    # TODO(hongjunchoi): Investigate whether operations from
    # PartialBatchPaddingHandler are unnecessarily pruned out
    # during graph optimization.
    padding_handler = padding_util.PartialBatchPaddingHandler(
        model._feed_output_shapes)
    batched_dataset = input_lib._get_batched_dataset(dataset)
    batch_size, _, prefetch_buffer = input_lib._get_batched_dataset_attributes(
        batched_dataset)
    padding_handler.padded_batch_size = batch_size
    padding_handler.padding_mask = dataset.reduce(padding_handler.padding_mask,
                                                  padding_handler.update_mask)

    dataset = dataset.map(padding_handler.pad_batch)
    dataset = dataset.apply(batching.unbatch())
    # Upon this point, it is guaranteed that the dataset does not
    # have partial batches. Thus, we set `drop_remainder=True` to
    # get static shape information about the elements in the dataset.
    dataset = dataset.batch(batch_size, drop_remainder=True)

    if prefetch_buffer is not None:
      dataset = dataset.prefetch(prefetch_buffer)

  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)

  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=0)
  scope.__enter__()

  def _per_device_predict_function(model):
    model._make_predict_function()
    return (model.predict_function.inputs,
            model.predict_function.outputs,
            model.predict_function.updates_op,
            model.predict_function.session_kwargs)

  def step_fn(ctx, inputs):
    """Clones the model and calls make_predict_function."""
    if model._compile_distribution:
      distributed_training_utils.clone_model_on_replicas(
          model, current_strategy, ModeKeys.PREDICT, inputs=inputs)
    else:
      distributed_training_utils._build_distributed_network(
          model, current_strategy, ModeKeys.PREDICT, inputs)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_predict_function, args=(model._distributed_model_predict,))

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.function(
        all_inputs, all_outputs,
        updates=all_updates,
        name='distributed_predict_function',
        **all_session_args)

    for label, output in zip(model.output_names, combined_fn.outputs):
      ctx.set_last_step_output(label, output)

    return combined_fn.updates_op

  # Add initial dummy values for outputs.
  initial_loop_values = {}
  batch_dimension = distributed_training_utils.get_batch_dimension(iterator)
  for name, tensor in zip(model.output_names, model.outputs):
    # TODO(priyag): This is a workaround as we do not know the batch dimension
    # of the model's output at this point.
    shape = tensor_shape.TensorShape(tensor.shape.dims)
    shape.dims = [batch_dimension] + shape.dims[1:]
    initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype)

  # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=1,
      initial_loop_values=initial_loop_values)

  predict_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  if verbose == 1:
    progbar = Progbar(target=steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(
        model, ModeKeys.PREDICT)

  distributed_training_utils._reset_metrics(model)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=False,
      epochs=1,
      steps_per_epoch=steps,
      verbose=verbose,
      count_mode='steps',
      mode=ModeKeys.PREDICT)

  assert steps is not None
  # Since we do not know how many samples we will see, we cannot pre-allocate
  # the returned Numpy arrays. Instead, we store one array per batch seen
  # and concatenate them upon returning.
  unconcatenated_outs = [[] for _ in model.outputs]
  for step in range(steps):
    _, batch_outs = K.get_session().run([predict_op, output_tensors])
    # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy.
    for i, label in enumerate(model.output_names):
      unconcatenated_outs[i].extend(batch_outs[label])
    if verbose >= 1:
      progbar.update(step + 1)

  scope.__exit__(None, None, None)

  if len(unconcatenated_outs) == 1:
    prediction_result = np.concatenate(unconcatenated_outs[0], axis=0)
  else:
    prediction_result = [
        np.concatenate(unconcatenated_outs[i], axis=0)
        for i in range(len(unconcatenated_outs))
    ]

  if padding_handler:
    prediction_result = padding_handler.apply_mask(prediction_result)

  return prediction_result
Example #27
def load(path, compile=True, options=None):  # pylint: disable=redefined-builtin
  """Loads Keras objects from a SavedModel.

  Any Keras layer or model saved to the SavedModel will be loaded back
  as Keras objects. Other objects are loaded as regular trackable objects (same
  as `tf.saved_model.load`).

  Currently, Keras saving/loading only retains the Keras object's weights,
  losses, and call function.

  The loaded model can be re-compiled, but the original optimizer, compiled loss
  functions, and metrics are not retained. This is temporary, and `model.save`
  will soon be able to serialize compiled models.

  Args:
    path: Path to SavedModel.
    compile: If true, compile the model after loading it.
    options: Optional `tf.saved_model.LoadOptions` object that specifies
      options for loading from SavedModel.


  Returns:
    Object loaded from SavedModel.
  """
  # TODO(kathywu): Add saving/loading of optimizer, compiled losses and metrics.
  # TODO(kathywu): Add code to load from objects that contain all endpoints

  # Look for metadata file or parse the SavedModel
  metadata = saved_metadata_pb2.SavedMetadata()
  meta_graph_def = loader_impl.parse_saved_model(path).meta_graphs[0]
  object_graph_def = meta_graph_def.object_graph_def
  path_to_metadata_pb = os.path.join(path, constants.SAVED_METADATA_PATH)
  if gfile.Exists(path_to_metadata_pb):
    try:
      with gfile.GFile(path_to_metadata_pb, 'rb') as f:
        file_content = f.read()
      metadata.ParseFromString(file_content)
    except message.DecodeError as e:
      raise IOError('Cannot parse keras metadata {}: {}.'
                    .format(path_to_metadata_pb, str(e)))
  else:
    logging.warning('SavedModel saved prior to TF 2.5 detected when loading '
                    'Keras model. Please ensure that you are saving the model '
                    'with model.save() or tf.keras.models.save_model(), *NOT* '
                    'tf.saved_model.save(). To confirm, there should be a file '
                    'named "keras_metadata.pb" in the SavedModel directory.')
    _read_legacy_metadata(object_graph_def, metadata)

  if not metadata.nodes:
    # When there are no Keras objects, return the results from the core loader
    return tf_load.load(path, options=options)

  # Recreate layers and metrics using the info stored in the metadata.
  keras_loader = KerasObjectLoader(metadata, object_graph_def)
  keras_loader.load_layers(compile=compile)

  # Generate a dictionary of all loaded nodes.
  nodes_to_load = {'root': None}
  for node_id, loaded_node in keras_loader.loaded_nodes.items():
    nodes_to_load[keras_loader.get_path(node_id)] = loaded_node
  loaded = tf_load.load_partial(path, nodes_to_load, options=options)

  # Finalize the loaded layers and remove the extra tracked dependencies.
  keras_loader.finalize_objects()
  keras_loader.del_tracking()

  model = loaded['root']

  # pylint: disable=protected-access
  if isinstance(model, training_lib.Model) and compile:
    # TODO(kathywu): Use compiled objects from SavedModel, instead of
    # creating new objects from the training config.
    training_config = model._serialized_attributes['metadata'].get(
        'training_config', None)
    if training_config is not None:
      model.compile(**saving_utils.compile_args_from_training_config(
          training_config))
      saving_utils.try_build_compiled_arguments(model)
    else:
      logging.warning('No training configuration found in save file, so the '
                      'model was *not* compiled. Compile it manually.')
  # pylint: enable=protected-access

  # Force variables and resources to initialize.
  if not context.executing_eagerly():
    sess = backend.get_session()  # Variables are initialized by this call.
    sess.run(ops.get_collection(ops.GraphKeys.TABLE_INITIALIZERS))

  return model
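
A usage sketch for `load` (the path and input `x` are illustrative):

model = load('/tmp/my_saved_model', compile=True)
predictions = model.predict(x)  # `x` assumed to match the model's input signature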
Example #28
 def _run(self, op):
     if self.use_v1_apis:
         K.get_session().run(op)
Example #29
def initialize_iterator(iterator, distribution_strategy):
    with distribution_strategy.scope():
        init_op = control_flow_ops.group(iterator.initialize())
        if not context.executing_eagerly():
            K.get_session((init_op, )).run(init_op)
Example #30
    model.add(layers.Conv2DTranspose(1, (5, 5), strides=(2, 2), padding='same', use_bias=False, activation='tanh'))
    assert model.output_shape == (None, 28, 28, 1)

    return model



generator = make_generator_model()
noise = tf.random.normal([1, 100])

generated_image = generator(noise, training=False)



your_session = K.get_session()

array = generated_image[0, :, :, 0].eval(session=your_session)


plt.imshow(array, cmap='gray')
plt.show()




def discriminator_model():
    model = tf.keras.Sequential()
    model.add(layers.Conv2D(64, (5, 5), strides=(2, 2), padding='same',
                                     input_shape=[28, 28, 1]))
    model.add(layers.LeakyReLU())
Example #31
def experimental_test_loop(model,
                           iterator,
                           verbose=0,
                           steps=None):
  """Test loop for evaluating with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      iterator: Iterator for input data.
      verbose: Integer, Verbosity mode 0 or 1.
      steps: Total number of steps (batches of samples)
          before declaring predictions finished.
          Ignored with the default value of `None`.

  Returns:
      Scalar loss (if the model has a single output and no metrics)
      or list of scalars (if the model has multiple outputs
      and/or metrics). The attribute `model.metrics_names` will give you
      the display labels for the outputs.
  """
  current_strategy = model._distribution_strategy
  scope = current_strategy.scope()
  scope.__enter__()

  def _per_device_eval_function(model):
    model._make_eval_function()
    return (model._eval_function.inputs, model._eval_function.outputs,
            model._eval_function.updates_op,
            model._eval_function.session_kwargs)

  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
  K.set_learning_phase(0)

  def step_fn(ctx, inputs):
    """Clones the model and calls make_eval_function."""
    inputs, targets = inputs
    if model._compile_distribution:
      clone_model_on_replicas(model, current_strategy,
                              make_callback_model=False, inputs=inputs,
                              targets=targets, mode=ModeKeys.TEST)
    else:
      _build_distributed_network(model, current_strategy, inputs,
                                 targets, mode=ModeKeys.TEST)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_eval_function, args=(model._distributed_model_test,))

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.function(
        all_inputs, all_outputs,
        updates=all_updates,
        name='distributed_test_function',
        **all_session_args)

    for label, output in zip(model.metrics_names, combined_fn.outputs):
      if label == 'loss':
        reduce_op = distribute_lib.get_loss_reduction()
      else:
        # We reduce all other metrics using mean for now. This is temporary
        # workaround until new metrics are in place.
        reduce_op = ds_reduce_util.ReduceOp.MEAN
      ctx.set_last_step_output(label, output, reduce_op)

    return combined_fn.updates_op

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name in model.metrics_names[1:]:
    tensor = model._all_stateful_metrics_tensors[name]
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  with current_strategy.scope():
    # TODO(priyag): Use steps_per_run when we use new metrics as they will
    # allow handling metric computation at each step using variables.
    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn, iterator, iterations=1,
        initial_loop_values=initial_loop_values)

  test_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  if verbose == 1:
    progbar = Progbar(target=steps)

  if model._compile_distribution:
    with current_strategy.scope():
      _copy_weights_to_distributed_model(model, model._distributed_model_test)
  with current_strategy.scope():
    _reset_metrics(model, model._distributed_model_test)

  assert steps is not None
  outs = [0.] * len(model.metrics_names)
  for step in range(steps):
    _, batch_outs = K.get_session().run([test_op, output_tensors])
    for i, label in enumerate(model.metrics_names):
      if i == 0:
        # Loss is a stateless metric.
        outs[i] += batch_outs[label]
      else:
        # For all stateful metrics, the aggregation is handled by mirrored vars.
        outs[i] = batch_outs[label]

    if verbose >= 1:
      progbar.update(step + 1)

  scope.__exit__(None, None, None)
  if len(outs) > 0:
    outs[0] /= steps

  if len(outs) == 1:
    return outs[0]
  return outs
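A standalone sketch of the aggregation rule used in the loop above: the loss
(index 0) is a stateless metric, so its per-step values are summed and then
averaged over the number of steps, while the remaining metrics are stateful
and already aggregate internally, so only the last reported value is kept.
The names and values below are illustrative only.

metrics_names = ['loss', 'acc']
batch_results = [{'loss': 0.9, 'acc': 0.50},
                 {'loss': 0.7, 'acc': 0.55},
                 {'loss': 0.5, 'acc': 0.60}]
outs = [0.] * len(metrics_names)
for batch_outs in batch_results:
    for i, label in enumerate(metrics_names):
        if i == 0:
            outs[i] += batch_outs[label]   # stateless loss: accumulate
        else:
            outs[i] = batch_outs[label]    # stateful metric: keep latest
outs[0] /= len(batch_results)              # average the summed loss
print(outs)  # [0.7, 0.6]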
Example #32
def _experimental_fit_loop(model,
                           iterator,
                           epochs=100,
                           verbose=1,
                           callbacks=None,
                           initial_epoch=0,
                           steps_per_epoch=None):
    """Fit loop for training with TPU DistributionStrategy.

    Arguments:
        model: Keras Model instance.
        iterator: Iterator that returns inputs and targets
        epochs: Number of times to iterate over the data
        verbose: Integer, Verbosity mode, 0, 1 or 2
        callbacks: List of callbacks to be called during training
        initial_epoch: Epoch at which to start training
            (useful for resuming a previous training run)
        steps_per_epoch: Total number of steps (batches of samples)
            before declaring one epoch finished and starting the
            next epoch. Ignored with the default value of `None`.

    Returns:
        Returns `None`.

    Raises:
        ValueError: in case of invalid arguments.
    """
    current_strategy = model._distribution_strategy

    # TODO(priyag): Add validation that shapes are fully defined for TPU case.

    K.get_session().run(current_strategy.initialize())

    def _per_device_train_function(model):
        model._make_train_function()
        return (model.train_function.inputs, model.train_function.outputs,
                model.train_function.updates_op,
                model.train_function.session_kwargs)

    # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
    K.set_learning_phase(1)

    def step_fn(ctx, inputs, targets):
        """Clones the model and calls make_train_function."""
        # TODO(priyag, sourabhbajaj): The model gets cloned every time
        # fit/test/predict is called. We should look into caching this keyed on
        # input shapes.
        clone_model_on_towers(model,
                              current_strategy,
                              make_callback_model=True,
                              inputs=inputs,
                              targets=targets)

        (grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args) = current_strategy.call_for_each_tower(
             _per_device_train_function, model._grouped_model)
        (all_inputs, all_outputs, all_updates,
         all_session_args) = distributed_training_utils.unwrap_values(
             current_strategy, grouped_inputs, grouped_outputs,
             grouped_updates, grouped_session_args)
        combined_fn = K.Function(all_inputs,
                                 all_outputs,
                                 updates=all_updates,
                                 name='distributed_train_function',
                                 **all_session_args)

        out_labels = model.metrics_names or []
        for label, output in zip(out_labels, combined_fn.outputs):
            if label == 'loss':
                aggregation = distribute_lib.get_loss_reduction()
            else:
                # We aggregate all other metrics using mean for now. This is a
                # temporary workaround until new metrics are in place.
                aggregation = variable_scope.VariableAggregation.MEAN
            ctx.set_last_step_output(label, output, aggregation)

        # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
        # feed_dict, session kwargs, run options, run_metadata for now. These should
        # be handled appropriately
        return combined_fn.updates_op

    # Add initial dummy values for loss and other metric tensors.
    initial_loop_values = {}
    initial_loop_values['loss'] = constant_op.constant(1e7)
    for name, tensor in zip(model.metrics_names[1:], model.metrics_tensors):
        initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

    if steps_per_epoch is None:
        raise ValueError(
            'steps_per_epoch should be specified in the fit call.')
    steps_per_run_var = K.variable(value=min(steps_per_epoch,
                                             current_strategy.steps_per_run),
                                   dtype='int32',
                                   name='steps_per_run_var')

    with current_strategy.scope():
        ctx = current_strategy.run_steps_on_dataset(
            step_fn,
            iterator,
            iterations=steps_per_run_var,
            initial_loop_values=initial_loop_values)

    train_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    # Copy the weights from the original model to each of the replicated models.
    orig_model_weights = model.get_weights()
    with current_strategy.scope():
        distributed_model = current_strategy.unwrap(model._grouped_model)[0]
        distributed_training_utils.set_weights(current_strategy,
                                               distributed_model,
                                               orig_model_weights)
    callbacks = cbks.configure_callbacks(callbacks,
                                         model,
                                         do_validation=False,
                                         val_inputs=None,
                                         val_targets=None,
                                         epochs=epochs,
                                         steps_per_epoch=steps_per_epoch,
                                         verbose=verbose)
    # TODO(priyag, sourabhbajaj): Add callbacks support for per step callback
    # TODO(priyag, sourabhbajaj): Add validation.

    # Calculate the steps each time on the device.
    steps_to_run = [current_strategy.steps_per_run
                    ] * (steps_per_epoch // current_strategy.steps_per_run)
    if steps_per_epoch % current_strategy.steps_per_run:
        steps_to_run.append(steps_per_epoch % current_strategy.steps_per_run)

    callbacks.on_train_begin()
    for epoch in range(initial_epoch, epochs):
        callbacks.on_epoch_begin(epoch)
        epoch_logs = {}
        step_index = 0
        prev_step_count = None
        for step_count in steps_to_run:
            batch_logs = {
                'batch': step_index,
                'size': 1,
                'num_steps': step_count
            }
            callbacks.on_batch_begin(step_index, batch_logs)
            if prev_step_count is None or step_count != prev_step_count:
                steps_per_run_var.load(step_count, K.get_session())
                prev_step_count = step_count
            try:
                _, outputs = K.get_session().run([train_op, output_tensors])
            except errors.OutOfRangeError:
                logging.warning(
                    'Your dataset iterator ran out of data; '
                    'interrupting training. Make sure that your dataset '
                    'can generate at least `steps_per_epoch * epochs` '
                    'batches (in this case, %d batches).' %
                    (steps_per_epoch * epochs))
                break

            batch_logs.update(outputs)
            callbacks.on_batch_end(step_index, batch_logs)
            step_index = step_index + step_count
            if callbacks.model.stop_training:
                break

        callbacks.on_epoch_end(epoch, epoch_logs)
        if callbacks.model.stop_training:
            break
    callbacks.on_train_end()

    # Copy the weights back from the replicated model to the original model.
    with current_strategy.scope():
        updated_weights = current_strategy.unwrap(
            model._grouped_model)[0].get_weights()
        model.set_weights(updated_weights)

    K.get_session().run(current_strategy.finalize())
    return model.history
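The steps_to_run computation above is worth seeing in isolation: an epoch of
steps_per_epoch batches is split into device-side runs of at most
steps_per_run steps each, with one smaller final chunk for any remainder.
A minimal standalone sketch (the helper name is ours, not part of the module):

def compute_steps_to_run(steps_per_epoch, steps_per_run):
    chunks = [steps_per_run] * (steps_per_epoch // steps_per_run)
    if steps_per_epoch % steps_per_run:
        chunks.append(steps_per_epoch % steps_per_run)
    return chunks

print(compute_steps_to_run(10, 4))  # [4, 4, 2]
print(compute_steps_to_run(8, 4))   # [4, 4]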
Example #33
def experimental_fit_loop(model,
                          iterator,
                          epochs=100,
                          verbose=1,
                          callbacks=None,
                          initial_epoch=0,
                          steps_per_epoch=None,
                          val_iterator=None,
                          validation_steps=None):
  """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      iterator: Iterator that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Ignored with the default value of `None`.
      val_iterator: Iterator for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Ignored with the default value of `None`.

  Returns:
      Returns `None`.

  Raises:
      ValueError: in case of invalid arguments.
  """
  current_strategy = model._distribution_strategy
  scope = current_strategy.scope()
  scope.__enter__()

  def _per_device_fit_function(model):
    model._make_fit_function()
    return (model._fit_function.inputs, model._fit_function.outputs,
            model._fit_function.updates_op, model._fit_function.session_kwargs)

  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
  K.set_learning_phase(1)
  out_labels = model.metrics_names or []

  def step_fn(ctx, inputs):
    """Clones the model and calls make_fit_function."""
    inputs, targets = inputs
    if model._compile_distribution:
      clone_model_on_replicas(model, current_strategy,
                              make_callback_model=True, inputs=inputs,
                              targets=targets, mode=ModeKeys.TRAIN)
    else:
      _build_distributed_network(model, current_strategy, inputs,
                                 targets, mode=ModeKeys.TRAIN)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_fit_function, args=(model._distributed_model_train,))
    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs,
         grouped_updates, grouped_session_args)
    combined_fn = K.function(
        all_inputs,
        all_outputs,
        updates=all_updates,
        name='distributed_fit_function',
        **all_session_args)

    for label, output in zip(out_labels, combined_fn.outputs):
      if label == 'loss':
        reduce_op = distribute_lib.get_loss_reduction()
      else:
        # We reduce all other metrics using mean for now. This is a temporary
        # workaround until new metrics are in place.
        reduce_op = ds_reduce_util.ReduceOp.MEAN
      ctx.set_last_step_output(label, output, reduce_op)

    # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
    # feed_dict, session kwargs, run options, run_metadata for now. These should
    # be handled appropriately
    return combined_fn.updates_op

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name in model.metrics_names[1:]:
    tensor = model._all_stateful_metrics_tensors[name]
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  if steps_per_epoch is None:
    raise ValueError('`steps_per_epoch` should be specified when calling '
                     '`fit` on the model.')
  steps_per_run = K.variable(
      value=min(steps_per_epoch, current_strategy.extended.steps_per_run),
      dtype='int32',
      name='steps_per_run')

  with current_strategy.scope():
    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
        step_fn, iterator, iterations=steps_per_run,
        initial_loop_values=initial_loop_values)

  train_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  do_validation = bool(validation_steps)

  if model._compile_distribution:
    with current_strategy.scope():
      _copy_weights_to_distributed_model(model, model._distributed_model_train)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      verbose=verbose)

  # Calculate the steps each time on the device.
  steps_to_run = [current_strategy.extended.steps_per_run] * (
      steps_per_epoch // current_strategy.extended.steps_per_run)
  if steps_per_epoch % current_strategy.extended.steps_per_run:
    steps_to_run.append(
        steps_per_epoch % current_strategy.extended.steps_per_run)

  callbacks.on_train_begin()
  for epoch in range(initial_epoch, epochs):
    with current_strategy.scope():
      _reset_metrics(model, model._distributed_model_train)
    callbacks.on_epoch_begin(epoch)
    epoch_logs = {}
    step_index = 0
    prev_step_count = None
    for step_count in steps_to_run:
      batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
      callbacks.on_batch_begin(step_index, batch_logs)
      if prev_step_count is None or step_count != prev_step_count:
        steps_per_run.load(step_count, K.get_session())
        prev_step_count = step_count
      try:
        _, outputs = K.get_session().run([train_op, output_tensors])
      except errors.OutOfRangeError:
        logging.warning('Your dataset iterator ran out of data; '
                        'interrupting training. Make sure that your dataset '
                        'can generate at least `steps_per_epoch * epochs` '
                        'batches (in this case, %d batches).' %
                        (steps_per_epoch * epochs))
        break

      batch_logs.update(outputs)
      callbacks.on_batch_end(step_index, batch_logs)
      step_index = step_index + step_count
      if callbacks.model.stop_training:
        break

    if do_validation:
      logging.info('Running validation at fit epoch: %s', epoch)

      if model._compile_distribution:
        # Since we create a new clone from the original model we need to copy
        # the weights back to the original model before we can run validation.
        with current_strategy.scope():
          _copy_weights_to_original_model(
              model, model._distributed_model_train, ModeKeys.TRAIN)

      val_outs = experimental_test_loop(  # pylint: disable=undefined-variable
          model,
          val_iterator,
          steps=validation_steps,
          verbose=verbose)
      if not isinstance(val_outs, list):
        val_outs = [val_outs]
      # Same labels assumed.
      for label, val_out in zip(out_labels, val_outs):
        epoch_logs['val_' + label] = val_out

    callbacks.on_epoch_end(epoch, epoch_logs)
    if callbacks.model.stop_training:
      break
  callbacks.on_train_end()

  if model._compile_distribution:
    # Copy the weights back from the replicated model to the original model.
    with current_strategy.scope():
      _copy_weights_to_original_model(model, model._distributed_model_train,
                                      ModeKeys.TRAIN)
  scope.__exit__(None, None, None)
  return model.history
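When validation runs, experimental_test_loop may return either a scalar (a
single output, no metrics) or a list; the loop above normalizes both cases
before prefixing the labels with 'val_'. A minimal sketch with illustrative
values:

out_labels = ['loss', 'acc']
val_outs = [0.42, 0.91]            # what the test loop returned
if not isinstance(val_outs, list):
    val_outs = [val_outs]
epoch_logs = {}
for label, val_out in zip(out_labels, val_outs):
    epoch_logs['val_' + label] = val_out
print(epoch_logs)  # {'val_loss': 0.42, 'val_acc': 0.91}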
Example #34
def _experimental_test_loop(model, iterator, verbose=0, steps=None):
    """Test loop for evaluating with TPU DistributionStrategy.

    Arguments:
        model: Keras Model instance.
        iterator: Iterator for input data.
        verbose: Integer, Verbosity mode 0 or 1.
        steps: Total number of steps (batches of samples)
            before declaring predictions finished.
            Ignored with the default value of `None`.

    Returns:
        Scalar loss (if the model has a single output and no metrics)
        or list of scalars (if the model has multiple outputs
        and/or metrics). The attribute `model.metrics_names` will give you
        the display labels for the outputs.
    """
    current_strategy = model._distribution_strategy
    K.get_session().run(current_strategy.initialize())

    def _per_device_test_function(model):
        model._make_test_function()
        return (model.test_function.inputs, model.test_function.outputs,
                model.test_function.updates_op,
                model.test_function.session_kwargs)

    # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
    K.set_learning_phase(0)

    def step_fn(ctx, inputs, targets):
        """Clones the model and calls make_test_function."""
        # TODO(priyag, sourabhbajaj): The model gets cloned every time
        # fit/test/predict is called. We should look into caching this keyed on
        # input shapes.
        clone_model_on_towers(model,
                              current_strategy,
                              make_callback_model=False,
                              inputs=inputs,
                              targets=targets)

        (grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args) = current_strategy.call_for_each_tower(
             _per_device_test_function, model._grouped_model)

        (all_inputs, all_outputs, all_updates,
         all_session_args) = distributed_training_utils.unwrap_values(
             current_strategy, grouped_inputs, grouped_outputs,
             grouped_updates, grouped_session_args)

        combined_fn = K.Function(all_inputs,
                                 all_outputs,
                                 updates=all_updates,
                                 name='distributed_test_function',
                                 **all_session_args)

        for label, output in zip(model.metrics_names, combined_fn.outputs):
            if label == 'loss':
                aggregation = distribute_lib.get_loss_reduction()
            else:
                # We aggregate all other metrics using mean for now. This is a
                # temporary workaround until new metrics are in place.
                aggregation = variable_scope.VariableAggregation.MEAN
            ctx.set_last_step_output(label, output, aggregation)

        return combined_fn.updates_op

    # Add initial dummy values for loss and other metric tensors.
    initial_loop_values = {}
    initial_loop_values['loss'] = constant_op.constant(1e7)
    for name, tensor in zip(model.metrics_names[1:], model.metrics_tensors):
        initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

    with current_strategy.scope():
        # TODO(priyag): Use steps_per_run when we use new metrics as they will
        # allow handling metric computation at each step using variables.
        ctx = current_strategy.run_steps_on_dataset(
            step_fn,
            iterator,
            iterations=1,
            initial_loop_values=initial_loop_values)

    test_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    if verbose == 1:
        progbar = Progbar(target=steps)

    # Copy the weights from the original model to each of the replicated models.
    orig_model_weights = model.get_weights()
    with current_strategy.scope():
        distributed_model = current_strategy.unwrap(model._grouped_model)[0]
        distributed_training_utils.set_weights(current_strategy,
                                               distributed_model,
                                               orig_model_weights)

    assert steps is not None
    outs = [0.] * len(model.metrics_names)
    for step in range(steps):
        _, batch_outs = K.get_session().run([test_op, output_tensors])
        for i, label in enumerate(model.metrics_names):
            outs[i] += batch_outs[label]
        if verbose >= 1:
            progbar.update(step + 1)
    for i in range(len(outs)):
        outs[i] /= steps

    K.get_session().run(current_strategy.finalize())

    if len(outs) == 1:
        return outs[0]
    return outs
Example #35
def _init_session():
    import tensorflow as tf
    from tensorflow.python.keras import backend
    sess = backend.get_session()
    # Ensure a default graph exists; the returned graph is intentionally
    # unused here.
    tf.get_default_graph()
    backend.set_session(sess)
    return sess
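A minimal usage sketch for the helper above, assuming a TF 1.x graph-mode
environment where tensorflow is importable as tf:

import tensorflow as tf

sess = _init_session()
x = tf.constant(3.0)
print(sess.run(x * 2.0))  # 6.0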
Example #36
def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
    """Predict loop for predicting with TPU DistributionStrategy.

    Arguments:
        model: Keras Model instance.
        iterator: Iterator for input data.
        verbose: Integer, Verbosity mode 0 or 1.
        steps: Total number of steps (batches of samples)
            before declaring `_predict_loop` finished.
            Ignored with the default value of `None`.

    Returns:
        Array of predictions (if the model has a single output)
        or list of arrays of predictions
        (if the model has multiple outputs).
    """
    current_strategy = model._distribution_strategy
    K.get_session().run(current_strategy.initialize())

    # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
    K.set_learning_phase(0)

    def _per_device_predict_function(model):
        model._make_predict_function()
        return (model.predict_function.inputs, model.predict_function.outputs,
                model.predict_function.updates_op,
                model.predict_function.session_kwargs)

    def step_fn(ctx, inputs, targets):
        """Clones the model and calls make_predict_function."""

        # TODO(anjalisridhar): Support predict input correctly as it will not
        # contain targets, only inputs.
        del targets

        # TODO(priyag, sourabhbajaj): The model gets cloned every time
        # fit/test/predict is called. We should look into caching this keyed on
        # input shapes.
        clone_model_on_towers(model,
                              current_strategy,
                              make_callback_model=False,
                              inputs=inputs)

        (grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args) = current_strategy.call_for_each_tower(
             _per_device_predict_function, model._grouped_model)

        (all_inputs, all_outputs, all_updates,
         all_session_args) = distributed_training_utils.unwrap_values(
             current_strategy, grouped_inputs, grouped_outputs,
             grouped_updates, grouped_session_args)

        combined_fn = K.Function(all_inputs,
                                 all_outputs,
                                 updates=all_updates,
                                 name='distributed_predict_function',
                                 **all_session_args)

        for label, output in zip(model.output_names, combined_fn.outputs):
            ctx.set_last_step_output(label, output)

        return combined_fn.updates_op

    # Add initial dummy values for outputs.
    initial_loop_values = {}
    batch_dimension = distributed_training_utils.get_batch_dimension(iterator)
    for name, tensor in zip(model.output_names, model.outputs):
        # TODO(priyag): This is a workaround as we do not know the batch dimension
        # of the model's output at this point.
        shape = tensor_shape.TensorShape(tensor.shape.dims)
        shape.dims = [batch_dimension] + shape.dims[1:]
        initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype)

    with current_strategy.scope():
        # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
        ctx = current_strategy.run_steps_on_dataset(
            step_fn,
            iterator,
            iterations=1,
            initial_loop_values=initial_loop_values)

    predict_op = ctx.run_op
    output_tensors = ctx.last_step_outputs

    if verbose == 1:
        progbar = Progbar(target=steps)

    # Copy the weights from the original model to each of the replicated models.
    orig_model_weights = model.get_weights()
    with current_strategy.scope():
        distributed_model = current_strategy.unwrap(model._grouped_model)[0]
        distributed_training_utils.set_weights(current_strategy,
                                               distributed_model,
                                               orig_model_weights)

    assert steps is not None
    # Since we do not know how many samples we will see, we cannot pre-allocate
    # the returned Numpy arrays. Instead, we store one array per batch seen
    # and concatenate them upon returning.
    unconcatenated_outs = [[] for _ in model.outputs]
    for step in range(steps):
        _, batch_outs = K.get_session().run([predict_op, output_tensors])
        # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy.
        for i, label in enumerate(model.output_names):
            unconcatenated_outs[i].extend(batch_outs[label])
        if verbose >= 1:
            progbar.update(step + 1)

    K.get_session().run(current_strategy.finalize())

    if len(unconcatenated_outs) == 1:
        return np.concatenate(unconcatenated_outs[0], axis=0)
    return [
        np.concatenate(unconcatenated_outs[i], axis=0)
        for i in range(len(unconcatenated_outs))
    ]
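The per-batch collection at the end of the loop above can be sketched in
isolation: one buffer is kept per model output, each predict step adds that
step's batch, and everything is concatenated on the batch axis at the end
(simplified here to one appended array per step; shapes are illustrative):

import numpy as np

unconcatenated_outs = [[]]          # one buffer per model output
for _ in range(3):                  # three predict steps
    batch_predictions = np.zeros((8, 10))
    unconcatenated_outs[0].append(batch_predictions)
result = np.concatenate(unconcatenated_outs[0], axis=0)
print(result.shape)  # (24, 10)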
Example #37
                                    global_step_tensor,
                                    5000,
                                    0.975,
                                    staircase=True)
    tf.summary.scalar('Learning_Rate', lr)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    apply_recons_op, apply_soft_op, avg_loss_recons, avg_loss_soft = create_parallel_optimization(
        create_wnet, iterator, optimizer, num_classes)
    with tf.name_scope('Loss'):
        tf.summary.scalar('Reconstruction_Loss', avg_loss_recons)
        tf.summary.scalar('Soft_N_Cut_Loss', avg_loss_soft)
    merged = tf.summary.merge_all()
    saver = tf.train.Saver()
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    K_B.set_session(sess)
    with K_B.get_session() as sess:
        train_writer = tf.summary.FileWriter(logdir, sess.graph)

        init = tf.global_variables_initializer()
        sess.run(init)

        if exists(checkpt_dir):
            if tf.train.latest_checkpoint(checkpt_dir) is not None:
                tf.logging.info('Loading Checkpoint from ' +
                                tf.train.latest_checkpoint(checkpt_dir))
                saver.restore(sess, tf.train.latest_checkpoint(checkpt_dir))
        else:
            tf.logging.info('Training from Scratch -  No Checkpoint found')

        # img_lab = np.expand_dims(cv2.cvtColor(img, cv2.COLOR_BGR2LAB), axis=0)
        i = 0
Example #38
    def on_epoch_end(self, epoch, logs=None):
        """Checks if summary ops should run next epoch, logs scalar summaries."""

        # don't output batch_size and
        # batch number as TensorBoard summaries
        logs = {('epoch_' + k): v
                for k, v in logs.items()
                if k not in ['batch', 'size', 'num_steps']}
        if self.update_freq == 'epoch':
            step = epoch
        else:
            step = self._samples_seen
        self._write_custom_summaries(step, logs)

        # pop the histogram summary op after each epoch
        if self.histogram_freq:
            # pylint: disable=protected-access
            if self.merged in self.model.test_function.fetches:
                self.model.test_function.fetches.remove(self.merged)
            if self.merged in self.model.test_function.fetch_callbacks:
                self.model.test_function.fetch_callbacks.pop(self.merged)
            # pylint: enable=protected-access

        if self.embeddings_data is None and self.embeddings_freq:
            raise ValueError('To visualize embeddings, embeddings_data must '
                             'be provided.')

        if self.embeddings_freq and self.embeddings_data is not None:
            if epoch % self.embeddings_freq == 0:
                # We need a second forward-pass here because we're passing
                # the `embeddings_data` explicitly. This design allows passing
                # arbitrary data as `embeddings_data` and results from the fact
                # that we need to know the size of the `tf.Variable`s which
                # hold the embeddings in `set_model`. At this point, however,
                # the `validation_data` is not yet set.

                embeddings_data = self.embeddings_data
                n_samples = embeddings_data[0].shape[0]
                i = 0
                sess = K.get_session()
                while i < n_samples:
                    step = min(self.batch_size, n_samples - i)
                    batch = slice(i, i + step)

                    if isinstance(self.model.input, list):
                        feed_dict = {
                            model_input: embeddings_data[idx][batch]
                            for idx, model_input in enumerate(self.model.input)
                        }
                    else:
                        feed_dict = {
                            self.model.input: embeddings_data[0][batch]
                        }

                    feed_dict.update({self.batch_id: i, self.step: step})

                    if not isinstance(K.learning_phase(), int):
                        feed_dict[K.learning_phase()] = False

                    sess.run(self.assign_embeddings, feed_dict=feed_dict)
                    self.saver.save(
                        sess, os.path.join(self.log_dir,
                                           'keras_embedding.ckpt'), epoch)

                    i += self.batch_size
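The feed_dict construction in the embedding pass above handles multi-input
and single-input models differently: with a list of inputs it feeds one data
slice per input tensor. A standalone sketch, with plain strings standing in
for the input placeholders:

import numpy as np

embeddings_data = [np.arange(12).reshape(6, 2), np.arange(6).reshape(6, 1)]
model_inputs = ['input_a:0', 'input_b:0']   # stand-ins for tf placeholders
batch = slice(0, 3)
feed_dict = {
    model_input: embeddings_data[idx][batch]
    for idx, model_input in enumerate(model_inputs)
}
print({k: v.shape for k, v in feed_dict.items()})
# {'input_a:0': (3, 2), 'input_b:0': (3, 1)}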
Example #39
def _get_available_devices():
  return [x.name for x in K.get_session().list_devices()]
Example #40
def experimental_tpu_test_loop(model,
                               dataset,
                               verbose=0,
                               steps=None,
                               callbacks=None):
  """Test loop for evaluating with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, Verbosity mode 0 or 1.
      steps: Total number of steps (batches of samples)
          before declaring predictions finished.
          Ignored with the default value of `None`.
      callbacks: List of callbacks to be called during training

  Returns:
      Scalar loss (if the model has a single output and no metrics)
      or list of scalars (if the model has multiple outputs
      and/or metrics). The attribute `model.metrics_names` will give you
      the display labels for the outputs.
  """
  mode = ModeKeys.TEST
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=0)
  scope.__enter__()

  def _per_device_eval_function(model):
    model._make_eval_function()
    return (model._eval_function.inputs, model._eval_function.outputs,
            model._eval_function.updates_op,
            model._eval_function.session_kwargs)

  def step_fn(ctx, inputs):
    """Clones the model and calls make_eval_function."""
    inputs, targets = inputs
    if model._compile_distribution:
      distributed_training_utils.clone_model_on_replicas(
          model, current_strategy, mode=mode, inputs=inputs, targets=targets)
    else:
      distributed_training_utils._build_distributed_network(
          model, current_strategy, mode, inputs, targets)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_eval_function,
         args=(distributed_training_utils.get_distributed_model(
             model, ModeKeys.TEST),))

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.function(
        all_inputs, all_outputs,
        updates=all_updates,
        name='distributed_test_function',
        **all_session_args)

    for label, output in zip(model.metrics_names, combined_fn.outputs):
      if label == 'loss':
        reduce_op = ds_reduce_util.ReduceOp.SUM
      else:
        # We reduce all other metrics using mean for now. This is a temporary
        # workaround until new metrics are in place.
        reduce_op = ds_reduce_util.ReduceOp.MEAN
      ctx.set_last_step_output(label, output, reduce_op)

    return combined_fn.updates_op

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name in model.metrics_names[1:]:
    tensor = model._all_stateful_metrics_tensors[name]
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  # TODO(priyag): Use steps_per_run when we use new metrics as they will
  # allow handling metric computation at each step using variables.
  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=1,
      initial_loop_values=initial_loop_values)

  test_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  if verbose == 1:
    progbar = Progbar(target=steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  distributed_training_utils._reset_metrics(model)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=False,
      epochs=1,
      steps_per_epoch=steps,
      verbose=verbose,
      count_mode='steps',
      mode=ModeKeys.TEST)
  callbacks._call_begin_hook(mode)

  assert steps is not None
  outs = [0.] * len(model.metrics_names)
  for step in range(steps):
    batch_logs = {'batch': step, 'size': 1}
    callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
    _, batch_outs = K.get_session().run([test_op, output_tensors])
    for i, label in enumerate(model.metrics_names):
      if i == 0:
        # Loss is a stateless metric.
        outs[i] += batch_outs[label]
      else:
        # For all stateful metrics, the aggregation is handled by mirrored vars.
        outs[i] = batch_outs[label]

    batch_logs = cbks.make_logs(model, batch_logs, outs, mode)
    callbacks._call_batch_hook(mode, 'end', step, batch_logs)
    if verbose >= 1:
      progbar.update(step + 1)

  callbacks._call_end_hook(mode)

  scope.__exit__(None, None, None)
  if len(outs) > 0:
    outs[0] /= steps

  if len(outs) == 1:
    return outs[0]
  return outs
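Note the loss handling in this newer loop: per step the loss is SUM-reduced
across replicas, the per-step sums are accumulated, and the total is divided
by the number of steps at the end. A sketch with illustrative numbers:

per_step_replica_losses = [[0.4, 0.6], [0.3, 0.5]]  # 2 steps x 2 replicas
total = 0.
for replica_losses in per_step_replica_losses:
    total += sum(replica_losses)    # ReduceOp.SUM across replicas
print(total / len(per_step_replica_losses))  # 0.9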
def initialize_iterator(iterator, distribution_strategy):
  with distribution_strategy.scope():
    init_op = control_flow_ops.group(iterator.initialize())
    if not context.executing_eagerly():
      K.get_session((init_op,)).run(init_op)
Example #42
  def set_model(self, model):
    """Sets Keras model and creates summary ops."""

    self.model = model
    self.sess = K.get_session()
    # only make histogram summary op if it hasn't already been made
    if self.histogram_freq and self.merged is None:
      for layer in self.model.layers:
        for weight in layer.weights:
          mapped_weight_name = weight.name.replace(':', '_')
          tf_summary.histogram(mapped_weight_name, weight)
          if self.write_images:
            w_img = array_ops.squeeze(weight)
            shape = K.int_shape(w_img)
            if len(shape) == 2:  # dense layer kernel case
              if shape[0] > shape[1]:
                w_img = array_ops.transpose(w_img)
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
            elif len(shape) == 3:  # convnet case
              if K.image_data_format() == 'channels_last':
                # switch to channels_first to display
                # every kernel as a separate image
                w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img,
                                        [shape[0], shape[1], shape[2], 1])
            elif len(shape) == 1:  # bias case
              w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
            else:
              # not possible to handle 3D convnets etc.
              continue

            shape = K.int_shape(w_img)
            assert len(shape) == 4 and shape[-1] in [1, 3, 4]
            tf_summary.image(mapped_weight_name, w_img)

        if self.write_grads:
          for weight in layer.trainable_weights:
            mapped_weight_name = weight.name.replace(':', '_')
            grads = model.optimizer.get_gradients(model.total_loss, weight)

            def is_indexed_slices(grad):
              return type(grad).__name__ == 'IndexedSlices'

            grads = [grad.values if is_indexed_slices(grad) else grad
                     for grad in grads]
            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)

        if hasattr(layer, 'output'):
          if isinstance(layer.output, list):
            for i, output in enumerate(layer.output):
              tf_summary.histogram('{}_out_{}'.format(layer.name, i), output)
          else:
            tf_summary.histogram('{}_out'.format(layer.name), layer.output)
    self.merged = tf_summary.merge_all()

    if self.write_graph:
      self.writer = self._writer_class(self.log_dir, self.sess.graph)
    else:
      self.writer = self._writer_class(self.log_dir)

    # If both embedding_freq and embeddings_data are available, we will
    # visualize embeddings.
    if self.embeddings_freq and self.embeddings_data is not None:
      self.embeddings_data = standardize_input_data(self.embeddings_data,
                                                    model.input_names)

      # If embedding_layer_names are not provided, get all of the embedding
      # layers from the model.
      embeddings_layer_names = self.embeddings_layer_names
      if not embeddings_layer_names:
        embeddings_layer_names = [
            layer.name
            for layer in self.model.layers
            if type(layer).__name__ == 'Embedding'
        ]

      self.assign_embeddings = []
      embeddings_vars = {}

      self.batch_id = batch_id = array_ops.placeholder(dtypes.int32)
      self.step = step = array_ops.placeholder(dtypes.int32)

      for layer in self.model.layers:
        if layer.name in embeddings_layer_names:
          embedding_input = self.model.get_layer(layer.name).output
          embedding_size = np.prod(embedding_input.shape[1:])
          embedding_input = array_ops.reshape(embedding_input,
                                              (step, int(embedding_size)))
          shape = (self.embeddings_data[0].shape[0], int(embedding_size))
          embedding = variables.Variable(
              array_ops.zeros(shape), name=layer.name + '_embedding')
          embeddings_vars[layer.name] = embedding
          batch = state_ops.assign(embedding[batch_id:batch_id + step],
                                   embedding_input)
          self.assign_embeddings.append(batch)

      self.saver = saver.Saver(list(embeddings_vars.values()))

      # Create embeddings_metadata dictionary
      if isinstance(self.embeddings_metadata, str):
        embeddings_metadata = {
            layer_name: self.embeddings_metadata
            for layer_name in embeddings_vars.keys()
        }
      else:
        # If embedding_metadata is already a dictionary
        embeddings_metadata = self.embeddings_metadata

      try:
        from tensorboard.plugins import projector
      except ImportError:
        raise ImportError('Failed to import TensorBoard. Please make sure that '
                          'TensorBoard integration is complete.')

      # TODO(psv): Add integration tests to test embedding visualization
      # with TensorBoard callback. We are unable to write a unit test for this
      # because TensorBoard dependency assumes TensorFlow package is installed.
      config = projector.ProjectorConfig()
      for layer_name, tensor in embeddings_vars.items():
        embedding = config.embeddings.add()
        embedding.tensor_name = tensor.name

        if (embeddings_metadata is not None and
            layer_name in embeddings_metadata):
          embedding.metadata_path = embeddings_metadata[layer_name]

      projector.visualize_embeddings(self.writer, config)
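The assign ops built above fill a preallocated full-size embedding buffer
batch by batch via slice assignment. A standalone sketch of the same filling
pattern, with a numpy array standing in for the tf.Variable:

import numpy as np

n_samples, embedding_size, batch_size = 10, 4, 3
buffer = np.zeros((n_samples, embedding_size))
i = 0
while i < n_samples:
    step = min(batch_size, n_samples - i)
    buffer[i:i + step] = np.ones((step, embedding_size))  # fake batch
    i += batch_size
print(buffer.sum())  # 40.0 -- every row written exactly once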
def _experimental_fit_loop(
    model,
    iterator,
    epochs=100,
    verbose=1,
    callbacks=None,
    initial_epoch=0,
    steps_per_epoch=None,
    val_iterator=None,
    validation_steps=None):
  """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      iterator: Iterator that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Ignored with the default value of `None`.
      val_iterator: Iterator for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Ignored with the default value of `None`.

  Returns:
      Returns `None`.

  Raises:
      ValueError: in case of invalid arguments.
  """
  current_strategy = model._distribution_strategy

  K.get_session().run(current_strategy.initialize())

  def _per_device_fit_function(model):
    model._make_fit_function()
    return (model._fit_function.inputs, model._fit_function.outputs,
            model._fit_function.updates_op, model._fit_function.session_kwargs)

  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
  K.set_learning_phase(1)
  out_labels = model.metrics_names or []

  def step_fn(ctx, inputs, targets):
    """Clones the model and calls make_fit_function."""
    # TODO(priyag, sourabhbajaj): The model gets cloned every time
    # fit/test/predict is called. We should look into caching this keyed on
    # input shapes.
    clone_model_on_replicas(
        model,
        current_strategy,
        make_callback_model=True,
        inputs=inputs,
        targets=targets,
        mode=_Mode.TRAIN)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.call_for_each_replica(
         _per_device_fit_function, args=(model._grouped_model_train,))
    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs,
         grouped_updates, grouped_session_args)
    combined_fn = K.function(
        all_inputs,
        all_outputs,
        updates=all_updates,
        name='distributed_fit_function',
        **all_session_args)

    for label, output in zip(out_labels, combined_fn.outputs):
      if label == 'loss':
        aggregation = distribute_lib.get_loss_reduction()
      else:
        # We aggregate all other metrics using mean for now. This is a temporary
        # workaround until new metrics are in place.
        aggregation = variable_scope.VariableAggregation.MEAN
      ctx.set_last_step_output(label, output, aggregation)

    # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
    # feed_dict, session kwargs, run options, run_metadata for now. These should
    # be handled appropriately
    return combined_fn.updates_op

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name, tensor in zip(model.metrics_names[1:], model.metrics_tensors):
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  if steps_per_epoch is None:
    raise ValueError('`steps_per_epoch` should be specified when calling '
                     '`fit` on the model.')
  steps_per_run = K.variable(
      value=min(steps_per_epoch, current_strategy.steps_per_run),
      dtype='int32',
      name='steps_per_run')

  with current_strategy.scope():
    ctx = current_strategy.run_steps_on_dataset(
        step_fn, iterator, iterations=steps_per_run,
        initial_loop_values=initial_loop_values)

  train_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  do_validation = bool(validation_steps)

  # Copy the weights from the original model to each of the replicated models.
  orig_model_weights = model.get_weights()
  with current_strategy.scope():
    distributed_model = current_strategy.unwrap(model._grouped_model_train)[0]
    distributed_training_utils.set_weights(
        current_strategy, distributed_model, orig_model_weights)
  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      val_inputs=None,
      val_targets=None,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      verbose=verbose)

  # Calculate the steps each time on the device.
  steps_to_run = [current_strategy.steps_per_run] * (
      steps_per_epoch // current_strategy.steps_per_run)
  if steps_per_epoch % current_strategy.steps_per_run:
    steps_to_run.append(steps_per_epoch % current_strategy.steps_per_run)

  callbacks.on_train_begin()
  for epoch in range(initial_epoch, epochs):
    callbacks.on_epoch_begin(epoch)
    epoch_logs = {}
    step_index = 0
    prev_step_count = None
    for step_count in steps_to_run:
      batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
      callbacks.on_batch_begin(step_index, batch_logs)
      if prev_step_count is None or step_count != prev_step_count:
        steps_per_run.load(step_count, K.get_session())
        prev_step_count = step_count
      try:
        _, outputs = K.get_session().run([train_op, output_tensors])
      except errors.OutOfRangeError:
        logging.warning('Your dataset iterator ran out of data; '
                        'interrupting training. Make sure that your dataset '
                        'can generate at least `steps_per_epoch * epochs` '
                        'batches (in this case, %d batches).' %
                        (steps_per_epoch * epochs))
        break

      batch_logs.update(outputs)
      callbacks.on_batch_end(step_index, batch_logs)
      step_index = step_index + step_count
      if callbacks.model.stop_training:
        break

    if do_validation:
      logging.info('Running validation at fit epoch: %s', epoch)

      # Since we create a new clone from the original model we need to copy
      # the weights back to the original model before we can run validation.
      with current_strategy.scope():
        updated_weights = current_strategy.unwrap(
            model._grouped_model_train)[0].get_weights()
        model.set_weights(updated_weights)

      val_outs = _experimental_test_loop(
          model,
          val_iterator,
          steps=validation_steps,
          verbose=verbose,
          initialize_finalize_strategy=False)
      if not isinstance(val_outs, list):
        val_outs = [val_outs]
      # Same labels assumed.
      for label, val_out in zip(out_labels, val_outs):
        epoch_logs['val_' + label] = val_out

    callbacks.on_epoch_end(epoch, epoch_logs)
    if callbacks.model.stop_training:
      break
  callbacks.on_train_end()

  # Copy the weights back from the replicated model to the original model.
  with current_strategy.scope():
    updated_weights = current_strategy.unwrap(
        model._grouped_model_train)[0].get_weights()
    model.set_weights(updated_weights)

  K.get_session().run(current_strategy.finalize())
  return model.history
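The weight round-trip framing this loop is easy to miss: weights are copied
from the original model to the replicas before training and copied back
afterwards, so both stay in sync. A plain-Python sketch (FakeModel is ours,
standing in for Keras models):

class FakeModel(object):
    def __init__(self, weights):
        self._weights = list(weights)
    def get_weights(self):
        return list(self._weights)
    def set_weights(self, weights):
        self._weights = list(weights)

original = FakeModel([1.0, 2.0])
replica = FakeModel([0.0, 0.0])
replica.set_weights(original.get_weights())   # before training
replica._weights = [1.5, 2.5]                 # pretend training updated them
original.set_weights(replica.get_weights())   # after training
print(original.get_weights())  # [1.5, 2.5]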
Example #44
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
  """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset that returns inputs and targets
      epochs: Number of times to iterate over the data
      verbose: Integer, Verbosity mode, 0, 1 or 2
      callbacks: List of callbacks to be called during training
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Ignored with the default value of `None`.
      val_dataset: Dataset for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Ignored with the default value of `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
          `collections.Container` instance (e.g. list, tuple, etc.). If an
          integer, specifies how many training epochs to run before a new
          validation run is performed, e.g. `validation_freq=2` runs
          validation every 2 epochs. If a Container, specifies the epochs on
          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
          validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
      Returns `None`.

  Raises:
      ValueError: in case of invalid arguments.
  """
  mode = ModeKeys.TRAIN
  # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
  steps_per_epoch = training_utils.infer_steps_for_dataset(
      dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch')
  if (current_strategy.extended.steps_per_run != 1 and
      steps_per_epoch is None):
    raise ValueError('`steps_per_epoch` should be specified when calling '
                     '`fit` on the model with TPUStrategy when '
                     '`steps_per_run` != 1.')

  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=1)
  scope.__enter__()

  def _per_device_fit_function(model):
    model._make_fit_function()
    return (model._fit_function.inputs, model._fit_function.outputs,
            model._fit_function.updates_op, model._fit_function.session_kwargs)

  out_labels = model.metrics_names or []

  def step_fn(ctx, inputs):
    """Clones the model and calls make_fit_function."""
    inputs, targets = inputs
    if model._compile_distribution:
      distributed_training_utils.clone_model_on_replicas(
          model, current_strategy, mode, inputs=inputs, targets=targets)
    else:
      distributed_training_utils._build_distributed_network(
          model, current_strategy, mode, inputs, targets)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_fit_function,
         args=(distributed_training_utils.get_distributed_model(
             model, ModeKeys.TRAIN),))
    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs,
         grouped_updates, grouped_session_args)
    combined_fn = K.function(
        all_inputs,
        all_outputs,
        updates=all_updates,
        name='distributed_fit_function',
        **all_session_args)

    for label, output in zip(out_labels, combined_fn.outputs):
      if label == 'loss':
        reduce_op = ds_reduce_util.ReduceOp.SUM
      else:
        # We reduce all other metrics using mean for now. This is a temporary
        # workaround until new metrics are in place.
        reduce_op = ds_reduce_util.ReduceOp.MEAN
      ctx.set_last_step_output(label, output, reduce_op)

    # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
    # feed_dict, session kwargs, run options, run_metadata for now. These should
    # be handled appropriately
    return combined_fn.updates_op

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name in model.metrics_names[1:]:
    tensor = model._all_stateful_metrics_tensors[name]
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  use_steps = steps_per_epoch is not None
  if use_steps:
    iteration_value = min(steps_per_epoch,
                          current_strategy.extended.steps_per_run)
  else:
    iteration_value = current_strategy.extended.steps_per_run

  steps_per_run = K.variable(
      value=iteration_value,
      dtype='int32',
      name='steps_per_run')
  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=steps_per_run,
      initial_loop_values=initial_loop_values)
  train_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  do_validation = bool(validation_steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      verbose=verbose,
      count_mode='steps',
      mode=mode)

  # Calculate the steps each time on the device.
  if use_steps:
    steps_to_run = ([current_strategy.extended.steps_per_run] *
                    (steps_per_epoch //
                     current_strategy.extended.steps_per_run))
    if steps_per_epoch % current_strategy.extended.steps_per_run:
      steps_to_run.append(
          steps_per_epoch % current_strategy.extended.steps_per_run)
    target_steps = len(steps_to_run)
  else:
    target_steps = np.inf

  callbacks._call_begin_hook(mode)
  for epoch in range(initial_epoch, epochs):
    distributed_training_utils._reset_metrics(model)
    callbacks.on_epoch_begin(epoch)
    epoch_logs = {}
    step_index = 0
    prev_step_count = None
    current_step = 0
    while current_step < target_steps:
      step_count = steps_to_run[current_step] if use_steps else 1
      batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
      callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
      if prev_step_count is None or step_count != prev_step_count:
        steps_per_run.load(step_count, K.get_session())
        prev_step_count = step_count
      try:
        _, outputs = K.get_session().run([train_op, output_tensors])
      except errors.OutOfRangeError:
        if use_steps:
          logging.warning('Your dataset iterator ran out of data; '
                          'interrupting training. Make sure that your dataset '
                          'can generate at least `steps_per_epoch * epochs` '
                          'batches (in this case, %d batches).' %
                          (steps_per_epoch * epochs))
        else:
          target_steps = current_step
          logging.info('Dataset iterator ran out of data. Inferring the '
                       'value of `steps_per_epoch` as %s.' % target_steps)
          distributed_training_utils.initialize_iterator(iterator,
                                                         current_strategy)
        break

      batch_logs.update(outputs)
      callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
      step_index = step_index + step_count
      current_step += 1

      if callbacks.model.stop_training:
        break

    if (do_validation and
        training_utils.should_run_validation(validation_freq, epoch)):
      logging.info('Running validation at fit epoch: %s', epoch)

      if model._compile_distribution:
        # Since we create a new clone from the original model we need to copy
        # the weights back to the original model before we can run validation.
        distributed_training_utils._copy_weights_to_original_model(
            model, ModeKeys.TRAIN)

      val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
          model,
          val_dataset,
          steps=validation_steps,
          verbose=verbose,
          callbacks=callbacks)
      if not isinstance(val_outs, list):
        val_outs = [val_outs]
      # Same labels assumed.
      for label, val_out in zip(out_labels, val_outs):
        epoch_logs['val_' + label] = val_out

    callbacks.on_epoch_end(epoch, epoch_logs)
    if callbacks.model.stop_training:
      break
  callbacks._call_end_hook(mode)

  if model._compile_distribution:
    # Copy the weights back from the replicated model to the original model.
    distributed_training_utils._copy_weights_to_original_model(
        model, ModeKeys.TRAIN)
  scope.__exit__(None, None, None)
  return model.history
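
# A minimal standalone sketch (hypothetical numbers) of how the fit loop
# above chunks `steps_per_epoch` into device-side runs of size
# `steps_per_run`: with steps_per_epoch=10 and steps_per_run=4 the
# schedule is [4, 4, 2], i.e. three host-side iterations.
steps_per_epoch = 10
steps_per_run = 4
steps_to_run = [steps_per_run] * (steps_per_epoch // steps_per_run)
if steps_per_epoch % steps_per_run:
  steps_to_run.append(steps_per_epoch % steps_per_run)
assert steps_to_run == [4, 4, 2]
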
def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
  """Predict loop for predicting with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      iterator: Iterator for input data.
      verbose: Integer, Verbosity mode 0 or 1.
      steps: Total number of steps (batches of samples)
          before declaring `_predict_loop` finished.
          Ignored with the default value of `None`.

  Returns:
      Array of predictions (if the model has a single output)
      or list of arrays of predictions
      (if the model has multiple outputs).
  """
  current_strategy = model._distribution_strategy
  K.get_session().run(current_strategy.initialize())

  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
  K.set_learning_phase(0)

  def _per_device_predict_function(model):
    model._make_predict_function()
    return (model.predict_function.inputs,
            model.predict_function.outputs,
            model.predict_function.updates_op,
            model.predict_function.session_kwargs)

  def step_fn(ctx, *inputs):
    """Clones the model and calls make_predict_function."""

    # TODO(priyag, sourabhbajaj): The model gets cloned every time
    # fit/test/predict is called. We should look into caching this keyed on
    # input shapes.
    clone_model_on_replicas(
        model,
        current_strategy,
        make_callback_model=False,
        inputs=inputs,
        mode=_Mode.PREDICT)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.call_for_each_replica(
         _per_device_predict_function, args=(model._grouped_model_predict,))

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.function(
        all_inputs, all_outputs,
        updates=all_updates,
        name='distributed_predict_function',
        **all_session_args)

    for label, output in zip(model.output_names, combined_fn.outputs):
      ctx.set_last_step_output(label, output)

    return combined_fn.updates_op

  # Add initial dummy values for outputs.
  initial_loop_values = {}
  batch_dimension = distributed_training_utils.get_batch_dimension(iterator)
  for name, tensor in zip(model.output_names, model.outputs):
    # TODO(priyag): This is a workaround as we do not know the batch dimension
    # of the model's output at this point.
    shape = tensor_shape.TensorShape(tensor.shape.dims)
    shape.dims = [batch_dimension] + shape.dims[1:]
    initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype)

  with current_strategy.scope():
    # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
    ctx = current_strategy.run_steps_on_dataset(
        step_fn, iterator, iterations=1,
        initial_loop_values=initial_loop_values)

  predict_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  if verbose == 1:
    progbar = Progbar(target=steps)

  # Copy the weights from the original model to each of the replicated models.
  orig_model_weights = model.get_weights()
  with current_strategy.scope():
    distributed_model = current_strategy.unwrap(model._grouped_model_predict)[0]
    distributed_training_utils.set_weights(
        current_strategy, distributed_model, orig_model_weights)

  assert steps is not None
  # Since we do not know how many samples we will see, we cannot pre-allocate
  # the returned Numpy arrays. Instead, we store one array per batch seen
  # and concatenate them upon returning.
  unconcatenated_outs = [[] for _ in model.outputs]
  for step in range(steps):
    _, batch_outs = K.get_session().run([predict_op, output_tensors])
    # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy.
    for i, label in enumerate(model.output_names):
      unconcatenated_outs[i].extend(batch_outs[label])
    if verbose == 1:
      progbar.update(step + 1)

  K.get_session().run(current_strategy.finalize())

  if len(unconcatenated_outs) == 1:
    return np.concatenate(unconcatenated_outs[0], axis=0)
  return [
      np.concatenate(unconcatenated_outs[i], axis=0)
      for i in range(len(unconcatenated_outs))
  ]
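
# Sketch of the accumulate-then-concatenate pattern used above: since the
# total number of samples is unknown up front, each step's outputs are
# stored and concatenated once at the end (hypothetical shapes).
import numpy as np
per_step_outputs = [np.ones((2, 3)), np.zeros((2, 3))]  # two predict steps
collected = []
for batch in per_step_outputs:
  collected.append(batch)
predictions = np.concatenate(collected, axis=0)
assert predictions.shape == (4, 3)
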
Example #46
def experimental_tpu_test_loop(model,
                               dataset,
                               verbose=0,
                               steps=None,
                               callbacks=None):
  """Test loop for evaluating with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, Verbosity mode 0 or 1.
      steps: Total number of steps (batches of samples)
          before declaring evaluation finished.
          Ignored with the default value of `None`.
      callbacks: List of callbacks to be called during evaluation.

  Returns:
      Scalar loss (if the model has a single output and no metrics)
      or list of scalars (if the model has multiple outputs
      and/or metrics). The attribute `model.metrics_names` will give you
      the display labels for the outputs.
  """
  mode = ModeKeys.TEST
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset,
                                                     current_strategy)
  steps = training_utils.infer_steps_for_dataset(dataset, steps,
                                                 steps_name='steps')

  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=0)
  scope.__enter__()

  def _per_device_eval_function(model):
    model._make_eval_function()
    return (model._eval_function.inputs, model._eval_function.outputs,
            model._eval_function.updates_op,
            model._eval_function.session_kwargs)

  def step_fn(ctx, inputs):
    """Clones the model and calls make_eval_function."""
    inputs, targets = inputs
    if model._compile_distribution:
      distributed_training_utils.clone_model_on_replicas(
          model, current_strategy, mode=mode, inputs=inputs, targets=targets)
    else:
      distributed_training_utils._build_distributed_network(
          model, current_strategy, mode, inputs, targets)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_eval_function,
         args=(distributed_training_utils.get_distributed_model(
             model, ModeKeys.TEST),))

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.function(
        all_inputs, all_outputs,
        updates=all_updates,
        name='distributed_test_function',
        **all_session_args)

    for label, output in zip(model.metrics_names, combined_fn.outputs):
      if label == 'loss':
        reduce_op = ds_reduce_util.ReduceOp.SUM
      else:
        # We reduce all other metrics using mean for now. This is a temporary
        # workaround until new metrics are in place.
        reduce_op = ds_reduce_util.ReduceOp.MEAN
      ctx.set_last_step_output(label, output, reduce_op)

    return combined_fn.updates_op

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name in model.metrics_names[1:]:
    tensor = model._all_stateful_metrics_tensors[name]
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  # TODO(priyag): Use steps_per_run when we use new metrics as they will
  # allow handling metric computation at each step using variables.
  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=1,
      initial_loop_values=initial_loop_values)

  test_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  if verbose == 1:
    progbar = Progbar(target=steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  distributed_training_utils._reset_metrics(model)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=False,
      epochs=1,
      steps_per_epoch=steps,
      verbose=verbose,
      count_mode='steps',
      mode=ModeKeys.TEST)
  callbacks._call_begin_hook(mode)

  outs = [0.] * len(model.metrics_names)
  if steps is not None:
    target_steps = steps
  else:
    target_steps = np.inf

  current_step = 0
  while current_step < target_steps:
    batch_logs = {'batch': current_step, 'size': 1}
    callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs)
    try:
      _, batch_outs = K.get_session().run([test_op, output_tensors])
    except errors.OutOfRangeError:
      if steps is not None:
        warning_msg = ('Make sure that your dataset can generate at least '
                       '`steps` batches (in this case, {} batches).'.format(
                           steps))
      else:
        warning_msg = 'Number of steps ran: {} steps'.format(current_step)

      logging.warning('Your dataset iterator ran out of data; '
                      'interrupting evaluation. ' + warning_msg)
      target_steps = current_step
      break
    for i, label in enumerate(model.metrics_names):
      if i == 0:
        # Loss is a stateless metric.
        outs[i] += batch_outs[label]
      else:
        # For all stateful metrics, the aggregation is handled by mirrored vars.
        outs[i] = batch_outs[label]

    batch_logs = cbks.make_logs(model, batch_logs, outs, mode)
    callbacks._call_batch_hook(mode, 'end', current_step, batch_logs)
    if verbose == 1:
      progbar.update(current_step + 1)
    current_step += 1

  callbacks._call_end_hook(mode)

  scope.__exit__(None, None, None)
  if outs:
    outs[0] /= target_steps

  if len(outs) == 1:
    return outs[0]
  return outs
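
# Sketch of the loss aggregation in the test loop above: the per-step loss
# (already summed across replicas via ReduceOp.SUM) is accumulated over
# steps and averaged at the end by dividing by the steps actually run
# (hypothetical values).
per_step_losses = [0.9, 0.7, 0.5]
outs = [0.]
for loss in per_step_losses:
  outs[0] += loss
outs[0] /= len(per_step_losses)
assert abs(outs[0] - 0.7) < 1e-9
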
Example #47
def main(argv):
    logging.info('Building Keras ResNet-50 model')
    model = tf.keras.applications.resnet50.ResNet50(include_top=True,
                                                    weights=None,
                                                    input_tensor=None,
                                                    input_shape=None,
                                                    pooling=None,
                                                    classes=NUM_CLASSES)

    if FLAGS.use_tpu:
        logging.info('Converting from CPU to TPU model.')
        resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            tpu=FLAGS.tpu)
        strategy = tf.contrib.tpu.TPUDistributionStrategy(resolver)
        model = tf.contrib.tpu.keras_to_tpu_model(model, strategy=strategy)
        session_master = resolver.master()
    else:
        session_master = ''

    logging.info('Compiling model.')
    model.compile(optimizer=tf.keras.optimizers.SGD(lr=BASE_LEARNING_RATE,
                                                    momentum=0.9,
                                                    nesterov=True),
                  loss='sparse_categorical_crossentropy',
                  metrics=['sparse_categorical_accuracy'])

    callbacks = [LearningRateBatchScheduler(schedule=learning_rate_schedule)]
    if FLAGS.model_dir:
        callbacks.append(
            tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir))

    if FLAGS.data is None:
        training_images = np.random.randn(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE,
                                          3).astype(np.float32)
        training_labels = np.random.randint(NUM_CLASSES,
                                            size=BATCH_SIZE,
                                            dtype=np.int32)
        logging.info('Training model using synthetic data.')
        model.fit(training_images,
                  training_labels,
                  epochs=EPOCHS,
                  batch_size=BATCH_SIZE,
                  callbacks=callbacks)
        logging.info('Evaluating the model on synthetic data.')
        model.evaluate(training_images, training_labels, verbose=0)
    else:
        imagenet_train = imagenet_input.ImageNetInput(
            is_training=True,
            use_bfloat16=FLAGS.use_bfloat16,
            data_dir=FLAGS.data,
            per_core_batch_size=PER_CORE_BATCH_SIZE)
        logging.info('Training model using real data in directory "%s".',
                     FLAGS.data)
        model.fit(imagenet_train.input_fn,
                  epochs=EPOCHS,
                  steps_per_epoch=TRAINING_STEPS_PER_EPOCH,
                  callbacks=callbacks)

        logging.info('Evaluating the model on the validation dataset.')
        if FLAGS.eval_top_5_accuracy:
            logging.info('Evaluating top 1 and top 5 accuracy using a Python '
                         'generator.')
            # We feed the inputs from a Python generator, so we need to build a single
            # batch for all of the cores, which will be split on TPU.
            imagenet_eval = imagenet_input.ImageNetInput(
                is_training=False,
                use_bfloat16=FLAGS.use_bfloat16,
                data_dir=FLAGS.data,
                per_core_batch_size=BATCH_SIZE)
            score = eval_utils.multi_top_k_accuracy(
                model, imagenet_eval.evaluation_generator(K.get_session()),
                EVAL_STEPS)
        else:
            imagenet_eval = imagenet_input.ImageNetInput(
                is_training=False,
                use_bfloat16=FLAGS.use_bfloat16,
                data_dir=FLAGS.data,
                per_core_batch_size=PER_CORE_BATCH_SIZE)
            score = model.evaluate(imagenet_eval.input_fn,
                                   steps=EVAL_STEPS,
                                   verbose=1)
        print('Evaluation score', score)

        if HAS_H5PY:
            weights_file = os.path.join(
                FLAGS.model_dir if FLAGS.model_dir else '/tmp', WEIGHTS_TXT)
            logging.info('Save weights into %s', weights_file)
            model.save_weights(weights_file, overwrite=True)
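
# The snippet above assumes a `learning_rate_schedule` function passed to
# `LearningRateBatchScheduler`; neither is defined here. A plausible
# stand-in (hypothetical epoch boundaries and decay factors) could be:
BASE_LEARNING_RATE = 0.1  # assumed to match the constant used in compile()
def learning_rate_schedule(current_epoch, current_batch):
  # Step-decay the base rate by 10x at epochs 30 and 60 (hypothetical).
  if current_epoch < 30:
    return BASE_LEARNING_RATE
  if current_epoch < 60:
    return BASE_LEARNING_RATE * 0.1
  return BASE_LEARNING_RATE * 0.01
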
Example #48
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
  """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset that returns inputs and targets.
      epochs: Number of times to iterate over the data.
      verbose: Integer, Verbosity mode, 0, 1 or 2.
      callbacks: List of callbacks to be called during training.
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Ignored with the default value of `None`.
      val_dataset: Dataset for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Ignored with the default value of `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
          `collections.Container` instance (e.g. list, tuple, etc.). If an
          integer, specifies how many training epochs to run before a new
          validation run is performed, e.g. `validation_freq=2` runs
          validation every 2 epochs. If a Container, specifies the epochs on
          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
          validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
      The model's `History` object (`model.history`).

  Raises:
      ValueError: in case of invalid arguments.
  """
  mode = ModeKeys.TRAIN
  # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
  steps_per_epoch = training_utils.infer_steps_for_dataset(
      dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch')
  if (current_strategy.extended.steps_per_run != 1 and
      steps_per_epoch is None):
    raise ValueError('`steps_per_epoch` should be specified when calling '
                     '`fit` on the model with TPUStrategy when '
                     '`steps_per_run` != 1.')

  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=1)
  scope.__enter__()

  out_labels = model.metrics_names or []

  step_fn = _make_step_fn(model, ModeKeys.TRAIN, current_strategy, out_labels)

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name in model.metrics_names[1:]:
    tensor = model._all_stateful_metrics_tensors[name]
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  use_steps = steps_per_epoch is not None
  if use_steps:
    iteration_value = min(steps_per_epoch,
                          current_strategy.extended.steps_per_run)
  else:
    iteration_value = current_strategy.extended.steps_per_run

  steps_per_run = K.variable(
      value=iteration_value,
      dtype='int32',
      name='steps_per_run')
  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=steps_per_run,
      initial_loop_values=initial_loop_values)
  train_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  do_validation = bool(validation_steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      verbose=verbose,
      count_mode='steps',
      mode=mode)

  # Calculate the steps each time on the device.
  if use_steps:
    steps_to_run = ([current_strategy.extended.steps_per_run] *
                    (steps_per_epoch //
                     current_strategy.extended.steps_per_run))
    if steps_per_epoch % current_strategy.extended.steps_per_run:
      steps_to_run.append(
          steps_per_epoch % current_strategy.extended.steps_per_run)
    target_steps = len(steps_to_run)
  else:
    target_steps = np.inf

  callbacks._call_begin_hook(mode)
  for epoch in range(initial_epoch, epochs):
    distributed_training_utils._reset_metrics(model)
    callbacks.on_epoch_begin(epoch)
    epoch_logs = {}
    step_index = 0
    prev_step_count = None
    current_step = 0
    while current_step < target_steps:
      step_count = steps_to_run[current_step] if use_steps else 1
      batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
      callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
      if prev_step_count is None or step_count != prev_step_count:
        steps_per_run.load(step_count, K.get_session())
        prev_step_count = step_count
      try:
        _, outputs = K.batch_get_value([train_op, output_tensors])
      except errors.OutOfRangeError:
        if use_steps:
          logging.warning('Your dataset iterator ran out of data; '
                          'interrupting training. Make sure that your dataset '
                          'can generate at least `steps_per_epoch * epochs` '
                          'batches (in this case, %d batches).' %
                          (steps_per_epoch * epochs))
        else:
          target_steps = current_step
          logging.info('Dataset iterator ran out of data. Inferring the '
                       'value of `steps_per_epoch` as %s.' % target_steps)
          distributed_training_utils.initialize_iterator(iterator,
                                                         current_strategy)
        break

      batch_logs.update(outputs)
      callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
      step_index = step_index + step_count
      current_step += 1

      if callbacks.model.stop_training:
        break

    if (do_validation and
        training_utils.should_run_validation(validation_freq, epoch)):
      logging.info('Running validation at fit epoch: %s', epoch)

      if model._compile_distribution:
        # Since we create a new clone from the original model we need to copy
        # the weights back to the original model before we can run validation.
        distributed_training_utils._copy_weights_to_original_model(
            model, ModeKeys.TRAIN)

      val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
          model,
          val_dataset,
          steps=validation_steps,
          verbose=verbose,
          callbacks=callbacks)
      if not isinstance(val_outs, list):
        val_outs = [val_outs]
      # Same labels assumed.
      for label, val_out in zip(out_labels, val_outs):
        epoch_logs['val_' + label] = val_out

    callbacks.on_epoch_end(epoch, epoch_logs)
    if callbacks.model.stop_training:
      break
  callbacks._call_end_hook(mode)

  if model._compile_distribution:
    # Copy the weights back from the replicated model to the original model.
    distributed_training_utils._copy_weights_to_original_model(
        model, ModeKeys.TRAIN)
  scope.__exit__(None, None, None)
  return model.history
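
# Sketch of the `validation_freq` semantics described in the docstring
# above (a stand-in for `training_utils.should_run_validation`, assuming
# epochs are 0-indexed internally and 1-indexed in the public API):
def _should_run_validation(validation_freq, epoch):
  one_indexed = epoch + 1
  if isinstance(validation_freq, int):
    return one_indexed % validation_freq == 0
  return one_indexed in validation_freq

assert _should_run_validation(2, 1)           # every 2 epochs: 2nd epoch
assert not _should_run_validation(2, 0)
assert _should_run_validation([1, 2, 10], 0)  # explicit list: 1st epoch
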
Example #49
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
  """Fit loop for training with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset that returns inputs and targets.
      epochs: Number of times to iterate over the data.
      verbose: Integer, Verbosity mode, 0, 1 or 2.
      callbacks: List of callbacks to be called during training.
      initial_epoch: Epoch at which to start training
          (useful for resuming a previous training run)
      steps_per_epoch: Total number of steps (batches of samples)
          before declaring one epoch finished and starting the
          next epoch. Ignored with the default value of `None`.
      val_dataset: Dataset for validation data.
      validation_steps: Number of steps to run validation for
          (only if doing validation from data tensors).
          Ignored with the default value of `None`.
      validation_freq: Only relevant if validation data is provided. Integer or
          `collections.Container` instance (e.g. list, tuple, etc.). If an
          integer, specifies how many training epochs to run before a new
          validation run is performed, e.g. `validation_freq=2` runs
          validation every 2 epochs. If a Container, specifies the epochs on
          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
          validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
      The model's `History` object (`model.history`).

  Raises:
      ValueError: in case of invalid arguments.
  """
  mode = ModeKeys.TRAIN
  # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)

  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=1)
  scope.__enter__()

  def _per_device_fit_function(model):
    model._make_fit_function()
    return (model._fit_function.inputs, model._fit_function.outputs,
            model._fit_function.updates_op, model._fit_function.session_kwargs)

  out_labels = model.metrics_names or []

  def step_fn(ctx, inputs):
    """Clones the model and calls make_fit_function."""
    inputs, targets = inputs
    if model._compile_distribution:
      distributed_training_utils.clone_model_on_replicas(
          model, current_strategy, mode, inputs=inputs, targets=targets)
    else:
      distributed_training_utils._build_distributed_network(
          model, current_strategy, mode, inputs, targets)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_fit_function,
         args=(distributed_training_utils.get_distributed_model(
             model, ModeKeys.TRAIN),))
    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs,
         grouped_updates, grouped_session_args)
    combined_fn = K.function(
        all_inputs,
        all_outputs,
        updates=all_updates,
        name='distributed_fit_function',
        **all_session_args)

    for label, output in zip(out_labels, combined_fn.outputs):
      if label == 'loss':
        reduce_op = ds_reduce_util.ReduceOp.SUM
      else:
        # We reduce all other metrics using mean for now. This is a temporary
        # workaround until new metrics are in place.
        reduce_op = ds_reduce_util.ReduceOp.MEAN
      ctx.set_last_step_output(label, output, reduce_op)

    # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
    # feed_dict, session kwargs, run options, run_metadata for now. These should
    # be handled appropriately.
    return combined_fn.updates_op

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = constant_op.constant(1e7)
  for name in model.metrics_names[1:]:
    tensor = model._all_stateful_metrics_tensors[name]
    initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)

  if steps_per_epoch is None:
    raise ValueError('`steps_per_epoch` should be specified when calling '
                     '`fit` on the model.')
  steps_per_run = K.variable(
      value=min(steps_per_epoch, current_strategy.extended.steps_per_run),
      dtype='int32',
      name='steps_per_run')

  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=steps_per_run,
      initial_loop_values=initial_loop_values)

  train_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  do_validation = bool(validation_steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      verbose=verbose,
      count_mode='steps',
      mode=mode)

  # Calculate the steps each time on the device.
  steps_to_run = [current_strategy.extended.steps_per_run] * (
      steps_per_epoch // current_strategy.extended.steps_per_run)
  if steps_per_epoch % current_strategy.extended.steps_per_run:
    steps_to_run.append(
        steps_per_epoch % current_strategy.extended.steps_per_run)

  callbacks._call_begin_hook(mode)
  for epoch in range(initial_epoch, epochs):
    distributed_training_utils._reset_metrics(model)
    callbacks.on_epoch_begin(epoch)
    epoch_logs = {}
    step_index = 0
    prev_step_count = None
    for step_count in steps_to_run:
      batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
      callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
      if prev_step_count is None or step_count != prev_step_count:
        steps_per_run.load(step_count, K.get_session())
        prev_step_count = step_count
      try:
        _, outputs = K.get_session().run([train_op, output_tensors])
      except errors.OutOfRangeError:
        logging.warning('Your dataset iterator ran out of data; '
                        'interrupting training. Make sure that your dataset '
                        'can generate at least `steps_per_epoch * epochs` '
                        'batches (in this case, %d batches).' %
                        (steps_per_epoch * epochs))
        break

      batch_logs.update(outputs)
      callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
      step_index = step_index + step_count
      if callbacks.model.stop_training:
        break

    if (do_validation and
        training_utils.should_run_validation(validation_freq, epoch)):
      logging.info('Running validation at fit epoch: %s', epoch)

      if model._compile_distribution:
        # Since we create a new clone from the original model we need to copy
        # the weights back to the original model before we can run validation.
        distributed_training_utils._copy_weights_to_original_model(
            model, ModeKeys.TRAIN)

      val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
          model,
          val_dataset,
          steps=validation_steps,
          verbose=verbose,
          callbacks=callbacks)
      if not isinstance(val_outs, list):
        val_outs = [val_outs]
      # Same labels assumed.
      for label, val_out in zip(out_labels, val_outs):
        epoch_logs['val_' + label] = val_out

    callbacks.on_epoch_end(epoch, epoch_logs)
    if callbacks.model.stop_training:
      break
  callbacks._call_end_hook(mode)

  if model._compile_distribution:
    # Copy the weights back from the replicated model to the original model.
    distributed_training_utils._copy_weights_to_original_model(
        model, ModeKeys.TRAIN)
  scope.__exit__(None, None, None)
  return model.history
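
# Sketch of the "assign only on change" pattern used for `steps_per_run`
# above: the variable is reloaded only when the chunk size differs from
# the previous one, avoiding a redundant assignment per host-side step
# (the list below stands in for steps_per_run.load(...) calls).
prev_step_count = None
loads = []
for step_count in [4, 4, 2]:  # hypothetical schedule
  if prev_step_count is None or step_count != prev_step_count:
    loads.append(step_count)
    prev_step_count = step_count
assert loads == [4, 2]
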
Example #50
 def save_graph(self, path):
     session = K.get_session()
     tf.train.write_graph(session.graph, path, "model", as_text=True)
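
# Hypothetical usage of the method above: `tf.train.write_graph` writes
# the session graph to <path>/model as a human-readable pbtxt, e.g.
# obj.save_graph('/tmp/graphs') would produce /tmp/graphs/model.
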
Example #51
def experimental_tpu_predict_loop(model,
                                  dataset,
                                  verbose=0,
                                  steps=None,
                                  callbacks=None):
  """Predict loop for predicting with TPU DistributionStrategy.

  Arguments:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, Verbosity mode 0 or 1.
      steps: Total number of steps (batches of samples)
          before declaring `_predict_loop` finished.
          Ignored with the default value of `None`.
      callbacks: List of callbacks to be called during prediction.

  Returns:
      Array of predictions (if the model has a single output)
      or list of arrays of predictions
      (if the model has multiple outputs).
  """
  mode = ModeKeys.PREDICT
  dataset_fully_shaped = (distributed_training_utils.
                          is_dataset_shape_fully_defined(dataset))
  padding_handler = None
  if not dataset_fully_shaped:
    # TODO(hongjunchoi): Investigate whether operations from
    # PartialBatchPaddingHandler are unnecessarily pruned out
    # during graph optimization.
    padding_handler = padding_util.PartialBatchPaddingHandler(
        model._feed_output_shapes)
    batch_size, _, prefetch_buffer = input_lib._get_dataset_attributes(dataset)
    padding_handler.padded_batch_size = batch_size
    padding_handler.padding_mask = dataset.reduce(padding_handler.padding_mask,
                                                  padding_handler.update_mask)

    dataset = dataset.map(padding_handler.pad_batch)
    dataset = dataset.apply(batching.unbatch())
    # At this point, it is guaranteed that the dataset does not
    # have partial batches. Thus, we set `drop_remainder=True` to
    # get static shape information about the elements in the dataset.
    dataset = dataset.batch(batch_size, drop_remainder=True)

    if prefetch_buffer is not None:
      dataset = dataset.prefetch(prefetch_buffer)

  current_strategy = model._distribution_strategy
  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)

  scope = distributed_training_utils.distributed_scope(
      strategy=current_strategy, learning_phase=0)
  scope.__enter__()

  def _per_device_predict_function(model):
    model._make_predict_function()
    return (model.predict_function.inputs,
            model.predict_function.outputs,
            model.predict_function.updates_op,
            model.predict_function.session_kwargs)

  def step_fn(ctx, inputs):
    """Clones the model and calls make_predict_function."""
    if model._compile_distribution:
      distributed_training_utils.clone_model_on_replicas(
          model, current_strategy, mode, inputs=inputs)
    else:
      distributed_training_utils._build_distributed_network(
          model, current_strategy, mode, inputs)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = current_strategy.extended.call_for_each_replica(
         _per_device_predict_function,
         args=(distributed_training_utils.get_distributed_model(
             model, ModeKeys.PREDICT),))

    (all_inputs, all_outputs, all_updates,
     all_session_args) = distributed_training_utils.unwrap_values(
         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)

    combined_fn = K.function(
        all_inputs, all_outputs,
        updates=all_updates,
        name='distributed_predict_function',
        **all_session_args)

    for label, output in zip(model.output_names, combined_fn.outputs):
      ctx.set_last_step_output(label, output)

    return combined_fn.updates_op

  # Add initial dummy values for outputs.
  initial_loop_values = {}
  batch_dimension = distributed_training_utils.get_batch_dimension(iterator)
  for name, tensor in zip(model.output_names, model.outputs):
    # TODO(priyag): This is a workaround as we do not know the batch dimension
    # of the model's output at this point.
    shape = tensor_shape.TensorShape(tensor.shape.dims)
    shape.dims = [batch_dimension] + shape.dims[1:]
    initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype)

  # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=1,
      initial_loop_values=initial_loop_values)

  predict_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  if verbose == 1:
    progbar = Progbar(target=steps)

  if model._compile_distribution:
    distributed_training_utils._copy_weights_to_distributed_model(model, mode)

  distributed_training_utils._reset_metrics(model)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=False,
      epochs=1,
      steps_per_epoch=steps,
      verbose=verbose,
      count_mode='steps',
      mode=mode)
  callbacks._call_begin_hook(mode)

  assert steps is not None
  # Since we do not know how many samples we will see, we cannot pre-allocate
  # the returned Numpy arrays. Instead, we store one array per batch seen
  # and concatenate them upon returning.
  unconcatenated_outs = [[] for _ in model.outputs]
  for step in range(steps):
    batch_logs = {'batch': step, 'size': 1}
    callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
    _, batch_outs = K.get_session().run([predict_op, output_tensors])
    # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy.
    for i, label in enumerate(model.output_names):
      unconcatenated_outs[i].extend(batch_outs[label])
    batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
    callbacks._call_batch_hook(mode, 'end', step, batch_logs)
    if verbose == 1:
      progbar.update(step + 1)

  callbacks._call_end_hook(mode)

  scope.__exit__(None, None, None)

  if len(unconcatenated_outs) == 1:
    prediction_result = np.concatenate(unconcatenated_outs[0], axis=0)
  else:
    prediction_result = [
        np.concatenate(unconcatenated_outs[i], axis=0)
        for i in range(len(unconcatenated_outs))
    ]

  if padding_handler:
    prediction_result = padding_handler.apply_mask(prediction_result)

  return prediction_result
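
# Sketch of the partial-batch padding idea above: the last batch is padded
# to the full static batch size for the TPU, and the padded rows are
# masked out of the final result (hypothetical mask; the real logic lives
# in PartialBatchPaddingHandler.apply_mask).
import numpy as np
padded = np.arange(8).reshape(4, 2)         # 4 rows; the last is padding
mask = np.array([True, True, True, False])  # True marks real examples
result = padded[mask]
assert result.shape == (3, 2)
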
Example #52
def _get_var_for_numpy(distribution_strategy, input_array):
    """Creates a variable and assigns the value of the numpy array to it.

  Args:
    distribution_strategy: The DistributionStrategy used to compile the model.
    input_array: The input numpy array whose value will be assigned to the
      variable we create.

  Returns:
    The variable to which we will copy the value of the input numpy array.

  """
    with ops.device(get_cpu_device(distribution_strategy)):
        # Create and initialize a variable on the CPU device. This is the CPU
        # device of the host in the case of TPUDistributionStrategy.
        input_var = variables.VariableV1(array_ops.zeros(
            input_array.shape, input_array.dtype),
                                         trainable=False,
                                         use_resource=True)
    K.get_session().run(input_var.initializer)

    # Create a placeholder for the numpy array input slices. We copy the value
    # of the input numpy array to the variable in slices of size 64 MB to avoid
    # running into memory issues or RPC message limits.
    start_placeholder = array_ops.placeholder(dtypes.int64, ())
    end_placeholder = array_ops.placeholder(dtypes.int64, ())
    slice_placeholder = array_ops.placeholder(input_var.dtype)
    assign_slice_op = input_var[start_placeholder:end_placeholder].assign(
        slice_placeholder)

    # If each batch element is > 64 MB, then we copy each batch element
    # individually. Otherwise, the slices will be < 128 MB. There might be padding
    # which might mean that the slices are 128 MB even if the size of the
    # tensor allocated is less than 128 MB.
    # This formula gives slices with size:
    # ceil(64 MB / byte size per batch element) bytes.
    # Using ceil() guarantees we get a number >= 1.

    # Calculate the size of each batch element.
    byte_size_per_batch_element = np.prod(input_array.shape[1:]) * \
                                  input_var.dtype.size

    # Calculate number of elements we want to copy per slice.
    batch_size_per_slice = int(
        np.ceil((64 << 20) / byte_size_per_batch_element))

    # Copy slices of the above size starting at 0, except the last slice will be
    # smaller.
    start = 0
    limit = input_array.shape[0]
    while start < limit:
        end = min(start + batch_size_per_slice, limit)
        K.get_session().run(assign_slice_op,
                            feed_dict={
                                start_placeholder: start,
                                end_placeholder: end,
                                slice_placeholder: input_array[start:end]
                            })
        start = end

    return input_var
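
# Worked example of the 64 MB slice formula above (hypothetical shape):
# a float32 array of shape (1000, 1024, 1024) has batch elements of
# 1024 * 1024 * 4 bytes = 4 MiB each, so each slice copies
# ceil(64 MiB / 4 MiB) = 16 batch elements.
import numpy as np
byte_size_per_batch_element = np.prod([1024, 1024]) * 4
batch_size_per_slice = int(np.ceil((64 << 20) / byte_size_per_batch_element))
assert batch_size_per_slice == 16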