Example #1
def StartMultiplexerReloadingThread(multiplexer, path_to_run, load_interval):
  """Starts a thread to automatically reload the given multiplexer.

  The thread will reload the multiplexer by calling `ReloadMultiplexer` every
  `load_interval` seconds, starting immediately.

  Args:
    multiplexer: The `EventMultiplexer` to add runs to and reload.
    path_to_run: A dict mapping from paths to run names, where `None` as the run
      name is interpreted as a run name equal to the path.
    load_interval: How many seconds to wait after one load before starting the
      next load.

  Returns:
    A started `threading.Thread` that reloads the multiplexer.
  """
  # We don't call multiplexer.Reload() here because that would make
  # AddRunsFromDirectory block until the runs have all loaded.
  for path in path_to_run.keys():
    if gcs.IsGCSPath(path):
      gcs.CheckIsSupported()
      logging.info(
          'Assuming %s is intended to be a Google Cloud Storage path because '
          'it starts with %s. If it isn\'t, prefix it with \'/.\' (i.e., use '
          '/.%s instead)', path, gcs.PATH_PREFIX, path)

  def _ReloadForever():
    while True:
      ReloadMultiplexer(multiplexer, path_to_run)
      time.sleep(load_interval)

  thread = threading.Thread(target=_ReloadForever)
  thread.daemon = True
  thread.start()
  return thread
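A minimal sketch of the same daemon-reload pattern using only the standard library; the names `start_reloading_thread` and `reload_fn` are illustrative, not part of TensorBoard's API.

import threading
import time

def start_reloading_thread(reload_fn, interval_secs):
  """Calls `reload_fn` immediately and then every `interval_secs` seconds."""
  def _reload_forever():
    while True:
      reload_fn()                # do the periodic work first ...
      time.sleep(interval_secs)  # ... then sleep before the next pass
  thread = threading.Thread(target=_reload_forever)
  thread.daemon = True           # a daemon thread will not block process exit
  thread.start()
  return thread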
Example #2
 def __exit__(self, exc_type, exc_val, exc_tb):
   logging.info('Disabling worker watchdog.')
   self._worker_manager.configure(
       event_pb2.WorkerHeartbeatRequest(
           watchdog_config=event_pb2.WatchdogConfig(timeout_ms=-1,)))
   self._running = False
   self.join()
Example #3
  def evaluate(self, delay_secs=None):
    """Evaluate on the evaluation data.

    Runs evaluation on the evaluation data and returns the result. Runs for
    `self._eval_steps` steps, or, if it is `None`, until the input is
    exhausted or another exception is raised. Starts the evaluation after
    `delay_secs` seconds, or, if it is `None`, after `self._eval_delay_secs`
    seconds.

    Args:
      delay_secs: Start evaluating after this many seconds. If `None`, defaults
        to `self._eval_delay_secs`.

    Returns:
      The result of the `evaluate` call to the `Estimator`.
    """
    if delay_secs is None:
      delay_secs = self._eval_delay_secs

    if delay_secs:
      logging.info("Waiting %d secs before starting eval.", delay_secs)
      time.sleep(delay_secs)

    return self._call_evaluate(input_fn=self._eval_input_fn,
                               steps=self._eval_steps,
                               metrics=self._eval_metrics,
                               name="one_pass",
                               hooks=self._eval_hooks)
Example #4
  def testGeneratesStacktrace(self):
    if FLAGS.child:
      return

    # Subprocess sys.argv[0] with --child=True
    if sys.executable:
      child_process = subprocess.Popen(
          [sys.executable, sys.argv[0], '--child=True'], cwd=os.getcwd(),
          stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    else:
      child_process = subprocess.Popen(
          [sys.argv[0], '--child=True'], cwd=os.getcwd(),
          stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Capture its output: both stdout and stderr are captured and appended.
    # We are not worried about timing or order of messages in this test.
    child_stdout, child_stderr = child_process.communicate()
    child_output = child_stdout + child_stderr

    # Make sure the child process is dead before we proceed.
    child_process.wait()

    logging.info('Output from the child process:')
    logging.info(child_output)

    # Verify a stack trace is printed.
    self.assertIn(b'PyEval_EvalFrame', child_output)
Example #5
def add_gradients_summaries(grads_and_vars):
  """Add summaries to gradients.

  Args:
    grads_and_vars: A list of gradient to variable pairs (tuples).

  Returns:
    The list of created summaries.
  """
  summaries = []
  for grad, var in grads_and_vars:
    if grad is not None:
      if isinstance(grad, ops.IndexedSlices):
        grad_values = grad.values
      else:
        grad_values = grad
      summaries.append(
          summary.histogram(var.op.name + '_gradient', grad_values))
      summaries.append(
          summary.scalar(var.op.name + '_gradient_norm',
                         clip_ops.global_norm([grad_values])))
    else:
      logging.info('Var %s has no gradient', var.op.name)

  return summaries
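A hedged usage sketch, assuming the TF1-style `tf.train` optimizer API: `add_gradients_summaries` is handed the output of `compute_gradients`.

import tensorflow as tf  # assumes the TF1-style tf.train optimizer API

w = tf.Variable([1.0, 2.0], name='w')
loss = tf.reduce_sum(tf.square(w))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
grads_and_vars = optimizer.compute_gradients(loss)
summaries = add_gradients_summaries(grads_and_vars)  # histogram + norm per variable
train_op = optimizer.apply_gradients(grads_and_vars)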
Example #6
  def testCustomGradient(self):
    dtype = dtypes.float32

    @function.Defun(dtype, dtype, dtype)
    def XentLossGrad(logits, labels, dloss):
      dlogits = array_ops.reshape(dloss, [-1, 1]) * (
          nn_ops.softmax(logits) - labels)
      dlabels = array_ops.zeros_like(labels)
      # Takes exp(dlogits) to differentiate it from the "correct" gradient.
      return math_ops.exp(dlogits), dlabels

    @function.Defun(dtype, dtype, grad_func=XentLossGrad)
    def XentLoss(logits, labels):
      return math_ops.reduce_sum(labels * math_ops.log(nn_ops.softmax(logits)),
                                 1)

    g = ops.Graph()
    with g.as_default():
      logits = array_ops.placeholder(dtype)
      labels = array_ops.placeholder(dtype)
      loss = XentLoss(logits, labels)
      dlogits = gradients_impl.gradients([loss], [logits])

    x = np.random.uniform(-10., 10., size=(4, 9)).astype(np.float32)
    prob = np.exp(x) / np.sum(np.exp(x), 1, keepdims=1)
    y = np.random.uniform(-10., 10., size=(4, 9)).astype(np.float32)
    for cfg in _OptimizerOptions():
      tf_logging.info("cfg = %s", cfg)
      with session.Session(graph=g, config=cfg) as sess:
        out, = sess.run(dlogits, {logits: x, labels: y})
      self.assertAllClose(out, np.exp(prob - y))
Example #7
  def setUpClass(cls):
    gpu_memory_fraction_opt = (
        "--gpu_memory_fraction=%f" % cls.PER_PROC_GPU_MEMORY_FRACTION)

    worker_port = portpicker.pick_unused_port()
    cluster_spec = "worker|localhost:%d" % worker_port
    tf_logging.info("cluster_spec: %s", cluster_spec)

    server_bin = test.test_src_dir_path("python/debug/grpc_tensorflow_server")

    cls.server_target = "grpc://localhost:%d" % worker_port

    cls.server_procs = {}
    cls.server_procs["worker"] = subprocess.Popen(
        [
            server_bin,
            "--cluster_spec=%s" % cluster_spec,
            "--job_name=worker",
            "--task_id=0",
            gpu_memory_fraction_opt,
        ],
        stdout=sys.stdout,
        stderr=sys.stderr)

    # Start debug server in-process, on separate thread.
    (cls.debug_server_port, cls.debug_server_url, _, cls.debug_server_thread,
     cls.debug_server
    ) = grpc_debug_test_server.start_server_on_separate_thread(
        dump_to_filesystem=False)
    tf_logging.info("debug server url: %s", cls.debug_server_url)

    cls.session_config = config_pb2.ConfigProto(
        gpu_options=config_pb2.GPUOptions(
            per_process_gpu_memory_fraction=cls.PER_PROC_GPU_MEMORY_FRACTION))
Example #8
  def testUnrollLSTM(self):
    # Run one step of the unrolled lstm graph.
    def RunForward(mode, cfg=None):
      tf_logging.info("mode = %s", mode)
      g = ops.Graph()
      start = time.time()
      with g.as_default():
        weights = self._Weights()
        inp = self._Input()
        m = self._BuildForward(weights, inp, mode)
      gdef = g.as_graph_def()
      finish = time.time()
      tf_logging.info("time: %f txt size: %d gdef bin size: %d", finish - start,
                      len(str(gdef)), len(gdef.SerializeToString()))
      with g.as_default(), session.Session(config=cfg) as sess:
        return sess.run(m)

    mv0 = RunForward("complete")
    for cfg in _OptimizerOptions():
      tf_logging.info("cfg = %s", cfg)
      mv1 = RunForward("cell", cfg)
      mv2 = RunForward("loop", cfg)
      mv3 = RunForward("loop10", cfg)
      self.assertAllClose(mv0, mv1, rtol=1e-4)
      self.assertAllClose(mv0, mv2, rtol=1e-4)
      self.assertAllClose(mv0, mv3, rtol=1e-4)
Example #9
  def testUnrollLSTMGrad(self):
    # Run one step of the unrolled lstm graph.
    def RunForwardBackward(mode, cfg=None):
      tf_logging.info("mode = %s", mode)
      g = ops.Graph()
      start = time.time()
      with g.as_default():
        weights = self._Weights()
        inp = self._Input()
        m = self._BuildForward(weights, inp, mode)
        loss = math_ops.reduce_sum(math_ops.square(m))
        dw = gradients_impl.gradients([loss], [weights])
      gdef = g.as_graph_def()
      finish = time.time()
      tf_logging.info("time: %f txt size: %d gdef bin size: %d", finish - start,
                      len(str(gdef)), len(gdef.SerializeToString()))
      with g.as_default(), session.Session(config=cfg) as sess:
        return sess.run(dw)

    d0 = RunForwardBackward("complete")
    for cfg in _OptimizerOptions():
      tf_logging.info("cfg = %s", cfg)
      d1 = RunForwardBackward("cell", cfg)
      d2 = RunForwardBackward("loop", cfg)
      d3 = RunForwardBackward("loop10", cfg)
      self.assertAllClose(d0, d1, rtol=1e-4, atol=1e-4)
      self.assertAllClose(d0, d2, rtol=1e-4, atol=1e-4)
      self.assertAllClose(d0, d3, rtol=1e-4, atol=1e-4)
Example #10
  def build_greedy_training(self, state, network_states):
    """Extracts features and advances a batch using the oracle path.

    Args:
      state: MasterState from the 'AdvanceMaster' op that advances the
          underlying master to this component.
      network_states: dictionary of component NetworkState objects

    Returns:
      state handle: final state after advancing
      cost: regularization cost, possibly associated with embedding matrices
      correct: since no gold path is available, 0.
      total: since no gold path is available, 0.
    """
    logging.info('Building component: %s', self.spec.name)
    stride = state.current_batch_size * self.training_beam_size
    with tf.variable_scope(self.name, reuse=True):
      state.handle, fixed_embeddings = fetch_differentiable_fixed_embeddings(
          self, state, stride)

    linked_embeddings = [
        fetch_linked_embedding(self, network_states, spec)
        for spec in self.spec.linked_feature
    ]

    with tf.variable_scope(self.name, reuse=True):
      tensors = self.network.create(
          fixed_embeddings, linked_embeddings, None, None, True, stride=stride)
    update_network_states(self, tensors, network_states, stride)
    cost = self.add_regularizer(tf.constant(0.))

    correct, total = tf.constant(0), tf.constant(0)
    return state.handle, cost, correct, total
Example #11
  def _extract_feature_ids(self, state, network_states, during_training):
    """Extracts feature IDs and advances a batch using the oracle path.

    Args:
      state: MasterState from the 'AdvanceMaster' op that advances the
          underlying master to this component.
      network_states: Dictionary of component NetworkState objects.
      during_training: Whether the graph is being constructed during training.

    Returns:
      state handle: Final state after advancing.
    """
    logging.info('Building component: %s', self.spec.name)

    if during_training:
      stride = state.current_batch_size * self.training_beam_size
    else:
      stride = state.current_batch_size * self.inference_beam_size

    with tf.variable_scope(self.name, reuse=True):
      state.handle, ids = extract_fixed_feature_ids(self, state, stride)

    with tf.variable_scope(self.name, reuse=True):
      tensors = self.network.create(
          ids, [], None, None, during_training, stride=stride)
    update_network_states(self, tensors, network_states, stride)
    return state.handle
Example #12
  def __init__(self, model_dir=None, config=None):
    """Initializes a BaseEstimator instance.

    Args:
      model_dir: Directory to save model parameters, graph, etc. This can
        also be used to load checkpoints from the directory into an estimator to
        continue training a previously saved model.
      config: A RunConfig instance.
    """
    # Model directory.
    self._model_dir = model_dir
    if self._model_dir is None:
      self._model_dir = tempfile.mkdtemp()
      logging.warning('Using temporary folder as model directory: %s',
                      self._model_dir)

    # Create a run configuration
    if config is None:
      self._config = BaseEstimator._Config()
      logging.warning('Using default config.')
    else:
      self._config = config
    logging.info('Using config: %s', str(vars(self._config)))

    # Set device function depending if there are replicas or not.
    self._device_fn = _get_replica_device_setter(self._config)

    # Features and targets TensorSignature objects.
    # TODO(wicke): Rename these to something more descriptive
    self._features_info = None
    self._targets_info = None

    self._graph = None
Example #13
  def _infer_model_as_iterable(
      self, checkpoint_path, predictions, feed_fn, return_dict):
    if feed_fn is None:
      feed_dicts = itertools.repeat(None)
    else:
      def _feed_fn():
        while True:
          yield feed_fn()
      feed_dicts = _feed_fn()

    try:
      for output_batch in graph_actions.run_feeds_iter(
          output_dict=predictions,
          feed_dicts=feed_dicts,
          restore_checkpoint_path=checkpoint_path):
        # Unpack batches into individual predictions
        if return_dict:
          batch_length = list(output_batch.values())[0].shape[0]
          for i in range(batch_length):
            yield {key: value[i] for key, value in output_batch.items()}
        else:
          for pred in output_batch['predictions']:
            yield pred

    except errors.OutOfRangeError:
      # We fall out of the above loop naturally if feed_fn raises StopIteration,
      # or we catch an OutOfRangeError if we've reached the end of inputs.
      logging.info('Reached end of inputs for predict_iter.')
Example #14
def _write_dict_to_summary(output_dir,
                           dictionary,
                           current_global_step):
  """Writes a `dict` into summary file in given output directory.

  Args:
    output_dir: `str`, directory to write the summary file in.
    dictionary: the `dict` to be written to summary file.
    current_global_step: `int`, the current global step.
  """
  logging.info('Saving dict for global step %d: %s', current_global_step,
               _dict_to_str(dictionary))
  summary_writer = writer_cache.FileWriterCache.get(output_dir)
  summary_proto = summary_pb2.Summary()
  for key in dictionary:
    if dictionary[key] is None:
      continue
    if key == 'global_step':
      continue
    value = summary_proto.value.add()
    value.tag = key
    if (isinstance(dictionary[key], np.float32) or
        isinstance(dictionary[key], float)):
      value.simple_value = float(dictionary[key])
    elif (isinstance(dictionary[key], np.int64) or
          isinstance(dictionary[key], np.int32) or
          isinstance(dictionary[key], int)):
      value.simple_value = int(dictionary[key])
    else:
      logging.warn(
          'Skipping summary for %s, must be a float, np.float32, np.int64, '
          'np.int32 or int.',
          key)
  summary_writer.add_summary(summary_proto, current_global_step)
  summary_writer.flush()
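A hedged usage sketch: an evaluation loop would typically call this helper with its metric dict (the path and values below are illustrative).

eval_results = {'loss': 0.42, 'accuracy': 0.91, 'global_step': 1000}
_write_dict_to_summary(output_dir='/tmp/model/eval',
                       dictionary=eval_results,
                       current_global_step=1000)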
Example #15
  def _get_device_assignment(self):
    """Gets the (maybe cached) TPU device assignment."""
    master = self._get_master_address()
    device_assignment = self._lazy_device_assignment_dict.get(master)
    if device_assignment is not None:
      return device_assignment

    tpu_system_metadata = self._get_tpu_system_metadata()

    device_assignment = tpu_device_assignment.device_assignment(
        tpu_system_metadata.topology,
        computation_shape=self._computation_shape,
        num_replicas=self.num_replicas)

    logging.info('num_cores_per_replica: %s',
                 str(self._config.tpu_config.num_cores_per_replica))
    logging.info('computation_shape: %s', str(self._computation_shape))
    logging.info('num_replicas: %d', self.num_replicas)
    logging.info('device_assignment.topology.device_coordinates: %s',
                 str(device_assignment.topology.device_coordinates))
    logging.info('device_assignment.core_assignment: %s',
                 str(device_assignment.core_assignment))

    self._lazy_device_assignment_dict[master] = device_assignment
    return device_assignment
Example #16
def wait_for_new_checkpoint(checkpoint_dir,
                            last_checkpoint,
                            seconds_to_sleep=1,
                            timeout=None):
  """Waits until a new checkpoint file is found.

  Args:
    checkpoint_dir: The directory in which checkpoints are saved.
    last_checkpoint: The last checkpoint path used.
    seconds_to_sleep: The number of seconds to sleep for before looking for a
      new checkpoint.
    timeout: The maximum amount of time to wait. If left as `None`, then the
      process will wait indefinitely.

  Returns:
    A new checkpoint path, or `None` if the timeout was reached.
  """
  logging.info('Waiting for new checkpoint at %s', checkpoint_dir)
  stop_time = time.time() + timeout if timeout is not None else None
  while True:
    checkpoint_path = tf_saver.latest_checkpoint(checkpoint_dir)
    if checkpoint_path is None or checkpoint_path == last_checkpoint:
      if stop_time is not None and time.time() + seconds_to_sleep > stop_time:
        return None
      time.sleep(seconds_to_sleep)
    else:
      logging.info('Found new checkpoint at %s', checkpoint_path)
      return checkpoint_path
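A hedged sketch of the continuous-evaluation loop this helper is designed for; `evaluate_checkpoint` is a hypothetical callback.

last_checkpoint = None
while True:
  last_checkpoint = wait_for_new_checkpoint(
      '/tmp/model', last_checkpoint, timeout=600)
  if last_checkpoint is None:
    break                               # timed out; assume training finished
  evaluate_checkpoint(last_checkpoint)  # hypothetical evaluation callback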
Example #17
  def _restore_checkpoint(self,
                          master,
                          saver=None,
                          checkpoint_dir=None,
                          checkpoint_filename_with_path=None,
                          wait_for_checkpoint=False,
                          max_wait_secs=7200,
                          config=None):
    """Creates a `Session`, and tries to restore a checkpoint.


    Args:
      master: `String` representation of the TensorFlow master to use.
      saver: A `Saver` object used to restore a model.
      checkpoint_dir: Path to the checkpoint files. The latest checkpoint in the
        dir will be used to restore.
      checkpoint_filename_with_path: Full file name path to the checkpoint file.
      wait_for_checkpoint: Whether to wait for checkpoint to become available.
      max_wait_secs: Maximum time to wait for checkpoints to become available.
      config: Optional `ConfigProto` proto used to configure the session.

    Returns:
      A pair (sess, is_restored) where 'is_restored' is `True` if
      the session could be restored, `False` otherwise.

    Raises:
      ValueError: If both checkpoint_dir and checkpoint_filename_with_path are
        set.
    """
    self._target = master
    sess = session.Session(self._target, graph=self._graph, config=config)

    if checkpoint_dir and checkpoint_filename_with_path:
      raise ValueError("Can not provide both checkpoint_dir and "
                       "checkpoint_filename_with_path.")
    # If either saver or checkpoint_* is not specified, cannot restore. Just
    # return.
    if not saver or not (checkpoint_dir or checkpoint_filename_with_path):
      return sess, False

    if checkpoint_filename_with_path:
      saver.restore(sess, checkpoint_filename_with_path)
      return sess, True

    # Waits up until max_wait_secs for checkpoint to become available.
    wait_time = 0
    ckpt = checkpoint_management.get_checkpoint_state(checkpoint_dir)
    while not ckpt or not ckpt.model_checkpoint_path:
      if wait_for_checkpoint and wait_time < max_wait_secs:
        logging.info("Waiting for checkpoint to be available.")
        time.sleep(self._recovery_wait_secs)
        wait_time += self._recovery_wait_secs
        ckpt = checkpoint_management.get_checkpoint_state(checkpoint_dir)
      else:
        return sess, False

    # Loads the checkpoint.
    saver.restore(sess, ckpt.model_checkpoint_path)
    saver.recover_last_checkpoints(ckpt.all_model_checkpoint_paths)
    return sess, True
Example #18
 def fit(self, **kwargs):
   self.fake_checkpoint()
   tf_logging.info('fit called with args: %s' % kwargs)
   self.fit_count += 1
   if 'monitors' in kwargs:
     self.monitors = kwargs['monitors']
   return [(key, kwargs[key]) for key in sorted(kwargs.keys())]
Example #19
  def testEval(self):
    if not is_tensorrt_enabled():
      return
    model_dir = test.test_src_dir_path('python/compiler/tensorrt/test/testdata')

    accuracy_tf_native = self._Run(
        is_training=False,
        use_trt=False,
        batch_size=128,
        num_epochs=None,
        model_dir=model_dir)['accuracy']
    logging.info('accuracy_tf_native: %f', accuracy_tf_native)
    self.assertAllClose(0.9662, accuracy_tf_native, rtol=3e-3, atol=3e-3)

    if get_linked_tensorrt_version()[0] < 5:
      return

    accuracy_tf_trt = self._Run(
        is_training=False,
        use_trt=True,
        batch_size=128,
        num_epochs=None,
        model_dir=model_dir)['accuracy']
    logging.info('accuracy_tf_trt: %f', accuracy_tf_trt)
    self.assertAllClose(0.9675, accuracy_tf_trt, rtol=1e-3, atol=1e-3)
Example #20
def _maybe_save_assets(assets_collection_to_add=None):
  """Saves assets to the meta graph.

  Args:
    assets_collection_to_add: The collection where the asset paths are set up.

  Returns:
    The list of filepaths to the assets in the assets collection.

  Raises:
    ValueError: Indicating an invalid filepath tensor.
  """
  asset_source_filepath_list = []

  if assets_collection_to_add is None:
    tf_logging.info("No assets to save.")
    return asset_source_filepath_list

  # Iterate over the supplied asset collection, build the `AssetFile` proto
  # and add them to the collection with key `constants.ASSETS_KEY`, in the
  # graph.
  for asset_tensor in assets_collection_to_add:
    asset_source_filepath = _asset_path_from_tensor(asset_tensor)
    if not asset_source_filepath:
      raise ValueError("Invalid asset filepath tensor %s" % asset_tensor)

    asset_source_filename = os.path.basename(asset_source_filepath)

    # Build `AssetFile` proto and add it to the asset collection in the graph.
    _add_asset_to_collection(asset_source_filename, asset_tensor)

    asset_source_filepath_list.append(asset_source_filepath)

  tf_logging.info("Assets added to graph.")
  return asset_source_filepath_list
Example #21
  def _save_and_write_assets(self, assets_collection_to_add=None):
    """Saves asset to the meta graph and writes asset files to disk.

    Args:
      assets_collection_to_add: The collection where the asset paths are set up.
    """
    asset_source_filepath_list = _maybe_save_assets(assets_collection_to_add)

    # Return if there are no assets to write.
    if not asset_source_filepath_list:
      tf_logging.info("No assets to write.")
      return

    assets_destination_dir = os.path.join(
        compat.as_bytes(self._export_dir),
        compat.as_bytes(constants.ASSETS_DIRECTORY))

    if not file_io.file_exists(assets_destination_dir):
      file_io.recursive_create_dir(assets_destination_dir)

    # Copy each asset from source path to destination path.
    for asset_source_filepath in asset_source_filepath_list:
      asset_source_filename = os.path.basename(asset_source_filepath)

      asset_destination_filepath = os.path.join(
          compat.as_bytes(assets_destination_dir),
          compat.as_bytes(asset_source_filename))

      # Only copy the asset file to the destination if it does not already
      # exist. This is to ensure that an asset with the same name defined as
      # part of multiple graphs is only copied the first time.
      if not file_io.file_exists(asset_destination_filepath):
        file_io.copy(asset_source_filepath, asset_destination_filepath)

    tf_logging.info("Assets written to: %s", assets_destination_dir)
Example #22
  def __init__(self,
               checkpoint_dir,
               save_secs=None,
               save_steps=None,
               saver=None,
               checkpoint_basename="model.ckpt",
               scaffold=None):
    """Initialize CheckpointSaverHook monitor.

    Args:
      checkpoint_dir: `str`, base directory for the checkpoint files.
      save_secs: `int`, save every N secs.
      save_steps: `int`, save every N steps.
      saver: `Saver` object, used for saving.
      checkpoint_basename: `str`, base name for the checkpoint files.
      scaffold: `Scaffold`, use to get saver object.

    Raises:
      ValueError: If neither or both of `save_steps` and `save_secs` are set;
        exactly one must be provided.
    """
    logging.info("Create CheckpointSaverHook.")
    self._saver = saver
    self._checkpoint_dir = checkpoint_dir
    self._summary_writer = SummaryWriterCache.get(checkpoint_dir)
    self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
    self._scaffold = scaffold
    self._save_secs = save_secs
    self._save_steps = save_steps
    self._last_saved_time = None
    self._last_saved_step = None

    if save_steps is None and save_secs is None:
      raise ValueError("Either save_steps or save_secs should be provided")
    if (save_steps is not None) and (save_secs is not None):
      raise ValueError("Can not provide both save_steps and save_secs.")
Example #23
  def train(self, delay_secs=None):
    """Fit the estimator using the training data.

    Train the estimator for `self._train_steps` steps, after waiting for
    `delay_secs` seconds. If `self._train_steps` is `None`, train forever.

    Args:
      delay_secs: Start training after this many seconds.

    Returns:
      The trained estimator.
    """
    if delay_secs is None:
      task_id = 0
      if hasattr(FLAGS, "task"):
        task_id = FLAGS.task
      delay_secs = min(60, task_id*5)

    if delay_secs:
      logging.info("Waiting %d secs before starting training.", delay_secs)
      time.sleep(delay_secs)

    return self._estimator.fit(input_fn=self._train_input_fn,
                               max_steps=self._train_steps,
                               monitors=self._train_monitors)
Example #24
 def add_saver(self):
   """Adds a Saver for all variables in the graph."""
   logging.info('Generating op to save variables:\n\t%s',
                '\n\t'.join([x.name for x in tf.global_variables()]))
   self.saver = tf.train.Saver(
       var_list=[x for x in tf.global_variables()],
       write_version=saver_pb2.SaverDef.V1)
Example #25
def _restore_from_checkpoint(session, graph, checkpoint_path, saver=None):
  logging.info('Loading model from checkpoint: %s.', checkpoint_path)
  saver = saver or _make_saver(graph)
  if saver:
    saver.restore(session, checkpoint_path)
  else:
    logging.info('No variables found in graph, not creating Saver() object.')
Example #26
 def __init__(
     self,
     params,
     device_assigner=None,
     variables=None,
     tree_variables_class=TreeTrainingVariables,
     tree_graphs=None,
     training=True,
     t_ops=training_ops,
     i_ops=inference_ops,
 ):
     self.params = params
     self.device_assigner = device_assigner or RandomForestDeviceAssigner()
     logging.info("Constructing forest with params = ")
     logging.info(self.params.__dict__)
     self.variables = variables or ForestTrainingVariables(
         self.params,
         device_assigner=self.device_assigner,
         training=training,
         tree_variables_class=tree_variables_class,
     )
     tree_graph_class = tree_graphs or RandomTreeGraphs
     self.trees = [
         tree_graph_class(self.variables[i], self.params, t_ops.Load(), i_ops.Load(), i)
         for i in range(self.params.num_trees)
     ]
Example #27
def _get_model_dir(tf_config, model_dir):
  """Returns `model_dir` based user provided `tf_config` or `model_dir`."""
  # pylint: disable=g-explicit-bool-comparison

  # Empty string is treated as False in Python condition check, which triggers
  # some confusing error messages. For example, 'a or b' returns None if a is ''
  # and b is None. `None` is allowed for model_dir but '' is not allowed. Here,
  # explicitly check empty string to provide clear error message.
  if model_dir == '':
    raise ValueError('model_dir should be non-empty.')

  model_dir_in_tf_config = tf_config.get('model_dir')
  if model_dir_in_tf_config == '':
    raise ValueError('model_dir in TF_CONFIG should be non-empty.')

  if model_dir_in_tf_config:
    if model_dir and model_dir_in_tf_config != model_dir:
      raise ValueError(
          '`model_dir` provided in RunConfig construct, if set, '
          'must have the same value as the model_dir in TF_CONFIG. '
          'model_dir: {}\nTF_CONFIG["model_dir"]: {}.\n'.format(
              model_dir, model_dir_in_tf_config))

    logging.info('Using model_dir in TF_CONFIG: %s', model_dir_in_tf_config)

  return model_dir or model_dir_in_tf_config
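A hedged sketch of how this helper is typically fed: `TF_CONFIG` is the JSON environment variable used across the Estimator ecosystem.

import json
import os

tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
model_dir = _get_model_dir(tf_config, model_dir=None)  # may come from TF_CONFIG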
Example #28
  def RunTraining(self, hyperparam_config):
    master_spec = self.LoadSpec('master_spec_link.textproto')

    self.assertTrue(isinstance(hyperparam_config, spec_pb2.GridPoint))
    gold_doc = sentence_pb2.Sentence()
    text_format.Parse(_DUMMY_GOLD_SENTENCE, gold_doc)
    gold_doc_2 = sentence_pb2.Sentence()
    text_format.Parse(_DUMMY_GOLD_SENTENCE_2, gold_doc_2)
    reader_strings = [
        gold_doc.SerializeToString(), gold_doc_2.SerializeToString()
    ]
    tf.logging.info('Generating graph with config: %s', hyperparam_config)
    with tf.Graph().as_default():
      builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)

      target = spec_pb2.TrainTarget()
      target.name = 'testTraining-all'
      train = builder.add_training_from_config(target)
      with self.test_session() as sess:
        logging.info('Initializing')
        sess.run(tf.global_variables_initializer())

        # Run one iteration of training and verify nothing crashes.
        logging.info('Training')
        sess.run(train['run'], feed_dict={train['input_batch']: reader_strings})
Example #29
  def _initialize_local(self, num_gpus_per_worker):
    """Initialize internal devices for local training."""
    self._worker_device = "/job:localhost"
    # Define compute devices, a list of device strings with one entry per
    # replica. When there are GPUs, replicate operations on these GPUs.
    # Otherwise, place operations on the CPU.
    if num_gpus_per_worker > 0:
      self._compute_devices = list(
          map("/device:GPU:{}".format, range(num_gpus_per_worker)))
    else:
      self._compute_devices = [_LOCAL_CPU]

    self._compute_devices = list(
        map(device_util.resolve, self._compute_devices))
    self._canonical_compute_device_set = set(self._compute_devices)

    # If there is only one GPU, put everything on that GPU. Otherwise, place
    # variables on CPU.
    if num_gpus_per_worker == 1:
      assert len(list(self._compute_devices)) == 1
      self._variable_device = _LOCAL_GPU_0
      self._parameter_devices = [_LOCAL_GPU_0]
    else:
      self._variable_device = _LOCAL_CPU
      self._parameter_devices = [_LOCAL_CPU]

    self._is_chief = True
    self._cluster_spec = None
    self._task_type = None
    self._task_id = None

    logging.info(
        "ParameterServerStrategy with compute_devices = %r, "
        "variable_device = %r", self._compute_devices, self._variable_device)
Example #30
  def save(self, as_text=False):
    """Writes a `SavedModel` protocol buffer to disk.

    The function writes the SavedModel protocol buffer to the export directory
    in serialized format.

    Args:
      as_text: Writes the SavedModel protocol buffer in text format to disk.

    Returns:
      The path to which the SavedModel protocol buffer was written.
    """
    if not file_io.file_exists(self._export_dir):
      file_io.recursive_create_dir(self._export_dir)

    if as_text:
      path = os.path.join(
          compat.as_bytes(self._export_dir),
          compat.as_bytes(constants.SAVED_MODEL_FILENAME_PBTXT))
      file_io.write_string_to_file(path, str(self._saved_model))
    else:
      path = os.path.join(
          compat.as_bytes(self._export_dir),
          compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB))
      file_io.write_string_to_file(path, self._saved_model.SerializeToString())
    tf_logging.info("SavedModel written to: %s", path)

    return path
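A hedged usage sketch of the TF1 `SavedModelBuilder` flow that ends in the `save` call above; the export directory and tag are illustrative.

builder = tf.saved_model.builder.SavedModelBuilder('/tmp/export')
with tf.Session(graph=tf.Graph()) as sess:
  # ... build the model in this graph and initialize its variables ...
  builder.add_meta_graph_and_variables(
      sess, [tf.saved_model.tag_constants.SERVING])
path = builder.save(as_text=True)  # writes saved_model.pbtxt under /tmp/export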
Example #31
  def __init__(self, model_fn, model_dir=None, config=None, params=None,
               warm_start_from=None):
    """Constructs an `Estimator` instance.

    See @{$estimators} for more information. To warm-start an `Estimator`:

    ```python
    estimator = tf.estimator.DNNClassifier(
        feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
        hidden_units=[1024, 512, 256],
        warm_start_from="/path/to/checkpoint/dir")
    ```

    For more details on warm-start configuration, see
    @{tf.estimator.WarmStartSettings$WarmStartSettings}.

    Args:
      model_fn: Model function. Follows the signature:

        * Args:

          * `features`: This is the first item returned from the `input_fn`
                 passed to `train`, `evaluate`, and `predict`. This should be a
                 single `Tensor` or `dict` of same.
          * `labels`: This is the second item returned from the `input_fn`
                 passed to `train`, `evaluate`, and `predict`. This should be a
                 single `Tensor` or `dict` of same (for multi-head models). If
                 mode is `ModeKeys.PREDICT`, `labels=None` will be passed. If
                 the `model_fn`'s signature does not accept `mode`, the
                 `model_fn` must still be able to handle `labels=None`.
          * `mode`: Optional. Specifies if this is training, evaluation or
                 prediction. See `ModeKeys`.
          * `params`: Optional `dict` of hyperparameters.  Will receive what
                 is passed to Estimator in the `params` parameter. This allows
                 Estimators to be configured from hyperparameter tuning.
          * `config`: Optional configuration object. Will receive what is passed
                 to Estimator in `config` parameter, or the default `config`.
                 Allows updating things in your model_fn based on configuration
                 such as `num_ps_replicas`, or `model_dir`.

        * Returns:
          `EstimatorSpec`

      model_dir: Directory to save model parameters, graph, etc. This can
        also be used to load checkpoints from the directory into an estimator to
        continue training a previously saved model. If `PathLike` object, the
        path will be resolved. If `None`, the model_dir in `config` will be used
        if set. If both are set, they must be same. If both are `None`, a
        temporary directory will be used.
      config: Configuration object.
      params: `dict` of hyper parameters that will be passed into `model_fn`.
              Keys are names of parameters, values are basic python types.
      warm_start_from: Optional string filepath to a checkpoint to warm-start
                       from, or a `tf.estimator.WarmStartSettings` object to
                       fully configure warm-starting.  If the string filepath is
                       provided instead of a `WarmStartSettings`, then all
                       variables are warm-started, and it is assumed that
                       vocabularies and Tensor names are unchanged.

    Raises:
      RuntimeError: If eager execution is enabled.
      ValueError: parameters of `model_fn` don't match `params`.
      ValueError: if this is called via a subclass and if that class overrides
        a member of `Estimator`.
    """
    if context.in_eager_mode():
      raise RuntimeError(
          'Estimators are not supported when eager execution is enabled.')

    Estimator._assert_members_are_not_overridden(self)

    if config is None:
      self._config = run_config.RunConfig()
      logging.info('Using default config.')
    else:
      if not isinstance(config, run_config.RunConfig):
        raise ValueError(
            'config must be an instance of RunConfig, but provided %s.' %
            config)
      self._config = config

    # Model directory.
    model_dir = compat_internal.path_to_str(model_dir)
    if (model_dir is not None) and (self._config.model_dir is not None):
      if model_dir != self._config.model_dir:
        # TODO(alanyee): remove this suppression after it is no longer needed
        # pylint: disable=g-doc-exception
        raise ValueError(
            "model_dir are set both in constructor and RunConfig, but with "
            "different values. In constructor: '{}', in RunConfig: "
            "'{}' ".format(model_dir, self._config.model_dir))
        # pylint: enable=g-doc-exception

    self._model_dir = model_dir or self._config.model_dir
    if self._model_dir is None:
      self._model_dir = tempfile.mkdtemp()
      logging.warning('Using temporary folder as model directory: %s',
                      self._model_dir)
    if self._config.model_dir is None:
      self._config = self._config.replace(model_dir=self._model_dir)
    logging.info('Using config: %s', str(vars(self._config)))

    if self._config.session_config is None:
      self._session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    else:
      self._session_config = self._config.session_config

    self._device_fn = _get_replica_device_setter(self._config)

    if model_fn is None:
      raise ValueError('model_fn must be provided to Estimator.')
    _verify_model_fn_args(model_fn, params)
    self._model_fn = model_fn
    self._params = copy.deepcopy(params or {})

    # pylint: disable=protected-access
    self._warm_start_settings = (
        warm_starting_util._get_default_warm_start_settings(warm_start_from))
Example #32
    def __init__(self,
                 sequence_feature_columns,
                 context_feature_columns=None,
                 units=None,
                 cell_type=USE_DEFAULT,
                 rnn_cell_fn=None,
                 return_sequences=False,
                 model_dir=None,
                 n_classes=2,
                 weight_column=None,
                 label_vocabulary=None,
                 optimizer='Adagrad',
                 loss_reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                 sequence_mask='sequence_mask',
                 config=None):
        """Initializes a `RNNClassifier` instance.

    Args:
      sequence_feature_columns: An iterable containing the `FeatureColumn`s
        that represent sequential input. All items in the set should either be
        sequence columns (e.g. `sequence_numeric_column`) or constructed from
        one (e.g. `embedding_column` with `sequence_categorical_column_*` as
        input).
      context_feature_columns: An iterable containing the `FeatureColumn`s
        for contextual input. The data represented by these columns will be
        replicated and given to the RNN at each timestep. These columns must be
        instances of classes derived from `DenseColumn` such as
        `numeric_column`, not the sequential variants.
      units: Iterable of integer number of hidden units per RNN layer. If
        set, `cell_type` must also be specified and `rnn_cell_fn` must be
        `None`.
      cell_type: A class producing a RNN cell or a string specifying the cell
        type. Supported strings are: `'simple_rnn'`, `'lstm'`, and `'gru'`. If
        set, `units` must also be specified and `rnn_cell_fn` must be `None`.
      rnn_cell_fn: A function that returns a RNN cell instance that will be used
        to construct the RNN. If set, `units` and `cell_type` cannot be set.
        This is for advanced users who need additional customization beyond
        `units` and `cell_type`. Note that `tf.keras.layers.StackedRNNCells` is
        needed for stacked RNNs.
      return_sequences: A boolean indicating whether to return the last output
        in the output sequence, or the full sequence. Note that if True,
        `weight_column` must be None or a string.
      model_dir: Directory to save model parameters, graph, etc. This can
        also be used to load checkpoints from the directory into an estimator to
        continue training a previously saved model.
      n_classes: Number of label classes. Defaults to 2, namely binary
        classification. Must be > 1.
      weight_column: A string or a `NumericColumn` created by
        `tf.feature_column.numeric_column` defining feature column representing
        weights. It is used to down weight or boost examples during training. It
        will be multiplied by the loss of the example. If it is a string, it is
        used as a key to fetch weight tensor from the `features`. If it is a
        `NumericColumn`, raw tensor is fetched by key `weight_column.key`, then
        weight_column.normalizer_fn is applied on it to get weight tensor.
      label_vocabulary: A list of strings representing possible label values.
        If given, labels must be string type and have any value in
        `label_vocabulary`. If it is not given, that means labels are
        already encoded as integer or float within [0, 1] for `n_classes=2` and
        encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
        Also there will be errors if vocabulary is not provided and labels are
        string.
      optimizer: An instance of `tf.Optimizer` or string specifying optimizer
        type. Defaults to Adagrad optimizer.
      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
        to reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`.
      sequence_mask: A string with the name of the sequence mask tensor. If
        `sequence_mask` is in the features dictionary, the provided tensor is
        used, otherwise the sequence mask is computed from the length of
        sequential features. The sequence mask is used in evaluation and
        training mode to aggregate loss and metrics computation while excluding
        padding steps. It is also added to the predictions dictionary in
        prediction mode to indicate which steps are padding.
      config: `RunConfig` object to configure the runtime settings.

    Note that an RNN cell has:
      - a `call` method.
      - a `state_size` attribute.
      - an `output_size` attribute.
      - a `get_initial_state` method.
    See the documentation on `tf.keras.layers.RNN` for more details.

    Raises:
      ValueError: If `units`, `cell_type`, and `rnn_cell_fn` are not
        compatible.
    """
        rnn_layer_fn = _make_rnn_layer_fn(rnn_cell_fn,
                                          units,
                                          cell_type,
                                          return_sequences=return_sequences)

        if n_classes == 2:
            head = binary_head_lib.BinaryClassHead(
                weight_column=weight_column,
                label_vocabulary=label_vocabulary,
                loss_reduction=loss_reduction)
        else:
            head = multi_head_lib.MultiClassHead(
                n_classes=n_classes,
                weight_column=weight_column,
                label_vocabulary=label_vocabulary,
                loss_reduction=loss_reduction)

        if return_sequences:
            logging.info(
                'Converting head to sequential head with '
                '`SequentialHeadWrapper` to allow sequential predictions.')
            head = seq_head_lib.SequentialHeadWrapper(
                head,
                sequence_length_mask=sequence_mask,
                feature_columns=weight_column)

        def _model_fn(features, labels, mode, config):
            del config  # Unused.
            return _rnn_model_fn(
                features=features,
                labels=labels,
                mode=mode,
                head=head,
                rnn_layer_fn=rnn_layer_fn,
                sequence_feature_columns=tuple(sequence_feature_columns or []),
                context_feature_columns=tuple(context_feature_columns or []),
                return_sequences=return_sequences,
                optimizer=optimizer)

        super(RNNClassifier, self).__init__(model_fn=_model_fn,
                                            model_dir=model_dir,
                                            config=config)
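A hedged usage sketch, assuming the `tf.feature_column` sequence-column API; `my_train_input_fn` is a hypothetical input function that yields a 'tokens' feature.

token_ids = tf.feature_column.sequence_categorical_column_with_hash_bucket(
    'tokens', hash_bucket_size=1000)
token_emb = tf.feature_column.embedding_column(token_ids, dimension=16)
classifier = RNNClassifier(sequence_feature_columns=[token_emb],
                           units=[32], cell_type='lstm')
classifier.train(input_fn=my_train_input_fn, steps=100)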
Example #33
    def RunTest(self, run_params):
        should_run, reason_for_skipping = self.ShouldRunTest(run_params)
        if not should_run:
            return self.skipTest(reason_for_skipping)

        saved_model_dir = self._MakeSavedModel(run_params)

        np.random.seed(12345)  # Fix the seed so the test is deterministic.
        inputs_data = []
        input_specs = self._GetParamsCached().input_specs
        for dim_list in self._GetParamsCached().input_dims:
            assert len(input_specs) == len(dim_list)
            current_input_data = []
            for spec, np_shape in zip(input_specs, dim_list):
                np_dtype = spec.dtype.as_numpy_dtype()
                # Multiply the input by some constant to avoid all zeros input for
                # integer types.
                scale = 10.0 if np.issubdtype(np_dtype, np.integer) else 1.0
                # TODO(laigd): add debug options. E.g. we can set the input data to be
                # continuous natural numbers:
                # seq = np.arange(np.prod(np_shape))
                # seq.resize(np_shape)
                # current_input_data.append(scale * seq.astype(np_dtype))
                data = (scale *
                        np.random.random_sample(np_shape)).astype(np_dtype)
                if run_params.is_v2:
                    with ops.device("/GPU:0"):
                        data = ops.convert_to_tensor(data)
                current_input_data.append(data)
            inputs_data.append(current_input_data)

        # Verify original graph.
        self._VerifyGraphDef(run_params, saved_model_dir, saved_model_dir,
                             GraphState.ORIGINAL)

        # Run original graph without trt to get reference result.
        config_no_trt = self._GetConfigProto(run_params, GraphState.ORIGINAL)
        logging.info("Running original graph w/o trt, config:\n%s",
                     str(config_no_trt))
        ref_result = self._RunGraph(run_params, saved_model_dir, inputs_data,
                                    config_no_trt, GraphState.ORIGINAL)

        # Run calibration if necessary.
        if IsQuantizationWithCalibration(run_params):
            infer_saved_model_dir = self._GetCalibratedInferGraph(
                run_params, saved_model_dir, inputs_data)
            self._VerifyGraphDef(run_params, saved_model_dir,
                                 infer_saved_model_dir, GraphState.INFERENCE)
        elif not run_params.convert_online:
            infer_saved_model_dir = self._GetInferGraph(
                run_params, saved_model_dir)
            self._VerifyGraphDef(run_params, saved_model_dir,
                                 infer_saved_model_dir, GraphState.INFERENCE)
        else:
            infer_saved_model_dir = saved_model_dir

        # Run inference.
        infer_config = self._GetConfigProto(run_params, GraphState.INFERENCE)
        logging.info("Running final inference graph, config:\n%s",
                     str(infer_config))
        result = self._RunGraph(run_params, infer_saved_model_dir, inputs_data,
                                infer_config, GraphState.INFERENCE)
        self.assertAllClose(ref_result,
                            result,
                            atol=self.ExpectedAbsoluteTolerance(run_params),
                            rtol=self.ExpectedRelativeTolerance(run_params))
Example #34
 def stop_heartbeat():
     logging.info('Stopping the heartbeat thread')
     _heartbeat_timer.set()
     # Give the threads some time to clean up.
     time.sleep(max(period // 10, 2))
Example #35
def start(period: int) -> threading.Event:
    """Starts a persistent thread exchanging heartbeats between workers.

  Args:
    period: Heartbeat interval in seconds. Heartbeat timeout is set to the
      larger of `period` - 10 and 2s.

  Returns:
    A threading.Event object. Users can choose to call its set() method to shut
    down the heartbeat service gracefully. This isn't necessary in most cases,
    because the heartbeat service automatically shuts down at successful program
    exit through atexit handlers. But in situations when atexit handlers are not
    invoked, such as when multiprocessing processes exit in tests, users can
    manually request a shutdown.
  """
    global _heartbeat_timer
    if _heartbeat_timer is not None:
        logging.warning(
            'A heartbeat thread is already running, skipping this one.')
        return _heartbeat_timer

    task_id = api.client_id()
    num_tasks = api.num_clients()

    # Worker 0 generates a random token. All other workers receive that token.
    if task_id == 0:
        token = np.random.randint(0,
                                  pow(2, 16) - 1)  # reserve the other 16 bits
        signal = np.full([num_tasks], token, dtype=np.int32)
    else:
        signal = np.zeros([num_tasks], dtype=np.int32)
    logging.info('Initial heartbeat signal: %s', signal)

    device = tf_device.DeviceSpec(job=api.job_name(),
                                  replica=0,
                                  task=task_id,
                                  device_type='CPU',
                                  device_index=0)
    # Always use 0 for group and instance keys to reduce unnecessary
    # collective hangs and simplify failure analysis. This also avoids
    # collision with normal collectives.
    with ops.device(device):
        signal = all_reduce(constant_op.constant(signal),
                            group_size=num_tasks,
                            group_key=0,
                            instance_key=0,
                            timeout=max(period - 10, 2)).numpy()
    logging.info('Merged heartbeat signal %s', signal)

    # The merged signal should have equal elements. If not, some worker(s) may be
    # out of sync, and we should terminate all workers.
    if task_id == 0:
        if not np.all(signal == token):
            logging.fatal('Merged heartbeat signal has value != %d', token)
    else:
        if len(set(signal)) != 1:
            logging.fatal('Merged heartbeat signal has unequal elements')
        token = signal[0]

    # On normal main process exit, set the timer to stop the heartbeat thread.
    _heartbeat_timer = threading.Event()

    def stop_heartbeat():
        logging.info('Stopping the heartbeat thread')
        _heartbeat_timer.set()
        # Give the threads some time to clean up.
        time.sleep(max(period // 10, 2))

    atexit.register(stop_heartbeat)

    # Start the persistent heartbeat thread.
    thread = threading.Thread(
        target=_heartbeat,
        args=[period, _heartbeat_timer, token, num_tasks, task_id, device],
        daemon=True)
    thread.start()

    return _heartbeat_timer
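A hedged usage sketch: the returned `threading.Event` doubles as a shutdown handle.

timer = start(period=30)  # begin exchanging heartbeats every 30 seconds
# ... run the training program ...
timer.set()               # optional: explicitly request a graceful shutdown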
Example #36
  def _train_model(self, input_fn, hooks, saving_listeners):
    worker_hooks = []
    with ops.Graph().as_default() as g, g.device(self._device_fn):
      random_seed.set_random_seed(self._config.tf_random_seed)
      global_step_tensor = self._create_and_assert_global_step(g)
      training_util._get_or_create_global_step_read()  # pylint: disable=protected-access
      features, labels, input_hooks = (
          self._get_features_and_labels_from_input_fn(
              input_fn, model_fn_lib.ModeKeys.TRAIN))
      worker_hooks.extend(input_hooks)
      estimator_spec = self._call_model_fn(
          features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)

      if self._warm_start_settings:
        logging.info('Warm-starting with WarmStartSettings: %s' %
                     (self._warm_start_settings,))
        # pylint: disable=protected-access
        warm_starting_util._warm_start(self._warm_start_settings)
        # pylint: enable=protected-access
      # Check if the user created a loss summary, and add one if they didn't.
      # We assume here that the summary is called 'loss'. If it is not, we will
      # make another one with the name 'loss' to ensure it shows up in the right
      # graph in TensorBoard.
      if not any([x.op.name == 'loss'
                  for x in ops.get_collection(ops.GraphKeys.SUMMARIES)]):
        summary.scalar('loss', estimator_spec.loss)
      ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)
      worker_hooks.extend(hooks)
      worker_hooks.extend([
          training.NanTensorHook(estimator_spec.loss),
          training.LoggingTensorHook(
              {
                  'loss': estimator_spec.loss,
                  'step': global_step_tensor
              },
              every_n_iter=100)
      ])
      worker_hooks.extend(estimator_spec.training_hooks)

      if not (estimator_spec.scaffold.saver or
              ops.get_collection(ops.GraphKeys.SAVERS)):
        ops.add_to_collection(
            ops.GraphKeys.SAVERS,
            training.Saver(
                sharded=True,
                max_to_keep=self._config.keep_checkpoint_max,
                keep_checkpoint_every_n_hours=(
                    self._config.keep_checkpoint_every_n_hours),
                defer_build=True,
                save_relative_paths=True))

      chief_hooks = []
      all_hooks = worker_hooks + list(estimator_spec.training_chief_hooks)
      saver_hooks = [
          h for h in all_hooks if isinstance(h, training.CheckpointSaverHook)]
      if (self._config.save_checkpoints_secs or
          self._config.save_checkpoints_steps):
        if not saver_hooks:
          chief_hooks = [
              training.CheckpointSaverHook(
                  self._model_dir,
                  save_secs=self._config.save_checkpoints_secs,
                  save_steps=self._config.save_checkpoints_steps,
                  scaffold=estimator_spec.scaffold)
          ]
          saver_hooks = [chief_hooks[0]]
      if saving_listeners:
        if not saver_hooks:
          raise ValueError(
              'There should be a CheckpointSaverHook to use saving_listeners. '
              'Please set one of the RunConfig.save_checkpoints_steps or '
              'RunConfig.save_checkpoints_secs.')
        else:
          # It is expected to have one CheckpointSaverHook. If multiple, we pick
          # up the first one to add listener.
          saver_hooks[0]._listeners.extend(saving_listeners)  # pylint: disable=protected-access
      with training.MonitoredTrainingSession(
          master=self._config.master,
          is_chief=self._config.is_chief,
          checkpoint_dir=self._model_dir,
          scaffold=estimator_spec.scaffold,
          hooks=worker_hooks,
          chief_only_hooks=(
              tuple(chief_hooks) + tuple(estimator_spec.training_chief_hooks)),
          save_checkpoint_secs=0,  # Saving is handled by a hook.
          save_summaries_steps=self._config.save_summary_steps,
          config=self._session_config,
          log_step_count_steps=self._config.log_step_count_steps) as mon_sess:
        loss = None
        while not mon_sess.should_stop():
          _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
      return loss
Example #37
  def train(self,
            input_fn,
            hooks=None,
            steps=None,
            max_steps=None,
            saving_listeners=None):
    """Trains a model given training data input_fn.

    Args:
      input_fn: A function that provides input data for training as minibatches.
        See @{$get_started/premade_estimators#create_input_functions} for more
        information. The function should construct and return one of
        the following:

          * A `tf.data.Dataset` object: Outputs of `Dataset` object must be a
            tuple (features, labels) with same constraints as below.
          * A tuple (features, labels): Where features is a `Tensor` or a
            dictionary of string feature name to `Tensor` and labels is a
            `Tensor` or a dictionary of string label name to `Tensor`. Both
            features and labels are consumed by `model_fn`. They should satisfy
            the expectation of `model_fn` from inputs.

      hooks: List of `SessionRunHook` subclass instances. Used for callbacks
        inside the training loop.
      steps: Number of steps for which to train the model. If `None`, train
        forever or until `input_fn` generates the `OutOfRange` error or
        `StopIteration` exception. `steps` works incrementally: if you call
        `train(steps=10)` twice, training occurs for 20 steps in total. If
        `OutOfRange` or `StopIteration` occurs in the middle, training stops
        before 20 steps. If you don't want incremental behavior, set
        `max_steps` instead. If set, `max_steps` must be `None`.
      max_steps: Number of total steps for which to train model. If `None`,
        train forever or train until input_fn generates the `OutOfRange` error
        or `StopIteration` exception. If set, `steps` must be `None`. If
        `OutOfRange` or `StopIteration` occurs in the middle, training stops
        before `max_steps` steps.
        Two calls to `train(steps=100)` means 200 training
        iterations. On the other hand, two calls to `train(max_steps=100)` means
        that the second call will not do any iteration since first call did
        all 100 steps.
      saving_listeners: list of `CheckpointSaverListener` objects. Used for
        callbacks that run immediately before or after checkpoint savings.

    Returns:
      `self`, for chaining.

    Raises:
      ValueError: If both `steps` and `max_steps` are not `None`.
      ValueError: If either `steps` or `max_steps` is <= 0.
    """
    if (steps is not None) and (max_steps is not None):
      raise ValueError('Can not provide both steps and max_steps.')
    if steps is not None and steps <= 0:
      raise ValueError('Must specify steps > 0, given: {}'.format(steps))
    if max_steps is not None and max_steps <= 0:
      raise ValueError(
          'Must specify max_steps > 0, given: {}'.format(max_steps))

    if max_steps is not None:
      start_step = _load_global_step_from_checkpoint_dir(self._model_dir)
      if max_steps <= start_step:
        logging.info(
            'Skipping training since max_steps has already been reached.')
        return self

    hooks = _check_hooks_type(hooks)
    hooks.extend(self._convert_train_steps_to_hooks(steps, max_steps))

    saving_listeners = _check_listeners_type(saving_listeners)
    loss = self._train_model(input_fn, hooks, saving_listeners)
    logging.info('Loss for final step: %s.', loss)
    return self
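
A minimal usage sketch of the incremental `steps` behavior described above, using the canned `tf.estimator.LinearRegressor`; `my_input_fn` and its toy data are assumptions made for illustration.

import tensorflow as tf

def my_input_fn():  # hypothetical input_fn returning a (features, labels) Dataset
  features = {'x': [[1.0], [2.0], [3.0], [4.0]]}
  labels = [[2.0], [4.0], [6.0], [8.0]]
  return tf.data.Dataset.from_tensor_slices((features, labels)).repeat().batch(2)

estimator = tf.estimator.LinearRegressor(
    feature_columns=[tf.feature_column.numeric_column('x')])
estimator.train(input_fn=my_input_fn, steps=10)      # trains for 10 steps
estimator.train(input_fn=my_input_fn, steps=10)      # incremental: 20 steps in total
estimator.train(input_fn=my_input_fn, max_steps=20)  # no-op: global step is already 20
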
Example #38
0
def Quantize(graph,
             is_training,
             weight_bits=8,
             activation_bits=8,
             ema_decay=0.999,
             quant_delay=None,
             vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
             scope=None):
    """Updates graph with quantization operations.

  Currently we quantize the following tensors:
  * Conv/MatMul: Quantize the weights if it matches.
  * Activation: Quantize the output if it matches.
  * Bypass/Post-activation Bypass: Quantize both input and output
    if it matches.

  Args:
    graph: Graph to modify.
    is_training: Whether quantizing training graph or eval graph.
    weight_bits: Number of bits to use for quantizing weights.
    activation_bits: Number of bits to use for quantizing activations.
    ema_decay: (Optional) Float, EMA decay parameter.  EMA is used to update
      quantization intervals for quantizing activations (see here about EMA:
      https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average).
    quant_delay: (Optional, default None) Int, count of global steps for which
      to delay quantization.  This helps weights stabilize at the start of
      training.
    vars_collection: (Optional) Collection where to store the variables for
      quantization interval ends.
    scope: The scope to be transformed. If it's not None, only the ops which
      are in this scope will be transformed.
  Raises:
    ValueError: When quantization fails.
  """
    if scope and not scope.endswith('/'):
        scope += '/'

    input_to_ops_map = input_to_ops.InputToOps(graph)
    for layer_match in _FindLayersToQuantize(graph):
        # Quantize the weights.
        context = _GetContextFromOp(layer_match.layer_op)

        # If `scope` is given, only quantize it if the consumer of weights
        # (the layer op) is in the right scope.
        _InsertQuantOp(context,
                       'weights_quant',
                       layer_match.weight_tensor.op, [layer_match.layer_op],
                       is_training,
                       moving_avg=False,
                       ema_decay=ema_decay,
                       quant_delay=quant_delay,
                       narrow_range=True,
                       vars_collection=vars_collection,
                       bits=weight_bits,
                       consumer_scope=scope)

        # Quantize the activations.
        consumer_ops = input_to_ops_map.ConsumerOperations(
            layer_match.activation_op)
        add_context = context
        if layer_match.bypass_op:
            add_context = re.search(r'^(.*)/([^/]+)', context).group(1)

        # If `scope` is given, only quantize it if the producer of weights
        # (usually it's the layer op) is in the right scope.
        _InsertQuantOp(add_context,
                       'act_quant',
                       layer_match.activation_op,
                       consumer_ops,
                       is_training,
                       moving_avg=True,
                       ema_decay=ema_decay,
                       quant_delay=quant_delay,
                       vars_collection=vars_collection,
                       bits=activation_bits,
                       init_min=0.0,
                       producer_scope=scope)

        # Quantize the inputs and output to the bypass (if it exists). The input to
        # the bypass is the bias add, and the output is the activation.
        if layer_match.bypass_op is not None:
            # If `scope` is given, only quantize it if both the producer and
            # the consumer are in the right scope.
            _InsertQuantOp(context,
                           'conv_quant',
                           layer_match.bias_add_op, [layer_match.bypass_op],
                           is_training,
                           moving_avg=True,
                           ema_decay=ema_decay,
                           quant_delay=quant_delay,
                           vars_collection=vars_collection,
                           bits=activation_bits,
                           producer_scope=scope,
                           consumer_scope=scope)
            # Make sure the op following this isn't an activation. If it is, we
            # shouldn't quantize it, since the activation will be fused into
            # the Add at inference time.
            consumers = input_to_ops_map.ConsumerOperations(
                layer_match.bypass_op)
            if any(
                [consumer.type in _ACTIVATION_TYPES
                 for consumer in consumers]):
                logging.info(
                    'Skipping %s, because it is followed by an activation.',
                    layer_match.bypass_op.name)
            else:
                _InsertQuantOp(add_context,
                               'add_quant',
                               layer_match.bypass_op,
                               input_to_ops_map.ConsumerOperations(
                                   layer_match.bypass_op),
                               is_training,
                               moving_avg=True,
                               ema_decay=ema_decay,
                               quant_delay=quant_delay,
                               vars_collection=vars_collection,
                               bits=activation_bits,
                               producer_scope=scope,
                               consumer_scope=scope)

        # Quantize bypass ops that occur after the activation.
        if layer_match.post_activation_bypass_op is not None:
            post_activation_bypass_context = re.search(
                r'^(.*)/([^/]+)',
                layer_match.post_activation_bypass_op.name).group(1)
            # If `scope` is given, only quantize it if the producer is in the right
            # scope.
            # Make sure the op following this isn't an activation. If it is, we
            # shouldn't quantize it, since the activation will be fused into
            # the Add at inference time.
            consumers = input_to_ops_map.ConsumerOperations(
                layer_match.post_activation_bypass_op)
            if any(
                [consumer.type in _ACTIVATION_TYPES
                 for consumer in consumers]):
                logging.info(
                    'Skipping %s, because it is followed by an activation.',
                    layer_match.post_activation_bypass_op.name)
            else:
                _InsertQuantOp(post_activation_bypass_context,
                               'post_activation_bypass_quant',
                               layer_match.post_activation_bypass_op,
                               consumers,
                               is_training,
                               moving_avg=True,
                               ema_decay=ema_decay,
                               quant_delay=quant_delay,
                               vars_collection=vars_collection,
                               bits=activation_bits,
                               producer_scope=scope)
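
A hedged sketch of invoking `Quantize` directly (in practice this is usually reached through higher-level wrappers). The import path, the scope string, and the assumption that the default graph already holds conv/matmul layers with activations are all assumptions.

import tensorflow as tf
from tensorflow.contrib.quantize.python import quantize  # assumed import path

g = tf.get_default_graph()  # assumed to already contain quantizable layers
quantize.Quantize(
    g,
    is_training=True,     # build training-time quantization ops
    weight_bits=8,
    activation_bits=8,
    quant_delay=2000,     # start quantizing after 2000 global steps
    scope='my_scope/')    # hypothetical scope restriction; None quantizes everything
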
Example #39
0
    def _initialize_multi_worker(self, cluster_resolver):
        """Initializes the object for multi-worker training."""
        cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_resolver.cluster_spec())
        task_type = cluster_resolver.task_type
        task_id = cluster_resolver.task_id
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`.")
        self._cluster_spec = cluster_spec
        self._task_type = task_type
        self._task_id = task_id

        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError(
                "No `worker`, `chief` or `evaluator` tasks can be found "
                "in `cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)

        if (ops.executing_eagerly_outside_functions() and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            context.context().configure_collective_ops(
                collective_leader=multi_worker_util.collective_leader(
                    cluster_spec, task_type, task_id),
                scoped_allocator_enabled_ops=("CollectiveReduce", ),
                device_filters=("/job:%s/task:%d" % (task_type, task_id), ))
            self._collective_ops_configured = True

        # Starting a std server in eager mode and in independent worker mode.
        if (context.executing_eagerly()
                and not getattr(self, "_std_server_started", False) and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            # Checking _local_or_standalone_client_mode as well because we should not
            # create the std server in standalone client mode.
            config_proto = copy.deepcopy(context.context().config)
            config_proto = self._update_config_proto(config_proto)

            if hasattr(cluster_resolver, "port"):
                port = cluster_resolver.port
            else:
                port = 0
            server_def = tensorflow_server_pb2.ServerDef(
                cluster=cluster_spec.as_cluster_def(),
                default_session_config=config_proto,
                job_name=task_type,
                task_index=task_id,
                protocol=cluster_resolver.rpc_layer or "grpc",
                port=port)
            context.context().enable_collective_ops(server_def)
            self._std_server_started = True
            # The `ensure_initialized` is needed before calling
            # `context.context().devices()`.
            context.context().ensure_initialized()
            logging.info(
                "Enabled multi-worker collective ops with available devices: %r",
                context.context().devices())

        # TODO(yuefengz): The `num_gpus` is only for this particular task. It
        # assumes all workers have the same number of GPUs. We should remove this
        # assumption by querying all tasks for their numbers of GPUs.
        # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
        # some cases.
        if isinstance(cluster_resolver, TFConfigClusterResolver):
            num_gpus = context.num_gpus()
        else:
            num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

        if num_gpus:
            local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                                  for i in range(num_gpus))
        else:
            local_devices = (self._worker_device, )

        self._collective_keys = cross_device_utils.CollectiveKeys()
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            devices=local_devices,
            group_size=len(local_devices) * self._num_workers,
            collective_keys=self._collective_keys,
            communication=self._communication)
        # CrossDeviceOps for per host tensors.
        self._host_cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            devices=[self._worker_device],
            group_size=self._num_workers,
            collective_keys=self._collective_keys,
            communication=cross_device_ops_lib.CollectiveCommunication.RING,
        )
        super(CollectiveAllReduceExtended,
              self)._initialize_single_worker(local_devices)

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        # Save the num_gpus_per_worker and rpc_layer for configure method.
        self._num_gpus_per_worker = num_gpus
        self._rpc_layer = cluster_resolver.rpc_layer
        self._warn_nccl_no_gpu()

        logging.info(
            "MultiWorkerMirroredStrategy with cluster_spec = %r, task_type = %r, "
            "task_id = %r, num_workers = %r, local_devices = %r, "
            "communication = %s", cluster_spec.as_dict(), task_type, task_id,
            self._num_workers, local_devices, self._communication)
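
A hedged sketch of how this multi-worker path is typically reached from user code: the strategy constructor resolves `TF_CONFIG` through a cluster resolver and then calls into this initializer. The host addresses below are made up.

import json
import os
import tensorflow as tf

os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {'worker': ['host1:2222', 'host2:2222']},
    'task': {'type': 'worker', 'index': 0},
})
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
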
Example #40
0
def _InsertQuantOp(context,
                   name,
                   producer,
                   consumers,
                   is_training,
                   moving_avg=True,
                   init_min=-6.0,
                   init_max=6.0,
                   bits=8,
                   ema_decay=0.999,
                   quant_delay=None,
                   vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
                   narrow_range=False,
                   producer_scope=None,
                   consumer_scope=None):
    """Inserts a quant op between a producer op and (multiple) consumer ops.

  Args:
    context: Context where producer and consumer operations are nested.
    name: Name for the new quantization op within the context.
    producer: Producer operation of the pairs where quantization will be
      inserted.
    consumers: Consumer operations of the pairs.
    is_training: Whether quantizing training graph or eval graph.
    moving_avg: Specifies whether to use exponential moving average or just
      the last value seen.
    init_min: Starting minimum value for the new quantization op.
    init_max: Starting maximum value for the new quantization op.
    bits: Number of bits to use for quantization, must be between 2 and 8.
    ema_decay: (Optional) Float, EMA decay parameter.  EMA is used to update
      quantization intervals for quantizing activations (see here about EMA:
      https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average).
    quant_delay: (Optional, default None) Int, count of global steps for which
      to delay quantization.  This helps weights stabilize at the start of
      training.
    vars_collection: (Optional) Collection where to store the variables for
      quantization interval ends.
    narrow_range: Whether to use the narrow quantization range
      [1; 2^bits - 1] or wide range [0; 2^bits - 1].
    producer_scope: The restriction of producer scope. If not None, the new op
      will be inserted only when the producer is in this scope.
    consumer_scope: The restriction of consumer scope. If not None, the new op
      will be inserted only when all the consumers are in this scope.
  Raises:
    ValueError: When producer operation is not directly connected to the
      consumer operation.
  """
    if producer_scope and not producer.name.startswith(producer_scope):
        logging.info(
            '_InsertQuantOp ignores context="%s" name="%s" '
            'because producer "%s" is not in scope "%s"', context, name,
            producer.name, producer_scope)
        return

    if consumer_scope:
        consumers_in_scope = []
        for consumer in consumers:
            if consumer.name.startswith(consumer_scope):
                consumers_in_scope.append(consumer)
            else:
                logging.info(
                    '_InsertQuantOp context="%s" name="%s" ignores '
                    'consumer "%s" because it is not in scope "%s"', context,
                    name, consumer.name, consumer_scope)
                return
        consumers = consumers_in_scope

    name_prefix = _AddContextToName(context, name)
    # This is needed on TPU where name_scope == 'TPUReplicate/loop', and
    # name_prefix starts with 'TPUReplicate/loop/'; without dropping it
    # variables are created as TPUReplicate/loop/TPUReplicate/loop/..., which
    # breaks things later.
    name_scope = ops.get_name_scope()
    if name_scope:
        name_prefix = common.DropStringPrefix(name_prefix, name_scope + '/')

    inputs = producer.outputs[0]
    # Prevent ops from being quantized multiple times. Bypass ops can sometimes
    # overlap between multiple matches, so we need to ensure that we don't
    # add duplicate FakeQuant operations.
    if _FollowedByFakeQuant(inputs):
        return

    if moving_avg:
        quant = (quant_ops.MovingAvgQuantize(inputs,
                                             init_min=init_min,
                                             init_max=init_max,
                                             ema_decay=ema_decay,
                                             is_training=is_training,
                                             num_bits=bits,
                                             narrow_range=narrow_range,
                                             vars_collection=vars_collection,
                                             name_prefix=name_prefix))
    else:
        quant = (quant_ops.LastValueQuantize(inputs,
                                             init_min=init_min,
                                             init_max=init_max,
                                             is_training=is_training,
                                             num_bits=bits,
                                             narrow_range=narrow_range,
                                             vars_collection=vars_collection,
                                             name_prefix=name_prefix))

    if quant_delay and quant_delay > 0:
        activate_quant = math_ops.greater_equal(
            common.CreateOrGetQuantizationStep(),
            quant_delay,
            name=name_prefix + '/activate_quant')
        quant = control_flow_ops.cond(activate_quant,
                                      lambda: quant,
                                      lambda: inputs,
                                      name=name_prefix + '/delayed_quant')

    if consumers:
        tensors_modified_count = common.RerouteTensor(quant,
                                                      inputs,
                                                      can_modify=consumers)
        # Some operations can have multiple output tensors going to the same
        # consumer. Since consumers is a set, we need to ensure that
        # tensors_modified_count is greater than or equal to the length of the set
        # of consumers.
        if tensors_modified_count < len(consumers):
            raise ValueError(
                'No inputs quantized for ops: [%s]' %
                ', '.join([consumer.name for consumer in consumers]))
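
A standalone sketch of the `quant_delay` pattern used above, written against public TF 1.x ops rather than the internal helpers; the tensor shapes and names are illustrative only.

import tensorflow as tf

inputs = tf.placeholder(tf.float32, [None, 8])
quantized = tf.quantization.fake_quant_with_min_max_args(
    inputs, min=-6.0, max=6.0, num_bits=8)
global_step = tf.train.get_or_create_global_step()
quant_delay = 2000
activate_quant = tf.greater_equal(global_step, quant_delay, name='activate_quant')
# Pass inputs through unquantized until the global step reaches quant_delay.
outputs = tf.cond(activate_quant, lambda: quantized, lambda: inputs,
                  name='delayed_quant')
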
Example #41
0
    def __init__(self,
                 model_dir=None,
                 tf_random_seed=None,
                 save_summary_steps=100,
                 save_checkpoints_steps=_USE_DEFAULT,
                 save_checkpoints_secs=_USE_DEFAULT,
                 session_config=None,
                 keep_checkpoint_max=5,
                 keep_checkpoint_every_n_hours=10000,
                 log_step_count_steps=100,
                 train_distribute=None,
                 device_fn=None,
                 protocol=None,
                 eval_distribute=None,
                 experimental_distribute=None,
                 experimental_max_worker_delay_secs=None):
        """Constructs a RunConfig.

    All distributed training related properties `cluster_spec`, `is_chief`,
    `master`, `num_worker_replicas`, `num_ps_replicas`, `task_id`, and
    `task_type` are set based on the `TF_CONFIG` environment variable, if the
    pertinent information is present. The `TF_CONFIG` environment variable is a
    JSON object with attributes: `cluster` and `task`.

    `cluster` is a JSON serialized version of `ClusterSpec`'s Python dict from
    `server_lib.py`, mapping task types (usually one of the `TaskType` enums) to
    a list of task addresses.

    `task` has two attributes: `type` and `index`, where `type` can be any of
    the task types in `cluster`. When `TF_CONFIG` contains said information,
    the following properties are set on this class:

    * `cluster_spec` is parsed from `TF_CONFIG['cluster']`. Defaults to {}. If
      present, must have one and only one node in the `chief` attribute of
      `cluster_spec`.
    * `task_type` is set to `TF_CONFIG['task']['type']`. Must be set if
      `cluster_spec` is present; must be `worker` (the default value) if
      `cluster_spec` is not set.
    * `task_id` is set to `TF_CONFIG['task']['index']`. Must be set if
      `cluster_spec` is present; must be 0 (the default value) if
      `cluster_spec` is not set.
    * `master` is determined by looking up `task_type` and `task_id` in the
      `cluster_spec`. Defaults to ''.
    * `num_ps_replicas` is set by counting the number of nodes listed
      in the `ps` attribute of `cluster_spec`. Defaults to 0.
    * `num_worker_replicas` is set by counting the number of nodes listed
      in the `worker` and `chief` attributes of `cluster_spec`. Defaults to 1.
    * `is_chief` is determined based on `task_type` and `cluster`.

    There is a special node with `task_type` as `evaluator`, which is not part
    of the (training) `cluster_spec`. It handles the distributed evaluation job.

    Example of non-chief node:
    ```
      cluster = {'chief': ['host0:2222'],
                 'ps': ['host1:2222', 'host2:2222'],
                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
      os.environ['TF_CONFIG'] = json.dumps(
          {'cluster': cluster,
           'task': {'type': 'worker', 'index': 1}})
      config = RunConfig()
      assert config.master == 'host4:2222'
      assert config.task_id == 1
      assert config.num_ps_replicas == 2
      assert config.num_worker_replicas == 4
      assert config.cluster_spec == server_lib.ClusterSpec(cluster)
      assert config.task_type == 'worker'
      assert not config.is_chief
    ```

    Example of chief node:
    ```
      cluster = {'chief': ['host0:2222'],
                 'ps': ['host1:2222', 'host2:2222'],
                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
      os.environ['TF_CONFIG'] = json.dumps(
          {'cluster': cluster,
           'task': {'type': 'chief', 'index': 0}})
      config = RunConfig()
      assert config.master == 'host0:2222'
      assert config.task_id == 0
      assert config.num_ps_replicas == 2
      assert config.num_worker_replicas == 4
      assert config.cluster_spec == server_lib.ClusterSpec(cluster)
      assert config.task_type == 'chief'
      assert config.is_chief
    ```

    Example of evaluator node (evaluator is not part of training cluster):
    ```
      cluster = {'chief': ['host0:2222'],
                 'ps': ['host1:2222', 'host2:2222'],
                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
      os.environ['TF_CONFIG'] = json.dumps(
          {'cluster': cluster,
           'task': {'type': 'evaluator', 'index': 0}})
      config = RunConfig()
      assert config.master == ''
      assert config.evaluator_master == ''
      assert config.task_id == 0
      assert config.num_ps_replicas == 0
      assert config.num_worker_replicas == 0
      assert config.cluster_spec == {}
      assert config.task_type == 'evaluator'
      assert not config.is_chief
    ```

    N.B.: If `save_checkpoints_steps` or `save_checkpoints_secs` is set,
    `keep_checkpoint_max` might need to be adjusted accordingly, especially in
    distributed training. For example, setting `save_checkpoints_secs` to 60
    without adjusting `keep_checkpoint_max` (which defaults to 5) leads to a
    situation where checkpoints are garbage collected after 5 minutes. In
    distributed training, the evaluation job starts asynchronously and might
    fail to load or find the checkpoint due to a race condition.

    Args:
      model_dir: directory where model parameters, graph, etc are saved. If
        `PathLike` object, the path will be resolved. If `None`, will use a
        default value set by the Estimator.
      tf_random_seed: Random seed for TensorFlow initializers.
        Setting this value allows consistency between reruns.
      save_summary_steps: Save summaries every this many steps.
      save_checkpoints_steps: Save checkpoints every this many steps. Can not be
          specified with `save_checkpoints_secs`.
      save_checkpoints_secs: Save checkpoints every this many seconds. Can not
          be specified with `save_checkpoints_steps`. Defaults to 600 seconds if
          both `save_checkpoints_steps` and `save_checkpoints_secs` are not set
          in constructor.  If both `save_checkpoints_steps` and
          `save_checkpoints_secs` are `None`, then checkpoints are disabled.
      session_config: a ConfigProto used to set session parameters, or `None`.
      keep_checkpoint_max: The maximum number of recent checkpoint files to
        keep. As new files are created, older files are deleted. If `None` or 0,
        all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent
        checkpoint files are kept.)
      keep_checkpoint_every_n_hours: Number of hours between each checkpoint
        to be saved. The default value of 10,000 hours effectively disables
        the feature.
      log_step_count_steps: The frequency, in number of global steps, that the
        global step and the loss will be logged during training.  Also controls
        the frequency that the global steps / s will be logged (and written to
        summary) during training.
      train_distribute: An optional instance of
        `tf.contrib.distribute.DistributionStrategy`. If specified,
        then Estimator will distribute the user's model during training,
        according to the policy specified by that strategy. Setting
        `experimental_distribute.train_distribute` is preferred.
      device_fn: A callable invoked for every `Operation` that takes the
        `Operation` and returns the device string. If `None`, defaults to
        the device function returned by `tf.train.replica_device_setter`
        with round-robin strategy.
      protocol: An optional argument which specifies the protocol used when
        starting server. `None` means default to grpc.
      eval_distribute: An optional instance of
        `tf.contrib.distribute.DistributionStrategy`. If specified,
        then Estimator will distribute the user's model during evaluation,
        according to the policy specified by that strategy. Setting
        `experimental_distribute.eval_distribute` is preferred.
      experimental_distribute: An optional
        `tf.contrib.distribute.DistributeConfig` object specifying
        DistributionStrategy-related configuration. The `train_distribute` and
        `eval_distribute` can be passed as parameters to `RunConfig` or set in
        `experimental_distribute` but not both.
      experimental_max_worker_delay_secs: An optional integer
        specifying the maximum time a worker should wait before starting.
        By default, workers are started at staggered times, with each worker
        being delayed by up to 60 seconds. This is intended to reduce the risk
        of divergence, which can occur when many workers simultaneously update
        the weights of a randomly initialized model. Users who warm-start their
        models and train them for short durations (a few minutes or less) should
        consider reducing this default to improve training times.

    Raises:
      ValueError: If both `save_checkpoints_steps` and `save_checkpoints_secs`
        are set.
    """
        if (save_checkpoints_steps == _USE_DEFAULT
                and save_checkpoints_secs == _USE_DEFAULT):
            save_checkpoints_steps = None
            save_checkpoints_secs = 600
        elif save_checkpoints_secs == _USE_DEFAULT:
            save_checkpoints_secs = None
        elif save_checkpoints_steps == _USE_DEFAULT:
            save_checkpoints_steps = None
        elif (save_checkpoints_steps is not None
              and save_checkpoints_secs is not None):
            raise ValueError(_SAVE_CKPT_ERR)

        tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
        if tf_config:
            logging.info('TF_CONFIG environment variable: %s', tf_config)

        model_dir = _get_model_dir(tf_config,
                                   compat_internal.path_to_str(model_dir))

        RunConfig._replace(
            self,
            allowed_properties_list=_DEFAULT_REPLACEABLE_LIST,
            model_dir=model_dir,
            tf_random_seed=tf_random_seed,
            save_summary_steps=save_summary_steps,
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=save_checkpoints_secs,
            session_config=session_config,
            keep_checkpoint_max=keep_checkpoint_max,
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
            log_step_count_steps=log_step_count_steps,
            train_distribute=train_distribute,
            device_fn=device_fn,
            protocol=protocol,
            eval_distribute=eval_distribute,
            experimental_distribute=experimental_distribute,
            experimental_max_worker_delay_secs=
            experimental_max_worker_delay_secs)

        # TODO(frankchn,priyag): Eventually use distributed coordinator for TPUs.
        if ((train_distribute
             and train_distribute.__class__.__name__ != 'TPUStrategy')
                or (eval_distribute
                    and eval_distribute.__class__.__name__ != 'TPUStrategy')
                or experimental_distribute):
            logging.info(
                'Initializing RunConfig with distribution strategies.')
            distribute_coordinator_training.init_run_config(self, tf_config)
        else:
            self._init_distributed_setting_from_environment_var(tf_config)
            self._maybe_overwrite_session_config_for_distributed_training()
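
A hedged construction sketch using the public `tf.estimator.RunConfig` alias; the model directory is a placeholder and `my_model_fn` is an assumed user-defined model function.

import tensorflow as tf

config = tf.estimator.RunConfig(
    model_dir='/tmp/my_model',    # placeholder path
    save_checkpoints_secs=600,    # either secs or steps may be set, not both
    keep_checkpoint_max=10,
    save_summary_steps=100,
    log_step_count_steps=100)
estimator = tf.estimator.Estimator(model_fn=my_model_fn,  # my_model_fn assumed
                                   config=config)
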
Example #42
0
    def __init__(self,
                 dim=1000,
                 num_ops=100,
                 virtual_devices_per_gpu=None,
                 device_probabilities=None):
        self._dim = dim
        self._num_ops = num_ops
        if virtual_devices_per_gpu is None:
            self._virtual_devices_per_gpu = [3]
        else:
            self._virtual_devices_per_gpu = virtual_devices_per_gpu
        self._visible_device_list = [
            i for i in range(len(self._virtual_devices_per_gpu))
        ]
        gpu_devices = [('/gpu:' + str(i))
                       for i in range(sum(self._virtual_devices_per_gpu))]
        self.devices = ['/cpu:0'] + gpu_devices
        self._num_devices = len(self.devices)
        # Each virtual device gets 2GB memory.
        self._mem_limits_mb = [([1 << 11] * i)
                               for i in self._virtual_devices_per_gpu]
        self.config = self._GetSessionConfig()

        if device_probabilities is not None:
            self._device_probabilities = list(
                device_probabilities)  # Deep copy
            for i in range(1, self._num_devices):
                self._device_probabilities[i] += (
                    self._device_probabilities[i - 1])
        else:
            # Each device gets same probability to be assigned an operation.
            step = 1.0 / self._num_devices
            self._device_probabilities = [(x + 1) * step
                                          for x in range(self._num_devices)]
        # Set the last boundary above 1.0 to guard against rounding errors.
        self._device_probabilities[self._num_devices - 1] = 1.1

        logging.info('dim: %d', self._dim)
        logging.info('num_ops: %d', self._num_ops)
        logging.info('visible_device_list: %s', str(self._visible_device_list))
        logging.info('virtual_devices_per_gpu: %s',
                     str(self._virtual_devices_per_gpu))
        logging.info('mem_limits: %s', str(self._mem_limits_mb))
        logging.info('devices: %s', str(self.devices))
        logging.info('config: %s', text_format.MessageToString(self.config))
        logging.info('device_probabilities: %s',
                     str(self._device_probabilities))
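
A hedged sketch of what a `_GetSessionConfig`-style helper might produce: one visible GPU split into virtual devices with fixed memory limits. The 2 GB figure follows the comment above; the helper name and everything else here are assumptions.

from tensorflow.core.protobuf import config_pb2

def make_virtual_gpu_config(mem_limits_mb):
  # One VirtualDevices entry per physical GPU; here a single GPU is split.
  virtual_devices = config_pb2.GPUOptions.Experimental.VirtualDevices(
      memory_limit_mb=mem_limits_mb)
  gpu_options = config_pb2.GPUOptions(
      visible_device_list='0',
      experimental=config_pb2.GPUOptions.Experimental(
          virtual_devices=[virtual_devices]))
  return config_pb2.ConfigProto(gpu_options=gpu_options)

config = make_virtual_gpu_config([2048, 2048, 2048])  # three 2 GB virtual devices
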
Example #43
0
 def func():
   logging.info('func running')
Example #44
0
    def _initialize_local(self, cluster_resolver, devices=None):
        """Initializes the object for local training."""
        self._is_chief = True
        self._num_workers = 1

        if ops.executing_eagerly_outside_functions():
            try:
                context.context().configure_collective_ops(
                    scoped_allocator_enabled_ops=("CollectiveReduce", ))
            except RuntimeError:
                logging.warning(
                    "Collective ops is not configured at program startup. "
                    "Some performance features may not be enabled.")
            self._collective_ops_configured = True

        # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
        # some cases.
        if isinstance(cluster_resolver, TFConfigClusterResolver):
            num_gpus = context.num_gpus()
        else:
            num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

        if devices:
            local_devices = devices
        else:
            if num_gpus:
                local_devices = tuple("/device:GPU:%d" % i
                                      for i in range(num_gpus))
            else:
                local_devices = ("/device:CPU:0", )

        self._worker_device = device_util.canonicalize("/device:CPU:0")
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)

        self._collective_keys = cross_device_utils.CollectiveKeys()
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            devices=local_devices,
            group_size=len(local_devices),
            collective_keys=self._collective_keys,
            communication=self._communication)
        # CrossDeviceOps for per host tensors.
        self._host_cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            devices=[self._worker_device],
            group_size=self._num_workers,
            collective_keys=self._collective_keys,
            communication=cross_device_ops_lib.CollectiveCommunication.RING,
        )
        super(CollectiveAllReduceExtended,
              self)._initialize_single_worker(local_devices)

        self._cluster_spec = None
        self._task_type = None
        self._task_id = None

        # This is a mark to tell whether we are running with standalone client or
        # independent worker. Right now with standalone client, strategy object is
        # created as local strategy and then turn into multi-worker strategy via
        # configure call.
        self._local_or_standalone_client_mode = True

        # Save the num_gpus_per_worker and rpc_layer for configure method.
        self._num_gpus_per_worker = num_gpus
        self._rpc_layer = cluster_resolver.rpc_layer
        self._warn_nccl_no_gpu()

        logging.info(
            "Single-worker MultiWorkerMirroredStrategy with local_devices "
            "= %r, communication = %s", local_devices, self._communication)
Example #45
0
def train_step(sess, train_op, global_step, train_step_kwargs):
    """Function that takes a gradient step and specifies whether to stop.

  Args:
    sess: The current session.
    train_op: An `Operation` that evaluates the gradients and returns the
      total loss.
    global_step: A `Tensor` representing the global training step.
    train_step_kwargs: A dictionary of keyword arguments.

  Returns:
    The total loss and a boolean indicating whether or not to stop training.

  Raises:
    ValueError: if 'should_trace' is in `train_step_kwargs` but `logdir` is not.
  """
    start_time = time.time()

    trace_run_options = None
    run_metadata = None
    if 'should_trace' in train_step_kwargs:
        if 'logdir' not in train_step_kwargs:
            raise ValueError(
                'logdir must be present in train_step_kwargs when '
                'should_trace is present')
        if sess.run(train_step_kwargs['should_trace']):
            trace_run_options = config_pb2.RunOptions(
                trace_level=config_pb2.RunOptions.FULL_TRACE)
            run_metadata = config_pb2.RunMetadata()

    total_loss, np_global_step = sess.run([train_op, global_step],
                                          options=trace_run_options,
                                          run_metadata=run_metadata)
    time_elapsed = time.time() - start_time

    if run_metadata is not None:
        tl = timeline.Timeline(run_metadata.step_stats)
        trace = tl.generate_chrome_trace_format()
        trace_filename = os.path.join(train_step_kwargs['logdir'],
                                      'tf_trace-%d.json' % np_global_step)
        logging.info('Writing trace to %s', trace_filename)
        file_io.write_string_to_file(trace_filename, trace)
        if 'summary_writer' in train_step_kwargs:
            train_step_kwargs['summary_writer'].add_run_metadata(
                run_metadata, 'run_metadata-%d' % np_global_step)

    if 'should_log' in train_step_kwargs:
        if sess.run(train_step_kwargs['should_log']):
            logging.info('global step %d: loss = %.4f (%.3f sec/step)',
                         np_global_step, total_loss, time_elapsed)

    # TODO(nsilberman): figure out why we can't put this into sess.run. The
    # issue right now is that the stop check depends on the global step. The
    # increment of the global step often happens via the train op, which is
    # created using optimizer.apply_gradients.
    #
    # Since running `train_op` causes the global step to be incremented, one
    # would expected that using a control dependency would allow the
    # should_stop check to be run in the same session.run call:
    #
    #   with ops.control_dependencies([train_op]):
    #     should_stop_op = ...
    #
    # However, this actually seems not to work on certain platforms.
    if 'should_stop' in train_step_kwargs:
        should_stop = sess.run(train_step_kwargs['should_stop'])
    else:
        should_stop = False

    return total_loss, should_stop
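
A hedged sketch of building `train_step_kwargs` by hand and calling `train_step` once; normally the `train()` loop shown later assembles these. The toy loss-decaying `train_op` below is an assumption standing in for a real training op that returns the loss when run.

import tensorflow as tf

loss = tf.Variable(1.0)
global_step = tf.train.get_or_create_global_step()
# A loss-returning train op, as expected by train_step (toy: just decays the loss).
train_op = tf.assign(loss, loss * 0.9)
train_step_kwargs = {
    'should_stop': tf.greater_equal(global_step, 1000),
    'should_log': tf.equal(tf.mod(global_step, 10), 0),
}
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  total_loss, should_stop = train_step(sess, train_op, global_step,
                                       train_step_kwargs)
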
Example #46
0
def convert_variables_to_constants_v2(func):
    """Replaces all the variables in a graph with constants of the same values.

  TensorFlow 2.0 function for converting all Variable ops into Const ops holding
  the same values. This makes it possible to describe the network fully with a
  single GraphDef file, and allows the removal of a lot of ops related to
  loading and saving the variables. This function runs Grappler's function
  inlining optimization in order to return a single subgraph.

  The current implementation only works for graphs that do not contain any
  control flow or embedding related ops.

  Args:
    func: ConcreteFunction.

  Returns:
    GraphDef containing a simplified version of the original.
  """
    # TODO(nupurgarg): Replace ResourceGather with Gather.
    # TODO(nupurgarg): Change attr for Variables in control flow and functions.
    graph_def = _run_inline_graph_optimization(func)

    # Identify the ReadVariableOps.
    get_name = lambda name: name.split(":")[0]
    map_name_to_node = {get_name(node.name): node for node in graph_def.node}

    # TODO(b/125838789): Use `func.graph.captures`.
    # Get mapping from input name to variable value.
    tensor_data = {}
    input_tensors = func.inputs[-len(func.captured_inputs):]
    for var in func.graph.variables:
        index = func.captured_inputs.index(var.handle)
        tensor = input_tensors[index]
        tensor_data[get_name(tensor.name)] = var.numpy()

    resource_identities = {}
    resource_placeholders = {}
    for node in graph_def.node:
        if node.op == "ReadVariableOp":
            # Get name of Placeholder op associated with ReadVariableOp. There can be
            # an Identity in between the ReadVariableOp and Placeholder. Store the
            # Identity ops with the associated dtypes.
            input_name = get_name(node.input[0])
            while map_name_to_node[input_name].op == "Identity":
                resource_identities[input_name] = node.attr["dtype"]
                input_name = get_name(map_name_to_node[input_name].input[0])
            if map_name_to_node[input_name].op != "Placeholder":
                raise ValueError(
                    "Cannot find the Placeholder op that is an input "
                    "to the ReadVariableOp.")
            # Build a map of Placeholder ops that are inputs to ReadVariableOps to the
            # variable's dtype and data.
            resource_placeholders[input_name] = {
                "dtype": node.attr["dtype"],
                "data": tensor_data[input_name],
            }

    # Reconstruct the graph with constants in place of variables.
    output_graph_def = graph_pb2.GraphDef()
    how_many_converted = 0

    for input_node in graph_def.node:
        output_node = output_graph_def.node.add()
        # Convert Placeholder ops that are inputs to ReadVariableOps into Const ops.
        if input_node.name in resource_placeholders:
            dtype = resource_placeholders[input_node.name]["dtype"]
            data = resource_placeholders[input_node.name]["data"]

            output_node.op = "Const"
            output_node.name = input_node.name
            output_node.attr["dtype"].CopyFrom(dtype)
            output_node.attr["value"].tensor.CopyFrom(
                tensor_util.make_tensor_proto(data,
                                              dtype=dtype.type,
                                              shape=data.shape))
            how_many_converted += 1
        # Change the dtype for Identity ops that are inputs to ReadVariableOps.
        elif input_node.name in resource_identities:
            output_node.CopyFrom(input_node)
            output_node.attr["T"].CopyFrom(
                resource_identities[input_node.name])
        # Convert ReadVariableOps into Identity ops.
        elif input_node.op == "ReadVariableOp":
            output_node.op = "Identity"
            output_node.name = input_node.name
            output_node.input.extend([input_node.input[0]])
            output_node.attr["T"].CopyFrom(input_node.attr["dtype"])
            if "_class" in input_node.attr:
                output_node.attr["_class"].CopyFrom(input_node.attr["_class"])
        else:
            output_node.CopyFrom(input_node)

    logging.info("Converted %d variables to const ops.", how_many_converted)
    return output_graph_def
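
A hedged usage sketch in TF 2.x eager style; the tiny scaling function is a stand-in, and per the docstring above the result is a frozen `GraphDef` with the variable folded into a constant.

import tensorflow as tf

w = tf.Variable(2.0)

@tf.function(input_signature=[tf.TensorSpec([None], tf.float32)])
def scale(x):
  return x * w

frozen_graph_def = convert_variables_to_constants_v2(scale.get_concrete_function())
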
Example #47
0
 def after_run(self, run_context, run_values):
   evals_completed = run_values.results['evals_completed']
   if self._log_progress:
     logging.info('Evaluation [%d/%d]', evals_completed, self._num_evals)
   if evals_completed >= self._num_evals:
     run_context.request_stop()
Example #48
0
def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None,
          trace_every_n_steps=None):
    """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to. If None, model
      checkpoints and summaries will not be written.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must take exactly four arguments: the current session,
      the `train_op` `Tensor`, a global step `Tensor`, and a dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
      are provided.
    log_every_n_steps: The frequency, in terms of global steps, at which the
      loss and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The address of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training,
      as measured by 'global_step': training will stop if global_step is
      greater than 'number_of_steps'. If the value is left as None, training
      proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling `tf.global_variables_initializer()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If left to its default
      value, then the session is initialized by calling
      `tf.local_variables_initializer()` and `tf.tables_initializer()`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    ready_op: Operation to check if the model is ready to use. If left to its
      default value, then the session checks for readiness by calling
      `tf.report_uninitialized_variables()`.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    summary_writer: `SummaryWriter` to use.  Can be `None`
      to indicate that no summaries should be written. If unset, we
      create a SummaryWriter.
    startup_delay_steps: The number of steps to wait for before beginning. Note
      that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer, or a list of
      them. If the argument is supplied, gradient updates will be synchronous.
      If left as `None`, gradient updates will be asynchronous.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.
    trace_every_n_steps: produce and save a `Timeline` in Chrome trace format
      and add it to the summaries every `trace_every_n_steps`. If None, no trace
      information will be produced or saved.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, if `number_of_steps` is
      negative, or if `trace_every_n_steps` is not `None` and no `logdir` is
      provided.
  """
    if train_op is None:
        raise ValueError('train_op cannot be None.')

    if logdir is None:
        if summary_op != _USE_DEFAULT:
            raise ValueError('Cannot provide summary_op because logdir=None')
        if saver is not None:
            raise ValueError('Cannot provide saver because logdir=None')
        if trace_every_n_steps is not None:
            raise ValueError('Cannot provide trace_every_n_steps because '
                             'logdir=None')

    if isinstance(sync_optimizer,
                  sync_replicas_optimizer.SyncReplicasOptimizer):
        sync_optimizer = [sync_optimizer]
    if sync_optimizer is not None and startup_delay_steps > 0:
        raise ValueError(
            'startup_delay_steps must be zero when sync_optimizer is supplied.'
        )

    if number_of_steps is not None and number_of_steps <= 0:
        raise ValueError(
            '`number_of_steps` must be either None or a positive number.')

    graph = graph or ops.get_default_graph()
    with graph.as_default():
        if global_step is None:
            global_step = variables.get_or_create_global_step()
        saver = saver or tf_saver.Saver()

        if sync_optimizer is not None:
            for opt in sync_optimizer:
                if not isinstance(
                        opt, sync_replicas_optimizer.SyncReplicasOptimizer):
                    raise ValueError(
                        '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.'
                    )

        with ops.name_scope('init_ops'):
            if init_op == _USE_DEFAULT:
                init_op = tf_variables.global_variables_initializer()

            if ready_op == _USE_DEFAULT:
                ready_op = tf_variables.report_uninitialized_variables()

            if local_init_op == _USE_DEFAULT:
                local_init_op = control_flow_ops.group(
                    tf_variables.local_variables_initializer(),
                    lookup_ops.tables_initializer())

            if sync_optimizer is not None and isinstance(sync_optimizer, list):
                with ops.control_dependencies(
                    [local_init_op] if local_init_op is not None else []):
                    if is_chief:
                        local_init_op = control_flow_ops.group(
                            *[opt.chief_init_op for opt in sync_optimizer])
                    else:
                        local_init_op = control_flow_ops.group(
                            *[opt.local_step_init_op for opt in sync_optimizer])
                ready_for_local_init_op = control_flow_ops.group(
                    *[opt.ready_for_local_init_op for opt in sync_optimizer])
            else:
                ready_for_local_init_op = None

        if summary_op == _USE_DEFAULT:
            summary_op = summary.merge_all()

        if summary_writer == _USE_DEFAULT:
            summary_writer = supervisor.Supervisor.USE_DEFAULT

        if is_chief and sync_optimizer is not None:
            # Need to create these BEFORE the supervisor finalizes the graph:
            init_tokens_op = [
                opt.get_init_tokens_op() for opt in sync_optimizer
            ]
            chief_queue_runner = [
                opt.get_chief_queue_runner() for opt in sync_optimizer
            ]

        if train_step_kwargs == _USE_DEFAULT:
            with ops.name_scope('train_step'):
                train_step_kwargs = {}

                if number_of_steps:
                    should_stop_op = math_ops.greater_equal(
                        global_step, number_of_steps)
                else:
                    should_stop_op = constant_op.constant(False)
                train_step_kwargs['should_stop'] = should_stop_op
                if log_every_n_steps > 0:
                    train_step_kwargs['should_log'] = math_ops.equal(
                        math_ops.mod(global_step, log_every_n_steps), 0)
                if is_chief and trace_every_n_steps is not None:
                    train_step_kwargs['should_trace'] = math_ops.equal(
                        math_ops.mod(global_step, trace_every_n_steps), 0)
                    train_step_kwargs['logdir'] = logdir

    sv = supervisor.Supervisor(graph=graph,
                               is_chief=is_chief,
                               logdir=logdir,
                               init_op=init_op,
                               init_feed_dict=init_feed_dict,
                               local_init_op=local_init_op,
                               ready_for_local_init_op=ready_for_local_init_op,
                               ready_op=ready_op,
                               summary_op=summary_op,
                               summary_writer=summary_writer,
                               global_step=global_step,
                               saver=saver,
                               save_summaries_secs=save_summaries_secs,
                               save_model_secs=save_interval_secs,
                               init_fn=init_fn)

    if summary_writer is not None:
        train_step_kwargs['summary_writer'] = sv.summary_writer

    should_retry = True
    while should_retry:
        try:
            should_retry = False
            with sv.managed_session(master,
                                    start_standard_services=False,
                                    config=session_config) as sess:
                logging.info('Starting Session.')
                if is_chief:
                    if logdir:
                        sv.start_standard_services(sess)
                elif startup_delay_steps > 0:
                    _wait_for_step(
                        sess, global_step,
                        min(startup_delay_steps,
                            number_of_steps or sys.maxsize))
                threads = sv.start_queue_runners(sess)
                logging.info('Starting Queues.')
                if is_chief and sync_optimizer is not None:
                    sv.start_queue_runners(sess, chief_queue_runner)
                    sess.run(init_tokens_op)
                try:
                    while not sv.should_stop():
                        total_loss, should_stop = train_step_fn(
                            sess, train_op, global_step, train_step_kwargs)
                        if should_stop:
                            logging.info('Stopping Training.')
                            sv.request_stop()
                            break
                except errors.OutOfRangeError:
                    # OutOfRangeError is thrown when epoch limit per
                    # tf.train.limit_epochs is reached.
                    logging.info('Caught OutOfRangeError. Stopping Training.')
                if logdir and sv.is_chief:
                    logging.info('Finished training! Saving model to disk.')
                    sv.saver.save(sess,
                                  sv.save_path,
                                  global_step=sv.global_step)
                    sv.stop(threads, close_summary_writer=True)

        except errors.AbortedError:
            # Always re-run on AbortedError as it indicates a restart of one of the
            # distributed tensorflow servers.
            logging.info('Retrying training!')
            should_retry = True

    return total_loss
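
# A stripped-down sketch of the retry control flow used above: the training run
# is retried whenever an AbortedError indicates that one of the distributed
# TensorFlow servers restarted, and the last computed loss is returned once a
# run completes. Plain Python with a stand-in exception class, for illustration.
class AbortedError(Exception):
  pass


def run_training_session():
  # Stand-in for the body of the managed session above.
  return 0.42


def train_with_retry():
  should_retry = True
  while should_retry:
    try:
      should_retry = False
      total_loss = run_training_session()
    except AbortedError:
      # Always re-run on AbortedError, mirroring the loop above.
      should_retry = True
  return total_loss


print(train_with_retry())  # 0.42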
Example #49
  def print_hparams(self):
    logging.info(self._spec.to_json())
Example #50
def _evaluate_once(checkpoint_path,
                   master='',
                   scaffold=None,
                   eval_ops=None,
                   feed_dict=None,
                   final_ops=None,
                   final_ops_feed_dict=None,
                   hooks=None,
                   config=None):
  """Evaluates the model at the given checkpoint path.

  During a single evaluation, the `eval_ops` is run until the session is
  interrupted or requested to finish. This is typically requested via a
  `tf.contrib.training.StopAfterNEvalsHook` which results in `eval_ops` running
  the requested number of times.

  Optionally, a user can pass in `final_ops`, a single `Tensor`, a list of
  `Tensors` or a dictionary from names to `Tensors`. The `final_ops` is
  evaluated a single time after `eval_ops` has finished running and the fetched
  values of `final_ops` are returned. If `final_ops` is left as `None`, then
  `None` is returned.

  One may also consider using a `tf.contrib.training.SummaryAtEndHook` to record
  summaries after the `eval_ops` have run. If `eval_ops` is `None`, the
  summaries run immediately after the model checkpoint has been restored.

  Note that `evaluate_once` creates a local variable used to track the number of
  evaluations run via `tf.contrib.training.get_or_create_eval_step`.
  Consequently, if a custom local init op is provided via a `scaffold`, the
  caller should ensure that the local init op also initializes the eval step.

  Args:
    checkpoint_path: The path to a checkpoint to use for evaluation.
    master: The BNS address of the TensorFlow master.
    scaffold: A `tf.train.Scaffold` instance for initializing variables and
      restoring variables. Note that `scaffold.init_fn` is used by the function
      to restore the checkpoint. If you supply a custom init_fn, then it must
      also take care of restoring the model from its checkpoint.
    eval_ops: A single `Tensor`, a list of `Tensors` or a dictionary of names
      to `Tensors`, which is run until the session is requested to stop,
      commonly done by a `tf.contrib.training.StopAfterNEvalsHook`.
    feed_dict: The feed dictionary to use when executing the `eval_ops`.
    final_ops: A single `Tensor`, a list of `Tensors` or a dictionary of names
      to `Tensors`.
    final_ops_feed_dict: A feed dictionary to use when evaluating `final_ops`.
    hooks: List of `tf.train.SessionRunHook` callbacks which are run inside the
      evaluation loop.
    config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.

  Returns:
    The fetched values of `final_ops` or `None` if `final_ops` is `None`.
  """
  eval_step = _get_or_create_eval_step()

  # Prepare the run hooks.
  hooks = list(hooks or [])

  if eval_ops is not None:
    update_eval_step = state_ops.assign_add(eval_step, 1, use_locking=True)

    if isinstance(eval_ops, dict):
      eval_ops['update_eval_step'] = update_eval_step
    elif isinstance(eval_ops, (tuple, list)):
      eval_ops = list(eval_ops) + [update_eval_step]
    else:
      eval_ops = [eval_ops, update_eval_step]

    eval_step_value = _get_latest_eval_step_value(eval_ops)

    for h in hooks:
      if isinstance(h, _StopAfterNEvalsHook):
        h._set_evals_completed_tensor(eval_step_value)  # pylint: disable=protected-access

  logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                         time.gmtime()))

  # Prepare the session creator.
  session_creator = monitored_session.ChiefSessionCreator(
      scaffold=scaffold,
      checkpoint_filename_with_path=checkpoint_path,
      master=master,
      config=config)

  final_ops_hook = basic_session_run_hooks.FinalOpsHook(
      final_ops, final_ops_feed_dict)
  hooks.append(final_ops_hook)

  with monitored_session.MonitoredSession(
      session_creator=session_creator, hooks=hooks) as session:
    if eval_ops is not None:
      while not session.should_stop():
        session.run(eval_ops, feed_dict)

  logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                         time.gmtime()))
  return final_ops_hook.final_ops_values
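
# A minimal usage sketch of the one-shot evaluation pattern implemented above,
# assuming a TensorFlow 1.x runtime where tf.contrib.training exposes the public
# evaluate_once wrapper around this helper. The variable, metric, checkpoint
# path, and hook count below are invented for illustration only.
import tensorflow as tf

tf.reset_default_graph()
weight = tf.get_variable('weight', initializer=1.0)  # something to checkpoint
labels = tf.constant([1, 0, 1, 1])
predictions = tf.constant([1, 0, 0, 1])
# tf.metrics.accuracy returns (value_op, update_op): update_op runs once per
# eval step, value_op is fetched a single time at the end via final_ops.
accuracy, update_op = tf.metrics.accuracy(labels, predictions)

# Save a checkpoint to evaluate against; this stands in for a real training run.
checkpoint_path = '/tmp/evaluate_once_sketch.ckpt'
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  tf.train.Saver().save(sess, checkpoint_path)

results = tf.contrib.training.evaluate_once(
    checkpoint_path=checkpoint_path,
    eval_ops=update_op,
    final_ops={'accuracy': accuracy},
    hooks=[tf.contrib.training.StopAfterNEvalsHook(1)])
print(results['accuracy'])  # 0.75 for the toy labels/predictions above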
Example #51
  def _Run(self, is_training, use_trt, batch_size, num_epochs, model_dir):
    """Train or evaluate the model.

    Args:
      is_training: whether to train or evaluate the model. In training mode,
        quantization will be simulated where the quantize_and_dequantize_v2 are
        placed.
      use_trt: if true, use TRT INT8 mode for evaluation, which will perform
        real quantization. Otherwise use native TensorFlow which will perform
        simulated quantization. Ignored if is_training is True.
      batch_size: batch size.
      num_epochs: how many epochs to train. Ignored if is_training is False.
      model_dir: where to save or load checkpoint.

    Returns:
      The Estimator evaluation result.
    """
    # Get dataset
    train_data, test_data = mnist.load_data()

    def _PreprocessFn(x, y):
      x = math_ops.cast(x, dtypes.float32)
      x = array_ops.expand_dims(x, axis=2)
      x = 2.0 * (x / 255.0) - 1.0
      y = math_ops.cast(y, dtypes.int32)
      return x, y

    def _EvalInputFn():
      mnist_x, mnist_y = test_data
      dataset = data.Dataset.from_tensor_slices((mnist_x, mnist_y))
      dataset = dataset.apply(
          data.experimental.map_and_batch(
              map_func=_PreprocessFn,
              batch_size=batch_size,
              num_parallel_calls=8))
      dataset = dataset.repeat(count=1)
      iterator = dataset.make_one_shot_iterator()
      features, labels = iterator.get_next()
      return features, labels

    def _TrainInputFn():
      mnist_x, mnist_y = train_data
      dataset = data.Dataset.from_tensor_slices((mnist_x, mnist_y))
      dataset = dataset.shuffle(2 * len(mnist_x))
      dataset = dataset.apply(
          data.experimental.map_and_batch(
              map_func=_PreprocessFn,
              batch_size=batch_size,
              num_parallel_calls=8))
      dataset = dataset.repeat(count=num_epochs)
      iterator = dataset.make_one_shot_iterator()
      features, labels = iterator.get_next()
      return features, labels

    def _ModelFn(features, labels, mode):
      if is_training:
        logits_out = self._BuildGraph(features)
      else:
        graph_def = self._GetGraphDef(use_trt, batch_size, model_dir)
        logits_out = importer.import_graph_def(
            graph_def,
            input_map={INPUT_NODE_NAME: features},
            return_elements=[OUTPUT_NODE_NAME + ':0'],
            name='')[0]

      loss = losses.sparse_softmax_cross_entropy(
          labels=labels, logits=logits_out)
      summary.scalar('loss', loss)

      classes_out = math_ops.argmax(logits_out, axis=1, name='classes_out')
      accuracy = metrics.accuracy(
          labels=labels, predictions=classes_out, name='acc_op')
      summary.scalar('accuracy', accuracy[1])

      if mode == ModeKeys.EVAL:
        return EstimatorSpec(
            mode, loss=loss, eval_metric_ops={'accuracy': accuracy})
      elif mode == ModeKeys.TRAIN:
        optimizer = AdamOptimizer(learning_rate=1e-2)
        train_op = optimizer.minimize(loss, global_step=get_global_step())
        return EstimatorSpec(mode, loss=loss, train_op=train_op)

    config_proto = config_pb2.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    estimator = Estimator(
        model_fn=_ModelFn,
        model_dir=model_dir if is_training else None,
        config=RunConfig(session_config=config_proto))

    if is_training:
      estimator.train(_TrainInputFn)
    results = estimator.evaluate(_EvalInputFn)
    logging.info('accuracy: %s', str(results['accuracy']))
    return results
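
# A simplified sketch of the eval input pipeline built above, assuming a
# TensorFlow 1.x runtime. It uses the plain Dataset.map().batch() chain instead
# of the fused data.experimental.map_and_batch transformation, which is
# behaviorally equivalent here; the random arrays stand in for the MNIST data.
import numpy as np
import tensorflow as tf

images = np.random.randint(0, 256, size=(16, 28, 28)).astype(np.uint8)
labels = np.random.randint(0, 10, size=(16,)).astype(np.int64)


def _preprocess(x, y):
  x = tf.cast(x, tf.float32)
  x = tf.expand_dims(x, axis=2)  # add a channel dimension: (28, 28, 1)
  x = 2.0 * (x / 255.0) - 1.0    # scale pixels from [0, 255] to [-1, 1]
  return x, tf.cast(y, tf.int32)


def eval_input_fn():
  dataset = tf.data.Dataset.from_tensor_slices((images, labels))
  dataset = dataset.map(_preprocess, num_parallel_calls=8).batch(4)
  return dataset.make_one_shot_iterator().get_next()


features, batch_labels = eval_input_fn()
with tf.Session() as sess:
  x_batch, y_batch = sess.run([features, batch_labels])
  print(x_batch.shape, y_batch.shape)  # (4, 28, 28, 1) (4,)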
Example #52
def main(dataset_dir, log_dir, tfrecord_filename):

    #State the location of the checkpoint file
    checkpoint_file = 'init_ckpt/inception_resnet_v2.ckpt'


    #State the labels file and read it
    labels_file = dataset_dir+'/labels.txt'
    labels = open(labels_file, 'r')

    #Create a dictionary mapping each label to its string name
    labels_to_name = {}
    for line in labels:
        label, string_name = line.split(':')
        string_name = string_name[:-1] #Remove newline
        labels_to_name[int(label)] = string_name

    #Create the file pattern of your TFRecord files so they can be recognized later on
    file_pattern = tfrecord_filename + '_%s_*.tfrecord'


    #================= TRAINING INFORMATION ==================
    #State the number of epochs to train
    num_epochs = 2

    #State your batch size
    batch_size = 4

    #Learning rate information and configuration (Up to you to experiment)
    initial_learning_rate = 0.001
    learning_rate_decay_factor = 0.5
    num_epochs_before_decay = 1

    #Create the log directory here; doing it at module import time would create it unnecessarily.
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
   # session=tf.Session()
    #Training the model
    #We start by constructing the graph and then building the model
    with tf.Graph().as_default():
        tf.logging.set_verbosity(tf.logging.INFO) #Set the verbosity to INFO level

        #First create the dataset and load one batch
        dataset = get_split('train', dataset_dir, file_pattern=file_pattern, tfrecord_filename=tfrecord_filename)
        images, _, labels = load_batch(dataset, height=image_size, width=image_size, batch_size=batch_size)

        #Compute the number of batches per epoch; since one step processes one batch, this also equals the number of steps per epoch, which we use to derive the learning rate decay steps.
        num_batches_per_epoch = dataset.num_samples / batch_size
        num_steps_per_epoch = num_batches_per_epoch 
        decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)

        #Create the model inference
        with slim.arg_scope(inception_resnet_v2_arg_scope()):
            logits, end_points = inception_resnet_v2(images, num_classes = dataset.num_classes, is_training = True)

        #Define the scopes that you want to exclude for restoration
        exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits']
        variables_to_restore = slim.get_variables_to_restore(exclude = exclude)
        
        #Perform one-hot-encoding of the labels (Try one-hot-encoding within the load_batch function!)
        one_hot_labels = slim.one_hot_encoding(labels, dataset.num_classes)
        """# To make the model better at classifying the input images, we must somehow change the variables for all the network layers. To do this we first need to know how well
        the model currently performs by comparing the predicted output of the model `y_pred` to the desired output `y_true`
         The cross-entropy is a performance measure used in classification. The cross-entropy is a continuous function that is always positive and
        if the predicted output of the model exactly matches the desired output then the cross-entropy equals zero. 
        The goal of optimization is therefore to minimize the cross-entropy so it gets as close to zero as possible by changing the variables of the network layers.
        Performs the equivalent to tf.nn.sparse_softmax_cross_entropy_with_logits but enhanced with checks
        this is achieved by computing the loss, applies the gradients in order to update the weight and later return a tensor that when evaluated returns the total loss
        """
        loss = tf.losses.softmax_cross_entropy(onehot_labels = one_hot_labels, logits = logits)
        total_loss = tf.losses.get_total_loss()    #obtain the regularization losses as well

        #Create the global step for monitoring the learning_rate and training.
        global_step = get_or_create_global_step()

        #Define your exponentially decaying learning rate
        lr = tf.train.exponential_decay(
            learning_rate = initial_learning_rate,
            global_step = global_step,
            decay_steps = decay_steps,
            decay_rate = learning_rate_decay_factor,
            staircase = True)

        #Now we can define the optimizer that takes on the learning rate
        optimizer = tf.train.AdamOptimizer(learning_rate = lr)

        """
        Create the train_op,
        Computation of the loss and gradient
        """

        train_op = slim.learning.create_train_op(total_loss, optimizer)
        
        #State the metrics that you want to track. The predictions are class indices, not one-hot encoded.
        predictions = tf.argmax(end_points['Predictions'], 1)
        probabilities = end_points['Predictions']
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(predictions, labels)
        metrics_op = tf.group(accuracy_update, probabilities)
        my_summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
        for end_point in end_points:
          x = end_points[end_point]
          my_summaries.add(tf.summary.histogram('activation/' + end_point, x))
          

        #Now finally create all the summaries you need to monitor and group them into one summary op.
        
        my_summaries.add(tf.summary.scalar('losses/Total_Loss', total_loss))
        my_summaries.add(tf.summary.scalar('accuracy', accuracy))
        my_summaries.add(tf.summary.scalar('learning_rate', lr))
        my_summary_op = tf.summary.merge(list(my_summaries))
        
        #session.run(tf.global_variables_initializer())
        
        #Now we create a training step function that runs the train_op and metrics_op together and returns the updated global step.
        def train_step(sess, train_op, global_step):
            """
            Runs the provided ops in the session and logs the loss and the time elapsed for the global step.
            """
            #Check the time for each sess run
            start_time = time.time()
            total_loss, global_step_count, _ = sess.run([train_op, global_step, metrics_op])
            time_elapsed = time.time() - start_time

            #Run the logging to print some results
            logging.info('global step %s: loss: %.4f (%.2f sec/step)', global_step_count, total_loss, time_elapsed)

            return total_loss, global_step_count

        #Create a saver and a restore function that loads the pretrained variables from the checkpoint file into the session.
        saver = tf.train.Saver(variables_to_restore)
        def restore_fn(sess):
            return saver.restore(sess, checkpoint_file)

        #Define your supervisor for running a managed session. Do not run the summary_op automatically or else it will consume too much memory
        sv = tf.train.Supervisor(logdir = log_dir, summary_op = None, init_fn = restore_fn)

        #Run the managed session
        with sv.managed_session() as sess:
            for step in range(int(num_steps_per_epoch * num_epochs)):
            # for step in xrange(1):
                #At the start of every epoch, show the vital information:
                if step % num_batches_per_epoch == 0:
                    logging.info('Epoch %s/%s', step/num_batches_per_epoch + 1, num_epochs)
                    learning_rate_value, accuracy_value = sess.run([lr, accuracy])
                    logging.info('Current Learning Rate: %s', learning_rate_value)
                    logging.info('Current Streaming Accuracy: %s', accuracy_value)

                    # optionally, print your logits and predictions for a sanity check that things are going fine.
                    logits_value, probabilities_value, predictions_value, labels_value = sess.run([logits, probabilities, predictions, labels])
                    print('logits: \n', logits_value)
                    print('Probabilities: \n', probabilities_value)
                    print('predictions: \n', predictions_value)
                    print('Labels:\n:', labels_value)

                #Log the summaries every 10 steps.
                if step % 10 == 0:
                    loss, _ = train_step(sess, train_op, sv.global_step)
                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)
                    
                #If not, simply run the training step
                else:
                    loss, _ = train_step(sess, train_op, sv.global_step)

            #We log the final training loss and accuracy
            logging.info('Final Loss: %s', loss)
            logging.info('Final Accuracy: %s', sess.run(accuracy))

            #Once all the training has been done, save the log files and checkpoint model
            logging.info('Finished training! Saving model to disk now.')
            # saver.save(sess, "model.ckpt")
            sv.saver.save(sess, sv.save_path, global_step = sv.global_step)
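
# A short numeric sketch of the cross-entropy behaviour described in the comment
# block of the example above: the loss approaches zero as the predicted
# distribution approaches the one-hot target and grows as it moves away from it.
# Pure NumPy, no TensorFlow required.
import numpy as np


def softmax_cross_entropy(logits, one_hot_label):
  exp = np.exp(logits - np.max(logits))  # numerically stabilized softmax
  probs = exp / exp.sum()
  return -np.sum(one_hot_label * np.log(probs))


target = np.array([0.0, 1.0, 0.0])
print(softmax_cross_entropy(np.array([0.0, 8.0, 0.0]), target))  # ~0.0007: confident and correct
print(softmax_cross_entropy(np.array([0.0, 0.0, 0.0]), target))  # ~1.10: uninformative prediction
print(softmax_cross_entropy(np.array([8.0, 0.0, 0.0]), target))  # ~8.0: confident but wrong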
Example #53
  def __call__(self, run_context, all_workers, lame_workers):
    del run_context, lame_workers
    all_workers.shutdown(exit_code=42)

    logging.info('Resetting coordinator.')
    raise CoordinatorResetError()
Example #54
def main():
    logging.set_verbosity(logging.INFO)
    if not gfile.IsDirectory(OutputPath('')):
        gfile.MakeDirs(OutputPath(''))
    parameters = {}
    parameters['use_character_lstm'] = True
    parameters['character_embedding_dimension'] = 25
    parameters['token_embedding_dimension'] = 100
    parameters['token_pretrained_embedding_filepath'] = ''
    parameters['pretrained_model_checkpoint_filepath'] = OutputPath(
        'char_model_{0:05d}.ckpt'.format(2))
    parameters['character_lstm_hidden_state_dimension'] = 25
    parameters['token_lstm_hidden_state_dimension'] = 100
    parameters['use_crf'] = True
    parameters['optimizer'] = 'adam'
    parameters['learning_rate'] = 0.005
    parameters['gradient_clipping_value'] = 5.0
    parameters['dropout_rate'] = 0.2
    parameters['maximum_number_of_epochs'] = 10

    loading_time = time.time()
    test_data_path = '/cs/natlang-user/vivian/wsj-conll/test.conllu'
    wordMapPath = 'word_map'
    tagMapPath = 'tag_map'
    charMapPath = 'char_map'
    word_map = readMap(wordMapPath)
    tag_map = readMap(tagMapPath)
    char_map = readMap(charMapPath)

    test_data = Dataset(test_data_path)
    test_data.load_dataset(word_map, tag_map, char_map)

    sess = tf.Session()
    with sess.as_default():
        model = EntityLSTM(test_data, parameters)
        sess.run(tf.global_variables_initializer())

        model_saver = tf.train.Saver(
            max_to_keep=parameters['maximum_number_of_epochs'])
        model_saver.restore(sess,
                            parameters['pretrained_model_checkpoint_filepath'])

        total_token_num = 0
        correct_token_num = 0
        start = time.time()
        transition_params_trained = sess.run(model.transition_parameters)
        start = time.time()
        while test_data.has_next_sent():
            sent = test_data.get_next_sent()
            feed_dict = {
                model.input_token_indices:
                sent.word_ids,
                model.input_label_indices:
                sent.tag_ids,
                model.input_token_character_indices:
                utils.pad_lists(sent.char_lists),
                model.input_token_lengths:
                sent.word_lengths,
                model.dropout_keep_prob:
                1 - parameters['dropout_rate']
            }
            unary_scores, predictions = sess.run(
                [model.unary_scores, model.predictions], feed_dict)
            if parameters['use_crf']:
                predictions, _ = tf.contrib.crf.viterbi_decode(
                    unary_scores, transition_params_trained)
                predictions = predictions[1:-1]
            else:
                predictions = predictions.tolist()
            gold_labels = sent.tag_ids
            total_token_num += len(predictions)
            for idx, p in enumerate(predictions):
                if p == gold_labels[idx]:
                    correct_token_num += 1
        logging.info('token number is %d, accuracy is %.2f%%, time is %.2f',
                     total_token_num,
                     (100.0 * correct_token_num / total_token_num),
                     time.time() - start)
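
# A toy sketch of the CRF decoding step used in the loop above, assuming a
# TensorFlow 1.x runtime; tf.contrib.crf.viterbi_decode is a pure NumPy helper,
# so no session is needed. The unary scores and transition matrix are invented.
import numpy as np
import tensorflow as tf

# unary_scores: [sequence_length, num_tags]; transition_params: [num_tags, num_tags]
unary_scores = np.array([[4.0, 1.0, 0.0],
                         [1.0, 3.0, 1.0],
                         [0.0, 1.0, 5.0]])
transition_params = np.array([[0.5, 0.1, 0.1],
                              [0.1, 0.5, 0.1],
                              [0.1, 0.1, 0.5]])
best_path, best_score = tf.contrib.crf.viterbi_decode(unary_scores,
                                                      transition_params)
print(best_path, best_score)  # [0, 1, 2] with a total path score of 12.2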
Example #55
    def _train_model(self,
                     input_fn,
                     steps,
                     feed_fn=None,
                     init_op=None,
                     init_feed_fn=None,
                     init_fn=None,
                     device_fn=None,
                     monitors=None,
                     log_every_steps=100,
                     fail_on_nan_loss=True):
        if self._config.execution_mode not in ('all', 'train'):
            return

        # Stagger startup of worker sessions based on task id.
        sleep_secs = min(
            self._config.training_worker_max_startup_secs, self._config.task *
            self._config.training_worker_session_startup_stagger_secs)
        if sleep_secs:
            logging.info('Waiting %d secs before starting task %d.',
                         sleep_secs, self._config.task)
            time.sleep(sleep_secs)

        # Device allocation
        device_fn = device_fn or self._device_fn

        self._graph = ops.Graph()
        with self._graph.as_default() as g, g.device(device_fn):
            random_seed.set_random_seed(self._config.tf_random_seed)
            global_step = contrib_framework.create_global_step(g)
            features, targets = input_fn()
            self._check_inputs(features, targets)
            train_op, loss_op = self._get_train_ops(features, targets)

            # Add default monitors.
            if monitors is None:
                monitors = []
            monitors += monitors_lib.get_default_monitors(
                loss_op=loss_op,
                summary_op=logging_ops.get_summary_op(),
                save_summary_steps=100)

            is_chief = self._config.task == 0
            if not is_chief:
                # Run monitors only on chief.
                monitors = []

            # Setup monitors.
            for monitor in monitors:
                monitor.set_estimator(self)

            return train(graph=g,
                         output_dir=self._model_dir,
                         train_op=train_op,
                         loss_op=loss_op,
                         global_step_tensor=global_step,
                         init_op=init_op,
                         init_feed_dict=init_feed_fn()
                         if init_feed_fn is not None else None,
                         init_fn=init_fn,
                         log_every_steps=log_every_steps,
                         supervisor_is_chief=is_chief,
                         supervisor_master=self._config.master,
                         feed_fn=feed_fn,
                         max_steps=steps,
                         fail_on_nan_loss=fail_on_nan_loss,
                         monitors=monitors)
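
# A quick standalone illustration of the worker startup staggering computed at
# the top of _train_model above; the two config values are made-up examples.
training_worker_max_startup_secs = 60
training_worker_session_startup_stagger_secs = 5

for task in range(0, 20, 4):
  sleep_secs = min(training_worker_max_startup_secs,
                   task * training_worker_session_startup_stagger_secs)
  print('task %2d waits %2d secs before starting' % (task, sleep_secs))
# task 0 starts immediately, task 4 waits 20 secs, task 8 waits 40 secs, and
# task 12 onwards is capped at the 60 sec maximum.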
Example #56
    def _init_distributed_setting_from_environment_var(self):
        """Initialize distributed properties based on environment variable."""

        tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV) or '{}')
        if tf_config:
            logging.info('TF_CONFIG environment variable: %s', tf_config)

        self._cluster_spec = server_lib.ClusterSpec(
            tf_config.get(_CLUSTER_KEY, {}))
        task_env = tf_config.get(_TASK_ENV_KEY, {})

        if self._cluster_spec:
            # Distributed mode.
            if TaskType.CHIEF not in self._cluster_spec.jobs:
                raise ValueError(
                    'If "cluster" is set in TF_CONFIG, it must have one "chief" node.'
                )
            if len(self._cluster_spec.job_tasks(TaskType.CHIEF)) > 1:
                raise ValueError(
                    'The "cluster" in TF_CONFIG must have only one "chief" node.'
                )

            self._task_type = task_env.get(_TASK_TYPE_KEY, None)
            task_id = task_env.get(_TASK_ID_KEY, None)

            if not self._task_type:
                raise ValueError(
                    'If "cluster" is set in TF_CONFIG, task type must be set.')
            if task_id is None:
                raise ValueError(
                    'If "cluster" is set in TF_CONFIG, task index must be set.'
                )

            self._task_id = int(task_id)

            # Check the task id bounds. Upper bound is not necessary as
            # - for evaluator, there is no upper bound.
            # - for non-evaluator, task id is upper bounded by the number of jobs in
            # cluster spec, which will be checked later (when retrieving the `master`)
            if self._task_id < 0:
                raise ValueError('Task index must be non-negative number.')

            if self._task_type != TaskType.EVALUATOR:
                self._master = _get_master(self._cluster_spec, self._task_type,
                                           self._task_id)
                self._num_ps_replicas = _count_ps(self._cluster_spec)
                self._num_worker_replicas = _count_worker(self._cluster_spec)
            else:
                # Evaluator is not part of the training cluster.
                self._cluster_spec = server_lib.ClusterSpec({})
                self._master = _LOCAL_MASTER
                self._num_ps_replicas = 0
                self._num_worker_replicas = 0

            self._is_chief = self._task_type == TaskType.CHIEF
        else:
            # Local mode.
            self._task_type = task_env.get(_TASK_TYPE_KEY, TaskType.WORKER)
            self._task_id = int(task_env.get(_TASK_ID_KEY, 0))

            if self._task_type != TaskType.WORKER:
                raise ValueError(
                    'If "cluster" is not set in TF_CONFIG, task type must be WORKER.'
                )
            if self._task_id != 0:
                raise ValueError(
                    'If "cluster" is not set in TF_CONFIG, task index must be 0.'
                )

            self._master = ''
            self._is_chief = True
            self._num_ps_replicas = 0
            self._num_worker_replicas = 1
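
# A hedged illustration of the TF_CONFIG layout that the method above parses;
# the host:port addresses are placeholders. In distributed mode the cluster must
# contain exactly one "chief" task, and the "task" dict names this process.
import json
import os

os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'chief': ['host0:2222'],
        'worker': ['host1:2222', 'host2:2222'],
        'ps': ['host3:2222'],
    },
    'task': {'type': 'worker', 'index': 1},
})
# With this setting the method would report task_type='worker', task_id=1,
# is_chief=False, and would derive the master address and replica counts from
# the cluster spec.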
Example #57
    def build_training(self,
                       handle,
                       component_weights=None,
                       unroll_using_oracle=None,
                       max_index=-1):
        """Builds a training pipeline.

    Args:
      handle: Handle tensor for the ComputeSession.
      component_weights: If set, this is a list of relative weights
        each component's cost should get in the pipeline. Defaults to 1.0 for
        each component.
      unroll_using_oracle: If set, this is a list of booleans indicating
        whether or not to use the gold decodings for each component. Defaults
        to True for each component.
      max_index: Training will use only the first max_index components,
        or -1 for all components.

    Returns:
      handle: to the ComputeSession, conditioned on completing training step.
      outputs: a dictionary of useful training tensors.

    Raises:
      IndexError: if max_index is positive but out of bounds.
    """
        self.read_from_avg = False
        if max_index < 0:
            max_index = len(self.components)
        else:
            if not 0 < max_index <= len(self.components):
                raise IndexError(
                    'Invalid max_index {} for components {}; handle {}'.format(
                        max_index, self.component_names, handle.name))

        # By default, we train every component supervised.
        if not component_weights:
            component_weights = [1] * max_index
        if not unroll_using_oracle:
            unroll_using_oracle = [True] * max_index

        component_weights = component_weights[:max_index]
        total_weight = float(sum(component_weights))
        component_weights = [w / total_weight for w in component_weights]

        unroll_using_oracle = unroll_using_oracle[:max_index]

        logging.info('Creating training target:')
        logging.info('\tWeights: %s', component_weights)
        logging.info('\tOracle: %s', unroll_using_oracle)

        metrics_list = []
        cost = tf.constant(0.)
        effective_batch = tf.constant(0)

        avg_ops = []
        params_to_train = []

        network_states = {}
        for component_index in range(0, max_index):
            comp = self.components[component_index]
            network_states[comp.name] = component.NetworkState()

            logging.info('Initializing data for component "%s"', comp.name)
            handle = dragnn_ops.init_component_data(
                handle, component=comp.name, clear_existing_annotations=False)
            # TODO(googleuser): Phase out component.MasterState.
            master_state = component.MasterState(
                handle, dragnn_ops.batch_size(handle, component=comp.name))
            with tf.control_dependencies([handle, cost]):
                args = (master_state, network_states)
                if unroll_using_oracle[component_index]:
                    handle, component_cost, correct, total = comp.build_training(
                        *args)
                else:
                    handle = comp.build_inference(*args, during_training=True)
                    component_cost = tf.constant(0.)
                    correct, total = tf.constant(0), tf.constant(0)

                weighted_component_cost = tf.multiply(
                    component_cost,
                    tf.constant(float(component_weights[component_index])),
                    name='weighted_component_cost')

                cost += weighted_component_cost
                effective_batch += total
                metrics_list += [[total], [correct]]

                with tf.control_dependencies([comp.advance_counters(total)]):
                    cost = tf.identity(cost)

                # Keep track of which parameters will be trained, and any moving
                # average updates to apply for these parameters.
                params_to_train += comp.network.params
                if self.hyperparams.use_moving_average:
                    avg_ops += comp.avg_ops

        # Concatenate evaluation results
        metrics = tf.concat(metrics_list, 0)

        # Now that the cost is computed:
        # 1. compute the gradients,
        # 2. add an optimizer to update the parameters using the gradients,
        # 3. make the ComputeSession handle depend on the optimizer.
        grads_and_vars = self.optimizer.compute_gradients(
            cost, var_list=params_to_train)
        clipped_gradients = [(self._clip_gradients(g), v)
                             for g, v in grads_and_vars]
        minimize_op = self.optimizer.apply_gradients(
            clipped_gradients, global_step=self.master_vars['step'])

        if self.hyperparams.use_moving_average:
            with tf.control_dependencies([minimize_op]):
                minimize_op = tf.group(*avg_ops)

        # Make sure all the side-effectful minimizations ops finish before
        # proceeding.
        with tf.control_dependencies([minimize_op]):
            handle = tf.identity(handle)

        # Ensure that subsequent builds don't read from the moving average by default.
        self.read_from_avg = False

        # Returns named access to common outputs.
        outputs = {
            'cost': cost,
            'batch': effective_batch,
            'metrics': metrics,
        }
        return handle, outputs
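
# A tiny standalone sketch of the defaulting and normalization that
# build_training applies to component_weights before its training loop;
# plain Python, with invented values for illustration.
def normalize_weights(component_weights, max_index):
  if not component_weights:
    component_weights = [1] * max_index
  component_weights = component_weights[:max_index]
  total_weight = float(sum(component_weights))
  return [w / total_weight for w in component_weights]


print(normalize_weights(None, 3))    # [0.333..., 0.333..., 0.333...]
print(normalize_weights([3, 1], 2))  # [0.75, 0.25]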
Example #58
  def stop(self):
    logging.info('Stopping worker watchdog.')
    self._reset_manager(stopping=True)
    self._running = False
    self.join()
Example #59
# ==============================================================================
"""Builds a DRAGNN graph for local training."""

import tensorflow as tf
from tensorflow.core.protobuf import saver_pb2
from tensorflow.python.platform import tf_logging as logging

from dragnn.protos import spec_pb2
from dragnn.python import component
from dragnn.python import dragnn_ops
from dragnn.python import check

try:
    tf.NotDifferentiable('ExtractFixedFeatures')
except KeyError as e:
    logging.info(str(e))


def _create_learning_rate(hyperparams, step_var):
    """Creates learning rate var, with decay and switching for CompositeOptimizer.

  Args:
    hyperparams: a GridPoint proto containing optimizer spec, particularly
      learning_method to determine optimizer class to use.
    step_var: tf.Variable, global training step.

  Returns:
    a scalar `Tensor`, the learning rate based on current step and hyperparams.
  """
    base_rate = hyperparams.learning_rate
    return tf.train.exponential_decay(base_rate,
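
# As a reference for the call being constructed above, a minimal hedged sketch
# of tf.train.exponential_decay in TensorFlow 1.x. With staircase=True the
# decayed rate is base_rate * decay_rate ** (step // decay_steps); the numeric
# hyperparameters below are placeholders, not DRAGNN's GridPoint fields.
import tensorflow as tf

step_var = tf.train.get_or_create_global_step()
learning_rate = tf.train.exponential_decay(
    learning_rate=0.1,  # base rate
    global_step=step_var,
    decay_steps=1000,
    decay_rate=0.96,
    staircase=True)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run(learning_rate))    # 0.1 at step 0
  sess.run(step_var.assign(2500))
  print(sess.run(learning_rate))    # 0.1 * 0.96 ** 2 ~= 0.09216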
Example #60
    def train_helper(self,
                     input_window_size,
                     loss,
                     max_loss=None,
                     train_steps=200,
                     anomaly_prob=0.01,
                     anomaly_distribution=None,
                     multiple_periods=False):
        np.random.seed(3)
        data_noise_stddev = 0.2
        if max_loss is None:
            if loss == ar_model.ARModel.NORMAL_LIKELIHOOD_LOSS:
                max_loss = 1.0
            else:
                max_loss = 0.05 / (data_noise_stddev**2)
        train_data, test_data = self.create_data(
            noise_stddev=data_noise_stddev,
            anomaly_prob=anomaly_prob,
            multiple_periods=multiple_periods)
        output_window_size = 10
        window_size = input_window_size + output_window_size

        class _RunConfig(estimator_lib.RunConfig):
            @property
            def tf_random_seed(self):
                return 3

        estimator = ARRegressor(
            periodicities=self.period,
            anomaly_prior_probability=0.01 if anomaly_distribution else None,
            anomaly_distribution=anomaly_distribution,
            num_features=2,
            output_window_size=output_window_size,
            num_time_buckets=20,
            input_window_size=input_window_size,
            hidden_layer_sizes=[16],
            loss=loss,
            config=_RunConfig())
        train_input_fn = input_pipeline.RandomWindowInputFn(
            time_series_reader=input_pipeline.NumpyReader(train_data),
            window_size=window_size,
            batch_size=64,
            num_threads=1,
            shuffle_seed=2)
        test_input_fn = test_utils.AllWindowInputFn(
            time_series_reader=input_pipeline.NumpyReader(test_data),
            window_size=window_size)

        # Test training
        estimator.train(input_fn=train_input_fn, steps=train_steps)
        test_evaluation = estimator.evaluate(input_fn=test_input_fn, steps=1)
        test_loss = test_evaluation["loss"]
        logging.info("Final test loss: %f", test_loss)
        self.assertLess(test_loss, max_loss)
        if loss == ar_model.ARModel.SQUARED_LOSS:
            # Test that the evaluation loss is reported without input scaling.
            self.assertAllClose(
                test_loss,
                np.mean((test_evaluation["mean"] -
                         test_evaluation["observed"])**2))

        # Test predict
        train_data_times = train_data[TrainEvalFeatures.TIMES]
        train_data_values = train_data[TrainEvalFeatures.VALUES]
        test_data_times = test_data[TrainEvalFeatures.TIMES]
        test_data_values = test_data[TrainEvalFeatures.VALUES]
        predict_times = np.expand_dims(
            np.concatenate(
                [train_data_times[input_window_size:], test_data_times]), 0)
        predict_true_values = np.expand_dims(
            np.concatenate(
                [train_data_values[input_window_size:], test_data_values]), 0)
        state_times = np.expand_dims(train_data_times[:input_window_size], 0)
        state_values = np.expand_dims(train_data_values[:input_window_size, :],
                                      0)
        state_exogenous = state_times[:, :, None][:, :, :0]

        def prediction_input_fn():
            return ({
                PredictionFeatures.TIMES:
                training.limit_epochs(predict_times, num_epochs=1),
                PredictionFeatures.STATE_TUPLE:
                (state_times, state_values, state_exogenous)
            }, {})

        (predictions, ) = tuple(
            estimator.predict(input_fn=prediction_input_fn))
        predicted_mean = predictions["mean"][:, 0]
        true_values = predict_true_values[0, :, 0]

        if loss == ar_model.ARModel.NORMAL_LIKELIHOOD_LOSS:
            variances = predictions["covariance"][:, 0]
            standard_deviations = np.sqrt(variances)
            # Note that we may get tighter bounds with more training steps.
            errors = np.abs(predicted_mean -
                            true_values) > 4 * standard_deviations
            fraction_errors = np.mean(errors)
            logging.info("Fraction errors: %f", fraction_errors)