def __init__(self,
               hparams,
               train_iterations,
               eval_steps,
               per_host_v1=False):
    tf.logging.info("TrainLowLevelRunner: constructor")

    self.feature_structure = {}
    self.eval_feature_structure = {}
    self.loss = None
    self.infeed_queue = []
    self.eval_infeed_queue = []
    self.enqueue_ops = []
    self.eval_enqueue_ops = []
    self.dataset_initializer = []
    self.eval_dataset_initializer = []
    self.is_local = ((hparams.master == "") and (hparams.tpu_name is None))
    self.per_host_v1 = per_host_v1
    self.iterations = train_iterations
    self.eval_steps = eval_steps
    self.outfeed_tensors = []
    self.outfeed_names = []
    self.dequeue_ops = []
    self.predictions = {}
    self.sess = None
    self.graph = tf.Graph()
    self.hparams = hparams
    self.num_hosts = hparams.num_shards // hparams.num_shards_per_host
    with self.graph.as_default():
      self.tpu_init = [tpu.initialize_system()]
      self.tpu_shutdown = tpu.shutdown_system()

    self.resolver = get_resolver(hparams)
    session_config = tf.ConfigProto(
        allow_soft_placement=True,
        isolate_session_state=True,
        operation_timeout_in_ms=600 * 60 * 1000,
        graph_options=tf.GraphOptions(
            rewrite_options=rewriter_config_pb2.RewriterConfig(
                disable_meta_optimizer=True)))

    if self.hparams.tpu_name is None:
      master = self.hparams.master
    else:
      cluster_spec = self.resolver.cluster_spec()
      if cluster_spec:
        session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
      master = self.resolver.get_master()
    self.sess = tf.Session(master, graph=self.graph, config=session_config)
    self.sess.run(self.tpu_init)
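
# Hypothetical usage sketch for the constructor above. The class name
# TrainLowLevelRunner is inferred from the log message, and the hparams fields
# (master, tpu_name, num_shards, num_shards_per_host) are assumptions based on
# how __init__ reads them; adjust to the real hparams definition.
hparams = tf.contrib.training.HParams(
    master="",
    tpu_name=None,
    num_shards=8,
    num_shards_per_host=8)
runner = TrainLowLevelRunner(
    hparams=hparams, train_iterations=100, eval_steps=10)
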
def run_inference(inputs,
                  pipeline_config_file,
                  ckpt_path,
                  input_type='encoded_image_string_tensor',
                  use_bfloat16=False,
                  repeat=1):
    """Runs inference on TPU.

  Args:
    inputs: Input image with the same type as `input_type`
    pipeline_config_file: Pipeline config file name.
    ckpt_path: Training checkpoint path.
    input_type: One of
                'encoded_image_string_tensor': a 1d tensor with dtype=tf.string
                'image_tensor': a 4d tensor with dtype=tf.uint8
                'tf_example': a 1d tensor with dtype=tf.string
    use_bfloat16: If true, use tf.bfloat16 on TPU.
    repeat: Number of times to repeat running the provided input for profiling.

  Returns:
    A dict of resulting tensors.
  """

    pipeline_config, meta_arch = parse_pipeline_config(pipeline_config_file)

    shapes_info = model_map[meta_arch].get_prediction_tensor_shapes(
        pipeline_config)

    with tf.Graph().as_default(), tf.Session() as sess:
        placeholder_tensor, result_tensor_dict = model_map[
            meta_arch].build_graph(pipeline_config, shapes_info, input_type,
                                   use_bfloat16)

        saver = tf.train.Saver()
        init_op = tf.global_variables_initializer()

        sess.run(tpu.initialize_system())

        sess.run(init_op)
        if ckpt_path is not None:
            saver.restore(sess, ckpt_path)

        for _ in range(repeat):
            tensor_dict_out = sess.run(
                result_tensor_dict, feed_dict={placeholder_tensor: [inputs]})

        sess.run(tpu.shutdown_system())

        return tensor_dict_out
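
# Hypothetical invocation of run_inference above. The image, pipeline config,
# and checkpoint paths are placeholders, not part of the original example.
with tf.gfile.GFile("/tmp/image.jpg", "rb") as f:
    encoded_image = f.read()

detections = run_inference(
    inputs=encoded_image,
    pipeline_config_file="/tmp/pipeline.config",
    ckpt_path="/tmp/model.ckpt",
    input_type="encoded_image_string_tensor",
    use_bfloat16=False,
    repeat=1)
print(sorted(detections.keys()))
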
def run_inference_from_saved_model(inputs,
                                   saved_model_dir,
                                   input_placeholder_name='placeholder_tensor',
                                   repeat=1):
    """Loads saved model and run inference on TPU.

  Args:
    inputs: Input image with the same type as `input_type`
    saved_model_dir: The directory SavedModel being exported to.
    input_placeholder_name: input placeholder's name in SavedModel signature.
    repeat: Number of times to repeat running the provided input for profiling.

  Returns:
    A dict of resulting tensors.
  """
    with tf.Graph().as_default(), tf.Session() as sess:
        meta_graph = loader.load(sess,
                                 [tag_constants.SERVING, tag_constants.TPU],
                                 saved_model_dir)

        sess.run(tpu.initialize_system())

        key_prediction = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY

        tensor_name_input = (meta_graph.signature_def[key_prediction].
                             inputs[input_placeholder_name].name)
        tensor_name_output = {
            k: v.name
            for k, v in (
                meta_graph.signature_def[key_prediction].outputs.items())
        }

        for _ in range(repeat):
            tensor_dict_out = sess.run(tensor_name_output,
                                       feed_dict={tensor_name_input: [inputs]})

        sess.run(tpu.shutdown_system())

        return tensor_dict_out
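
# Hypothetical invocation of run_inference_from_saved_model above. The image
# path and SavedModel directory are placeholders; the placeholder name matches
# the function's default argument.
with tf.gfile.GFile("/tmp/image.jpg", "rb") as f:
    encoded_image = f.read()

outputs = run_inference_from_saved_model(
    inputs=encoded_image,
    saved_model_dir="/tmp/exported_tpu_saved_model",
    input_placeholder_name="placeholder_tensor",
    repeat=1)
print(sorted(outputs.keys()))
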
def shutdown_tpu_system(cluster_resolver=None):
    """Shuts down the TPU devices.

  This will clear all caches, even those that are maintained through sequential
  calls to tf.tpu.experimental.initialize_tpu_system, such as the compilation
  cache.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices found for eager execution or if run in a
        tf.function.
  """
    job = None
    if cluster_resolver is None:
        # If no cluster resolver is specified, and running eagerly, execute the
        # shutdown ops in the current device scope.
        if context.executing_eagerly():
            curr_device = device.DeviceSpec.from_string(
                context.context().device_name)
            if curr_device.job is not None:
                job = "{}/replica:0/task:0".format(curr_device.job)

        cluster_resolver = TPUClusterResolver("")
    assert isinstance(cluster_resolver, TPUClusterResolver)

    tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
    if tpu_name not in _INITIALIZED_TPU_SYSTEMS:
        logging.warning(
            "You are shutting down a TPU system %s that has not been "
            "initialized.", tpu_name)

    logging.info("Shutting down the TPU system: %s", tpu_name)

    if context.executing_eagerly():
        # This function is written this way for the following non-intuitive
        # reason: tpu.shutdown_system creates a dummy op whose sole purpose is
        # to trigger DistributedTPURewritePass, the pass that adds the real ops
        # which shut down the TPU system. Thus we can't simply run
        # tpu.shutdown_system eagerly; we need to wrap it in a defun and
        # trigger the rewrite passes on it.
        if tpu_name not in _LOCAL_MASTERS:
            # Explicitly place tpu.shutdown_system on the first worker to
            # avoid an "output node matches multiple devices" error.
            job = "{}/replica:0/task:0".format(cluster_resolver.get_job_name())

        @function.defun
        def _tpu_shutdown_fn():
            tpu.shutdown_system(job=job)

        # The TPU_SYSTEM device must match the device used in tpu.shutdown_system
        # exactly, otherwise you can get errors if there are multiple TPU_SYSTEM
        # devices available.
        with ops.device(tpu._tpu_system_device_name(job)):  # pylint: disable=protected-access
            _tpu_shutdown_fn()

        # Clear out the eager context caches since the memory is invalid now.
        logging.info("Clearing out eager caches")
        context.context()._clear_caches()  # pylint: disable=protected-access
    elif not ops.executing_eagerly_outside_functions():
        master = cluster_resolver.master()
        cluster_spec = cluster_resolver.cluster_spec()

        session_config = config_pb2.ConfigProto(allow_soft_placement=True)
        if cluster_spec:
            session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

        with ops.Graph().as_default():
            with session_lib.Session(config=session_config,
                                     target=master) as sess:
                sess.run(tpu.shutdown_system())
    else:
        raise RuntimeError("shutdown_tpu_system is not supported within "
                           "tf.functions.")

    logging.info("Finished shutting down TPU system.")
    if tpu_name in _INITIALIZED_TPU_SYSTEMS:
        del _INITIALIZED_TPU_SYSTEMS[tpu_name]
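
# Minimal TF2 eager-mode sketch of the public API that the function above
# appears to back, tf.tpu.experimental.shutdown_tpu_system. "my-tpu" is a
# placeholder TPU name, not part of the original example.
import tensorflow as tf

resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="my-tpu")
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
# ... build and run TPU computations here ...
tf.tpu.experimental.shutdown_tpu_system(resolver)  # clears the compilation cache
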
    def __init__(self, iterations, eval_steps):
        tf.logging.info("LowLevelRunner: constructor.")

        self.fake_feature_structure = {}
        self.feature_structure = {}
        self.fake_eval_feature_structure = {}
        self.eval_feature_structure = {}
        self.infeed_queue = []
        self.eval_infeed_queue = []
        self.fake_enqueue_ops = []
        self.enqueue_ops = []
        self.fake_eval_enqueue_ops = []
        self.eval_enqueue_ops = []
        self.fake_dataset_initializer = []
        self.dataset_initializer = []
        self.fake_eval_dataset_initializer = []
        self.eval_dataset_initializer = []
        self.outfeed_tensors = []
        self.outfeed_names = []
        self.dequeue_ops = []
        self.train_compile_op = None
        self.eval_compile_op = None
        self.loss = None
        self.eval_op = None
        self.iterations = iterations
        self.eval_steps = eval_steps
        self.num_hosts = FLAGS.tpu_num_shards // FLAGS.tpu_num_shards_per_host
        self.scaffold_fn = None
        self.tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.master or FLAGS.cloud_tpu_name)
        # Disable grappler for better performance.
        self.session_config = tf.ConfigProto(
            allow_soft_placement=True,
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True)),
            isolate_session_state=True,
            operation_timeout_in_ms=600 * 60 * 1000)  # 10 hours
        cluster_spec = self.tpu_cluster_resolver.cluster_spec()
        if cluster_spec:
            self.session_config.cluster_def.CopyFrom(
                cluster_spec.as_cluster_def())
        self.input_graph = tf.Graph()
        self.eval_input_graph = tf.Graph()
        # Train and eval share the same session and graph so that the weights
        # can be shared for in memory eval.
        self.graph = tf.Graph()
        self.output_graph = tf.Graph()
        with self.graph.as_default():
            if FLAGS.random_seed:
                tf.random.set_random_seed(FLAGS.random_seed)
            self.num_epochs_tensor = tf.placeholder(tf.int32,
                                                    shape=(),
                                                    name="epochs")
            self.train_steps_tensor = tf.placeholder(
                tf.int32, shape=(), name="steps_per_train_loop")
            self.eval_steps_tensor = tf.placeholder(tf.int32,
                                                    shape=(),
                                                    name="steps_per_eval_loop")
            self.tpu_init = [tpu.initialize_system()]
            self.tpu_shutdown = tpu.shutdown_system()
        self.master = self.tpu_cluster_resolver.get_master()
        self.input_sess = tf.Session(self.master,
                                     graph=self.input_graph,
                                     config=self.session_config)
        self.eval_input_sess = tf.Session(self.master,
                                          graph=self.eval_input_graph,
                                          config=self.session_config)
        self.sess = tf.Session(self.master,
                               graph=self.graph,
                               config=self.session_config)
        self.output_sess = tf.Session(self.master,
                                      graph=self.output_graph,
                                      config=self.session_config)
        self.sess.run(self.tpu_init)
        self.infeed_thread = None
        self.train_eval_thread = None
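
# Hypothetical construction of the runner above. The class name LowLevelRunner
# is inferred from the log message; FLAGS (master, cloud_tpu_name,
# tpu_num_shards, tpu_num_shards_per_host, random_seed) must already be
# defined and parsed elsewhere in the module.
runner = LowLevelRunner(iterations=100, eval_steps=10)
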
    def __init__(self,
                 iterations,
                 eval_steps,
                 sleep_seconds=120,
                 num_multiprocessing_workers=ssd_constants.WORKER_COUNT,
                 num_cores_per_shard=1,
                 input_partition_dims=None):
        tf.logging.info("TrainAndEvalLowLevelRunner: constructor")

        self.eval_steps = eval_steps
        self.feature_structure = {}
        self.eval_feature_structure = {}
        self.loss = None
        self.infeed_queue = []
        self.eval_infeed_queue = []
        self.enqueue_ops = []
        self.dequeue_ops = []
        self.predictions = {}
        self.eval_enqueue_ops = []
        self.train_eval_compile_op = None
        self.dataset_initializer = []
        self.eval_dataset_initializer = []
        self.iterations = iterations
        # TODO(wangtao): change FLAGS.num_shards_per_host to
        # FLAGS.num_cores_per_host once the other low-level APIs support
        # spatial partitioning. FLAGS.num_shards_per_host means the number of
        # TPU cores per host.
        self.replicas_per_worker = FLAGS.num_shards_per_host // num_cores_per_shard
        self.num_hosts = FLAGS.num_shards * num_cores_per_shard // FLAGS.num_shards_per_host
        self.num_shards = FLAGS.num_shards
        self.scaffold_fn = None
        self.sess = None
        self.input_sess = None
        self.graph = tf.Graph()
        self.input_graph = tf.Graph()
        self.eval_op = None
        self.infeed_thread = None
        self.eval_epochs = []
        self.success_epoch = 1000
        self.log_epochs = {}
        self.params = {}
        self.train_loop = None
        self.tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name or FLAGS.master,
            zone=FLAGS.tpu_zone,
            project=FLAGS.gcp_project)
        # Disable grappler for better performance.
        self.session_config = tf.ConfigProto(
            allow_soft_placement=True,
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True)),
            isolate_session_state=True,
            operation_timeout_in_ms=600 * 60 * 1000)  # 10 hours
        cluster_spec = self.tpu_cluster_resolver.cluster_spec()
        if cluster_spec:
            self.session_config.cluster_def.CopyFrom(
                cluster_spec.as_cluster_def())
        self.tpu_init = tpu.initialize_system()
        self.tpu_shutdown = tpu.shutdown_system()
        self.master = self.tpu_cluster_resolver.get_master()
        self.init_sess = tf.Session(self.master, config=self.session_config)
        self.outfeed_tensors = []
        self.outfeed_names = []
        self.run_success = False
        self.log_run_success = False
        self.num_multiprocessing_workers = num_multiprocessing_workers

        # Figure out the steps and epochs to eval for MLPerf.
        self.eval_at_steps = np.cumsum(ssd_constants.EVAL_STEPS).tolist()
        self.eval_iterations = [
            steps // 20000 - 1 for steps in self.eval_at_steps
        ]
        self.max_train_iterations = int(
            math.ceil(FLAGS.num_epochs * FLAGS.num_examples_per_epoch /
                      (FLAGS.train_batch_size * self.iterations)))
        self.sleep_seconds = sleep_seconds

        tf.logging.info("eval_at_steps: %s", self.eval_at_steps)
        tf.logging.info("eval_iterations: %s", self.eval_iterations)

        # Init for spatial partitioning.
        self.device_topology = self.init_sess.run(self.tpu_init)
        self.input_partition_dims = [input_partition_dims, None]
        self.use_spatial_partition = (
            input_partition_dims is not None
            and int(np.prod(FLAGS.input_partition_dims)) > 1)
        self.num_cores_per_shard = num_cores_per_shard
        if self.use_spatial_partition:
            computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
                self.num_cores_per_shard]
            self.device_assignment = tpu_device_assignment.device_assignment(
                topology=self.device_topology,
                computation_shape=computation_shape,
                num_replicas=self.num_shards)
            tf.logging.info("num_cores_per_shard: %d",
                            self.num_cores_per_shard)
            tf.logging.info("num_hosts: %d", self.num_hosts)
            tf.logging.info("replicas_per_worker: %d",
                            self.replicas_per_worker)
            tf.logging.info("computation_shape: %s", str(computation_shape))
            tf.logging.info("num_shards: %d", self.num_shards)
            tf.logging.info(
                "device_assignment.topology.device_coordinates: %s",
                str(self.device_assignment.topology.device_coordinates))
            tf.logging.info("device_assignment.core_assignment: %s",
                            str(self.device_assignment.core_assignment))
            eval_input_partition_dims = [{
                ssd_constants.BOXES: None,
                ssd_constants.CLASSES: None,
                ssd_constants.IMAGE: input_partition_dims,
                ssd_constants.RAW_SHAPE: None,
                ssd_constants.SOURCE_ID: None,
            }, None]
            if FLAGS.eval_batch_size * eval_steps > FLAGS.eval_samples:
                eval_input_partition_dims[0][ssd_constants.IS_PADDED] = None
            self.eval_input_dims_flattener = utils.InputDimsFlattener(
                eval_input_partition_dims)
        else:
            self.device_assignment = None
            self.eval_input_dims_flattener = None
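
# Hypothetical construction of the runner above with 2-way spatial
# partitioning. The class name TrainAndEvalLowLevelRunner is inferred from the
# log message; FLAGS (num_shards, num_shards_per_host, input_partition_dims,
# eval_batch_size, eval_samples, num_epochs, ...) must be defined and parsed
# elsewhere, and the partition dims below are illustrative only.
runner = TrainAndEvalLowLevelRunner(
    iterations=100,
    eval_steps=625,
    num_cores_per_shard=2,
    input_partition_dims=[1, 2, 1, 1])
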