def _tpu_init_fn():
  if tpu_name in _LOCAL_MASTERS:
    job = None
  else:
    # Explicitly place the tpu.initialize_system in the first worker to
    # avoid the "output node matches multiple devices" error.
    job = "worker/replica:0/task:0"
  return tpu.initialize_system(job=job)
def _tpu_init_fn():
  if tpu_name in _LOCAL_MASTERS:
    job = None
  else:
    # Explicitly place the tpu.initialize_system in the first worker to
    # avoid the "output node matches multiple devices" error.
    job = "{}/replica:0/task:0".format(cluster_resolver.get_job_name())
  return tpu.initialize_system(job=job)
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
      which provides information about the TPU cluster.

  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster.
  """
  if cluster_resolver is None:
    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    logging.warning("TPU system %s has already been initialized. "
                    "Reinitializing the TPU can cause previously created "
                    "variables on TPU to be lost.", tpu_name)

  logging.info("Initializing the TPU system.")

  if context.executing_eagerly():
    # This function looks as it is for the following non-intuitive reasons.
    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    # The easiest way to trigger a rewrite is to run the function with
    # TPUPartitionedCallOp.
    @function.defun
    def _tpu_init_fn():
      return tpu.initialize_system()

    # We can't call _tpu_init_fn normally (because it contains just a dummy op,
    # see above) but need to define it to get it added to eager context
    # and get its assigned name.
    # pylint: disable=protected-access
    graph_func = _tpu_init_fn._get_concrete_function_internal()
    func_name = compat.as_str(graph_func._inference_function.name)
    # pylint: enable=protected-access

    with ops.device(get_first_tpu_host_device(cluster_resolver)):
      output = tpu_functional_ops.TPUPartitionedCall(
          args=[], device_ordinal=0, Tout=[dtypes.string], f=func_name)
    serialized_topology = output[0].numpy()
  else:
    master = cluster_resolver.master()
    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        serialized_topology = sess.run(tpu.initialize_system())

  logging.info("Finished initializing TPU system.")
  tpu_topology = topology.Topology(serialized=serialized_topology)
  _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

  return tpu_topology
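# Usage sketch (not part of the snippet above): initializing the TPU system
# from a cluster resolver and inspecting the returned topology. The TPU name
# "my-tpu" is a hypothetical placeholder; substitute your own TPU, or pass ""
# for a local/colocated TPU worker.
resolver = TPUClusterResolver(tpu="my-tpu")  # assumed placeholder name
tpu_topology = initialize_tpu_system(resolver)
print("TPU mesh shape:", tpu_topology.mesh_shape)
print("Number of TPU tasks:", tpu_topology.num_tasks)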
def _tpu_init_fn():
  # In TF1, we usually close chips when compilation fails to clear the data
  # in infeed. In TF2, we don't need to do this because infeed is no longer
  # used, so users can recover from TPU compilation failures more smoothly.
  # The same applies to the cancellation of a TPU execution.
  return tpu.initialize_system(
      job=job,
      compilation_failure_closes_chips=False,
      tpu_cancellation_closes_chips=False)
def run(self, fetches, feed_dict=None, options=None, run_metadata=None):
  from tensorflow.python.tpu import tpu  # pylint: disable=g-import-not-at-top
  if self.topology is None:
    self.topology = super().run(tpu.initialize_system())
    assert self.topology is not None
  fetch_mapper = session._FetchMapper.for_fetch(fetches)
  new_fetches = []
  for fetch in fetch_mapper.unique_fetches():
    if isinstance(fetch, ops.Operation):
      fetch = tpu.rewrite(lambda fetch=fetch: fetch)
    new_fetches.append(fetch)
  rewritten_fetches = fetch_mapper.build_results(new_fetches)
  return super().run(rewritten_fetches, feed_dict, options, run_metadata)
def __init__(self, hparams, train_iterations, eval_steps, per_host_v1=False):
  tf.logging.info("TrainLowLevelRunner: constructor")

  self.feature_structure = {}
  self.eval_feature_structure = {}
  self.loss = None
  self.infeed_queue = []
  self.eval_infeed_queue = []
  self.enqueue_ops = []
  self.eval_enqueue_ops = []
  self.dataset_initializer = []
  self.eval_dataset_initializer = []
  self.is_local = ((hparams.master == "") and (hparams.tpu_name is None))
  self.per_host_v1 = per_host_v1
  self.iterations = train_iterations
  self.eval_steps = eval_steps
  self.outfeed_tensors = []
  self.outfeed_names = []
  self.dequeue_ops = []
  self.predictions = {}
  self.sess = None
  self.graph = tf.Graph()
  self.hparams = hparams
  self.num_hosts = hparams.num_shards // hparams.num_shards_per_host

  with self.graph.as_default():
    self.tpu_init = [tpu.initialize_system()]
    self.tpu_shutdown = tpu.shutdown_system()

  self.resolver = get_resolver(hparams)
  session_config = tf.ConfigProto(
      allow_soft_placement=True,
      isolate_session_state=True,
      operation_timeout_in_ms=600 * 60 * 1000,
      graph_options=tf.GraphOptions(
          rewrite_options=rewriter_config_pb2.RewriterConfig(
              disable_meta_optimizer=True)))

  if self.hparams.tpu_name is None:
    master = self.hparams.master
  else:
    cluster_spec = self.resolver.cluster_spec()
    if cluster_spec:
      session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
    master = self.resolver.get_master()

  self.sess = tf.Session(master, graph=self.graph, config=session_config)
  self.sess.run(self.tpu_init)
def run_inference(inputs,
                  pipeline_config_file,
                  ckpt_path,
                  input_type='encoded_image_string_tensor',
                  use_bfloat16=False,
                  repeat=1):
  """Runs inference on TPU.

  Args:
    inputs: Input image with the same type as `input_type`.
    pipeline_config_file: Pipeline config file name.
    ckpt_path: Training checkpoint path.
    input_type: One of
      'encoded_image_string_tensor': a 1d tensor with dtype=tf.string
      'image_tensor': a 4d tensor with dtype=tf.uint8
      'tf_example': a 1d tensor with dtype=tf.string
    use_bfloat16: If true, use tf.bfloat16 on TPU.
    repeat: Number of times to repeat running the provided input for profiling.

  Returns:
    A dict of resulting tensors.
  """
  pipeline_config, meta_arch = parse_pipeline_config(pipeline_config_file)

  shapes_info = model_map[meta_arch].get_prediction_tensor_shapes(
      pipeline_config)

  with tf.Graph().as_default(), tf.Session() as sess:
    placeholder_tensor, result_tensor_dict = model_map[meta_arch].build_graph(
        pipeline_config, shapes_info, input_type, use_bfloat16)

    saver = tf.train.Saver()
    init_op = tf.global_variables_initializer()

    sess.run(tpu.initialize_system())
    sess.run(init_op)
    if ckpt_path is not None:
      saver.restore(sess, ckpt_path)

    for _ in range(repeat):
      tensor_dict_out = sess.run(
          result_tensor_dict, feed_dict={placeholder_tensor: [inputs]})

    sess.run(tpu.shutdown_system())

    return tensor_dict_out
def _obtain_topology(master_address, cluster_def):
  """Obtains TPU fabric topology."""
  try:
    logging.info(
        'Initializing TPU system (master: %s) to fetch topology '
        'for model parallelism. This might take a while.', master_address)
    with ops.Graph().as_default():
      session_config = get_session_config_with_timeout(
          _INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS, cluster_def)
      with session_lib.Session(master_address, config=session_config) as sess:
        topology = sess.run(tpu.initialize_system())
        return topology
  except errors.DeadlineExceededError:
    raise ValueError(
        'Fail to initialize TPU system with master (%s). '
        'Please double check the TPU system is functional.' % (master_address))
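# Follow-up sketch (an assumption, not from the snippet above): the serialized
# topology returned by _obtain_topology can be deserialized and used to build
# a device assignment for model parallelism. master_address and cluster_def
# stand in for whatever the caller already holds; the computation_shape value
# below is an illustrative placeholder for a 2-core model-parallel layout.
from tensorflow.python.tpu import device_assignment as tpu_device_assignment
from tensorflow.python.tpu import topology as topology_lib

serialized = _obtain_topology(master_address, cluster_def)
tpu_topology = topology_lib.Topology(serialized=serialized)
device_assignment = tpu_device_assignment.device_assignment(
    topology=tpu_topology,
    computation_shape=[1, 1, 1, 2],  # hypothetical per-replica core shape
    num_replicas=1)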
def __init__(self, sparse_features_key, embedding, **kwargs):
  """Initializes the runner."""
  super(DLRMEmbeddingRunner, self).__init__(**kwargs, do_initialize=False)
  self.embedding = embedding
  self.embedding_config = embedding.config_proto
  self.features_key = sparse_features_key
  self.embed_vars_and_ops = None
  self.retrieve_ops = None
  self.enqueue_datas_list = {True: [], False: []}
  self.dummy_variables = None
  self.dummy_variables_init = None
  self.num_outfeeds = 1
  with self.graph.as_default():
    self.embed_vars_and_ops = self.embedding.create_variables_and_ops()
    self.dummy_variables, self.dummy_variables_init = (
        tpu_embedding_gradient.create_dummy_table_variables(self.embedding))
  self.device_topology = tf.Session(self.master, config=self.config).run(
      tpu.initialize_system(embedding_config=self.embedding_config))
def run_inference_from_saved_model(inputs,
                                   saved_model_dir,
                                   input_placeholder_name='placeholder_tensor',
                                   repeat=1):
  """Loads a SavedModel and runs inference on TPU.

  Args:
    inputs: Input image with the same type as `input_type`.
    saved_model_dir: The directory the SavedModel is exported to.
    input_placeholder_name: Input placeholder's name in the SavedModel
      signature.
    repeat: Number of times to repeat running the provided input for profiling.

  Returns:
    A dict of resulting tensors.
  """
  with tf.Graph().as_default(), tf.Session() as sess:
    meta_graph = loader.load(sess, [tag_constants.SERVING, tag_constants.TPU],
                             saved_model_dir)

    sess.run(tpu.initialize_system())

    key_prediction = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY

    tensor_name_input = (
        meta_graph.signature_def[key_prediction].inputs[
            input_placeholder_name].name)
    tensor_name_output = {
        k: v.name
        for k, v in meta_graph.signature_def[key_prediction].outputs.items()
    }

    for _ in range(repeat):
      tensor_dict_out = sess.run(
          tensor_name_output, feed_dict={tensor_name_input: [inputs]})

    sess.run(tpu.shutdown_system())

    return tensor_dict_out
def __init__(self,
             iterations,
             eval_steps,
             sleep_seconds=120,
             num_multiprocessing_workers=ssd_constants.WORKER_COUNT,
             num_cores_per_shard=1,
             input_partition_dims=None):
  tf.logging.info("TrainAndEvalLowLevelRunner: constructor")

  self.eval_steps = eval_steps
  self.feature_structure = {}
  self.eval_feature_structure = {}
  self.loss = None
  self.infeed_queue = []
  self.eval_infeed_queue = []
  self.enqueue_ops = []
  self.dequeue_ops = []
  self.predictions = {}
  self.eval_enqueue_ops = []
  self.train_eval_compile_op = None
  self.dataset_initializer = []
  self.eval_dataset_initializer = []
  self.iterations = iterations
  # TODO(wangtao): change FLAGS.num_shards_per_host to
  # FLAGS.num_cores_per_host after other low level API
  # support spatial partition. FLAGS.num_shards_per_host means number of TPU
  # cores for each host.
  self.replicas_per_worker = FLAGS.num_shards_per_host // num_cores_per_shard
  self.num_hosts = (
      FLAGS.num_shards * num_cores_per_shard // FLAGS.num_shards_per_host)
  self.num_shards = FLAGS.num_shards
  self.scaffold_fn = None
  self.sess = None
  self.input_sess = None
  self.graph = tf.Graph()
  self.input_graph = tf.Graph()
  self.eval_op = None
  self.infeed_thread = None
  self.eval_epochs = []
  self.success_epoch = 1000
  self.log_epochs = {}
  self.params = {}
  self.train_loop = None
  self.tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu_name or FLAGS.master,
      zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)
  # Disable grappler for better performance.
  self.session_config = tf.ConfigProto(
      allow_soft_placement=True,
      graph_options=tf.GraphOptions(
          rewrite_options=rewriter_config_pb2.RewriterConfig(
              disable_meta_optimizer=True)),
      isolate_session_state=True,
      operation_timeout_in_ms=600 * 60 * 1000)  # 10 hours
  cluster_spec = self.tpu_cluster_resolver.cluster_spec()
  if cluster_spec:
    self.session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
  self.tpu_init = tpu.initialize_system()
  self.tpu_shutdown = tpu.shutdown_system()
  self.master = self.tpu_cluster_resolver.get_master()
  self.init_sess = tf.Session(self.master, config=self.session_config)
  self.outfeed_tensors = []
  self.outfeed_names = []
  self.run_success = False
  self.log_run_success = False
  self.num_multiprocessing_workers = num_multiprocessing_workers

  # Figure out the steps and epochs to eval for MLPerf.
  self.eval_at_steps = np.cumsum(ssd_constants.EVAL_STEPS).tolist()
  self.eval_iterations = [steps // 20000 - 1 for steps in self.eval_at_steps]
  self.max_train_iterations = int(
      math.ceil(FLAGS.num_epochs * FLAGS.num_examples_per_epoch /
                (FLAGS.train_batch_size * self.iterations)))
  self.sleep_seconds = sleep_seconds

  tf.logging.info("eval_at_steps: %s", self.eval_at_steps)
  tf.logging.info("eval_iterations: %s", self.eval_iterations)

  # Init for spatial partitioning.
  self.device_topology = self.init_sess.run(self.tpu_init)
  self.input_partition_dims = [input_partition_dims, None]
  self.use_spatial_partition = (
      input_partition_dims is not None and
      int(np.prod(FLAGS.input_partition_dims)) > 1)
  self.use_spatial_partition = input_partition_dims is not None
  self.num_cores_per_shard = num_cores_per_shard
  if self.use_spatial_partition:
    computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
        self.num_cores_per_shard]
    self.device_assignment = tpu_device_assignment.device_assignment(
        topology=self.device_topology,
        computation_shape=computation_shape,
        num_replicas=self.num_shards)
    tf.logging.info("num_cores_per_shard: %d", self.num_cores_per_shard)
    tf.logging.info("num_hosts: %d", self.num_hosts)
    tf.logging.info("replicas_per_worker: %d", self.replicas_per_worker)
    tf.logging.info("computation_shape: %s", str(computation_shape))
    tf.logging.info("num_shards: %d", self.num_shards)
    tf.logging.info("device_assignment.topology.device_coordinates: %s",
                    str(self.device_assignment.topology.device_coordinates))
    tf.logging.info("device_assignment.core_assignment: %s",
                    str(self.device_assignment.core_assignment))
    eval_input_partition_dims = [{
        ssd_constants.BOXES: None,
        ssd_constants.CLASSES: None,
        ssd_constants.IMAGE: input_partition_dims,
        ssd_constants.RAW_SHAPE: None,
        ssd_constants.SOURCE_ID: None,
    }, None]
    if FLAGS.eval_batch_size * eval_steps > FLAGS.eval_samples:
      eval_input_partition_dims[0][ssd_constants.IS_PADDED] = None
    self.eval_input_dims_flattener = utils.InputDimsFlattener(
        eval_input_partition_dims)
  else:
    self.device_assignment = None
    self.eval_input_dims_flattener = None
def _tpu_init_fn():
  return tpu.initialize_system()
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
      which provides information about the TPU cluster.

  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices found for eager execution.
  """
  if cluster_resolver is None:
    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    logging.warning("TPU system %s has already been initialized. "
                    "Reinitializing the TPU can cause previously created "
                    "variables on TPU to be lost.", tpu_name)

  logging.info("Initializing the TPU system.")

  if context.executing_eagerly():
    # This function looks as it is for the following non-intuitive reasons.
    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    @function.defun
    def _tpu_init_fn():
      return tpu.initialize_system()

    tpu_devices = sorted(
        [x for x in context.list_devices() if "device:TPU:" in x])

    if not tpu_devices:
      raise RuntimeError("Could not find any TPU devices")

    # Replace the remote TPU device with the remote TPU_SYSTEM system device.
    # As in the remote TPU device case, we will try to compile it instead of
    # running through optimization passes and TF Executor, but TPU_SYSTEM
    # should work.
    tpu_system_device = tpu_devices[0].replace("TPU", "TPU_SYSTEM")

    with ops.device(tpu_system_device):
      output = _tpu_init_fn()
    serialized_topology = output.numpy()
  else:
    master = cluster_resolver.master()
    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        serialized_topology = sess.run(tpu.initialize_system())

  logging.info("Finished initializing TPU system.")
  tpu_topology = topology.Topology(serialized=serialized_topology)
  _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

  return tpu_topology
def run_saved_model_with_feed_dict(saved_model_dir,
                                   tag_set,
                                   signature_def_key,
                                   input_tensor_key_feed_dict,
                                   outdir,
                                   overwrite_flag,
                                   worker=None,
                                   init_tpu=False,
                                   tf_debug=False):
  """Runs SavedModel and fetches all outputs.

  Runs the input dictionary through the MetaGraphDef within a SavedModel
  specified by the given tag_set and SignatureDef. Also saves the outputs to
  file if outdir is not None.

  Args:
    saved_model_dir: Directory containing the SavedModel to execute.
    tag_set: Group of tag(s) of the MetaGraphDef with the SignatureDef map, in
      string format, separated by ','. For a tag-set containing multiple tags,
      all tags must be passed in.
    signature_def_key: A SignatureDef key string.
    input_tensor_key_feed_dict: A dictionary that maps input keys to numpy
      ndarrays.
    outdir: A directory to save the outputs to. If the directory doesn't exist,
      it will be created.
    overwrite_flag: A boolean flag to allow overwriting the output file if a
      file with the same name exists.
    worker: If provided, the session will be run on the worker. Valid worker
      specification is a bns or gRPC path.
    init_tpu: If true, the TPU system will be initialized after the session is
      created.
    tf_debug: A boolean flag to use TensorFlow Debugger (TFDBG) to observe the
      intermediate Tensor values and runtime GraphDefs while running the
      SavedModel.

  Raises:
    ValueError: When any of the input tensor keys is not valid.
    RuntimeError: An error when output file already exists and overwrite is
      not enabled.
  """
  # Get a list of output tensor names.
  meta_graph_def = saved_model_utils.get_meta_graph_def(saved_model_dir,
                                                        tag_set)

  # Re-create feed_dict based on input tensor name instead of key as
  # session.run uses tensor name.
  inputs_tensor_info = _get_inputs_tensor_info_from_meta_graph_def(
      meta_graph_def, signature_def_key)

  # Check if input tensor keys are valid.
  for input_key_name in input_tensor_key_feed_dict.keys():
    if input_key_name not in inputs_tensor_info:
      raise ValueError(
          '"%s" is not a valid input key. Please choose from %s, or use '
          '--show option.' %
          (input_key_name, '"' + '", "'.join(inputs_tensor_info.keys()) + '"'))

  inputs_feed_dict = {
      inputs_tensor_info[key].name: tensor
      for key, tensor in input_tensor_key_feed_dict.items()
  }

  # Get outputs
  outputs_tensor_info = _get_outputs_tensor_info_from_meta_graph_def(
      meta_graph_def, signature_def_key)
  # Sort to preserve order because we need to go from value to key later.
  output_tensor_keys_sorted = sorted(outputs_tensor_info.keys())
  output_tensor_names_sorted = [
      outputs_tensor_info[tensor_key].name
      for tensor_key in output_tensor_keys_sorted
  ]

  with session.Session(worker, graph=ops_lib.Graph()) as sess:
    if init_tpu:
      print('Initializing TPU System ...')
      # This is needed for freshly started worker, or if the job
      # restarts after a preemption.
      sess.run(tpu.initialize_system())

    loader.load(sess, tag_set.split(','), saved_model_dir)

    if tf_debug:
      sess = local_cli_wrapper.LocalCLIDebugWrapperSession(sess)

    outputs = sess.run(output_tensor_names_sorted, feed_dict=inputs_feed_dict)

    for i, output in enumerate(outputs):
      output_tensor_key = output_tensor_keys_sorted[i]
      print('Result for output key %s:\n%s' % (output_tensor_key, output))

      # Only save if outdir is specified.
      if outdir:
        # Create directory if outdir does not exist.
        if not os.path.isdir(outdir):
          os.makedirs(outdir)
        output_full_path = os.path.join(outdir, output_tensor_key + '.npy')

        # If overwrite not enabled and file already exists, error out.
        if not overwrite_flag and os.path.exists(output_full_path):
          raise RuntimeError(
              'Output file %s already exists. Add \"--overwrite\" to overwrite'
              ' the existing output files.' % output_full_path)

        np.save(output_full_path, output)
        print('Output %s is saved to %s' % (output_tensor_key,
                                            output_full_path))
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
      which provides information about the TPU cluster.

  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster. If called
    inside tf.function, it returns the serialized topology object instead.

  Raises:
    RuntimeError: If running inside a tf.function.
    NotFoundError: If no TPU devices found in eager mode.
  """
  job = None
  if cluster_resolver is None:
    # If no cluster resolver is specified, and running eagerly, execute the
    # init ops in the current device scope.
    if context.executing_eagerly():
      curr_device = device.DeviceSpec.from_string(
          context.context().device_name)
      if curr_device.job is not None:
        job = "{}/replica:0/task:0".format(curr_device.job)

    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    logging.warning(
        "TPU system %s has already been initialized. "
        "Reinitializing the TPU can cause previously created "
        "variables on TPU to be lost.", tpu_name)

  logging.info("Initializing the TPU system: %s", tpu_name)

  # This function looks as it is for the following non-intuitive reasons.
  # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
  # DistributedTPURewritePass. This pass actually adds real ops that
  # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
  # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
  if tpu_name not in _LOCAL_MASTERS:
    # Explicitly place the tpu.initialize_system in the first worker to
    # avoid the "output node matches multiple devices" error.
    job = "{}/replica:0/task:0".format(cluster_resolver.get_job_name())

  if context.executing_eagerly():

    @function.defun
    def _tpu_init_fn():
      # In TF1, we usually close chips when compilation fails to clear the
      # data in infeed. In TF2, we don't need to do this because infeed is no
      # longer used, so users can recover from TPU compilation failures more
      # smoothly.
      return tpu.initialize_system(
          job=job, compilation_failure_closes_chips=False)

    # The TPU_SYSTEM device must match the device used in tpu.initialize_system
    # exactly, otherwise you can get errors if there are multiple TPU_SYSTEM
    # devices available.
    try:
      with ops.device(tpu._tpu_system_device_name(job)):  # pylint: disable=protected-access
        output = _tpu_init_fn()
      context.async_wait()
    except errors.InvalidArgumentError as e:
      raise errors.NotFoundError(
          None, None,
          "TPUs not found in the cluster. Failed in initialization: " + str(e))

    # Clear out the eager context caches since the memory is invalid now.
    logging.info("Clearing out eager caches")
    context.context()._clear_caches()  # pylint: disable=protected-access

    serialized_topology = output.numpy()
  elif not ops.executing_eagerly_outside_functions():
    master = cluster_resolver.master()
    cluster_spec = cluster_resolver.cluster_spec()

    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    if cluster_spec:
      session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        serialized_topology = sess.run(tpu.initialize_system())
  else:
    with ops.device(tpu._tpu_system_device_name(job)):  # pylint: disable=protected-access
      serialized_topology = tpu.initialize_system(
          job=job, compilation_failure_closes_chips=False)
      # If initialize_tpu_system is called inside tf.function, we only return
      # the serialized topology object as the tf.tpu.Topology object has to be
      # constructed in eager mode.
      return serialized_topology

  logging.info("Finished initializing TPU system.")
  tpu_topology = topology.Topology(serialized=serialized_topology)
  _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

  return tpu_topology
def __init__(self,
             iterations_per_loop,
             train_steps,
             eval_steps,
             num_replicas,
             eval_dataset_repeats=True,
             do_initialize=True):
  self.feature_structure = {}
  self.infeed_op = {}
  self.num_replicas = num_replicas
  self.eval_dataset_repeats = eval_dataset_repeats
  # Set number of input graphs to number of hosts up to a maximum of 32.
  self.num_input_graphs = min(32,
                              self.num_replicas // FLAGS.replicas_per_host)
  # Following data has separated copies for training and eval, thus
  # represented as a map from is_train(boolean) to actual data.
  self.dataset_initializer = {True: [], False: []}
  self.input_graph = {True: [], False: []}
  self.input_sess = {True: [], False: []}
  self.enqueue_ops = {True: [], False: []}
  for _ in range(self.num_input_graphs):
    self.input_graph[True].append(tf.Graph())
    self.input_graph[False].append(tf.Graph())
    self.dataset_initializer[True].append([])
    self.dataset_initializer[False].append([])
    self.enqueue_ops[True].append([])
    self.enqueue_ops[False].append([])
    self.input_sess[True].append([])
    self.input_sess[False].append([])
  # dequeue_ops is only for eval.
  self.dequeue_ops = []
  self.iterations_per_loop = iterations_per_loop
  self.sess = None
  self.output_sess = None
  self.train_eval_thread = None
  self.graph = tf.Graph()
  if iterations_per_loop != 0 and train_steps % iterations_per_loop != 0:
    train_steps = iterations_per_loop * int(
        math.ceil(train_steps / iterations_per_loop))
  self.train_steps = train_steps
  if iterations_per_loop == 0:
    self.max_train_iterations = 1
  else:
    self.max_train_iterations = train_steps // iterations_per_loop
  self.eval_steps = int(eval_steps)
  self.train_batch_size = 0
  self.eval_batch_size = 0
  self.eval_has_labels = 0
  self.model_fn = None
  self.num_outfeeds = self.eval_steps
  self.config = tf.ConfigProto(
      operation_timeout_in_ms=600 * 60 * 1000,
      allow_soft_placement=True,
      graph_options=tf.GraphOptions(
          rewrite_options=rewriter_config_pb2.RewriterConfig(
              disable_meta_optimizer=True)),
      isolate_session_state=True)
  if FLAGS.enable_mlir_bridge:
    self.config.experimental.enable_mlir_bridge = True

  tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
      FLAGS.master,
      zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project,
      job_name="tpu_worker")
  self.master = tpu_cluster_resolver.get_master()
  self.job_name = tpu_cluster_resolver.get_job_name() or "tpu_worker"
  self.embedding_config = None
  self.device_topology = None
  if do_initialize:
    self.device_topology = tf.Session(
        self.master, config=self.config).run(tpu.initialize_system())
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
      which provides information about the TPU cluster.

  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices found for eager execution.
  """
  if cluster_resolver is None:
    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    logging.warning("TPU system %s has already been initialized. "
                    "Reinitializing the TPU can cause previously created "
                    "variables on TPU to be lost.", tpu_name)

  logging.info("Initializing the TPU system.")

  if context.executing_eagerly():
    # This function looks as it is for the following non-intuitive reasons.
    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    # The easiest way to trigger a rewrite is to run the function with
    # TPUPartitionedCallOp.
    @function.defun
    def _tpu_init_fn():
      return tpu.initialize_system()

    # We can't call _tpu_init_fn normally (because it contains just a dummy op,
    # see above) but need to define it to get it added to eager context
    # and get its assigned name.
    # pylint: disable=protected-access
    graph_func = _tpu_init_fn._get_concrete_function_internal()
    func_name = compat.as_str(graph_func._inference_function.name)
    # pylint: enable=protected-access

    tpu_devices = sorted(
        [x for x in context.list_devices() if "device:TPU:" in x])

    if not tpu_devices:
      raise RuntimeError("Could not find any TPU devices")

    with ops.device(device_util.get_host_for_device(tpu_devices[0])):
      output = tpu_functional_ops.TPUPartitionedCall(
          args=[], device_ordinal=0, Tout=[dtypes.string], f=func_name)
    serialized_topology = output[0].numpy()
  else:
    master = cluster_resolver.master()
    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        serialized_topology = sess.run(tpu.initialize_system())

  logging.info("Finished initializing TPU system.")
  tpu_topology = topology.Topology(serialized=serialized_topology)
  _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

  return tpu_topology
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
      which provides information about the TPU cluster.

  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices found for eager execution.
  """
  if cluster_resolver is None:
    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    logging.warning("TPU system %s has already been initialized. "
                    "Reinitializing the TPU can cause previously created "
                    "variables on TPU to be lost.", tpu_name)

  logging.info("Initializing the TPU system: %s", tpu_name)

  if context.executing_eagerly():
    # This function looks as it is for the following non-intuitive reasons.
    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    job = None
    if tpu_name not in _LOCAL_MASTERS:
      # Explicitly place the tpu.initialize_system in the first worker to
      # avoid the "output node matches multiple devices" error.
      job = "{}/replica:0/task:0".format(cluster_resolver.get_job_name())

    @function.defun
    def _tpu_init_fn():
      return tpu.initialize_system(job=job)

    # The TPU_SYSTEM device must match the device used in tpu.initialize_system
    # exactly, otherwise you can get errors if there are multiple TPU_SYSTEM
    # devices available.
    with ops.device(tpu._tpu_system_device_name(job)):  # pylint: disable=protected-access
      output = _tpu_init_fn()

    # Clear out the eager context caches since the memory is invalid now.
    logging.info("Clearing out eager caches")
    context.context()._clear_caches()  # pylint: disable=protected-access

    serialized_topology = output.numpy()
  else:
    master = cluster_resolver.master()
    cluster_spec = cluster_resolver.cluster_spec()

    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    if cluster_spec:
      session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        serialized_topology = sess.run(tpu.initialize_system())

  logging.info("Finished initializing TPU system.")
  tpu_topology = topology.Topology(serialized=serialized_topology)
  _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

  return tpu_topology
def _tpu_init_fn():
  return tpu.initialize_system(job=job)
def __init__(self, iterations, eval_steps):
  tf.logging.info("LowLevelRunner: constructor.")

  self.fake_feature_structure = {}
  self.feature_structure = {}
  self.fake_eval_feature_structure = {}
  self.eval_feature_structure = {}
  self.infeed_queue = []
  self.eval_infeed_queue = []
  self.fake_enqueue_ops = []
  self.enqueue_ops = []
  self.fake_eval_enqueue_ops = []
  self.eval_enqueue_ops = []
  self.fake_dataset_initializer = []
  self.dataset_initializer = []
  self.fake_eval_dataset_initializer = []
  self.eval_dataset_initializer = []
  self.outfeed_tensors = []
  self.outfeed_names = []
  self.dequeue_ops = []
  self.train_compile_op = None
  self.eval_compile_op = None
  self.loss = None
  self.eval_op = None
  self.iterations = iterations
  self.eval_steps = eval_steps
  self.num_hosts = FLAGS.tpu_num_shards // FLAGS.tpu_num_shards_per_host
  self.scaffold_fn = None
  self.tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.master or FLAGS.cloud_tpu_name)
  # Disable grappler for better performance.
  self.session_config = tf.ConfigProto(
      allow_soft_placement=True,
      graph_options=tf.GraphOptions(
          rewrite_options=rewriter_config_pb2.RewriterConfig(
              disable_meta_optimizer=True)),
      isolate_session_state=True,
      operation_timeout_in_ms=600 * 60 * 1000)  # 10 hours
  cluster_spec = self.tpu_cluster_resolver.cluster_spec()
  if cluster_spec:
    self.session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
  self.input_graph = tf.Graph()
  self.eval_input_graph = tf.Graph()
  # Train and eval share the same session and graph so that the weights
  # can be shared for in memory eval.
  self.graph = tf.Graph()
  self.output_graph = tf.Graph()
  with self.graph.as_default():
    if FLAGS.random_seed:
      tf.random.set_random_seed(FLAGS.random_seed)
    self.num_epochs_tensor = tf.placeholder(
        tf.int32, shape=(), name="epochs")
    self.train_steps_tensor = tf.placeholder(
        tf.int32, shape=(), name="steps_per_train_loop")
    self.eval_steps_tensor = tf.placeholder(
        tf.int32, shape=(), name="steps_per_eval_loop")
    self.tpu_init = [tpu.initialize_system()]
    self.tpu_shutdown = tpu.shutdown_system()
  self.master = self.tpu_cluster_resolver.get_master()
  self.input_sess = tf.Session(
      self.master, graph=self.input_graph, config=self.session_config)
  self.eval_input_sess = tf.Session(
      self.master, graph=self.eval_input_graph, config=self.session_config)
  self.sess = tf.Session(
      self.master, graph=self.graph, config=self.session_config)
  self.output_sess = tf.Session(
      self.master, graph=self.output_graph, config=self.session_config)
  self.sess.run(self.tpu_init)
  self.infeed_thead = None
  self.train_eval_thead = None