def add_meta_graph(self, tags, signature_def_map=None, assets_collection=None):
    """Exports the current graph as an additional meta graph of the SavedModel.

    A Saver is created in the current scope and used to export the meta graph
    def. Variables are not written here: `add_meta_graph_and_variables()` must
    already have been invoked on this builder.

    Args:
      tags: The set of tags to annotate the meta graph def with.
      signature_def_map: The map of signature defs to be added to the meta
        graph def.
      assets_collection: Assets collection to be saved with the SavedModel.
        It should be a subset of the assets saved as part of the first meta
        graph in the SavedModel.

    Raises:
      AssertionError: If the variables for the SavedModel have not been saved
        yet.
    """
    variables_saved = self._has_saved_variables
    if not variables_saved:
        raise AssertionError(
            "Variables and assets have not been saved yet. "
            "Please invoke `add_meta_graph_and_variables()` first.")

    # Handle the asset files (if any) before exporting the graph.
    self._save_assets(assets_collection)

    # Build a Saver over every variable and export the meta graph through it.
    exported_meta_graph = tf_saver.Saver(
        variables.all_variables()).export_meta_graph()

    # Tag the exported meta graph def and register it with the SavedModel.
    self._tag_and_add_meta_graph(exported_meta_graph, tags, signature_def_map)
def _get_saver():
    """Returns the graph's saver, creating and registering one if needed.

    The SAVERS collection is consulted first. Only when it yields nothing and
    the graph has variables is a new `Saver` built and added to the collection.

    Returns:
      The existing or newly created saver, or None when there is no saver in
      the collection and there are no variables to save.
    """
    existing = _get_first_op_from_collection(ops.GraphKeys.SAVERS)
    if existing is not None:
        return existing

    if not variables.all_variables():
        # Nothing registered and nothing to save.
        return None

    created = tf_saver.Saver()
    ops.add_to_collection(ops.GraphKeys.SAVERS, created)
    return created
def add_meta_graph_and_variables(self, sess, tags, signature_def_map=None,
                                 assets_collection=None, legacy_init_op=None):
    """Saves the variables and adds the current meta graph to the SavedModel.

    A Saver is created to save the variables from the provided session, and
    the corresponding meta graph def is exported. The variables to be saved
    are assumed to be initialized already.

    For a given `SavedModelBuilder` this must be called exactly once, for the
    first meta graph to save; later meta graphs go through `add_meta_graph()`.

    Args:
      sess: The TensorFlow session from which to save the meta graph and
        variables.
      tags: The set of tags with which to save the meta graph.
      signature_def_map: The map of signature defs to add to the meta graph
        def.
      assets_collection: Assets collection to be saved with the SavedModel.
      legacy_init_op: Op or group of ops to execute after the restore op upon
        a load.

    Raises:
      AssertionError: If variables were already saved through this builder.
    """
    if self._has_saved_variables:
        raise AssertionError("Variables and assets have already been saved. "
                             "Please invoke `add_meta_graph()` instead.")

    # Assets are saved and written to disk first.
    self._save_and_write_assets(assets_collection)

    # Locate the variables sub-directory and create it when it is missing.
    export_root = compat.as_text(self._export_dir)
    variables_subdir = compat.as_text(constants.VARIABLES_DIRECTORY)
    checkpoint_dir = os.path.join(export_root, variables_subdir)
    directory_missing = not file_io.file_exists(checkpoint_dir)
    if directory_missing:
        file_io.recursive_create_dir(checkpoint_dir)

    checkpoint_prefix = os.path.join(
        compat.as_text(checkpoint_dir),
        compat.as_text(constants.VARIABLES_FILENAME))

    # Register the legacy init op with the SavedModel.
    self._maybe_add_legacy_init_op(legacy_init_op)

    # Write the variables (sharded, V2 format), then export the meta graph.
    variable_saver = tf_saver.Saver(
        variables.all_variables(),
        sharded=True,
        write_version=saver_pb2.SaverDef.V2)
    variable_saver.save(sess, checkpoint_prefix, write_meta_graph=False)
    exported_meta_graph = variable_saver.export_meta_graph()

    self._tag_and_add_meta_graph(exported_meta_graph, tags, signature_def_map)

    # From now on any further attempt to save variables must fail.
    self._has_saved_variables = True
def add_meta_graph_and_variables(self, sess, tags, signature_def_map=None,
                                 assets_collection=None, legacy_init_op=None):
    """Adds the first meta graph to the SavedModel and writes its variables.

    The variables of `sess` are saved with a freshly created Saver and the
    matching meta graph def is exported. Variables are expected to be
    initialized before this is called.

    This API must be called exactly once per `SavedModelBuilder`, for the
    first meta graph; use `add_meta_graph()` for subsequent meta graph defs.

    Args:
      sess: The TensorFlow session from which to save the meta graph and
        variables.
      tags: The set of tags with which to save the meta graph.
      signature_def_map: The map of signature defs to add to the meta graph
        def.
      assets_collection: Assets collection to be saved with the SavedModel.
      legacy_init_op: Op or group of ops to execute after the restore op upon
        a load.

    Raises:
      AssertionError: If this builder has already saved variables.
    """
    already_saved = self._has_saved_variables
    if already_saved:
        raise AssertionError(
            "Variables and assets have already been saved. "
            "Please invoke `add_meta_graph()` instead.")

    # Save asset files and write them to disk, if any.
    self._save_and_write_assets(assets_collection)

    # Make sure the variables sub-directory exists under the export dir.
    target_dir = os.path.join(
        compat.as_text(self._export_dir),
        compat.as_text(constants.VARIABLES_DIRECTORY))
    if not file_io.file_exists(target_dir):
        file_io.recursive_create_dir(target_dir)
    target_path = os.path.join(
        compat.as_text(target_dir),
        compat.as_text(constants.VARIABLES_FILENAME))

    # Add the legacy init op to the SavedModel.
    self._maybe_add_legacy_init_op(legacy_init_op)

    # Save the variables; the meta graph is exported separately below.
    saver_options = {
        "sharded": True,
        "write_version": saver_pb2.SaverDef.V2,
    }
    model_saver = tf_saver.Saver(variables.all_variables(), **saver_options)
    model_saver.save(sess, target_path, write_meta_graph=False)

    # Tag the exported meta graph def and add it to the SavedModel.
    self._tag_and_add_meta_graph(
        model_saver.export_meta_graph(), tags, signature_def_map)

    # Mark the builder so later attempts to save variables are rejected.
    self._has_saved_variables = True
def get_variable_names(self):
    """Lists the names of all variables in this model's graph.

    Returns:
      A list with the `name` of every variable returned by
      `variables.all_variables()` while `self._graph` is the default graph.
    """
    with self._graph.as_default():
        model_variables = variables.all_variables()
        names = [var.name for var in model_variables]
        return names
def _get_saver():
    """Returns a saver from the SAVERS collection, creating one if needed.

    A non-empty lookup result is indexed with `[0]`; an empty or missing
    result counts as "no saver". In that case a new `Saver` is created and
    added to the SAVERS collection, but only if the graph has variables.

    Returns:
      The saver found or created, or None when there is neither a registered
      saver nor any variable.
    """
    collected = _get_first_op_from_collection(ops.GraphKeys.SAVERS)
    saver = collected[0] if collected else None
    if saver is not None:
        return saver

    if variables.all_variables():
        saver = tf_saver.Saver()
        ops.add_to_collection(ops.GraphKeys.SAVERS, saver)
    return saver
def _get_saver():
    """Lazily initializes and returns the saver.

    Looks in the SAVERS collection first (a non-empty result is indexed with
    `[0]`). When nothing is found and the graph has variables, a `Saver` using
    the V1 write version is created and added to the collection.

    Returns:
      The saver found or created, or None if there is nothing to save.
    """
    found = _get_first_op_from_collection(ops.GraphKeys.SAVERS)
    if not found:
        # Covers both a None result and an empty one.
        saver = None
    else:
        saver = found[0]

    must_create = saver is None and variables.all_variables()
    if must_create:
        saver = tf_saver.Saver(write_version=saver_pb2.SaverDef.V1)
        ops.add_to_collection(ops.GraphKeys.SAVERS, saver)
    return saver
def run(self, num_batches=None, graph=None, session=None, start_queues=True,
        initialize_variables=True, **kwargs):
    """Builds and runs the columns of the `DataFrame` and yields batches.

    This is a generator that yields a dictionary mapping column names to
    evaluated columns.

    Queue runners started here are stopped and joined, and a session created
    here is closed, when the generator finishes for any reason (exhaustion,
    an error from `session.run`, or the consumer closing the generator).
    A session passed in by the caller is never closed.

    Args:
      num_batches: the maximum number of batches to produce. If none
        specified, the returned value will iterate through infinite batches.
      graph: the `Graph` in which the `DataFrame` should be built. Defaults to
        the default graph.
      session: the `Session` in which to run the columns of the `DataFrame`.
        If None, a new session is created.
      start_queues: if true, queues will be started before running and halted
        after producing `num_batches` batches.
      initialize_variables: if true, variables will be initialized.
      **kwargs: Additional keyword arguments e.g. `num_epochs`; they are
        forwarded to `self.build()`.

    Yields:
      A dictionary, mapping column names to the values resulting from running
      each column for a single batch.
    """
    if graph is None:
        graph = ops.get_default_graph()
    with graph.as_default():
        owns_session = session is None
        if owns_session:
            session = sess.Session()
        coord = None
        threads = None
        try:
            self_built = self.build(**kwargs)
            keys = list(self_built.keys())
            cols = list(self_built.values())
            if initialize_variables:
                if variables.local_variables():
                    session.run(variables.initialize_local_variables())
                if variables.all_variables():
                    session.run(variables.initialize_all_variables())
            if start_queues:
                coord = coordinator.Coordinator()
                threads = qr.start_queue_runners(sess=session, coord=coord)
            i = 0
            while num_batches is None or i < num_batches:
                i += 1
                try:
                    values = session.run(cols)
                except errors.OutOfRangeError:
                    # The input pipeline is exhausted; stop iterating.
                    break
                yield collections.OrderedDict(zip(keys, values))
        finally:
            # Runs on normal exhaustion, on errors, and on generator close,
            # so queue threads are not leaked.
            try:
                if coord is not None:
                    coord.request_stop()
                    if threads is not None:
                        coord.join(threads)
            finally:
                if owns_session:
                    session.close()
def _init_saver(self, saver=USE_DEFAULT):
    """Initializes `self._saver`.

    Args:
      saver: A `Saver` object, stored as given. If it is
        `Supervisor.USE_DEFAULT`, the SAVERS collection is consulted instead;
        when that yields nothing and the graph has variables, a new saver is
        created and added to the collection.
    """
    if saver is not Supervisor.USE_DEFAULT:
        # An explicit value (including None) is stored untouched.
        self._saver = saver
        return

    default_saver = self._get_first_op_from_collection(ops.GraphKeys.SAVERS)
    if default_saver is None and variables.all_variables():
        default_saver = saver_mod.Saver()
        ops.add_to_collection(ops.GraphKeys.SAVERS, default_saver)
    self._saver = default_saver
def add_meta_graph(self, tags, signature_def_map=None, assets_collection=None,
                   legacy_init_op=None, clear_devices=False):
    """Adds the current meta graph to the SavedModel.

    A Saver is created in the current scope and used to export the meta graph
    def. `add_meta_graph_and_variables()` must have been invoked before this
    API is used.

    Args:
      tags: The set of tags to annotate the meta graph def with.
      signature_def_map: The map of signature defs to be added to the meta
        graph def.
      assets_collection: Assets collection to be saved with the SavedModel.
        It should be a subset of the assets saved as part of the first meta
        graph in the SavedModel.
      legacy_init_op: Op or group of ops to execute after the restore op upon
        a load.
      clear_devices: Set to true if the device info on the default graph
        should be cleared.

    Raises:
      AssertionError: If the variables for the SavedModel have not been saved
        yet.
    """
    variables_saved = self._has_saved_variables
    if not variables_saved:
        raise AssertionError(
            "Variables and assets have not been saved yet. "
            "Please invoke `add_meta_graph_and_variables()` first.")

    # Preparation steps, in the order the builder expects them.
    self._maybe_clear_devices(clear_devices)
    self._save_and_write_assets(assets_collection)
    self._maybe_add_legacy_init_op(legacy_init_op)

    # Export the meta graph through a sharded, V2-format Saver.
    exported_meta_graph = tf_saver.Saver(
        variables.all_variables(),
        sharded=True,
        write_version=saver_pb2.SaverDef.V2).export_meta_graph()

    # Tag the meta graph def and add it to the SavedModel.
    self._tag_and_add_meta_graph(exported_meta_graph, tags, signature_def_map)
def add_meta_graph(self, tags, signature_def_map=None, assets_collection=None,
                   legacy_init_op=None, clear_devices=False):
    """Exports the current graph as a further meta graph of the SavedModel.

    Requires `add_meta_graph_and_variables()` to have been invoked earlier.
    Only the meta graph def is exported, using a Saver created in the current
    scope.

    Args:
      tags: The set of tags to annotate the meta graph def with.
      signature_def_map: The map of signature defs to be added to the meta
        graph def.
      assets_collection: Assets collection to be saved with the SavedModel.
        It should be a subset of the assets saved as part of the first meta
        graph in the SavedModel.
      legacy_init_op: Op or group of ops to execute after the restore op upon
        a load.
      clear_devices: Set to true if the device info on the default graph
        should be cleared.

    Raises:
      AssertionError: If the variables for the SavedModel have not been saved
        yet.
    """
    if not self._has_saved_variables:
        raise AssertionError(
            "Variables and assets have not been saved yet. "
            "Please invoke `add_meta_graph_and_variables()` first.")

    self._maybe_clear_devices(clear_devices)

    # Save asset files and write them to disk, if any.
    self._save_and_write_assets(assets_collection)

    # Add the legacy init op to the SavedModel.
    self._maybe_add_legacy_init_op(legacy_init_op)

    saver_options = {
        "sharded": True,
        "write_version": saver_pb2.SaverDef.V2,
    }
    graph_saver = tf_saver.Saver(variables.all_variables(), **saver_options)
    graph_def_to_add = graph_saver.export_meta_graph()

    # Tag the meta graph def and add it to the SavedModel.
    self._tag_and_add_meta_graph(graph_def_to_add, tags, signature_def_map)
def variables_to_restore(self, moving_avg_variables=None):
    """Returns a map of restore names to variables.

    Variables in `moving_avg_variables` are keyed by `self.average_name(v)`;
    every other variable is keyed by its op name, unless that name is already
    taken by an averaged variable.

    Args:
      moving_avg_variables: Variables that should be restored under their
        moving-average name. If None, defaults to
        `variables.trainable_variables()` followed by
        `variables.moving_average_variables()`.

    Returns:
      A dict mapping restore names to variables.
    """
    if moving_avg_variables is None:
        moving_avg_variables = variables.trainable_variables()
        moving_avg_variables.extend(variables.moving_average_variables())

    # Work on a set so duplicates are dropped.
    averaged = set(moving_avg_variables)

    name_map = {}
    for averaged_var in averaged:
        name_map[self.average_name(averaged_var)] = averaged_var

    # Variables without a moving average keep their own op name.
    not_averaged = set(variables.all_variables()) - averaged
    for plain_var in not_averaged:
        name_map.setdefault(plain_var.op.name, plain_var)
    return name_map
def variables_to_restore(self, moving_avg_variables=None):
    """Returns a map of names to `Variables` to restore.

    A variable that has a moving average is restored under the moving average
    name; any other variable is restored under its own name.

    For example,

    ```python
      variables_to_restore = ema.variables_to_restore()
      saver = tf.train.Saver(variables_to_restore)
    ```

    Below is an example of such mapping:

    ```
      conv/batchnorm/gamma/ExponentialMovingAverage: conv/batchnorm/gamma,
      conv_4/conv2d_params/ExponentialMovingAverage: conv_4/conv2d_params,
      global_step: global_step
    ```

    Args:
      moving_avg_variables: a list of variables that must be restored under
        the moving average name. If None, it defaults to
        variables.trainable_variables() + variables.moving_average_variables().

    Returns:
      A map from restore names to variables. The restore name is the moving
      average version of the variable name if it exists, or the original
      variable name.
    """
    if moving_avg_variables is None:
        # Trainable variables plus everything explicitly added to the
        # moving_average_variables collection.
        defaults = variables.trainable_variables()
        defaults.extend(variables.moving_average_variables())
        moving_avg_variables = defaults

    # De-duplicate before building the map.
    with_average = set(moving_avg_variables)
    name_map = {self.average_name(var): var for var in with_average}

    # Everything else is restored under its plain op name, without
    # overriding an entry created above.
    for var in set(variables.all_variables()) - with_average:
        plain_name = var.op.name
        if plain_name in name_map:
            continue
        name_map[plain_name] = var
    return name_map
def add_meta_graph_and_variables(self, sess, tags, signature_def_map=None,
                                 assets_collection=None):
    """Saves the variables and adds the current meta graph to the SavedModel.

    A Saver is created to save the variables from the provided session, and
    the corresponding meta graph def is exported. The variables to be saved
    are assumed to be initialized already.

    For a given `SavedModelBuilder` this must be called exactly once, for the
    first meta graph to save; later meta graphs go through `add_meta_graph()`.

    Args:
      sess: The TensorFlow session from which to save the meta graph and
        variables.
      tags: The set of tags with which to save the meta graph.
      signature_def_map: The map of signature defs to add to the meta graph
        def.
      assets_collection: Assets collection to be saved with the SavedModel.

    Raises:
      AssertionError: If variables were already saved through this builder.
    """
    already_saved = self._has_saved_variables
    if already_saved:
        raise AssertionError(
            "Variables and assets have already been saved. "
            "Please invoke `add_meta_graph()` instead.")

    # Assets are saved and written to disk first.
    self._save_and_write_assets(assets_collection)

    export_root = compat.as_text(self._export_dir)
    variables_filename = compat.as_text(constants.VARIABLES_FILENAME)
    checkpoint_path = os.path.join(export_root, variables_filename)

    # Write the variables, then export the meta graph with the same Saver.
    variable_saver = tf_saver.Saver(variables.all_variables())
    variable_saver.save(sess, checkpoint_path, write_meta_graph=False)
    exported_meta_graph = variable_saver.export_meta_graph()

    self._tag_and_add_meta_graph(exported_meta_graph, tags, signature_def_map)

    # From now on any further attempt to save variables must fail.
    self._has_saved_variables = True
def add_meta_graph_and_variables(self, sess, tags, signature_def_map=None,
                                 assets_collection=None):
    """Adds the first meta graph to the SavedModel and writes its variables.

    The variables of `sess` are saved with a freshly created Saver and the
    matching meta graph def is exported. Variables are expected to be
    initialized before this is called.

    This API must be called exactly once per `SavedModelBuilder`, for the
    first meta graph; use `add_meta_graph()` for subsequent meta graph defs.

    Args:
      sess: The TensorFlow session from which to save the meta graph and
        variables.
      tags: The set of tags with which to save the meta graph.
      signature_def_map: The map of signature defs to add to the meta graph
        def.
      assets_collection: Assets collection to be saved with the SavedModel.

    Raises:
      AssertionError: If this builder has already saved variables.
    """
    if self._has_saved_variables:
        raise AssertionError("Variables and assets have already been saved. "
                             "Please invoke `add_meta_graph()` instead.")

    # Save asset files and write them to disk, if any.
    self._save_and_write_assets(assets_collection)

    path_parts = (
        compat.as_text(self._export_dir),
        compat.as_text(constants.VARIABLES_FILENAME),
    )
    export_path = os.path.join(*path_parts)

    # Save the variables; the meta graph is exported separately below.
    model_saver = tf_saver.Saver(variables.all_variables())
    model_saver.save(sess, export_path, write_meta_graph=False)

    # Tag the exported meta graph def and add it to the SavedModel.
    self._tag_and_add_meta_graph(
        model_saver.export_meta_graph(), tags, signature_def_map)

    # Mark the builder so later attempts to save variables are rejected.
    self._has_saved_variables = True
def swapping_saver(self, var_list=None, name='swapping_saver', **kwargs):
    """Creates a saver that swaps variables with their moving averages.

    Use this saver during training: it saves the moving averages of the
    trained parameters under the original parameter names. For evaluation or
    inference a regular saver then picks up the moving averages automatically.

    Call this only after all variables have been created and after
    Optimizer.minimize() has been called.

    Args:
      var_list: List of variables to save, as per `Saver()`. If None, all
        variables created before this call are saved.
      name: The name of the saver.
      **kwargs: Keyword arguments of `Saver()`.

    Returns:
      A `tf.Saver` object.

    Raises:
      RuntimeError: If apply_gradients or minimize has not been called before.
    """
    variable_map = self._variable_map
    if variable_map is None:
        raise RuntimeError('Must call apply_gradients or minimize before '
                           'creating the swapping_saver')

    if var_list is None:
        var_list = variables.all_variables()
    if not isinstance(var_list, dict):
        var_list = saver.BaseSaverBuilder.OpListToDict(var_list)

    # Replace each variable by its swap partner when one is recorded for its
    # op name; otherwise keep the variable itself.
    swapped_var_list = {
        key: variable_map.get(var.op.name, None) or var
        for key, var in six.iteritems(var_list)
    }

    return saver.Saver(swapped_var_list, name=name, **kwargs)
def variables_to_restore(self):
    """Returns a map of names to `Variables` to restore.

    A variable that has a moving average is restored under the moving average
    name; any other variable is restored under its own name.

    For example,

    ```python
      variables_to_restore = ema.variables_to_restore()
      saver = tf.train.Saver(variables_to_restore)
    ```

    Below is an example of such mapping:

    ```
      conv/batchnorm/gamma/ExponentialMovingAverage: conv/batchnorm/gamma,
      conv_4/conv2d_params/ExponentialMovingAverage: conv_4/conv2d_params,
      global_step: global_step
    ```

    Returns:
      A map from restore names to variables. The restore name is the moving
      average version of the variable name if it exists, or the original
      variable name.
    """
    # Variables with a moving average: everything explicitly added to the
    # moving-average collection plus all trainable variables, de-duplicated.
    candidates = (variables.moving_average_variables() +
                  variables.trainable_variables())
    averaged_vars = list(set(candidates))
    name_map = {self.average_name(var): var for var in averaged_vars}

    # The remaining variables are restored under their own op names, without
    # overriding an entry created above.
    leftover = set(variables.all_variables()) - set(averaged_vars)
    for var in leftover:
        name_map.setdefault(var.op.name, var)
    return name_map
def variables_to_restore(self):
    """Builds the restore-name to variable map for this moving average.

    Moving-average and trainable variables are keyed by
    `self.average_name(v)`; all other variables are keyed by their op name.

    For example,

    ```python
      variables_to_restore = ema.variables_to_restore()
      saver = tf.train.Saver(variables_to_restore)
    ```

    Below is an example of such mapping:

    ```
      conv/batchnorm/gamma/ExponentialMovingAverage: conv/batchnorm/gamma,
      conv_4/conv2d_params/ExponentialMovingAverage: conv_4/conv2d_params,
      global_step: global_step
    ```

    Returns:
      A map from restore names to variables. The restore name is the moving
      average version of the variable name if it exists, or the original
      variable name.
    """
    explicit_averages = variables.moving_average_variables()
    trainable = variables.trainable_variables()
    moving_avg_variables = list(set(explicit_averages + trainable))

    name_map = {}
    for averaged in moving_avg_variables:
        restore_name = self.average_name(averaged)
        name_map[restore_name] = averaged

    # Make sure variables without a moving average are restored as well.
    without_average = set(variables.all_variables()) - set(moving_avg_variables)
    for plain in without_average:
        op_name = plain.op.name
        if op_name in name_map:
            continue
        name_map[op_name] = plain
    return name_map
def __init__(self, var_list=None, reshape=False, sharded=False, max_to_keep=5,
             keep_checkpoint_every_n_hours=10000.0, name=None,
             restore_sequentially=False, saver_def=None, builder=None):
    """Creates a `Saver`.

    The constructor adds ops to save and restore variables.

    `var_list` specifies the variables that will be saved and restored. It can
    be passed as a `dict` or a list:

    * A `dict` of names to variables: The keys are the names that will be
      used to save or restore the variables in the checkpoint files.
    * A list of variables: The variables will be keyed with their op name in
      the checkpoint files.

    For example:

    ```python
    v1 = tf.Variable(..., name='v1')
    v2 = tf.Variable(..., name='v2')

    # Pass the variables as a dict:
    saver = tf.train.Saver({'v1': v1, 'v2': v2})

    # Or pass them as a list.
    saver = tf.train.Saver([v1, v2])
    # Passing a list is equivalent to passing a dict with the variable op
    # names as keys:
    saver = tf.train.Saver({v.op.name: v for v in [v1, v2]})
    ```

    The optional `reshape` argument, if `True`, allows restoring a variable
    from a save file where the variable had a different shape, but the same
    number of elements and type. This is useful if you have reshaped a
    variable and want to reload it from an older checkpoint.

    The optional `sharded` argument, if `True`, instructs the saver to shard
    checkpoints per device.

    Args:
      var_list: A list of `Variable` objects or a dictionary mapping names to
        variables. If `None`, defaults to the list of all variables.
      reshape: If `True`, allows restoring parameters from a checkpoint where
        the variables have a different shape.
      sharded: If `True`, shard the checkpoints, one per device.
      max_to_keep: Maximum number of recent checkpoints to keep. Defaults to
        5.
      keep_checkpoint_every_n_hours: How often to keep checkpoints. Defaults
        to 10,000 hours.
      name: String. Optional name to use as a prefix when adding operations.
      restore_sequentially: A `Bool`, which if true, causes restore of
        different variables to happen sequentially within each device. This
        can lower memory usage when restoring very large models.
      saver_def: Optional `SaverDef` proto to use instead of running the
        builder. This is only useful for specialty code that wants to
        recreate a `Saver` object for a previously built `Graph` that had a
        `Saver`. The `saver_def` proto should be the one returned by the
        `as_saver_def()` call of the `Saver` that was created for that
        `Graph`.
      builder: Optional `SaverBuilder` to use if a `saver_def` was not
        provided. Defaults to `BaseSaverBuilder()`.

    Raises:
      TypeError: If `var_list` is invalid.
      ValueError: If any of the keys or values in `var_list` are not unique,
        if there are no variables to save, or if `saver_def` is not a
        `SaverDef` or lacks a save tensor name or restore op name.
    """
    if saver_def is None:
        if builder is None:
            builder = BaseSaverBuilder()
        if var_list is None:
            var_list = variables.all_variables()
        if not var_list:
            raise ValueError("No variables to save")
        saver_def = builder.build(
            var_list,
            reshape=reshape,
            sharded=sharded,
            max_to_keep=max_to_keep,
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
            name=name,
            restore_sequentially=restore_sequentially)
    if not isinstance(saver_def, saver_pb2.SaverDef):
        # The one-element tuple keeps the formatting safe when the offending
        # value is itself a tuple; a bare `% saver_def` would raise TypeError
        # there instead of the ValueError callers expect.
        raise ValueError("saver_def must be a saver_pb2.SaverDef: %s" %
                         (saver_def,))
    if not saver_def.save_tensor_name:
        raise ValueError("saver_def must specify the save_tensor_name: %s"
                         % str(saver_def))
    if not saver_def.restore_op_name:
        raise ValueError("saver_def must specify the restore_op_name: %s"
                         % str(saver_def))
    self._filename_tensor_name = saver_def.filename_tensor_name
    self._save_tensor_name = saver_def.save_tensor_name
    self._restore_op_name = saver_def.restore_op_name
    self._max_to_keep = saver_def.max_to_keep
    # If keep_checkpoint_every_n_hours is not set, set it to 10000 hours.
    self._keep_checkpoint_every_n_hours = (
        saver_def.keep_checkpoint_every_n_hours
        if saver_def.keep_checkpoint_every_n_hours else 10000)
    # Wall-clock time after which the next checkpoint is kept.
    self._next_checkpoint_time = (
        time.time() + self._keep_checkpoint_every_n_hours * 3600)
    self._sharded = saver_def.sharded
    self._last_checkpoints = []
def apply_gradients(
        self,
        grads_and_vars,
        worker_id,
        global_step=None,
        name=None,
        collect_cdfs=False,
        # batch_idx_list=None, worker_kill_list=None, num_workers=None, num_batches_per_epoch=None):
        matrix_to_solve=None,
        num_batches_per_epoch=None):
    """Apply gradients to variables.

    This contains most of the synchronization implementation and also wraps
    the apply_gradients() from the real optimizer. Each gradient is scaled by
    a per-worker weight taken from a least-squares solve before being pushed
    to its accumulator.

    NOTE(review): block nesting in this function was reconstructed from a
    flattened listing; confirm it against the repository before relying on
    scope-sensitive details (name scopes, devices, control dependencies).

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      worker_id: Index of this worker. It selects the worker's token queue,
        the entry of the least-squares solution used as gradient weight, and
        the `job:worker/task:<id>` device for the accumulator updates.
      global_step: Variable to increment by one after the variables have been
        updated. Required despite the `None` default.
      name: Optional name for the returned operation. Not referenced in this
        body.
      collect_cdfs: If true, `self._replicas_to_aggregate` gradients are taken
        from each accumulator; otherwise a single gradient is taken.
      matrix_to_solve: Matrix `A` passed to `matrix_solve_ls(A, b)`.
        Presumably one column per worker -- TODO confirm.
      num_batches_per_epoch: Length of the all-ones right-hand side `b` of the
        least-squares problem; converted with `int()`.

    Returns:
      train_op: A Print op on the local step that runs only after all ops
        collected in `train_ops` (prints and accumulator updates).

    Raises:
      ValueError: If the grads_and_vars is empty.
      ValueError: If global step is not provided, the staleness cannot be
        checked.
      ValueError: If a gradient is neither a `Tensor` nor `IndexedSlices`.
    """
    if not grads_and_vars:
        raise ValueError("Must supply at least one variable")

    if global_step is None:
        raise ValueError("Global step is required to check staleness")

    self._global_step = global_step
    train_ops = []
    aggregated_grad = []
    var_list = []

    # Local copy of the global step; lives in the LOCAL_VARIABLES collection.
    self._local_step = variables.Variable(
        initial_value=0,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        dtype=global_step.dtype.base_dtype,
        name="sync_rep_local_step")
    self.local_step_init_op = state_ops.assign(self._local_step,
                                               global_step._ref())
    chief_init_ops = [self.local_step_init_op]
    self.ready_for_local_init_op = variables.report_uninitialized_variables(
        variables.all_variables())

    # The wait op waits for the current worker to dequeue a token from its
    # respective token queue
    self._wait_op = self._sync_token_queues[worker_id].dequeue()

    # Replicas have to wait until they can get a token from the token queue
    # BEFORE begining to compute gradients.
    with ops.device(global_step.device):
        # NOTE(review): queue_size is not referenced again in this function.
        queue_size = self._sync_token_queues[worker_id].size()
        update_local_step_op = state_ops.assign(self._local_step,
                                                global_step._ref())

    # Gradient accum creation
    with ops.name_scope(None, self._name):
        for grad, var in grads_and_vars:
            var_list.append(var)
            tf.logging.info("Grad " + str(grad) + " assigned to " +
                            str(var.device))
            with ops.device(var.device):
                # NOTE(review): a None gradient adds no accumulator, yet the
                # loops below index self._accumulator_list by position in
                # grads_and_vars; the two would misalign if any gradient is
                # None -- confirm.
                if grad is None:
                    continue
                elif isinstance(grad, ops.Tensor):
                    grad_accum = data_flow_ops.ConditionalAccumulator(
                        grad.dtype,
                        shape=var.get_shape(),
                        shared_name=var.name + "/grad_accum")
                else:
                    if not isinstance(grad, ops.IndexedSlices):
                        raise ValueError("Unknown grad type!")
                    grad_accum = data_flow_ops.SparseConditionalAccumulator(
                        grad.dtype,
                        shape=(),
                        shared_name=var.name + "/grad_accum")
                self._accumulator_list.append((grad_accum, var))

        # The bare string below is a no-op expression statement that holds an
        # earlier, unweighted version of phase 1.
        """# Phase 1 gradient computation with ops.control_dependencies([update_local_step_op]): for index, (grad, var) in enumerate(grads_and_vars): with ops.device(var.device): if grad is None: continue elif isinstance(grad, ops.Tensor): grad_accum = self._accumulator_list[index][0] train_ops.append(grad_accum.apply_grad(grad, local_step=self._local_step._ref())) else: if not isinstance(grad, ops.IndexedSlices): raise ValueError("Unknown grad type!") grad_accum = self._accumulator_list[index][0] train_ops.append(grad_accum.apply_indexed_slices_grad( grad, local_step=self._local_step._ref()))"""

        # Phase 1 gradient computation
        with ops.control_dependencies([update_local_step_op]):
            for index, (grad, var) in enumerate(grads_and_vars):
                print_start_op = logging_ops.Print(
                    global_step, [global_step],
                    message="Starting to apply grads for variable %d" % index)
                train_ops.append(print_start_op)
                with ops.device(var.device):
                    work_idx_print = logging_ops.Print(
                        worker_id, [worker_id],
                        message="worker id for comp grad")
                    ps_step_printer0 = logging_ops.Print(
                        global_step, [global_step],
                        message="global step printer0 on ps")
                    train_ops.append(work_idx_print)
                    train_ops.append(ps_step_printer0)
                    '''Implement LS computation and solution here'''
                    # Solve A x = 1 in the least-squares sense and use entry
                    # `worker_id` of x as this worker's gradient weight.
                    # NOTE(review): none of the inputs depend on the loop
                    # variables, so these ops are rebuilt per variable.
                    #b = np.ones(int(num_batches_per_epoch))
                    b = tf.ones([int(num_batches_per_epoch), 1], tf.float32)
                    A = matrix_to_solve
                    # A_for_calc = np.transpose(A)
                    LS_solution = linalg_ops.matrix_solve_ls(A, b, fast=False)
                    LS_calc = tf.reshape(LS_solution, [-1])
                    weight = tf.slice(LS_calc, [worker_id], [1])
                    # print_ls_op = logging_ops.Print(LS_calc, [LS_calc], message="Solution for LS!")
                    # train_ops.append(print_ls_op)
                    # NOTE(review): the weighting runs before the
                    # `grad is None` check below, so a None gradient reaches
                    # tf.scalar_mul -- confirm this cannot happen.
                    weighted_grad = tf.scalar_mul(weight[0], grad)
                    '''Kill some workers'''
                    if grad is None:
                        continue

                    elif isinstance(grad, ops.Tensor):
                        grad_accum = self._accumulator_list[index][0]

                        with ops.control_dependencies([print_start_op]):
                            with tf.device("job:worker/task:%d" % worker_id):
                                # apply_grad_op = grad_accum.apply_grad(grad,
                                apply_grad_op = grad_accum.apply_grad(
                                    weighted_grad,
                                    local_step=self._local_step._ref())
                                with ops.control_dependencies(
                                        [apply_grad_op]):
                                    finished_print_op = logging_ops.Print(
                                        global_step, [global_step],
                                        message=
                                        "Done applying grads for variable %d" %
                                        index)
                                    train_ops.append(finished_print_op)

                    else:
                        if not isinstance(grad, ops.IndexedSlices):
                            raise ValueError("Unknown grad type!")
                        grad_accum = self._accumulator_list[index][0]

                        with ops.control_dependencies([print_start_op]):
                            with tf.device("job:worker/task:%d" % worker_id):
                                apply_grad_op = grad_accum.apply_indexed_slices_grad(
                                    # grad, local_step=self._local_step._ref())
                                    weighted_grad,
                                    local_step=self._local_step._ref())
                                with ops.control_dependencies(
                                        [apply_grad_op]):
                                    finished_print_op = logging_ops.Print(
                                        global_step, [global_step],
                                        message=
                                        "Done applying grads for variable %d" %
                                        index)
                                    train_ops.append(finished_print_op)

        # Phase 2 gradient applying
        for index, (grad, var) in enumerate(grads_and_vars):
            with ops.device(var.device):
                grad_accum = self._accumulator_list[index][0]
                work_idx_print1 = logging_ops.Print(
                    worker_id, [worker_id],
                    message="worker id for aggregate grad")
                ps_step_printer1 = logging_ops.Print(
                    global_step, [global_step],
                    message="global step printer1 on ps")
                num_replica_aggragate = logging_ops.Print(
                    self._replicas_to_aggregate,
                    [self._replicas_to_aggregate],
                    message="num replica aggregate")
                train_ops.append(work_idx_print1)
                train_ops.append(ps_step_printer1)
                train_ops.append(num_replica_aggragate)
                if grad is None:
                    aggregated_grad.append(None)
                elif isinstance(grad, ops.Tensor):
                    if collect_cdfs:
                        # aggregated_grad.append(grad_accum.take_grad(self._total_num_replicas))
                        aggregated_grad.append(
                            grad_accum.take_grad(
                                self._replicas_to_aggregate))
                    else:
                        aggregated_grad.append(grad_accum.take_grad(1))
                else:
                    if collect_cdfs:
                        # NOTE(review): this sparse branch calls take_grad
                        # while the branch below uses
                        # take_indexed_slices_grad -- confirm which is meant.
                        # aggregated_grad.append(grad_accum.take_grad(self._total_num_replicas))
                        aggregated_grad.append(
                            grad_accum.take_grad(
                                self._replicas_to_aggregate))
                    else:
                        aggregated_grad.append(
                            grad_accum.take_indexed_slices_grad(1))

        aggregated_grads_and_vars = zip(aggregated_grad, var_list)

        # Some debug operations
        self.print_sizes = logging_ops.Print(global_step, [
            self._sync_token_queues[i].size()
            for i in range(self._total_num_replicas)
        ], message="queue sizes")
        self.print_accum_sizes = logging_ops.Print(
            self._local_step,
            [x[0].num_accumulated() for x in self._accumulator_list] +
            [worker_id],
            message="Accum sizes")
        self.print_local_step = logging_ops.Print(
            self._local_step,
            [self._local_step._ref(), global_step._ref()],
            message="local vs global step")

        # sync_op will be assigned to the same device as the global step.
        with ops.device(global_step.device), ops.name_scope(""):
            with ops.control_dependencies([self.print_accum_sizes]):
                update_op = self._opt.apply_gradients(
                    aggregated_grads_and_vars, global_step)
                self._update_op = update_op
                with ops.control_dependencies([update_op]):
                    # After the update, enqueue the global step into every
                    # worker's token queue.
                    sync_op = []
                    for cur_worker_id in range(self._total_num_replicas):
                        sync_op.append(
                            self._sync_token_queues[cur_worker_id].enqueue(
                                global_step))
                    sync_op = control_flow_ops.group(*(sync_op))

        # dummy_queue is passed to the queue runner. Don't use the real queues
        # because the queue runner doesn't automatically reopen it once it
        # closed queues in PS devices.
        dummy_queue = (data_flow_ops.FIFOQueue(
            1, types_pb2.DT_INT32, shapes=(), shared_name="dummy_queue"))
        self._chief_queue_runner = queue_runner.QueueRunner(
            dummy_queue, [sync_op])

        with ops.device(global_step.device), ops.name_scope(""):
            with ops.control_dependencies(train_ops):
                # Worker finished applying gradients. Add token to
                # phase1_finished_queue
                train_op = logging_ops.Print(
                    self._local_step._ref(), [
                        x[0].num_accumulated()
                        for x in self._accumulator_list
                    ] + [worker_id],
                    message="Finished worker updates",
                    name="FinishedWorkerUpdatesPrint")

        # Each accumulator gets a set_global_step op in the chief's init
        # group.
        for accum, var in self._accumulator_list:
            with ops.device(var.device):
                chief_init_ops.append(
                    accum.set_global_step(global_step, name="SetGlobalStep"))
        self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
        self._gradients_applied = True
        return train_op
def testModelWithBuckets(self):
  """Trains a small bucketed seq2seq copy model and checks perplexity.

  A two-layer GRU attention model with a sampled-softmax loss is built for
  two buckets (lengths 4 and 8). A handful of Adam steps are run on random
  symbol sequences, and for every bucket that was trained more than once the
  last perplexity must be below 1.1x the first one.
  """
  # We learn to copy 10 symbols in 2 buckets: length 4 and length 8.
  classes = 10
  buckets = [(4, 4), (8, 8)]
  # Number of placeholders per side; equals the largest bucket length.
  max_length = buckets[-1][0]
  perplexities = [[], []]  # Results for each bucket.
  random_seed.set_random_seed(111)
  random.seed(111)
  np.random.seed(111)

  with self.test_session() as sess:
    # We use sampled softmax so we keep output projection separate.
    w = variable_scope.get_variable("proj_w", [24, classes])
    w_t = array_ops.transpose(w)
    b = variable_scope.get_variable("proj_b", [classes])

    # Here comes a sample Seq2Seq model using GRU cells.
    def SampleGRUSeq2Seq(enc_inp, dec_inp, weights):
      """Example sequence-to-sequence model that uses GRU cells."""

      def GRUSeq2Seq(enc_inp, dec_inp):
        cell = core_rnn_cell_impl.MultiRNNCell(
            [core_rnn_cell_impl.GRUCell(24) for _ in range(2)],
            state_is_tuple=True)
        return seq2seq_lib.embedding_attention_seq2seq(
            enc_inp,
            dec_inp,
            cell,
            num_encoder_symbols=classes,
            num_decoder_symbols=classes,
            embedding_size=24,
            output_projection=(w, b))

      # Targets are the decoder inputs shifted left by one, padded with 0.
      targets = dec_inp[1:] + [0]

      def SampledLoss(labels, inputs):
        labels = array_ops.reshape(labels, [-1, 1])
        return nn_impl.sampled_softmax_loss(
            weights=w_t,
            biases=b,
            labels=labels,
            inputs=inputs,
            num_sampled=8,
            num_classes=classes)

      return seq2seq_lib.model_with_buckets(
          enc_inp,
          dec_inp,
          targets,
          weights,
          buckets,
          GRUSeq2Seq,
          softmax_loss_function=SampledLoss)

    # Now we construct the copy model.
    batch_size = 8
    inp = [
        array_ops.placeholder(dtypes.int32, shape=[None])
        for _ in range(max_length)
    ]
    out = [
        array_ops.placeholder(dtypes.int32, shape=[None])
        for _ in range(max_length)
    ]
    weights = [
        array_ops.ones_like(inp[0], dtype=dtypes.float32)
        for _ in range(max_length)
    ]
    with variable_scope.variable_scope("root"):
      _, losses = SampleGRUSeq2Seq(inp, out, weights)
      updates = []
      # global_variables() is the non-deprecated spelling of all_variables()
      # and matches the global_variables_initializer() call below.
      params = variables.global_variables()
      optimizer = adam.AdamOptimizer(0.03, epsilon=1e-5)
      # model_with_buckets yields one loss per bucket; build one update each.
      for bucket_loss in losses:
        full_grads = gradients_impl.gradients(bucket_loss, params)
        grads, _ = clip_ops.clip_by_global_norm(full_grads, 30.0)
        updates.append(optimizer.apply_gradients(zip(grads, params)))
      sess.run([variables.global_variables_initializer()])

    steps = 6
    for _ in range(steps):
      bucket = random.choice(np.arange(len(buckets)))
      length = buckets[bucket][0]
      # One int32 vector per time step, symbols drawn from 1..9.
      enc_batch = [
          np.array(
              [np.random.randint(9) + 1 for _ in range(batch_size)],
              dtype=np.int32) for _ in range(length)
      ]
      # 0 is our "GO" symbol here.
      dec_batch = [np.array([0] * batch_size, dtype=np.int32)] + enc_batch
      feed = {}
      for inp_ph, inp_val, out_ph, out_val in zip(
          inp[:length], enc_batch[:length], out[:length], dec_batch[:length]):
        feed[inp_ph.name] = inp_val
        feed[out_ph.name] = out_val
      if length < max_length:
        # For the 4-bucket, we need the 5th as target.
        feed[out[length].name] = dec_batch[length]
      res = sess.run([updates[bucket], losses[bucket]], feed)
      perplexities[bucket].append(math.exp(float(res[1])))

    for history in perplexities:
      if len(history) > 1:
        # Assert that perplexity went down, with a 10% margin of error.
        self.assertLess(history[-1], 1.1 * history[0])
def __init__(self,
             var_list=None,
             reshape=False,
             sharded=False,
             max_to_keep=5,
             keep_checkpoint_every_n_hours=10000.0,
             name=None,
             restore_sequentially=False,
             saver_def=None,
             builder=None):
  """Creates a `Saver`.

  The constructor adds ops to save and restore variables.

  `var_list` specifies the variables that will be saved and restored. It can
  be passed as a `dict` or a list:

  * A `dict` of names to variables: The keys are the names that will be
    used to save or restore the variables in the checkpoint files.
  * A list of variables: The variables will be keyed with their op name in
    the checkpoint files.

  For example:

  ```python
  v1 = tf.Variable(..., name='v1')
  v2 = tf.Variable(..., name='v2')

  # Pass the variables as a dict:
  saver = tf.train.Saver({'v1': v1, 'v2': v2})

  # Or pass them as a list.
  saver = tf.train.Saver([v1, v2])
  # Passing a list is equivalent to passing a dict with the variable op names
  # as keys:
  saver = tf.train.Saver({v.op.name: v for v in [v1, v2]})
  ```

  The optional `reshape` argument, if `True`, allows restoring a variable from
  a save file where the variable had a different shape, but the same number
  of elements and type. This is useful if you have reshaped a variable and
  want to reload it from an older checkpoint.

  The optional `sharded` argument, if `True`, instructs the saver to shard
  checkpoints per device.

  Args:
    var_list: A list of `Variable` objects or a dictionary mapping names to
      variables. If `None`, defaults to the list of all variables.
    reshape: If `True`, allows restoring parameters from a checkpoint
      where the variables have a different shape.
    sharded: If `True`, shard the checkpoints, one per device.
    max_to_keep: Maximum number of recent checkpoints to keep.
      Defaults to 5.
    keep_checkpoint_every_n_hours: How often to keep checkpoints.
      Defaults to 10,000 hours.
    name: string. Optional name to use as a prefix when adding operations.
    restore_sequentially: A `Bool`, which if true, causes restore of different
      variables to happen sequentially within each device. This can lower
      memory usage when restoring very large models.
    saver_def: Optional `SaverDef` proto to use instead of running the
      builder. This is only useful for specialty code that wants to recreate
      a `Saver` object for a previously built `Graph` that had a `Saver`.
      The `saver_def` proto should be the one returned by the
      `as_saver_def()` call of the `Saver` that was created for that `Graph`.
    builder: Optional `SaverBuilder` to use if a `saver_def` was not provided.
      Defaults to `BaseSaverBuilder()`.

  Raises:
    TypeError: If `var_list` is invalid.
    ValueError: If any of the keys or values in `var_list` are not unique,
      if there are no variables to save, if `saver_def` is not a
      `saver_pb2.SaverDef`, or if it lacks `save_tensor_name` or
      `restore_op_name`.
  """
  if saver_def is None:
    # No pre-built proto: run the builder over the requested variables.
    if builder is None:
      builder = BaseSaverBuilder()
    if var_list is None:
      var_list = variables.all_variables()
    if not var_list:
      raise ValueError("No variables to save")
    saver_def = builder.build(
        var_list,
        reshape=reshape,
        sharded=sharded,
        max_to_keep=max_to_keep,
        keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
        name=name,
        restore_sequentially=restore_sequentially)

  # Validate the proto regardless of where it came from. str() is applied
  # explicitly so that a tuple-valued saver_def cannot break %-formatting.
  if not isinstance(saver_def, saver_pb2.SaverDef):
    raise ValueError("saver_def must be a saver_pb2.SaverDef: %s" %
                     str(saver_def))
  if not saver_def.save_tensor_name:
    raise ValueError("saver_def must specify the save_tensor_name: %s" %
                     str(saver_def))
  if not saver_def.restore_op_name:
    raise ValueError("saver_def must specify the restore_op_name: %s" %
                     str(saver_def))

  self._filename_tensor_name = saver_def.filename_tensor_name
  self._save_tensor_name = saver_def.save_tensor_name
  self._restore_op_name = saver_def.restore_op_name
  self._max_to_keep = saver_def.max_to_keep
  # If keep_checkpoint_every_n_hours is not set, set it to 10000 hours.
  self._keep_checkpoint_every_n_hours = (
      saver_def.keep_checkpoint_every_n_hours
      if saver_def.keep_checkpoint_every_n_hours else 10000)
  # Wall-clock time (seconds) after which the next checkpoint is retained.
  self._next_checkpoint_time = (
      time.time() + self._keep_checkpoint_every_n_hours * 3600)
  self._sharded = saver_def.sharded
  self._last_checkpoints = []
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
  """Builds the accumulate / update / token hand-off graph.

  For every (gradient, variable) pair with a gradient, a conditional
  accumulator is created on the variable's device; the replica's gradient is
  pushed into it and an aggregate over `self._replicas_to_aggregate`
  contributions is taken from it. The aggregates are handed to the wrapped
  optimizer's `apply_gradients()`. A shared token queue is created on the
  global step's device: the returned op assigns a dequeued token to the local
  step, and the chief's queue runner enqueues `self._tokens_per_step` tokens
  after the update.

  Args:
    grads_and_vars: List of (gradient, variable) pairs as returned by
      compute_gradients().
    global_step: Variable passed to the wrapped optimizer and used as the
      token value. Required despite the `None` default.
    name: Optional name for the returned operation. Not referenced in this
      body.

  Returns:
    train_op: An assign of the dequeued token to the local step; the dequeue
      is gated on this replica's accumulator pushes.

  Raises:
    ValueError: If `grads_and_vars` is empty.
    ValueError: If `global_step` is None.
    ValueError: If a gradient is neither a Tensor nor IndexedSlices.
  """
  if not grads_and_vars:
    raise ValueError("Must supply at least one variable")
  if global_step is None:
    raise ValueError("Global step is required to check staleness")

  self._global_step = global_step
  push_ops = []
  averaged_grads = []
  target_vars = []

  local_step = variables.Variable(
      initial_value=0,
      trainable=False,
      collections=[ops.GraphKeys.LOCAL_VARIABLES],
      name="sync_rep_local_step")
  self._local_step = local_step
  self.local_step_init_op = state_ops.assign(local_step, global_step)
  init_ops = [self.local_step_init_op]
  self.ready_for_local_init_op = variables.report_uninitialized_variables(
      variables.all_variables())

  with ops.name_scope(None, self._name):
    for grad, var in grads_and_vars:
      target_vars.append(var)
      with ops.device(var.device):
        if grad is None:
          # Pass-through: no accumulator, but keep the slot aligned.
          averaged_grads.append(None)
          continue

        if isinstance(grad, ops.Tensor):
          # Dense gradient.
          accum = data_flow_ops.ConditionalAccumulator(
              grad.dtype,
              shape=var.get_shape(),
              shared_name=var.name + "/grad_accum")
          push_op = accum.apply_grad(grad, local_step=local_step)
          averaged = accum.take_grad(self._replicas_to_aggregate)
        elif isinstance(grad, ops.IndexedSlices):
          # Sparse gradient.
          accum = data_flow_ops.SparseConditionalAccumulator(
              grad.dtype, shape=(), shared_name=var.name + "/grad_accum")
          push_op = accum.apply_indexed_slices_grad(
              grad, local_step=local_step)
          averaged = accum.take_indexed_slices_grad(
              self._replicas_to_aggregate)
        else:
          raise ValueError("Unknown grad type!")

        push_ops.append(push_op)
        averaged_grads.append(averaged)
        self._accumulator_list.append((accum, var.device))

    averaged_grads_and_vars = zip(averaged_grads, target_vars)

    # The update is placed on the same device as the global step.
    with ops.device(global_step.device):
      with ops.name_scope(""):
        update_op = self._opt.apply_gradients(averaged_grads_and_vars,
                                              global_step)

    # Create token queue.
    with ops.device(global_step.device):
      with ops.name_scope(""):
        token_queue = data_flow_ops.FIFOQueue(
            -1,
            global_step.dtype.base_dtype,
            shapes=(),
            shared_name="sync_token_q")
        self._sync_token_queue = token_queue

        # dummy_queue is passed to the queue runner. Don't use the real
        # queues because the queue runner doesn't automatically reopen it
        # once it closed queues in PS devices.
        dummy_queue = data_flow_ops.FIFOQueue(
            1, types_pb2.DT_INT32, shapes=(), shared_name="dummy_queue")

    with ops.device(global_step.device):
      with ops.name_scope(""):
        # Replicas have to wait until they can get a token from the token
        # queue.
        with ops.control_dependencies(push_ops):
          token = token_queue.dequeue()
        train_op = state_ops.assign(local_step, token)

        with ops.control_dependencies([update_op]):
          # Sync_op needs to insert tokens to the token queue at the end of
          # the step so the replicas can fetch them to start the next step.
          tokens = array_ops.fill([self._tokens_per_step], global_step.ref())
          sync_op = token_queue.enqueue_many((tokens,))

        averages = self._variable_averages
        if averages is not None:
          with ops.control_dependencies([sync_op]), ops.name_scope(""):
            sync_op = averages.apply(self._variables_to_average)

        self._chief_queue_runner = queue_runner.QueueRunner(
            dummy_queue, [sync_op])

    # The chief stamps every accumulator with the global step on init.
    for accum, accum_device in self._accumulator_list:
      with ops.device(accum_device):
        init_ops.append(
            accum.set_global_step(global_step, name="SetGlobalStep"))
    self.chief_init_op = control_flow_ops.group(*init_ops)
    self._gradients_applied = True
    return train_op
def apply_gradients(self,
                    grads_and_vars,
                    worker_id,
                    global_step=None,
                    name=None,
                    collect_cdfs=False):
  """Apply gradients to variables using per-worker token queues.

  This contains most of the synchronization implementation and also wraps the
  apply_gradients() from the real optimizer. One conditional accumulator is
  created per non-None gradient. The worker pushes its gradient into each
  accumulator, the aggregate is taken from each accumulator and handed to the
  wrapped optimizer, and after the update one token is enqueued on every
  worker's sync token queue. Several `Print` ops are attached for debugging.

  NOTE(review): the nesting of the `with` blocks below was reconstructed from
  a whitespace-collapsed source; confirm device / control-dependency scopes
  against the original file.

  Args:
    grads_and_vars: List of (gradient, variable) pairs as returned by
      compute_gradients().
    worker_id: Integer index of the calling worker. Selects the entry of
      `self._sync_token_queues` used for `self._wait_op` and is formatted
      into the "job:worker/task:%d" device string for the accumulator pushes.
    global_step: Variable passed to the wrapped optimizer and used as the
      token value. Required despite the `None` default.
    name: Optional name for the returned operation. Not referenced in this
      body.
    collect_cdfs: If True, each aggregate requires
      `self._total_num_replicas` accumulated gradients; otherwise 1.

  Returns:
    train_op: A `Print` op over the local step that is gated on this worker's
      accumulator pushes and debug ops.

  Raises:
    ValueError: If `grads_and_vars` is empty.
    ValueError: If `global_step` is None.
    ValueError: If a gradient is neither a Tensor nor IndexedSlices.
    ValueError: If every gradient is None.
  """
  if not grads_and_vars:
    raise ValueError("Must supply at least one variable")

  if global_step is None:
    raise ValueError("Global step is required to check staleness")

  self._global_step = global_step
  train_ops = []
  aggregated_grad = []
  var_list = []
  printer_ops = []
  # accumulators[i] belongs to grads_and_vars[i]; None where the gradient is
  # None. self._accumulator_list only holds the non-None entries, so it must
  # not be indexed by position in grads_and_vars.
  accumulators = []

  def f_pos():
    """tf.cond true branch: enqueue the global step on the stop queue."""
    enq_total_ops = self._stop_queue.enqueue(global_step)
    with ops.control_dependencies([enq_total_ops]):
      return tf.Print(
          global_step, [global_step], message="Enquequed to stop queue")

  def f_neg():
    """tf.cond false branch: only print, nothing is enqueued."""
    return tf.Print(
        global_step, [global_step], message="Nothing to stop queue")

  self._local_step = variables.Variable(
      initial_value=0,
      trainable=False,
      collections=[ops.GraphKeys.LOCAL_VARIABLES],
      dtype=global_step.dtype.base_dtype,
      name="sync_rep_local_step")
  self.local_step_init_op = state_ops.assign(self._local_step,
                                             global_step._ref())
  chief_init_ops = [self.local_step_init_op]
  self.ready_for_local_init_op = variables.report_uninitialized_variables(
      variables.all_variables())

  # The wait op waits for the current worker to dequeue a token from its
  # respective token queue.
  self._wait_op = self._sync_token_queues[worker_id].dequeue()

  # Replicas have to wait until they can get a token from the token queue
  # BEFORE beginning to compute gradients.
  with ops.device(global_step.device):
    update_local_step_op = state_ops.assign(self._local_step,
                                            global_step._ref())

  with ops.name_scope(None, self._name):
    # Gradient accumulator creation.
    for grad, var in grads_and_vars:
      var_list.append(var)
      tf.logging.info("Grad " + str(grad) + " assigned to " + str(var.device))
      with ops.device(var.device):
        if grad is None:
          accumulators.append(None)
          continue
        elif isinstance(grad, ops.Tensor):
          grad_accum = data_flow_ops.ConditionalAccumulator(
              grad.dtype,
              shape=var.get_shape(),
              shared_name=var.name + "/grad_accum")
        else:
          if not isinstance(grad, ops.IndexedSlices):
            raise ValueError("Unknown grad type!")
          grad_accum = data_flow_ops.SparseConditionalAccumulator(
              grad.dtype, shape=(), shared_name=var.name + "/grad_accum")
        accumulators.append(grad_accum)
        self._accumulator_list.append((grad_accum, var))

    if all(accum is None for accum in accumulators):
      # The code below needs at least one accumulator push to depend on.
      raise ValueError("No gradients provided for any variable")

    # Phase 1: push this worker's gradients into the accumulators.
    last_apply_op = None
    with ops.control_dependencies([update_local_step_op]):
      for index, (grad, var) in enumerate(grads_and_vars):
        print_start_op = logging_ops.Print(
            global_step, [global_step],
            message="Starting to apply grads for variable %d" % index)
        with ops.device(var.device):
          grad_accum = accumulators[index]
          if grad_accum is None:
            continue
          with ops.control_dependencies([print_start_op]):
            with tf.device("job:worker/task:%d" % worker_id):
              if isinstance(grad, ops.Tensor):
                last_apply_op = grad_accum.apply_grad(
                    grad, local_step=self._local_step._ref())
              else:
                last_apply_op = grad_accum.apply_indexed_slices_grad(
                    grad, local_step=self._local_step._ref())
              with ops.control_dependencies([last_apply_op]):
                finished_print_op = logging_ops.Print(
                    global_step, [global_step],
                    message="Done applying grads for variable %d" % index)
                train_ops.append(finished_print_op)

    # Debug / stop-queue ops, gated on the last accumulator push.
    with ops.control_dependencies([last_apply_op]):
      accum_sizes_printer = logging_ops.Print(
          global_step,
          [x[0].num_accumulated() for x in self._accumulator_list] +
          [worker_id] + [global_step],
          message="Accum aggregated status on ps")
      train_ops.append(accum_sizes_printer)
      first_accum = self._accumulator_list[0][0]
      ret = tf.cond(
          tf.greater_equal(first_accum.num_accumulated(),
                           self._constant_for_comparison), f_pos, f_neg)
      should_stop_list_printer = logging_ops.Print(
          global_step, [ret], message="Should stop ret val status on ps")
      train_ops.append(should_stop_list_printer)
      with ops.control_dependencies([ret]):
        queue_total_printer = logging_ops.Print(
            global_step, [self._stop_queue.size()],
            message="shared should stop queue size")
        train_ops.append(queue_total_printer)

    # Phase 2: take the aggregated gradients.
    num_required = self._total_num_replicas if collect_cdfs else 1
    for index, (grad, var) in enumerate(grads_and_vars):
      with ops.device(var.device):
        grad_accum = accumulators[index]
        if grad_accum is None:
          aggregated_grad.append(None)
        elif isinstance(grad, ops.Tensor):
          aggregated_grad.append(grad_accum.take_grad(num_required))
        else:
          # Sparse accumulators must be drained with
          # take_indexed_slices_grad so the optimizer gets IndexedSlices.
          aggregated_grad.append(
              grad_accum.take_indexed_slices_grad(num_required))
    aggregated_grads_and_vars = zip(aggregated_grad, var_list)

    # Some debug operations.
    self.print_sizes = logging_ops.Print(
        global_step, [
            self._sync_token_queues[i].size()
            for i in range(self._total_num_replicas)
        ],
        message="queue sizes")
    self.print_accum_sizes = logging_ops.Print(
        self._local_step,
        [x[0].num_accumulated() for x in self._accumulator_list] +
        [worker_id],
        message="Accum sizes")
    self.print_local_step = logging_ops.Print(
        self._local_step,
        [self._local_step._ref(), global_step._ref()],
        message="local vs global step")

    # sync_op will be assigned to the same device as the global step.
    with ops.device(global_step.device), ops.name_scope(""):
      with ops.control_dependencies([self.print_accum_sizes]):
        update_op = self._opt.apply_gradients(aggregated_grads_and_vars,
                                              global_step)
        self._update_op = update_op
        # Drain whatever is currently in the stop queue.
        num_to_dequeue = self._stop_queue.size()
        deq_ops = self._stop_queue.dequeue_many(num_to_dequeue)
        with ops.control_dependencies([deq_ops]):
          size_printer_2 = logging_ops.Print(
              global_step, [self.print_accum_sizes],
              message="Complelted the dequeue operation!")
          printer_ops.append(size_printer_2)
        with ops.control_dependencies(printer_ops):
          with ops.control_dependencies([update_op]):
            # After the update, hand one token to every worker's queue.
            sync_op = [
                self._sync_token_queues[cur_worker_id].enqueue(global_step)
                for cur_worker_id in range(self._total_num_replicas)
            ]
            sync_op = control_flow_ops.group(*sync_op)

      # dummy_queue is passed to the queue runner. Don't use the real queues
      # because the queue runner doesn't automatically reopen it once it
      # closed queues in PS devices.
      dummy_queue = data_flow_ops.FIFOQueue(
          1, types_pb2.DT_INT32, shapes=(), shared_name="dummy_queue")
      self._chief_queue_runner = queue_runner.QueueRunner(
          dummy_queue, [sync_op])

    with ops.device(global_step.device), ops.name_scope(""):
      with ops.control_dependencies(train_ops):
        # Worker finished applying gradients.
        train_op = logging_ops.Print(
            self._local_step._ref(),
            [x[0].num_accumulated() for x in self._accumulator_list] +
            [worker_id] + [global_step],
            message="Finished worker updates",
            name="FinishedWorkerUpdatesPrint")

    # The chief stamps every accumulator with the global step on init.
    for accum, var in self._accumulator_list:
      with ops.device(var.device):
        chief_init_ops.append(
            accum.set_global_step(global_step, name="SetGlobalStep"))
    self.chief_init_op = control_flow_ops.group(*chief_init_ops)
    self._gradients_applied = True
    return train_op
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
  """Wraps the real optimizer's update with accumulator-based syncing.

  Gradients are pushed into one conditional accumulator per variable
  (created on that variable's device), and the aggregate over
  `self._replicas_to_aggregate` contributions is what the wrapped optimizer
  applies. Replicas are released by tokens from a shared FIFO queue that is
  refilled with `self._tokens_per_step` tokens once the update has run.

  Args:
    grads_and_vars: List of (gradient, variable) pairs as returned by
      compute_gradients().
    global_step: Variable passed to the wrapped optimizer and used as the
      token value. Required despite the `None` default.
    name: Optional name for the returned operation. Not referenced in this
      body.

  Returns:
    train_op: An assign of the dequeued token to the local step; the dequeue
      is gated on this replica's accumulator pushes.

  Raises:
    ValueError: If `grads_and_vars` is empty.
    ValueError: If `global_step` is None.
    ValueError: If a gradient is neither a Tensor nor IndexedSlices.
  """
  if not grads_and_vars:
    raise ValueError("Must supply at least one variable")
  if global_step is None:
    raise ValueError("Global step is required to check staleness")

  self._global_step = global_step
  replica_push_ops = []
  mean_grads = []
  variables_seen = []

  self._local_step = variables.Variable(
      initial_value=0,
      trainable=False,
      collections=[ops.GraphKeys.LOCAL_VARIABLES],
      name="sync_rep_local_step")
  self.local_step_init_op = state_ops.assign(self._local_step, global_step)
  chief_ops = [self.local_step_init_op]
  self.ready_for_local_init_op = variables.report_uninitialized_variables(
      variables.all_variables())

  def _wire_accumulator(grad, var):
    """Creates the accumulator for one pair; returns (accum, push, mean)."""
    if isinstance(grad, ops.Tensor):
      # Dense gradients.
      accum = data_flow_ops.ConditionalAccumulator(
          grad.dtype,
          shape=var.get_shape(),
          shared_name=var.name + "/grad_accum")
      push = accum.apply_grad(grad, local_step=self._local_step)
      mean = accum.take_grad(self._replicas_to_aggregate)
      return accum, push, mean
    if not isinstance(grad, ops.IndexedSlices):
      raise ValueError("Unknown grad type!")
    accum = data_flow_ops.SparseConditionalAccumulator(
        grad.dtype, shape=(), shared_name=var.name + "/grad_accum")
    push = accum.apply_indexed_slices_grad(grad, local_step=self._local_step)
    mean = accum.take_indexed_slices_grad(self._replicas_to_aggregate)
    return accum, push, mean

  with ops.name_scope(None, self._name):
    for grad, var in grads_and_vars:
      variables_seen.append(var)
      with ops.device(var.device):
        if grad is not None:
          accum, push, mean = _wire_accumulator(grad, var)
          replica_push_ops.append(push)
          mean_grads.append(mean)
          self._accumulator_list.append((accum, var.device))
        else:
          mean_grads.append(None)  # pass-through.

    aggregated_grads_and_vars = zip(mean_grads, variables_seen)

    # sync_op will be assigned to the same device as the global step.
    with ops.device(global_step.device), ops.name_scope(""):
      update_op = self._opt.apply_gradients(aggregated_grads_and_vars,
                                            global_step)

    # Create token queue.
    with ops.device(global_step.device), ops.name_scope(""):
      self._sync_token_queue = data_flow_ops.FIFOQueue(
          -1,
          global_step.dtype.base_dtype,
          shapes=(),
          shared_name="sync_token_q")
      sync_token_queue = self._sync_token_queue

      # dummy_queue is passed to the queue runner. Don't use the real queues
      # because the queue runner doesn't automatically reopen it once it
      # closed queues in PS devices.
      dummy_queue = data_flow_ops.FIFOQueue(
          1, types_pb2.DT_INT32, shapes=(), shared_name="dummy_queue")

    with ops.device(global_step.device), ops.name_scope(""):
      # Replicas have to wait until they can get a token from the token queue.
      with ops.control_dependencies(replica_push_ops):
        token = sync_token_queue.dequeue()
      train_op = state_ops.assign(self._local_step, token)

      with ops.control_dependencies([update_op]):
        # Sync_op needs to insert tokens to the token queue at the end of the
        # step so the replicas can fetch them to start the next step.
        fill_shape = [self._tokens_per_step]
        tokens = array_ops.fill(fill_shape, global_step.ref())
        sync_op = sync_token_queue.enqueue_many((tokens,))

      if self._variable_averages is not None:
        with ops.control_dependencies([sync_op]), ops.name_scope(""):
          sync_op = self._variable_averages.apply(self._variables_to_average)

      self._chief_queue_runner = queue_runner.QueueRunner(dummy_queue,
                                                          [sync_op])

    # Each accumulator gets a set_global_step op for the chief's init group.
    for accum, dev in self._accumulator_list:
      with ops.device(dev):
        set_step_op = accum.set_global_step(global_step, name="SetGlobalStep")
        chief_ops.append(set_step_op)
    self.chief_init_op = control_flow_ops.group(*chief_ops)
    self._gradients_applied = True
    return train_op