def read_keyed_batch_features(file_pattern,
                              batch_size,
                              features,
                              reader,
                              randomize_input=True,
                              num_epochs=None,
                              queue_capacity=10000,
                              reader_num_threads=1,
                              feature_queue_capacity=100,
                              num_queue_runners=2,
                              parser_num_threads=None,
                              name=None):
  """Adds operations to read, queue, batch and parse `Example` protos.

  Given file pattern (or list of files), will setup a queue for file names,
  read `Example` protos using provided `reader`, use batch queue to create
  batches of examples of size `batch_size` and parse examples given `features`
  specification.

  All queue runners are added to the queue runners collection, and may be
  started via `start_queue_runners`.

  All ops are added to the default graph.

  Args:
    file_pattern: List of files or pattern of file paths containing
      `Example` records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int or scalar `Tensor` specifying the batch size to use.
    features: A `dict` mapping feature keys to `FixedLenFeature` or
      `VarLenFeature` values.
    reader: A function or class that returns an object with `read` method,
      (filename tensor) -> (example tensor).
    randomize_input: Whether the input should be randomized.
    num_epochs: Integer specifying the number of times to read through the
      dataset. If None, cycles through the dataset forever. NOTE - If
      specified, creates a variable that must be initialized, so call
      `tf.initialize_local_variables()` as shown in the tests.
    queue_capacity: Capacity for input queue.
    reader_num_threads: The number of threads to read examples.
    feature_queue_capacity: Capacity of the parsed features queue.
    num_queue_runners: Number of queue runners to start for the feature queue.
      Adding multiple queue runners for the parsed example queue helps maintain
      a full queue when the subsequent computations overall are cheaper than
      parsing.
    parser_num_threads: (Deprecated) The number of threads to parse examples.
    name: Name of resulting op.

  Returns:
    Returns tuple of:
    - String `Tensor` of keys.
    - A dict of `Tensor` or `SparseTensor` objects for each in `features`.

  Raises:
    ValueError: for invalid inputs.
  """
  if parser_num_threads:
    # TODO(sibyl-Aix6ihai): Remove on Sept 3 2016.
    logging.warning('parser_num_threads is deprecated, it will be removed on '
                    'Sept 3 2016')
  with ops.name_scope(name, 'read_batch_features', [file_pattern]) as scope:
    keys, examples = read_keyed_batch_examples(
        file_pattern, batch_size, reader, randomize_input=randomize_input,
        num_epochs=num_epochs, queue_capacity=queue_capacity,
        num_threads=reader_num_threads, read_batch_size=batch_size,
        name=scope)

    # Parse the example.
    feature_map = parsing_ops.parse_example(examples, features)

    # Let's also add preprocessed tensors into the queue types for each item of
    # the queue.
    tensors_to_enqueue = []
    # Each entry contains the key, and a boolean which indicates whether the
    # tensor was a sparse tensor.
    tensors_mapping = []
    # TODO(sibyl-Aix6ihai): Most of the functionality here is about pushing
    # sparse tensors into a queue. This could be taken care of somewhere else
    # so others can reuse it. Also, QueueBase may be extended to handle sparse
    # tensors directly.
    for key in sorted(feature_map.keys()):
      tensor = feature_map[key]
      if isinstance(tensor, ops.SparseTensor):
        tensors_mapping.append((key, True))
        tensors_to_enqueue.extend(
            [tensor.indices, tensor.values, tensor.shape])
      else:
        tensors_mapping.append((key, False))
        tensors_to_enqueue.append(tensor)
    tensors_to_enqueue.append(keys)

    queue_dtypes = [x.dtype for x in tensors_to_enqueue]
    input_queue = data_flow_ops.FIFOQueue(feature_queue_capacity, queue_dtypes)

    # Add a summary op to debug if our feature queue is full or not.
    logging_ops.scalar_summary(
        'queue/parsed_features/%s/fraction_of_%d_full' %
        (input_queue.name, feature_queue_capacity),
        math_ops.cast(input_queue.size(), dtypes.float32) *
        (1. / feature_queue_capacity))

    # Add multiple queue runners so that the queue is always full. Adding more
    # than two queue-runners may hog the cpu on the worker to fill up the
    # queue.
    for _ in range(num_queue_runners):
      queue_runner.add_queue_runner(
          queue_runner.QueueRunner(
              input_queue, [input_queue.enqueue(tensors_to_enqueue)]))

    dequeued_tensors = input_queue.dequeue()

    # Reset shapes on dequeued tensors.
    for i in range(len(tensors_to_enqueue)):
      dequeued_tensors[i].set_shape(tensors_to_enqueue[i].get_shape())

    # Recreate feature mapping according to the original dictionary.
    dequeued_feature_map = {}
    index = 0
    for key, is_sparse_tensor in tensors_mapping:
      if is_sparse_tensor:
        # Three tensors are (indices, values, shape).
        dequeued_feature_map[key] = ops.SparseTensor(
            dequeued_tensors[index], dequeued_tensors[index + 1],
            dequeued_tensors[index + 2])
        index += 3
      else:
        dequeued_feature_map[key] = dequeued_tensors[index]
        index += 1
    dequeued_keys = dequeued_tensors[-1]

    return dequeued_keys, dequeued_feature_map
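# Hedged usage sketch for the function above: the file pattern and feature
# spec are illustrative placeholders, and queue runners must be started
# before any batch can be pulled.
import tensorflow as tf

features = {"age": tf.FixedLenFeature([1], tf.int64),
            "query": tf.VarLenFeature(tf.string)}
keys, parsed = read_keyed_batch_features(
    "data/train-*.tfrecord", batch_size=32, features=features,
    reader=tf.TFRecordReader, num_epochs=1)
with tf.Session() as sess:
  sess.run(tf.initialize_local_variables())  # needed when num_epochs is set
  coord = tf.train.Coordinator()
  threads = tf.train.start_queue_runners(sess=sess, coord=coord)
  try:
    while not coord.should_stop():
      key_vals, batch = sess.run([keys, parsed])
  except tf.errors.OutOfRangeError:
    pass  # input exhausted after num_epochs passes
  finally:
    coord.request_stop()
    coord.join(threads)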
def input_producer(input_tensor, element_shape=None, num_epochs=None, shuffle=True, seed=None, capacity=32, shared_name=None, summary_name=None, name=None): """Output the rows of `input_tensor` to a queue for an input pipeline. Args: input_tensor: A tensor with the rows to produce. Must be at least one-dimensional. Must either have a fully-defined shape, or `element_shape` must be defined. element_shape: (Optional.) A `TensorShape` representing the shape of a row of `input_tensor`, if it cannot be inferred. num_epochs: (Optional.) An integer. If specified `input_producer` produces each row of `input_tensor` `num_epochs` times before generating an `OutOfRange` error. If not specified, `input_producer` can cycle through the rows of `input_tensor` an unlimited number of times. shuffle: (Optional.) A boolean. If true, the rows are randomly shuffled within each epoch. seed: (Optional.) An integer. The seed to use if `shuffle` is true. capacity: (Optional.) The capacity of the queue to be used for buffering the input. shared_name: (Optional.) If set, this queue will be shared under the given name across multiple sessions. summary_name: (Optional.) If set, a scalar summary for the current queue size will be generated, using this name as part of the tag. name: (Optional.) A name for queue. Returns: A queue with the output rows. A `QueueRunner` for the queue is added to the current `QUEUE_RUNNER` collection of the current graph. Raises: ValueError: If the shape of the input cannot be inferred from the arguments. """ with ops.name_scope(name, "input_producer", [input_tensor]): input_tensor = ops.convert_to_tensor(input_tensor, name="input_tensor") element_shape = input_tensor.get_shape()[1:].merge_with(element_shape) if not element_shape.is_fully_defined(): raise ValueError( "Either `input_tensor` must have a fully defined shape " "or `element_shape` must be specified") if shuffle: input_tensor = random_ops.random_shuffle(input_tensor, seed=seed) input_tensor = limit_epochs(input_tensor, num_epochs) q = data_flow_ops.FIFOQueue(capacity=capacity, dtypes=[input_tensor.dtype.base_dtype], shapes=[element_shape], shared_name=shared_name, name=name) enq = q.enqueue_many([input_tensor]) queue_runner.add_queue_runner(queue_runner.QueueRunner(q, [enq])) if summary_name is not None: logging_ops.scalar_summary( "queue/%s/%s" % (q.name, summary_name), math_ops.cast(q.size(), dtypes.float32) * (1. / capacity)) return q
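# A minimal usage sketch for `input_producer`: feed the rows of an in-memory
# matrix through the queue (the 2x3 matrix below is illustrative).
import tensorflow as tf

rows = tf.constant([[1., 2., 3.], [4., 5., 6.]])
q = input_producer(rows, num_epochs=1, shuffle=False, summary_name="rows")
row = q.dequeue()
with tf.Session() as sess:
  sess.run(tf.local_variables_initializer())  # num_epochs creates a local var
  coord = tf.train.Coordinator()
  threads = tf.train.start_queue_runners(sess=sess, coord=coord)
  print(sess.run(row))  # => [1. 2. 3.]
  coord.request_stop()
  coord.join(threads)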
def _enqueue_join(queue, tensor_list_list):
  # One enqueue op per source pipeline; a single QueueRunner round-robins them.
  enqueue_ops = [queue.enqueue(tl) for tl in tensor_list_list]
  queue_runner.add_queue_runner(queue_runner.QueueRunner(queue, enqueue_ops))
def _enqueue_join(queue, tensor_list_list, enqueue_many):
  # Variant of _enqueue_join that can enqueue whole batches per op.
  if enqueue_many:
    enqueue_ops = [queue.enqueue_many(tl) for tl in tensor_list_list]
  else:
    enqueue_ops = [queue.enqueue(tl) for tl in tensor_list_list]
  queue_runner.add_queue_runner(queue_runner.QueueRunner(queue, enqueue_ops))
def _enqueue(queue, tensor_list, threads, enqueue_many):
  # A single pipeline: replicate one enqueue op across `threads` runner threads.
  if enqueue_many:
    enqueue_ops = [queue.enqueue_many(tensor_list)] * threads
  else:
    enqueue_ops = [queue.enqueue(tensor_list)] * threads
  queue_runner.add_queue_runner(queue_runner.QueueRunner(queue, enqueue_ops))
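# The helpers above obtain parallelism differently: `_enqueue` replicates one
# enqueue op across `threads` runner threads (tf.train.batch style), while
# `_enqueue_join` registers one enqueue op per source pipeline
# (tf.train.batch_join style). A minimal sketch using the three-argument
# `_enqueue_join` variant; the toy pipelines are illustrative:
import tensorflow as tf
from tensorflow.python.ops import data_flow_ops

example_a = tf.random_uniform([4])  # element from pipeline A
example_b = tf.random_uniform([4])  # element from pipeline B
q = data_flow_ops.FIFOQueue(capacity=32, dtypes=[tf.float32], shapes=[[4]])
# One queue runner, two enqueue ops: A and B are interleaved into `q`.
_enqueue_join(q, [[example_a], [example_b]], enqueue_many=False)
# Single-pipeline alternative: two threads running the same enqueue op.
# _enqueue(q, [example_b], threads=2, enqueue_many=False)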
def apply_gradients(self, grads_and_vars, worker_id, global_step=None, name=None, collect_cdfs=False): """Apply gradients to variables. This contains most of the synchronization implementation and also wraps the apply_gradients() from the real optimizer. Args: grads_and_vars: List of (gradient, variable) pairs as returned by compute_gradients(). global_step: Optional Variable to increment by one after the variables have been updated. name: Optional name for the returned operation. Default to the name passed to the Optimizer constructor. Returns: train_op: The op to dequeue a token so the replicas can exit this batch and start the next one. This is executed by each replica. Raises: ValueError: If the grads_and_vars is empty. ValueError: If global step is not provided, the staleness cannot be checked. """ if not grads_and_vars: raise ValueError("Must supply at least one variable") if global_step is None: raise ValueError("Global step is required to check staleness") self._global_step = global_step train_ops = [] aggregated_grad = [] var_list = [] # worker_id_list_printer = logging_ops.Print(global_step, # [a for a in self._worker_idx_list] + [worker_id] + [global_step], # message="Worker ID list status") # train_ops.append(worker_id_list_printer) self._local_step = variables.Variable( initial_value=0, trainable=False, collections=[ops.GraphKeys.LOCAL_VARIABLES], dtype=global_step.dtype.base_dtype, name="sync_rep_local_step") self.local_step_init_op = state_ops.assign(self._local_step, global_step._ref()) chief_init_ops = [self.local_step_init_op] self.ready_for_local_init_op = variables.report_uninitialized_variables( variables.all_variables()) # The wait op waits for the current worker to dequeue a token from its respective token queue self._wait_op = self._sync_token_queues[worker_id].dequeue() # Replicas have to wait until they can get a token from the token queue # BEFORE begining to compute gradients. 
with ops.device(global_step.device): queue_size = self._sync_token_queues[worker_id].size() update_local_step_op = state_ops.assign(self._local_step, global_step._ref()) # Gradient accum creation with ops.name_scope(None, self._name): worker_idx_list = [] worker_counter = 0 for grad, var in grads_and_vars: var_list.append(var) tf.logging.info("Grad " + str(grad) + " assigned to " + str(var.device)) with ops.device(var.device): if grad is None: continue elif isinstance(grad, ops.Tensor): grad_accum = data_flow_ops.ConditionalAccumulator( grad.dtype, shape=var.get_shape(), shared_name=var.name + "/grad_accum") else: if not isinstance(grad, ops.IndexedSlices): raise ValueError("Unknown grad type!") grad_accum = data_flow_ops.SparseConditionalAccumulator( grad.dtype, shape=(), shared_name=var.name + "/grad_accum") self._accumulator_list.append((grad_accum, var)) with ops.device(global_step.device): worker_idx_list.append(worker_id) worker_counter += 1 worker_id_list_printer = logging_ops.Print(global_step, [a for a in worker_idx_list] + [worker_id] + [global_step], message="Worker ID list status") train_ops.append(worker_id_list_printer) counter_printer = logging_ops.Print(global_step, [worker_counter], message="Test for the counter") train_ops.append(counter_printer) """# Phase 1 gradient computation with ops.control_dependencies([update_local_step_op]): for index, (grad, var) in enumerate(grads_and_vars): with ops.device(var.device): if grad is None: continue elif isinstance(grad, ops.Tensor): grad_accum = self._accumulator_list[index][0] train_ops.append(grad_accum.apply_grad(grad, local_step=self._local_step._ref())) else: if not isinstance(grad, ops.IndexedSlices): raise ValueError("Unknown grad type!") grad_accum = self._accumulator_list[index][0] train_ops.append(grad_accum.apply_indexed_slices_grad( grad, local_step=self._local_step._ref()))""" # Phase 1 gradient computation with ops.control_dependencies([update_local_step_op]): for index, (grad, var) in enumerate(grads_and_vars): print_start_op = logging_ops.Print(global_step, [global_step], message="Starting to apply grads for variable %d" % index) train_ops.append(print_start_op) with ops.device(var.device): if grad is None: continue elif isinstance(grad, ops.Tensor): grad_accum = self._accumulator_list[index][0] with ops.control_dependencies([print_start_op]): with tf.device("job:worker/task:%d" % worker_id): apply_grad_op = grad_accum.apply_grad(grad, local_step=self._local_step._ref()) with ops.control_dependencies([apply_grad_op]): accum_sizes_printer = logging_ops.Print(global_step, [x[0].num_accumulated() for x in self._accumulator_list] + [worker_id] + [global_step], message="Accum aggregated status") ret = tf.cond(tf.greater(self._accumulator_list[0][0].num_accumulated(), self._constant_for_comparison), lambda: tf.constant(1), lambda: tf.constant(0)) notification_printer = logging_ops.Print(global_step, [ret], message="should stop notification") train_ops.append(notification_printer) '''else: notification_printer = logging_ops.Print(global_step, ["shouldn't stop"], message="shouldn't stop notification") train_ops.append(notification_printer)''' train_ops.append(accum_sizes_printer) # worker_id_list_printer = logging_ops.Print(global_step, # [len(self._worker_list)] + [worker_id] + [global_step], # message="Worker ID list status") # train_ops.append(worker_id_list_printer) finished_print_op = logging_ops.Print(global_step, [global_step], message="Done applying grads for variable %d" % index) 
train_ops.append(finished_print_op) else: if not isinstance(grad, ops.IndexedSlices): raise ValueError("Unknown grad type!") grad_accum = self._accumulator_list[index][0] with ops.control_dependencies([print_start_op]): with tf.device("job:worker/task:%d" % worker_id): apply_grad_op = grad_accum.apply_indexed_slices_grad( grad, local_step=self._local_step._ref()) with ops.control_dependencies([apply_grad_op]): accum_sizes_printer_parse = logging_ops.Print(global_step, [x[0].num_accumulated() for x in self._accumulator_list] + [worker_id] + [global_step], message="Accum aggregated status") ret_sparse = tf.cond(tf.greater(self._accumulator_list[0][0].num_accumulated(), self._constant_for_comparison), lambda: tf.constant(1), lambda: tf.constant(0)) notification_printer_sparse = logging_ops.Print(global_step, [ret_sparse], message="should stop notification") train_ops.append(notification_printer_sparse) #else: # notification_printer_sparse = logging_ops.Print(global_step, ["shouldn't stop"], message="shouldn't stop notification") # train_ops.append(notification_printer_sparse) train_ops.append(accum_sizes_printer_parse) # worker_id_list_printer_sparse = logging_ops.Print(global_step, # [len(self._worker_list)] + [worker_id] + [global_step], # message="Worker ID list status") # train_ops.append(worker_id_list_printer_sparse) finished_print_op = logging_ops.Print(global_step, [global_step], message="Done applying grads for variable %d" % index) train_ops.append(finished_print_op) with ops.control_dependencies([apply_grad_op]): accum_sizes_printer = logging_ops.Print(global_step, [x[0].num_accumulated() for x in self._accumulator_list] + [worker_id] + [global_step], message="Accum aggregated status on ps") train_ops.append(accum_sizes_printer) # Phase 2 gradient applying for index, (grad, var) in enumerate(grads_and_vars): with ops.device(var.device): grad_accum = self._accumulator_list[index][0] if grad is None: aggregated_grad.append(None) elif isinstance(grad, ops.Tensor): if collect_cdfs: aggregated_grad.append(grad_accum.take_grad(self._total_num_replicas)) else: aggregated_grad.append(grad_accum.take_grad(1)) else: if collect_cdfs: aggregated_grad.append(grad_accum.take_grad(self._total_num_replicas)) else: aggregated_grad.append(grad_accum.take_indexed_slices_grad(1)) aggregated_grads_and_vars = zip(aggregated_grad, var_list) # Some debug operations self.print_sizes = logging_ops.Print(global_step, [self._sync_token_queues[i].size() for i in range(self._total_num_replicas)], message="queue sizes") self.print_accum_sizes = logging_ops.Print(self._local_step, [x[0].num_accumulated() for x in self._accumulator_list] + [worker_id], message="Accum sizes") self.print_local_step = logging_ops.Print(self._local_step, [self._local_step._ref(), global_step._ref()], message="local vs global step") # sync_op will be assigned to the same device as the global step. with ops.device(global_step.device), ops.name_scope(""): with ops.control_dependencies([self.print_accum_sizes]): update_op = self._opt.apply_gradients(aggregated_grads_and_vars, global_step) self._update_op = update_op with ops.control_dependencies([update_op]): sync_op = [] for cur_worker_id in range(self._total_num_replicas): sync_op.append(self._sync_token_queues[cur_worker_id].enqueue(global_step)) sync_op = control_flow_ops.group(*(sync_op)) # dummy_queue is passed to the queue runner. Don't use the real queues # because the queue runner doesn't automatically reopen it once it # closed queues in PS devices. 
dummy_queue = ( data_flow_ops.FIFOQueue(1, types_pb2.DT_INT32, shapes=(), shared_name="dummy_queue")) self._chief_queue_runner = queue_runner.QueueRunner(dummy_queue, [sync_op]) with ops.device(global_step.device), ops.name_scope(""): with ops.control_dependencies(train_ops): # Worker finished applying gradients. Add token to phase1_finished_queue train_op = logging_ops.Print(self._local_step._ref(), [x[0].num_accumulated() for x in self._accumulator_list] + [worker_id] + [global_step], message="Finished worker updates", name="FinishedWorkerUpdatesPrint") for accum, var in self._accumulator_list: with ops.device(var.device): chief_init_ops.append( accum.set_global_step( global_step, name="SetGlobalStep")) self.chief_init_op = control_flow_ops.group(*(chief_init_ops)) self._gradients_applied = True return train_op
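# Hedged usage sketch for the fork above. The wrapper's class name and
# constructor are not shown in this excerpt, so `ForkedSyncOptimizer` is a
# hypothetical stand-in; the members exercised (`_wait_op`, `chief_init_op`,
# `_chief_queue_runner`) are the ones apply_gradients actually creates.
# Assumes a built model (`loss`, `global_step`), a running session `sess`,
# and a worker index `task_index`.
opt = ForkedSyncOptimizer(tf.train.GradientDescentOptimizer(0.1))  # hypothetical
grads_and_vars = opt._opt.compute_gradients(loss)
train_op = opt.apply_gradients(grads_and_vars, worker_id=task_index,
                               global_step=global_step)
sess.run(opt._wait_op)  # block until this worker is handed a token
sess.run(train_op)      # contribute gradients; the chief's runner refills tokens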
def _read_keyed_batch_examples_helper(file_pattern,
                                      batch_size,
                                      reader,
                                      randomize_input=True,
                                      num_epochs=None,
                                      queue_capacity=10000,
                                      num_threads=1,
                                      read_batch_size=1,
                                      filter_fn=None,
                                      parse_fn=None,
                                      setup_shared_queue=False,
                                      name=None,
                                      seed=None):
  """Adds operations to read, queue, batch `Example` protos.

  Args:
    file_pattern: List of files or patterns of file paths containing
      `Example` records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int or scalar `Tensor` specifying the batch size to use.
    reader: A function or class that returns an object with `read` method,
      (filename tensor) -> (example tensor).
    randomize_input: Whether the input should be randomized.
    num_epochs: Integer specifying the number of times to read through the
      dataset. If `None`, cycles through the dataset forever. NOTE - If
      specified, creates a variable that must be initialized, so call
      `tf.local_variables_initializer()` and run the op in a session.
    queue_capacity: Capacity for input queue.
    num_threads: The number of threads enqueuing examples.
    read_batch_size: An int or scalar `Tensor` specifying the number of
      records to read at once.
    filter_fn: Filtering function, takes both keys as well as `Example`
      Tensors and returns a boolean mask of the same shape as the input
      Tensors to be applied for filtering. If `None`, no filtering is done.
    parse_fn: Parsing function, takes `Example` Tensor returns parsed
      representation. If `None`, no parsing is done.
    setup_shared_queue: Whether to set up a shared queue for file names.
    name: Name of resulting op.
    seed: An integer (optional). Seed used if randomize_input == True.

  Returns:
    Returns tuple of:
    - `Tensor` of string keys.
    - String `Tensor` of batched `Example` proto.

  Raises:
    ValueError: for invalid inputs.
  """
  # Retrieve files to read.
  file_names = _get_file_names(file_pattern, randomize_input)

  # Check input parameters are given and reasonable.
  if (not queue_capacity) or (queue_capacity <= 0):
    raise ValueError('Invalid queue_capacity %s.' % queue_capacity)
  if (batch_size is None) or (
      (not isinstance(batch_size, ops.Tensor)) and
      (batch_size <= 0 or batch_size >= queue_capacity)):
    raise ValueError('Invalid batch_size %s, with queue_capacity %s.' %
                     (batch_size, queue_capacity))
  if (read_batch_size is None) or (
      (not isinstance(read_batch_size, ops.Tensor)) and
      (read_batch_size <= 0)):
    raise ValueError('Invalid read_batch_size %s.' % read_batch_size)
  if (not num_threads) or (num_threads <= 0):
    raise ValueError('Invalid num_threads %s.' % num_threads)
  if (num_epochs is not None) and (num_epochs <= 0):
    raise ValueError('Invalid num_epochs %s.' % num_epochs)

  with ops.name_scope(name, 'read_batch_examples', [file_pattern]) as scope:
    with ops.name_scope('file_name_queue') as file_name_queue_scope:
      if setup_shared_queue:
        file_name_queue = data_flow_ops.FIFOQueue(
            capacity=1, dtypes=[dtypes.string], shapes=[[]])
        enqueue_op = file_name_queue.enqueue(
            input_pipeline_ops.seek_next(
                file_names,
                shuffle=randomize_input,
                num_epochs=num_epochs,
                seed=seed))
        queue_runner.add_queue_runner(
            queue_runner.QueueRunner(file_name_queue, [enqueue_op]))
      else:
        file_name_queue = input_ops.string_input_producer(
            constant_op.constant(file_names, name='input'),
            shuffle=randomize_input,
            num_epochs=num_epochs,
            name=file_name_queue_scope,
            seed=seed)

    example_list = _get_examples(file_name_queue, reader, num_threads,
                                 read_batch_size, filter_fn, parse_fn)

    enqueue_many = read_batch_size > 1

    if num_epochs is None:
      allow_smaller_final_batch = False
    else:
      allow_smaller_final_batch = True

    # Setup batching queue given list of read example tensors.
    if randomize_input:
      if isinstance(batch_size, ops.Tensor):
        min_after_dequeue = int(queue_capacity * 0.4)
      else:
        min_after_dequeue = max(queue_capacity - (3 * batch_size), batch_size)
      queued_examples_with_keys = input_ops.shuffle_batch_join(
          example_list,
          batch_size,
          capacity=queue_capacity,
          min_after_dequeue=min_after_dequeue,
          enqueue_many=enqueue_many,
          name=scope,
          allow_smaller_final_batch=allow_smaller_final_batch,
          seed=seed)
    else:
      queued_examples_with_keys = input_ops.batch_join(
          example_list,
          batch_size,
          capacity=queue_capacity,
          enqueue_many=enqueue_many,
          name=scope,
          allow_smaller_final_batch=allow_smaller_final_batch)

    if parse_fn and isinstance(queued_examples_with_keys, dict):
      queued_keys = queued_examples_with_keys.pop(KEY_FEATURE_NAME)
      return queued_keys, queued_examples_with_keys
    return queued_examples_with_keys
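# Hedged usage sketch for the helper above (it is module-private in
# tf.contrib.learn and normally reached through the public
# read_keyed_batch_examples wrapper; the file pattern is illustrative):
keys, examples = _read_keyed_batch_examples_helper(
    'data/train-*.tfrecord', batch_size=64, reader=tf.TFRecordReader,
    randomize_input=True, num_epochs=1, num_threads=2)
# `examples` holds serialized Example protos; parse with
# parsing_ops.parse_example(...), then initialize local variables and start
# queue runners before pulling batches, as in the earlier examples.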
def prefetch_queue(tensors, capacity=8, num_threads=1, dynamic_pad=False, shared_name=None, name=None): """Creates a queue to prefetch tensors from `tensors`. A queue runner for enqueuing tensors into the prefetch_queue is automatically added to the TF QueueRunners collection. Example: This is for example useful to pre-assemble input batches read with `tf.compat.v1.train.batch()` and enqueue the pre-assembled batches. Ops that dequeue from the pre-assembled queue will not pay the cost of assembling the batch. images, labels = tf.compat.v1.train.batch([image, label], batch_size=32, num_threads=4) batch_queue = prefetch_queue([images, labels]) images, labels = batch_queue.dequeue() logits = Net(images) loss = Loss(logits, labels) Args: tensors: A list or dictionary of `Tensors` to enqueue in the buffer. capacity: An integer. The maximum number of elements in the queue. num_threads: An integer. Number of threads running the enqueue op. dynamic_pad: Boolean. Whether to allow variable dimensions in input shapes. shared_name: (optional). If set, this queue will be shared under the given name across multiple sessions. name: (Optional) A name for the operations. Returns: A queue from which you can dequeue tensors with the same type and shape as `tensors`. """ if isinstance(tensors, dict): # Need to wrap the keys and values in list() since Python3 returns views. # We sort the keys so the order is consistent across runs. names = list(sorted(tensors.keys())) tensor_list = list([tensors[n] for n in names]) else: names = None tensor_list = tensors with ops.name_scope(name, "prefetch_queue", tensor_list) as name: dtypes = [t.dtype for t in tensor_list] shapes = [t.get_shape() for t in tensor_list] queue = _which_queue(dynamic_pad)( capacity=capacity, dtypes=dtypes, shapes=shapes, names=names, shared_name=shared_name) enqueue_op = queue.enqueue(tensors) queue_runner.add_queue_runner( queue_runner.QueueRunner(queue, [enqueue_op] * num_threads)) summary.scalar( "fraction_of_%d_full" % capacity, math_ops.cast(queue.size(), _dtypes.float32) * (1. / capacity)) return queue
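# `tensors` may also be a dict, in which case `dequeue()` returns a dict with
# the same keys. A minimal sketch (the shapes below are illustrative):
import tensorflow as tf

images = tf.zeros([32, 224, 224, 3])
labels = tf.zeros([32], dtype=tf.int64)
batch_queue = prefetch_queue({"images": images, "labels": labels},
                             capacity=4, num_threads=2)
batch = batch_queue.dequeue()
# batch["images"] and batch["labels"] keep the enqueued dtypes and shapes.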
def apply_gradients(self, grads_and_vars, global_step=None, name=None): """Apply gradients to variables. This contains most of the synchronization implementation and also wraps the apply_gradients() from the real optimizer. Args: grads_and_vars: List of (gradient, variable) pairs as returned by compute_gradients(). global_step: Optional Variable to increment by one after the variables have been updated. name: Optional name for the returned operation. Default to the name passed to the Optimizer constructor. Returns: train_op: The op to dequeue a token so the replicas can exit this batch and start the next one. This is executed by each replica. Raises: ValueError: If the grads_and_vars is empty. ValueError: If global step is not provided, the staleness cannot be checked. """ if not grads_and_vars: raise ValueError("Must supply at least one variable") if global_step is None: raise ValueError("Global step is required to check staleness") self._global_step = global_step train_ops = [] aggregated_grad = [] var_list = [] self._local_step = variables.Variable( initial_value=0, trainable=False, collections=[ops.GraphKeys.LOCAL_VARIABLES], dtype=global_step.dtype.base_dtype, name="sync_rep_local_step") self.local_step_init_op = state_ops.assign(self._local_step, global_step) chief_init_ops = [self.local_step_init_op] self.ready_for_local_init_op = variables.report_uninitialized_variables( variables.global_variables()) with ops.name_scope(None, self._name): for grad, var in grads_and_vars: var_list.append(var) with ops.device(var.device): # Dense gradients. if grad is None: aggregated_grad.append(None) # pass-through. continue elif isinstance(grad, ops.Tensor): grad_accum = data_flow_ops.ConditionalAccumulator( grad.dtype, shape=var.get_shape(), shared_name=var.name + "/grad_accum") train_ops.append(grad_accum.apply_grad( grad, local_step=self._local_step)) aggregated_grad.append(grad_accum.take_grad( self._replicas_to_aggregate)) else: if not isinstance(grad, ops.IndexedSlices): raise ValueError("Unknown grad type!") grad_accum = data_flow_ops.SparseConditionalAccumulator( grad.dtype, shape=(), shared_name=var.name + "/grad_accum") train_ops.append(grad_accum.apply_indexed_slices_grad( grad, local_step=self._local_step)) aggregated_grad.append(grad_accum.take_indexed_slices_grad( self._replicas_to_aggregate)) self._accumulator_list.append((grad_accum, var.device)) aggregated_grads_and_vars = zip(aggregated_grad, var_list) # sync_op will be assigned to the same device as the global step. with ops.device(global_step.device), ops.name_scope(""): update_op = self._opt.apply_gradients(aggregated_grads_and_vars, global_step) # Create token queue. with ops.device(global_step.device), ops.name_scope(""): sync_token_queue = ( data_flow_ops.FIFOQueue(-1, global_step.dtype.base_dtype, shapes=(), name="sync_token_q", shared_name="sync_token_q")) self._sync_token_queue = sync_token_queue # dummy_queue is passed to the queue runner. Don't use the real queues # because the queue runner doesn't automatically reopen it once it # closed queues in PS devices. dummy_queue = ( data_flow_ops.FIFOQueue(1, types_pb2.DT_INT32, shapes=(), name="dummy_queue", shared_name="dummy_queue")) with ops.device(global_step.device), ops.name_scope(""): # Replicas have to wait until they can get a token from the token queue. 
with ops.control_dependencies(train_ops): token = sync_token_queue.dequeue() train_op = state_ops.assign(self._local_step, token) with ops.control_dependencies([update_op]): # Sync_op needs to insert tokens to the token queue at the end of the # step so the replicas can fetch them to start the next step. tokens = array_ops.fill([self._tokens_per_step], global_step) sync_op = sync_token_queue.enqueue_many((tokens,)) if self._variable_averages is not None: with ops.control_dependencies([sync_op]), ops.name_scope(""): sync_op = self._variable_averages.apply( self._variables_to_average) self._chief_queue_runner = queue_runner.QueueRunner(dummy_queue, [sync_op]) for accum, dev in self._accumulator_list: with ops.device(dev): chief_init_ops.append( accum.set_global_step( global_step, name="SetGlobalStep")) self.chief_init_op = control_flow_ops.group(*(chief_init_ops)) self._gradients_applied = True return train_op
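# Canonical usage pattern for this optimizer, per the upstream
# SyncReplicasOptimizer docs (a sketch; `loss`, `global_step`, `is_chief`,
# `sess`, `coord`, and the cluster setup are assumed to already exist):
opt = tf.train.SyncReplicasOptimizer(
    tf.train.GradientDescentOptimizer(0.1),
    replicas_to_aggregate=50, total_num_replicas=50)
train_op = opt.minimize(loss, global_step=global_step)
if is_chief:
  sess.run(opt.chief_init_op)          # also sets the accumulators' global step
  sess.run(opt.get_init_tokens_op())   # seed the token queue
  opt.get_chief_queue_runner().create_threads(sess, coord=coord, start=True)
sess.run(train_op)  # each replica: contribute gradients, then take a token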
def apply_gradients(self, grads_and_vars, global_step=None, name=None): """Apply gradients to variables. This contains most of the synchronization implementation and also wraps the apply_gradients() from the real optimizer. Args: grads_and_vars: List of (gradient, variable) pairs as returned by compute_gradients(). global_step: Optional Variable to increment by one after the variables have been updated. name: Optional name for the returned operation. Default to the name passed to the Optimizer constructor. Returns: train_op: The op to dequeue a token so the replicas can exit this batch and start the next one. This is executed by each replica. Raises: ValueError: If the grads_and_vars is empty. ValueError: If global step is not provided, the staleness cannot be checked. """ if not grads_and_vars: raise ValueError("Must supply at least one variable") if global_step is None: raise ValueError("Global step is required to check staleness") self._global_step = global_step train_ops = [] aggregated_grad = [] inputs = [] var_list = [] for x in grads_and_vars: inputs.extend(list(x)) with ops.device(global_step.device): self._local_steps = variables.Variable(array_ops.zeros( [self._total_num_replicas], dtype=global_step.dtype), trainable=False, name="local_steps") # Check staleness. Note that this has to be ref(), otherwise identity will # be accessed and it will be old values. local_step = array_ops.slice( self._local_steps._ref(), # pylint: disable=protected-access array_ops.reshape(self._replica_id, (1, )), [1], name="get_local_step") local_step = array_ops.reshape(local_step, ()) is_stale = math_ops.less(local_step, global_step) with ops.name_scope(name, self._name, inputs) as name: for grad, var in grads_and_vars: var_list.append(var) with ops.device(var.device): if isinstance(grad, ops.Tensor): gradient_queue = (data_flow_ops.FIFOQueue( self._tokens_per_step * 2, grad.dtype, shapes=var.get_shape(), shared_name=var.name)) self._one_element_queue_list.append( (gradient_queue, var.device)) train_ops.append(gradient_queue.enqueue([grad])) # Aggregate all gradients gradients = gradient_queue.dequeue_many( self._replicas_to_aggregate) aggregated_grad.append( math_ops.reduce_sum(gradients, [0])) elif grad is None: aggregated_grad.append(None) # pass-through. else: if not isinstance(grad, ops.IndexedSlices): raise ValueError("Unknown grad type!") aggregated_grad.append( self._aggregate_sparse_grad(grad, var, train_ops)) aggregated_grads_and_vars = zip(aggregated_grad, var_list) # sync_op will be assigned to the same device as the global step. with ops.device(global_step.device), ops.name_scope(""): update_op = self._opt.apply_gradients( aggregated_grads_and_vars, global_step) # Create token queue. with ops.device(global_step.device), ops.name_scope(""): sync_token_queue = (data_flow_ops.FIFOQueue( -1, global_step.dtype.base_dtype, shapes=(), shared_name="sync_token_q")) self._sync_token_queue = sync_token_queue # dummy_queue is passed to the queue runner. Don't use the real queues # because the queue runner doesn't automatically reopen it once it # closed queues in PS devices. dummy_queue = (data_flow_ops.FIFOQueue( 1, types_pb2.DT_INT32, shapes=(), shared_name="dummy_queue")) # Clear all the gradients queues in case there are stale gradients. 
clear_queue_ops = [] with ops.control_dependencies([update_op]): for queue, dev in self._one_element_queue_list: with ops.device(dev): stale_grads = queue.dequeue_many(queue.size()) clear_queue_ops.append(stale_grads) for queue, dev in self._sparse_grad_queues_and_devs: with ops.device(dev): _, stale_indices = queue.dequeue_many(queue.size()) clear_queue_ops.append(stale_indices) with ops.device(global_step.device): self._clean_up_op = control_flow_ops.abort( error_msg="From sync_replicas") # According to the staleness, select between the enqueue op (real_grad) # or no-op (no_op_grad). Effectively dropping all the stale gradients. no_op_grad = lambda: [ control_flow_ops.no_op(name="no_grad_enqueue") ] real_grad = lambda: [control_flow_ops.group(*train_ops)] final_train_ops = control_flow_ops.cond(is_stale, no_op_grad, real_grad) with ops.device(global_step.device), ops.name_scope(""): # Replicas have to wait until they can get a token from the token queue. with ops.control_dependencies([final_train_ops]): token = sync_token_queue.dequeue() train_op = state_ops.scatter_update(self._local_steps, self._replica_id, token, name=name) with ops.control_dependencies(clear_queue_ops): # Sync_op needs to insert tokens to the token queue at the end of the # step so the replicas can fetch them to start the next step. # Note that ref() is used to avoid reading from the identity with old # the step. tokens = array_ops.fill([self._tokens_per_step], global_step._ref()) # pylint: disable=protected-access sync_op = sync_token_queue.enqueue_many((tokens, )) if self._variable_averages is not None: with ops.control_dependencies([sync_op ]), ops.name_scope(""): sync_op = self._variable_averages.apply( self._variables_to_average) self._chief_queue_runner = queue_runner.QueueRunner( dummy_queue, [sync_op]) self._gradients_applied = True return train_op
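# Self-contained sketch of the stale-gradient drain used above: `dequeue_many`
# with the queue's current size empties the queue in a single op.
import tensorflow as tf

q = tf.FIFOQueue(10, tf.float32)
fill = q.enqueue_many(([1.0, 2.0, 3.0],))
drain = q.dequeue_many(q.size())  # dequeues exactly the current contents
with tf.Session() as sess:
  sess.run(fill)
  print(sess.run(drain))     # => [1. 2. 3.]
  print(sess.run(q.size()))  # => 0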
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
  if not grads_and_vars:
    raise ValueError("Must supply at least one variable")

  if global_step is None:
    raise ValueError("Global step is required to check staleness")

  self._global_step = global_step
  train_ops = []
  aggregated_grad = []

  # local_anchor op will be placed on this worker task by default.
  local_anchor = control_flow_ops.no_op()
  # Colocating local_step variable prevents it being placed on the PS.
  with ops.colocate_with(local_anchor):
    self._local_step = variable_scope.variable(
        initial_value=0,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        dtype=global_step.dtype.base_dtype,
        name="local_step")

  self.local_step_init_op = state_ops.assign(self._local_step, global_step)
  chief_init_ops = [self.local_step_init_op]
  self.ready_for_local_init_op = variables.report_uninitialized_variables(
      variables.global_variables())

  var_list = [v for g, v in grads_and_vars]
  velocity_list = [self._var_2_velocity[v] for v in var_list]
  residual_list = [self._var_2_residual[v] for v in var_list]
  density = 0.01  # fraction of residual entries to transmit

  with ops.name_scope(None, self._name):
    for velocity, residual, (grad, var) in zip(velocity_list, residual_list,
                                               grads_and_vars):
      if grad is not None:
        if self._use_nesterov:
          update_velocity = self._momentum * (velocity + grad)
          update_residual = residual + update_velocity + grad
        else:
          update_velocity = self._momentum * velocity + grad
          update_residual = residual + update_velocity
      else:
        update_velocity = velocity
        update_residual = residual

      # Select threshold according to abs(update_residual): keep only the
      # top `density` fraction of entries by magnitude.
      top_k_values, top_k_indices = nn_ops.top_k(
          math_ops.abs(update_residual),
          math_ops.to_int32(
              math_ops.to_float(array_ops.shape(update_residual)[-1]) *
              density))
      threshold = top_k_values[-1]
      mask = math_ops.abs(update_residual) > threshold
      # Cast the mask to the residual's dtype so the elementwise products
      # below are well-typed.
      mask = math_ops.cast(mask, dtype=update_residual.dtype)
      mask_h = math_ops.abs(mask - 1)

      with ops.device(grad.device):
        dense_grad = mask * update_residual
        indices = array_ops.where(math_ops.not_equal(dense_grad, 0))
        values = array_ops.gather_nd(dense_grad, indices)
        sparse_grad = ops.IndexedSlices(values, indices,
                                        dense_grad.get_shape())

      # Dense-accumulator variant kept for reference:
      # grad_update = state_ops.assign(grad, mask * update_residual)
      # with ops.control_dependencies([grad_update]), ops.device(var.device):
      #   grad_accum = data_flow_ops.ConditionalAccumulator(
      #       grad.dtype, shape=var.get_shape(),
      #       shared_name=var.name + "/grad_accum")
      #   train_ops.append(
      #       grad_accum.apply_grad(grad, local_step=self._local_step))
      #   aggregated_grad.append(
      #       grad_accum.take_grad(self._replicas_to_aggregate))

      with ops.device(var.device):
        grad_accum = data_flow_ops.SparseConditionalAccumulator(
            sparse_grad.dtype, shape=(),
            shared_name=var.name + "/grad_accum")
        train_ops.append(grad_accum.apply_indexed_slices_grad(
            sparse_grad, local_step=self._local_step))
        aggregated_grad.append(grad_accum.take_indexed_slices_grad(
            self._replicas_to_aggregate))
        self._accumulator_list.append((grad_accum, var.device))

      # Unsent entries stay in the residual; sent entries are cleared.
      with ops.device(residual.device):
        train_ops.append(
            state_ops.assign(residual, mask_h * update_residual))
      with ops.device(velocity.device):
        train_ops.append(
            state_ops.assign(velocity, mask_h * update_velocity))

  aggregated_grads_and_vars = zip(aggregated_grad, var_list)

  with ops.device(global_step.device), ops.name_scope(""):
    update_op = self._opt.apply_gradients(aggregated_grads_and_vars,
                                          global_step)

  with ops.device(global_step.device), ops.name_scope(""):
    sync_token_queue = (data_flow_ops.FIFOQueue(
        -1, global_step.dtype.base_dtype, shapes=(),
        name="sync_token_q", shared_name="sync_token_q"))
    self._sync_token_queue = sync_token_queue

    dummy_queue = (data_flow_ops.FIFOQueue(
        1, types_pb2.DT_INT32, shapes=(),
        name="dummy_queue", shared_name="dummy_queue"))

    with ops.control_dependencies(train_ops):
      token = sync_token_queue.dequeue()
    train_op = state_ops.assign(self._local_step, token)

    with ops.control_dependencies([update_op]):
      tokens = array_ops.fill([self._tokens_per_step], global_step)
      sync_op = sync_token_queue.enqueue_many((tokens,))

    if self._variable_averages is not None:
      with ops.control_dependencies([sync_op]), ops.name_scope(""):
        sync_op = self._variable_averages.apply(self._variables_to_average)

    self._chief_queue_runner = queue_runner.QueueRunner(dummy_queue,
                                                        [sync_op])

  for accum, dev in self._accumulator_list:
    with ops.device(dev):
      chief_init_ops.append(
          accum.set_global_step(global_step, name="SetGlobalStep"))
  self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
  self._gradients_applied = True
  return train_op
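# Standalone sketch of the density-based sparsification used above: keep the
# top `density` fraction of residual entries by magnitude, zero out the rest.
import tensorflow as tf

residual = tf.random_normal([1000])
density = 0.01
k = tf.maximum(tf.to_int32(tf.to_float(tf.shape(residual)[-1]) * density), 1)
top_k_values, _ = tf.nn.top_k(tf.abs(residual), k)
threshold = top_k_values[-1]
mask = tf.cast(tf.abs(residual) > threshold, residual.dtype)
sparse_update = mask * residual          # what gets pushed to the accumulator
new_residual = (1.0 - mask) * residual   # carried over to the next step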
def get_image_labels(self): if self.is_all_shared: ### ALL SHARED ### img_pre_fn = preprocessing_factory.get_preprocessing(self.FLAGS.preprocessing_name, is_training=True) with tf.device("/cpu:0"): with tf.name_scope("reading"): data_provider = slim.dataset_data_provider.DatasetDataProvider( self.dataset, num_readers=self.FLAGS.num_data_readers, common_queue_capacity=20*self.FLAGS.batch_size, common_queue_min=10*self.FLAGS.batch_size, seed=self.rank) [image, label] = data_provider.get(['image', 'label']) with tf.name_scope("to-preprocessing"): capacity = 20 * self.FLAGS.batch_size to_pre_queue = data_flow_ops.FIFOQueue(capacity=capacity, dtypes=[image.dtype, label.dtype], shapes=None, name="to_pre_queue") to_pre_op = to_pre_queue.enqueue([image, label]) queue_runner.add_queue_runner(queue_runner.QueueRunner(to_pre_queue, [to_pre_op] * Pipeline.QR_THREADS)) tf.summary.scalar("to_pre_fraction_of_%d_full" % capacity, math_ops.to_float(to_pre_queue.size()) * (1. / capacity)) image, label = to_pre_queue.dequeue() with tf.name_scope("preprocessing"):#TODO image = img_pre_fn(image, self.train_image_size, self.train_image_size, fast_mode=self.FLAGS.fast_mode) with tf.name_scope("to-allgather"): capacity = 20 * self.FLAGS.batch_size to_allg_queue = data_flow_ops.FIFOQueue(capacity=capacity, dtypes=[image.dtype, label.dtype], shapes=[[self.train_image_size, self.train_image_size, 3], []], name="to_allgather_queue")#[image.get_shape(), label.get_shape()]) queue_runner.add_queue_runner(queue_runner.QueueRunner(to_allg_queue, [to_allg_queue.enqueue([image, label])] * Pipeline.QR_THREADS)) tf.summary.scalar("to_allgather_fraction_of_%d_full" % capacity, math_ops.to_float(to_allg_queue.size()) * (1. / capacity)) # num_preprocessors = tf.placeholder(tf.int32, shape=[], name="num_preprocessors) # self.num_hvd_send_tensor = send_images, send_labels = to_allg_queue.dequeue_many(self.num_hvd_send) # if rank == #TODO all_images = hvd.allgather(send_images, name="hvd_allgather") all_labels = hvd.allgather(send_labels, name="hvd_allgather") #TODO: Remove extra queues with tf.name_scope("to-compute"): capacity = 30 * self.FLAGS.batch_size to_compute_queue = data_flow_ops.FIFOQueue(capacity=capacity, dtypes=[image.dtype, label.dtype], shapes=[[self.train_image_size, self.train_image_size, 3], []],#TODO name="to_compute_queue")#[image.get_shape(), label.get_shape()]) queue_runner.add_queue_runner(queue_runner.QueueRunner(to_compute_queue, [to_compute_queue.enqueue_many([all_images, all_labels])]))#1 thread! tf.summary.scalar("to_compute_fraction_of_%d_full" % capacity, math_ops.to_float(to_compute_queue.size()) * (1. 
/ capacity)) image, label = to_compute_queue.dequeue() elif self.is_single_bcast: ### SINGLE BROADCAST ### img_pre_fn = preprocessing_factory.get_preprocessing(self.FLAGS.preprocessing_name, is_training=True) allg_images_name = "allgather-images-op" allg_labels_name = "allgather-labels-op" bcast_images_name = "bcast-images-op" bcast_labels_name = "bcast-labels-op" if 0 in self.member_of_group: #If we belong to group 0, initialize the reading and preprocessing pipeline with tf.device("/cpu:0"): with tf.name_scope("reading"): data_provider = slim.dataset_data_provider.DatasetDataProvider( self.dataset, num_readers=self.FLAGS.num_data_readers, common_queue_capacity=20*self.FLAGS.batch_size, common_queue_min=10*self.FLAGS.batch_size, seed=self.rank) [image, label] = data_provider.get(['image', 'label']) image, label = create_qr("to-pre", 10 * self.FLAGS.batch_size, [image, label], None, [image.dtype, label.dtype], Pipeline.QR_THREADS, False, False) with tf.name_scope("preprocessing"): image = img_pre_fn(image, self.train_image_size, self.train_image_size, fast_mode=self.FLAGS.fast_mode) send_images, send_labels = create_qr("to-allg", 10 * self.FLAGS.batch_size, [image, label], [[self.train_image_size, self.train_image_size, 3], []], [image.dtype, label.dtype], Pipeline.QR_THREADS, False, True, self.num_hvd_send) all_images = hvd.allgather(send_images, group=0, name=allg_images_name) all_labels = hvd.allgather(send_labels, group=0, name=allg_labels_name) all_images, all_labels = create_qr("to-bcast", 20 * self.FLAGS.batch_size, [all_images, all_labels], [[self.train_image_size, self.train_image_size, 3], []], [post_pre_image_dtype, post_pre_label_dtype], 1, True, True, self.images_per_bcast) if 1 in self.member_of_group: # For the middle man rank, reset all_images and all_labels # names to their broadcasted tensors so that the bcast is # performed. Note that the bcast root is rank 0 since the # group1 sent to init had this rank listed first, meaning that # the resulting mpi group comm has this rank has rank 0 if len(self.member_of_group) == 1: # Then not middle man, so construct holder variable WITH CORRECT NAME! # tf.Variable(self.num_hvd_send? 
all_images = tf.zeros([self.images_per_bcast, self.train_image_size, self.train_image_size, 3], dtype=post_pre_image_dtype) all_labels = tf.zeros([self.images_per_bcast] , dtype=post_pre_label_dtype) #shape of [] turns into 1D instead of 0D all_images = hvd.broadcast(all_images, 0, group=1, name=bcast_images_name) all_labels = hvd.broadcast(all_labels, 0, group=1, name=bcast_labels_name) image, label = create_qr("to-compute", 20 * self.FLAGS.batch_size, [all_images, all_labels], [[self.train_image_size, self.train_image_size, 3], []], [post_pre_image_dtype, post_pre_label_dtype], 1, True, False) elif self.is_multi_bcast: ### MULTIPLE BROADCAST # print("Rank:", rank, member_of_group, group_rank_list) img_pre_fn = preprocessing_factory.get_preprocessing(self.FLAGS.preprocessing_name, is_training=True) # allg_image_name = "allgathered-image" # need some naming commonalities # allg_label_name = "allgathered-label" allg_images_name = "allgather-images-op" allg_labels_name = "allgather-labels-op" bcast_images_name = "bcast-images-op" bcast_labels_name = "bcast-labels-op" # if 0 in member_of_group: #If we belong to group 0, initialize the reading and preprocessing pipeline if self.rank < self.FLAGS.num_pre: with tf.device("/cpu:0"): with tf.name_scope("reading"): data_provider = slim.dataset_data_provider.DatasetDataProvider( self.dataset, num_readers=self.FLAGS.num_data_readers, common_queue_capacity=20*self.FLAGS.batch_size, common_queue_min=10*self.FLAGS.batch_size, seed=self.rank) [image, label] = data_provider.get(['image', 'label']) image, label = create_qr("to-pre", 10 * self.FLAGS.batch_size, [image, label], None, [image.dtype, label.dtype], Pipeline.QR_THREADS, False, False) with tf.name_scope("preprocessing"): image = img_pre_fn(image, self.train_image_size, self.train_image_size, fast_mode=self.FLAGS.fast_mode) # image = tf.Print(image, ["using preprocessed image"]) send_images, send_labels = create_qr("to-bcast", 20 * self.FLAGS.batch_size, [image, label], [[self.train_image_size, self.train_image_size, 3], []], [image.dtype, label.dtype], 2 * Pipeline.QR_THREADS, False, True, self.images_per_bcast) else: send_images = tf.zeros([self.images_per_bcast, self.train_image_size, self.train_image_size, 3], dtype=post_pre_image_dtype) send_labels = tf.zeros([self.images_per_bcast] , dtype=post_pre_label_dtype) with tf.device("/cpu:0"): bcast_images_root = "broadcast-images-" bcast_labels_root = "broadcast-labels-" bcast_images_per_group = [hvd.broadcast(send_images, i, group=i, name=bcast_images_root + str(i)) for i in range(self.FLAGS.num_pre)] bcast_labels_per_group = [hvd.broadcast(send_labels, i, group=i, name=bcast_labels_root + str(i)) for i in range(self.FLAGS.num_pre)] with tf.name_scope("to-compute"): capacity = 30 * self.FLAGS.batch_size to_compute_q = data_flow_ops.FIFOQueue(capacity=capacity, dtypes=[post_pre_image_dtype, post_pre_label_dtype], shapes=[[self.train_image_size, self.train_image_size, 3], []], name="to-compute-queue") to_comp_ops = [to_compute_q.enqueue_many([bcast_images_per_group[i], bcast_labels_per_group[i]]) for i in range(self.FLAGS.num_pre)] queue_runner.add_queue_runner(queue_runner.QueueRunner(to_compute_q, to_comp_ops)) tf.summary.scalar("to_compute_fraction_of_%d_full" % capacity, math_ops.to_float(to_compute_q.size()) * (1. / capacity)) image, label = to_compute_q.dequeue() return image, label
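# `create_qr` is called above but not defined in this excerpt. A hypothetical
# implementation, consistent with the call sites (a named FIFOQueue with a
# fullness summary, a QueueRunner with `threads` enqueue ops, and optional
# enqueue_many/dequeue_many behavior), might look like this sketch:
import tensorflow as tf
from tensorflow.python.ops import data_flow_ops, math_ops
from tensorflow.python.training import queue_runner

def create_qr(scope_name, capacity, tensors, shapes, dtypes_, threads,
              enqueue_many, dequeue_many, n=None):
  with tf.name_scope(scope_name):
    q = data_flow_ops.FIFOQueue(capacity=capacity, dtypes=dtypes_,
                                shapes=shapes, name=scope_name + "_queue")
    enq = q.enqueue_many(tensors) if enqueue_many else q.enqueue(tensors)
    queue_runner.add_queue_runner(
        queue_runner.QueueRunner(q, [enq] * threads))
    tf.summary.scalar("%s_fraction_of_%d_full" % (scope_name, capacity),
                      math_ops.to_float(q.size()) * (1. / capacity))
    return q.dequeue_many(n) if dequeue_many else q.dequeue()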
def apply_gradients(self, grads_and_vars, worker_id, global_step=None,
                    name=None, collect_cdfs=False,
                    # batch_idx_list=None, worker_kill_list=None,
                    # num_workers=None,
                    matrix_to_solve=None, num_batches_per_epoch=None):
  """Apply gradients to variables.

  This contains most of the synchronization implementation and also wraps the
  apply_gradients() from the real optimizer.

  Args:
    grads_and_vars: List of (gradient, variable) pairs as returned by
      compute_gradients().
    worker_id: Index of the worker whose sync token queue gates this replica.
    global_step: Optional Variable to increment by one after the variables
      have been updated.
    name: Optional name for the returned operation. Defaults to the name
      passed to the Optimizer constructor.
    collect_cdfs: If True, take aggregated gradients only after
      `self._replicas_to_aggregate` contributions instead of just one.
    matrix_to_solve: Matrix A of the least-squares system A w = 1 whose
      solution gives per-worker gradient weights.
    num_batches_per_epoch: Number of rows of the least-squares system.

  Returns:
    train_op: The op to dequeue a token so the replicas can exit this batch
      and start the next one. This is executed by each replica.

  Raises:
    ValueError: If the grads_and_vars is empty.
    ValueError: If global step is not provided, the staleness cannot be
      checked.
  """
  if not grads_and_vars:
    raise ValueError("Must supply at least one variable")

  if global_step is None:
    raise ValueError("Global step is required to check staleness")

  self._global_step = global_step
  train_ops = []
  aggregated_grad = []
  var_list = []

  self._local_step = variables.Variable(
      initial_value=0,
      trainable=False,
      collections=[ops.GraphKeys.LOCAL_VARIABLES],
      dtype=global_step.dtype.base_dtype,
      name="sync_rep_local_step")
  self.local_step_init_op = state_ops.assign(self._local_step,
                                             global_step._ref())
  chief_init_ops = [self.local_step_init_op]
  self.ready_for_local_init_op = variables.report_uninitialized_variables(
      variables.all_variables())

  # The wait op waits for the current worker to dequeue a token from its
  # respective token queue.
  self._wait_op = self._sync_token_queues[worker_id].dequeue()

  # Replicas have to wait until they can get a token from the token queue
  # BEFORE beginning to compute gradients.
  with ops.device(global_step.device):
    queue_size = self._sync_token_queues[worker_id].size()
    update_local_step_op = state_ops.assign(self._local_step,
                                            global_step._ref())

  # Gradient accum creation
  with ops.name_scope(None, self._name):
    for grad, var in grads_and_vars:
      var_list.append(var)
      tf.logging.info("Grad " + str(grad) + " assigned to " + str(var.device))
      with ops.device(var.device):
        if grad is None:
          continue
        elif isinstance(grad, ops.Tensor):
          grad_accum = data_flow_ops.ConditionalAccumulator(
              grad.dtype,
              shape=var.get_shape(),
              shared_name=var.name + "/grad_accum")
        else:
          if not isinstance(grad, ops.IndexedSlices):
            raise ValueError("Unknown grad type!")
          grad_accum = data_flow_ops.SparseConditionalAccumulator(
              grad.dtype, shape=(), shared_name=var.name + "/grad_accum")
        self._accumulator_list.append((grad_accum, var))

    """# Phase 1 gradient computation
    with ops.control_dependencies([update_local_step_op]):
      for index, (grad, var) in enumerate(grads_and_vars):
        with ops.device(var.device):
          if grad is None:
            continue
          elif isinstance(grad, ops.Tensor):
            grad_accum = self._accumulator_list[index][0]
            train_ops.append(grad_accum.apply_grad(
                grad, local_step=self._local_step._ref()))
          else:
            if not isinstance(grad, ops.IndexedSlices):
              raise ValueError("Unknown grad type!")
            grad_accum = self._accumulator_list[index][0]
            train_ops.append(grad_accum.apply_indexed_slices_grad(
                grad, local_step=self._local_step._ref()))"""

    # Phase 1 gradient computation
    with ops.control_dependencies([update_local_step_op]):
      for index, (grad, var) in enumerate(grads_and_vars):
        print_start_op = logging_ops.Print(
            global_step, [global_step],
            message="Starting to apply grads for variable %d" % index)
        train_ops.append(print_start_op)
        with ops.device(var.device):
          ps_step_printer0 = logging_ops.Print(
              global_step, [global_step],
              message="global step printer0 on ps")
          train_ops.append(ps_step_printer0)

          '''Implement LS computation and solution here'''
          # b = np.ones(int(num_batches_per_epoch))
          b = tf.ones([int(num_batches_per_epoch), 1], tf.float32)
          A = matrix_to_solve
          # A_for_calc = np.transpose(A)
          LS_solution = linalg_ops.matrix_solve_ls(A, b, fast=False)
          LS_calc = tf.reshape(LS_solution, [-1])
          weight = tf.slice(LS_calc, [worker_id], [1])
          # print_ls_op = logging_ops.Print(LS_calc, [LS_calc],
          #                                 message="Solution for LS!")
          # train_ops.append(print_ls_op)
          weighted_grad = tf.scalar_mul(weight[0], grad)

          '''Kill some workers'''
          if grad is None:
            continue
          elif isinstance(grad, ops.Tensor):
            grad_accum = self._accumulator_list[index][0]
            num_accum = grad_accum.num_accumulated()
            tf.logging.info("Grad Accumulated %s, Worker ID: %s" %
                            (str(num_accum), str(worker_id)))
            with ops.control_dependencies([print_start_op]):
              with tf.device("job:worker/task:%d" % worker_id):
                apply_grad_op = grad_accum.apply_grad(
                    grad,  # alternatively: weighted_grad
                    local_step=self._local_step._ref())
                with ops.control_dependencies([apply_grad_op]):
                  finished_print_op = logging_ops.Print(
                      global_step, [global_step],
                      message="Done applying grads for variable %d" % index)
                  train_ops.append(finished_print_op)
          else:
            if not isinstance(grad, ops.IndexedSlices):
              raise ValueError("Unknown grad type!")
            grad_accum = self._accumulator_list[index][0]
            with ops.control_dependencies([print_start_op]):
              with tf.device("job:worker/task:%d" % worker_id):
                apply_grad_op = grad_accum.apply_indexed_slices_grad(
                    grad,  # alternatively: weighted_grad
                    local_step=self._local_step._ref())
                with ops.control_dependencies([apply_grad_op]):
                  finished_print_op = logging_ops.Print(
                      global_step, [global_step],
                      message="Done applying grads for variable %d" % index)
                  train_ops.append(finished_print_op)

    # Phase 2 gradient applying
    for index, (grad, var) in enumerate(grads_and_vars):
      with ops.device(var.device):
        work_idx_print1 = logging_ops.Print(
            worker_id, [worker_id], message="worker id for aggregate grad")
        ps_step_printer1 = logging_ops.Print(
            global_step, [global_step],
            message="global step printer1 on ps")
        num_replica_aggregate = logging_ops.Print(
            self._replicas_to_aggregate, [self._replicas_to_aggregate],
            message="num replica aggregate")
        train_ops.append(work_idx_print1)
        train_ops.append(ps_step_printer1)
        train_ops.append(num_replica_aggregate)
        grad_accum = self._accumulator_list[index][0]

        if grad is None:
          aggregated_grad.append(None)
        elif isinstance(grad, ops.Tensor):
          if collect_cdfs:
            # aggregated_grad.append(
            #     grad_accum.take_grad(self._total_num_replicas))
            aggregated_grad.append(
                grad_accum.take_grad(self._replicas_to_aggregate))
          else:
            aggregated_grad.append(grad_accum.take_grad(1))
        else:
          if collect_cdfs:
            # aggregated_grad.append(
            #     grad_accum.take_grad(self._total_num_replicas))
            aggregated_grad.append(
                grad_accum.take_grad(self._replicas_to_aggregate))
          else:
            aggregated_grad.append(grad_accum.take_indexed_slices_grad(1))

    aggregated_grads_and_vars = zip(aggregated_grad, var_list)

    # Some debug operations
    self.print_sizes = logging_ops.Print(
        global_step,
        [self._sync_token_queues[i].size()
         for i in range(self._total_num_replicas)],
        message="queue sizes")
    self.print_accum_sizes = logging_ops.Print(
        self._local_step,
        [x[0].num_accumulated() for x in self._accumulator_list] + [worker_id],
        message="Accum sizes")
    self.print_local_step = logging_ops.Print(
        self._local_step, [self._local_step._ref(), global_step._ref()],
        message="local vs global step")

    # sync_op will be assigned to the same device as the global step.
    with ops.device(global_step.device), ops.name_scope(""):
      with ops.control_dependencies([self.print_accum_sizes]):
        update_op = self._opt.apply_gradients(aggregated_grads_and_vars,
                                              global_step)
        self._update_op = update_op
        with ops.control_dependencies([update_op]):
          sync_op = []
          for cur_worker_id in range(self._total_num_replicas):
            sync_op.append(
                self._sync_token_queues[cur_worker_id].enqueue(global_step))
          sync_op = control_flow_ops.group(*(sync_op))

    # dummy_queue is passed to the queue runner. Don't use the real queues
    # because the queue runner doesn't automatically reopen it once it
    # closed queues in PS devices.
    dummy_queue = (
        data_flow_ops.FIFOQueue(1,
                                types_pb2.DT_INT32,
                                shapes=(),
                                shared_name="dummy_queue"))
    self._chief_queue_runner = queue_runner.QueueRunner(dummy_queue,
                                                        [sync_op])

    with ops.device(global_step.device), ops.name_scope(""):
      with ops.control_dependencies(train_ops):
        # Worker finished applying gradients. Add token to
        # phase1_finished_queue.
        train_op = logging_ops.Print(
            self._local_step._ref(),
            [x[0].num_accumulated()
             for x in self._accumulator_list] + [worker_id],
            message="Finished worker updates",
            name="FinishedWorkerUpdatesPrint")

    for accum, var in self._accumulator_list:
      with ops.device(var.device):
        chief_init_ops.append(
            accum.set_global_step(global_step, name="SetGlobalStep"))

    self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
    self._gradients_applied = True

    return train_op
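# Standalone sketch of the least-squares weighting used above: solve A w = 1
# for per-worker combination weights (gradient-coding style), then scale this
# worker's gradient by its weight. The 4x4 matrix is illustrative.
import tensorflow as tf

num_batches = 4
A = tf.constant([[1., 0., 1., 0.], [0., 1., 0., 1.],
                 [1., 1., 0., 0.], [0., 0., 1., 1.]])
b = tf.ones([num_batches, 1], tf.float32)
w = tf.reshape(tf.matrix_solve_ls(A, b, fast=False), [-1])
worker_id = 2
weight = tf.slice(w, [worker_id], [1])
weighted_grad = tf.scalar_mul(weight[0], tf.random_normal([10]))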
import tensorflow as tf
from tensorflow.python.training import queue_runner

# Illustrative sizes and input; the original snippet assumed these existed.
batch_count, step_count, step_size = 4, 3, 2
batch = tf.random_uniform([8, step_count * step_size])

queue = tf.FIFOQueue(32, dtypes=[batch.dtype])
enqueue_ops = []
dependency = None

for step_index in range(step_count):
  step = tf.strided_slice(batch, [0, step_index * step_size],
                          [tf.shape(batch)[0], (step_index + 1) * step_size])
  # Chain the enqueues with control dependencies so the steps of one batch
  # enter the queue in order.
  if dependency is None:
    step = queue.enqueue(step)
  else:
    with tf.control_dependencies([dependency]):
      step = queue.enqueue(step)
  dependency = step
  enqueue_ops.append(step)

queue_runner.add_queue_runner(
    queue_runner.QueueRunner(queue=queue,
                             enqueue_ops=[tf.group(*enqueue_ops)]))
step = queue.dequeue()

supervisor = tf.train.Supervisor()
with supervisor.managed_session() as session:
  for batch_index in range(batch_count):
    for step_index in range(step_count):
      print("Batch %d, step %d" % (batch_index, step_index))
      print(session.run(step))
def _read_keyed_batch_examples_helper(file_pattern, batch_size, reader, randomize_input=True, num_epochs=None, queue_capacity=10000, num_threads=1, read_batch_size=1, parse_fn=None, setup_shared_queue=False, name=None): # Retrieve files to read. file_names = _get_file_names(file_pattern, randomize_input) # Check input parameters are given and reasonable. if (not queue_capacity) or (queue_capacity <= 0): raise ValueError('Invalid queue_capacity %s.' % queue_capacity) if (batch_size is None) or ( (not isinstance(batch_size, ops.Tensor)) and (batch_size <= 0 or batch_size > queue_capacity)): raise ValueError( 'Invalid batch_size %s, with queue_capacity %s.' % (batch_size, queue_capacity)) if (read_batch_size is None) or ( (not isinstance(read_batch_size, ops.Tensor)) and (read_batch_size <= 0)): raise ValueError('Invalid read_batch_size %s.' % read_batch_size) if (not num_threads) or (num_threads <= 0): raise ValueError('Invalid num_threads %s.' % num_threads) if (num_epochs is not None) and (num_epochs <= 0): raise ValueError('Invalid num_epochs %s.' % num_epochs) with ops.name_scope(name, 'read_batch_examples', [file_pattern]) as scope: with ops.name_scope('file_name_queue') as file_name_queue_scope: if setup_shared_queue: shared_file_name_queue = _get_shared_file_name_queue( file_names, randomize_input, num_epochs, file_name_queue_scope) file_name_queue = data_flow_ops.FIFOQueue( capacity=1, dtypes=[dtypes.string], shapes=[[]]) enqueue_op = file_name_queue.enqueue(shared_file_name_queue.dequeue()) queue_runner.add_queue_runner( queue_runner.QueueRunner(file_name_queue, [enqueue_op])) else: file_name_queue = input_ops.string_input_producer( constant_op.constant( file_names, name='input'), shuffle=randomize_input, num_epochs=num_epochs, name=file_name_queue_scope) example_list = _get_examples(file_name_queue, reader, num_threads, read_batch_size, parse_fn) enqueue_many = read_batch_size > 1 if num_epochs is not None: allow_smaller_final_batch = True else: allow_smaller_final_batch = False # Setup batching queue given list of read example tensors. if randomize_input: if isinstance(batch_size, ops.Tensor): min_after_dequeue = int(queue_capacity * 0.4) else: min_after_dequeue = max(queue_capacity - (3 * batch_size), batch_size) queued_examples_with_keys = input_ops.shuffle_batch_join( example_list, batch_size, capacity=queue_capacity, min_after_dequeue=min_after_dequeue, enqueue_many=enqueue_many, name=scope, allow_smaller_final_batch=allow_smaller_final_batch) else: queued_examples_with_keys = input_ops.batch_join( example_list, batch_size, capacity=queue_capacity, enqueue_many=enqueue_many, name=scope, allow_smaller_final_batch=allow_smaller_final_batch) if parse_fn and isinstance(queued_examples_with_keys, dict): queued_keys = queued_examples_with_keys.pop(KEY_FEATURE_NAME) return queued_keys, queued_examples_with_keys return queued_examples_with_keys
def bucket(tensors,
           which_bucket,
           batch_size,
           num_buckets,
           num_threads=1,
           capacity=32,
           shapes=None,
           dynamic_pad=False,
           allow_smaller_final_batch=False,
           keep_input=None,
           shared_name=None,
           name=None):
  """Lazy bucketing of input tensors according to `which_bucket`.

  The argument `tensors` can be a list or a dictionary of tensors.
  The value returned by the function will be of the same type
  as `tensors`.

  The tensors entering this function are put into the bucket given by
  `which_bucket`.  Each bucket has its own queue.  When a bucket contains
  `batch_size` elements, this minibatch is pushed onto a top queue.  The
  tensors returned from this function are the result of dequeueing the
  next minibatch from this top queue.

  This function is implemented using several queues. A `QueueRunner` for the
  queues is added to the current `Graph`'s `QUEUE_RUNNER` collection.

  As the returned tensors are the result of a dequeue operation, evaluating
  them will throw a `tf.errors.OutOfRangeError` when the input queue is
  exhausted.  If these tensors are feeding another input queue, its queue
  runner will catch this exception; however, if they are used in your main
  thread you are responsible for catching this yourself.

  *N.B.:* If `dynamic_pad` is `False`, you must ensure that either
  (i) the `shapes` argument is passed, or (ii) all of the tensors in
  `tensors` have fully-defined shapes.  `ValueError` will be
  raised if neither of these conditions holds.

  If `dynamic_pad` is `True`, it is sufficient that the *rank* of the
  tensors is known, but individual dimensions may have shape `None`.
  In this case, for each enqueue the dimensions with value `None`
  may have a variable length; upon dequeue, the output tensors will be padded
  on the right to the maximum shape of the tensors in the current minibatch.
  For numbers, this padding takes value 0.  For strings, this padding is
  the empty string.  See `PaddingFIFOQueue` for more info.

  If `allow_smaller_final_batch` is `True`, a smaller batch value than
  `batch_size` is returned when the queues are closed and there are not enough
  elements to fill the batch, otherwise the pending elements are discarded.
  In addition, all output tensors' static shapes, as accessed via the
  `get_shape()` method, will have a 0th `Dimension` value of `None`, and
  operations that depend on a fixed batch_size would fail.

  Args:
    tensors: The list or dictionary of tensors, representing a single element,
      to bucket.  Nested lists are not supported.
    which_bucket: An `int32` scalar Tensor taking a value in
      `[0, num_buckets)`.
    batch_size: The new batch size pulled from the queue (all queues will have
      the same size).  If a list is passed in then each bucket will have a
      different batch_size.
      (python int, int32 scalar or iterable of integers of length num_buckets).
    num_buckets: A python integer, the number of buckets.
    num_threads: An integer.  The number of threads enqueuing `tensors`.
    capacity: An integer.  The maximum number of minibatches in the top queue,
      and also the maximum number of elements within each bucket.
    shapes: (Optional) The shapes for each example.  Defaults to the
      inferred shapes for `tensors`.
    dynamic_pad: Boolean.  Allow variable dimensions in input shapes.
      The given dimensions are padded upon dequeue so that tensors within a
      batch have the same shapes.
    allow_smaller_final_batch: (Optional) Boolean.  If `True`, allow the final
      batches to be smaller if there are insufficient items left in the
      queues.
    keep_input: (Optional).  A `bool` scalar Tensor.
      If provided, this tensor controls whether the input is added to the
      queue or not.  If it evaluates `True`, then `tensors` are added to the
      bucket; otherwise they are dropped.  This tensor essentially acts as a
      filtering mechanism.  The default behavior is to assume
      `keep_input=True`.
    shared_name: (Optional).  If set, the queues will be shared under the
      given name across multiple sessions.
    name: (Optional) A name for the operations.

  Returns:
    A tuple `(bucket, outputs)` where `bucket` is an `int32` scalar tensor
    and `outputs` is a list or dictionary of batched outputs corresponding
    to elements of `tensors`.  Every step will receive a new bucket of
    outputs.

  Raises:
    ValueError: If the `shapes` are not specified and cannot be inferred from
      the elements of `tensors`, or if `batch_size` is a sequence but its
      length != num_buckets.
  """
  batch_size_per_bucket = False
  if isinstance(batch_size, (list, tuple)):
    batch_size_per_bucket = True
    if len(batch_size) != num_buckets:
      raise ValueError(
          "If batch_size is a list it must have num_buckets elements")
  else:
    batch_size = [batch_size] * num_buckets
  tensor_list = _as_tensor_list(tensors)
  with ops.name_scope(name, "bucket", tensor_list) as name:
    tensor_list = _validate_bucket(tensor_list)
    (tensor_list, sparse_info) = _store_sparse_tensors(
        tensor_list, enqueue_many=False,
        keep_input=constant_op.constant(True))

    # Round-trip batch_size to a tensor, and possibly back.
    for i, bucket_batch_size in enumerate(batch_size):
      bucket_batch_size = ops.convert_to_tensor(
          bucket_batch_size, dtype=dtypes.int32, name="batch_size")
      static_batch_size = tensor_util.constant_value(bucket_batch_size)
      batch_size[i] = (static_batch_size if static_batch_size is not None
                       else bucket_batch_size)

    types = _dtypes([tensor_list])
    shapes = _shapes([tensor_list], shapes, enqueue_many=False)

    which_bucket = ops.convert_to_tensor(
        which_bucket, dtype=dtypes.int32, name="which_bucket")

    queue_creator = _which_queue(dynamic_pad)
    bucket_queues = []
    for i in range(num_buckets):
      shared_name_i = ("%s_%d" % (shared_name, i)
                       if shared_name is not None else None)
      bucket_queues.append(
          queue_creator(
              capacity=capacity,
              dtypes=types,
              shapes=shapes,
              shared_name=shared_name_i,
              name="bucket_queue_%d" % i))

    maybe_static_batch_size = (
        None if (allow_smaller_final_batch or batch_size_per_bucket)
        else static_batch_size)

    bucket_shapes = [
        tensor_shape.vector(maybe_static_batch_size).concatenate(s)
        for s in bucket_queues[0].shapes
    ]
    # top_queue is a PaddingFIFOQueue even if the bucket queues are regular
    # FIFO queues because if we use allow_smaller_final_batch, shapes will
    # contain Nones in their first entry; as a result, a regular FIFOQueue
    # would die when being passed shapes that are not fully defined.
    top_queue = data_flow_ops.PaddingFIFOQueue(
        capacity=capacity,
        dtypes=[dtypes.int32] + types,
        shapes=[tensor_shape.scalar()] + bucket_shapes,
        shared_name=shared_name,
        name="top_queue")

    def enqueue_which():
      """Enqueue `tensor_list` into the queue selected by `which_bucket`."""

      def enqueue_single(i):
        return bucket_queues[i].enqueue(tensor_list)

      enqueues = [
          control_flow_ops.cond(
              math_ops.equal(which_bucket, i),
              functools.partial(enqueue_single, i), control_flow_ops.no_op)
          for i in range(num_buckets)
      ]
      return control_flow_ops.group(*enqueues, name="group_enqueues")

    if keep_input is not None:
      # TODO(ebrevdo): Expand keep_input param to core training
      # methods, and pipe through to _store_sparse_tensors; so
      # that expensive serialization is guarded by keep_input.
      maybe_enqueue = control_flow_ops.cond(keep_input, enqueue_which,
                                            control_flow_ops.no_op)
    else:
      maybe_enqueue = enqueue_which()

    bucket_enqueue_ops = [maybe_enqueue] * num_threads

    if allow_smaller_final_batch:
      which_dequeue = lambda q: q.dequeue_up_to
    else:
      which_dequeue = lambda q: q.dequeue_many

    enqueues_to_top = [
        top_queue.enqueue(
            [constant_op.constant(i)] + which_dequeue(q)(
                bs, name="read_bucket_%d" % i),
            name="enqueue_from_bucket_%d" % i)
        for i, (q, bs) in enumerate(zip(bucket_queues, batch_size))
    ]

    for i, q in enumerate(bucket_queues):
      queue_runner.add_queue_runner(
          queue_runner.QueueRunner(
              q, [enqueues_to_top[i]],
              queue_closed_exception_types=(errors.OutOfRangeError,
                                            errors.CancelledError)))
    queue_runner.add_queue_runner(
        queue_runner.QueueRunner(
            top_queue,
            bucket_enqueue_ops,
            queue_closed_exception_types=(errors.OutOfRangeError,
                                          errors.CancelledError)))

    for q in bucket_queues:
      summary.scalar("bucket/%s/size" % q.name,
                     math_ops.cast(q.size(), dtypes.float32))
    summary.scalar(
        "bucket/%s/fraction_of_%d_full" % (top_queue.name, capacity),
        math_ops.cast(top_queue.size(), dtypes.float32) * (1. / capacity))

    dequeued = top_queue.dequeue(name="dequeue_top")
    which_bucket_dequeued = dequeued[0]
    dequeued = dequeued[1:]
    dequeued = _restore_sparse_tensors(dequeued, sparse_info)
    return (which_bucket_dequeued, _as_original_type(tensors, dequeued))
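# A hedged usage sketch (not part of this module): bucketing variable-length
# sequences by length so that each minibatch holds similarly-sized examples.
# `sequence` is assumed to be a rank-1 tensor from an upstream input pipeline
# and `num_buckets` a small python int; with `dynamic_pad=True` only the rank
# of the inputs needs to be known.
#
#   length = tf.shape(sequence)[0]
#   which = tf.minimum(length // 10, num_buckets - 1)  # ~10 steps per bucket
#   bucket_id, (padded_sequences, lengths) = bucket(
#       tensors=[sequence, length],
#       which_bucket=which,
#       batch_size=32,
#       num_buckets=num_buckets,
#       dynamic_pad=True,               # pad to the longest in the minibatch
#       allow_smaller_final_batch=True)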
def queue_parsed_features(parsed_features,
                          keys=None,
                          feature_queue_capacity=100,
                          num_queue_runners=None,
                          num_enqueue_threads=None,
                          name=None):
  """Speeds up parsing by using queues to do it asynchronously.

  This function adds the tensors in `parsed_features` to a queue, which allows
  the parsing (or any other expensive op before this) to be asynchronous with
  respect to the rest of the training graph. This greatly improves read
  latency and speeds up training since the data will already be parsed and
  ready when each step of training needs it.

  All queue runners are added to the queue runners collection, and may be
  started via `start_queue_runners`.

  All ops are added to the default graph.

  Args:
    parsed_features: A dict of string key to `Tensor` or `SparseTensor`
      objects.
    keys: `Tensor` of string keys.
    feature_queue_capacity: Capacity of the parsed features queue.
    num_queue_runners: Deprecated. Defaults to 2 if this and
      `num_enqueue_threads` are both `None`. This is the number of queue
      runners to start for the feature queue. Adding multiple queue runners
      for the parsed example queue helps maintain a full queue when the
      subsequent computations overall are cheaper than parsing. This argument
      will be deprecated and replaced with `num_enqueue_threads`.
    num_enqueue_threads: Number of threads to enqueue the parsed example
      queue. Using multiple threads to enqueue the parsed example queue helps
      maintain a full queue when the subsequent computations overall are
      cheaper than parsing. This argument will replace `num_queue_runners`.
      This and `num_queue_runners` can not both be set.
    name: Name of resulting op.

  Returns:
    Returns tuple of:
    - `Tensor` corresponding to `keys` if provided, otherwise `None`.
    - A dict of string key to `Tensor` or `SparseTensor` objects corresponding
      to `parsed_features`.
  Raises:
    ValueError: for invalid inputs.
  """
  num_queue_runners, num_enqueue_threads = _check_enqueue_params(
      num_queue_runners, num_enqueue_threads)

  args = list(parsed_features.values())
  if keys is not None:
    args += [keys]

  with ops.name_scope(name, 'queue_parsed_features', args):
    # Let's also add preprocessed tensors into the queue types for each item
    # of the queue.
    tensors_to_enqueue = []
    # Each entry contains the key, and a boolean which indicates whether the
    # tensor was a sparse tensor.
    tensors_mapping = []
    # TODO(sibyl-Aix6ihai): Most of the functionality here is about pushing
    # sparse tensors into a queue. This could be taken care of somewhere else
    # so others can reuse it. Also, QueueBase may be extended to handle sparse
    # tensors directly.
    for key in sorted(parsed_features.keys()):
      tensor = parsed_features[key]
      if isinstance(tensor, ops.SparseTensor):
        tensors_mapping.append((key, True))
        tensors_to_enqueue.extend(
            [tensor.indices, tensor.values, tensor.shape])
      else:
        tensors_mapping.append((key, False))
        tensors_to_enqueue.append(tensor)

    if keys is not None:
      tensors_to_enqueue.append(keys)

    queue_dtypes = [x.dtype for x in tensors_to_enqueue]
    input_queue = data_flow_ops.FIFOQueue(feature_queue_capacity,
                                          queue_dtypes)

    # Add a summary op to debug if our feature queue is full or not.
    logging_ops.scalar_summary(
        'queue/parsed_features/%s/fraction_of_%d_full' %
        (input_queue.name, feature_queue_capacity),
        math_ops.cast(input_queue.size(), dtypes.float32) *
        (1. / feature_queue_capacity))

    # Add multiple queue runners so that the queue is always full. Adding
    # more than two queue-runners may hog the cpu on the worker to fill up
    # the queue.
    #
    # Note: this can result in a large last batch being lost, as the multiple
    # queue runner threads do not coordinate with each other. Please use
    # `num_enqueue_threads` instead.
    if num_queue_runners is not None:
      for _ in range(num_queue_runners):
        queue_runner.add_queue_runner(
            queue_runner.QueueRunner(
                input_queue, [input_queue.enqueue(tensors_to_enqueue)],
                queue_closed_exception_types=(errors.OutOfRangeError,
                                              errors.CancelledError)))
    # Use a single QueueRunner with multiple threads to enqueue so the queue
    # is always full. The threads are coordinated so the last batch will not
    # be lost.
    elif num_enqueue_threads is not None:
      enqueue_ops = [
          input_queue.enqueue(tensors_to_enqueue)
          for _ in range(num_enqueue_threads)
      ]
      queue_runner.add_queue_runner(
          queue_runner.QueueRunner(
              input_queue,
              enqueue_ops,
              queue_closed_exception_types=(errors.OutOfRangeError,
                                            errors.CancelledError)))
    else:
      raise AssertionError(
          'Either `num_queue_runners` or `num_enqueue_threads` should have '
          'been set.')

    dequeued_tensors = input_queue.dequeue()

    # Reset shapes on dequeued tensors.
    for i in range(len(tensors_to_enqueue)):
      dequeued_tensors[i].set_shape(tensors_to_enqueue[i].get_shape())

    # Recreate feature mapping according to the original dictionary.
    dequeued_parsed_features = {}
    index = 0
    for key, is_sparse_tensor in tensors_mapping:
      if is_sparse_tensor:
        # Three tensors are (indices, values, shape).
        dequeued_parsed_features[key] = ops.SparseTensor(
            dequeued_tensors[index], dequeued_tensors[index + 1],
            dequeued_tensors[index + 2])
        index += 3
      else:
        dequeued_parsed_features[key] = dequeued_tensors[index]
        index += 1

    dequeued_keys = None
    if keys is not None:
      dequeued_keys = dequeued_tensors[-1]

    return dequeued_keys, dequeued_parsed_features
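# A hedged usage sketch (hypothetical feature spec and inputs), pairing this
# function with `parsing_ops.parse_example` so that parsing runs
# asynchronously with respect to the training step:
#
#   features = {'age': parsing_ops.FixedLenFeature([], dtypes.int64),
#               'tokens': parsing_ops.VarLenFeature(dtypes.string)}
#   parsed = parsing_ops.parse_example(serialized_examples, features)
#   keys, parsed = queue_parsed_features(
#       parsed, keys=example_keys, num_enqueue_threads=4)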
def set_many_fed_tensors(self, tensors):
  """Sets batches of fed tensors."""
  enq_op = self._local_q.enqueue_many(tensors)
  queue_runner.add_queue_runner(
      queue_runner.QueueRunner(self._local_q, [enq_op]))
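# Note on the enqueue_many semantics used above (a sketch, assuming
# `self._local_q` is a queue whose components match `tensors`):
# `enqueue_many` splits each tensor along dimension 0, so values of shape
# [N, ...] enqueue N separate elements with a single op, e.g. with a
# hypothetical `feeder` instance:
#
#   feeder.set_many_fed_tensors(
#       [array_ops.placeholder(dtypes.float32, shape=[None, 4])])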