def _generate_enqueue_op(self, inputs, name_prefix, index, device=None, tpu_ordinal=-1): """Generate a host-side Op to enqueue a tuple to the queue. If device is None the inputs are all required to have the same device specification, and the enqueue Op is colocated with inputs[0]. Otherwise the enqueue Op is placed on 'device'. Args: inputs: a list of Tensors with the types and shapes of the tuple elements. name_prefix: the base name for the Op. index: the shard index, used to uniquify the Op name. device: device to place the Op on, or None if it should be colocated with the inputs. tpu_ordinal: ordinal of the TPU device on the host to use for infeed if device is a CPU device. Should be set to -1 if device is a TPU device. Returns: An Op corresponding to a shard of infeed enqueued at the host, suitable for use within a replicated block. Raises: ValueError: if device is None and inputs do not all have the same device specification. """ full_name = "%s/%d" % (name_prefix, index) shapes = [t.shape for t in inputs] if device is None: devices = [t.device for t in inputs] for i in xrange(1, self.number_of_tuple_elements): if devices[0] != devices[i]: raise ValueError( "input devices for shard %d are %s, but should all be the same" % (index, str(devices))) with ops.colocate_with(inputs[0]): return tpu_ops.infeed_enqueue_tuple( inputs=inputs, shapes=shapes, name=full_name, device_ordinal=tpu_ordinal) else: with ops.device(device): return tpu_ops.infeed_enqueue_tuple( inputs=inputs, shapes=shapes, name=full_name, device_ordinal=tpu_ordinal)
def generate_enqueue_ops(self, sharded_inputs): """Generates the host-side Ops to enqueue the partitioned inputs. sharded_inputs is a list, one for each replica, of lists of Tensors. sharded_inputs[i] is the tuple of Tensors to use to feed replica i. sharded_inputs[i][j] is partitioned by self._input_partition_dims[j]. For example, if sharded_inputs[i][j] is a 2-D Tensor: [[A, B, C, D], [E ,F, G, H]] self._input_partition_dims[j] is [2, 4]. sharded_inputs[i][j] will be partitioned and flattened into: [A, B, C, D, E, F, G, H] and fed into the logical core ids: [0, 1, 2, 3, 4, 5, 6, 7] respectively. Args: sharded_inputs: a list of lists of Tensors. The length of the outer list determines the number of shards. Each inner list indicates the types and shapes of the tuples in the corresponding shard. Returns: A list of host-side Ops, one for each shard, that when executed together will enqueue a full-size element of infeed. Raises: ValueError: if the queue configuration has previously been frozen and the shapes of the elements of sharded_inputs are not compatible with the frozen configuration; or if the shapes of the elements of sharded_inputs don't form a consistent unsharded tuple; or if the elements of a tuple have different device constraints; or if the partition dims are invalid. TypeError: if the queue configuration has previously been frozen and the types of the elements of sharded_inputs are not compatible with the frozen configuration; or if the types of the elements of sharded_inputs don't form a consistent unsharded tuple. """ self.set_configuration_from_sharded_input_tensors(sharded_inputs) number_of_replicas = len(sharded_inputs) number_of_tuple_elements = len(sharded_inputs[0]) assert len(self._input_partition_dims) == number_of_tuple_elements enqueue_ops = [] for replica_index in range(number_of_replicas): flattened_inputs = sharded_inputs[replica_index] inputs_part_dims_flat = nest.flatten_up_to(flattened_inputs, self._input_partition_dims) inputs_parted_iters = [ iter(self._check_dims_and_partition_or_replicate_on_host(x, dims)) for x, dims in zip(sharded_inputs[replica_index], inputs_part_dims_flat) ] # Find the replica_id of the host's logical core 0. # The self._host_id is guaranteed to contain the logical core 0, # even when num_cores_per_replica > num_cores_per_host -- the function # caller makes sure that this host_id will must be receiving data (calls # input_fn). replica_id = self._device_assignment.lookup_replicas( task_id=self._host_id, logical_core=0)[replica_index] for logical_core in xrange(self._device_assignment.num_cores_per_replica): # Places different partitions to different logic cores. # Since there can be multiple hosts per replica, we need to find # the actual host (device) of this logical core. device = self._device_assignment.host_device( replica=replica_id, logical_core=logical_core) with ops.device(device): ordinal = self._device_assignment.tpu_ordinal( replica=replica_id, logical_core=logical_core) infeed_inputs = [] for it in inputs_parted_iters: input_for_device = next(it, None) if input_for_device is not None: infeed_inputs.append(input_for_device) if infeed_inputs: enqueue_ops.append( tpu_ops.infeed_enqueue_tuple( inputs=infeed_inputs, shapes=[x.shape for x in infeed_inputs], name="enqueue/replica_{0}/input_{1}".format( replica_index, logical_core), device_ordinal=ordinal)) return enqueue_ops
def generate_enqueue_ops(self, per_host_sharded_inputs): """Generates the host-side Ops to enqueue the partitioned inputs. per_host_sharded_inputs is a list, one for each replica, of lists of Tensors. sharded_inputs[i] is the tuple of Tensors to use to feed replica i. sharded_inputs[i][j] is partitioned by self._input_partition_dims[j]. For example, if sharded_inputs[i][j] is a 2-D Tensor: [[A, B, C, D], [E ,F, G, H]] self._input_partition_dims[j] is [2, 4]. sharded_inputs[i][j] will be partitioned and flattened into: [A, B, C, D, E, F, G, H] and fed into the logical core ids: [0, 1, 2, 3, 4, 5, 6, 7] respectively. Args: per_host_sharded_inputs: a list of lists of Tensors. The length of the outer list determines the number of shards. Each inner list indicates the types and shapes of the tuples in the corresponding shard. Returns: A list of host-side Ops, one for each shard, that when executed together will enqueue a full-size element of infeed. Raises: ValueError: if the queue configuration has previously been frozen and the shapes of the elements of sharded_inputs are not compatible with the frozen configuration; or if the shapes of the elements of sharded_inputs don't form a consistent unsharded tuple; or if the elements of a tuple have different device constraints; or if the partition dims are invalid. TypeError: if the queue configuration has previously been frozen and the types of the elements of sharded_inputs are not compatible with the frozen configuration; or if the types of the elements of sharded_inputs don't form a consistent unsharded tuple. """ self.set_configuration_from_sharded_input_tensors(per_host_sharded_inputs) number_of_replicas_per_host = len(per_host_sharded_inputs) number_of_tuple_elements = len(per_host_sharded_inputs[0]) assert len(self._input_partition_dims) == number_of_tuple_elements per_host_enqueue_ops = [] for replica_index in range(number_of_replicas_per_host): flattened_inputs = per_host_sharded_inputs[replica_index] inputs_part_dims_flat = nest.flatten_up_to(flattened_inputs, self._input_partition_dims) inputs_parted_iters = [ iter(self._check_dims_and_partition_or_replicate_on_host(x, dims)) for x, dims in zip(per_host_sharded_inputs[replica_index], inputs_part_dims_flat) ] for logical_core in xrange(self._device_assignment.num_cores_per_replica): # Places different partitions to different logic cores. replica_id = self._device_assignment.lookup_replicas( self._host_id, logical_core)[replica_index] ordinal = self._device_assignment.tpu_ordinal( replica=replica_id, logical_core=logical_core) infeed_inputs = [] for it in inputs_parted_iters: input_for_device = next(it, None) if input_for_device is not None: infeed_inputs.append(input_for_device) if infeed_inputs: per_host_enqueue_ops.append( tpu_ops.infeed_enqueue_tuple( inputs=infeed_inputs, shapes=[x.shape for x in infeed_inputs], name="enqueue/replica_{0}/input_{1}".format( replica_index, logical_core), device_ordinal=ordinal)) return per_host_enqueue_ops