def setUp(self): ops.reset_default_graph() self.scalar_int_feed = array_ops.placeholder(dtypes_lib.int32, ()) self.unk_int64_feed = array_ops.placeholder(dtypes_lib.int64, (None,)) self.vec3_str_feed = array_ops.placeholder(dtypes_lib.string, (3,)) self.sparse_c = sparse_tensor.SparseTensor( indices=[[0]], values=[1.0], dense_shape=[1]) self._coord = coordinator.Coordinator() # Make capacity very large so we can feed all the inputs in the # main thread without blocking input_queue = data_flow_ops.PaddingFIFOQueue( 5000, dtypes=[dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.string], shapes=[(), (None,), (3,)]) self._input_enqueue_op = input_queue.enqueue( (self.scalar_int_feed, self.unk_int64_feed, self.vec3_str_feed)) self.scalar_int, self.unk_int64, self.vec3_str = input_queue.dequeue() self._threads = None self._close_op = input_queue.close() self._sess = None
def testDynamicPad(self): with self.cached_session() as sess: # Create 3 tensors of variable but compatible shapes. var_shape = [None, 2] p1 = constant_op.constant([[1, 2], [3, 4]]) p1.set_shape(var_shape) p2 = constant_op.constant([[5, 6], [7, 8], [9, 10]]) p2.set_shape(var_shape) p3 = constant_op.constant([[11, 12]]) p3.set_shape(var_shape) batch = [p1, p2, p3] batch_size = len(batch) zero64 = constant_op.constant(0, dtype=dtypes.int64) examples = variables.Variable(zero64) counter = examples.count_up_to(batch_size) # Create a PaddingFIFOQueue to enqueue these tensors. q = data_flow_ops.PaddingFIFOQueue(capacity=10, dtypes=[dtypes.int32], shapes=[var_shape]) for tensor in [p1, p2, p3]: q.enqueue([tensor]).run() # Dequeue from the queue and batch them using batch(). batches = input_lib.batch([q.dequeue(), counter], batch_size=batch_size, num_threads=1, dynamic_pad=True) self.assertEqual([batch_size, None, 2], batches[0].shape.as_list()) # Finally, assemble them into prefetch_queue with dynamic_pad. batcher = prefetch_queue.prefetch_queue(batches, dynamic_pad=True) batches = batcher.dequeue() self.assertEqual([batch_size, None, 2], batches[0].shape.as_list()) variables.global_variables_initializer().run() threads = queue_runner_impl.start_queue_runners() values, _ = sess.run(batches) # We enqueued 3 tensors of [None, 2] shapes, so using dynamic_pad # they should be padded to the fixed size [3, 3, 2], where 3 # is the maximum length of the batch. self.assertTrue( np.array_equal( np.array([[[1, 2], [3, 4], [0, 0]], [[5, 6], [7, 8], [9, 10]], [[11, 12], [0, 0], [0, 0]]]), values)) with self.assertRaises(errors_impl.OutOfRangeError): sess.run(batches) for thread in threads: thread.join()
def bucket(tensors, which_bucket, batch_size, num_buckets, num_threads=1, capacity=32, shapes=None, dynamic_pad=False, allow_smaller_final_batch=False, keep_input=None, shared_name=None, name=None): """Lazy bucketing of input tensors according to `which_bucket`. The argument `tensors` can be a list or a dictionary of tensors. The value returned by the function will be of the same type as `tensors`. The tensors entering this function are put into the bucket given by `which_bucket`. Each bucket has its own queue. When a bucket contains `batch_size` elements, this minibatch is pushed onto a top queue. The tensors returned from this function are a the result of dequeueing the next minibatch from this top queue. This function is implemented using several queues. A `QueueRunner` for the queues is added to the current `Graph`'s `QUEUE_RUNNER` collection. As the returned tensors are the result of of a dequeue operation, evaluating them will throw a `tf.errors.OutOfRangeError` when the input queue is exhausted. If these tensors are feeding another input queue, its queue runner will catch this exception, however, if they are used in your main thread you are responsible for catching this yourself. *N.B.:* If `dynamic_pad` is `False`, you must ensure that either (i) the `shapes` argument is passed, or (ii) all of the tensors in `tensors` must have fully-defined shapes. `ValueError` will be raised if neither of these conditions holds. If `dynamic_pad` is `True`, it is sufficient that the *rank* of the tensors is known, but individual dimensions may have shape `None`. In this case, for each enqueue the dimensions with value `None` may have a variable length; upon dequeue, the output tensors will be padded on the right to the maximum shape of the tensors in the current minibatch. For numbers, this padding takes value 0. For strings, this padding is the empty string. See `PaddingFIFOQueue` for more info. If `allow_smaller_final_batch` is `True`, a smaller batch value than `batch_size` is returned when the queues are closed and there are not enough elements to fill the batch, otherwise the pending elements are discarded. In addition, all output tensors' static shapes, as accessed via the `get_shape()` method will have a 0th `Dimension` value of `None`, and operations that depend on fixed batch_size would fail. Args: tensors: The list or dictionary of tensors, representing a single element, to bucket. Nested lists are not supported. which_bucket: An `int32` scalar Tensor taking a value in `[0, num_buckets)`. batch_size: The new batch size pulled from the queue (python int or int32 scalar). num_buckets: A python integer, the number of buckets. num_threads: An integer. The number of threads enqueuing `tensors`. capacity: An integer. The maximum number of minibatches in the top queue, and also the maximum number of elements within each bucket. shapes: (Optional) The shapes for each example. Defaults to the inferred shapes for `tensors`. dynamic_pad: Boolean. Allow variable dimensions in input shapes. The given dimensions are padded upon dequeue so that tensors within a batch have the same shapes. allow_smaller_final_batch: (Optional) Boolean. If `True`, allow the final batches to be smaller if there are insufficient items left in the queues. keep_input: (Optional). A `bool` scalar Tensor. If provided, this tensor controls whether the input is added to the queue or not. If it evaluates `True`, then `tensors` are added to the bucket; otherwise they are dropped. This tensor essentially acts as a filtering mechanism. The default behavior is to assume `keep_input=True`. shared_name: (Optional). If set, the queues will be shared under the given name across multiple sessions. name: (Optional) A name for the operations. Returns: A tuple `(bucket, outputs)` where `bucket` is a `int32` scalar tensor and `outputs` is a list or dictionary of batched outputs corresponding to elements of `tensors`. Every step will receive a new bucket of outputs. Raises: ValueError: If the `shapes` are not specified, and cannot be inferred from the elements of `tensors`. """ tensor_list = _as_tensor_list(tensors) with ops.name_scope(name, "bucket", tensor_list) as name: tensor_list = _validate_bucket(tensor_list) (tensor_list, sparse_info) = _store_sparse_tensors(tensor_list, enqueue_many=False) # Round-trip batch_size to a tensor, and possibly back batch_size = ops.convert_to_tensor(batch_size, dtype=dtypes.int32, name="batch_size") static_batch_size = tensor_util.constant_value(batch_size) batch_size = (static_batch_size if static_batch_size is not None else batch_size) types = _dtypes([tensor_list]) shapes = _shapes([tensor_list], shapes, enqueue_many=False) which_bucket = ops.convert_to_tensor(which_bucket, dtype=dtypes.int32, name="which_bucket") queue_creator = _which_queue(dynamic_pad) bucket_queues = [] for i in range(num_buckets): shared_name_i = ("%s_%d" % (shared_name, i) if shared_name is not None else None) bucket_queues.append( queue_creator(capacity=capacity, dtypes=types, shapes=shapes, shared_name=shared_name_i, name="bucket_queue_%d" % i)) maybe_static_batch_size = (None if allow_smaller_final_batch else static_batch_size) bucket_shapes = [ tensor_shape.vector(maybe_static_batch_size).concatenate(s) for s in bucket_queues[0].shapes ] # top_queue is a PaddingFIFOQueue even if the bucket queues are regular FIFO # queues because if we use allow_smaller_final_batch, shapes will # contain Nones in their first entry; as a result, a regular # FIFOQueue would die when being passed shapes that are not fully defined. top_queue = data_flow_ops.PaddingFIFOQueue( capacity=capacity, dtypes=[dtypes.int32] + types, shapes=[tensor_shape.scalar()] + bucket_shapes, shared_name=shared_name, name="top_queue") def enqueue_which(): def enqueue_single(i): return bucket_queues[i].enqueue(tensor_list) enqueues = [ control_flow_ops.cond(math_ops.equal(which_bucket, i), functools.partial(enqueue_single, i), control_flow_ops.no_op) for i in range(num_buckets) ] return control_flow_ops.group(*enqueues, name="group_enqueues") if keep_input is not None: # TODO(ebrevdo): Expand keep_input param to core training # methods, and pipe through to _store_sparse_tensors; so # that expensive serialization is guarded by keep_input. maybe_enqueue = control_flow_ops.cond(keep_input, enqueue_which, control_flow_ops.no_op) else: maybe_enqueue = enqueue_which() bucket_enqueue_ops = [maybe_enqueue] * num_threads if allow_smaller_final_batch: which_dequeue = lambda q: q.dequeue_up_to else: which_dequeue = lambda q: q.dequeue_many enqueues_to_top = [ top_queue.enqueue( [constant_op.constant(i)] + which_dequeue(q)(batch_size, name="read_bucket_%d" % i), name="enqueue_from_bucket_%d" % i) for i, q in enumerate(bucket_queues) ] for i, q in enumerate(bucket_queues): queue_runner.add_queue_runner( queue_runner.QueueRunner( q, [enqueues_to_top[i]], queue_closed_exception_types=(errors.OutOfRangeError, errors.CancelledError))) queue_runner.add_queue_runner( queue_runner.QueueRunner( top_queue, bucket_enqueue_ops, queue_closed_exception_types=(errors.OutOfRangeError, errors.CancelledError))) for q in bucket_queues: summary.scalar("bucket/%s/size" % q.name, math_ops.cast(top_queue.size(), dtypes.float32)) summary.scalar( "bucket/%s/fraction_of_%d_full" % (top_queue.name, capacity), math_ops.cast(top_queue.size(), dtypes.float32) * (1. / capacity)) dequeued = top_queue.dequeue(name="dequeue_top") which_bucket_dequeued = dequeued[0] dequeued = dequeued[1:] dequeued = _restore_sparse_tensors(dequeued, sparse_info) return (which_bucket_dequeued, _as_original_type(tensors, dequeued))
def _enqueue_data(data, capacity, shuffle=False, min_after_dequeue=None, num_threads=1, seed=None, name="enqueue_input", enqueue_size=1, num_epochs=None, pad_value=None): """Creates a queue filled from a numpy array or pandas `DataFrame`. Returns a queue filled with the rows of the given (`OrderedDict` of) array or `DataFrame`. In the case of a pandas `DataFrame`, the first enqueued `Tensor` corresponds to the index of the `DataFrame`. For (`OrderedDict` of) numpy arrays, the first enqueued `Tensor` contains the row number. Args: data: a numpy `ndarray`, `OrderedDict` of numpy arrays, or a generator yielding `dict`s of numpy arrays or pandas `DataFrame` that will be read into the queue. capacity: the capacity of the queue. shuffle: whether or not to shuffle the rows of the array. min_after_dequeue: minimum number of elements that can remain in the queue after a dequeue operation. Only used when `shuffle` is true. If not set, defaults to `capacity` / 4. num_threads: number of threads used for reading and enqueueing. seed: used to seed shuffling and reader starting points. name: a scope name identifying the data. enqueue_size: the number of rows to enqueue per step. num_epochs: limit enqueuing to a specified number of epochs, if provided. pad_value: default value for dynamic padding of data samples, if provided. Returns: A queue filled with the rows of the given (`OrderedDict` of) array or `DataFrame`. Raises: TypeError: `data` is not a Pandas `DataFrame`, an `OrderedDict` of numpy arrays, a numpy `ndarray`, or a generator producing these. NotImplementedError: padding and shuffling data at the same time. NotImplementedError: padding usage with non generator data type. """ with ops.name_scope(name): if isinstance(data, np.ndarray): types = [dtypes.int64, dtypes.as_dtype(data.dtype)] queue_shapes = [(), data.shape[1:]] get_feed_fn = _ArrayFeedFn elif isinstance(data, collections.OrderedDict): types = [dtypes.int64 ] + [dtypes.as_dtype(col.dtype) for col in data.values()] queue_shapes = [()] + [col.shape[1:] for col in data.values()] get_feed_fn = _OrderedDictNumpyFeedFn elif isinstance(data, tp.FunctionType): x_first_el = six.next(data()) x_first_keys = sorted(x_first_el.keys()) x_first_values = [x_first_el[key] for key in x_first_keys] types = [dtypes.as_dtype(col.dtype) for col in x_first_values] queue_shapes = [col.shape for col in x_first_values] get_feed_fn = _GeneratorFeedFn elif HAS_PANDAS and isinstance(data, pd.DataFrame): types = [ dtypes.as_dtype(dt) for dt in [data.index.dtype] + list(data.dtypes) ] queue_shapes = [() for _ in types] get_feed_fn = _PandasFeedFn else: raise TypeError( "data must be either a numpy array or pandas DataFrame if pandas is " "installed; got {}".format(type(data).__name__)) pad_data = pad_value is not None if pad_data and get_feed_fn is not _GeneratorFeedFn: raise NotImplementedError( "padding is only available with generator usage") if shuffle and pad_data: raise NotImplementedError( "padding and shuffling data at the same time is not implemented" ) # TODO(jamieas): TensorBoard warnings for all warnings below once available. if num_threads > 1 and num_epochs is not None: logging.warning( "enqueue_data was called with num_epochs and num_threads > 1. " "num_epochs is applied per thread, so this will produce more " "epochs than you probably intend. " "If you want to limit epochs, use one thread.") if shuffle and num_threads > 1 and num_epochs is not None: logging.warning( "enqueue_data was called with shuffle=True, num_threads > 1, and " "num_epochs. This will create multiple threads, all reading the " "array/dataframe in order adding to the same shuffling queue; the " "results will likely not be sufficiently shuffled.") if not shuffle and num_threads > 1: logging.warning( "enqueue_data was called with shuffle=False and num_threads > 1. " "This will create multiple threads, all reading the " "array/dataframe in order. If you want examples read in order, use" " one thread; if you want multiple threads, enable shuffling.") if shuffle: min_after_dequeue = int( capacity / 4 if min_after_dequeue is None else min_after_dequeue) queue = data_flow_ops.RandomShuffleQueue(capacity, min_after_dequeue, dtypes=types, shapes=queue_shapes, seed=seed) elif pad_data: min_after_dequeue = 0 # just for the summary text queue_shapes = list( map( lambda x: tuple(list(x[:-1]) + [None]) if len(x) > 0 else x, queue_shapes)) queue = data_flow_ops.PaddingFIFOQueue(capacity, dtypes=types, shapes=queue_shapes) else: min_after_dequeue = 0 # just for the summary text queue = data_flow_ops.FIFOQueue(capacity, dtypes=types, shapes=queue_shapes) enqueue_ops = [] feed_fns = [] for i in range(num_threads): # Note the placeholders have no shapes, so they will accept any # enqueue_size. enqueue_many below will break them up. placeholders = [array_ops.placeholder(t) for t in types] enqueue_ops.append(queue.enqueue_many(placeholders)) seed_i = None if seed is None else (i + 1) * seed if not pad_data: feed_fns.append( get_feed_fn(placeholders, data, enqueue_size, random_start=shuffle, seed=seed_i, num_epochs=num_epochs)) else: feed_fns.append( get_feed_fn(placeholders, data, enqueue_size, random_start=shuffle, seed=seed_i, num_epochs=num_epochs, pad_value=pad_value)) runner = fqr._FeedingQueueRunner( # pylint: disable=protected-access queue=queue, enqueue_ops=enqueue_ops, feed_fns=feed_fns) queue_runner.add_queue_runner(runner) full = (math_ops.cast( math_ops.maximum(0, queue.size() - min_after_dequeue), dtypes.float32) * (1. / (capacity - min_after_dequeue))) # Note that name contains a '/' at the end so we intentionally do not place # a '/' after %s below. summary_name = ( "queue/%sfraction_over_%d_of_%d_full" % (queue.name, min_after_dequeue, capacity - min_after_dequeue)) summary.scalar(summary_name, full) return queue