def testBatchAndDropRemainder(self):
  """Verifies only complete batches are emitted for several batch sizes."""
  num_elements = 7
  base = np.arange(num_elements)
  components = (base,
                np.array([[1, 2, 3]]) * base[:, np.newaxis],
                np.array(37.0) * base)

  batch_size = array_ops.placeholder(dtypes.int64, shape=[])
  dataset = dataset_ops.Dataset.from_tensor_slices(components)
  batched = dataset.apply(batching.batch_and_drop_remainder(batch_size))
  iterator = batched.make_initializable_iterator()
  next_element = iterator.get_next()

  with self.test_session() as sess:
    for size in (1, 3, 7, 10):
      sess.run(iterator.initializer, feed_dict={batch_size: size})
      # Only whole batches are expected; a size of 10 yields none at all.
      for batch_index in range(num_elements // size):
        result = sess.run(next_element)
        first = batch_index * size
        for source, batched_values in zip(components, result):
          for offset in range(size):
            self.assertAllEqual(source[first + offset], batched_values[offset])
      # Any trailing partial batch is dropped, so the iterator is exhausted.
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(next_element)
def testBatchAndDropRemainder(self):
  """Verifies only complete batches are emitted for several batch sizes."""
  num_elements = 7
  base = np.arange(num_elements)
  components = (base,
                np.array([[1, 2, 3]]) * base[:, np.newaxis],
                np.array(37.0) * base)

  batch_size = array_ops.placeholder(dtypes.int64, shape=[])
  dataset = dataset_ops.Dataset.from_tensor_slices(components)
  batched = dataset.apply(batching.batch_and_drop_remainder(batch_size))
  iterator = batched.make_initializable_iterator()
  next_element = iterator.get_next()

  with self.cached_session() as sess:
    for size in (1, 3, 7, 10):
      sess.run(iterator.initializer, feed_dict={batch_size: size})
      # Only whole batches are expected; a size of 10 yields none at all.
      for batch_index in range(num_elements // size):
        result = sess.run(next_element)
        first = batch_index * size
        for source, batched_values in zip(components, result):
          for offset in range(size):
            self.assertAllEqual(source[first + offset], batched_values[offset])
      # Any trailing partial batch is dropped, so the iterator is exhausted.
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(next_element)
def testBatchAndDropRemainderSparse(self):
  """Batches sparse elements and drops the trailing partial batch.

  Twelve single-value sparse elements batched by five must yield exactly two
  full batches; the two leftover elements are dropped, so a third `get_next`
  raises `OutOfRangeError`.
  """

  def _sparse(i):
    return sparse_tensor.SparseTensor(
        indices=[[0]], values=(i * [1]), dense_shape=[1])

  iterator = dataset_ops.Dataset.range(12).map(_sparse).apply(
      batching.batch_and_drop_remainder(5)).make_initializable_iterator()
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.test_session() as sess:
    sess.run(init_op)
    for i in range(2):
      actual = sess.run(get_next)
      # The expected value is a constant, so build it as a plain
      # SparseTensorValue instead of adding a new SparseTensor op to the graph
      # and evaluating it on every iteration.
      expected = sparse_tensor.SparseTensorValue(
          indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
          values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
          dense_shape=[5, 1])
      # assertIsInstance reports the offending type on failure.
      self.assertIsInstance(actual, sparse_tensor.SparseTensorValue)
      self.assertSparseValuesEqual(actual, expected)
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)
def testBatchAndDropRemainderSparseError(self):
  """Expects a TypeError for elements pairing a sparse tensor with `i`."""

  def _map_fn(i):
    sparse = sparse_tensor.SparseTensor(
        indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1])
    return sparse, i

  # The whole pipeline construction stays inside the assertion context, so a
  # TypeError raised at any stage satisfies the test.
  with self.assertRaises(TypeError):
    mapped = dataset_ops.Dataset.range(10).map(_map_fn)
    drop_partial = batching.batch_and_drop_remainder(10)
    mapped.apply(drop_partial)
def testBatchAndDropRemainderSparseError(self):
  """Expects a TypeError for elements pairing a sparse tensor with `i`."""

  def _map_fn(i):
    sparse = sparse_tensor.SparseTensor(
        indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1])
    return sparse, i

  # The whole pipeline construction stays inside the assertion context, so a
  # TypeError raised at any stage satisfies the test.
  with self.assertRaises(TypeError):
    mapped = dataset_ops.Dataset.range(10).map(_map_fn)
    drop_partial = batching.batch_and_drop_remainder(10)
    mapped.apply(drop_partial)
def testBatchAndDropRemainderShapeInference(self):
  """Checks static output shapes for constant and placeholder batch sizes."""
  unknown_rank = array_ops.placeholder(dtypes.int32)
  vector = array_ops.placeholder(dtypes.int32, shape=[None])
  matrix = array_ops.placeholder(dtypes.int32, shape=[20, 30])
  components = (unknown_rank, (vector, matrix))

  def _check_shapes(dataset, leading_dim):
    # The unknown-rank component stays unknown; the other two components get
    # `leading_dim` as their batch dimension.
    shapes = dataset.output_shapes
    self.assertIsNone(shapes[0].ndims)
    self.assertEqual([leading_dim], shapes[1][0].as_list())
    self.assertEqual([leading_dim, 30], shapes[1][1].as_list())

  # Test with a statically known batch size.
  static_dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
      batching.batch_and_drop_remainder(128))
  _check_shapes(static_dataset, 128)

  # Test with a dynamic batch size: the static shape will be unknown, because
  # `batch_size` is a placeholder.
  batch_size = array_ops.placeholder(dtypes.int64)
  dynamic_dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
      batching.batch_and_drop_remainder(batch_size))
  _check_shapes(dynamic_dataset, None)
def testBatchAndDropRemainderShapeInference(self):
  """Checks static output shapes for constant and placeholder batch sizes."""
  unknown_rank = array_ops.placeholder(dtypes.int32)
  vector = array_ops.placeholder(dtypes.int32, shape=[None])
  matrix = array_ops.placeholder(dtypes.int32, shape=[20, 30])
  components = (unknown_rank, (vector, matrix))

  def _check_shapes(dataset, leading_dim):
    # The unknown-rank component stays unknown; the other two components get
    # `leading_dim` as their batch dimension.
    shapes = dataset.output_shapes
    self.assertIsNone(shapes[0].ndims)
    self.assertEqual([leading_dim], shapes[1][0].as_list())
    self.assertEqual([leading_dim, 30], shapes[1][1].as_list())

  # Test with a statically known batch size.
  static_dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
      batching.batch_and_drop_remainder(128))
  _check_shapes(static_dataset, 128)

  # Test with a dynamic batch size: the static shape will be unknown, because
  # `batch_size` is a placeholder.
  batch_size = array_ops.placeholder(dtypes.int64)
  dynamic_dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
      batching.batch_and_drop_remainder(batch_size))
  _check_shapes(dynamic_dataset, None)
def _apply_fn(dataset):
  """Shuffles `dataset` repeatedly, drawing a fresh seed pair for each pass.

  A `RandomDataset` is batched into pairs; each pair seeds one non-reshuffling
  `ShuffleDataset` over `dataset`, and the passes are concatenated with
  `flat_map`. `seed`, `count` and `buffer_size` come from the enclosing scope.

  Args:
    dataset: The input dataset to shuffle.

  Returns:
    The dataset obtained by flat-mapping the seed pairs to shuffled passes
    over `dataset`.
  """
  random_ds = random_ops.RandomDataset(seed).apply(
      batching.batch_and_drop_remainder(2))

  # Compare by value: `count is not -1` tested object identity against an int
  # literal, which only works through CPython's small-int caching and is a
  # SyntaxWarning on Python 3.8+. The isinstance guard keeps non-int counts
  # (e.g. tensors, whose `==` may not produce a Python bool) out of the
  # comparison; those are simply forwarded to `take`, as before.
  unbounded = count is None or (isinstance(count, int) and count == -1)
  if not unbounded:
    random_ds = random_ds.take(count)

  def map_fn(seeds):
    return dataset_ops.ShuffleDataset(
        input_dataset=dataset,
        buffer_size=buffer_size,
        seed=seeds[0],
        reshuffle_each_iteration=False,
        seed2=seeds[1])

  return random_ds.flat_map(map_fn)
def __init__(self, dataset, devices, prefetch_on_device=None):
  """Stores the devices and prepares `dataset` for per-device consumption.

  Args:
    dataset: The dataset to distribute.
    devices: The devices to distribute over. Stored as-is; their count is
      used as the batch size when not prefetching on device.
    prefetch_on_device: Whether to prefetch on device. When None, prefetching
      is enabled exactly when not executing eagerly.
  """
  self._devices = devices

  # Default to using prefetching in graph mode, unless specified.
  # TODO(priyag): Enable prefetching in eager mode.
  if prefetch_on_device is None:
    prefetch_on_device = not context.executing_eagerly()
  self._prefetch_on_device = prefetch_on_device
  assert not prefetch_on_device or not context.executing_eagerly(), (
      "Prefetching is only supported in graph mode currently")

  if not prefetch_on_device:
    # TODO(priyag): If dropping remainder is not appropriate, find another
    # approach to distributing the dataset when not possible to divide evenly.
    # Possibly not an issue when we start using PartitionedDataset.
    per_device_batch = batching.batch_and_drop_remainder(len(devices))
    self._dataset = dataset.apply(per_device_batch)
  else:
    self._dataset = dataset
def testBatchAndDropRemainderSparse(self):
  """Batches sparse elements by five and expects exactly two full batches."""
  batch = 5

  def _sparse(i):
    return sparse_tensor.SparseTensorValue(
        indices=[[0]], values=(i * [1]), dense_shape=[1])

  sparse_dataset = dataset_ops.Dataset.range(12).map(_sparse)
  batched = sparse_dataset.apply(batching.batch_and_drop_remainder(5))
  iterator = batched.make_initializable_iterator()
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.cached_session() as sess:
    sess.run(init_op)
    for i in range(2):
      actual = sess.run(get_next)
      # Batch `i` holds the values i*5 .. i*5+4, one per row.
      expected = sparse_tensor.SparseTensorValue(
          indices=[[row, 0] for row in range(batch)],
          values=[i * 5 + row for row in range(batch)],
          dense_shape=[5, 1])
      self.assertTrue(sparse_tensor.is_sparse(actual))
      self.assertSparseValuesEqual(actual, expected)
    # The two leftover elements do not form a full batch and are dropped.
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)
def StreamingFilesDataset(files,
                          filetype=None,
                          file_reader_job=None,
                          worker_job=None,
                          num_epochs=None,
                          filename_shuffle_buffer_size=None,
                          num_parallel_reads=None,
                          batch_transfer_size=None,
                          sloppy=None):
  """Builds a dataset that streams file contents from a reader job.

  Cloud TPUs are attached over the network, so a Cloud TPU cannot read files
  that are local to your GCE VM. To train from files stored on the VM (e.g. on
  local SSD for extreme performance), use this helper: the files are read on
  `file_reader_job` and the elements are fetched on `worker_job` through a
  remote function call.

  The resulting dataset may return an OutOfRangeError if the fileglob
  expansion finds no files.

  Note: StreamingFilesDataset assumes that the session is using a
  TPUClusterResolver and therefore has a worker and a coordinator job. File
  loading is done on the coordinator job.

  Args:
    files: A string glob to match files, or a `tf.data.Dataset` generating
      file names.
    filetype: A string (one of 'tfrecord', or 'textline') or a single-argument
      TensorFlow function that when given a filename returns a dataset.
      Defaults to 'tfrecord'.
    file_reader_job: An optional string naming the job that should perform the
      file reads. Defaults to 'coordinator'.
    worker_job: An optional string naming the job that should process the
      tensors (i.e. your GPU or TPU worker). Defaults to 'worker'.
    num_epochs: The number of epochs through the training set that should be
      generated. By default, it will repeat infinitely.
    filename_shuffle_buffer_size: An optional integer controlling the
      shuffling of the file names (default 4096). To read the files in the
      same order, set to 0 or False.
    num_parallel_reads: An optional integer controlling the number of files to
      read from concurrently (default 8). Set to 1 for no parallelism.
    batch_transfer_size: An optional integer controlling the batching used to
      amortize the remote function invocation overhead (default 1024). Set to
      a very large number to increase throughput, to a very small number to
      reduce memory consumption, or to False to skip batching.
    sloppy: (Optional.) If `True`, read input data as fast as possible,
      without maintaining a deterministic order. Defaults to `False`.

  Returns:
    A `tf.data.Dataset` streaming the elements produced by a parallel
    interleaving of the files matched (or generated) by `files`; the element
    type is the output of the dataset specified by `filetype`.

  Raises:
    ValueError: if any argument is not of the expected type.
  """
  if filetype is None:
    filetype = 'tfrecord'

  # Resolve the per-file reader: a registered name or a user-supplied callable.
  if not isinstance(filetype, str):
    if not callable(filetype):
      raise ValueError('filetype should be a string or a callable')
    reader_fn = filetype
  elif filetype in _FILETYPE_MAP:
    reader_fn = _FILETYPE_MAP[filetype]
  else:
    raise ValueError('Unexpected filetype: %s' % filetype)

  # Fill in defaults for everything the caller left unset.
  if not file_reader_job:
    file_reader_job = 'coordinator'
  if not worker_job:
    worker_job = 'worker'
  if filename_shuffle_buffer_size is None:
    filename_shuffle_buffer_size = 4096
  if not num_parallel_reads:
    num_parallel_reads = 8
  if batch_transfer_size is None:
    batch_transfer_size = 1024
  sloppy = False if sloppy is None else sloppy

  with ops.device('/job:%s' % file_reader_job):
    if isinstance(files, str):
      source_dataset = dataset_ops.Dataset.list_files(files)
    elif isinstance(files, dataset_ops.Dataset):
      source_dataset = files
    else:
      raise ValueError('files was not a string or a dataset: %s' % files)

    if filename_shuffle_buffer_size:
      source_dataset = source_dataset.shuffle(
          buffer_size=filename_shuffle_buffer_size)

    # NOTE: We perform the `repeat` on the source dataset, because the output
    # dataset does not currently have enough information to recreate an
    # iterator over the source dataset when it reaches the end.
    source_dataset = source_dataset.repeat(num_epochs)

    interleave = interleave_ops.parallel_interleave(
        reader_fn, cycle_length=num_parallel_reads, sloppy=sloppy)
    source_dataset = source_dataset.apply(interleave)

    if batch_transfer_size:
      # Note: we can safely call batch_and_drop_remainder because we have an
      # infinite stream of TFRecords.
      transfer_batching = batching.batch_and_drop_remainder(
          batch_transfer_size)
      source_dataset = source_dataset.apply(transfer_batching)

    source_dataset = source_dataset.prefetch(1)

    source_iterator = source_dataset.make_one_shot_iterator()
    source_handle = source_iterator.string_handle()

  @function.Defun(dtypes.string)
  def LoadingFunc(h):
    # Rebuild the source iterator from its string handle and pull one element.
    remote_iterator = iterator_ops.Iterator.from_string_handle(
        h, source_dataset.output_types, source_dataset.output_shapes)
    return remote_iterator.get_next()

  def MapFn(unused_input):
    # Run LoadingFunc on the reader job's CPU and return its output.
    return functional_ops.remote_call(
        args=[source_handle],
        Tout=[dtypes.string],
        f=LoadingFunc,
        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)

  with ops.device('/job:%s' % worker_job):
    # TODO(saeta,mrry): Switch to using _GeneratorDataset.

    # identity = lambda x: x
    # dummy = constant_op.constant(0)
    # output_dataset = dataset_ops._GeneratorDataset(dummy, identity, MapFn,
    #                                                identity)

    # An endless dummy dataset drives one remote call per element.
    driver = dataset_ops.Dataset.range(2).repeat()
    output_dataset = driver.map(MapFn)
    output_dataset = output_dataset.prefetch(1)

    if batch_transfer_size:
      # Undo the batching used during the transfer.
      output_dataset = output_dataset.apply(batching.unbatch())
      output_dataset = output_dataset.prefetch(1)

  return output_dataset
def _dataset_fn():
  """Returns `range(1000)` cast to float, in batches of four."""
  floats = dataset_ops.Dataset.range(1000).map(math_ops.to_float)
  # Want to produce a fixed, known shape, so drop remainder when batching.
  drop_partial = batching.batch_and_drop_remainder(4)
  return floats.apply(drop_partial)
def make_batched_features_dataset(file_pattern,
                                  batch_size,
                                  features,
                                  reader=core_readers.TFRecordDataset,
                                  reader_args=None,
                                  num_epochs=None,
                                  shuffle=True,
                                  shuffle_buffer_size=10000,
                                  shuffle_seed=None,
                                  prefetch_buffer_size=1,
                                  reader_num_threads=1,
                                  parser_num_threads=2,
                                  sloppy_ordering=False,
                                  drop_final_batch=False):
  """Returns a `Dataset` of feature dictionaries from `Example` protos.

  Example:

  ```
  serialized_examples = [
    features {
      feature { key: "age" value { int64_list { value: [ 0 ] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "code", "art" ] } } }
    },
    features {
      feature { key: "age" value { int64_list { value: [] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "sports" ] } } }
    }
  ]
  ```

  We can use arguments:

  ```
  features: {
    "age": FixedLenFeature([], dtype=tf.int64, default_value=-1),
    "gender": FixedLenFeature([], dtype=tf.string),
    "kws": VarLenFeature(dtype=tf.string),
  }
  ```

  And the expected output is:

  ```python
  {
    "age": [[0], [-1]],
    "gender": [["f"], ["f"]],
    "kws": SparseTensor(
      indices=[[0, 0], [0, 1], [1, 0]],
      values=["code", "art", "sports"],
      dense_shape=[2, 2]),
  }
  ```

  Args:
    file_pattern: List of files or patterns of file paths containing
      `Example` records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int representing the number of consecutive elements of
      this dataset to combine in a single batch.
    features: A `dict` mapping feature keys to `FixedLenFeature` or
      `VarLenFeature` values. See `tf.parse_example`.
    reader: A function or class that can be called with a `filenames` tensor
      and (optional) `reader_args` and returns a `Dataset` of `Example`
      tensors. Defaults to `tf.data.TFRecordDataset`.
    reader_args: Additional arguments to pass to the reader class.
    num_epochs: Integer specifying the number of times to read through the
      dataset. If None, cycles through the dataset forever. Defaults to
      `None`.
    shuffle: A boolean, indicates whether the input should be shuffled.
      Defaults to `True`.
    shuffle_buffer_size: Buffer size of the ShuffleDataset. A large capacity
      ensures better shuffling but would increase memory usage and startup
      time.
    shuffle_seed: Randomization seed to use for shuffling.
    prefetch_buffer_size: Number of feature batches to prefetch in order to
      improve performance. Recommended value is the number of batches consumed
      per training step (default is 1).
    reader_num_threads: Number of threads used to read `Example` records. If
      >1, the results will be interleaved.
    parser_num_threads: Number of threads to use for parsing `Example` tensors
      into a dictionary of `Feature` tensors.
    sloppy_ordering: If `True`, reading performance will be improved at the
      cost of non-deterministic ordering. If `False`, the order of elements
      produced is deterministic prior to shuffling (elements are still
      randomized if `shuffle=True`; note that if the seed is set, then the
      order of elements after shuffling is deterministic). Defaults to
      `False`.
    drop_final_batch: If `True`, and the batch size does not evenly divide the
      input dataset size, the final smaller batch will be dropped. Defaults to
      `False`.

  Returns:
    A dataset of `dict` elements. Each `dict` maps feature keys to `Tensor` or
    `SparseTensor` objects.
  """
  # Start from a dataset of every matching filename, optionally shuffled.
  filenames = _get_file_names(file_pattern, False)
  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
  if shuffle:
    dataset = dataset.shuffle(len(filenames), shuffle_seed)

  # Read `Example` records from the files as tensor objects, sequentially
  # (if reader_num_threads=1) or in parallel.
  reader_args = [] if reader_args is None else reader_args
  interleave = interleave_ops.parallel_interleave(
      lambda filename: reader(filename, *reader_args),
      cycle_length=reader_num_threads,
      sloppy=sloppy_ordering)
  dataset = dataset.apply(interleave)

  # Keep only the values if the `Example` tensors come as key-value tuples.
  if dataset.output_types == (dtypes.string, dtypes.string):
    dataset = dataset.map(lambda _, v: v)

  # Apply dataset repeat and shuffle transformations.
  if num_epochs != 1:
    if shuffle:
      # Use the fused shuffle_and_repeat operation for better performance.
      dataset = dataset.apply(
          shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs,
                                         shuffle_seed))
    else:
      dataset = dataset.repeat(num_epochs)
  elif shuffle:
    dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)

  if not drop_final_batch:
    dataset = dataset.batch(batch_size)
  else:
    dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size))

  # Parse `Example` tensors to a dictionary of `Feature` tensors.
  dataset = dataset.map(
      lambda x: parsing_ops.parse_example(x, features),
      num_parallel_calls=parser_num_threads)

  # TODO(rachelim): Add an optional label_name argument for extracting the
  # label from the features dictionary, to comply with the type expected by
  # the input_fn to a `tf.Estimator.train` or `tf.Estimator.evaluate`
  # function.
  return dataset.prefetch(prefetch_buffer_size)
def make_tf_record_dataset(
    file_pattern,
    batch_size,
    parser_fn=None,
    num_epochs=None,
    shuffle=True,
    shuffle_buffer_size=None,
    shuffle_seed=None,
    prefetch_buffer_size=None,
    num_parallel_reads=None,
    num_parallel_parser_calls=None,
    drop_final_batch=False):
  """Reads and optionally parses TFRecord files into a dataset.

  Provides common functionality such as batching, optional parsing, shuffling,
  and performant defaults.

  Args:
    file_pattern: List of files or patterns of TFRecord file paths.
      See `tf.gfile.Glob` for pattern rules.
    batch_size: An int representing the number of records to combine
      in a single batch.
    parser_fn: (Optional.) A function accepting string input to parse
      and process the record contents. This function must map records
      to components of a fixed shape, so they may be batched. By
      default, uses the record contents unmodified.
    num_epochs: (Optional.) An int specifying the number of times this
      dataset is repeated. If None (the default), cycles through the
      dataset forever.
    shuffle: (Optional.) A bool that indicates whether the input
      should be shuffled. Defaults to `True`.
    shuffle_buffer_size: (Optional.) Buffer size to use for
      shuffling. A large buffer size ensures better shuffling, but
      increases memory usage and startup time. Defaults to 10000.
    shuffle_seed: (Optional.) Randomization seed to use for shuffling.
    prefetch_buffer_size: (Optional.) An int specifying the number of
      feature batches to prefetch for performance improvement.
      Defaults to auto-tune. Set to 0 to disable prefetching.
    num_parallel_reads: (Optional.) Number of threads used to read
      records from files. By default or if set to a value >1, the
      results will be interleaved. Defaults to 24.
    num_parallel_parser_calls: (Optional.) Number of records to parse
      in parallel. Defaults to an automatic selection.
    drop_final_batch: (Optional.) Whether the last batch should be
      dropped in case its size is smaller than `batch_size`; the
      default behavior is not to drop the smaller batch.

  Returns:
    A dataset, where each element matches the output of `parser_fn`
    except it will have an additional leading `batch-size` dimension,
    or a `batch_size`-length 1-D tensor of strings if `parser_fn` is
    unspecified.
  """
  files = dataset_ops.Dataset.list_files(
      file_pattern, shuffle=shuffle, seed=shuffle_seed)

  # Note: We considered auto-tuning this value, but there is a concern
  # that this affects the mixing of records from different files, which
  # could affect training convergence/accuracy, so we are defaulting to
  # a constant for now.
  read_parallelism = 24 if num_parallel_reads is None else num_parallel_reads
  dataset = core_readers.TFRecordDataset(
      files, num_parallel_reads=read_parallelism)

  # TODO(josh11b): Auto-tune this value when not specified
  shuffle_buffer = 10000 if shuffle_buffer_size is None else shuffle_buffer_size
  dataset = _maybe_shuffle_and_repeat(
      dataset, num_epochs, shuffle, shuffle_buffer, shuffle_seed)

  if parser_fn is not None:
    # TODO(josh11b): if num_parallel_parser_calls is None, use some function
    # of num cores instead of map_and_batch's default behavior of one batch.
    fused_map_and_batch = batching.map_and_batch(
        parser_fn, batch_size, num_parallel_calls=num_parallel_parser_calls,
        drop_remainder=drop_final_batch)
    dataset = dataset.apply(fused_map_and_batch)
  elif drop_final_batch:
    dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size))
  else:
    dataset = dataset.batch(batch_size)

  # -1 stands for tf.config.data.AUTOTUNE.
  prefetch = -1 if prefetch_buffer_size is None else prefetch_buffer_size
  if prefetch == 0:
    return dataset
  return dataset.prefetch(buffer_size=prefetch)
def StreamingFilesDataset(files,
                          filetype=None,
                          file_reader_job=None,
                          worker_job=None,
                          num_epochs=None,
                          filename_shuffle_buffer_size=None,
                          num_parallel_reads=None,
                          batch_transfer_size=None,
                          sloppy=None):
  """Builds a dataset that streams file contents from a reader job.

  Cloud TPUs are attached over the network, so a Cloud TPU cannot read files
  that are local to your GCE VM. To train from files stored on the VM (e.g. on
  local SSD for extreme performance), use this helper: the files are read on
  `file_reader_job` and the elements are fetched on `worker_job` through a
  remote function call.

  The resulting dataset may return an OutOfRangeError if the fileglob
  expansion finds no files.

  Note: StreamingFilesDataset assumes that the session is using a
  TPUClusterResolver and therefore has a worker and a coordinator job. File
  loading is done on the coordinator job.

  Args:
    files: A string glob to match files, or a `tf.data.Dataset` generating
      file names.
    filetype: A string (one of 'tfrecord', or 'textline') or a single-argument
      TensorFlow function that when given a filename returns a dataset.
      Defaults to 'tfrecord'.
    file_reader_job: An optional string naming the job that should perform the
      file reads. Defaults to 'coordinator'.
    worker_job: An optional string naming the job that should process the
      tensors (i.e. your GPU or TPU worker). Defaults to 'tpu_worker'.
    num_epochs: The number of epochs through the training set that should be
      generated. By default, it will repeat infinitely.
    filename_shuffle_buffer_size: An optional integer controlling the
      shuffling of the file names (default 4096). To read the files in the
      same order, set to 0 or False.
    num_parallel_reads: An optional integer controlling the number of files to
      read from concurrently (default 8). Set to 1 for no parallelism.
    batch_transfer_size: An optional integer controlling the batching used to
      amortize the remote function invocation overhead (default 1024). Set to
      a very large number to increase throughput, to a very small number to
      reduce memory consumption, or to False to skip batching.
    sloppy: (Optional.) If `True`, read input data as fast as possible,
      without maintaining a deterministic order. Defaults to `False`.

  Returns:
    A `tf.data.Dataset` streaming the elements produced by a parallel
    interleaving of the files matched (or generated) by `files`; the element
    type is the output of the dataset specified by `filetype`.

  Raises:
    ValueError: if any argument is not of the expected type.
  """
  if filetype is None:
    filetype = 'tfrecord'

  # Resolve the per-file reader: a registered name or a user-supplied callable.
  if not isinstance(filetype, str):
    if not callable(filetype):
      raise ValueError('filetype should be a string or a callable')
    reader_fn = filetype
  elif filetype in _FILETYPE_MAP:
    reader_fn = _FILETYPE_MAP[filetype]
  else:
    raise ValueError('Unexpected filetype: %s' % filetype)

  # Fill in defaults for everything the caller left unset.
  if not file_reader_job:
    file_reader_job = 'coordinator'
  if not worker_job:
    worker_job = 'tpu_worker'
  if filename_shuffle_buffer_size is None:
    filename_shuffle_buffer_size = 4096
  if not num_parallel_reads:
    num_parallel_reads = 8
  if batch_transfer_size is None:
    batch_transfer_size = 1024
  sloppy = False if sloppy is None else sloppy

  with ops.device('/job:%s' % file_reader_job):
    if isinstance(files, str):
      source_dataset = dataset_ops.Dataset.list_files(files)
    elif isinstance(files, dataset_ops.Dataset):
      source_dataset = files
    else:
      raise ValueError('files was not a string or a dataset: %s' % files)

    if filename_shuffle_buffer_size:
      source_dataset = source_dataset.shuffle(
          buffer_size=filename_shuffle_buffer_size)

    # NOTE: We perform the `repeat` on the source dataset, because the output
    # dataset does not currently have enough information to recreate an
    # iterator over the source dataset when it reaches the end.
    source_dataset = source_dataset.repeat(num_epochs)

    interleave = interleave_ops.parallel_interleave(
        reader_fn, cycle_length=num_parallel_reads, sloppy=sloppy)
    source_dataset = source_dataset.apply(interleave)

    if batch_transfer_size:
      # Note: we can safely call batch_and_drop_remainder because we have an
      # infinite stream of TFRecords.
      transfer_batching = batching.batch_and_drop_remainder(
          batch_transfer_size)
      source_dataset = source_dataset.apply(transfer_batching)

    source_dataset = source_dataset.prefetch(1)

    source_iterator = source_dataset.make_one_shot_iterator()
    source_handle = source_iterator.string_handle()

  @function.Defun(dtypes.string)
  def LoadingFunc(h):
    # Rebuild the source iterator from its string handle and pull one element.
    remote_iterator = iterator_ops.Iterator.from_string_handle(
        h, source_dataset.output_types, source_dataset.output_shapes)
    return remote_iterator.get_next()

  def MapFn(unused_input):
    # Run LoadingFunc on the reader job's CPU and return its output.
    return functional_ops.remote_call(
        args=[source_handle],
        Tout=[dtypes.string],
        f=LoadingFunc,
        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)

  with ops.device('/job:%s' % worker_job):
    # TODO(saeta,mrry): Switch to using _GeneratorDataset.

    # identity = lambda x: x
    # dummy = constant_op.constant(0)
    # output_dataset = dataset_ops._GeneratorDataset(dummy, identity, MapFn,
    #                                                identity)

    # An endless dummy dataset drives one remote call per element.
    driver = dataset_ops.Dataset.range(2).repeat()
    output_dataset = driver.map(MapFn)
    output_dataset = output_dataset.prefetch(1)

    if batch_transfer_size:
      # Undo the batching used during the transfer.
      output_dataset = output_dataset.apply(batching.unbatch())
      output_dataset = output_dataset.prefetch(1)

  return output_dataset
def make_tf_record_dataset(
    file_pattern,
    batch_size,
    parser_fn=None,
    num_epochs=None,
    shuffle=True,
    shuffle_buffer_size=None,
    shuffle_seed=None,
    prefetch_buffer_size=None,
    num_parallel_reads=None,
    num_parallel_parser_calls=None,
    drop_final_batch=False):
  """Reads and optionally parses TFRecord files into a dataset.

  Provides common functionality such as batching, optional parsing, shuffling,
  and performant defaults.

  Args:
    file_pattern: List of files or patterns of TFRecord file paths.
      See `tf.gfile.Glob` for pattern rules.
    batch_size: An int representing the number of records to combine
      in a single batch.
    parser_fn: (Optional.) A function accepting string input to parse
      and process the record contents. This function must map records
      to components of a fixed shape, so they may be batched. By
      default, uses the record contents unmodified.
    num_epochs: (Optional.) An int specifying the number of times this
      dataset is repeated. If None (the default), cycles through the
      dataset forever.
    shuffle: (Optional.) A bool that indicates whether the input
      should be shuffled. Defaults to `True`.
    shuffle_buffer_size: (Optional.) Buffer size to use for
      shuffling. A large buffer size ensures better shuffling, but
      increases memory usage and startup time. Defaults to 10000.
    shuffle_seed: (Optional.) Randomization seed to use for shuffling.
    prefetch_buffer_size: (Optional.) An int specifying the number of
      feature batches to prefetch for performance improvement.
      Defaults to auto-tune. Set to 0 to disable prefetching.
    num_parallel_reads: (Optional.) Number of threads used to read
      records from files. By default or if set to a value >1, the
      results will be interleaved. Defaults to 24.
    num_parallel_parser_calls: (Optional.) Number of records to parse
      in parallel. Defaults to an automatic selection.
    drop_final_batch: (Optional.) Whether the last batch should be
      dropped in case its size is smaller than `batch_size`; the
      default behavior is not to drop the smaller batch.

  Returns:
    A dataset, where each element matches the output of `parser_fn`
    except it will have an additional leading `batch-size` dimension,
    or a `batch_size`-length 1-D tensor of strings if `parser_fn` is
    unspecified.
  """
  files = dataset_ops.Dataset.list_files(
      file_pattern, shuffle=shuffle, seed=shuffle_seed)

  # Note: We considered auto-tuning this value, but there is a concern
  # that this affects the mixing of records from different files, which
  # could affect training convergence/accuracy, so we are defaulting to
  # a constant for now.
  read_parallelism = 24 if num_parallel_reads is None else num_parallel_reads
  dataset = core_readers.TFRecordDataset(
      files, num_parallel_reads=read_parallelism)

  # TODO(josh11b): Auto-tune this value when not specified
  shuffle_buffer = 10000 if shuffle_buffer_size is None else shuffle_buffer_size
  dataset = _maybe_shuffle_and_repeat(
      dataset, num_epochs, shuffle, shuffle_buffer, shuffle_seed)

  if parser_fn is not None:
    # TODO(josh11b): if num_parallel_parser_calls is None, use some function
    # of num cores instead of map_and_batch's default behavior of one batch.
    fused_map_and_batch = batching.map_and_batch(
        parser_fn, batch_size, num_parallel_calls=num_parallel_parser_calls,
        drop_remainder=drop_final_batch)
    dataset = dataset.apply(fused_map_and_batch)
  elif drop_final_batch:
    dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size))
  else:
    dataset = dataset.batch(batch_size)

  # -1 stands for tf.config.data.AUTOTUNE.
  prefetch = -1 if prefetch_buffer_size is None else prefetch_buffer_size
  if prefetch == 0:
    return dataset
  return dataset.prefetch(buffer_size=prefetch)
def make_batched_features_dataset(file_pattern,
                                  batch_size,
                                  features,
                                  reader=core_readers.TFRecordDataset,
                                  reader_args=None,
                                  num_epochs=None,
                                  shuffle=True,
                                  shuffle_buffer_size=10000,
                                  shuffle_seed=None,
                                  prefetch_buffer_size=1,
                                  reader_num_threads=1,
                                  parser_num_threads=2,
                                  sloppy_ordering=False,
                                  drop_final_batch=False):
  """Returns a `Dataset` of feature dictionaries from `Example` protos.

  Example:

  ```
  serialized_examples = [
    features {
      feature { key: "age" value { int64_list { value: [ 0 ] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "code", "art" ] } } }
    },
    features {
      feature { key: "age" value { int64_list { value: [] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "sports" ] } } }
    }
  ]
  ```

  We can use arguments:

  ```
  features: {
    "age": FixedLenFeature([], dtype=tf.int64, default_value=-1),
    "gender": FixedLenFeature([], dtype=tf.string),
    "kws": VarLenFeature(dtype=tf.string),
  }
  ```

  And the expected output is:

  ```python
  {
    "age": [[0], [-1]],
    "gender": [["f"], ["f"]],
    "kws": SparseTensor(
      indices=[[0, 0], [0, 1], [1, 0]],
      values=["code", "art", "sports"],
      dense_shape=[2, 2]),
  }
  ```

  Args:
    file_pattern: List of files or patterns of file paths containing
      `Example` records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int representing the number of records to combine
      in a single batch.
    features: A `dict` mapping feature keys to `FixedLenFeature` or
      `VarLenFeature` values. See `tf.parse_example`.
    reader: A function or class that can be called with a `filenames` tensor
      and (optional) `reader_args` and returns a `Dataset` of `Example`
      tensors. Defaults to `tf.data.TFRecordDataset`.
    reader_args: Additional arguments to pass to the reader class.
    num_epochs: Integer specifying the number of times to read through the
      dataset. If None, cycles through the dataset forever. Defaults to
      `None`.
    shuffle: A boolean, indicates whether the input should be shuffled.
      Defaults to `True`.
    shuffle_buffer_size: Buffer size of the ShuffleDataset. A large capacity
      ensures better shuffling but would increase memory usage and startup
      time.
    shuffle_seed: Randomization seed to use for shuffling.
    prefetch_buffer_size: Number of feature batches to prefetch in order to
      improve performance. Recommended value is the number of batches consumed
      per training step (default is 1).
    reader_num_threads: Number of threads used to read `Example` records. If
      >1, the results will be interleaved.
    parser_num_threads: Number of threads to use for parsing `Example` tensors
      into a dictionary of `Feature` tensors.
    sloppy_ordering: If `True`, reading performance will be improved at the
      cost of non-deterministic ordering. If `False`, the order of elements
      produced is deterministic prior to shuffling (elements are still
      randomized if `shuffle=True`; note that if the seed is set, then the
      order of elements after shuffling is deterministic). Defaults to
      `False`.
    drop_final_batch: If `True`, and the batch size does not evenly divide the
      input dataset size, the final smaller batch will be dropped. Defaults to
      `False`.

  Returns:
    A dataset of `dict` elements. Each `dict` maps feature keys to `Tensor` or
    `SparseTensor` objects.
  """
  # Start from a dataset of every matching filename, optionally shuffled.
  filenames = _get_file_names(file_pattern, False)
  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
  if shuffle:
    dataset = dataset.shuffle(len(filenames), shuffle_seed)

  # Read `Example` records from the files as tensor objects, sequentially
  # (if reader_num_threads=1) or in parallel.
  reader_args = [] if reader_args is None else reader_args
  interleave = interleave_ops.parallel_interleave(
      lambda filename: reader(filename, *reader_args),
      cycle_length=reader_num_threads,
      sloppy=sloppy_ordering)
  dataset = dataset.apply(interleave)

  # Keep only the values if the `Example` tensors come as key-value tuples.
  if dataset.output_types == (dtypes.string, dtypes.string):
    dataset = dataset.map(lambda _, v: v)

  # Apply dataset repeat and shuffle transformations.
  dataset = _maybe_shuffle_and_repeat(
      dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)

  if not drop_final_batch:
    dataset = dataset.batch(batch_size)
  else:
    dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size))

  # Parse `Example` tensors to a dictionary of `Feature` tensors.
  dataset = dataset.map(
      lambda x: parsing_ops.parse_example(x, features),
      num_parallel_calls=parser_num_threads)

  # TODO(rachelim): Add an optional label_name argument for extracting the
  # label from the features dictionary, to comply with the type expected by
  # the input_fn to a `tf.Estimator.train` or `tf.Estimator.evaluate`
  # function.
  return dataset.prefetch(prefetch_buffer_size)
def _dataset_fn():
  """Returns `range(1000)` cast to float, in batches of four."""
  floats = dataset_ops.Dataset.range(1000).map(math_ops.to_float)
  # Want to produce a fixed, known shape, so drop remainder when batching.
  drop_partial = batching.batch_and_drop_remainder(4)
  return floats.apply(drop_partial)