def make_batch_feature(self, filenames, num_epochs, batch_size, label_key=None, reader_num_threads=1, parser_num_threads=1, shuffle=False, shuffle_seed=None, drop_final_batch=False): self.filenames = filenames self.num_epochs = num_epochs self.batch_size = batch_size return readers.make_batched_features_dataset( file_pattern=self.filenames, batch_size=self.batch_size, features={ "file": parsing_ops.FixedLenFeature([], dtypes.int64), "record": parsing_ops.FixedLenFeature([], dtypes.int64), "keywords": parsing_ops.VarLenFeature(dtypes.string), "label": parsing_ops.FixedLenFeature([], dtypes.string), }, label_key=label_key, reader=core_readers.TFRecordDataset, num_epochs=self.num_epochs, shuffle=shuffle, shuffle_seed=shuffle_seed, reader_num_threads=reader_num_threads, parser_num_threads=parser_num_threads, drop_final_batch=drop_final_batch)
def _eval_input_fn(): dataset = readers.make_batched_features_dataset(examples_file, batch_size, feature_spec, num_epochs=1) return dataset.map(lambda features: (features, features.pop('label')))
def testMakeBatchedFeaturesDataset(self): # Set up fn = os.path.join(self.get_temp_dir(), "tf_record.txt") writer = python_io.TFRecordWriter(fn) for i in range(1024): writer.write( example_pb2.Example( features=feature_pb2.Features( feature={ "value": feature_pb2.Feature( int64_list=feature_pb2.Int64List(value=[i])) })).SerializeToString()) writer.close() dataset = readers.make_batched_features_dataset( file_pattern=fn, batch_size=32, features={"value": parsing_ops.FixedLenFeature([], dtypes.int64)}, shuffle=False, num_epochs=1, drop_final_batch=False) rebatched_dataset = distribute._RebatchDataset(dataset, num_replicas=4) self.assertEqual([[None]], [ts.as_list() for ts in _flat_shapes(rebatched_dataset)]) expected_output = [{ "value": [k for k in range(i, i + 8)] } for i in range(0, 1024, 8)] # pylint: disable=g-complex-comprehension self.assertDatasetProduces(rebatched_dataset, expected_output)
def _predict_input_fn(): dataset = readers.make_batched_features_dataset( examples_file, batch_size, feature_spec, num_epochs=1) def features_fn(features): features.pop('label') return features return dataset.map(features_fn)
def testOldStyleReader(self): with self.assertRaisesRegexp( TypeError, r"The `reader` argument must return a `Dataset` object. " r"`tf.ReaderBase` subclasses are not supported."): _ = readers.make_batched_features_dataset( file_pattern=self.test_filenames[0], batch_size=32, features={ "file": parsing_ops.FixedLenFeature([], dtypes.int64), "record": parsing_ops.FixedLenFeature([], dtypes.int64), "keywords": parsing_ops.VarLenFeature(dtypes.string), "label": parsing_ops.FixedLenFeature([], dtypes.string), }, reader=io_ops.TFRecordReader)
def testOldStyleReader(self): with self.assertRaisesRegex( TypeError, r"The `reader` argument must return a `Dataset` object. " r"`tf.ReaderBase` subclasses are not supported."): _ = readers.make_batched_features_dataset( file_pattern=self.test_filenames[0], batch_size=32, features={ "file": parsing_ops.FixedLenFeature([], dtypes.int64), "record": parsing_ops.FixedLenFeature([], dtypes.int64), "keywords": parsing_ops.VarLenFeature(dtypes.string), "label": parsing_ops.FixedLenFeature([], dtypes.string), }, reader=io_ops.TFRecordReader)
def testMakeBatchedFeaturesDataset(self): files = 2 records_per_file = 5 def make_record(file_index): example = example_pb2.Example( features=feature_pb2.Features( feature={ "file": feature_pb2.Feature( int64_list=feature_pb2.Int64List(value=[file_index])), })) return example.SerializeToString() filenames = [] for file_index in range(files): filename = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % file_index) filenames.append(filename) writer = python_io.TFRecordWriter(filename) for _ in range(records_per_file): writer.write(make_record(file_index)) writer.close() dataset = readers.make_batched_features_dataset( file_pattern=filenames, batch_size=records_per_file, features={ "file": parsing_ops.FixedLenFeature([], dtypes.int64), }, reader=core_readers.TFRecordDataset, num_epochs=1) # We should shard at the file level, so that all records come from file 0. dataset = distribute._AutoShardDataset(dataset, 2, 0) dataset = dataset.unbatch() output = self.getDatasetOutput(dataset) files = [elem["file"] for elem in output] self.assertEqual(files, [0] * records_per_file)
def _eval_input_fn(): dataset = readers.make_batched_features_dataset( examples_file, batch_size, feature_spec, num_epochs=1) return dataset.map(lambda features: (features, features.pop('label')))
def read_batch_features(file_pattern, batch_size, features, reader=core_readers.TFRecordDataset, reader_args=None, randomize_input=True, num_epochs=None, capacity=10000): """Reads batches of Examples. Example: ``` serialized_examples = [ features { feature { key: "age" value { int64_list { value: [ 0 ] } } } feature { key: "gender" value { bytes_list { value: [ "f" ] } } } feature { key: "kws" value { bytes_list { value: [ "code", "art" ] } } } }, features { feature { key: "age" value { int64_list { value: [] } } } feature { key: "gender" value { bytes_list { value: [ "f" ] } } } feature { key: "kws" value { bytes_list { value: [ "sports" ] } } } } ] ``` We can use arguments: ``` features: { "age": FixedLenFeature([], dtype=tf.int64, default_value=-1), "gender": FixedLenFeature([], dtype=tf.string), "kws": VarLenFeature(dtype=tf.string), } ``` And the expected output is: ```python { "age": [[0], [-1]], "gender": [["f"], ["f"]], "kws": SparseTensor( indices=[[0, 0], [0, 1], [1, 0]], values=["code", "art", "sports"] dense_shape=[2, 2]), } ``` Args: file_pattern: List of files or patterns of file paths containing `Example` records. See `tf.io.gfile.glob` for pattern rules. batch_size: An int representing the number of records to combine in a single batch. features: A `dict` mapping feature keys to `FixedLenFeature` or `VarLenFeature` values. See `tf.io.parse_example`. reader: A function or class that can be called with a `filenames` tensor and (optional) `reader_args` and returns a `Dataset` of `Example` tensors. Defaults to `tf.data.TFRecordDataset`. reader_args: Additional arguments to pass to the reader class. randomize_input: Whether the input should be randomized. num_epochs: Integer specifying the number of times to read through the dataset. If None, cycles through the dataset forever. capacity: Buffer size of the ShuffleDataset. A large capacity ensures better shuffling but would increase memory usage and startup time. Returns: A dict from keys in features to `Tensor` or `SparseTensor` objects. """ dataset = readers.make_batched_features_dataset( file_pattern, batch_size, features, reader=reader, reader_args=reader_args, shuffle=randomize_input, num_epochs=num_epochs, shuffle_buffer_size=capacity) iterator = dataset_ops.make_one_shot_iterator(dataset) outputs = iterator.get_next() return outputs
def make_batched_features_dataset(file_pattern, batch_size, features, reader=core_readers.TFRecordDataset, label_key=None, reader_args=None, num_epochs=None, shuffle=True, shuffle_buffer_size=10000, shuffle_seed=None, prefetch_buffer_size=None, reader_num_threads=None, parser_num_threads=None, sloppy_ordering=False, drop_final_batch=False): """Returns a `Dataset` of feature dictionaries from `Example` protos. If label_key argument is provided, returns a `Dataset` of tuple comprising of feature dictionaries and label. Example: ``` serialized_examples = [ features { feature { key: "age" value { int64_list { value: [ 0 ] } } } feature { key: "gender" value { bytes_list { value: [ "f" ] } } } feature { key: "kws" value { bytes_list { value: [ "code", "art" ] } } } }, features { feature { key: "age" value { int64_list { value: [] } } } feature { key: "gender" value { bytes_list { value: [ "f" ] } } } feature { key: "kws" value { bytes_list { value: [ "sports" ] } } } } ] ``` We can use arguments: ``` features: { "age": FixedLenFeature([], dtype=tf.int64, default_value=-1), "gender": FixedLenFeature([], dtype=tf.string), "kws": VarLenFeature(dtype=tf.string), } ``` And the expected output is: ```python { "age": [[0], [-1]], "gender": [["f"], ["f"]], "kws": SparseTensor( indices=[[0, 0], [0, 1], [1, 0]], values=["code", "art", "sports"] dense_shape=[2, 2]), } ``` Args: file_pattern: List of files or patterns of file paths containing `Example` records. See `tf.io.gfile.glob` for pattern rules. batch_size: An int representing the number of records to combine in a single batch. features: A `dict` mapping feature keys to `FixedLenFeature` or `VarLenFeature` values. See `tf.io.parse_example`. reader: A function or class that can be called with a `filenames` tensor and (optional) `reader_args` and returns a `Dataset` of `Example` tensors. Defaults to `tf.data.TFRecordDataset`. label_key: (Optional) A string corresponding to the key labels are stored in `tf.Examples`. If provided, it must be one of the `features` key, otherwise results in `ValueError`. reader_args: Additional arguments to pass to the reader class. num_epochs: Integer specifying the number of times to read through the dataset. If None, cycles through the dataset forever. Defaults to `None`. shuffle: A boolean, indicates whether the input should be shuffled. Defaults to `True`. shuffle_buffer_size: Buffer size of the ShuffleDataset. A large capacity ensures better shuffling but would increase memory usage and startup time. shuffle_seed: Randomization seed to use for shuffling. prefetch_buffer_size: Number of feature batches to prefetch in order to improve performance. Recommended value is the number of batches consumed per training step. Defaults to auto-tune. reader_num_threads: Number of threads used to read `Example` records. If >1, the results will be interleaved. Defaults to `1`. parser_num_threads: Number of threads to use for parsing `Example` tensors into a dictionary of `Feature` tensors. Defaults to `2`. sloppy_ordering: If `True`, reading performance will be improved at the cost of non-deterministic ordering. If `False`, the order of elements produced is deterministic prior to shuffling (elements are still randomized if `shuffle=True`. Note that if the seed is set, then order of elements after shuffling is deterministic). Defaults to `False`. drop_final_batch: If `True`, and the batch size does not evenly divide the input dataset size, the final smaller batch will be dropped. Defaults to `False`. Returns: A dataset of `dict` elements, (or a tuple of `dict` elements and label). Each `dict` maps feature keys to `Tensor` or `SparseTensor` objects. Raises: ValueError: If `label_key` is not one of the `features` keys. """ return readers.make_batched_features_dataset( file_pattern, batch_size, features, reader, label_key, reader_args, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed, prefetch_buffer_size, reader_num_threads, parser_num_threads, sloppy_ordering, drop_final_batch)
def read_batch_features(file_pattern, batch_size, features, reader=core_readers.TFRecordDataset, reader_args=None, randomize_input=True, num_epochs=None, capacity=10000): """Reads batches of Examples. Example: ``` serialized_examples = [ features { feature { key: "age" value { int64_list { value: [ 0 ] } } } feature { key: "gender" value { bytes_list { value: [ "f" ] } } } feature { key: "kws" value { bytes_list { value: [ "code", "art" ] } } } }, features { feature { key: "age" value { int64_list { value: [] } } } feature { key: "gender" value { bytes_list { value: [ "f" ] } } } feature { key: "kws" value { bytes_list { value: [ "sports" ] } } } } ] ``` We can use arguments: ``` features: { "age": FixedLenFeature([], dtype=tf.int64, default_value=-1), "gender": FixedLenFeature([], dtype=tf.string), "kws": VarLenFeature(dtype=tf.string), } ``` And the expected output is: ```python { "age": [[0], [-1]], "gender": [["f"], ["f"]], "kws": SparseTensor( indices=[[0, 0], [0, 1], [1, 0]], values=["code", "art", "sports"] dense_shape=[2, 2]), } ``` Args: file_pattern: List of files or patterns of file paths containing `Example` records. See `tf.gfile.Glob` for pattern rules. batch_size: An int representing the number of records to combine in a single batch. features: A `dict` mapping feature keys to `FixedLenFeature` or `VarLenFeature` values. See `tf.parse_example`. reader: A function or class that can be called with a `filenames` tensor and (optional) `reader_args` and returns a `Dataset` of `Example` tensors. Defaults to `tf.data.TFRecordDataset`. reader_args: Additional arguments to pass to the reader class. randomize_input: Whether the input should be randomized. num_epochs: Integer specifying the number of times to read through the dataset. If None, cycles through the dataset forever. capacity: Buffer size of the ShuffleDataset. A large capacity ensures better shuffling but would increase memory usage and startup time. Returns: A dict from keys in features to `Tensor` or `SparseTensor` objects. """ dataset = readers.make_batched_features_dataset( file_pattern, batch_size, features, reader=reader, reader_args=reader_args, shuffle=randomize_input, num_epochs=num_epochs, shuffle_buffer_size=capacity) iterator = dataset_ops.make_one_shot_iterator(dataset) outputs = iterator.get_next() return outputs
def make_batched_features_dataset(file_pattern, batch_size, features, reader=core_readers.TFRecordDataset, label_key=None, reader_args=None, num_epochs=None, shuffle=True, shuffle_buffer_size=10000, shuffle_seed=None, prefetch_buffer_size=optimization.AUTOTUNE, reader_num_threads=1, parser_num_threads=2, sloppy_ordering=False, drop_final_batch=False): """Returns a `Dataset` of feature dictionaries from `Example` protos. If label_key argument is provided, returns a `Dataset` of tuple comprising of feature dictionaries and label. Example: ``` serialized_examples = [ features { feature { key: "age" value { int64_list { value: [ 0 ] } } } feature { key: "gender" value { bytes_list { value: [ "f" ] } } } feature { key: "kws" value { bytes_list { value: [ "code", "art" ] } } } }, features { feature { key: "age" value { int64_list { value: [] } } } feature { key: "gender" value { bytes_list { value: [ "f" ] } } } feature { key: "kws" value { bytes_list { value: [ "sports" ] } } } } ] ``` We can use arguments: ``` features: { "age": FixedLenFeature([], dtype=tf.int64, default_value=-1), "gender": FixedLenFeature([], dtype=tf.string), "kws": VarLenFeature(dtype=tf.string), } ``` And the expected output is: ```python { "age": [[0], [-1]], "gender": [["f"], ["f"]], "kws": SparseTensor( indices=[[0, 0], [0, 1], [1, 0]], values=["code", "art", "sports"] dense_shape=[2, 2]), } ``` Args: file_pattern: List of files or patterns of file paths containing `Example` records. See `tf.gfile.Glob` for pattern rules. batch_size: An int representing the number of records to combine in a single batch. features: A `dict` mapping feature keys to `FixedLenFeature` or `VarLenFeature` values. See `tf.parse_example`. reader: A function or class that can be called with a `filenames` tensor and (optional) `reader_args` and returns a `Dataset` of `Example` tensors. Defaults to `tf.data.TFRecordDataset`. label_key: (Optional) A string corresponding to the key labels are stored in `tf.Examples`. If provided, it must be one of the `features` key, otherwise results in `ValueError`. reader_args: Additional arguments to pass to the reader class. num_epochs: Integer specifying the number of times to read through the dataset. If None, cycles through the dataset forever. Defaults to `None`. shuffle: A boolean, indicates whether the input should be shuffled. Defaults to `True`. shuffle_buffer_size: Buffer size of the ShuffleDataset. A large capacity ensures better shuffling but would increase memory usage and startup time. shuffle_seed: Randomization seed to use for shuffling. prefetch_buffer_size: Number of feature batches to prefetch in order to improve performance. Recommended value is the number of batches consumed per training step. Defaults to auto-tune. reader_num_threads: Number of threads used to read `Example` records. If >1, the results will be interleaved. parser_num_threads: Number of threads to use for parsing `Example` tensors into a dictionary of `Feature` tensors. sloppy_ordering: If `True`, reading performance will be improved at the cost of non-deterministic ordering. If `False`, the order of elements produced is deterministic prior to shuffling (elements are still randomized if `shuffle=True`. Note that if the seed is set, then order of elements after shuffling is deterministic). Defaults to `False`. drop_final_batch: If `True`, and the batch size does not evenly divide the input dataset size, the final smaller batch will be dropped. Defaults to `False`. Returns: A dataset of `dict` elements, (or a tuple of `dict` elements and label). Each `dict` maps feature keys to `Tensor` or `SparseTensor` objects. Raises: ValueError: If `label_key` is not one of the `features` keys. """ return readers.make_batched_features_dataset( file_pattern, batch_size, features, reader, label_key, reader_args, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed, prefetch_buffer_size, reader_num_threads, parser_num_threads, sloppy_ordering, drop_final_batch)