def testSetBatchSizeSingleTensor1d(self):
  dataset = tf.data.Dataset.range(4).batch(2)
  self.assertFalse(dataset.output_shapes.is_fully_defined())

  dataset = dataset_ops.set_batch_size(dataset, 2)
  self.assertEqual([2], dataset.output_shapes)

  next_batch = dataset.make_one_shot_iterator().get_next()
  with self.test_session() as sess:
    batch_value = sess.run(next_batch)
    self.assertAllEqual([0, 1], batch_value)
    batch_value = sess.run(next_batch)
    self.assertAllEqual([2, 3], batch_value)
    with self.assertRaises(tf.errors.OutOfRangeError):
      sess.run(next_batch)
def testSetBatchSizeSingleTensor2d(self):
  values = np.arange(12, dtype=np.int32).reshape([4, 3])
  dataset = tf.data.Dataset.from_tensor_slices(values).batch(2)
  self.assertFalse(dataset.output_shapes.is_fully_defined())

  dataset = dataset_ops.set_batch_size(dataset, 2)
  self.assertEqual([2, 3], dataset.output_shapes)

  next_batch = dataset.make_one_shot_iterator().get_next()
  with self.test_session() as sess:
    batch_value = sess.run(next_batch)
    self.assertAllEqual([[0, 1, 2], [3, 4, 5]], batch_value)
    batch_value = sess.run(next_batch)
    self.assertAllEqual([[6, 7, 8], [9, 10, 11]], batch_value)
    with self.assertRaises(tf.errors.OutOfRangeError):
      sess.run(next_batch)
def testSetBatchSizeNested(self):
  values = {
      "a": 100 + np.arange(4, dtype=np.int32),
      "nest": {
          "b": np.arange(12, dtype=np.int32).reshape([4, 3]),
          "c": np.arange(4, dtype=np.int32)
      }
  }
  dataset = tf.data.Dataset.from_tensor_slices(values).batch(2)
  self.assertItemsEqual(["a", "nest"], dataset.output_shapes.keys())
  self.assertItemsEqual(["b", "c"], dataset.output_shapes["nest"].keys())
  self.assertFalse(dataset.output_shapes["a"].is_fully_defined())
  self.assertFalse(dataset.output_shapes["nest"]["b"].is_fully_defined())
  self.assertFalse(dataset.output_shapes["nest"]["c"].is_fully_defined())

  dataset = dataset_ops.set_batch_size(dataset, 2)
  self.assertItemsEqual(["a", "nest"], dataset.output_shapes.keys())
  self.assertItemsEqual(["b", "c"], dataset.output_shapes["nest"].keys())
  self.assertEqual([2], dataset.output_shapes["a"])
  self.assertEqual([2, 3], dataset.output_shapes["nest"]["b"])
  self.assertEqual([2], dataset.output_shapes["nest"]["c"])

  next_batch = dataset.make_one_shot_iterator().get_next()
  next_a = next_batch["a"]
  next_b = next_batch["nest"]["b"]
  next_c = next_batch["nest"]["c"]
  with self.test_session() as sess:
    a, b, c = sess.run([next_a, next_b, next_c])
    self.assertAllEqual([100, 101], a)
    self.assertAllEqual([[0, 1, 2], [3, 4, 5]], b)
    self.assertAllEqual([0, 1], c)
    a, b, c = sess.run([next_a, next_b, next_c])
    self.assertAllEqual([102, 103], a)
    self.assertAllEqual([[6, 7, 8], [9, 10, 11]], b)
    self.assertAllEqual([2, 3], c)
    with self.assertRaises(tf.errors.OutOfRangeError):
      sess.run(next_batch)
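# Illustrative sketch (not the implementation under test): one way a function
# like `dataset_ops.set_batch_size` could behave consistently with the tests
# above, assuming it only annotates the statically-unknown leading (batch)
# dimension of every component tensor. The name `_set_batch_size_sketch` is
# hypothetical.
def _set_batch_size_sketch(dataset, batch_size):
  """Statically sets the leading dimension of every tensor in `dataset`."""

  def _set_shapes(element):
    # Flatten nested structures (e.g. dicts of dicts) into a flat tensor list.
    for tensor in tf.contrib.framework.nest.flatten(element):
      shape = tensor.shape.as_list()
      shape[0] = batch_size  # Pin the batch dimension.
      tensor.set_shape(shape)  # Mutates the tensor's static shape in place.
    return element

  return dataset.map(_set_shapes)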
def build(self, batch_size):
  """Builds the dataset input pipeline.

  Args:
    batch_size: The number of examples per batch.

  Returns:
    A tf.data.Dataset.

  Raises:
    ValueError: If no files match self.file_pattern.
  """
  file_patterns = self.file_pattern.split(",")
  filenames = []
  for p in file_patterns:
    matches = tf.gfile.Glob(p)
    if not matches:
      raise ValueError("Found no input files matching {}".format(p))
    filenames.extend(matches)
  tf.logging.info(
      "Building input pipeline from %d files matching patterns: %s",
      len(filenames), file_patterns)

  is_training = self.mode == tf.estimator.ModeKeys.TRAIN

  # Create a string dataset of filenames, and possibly shuffle.
  filename_dataset = tf.data.Dataset.from_tensor_slices(filenames)
  if is_training and len(filenames) > 1:
    filename_dataset = filename_dataset.shuffle(len(filenames))

  # Read serialized Example protos.
  dataset = filename_dataset.apply(
      tf.contrib.data.parallel_interleave(
          self.file_reader(), cycle_length=8, block_length=8, sloppy=True))

  if is_training:
    # Shuffle and repeat. Note that shuffle() is before repeat(), so elements
    # are shuffled within each epoch of data, not across epochs.
    if self.config.shuffle_values_buffer > 0:
      dataset = dataset.shuffle(self.config.shuffle_values_buffer)
    dataset = dataset.repeat()

  # Map the parser over the dataset.
  dataset = dataset.map(
      self.create_example_parser(),
      num_parallel_calls=self.config.num_parallel_parser_calls)

  def _prepare_wavenet_inputs(features):
    """Validates features, clips lengths, and adds weights if needed."""
    # Validate feature names.
    required_features = {"autoregressive_input", "conditioning_stack"}
    allowed_features = required_features | {"weights"}
    feature_names = features.keys()
    if not required_features.issubset(feature_names):
      raise ValueError("Features must contain all of: {}. Got: {}".format(
          required_features, feature_names))
    if not allowed_features.issuperset(feature_names):
      raise ValueError("Features can only contain: {}. Got: {}".format(
          allowed_features, feature_names))

    output = {}
    for name, value in features.items():
      # Validate shapes. The output dimension is [num_samples, dim].
      ndims = len(value.shape)
      if ndims == 1:
        # Add an extra dimension: [num_samples] -> [num_samples, 1].
        value = tf.expand_dims(value, -1)
      elif ndims != 2:
        raise ValueError(
            "Features should be 1D or 2D sequences. Got '{}' = {}".format(
                name, value))
      if self.config.max_length:
        value = value[:self.config.max_length]
      output[name] = value

    if "weights" not in output:
      output["weights"] = tf.ones_like(output["autoregressive_input"])
    return output

  dataset = dataset.map(_prepare_wavenet_inputs)

  # Batch the results, with up to batch_size elements per batch.
  dataset = self._batch_and_pad(dataset, batch_size)

  if is_training:
    # The dataset repeats infinitely before batching, so each batch has the
    # maximum number of elements.
    dataset = dataset_ops.set_batch_size(dataset, batch_size)
  elif self.use_tpu and self.mode == tf.estimator.ModeKeys.EVAL:
    # Pad to ensure that each batch has the same number of elements.
    dataset = dataset_ops.pad_dataset_to_batch_size(dataset, batch_size)

  # Prefetch batches.
  buffer_size = (
      self.config.batches_buffer_size or max(1, int(256 / batch_size)))
  dataset = dataset.prefetch(buffer_size)

  return dataset
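# Illustrative sketch (hypothetical, not the actual helper): `_batch_and_pad`
# above is assumed to pad the variable-length sequences within each batch to a
# common length via tf.data's padded_batch. Unknown dimensions in
# `padded_shapes` are padded up to the longest value present in the batch.
def _batch_and_pad_sketch(dataset, batch_size):
  """Batches `dataset`, zero-padding shorter sequences in each batch."""
  return dataset.padded_batch(batch_size, padded_shapes=dataset.output_shapes)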