def testBucketBySeqLength(self):
  """Checks length bucketing of examples read via data_reader.read_examples.

  Reads the EVAL examples from the first file pattern, batches them with
  data_reader.bucket_by_sequence_length and verifies that:
    * every row is its value repeated (value + 1) times, then zero-padded to
      the width of its batch,
    * the values 0..29 each show up exactly once overall,
    * more than one distinct batch size is observed.
  """

  def example_len(ex):
    return tf.shape(ex["inputs"])[0]

  boundaries = [10, 20, 30]
  batch_sizes = [10, 8, 4, 2]
  window_size = 40

  dataset = data_reader.read_examples(
      self.problem,
      self.filepatterns[0],
      32,
      mode=tf.contrib.learn.ModeKeys.EVAL)
  dataset = data_reader.bucket_by_sequence_length(
      dataset, example_len, boundaries, batch_sizes, window_size)
  next_batch = dataset.make_one_shot_iterator().get_next()

  seen_values = []
  seen_batch_sizes = []
  with tf.train.MonitoredSession() as sess:
    # Until OutOfRangeError: there is no explicit exit condition, the loop
    # only ends once the iterator is exhausted.
    while True:
      fetched = sess.run(next_batch)
      padded_inputs = fetched["inputs"]
      batch_size, max_len = padded_inputs.shape
      seen_batch_sizes.append(batch_size)
      for row in padded_inputs:
        value = row[0]
        seen_values.append(value)
        # The inputs were constructed such that they were repeated value+1
        # times (i.e. if the inputs value is 7, the example has 7 repeated 8
        # times).
        run_length = value + 1
        # Check padding
        expected_row = [value] * run_length + [0] * (max_len - run_length)
        self.assertAllEqual(expected_row, row)

  # Check that all inputs came through
  self.assertEqual(list(range(30)), sorted(seen_values))

  # Check that we saw variable batch size
  distinct_batch_sizes = set(seen_batch_sizes)
  self.assertTrue(len(distinct_batch_sizes) > 1)
def testBucketBySeqLength(self):
  """Checks length bucketing of the examples yielded by self.problem.dataset.

  Builds the TRAIN dataset with file shuffling disabled, batches it with
  data_reader.bucket_by_sequence_length and verifies that:
    * every row is its value repeated (value + 1) times, then zero-padded to
      the width of its batch,
    * the values 0..29 each show up exactly once overall,
    * more than one distinct batch size is observed.
  """

  def example_len(ex):
    return tf.shape(ex["inputs"])[0]

  boundaries = [10, 20, 30]
  batch_sizes = [10, 8, 4, 2]

  dataset = self.problem.dataset(
      tf.estimator.ModeKeys.TRAIN,
      data_dir=self.data_dir,
      shuffle_files=False)
  dataset = data_reader.bucket_by_sequence_length(
      dataset, example_len, boundaries, batch_sizes)
  next_batch = dataset.make_one_shot_iterator().get_next()

  seen_values = []
  seen_batch_sizes = []
  with tf.train.MonitoredSession() as sess:
    # Until OutOfRangeError: there is no explicit exit condition, the loop
    # only ends once the iterator is exhausted.
    while True:
      fetched = sess.run(next_batch)
      padded_inputs = fetched["inputs"]
      batch_size, max_len = padded_inputs.shape
      seen_batch_sizes.append(batch_size)
      for row in padded_inputs:
        value = row[0]
        seen_values.append(value)
        # The inputs were constructed such that they were repeated value+1
        # times (i.e. if the inputs value is 7, the example has 7 repeated 8
        # times).
        run_length = value + 1
        # Check padding
        expected_row = [value] * run_length + [0] * (max_len - run_length)
        self.assertAllEqual(expected_row, row)

  # Check that all inputs came through
  self.assertEqual(list(range(30)), sorted(seen_values))

  # Check that we saw variable batch size
  distinct_batch_sizes = set(seen_batch_sizes)
  self.assertTrue(len(distinct_batch_sizes) > 1)
def input_fn(self,  # noqa: C901
             mode,
             hparams,
             data_dir=None,
             params=None,
             config=None,
             dataset_kwargs=None):
  """Builds input pipeline for problem.

  The pipeline reads the problem's dataset, casts int64 features to int32,
  batches (by example count or by length bucket, depending on whether
  batch_size means tokens), standardizes shapes and returns the tensors of a
  one-shot iterator.

  Args:
    mode: tf.estimator.ModeKeys. TRAIN repeats the dataset and skips a random
      fraction of the first data file; PREDICT moves "targets" to
      "infer_targets".
    hparams: HParams, model hparams
    data_dir: str, data directory; if None, will use hparams.data_dir
    params: dict, may include "batch_size"; params["batch_size"] is read
      whenever config.use_tpu is set.
    config: RunConfig; should have the data_parallelism attribute if not using
      TPU
    dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
      method when called. The dict is copied, the caller's object is left
      untouched.

  Returns:
    (features_dict<str name, Tensor feature>, Tensor targets); targets is
    None in PREDICT mode.
  """
  partition_id, num_partitions = self._dataset_partition(mode, config)

  is_training = mode == tf.estimator.ModeKeys.TRAIN
  if config and config.use_tpu:
    num_threads = 64
  else:
    num_threads = 4 if is_training else 1

  max_length = self.max_length(hparams)

  def tpu_valid_size(example):
    return data_reader.example_valid_size(
        example, hparams.min_length, max_length)

  def gpu_valid_size(example):
    # Long sequences are only kept at eval time, and only when the hparams
    # ask for it; 10**9 acts as "no upper bound".
    drop_long_sequences = is_training or hparams.eval_drop_long_sequences
    return data_reader.example_valid_size(
        example, hparams.min_length,
        max_length if drop_long_sequences else 10**9)

  def define_shapes(example):
    batch_size = config and config.use_tpu and params["batch_size"]
    return standardize_shapes(example, batch_size=batch_size)

  # Read and preprocess
  data_dir = data_dir or hparams.data_dir

  # Work on a copy: update() below must not leak the pipeline's own keys
  # into a dict that the caller still owns (and may reuse across calls).
  dataset_kwargs = dict(dataset_kwargs) if dataset_kwargs else {}
  dataset_kwargs.update({
      "mode": mode,
      "data_dir": data_dir,
      "num_threads": num_threads,
      "hparams": hparams,
      "partition_id": partition_id,
      "num_partitions": num_partitions,
  })

  dataset = self.dataset(**dataset_kwargs)
  if is_training:
    # Repeat and skip a random number of records
    dataset = dataset.repeat()
    data_files = tf.contrib.slim.parallel_reader.get_data_files(
        self.filepattern(data_dir, mode))
    # In continuous_train_and_eval when switching between train and
    # eval, this input_fn method gets called multiple times and it
    # would give you the exact same samples from the last call
    # (because the Graph seed is set). So this skip gives you some
    # shuffling.
    dataset = skip_random_fraction(dataset, data_files[0])

  dataset = dataset.map(
      data_reader.cast_int64_to_int32, num_parallel_calls=num_threads)

  if self.batch_size_means_tokens:
    batch_size_means_tokens = True
  elif _are_shapes_fully_defined(dataset.output_shapes):
    batch_size_means_tokens = False
  else:
    tf.logging.warning(
        "Shapes are not fully defined. Assuming batch_size means tokens. "
        "Override batch_size_means_tokens() "
        "in your problem subclass if this is undesired behavior.")
    batch_size_means_tokens = True

  # Batching
  if not batch_size_means_tokens:
    # Batch size means examples per datashard.
    if config and config.use_tpu:
      # on TPU, we use params["batch_size"], which specifies the number of
      # examples across all datashards
      batch_size = params["batch_size"]
      dataset = dataset.apply(
          tf.contrib.data.batch_and_drop_remainder(batch_size))
    else:
      num_shards = (config and config.data_parallelism.n) or 1
      batch_size = hparams.batch_size * num_shards
      dataset = dataset.batch(batch_size)
  else:
    # batch_size means tokens per datashard
    if config and config.use_tpu:
      dataset = dataset.filter(tpu_valid_size)
      padded_shapes = self._pad_for_tpu(dataset.output_shapes, hparams)
      # on TPU, we use params["batch_size"], which specifies the number of
      # examples across all datashards
      batch_size = params["batch_size"]
      dataset = dataset.apply(
          tf.contrib.data.padded_batch_and_drop_remainder(
              batch_size, padded_shapes))
    else:
      # On GPU, bucket by length
      dataset = dataset.filter(gpu_valid_size)
      batching_scheme = data_reader.hparams_to_batching_scheme(
          hparams,
          shard_multiplier=(config and config.data_parallelism.n) or 1,
          length_multiplier=self.get_hparams().batch_size_multiplier)
      if hparams.use_fixed_batch_size:
        # Here batch_size really means examples per datashard.
        batching_scheme["batch_sizes"] = [hparams.batch_size]
        batching_scheme["boundaries"] = []
      dataset = data_reader.bucket_by_sequence_length(
          dataset, data_reader.example_length,
          batching_scheme["boundaries"], batching_scheme["batch_sizes"])

      # NOTE(review): nesting reconstructed from whitespace-collapsed source;
      # eval padding is assumed to belong to the bucketing branch only --
      # confirm against the upstream file.
      if not is_training:

        def _pad_batch(features):
          if not config or config.data_parallelism.n <= 1:
            return features
          tf.logging.warn(
              "Padding the batch to ensure that remainder eval batches have "
              "a batch size divisible by the number of data shards. This may "
              "lead to incorrect metrics for non-zero-padded features, e.g. "
              "images. Use a single datashard (i.e. 1 GPU) in that case.")
          return pad_batch(features, config.data_parallelism.n)

        dataset = dataset.map(_pad_batch, num_parallel_calls=num_threads)

  dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)
  dataset = dataset.prefetch(2)
  features = dataset.make_one_shot_iterator().get_next()
  if not config or not config.use_tpu:
    _summarize_features(features, (config and config.data_parallelism.n) or 1)

  if mode == tf.estimator.ModeKeys.PREDICT:
    features["infer_targets"] = features["targets"]
    features["targets"] = None
    # This is because of a bug in the Estimator that short-circuits prediction
    # if it doesn't see a QueueRunner. DummyQueueRunner implements the
    # minimal expected interface but does nothing.
    tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                         data_reader.DummyQueueRunner())

  return features, features["targets"]
def input_fn(self,
             mode,
             hparams,
             data_dir=None,
             params=None,
             config=None,
             dataset_kwargs=None):
  """Builds input pipeline for problem.

  The pipeline reads the problem's dataset, casts int64 features to int32,
  batches statically-shaped features by example count and variable-length
  features by padding (TPU) or length bucketing (otherwise), standardizes
  shapes and returns the tensors of a one-shot iterator.

  Args:
    mode: tf.estimator.ModeKeys. TRAIN repeats the dataset indefinitely;
      PREDICT moves "targets" to "infer_targets".
    hparams: HParams, model hparams
    data_dir: str, data directory; if None, will use hparams.data_dir
    params: dict, may include "batch_size"; params["batch_size"] is read
      whenever config.use_tpu is set.
    config: RunConfig; should have the data_parallelism attribute if not using
      TPU
    dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
      method when called. The dict is copied, the caller's object is left
      untouched.

  Returns:
    (features_dict<str name, Tensor feature>, Tensor targets); targets is
    None in PREDICT mode.
  """
  is_training = mode == tf.estimator.ModeKeys.TRAIN
  num_threads = 4 if is_training else 1

  def tpu_valid_size(example):
    return data_reader.example_valid_size(
        example, hparams.min_length, hparams.max_length)

  def gpu_valid_size(example):
    # Long sequences are only kept at eval time, and only when the hparams
    # ask for it; 10**9 acts as "no upper bound".
    drop_long_sequences = is_training or hparams.eval_drop_long_sequences
    return data_reader.example_valid_size(
        example, hparams.min_length,
        hparams.max_length if drop_long_sequences else 10**9)

  def define_shapes(example):
    batch_size = config and config.use_tpu and params["batch_size"]
    return standardize_shapes(example, batch_size=batch_size)

  # Read and preprocess
  data_dir = data_dir or hparams.data_dir

  # Work on a copy: update() below must not leak the pipeline's own keys
  # into a dict that the caller still owns (and may reuse across calls).
  dataset_kwargs = dict(dataset_kwargs) if dataset_kwargs else {}
  dataset_kwargs.update({
      "mode": mode,
      "data_dir": data_dir,
      "num_threads": num_threads,
      "hparams": hparams,
  })

  dataset = self.dataset(**dataset_kwargs)
  dataset = dataset.map(
      data_reader.cast_int64_to_int32, num_parallel_calls=num_threads)
  if is_training:
    dataset = dataset.repeat(None)

  # Batching
  if _are_shapes_fully_defined(dataset.output_shapes):
    # Static shape features (e.g. images)
    if config and config.use_tpu:
      tpu_batch_size = params["batch_size"]
      dataset = dataset.apply(
          tf.contrib.data.batch_and_drop_remainder(tpu_batch_size))
    else:
      num_shards = (config and config.data_parallelism.n) or 1
      dataset = dataset.batch(hparams.batch_size * num_shards)
  else:
    # Variable length features
    if config and config.use_tpu:
      # On TPU, pad to hparams.max_length
      dataset = dataset.filter(tpu_valid_size)
      padded_shapes = _fill_shape_nones(
          dataset.output_shapes, none_filler=hparams.max_length)
      dataset = dataset.apply(
          tf.contrib.data.padded_batch_and_drop_remainder(
              params["batch_size"], padded_shapes))
    else:
      # On GPU, bucket by length
      dataset = dataset.filter(gpu_valid_size)
      batching_scheme = data_reader.hparams_to_batching_scheme(
          hparams,
          shard_multiplier=(config and config.data_parallelism.n) or 1,
          length_multiplier=self.get_hparams().batch_size_multiplier)
      if hparams.use_fixed_batch_size:
        # A single bucket with a fixed number of examples per batch.
        batching_scheme["batch_sizes"] = [hparams.batch_size]
        batching_scheme["boundaries"] = []
      dataset = data_reader.bucket_by_sequence_length(
          dataset, data_reader.example_length,
          batching_scheme["boundaries"], batching_scheme["batch_sizes"])

  dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)
  dataset = dataset.prefetch(1)
  features = dataset.make_one_shot_iterator().get_next()
  if not config or not config.use_tpu:
    _summarize_features(features, (config and config.data_parallelism.n) or 1)

  if mode == tf.estimator.ModeKeys.PREDICT:
    features["infer_targets"] = features["targets"]
    features["targets"] = None
    # This is because of a bug in the Estimator that short-circuits prediction
    # if it doesn't see a QueueRunner. DummyQueueRunner implements the
    # minimal expected interface but does nothing.
    tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                         data_reader.DummyQueueRunner())

  return features, features["targets"]
def input_fn(self,
             mode,
             hparams,
             data_dir=None,
             params=None,
             config=None,
             dataset_kwargs=None):
  """Builds input pipeline for problem.

  The pipeline reads the problem's dataset, casts integer features to int32,
  batches (by example count or by length bucket, depending on whether
  batch_size means tokens), standardizes shapes and maps each batch into the
  output structure described under Returns.

  Args:
    mode: tf.estimator.ModeKeys. TRAIN repeats the dataset and skips a random
      fraction of the first data file; PREDICT changes the element structure
      (see Returns).
    hparams: HParams, model hparams
    data_dir: str, data directory; if None, will use hparams.data_dir when
      hparams has that attribute
    params: dict, may include "batch_size"; params["batch_size"] is read
      whenever config.use_tpu is set.
    config: RunConfig; should have the data_parallelism attribute if not using
      TPU
    dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
      method when called. The dict is copied, the caller's object is left
      untouched.

  Returns:
    The batched and prefetched dataset (not iterator tensors). In PREDICT
    mode each element is the features dict with "targets" moved to
    "infer_targets"; in every other mode each element is the tuple
    (features_dict<str name, Tensor feature>, Tensor targets).
  """
  partition_id, num_partitions = self._dataset_partition(mode, config)

  is_training = mode == tf.estimator.ModeKeys.TRAIN
  if config and config.use_tpu:
    num_threads = 64
  else:
    num_threads = 4 if is_training else 1

  max_length = self.max_length(hparams)

  def tpu_valid_size(example):
    return data_reader.example_valid_size(
        example, hparams.min_length, max_length)

  def gpu_valid_size(example):
    # Long sequences are only kept at eval time, and only when the hparams
    # ask for it; 10**9 acts as "no upper bound".
    drop_long_sequences = is_training or hparams.eval_drop_long_sequences
    return data_reader.example_valid_size(
        example, hparams.min_length,
        max_length if drop_long_sequences else 10**9)

  def define_shapes(example):
    batch_size = config and config.use_tpu and params["batch_size"]
    return standardize_shapes(example, batch_size=batch_size)

  # Read and preprocess
  data_dir = data_dir or (hasattr(hparams, "data_dir") and hparams.data_dir)

  # Work on a copy: update() below must not leak the pipeline's own keys
  # into a dict that the caller still owns (and may reuse across calls).
  dataset_kwargs = dict(dataset_kwargs) if dataset_kwargs else {}
  dataset_kwargs.update({
      "mode": mode,
      "data_dir": data_dir,
      "num_threads": num_threads,
      "hparams": hparams,
      "partition_id": partition_id,
      "num_partitions": num_partitions,
  })

  dataset = self.dataset(**dataset_kwargs)
  if is_training:
    # Repeat and skip a random number of records
    dataset = dataset.repeat()
    data_files = tf.contrib.slim.parallel_reader.get_data_files(
        self.filepattern(data_dir, mode))
    # In continuous_train_and_eval when switching between train and
    # eval, this input_fn method gets called multiple times and it
    # would give you the exact same samples from the last call
    # (because the Graph seed is set). So this skip gives you some
    # shuffling.
    dataset = skip_random_fraction(dataset, data_files[0])

  dataset = dataset.map(
      data_reader.cast_ints_to_int32, num_parallel_calls=num_threads)

  if self.batch_size_means_tokens:
    batch_size_means_tokens = True
  elif _are_shapes_fully_defined(dataset.output_shapes):
    batch_size_means_tokens = False
  else:
    tf.logging.warning(
        "Shapes are not fully defined. Assuming batch_size means tokens.")
    batch_size_means_tokens = True

  # Batching
  if not batch_size_means_tokens:
    # Batch size means examples per datashard.
    if config and config.use_tpu:
      # on TPU, we use params["batch_size"], which specifies the number of
      # examples across all datashards
      batch_size = params["batch_size"]
      dataset = dataset.batch(batch_size, drop_remainder=True)
    else:
      num_shards = config.data_parallelism.n if config else 1
      batch_size = hparams.batch_size * num_shards
      dataset = dataset.batch(batch_size)
  else:
    # batch_size means tokens per datashard
    if config and config.use_tpu:
      dataset = dataset.filter(tpu_valid_size)
      padded_shapes = self._pad_for_tpu(dataset.output_shapes, hparams)
      # on TPU, we use params["batch_size"], which specifies the number of
      # examples across all datashards
      batch_size = params["batch_size"]
      dataset = dataset.apply(
          tf.contrib.data.padded_batch_and_drop_remainder(
              batch_size, padded_shapes))
    else:
      # On GPU, bucket by length
      dataset = dataset.filter(gpu_valid_size)
      shard_multiplier = config.data_parallelism.n if config else 1
      batching_scheme = data_reader.hparams_to_batching_scheme(
          hparams,
          shard_multiplier=shard_multiplier,
          length_multiplier=self.get_hparams().batch_size_multiplier)
      if hparams.use_fixed_batch_size:
        # Here batch_size really means examples per datashard.
        batching_scheme["batch_sizes"] = [hparams.batch_size]
        batching_scheme["boundaries"] = []
      dataset = data_reader.bucket_by_sequence_length(
          dataset, data_reader.example_length,
          batching_scheme["boundaries"], batching_scheme["batch_sizes"])

      # Eval padding lives in this branch: shard_multiplier is only bound
      # here, so it cannot run on the other batching paths.
      if not is_training:
        batch_multiple = shard_multiplier
        if hparams.use_fixed_batch_size:
          # Make sure the last batch has the same fixed size as the rest.
          batch_multiple *= hparams.batch_size
        if batch_multiple > 1:
          tf.logging.warn(
              "Padding the batch to ensure that remainder eval batches have "
              "a batch size divisible by the number of data shards. This may "
              "lead to incorrect metrics for non-zero-padded features, e.g. "
              "images. Use a single datashard (i.e. 1 GPU) in that case.")
          dataset = dataset.map(
              functools.partial(pad_batch, batch_multiple=batch_multiple),
              num_parallel_calls=num_threads)

  dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)

  def prepare_for_output(example):
    if not config or not config.use_tpu:
      _summarize_features(example,
                          (config and config.data_parallelism.n) or 1)
    if mode == tf.estimator.ModeKeys.PREDICT:
      example["infer_targets"] = example.pop("targets")
      return example
    else:
      return example, example["targets"]

  dataset = dataset.map(prepare_for_output, num_parallel_calls=num_threads)
  dataset = dataset.prefetch(2)

  if mode == tf.estimator.ModeKeys.PREDICT:
    # This is because of a bug in the Estimator that short-circuits prediction
    # if it doesn't see a QueueRunner. DummyQueueRunner implements the
    # minimal expected interface but does nothing.
    tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                         data_reader.DummyQueueRunner())

  return dataset
def input_fn(self,
             mode,
             hparams,
             data_dir=None,
             params=None,
             config=None,
             dataset_kwargs=None):
  """Builds input pipeline for problem.

  The pipeline reads the problem's dataset, casts int64 features to int32,
  batches statically-shaped features by example count and variable-length
  features by padding (TPU) or length bucketing (otherwise), standardizes
  shapes and returns the tensors of a one-shot iterator.

  Args:
    mode: tf.estimator.ModeKeys. TRAIN repeats the dataset indefinitely;
      PREDICT moves "targets" to "infer_targets".
    hparams: HParams, model hparams
    data_dir: str, data directory; if None, will use hparams.data_dir
    params: dict, may include "batch_size"; params["batch_size"] is read
      whenever config.use_tpu is set.
    config: RunConfig; should have the data_parallelism attribute if not using
      TPU
    dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
      method when called. The dict is copied, the caller's object is left
      untouched.

  Returns:
    (features_dict<str name, Tensor feature>, Tensor targets); targets is
    None in PREDICT mode.
  """
  is_training = mode == tf.estimator.ModeKeys.TRAIN
  num_threads = 4 if is_training else 1

  def tpu_valid_size(example):
    return data_reader.example_valid_size(
        example, hparams.min_length, hparams.max_length)

  def gpu_valid_size(example):
    # Long sequences are only kept at eval time, and only when the hparams
    # ask for it; 10**9 acts as "no upper bound".
    drop_long_sequences = is_training or hparams.eval_drop_long_sequences
    return data_reader.example_valid_size(
        example, hparams.min_length,
        hparams.max_length if drop_long_sequences else 10**9)

  def define_shapes(example):
    batch_size = config and config.use_tpu and params["batch_size"]
    return standardize_shapes(example, batch_size=batch_size)

  # Read and preprocess
  data_dir = data_dir or hparams.data_dir

  # Work on a copy: update() below must not leak the pipeline's own keys
  # into a dict that the caller still owns (and may reuse across calls).
  dataset_kwargs = dict(dataset_kwargs) if dataset_kwargs else {}
  dataset_kwargs.update({
      "mode": mode,
      "data_dir": data_dir,
      "num_threads": num_threads,
      "hparams": hparams,
  })

  dataset = self.dataset(**dataset_kwargs)
  dataset = dataset.map(
      data_reader.cast_int64_to_int32, num_parallel_calls=num_threads)
  if is_training:
    dataset = dataset.repeat(None)

  # Batching
  if _are_shapes_fully_defined(dataset.output_shapes):
    # Static shape features (e.g. images)
    if config and config.use_tpu:
      tpu_batch_size = params["batch_size"]
      dataset = dataset.apply(
          tf.contrib.data.batch_and_drop_remainder(tpu_batch_size))
    else:
      num_shards = (config and config.data_parallelism.n) or 1
      dataset = dataset.batch(hparams.batch_size * num_shards)
  else:
    # Variable length features
    if config and config.use_tpu:
      # On TPU, pad to hparams.max_length
      dataset = dataset.filter(tpu_valid_size)
      padded_shapes = _fill_shape_nones(
          dataset.output_shapes, none_filler=hparams.max_length)
      dataset = dataset.apply(
          tf.contrib.data.padded_batch_and_drop_remainder(
              params["batch_size"], padded_shapes))
    else:
      # On GPU, bucket by length
      dataset = dataset.filter(gpu_valid_size)
      batching_scheme = data_reader.hparams_to_batching_scheme(
          hparams,
          shard_multiplier=(config and config.data_parallelism.n) or 1,
          length_multiplier=self.get_hparams().batch_size_multiplier)
      if hparams.use_fixed_batch_size:
        # A single bucket with a fixed number of examples per batch.
        batching_scheme["batch_sizes"] = [hparams.batch_size]
        batching_scheme["boundaries"] = []
      dataset = data_reader.bucket_by_sequence_length(
          dataset, data_reader.example_length,
          batching_scheme["boundaries"], batching_scheme["batch_sizes"])

      # NOTE(review): nesting reconstructed from whitespace-collapsed source;
      # eval padding is assumed to belong to the bucketing branch only --
      # confirm against the upstream file.
      if not is_training:

        def _pad_batch(features):
          if not config or config.data_parallelism.n <= 1:
            return features
          tf.logging.warn(
              "Padding the batch to ensure that remainder eval batches have "
              "a batch size divisible by the number of data shards. This may "
              "lead to incorrect metrics for non-zero-padded features, e.g. "
              "images. Use a single datashard (i.e. 1 GPU) in that case.")
          return pad_batch(features, config.data_parallelism.n)

        dataset = dataset.map(_pad_batch, num_parallel_calls=num_threads)

  dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)
  dataset = dataset.prefetch(1)
  features = dataset.make_one_shot_iterator().get_next()
  if not config or not config.use_tpu:
    _summarize_features(features, (config and config.data_parallelism.n) or 1)

  if mode == tf.estimator.ModeKeys.PREDICT:
    features["infer_targets"] = features["targets"]
    features["targets"] = None
    # This is because of a bug in the Estimator that short-circuits prediction
    # if it doesn't see a QueueRunner. DummyQueueRunner implements the
    # minimal expected interface but does nothing.
    tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                         data_reader.DummyQueueRunner())

  return features, features["targets"]