def features_for_problem(problem_instance,
                         p_hparams,
                         hparams,
                         data_dir,
                         num_datashards,
                         mode,
                         batch_size=None,
                         dataset_split=None,
                         shard=None,
                         name="problem_inputs"):
  """Feature map for Problem."""
  with tf.name_scope(name):
    with tf.device("/cpu:0"):  # Input reading on CPU
      capacity = (p_hparams.max_expected_batch_size_per_shard * num_datashards)
      batching_scheme = data_reader.hparams_to_batching_scheme(
          hparams,
          shard_multiplier=num_datashards,
          drop_long_sequences=(mode == tf.estimator.ModeKeys.TRAIN or
                               hparams.eval_drop_long_sequences),
          length_multiplier=(p_hparams.batch_size_multiplier))
      if batch_size:
        # If batch_size is fixed, use a single input bucket
        batching_scheme["batch_sizes"] = [batch_size]
        batching_scheme["boundaries"] = []
        # Log new batching scheme if updated
        tf.logging.info("Updated batching_scheme = %s", batching_scheme)
      feature_map = data_reader.input_pipeline(
          problem_instance,
          data_dir,
          capacity,
          mode,
          hparams,
          batching_scheme,
          dataset_split=dataset_split,
          shard=shard)

  # Ensure inputs and targets are proper rank.
  if problem_instance.has_inputs:  # always true for translation
    while len(feature_map["inputs"].get_shape()) != 4:
      feature_map["inputs"] = tf.expand_dims(feature_map["inputs"], axis=-1)
  while len(feature_map["targets"].get_shape()) != 4:
    feature_map["targets"] = tf.expand_dims(feature_map["targets"], axis=-1)

  if problem_instance.has_inputs:
    feature_map["input_space_id"] = tf.constant(p_hparams.input_space_id)
  feature_map["target_space_id"] = tf.constant(p_hparams.target_space_id)

  return feature_map
def features_for_problem(problem_instance,
                         p_hparams,
                         hparams,
                         data_dir,
                         num_datashards,
                         mode,
                         batch_size=None,
                         dataset_split=None,
                         shard=None,
                         name="problem_inputs"):
  """Feature map for Problem."""
  with tf.name_scope(name):
    with tf.device("/cpu:0"):  # Input reading on CPU
      capacity = (p_hparams.max_expected_batch_size_per_shard * num_datashards)
      batching_scheme = data_reader.hparams_to_batching_scheme(
          hparams,
          shard_multiplier=num_datashards,
          drop_long_sequences=(mode == tf.estimator.ModeKeys.TRAIN or
                               hparams.eval_drop_long_sequences),
          length_multiplier=(p_hparams.batch_size_multiplier))
      if batch_size:
        # If batch_size is fixed, use a single input bucket
        batching_scheme["batch_sizes"] = [batch_size]
        batching_scheme["boundaries"] = []
        # Log new batching scheme if updated
        tf.logging.info("Updated batching_scheme = %s", batching_scheme)
      feature_map = data_reader.input_pipeline(
          problem_instance,
          data_dir,
          capacity,
          mode,
          hparams,
          batching_scheme,
          dataset_split=dataset_split,
          shard=shard)

  # Ensure inputs and targets are proper rank.
  if problem_instance.has_inputs:
    while len(feature_map["inputs"].get_shape()) != 4:
      feature_map["inputs"] = tf.expand_dims(feature_map["inputs"], axis=-1)
  while len(feature_map["targets"].get_shape()) != 4:
    feature_map["targets"] = tf.expand_dims(feature_map["targets"], axis=-1)

  if problem_instance.has_inputs:
    feature_map["input_space_id"] = tf.constant(p_hparams.input_space_id)
  feature_map["target_space_id"] = tf.constant(p_hparams.target_space_id)

  return feature_map
def features_for_problem(problem_instance,
                         p_hparams,
                         hparams,
                         data_filepatterns,
                         num_datashards,
                         mode,
                         name="problem_inputs"):
  """Feature map for Problem."""
  with tf.name_scope(name):
    with tf.device("/cpu:0"):  # Input reading on CPU
      capacity = (p_hparams.max_expected_batch_size_per_shard * num_datashards)
      feature_map = data_reader.input_pipeline(
          problem_instance, data_filepatterns, capacity, mode, hparams,
          data_reader.hparams_to_batching_scheme(
              hparams,
              shard_multiplier=num_datashards,
              drop_long_sequences=(mode == tf.estimator.ModeKeys.TRAIN or
                                   hparams.eval_drop_long_sequences),
              length_multiplier=(p_hparams.batch_size_multiplier)))

  # Reverse inputs and targets features if the problem was reversed.
  if problem_instance is not None:
    problem_instance.maybe_reverse_features(feature_map)
    problem_instance.maybe_copy_features(feature_map)
  else:
    if p_hparams.was_reversed:
      inputs = feature_map["inputs"]
      targets = feature_map["targets"]
      feature_map["inputs"] = targets
      feature_map["targets"] = inputs
    # Use the inputs as the targets if the problem is a copy problem.
    if p_hparams.was_copy:
      feature_map["targets"] = feature_map["inputs"]

  # Ensure inputs and targets are proper rank.
  while len(feature_map["inputs"].get_shape()) != 4:
    feature_map["inputs"] = tf.expand_dims(feature_map["inputs"], axis=-1)
  while len(feature_map["targets"].get_shape()) != 4:
    feature_map["targets"] = tf.expand_dims(feature_map["targets"], axis=-1)

  feature_map["input_space_id"] = tf.constant(p_hparams.input_space_id)
  feature_map["target_space_id"] = tf.constant(p_hparams.target_space_id)

  return feature_map
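# --- Usage sketch (not part of the library) ---
# A minimal example of how features_for_problem above might be wrapped into an
# Estimator-style input_fn. `problem`, `problem_hparams`, `model_hparams`,
# `data_dir`, `num_datashards`, and `mode` are placeholders supplied by the
# caller; the wrapper itself is hypothetical.
def make_input_fn(problem, problem_hparams, model_hparams, data_dir,
                  num_datashards, mode):
  def estimator_input_fn():
    feature_map = features_for_problem(problem, problem_hparams, model_hparams,
                                       data_dir, num_datashards, mode)
    # The feature map already holds 4-D "inputs"/"targets" plus the space-id
    # constants; Estimator expects a (features, labels) pair.
    return feature_map, feature_map["targets"]
  return estimator_input_fn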
def input_fn(self,  # noqa: C901
             mode,
             hparams,
             data_dir=None,
             params=None,
             config=None,
             dataset_kwargs=None):
  """Builds input pipeline for problem.

  Args:
    mode: tf.estimator.ModeKeys
    hparams: HParams, model hparams
    data_dir: str, data directory; if None, will use hparams.data_dir
    params: dict, may include "batch_size"
    config: RunConfig; should have the data_parallelism attribute if not using
      TPU
    dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
      method when called

  Returns:
    (features_dict<str name, Tensor feature>, Tensor targets)
  """
  partition_id, num_partitions = self._dataset_partition(mode, config)

  is_training = mode == tf.estimator.ModeKeys.TRAIN
  if config and config.use_tpu:
    num_threads = 64
  else:
    num_threads = 4 if is_training else 1
  max_length = self.max_length(hparams)

  def tpu_valid_size(example):
    return data_reader.example_valid_size(example, hparams.min_length,
                                          max_length)

  def gpu_valid_size(example):
    drop_long_sequences = is_training or hparams.eval_drop_long_sequences
    return data_reader.example_valid_size(
        example, hparams.min_length,
        max_length if drop_long_sequences else 10**9)

  def define_shapes(example):
    batch_size = config and config.use_tpu and params["batch_size"]
    return standardize_shapes(example, batch_size=batch_size)

  # Read and preprocess
  data_dir = data_dir or hparams.data_dir

  dataset_kwargs = dataset_kwargs or {}
  dataset_kwargs.update({
      "mode": mode,
      "data_dir": data_dir,
      "num_threads": num_threads,
      "hparams": hparams,
      "partition_id": partition_id,
      "num_partitions": num_partitions,
  })

  dataset = self.dataset(**dataset_kwargs)
  if is_training:
    # Repeat and skip a random number of records
    dataset = dataset.repeat()
    data_files = tf.contrib.slim.parallel_reader.get_data_files(
        self.filepattern(data_dir, mode))
    # In continuous_train_and_eval when switching between train and
    # eval, this input_fn method gets called multiple times and it
    # would give you the exact same samples from the last call
    # (because the Graph seed is set). So this skip gives you some
    # shuffling.
    dataset = skip_random_fraction(dataset, data_files[0])

  dataset = dataset.map(
      data_reader.cast_int64_to_int32, num_parallel_calls=num_threads)

  if self.batch_size_means_tokens:
    batch_size_means_tokens = True
  else:
    if _are_shapes_fully_defined(dataset.output_shapes):
      batch_size_means_tokens = False
    else:
      tf.logging.warning(
          "Shapes are not fully defined. Assuming batch_size means tokens. "
          "Override batch_size_means_tokens() "
          "in your problem subclass if this is undesired behavior.")
      batch_size_means_tokens = True

  # Batching
  if not batch_size_means_tokens:
    # Batch size means examples per datashard.
    if config and config.use_tpu:
      # on TPU, we use params["batch_size"], which specifies the number of
      # examples across all datashards
      batch_size = params["batch_size"]
      dataset = dataset.apply(
          tf.contrib.data.batch_and_drop_remainder(batch_size))
    else:
      num_shards = (config and config.data_parallelism.n) or 1
      batch_size = hparams.batch_size * num_shards
      dataset = dataset.batch(batch_size)
  else:
    # batch_size means tokens per datashard
    if config and config.use_tpu:
      dataset = dataset.filter(tpu_valid_size)
      padded_shapes = self._pad_for_tpu(dataset.output_shapes, hparams)
      # on TPU, we use params["batch_size"], which specifies the number of
      # examples across all datashards
      batch_size = params["batch_size"]
      dataset = dataset.apply(
          tf.contrib.data.padded_batch_and_drop_remainder(
              batch_size, padded_shapes))
    else:
      # On GPU, bucket by length
      dataset = dataset.filter(gpu_valid_size)
      batching_scheme = data_reader.hparams_to_batching_scheme(
          hparams,
          shard_multiplier=(config and config.data_parallelism.n) or 1,
          length_multiplier=self.get_hparams().batch_size_multiplier)
      if hparams.use_fixed_batch_size:
        # Here batch_size really means examples per datashard.
        batching_scheme["batch_sizes"] = [hparams.batch_size]
        batching_scheme["boundaries"] = []
      dataset = data_reader.bucket_by_sequence_length(
          dataset, data_reader.example_length, batching_scheme["boundaries"],
          batching_scheme["batch_sizes"])

      if not is_training:

        def _pad_batch(features):
          if not config or config.data_parallelism.n <= 1:
            return features
          tf.logging.warn(
              "Padding the batch to ensure that remainder eval batches have "
              "a batch size divisible by the number of data shards. This may "
              "lead to incorrect metrics for non-zero-padded features, e.g. "
              "images. Use a single datashard (i.e. 1 GPU) in that case.")
          return pad_batch(features, config.data_parallelism.n)

        dataset = dataset.map(_pad_batch, num_parallel_calls=num_threads)

  dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)
  dataset = dataset.prefetch(2)
  features = dataset.make_one_shot_iterator().get_next()
  if not config or not config.use_tpu:
    _summarize_features(features, (config and config.data_parallelism.n) or 1)

  if mode == tf.estimator.ModeKeys.PREDICT:
    features["infer_targets"] = features["targets"]
    features["targets"] = None
    # This is because of a bug in the Estimator that short-circuits prediction
    # if it doesn't see a QueueRunner. DummyQueueRunner implements the
    # minimal expected interface but does nothing.
    tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                         data_reader.DummyQueueRunner())

  return features, features["targets"]
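# --- Helper sketch (assumption, not shown in this excerpt) ---
# input_fn above calls _are_shapes_fully_defined(dataset.output_shapes) to
# decide whether batch_size counts examples or tokens. A plausible
# implementation simply checks that every feature's TensorShape is static:
def _are_shapes_fully_defined(shapes_dict):
  for shape in shapes_dict.values():
    if not shape.is_fully_defined():
      return False
  return True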
def input_fn(self,
             mode,
             hparams,
             data_dir=None,
             params=None,
             config=None,
             force_repeat=False,
             prevent_repeat=False,
             dataset_kwargs=None):
  """Builds input pipeline for problem.

  Args:
    mode: tf.estimator.ModeKeys
    hparams: HParams, model hparams
    data_dir: str, data directory; if None, will use hparams.data_dir
    params: dict, may include "batch_size"
    config: RunConfig; should have the data_parallelism attribute if not using
      TPU
    force_repeat: bool, whether to repeat the data even if not training
    prevent_repeat: bool, whether to not repeat when in training mode.
      Overrides force_repeat.
    dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
      method when called

  Returns:
    (features_dict<str name, Tensor feature>, Tensor targets)
  """
  partition_id, num_partitions = self._dataset_partition(mode, config)

  is_training = mode == tf.estimator.ModeKeys.TRAIN
  if config and config.use_tpu:
    num_threads = 64
  else:
    num_threads = cpu_count() if is_training else 1

  if config and hasattr(config,
                        "data_parallelism") and config.data_parallelism:
    num_shards = config.data_parallelism.n
  else:
    num_shards = 1

  max_length = self.max_length(hparams)
  mlperf_log.transformer_print(
      key=mlperf_log.INPUT_MAX_LENGTH, value=max_length)

  def tpu_valid_size(example):
    return data_reader.example_valid_size(example, hparams.min_length,
                                          max_length)

  def gpu_valid_size(example):
    drop_long_sequences = is_training or hparams.eval_drop_long_sequences
    return data_reader.example_valid_size(
        example, hparams.min_length,
        max_length if drop_long_sequences else 10**9)

  def define_shapes(example):
    batch_size = config and config.use_tpu and params["batch_size"]
    return standardize_shapes(example, batch_size=batch_size)

  # Read and preprocess
  data_dir = data_dir or (hasattr(hparams, "data_dir") and hparams.data_dir)

  dataset_kwargs = dataset_kwargs or {}
  dataset_kwargs.update({
      "mode": mode,
      "data_dir": data_dir,
      "num_threads": num_threads,
      "hparams": hparams,
      "partition_id": partition_id,
      "num_partitions": num_partitions,
  })

  dataset = self.dataset(**dataset_kwargs)
  if (force_repeat or is_training) and not prevent_repeat:
    # Repeat and skip a random number of records
    dataset = dataset.repeat()

  if is_training and self.skip_random_fraction_when_training:
    data_files = tf.contrib.slim.parallel_reader.get_data_files(
        self.filepattern(data_dir, mode))
    # In continuous_train_and_eval when switching between train and
    # eval, this input_fn method gets called multiple times and it
    # would give you the exact same samples from the last call
    # (because the Graph seed is set). So this skip gives you some
    # shuffling.
    dataset = skip_random_fraction(dataset, data_files[0])

  dataset = dataset.map(
      data_reader.cast_ints_to_int32, num_parallel_calls=num_threads)

  if self.batch_size_means_tokens:
    batch_size_means_tokens = True
  else:
    if _are_shapes_fully_defined(dataset.output_shapes):
      batch_size_means_tokens = False
    else:
      tf.logging.warning(
          "Shapes are not fully defined. Assuming batch_size means tokens.")
      batch_size_means_tokens = True

  # Batching
  if not batch_size_means_tokens:
    # Batch size means examples per datashard.
    if config and config.use_tpu:
      # on TPU, we use params["batch_size"], which specifies the number of
      # examples across all datashards
      batch_size = params["batch_size"]
      dataset = dataset.batch(batch_size, drop_remainder=True)
    else:
      batch_size = hparams.batch_size * num_shards
      dataset = dataset.batch(batch_size)
  else:
    # batch_size means tokens per datashard
    if config and config.use_tpu:
      dataset = dataset.filter(tpu_valid_size)
      padded_shapes = self._pad_for_tpu(dataset.output_shapes, hparams)
      # on TPU, we use params["batch_size"], which specifies the number of
      # examples across all datashards
      batch_size = params["batch_size"]
      if hparams.pad_batch:
        tf.logging.warn(
            "Padding the batch to ensure that remainder eval batches are "
            "processed. This may lead to incorrect metrics for "
            "non-zero-padded features, e.g. images. Use a smaller batch "
            "size that has no remainder in that case.")
        dataset = dataset.padded_batch(
            batch_size, padded_shapes, drop_remainder=False)
        dataset = dataset.map(
            functools.partial(pad_batch, batch_multiple=batch_size),
            num_parallel_calls=num_threads)
      else:
        dataset = dataset.padded_batch(
            batch_size, padded_shapes, drop_remainder=True)
    else:
      # On GPU, bucket by length
      dataset = dataset.filter(gpu_valid_size)
      batching_scheme = data_reader.hparams_to_batching_scheme(
          hparams,
          shard_multiplier=num_shards,
          length_multiplier=self.get_hparams().batch_size_multiplier)
      if hparams.use_fixed_batch_size:
        # Here batch_size really means examples per datashard.
        batching_scheme["batch_sizes"] = [hparams.batch_size]
        batching_scheme["boundaries"] = []
      dataset = dataset.apply(
          tf.contrib.data.bucket_by_sequence_length(
              data_reader.example_length, batching_scheme["boundaries"],
              batching_scheme["batch_sizes"]))

      if not is_training:
        batch_multiple = num_shards
        if hparams.use_fixed_batch_size:
          # Make sure the last batch has the same fixed size as the rest.
          batch_multiple *= hparams.batch_size
        if batch_multiple > 1:
          tf.logging.warn(
              "Padding the batch to ensure that remainder eval batches have "
              "a batch size divisible by the number of data shards. This may "
              "lead to incorrect metrics for non-zero-padded features, e.g. "
              "images. Use a single datashard (i.e. 1 GPU) in that case.")
          dataset = dataset.map(
              functools.partial(pad_batch, batch_multiple=batch_multiple),
              num_parallel_calls=num_threads)

  dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)

  # Add shuffling for training batches. This is necessary along with record
  # level shuffling in the dataset generation. Record shuffling will shuffle
  # the examples. However, in some cases, it's possible that the shuffle
  # buffer size for record shuffling is smaller than the batch size. In such
  # cases, adding batch shuffling ensures that the data is in random order
  # during training
  if (is_training and hasattr(hparams, "batch_shuffle_size") and
      hparams.batch_shuffle_size):
    dataset = dataset.shuffle(hparams.batch_shuffle_size)

  def prepare_for_output(example):
    if not config or not config.use_tpu:
      _summarize_features(example, num_shards)
    if mode == tf.estimator.ModeKeys.PREDICT:
      example["infer_targets"] = example.pop("targets")
      return example
    else:
      return example, example["targets"]

  dataset = dataset.map(prepare_for_output, num_parallel_calls=num_threads)
  dataset = dataset.prefetch(2)

  if mode == tf.estimator.ModeKeys.PREDICT:
    # This is because of a bug in the Estimator that short-circuits prediction
    # if it doesn't see a QueueRunner. DummyQueueRunner implements the
    # minimal expected interface but does nothing.
    tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                         data_reader.DummyQueueRunner())

  return dataset
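# --- Usage sketch (not part of the library) ---
# This variant of input_fn returns a tf.data.Dataset rather than a
# (features, targets) tuple. After prepare_for_output, each element is a
# (features, targets) pair in TRAIN/EVAL mode and a bare features dict (with
# "infer_targets") in PREDICT mode. A hypothetical direct consumer, where
# `problem`, `model_hparams`, and `data_dir` are supplied by the caller:
def run_one_eval_batch(problem, model_hparams, data_dir):
  dataset = problem.input_fn(
      tf.estimator.ModeKeys.EVAL, model_hparams, data_dir=data_dir)
  features, targets = dataset.make_one_shot_iterator().get_next()
  with tf.Session() as sess:
    return sess.run([features, targets])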
def input_fn():
  """Supplies input to our model.

  This function supplies input to our model, where this input is a
  function of the mode. For example, we supply different data if
  we're performing training versus evaluation.

  Returns:
    A tuple consisting of 1) a dictionary of tensors whose keys are
    the feature names, and 2) a tensor of target labels if the mode
    is not INFER (and None, otherwise).

  Raises:
    ValueError: if one of the parameters has an unsupported value.
  """
  problem_count, batches = len(hparams.problems), []
  with tf.name_scope("input_reader"):
    for n in xrange(problem_count):
      if fixed_problem is not None and n != fixed_problem:
        continue
      problem_instance = hparams.problem_instances[n]
      p_hparams = hparams.problems[n]
      with tf.name_scope("problem_%d" % n):
        with tf.device("/cpu:0"):  # Input reading on CPU
          capacity = (
              p_hparams.max_expected_batch_size_per_shard * num_datashards)
          feature_map = data_reader.input_pipeline(
              problem_instance, data_file_patterns and data_file_patterns[n],
              capacity, mode, hparams,
              data_reader.hparams_to_batching_scheme(
                  hparams,
                  shard_multiplier=num_datashards,
                  drop_long_sequences=(
                      mode == tf.estimator.ModeKeys.TRAIN or
                      hparams.eval_drop_long_sequences),
                  length_multiplier=(p_hparams.batch_size_multiplier)))

          # Reverse inputs and targets features if the problem was reversed.
          if problem_instance is not None:
            problem_instance.maybe_reverse_features(feature_map)
            problem_instance.maybe_copy_features(feature_map)
          else:
            if p_hparams.was_reversed:
              inputs = feature_map["inputs"]
              targets = feature_map["targets"]
              feature_map["inputs"] = targets
              feature_map["targets"] = inputs
            # Use the inputs as the targets if the problem is a copy problem.
            if p_hparams.was_copy:
              feature_map["targets"] = feature_map["inputs"]

          # Ensure inputs and targets are proper rank.
          while len(feature_map["inputs"].get_shape()) != 4:
            feature_map["inputs"] = tf.expand_dims(
                feature_map["inputs"], axis=-1)
          while len(feature_map["targets"].get_shape()) != 4:
            feature_map["targets"] = tf.expand_dims(
                feature_map["targets"], axis=-1)

          batches.append(
              (feature_map["inputs"], feature_map["targets"], tf.constant(n),
               tf.constant(p_hparams.input_space_id),
               tf.constant(p_hparams.target_space_id)))

  # We choose which problem to process.
  loss_moving_avgs = []  # Need loss moving averages for that.
  for n in xrange(problem_count):
    with tf.variable_scope("losses_avg"):
      loss_moving_avgs.append(
          tf.get_variable(
              "problem_%d/total_loss" % n, initializer=100.0, trainable=False))

  if fixed_problem is None:
    if (hparams.problem_choice == "uniform" or
        mode != tf.estimator.ModeKeys.TRAIN):
      problem_choice = tf.random_uniform(
          [], maxval=problem_count, dtype=tf.int32)
    elif hparams.problem_choice == "adaptive":
      loss_moving_avgs = tf.stack(loss_moving_avgs)
      problem_choice = tf.multinomial(
          tf.reshape(loss_moving_avgs, [1, -1]), 1)
      problem_choice = tf.to_int32(tf.squeeze(problem_choice))
    elif hparams.problem_choice == "distributed":
      assert worker_replicas >= problem_count
      assert worker_replicas % problem_count == 0
      problem_choice = tf.to_int32(worker_id % problem_count)
    else:
      raise ValueError(
          "Value of hparams.problem_choice is %s and must be "
          "one of [uniform, adaptive, distributed]" % hparams.problem_choice)

    # Inputs and targets conditional on problem_choice.
    rand_inputs, rand_target, choice, inp_id, tgt_id = cond_on_index(
        lambda n: batches[n], problem_choice, 0, problem_count - 1)
  else:
    problem_choice = tf.constant(fixed_problem)
    # Take the only constructed batch, which is the fixed_problem.
    rand_inputs, rand_target, choice, inp_id, tgt_id = batches[0]

  # Set shapes so the ranks are clear.
  rand_inputs.set_shape([None, None, None, None])
  rand_target.set_shape([None, None, None, None])
  choice.set_shape([])
  inp_id.set_shape([])
  tgt_id.set_shape([])
  # Forced shape obfuscation is necessary for inference.
  if mode == tf.estimator.ModeKeys.PREDICT:
    rand_inputs._shape = tf.TensorShape([None, None, None, None])  # pylint: disable=protected-access
    rand_target._shape = tf.TensorShape([None, None, None, None])  # pylint: disable=protected-access

  # Final feature map.
  rand_feature_map = {
      "inputs": rand_inputs,
      "problem_choice": choice,
      "input_space_id": inp_id,
      "target_space_id": tgt_id
  }
  if mode == tf.estimator.ModeKeys.PREDICT:
    rand_feature_map["infer_targets"] = rand_target
    rand_target = None
    # This is because of a bug in the Estimator that short-circuits prediction
    # if it doesn't see a QueueRunner. DummyQueueRunner implements the
    # minimal expected interface but does nothing.
    tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS, DummyQueueRunner())
  return rand_feature_map, rand_target
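# --- Helper sketch (assumption, consistent with the call site above) ---
# cond_on_index(lambda n: batches[n], problem_choice, 0, problem_count - 1)
# suggests a helper that evaluates fn(i) for the runtime value of index_tensor
# by chaining tf.cond branches over the integer range. A plausible version:
def cond_on_index(fn, index_tensor, cur, max_idx):
  """Return fn(cur) for the branch where index_tensor == cur, recursively."""
  if cur == max_idx:
    return fn(cur)
  return tf.cond(
      tf.equal(index_tensor, cur),
      lambda: fn(cur),
      lambda: cond_on_index(fn, index_tensor, cur + 1, max_idx))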
def input_fn(self,
             mode,
             hparams,
             data_dir=None,
             params=None,
             config=None,
             dataset_kwargs=None):
  """Builds input pipeline for problem.

  Args:
    mode: tf.estimator.ModeKeys
    hparams: HParams, model hparams
    data_dir: str, data directory; if None, will use hparams.data_dir
    params: dict, may include "batch_size"
    config: RunConfig; should have the data_parallelism attribute if not using
      TPU
    dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
      method when called

  Returns:
    (features_dict<str name, Tensor feature>, Tensor targets)
  """
  is_training = mode == tf.estimator.ModeKeys.TRAIN
  num_threads = 4 if is_training else 1

  def tpu_valid_size(example):
    return data_reader.example_valid_size(example, hparams.min_length,
                                          hparams.max_length)

  def gpu_valid_size(example):
    drop_long_sequences = is_training or hparams.eval_drop_long_sequences
    return data_reader.example_valid_size(
        example, hparams.min_length,
        hparams.max_length if drop_long_sequences else 10**9)

  def define_shapes(example):
    batch_size = config and config.use_tpu and params["batch_size"]
    return standardize_shapes(example, batch_size=batch_size)

  # Read and preprocess
  data_dir = data_dir or hparams.data_dir

  dataset_kwargs = dataset_kwargs or {}
  dataset_kwargs.update({
      "mode": mode,
      "data_dir": data_dir,
      "num_threads": num_threads,
      "hparams": hparams
  })

  dataset = self.dataset(**dataset_kwargs)
  dataset = dataset.map(
      data_reader.cast_int64_to_int32, num_parallel_calls=num_threads)
  if is_training:
    dataset = dataset.repeat(None)

  # Batching
  if _are_shapes_fully_defined(dataset.output_shapes):
    # Static shape features (e.g. images)
    if config and config.use_tpu:
      tpu_batch_size = params["batch_size"]
      dataset = dataset.apply(
          tf.contrib.data.batch_and_drop_remainder(tpu_batch_size))
    else:
      num_shards = (config and config.data_parallelism.n) or 1
      dataset = dataset.batch(hparams.batch_size * num_shards)
  else:
    # Variable length features
    if config and config.use_tpu:
      # On TPU, pad to hparams.max_length
      dataset = dataset.filter(tpu_valid_size)
      padded_shapes = _fill_shape_nones(
          dataset.output_shapes, none_filler=hparams.max_length)
      dataset = dataset.apply(
          tf.contrib.data.padded_batch_and_drop_remainder(
              params["batch_size"], padded_shapes))
    else:
      # On GPU, bucket by length
      dataset = dataset.filter(gpu_valid_size)
      batching_scheme = data_reader.hparams_to_batching_scheme(
          hparams,
          shard_multiplier=(config and config.data_parallelism.n) or 1,
          length_multiplier=self.get_hparams().batch_size_multiplier)
      if hparams.use_fixed_batch_size:
        batching_scheme["batch_sizes"] = [hparams.batch_size]
        batching_scheme["boundaries"] = []
      dataset = data_reader.bucket_by_sequence_length(
          dataset, data_reader.example_length, batching_scheme["boundaries"],
          batching_scheme["batch_sizes"])

  dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)
  dataset = dataset.prefetch(1)
  features = dataset.make_one_shot_iterator().get_next()
  if not config or not config.use_tpu:
    _summarize_features(features, (config and config.data_parallelism.n) or 1)

  if mode == tf.estimator.ModeKeys.PREDICT:
    features["infer_targets"] = features["targets"]
    features["targets"] = None
    # This is because of a bug in the Estimator that short-circuits prediction
    # if it doesn't see a QueueRunner. DummyQueueRunner implements the
    # minimal expected interface but does nothing.
    tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                         data_reader.DummyQueueRunner())

  return features, features["targets"]
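# --- Helper sketch (assumption, not shown in this excerpt) ---
# The TPU branch above pads variable-length features to hparams.max_length via
# _fill_shape_nones. A plausible implementation replaces every unknown
# dimension of each output shape with the supplied filler value:
def _fill_shape_nones(shapes_dict, none_filler=None):
  padded_shapes = {}
  for key, shape in shapes_dict.items():
    padded_shapes[key] = [
        dim if dim is not None else none_filler for dim in shape.as_list()
    ]
  return padded_shapes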
def input_fn(self,
             mode,
             hparams,
             data_dir=None,
             params=None,
             config=None,
             dataset_kwargs=None):
  """Builds input pipeline for problem.

  Args:
    mode: tf.estimator.ModeKeys
    hparams: HParams, model hparams
    data_dir: str, data directory; if None, will use hparams.data_dir
    params: dict, may include "batch_size"
    config: RunConfig; should have the data_parallelism attribute if not using
      TPU
    dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
      method when called

  Returns:
    (features_dict<str name, Tensor feature>, Tensor targets)
  """
  partition_id, num_partitions = self._dataset_partition(mode, config)

  is_training = mode == tf.estimator.ModeKeys.TRAIN
  if config and config.use_tpu:
    num_threads = 64
  else:
    num_threads = 4 if is_training else 1
  max_length = self.max_length(hparams)

  def tpu_valid_size(example):
    return data_reader.example_valid_size(example, hparams.min_length,
                                          max_length)

  def gpu_valid_size(example):
    drop_long_sequences = is_training or hparams.eval_drop_long_sequences
    return data_reader.example_valid_size(
        example, hparams.min_length,
        max_length if drop_long_sequences else 10**9)

  def define_shapes(example):
    batch_size = config and config.use_tpu and params["batch_size"]
    return standardize_shapes(example, batch_size=batch_size)

  # Read and preprocess
  data_dir = data_dir or (hasattr(hparams, "data_dir") and hparams.data_dir)

  dataset_kwargs = dataset_kwargs or {}
  dataset_kwargs.update({
      "mode": mode,
      "data_dir": data_dir,
      "num_threads": num_threads,
      "hparams": hparams,
      "partition_id": partition_id,
      "num_partitions": num_partitions,
  })

  dataset = self.dataset(**dataset_kwargs)
  if is_training:
    # Repeat and skip a random number of records
    dataset = dataset.repeat()
    data_files = tf.contrib.slim.parallel_reader.get_data_files(
        self.filepattern(data_dir, mode))
    # In continuous_train_and_eval when switching between train and
    # eval, this input_fn method gets called multiple times and it
    # would give you the exact same samples from the last call
    # (because the Graph seed is set). So this skip gives you some
    # shuffling.
    dataset = skip_random_fraction(dataset, data_files[0])

  dataset = dataset.map(
      data_reader.cast_ints_to_int32, num_parallel_calls=num_threads)

  if self.batch_size_means_tokens:
    batch_size_means_tokens = True
  else:
    if _are_shapes_fully_defined(dataset.output_shapes):
      batch_size_means_tokens = False
    else:
      tf.logging.warning(
          "Shapes are not fully defined. Assuming batch_size means tokens.")
      batch_size_means_tokens = True

  # Batching
  if not batch_size_means_tokens:
    # Batch size means examples per datashard.
    if config and config.use_tpu:
      # on TPU, we use params["batch_size"], which specifies the number of
      # examples across all datashards
      batch_size = params["batch_size"]
      dataset = dataset.batch(batch_size, drop_remainder=True)
    else:
      num_shards = config.data_parallelism.n if config else 1
      batch_size = hparams.batch_size * num_shards
      dataset = dataset.batch(batch_size)
  else:
    # batch_size means tokens per datashard
    if config and config.use_tpu:
      dataset = dataset.filter(tpu_valid_size)
      padded_shapes = self._pad_for_tpu(dataset.output_shapes, hparams)
      # on TPU, we use params["batch_size"], which specifies the number of
      # examples across all datashards
      batch_size = params["batch_size"]
      dataset = dataset.apply(
          tf.contrib.data.padded_batch_and_drop_remainder(
              batch_size, padded_shapes))
    else:
      # On GPU, bucket by length
      dataset = dataset.filter(gpu_valid_size)
      shard_multiplier = config.data_parallelism.n if config else 1
      batching_scheme = data_reader.hparams_to_batching_scheme(
          hparams,
          shard_multiplier=shard_multiplier,
          length_multiplier=self.get_hparams().batch_size_multiplier)
      if hparams.use_fixed_batch_size:
        # Here batch_size really means examples per datashard.
        batching_scheme["batch_sizes"] = [hparams.batch_size]
        batching_scheme["boundaries"] = []
      dataset = data_reader.bucket_by_sequence_length(
          dataset, data_reader.example_length, batching_scheme["boundaries"],
          batching_scheme["batch_sizes"])

      if not is_training:
        batch_multiple = shard_multiplier
        if hparams.use_fixed_batch_size:
          # Make sure the last batch has the same fixed size as the rest.
          batch_multiple *= hparams.batch_size
        if batch_multiple > 1:
          tf.logging.warn(
              "Padding the batch to ensure that remainder eval batches have "
              "a batch size divisible by the number of data shards. This may "
              "lead to incorrect metrics for non-zero-padded features, e.g. "
              "images. Use a single datashard (i.e. 1 GPU) in that case.")
          dataset = dataset.map(
              functools.partial(pad_batch, batch_multiple=batch_multiple),
              num_parallel_calls=num_threads)

  dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)

  def prepare_for_output(example):
    if not config or not config.use_tpu:
      _summarize_features(example,
                          (config and config.data_parallelism.n) or 1)
    if mode == tf.estimator.ModeKeys.PREDICT:
      example["infer_targets"] = example.pop("targets")
      return example
    else:
      return example, example["targets"]

  dataset = dataset.map(prepare_for_output, num_parallel_calls=num_threads)
  dataset = dataset.prefetch(2)

  if mode == tf.estimator.ModeKeys.PREDICT:
    # This is because of a bug in the Estimator that short-circuits prediction
    # if it doesn't see a QueueRunner. DummyQueueRunner implements the
    # minimal expected interface but does nothing.
    tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                         data_reader.DummyQueueRunner())

  return dataset
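# --- Helper sketch (assumption, not shown in this excerpt) ---
# The eval path above maps pad_batch(features, batch_multiple) over the
# batched dataset so that every batch size is divisible by the number of data
# shards. A plausible implementation zero-pads the batch dimension of every
# feature up to the next multiple:
def pad_batch(features, batch_multiple):
  """Pad the batch dimension of all features to a multiple of batch_multiple."""
  first_feature = list(features.values())[0]
  batch_size = tf.shape(first_feature)[0]
  mod = batch_size % batch_multiple
  has_mod = tf.cast(tf.cast(mod, tf.bool), tf.int32)
  batch_padding = batch_multiple * has_mod - mod

  padded_features = {}
  for name, feature in features.items():
    rank = len(feature.shape)
    paddings = [[0, 0] for _ in range(rank)]
    paddings[0][1] = batch_padding
    padded_features[name] = tf.pad(feature, paddings)
  return padded_features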
def input_fn(self,
             mode,
             hparams,
             data_dir=None,
             params=None,
             config=None,
             dataset_kwargs=None):
  """Builds input pipeline for problem.

  Args:
    mode: tf.estimator.ModeKeys
    hparams: HParams, model hparams
    data_dir: str, data directory; if None, will use hparams.data_dir
    params: dict, may include "batch_size"
    config: RunConfig; should have the data_parallelism attribute if not using
      TPU
    dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
      method when called

  Returns:
    (features_dict<str name, Tensor feature>, Tensor targets)
  """
  is_training = mode == tf.estimator.ModeKeys.TRAIN
  num_threads = 4 if is_training else 1

  def tpu_valid_size(example):
    return data_reader.example_valid_size(example, hparams.min_length,
                                          hparams.max_length)

  def gpu_valid_size(example):
    drop_long_sequences = is_training or hparams.eval_drop_long_sequences
    return data_reader.example_valid_size(
        example, hparams.min_length,
        hparams.max_length if drop_long_sequences else 10**9)

  def define_shapes(example):
    batch_size = config and config.use_tpu and params["batch_size"]
    return standardize_shapes(example, batch_size=batch_size)

  # Read and preprocess
  data_dir = data_dir or hparams.data_dir

  dataset_kwargs = dataset_kwargs or {}
  dataset_kwargs.update({
      "mode": mode,
      "data_dir": data_dir,
      "num_threads": num_threads,
      "hparams": hparams
  })

  dataset = self.dataset(**dataset_kwargs)
  dataset = dataset.map(
      data_reader.cast_int64_to_int32, num_parallel_calls=num_threads)
  if is_training:
    dataset = dataset.repeat(None)

  # Batching
  if _are_shapes_fully_defined(dataset.output_shapes):
    # Static shape features (e.g. images)
    if config and config.use_tpu:
      tpu_batch_size = params["batch_size"]
      dataset = dataset.apply(
          tf.contrib.data.batch_and_drop_remainder(tpu_batch_size))
    else:
      num_shards = (config and config.data_parallelism.n) or 1
      dataset = dataset.batch(hparams.batch_size * num_shards)
  else:
    # Variable length features
    if config and config.use_tpu:
      # On TPU, pad to hparams.max_length
      dataset = dataset.filter(tpu_valid_size)
      padded_shapes = _fill_shape_nones(
          dataset.output_shapes, none_filler=hparams.max_length)
      dataset = dataset.apply(
          tf.contrib.data.padded_batch_and_drop_remainder(
              params["batch_size"], padded_shapes))
    else:
      # On GPU, bucket by length
      dataset = dataset.filter(gpu_valid_size)
      batching_scheme = data_reader.hparams_to_batching_scheme(
          hparams,
          shard_multiplier=(config and config.data_parallelism.n) or 1,
          length_multiplier=self.get_hparams().batch_size_multiplier)
      if hparams.use_fixed_batch_size:
        batching_scheme["batch_sizes"] = [hparams.batch_size]
        batching_scheme["boundaries"] = []
      dataset = data_reader.bucket_by_sequence_length(
          dataset, data_reader.example_length, batching_scheme["boundaries"],
          batching_scheme["batch_sizes"])

      if not is_training:

        def _pad_batch(features):
          if not config or config.data_parallelism.n <= 1:
            return features
          tf.logging.warn(
              "Padding the batch to ensure that remainder eval batches have "
              "a batch size divisible by the number of data shards. This may "
              "lead to incorrect metrics for non-zero-padded features, e.g. "
              "images. Use a single datashard (i.e. 1 GPU) in that case.")
          return pad_batch(features, config.data_parallelism.n)

        dataset = dataset.map(_pad_batch, num_parallel_calls=num_threads)

  dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)
  dataset = dataset.prefetch(1)
  features = dataset.make_one_shot_iterator().get_next()
  if not config or not config.use_tpu:
    _summarize_features(features, (config and config.data_parallelism.n) or 1)

  if mode == tf.estimator.ModeKeys.PREDICT:
    features["infer_targets"] = features["targets"]
    features["targets"] = None
    # This is because of a bug in the Estimator that short-circuits prediction
    # if it doesn't see a QueueRunner. DummyQueueRunner implements the
    # minimal expected interface but does nothing.
    tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                         data_reader.DummyQueueRunner())

  return features, features["targets"]