def _input_fn_impl(self,
                   mode,
                   batch_size,
                   prefetch_buffer_size,
                   num_parallel_process_calls,
                   metadata,
                   features_file,
                   labels_file=None,
                   num_buckets=None,
                   sample_buffer_size=None,
                   maximum_features_length=None,
                   maximum_labels_length=None):
  """See ``input_fn``."""
  self._initialize(metadata)

  feat_dataset, feat_process_fn, feat_padded_shapes_fn = self._get_features_builder(features_file)

  if labels_file is None:
    dataset = feat_dataset
    # Parallel inputs must be caught in a single tuple and not considered as multiple arguments.
    process_fn = lambda *arg: feat_process_fn(item_or_tuple(arg))
    padded_shapes_fn = feat_padded_shapes_fn
  else:
    labels_dataset, labels_process_fn, labels_padded_shapes_fn = (
        self._get_labels_builder(labels_file))
    dataset = tf.data.Dataset.zip((feat_dataset, labels_dataset))
    process_fn = lambda features, labels: (
        feat_process_fn(features), labels_process_fn(labels))
    padded_shapes_fn = lambda: (
        feat_padded_shapes_fn(), labels_padded_shapes_fn())

  if mode == tf.estimator.ModeKeys.TRAIN:
    dataset = dataset.shuffle(sample_buffer_size, seed=int(time.time()))

  dataset = dataset.map(
      process_fn,
      num_parallel_calls=num_parallel_process_calls).prefetch(prefetch_buffer_size)
  padded_shapes = padded_shapes_fn()

  if mode == tf.estimator.ModeKeys.TRAIN:
    dataset = dataset.filter(lambda features, labels: self._filter_example(
        features,
        labels,
        maximum_features_length=maximum_features_length,
        maximum_labels_length=maximum_labels_length))

  num_buckets = num_buckets or 1
  if mode == tf.estimator.ModeKeys.TRAIN and num_buckets > 1:
    # Bucketize the training dataset to improve efficiency.
    def _key_func(features, labels):
      length = None

      if length is None:
        length = self._get_features_length(features)
        maximum_length = maximum_features_length
        # For multiple inputs, apply bucketing on the target side or not at all.
        if isinstance(length, list):
          length = None
          maximum_length = None
      if length is None:
        length = self._get_labels_length(labels)
        maximum_length = maximum_labels_length

      if length is None:
        return tf.constant(0, dtype=tf.int64)

      if maximum_length is not None:
        bucket_width = (maximum_length + num_buckets - 1) // num_buckets
      else:
        bucket_width = 10

      bucket_id = length // bucket_width
      bucket_id = tf.minimum(bucket_id, num_buckets)
      return tf.to_int64(bucket_id)

    def _reduce_func(unused_key, dataset):
      return dataset.padded_batch(
          batch_size,
          padded_shapes=padded_shapes)

    dataset = dataset.apply(tf.contrib.data.group_by_window(
        _key_func, _reduce_func, window_size=batch_size))
  else:
    dataset = dataset.padded_batch(
        batch_size,
        padded_shapes=padded_shapes)

  if mode == tf.estimator.ModeKeys.TRAIN:
    dataset = dataset.repeat()

  iterator = dataset.make_initializable_iterator()

  # Add the initializer to a standard collection for it to be initialized.
  tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, iterator.initializer)

  return iterator.get_next()
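
# Illustrative sketch (not part of the pipeline above): the bucket-id arithmetic
# used by _key_func, reproduced with plain Python integers so it can be checked
# in isolation. The values below are hypothetical; the real function operates on
# tensors inside group_by_window.
def example_bucket_id(length, maximum_length=None, num_buckets=8):
  # Mirror the key function: derive a bucket width from the maximum length when
  # it is known, otherwise fall back to a fixed width of 10.
  if maximum_length is not None:
    bucket_width = (maximum_length + num_buckets - 1) // num_buckets
  else:
    bucket_width = 10
  # Lengths beyond num_buckets * bucket_width all share the last bucket.
  return min(length // bucket_width, num_buckets)

assert example_bucket_id(7, maximum_length=64, num_buckets=8) == 0   # width 8
assert example_bucket_id(63, maximum_length=64, num_buckets=8) == 7
assert example_bucket_id(500, maximum_length=64, num_buckets=8) == 8  # clamped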
def _input_fn_impl(self,
                   mode,
                   batch_size,
                   metadata,
                   features_file,
                   labels_file=None,
                   batch_type="examples",
                   batch_multiplier=1,
                   bucket_width=None,
                   single_pass=False,
                   num_threads=None,
                   sample_buffer_size=None,
                   maximum_features_length=None,
                   maximum_labels_length=None):
  """See ``input_fn``."""
  self._initialize(metadata)

  feat_dataset, feat_process_fn, feat_padded_shapes_fn = self._get_features_builder(features_file)

  if labels_file is None:
    dataset = feat_dataset
    # Parallel inputs must be caught in a single tuple and not considered as multiple arguments.
    process_fn = lambda *arg: feat_process_fn(item_or_tuple(arg))
    padded_shapes_fn = feat_padded_shapes_fn
  else:
    labels_dataset, labels_process_fn, labels_padded_shapes_fn = (
        self._get_labels_builder(labels_file))
    dataset = tf.data.Dataset.zip((feat_dataset, labels_dataset))
    process_fn = lambda features, labels: (
        feat_process_fn(features), labels_process_fn(labels))
    padded_shapes_fn = lambda: (
        feat_padded_shapes_fn(), labels_padded_shapes_fn())

  if mode == tf.estimator.ModeKeys.TRAIN:
    dataset = dataset.shuffle(sample_buffer_size)
    dataset = dataset.map(
        process_fn,
        num_parallel_calls=num_threads or 4)
    dataset = dataset.apply(data.filter_examples_by_length(
        maximum_features_length=maximum_features_length,
        maximum_labels_length=maximum_labels_length,
        features_length_fn=self._get_features_length,
        labels_length_fn=self._get_labels_length))
    dataset = dataset.apply(data.batch_train_dataset(
        batch_size,
        batch_type=batch_type,
        batch_multiplier=batch_multiplier,
        bucket_width=bucket_width,
        padded_shapes=padded_shapes_fn(),
        features_length_fn=self._get_features_length,
        labels_length_fn=self._get_labels_length))
    dataset = dataset.apply(data.filter_irregular_batches(batch_multiplier))
    if not single_pass:
      dataset = dataset.repeat()
  else:
    dataset = dataset.map(
        process_fn,
        num_parallel_calls=num_threads or 1)
    dataset = dataset.padded_batch(
        batch_size,
        padded_shapes=padded_shapes_fn())

  dataset = dataset.prefetch(1)

  iterator = dataset.make_initializable_iterator()

  # Add the initializer to a standard collection for it to be initialized.
  tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, iterator.initializer)

  return iterator.get_next()
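
# Intuition-only sketch (plain Python) of the two batch_type values accepted
# above. The real batching is performed by data.batch_train_dataset on tf.data
# datasets; this only illustrates why token-based batches hold fewer examples
# as sequences get longer. The numbers are hypothetical.
def examples_per_batch(batch_size, batch_type, sequence_length):
  if batch_type == "examples":
    return batch_size  # a fixed number of examples per batch
  if batch_type == "tokens":
    return batch_size // sequence_length  # capacity shrinks with longer sequences
  raise ValueError("invalid batch_type: %s" % batch_type)

assert examples_per_batch(64, "examples", 30) == 64
assert examples_per_batch(4096, "tokens", 32) == 128
assert examples_per_batch(4096, "tokens", 64) == 64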
def _input_fn_impl(self,
                   mode,
                   batch_size,
                   metadata,
                   features_file,
                   labels_file=None,
                   batch_type="examples",
                   batch_multiplier=1,
                   bucket_width=None,
                   single_pass=False,
                   num_threads=None,
                   sample_buffer_size=None,
                   prefetch_buffer_size=None,
                   maximum_features_length=None,
                   maximum_labels_length=None):
  """See ``input_fn``."""
  self._initialize(metadata)

  feat_dataset, feat_process_fn = self._get_features_builder(features_file)

  if labels_file is None:
    dataset = feat_dataset
    # Parallel inputs must be caught in a single tuple and not considered as multiple arguments.
    process_fn = lambda *arg: feat_process_fn(item_or_tuple(arg))
  else:
    labels_dataset, labels_process_fn = self._get_labels_builder(labels_file)
    dataset = tf.data.Dataset.zip((feat_dataset, labels_dataset))
    process_fn = lambda features, labels: (
        feat_process_fn(features), labels_process_fn(labels))

  dataset, process_fn = self._augment_parallel_dataset(dataset, process_fn, mode=mode)

  if mode == tf.estimator.ModeKeys.TRAIN:
    dataset = data.training_pipeline(
        dataset,
        batch_size,
        batch_type=batch_type,
        batch_multiplier=batch_multiplier,
        bucket_width=bucket_width,
        single_pass=single_pass,
        process_fn=process_fn,
        num_threads=num_threads,
        shuffle_buffer_size=sample_buffer_size,
        prefetch_buffer_size=prefetch_buffer_size,
        dataset_size=self._get_dataset_size(features_file),
        maximum_features_length=maximum_features_length,
        maximum_labels_length=maximum_labels_length,
        features_length_fn=self._get_features_length,
        labels_length_fn=self._get_labels_length)
  else:
    dataset = data.inference_pipeline(
        dataset,
        batch_size,
        process_fn=process_fn,
        num_threads=num_threads,
        prefetch_buffer_size=prefetch_buffer_size)

  iterator = dataset.make_initializable_iterator()

  # Add the initializer to a standard collection for it to be initialized.
  tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, iterator.initializer)

  return iterator.get_next()
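
# Hedged usage sketch (TF 1.x graph mode): how a method like _input_fn_impl is
# typically exposed as an estimator input_fn closure. The wrapper name and the
# "model" placeholder are hypothetical; the real public entry point is the
# ``input_fn`` method referenced by the docstring.
def make_input_fn(model, mode, batch_size, metadata, features_file, labels_file=None):
  def _input_fn():
    # Returns the (features, labels) structure produced by iterator.get_next().
    return model._input_fn_impl(
        mode, batch_size, metadata, features_file, labels_file=labels_file)
  return _input_fn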
def make_training_dataset(
    self,
    features_file,
    labels_file,
    batch_size,
    batch_type="examples",
    batch_multiplier=1,
    batch_size_multiple=1,
    shuffle_buffer_size=None,
    length_bucket_width=None,
    maximum_features_length=None,
    maximum_labels_length=None,
    single_pass=False,
    num_shards=1,
    shard_index=0,
    num_threads=4,
    prefetch_buffer_size=None,
    cardinality_multiple=1,
    weights=None,
    batch_autotune_mode=False,
):
    """Builds a dataset to be used for training.

    It supports the full training pipeline, including:

    * sharding
    * shuffling
    * filtering
    * bucketing
    * prefetching

    Args:
      features_file: The source file or a list of training source files.
      labels_file: The target file or a list of training target files.
      batch_size: The batch size to use.
      batch_type: The training batching strategy to use: can be "examples" or
        "tokens".
      batch_multiplier: The batch size multiplier to prepare splitting across
        replicated graph parts.
      batch_size_multiple: When :obj:`batch_type` is "tokens", ensure that the
        resulting batch size is a multiple of this value.
      shuffle_buffer_size: The number of elements from which to sample.
      length_bucket_width: The width of the length buckets to select batch
        candidates from (for efficiency). Set ``None`` to not constrain batch
        formation.
      maximum_features_length: The maximum length or list of maximum lengths of
        the features sequence(s). ``None`` to not constrain the length.
      maximum_labels_length: The maximum length of the labels sequence.
        ``None`` to not constrain the length.
      single_pass: If ``True``, makes a single pass over the training data.
      num_shards: The number of data shards (usually the number of workers in a
        distributed setting).
      shard_index: The shard index this data pipeline should read from.
      num_threads: The number of elements processed in parallel.
      prefetch_buffer_size: The number of batches to prefetch asynchronously.
        If ``None``, use an automatically tuned value.
      cardinality_multiple: Ensure that the dataset cardinality is a multiple
        of this value when :obj:`single_pass` is ``True``.
      weights: An optional list of weights to create a weighted dataset out of
        multiple training files.
      batch_autotune_mode: When enabled, all batches are padded to the maximum
        sequence length.

    Returns:
      A ``tf.data.Dataset``.

    See Also:
      :func:`opennmt.data.training_pipeline`
    """
    if labels_file is not None:
        data_files = [features_file, labels_file]
        maximum_length = [maximum_features_length, maximum_labels_length]
        features_length_fn = self.features_inputter.get_length
        labels_length_fn = self.labels_inputter.get_length
    else:
        data_files = features_file
        maximum_length = maximum_features_length
        features_length_fn = self.get_length
        labels_length_fn = None

    dataset = self.make_dataset(data_files, training=True)

    map_fn = lambda *arg: self.make_features(
        element=misc.item_or_tuple(arg), training=True)
    filter_fn = lambda *arg: (self.keep_for_training(
        misc.item_or_tuple(arg), maximum_length=maximum_length))
    transform_fns = [
        lambda dataset: dataset.map(map_fn, num_parallel_calls=num_threads or 4),
        lambda dataset: dataset.filter(filter_fn),
    ]

    if batch_autotune_mode:
        # In this mode we want to return batches where all sequences are padded
        # to the maximum possible length in order to maximize the memory usage.
        # Shuffling, sharding, prefetching, etc. are not applied since
        # correctness and performance are not important.

        if isinstance(dataset, list):
            # Ignore weighted dataset.
            dataset = dataset[0]

        # We repeat the dataset now to ensure full batches are always returned.
        dataset = dataset.repeat()
        for transform_fn in transform_fns:
            dataset = dataset.apply(transform_fn)

        # length_fn returns the maximum length instead of the actual example
        # length so that batches are built as if each example has the maximum
        # length.
        if labels_file is not None:
            constant_length_fn = [
                lambda x: maximum_features_length,
                lambda x: maximum_labels_length,
            ]
        else:
            constant_length_fn = lambda x: maximum_features_length

        # The length dimension is set to the maximum length in the padded shapes.
        padded_shapes = self.get_padded_shapes(
            dataset.element_spec, maximum_length=maximum_length)
        dataset = dataset.apply(
            dataset_util.batch_sequence_dataset(
                batch_size,
                batch_type=batch_type,
                batch_multiplier=batch_multiplier,
                length_bucket_width=1,
                length_fn=constant_length_fn,
                padded_shapes=padded_shapes,
            ))
        return dataset

    if weights is not None:
        dataset = (dataset, weights)
    dataset = dataset_util.training_pipeline(
        batch_size,
        batch_type=batch_type,
        batch_multiplier=batch_multiplier,
        batch_size_multiple=batch_size_multiple,
        transform_fns=transform_fns,
        length_bucket_width=length_bucket_width,
        features_length_fn=features_length_fn,
        labels_length_fn=labels_length_fn,
        single_pass=single_pass,
        num_shards=num_shards,
        shard_index=shard_index,
        num_threads=num_threads,
        dataset_size=self.get_dataset_size(data_files),
        shuffle_buffer_size=shuffle_buffer_size,
        prefetch_buffer_size=prefetch_buffer_size,
        cardinality_multiple=cardinality_multiple,
    )(dataset)
    return dataset
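
# Hedged usage sketch for make_training_dataset. "inputter" stands for an
# already-initialized OpenNMT-tf inputter exposing this method; the file names
# and parameter values below are illustrative assumptions, not recommendations.
def build_train_dataset(inputter):
    return inputter.make_training_dataset(
        "train.src",
        "train.tgt",
        batch_size=4096,
        batch_type="tokens",
        shuffle_buffer_size=500000,
        length_bucket_width=1,
        maximum_features_length=100,
        maximum_labels_length=100)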
def _input_fn_impl(self,
                   mode,
                   batch_size,
                   metadata,
                   features_file,
                   labels_file=None,
                   batch_type="examples",
                   batch_multiplier=1,
                   bucket_width=None,
                   single_pass=False,
                   num_threads=None,
                   sample_buffer_size=None,
                   prefetch_buffer_size=None,
                   maximum_features_length=None,
                   maximum_labels_length=None):
  """See ``input_fn``."""
  self._initialize(metadata)

  feat_dataset, feat_process_fn = self._get_features_builder(features_file)

  if labels_file is None:
    dataset = feat_dataset
    # Parallel inputs must be caught in a single tuple and not considered as multiple arguments.
    process_fn = lambda *arg: feat_process_fn(item_or_tuple(arg))
  else:
    labels_dataset, labels_process_fn = self._get_labels_builder(labels_file)
    dataset = tf.data.Dataset.zip((feat_dataset, labels_dataset))
    process_fn = lambda features, labels: (
        feat_process_fn(features), labels_process_fn(labels))

  if mode == tf.estimator.ModeKeys.TRAIN:
    dataset_size = self._get_dataset_size(features_file)
    if sample_buffer_size < dataset_size:
      # When the sample buffer size is smaller than the dataset size, shard
      # the dataset in a random order. This ensures that all parts of the
      # dataset can be seen when the evaluation frequency is high.
      dataset = dataset.apply(data.random_shard(sample_buffer_size, dataset_size))
    dataset = dataset.shuffle(sample_buffer_size)
    dataset = dataset.map(
        process_fn,
        num_parallel_calls=num_threads or 4)
    dataset = dataset.apply(data.filter_examples_by_length(
        maximum_features_length=maximum_features_length,
        maximum_labels_length=maximum_labels_length,
        features_length_fn=self._get_features_length,
        labels_length_fn=self._get_labels_length))
    dataset = dataset.apply(data.batch_parallel_dataset(
        batch_size,
        batch_type=batch_type,
        batch_multiplier=batch_multiplier,
        bucket_width=bucket_width,
        features_length_fn=self._get_features_length,
        labels_length_fn=self._get_labels_length))
    dataset = dataset.apply(data.filter_irregular_batches(batch_multiplier))
    if not single_pass:
      dataset = dataset.repeat()
  else:
    dataset = dataset.map(
        process_fn,
        num_parallel_calls=num_threads or 1)
    dataset = dataset.apply(data.batch_parallel_dataset(batch_size))

  if prefetch_buffer_size:
    dataset = dataset.prefetch(prefetch_buffer_size)

  return dataset
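
# Plain-Python sketch of the condition guarding data.random_shard above: with a
# shuffle buffer smaller than the dataset, only a sliding window of examples is
# ever a candidate for sampling, so the pipeline shards the data in a random
# order first. The numbers are hypothetical, and the chunk count only
# illustrates the buffer/dataset ratio, not random_shard internals.
dataset_size = 1000000
sample_buffer_size = 500000
needs_random_shard = sample_buffer_size < dataset_size
buffer_sized_chunks = -(-dataset_size // sample_buffer_size)  # ceiling division
assert needs_random_shard
assert buffer_sized_chunks == 2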
def make_training_dataset(self,
                          features_file,
                          labels_file,
                          batch_size,
                          batch_type="examples",
                          batch_multiplier=1,
                          batch_size_multiple=1,
                          shuffle_buffer_size=None,
                          length_bucket_width=None,
                          maximum_features_length=None,
                          maximum_labels_length=None,
                          single_pass=False,
                          num_shards=1,
                          shard_index=0,
                          num_threads=4,
                          prefetch_buffer_size=None,
                          cardinality_multiple=1,
                          weights=None):
  """Builds a dataset to be used for training.

  It supports the full training pipeline, including:

  * sharding
  * shuffling
  * filtering
  * bucketing
  * prefetching

  Args:
    features_file: The source file or a list of training source files.
    labels_file: The target file or a list of training target files.
    batch_size: The batch size to use.
    batch_type: The training batching strategy to use: can be "examples" or
      "tokens".
    batch_multiplier: The batch size multiplier to prepare splitting across
      replicated graph parts.
    batch_size_multiple: When :obj:`batch_type` is "tokens", ensure that the
      resulting batch size is a multiple of this value.
    shuffle_buffer_size: The number of elements from which to sample.
    length_bucket_width: The width of the length buckets to select batch
      candidates from (for efficiency). Set ``None`` to not constrain batch
      formation.
    maximum_features_length: The maximum length or list of maximum lengths of
      the features sequence(s). ``None`` to not constrain the length.
    maximum_labels_length: The maximum length of the labels sequence.
      ``None`` to not constrain the length.
    single_pass: If ``True``, makes a single pass over the training data.
    num_shards: The number of data shards (usually the number of workers in a
      distributed setting).
    shard_index: The shard index this data pipeline should read from.
    num_threads: The number of elements processed in parallel.
    prefetch_buffer_size: The number of batches to prefetch asynchronously.
      If ``None``, use an automatically tuned value.
    cardinality_multiple: Ensure that the dataset cardinality is a multiple of
      this value when :obj:`single_pass` is ``True``.
    weights: An optional list of weights to create a weighted dataset out of
      multiple training files.

  Returns:
    A ``tf.data.Dataset``.

  See Also:
    :func:`opennmt.data.training_pipeline`
  """
  if labels_file is not None:
    data_files = [features_file, labels_file]
    maximum_length = [maximum_features_length, maximum_labels_length]
    features_length_fn = self.features_inputter.get_length
    labels_length_fn = self.labels_inputter.get_length
  else:
    data_files = features_file
    maximum_length = maximum_features_length
    features_length_fn = self.get_length
    labels_length_fn = None

  map_fn = lambda *arg: self.make_features(
      element=misc.item_or_tuple(arg), training=True)
  filter_fn = lambda *arg: (self.keep_for_training(
      misc.item_or_tuple(arg), maximum_length=maximum_length))
  transform_fns = [
      lambda dataset: dataset.map(map_fn, num_parallel_calls=num_threads or 4),
      lambda dataset: dataset.filter(filter_fn)
  ]

  dataset = self.make_dataset(data_files, training=True)
  if weights is not None:
    dataset = (dataset, weights)
  dataset = dataset_util.training_pipeline(
      batch_size,
      batch_type=batch_type,
      batch_multiplier=batch_multiplier,
      batch_size_multiple=batch_size_multiple,
      transform_fns=transform_fns,
      length_bucket_width=length_bucket_width,
      features_length_fn=features_length_fn,
      labels_length_fn=labels_length_fn,
      single_pass=single_pass,
      num_shards=num_shards,
      shard_index=shard_index,
      num_threads=num_threads,
      dataset_size=self.get_dataset_size(data_files),
      shuffle_buffer_size=shuffle_buffer_size,
      prefetch_buffer_size=prefetch_buffer_size,
      cardinality_multiple=cardinality_multiple)(dataset)
  return dataset
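
# Hedged sketch of the weighted multi-file case documented above: several
# source/target pairs combined into one weighted training dataset. File names,
# weights, and the "inputter" placeholder are assumptions for illustration.
def build_weighted_train_dataset(inputter):
  return inputter.make_training_dataset(
      ["domainA.src", "domainB.src"],
      ["domainA.tgt", "domainB.tgt"],
      batch_size=3072,
      batch_type="tokens",
      shuffle_buffer_size=200000,
      weights=[0.7, 0.3])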
def _map_fn(*arg):
  features = self.make_features(element=misc.item_or_tuple(arg), training=False)
  if isinstance(features, (list, tuple)):
    # Special case for unsupervised inputters that always return a tuple (features, labels).
    return features[0]
  return features
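
# Minimal stand-in (an assumption, not the library source) for the behavior the
# surrounding code expects from misc.item_or_tuple: a single parallel input is
# unwrapped, multiple parallel inputs stay grouped in one tuple, which is why
# the map lambdas above pack *arg back into a single argument.
def item_or_tuple_sketch(x):
  x = tuple(x)
  return x[0] if len(x) == 1 else x

assert item_or_tuple_sketch(["only_input"]) == "only_input"
assert item_or_tuple_sketch(["input_1", "input_2"]) == ("input_1", "input_2")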