def __init__(self, x, y=None, sample_weights=None, sample_weight_modes=None,
             batch_size=None, steps=None, shuffle=False, **kwargs):
    """Build a batched dataset from `x`, `y` and optional sample weights.

    Args:
        x: Input data; passed through `_process_numpy_inputs`.
        y: Optional target data; passed through `_process_numpy_inputs`.
        sample_weights: Optional sample weights; passed through
            `_process_numpy_inputs`.
        sample_weight_modes: Optional sample weight modes, broadcast against
            `sample_weights` by `broadcast_sample_weight_modes`.
        batch_size: Optional batch size. When falsy it is derived from
            `steps`.
        steps: Optional number of steps used to derive the batch size when
            `batch_size` is not given.
        shuffle: When truthy, the dataset is shuffled with a buffer of
            `num_samples` elements before batching.
        **kwargs: Forwarded to the parent constructor.

    Raises:
        ValueError: If no non-zero batch size is available from either
            `batch_size` or `steps`.
    """
    super(CompositeTensorDataAdapter, self).__init__(x, y, **kwargs)

    x, y, sample_weights = (
        _process_numpy_inputs(x),
        _process_numpy_inputs(y),
        _process_numpy_inputs(sample_weights),
    )
    sample_weight_modes = broadcast_sample_weight_modes(
        sample_weights, sample_weight_modes)

    # If sample_weights are not specified for an output use 1.0 as weights.
    partial_result = training_utils.handle_partial_sample_weights(
        y, sample_weights, sample_weight_modes, check_all_flat=True)
    (sample_weights, any_sample_weight, _) = partial_result

    # Sample weight is only needed for training, so if y is None, then
    # sample_weight is ignored.
    if y is None:
        inputs = (x,)
    elif any_sample_weight:
        inputs = (x, y, sample_weights)
    else:
        inputs = (x, y)

    dataset = dataset_ops.DatasetV2.from_tensor_slices(inputs)

    # The sample count is read from the leading dimension of the first
    # flattened component of `x`.
    first_component = nest.flatten(x)[0]
    num_samples = int(first_component.shape[0])
    if shuffle:
        dataset = dataset.shuffle(num_samples)

    # If batch_size is not passed but steps is, calculate from the input data.
    if steps and not batch_size:
        batch_size = int(math.ceil(num_samples / steps))

    if not batch_size:
        raise ValueError(
            "`batch_size` or `steps` is required for `Tensor` or `NumPy`"
            " input data.")

    dataset = dataset.batch(batch_size)

    num_batches = int(math.ceil(num_samples / batch_size))
    self._size = num_batches
    self._batch_size = batch_size

    # A trailing partial batch exists exactly when rounding up and rounding
    # down give different batch counts.
    self._has_partial_batch = (num_batches != (num_samples // batch_size))
    if self._has_partial_batch:
        self._partial_batch_size = (
            num_samples - (num_batches - 1) * batch_size)
    else:
        self._partial_batch_size = None

    self._dataset = dataset
def wrapped_generator():
    """Yield batches from `generator_fn` after per-batch normalisation.

    Removes Nones and lists before invoking Dataset.from_generator. Each of
    the steps below is applied only when its enclosing-scope flag is truthy,
    in this order: wrap in a 1-tuple, convert lists to tuples, truncate to
    `elements_to_keep`, fill in partial sample weights.
    """

    def normalize(element):
        # Applies the enabled transformations to a single batch.
        if wrap_in_tuple:
            element = (element,)

        if must_extract_lists:
            element = nest._list_to_tuple(element)  # pylint: disable=protected-access

        if must_prune_nones:
            element = element[:elements_to_keep]

        if partial_sample_weight:
            filled_weights, _, _ = (
                training_utils.handle_partial_sample_weights(
                    element[1], element[2], sample_weight_modes,
                    check_all_flat=False))
            # Swap the third entry for the completed sample weights.
            element = element[:2] + (filled_weights,)

        return element

    for raw_batch in generator_fn():
        yield normalize(raw_batch)
def __init__(self, x, y=None, sample_weights=None, sample_weight_modes=None,
             batch_size=None, steps=None, shuffle=False, **kwargs):
    """Build a batched dataset from `x`, `y` and optional sample weights.

    Args:
        x: Input data; processed by `_process_tensorlike`.
        y: Optional target data; processed by `_process_tensorlike`.
        sample_weights: Optional sample weights; processed by
            `_process_tensorlike`.
        sample_weight_modes: Optional sample weight modes, broadcast against
            `sample_weights` by `broadcast_sample_weight_modes`.
        batch_size: Optional batch size. When falsy it is derived from
            `steps`, or set to 32 if `steps` is falsy as well.
        steps: Optional number of steps used to derive the batch size.
        shuffle: When truthy, the dataset is shuffled with a buffer of
            `num_samples` elements before batching.
        **kwargs: Forwarded to the parent constructor.
    """
    super(CompositeTensorDataAdapter, self).__init__(x, y, **kwargs)

    processed = _process_tensorlike((x, y, sample_weights))
    x, y, sample_weights = processed
    sample_weight_modes = broadcast_sample_weight_modes(
        sample_weights, sample_weight_modes)

    # If sample_weights are not specified for an output use 1.0 as weights.
    partial_result = training_utils.handle_partial_sample_weights(
        y, sample_weights, sample_weight_modes, check_all_flat=True)
    (sample_weights, _, _) = partial_result

    inputs = pack_x_y_sample_weight(x, y, sample_weights)
    dataset = dataset_ops.DatasetV2.from_tensor_slices(inputs)

    # The sample count is read from the leading dimension of the first
    # flattened component of `x`.
    first_component = nest.flatten(x)[0]
    num_samples = int(first_component.shape[0])
    if shuffle:
        dataset = dataset.shuffle(num_samples)

    # If batch_size is not passed but steps is, calculate from the input data.
    # Default to 32 for backwards compat.
    if not batch_size:
        if steps:
            batch_size = int(math.ceil(num_samples / steps))
        else:
            batch_size = 32

    dataset = dataset.batch(batch_size)

    num_batches = int(math.ceil(num_samples / batch_size))
    self._size = num_batches
    self._batch_size = batch_size

    # A trailing partial batch exists exactly when rounding up and rounding
    # down give different batch counts.
    self._has_partial_batch = (num_batches != (num_samples // batch_size))
    if self._has_partial_batch:
        self._partial_batch_size = (
            num_samples - (num_batches - 1) * batch_size)
    else:
        self._partial_batch_size = None

    self._dataset = dataset
def __init__(self, x, y=None, sample_weights=None, sample_weight_modes=None,
             batch_size=None, epochs=1, steps=None, shuffle=False, **kwargs):
    """Build an index-driven batched dataset over tensor-like inputs.

    Args:
        x: Input data; processed by `_process_tensorlike`.
        y: Optional target data; processed by `_process_tensorlike`.
        sample_weights: Optional sample weights; processed by
            `_process_tensorlike`.
        sample_weight_modes: Optional sample weight modes, broadcast against
            `sample_weights` by `broadcast_sample_weight_modes`.
        batch_size: Optional batch size. When falsy it is derived from
            `steps`, or set to 32 if `steps` is falsy as well.
        epochs: Number of times the index stream is repeated.
        steps: Optional number of steps used to derive the batch size.
        shuffle: Falsy for no shuffling, truthy for a fresh permutation of
            all indices per epoch, or the string "batch" (compared after
            lower-casing) to shuffle the order of batches and then the
            elements inside each batch.
        **kwargs: Forwarded to the parent constructor.

    Raises:
        ValueError: If the flattened inputs do not all share the same first
            dimension.
    """
    super(TensorLikeDataAdapter, self).__init__(x, y, **kwargs)

    processed = _process_tensorlike((x, y, sample_weights))
    x, y, sample_weights = processed
    sample_weight_modes = broadcast_sample_weight_modes(
        sample_weights, sample_weight_modes)

    # If sample_weights are not specified for an output use 1.0 as weights.
    partial_result = training_utils.handle_partial_sample_weights(
        y, sample_weights, sample_weight_modes, check_all_flat=True)
    (sample_weights, _, _) = partial_result

    inputs = pack_x_y_sample_weight(x, y, sample_weights)

    # Every flattened component must agree on its leading dimension.
    cardinalities = set(int(t.shape[0]) for t in nest.flatten(inputs))
    if len(cardinalities) > 1:
        parts = ["Data cardinality is ambiguous:\n"]
        for label, data in zip(["x", "y", "sample_weight"], inputs):
            sizes = ", ".join(str(t.shape[0]) for t in nest.flatten(data))
            parts.append(" {} sizes: {}\n".format(label, sizes))
        parts.append(
            "Please provide data which shares the same first dimension.")
        raise ValueError("".join(parts))
    num_samples = cardinalities.pop()

    # If batch_size is not passed but steps is, calculate from the input data.
    # Default to 32 for backwards compat.
    if not batch_size:
        if steps:
            batch_size = int(math.ceil(num_samples / steps))
        else:
            batch_size = 32

    self._size = int(math.ceil(num_samples / batch_size))
    self._batch_size = batch_size

    num_full_batches = int(num_samples // batch_size)
    self._partial_batch_size = num_samples % batch_size

    if isinstance(shuffle, str):
        shuffle = shuffle.lower()
    self._shuffle = shuffle
    batch_level_shuffle = (shuffle == "batch")

    # Vectorized version of shuffle.
    # This is a performance improvement over using `from_tensor_slices`.
    # The indices of the data are shuffled and batched, and these indices
    # are then zipped with the data and used to extract a batch of the data
    # at each step. The performance improvements here come from:
    # 1. vectorized batch using gather
    # 2. parallelized map
    # 3. pipelined permutation generation
    # 4. optimized permutation batching
    # 5. disabled static optimizations

    indices_dataset = dataset_ops.DatasetV2.range(1)
    if not batch_level_shuffle:
        # In "batch" mode the repeat is applied to the batched index
        # dataset inside `slice_batch_indices` instead.
        indices_dataset = indices_dataset.repeat(epochs)

    def permutation(_):
        # It turns out to be more performant to make a new set of indices
        # rather than reusing the same range Tensor. (presumably because of
        # buffer forwarding.)
        epoch_indices = math_ops.range(num_samples, dtype=dtypes.int64)
        if shuffle and not batch_level_shuffle:
            epoch_indices = random_ops.random_shuffle(epoch_indices)
        return epoch_indices

    # We prefetch a single element. Computing large permutations can take
    # quite a while so we don't want to wait for prefetching over an epoch
    # boundary to trigger the next permutation. On the other hand, too many
    # simultaneous shuffles can contend on a hardware level and degrade all
    # performance.
    indices_dataset = indices_dataset.map(permutation).prefetch(1)

    def slice_batch_indices(indices):
        """Turn one epoch's index Tensor into a dataset of index batches.

        The most natural way to do this is to slice the Tensor in a Dataset
        map (with a condition on the upper index to handle the partial
        batch). However it turns out that coercing the Tensor into a shape
        which is divisible by the batch size, and handling the last partial
        batch separately, allows for a much more favorable memory access
        pattern and improved performance.

        Args:
            indices: Tensor which determines the data order for an entire
                epoch.

        Returns:
            A Dataset of batched indices.
        """
        num_in_full_batch = num_full_batches * batch_size

        full_part = array_ops.slice(indices, [0], [num_in_full_batch])
        full_part = array_ops.reshape(
            full_part, [num_full_batches, batch_size])
        batched_indices = dataset_ops.DatasetV2.from_tensor_slices(full_part)

        if self._partial_batch_size:
            tail = array_ops.slice(
                indices, [num_in_full_batch], [self._partial_batch_size])
            tail_dataset = dataset_ops.DatasetV2.from_tensors(tail)
            batched_indices = batched_indices.concatenate(tail_dataset)

        if batch_level_shuffle:
            # 1024 is a magic constant that has not been properly evaluated
            batched_indices = batched_indices.shuffle(1024).repeat(epochs)
        return batched_indices

    indices_dataset = indices_dataset.flat_map(slice_batch_indices)

    dataset = self.slice_inputs(indices_dataset, inputs)

    if batch_level_shuffle:
        def shuffle_batch(*batch):
            # Shuffles each component of the batch structure independently
            # via `random_shuffle`.
            return nest.map_structure(random_ops.random_shuffle, batch)

        dataset = dataset.map(shuffle_batch)

    self._dataset = dataset
def _canonicalize_peek(self, peek, sample_weight_modes):
    """Normalise the peeked batch and report how later batches must be fixed.

    The result tells the caller which per-batch transformations are needed
    and gives the nested shape/dtype structure to be used by
    Dataset.from_generator. As a side effect, `self._first_batch_size` is set
    from the leading dimension of the first flattened element.

    Args:
        peek: The first batch of the user's data.
        sample_weight_modes: Optional structure indicating how to handle
            sample weights. It is passed through
            `broadcast_sample_weight_modes` against the sample weights, or
            against the targets when no sample weights were given.

    Returns:
        A tuple `(peek, wrap_in_tuple, elements_to_keep,
        partial_sample_weight, sample_weight_modes, nested_shape,
        nested_dtypes)`.

    Raises:
        ValueError: If the batch does not have 1, 2 or 3 elements, or if
            sample weights are present while the targets are None.
        IndexError: If the batch size cannot be read from the first element.
    """
    wrap_in_tuple = not isinstance(peek, tuple)
    if wrap_in_tuple:
        peek = (peek,)

    if not 1 <= len(peek) <= 3:
        raise ValueError(
            "Output of generator should be a tuple of 1 or 2 or 3 elements: "
            "(input,) or (input, target) or (input, target, sample_weights). "
            "Received {}".format(peek))

    # Pad the missing trailing fields with None so all three names exist.
    padded = tuple(peek) + (None,) * (3 - len(peek))
    x_peek, y_peek, sample_weights_peek = padded

    any_sample_weight = False
    partial_sample_weight = False

    if sample_weights_peek is not None:
        mode_reference = sample_weights_peek
    else:
        mode_reference = y_peek
    sample_weight_modes = broadcast_sample_weight_modes(
        mode_reference, sample_weight_modes)

    if len(peek) == 3:
        partial_result = training_utils.handle_partial_sample_weights(
            y_peek, sample_weights_peek, sample_weight_modes,
            check_all_flat=True)
        (sample_weights_peek, any_sample_weight,
         partial_sample_weight) = partial_result
        peek = (x_peek, y_peek, sample_weights_peek)

    # Users often return None for fields which are not used. For instance:
    # (x, y, None) to indicate no sample weights.
    elements_to_keep = len(peek)
    if len(peek) >= 2 and y_peek is None:
        if any_sample_weight:
            raise ValueError(
                "Found sample weights but no targets\n{}".format(peek))
        elements_to_keep = 1
    elif len(peek) == 3 and not any_sample_weight:
        elements_to_keep = 2

    def dynamic_shape_like(t):
        # Same rank as `t`, with every dimension left as None.
        return tuple(None for _ in t.shape)

    def convert_for_inspection(t):
        # Values lacking a truthy `shape` or `dtype` attribute are converted
        # to a NumPy array of the backend float type.
        if not (getattr(t, "shape", None) and getattr(t, "dtype", None)):
            return np.array(t, dtype=backend.floatx())
        return t

    def dtype_of(t):
        return t.dtype

    kept_elements = peek[:elements_to_keep]
    canonicalized_peek = nest._list_to_tuple(  # pylint: disable=protected-access
        nest.map_structure(convert_for_inspection, kept_elements))
    nested_dtypes = nest.map_structure(dtype_of, canonicalized_peek)
    nested_shape = nest.map_structure(dynamic_shape_like, canonicalized_peek)

    try:
        self._first_batch_size = int(
            nest.flatten(canonicalized_peek)[0].shape[0])
    except IndexError:
        raise IndexError(
            "Could not infer batch size from: {}".format(peek))

    return (peek, wrap_in_tuple, elements_to_keep, partial_sample_weight,
            sample_weight_modes, nested_shape, nested_dtypes)