Example #1
    def __init__(self,
                 x,
                 y=None,
                 sample_weights=None,
                 sample_weight_modes=None,
                 batch_size=None,
                 steps=None,
                 shuffle=False,
                 **kwargs):
        super(CompositeTensorDataAdapter, self).__init__(x, y, **kwargs)
        x = _process_numpy_inputs(x)
        y = _process_numpy_inputs(y)
        sample_weights = _process_numpy_inputs(sample_weights)
        sample_weight_modes = broadcast_sample_weight_modes(
            sample_weights, sample_weight_modes)

        # If sample_weights are not specified for an output, use 1.0 as weights.
        sample_weights, any_sample_weight, _ = (
            training_utils.handle_partial_sample_weights(
                y, sample_weights, sample_weight_modes, check_all_flat=True))

        if y is not None and any_sample_weight:
            inputs = (x, y, sample_weights)
        elif y is not None:
            # Sample weight is only needed for training, so if y is None, then
            # sample_weight is ignored.
            inputs = (x, y)
        else:
            inputs = (x, )

        dataset = dataset_ops.DatasetV2.from_tensor_slices(inputs)
        num_samples = int(nest.flatten(x)[0].shape[0])
        if shuffle:
            dataset = dataset.shuffle(num_samples)

        # If batch_size is not passed but steps is, calculate from the input data.
        if steps and not batch_size:
            batch_size = int(math.ceil(num_samples / steps))

        if not batch_size:
            raise ValueError(
                "`batch_size` or `steps` is required for `Tensor` or `NumPy`"
                " input data.")

        dataset = dataset.batch(batch_size)
        self._size = int(math.ceil(num_samples / batch_size))
        self._batch_size = batch_size
        self._has_partial_batch = (self._size != (num_samples // batch_size))

        self._partial_batch_size = None
        if self._has_partial_batch:
            self._partial_batch_size = (num_samples -
                                        (self._size - 1) * self._batch_size)

        self._dataset = dataset
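
The load-bearing call here is `handle_partial_sample_weights`: per the comment, any output whose sample weights were not specified gets a weight of 1.0 per sample. A minimal standalone sketch of that contract, using an illustrative helper name and NumPy stand-ins rather than the Keras internals:

import numpy as np

def fill_partial_sample_weights(outputs, sample_weights):
    # Illustrative reimplementation of the (sample_weights, any_sample_weight,
    # partial_sample_weight) contract: substitute a vector of ones for any
    # output whose weight is None.
    any_weight = any(w is not None for w in sample_weights)
    partial = any_weight and any(w is None for w in sample_weights)
    filled = [w if w is not None else np.ones(y.shape[0], dtype="float32")
              for y, w in zip(outputs, sample_weights)]
    return filled, any_weight, partial

y = [np.zeros((4, 1)), np.zeros((4, 1))]   # two model outputs
weights, any_w, partial = fill_partial_sample_weights(y, [np.arange(4.0), None])
print(any_w, partial, weights[1])          # True True [1. 1. 1. 1.]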
Example #2
    def wrapped_generator():
      """Remove Nones and lists before invoking Dataset.from_generator."""
      for batch in generator_fn():
        if wrap_in_tuple:
          batch = (batch,)

        if must_extract_lists:
          batch = nest._list_to_tuple(batch)  # pylint: disable=protected-access

        if must_prune_nones:
          batch = batch[:elements_to_keep]

        if partial_sample_weight:
          sample_weights, _, _ = training_utils.handle_partial_sample_weights(
              batch[1], batch[2], sample_weight_modes, check_all_flat=False)
          batch = batch[:2] + (sample_weights,)

        yield batch
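
Once the batches are cleaned, the wrapped generator is handed to `Dataset.from_generator` along with the dtypes and dynamic shapes inferred from the peeked batch (see Example #5). A self-contained sketch of that hand-off, with a toy generator standing in for the user's data:

import numpy as np
import tensorflow as tf

def generator_fn():
    # Toy stand-in for the user's generator: three (x, y) batches.
    for _ in range(3):
        yield (np.ones((2, 4), dtype="float32"),
               np.zeros((2, 1), dtype="float32"))

dataset = tf.data.Dataset.from_generator(
    generator_fn,
    output_types=(tf.float32, tf.float32),    # plays the role of nested_dtypes
    output_shapes=((None, 4), (None, 1)))     # plays the role of nested_shape

for x, y in dataset:
    print(x.shape, y.shape)                   # (2, 4) (2, 1)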
Example #3
    def __init__(self,
                 x,
                 y=None,
                 sample_weights=None,
                 sample_weight_modes=None,
                 batch_size=None,
                 steps=None,
                 shuffle=False,
                 **kwargs):
        super(CompositeTensorDataAdapter, self).__init__(x, y, **kwargs)
        x, y, sample_weights = _process_tensorlike((x, y, sample_weights))
        sample_weight_modes = broadcast_sample_weight_modes(
            sample_weights, sample_weight_modes)

        # If sample_weights are not specified for an output, use 1.0 as weights.
        sample_weights, _, _ = training_utils.handle_partial_sample_weights(
            y, sample_weights, sample_weight_modes, check_all_flat=True)

        inputs = pack_x_y_sample_weight(x, y, sample_weights)

        dataset = dataset_ops.DatasetV2.from_tensor_slices(inputs)
        num_samples = int(nest.flatten(x)[0].shape[0])
        if shuffle:
            dataset = dataset.shuffle(num_samples)

        # If batch_size is not passed but steps is, calculate from the input data.
        # Default to 32 for backwards compat.
        if not batch_size:
            batch_size = int(math.ceil(num_samples / steps)) if steps else 32

        dataset = dataset.batch(batch_size)
        self._size = int(math.ceil(num_samples / batch_size))
        self._batch_size = batch_size
        self._has_partial_batch = (self._size != (num_samples // batch_size))

        self._partial_batch_size = None
        if self._has_partial_batch:
            self._partial_batch_size = (num_samples -
                                        (self._size - 1) * self._batch_size)

        self._dataset = dataset
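
Compared with Example #1, the manual if/elif packing has been folded into `pack_x_y_sample_weight`. A sketch of the rule it implements (an illustrative reimplementation, not the Keras source): trailing `None` entries are dropped, so the dataset yields `(x,)`, `(x, y)`, or `(x, y, sample_weight)`.

def pack_x_y_sample_weight_sketch(x, y=None, sample_weight=None):
    # Drop trailing Nones so Dataset.from_tensor_slices only sees real tensors.
    if y is None:
        return (x,)
    if sample_weight is None:
        return (x, y)
    return (x, y, sample_weight)

assert pack_x_y_sample_weight_sketch("x") == ("x",)
assert pack_x_y_sample_weight_sketch("x", "y") == ("x", "y")
assert pack_x_y_sample_weight_sketch("x", "y", "w") == ("x", "y", "w")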
Example #4
  def __init__(self,
               x,
               y=None,
               sample_weights=None,
               sample_weight_modes=None,
               batch_size=None,
               epochs=1,
               steps=None,
               shuffle=False,
               **kwargs):
    super(TensorLikeDataAdapter, self).__init__(x, y, **kwargs)
    x, y, sample_weights = _process_tensorlike((x, y, sample_weights))
    sample_weight_modes = broadcast_sample_weight_modes(
        sample_weights, sample_weight_modes)

    # If sample_weights are not specified for an output, use 1.0 as weights.
    (sample_weights, _, _) = training_utils.handle_partial_sample_weights(
        y, sample_weights, sample_weight_modes, check_all_flat=True)

    inputs = pack_x_y_sample_weight(x, y, sample_weights)

    num_samples = set(int(i.shape[0]) for i in nest.flatten(inputs))
    if len(num_samples) > 1:
      msg = "Data cardinality is ambiguous:\n"
      for label, data in zip(["x", "y", "sample_weight"], inputs):
        msg += "  {} sizes: {}\n".format(
            label, ", ".join(str(i.shape[0]) for i in nest.flatten(data)))
      msg += "Please provide data which shares the same first dimension."
      raise ValueError(msg)
    num_samples = num_samples.pop()

    # If batch_size is not passed but steps is, calculate from the input data.
    # Default to 32 for backwards compat.
    if not batch_size:
      batch_size = int(math.ceil(num_samples / steps)) if steps else 32

    self._size = int(math.ceil(num_samples / batch_size))
    self._batch_size = batch_size

    num_full_batches = int(num_samples // batch_size)
    self._partial_batch_size = num_samples % batch_size

    if isinstance(shuffle, str):
      shuffle = shuffle.lower()

    self._shuffle = shuffle
    # Vectorized version of shuffle.
    # This is a performance improvement over using `from_tensor_slices`.
    # The indices of the data are shuffled and batched, and these indices
    # are then zipped with the data and used to extract a batch of the data
    # at each step. The performance improvements here come from:
    # 1. vectorized batch using gather
    # 2. parallelized map
    # 3. pipelined permutation generation
    # 4. optimized permutation batching
    # 5. disabled static optimizations

    indices_dataset = dataset_ops.DatasetV2.range(1)
    if shuffle != "batch":
      indices_dataset = indices_dataset.repeat(epochs)

    def permutation(_):
      # It turns out to be more performant to make a new set of indices rather
      # than reusing the same range Tensor, presumably because of buffer
      # forwarding.
      indices = math_ops.range(num_samples, dtype=dtypes.int64)
      if shuffle and shuffle != "batch":
        indices = random_ops.random_shuffle(indices)
      return indices

    # We prefetch a single element. Computing large permutations can take quite
    # a while so we don't want to wait for prefetching over an epoch boundary to
    # trigger the next permutation. On the other hand, too many simultaneous
    # shuffles can contend on a hardware level and degrade all performance.
    indices_dataset = indices_dataset.map(permutation).prefetch(1)

    def slice_batch_indices(indices):
      """Convert a Tensor of indices into a dataset of batched indices.

      This step can be accomplished in several ways. The most natural is to
      slice the Tensor in a Dataset map. (With a condition on the upper index to
      handle the partial batch.) However it turns out that coercing the Tensor
      into a shape which is divisible by the batch size (and handling the last
      partial batch separately) allows for a much more favorable memory access
      pattern and improved performance.

      Args:
        indices: Tensor which determines the data order for an entire epoch.

      Returns:
        A Dataset of batched indices.
      """
      num_in_full_batch = num_full_batches * batch_size
      first_k_indices = array_ops.slice(indices, [0], [num_in_full_batch])
      first_k_indices = array_ops.reshape(
          first_k_indices, [num_full_batches, batch_size])

      flat_dataset = dataset_ops.DatasetV2.from_tensor_slices(first_k_indices)
      if self._partial_batch_size:
        index_remainder = dataset_ops.DatasetV2.from_tensors(array_ops.slice(
            indices, [num_in_full_batch], [self._partial_batch_size]))
        flat_dataset = flat_dataset.concatenate(index_remainder)

      if shuffle == "batch":
        # 1024 is a magic constant that has not been properly evaluated
        flat_dataset = flat_dataset.shuffle(1024).repeat(epochs)
      return flat_dataset

    indices_dataset = indices_dataset.flat_map(slice_batch_indices)

    dataset = self.slice_inputs(indices_dataset, inputs)

    if shuffle == "batch":
      def shuffle_batch(*batch):
        return nest.map_structure(random_ops.random_shuffle, batch)
      dataset = dataset.map(shuffle_batch)

    self._dataset = dataset
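
The performance points above boil down to shuffling indices rather than data: a permutation is reshaped into full index batches (plus a partial remainder), and each index batch selects its rows in one vectorized step. `slice_inputs` is not shown in this excerpt, so the `tf.gather` wiring below is an assumption based on point 1 of the comment:

import tensorflow as tf

num_samples, batch_size = 10, 4                 # 2 full batches + remainder of 2
data = tf.range(num_samples, dtype=tf.float32)  # stand-in for the packed inputs

perm = tf.random.shuffle(tf.range(num_samples, dtype=tf.int64))
num_in_full_batch = (num_samples // batch_size) * batch_size

# Reshape the divisible prefix into [num_full_batches, batch_size] and append
# the partial batch, mirroring slice_batch_indices above.
full = tf.reshape(perm[:num_in_full_batch], [-1, batch_size])
indices_ds = tf.data.Dataset.from_tensor_slices(full).concatenate(
    tf.data.Dataset.from_tensors(perm[num_in_full_batch:]))

# One gather per index batch replaces per-example slicing.
dataset = indices_ds.map(lambda idx: tf.gather(data, idx))
for batch in dataset:
    print(batch.numpy())                        # two batches of 4, then one of 2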
Example #5
    def _canonicalize_peek(self, peek, sample_weight_modes):
        """Map the peeked batch into a regular form.

        This function serves two purposes. First, it determines if per-batch
        transformations are needed. Second, it extracts the structure to be used
        by Dataset.from_generator.

        Args:
          peek: The first batch of the user's data.
          sample_weight_modes: Optional structure indicating how to handle sample
            weights. If it is a string, it will be mapped to match the target
            structure.

        Returns:
          An updated peek and various inspection results.
        """
        wrap_in_tuple = False
        if not isinstance(peek, tuple):
            peek, wrap_in_tuple = (peek, ), True

        if len(peek) not in (1, 2, 3):
            raise ValueError(
                "Output of generator should be a tuple of 1 or 2 or 3 elements: "
                "(input,) or (input, target) or (input, target, sample_weights). "
                "Received {}".format(peek))

        x_peek, y_peek, sample_weights_peek = (
            list(peek) + [None] * (3 - len(peek)))

        any_sample_weight, partial_sample_weight = False, False
        sample_weight_modes = broadcast_sample_weight_modes(
            sample_weights_peek if sample_weights_peek is not None else y_peek,
            sample_weight_modes)

        if len(peek) == 3:
            (sample_weights_peek, any_sample_weight,
             partial_sample_weight) = training_utils.handle_partial_sample_weights(
                 y_peek, sample_weights_peek, sample_weight_modes,
                 check_all_flat=True)
            peek = (x_peek, y_peek, sample_weights_peek)

        # Users often return None for fields which are not used. For instance:
        # (x, y, None) to indicate no sample weights.
        if len(peek) >= 2 and y_peek is None:
            if any_sample_weight:
                raise ValueError(
                    "Found sample weights but no targets\n{}".format(peek))
            elements_to_keep = 1
        elif len(peek) == 3 and not any_sample_weight:
            elements_to_keep = 2
        else:
            elements_to_keep = len(peek)

        def dynamic_shape_like(t):
            return tuple(None for _ in t.shape)

        def convert_for_inspection(t):
            if getattr(t, "shape", None) and getattr(t, "dtype", None):
                return t
            return np.array(t, dtype=backend.floatx())

        canonicalized_peek = nest._list_to_tuple(  # pylint: disable=protected-access
            nest.map_structure(convert_for_inspection,
                               peek[:elements_to_keep]))
        nested_dtypes = nest.map_structure(lambda t: t.dtype,
                                           canonicalized_peek)
        nested_shape = nest.map_structure(dynamic_shape_like,
                                          canonicalized_peek)

        try:
            self._first_batch_size = int(
                nest.flatten(canonicalized_peek)[0].shape[0])
        except IndexError:
            raise IndexError(
                "Could not infer batch size from: {}".format(peek))

        return (peek, wrap_in_tuple, elements_to_keep, partial_sample_weight,
                sample_weight_modes, nested_shape, nested_dtypes)
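
A small trace (with assumed placeholder values) of the pruning decision above: when the user yields `(x, y, None)` and no real sample weight is found, `elements_to_keep` becomes 2, so the wrapped generator of Example #2 later yields only `(x, y)`.

peek = ("x_batch", "y_batch", None)   # user yielded (x, y, None)
any_sample_weight = False             # handle_partial_sample_weights found none

if len(peek) >= 2 and peek[1] is None:
    elements_to_keep = 1              # no targets: keep just x
elif len(peek) == 3 and not any_sample_weight:
    elements_to_keep = 2              # prune the all-None sample weights
else:
    elements_to_keep = len(peek)

assert elements_to_keep == 2          # (x, y, None) is trimmed to (x, y)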