Example #1
File: tputil.py  Project: shawwn/ml-notes
# Imports assumed for this snippet (it relies on TF 1.x-era APIs such as
# `Dimension.value` and `make_one_shot_iterator`):
import tensorflow.compat.v1 as tf
from tensorflow.python.data.experimental.ops import random_ops


def random_seeds(seed=None):
    if seed is not None:
        seed = tf.convert_to_tensor(seed, dtype=tf.int64)
        # A caller-supplied length-2 vector is already a valid seed pair.
        if len(seed.shape) == 1 and seed.shape[0].value == 2:
            return seed
    # Otherwise draw a pair of int64 seeds from a RandomDataset, seeded by
    # `seed` when given and nondeterministically otherwise.
    result = random_ops.RandomDataset(seed).batch(
        2).make_one_shot_iterator().get_next()
    result.set_shape([2])
    return result
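A minimal usage sketch (the session setup here is an assumption, not part of the original project):

```python
# Hypothetical usage, assuming a TF 1.x runtime (on TF 2.x you would also
# need tf.disable_v2_behavior() for the Dimension/iterator APIs above).
pair = random_seeds([1, 2])   # a length-2 pair is returned unchanged
drawn = random_seeds(42)      # scalar seed: a pair is drawn from RandomDataset(42)

with tf.Session() as sess:
    print(sess.run(pair))     # -> [1 2]
    print(sess.run(drawn))    # two int64 values, fixed by seed 42
```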
Example #2
  # Excerpt from TensorFlow's tf.data kernel tests: `self` is a
  # DatasetTestBase subclass (providing getDatasetOutput/graphRoundTrip), and
  # (global_seed, local_seed) are supplied by parameterized test combinations.
  def testDeterminism(self, global_seed, local_seed):
    expect_determinism = (global_seed is not None) or (local_seed is not None)

    random_seed.set_random_seed(global_seed)
    ds = random_ops.RandomDataset(seed=local_seed).take(10)

    output_1 = self.getDatasetOutput(ds)
    ds = self.graphRoundTrip(ds)
    output_2 = self.getDatasetOutput(ds)

    if expect_determinism:
      self.assertEqual(output_1, output_2)
    else:
      # Technically not guaranteed since the two randomly-chosen int64 seeds
      # could match, but that is sufficiently unlikely (1/2^128 with perfect
      # random number generation).
      self.assertNotEqual(output_1, output_2)
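The property under test can be seen directly with the public `tf.data.experimental.RandomDataset` API; a minimal sketch, assuming TF 2.x eager mode:

```python
import tensorflow as tf

# With an explicit local seed, two independently built pipelines produce
# identical sequences, even though no global seed is set.
ds1 = tf.data.experimental.RandomDataset(seed=42).take(5)
ds2 = tf.data.experimental.RandomDataset(seed=42).take(5)
assert list(ds1.as_numpy_iterator()) == list(ds2.as_numpy_iterator())

# With neither a global nor a local seed, each pipeline draws its own pair
# of int64 seeds, so matching outputs are astronomically unlikely (the
# 1/2^128 figure from the comment above).
```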
Example #3
# Excerpt from TensorFlow's tensorflow/python/data/experimental/ops/interleave_ops.py;
# the module-level imports it relies on are:
from tensorflow.python.data.experimental.ops import random_ops
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_stateless_random_ops
from tensorflow.python.ops import math_ops
# (`_DirectedInterleaveDataset` is defined in the same module.)


def sample_from_datasets_v2(datasets, weights=None, seed=None):
    """Samples elements at random from the datasets in `datasets`.

  Args:
    datasets: A list of `tf.data.Dataset` objects with compatible structure.
    weights: (Optional.) A list of `len(datasets)` floating-point values where
      `weights[i]` represents the probability with which an element should be
      sampled from `datasets[i]`, or a `tf.data.Dataset` object where each
      element is such a list. Defaults to a uniform distribution across
      `datasets`.
    seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
      random seed that will be used to create the distribution. See
      `tf.random.set_seed` for behavior.

  Returns:
    A dataset that interleaves elements from `datasets` at random, according to
    `weights` if provided, otherwise with uniform probability.

  Raises:
    TypeError: If the `datasets` or `weights` arguments have the wrong type.
    ValueError: If the `weights` argument is specified and does not match the
      length of the `datasets` element.
  """
    num_datasets = len(datasets)
    if not isinstance(weights, dataset_ops.DatasetV2):
        if weights is None:
            # Select inputs with uniform probability.
            logits = [[1.0] * num_datasets]
        else:
            # Use the given `weights` as the probability of choosing the respective
            # input.
            weights = ops.convert_to_tensor(weights, name="weights")
            if weights.dtype not in (dtypes.float32, dtypes.float64):
                raise TypeError("`weights` must be convertible to a tensor of "
                                "`tf.float32` or `tf.float64` elements.")
            if not weights.shape.is_compatible_with([num_datasets]):
                raise ValueError(
                    "`weights` must be a vector of length `len(datasets)`.")

            # The `stateless_multinomial()` op expects log-probabilities, as opposed
            # to weights.
            logits = array_ops.expand_dims(
                math_ops.log(weights, name="logits"), 0)

        # NOTE(mrry): We only specialize when `weights` is not a `Dataset`. When it
        # is a `Dataset`, it is possible that evaluating it has a side effect the
        # user depends on.
        if len(datasets) == 1:
            return datasets[0]

        def select_dataset_constant_logits(seed):
            return array_ops.squeeze(
                gen_stateless_random_ops.stateless_multinomial(logits,
                                                               1,
                                                               seed=seed),
                axis=[0, 1])

        selector_input = dataset_ops.MapDataset(
            random_ops.RandomDataset(seed).batch(2),
            select_dataset_constant_logits,
            use_inter_op_parallelism=False)

    else:
        # Use each element of the given `weights` dataset as the probability of
        # choosing the respective input.

        # The `stateless_multinomial()` op expects log-probabilities, as opposed to
        # weights.
        logits_ds = weights.map(lambda *p: math_ops.log(p, name="logits"))

        def select_dataset_varying_logits(logits, seed):
            return array_ops.squeeze(
                gen_stateless_random_ops.stateless_multinomial(logits,
                                                               1,
                                                               seed=seed),
                axis=[0, 1])

        logits_and_seeds = dataset_ops.Dataset.zip(
            (logits_ds, random_ops.RandomDataset(seed).batch(2)))
        selector_input = dataset_ops.MapDataset(logits_and_seeds,
                                                select_dataset_varying_logits,
                                                use_inter_op_parallelism=False)

    return _DirectedInterleaveDataset(selector_input, datasets)
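This internal helper backs the public `tf.data.experimental.sample_from_datasets` API; a minimal usage sketch through that wrapper (TF 2.x assumed):

```python
import tensorflow as tf

zeros = tf.data.Dataset.from_tensors(0).repeat()
ones = tf.data.Dataset.from_tensors(1).repeat()

# Roughly 90% zeros and 10% ones; a fixed `seed` makes the mix reproducible.
mixed = tf.data.experimental.sample_from_datasets(
    [zeros, ones], weights=[0.9, 0.1], seed=1)
print(list(mixed.take(20).as_numpy_iterator()))

# `weights` may also be a Dataset of per-step weight vectors (each element
# of length len(datasets)), e.g. to anneal the mix over time; that case is
# handled by the `else` branch above.
```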
Example #4
# Also from interleave_ops.py, in a later TF release; it relies on the same
# module-level imports as Example #3.
def sample_from_datasets_v2(datasets,
                            weights=None,
                            seed=None,
                            stop_on_empty_dataset=False):
  """Samples elements at random from the datasets in `datasets`.

  Creates a dataset by interleaving elements of `datasets` with `weights[i]`
  probability of picking an element from dataset `i`. Sampling is done without
  replacement. For example, suppose we have 2 datasets:

  ```python
  dataset1 = tf.data.Dataset.range(0, 3)
  dataset2 = tf.data.Dataset.range(100, 103)
  ```

  Suppose also that we sample from these 2 datasets with the following weights:

  ```python
  sample_dataset = tf.data.experimental.sample_from_datasets(
      [dataset1, dataset2], weights=[0.5, 0.5])
  ```

  One possible outcome of elements in `sample_dataset` is:

  ```
  print(list(sample_dataset.as_numpy_iterator()))
  # [100, 0, 1, 101, 2, 102]
  ```

  Args:
    datasets: A non-empty list of `tf.data.Dataset` objects with compatible
      structure.
    weights: (Optional.) A list or Tensor of `len(datasets)` floating-point
      values where `weights[i]` represents the probability to sample from
      `datasets[i]`, or a `tf.data.Dataset` object where each element is such a
      list. Defaults to a uniform distribution across `datasets`.
    seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the random
      seed that will be used to create the distribution. See
      `tf.random.set_seed` for behavior.
    stop_on_empty_dataset: If `True`, sampling stops if it encounters an empty
      dataset. If `False`, it skips empty datasets. It is recommended to set it
      to `True`. Otherwise, the distribution of samples starts off as the user
      intends, but may change as input datasets become empty. This can be
      difficult to detect since the dataset starts off looking correct. Defaults
      to `False` for backward compatibility.

  Returns:
    A dataset that interleaves elements from `datasets` at random, according to
    `weights` if provided, otherwise with uniform probability.

  Raises:
    TypeError: If the `datasets` or `weights` arguments have the wrong type.
    ValueError:
      - If `datasets` is empty, or
      - If `weights` is specified and does not match the length of `datasets`.
  """
  def _shapes_are_compatible(datasets, weights):
    if isinstance(weights, ops.Tensor):
      return weights.shape.is_compatible_with([len(datasets)])
    return len(datasets) == len(weights)

  def _skip_datasets_with_zero_weight(datasets, weights):
    datasets_and_weights = [(dataset, weight)
                            for (dataset, weight) in zip(datasets, weights)
                            if weight > 0]
    return (zip(*datasets_and_weights) if datasets_and_weights else
            ([datasets[0].take(0)], [1.]))

  if not datasets:
    raise ValueError("`datasets` must be a non-empty list of datasets.")

  if not isinstance(weights, dataset_ops.DatasetV2):
    if weights is None:
      # Select inputs with uniform probability.
      logits = [[1.0] * len(datasets)]

    else:
      if not _shapes_are_compatible(datasets, weights):
        raise ValueError("`weights` must have the same length as `datasets`.")

      # Use the given `weights` as the probability of choosing the respective
      # input.
      if not isinstance(weights, ops.Tensor):
        datasets, weights = _skip_datasets_with_zero_weight(datasets, weights)
      weights = ops.convert_to_tensor(weights, name="weights")
      if weights.dtype not in (dtypes.float32, dtypes.float64):
        raise TypeError("`weights` must be convertible to a tensor of "
                        "`tf.float32` or `tf.float64` elements.")

      # The `stateless_multinomial()` op expects log-probabilities, as opposed
      # to weights.
      logits = array_ops.expand_dims(math_ops.log(weights, name="logits"), 0)

    # NOTE(mrry): We only specialize when `weights` is not a `Dataset`. When it
    # is a `Dataset`, it is possible that evaluating it has a side effect the
    # user depends on.
    if len(datasets) == 1:
      return datasets[0]

    def select_dataset_constant_logits(seed):
      return array_ops.squeeze(
          gen_stateless_random_ops.stateless_multinomial(logits, 1, seed=seed),
          axis=[0, 1])

    selector_input = dataset_ops.MapDataset(
        random_ops.RandomDataset(seed).batch(2),
        select_dataset_constant_logits,
        use_inter_op_parallelism=False)

  else:
    # Use each element of the given `weights` dataset as the probability of
    # choosing the respective input.
    #
    # The `stateless_multinomial()` op expects log-probabilities, as opposed to
    # weights.
    logits_ds = weights.map(lambda *p: math_ops.log(p, name="logits"))

    def select_dataset_varying_logits(logits, seed):
      return array_ops.squeeze(
          gen_stateless_random_ops.stateless_multinomial(logits, 1, seed=seed),
          axis=[0, 1])

    logits_and_seeds = dataset_ops.Dataset.zip(
        (logits_ds, random_ops.RandomDataset(seed).batch(2)))
    selector_input = dataset_ops.MapDataset(
        logits_and_seeds,
        select_dataset_varying_logits,
        use_inter_op_parallelism=False)

  return _DirectedInterleaveDataset(selector_input, datasets,
                                    stop_on_empty_dataset)
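A sketch of what `stop_on_empty_dataset` changes, again via the public wrapper (assuming a TF release recent enough to expose the flag):

```python
import tensorflow as tf

short = tf.data.Dataset.range(0, 2)       # only two elements
long = tf.data.Dataset.range(100, 110)

# With stop_on_empty_dataset=True, sampling ends the moment `short` is
# exhausted, so the intended 50/50 mix never silently degrades into a
# stream drawn purely from `long`.
mixed = tf.data.experimental.sample_from_datasets(
    [short, long], weights=[0.5, 0.5], seed=7, stop_on_empty_dataset=True)
print(list(mixed.as_numpy_iterator()))
```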