Example #1
    def testSampleFromDatasets(self):
        cluster = data_service_test_base.TestCluster(num_workers=3)
        random_seed.set_random_seed(1619)
        num_samples = 5000
        rand_probs = np.random.random_sample((5, ))
        rand_probs = rand_probs / rand_probs.sum()

        # Use chi-squared test to assert that the observed distribution matches the
        # expected distribution. Based on the implementation in
        # "third_party/tensorflow/python/kernel_tests/multinomial_op_test.py".
        for weights in [[.85, .05, .1], rand_probs, [1.]]:
            classes = len(weights)

            # Create a dataset that samples each integer in `[0, num_datasets)`
            # with probability given by `weights[i]`.
            ds = interleave_ops.sample_from_datasets([
                dataset_ops.Dataset.from_tensors(i).repeat()
                for i in range(classes)
            ], weights)
            ds = self.make_distributed_dataset(
                ds, cluster, processing_mode="distributed_epoch")
            ds = ds.take(num_samples)

            freqs = np.zeros([classes])
            for v in self.getDatasetOutput(ds):
                freqs[v] += 1

            expected = np.asarray(weights)
            actual = np.asarray(freqs / num_samples)
            diff = actual - expected
            chi2 = np.sum(diff * diff / expected, axis=0)
            self.assertLess(chi2, 1e-2)
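
Example #1 relies on the test-only helpers `data_service_test_base.TestCluster` and `make_distributed_dataset`. As a rough public-API equivalent, here is a minimal sketch assuming TF 2.3+ and a placeholder dispatcher address (`grpc://localhost:5000` is not a real service):

import tensorflow as tf

ds = tf.data.Dataset.range(10)
# "distributed_epoch" splits a single epoch of `ds` across the workers of a
# tf.data service cluster; the address below is a placeholder.
ds = ds.apply(
    tf.data.experimental.service.distribute(
        processing_mode="distributed_epoch",
        service="grpc://localhost:5000"))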
Example #2
    def testSampleFromDatasets(self, weights_as_dataset):
        random_seed.set_random_seed(1619)
        num_samples = 5000
        rand_probs = self._normalize(np.random.random_sample((5, )))

        # Use chi-squared test to assert that the observed distribution matches the
        # expected distribution. Based on the implementation in
        # "third_party/tensorflow/python/kernel_tests/multinomial_op_test.py".
        for probs in [[.85, .05, .1], rand_probs, [1.]]:
            weights = np.asarray(probs)
            if weights_as_dataset:
                weights = dataset_ops.Dataset.from_tensors(weights).repeat()
            classes = len(probs)

            # Create a dataset that samples each integer in `[0, num_datasets)`
            # with probability given by `weights[i]`.
            dataset = interleave_ops.sample_from_datasets([
                dataset_ops.Dataset.from_tensors(i).repeat()
                for i in range(classes)
            ], weights)
            dataset = dataset.take(num_samples)

            next_element = self.getNext(dataset)
            freqs = np.zeros([classes])
            for _ in range(num_samples):
                freqs[self.evaluate(next_element())] += 1
            with self.assertRaises(errors.OutOfRangeError):
                self.evaluate(next_element())

            self.assertLess(self._chi2(probs, freqs / num_samples), 1e-2)
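
This variant calls two helpers, `self._normalize` and `self._chi2`, that are defined elsewhere in the test class. A plausible reconstruction, matching the inline arithmetic in Example #1 (these are assumptions, not the class's actual definitions):

import numpy as np

def _normalize(vec):
    # Scale a nonnegative vector so its entries sum to 1.
    return vec / vec.sum()

def _chi2(expected, actual):
    # Chi-squared-style distance between two discrete distributions.
    expected, actual = np.asarray(expected), np.asarray(actual)
    diff = actual - expected
    return np.sum(diff * diff / expected, axis=0)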
Example #3
 def testSampleFromDatasetsNested(self):
   ds1 = dataset_ops.Dataset.range(10).window(2)
   ds2 = dataset_ops.Dataset.range(10, 20).window(2)
   ds = interleave_ops.sample_from_datasets([ds1, ds2], weights=[0.3, 0.7])
   ds = ds.flat_map(lambda x: x)
   next_element = self.getNext(ds)
   self.evaluate(next_element())
Example #4
 def _build_dataset(self, probs, num_samples):
     dataset = interleave_ops.sample_from_datasets([
         dataset_ops.Dataset.from_tensors(i).repeat(None)
         for i in range(len(probs))
     ],
                                                   probs,
                                                   seed=1813)
     return dataset.take(num_samples)
Example #5
 def testSampleFromEmptyDataset(self, weights_type):
     weights = _get_weights_of_type(np.asarray([1., 0.]), weights_type)
     datasets = [
         dataset_ops.Dataset.range(0),
         dataset_ops.Dataset.range(1).repeat()
     ]
     sample_dataset = interleave_ops.sample_from_datasets(
         datasets, weights=weights, stop_on_empty_dataset=True)
     self.assertDatasetProduces(sample_dataset, [])
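
`_get_weights_of_type` is a parameterized-test helper that is not shown in these snippets. A hypothetical reconstruction of what it presumably does, using the same module aliases as the tests (`ops`, `dataset_ops`):

from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.framework import ops

def _get_weights_of_type(weights, weights_type):
    # Hypothetical: return the raw weights as a Python list, a constant
    # tensor, or an infinite dataset that yields the vector on every step.
    if weights_type == "list":
        return weights.tolist()
    if weights_type == "tensor":
        return ops.convert_to_tensor(weights, name="weights")
    return dataset_ops.Dataset.from_tensors(weights).repeat()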
Example #6
 def _build_dataset(self, probs, num_samples):
   dataset = interleave_ops.sample_from_datasets(
       [
           dataset_ops.Dataset.from_tensors(i).repeat(None)
           for i in range(len(probs))
       ],
       probs,
       seed=1813)
   return dataset.take(num_samples)
Example #7
  def testErrors(self):
    with self.assertRaisesRegex(ValueError, r"must have the same length"):
      interleave_ops.sample_from_datasets(
          [dataset_ops.Dataset.range(10),
           dataset_ops.Dataset.range(20)],
          weights=[0.25, 0.25, 0.25, 0.25])

    with self.assertRaisesRegex(TypeError, "`tf.float32` or `tf.float64`"):
      interleave_ops.sample_from_datasets(
          [dataset_ops.Dataset.range(10),
           dataset_ops.Dataset.range(20)],
          weights=[1, 1])

    with self.assertRaisesRegex(TypeError, "must have the same type"):
      interleave_ops.sample_from_datasets([
          dataset_ops.Dataset.from_tensors(0),
          dataset_ops.Dataset.from_tensors(0.0)
      ])

    with self.assertRaisesRegex(
        ValueError, r"`datasets` must be a non-empty list of datasets."):
      interleave_ops.sample_from_datasets(datasets=[], weights=[])

    with self.assertRaisesRegex(TypeError, "tf.int64"):
      interleave_ops.choose_from_datasets([
          dataset_ops.Dataset.from_tensors(0),
          dataset_ops.Dataset.from_tensors(1)
      ], choice_dataset=dataset_ops.Dataset.from_tensors(1.0))

    with self.assertRaisesRegex(TypeError, "scalar"):
      interleave_ops.choose_from_datasets([
          dataset_ops.Dataset.from_tensors(0),
          dataset_ops.Dataset.from_tensors(1)
      ], choice_dataset=dataset_ops.Dataset.from_tensors([1.0]))

    with self.assertRaisesRegex(errors.InvalidArgumentError, "out of range"):
      dataset = interleave_ops.choose_from_datasets(
          [dataset_ops.Dataset.from_tensors(0)],
          choice_dataset=dataset_ops.Dataset.from_tensors(
              constant_op.constant(1, dtype=dtypes.int64)))
      next_element = self.getNext(dataset)
      self.evaluate(next_element())

    with self.assertRaisesRegex(
        ValueError, r"`datasets` must be a non-empty list of datasets."):
      interleave_ops.choose_from_datasets(
          datasets=[], choice_dataset=dataset_ops.Dataset.from_tensors(1.0))

    with self.assertRaisesRegex(
        TypeError, r"`choice_dataset` must be a dataset of scalar"):
      interleave_ops.choose_from_datasets([
          dataset_ops.Dataset.from_tensors(0),
          dataset_ops.Dataset.from_tensors(1)
      ], choice_dataset=None)
Example #8
    def _apply_fn(dataset):
        """Function from `Dataset` to `Dataset` that applies the transformation."""
        target_dist_t = ops.convert_to_tensor(target_dist, name="target_dist")
        target_dist_t = math_ops.cast(target_dist_t, dtypes.float32)

        # Get initial distribution.
        if initial_dist is not None:
            initial_dist_t = ops.convert_to_tensor(initial_dist,
                                                   name="initial_dist")
            initial_dist_t = math_ops.cast(initial_dist_t, dtypes.float32)

            acceptance_dist, prob_of_original = (
                _calculate_acceptance_probs_with_mixing(
                    initial_dist_t, target_dist_t))
            initial_dist_ds = dataset_ops.Dataset.from_tensors(
                initial_dist_t).repeat()
            acceptance_dist_ds = dataset_ops.Dataset.from_tensors(
                acceptance_dist).repeat()
            prob_of_original_ds = dataset_ops.Dataset.from_tensors(
                prob_of_original).repeat()
        else:
            initial_dist_ds = _estimate_initial_dist_ds(
                target_dist_t, dataset.map(class_func))
            acceptance_and_original_prob_ds = initial_dist_ds.map(
                lambda initial: _calculate_acceptance_probs_with_mixing(  # pylint: disable=g-long-lambda
                    initial, target_dist_t))
            acceptance_dist_ds = acceptance_and_original_prob_ds.map(
                lambda accept_prob, _: accept_prob)
            prob_of_original_ds = acceptance_and_original_prob_ds.map(
                lambda _, prob_original: prob_original)
        filtered_ds = _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds,
                                 class_func, seed)
        # Prefetch filtered dataset for speed.
        filtered_ds = filtered_ds.prefetch(3)

        prob_original_static = _get_prob_original_static(
            initial_dist_t,
            target_dist_t) if initial_dist is not None else None

        def add_class_value(*x):
            if len(x) == 1:
                return class_func(*x), x[0]
            else:
                return class_func(*x), x

        if prob_original_static == 1:
            return dataset.map(add_class_value)
        elif prob_original_static == 0:
            return filtered_ds
        else:
            return interleave_ops.sample_from_datasets(
                [dataset.map(add_class_value), filtered_ds],
                weights=prob_of_original_ds.map(
                    lambda prob: [(prob, 1.0 - prob)]),
                seed=seed,
                stop_on_empty_dataset=True)
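
The heart of the transformation above is `_calculate_acceptance_probs_with_mixing`, which decides how often to keep an element of a given class. A NumPy sketch of the underlying idea (accept class i with probability proportional to target[i] / initial[i], scaled so the scarcest class is always kept); this is an illustration, not the actual TensorFlow helper:

import numpy as np

initial = np.array([0.7, 0.2, 0.1])       # observed class distribution
target = np.array([1 / 3, 1 / 3, 1 / 3])  # desired class distribution

ratio = target / initial      # over/under-representation of each class
accept = ratio / ratio.max()  # keep the scarcest class with probability 1
print(accept)                 # ~[0.14, 0.5, 1.0]: most of class 0 is dropped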
Example #9
 def testSampleFromDatasetsWithZeroWeight(self, weights_type):
     # Sampling stops when the second dataset is exhausted.
     weights = _get_weights_of_type(np.asarray([0., 1.]), weights_type)
     datasets = [
         dataset_ops.Dataset.from_tensors(-1).repeat(2),
         dataset_ops.Dataset.from_tensors(1).repeat(2)
     ]
     sample_dataset = interleave_ops.sample_from_datasets(
         datasets, weights=weights, stop_on_empty_dataset=True)
     self.assertDatasetProduces(sample_dataset, [1, 1])
Example #10
 def testSampleFromDatasetsSkippingDatasetsWithZeroWeight(self):
     # Sampling skips the first dataset.
     weights = np.asarray([0., 1.])
     datasets = [
         dataset_ops.Dataset.from_tensors(-1).repeat(),
         dataset_ops.Dataset.from_tensors(1)
     ]
     sample_dataset = interleave_ops.sample_from_datasets(
         datasets, weights=weights, stop_on_empty_dataset=False)
     self.assertDatasetProduces(sample_dataset, [1])
Example #11
 def testSampleFromDatasetsAllWeightsAreZero(self):
     # Sampling skips both datasets.
     weights = np.asarray([0., 0.])
     datasets = [
         dataset_ops.Dataset.from_tensors(-1).repeat(),
         dataset_ops.Dataset.from_tensors(1).repeat()
     ]
     sample_dataset = interleave_ops.sample_from_datasets(
         datasets, weights=weights, stop_on_empty_dataset=False)
     self.assertDatasetProduces(sample_dataset, [])
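
Taken together, Examples #9 through #11 pin down the `stop_on_empty_dataset` semantics. A minimal sketch via the public API, assuming `tf.data.experimental.sample_from_datasets` with the `stop_on_empty_dataset` flag (TF 2.4+):

import tensorflow as tf

finite = tf.data.Dataset.from_tensors(1).repeat(2)
infinite = tf.data.Dataset.from_tensors(-1).repeat()

# True: iteration ends as soon as a selected dataset runs out of elements.
strict = tf.data.experimental.sample_from_datasets(
    [infinite, finite], weights=[0.0, 1.0], stop_on_empty_dataset=True)
print(list(strict.as_numpy_iterator()))  # [1, 1]

# False: exhausted datasets are skipped and sampling continues on the rest.
lenient = tf.data.experimental.sample_from_datasets(
    [finite, tf.data.Dataset.from_tensors(-1).repeat(3)],
    weights=[0.5, 0.5], stop_on_empty_dataset=False, seed=7)
print(sorted(lenient.as_numpy_iterator()))  # [-1, -1, -1, 1, 1]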
Example #12
    def testSampleFromEmptyDataset(self, weights_as_dataset):
        weights = np.asarray([1., 0.])
        if weights_as_dataset:
            weights = dataset_ops.Dataset.from_tensors(weights).repeat()

        datasets = [
            dataset_ops.Dataset.from_tensors(-1).skip(5),
            dataset_ops.Dataset.from_tensors(1).repeat()
        ]
        sample_dataset = interleave_ops.sample_from_datasets(
            datasets, weights=weights, stop_on_empty_dataset=True)
        self.assertDatasetProduces(sample_dataset, [])
Example #13
  def testErrors(self):
    with self.assertRaisesRegexp(ValueError,
                                 r"vector of length `len\(datasets\)`"):
      interleave_ops.sample_from_datasets(
          [dataset_ops.Dataset.range(10),
           dataset_ops.Dataset.range(20)],
          weights=[0.25, 0.25, 0.25, 0.25])

    with self.assertRaisesRegexp(TypeError, "`tf.float32` or `tf.float64`"):
      interleave_ops.sample_from_datasets(
          [dataset_ops.Dataset.range(10),
           dataset_ops.Dataset.range(20)],
          weights=[1, 1])

    with self.assertRaisesRegexp(TypeError, "must have the same type"):
      interleave_ops.sample_from_datasets([
          dataset_ops.Dataset.from_tensors(0),
          dataset_ops.Dataset.from_tensors(0.0)
      ])

    with self.assertRaisesRegexp(TypeError, "tf.int64"):
      interleave_ops.choose_from_datasets([
          dataset_ops.Dataset.from_tensors(0),
          dataset_ops.Dataset.from_tensors(1)
      ], choice_dataset=dataset_ops.Dataset.from_tensors(1.0))

    with self.assertRaisesRegexp(TypeError, "scalar"):
      interleave_ops.choose_from_datasets([
          dataset_ops.Dataset.from_tensors(0),
          dataset_ops.Dataset.from_tensors(1)
      ], choice_dataset=dataset_ops.Dataset.from_tensors([1.0]))
Example #14
  def testErrors(self):
    with self.assertRaisesRegexp(ValueError,
                                 r"vector of length `len\(datasets\)`"):
      interleave_ops.sample_from_datasets(
          [dataset_ops.Dataset.range(10),
           dataset_ops.Dataset.range(20)],
          weights=[0.25, 0.25, 0.25, 0.25])

    with self.assertRaisesRegexp(TypeError, "`tf.float32` or `tf.float64`"):
      interleave_ops.sample_from_datasets(
          [dataset_ops.Dataset.range(10),
           dataset_ops.Dataset.range(20)],
          weights=[1, 1])

    with self.assertRaisesRegexp(TypeError, "must have the same type"):
      interleave_ops.sample_from_datasets([
          dataset_ops.Dataset.from_tensors(0),
          dataset_ops.Dataset.from_tensors(0.0)
      ])

    with self.assertRaisesRegexp(TypeError, "tf.int64"):
      interleave_ops.choose_from_datasets([
          dataset_ops.Dataset.from_tensors(0),
          dataset_ops.Dataset.from_tensors(1)
      ], choice_dataset=dataset_ops.Dataset.from_tensors(1.0))

    with self.assertRaisesRegexp(TypeError, "scalar"):
      interleave_ops.choose_from_datasets([
          dataset_ops.Dataset.from_tensors(0),
          dataset_ops.Dataset.from_tensors(1)
      ], choice_dataset=dataset_ops.Dataset.from_tensors([1.0]))
Example #15
    def testSampleFromDatasetsStoppingOnEmptyDataset(self, weights_type):
        # Sampling stops when the first dataset is exhausted.
        weights = _get_weights_of_type(np.asarray([.5, .1, .4]), weights_type)
        datasets = [
            dataset_ops.Dataset.from_tensors(np.int64(-1)),
            dataset_ops.Dataset.from_tensors(np.int64(1)).repeat(),
            dataset_ops.Dataset.range(10).repeat()
        ]
        sample_dataset = interleave_ops.sample_from_datasets(
            datasets, weights=weights, stop_on_empty_dataset=True)

        samples_list = self.getIteratorOutput(self.getNext(sample_dataset))
        self.assertEqual(samples_list.count(-1), 1)
Example #16
    def testSampleFromDatasetsSkippingEmptyDataset(self, weights_type):
        # Sampling skips the first dataset after it becomes empty.
        weights = _get_weights_of_type(np.asarray([.5, .1, .4]), weights_type)
        datasets = [
            dataset_ops.Dataset.from_tensors(np.int64(-1)),
            dataset_ops.Dataset.from_tensors(np.int64(1)).repeat(),
            dataset_ops.Dataset.range(10).repeat()
        ]
        sample_dataset = interleave_ops.sample_from_datasets(
            datasets, weights=weights, stop_on_empty_dataset=False).take(100)

        samples_list = self.getIteratorOutput(self.getNext(sample_dataset))
        self.assertLen(samples_list, 100)
        self.assertEqual(samples_list.count(-1), 1)
Example #17
  def testSampleFromDatasetsWithZeroWeight(self, weights_as_dataset):
    weights = np.asarray([0., 1.])
    if weights_as_dataset:
      weights = dataset_ops.Dataset.from_tensors(weights).repeat()

    # Sampling stops when the second dataset is exhausted.
    datasets = [
        dataset_ops.Dataset.from_tensors(-1).repeat(2),
        dataset_ops.Dataset.from_tensors(1).repeat(2)
    ]
    sample_dataset = interleave_ops.sample_from_datasets(
        datasets, weights=weights, stop_on_empty_dataset=True)

    samples_list = self.getIteratorOutput(self.getNext(sample_dataset))
    self.assertEqual(samples_list, [1, 1])
Example #18
  def _testSampleFromDatasetsHelper(self, weights, num_datasets, num_samples):
    # Create a dataset that samples each integer in `[0, num_datasets)`
    # with probability given by `weights[i]`.
    dataset = interleave_ops.sample_from_datasets([
        dataset_ops.Dataset.from_tensors(i).repeat(None)
        for i in range(num_datasets)
    ], weights)
    dataset = dataset.take(num_samples)

    next_element = self.getNext(dataset)
    freqs = np.zeros([num_datasets])
    for _ in range(num_samples):
      freqs[self.evaluate(next_element())] += 1
    with self.assertRaises(errors.OutOfRangeError):
      self.evaluate(next_element())

    return freqs
Example #19
    def _testSampleFromDatasetsHelper(self, weights, num_datasets,
                                      num_samples):
        # Create a dataset that samples each integer in `[0, num_datasets)`
        # with probability given by `weights[i]`.
        dataset = interleave_ops.sample_from_datasets([
            dataset_ops.Dataset.from_tensors(i).repeat(None)
            for i in range(num_datasets)
        ], weights)
        dataset = dataset.take(num_samples)

        next_element = self.getNext(dataset)
        freqs = np.zeros([num_datasets])
        for _ in range(num_samples):
            freqs[self.evaluate(next_element())] += 1
        with self.assertRaises(errors.OutOfRangeError):
            self.evaluate(next_element())

        return freqs
Example #20
  def _apply_fn(dataset):
    """Function from `Dataset` to `Dataset` that applies the transformation."""
    target_dist_t = ops.convert_to_tensor(target_dist, name="target_dist")
    class_values_ds = dataset.map(class_func)

    # Get initial distribution.
    if initial_dist is not None:
      initial_dist_t = ops.convert_to_tensor(initial_dist, name="initial_dist")
      acceptance_dist, prob_of_original = (
          _calculate_acceptance_probs_with_mixing(initial_dist_t,
                                                  target_dist_t))
      initial_dist_ds = dataset_ops.Dataset.from_tensors(
          initial_dist_t).repeat()
      acceptance_dist_ds = dataset_ops.Dataset.from_tensors(
          acceptance_dist).repeat()
      prob_of_original_ds = dataset_ops.Dataset.from_tensors(
          prob_of_original).repeat()
    else:
      initial_dist_ds = _estimate_initial_dist_ds(
          target_dist_t, class_values_ds)
      acceptance_and_original_prob_ds = initial_dist_ds.map(
          lambda initial: _calculate_acceptance_probs_with_mixing(  # pylint: disable=g-long-lambda
              initial, target_dist_t))
      acceptance_dist_ds = acceptance_and_original_prob_ds.map(
          lambda accept_prob, _: accept_prob)
      prob_of_original_ds = acceptance_and_original_prob_ds.map(
          lambda _, prob_original: prob_original)
    filtered_ds = _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds,
                             class_values_ds, seed)
    # Prefetch filtered dataset for speed.
    filtered_ds = filtered_ds.prefetch(3)

    prob_original_static = _get_prob_original_static(
        initial_dist_t, target_dist_t) if initial_dist is not None else None
    if prob_original_static == 1:
      return dataset_ops.Dataset.zip((class_values_ds, dataset))
    elif prob_original_static == 0:
      return filtered_ds
    else:
      return interleave_ops.sample_from_datasets(
          [dataset_ops.Dataset.zip((class_values_ds, dataset)), filtered_ds],
          weights=prob_of_original_ds.map(lambda prob: [(prob, 1.0 - prob)]),
          seed=seed)
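
This older `_apply_fn` is the body of the `rejection_resample` transformation. A hedged usage sketch through the public `tf.data.experimental.rejection_resample` API, rebalancing a skewed binary dataset (the element values double as class labels here):

import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])
resampler = tf.data.experimental.rejection_resample(
    class_func=lambda x: x,  # the element itself is the class label
    target_dist=[0.5, 0.5],
    seed=27)
# The transformation yields (class, element) pairs; drop the class again.
balanced = ds.apply(resampler).map(lambda cls, value: value)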
Example #21
  def _testSampleFromDatasetsHelper(self, weights, num_datasets, num_samples):
    # Create a dataset that samples each integer in `[0, num_datasets)`
    # with probability given by `weights[i]`.
    dataset = interleave_ops.sample_from_datasets([
        dataset_ops.Dataset.from_tensors(i).repeat(None)
        for i in range(num_datasets)
    ], weights)
    dataset = dataset.take(num_samples)
    iterator = dataset.make_one_shot_iterator()
    next_element = iterator.get_next()

    with self.cached_session() as sess:
      freqs = np.zeros([num_datasets])
      for _ in range(num_samples):
        freqs[sess.run(next_element)] += 1
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(next_element)

    return freqs
Example #22
  def _testSampleFromDatasetsHelper(self, weights, num_datasets, num_samples):
    # Create a dataset that samples each integer in `[0, num_datasets)`
    # with probability given by `weights[i]`.
    dataset = interleave_ops.sample_from_datasets([
        dataset_ops.Dataset.from_tensors(i).repeat(None)
        for i in range(num_datasets)
    ], weights)
    dataset = dataset.take(num_samples)
    iterator = dataset.make_one_shot_iterator()
    next_element = iterator.get_next()

    with self.cached_session() as sess:
      freqs = np.zeros([num_datasets])
      for _ in range(num_samples):
        freqs[sess.run(next_element)] += 1
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(next_element)

    return freqs
Example #23
  def testSampleFromDatasets(self):
    cluster = data_service_test_base.TestCluster(num_workers=3)
    num_samples = 200
    weights = [.6, .3, .1]
    classes = len(weights)

    # Create a dataset that samples each integer in `[0, num_datasets)`
    # with probability given by `weights[i]`.
    ds = interleave_ops.sample_from_datasets(
        [dataset_ops.Dataset.from_tensors(i).repeat() for i in range(classes)],
        weights)
    ds = self._make_dynamic_sharding_dataset(ds, cluster)
    ds = ds.take(num_samples)

    freqs = np.zeros([classes])
    for v in self.getDatasetOutput(ds):
      freqs[v] += 1

    self.assertGreater(freqs[0], freqs[1])
    self.assertGreater(freqs[1], freqs[2])
Example #24
def sample_from_datasets(datasets, weights=None, seed=None):
    """Samples elements at random from the datasets in `datasets`.

  Args:
    datasets: A list of `tf.data.Dataset` objects with compatible structure.
    weights: (Optional.) A list of `len(datasets)` floating-point values where
      `weights[i]` represents the probability with which an element should be
      sampled from `datasets[i]`, or a `tf.data.Dataset` object where each
      element is such a list. Defaults to a uniform distribution across
      `datasets`.
    seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
      random seed that will be used to create the distribution. See
      `tf.set_random_seed` for behavior.

  Returns:
    A dataset that interleaves elements from `datasets` at random, according to
    `weights` if provided, otherwise with uniform probability.

  Raises:
    TypeError: If the `datasets` or `weights` arguments have the wrong type.
    ValueError: If the `weights` argument is specified and does not match the
      length of the `datasets` element.
  """
  return interleave_ops.sample_from_datasets(datasets, weights, seed)
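
A minimal usage sketch for the wrapper above, drawing from two datasets with a 90/10 split (eager execution assumed):

import tensorflow as tf

evens = tf.data.Dataset.range(0, 100, 2)
odds = tf.data.Dataset.range(1, 100, 2)
mixed = tf.data.experimental.sample_from_datasets(
    [evens, odds], weights=[0.9, 0.1], seed=42)
for value in mixed.take(5):
    print(value.numpy())  # mostly even numbers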
Example #25
def sample_from_datasets(datasets, weights=None, seed=None):
  """Samples elements at random from the datasets in `datasets`.

  Args:
    datasets: A list of `tf.data.Dataset` objects with compatible structure.
    weights: (Optional.) A list of `len(datasets)` floating-point values where
      `weights[i]` represents the probability with which an element should be
      sampled from `datasets[i]`, or a `tf.data.Dataset` object where each
      element is such a list. Defaults to a uniform distribution across
      `datasets`.
    seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
      random seed that will be used to create the distribution. See
      `tf.compat.v1.set_random_seed` for behavior.

  Returns:
    A dataset that interleaves elements from `datasets` at random, according to
    `weights` if provided, otherwise with uniform probability.

  Raises:
    TypeError: If the `datasets` or `weights` arguments have the wrong type.
    ValueError: If the `weights` argument is specified and does not match the
      length of the `datasets` element.
  """
  return interleave_ops.sample_from_datasets(datasets, weights, seed)
Example #26
 def testSampleFromDatasetsCardinality(self):
     ds1 = dataset_ops.Dataset.from_tensors([1.0]).repeat()
     ds2 = dataset_ops.Dataset.from_tensors([2.0]).repeat()
     ds = interleave_ops.sample_from_datasets([ds1, ds2])
     self.assertEqual(self.evaluate(ds.cardinality()), dataset_ops.INFINITE)