Example #1
  def _testDistribution(self, initial_known):
    classes = np.random.randint(5, size=(20000,))  # Uniformly sampled
    target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
    initial_dist = [0.2] * 5 if initial_known else None
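    # With initial_dist=None, rejection_resample estimates the class distribution of the input on the fly.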
    iterator = (dataset_ops.Dataset.from_tensor_slices(classes).shuffle(
        200, seed=21).map(lambda c: (c, string_ops.as_string(c))).apply(
            resampling.rejection_resample(
                target_dist=target_dist,
                initial_dist=initial_dist,
                class_func=lambda c, _: c,
                seed=27)).make_initializable_iterator())
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with self.test_session() as sess:
      sess.run(init_op)
      returned = []
      with self.assertRaises(errors.OutOfRangeError):
        while True:
          returned.append(sess.run(get_next))

    returned_classes, returned_classes_and_data = zip(*returned)
    _, returned_data = zip(*returned_classes_and_data)
    self.assertAllEqual([compat.as_bytes(str(c))
                         for c in returned_classes], returned_data)
    total_returned = len(returned_classes)
    # Subsampling rejects a large percentage of the initial data in
    # this case.
    self.assertGreater(total_returned, 20000 * 0.2)
    class_counts = np.array([
        len([True for v in returned_classes if v == c])
        for c in range(5)])
    returned_dist = class_counts / total_returned
    self.assertAllClose(target_dist, returned_dist, atol=1e-2)
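The snippets on this page come from contrib-era TensorFlow test files and omit that file's import block. A minimal sketch of the imports they rely on is below; exact module paths vary slightly across TensorFlow 1.x releases, so treat it as an assumption rather than the original header:

import numpy as np

from tensorflow.contrib.data.python.ops import resampling
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.framework import errors
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import string_ops
from tensorflow.python.util import compat
# Some examples additionally use dtypes, ops, random_ops, variables,
# device_setter, and six.moves.xrange.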
Example #2
  def _testDistribution(self, initial_known):
    classes = np.random.randint(5, size=(20000,))  # Uniformly sampled
    target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
    initial_dist = [0.2] * 5 if initial_known else None
    iterator = (dataset_ops.Dataset.from_tensor_slices(classes).shuffle(
        200, seed=21).map(lambda c: (c, string_ops.as_string(c))).apply(
            resampling.rejection_resample(
                target_dist=target_dist,
                initial_dist=initial_dist,
                class_func=lambda c, _: c,
                seed=27)).make_one_shot_iterator())
    get_next = iterator.get_next()

    with self.test_session() as sess:
      returned = []
      with self.assertRaises(errors.OutOfRangeError):
        while True:
          returned.append(sess.run(get_next))

    returned_classes, returned_classes_and_data = zip(*returned)
    _, returned_data = zip(*returned_classes_and_data)
    self.assertAllEqual([compat.as_bytes(str(c))
                         for c in returned_classes], returned_data)
    total_returned = len(returned_classes)
    # Subsampling rejects a large percentage of the initial data in
    # this case.
    self.assertGreater(total_returned, 20000 * 0.2)
    class_counts = np.array([
        len([True for v in returned_classes if v == c])
        for c in range(5)])
    returned_dist = class_counts / total_returned
    self.assertAllClose(target_dist, returned_dist, atol=1e-2)
Example #3
  def testDistribution(self, initial_known):
    classes = np.random.randint(5, size=(20000,))  # Uniformly sampled
    target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
    initial_dist = [0.2] * 5 if initial_known else None
    classes = math_ops.to_int64(classes)  # needed for Windows build.
    dataset = dataset_ops.Dataset.from_tensor_slices(classes).shuffle(
        200, seed=21).map(lambda c: (c, string_ops.as_string(c))).repeat()

    get_next = dataset.apply(
        resampling.rejection_resample(
            target_dist=target_dist,
            initial_dist=initial_dist,
            class_func=lambda c, _: c,
            seed=27)).make_one_shot_iterator().get_next()

    with self.cached_session() as sess:
      returned = []
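      # The dataset repeats indefinitely, so the test draws a fixed number of samples instead of exhausting it.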
      while len(returned) < 4000:
        returned.append(sess.run(get_next))

    returned_classes, returned_classes_and_data = zip(*returned)
    _, returned_data = zip(*returned_classes_and_data)
    self.assertAllEqual([compat.as_bytes(str(c))
                         for c in returned_classes], returned_data)
    total_returned = len(returned_classes)
    class_counts = np.array([
        len([True for v in returned_classes if v == c])
        for c in range(5)])
    returned_dist = class_counts / total_returned
    self.assertAllClose(target_dist, returned_dist, atol=1e-2)
Example #4
    def testDistribution(self, initial_known):
        classes = np.random.randint(5, size=(20000, ))  # Uniformly sampled
        target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
        initial_dist = [0.2] * 5 if initial_known else None
        classes = math_ops.to_int64(classes)  # needed for Windows build.
        dataset = dataset_ops.Dataset.from_tensor_slices(classes).shuffle(
            200, seed=21).map(lambda c: (c, string_ops.as_string(c))).repeat()

        get_next = dataset.apply(
            resampling.rejection_resample(
                target_dist=target_dist,
                initial_dist=initial_dist,
                class_func=lambda c, _: c,
                seed=27)).make_one_shot_iterator().get_next()

        with self.cached_session() as sess:
            returned = []
            while len(returned) < 4000:
                returned.append(sess.run(get_next))

        returned_classes, returned_classes_and_data = zip(*returned)
        _, returned_data = zip(*returned_classes_and_data)
        self.assertAllEqual(
            [compat.as_bytes(str(c)) for c in returned_classes], returned_data)
        total_returned = len(returned_classes)
        class_counts = np.array([
            len([True for v in returned_classes if v == c]) for c in range(5)
        ])
        returned_dist = class_counts / total_returned
        self.assertAllClose(target_dist, returned_dist, atol=1e-2)
Example #5
  def testVariableDevicePlacement(self):
    classes = np.random.randint(5, size=(20000,))  # Uniformly sampled
    target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
    with ops.device(
        device_setter.replica_device_setter(ps_tasks=1, ps_device="/cpu:0")):
      _ = (dataset_ops.Dataset.from_tensor_slices(classes).shuffle(
          200, seed=21).map(lambda c: (c, string_ops.as_string(c))).apply(
              resampling.rejection_resample(
                  target_dist=target_dist,
                  initial_dist=None,
                  class_func=lambda c, _: c,
                  seed=27)))

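      # rejection_resample creates a single local variable; the test checks it is not pinned to the ps device.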
      self.assertEqual(1, len(variables.local_variables()))
      self.assertEqual(b"",
                       compat.as_bytes(variables.local_variables()[0].device))
Example #6
    def testVariableDevicePlacement(self):
        classes = np.random.randint(5, size=(20000, ))  # Uniformly sampled
        target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
        with ops.device(
                device_setter.replica_device_setter(ps_tasks=1,
                                                    ps_device="/cpu:0")):
            _ = (dataset_ops.Dataset.from_tensor_slices(classes).shuffle(
                200,
                seed=21).map(lambda c: (c, string_ops.as_string(c))).apply(
                    resampling.rejection_resample(target_dist=target_dist,
                                                  initial_dist=None,
                                                  class_func=lambda c, _: c,
                                                  seed=27)))

            self.assertEqual(1, len(variables.local_variables()))
            self.assertEqual(
                b"", compat.as_bytes(variables.local_variables()[0].device))
Example #7
def _time_resampling(test_obj, data_np, target_dist, init_dist, num_to_sample):
    dataset = dataset_ops.Dataset.from_tensor_slices(data_np).repeat()

    # Reshape distribution via rejection sampling.
    dataset = dataset.apply(
        resampling.rejection_resample(class_func=lambda x: x,
                                      target_dist=target_dist,
                                      initial_dist=init_dist,
                                      seed=142))

    get_next = dataset.make_one_shot_iterator().get_next()

    with test_obj.test_session() as sess:
        start_time = time.time()
        for _ in xrange(num_to_sample):
            sess.run(get_next)
        end_time = time.time()

    return end_time - start_time
Example #8
def _time_resampling(
    test_obj, data_np, target_dist, init_dist, num_to_sample):
  dataset = dataset_ops.Dataset.from_tensor_slices(data_np).repeat()

  # Reshape distribution via rejection sampling.
  dataset = dataset.apply(
      resampling.rejection_resample(
          class_func=lambda x: x,
          target_dist=target_dist,
          initial_dist=init_dist,
          seed=142))

  get_next = dataset.make_one_shot_iterator().get_next()

  with test_obj.test_session() as sess:
    start_time = time.time()
    for _ in xrange(num_to_sample):
      sess.run(get_next)
    end_time = time.time()

  return end_time - start_time
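A possible driver for the helper above, with illustrative (not original) distributions and sample counts; it assumes it runs inside a test case so that `self` provides test_session():

  init_dist = [0.25, 0.25, 0.25, 0.25]
  target_dist = [0.0, 0.0, 0.0, 1.0]
  data_np = np.random.choice(len(init_dist), 10000, p=init_dist).astype(np.int64)
  elapsed = _time_resampling(
      self, data_np, target_dist, init_dist, num_to_sample=1000)
  print("rejection_resample took %.3f seconds for 1000 samples" % elapsed)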
Example #9
    def testEdgeCasesSampleFromInitialDataset(self, only_initial_dist):
        init_dist = [0.5, 0.5]
        target_dist = [0.5, 0.5] if only_initial_dist else [0.0, 1.0]
        num_classes = len(init_dist)
        # We don't need many samples to test that this works.
        num_samples = 100
        data_np = np.random.choice(num_classes, num_samples, p=init_dist)

        dataset = dataset_ops.Dataset.from_tensor_slices(data_np)

        # Reshape distribution.
        dataset = dataset.apply(
            resampling.rejection_resample(class_func=lambda x: x,
                                          target_dist=target_dist,
                                          initial_dist=init_dist))

        get_next = dataset.make_one_shot_iterator().get_next()

        with self.cached_session() as sess:
            returned = []
            with self.assertRaises(errors.OutOfRangeError):
                while True:
                    returned.append(sess.run(get_next))
Example #10
  def testRandomClasses(self):
    init_dist = [0.25, 0.25, 0.25, 0.25]
    target_dist = [0.0, 0.0, 0.0, 1.0]
    num_classes = len(init_dist)
    # We don't need many samples to test a dirac-delta target distribution.
    num_samples = 100
    data_np = np.random.choice(num_classes, num_samples, p=init_dist)

    dataset = dataset_ops.Dataset.from_tensor_slices(data_np)

    # Apply a random mapping that preserves the data distribution.
    def _remap_fn(_):
      return math_ops.cast(random_ops.random_uniform([1]) * num_classes,
                           dtypes.int32)[0]
    dataset = dataset.map(_remap_fn)

    # Reshape distribution.
    dataset = dataset.apply(
        resampling.rejection_resample(
            class_func=lambda x: x,
            target_dist=target_dist,
            initial_dist=init_dist))

    get_next = dataset.make_one_shot_iterator().get_next()

    with self.test_session() as sess:
      returned = []
      with self.assertRaises(errors.OutOfRangeError):
        while True:
          returned.append(sess.run(get_next))

    classes, _ = zip(*returned)
    bincount = np.bincount(
        np.array(classes),
        minlength=num_classes).astype(np.float32) / len(classes)

    self.assertAllClose(target_dist, bincount, atol=1e-2)
Example #11
    def testRandomClasses(self):
        init_dist = [0.25, 0.25, 0.25, 0.25]
        target_dist = [0.0, 0.0, 0.0, 1.0]
        num_classes = len(init_dist)
        # We don't need many samples to test a dirac-delta target distribution.
        num_samples = 100
        data_np = np.random.choice(num_classes, num_samples, p=init_dist)

        dataset = dataset_ops.Dataset.from_tensor_slices(data_np)

        # Apply a random mapping that preserves the data distribution.
        def _remap_fn(_):
            return math_ops.cast(
                random_ops.random_uniform([1]) * num_classes, dtypes.int32)[0]

        dataset = dataset.map(_remap_fn)

        # Reshape distribution.
        dataset = dataset.apply(
            resampling.rejection_resample(class_func=lambda x: x,
                                          target_dist=target_dist,
                                          initial_dist=init_dist))

        get_next = dataset.make_one_shot_iterator().get_next()

        with self.cached_session() as sess:
            returned = []
            with self.assertRaises(errors.OutOfRangeError):
                while True:
                    returned.append(sess.run(get_next))

        classes, _ = zip(*returned)
        bincount = np.bincount(np.array(classes),
                               minlength=num_classes).astype(
                                   np.float32) / len(classes)

        self.assertAllClose(target_dist, bincount, atol=1e-2)
Example #12
  def testEdgeCasesSampleFromInitialDataset(self, only_initial_dist):
    init_dist = [0.5, 0.5]
    target_dist = [0.5, 0.5] if only_initial_dist else [0.0, 1.0]
    num_classes = len(init_dist)
    # We don't need many samples to test that this works.
    num_samples = 100
    data_np = np.random.choice(num_classes, num_samples, p=init_dist)

    dataset = dataset_ops.Dataset.from_tensor_slices(data_np)

    # Reshape distribution.
    dataset = dataset.apply(
        resampling.rejection_resample(
            class_func=lambda x: x,
            target_dist=target_dist,
            initial_dist=init_dist))

    get_next = dataset.make_one_shot_iterator().get_next()

    with self.cached_session() as sess:
      returned = []
      with self.assertRaises(errors.OutOfRangeError):
        while True:
          returned.append(sess.run(get_next))
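For reference outside contrib: the same transformation later shipped as tf.data.experimental.rejection_resample, applied the same way via Dataset.apply. A minimal eager-mode sketch, assuming TensorFlow 2.x and illustrative distributions:

import numpy as np
import tensorflow as tf

init_dist = [0.5, 0.5]
target_dist = [0.0, 1.0]
data_np = np.random.choice(2, 1000, p=init_dist).astype(np.int64)

dataset = tf.data.Dataset.from_tensor_slices(data_np).apply(
    tf.data.experimental.rejection_resample(
        class_func=lambda x: x,
        target_dist=target_dist,
        initial_dist=init_dist))

# Each resampled element is a (class, original_example) pair.
classes = [int(c) for c, _ in dataset]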