Example #1
    def test_load(self):
        name = "empty_dataset_builder/k1=1"
        data_dir = "foo"
        as_dataset_kwargs = dict(a=1, b=2)

        # EmptyDatasetBuilder returns self from as_dataset
        builder = registered.load(name=name,
                                  split=dataset_builder.Split.TEST,
                                  data_dir=data_dir,
                                  download=False,
                                  as_dataset_kwargs=as_dataset_kwargs)
        self.assertTrue(builder.as_dataset_called)
        self.assertFalse(builder.download_called)
        self.assertEqual(dataset_builder.Split.TEST,
                         builder.as_dataset_kwargs.pop("split"))
        self.assertEqual(builder.as_dataset_kwargs, as_dataset_kwargs)
        self.assertEqual(dict(data_dir=data_dir, k1=1), builder.kwargs)

        builder = registered.load(name=name,
                                  split=dataset_builder.Split.TRAIN,
                                  data_dir=data_dir,
                                  download=True,
                                  as_dataset_kwargs=as_dataset_kwargs)
        self.assertTrue(builder.as_dataset_called)
        self.assertTrue(builder.download_called)
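
The assertions above read `kwargs`, `download_called`, `as_dataset_called`, and `as_dataset_kwargs` off a fake builder registered under the name `empty_dataset_builder`. That fixture is not shown on this page; a minimal sketch of a test double consistent with what the test checks (an assumption, not the actual TFDS fixture) looks like:

class EmptyDatasetBuilder(object):
    """Hypothetical test double: records calls instead of building data."""

    def __init__(self, **kwargs):
        self.kwargs = kwargs  # e.g. data_dir, plus kwargs parsed from the name (k1=1)
        self.download_called = False
        self.as_dataset_called = False

    def download_and_prepare(self, **kwargs):
        self.download_called = True

    def as_dataset(self, **kwargs):
        self.as_dataset_called = True
        self.as_dataset_kwargs = kwargs
        return self  # hence "returns self from as_dataset" above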
Example #2

  def test_load_from_gcs(self):
    from tensorflow_datasets.image import mnist  # pylint:disable=g-import-not-at-top
    with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
      with absltest.mock.patch.object(
          mnist.MNIST, "_download_and_prepare",
          side_effect=NotImplementedError):
        # Make sure the dataset cannot be generated.
        with self.assertRaises(NotImplementedError):
          registered.load(
              name="mnist",
              data_dir=tmp_dir)
        # Enable GCS access so that dataset will be loaded from GCS.
        with self.gcs_access():
          _, info = registered.load(
              name="mnist",
              data_dir=tmp_dir,
              with_info=True)
      self.assertSetEqual(
          set(["dataset_info.json", "image.image.json",
               "mnist-test.counts.txt-00000-of-00001",
               "mnist-test.tfrecord-00000-of-00001",
               "mnist-train.counts.txt-00000-of-00001"] +
              ["mnist-train.tfrecord-0000%d-of-00010" % i for i in range(10)]),
          set(tf.io.gfile.listdir(os.path.join(tmp_dir, "mnist/1.0.0"))))

      self.assertEqual(set(info.splits.keys()), set(["train", "test"]))
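
Outside the test harness, the same code path is reached through the public entry point; a minimal sketch, assuming `tfds.load` is the public re-export of the `registered.load` exercised here:

import tensorflow_datasets as tfds

# If "mnist" has not been generated under data_dir, TFDS can fall back to
# the already-prepared files in its public GCS bucket (what gcs_access()
# enables in the test above).
ds, info = tfds.load("mnist", split="train", data_dir="/tmp/tfds",
                     with_info=True)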
Example #3
    def test_load(self):
        name = "empty_dataset_builder/k1=1"
        data_dir = "foo"
        as_dataset_kwargs = dict(a=1, b=2)

        # EmptyDatasetBuilder returns self from as_dataset
        builder = registered.load(name=name,
                                  split=splits.Split.TEST,
                                  data_dir=data_dir,
                                  download=False,
                                  as_dataset_kwargs=as_dataset_kwargs)
        self.assertTrue(builder.as_dataset_called)
        self.assertFalse(builder.download_called)
        self.assertEqual(splits.Split.TEST,
                         builder.as_dataset_kwargs.pop("split"))
        self.assertEqual(1, builder.as_dataset_kwargs.pop("batch_size"))
        self.assertFalse(builder.as_dataset_kwargs.pop("as_supervised"))
        self.assertEqual(builder.as_dataset_kwargs, as_dataset_kwargs)
        self.assertEqual(dict(data_dir=data_dir, k1=1, config=None),
                         builder.kwargs)

        builder = registered.load(name,
                                  split=splits.Split.TRAIN,
                                  data_dir=data_dir,
                                  download=True,
                                  as_dataset_kwargs=as_dataset_kwargs)
        self.assertTrue(builder.as_dataset_called)
        self.assertTrue(builder.download_called)
Example #4
 def test_invalid_split_dataset(self):
   with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
     with self.assertRaisesWithPredicateMatch(ValueError, "ALL is a special"):
       # Raise error during .download_and_prepare()
       registered.load(
           name="invalid_split_dataset",
           data_dir=tmp_dir,
       )
Example #5

    def test_show_examples(self, mock_fig):
        with testing.mock_data(num_examples=20):
            ds, ds_info = registered.load('imagenet2012',
                                          split='train',
                                          with_info=True)
            visualization.show_examples(ds_info, ds)

            ds, ds_info = registered.load('crema_d',
                                          split='validation',
                                          with_info=True)
            visualization.show_examples(ds_info, ds)
Example #6

  def test_load_with_config(self):
    data_dir = "foo"
    name = "empty_dataset_builder/bar/k1=1"
    # EmptyDatasetBuilder returns self from as_dataset
    builder = registered.load(name=name, split=splits.Split.TEST,
                              data_dir=data_dir)
    self.assertEqual(dict(data_dir=data_dir, k1=1, config="bar"),
                     builder.kwargs)

    name = "empty_dataset_builder/bar"
    builder = registered.load(name=name, split=splits.Split.TEST,
                              data_dir=data_dir)
    self.assertEqual(dict(data_dir=data_dir, config="bar"),
                     builder.kwargs)
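
Together with Example #1, these assertions pin down the name grammar: a bare segment after the dataset name is treated as the config, and `key=value` segments become builder kwargs. A hypothetical re-implementation of that parsing, for illustration only (the real parser lives inside tensorflow_datasets):

def parse_builder_name(name):
    """Sketch of the name parsing the tests above rely on (hypothetical)."""
    parts = name.split("/")
    dataset, config, kwargs = parts[0], None, {}
    for part in parts[1:]:
        if "=" in part:
            key, value = part.split("=", 1)
            kwargs[key] = int(value) if value.isdigit() else value
        else:
            config = part
    return dataset, config, kwargs

assert parse_builder_name("empty_dataset_builder/bar/k1=1") == (
    "empty_dataset_builder", "bar", {"k1": 1})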
Example #7
 def test_mocking_imagenet(self):
     with mocking.mock_data():
         ds = registered.load('imagenet2012', split='train')
         for ex in ds.take(10):
             self.assertCountEqual(list(ex.keys()),
                                   ['file_name', 'image', 'label'])
             ex['image'].shape.assert_is_compatible_with((None, None, 3))
Example #8

    def test_read_config(self):
        is_called = []

        def interleave_sort(lists):
            is_called.append(True)
            return lists

        with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
            read_config = read_config_lib.ReadConfig(
                experimental_interleave_sort_fn=interleave_sort,
            )
            read_config.options.experimental_stats.prefix = "tfds_prefix"
            ds = registered.load(
                name="dummy_dataset_shared_generator",
                data_dir=tmp_dir,
                split="train",
                read_config=read_config,
                shuffle_files=True,
            )

            # Check that the ReadConfig options are properly set
            self.assertEqual(ds.options().experimental_stats.prefix,
                             "tfds_prefix")

            # The instruction function should have been called
            self.assertEqual(is_called, [True])
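
The same knob is reachable from user code; a minimal sketch, assuming the public `tfds.ReadConfig` re-export of `read_config_lib.ReadConfig`:

import tensorflow_datasets as tfds

# ReadConfig centralizes input-pipeline options (seeds, interleaving,
# tf.data options) and is forwarded down to the builder's as_dataset().
read_config = tfds.ReadConfig(shuffle_seed=42)
ds = tfds.load("mnist", split="train",
               read_config=read_config, shuffle_files=True)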
Example #9
    def test_determinism(self):
        with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
            ds = registered.load(name="dummy_dataset_shared_generator",
                                 data_dir=tmp_dir,
                                 split=splits_lib.Split.TRAIN,
                                 as_dataset_kwargs=dict(shuffle_files=False))
            ds_values = list(dataset_utils.as_numpy(ds))

            # Ensure determinism. If this test fails, it means the numpy random
            # module isn't always deterministic (e.g. across versions or
            # architectures), so our datasets aren't guaranteed to be either.
            l = list(range(20))
            np.random.RandomState(42).shuffle(l)
            self.assertEqual(l, [
                0, 17, 15, 1, 8, 5, 11, 3, 18, 16, 13, 2, 9, 19, 4, 12, 7, 10,
                14, 6
            ])

            # Ensure determinism. If this test fails, it means the dataset is
            # not deterministically generated.
            self.assertEqual(
                [e["x"] for e in ds_values],
                [
                    16, 1, 2, 3, 10, 17, 0, 11, 14, 7, 4, 9, 18, 15, 8, 19, 6,
                    13, 12, 5
                ],
            )
Example #10

 def test_load(self):
     with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
         dataset = registered.load(name="dummy_dataset_shared_generator",
                                   data_dir=tmp_dir,
                                   download=True,
                                   split=dataset_builder.Split.TRAIN)
         data = list(dataset)
         self.assertEqual(20, len(data))
Example #11
  def test_load(self):
    name = "empty_dataset_builder/k1=1"
    data_dir = "foo"
    as_dataset_kwargs = dict(a=1, b=2)

    # EmptyDatasetBuilder returns self from as_dataset
    builder = registered.load(
        name=name, data_dir=data_dir, download=False, **as_dataset_kwargs)
    self.assertTrue(builder.as_dataset_called)
    self.assertFalse(builder.download_called)
    self.assertEqual(builder.as_dataset_kwargs, as_dataset_kwargs)
    self.assertEqual(dict(data_dir=data_dir, k1=1), builder.kwargs)

    builder = registered.load(
        name=name, data_dir=data_dir, download=True, **as_dataset_kwargs)
    self.assertTrue(builder.as_dataset_called)
    self.assertTrue(builder.download_called)
Example #12
 def test_load(self):
     with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
         dataset = registered.load(name="dummy_dataset_with_configs",
                                   data_dir=tmp_dir,
                                   download=True,
                                   split=splits_lib.Split.TRAIN)
         data = list(dataset_utils.as_numpy(dataset))
         self.assertEqual(20, len(data))
         self.assertLess(data[0]["x"], 30)
Example #13
 def test_mocking_lm1b(self):
   with mocking.mock_data():
     ds = registered.load('lm1b/bytes', split='train')
     self.assertEqual(ds.element_spec, {
         'text': tf.TensorSpec(shape=(None,), dtype=tf.int64),
     })
     for ex in ds.take(10):
       self.assertEqual(ex['text'].dtype, tf.int64)
       ex['text'].shape.assert_is_compatible_with((None,))
Example #14
 def test_mocking_imagenet(self):
   with mocking.mock_data():
     ds = registered.load('imagenet2012', split='train')
     self.assertEqual(ds.element_spec, {
         'file_name': tf.TensorSpec(shape=(), dtype=tf.string),
         'image': tf.TensorSpec(shape=(None, None, 3), dtype=tf.uint8),
         'label': tf.TensorSpec(shape=(), dtype=tf.int64),
     })
     list(ds.take(3))  # Iteration should work
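
The same pattern works in user test suites; a minimal sketch, assuming the public `tfds.testing.mock_data` re-export of the `mocking.mock_data` helper used above:

import tensorflow as tf
import tensorflow_datasets as tfds

# Under mock_data(), load() skips reading real files and yields random
# examples matching the dataset's declared feature spec.
with tfds.testing.mock_data(num_examples=5):
    ds = tfds.load("imagenet2012", split="train")
    for ex in ds:
        assert ex["image"].dtype == tf.uint8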
Example #15
    def test_load(self):
        name = "empty_dataset_builder/k1=1"
        data_dir = "foo"
        as_dataset_kwargs = dict(a=1, b=2)

        # EmptyDatasetBuilder returns self from as_dataset
        builder = registered.load(name=name,
                                  split=splits.Split.TEST,
                                  data_dir=data_dir,
                                  download=False,
                                  as_dataset_kwargs=as_dataset_kwargs)
        self.assertTrue(builder.as_dataset_called)
        self.assertFalse(builder.download_called)
        self.assertEqual(splits.Split.TEST,
                         builder.as_dataset_kwargs.pop("split"))
        self.assertEqual(None, builder.as_dataset_kwargs.pop("batch_size"))
        self.assertFalse(builder.as_dataset_kwargs.pop("as_supervised"))
        self.assertFalse(builder.as_dataset_kwargs.pop("decoders"))
        self.assertIsNone(builder.as_dataset_kwargs.pop("in_memory"))
        self.assertIsNone(builder.as_dataset_kwargs.pop("read_config"))
        self.assertFalse(builder.as_dataset_kwargs.pop("shuffle_files"))
        self.assertEqual(builder.as_dataset_kwargs, as_dataset_kwargs)
        self.assertEqual(dict(data_dir=data_dir, k1=1), builder.kwargs)

        builder = registered.load(name,
                                  split=splits.Split.TRAIN,
                                  data_dir=data_dir,
                                  download=True,
                                  as_dataset_kwargs=as_dataset_kwargs)
        self.assertTrue(builder.as_dataset_called)
        self.assertTrue(builder.download_called)

        # Test different batch_size values.
        # By default, batch_size=None.
        builder = registered.load(name=name,
                                  split=splits.Split.TEST,
                                  data_dir=data_dir)
        self.assertEqual(None, builder.as_dataset_kwargs.pop("batch_size"))
        # Setting batch_size=1
        builder = registered.load(name=name,
                                  split=splits.Split.TEST,
                                  data_dir=data_dir,
                                  batch_size=1)
        self.assertEqual(1, builder.as_dataset_kwargs.pop("batch_size"))
Example #16
 def test_max_values(self):
   with mocking.mock_data(num_examples=50):
     ds = registered.load('mnist', split='train')
     for ex in ds.take(50):
       self.assertLessEqual(tf.math.reduce_max(ex['label']).numpy(), 10)
     self.assertEqual(  # Test determinism
         [ex['label'].numpy() for ex in ds.take(5)],
         [1, 9, 2, 5, 3],
     )
     self.assertEqual(  # Iterating twice should yield the same samples
         [ex['label'].numpy() for ex in ds.take(5)],
         [1, 9, 2, 5, 3],
     )
Example #17

  def test_multi_split(self):
    with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
      ds_train, ds_test = registered.load(
          name="dummy_dataset_shared_generator",
          data_dir=tmp_dir,
          split=["train", "test"],
          shuffle_files=False)

      data = list(dataset_utils.as_numpy(ds_train))
      self.assertEqual(20, len(data))

      data = list(dataset_utils.as_numpy(ds_test))
      self.assertEqual(10, len(data))
Example #18
    def test_multi_split(self):
        with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
            ds_train, ds_test = registered.load(
                name="dummy_dataset_shared_generator",
                data_dir=tmp_dir,
                split=[splits_lib.Split.TRAIN, splits_lib.Split.TEST],
                as_dataset_kwargs=dict(shuffle_files=False))

            data = list(dataset_utils.as_numpy(ds_train))
            self.assertEqual(20, len(data))

            data = list(dataset_utils.as_numpy(ds_test))
            self.assertEqual(10, len(data))
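
Both variants show that a list-valued `split` returns one dataset per requested split, in the same order; a public-API sketch (again assuming `tfds.load` forwards to `registered.load`):

import tensorflow_datasets as tfds

# The result is ordered like the `split` list, so it unpacks directly.
ds_train, ds_test = tfds.load("mnist", split=["train", "test"])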
Example #19
  def test_custom_as_dataset(self):
    def _as_dataset(self, *args, **kwargs):  # pylint: disable=unused-argument
      return tf.data.Dataset.from_generator(
          lambda: ({  # pylint: disable=g-long-lambda
              'text': t,
          } for t in ['some sentence', 'some other sentence']),
          output_types=self.info.features.dtype,
          output_shapes=self.info.features.shape,
      )

    with mocking.mock_data(as_dataset_fn=_as_dataset):
      ds = registered.load('lm1b', split='train')
      out = [ex['text'] for ex in dataset_utils.as_numpy(ds)]
      self.assertEqual(out, [b'some sentence', b'some other sentence'])
Example #20
 def test_max_values(self):
   with mocking.mock_data(num_examples=50):
     ds = registered.load('mnist', split='train')
     self.assertEqual(ds.element_spec, {
         'image': tf.TensorSpec(shape=(28, 28, 1), dtype=tf.uint8),
         'label': tf.TensorSpec(shape=(), dtype=tf.int64),
     })
     for ex in ds.take(50):
       self.assertLessEqual(tf.math.reduce_max(ex['label']).numpy(), 10)
     self.assertEqual(  # Test determinism
         [ex['label'].numpy() for ex in ds.take(5)],
         [1, 9, 2, 5, 3],
     )
     self.assertEqual(  # Iterating twice should yield the same samples
         [ex['label'].numpy() for ex in ds.take(5)],
         [1, 9, 2, 5, 3],
     )
Example #21
    def test_nested_sequence(self):
        with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
            ds_train, ds_info = registered.load(name="nested_sequence_builder",
                                                data_dir=tmp_dir,
                                                split="train",
                                                with_info=True,
                                                shuffle_files=False)
            ex0, ex1, ex2 = [
                ex["frames"]["coordinates"]
                for ex in dataset_utils.as_numpy(ds_train)
            ]
            self.assertAllEqual(
                ex0,
                tf.ragged.constant([
                    [[0, 1], [2, 3], [4, 5]],
                    [],
                    [[6, 7]],
                ], inner_shape=(2,)))
            self.assertAllEqual(ex1, tf.ragged.constant([], ragged_rank=1))
            self.assertAllEqual(
                ex2,
                tf.ragged.constant([
                    [[10, 11]],
                    [[12, 13], [14, 15]],
                ], inner_shape=(2,)))

            self.assertEqual(
                ds_info.features.dtype,
                {"frames": {
                    "coordinates": tf.int32
                }},
            )
            self.assertEqual(
                ds_info.features.shape,
                {"frames": {
                    "coordinates": (None, None, 2)
                }},
            )
            nested_tensor_info = ds_info.features.get_tensor_info()
            self.assertEqual(
                nested_tensor_info["frames"]["coordinates"].sequence_rank,
                2,
            )
Example #22
 def test_mocking_imagenet_decoders(self):
   with mocking.mock_data():
     ds, ds_info = registered.load(
         'imagenet2012',
         split='train',
         decoders={'image': decode.SkipDecoding()},
         with_info=True,
     )
     self.assertEqual(ds.element_spec, {
         'file_name': tf.TensorSpec(shape=(), dtype=tf.string),
         'image': tf.TensorSpec(shape=(), dtype=tf.string),  # Encoded images
         'label': tf.TensorSpec(shape=(), dtype=tf.int64),
     })
     for ex in ds.take(10):
        # Image decoding should work
       image = ds_info.features['image'].decode_example(ex['image'])
       image.shape.assert_is_compatible_with((None, None, 3))
       self.assertEqual(image.dtype, tf.uint8)
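
Worth noting as a design point: `decode.SkipDecoding()` leaves `image` as the raw encoded bytes (`tf.string`), so the caller can postpone or customize decoding; `ds_info.features['image'].decode_example` then applies the standard decoder on demand.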
Example #23
 def test_load_all_splits(self):
     name = "empty_dataset_builder"
     # EmptyDatasetBuilder returns self from as_dataset
     builder = registered.load(name=name, data_dir="foo")
     self.assertTrue(builder.as_dataset_called)
     self.assertEqual(None, builder.as_dataset_kwargs.pop("split"))
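
The `None` split recorded here is what `load` forwards when the caller omits `split`; in that case the public API returns a mapping from split name to dataset rather than a single dataset. A short sketch:

import tensorflow_datasets as tfds

# No split requested -> a dict with one tf.data.Dataset per split.
all_splits = tfds.load("mnist")
ds_train, ds_test = all_splits["train"], all_splits["test"]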
Example #24
 def test_max_values(self):
   with mocking.mock_data(num_examples=50):
     ds = registered.load('mnist', split='train')
     for ex in ds.take(50):
       self.assertLessEqual(tf.math.reduce_max(ex['label']).numpy(), 10)
Example #25
def _as_df(ds_name: str) -> pandas.DataFrame:
    """Loads the dataset as `pandas.DataFrame`."""
    with testing.mock_data(num_examples=3):
        ds, ds_info = registered.load(ds_name, split='train', with_info=True)
    df = as_dataframe.as_dataframe(ds, ds_info)
    return df
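
A hypothetical caller of this helper, e.g. in a smoke test (`as_dataframe` maps each feature to a DataFrame column):

df = _as_df("mnist")
assert len(df) == 3   # one row per mocked example
print(df.columns)     # feature names become columns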
Example #26
 def test_mocking_lm1b(self):
     with mocking.mock_data():
         ds = registered.load('lm1b/bytes', split='train')
         for ex in ds.take(10):
             self.assertEqual(ex['text'].dtype, tf.int64)
              ex['text'].shape.assert_is_compatible_with((None,))