def test_builder_from_directory(code_builder: dataset_builder.DatasetBuilder):
  """Builder can be created from the files only."""
  # Reconstruct the dataset
  builder = read_only_builder.builder_from_directory(code_builder.data_dir)
  assert builder.name == code_builder.name
  assert builder.data_dir == code_builder.data_dir
  assert builder.info.version == code_builder.info.version
  assert builder.info.full_name == code_builder.info.full_name
  assert repr(builder.info) == repr(code_builder.info)
  assert builder.VERSION == code_builder.info.version
  assert builder.RELEASE_NOTES == code_builder.info.release_notes
  assert builder.__module__ == type(code_builder).__module__
  assert read_only_builder.ReadOnlyBuilder.VERSION is None

  if code_builder.builder_config:
    assert builder.builder_config
    code_config = code_builder.builder_config
    file_config = builder.builder_config
    # Config attributes should be restored too
    assert code_config.name == file_config.name
    assert code_config.description == file_config.description
    assert code_config.version == file_config.version

  # Test that the dataset can be read
  ds = dataset_utils.as_numpy(builder.as_dataset(split='train').take(5))
  origin_ds = dataset_utils.as_numpy(builder.as_dataset(split='train').take(5))
  assert [ex['id'] for ex in ds] == [ex['id'] for ex in origin_ds]

  builder.download_and_prepare()  # Should be a no-op

  # Test pickling and un-pickling
  builder2 = dill.loads(dill.dumps(builder))
  assert builder.name == builder2.name
  assert builder.version == builder2.version
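# Illustrative usage sketch (not part of the test above): reconstructing a builder
# from an already-prepared dataset directory only, without the original dataset
# code. Assumes a recent tensorflow_datasets release that exposes the public
# `tfds.builder_from_directory`; the path below is a placeholder.
import tensorflow_datasets as tfds

builder = tfds.builder_from_directory('/path/to/data_dir/my_dataset/1.0.0')
ds = builder.as_dataset(split='train')
for ex in tfds.as_numpy(ds.take(2)):
  print(sorted(ex.keys()))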
def test_all_splits(self):
  splits = dataset_utils.as_numpy(self.builder.as_dataset(batch_size=-1))
  self.assertSetEqual(
      set(splits.keys()),
      set([splits_lib.Split.TRAIN, splits_lib.Split.TEST]))

  # Test that enum and string both access same object
  self.assertIs(splits["train"], splits[splits_lib.Split.TRAIN])
  self.assertIs(splits["test"], splits[splits_lib.Split.TEST])

  train_data = splits[splits_lib.Split.TRAIN]["x"]
  test_data = splits[splits_lib.Split.TEST]["x"]
  self.assertEqual(20, len(train_data))
  self.assertEqual(10, len(test_data))
  self.assertEqual(sum(range(30)), int(train_data.sum() + test_data.sum()))
def test_custom_as_dataset(self):
  def _as_dataset(self, *args, **kwargs):  # pylint: disable=unused-argument
    return tf.data.Dataset.from_generator(
        lambda: ({  # pylint: disable=g-long-lambda
            'text': t,
        } for t in ['some sentence', 'some other sentence']),
        output_types=self.info.features.dtype,
        output_shapes=self.info.features.shape,
    )

  with mocking.mock_data(as_dataset_fn=_as_dataset):
    ds = registered.load('lm1b', split='train')
    out = [ex['text'] for ex in dataset_utils.as_numpy(ds)]
    self.assertEqual(out, [b'some sentence', b'some other sentence'])
def test_supervised_keys_nested(self):
  self.builder = DummyDatasetWithSupervisedKeys(
      data_dir=self._tfds_tmp_dir,
      supervised_keys=("x", ("x", ("x", "x")), {
          "a": "x",
          "b": ("x",)
      }))
  single, pair, a_dict = dataset_utils.as_numpy(
      self.builder.as_dataset(
          split=splits_lib.Split.TRAIN, as_supervised=True, batch_size=-1))
  self.assertEqual(single.shape[0], 20)
  self.assertLen(pair, 2)
  self.assertEqual(pair[1][1].shape[0], 20)
  self.assertLen(a_dict, 2)
  self.assertEqual(a_dict["b"][0].shape[0], 20)
def main(_):
  builder_kwargs = {
      "validation_split": flags.validation_split,
  }
  tfdataset_path = local_settings.TF_DATASET_PATH
  if flags.tfds_path is not None:
    tfdataset_path = flags.tfds_path

  train, dsinfo = tfds.load(
      "pacs",
      data_dir=tfdataset_path,
      split=tfds.Split.VALIDATION,
      builder_kwargs=builder_kwargs,
      with_info=True)

  for example in dataset_utils.as_numpy(train):
    import pdb; pdb.set_trace()
    print(example["attributes"]["label"])
def test_decoding(self):
  self.assertFeatureEagerOnly(
      feature=feature_lib.Dataset(
          {
              'a': tf.string,
              'b': {
                  'c': tf.uint8,
              }
          },
          length=None),
      shape={
          'a': (),
          'b': {
              'c': (),
          }
      },
      dtype={
          'a': tf.string,
          'b': {
              'c': tf.uint8,
          }
      },
      tests=[
          testing.FeatureExpectationItem(
              value=dataset_utils.as_numpy(
                  tf.data.Dataset.from_tensor_slices({
                      'a': ['aa', 'b', 'ccc'],
                      'b': {
                          'c': [1, 2, 3],
                      }
                  })),
              decoders={
                  'b': {
                      'c': IncrementDecoder(),
                  },
              },
              expected=tf.data.Dataset.from_tensor_slices({
                  'a': [tf.compat.as_bytes(t) for t in ('aa', 'b', 'ccc')],
                  'b': {
                      'c': [2, 3, 4],
                  }
              }),
          ),
      ],
  )
def test_nested_sequence(self):
  with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
    ds_train, ds_info = registered.load(
        name="nested_sequence_builder",
        data_dir=tmp_dir,
        split="train",
        with_info=True,
        shuffle_files=False)
    ex0, ex1, ex2 = [
        ex["frames"]["coordinates"]
        for ex in dataset_utils.as_numpy(ds_train)
    ]
    self.assertAllEqual(
        ex0,
        tf.ragged.constant([
            [[0, 1], [2, 3], [4, 5]],
            [],
            [[6, 7]],
        ], inner_shape=(2,)))
    self.assertAllEqual(ex1, tf.ragged.constant([], ragged_rank=1))
    self.assertAllEqual(
        ex2,
        tf.ragged.constant([
            [[10, 11]],
            [[12, 13], [14, 15]],
        ], inner_shape=(2,)))

    self.assertEqual(
        ds_info.features.dtype,
        {"frames": {"coordinates": tf.int32}},
    )
    self.assertEqual(
        ds_info.features.shape,
        {"frames": {"coordinates": (None, None, 2)}},
    )
    nested_tensor_info = ds_info.features.get_tensor_info()
    self.assertEqual(
        nested_tensor_info["frames"]["coordinates"].sequence_rank,
        2,
    )
def features_encode_decode(features_dict, example, decoders):
  """Runs the full pipeline: encode > write > tmp files > read > decode."""
  # Serialize/deserialize the example
  serialized_example = features_dict.serialize_example(example)

  decode_fn = functools.partial(
      features_dict.deserialize_example,
      decoders=decoders,
  )

  ds = tf.data.Dataset.from_tensors(serialized_example)
  ds = ds.map(decode_fn)

  if tf.executing_eagerly():
    out_tensor = next(iter(ds))
  else:
    out_tensor = tf.compat.v1.data.make_one_shot_iterator(ds).get_next()
  out_numpy = dataset_utils.as_numpy(out_tensor)
  return out_tensor, out_numpy, ds.element_spec
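# Minimal usage sketch for the helper above (an illustrative assumption, not part
# of the source): round-trips a trivial example through serialize/deserialize.
# Assumes `tfds.features.FeaturesDict` is available and that `serialize_example`
# accepts a raw (un-encoded) example, as the helper itself does.
import tensorflow as tf
import tensorflow_datasets as tfds

features = tfds.features.FeaturesDict({'x': tf.int64})
_, out_numpy, _ = features_encode_decode(features, {'x': 3}, decoders=None)
assert out_numpy['x'] == 3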
def _read_records(path, file_format=file_adapters.DEFAULT_FILE_FORMAT):
  """Returns (files_names, list_of_records_in_each_file).

  Args:
    path: path to tfrecord, omitting suffix.
    file_format: format of the record files.
  """
  # Ignore _index.json files.
  paths = sorted(tf.io.gfile.glob('%s-*-of-*' % path))
  paths = [p for p in paths if not p.endswith(writer_lib._INDEX_PATH_SUFFIX)]
  all_recs = []
  for fpath in paths:
    all_recs.append(
        list(
            dataset_utils.as_numpy(
                file_adapters.ADAPTER_FOR_FORMAT[file_format].make_tf_data(
                    fpath))))
  return [os.path.basename(p) for p in paths], all_recs
def assertFeature(self, specs, serialized_info, tests):
  """Test the TFRecordExampleAdapter encoding."""
  adapter = file_format_adapter.TFRecordExampleAdapter(specs)

  with self._subTest("serialized_info"):
    self.assertEqual(serialized_info, adapter._parser._build_feature_specs())

  for i, test in enumerate(tests):
    with self._subTest(str(i)):
      if test.raise_cls is not None:
        with self.assertRaisesWithPredicateMatch(
            test.raise_cls, test.raise_msg):
          adapter._serializer.serialize_example(test.value)
        continue

      serialized = adapter._serializer.serialize_example(test.value)
      if test.expected_serialized is not None:
        example_proto = tf.train.Example()
        example_proto.ParseFromString(serialized)
        expected_proto = tf.train.Example(
            features=tf.train.Features(feature=test.expected_serialized))
        self.assertEqual(expected_proto, example_proto)

      example = _parse_example(serialized, adapter._parser.parse_example)

      with self._subTest("dtype"):
        out_dtypes = utils.map_nested(lambda s: s.dtype, example)
        expected_dtypes = utils.map_nested(lambda s: s.dtype, specs)
        self.assertEqual(out_dtypes, expected_dtypes)
      with self._subTest("shape"):
        # For shape, because (None, 3) matches (5, 3), we use
        # tf.TensorShape.assert_is_compatible_with on each of the elements.
        utils.map_nested(
            lambda x: x[0].shape.assert_is_compatible_with(x[1].shape),
            utils.zip_nested(example, specs))

      np_example = dataset_utils.as_numpy(example)
      self.assertAllEqualNested(np_example, test.expected)
def _assertAsDataset(self, builder): """Check the label distribution. This checks that lable get correctly converted between the synset ids and integers. Args: builder: The ImagenetA dataset builder. """ super()._assertAsDataset(builder) label_frequncies = collections.Counter() label_feature = builder.info.features['label'] dataset = builder.as_dataset() for features in dataset_utils.as_numpy(dataset['test']): label_frequncies.update([label_feature.int2str(features['label'])]) self.assertEqual(dict(label_frequncies), {'n01580077': 2, 'n01616318': 3, 'n07697313': 5})
def test_ragged_tensors(self):
  rt = tf.ragged.constant([
      [1, 2, 3],
      [],
      [4, 5],
  ])
  rt = dataset_utils.as_numpy(rt)

  if not tf.executing_eagerly():
    # Output of `sess.run(rt)` is a `RaggedTensorValue` object
    self.assertIsInstance(rt, tf.compat.v1.ragged.RaggedTensorValue)
  else:
    self.assertIsInstance(rt, tf.RaggedTensor)

  self.assertAllEqual(rt, tf.ragged.constant([
      [1, 2, 3],
      [],
      [4, 5],
  ]))
def _assertBeamGeneration(self, dl_config, dataset_cls, dataset_name):
  with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
    builder = dataset_cls(data_dir=tmp_dir)
    builder.download_and_prepare(download_config=dl_config)

    data_dir = os.path.join(tmp_dir, dataset_name, "1.0.0")
    self.assertEqual(data_dir, builder._data_dir)

    # Check number of shards
    self._assertShards(
        data_dir,
        pattern="%s-test.tfrecord-{:05}-of-{:05}" % dataset_name,
        # Liquid sharding is not guaranteed to always use the same number.
        num_shards=builder.info.splits["test"].num_shards,
    )
    self._assertShards(
        data_dir,
        pattern="%s-train.tfrecord-{:05}-of-{:05}" % dataset_name,
        num_shards=1,
    )

    datasets = dataset_utils.as_numpy(builder.as_dataset())

    def get_id(ex):
      return ex["id"]

    self._assertElemsAllEqual(
        sorted(list(datasets["test"]), key=get_id),
        sorted([_gen_example(i)[1] for i in range(725)], key=get_id),
    )
    self._assertElemsAllEqual(
        sorted(list(datasets["train"]), key=get_id),
        sorted([_gen_example(i)[1] for i in range(1000)], key=get_id),
    )

    self.assertDictEqual(
        builder.info.metadata, {
            "label_sum_1000": 500,
            "id_mean_1000": 499.5,
            "label_sum_725": 362,
            "id_mean_725": 362.0,
        })
def test_with_batch_size(self):
  items = list(dataset_utils.as_numpy(self.builder.as_dataset(
      split="train+test", batch_size=10)))
  # 3 batches of 10
  self.assertEqual(3, len(items))
  x1, x2, x3 = items[0]["x"], items[1]["x"], items[2]["x"]
  self.assertEqual(10, x1.shape[0])
  self.assertEqual(10, x2.shape[0])
  self.assertEqual(10, x3.shape[0])
  self.assertEqual(sum(range(30)), int(x1.sum() + x2.sum() + x3.sum()))

  # By default batch_size is None and won't add a batch dimension
  ds = self.builder.as_dataset(split=splits_lib.Split.TRAIN)
  self.assertEqual(0, len(tf.compat.v1.data.get_output_shapes(ds)["x"]))
  # Setting batch_size=1 will add an extra batch dimension
  ds = self.builder.as_dataset(split=splits_lib.Split.TRAIN, batch_size=1)
  self.assertEqual(1, len(tf.compat.v1.data.get_output_shapes(ds)["x"]))
  # Setting batch_size=2 will add an extra batch dimension
  ds = self.builder.as_dataset(split=splits_lib.Split.TRAIN, batch_size=2)
  self.assertEqual(1, len(tf.compat.v1.data.get_output_shapes(ds)["x"]))
def _assertAsDataset(self, builder):
  split_to_checksums = {}  # {"split": set(examples_checksums)}
  for split_name, expected_examples_number in self.SPLITS.items():
    dataset = builder.as_dataset(split=split_name)
    compare_shapes_and_types(
        builder.info.features.get_tensor_info(),
        dataset.output_types,
        dataset.output_shapes)
    examples = list(dataset_utils.as_numpy(
        builder.as_dataset(split=split_name)))
    split_to_checksums[split_name] = set(checksum(rec) for rec in examples)
    self.assertLen(examples, expected_examples_number)
  for (split1, hashes1), (split2, hashes2) in itertools.combinations(
      split_to_checksums.items(), 2):
    if (split1 in self.OVERLAPPING_SPLITS or
        split2 in self.OVERLAPPING_SPLITS):
      continue
    self.assertFalse(
        hashes1.intersection(hashes2),
        ("Splits '%s' and '%s' are overlapping. Are you sure you want to "
         "have the same objects in those splits? If yes, add one of "
         "them to the OVERLAPPING_SPLITS class attribute.") % (split1, split2))
def test_beam_datasets(
    tmp_path: pathlib.Path,
    dataset_cls: dataset_builder.GeneratorBasedBuilder,
    make_dl_config: Callable[[], download.DownloadConfig],
):
  dataset_name = dataset_cls.name

  builder = dataset_cls(data_dir=tmp_path)
  builder.download_and_prepare(download_config=make_dl_config())

  data_path = tmp_path / dataset_name / '1.0.0'
  assert data_path.exists()  # Dataset has been generated

  # Check number of shards/generated files
  _test_shards(
      data_path,
      pattern='%s-test.tfrecord-{:05}-of-{:05}' % dataset_name,
      # Liquid sharding is not guaranteed to always use the same number.
      num_shards=builder.info.splits['test'].num_shards,
  )
  _test_shards(
      data_path,
      pattern='%s-train.tfrecord-{:05}-of-{:05}' % dataset_name,
      num_shards=1,
  )

  ds = dataset_utils.as_numpy(builder.as_dataset())

  def get_id(ex):
    return ex['id']

  _assert_values_equal(
      sorted(list(ds['test']), key=get_id),
      sorted([_gen_example(i)[1] for i in range(725)], key=get_id),
  )
  _assert_values_equal(
      sorted(list(ds['train']), key=get_id),
      sorted([_gen_example(i)[1] for i in range(1000)], key=get_id),
  )

  assert builder.info.metadata == builder.EXPECTED_METADATA
def test_determinism(self):
  ds = self.builder.as_dataset(
      split=splits_lib.Split.TRAIN, shuffle_files=False)
  ds_values = list(dataset_utils.as_numpy(ds))

  # Ensure determinism. If this test fails, it means that the numpy random
  # module isn't always deterministic (e.g. across versions or architectures),
  # and so our datasets aren't guaranteed to be either.
  l = list(range(20))
  np.random.RandomState(42).shuffle(l)
  self.assertEqual(l, [
      0, 17, 15, 1, 8, 5, 11, 3, 18, 16, 13, 2, 9, 19, 4, 12, 7, 10, 14, 6
  ])

  # Ensure determinism. If this test fails, it means the datasets are not
  # deterministically generated.
  self.assertEqual(
      [e["x"] for e in ds_values],
      [6, 16, 19, 12, 14, 18, 5, 13, 15, 4, 10, 17, 0, 8, 3, 1, 9, 7, 11, 2],
  )
def test_ragged_tensors_ds(self):
  def _gen_ragged_tensors():
    # Yield the (flat_values, rowids)
    yield ([0, 1, 2, 3], [0, 0, 0, 2])  # ex0
    yield ([], [])  # ex1
    yield ([4, 5, 6], [0, 1, 1])  # ex2

  ds = tf.data.Dataset.from_generator(
      _gen_ragged_tensors,
      output_types=(tf.int64, tf.int64),
      output_shapes=((None,), (None,)),
  )
  ds = ds.map(tf.RaggedTensor.from_value_rowids)

  rt0, rt1, rt2 = list(dataset_utils.as_numpy(ds))
  self.assertAllEqual(rt0, [
      [0, 1, 2],
      [],
      [3],
  ])
  self.assertAllEqual(rt1, [])
  self.assertAllEqual(rt2, [[4], [5, 6]])
def as_dataframe(
    ds: tf.data.Dataset,
    ds_info: Optional[dataset_info.DatasetInfo] = None,
) -> StyledDataFrame:
  """Convert the dataset into a pandas dataframe.

  Warning: The dataframe will be loaded entirely in memory, so you may
  want to call `tfds.as_dataframe` on a subset of the data instead:

  ```
  df = tfds.as_dataframe(ds.take(10), ds_info)
  ```

  Args:
    ds: `tf.data.Dataset`. The tf.data.Dataset object to convert to a pandas
      dataframe. Examples should not be batched. The full dataset will be
      loaded.
    ds_info: Dataset info object. If given, helps improve the formatting.
      Available either through `tfds.load('mnist', with_info=True)` or
      `tfds.builder('mnist').info`

  Returns:
    dataframe: The `pandas.DataFrame` object
  """
  # Raise a clean error message if pandas isn't installed.
  lazy_imports_lib.lazy_imports.pandas  # pylint: disable=pointless-statement
  # Pack `as_supervised=True` datasets
  if ds_info:
    ds = dataset_info.pack_as_supervised_ds(ds, ds_info)
  # Flatten the keys names, specs,... while keeping the feature key definition
  # order
  columns = _make_columns(ds.element_spec, ds_info=ds_info)
  rows = [_make_row_dict(ex, columns) for ex in dataset_utils.as_numpy(ds)]
  df = StyledDataFrame(rows)
  df.current_style.format(
      {c.name: c.format_fn for c in columns if c.format_fn})
  return df
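# Illustrative usage of the conversion above (a hedged sketch, not from the
# source): the function is exposed publicly as `tfds.as_dataframe`. 'mnist' is
# only an example dataset name, and pandas must be installed for this to run.
import tensorflow_datasets as tfds

ds, ds_info = tfds.load('mnist', split='train', with_info=True)
df = tfds.as_dataframe(ds.take(4), ds_info)
print(df.shape)  # 4 rows, one column per feature key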
def _assertAsDataset(self, builder): """Check the label distribution for each split.""" super(Ucf101Test, self)._assertAsDataset(builder) label_frequncies = {} label_feature = builder.info.features['label'] dataset = builder.as_dataset() for split_name in Ucf101Test.SPLITS: label_frequncies[split_name] = collections.defaultdict(int) for features in dataset_utils.as_numpy(dataset[split_name]): label_name = label_feature.int2str(features['label']) label_frequncies[split_name][label_name] += 1 self.assertEqual( dict(label_frequncies), { 'test': { 'Archery': 1, 'Nunchucks': 1 }, 'train': { 'Archery': 1, 'Nunchucks': 2 } })
def _assertAsDataset(self, builder): """Check the label distribution. This checks that labels get correctly converted between the synset ids and integers. Args: builder: The ImagenetR dataset builder. """ super()._assertAsDataset(builder) label_frequncies = collections.Counter() label_feature = builder.info.features['label'] dataset = builder.as_dataset() filenames = [] for features in dataset_utils.as_numpy(dataset['test']): label_frequncies.update([label_feature.int2str(features['label'])]) filenames.append(features['file_name']) self.assertEqual(dict(label_frequncies), {'n01443537': 2, 'n01484850': 3, 'n12267677': 5}) self.assertIn(b'n01443537/1.jpeg', filenames)
def test_with_configs(self):
  with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
    builder1 = DummyDatasetWithConfigs(config="plus1", data_dir=tmp_dir)
    builder2 = DummyDatasetWithConfigs(config="plus2", data_dir=tmp_dir)
    # Test that builder.builder_config is the correct config
    self.assertIs(builder1.builder_config,
                  DummyDatasetWithConfigs.builder_configs["plus1"])
    self.assertIs(builder2.builder_config,
                  DummyDatasetWithConfigs.builder_configs["plus2"])
    builder1.download_and_prepare()
    builder2.download_and_prepare()
    data_dir1 = os.path.join(tmp_dir, builder1.name, "plus1", "0.0.1")
    data_dir2 = os.path.join(tmp_dir, builder2.name, "plus2", "0.0.2")
    # Test that subdirectories were created per config
    self.assertTrue(tf.io.gfile.exists(data_dir1))
    self.assertTrue(tf.io.gfile.exists(data_dir2))
    # 1 train shard, 1 test shard, plus metadata files
    self.assertGreater(len(tf.io.gfile.listdir(data_dir1)), 2)
    self.assertGreater(len(tf.io.gfile.listdir(data_dir2)), 2)

    # Test that the config was used and they didn't collide.
    splits_list = ["train", "test"]
    for builder, incr in [(builder1, 1), (builder2, 2)]:
      train_data, test_data = [  # pylint: disable=g-complex-comprehension
          [el["x"] for el in
           dataset_utils.as_numpy(builder.as_dataset(split=split))]
          for split in splits_list
      ]

      self.assertEqual(20, len(train_data))
      self.assertEqual(10, len(test_data))
      self.assertCountEqual([incr + el for el in range(30)],
                            train_data + test_data)
def _assertAsDataset(self, builder):
  split_to_checksums = {}  # {"split": set(examples_checksums)}
  for split_name, expected_examples_number in self.SPLITS.items():
    ds = builder.as_dataset(split=split_name)
    spec = tf.data.DatasetSpec.from_value(ds)
    compare_shapes_and_types(
        builder.info.features.get_tensor_info(),
        # We use _element_spec because element_spec was added in TF2.5+.
        element_spec=spec._element_spec,  # pylint: disable=protected-access
    )
    examples = list(
        dataset_utils.as_numpy(builder.as_dataset(split=split_name)))
    split_to_checksums[split_name] = set(checksum(rec) for rec in examples)
    self.assertLen(examples, expected_examples_number)
  for (split1, hashes1), (split2, hashes2) in itertools.combinations(
      split_to_checksums.items(), 2):
    if (split1 in self.OVERLAPPING_SPLITS or
        split2 in self.OVERLAPPING_SPLITS):
      continue
    self.assertFalse(
        hashes1.intersection(hashes2),
        ("Splits '%s' and '%s' are overlapping. Are you sure you want to "
         "have the same objects in those splits? If yes, add one of "
         "them to the OVERLAPPING_SPLITS class attribute.") % (split1, split2))
def test_label(self):
  self.assertFeatureEagerOnly(
      feature=feature_lib.Dataset(
          {
              'label': feature_lib.ClassLabel(names=['left', 'right']),
          },
          length=None),
      shape={'label': ()},
      dtype={'label': tf.int64},
      serialized_info={
          'label': feature_lib.TensorInfo(shape=(None,), dtype=tf.int64),
      },
      tests=[
          testing.FeatureExpectationItem(
              value=[{
                  'label': 'right'
              }, {
                  'label': 'left'
              }, {
                  'label': 'left'
              }],
              expected=tf.data.Dataset.from_tensor_slices(
                  {'label': [1, 0, 0]}),
          ),
          # Variable sequence length
          testing.FeatureExpectationItem(
              value=dataset_utils.as_numpy(
                  tf.data.Dataset.from_tensor_slices(
                      {'label': ['right', 'left', 'right', 'left']})),
              expected=tf.data.Dataset.from_tensor_slices(
                  {'label': [1, 0, 1, 0]}),
          ),
      ],
      test_attributes=dict(_length=None))
def test_shared_generator(self):
  with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
    builder = DummyDatasetSharedGenerator(data_dir=tmp_dir)
    builder.download_and_prepare()

    written_filepaths = [
        os.path.join(builder._data_dir, fname)
        for fname in tf.io.gfile.listdir(builder._data_dir)
    ]
    # The data_dir contains the cached directory by default
    expected_filepaths = builder._build_split_filenames(
        split_info_list=builder.info.splits.values())
    expected_filepaths.append(
        os.path.join(builder._data_dir, "dataset_info.json"))
    self.assertEqual(sorted(expected_filepaths), sorted(written_filepaths))

    splits_list = [splits_lib.Split.TRAIN, splits_lib.Split.TEST]
    train_data, test_data = [
        [el["x"] for el in
         dataset_utils.as_numpy(builder.as_dataset(split=split))]
        for split in splits_list
    ]

    self.assertEqual(20, len(train_data))
    self.assertEqual(10, len(test_data))
    self.assertEqual(list(range(30)), sorted(train_data + test_data))

    # Builder's info should also have the above information.
    self.assertTrue(builder.info.initialized)
    self.assertEqual(
        20, builder.info.splits[splits_lib.Split.TRAIN].num_examples)
    self.assertEqual(
        10, builder.info.splits[splits_lib.Split.TEST].num_examples)
    self.assertEqual(30, builder.info.splits.total_num_examples)
def test_with_graph(self):
  with tf.Graph().as_default():
    with tf.Graph().as_default() as g:
      ds = _create_dataset(range(10))
      np_ds = dataset_utils.as_numpy(ds, graph=g)
      self.assertEqual(list(range(10)), [int(el) for el in list(np_ds)])
def test_supervised_keys(self):
  x, _ = dataset_utils.as_numpy(
      self.builder.as_dataset(
          split=splits_lib.Split.TRAIN, as_supervised=True, batch_size=-1))
  self.assertEqual(x.shape[0], 20)
def _build_single_dataset(self, split, shuffle_files, batch_size, decoders,
                          as_supervised, in_memory):
  """as_dataset for a single split."""
  if isinstance(split, six.string_types):
    split = splits_lib.Split(split)

  wants_full_dataset = batch_size == -1
  if wants_full_dataset:
    batch_size = self.info.splits.total_num_examples or sys.maxsize

  # If the dataset is small, load it in memory
  dataset_shape_is_fully_defined = (
      dataset_utils.features_shape_is_fully_defined(self.info.features))
  in_memory_default = False
  # TODO(tfds): Consider default in_memory=True for small datasets with
  # fully-defined shape.
  # Expose and use the actual data size on disk and rm the manual
  # name guards. size_in_bytes is the download size, which is misleading,
  # particularly for datasets that use manual_dir as well as some downloads
  # (wmt and diabetic_retinopathy_detection).
  # in_memory_default = (
  #     self.info.size_in_bytes and
  #     self.info.size_in_bytes <= 1e9 and
  #     not self.name.startswith("wmt") and
  #     not self.name.startswith("diabetic") and
  #     dataset_shape_is_fully_defined)
  in_memory = in_memory_default if in_memory is None else in_memory

  # Build base dataset
  if in_memory and not wants_full_dataset:
    # TODO(tfds): Enable in_memory without padding features. May be able
    # to do by using a requested version of tf.data.Dataset.cache that can
    # persist a cache beyond iterator instances.
    if not dataset_shape_is_fully_defined:
      logging.warning("Called in_memory=True on a dataset that does not "
                      "have fully defined shapes. Note that features with "
                      "variable length dimensions will be 0-padded to "
                      "the maximum length across the dataset.")
    full_bs = self.info.splits.total_num_examples or sys.maxsize
    # If using in_memory, escape all device contexts so we can load the data
    # with a local Session.
    with tf.device(None):
      dataset = self._as_dataset(
          split=split, shuffle_files=shuffle_files, decoders=decoders)
      # Use padded_batch so that features with unknown shape are supported.
      dataset = dataset.padded_batch(
          full_bs, tf.compat.v1.data.get_output_shapes(dataset))
      dataset = tf.data.Dataset.from_tensor_slices(
          next(dataset_utils.as_numpy(dataset)))
  else:
    dataset = self._as_dataset(
        split=split, shuffle_files=shuffle_files, decoders=decoders)

  if batch_size:
    # Use padded_batch so that features with unknown shape are supported.
    dataset = dataset.padded_batch(
        batch_size, tf.compat.v1.data.get_output_shapes(dataset))

  if as_supervised:
    if not self.info.supervised_keys:
      raise ValueError(
          "as_supervised=True but %s does not support a supervised "
          "(input, label) structure." % self.name)
    input_f, target_f = self.info.supervised_keys
    dataset = dataset.map(
        lambda fs: (fs[input_f], fs[target_f]),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

  dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

  # If shuffling, allow pipeline to be non-deterministic
  options = tf.data.Options()
  options.experimental_deterministic = not shuffle_files
  dataset = dataset.with_options(options)

  if wants_full_dataset:
    return tf.data.experimental.get_single_element(dataset)
  return dataset
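# Hedged sketch of the batch_size=-1 path implemented above: the whole split comes
# back as a single element, which `tfds.as_numpy` turns into NumPy arrays.
# 'mnist' is only an illustrative dataset name.
import tensorflow_datasets as tfds

images, labels = tfds.as_numpy(
    tfds.load('mnist', split='test', batch_size=-1, as_supervised=True))
print(images.shape)  # (10000, 28, 28, 1) for the MNIST test split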
def get_dataset_feature_statistics(builder, split):
  """Calculate statistics for the specified split."""
  statistics = statistics_pb2.DatasetFeatureStatistics()

  # Make this to the best of our abilities.
  schema = schema_pb2.Schema()

  dataset = builder.as_dataset(split=split)

  # Just computing the number of examples for now.
  statistics.num_examples = 0

  # Feature dictionaries.
  feature_to_num_examples = collections.defaultdict(int)
  feature_to_min = {}
  feature_to_max = {}

  np_dataset = dataset_utils.as_numpy(dataset)
  for example in utils.tqdm(np_dataset, unit=" examples", leave=False):
    statistics.num_examples += 1

    assert isinstance(example, dict)

    feature_names = sorted(example.keys())
    for feature_name in feature_names:

      # Update the number of examples this feature appears in.
      feature_to_num_examples[feature_name] += 1

      feature_np = example[feature_name]

      # For compatibility in graph and eager mode, we can get PODs here and
      # everything may not be neatly wrapped up in numpy's ndarray.
      feature_dtype = type(feature_np)

      if isinstance(feature_np, np.ndarray):
        # If we have an empty array, then don't proceed further with computing
        # statistics on it.
        if feature_np.size == 0:
          continue

        feature_dtype = feature_np.dtype.type

      feature_min, feature_max = None, None
      is_numeric = (np.issubdtype(feature_dtype, np.number) or
                    feature_dtype == np.bool_)
      if is_numeric:
        feature_min = np.min(feature_np)
        feature_max = np.max(feature_np)

      # TODO(afrozm): What if shapes don't match? Populate ValueCount? Add
      # logic for that.

      # Set or update the min, max.
      if is_numeric:
        if ((feature_name not in feature_to_min) or
            (feature_to_min[feature_name] > feature_min)):
          feature_to_min[feature_name] = feature_min

        if ((feature_name not in feature_to_max) or
            (feature_to_max[feature_name] < feature_max)):
          feature_to_max[feature_name] = feature_max

  # Start here, we've processed all examples.

  output_shapes_dict = dataset.output_shapes
  output_types_dict = dataset.output_types

  for feature_name in sorted(feature_to_num_examples.keys()):
    # Try to fill in the schema.
    feature = schema.feature.add()
    feature.name = feature_name

    # TODO(afrozm): Make this work with nested structures, currently the
    # Schema proto has no support for it.
    maybe_feature_shape = output_shapes_dict[feature_name]
    if not isinstance(maybe_feature_shape, tf.TensorShape):
      logging.error(
          "Statistics generation doesn't work for nested structures yet")
      continue

    for dim in maybe_feature_shape.as_list():
      # We denote `None`s as -1 in the shape proto.
      feature.shape.dim.add().size = dim if dim else -1
    feature_type = output_types_dict[feature_name]
    feature.type = _FEATURE_TYPE_MAP.get(feature_type, schema_pb2.BYTES)

    common_statistics = statistics_pb2.CommonStatistics()
    common_statistics.num_non_missing = feature_to_num_examples[feature_name]
    common_statistics.num_missing = (
        statistics.num_examples - common_statistics.num_non_missing)

    feature_name_statistics = statistics.features.add()
    feature_name_statistics.name = feature_name

    # TODO(afrozm): This can be skipped, since type information was added to
    # the Schema.
    feature_name_statistics.type = _SCHEMA_TYPE_MAP.get(
        feature.type, statistics_pb2.FeatureNameStatistics.BYTES)

    if feature.type == schema_pb2.INT or feature.type == schema_pb2.FLOAT:
      numeric_statistics = statistics_pb2.NumericStatistics()
      # Uses `.get` as a Sequence(int) containing only empty arrays won't
      # contain any value.
      numeric_statistics.min = feature_to_min.get(feature_name, 0)
      numeric_statistics.max = feature_to_max.get(feature_name, 0)
      numeric_statistics.common_stats.CopyFrom(common_statistics)
      feature_name_statistics.num_stats.CopyFrom(numeric_statistics)
    else:
      # Let's shove it into BytesStatistics for now.
      bytes_statistics = statistics_pb2.BytesStatistics()
      bytes_statistics.common_stats.CopyFrom(common_statistics)
      feature_name_statistics.bytes_stats.CopyFrom(bytes_statistics)

  return statistics, schema
def test_in_memory(self):
  train_data = dataset_utils.as_numpy(
      self.builder.as_dataset(split="train", in_memory=True))
  train_data = [el for el in train_data]
  self.assertEqual(20, len(train_data))