def testAdditionalOperationsAfterReadBack(self):
  self.setUpTFRecord()
  filenames = self.test_filenames

  expected = [
      b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
      for f in range(0, 10)
      for r in range(0, 10)
  ]

  tmpdir = self.makeSnapshotDirectory()
  dataset = core_readers._TFRecordDataset(filenames)
  dataset = dataset.apply(snapshot.snapshot(tmpdir))
  self.assertDatasetProduces(dataset, expected)

  # remove the original files and try to read the data back only from snapshot
  self.removeTFRecords()

  dataset2 = core_readers._TFRecordDataset(filenames)
  dataset2 = dataset2.apply(snapshot.snapshot(tmpdir))
  self.assertDatasetProduces(dataset2, expected)

  expected_after = [
      b"cord %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
      for f in range(0, 10)
      for r in range(0, 10)
  ]

  dataset3 = core_readers._TFRecordDataset(filenames)
  dataset3 = dataset3.apply(snapshot.snapshot(tmpdir))
  dataset3 = dataset3.map(lambda x: string_ops.substr_v2(x, 2, 1000))
  self.assertDatasetProduces(dataset3, expected_after)
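# The snapshot tests in this excerpt rely on fixture helpers from the
# surrounding test base class (setUpTFRecord(), removeTFRecords(),
# self.test_filenames), which are not shown here. A minimal sketch of the kind
# of fixture they assume -- num_files TFRecord files, each holding num_records
# records of the form b"Record %d of file %d" -- might look like the
# hypothetical helper below; it is not the actual base-class implementation.
def _write_tfrecord_fixture(base_dir, num_files=10, num_records=10):
  """Writes TFRecord files whose records match b"Record %d of file %d"."""
  import os

  import tensorflow as tf

  filenames = []
  for f in range(num_files):
    fn = os.path.join(base_dir, "tf_record.%d.txt" % f)
    filenames.append(fn)
    with tf.io.TFRecordWriter(fn) as writer:
      for r in range(num_records):
        writer.write(b"Record %d of file %d" % (r, f))
  return filenames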
def testReadShuffledSnapshotWithSeedAfterWrite(self):
  self.setUpTFRecord(num_files=10, num_records=50)
  filenames = self._filenames

  expected = [
      b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
      for f in range(0, 10)
      for r in range(0, 50)
  ]

  tmpdir = self.snapshot_dir
  dataset = core_readers._TFRecordDataset(filenames)
  dataset = dataset.apply(
      snapshot.legacy_snapshot(tmpdir, shard_size_bytes=10))
  self.assertDatasetProduces(dataset, expected)

  # remove the original files and try to read the data back only from snapshot
  self.removeTFRecords()

  dataset2 = core_readers._TFRecordDataset(filenames)
  dataset2 = dataset2.apply(
      snapshot.legacy_snapshot(
          tmpdir,
          shard_size_bytes=10,
          shuffle_on_read=True,
          shuffle_seed=123456))
  next2 = self.getNext(dataset2)

  dataset3 = core_readers._TFRecordDataset(filenames)
  dataset3 = dataset3.apply(
      snapshot.legacy_snapshot(
          tmpdir,
          shard_size_bytes=10,
          shuffle_on_read=True,
          shuffle_seed=123456))
  next3 = self.getNext(dataset3)

  # make sure that the items are read back in the same order for both datasets
  for _ in range(500):
    res2 = self.evaluate(next2())
    res3 = self.evaluate(next3())
    self.assertEqual(res2, res3)
def testTFRecordReaderWithDirectFileNames(self):
  # Using `_TFRecordDataset` creates a raw op rather than wrapping it around
  # a flat_map automatically.
  dataset = core_readers._TFRecordDataset(self._filenames)
  dataset = distribute._AutoShardDataset(dataset, 5, 0)

  expected = [
      b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
      for f in range(0, 10)
      for r in (0, 5)
  ]
  self.assertDatasetProduces(dataset, expected)
def testReadSnapshotBackAfterWrite(self):
  self.setUpTFRecord()
  filenames = self.test_filenames

  expected = [
      b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
      for f in range(0, 10)
      for r in range(0, 10)
  ]

  tmpdir = self.makeSnapshotDirectory()
  dataset = core_readers._TFRecordDataset(filenames)
  dataset = dataset.apply(snapshot.snapshot(tmpdir))
  self.assertDatasetProduces(dataset, expected)

  # remove the original files and try to read the data back only from snapshot
  self.removeTFRecords()

  dataset2 = core_readers._TFRecordDataset(filenames)
  dataset2 = dataset2.apply(snapshot.snapshot(tmpdir))
  self.assertDatasetProduces(dataset2, expected)
def testReadShuffledSnapshotAfterWrite(self):
  self.setUpTFRecord(num_files=10, num_records=50)
  filenames = self.test_filenames

  expected = [
      b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
      for f in range(0, 10)
      for r in range(0, 50)
  ]

  tmpdir = self.snapshot_dir
  dataset = core_readers._TFRecordDataset(filenames)
  dataset = dataset.apply(
      snapshot.legacy_snapshot(tmpdir, shard_size_bytes=100))
  self.assertDatasetProduces(dataset, expected)

  # remove the original files and try to read the data back only from snapshot
  self.removeTFRecords()

  dataset2 = core_readers._TFRecordDataset(filenames)
  dataset2 = dataset2.apply(
      snapshot.legacy_snapshot(
          tmpdir, shard_size_bytes=100, shuffle_on_read=True))
  next2 = self.getNext(dataset2)

  res1 = self.evaluate(next2())
  res2 = self.evaluate(next2())
  res3 = self.evaluate(next2())
  res4 = self.evaluate(next2())
  res5 = self.evaluate(next2())

  # make sure that we don't read the file back in the same order.
  self.assertNotEqual([res1, res2, res3, res4, res5], expected[0:5])

  # make sure all the elements are still there
  dataset3 = core_readers._TFRecordDataset(filenames)
  dataset3 = dataset3.apply(
      snapshot.legacy_snapshot(
          tmpdir, shard_size_bytes=100, shuffle_on_read=True))
  self.assertDatasetProduces(dataset3, expected, assert_items_equal=True)
def testTFRecordReaderWithDirectFileNames(self):
  # Using `_TFRecordDataset` creates a raw op rather than wrapping it around
  # a flat_map automatically.
  dataset = core_readers._TFRecordDataset(self.test_filenames)
  dataset = distribute._AutoShardDataset(dataset, 5, 0)

  expected = [
      b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
      for f in range(0, 10)
      for r in (0, 5)
  ]
  self.assertDatasetProduces(dataset, expected)
def testReadSnapshotDatasetDefault(self):
  self.createTFRecords()
  filenames = self._test_filenames
  expected = [
      b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
      for f in range(0, 10)
      for r in range(0, 100)
  ]

  dataset = core_readers._TFRecordDataset(filenames)
  dataset = dataset.apply(snapshot.snapshot(self._snapshot_dir))
  self.assertDatasetProduces(dataset, expected)
  self.assertSnapshotDirectoryContains(
      self._snapshot_dir,
      num_fingerprints=1,
      num_runs_per_fingerprint=1,
      num_snapshot_shards_per_run=multiprocessing.cpu_count())

  self.removeTFRecords()
  dataset2 = core_readers._TFRecordDataset(filenames)
  dataset2 = dataset2.apply(snapshot.snapshot(self._snapshot_dir))
  self.assertDatasetProduces(dataset2, expected)
def testTFRecordReaderWithDirectFileNamesAndShapes(self):
  # Using `_TFRecordDataset` creates a raw op rather than wrapping it around
  # a flat_map automatically.
  dataset = core_readers._TFRecordDataset(self._filenames)

  # BatchDataset contains `output_types` and `output_shapes`
  dataset = dataset.batch(5)
  dataset = distribute._AutoShardDataset(dataset, 2, 0)

  expected = [
      b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
      for f in range(0, 10)
      for r in range(0, 5)
  ]
  self.assertDatasetProduces(dataset, list(chunk(expected, 5)))
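# The `chunk` helper used above is defined elsewhere in the test file and is
# not shown in this excerpt. Assuming it simply splits a flat list into
# consecutive pieces of size n (so the expected records can be compared
# against the batched dataset), a minimal sketch is:
def chunk(l, n):
  """Yields successive n-sized slices of l."""
  for i in range(0, len(l), n):
    yield l[i:i + n]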
def testReadSnapshotDatasetCustomShardFn(self):
  self.createTFRecords()
  filenames = self._test_filenames
  expected = [
      b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
      for f in range(0, 10)
      for r in range(0, 100)
  ]

  dataset = core_readers._TFRecordDataset(filenames)
  dataset = dataset.apply(
      snapshot.snapshot(self._snapshot_dir, shard_func=lambda _: np.int64(0)))
  self.assertDatasetProduces(dataset, expected)
  self.assertSnapshotDirectoryContains(
      self._snapshot_dir,
      num_fingerprints=1,
      num_runs_per_fingerprint=1,
      num_snapshot_shards_per_run=1)

  self.removeTFRecords()
  dataset2 = core_readers._TFRecordDataset(filenames)
  dataset2 = dataset2.apply(
      snapshot.snapshot(self._snapshot_dir, shard_func=lambda _: 0))
  self.assertDatasetProduces(dataset2, expected)
def testAutoshardPolicyOff(self):
  options = options_lib.Options()
  options.experimental_distribute.auto_shard_policy = (
      options_lib.AutoShardPolicy.OFF)

  dataset = core_readers._TFRecordDataset(self._filenames)
  dataset = dataset.with_options(options)
  dataset = distribute._AutoShardDataset(dataset, 5, 0)

  # Should return every record in every file since autosharding is turned off.
  expected = [
      b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
      for f in range(0, 10)
      for r in range(0, 10)
  ]
  self.assertDatasetProduces(dataset, expected)
def testTFRecordReaderWithDirectFileNamesAndShapes(self):
  # Using `_TFRecordDataset` creates a raw op rather than wrapping it around
  # a flat_map automatically.
  dataset = core_readers._TFRecordDataset(self.test_filenames)

  # BatchDataset contains `output_types` and `output_shapes`
  dataset = dataset.batch(5)
  dataset = distribute._AutoShardDataset(dataset, 2, 0)

  expected = [
      b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
      for f in range(0, 10)
      for r in range(0, 5)
  ]
  self.assertDatasetProduces(dataset, list(chunk(expected, 5)))
def testReadSnapshotDatasetCustomReaderFn(self):
  self.createTFRecords()
  filenames = self._test_filenames
  expected = [
      b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
      for f in range(0, 10)
      for r in range(0, 100)
  ]

  dataset = core_readers._TFRecordDataset(filenames)
  dataset = dataset.apply(
      snapshot.snapshot(
          self._snapshot_dir,
          reader_func=(
              lambda ds: ds.interleave(  # pylint:disable=g-long-lambda
                  lambda x: x,
                  cycle_length=4,
                  num_parallel_calls=4))))
  self.assertDatasetProduces(dataset, expected)
  self.assertSnapshotDirectoryContains(
      self._snapshot_dir,
      num_fingerprints=1,
      num_runs_per_fingerprint=1,
      num_snapshot_shards_per_run=multiprocessing.cpu_count())

  self.removeTFRecords()
  dataset2 = core_readers._TFRecordDataset(filenames)
  dataset2 = dataset2.apply(
      snapshot.snapshot(
          self._snapshot_dir,
          reader_func=(
              lambda ds: ds.interleave(  # pylint:disable=g-long-lambda
                  lambda x: x,
                  cycle_length=4,
                  num_parallel_calls=4))))
  self.assertDatasetProducesSet(dataset2, expected)
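# For reference, the reader_func hook exercised above is also exposed through
# the public tf.data API in TF 2.x as tf.data.Dataset.snapshot. The helper
# below is a hypothetical usage sketch, not part of the test suite; the
# filenames and snapshot path are caller-supplied.
def _snapshot_with_public_api(filenames, snapshot_path):
  """Sketch: write/read a snapshot of a TFRecord dataset via the public API."""
  import tensorflow as tf

  dataset = tf.data.TFRecordDataset(filenames)
  return dataset.snapshot(
      snapshot_path,
      reader_func=lambda ds: ds.interleave(
          lambda x: x, cycle_length=4, num_parallel_calls=4))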
def testWorkersGreaterThanNumFilesWithDataSharding(self):
  options = options_lib.Options()
  options.experimental_distribute.auto_shard_policy = (
      options_lib.AutoShardPolicy.DATA)

  dataset = core_readers._TFRecordDataset(self._filenames)
  dataset = dataset.with_options(options)
  dataset = distribute._AutoShardDataset(dataset, 5, 0)

  # Should return records (0, 5) of files 0 through 9: since we are sharding
  # by individual elements, we should get some data from every file.
  expected = [
      b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
      for f in range(0, 10)
      for r in (0, 5)
  ]
  self.assertDatasetProduces(dataset, expected)