def testCompressionOptions(self):
  """Create record with mix of random and repeated data to test compression on."""
  # Fixed seed keeps the generated data -- and therefore the compressed
  # sizes being compared -- stable across runs.
  rnd = random.Random(123)
  random_record = compat.as_bytes("".join(
      rnd.choice(string.digits) for _ in range(10000)))
  repeated_record = compat.as_bytes(_TEXT)
  for _ in range(10000):
    start_i = rnd.randint(0, len(_TEXT))
    length = rnd.randint(10, 200)
    repeated_record += _TEXT[start_i:start_i + length]
  records = [random_record, repeated_record, random_record]

  # Each entry: (TFRecordOptions kwarg, value, expected sign of the size
  # delta relative to the default ZLIB options).
  tests = [
      ("compression_level", 2, -1),  # Lower compression is worse.
      ("compression_level", 6, 0),  # Default compression_level is equal.
      ("flush_mode", zlib.Z_FULL_FLUSH, 1),  # A few less bytes.
      ("flush_mode", zlib.Z_NO_FLUSH, 0),  # NO_FLUSH is the default.
      ("input_buffer_size", 4096, 0),  # Increases time not size.
      ("output_buffer_size", 4096, 0),  # Increases time not size.
      ("window_bits", 8, -1),  # Smaller than default window increases size.
      ("compression_strategy", zlib.Z_HUFFMAN_ONLY, -1),  # Worse.
      ("compression_strategy", zlib.Z_FILTERED, -1),  # Worse.
  ]

  compression_type = tf_record.TFRecordCompressionType.ZLIB
  options_a = tf_record.TFRecordOptions(compression_type)
  for prop, value, delta_sign in tests:
    options_b = tf_record.TFRecordOptions(
        compression_type=compression_type, **{prop: value})
    delta = self._CompressionSizeDelta(records, options_a, options_b)
    # delta // delta_sign > 0 holds exactly when delta has the same sign
    # as delta_sign (and is nonzero).
    self.assertTrue(
        delta == 0 if delta_sign == 0 else delta // delta_sign > 0,
        "Setting {} = {}, file was {} smaller didn't match sign of {}".format(
            prop, value, delta, delta_sign))
def testReadGzipFiles(self):
  """TFRecordReader with GZIP options reads files gzipped by hand."""
  plain_files = self._CreateFiles()
  gzip_files = []
  for idx, src_path in enumerate(plain_files):
    gz_path = os.path.join(self.get_temp_dir(), "tfrecord_%s.gz" % idx)
    with open(src_path, "rb") as src:
      raw = src.read()
    with gzip.GzipFile(gz_path, "wb") as dst:
      dst.write(raw)
    gzip_files.append(gz_path)

  with self.test_session() as sess:
    options = tf_record.TFRecordOptions(TFRecordCompressionType.GZIP)
    reader = io_ops.TFRecordReader(name="test_reader", options=options)
    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
    key, value = reader.read(queue)
    queue.enqueue_many([gzip_files]).run()
    queue.close().run()
    for file_idx in range(self._num_files):
      expected_prefix = "%s:" % gzip_files[file_idx]
      for rec_idx in range(self._num_records):
        k, v = sess.run([key, value])
        self.assertTrue(compat.as_text(k).startswith(expected_prefix))
        self.assertAllEqual(self._Record(file_idx, rec_idx), v)
def setUp(self, compression_type=TFRecordCompressionType.NONE):
  """Creates a fresh writer and target file path for each test case.

  Args:
    compression_type: compression applied by the writer; subclasses can
      pass a different value to exercise other codecs.
  """
  super(TFRecordWriterCloseAndFlushTests, self).setUp()
  self._fn = os.path.join(self.get_temp_dir(), "tf_record_writer_test.txt")
  self._options = tf_record.TFRecordOptions(compression_type)
  self._writer = tf_record.TFRecordWriter(self._fn, self._options)
  self._num_records = 20
def testZLibFlushRecord(self):
  """Verify ZLIB-compressed reads tolerate extra flush/trailing blocks."""
  fn = self._WriteRecordsToFile([b"small record"], "small_record")
  with open(fn, "rb") as h:
    buff = h.read()

  # Recompress the file, appending several Z_FULL_FLUSH blocks; creating
  # more blocks and trailing blocks shouldn't break reads.
  compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS)
  # Compress the whole buffer in one call: the deflate stream is identical
  # to feeding it byte-by-byte, without the quadratic bytes concatenation
  # (and py2-era six.int2byte shim) of the old loop.
  parts = [compressor.compress(buff)]
  for _ in range(3):
    parts.append(compressor.flush(zlib.Z_FULL_FLUSH))
  parts.append(compressor.flush(zlib.Z_FINISH))
  output = b"".join(parts)

  # Overwrite the original file with the compressed data.
  with open(fn, "wb") as h:
    h.write(output)

  with self.test_session() as sess:
    options = tf_record.TFRecordOptions(
        compression_type=TFRecordCompressionType.ZLIB)
    reader = io_ops.TFRecordReader(name="test_reader", options=options)
    queue = data_flow_ops.FIFOQueue(1, [dtypes.string], shapes=())
    key, value = reader.read(queue)
    queue.enqueue(fn).run()
    queue.close().run()
    k, v = sess.run([key, value])
    self.assertTrue(compat.as_text(k).startswith("%s:" % fn))
    self.assertAllEqual(b"small record", v)
def save_rows_to_tf_record_file(rows, make_sequence_example_fn,
                                sessions_df_length, export_filename,
                                content_article_embeddings=None,
                                num_of_articles_in_sub_group=None):
  """Serializes each row as a SequenceExample into a GZIP TFRecord file.

  Prints a progress line and per-row build time while writing; the writer
  is always closed, even if serialization fails mid-stream.
  """
  options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.GZIP)
  writer = tf_record.TFRecordWriter(export_filename, options=options)
  try:
    for counter, row in enumerate(rows, start=1):
      started_at = time.time()
      print(f"{counter}/{sessions_df_length}")
      seq_example = make_sequence_example_fn(
          row, num_of_articles_in_sub_group, content_article_embeddings)
      print(time.time() - started_at)
      writer.write(seq_example.SerializeToString())
  finally:
    writer.close()
    sys.stdout.flush()
def testNoCompressionType(self):
  """No-arg and empty-string options map to the empty type string; bad values raise."""
  get_type = tf_record.TFRecordOptions.get_compression_type_string
  self.assertEqual("", get_type(tf_record.TFRecordOptions()))
  self.assertEqual("", get_type(tf_record.TFRecordOptions("")))
  # Non-string and unknown compression names are rejected.
  for bad_value in (5, "BZ2"):
    with self.assertRaises(ValueError):
      tf_record.TFRecordOptions(bad_value)
def testZLibFlushRecord(self):
  """test ZLib Flush Record"""
  original = [b"small record"]
  fn = self._WriteRecordsToFile(original, "small_record")
  with open(fn, "rb") as h:
    buff = h.read()

  # Recompress the file, appending several Z_FULL_FLUSH blocks; creating
  # more blocks and trailing blocks shouldn't break reads.
  compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS)
  # Compress the whole buffer in one call: the deflate stream is identical
  # to feeding it byte-by-byte, without the quadratic bytes concatenation
  # (and py2-era six.int2byte shim) of the old loop.
  parts = [compressor.compress(buff)]
  for _ in range(3):
    parts.append(compressor.flush(zlib.Z_FULL_FLUSH))
  parts.append(compressor.flush(zlib.Z_FINISH))
  output = b"".join(parts)

  # Overwrite the original file with the compressed data.
  with open(fn, "wb") as h:
    h.write(output)

  options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
  actual = list(tf_record.tf_record_iterator(fn, options=options))
  self.assertEqual(actual, original)
def testZlibCompressionType(self):
  """String, enum, and options-instance inputs all normalize to "ZLIB"."""
  zlib_t = tf_record.TFRecordCompressionType.ZLIB
  get_type = tf_record.TFRecordOptions.get_compression_type_string
  # Each accepted spelling of ZLIB must yield the same type string.
  for spec in ("ZLIB", zlib_t, tf_record.TFRecordOptions(zlib_t)):
    self.assertEqual("ZLIB", get_type(tf_record.TFRecordOptions(spec)))
def testGzipReadWrite(self):
  """Verify that files produced are gzip compatible."""
  expected = [b"foo", b"bar"]
  plain_fn = self._WriteRecordsToFile(expected, "gzip_read_write.tfrecord")
  gz_fn = self._GzipCompressFile(plain_fn, "tfrecord.gz")
  gzip_options = tf_record.TFRecordOptions(TFRecordCompressionType.GZIP)
  read_back = [
      r for r in tf_record.tf_record_iterator(gz_fn, options=gzip_options)
  ]
  self.assertEqual(read_back, expected)
def testWriteGZIP(self):
  """Round-trips GZIP records written through writer_fn."""
  gzip_options = tf_record.TFRecordOptions(
      tf_record.TFRecordCompressionType.GZIP)
  source_file = self._createFile(gzip_options)
  self.evaluate(self.writer_fn(source_file, compression_type="GZIP"))
  # Every record read back must match the generated record for its index.
  reader = tf_record.tf_record_iterator(
      self._outputFilename(), options=gzip_options)
  for idx, rec in enumerate(reader):
    self.assertAllEqual(self._record(idx), rec)
def testWriteGzipRead(self):
  """Records written with GZIP options decompress with plain gzip."""
  expected = [b"foo", b"bar"]
  gzip_options = tf_record.TFRecordOptions(TFRecordCompressionType.GZIP)
  compressed_fn = self._WriteRecordsToFile(
      expected, "write_gzip_read.tfrecord.gz", gzip_options)
  plain_fn = self._GzipDecompressFile(compressed_fn,
                                      "write_gzip_read.tfrecord")
  self.assertEqual(list(tf_record.tf_record_iterator(plain_fn)), expected)
def testZlibReadWrite(self):
  """Verify that files produced are zlib compatible."""
  expected = [b"foo", b"bar"]
  plain_fn = self._WriteRecordsToFile(expected, "zlib_read_write.tfrecord")
  compressed_fn = self._ZlibCompressFile(plain_fn,
                                         "zlib_read_write.tfrecord.z")
  # Read the compressed contents back through the record iterator.
  zlib_options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
  read_back = [
      r for r in tf_record.tf_record_iterator(
          compressed_fn, options=zlib_options)
  ]
  self.assertEqual(read_back, expected)
def testWriteZlibRead(self):
  """Verify compression with TFRecordWriter is zlib library compatible."""
  expected = [b"foo", b"bar"]
  zlib_options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
  compressed_fn = self._WriteRecordsToFile(
      expected, "write_zlib_read.tfrecord.z", zlib_options)
  plain_fn = self._ZlibDecompressFile(compressed_fn,
                                      "write_zlib_read.tfrecord")
  self.assertEqual(list(tf_record.tf_record_iterator(plain_fn)), expected)
def testWriteZlibReadLarge(self):
  """Verify compression for large records is zlib library compatible."""
  expected = [_TEXT * 10240]  # One large record, about 5MB.
  zlib_options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
  compressed_fn = self._WriteRecordsToFile(
      expected, "write_zlib_read_large.tfrecord.z", zlib_options)
  plain_fn = self._ZlibDecompressFile(compressed_fn,
                                      "write_zlib_read_large.tfrecord")
  self.assertEqual(list(tf_record.tf_record_iterator(plain_fn)), expected)
def testZlibReadWriteLarge(self):
  """Verify that writing large contents also works."""
  expected = [_TEXT * 10240]  # One large record, about 5MB.
  plain_fn = self._WriteRecordsToFile(expected,
                                      "zlib_read_write_large.tfrecord")
  compressed_fn = self._ZlibCompressFile(plain_fn,
                                         "zlib_read_write_large.tfrecord.z")
  zlib_options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
  read_back = list(
      tf_record.tf_record_iterator(compressed_fn, options=zlib_options))
  self.assertEqual(read_back, expected)
def generateTestData(self,
                     prefix,
                     n,
                     m,
                     compression_type=tf_record.TFRecordCompressionType.NONE):
  """Writes n files of m records each; record k holds k zero-padded to 10 digits."""
  options = tf_record.TFRecordOptions(compression_type)
  for file_idx in range(n):
    path = os.path.join(self.get_temp_dir(), prefix + "." + str(file_idx))
    writer = tf_record.TFRecordWriter(path, options=options)
    for rec_idx in range(m):
      payload = "{0:0{width}}".format(file_idx * m + rec_idx, width=10)
      writer.write(payload.encode("utf-8"))
    writer.close()
def testIterator(self):
  """tf_record_iterator yields each record, then raises StopIteration."""
  expected = [self._Record(0, i) for i in range(self._num_records)]
  zlib_options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
  fn = self._WriteRecordsToFile(expected, "compressed_records", zlib_options)
  it = tf_record.tf_record_iterator(fn, zlib_options)
  for want in expected:
    self.assertAllEqual(want, next(it))
  # The iterator must be exhausted after the last record.
  with self.assertRaises(StopIteration):
    next(it)
def testIterator(self):
  """Iterates a compressed file and hits StopIteration at the end."""
  fn = self._WriteCompressedRecordsToFile(
      [self._Record(i) for i in range(self._num_records)],
      "compressed_records")
  zlib_options = tf_record.TFRecordOptions(
      compression_type=TFRecordCompressionType.ZLIB)
  it = tf_record.tf_record_iterator(fn, zlib_options)
  for idx in range(self._num_records):
    self.assertAllEqual(self._Record(idx), next(it))
  # The iterator must be exhausted after the last record.
  with self.assertRaises(StopIteration):
    next(it)
def testWriteReadZLibFiles(self):
  """Checks TFRecordWriter ZLIB output against plain zlib (de)compression."""
  # Write uncompressed then compress manually.
  options = tf_record.TFRecordOptions(TFRecordCompressionType.NONE)
  files = self._CreateFiles(options, prefix="uncompressed")
  zlib_files = [
      self._ZlibCompressFile(fn, "tfrecord_%s.z" % i)
      for i, fn in enumerate(files)
  ]
  # Uncompressed vs compressed file contents differ (compressed=False).
  self._AssertFilesEqual(files, zlib_files, False)

  # Now write compressed and verify same.
  options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
  compressed_files = self._CreateFiles(options, prefix="compressed")
  self._AssertFilesEqual(compressed_files, zlib_files, True)

  # Decompress the writer's compressed output and verify it matches the
  # original uncompressed files.
  uncompressed_files = [
      self._ZlibDecompressFile(fn, "tfrecord_%s.z" % i)
      for i, fn in enumerate(compressed_files)
  ]
  self._AssertFilesEqual(uncompressed_files, files, True)
def testWriteGZIP(self):
  """Writer op round-trips GZIP-compressed records."""
  gzip_options = tf_record.TFRecordOptions(
      tf_record.TFRecordCompressionType.GZIP)
  with self.cached_session() as sess:
    feed = {
        self.filename: self._createFile(gzip_options),
        self.compression_type: "GZIP",
    }
    sess.run(self.writer, feed_dict=feed)
  # Every record read back must match the generated record for its index.
  reader = tf_record.tf_record_iterator(
      self._outputFilename(), options=gzip_options)
  for idx, rec in enumerate(reader):
    self.assertAllEqual(self._record(idx), rec)
def _WriteCompressedRecordsToFile(
    self,
    records,
    name="tfrecord.z",
    compression_type=tf_record.TFRecordCompressionType.ZLIB):
  """Writes records to a compressed file in the temp dir and returns its path."""
  path = os.path.join(self.get_temp_dir(), name)
  options = tf_record.TFRecordOptions(compression_type=compression_type)
  writer = tf_record.TFRecordWriter(path, options=options)
  for record in records:
    writer.write(record)
  writer.close()
  del writer  # Drop the handle promptly so nothing keeps the file open.
  return path
def save_rows_to_tf_record_file(df_rows, make_sequence_example_fn,
                                export_filename):
  """Writes one serialized SequenceExample per DataFrame row, GZIP-compressed.

  The writer is always closed (and stdout flushed), even if serialization
  fails partway through.
  """
  options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.GZIP)
  writer = tf_record.TFRecordWriter(export_filename, options=options)
  try:
    for _, row in df_rows.iterrows():
      writer.write(make_sequence_example_fn(row).SerializeToString())
  finally:
    writer.close()
    sys.stdout.flush()
def testWriteReadGzipFiles(self):
  """Checks TFRecordWriter GZIP output against plain gzip (de)compression."""
  # Write uncompressed then compress manually.
  options = tf_record.TFRecordOptions(TFRecordCompressionType.NONE)
  files = self._CreateFiles(options, prefix="uncompressed")
  gzip_files = [
      self._GzipCompressFile(fn, "tfrecord_%s.gz" % i)
      for i, fn in enumerate(files)
  ]
  # Uncompressed vs compressed file contents differ (compressed=False).
  self._AssertFilesEqual(files, gzip_files, False)

  # Now write compressed and verify same.
  options = tf_record.TFRecordOptions(TFRecordCompressionType.GZIP)
  compressed_files = self._CreateFiles(options, prefix="compressed")

  # Note: Gzips written by TFRecordWriter add 'tfrecord_0' so
  # compressed_files can't be compared with gzip_files

  # Decompress the writer's compressed output and verify it matches the
  # original uncompressed files.
  uncompressed_files = [
      self._GzipDecompressFile(fn, "tfrecord_%s.gz" % i)
      for i, fn in enumerate(compressed_files)
  ]
  self._AssertFilesEqual(uncompressed_files, files, True)
def _CreateFiles(self):
  """Writes self._num_files ZLIB-compressed record files; returns their paths."""
  filenames = []
  zlib_options = tf_record.TFRecordOptions(
      compression_type=TFRecordCompressionType.ZLIB)
  for file_idx in range(self._num_files):
    path = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % file_idx)
    filenames.append(path)
    writer = tf_record.TFRecordWriter(path, options=zlib_options)
    for rec_idx in range(self._num_records):
      writer.write(self._Record(file_idx, rec_idx))
    writer.close()
    del writer  # Release the handle before moving on to the next file.
  return filenames
def testReadGzipFiles(self):
  """TFRecordReader reads back files that were written with GZIP options."""
  gzip_options = tf_record.TFRecordOptions(TFRecordCompressionType.GZIP)
  files = self._CreateFiles(gzip_options)
  reader = io_ops.TFRecordReader(name="test_reader", options=gzip_options)
  queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
  key, value = reader.read(queue)
  self.evaluate(queue.enqueue_many([files]))
  self.evaluate(queue.close())
  for file_idx in range(self._num_files):
    expected_prefix = "%s:" % files[file_idx]
    for rec_idx in range(self._num_records):
      k, v = self.evaluate([key, value])
      self.assertTrue(compat.as_text(k).startswith(expected_prefix))
      self.assertAllEqual(self._Record(file_idx, rec_idx), v)
def testGzipReadWrite(self):
  """Verify that files produced are gzip compatible."""
  expected = [b"foo", b"bar"]
  plain_fn = self._WriteRecordsToFile(expected, "gzip_read_write.tfrecord")

  # Gzip-compress the file's contents into a new file.
  with open(plain_fn, "rb") as src:
    raw = src.read()
  gz_fn = os.path.join(self.get_temp_dir(), "tf_record.gz")
  with gzip.GzipFile(gz_fn, "wb") as dst:
    dst.write(raw)

  gzip_options = tf_record.TFRecordOptions(TFRecordCompressionType.GZIP)
  read_back = list(tf_record.tf_record_iterator(gz_fn, options=gzip_options))
  self.assertEqual(read_back, expected)
def testOneEpoch(self):
  """Reads every record exactly once, then expects the closed-queue error."""
  files = self._CreateFiles()
  with self.test_session() as sess:
    options = tf_record.TFRecordOptions(
        compression_type=TFRecordCompressionType.ZLIB)
    reader = io_ops.TFRecordReader(name="test_reader", options=options)
    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
    key, value = reader.read(queue)
    queue.enqueue_many([files]).run()
    queue.close().run()
    for i in range(self._num_files):
      for j in range(self._num_records):
        k, v = sess.run([key, value])
        # Reader keys are prefixed with the source filename and a colon.
        self.assertTrue(compat.as_text(k).startswith("%s:" % files[i]))
        self.assertAllEqual(self._Record(i, j), v)
    # The queue is closed and drained, so one more read must fail with
    # the out-of-range error (regex-escaped parentheses in the pattern).
    with self.assertRaisesOpError("is closed and has insufficient elements "
                                  "\\(requested 1, current size 0\\)"):
      k, v = sess.run([key, value])
def main(unused_argv):
  """Writes three ZLIB-compressed tf.train.Examples to /tmp/test1.tfrecord."""

  def _make_example(int_value, bytes_value):
    # Builds an Example with one int64 feature and one bytes feature.
    return tf.train.Example(features=tf.train.Features(
        feature={
            "feature_0":
                tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[int_value])),
            "feature_1":
                tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[bytes_value])),
        }))

  options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.ZLIB)
  writer = python_io.TFRecordWriter("/tmp/test1.tfrecord", options)
  try:
    # BUG FIX: BytesList requires bytes, not str, under Python 3 — the
    # original passed "1111111111" etc., which raises a TypeError.
    for int_value, bytes_value in ((111, b"1111111111"),
                                   (222, b"2222222222"),
                                   (333, b"3333333333")):
      writer.write(_make_example(int_value, bytes_value).SerializeToString())
  finally:
    writer.close()
  tf.compat.v1.logging.info('File /tmp/test1.tfrecord generated!')
def input_pipeline(mode, batch_size=BATCH_SIZE, num_epochs=NUM_EPOCHS):
  """Builds the queue-based image input pipeline.

  Args:
    mode: 'train' selects the training TFRecord and feature keys; any other
      value selects the validation ones.
    batch_size: number of examples per shuffled batch.
    num_epochs: epoch budget for the filename queue (one extra epoch is
      enqueued, preserving the original behavior).

  Returns:
    (images, labels): a shuffled batch of 224x224x3 uint8 images and
    one-hot labels with NUM_CLASSES classes.
  """
  with tf.name_scope('img_pipeline'):
    if mode == 'train':
      filenames = [TRAIN_FILENAME]
      image_feature = 'train/image'
      label_feature = 'train/label'
    else:
      filenames = [VAL_FILENAME]
      image_feature = 'val/image'
      label_feature = 'val/label'
    feature = {
        image_feature: tf.FixedLenFeature([], tf.string),
        label_feature: tf.FixedLenFeature([], tf.int64)
    }

    # Create a list of filenames and pass it to a queue.
    # BUG FIX: honor the num_epochs parameter instead of the global
    # NUM_EPOCHS; the "+ 1" is kept so default behavior is unchanged.
    filename_queue = tf.train.string_input_producer(
        filenames, num_epochs=num_epochs + 1)

    # Define a reader and read the next record.
    options = tf_record.TFRecordOptions(
        compression_type=tf_record.TFRecordCompressionType.GZIP)
    reader = tf.TFRecordReader(options=options)
    _, serialized_example = reader.read(filename_queue)

    # Decode the record read by the reader.
    features = tf.parse_single_example(serialized_example, features=feature)
    # Convert the image data from string back to the numbers.
    image = tf.decode_raw(features[image_feature], tf.uint8)
    # Cast label data and expand into a one-hot encoding.
    label = tf.cast(features[label_feature], tf.int32)
    label = tf.one_hot(label, NUM_CLASSES)
    # Reshape image data into the original shape.
    # NOTE(review): assumes the serialized images are 256x256x3 — confirm
    # against the TFRecord-generation code.
    image = tf.reshape(image, [256, 256, 3])

    # Preprocessing:
    # 1. random cropping 224x224
    # 2. random LR-flipping
    image = tf.random_crop(image, [224, 224, 3])
    image = tf.image.random_flip_left_right(image)

    # Creates batches by randomly shuffling tensors.
    # min_after_dequeue defines how big a buffer we will randomly sample
    # from -- bigger means better shuffling but slower start up and more
    # memory used.
    # capacity must be larger than min_after_dequeue and the amount larger
    # determines the maximum we will prefetch. Recommendation:
    #   min_after_dequeue + (num_threads + a small safety margin) * batch_size
    min_after_dequeue = 100
    num_threads = 6
    # BUG FIX: honor the batch_size parameter instead of the global
    # BATCH_SIZE (their default values are identical).
    capacity = min_after_dequeue + (num_threads + 2) * batch_size
    images, labels = tf.train.shuffle_batch(
        [image, label],
        batch_size=batch_size,
        capacity=capacity,
        num_threads=num_threads,
        min_after_dequeue=min_after_dequeue)
    return images, labels