# Imports assumed for this excerpt; the tensorflow_io sub-package paths are
# best guesses and may differ in the actual package layout.
import os
import tempfile
from urllib.request import urlopen

import numpy as np
import tensorflow as tf
from tensorflow import dtypes
from tensorflow import errors
from tensorflow.compat.v1 import data

import tensorflow_io.avro as avro_io        # assumed module path
import tensorflow_io.cifar as cifar_io      # assumed module path
import tensorflow_io.hadoop as hadoop_io    # assumed module path
import tensorflow_io.hdf5 as hdf5_io        # assumed module path
import tensorflow_io.json as json_io        # assumed module path
import tensorflow_io.parquet as parquet_io  # assumed module path

tf.compat.v1.disable_eager_execution()  # the iterators below need graph mode


def test_avro_dataset(self):
  """Test case for AvroDataset."""
  # The test.bin was created from avro/lang/c++/examples/datafile.cc.
  filename = os.path.join(
      os.path.dirname(os.path.abspath(__file__)), "test_avro", "test.bin")
  filename = "file://" + filename
  schema_filename = os.path.join(
      os.path.dirname(os.path.abspath(__file__)), "test_avro", "cpx.json")
  with open(schema_filename, 'r') as f:
    schema = f.read()

  columns = ['im', 're']
  output_types = (dtypes.float64, dtypes.float64)
  num_repeats = 2

  dataset = avro_io.AvroDataset(
      [filename], columns, schema, output_types).repeat(num_repeats)
  iterator = data.make_initializable_iterator(dataset)
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.test_session() as sess:
    sess.run(init_op)
    for _ in range(num_repeats):
      for i in range(100):
        (im, re) = (i + 100, i * 100)
        vv = sess.run(get_next)
        self.assertAllClose((im, re), vv)
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)

  # Two files of 100 records each, batched by 3: 66 full batches cover
  # records 0-197, followed by one final batch of 2.
  dataset = avro_io.AvroDataset(
      [filename, filename], columns, schema, output_types, batch=3)
  iterator = data.make_initializable_iterator(dataset)
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.test_session() as sess:
    sess.run(init_op)
    for ii in range(0, 198, 3):
      i = ii % 100
      (im, re) = ([i + 100,
                   ((i + 1) % 100) + 100,
                   ((i + 2) % 100) + 100],
                  [i * 100,
                   ((i + 1) % 100) * 100,
                   ((i + 2) % 100) * 100])
      vv = sess.run(get_next)
      self.assertAllClose((im, re), vv)
    (im, re) = ([198, 199], [9800, 9900])
    vv = sess.run(get_next)
    self.assertAllClose((im, re), vv)
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)
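# For reference, the Avro C++ examples describe `cpx` as a complex-number
# record with two double fields; the schema the assertions above assume
# would look like the following (layout assumed, not copied from cpx.json):
#
#   {"type": "record", "name": "cpx",
#    "fields": [{"name": "re", "type": "double"},
#               {"name": "im", "type": "double"}]}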
def test_cifar_10_dataset(self):
  """Test case for CIFAR10Dataset."""
  url = 'https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'
  filedata = urlopen(url)
  f, filename = tempfile.mkstemp()
  os.write(f, filedata.read())
  os.close(f)

  (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

  num_repeats = 2
  dataset = cifar_io.CIFAR10Dataset(filename, batch=3).repeat(num_repeats)
  iterator = data.make_initializable_iterator(dataset)
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.cached_session() as sess:
    sess.run(init_op)
    for _ in range(num_repeats):  # Dataset is repeated.
      # 50000 training records, batched by 3: 16666 full batches,
      # then one final batch of 2 (records 49998 and 49999).
      for i in range(16666):
        image, label = sess.run(get_next)
        self.assertAllEqual(image[0], x_train[i * 3 + 0])
        self.assertEqual(label[0], y_train[i * 3 + 0])
        self.assertAllEqual(image[1], x_train[i * 3 + 1])
        self.assertEqual(label[1], y_train[i * 3 + 1])
        self.assertAllEqual(image[2], x_train[i * 3 + 2])
        self.assertEqual(label[2], y_train[i * 3 + 2])
      image, label = sess.run(get_next)
      self.assertAllEqual(image[0], x_train[49998])
      self.assertEqual(label[0], y_train[49998])
      self.assertAllEqual(image[1], x_train[49999])
      self.assertEqual(label[1], y_train[49999])
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)

  dataset = cifar_io.CIFAR10Dataset(filename, test=True).repeat(num_repeats)
  iterator = data.make_initializable_iterator(dataset)
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.cached_session() as sess:
    sess.run(init_op)
    for _ in range(num_repeats):  # Dataset is repeated.
      for i in range(10000):
        image, label = sess.run(get_next)
        self.assertAllEqual(image, x_test[i])
        self.assertEqual(label, y_test[i])
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)
def test_hdf5_dataset_int32_zlib(self):
  """Test case for HDF5Dataset with zlib."""
  # Note the file is generated from tdset.h5 with:
  #   with h5py.File('compressed_h5.h5', 'w') as output_f:
  #     output_f.create_dataset(
  #         '/dset1', data=h5f['/dset1'][()], compression='gzip')
  filename = os.path.join(
      os.path.dirname(os.path.abspath(__file__)), "test_hdf5",
      "compressed_h5.h5")
  filename = "file://" + filename
  column = '/dset1'
  dtype = dtypes.int32
  shape = tf.TensorShape([None, 20])

  dataset = hdf5_io.HDF5Dataset(
      filename, column, start=0, stop=10, dtype=dtype, shape=shape).apply(
          tf.data.experimental.unbatch())
  iterator = data.make_initializable_iterator(dataset)
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.test_session() as sess:
    sess.run(init_op)
    for i in range(10):
      # Row i of /dset1 holds the integers [i, i + 20).
      v0 = list([np.asarray([v for v in range(i, i + 20)])])
      vv = sess.run(get_next)
      self.assertAllEqual(v0, [vv])
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)
def test_hdf5_dataset_int32(self):
  """Test case for HDF5Dataset."""
  filename = os.path.join(
      os.path.dirname(os.path.abspath(__file__)), "test_hdf5", "tdset.h5")
  filename = "file://" + filename
  column = '/dset1'
  dtype = dtypes.int32
  shape = tf.TensorShape([None, 20])

  dataset = hdf5_io.HDF5Dataset(
      filename, column, start=0, stop=10, dtype=dtype, shape=shape).apply(
          tf.data.experimental.unbatch())
  iterator = data.make_initializable_iterator(dataset)
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.test_session() as sess:
    sess.run(init_op)
    for i in range(10):
      v0 = list([np.asarray([v for v in range(i, i + 20)])])
      vv = sess.run(get_next)
      self.assertAllEqual(v0, [vv])
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)
def test_sequence_file_dataset(self):
  """Test case for SequenceFileDataset.

  The file is generated with `org.apache.hadoop.io.Text` for key/value.
  There are 25 records in the file with the format of:
    key = XXX
    value = VALUEXXX
  where XXX is the line number, starting at 001.
  """
  filename = os.path.join(
      os.path.dirname(os.path.abspath(__file__)), "test_hadoop", "string.seq")

  num_repeats = 2
  dataset = hadoop_io.SequenceFileDataset([filename]).repeat(num_repeats)
  iterator = data.make_initializable_iterator(dataset)
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.cached_session() as sess:
    sess.run(init_op)
    for _ in range(num_repeats):  # Dataset is repeated.
      for i in range(25):  # 25 records.
        v0 = ("%03d" % (i + 1)).encode()
        v1 = ("VALUE%03d" % (i + 1)).encode()
        self.assertEqual((v0, v1), sess.run(get_next))
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)
def test_hdf5_dataset(self):
  """Test case for HDF5Dataset."""
  filename = os.path.join(
      os.path.dirname(os.path.abspath(__file__)), "test_hdf5", "tdset.h5")
  filename = "file://" + filename
  columns = ['/dset2']
  output_types = [dtypes.float64]
  output_shapes = [(1, 20)]

  dataset = hdf5_io.HDF5Dataset(
      [filename], columns, output_types, output_shapes, batch=1)
  iterator = data.make_initializable_iterator(dataset)
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.test_session() as sess:
    sess.run(init_op)
    for i in range(30):
      # Row i of /dset2 holds i + 0.0001 * j for column j.
      v0 = list(
          [np.asarray([[i + 1e-04 * v for v in range(20)]], dtype=np.float64)])
      vv = sess.run(get_next)
      self.assertAllEqual(v0, vv)
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)
def test_hdf5_dataset_int32_zlib(self):
  """Test case for HDF5Dataset with zlib."""
  # Note the file is generated from tdset.h5 with:
  #   with h5py.File('compressed_h5.h5', 'w') as output_f:
  #     output_f.create_dataset(
  #         '/dset1', data=h5f['/dset1'][()], compression='gzip')
  filename = os.path.join(
      os.path.dirname(os.path.abspath(__file__)), "test_hdf5",
      "compressed_h5.h5")
  filename = "file://" + filename
  columns = ['/dset1']
  output_types = [dtypes.int32]
  output_shapes = [(1, 20)]

  dataset = hdf5_io.HDF5Dataset([filename], columns, output_types,
                                output_shapes)
  iterator = data.make_initializable_iterator(dataset)
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.test_session() as sess:
    sess.run(init_op)
    for i in range(10):
      v0 = list([np.asarray([v for v in range(i, i + 20)])])
      vv = sess.run(get_next)
      self.assertAllEqual(v0, vv)
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)
def test_json_dataset(self):
  """Test case for JSONDataset."""
  filename = os.path.join(
      os.path.dirname(os.path.abspath(__file__)), "test_json", "feature.json")
  columns = ['floatfeature', 'integerfeature']
  output_types = (dtypes.float64, dtypes.int64)
  num_repeats = 2

  dataset = json_io.JSONDataset(
      filename, columns=columns, dtypes=output_types).repeat(num_repeats)
  iterator = data.make_initializable_iterator(dataset)
  init_op = iterator.initializer
  get_next = iterator.get_next()

  test_json = [(1.1, 2), (2.1, 3)]
  with self.test_session() as sess:
    sess.run(init_op)
    for _ in range(num_repeats):
      for i in range(2):
        (floatf, intf) = test_json[i]
        vv = sess.run(get_next)
        self.assertAllClose((floatf, intf), vv)
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)
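# For reference, the assertions above imply `feature.json` carries two
# records along these lines (exact on-disk layout assumed, not copied):
#
#   {"floatfeature": 1.1, "integerfeature": 2}
#   {"floatfeature": 2.1, "integerfeature": 3}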
def test_hdf5_invalid_dataset(self):
  """Test case for HDF5Dataset with invalid columns."""
  filename = os.path.join(
      os.path.dirname(os.path.abspath(__file__)), "test_hdf5", "tdset.h5")
  filename = "file://" + filename
  dataset = hdf5_io.HDF5Dataset([filename], ['/invalid', '/invalid2'],
                                [dtypes.int32, dtypes.int32],
                                [(1, 20), (1, 30)])
  iterator = data.make_initializable_iterator(dataset)
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.test_session() as sess:
    sess.run(init_op)
    with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                 "unable to open dataset /invalid"):
      sess.run(get_next)
def test_parquet_dataset(self):
  """Test case for ParquetDataset.

  Note: The sample file is generated from:
  `parquet-cpp/examples/low-level-api/reader_writer`
  This test extracts columns of [0, 1, 2, 4, 5] with column data types of
  [bool, int32, int64, float, double]. Please check
  `parquet-cpp/examples/low-level-api/reader-writer.cc` for details of how
  the records are generated:
  Column 0 (bool): True for even rows and False otherwise.
  Column 1 (int32): Equal to row_index.
  Column 2 (int64): Equal to row_index * 1000 * 1000 * 1000 * 1000.
  Column 4 (float): Equal to row_index * 1.1.
  Column 5 (double): Equal to row_index * 1.1111111.
  """
  filename = os.path.join(
      os.path.dirname(os.path.abspath(__file__)), "test_parquet",
      "parquet_cpp_example.parquet")
  filenames = tf.constant([filename], dtypes.string)
  columns = [0, 1, 2, 4, 5]
  output_types = (dtypes.bool, dtypes.int32, dtypes.int64, dtypes.float32,
                  dtypes.float64)
  num_repeats = 2

  dataset = parquet_io.ParquetDataset(
      filenames, columns, output_types).repeat(num_repeats)
  iterator = data.make_initializable_iterator(dataset)
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.test_session() as sess:
    sess.run(init_op)
    for _ in range(num_repeats):  # Dataset is repeated.
      for i in range(500):  # 500 rows.
        v0 = ((i % 2) == 0)
        v1 = i
        v2 = i * 1000 * 1000 * 1000 * 1000
        v4 = 1.1 * i
        v5 = 1.1111111 * i
        self.assertAllClose((v0, v1, v2, v4, v5), sess.run(get_next))
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)
def test_hdf5_invalid_dataset(self):
  """Test case for HDF5Dataset with an invalid column."""
  filename = os.path.join(
      os.path.dirname(os.path.abspath(__file__)), "test_hdf5", "tdset.h5")
  filename = "file://" + filename
  dataset = hdf5_io.HDF5Dataset(
      filename, '/invalid', dtype=dtypes.int32,
      shape=tf.TensorShape([1, 20]), start=0, stop=10)
  iterator = data.make_initializable_iterator(dataset)
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.test_session() as sess:
    sess.run(init_op)
    with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                 "unable to open dataset"):
      sess.run(get_next)
def test_hdf5_dataset_binary(self):
  """Test case for HDF5Dataset."""
  filename = os.path.join(
      os.path.dirname(os.path.abspath(__file__)), "test_hdf5", "tbinary.h5")
  filename = "file://" + filename
  columns = ['integer', 'float', 'double']
  output_types = [dtypes.int32, dtypes.float32, dtypes.float64]
  output_shapes = [(1), (1), (1)]

  dataset = hdf5_io.HDF5Dataset([filename], columns, output_types,
                                output_shapes)
  iterator = data.make_initializable_iterator(dataset)
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.test_session() as sess:
    sess.run(init_op)
    for i in range(1, 7):
      vv = sess.run(get_next)
      self.assertAllEqual((i, np.float32(i), np.float64(i)), vv)
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)
def test_parquet_dataset(self):
  """Test case for ParquetDataset.

  Note: The sample file is generated from:
  `parquet-cpp/examples/low-level-api/reader_writer`
  This test extracts columns of [0, 1, 2, 4, 5] with column data types of
  [bool, int32, int64, float, double]. Please check
  `parquet-cpp/examples/low-level-api/reader-writer.cc` for details of how
  the records are generated:
  Column 0 (bool): True for even rows and False otherwise.
  Column 1 (int32): Equal to row_index.
  Column 2 (int64): Equal to row_index * 1000 * 1000 * 1000 * 1000.
  Column 4 (float): Equal to row_index * 1.1.
  Column 5 (double): Equal to row_index * 1.1111111.
  """
  filename = os.path.join(
      os.path.dirname(os.path.abspath(__file__)), "test_parquet",
      "parquet_cpp_example.parquet")
  filename = "file://" + filename
  columns = [
      'boolean_field', 'int32_field', 'int64_field', 'float_field',
      'double_field'
  ]
  output_types = (dtypes.bool, dtypes.int32, dtypes.int64, dtypes.float32,
                  dtypes.float64)
  num_repeats = 2

  dataset = parquet_io.ParquetDataset(
      [filename], columns, output_types).repeat(num_repeats)
  iterator = data.make_initializable_iterator(dataset)
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.test_session() as sess:
    sess.run(init_op)
    for _ in range(num_repeats):  # Dataset is repeated.
      for i in range(500):  # 500 rows.
        v0 = ((i % 2) == 0)
        v1 = i
        v2 = i * 1000 * 1000 * 1000 * 1000
        v4 = 1.1 * i
        v5 = 1.1111111 * i
        vv = sess.run(get_next)
        self.assertAllClose((v0, v1, v2, v4, v5), vv)
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)

  # With batch=1, every element is a batch of a single row.
  dataset = parquet_io.ParquetDataset([filename], columns, output_types,
                                      batch=1)
  iterator = data.make_initializable_iterator(dataset)
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.test_session() as sess:
    sess.run(init_op)
    for i in range(500):
      v0 = ((i % 2) == 0)
      v1 = i
      v2 = i * 1000 * 1000 * 1000 * 1000
      v4 = 1.1 * i
      v5 = 1.1111111 * i
      vv = sess.run(get_next)
      self.assertAllClose(([v0], [v1], [v2], [v4], [v5]), vv)
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)

  # Two files of 500 rows each, batched by 3: 333 full batches cover
  # rows 0-998, followed by one final batch of 1.
  dataset = parquet_io.ParquetDataset([filename, filename], columns,
                                      output_types, batch=3)
  iterator = data.make_initializable_iterator(dataset)
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.test_session() as sess:
    sess.run(init_op)
    for ii in range(0, 999, 3):
      v0, v1, v2, v4, v5 = [], [], [], [], []
      for i in [ii % 500, (ii + 1) % 500, (ii + 2) % 500]:
        v0.append((i % 2) == 0)
        v1.append(i)
        v2.append(i * 1000 * 1000 * 1000 * 1000)
        v4.append(1.1 * i)
        v5.append(1.1111111 * i)
      vv = sess.run(get_next)
      self.assertAllClose((v0, v1, v2, v4, v5), vv)
    i = 999 % 500
    v0 = ((i % 2) == 0)
    v1 = i
    v2 = i * 1000 * 1000 * 1000 * 1000
    v4 = 1.1 * i
    v5 = 1.1111111 * i
    vv = sess.run(get_next)
    self.assertAllClose(([v0], [v1], [v2], [v4], [v5]), vv)
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)

  # With compression (gzip-compressed copy of the same file).
  filename = filename + '.gz'
  dataset = parquet_io.ParquetDataset(
      [filename], columns, output_types).repeat(num_repeats)
  iterator = data.make_initializable_iterator(dataset)
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.test_session() as sess:
    sess.run(init_op)
    for _ in range(num_repeats):  # Dataset is repeated.
      for i in range(500):  # 500 rows.
        v0 = ((i % 2) == 0)
        v1 = i
        v2 = i * 1000 * 1000 * 1000 * 1000
        v4 = 1.1 * i
        v5 = 1.1111111 * i
        vv = sess.run(get_next)
        self.assertAllClose((v0, v1, v2, v4, v5), vv)
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)
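# Minimal runner sketch: the methods above are written against
# `tf.test.TestCase` (the class definition sits outside this excerpt), so a
# module-level entry point would simply delegate to tf.test.main().
if __name__ == "__main__":
  tf.test.main()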