def test_parquet_images_training(self):
    from zoo.orca.learn.tf2 import Estimator
    temp_dir = tempfile.mkdtemp()
    try:
        ParquetDataset.write("file://" + temp_dir, images_generator(), images_schema)
        path = "file://" + temp_dir
        output_types = {"id": tf.string, "image": tf.string, "label": tf.float32}
        output_shapes = {"id": (), "image": (), "label": ()}

        def data_creator(config, batch_size):
            dataset = read_parquet("tf_dataset", input_path=path,
                                   output_types=output_types,
                                   output_shapes=output_shapes)
            dataset = dataset.shuffle(10)
            dataset = dataset.map(lambda data_dict:
                                  (data_dict["image"], data_dict["label"]))
            dataset = dataset.map(parse_data_train)
            dataset = dataset.batch(batch_size)
            return dataset

        ray_ctx = RayContext.get()
        trainer = Estimator.from_keras(model_creator=model_creator)
        trainer.fit(data=data_creator, epochs=1, batch_size=2)
    finally:
        shutil.rmtree(temp_dir)
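# The training test above assumes `parse_data_train` and `model_creator` are
# defined elsewhere in the test module. A minimal sketch of what they could
# look like (hypothetical helpers, not the library API): the parse function
# decodes the raw image bytes stored in the parquet "image" column, and the
# model creator builds a compiled tf.keras model on each worker.
def parse_data_train(image, label):
    # `image` is a scalar tf.string tensor holding encoded image bytes.
    image = tf.io.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [64, 64]) / 255.0  # assumed target size
    return image, label

def model_creator(config):
    import tensorflow as tf
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(8, 3, activation="relu", input_shape=(64, 64, 3)),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy",
                  metrics=["accuracy"])
    return model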
def test_read_parquet_images_tf_dataset(self):
    temp_dir = tempfile.mkdtemp()
    try:
        ParquetDataset.write("file://" + temp_dir, images_generator(),
                             images_schema, block_size=4)
        path = "file://" + temp_dir
        output_types = {"id": tf.string, "image": tf.string, "label": tf.float32}
        dataset = read_parquet("tf_dataset", input_path=path,
                               output_types=output_types)
        for dt in dataset.take(1):
            print(dt.keys())

        dataloader = read_parquet("dataloader", input_path=path)
        cur_dl = iter(dataloader)
        while True:
            try:
                print(next(cur_dl)['label'])
            except StopIteration:
                break
    finally:
        shutil.rmtree(temp_dir)
def test_write_parquet_images(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()

    def generator():
        dataset_path = os.path.join(resource_path, "cat_dog")
        for root, dirs, files in os.walk(os.path.join(dataset_path, "cats")):
            for name in files:
                image_path = os.path.join(root, name)
                yield {"image": image_path, "label": 1, "id": image_path}
        for root, dirs, files in os.walk(os.path.join(dataset_path, "dogs")):
            for name in files:
                image_path = os.path.join(root, name)
                yield {"image": image_path, "label": 0, "id": image_path}

    schema = {
        "image": SchemaField(feature_type=FeatureType.IMAGE,
                             dtype=DType.FLOAT32, shape=(10,)),
        "label": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=DType.FLOAT32, shape=(4,)),
        "id": SchemaField(feature_type=FeatureType.SCALAR,
                          dtype=DType.STRING, shape=())
    }

    try:
        ParquetDataset.write("file://" + temp_dir, generator(), schema)
        data, schema = ParquetDataset._read_as_dict_rdd("file://" + temp_dir)
        data = data.collect()[0]
        image_path = data['id']
        with open(image_path, "rb") as f:
            image_bytes = f.read()
        assert image_bytes == data['image']
    finally:
        shutil.rmtree(temp_dir)
def test_write_parquet_simple(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()

    def generator(num):
        for i in range(num):
            yield {"id": i, "feature": np.zeros((10,)), "label": np.ones((4,))}

    schema = {
        "id": SchemaField(feature_type=FeatureType.SCALAR,
                          dtype=DType.INT32, shape=()),
        "feature": SchemaField(feature_type=FeatureType.NDARRAY,
                               dtype=DType.FLOAT32, shape=(10,)),
        "label": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=DType.FLOAT32, shape=(4,))
    }

    try:
        ParquetDataset.write("file://" + temp_dir, generator(100), schema)
        data, schema = ParquetDataset._read_as_dict_rdd("file://" + temp_dir)
        data = data.collect()[0]
        assert data['id'] == 0
        assert np.all(data['feature'] == np.zeros((10,), dtype=np.float32))
        assert np.all(data['label'] == np.ones((4,), dtype=np.float32))
    finally:
        shutil.rmtree(temp_dir)
def test_read_parquet_images_tf_dataset(self):
    temp_dir = tempfile.mkdtemp()
    try:
        ParquetDataset.write("file://" + temp_dir, images_generator(),
                             images_schema, block_size=4)
        path = "file://" + temp_dir
        output_types = {"id": tf.string, "image": tf.string, "label": tf.float32}
        dataset = read_parquet("tf_dataset", path=path, output_types=output_types)
        for dt in dataset.take(1):
            print(dt.keys())

        num_shards, rank = 3, 1
        dataset_shard = read_parquet("tf_dataset", path=path,
                                     config={"num_shards": num_shards,
                                             "rank": rank},
                                     output_types=output_types)
        assert len(list(dataset_shard)) <= len(list(dataset)) // num_shards, \
            "len of dataset_shard should be 1/`num_shards` of the whole dataset."

        dataloader = read_parquet("dataloader", path=path)
        dataloader_shard = read_parquet("dataloader", path=path,
                                        config={"num_shards": num_shards,
                                                "rank": rank})
        cur_dl = iter(dataloader_shard)
        cur_count = 0
        while True:
            try:
                print(next(cur_dl)['label'])
                cur_count += 1
            except StopIteration:
                break
        assert cur_count == len(list(dataset_shard))
    finally:
        shutil.rmtree(temp_dir)
def test_write_mnist(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()
    try:
        train_image_file = os.path.join(temp_dir, "train-images")
        train_label_file = os.path.join(temp_dir, "train-labels")
        output_path = os.path.join(temp_dir, "output_dataset")
        images = np.array([[i] * 16 for i in range(20)]).reshape(
            (20, 4, 4)).astype(np.uint8)
        labels = np.array(list(range(20))).reshape((20,)).astype(np.uint8)
        _images_to_mnist_file(images, train_image_file)
        _labels_to_mnist_file(labels, train_label_file)

        write_mnist(image_file=train_image_file, label_file=train_label_file,
                    output_path=output_path)
        data, schema = ParquetDataset._read_as_dict_rdd(output_path)
        data = data.sortBy(lambda x: x['label']).collect()
        images_load = np.reshape(np.stack([d['image'] for d in data]), (-1, 4, 4))
        labels_load = np.stack([d['label'] for d in data])
        assert np.all(images_load == images)
        # Compare loaded labels against the originals (the original assertion
        # compared labels_load with itself, which is always true).
        assert np.all(labels_load == labels)
    finally:
        shutil.rmtree(temp_dir)
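# `_images_to_mnist_file` / `_labels_to_mnist_file` are module-level test
# helpers assumed above. A minimal sketch of what they could do, assuming
# the standard IDX encoding that MNIST readers parse (big-endian magic
# number, dimension sizes, then raw bytes); hypothetical reconstruction:
import struct

def _images_to_mnist_file(images, file_path):
    # IDX3 layout: magic 2051, image count, rows, cols, then pixel bytes.
    n, rows, cols = images.shape
    with open(file_path, "wb") as f:
        f.write(struct.pack(">iiii", 2051, n, rows, cols))
        f.write(images.tobytes())

def _labels_to_mnist_file(labels, file_path):
    # IDX1 layout: magic 2049, label count, then label bytes.
    with open(file_path, "wb") as f:
        f.write(struct.pack(">ii", 2049, labels.shape[0]))
        f.write(labels.tobytes())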
def test_read_parquet_images_tf_dataset(self):
    temp_dir = tempfile.mkdtemp()
    try:
        ParquetDataset.write("file://" + temp_dir, images_generator(), images_schema)
        path = "file://" + temp_dir
        output_types = {"id": tf.string, "image": tf.string, "label": tf.float32}
        dataset = read_parquet("tf_dataset", input_path=path,
                               output_types=output_types)
        for dt in dataset.take(1):
            print(dt.keys())
    finally:
        shutil.rmtree(temp_dir)
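# The image read/training tests rely on module-level `images_generator` and
# `images_schema` fixtures that are not shown in this section. A minimal
# sketch under the same conventions as the cat_dog tests (hypothetical; the
# real fixtures live elsewhere in the test module): each record carries an
# image file path, a float label, and a string id.
def images_generator():
    dataset_path = os.path.join(resource_path, "cat_dog")
    for label, sub_dir in [(1.0, "cats"), (0.0, "dogs")]:
        for root, dirs, files in os.walk(os.path.join(dataset_path, sub_dir)):
            for name in files:
                image_path = os.path.join(root, name)
                yield {"id": image_path, "image": image_path, "label": label}

images_schema = {
    "id": SchemaField(feature_type=FeatureType.SCALAR,
                      dtype=DType.STRING, shape=()),
    "image": SchemaField(feature_type=FeatureType.IMAGE,
                         dtype=DType.FLOAT32, shape=()),
    "label": SchemaField(feature_type=FeatureType.SCALAR,
                         dtype=DType.FLOAT32, shape=())
}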
def test_train_simple(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()
    try:
        _write_ndarrays(images=np.random.randn(500, 28, 28, 1).astype(np.float32),
                        labels=np.random.randint(0, 10, (500,)).astype(np.int32),
                        output_path=temp_dir)
        dataset = ParquetDataset.read_as_tf(temp_dir)

        def preprocess(data):
            return data['image'], data['label']

        dataset = dataset.map(preprocess)

        import tensorflow as tf
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(20, kernel_size=(5, 5), strides=(1, 1),
                                   activation='tanh', input_shape=(28, 28, 1),
                                   padding='valid'),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                         padding='valid'),
            tf.keras.layers.Conv2D(50, kernel_size=(5, 5), strides=(1, 1),
                                   activation='tanh', padding='valid'),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                         padding='valid'),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(500, activation='tanh'),
            tf.keras.layers.Dense(10, activation='softmax'),
        ])
        model.compile(optimizer=tf.keras.optimizers.RMSprop(),
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])

        est = Estimator.from_keras(keras_model=model)
        est.fit(data=dataset, batch_size=100, epochs=1)
    finally:
        shutil.rmtree(temp_dir)
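# `_write_ndarrays` is another test helper assumed above. A minimal sketch:
# it wraps the arrays in a generator and writes them through
# ParquetDataset.write with a matching NDARRAY/SCALAR schema (hypothetical
# reconstruction, consistent with the simple-write test above).
def _write_ndarrays(images, labels, output_path):
    def generator():
        for image, label in zip(images, labels):
            yield {"image": image, "label": label}

    schema = {
        "image": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=DType.FLOAT32, shape=(28, 28, 1)),
        "label": SchemaField(feature_type=FeatureType.SCALAR,
                             dtype=DType.INT32, shape=())
    }
    ParquetDataset.write(output_path, generator(), schema)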
def test_write_from_directory(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()
    try:
        label_map = {"cats": 0, "dogs": 1}
        write_from_directory(os.path.join(resource_path, "cat_dog"),
                             label_map, temp_dir)
        train_xshard = ParquetDataset._read_as_xshards(temp_dir)
        data = train_xshard.collect()[0]
        image_path = data["image_id"][0]
        with open(image_path, "rb") as f:
            image_bytes = f.read()
        assert image_bytes == data['image'][0]
    finally:
        shutil.rmtree(temp_dir)
def test_write_voc(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()
    try:
        from zoo.orca.data import SparkXShards
        dataset_path = os.path.join(resource_path, "VOCdevkit")
        output_path = os.path.join(temp_dir, "output_dataset")
        write_voc(dataset_path, splits_names=[(2007, "trainval")],
                  output_path="file://" + output_path)
        data, schema = ParquetDataset._read_as_dict_rdd("file://" + output_path)
        data = data.collect()[0]
        image_path = data["image_id"]
        with open(image_path, "rb") as f:
            image_bytes = f.read()
        assert image_bytes == data['image']
    finally:
        shutil.rmtree(temp_dir)