def main_fun(args, ctx):
    """Train a small MNIST CNN under MultiWorkerMirroredStrategy using tensorflow_datasets.

    Args:
        args: parsed arguments; reads ``buffer_size``, ``batch_size``,
            ``cluster_size``, ``model_dir``, ``epochs``, ``steps_per_epoch``
            and ``export_dir``.
        ctx: node context; only ``ctx.job_name`` is read, to tell the export
            helper whether this node is the chief.
    """
    import tensorflow_datasets as tfds
    import tensorflow as tf
    from tensorflowonspark import compat

    tfds.disable_progress_bar()

    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

    shuffle_buffer = args.buffer_size
    per_worker_batch = args.batch_size
    worker_count = args.cluster_size

    def normalize(image, label):
        """Scale MNIST pixels from (0, 255] to (0., 1.]; the label is passed through."""
        scaled = tf.cast(image, tf.float32) / 255
        return scaled, label

    def make_cnn():
        """Build and compile the Conv2D -> MaxPooling2D -> Flatten -> Dense(64) -> Dense(10) classifier."""
        layers = [
            tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(10, activation='softmax'),
        ]
        cnn = tf.keras.Sequential(layers)
        cnn.compile(
            loss=tf.keras.losses.sparse_categorical_crossentropy,
            optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
            metrics=['accuracy'],
        )
        return cnn

    # workaround for https://github.com/tensorflow/datasets/issues/1405
    mnist_train = tfds.load(name='mnist', split='train', as_supervised=True)
    shard_options = tf.data.Options()
    shard_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA

    sharded = mnist_train.with_options(shard_options)
    unbatched = sharded.repeat().map(normalize).shuffle(shuffle_buffer)

    # `tf.data.Dataset.batch` expects the global batch size, so the per-worker
    # batch size is scaled up by the number of workers.
    global_batch = per_worker_batch * worker_count
    train_batches = unbatched.batch(global_batch)

    # Passing args.model_dir directly as the ModelCheckpoint filepath was noted
    # to fail, so an explicit per-epoch weights path is used instead.
    tf.io.gfile.makedirs(args.model_dir)
    weights_path = args.model_dir + "/weights-{epoch:04d}"
    checkpointer = tf.keras.callbacks.ModelCheckpoint(
        filepath=weights_path, verbose=1, save_weights_only=True)
    board = tf.keras.callbacks.TensorBoard(log_dir=args.model_dir)

    with strategy.scope():
        multi_worker_model = make_cnn()

    multi_worker_model.fit(
        x=train_batches,
        epochs=args.epochs,
        steps_per_epoch=args.steps_per_epoch,
        callbacks=[checkpointer, board],
    )

    is_chief = ctx.job_name == 'chief'
    compat.export_saved_model(multi_worker_model, args.export_dir, is_chief)
def main_fun(args, ctx):
    """Train a small MNIST CNN under MultiWorkerMirroredStrategy, fed from a Spark RDD.

    Examples are pulled one at a time from the TensorFlowOnSpark data feed and
    exposed to Keras through ``tf.data.Dataset.from_generator``.

    Args:
        args: parsed arguments; reads ``batch_size``, ``model_dir``,
            ``epochs`` and ``export_dir``.
        ctx: node context; reads ``ctx.mgr`` (for the data feed),
            ``ctx.num_workers`` and ``ctx.job_name``.
    """
    import numpy as np
    import tensorflow as tf
    from tensorflowonspark import compat, TFNode

    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

    def build_and_compile_cnn_model():
        """Build and compile the Conv2D -> MaxPooling2D -> Flatten -> Dense(64) -> Dense(10) classifier."""
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(10, activation='softmax'),
        ])
        model.compile(
            loss=tf.keras.losses.sparse_categorical_crossentropy,
            optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
            metrics=['accuracy'],
        )
        return model

    tf_feed = TFNode.DataFeed(ctx.mgr, False)

    def rdd_generator():
        """Yield (image, label) pairs from the feed until it stops or returns an empty batch."""
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            if len(batch) == 0:
                return
            example = batch[0]
            image = np.array(example[0]).astype(np.float32) / 255.0
            image = np.reshape(image, (28, 28, 1))
            label = np.array(example[1]).astype(np.float32)
            label = np.reshape(label, (1,))
            yield (image, label)

    ds = tf.data.Dataset.from_generator(
        rdd_generator,
        (tf.float32, tf.float32),
        (tf.TensorShape([28, 28, 1]), tf.TensorShape([1])),
    )
    ds = ds.batch(args.batch_size)

    # Passing args.model_dir directly as the ModelCheckpoint filepath was noted
    # to fail, so an explicit per-epoch weights path is used instead.
    tf.io.gfile.makedirs(args.model_dir)
    filepath = args.model_dir + "/weights-{epoch:04d}"
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(filepath=filepath, verbose=1, save_weights_only=True),
    ]

    with strategy.scope():
        multi_worker_model = build_and_compile_cnn_model()

    # Note: MultiWorkerMirroredStrategy (CollectiveAllReduceStrategy) is synchronous,
    # so we need to ensure that all workers complete training before any of them
    # run out of data from the RDD. Given that Spark RDD partitions (and partition
    # sizes) can be non-evenly divisible by num_workers, we just stop training at
    # 90% of the total expected number of steps.
    steps_per_epoch = 60000 / args.batch_size
    steps_per_epoch_per_worker = steps_per_epoch / ctx.num_workers
    # Keras documents `steps_per_epoch` as an integer; truncate so a fractional
    # value never rounds up past the 90% cap.
    max_steps_per_worker = int(steps_per_epoch_per_worker * 0.9)

    multi_worker_model.fit(
        x=ds,
        epochs=args.epochs,
        steps_per_epoch=max_steps_per_worker,
        callbacks=callbacks,
    )

    from tensorflow_estimator.python.estimator.export import export_lib
    export_dir = export_lib.get_timestamped_export_dir(args.export_dir)
    compat.export_saved_model(multi_worker_model, export_dir, ctx.job_name == 'chief')

    # terminating feed tells spark to skip processing further partitions
    tf_feed.terminate()
def mainFun(args, ctx):
    """Train a 30-class CNN on (1025, 50, 1) inputs fed from a Spark RDD.

    Examples are pulled one at a time from the TensorFlowOnSpark data feed;
    element 0 of each example is read as the label and element 1 as the
    features.

    Args:
        args: parsed arguments; only ``export_dir`` is read.
        ctx: node context; reads ``ctx.mgr`` (for the data feed) and
            ``ctx.job_name``.
    """
    import numpy as np
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, Input, BatchNormalization
    from tensorflowonspark import compat, TFNode

    # Hard-coded training configuration (values unchanged from the original code).
    total_examples = 600
    batch_size = 1
    num_workers = 1
    num_epochs = 2

    # Setting distributed model strategy
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

    def build_and_compile_model():
        """Build and compile the two-stage Conv2D/MaxPool2D/Dropout network with a dense head."""
        model = Sequential()
        model.add(Input(shape=(1025, 50, 1)))

        # First convolution and pooling step
        model.add(Conv2D(16, kernel_size=[3, 3], activation='relu', data_format='channels_last'))
        model.add(MaxPool2D(pool_size=[3, 3], data_format='channels_last'))
        model.add(Dropout(0.2))

        # Second convolution and pooling step
        model.add(Conv2D(32, kernel_size=[3, 3], activation='relu', data_format='channels_last'))
        model.add(MaxPool2D(pool_size=[3, 3], data_format='channels_last'))
        model.add(Dropout(0.2))

        # Flattening output of convolution to pass on to Dense layers
        model.add(Flatten())
        model.add(BatchNormalization())
        model.add(Dense(128, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(128, activation='relu'))

        # Output layer
        model.add(Dense(30, activation='softmax'))

        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy'],
        )
        return model

    # Opening up datafeed to iterate over entries
    tf_feed = TFNode.DataFeed(ctx.mgr, False)

    def rdd_generator():
        """Yield (features, label) pairs from the feed until it stops or returns an empty batch."""
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            if len(batch) == 0:
                return
            example = batch[0]

            # Splitting into X and y
            features = np.array(example[1]).astype(np.float32)
            label = np.array(example[0])

            # Encoding labels
            # NOTE(review): np.unique runs on a single example here, so the
            # encoded label appears to always be 0 regardless of its value.
            # A vocabulary shared across the dataset is probably needed;
            # behaviour is left unchanged pending confirmation.
            _, label = np.unique(label, return_inverse=True)
            label = label.astype(np.float32)

            # Adjusting data shape
            features = features.reshape(-1, 50, 1)

            yield (features, label)

    # Creating Tensorflow Dataset
    ds = tf.data.Dataset.from_generator(
        rdd_generator,
        (tf.float32, tf.float32),
        (tf.TensorShape([1025, 50, 1]), tf.TensorShape([1])),
    )
    ds = ds.batch(batch_size)

    # Instantiating Model
    with strategy.scope():
        multi_worker_model = build_and_compile_model()

    # Defining Training Parameters
    steps_per_epoch = total_examples / batch_size
    steps_per_worker = steps_per_epoch / num_workers
    # Stop at 90% of the expected steps so training finishes before the feed
    # runs dry. The original computed this cap but passed the uncapped value
    # to fit(). Truncated to int because Keras documents `steps_per_epoch`
    # as an integer.
    max_steps_per_worker = int(steps_per_worker * 0.9)

    # Fitting Model
    multi_worker_model.fit(x=ds, epochs=num_epochs, steps_per_epoch=max_steps_per_worker)

    # Exporting the trained model to a timestamped directory under args.export_dir
    from tensorflow_estimator.python.estimator.export import export_lib
    export_dir = export_lib.get_timestamped_export_dir(args.export_dir)
    compat.export_saved_model(multi_worker_model, export_dir, ctx.job_name == 'chief')

    # terminating feed tells spark to skip processing further partitions
    tf_feed.terminate()
def _spark_train(args, ctx):
    """Basic linear regression in a distributed TF cluster using InputMode.SPARK

    Args:
        args: parsed arguments; reads ``input_mapping``, ``batch_size``,
            ``model_dir``, ``epochs`` and ``export_dir``.
        ctx: node context; reads ``ctx.mgr`` (for the data feed),
            ``ctx.num_workers`` and ``ctx.job_name``.
    """
    import tensorflow as tf
    from tensorflowonspark import TFNode

    tf.compat.v1.reset_default_graph()
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

    with strategy.scope():
        model = Sequential()
        model.add(Dense(1, activation='linear', input_shape=[2]))
        # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.2), loss='mse', metrics=['mse'])
    model.summary()

    tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping)

    def rdd_generator():
        """Yield (features, label) pairs from the mapped feed until it stops or returns an empty batch."""
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            if len(batch['x']) == 0:
                return
            features = batch['x'][0]
            label = batch['y_'][0]
            yield (features, label)

    ds = tf.data.Dataset.from_generator(
        rdd_generator,
        (tf.float32, tf.float32),
        (tf.TensorShape([2]), tf.TensorShape([1])),
    )

    # disable auto-sharding since we're feeding from an RDD generator
    options = tf.data.Options()
    compat.disable_auto_shard(options)
    ds = ds.with_options(options)
    ds = ds.batch(args.batch_size)

    # only train 90% of each epoch to account for uneven RDD partition sizes;
    # `//` on a float still yields a float, so convert to the integer Keras expects
    steps_per_epoch = int(1000 * 0.9 // (args.batch_size * ctx.num_workers))

    tf.io.gfile.makedirs(args.model_dir)
    filepath = args.model_dir + "/weights-{epoch:04d}"
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            filepath=filepath, verbose=1, load_weights_on_restart=True, save_weights_only=True),
    ]

    model.fit(ds, epochs=args.epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks)

    # This fails with: "NotImplementedError: `fit_generator` is not supported for models compiled with tf.distribute.Strategy"
    # model.fit_generator(ds, epochs=args.epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks)

    if args.export_dir:
        print("exporting model to: {}".format(args.export_dir))
        compat.export_saved_model(model, args.export_dir, ctx.job_name == 'chief')

    tf_feed.terminate()
def main_fun(args, ctx):
    """Example demonstrating loading TFRecords directly from disk (e.g. HDFS) without tensorflow_datasets.

    Args:
        args: parsed arguments; reads ``buffer_size``, ``batch_size``,
            ``cluster_size``, ``images_labels``, ``epochs``, ``data_format``,
            ``model_dir`` and ``export_dir``.
        ctx: node context; reads ``ctx.absolute_path`` (to resolve the input
            file pattern) and ``ctx.job_name``.
    """
    import tensorflow as tf
    from tensorflowonspark import compat

    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

    BUFFER_SIZE = args.buffer_size
    BATCH_SIZE = args.batch_size
    NUM_WORKERS = args.cluster_size

    # parser for TFRecords downloaded by tensorflow_datasets
    # these are images + labels, where the images are just serialized PNGs
    def parse_tfds(x):
        feature_def = {
            "label": tf.io.FixedLenFeature(1, tf.int64),
            "image": tf.io.VarLenFeature(tf.string),
        }
        example = tf.io.parse_single_example(x, feature_def)
        image = tf.io.decode_image(example['image'].values[0]) / 255
        image.set_shape([28, 28, 1])  # fix for https://github.com/tensorflow/tensorflow/issues/24520
        label = example['label']
        return (image, label)

    # parser for TFRecords generated by ${TFoS_HOME}/examples/mnist/mnist_data_setup.py
    # these are images + labels, where the images are flattened arrays of ints
    def parse_tfos(example_proto):
        feature_def = {
            "label": tf.io.FixedLenFeature(10, tf.int64),
            "image": tf.io.FixedLenFeature(28 * 28 * 1, tf.int64),
        }
        features = tf.io.parse_single_example(example_proto, feature_def)
        image = tf.cast(features['image'], tf.float32) / 255
        image = tf.reshape(image, (28, 28, 1))
        label = tf.math.argmax(features['label'], output_type=tf.int32)
        return (image, label)

    # Dataset for input data
    # tfds: /path/to/tensorflow_datasets/mnist/1.0.0/mnist-train.tfrecord*
    # tfos: /path/to/mnist/tfr/train/part-r-*
    image_pattern = ctx.absolute_path(args.images_labels)

    ds = tf.data.Dataset.list_files(image_pattern)
    ds = ds.repeat(args.epochs).shuffle(BUFFER_SIZE)
    ds = ds.interleave(tf.data.TFRecordDataset)

    if args.data_format == 'tfds':
        train_datasets_unbatched = ds.map(parse_tfds)
    else:  # 'tfos'
        train_datasets_unbatched = ds.map(parse_tfos)

    def build_and_compile_cnn_model():
        """Build and compile the Conv2D -> MaxPooling2D -> Flatten -> Dense(64) -> Dense(10) classifier."""
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(10, activation='softmax'),
        ])
        model.compile(
            loss=tf.keras.losses.sparse_categorical_crossentropy,
            optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
            metrics=['accuracy'],
        )
        return model

    # `tf.data.Dataset.batch` expects the global batch size, so the per-worker
    # batch size is scaled up by the number of workers.
    GLOBAL_BATCH_SIZE = BATCH_SIZE * NUM_WORKERS
    train_datasets = train_datasets_unbatched.batch(GLOBAL_BATCH_SIZE)

    # Passing args.model_dir directly as the ModelCheckpoint filepath was noted
    # to fail, so an explicit per-epoch weights path is used instead.
    tf.io.gfile.makedirs(args.model_dir)
    filepath = args.model_dir + "/weights-{epoch:04d}"
    callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=filepath, verbose=1, save_weights_only=True)]

    # Note: if your part files have an uneven number of records, you may see an
    # "Out of Range" exception at less than the expected number of steps_per_epoch,
    # because the executor with the least amount of records will finish first.
    # Floor division keeps steps_per_epoch an integer, as Keras expects, and
    # never asks for more batches than 60000 records can supply.
    steps_per_epoch = 60000 // GLOBAL_BATCH_SIZE

    with strategy.scope():
        multi_worker_model = build_and_compile_cnn_model()

    multi_worker_model.fit(
        x=train_datasets,
        epochs=args.epochs,
        steps_per_epoch=steps_per_epoch,
        callbacks=callbacks,
    )

    from tensorflow_estimator.python.estimator.export import export_lib
    export_dir = export_lib.get_timestamped_export_dir(args.export_dir)
    compat.export_saved_model(multi_worker_model, export_dir, ctx.job_name == 'chief')