Example #1
File: mnist_tf.py  Project: lazywhite/sa
def main_fun(args, ctx):
    import tensorflow_datasets as tfds
    import tensorflow as tf
    from tensorflowonspark import compat

    tfds.disable_progress_bar()

    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

    BUFFER_SIZE = args.buffer_size
    BATCH_SIZE = args.batch_size
    NUM_WORKERS = args.cluster_size

    # Scale MNIST pixel values from [0, 255] to [0., 1.]
    def scale(image, label):
        return tf.cast(image, tf.float32) / 255, label

    # workaround for https://github.com/tensorflow/datasets/issues/1405
    datasets = tfds.load(name='mnist', split='train', as_supervised=True)
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA
    train_datasets_unbatched = datasets.with_options(options).repeat().map(
        scale).shuffle(BUFFER_SIZE)

    def build_and_compile_cnn_model():
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32,
                                   3,
                                   activation='relu',
                                   input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(10, activation='softmax')
        ])
        model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
                      optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
                      metrics=['accuracy'])
        return model

    # single node
    # single_worker_model = build_and_compile_cnn_model()
    # single_worker_model.fit(x=train_datasets, epochs=3)

    # The batch size scales up by the number of workers, since
    # `tf.data.Dataset.batch` expects the global batch size. For example, with a
    # per-worker batch size of 64 and two workers, the global batch size is 128.
    GLOBAL_BATCH_SIZE = BATCH_SIZE * NUM_WORKERS
    train_datasets = train_datasets_unbatched.batch(GLOBAL_BATCH_SIZE)

    # Note: checkpointing directly to the model directory fails, so we write
    # per-epoch weight files under it instead.
    # callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=args.model_dir)]
    tf.io.gfile.makedirs(args.model_dir)
    filepath = args.model_dir + "/weights-{epoch:04d}"
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(filepath=filepath,
                                           verbose=1,
                                           save_weights_only=True),
        tf.keras.callbacks.TensorBoard(log_dir=args.model_dir)
    ]

    with strategy.scope():
        multi_worker_model = build_and_compile_cnn_model()
    multi_worker_model.fit(x=train_datasets,
                           epochs=args.epochs,
                           steps_per_epoch=args.steps_per_epoch,
                           callbacks=callbacks)

    compat.export_saved_model(multi_worker_model, args.export_dir,
                              ctx.job_name == 'chief')
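For reference, a function like main_fun is normally launched from the Spark driver via TFCluster. A minimal sketch, assuming a live SparkContext sc and an argparse namespace args with the fields used above; treat the exact keyword arguments as version-dependent:

from tensorflowonspark import TFCluster

# Hypothetical driver-side launcher (InputMode.TENSORFLOW: each executor
# loads its own data via tensorflow_datasets).
cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=0,
                        tensorboard=False,
                        input_mode=TFCluster.InputMode.TENSORFLOW,
                        master_node='chief')
cluster.shutdown()  # blocks until main_fun completes on all executors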
Example #2
def main_fun(args, ctx):
    import numpy as np
    import tensorflow as tf
    from tensorflowonspark import compat, TFNode

    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

    def build_and_compile_cnn_model():
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32,
                                   3,
                                   activation='relu',
                                   input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(10, activation='softmax')
        ])
        model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
                      optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
                      metrics=['accuracy'])
        return model

    # single node
    # single_worker_model = build_and_compile_cnn_model()
    # single_worker_model.fit(x=train_datasets, epochs=3)

    tf_feed = TFNode.DataFeed(ctx.mgr, False)

    def rdd_generator():
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            if len(batch) > 0:
                example = batch[0]
                image = np.array(example[0]).astype(np.float32) / 255.0
                image = np.reshape(image, (28, 28, 1))
                label = np.array(example[1]).astype(np.float32)
                label = np.reshape(label, (1, ))
                yield (image, label)
            else:
                return

    ds = tf.data.Dataset.from_generator(
        rdd_generator, (tf.float32, tf.float32),
        (tf.TensorShape([28, 28, 1]), tf.TensorShape([1])))
    ds = ds.batch(args.batch_size)

    # Note: checkpointing directly to the model directory fails, so we write
    # per-epoch weight files under it instead.
    # callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=args.model_dir)]
    tf.io.gfile.makedirs(args.model_dir)
    filepath = args.model_dir + "/weights-{epoch:04d}"
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(filepath=filepath,
                                           verbose=1,
                                           save_weights_only=True)
    ]

    with strategy.scope():
        multi_worker_model = build_and_compile_cnn_model()

    # Note: MultiWorkerMirroredStrategy (CollectiveAllReduceStrategy) is synchronous,
    # so all workers must complete training before any of them runs out of RDD data.
    # Since Spark RDD partitions (and partition sizes) may not divide evenly across
    # workers, we simply stop training at 90% of the expected number of steps.
    steps_per_epoch = 60000 / args.batch_size
    steps_per_epoch_per_worker = steps_per_epoch / ctx.num_workers
    max_steps_per_worker = int(steps_per_epoch_per_worker * 0.9)  # Keras expects an integer step count

    multi_worker_model.fit(x=ds,
                           epochs=args.epochs,
                           steps_per_epoch=max_steps_per_worker,
                           callbacks=callbacks)

    from tensorflow_estimator.python.estimator.export import export_lib
    export_dir = export_lib.get_timestamped_export_dir(args.export_dir)
    compat.export_saved_model(multi_worker_model, export_dir,
                              ctx.job_name == 'chief')

    # Terminating the feed tells Spark to skip processing any remaining partitions
    tf_feed.terminate()
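Because this variant reads from a Spark DataFeed, the driver must push an RDD of (image, label) records into the cluster. A minimal sketch, assuming images_labels is such an RDD prepared elsewhere (the name is illustrative):

from tensorflowonspark import TFCluster

# Hypothetical driver side (InputMode.SPARK: partitions are streamed into
# the executors' DataFeed queues until tf_feed.terminate() is called).
cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=0,
                        tensorboard=False,
                        input_mode=TFCluster.InputMode.SPARK,
                        master_node='chief')
cluster.train(images_labels, args.epochs)
cluster.shutdown()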
Example #3
def mainFun(args, ctx):
	import numpy as np
	import tensorflow as tf
	from tensorflow.keras.models import Sequential
	from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, Input, BatchNormalization
	import tensorflow.keras as keras
	from tensorflowonspark import compat, TFNode

	# Setting distributed model strategy
	strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

	def buildAndCompileModel():

		# Initiating model
		model = Sequential()

		# Building model structure
		model.add(Input(shape=(1025, 50, 1)))

		# First convolution and pooling step
		model.add(Conv2D(16, kernel_size=[3,3], activation='relu', data_format='channels_last'))
		model.add(MaxPool2D(pool_size=[3,3], data_format='channels_last'))
		model.add(Dropout(0.2))

		# Second convolution and pooling step
		model.add(Conv2D(32, kernel_size=[3,3], activation='relu', data_format='channels_last'))
		model.add(MaxPool2D(pool_size=[3,3], data_format='channels_last'))
		model.add(Dropout(0.2))

		# Flattening output of convolution to pass on to Dense layers
		model.add(Flatten())
		model.add(BatchNormalization())
		model.add(Dense(128, activation='relu'))
		model.add(BatchNormalization())
		model.add(Dense(128, activation='relu'))

		# Output layer
		model.add(Dense(30, activation='softmax'))

		# Compiling model
		model.compile(
			loss='sparse_categorical_crossentropy',
			optimizer='adam',
			metrics=['accuracy']
		)

		return model

	# Opening up datafeed to iterate over entries
	tfFeed = TFNode.DataFeed(ctx.mgr, False)

	# Function to split data into features and labels
	def rddGenerator():
		while not tfFeed.should_stop():
			batch = tfFeed.next_batch(1)
			if len(batch) > 0:
				example = batch[0]

				# Splitting into X and y
				X = np.array(example[1]).astype(np.float32)
				y = np.array(example[0])

				# Encoding labels (note: np.unique over a single record always yields
				# index 0; see the global label-mapping sketch after this example)
				_, y = np.unique(y, return_inverse=True)
				y = y.astype(np.float32)

				# Adjusting data shape
				X = X.reshape(-1, 50, 1)

				# Returning features and labels as separate arrays
				yield (X, y)
			else:
				return

	# Creating Tensorflow Dataset
	ds = tf.data.Dataset.from_generator(rddGenerator, (tf.float32, tf.float32), (tf.TensorShape([1025, 50, 1]), tf.TensorShape([1])))
	ds = ds.batch(1)

	# Instantiating Model
	with strategy.scope():
		multiWorkerModel = buildAndCompileModel()

	# Defining training parameters, mirroring Example #2:
	# total records / batch size / number of workers, stopping at 90%
	stepsPerEpoch = 600 / 1
	stepsPerWorker = stepsPerEpoch / 1
	maxStepsPerWorker = stepsPerWorker * 0.9

	# Fitting model, stopping at 90% of the expected steps (see note in Example #2)
	multiWorkerModel.fit(x=ds, epochs=2, steps_per_epoch=int(maxStepsPerWorker))

	# Exporting the trained model as a timestamped SavedModel
	from tensorflow_estimator.python.estimator.export import export_lib
	exportDir = export_lib.get_timestamped_export_dir(args.export_dir)
	compat.export_saved_model(multiWorkerModel, exportDir, ctx.job_name == 'chief')

	# Terminating the feed tells Spark to skip processing any remaining partitions
	tfFeed.terminate()
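As noted above, per-record np.unique cannot produce a consistent label encoding across records, so labels should be encoded once, globally, before feeding the cluster. A minimal sketch, assuming an RDD data_rdd of (label, features) rows on the driver (names are illustrative):

# Hypothetical driver-side label encoding, done before feeding the cluster.
labels = sorted(data_rdd.map(lambda row: row[0]).distinct().collect())
label_to_index = {label: i for i, label in enumerate(labels)}
encoded_rdd = data_rdd.map(lambda row: (label_to_index[row[0]], row[1]))
# rddGenerator can then use the pre-encoded label directly:
#   y = np.array([example[0]], dtype=np.float32)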
Example #4
        def _spark_train(args, ctx):
            """Basic linear regression in a distributed TF cluster using InputMode.SPARK"""
            import tensorflow as tf
            from tensorflow.keras.layers import Dense
            from tensorflow.keras.models import Sequential
            from tensorflowonspark import TFNode, compat

            tf.compat.v1.reset_default_graph()
            strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

            with strategy.scope():
                model = Sequential()
                model.add(Dense(1, activation='linear', input_shape=[2]))
                model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.2),
                              loss='mse',
                              metrics=['mse'])
                model.summary()

            tf_feed = TFNode.DataFeed(ctx.mgr,
                                      input_mapping=args.input_mapping)

            def rdd_generator():
                while not tf_feed.should_stop():
                    batch = tf_feed.next_batch(1)
                    if len(batch['x']) > 0:
                        features = batch['x'][0]
                        label = batch['y_'][0]
                        yield (features, label)
                    else:
                        return

            ds = tf.data.Dataset.from_generator(
                rdd_generator, (tf.float32, tf.float32),
                (tf.TensorShape([2]), tf.TensorShape([1])))
            # disable auto-sharding since we're feeding from an RDD generator
            options = tf.data.Options()
            compat.disable_auto_shard(options)
            ds = ds.with_options(options)
            ds = ds.batch(args.batch_size)

            # only train on ~90% of the 1000 records per epoch to account for uneven RDD partition sizes
            steps_per_epoch = int(1000 * 0.9) // (args.batch_size * ctx.num_workers)

            tf.io.gfile.makedirs(args.model_dir)
            filepath = args.model_dir + "/weights-{epoch:04d}"
            callbacks = [
                tf.keras.callbacks.ModelCheckpoint(
                    filepath=filepath,
                    verbose=1,
                    load_weights_on_restart=True,
                    save_weights_only=True)
            ]

            model.fit(ds,
                      epochs=args.epochs,
                      steps_per_epoch=steps_per_epoch,
                      callbacks=callbacks)

            # This fails with: "NotImplementedError: `fit_generator` is not supported for models compiled with tf.distribute.Strategy"
            # model.fit_generator(ds, epochs=args.epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks)

            if args.export_dir:
                print("exporting model to: {}".format(args.export_dir))
                compat.export_saved_model(model, args.export_dir,
                                          ctx.job_name == 'chief')

            tf_feed.terminate()
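After training, the exported SavedModel can be loaded outside Spark for a quick sanity check. A minimal sketch using standard TF 2.x APIs (the path is whatever was passed as args.export_dir):

import numpy as np
import tensorflow as tf

# Hypothetical inference check against the exported linear model.
model = tf.keras.models.load_model(args.export_dir)
print(model.predict(np.array([[1.0, 2.0]], dtype=np.float32)))  # shape (1, 1)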
Example #5
def main_fun(args, ctx):
  """Example demonstrating loading TFRecords directly from disk (e.g. HDFS) without tensorflow_datasets."""
  import tensorflow as tf
  from tensorflowonspark import compat

  strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

  BUFFER_SIZE = args.buffer_size
  BATCH_SIZE = args.batch_size
  NUM_WORKERS = args.cluster_size

  # parser for TFRecords downloaded by tensorflow_datasets
  # these are images + labels, where the images are just serialized PNGs
  def parse_tfds(x):
    feature_def = {"label": tf.io.FixedLenFeature(1, tf.int64), "image": tf.io.VarLenFeature(tf.string)}
    example = tf.io.parse_single_example(x, feature_def)
    image = tf.io.decode_image(example['image'].values[0]) / 255
    image.set_shape([28, 28, 1])     # fix for https://github.com/tensorflow/tensorflow/issues/24520
    label = example['label']
    return (image, label)

  # parser for TFRecords generated by ${TFoS_HOME}/examples/mnist/mnist_data_setup.py
  # these are images + labels, where each image is a flattened array of ints
  def parse_tfos(example_proto):
    feature_def = {"label": tf.io.FixedLenFeature(10, tf.int64),
                   "image": tf.io.FixedLenFeature(28 * 28 * 1, tf.int64)}
    features = tf.io.parse_single_example(example_proto, feature_def)
    image = tf.cast(features['image'], tf.float32) / 255
    image = tf.reshape(image, (28, 28, 1))
    label = tf.math.argmax(features['label'], output_type=tf.int32)
    return (image, label)

  # Dataset for input data
  # tfds: /path/to/tensorflow_datasets/mnist/1.0.0/mnist-train.tfrecord*
  # tfos: /path/to/mnist/tfr/train/part-r-*
  image_pattern = ctx.absolute_path(args.images_labels)

  ds = tf.data.Dataset.list_files(image_pattern)
  ds = ds.repeat(args.epochs).shuffle(BUFFER_SIZE)
  ds = ds.interleave(tf.data.TFRecordDataset)

  if args.data_format == 'tfds':
    train_datasets_unbatched = ds.map(parse_tfds)
  else:  # 'tfos'
    train_datasets_unbatched = ds.map(parse_tfos)

  def build_and_compile_cnn_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    model.compile(
        loss=tf.keras.losses.sparse_categorical_crossentropy,
        optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
        metrics=['accuracy'])
    return model

  # single node
  # single_worker_model = build_and_compile_cnn_model()
  # single_worker_model.fit(x=train_datasets, epochs=3)

  # The batch size scales up by the number of workers, since
  # `tf.data.Dataset.batch` expects the global batch size. For example, with a
  # per-worker batch size of 64 and two workers, the global batch size is 128.
  GLOBAL_BATCH_SIZE = BATCH_SIZE * NUM_WORKERS
  train_datasets = train_datasets_unbatched.batch(GLOBAL_BATCH_SIZE)

  # Note: checkpointing directly to the model directory fails, so we write
  # per-epoch weight files under it instead.
  # callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=args.model_dir)]
  tf.io.gfile.makedirs(args.model_dir)
  filepath = args.model_dir + "/weights-{epoch:04d}"
  callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=filepath, verbose=1, save_weights_only=True)]

  # Note: if your part files have an uneven number of records, you may see an "Out of Range"
  # exception before reaching the expected steps_per_epoch, because the executor with the
  # fewest records will finish first.
  steps_per_epoch = 60000 // GLOBAL_BATCH_SIZE

  with strategy.scope():
    multi_worker_model = build_and_compile_cnn_model()
  multi_worker_model.fit(x=train_datasets, epochs=args.epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks)

  from tensorflow_estimator.python.estimator.export import export_lib
  export_dir = export_lib.get_timestamped_export_dir(args.export_dir)
  compat.export_saved_model(multi_worker_model, export_dir, ctx.job_name == 'chief')
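Before launching the full cluster, either parser can be smoke-tested locally against a single TFRecord file. A minimal sketch, assuming parse_tfos from above is in scope (the path is illustrative):

import tensorflow as tf

# Hypothetical local check of the parse_tfos parser on one part file.
ds = tf.data.TFRecordDataset("/path/to/mnist/tfr/train/part-r-00000")
for image, label in ds.map(parse_tfos).take(1):
    print(image.shape, label.numpy())  # expect (28, 28, 1) and an int in [0, 9]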