def test_spark_task_cuda_devices_env_support(num_workers, num_gpus_per_worker):
    """Each run must expose exactly ``num_slots`` distinct GPU ids via CUDA_VISIBLE_DEVICES.

    The allowed ids 10-13 presumably come from the cluster fixture's GPU
    addresses — confirm against the test harness configuration.
    """
    def train_fn():
        import os
        return os.environ['CUDA_VISIBLE_DEVICES']

    for slot_count in (2, 3, 4):
        cuda_env = MirroredStrategyRunner(num_slots=slot_count).run(train_fn)
        allocated = {int(device) for device in cuda_env.split(',')}
        # Distinct ids only, and exactly as many as slots requested.
        assert len(allocated) == slot_count
        for device in allocated:
            assert device in [10, 11, 12, 13]
def test_cpu_training_with_gpus(num_workers, num_gpus_per_worker):
    """With ``use_gpu=False`` no task may see any CUDA device, even on a GPU cluster.

    Each barrier task reports how many GPUs it can see; every task must
    report zero.
    """
    def train_fn():
        # Import os locally so the serialized closure is self-contained on
        # executors, matching every other train_fn in this suite.
        import os
        from pyspark import BarrierTaskContext
        context = BarrierTaskContext.get()
        cuda_state = os.environ['CUDA_VISIBLE_DEVICES']
        # An empty CUDA_VISIBLE_DEVICES means zero GPUs, not one.
        num_gpus = len(cuda_state.split(',')) if cuda_state else 0
        return [int(e) for e in context.allGather(str(num_gpus))]

    runner = MirroredStrategyRunner(num_slots=2, use_gpu=False)
    assert runner.get_num_tasks() == 2
    gpus_used_by_each_task = runner.run(train_fn)
    assert gpus_used_by_each_task == [0, 0]
def test_equal_gpu_allocation(num_workers, num_gpus_per_worker):
    """GPUs must be spread as evenly as possible across the barrier tasks."""
    def train_fn():
        import os
        from pyspark import BarrierTaskContext
        context = BarrierTaskContext.get()
        cuda_state = os.environ['CUDA_VISIBLE_DEVICES']
        visible = len(cuda_state.split(',')) if cuda_state else 0
        return [int(e) for e in context.allGather(str(visible))]

    for requested_slots in (2, 4, 6, 8):
        runner = MirroredStrategyRunner(num_slots=requested_slots)
        gpus_per_task = int(
            runner.sc.getConf().get('spark.task.resource.gpu.amount'))
        task_count = math.ceil(requested_slots / gpus_per_task)
        assert runner.get_num_tasks() == task_count
        observed = runner.run(train_fn)
        # Even split: the first `remainder` tasks get one extra GPU.
        base, remainder = divmod(requested_slots, task_count)
        expected = [base + (idx < remainder) for idx in range(task_count)]
        assert observed == expected
def test_local_run(num_workers, num_gpus_per_worker, num_slots, old_cuda_state):
    """``local_mode=True`` must grant ``num_slots`` GPUs on the driver and
    restore CUDA_VISIBLE_DEVICES to its prior value afterwards.
    """
    def train_fn():
        import os
        return len(os.environ['CUDA_VISIBLE_DEVICES'].split(','))

    if old_cuda_state is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = old_cuda_state
    result = MirroredStrategyRunner(num_slots=num_slots,
                                    local_mode=True,
                                    gpu_resource_name='gpu').run(train_fn)
    # (Removed dead local `gpus_on_the_driver`; it was computed but never used.)
    assert result == num_slots
    # The runner rewrites CUDA_VISIBLE_DEVICES during the run; it must put
    # the original value (or absence) back.
    new_cuda_state = os.environ.get('CUDA_VISIBLE_DEVICES')
    assert old_cuda_state == new_cuda_state
def test_local_run(num_workers, num_gpus_per_worker, num_slots, old_cuda_state):
    """Local-mode runs must pick device ids from the pre-set CUDA pool (or the
    default 0-3 pool when unset) and leave the environment as it found it.
    """
    def train_fn():
        import os
        return os.environ['CUDA_VISIBLE_DEVICES']

    if old_cuda_state is not None:
        patched_env = {'CUDA_VISIBLE_DEVICES': old_cuda_state}
    else:
        patched_env = {}
    with mock.patch.dict(os.environ, patched_env, clear=True):
        cuda_env = MirroredStrategyRunner(
            num_slots=num_slots, local_mode=True,
            gpu_resource_name='gpu').run(train_fn)
        visible = {int(device) for device in cuda_env.split(',')}
        assert len(visible) == num_slots
        for device in visible:
            # Pre-set env restricts the pool to 10-13; otherwise ids 0-3.
            if old_cuda_state is not None:
                assert device in [10, 11, 12, 13]
            else:
                assert device in [0, 1, 2, 3]
        # Still inside the patch: the runner itself must have restored the var.
        new_cuda_state = os.environ.get('CUDA_VISIBLE_DEVICES')
        assert old_cuda_state == new_cuda_state
def test_run_on_ssl_cluster_override(num_workers, num_gpus_per_worker, extra_spark_configs):
    """With the override fixture in place, running on an SSL cluster succeeds."""
    runner = MirroredStrategyRunner(num_slots=2, gpu_resource_name='gpu')
    runner.run(lambda: None)
def test_equal_gpu_allocation(num_workers, num_gpus_per_worker):
    """GPUs must be spread evenly across tasks for several slot counts.

    Refactored from four copy-pasted stanzas into one table-driven loop;
    the expected task counts and per-task GPU splits are unchanged.
    """
    def train_fn():
        import os
        from pyspark import BarrierTaskContext
        context = BarrierTaskContext.get()
        cuda_state = os.environ['CUDA_VISIBLE_DEVICES']
        # Empty CUDA_VISIBLE_DEVICES means zero GPUs, not one.
        num_gpus = len(cuda_state.split(',')) if cuda_state else 0
        return [int(e) for e in context.allGather(str(num_gpus))]

    # (num_slots, expected task count, expected GPUs seen by each task)
    cases = [
        (2, 1, [2]),
        (4, 1, [4]),
        (6, 2, [3, 3]),
        (8, 2, [4, 4]),
    ]
    for num_slots, expected_tasks, expected_gpus in cases:
        runner = MirroredStrategyRunner(num_slots=num_slots)
        assert runner.get_num_tasks() == expected_tasks
        gpus_used_by_each_task = runner.run(train_fn)
        assert gpus_used_by_each_task == expected_gpus
def test_run_on_ssl_cluster(num_workers, num_gpus_per_worker, extra_spark_configs):
    """Without an override, a run on an SSL-enabled cluster must raise."""
    def noop():
        return None

    # Construction and run both stay inside the context: either step may
    # be the one that raises.
    with pytest.raises(Exception):
        MirroredStrategyRunner(num_slots=2, gpu_resource_name='gpu').run(noop)
def test_zero_num_slots(num_workers, num_gpus_per_worker):
    """``num_slots=0`` is invalid and must raise ValueError.

    Dropped the useless ``result =`` binding: the call is expected to
    raise, so nothing is ever assigned (flake8 F841).
    """
    with pytest.raises(ValueError):
        MirroredStrategyRunner(num_slots=0).run(lambda: None)
dataset = tf.data.Dataset.from_tensor_slices(( tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), tf.cast(mnist_labels, tf.int64)) ) dataset = dataset.repeat().shuffle(BUFFER_SIZE).batch(BATCH_SIZE) return dataset def build_and_compile_cnn_model(): model = tf.keras.Sequential([ tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)), tf.keras.layers.MaxPooling2D(), tf.keras.layers.Flatten(), tf.keras.layers.Dense(64, activation='relu'), tf.keras.layers.Dense(10, activation='softmax'), ]) model.compile( loss=tf.keras.losses.sparse_categorical_crossentropy, optimizer=tf.keras.optimizers.SGD(learning_rate=0.001), metrics=['accuracy'], ) return model train_datasets = make_datasets() options = tf.data.Options() options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA train_datasets = train_datasets.with_options(options) multi_worker_model = build_and_compile_cnn_model() multi_worker_model.fit(x=train_datasets, epochs=3, steps_per_epoch=5) MirroredStrategyRunner(num_slots=8).run(train)
def distributed_train(self, train_datasets):
    """Stash the datasets on the instance, then run ``self.train`` across 8 slots."""
    # self.train presumably reads self.train_datasets on the workers —
    # confirm against the train implementation.
    self.train_datasets = train_datasets
    runner = MirroredStrategyRunner(num_slots=8)
    runner.run(self.train)
activation='relu', kernel_initializer='he_uniform')) model2.add(tf.keras.layers.Dense(10, activation='softmax')) model2.compile( loss=tf.keras.losses.sparse_categorical_crossentropy, optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'], ) return model2 train_datasets = make_datasets() options = tf.data.Options() options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA train_datasets = train_datasets.with_options(options) multi_worker_model = build_and_compile_cnn_model() multi_worker_model.fit(x=train_datasets, epochs=10, steps_per_epoch=60000 // 32) return multi_worker_model.get_weights() spark = SparkSession.builder.master("spark://172.31.0.101:7077").appName("distributedTrain")\ .config("spark.driver.memory" , "2g")\ .config("spark.executor.memory" , "2g").enableHiveSupport().getOrCreate() sc = spark.sparkContext sc.setLogLevel("Error") weights = MirroredStrategyRunner(num_slots=2, spark=spark, use_gpu=False).run(train) model = build_and_compile_cnn_model() model.set_weights(weights) model.save("./trained_model.h5")
activation='relu', kernel_initializer='he_uniform')) model2.add(tf.keras.layers.Dense(10, activation='softmax')) model2.compile( loss=tf.keras.losses.sparse_categorical_crossentropy, optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'], ) return model2 train_datasets = make_datasets() options = tf.data.Options() options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA train_datasets = train_datasets.with_options(options) multi_worker_model = build_and_compile_cnn_model() multi_worker_model.fit(x=train_datasets, epochs=1, steps_per_epoch=60000 // 32) return multi_worker_model.get_weights() spark = SparkSession.builder.master("spark://192.168.1.38:7077").appName("distributedTrain")\ .config("spark.driver.memory" , "2g")\ .config("spark.executor.memory" , "2g").enableHiveSupport().getOrCreate() sc = spark.sparkContext sc.setLogLevel("Error") weights = MirroredStrategyRunner(num_slots=sc.defaultParallelism, spark=spark, use_gpu=False).run(train) model = build_and_compile_cnn_model() model.set_weights(weights) model.save("./trained_model.h5")