def test_safe_shell_exec_interrupts_on_parent_shutdown(self):
    sleep = 20
    parent_script = os.path.join(os.path.dirname(__file__), 'data/run_safe_shell_exec.py')
    child_script = os.path.join(os.path.dirname(__file__), 'data/sleep.py')

    def get_pid(logfile):
        # Wait until the script has written its PID to the logfile
        wait(lambda: os.path.exists(logfile), timeout=5)
        with open(logfile, 'r') as f:
            return int(f.read())

    with temppath() as parent_logfile, temppath() as child_logfile:
        # It's important that this executes in an entirely different interpreter with as little
        # shared state as possible, to avoid issues with the semaphore tracker.
        cmd = ' '.join([sys.executable, parent_script, parent_logfile,
                        child_script, str(sleep), child_logfile])
        p = subprocess.Popen(cmd, shell=True)

        parent = psutil.Process(get_pid(parent_logfile))
        child = psutil.Process(get_pid(child_logfile))

        self.assertTrue(parent.is_running())
        self.assertTrue(child.is_running())

        # Hard kill the parent process
        parent.kill()
        parent.wait(timeout=safe_shell_exec.GRACEFUL_TERMINATION_TIME_S)
        p.wait()

        # Child process will exit when pipe breaks
        child.wait(timeout=2 * safe_shell_exec.GRACEFUL_TERMINATION_TIME_S + 1)

        self.assertFalse(parent.is_running())
        self.assertFalse(child.is_running())

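# The test above relies on two helper scripts that are not shown here. Below is a
# hypothetical sketch of what the 'data/sleep.py' child might do, inferred from the
# command line built above ('<sleep seconds> <child logfile>'): record its own PID
# so get_pid() can find it, then sleep. This is an assumption for illustration, not
# the repository's actual helper script.
def _sleep_script_sketch(seconds, logfile):
    import os
    import time
    # Record this process's PID so the test's get_pid() helper can locate it
    with open(logfile, 'w') as f:
        f.write(str(os.getpid()))
    # Sleep long enough for the test to kill the parent and observe the child exit
    time.sleep(seconds)
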
@contextlib.contextmanager
def spark_session(app, cores=2, gpus=0, *args):
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    master = 'local-cluster[{},1,1024]'.format(cores) if gpus > 0 else 'local[{}]'.format(cores)
    conf = SparkConf().setAppName(app).setMaster(master)

    with temppath() as temp_filename:
        if gpus > 0:
            with open(temp_filename, 'wb') as temp_file:
                addresses = ', '.join('\\"{}\\"'.format(i) for i in range(gpus))
                temp_file.write(b'echo {\\"name\\": \\"gpu\\", \\"addresses\\": [' +
                                addresses.encode('ascii') + b']}')

            os.chmod(temp_file.name, stat.S_IRWXU | stat.S_IXGRP | stat.S_IRGRP |
                     stat.S_IROTH | stat.S_IXOTH)

            conf = conf.set("spark.test.home", os.environ.get('SPARK_HOME'))
            conf = conf.set("spark.worker.resource.gpu.discoveryScript", temp_filename)
            conf = conf.set("spark.worker.resource.gpu.amount", 1)
            conf = conf.set("spark.task.resource.gpu.amount", "1")
            conf = conf.set("spark.executor.resource.gpu.amount", "1")

        session = SparkSession \
            .builder \
            .config(conf=conf) \
            .getOrCreate()

        try:
            yield session
        finally:
            session.stop()

def test_load_model_custom_optimizers(self):
    class TestOptimizer(keras.optimizers.RMSprop):
        def __init__(self, **kwargs):
            super(TestOptimizer, self).__init__(**kwargs)

    with self.test_session(config=self.config) as sess:
        K.set_session(sess)

        opt = TestOptimizer(lr=0.0001)
        opt = hvd.DistributedOptimizer(opt)

        model = keras.models.Sequential()
        model.add(keras.layers.Dense(2, input_shape=(3,)))
        model.add(keras.layers.RepeatVector(3))
        model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
        model.compile(loss=keras.losses.MSE,
                      optimizer=opt,
                      metrics=[keras.metrics.categorical_accuracy],
                      sample_weight_mode='temporal')

        x = np.random.random((1, 3))
        y = np.random.random((1, 3, 3))
        model.train_on_batch(x, y)

        with temppath() as fname:
            model.save(fname)

            custom_optimizers = [TestOptimizer]
            new_model = hvd.load_model(fname, custom_optimizers=custom_optimizers)
            new_opt = new_model.optimizer

            self.assertEqual(type(new_opt).__module__, 'horovod._keras')
            self.assertEqual(type(new_opt).__name__, 'TestOptimizer')
            self._check_optimizer_weights(opt, new_opt)

def test_load_model(self):
    with self.test_session(config=self.config) as sess:
        K.set_session(sess)

        opt = keras.optimizers.RMSprop(lr=0.0001)
        opt = hvd.DistributedOptimizer(opt)

        model = keras.models.Sequential()
        model.add(keras.layers.Dense(2, input_shape=(3,)))
        model.add(keras.layers.RepeatVector(3))
        model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
        model.compile(loss=keras.losses.MSE,
                      optimizer=opt,
                      metrics=[keras.metrics.categorical_accuracy],
                      sample_weight_mode='temporal')

        x = np.random.random((1, 3))
        y = np.random.random((1, 3, 3))
        model.train_on_batch(x, y)

        with temppath() as fname:
            model.save(fname)

            new_model = hvd.load_model(fname)
            new_opt = new_model.optimizer

            self.assertEqual(type(new_opt).__module__, 'horovod._keras')
            self.assertEqual(type(new_opt).__name__, 'RMSprop')
            self.assertEqual(K.get_value(opt.lr), K.get_value(new_opt.lr))
            self._check_optimizer_weights(opt, new_opt)

def test_load_model_broadcast(self):
    def create_model():
        opt = keras.optimizers.SGD(lr=0.01 * hvd.size(), momentum=0.9)
        opt = hvd.DistributedOptimizer(opt)

        model = keras.models.Sequential()
        model.add(keras.layers.Dense(2, input_shape=(3,)))
        model.add(keras.layers.RepeatVector(3))
        model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
        model.compile(loss=keras.losses.MSE,
                      optimizer=opt,
                      metrics=[keras.metrics.categorical_accuracy],
                      sample_weight_mode='temporal')

        return model

    with temppath() as fname:
        with self.session(config=self.config) as sess:
            K.set_session(sess)

            model = create_model()

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            if hvd.rank() == 0:
                model.save(fname)

        K.clear_session()

        with self.session(config=self.config) as sess:
            K.set_session(sess)

            weight = np.random.random((1, 3))

            if hvd.rank() == 0:
                model = hvd.load_model(fname)
            else:
                model = create_model()

            def generator():
                while 1:
                    yield (x, y, weight)

            if hvd.rank() == 0:
                self.assertEqual(len(model.optimizer.weights), 5)
            else:
                self.assertEqual(len(model.optimizer.weights), 0)

            # No assertions, we just need to verify that it doesn't hang
            callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
            model.fit_generator(generator(),
                                steps_per_epoch=1,
                                callbacks=callbacks,
                                epochs=1,
                                verbose=0,
                                workers=4,
                                initial_epoch=0)

            self.assertEqual(len(model.optimizer.weights), 5)

def test_generate_jsrun_rankfile(self):
    settings = hvd_settings.Settings(
        num_proc=5,
        hosts='host1:4,host2:4,host3:4',
    )

    with temppath() as rankfile_path:
        rankfile_path = generate_jsrun_rankfile(settings, rankfile_path)

        with open(rankfile_path, 'r') as file:
            gen_rankfile = file.read()

        expected_rankfile = (
"""overlapping_rs: allow
cpu_index_using: logical

rank: 0: { hostname: host1; cpu: {0-3} ; gpu: * ; mem: * }
rank: 1: { hostname: host1; cpu: {4-7} ; gpu: * ; mem: * }
rank: 2: { hostname: host1; cpu: {8-11} ; gpu: * ; mem: * }
rank: 3: { hostname: host1; cpu: {12-15} ; gpu: * ; mem: * }

rank: 4: { hostname: host2; cpu: {0-3} ; gpu: * ; mem: * }
""")

        self.assertMultiLineEqual(gen_rankfile, expected_rankfile)

def test_horovodrun_hostfile(self):
    with temppath() as host_filename:
        with open(host_filename, 'w+') as fp:
            fp.write('172.31.32.7 slots=8\n')
            fp.write('172.31.33.9 slots=8\n')

        hosts = parse_host_files(host_filename)
        self.assertEqual(hosts, '172.31.32.7:8,172.31.33.9:8')

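# A minimal sketch of the hostfile parsing behaviour that the test above expects.
# This is an illustration only, not Horovod's actual parse_host_files implementation,
# and the function name is hypothetical: each 'hostname slots=N' line becomes
# 'hostname:N', and the results are comma-joined.
def _parse_host_files_sketch(filename):
    hosts = []
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            hostname, slots = line.split()
            hosts.append('{}:{}'.format(hostname, slots.split('=')[1]))
    return ','.join(hosts)
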
def _run(self, discovery_schedule=None, exit_schedule=None, exit_mode='exception',
         np=2, min_np=2, max_np=4, hosts=None):
    if not discovery_schedule and not hosts:
        raise ValueError('at least one of discovery schedule or hosts must be given')

    with temppath() as logfile:
        with _temp_discovery_script(logfile, discovery_schedule or [(None, hosts.split(','))]) \
                as discovery_script:
            command_args = ['horovodrun',
                            '-np', str(np),
                            '--min-np', str(min_np),
                            '--log-level', 'DEBUG']
            if hosts is not None:
                command_args += ['-H', hosts]
            else:
                command_args += ['--host-discovery-script', discovery_script,
                                 '--max-np', str(max_np)]
            command_args += ['python', self._training_script, '--logfile', logfile]
            if discovery_schedule:
                command_args += ['--discovery-schedule', json.dumps(discovery_schedule)]
            if exit_schedule:
                command_args += ['--exit-schedule', json.dumps(exit_schedule),
                                 '--exit-mode', exit_mode]
            print(' '.join(command_args))

            with override_args(*command_args):
                args = parse_args()
                env = {}
                config_parser.set_env_from_args(env, args)
                _run_elastic(args)

                with open(logfile, 'r') as f:
                    lines = f.readlines()

                print('logfile:')
                for line in lines:
                    print(line)

                return [json.loads(line) for line in lines]

def _run(self, discovery_schedule=None, exit_schedule=None, hosts=None, discovery_wait=10,
         epoch_wait=None, epochs=None, num_proc=2, min_num_proc=None, max_num_proc=None,
         extra_conf=None):
    with temppath() as logfile:
        with spark_cluster(logfile=logfile, discovery_schedule=discovery_schedule,
                           hosts=hosts, extra_conf=extra_conf):
            command = [sys.executable, self._training_script, '--logfile', logfile]
            if discovery_schedule:
                command += ['--discovery-schedule', "'{}'".format(json.dumps(discovery_schedule)),
                            '--discovery-wait', str(discovery_wait)]
            if exit_schedule:
                command += ['--exit-schedule', "'{}'".format(json.dumps(exit_schedule))]
            if epochs:
                command += ['--epochs', str(epochs)]
            if epoch_wait:
                command += ['--epoch-wait', str(epoch_wait)]

            cmd = ' '.join(command)
            run_elastic(self._exec, (cmd,),
                        env={'HOROVOD_LOG_LEVEL': 'DEBUG'},
                        num_proc=num_proc,
                        min_num_proc=min_num_proc,
                        max_num_proc=max_num_proc,
                        stdout=sys.stdout,
                        stderr=sys.stderr,
                        start_timeout=10,
                        elastic_timeout=10,
                        verbose=2,
                        prefix_output_with_timestamp=True)

            with open(logfile, 'r') as f:
                lines = f.readlines()

            print('logfile:')
            for line in lines:
                print(line)

            return [json.loads(line) for line in lines]

@contextlib.contextmanager
def spark_session(app, cores=2, gpus=0, max_failures=1, *args):
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    with TemporaryDirectory() as tmpdir:
        metastore_path = os.path.join(tmpdir, 'metastore')

        # start a single worker with the given cores when gpus are present;
        # max failures are ignored in that case
        master = 'local-cluster[1,{},1024]'.format(cores) if gpus > 0 \
            else 'local[{},{}]'.format(cores, max_failures)
        conf = SparkConf().setAppName(app).setMaster(master)
        conf = conf.setAll([
            ('spark.ui.showConsoleProgress', 'false'),
            ('spark.test.home', os.environ.get('SPARK_HOME')),
            ('spark.locality.wait', '0'),
            ('spark.unsafe.exceptionOnMemoryLeak', 'true'),
            ('spark.ui.enabled', 'false'),
            ('spark.local.dir', os.path.join(tmpdir, 'tmp')),
            ('spark.sql.warehouse.dir', os.path.join(tmpdir, 'warehouse')),
            ('javax.jdo.option.ConnectionURL',
             f'jdbc:derby:;databaseName={metastore_path};create=true'),
        ])

        with temppath() as temp_filename:
            if gpus > 0:
                with open(temp_filename, 'wb') as temp_file:
                    addresses = ', '.join('\\"{}\\"'.format(i) for i in range(gpus))
                    temp_file.write(b'echo {\\"name\\": \\"gpu\\", \\"addresses\\": [' +
                                    addresses.encode('ascii') + b']}')

                os.chmod(temp_file.name, stat.S_IRWXU | stat.S_IXGRP | stat.S_IRGRP |
                         stat.S_IROTH | stat.S_IXOTH)

                # the single worker takes all gpus discovered, and a single executor will get them
                # each task on that executor will get a single gpu
                conf = conf.setAll([
                    ('spark.worker.resource.gpu.discoveryScript', temp_filename),
                    ('spark.worker.resource.gpu.amount', str(gpus)),
                    ('spark.task.resource.gpu.amount', '1'),
                    ('spark.executor.resource.gpu.amount', str(gpus)),
                ])

            session = SparkSession \
                .builder \
                .config(conf=conf) \
                .getOrCreate()

            try:
                yield session
            finally:
                session.stop()

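# Illustrative usage of the spark_session helper above, assuming it is wrapped with
# contextlib.contextmanager as the with-statements elsewhere in these tests imply.
# The function name and app name here are arbitrary, for demonstration only.
def _spark_session_usage_sketch():
    with spark_session('usage_sketch', cores=2) as spark:
        # a trivial query to confirm the local session is live
        assert spark.range(10).count() == 10
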
@contextlib.contextmanager
def _temp_discovery_script(logfile, discovery_schedule):
    with temppath() as discovery_script:
        with open(discovery_script, 'w') as f:
            f.write(DISCOVERY_SCRIPT_TEMPLATE.format(logfile=logfile) + os.linesep)
            for i, schedule_step in enumerate(discovery_schedule):
                f.write(_get_discovery_lines(schedule_step,
                                             start=i == 0,
                                             end=i == len(discovery_schedule) - 1))
        os.chmod(discovery_script, 0o755)
        yield discovery_script

def test_timeline(self):
    with temppath() as t:
        with env(HOROVOD_TIMELINE=t, HOROVOD_TIMELINE_MARK_CYCLES='1'):
            hvd.init()

            # Perform a simple allreduce operation
            hvd.allreduce(torch.tensor([1, 2, 3], dtype=torch.float32), name='test_allreduce')

            # Wait for it to register in the timeline.
            time.sleep(0.1)

            if hvd.rank() == 0:
                with open(t, 'r') as tf:
                    timeline_text = tf.read()
                    assert 'allreduce.test_allreduce' in timeline_text, timeline_text
                    assert 'NEGOTIATE_ALLREDUCE' in timeline_text, timeline_text
                    assert 'ALLREDUCE' in timeline_text, timeline_text
                    assert 'CYCLE_START' in timeline_text, timeline_text

def test_model_serialization(self, mock_remote_trainer):
    model = create_xor_model()
    optimizer = tf.keras.optimizers.SGD(lr=0.1)
    loss = 'binary_crossentropy'

    def train(serialized_model, train_rows, val_rows, avg_row_size):
        return None, serialized_model, 2

    mock_remote_trainer.return_value = train

    with spark_session('test_model_serialization') as spark:
        df = create_xor_data(spark)

        keras_estimator = hvd.KerasEstimator(
            model=model,
            optimizer=optimizer,
            loss=loss,
            feature_cols=['features'],
            label_cols=['y'],
            batch_size=1,
            epochs=3,
            verbose=2)

        backend = CallbackBackend()
        with local_store() as store:
            with temppath() as saved_path:
                keras_estimator.save(saved_path)
                keras_estimator_loaded = hvd.KerasEstimator.load(saved_path)

                keras_model = keras_estimator_loaded.fit(df, params={
                    keras_estimator_loaded.backend: backend,
                    keras_estimator_loaded.store: store
                })

                trained_model = keras_model.getModel()
                pred = trained_model.predict([np.ones([1, 2], dtype=np.float32)])
                assert len(pred) == 1
                assert pred.dtype == np.float32