def gloo_run_elastic(settings, driver, env, stdout=None, stderr=None):
    """
    Run distributed gloo jobs.

    :param settings: Settings for running the distributed jobs.
                     Note: settings.num_proc and settings.hosts must not be None.
    :param driver: The Spark driver service that tasks are connected to.
    :param env: Environment dictionary to use for running gloo jobs.  Can be None.
    :param stdout: Horovod stdout is redirected to this stream.
    :param stderr: Horovod stderr is redirected to this stream.
    """
    if env is None:
        env = {}

    # Each thread will use SparkTaskClient to launch the job on each remote host. If an
    # error occurs in one thread, the entire process will be terminated. Otherwise,
    # threads will keep running, and the launched tasks stay bound to them.
    command = (sys.executable, '-m', 'horovod.spark.task.gloo_exec_fn',
               codec.dumps_base64(driver.addresses()),
               codec.dumps_base64(settings))

    # Pass the secret key through the environment variables.
    env[secret.HOROVOD_SECRET_KEY] = codec.dumps_base64(settings.key)

    # Get the common network interfaces from the driver.
    nics = driver.get_common_interfaces()

    exec_command = _exec_command_fn(driver, settings.key, settings, env,
                                    stdout, stderr,
                                    settings.prefix_output_with_timestamp)
    rendezvous = SparkRendezvousServer(driver, settings.verbose)
    launch_gloo_elastic(command, exec_command, settings, env, lambda _: nics, rendezvous)
def mpi_run(settings, nics, driver, env, stdout=None, stderr=None):
    """
    Runs mpirun.

    :param settings: Settings for running MPI.
                     Note: settings.num_proc and settings.hosts must not be None.
    :param nics: Interfaces to include by MPI.
    :param driver: The Spark driver service that tasks are connected to.
    :param env: Environment dictionary to use for running MPI.  Can be None.
    :param stdout: Stdout of the mpi process.
                   Only used when settings.run_func_mode is True.
    :param stderr: Stderr of the mpi process.
                   Only used when settings.run_func_mode is True.
    """
    env = {} if env is None else copy.copy(env)  # copy env so we do not leak env modifications

    # Pass secret key through the environment variables.
    env[secret.HOROVOD_SECRET_KEY] = codec.dumps_base64(settings.key)
    # we don't want the key to be serialized along with settings from here on
    settings.key = None

    rsh_agent = (sys.executable,
                 '-m', 'horovod.spark.driver.mpirun_rsh',
                 codec.dumps_base64(driver.addresses()),
                 codec.dumps_base64(settings))
    settings.extra_mpi_args = ('{extra_mpi_args} -x NCCL_DEBUG=INFO -mca plm_rsh_agent "{rsh_agent}"'
                               .format(extra_mpi_args=settings.extra_mpi_args if settings.extra_mpi_args else '',
                                       rsh_agent=' '.join(rsh_agent)))
    command = (sys.executable,
               '-m', 'horovod.spark.task.mpirun_exec_fn',
               codec.dumps_base64(driver.addresses()),
               codec.dumps_base64(settings))
    hr_mpi_run(settings, nics, env, command, stdout=stdout, stderr=stderr)
def gloo_run(settings, nics, driver, env, stdout=None, stderr=None):
    """
    Run distributed gloo jobs.

    :param settings: Settings for running the distributed jobs.
                     Note: settings.num_proc and settings.hosts must not be None.
    :param nics: Interfaces to use by gloo.
    :param driver: The Spark driver service that tasks are connected to.
    :param env: Environment dictionary to use for running gloo jobs.  Can be None.
    :param stdout: Horovod stdout is redirected to this stream.
    :param stderr: Horovod stderr is redirected to this stream.
    """
    if env is None:
        env = {}

    # We don't want the key to be serialized along with settings from here on.
    key = settings.key
    settings.key = None

    # Each thread will use SparkTaskClient to launch the job on each remote host. If an
    # error occurs in one thread, the entire process will be terminated. Otherwise,
    # threads will keep running, and the launched tasks stay bound to them.
    iface = list(nics)[0]
    server_ip = driver.addresses()[iface][0][0]
    command = (sys.executable, '-m', 'horovod.spark.task.gloo_exec_fn',
               codec.dumps_base64(driver.addresses()),
               codec.dumps_base64(settings))

    exec_command = _exec_command_fn(driver, key, settings, env,
                                    stdout, stderr,
                                    settings.prefix_output_with_timestamp)
    launch_gloo(command, exec_command, settings, nics, {}, server_ip)
def _serialize(model):
    """Serialize model into byte array encoded into base 64."""
    if is_module_available('torch'):
        import torch
        sys.modules["torch._C._nn"] = torch.nn.functional
    serialized_obj = codec.dumps_base64(model)
    return serialized_obj
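# Illustrative counterpart (a sketch, not taken from the snippet above): decoding a model that was
# encoded with codec.dumps_base64. The helper name `_deserialize` is an assumption; codec.loads_base64
# reverses the pickle + base64 encoding.
def _deserialize(model_bytes_base64):
    """Deserialize a base64-encoded model produced by _serialize (hypothetical helper)."""
    if is_module_available('torch'):
        import torch
        # Mirror the module alias set during serialization so torch internals unpickle cleanly.
        sys.modules["torch._C._nn"] = torch.nn.functional
    return codec.loads_base64(model_bytes_base64)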
def _load_model_from_checkpoint(self, run_id):
    store = self.getStore()
    last_ckpt_path = store.get_checkpoint_path(run_id)

    if self.getVerbose():
        print('Resuming training from last checkpoint: {}'.format(last_ckpt_path))

    model_bytes = store.read(last_ckpt_path)
    return codec.dumps_base64(model_bytes)
def _torch_param_serialize(param_name, param_val):
    if param_name in [EstimatorParams.backend.name, EstimatorParams.store.name]:
        # We do not serialize backend and store. These params have to be regenerated for each
        # run of the pipeline.
        return None

    if param_val is None:
        return None

    return codec.dumps_base64(param_val)
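# Sketch of the matching deserialization step (the helper name is hypothetical): params that were
# skipped during serialization (backend, store) stay None and are re-created for each run.
def _torch_param_deserialize(param_name, serialized_val):
    if serialized_val is None:
        return None
    return codec.loads_base64(serialized_val)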
def put_data_into_kvstore(addr, port, scope, key, value):
    try:
        url = "http://{addr}:{port}/{scope}/{key}".format(addr=addr, port=str(port),
                                                          scope=scope, key=key)
        req = Request(url, data=codec.dumps_base64(value, to_ascii=False))
        req.get_method = lambda: "PUT"  # for urllib2 compatibility
        urlopen(req)
    except (HTTPError, URLError) as e:
        raise RuntimeError("Put data into KVStore server failed.", e)
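# A matching read helper, sketched for illustration under the assumption that the KVStore server
# returns the base64-encoded payload in the response body; the function name and error text are
# assumptions, not necessarily the exact API of the surrounding codebase.
def read_data_from_kvstore(addr, port, scope, key):
    try:
        url = "http://{addr}:{port}/{scope}/{key}".format(addr=addr, port=str(port),
                                                          scope=scope, key=key)
        req = Request(url)
        resp = urlopen(req)
        # The payload was written with codec.dumps_base64(..., to_ascii=False), so decode it back.
        return codec.loads_base64(resp.read())
    except (HTTPError, URLError) as e:
        raise RuntimeError("Read data from KVStore server failed.", e)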
def _serialize_keras_optimizer(opt, optimizer_class, save_optimizer_fn):
    if isinstance(opt, str):
        return opt
    elif isinstance(opt, optimizer_class):
        bio = io.BytesIO()
        with h5py.File(bio, 'w') as f:
            save_optimizer_fn(opt, f)
        return codec.dumps_base64(bio.getvalue())
    else:
        raise ValueError('Keras optimizer has to be an instance of str or keras.optimizers.Optimizer')
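# Illustrative inverse of the helper above (a sketch; `_deserialize_keras_optimizer` and
# `load_optimizer_fn` are assumed names): decode the base64 payload back into an in-memory h5 file
# and hand it to a caller-supplied loader.
def _deserialize_keras_optimizer(serialized_opt, load_optimizer_fn):
    bio = io.BytesIO(codec.loads_base64(serialized_opt))
    with h5py.File(bio, 'r') as f:
        return load_optimizer_fn(f)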
def read_serialized_keras_model(self, ckpt_path, model):
    """Reads the checkpoint file of the keras model into model bytes and returns the
    base 64 encoded model bytes.

    :param ckpt_path: A string of path to the checkpoint file.
    :param model: A keras model. This parameter will be used in DBFSLocalStore\
        .read_serialized_keras_model() when the ckpt_path only contains model weights.
    :return: the base 64 encoded model bytes of the checkpoint model.
    """
    from horovod.runner.common.util import codec
    model_bytes = self.read(ckpt_path)
    return codec.dumps_base64(model_bytes)
def _serialize(model):
    """Serialize model into byte array encoded into base 64."""
    if is_module_available('torch'):
        import torch
        sys.modules["torch._C._nn"] = torch.nn.functional

        if isinstance(model, torch.jit.ScriptModule):
            # If the torch model has been converted to TorchScript, save it via torch.jit.save.
            model = save_into_bio(model, torch.jit.save)

    serialized_obj = codec.dumps_base64(model)
    return serialized_obj
def _serialize_param_value(param_name, param_val, serialize_model_fn, serialize_opt_fn):
    if param_val is None:
        return param_val

    if param_name in [params.EstimatorParams.backend.name, params.EstimatorParams.store.name]:
        # We do not serialize backend and store. These params have to be regenerated for each
        # run of the pipeline.
        return None
    elif param_name == params.EstimatorParams.model.name:
        return serialize_model_fn(param_val)
    elif param_name == params.EstimatorParams.optimizer.name:
        return serialize_opt_fn(param_val)
    else:
        return codec.dumps_base64(param_val)
def __init__(self, index, key, nics, minimum_command_lifetime_s, verbose=0):
    # On a Spark cluster we need our train function to see the Spark worker environment.
    # This includes PYTHONPATH, HADOOP_TOKEN_FILE_LOCATION and _HOROVOD_SECRET_KEY.
    env = os.environ.copy()

    # We inject the secret key here.
    env[secret.HOROVOD_SECRET_KEY] = codec.dumps_base64(key)

    # We also need to provide the current working dir to mpirun_exec_fn.py.
    env['HOROVOD_SPARK_WORK_DIR'] = os.getcwd()

    super(SparkTaskService, self).__init__(SparkTaskService.NAME_FORMAT % index,
                                           key, nics, env, verbose)
    self._key = key
    self._minimum_command_lifetime_s = minimum_command_lifetime_s
    self._minimum_command_lifetime = None
def read_serialized_keras_model(self, ckpt_path, model, custom_objects):
    """Reads the checkpoint file of the keras model into model bytes and returns the
    base 64 encoded model bytes.

    :param ckpt_path: A string of path to the checkpoint file.
    :param model: A keras model. This parameter will be used in DBFSLocalStore\
        .read_serialized_keras_model() when the ckpt_path only contains model weights.
    :param custom_objects: This parameter will be used in DBFSLocalStore\
        .read_serialized_keras_model() when loading the keras model.
    :return: the base 64 encoded model bytes of the checkpoint model.
    """
    from horovod.runner.common.util import codec
    import tensorflow
    from tensorflow import keras
    from horovod.spark.keras.util import TFKerasUtil

    if LooseVersion(tensorflow.__version__) < LooseVersion("2.0.0"):
        model_bytes = self.read(ckpt_path)
        return codec.dumps_base64(model_bytes)
    else:
        with keras.utils.custom_object_scope(custom_objects):
            model = keras.models.load_model(ckpt_path)
        return TFKerasUtil.serialize_model(model)
def _save_meta_to_fs(fs, path, schema, rows, total_byte_size):
    with fs.open(path, 'wb') as train_meta_file:
        serialized_content = codec.dumps_base64(dict(schema=schema,
                                                     rows=rows,
                                                     total_byte_size=total_byte_size))
        train_meta_file.write(serialized_content.encode('utf-8'))
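# Hypothetical counterpart for illustration (the function name is an assumption): read back the
# metadata written above by reversing the utf-8 + base64 + pickle encoding.
def _load_meta_from_fs(fs, path):
    with fs.open(path, 'rb') as train_meta_file:
        meta = codec.loads_base64(train_meta_file.read().decode('utf-8'))
    return meta['schema'], meta['rows'], meta['total_byte_size']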
def train(serialized_model, train_rows, val_rows, avg_row_size):
    from petastorm import TransformSpec, make_reader, make_batch_reader
    import horovod as _horovod

    k = get_keras()
    k.backend.set_floatx(floatx)

    hvd = get_horovod()
    hvd.init()

    pin_gpu(hvd, tf, k)

    # If the user specifies any user_shuffle_buffer_size (even 0), we should honor it.
    if user_shuffle_buffer_size is None:
        shuffle_buffer_size = calculate_shuffle_buffer_size(
            hvd, avg_row_size, train_rows / hvd.size())
    else:
        if user_shuffle_buffer_size < 0:
            raise ValueError("user_shuffle_buffer_size cannot be negative!")
        shuffle_buffer_size = user_shuffle_buffer_size

    # Needs to be deserialized in the with scope.
    with k.utils.custom_object_scope(custom_objects):
        model = deserialize_keras_model(
            serialized_model, lambda x: hvd.load_model(x))

    # Horovod: adjust learning rate based on number of processes.
    scaled_lr = k.backend.get_value(model.optimizer.lr) * hvd.size()
    k.backend.set_value(model.optimizer.lr, scaled_lr)

    # Verbose mode 1 will print a progress bar.
    verbose = user_verbose if hvd.rank() == 0 else 0

    if verbose:
        print(f"Shared lib path is pointing to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}")

    transform_spec = None
    if transformation:
        transform_spec = TransformSpec(transformation)

    # The initial_lr needs to be set to the scaled learning rate in the checkpointing callbacks.
    for callback in user_callbacks:
        if isinstance(callback, _horovod._keras.callbacks.LearningRateScheduleCallbackImpl):
            callback.initial_lr = scaled_lr

    with remote_store.get_local_output_dir() as run_output_dir:
        callbacks = [
            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            hvd.callbacks.BroadcastGlobalVariablesCallback(root_rank=0),

            # Horovod: average metrics among workers at the end of every epoch.
            #
            # Note: This callback must be in the list before the ReduceLROnPlateau,
            # TensorBoard, or other metrics-based callbacks.
            hvd.callbacks.MetricAverageCallback(),
        ]
        callbacks += user_callbacks

        # Horovod: save checkpoints only on the first worker to prevent other workers from
        # corrupting them.
        if hvd.rank() == 0:
            ckpt_file = os.path.join(run_output_dir, remote_store.checkpoint_filename)
            logs_dir = os.path.join(run_output_dir, remote_store.logs_subdir)

            # This callback checkpoints the model that ultimately is wrapped and returned after
            # Estimator.fit is called.
            _checkpoint_callback = checkpoint_callback
            if _checkpoint_callback:
                _checkpoint_callback.filepath = ckpt_file
            else:
                if is_dbfs and LooseVersion(tf.__version__) < LooseVersion("2.0.0"):
                    # Because the DBFS local file APIs do not support the random writes required
                    # by the h5 format, save_weights_only=True is needed for switching
                    # to the TensorFlow SavedModel format.
                    _checkpoint_callback = k.callbacks.ModelCheckpoint(ckpt_file,
                                                                       save_weights_only=True)
                else:
                    _checkpoint_callback = k.callbacks.ModelCheckpoint(ckpt_file)
            callbacks.append(_checkpoint_callback)

            if remote_store.saving_runs:
                tb_callback = None
                for i, c in enumerate(callbacks):
                    if isinstance(c, k.callbacks.TensorBoard):
                        tb_callback = c
                        print(f"Found TensorBoard callback, updating log_dir to {logs_dir}")
                        tb_callback.log_dir = logs_dir
                        break
                if tb_callback:
                    # Rather than a possibly arbitrary order, we always place the TensorBoard
                    # callback right before the SyncCallback.
                    callbacks.pop(i)
                callbacks.append(tb_callback or k.callbacks.TensorBoard(logs_dir))
                callbacks.append(SyncCallback(run_output_dir, remote_store.sync, k))

        if train_steps_per_epoch is None:
            steps_per_epoch = int(math.ceil(train_rows / batch_size / hvd.size()))
        else:
            steps_per_epoch = train_steps_per_epoch

        if validation_steps_per_epoch is None:
            # math.ceil because if val_rows is smaller than val_batch_size we still get at least
            # one step. float(val_rows) because val_rows / val_batch_size evaluates to zero before
            # math.ceil.
            validation_steps = int(math.ceil(float(val_rows) / val_batch_size / hvd.size())) \
                if should_validate else None
        else:
            validation_steps = validation_steps_per_epoch

        schema_fields = feature_columns + label_columns
        if sample_weight_col:
            schema_fields.append(sample_weight_col)

        if verbose:
            print(
                f"Training parameters: Epochs: {epochs}, Scaled lr: {scaled_lr}, Shuffle size: {shuffle_buffer_size}\n"
                f"Train rows: {train_rows}, Train batch size: {batch_size}, Train_steps_per_epoch: {steps_per_epoch}\n"
                f"Val rows: {val_rows}, Val batch size: {val_batch_size}, Val_steps_per_epoch: {validation_steps}\n"
                f"Checkpoint file: {remote_store.checkpoint_path}, Logs dir: {remote_store.logs_path}\n"
            )

        # In general, make_batch_reader is faster than make_reader for reading the dataset.
        # However, we found out that make_reader performs data transformations much faster than
        # make_batch_reader with parallel worker processes. Therefore, the default reader
        # we choose is make_batch_reader unless there are data transformations.
        reader_factory_kwargs = dict()
        if transform_spec:
            reader_factory = make_reader
            reader_factory_kwargs['pyarrow_serialize'] = True
            is_batch_reader = False
        else:
            reader_factory = make_batch_reader
            is_batch_reader = True

        with reader_factory(remote_store.train_data_path,
                            num_epochs=1,
                            cur_shard=hvd.rank(),
                            reader_pool_type=reader_pool_type,
                            workers_count=train_reader_worker_count,
                            shard_count=hvd.size(),
                            hdfs_driver=PETASTORM_HDFS_DRIVER,
                            schema_fields=schema_fields,
                            transform_spec=transform_spec,
                            storage_options=storage_options,
                            # Don't shuffle row groups if shuffle_buffer_size is 0 (non-shuffle case).
                            shuffle_row_groups=True if shuffle_buffer_size > 0 else False,
                            **reader_factory_kwargs) as train_reader:
            with reader_factory(remote_store.val_data_path,
                                num_epochs=1,
                                cur_shard=hvd.rank(),
                                reader_pool_type=reader_pool_type,
                                workers_count=val_reader_worker_count,
                                shard_count=hvd.size(),
                                hdfs_driver=PETASTORM_HDFS_DRIVER,
                                schema_fields=schema_fields,
                                transform_spec=transform_spec,
                                storage_options=storage_options,
                                shuffle_row_groups=False,
                                **reader_factory_kwargs) \
                    if should_validate else empty_batch_reader() as val_reader:

                train_data = make_dataset(train_reader, batch_size, shuffle_buffer_size,
                                          is_batch_reader,
                                          shuffle=True if shuffle_buffer_size > 0 else False,
                                          cache=inmemory_cache_all)
                val_data = make_dataset(val_reader, val_batch_size, shuffle_buffer_size,
                                        is_batch_reader, shuffle=False,
                                        cache=inmemory_cache_all) \
                    if val_reader else None

                history = fit(model, train_data, val_data, steps_per_epoch,
                              validation_steps, callbacks, verbose)

        # Dataset API usage currently displays a wall of errors upon termination.
        # This global model registration ensures clean termination.
        # Tracked in https://github.com/tensorflow/tensorflow/issues/24570
        globals()['_DATASET_FINALIZATION_HACK'] = model

        if hvd.rank() == 0:
            if is_dbfs:
                if LooseVersion(tf.__version__) < LooseVersion("2.0.0"):
                    model.load_weights(ckpt_file)
                else:
                    # Needs to be deserialized in the with scope.
                    with k.utils.custom_object_scope(custom_objects):
                        model = k.models.load_model(ckpt_file)
                serialized_model = keras_utils.serialize_model(model)
            else:
                if LooseVersion(tf.__version__) >= LooseVersion("2.0.0"):
                    with k.utils.custom_object_scope(custom_objects):
                        model = k.models.load_model(ckpt_file)
                    serialized_model = keras_utils.serialize_model(model)
                else:
                    with open(ckpt_file, 'rb') as f:
                        serialized_model = codec.dumps_base64(f.read())

            return history.history, serialized_model, hvd.size()
def train(serialized_model, train_rows, val_rows, avg_row_size):
    from petastorm import TransformSpec, make_reader, make_batch_reader

    k = get_keras()
    k.backend.set_floatx(floatx)

    hvd = get_horovod()
    hvd.init()
    pin_gpu(hvd, tf, k)

    if not user_shuffle_buffer_size:
        shuffle_buffer_size = calculate_shuffle_buffer_size(
            hvd, avg_row_size, train_rows / hvd.size())
    else:
        shuffle_buffer_size = user_shuffle_buffer_size

    # Needs to be deserialized in the with scope.
    with k.utils.custom_object_scope(custom_objects):
        model = deserialize_keras_model(
            serialized_model, lambda x: hvd.load_model(x))

    # Horovod: adjust learning rate based on number of processes.
    k.backend.set_value(model.optimizer.lr,
                        k.backend.get_value(model.optimizer.lr) * hvd.size())

    # Verbose mode 1 will print a progress bar.
    verbose = user_verbose if hvd.rank() == 0 else 0

    transform_spec = None
    if transformation:
        transform_spec = TransformSpec(transformation)

    with remote_store.get_local_output_dir() as run_output_dir:
        callbacks = [
            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            hvd.callbacks.BroadcastGlobalVariablesCallback(root_rank=0),

            # Horovod: average metrics among workers at the end of every epoch.
            #
            # Note: This callback must be in the list before the ReduceLROnPlateau,
            # TensorBoard, or other metrics-based callbacks.
            hvd.callbacks.MetricAverageCallback(),
        ]
        callbacks += user_callbacks

        # Horovod: save checkpoints only on the first worker to prevent other workers from
        # corrupting them.
        if hvd.rank() == 0:
            ckpt_file = os.path.join(run_output_dir, remote_store.checkpoint_filename)
            logs_dir = os.path.join(run_output_dir, remote_store.logs_subdir)

            # This callback checkpoints the model that ultimately is wrapped and returned after
            # Estimator.fit is called.
            _checkpoint_callback = checkpoint_callback
            if _checkpoint_callback:
                _checkpoint_callback.filepath = ckpt_file
            else:
                if is_dbfs and LooseVersion(tf.__version__) < LooseVersion("2.0.0"):
                    # Because the DBFS local file APIs do not support the random writes required
                    # by the h5 format, save_weights_only=True is needed for switching
                    # to the TensorFlow SavedModel format.
                    _checkpoint_callback = k.callbacks.ModelCheckpoint(ckpt_file,
                                                                       save_weights_only=True)
                else:
                    _checkpoint_callback = k.callbacks.ModelCheckpoint(ckpt_file)
            callbacks.append(_checkpoint_callback)

            if remote_store.saving_runs:
                callbacks.append(k.callbacks.TensorBoard(logs_dir))
                callbacks.append(SyncCallback(run_output_dir, remote_store.sync, k))

        if train_steps_per_epoch is None:
            steps_per_epoch = int(math.ceil(train_rows / batch_size / hvd.size()))
        else:
            steps_per_epoch = train_steps_per_epoch

        if validation_steps_per_epoch is None:
            # math.ceil because if val_rows is smaller than batch_size we still get at least
            # one step. float(val_rows) because val_rows / batch_size evaluates to zero before
            # math.ceil.
            validation_steps = int(math.ceil(float(val_rows) / batch_size / hvd.size())) \
                if should_validate else None
        else:
            validation_steps = validation_steps_per_epoch

        schema_fields = feature_columns + label_columns
        if sample_weight_col:
            schema_fields.append(sample_weight_col)

        # In general, make_batch_reader is faster than make_reader for reading the dataset.
        # However, we found out that make_reader performs data transformations much faster than
        # make_batch_reader with parallel worker processes. Therefore, the default reader
        # we choose is make_batch_reader unless there are data transformations.
        reader_factory_kwargs = dict()
        if transform_spec:
            reader_factory = make_reader
            reader_factory_kwargs['pyarrow_serialize'] = True
            is_batch_reader = False
        else:
            reader_factory = make_batch_reader
            is_batch_reader = True

        # Petastorm: read data from the store with the correct shard for this rank.
        # Setting num_epochs=None will cause an infinite iterator and enables ranks to perform
        # training and validation with an unequal number of samples.
        with reader_factory(remote_store.train_data_path,
                            num_epochs=None,
                            cur_shard=hvd.rank(),
                            reader_pool_type='process',
                            workers_count=train_reader_worker_count,
                            shard_count=hvd.size(),
                            hdfs_driver=PETASTORM_HDFS_DRIVER,
                            schema_fields=schema_fields,
                            transform_spec=transform_spec,
                            **reader_factory_kwargs) as train_reader:
            with reader_factory(remote_store.val_data_path,
                                num_epochs=None,
                                cur_shard=hvd.rank(),
                                reader_pool_type='process',
                                workers_count=val_reader_worker_count,
                                shard_count=hvd.size(),
                                hdfs_driver=PETASTORM_HDFS_DRIVER,
                                schema_fields=schema_fields,
                                transform_spec=transform_spec,
                                **reader_factory_kwargs) \
                    if should_validate else empty_batch_reader() as val_reader:

                train_data = make_dataset(train_reader, shuffle_buffer_size,
                                          is_batch_reader, shuffle=True)
                val_data = make_dataset(val_reader, shuffle_buffer_size,
                                        is_batch_reader, shuffle=False) \
                    if val_reader else None

                history = fit(model, train_data, val_data, steps_per_epoch,
                              validation_steps, callbacks, verbose)

        # Dataset API usage currently displays a wall of errors upon termination.
        # This global model registration ensures clean termination.
        # Tracked in https://github.com/tensorflow/tensorflow/issues/24570
        globals()['_DATASET_FINALIZATION_HACK'] = model

        if hvd.rank() == 0:
            if is_dbfs:
                if LooseVersion(tf.__version__) < LooseVersion("2.0.0"):
                    model.load_weights(ckpt_file)
                else:
                    # Needs to be deserialized in the with scope.
                    with k.utils.custom_object_scope(custom_objects):
                        model = k.models.load_model(ckpt_file)
                serialized_model = keras_utils.serialize_model(model)
            else:
                with open(ckpt_file, 'rb') as f:
                    serialized_model = codec.dumps_base64(f.read())

            return history.history, serialized_model, hvd.size()
def _serialize_keras_model(model, save_model_fn):
    """Serialize model into byte array encoded into base 64."""
    bio = io.BytesIO()
    with h5py.File(bio, 'w') as f:
        save_model_fn(model, f)
    return codec.dumps_base64(bio.getvalue())
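# Illustrative inverse (a sketch; `load_model_fn` is a caller-supplied loader, e.g. a wrapper
# around keras.models.load_model that accepts an h5py.File): decode the base64 payload and load
# the model from an in-memory h5 file.
def _deserialize_keras_model(model_bytes, load_model_fn):
    bio = io.BytesIO(codec.loads_base64(model_bytes))
    with h5py.File(bio, 'r') as f:
        return load_model_fn(f)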
def _launch_task_servers(all_host_names, local_host_names, driver_addresses, settings):
    """
    Executes the task server and service client task for registration on the hosts.

    :param all_host_names: list of addresses, for example,
        ['worker-0', 'worker-1'] or ['10.11.11.11', '10.11.11.12']
    :type all_host_names: list(string)
    :param local_host_names: names that are resolved to one of the addresses of local hosts'
        interfaces, for example, set(['localhost', '127.0.0.1'])
    :type local_host_names: set
    :param driver_addresses: map of interfaces and their address and port for the service,
        for example:
            {
                'lo': [('127.0.0.1', 34588)],
                'docker0': [('172.122.10.1', 34588)],
                'eth0': [('11.111.33.73', 34588)]
            }
    :type driver_addresses: map
    :param settings: the object that contains the settings for running horovod
    :type settings: horovod.runner.common.util.settings.Settings
    :return:
    :rtype:
    """
    def _exec_command(command):
        host_output = io.StringIO()
        try:
            exit_code = safe_shell_exec.execute(command,
                                                stdout=host_output,
                                                stderr=host_output)
            if exit_code != 0:
                print('Launching horovod task function was not '
                      'successful:\n{host_output}'.format(host_output=host_output.getvalue()))
                os._exit(exit_code)
        finally:
            host_output.close()
        return exit_code

    args_list = []
    num_hosts = len(all_host_names)
    for index in range(num_hosts):
        host_name = all_host_names[index]
        command = \
            '{python} -m horovod.runner.task_fn {index} {num_hosts} ' \
            '{driver_addresses} {settings}' \
            .format(python=sys.executable,
                    index=codec.dumps_base64(index),
                    num_hosts=codec.dumps_base64(num_hosts),
                    driver_addresses=codec.dumps_base64(driver_addresses),
                    settings=codec.dumps_base64(settings))
        if host_name not in local_host_names:
            command = get_ssh_command(command,
                                      host=host_name,
                                      port=settings.ssh_port,
                                      identity_file=settings.ssh_identity_file)

        if settings.verbose >= 2:
            print('Launching horovod task function: {}'.format(command))
        args_list.append([command])

    # Each thread will use an ssh command to launch the server on one task. If an
    # error occurs in one thread, the entire process will be terminated. Otherwise,
    # threads will keep running, and the ssh session -- and the task server --
    # will be bound to the thread. In case the horovod process dies, all
    # the ssh sessions and all the task servers will die as well.
    threads.execute_function_multithreaded(_exec_command,
                                           args_list,
                                           block_until_all_done=False)