Example #1
def gloo_run_elastic(settings, driver, env, stdout=None, stderr=None):
    """
    Run distributed gloo jobs.

    :param settings: Settings for running the distributed jobs.
                     Note: settings.num_proc and settings.hosts must not be None.
    :param driver: The Spark driver service that tasks are connected to.
    :param env: Environment dictionary to use for running gloo jobs.  Can be None.
    :param stdout: Horovod stdout is redirected to this stream.
    :param stderr: Horovod stderr is redirected to this stream.
    """
    if env is None:
        env = {}

    # Each thread will use SparkTaskClient to launch the job on each remote host. If an
    # error occurs in one thread, the entire process will be terminated. Otherwise,
    # threads will keep running, and so will the ssh session.
    command = (sys.executable, '-m', 'horovod.spark.task.gloo_exec_fn',
               codec.dumps_base64(driver.addresses()),
               codec.dumps_base64(settings))

    # Pass secret key through the environment variables.
    env[secret.HOROVOD_SECRET_KEY] = codec.dumps_base64(settings.key)

    # get common interfaces from driver
    nics = driver.get_common_interfaces()

    exec_command = _exec_command_fn(driver, settings.key, settings, env,
                                    stdout, stderr,
                                    settings.prefix_output_with_timestamp)
    rendezvous = SparkRendezvousServer(driver, settings.verbose)
    launch_gloo_elastic(command, exec_command, settings, env, lambda _: nics,
                        rendezvous)
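
The base64 payloads built above (driver.addresses() and settings) are decoded again inside the launched task process. Below is a minimal sketch of that decoding step, assuming the codec.loads_base64 counterpart to dumps_base64 in horovod.runner.common.util.codec; the real horovod.spark.task.gloo_exec_fn may differ in structure.

import sys

from horovod.runner.common.util import codec


def main():
    # argv[1] and argv[2] carry the driver addresses and the settings object,
    # each serialized with codec.dumps_base64 by the launcher above.
    driver_addresses = codec.loads_base64(sys.argv[1])
    settings = codec.loads_base64(sys.argv[2])
    print(driver_addresses, settings.num_proc)


if __name__ == '__main__':
    main()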
Example #2
def mpi_run(settings, nics, driver, env, stdout=None, stderr=None):
    """
    Runs mpirun.

    :param settings: Settings for running MPI.
                     Note: settings.num_proc and settings.hosts must not be None.
    :param nics: Interfaces to include by MPI.
    :param driver: The Spark driver service that tasks are connected to.
    :param env: Environment dictionary to use for running MPI.  Can be None.
    :param stdout: Stdout of the mpi process.
                   Only used when settings.run_func_mode is True.
    :param stderr: Stderr of the mpi process.
                   Only used when settings.run_func_mode is True.
    """
    env = {} if env is None else copy.copy(
        env)  # copy env so we do not leak env modifications

    # Pass secret key through the environment variables.
    env[secret.HOROVOD_SECRET_KEY] = codec.dumps_base64(settings.key)
    # we don't want the key to be serialized along with settings from here on
    settings.key = None

    rsh_agent = (sys.executable, '-m', 'horovod.spark.driver.mpirun_rsh',
                 codec.dumps_base64(driver.addresses()),
                 codec.dumps_base64(settings))
    settings.extra_mpi_args = (
        '{extra_mpi_args} -x NCCL_DEBUG=INFO -mca plm_rsh_agent "{rsh_agent}"'.
        format(extra_mpi_args=settings.extra_mpi_args
               if settings.extra_mpi_args else '',
               rsh_agent=' '.join(rsh_agent)))
    command = (sys.executable, '-m', 'horovod.spark.task.mpirun_exec_fn',
               codec.dumps_base64(driver.addresses()),
               codec.dumps_base64(settings))
    hr_mpi_run(settings, nics, env, command, stdout=stdout, stderr=stderr)
Example #3
def gloo_run(settings, nics, driver, env, stdout=None, stderr=None):
    """
    Run distributed gloo jobs.

    :param settings: Settings for running the distributed jobs.
                     Note: settings.num_proc and settings.hosts must not be None.
    :param nics: Interfaces to use by gloo.
    :param driver: The Spark driver service that tasks are connected to.
    :param env: Environment dictionary to use for running gloo jobs.  Can be None.
    :param stdout: Horovod stdout is redirected to this stream.
    :param stderr: Horovod stderr is redirected to this stream.
    """
    if env is None:
        env = {}

    # we don't want the key to be serialized along with settings from here on
    key = settings.key
    settings.key = None

    # Each thread will use SparkTaskClient to launch the job on each remote host. If an
    # error occurs in one thread, the entire process will be terminated. Otherwise,
    # threads will keep running, and so will the ssh session.
    iface = list(nics)[0]
    server_ip = driver.addresses()[iface][0][0]
    command = (sys.executable, '-m', 'horovod.spark.task.gloo_exec_fn',
               codec.dumps_base64(driver.addresses()),
               codec.dumps_base64(settings))

    exec_command = _exec_command_fn(driver, key, settings, env, stdout, stderr,
                                    settings.prefix_output_with_timestamp)
    launch_gloo(command, exec_command, settings, nics, {}, server_ip)
Example #4
File: util.py  Project: zw0610/horovod
    def _serialize(model):
        """Serialize model into byte array encoded into base 64."""
        if is_module_available('torch'):
            import torch
            sys.modules["torch._C._nn"] = torch.nn.functional

        serialized_obj = codec.dumps_base64(model)
        return serialized_obj
Example #5
    def _load_model_from_checkpoint(self, run_id):
        store = self.getStore()
        last_ckpt_path = store.get_checkpoint_path(run_id)

        if self.getVerbose():
            print('Resuming training from last checkpoint: {}'.format(last_ckpt_path))

        model_bytes = store.read(last_ckpt_path)
        return codec.dumps_base64(model_bytes)
Example #6
def _torch_param_serialize(param_name, param_val):
    if param_name in [EstimatorParams.backend.name, EstimatorParams.store.name]:
        # We do not serialize backend and store. These params have to be regenerated for each
        # run of the pipeline
        return None

    if param_val is None:
        return None

    return codec.dumps_base64(param_val)
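
A matching deserializer would simply reverse this with codec.loads_base64, leaving None values untouched. A minimal sketch under that assumption (the helper name _torch_param_deserialize is illustrative):

def _torch_param_deserialize(param_name, param_val):
    # Hypothetical inverse of _torch_param_serialize above. Backend and store
    # are never serialized, so a None value simply stays None.
    if param_val is None:
        return None
    return codec.loads_base64(param_val)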
Example #7
def put_data_into_kvstore(addr, port, scope, key, value):
    try:
        url = "http://{addr}:{port}/{scope}/{key}".format(addr=addr,
                                                          port=str(port),
                                                          scope=scope,
                                                          key=key)
        req = Request(url, data=codec.dumps_base64(value, to_ascii=False))
        req.get_method = lambda: "PUT"  # for urllib2 compatibility
        urlopen(req)
    except (HTTPError, URLError) as e:
        raise RuntimeError("Put data input KVStore server failed.", e)
Example #8
def _serialize_keras_optimizer(opt, optimizer_class, save_optimizer_fn):
    if isinstance(opt, str):
        return opt
    elif isinstance(opt, optimizer_class):
        bio = io.BytesIO()
        with h5py.File(bio, 'w') as f:
            save_optimizer_fn(opt, f)
        return codec.dumps_base64(bio.getvalue())
    else:
        raise \
            ValueError('Keras optimizer has to be an instance of str or keras.optimizers.Optimizer')
Example #9
    def read_serialized_keras_model(self, ckpt_path, model):
        """Reads the checkpoint file of the keras model into model bytes and returns the base 64
        encoded model bytes.
        :param ckpt_path: A string of path to the checkpoint file.
        :param model: A keras model. This parameter will be used in DBFSLocalStore\
            .read_serialized_keras_model() when the ckpt_path only contains model weights.
        :return: the base 64 encoded model bytes of the checkpoint model.
        """
        from horovod.runner.common.util import codec

        model_bytes = self.read(ckpt_path)
        return codec.dumps_base64(model_bytes)
Example #10
    def _serialize(model):
        """Serialize model into byte array encoded into base 64."""
        if is_module_available('torch'):
            import torch
            sys.modules["torch._C._nn"] = torch.nn.functional

        if isinstance(model, torch.jit.ScriptModule):
            # If torch model is converted to torchScript
            model = save_into_bio(model, torch.jit.save)

        serialized_obj = codec.dumps_base64(model)
        return serialized_obj
Example #11
def _serialize_param_value(param_name, param_val, serialize_model_fn, serialize_opt_fn):
    if param_val is None:
        return param_val

    if param_name in [params.EstimatorParams.backend.name, params.EstimatorParams.store.name]:
        # We do not serialize backend and store. These params have to be regenerated for each
        # run of the pipeline
        return None
    elif param_name == params.EstimatorParams.model.name:
        return serialize_model_fn(param_val)
    if param_name == params.EstimatorParams.optimizer.name:
        return serialize_opt_fn(param_val)
    else:
        return codec.dumps_base64(param_val)
Example #12
    def __init__(self, index, key, nics, minimum_command_lifetime_s, verbose=0):
        # on a Spark cluster we need our train function to see the Spark worker environment
        # this includes PYTHONPATH, HADOOP_TOKEN_FILE_LOCATION and _HOROVOD_SECRET_KEY
        env = os.environ.copy()

        # we inject the secret key here
        env[secret.HOROVOD_SECRET_KEY] = codec.dumps_base64(key)

        # we also need to provide the current working dir to mpirun_exec_fn.py
        env['HOROVOD_SPARK_WORK_DIR'] = os.getcwd()

        super(SparkTaskService, self).__init__(SparkTaskService.NAME_FORMAT % index,
                                               key, nics, env, verbose)
        self._key = key
        self._minimum_command_lifetime_s = minimum_command_lifetime_s
        self._minimum_command_lifetime = None
Example #13
    def read_serialized_keras_model(self, ckpt_path, model, custom_objects):
        """Reads the checkpoint file of the keras model into model bytes and returns the base 64
        encoded model bytes.
        :param ckpt_path: A string of path to the checkpoint file.
        :param model: A keras model. This parameter will be used in DBFSLocalStore\
            .read_serialized_keras_model() when the ckpt_path only contains model weights.
        :param custom_objects: This parameter will be used in DBFSLocalStore\
            .read_serialized_keras_model() when loading the keras model.
        :return: the base 64 encoded model bytes of the checkpoint model.
        """
        from horovod.runner.common.util import codec
        import tensorflow
        from tensorflow import keras
        from horovod.spark.keras.util import TFKerasUtil

        if LooseVersion(tensorflow.__version__) < LooseVersion("2.0.0"):
            model_bytes = self.read(ckpt_path)
            return codec.dumps_base64(model_bytes)
        else:
            with keras.utils.custom_object_scope(custom_objects):
                model = keras.models.load_model(ckpt_path)
            return TFKerasUtil.serialize_model(model)
Example #14
File: util.py  Project: chongxiaoc/horovod
def _save_meta_to_fs(fs, path, schema, rows, total_byte_size):
    with fs.open(path, 'wb') as train_meta_file:
        serialized_content = codec.dumps_base64(
            dict(schema=schema, rows=rows, total_byte_size=total_byte_size))
        train_meta_file.write(serialized_content.encode('utf-8'))
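
Reading the metadata back reverses the steps: decode the UTF-8 text into the base64 string and unpickle it with codec.loads_base64. A minimal sketch under that assumption (the helper name _load_meta_from_fs is illustrative):

def _load_meta_from_fs(fs, path):
    # Hypothetical counterpart to _save_meta_to_fs above.
    with fs.open(path, 'rb') as train_meta_file:
        meta = codec.loads_base64(train_meta_file.read().decode('utf-8'))
    return meta['schema'], meta['rows'], meta['total_byte_size']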
Example #15
    def train(serialized_model, train_rows, val_rows, avg_row_size):
        from petastorm import TransformSpec, make_reader, make_batch_reader
        import horovod as _horovod
        k = get_keras()
        k.backend.set_floatx(floatx)

        hvd = get_horovod()
        hvd.init()

        pin_gpu(hvd, tf, k)

        # If user specifies any user_shuffle_buffer_size (even 0), we should honor it.
        if user_shuffle_buffer_size is None:
            shuffle_buffer_size = calculate_shuffle_buffer_size(
                hvd, avg_row_size, train_rows / hvd.size())
        else:
            if user_shuffle_buffer_size < 0:
                raise ValueError(
                    "user_shuffle_buffer_size cannot be negative!")
            shuffle_buffer_size = user_shuffle_buffer_size

        # needs to be deserialized in the with scope
        with k.utils.custom_object_scope(custom_objects):
            model = deserialize_keras_model(serialized_model,
                                            lambda x: hvd.load_model(x))

        # Horovod: adjust learning rate based on number of processes.
        scaled_lr = k.backend.get_value(model.optimizer.lr) * hvd.size()
        k.backend.set_value(model.optimizer.lr, scaled_lr)

        # Verbose mode 1 will print a progress bar
        verbose = user_verbose if hvd.rank() == 0 else 0

        if verbose:
            print(
                f"Shared lib path is pointing to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}"
            )

        transform_spec = None
        if transformation:
            transform_spec = TransformSpec(transformation)

        # The initial_lr needs to be set to the scaled learning rate in the checkpointing callbacks.
        for callback in user_callbacks:
            if isinstance(
                    callback, _horovod._keras.callbacks.
                    LearningRateScheduleCallbackImpl):
                callback.initial_lr = scaled_lr

        with remote_store.get_local_output_dir() as run_output_dir:
            callbacks = [
                # Horovod: broadcast initial variable states from rank 0 to all other processes.
                # This is necessary to ensure consistent initialization of all workers when
                # training is started with random weights or restored from a checkpoint.
                hvd.callbacks.BroadcastGlobalVariablesCallback(root_rank=0),

                # Horovod: average metrics among workers at the end of every epoch.
                #
                # Note: This callback must be in the list before the ReduceLROnPlateau,
                # TensorBoard, or other metrics-based callbacks.
                hvd.callbacks.MetricAverageCallback(),
            ]

            callbacks += user_callbacks

            # Horovod: save checkpoints only on the first worker to prevent other workers from
            # corrupting them.
            if hvd.rank() == 0:
                ckpt_file = os.path.join(run_output_dir,
                                         remote_store.checkpoint_filename)
                logs_dir = os.path.join(run_output_dir,
                                        remote_store.logs_subdir)

                # This callback checkpoints the model that ultimately is wrapped and returned after
                # Estimator.fit is called.
                _checkpoint_callback = checkpoint_callback
                if _checkpoint_callback:
                    _checkpoint_callback.filepath = ckpt_file
                else:
                    if is_dbfs and LooseVersion(
                            tf.__version__) < LooseVersion("2.0.0"):
                        # Because DBFS local file APIs do not support the random writes required
                        # by the h5 format, save_weights_only=True is needed to switch to the
                        # TensorFlow SavedModel format.
                        _checkpoint_callback = k.callbacks.ModelCheckpoint(
                            ckpt_file, save_weights_only=True)
                    else:
                        _checkpoint_callback = k.callbacks.ModelCheckpoint(
                            ckpt_file)
                callbacks.append(_checkpoint_callback)

                if remote_store.saving_runs:
                    tb_callback = None
                    for i, c in enumerate(callbacks):
                        if isinstance(c, k.callbacks.TensorBoard):
                            tb_callback = c
                            print(
                                f"Found TensorBoard callback, updating log_dir to {logs_dir}"
                            )
                            tb_callback.log_dir = logs_dir
                            break
                    if tb_callback:
                        # Rather than a possibly arbitrary order, we always place the TensorBoard
                        # callback right before the SyncCallback
                        callbacks.pop(i)
                    callbacks.append(tb_callback
                                     or k.callbacks.TensorBoard(logs_dir))
                    callbacks.append(
                        SyncCallback(run_output_dir, remote_store.sync, k))

            if train_steps_per_epoch is None:
                steps_per_epoch = int(
                    math.ceil(train_rows / batch_size / hvd.size()))
            else:
                steps_per_epoch = train_steps_per_epoch

            if validation_steps_per_epoch is None:
                # math.ceil because if val_rows is smaller than val_batch_size we still get at least
                # one step. float(val_rows) because val_rows / val_batch_size evaluates to zero before
                # math.ceil
                validation_steps = int(math.ceil(float(val_rows) / val_batch_size / hvd.size())) \
                    if should_validate else None
            else:
                validation_steps = validation_steps_per_epoch

            schema_fields = feature_columns + label_columns
            if sample_weight_col:
                schema_fields.append(sample_weight_col)

            if verbose:
                print(
                    f"Training parameters: Epochs: {epochs}, Scaled lr: {scaled_lr}, Shuffle size: {shuffle_buffer_size}\n"
                    f"Train rows: {train_rows}, Train batch size: {batch_size}, Train_steps_per_epoch: {steps_per_epoch}\n"
                    f"Val rows: {val_rows}, Val batch size: {val_batch_size}, Val_steps_per_epoch: {validation_steps}\n"
                    f"Checkpoint file: {remote_store.checkpoint_path}, Logs dir: {remote_store.logs_path}\n"
                )
            # In general, make_batch_reader is faster than make_reader for reading the dataset.
            # However, we found out that make_reader performs data transformations much faster than
            # make_batch_reader with parallel worker processes. Therefore, the default reader
            # we choose is make_batch_reader unless there are data transformations.
            reader_factory_kwargs = dict()
            if transform_spec:
                reader_factory = make_reader
                reader_factory_kwargs['pyarrow_serialize'] = True
                is_batch_reader = False
            else:
                reader_factory = make_batch_reader
                is_batch_reader = True

            with reader_factory(
                    remote_store.train_data_path,
                    num_epochs=1,
                    cur_shard=hvd.rank(),
                    reader_pool_type=reader_pool_type,
                    workers_count=train_reader_worker_count,
                    shard_count=hvd.size(),
                    hdfs_driver=PETASTORM_HDFS_DRIVER,
                    schema_fields=schema_fields,
                    transform_spec=transform_spec,
                    storage_options=storage_options,
                    # Don't shuffle row groups if shuffle_buffer_size is 0 (non-shuffle case).
                    shuffle_row_groups=True
                    if shuffle_buffer_size > 0 else False,
                    **reader_factory_kwargs) as train_reader:
                with reader_factory(remote_store.val_data_path,
                                    num_epochs=1,
                                    cur_shard=hvd.rank(),
                                    reader_pool_type=reader_pool_type,
                                    workers_count=val_reader_worker_count,
                                    shard_count=hvd.size(),
                                    hdfs_driver=PETASTORM_HDFS_DRIVER,
                                    schema_fields=schema_fields,
                                    transform_spec=transform_spec,
                                    storage_options=storage_options,
                                    shuffle_row_groups=False,
                                    **reader_factory_kwargs) \
                    if should_validate else empty_batch_reader() as val_reader:

                    train_data = make_dataset(
                        train_reader,
                        batch_size,
                        shuffle_buffer_size,
                        is_batch_reader,
                        shuffle=True if shuffle_buffer_size > 0 else False,
                        cache=inmemory_cache_all)
                    val_data = make_dataset(val_reader, val_batch_size, shuffle_buffer_size,
                                            is_batch_reader, shuffle=False, cache=inmemory_cache_all) \
                        if val_reader else None

                    history = fit(model, train_data, val_data, steps_per_epoch,
                                  validation_steps, callbacks, verbose)

            # Dataset API usage currently displays a wall of errors upon termination.
            # This global model registration ensures clean termination.
            # Tracked in https://github.com/tensorflow/tensorflow/issues/24570
            globals()['_DATASET_FINALIZATION_HACK'] = model

            if hvd.rank() == 0:
                if is_dbfs:
                    if LooseVersion(tf.__version__) < LooseVersion("2.0.0"):
                        model.load_weights(ckpt_file)
                    else:
                        # needs to be deserialized in the with scope
                        with k.utils.custom_object_scope(custom_objects):
                            model = k.models.load_model(ckpt_file)
                    serialized_model = keras_utils.serialize_model(model)
                else:
                    if LooseVersion(tf.__version__) >= LooseVersion("2.0.0"):
                        with k.utils.custom_object_scope(custom_objects):
                            model = k.models.load_model(ckpt_file)
                        serialized_model = keras_utils.serialize_model(model)
                    else:
                        with open(ckpt_file, 'rb') as f:
                            serialized_model = codec.dumps_base64(f.read())

                return history.history, serialized_model, hvd.size()
Example #16
    def train(serialized_model, train_rows, val_rows, avg_row_size):
        from petastorm import TransformSpec, make_reader, make_batch_reader

        k = get_keras()
        k.backend.set_floatx(floatx)

        hvd = get_horovod()
        hvd.init()
        pin_gpu(hvd, tf, k)

        if not user_shuffle_buffer_size:
            shuffle_buffer_size = calculate_shuffle_buffer_size(
                hvd, avg_row_size, train_rows / hvd.size())
        else:
            shuffle_buffer_size = user_shuffle_buffer_size

        # needs to be deserialized in the with scope
        with k.utils.custom_object_scope(custom_objects):
            model = deserialize_keras_model(serialized_model,
                                            lambda x: hvd.load_model(x))

        # Horovod: adjust learning rate based on number of processes.
        k.backend.set_value(
            model.optimizer.lr,
            k.backend.get_value(model.optimizer.lr) * hvd.size())

        # Verbose mode 1 will print a progress bar
        verbose = user_verbose if hvd.rank() == 0 else 0

        transform_spec = None
        if transformation:
            transform_spec = TransformSpec(transformation)

        with remote_store.get_local_output_dir() as run_output_dir:
            callbacks = [
                # Horovod: broadcast initial variable states from rank 0 to all other processes.
                # This is necessary to ensure consistent initialization of all workers when
                # training is started with random weights or restored from a checkpoint.
                hvd.callbacks.BroadcastGlobalVariablesCallback(root_rank=0),

                # Horovod: average metrics among workers at the end of every epoch.
                #
                # Note: This callback must be in the list before the ReduceLROnPlateau,
                # TensorBoard, or other metrics-based callbacks.
                hvd.callbacks.MetricAverageCallback(),
            ]
            callbacks += user_callbacks

            # Horovod: save checkpoints only on the first worker to prevent other workers from
            # corrupting them.
            if hvd.rank() == 0:
                ckpt_file = os.path.join(run_output_dir,
                                         remote_store.checkpoint_filename)
                logs_dir = os.path.join(run_output_dir,
                                        remote_store.logs_subdir)

                # This callback checkpoints the model that ultimately is wrapped and returned after
                # Estimator.fit is called.
                _checkpoint_callback = checkpoint_callback
                if _checkpoint_callback:
                    _checkpoint_callback.filepath = ckpt_file
                else:
                    if is_dbfs and LooseVersion(
                            tf.__version__) < LooseVersion("2.0.0"):
                        # Because DBFS local file APIs do not support the random writes required
                        # by the h5 format, save_weights_only=True is needed to switch to the
                        # TensorFlow SavedModel format.
                        _checkpoint_callback = k.callbacks.ModelCheckpoint(
                            ckpt_file, save_weights_only=True)
                    else:
                        _checkpoint_callback = k.callbacks.ModelCheckpoint(
                            ckpt_file)
                callbacks.append(_checkpoint_callback)

                if remote_store.saving_runs:
                    callbacks.append(k.callbacks.TensorBoard(logs_dir))
                    callbacks.append(
                        SyncCallback(run_output_dir, remote_store.sync, k))

            if train_steps_per_epoch is None:
                steps_per_epoch = int(
                    math.ceil(train_rows / batch_size / hvd.size()))
            else:
                steps_per_epoch = train_steps_per_epoch

            if validation_steps_per_epoch is None:
                # math.ceil because if val_rows is smaller than batch_size we still get at least
                # one step. float(val_rows) because val_rows / batch_size evaluates to zero before
                # math.ceil
                validation_steps = int(math.ceil(float(val_rows) / batch_size / hvd.size())) \
                    if should_validate else None
            else:
                validation_steps = validation_steps_per_epoch

            schema_fields = feature_columns + label_columns
            if sample_weight_col:
                schema_fields.append(sample_weight_col)

            # In general, make_batch_reader is faster than make_reader for reading the dataset.
            # However, we found out that make_reader performs data transformations much faster than
            # make_batch_reader with parallel worker processes. Therefore, the default reader
            # we choose is make_batch_reader unless there are data transformations.
            reader_factory_kwargs = dict()
            if transform_spec:
                reader_factory = make_reader
                reader_factory_kwargs['pyarrow_serialize'] = True
                is_batch_reader = False
            else:
                reader_factory = make_batch_reader
                is_batch_reader = True

            # Petastorm: read data from the store with the correct shard for this rank.
            # Setting num_epochs=None will cause an infinite iterator and enables ranks
            # to perform training and validation with an unequal number of samples.
            with reader_factory(remote_store.train_data_path,
                                num_epochs=None,
                                cur_shard=hvd.rank(),
                                reader_pool_type='process',
                                workers_count=train_reader_worker_count,
                                shard_count=hvd.size(),
                                hdfs_driver=PETASTORM_HDFS_DRIVER,
                                schema_fields=schema_fields,
                                transform_spec=transform_spec,
                                **reader_factory_kwargs) as train_reader:
                with reader_factory(remote_store.val_data_path,
                                    num_epochs=None,
                                    cur_shard=hvd.rank(),
                                    reader_pool_type='process',
                                    workers_count=val_reader_worker_count,
                                    shard_count=hvd.size(),
                                    hdfs_driver=PETASTORM_HDFS_DRIVER,
                                    schema_fields=schema_fields,
                                    transform_spec=transform_spec,
                                    **reader_factory_kwargs) \
                    if should_validate else empty_batch_reader() as val_reader:

                    train_data = make_dataset(train_reader,
                                              shuffle_buffer_size,
                                              is_batch_reader,
                                              shuffle=True)
                    val_data = make_dataset(val_reader, shuffle_buffer_size,
                                            is_batch_reader, shuffle=False) \
                        if val_reader else None

                    history = fit(model, train_data, val_data, steps_per_epoch,
                                  validation_steps, callbacks, verbose)

            # Dataset API usage currently displays a wall of errors upon termination.
            # This global model registration ensures clean termination.
            # Tracked in https://github.com/tensorflow/tensorflow/issues/24570
            globals()['_DATASET_FINALIZATION_HACK'] = model

            if hvd.rank() == 0:
                if is_dbfs:
                    if LooseVersion(tf.__version__) < LooseVersion("2.0.0"):
                        model.load_weights(ckpt_file)
                    else:
                        # needs to be deserialized in the with scope
                        with k.utils.custom_object_scope(custom_objects):
                            model = k.models.load_model(ckpt_file)
                    serialized_model = keras_utils.serialize_model(model)
                else:
                    with open(ckpt_file, 'rb') as f:
                        serialized_model = codec.dumps_base64(f.read())

                return history.history, serialized_model, hvd.size()
Example #17
def _serialize_keras_model(model, save_model_fn):
    """Serialize model into byte array encoded into base 64."""
    bio = io.BytesIO()
    with h5py.File(bio, 'w') as f:
        save_model_fn(model, f)
    return codec.dumps_base64(bio.getvalue())
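
Deserializing such a model reverses the steps: decode the base64 string back into the raw HDF5 bytes, wrap them in a BytesIO, and hand the open h5py file to a load function. A sketch under those assumptions (the load_model_fn parameter is illustrative):

def _deserialize_keras_model(model_bytes, load_model_fn):
    """Deserialize a model from a byte array encoded into base 64."""
    # Hypothetical inverse of _serialize_keras_model above.
    bio = io.BytesIO(codec.loads_base64(model_bytes))
    with h5py.File(bio, 'r') as f:
        return load_model_fn(f)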
Example #18
def _launch_task_servers(all_host_names, local_host_names, driver_addresses,
                         settings):
    """
    Executes the task server and service client task for registration on the
    hosts.
    :param all_host_names: list of addresses. for example,
        ['worker-0','worker-1']
        ['10.11.11.11', '10.11.11.12']
    :type all_host_names: list(string)
    :param local_host_names: names that are resolved to one of the addresses
    of local hosts interfaces. For example,
        set(['localhost', '127.0.0.1'])
    :type local_host_names: set
    :param driver_addresses: map of interfaces and their address and port for
    the service. For example:
        {
            'lo': [('127.0.0.1', 34588)],
            'docker0': [('172.122.10.1', 34588)],
            'eth0': [('11.111.33.73', 34588)]
        }
    :type driver_addresses: map
    :param settings: the object that contains the setting for running horovod
    :type settings: horovod.runner.common.util.settings.Settings
    :return:
    :rtype:
    """
    def _exec_command(command):
        host_output = io.StringIO()
        try:
            exit_code = safe_shell_exec.execute(command,
                                                stdout=host_output,
                                                stderr=host_output)
            if exit_code != 0:
                print('Launching horovod task function was not '
                      'successful:\n{host_output}'.format(
                          host_output=host_output.getvalue()))
                os._exit(exit_code)
        finally:
            host_output.close()
        return exit_code

    args_list = []
    num_hosts = len(all_host_names)
    for index in range(num_hosts):
        host_name = all_host_names[index]
        command = \
            '{python} -m horovod.runner.task_fn {index} {num_hosts} ' \
            '{driver_addresses} {settings}' \
            .format(python=sys.executable,
                    index=codec.dumps_base64(index),
                    num_hosts=codec.dumps_base64(num_hosts),
                    driver_addresses=codec.dumps_base64(driver_addresses),
                    settings=codec.dumps_base64(settings))
        if host_name not in local_host_names:
            command = get_ssh_command(command,
                                      host=host_name,
                                      port=settings.ssh_port,
                                      identity_file=settings.ssh_identity_file)

        if settings.verbose >= 2:
            print('Launching horovod task function: {}'.format(command))
        args_list.append([command])
    # Each thread will use ssh command to launch the server on one task. If an
    # error occurs in one thread, the entire process will be terminated. Otherwise,
    # threads will keep running and the ssh session -- and the task server --
    # will be bound to the thread. In case the horovod process dies, all
    # the ssh sessions and all the task servers will die as well.
    threads.execute_function_multithreaded(_exec_command,
                                           args_list,
                                           block_until_all_done=False)