def test_mpi_worker_run_no_wait(popen, ssh_client):
    """Check that ``WorkerRunner.run(wait=False)`` launches sshd and skips SSH.

    ``popen`` and ``ssh_client`` are used as mock objects here; what they
    patch is not visible in this block -- confirm against the surrounding
    decorators/fixtures.
    """
    runner_kwargs = {
        'user_entry_point': 'train.sh',
        'args': ['-v', '--lr', '35'],
        'env_vars': {'LD_CONFIG_PATH': '/etc/ld'},
        'master_hostname': 'algo-1',
    }
    worker = _mpi.WorkerRunner(**runner_kwargs)

    worker.run(wait=False)

    # With wait=False the SSH client mock must never have been called.
    ssh_client.assert_not_called()
    # The sshd command line is the expected popen invocation.
    popen.assert_called_with(['/usr/sbin/sshd', '-D'])
def test_mpi_worker_run_no_wait(popen, ssh_client):
    """Verify a non-waiting worker run starts sshd and never touches SSH.

    Both arguments are treated as mocks by the assertions below; their patch
    targets are defined outside this block (TODO confirm which callables they
    replace).
    """
    worker_settings = dict(
        user_entry_point="train.sh",
        args=["-v", "--lr", "35"],
        env_vars={"LD_CONFIG_PATH": "/etc/ld"},
        master_hostname="algo-1",
    )
    worker = _mpi.WorkerRunner(**worker_settings)

    worker.run(wait=False)

    # No SSH client may be created when the worker is told not to wait.
    ssh_client.assert_not_called()
    # sshd must still have been launched through popen.
    popen.assert_called_with(["/usr/sbin/sshd", "-D"])
def test_mpi_worker_run(popen, policy, process_iter, wait_procs, ssh_client):
    """Check the full (waiting) ``WorkerRunner.run()`` flow against mocks.

    The test expects the worker to start sshd, open and close an SSH
    connection to the master host, and wait on the process named ``orted``
    that the fake process listing reports. All five arguments are used as
    mocks; their patch targets are not visible in this block.
    """
    orted_process = MagicMock(info={'name': 'orted'})

    def _list_processes(attrs):
        # Whatever attributes are requested, report the single fake process.
        return [orted_process]

    process_iter.side_effect = _list_processes

    runner_kwargs = {
        'user_entry_point': 'train.sh',
        'args': ['-v', '--lr', '35'],
        'env_vars': {'LD_CONFIG_PATH': '/etc/ld'},
        'master_hostname': 'algo-1',
    }
    worker = _mpi.WorkerRunner(**runner_kwargs)

    worker.run()

    # SSH client lifecycle: host keys, key policy, connect to master, close.
    ssh_client().load_system_host_keys.assert_called()
    ssh_client().set_missing_host_key_policy.assert_called_with(policy())
    ssh_client().connect.assert_called_with('algo-1', port=22)
    ssh_client().close.assert_called()

    # The worker must wait on exactly the process the listing returned.
    wait_procs.assert_called_with([orted_process])
    popen.assert_called_with(['/usr/sbin/sshd', '-D'])
def _get_by_runner_type(
    identifier, user_entry_point=None, args=None, env_vars=None, extra_opts=None
):
    """Build the runner object that matches ``identifier``.

    Values not supplied by the caller are taken from the training environment
    returned by ``sagemaker_containers.training_env()``. The fallback uses
    ``or``, so any falsy override (``None``, empty string, empty list/dict)
    is replaced by the environment's value.

    Args:
        identifier: A ``RunnerType`` member; compared by identity.
        user_entry_point: Entry point override; defaults to
            ``env.user_entry_point``.
        args: Command-line arguments override; defaults to
            ``env.to_cmd_args()``.
        env_vars: Environment variables override; defaults to
            ``env.to_env_vars()``.
        extra_opts: Optional MPI options passed to ``_mpi_param_value`` as the
            first lookup source; only used for the MPI master runner.

    Returns:
        ``_mpi.MasterRunner`` for ``RunnerType.MPI`` when ``env.is_master`` is
        truthy, ``_mpi.WorkerRunner`` for ``RunnerType.MPI`` otherwise, or
        ``_process.ProcessRunner`` for ``RunnerType.Process``.

    Raises:
        ValueError: If ``identifier`` is neither ``RunnerType.MPI`` nor
            ``RunnerType.Process``.
    """
    env = sagemaker_containers.training_env()

    # Falsy overrides fall back to the values from the training environment.
    if not user_entry_point:
        user_entry_point = env.user_entry_point
    if not args:
        args = env.to_cmd_args()
    if not env_vars:
        env_vars = env.to_env_vars()

    if identifier is RunnerType.MPI:
        if env.is_master:
            mpi_args = extra_opts or {}

            # Default to one process per GPU, or a single process on CPU-only hosts.
            if env.num_gpus > 0:
                default_processes_per_host = env.num_gpus
            else:
                default_processes_per_host = 1

            processes_per_host = _mpi_param_value(
                mpi_args, env, _params.MPI_PROCESSES_PER_HOST, default_processes_per_host
            )
            num_processes = _mpi_param_value(mpi_args, env, _params.MPI_NUM_PROCESSES)
            custom_mpi_options = _mpi_param_value(
                mpi_args, env, _params.MPI_CUSTOM_OPTIONS, ""
            )

            return _mpi.MasterRunner(
                user_entry_point,
                args,
                env_vars,
                env.master_hostname,
                env.hosts,
                processes_per_host,
                custom_mpi_options,
                env.network_interface_name,
                num_processes=num_processes,
            )

        # MPI on a non-master host: the worker only needs the master's hostname.
        return _mpi.WorkerRunner(user_entry_point, args, env_vars, env.master_hostname)

    if identifier is RunnerType.Process:
        return _process.ProcessRunner(user_entry_point, args, env_vars)

    raise ValueError("Invalid identifier %s" % identifier)
def _get_by_runner_type(identifier):
    """Build the runner object that matches ``identifier``.

    Everything the runner needs (entry point, command-line arguments,
    environment variables, host information) is read from the training
    environment returned by ``sagemaker_containers.training_env()``.

    Args:
        identifier: A ``RunnerType`` member; compared by identity.

    Returns:
        ``_mpi.MasterRunner`` for ``RunnerType.MPI`` when ``env.is_master`` is
        truthy, ``_mpi.WorkerRunner`` for ``RunnerType.MPI`` otherwise, or
        ``_process.ProcessRunner`` for ``RunnerType.Process``.

    Raises:
        ValueError: If ``identifier`` is neither ``RunnerType.MPI`` nor
            ``RunnerType.Process``.
    """
    env = sagemaker_containers.training_env()

    if identifier is RunnerType.MPI:
        if env.is_master:
            # MPI tuning knobs come from the additional framework parameters,
            # defaulting to one process per host and no custom options.
            framework_params = env.additional_framework_parameters
            processes_per_host = framework_params.get(_params.MPI_PROCESSES_PER_HOST, 1)
            custom_mpi_options = framework_params.get(_params.MPI_CUSTOM_OPTIONS, '')

            return _mpi.MasterRunner(
                env.user_entry_point,
                env.to_cmd_args(),
                env.to_env_vars(),
                env.master_hostname,
                env.hosts,
                processes_per_host,
                custom_mpi_options,
                env.network_interface_name,
            )

        # MPI on a non-master host: the worker only needs the master's hostname.
        return _mpi.WorkerRunner(
            env.user_entry_point,
            env.to_cmd_args(),
            env.to_env_vars(),
            env.master_hostname,
        )

    if identifier is RunnerType.Process:
        return _process.ProcessRunner(
            env.user_entry_point, env.to_cmd_args(), env.to_env_vars()
        )

    raise ValueError('Invalid identifier %s' % identifier)
def test_mpi_worker_run(popen, policy, process_iter, wait_procs, ssh_client, sleep):
    """Check the full (waiting) ``WorkerRunner.run()`` flow against mocks.

    The test expects the worker to start sshd, open and close an SSH
    connection to the master host, and wait on the process named ``orted``
    that the fake process listing reports. All arguments are used as mocks
    whose patch targets are not visible in this block; ``sleep`` is not
    referenced in the body (presumably patched so the run does not really
    pause -- TODO confirm).
    """
    orted_process = MagicMock(info={"name": "orted"})

    def _list_processes(attrs):
        # Whatever attributes are requested, report the single fake process.
        return [orted_process]

    process_iter.side_effect = _list_processes

    worker_settings = dict(
        user_entry_point="train.sh",
        args=["-v", "--lr", "35"],
        env_vars={"LD_CONFIG_PATH": "/etc/ld"},
        master_hostname="algo-1",
    )
    worker = _mpi.WorkerRunner(**worker_settings)

    worker.run()

    # SSH client lifecycle: host keys, key policy, connect to master, close.
    ssh_client().load_system_host_keys.assert_called()
    ssh_client().set_missing_host_key_policy.assert_called_with(policy())
    ssh_client().connect.assert_called_with("algo-1", port=22)
    ssh_client().close.assert_called()

    # The worker must wait on exactly the process the listing returned.
    wait_procs.assert_called_with([orted_process])
    popen.assert_called_with(["/usr/sbin/sshd", "-D"])
def _get_by_runner_type(
    identifier, user_entry_point=None, args=None, env_vars=None, extra_opts=None
):
    """Build the runner object that matches ``identifier``.

    Values not supplied by the caller are taken from the training environment
    returned by ``sagemaker_containers.training_env()``. The fallback uses
    ``or``, so any falsy override (``None``, empty string, empty list/dict)
    is replaced by the environment's value.

    Args:
        identifier: A ``RunnerType`` member; compared by identity.
        user_entry_point: Entry point override; defaults to
            ``env.user_entry_point``.
        args: Command-line arguments override; defaults to
            ``env.to_cmd_args()``.
        env_vars: Environment variables override; defaults to
            ``env.to_env_vars()``.
        extra_opts: Optional MPI options passed to ``_mpi_param_value`` as the
            first lookup source; only used for the MPI master runner.

    Returns:
        ``_mpi.MasterRunner`` for ``RunnerType.MPI`` when ``env.is_master`` is
        truthy, ``_mpi.WorkerRunner`` for ``RunnerType.MPI`` otherwise, or
        ``_process.ProcessRunner`` for ``RunnerType.Process``.

    Raises:
        ValueError: If ``identifier`` is neither ``RunnerType.MPI`` nor
            ``RunnerType.Process``.
    """
    env = sagemaker_containers.training_env()

    # Falsy overrides fall back to the values from the training environment.
    if not user_entry_point:
        user_entry_point = env.user_entry_point
    if not args:
        args = env.to_cmd_args()
    if not env_vars:
        env_vars = env.to_env_vars()

    if identifier is RunnerType.MPI:
        if env.is_master:
            mpi_args = extra_opts or {}

            # One process per host unless the MPI parameters say otherwise.
            processes_per_host = _mpi_param_value(
                mpi_args, env, _params.MPI_PROCESSES_PER_HOST, 1
            )
            num_processes = _mpi_param_value(mpi_args, env, _params.MPI_NUM_PROCESSES)
            custom_mpi_options = _mpi_param_value(
                mpi_args, env, _params.MPI_CUSTOM_OPTIONS, ''
            )

            return _mpi.MasterRunner(
                user_entry_point,
                args,
                env_vars,
                env.master_hostname,
                env.hosts,
                processes_per_host,
                custom_mpi_options,
                env.network_interface_name,
                num_processes=num_processes,
            )

        # MPI on a non-master host: the worker only needs the master's hostname.
        return _mpi.WorkerRunner(user_entry_point, args, env_vars, env.master_hostname)

    if identifier is RunnerType.Process:
        return _process.ProcessRunner(user_entry_point, args, env_vars)

    raise ValueError('Invalid identifier %s' % identifier)