예제 #1
0
def test_distributed_training_from_master_node_use_mpi_with_slot_processes_per_host(
        log_script_invocation, chmod, stat, download_and_install, check_call,
        _wait_for_worker_nodes_to_start_sshd, _start_ssh_daemon,
        _change_hostname):
    """Check the mpirun command built when process slots per host are given.

    Runs ``training.train`` for a two-host environment with MPI enabled and
    16 process slots per host, then verifies the calls recorded on the
    injected arguments (presumably mocks supplied by patch decorators that
    are not visible in this snippet -- confirm).
    """
    cluster_hosts = ['algo-1', 'algo-2']
    hyperparameters = {
        'sagemaker_use_mpi': True,
        'sagemaker_process_slots_per_host': 16,
    }
    training_env = mock_training_env(
        hosts=cluster_hosts, num_gpus=8, network_interface_name='foonet')

    training.train(training_env, hyperparameters)

    # Setup steps expected before MPI is launched.
    download_and_install.assert_called_with('s3://my/script')
    _change_hostname.assert_called_once_with('algo-1')
    _start_ssh_daemon.assert_called_once()
    _wait_for_worker_nodes_to_start_sshd.assert_called_once_with(cluster_hosts)

    # Two hosts with 16 slots each: the expected total is '-np 32'.
    expected_mpirun = [
        'mpirun', '--allow-run-as-root',
        '--host', 'algo-1:16,algo-2:16',
        '-mca', 'btl_tcp_if_include', 'foonet',
        '-mca', 'oob_tcp_if_include', 'foonet',
        '-mca', 'btl', '^openib',
        '-x', 'PATH',
        '-x', 'LD_LIBRARY_PATH',
        '-x', 'LD_PRELOAD=/libchangehostname.so',
        '-mca', 'orte_abort_on_non_zero_status', '1',
        '-x', 'NCCL_DEBUG=INFO',
        '-x', 'NCCL_SOCKET_IFNAME=foonet',
        '-np', '32',
        '/mpi_script.sh',
    ]
    check_call.assert_called_once_with(expected_mpirun)

    # Assumes `stat` is a MagicMock, so __or__() yields the same configured
    # value that an `st_mode | flag` expression would -- confirm.
    expected_mode = stat().st_mode.__or__()
    chmod.assert_called_with('/mpi_script.sh', expected_mode)
def test_distributed_training_from_worker_node_use_mpi_with_sagemaker_additional_mpi_options(
        log_script_invocation, socket, chmod, stat, download_and_install, check_call, sleep,
        _can_connect, popen, system):
    """Check that additional MPI options are placed on the mpirun command line.

    Runs ``training.train`` with MPI enabled, 16 process slots per host and
    ``sagemaker_additional_mpi_options`` set, then verifies the recorded
    calls and the contents written to the MPI script.

    NOTE(review): the test name says "worker node", but no ``current_host``
    is passed and the asserted hostname change is for 'algo-1' -- confirm
    which node this is meant to exercise.
    """
    cluster_hosts = ['algo-1', 'algo-2']
    hyperparameters = {
        'sagemaker_use_mpi': True,
        'sagemaker_process_slots_per_host': 16,
        'sagemaker_additional_mpi_options': '-x MY_ENVIRONMENT_VARIABLE',
    }
    training_env = mock_training_env(
        hosts=cluster_hosts, num_gpus=8, network_interface_name='foonet')

    training.train(training_env, hyperparameters)

    # Setup steps expected before MPI is launched.
    download_and_install.assert_called_with('s3://my/script')
    system.assert_called_once_with('change-hostname.sh algo-1')
    popen.assert_called_once_with(['/usr/sbin/sshd', '-D'])
    _can_connect.assert_called_with('algo-2', 22, socket())

    # The extra option tokens are expected after '-np 32' and before the script.
    expected_mpirun = [
        'mpirun', '--allow-run-as-root',
        '--host', 'algo-1:16,algo-2:16',
        '-mca', 'btl_tcp_if_include', 'foonet',
        '-mca', 'oob_tcp_if_include', 'foonet',
        '-mca', 'btl', '^openib',
        '-x', 'PATH',
        '-x', 'LD_LIBRARY_PATH',
        '-x', 'LD_PRELOAD=/libchangehostname.so',
        '-mca', 'orte_abort_on_non_zero_status', '1',
        '-x', 'NCCL_DEBUG=INFO',
        '-x', 'NCCL_SOCKET_IFNAME=foonet',
        '-np', '32',
        '-x', 'MY_ENVIRONMENT_VARIABLE',
        '/mpi_script.sh',
    ]
    check_call.assert_called_once_with(expected_mpirun)

    # Assumes `stat` is a MagicMock, so __or__() yields the same configured
    # value that an `st_mode | flag` expression would -- confirm.
    expected_mode = stat().st_mode.__or__()
    chmod.assert_called_with('/mpi_script.sh', expected_mode)

    # Presumably `open` is patched by a decorator outside this snippet -- confirm.
    expected_script = """#!/usr/bin/env bash
touch /mpi_is_running
%s -m mpi4py -m imagenet
EXIT_CODE=$?
touch /mpi_is_finished
exit ${EXIT_CODE}
""" % sys.executable
    open().write.assert_called_with(expected_script)
예제 #3
0
def test_single_machine(run_entry, download_and_install):
    """Check that a default environment runs the entry point with the process runner.

    Calls ``training.train`` with an environment built from
    ``mock_training_env()`` defaults and empty hyperparameters, then verifies
    the arguments recorded on ``download_and_install`` and ``run_entry``.
    """
    training_env = mock_training_env()
    training.train(training_env, {})

    script_uri = 's3://my/script'
    download_and_install.assert_called_with(script_uri)

    expected_keywords = {
        'runner': framework.runner.ProcessRunnerType,
        'extra_opts': {},
    }
    run_entry.assert_called_with(
        script_uri,
        'imagenet',
        training_env.to_cmd_args(),
        training_env.to_env_vars(),
        **expected_keywords)
def test_distributed_training_from_worker_node_use_mpi(
        log_script_invocation, isfile, chmod, stat, download_and_install, popen):
    """Check the calls recorded when MPI training runs with current host 'algo-2'.

    Runs ``training.train`` with ``sagemaker_use_mpi`` enabled for the second
    of two hosts and verifies the download, the sshd launch, the check for
    the finished marker file and the chmod of the MPI script.
    """
    cluster_hosts = ['algo-1', 'algo-2']
    training_env = mock_training_env(current_host='algo-2', hosts=cluster_hosts)
    hyperparameters = {'sagemaker_use_mpi': True}

    training.train(training_env, hyperparameters)

    download_and_install.assert_called_with('s3://my/script')
    sshd_command = ['/usr/sbin/sshd', '-D']
    popen.assert_called_with(sshd_command)

    isfile.assert_called_with('/mpi_is_finished')

    # Assumes `stat` is a MagicMock, so __or__() yields the same configured
    # value that an `st_mode | flag` expression would -- confirm.
    expected_mode = stat().st_mode.__or__()
    chmod.assert_called_once_with('/mpi_script.sh', expected_mode)
예제 #5
0
def test_distributed_training_from_worker_node(isfile, chmod, stat,
                                               download_and_install,
                                               _start_ssh_daemon, system):
    """Check the calls recorded when training runs with current host 'algo-2'.

    Runs ``training.train`` with empty hyperparameters for the second of two
    hosts and verifies the download, the hostname change command, the SSH
    daemon start, the check for the finished marker file and the chmod of
    the MPI script.
    """
    cluster_hosts = ['algo-1', 'algo-2']
    training_env = mock_training_env(current_host='algo-2', hosts=cluster_hosts)

    training.train(training_env, {})

    download_and_install.assert_called_with('s3://my/script')
    system.assert_called_once_with('change-hostname.sh algo-2')
    _start_ssh_daemon.assert_called_once()

    isfile.assert_called_with('/mpi_is_finished')

    # Assumes `stat` is a MagicMock, so __or__() yields the same configured
    # value that an `st_mode | flag` expression would -- confirm.
    expected_mode = stat().st_mode.__or__()
    chmod.assert_called_once_with('/mpi_script.sh', expected_mode)
예제 #6
0
def test_distributed_training_from_master_node(
        _create_mpi_script, _run_mpi_on_all_nodes,
        _wait_for_worker_nodes_to_start_sshd, _start_ssh_daemon,
        _change_hostname):
    """Check the helper calls recorded for a two-host run with default current host.

    Runs ``training.train`` with empty hyperparameters and verifies that each
    injected helper was called with the expected arguments.
    """
    cluster_hosts = ['algo-1', 'algo-2']
    hyperparameters = {}
    training_env = mock_training_env(hosts=cluster_hosts)

    training.train(training_env, hyperparameters)

    _create_mpi_script.assert_called_with(training_env)
    _change_hostname.assert_called_once_with('algo-1')
    _start_ssh_daemon.assert_called_once()
    _wait_for_worker_nodes_to_start_sshd.assert_called_once_with(cluster_hosts)
    # A fresh empty dict compares equal to the one passed to train above.
    _run_mpi_on_all_nodes.assert_called_once_with(training_env, {})
예제 #7
0
def test_distributed_training(run_entry, download_and_install):
    """Check that a two-host environment runs the entry point with the MPI runner.

    Calls ``training.train`` with empty hyperparameters and verifies the
    arguments recorded on ``download_and_install`` and ``run_entry``; both
    MPI process-count options are expected to be ``None``.
    """
    cluster_hosts = ['algo-1', 'algo-2']
    training_env = mock_training_env(hosts=cluster_hosts)
    training.train(training_env, {})

    script_uri = 's3://my/script'
    download_and_install.assert_called_with(script_uri)

    expected_extra_opts = {
        'sagemaker_mpi_num_of_processes_per_host': None,
        'sagemaker_mpi_num_processes': None,
    }
    run_entry.assert_called_with(
        script_uri,
        'imagenet',
        training_env.to_cmd_args(),
        training_env.to_env_vars(),
        runner=framework.runner.MPIRunnerType,
        extra_opts=expected_extra_opts)
예제 #8
0
def test_single_machine(run_module):
    """Check that a default environment triggers ``run_module`` with the expected arguments.

    Calls ``training.train`` with an environment built from
    ``mock_training_env()`` defaults and empty hyperparameters.
    """
    training_env = mock_training_env()
    training.train(training_env, {})

    expected_args = (
        's3://my/script',
        training_env.to_cmd_args(),
        training_env.to_env_vars(),
        'imagenet',
    )
    run_module.assert_called_with(*expected_args)