def test_mpi_run_on_large_cluster(self):
    """Once num_hosts reaches the large-cluster threshold, mpi_run must add the rsh tree-spawn/concurrency flags."""
    if _get_mpi_implementation_flags(False)[0] is None:
        self.skipTest("MPI is not available")

    command = ['cmd']
    settings = copy.copy(self.minimal_settings)
    settings.num_hosts = large_cluster_threshold
    exec_mock = MagicMock(return_value=0)
    mpi_run(settings, None, {}, command, run_func=exec_mock)

    # Rebuild the flag set mpi_run should have produced for a large cluster.
    mpi_flags, binding_args = _get_mpi_implementation_flags(False)
    self.assertIsNotNone(mpi_flags)
    mpi_flags += [
        '-mca plm_rsh_no_tree_spawn true',
        '-mca plm_rsh_num_concurrent {}'.format(settings.num_hosts),
    ]

    expected_cmd = ('mpirun '
                    '--allow-run-as-root --tag-output '
                    '-np 2 -H host '
                    '{binding_args} '
                    '{mpi_flags} '
                    'cmd').format(binding_args=' '.join(binding_args),
                                  mpi_flags=' '.join(mpi_flags))
    exec_mock.assert_called_once_with(command=expected_cmd,
                                      env={},
                                      stdout=None,
                                      stderr=None)
def test_mpi_run_minimal(self):
    """Run mpi_run with minimal settings and verify the mpirun command handed to the executor."""
    if not mpi_available():
        self.skipTest("MPI is not available")

    command = ['cmd']
    settings = self.minimal_settings

    # Replace the implementation-flag probe with deterministic mock values.
    fake_impl_flags = lambda tcp: (["--mock-mpi-impl-flags"],
                                   ["--mock-mpi-binding-args"])

    with mock.patch("horovod.run.mpi_run._get_mpi_implementation_flags",
                    side_effect=fake_impl_flags), \
         mock.patch("horovod.run.mpi_run.safe_shell_exec.execute",
                    return_value=0) as execute:
        mpi_run(settings, None, {}, command)

        # call the mocked _get_mpi_implementation_flags method
        mpi_flags, binding_args = \
            horovod.run.mpi_run._get_mpi_implementation_flags(False)
        self.assertIsNotNone(mpi_flags)

        expected_cmd = ('mpirun '
                        '--allow-run-as-root --tag-output '
                        '-np 2 -H host '
                        '{binding_args} '
                        '{mpi_flags} '
                        'cmd').format(binding_args=' '.join(binding_args),
                                      mpi_flags=' '.join(mpi_flags))
        execute.assert_called_once_with(expected_cmd,
                                        env={'PATH': os.environ.get('PATH')},
                                        stdout=None,
                                        stderr=None)
def test_mpi_run_full(self):
    """Run mpi_run with every option populated and verify both the probe call and the full mpirun command line."""
    if not mpi_available():
        self.skipTest("MPI is not available")

    cmd = ['cmd', 'arg1', 'arg2']
    nics = ['eth0', 'eth1']
    env = {'env1': 'val1', 'env2': 'val2'}
    stdout = '<stdout>'
    stderr = '<stderr>'
    tmout = timeout.Timeout(5, message='Timed out waiting for something.')
    # Populate every Settings field so each one shows up in the expected command.
    settings = hvd_settings.Settings(
        verbose=0,
        ssh_port=1022,
        extra_mpi_args='>mpi-extra args go here<',
        binding_args='>binding args go here<',
        key=secret.make_secret_key(),
        start_timeout=tmout,
        num_hosts=1,
        num_proc=1,
        hosts='>host names go here<',
        output_filename='>output filename goes here<',
        run_func_mode=True)

    # Deterministic stand-in for the MPI implementation probe.
    def mpi_impl_flags(tcp, env=None):
        return ["--mock-mpi-impl-flags"], []

    with mock.patch("horovod.run.mpi_run._get_mpi_implementation_flags",
                    side_effect=mpi_impl_flags) as impl:
        with mock.patch("horovod.run.mpi_run.safe_shell_exec.execute",
                        return_value=0) as execute:
            mpi_run(settings, nics, env, cmd, stdout=stdout, stderr=stderr)

            # assert call on _get_mpi_implementation_flags
            impl.assert_called_once_with(None, env=env)

            # call the mocked _get_mpi_implementation_flags method ourselves
            mpi_flags, _ = horovod.run.mpi_run._get_mpi_implementation_flags(
                False)
            self.assertIsNotNone(mpi_flags)
            expected_command = (
                'mpirun '
                '--allow-run-as-root --tag-output '
                '-np 1 -H >host names go here< '
                '>binding args go here< '
                '{mpi_flags} '
                '-mca plm_rsh_args "-p 1022" '
                '-mca btl_tcp_if_include eth0,eth1 -x NCCL_SOCKET_IFNAME=eth0,eth1 '
                '--output-filename >output filename goes here< '
                '-x env1 -x env2 '
                '>mpi-extra args go here< '
                'cmd arg1 arg2').format(mpi_flags=' '.join(mpi_flags))
            # The caller's env must be forwarded, plus PATH from the system env.
            expected_env = {
                'env1': 'val1',
                'env2': 'val2',
                'PATH': os.environ.get('PATH')
            }
            execute.assert_called_once_with(expected_command,
                                            env=expected_env,
                                            stdout=stdout,
                                            stderr=stderr)
def _launch_job(args, remote_host_names, settings, common_intfs, command):
    """Dispatch the job to Gloo or MPI based on the launcher flags in args; auto-detect when neither is set."""
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)
    driver_ip = _get_driver_ip(common_intfs)
    verbose = settings.verbose >= 2

    # Small closures so each launch site reads identically.
    def start_gloo():
        gloo_run(settings, remote_host_names, common_intfs, env, driver_ip,
                 command)

    def start_mpi():
        mpi_run(settings, common_intfs, env, command)

    if args.use_gloo:
        # Gloo was explicitly requested; fail loudly when unavailable.
        if not gloo_built(verbose=verbose):
            raise ValueError(
                'Gloo support has not been built. If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
            )
        start_gloo()
    elif args.use_mpi:
        # MPI was explicitly requested; fail loudly when unavailable.
        if not mpi_built(verbose=verbose):
            raise ValueError(
                'MPI support has not been built. If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        start_mpi()
    elif mpi_built(verbose=verbose):
        # No launcher requested: prefer MPI when built, else fall back to Gloo.
        start_mpi()
    elif gloo_built(verbose=verbose):
        start_gloo()
    else:
        raise ValueError(
            'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
            'either MPI is installed (MPI) or CMake is installed (Gloo).')
def test_mpi_run_with_non_zero_exit(self):
    """mpi_run must raise RuntimeError carrying the exit code when run_func returns non-zero.

    Fix: dropped the unused `as e` binding on pytest.raises.
    """
    if _get_mpi_implementation_flags(False)[0] is None:
        self.skipTest("MPI is not available")

    cmd = ['cmd']
    settings = self.minimal_settings
    run_func = MagicMock(return_value=1)

    # The exit code from run_func must be surfaced in the error message.
    with pytest.raises(RuntimeError,
                       match="^mpirun failed with exit code 1$"):
        mpi_run(settings, None, {}, cmd, run_func=run_func)
def test_mpi_run_full(self):
    """Run mpi_run with every option populated and verify the full command passed to run_func."""
    if _get_mpi_implementation_flags(False)[0] is None:
        self.skipTest("MPI is not available")

    cmd = ['cmd', 'arg1', 'arg2']
    common_intfs = ['eth0', 'eth1']
    env = {'env1': 'val1', 'env2': 'val2'}
    stdout = '<stdout>'
    stderr = '<stderr>'
    tmout = timeout.Timeout(5, message='Timed out waiting for something.')
    # Populate every Settings field so each one shows up in the expected command.
    settings = hvd_settings.Settings(
        verbose=0,
        ssh_port=1022,
        extra_mpi_args='>mpi-extra args go here<',
        binding_args='>binding args go here<',
        key=secret.make_secret_key(),
        timeout=tmout,
        num_hosts=1,
        num_proc=1,
        hosts='>host names go here<',
        output_filename='>output filename goes here<',
        run_func_mode=True)
    run_func = MagicMock(return_value=0)
    mpi_run(settings, common_intfs, env, cmd,
            stdout=stdout, stderr=stderr, run_func=run_func)

    # Rebuild the flags the real implementation would have used.
    mpi_flags, _ = _get_mpi_implementation_flags(False)
    self.assertIsNotNone(mpi_flags)
    expected_command = (
        'mpirun '
        '--allow-run-as-root --tag-output '
        '-np 1 -H >host names go here< '
        '>binding args go here< '
        '{mpi_flags} '
        '-mca plm_rsh_args "-p 1022" '
        '-mca btl_tcp_if_include eth0,eth1 -x NCCL_SOCKET_IFNAME=eth0,eth1 '
        '--output-filename >output filename goes here< '
        '-x env1 -x env2 '
        '>mpi-extra args go here< '
        'cmd arg1 arg2').format(mpi_flags=' '.join(mpi_flags))
    expected_env = {'env1': 'val1', 'env2': 'val2'}
    run_func.assert_called_once_with(command=expected_command,
                                     env=expected_env,
                                     stdout=stdout,
                                     stderr=stderr)
def test_mpi_run_with_non_zero_exit(self):
    """A non-zero exit code from the executor must surface as a RuntimeError."""
    if not mpi_available():
        self.skipTest("MPI is not available")

    command = ['cmd']
    settings = self.minimal_settings

    # Single combined context: no-op impl flags, failing executor,
    # and the expected exception.
    with mock.patch("horovod.run.mpi_run._get_mpi_implementation_flags",
                    side_effect=lambda tcp: ([], [])), \
         mock.patch("horovod.run.mpi_run.safe_shell_exec.execute",
                    return_value=1), \
         pytest.raises(RuntimeError,
                       match="^mpirun failed with exit code 1$"):
        mpi_run(settings, None, {}, command)
def test_mpi_run_on_large_cluster(self):
    """Above the large-cluster threshold mpi_run must add the rsh no-tree-spawn/concurrency flags."""
    if not mpi_available():
        self.skipTest("MPI is not available")

    cmd = ['cmd']
    settings = copy.copy(self.minimal_settings)
    settings.num_hosts = large_cluster_threshold

    # Deterministic stand-in for the MPI implementation probe.
    def mpi_impl_flags(tcp, env=None):
        return ["--mock-mpi-impl-flags"], ["--mock-mpi-binding-args"]

    with mock.patch("horovod.run.mpi_run._get_mpi_implementation_flags",
                    side_effect=mpi_impl_flags):
        with mock.patch("horovod.run.mpi_run.safe_shell_exec.execute",
                        return_value=0) as execute:
            mpi_run(settings, None, {}, cmd)

            # call the mocked _get_mpi_implementation_flags method
            mpi_flags, binding_args = horovod.run.mpi_run._get_mpi_implementation_flags(
                False)
            self.assertIsNotNone(mpi_flags)
            # Large clusters disable tree spawn and cap concurrent rsh sessions.
            mpi_flags.append('-mca plm_rsh_no_tree_spawn true')
            mpi_flags.append('-mca plm_rsh_num_concurrent {}'.format(
                settings.num_hosts))
            expected_cmd = ('mpirun '
                            '--allow-run-as-root --tag-output '
                            '-np 2 -H localhost:2 '
                            '{binding_args} '
                            '{mpi_flags} '
                            'cmd').format(
                binding_args=' '.join(binding_args),
                mpi_flags=' '.join(mpi_flags))

            # remove PYTHONPATH from execute's env
            # we cannot know the exact value of that env variable
            # so we cannot test it through execute.assert_called_once_with
            # NOTE: execute.call_args.kwargs requires Python 3.8+.
            self.assertIn('env', execute.call_args.kwargs)
            self.assertIn('PYTHONPATH', execute.call_args.kwargs['env'])
            actual_python_path = execute.call_args.kwargs['env'].pop(
                'PYTHONPATH')
            # NOTE(review): this asserts the popped PYTHONPATH is a substring of
            # the joined sys.path — the containment direction looks reversed;
            # confirm intent.
            self.assertIn(actual_python_path, os.pathsep.join(sys.path))
            expected_env = {'PATH': os.environ.get('PATH')}
            execute.assert_called_once_with(expected_cmd,
                                            env=expected_env,
                                            stdout=None,
                                            stderr=None)
def do_test_mpi_run_env_override(self, sysenv, argenv, env_var, expected):
    """Run mpi_run under a patched system env and assert the value env_var ends up with in execute's env."""
    if not mpi_available():
        self.skipTest("MPI is not available")

    command = ['cmd']
    settings = self.minimal_settings

    # Deterministic stand-in for the MPI implementation probe.
    def fake_impl_flags(tcp, env=None):
        return ["--mock-mpi-impl-flags"], ["--mock-mpi-binding-args"]

    with mock.patch("horovod.run.mpi_run._get_mpi_implementation_flags",
                    side_effect=fake_impl_flags):
        with mock.patch("horovod.run.mpi_run.safe_shell_exec.execute",
                        return_value=0) as execute:
            with override_env(sysenv):
                mpi_run(settings, None, argenv, command)

                # assert the env variable in the execute's env
                self.assertIn('env', execute.call_args.kwargs)
                self.assertEqual(
                    execute.call_args.kwargs['env'].get(env_var), expected)
def _launch_job(args, remote_host_names, settings, nics, command):
    """Dispatch the job to Gloo, MPI or jsrun based on launcher flags; auto-detect when none is given."""
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)

    if args.use_gloo:
        if not gloo_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'Gloo support has not been built. If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
            )
        gloo_run(settings, remote_host_names, nics, env,
                 network._get_driver_ip(nics), command)
    elif args.use_mpi:
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built. If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        mpi_run(settings, nics, env, command)
    elif args.use_jsrun:
        # jsrun also requires MPI support and must run inside an LSF job.
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built. If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        if not lsf.LSFUtils.using_lsf():
            raise ValueError(
                'Horovod did not detect an LSF job. The jsrun launcher can only be used in that environment. '
                'Please, pick a different launcher for other environments.')
        js_run(settings, nics, env, command)
    else:
        # No launcher requested: prefer MPI (jsrun when under LSF), else Gloo.
        if mpi_built(verbose=(settings.verbose >= 2)):
            if lsf.LSFUtils.using_lsf() and is_jsrun_installed():
                js_run(settings, nics, env, command)
            else:
                mpi_run(settings, nics, env, command)
        elif gloo_built(verbose=(settings.verbose >= 2)):
            gloo_run(settings, remote_host_names, nics, env,
                     network._get_driver_ip(nics), command)
        else:
            raise ValueError(
                'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                'either MPI is installed (MPI) or CMake is installed (Gloo).')
def test_mpi_run_with_os_environ(self):
    """Passing os.environ (not a plain dict) as env must be rejected by mpi_run."""
    if not mpi_available():
        self.skipTest("MPI is not available")

    command = ['cmd']
    settings = self.minimal_settings

    # Deterministic stand-in for the MPI implementation probe.
    fake_impl_flags = lambda tcp, env=None: (["--mock-mpi-impl-flags"],
                                             ["--mock-mpi-binding-args"])

    with mock.patch("horovod.run.mpi_run._get_mpi_implementation_flags",
                    side_effect=fake_impl_flags), \
         mock.patch("horovod.run.mpi_run.safe_shell_exec.execute",
                    return_value=0), \
         pytest.raises(
             Exception,
             match="^env argument must be a dict, not <class 'os._Environ'>: "):
        mpi_run(settings, None, os.environ, command)
def run_controller(use_gloo, gloo_run, use_mpi, mpi_run, use_jsrun, js_run,
                   verbosity):
    """Invoke the launcher callback selected by the use_* flags, or auto-detect one.

    Args:
        use_gloo, use_mpi, use_jsrun: launcher selection flags.
        gloo_run, mpi_run, js_run: zero-argument callables that start the job.
        verbosity: optional int; >= 2 enables verbose build checks.

    Raises:
        ValueError: if the selected (or any) controller support is not built,
            or jsrun is requested outside an LSF job.
    """
    # keep logic in sync with is_gloo_used(...)
    verbose = verbosity is not None and verbosity >= 2

    if use_gloo:
        if not gloo_built(verbose=verbose):
            raise ValueError(
                'Gloo support has not been built. If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
            )
        gloo_run()
    elif use_mpi:
        if not mpi_built(verbose=verbose):
            raise ValueError(
                'MPI support has not been built. If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        mpi_run()
    elif use_jsrun:
        # jsrun also requires MPI support and must run inside an LSF job.
        if not mpi_built(verbose=verbose):
            raise ValueError(
                'MPI support has not been built. If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        if not lsf.LSFUtils.using_lsf():
            raise ValueError(
                'Horovod did not detect an LSF job. The jsrun launcher can only be used in that environment. '
                'Please, pick a different launcher for other environments.')
        js_run()
    else:
        # No launcher requested: prefer MPI (jsrun when under LSF), else Gloo.
        if mpi_built(verbose=verbose):
            if lsf.LSFUtils.using_lsf() and is_jsrun_installed():
                js_run()
            else:
                mpi_run()
        elif gloo_built(verbose=verbose):
            gloo_run()
        else:
            raise ValueError(
                'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                'either MPI is installed (MPI) or CMake is installed (Gloo).')
def test_mpi_run_minimal(self):
    """Run mpi_run with minimal settings and verify the mpirun command handed to the executor."""
    if not mpi_available():
        self.skipTest("MPI is not available")

    cmd = ['cmd']
    settings = self.minimal_settings

    # Deterministic stand-in for the MPI implementation probe.
    def mpi_impl_flags(tcp, env=None):
        return ["--mock-mpi-impl-flags"], ["--mock-mpi-binding-args"]

    with mock.patch("horovod.run.mpi_run._get_mpi_implementation_flags",
                    side_effect=mpi_impl_flags):
        with mock.patch("horovod.run.mpi_run.safe_shell_exec.execute",
                        return_value=0) as execute:
            mpi_run(settings, None, {}, cmd)

            # call the mocked _get_mpi_implementation_flags method
            mpi_flags, binding_args = horovod.run.mpi_run._get_mpi_implementation_flags(
                False)
            self.assertIsNotNone(mpi_flags)
            expected_cmd = ('mpirun '
                            '--allow-run-as-root --tag-output '
                            '-np 2 -H localhost:2 '
                            '{binding_args} '
                            '{mpi_flags} '
                            'cmd').format(
                binding_args=' '.join(binding_args),
                mpi_flags=' '.join(mpi_flags))

            # remove PYTHONPATH from execute's env
            # we cannot know the exact value of that env variable
            # we test right handling of PYTHONPATH in test_mpi_run_*pythonpath* below
            # NOTE: execute.call_args.kwargs requires Python 3.8+.
            self.assertIn('env', execute.call_args.kwargs)
            if 'PYTHONPATH' in execute.call_args.kwargs['env']:
                execute.call_args.kwargs['env'].pop('PYTHONPATH')
            expected_env = {'PATH': os.environ.get('PATH')}
            execute.assert_called_once_with(expected_cmd,
                                            env=expected_env,
                                            stdout=None,
                                            stderr=None)
def test_mpi_run_minimal(self):
    """Run mpi_run with minimal settings and check the exact command given to run_func."""
    if _get_mpi_implementation_flags() is None:
        self.skipTest("MPI is not available")

    command = ['cmd']
    settings = self.minimal_settings
    exec_mock = MagicMock(return_value=0)
    mpi_run(settings, None, {}, command, run_func=exec_mock)

    # Rebuild the flags the real implementation would have used.
    flags = _get_mpi_implementation_flags()
    self.assertIsNotNone(flags)
    expected_cmd = ('mpirun '
                    '--allow-run-as-root --tag-output '
                    '-np 2 -H host '
                    '-bind-to none -map-by slot '
                    '{mpi_flags} '
                    'cmd').format(mpi_flags=' '.join(flags))
    exec_mock.assert_called_once_with(command=expected_cmd,
                                      env={},
                                      stdout=None,
                                      stderr=None)
def test_mpi_run_on_large_cluster(self):
    """Above the large-cluster threshold mpi_run must add the rsh no-tree-spawn/concurrency flags."""
    if not mpi_available():
        self.skipTest("MPI is not available")

    cmd = ['cmd']
    settings = copy.copy(self.minimal_settings)
    settings.num_hosts = large_cluster_threshold

    # Deterministic stand-in for the MPI implementation probe.
    def mpi_impl_flags(tcp, env=None):
        return ["--mock-mpi-impl-flags"], ["--mock-mpi-binding-args"]

    with mock.patch("horovod.run.mpi_run._get_mpi_implementation_flags",
                    side_effect=mpi_impl_flags):
        with mock.patch("horovod.run.mpi_run.safe_shell_exec.execute",
                        return_value=0) as execute:
            mpi_run(settings, None, {}, cmd)

            # call the mocked _get_mpi_implementation_flags method
            mpi_flags, binding_args = horovod.run.mpi_run._get_mpi_implementation_flags(
                False)
            self.assertIsNotNone(mpi_flags)
            # Large clusters disable tree spawn and cap concurrent rsh sessions.
            mpi_flags.append('-mca plm_rsh_no_tree_spawn true')
            mpi_flags.append('-mca plm_rsh_num_concurrent {}'.format(
                settings.num_hosts))
            expected_cmd = ('mpirun '
                            '--allow-run-as-root --tag-output '
                            '-np 2 -H host '
                            '{binding_args} '
                            '{mpi_flags} '
                            'cmd').format(
                binding_args=' '.join(binding_args),
                mpi_flags=' '.join(mpi_flags))
            expected_env = {'PATH': os.environ.get('PATH')}
            execute.assert_called_once_with(expected_cmd,
                                            env=expected_env,
                                            stdout=None,
                                            stderr=None)
def mpi_run_fn():
    """Launch the MPI job using settings, nics, env and command captured from the enclosing scope."""
    mpi_run(settings, nics, env, command)
def run(fn,
        args=(),
        kwargs={},  # NOTE(review): mutable default; not mutated here, but kwargs=None would be safer.
        num_proc=None,
        start_timeout=None,
        extra_mpi_args=None,
        env=None,
        stdout=None,
        stderr=None,
        verbose=1,
        nics=None,
        run_func=safe_shell_exec.execute):
    """
    Runs Horovod in Spark.  Runs `num_proc` processes executing `fn` using the same amount of Spark tasks.

    Args:
        fn: Function to run.
        args: Arguments to pass to `fn`.
        kwargs: Keyword arguments to pass to `fn`.
        num_proc: Number of Horovod processes.  Defaults to `spark.default.parallelism`.
        start_timeout: Timeout for Spark tasks to spawn, register and start running the code, in seconds.
                       If not set, falls back to `HOROVOD_SPARK_START_TIMEOUT` environment variable value.
                       If it is not set as well, defaults to 600 seconds.
        extra_mpi_args: Extra arguments for mpi_run. Defaults to no extra args.
        env: Environment dictionary to use in Horovod run.  Defaults to `os.environ`.
        stdout: Horovod stdout is redirected to this stream. Defaults to sys.stdout.
        stderr: Horovod stderr is redirected to this stream. Defaults to sys.stderr.
        verbose: Debug output verbosity (0-2). Defaults to 1.
        nics: List of NICs for tcp network communication.
        run_func: Run function to use. Must have arguments 'command', 'env', 'stdout', 'stderr'.
                  Defaults to safe_shell_exec.execute.

    Returns:
        List of results returned by running `fn` on each rank.
    """
    if start_timeout is None:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_SPARK_START_TIMEOUT', '600'))

    # nics needs to be a set
    if nics and not isinstance(nics, set):
        nics = set(nics)

    tmout = timeout.Timeout(
        start_timeout,
        message='Timed out waiting for {activity}. Please check that you have '
                'enough resources to run all Horovod processes. Each Horovod '
                'process runs in a Spark task. '
                'You may need to increase the '
                'start_timeout parameter to a larger value if your Spark resources '
                'are allocated on-demand.')
    settings = hvd_settings.Settings(verbose=verbose,
                                     extra_mpi_args=extra_mpi_args,
                                     key=secret.make_secret_key(),
                                     timeout=tmout,
                                     nics=nics,
                                     run_func_mode=True)

    spark_context = pyspark.SparkContext._active_spark_context
    if spark_context is None:
        raise Exception('Could not find an active SparkContext, are you '
                        'running in a PySpark session?')

    if num_proc is None:
        # One Horovod process per Spark task by default.
        num_proc = spark_context.defaultParallelism
        if settings.verbose >= 1:
            print(
                'Running %d processes (inferred from spark.default.parallelism)...'
                % num_proc)
    else:
        if settings.verbose >= 1:
            print('Running %d processes...' % num_proc)
    settings.num_proc = num_proc

    result_queue = queue.Queue(1)

    spark_job_group = 'horovod.spark.run.%d' % job_id.next_job_id()
    driver = driver_service.SparkDriverService(settings.num_proc, fn, args,
                                               kwargs, settings.key,
                                               settings.nics)
    spark_thread = _make_spark_thread(spark_context, spark_job_group, driver,
                                      result_queue, settings)
    try:
        driver.wait_for_initial_registration(settings.timeout)
        if settings.verbose >= 2:
            print('Initial Spark task registration is complete.')
        task_clients = [
            task_service.SparkTaskClient(
                index, driver.task_addresses_for_driver(index), settings.key,
                settings.verbose) for index in range(settings.num_proc)
        ]
        for task_client in task_clients:
            task_client.notify_initial_registration_complete()
        driver.wait_for_task_to_task_address_updates(settings.timeout)
        if settings.verbose >= 2:
            print('Spark task-to-task address registration is complete.')

        # Determine a set of common interfaces for task-to-task communication.
        common_intfs = set(driver.task_addresses_for_tasks(0).keys())
        for index in range(1, settings.num_proc):
            common_intfs.intersection_update(
                driver.task_addresses_for_tasks(index).keys())
        if not common_intfs:
            raise Exception(
                'Unable to find a set of common task-to-task communication interfaces: %s'
                % [(index, driver.task_addresses_for_tasks(index))
                   for index in range(settings.num_proc)])

        # Determine the index grouping based on host hashes.
        # Barrel shift until index 0 is in the first host.
        host_hashes = list(driver.task_host_hash_indices().keys())
        host_hashes.sort()
        while 0 not in driver.task_host_hash_indices()[host_hashes[0]]:
            host_hashes = host_hashes[1:] + host_hashes[:1]

        settings.hosts = ','.join(
            '%s:%d' %
            (host_hash, len(driver.task_host_hash_indices()[host_hash]))
            for host_hash in host_hashes)

        ranks_to_indices = []
        for host_hash in host_hashes:
            ranks_to_indices += driver.task_host_hash_indices()[host_hash]
        driver.set_ranks_to_indices(ranks_to_indices)

        if env is None:
            env = os.environ.copy()

        # Pass secret key through the environment variables.
        env[secret.HOROVOD_SECRET_KEY] = codec.dumps_base64(settings.key)

        # mpirun spawns remote processes through our Python rsh agent.
        rsh_agent = (sys.executable, '-m', 'horovod.spark.driver.mpirun_rsh',
                     codec.dumps_base64(driver.addresses()),
                     codec.dumps_base64(settings))
        settings.extra_mpi_args = (
            '{extra_mpi_args} -x NCCL_DEBUG=INFO -mca plm_rsh_agent "{rsh_agent}"'
            .format(extra_mpi_args=settings.extra_mpi_args
                    if settings.extra_mpi_args else '',
                    rsh_agent=' '.join(rsh_agent)))
        command = (sys.executable, '-m', 'horovod.spark.task.mpirun_exec_fn',
                   codec.dumps_base64(driver.addresses()),
                   codec.dumps_base64(settings))
        mpi_run(settings, common_intfs, env, command,
                stdout=stdout, stderr=stderr, run_func=run_func)
    except:
        # NOTE(review): bare except intentionally catches everything (incl.
        # KeyboardInterrupt) so the Spark job group is cancelled before re-raising.
        # Terminate Spark job.
        spark_context.cancelJobGroup(spark_job_group)

        # Re-raise exception.
        raise
    finally:
        spark_thread.join()
        driver.shutdown()

    # Make sure Spark Job did not fail.
    driver.check_for_spark_job_failure()

    # If there's no exception, execution results are in this queue.
    results = result_queue.get_nowait()
    return [results[index] for index in ranks_to_indices]
def run():
    """Entry point for horovodrun: parse args, validate hosts/ssh/interfaces, then launch via MPI or Gloo."""
    args = parse_args()

    if args.check_build:
        check_build(args.verbose)

    # if hosts are not specified, either parse from hostfile, or default as
    # localhost
    if not args.hosts:
        if args.hostfile:
            args.hosts = parse_host_files(args.hostfile)
        else:
            # Set hosts to localhost if not specified
            args.hosts = 'localhost:{np}'.format(np=args.np)

    host_list = args.hosts.split(',')
    all_host_names = []
    # Each host entry must look like "name:slots".
    pattern = re.compile(r'^[\w.-]+:\d+$')
    for host in host_list:
        if not pattern.match(host.strip()):
            raise ValueError('Invalid host input, please make sure it has '
                             'format as : worker-0:2,worker-1:2.')
        all_host_names.append(host.strip().split(':')[0])

    # horovodrun has to finish all the checks before this timeout runs out.
    if args.start_timeout:
        start_timeout = args.start_timeout
    else:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please '
                                    'check connectivity between servers. You '
                                    'may need to increase the --start-timeout '
                                    'parameter if you have too many servers.')
    settings = hvd_settings.Settings(verbose=2 if args.verbose else 0,
                                     ssh_port=args.ssh_port,
                                     key=secret.make_secret_key(),
                                     timeout=tmout,
                                     num_hosts=len(all_host_names),
                                     num_proc=args.np,
                                     hosts=args.hosts,
                                     command=args.command)

    # This cache stores the results of checks performed by horovodrun
    # during the initialization step. It can be disabled by setting
    # --disable-cache flag.
    fn_cache = None
    if not args.disable_cache:
        # Cache key is derived from the parameters that affect the checks.
        params = ''
        if args.np:
            params += str(args.np) + ' '
        if args.hosts:
            params += str(args.hosts) + ' '
        if args.ssh_port:
            params += str(args.ssh_port)
        parameters_hash = hashlib.md5(params.encode('utf-8')).hexdigest()
        fn_cache = cache.Cache(CACHE_FOLDER, CACHE_STALENESS_THRESHOLD_MINUTES,
                               parameters_hash)

    if settings.verbose >= 2:
        print('Filtering local host names.')
    remote_host_names = network.filter_local_addresses(all_host_names)
    if settings.verbose >= 2:
        print('Remote host found: ' + ' '.join(remote_host_names))

    if len(remote_host_names) > 0:
        if settings.verbose >= 2:
            print('Checking ssh on all remote hosts.')
        # Check if we can ssh into all remote hosts successfully.
        _check_all_hosts_ssh_successful(remote_host_names, args.ssh_port,
                                        fn_cache=fn_cache)
        if settings.verbose >= 2:
            print('SSH was successful into all the remote hosts.')

    if len(remote_host_names) > 0:
        if settings.verbose >= 2:
            print('Testing interfaces on all the hosts.')

        local_host_names = set(all_host_names) - set(remote_host_names)
        # Find the set of common, routed interfaces on all the hosts (remote
        # and local) and specify it in the args to be used by NCCL. It is
        # expected that the following function will find at least one interface
        # otherwise, it will raise an exception.
        common_intfs = _driver_fn(all_host_names, local_host_names,
                                  settings, fn_cache=fn_cache)

        if settings.verbose >= 2:
            print('Interfaces on all the hosts were successfully checked.')
            print('Common interface found: ' + ' '.join(common_intfs))
    else:
        if settings.verbose >= 2:
            print('All hosts are local, finding the interfaces '
                  'with address 127.0.0.1')
        # If all the given hosts are local, find the interfaces with address
        # 127.0.0.1
        common_intfs = set()
        for iface, addrs in net_if_addrs().items():
            for addr in addrs:
                if addr.family == AF_INET and addr.address == '127.0.0.1':
                    common_intfs.add(iface)
                    break
        if len(common_intfs) == 0:
            raise ValueError('No interface is found for address 127.0.0.1.')
        if settings.verbose >= 2:
            print('Local interface found ' + ' '.join(common_intfs))

    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)
    if args.use_gloo:
        if not gloo_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'Gloo support has not been built. If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
            )
        gloo_run(settings, remote_host_names, common_intfs, env)
    elif args.use_mpi:
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built. If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        mpi_run(settings, common_intfs, env)
    else:
        # No launcher requested: prefer MPI when built, else fall back to Gloo.
        if mpi_built(verbose=(settings.verbose >= 2)):
            mpi_run(settings, common_intfs, env)
        elif gloo_built(verbose=(settings.verbose >= 2)):
            gloo_run(settings, remote_host_names, common_intfs, env)
        else:
            raise ValueError(
                'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                'either MPI is installed (MPI) or CMake is installed (Gloo).')