def _launch_job(args, remote_host_names, settings, common_intfs, command):
    """Launch the distributed job with the controller chosen on the CLI.

    Honors an explicit --gloo / --mpi request (raising ValueError when the
    requested support was not compiled in); otherwise auto-detects, trying
    MPI first and falling back to Gloo.
    """
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)
    driver_ip = _get_driver_ip(common_intfs)

    verbose = settings.verbose >= 2
    if args.use_gloo:
        if not gloo_built(verbose=verbose):
            raise ValueError(
                'Gloo support has not been built. If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
            )
        gloo_run(settings, remote_host_names, common_intfs, env, driver_ip,
                 command)
    elif args.use_mpi:
        if not mpi_built(verbose=verbose):
            raise ValueError(
                'MPI support has not been built. If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        mpi_run(settings, common_intfs, env, command)
    else:
        # No controller requested explicitly: prefer MPI, fall back to Gloo.
        if mpi_built(verbose=verbose):
            mpi_run(settings, common_intfs, env, command)
        elif gloo_built(verbose=verbose):
            gloo_run(settings, remote_host_names, common_intfs, env,
                     driver_ip, command)
        else:
            raise ValueError(
                'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                'either MPI is installed (MPI) or CMake is installed (Gloo).')
def __call__(self, parser, args, values, option_string=None):
    """argparse Action hook: print the Horovod build summary and exit.

    Prints which frameworks, controllers, and tensor operations were
    compiled into this installation, then terminates the process via
    os._exit(0) so argparse does not continue processing.
    """
    # NOTE: trailing backslashes keep textwrap.dedent() from seeing extra
    # blank lines at the start/end of the template.
    output = '''\
    Horovod v{version}:

    Available Frameworks:
        [{tensorflow}] TensorFlow
        [{torch}] PyTorch
        [{mxnet}] MXNet

    Available Controllers:
        [{mpi}] MPI
        [{gloo}] Gloo

    Available Tensor Operations:
        [{nccl_ops}] NCCL
        [{ddl_ops}] DDL
        [{mlsl_ops}] MLSL
        [{mpi_ops}] MPI
        [{gloo_ops}] Gloo\
    '''.format(
        version=horovod.__version__,
        tensorflow=CheckBuildAction.get_check(
            extension_available('tensorflow')),
        torch=CheckBuildAction.get_check(extension_available('torch')),
        mxnet=CheckBuildAction.get_check(extension_available('mxnet')),
        mpi=CheckBuildAction.get_check(mpi_built()),
        gloo=CheckBuildAction.get_check(gloo_built()),
        nccl_ops=CheckBuildAction.get_check(nccl_built()),
        ddl_ops=CheckBuildAction.get_check(ddl_built()),
        mpi_ops=CheckBuildAction.get_check(mpi_built()),
        mlsl_ops=CheckBuildAction.get_check(mlsl_built()),
        gloo_ops=CheckBuildAction.get_check(gloo_built()))
    print(textwrap.dedent(output))
    # Hard-exit so no further argparse actions or cleanup handlers run.
    os._exit(0)
def run_controller(use_gloo, gloo_run, use_mpi, mpi_run, use_jsrun, js_run,
                   verbosity):
    """Dispatch to the requested launcher callable, or auto-detect one.

    Explicit gloo/mpi/jsrun requests are validated against the compiled-in
    support and raise ValueError when unavailable. With no explicit request,
    MPI is preferred (via jsrun inside an LSF allocation), then Gloo.
    """
    # keep logic in sync with is_gloo_used(...)
    verbose = verbosity is not None and verbosity >= 2

    def require_gloo():
        # Guard: the Gloo controller must have been compiled in.
        if not gloo_built(verbose=verbose):
            raise ValueError('Gloo support has not been built. If this is not expected, ensure CMake is installed '
                             'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.')

    def require_mpi():
        # Guard: the MPI controller must have been compiled in.
        if not mpi_built(verbose=verbose):
            raise ValueError('MPI support has not been built. If this is not expected, ensure MPI is installed '
                             'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.')

    if use_gloo:
        require_gloo()
        gloo_run()
    elif use_mpi:
        require_mpi()
        mpi_run()
    elif use_jsrun:
        require_mpi()
        if not lsf.LSFUtils.using_lsf():
            raise ValueError(
                'Horovod did not detect an LSF job. The jsrun launcher can only be used in that environment. '
                'Please, pick a different launcher for other environments.')
        js_run()
    elif mpi_built(verbose=verbose):
        # Auto-detect: inside an LSF job with jsrun available, use it.
        if lsf.LSFUtils.using_lsf() and is_jsrun_installed():
            js_run()
        else:
            mpi_run()
    elif gloo_built(verbose=verbose):
        gloo_run()
    else:
        raise ValueError('Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                         'either MPI is installed (MPI) or CMake is installed (Gloo).')
def test_failed_run(self):
    """A worker raising should abort the whole job well before fn's sleep."""
    def fn():
        hvd.init()
        if hvd.rank() == 1:
            raise RuntimeError()
        # The surviving worker sleeps; the controller must kill it early.
        time.sleep(120)

    assert gloo_built() or mpi_built()

    started_at = time.time()

    if gloo_built():
        with pytest.raises(
            RuntimeError,
            match='Horovod detected that one or more processes exited'
        ):
            run(fn, np=2, use_gloo=True)

    if mpi_built():
        with pytest.raises(RuntimeError, match='mpirun failed'):
            run(fn, np=2, use_mpi=True)

    # Workers must be torn down long before the 2-minute sleep completes.
    assert time.time() - started_at < 60
def test_happy_run(self):
    """run() should collect per-rank return values under both controllers."""
    def fn(a, b, c, d):
        hvd.init()
        rank = hvd.rank()
        total = a + b + c + d
        gathered = hvd.allgather(torch.tensor([rank, total])).tolist()
        if rank == 0:
            return gathered
        if rank == 1:
            return "ret_val_of_rank_1"
        return None

    assert gloo_built() or mpi_built()

    for use_gloo, use_mpi in [(True, False), (False, True)]:
        # Skip controllers that were not compiled into this build.
        if use_mpi and not mpi_built():
            continue
        if use_gloo and not gloo_built():
            continue

        single = run(fn, (1, 20), {"c": 300, "d": 4000},
                     np=1, use_gloo=use_gloo, use_mpi=use_mpi)
        self.assertListEqual([[0, 4321]], single)

        triple = run(fn, (1, 20), {"c": 300, "d": 4000},
                     np=3, use_gloo=use_gloo, use_mpi=use_mpi)
        self.assertListEqual(
            [[0, 4321, 1, 4321, 2, 4321], "ret_val_of_rank_1", None],
            triple)
def test_failed_run(self):
    """An exception on one rank must surface as a RuntimeError from run()."""
    def fn():
        hvd.init()
        if hvd.rank() == 1:
            raise RuntimeError()

    assert gloo_built() or mpi_built()

    if gloo_built():
        with pytest.raises(
                RuntimeError,
                match='Horovod detected that one or more processes exited'):
            run(fn, np=2, use_gloo=True)

    if mpi_built():
        with pytest.raises(RuntimeError, match='mpirun failed'):
            run(fn, np=2, use_mpi=True)
def test_run_failure(self, controller, mode, run):
    """Skip when the requested controller is unavailable, then exercise the
    controller-failure scenario."""
    if controller == 'gloo':
        if not gloo_built():
            self.skipTest("Gloo is not available")
    elif controller == 'mpi':
        if not (mpi_built() and mpi_available()):
            self.skipTest("MPI is not available")
    self.do_test_run_with_controller_failure(controller, mode, run)
def test_run_success(self, controller, mode, run):
    """Skip unavailable or untestable controllers, then exercise the
    controller-success scenario."""
    if controller == 'gloo':
        if not gloo_built():
            self.skipTest("Gloo is not available")
    elif controller == 'mpi':
        if not (mpi_built() and mpi_available()):
            self.skipTest("MPI is not available")
        if is_mpich():
            self.skipTest("MPICH is not testable")
    self.do_test_run_with_controller_success(controller, mode, run)
def _launch_job(args, remote_host_names, settings, nics, command):
    """Launch the distributed job with the launcher chosen on the CLI.

    Honors explicit --gloo / --mpi / --jsrun requests (raising ValueError
    when the requested support is missing or, for jsrun, when no LSF job is
    detected); otherwise auto-detects: jsrun inside an LSF allocation, then
    plain MPI, then Gloo.
    """
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)
    verbose = settings.verbose >= 2

    def fail_missing_gloo():
        raise ValueError(
            'Gloo support has not been built. If this is not expected, ensure CMake is installed '
            'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
        )

    def fail_missing_mpi():
        raise ValueError(
            'MPI support has not been built. If this is not expected, ensure MPI is installed '
            'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
        )

    if args.use_gloo:
        if not gloo_built(verbose=verbose):
            fail_missing_gloo()
        gloo_run(settings, remote_host_names, nics, env,
                 network._get_driver_ip(nics), command)
    elif args.use_mpi:
        if not mpi_built(verbose=verbose):
            fail_missing_mpi()
        mpi_run(settings, nics, env, command)
    elif args.use_jsrun:
        if not mpi_built(verbose=verbose):
            fail_missing_mpi()
        if not lsf.LSFUtils.using_lsf():
            raise ValueError(
                'Horovod did not detect an LSF job. The jsrun launcher can only be used in that environment. '
                'Please, pick a different launcher for other environments.')
        js_run(settings, nics, env, command)
    elif mpi_built(verbose=verbose):
        # Auto-detect: prefer jsrun when running inside an LSF allocation.
        if lsf.LSFUtils.using_lsf() and is_jsrun_installed():
            js_run(settings, nics, env, command)
        else:
            mpi_run(settings, nics, env, command)
    elif gloo_built(verbose=verbose):
        gloo_run(settings, remote_host_names, nics, env,
                 network._get_driver_ip(nics), command)
    else:
        raise ValueError(
            'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
            'either MPI is installed (MPI) or CMake is installed (Gloo).')
def test_run_failure(self, controller, mode, run):
    """Skip unavailable or untestable MPI stacks, then exercise the
    controller-failure scenario."""
    if controller == 'gloo':
        if not gloo_built():
            self.skipTest("Gloo is not available")
    elif controller == 'mpi':
        if not (mpi_built() and mpi_available()):
            self.skipTest("MPI is not available")
        if is_mpich():
            self.skipTest("MPICH is not testable")
        if is_intel_mpi():
            self.skipTest(
                "Intel(R) MPI is not testable because it is based on MPICH"
            )
    self.do_test_run_with_controller_failure(controller, mode, run)
def check_build(verbose):
    """Print a summary of the features compiled into this Horovod build.

    Args:
        verbose: when truthy, the *_built()/extension_available() probes are
            asked to print diagnostic details, and a leading newline separates
            that output from the summary table.

    Terminates the process via os._exit(0) after printing.
    """
    def get_check(value):
        # Render a checkbox cell: 'X' when the feature is available.
        return 'X' if value else ' '

    # NOTE: trailing backslashes keep textwrap.dedent() from seeing extra
    # blank lines at the start/end of the template.
    output = '''{verbose_newline}\
    Horovod v{version}:

    Available Frameworks:
        [{tensorflow}] TensorFlow
        [{torch}] PyTorch
        [{mxnet}] MXNet

    Available Controllers:
        [{mpi}] MPI
        [{gloo}] Gloo

    Available Tensor Operations:
        [{nccl_ops}] NCCL
        [{ddl_ops}] DDL
        [{ccl_ops}] CCL
        [{mpi_ops}] MPI
        [{gloo_ops}] Gloo\
    '''.format(verbose_newline='\n' if verbose else '',
               version=horovod.__version__,
               tensorflow=get_check(
                   extension_available('tensorflow', verbose=verbose)),
               torch=get_check(extension_available('torch', verbose=verbose)),
               mxnet=get_check(extension_available('mxnet', verbose=verbose)),
               mpi=get_check(mpi_built(verbose=verbose)),
               gloo=get_check(gloo_built(verbose=verbose)),
               nccl_ops=get_check(nccl_built(verbose=verbose)),
               ddl_ops=get_check(ddl_built(verbose=verbose)),
               mpi_ops=get_check(mpi_built(verbose=verbose)),
               ccl_ops=get_check(ccl_built(verbose=verbose)),
               gloo_ops=get_check(gloo_built(verbose=verbose)))
    print(textwrap.dedent(output))
    # Hard-exit: horovodrun --check-build must not continue to launch a job.
    os._exit(0)
def is_gloo_used(use_gloo=None, use_mpi=None, use_jsrun=None):
    """Return whether run_controller would dispatch to the Gloo launcher.

    Mirrors the branch order of run_controller(use_gloo, _, use_mpi, _,
    use_jsrun, _, _): an explicit Gloo request wins; an explicit MPI or
    jsrun request rules Gloo out; otherwise Gloo is used only when MPI
    support was not built.
    """
    if use_gloo:
        return use_gloo
    if use_mpi or use_jsrun:
        return False
    return not mpi_built()
def run():
    """Entry point for horovodrun.

    Parses CLI args, validates the host list, verifies SSH connectivity and
    common network interfaces across hosts, then launches the job via MPI or
    Gloo (honoring an explicit --gloo/--mpi request).
    """
    args = parse_args()

    if args.check_build:
        # Prints the build summary and hard-exits the process.
        check_build(args.verbose)

    # if hosts are not specified, either parse from hostfile, or default as
    # localhost
    if not args.hosts:
        if args.hostfile:
            args.hosts = parse_host_files(args.hostfile)
        else:
            # Set hosts to localhost if not specified
            args.hosts = 'localhost:{np}'.format(np=args.np)

    # Every entry must look like '<hostname>:<slots>'.
    host_list = args.hosts.split(',')
    all_host_names = []
    pattern = re.compile(r'^[\w.-]+:\d+$')
    for host in host_list:
        if not pattern.match(host.strip()):
            raise ValueError('Invalid host input, please make sure it has '
                             'format as : worker-0:2,worker-1:2.')
        all_host_names.append(host.strip().split(':')[0])

    # horovodrun has to finish all the checks before this timeout runs out.
    if args.start_timeout:
        start_timeout = args.start_timeout
    else:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please '
                                    'check connectivity between servers. You '
                                    'may need to increase the --start-timeout '
                                    'parameter if you have too many servers.')

    settings = hvd_settings.Settings(verbose=2 if args.verbose else 0,
                                     ssh_port=args.ssh_port,
                                     key=secret.make_secret_key(),
                                     timeout=tmout,
                                     num_hosts=len(all_host_names),
                                     num_proc=args.np,
                                     hosts=args.hosts,
                                     command=args.command)

    # This cache stores the results of checks performed by horovodrun
    # during the initialization step. It can be disabled by setting
    # --disable-cache flag.
    fn_cache = None
    if not args.disable_cache:
        # The cache key is derived from the parameters that influence the
        # checks, so changing any of them invalidates cached results.
        params = ''
        if args.np:
            params += str(args.np) + ' '
        if args.hosts:
            params += str(args.hosts) + ' '
        if args.ssh_port:
            params += str(args.ssh_port)
        parameters_hash = hashlib.md5(params.encode('utf-8')).hexdigest()
        fn_cache = cache.Cache(CACHE_FOLDER,
                               CACHE_STALENESS_THRESHOLD_MINUTES,
                               parameters_hash)

    if settings.verbose >= 2:
        print('Filtering local host names.')
    remote_host_names = network.filter_local_addresses(all_host_names)
    if settings.verbose >= 2:
        print('Remote host found: ' + ' '.join(remote_host_names))

    if len(remote_host_names) > 0:
        if settings.verbose >= 2:
            print('Checking ssh on all remote hosts.')
        # Check if we can ssh into all remote hosts successfully.
        _check_all_hosts_ssh_successful(remote_host_names, args.ssh_port,
                                        fn_cache=fn_cache)
        if settings.verbose >= 2:
            print('SSH was successful into all the remote hosts.')

    if len(remote_host_names) > 0:
        if settings.verbose >= 2:
            print('Testing interfaces on all the hosts.')

        local_host_names = set(all_host_names) - set(remote_host_names)
        # Find the set of common, routed interfaces on all the hosts (remote
        # and local) and specify it in the args to be used by NCCL. It is
        # expected that the following function will find at least one interface
        # otherwise, it will raise an exception.
        common_intfs = _driver_fn(all_host_names, local_host_names,
                                  settings, fn_cache=fn_cache)

        if settings.verbose >= 2:
            print('Interfaces on all the hosts were successfully checked.')
            print('Common interface found: ' + ' '.join(common_intfs))
    else:
        if settings.verbose >= 2:
            print('All hosts are local, finding the interfaces '
                  'with address 127.0.0.1')
        # If all the given hosts are local, find the interfaces with address
        # 127.0.0.1
        common_intfs = set()
        for iface, addrs in net_if_addrs().items():
            for addr in addrs:
                if addr.family == AF_INET and addr.address == '127.0.0.1':
                    common_intfs.add(iface)
                    break

        if len(common_intfs) == 0:
            raise ValueError('No interface is found for address 127.0.0.1.')

        if settings.verbose >= 2:
            print('Local interface found ' + ' '.join(common_intfs))

    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)

    if args.use_gloo:
        if not gloo_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'Gloo support has not been built. If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
            )
        gloo_run(settings, remote_host_names, common_intfs, env)
    elif args.use_mpi:
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built. If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        mpi_run(settings, common_intfs, env)
    else:
        # No launcher requested explicitly: prefer MPI, fall back to Gloo.
        if mpi_built(verbose=(settings.verbose >= 2)):
            mpi_run(settings, common_intfs, env)
        elif gloo_built(verbose=(settings.verbose >= 2)):
            gloo_run(settings, remote_host_names, common_intfs, env)
        else:
            raise ValueError(
                'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                'either MPI is installed (MPI) or CMake is installed (Gloo).')
def test_mpi_built(self):
    """Test that MPI has been built if env is set."""
    # HOROVOD_RANK is only present when running under the Gloo controller;
    # in that case this check does not apply.
    gloo_rank = int(os.getenv('HOROVOD_RANK', -1))
    if gloo_rank != -1:
        return
    self.assertTrue(mpi_built())