def __call__(self, parser, args, values, option_string=None):
    output = '''\
    Horovod v{version}:

    Available Frameworks:
        [{tensorflow}] TensorFlow
        [{torch}] PyTorch
        [{mxnet}] MXNet

    Available Controllers:
        [{mpi}] MPI
        [{gloo}] Gloo

    Available Tensor Operations:
        [{nccl_ops}] NCCL
        [{ddl_ops}] DDL
        [{mlsl_ops}] MLSL
        [{mpi_ops}] MPI
        [{gloo_ops}] Gloo\
    '''.format(
        version=horovod.__version__,
        tensorflow=CheckBuildAction.get_check(
            extension_available('tensorflow')),
        torch=CheckBuildAction.get_check(extension_available('torch')),
        mxnet=CheckBuildAction.get_check(extension_available('mxnet')),
        mpi=CheckBuildAction.get_check(mpi_built()),
        gloo=CheckBuildAction.get_check(gloo_built()),
        nccl_ops=CheckBuildAction.get_check(nccl_built()),
        ddl_ops=CheckBuildAction.get_check(ddl_built()),
        mpi_ops=CheckBuildAction.get_check(mpi_built()),
        mlsl_ops=CheckBuildAction.get_check(mlsl_built()),
        gloo_ops=CheckBuildAction.get_check(gloo_built()))
    print(textwrap.dedent(output))
    os._exit(0)
def run_controller(use_gloo, gloo_run, use_mpi, mpi_run, use_jsrun, js_run,
                   verbosity):
    # keep logic in sync with is_gloo_used(...)
    verbose = verbosity is not None and verbosity >= 2
    if use_gloo:
        if not gloo_built(verbose=verbose):
            raise ValueError('Gloo support has not been built. If this is not expected, ensure CMake is installed '
                             'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.')
        gloo_run()
    elif use_mpi:
        if not mpi_built(verbose=verbose):
            raise ValueError('MPI support has not been built. If this is not expected, ensure MPI is installed '
                             'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.')
        mpi_run()
    elif use_jsrun:
        if not mpi_built(verbose=verbose):
            raise ValueError('MPI support has not been built. If this is not expected, ensure MPI is installed '
                             'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.')
        if not lsf.LSFUtils.using_lsf():
            raise ValueError(
                'Horovod did not detect an LSF job. The jsrun launcher can only be used in that environment. '
                'Please, pick a different launcher for other environments.')
        js_run()
    else:
        if mpi_built(verbose=verbose):
            if lsf.LSFUtils.using_lsf() and is_jsrun_installed():
                js_run()
            else:
                mpi_run()
        elif gloo_built(verbose=verbose):
            gloo_run()
        else:
            raise ValueError('Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                             'either MPI is installed (MPI) or CMake is installed (Gloo).')
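# A hedged usage sketch for run_controller (illustrative wiring only; the
# real call sites pass zero-argument closures over their own settings, nics,
# env and command objects, as in the _launch_job variants below):
#
#   run_controller(use_gloo=args.use_gloo,
#                  gloo_run=lambda: gloo_run(settings, nics, env, driver_ip, command),
#                  use_mpi=args.use_mpi,
#                  mpi_run=lambda: mpi_run(settings, nics, env, command),
#                  use_jsrun=args.use_jsrun,
#                  js_run=lambda: js_run(settings, nics, env, command),
#                  verbosity=settings.verbose)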
def test_failed_run(self):
    def fn():
        hvd.init()
        rank = hvd.rank()
        if rank == 1:
            raise RuntimeError()
        # The other worker waits a while before exiting.
        time.sleep(120)

    assert gloo_built() or mpi_built()

    start = time.time()

    if gloo_built():
        with pytest.raises(
                RuntimeError,
                match='Horovod detected that one or more processes exited'):
            run(fn, np=2, use_gloo=True)

    if mpi_built():
        with pytest.raises(RuntimeError, match='mpirun failed'):
            run(fn, np=2, use_mpi=True)

    # The controller should be terminating workers way before the 2-minute delay.
    assert time.time() - start < 60
def _launch_job(args, remote_host_names, settings, common_intfs, command):
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)
    driver_ip = _get_driver_ip(common_intfs)

    if args.use_gloo:
        if not gloo_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'Gloo support has not been built. If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.')
        gloo_run(settings, remote_host_names, common_intfs, env, driver_ip,
                 command)
    elif args.use_mpi:
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built. If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.')
        mpi_run(settings, common_intfs, env, command)
    else:
        if mpi_built(verbose=(settings.verbose >= 2)):
            mpi_run(settings, common_intfs, env, command)
        elif gloo_built(verbose=(settings.verbose >= 2)):
            gloo_run(settings, remote_host_names, common_intfs, env,
                     driver_ip, command)
        else:
            raise ValueError(
                'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                'either MPI is installed (MPI) or CMake is installed (Gloo).')
def test_happy_run(self):
    def fn(a, b, c, d):
        hvd.init()
        rank = hvd.rank()
        v = a + b + c + d
        res = hvd.allgather(torch.tensor([rank, v])).tolist()
        if rank == 0:
            return res
        elif rank == 1:
            return "ret_val_of_rank_1"
        else:
            return None

    assert gloo_built() or mpi_built()

    for use_gloo, use_mpi in [(True, False), (False, True)]:
        if use_mpi and not mpi_built():
            continue
        if use_gloo and not gloo_built():
            continue

        res1 = run(fn, (1, 20), {"c": 300, "d": 4000},
                   np=1, use_gloo=use_gloo, use_mpi=use_mpi)
        self.assertListEqual([[0, 4321]], res1)
        res2 = run(fn, (1, 20), {"c": 300, "d": 4000},
                   np=3, use_gloo=use_gloo, use_mpi=use_mpi)
        self.assertListEqual([[0, 4321, 1, 4321, 2, 4321],
                              "ret_val_of_rank_1", None], res2)
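# Note on the expected values above: v = a + b + c + d = 1 + 20 + 300 + 4000
# = 4321, and hvd.allgather concatenates each rank's [rank, v] tensor in rank
# order, so with np=3 every rank sees [0, 4321, 1, 4321, 2, 4321].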
def test_happy_run_elastic_fault_tolerant_fails(self):
    self.skipTest('elastic horovod does not support shutdown from the spark driver '
                  'while elastic driver is waiting for hosts to come up')

    if not gloo_built():
        self.skipTest("Gloo is not available")

    with spark_session('test_happy_run_elastic_fault_tolerant_fails', max_failures=2):
        with tempdir() as dir:
            # these files make the training function fail at the given rank, epoch and batch
            # we have as many failures as Spark has max_failures (per task / index)
            with open(os.path.sep.join([dir, 'rank_1_epoch_2_batch_4_fail']), 'w'), \
                    open(os.path.sep.join([dir, 'rank_1_epoch_3_batch_1_fail']), 'w'):
                pass
            res = horovod.spark.run_elastic(fn, args=(2, 5, 5, dir),
                                            env={'HOROVOD_LOG_LEVEL': 'DEBUG'},
                                            num_proc=2, min_num_proc=2, max_num_proc=2,
                                            start_timeout=5, verbose=2)
            self.assertListEqual([([0, 4, 0, 4, 1, 4, 0, 4], 0),
                                  ([0, 4, 0, 4, 1, 4, 0, 4], 1)], res)
def test_happy_run_elastic_fault_tolerant(self):
    if skip_lightning_tests:
        self.skipTest('Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                      'https://github.com/horovod/horovod/pull/3263')
    if not gloo_built():
        self.skipTest("Gloo is not available")

    with spark_session('test_happy_run_elastic_fault_tolerant', max_failures=3):
        with tempdir() as dir:
            # these files make the training function fail at the given rank, epoch and batch
            with open(os.path.sep.join([dir, 'rank_1_epoch_2_batch_4_fail']), 'w'), \
                    open(os.path.sep.join([dir, 'rank_0_epoch_3_batch_1_fail']), 'w'), \
                    open(os.path.sep.join([dir, 'rank_1_epoch_4_batch_2_fail']), 'w'):
                pass
            res = horovod.spark.run_elastic(fn, args=(2, 5, 5, dir),
                                            env={'HOROVOD_LOG_LEVEL': 'DEBUG'},
                                            num_proc=2, min_num_proc=2, max_num_proc=2,
                                            start_timeout=5, verbose=2)
            self.assertListEqual([([0, 4, 0, 4, 1, 4, 0, 4], 0),
                                  ([0, 4, 0, 4, 1, 4, 0, 4], 1)], res)
def test_failed_run(self):
    def fn():
        hvd.init()
        rank = hvd.rank()
        if rank == 1:
            raise RuntimeError()

    assert gloo_built() or mpi_built()

    if gloo_built():
        with pytest.raises(RuntimeError,
                           match='Horovod detected that one or more processes exited'):
            run(fn, np=2, use_gloo=True)

    if mpi_built():
        with pytest.raises(RuntimeError, match='mpirun failed'):
            run(fn, np=2, use_mpi=True)
def _run_elastic(args):
    # construct host discovery component
    if args.host_discovery_script:
        discover_hosts = discovery.HostDiscoveryScript(
            args.host_discovery_script, args.slots)
    elif args.hosts:
        _, available_host_slots = hosts.parse_hosts_and_slots(args.hosts)
        if len(available_host_slots) < 2:
            raise ValueError(
                'Cannot run in fault tolerance mode with fewer than 2 hosts.')
        discover_hosts = discovery.FixedHosts(available_host_slots)
    else:
        raise ValueError(
            'One of --host-discovery-script, --hosts, or --hostnames must be provided')

    # horovodrun has to finish all the checks before this timeout runs out.
    if args.start_timeout:
        start_timeout = args.start_timeout
    else:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please '
                                    'check connectivity between servers. You '
                                    'may need to increase the --start-timeout '
                                    'parameter if you have too many servers.')
    settings = elastic_settings.ElasticSettings(
        discovery=discover_hosts,
        min_num_proc=args.min_num_proc or args.num_proc,
        max_num_proc=args.max_num_proc,
        elastic_timeout=args.elastic_timeout,
        reset_limit=args.reset_limit,
        cooldown_range=args.cooldown_range,
        num_proc=args.num_proc,
        verbose=2 if args.verbose else 0,
        ssh_port=args.ssh_port,
        ssh_identity_file=args.ssh_identity_file,
        extra_mpi_args=args.mpi_args,
        key=secret.make_secret_key(),
        start_timeout=tmout,
        output_filename=args.output_filename,
        run_func_mode=args.run_func is not None,
        nics=args.nics,
        prefix_output_with_timestamp=args.prefix_output_with_timestamp)

    if not gloo_built(verbose=(settings.verbose >= 2)):
        raise ValueError(
            'Gloo support is required to use elastic training, but has not been built. Ensure CMake is '
            'installed and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.')

    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)
    executable = args.executable or sys.executable
    return gloo_run_elastic(settings, env,
                            args.run_func if args.run_func else args.command,
                            executable)
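# For reference, a minimal host discovery script as consumed via
# --host-discovery-script (hedged sketch: per Horovod's elastic docs the
# script prints one "<hostname>:<slots>" line per available host, with slots
# falling back to args.slots when omitted; the hostnames are placeholders):
#
#   #!/usr/bin/env python
#   print('host-1:4')
#   print('host-2:4')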
def test_run_failure(self, controller, mode, run):
    if controller == 'gloo' and not gloo_built():
        self.skipTest("Gloo is not available")
    if controller == 'mpi':
        if not (mpi_built() and mpi_available()):
            self.skipTest("MPI is not available")

    self.do_test_run_with_controller_failure(controller, mode, run)
def _launch_job(args, remote_host_names, settings, nics, command):
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)

    if args.use_gloo:
        if not gloo_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'Gloo support has not been built. If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.')
        gloo_run(settings, remote_host_names, nics, env,
                 network._get_driver_ip(nics), command)
    elif args.use_mpi:
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built. If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.')
        mpi_run(settings, nics, env, command)
    elif args.use_jsrun:
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built. If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.')
        if not lsf.LSFUtils.using_lsf():
            raise ValueError(
                'Horovod did not detect an LSF job. The jsrun launcher can only be used in that environment. '
                'Please, pick a different launcher for other environments.')
        js_run(settings, nics, env, command)
    else:
        if mpi_built(verbose=(settings.verbose >= 2)):
            if lsf.LSFUtils.using_lsf() and is_jsrun_installed():
                js_run(settings, nics, env, command)
            else:
                mpi_run(settings, nics, env, command)
        elif gloo_built(verbose=(settings.verbose >= 2)):
            gloo_run(settings, remote_host_names, nics, env,
                     network._get_driver_ip(nics), command)
        else:
            raise ValueError(
                'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                'either MPI is installed (MPI) or CMake is installed (Gloo).')
def test_happy_run_elastic(self):
    if not gloo_built():
        self.skipTest("Gloo is not available")

    with spark_session('test_happy_run_elastic'):
        res = horovod.spark.run_elastic(fn, args=(2, 5, 4),
                                        num_proc=2, min_np=2, max_np=2,
                                        start_timeout=10, verbose=2)
        self.assertListEqual([([0, 3, 0, 1, 1, 3, 0, 1], 0),
                              ([0, 3, 0, 1, 1, 3, 0, 1], 1)], res)
def test_run_success(self, controller, mode, run):
    if controller == 'gloo' and not gloo_built():
        self.skipTest("Gloo is not available")
    if controller == 'mpi':
        if not (mpi_built() and mpi_available()):
            self.skipTest("MPI is not available")
        if is_mpich():
            self.skipTest("MPICH is not testable")

    self.do_test_run_with_controller_success(controller, mode, run)
def test_run_failure(self, controller, mode, run):
    if controller == 'gloo' and not gloo_built():
        self.skipTest("Gloo is not available")
    if controller == 'mpi':
        if not (mpi_built() and mpi_available()):
            self.skipTest("MPI is not available")
        if is_mpich():
            self.skipTest("MPICH is not testable")
        if is_intel_mpi():
            self.skipTest("Intel(R) MPI is not testable because it is based on MPICH")

    self.do_test_run_with_controller_failure(controller, mode, run)
def test_happy_run_elastic(self):
    if skip_lightning_tests:
        self.skipTest('Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                      'https://github.com/horovod/horovod/pull/3263')
    if not gloo_built():
        self.skipTest("Gloo is not available")

    with spark_session('test_happy_run_elastic'):
        res = horovod.spark.run_elastic(fn, args=(2, 5, 4),
                                        num_proc=2, min_np=2, max_np=2,
                                        start_timeout=10, verbose=2)
        self.assertListEqual([([0, 3, 0, 1, 1, 3, 0, 1], 0),
                              ([0, 3, 0, 1, 1, 3, 0, 1], 1)], res)
def check_build(verbose):
    def get_check(value):
        return 'X' if value else ' '

    output = '''{verbose_newline}\
    Horovod v{version}:

    Available Frameworks:
        [{tensorflow}] TensorFlow
        [{torch}] PyTorch
        [{mxnet}] MXNet

    Available Controllers:
        [{mpi}] MPI
        [{gloo}] Gloo

    Available Tensor Operations:
        [{nccl_ops}] NCCL
        [{ddl_ops}] DDL
        [{ccl_ops}] CCL
        [{mpi_ops}] MPI
        [{gloo_ops}] Gloo\
    '''.format(verbose_newline='\n' if verbose else '',
               version=horovod.__version__,
               tensorflow=get_check(
                   extension_available('tensorflow', verbose=verbose)),
               torch=get_check(extension_available('torch', verbose=verbose)),
               mxnet=get_check(extension_available('mxnet', verbose=verbose)),
               mpi=get_check(mpi_built(verbose=verbose)),
               gloo=get_check(gloo_built(verbose=verbose)),
               nccl_ops=get_check(nccl_built(verbose=verbose)),
               ddl_ops=get_check(ddl_built(verbose=verbose)),
               mpi_ops=get_check(mpi_built(verbose=verbose)),
               ccl_ops=get_check(ccl_built(verbose=verbose)),
               gloo_ops=get_check(gloo_built(verbose=verbose)))
    print(textwrap.dedent(output))
    os._exit(0)
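# check_build backs the --check-build CLI flag, e.g.:
#
#   $ horovodrun --check-build
#
# which prints the matrix above, with 'X' marking each framework, controller
# and tensor operation compiled into the installation.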
def test_happy_run_elastic(self):
    if not gloo_built():
        self.skipTest("Gloo is not available")

    args = _HorovodArgs()

    # we need two different hosts here, otherwise we would need to give args.nics
    args.hosts = 'localhost:2,127.0.0.1:2'
    args.command = [sys.executable, '-V']
    args.num_proc = 2
    args.min_num_proc = 2
    args.verbose = True

    # no assertions, we are happy when there are no exceptions
    # TODO: call into run() when elastic supports args.run_func (#1873)
    #       we can assert the returned result then
    _run(args)
def run():
    args = parse_args()

    if args.check_build:
        check_build(args.verbose)

    # if hosts are not specified, either parse from hostfile, or default to
    # localhost
    if not args.hosts:
        if args.hostfile:
            args.hosts = parse_host_files(args.hostfile)
        else:
            # Set hosts to localhost if not specified
            args.hosts = 'localhost:{np}'.format(np=args.np)

    host_list = args.hosts.split(',')
    all_host_names = []
    pattern = re.compile(r'^[\w.-]+:\d+$')
    for host in host_list:
        if not pattern.match(host.strip()):
            raise ValueError('Invalid host input, please make sure it has '
                             'the format: worker-0:2,worker-1:2.')
        all_host_names.append(host.strip().split(':')[0])

    # horovodrun has to finish all the checks before this timeout runs out.
    if args.start_timeout:
        start_timeout = args.start_timeout
    else:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please '
                                    'check connectivity between servers. You '
                                    'may need to increase the --start-timeout '
                                    'parameter if you have too many servers.')
    settings = hvd_settings.Settings(verbose=2 if args.verbose else 0,
                                     ssh_port=args.ssh_port,
                                     key=secret.make_secret_key(),
                                     timeout=tmout,
                                     num_hosts=len(all_host_names),
                                     num_proc=args.np,
                                     hosts=args.hosts,
                                     command=args.command)

    # This cache stores the results of checks performed by horovodrun
    # during the initialization step. It can be disabled by setting the
    # --disable-cache flag.
    fn_cache = None
    if not args.disable_cache:
        params = ''
        if args.np:
            params += str(args.np) + ' '
        if args.hosts:
            params += str(args.hosts) + ' '
        if args.ssh_port:
            params += str(args.ssh_port)
        parameters_hash = hashlib.md5(params.encode('utf-8')).hexdigest()
        fn_cache = cache.Cache(CACHE_FOLDER, CACHE_STALENESS_THRESHOLD_MINUTES,
                               parameters_hash)

    if settings.verbose >= 2:
        print('Filtering local host names.')
    remote_host_names = network.filter_local_addresses(all_host_names)
    if settings.verbose >= 2:
        print('Remote host found: ' + ' '.join(remote_host_names))

    if len(remote_host_names) > 0:
        if settings.verbose >= 2:
            print('Checking ssh on all remote hosts.')
        # Check if we can ssh into all remote hosts successfully.
        _check_all_hosts_ssh_successful(remote_host_names, args.ssh_port,
                                        fn_cache=fn_cache)
        if settings.verbose >= 2:
            print('SSH was successful into all the remote hosts.')

    if len(remote_host_names) > 0:
        if settings.verbose >= 2:
            print('Testing interfaces on all the hosts.')

        local_host_names = set(all_host_names) - set(remote_host_names)
        # Find the set of common, routed interfaces on all the hosts (remote
        # and local) and specify it in the args to be used by NCCL. It is
        # expected that the following function will find at least one
        # interface; otherwise, it will raise an exception.
        common_intfs = _driver_fn(all_host_names, local_host_names,
                                  settings, fn_cache=fn_cache)

        if settings.verbose >= 2:
            print('Interfaces on all the hosts were successfully checked.')
            print('Common interface found: ' + ' '.join(common_intfs))
    else:
        if settings.verbose >= 2:
            print('All hosts are local, finding the interfaces '
                  'with address 127.0.0.1')
        # If all the given hosts are local, find the interfaces with address
        # 127.0.0.1
        common_intfs = set()
        for iface, addrs in net_if_addrs().items():
            for addr in addrs:
                if addr.family == AF_INET and addr.address == '127.0.0.1':
                    common_intfs.add(iface)
                    break

        if len(common_intfs) == 0:
            raise ValueError('No interface is found for address 127.0.0.1.')

        if settings.verbose >= 2:
            print('Local interface found ' + ' '.join(common_intfs))

    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)

    if args.use_gloo:
        if not gloo_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'Gloo support has not been built. If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.')
        gloo_run(settings, remote_host_names, common_intfs, env)
    elif args.use_mpi:
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built. If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.')
        mpi_run(settings, common_intfs, env)
    else:
        if mpi_built(verbose=(settings.verbose >= 2)):
            mpi_run(settings, common_intfs, env)
        elif gloo_built(verbose=(settings.verbose >= 2)):
            gloo_run(settings, remote_host_names, common_intfs, env)
        else:
            raise ValueError(
                'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                'either MPI is installed (MPI) or CMake is installed (Gloo).')
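# A typical invocation that exercises this code path (flags as documented
# for horovodrun; host names are placeholders):
#
#   $ horovodrun -np 4 -H worker-0:2,worker-1:2 python train.py
#
# Passing --gloo or --mpi forces the corresponding branch above; otherwise
# the controller is auto-detected, preferring MPI and falling back to Gloo.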
    actor.update_env_vars.remote({"TEST": DUMMY_VALUE})
    assert ray.get(actor.env_vars.remote())["TEST"] == str(DUMMY_VALUE)


def test_local(ray_start_4_cpus):
    original_resources = ray.available_resources()
    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(setting, num_hosts=1, num_slots=4)
    hjob.start()
    hostnames = hjob.execute(lambda _: ray.services.get_node_ip_address())
    assert len(set(hostnames)) == 1, hostnames
    hjob.shutdown()
    assert check_resources(original_resources)


@pytest.mark.skipif(not gloo_built(), reason='Gloo is required for Ray integration')
def test_ray_init(ray_start_4_cpus):
    original_resources = ray.available_resources()

    def simple_fn(worker):
        import horovod.torch as hvd
        hvd.init()
        return hvd.rank()

    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(
        setting, num_hosts=1, num_slots=4, use_gpu=torch.cuda.is_available())
    hjob.start()
    result = hjob.execute(simple_fn)
    assert len(set(result)) == 4
    hjob.shutdown()
# ==============================================================================

import os
import unittest
from tempfile import NamedTemporaryFile

import horovod
from horovod.common.util import gloo_built
from horovod.runner.common.util.env import get_env_rank_and_size


def train():
    return get_env_rank_and_size()


@unittest.skipIf(not gloo_built(), "Gloo is not available")
class ElasticRunTests(unittest.TestCase):
    """
    Tests for run api with elastic config.
    """

    def test_run_with_hosts(self):
        """Tests two usable hosts, two slots each in standard happy path."""
        hosts = 'localhost:2,127.0.0.1:2'
        results = horovod.run(train, num_proc=2, min_num_proc=2,
                              max_num_proc=2, hosts=hosts)
        self.assertEqual([(0, 2), (1, 2)], results)

    def test_run_with_discovery_script(self):
    return logger, training_fn


@contextmanager
def fault_tolerance_patches():
    with mock.patch(
            'horovod.runner.elastic.driver.DISCOVER_HOSTS_FREQUENCY_SECS',
            0.1):
        with mock.patch(
                "horovod.runner.util.network.get_driver_ip",
                return_value=socket.gethostbyname(socket.gethostname())):
            yield


@pytest.mark.skipif(not gloo_built(), reason='Gloo is required for Ray integration')
def test_fault_tolerance_hosts_added_and_removed(ray_8_cpus):
    with fault_tolerance_patches():
        discovery_schedule = [
            (20, ['host-1:2']),
            (60, ['host-1:2', 'host-2:1', 'host-3:1']),
            (None, ['host-2:1']),
        ]
        nics = list(psutil.net_if_addrs().keys())[0]

        settings = ElasticRayExecutor.create_settings(min_np=1, nics={nics})
        settings.discovery = SimpleTestDiscovery(discovery_schedule)
        executor = ElasticRayExecutor(
            settings, cpus_per_slot=1, override_discovery=False)
def run_elastic(fn, args=(), kwargs={},
                num_proc=None, min_num_proc=None, max_num_proc=None,
                start_timeout=None, elastic_timeout=None, reset_limit=None,
                env=None, stdout=None, stderr=None, verbose=1, nics=None,
                prefix_output_with_timestamp=False,
                # min_np is deprecated, use min_num_proc instead
                min_np=None,
                # max_np is deprecated, use max_num_proc instead
                max_np=None):
    """
    Runs Elastic Horovod on Spark. Runs `num_proc` processes executing `fn`
    using the same amount of Spark tasks.

    Args:
        fn: Function to run.
        args: Arguments to pass to `fn`.
        kwargs: Keyword arguments to pass to `fn`.
        num_proc: Number of Horovod processes. Defaults to `spark.default.parallelism`.
        min_num_proc: Minimum number of processes running for training to continue.
                      If the number of available processes dips below this threshold,
                      then training will wait for more instances to become available.
        max_num_proc: Maximum number of training processes, beyond which no additional
                      processes will be created. If not specified, then will be unbounded.
        start_timeout: Timeout for Spark tasks to spawn, register and start running the code, in seconds.
                       If not set, falls back to the `HOROVOD_SPARK_START_TIMEOUT` environment variable value.
                       If it is not set as well, defaults to 600 seconds.
        elastic_timeout: Timeout for elastic initialisation after re-scaling the cluster.
                         If not set, falls back to the `HOROVOD_ELASTIC_TIMEOUT` environment variable value.
                         If it is not set as well, defaults to 600 seconds.
        reset_limit: Maximum number of resets after which the job is terminated.
        env: Environment dictionary to use in Horovod run. Defaults to `os.environ`.
        stdout: Horovod stdout is redirected to this stream.
        stderr: Horovod stderr is redirected to this stream.
        verbose: Debug output verbosity (0-2). Defaults to 1.
        nics: List of NICs for tcp network communication.
        prefix_output_with_timestamp: shows timestamp in stdout/stderr forwarding on the driver

    Returns:
        List of results returned by running `fn` on each rank.
    """
    if min_np is not None:
        min_num_proc = min_np
        warnings.warn('min_np is deprecated, use min_num_proc instead',
                      DeprecationWarning)
    if max_np is not None:
        max_num_proc = max_np
        warnings.warn('max_np is deprecated, use max_num_proc instead',
                      DeprecationWarning)

    if not gloo_built(verbose=(verbose >= 2)):
        raise ValueError(
            'Gloo support is required to use elastic training, but has not been built. Ensure CMake is '
            'installed and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.')

    spark_context = pyspark.SparkContext._active_spark_context
    if spark_context is None:
        raise Exception('Could not find an active SparkContext, are you '
                        'running in a PySpark session?')

    if start_timeout is None:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_SPARK_START_TIMEOUT', '600'))

    # nics needs to be a set
    if nics and not isinstance(nics, set):
        nics = set(nics)

    if num_proc is None:
        # TODO: #2023 try spark.dynamicAllocation.initialExecutors
        num_proc = spark_context.defaultParallelism
        if verbose >= 1:
            logging.info('Running %d processes (inferred from '
                         'spark.default.parallelism)...', num_proc)
    else:
        if verbose >= 1:
            logging.info('Running %d processes...', num_proc)

    if min_num_proc is None:
        # TODO: #2023 try spark.dynamicAllocation.minExecutors
        min_num_proc = num_proc
    if max_num_proc is None:
        # TODO: #2023 try spark.dynamicAllocation.maxExecutors
        max_num_proc = num_proc

    # start Spark driver service and launch settings.num_proc Spark tasks
    key = secret.make_secret_key()
    spark_job_group = 'horovod.spark.run.%d' % job_id.next_job_id()
    driver = driver_service.SparkDriverService(num_proc, max_num_proc,
                                               fn, args, kwargs, key, nics)

    discovery = host_discovery.SparkDriverHostDiscovery(driver)

    tmout = timeout.Timeout(
        start_timeout,
        message='Timed out waiting for {activity}. Please check that you have '
                'enough resources to run all Horovod processes. Each Horovod '
                'process runs in a Spark task. You may need to increase the '
                'start_timeout parameter to a larger value if your Spark resources '
                'are allocated on-demand.')
    settings = hvd_elastic_settings.ElasticSettings(
        discovery=discovery,
        min_num_proc=min_num_proc,
        max_num_proc=max_num_proc,
        elastic_timeout=elastic_timeout,
        reset_limit=reset_limit,
        num_proc=num_proc,
        verbose=verbose,
        key=key,
        start_timeout=tmout,
        nics=nics,
        run_func_mode=True,
        prefix_output_with_timestamp=prefix_output_with_timestamp)

    result_queue = queue.Queue(1)

    # launch settings.num_proc / settings.max_num_proc Spark tasks
    spark_thread = _make_spark_thread(spark_context, spark_job_group, driver,
                                      result_queue, settings,
                                      use_gloo=True, is_elastic=True)
    try:
        # Register task addresses of initial num_proc tasks
        _register_task_addresses(driver, settings)

        # Run the job
        gloo_run_elastic(settings, driver, env, stdout, stderr)
    except:
        # Terminate Spark job.
        spark_context.cancelJobGroup(spark_job_group)

        # Re-raise exception.
        raise
    finally:
        spark_thread.join()
        driver.shutdown()

    # Make sure the Spark job did not fail.
    driver.check_for_spark_job_failure()

    # get ranks from driver
    indices_in_rank_order = _get_indices_in_rank_order(driver)

    # If there's no exception, execution results are in this queue.
    results = result_queue.get_nowait()
    return [results[index] for index in indices_in_rank_order]
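# A hedged usage sketch (the `train` function and argument values are
# illustrative, not part of this module):
#
#   import horovod.spark
#
#   def train(epochs):
#       import horovod.torch as hvd
#       hvd.init()
#       return hvd.rank()
#
#   results = horovod.spark.run_elastic(train, args=(10,), num_proc=4,
#                                       min_num_proc=2, max_num_proc=8,
#                                       verbose=2)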
    return training_fn


@contextmanager
def fault_tolerance_patches():
    with mock.patch(
            'horovod.runner.elastic.driver.DISCOVER_HOSTS_FREQUENCY_SECS',
            0.1):
        with mock.patch(
                "horovod.runner.util.network.get_driver_ip",
                return_value=socket.gethostbyname(socket.gethostname())):
            yield


@pytest.mark.skipif(
    not gloo_built(), reason='Gloo is required for Ray integration')
@pytest.mark.skip(reason='https://github.com/horovod/horovod/issues/3197')
def test_fault_tolerance_hosts_added_and_removed(ray_8_cpus):
    with fault_tolerance_patches():
        discovery_schedule = [
            (10, ['host-1:2']),
            (30, ['host-1:2', 'host-2:1', 'host-3:1']),
            (None, ['host-2:1']),
        ]
        nics = list(psutil.net_if_addrs().keys())[0]

        settings = RayExecutor.create_settings(nics={nics})
        settings.discovery = SimpleTestDiscovery(discovery_schedule)
        executor = RayExecutor(
            settings,
            min_workers=1,
class TestRayDiscoverySuite:
    @pytest.mark.skipif(not gloo_built(), reason='Gloo is required for Ray integration')
    def test_cpu_discovery(self, ray_shutdown):
        ray.init(num_cpus=4, num_gpus=1)
        discovery = RayHostDiscovery(cpus_per_slot=1)
        mapping = discovery.find_available_hosts_and_slots()
        assert len(mapping) == 1
        assert list(mapping.values()) == [4]

    @pytest.mark.skipif(not gloo_built(), reason='Gloo is required for Ray integration')
    def test_gpu_discovery(self, ray_shutdown):
        ray.init(num_cpus=4, num_gpus=1)
        discovery = RayHostDiscovery(use_gpu=True, cpus_per_slot=1)
        mapping = discovery.find_available_hosts_and_slots()
        assert len(mapping) == 1
        assert list(mapping.values()) == [1]

    @pytest.mark.skipif(not gloo_built(), reason='Gloo is required for Ray integration')
    def test_gpu_slot_discovery(self, ray_shutdown):
        ray.init(num_cpus=4, num_gpus=4)
        discovery = RayHostDiscovery(
            use_gpu=True, cpus_per_slot=1, gpus_per_slot=2)
        mapping = discovery.find_available_hosts_and_slots()
        assert len(mapping) == 1
        assert list(mapping.values()) == [2]

    @pytest.mark.skipif(not gloo_built(), reason='Gloo is required for Ray integration')
    def test_multinode(self, monkeypatch):
        def create_multi_node_mock():
            host_names = ["host-1", "host-2", "host-3"]
            resources = {"GPU": 2, "CPU": 8}

            def create_node_entry(hostname):
                return {
                    "NodeManagerAddress": hostname,
                    "Resources": resources.copy(),
                    "alive": True
                }

            return map(create_node_entry, host_names)

        monkeypatch.setattr(ray, "nodes", create_multi_node_mock)
        discovery = RayHostDiscovery(use_gpu=True, cpus_per_slot=1)
        mapping = discovery.find_available_hosts_and_slots()
        assert len(mapping) == 3
        assert list(mapping.values()) == [2, 2, 2]

    @pytest.mark.skipif(not gloo_built(), reason='Gloo is required for Ray integration')
    def test_multinode_gpus_per_slot(self, monkeypatch):
        def create_multi_node_mock():
            host_names = ["host-1", "host-2", "host-3"]
            resources = {"GPU": 2, "CPU": 8}

            def create_node_entry(hostname):
                return {
                    "NodeManagerAddress": hostname,
                    "Resources": resources.copy(),
                    "alive": True
                }

            return map(create_node_entry, host_names)

        monkeypatch.setattr(ray, "nodes", create_multi_node_mock)
        discovery = RayHostDiscovery(use_gpu=True, gpus_per_slot=2)
        mapping = discovery.find_available_hosts_and_slots()
        assert len(mapping) == 3
        assert list(mapping.values()) == [1, 1, 1]

    @pytest.mark.skipif(not gloo_built(), reason='Gloo is required for Ray integration')
    def test_multinode_mismatch(self, monkeypatch):
        def create_multi_node_mock():
            host_names = ["host-1", "host-2", "host-3"]
            resources = {"CPU": 8}

            def create_node_entry(hostname):
                return {
                    "NodeManagerAddress": hostname,
                    "Resources": resources.copy(),
                    "alive": True
                }

            return map(create_node_entry, host_names)

        monkeypatch.setattr(ray, "nodes", create_multi_node_mock)
        discovery = RayHostDiscovery(use_gpu=True, cpus_per_slot=1)
        mapping = discovery.find_available_hosts_and_slots()
        assert sum(mapping.values()) == 0
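# A hedged summary of the slot arithmetic exercised above (inferred from the
# assertions, not from the RayHostDiscovery implementation): for a node
# advertising {"CPU": 8, "GPU": 2}, use_gpu=True with the default of one GPU
# per slot yields 2 slots, gpus_per_slot=2 yields 1 slot, and a GPU-less node
# yields 0 slots when use_gpu=True.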
def test_gloo_built(self):
    """Test that Gloo has been built if env is set."""
    gloo_rank = int(os.getenv('HOROVOD_RANK', -1))
    if gloo_rank >= 0:
        self.assertTrue(gloo_built())