def _launch_job(args, remote_host_names, settings, common_intfs, command):
    """Launch the job with Gloo or MPI, honoring an explicit user choice.

    Args:
        args: parsed command line arguments (``use_gloo`` / ``use_mpi`` flags).
        remote_host_names: host names that are not local to this driver.
        settings: horovod Settings object (verbosity, timeouts, ...).
        common_intfs: network interfaces routable from all hosts.
        command: the command to execute on each worker.

    Raises:
        ValueError: if the requested (or any) controller support is not built.
    """
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)

    def _gloo_run():
        # Fix: resolve the driver IP only when the Gloo controller is actually
        # used -- previously it was computed up front even for MPI-only runs,
        # doing needless interface lookup work on the MPI path.
        gloo_run(settings, remote_host_names, common_intfs, env,
                 _get_driver_ip(common_intfs), command)

    if args.use_gloo:
        if not gloo_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'Gloo support has not been built. If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
            )
        _gloo_run()
    elif args.use_mpi:
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built. If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        mpi_run(settings, common_intfs, env, command)
    else:
        # No controller forced: prefer MPI when built, otherwise fall back to
        # Gloo, otherwise fail with guidance.
        if mpi_built(verbose=(settings.verbose >= 2)):
            mpi_run(settings, common_intfs, env, command)
        elif gloo_built(verbose=(settings.verbose >= 2)):
            _gloo_run()
        else:
            raise ValueError(
                'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                'either MPI is installed (MPI) or CMake is installed (Gloo).')
def _run(self, discovery_schedule=None, exit_schedule=None,
         exit_mode='exception', np=2, min_np=2, max_np=4, hosts=None):
    """Run an elastic training job end-to-end and return its parsed log.

    Builds a ``horovodrun`` command line, executes it in-process via
    ``_run_elastic``, then reads back the JSON lines the training script
    wrote to the log file.

    Args:
        discovery_schedule: schedule of host availability for the temp
            discovery script (mutually optional with ``hosts``).
        exit_schedule: schedule of worker exits to inject, if any.
        exit_mode: how injected exits terminate ('exception' by default).
        np: initial number of processes.
        min_np: minimum number of processes to keep the job alive.
        max_np: maximum number of processes (discovery-script mode only).
        hosts: fixed 'host:slots,...' string; when given, discovery is skipped.

    Returns:
        List of dicts, one per JSON line logged by the training script.

    Raises:
        ValueError: if neither a discovery schedule nor hosts are given.
    """
    if not discovery_schedule and not hosts:
        raise ValueError(
            'at least one of discovery schedule or hosts must be given')

    with temppath() as logfile:
        # When no schedule is given, synthesize a single static epoch from
        # the fixed host list so the discovery script has something to serve.
        with _temp_discovery_script(logfile, discovery_schedule or [(None, hosts.split(','))]) \
                as discovery_script:
            command_args = ['horovodrun',
                            '-np', str(np),
                            '--min-np', str(min_np),
                            '--log-level', 'DEBUG']
            if hosts is not None:
                # Fixed hosts: no discovery script, no elasticity above np.
                command_args += ['-H', hosts]
            else:
                command_args += [
                    '--host-discovery-script', discovery_script,
                    '--max-np', str(max_np)
                ]
            command_args += ['python', self._training_script,
                             '--logfile', logfile]
            if discovery_schedule:
                command_args += [
                    '--discovery-schedule', json.dumps(discovery_schedule)
                ]
            if exit_schedule:
                command_args += [
                    '--exit-schedule', json.dumps(exit_schedule),
                    '--exit-mode', exit_mode
                ]
            print(' '.join(command_args))

            with override_args(*command_args):
                args = parse_args()
                env = {}
                config_parser.set_env_from_args(env, args)
                _run_elastic(args)

                # The training script logs one JSON object per line.
                with open(logfile, 'r') as f:
                    lines = f.readlines()

                print('logfile:')
                for line in lines:
                    print(line)

                return [json.loads(line) for line in lines]
def test_logging_args(self):
    """Logging CLI flags should be mirrored into HOROVOD_* env variables."""
    cli = ('horovodrun', '-np', '2',
           '--log-level', 'INFO',
           '--log-hide-timestamp')
    with override_args(*cli):
        parsed = parse_args()
        env = {}
        config_parser.set_env_from_args(env, parsed)

        expectations = (
            (config_parser.HOROVOD_LOG_LEVEL, 'INFO'),
            (config_parser.HOROVOD_LOG_HIDE_TIME, '1'),
        )
        for var, expected in expectations:
            self.assertEqual(env.get(var), expected)
def test_timeline_args(self):
    """Timeline CLI flags should be mirrored into HOROVOD_* env variables."""
    cli = ('horovodrun', '-np', '2',
           '--timeline-filename', '/tmp/timeline.json',
           '--timeline-mark-cycles')
    with override_args(*cli):
        parsed = parse_args()
        env = {}
        config_parser.set_env_from_args(env, parsed)

        for var, expected in (
                (config_parser.HOROVOD_TIMELINE, '/tmp/timeline.json'),
                (config_parser.HOROVOD_TIMELINE_MARK_CYCLES, '1')):
            self.assertEqual(env.get(var), expected)
def test_library_args(self):
    """Library tuning flags should be mirrored into HOROVOD_* env variables."""
    cli = ('horovodrun', '-np', '2',
           '--mpi-threads-disable',
           '--num-nccl-streams', '2',
           '--mlsl-bgt-affinity', '1')
    with override_args(*cli):
        parsed = run.parse_args()
        env = {}
        config_parser.set_env_from_args(env, parsed)

        # Direct indexing (not .get) preserves a KeyError on a missing key.
        for var, expected in (
                (config_parser.HOROVOD_MPI_THREADS_DISABLE, '1'),
                (config_parser.HOROVOD_NUM_NCCL_STREAMS, '2'),
                (config_parser.HOROVOD_MLSL_BGT_AFFINITY, '1')):
            self.assertEqual(env[var], expected)
def _run_elastic(args):
    """Run an elastic (fault tolerant) training job on the Gloo controller.

    Builds the host discovery component, the start timeout, and the elastic
    settings from parsed command line ``args``, then hands off to
    ``gloo_run_elastic``.

    Raises:
        ValueError: if no host source is provided, fewer than 2 fixed hosts
            are available, or Gloo support has not been built.
    """
    # construct host discovery component
    if args.host_discovery_script:
        # A user-supplied script reports the currently available hosts.
        discover_hosts = discovery.HostDiscoveryScript(
            args.host_discovery_script, args.slots)
    elif args.hosts:
        _, available_host_slots = hosts.parse_hosts_and_slots(args.hosts)
        if len(available_host_slots) < 2:
            raise ValueError(
                'Cannot run in fault tolerance mode with fewer than 2 hosts.')
        discover_hosts = discovery.FixedHosts(available_host_slots)
    else:
        raise ValueError(
            'One of --host-discovery-script, --hosts, or --hostnames must be provided'
        )

    # horovodrun has to finish all the checks before this timeout runs out.
    if args.start_timeout:
        start_timeout = args.start_timeout
    else:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please '
                                    'check connectivity between servers. You '
                                    'may need to increase the --start-timeout '
                                    'parameter if you have too many servers.')

    settings = elastic_settings.ElasticSettings(
        discovery=discover_hosts,
        # --min-np falls back to -np when not given explicitly.
        min_np=args.min_np or args.np,
        max_np=args.max_np,
        elastic_timeout=args.elastic_timeout,
        reset_limit=args.reset_limit,
        num_proc=args.np,
        verbose=2 if args.verbose else 0,
        ssh_port=args.ssh_port,
        extra_mpi_args=args.mpi_args,
        key=secret.make_secret_key(),
        start_timeout=tmout,
        output_filename=args.output_filename,
        run_func_mode=args.run_func is not None,
        nics=args.nics)

    # Elastic mode is implemented on top of the Gloo controller only.
    if not gloo_built(verbose=(settings.verbose >= 2)):
        raise ValueError(
            'Gloo support is required to use elastic training, but has not been built. Ensure CMake is '
            'installed and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
        )

    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)
    gloo_run_elastic(settings, env, args.command)
def test_autotuning_with_fixed_param(self):
    """With autotune on, fixed params stay in env; tunable ones stay unset."""
    cli = ('horovodrun', '-np', '2', '--autotune',
           '--cache-capacity', '1024',
           '--no-hierarchical-allgather')
    with override_args(*cli):
        parsed = parse_args()
        env = {}
        config_parser.set_env_from_args(env, parsed)

        # Parameters left to the autotuner must not be pinned in env.
        for var in (config_parser.HOROVOD_FUSION_THRESHOLD,
                    config_parser.HOROVOD_CYCLE_TIME,
                    config_parser.HOROVOD_HIERARCHICAL_ALLREDUCE):
            self.assertNotIn(var, env)

        # Explicitly fixed parameters keep their requested values.
        self.assertEqual(env.get(config_parser.HOROVOD_CACHE_CAPACITY),
                         '1024')
        self.assertEqual(env.get(config_parser.HOROVOD_HIERARCHICAL_ALLGATHER),
                         '0')
def test_library_args(self):
    """Library tuning flags should be mirrored into HOROVOD_* env variables."""
    cli = ('horovodrun', '-np', '2',
           '--mpi-threads-disable',
           '--num-nccl-streams', '2',
           '--ccl-bgt-affinity', '1',
           '--gloo-timeout-seconds', '60')
    with override_args(*cli):
        parsed = parse_args()
        env = {}
        config_parser.set_env_from_args(env, parsed)

        for var, expected in (
                (config_parser.HOROVOD_MPI_THREADS_DISABLE, '1'),
                (config_parser.HOROVOD_NUM_NCCL_STREAMS, '2'),
                (config_parser.HOROVOD_CCL_BGT_AFFINITY, '1'),
                (config_parser.HOROVOD_GLOO_TIMEOUT_SECONDS, '60')):
            self.assertEqual(env.get(var), expected)
def _launch_job(args, settings, nics, command):
    """Hand the job off to whichever controller ``run_controller`` selects.

    Builds the worker environment from the current process environment plus
    the parsed CLI arguments, then defers the Gloo / MPI / jsrun decision
    (and its error handling) to ``run_controller``.
    """
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)

    def launch_gloo():
        # Driver IP is resolved lazily, only if Gloo is actually chosen.
        gloo_run(settings, nics, env, network.get_driver_ip(nics), command)

    def launch_mpi():
        mpi_run(settings, nics, env, command)

    def launch_js():
        js_run(settings, nics, env, command)

    run_controller(args.use_gloo, launch_gloo,
                   args.use_mpi, launch_mpi,
                   args.use_jsrun, launch_js,
                   args.verbose)
def test_params_args(self):
    """Performance knob flags should be converted into env variables."""
    cli = ('horovodrun', '-np', '2',
           '--fusion-threshold-mb', '10',
           '--cycle-time-ms', '20',
           '--cache-capacity', '512',
           '--hierarchical-allreduce',
           '--hierarchical-allgather')
    with override_args(*cli):
        parsed = parse_args()
        env = {}
        config_parser.set_env_from_args(env, parsed)

        for var, expected in (
                # MB on the command line, bytes in the environment.
                (config_parser.HOROVOD_FUSION_THRESHOLD, str(10 * 1024 * 1024)),
                (config_parser.HOROVOD_CYCLE_TIME, '20.0'),
                (config_parser.HOROVOD_CACHE_CAPACITY, '512'),
                (config_parser.HOROVOD_HIERARCHICAL_ALLREDUCE, '1'),
                (config_parser.HOROVOD_HIERARCHICAL_ALLGATHER, '1')):
            self.assertEqual(env.get(var), expected)
def test_autotune_args(self):
    """Autotune CLI flags should be converted into env variables."""
    cli = ('horovodrun', '-np', '2', '--autotune',
           '--autotune-log-file', '/tmp/autotune.txt',
           '--autotune-warmup-samples', '1',
           '--autotune-steps-per-sample', '5',
           '--autotune-bayes-opt-max-samples', '10',
           '--autotune-gaussian-process-noise', '0.2')
    with override_args(*cli):
        parsed = parse_args()
        env = {}
        config_parser.set_env_from_args(env, parsed)

        for var, expected in (
                (config_parser.HOROVOD_AUTOTUNE, '1'),
                (config_parser.HOROVOD_AUTOTUNE_LOG, '/tmp/autotune.txt'),
                (config_parser.HOROVOD_AUTOTUNE_WARMUP_SAMPLES, '1'),
                (config_parser.HOROVOD_AUTOTUNE_STEPS_PER_SAMPLE, '5'),
                (config_parser.HOROVOD_AUTOTUNE_BAYES_OPT_MAX_SAMPLES, '10'),
                (config_parser.HOROVOD_AUTOTUNE_GAUSSIAN_PROCESS_NOISE, '0.2')):
            self.assertEqual(env.get(var), expected)
def test_stall_check_args(self):
    """Stall-check flags: disabled mode and tuned-threshold mode."""
    # Disabling the stall check sets only the disable variable.
    with override_args('horovodrun', '-np', '2',
                       '--no-stall-check'):
        parsed = parse_args()
        env = {}
        config_parser.set_env_from_args(env, parsed)
        self.assertEqual(env.get(config_parser.HOROVOD_STALL_CHECK_DISABLE),
                         '1')

    # Tuning thresholds keeps the check enabled and sets both timings.
    with override_args('horovodrun', '-np', '2',
                       '--stall-check-warning-time-seconds', '10',
                       '--stall-check-shutdown-time-seconds', '20'):
        parsed = parse_args()
        env = {}
        config_parser.set_env_from_args(env, parsed)
        self.assertNotIn(config_parser.HOROVOD_STALL_CHECK_DISABLE, env)
        for var, expected in (
                (config_parser.HOROVOD_STALL_CHECK_TIME_SECONDS, '10'),
                (config_parser.HOROVOD_STALL_SHUTDOWN_TIME_SECONDS, '20')):
            self.assertEqual(env.get(var), expected)
def _launch_job(args, remote_host_names, settings, nics, command):
    """Launch the job with the controller the user requested (Gloo, MPI, or
    jsrun), falling back to whichever support is built when none is forced.

    Args:
        args: parsed command line arguments (use_gloo/use_mpi/use_jsrun flags).
        remote_host_names: host names that are not local to this driver.
        settings: horovod Settings object (verbosity, timeouts, ...).
        nics: network interfaces routable from all hosts.
        command: the command to execute on each worker.

    Raises:
        ValueError: if the required controller support is not built, or jsrun
            is requested outside of an LSF environment.
    """
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)

    # Preconditions and the Gloo launch call were previously duplicated
    # verbatim across branches; factor them into local helpers.
    def _require_gloo():
        if not gloo_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'Gloo support has not been built. If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
            )

    def _require_mpi():
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built. If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )

    def _gloo_run():
        gloo_run(settings, remote_host_names, nics, env,
                 network._get_driver_ip(nics), command)

    if args.use_gloo:
        _require_gloo()
        _gloo_run()
    elif args.use_mpi:
        _require_mpi()
        mpi_run(settings, nics, env, command)
    elif args.use_jsrun:
        # jsrun runs over MPI and only works inside an LSF allocation.
        _require_mpi()
        if not lsf.LSFUtils.using_lsf():
            raise ValueError(
                'Horovod did not detect an LSF job. The jsrun launcher can only be used in that environment. '
                'Please, pick a different launcher for other environments.')
        js_run(settings, nics, env, command)
    else:
        # Nothing forced: prefer MPI (jsrun when under LSF), then Gloo.
        if mpi_built(verbose=(settings.verbose >= 2)):
            if lsf.LSFUtils.using_lsf() and is_jsrun_installed():
                js_run(settings, nics, env, command)
            else:
                mpi_run(settings, nics, env, command)
        elif gloo_built(verbose=(settings.verbose >= 2)):
            _gloo_run()
        else:
            raise ValueError(
                'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                'either MPI is installed (MPI) or CMake is installed (Gloo).')
def run():
    """Entry point for ``horovodrun``: validate hosts, probe connectivity and
    interfaces, then launch the job with Gloo or MPI.

    Raises:
        ValueError: on malformed host specs, missing loopback interface, or
            when neither Gloo nor MPI support has been built.
    """
    args = parse_args()

    if args.check_build:
        check_build(args.verbose)

    # if hosts are not specified, either parse from hostfile, or default as
    # localhost
    if not args.hosts:
        if args.hostfile:
            args.hosts = parse_host_files(args.hostfile)
        else:
            # Set hosts to localhost if not specified
            args.hosts = 'localhost:{np}'.format(np=args.np)

    host_list = args.hosts.split(',')
    all_host_names = []
    # Each entry must look like 'hostname:slots'.
    pattern = re.compile(r'^[\w.-]+:\d+$')
    for host in host_list:
        if not pattern.match(host.strip()):
            raise ValueError('Invalid host input, please make sure it has '
                             'format as : worker-0:2,worker-1:2.')
        all_host_names.append(host.strip().split(':')[0])

    # horovodrun has to finish all the checks before this timeout runs out.
    if args.start_timeout:
        start_timeout = args.start_timeout
    else:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please '
                                    'check connectivity between servers. You '
                                    'may need to increase the --start-timeout '
                                    'parameter if you have too many servers.')
    settings = hvd_settings.Settings(verbose=2 if args.verbose else 0,
                                     ssh_port=args.ssh_port,
                                     key=secret.make_secret_key(),
                                     timeout=tmout,
                                     num_hosts=len(all_host_names),
                                     num_proc=args.np,
                                     hosts=args.hosts,
                                     command=args.command)

    # This cache stores the results of checks performed by horovodrun
    # during the initialization step. It can be disabled by setting
    # --disable-cache flag.
    fn_cache = None
    if not args.disable_cache:
        # Cache key is derived from the run parameters so that changing
        # them invalidates previously cached check results.
        params = ''
        if args.np:
            params += str(args.np) + ' '
        if args.hosts:
            params += str(args.hosts) + ' '
        if args.ssh_port:
            params += str(args.ssh_port)
        parameters_hash = hashlib.md5(params.encode('utf-8')).hexdigest()
        fn_cache = cache.Cache(CACHE_FOLDER, CACHE_STALENESS_THRESHOLD_MINUTES,
                               parameters_hash)

    if settings.verbose >= 2:
        print('Filtering local host names.')
    remote_host_names = network.filter_local_addresses(all_host_names)
    if settings.verbose >= 2:
        print('Remote host found: ' + ' '.join(remote_host_names))

    if len(remote_host_names) > 0:
        if settings.verbose >= 2:
            print('Checking ssh on all remote hosts.')
        # Check if we can ssh into all remote hosts successfully.
        _check_all_hosts_ssh_successful(remote_host_names, args.ssh_port,
                                        fn_cache=fn_cache)
        if settings.verbose >= 2:
            print('SSH was successful into all the remote hosts.')

    if len(remote_host_names) > 0:
        if settings.verbose >= 2:
            print('Testing interfaces on all the hosts.')

        local_host_names = set(all_host_names) - set(remote_host_names)
        # Find the set of common, routed interfaces on all the hosts (remote
        # and local) and specify it in the args to be used by NCCL. It is
        # expected that the following function will find at least one interface
        # otherwise, it will raise an exception.
        common_intfs = _driver_fn(all_host_names, local_host_names,
                                  settings, fn_cache=fn_cache)

        if settings.verbose >= 2:
            print('Interfaces on all the hosts were successfully checked.')
            print('Common interface found: ' + ' '.join(common_intfs))
    else:
        if settings.verbose >= 2:
            print('All hosts are local, finding the interfaces '
                  'with address 127.0.0.1')
        # If all the given hosts are local, find the interfaces with address
        # 127.0.0.1
        common_intfs = set()
        for iface, addrs in net_if_addrs().items():
            for addr in addrs:
                if addr.family == AF_INET and addr.address == '127.0.0.1':
                    common_intfs.add(iface)
                    break

        if len(common_intfs) == 0:
            raise ValueError('No interface is found for address 127.0.0.1.')

        if settings.verbose >= 2:
            print('Local interface found ' + ' '.join(common_intfs))

    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)

    if args.use_gloo:
        if not gloo_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'Gloo support has not been built. If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
            )
        gloo_run(settings, remote_host_names, common_intfs, env)
    elif args.use_mpi:
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built. If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        mpi_run(settings, common_intfs, env)
    else:
        # No controller forced: prefer MPI when built, otherwise Gloo.
        if mpi_built(verbose=(settings.verbose >= 2)):
            mpi_run(settings, common_intfs, env)
        elif gloo_built(verbose=(settings.verbose >= 2)):
            gloo_run(settings, remote_host_names, common_intfs, env)
        else:
            raise ValueError(
                'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                'either MPI is installed (MPI) or CMake is installed (Gloo).')