def _run_elastic(args): # construct host discovery component if args.host_discovery_script: discover_hosts = discovery.HostDiscoveryScript( args.host_discovery_script, args.slots) elif args.hosts: _, available_host_slots = hosts.parse_hosts_and_slots(args.hosts) if len(available_host_slots) < 2: raise ValueError( 'Cannot run in fault tolerance mode with fewer than 2 hosts.') discover_hosts = discovery.FixedHosts(available_host_slots) else: raise ValueError( 'One of --host-discovery-script, --hosts, or --hostnames must be provided' ) # horovodrun has to finish all the checks before this timeout runs out. if args.start_timeout: start_timeout = args.start_timeout else: # Lookup default timeout from the environment variable. start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30')) tmout = timeout.Timeout(start_timeout, message='Timed out waiting for {activity}. Please ' 'check connectivity between servers. You ' 'may need to increase the --start-timeout ' 'parameter if you have too many servers.') settings = elastic_settings.ElasticSettings( discovery=discover_hosts, min_num_proc=args.min_num_proc or args.num_proc, max_num_proc=args.max_num_proc, elastic_timeout=args.elastic_timeout, reset_limit=args.reset_limit, cooldown_range=args.cooldown_range, num_proc=args.num_proc, verbose=2 if args.verbose else 0, ssh_port=args.ssh_port, ssh_identity_file=args.ssh_identity_file, extra_mpi_args=args.mpi_args, key=secret.make_secret_key(), start_timeout=tmout, output_filename=args.output_filename, run_func_mode=args.run_func is not None, nics=args.nics, prefix_output_with_timestamp=args.prefix_output_with_timestamp) if not gloo_built(verbose=(settings.verbose >= 2)): raise ValueError( 'Gloo support is required to use elastic training, but has not been built. Ensure CMake is ' 'installed and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.' 
) env = os.environ.copy() config_parser.set_env_from_args(env, args) executable = args.executable or sys.executable return gloo_run_elastic(settings, env, args.run_func if args.run_func else args.command, executable)
def test_logging_args(self):
    """Logging flags given to horovodrun must end up in the generated env.

    Parses ``--log-level INFO --log-hide-timestamp`` and checks the values
    stored under ``config_parser.HOROVOD_LOG_LEVEL`` and
    ``config_parser.HOROVOD_LOG_HIDE_TIME``.
    """
    cli = ['horovodrun', '-np', '2',
           '--log-level', 'INFO',
           '--log-hide-timestamp']
    with override_args(*cli):
        parsed = parse_args()
        exported = {}
        config_parser.set_env_from_args(exported, parsed)

        expectations = [
            (config_parser.HOROVOD_LOG_LEVEL, 'INFO'),
            (config_parser.HOROVOD_LOG_HIDE_TIME, '1'),
        ]
        for variable, value in expectations:
            self.assertEqual(exported.get(variable), value)
def test_timeline_args(self):
    """Timeline flags given to horovodrun must end up in the generated env.

    Parses ``--timeline-filename /tmp/timeline.json --timeline-mark-cycles``
    and checks the values stored under ``config_parser.HOROVOD_TIMELINE``
    and ``config_parser.HOROVOD_TIMELINE_MARK_CYCLES``.
    """
    timeline_path = '/tmp/timeline.json'
    cli = ['horovodrun', '-np', '2',
           '--timeline-filename', timeline_path,
           '--timeline-mark-cycles']
    with override_args(*cli):
        parsed = parse_args()
        exported = {}
        config_parser.set_env_from_args(exported, parsed)

        expectations = [
            (config_parser.HOROVOD_TIMELINE, '/tmp/timeline.json'),
            (config_parser.HOROVOD_TIMELINE_MARK_CYCLES, '1'),
        ]
        for variable, value in expectations:
            self.assertEqual(exported.get(variable), value)
def test_autotuning_with_fixed_param(self):
    """With ``--autotune``, only explicitly given parameters are exported.

    ``--cache-capacity 1024`` and ``--no-hierarchical-allgather`` are passed
    on the command line, so their keys must be present in the env; the
    fusion threshold, cycle time and hierarchical allreduce keys must not
    appear at all.
    """
    cli = ['horovodrun', '-np', '2',
           '--autotune',
           '--cache-capacity', '1024',
           '--no-hierarchical-allgather']
    with override_args(*cli):
        parsed = parse_args()
        exported = {}
        config_parser.set_env_from_args(exported, parsed)

        # Parameters that were not fixed on the command line stay unset.
        absent = [
            config_parser.HOROVOD_FUSION_THRESHOLD,
            config_parser.HOROVOD_CYCLE_TIME,
            config_parser.HOROVOD_HIERARCHICAL_ALLREDUCE,
        ]
        for variable in absent:
            self.assertNotIn(variable, exported)

        # Parameters fixed on the command line are exported verbatim.
        present = [
            (config_parser.HOROVOD_CACHE_CAPACITY, '1024'),
            (config_parser.HOROVOD_HIERARCHICAL_ALLGATHER, '0'),
        ]
        for variable, value in present:
            self.assertEqual(exported.get(variable), value)
def test_library_args(self):
    """Library-specific flags must be exported into the generated env.

    Covers ``--mpi-threads-disable``, ``--num-nccl-streams``,
    ``--ccl-bgt-affinity`` and ``--gloo-timeout-seconds``.
    """
    cli = ['horovodrun', '-np', '2',
           '--mpi-threads-disable',
           '--num-nccl-streams', '2',
           '--ccl-bgt-affinity', '1',
           '--gloo-timeout-seconds', '60']
    with override_args(*cli):
        parsed = parse_args()
        exported = {}
        config_parser.set_env_from_args(exported, parsed)

        expectations = [
            (config_parser.HOROVOD_MPI_THREADS_DISABLE, '1'),
            (config_parser.HOROVOD_NUM_NCCL_STREAMS, '2'),
            (config_parser.HOROVOD_CCL_BGT_AFFINITY, '1'),
            (config_parser.HOROVOD_GLOO_TIMEOUT_SECONDS, '60'),
        ]
        for variable, value in expectations:
            self.assertEqual(exported.get(variable), value)
def _launch_job(args, settings, nics, command):
    """Run ``command`` through the controller chosen by ``run_controller``.

    A copy of the current process environment is extended via
    ``config_parser.set_env_from_args`` and shared by all three launch
    callbacks (Gloo, MPI, jsrun). The callbacks are handed to
    ``run_controller`` together with the matching ``args.use_*`` flags.

    :param args: parsed command line arguments.
    :param settings: settings object forwarded to the launcher.
    :param nics: network interfaces forwarded to the launcher.
    :param command: command forwarded to the launcher.
    """
    job_env = os.environ.copy()
    config_parser.set_env_from_args(job_env, args)

    def _via_gloo():
        # The driver IP is resolved only if the Gloo callback is actually invoked.
        gloo_run(settings, nics, job_env, network.get_driver_ip(nics), command)

    def _via_mpi():
        mpi_run(settings, nics, job_env, command)

    def _via_jsrun():
        js_run(settings, nics, job_env, command)

    run_controller(
        args.use_gloo, _via_gloo,
        args.use_mpi, _via_mpi,
        args.use_jsrun, _via_jsrun,
        args.verbose,
    )
def test_params_args(self):
    """Tunable parameter flags must be exported into the generated env.

    The fusion threshold is given in MB on the command line and is expected
    in the env as ``str(10 * 1024 * 1024)``; the cycle time is expected as
    the string ``'20.0'``.
    """
    cli = ['horovodrun', '-np', '2',
           '--fusion-threshold-mb', '10',
           '--cycle-time-ms', '20',
           '--cache-capacity', '512',
           '--hierarchical-allreduce',
           '--hierarchical-allgather']
    with override_args(*cli):
        parsed = parse_args()
        exported = {}
        config_parser.set_env_from_args(exported, parsed)

        fusion_threshold = str(10 * 1024 * 1024)
        expectations = [
            (config_parser.HOROVOD_FUSION_THRESHOLD, fusion_threshold),
            (config_parser.HOROVOD_CYCLE_TIME, '20.0'),
            (config_parser.HOROVOD_CACHE_CAPACITY, '512'),
            (config_parser.HOROVOD_HIERARCHICAL_ALLREDUCE, '1'),
            (config_parser.HOROVOD_HIERARCHICAL_ALLGATHER, '1'),
        ]
        for variable, value in expectations:
            self.assertEqual(exported.get(variable), value)
def test_autotune_args(self):
    """Autotune flags must be exported into the generated env.

    Covers ``--autotune`` itself plus the log file, warmup samples, steps
    per sample, Bayesian optimization max samples and Gaussian process
    noise options.
    """
    cli = ['horovodrun', '-np', '2',
           '--autotune',
           '--autotune-log-file', '/tmp/autotune.txt',
           '--autotune-warmup-samples', '1',
           '--autotune-steps-per-sample', '5',
           '--autotune-bayes-opt-max-samples', '10',
           '--autotune-gaussian-process-noise', '0.2']
    with override_args(*cli):
        parsed = parse_args()
        exported = {}
        config_parser.set_env_from_args(exported, parsed)

        expectations = [
            (config_parser.HOROVOD_AUTOTUNE, '1'),
            (config_parser.HOROVOD_AUTOTUNE_LOG, '/tmp/autotune.txt'),
            (config_parser.HOROVOD_AUTOTUNE_WARMUP_SAMPLES, '1'),
            (config_parser.HOROVOD_AUTOTUNE_STEPS_PER_SAMPLE, '5'),
            (config_parser.HOROVOD_AUTOTUNE_BAYES_OPT_MAX_SAMPLES, '10'),
            (config_parser.HOROVOD_AUTOTUNE_GAUSSIAN_PROCESS_NOISE, '0.2'),
        ]
        for variable, value in expectations:
            self.assertEqual(exported.get(variable), value)
def _run(self, discovery_schedule=None, exit_schedule=None, exit_mode='exception', np=2, min_np=2, max_np=4, hosts=None, reset_limit=None): if not discovery_schedule and not hosts: raise ValueError('at least one of discovery schedule or hosts must be given') with temppath() as logfile: with _temp_discovery_script(logfile, discovery_schedule or [(None, hosts.split(','))]) \ as discovery_script: command_args = ['horovodrun', '-np', str(np), '--min-np', str(min_np), '--log-level', 'DEBUG'] if hosts is not None: command_args += ['-H', hosts] else: command_args += ['--host-discovery-script', discovery_script, '--max-np', str(max_np)] if reset_limit is not None: command_args += ['--reset-limit', str(reset_limit)] command_args += ['python', self._training_script, '--logfile', logfile] if discovery_schedule: command_args += ['--discovery-schedule', json.dumps(discovery_schedule)] if exit_schedule: command_args += ['--exit-schedule', json.dumps(exit_schedule), '--exit-mode', exit_mode] print(' '.join(command_args)) with override_args(*command_args): args = parse_args() env = {} config_parser.set_env_from_args(env, args) _run_elastic(args) with open(logfile, 'r') as f: lines = f.readlines() print('logfile:') for line in lines: print(line) return [json.loads(line) for line in lines]
def test_stall_check_args(self):
    """Stall-check flags must be exported into the generated env.

    First scenario: ``--no-stall-check`` sets the disable key to ``'1'``.
    Second scenario: explicit warning/shutdown times are exported and the
    disable key is not present.
    """
    # Scenario 1: stall check switched off.
    with override_args('horovodrun', '-np', '2', '--no-stall-check'):
        parsed = parse_args()
        exported = {}
        config_parser.set_env_from_args(exported, parsed)

        self.assertEqual(exported.get(config_parser.HOROVOD_STALL_CHECK_DISABLE), '1')

    # Scenario 2: stall check left on with custom thresholds.
    cli = ['horovodrun', '-np', '2',
           '--stall-check-warning-time-seconds', '10',
           '--stall-check-shutdown-time-seconds', '20']
    with override_args(*cli):
        parsed = parse_args()
        exported = {}
        config_parser.set_env_from_args(exported, parsed)

        self.assertNotIn(config_parser.HOROVOD_STALL_CHECK_DISABLE, exported)
        expectations = [
            (config_parser.HOROVOD_STALL_CHECK_TIME_SECONDS, '10'),
            (config_parser.HOROVOD_STALL_SHUTDOWN_TIME_SECONDS, '20'),
        ]
        for variable, value in expectations:
            self.assertEqual(exported.get(variable), value)