def test_rank_and_size(self):
    """Happy path with two hosts of two slots each: slot info is stable once a worker is ready."""
    discovery = FixedHosts({'host-1': 2, 'host-2': 2})
    driver = ElasticDriver(mock.Mock(), discovery, min_np=2, max_np=4)
    driver.wait_for_available_slots(min_np=2)

    # Maps the initially assigned rank to (slot info at launch, slot info after record_ready).
    rank_results = {}

    def exec_command(slot_info, events):
        host = slot_info.hostname
        local_rank = slot_info.local_rank
        driver.record_ready(host, local_rank)
        refreshed = driver.get_slot_info(host, local_rank)
        rank_results[slot_info.rank] = (slot_info, refreshed)
        return 0, time.time()

    driver.start(np=2, create_worker_fn=exec_command)
    res = driver.get_results()
    driver.stop()

    # One result per slot, and every worker must have exited cleanly.
    assert len(res) == 4
    for worker_name, outcome in res.items():
        code, _ = outcome
        assert code == 0, worker_name

    # The slot info seen by the worker must match what the driver reports afterwards.
    assert len(rank_results) == 4
    for rank, (launched, refreshed) in rank_results.items():
        assert launched.to_response_string() == refreshed.to_response_string(), rank
def test_shutdown_on_success(self):
    """Checks that a shutdown event fires when one worker succeeds while the rest are still running."""
    discovery = FixedHosts({'host-1': 2, 'host-2': 2})
    driver = ElasticDriver(mock.Mock(), discovery, min_np=2, max_np=4)
    driver.wait_for_available_slots(min_np=2)

    def exec_command(slot_info, events):
        # Rank 0 finishes successfully right away, without ever reporting ready.
        if slot_info.rank == 0:
            return 0, time.time()

        # Every other worker blocks on its events and then reports a failure.
        driver.record_ready(slot_info.hostname, slot_info.local_rank)
        wait_for_one(events)
        return 1, time.time()

    driver.start(np=2, create_worker_fn=exec_command)
    res = driver.get_results()
    driver.stop()

    assert len(res) == 4

    # One worker returned 0 and the remaining three returned 1.
    exit_code_sum = sum(code for code, _ in res.values())
    assert exit_code_sum == 3
def test_rank_and_size_with_host_failure(self):
    """Two hosts with two slots each, where the second host fails before rendezvous completes."""
    discovery = FixedHosts({'host-1': 2, 'host-2': 2})
    driver = ElasticDriver(mock.Mock(), discovery, min_np=2, max_np=4)
    driver.wait_for_available_slots(min_np=2)

    # Maps the initially assigned rank to (slot info at launch, slot info after record_ready).
    rank_results = {}

    def exec_command(slot_info, events):
        host = slot_info.hostname
        # Workers on host-2 fail immediately, before reporting ready.
        if host == 'host-2':
            return 1, time.time()

        local_rank = slot_info.local_rank
        driver.record_ready(host, local_rank)
        refreshed = driver.get_slot_info(host, local_rank)
        rank_results[slot_info.rank] = (slot_info, refreshed)
        return 0, time.time()

    driver.start(np=2, create_worker_fn=exec_command)
    res = driver.get_results()
    driver.stop()

    assert len(res) == 2
    for worker_name, outcome in res.items():
        code, _ = outcome
        assert code == 0, worker_name

    # Only the two host-1 workers recorded results; they form a single-host job of size 2.
    assert len(rank_results) == 2
    for rank, (launched, refreshed) in rank_results.items():
        assert refreshed.size == 2, rank
        assert refreshed.rank == launched.rank % 2, rank
        assert refreshed.local_size == launched.local_size, rank
        assert refreshed.local_rank == launched.local_rank, rank
        assert refreshed.cross_size == 1, rank
        assert refreshed.cross_rank == 0, rank
def launch_gloo_elastic(command, exec_command, settings, env, get_common_interfaces, rendezvous):
    """Launch an elastic Gloo job and block until the driver reports results.

    Args:
        command: Command forwarded to ``get_run_command`` to build the worker run command.
        exec_command: Callable forwarded to ``_create_elastic_worker_fn``.
        settings: Settings object; this function reads ``output_filename``, ``discovery``,
            ``min_np``, ``max_np``, ``elastic_timeout``, ``reset_limit``, ``verbose`` and
            ``num_proc`` from it.
        env: Environment forwarded to ``_create_elastic_worker_fn``.
        get_common_interfaces: Callable invoked with the driver; its result is used as the
            set of network interfaces.
        rendezvous: Rendezvous server exposing ``start(handler)`` (returns the port) and
            ``stop()``.

    Raises:
        RuntimeError: If the driver results carry an error message, or if any worker exited
            with a non-zero code (the worker with the smallest timestamp is reported).
    """
    # Make the output directory if it does not exist
    if settings.output_filename:
        _mkdir_p(settings.output_filename)

    driver = ElasticDriver(rendezvous, settings.discovery,
                           settings.min_np, settings.max_np,
                           timeout=settings.elastic_timeout,
                           reset_limit=settings.reset_limit,
                           verbose=settings.verbose)

    handler = create_rendezvous_handler(driver)
    global_rendezv_port = rendezvous.start(handler)

    # Once the rendezvous server is up, always stop the driver and then the server (same order
    # as on success), even if slot discovery, interface detection or the run itself raises.
    try:
        try:
            driver.wait_for_available_slots(settings.num_proc)

            nics = get_common_interfaces(driver)
            server_ip = network.get_driver_ip(nics)

            event = register_shutdown_event()
            run_command = get_run_command(command, server_ip, nics, global_rendezv_port, elastic=True)

            create_worker = _create_elastic_worker_fn(exec_command, run_command, env, event)

            driver.start(settings.num_proc, create_worker)
            res = driver.get_results()
        finally:
            driver.stop()
    finally:
        rendezvous.stop()

    if res.error_message is not None:
        raise RuntimeError(res.error_message)

    # Walk the workers in timestamp order so the first failure to occur is the one reported.
    for name, value in sorted(res.worker_results.items(), key=lambda item: item[1][1]):
        exit_code, timestamp = value
        if exit_code != 0:
            raise RuntimeError(
                'Horovod detected that one or more processes exited with non-zero '
                'status, thus causing the job to be terminated. The first process '
                'to do so was:\nProcess name: {name}\nExit code: {code}\n'.
                format(name=name, code=exit_code))
def test_send_notifications_without_assignments(self, mock_get_worker_client, mock_get_coordinator_info,
                                                mock_host_assignments):
    """Notifications must still be sent correctly when host assignments cannot be generated."""
    # Successive discovery results handed out by the mocked discovery object.
    discovery_rounds = [
        {'host-1': 8, 'host-2': 4},
        {'host-1': 8, 'host-2': 4},
        {'host-2': 4},
        {'host-2': 4},
        {'host-2': 4, 'host-3': 12},
    ]
    discovery = mock.Mock()
    discovery.find_available_hosts_and_slots.side_effect = sequence(discovery_rounds)

    driver = ElasticDriver(mock.Mock(), discovery, min_np=8, max_np=12)
    driver.wait_for_available_slots(min_np=16)
    driver.stop()

    # The slot count dips below the minimum on the second change, yet workers have to be
    # notified on every change, so three calls are expected overall.
    expected_calls = 3
    assert mock_get_worker_client.call_count == expected_calls
    assert mock_get_coordinator_info.call_count == expected_calls
def test_wait_for_min_hosts(self):
    """Tests that driver blocks until the min number of hosts and slots are available."""
    slots = [{'host-1': 4},
             {'host-1': 4, 'host-2': 8},
             {'host-1': 4, 'host-2': 8, 'host-3': 4}]
    mock_discovery = mock.Mock()
    mock_discovery.find_available_hosts_and_slots.side_effect = sequence(slots)

    driver = ElasticDriver(mock.Mock(), mock_discovery, min_np=2, max_np=12)
    # Stop the driver even when the wait or the assertion fails, so a failing test does not
    # leave a running driver behind.
    try:
        driver.wait_for_available_slots(min_np=2, min_hosts=2)

        # Even though we only needed 2 slots, because we also needed 2 hosts, we will have at
        # least 12 slots total
        assert driver._host_manager.current_hosts.count_available_slots() >= 12
    finally:
        driver.stop()
def test_wait_for_available_slots(self, mock_get_worker_client, mock_get_coordinator_info):
    """Tests that driver blocks until the min number of slots are available."""
    slots = [{'host-1': 4},
             {'host-1': 4, 'host-2': 8},
             {'host-1': 4, 'host-2': 8, 'host-3': 4}]
    mock_discovery = mock.Mock()
    mock_discovery.find_available_hosts_and_slots.side_effect = sequence(slots)

    driver = ElasticDriver(mock.Mock(), mock_discovery, min_np=8, max_np=20)
    # Stop the driver even when the wait or the slot-count assertion fails, so a failing test
    # does not leave a running driver behind.
    try:
        driver.wait_for_available_slots(min_np=16)
        assert driver._host_manager.current_hosts.count_available_slots() >= 16
    finally:
        driver.stop()

    # Notify coordinator 2 times, as the first time we are below min_np and the existing host
    # assignments are empty
    assert mock_get_worker_client.call_count == 2
    assert mock_get_coordinator_info.call_count == 2
def test_rank_and_size_with_host_added(self):
    """Training begins on a single host with two slots, after which a second host joins."""
    discovery = FixedHosts({'host-1': 2})

    def add_host():
        discovery.set({'host-1': 2, 'host-2': 2})

    driver = ElasticDriver(mock.Mock(), discovery, min_np=2, max_np=4)
    driver.wait_for_available_slots(min_np=2)

    # Maps the initially assigned rank to (slot info at launch, slot info after the final record_ready).
    rank_results = {}

    def exec_command(slot_info, events):
        host = slot_info.hostname
        local_rank = slot_info.local_rank

        driver.record_ready(host, local_rank)

        if host == 'host-1':
            # Rank 0 triggers the host addition; both host-1 workers then wait for 4 slots
            # and report ready again.
            if slot_info.rank == 0:
                add_host()
            driver.wait_for_available_slots(4)
            driver.record_ready(host, local_rank)

        driver.record_ready(host, local_rank)
        refreshed = driver.get_slot_info(host, local_rank)
        rank_results[slot_info.rank] = (slot_info, refreshed)
        return 0, time.time()

    driver.start(np=2, create_worker_fn=exec_command)
    res = driver.get_results()
    driver.stop()

    assert len(res) == 4
    for worker_name, outcome in res.items():
        code, _ = outcome
        assert code == 0, worker_name

    # All four workers end up in one job of size 4 spread across two hosts.
    assert len(rank_results) == 4
    for rank, (launched, refreshed) in rank_results.items():
        assert refreshed.size == 4, rank
        assert refreshed.rank == launched.rank, rank
        assert refreshed.local_size == launched.local_size, rank
        assert refreshed.local_rank == launched.local_rank, rank
        assert refreshed.cross_size == 2, rank
        assert refreshed.cross_rank == launched.cross_rank, rank
def test_shutdown_on_initial_discovery_failure(self):
    """The driver must shut down immediately when the initial host discovery fails."""
    failing_discovery = mock.Mock()
    failing_discovery.find_available_hosts_and_slots.side_effect = RuntimeError()

    original_discover_hosts = ElasticDriver._discover_hosts

    def quiet_discover_hosts(obj):
        # Swallow the RuntimeError raised in the background discovery thread so it does not
        # clutter the unit test output.
        try:
            original_discover_hosts(obj)
        except RuntimeError:
            pass

    # patch.object puts the original ``_discover_hosts`` back when the block exits, on
    # success and on failure alike.
    with mock.patch.object(ElasticDriver, '_discover_hosts', quiet_discover_hosts):
        driver = ElasticDriver(mock.Mock(), failing_discovery, min_np=2, max_np=4)
        with pytest.raises(RuntimeError):
            driver.wait_for_available_slots(min_np=2)
        assert driver.finished()
def test_all_workers_fail(self):
    """Training is reported as failed when every worker exits with a failure."""
    discovery = FixedHosts({'host-1': 2, 'host-2': 2})
    driver = ElasticDriver(mock.Mock(), discovery, min_np=2, max_np=4)
    driver.wait_for_available_slots(min_np=2)

    def exec_command(slot_info, events):
        # Each worker reports ready and then fails with exit code 1.
        driver.record_ready(slot_info.hostname, slot_info.local_rank)
        return 1, time.time()

    driver.start(np=2, create_worker_fn=exec_command)
    res = driver.get_results()
    driver.stop()

    assert len(res) == 4
    for worker_name, outcome in res.items():
        code, _ = outcome
        assert code == 1, worker_name
def gloo_run_elastic(settings, env, command):
    """Run an elastic Gloo job with its own rendezvous server and block until it finishes.

    Args:
        settings: Settings object; this function reads ``output_filename``, ``verbose``,
            ``discovery``, ``min_np``, ``max_np``, ``elastic_timeout`` and ``num_proc`` from
            it and also forwards it to several helpers.
        env: Environment forwarded to ``_create_elastic_worker_fn``.
        command: Command forwarded to ``get_run_command`` to build the worker run command.

    Raises:
        RuntimeError: If any worker exited with a non-zero code (the worker with the smallest
            timestamp is reported).
    """
    # Make the output directory if it does not exist
    if settings.output_filename:
        _mkdir_p(settings.output_filename)

    rendezvous = RendezvousServer(settings.verbose)
    driver = ElasticDriver(rendezvous, settings.discovery,
                           settings.min_np, settings.max_np,
                           timeout=settings.elastic_timeout,
                           verbose=settings.verbose)

    handler = create_rendezvous_handler(driver)
    global_rendezv_port = rendezvous.start_server(handler)

    # Once the rendezvous server is up, always stop the driver and then the server (same order
    # as on success), even if slot discovery, interface detection or the run itself raises.
    try:
        try:
            # Host-to-host common interface detection requires at least 2 hosts in an elastic job.
            min_hosts = _get_min_start_hosts(settings)
            current_hosts = driver.wait_for_available_slots(settings.num_proc, min_hosts=min_hosts)

            nics = driver_service.get_common_interfaces(settings, current_hosts.host_assignment_order)
            server_ip = network.get_driver_ip(nics)

            exec_command = _exec_command_fn(settings)
            event = register_shutdown_event()
            run_command = get_run_command(command, server_ip, nics, global_rendezv_port, elastic=True)

            create_worker = _create_elastic_worker_fn(exec_command, run_command, env, event)

            driver.start(settings.num_proc, create_worker)
            res = driver.get_results()
        finally:
            driver.stop()
    finally:
        rendezvous.stop_server()

    # Walk the workers in timestamp order so the first failure to occur is the one reported.
    for name, value in sorted(res.items(), key=lambda item: item[1][1]):
        exit_code, timestamp = value
        if exit_code != 0:
            raise RuntimeError(
                'Horovod detected that one or more processes exited with non-zero '
                'status, thus causing the job to be terminated. The first process '
                'to do so was:\nProcess name: {name}\nExit code: {code}\n'.
                format(name=name, code=exit_code))
def test_worker_notification_manager(self):
    """Tests that host add events are sent to the worker notification service and consumed."""
    slots = {'host-1': 2}
    discovery = FixedHosts(slots)

    rendezvous = RendezvousServer()
    driver = ElasticDriver(rendezvous, discovery, min_np=2, max_np=4)
    driver.wait_for_available_slots(min_np=2)
    handler = create_rendezvous_handler(driver)

    common_intfs = network.get_local_intfs()
    addr = network.get_driver_ip(common_intfs)
    port = rendezvous.start_server(handler)

    # The server is running from here on: stop it in ``finally`` so that a failing assertion
    # (or any other error) does not leave it running.
    try:
        nic = list(common_intfs)[0]

        # Maps each worker's rank to the list of notification timestamps it received.
        rank_results = {}

        class NotificationReceiver:
            """Listener that records the timestamp of every hosts-updated notification."""

            def __init__(self):
                self.events = []

            def on_hosts_updated(self, timestamp):
                self.events.append(timestamp)

        def add_host():
            slots = {'host-1': 2, 'host-2': 2}
            discovery.set(slots)

        def remove_host():
            slots = {'host-2': 2}
            discovery.set(slots)

        def exec_command(slot_info, events):
            manager = WorkerNotificationManager()
            manager.init(rendezvous_addr=addr,
                         rendezvous_port=port,
                         nic=nic,
                         hostname=slot_info.hostname,
                         local_rank=slot_info.local_rank)

            notification_receiver = NotificationReceiver()
            manager.register_listener(notification_receiver)

            driver.record_ready(slot_info.hostname, slot_info.local_rank)

            # Rank 0 drives the host changes; every worker waits for the added slots.
            if slot_info.rank == 0:
                add_host()
            driver.wait_for_available_slots(4)

            if slot_info.rank == 0:
                remove_host()

            # Busy wait for the number of available slots to decrease
            while driver._host_manager.current_hosts.count_available_slots() > 2:
                time.sleep(0.01)

            rank_results[slot_info.rank] = notification_receiver.events
            return 0, time.time()

        driver.start(np=2, create_worker_fn=exec_command)
        res = driver.get_results()
        driver.stop()

        assert len(res) == 2
        for name, (exit_code, timestamp) in res.items():
            assert exit_code == 0, name

        # Rank 0 is expected to receive two notifications, the other rank none.
        assert len(rank_results) == 2
        for rank, timestamps in rank_results.items():
            expected = 2 if rank == 0 else 0
            assert len(timestamps) == expected, rank
    finally:
        rendezvous.stop_server()