def _task_fn(index, num_hosts, driver_addresses, settings):
    """
    Run the task-side half of the interface-discovery handshake for one host.

    A task service is started for ``index`` and registered with the driver.
    The task then contacts the next task in the ring
    (``(index + 1) % num_hosts``) with interface matching enabled, reports
    the addresses it could reach back to the driver, tells that neighbour the
    check is done, and finally waits for the equivalent signal from the
    previous task. The local task service is shut down on the way out,
    whether or not the handshake succeeded.

    :param index: index of this task; also used to pick the ring neighbour.
    :param num_hosts: number of hosts in the ring (modulus for the neighbour
        index).
    :param driver_addresses: addresses passed to ``HorovodRunDriverClient``.
    :param settings: object providing ``key``, ``nics``, ``verbose`` and
        ``start_timeout``.
    """
    local_service = task_service.HorovodRunTaskService(
        index, settings.key, settings.nics)
    try:
        driver_client = driver_service.HorovodRunDriverClient(
            driver_addresses, settings.key, settings.verbose)

        # Announce this task (addresses + host hash) to the driver, then
        # block until the driver confirms initial registration is complete.
        own_addresses = local_service.addresses()
        own_host_hash = host_hash.host_hash()
        driver_client.register_task(index, own_addresses, own_host_hash)
        local_service.wait_for_initial_registration(settings.start_timeout)

        # Ring topology: every task pings its successor so that, taken
        # together, the interfaces reachable within the cluster are found.
        neighbour_index = (index + 1) % num_hosts
        neighbour_addresses = driver_client.all_task_addresses(
            neighbour_index)

        # Interface matching is requested to weed out NAT'ed interfaces.
        neighbour = task_service.HorovodRunTaskClient(
            neighbour_index,
            neighbour_addresses,
            settings.key,
            settings.verbose,
            match_intf=True,
            attempts=10)
        reachable_addresses = neighbour.addresses()
        driver_client.register_task_to_task_addresses(
            neighbour_index, reachable_addresses)

        # Let the successor know that its addresses have been checked ...
        neighbour.task_to_task_address_check_completed()

        # ... and wait for the predecessor to tell us the same about ours.
        local_service.wait_for_task_to_task_address_check_finish_signal(
            settings.start_timeout)
    finally:
        local_service.shutdown()
def _find_common_network_interface(host_to_size, host_rank_to_id, workers,
                                   settings):
    """
    Start a driver service, launch task servers on the workers and return
    the interfaces that are common to every host's task-to-task addresses.

    The driver service is always shut down before this function returns or
    raises, including when launching the task servers fails.

    :param host_to_size: iterable of host names (iterating a mapping yields
        its keys); only the names are used here.
    :param host_rank_to_id: forwarded unchanged to ``_launch_task_servers``.
    :param workers: forwarded unchanged to ``_launch_task_servers``.
    :param settings: object providing ``key``, ``nic``, ``verbose``,
        ``timeout`` and ``num_hosts``.
    :return: non-empty set of keys shared by
        ``driver.task_addresses_for_tasks(i)`` for every host index ``i``.
    :rtype: set
    :raises Exception: if the hosts have no interface in common.
    """
    all_host_names = list(host_to_size)
    # NOTE(review): the driver is sized with len(all_host_names) while the
    # loops below use settings.num_hosts -- presumably these are equal;
    # confirm against the caller.
    driver = driver_service.HorovodRunDriverService(len(all_host_names),
                                                    settings.key,
                                                    settings.nic)

    # Everything after the driver is created runs under try/finally so the
    # driver service is shut down even if launching the task servers raises.
    try:
        _launch_task_servers(all_host_names, host_rank_to_id,
                             driver.addresses(), settings, workers)

        # Wait for all the hosts to register with the driver service.
        if settings.verbose >= 2:
            print('Waiting for the hosts to acknowledge.')
        driver.wait_for_initial_registration(settings.timeout)
        tasks = [
            task_service.HorovodRunTaskClient(
                index,
                driver.task_addresses_for_driver(index),
                settings.key,
                settings.verbose)
            for index in range(settings.num_hosts)
        ]

        # Notify all the tasks that the initial registration is complete.
        for task in tasks:
            task.notify_initial_registration_complete()
        if settings.verbose >= 2:
            print('Notified all the hosts that the registration is complete.')

        # Each worker should probe the interfaces of the next worker in a
        # ring manner and filter only the routed ones -- it should filter out
        # interfaces that are not really connected to any external networks
        # such as lo0 with address 127.0.0.1.
        if settings.verbose >= 2:
            print('Waiting for hosts to perform host-to-host '
                  'interface checking.')
        driver.wait_for_task_to_task_address_updates(settings.timeout)
        if settings.verbose >= 2:
            print('Host-to-host interface checking successful.')

        # Determine a set of common interfaces for task-to-task
        # communication: start from host 0 and intersect with the others.
        common_intfs = set(driver.task_addresses_for_tasks(0).keys())
        for index in range(1, settings.num_hosts):
            common_intfs.intersection_update(
                driver.task_addresses_for_tasks(index).keys())
        if not common_intfs:
            raise Exception(
                'Unable to find a set of common task-to-task communication '
                'interfaces: %s'
                % [(index, driver.task_addresses_for_tasks(index))
                   for index in range(settings.num_hosts)])
        return common_intfs
    finally:
        driver.shutdown()
def _driver_fn(all_host_names, local_host_names, settings):
    """
    Launch the driver service, launch the task service on each worker and
    have them register with the driver service. Each worker probes all the
    interfaces of the worker index + 1 (in a ring manner) and only keeps the
    routed interfaces. The function returns the intersection of the sets of
    routed interfaces across all the workers.

    The driver service is always shut down before this function returns or
    raises, including when launching the task servers fails.

    :param all_host_names: list of addresses. for example,
        ['worker-0','worker-1']
        ['10.11.11.11', '10.11.11.12']
    :type all_host_names: list(string)
    :param local_host_names: host names that resolve into a local addresses.
    :type local_host_names: set
    :param settings: the object that contains the setting for running horovod
    :type settings: Horovod.run.common.util.settings.Settings
    :return: the interfaces common to all hosts, for example:
        {'eth0', 'eth1'}
    :rtype: set
    :raises Exception: if the hosts have no interface in common.
    """
    # Launch a TCP server called driver service on the host running
    # horovodrun.
    driver = driver_service.HorovodRunDriverService(settings.num_hosts,
                                                    settings.key,
                                                    settings.nic)

    # Everything after the driver is created runs under try/finally so the
    # driver service is shut down even if launching the task servers raises.
    try:
        if settings.verbose >= 2:
            print('Launched horovodrun server.')

        # Have all the workers register themselves with the driver service.
        _launch_task_servers(all_host_names, local_host_names,
                             driver.addresses(), settings)
        if settings.verbose >= 2:
            print('Attempted to launch horovod task servers.')

        # Wait for all the hosts to register with the driver service.
        if settings.verbose >= 2:
            print('Waiting for the hosts to acknowledge.')
        driver.wait_for_initial_registration(settings.timeout)
        tasks = [
            task_service.HorovodRunTaskClient(
                index,
                driver.task_addresses_for_driver(index),
                settings.key,
                settings.verbose)
            for index in range(settings.num_hosts)
        ]

        # Notify all the tasks that the initial registration is complete.
        for task in tasks:
            task.notify_initial_registration_complete()
        if settings.verbose >= 2:
            print('Notified all the hosts that the registration is complete.')

        # Each worker should probe the interfaces of the next worker in a
        # ring manner and filter only the routed ones -- it should filter out
        # interfaces that are not really connected to any external networks
        # such as lo0 with address 127.0.0.1.
        if settings.verbose >= 2:
            print('Waiting for hosts to perform host-to-host '
                  'interface checking.')
        driver.wait_for_task_to_task_address_updates(settings.timeout)
        if settings.verbose >= 2:
            print('Host-to-host interface checking successful.')

        # Determine a set of common interfaces for task-to-task
        # communication: start from host 0 and intersect with the others.
        common_intfs = set(driver.task_addresses_for_tasks(0).keys())
        for index in range(1, settings.num_hosts):
            common_intfs.intersection_update(
                driver.task_addresses_for_tasks(index).keys())
        if not common_intfs:
            raise Exception(
                'Unable to find a set of common task-to-task communication '
                'interfaces: %s'
                % [(index, driver.task_addresses_for_tasks(index))
                   for index in range(settings.num_hosts)])
        return common_intfs
    finally:
        driver.shutdown()