Пример #1
0
def _launch_job(args, remote_host_names, settings, common_intfs, command):
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)
    driver_ip = _get_driver_ip(common_intfs)

    if args.use_gloo:
        if not gloo_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'Gloo support has not been built.  If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
            )
        gloo_run(settings, remote_host_names, common_intfs, env, driver_ip,
                 command)
    elif args.use_mpi:
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built.  If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        mpi_run(settings, common_intfs, env, command)
    else:
        if mpi_built(verbose=(settings.verbose >= 2)):
            mpi_run(settings, common_intfs, env, command)
        elif gloo_built(verbose=(settings.verbose >= 2)):
            gloo_run(settings, remote_host_names, common_intfs, env, driver_ip,
                     command)
        else:
            raise ValueError(
                'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                'either MPI is installed (MPI) or CMake is installed (Gloo).')
Пример #2
0
    def _run(self,
             discovery_schedule=None,
             exit_schedule=None,
             exit_mode='exception',
             np=2,
             min_np=2,
             max_np=4,
             hosts=None):
        if not discovery_schedule and not hosts:
            raise ValueError(
                'at least one of discovery schedule or hosts must be given')

        with temppath() as logfile:
            with _temp_discovery_script(logfile, discovery_schedule or [(None, hosts.split(','))]) \
                    as discovery_script:
                command_args = [
                    'horovodrun', '-np',
                    str(np), '--min-np',
                    str(min_np), '--log-level', 'DEBUG'
                ]
                if hosts is not None:
                    command_args += ['-H', hosts]
                else:
                    command_args += [
                        '--host-discovery-script', discovery_script,
                        '--max-np',
                        str(max_np)
                    ]

                command_args += [
                    'python', self._training_script, '--logfile', logfile
                ]
                if discovery_schedule:
                    command_args += [
                        '--discovery-schedule',
                        json.dumps(discovery_schedule)
                    ]
                if exit_schedule:
                    command_args += [
                        '--exit-schedule',
                        json.dumps(exit_schedule), '--exit-mode', exit_mode
                    ]
                print(' '.join(command_args))

                with override_args(*command_args):
                    args = parse_args()
                    env = {}
                    config_parser.set_env_from_args(env, args)
                    _run_elastic(args)

                    with open(logfile, 'r') as f:
                        lines = f.readlines()

                    print('logfile:')
                    for line in lines:
                        print(line)

                    return [json.loads(line) for line in lines]
Пример #3
0
    def test_logging_args(self):
        with override_args('horovodrun', '-np', '2', '--log-level', 'INFO',
                           '--log-hide-timestamp'):
            args = parse_args()
            env = {}
            config_parser.set_env_from_args(env, args)

            self.assertEqual(env.get(config_parser.HOROVOD_LOG_LEVEL), 'INFO')
            self.assertEqual(env.get(config_parser.HOROVOD_LOG_HIDE_TIME), '1')
Пример #4
0
    def test_timeline_args(self):
        with override_args('horovodrun', '-np', '2',
                           '--timeline-filename', '/tmp/timeline.json',
                           '--timeline-mark-cycles'):
            args = parse_args()
            env = {}
            config_parser.set_env_from_args(env, args)

            self.assertEqual(env.get(config_parser.HOROVOD_TIMELINE), '/tmp/timeline.json')
            self.assertEqual(env.get(config_parser.HOROVOD_TIMELINE_MARK_CYCLES), '1')
Пример #5
0
    def test_library_args(self):
        with override_args('horovodrun', '-np', '2', '--mpi-threads-disable',
                           '--num-nccl-streams', '2', '--mlsl-bgt-affinity',
                           '1'):
            args = run.parse_args()
            env = {}
            config_parser.set_env_from_args(env, args)

            self.assertEqual(env[config_parser.HOROVOD_MPI_THREADS_DISABLE],
                             '1')
            self.assertEqual(env[config_parser.HOROVOD_NUM_NCCL_STREAMS], '2')
            self.assertEqual(env[config_parser.HOROVOD_MLSL_BGT_AFFINITY], '1')
Пример #6
0
def _run_elastic(args):
    # construct host discovery component
    if args.host_discovery_script:
        discover_hosts = discovery.HostDiscoveryScript(
            args.host_discovery_script, args.slots)
    elif args.hosts:
        _, available_host_slots = hosts.parse_hosts_and_slots(args.hosts)
        if len(available_host_slots) < 2:
            raise ValueError(
                'Cannot run in fault tolerance mode with fewer than 2 hosts.')
        discover_hosts = discovery.FixedHosts(available_host_slots)
    else:
        raise ValueError(
            'One of --host-discovery-script, --hosts, or --hostnames must be provided'
        )

    # horovodrun has to finish all the checks before this timeout runs out.
    if args.start_timeout:
        start_timeout = args.start_timeout
    else:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please '
                            'check connectivity between servers. You '
                            'may need to increase the --start-timeout '
                            'parameter if you have too many servers.')
    settings = elastic_settings.ElasticSettings(
        discovery=discover_hosts,
        min_np=args.min_np or args.np,
        max_np=args.max_np,
        elastic_timeout=args.elastic_timeout,
        reset_limit=args.reset_limit,
        num_proc=args.np,
        verbose=2 if args.verbose else 0,
        ssh_port=args.ssh_port,
        extra_mpi_args=args.mpi_args,
        key=secret.make_secret_key(),
        start_timeout=tmout,
        output_filename=args.output_filename,
        run_func_mode=args.run_func is not None,
        nics=args.nics)

    if not gloo_built(verbose=(settings.verbose >= 2)):
        raise ValueError(
            'Gloo support is required to use elastic training, but has not been built.  Ensure CMake is '
            'installed and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
        )

    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)
    gloo_run_elastic(settings, env, args.command)
Пример #7
0
    def test_autotuning_with_fixed_param(self):
        with override_args('horovodrun', '-np', '2',
                           '--autotune',
                           '--cache-capacity', '1024',
                           '--no-hierarchical-allgather'):
            args = parse_args()
            env = {}
            config_parser.set_env_from_args(env, args)

            self.assertNotIn(config_parser.HOROVOD_FUSION_THRESHOLD, env)
            self.assertNotIn(config_parser.HOROVOD_CYCLE_TIME, env)
            self.assertEqual(env.get(config_parser.HOROVOD_CACHE_CAPACITY), '1024')
            self.assertNotIn(config_parser.HOROVOD_HIERARCHICAL_ALLREDUCE, env)
            self.assertEqual(env.get(config_parser.HOROVOD_HIERARCHICAL_ALLGATHER), '0')
Пример #8
0
    def test_library_args(self):
        with override_args('horovodrun', '-np', '2',
                           '--mpi-threads-disable',
                           '--num-nccl-streams', '2',
                           '--ccl-bgt-affinity', '1',
                           '--gloo-timeout-seconds', '60'):
            args = parse_args()
            env = {}
            config_parser.set_env_from_args(env, args)

            self.assertEqual(env.get(config_parser.HOROVOD_MPI_THREADS_DISABLE), '1')
            self.assertEqual(env.get(config_parser.HOROVOD_NUM_NCCL_STREAMS), '2')
            self.assertEqual(env.get(config_parser.HOROVOD_CCL_BGT_AFFINITY), '1')
            self.assertEqual(env.get(config_parser.HOROVOD_GLOO_TIMEOUT_SECONDS), '60')
Пример #9
0
def _launch_job(args, settings, nics, command):
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)

    def gloo_run_fn():
        driver_ip = network.get_driver_ip(nics)
        gloo_run(settings, nics, env, driver_ip, command)

    def mpi_run_fn():
        mpi_run(settings, nics, env, command)

    def js_run_fn():
        js_run(settings, nics, env, command)

    run_controller(args.use_gloo, gloo_run_fn, args.use_mpi, mpi_run_fn,
                   args.use_jsrun, js_run_fn, args.verbose)
Пример #10
0
    def test_params_args(self):
        with override_args('horovodrun', '-np', '2',
                           '--fusion-threshold-mb', '10',
                           '--cycle-time-ms', '20',
                           '--cache-capacity', '512',
                           '--hierarchical-allreduce',
                           '--hierarchical-allgather'):
            args = parse_args()
            env = {}
            config_parser.set_env_from_args(env, args)

            self.assertEqual(env.get(config_parser.HOROVOD_FUSION_THRESHOLD), str(10 * 1024 * 1024))
            self.assertEqual(env.get(config_parser.HOROVOD_CYCLE_TIME), '20.0')
            self.assertEqual(env.get(config_parser.HOROVOD_CACHE_CAPACITY), '512')
            self.assertEqual(env.get(config_parser.HOROVOD_HIERARCHICAL_ALLREDUCE), '1')
            self.assertEqual(env.get(config_parser.HOROVOD_HIERARCHICAL_ALLGATHER), '1')
Пример #11
0
    def test_autotune_args(self):
        with override_args('horovodrun', '-np', '2',
                           '--autotune',
                           '--autotune-log-file', '/tmp/autotune.txt',
                           '--autotune-warmup-samples', '1',
                           '--autotune-steps-per-sample', '5',
                           '--autotune-bayes-opt-max-samples', '10',
                           '--autotune-gaussian-process-noise', '0.2'):
            args = parse_args()
            env = {}
            config_parser.set_env_from_args(env, args)

            self.assertEqual(env.get(config_parser.HOROVOD_AUTOTUNE), '1')
            self.assertEqual(env.get(config_parser.HOROVOD_AUTOTUNE_LOG), '/tmp/autotune.txt')
            self.assertEqual(env.get(config_parser.HOROVOD_AUTOTUNE_WARMUP_SAMPLES), '1')
            self.assertEqual(env.get(config_parser.HOROVOD_AUTOTUNE_STEPS_PER_SAMPLE), '5')
            self.assertEqual(env.get(config_parser.HOROVOD_AUTOTUNE_BAYES_OPT_MAX_SAMPLES), '10')
            self.assertEqual(env.get(config_parser.HOROVOD_AUTOTUNE_GAUSSIAN_PROCESS_NOISE), '0.2')
Пример #12
0
    def test_stall_check_args(self):
        with override_args('horovodrun', '-np', '2',
                           '--no-stall-check'):
            args = parse_args()
            env = {}
            config_parser.set_env_from_args(env, args)

            self.assertEqual(env.get(config_parser.HOROVOD_STALL_CHECK_DISABLE), '1')

        with override_args('horovodrun', '-np', '2',
                           '--stall-check-warning-time-seconds', '10',
                           '--stall-check-shutdown-time-seconds', '20'):
            args = parse_args()
            env = {}
            config_parser.set_env_from_args(env, args)

            self.assertNotIn(config_parser.HOROVOD_STALL_CHECK_DISABLE, env)
            self.assertEqual(env.get(config_parser.HOROVOD_STALL_CHECK_TIME_SECONDS), '10')
            self.assertEqual(env.get(config_parser.HOROVOD_STALL_SHUTDOWN_TIME_SECONDS), '20')
Пример #13
0
def _launch_job(args, remote_host_names, settings, nics, command):
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)

    if args.use_gloo:
        if not gloo_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'Gloo support has not been built.  If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
            )
        gloo_run(settings, remote_host_names, nics, env,
                 network._get_driver_ip(nics), command)
    elif args.use_mpi:
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built.  If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        mpi_run(settings, nics, env, command)
    elif args.use_jsrun:
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built.  If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        if not lsf.LSFUtils.using_lsf():
            raise ValueError(
                'Horovod did not detect an LSF job.  The jsrun launcher can only be used in that environment. '
                'Please, pick a different launcher for other environments.')
        js_run(settings, nics, env, command)
    else:
        if mpi_built(verbose=(settings.verbose >= 2)):
            if lsf.LSFUtils.using_lsf() and is_jsrun_installed():
                js_run(settings, nics, env, command)
            else:
                mpi_run(settings, nics, env, command)
        elif gloo_built(verbose=(settings.verbose >= 2)):
            gloo_run(settings, remote_host_names, nics, env,
                     network._get_driver_ip(nics), command)
        else:
            raise ValueError(
                'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                'either MPI is installed (MPI) or CMake is installed (Gloo).')
Пример #14
0
def run():
    args = parse_args()

    if args.check_build:
        check_build(args.verbose)

    # if hosts are not specified, either parse from hostfile, or default as
    # localhost
    if not args.hosts:
        if args.hostfile:
            args.hosts = parse_host_files(args.hostfile)
        else:
            # Set hosts to localhost if not specified
            args.hosts = 'localhost:{np}'.format(np=args.np)

    host_list = args.hosts.split(',')
    all_host_names = []
    pattern = re.compile(r'^[\w.-]+:\d+$')
    for host in host_list:
        if not pattern.match(host.strip()):
            raise ValueError('Invalid host input, please make sure it has '
                             'format as : worker-0:2,worker-1:2.')
        all_host_names.append(host.strip().split(':')[0])

    # horovodrun has to finish all the checks before this timeout runs out.
    if args.start_timeout:
        start_timeout = args.start_timeout
    else:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please '
                            'check connectivity between servers. You '
                            'may need to increase the --start-timeout '
                            'parameter if you have too many servers.')
    settings = hvd_settings.Settings(verbose=2 if args.verbose else 0,
                                     ssh_port=args.ssh_port,
                                     key=secret.make_secret_key(),
                                     timeout=tmout,
                                     num_hosts=len(all_host_names),
                                     num_proc=args.np,
                                     hosts=args.hosts,
                                     command=args.command)

    # This cache stores the results of checks performed by horovodrun
    # during the initialization step. It can be disabled by setting
    # --disable-cache flag.
    fn_cache = None
    if not args.disable_cache:
        params = ''
        if args.np:
            params += str(args.np) + ' '
        if args.hosts:
            params += str(args.hosts) + ' '
        if args.ssh_port:
            params += str(args.ssh_port)
        parameters_hash = hashlib.md5(params.encode('utf-8')).hexdigest()
        fn_cache = cache.Cache(CACHE_FOLDER, CACHE_STALENESS_THRESHOLD_MINUTES,
                               parameters_hash)

    if settings.verbose >= 2:
        print('Filtering local host names.')
    remote_host_names = network.filter_local_addresses(all_host_names)
    if settings.verbose >= 2:
        print('Remote host found: ' + ' '.join(remote_host_names))

    if len(remote_host_names) > 0:
        if settings.verbose >= 2:
            print('Checking ssh on all remote hosts.')
        # Check if we can ssh into all remote hosts successfully.
        _check_all_hosts_ssh_successful(remote_host_names,
                                        args.ssh_port,
                                        fn_cache=fn_cache)
        if settings.verbose >= 2:
            print('SSH was successful into all the remote hosts.')

    if len(remote_host_names) > 0:
        if settings.verbose >= 2:
            print('Testing interfaces on all the hosts.')

        local_host_names = set(all_host_names) - set(remote_host_names)
        # Find the set of common, routed interfaces on all the hosts (remote
        # and local) and specify it in the args to be used by NCCL. It is
        # expected that the following function will find at least one interface
        # otherwise, it will raise an exception.
        common_intfs = _driver_fn(all_host_names,
                                  local_host_names,
                                  settings,
                                  fn_cache=fn_cache)

        if settings.verbose >= 2:
            print('Interfaces on all the hosts were successfully checked.')
            print('Common interface found: ' + ' '.join(common_intfs))

    else:
        if settings.verbose >= 2:
            print('All hosts are local, finding the interfaces '
                  'with address 127.0.0.1')
        # If all the given hosts are local, find the interfaces with address
        # 127.0.0.1
        common_intfs = set()
        for iface, addrs in net_if_addrs().items():
            for addr in addrs:
                if addr.family == AF_INET and addr.address == '127.0.0.1':
                    common_intfs.add(iface)
                    break

        if len(common_intfs) == 0:
            raise ValueError('No interface is found for address 127.0.0.1.')

        if settings.verbose >= 2:
            print('Local interface found ' + ' '.join(common_intfs))

    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)

    if args.use_gloo:
        if not gloo_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'Gloo support has not been built.  If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
            )
        gloo_run(settings, remote_host_names, common_intfs, env)
    elif args.use_mpi:
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built.  If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        mpi_run(settings, common_intfs, env)
    else:
        if mpi_built(verbose=(settings.verbose >= 2)):
            mpi_run(settings, common_intfs, env)
        elif gloo_built(verbose=(settings.verbose >= 2)):
            gloo_run(settings, remote_host_names, common_intfs, env)
        else:
            raise ValueError(
                'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                'either MPI is installed (MPI) or CMake is installed (Gloo).')