Example #1
def test_both_num_workers_min_workers(ray_8_cpus):
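    # Specifying both `min_workers` and `num_workers` is ambiguous, so the
    # constructor is expected to raise a ValueError.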
    settings = RayExecutor.create_settings()
    with pytest.raises(ValueError, match=r"Both `min_workers` and `num_workers` provided."):
        executor = RayExecutor(
            settings,
            min_workers=1,
            num_workers=1,
            cpus_per_worker=1)
Example #2
def test_local(ray_start_4_cpus):
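    # Start a 1-host, 4-slot job on the local cluster: every worker should
    # report the same hostname, and all Ray resources should be released
    # again after shutdown.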
    original_resources = ray.available_resources()
    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(setting, num_hosts=1, num_slots=4)
    hjob.start()
    hostnames = hjob.execute(lambda _: socket.gethostname())
    assert len(set(hostnames)) == 1, hostnames
    hjob.shutdown()
    assert check_resources(original_resources)
Example #3
def test_local(ray_start_4_cpus, num_workers, num_hosts, num_workers_per_host):
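    # Parametrized variant of the local test: workers can be requested either
    # via `num_workers` or via `num_hosts` / `num_workers_per_host`.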
    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(setting,
                       num_workers=num_workers,
                       num_hosts=num_hosts,
                       num_workers_per_host=num_workers_per_host)
    hjob.start()
    hostnames = hjob.execute(lambda _: socket.gethostname())
    assert len(set(hostnames)) == 1, hostnames
    hjob.shutdown()
Example #4
def test_train(ray_start_4_cpus):
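    # Run the same function through both `execute()` and `run_remote()`;
    # the three workers should report local ranks {0, 1, 2} in both cases.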
    def simple_fn(worker):
        local_rank = _train()
        return local_rank

    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(
        setting, num_workers=3, use_gpu=torch.cuda.is_available())
    hjob.start()
    result = hjob.execute(simple_fn)
    assert set(result) == {0, 1, 2}
    result = ray.get(hjob.run_remote(simple_fn, args=[None]))
    assert set(result) == {0, 1, 2}
    hjob.shutdown()
Example #5
def test_infeasible_placement(ray_start_2_cpus):
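    # Requesting 4 slots on a 2-CPU cluster can never be scheduled, so the
    # placement group times out and `start()` raises a TimeoutError.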
    setting = RayExecutor.create_settings(timeout_s=30,
                                          placement_group_timeout_s=5)
    hjob = RayExecutor(setting, num_hosts=1, num_slots=4)
    with pytest.raises(TimeoutError):
        hjob.start()
    hjob.shutdown()
Example #6
def test_gpu_ids(ray_start_4_cpus_4_gpus):
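    # With 4 workers on a single host, every worker should see the same
    # CUDA_VISIBLE_DEVICES string containing all four GPU ids.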
    original_resources = ray.available_resources()
    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(
        setting, num_hosts=1, num_workers_per_host=4, use_gpu=True)
    hjob.start()
    all_envs = hjob.execute(lambda _: os.environ.copy())
    all_cudas = {ev["CUDA_VISIBLE_DEVICES"] for ev in all_envs}
    assert len(all_cudas) == 1, all_cudas
    assert len(all_envs[0]["CUDA_VISIBLE_DEVICES"].split(",")) == 4
    hjob.shutdown()
    assert check_resources(original_resources)
Example #7
File: test_ray.py Project: yuduber/horovod
def test_horovod_train(ray_start_4_cpus):
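    # The four slots on the single host should come back with the distinct
    # local ranks 0 through 3.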
    def simple_fn(worker):
        local_rank = _train()
        return local_rank

    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(
        setting, num_hosts=1, num_slots=4, use_gpu=torch.cuda.is_available())
    hjob.start()
    result = hjob.execute(simple_fn)
    assert set(result) == {0, 1, 2, 3}
    hjob.shutdown()
Example #8
File: test_ray.py Project: yuduber/horovod
def test_ray_exec_func(ray_start_4_cpus):
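    # `run()` invokes the function with the given args on every worker; with
    # num_epochs=0 each worker returns 0, so the result set has one element.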
    def simple_fn(num_epochs):
        import horovod.torch as hvd
        hvd.init()
        return hvd.rank() * num_epochs

    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(
        setting, num_hosts=1, num_slots=4, use_gpu=torch.cuda.is_available())
    hjob.start()
    result = hjob.run(simple_fn, args=[0])
    assert len(set(result)) == 1
    hjob.shutdown()
Example #9
def test_min_num_proc(ray_8_cpus):
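    # Elastic run: discovery starts with a single host, but with
    # min_workers == max_workers == 4 the job waits until four slots are
    # available, so all four workers report "started" and "finished".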
    with fault_tolerance_patches():
        discovery_schedule = [
            (10, ['host-1:1']),
            (10, ['host-1:1', 'host-4:1', 'host-5:1']),
            (None, ['host-1:1', 'host-4:1', 'host-5:1', 'host-6:1']),
        ]
        nics = list(psutil.net_if_addrs().keys())[0]

        settings = RayExecutor.create_settings(nics={nics})
        settings.discovery = SimpleTestDiscovery(discovery_schedule)
        executor = RayExecutor(settings,
            min_workers=4,
            max_workers=4,
            override_discovery=False
        )

        training_fn = _create_training_function(iterations=30)
        executor.start()
        trace = StatusCallback()
        results = executor.run(training_fn, callbacks=[trace])
        assert len(results) == 4

        events = trace.fetch()
        assert sum(int("started" in e) for e in events) == 4, events
        assert sum(int("finished" in e) for e in events) == 4, events
Example #10
def test_fault_tolerance_hosts_remove_and_add_cooldown(ray_8_cpus):
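    # Elastic fault tolerance: hosts drop out and rejoin according to the
    # discovery schedule; the final round exposes 5 slots, all of which
    # should report "started" and "finished".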
    with fault_tolerance_patches():
        discovery_schedule = [
            (10, ['host-1:2', 'host-2:1', 'host-3:2']),
            (10, ['host-1:2']),
            (None, ['host-1:2', 'host-2:1', 'host-3:2']),
        ]
        nics = list(psutil.net_if_addrs().keys())[0]

        settings = RayExecutor.create_settings(nics={nics})
        settings.discovery = SimpleTestDiscovery(discovery_schedule)
        executor = RayExecutor(settings,
                               min_workers=1,
                               cpus_per_worker=1,
                               override_discovery=False,
                               cooldown_range=[1, 1])

        training_fn = _create_training_function(iterations=30)
        executor.start()
        trace = StatusCallback()
        results = executor.run(training_fn, callbacks=[trace])
        assert len(results) == 5

        events = trace.fetch()
        assert sum(int("started" in e) for e in events) == 5, events
        assert sum(int("finished" in e) for e in events) == 5, events
Example #11
def test_gpu_ids(ray_start_4_cpus_4_gpus):
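    # Inspect each worker actor's environment directly via `env_vars.remote()`:
    # all workers on the single host should share one CUDA_VISIBLE_DEVICES
    # value listing all four GPUs.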
    original_resources = ray.available_resources()
    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(setting, num_hosts=1, num_slots=4, use_gpu=True)
    hjob.start()
    worker_handles = hjob.workers
    all_envs = ray.get([h.env_vars.remote() for h in worker_handles])
    all_cudas = {ev["CUDA_VISIBLE_DEVICES"] for ev in all_envs}
    assert len(all_cudas) == 1, all_cudas
    assert len(all_envs[0]["CUDA_VISIBLE_DEVICES"].split(",")) == 4
    hjob.shutdown()
    assert check_resources(original_resources)
Example #12
File: test_ray.py Project: yuduber/horovod
def test_ray_init(ray_start_4_cpus):
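    # Each of the four workers initializes Horovod and reports a distinct
    # rank; cluster resources should be restored after shutdown.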
    original_resources = ray.available_resources()

    def simple_fn(worker):
        import horovod.torch as hvd
        hvd.init()
        return hvd.rank()

    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(
        setting, num_hosts=1, num_slots=4, use_gpu=torch.cuda.is_available())
    hjob.start()
    result = hjob.execute(simple_fn)
    assert len(set(result)) == 4
    hjob.shutdown()
    assert check_resources(original_resources)
Example #13
def test_gpu_ids_num_workers(ray_start_4_cpus_4_gpus):
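    # When only `num_workers` is given, all four workers should share the same
    # CUDA_VISIBLE_DEVICES list, and each worker's local rank should appear in it.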
    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(setting, num_workers=4, use_gpu=True)
    hjob.start()
    all_envs = hjob.execute(lambda _: os.environ.copy())
    all_cudas = {ev["CUDA_VISIBLE_DEVICES"] for ev in all_envs}

    assert len(all_cudas) == 1, all_cudas
    assert len(all_envs[0]["CUDA_VISIBLE_DEVICES"].split(
        ",")) == 4, all_envs[0]["CUDA_VISIBLE_DEVICES"]

    def _test(worker):
        import horovod.torch as hvd
        hvd.init()
        local_rank = str(hvd.local_rank())
        return local_rank in os.environ["CUDA_VISIBLE_DEVICES"]

    all_valid_local_rank = hjob.execute(_test)
    assert all(all_valid_local_rank)
    hjob.shutdown()
Example #14
def test_ray_init(ray_start_4_cpus, num_workers, num_hosts,
                  num_workers_per_host):
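    # Parametrized init test: four workers, however they are specified,
    # should produce four distinct Horovod ranks.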
    def simple_fn(worker):
        import horovod.torch as hvd
        hvd.init()
        return hvd.rank()

    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(setting,
                       num_workers=num_workers,
                       num_hosts=num_hosts,
                       num_workers_per_host=num_workers_per_host,
                       use_gpu=torch.cuda.is_available())
    hjob.start()
    result = hjob.execute(simple_fn)
    assert len(set(result)) == 4
    hjob.shutdown()
Example #15
File: test_ray.py Project: yuduber/horovod
def test_ray_executable(ray_start_4_cpus):
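    # `start()` can instantiate an executable class on every worker; `execute()`
    # then calls a method on each instance, giving rank * epochs = {0, 2, 4, 6}.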
    class Executable:
        def __init__(self, epochs):
            import horovod.torch as hvd
            self.hvd = hvd
            self.epochs = epochs
            self.hvd.init()

        def rank_epoch(self):
            return self.hvd.rank() * self.epochs

    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(
        setting, num_hosts=1, num_slots=4, use_gpu=torch.cuda.is_available())
    hjob.start(executable_cls=Executable, executable_args=[2])
    result = hjob.execute(lambda w: w.rank_epoch())
    assert set(result) == {0, 2, 4, 6}
    hjob.shutdown()
Example #16
def test_ray_exec_remote_func(ray_start_4_cpus, num_workers, num_hosts,
                              num_workers_per_host):
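    # `run_remote()` returns Ray ObjectRefs instead of blocking; the results
    # are fetched explicitly with `ray.get()`.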
    def simple_fn(num_epochs):
        import horovod.torch as hvd
        hvd.init()
        return hvd.rank() * num_epochs

    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(setting,
                       num_workers=num_workers,
                       num_hosts=num_hosts,
                       num_workers_per_host=num_workers_per_host,
                       use_gpu=torch.cuda.is_available())
    hjob.start()
    object_refs = hjob.run_remote(simple_fn, args=[0])
    result = ray.get(object_refs)
    assert len(set(result)) == 1
    hjob.shutdown()
Example #17
        def run(self):
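            # Request workers by count with explicit per-worker CPU/GPU resources
            # and no host layout; the assert below checks that the strategy did
            # not create a placement group of its own.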
            def simple_fn(worker):
                local_rank = _train()
                return local_rank

            setting = RayExecutor.create_settings(timeout_s=30)
            hjob = RayExecutor(setting,
                               num_workers=4,
                               num_hosts=None,
                               num_workers_per_host=None,
                               cpus_per_worker=1,
                               gpus_per_worker=int(torch.cuda.is_available())
                               or None,
                               use_gpu=torch.cuda.is_available())
            hjob.start()
            assert not hjob.adapter.strategy._created_placement_group
            result = hjob.execute(simple_fn)
            assert set(result) == {0, 1, 2, 3}
            hjob.shutdown()