def test_fault_tolerance_hosts_remove_and_add_cooldown(ray_8_cpus):
    """Elastic run where the discovered host list shrinks, then grows back.

    The discovery schedule lists three hosts, then only ``host-1``, then the
    original three hosts again. The executor is created with
    ``min_workers=1``, ``cpus_per_worker=1`` and ``cooldown_range=[1, 1]``.
    The test expects five results and five "started" / five "finished"
    status events.
    """
    with fault_tolerance_patches():
        # NOTE(review): entries look like (duration, ["host:slots", ...]) with
        # None meaning "until the end" -- confirm against SimpleTestDiscovery.
        schedule = [
            (10, ['host-1:2', 'host-2:1', 'host-3:2']),
            (10, ['host-1:2']),
            (None, ['host-1:2', 'host-2:1', 'host-3:2']),
        ]

        # Use the first network interface psutil reports on this machine.
        interface_names = list(psutil.net_if_addrs().keys())
        first_nic = interface_names[0]

        settings = RayExecutor.create_settings(nics={first_nic})
        settings.discovery = SimpleTestDiscovery(schedule)

        executor = RayExecutor(
            settings,
            min_workers=1,
            cpus_per_worker=1,
            override_discovery=False,
            cooldown_range=[1, 1],
        )
        train = _create_training_function(iterations=30)

        executor.start()
        status = StatusCallback()
        outputs = executor.run(train, callbacks=[status])
        assert len(outputs) == 5

        events = status.fetch()
        started = sum(1 for event in events if "started" in event)
        finished = sum(1 for event in events if "finished" in event)
        assert started == 5, events
        assert finished == 5, events
def test_min_num_proc(ray_8_cpus):
    """Elastic run pinned to exactly four workers.

    The executor is created with ``min_workers=4`` and ``max_workers=4``
    while the discovery schedule lists one host, then three, then four
    (each written as ``host-N:1``). The test expects four results and four
    "started" / four "finished" status events.
    """
    with fault_tolerance_patches():
        # NOTE(review): entries look like (duration, ["host:slots", ...]) with
        # None meaning "until the end" -- confirm against SimpleTestDiscovery.
        schedule = [
            (10, ['host-1:1']),
            (10, ['host-1:1', 'host-4:1', 'host-5:1']),
            (None, ['host-1:1', 'host-4:1', 'host-5:1', 'host-6:1']),
        ]

        # Use the first network interface psutil reports on this machine.
        interface_names = list(psutil.net_if_addrs().keys())
        first_nic = interface_names[0]

        settings = RayExecutor.create_settings(nics={first_nic})
        settings.discovery = SimpleTestDiscovery(schedule)

        executor = RayExecutor(
            settings,
            min_workers=4,
            max_workers=4,
            override_discovery=False,
        )
        train = _create_training_function(iterations=30)

        executor.start()
        status = StatusCallback()
        outputs = executor.run(train, callbacks=[status])
        assert len(outputs) == 4

        events = status.fetch()
        started = sum(1 for event in events if "started" in event)
        finished = sum(1 for event in events if "finished" in event)
        assert started == 4, events
        assert finished == 4, events
def test_ray_exec_func(ray_start_4_cpus):
    """Run a plain function on every worker through ``RayExecutor.run``.

    Each worker returns ``hvd.rank() * num_epochs``. The function is invoked
    with ``num_epochs=0``, so every worker's return value is ``0`` and the
    set of results must contain exactly one element.
    """

    def simple_fn(num_epochs):
        # Imported inside the function body, as in the original test, so the
        # import happens where the function is executed.
        import horovod.torch as hvd
        hvd.init()
        return hvd.rank() * num_epochs

    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(
        setting, num_hosts=1, num_slots=4, use_gpu=torch.cuda.is_available())
    hjob.start()
    try:
        result = hjob.run(simple_fn, args=[0])
        assert len(set(result)) == 1
    finally:
        # Always release the started executor, even when the run or the
        # assertion fails, so a failing test does not leave it running.
        hjob.shutdown()