def test_worker_start_failure(ray_start_2_cpus):
    test_config = TestConfig()

    trainer = Trainer(test_config, num_workers=2)

    restart = trainer._executor._restart

    def init_hook():
        pass

    def init_hook_fail():
        ray.actor.exit_actor()

    def restart_patched(self):
        self._initialization_hook = init_hook
        restart()

    # The failing initialization hook kills the workers on the first start
    # attempt; the patched restart swaps in a benign hook so recovery
    # succeeds.
    with patch.object(BackendExecutor, "_restart", restart_patched):
        trainer.start(initialization_hook=init_hook_fail)

    assert len(trainer._executor.worker_group) == 2
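# Several tests below assume a TestCallback helper defined elsewhere in this
# suite. A minimal sketch of its shape (not the exact definition): an
# SGDCallback that records every batch of per-worker results it is handed.
#
# class TestCallback(SGDCallback):
#     def __init__(self):
#         self.result_list = []
#
#     def handle_result(self, results, **info):
#         self.result_list.append(results)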
def test_report(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        for i in range(3):
            sgd.report(index=i)
        return 1

    callback = TestCallback()
    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func, callbacks=[callback])
    assert results == [1, 1]

    result_list = callback.result_list
    assert len(result_list) == 3
    for index in range(len(result_list)):
        intermediate_results = result_list[index]
        assert len(intermediate_results) == 2
        for worker_result in intermediate_results:
            assert worker_result["index"] == index
def test_load_checkpoint(ray_start_2_cpus):
    config = TestConfig()

    def train_func_checkpoint():
        checkpoint = sgd.load_checkpoint()
        assert checkpoint is not None
        assert checkpoint["epoch"] == 3

        result = []
        for i in range(checkpoint["epoch"], 5):
            result.append(i)
        return result

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    result = trainer.run(train_func_checkpoint, checkpoint={"epoch": 3})

    assert result is not None
    assert len(result) == 2
    assert result[0] == [3, 4]
    assert result[1] == [3, 4]
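# The fault-injection tests below rely on a gen_new_backend_executor(fn)
# helper defined elsewhere in this suite. Roughly, it returns a
# BackendExecutor subclass in which one worker runs fn in place of the train
# function passed to trainer.run(), letting a single worker misbehave
# (run slowly, skip a checkpoint, or die) while the others run normally.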
def test_fast_slow(ray_start_2_cpus):
    test_config = TestConfig()

    def train():
        for i in range(2):
            sgd.save_checkpoint(epoch=i)
            sgd.report(index=i)

    def train_slow():
        for i in range(2):
            sgd.save_checkpoint(epoch=i)
            time.sleep(5)
            sgd.report(index=i)
            time.sleep(5)

    new_backend_executor_cls = gen_new_backend_executor(train_slow)
    callback = TestCallback()

    with patch.object(ray.util.sgd.v2.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        trainer.run(train, callbacks=[callback])

    assert trainer.latest_checkpoint["epoch"] == 1

    result_list = callback.result_list
    assert len(result_list) == 2
    for index in range(len(result_list)):
        intermediate_results = result_list[index]
        assert len(intermediate_results) == 2
        for worker_result in intermediate_results:
            assert worker_result["index"] == index
def test_mismatch_checkpoint_report(ray_start_2_cpus):
    test_config = TestConfig()

    def train():
        for i in range(2):
            sgd.save_checkpoint(epoch=i)
            sgd.report(index=i)

    def train_mismatch():
        sgd.save_checkpoint(epoch=0)
        sgd.report(index=0)
        # Skip the second checkpoint.
        sgd.report(index=1)

    new_backend_executor_cls = gen_new_backend_executor(train_mismatch)
    callback = TestCallback()

    with patch.object(ray.util.sgd.v2.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        with pytest.raises(RuntimeError):
            trainer.run(train, callbacks=[callback])

    # Validate the checkpoint.
    assert trainer.latest_checkpoint["epoch"] == 0

    # Validate the callback.
    result_list = callback.result_list
    assert len(result_list) == 1  # 1 epoch succeeded.
    intermediate_results = result_list[0]
    assert len(intermediate_results) == 2  # Both workers reported.
    for worker_result in intermediate_results:
        assert worker_result["index"] == 0
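# test_json takes workers_to_log, detailed, and filename as arguments; these
# are assumed to come from pytest.mark.parametrize decorators defined with
# the test, and make_temp_dir is assumed to be a fixture yielding a fresh
# temporary directory.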
def test_json(ray_start_4_cpus, make_temp_dir, workers_to_log, detailed,
              filename):
    if detailed:
        os.environ[ENABLE_DETAILED_AUTOFILLED_METRICS_ENV] = "1"
    else:
        os.environ.pop(ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, None)

    config = TestConfig()

    num_iters = 5
    num_workers = 4

    if workers_to_log is None:
        num_workers_to_log = num_workers
    elif isinstance(workers_to_log, int):
        num_workers_to_log = 1
    else:
        num_workers_to_log = len(workers_to_log)

    def train_func():
        for i in range(num_iters):
            sgd.report(index=i)
        return 1

    if filename is None:
        # If None, use the default value.
        callback = JsonLoggerCallback(
            make_temp_dir, workers_to_log=workers_to_log)
        assert str(
            callback.log_path.name) == JsonLoggerCallback._default_filename
    else:
        callback = JsonLoggerCallback(
            make_temp_dir, filename=filename, workers_to_log=workers_to_log)
        assert str(callback.log_path.name) == filename

    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    trainer.run(train_func, callbacks=[callback])

    with open(callback.log_path, "r") as f:
        log = json.load(f)
    print(log)

    assert len(log) == num_iters
    assert len(log[0]) == num_workers_to_log
    assert all(len(element) == len(log[0]) for element in log)
    assert all(
        all(worker["index"] == worker[TRAINING_ITERATION] - 1
            for worker in element) for element in log)
    assert all(
        all(
            all(key in worker for key in BASIC_AUTOFILLED_KEYS)
            for worker in element) for element in log)
    if detailed:
        assert all(
            all(
                all(key in worker for key in DETAILED_AUTOFILLED_KEYS)
                for worker in element) for element in log)
    else:
        assert all(
            all(not any(key in worker for key in DETAILED_AUTOFILLED_KEYS)
                for worker in element) for element in log)
def test_worker_failure_2(ray_start_2_cpus):
    test_config = TestConfig()

    def train():
        for _ in range(2):
            sgd.report(loss=1)
        return 1

    def train_actor_failure():
        for _ in range(2):
            sgd.report(loss=1)
        import sys
        sys.exit(0)

    new_backend_executor_cls = gen_new_backend_executor(train_actor_failure)

    with patch.object(ray.util.sgd.v2.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        results = trainer.run(train)
        assert results == [1, 1]
def train_linear(num_workers=1):
    trainer = Trainer(TorchConfig(backend="gloo"), num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer.start()
    results = trainer.run(
        train_func, config, callbacks=[JsonLoggerCallback("./sgd_results")])
    trainer.shutdown()

    print(results)
    return results
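# A minimal usage sketch for running this example standalone (the ray.init()
# arguments are illustrative):
#
# if __name__ == "__main__":
#     ray.init(num_cpus=2)
#     train_linear(num_workers=2)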
def test_horovod_simple(ray_start_2_cpus):
    def simple_fn():
        hvd_torch.init()
        return hvd_torch.rank()

    num_workers = 2
    trainer = Trainer("horovod", num_workers)
    trainer.start()
    result = trainer.run(simple_fn)
    trainer.shutdown()

    assert result == list(range(num_workers))
def test_run_iterator(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        for i in range(3):
            sgd.report(index=i)
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    iterator = trainer.run_iterator(train_func)

    count = 0
    for results in iterator:
        # A bare generator expression inside assert is always truthy; wrap
        # it in all() so the per-worker indices are actually checked.
        assert all(value["index"] == count for value in results)
        count += 1

    assert count == 3
    assert iterator.is_finished()
    assert iterator.get_final_results() == [1, 1]

    with pytest.raises(StopIteration):
        next(iterator)
def train_tensorflow_mnist(num_workers=2, use_gpu=False):
    trainer = Trainer(
        backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(
        train_func=train_func,
        config={
            "lr": 1e-3,
            "batch_size": 64,
            "epochs": 4
        })
    trainer.shutdown()
    print(f"Results: {results[0]}")
def test_run(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func)
    trainer.shutdown()

    assert len(results) == 2
    assert all(result == 1 for result in results)
def test_dataset_pipeline(ray_start_4_cpus):
    """Checks that Pipeline is correctly sharded even with multiple epochs."""
    num_epochs = 2
    num_data = 10

    dataset = ray.data.range(num_data).repeat()

    def get_dataset():
        pipeline_iterator = sgd.get_dataset_shard().iter_datasets()
        data_all_epochs = []
        for _ in range(num_epochs):
            dataset_this_epoch = next(pipeline_iterator)
            data_this_epoch = []
            for batch in dataset_this_epoch.iter_batches():
                data_this_epoch.extend(batch)
            data_all_epochs.append(data_this_epoch)
        return data_all_epochs

    config = TestConfig()
    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(get_dataset, dataset=dataset)
    check_dataset_output(num_data, num_epochs, results)
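# check_dataset_output is a helper defined elsewhere in this suite. A minimal
# sketch of the property it verifies (the exact definition may differ): every
# worker saw num_epochs epochs, and within each epoch the shards across
# workers together cover the full dataset exactly once.
#
# def check_dataset_output(num_data, num_epochs, data_all_epochs):
#     assert all(
#         len(worker_data) == num_epochs for worker_data in data_all_epochs)
#     for epoch in range(num_epochs):
#         epoch_data = []
#         for worker_data in data_all_epochs:
#             epoch_data.extend(worker_data[epoch])
#         assert len(epoch_data) == num_data
#         assert set(epoch_data) == set(range(num_data))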
def test_torch_fashion_mnist_gpu(ray_start_2_cpus_2_gpus):
    num_workers = 2
    epochs = 3

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=True)
    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer.start()
    results = trainer.run(fashion_mnist_train_func, config)
    trainer.shutdown()

    assert len(results) == num_workers
    for result in results:
        assert len(result) == epochs
        assert result[-1] < result[0]
def train_linear(num_workers=2, use_gpu=False):
    datasets = get_datasets()

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer.start()
    results = trainer.run(
        train_func,
        config,
        dataset=datasets,
        callbacks=[JsonLoggerCallback(), TBXLoggerCallback()])
    trainer.shutdown()

    print(results)
    return results
def test_run_config(ray_start_2_cpus):
    backend_config = TestConfig()

    def train_func(config):
        return config["fruit"]

    config = {"fruit": "banana"}

    trainer = Trainer(backend_config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func, config)
    trainer.shutdown()

    assert len(results) == 2
    assert all(result == "banana" for result in results)
def test_torch_linear(ray_start_2_cpus):
    num_workers = 2
    epochs = 3

    trainer = Trainer("torch", num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer.start()
    results = trainer.run(linear_train_func, config)
    trainer.shutdown()

    assert len(results) == num_workers
    for result in results:
        assert len(result) == epochs
        assert result[-1]["loss"] < result[0]["loss"]
def test_horovod_torch_mnist_gpu(ray_start_2_cpus_2_gpus):
    num_workers = 2
    num_epochs = 2
    trainer = Trainer("horovod", num_workers, use_gpu=True)
    trainer.start()
    results = trainer.run(
        horovod_torch_train_func,
        config={
            "num_epochs": num_epochs,
            "lr": 1e-3
        })
    trainer.shutdown()

    assert len(results) == num_workers
    for worker_result in results:
        assert len(worker_result) == num_epochs
        assert worker_result[num_epochs - 1] < worker_result[0]
def test_run_after_user_error(ray_start_2_cpus):
    config = TestConfig()

    def fail_train():
        raise NotImplementedError

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    with pytest.raises(NotImplementedError):
        trainer.run(fail_train)

    def train():
        return 1

    output = trainer.run(train)
    assert output == [1, 1]
def train_tensorflow_linear(num_workers=2, use_gpu=False):
    dataset_pipeline = get_dataset_pipeline()
    trainer = Trainer(
        backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(
        train_func=train_func,
        dataset=dataset_pipeline,
        config={
            "lr": 1e-3,
            "batch_size": 32,
            "epochs": 4
        })
    trainer.shutdown()
    print(f"Results: {results[0]}")
    return results
def test_user_error(ray_start_2_cpus):
    """Tests that errors raised in the user training function are surfaced
    by trainer.run(), whether raised before or after results are reported."""
    config = TestConfig()

    def fail_train_1():
        raise NotImplementedError

    trainer = Trainer(config, num_workers=2)
    trainer.start()

    with pytest.raises(NotImplementedError):
        trainer.run(fail_train_1)

    def fail_train_2():
        for _ in range(2):
            sgd.report(loss=1)
        raise NotImplementedError

    with pytest.raises(NotImplementedError):
        trainer.run(fail_train_2)
def test_worker_failure_1(ray_start_2_cpus):
    test_config = TestConfig()

    def train():
        return 1

    def train_actor_failure():
        import sys
        sys.exit(0)

    new_backend_executor_cls = gen_new_backend_executor(train_actor_failure)

    with patch.object(ray.util.sgd.v2.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        with pytest.raises(RuntimeError):
            trainer.run(train)

        # Make sure the Trainer is shut down after the worker failure.
        with pytest.raises(RuntimeError):
            trainer.run(train)
def test_start_shutdown(ray_start_2_cpus, num_workers):
    config = TestConfig()
    assert ray.available_resources()["CPU"] == 2
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    time.sleep(1)

    remaining = 2 - num_workers
    if remaining == 0:
        assert "CPU" not in ray.available_resources()
    else:
        assert ray.available_resources()["CPU"] == remaining

    trainer.shutdown()
    time.sleep(1)
    assert ray.available_resources()["CPU"] == 2
def test_tensorflow_mnist_gpu(ray_start_2_cpus_2_gpus):
    num_workers = 2
    epochs = 3

    trainer = Trainer("tensorflow", num_workers=num_workers, use_gpu=True)
    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer.start()
    results = trainer.run(tensorflow_mnist_train_func, config)
    trainer.shutdown()

    assert len(results) == num_workers
    result = results[0]

    loss = result["loss"]
    assert len(loss) == epochs
    assert loss[-1] < loss[0]

    accuracy = result["accuracy"]
    assert len(accuracy) == epochs
    assert accuracy[-1] > accuracy[0]
def test_multiple_run(ray_start_2_cpus):
    config = TestConfig()

    def train_1():
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()

    output_1 = trainer.run(train_1)
    assert output_1 == [1, 1]

    def train_2():
        return 2

    output_2 = trainer.run(train_2)
    assert output_2 == [2, 2]
def test_multiple_datasets(ray_start_4_cpus):
    num_epochs = 2
    num_data_1 = 10
    num_data_2 = 6
    train_data = ray.data.range(num_data_1)
    val_data = ray.data.range(num_data_2)

    def get_dataset():
        data_train_all_epochs = []
        data_val_all_epochs = []
        for _ in range(num_epochs):
            data_this_epoch_train = []
            train_dataset = sgd.get_dataset_shard("train")
            for batch in train_dataset.iter_batches():
                data_this_epoch_train.extend(batch)
            data_train_all_epochs.append(data_this_epoch_train)

            data_this_epoch_val = []
            val_dataset = sgd.get_dataset_shard("val")
            for batch in val_dataset.iter_batches():
                data_this_epoch_val.extend(batch)
            data_val_all_epochs.append(data_this_epoch_val)

        return data_train_all_epochs, data_val_all_epochs

    config = TestConfig()
    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(
        get_dataset, dataset={
            "train": train_data,
            "val": val_data
        })

    check_dataset_output(num_data_1, num_epochs,
                         [worker_data[0] for worker_data in results])
    check_dataset_output(num_data_2, num_epochs,
                         [worker_data[1] for worker_data in results])
    trainer.shutdown()
def test_TBX(ray_start_4_cpus, make_temp_dir):
    config = TestConfig()
    temp_dir = make_temp_dir
    num_workers = 4

    def train_func():
        sgd.report(episode_reward_mean=4)
        sgd.report(episode_reward_mean=5)
        sgd.report(episode_reward_mean=6, score=[1, 2, 3], hello={"world": 1})
        return 1

    callback = TBXLoggerCallback(temp_dir)
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    trainer.run(train_func, callbacks=[callback])

    _validate_tbx_result(temp_dir)
def test_mismatch_checkpoint(ray_start_2_cpus):
    test_config = TestConfig()

    def train():
        for i in range(2):
            sgd.save_checkpoint(epoch=i)

    def train_mismatch():
        sgd.save_checkpoint(epoch=0)

    new_backend_executor_cls = gen_new_backend_executor(train_mismatch)

    with patch.object(ray.util.sgd.v2.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        with pytest.raises(RuntimeError):
            trainer.run(train)
def test_resources(ray_start_4_cpus_4_gpus_4_extra, resource, num_requested):
    num_workers = 2
    config = TestConfig()
    original = ray.available_resources().get(resource)
    resources_per_worker = {resource: num_requested}
    use_gpu = resource == "GPU"
    trainer = Trainer(
        config,
        num_workers=num_workers,
        use_gpu=use_gpu,
        resources_per_worker=resources_per_worker)

    trainer.start()
    expected = original - num_workers * num_requested
    wait_for_condition(
        lambda: ray.available_resources().get(resource, 0) == expected)

    trainer.shutdown()
    wait_for_condition(
        lambda: ray.available_resources().get(resource, 0) == original)
def main():
    args = parse_args()
    config = {"args": args}

    if args.start_local or args.address or \
            args.num_workers > 1 or args.use_gpu:
        if args.start_local:
            # Start a local Ray runtime.
            ray.init(num_cpus=args.num_workers)
        else:
            # Connect to a Ray cluster for distributed training.
            ray.init(address=args.address)
        trainer = Trainer(
            "torch", num_workers=args.num_workers, use_gpu=args.use_gpu)
        trainer.start()
        trainer.run(train_func, config)
    else:
        # Run training locally.
        train_func(config)
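# Standard entry-point guard for this script; parse_args and train_func are
# assumed to be defined earlier in the same file.
if __name__ == "__main__":
    main()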