def test_reduce_result(ray_start_2_cpus, use_local):
    """Checks that reduce_results=False returns one stats dict per worker.

    With two workers and no reduction, ``train()`` and ``validate()`` should
    each return a list of two per-worker stats dicts, and each worker should
    have seen the full dataset.
    """
    if not dist.is_available():
        return

    def data_creator(config):
        """Returns training dataloader, validation dataloader."""
        train_dataset = LinearDataset(2, 5, size=config["data_size"])
        test_dataset = LinearDataset(2, 5, size=config["data_size"])
        return DataLoader(
            train_dataset, batch_size=1), DataLoader(
                test_dataset, batch_size=1)

    data_size = 600

    TestOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        data_creator,
        loss_creator=lambda config: nn.MSELoss())
    trainer = TorchTrainer(
        training_operator_cls=TestOperator,
        num_workers=2,
        use_local=use_local,
        config={"data_size": data_size})

    list_stats = trainer.train(reduce_results=False, profile=True)
    assert len(list_stats) == 2
    # BUG FIX: the original `assert [cond for stats in list_stats]` only
    # asserted that the (non-empty) list was truthy, so the per-worker
    # conditions were never actually checked. Use all() instead.
    assert all(stats[NUM_SAMPLES] == data_size for stats in list_stats)
    assert all(
        stats[BATCH_COUNT] == (data_size // 2) for stats in list_stats)

    list_stats = trainer.validate(reduce_results=False, profile=True)
    assert len(list_stats) == 2
    assert all(stats[NUM_SAMPLES] == data_size for stats in list_stats)
    assert all(
        stats[BATCH_COUNT] == (data_size // 2) for stats in list_stats)
    trainer.shutdown()
def test_fail_with_recover(ray_start_2_cpus, use_local):  # noqa: F811
    """Training must raise RuntimeError when retries are exhausted.

    ``_start_workers`` is patched to fail 3 times while only 1 retry is
    allowed, so ``train()`` cannot recover.
    """
    # NOTE: removed a stray `print(locals())` debug statement that leaked
    # fixture internals into the test output.
    if not dist.is_available():
        return

    def single_loader(config):
        dataset = LinearDataset(2, 5, size=1000000)
        return DataLoader(dataset, batch_size=config.get("batch_size", 32))

    TestOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        single_loader,
        loss_creator=lambda config: nn.MSELoss())

    start_with_fail = gen_start_with_fail(3)
    with patch.object(TorchTrainer, "_start_workers", start_with_fail):
        trainer1 = TorchTrainer(
            training_operator_cls=TestOperator,
            config={"batch_size": 100000},
            timeout_s=5,
            use_local=use_local,
            num_workers=2)

        with pytest.raises(RuntimeError):
            trainer1.train(max_retries=1)

        trainer1.shutdown(force=True)
def test_split_batch(ray_start_2_cpus, use_local):
    """Verify that a global batch size is split across the two workers."""
    if not dist.is_available():
        return

    def data_creator(config):
        """Returns the training dataloader only."""
        return DataLoader(
            LinearDataset(2, 5, size=config["data_size"]),
            batch_size=config[BATCH_SIZE])

    num_points = 600
    global_batch = 21

    SplitOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        data_creator,
        loss_creator=lambda config: nn.MSELoss())
    trainer = TorchTrainer(
        training_operator_cls=SplitOperator,
        num_workers=2,
        use_local=use_local,
        config={BATCH_SIZE: global_batch, "data_size": num_points})

    stats = trainer.train()
    # The odd global batch (21) is apparently adjusted down by one so it
    # divides evenly across workers — TODO confirm against TorchTrainer docs.
    assert trainer.config[BATCH_SIZE] == (global_batch - 1)
    assert stats[NUM_SAMPLES] == 600
    assert stats[BATCH_COUNT] == (num_points // 20)
    trainer.shutdown()
def test_resize(ray_start_2_cpus, use_local):  # noqa: F811
    """After an injected start failure with a CPU held by a remote task,
    the trainer should continue with a single worker."""
    if not dist.is_available():
        return

    def single_loader(config):
        return DataLoader(
            LinearDataset(2, 5, size=1000000),
            batch_size=config.get("batch_size", 32))

    fail_once = gen_start_with_fail(1)
    ResizeOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        single_loader,
        loss_creator=lambda config: nn.MSELoss())
    with patch.object(TorchTrainer, "_start_workers", fail_once):
        trainer = TorchTrainer(
            training_operator_cls=ResizeOperator,
            config={"batch_size": 100000},
            use_local=use_local,
            num_workers=2)

        @ray.remote
        def occupy_cpu():
            import time
            time.sleep(100)

        # Hold one CPU so the restart after the failure can only
        # acquire a single worker.
        occupy_cpu.remote()
        trainer.train(max_retries=1)
        assert trainer.worker_group.num_workers == 1

        trainer.shutdown(force=True)
def test_dataset(ray_start_4_cpus, use_local):
    """Train the mlp_identity example end-to-end.

    Model accuracy is used as an all-inclusive check that the dataset is
    properly sharded and fully iterated (rather than, say, repeating the
    first shard of points on every worker).
    """
    model_creator = mlp_identity.model_creator
    optimizer_creator = mlp_identity.optimizer_creator
    dataset_creator = mlp_identity.dataset_creator

    IdentityOperator = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        loss_creator=nn.MSELoss)

    trainer = TorchTrainer(
        training_operator_cls=IdentityOperator,
        use_local=use_local,
        num_workers=2,
    )

    dataset = dataset_creator()
    for _ in range(5):
        trainer.train(dataset=dataset, num_steps=100)

    # The identity MLP should map 0.5 to roughly 0.5 once trained.
    input_point = mlp_identity.to_mat(0.5)
    prediction = float(trainer.get_model()(input_point)[0][0])
    assert 0.4 <= prediction <= 0.6
    trainer.shutdown()
def test_failure_during_resize(ray_start_2_cpus):  # noqa: F811
    """Tests if training succeeds even with failures during worker resizing."""
    if not dist.is_available():
        return

    def single_loader(config):
        return DataLoader(
            LinearDataset(2, 5, size=1000000),
            batch_size=config.get("batch_size", 32))

    StartupFailOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        single_loader,
        loss_creator=lambda config: nn.MSELoss())

    flaky_start = gen_start_with_startup_fail(1)
    with patch.object(TorchTrainer, "_start_workers", flaky_start):
        trainer = TorchTrainer(
            training_operator_cls=StartupFailOperator,
            config={"batch_size": 100000},
            timeout_s=5,
            use_local=False,
            num_workers=2)
        # Training should complete despite the injected startup failure.
        trainer.train()
        trainer.shutdown()
def test_fail_twice(ray_start_2_cpus, use_local):  # noqa: F811
    """Two consecutive start failures are absorbed by the default retry
    policy; training still ends with both workers alive."""
    if not dist.is_available():
        return

    def single_loader(config):
        return DataLoader(
            LinearDataset(2, 5, size=1000000),
            batch_size=config.get("batch_size", 32))

    RetryOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        single_loader,
        loss_creator=lambda config: nn.MSELoss())

    double_fail = gen_start_with_fail(2)
    with patch.object(TorchTrainer, "_start_workers", double_fail):
        trainer = TorchTrainer(
            training_operator_cls=RetryOperator,
            config={"batch_size": 100000},
            use_local=use_local,
            num_workers=2)

        # Max retries should be on by default — no explicit argument here.
        trainer.train()
        assert trainer._num_failures == 2
        assert trainer.worker_group.num_workers == 2
        trainer.shutdown(force=True)
def setUp(self):
    """Build the operator class shared by this test case's methods."""
    operator_cls = TrainingOperator.from_creators(
        model_creator, optimizer_creator, create_dataloaders,
        loss_creator=loss_creator)
    self.Operator = operator_cls
def test_fail_state(ray_start_2_cpus):  # noqa: F811
    """Tests if state of training with failure is same as training without."""
    if not dist.is_available():
        return

    # Seed the driver so model construction is deterministic.
    torch.manual_seed(0)

    def single_loader(config):
        dataset = LinearDataset(2, 5, size=1000000)
        return DataLoader(dataset, batch_size=config.get("batch_size", 32))

    TestOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        single_loader,
        loss_creator=lambda config: nn.MSELoss(),
    )

    def init_hook():
        # Seed each worker identically so all runs start from the same state.
        torch.manual_seed(0)

    # Baseline run: capture the initial state and the state after one epoch.
    trainer1 = TorchTrainer(
        training_operator_cls=TestOperator,
        config={"batch_size": 100000},
        timeout_s=5,
        initialization_hook=init_hook,
        num_workers=2,
    )
    initial_state = trainer1.state_dict()
    trainer1.train()
    trainer1_state = trainer1.state_dict()
    # Training must actually change the state.
    assert trainer1_state != initial_state
    trainer1.shutdown()

    # Second run from the same initial state must reproduce the baseline.
    trainer2 = TorchTrainer(
        training_operator_cls=TestOperator,
        config={"batch_size": 100000},
        timeout_s=5,
        initialization_hook=init_hook,
        num_workers=2,
    )
    trainer2.load_state_dict(initial_state)
    trainer2.train()
    assert trainer2.state_dict() == trainer1_state
    trainer2.shutdown()

    # Third run with one injected start failure must still reproduce the
    # baseline state — failure recovery must not perturb training.
    start_with_fail = gen_start_with_fail(1)
    with patch.object(TorchTrainer, "_start_workers", start_with_fail):
        trainer3 = TorchTrainer(
            training_operator_cls=TestOperator,
            config={"batch_size": 100000},
            timeout_s=5,
            initialization_hook=init_hook,
            num_workers=2,
        )
        trainer3.load_state_dict(initial_state)
        trainer3.train()
        assert trainer3.state_dict() == trainer1_state
        trainer3.shutdown()
def testNativeLoss(self):
    """A torch loss class (not a factory lambda) works as loss_creator."""
    operator_cls = TrainingOperator.from_creators(
        model_creator, optimizer_creator, single_loader,
        loss_creator=nn.MSELoss)
    torch_runner = TorchRunner(training_operator_cls=operator_cls)
    torch_runner.setup_operator()
    torch_runner.train_epoch()
def testSingleLoader(self):
    """With only a train loader, training works but validation raises."""
    operator_cls = TrainingOperator.from_creators(
        model_creator, optimizer_creator, single_loader,
        loss_creator=loss_creator)
    torch_runner = TorchRunner(training_operator_cls=operator_cls)
    torch_runner.setup_operator()
    torch_runner.train_epoch()
    # No validation loader was provided, so validate() must fail.
    with self.assertRaises(ValueError):
        torch_runner.validate()
def test_resize(ray_start_2_cpus, use_local):  # noqa: F811
    """Trainer shrinks to one worker while a placement group holds a CPU,
    then scales back to two once the placement group is removed."""
    if not dist.is_available():
        return

    def single_loader(config):
        return DataLoader(
            LinearDataset(2, 5, size=1000000),
            batch_size=config.get("batch_size", 32))

    fail_once = gen_start_with_fail(1)
    ResizeOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        single_loader,
        loss_creator=lambda config: nn.MSELoss(),
    )
    with patch.object(TorchTrainer, "_start_workers", fail_once):
        trainer = TorchTrainer(
            training_operator_cls=ResizeOperator,
            config={"batch_size": 100000},
            use_local=use_local,
            num_workers=2,
        )

        # Occupy one CPU with a placement group so the restart after the
        # injected failure can only acquire a single worker.
        blocker_pg = ray.util.placement_group(
            [{"CPU": 1}], strategy="SPREAD")

        trainer.train(max_retries=1)
        assert trainer.worker_group.num_workers == 1
        assert trainer._num_failures == 1

        ray.util.remove_placement_group(blocker_pg)

        def placement_group_removed():
            table = ray.util.placement_group_table(blocker_pg)
            if "state" not in table:
                return False
            return table["state"] == "REMOVED"

        # Wait until the CPU is actually freed...
        wait_for_condition(placement_group_removed)
        # ...then trigger scale up.
        trainer.train()
        assert trainer.worker_group.num_workers == 2
        trainer.shutdown(force=True)
def test_multi_input_model(ray_start_2_cpus, use_local):
    """A model whose forward() takes two tensors trains for one step."""

    def model_creator(config):
        class MultiInputModel(nn.Module):
            def __init__(self):
                super(MultiInputModel, self).__init__()
                self._fc1 = torch.nn.Linear(1, 1)
                self._fc2 = torch.nn.Linear(1, 1)

            def forward(self, x, y):
                # Two separate inputs, one linear layer each.
                return self._fc1(x) + self._fc2(y)

        return MultiInputModel()

    def data_creator(config):
        class LinearDataset(torch.utils.data.Dataset):
            def __init__(self, a, b, size=1000):
                x = np.random.randn(size)
                y = np.random.randn(size)
                self.x = torch.tensor(x, dtype=torch.float32)
                self.y = torch.tensor(y, dtype=torch.float32)
                self.z = torch.tensor(
                    a * (x + y) + 2 * b, dtype=torch.float32)

            def __getitem__(self, index):
                # Each sample is (input1, input2, target).
                return (self.x[index, None], self.y[index, None],
                        self.z[index, None])

            def __len__(self):
                return len(self.x)

        loader = torch.utils.data.DataLoader(
            LinearDataset(3, 4),
            batch_size=config.get("batch_size", 32),
        )
        # No validation loader for this test.
        return loader, None

    MultiInputOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        data_creator,
        loss_creator=lambda config: nn.MSELoss())

    trainer = TorchTrainer(
        training_operator_cls=MultiInputOperator,
        num_workers=1,
        use_local=use_local)

    stats = trainer.train(num_steps=1)
    assert stats[BATCH_COUNT] == 1
    trainer.shutdown()
def testMultiLoaders(self):
    """A creator returning three loaders must be rejected at setup time."""

    def three_data_loader(config):
        return (LinearDataset(2, 5), LinearDataset(2, 5, size=400),
                LinearDataset(2, 5, size=400))

    ThreeOperator = TrainingOperator.from_creators(
        model_creator, optimizer_creator, three_data_loader,
        loss_creator=loss_creator)

    # Setup should fail on a fresh runner, and again on an independent
    # second runner instance.
    for _ in range(2):
        runner = TorchRunner(training_operator_cls=ThreeOperator)
        with self.assertRaises(ValueError):
            runner.setup_operator()
def test_scheduler_validate(ray_start_2_cpus):  # noqa: F811
    """update_scheduler() should step a ReduceLROnPlateau scheduler."""
    from torch.optim.lr_scheduler import ReduceLROnPlateau

    PlateauOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        data_creator,
        scheduler_creator=lambda optimizer, cfg: ReduceLROnPlateau(optimizer),
        loss_creator=lambda config: nn.MSELoss())
    PlateauOperator = get_test_operator(PlateauOperator)

    trainer = TorchTrainer(
        scheduler_step_freq="manual",
        training_operator_cls=PlateauOperator)
    trainer.update_scheduler(0.5)
    trainer.update_scheduler(0.5)
    # Two manual updates -> every worker's scheduler has stepped twice.
    assert all(
        trainer.apply_all_operators(
            lambda op: op._schedulers[0].last_epoch == 2))
    trainer.shutdown()
def test_resize(ray_start_2_cpus, use_local):  # noqa: F811
    """Trainer shrinks to one worker while an actor pins a CPU, then
    scales back to two after the actor is killed."""
    if not dist.is_available():
        return

    def single_loader(config):
        return DataLoader(
            LinearDataset(2, 5, size=1000000),
            batch_size=config.get("batch_size", 32))

    fail_once = gen_start_with_fail(1)
    ResizeOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        single_loader,
        loss_creator=lambda config: nn.MSELoss())
    with patch.object(TorchTrainer, "_start_workers", fail_once):
        trainer = TorchTrainer(
            training_operator_cls=ResizeOperator,
            config={"batch_size": 100000},
            use_local=use_local,
            num_workers=2)

        @ray.remote(num_cpus=1)
        class CpuBlocker:
            def get(self):
                return 1

        # The actor pins one CPU, so the restart after the injected
        # failure can only acquire a single worker.
        blocker = CpuBlocker.remote()
        trainer.train(max_retries=1)
        assert trainer.worker_group.num_workers == 1
        assert trainer._num_failures == 1

        # Make sure the actor is alive, then free its CPU.
        ray.get(blocker.get.remote())
        ray.kill(blocker)
        time.sleep(1)

        # Trigger scale up.
        trainer.train()
        assert trainer.worker_group.num_workers == 2
        trainer.shutdown(force=True)
def test_num_steps(ray_start_2_cpus, use_local):
    """Tests if num_steps continues training from the subsampled dataset."""

    def data_creator(config):
        # 5 zeros followed by 5 ones, for both train and validation, so the
        # running average reveals exactly which slice of data was consumed.
        train_dataset = [0] * 5 + [1] * 5
        val_dataset = [0] * 5 + [1] * 5
        return DataLoader(train_dataset, batch_size=config["batch_size"]), \
            DataLoader(val_dataset, batch_size=config["batch_size"])

    batch_size = 1
    Operator = TrainingOperator.from_creators(model_creator, optimizer_creator,
                                              data_creator)

    def train_func(self, iterator, info=None):
        # Custom epoch function: return the mean of the items consumed
        # from the (stateful) iterator instead of training a model.
        total_sum = 0
        num_items = 0
        for e in iterator:
            total_sum += e
            num_items += 1
        return {"average": total_sum.item() / num_items}

    TestOperator = get_test_operator(Operator)
    trainer = TorchTrainer(
        training_operator_cls=TestOperator,
        num_workers=2,
        use_local=use_local,
        add_dist_sampler=False,
        config={
            "batch_size": batch_size,
            "custom_func": train_func
        })

    # If num_steps not passed, should do one full epoch.
    result = trainer.train()
    # Average of 5 0s and 5 1s
    assert result["average"] == 0.5
    assert result["epoch"] == 1
    val_result = trainer.validate()
    assert val_result["average"] == 0.5

    # Train again with num_steps.
    result = trainer.train(num_steps=5)
    # 5 zeros
    assert result["average"] == 0
    assert result["epoch"] == 2
    val_result = trainer.validate(num_steps=5)
    assert val_result["average"] == 0

    # Should continue where last train run left off.
    result = trainer.train(num_steps=3)
    # 3 ones.
    assert result["average"] == 1
    assert result["epoch"] == 2
    val_result = trainer.validate(num_steps=3)
    assert val_result["average"] == 1

    # Should continue from last train run, and cycle to beginning.
    result = trainer.train(num_steps=5)
    # 2 ones and 3 zeros.
    assert result["average"] == 0.4
    assert result["epoch"] == 3
    val_result = trainer.validate(num_steps=5)
    assert val_result["average"] == 0.4

    # Should continue, and since num_steps not passed in, just finishes epoch.
    result = trainer.train()
    # 2 zeros and 5 ones.
    assert result["average"] == 5 / 7
    assert result["epoch"] == 3
    val_result = trainer.validate()
    assert val_result["average"] == 5 / 7

    trainer.shutdown()
import torch.nn as nn
import torch.distributed as dist
from torch.utils.data import DataLoader

import ray
from ray.util.sgd.torch import TorchTrainer
from ray.util.sgd.torch.worker_group import RemoteWorkerGroup
from ray.util.sgd.torch.training_operator import TrainingOperator
from ray.util.sgd.torch.examples.train_example import (model_creator,
                                                       optimizer_creator,
                                                       data_creator,
                                                       LinearDataset)

# Default operator built from the shared example creators, used by tests
# that do not need a customized operator.
Operator = TrainingOperator.from_creators(
    model_creator, optimizer_creator, data_creator, loss_creator=nn.MSELoss)


@pytest.fixture
def ray_start_2_cpus():
    # Provides a fresh local 2-CPU Ray cluster for each test.
    address_info = ray.init(num_cpus=2)
    yield address_info
    # The code after the yield will run as teardown code.
    ray.shutdown()
    # Ensure that tests don't ALL fail
    if dist.is_initialized():
        dist.destroy_process_group()


@pytest.fixture