def test_save_and_restore(ray_start_2_cpus, num_replicas):  # noqa: F811
    """Checkpoint a trained model, restore it in a fresh trainer, and verify
    every parameter tensor survives the round trip unchanged."""
    trainer1 = PyTorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        num_replicas=num_replicas)
    trainer1.train()

    # Save into a fresh temp dir so parallel test runs cannot collide.
    filename = os.path.join(tempfile.mkdtemp(), "checkpoint")
    trainer1.save(filename)

    model1 = trainer1.get_model()
    trainer1.shutdown()

    trainer2 = PyTorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        num_replicas=num_replicas)
    trainer2.restore(filename)
    os.remove(filename)
    model2 = trainer2.get_model()

    model1_state_dict = model1.state_dict()
    model2_state_dict = model2.state_dict()

    assert set(model1_state_dict.keys()) == set(model2_state_dict.keys())

    for k in model1_state_dict:
        assert torch.equal(model1_state_dict[k], model2_state_dict[k])

    # BUGFIX: trainer2 was previously leaked; release its workers the same
    # way the multi-model save/restore test does.
    trainer2.shutdown()
def train_example(num_replicas=1, num_epochs=5, use_gpu=False, use_fp16=False,
                  test_mode=False):
    """Train ResNet18 on CIFAR with an LR scheduler, optional GPU and fp16."""
    trainer = PyTorchTrainer(
        ResNet18,
        cifar_creator,
        optimizer_creator,
        nn.CrossEntropyLoss,
        scheduler_creator=scheduler_creator,
        initialization_hook=initialization_hook,
        num_replicas=num_replicas,
        config={TEST_MODE: test_mode},
        use_gpu=use_gpu,
        batch_size=16 if test_mode else 512,
        backend="nccl" if use_gpu else "gloo",
        scheduler_step_freq="epoch",
        use_fp16=use_fp16)
    for _ in range(num_epochs):
        # Increase `max_retries` to turn on fault tolerance.
        print(trainer.train(max_retries=0))
    print(trainer.validate())
    trainer.shutdown()
    print("success!")
def test_multi_model(ray_start_2_cpus, num_replicas):  # noqa: F811
    """Save/restore round trip for a trainer that manages two models."""

    def custom_train(config, models, dataloader, criterion, optimizers,
                     **kwargs):
        # Train each (model, optimizer) pair and report per-model stats.
        stats = {}
        for idx, (mod, opt) in enumerate(zip(models, optimizers)):
            stats["model_{}".format(idx)] = train(config, mod, dataloader,
                                                  criterion, opt)
        return stats

    def multi_model_creator(config):
        return nn.Linear(1, 1), nn.Linear(1, 1)

    def multi_optimizer_creator(models, config):
        opts = [
            torch.optim.SGD(mod.parameters(), lr=0.0001) for mod in models
        ]
        return opts[0], opts[1]

    trainer1 = PyTorchTrainer(
        multi_model_creator,
        data_creator,
        multi_optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        train_function=custom_train,
        num_replicas=num_replicas)
    trainer1.train()

    checkpoint_path = os.path.join(tempfile.mkdtemp(), "checkpoint")
    trainer1.save(checkpoint_path)

    models1 = trainer1.get_model()
    trainer1.shutdown()

    trainer2 = PyTorchTrainer(
        multi_model_creator,
        data_creator,
        multi_optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        num_replicas=num_replicas)
    trainer2.restore(checkpoint_path)
    os.remove(checkpoint_path)
    models2 = trainer2.get_model()

    # Every restored model must match its saved counterpart exactly.
    for saved, restored in zip(models1, models2):
        state_a = saved.state_dict()
        state_b = restored.state_dict()
        assert set(state_a.keys()) == set(state_b.keys())
        for key in state_a:
            assert torch.equal(state_a[key], state_b[key])

    trainer2.shutdown()
def train_example(num_replicas=1, use_gpu=False):
    """Run one training epoch on the gloo backend, then clean up."""
    trainer = PyTorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        num_replicas=num_replicas,
        use_gpu=use_gpu,
        backend="gloo")
    trainer.train()
    trainer.shutdown()
    print("success!")
def test_multi_model_matrix(ray_start_2_cpus, num_replicas):  # noqa: F811
    """Exercise every combination of 1-2 models/optimizers/schedulers and
    verify the worker receives the requested multiplicity of each."""

    def custom_train(config, model, dataloader, criterion, optimizer,
                     scheduler):
        # Each component is a bare object when count == 1 and a list
        # otherwise; check the list lengths match the configured counts.
        if config.get("models", 1) > 1:
            assert len(model) == config["models"], config
        if config.get("optimizers", 1) > 1:
            assert len(optimizer) == config["optimizers"], config
        if config.get("schedulers", 1) > 1:
            assert len(scheduler) == config["schedulers"], config
        return {"done": 1}

    def multi_model_creator(config):
        models = [nn.Linear(1, 1) for _ in range(config.get("models", 1))]
        return models[0] if len(models) == 1 else models

    def multi_optimizer_creator(models, config):
        # `models` is a list only when more than one model was created.
        main_model = models[0] if isinstance(models, list) else models
        optimizers = [
            torch.optim.SGD(main_model.parameters(), lr=0.0001)
            for _ in range(config.get("optimizers", 1))
        ]
        return optimizers[0] if len(optimizers) == 1 else optimizers

    def multi_scheduler_creator(optimizer, config):
        main_opt = optimizer[0] if isinstance(optimizer, list) else optimizer
        schedulers = [
            torch.optim.lr_scheduler.StepLR(main_opt, step_size=30, gamma=0.1)
            for _ in range(config.get("schedulers", 1))
        ]
        return schedulers[0] if len(schedulers) == 1 else schedulers

    for model_count in range(1, 3):
        for optimizer_count in range(1, 3):
            for scheduler_count in range(1, 3):
                trainer = PyTorchTrainer(
                    multi_model_creator,
                    data_creator,
                    multi_optimizer_creator,
                    loss_creator=nn.MSELoss,
                    scheduler_creator=multi_scheduler_creator,
                    train_function=custom_train,
                    num_replicas=num_replicas,
                    config={
                        "models": model_count,
                        "optimizers": optimizer_count,
                        "schedulers": scheduler_count
                    })
                trainer.train()
                trainer.shutdown()
def train_example(num_replicas=1, use_gpu=False):
    """Train for one epoch with an explicit per-replica resource request."""
    replica_resources = Resources(
        num_cpus=1, num_gpus=int(use_gpu), resources={})
    trainer = PyTorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        num_replicas=num_replicas,
        resources_per_replica=replica_resources)
    trainer.train()
    trainer.shutdown()
def test_scheduler_validate(ray_start_2_cpus):  # noqa: F811
    """update_scheduler should step every worker's scheduler once per call."""
    from torch.optim.lr_scheduler import ReduceLROnPlateau

    # NOTE(review): removed an unused inner `custom_train` helper — it was
    # never passed to the trainer (no `train_function=`), so it was dead code.
    trainer = PyTorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        scheduler_creator=lambda optimizer, cfg: ReduceLROnPlateau(optimizer))
    trainer.update_scheduler(0.5)
    trainer.update_scheduler(0.5)
    # Two updates -> ReduceLROnPlateau.last_epoch == 2 on every worker.
    assert all(
        trainer.apply_all_workers(lambda r: r.schedulers[0].last_epoch == 2))
    trainer.shutdown()
def test_scheduler_freq(ray_start_2_cpus, scheduler_freq):  # noqa: F811
    """Train several epochs with a scheduler attached and check results."""

    # NOTE(review): `custom_train` is never handed to the trainer (no
    # `train_function=` below), so its SCHEDULER_STEP assertion never runs.
    # TODO: wire it up via train_function/scheduler_step_freq or delete it.
    def custom_train(config, model, dataloader, criterion, optimizer,
                     scheduler):
        assert config[SCHEDULER_STEP] == scheduler_freq
        return {"done": 1}

    def scheduler_creator(optimizer, config):
        return torch.optim.lr_scheduler.StepLR(
            optimizer, step_size=30, gamma=0.1)

    trainer = PyTorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        scheduler_creator=scheduler_creator)

    for _ in range(3):
        # BUGFIX: the original discarded the `["train_loss"]` subscript;
        # assert explicitly that each epoch's stats report a training loss.
        assert "train_loss" in trainer.train()
    trainer.shutdown()
def train_example(num_replicas=1, use_gpu=False, test_mode=False):
    """Train ResNet18 on CIFAR for five epochs with custom train/validate
    functions, then report validation stats."""
    trainer = PyTorchTrainer(
        ResNet18,
        cifar_creator,
        optimizer_creator,
        nn.CrossEntropyLoss,
        initialization_hook=initialization_hook,
        train_function=train,
        validation_function=validate,
        num_replicas=num_replicas,
        config={"test_mode": test_mode},
        use_gpu=use_gpu,
        batch_size=16 if test_mode else 512,
        backend="nccl" if use_gpu else "gloo")
    for _ in range(5):
        print(trainer.train())
    print(trainer.validate())
    trainer.shutdown()
    print("success!")