def test_dead_trainer(ray_start_2_cpus):  # noqa: F811
    TestOperator = get_test_operator(Operator)
    trainer = TorchTrainer(training_operator_cls=TestOperator, num_workers=2)
    trainer.train(num_steps=1)
    trainer.shutdown()
    with pytest.raises(RuntimeError):
        trainer.train()
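
# The guard exercised by test_dead_trainer: once shutdown() tears down the
# worker group, any further train() call should fail fast with a RuntimeError
# instead of hanging on dead actors. A minimal sketch of that pattern with a
# hypothetical _SketchTrainer class (not Ray's actual implementation):
class _SketchTrainer:
    def __init__(self):
        self._alive = True

    def shutdown(self):
        self._alive = False

    def train(self):
        if not self._alive:
            raise RuntimeError("Trainer is shut down; create a new one.")
        return {"loss": 0.0}
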
def test_scheduler_validate(ray_start_2_cpus):  # noqa: F811
    from torch.optim.lr_scheduler import ReduceLROnPlateau
    TestOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        data_creator,
        scheduler_creator=lambda optimizer, cfg: ReduceLROnPlateau(optimizer),
        loss_creator=lambda config: nn.MSELoss())
    TestOperator = get_test_operator(TestOperator)
    trainer = TorchTrainer(
        scheduler_step_freq="manual", training_operator_cls=TestOperator)
    trainer.update_scheduler(0.5)
    trainer.update_scheduler(0.5)
    assert all(
        trainer.apply_all_operators(
            lambda op: op._schedulers[0].last_epoch == 2))
    trainer.shutdown()
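
# What the manual-stepping assertion above presumably reduces to on each
# worker: with scheduler_step_freq="manual", every
# trainer.update_scheduler(metric) call steps the scheduler once with that
# metric, bumping last_epoch. A standalone PyTorch-only sketch (the
# TorchTrainer plumbing is assumed, not reproduced):
def _sketch_manual_scheduler_steps():
    from torch.optim.lr_scheduler import ReduceLROnPlateau

    model = nn.Linear(1, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scheduler = ReduceLROnPlateau(optimizer)
    scheduler.step(0.5)  # analogous to trainer.update_scheduler(0.5)
    scheduler.step(0.5)
    assert scheduler.last_epoch == 2
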
def test_num_steps(ray_start_2_cpus, use_local):
    """Tests if num_steps continues training from the subsampled dataset."""

    def data_creator(config):
        train_dataset = [0] * 5 + [1] * 5
        val_dataset = [0] * 5 + [1] * 5
        return DataLoader(train_dataset, batch_size=config["batch_size"]), \
            DataLoader(val_dataset, batch_size=config["batch_size"])

    batch_size = 1
    Operator = TrainingOperator.from_creators(model_creator,
                                              optimizer_creator, data_creator)

    def train_func(self, iterator, info=None):
        total_sum = 0
        num_items = 0
        for e in iterator:
            total_sum += e
            num_items += 1
        return {"average": total_sum.item() / num_items}

    TestOperator = get_test_operator(Operator)
    trainer = TorchTrainer(
        training_operator_cls=TestOperator,
        num_workers=2,
        use_local=use_local,
        add_dist_sampler=False,
        config={
            "batch_size": batch_size,
            "custom_func": train_func
        })

    # If num_steps not passed, should do one full epoch.
    result = trainer.train()
    # Average of 5 0s and 5 1s
    assert result["average"] == 0.5
    assert result["epoch"] == 1
    val_result = trainer.validate()
    assert val_result["average"] == 0.5

    # Train again with num_steps.
    result = trainer.train(num_steps=5)
    # 5 zeros
    assert result["average"] == 0
    assert result["epoch"] == 2
    val_result = trainer.validate(num_steps=5)
    assert val_result["average"] == 0

    # Should continue where last train run left off.
    result = trainer.train(num_steps=3)
    # 3 ones.
    assert result["average"] == 1
    assert result["epoch"] == 2
    val_result = trainer.validate(num_steps=3)
    assert val_result["average"] == 1

    # Should continue from last train run, and cycle to beginning.
    result = trainer.train(num_steps=5)
    # 2 ones and 3 zeros.
    assert result["average"] == 0.4
    assert result["epoch"] == 3
    val_result = trainer.validate(num_steps=5)
    assert val_result["average"] == 0.4

    # Should continue, and since num_steps not passed in, just finishes epoch.
    result = trainer.train()
    # 2 zeros and 5 ones.
    assert result["average"] == 5 / 7
    assert result["epoch"] == 3
    val_result = trainer.validate()
    assert val_result["average"] == 5 / 7

    trainer.shutdown()
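
# The num_steps bookkeeping above depends on each worker keeping its
# dataloader iterator alive between train() calls and rebuilding it only
# once the epoch is exhausted, so successive calls pick up mid-dataset and
# eventually cycle. A minimal standalone sketch of that behavior (names are
# illustrative, not the actual TorchTrainer internals):
def _sketch_cycling_iterator(loader):
    it = iter(loader)
    while True:
        try:
            yield next(it)
        except StopIteration:
            it = iter(loader)  # epoch exhausted: cycle back to the start
            yield next(it)


def _sketch_num_steps_cycling():
    stream = _sketch_cycling_iterator([0] * 5 + [1] * 5)
    assert [next(stream) for _ in range(5)] == [0] * 5  # 5 zeros
    assert [next(stream) for _ in range(3)] == [1] * 3  # 3 ones
    # 2 remaining ones, then cycle to 3 zeros -> average 0.4, matching the
    # assertion in test_num_steps above.
    assert sum(next(stream) for _ in range(5)) / 5 == 0.4
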
def test_multi_model_matrix(ray_start_2_cpus, num_workers, use_local):  # noqa: F811
    def train_epoch(self, iterator, info):
        if self.config.get("models", 1) > 1:
            assert len(self.models) == self.config["models"], self.config

        if self.config.get("optimizers", 1) > 1:
            assert len(
                self.optimizers) == self.config["optimizers"], self.config

        if self.config.get("schedulers", 1) > 1:
            assert len(
                self.schedulers) == self.config["schedulers"], self.config
        return {"done": 1}

    def multi_model_creator(config):
        models = []
        for i in range(config.get("models", 1)):
            models += [nn.Linear(1, 1)]
        return models[0] if len(models) == 1 else models

    def multi_optimizer_creator(models, config):
        optimizers = []
        main_model = models[0] if type(models) is list else models
        for i in range(config.get("optimizers", 1)):
            optimizers += [torch.optim.SGD(main_model.parameters(), lr=0.0001)]
        return optimizers[0] if len(optimizers) == 1 else optimizers

    def multi_scheduler_creator(optimizer, config):
        schedulers = []
        main_opt = optimizer[0] if type(optimizer) is list else optimizer
        for i in range(config.get("schedulers", 1)):
            schedulers += [
                torch.optim.lr_scheduler.StepLR(
                    main_opt, step_size=30, gamma=0.1)
            ]
        return schedulers[0] if len(schedulers) == 1 else schedulers

    class MultiModelOperator(TrainingOperator):
        def setup(self, config):
            models = multi_model_creator(config)
            optimizers = multi_optimizer_creator(models, config)
            schedulers = multi_scheduler_creator(optimizers, config)
            train_loader, val_loader = data_creator(config)
            loss = nn.MSELoss()

            self.models, self.optimizers, self.criterion, self.schedulers = \
                self.register(
                    models=models,
                    optimizers=optimizers,
                    schedulers=schedulers,
                    criterion=loss)
            self.register_data(
                train_loader=train_loader, validation_loader=val_loader)

    TestOperator = get_test_operator(MultiModelOperator)

    # Exercise every combination of 1-2 models, optimizers, and schedulers.
    for model_count in range(1, 3):
        for optimizer_count in range(1, 3):
            for scheduler_count in range(1, 3):
                trainer = TorchTrainer(
                    scheduler_step_freq="epoch",
                    training_operator_cls=TestOperator,
                    num_workers=num_workers,
                    use_local=use_local,
                    config={
                        "models": model_count,
                        "optimizers": optimizer_count,
                        "schedulers": scheduler_count,
                        "custom_func": train_epoch
                    })
                trainer.train()
                trainer.shutdown()
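
# The creators in test_multi_model_matrix collapse a one-element list to a
# bare object, so the same matrix loop exercises both the single- and
# multi-object registration paths. A standalone illustration of that
# convention (helper names here are hypothetical):
def _sketch_single_or_list_convention():
    def make(n):
        models = [nn.Linear(1, 1) for _ in range(n)]
        return models[0] if len(models) == 1 else models

    assert isinstance(make(1), nn.Linear)  # one object: returned bare
    assert isinstance(make(2), list)  # several objects: returned as a list
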
def test_multi_model(ray_start_2_cpus, num_workers, use_local):
    def train(*, model=None, criterion=None, optimizer=None, iterator=None):
        model.train()
        train_loss = 0
        correct = 0
        total = 0
        for batch_idx, (inputs, targets) in enumerate(iterator):
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

        return {
            "accuracy": correct / total,
            "train_loss": train_loss / (batch_idx + 1)
        }

    def train_epoch(self, iterator, info):
        result = {}
        data = list(iterator)
        for i, (model, optimizer) in enumerate(
                zip(self.models, self.optimizers)):
            result[f"model_{i}"] = train(
                model=model,
                criterion=self.criterion,
                optimizer=optimizer,
                iterator=iter(data))
        return result

    class MultiModelOperator(TrainingOperator):
        def setup(self, config):
            models = nn.Linear(1, 1), nn.Linear(1, 1)
            opts = [
                torch.optim.SGD(model.parameters(), lr=0.0001)
                for model in models
            ]
            loss = nn.MSELoss()

            train_dataloader, val_dataloader = data_creator(config)
            self.models, self.optimizers, self.criterion = self.register(
                models=models, optimizers=opts, criterion=loss)
            self.register_data(
                train_loader=train_dataloader,
                validation_loader=val_dataloader)

    TestOperator = get_test_operator(MultiModelOperator)

    trainer1 = TorchTrainer(
        config={"custom_func": train_epoch},
        training_operator_cls=TestOperator,
        num_workers=num_workers,
        use_local=use_local,
        use_gpu=False,
    )
    trainer1.train()
    state = trainer1.state_dict()

    models1 = trainer1.get_model()

    trainer1.shutdown()

    trainer2 = TorchTrainer(
        config={"custom_func": train_epoch},
        training_operator_cls=TestOperator,
        num_workers=num_workers,
        use_local=use_local,
        use_gpu=False,
    )
    trainer2.load_state_dict(state)

    models2 = trainer2.get_model()

    for model_1, model_2 in zip(models1, models2):
        model1_state_dict = model_1.state_dict()
        model2_state_dict = model_2.state_dict()

        assert set(model1_state_dict.keys()) == set(model2_state_dict.keys())

        for k in model1_state_dict:
            assert torch.equal(model1_state_dict[k], model2_state_dict[k])

    trainer2.shutdown()
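
# The round-trip check in test_multi_model boils down to: snapshot weights
# from one set of models, load them into fresh models, and compare tensors
# key by key. A PyTorch-only sketch of the same comparison, with no
# TorchTrainer involved:
def _sketch_state_dict_round_trip():
    src, dst = nn.Linear(1, 1), nn.Linear(1, 1)
    dst.load_state_dict(src.state_dict())  # restore weights into dst
    for k, v in src.state_dict().items():
        assert torch.equal(v, dst.state_dict()[k])
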