def test_controller_stops_at_num_epochs():
    num_epochs = 10
    model = torch.nn.Linear(2, 2)
    optimizer = torch.optim.Adam(model.parameters())
    params = training.TrainingStateParams(
        num_epochs=num_epochs, early_stopping_threshold=0.0
    )
    controller = training.TrainingStateController(params)
    for _ in range(9):
        assert controller.update_for_epoch(model, optimizer, 0.1, 0.1)
        assert controller.continue_training()
    assert not controller.update_for_epoch(model, optimizer, 0.1, 0.1)
    assert not controller.continue_training()
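
# A minimal sketch (not a test) of the loop contract exercised above:
# update_for_epoch() and continue_training() both report False once
# TrainingStateParams.num_epochs is reached. The constant metrics below are
# placeholders for real train/validation passes; only calls already used in
# these tests are assumed.
def _sketch_training_loop():
    model = torch.nn.Linear(2, 2)
    optimizer = torch.optim.Adam(model.parameters())
    params = training.TrainingStateParams(num_epochs=3, early_stopping_threshold=0.0)
    controller = training.TrainingStateController(params)
    while controller.continue_training():
        train_met, val_met = 0.1, 0.1  # stand-ins for per-epoch metrics
        controller.update_for_epoch(model, optimizer, train_met, val_met)
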
def test_pydrobert_param_optuna_hooks():
    poptuna = pytest.importorskip("pydrobert.param.optuna")
    optuna = pytest.importorskip("optuna")
    assert issubclass(training.TrainingStateParams, poptuna.TunableParameterized)
    global_dict = {"training": training.TrainingStateParams()}
    assert "training.log10_learning_rate" in poptuna.get_param_dict_tunable(global_dict)

    def objective(trial):
        param_dict = poptuna.suggest_param_dict(trial, global_dict)
        return param_dict["training"].log10_learning_rate

    sampler = optuna.samplers.RandomSampler(seed=5)
    study = optuna.create_study(sampler=sampler)
    study.optimize(objective, n_trials=50)
    assert study.best_params["training.log10_learning_rate"] < -5
def test_controller_scheduling():
    model = torch.nn.Linear(2, 2)
    optimizer = torch.optim.Adam(model.parameters())
    p = training.TrainingStateParams(
        early_stopping_threshold=0.1,
        early_stopping_patience=10,
        early_stopping_burnin=1,
        reduce_lr_threshold=0.2,
        reduce_lr_factor=0.5,
        reduce_lr_patience=5,
        reduce_lr_cooldown=2,
        reduce_lr_burnin=4,
    )
    controller = training.TrainingStateController(p)
    controller.load_model_and_optimizer_for_epoch(model, optimizer)
    init_lr = optimizer.param_groups[0]["lr"]
    for _ in range(8):
        assert controller.update_for_epoch(model, optimizer, 1, 1)
        assert controller.continue_training()
    assert np.isclose(optimizer.param_groups[0]["lr"], init_lr)
    assert controller.update_for_epoch(model, optimizer, 1, 1)
    assert np.isclose(optimizer.param_groups[0]["lr"], init_lr / 2)
    for _ in range(6):
        assert controller.update_for_epoch(model, optimizer, 0.89, 0.89)
        assert controller.continue_training()
    assert np.isclose(optimizer.param_groups[0]["lr"], init_lr / 2)
    assert controller.update_for_epoch(model, optimizer, 0.68, 0.68)
    assert controller.continue_training()
    assert np.isclose(optimizer.param_groups[0]["lr"], init_lr / 2)
    for _ in range(9):
        assert controller.update_for_epoch(model, optimizer, 0.68, 0.68)
        assert controller.continue_training()
    assert not controller.update_for_epoch(model, optimizer, 0.68, 0.68)
    assert not controller.continue_training()
    p.early_stopping_threshold = 0.0
    p.reduce_lr_threshold = 0.0
    controller = training.TrainingStateController(p)
    controller.load_model_and_optimizer_for_epoch(model, optimizer)
    init_lr = optimizer.param_groups[0]["lr"]
    for _ in range(20):
        assert controller.update_for_epoch(model, optimizer, 0.0, 0.0)
        assert controller.continue_training()
    assert np.isclose(optimizer.param_groups[0]["lr"], init_lr)
def test_controller_slippery_slope():
    # metrics that keep improving, but never by more than the thresholds in a
    # single epoch, should neither trigger early stopping nor reduce the lr
    model = torch.nn.Linear(2, 2)
    optimizer = torch.optim.Adam(model.parameters())
    p = training.TrainingStateParams(
        early_stopping_threshold=1.0,
        early_stopping_patience=5,
        early_stopping_burnin=0,
        reduce_lr_threshold=1.0,
        reduce_lr_patience=2,
        reduce_lr_factor=0.5,
        reduce_lr_burnin=0,
        reduce_lr_cooldown=0,
    )
    controller = training.TrainingStateController(p)
    controller.load_model_and_optimizer_for_epoch(model, optimizer)
    init_lr = optimizer.param_groups[0]["lr"]
    for step in range(6):
        dev = 3.5 - 0.75 * step
        controller.update_for_epoch(model, optimizer, 1.0, dev)
        assert controller.continue_training(), step
        assert np.isclose(optimizer.param_groups[0]["lr"], init_lr), step
def test_controller_stores_and_retrieves(temp_dir, device, opt_class):
    torch.manual_seed(50)
    model = torch.nn.Linear(2, 2).to(device)
    optimizer = opt_class(model.parameters(), lr=20)
    p = training.TrainingStateParams(seed=5, log10_learning_rate=-1)
    state_csv_path = os.path.join(temp_dir, "a.csv")
    state_dir = os.path.join(temp_dir, "states")
    controller = training.TrainingStateController(
        p, state_csv_path=state_csv_path, state_dir=state_dir
    )
    controller.add_entry("cool_guy_entry", int)
    controller.load_model_and_optimizer_for_epoch(model, optimizer, 0)
    assert optimizer.param_groups[0]["lr"] == 10 ** p.log10_learning_rate
    inp = torch.randn(5, 2, device=device)

    def closure():
        optimizer.zero_grad()
        loss = model(inp).sum()
        loss.backward()
        return loss

    model_2 = torch.nn.Linear(2, 2).to(device)
    optimizer_2 = opt_class(model_2.parameters(), lr=20)
    controller.load_model_and_optimizer_for_epoch(model_2, optimizer_2, 0)
    assert optimizer_2.param_groups[0]["lr"] == 10 ** p.log10_learning_rate
    for parameter_1, parameter_2 in zip(model.parameters(), model_2.parameters()):
        assert parameter_1.device == device
        assert parameter_2.device == device
        assert torch.allclose(parameter_1, parameter_2)
    optimizer.step(closure)
    for parameter_1, parameter_2 in zip(model.parameters(), model_2.parameters()):
        assert not torch.allclose(parameter_1, parameter_2)

    def closure():
        optimizer_2.zero_grad()
        loss = model_2(inp).sum()
        loss.backward()
        return loss

    optimizer_2.step(closure)
    for parameter_1, parameter_2 in zip(model.parameters(), model_2.parameters()):
        assert torch.allclose(parameter_1, parameter_2)
    epoch_info = {
        "epoch": 10,
        "es_resume_cd": 3,
        "es_patience_cd": 4,
        "rlr_resume_cd": 10,
        "rlr_patience_cd": 5,
        "lr": 1e-7,
        "train_met": 10,
        "val_met": 4,
        "cool_guy_entry": 30,
    }
    controller.save_model_and_optimizer_with_info(model, optimizer, epoch_info)
    controller.save_info_to_hist(epoch_info)
    assert controller[10] == epoch_info
    torch.manual_seed(4)
    model_2 = torch.nn.Linear(2, 2).to(device)
    optimizer_2 = opt_class(model_2.parameters(), lr=20)
    controller.load_model_and_optimizer_for_epoch(model_2, optimizer_2, 10)
    for parameter_1, parameter_2 in zip(model.parameters(), model_2.parameters()):
        assert parameter_1.device == device
        assert parameter_2.device == device
        assert torch.allclose(parameter_1, parameter_2)
    optimizer_2.step(closure)
    for parameter_1, parameter_2 in zip(model.parameters(), model_2.parameters()):
        assert not torch.allclose(parameter_1, parameter_2)
    controller = training.TrainingStateController(
        p, state_csv_path=state_csv_path, state_dir=state_dir
    )
    assert "cool_guy_entry" not in controller[10]
    assert controller[10]["es_resume_cd"] == epoch_info["es_resume_cd"]
    controller.add_entry("cool_guy_entry", int)
    assert controller[10] == epoch_info
    model_3 = torch.nn.Linear(2, 2).to(device)
    optimizer_3 = opt_class(model_3.parameters(), lr=20)
    controller.load_model_and_optimizer_for_epoch(model_3, optimizer_3, 10)
    model_3.to(device)
    for parameter_1, parameter_3 in zip(model.parameters(), model_3.parameters()):
        assert parameter_3.device == device
        assert torch.allclose(parameter_1, parameter_3)

    def closure():
        optimizer_3.zero_grad()
        loss = model_3(inp).sum()
        loss.backward()
        return loss

    optimizer_3.step(closure)
    for parameter_1, parameter_2, parameter_3 in zip(
        model.parameters(), model_2.parameters(), model_3.parameters()
    ):
        assert not torch.allclose(parameter_1, parameter_2)
        assert torch.allclose(parameter_2, parameter_3)
    torch.manual_seed(300)
    model_2 = torch.nn.Linear(2, 2).to(device)
    optimizer_2 = opt_class(model_2.parameters(), lr=20)
    epoch_info["epoch"] = 3
    epoch_info["val_met"] = 2
    controller.save_model_and_optimizer_with_info(model_2, optimizer_2, epoch_info)
    controller.save_info_to_hist(epoch_info)
    # by default, load_model_and_optimizer_for_epoch loads last
    controller.load_model_and_optimizer_for_epoch(model_3, optimizer_3)
    for parameter_1, parameter_2, parameter_3 in zip(
        model.parameters(), model_2.parameters(), model_3.parameters()
    ):
        assert torch.allclose(parameter_1, parameter_3)
        assert not torch.allclose(parameter_2, parameter_3)
    # by default, load_model_for_epoch loads best
    controller.load_model_for_epoch(model_3)
    for parameter_1, parameter_2, parameter_3 in zip(
        model.parameters(), model_2.parameters(), model_3.parameters()
    ):
        assert not torch.allclose(parameter_1, parameter_3)
        assert torch.allclose(parameter_2, parameter_3)
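
# The defaults checked at the end of the test above are worth calling out:
# with no epoch argument, load_model_and_optimizer_for_epoch() restores the
# last saved epoch, while load_model_for_epoch() restores the best one. Below
# is a hedged sketch (not a test) of a resume-then-evaluate pattern built only
# on those calls; the path arguments are placeholders supplied by the caller.
def _sketch_resume_then_evaluate(state_csv_path, state_dir):
    model = torch.nn.Linear(2, 2)
    optimizer = torch.optim.Adam(model.parameters())
    controller = training.TrainingStateController(
        training.TrainingStateParams(),
        state_csv_path=state_csv_path,
        state_dir=state_dir,
    )
    # resume from wherever the previous run stopped (the last saved epoch)
    controller.load_model_and_optimizer_for_epoch(model, optimizer)
    # ... run more epochs here ...
    # reload the best weights seen so far before evaluating
    controller.load_model_for_epoch(model)
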
def test_controller_best(temp_dir):
    torch.manual_seed(10)
    model_1 = torch.nn.Linear(100, 100)
    optimizer_1 = torch.optim.Adam(model_1.parameters(), lr=1)
    model_2 = torch.nn.Linear(100, 100)
    optimizer_2 = torch.optim.Adam(model_2.parameters(), lr=2)
    model_3 = torch.nn.Linear(100, 100)
    optimizer_3 = torch.optim.Adam(model_1.parameters(), lr=3)
    training.TrainingStateController.SCIENTIFIC_PRECISION = 5
    controller = training.TrainingStateController(
        training.TrainingStateParams(), state_dir=temp_dir
    )
    assert controller.get_best_epoch() == 0
    controller.update_for_epoch(model_1, optimizer_1, 0.5, 0.5)
    assert controller.get_best_epoch() == 1
    controller.update_for_epoch(model_2, optimizer_2, 1, 1)
    assert controller.get_best_epoch() == 1
    controller.update_for_epoch(model_2, optimizer_2, 1, 1)
    with pytest.raises(IOError):
        # neither best nor last
        controller.load_model_and_optimizer_for_epoch(model_3, optimizer_3, 2)
    controller.load_model_and_optimizer_for_epoch(model_3, optimizer_3, 1)
    for parameter_1, parameter_3 in zip(model_1.parameters(), model_3.parameters()):
        assert torch.allclose(parameter_1, parameter_3)
    assert optimizer_3.param_groups[0]["lr"] == 1
    controller.load_model_and_optimizer_for_epoch(model_3, optimizer_3, 3)
    for parameter_3, parameter_2 in zip(model_3.parameters(), model_2.parameters()):
        assert torch.allclose(parameter_3, parameter_2)
    assert optimizer_3.param_groups[0]["lr"] == 2
    controller.update_for_epoch(model_1, optimizer_1, 0.6, 0.6)
    assert controller.get_best_epoch() == 1
    # round-on-even dictates 0.400005 will round to 0.40000
    controller.update_for_epoch(model_1, optimizer_1, 0.400005, 0.400005)
    assert controller.get_best_epoch() == 5
    controller.load_model_and_optimizer_for_epoch(model_3, optimizer_3, 5)
    for parameter_1, parameter_3 in zip(model_1.parameters(), model_3.parameters()):
        assert torch.allclose(parameter_1, parameter_3)
    with pytest.raises(IOError):
        # no longer the best
        controller.load_model_and_optimizer_for_epoch(model_3, optimizer_3, 1)
    # this block ensures that negligible differences in the loss aren't being
    # considered "better". This is necessary to remain consistent with the
    # truncated floats saved to history
    controller.update_for_epoch(model_1, optimizer_1, 0.4, 0.4)
    # last
    controller.load_model_and_optimizer_for_epoch(model_3, optimizer_3, 6)
    # best (because ~ equal and older)
    controller.load_model_and_optimizer_for_epoch(model_3, optimizer_3, 5)
    # ensure we're keeping track of the last epoch when the saved model name is
    # not unique
    controller = training.TrainingStateController(
        training.TrainingStateParams(saved_model_fmt="model.pt"),
        state_dir=temp_dir,
        warn=False,
    )
    model_1.reset_parameters()
    model_2.reset_parameters()
    controller.update_for_epoch(model_1, optimizer_1, 0.6, 0.6)
    controller.update_for_epoch(model_2, optimizer_2, 0.4, 0.4)
    controller.load_model_and_optimizer_for_epoch(model_3, optimizer_3, 2)
    for parameter_2, parameter_3 in zip(model_2.parameters(), model_3.parameters()):
        assert torch.allclose(parameter_2, parameter_3)
    controller.update_for_epoch(model_1, optimizer_1, 0.5, 0.5)
    controller.load_model_and_optimizer_for_epoch(model_3, optimizer_3, 3)
    for parameter_1, parameter_3 in zip(model_1.parameters(), model_3.parameters()):
        assert torch.allclose(parameter_1, parameter_3)
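
# A short hedged sketch (not a test) of reading results back after training,
# assuming at least one epoch has been recorded: get_best_epoch() picks the
# epoch with the best (rounded) validation metric, and indexing the controller
# by epoch returns its stored info dict, whose keys match the epoch_info dicts
# used in these tests.
def _sketch_report_best(controller):
    best = controller.get_best_epoch()
    info = controller[best]
    return best, info["val_met"]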