def run_experiment(experiment_tags, data_dir, results_dir, start_fresh=False, use_cuda=False, workers=None, experiments_file=None, *args, **kwargs):
    """Load an experiment configuration, build its components and run training.

    Parameters
    ----------
    experiment_tags : tags selecting the experiment in *experiments_file*;
        also joined with '_' to name the results sub-directory.
    data_dir : existing directory holding the training data.
    results_dir : existing directory under which results are written.
    start_fresh : when True, delete any previous results before training.
    use_cuda : move the model to the GPU and enable cudnn benchmarking.
    workers : forwarded to the experiment config parser.
    experiments_file : file the experiment configuration is loaded from.
    *args, **kwargs : forwarded to the trainer; **kwargs are merged into the
        trainer parameters parsed from the configuration.

    Raises
    ------
    RuntimeError
        If *data_dir* or *results_dir* does not exist.
    """
    if not os.path.exists(data_dir):
        raise RuntimeError('Cannot find data_dir directory: {}'.format(data_dir))
    if not os.path.exists(results_dir):
        raise RuntimeError('Cannot find results_dir directory: {}'.format(results_dir))

    config = load_experiment_config(experiments_file, experiment_tags)
    logger.info(config)
    model, optimizer, trainer, trainer_params = experiment_config_parser(
        config, workers=workers, data_dir=data_dir)

    exp_dir = os.path.join(results_dir, '_'.join(experiment_tags))
    manager = ExperimentManager(exp_dir, model, optimizer)
    if start_fresh:
        logger.info('Starting fresh option enabled. Clearing all previous results...')
        manager.delete_dirs()
    manager.make_dirs()

    if use_cuda:
        manager.model = manager.model.cuda()
        import torch.backends.cudnn as cudnn
        cudnn.benchmark = True

    # Resume from the latest saved model state, if any exists.
    resume_iter = manager.get_last_model_iteration()
    if resume_iter > 0:
        logger.info('Continue experiment from iteration: {}'.format(resume_iter))
        manager.load_train_state(resume_iter)

    trainer_params.update(kwargs)
    trainer(manager, start_iter=resume_iter, use_cuda=use_cuda, *args, **trainer_params)
def test_train_with_nan_loss(tmp_path):
    """Training must raise ValueError once the loss becomes NaN."""

    class NanLoss(torch.nn.Module):
        # Loss module whose output is always NaN, regardless of the inputs.
        def __init__(self):
            super(NanLoss, self).__init__()

        def forward(self, Ypred, Y, W=None):
            return Ypred.mean() * float('nan')

    exp_dir = str(tmp_path / "testexp")
    cifar_dir = str(tmp_path / "tmpdata")
    n_classes = 10

    net = SimpleTestingModel(n_classes)
    sgd = torch.optim.SGD(params=net.parameters(), lr=0.01)
    manager = ExperimentManager(exp_dir, net, sgd)
    manager.make_dirs()

    train_loader, test_loader = get_cifar_data_loaders(CIFAR10, cifar_dir, 40000, 2, 0)

    with pytest.raises(ValueError) as excinfo:
        train(manager, train_loader, test_loader, start_iter=1, disp_iter=1,
              save_iter=1, valid_iter=1, use_cuda=False, loss=NanLoss())
    assert "Loss became NaN during iteration" in str(excinfo.value)
def test_train(tmp_path):
    """End-to-end smoke test of train() using a tiny inline conv model."""
    # NOTE(review): another test_train() is defined later in this module and
    # shadows this one at pytest collection time — confirm and rename one.

    class TestModel(torch.nn.Module):
        # 1x1 conv followed by a global average pool -> (batch, klasses) logits.
        def __init__(self, klasses):
            super(TestModel, self).__init__()
            self.conv = torch.nn.Conv2d(3, klasses, 1)
            self.avgpool = torch.nn.AvgPool2d(32)
            self.klasses = klasses

        def forward(self, x):
            return self.avgpool(self.conv(x)).reshape(x.shape[0], self.klasses)

    exp_dir = str(tmp_path / "testexp")
    cifar_dir = str(tmp_path / "tmpdata")
    n_classes = 10

    net = TestModel(n_classes)
    sgd = torch.optim.SGD(params=net.parameters(), lr=0.01)
    manager = ExperimentManager(exp_dir, net, sgd)
    manager.make_dirs()

    train_loader, test_loader = get_cifar_data_loaders(CIFAR10, cifar_dir, 40000, 2, 0)
    train(manager, train_loader, test_loader, start_iter=39999, disp_iter=1,
          save_iter=1, valid_iter=1, use_cuda=False, loss=CrossEntropyLossTF())
def test_train_networks(tmp_path, network, use_cuda):
    """Run a full experiment for *network* on CIFAR10 via run_experiment()
    and verify that the logged training losses never contain NaN.

    Parameters (pytest fixtures/parametrization):
        tmp_path: per-test temporary directory.
        network: network tag looked up in resources/experiments.json.
        use_cuda: whether to train on the GPU.
    """
    exptags = ["cifar10", network, "epoch5"]
    exp_file = str(Path(__file__).parent / "resources" / "experiments.json")
    data_dir = str(tmp_path / "tmpdata")
    results_dir = str(tmp_path / "resdir")
    os.makedirs(data_dir)
    os.makedirs(results_dir)

    run_experiment(
        experiment_tags=exptags,
        data_dir=data_dir,
        results_dir=results_dir,
        start_fresh=True,
        use_cuda=use_cuda,
        workers=None,
        experiments_file=exp_file,
        disp_iter=1,
        save_iter=5,
        valid_iter=5,
    )

    # run_experiment names the results sub-directory by joining the tags.
    experiment_dir = os.path.join(results_dir, '_'.join(exptags))
    assert os.path.exists(experiment_dir)

    manager = ExperimentManager(experiment_dir)
    scalars_file = os.path.join(manager.log_dir, "scalars.json")
    assert os.path.exists(scalars_file)
    with open(scalars_file, "r") as f:
        results = json.load(f)

    # NaN is the only value unequal to itself, so `val != val` detects NaN.
    # Use a generator instead of materializing a throwaway list in any().
    assert not any(val != val for _, _, val in results["train_loss"])
def test_train(tmp_path):
    """Smoke-test train() starting near the end of the schedule (iter 39999)."""
    exp_dir = str(tmp_path / "testexp")
    cifar_dir = str(tmp_path / "tmpdata")
    n_classes = 10

    net = SimpleTestingModel(n_classes)
    sgd = torch.optim.SGD(params=net.parameters(), lr=0.01)
    manager = ExperimentManager(exp_dir, net, sgd)
    manager.make_dirs()

    loaders = get_cifar_data_loaders(CIFAR10, cifar_dir, 40000, 2, 0)
    train_loader, test_loader = loaders
    train(manager, train_loader, test_loader, start_iter=39999, disp_iter=1,
          save_iter=1, valid_iter=1, use_cuda=False, loss=CrossEntropyLossTF())
def test_experiment_manager(tmp_path):
    """Exercise ExperimentManager: directory lifecycle, model/optimizer state
    saving, loading and resuming from the last saved iteration."""
    exp_dir = tmp_path / "test_exp_dir"
    man = ExperimentManager(str(exp_dir))
    assert man.model is None
    assert man.optimizer is None

    # Directory lifecycle: create, verify, delete, verify removal.
    man.make_dirs()
    assert exp_dir.exists()
    assert (exp_dir / "log").exists()
    assert (exp_dir / "state" / "model").exists()
    assert (exp_dir / "state" / "optimizer").exists()
    assert man.all_dirs_exists()
    assert man.any_dir_exists()
    man.delete_dirs()
    assert not exp_dir.exists()
    assert not (exp_dir / "log").exists()
    assert not (exp_dir / "state" / "model").exists()
    assert not (exp_dir / "state" / "optimizer").exists()
    assert not man.all_dirs_exists()
    assert not man.any_dir_exists()
    man.make_dirs()

    # Model-only state: save two snapshots, reload the first one.
    man.model = torch.nn.Conv2d(2, 1, 3)
    w = man.model.weight.clone()
    man.save_model_state(0)
    with torch.no_grad():
        man.model.weight.zero_()
    man.save_model_state(100)
    assert not man.model.weight.equal(w)
    assert man.get_last_model_iteration() == 100
    man.load_model_state(0)
    assert man.model.weight.equal(w)

    # Full training state (model + optimizer): save, step, save again.
    optimizer = torch.optim.SGD(man.model.parameters(), lr=0.01, momentum=0.1)
    man.optimizer = optimizer
    man.save_train_state(100)
    w = man.model.weight.clone()
    sd = man.optimizer.state_dict().copy()
    man.model.train()
    x = torch.ones(5, 2, 5, 5)
    x.requires_grad = True
    y = torch.ones(5, 1, 3, 3)
    y.requires_grad = False
    ypred = man.model(x)
    loss = torch.nn.MSELoss()(ypred, y)
    man.optimizer.zero_grad()
    loss.backward()
    man.optimizer.step()
    man.save_train_state(101)
    assert not man.model.weight.equal(w)
    assert sd != man.optimizer.state_dict()
    w2 = man.model.weight.clone()
    sd2 = man.optimizer.state_dict().copy()

    # Loading iteration 100 must restore the pre-step weights and optimizer.
    man.load_train_state(100)
    assert man.model.weight.equal(w)
    assert sd == man.optimizer.state_dict()

    # Loading the last state (101) must restore the post-step snapshot.
    man.load_last_train_state()
    assert not man.model.weight.equal(w)
    assert sd != man.optimizer.state_dict()
    assert man.model.weight.equal(w2)

    def retrieve_mom_buffer(sd):
        # Return the momentum buffer of the first parameter, or an empty
        # tensor when the optimizer holds no state yet.
        keys = list(sd['state'])
        if len(keys) == 0:
            # BUGFIX: was torch.zero(0), which does not exist (AttributeError);
            # torch.zeros(0) is the intended empty tensor.
            return torch.zeros(0)
        else:
            return sd['state'][keys[0]]['momentum_buffer']

    assert torch.equal(retrieve_mom_buffer(sd2),
                       retrieve_mom_buffer(man.optimizer.state_dict()))