Example #1
def run_experiment(experiment_tags, data_dir, results_dir, start_fresh=False, use_cuda=False, workers=None,
                   experiments_file=None, *args, **kwargs):
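    """Run the experiment identified by experiment_tags.

    Both data_dir and results_dir must already exist. The configuration is
    loaded from experiments_file, turned into a model/optimizer/trainer by
    experiment_config_parser, and managed in an ExperimentManager rooted at
    <results_dir>/<tag1>_<tag2>_... With start_fresh any previous results are
    deleted; otherwise training resumes from the last saved iteration.
    Extra keyword arguments are merged into the trainer parameters.
    """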
    if not os.path.exists(data_dir):
        raise RuntimeError('Cannot find data_dir directory: {}'.format(data_dir))

    if not os.path.exists(results_dir):
        raise RuntimeError('Cannot find results_dir directory: {}'.format(results_dir))

    cfg = load_experiment_config(experiments_file, experiment_tags)
    logger.info(cfg)

    model, optimizer, trainer, trainer_params = experiment_config_parser(cfg, workers=workers, data_dir=data_dir)

    experiment_dir = os.path.join(results_dir, '_'.join(experiment_tags))
    manager = ExperimentManager(experiment_dir, model, optimizer)
    if start_fresh:
        logger.info('Starting fresh option enabled. Clearing all previous results...')
        manager.delete_dirs()
    manager.make_dirs()

    if use_cuda:
        manager.model = manager.model.cuda()
        import torch.backends.cudnn as cudnn
        cudnn.benchmark = True

    last_iter = manager.get_last_model_iteration()
    if last_iter > 0:
        logger.info('Continue experiment from iteration: {}'.format(last_iter))
        manager.load_train_state(last_iter)

    trainer_params.update(kwargs)

    trainer(manager, start_iter=last_iter, use_cuda=use_cuda, *args, **trainer_params)
Example #2
def test_train_with_nan_loss(tmp_path):
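    """train() should raise once the loss turns NaN.

    NanLoss below returns NaN for every batch, so the first iteration is
    expected to raise a ValueError containing "Loss became NaN during iteration".
    """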
    class NanLoss(torch.nn.Module):
        def __init__(self):
            super(NanLoss, self).__init__()

        def forward(self, Ypred, Y, W=None):
            return Ypred.mean() * float('nan')

    expdir = str(tmp_path / "testexp")
    tmp_data_dir = str(tmp_path / "tmpdata")
    num_klasses = 10

    model = SimpleTestingModel(num_klasses)
    optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)
    manager = ExperimentManager(expdir, model, optimizer)
    manager.make_dirs()

    train_loader, test_loader = get_cifar_data_loaders(CIFAR10, tmp_data_dir,
                                                       40000, 2, 0)
    loss = NanLoss()

    with pytest.raises(ValueError) as e:
        train(manager,
              train_loader,
              test_loader,
              start_iter=1,
              disp_iter=1,
              save_iter=1,
              valid_iter=1,
              use_cuda=False,
              loss=loss)
    assert "Loss became NaN during iteration" in str(e.value)
Example #3
def test_train(tmp_path):
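    """Smoke test: run train() on CIFAR10 with a tiny inline conv model.

    start_iter is 39999 with display/save/validation every iteration, so
    presumably only the tail of the training schedule is executed here.
    """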
    expdir = str(tmp_path / "testexp")
    tmp_data_dir = str(tmp_path / "tmpdata")
    num_klasses = 10

    class TestModel(torch.nn.Module):
        def __init__(self, klasses):
            super(TestModel, self).__init__()
            self.conv = torch.nn.Conv2d(3, klasses, 1)
            self.avgpool = torch.nn.AvgPool2d(32)
            self.klasses = klasses

        def forward(self, x):
            return self.avgpool(self.conv(x)).reshape(x.shape[0], self.klasses)

    model = TestModel(num_klasses)
    optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)
    manager = ExperimentManager(expdir, model, optimizer)
    manager.make_dirs()

    train_loader, test_loader = get_cifar_data_loaders(CIFAR10, tmp_data_dir,
                                                       40000, 2, 0)
    loss = CrossEntropyLossTF()

    train(manager,
          train_loader,
          test_loader,
          start_iter=39999,
          disp_iter=1,
          save_iter=1,
          valid_iter=1,
          use_cuda=False,
          loss=loss)
Example #4
def test_train_networks(tmp_path, network, use_cuda):
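    """End-to-end run of run_experiment for one network configuration.

    Builds the experiment from resources/experiments.json using the
    ("cifar10", network, "epoch5") tags, then asserts that the logged
    train_loss values contain no NaNs (val != val is only true for NaN).
    """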
    exptags = ["cifar10", network, "epoch5"]
    exp_file = str(Path(__file__).parent / "resources" / "experiments.json")
    data_dir = str(tmp_path / "tmpdata")
    results_dir = str(tmp_path / "resdir")
    os.makedirs(data_dir)
    os.makedirs(results_dir)
    run_experiment(
        experiment_tags=exptags,
        data_dir=data_dir,
        results_dir=results_dir,
        start_fresh=True,
        use_cuda=use_cuda,
        workers=None,
        experiments_file=exp_file,
        disp_iter=1,
        save_iter=5,
        valid_iter=5,
    )
    experiment_dir = os.path.join(results_dir, '_'.join(exptags))
    assert os.path.exists(experiment_dir)
    manager = ExperimentManager(experiment_dir)
    scalars_file = os.path.join(manager.log_dir, "scalars.json")
    assert os.path.exists(scalars_file)
    with open(scalars_file, "r") as f:
        results = json.load(f)
    # no results should hold any NaN values
    assert not any([val != val for t, i, val in results["train_loss"]])
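The network and use_cuda arguments of test_train_networks are not built-in pytest fixtures, so the test is presumably parametrized elsewhere. A hypothetical decorator stack is sketched below; the network tags and the CUDA values are placeholders and would need to match the entries defined in resources/experiments.json.

import pytest

# Hypothetical parametrization for Example #4; the real tag names and values are unknown.
@pytest.mark.parametrize("use_cuda", [False])                 # a True case would require a GPU
@pytest.mark.parametrize("network", ["simplecnn", "resnet"])  # placeholder network tags
def test_train_networks(tmp_path, network, use_cuda):
    ...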
Example #5
def test_train(tmp_path):
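    """Same smoke test as Example #3, but using the SimpleTestingModel helper instead of an inline model."""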
    expdir = str(tmp_path / "testexp")
    tmp_data_dir = str(tmp_path / "tmpdata")
    num_klasses = 10

    model = SimpleTestingModel(num_klasses)
    optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)
    manager = ExperimentManager(expdir, model, optimizer)
    manager.make_dirs()

    train_loader, test_loader = get_cifar_data_loaders(CIFAR10, tmp_data_dir,
                                                       40000, 2, 0)
    loss = CrossEntropyLossTF()

    train(manager,
          train_loader,
          test_loader,
          start_iter=39999,
          disp_iter=1,
          save_iter=1,
          valid_iter=1,
          use_cuda=False,
          loss=loss)
Example #6
def test_experiment_manager(tmp_path):
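    """Exercise the ExperimentManager lifecycle end to end.

    Covers directory creation and deletion, saving/loading model and optimizer
    state at specific iterations, looking up the last saved iteration, and
    restoring the most recent train state (including the SGD momentum buffer).
    """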
    exp_dir = tmp_path / "test_exp_dir"
    man = ExperimentManager(str(exp_dir))
    assert man.model is None
    assert man.optimizer is None

    man.make_dirs()
    assert exp_dir.exists()
    assert (exp_dir / "log").exists()
    assert (exp_dir / "state" / "model").exists()
    assert (exp_dir / "state" / "optimizer").exists()
    assert man.all_dirs_exists()
    assert man.any_dir_exists()

    man.delete_dirs()
    assert not exp_dir.exists()
    assert not (exp_dir / "log").exists()
    assert not (exp_dir / "state" / "model").exists()
    assert not (exp_dir / "state" / "optimizer").exists()
    assert not man.all_dirs_exists()
    assert not man.any_dir_exists()

    man.make_dirs()

    man.model = torch.nn.Conv2d(2, 1, 3)
    w = man.model.weight.clone()
    man.save_model_state(0)
    with torch.no_grad():
        man.model.weight.zero_()
    man.save_model_state(100)
    assert not man.model.weight.equal(w)
    assert man.get_last_model_iteration() == 100

    man.load_model_state(0)
    assert man.model.weight.equal(w)

    optimizer = torch.optim.SGD(man.model.parameters(), lr=0.01, momentum=0.1)
    man.optimizer = optimizer

    man.save_train_state(100)

    w = man.model.weight.clone()
    sd = man.optimizer.state_dict().copy()

    man.model.train()

    x = torch.ones(5, 2, 5, 5)
    x.requires_grad = True
    y = torch.ones(5, 1, 3, 3)
    y.requires_grad = False

    ypred = man.model(x)
    loss = torch.nn.MSELoss()(ypred, y)
    man.optimizer.zero_grad()
    loss.backward()
    man.optimizer.step()

    man.save_train_state(101)
    assert not man.model.weight.equal(w)
    assert sd != man.optimizer.state_dict()
    w2 = man.model.weight.clone()
    sd2 = man.optimizer.state_dict().copy()

    man.load_train_state(100)
    assert man.model.weight.equal(w)
    assert sd == man.optimizer.state_dict()

    man.load_last_train_state() # should be 101
    assert not man.model.weight.equal(w)
    assert sd != man.optimizer.state_dict()
    assert man.model.weight.equal(w2)

    def retrieve_mom_buffer(sd):
        # Pull the momentum buffer of the first parameter out of an SGD state dict.
        keys = list(sd['state'].keys())
        if len(keys) == 0:
            return torch.zeros(0)  # no per-parameter state recorded yet
        return sd['state'][keys[0]]['momentum_buffer']

    # After load_last_train_state() the optimizer should hold the momentum buffer
    # saved at iteration 101 (captured in sd2 above).
    assert torch.equal(retrieve_mom_buffer(sd2), retrieve_mom_buffer(man.optimizer.state_dict()))