Example #1
def test_resize(ray_start_2_cpus):  # noqa: F811
    if not dist.is_available():
        return

    def single_loader(batch_size, config):
        train_dataset = LinearDataset(2, 5, size=1000000)
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=batch_size)
        return train_loader

    def step_with_fail(self):
        worker_stats = [w.step.remote() for w in self.workers]
        if self._num_failures < 1:
            time.sleep(1)  # Make sure the step fails correctly.
            self.workers[0].__ray_kill__()
        success = check_for_failure(worker_stats)
        return success, worker_stats

    with patch.object(PyTorchTrainer, "_train_step", step_with_fail):
        trainer1 = PyTorchTrainer(model_creator,
                                  single_loader,
                                  optimizer_creator,
                                  batch_size=100000,
                                  loss_creator=lambda config: nn.MSELoss(),
                                  num_replicas=2)

        # Occupy the CPU freed by the killed worker so the trainer cannot
        # recover its full replica count and must resize down to one worker.
        @ray.remote
        def try_test():
            import time
            time.sleep(100)

        try_test.remote()
        trainer1.train(max_retries=1)
        assert len(trainer1.workers) == 1
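
The snippets in this listing rely on creator helpers (model_creator, data_creator, optimizer_creator, LinearDataset, ...) defined elsewhere in the test module. A minimal sketch of what they might look like; the layer shapes, learning rate, dataset sizes, and return conventions here are assumptions for illustration, not the project's actual definitions:

import torch
import torch.nn as nn


class LinearDataset(torch.utils.data.Dataset):
    """Points sampled from y = a * x + b plus Gaussian noise (assumed)."""

    def __init__(self, a, b, size=1000):
        self.x = torch.randn(size, 1)
        self.y = a * self.x + b + 0.1 * torch.randn(size, 1)

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return len(self.x)


def model_creator(config):
    # A single linear layer is enough to fit the synthetic linear data.
    return nn.Linear(1, 1)


def optimizer_creator(model, config):
    return torch.optim.SGD(model.parameters(), lr=config.get("lr", 1e-4))


def data_creator(batch_size, config):
    # One plausible form returns a (train_loader, validation_loader) pair;
    # some examples in this listing instead return a bare Dataset or a
    # single DataLoader, depending on the API version they target.
    train_loader = torch.utils.data.DataLoader(
        LinearDataset(2, 5), batch_size=batch_size)
    validation_loader = torch.utils.data.DataLoader(
        LinearDataset(2, 5, size=400), batch_size=batch_size)
    return train_loader, validation_loader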
Example #2
def test_fail_twice(ray_start_2_cpus):  # noqa: F811
    if not dist.is_available():
        return

    def single_loader(config):
        return LinearDataset(2, 5, size=1000000)

    def step_with_fail(self):
        worker_stats = [w.step.remote() for w in self.workers]
        if self._num_failures < 2:
            time.sleep(1)
            self.workers[0].__ray_kill__()
        success = check_for_failure(worker_stats)
        return success, worker_stats

    with patch.object(PyTorchTrainer, "_train_step", step_with_fail):
        trainer1 = PyTorchTrainer(
            model_creator,
            single_loader,
            optimizer_creator,
            batch_size=100000,
            loss_creator=lambda config: nn.MSELoss(),
            num_replicas=2)

        trainer1.train(max_retries=2)
Example #3
def test_test_mode(ray_start_2_cpus):  # noqa: F811
    trainer = PyTorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        config={TEST_MODE: True},
        num_replicas=1)
    metrics = trainer.train()
    assert metrics[BATCH_COUNT] == 1

    val_metrics = trainer.validate()
    assert val_metrics[BATCH_COUNT] == 1
Example #4
File: dcgan.py Project: senfore/ray
def train_example(num_replicas=1, use_gpu=False, test_mode=False):
    config = {"test_mode": test_mode}
    trainer = PyTorchTrainer(model_creator,
                             data_creator,
                             optimizer_creator,
                             lambda config: nn.BCELoss(),
                             train_function=train,
                             validation_function=False,
                             num_replicas=num_replicas,
                             config=config,
                             use_gpu=use_gpu,
                             batch_size=16 if test_mode else 512,
                             backend="nccl" if use_gpu else "gloo")
    for i in range(5):
        stats = trainer.train()
        print(stats)

    return trainer
Example #5
def train_example(num_replicas=1,
                  num_epochs=5,
                  use_gpu=False,
                  use_fp16=False,
                  test_mode=False):
    config = {TEST_MODE: test_mode}
    trainer1 = PyTorchTrainer(ResNet18,
                              cifar_creator,
                              optimizer_creator,
                              nn.CrossEntropyLoss,
                              scheduler_creator=scheduler_creator,
                              initialization_hook=initialization_hook,
                              num_replicas=num_replicas,
                              config=config,
                              use_gpu=use_gpu,
                              batch_size=16 if test_mode else 512,
                              backend="nccl" if use_gpu else "gloo",
                              scheduler_step_freq="epoch",
                              use_fp16=use_fp16)
    for i in range(num_epochs):
        # Increase `max_retries` to turn on fault tolerance.
        stats = trainer1.train(max_retries=0)
        print(stats)

    print(trainer1.validate())
    trainer1.shutdown()
    print("success!")
Example #6
def train_example(num_replicas=1, use_gpu=False):
    trainer1 = PyTorchTrainer(model_creator,
                              data_creator,
                              optimizer_creator,
                              num_replicas=num_replicas,
                              use_gpu=use_gpu,
                              backend="gloo")
    trainer1.train()
    trainer1.shutdown()
    print("success!")
Example #7
def test_multi_model_matrix(ray_start_2_cpus, num_replicas):  # noqa: F811
    def custom_train(config, model, dataloader, criterion, optimizer,
                     scheduler):
        if config.get("models", 1) > 1:
            assert len(model) == config["models"], config

        if config.get("optimizers", 1) > 1:
            assert len(optimizer) == config["optimizers"], config

        if config.get("schedulers", 1) > 1:
            assert len(scheduler) == config["schedulers"], config
        return {"done": 1}

    def multi_model_creator(config):
        models = []
        for i in range(config.get("models", 1)):
            models += [nn.Linear(1, 1)]
        return models[0] if len(models) == 1 else models

    def multi_optimizer_creator(models, config):
        optimizers = []
        main_model = models[0] if type(models) is list else models
        for i in range(config.get("optimizers", 1)):
            optimizers += [torch.optim.SGD(main_model.parameters(), lr=0.0001)]
        return optimizers[0] if len(optimizers) == 1 else optimizers

    def multi_scheduler_creator(optimizer, config):
        schedulers = []
        main_opt = optimizer[0] if type(optimizer) is list else optimizer
        for i in range(config.get("schedulers", 1)):
            schedulers += [
                torch.optim.lr_scheduler.StepLR(
                    main_opt, step_size=30, gamma=0.1)
            ]
        return schedulers[0] if len(schedulers) == 1 else schedulers

    for model_count in range(1, 3):
        for optimizer_count in range(1, 3):
            for scheduler_count in range(1, 3):
                trainer = PyTorchTrainer(
                    multi_model_creator,
                    data_creator,
                    multi_optimizer_creator,
                    loss_creator=nn.MSELoss,
                    scheduler_creator=multi_scheduler_creator,
                    train_function=custom_train,
                    num_replicas=num_replicas,
                    config={
                        "models": model_count,
                        "optimizers": optimizer_count,
                        "schedulers": scheduler_count
                    })
                trainer.train()
                trainer.shutdown()
Example #8
def train_example(num_replicas=1, use_gpu=False):
    trainer1 = PyTorchTrainer(model_creator,
                              data_creator,
                              optimizer_creator,
                              num_replicas=num_replicas,
                              resources_per_replica=Resources(
                                  num_cpus=1,
                                  num_gpus=int(use_gpu),
                                  resources={}))
    trainer1.train()
    trainer1.shutdown()
Example #9
def test_scheduler_validate(ray_start_2_cpus):  # noqa: F811
    def custom_train(config, model, dataloader, criterion, optimizer,
                     scheduler):
        return {"done": 1}

    from torch.optim.lr_scheduler import ReduceLROnPlateau

    trainer = PyTorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        scheduler_creator=lambda optimizer, cfg: ReduceLROnPlateau(optimizer))
    trainer.update_scheduler(0.5)
    trainer.update_scheduler(0.5)
    assert all(
        trainer.apply_all_workers(lambda r: r.schedulers[0].last_epoch == 2))
    trainer.shutdown()
Example #10
def test_train(ray_start_2_cpus, num_replicas):  # noqa: F811
    trainer = PyTorchTrainer(model_creator,
                             data_creator,
                             optimizer_creator,
                             num_replicas=num_replicas)
    train_loss1 = trainer.train()["train_loss"]
    validation_loss1 = trainer.validate()["validation_loss"]

    train_loss2 = trainer.train()["train_loss"]
    validation_loss2 = trainer.validate()["validation_loss"]

    print(train_loss1, train_loss2)
    print(validation_loss1, validation_loss2)

    assert train_loss2 <= train_loss1
    assert validation_loss2 <= validation_loss1
Example #11
def test_train(ray_start_2_cpus, num_replicas):  # noqa: F811
    trainer = PyTorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        num_replicas=num_replicas)
    for i in range(3):
        train_loss1 = trainer.train()["train_loss"]
    validation_loss1 = trainer.validate()["validation_loss"]

    for i in range(3):
        train_loss2 = trainer.train()["train_loss"]
    validation_loss2 = trainer.validate()["validation_loss"]

    print(train_loss1, train_loss2)
    print(validation_loss1, validation_loss2)

    assert train_loss2 <= train_loss1
    assert validation_loss2 <= validation_loss1
Example #12
def test_scheduler_freq(ray_start_2_cpus, scheduler_freq):  # noqa: F811
    def custom_train(config, model, dataloader, criterion, optimizer,
                     scheduler):
        assert config[SCHEDULER_STEP] == scheduler_freq
        return {"done": 1}

    def scheduler_creator(optimizer, config):
        return torch.optim.lr_scheduler.StepLR(
            optimizer, step_size=30, gamma=0.1)

    trainer = PyTorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        scheduler_creator=scheduler_creator)

    for i in range(3):
        trainer.train()["train_loss"]
    trainer.shutdown()
Example #13
def train_example(num_replicas=1, use_gpu=False, test_mode=False):
    config = {"test_mode": test_mode}
    trainer1 = PyTorchTrainer(ResNet18,
                              cifar_creator,
                              optimizer_creator,
                              nn.CrossEntropyLoss,
                              initialization_hook=initialization_hook,
                              train_function=train,
                              validation_function=validate,
                              num_replicas=num_replicas,
                              config=config,
                              use_gpu=use_gpu,
                              batch_size=16 if test_mode else 512,
                              backend="nccl" if use_gpu else "gloo")
    for i in range(5):
        stats = trainer1.train()
        print(stats)

    print(trainer1.validate())
    trainer1.shutdown()
    print("success!")
Example #14
def test_multi_model(ray_start_2_cpus, num_replicas):  # noqa: F811
    def custom_train(config, models, dataloader, criterion, optimizers,
                     **kwargs):
        result = {}
        for i, (model, optimizer) in enumerate(zip(models, optimizers)):
            result["model_{}".format(i)] = train(config, model, dataloader,
                                                 criterion, optimizer)
        return result

    def multi_model_creator(config):
        return nn.Linear(1, 1), nn.Linear(1, 1)

    def multi_optimizer_creator(models, config):
        opts = [
            torch.optim.SGD(model.parameters(), lr=0.0001) for model in models
        ]
        return opts[0], opts[1]

    trainer1 = PyTorchTrainer(
        multi_model_creator,
        data_creator,
        multi_optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        train_function=custom_train,
        num_replicas=num_replicas)
    trainer1.train()

    filename = os.path.join(tempfile.mkdtemp(), "checkpoint")
    trainer1.save(filename)

    models1 = trainer1.get_model()

    trainer1.shutdown()

    trainer2 = PyTorchTrainer(
        multi_model_creator,
        data_creator,
        multi_optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        num_replicas=num_replicas)
    trainer2.restore(filename)

    os.remove(filename)

    models2 = trainer2.get_model()

    for model_1, model_2 in zip(models1, models2):

        model1_state_dict = model_1.state_dict()
        model2_state_dict = model_2.state_dict()

        assert set(model1_state_dict.keys()) == set(model2_state_dict.keys())

        for k in model1_state_dict:
            assert torch.equal(model1_state_dict[k], model2_state_dict[k])

    trainer2.shutdown()
Example #15
def test_save_and_restore(ray_start_2_cpus, num_replicas):  # noqa: F811
    trainer1 = PyTorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        num_replicas=num_replicas)
    trainer1.train()

    filename = os.path.join(tempfile.mkdtemp(), "checkpoint")
    trainer1.save(filename)

    model1 = trainer1.get_model()

    trainer1.shutdown()

    trainer2 = PyTorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        num_replicas=num_replicas)
    trainer2.restore(filename)

    os.remove(filename)

    model2 = trainer2.get_model()

    model1_state_dict = model1.state_dict()
    model2_state_dict = model2.state_dict()

    assert set(model1_state_dict.keys()) == set(model2_state_dict.keys())

    for k in model1_state_dict:
        assert torch.equal(model1_state_dict[k], model2_state_dict[k])