Example #1
def train_linear(num_workers=1):
    trainer = Trainer(TorchConfig(backend="gloo"), num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer.start()
    results = trainer.run(train_func,
                          config,
                          callbacks=[JsonLoggerCallback("./sgd_results")])
    trainer.shutdown()

    print(results)
    return results
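
The `train_func` referenced here is defined elsewhere in the source. Below is a minimal sketch of what it could look like, assuming a plain PyTorch regression loop and that `sgd` is the Ray SGD v2 session module (the import path is an assumption); the suite's real helper would also wrap the model for distributed training:

import torch
import torch.nn as nn
from ray.util.sgd import v2 as sgd  # assumed import path for the session API

def train_func(config):
    # Hypothetical stand-in for the helper used by Example #1.
    X = torch.randn(32, 1)                 # toy data: y = 2x + noise
    y = 2 * X + 0.1 * torch.randn_like(X)

    net = nn.Sequential(nn.Linear(1, config["hidden_size"]),
                        nn.Linear(config["hidden_size"], 1))
    optimizer = torch.optim.SGD(net.parameters(), lr=config["lr"])
    loss_fn = nn.MSELoss()

    results = []
    for epoch in range(config["epochs"]):
        for i in range(0, len(X), config["batch_size"]):
            optimizer.zero_grad()
            loss = loss_fn(net(X[i:i + config["batch_size"]]),
                           y[i:i + config["batch_size"]])
            loss.backward()
            optimizer.step()
        result = {"epoch": epoch, "loss": loss.item()}
        sgd.report(**result)  # surfaces the dict to callbacks such as JsonLoggerCallback
        results.append(result)
    return results  # trainer.run collects one such list per worker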
Example #2
def test_horovod_simple(ray_start_2_cpus):
    def simple_fn():
        hvd_torch.init()
        return hvd_torch.rank()

    num_workers = 2
    trainer = Trainer("horovod", num_workers)
    trainer.start()
    result = trainer.run(simple_fn)
    trainer.shutdown()

    assert result == list(range(num_workers))
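
Each worker returns its Horovod rank here, and trainer.run collects one result per worker in rank order, which is why two workers yield exactly [0, 1].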
Example #3
def test_worker_kill_checkpoint(ray_start_2_cpus):
    test_config = TestConfig()

    def train():
        checkpoint = sgd.load_checkpoint()
        if checkpoint:
            epoch = checkpoint["epoch"]
        else:
            epoch = 0
        print("Epoch: ", epoch)
        for i in range(epoch, 2):
            sgd.report(loss=1, iter=i)
            sgd.save_checkpoint(epoch=i + 1)

    trainer = Trainer(test_config, num_workers=2)
    trainer.start()
    kill_callback = KillCallback(fail_on=0,
                                 worker_group=trainer._executor.worker_group)

    trainer.run(train, callbacks=[kill_callback])

    # Run 1: epoch=0, counter=1, Successful
    # *Checkpoint is saved.*
    # *Worker is killed*
    # *Getting checkpoint fails. Workers are restarted from beginning*
    # Run 2: epoch=0, counter=2, Successful
    # Run 3: epoch=1, counter=3, Successful
    assert kill_callback.counter == 3
    assert trainer.latest_checkpoint["epoch"] == 2

    trainer.shutdown()
    trainer.start()

    kill_callback = KillCallback(fail_on=1,
                                 worker_group=trainer._executor.worker_group)
    trainer.run(train, callbacks=[kill_callback])
    # Run 1: epoch=0, counter=1, Successful
    # *Checkpoint saved*
    # *Latest checkpoint updated, epoch=1*
    # Run 2: epoch=1, counter=2, Successful
    # *Checkpoint saved*
    # *Worker is killed*
    # *Getting checkpoint fails. Workers are restarted from last checkpoint.*
    # Run 3: epoch=1, counter=3, Successful.
    assert kill_callback.counter == 3
    assert trainer.latest_checkpoint["epoch"] == 2

    def train():
        return 1

    # Make sure Trainer is usable even after failure handling.
    trainer.run(train)
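
KillCallback is a test-suite helper rather than a public API. A rough sketch of its likely shape, consistent with the counter traces in the comments above and in Example #16; the base class, import path, and worker-group internals are guessed for illustration:

import ray
from ray.util.sgd.v2 import SGDCallback  # assumed import path

class KillCallback(SGDCallback):
    """Hypothetical reconstruction of the test helper."""

    def __init__(self, fail_on, worker_group):
        self.counter = 0
        self.fail_on = fail_on
        self.worker_group = worker_group

    def handle_result(self, results):
        # Kill one worker the `fail_on`-th time results arrive; the Trainer
        # should detect the failure and restart the run (from the latest
        # checkpoint, if the training function saved one).
        if self.counter == self.fail_on:
            ray.kill(self.worker_group.workers[0])  # assumed actor handle
        self.counter += 1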
Example #4
def test_run(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func)
    trainer.shutdown()

    assert len(results) == 2
    assert all(result == 1 for result in results)
Example #5
def train_tensorflow_mnist(num_workers=2, use_gpu=False):
    trainer = Trainer(backend="tensorflow",
                      num_workers=num_workers,
                      use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(train_func=train_func,
                          config={
                              "lr": 1e-3,
                              "batch_size": 64,
                              "epochs": 4
                          })
    trainer.shutdown()
    print(f"Results: {results[0]}")
Example #6
def test_torch_fashion_mnist_gpu(ray_start_2_cpus_2_gpus):
    num_workers = 2
    epochs = 3

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=True)
    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer.start()
    results = trainer.run(fashion_mnist_train_func, config)
    trainer.shutdown()

    assert len(results) == num_workers

    for result in results:
        assert len(result) == epochs
        assert result[-1] < result[0]
Example #7
def test_torch_linear(ray_start_2_cpus):
    num_workers = 2
    epochs = 3

    trainer = Trainer("torch", num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer.start()
    results = trainer.run(linear_train_func, config)
    trainer.shutdown()

    assert len(results) == num_workers

    for result in results:
        assert len(result) == epochs
        assert result[-1]["loss"] < result[0]["loss"]
Example #8
def test_run_config(ray_start_2_cpus):
    backend_config = TestConfig()

    def train_func(config):
        return config["fruit"]

    config = {"fruit": "banana"}

    trainer = Trainer(backend_config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func, config)
    trainer.shutdown()

    assert len(results) == 2
    assert all(result == "banana" for result in results)
Example #9
def train_linear(num_workers=2, use_gpu=False):
    datasets = get_datasets()

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer.start()
    results = trainer.run(
        train_func,
        config,
        dataset=datasets,
        callbacks=[JsonLoggerCallback(),
                   TBXLoggerCallback()])
    trainer.shutdown()
    print(results)
    return results
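
Passing dataset= hands the Ray Dataset to the Trainer, which splits it into per-worker shards that the training function can fetch with sgd.get_dataset_shard(), as Examples #19 and #20 show.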
Example #10
def test_start_shutdown(ray_start_2_cpus, num_workers):
    config = TestConfig()
    assert ray.available_resources()["CPU"] == 2
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    time.sleep(1)

    remaining = 2 - num_workers
    if remaining == 0:
        assert "CPU" not in ray.available_resources()
    else:
        assert ray.available_resources()["CPU"] == remaining

    trainer.shutdown()
    time.sleep(1)
    assert ray.available_resources()["CPU"] == 2
Example #11
def train_tensorflow_linear(num_workers=2, use_gpu=False):
    dataset_pipeline = get_dataset_pipeline()
    trainer = Trainer(backend="tensorflow",
                      num_workers=num_workers,
                      use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(train_func=train_func,
                          dataset=dataset_pipeline,
                          config={
                              "lr": 1e-3,
                              "batch_size": 32,
                              "epochs": 4
                          })
    trainer.shutdown()
    print(f"Results: {results[0]}")
    return results
Example #12
def test_horovod_torch_mnist_gpu(ray_start_2_cpus_2_gpus):
    num_workers = 2
    num_epochs = 2
    trainer = Trainer("horovod", num_workers, use_gpu=True)
    trainer.start()
    results = trainer.run(horovod_torch_train_func,
                          config={
                              "num_epochs": num_epochs,
                              "lr": 1e-3
                          })
    trainer.shutdown()

    assert len(results) == num_workers
    for worker_result in results:
        assert len(worker_result) == num_epochs
        assert worker_result[num_epochs - 1] < worker_result[0]
Example #13
def test_horovod_torch_mnist_stateful(ray_start_2_cpus):
    num_workers = 2
    num_epochs = 2
    trainer = Trainer("horovod", num_workers)
    workers = trainer.to_worker_group(HorovodTrainClass,
                                      config={
                                          "num_epochs": num_epochs,
                                          "lr": 1e-3
                                      })
    results = []
    for epoch in range(num_epochs):
        results.append(ray.get([w.train.remote(epoch=epoch) for w in workers]))
    trainer.shutdown()

    assert len(results) == num_epochs
    for i in range(num_workers):
        assert results[num_epochs - 1][i] < results[0][i]
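
Note that to_worker_group bypasses trainer.run entirely: it instantiates HorovodTrainClass on each worker and returns per-worker handles, so the test drives training epoch by epoch through remote method calls and ray.get.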
Example #14
def test_run_failure(ray_start_2_cpus):
    test_config = TestConfig()

    # Training functions must take either zero arguments or a single
    # `config` argument; two positional parameters fail this validation.
    def train_invalid_signature(a, b):
        pass

    trainer = Trainer(test_config, num_workers=2)

    # Raise RuntimeError when trainer has not been started yet.
    with pytest.raises(RuntimeError):
        trainer.run(lambda: 1)

    trainer.start()

    with pytest.raises(ValueError):
        trainer.run(train_invalid_signature)

    trainer.shutdown()
Example #15
def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):

    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(TestConfig(),
                num_workers=2,
                use_gpu=False,
                resources_per_worker={"GPU": 1})

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(TestConfig(),
                num_workers=2,
                use_gpu=True,
                resources_per_worker={"GPU": 0})

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    # Opt in to sharing CUDA_VISIBLE_DEVICES: every worker then sees the GPU
    # ids of all workers in the group, not just its own allocation.
    os.environ[ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV] = "1"

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(TestConfig(),
                      num_workers=2,
                      use_gpu=True,
                      resources_per_worker={"GPU": 0.1})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(TestConfig(),
                      num_workers=2,
                      use_gpu=True,
                      resources_per_worker={"GPU": 2})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()
Example #16
def test_worker_kill(ray_start_2_cpus, backend):
    if backend == "test":
        test_config = TestConfig()
    elif backend == "torch":
        test_config = TorchConfig()
    elif backend == "tf":
        test_config = TensorflowConfig()
    elif backend == "horovod":
        test_config = HorovodConfig()

    trainer = Trainer(test_config, num_workers=2)

    def train_func():
        for i in range(2):
            sgd.report(loss=1, iter=i)

    trainer.start()
    kill_callback = KillCallback(fail_on=0,
                                 worker_group=trainer._executor.worker_group)
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: iter=0, counter=1, Successful
    # Run 2: iter=1, counter=1, Unsuccessful, starts training from beginning
    # Run 3: iter=0, counter=2, Successful
    # Run 4: iter=1, counter=3, Successful
    assert kill_callback.counter == 3

    trainer.shutdown()
    trainer.start()

    kill_callback = KillCallback(fail_on=1,
                                 worker_group=trainer._executor.worker_group)
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: iter=0, counter=1, Successful
    # Run 2: iter=1, counter=2, Successful
    # Run 3: None, counter=2, Unsuccessful, starts training from beginning.
    # Run 4: iter=0, counter=3, Successful
    # Run 5: iter=1, counter=4, Successful
    assert kill_callback.counter == 4

    def train():
        return 1

    # Make sure Trainer is usable even after failure handling.
    trainer.run(train)
Example #17
def test_resources(ray_start_4_cpus_4_gpus_4_extra, resource, num_requested):
    num_workers = 2
    config = TestConfig()
    original = ray.available_resources().get(resource)
    resources_per_worker = {resource: num_requested}
    use_gpu = resource == "GPU"
    trainer = Trainer(config,
                      num_workers=num_workers,
                      use_gpu=use_gpu,
                      resources_per_worker=resources_per_worker)

    trainer.start()
    expected = original - num_workers * num_requested
    wait_for_condition(
        lambda: ray.available_resources().get(resource, 0) == expected)

    trainer.shutdown()
    wait_for_condition(
        lambda: ray.available_resources().get(resource, 0) == original)
Example #18
def test_tensorflow_mnist_gpu(ray_start_2_cpus_2_gpus):
    num_workers = 2
    epochs = 3

    trainer = Trainer("tensorflow", num_workers=num_workers, use_gpu=True)
    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer.start()
    results = trainer.run(tensorflow_mnist_train_func, config)
    trainer.shutdown()

    assert len(results) == num_workers
    result = results[0]

    loss = result["loss"]
    assert len(loss) == epochs
    assert loss[-1] < loss[0]

    accuracy = result["accuracy"]
    assert len(accuracy) == epochs
    assert accuracy[-1] > accuracy[0]
Example #19
def test_multiple_datasets(ray_start_4_cpus):
    num_epochs = 2
    num_data_1 = 10
    num_data_2 = 6

    train_data = ray.data.range(num_data_1)
    val_data = ray.data.range(num_data_2)

    def get_dataset():
        data_train_all_epochs = []
        data_val_all_epochs = []
        for _ in range(num_epochs):  # one pass per epoch (captured from the test scope)
            data_this_epoch_train = []
            train_dataset = sgd.get_dataset_shard("train")
            for batch in train_dataset.iter_batches():
                data_this_epoch_train.extend(batch)
            data_train_all_epochs.append(data_this_epoch_train)

            data_this_epoch_val = []
            val_dataset = sgd.get_dataset_shard("val")
            for batch in val_dataset.iter_batches():
                data_this_epoch_val.extend(batch)
            data_val_all_epochs.append(data_this_epoch_val)

        return data_train_all_epochs, data_val_all_epochs

    config = TestConfig()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(get_dataset,
                          dataset={
                              "train": train_data,
                              "val": val_data
                          })
    check_dataset_output(num_data_1, num_epochs,
                         [worker_data[0] for worker_data in results])
    check_dataset_output(num_data_2, num_epochs,
                         [worker_data[1] for worker_data in results])
    trainer.shutdown()
Example #20
def test_dataset(ray_start_4_cpus):
    """Checks that Dataset is correctly sharded even with multiple epochs."""
    num_epochs = 2
    num_data = 10

    dataset = ray.data.range(num_data)

    def get_dataset():
        data_all_epochs = []
        for _ in range(num_epochs):  # one pass per epoch (captured from the test scope)
            data_this_epoch = []
            dataset = sgd.get_dataset_shard()
            for batch in dataset.iter_batches():
                data_this_epoch.extend(batch)
            data_all_epochs.append(data_this_epoch)
        return data_all_epochs

    config = TestConfig()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(get_dataset, dataset=dataset)
    check_dataset_output(num_data, num_epochs, results)
    trainer.shutdown()
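
check_dataset_output is another helper defined outside this excerpt. A hypothetical reconstruction consistent with how it is called here and in Example #19: it checks that the per-worker shards of each epoch jointly cover the full range exactly once.

def check_dataset_output(num_data, num_epochs, worker_results):
    # Each worker returns one list of consumed items per epoch.
    for worker_result in worker_results:
        assert len(worker_result) == num_epochs
    for epoch in range(num_epochs):
        # The union of all workers' shards for this epoch must be the range.
        epoch_data = []
        for worker_result in worker_results:
            epoch_data.extend(worker_result[epoch])
        assert sorted(epoch_data) == list(range(num_data))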
Example #21
def main(num_workers, use_gpu, kwargs):
    trainer = Trainer("horovod", use_gpu=use_gpu, num_workers=num_workers)
    trainer.start()
    loss_per_epoch = trainer.run(train_func, config=kwargs)
    trainer.shutdown()
    print(loss_per_epoch)
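
A possible invocation, assuming the script calls main directly; the argument values are illustrative and mirror the config keys used in Example #12:

if __name__ == "__main__":
    main(num_workers=2, use_gpu=False, kwargs={"num_epochs": 2, "lr": 1e-3})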