Example #1
def test_persisted_checkpoint(ray_start_2_cpus, tmp_path):
    def train():
        for i in range(2):
            sgd.save_checkpoint(epoch=i)

    config = TestConfig()
    e = BackendExecutor(config)
    e.start()
    e.start_training(train, run_dir=tmp_path)
    e.finish_training()

    assert e.latest_checkpoint_id == 2
    assert e.latest_checkpoint is not None
    assert e.latest_checkpoint["epoch"] == 1
    assert e.latest_checkpoint_path is not None

    assert os.path.exists(e.latest_checkpoint_path)

    def validate():
        checkpoint = sgd.load_checkpoint()
        assert checkpoint is not None
        assert checkpoint["epoch"] == 1

    e2 = BackendExecutor(config)
    e2.start()
    e2.start_training(validate,
                      checkpoint=e.latest_checkpoint_path,
                      run_dir=tmp_path)
    e2.finish_training()
Example #2
def test_train(ray_start_2_cpus, tmp_path):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    e.start_training(lambda: 1, run_dir=tmp_path)
    assert e.finish_training() == [1, 1]
Example #3
def test_start(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    with pytest.raises(InactiveWorkerGroupError):
        e.run(lambda: 1)
    e.start()
    assert len(e.worker_group) == 2
Example #4
    def __init__(self,
                 backend: Union[str, BackendConfig],
                 num_workers: int = 1,
                 use_gpu: bool = False,
                 resources_per_worker: Optional[Dict[str, float]] = None):
        """A class for distributed training.

        Args:
            backend (Union[str, BackendConfig]): The backend used for
                distributed communication. If configurations are needed,
                a subclass of ``BackendConfig`` can be passed in.
                Supported ``str`` values: {"torch"}.
            num_workers (int): The number of workers (Ray actors) to launch.
                Defaults to 1. Each worker will reserve 1 CPU by default.
            use_gpu (bool): If True, training will be done on GPUs (1 per
                worker). Defaults to False.
            resources_per_worker (Optional[Dict]): If specified, the resources
                defined in this Dict will be reserved for each worker.
        """
        # Setup executor.
        backend_config = self._get_backend_config(backend)

        if resources_per_worker:
            raise NotImplementedError("`resources_per_worker` argument is not "
                                      "supported yet.")

        self._executor = BackendExecutor(backend_config, num_workers, 1,
                                         int(use_gpu))
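
A minimal usage sketch for the constructor above, assuming it belongs to the Trainer class from ray.util.sgd.v2 (the class name and import path are assumptions inferred from the surrounding examples, not shown in this snippet):

from ray.util.sgd.v2 import Trainer  # assumed import path

# Two CPU-only workers with the built-in "torch" backend; each worker
# reserves 1 CPU by default. Passing `resources_per_worker` would raise
# NotImplementedError in this version of the constructor.
trainer = Trainer(backend="torch", num_workers=2, use_gpu=False)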
Example #5
def test_start(ray_start_2_cpus, tmp_path):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    with pytest.raises(InactiveWorkerGroupError):
        e.start_training(lambda: 1, run_dir=tmp_path)
    e.start()
    assert len(e.worker_group) == 2
Example #6
def test_local_ranks(ray_start_2_cpus, tmp_path):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    def train():
        return sgd.local_rank()

    e.start_training(train, run_dir=tmp_path)
    assert set(e.finish_training()) == {0, 1}
Example #7
def test_execute_worker_failure(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    def train_fail():
        ray.actor.exit_actor()

    new_execute_func = gen_execute_special(train_fail)
    with patch.object(WorkerGroup, "execute_async", new_execute_func):
        with pytest.raises(RuntimeError):
            e.run(lambda: 1)
Example #8
def test_worker_failure(ray_start_2_cpus, tmp_path):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    def train_fail():
        ray.actor.exit_actor()

    new_execute_func = gen_execute_special(train_fail)
    with patch.object(WorkerGroup, "execute_async", new_execute_func):
        with pytest.raises(TrainingWorkerError):
            e.start_training(lambda: 1, run_dir=tmp_path)
            e.finish_training()
Example #9
def test_mismatch_checkpoint_report(ray_start_2_cpus, tmp_path):
    def train():
        if sgd.world_rank() == 0:
            sgd.save_checkpoint(epoch=0)
        else:
            sgd.report(iter=0)

    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()
    e.start_training(train, run_dir=tmp_path)
    with pytest.raises(RuntimeError):
        e.finish_training()
Example #10
def test_checkpoint(ray_start_2_cpus, tmp_path):
    def train():
        for i in range(2):
            sgd.save_checkpoint(epoch=i)

    config = TestConfig()
    e = BackendExecutor(config, num_workers=1)
    e.start()

    e.start_training(train, run_dir=tmp_path)
    e.finish_training()

    assert e.latest_checkpoint is not None
    assert e.latest_checkpoint["epoch"] == 1
Example #11
    def __init__(
        self,
        backend: Union[str, BackendConfig],
        num_workers: int = 1,
        use_gpu: bool = False,
        resources_per_worker: Optional[Dict[str, float]] = None,
        logdir: Optional[str] = None,
        max_retries: int = 3,
    ):

        self._backend = backend
        self._num_workers = num_workers
        self._use_gpu = use_gpu
        self._resources_per_worker = resources_per_worker

        # Incremental unique run ID.
        self._run_id = 0

        self.logdir = self.create_logdir(logdir)

        # Setup executor.
        backend_config = self._get_backend_config(backend)

        num_cpus = 1
        num_gpus = int(use_gpu)

        if resources_per_worker:
            # Override CPU and GPU resources and remove from dict.
            num_cpus = resources_per_worker.pop("CPU", num_cpus)
            num_gpus = resources_per_worker.pop("GPU", num_gpus)
            if not use_gpu and num_gpus > 0:
                raise ValueError(
                    "`use_gpu` is False but `GPU` was found in "
                    "`resources_per_worker`. Either set `use_gpu` to True or "
                    "remove `GPU` from `resources_per_worker.")
            if use_gpu and num_gpus == 0:
                raise ValueError(
                    "`use_gpu` is True but `GPU` is set to 0 in "
                    "`resources_per_worker`. Either set `use_gpu` to False or "
                    "request a positive number of `GPU` in "
                    "`resources_per_worker.")

        self._executor = BackendExecutor(
            backend_config=backend_config,
            num_workers=num_workers,
            num_cpus_per_worker=num_cpus,
            num_gpus_per_worker=num_gpus,
            additional_resources_per_worker=resources_per_worker,
            max_retries=max_retries)
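
A short sketch of how the resource handling above resolves, under the same assumption that this is the Trainer constructor from ray.util.sgd.v2 (values and the "custom" resource name are illustrative, not from the original):

from ray.util.sgd.v2 import Trainer  # assumed import path

# "CPU" and "GPU" are popped out of the dict and become the per-worker
# CPU/GPU reservations; whatever remains is forwarded to the executor
# as additional_resources_per_worker.
trainer = Trainer(
    backend="torch",
    num_workers=2,
    use_gpu=True,
    resources_per_worker={"CPU": 2, "GPU": 1, "custom": 0.5})

# Inconsistent combinations raise ValueError, e.g. use_gpu=False with
# {"GPU": 1}, or use_gpu=True with {"GPU": 0}.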
Example #12
def test_no_exhaust(ray_start_2_cpus, tmp_path):
    """Tests if training can finish even if queue is not exhausted."""
    def train():
        for _ in range(2):
            sgd.report(loss=1)
        return 2

    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    e.start_training(train, run_dir=tmp_path)
    output = e.finish_training()

    assert output == [2, 2]
Example #13
def test_checkpoint(ray_start_2_cpus):
    def train():
        for i in range(2):
            sgd.save_checkpoint(epoch=i)

    config = TestConfig()
    e = BackendExecutor(config, num_workers=1)
    e.start()

    e.start_training(train)
    e.finish_training()

    latest_checkpoint = e.get_latest_checkpoint()
    assert latest_checkpoint is not None
    assert latest_checkpoint["epoch"] == 1
Example #14
def test_initialization_hook(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)

    def init_hook():
        import os
        os.environ["TEST"] = "1"

    e.start(initialization_hook=init_hook)

    def check():
        import os
        return os.getenv("TEST", "0")

    assert e.run(check) == ["1", "1"]
Example #15
def test_torch_start_shutdown(ray_start_2_cpus, init_method):
    torch_config = TorchConfig(backend="gloo", init_method=init_method)
    e = BackendExecutor(torch_config, num_workers=2)
    e.start()

    def check_process_group():
        import torch
        return (torch.distributed.is_initialized()
                and torch.distributed.get_world_size() == 2)

    assert all(e.run(check_process_group))

    e._backend.on_shutdown(e.worker_group, e._backend_config)

    assert not any(e.run(check_process_group))
Example #16
def test_initialization_hook(ray_start_2_cpus, tmp_path):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)

    def init_hook():
        import os
        os.environ["TEST"] = "1"

    e.start(initialization_hook=init_hook)

    def check():
        import os
        return os.getenv("TEST", "0")

    e.start_training(check, run_dir=tmp_path)
    assert e.finish_training() == ["1", "1"]
Example #17
def test_cuda_visible_devices(ray_2_node_2_gpu, worker_results, tmp_path):
    config = TestConfig()

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    num_workers, expected_results = worker_results

    e = BackendExecutor(config,
                        num_workers=num_workers,
                        num_cpus_per_worker=0,
                        num_gpus_per_worker=1)
    e.start()
    e.start_training(get_resources, run_dir=tmp_path)
    results = e.finish_training()
    results.sort()
    assert results == expected_results
Example #18
def test_train_failure(ray_start_2_cpus, tmp_path):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    with pytest.raises(SGDBackendError):
        e.fetch_next_result()

    with pytest.raises(SGDBackendError):
        e.finish_training()

    e.start_training(lambda: 1, run_dir=tmp_path)

    with pytest.raises(SGDBackendError):
        e.start_training(lambda: 2, run_dir=tmp_path)

    assert e.finish_training() == [1, 1]
Example #19
def test_tensorflow_start(ray_start_2_cpus, tmp_path):
    num_workers = 2
    tensorflow_config = TensorflowConfig()
    e = BackendExecutor(tensorflow_config, num_workers=num_workers)
    e.start()

    def get_tf_config():
        import json
        import os
        return json.loads(os.environ["TF_CONFIG"])

    e.start_training(get_tf_config, run_dir=tmp_path)
    results = e.finish_training()
    assert len(results) == num_workers

    workers = [result["cluster"]["worker"] for result in results]
    assert all(worker == workers[0] for worker in workers)

    indexes = [result["task"]["index"] for result in results]
    assert len(set(indexes)) == num_workers
Example #20
def test_execute(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    assert e.run(lambda: 1) == [1, 1]