Example #1
File: test_gpu.py  Project: parasj/ray
def test_torch_get_device_dist(ray_2_node_4_gpu, num_gpus_per_worker):
    @patch("torch.cuda.is_available", lambda: True)
    def train_fn():
        return train.torch.get_device().index

    trainer = Trainer(
        TorchConfig(backend="gloo"),
        num_workers=int(8 / num_gpus_per_worker),
        use_gpu=True,
        resources_per_worker={"GPU": num_gpus_per_worker},
    )
    trainer.start()
    devices = trainer.run(train_fn)
    trainer.shutdown()

    count = Counter(devices)
    if num_gpus_per_worker == 0.5:
        for i in range(4):
            assert count[i] == 4
    elif num_gpus_per_worker == 1:
        for i in range(4):
            assert count[i] == 2
    elif num_gpus_per_worker == 2:
        for i in range(2):
            assert count[2 * i] == 2
    else:
        raise RuntimeError(
            "New parameter for this test has been added without checking that the "
            "correct devices have been returned.")
Example #2
    def __init__(
        self,
        train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]],
        train_loop_config: Optional[Dict] = None,
        torch_config: Optional[TorchConfig] = None,
        scaling_config: Optional[ScalingConfig] = None,
        run_config: Optional[RunConfig] = None,
        datasets: Optional[Dict[str, GenDataset]] = None,
        preprocessor: Optional[Preprocessor] = None,
        resume_from_checkpoint: Optional[Checkpoint] = None,
    ):
        if not torch_config:
            torch_config = TorchConfig()

        super().__init__(
            train_loop_per_worker=train_loop_per_worker,
            train_loop_config=train_loop_config,
            backend_config=torch_config,
            scaling_config=scaling_config,
            run_config=run_config,
            datasets=datasets,
            preprocessor=preprocessor,
            resume_from_checkpoint=resume_from_checkpoint,
        )
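
For context, a minimal construction of this class could look as follows. This is a sketch against the Ray AIR-era API; the loop body is a placeholder:

from ray.air.config import ScalingConfig
from ray.train.torch import TorchTrainer


def train_loop_per_worker(config):
    # Placeholder: a real loop would build a model, train, and report metrics.
    pass


trainer = TorchTrainer(
    train_loop_per_worker,
    train_loop_config={"lr": 1e-2},
    scaling_config=ScalingConfig(num_workers=2),
)
result = trainer.fit()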
Example #3
def train_linear(num_workers=2, use_gpu=False, epochs=3):
    trainer = Trainer(
        backend=TorchConfig(backend="gloo"),
        num_workers=num_workers,
        use_gpu=use_gpu,
    )
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer.start()
    results = trainer.run(
        train_func,
        config,
        callbacks=[JsonLoggerCallback(), TBXLoggerCallback()],
    )
    trainer.shutdown()

    print(results)
    return results
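
`train_func` is defined elsewhere in the example file. A minimal stand-in compatible with the call above (hypothetical body, using the legacy keyword-argument form of `train.report`):

from ray import train


def train_func(config):
    # Hypothetical stand-in: report one loss per epoch so the JSON and
    # TensorBoard logger callbacks have something to record.
    for epoch in range(config["epochs"]):
        train.report(loss=1.0 / (epoch + 1), epoch=epoch)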
Example #4
def test_torch_start_shutdown(ray_start_2_cpus, init_method):
    torch_config = TorchConfig(backend="gloo", init_method=init_method)
    e = BackendExecutor(torch_config, num_workers=2)
    e.start()

    def check_process_group():
        import torch
        return (
            torch.distributed.is_initialized()
            and torch.distributed.get_world_size() == 2
        )

    e.start_training(check_process_group)
    assert all(e.finish_training())

    e._backend.on_shutdown(e.worker_group, e._backend_config)

    e.start_training(check_process_group)
    assert not any(e.finish_training())
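
The assertion pair checks that `on_shutdown` tears down the torch process group. The underlying `torch.distributed` lifecycle the backend manages looks roughly like this (single-process illustration; the address and port are placeholders):

import torch.distributed as dist

# One-process gloo group: initialize, verify, destroy.
dist.init_process_group(
    backend="gloo",
    init_method="tcp://127.0.0.1:29511",
    rank=0,
    world_size=1,
)
assert dist.is_initialized()
dist.destroy_process_group()
assert not dist.is_initialized()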
Example #5
def test_worker_kill(ray_start_2_cpus, backend):
    if backend == "test":
        test_config = TestConfig()
    elif backend == "torch":
        test_config = TorchConfig()
    elif backend == "tf":
        test_config = TensorflowConfig()
    elif backend == "horovod":
        test_config = HorovodConfig()

    trainer = Trainer(test_config, num_workers=2)

    def train_func():
        for i in range(2):
            train.report(loss=1, iter=i)

    trainer.start()
    kill_callback = KillCallback(fail_on=0, trainer=trainer)
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: iter=0, counter=1, Successful
    # Run 2: iter=1, counter=1, Unsuccessful, starts training from beginning
    # Run 3: iter=0, counter=2, Successful
    # Run 4: iter=1, counter=3, Successful
    assert kill_callback.counter == 3

    trainer.shutdown()
    trainer.start()

    kill_callback = KillCallback(fail_on=1, trainer=trainer)
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: iter=0, counter=1, Successful
    # Run 2: iter=1, counter=2, Successful
    # Run 3: None, counter=2, Unsuccessful, starts training from beginning.
    # Run 4: iter=0, counter=3, Successful
    # Run 5: iter=1, counter=4, Successful
    assert kill_callback.counter == 4

    def train_func():
        return 1

    # Make sure Trainer is usable even after failure handling.
    trainer.run(train_func)
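
`KillCallback` is a test helper from the same repository. A hypothetical skeleton of a result-counting callback, assuming the legacy `TrainingCallback` interface, shows the shape of what it extends:

from ray.train.callbacks import TrainingCallback


class CountingCallback(TrainingCallback):
    # Hypothetical helper: counts the batches of results workers report.
    def __init__(self):
        self.counter = 0

    def handle_result(self, results, **info):
        self.counter += 1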
Example #6
File: test_tune.py  Project: parasj/ray
def test_tune_torch_get_device_gpu(ray_2_node_4_gpu, num_gpus_per_worker):
    from ray import tune
    from ray.tune.tuner import Tuner, TuneConfig

    num_samples = 2

    @patch("torch.cuda.is_available", lambda: True)
    def train_func():
        train.report(device_id=train.torch.get_device().index)

    trainer = TorchTrainer(
        train_func,
        torch_config=TorchConfig(backend="gloo"),
        scaling_config=ScalingConfig(
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": num_gpus_per_worker},
        ),
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "dummy": tune.choice([32, 64, 128]),
            }
        },
        tune_config=TuneConfig(
            num_samples=num_samples,
        ),
    )
    analysis = tuner.fit()._experiment_analysis
    trial_dfs = list(analysis.trial_dataframes.values())
    device_ids = [trial_df["device_id"].tolist() for trial_df in trial_dfs]

    assert len(device_ids) == num_samples
    for i in range(num_samples):
        assert device_ids[i][0] == 0
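
The test reads results through the private `_experiment_analysis` attribute. With the public API, per-trial metrics can be taken from the returned `ResultGrid` instead (a sketch, assuming the AIR-era result objects):

# Public-API alternative to the private _experiment_analysis access above.
result_grid = tuner.fit()
device_ids = [
    result.metrics_dataframe["device_id"].tolist() for result in result_grid
]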
Example #7
parser = argparse.ArgumentParser()
parser.add_argument(
    "--smoke-test",
    action="store_true",
    default=False,
    help="Finish quickly for training.",
)
args = parser.parse_args()

ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True)
num_training_workers = 1 if args.smoke_test else 3

trainer = Trainer(
    num_workers=num_training_workers,
    use_gpu=not args.smoke_test,
    backend=TorchConfig(backend="gloo"),
)
TorchTrainable = trainer.to_tune_trainable(train_func=train_func)

pbt_scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="loss",
    mode="min",
    perturbation_interval=1,
    hyperparam_mutations={
        # distribution for resampling
        "lr": lambda: np.random.uniform(0.001, 1),
        # allow perturbations within this set of categorical values
        "momentum": [0.8, 0.9, 0.99],
    },
)
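
The excerpt ends before the scheduler is used. Wiring the trainable and scheduler together would typically continue along these lines (a sketch; the sample count and initial hyperparameter values are placeholders):

from ray import tune

analysis = tune.run(
    TorchTrainable,
    scheduler=pbt_scheduler,
    num_samples=4,
    config={"lr": 0.1, "momentum": 0.9},
)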
Example #8
    action="store_true",
    default=False,
    help="Finish quickly for training.",
)
args = parser.parse_args()

ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True)
num_training_workers = 1 if args.smoke_test else 3

trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(
        num_workers=num_training_workers,
        use_gpu=not args.smoke_test,
    ),
    torch_config=TorchConfig(backend="gloo"),
)

pbt_scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    perturbation_interval=1,
    hyperparam_mutations={
        "train_loop_config": {
            # distribution for resampling
            "lr": lambda: np.random.uniform(0.001, 1),
            # allow perturbations within this set of categorical values
            "momentum": [0.8, 0.9, 0.99],
        }
    },
)
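
In the AIR-era API the trainer and scheduler are then handed to a Tuner; note that `metric` and `mode` move from the scheduler into `TuneConfig`. A sketch with placeholder values:

from ray.tune.tune_config import TuneConfig
from ray.tune.tuner import Tuner

tuner = Tuner(
    trainer,
    param_space={"train_loop_config": {"lr": 0.1, "momentum": 0.9}},
    tune_config=TuneConfig(
        scheduler=pbt_scheduler,
        metric="loss",
        mode="min",
        num_samples=4,
    ),
)
results = tuner.fit()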