Example #1
    def setup(self, config: Dict):
        trainable = wrap_function(self.__class__._function)
        # We use a filelock here to ensure that the file-writing
        # process is safe across different trainables.
        if self._ssh_identity_file:
            with FileLock(self._ssh_identity_file + ".lock"):
                settings = RayExecutor.create_settings(self._timeout_s,
                                                       self._ssh_identity_file,
                                                       self._ssh_str)
        else:
            settings = RayExecutor.create_settings(self._timeout_s,
                                                   self._ssh_identity_file,
                                                   self._ssh_str)

        self.executor = RayExecutor(settings,
                                    cpus_per_slot=self._num_cpus_per_slot,
                                    use_gpu=self._use_gpu,
                                    num_hosts=self._num_hosts,
                                    num_slots=self._num_slots)

        # We can't put `self` in the lambda closure, so we
        # resolve the variable ahead of time.
        logdir_ = str(self.logdir)

        # Starts the workers as specified by the resources above.
        self.executor.start(
            executable_cls=trainable,
            executable_kwargs={
                "config": config,
                "logger_creator": lambda cfg: logger_creator(cfg, logdir_),
            })
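
The snippet above assumes a `logger_creator` helper defined elsewhere in the file. A minimal sketch of what such a helper could look like (hypothetical, not Ray's actual implementation): give each Horovod worker its own subdirectory under the trial logdir so log files do not collide.

import os
from ray.tune.logger import UnifiedLogger

def logger_creator(config, logdir):
    # Hypothetical helper: one log subdirectory per worker process.
    worker_dir = os.path.join(logdir, "worker_{}".format(os.getpid()))
    os.makedirs(worker_dir, exist_ok=True)
    return UnifiedLogger(config, worker_dir)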
Example #2
File: ray.py Project: cxz/ludwig
 def __init__(self, horovod_kwargs, trainer_kwargs):
     # TODO ray: make this more configurable by allowing YAML overrides of timeout_s, etc.
     setting = RayExecutor.create_settings(timeout_s=30)
     self.executor = RayExecutor(
         setting, **{**get_horovod_kwargs(), **horovod_kwargs})
     self.executor.start(executable_cls=RayRemoteTrainer,
                         executable_kwargs=trainer_kwargs)
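
`get_horovod_kwargs` is not shown in this listing. A hypothetical version that derives executor defaults from the connected Ray cluster could look like this (illustrative only, not Ludwig's actual implementation):

import ray

def get_horovod_kwargs():
    # Hypothetical: one host per Ray node; GPU use inferred from
    # the cluster's declared resources.
    use_gpu = int(ray.cluster_resources().get("GPU", 0)) > 0
    return dict(
        num_hosts=len(ray.nodes()),
        num_slots=1,
        use_gpu=use_gpu,
    )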
Example #3
 def setup(self, model: LightningModule):
     """Creates the RayExecutor object."""
     self._model = model
     settings = RayExecutor.create_settings(timeout_s=30)
     self.executor = RayExecutor(settings,
                                 num_hosts=self.num_hosts,
                                 num_slots=self.num_slots,
                                 use_gpu=self.use_gpu)
     self.executor.start(executable_cls=get_executable_cls())
Example #4
 def setup(self, model):
     self.trainer.use_horovod = True
     settings = RayExecutor.create_settings(timeout_s=30)
     self.executor = RayExecutor(settings,
                                 num_hosts=self.num_hosts,
                                 num_slots=self.num_slots,
                                 use_gpu=self.use_gpu)
     self.trainer.model = model
     self.executor.start(executable_cls=get_executable_cls())
Example #5
 def __init__(self, horovod_kwargs, predictor_kwargs):
     # TODO ray: investigate using Dask for prediction instead of Horovod
     setting = RayExecutor.create_settings(timeout_s=30)
     self.executor = RayExecutor(
         setting, **{
             **get_horovod_kwargs(),
             **horovod_kwargs
         })
     self.executor.start(executable_cls=RemotePredictor,
                         executable_kwargs=predictor_kwargs)
Example #6
def main(num_workers,
         use_gpu,
         timeout_s=30,
         placement_group_timeout_s=100,
         kwargs=None):
    kwargs = kwargs or {}
    if use_gpu:
        kwargs["use_cuda"] = True
    settings = RayExecutor.create_settings(
        timeout_s=timeout_s,
        placement_group_timeout_s=placement_group_timeout_s)
    executor = RayExecutor(settings, use_gpu=use_gpu, num_workers=num_workers)
    executor.start()
    executor.run(train_fn, kwargs=kwargs)
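
None of the examples in this listing define `train_fn`; `executor.run` simply invokes it once on every worker. A minimal sketch of such a function, assuming a PyTorch workload (the model and learning rate here are placeholders):

import torch
import horovod.torch as hvd

def train_fn(use_cuda=False):
    hvd.init()  # every Ray actor initializes Horovod first
    if use_cuda:
        torch.cuda.set_device(hvd.local_rank())
    model = torch.nn.Linear(10, 1)  # placeholder model
    if use_cuda:
        model.cuda()
    # Scale the learning rate by the number of workers.
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01 * hvd.size())
    # Average gradients across workers on each step.
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())
    # Start all workers from the same weights.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    # ... the actual training loop would go here ...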
Example #7
    def __init__(self, horovod_kwargs, executable_kwargs):
        # TODO ray: make this more configurable by allowing YAML overrides of timeout_s, etc.
        if RayExecutor is None:
            logger.error(
                "RayLegacyTrainer failed to initialize: RayExecutor is None. Make sure horovod[ray] is installed."
            )
            return
        setting = RayExecutor.create_settings(timeout_s=30)

        self.executor = RayExecutor(
            setting, **{
                **get_horovod_kwargs(),
                **horovod_kwargs
            })
        self.executor.start(executable_cls=HorovodRemoteTrainer,
                            executable_kwargs=executable_kwargs)
Example #8
    def start_executor(self):
        # Ray executor settings
        setting = RayExecutor.create_settings(timeout_s=100)
        num_hosts = 1  # number of machines to use
        num_slots = self.num_slots  # number of workers to use on each machine
        cpus_per_slot = 1  # number of cores to allocate to each worker
        gpus_per_slot = 1  # number of GPUs to allocate to each worker
        use_gpu = gpus_per_slot > 0

        # Start num_hosts * num_slots actors on the cluster
        # https://horovod.readthedocs.io/en/stable/api.html#horovod-ray-api
        executor = RayExecutor(
            setting,
            num_hosts=num_hosts,
            num_slots=num_slots,
            cpus_per_slot=cpus_per_slot,
            gpus_per_slot=gpus_per_slot,
            use_gpu=use_gpu
        )

        # Launch the Ray actors on each machine
        # This will launch `num_slots` actors on each machine
        executor.start()
        return executor
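
A possible call site for the method above (a sketch; `train_fn` stands in for whatever function should be distributed). `executor.run` returns a list with one result per worker.

executor = self.start_executor()
results = executor.run(train_fn)  # one result per worker
executor.shutdown()               # tear the actors down when done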
Example #9
        for batch_idx, data in enumerate(train_loader):
            feature = data[:-1]
            target = data[-1]
            optimizer.zero_grad()
            output = model(*feature)
            loss = F.smooth_l1_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % args.log_interval == 0:
                print('Train Epoch: {} \tLoss: {:.6f}'.format(
                    epoch, loss.item()))

    for epoch in range(1, args.epochs + 1):
        train(epoch)


if __name__ == '__main__':
    # Connect to the Ray cluster (pass address='auto' to attach to an existing one).
    import ray
    import raydp  # needed for raydp.stop_spark() below
    # ray.init(address='auto')
    ray.init()
    torch_ds, num_features = process_data()
    # Start horovod workers on Ray
    from horovod.ray import RayExecutor
    settings = RayExecutor.create_settings(timeout_s=500)
    executor = RayExecutor(settings, num_hosts=1, num_slots=1, cpus_per_slot=1)
    executor.start()
    executor.run(train_fn, args=[torch_ds, num_features])
    raydp.stop_spark()
    ray.shutdown()
Example #10
        # TensorBoard or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=3, initial_lr=scaled_lr, verbose=1),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(
            './checkpoint-{epoch}.h5'))

    # Horovod: write logs on worker 0.
    verbose = 1 if hvd.rank() == 0 else 0

    # Train the model.
    # Horovod: adjust number of steps based on number of GPUs.
    mnist_model.fit(dataset, steps_per_epoch=500 // hvd.size(),
                    callbacks=callbacks, epochs=num_epochs, verbose=verbose)


ray.init()
settings = RayExecutor.create_settings(timeout_s=30)
executor = RayExecutor(settings, num_hosts=1, num_slots=2, use_gpu=False)
executor.start()
executor.run(train, kwargs=dict(num_epochs=1))
executor.shutdown()
Example #11
import socket
import ray

import horovod.tensorflow.keras as hvd
# import horovod.tensorflow as hvd
from horovod.ray import RayExecutor

# Start the Ray cluster or attach to an existing Ray cluster
ray.init(address="auto")

# Ray executor settings
setting = RayExecutor.create_settings(timeout_s=100)
num_hosts = 1  # number of machines to use
num_slots = 6  # number of workers to use on each machine
cpus_per_slot = 1  # number of cores to allocate to each worker
gpus_per_slot = 1  # number of GPUs to allocate to each worker

# Start num_hosts * num_slots actors on the cluster
# https://horovod.readthedocs.io/en/stable/api.html#horovod-ray-api
executor = RayExecutor(setting,
                       num_hosts=num_hosts,
                       num_slots=num_slots,
                       cpus_per_slot=cpus_per_slot,
                       gpus_per_slot=gpus_per_slot,
                       use_gpu=True)

# Launch the Ray actors on each machine
# This will launch `num_slots` actors on each machine
print("Start executor...", end="", flush=True)
executor.start()
print("OK", flush=True)
Example #12
def main(num_workers, use_gpu, **kwargs):
    settings = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(settings, use_gpu=use_gpu, num_workers=num_workers)
    executor.start()  # the executor must be started before run() can be called
    executor.run(train_fn, kwargs=kwargs)