start = time.time()
splits = create_dataset(
    files,
    num_workers=args.num_workers,
    epochs=args.epochs,
    num_windows=args.num_windows)

if args.debug:
    tasks = [
        consume.options(num_gpus=1).remote(
            split, rank=idx, batch_size=args.batch_size)
        for idx, split in enumerate(splits)
    ]
    ray.get(tasks)
else:
    print("Create Ray executor")
    settings = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(
        settings, num_workers=args.num_workers, use_gpu=True)
    executor.start()
    executor.run(train_main, args=[args, splits])
    executor.shutdown()

delta = time.time() - start
print(f"success! total time {delta}")
with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
    f.write(json.dumps({"ingest_time": delta, "success": 1}))
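# The driver script above assumes ``create_dataset``, ``consume``, and
# ``train_main`` are defined elsewhere in the real file. As a rough,
# hedged illustration only (not the actual implementation), a ``consume``
# task compatible with the call pattern above might look like the sketch
# below. The ``iter_batches`` iteration API on the split is an assumption,
# and the sketch reuses the ``ray`` import already present in the script.

@ray.remote
def consume(split, rank=0, batch_size=None):
    # Iterate over the shard assigned to this worker and count rows,
    # simulating an ingest-only pass with no actual training step.
    num_rows = 0
    for batch in split.iter_batches(batch_size=batch_size):
        num_rows += len(batch)
    print(f"Rank {rank} consumed {num_rows} rows.")
    return num_rows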
class HorovodRayAccelerator(HorovodAccelerator):
    """Pytorch Lightning Accelerator for Horovod training on a Ray cluster.

    This accelerator is used to manage distributed training on a Ray
    cluster via the Horovod training framework. Internally, the specified
    number of Ray actors are launched in the cluster and are configured
    as part of the Horovod ring. The Pytorch Lightning trainer is
    instantiated on the driver and sent to each of these training workers
    where training is executed. The distributed training protocol is
    handled by Horovod.

    Each training worker is configured to reserve 1 CPU and, if
    ``use_gpu`` is set to ``True``, 1 GPU.

    If using this accelerator, you should run your code like a normal
    Python script: ``python train.py``, and not with ``horovodrun``.

    Args:
        num_hosts (int): The number of nodes/machines to execute the job on.
        num_slots (int): Number of workers to be placed on each machine.
        use_gpu (bool): Whether to use GPU for allocation. For GPU to be
            used, you must also set the ``gpus`` arg in your Pytorch
            Lightning Trainer to a value > 0.

    Example:

        .. code-block:: python

            import pytorch_lightning as pl
            from ray.util.lightning_accelerators import HorovodRayAccelerator

            ptl_model = MNISTClassifier(...)
            # 2 nodes, 4 workers per node, each using 1 CPU and 1 GPU.
            accelerator = HorovodRayAccelerator(
                num_hosts=2, num_slots=4, use_gpu=True)

            # If using GPUs, set the ``gpus`` arg to a value > 0.
            # The actual number of GPUs is determined by ``num_slots``.
            trainer = pl.Trainer(..., gpus=1, accelerator=accelerator)
            trainer.fit(ptl_model)
    """

    def __init__(self, *args, num_hosts=1, num_slots=1, use_gpu=False,
                 **kwargs):
        super().__init__(*args, trainer=None, **kwargs)
        self.nickname = "horovod_ray"
        self.num_hosts = num_hosts
        self.num_slots = num_slots
        self.use_gpu = use_gpu

    def setup(self, model):
        self.trainer.use_horovod = True
        settings = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(
            settings,
            num_hosts=self.num_hosts,
            num_slots=self.num_slots,
            use_gpu=self.use_gpu)
        self.trainer.model = model
        self.executor.start(executable_cls=get_executable_cls())

    def train(self):
        trainer = self.trainer
        trainer_ref = ray.put(self.trainer)
        self.trainer = None
        results = self.executor.run(self.train_remote, args=[trainer_ref])
        results, state_dict, best_path = results[0]

        self.trainer = trainer
        self.trainer.model.load_state_dict(state_dict)
        if self.trainer.checkpoint_callback:
            self.trainer.checkpoint_callback.best_model_path = best_path

        return results

    def train_remote(self, trainer_ref):
        self.trainer = ray.get(trainer_ref)
        hvd.init()
        if self.trainer.on_gpu:
            # Horovod assigns one local GPU per process.
            self.trainer.root_gpu = hvd.local_rank()

        # TODO: Make changes in PTL to clean this up.
        super(HorovodRayAccelerator, self).setup(self.trainer.model)
        results = super(HorovodRayAccelerator, self).train()

        if hvd.rank() != 0:
            # Only want results from the first worker.
            return None

        best_model_path = None
        if self.trainer.checkpoint_callback is not None:
            best_model_path = self.trainer.checkpoint_callback.best_model_path

        model = self.trainer.model
        return results, model.state_dict(), best_model_path

    def teardown(self):
        self.executor.shutdown()
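# A minimal, self-contained sketch of driving the accelerator above from a
# script submitted to an existing Ray cluster. This is an illustration under
# assumptions, not part of the library: ``BoringModel`` is a placeholder
# LightningModule, the import path follows the docstring example and may
# differ in your codebase, and it assumes a Pytorch Lightning version whose
# ``Trainer(accelerator=...)`` argument accepts an accelerator instance, as
# in the docstring example.

import ray
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader, TensorDataset
from ray.util.lightning_accelerators import HorovodRayAccelerator


class BoringModel(pl.LightningModule):
    """Tiny stand-in model so the example is self-contained."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        x, = batch
        return self(x).sum()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


def main():
    # Connect to a running Ray cluster; the accelerator's RayExecutor
    # places the Horovod workers on the cluster nodes.
    ray.init(address="auto")

    train_loader = DataLoader(
        TensorDataset(torch.randn(64, 32)), batch_size=8)
    accelerator = HorovodRayAccelerator(
        num_hosts=1, num_slots=2, use_gpu=False)
    trainer = pl.Trainer(max_epochs=1, accelerator=accelerator)
    trainer.fit(BoringModel(), train_loader)


if __name__ == "__main__":
    main()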