Example #1
    start = time.time()

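    # Shard the input files into one dataset split per training worker.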
    splits = create_dataset(files,
                            num_workers=args.num_workers,
                            epochs=args.epochs,
                            num_windows=args.num_windows)

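    # Debug path: consume each split directly on a GPU actor instead of
    # running distributed training.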
    if args.debug:
        tasks = [
            consume.options(num_gpus=1).remote(split,
                                               rank=idx,
                                               batch_size=args.batch_size)
            for idx, split in enumerate(splits)
        ]
        ray.get(tasks)
    else:
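        # Normal path: run Horovod distributed training across the Ray cluster.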
        print("Create Ray executor")
        settings = RayExecutor.create_settings(timeout_s=30)
        executor = RayExecutor(settings,
                               num_workers=args.num_workers,
                               use_gpu=True)
        executor.start()
        executor.run(train_main, args=[args, splits])
        executor.shutdown()

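    # Record the total ingest time and write the result to the file named by
    # the TEST_OUTPUT_JSON environment variable.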
    delta = time.time() - start
    print(f"success! total time {delta}")
    with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
        f.write(json.dumps({"ingest_time": delta, "success": 1}))
Example #2
class HorovodRayAccelerator(HorovodAccelerator):
    """Pytorch Lightning Accelerator for Horovod training on a Ray cluster.

    This accelerator is used to manage distributed training on a Ray cluster
    via the Horovod training framework. Internally, the specified number of
    Ray actors are launched in the cluster and are configured as part of the
    Horovod ring. The Pytorch Lightning trainer is instantiated on the
    driver and sent to each of these training workers where training is
    executed. The distributed training protocol is handled by Horovod.

    Each training worker is configured to reserve 1 CPU and 1 GPU if
    ``use_gpu`` is set to ``True``.

    If using this accelerator, you should run your code like a normal Python
    script: ``python train.py``, and not with ``horovodrun``.

    Args:
        num_hosts (int): The number of nodes/machines to execute the job on.
        num_slots (int): Number of workers to be placed on each machine.
        use_gpu (bool): Whether to use GPU for allocation. For GPU to be
            used, you must also set the ``gpus`` arg in your Pytorch Lightning
            Trainer to a value > 0.

    Example:

        .. code-block:: python

            import pytorch_lightning as ptl
            from ray.util.lightning_accelerators import HorovodRayAccelerator

            ptl_model = MNISTClassifier(...)
            # 2 nodes, 4 workers per node, each using 1 CPU and 1 GPU.
            accelerator = HorovodRayAccelerator(num_hosts=2, num_slots=4,
                use_gpu=True)

            # If using GPUs, set the ``gpus`` arg to a value > 0.
            # The actual number of GPUs is determined by ``num_slots``.
            trainer = ptl.Trainer(..., gpus=1, accelerator=accelerator)
            trainer.fit(ptl_model)

    """
    def __init__(self,
                 *args,
                 num_hosts=1,
                 num_slots=1,
                 use_gpu=False,
                 **kwargs):
        super().__init__(*args, trainer=None, **kwargs)
        self.nickname = "horovod_ray"
        self.num_hosts = num_hosts
        self.num_slots = num_slots
        self.use_gpu = use_gpu

    def setup(self, model):
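        # Tell Lightning to use its Horovod training path.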
        self.trainer.use_horovod = True
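        # Create the Horovod-on-Ray executor: ``num_slots`` workers on each of
        # ``num_hosts`` nodes, each optionally reserving a GPU.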
        settings = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(settings,
                                    num_hosts=self.num_hosts,
                                    num_slots=self.num_slots,
                                    use_gpu=self.use_gpu)
        self.trainer.model = model
        self.executor.start(executable_cls=get_executable_cls())

    def train(self):
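        # Stash the trainer in the Ray object store and clear the local
        # reference so the full trainer is not pickled along with ``self``
        # when ``train_remote`` is shipped to the workers.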
        trainer = self.trainer
        trainer_ref = ray.put(self.trainer)
        self.trainer = None
        results = self.executor.run(self.train_remote, args=[trainer_ref])
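        # Only rank 0 returns results; every other worker returns None.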
        results, state_dict, best_path = results[0]

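        # Restore the driver-side trainer and load the weights trained on rank 0.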
        self.trainer = trainer
        self.trainer.model.load_state_dict(state_dict)
        if self.trainer.checkpoint_callback:
            self.trainer.checkpoint_callback.best_model_path = best_path

        return results

    def train_remote(self, trainer_ref):
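        # Executed on each Ray worker: rebuild the trainer from the object
        # store and initialize Horovod for this process.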
        self.trainer = ray.get(trainer_ref)
        hvd.init()
        if self.trainer.on_gpu:
            # Horovod assigns one local GPU per process.
            self.trainer.root_gpu = hvd.local_rank()

        # TODO: Make changes in PTL to clean this up.
        super(HorovodRayAccelerator, self).setup(self.trainer.model)
        results = super(HorovodRayAccelerator, self).train()
        if hvd.rank() != 0:
            # Only want results from the first worker.
            return None

        best_model_path = None
        if self.trainer.checkpoint_callback is not None:
            best_model_path = self.trainer.checkpoint_callback.best_model_path

        model = self.trainer.model
        return results, model.state_dict(), best_model_path

    def teardown(self):
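        # Stop the Ray actors and release the resources they reserved.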
        self.executor.shutdown()