Example #1
def log_sync_template(options=""):
    """Template enabling syncs between driver and worker when possible.
    Requires ray cluster to be started with the autoscaler. Also requires
    rsync to be installed.

    Args:
        options (str): Additional rsync options.

    Returns:
        Sync template with source and target parameters. None if rsync
        unavailable.
    """
    if not distutils.spawn.find_executable("rsync"):
        if log_once("tune:rsync"):
            logger.error("Log sync requires rsync to be installed.")
        return None
    global _log_sync_warned
    ssh_key = get_ssh_key()
    if ssh_key is None:
        if not _log_sync_warned:
            logger.debug("Log sync requires cluster to be setup with "
                         "`ray up`.")
            _log_sync_warned = True
        return None

    rsh = "ssh -i {ssh_key} -o ConnectTimeout=120s -o StrictHostKeyChecking=no"
    rsh = rsh.format(ssh_key=quote(ssh_key))
    template = "rsync {options} -savz -e {rsh} {{source}} {{target}}"
    return template.format(options=options, rsh=quote(rsh))
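For reference, a minimal sketch of how the returned template might be used, modeled on the sync commands in the later examples; the directory and worker address are hypothetical placeholders, not values from the Ray source:

import subprocess
from shlex import quote

# Hypothetical usage sketch: fill the {source}/{target} placeholders left open
# by log_sync_template() and run the resulting rsync command.
template = log_sync_template()
if template is not None:
    final_cmd = template.format(
        source=quote("/tmp/ray_results/my_exp/"),                  # driver-side log dir
        target=quote("ubuntu@10.0.0.2:/tmp/ray_results/my_exp/"),  # worker destination
    )
    subprocess.Popen(final_cmd, shell=True).wait()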
Example #2
    def sync_to_worker_if_possible(self):
        """Syncs the local logdir on driver to worker if possible.

        Requires ray cluster to be started with the autoscaler. Also requires
        rsync to be installed.
        """
        if self.worker_ip == self.local_ip:
            return
        ssh_key = get_ssh_key()
        ssh_user = get_ssh_user()
        global _log_sync_warned
        if ssh_key is None or ssh_user is None:
            if not _log_sync_warned:
                logger.error("Log sync requires cluster to be setup with "
                             "`ray up`.")
                _log_sync_warned = True
            return
        if not distutils.spawn.find_executable("rsync"):
            logger.error("Log sync requires rsync to be installed.")
            return
        source = '{}/'.format(self.local_dir)
        target = '{}@{}:{}/'.format(ssh_user, self.worker_ip, self.local_dir)
        final_cmd = (("""rsync -savz -e "ssh -i {} -o ConnectTimeout=120s """
                      """-o StrictHostKeyChecking=no" {} {}""").format(
                          quote(ssh_key), quote(source), quote(target)))
        logger.info("Syncing results to %s", str(self.worker_ip))
        sync_process = subprocess.Popen(
            final_cmd, shell=True, stdout=self.logfile)
        sync_process.wait()
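The method above is excerpted from its class, so the instance attributes it reads are not shown. A minimal, hypothetical skeleton of that state (attribute names follow the method body; the default values are illustrative):

import sys


class LogSyncerSketch:
    """Hypothetical holder for the attributes sync_to_worker_if_possible reads."""

    def __init__(self, local_dir, local_ip, worker_ip, logfile=sys.stdout):
        self.local_dir = local_dir    # driver-side log directory to push
        self.local_ip = local_ip      # IP address of the driver node
        self.worker_ip = worker_ip    # IP address of the node running the trial
        self.logfile = logfile        # file object that captures rsync output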
Example #3
File: log_sync.py  Project: zhouhh2017/ray
    def sync_to_worker_if_possible(self):
        """Syncs the local logdir on driver to worker if possible.

        Requires ray cluster to be started with the autoscaler. Also requires
        rsync to be installed.
        """
        if self.worker_ip == self.local_ip:
            return
        ssh_key = get_ssh_key()
        ssh_user = get_ssh_user()
        global _log_sync_warned
        if ssh_key is None or ssh_user is None:
            if not _log_sync_warned:
                logger.error("Log sync requires cluster to be setup with "
                             "`ray up`.")
                _log_sync_warned = True
            return
        if not distutils.spawn.find_executable("rsync"):
            logger.error("Log sync requires rsync to be installed.")
            return
        source = "{}/".format(self.local_dir)
        target = "{}@{}:{}/".format(ssh_user, self.worker_ip, self.local_dir)
        final_cmd = (("""rsync -savz -e "ssh -i {} -o ConnectTimeout=120s """
                      """-o StrictHostKeyChecking=no" {} {}""").format(
                          quote(ssh_key), quote(source), quote(target)))
        logger.info("Syncing results to %s", str(self.worker_ip))
        sync_process = subprocess.Popen(final_cmd,
                                        shell=True,
                                        stdout=self.logfile)
        sync_process.wait()
Example #4
    def sync_now(self, force=False):
        self.last_sync_time = time.time()
        if not self.worker_ip:
            logger.debug("Worker ip unknown, skipping log sync for {}".format(
                self.local_dir))
            return

        if self.worker_ip == self.local_ip:
            worker_to_local_sync_cmd = None  # don't need to rsync
        else:
            ssh_key = get_ssh_key()
            ssh_user = get_ssh_user()
            if ssh_key is None or ssh_user is None:
                logger.error("Log sync requires cluster to be setup with "
                             "`ray create_or_update`.")
                return
            if not distutils.spawn.find_executable("rsync"):
                logger.error("Log sync requires rsync to be installed.")
                return
            source = '{}@{}:{}/'.format(ssh_user, self.worker_ip,
                                        self.local_dir)
            target = '{}/'.format(self.local_dir)
            worker_to_local_sync_cmd = ((
                """rsync -savz -e "ssh -i {} -o ConnectTimeout=120s """
                """-o StrictHostKeyChecking=no" {} {}""").format(
                    quote(ssh_key), quote(source), quote(target)))

        if self.remote_dir:
            if self.sync_func:
                local_to_remote_sync_cmd = None
                try:
                    self.sync_func(self.local_dir, self.remote_dir)
                except Exception:
                    logger.exception("Sync function failed.")
            else:
                local_to_remote_sync_cmd = self.get_remote_sync_cmd()
        else:
            local_to_remote_sync_cmd = None

        if self.sync_process:
            self.sync_process.poll()
            if self.sync_process.returncode is None:
                if force:
                    self.sync_process.kill()
                else:
                    logger.warning("Last sync is still in progress, skipping.")
                    return

        if worker_to_local_sync_cmd or local_to_remote_sync_cmd:
            final_cmd = ""
            if worker_to_local_sync_cmd:
                final_cmd += worker_to_local_sync_cmd
            if local_to_remote_sync_cmd:
                if final_cmd:
                    final_cmd += " && "
                final_cmd += local_to_remote_sync_cmd
            logger.debug("Running log sync: {}".format(final_cmd))
            self.sync_process = subprocess.Popen(final_cmd, shell=True)
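Examples #4 and #5 prefer a user-supplied self.sync_func over building a shell command. A minimal sketch of such a callable, assuming only the (local_dir, remote_dir) call signature used above; the gsutil command mirrors the GCS branch in example #6 below and is illustrative rather than the Ray default:

import subprocess
from shlex import quote


def custom_sync_func(local_dir, remote_dir):
    """Hypothetical sync_func compatible with sync_now()'s two-argument call."""
    # Push local results to a remote bucket; the command is illustrative.
    cmd = "gsutil rsync -r {} {}".format(quote(local_dir), quote(remote_dir))
    subprocess.check_call(cmd, shell=True)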
Example #5
File: log_sync.py  Project: jamescasbon/ray
    def sync_now(self, force=False):
        self.last_sync_time = time.time()
        if not self.worker_ip:
            logger.debug("Worker ip unknown, skipping log sync for {}".format(
                self.local_dir))
            return

        if self.worker_ip == self.local_ip:
            worker_to_local_sync_cmd = None  # don't need to rsync
        else:
            ssh_key = get_ssh_key()
            ssh_user = get_ssh_user()
            if ssh_key is None or ssh_user is None:
                logger.error("Log sync requires cluster to be setup with "
                             "`ray create_or_update`.")
                return
            if not distutils.spawn.find_executable("rsync"):
                logger.error("Log sync requires rsync to be installed.")
                return
            source = '{}@{}:{}/'.format(ssh_user, self.worker_ip,
                                        self.local_dir)
            target = '{}/'.format(self.local_dir)
            worker_to_local_sync_cmd = ((
                """rsync -savz -e "ssh -i {} -o ConnectTimeout=120s """
                """-o StrictHostKeyChecking=no" {} {}""").format(
                    quote(ssh_key), quote(source), quote(target)))

        if self.remote_dir:
            if self.sync_func:
                local_to_remote_sync_cmd = None
                try:
                    self.sync_func(self.local_dir, self.remote_dir)
                except Exception:
                    logger.exception("Sync function failed.")
            else:
                local_to_remote_sync_cmd = self.get_remote_sync_cmd()
        else:
            local_to_remote_sync_cmd = None

        if self.sync_process:
            self.sync_process.poll()
            if self.sync_process.returncode is None:
                if force:
                    self.sync_process.kill()
                else:
                    logger.warning("Last sync is still in progress, skipping.")
                    return

        if worker_to_local_sync_cmd or local_to_remote_sync_cmd:
            final_cmd = ""
            if worker_to_local_sync_cmd:
                final_cmd += worker_to_local_sync_cmd
            if local_to_remote_sync_cmd:
                if final_cmd:
                    final_cmd += " && "
                final_cmd += local_to_remote_sync_cmd
            logger.debug("Running log sync: {}".format(final_cmd))
            self.sync_process = subprocess.Popen(final_cmd, shell=True)
Example #6
    def sync_now(self, force=False):
        self.last_sync_time = time.time()
        if not self.worker_ip:
            print("Worker ip unknown, skipping log sync for {}".format(
                self.local_dir))
            return

        if self.worker_ip == self.local_ip:
            worker_to_local_sync_cmd = None  # don't need to rsync
        else:
            ssh_key = get_ssh_key()
            ssh_user = get_ssh_user()
            if ssh_key is None or ssh_user is None:
                print("Error: log sync requires cluster to be setup with "
                      "`ray create_or_update`.")
                return
            if not distutils.spawn.find_executable("rsync"):
                print("Error: log sync requires rsync to be installed.")
                return
            worker_to_local_sync_cmd = ((
                """rsync -avz -e "ssh -i {} -o ConnectTimeout=120s """
                """-o StrictHostKeyChecking=no" '{}@{}:{}/' '{}/'""").format(
                    quote(ssh_key), ssh_user, self.worker_ip,
                    quote(self.local_dir), quote(self.local_dir)))

        if self.remote_dir:
            if self.remote_dir.startswith(S3_PREFIX):
                local_to_remote_sync_cmd = ("aws s3 sync {} {}".format(
                    quote(self.local_dir), quote(self.remote_dir)))
            elif self.remote_dir.startswith(GCS_PREFIX):
                local_to_remote_sync_cmd = ("gsutil rsync -r {} {}".format(
                    quote(self.local_dir), quote(self.remote_dir)))
        else:
            local_to_remote_sync_cmd = None

        if self.sync_process:
            self.sync_process.poll()
            if self.sync_process.returncode is None:
                if force:
                    self.sync_process.kill()
                else:
                    print("Warning: last sync is still in progress, skipping")
                    return

        if worker_to_local_sync_cmd or local_to_remote_sync_cmd:
            final_cmd = ""
            if worker_to_local_sync_cmd:
                final_cmd += worker_to_local_sync_cmd
            if local_to_remote_sync_cmd:
                if final_cmd:
                    final_cmd += " && "
                final_cmd += local_to_remote_sync_cmd
            print("Running log sync: {}".format(final_cmd))
            self.sync_process = subprocess.Popen(final_cmd, shell=True)
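Example #6 references S3_PREFIX and GCS_PREFIX constants that sit outside the excerpt (and it leaves local_to_remote_sync_cmd unset if remote_dir matches neither prefix, presumably relying on upstream validation). Plausible definitions and the branch they drive, stated as an assumption rather than a quote from the source:

# Assumed values for the prefix constants referenced in example #6; the
# excerpt does not define them, so these are inferred from the commands used.
S3_PREFIX = "s3://"   # remote_dir values pushed with `aws s3 sync`
GCS_PREFIX = "gs://"  # remote_dir values pushed with `gsutil rsync -r`


def choose_remote_sync_tool(remote_dir):
    """Illustrative helper mirroring the prefix checks in example #6."""
    if remote_dir.startswith(S3_PREFIX):
        return "aws s3 sync"
    if remote_dir.startswith(GCS_PREFIX):
        return "gsutil rsync -r"
    return None  # unsupported scheme; example #6 assumes this cannot happen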
Example #7
File: log_sync.py  Project: x-malet/ray
def log_sync_template():
    """Syncs the local_dir between driver and worker if possible.

    Requires ray cluster to be started with the autoscaler. Also requires
    rsync to be installed.

    """
    if not distutils.spawn.find_executable("rsync"):
        logger.error("Log sync requires rsync to be installed.")
        return
    global _log_sync_warned
    ssh_key = get_ssh_key()
    if ssh_key is None:
        if not _log_sync_warned:
            logger.error("Log sync requires cluster to be setup with "
                         "`ray up`.")
            _log_sync_warned = True
        return

    return ("""rsync -savz -e "ssh -i {ssh_key} -o ConnectTimeout=120s """
            """-o StrictHostKeyChecking=no" {{source}} {{target}}""").format(
                ssh_key=quote(ssh_key))
Example #8
File: horovod.py  Project: zzmcdc/ray
def DistributedTrainableCreator(
        func: Callable,
        use_gpu: bool = False,
        num_hosts: int = 1,
        num_slots: int = 1,
        num_cpus_per_slot: int = 1,
        timeout_s: int = 30,
        replicate_pem: bool = False) -> Type[_HorovodTrainable]:
    """Converts Horovod functions to be executable by Tune.

    Requires horovod > 0.19 to work.

    This function wraps and sets the resources for a given Horovod
    function to be used with Tune. It generates a Horovod Trainable (trial)
    which can itself be a distributed training job. One basic assumption of
    this implementation is that all sub-workers
    of a trial will be placed evenly across different machines.

    It is recommended that if `num_hosts` per trial > 1, you set
    num_slots == the size (or number of GPUs) of a single host.
    If num_hosts == 1, then you can set num_slots to be <=
    the size (number of GPUs) of a single host.

    The above assumption can be relaxed; please file a feature request
    on GitHub to inform the maintainers.

    Another assumption is that this API requires gloo as the underlying
    communication primitive. You will need to install Horovod with
    `HOROVOD_WITH_GLOO` enabled.

    *Fault Tolerance:* The trial workers themselves are not fault tolerant.
    When a host of a trial fails, all workers of a trial are expected to
    die, and the trial is expected to restart. This currently does not
    support function checkpointing.

    Args:
        func (Callable[[dict], None]): A training function that takes in
            a config dict for hyperparameters and should initialize
            horovod via horovod.init.
        use_gpu (bool): Whether to allocate a GPU per worker.
        num_cpus_per_slot (int): Number of CPUs to request
            from Ray per worker.
        num_hosts (int): Number of hosts that each trial is expected
            to use.
        num_slots (int): Number of slots (workers) to start on each host.
        timeout_s (int): Seconds for Horovod rendezvous to timeout.
        replicate_pem (bool): THIS MAY BE INSECURE. If true, this will
            replicate the underlying Ray cluster ssh key across all hosts.
            This may be useful if using the Ray Autoscaler.


    Returns:
        Trainable class that can be passed into `tune.run`.

    Example:

    .. code-block:: python

        def train(config):
            horovod.init()
            horovod.allreduce()

        from ray.tune.integration.horovod import DistributedTrainableCreator
        trainable_cls = DistributedTrainableCreator(
            train, num_hosts=1, num_slots=2, use_gpu=True)

        tune.run(trainable_cls)

    .. versionadded:: 1.0.0
    """
    ssh_identity_file = None
    sshkeystr = None

    if replicate_pem:
        from ray.tune.cluster_info import get_ssh_key
        ssh_identity_file = get_ssh_key()
        if os.path.exists(ssh_identity_file):
            # For now, we assume that you're on a Ray cluster.
            with open(ssh_identity_file) as f:
                sshkeystr = f.read()

    class WrappedHorovodTrainable(_HorovodTrainable):
        _function = func
        _num_hosts = num_hosts
        _num_slots = num_slots
        _num_cpus_per_slot = num_cpus_per_slot
        _use_gpu = use_gpu
        _ssh_identity_file = ssh_identity_file
        _ssh_str = sshkeystr
        _timeout_s = timeout_s

        @classmethod
        def default_resource_request(cls, config: Dict):
            extra_gpu = int(num_hosts * num_slots) * int(use_gpu)
            extra_cpu = int(num_hosts * num_slots * num_cpus_per_slot)

            return Resources(
                cpu=0,
                gpu=0,
                extra_cpu=extra_cpu,
                extra_gpu=extra_gpu,
            )

    return WrappedHorovodTrainable
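default_resource_request in example #8 keeps cpu/gpu at zero and requests everything as extra resources for the workers the trainable will launch. A quick worked example of that arithmetic with hypothetical trial settings:

# Worked example of the extra-resource arithmetic in example #8, using
# hypothetical settings: 2 hosts, 4 slots per host, 1 CPU per slot, GPUs on.
num_hosts, num_slots, num_cpus_per_slot, use_gpu = 2, 4, 1, True

extra_gpu = int(num_hosts * num_slots) * int(use_gpu)       # 2 * 4 * 1 = 8
extra_cpu = int(num_hosts * num_slots * num_cpus_per_slot)  # 2 * 4 * 1 = 8

print(extra_cpu, extra_gpu)  # -> 8 8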
Example #9
File: horovod.py  Project: krfricke/ray
def DistributedTrainableCreator(
    func: Callable[[Dict], None],
    use_gpu: bool = False,
    num_hosts: Optional[int] = None,
    num_workers: int = 1,
    num_cpus_per_worker: int = 1,
    timeout_s: int = 30,
    replicate_pem: bool = False,
) -> Type[_HorovodTrainable]:
    """Converts Horovod functions to be executable by Tune.

    Requires horovod > 0.19 to work.

    This function wraps and sets the resources for a given Horovod
    function to be used with Tune. It generates a Horovod Trainable (trial)
    which can itself be a distributed training job. One basic assumption of
    this implementation is that all sub-workers
    of a trial will be placed evenly across different machines.

    It is recommended that if `num_hosts` per trial > 1, you set
    num_workers == the size (or number of GPUs) of a single host.
    If num_hosts == 1, then you can set num_workers to be <=
    the size (number of GPUs) of a single host.

    The above assumption can be relaxed; please file a feature request
    on GitHub to inform the maintainers.

    Another assumption is that this API requires gloo as the underlying
    communication primitive. You will need to install Horovod with
    `HOROVOD_WITH_GLOO` enabled.

    *Fault Tolerance:* The trial workers themselves are not fault tolerant.
    When a host of a trial fails, all workers of a trial are expected to
    die, and the trial is expected to restart. This currently does not
    support function checkpointing.

    Args:
        func: A training function that takes in
            a config dict for hyperparameters and should initialize
            horovod via horovod.init.
        use_gpu: Whether to allocate a GPU per worker.
        num_cpus_per_worker: Number of CPUs to request
            from Ray per worker.
        num_hosts: Number of hosts that each trial is expected
            to use.
        num_workers: Number of workers to start on each host.
        timeout_s: Seconds for Horovod rendezvous to timeout.
        replicate_pem: THIS MAY BE INSECURE. If true, this will
            replicate the underlying Ray cluster ssh key across all hosts.
            This may be useful if using the Ray Autoscaler.

    Returns:
        Trainable class that can be passed into `tune.run`.

    Example:

    .. code-block:: python

        def train(config):
            horovod.init()
            horovod.allreduce()

        from ray.tune.integration.horovod import DistributedTrainableCreator
        trainable_cls = DistributedTrainableCreator(
            train, num_hosts=1, num_workers=2, use_gpu=True)

        tune.run(trainable_cls)

    .. versionadded:: 1.0.0
    """
    warnings.warn(
        "Ray Tune's `DistributedTrainableCreator` will be deprecated in Ray "
        "2.0, and will be replaced by Ray AI Runtime (Ray AIR). Ray AIR ("
        "https://docs.ray.io/en/latest/ray-air/getting-started.html) will "
        "provide greater functionality than `DistributedTrainableCreator`, "
        "and with a more flexible and easy-to-use API.",
        PendingDeprecationWarning,
        stacklevel=2,
    )

    ssh_identity_file = None
    sshkeystr = None

    if replicate_pem:
        from ray.tune.cluster_info import get_ssh_key

        ssh_identity_file = get_ssh_key()
        if os.path.exists(ssh_identity_file):
            # For now, we assume that you're on a Ray cluster.
            with open(ssh_identity_file) as f:
                sshkeystr = f.read()

    class WrappedHorovodTrainable(_HorovodTrainable):
        _function = func
        _num_hosts = num_hosts
        _num_workers = num_workers
        _num_cpus_per_worker = num_cpus_per_worker
        _use_gpu = use_gpu
        _ssh_identity_file = ssh_identity_file
        _ssh_str = sshkeystr
        _timeout_s = timeout_s

        @classmethod
        def default_resource_request(cls, config: Dict):
            return PlacementGroupFactory(
                [{}]
                + [{"CPU": cls._num_cpus_per_worker, "GPU": int(use_gpu)}]
                * (num_workers)
            )

    return WrappedHorovodTrainable
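Example #9 expresses the same per-trial reservation as placement group bundles rather than a Resources object; expanding the list arithmetic for hypothetical settings makes the shape explicit:

# Bundle list produced by default_resource_request in example #9 for
# hypothetical settings: 2 workers, 1 CPU per worker, GPUs enabled.
num_workers, num_cpus_per_worker, use_gpu = 2, 1, True

bundles = [{}] + [{"CPU": num_cpus_per_worker, "GPU": int(use_gpu)}] * num_workers
print(bundles)
# -> [{}, {'CPU': 1, 'GPU': 1}, {'CPU': 1, 'GPU': 1}]
# The leading empty bundle covers the coordinating trainable itself (it asks
# for no resources, matching cpu=0/gpu=0 in example #8); each worker bundle
# reserves its own CPU/GPU share.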