def schedule_fold_model_fit(self, model_base, fold_ctx, kwargs):
    # Put the arguments into the object store once so the remote task
    # receives object references rather than copies.
    args = [model_base, fold_ctx, kwargs]
    args_refs = [ray.put(arg) for arg in args]
    print('...model_fit')

    # Reserve a single 2-CPU bundle; STRICT_SPREAD places each bundle
    # on a different node (trivially satisfied with one bundle).
    pg = placement_group([{"CPU": 2}], strategy="STRICT_SPREAD")
    ray.get(pg.ready())
    print(placement_group_table(pg))

    # Schedule the fitting task inside the reserved placement group.
    results_ref = model_fit_task_ray.options(placement_group=pg).remote(
        *args_refs)
    self.jobs.append((results_ref, time_start_fold, on_fit_end_fn))
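For reference, the snippet above relies on surrounding context (an initialized Ray runtime and a @ray.remote fitting task). A minimal self-contained sketch of the same pattern, using a hypothetical fit_task and the same older-style placement_group option as the rest of these examples:

import ray
from ray.util.placement_group import placement_group, placement_group_table

ray.init(num_cpus=4)

@ray.remote(num_cpus=2)
def fit_task(model_base, fold_ctx, kwargs):
    # Stand-in for the real model-fitting work.
    return {"model": model_base, "fold": fold_ctx, **kwargs}

# Reserve the resources first, then schedule the task into the group.
pg = placement_group([{"CPU": 2}], strategy="STRICT_SPREAD")
ray.get(pg.ready())
print(placement_group_table(pg))

result_ref = fit_task.options(placement_group=pg).remote("base", 0, {"epochs": 1})
print(ray.get(result_ref))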
Example #2
def placement_group_factory():
    # cpus_per_actor, gpus_per_actor, resources_per_actor and
    # num_actors are captured from the enclosing scope.
    head_bundle = {"CPU": 1}
    child_bundle = {"CPU": cpus_per_actor, "GPU": gpus_per_actor}
    child_bundle_extra = {} if resources_per_actor is None \
        else resources_per_actor
    child_bundles = [{
        **child_bundle,
        **child_bundle_extra
    } for _ in range(num_actors)]
    bundles = [head_bundle] + child_bundles
    return placement_group(bundles, strategy="PACK")
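A rough usage sketch for a factory like this, assuming the factory above is in scope. The closed-over values below are placeholders chosen so the sketch runs on a small local cluster; bundle 0 is the head bundle and bundles 1..num_actors hold the training actors, pinned via the same older-style placement_group options used throughout these examples:

import ray
from ray.util.placement_group import placement_group

ray.init(num_cpus=8)

# Placeholder values for the variables the factory closes over.
num_actors = 2
cpus_per_actor = 2
gpus_per_actor = 0
resources_per_actor = None

pg = placement_group_factory()
ray.get(pg.ready())

@ray.remote(num_cpus=2)
class TrainingActor:
    def ping(self):
        return "ok"

# Pin each actor to its own child bundle (indexes 1..num_actors).
actors = [
    TrainingActor.options(placement_group=pg,
                          placement_group_bundle_index=i + 1).remote()
    for i in range(num_actors)
]
print(ray.get([a.ping.remote() for a in actors]))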
Example #3
    def get_remote_worker_options(
        num_workers: int,
        num_cpus_per_worker: int,
        num_gpus_per_worker: int,
        num_workers_per_host: Optional[int],
        timeout_s: Optional[int],
    ) -> Tuple[Dict[str, Any], Optional[PlacementGroup]]:
        """Returns the option for remote workers.

        Args:
            num_workers: Number of training workers to include in
                world.
            num_cpus_per_worker: Number of CPU resources to reserve
                per training worker.
            num_gpus_per_worker: Number of GPU resources to reserve
                per training worker.
            num_workers_per_host: Number of workers to colocate per
                host. If set, colocation is enforced with a
                STRICT_SPREAD placement group.
            timeout_s: Seconds before the torch process group
                times out. Useful when machines are unreliable. Defaults
                to 60 seconds. This value is also reused for triggering
                placement timeouts if forcing colocation.

        Returns:
            options: Keyword arguments containing the CPU/GPU counts for
                each remote worker and, if colocation is enforced, the
                placement group to schedule into.
            pg: The placement group used for colocation, or None if
                num_workers_per_host is not set.
        """
        pg = None
        options = dict(num_cpus=num_cpus_per_worker,
                       num_gpus=num_gpus_per_worker)
        if num_workers_per_host:
            num_hosts = int(num_workers / num_workers_per_host)
            cpus_per_node = num_cpus_per_worker * num_workers_per_host
            gpus_per_node = num_gpus_per_worker * num_workers_per_host
            bundle = {"CPU": cpus_per_node, "GPU": gpus_per_node}

            all_bundles = [bundle] * num_hosts
            pg = placement_group(all_bundles, strategy="STRICT_SPREAD")
            logger.debug("Waiting for placement_group to start.")
            ray.get(pg.ready(), timeout=timeout_s)
            logger.debug("Placement_group started.")
            options["placement_group"] = pg

        return options, pg
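A rough sketch of consuming the returned pair. It assumes the function above is reachable at module level (in its original context it is a method without self), that the usual typing and ray.util.placement_group imports are in place, and that a module-level logger exists; the Worker actor and argument values are placeholders:

import logging
import ray
from ray.util.placement_group import remove_placement_group

# The function above logs through a module-level `logger`.
logger = logging.getLogger(__name__)

ray.init(num_cpus=4)

@ray.remote
class Worker:
    def hostname(self):
        import socket
        return socket.gethostname()

options, pg = get_remote_worker_options(
    num_workers=2,
    num_cpus_per_worker=1,
    num_gpus_per_worker=0,
    num_workers_per_host=2,
    timeout_s=60,
)

# `options` carries num_cpus/num_gpus and, since colocation was
# requested, the placement group itself.
workers = [Worker.options(**options).remote() for _ in range(2)]
print(ray.get([w.hostname.remote() for w in workers]))

if pg is not None:
    remove_placement_group(pg)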
Example #4
def test_mpi_with_pg(ray_cluster):
    pg = placement_group(bundles=[{"CPU": 2}], strategy="STRICT_SPREAD")
    with create_mpi_job(job_name="test",
                        world_size=2,
                        num_cpus_per_process=1,
                        num_processes_per_node=2,
                        timeout=5,
                        mpi_type="mpich",
                        placement_group=pg,
                        placement_group_bundle_indexes=[0]) as job:

        def func(context: WorkerContext):
            return context.job_id

        results = job.run(func)
        assert len(results) == 2
        assert results[0] == results[1] == "test"

    remove_placement_group(pg)
Example #5
def placement_group_factory():
    head_bundle = {"CPU": 4, "GPU": 0, "custom": 0}
    child_bundle = {"custom": 1}

    return placement_group([head_bundle, child_bundle, child_bundle])
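A minimal sketch of exercising this factory locally, assuming it is defined in scope. The ray.init arguments are placeholders; the key point is that the cluster must actually advertise the "custom" resource for the child bundles to be satisfiable:

import ray
from ray.util.placement_group import placement_group, placement_group_table

# 4 CPUs cover the head bundle; the custom resource covers the two
# child bundles.
ray.init(num_cpus=4, resources={"custom": 2})

pg = placement_group_factory()
ray.get(pg.ready())
print(placement_group_table(pg))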