Example #1
def test_spread_scheduling_strategy(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    # Create a head node
    cluster.add_node(
        num_cpus=0,
        _system_config={
            "scheduler_spread_threshold": 1,
        },
    )
    ray.init(address=cluster.address)
    for i in range(2):
        cluster.add_node(num_cpus=8, resources={f"foo:{i}": 1})
    cluster.wait_for_nodes()

    with connect_to_client_or_not(connect_to_client):

        @ray.remote
        def get_node_id():
            return ray.worker.global_worker.current_node_id

        worker_node_ids = {
            ray.get(get_node_id.options(resources={
                f"foo:{i}": 1
            }).remote())
            for i in range(2)
        }
        # Wait for the driver raylet's resource view to be updated.
        time.sleep(5)

        @ray.remote(scheduling_strategy=SPREAD_SCHEDULING_STRATEGY)
        def task1():
            internal_kv._internal_kv_put("test_task1", "task1")
            while internal_kv._internal_kv_exists("test_task1"):
                time.sleep(0.1)
            return ray.worker.global_worker.current_node_id

        @ray.remote
        def task2():
            internal_kv._internal_kv_put("test_task2", "task2")
            return ray.worker.global_worker.current_node_id

        locations = []
        locations.append(task1.remote())
        while not internal_kv._internal_kv_exists("test_task1"):
            time.sleep(0.1)
        # Wait for the driver raylet's resource view to be updated.
        time.sleep(5)
        locations.append(
            task2.options(
                scheduling_strategy=SPREAD_SCHEDULING_STRATEGY).remote())
        while not internal_kv._internal_kv_exists("test_task2"):
            time.sleep(0.1)
        internal_kv._internal_kv_del("test_task1")
        internal_kv._internal_kv_del("test_task2")
        assert set(ray.get(locations)) == worker_node_ids
Example #2
    def test_create_upload_once(self, tmp_path, random_dir, ray_start_regular):
        uri = get_uri_for_directory(random_dir)
        uploaded = upload_package_if_needed(uri, tmp_path, random_dir)
        assert uploaded
        assert _internal_kv_exists(uri, namespace=KV_NAMESPACE_PACKAGE)

        uploaded = upload_package_if_needed(uri, tmp_path, random_dir)
        assert not uploaded
        assert _internal_kv_exists(uri, namespace=KV_NAMESPACE_PACKAGE)

        # Delete the URI from the internal_kv. This should trigger re-upload.
        _internal_kv_del(uri, namespace=KV_NAMESPACE_PACKAGE)
        assert not _internal_kv_exists(uri, namespace=KV_NAMESPACE_PACKAGE)
        uploaded = upload_package_if_needed(uri, tmp_path, random_dir)
        assert uploaded
Example #3
    def test_create_upload_once(self, empty_dir, random_dir,
                                ray_start_regular):
        uri = get_uri_for_directory(random_dir)
        uploaded = upload_package_if_needed(uri, empty_dir, random_dir)
        assert uploaded
        assert _internal_kv_exists(uri)

        uploaded = upload_package_if_needed(uri, empty_dir, random_dir)
        assert not uploaded
        assert _internal_kv_exists(uri)

        # Delete the URI from the internal_kv. This should trigger re-upload.
        _internal_kv_del(uri)
        assert not _internal_kv_exists(uri)
        uploaded = upload_package_if_needed(uri, empty_dir, random_dir)
        assert uploaded
Example #4
    def test_upload_succeeds(self, ray_start_regular):
        """Check function behavior when upload succeeds."""

        uri = "gcs://test.zip"
        bytes = b"test"

        assert len(bytes) < GCS_STORAGE_MAX_SIZE
        assert not _internal_kv_exists(uri, namespace=KV_NAMESPACE_PACKAGE)
        assert _store_package_in_gcs(uri, bytes) == len(bytes)
        assert bytes == _internal_kv_get(uri, namespace=KV_NAMESPACE_PACKAGE)
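
The two package tests above (Examples #2 and #4) key the uploaded archive by its URI inside a dedicated internal_kv namespace. Below is a minimal, self-contained sketch of that namespaced put/exists/get/del cycle. The namespace value and URI are placeholders (the tests use Ray's internal KV_NAMESPACE_PACKAGE constant), and the internal_kv helpers are an internal, unstable Ray API whose signatures may differ between versions.

import ray
from ray.experimental.internal_kv import (
    _internal_kv_del,
    _internal_kv_exists,
    _internal_kv_get,
    _internal_kv_put,
)

ray.init()

# Placeholder namespace; the packaging tests above use KV_NAMESPACE_PACKAGE.
NAMESPACE = b"example_package_namespace"
uri = "gcs://example.zip"

_internal_kv_put(uri, b"payload", namespace=NAMESPACE)
assert _internal_kv_exists(uri, namespace=NAMESPACE)
assert _internal_kv_get(uri, namespace=NAMESPACE) == b"payload"

# The same key outside the namespace is a separate entry.
assert not _internal_kv_exists(uri)

_internal_kv_del(uri, namespace=NAMESPACE)
assert not _internal_kv_exists(uri, namespace=NAMESPACE)

ray.shutdown()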
Example #5
 def wait(self, keys: list):
     while True:
         all_exist = True
         for key in keys:
             result = internal_kv._internal_kv_exists(key)
             if not result:
                 all_exist = False
                 break
         if all_exist:
             return True
         time.sleep(1)
Example #6
 def wait(self, keys: list):
     while True:
         all_exist = True
         for key in keys:
             key = self.__concat_key_with_prefixes(key)
             result = internal_kv._internal_kv_exists(key)
             if not result:
                 all_exist = False
                 break
         if all_exist:
             return True
         time.sleep(1)
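
Examples #1, #5, and #6 all use internal_kv as a cross-process signal: one side puts a key and the other polls _internal_kv_exists until it shows up. A minimal, self-contained sketch of that handshake is shown below, assuming a local ray.init(); internal_kv is an internal, unstable Ray API, so treat this as illustrative only.

import time

import ray
from ray.experimental import internal_kv

ray.init()

# Writer side: publish a marker key.
internal_kv._internal_kv_put("test_ready", "1")

# Reader side: poll until the marker is visible.
while not internal_kv._internal_kv_exists("test_ready"):
    time.sleep(0.1)

# Tear down: delete the marker and confirm it is gone.
internal_kv._internal_kv_del("test_ready")
assert not internal_kv._internal_kv_exists("test_ready")

ray.shutdown()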
Example #7
def package_exists(pkg_uri: str) -> bool:
    """Check whether the package with given uri exists or not.

    Args:
        pkg_uri (str): The uri of the package

    Return:
        True for package existing and False for not.
    """
    (protocol, pkg_name) = _parse_uri(pkg_uri)
    if protocol in (Protocol.GCS, Protocol.PIN_GCS):
        return internal_kv._internal_kv_exists(pkg_uri)
    else:
        raise NotImplementedError(f"Protocol {protocol} is not supported")
Example #8
def package_exists(pkg_uri: str) -> bool:
    """Check whether the package with given URI exists or not.

    Args:
        pkg_uri: The uri of the package

    Return:
        True for package existing and False for not.
    """
    protocol, pkg_name = parse_uri(pkg_uri)
    if protocol == Protocol.GCS:
        return _internal_kv_exists(pkg_uri)
    else:
        raise NotImplementedError(f"Protocol {protocol} is not supported")
Example #9
def test_task_failure_when_driver_local_raylet_dies(ray_start_cluster):
    cluster = ray_start_cluster
    head = cluster.add_node(num_cpus=4, resources={"foo": 1})
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote(resources={"foo": 1})
    def func():
        internal_kv._internal_kv_put("test_func", "func")
        while True:
            time.sleep(1)

    func.remote()
    while not internal_kv._internal_kv_exists("test_func"):
        time.sleep(0.1)

    # The lease request should wait inside the raylet
    # since there are no available resources.
    ret = func.remote()
    # Wait for the lease request to reach the raylet.
    time.sleep(1)
    head.kill_raylet()
    with pytest.raises(LocalRayletDiedError):
        ray.get(ret)
Example #10
 def task1():
     internal_kv._internal_kv_put("test_task1", "task1")
     while internal_kv._internal_kv_exists("test_task1"):
         time.sleep(0.1)
     return ray.worker.global_worker.current_node_id
Example #11
 def task1():
     internal_kv._internal_kv_put("test_task1", "task1")
     while internal_kv._internal_kv_exists("test_task1"):
         time.sleep(0.1)
     return ray.get_runtime_context().node_id.hex()
Example #12
def test_spread_scheduling_strategy(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    # Create a head node
    cluster.add_node(
        num_cpus=0,
        _system_config={
            "scheduler_spread_threshold": 1,
        },
    )
    ray.init(address=cluster.address)
    for i in range(2):
        cluster.add_node(num_cpus=8, resources={f"foo:{i}": 1})
    cluster.wait_for_nodes()

    with connect_to_client_or_not(connect_to_client):

        @ray.remote
        def get_node_id():
            return ray.get_runtime_context().node_id.hex()

        worker_node_ids = {
            ray.get(get_node_id.options(resources={
                f"foo:{i}": 1
            }).remote())
            for i in range(2)
        }
        # Wait for the driver raylet's resource view to be updated.
        time.sleep(5)

        @ray.remote(scheduling_strategy="SPREAD")
        def task1():
            internal_kv._internal_kv_put("test_task1", "task1")
            while internal_kv._internal_kv_exists("test_task1"):
                time.sleep(0.1)
            return ray.get_runtime_context().node_id.hex()

        @ray.remote
        def task2():
            internal_kv._internal_kv_put("test_task2", "task2")
            return ray.get_runtime_context().node_id.hex()

        locations = []
        locations.append(task1.remote())
        while not internal_kv._internal_kv_exists("test_task1"):
            time.sleep(0.1)
        # Wait for the driver raylet's resource view to be updated.
        time.sleep(5)
        locations.append(task2.options(scheduling_strategy="SPREAD").remote())
        while not internal_kv._internal_kv_exists("test_task2"):
            time.sleep(0.1)
        internal_kv._internal_kv_del("test_task1")
        internal_kv._internal_kv_del("test_task2")
        assert set(ray.get(locations)) == worker_node_ids

        # Wait for the driver raylet's resource view to be updated.
        time.sleep(5)

        # Make sure actors can be spread as well.
        @ray.remote(num_cpus=1)
        class Actor:
            def ping(self):
                return ray.get_runtime_context().node_id.hex()

        actors = []
        locations = []
        for i in range(8):
            actors.append(Actor.options(scheduling_strategy="SPREAD").remote())
            locations.append(ray.get(actors[-1].ping.remote()))
        locations.sort()
        expected_locations = list(worker_node_ids) * 4
        expected_locations.sort()
        assert locations == expected_locations
Example #13
def test_node_affinity_scheduling_strategy(ray_start_cluster,
                                           connect_to_client):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=8, resources={"head": 1})
    ray.init(address=cluster.address)
    cluster.add_node(num_cpus=8, resources={"worker": 1})
    cluster.wait_for_nodes()

    with connect_to_client_or_not(connect_to_client):

        @ray.remote
        def get_node_id():
            return ray.get_runtime_context().node_id

        head_node_id = ray.get(
            get_node_id.options(num_cpus=0, resources={
                "head": 1
            }).remote())
        worker_node_id = ray.get(
            get_node_id.options(num_cpus=0, resources={
                "worker": 1
            }).remote())

        assert worker_node_id == ray.get(
            get_node_id.options(
                scheduling_strategy=NodeAffinitySchedulingStrategy(
                    worker_node_id, soft=False)).remote())
        assert head_node_id == ray.get(
            get_node_id.options(
                scheduling_strategy=NodeAffinitySchedulingStrategy(
                    head_node_id, soft=False)).remote())

        # Doesn't fail when the node doesn't exist since soft is true.
        ray.get(
            get_node_id.options(
                scheduling_strategy=NodeAffinitySchedulingStrategy(
                    ray.NodeID.from_random().hex(), soft=True)).remote())

        # Doesn't fail when the node is infeasible since soft is true.
        assert worker_node_id == ray.get(
            get_node_id.options(
                scheduling_strategy=NodeAffinitySchedulingStrategy(
                    head_node_id, soft=True),
                resources={
                    "worker": 1
                },
            ).remote())

        # Fail when the node doesn't exist.
        with pytest.raises(ray.exceptions.TaskUnschedulableError):
            ray.get(
                get_node_id.options(
                    scheduling_strategy=NodeAffinitySchedulingStrategy(
                        ray.NodeID.from_random().hex(), soft=False)).remote())

        # Fail when the node is infeasible.
        with pytest.raises(ray.exceptions.TaskUnschedulableError):
            ray.get(
                get_node_id.options(
                    scheduling_strategy=NodeAffinitySchedulingStrategy(
                        head_node_id, soft=False),
                    resources={
                        "not_exist": 1
                    },
                ).remote())

        crashed_worker_node = cluster.add_node(num_cpus=8,
                                               resources={"crashed_worker": 1})
        cluster.wait_for_nodes()
        crashed_worker_node_id = ray.get(
            get_node_id.options(num_cpus=0, resources={
                "crashed_worker": 1
            }).remote())

        @ray.remote(
            max_retries=-1,
            scheduling_strategy=NodeAffinitySchedulingStrategy(
                crashed_worker_node_id, soft=True),
        )
        def crashed_get_node_id():
            if ray.get_runtime_context().node_id == crashed_worker_node_id:
                internal_kv._internal_kv_put("crashed_get_node_id",
                                             "crashed_worker_node_id")
                while True:
                    time.sleep(1)
            else:
                return ray.get_runtime_context().node_id

        r = crashed_get_node_id.remote()
        while not internal_kv._internal_kv_exists("crashed_get_node_id"):
            time.sleep(0.1)
        cluster.remove_node(crashed_worker_node, allow_graceful=False)
        assert ray.get(r) in {head_node_id, worker_node_id}

        @ray.remote(num_cpus=1)
        class Actor:
            def get_node_id(self):
                return ray.get_runtime_context().node_id

        actor = Actor.options(
            scheduling_strategy=NodeAffinitySchedulingStrategy(
                worker_node_id, soft=False)).remote()
        assert worker_node_id == ray.get(actor.get_node_id.remote())

        actor = Actor.options(
            scheduling_strategy=NodeAffinitySchedulingStrategy(
                head_node_id, soft=False)).remote()
        assert head_node_id == ray.get(actor.get_node_id.remote())

        # Wait until the target node becomes available.
        worker_actor = Actor.options(resources={"worker": 1}).remote()
        assert worker_node_id == ray.get(worker_actor.get_node_id.remote())
        actor = Actor.options(
            scheduling_strategy=NodeAffinitySchedulingStrategy(worker_node_id,
                                                               soft=True),
            resources={
                "worker": 1
            },
        ).remote()
        del worker_actor
        assert worker_node_id == ray.get(actor.get_node_id.remote())

        # Doesn't fail when the node doesn't exist since soft is true.
        actor = Actor.options(
            scheduling_strategy=NodeAffinitySchedulingStrategy(
                ray.NodeID.from_random().hex(), soft=True)).remote()
        assert ray.get(actor.get_node_id.remote())

        # Doesn't fail when the node is infeasible since soft is true.
        actor = Actor.options(
            scheduling_strategy=NodeAffinitySchedulingStrategy(head_node_id,
                                                               soft=True),
            resources={
                "worker": 1
            },
        ).remote()
        assert worker_node_id == ray.get(actor.get_node_id.remote())

        # Fail when the node doesn't exist.
        with pytest.raises(ray.exceptions.ActorUnschedulableError):
            actor = Actor.options(
                scheduling_strategy=NodeAffinitySchedulingStrategy(
                    ray.NodeID.from_random().hex(), soft=False)).remote()
            ray.get(actor.get_node_id.remote())

        # Fail when the node is infeasible.
        with pytest.raises(ray.exceptions.ActorUnschedulableError):
            actor = Actor.options(
                scheduling_strategy=NodeAffinitySchedulingStrategy(
                    worker_node_id, soft=False),
                resources={
                    "not_exist": 1
                },
            ).remote()
            ray.get(actor.get_node_id.remote())
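
Example #13 exercises NodeAffinitySchedulingStrategy inside a multi-node test cluster. The sketch below is a minimal, self-contained distillation of the same pattern on a single local node, assuming a plain ray.init(); the soft flag controls whether scheduling falls back to another node or fails outright when the target node cannot run the work.

import ray
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy

ray.init()

@ray.remote
def where_am_i():
    return ray.get_runtime_context().node_id.hex()

# Pin the task to the driver's own node. With soft=False the task fails to
# schedule instead of falling back if that node cannot run it.
my_node_id = ray.get_runtime_context().node_id.hex()
pinned = where_am_i.options(
    scheduling_strategy=NodeAffinitySchedulingStrategy(node_id=my_node_id, soft=False)
).remote()
assert ray.get(pinned) == my_node_id

ray.shutdown()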