Example #1
def test_placement_group_status_no_bundle_demand(ray_start_cluster):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    @ray.remote
    def f():
        pass

    pg = ray.util.placement_group([{"CPU": 1}])
    ray.get(pg.ready())
    ray.util.remove_placement_group(pg)
    wait_for_condition(lambda: is_placement_group_removed(pg))
    # Create a ready task after the placement group is removed.
    # It shouldn't be reported as a resource demand.
    r = pg.ready()  # noqa

    # Wait until the usage is updated, which is
    # when the demand is also updated.
    def is_usage_updated():
        demand_output = get_ray_status_output(cluster.address)
        return demand_output["usage"] != ""

    wait_for_condition(is_usage_updated)
    # The output shouldn't include the pg.ready task demand.
    demand_output = get_ray_status_output(cluster.address)
    assert demand_output["demand"] == "(no resource demands)"
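
These tests lean on a few helpers that are not shown here; in Ray's test suite, wait_for_condition, is_placement_group_removed, and get_ray_status_output come from its internal test utilities. Below is a minimal sketch of what the two placement-group-specific helpers are assumed to do; the exact `ray status` parsing is an assumption for illustration, not the library's actual implementation.

import subprocess

import ray


def is_placement_group_removed(pg):
    # The placement group table reflects the state tracked by the GCS;
    # once removal has completed, the recorded state is "REMOVED".
    table = ray.util.placement_group_table(pg)
    if "state" not in table:
        return False
    return table["state"] == "REMOVED"


def get_ray_status_output(address):
    # Assumed behavior: run `ray status` against the cluster and split the
    # report into its "Usage:" and "Demands:" sections.
    out = subprocess.check_output(["ray", "status", "--address", address]).decode()
    sections = {"usage": [], "demand": []}
    current = None
    for line in out.splitlines():
        if line.strip() == "Usage:":
            current = "usage"
        elif line.strip() == "Demands:":
            current = "demand"
        elif current is not None:
            sections[current].append(line.strip())
    return {key: "\n".join(lines).strip() for key, lines in sections.items()}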
Example #2
def test_placement_group_synchronous_registration(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    # One node which only has one CPU.
    cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        # Create a placement group that has two bundles and the `STRICT_PACK`
        # strategy, so its registration will succeed but its scheduling will
        # fail (a single 1-CPU node cannot pack two 1-CPU bundles).
        placement_group = ray.util.placement_group(
            name="name",
            strategy="STRICT_PACK",
            bundles=[
                {"CPU": 1},
                {"CPU": 1},
            ],
        )
        # Make sure we can properly remove it immediately
        # as its registration is synchronous.
        ray.util.remove_placement_group(placement_group)

        wait_for_condition(lambda: is_placement_group_removed(placement_group))
Example #3
def test_remove_placement_group(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    @ray.remote
    def warmup():
        pass

    # warm up the cluster.
    ray.get([warmup.remote() for _ in range(4)])

    with connect_to_client_or_not(connect_to_client):
        # First try to remove a placement group that doesn't
        # exist. This should not do anything.
        random_group_id = PlacementGroupID.from_random()
        random_placement_group = PlacementGroup(random_group_id)
        for _ in range(3):
            ray.util.remove_placement_group(random_placement_group)

        # Removing a placement group as soon as it is
        # created should work.
        placement_group = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])
        assert placement_group.wait(10)

        ray.util.remove_placement_group(placement_group)

        wait_for_condition(lambda: is_placement_group_removed(placement_group))

        # Now let's create a placement group.
        placement_group = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])
        assert placement_group.wait(10)

        # Create an actor that occupies resources.
        @ray.remote(num_cpus=2)
        class A:
            def f(self):
                return 3

        # Currently, there's no way to prevent tasks from being
        # retried for a removed placement group.
        # Set max_retries=0 for testing.
        # TODO(sang): Handle this edge case.
        @ray.remote(num_cpus=2, max_retries=0)
        def long_running_task():
            print(os.getpid())
            import time

            time.sleep(50)

        # Schedule a long running task and actor.
        task_ref = long_running_task.options(
            placement_group=placement_group).remote()
        a = A.options(placement_group=placement_group).remote()
        assert ray.get(a.f.remote()) == 3

        ray.util.remove_placement_group(placement_group)
        # Subsequent remove request shouldn't do anything.
        for _ in range(3):
            ray.util.remove_placement_group(placement_group)

        # Make sure placement group resources are
        # released and we can schedule this task.
        @ray.remote(num_cpus=4)
        def f():
            return 3

        assert ray.get(f.remote()) == 3
        # Since the placement group is removed,
        # the actor should've been killed.
        # That means this request should fail.
        with pytest.raises(ray.exceptions.RayActorError, match="actor died"):
            ray.get(a.f.remote(), timeout=3.0)
        with pytest.raises(ray.exceptions.WorkerCrashedError):
            ray.get(task_ref)
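
Note that `.options(placement_group=...)`, used in this example, is the older way of pinning work to a placement group; on Ray 2.x the same scheduling is expressed through a scheduling-strategy object. A sketch of the equivalent call for the long-running task, assuming a Ray version that provides PlacementGroupSchedulingStrategy:

from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

# Equivalent scheduling on newer Ray: pass a strategy object instead of the
# placement_group= keyword in .options().
task_ref = long_running_task.options(
    scheduling_strategy=PlacementGroupSchedulingStrategy(
        placement_group=placement_group,
    )
).remote()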