# Example #1
def test_check_bundle_index(ray_start_cluster, connect_to_client):
    """Invalid ``placement_group_bundle_index`` values must raise ValueError."""

    @ray.remote(num_cpus=2)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        two_cpu_bundles = [{"CPU": 2}, {"CPU": 2}]
        placement_group = ray.util.placement_group(
            name="name", strategy="SPREAD", bundles=two_cpu_bundles)

        # Index past the end (only bundles 0 and 1 exist).
        with pytest.raises(ValueError, match="bundle index 3 is invalid"):
            Actor.options(placement_group=placement_group,
                          placement_group_bundle_index=3).remote()

        # Negative indices other than -1 are rejected.
        with pytest.raises(ValueError, match="bundle index -2 is invalid"):
            Actor.options(placement_group=placement_group,
                          placement_group_bundle_index=-2).remote()

        # A bundle index without a placement group makes no sense.
        with pytest.raises(ValueError, match="bundle index must be -1"):
            Actor.options(placement_group_bundle_index=0).remote()

        placement_group_assert_no_leak([placement_group])
def test_placement_group_hang(ray_start_cluster, connect_to_client):
    """A task targeting an initially-infeasible placement group must still
    run once the group is created and becomes feasible."""
    @ray.remote(num_cpus=1)
    def f():
        return ray.worker.get_resource_ids()

    cluster = ray_start_cluster
    num_nodes = 1
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        # Warm workers up, so that this triggers the hang race.
        ray.get(f.remote())

        g1 = ray.util.placement_group([{"CPU": 2}])
        # This will start out infeasible. The placement group will then be
        # created and it transitions to feasible.
        o1 = f.options(placement_group=g1).remote()

        resources = ray.get(o1)
        assert len(resources) == 1, resources
        # The worker should see a placement-group-scoped CPU resource.
        assert "CPU_group_" in list(resources.keys())[0], resources

        placement_group_assert_no_leak([g1])
def test_placement_group_task_resource_ids(ray_start_cluster,
                                           connect_to_client):
    """Tasks in a placement group are assigned group-scoped CPU resources;
    the bundle-indexed resource appears only when an explicit bundle index
    is requested."""

    @ray.remote(num_cpus=1)
    def f():
        return ray.worker.get_resource_ids()

    cluster = ray_start_cluster
    num_nodes = 1
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        g1 = ray.util.placement_group([{"CPU": 2}])

        # Without a bundle index only the wildcard group resource shows up.
        resources = ray.get(f.options(placement_group=g1).remote())
        assert len(resources) == 1, resources
        sole_key = list(resources.keys())[0]
        assert "CPU_group_" in sole_key, resources
        assert "CPU_group_0_" not in sole_key, resources

        # Now retry with a bundle index constraint: both the wildcard and
        # the bundle-indexed resources are assigned.
        resources = ray.get(
            f.options(placement_group=g1,
                      placement_group_bundle_index=0).remote())
        assert len(resources) == 2, resources
        keys = list(resources.keys())
        assert "CPU_group_" in keys[0], resources
        assert "CPU_group_" in keys[1], resources
        assert ("CPU_group_0_" in keys[0]
                or "CPU_group_0_" in keys[1]), resources

        placement_group_assert_no_leak([g1])
# Example #4
def test_placement_group_strict_spread(ray_start_cluster, connect_to_client):
    """Actors bound to distinct bundles of a STRICT_SPREAD group must land
    on three different nodes."""

    @ray.remote(num_cpus=2)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    num_nodes = 3
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name",
            strategy="STRICT_SPREAD",
            bundles=[{
                "CPU": 2
            }, {
                "CPU": 2
            }, {
                "CPU": 2
            }])
        ray.get(placement_group.ready())
        actor_1 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=0).remote()
        actor_2 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=1).remote()
        actor_3 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=2).remote()

        ray.get(actor_1.value.remote())
        ray.get(actor_2.value.remote())
        ray.get(actor_3.value.remote())

        # Get all actors. Use ray._private.state for consistency with the
        # other tests in this file (ray.state is the deprecated alias).
        actor_infos = ray._private.state.actors()

        # Make sure all actors in counter_list are located in separate nodes.
        actor_info_1 = actor_infos.get(actor_1._actor_id.hex())
        actor_info_2 = actor_infos.get(actor_2._actor_id.hex())
        actor_info_3 = actor_infos.get(actor_3._actor_id.hex())

        assert actor_info_1 and actor_info_2 and actor_info_3

        node_of_actor_1 = actor_info_1["Address"]["NodeID"]
        node_of_actor_2 = actor_info_2["Address"]["NodeID"]
        node_of_actor_3 = actor_info_3["Address"]["NodeID"]
        assert node_of_actor_1 != node_of_actor_2
        assert node_of_actor_1 != node_of_actor_3
        assert node_of_actor_2 != node_of_actor_3

        placement_group_assert_no_leak([placement_group])
# Example #5
def test_placement_group_pack(ray_start_cluster, connect_to_client,
                              gcs_actor_scheduling_enabled):
    """Actors in distinct bundles of a PACK group should be collocated on
    one node when the bundles fit together."""

    @ray.remote(num_cpus=2)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    num_nodes = 2
    for i in range(num_nodes):
        # Only the head node (i == 0) carries the system-config override.
        system_config = ({
            "gcs_actor_scheduling_enabled": gcs_actor_scheduling_enabled
        } if i == 0 else {})
        cluster.add_node(num_cpus=4, _system_config=system_config)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name",
            strategy="PACK",
            bundles=[
                {
                    "CPU": 2,
                    "GPU": 0
                },  # Test 0 resource spec doesn't break tests.
                {
                    "CPU": 2
                },
            ],
        )
        ray.get(placement_group.ready())
        actors = [
            Actor.options(placement_group=placement_group,
                          placement_group_bundle_index=index).remote()
            for index in range(2)
        ]
        for actor in actors:
            ray.get(actor.value.remote())

        # Get all actors.
        actor_infos = ray._private.state.actors()

        # Make sure both actors were collocated on the same node.
        info_objs = [actor_infos.get(a._actor_id.hex()) for a in actors]
        assert info_objs[0] and info_objs[1]
        node_ids = [obj["Address"]["NodeID"] for obj in info_objs]
        assert node_ids[0] == node_ids[1]
        placement_group_assert_no_leak([placement_group])
# Example #6
def test_placement_group_strict_pack(ray_start_cluster, connect_to_client):
    """Actors bound to distinct bundles of a STRICT_PACK group must be
    collocated on a single node."""

    @ray.remote(num_cpus=2)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    num_nodes = 2
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name",
            strategy="STRICT_PACK",
            bundles=[
                {
                    "memory": 50 * 1024 *
                    1024,  # Test memory resource spec doesn't break tests.
                    "CPU": 2
                },
                {
                    "CPU": 2
                }
            ])
        ray.get(placement_group.ready())
        actor_1 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=0).remote()
        actor_2 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=1).remote()

        ray.get(actor_1.value.remote())
        ray.get(actor_2.value.remote())

        # Get all actors. Use ray._private.state for consistency with the
        # other tests in this file (ray.state is the deprecated alias).
        actor_infos = ray._private.state.actors()

        # Make sure all actors in counter_list are collocated in one node.
        actor_info_1 = actor_infos.get(actor_1._actor_id.hex())
        actor_info_2 = actor_infos.get(actor_2._actor_id.hex())

        assert actor_info_1 and actor_info_2

        node_of_actor_1 = actor_info_1["Address"]["NodeID"]
        node_of_actor_2 = actor_info_2["Address"]["NodeID"]
        assert node_of_actor_1 == node_of_actor_2

        placement_group_assert_no_leak([placement_group])
# Example #7
def test_placement_group_spread(ray_start_cluster, connect_to_client,
                                gcs_actor_scheduling_enabled):
    """Actors bound to distinct bundles of a spread group must land on
    different nodes.

    NOTE(review): despite the test name, the strategy below is
    STRICT_SPREAD, not SPREAD — confirm this is intentional.
    """
    @ray.remote
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    num_nodes = 2
    for i in range(num_nodes):
        # Only the head node (i == 0) carries the system-config override.
        cluster.add_node(
            num_cpus=4,
            _system_config={
                "gcs_actor_scheduling_enabled": gcs_actor_scheduling_enabled
            } if i == 0 else {},
        )
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name",
            strategy="STRICT_SPREAD",
            bundles=[{
                "CPU": 2
            }, {
                "CPU": 2
            }],
        )
        ray.get(placement_group.ready())
        actors = [
            Actor.options(
                placement_group=placement_group,
                placement_group_bundle_index=i,
                num_cpus=2,
            ).remote() for i in range(num_nodes)
        ]

        # Block until every actor is alive and scheduled.
        [ray.get(actor.value.remote()) for actor in actors]

        # Get all actors.
        actor_infos = ray._private.state.actors()

        # Make sure all actors in counter_list are located in separate nodes.
        actor_info_objs = [
            actor_infos.get(actor._actor_id.hex()) for actor in actors
        ]
        assert are_pairwise_unique(
            [info_obj["Address"]["NodeID"] for info_obj in actor_info_objs])

        placement_group_assert_no_leak([placement_group])
def test_cuda_visible_devices(ray_start_cluster, connect_to_client):
    """A GPU task inside a placement group sees CUDA_VISIBLE_DEVICES set to
    the GPU reserved by its bundle."""

    @ray.remote(num_gpus=1)
    def f():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    cluster = ray_start_cluster
    num_nodes = 1
    for _ in range(num_nodes):
        cluster.add_node(num_gpus=1)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        pg = ray.util.placement_group([{"CPU": 1, "GPU": 1}])
        device_str = ray.get(f.options(placement_group=pg).remote())
        # The node has exactly one GPU, so its index is 0.
        assert device_str == "0", device_str
        placement_group_assert_no_leak([pg])
def test_remove_pending_placement_group(ray_start_cluster, connect_to_client):
    """Removing a still-pending placement group must not wedge the
    scheduler: unrelated tasks stay schedulable afterwards."""
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        # Create a placement group that cannot be scheduled now.
        pending_pg = ray.util.placement_group([{"GPU": 2}, {"CPU": 2}])
        ray.util.remove_placement_group(pending_pg)

        # TODO(sang): Add state check here.
        @ray.remote(num_cpus=4)
        def f():
            return 3

        # Make sure this task is still schedulable.
        assert ray.get(f.remote()) == 3

        placement_group_assert_no_leak([pending_pg])
def test_placement_group_actor_resource_ids(ray_start_cluster,
                                            connect_to_client):
    """An actor scheduled into a placement group sees exactly one
    group-scoped CPU resource id."""

    @ray.remote(num_cpus=1)
    class F:
        def f(self):
            return ray.worker.get_resource_ids()

    cluster = ray_start_cluster
    num_nodes = 1
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        pg = ray.util.placement_group([{"CPU": 2}])
        actor = F.options(placement_group=pg).remote()
        resources = ray.get(actor.f.remote())
        assert len(resources) == 1, resources
        assert "CPU_group_" in list(resources.keys())[0], resources
        placement_group_assert_no_leak([pg])
def test_placement_ready(ray_start_regular, connect_to_client):
    """pg.ready() must resolve even when the bundle's resources are fully
    allocated, because ready() itself requires 0 CPU."""
    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def v(self):
            return 10

    # "bundle" is a reserved placement-group resource name and therefore
    # can't be used as a custom resource inside bundles.
    with pytest.raises(Exception):
        ray.util.placement_group(bundles=[{"bundle": 1}])
    # Even when all resources in the bundle have been allocated (by the
    # actor below), ready() should still return, since ready() uses 0 CPU.
    with connect_to_client_or_not(connect_to_client):
        pg = ray.util.placement_group(bundles=[{"CPU": 1}])
        ray.get(pg.ready())
        a = Actor.options(num_cpus=1, placement_group=pg).remote()
        ray.get(a.v.remote())
        ray.get(pg.ready())

        placement_group_assert_no_leak([pg])
# Example #12
def test_placement_group_reschedule_when_node_dead(ray_start_cluster,
                                                   connect_to_client):
    """After a node hosting placement-group bundles dies, the group should
    be rescheduled so new actors can still be placed on every bundle."""
    @ray.remote(num_cpus=1)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, namespace="default_test_namespace")

    # Make sure both head and worker node are alive.
    nodes = ray.nodes()
    assert len(nodes) == 3
    assert nodes[0]["alive"] and nodes[1]["alive"] and nodes[2]["alive"]

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(name="name",
                                                   strategy="SPREAD",
                                                   bundles=[{
                                                       "CPU": 2
                                                   }, {
                                                       "CPU": 2
                                                   }, {
                                                       "CPU": 2
                                                   }])
        # Detached actors, one per bundle index.
        actor_1 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=0,
            lifetime="detached",
        ).remote()
        actor_2 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=1,
            lifetime="detached",
        ).remote()
        actor_3 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=2,
            lifetime="detached",
        ).remote()
        # Block until all three are alive and scheduled.
        ray.get(actor_1.value.remote())
        ray.get(actor_2.value.remote())
        ray.get(actor_3.value.remote())

        # Kill a non-head node that hosts part of the placement group.
        cluster.remove_node(get_other_nodes(cluster, exclude_head=True)[-1])
        cluster.wait_for_nodes()

        # After the node death, new actors must still be placeable on every
        # bundle index, i.e. the group was rescheduled.
        actor_4 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=0,
            lifetime="detached",
        ).remote()
        actor_5 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=1,
            lifetime="detached",
        ).remote()
        actor_6 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=2,
            lifetime="detached",
        ).remote()
        ray.get(actor_4.value.remote())
        ray.get(actor_5.value.remote())
        ray.get(actor_6.value.remote())
        placement_group_assert_no_leak([placement_group])
        ray.shutdown()
# Example #13
def test_placement_group_stats(ray_start_cluster):
    """Placement-group scheduling stats should track the lifecycle:
    FINISHED for created groups, NO_RESOURCES while pending, and REMOVED
    after removal."""
    cluster = ray_start_cluster
    num_nodes = 1
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4, num_gpus=1)
    ray.init(address=cluster.address)

    # Test createable pgs.
    pg = ray.util.placement_group(bundles=[{"CPU": 4, "GPU": 1}])
    ray.get(pg.ready())
    stats = ray.util.placement_group_table(pg)["stats"]
    assert stats["scheduling_attempt"] == 1
    assert stats["scheduling_state"] == "FINISHED"
    assert stats["end_to_end_creation_latency_ms"] != 0

    # Create a pending pg (the first pg already holds all resources).
    pg2 = ray.util.placement_group(bundles=[{"CPU": 4, "GPU": 1}])

    # NOTE: assert_scheduling_state is intentionally redefined below for
    # each lifecycle phase being checked.
    def assert_scheduling_state():
        stats = ray.util.placement_group_table(pg2)["stats"]
        if stats["scheduling_attempt"] != 1:
            return False
        if stats["scheduling_state"] != "NO_RESOURCES":
            return False
        if stats["end_to_end_creation_latency_ms"] != 0:
            return False
        return True

    wait_for_condition(assert_scheduling_state)

    # Remove the first pg, and the second
    # pg should be schedulable now.
    ray.util.remove_placement_group(pg)

    def assert_scheduling_state():
        stats = ray.util.placement_group_table(pg2)["stats"]
        if stats["scheduling_state"] != "FINISHED":
            return False
        if stats["end_to_end_creation_latency_ms"] == 0:
            return False
        return True

    wait_for_condition(assert_scheduling_state)

    # Infeasible pg (custom resource "a" does not exist on any node).
    pg3 = ray.util.placement_group(bundles=[{"CPU": 4, "a": 1}])
    # TODO This is supposed to be infeasible, but it is printed
    # as NO_RESOURCES. Fix the issue.
    # def assert_scheduling_state():
    #     stats = ray.util.placement_group_table(pg3)["stats"]
    #     print(stats)
    #     if stats["scheduling_state"] != "INFEASIBLE":
    #         return False
    #     return True
    # wait_for_condition(assert_scheduling_state)

    ray.util.remove_placement_group(pg3)

    def assert_scheduling_state():
        stats = ray.util.placement_group_table(pg3)["stats"]
        if stats["scheduling_state"] != "REMOVED":
            return False
        return True

    wait_for_condition(assert_scheduling_state)

    placement_group_assert_no_leak([pg2])
# Example #14
def test_placement_group_table(ray_start_cluster, connect_to_client):
    """placement_group_table() should report name/strategy/bundles/state for
    a single group and list every live group when called without args."""
    @ray.remote(num_cpus=2)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    num_nodes = 2
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)
    pgs_created = []

    with connect_to_client_or_not(connect_to_client):
        # Originally placement group creation should be pending because
        # there are no resources.
        name = "name"
        strategy = "PACK"
        bundles = [{"CPU": 2, "GPU": 1}, {"CPU": 2}]
        placement_group = ray.util.placement_group(name=name,
                                                   strategy=strategy,
                                                   bundles=bundles)
        pgs_created.append(placement_group)
        result = ray.util.placement_group_table(placement_group)
        assert result["name"] == name
        assert result["strategy"] == strategy
        for i in range(len(bundles)):
            assert bundles[i] == result["bundles"][i]
        assert result["state"] == "PENDING"

        # Now the placement group should be scheduled.
        cluster.add_node(num_cpus=5, num_gpus=1)
        cluster.wait_for_nodes()

        actor_1 = Actor.options(placement_group=placement_group,
                                placement_group_bundle_index=0).remote()
        ray.get(actor_1.value.remote())

        result = ray.util.placement_group_table(placement_group)
        assert result["state"] == "CREATED"

        # Add two more placement groups for the placement group table test.
        second_strategy = "SPREAD"
        pgs_created.append(
            ray.util.placement_group(name="second_placement_group",
                                     strategy=second_strategy,
                                     bundles=bundles))
        pgs_created.append(
            ray.util.placement_group(name="third_placement_group",
                                     strategy=second_strategy,
                                     bundles=bundles))

        placement_group_table = ray.util.placement_group_table()
        assert len(placement_group_table) == 3

        # All three groups should be listed by name.
        true_name_set = {
            "name", "second_placement_group", "third_placement_group"
        }
        get_name_set = set()

        for _, placement_group_data in placement_group_table.items():
            get_name_set.add(placement_group_data["name"])

        assert true_name_set == get_name_set

        placement_group_assert_no_leak(pgs_created)
def test_placement_group_invalid_resource_request(shutdown_only):
    """
    Make sure exceptions are raised if
    requested resources don't fit any bundles.
    """
    # The only bundle provides 1 unit of custom resource "a" and no CPU.
    ray.init(resources={"a": 1})
    pg = ray.util.placement_group(bundles=[{"a": 1}])

    #
    # Test an actor with 0 cpu.
    #
    @ray.remote
    class A:
        def ready(self):
            pass

    # The actor cannot be scheduled with the default because
    # it requires 1 cpu for the placement, but the pg doesn't have it.
    with pytest.raises(ValueError):
        a = A.options(placement_group=pg).remote()
    # Shouldn't work with 1 CPU because pg doesn't contain CPUs.
    with pytest.raises(ValueError):
        a = A.options(num_cpus=1, placement_group=pg).remote()
    # 0 CPU should work.
    a = A.options(num_cpus=0, placement_group=pg).remote()
    ray.get(a.ready.remote())
    del a

    #
    # Test an actor with non-0 resources.
    #
    @ray.remote(resources={"a": 1})
    class B:
        def ready(self):
            pass

    # When resources are given to the placement group,
    # it automatically adds 1 CPU to resources, so it should fail.
    with pytest.raises(ValueError):
        b = B.options(placement_group=pg).remote()
    # If 0 cpu is given, it should work.
    b = B.options(num_cpus=0, placement_group=pg).remote()
    ray.get(b.ready.remote())
    del b
    # If resources are requested too much, it shouldn't work.
    with pytest.raises(ValueError):
        # The actor cannot be scheduled with no resource specified.
        # Note that the default actor has 0 cpu.
        B.options(num_cpus=0, resources={"a": 2}, placement_group=pg).remote()

    #
    # Test a function with 1 CPU.
    #
    @ray.remote
    def f():
        pass

    # 1 CPU shouldn't work because the pg doesn't have CPU bundles.
    with pytest.raises(ValueError):
        f.options(placement_group=pg).remote()
    # 0 CPU should work.
    ray.get(f.options(placement_group=pg, num_cpus=0).remote())

    #
    # Test a function with 0 CPU.
    #
    @ray.remote(num_cpus=0)
    def g():
        pass

    # 0 CPU should work.
    ray.get(g.options(placement_group=pg).remote())

    placement_group_assert_no_leak([pg])