# Make sure placement groups are cleaned when detached actors are killed. ray.kill(a, no_restart=False) wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node)) # The detached actor a should've been restarted. # Recreate a placement group. ray.get(a.create_pg.remote()) wait_for_condition(lambda: assert_num_cpus(num_nodes)) # Kill it again and make sure the placement group # that is created is deleted again. ray.kill(a, no_restart=False) wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node)) @pytest.mark.parametrize( "ray_start_cluster_head", [ generate_system_config_map( num_heartbeats_timeout=3, ping_gcs_rpc_server_max_retries=60) ], indirect=True) def test_create_placement_group_after_gcs_server_restart( ray_start_cluster_head): cluster = ray_start_cluster_head cluster.add_node(num_cpus=2) cluster.add_node(num_cpus=2) cluster.wait_for_nodes() # Create placement group 1 successfully. placement_group1 = ray.util.placement_group([{"CPU": 1}, {"CPU": 1}]) ray.get(placement_group1.ready(), timeout=10) table = ray.util.placement_group_table(placement_group1) assert table["state"] == "CREATED"
assert node2.remaining_processes_alive() g.remove_node(node2) g.remove_node(node) assert not any(n.any_processes_alive() for n in [node, node2]) def test_shutdown(): g = Cluster(initialize_head=False) node = g.add_node() node2 = g.add_node() g.shutdown() assert not any(n.any_processes_alive() for n in [node, node2]) @pytest.mark.parametrize("ray_start_cluster_head", [ generate_system_config_map(num_heartbeats_timeout=20, object_timeout_milliseconds=12345) ], indirect=True) def test_system_config(ray_start_cluster_head): """Checks that the internal configuration setting works. We set the cluster to timeout nodes after 2 seconds of no timeouts. We then remove a node, wait for 1 second to check that the cluster is out of sync, then wait another 2 seconds (giving 1 second of leeway) to check that the client has timed out. We also check to see if the config is set. """ cluster = ray_start_cluster_head worker = cluster.add_node() cluster.wait_for_nodes() @ray.remote
def increase(self): self.value += 1 return self.value remote_actor = Actor.remote() assert ray.get(RetryableTask.remote(remote_actor)) == 3 # NOTE(hchen): we set object_timeout_milliseconds to 1s for # this test. Because if this value is too small, suprious task reconstruction # may happen and cause the test fauilure. If the value is too large, this test # could be very slow. We can remove this once we support dynamic timeout. @pytest.mark.parametrize("ray_start_cluster_head", [ generate_system_config_map(object_timeout_milliseconds=1000, num_heartbeats_timeout=10) ], indirect=True) def test_multiple_actor_restart(ray_start_cluster_head): cluster = ray_start_cluster_head # This test can be made more stressful by increasing the numbers below. # The total number of actors created will be # num_actors_at_a_time * num_nodes. num_nodes = 5 num_actors_at_a_time = 3 num_function_calls_at_a_time = 10 worker_nodes = [cluster.add_node(num_cpus=3) for _ in range(num_nodes)] @ray.remote(max_restarts=-1, max_task_retries=-1) class SlowCounter:
@ray.remote
class Increase:
    """Trivial actor used to check actor liveness across a GCS restart."""

    def method(self, x):
        return x + 2


@ray.remote
def increase(x):
    """Trivial remote task counterpart of the `Increase` actor."""
    return x + 1


@pytest.mark.parametrize(
    "ray_start_regular",
    [generate_system_config_map(num_heartbeats_timeout=20)],
    indirect=True)
def test_gcs_server_restart(ray_start_regular):
    """An existing actor keeps working, and new actors can be created,
    after the GCS server is killed and restarted.
    """
    actor1 = Increase.remote()
    result = ray.get(actor1.method.remote(1))
    assert result == 3

    # Restart the GCS server in place and verify the cluster recovers.
    ray.worker._global_node.kill_gcs_server()
    ray.worker._global_node.start_gcs_server()

    # The pre-restart actor must still be reachable.
    result = ray.get(actor1.method.remote(7))
    assert result == 9

    # New actors must be creatable after the restart.
    actor2 = Increase.remote()
    result = ray.get(actor2.method.remote(2))
    assert result == 4