Example #1
        # run evaluations and get the results
        evaluations = [
            Evaluation.remote(update_id) for _ in range(ASYNC_EVALUATIONS)
            ]    
        results = ray.get(
            [evaluation.run.remote() for evaluation in evaluations]
            )      
        # tally the results 
        apprentice_wins = sum(results)
        print(f'the apprentice won {apprentice_wins} games...')
        update = (apprentice_wins/NUM_EVAL_PLAYS) > WIN_RATIO   
        # update if necessary
        if update:
            # increment and get the current update id
            update_id = ray.get(update_signal.set_update_id.remote())
            # block until new alpha parameters are saved, indexed by current 
            # update_id
            ray.get(
                evaluations[0].update_alpha_parameters.remote(update_id)
                )
            # send update signal to self-play actors
            update_signal.send_update.remote()      
        # free up resources by killing the evaluation actors
        for evaluation in evaluations:
            ray.kill(evaluation) 
        # manual garbage collection
        gc.collect()
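
The snippet above is a fragment: the Evaluation actor, the update_signal handle, and the ASYNC_EVALUATIONS / NUM_EVAL_PLAYS / WIN_RATIO constants come from surrounding code that is not shown. A minimal, self-contained sketch of the same create → evaluate → kill pattern, with a stand-in actor rather than the original one, might look like this:

import gc
import ray

ray.init()

@ray.remote
class Evaluation:
    # Stand-in for one evaluation game; returns 1 if the apprentice won.
    def run(self):
        return 1

ASYNC_EVALUATIONS = 4
evaluations = [Evaluation.remote() for _ in range(ASYNC_EVALUATIONS)]
results = ray.get([e.run.remote() for e in evaluations])
print(f"the apprentice won {sum(results)} games")

# Free the actors' resources once the results have been collected.
for e in evaluations:
    ray.kill(e)
gc.collect()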
            
        
        
Example #2
def _kill_http_proxy():
    [http_proxy
     ] = ray.get(serve.api._get_master_actor().get_http_proxy.remote())
    ray.kill(http_proxy)
Example #3
 def run(self):
     while True:
         ray.kill(random.choice(self._get_all_serve_actors()),
                  no_restart=False)
         time.sleep(self.kill_period_s)
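
Several of these examples pass no_restart=False so that Ray is allowed to restart the killed actor. A minimal sketch of that behaviour, using a stand-in Counter actor (an assumption, not taken from any example above):

import ray

ray.init()

@ray.remote(max_restarts=1)
class Counter:
    def __init__(self):
        self.value = 0

    def increment(self):
        self.value += 1
        return self.value

c = Counter.remote()
assert ray.get(c.increment.remote()) == 1

# With no_restart=False, Ray may restart the actor because max_restarts
# permits one restart; its in-memory state is reset on restart.
ray.kill(c, no_restart=False)

# With no_restart=True (the default), the actor would be gone for good and
# further method calls would raise RayActorError.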
Example #4
def test_atomic_creation(ray_start_cluster):
    # Setup cluster.
    cluster = ray_start_cluster
    bundle_cpu_size = 2
    bundle_per_node = 2
    num_nodes = 2

    [
        cluster.add_node(num_cpus=bundle_cpu_size * bundle_per_node)
        for _ in range(num_nodes)
    ]
    ray.init(address=cluster.address)

    @ray.remote(num_cpus=1)
    class NormalActor:
        def ping(self):
            pass

    @ray.remote(num_cpus=3)
    def bothering_task():
        import time
        time.sleep(1)
        return True

    # Schedule tasks to fail initial placement group creation.
    tasks = [bothering_task.remote() for _ in range(2)]
    # Create an actor that will fail bundle scheduling.
    # It is important to use pack strategy to make test less flaky.
    pg = ray.util.placement_group(name="name",
                                  strategy="SPREAD",
                                  bundles=[{
                                      "CPU": bundle_cpu_size
                                  } for _ in range(num_nodes * bundle_per_node)
                                           ])

    # Create a placement group actor.
    # This shouldn't be scheduled because atomic
    # placement group creation should've failed.
    pg_actor = NormalActor.options(
        placement_group=pg,
        placement_group_bundle_index=num_nodes * bundle_per_node - 1).remote()

    # Wait on the placement group now. It should be unready
    # because the normal actor takes resources that are required
    # for creating one of the bundles.
    ready, unready = ray.wait([pg.ready()], timeout=0)
    assert len(ready) == 0
    assert len(unready) == 1
    # Wait until all tasks are done.
    assert all(ray.get(tasks))

    # Wait on the placement group creation. Since resources are now available,
    # it should be ready soon.
    ready, unready = ray.wait([pg.ready()])
    assert len(ready) == 1
    assert len(unready) == 0

    # Confirm that the placement group actor is created. This will
    # raise an exception if the actor was scheduled before the placement
    # group was created, which verifies atomicity.
    ray.get(pg_actor.ping.remote(), timeout=3.0)
    ray.kill(pg_actor)

    # Make sure atomic creation failure didn't impact resources.
    @ray.remote(num_cpus=bundle_cpu_size)
    def resource_check():
        return True

    # These should hang because all resources
    # are claimed by the placement group.
    check_without_pg = [
        resource_check.remote() for _ in range(bundle_per_node * num_nodes)
    ]

    # These should all be scheduled, one per bundle.
    check_with_pg = [
        resource_check.options(placement_group=pg,
                               placement_group_bundle_index=i).remote()
        for i in range(bundle_per_node * num_nodes)
    ]

    # Make sure these are hanging.
    ready, unready = ray.wait(check_without_pg, timeout=0)
    assert len(ready) == 0
    assert len(unready) == bundle_per_node * num_nodes

    # Make sure these are all scheduled.
    assert all(ray.get(check_with_pg))

    ray.util.remove_placement_group(pg)

    def pg_removed():
        return ray.util.placement_group_table(pg)["state"] == "REMOVED"

    wait_for_condition(pg_removed)

    # Make sure the checks without a placement group are all
    # scheduled properly now that the resources are cleaned up.
    assert all(ray.get(check_without_pg))
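
For reference, a minimal sketch of the placement group API exercised above, assuming the same Ray version as these tests (where actors accept a placement_group option); the Worker class and resource sizes are assumptions:

import ray
from ray.util import placement_group, remove_placement_group

ray.init(num_cpus=4)

# Reserve two bundles of 2 CPUs each, packed onto as few nodes as possible.
pg = placement_group([{"CPU": 2}, {"CPU": 2}], strategy="PACK")
ray.get(pg.ready())  # block until all bundles are placed atomically

@ray.remote(num_cpus=2)
class Worker:
    def ping(self):
        return "pong"

# Schedule the actor into the first bundle of the placement group.
w = Worker.options(placement_group=pg, placement_group_bundle_index=0).remote()
print(ray.get(w.ping.remote()))

ray.kill(w)
remove_placement_group(pg)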
Example #5
def _kill_http_proxies(client):
    http_proxies = ray.get(client._controller.get_http_proxies.remote())
    for http_proxy in http_proxies.values():
        ray.kill(http_proxy, no_restart=False)
Example #6
def test_object_unpin(ray_start_cluster):
    nodes = []
    cluster = ray_start_cluster
    head_node = cluster.add_node(num_cpus=0,
                                 object_store_memory=100 * 1024 * 1024,
                                 _system_config={
                                     "num_heartbeats_timeout": 10,
                                     "subscriber_timeout_ms": 100
                                 })
    ray.init(address=cluster.address)

    # Add worker nodes.
    for i in range(2):
        nodes.append(
            cluster.add_node(num_cpus=1,
                             resources={f"node_{i}": 1},
                             object_store_memory=100 * 1024 * 1024))
    cluster.wait_for_nodes()

    one_mb_array = np.ones(1 * 1024 * 1024, dtype=np.uint8)
    ten_mb_array = np.ones(10 * 1024 * 1024, dtype=np.uint8)

    @ray.remote
    class ObjectsHolder:
        def __init__(self):
            self.ten_mb_objs = []
            self.one_mb_objs = []

        def put_10_mb(self):
            self.ten_mb_objs.append(ray.put(ten_mb_array))

        def put_1_mb(self):
            self.one_mb_objs.append(ray.put(one_mb_array))

        def pop_10_mb(self):
            if len(self.ten_mb_objs) == 0:
                return False
            self.ten_mb_objs.pop()
            return True

        def pop_1_mb(self):
            if len(self.one_mb_objs) == 0:
                return False
            self.one_mb_objs.pop()
            return True

    # Head node contains 11MB of data.
    one_mb_arrays = []
    ten_mb_arrays = []

    one_mb_arrays.append(ray.put(one_mb_array))
    ten_mb_arrays.append(ray.put(ten_mb_array))

    def check_memory(mb):
        return (f"Plasma memory usage {mb} MiB" in memory_summary(
            address=head_node.address, stats_only=True))

    def wait_until_node_dead(node):
        for n in ray.nodes():
            if (n["ObjectStoreSocketName"] ==
                    node.address_info["object_store_address"]):
                return not n["Alive"]
        return False

    wait_for_condition(lambda: check_memory(11))

    # Pop one mb array and see if it works.
    one_mb_arrays.pop()
    wait_for_condition(lambda: check_memory(10))

    # Pop 10 MB.
    ten_mb_arrays.pop()
    wait_for_condition(lambda: check_memory(0))

    # Put 11 MB for each actor.
    # actor 1: 1MB + 10MB
    # actor 2: 1MB + 10MB
    actor_on_node_1 = ObjectsHolder.options(resources={"node_0": 1}).remote()
    actor_on_node_2 = ObjectsHolder.options(resources={"node_1": 1}).remote()
    ray.get(actor_on_node_1.put_1_mb.remote())
    ray.get(actor_on_node_1.put_10_mb.remote())
    ray.get(actor_on_node_2.put_1_mb.remote())
    ray.get(actor_on_node_2.put_10_mb.remote())
    wait_for_condition(lambda: check_memory(22))

    # actor 1: 10MB
    # actor 2: 1MB
    ray.get(actor_on_node_1.pop_1_mb.remote())
    ray.get(actor_on_node_2.pop_10_mb.remote())
    wait_for_condition(lambda: check_memory(11))

    # The second node is dead, and actor 2 is dead.
    cluster.remove_node(nodes[1], allow_graceful=False)
    wait_for_condition(lambda: wait_until_node_dead(nodes[1]))
    wait_for_condition(lambda: check_memory(10))

    # The first actor is dead, so its objects should be GC'ed.
    ray.kill(actor_on_node_1)
    wait_for_condition(lambda: check_memory(0))
Example #7
async def router(serve_instance):
    q = ray.remote(Router).remote(serve_instance._controller)
    yield q
    ray.kill(q)
Example #8
def test_actor_pubsub(disable_aiohttp_cache, ray_start_with_dashboard):
    timeout = 5
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    address_info = ray_start_with_dashboard
    address = address_info["redis_address"]
    address = address.split(":")
    assert len(address) == 2

    client = redis.StrictRedis(host=address[0],
                               port=int(address[1]),
                               password=ray_constants.REDIS_DEFAULT_PASSWORD)

    p = client.pubsub(ignore_subscribe_messages=True)
    p.psubscribe(ray.gcs_utils.RAY_ACTOR_PUBSUB_PATTERN)

    @ray.remote
    class DummyActor:
        def __init__(self):
            pass

    # Create a dummy actor.
    a = DummyActor.remote()

    def handle_pub_messages(client, msgs, timeout, expect_num):
        start_time = time.time()
        while time.time() - start_time < timeout and len(msgs) < expect_num:
            msg = client.get_message()
            if msg is None:
                time.sleep(0.01)
                continue
            pubsub_msg = ray.gcs_utils.PubSubMessage.FromString(msg["data"])
            actor_data = ray.gcs_utils.ActorTableData.FromString(
                pubsub_msg.data)
            msgs.append(actor_data)

    msgs = []
    handle_pub_messages(p, msgs, timeout, 2)

    # Assert we received published actor messages with state
    # DEPENDENCIES_UNREADY and ALIVE.
    assert len(msgs) == 2

    # Kill actor.
    ray.kill(a)
    handle_pub_messages(p, msgs, timeout, 3)

    # Assert we received published actor messages with state DEAD.
    assert len(msgs) == 3

    def actor_table_data_to_dict(message):
        return dashboard_utils.message_to_dict(
            message, {
                "actorId", "parentId", "jobId", "workerId", "rayletId",
                "actorCreationDummyObjectId", "callerId", "taskId",
                "parentTaskId", "sourceActorId", "placementGroupId"
            },
            including_default_value_fields=False)

    non_state_keys = ("actorId", "jobId", "taskSpec")
    for msg in msgs:
        actor_data_dict = actor_table_data_to_dict(msg)
        # DEPENDENCIES_UNREADY is 0, which would not be kept in the dict. We
        # need to check its original value.
        if msg.state == 0:
            assert len(actor_data_dict) > 5
            for k in non_state_keys:
                assert k in actor_data_dict
        # For states other than DEPENDENCIES_UNREADY, only the state fields
        # will be published.
        elif actor_data_dict["state"] in ("ALIVE", "DEAD"):
            assert actor_data_dict.keys() == {
                "state", "address", "timestamp", "pid", "creationTaskException"
            }
        else:
            raise Exception("Unknown state: {}".format(
                actor_data_dict["state"]))
Example #9
                    help="start server without blocking")

args = parser.parse_args()

os.makedirs(args.save_path, exist_ok=True)
assert os.path.isdir(args.seesaw_root)

ray.init("auto", namespace="seesaw", log_to_driver=True)

seesaw_root = os.path.abspath(os.path.expanduser(args.seesaw_root))
save_path = os.path.abspath(os.path.expanduser(args.save_path))

actor_name = "session_manager"
try:
    oldh = ray.get_actor(actor_name)
    print(
        "found old session_manager actor, destroying it (old sessions will be lost)"
    )
    ray.kill(oldh)
except:
    pass

session_manager = SessionManagerActor.options(name=actor_name).remote(
    root_dir=seesaw_root,
    save_path=save_path,
    num_cpus_per_session=args.num_cpus)

ray.get(session_manager.ready.remote())

uvicorn.run(app, host="127.0.0.1", port=8000, log_level="info")
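
A minimal, self-contained sketch of the "replace an existing named actor" pattern used above; the Manager class and the namespace are stand-ins, not part of the original script:

import ray

ray.init(namespace="example")

@ray.remote
class Manager:
    def ready(self):
        return True

actor_name = "session_manager"
try:
    old = ray.get_actor(actor_name)
    print("found an old actor, destroying it")
    ray.kill(old)  # force-kill so the name can be registered again
except ValueError:  # raised when no actor exists under this name
    pass

manager = Manager.options(name=actor_name).remote()
ray.get(manager.ready.remote())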
Example #10
 def reset(self):
     for worker in self.remote_workers:
         logger.debug(f"Killing worker {worker}.")
         ray.kill(worker)
     self.remote_workers = []
Example #11
def test_task_level_gc(ray_start_cluster, option):
    """Tests that task-level working_dir is GC'd when the worker exits."""

    cluster = ray_start_cluster

    soft_limit_zero = False
    worker_register_timeout = False
    system_config = cluster.list_all_nodes()[0]._ray_params._system_config
    if "num_workers_soft_limit" in system_config and \
            system_config["num_workers_soft_limit"] == 0:
        soft_limit_zero = True
    if "worker_register_timeout_seconds" in system_config and \
            system_config["worker_register_timeout_seconds"] != 0:
        worker_register_timeout = True

    @ray.remote
    def f():
        import test_module
        test_module.one()

    @ray.remote(num_cpus=1)
    class A:
        def check(self):
            import test_module
            test_module.one()

    if option == "working_dir":
        runtime_env = {"working_dir": S3_PACKAGE_URI}
    else:
        runtime_env = {"py_modules": [S3_PACKAGE_URI]}

    # Note: We should set a bigger timeout if the S3 package downloads slowly.
    get_timeout = 10

    # Start a task with runtime env
    if worker_register_timeout:
        with pytest.raises(GetTimeoutError):
            ray.get(
                f.options(runtime_env=runtime_env).remote(),
                timeout=get_timeout)
    else:
        ray.get(f.options(runtime_env=runtime_env).remote())
    if soft_limit_zero or worker_register_timeout:
        # Wait for the worker to exit and the local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        # Local files should not be GC'd because the soft limit is big enough.
        assert not check_local_files_gced(cluster)

    # Start an actor with the runtime env
    actor = A.options(runtime_env=runtime_env).remote()
    if worker_register_timeout:
        with pytest.raises(GetTimeoutError):
            ray.get(actor.check.remote(), timeout=get_timeout)
        # Wait for the worker to exit and the local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        ray.get(actor.check.remote())
        assert not check_local_files_gced(cluster)

    # Kill actor
    ray.kill(actor)
    if soft_limit_zero or worker_register_timeout:
        # Wait for the worker to exit and the local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        # Local files should not be GC'd because the soft limit is big enough.
        assert not check_local_files_gced(cluster)

    # Start a task with runtime env
    if worker_register_timeout:
        with pytest.raises(GetTimeoutError):
            ray.get(
                f.options(runtime_env=runtime_env).remote(),
                timeout=get_timeout)
    else:
        ray.get(f.options(runtime_env=runtime_env).remote())
    if soft_limit_zero or worker_register_timeout:
        # Wait for the worker to exit and the local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        # Local files should not be GC'd because the soft limit is big enough.
        assert not check_local_files_gced(cluster)
Example #12
 def end_session(self, session_id):
     # the session should die after its reference is removed
     sess = self.sessions[session_id]
     del self.sessions[session_id]
     print(f"ending session {session_id}")
     ray.kill(sess)
Example #13
 def shutdown(self) -> None:
     for proxy in self.get_http_proxy_handles().values():
         ray.kill(proxy, no_restart=True)
Example #14
File: ray.py  Project: cxz/ludwig
 def shutdown(self):
     for handle in self.actor_handles:
         ray.kill(handle)
     self.actor_handles.clear()
Example #15
def test_delete_objects_multi_node(tmp_path, ray_start_cluster):
    # Limit our object store to 75 MiB of memory.
    temp_folder = tmp_path / "spill"
    temp_folder.mkdir()
    cluster = ray_start_cluster
    # Head node.
    cluster.add_node(num_cpus=1,
                     object_store_memory=75 * 1024 * 1024,
                     _system_config={
                         "max_io_workers":
                         2,
                         "min_spilling_size":
                         20 * 1024 * 1024,
                         "automatic_object_spilling_enabled":
                         True,
                         "object_store_full_delay_ms":
                         100,
                         "object_spilling_config":
                         json.dumps({
                             "type": "filesystem",
                             "params": {
                                 "directory_path": str(temp_folder)
                             }
                         }),
                     })
    # Add 2 worker nodes.
    for _ in range(2):
        cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)
    ray.init(address=cluster.address)

    arr = np.random.rand(1024 * 1024)  # 8 MB data

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            self.replay_buffer = []

        def ping(self):
            return

        def create_objects(self):
            for _ in range(80):
                ref = None
                while ref is None:
                    ref = ray.put(arr)
                    self.replay_buffer.append(ref)
                # Remove an element from the replay buffer with 60% probability.
                if random.randint(0, 9) < 6:
                    self.replay_buffer.pop()

            # Do random sampling.
            for _ in range(200):
                ref = random.choice(self.replay_buffer)
                sample = ray.get(ref, timeout=0)
                assert np.array_equal(sample, arr)

    actors = [Actor.remote() for _ in range(3)]
    ray.get([actor.create_objects.remote() for actor in actors])

    def wait_until_actor_dead(actor):
        try:
            ray.get(actor.ping.remote())
        except ray.exceptions.RayActorError:
            return True
        return False

    def is_dir_empty():
        num_files = 0
        for path in temp_folder.iterdir():
            num_files += 1
        return num_files == 0

    # Kill actors to remove all references.
    for actor in actors:
        ray.kill(actor)
        wait_for_condition(lambda: wait_until_actor_dead(actor))
    # The multi node deletion should work.
    wait_for_condition(is_dir_empty)
Example #16
def _kill_router():
    [router] = ray.get(serve.api._get_master_actor().get_router.remote())
    ray.kill(router, no_restart=False)
Example #17
def test_multiple_routers(ray_cluster):
    cluster = ray_cluster
    head_node = cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)

    ray.init(head_node.address)
    node_ids = ray.state.node_ids()
    assert len(node_ids) == 2
    serve.start(http_options=dict(port=8005, location="EveryNode"))

    def get_proxy_names():
        proxy_names = []
        for node_id, _ in get_all_node_ids():
            proxy_names.append(
                format_actor_name(SERVE_PROXY_NAME,
                                  serve.api._global_client._controller_name,
                                  node_id))
        return proxy_names

    wait_for_condition(lambda: len(get_proxy_names()) == 2)
    proxy_names = get_proxy_names()

    # Two actors should be started.
    def get_first_two_actors():
        try:
            ray.get_actor(proxy_names[0])
            ray.get_actor(proxy_names[1])
            return True
        except ValueError:
            return False

    wait_for_condition(get_first_two_actors)

    # Wait for the actors to come up.
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Kill one of the servers; the HTTP server should still function.
    ray.kill(ray.get_actor(get_proxy_names()[0]), no_restart=True)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Add a new node to the cluster. This should trigger a new router to get
    # started.
    new_node = cluster.add_node()

    wait_for_condition(lambda: len(get_proxy_names()) == 3)
    third_proxy = get_proxy_names()[2]

    def get_third_actor():
        try:
            ray.get_actor(third_proxy)
            return True
        # IndexError covers the case where cluster resources aren't updated yet.
        except (IndexError, ValueError):
            return False

    wait_for_condition(get_third_actor)

    # Remove the newly-added node from the cluster. The corresponding actor
    # should be removed as well.
    cluster.remove_node(new_node)

    def third_actor_removed():
        try:
            ray.get_actor(third_proxy)
            return False
        except ValueError:
            return True

    # Check that the actor is gone and the HTTP server still functions.
    wait_for_condition(third_actor_removed)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))
Example #18
 def handle_result(self, results):
     print(results)
     if self.counter == self.fail_on:
         ray.kill(self.worker_group.workers[0].actor)
         time.sleep(3)
     self.counter += 1
Example #19
    def logging_loop(self):
        """
        Keep track of the training performance.
        """
        # Launch the test worker to get performance metrics
        test_worker = self_play.SelfPlay.options(
            num_gpus=self.config.selfplay_num_gpus
            if "cuda" in self.config.selfplay_device
            else 0
        ).remote(
            copy.deepcopy(self.muzero_weights),
            self.Game,
            self.config,
            self.config.seed + self.config.num_workers,
        )
        test_worker.continuous_self_play.remote(self.shared_storage_worker, None, True)

        # Write everything in TensorBoard
        writer = SummaryWriter(self.config.results_path)

        print(
            "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n"
        )

        # Save hyperparameters to TensorBoard
        hp_table = [
            f"| {key} | {value} |" for key, value in self.config.__dict__.items()
        ]
        writer.add_text(
            "Hyperparameters",
            "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table),
        )
        # Save model representation
        writer.add_text(
            "Model summary", self.summary,
        )
        # Loop for updating the training performance
        counter = 0
        info = ray.get(self.shared_storage_worker.get_info.remote())
        try:
            while info["training_step"] < self.config.training_steps:
                info = ray.get(self.shared_storage_worker.get_info.remote())
                writer.add_scalar(
                    "1.Total reward/1.Total reward", info["total_reward"], counter,
                )
                writer.add_scalar(
                    "1.Total reward/2.Mean value", info["mean_value"], counter,
                )
                writer.add_scalar(
                    "1.Total reward/3.Episode length", info["episode_length"], counter,
                )
                writer.add_scalar(
                    "1.Total reward/4.MuZero reward", info["muzero_reward"], counter,
                )
                writer.add_scalar(
                    "1.Total reward/5.Opponent reward",
                    info["opponent_reward"],
                    counter,
                )
                writer.add_scalar(
                    "2.Workers/1.Self played games", info["num_played_games"], counter,
                )
                writer.add_scalar(
                    "2.Workers/2.Training steps", info["training_step"], counter
                )
                writer.add_scalar(
                    "2.Workers/3.Self played steps", info["num_played_steps"], counter
                )
                writer.add_scalar(
                    "2.Workers/4.Reanalysed games",
                    info["num_reanalysed_games"],
                    counter,
                )
                writer.add_scalar(
                    "2.Workers/5.Training steps per self played step ratio",
                    info["training_step"] / max(1, info["num_played_steps"]),
                    counter,
                )
                writer.add_scalar("2.Workers/6.Learning rate", info["lr"], counter)
                writer.add_scalar(
                    "3.Loss/1.Total weighted loss", info["total_loss"], counter
                )
                writer.add_scalar("3.Loss/Value loss", info["value_loss"], counter)
                writer.add_scalar("3.Loss/Reward loss", info["reward_loss"], counter)
                writer.add_scalar("3.Loss/Policy loss", info["policy_loss"], counter)
                print(
                    f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. Loss: {info["total_loss"]:.2f}',
                    end="\r",
                )
                counter += 1
                time.sleep(0.5)
        except KeyboardInterrupt:
            ray.kill(test_worker)
            ray.kill(self.training_worker)
            for worker in self.self_play_workers:
                ray.kill(worker)

        self.muzero_weights = ray.get(self.shared_storage_worker.get_weights.remote())
        self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote())

        if self.config.save_weights:
            # Persist replay buffer to disk
            print("\n\nPersisting replay buffer games to disk...")
            pickle.dump(
                self.replay_buffer,
                open(os.path.join(self.config.results_path, "replay_buffer.pkl"), "wb"),
            )
Example #20
def test_recover_start_from_replica_actor_names(serve_instance):
    """Test controller is able to recover starting -> running replicas from
    actor names.
    """
    # Test failed to deploy with total of 2 replicas,
    # but first constructor call fails.
    @serve.deployment(name="recover_start_from_replica_actor_names",
                      num_replicas=2)
    class TransientConstructorFailureDeployment:
        def __init__(self):
            return True

        def __call__(self, *args):
            return "hii"

    TransientConstructorFailureDeployment.deploy()
    for _ in range(10):
        response = request_with_retries(
            "/recover_start_from_replica_actor_names/", timeout=30)
        assert response.text == "hii"
    # Assert 2 replicas are running in the deployment after a partially
    # successful deploy() call with a transient error.
    deployment_dict = ray.get(
        serve_instance._controller._all_running_replicas.remote())
    assert len(deployment_dict["recover_start_from_replica_actor_names"]) == 2

    replica_version_hash = None
    for replica in deployment_dict["recover_start_from_replica_actor_names"]:
        ref = replica.actor_handle.get_metadata.remote()
        _, version = ray.get(ref)
        if replica_version_hash is None:
            replica_version_hash = hash(version)
        assert replica_version_hash == hash(version), (
            "Replica version hash should be the same for "
            "same code version and user config.")

    # Sample: [
    # 'TransientConstructorFailureDeployment#xlituP',
    # 'SERVE_CONTROLLER_ACTOR',
    # 'TransientConstructorFailureDeployment#NosHNA',
    # 'SERVE_CONTROLLER_ACTOR:SERVE_PROXY_ACTOR-node:192.168.86.165-0']
    all_actor_names = ray.util.list_named_actors()
    all_replica_names = [
        actor_name for actor_name in all_actor_names
        if (SERVE_CONTROLLER_NAME not in actor_name
            and SERVE_PROXY_NAME not in actor_name)
    ]
    assert (len(all_replica_names) == 2
            ), "Should have two running replicas fetched from ray API."

    # Kill controller and wait for endpoint to be available again
    ray.kill(serve.context._global_client._controller, no_restart=False)
    for _ in range(10):
        response = request_with_retries(
            "/recover_start_from_replica_actor_names/", timeout=30)
        assert response.text == "hii"

    # Ensure recovered replica names are the same
    recovered_all_actor_names = ray.util.list_named_actors()
    recovered_all_replica_names = [
        actor_name for actor_name in recovered_all_actor_names
        if (SERVE_CONTROLLER_NAME not in actor_name
            and SERVE_PROXY_NAME not in actor_name)
    ]
    assert (recovered_all_replica_names == all_replica_names
            ), "Running replica actor names after recovery must match"

    # Ensure recovered replica version hashes are the same
    for replica_name in recovered_all_replica_names:
        actor_handle = ray.get_actor(replica_name)
        ref = actor_handle.get_metadata.remote()
        _, version = ray.get(ref)
        assert replica_version_hash == hash(
            version
        ), "Replica version hash should be the same after recovering from actor names"
Example #21
def test_automatic_cleanup_detached_actors(ray_start_cluster):
    # Make sure the placement groups created by a
    # detached actor are cleaned up properly.
    cluster = ray_start_cluster
    num_nodes = 3
    num_cpu_per_node = 2
    # Create 3 nodes cluster.
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_cpu_per_node)

    info = ray.init(address=cluster.address)
    available_cpus = ray.available_resources()["CPU"]
    assert available_cpus == num_nodes * num_cpu_per_node

    driver_code = f"""
import ray

ray.init(address="{info["redis_address"]}")

def create_pg():
    pg = ray.util.placement_group(
            [{{"CPU": 1}} for _ in range(3)],
            strategy="STRICT_SPREAD")
    ray.get(pg.ready())
    return pg

# TODO(sang): Placement groups created by tasks launched by a detached actor
# are not cleaned up with the current protocol.
# @ray.remote(num_cpus=0)
# def f():
#     create_pg()

@ray.remote(num_cpus=0, max_restarts=1)
class A:
    def create_pg(self):
        create_pg()
    def create_child_pg(self):
        self.a = A.options(name="B").remote()
        ray.get(self.a.create_pg.remote())
    def kill_child_actor(self):
        ray.kill(self.a)
        try:
            ray.get(self.a.create_pg.remote())
        except Exception:
            pass

a = A.options(lifetime="detached", name="A").remote()
ray.get(a.create_pg.remote())
# TODO(sang): Currently, child tasks are cleaned when a detached actor
# is dead. We cannot test this scenario until it is fixed.
# ray.get(a.create_child_pg.remote())

ray.shutdown()
    """

    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        jobs = ray.jobs()
        for job in jobs:
            if "StopTime" in job:
                return True
        return False

    def assert_num_cpus(expected_num_cpus):
        if expected_num_cpus == 0:
            return "CPU" not in ray.available_resources()
        return ray.available_resources()["CPU"] == expected_num_cpus

    wait_for_condition(is_job_done)
    assert assert_num_cpus(num_nodes)
    # Make sure when a child actor spawned by a detached actor
    # is killed, the placement group is removed.
    a = ray.get_actor("A")
    # TODO(sang): children of detached actors
    # seem to be killed when jobs are done. We should fix this before
    # testing this scenario.
    # ray.get(a.kill_child_actor.remote())
    # assert assert_num_cpus(num_nodes)

    # Make sure placement groups are cleaned when detached actors are killed.
    ray.kill(a, no_restart=False)
    wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
    # The detached actor a should've been restarted.
    # Recreate a placement group.
    ray.get(a.create_pg.remote())
    wait_for_condition(lambda: assert_num_cpus(num_nodes))
    # Kill it again and make sure the placement group
    # that is created is deleted again.
    ray.kill(a, no_restart=False)
    wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
Example #22
def test_recover_rolling_update_from_replica_actor_names(serve_instance):
    """Test controller is able to recover starting -> updating -> running
    replicas from actor names, with right replica versions during rolling
    update.
    """
    client = serve_instance

    name = "test"

    @ray.remote(num_cpus=0)
    def call(block=False):
        handle = serve.get_deployment(name).get_handle()
        ret = ray.get(handle.handler.remote(block))

        return ret.split("|")[0], ret.split("|")[1]

    signal_name = f"signal#{get_random_letters()}"
    signal = SignalActor.options(name=signal_name).remote()

    @serve.deployment(name=name, version="1", num_replicas=2)
    class V1:
        async def handler(self, block: bool):
            if block:
                signal = ray.get_actor(signal_name)
                await signal.wait.remote()

            return f"1|{os.getpid()}"

        async def __call__(self, request):
            return await self.handler(request.query_params["block"] == "True")

    class V2:
        async def handler(self, *args):
            return f"2|{os.getpid()}"

        async def __call__(self, request):
            return await self.handler()

    def make_nonblocking_calls(expected, expect_blocking=False, num_returns=1):
        # Returns dict[val, set(pid)].
        blocking = []
        responses = defaultdict(set)
        start = time.time()
        timeout_value = 60 if sys.platform == "win32" else 30
        while time.time() - start < timeout_value:
            refs = [call.remote(block=False) for _ in range(10)]
            ready, not_ready = ray.wait(refs,
                                        timeout=5,
                                        num_returns=num_returns)
            for ref in ready:
                val, pid = ray.get(ref)
                responses[val].add(pid)
            for ref in not_ready:
                blocking.extend(not_ready)

            if all(
                    len(responses[val]) >= num
                    for val, num in expected.items()) and (
                        expect_blocking is False or len(blocking) > 0):
                break
        else:
            assert False, f"Timed out, responses: {responses}."

        return responses, blocking

    V1.deploy()
    responses1, _ = make_nonblocking_calls({"1": 2}, num_returns=2)
    pids1 = responses1["1"]

    # ref2 will block a single replica until the signal is sent. Check that
    # some requests are now blocking.
    ref2 = call.remote(block=True)
    responses2, blocking2 = make_nonblocking_calls({"1": 1},
                                                   expect_blocking=True)
    assert list(responses2["1"])[0] in pids1

    ray.kill(serve.context._global_client._controller, no_restart=False)

    # Redeploy new version. Since there is one replica blocking, only one new
    # replica should be started up.
    V2 = V1.options(func_or_class=V2, version="2")
    V2.deploy(_blocking=False)
    with pytest.raises(TimeoutError):
        client._wait_for_deployment_healthy(V2.name, timeout_s=0.1)
    responses3, blocking3 = make_nonblocking_calls({"1": 1},
                                                   expect_blocking=True)

    ray.kill(serve.context._global_client._controller, no_restart=False)

    # Signal the original call to exit.
    ray.get(signal.send.remote())
    val, pid = ray.get(ref2)
    assert val == "1"
    assert pid in responses1["1"]

    # Now the goal and requests to the new version should complete.
    # We should have two running replicas of the new version.
    client._wait_for_deployment_healthy(V2.name)
    make_nonblocking_calls({"2": 2}, num_returns=2)
Example #23
def test_capture_child_actors(ray_start_cluster):
    cluster = ray_start_cluster
    total_num_actors = 4
    for _ in range(2):
        cluster.add_node(num_cpus=total_num_actors)
    ray.init(address=cluster.address)

    pg = ray.util.placement_group([{
        "CPU": 2
    }, {
        "CPU": 2
    }],
                                  strategy="STRICT_PACK")
    ray.get(pg.ready())

    # If get_current_placement_group is used when the current worker/driver
    # doesn't belong to any of placement group, it should return None.
    assert get_current_placement_group() is None

    # Test actors first.
    @ray.remote(num_cpus=1)
    class NestedActor:
        def ready(self):
            return True

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            self.actors = []

        def ready(self):
            return True

        def schedule_nested_actor(self):
            # Make sure we can capture the current placement group.
            assert get_current_placement_group() is not None
            # Actors should be implicitly captured.
            actor = NestedActor.remote()
            ray.get(actor.ready.remote())
            self.actors.append(actor)

        def schedule_nested_actor_outside_pg(self):
            # Don't use placement group.
            actor = NestedActor.options(placement_group=None).remote()
            ray.get(actor.ready.remote())
            self.actors.append(actor)

    a = Actor.options(placement_group=pg).remote()
    ray.get(a.ready.remote())
    # 1 top level actor + 3 children.
    for _ in range(total_num_actors - 1):
        ray.get(a.schedule_nested_actor.remote())
    # Make sure all the actors are scheduled on the same node,
    # because the placement group uses the STRICT_PACK strategy.
    node_id_set = set()
    for actor_info in ray.actors().values():
        node_id = actor_info["Address"]["NodeID"]
        node_id_set.add(node_id)

    # Since all node ids should be identical, the size of the set should be 1.
    assert len(node_id_set) == 1

    # Kill an actor and wait until it is killed.
    ray.kill(a)
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(a.ready.remote())

    # Now create an actor, but do not capture child tasks.
    a = Actor.options(placement_group=pg,
                      placement_group_capture_child_tasks=False).remote()
    ray.get(a.ready.remote())
    # 1 top level actor + 3 children.
    for _ in range(total_num_actors - 1):
        ray.get(a.schedule_nested_actor.remote())
    # Make sure the actors are not all scheduled on the same node,
    # because the child tasks are not scheduled in the same
    # placement group.
    node_id_set = set()
    for actor_info in ray.actors().values():
        node_id = actor_info["Address"]["NodeID"]
        node_id_set.add(node_id)

    assert len(node_id_set) == 2

    # Kill an actor and wait until it is killed.
    ray.kill(a)
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(a.ready.remote())

    # Lastly, make sure when None is specified, actors are not scheduled
    # on the same placement group.
    a = Actor.options(placement_group=pg).remote()
    ray.get(a.ready.remote())
    # 1 top level actor + 3 children.
    for _ in range(total_num_actors - 1):
        ray.get(a.schedule_nested_actor_outside_pg.remote())
    # Make sure the actors are not all scheduled on the same node,
    # because the child tasks are not scheduled in the same
    # placement group.
    node_id_set = set()
    for actor_info in ray.actors().values():
        node_id = actor_info["Address"]["NodeID"]
        node_id_set.add(node_id)

    assert len(node_id_set) == 2
Example #24
    def run(self,
            worker_fn: Callable,
            callbacks: Optional[List[Callable]] = None) -> List[Any]:
        """Executes the provided function on all workers.

        Args:
            worker_fn: Target elastic function that can be executed.
            callbacks: List of callables. Each callback must either
                be a callable function or a class that implements __call__.
                Every callback will be invoked on every value logged
                by the rank 0 worker.

        Returns:
            List of return values from every completed worker.
        """
        return_values = []
        from ray.util.queue import Queue
        import inspect
        args = inspect.getfullargspec(Queue).args
        if "actor_options" not in args:
            # Ray 1.1 and earlier
            _queue = Queue()
        else:
            _queue = Queue(actor_options={
                "num_cpus": 0,
                "resources": {
                    ray.state.current_node_id(): 0.001
                }
            })
        self.driver.start(
            self.settings.num_proc,
            self._create_spawn_worker_fn(return_values, worker_fn, _queue))

        def _process_calls(queue, callbacks, event):
            if not callbacks:
                return
            while queue.actor:
                if not queue.empty():
                    result = queue.get_nowait()
                    for c in callbacks:
                        c(result)
                    # avoid slamming the CI
                elif event.is_set():
                    break
                time.sleep(0.1)

        try:
            event = threading.Event()
            _callback_thread = threading.Thread(target=_process_calls,
                                                args=(_queue, callbacks,
                                                      event),
                                                daemon=True)
            _callback_thread.start()
            res = self.driver.get_results()
            event.set()
            if _callback_thread:
                _callback_thread.join(timeout=60)
        finally:
            if hasattr(_queue, "shutdown"):
                _queue.shutdown()
            else:
                done_ref = _queue.actor.__ray_terminate__.remote()
                done, not_done = ray.wait([done_ref], timeout=5)
                if not_done:
                    ray.kill(_queue.actor)
        self.driver.stop()

        if res.error_message is not None:
            raise RuntimeError(res.error_message)

        for name, value in sorted(res.worker_results.items(),
                                  key=lambda item: item[1][1]):
            exit_code, timestamp = value
            if exit_code != 0:
                raise RuntimeError(
                    'Horovod detected that one or more processes '
                    'exited with non-zero '
                    'status, thus causing the job to be terminated. '
                    'The first process '
                    'to do so was:\nProcess name: {name}\nExit code: {code}\n'.
                    format(name=name, code=exit_code))

        return_values = [
            value for k, value in sorted(return_values, key=lambda kv: kv[0])
        ]
        return return_values
Example #25
def test_worker_replica_failure(serve_instance):
    serve.http_proxy.MAX_ACTOR_DEAD_RETRIES = 0
    serve.init()
    serve.create_endpoint("replica_failure",
                          "/replica_failure",
                          methods=["GET"])

    class Worker:
        # Assumes that two replicas are started. Will hang forever in the
        # constructor for any workers that are restarted.
        def __init__(self, path):
            self.should_hang = False
            if not os.path.exists(path):
                with open(path, "w") as f:
                    f.write("1")
            else:
                with open(path, "r") as f:
                    num = int(f.read())

                with open(path, "w") as f:
                    if num == 2:
                        self.should_hang = True
                    else:
                        f.write(str(num + 1))

            if self.should_hang:
                while True:
                    pass

        def __call__(self):
            pass

    temp_path = tempfile.gettempdir() + "/" + serve.utils.get_random_letters()
    serve.create_backend(Worker, "replica_failure", temp_path)
    backend_config = serve.get_backend_config("replica_failure")
    backend_config.num_replicas = 2
    serve.set_backend_config("replica_failure", backend_config)
    serve.link("replica_failure", "replica_failure")

    # Wait until both replicas have been started.
    responses = set()
    while len(responses) < 2:
        responses.add(
            request_with_retries("/replica_failure", timeout=0.1).text)
        time.sleep(0.1)

    # Kill one of the replicas.
    handles = _get_worker_handles("replica_failure")
    assert len(handles) == 2
    ray.kill(handles[0])

    # Check that the other replica still serves requests.
    for _ in range(10):
        while True:
            try:
                # The timeout needs to be small here because the request to
                # the restarting worker will hang.
                request_with_retries("/replica_failure", timeout=0.1)
                break
            except TimeoutError:
                time.sleep(0.1)
Example #26
def _kill_routers():
    routers = ray.get(serve.api._get_controller().get_routers.remote())
    for router in routers.values():
        ray.kill(router, no_restart=False)
Example #27
def _kill_router():
    [router] = ray.get(serve.api._get_master_actor().get_router.remote())
    ray.kill(router)
Example #28
def test_detached_placement_group(ray_start_cluster):
    cluster = ray_start_cluster
    for _ in range(2):
        cluster.add_node(num_cpus=3)
    cluster.wait_for_nodes()
    info = ray.init(address=cluster.address)

    # Make sure the detached placement group stays alive when the job is dead.
    driver_code = f"""
import ray

ray.init(address="{info["redis_address"]}")

pg = ray.util.placement_group(
        [{{"CPU": 1}} for _ in range(2)],
        strategy="STRICT_SPREAD", lifetime="detached")
ray.get(pg.ready())

@ray.remote(num_cpus=1)
class Actor:
    def ready(self):
        return True

for bundle_index in range(2):
    actor = Actor.options(lifetime="detached", placement_group=pg,
                placement_group_bundle_index=bundle_index).remote()
    ray.get(actor.ready.remote())

ray.shutdown()
    """

    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        jobs = ray.jobs()
        for job in jobs:
            if "StopTime" in job:
                return True
        return False

    def assert_alive_num_pg(expected_num_pg):
        alive_num_pg = 0
        for _, placement_group_info in ray.util.placement_group_table().items(
        ):
            if placement_group_info["state"] == "CREATED":
                alive_num_pg += 1
        return alive_num_pg == expected_num_pg

    def assert_alive_num_actor(expected_num_actor):
        alive_num_actor = 0
        for actor_info in ray.actors().values():
            if actor_info["State"] == ray.gcs_utils.ActorTableData.ALIVE:
                alive_num_actor += 1
        return alive_num_actor == expected_num_actor

    wait_for_condition(is_job_done)

    assert assert_alive_num_pg(1)
    assert assert_alive_num_actor(2)

    # Make sure the detached placement group stays alive when its creator,
    # which is a detached actor, is dead.
    # Test actors first.
    @ray.remote(num_cpus=1)
    class NestedActor:
        def ready(self):
            return True

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            self.actors = []

        def ready(self):
            return True

        def schedule_nested_actor_with_detached_pg(self):
            # Create placement group which is detached.
            pg = ray.util.placement_group([{
                "CPU": 1
            } for _ in range(2)],
                                          strategy="STRICT_SPREAD",
                                          lifetime="detached",
                                          name="detached_pg")
            ray.get(pg.ready())
            # Schedule nested actor with the placement group.
            for bundle_index in range(2):
                actor = NestedActor.options(
                    placement_group=pg,
                    placement_group_bundle_index=bundle_index,
                    lifetime="detached").remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

    a = Actor.options(lifetime="detached").remote()
    ray.get(a.ready.remote())
    # 1 parent actor and 2 child actors.
    ray.get(a.schedule_nested_actor_with_detached_pg.remote())

    # Kill an actor and wait until it is killed.
    ray.kill(a)
    try:
        ray.get(a.ready.remote())
    except ray.exceptions.RayActorError:
        pass

    # We should have 2 alive pgs and 4 alive actors.
    assert assert_alive_num_pg(2)
    assert assert_alive_num_actor(4)
Example #29
def test_threaded_actor_creation_and_kill(ray_start_cluster):
    """Test the scenario where the threaded actors are created and killed."""
    cluster = ray_start_cluster
    NUM_CPUS_PER_NODE = 3
    NUM_NODES = 2
    for _ in range(NUM_NODES):
        cluster.add_node(num_cpus=NUM_CPUS_PER_NODE)
    ray.init(address=cluster.address)

    @ray.remote(num_cpus=0)
    class ThreadedActor:
        def __init__(self):
            self.received = []
            self.lock = threading.Lock()

        def add(self, seqno):
            time.sleep(1)
            with self.lock:
                self.received.append(seqno)

        def get_all(self):
            with self.lock:
                return self.received

        def ready(self):
            pass

        def terminate(self):
            ray.actor.exit_actor()

    # - Create threaded actors
    # - Submit many tasks.
    # - Ungracefully kill them in the middle.
    for _ in range(10):
        actors = [
            ThreadedActor.options(max_concurrency=10).remote()
            for _ in range(NUM_NODES * NUM_CPUS_PER_NODE)
        ]
        ray.get([actor.ready.remote() for actor in actors])

        for _ in range(10):
            for actor in actors:
                actor.add.remote(1)
        time.sleep(0.5)
        for actor in actors:
            ray.kill(actor)
    ensure_cpu_returned(NUM_NODES * NUM_CPUS_PER_NODE)

    # - Create threaded actors
    # - Submit many tasks.
    # - Gracefully kill them in the middle.
    for _ in range(10):
        actors = [
            ThreadedActor.options(max_concurrency=10).remote()
            for _ in range(NUM_NODES * NUM_CPUS_PER_NODE)
        ]
        ray.get([actor.ready.remote() for actor in actors])
        for _ in range(10):
            for actor in actors:
                actor.add.remote(1)

        time.sleep(0.5)
        for actor in actors:
            actor.terminate.remote()
    ensure_cpu_returned(NUM_NODES * NUM_CPUS_PER_NODE)
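
The test above exercises both termination paths. A minimal sketch of the difference, using a stand-in Worker class (an assumption, not taken from the test):

import ray

ray.init()

@ray.remote
class Worker:
    def work(self):
        return "done"

    def terminate(self):
        # Exit from inside the actor; normal teardown (atexit handlers) runs.
        ray.actor.exit_actor()

# Ungraceful: ray.kill() stops the actor process immediately and skips
# atexit handlers; by default the actor is not restarted.
w1 = Worker.remote()
ray.get(w1.work.remote())
ray.kill(w1)

# Graceful: the actor exits itself after handling the terminate() call.
w2 = Worker.remote()
ray.get(w2.work.remote())
w2.terminate.remote()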
Example #30
def _kill_routers(client):
    routers = ray.get(client._controller.get_routers.remote())
    for router in routers.values():
        ray.kill(router, no_restart=False)