Example No. 1
def test_run_runtime_env(ray_start_stop):
    """Test `serve run` with runtime_env passed in."""

    # With import path
    p = subprocess.Popen([
        "serve",
        "run",
        "--address=auto",
        "ray.serve.tests.test_cli.metal_detector_node",
        "--runtime-env-json",
        ('{"env_vars": {"buried_item": "lucky coin"} }'),
    ])
    wait_for_condition(lambda: ping_endpoint("MetalDetector") == "lucky coin",
                       timeout=10)
    p.send_signal(signal.SIGINT)
    p.wait()

    # With config
    p = subprocess.Popen([
        "serve",
        "run",
        "--address=auto",
        os.path.join(
            os.path.dirname(__file__),
            "test_config_files",
            "missing_runtime_env.yaml",
        ),
        "--runtime-env-json",
        ('{"py_modules": ["https://github.com/ray-project/test_deploy_group'
         '/archive/67971777e225600720f91f618cdfe71fc47f60ee.zip"],'
         '"working_dir": "http://nonexistentlink-q490123950ni34t"}'),
        "--working-dir",
        ("https://github.com/ray-project/test_dag/archive/"
         "76a741f6de31df78411b1f302071cde46f098418.zip"),
    ])
    wait_for_condition(lambda: ping_endpoint("") == "wonderful world",
                       timeout=15)
    p.send_signal(signal.SIGINT)
    p.wait()
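
All of the examples on this page use the test helper wait_for_condition from ray._private.test_utils. As a rough, hedged sketch only (the real helper lives in Ray's test utilities and its exact defaults may differ), it polls a predicate, forwarding any extra keyword arguments, and raises once the timeout expires:

import time


def wait_for_condition(condition_predictor, timeout=10,
                       retry_interval_ms=100, **kwargs):
    """Poll condition_predictor(**kwargs) until it returns True.

    Raises RuntimeError if the condition is still False after `timeout` seconds.
    """
    start = time.time()
    while time.time() - start <= timeout:
        if condition_predictor(**kwargs):
            return
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("The condition wasn't met before the timeout expired.")
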
Example No. 2
def test_schedule_placement_groups_at_the_same_time(connect_to_client):
    ray.init(num_cpus=4)

    with connect_to_client_or_not(connect_to_client):
        pgs = [placement_group([{"CPU": 2}]) for _ in range(6)]

        wait_pgs = {pg.ready(): pg for pg in pgs}

        def is_all_placement_group_removed():
            ready, _ = ray.wait(list(wait_pgs.keys()), timeout=0.5)
            if ready:
                ready_pg = wait_pgs[ready[0]]
                remove_placement_group(ready_pg)
                del wait_pgs[ready[0]]

            return len(wait_pgs) == 0

        wait_for_condition(is_all_placement_group_removed)

    ray.shutdown()
Example No. 3
def test_task_summary(ray_start_cluster):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=2)
    ray.init(address=cluster.address)
    cluster.add_node(num_cpus=2)

    @ray.remote
    def run_long_time_task():
        time.sleep(30)
        return True

    @ray.remote
    def task_wait_for_dep(dep):
        print(dep)

    a = task_wait_for_dep.remote(run_long_time_task.remote())  # noqa
    b = task_wait_for_dep.remote(run_long_time_task.remote())  # noqa

    def verify():
        # task_name -> states
        task_summary = summarize_tasks()
        task_summary = task_summary["cluster"]["summary"]
        assert "task_wait_for_dep" in task_summary
        assert "run_long_time_task" in task_summary
        assert (task_summary["task_wait_for_dep"]["state_counts"]
                ["WAITING_FOR_DEPENDENCIES"] == 2)
        assert task_summary["run_long_time_task"]["state_counts"][
            "RUNNING"] == 2
        assert task_summary["task_wait_for_dep"]["type"] == "NORMAL_TASK"
        return True

    wait_for_condition(verify)
    """
    Test CLI
    """
    runner = CliRunner()
    result = runner.invoke(summary_state_cli_group, ["tasks"])
    assert "task_wait_for_dep" in result.output
    assert result.exit_code == 0
Example No. 4
def test_e2e_basic_scale_up_down(serve_instance):
    """Send 100 requests and check that we autoscale up, and then back down."""

    signal = SignalActor.remote()

    @serve.deployment(
        _autoscaling_config={
            "metrics_interval_s": 0.1,
            "min_replicas": 1,
            "max_replicas": 2,
            "look_back_period_s": 0.2,
            "downscale_delay_s": 0,
            "upscale_delay_s": 0
        },
        # We will send over a lot of queries. This will make sure replicas are
        # killed quickly during cleanup.
        _graceful_shutdown_timeout_s=1,
        max_concurrent_queries=1000,
        version="v1")
    class A:
        def __call__(self):
            ray.get(signal.wait.remote())

    A.deploy()

    controller = serve_instance._controller
    start_time = get_deployment_start_time(controller, A)

    handle = A.get_handle()
    [handle.remote() for _ in range(100)]

    wait_for_condition(lambda: get_num_running_replicas(controller, A) >= 2)
    signal.send.remote()

    # As the queue is drained, we should scale back down.
    wait_for_condition(lambda: get_num_running_replicas(controller, A) <= 1)

    # Make sure start time did not change for the deployment
    assert get_deployment_start_time(controller, A) == start_time
Example No. 5
def test_job_gc(call_ray_start):
    address = call_ray_start

    ray.init(address=address)
    driver = """
import ray

ray.init(address="{}")

@ray.remote
class Actor:
    def __init__(self):
        pass

_ = Actor.remote()
""".format(address)

    p = run_string_as_driver_nonblocking(driver)
    # Wait for actor to be created
    wait_for_num_actors(1)

    actor_table = ray.state.actors()
    assert len(actor_table) == 1

    job_table = ray.state.jobs()
    assert len(job_table) == 2  # dash

    # Kill the driver process.
    p.kill()
    p.wait()

    def actor_finish():
        actor_table = ray.state.actors()
        return len(actor_table) == 0

    wait_for_condition(actor_finish)
Example No. 6
def test_get_node_info_after_raylet_died(ray_start_cluster_head):
    cluster = ray_start_cluster_head

    def get_node_info():
        return ray._private.services.get_node_to_connect_for_driver(
            cluster.redis_address,
            cluster.gcs_address,
            cluster.head_node.node_ip_address,
            redis_password=cluster.redis_password,
        )

    assert (get_node_info().raylet_socket_name ==
            cluster.head_node.raylet_socket_name)

    cluster.head_node.kill_raylet()
    wait_for_condition(
        lambda: not cluster.global_state.node_table()[0]["Alive"], timeout=30)
    with pytest.raises(RuntimeError):
        get_node_info()

    node2 = cluster.add_node()
    assert get_node_info().raylet_socket_name == node2.raylet_socket_name
Example No. 7
def test_plugin_hang(ray_start_regular):
    env_key = MyPluginForHang.env_key

    @ray.remote(num_cpus=0.1)
    def f():
        return os.environ[env_key]

    refs = [
        f.options(
            # Avoid hitting the cache of runtime_env
            runtime_env={
                "plugins": {
                    MY_PLUGIN_FOR_HANG_CLASS_PATH: {
                        "name": "f1"
                    }
                }
            }).remote(),
        f.options(runtime_env={
            "plugins": {
                MY_PLUGIN_FOR_HANG_CLASS_PATH: {
                    "name": "f2"
                }
            }
        }).remote(),
    ]

    def condition():
        for ref in refs:
            try:
                res = ray.get(ref, timeout=1)
                print("result:", res)
                assert int(res) == 2
                return True
            except Exception as error:
                print(f"Got error: {error}")
                pass
        return False

    wait_for_condition(condition, timeout=60)
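
Note how condition() above catches exceptions from ray.get and returns False, so wait_for_condition simply keeps retrying until the work finishes or the overall timeout expires. A minimal, hedged version of that shape (the names below are illustrative, not part of the Ray test suite):

def result_is_ready(ref):
    # Illustrative only: treat "not ready yet" errors as "keep waiting".
    try:
        ray.get(ref, timeout=1)
        return True
    except Exception:
        return False


wait_for_condition(result_is_ready, ref=refs[0], timeout=60)
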
Example No. 8
def _test_task_and_actor(capsys):
    @ray.remote
    def f():
        pass

    with pytest.raises(RuntimeEnvSetupError):
        ray.get(f.options(runtime_env={"pip": ["requests"]}).remote())

    def stderr_checker():
        captured = capsys.readouterr()
        return "ray[default]" in captured.err

    wait_for_condition(stderr_checker)

    @ray.remote
    class A:
        def task(self):
            pass

    A.options(runtime_env={"pip": ["requests"]}).remote()

    wait_for_condition(stderr_checker)
Example No. 9
    def test_subprocess_exception(self, job_manager):
        """
        Run a Python script that raises an exception and ensure:
        1) Job status is marked as failed
        2) Job manager can surface the exception message back to the logs API
        3) No hanging job supervisor actor is left behind
        4) Empty logs
        """
        run_cmd = f"python {_driver_script_path('script_with_exception.py')}"
        job_id = job_manager.submit_job(entrypoint=run_cmd)

        def cleaned_up():
            status = job_manager.get_job_status(job_id)
            if status.status != JobStatus.FAILED:
                return False
            if ("Exception: Script failed with exception !"
                    not in status.message):
                return False

            return job_manager._get_actor_for_job(job_id) is None

        wait_for_condition(cleaned_up)
Example No. 10
    def test_pass_metadata(self, job_manager):
        def dict_to_binary(d):
            return str(dict(sorted(d.items()))).encode("utf-8")

        print_metadata_cmd = (
            "python -c\""
            "import ray;"
            "ray.init();"
            "job_config=ray.worker.global_worker.core_worker.get_job_config();"
            "print(dict(sorted(job_config.metadata.items())))"
            "\"")

        # Check that we default to only the job ID.
        job_id = job_manager.submit_job(print_metadata_cmd)

        wait_for_condition(check_job_succeeded,
                           job_manager=job_manager,
                           job_id=job_id)
        assert job_manager.get_job_stdout(job_id) == dict_to_binary(
            {JOB_ID_METADATA_KEY: job_id})

        # Check that we can pass custom metadata.
        job_id = job_manager.submit_job(print_metadata_cmd,
                                        metadata={
                                            "key1": "val1",
                                            "key2": "val2"
                                        })

        wait_for_condition(check_job_succeeded,
                           job_manager=job_manager,
                           job_id=job_id)
        assert job_manager.get_job_stdout(job_id) == dict_to_binary({
            JOB_ID_METADATA_KEY: job_id,
            "key1": "val1",
            "key2": "val2",
        })
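
The two calls above forward job_manager and job_id straight through to check_job_succeeded, which lets one checker be shared across many waits. A hedged sketch of what such a checker might look like (the real helper lives in the Ray test suite and may differ):

def check_job_succeeded(job_manager, job_id):
    # True only once the job has reached the SUCCEEDED terminal state.
    return job_manager.get_job_status(job_id).status == JobStatus.SUCCEEDED
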
Example No. 11
def test_updating_status_message(lower_slow_startup_threshold_and_reset):
    """Check if status message says if a serve deployment has taken a long time"""

    client = lower_slow_startup_threshold_and_reset

    @serve.deployment(
        num_replicas=5,
        ray_actor_options={"num_cpus": 1},
    )
    def f(*args):
        pass

    f.deploy(_blocking=False)

    def updating_message():
        deployment_status = client.get_serve_status().deployment_statuses[0]
        message_substring = "more than 1s to be scheduled."
        return (deployment_status.status == "UPDATING"
                and message_substring in deployment_status.message)

    wait_for_condition(updating_message, timeout=20)
Example No. 12
def test_user_logs(serve_instance):
    logger = logging.getLogger("ray.serve")
    msg = "user log message"
    name = "user_fn"

    @serve.deployment(name=name)
    def fn(*args):
        logger.info("user log message")
        return serve.get_replica_context().replica_tag

    fn.deploy()
    handle = fn.get_handle()

    f = io.StringIO()
    with redirect_stderr(f):

        def check_log(replica_tag: str):
            s = f.getvalue()
            return all([name in s, replica_tag in s, msg in s])

        replica_tag = ray.get(handle.remote())
        wait_for_condition(check_log, replica_tag=replica_tag)
Example No. 13
def test_heartbeat_ip(shutdown_only):
    cluster = ray.init(num_cpus=1,
                       _system_config={
                           "report_worker_backlog": True,
                       })
    global_state_accessor = GlobalStateAccessor(
        cluster["redis_address"], ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    global_state_accessor.connect()

    self_ip = ray.util.get_node_ip_address()

    def self_ip_is_set():
        message = global_state_accessor.get_all_resource_usage()
        if message is None:
            return False

        resource_usage = gcs_utils.ResourceUsageBatchData.FromString(message)
        resources_data = resource_usage.batch[0]
        return resources_data.node_manager_address == self_ip

    wait_for_condition(self_ip_is_set, timeout=2)
    global_state_accessor.disconnect()
Example No. 14
def test_no_user_defined_method(serve_instance, use_class):
    """Check the default behavior when an actor crashes."""

    if use_class:

        @serve.deployment
        class A:
            def __call__(self, *args):
                return ray.get_runtime_context().current_actor

    else:

        @serve.deployment
        def A(*args):
            return ray.get_runtime_context().current_actor

    h = serve.run(A.bind())
    actor = ray.get(h.remote())
    ray.kill(actor)

    # This would time out if we wait for multiple health check failures.
    wait_for_condition(check_new_actor_started, handle=h, original_actors=actor)
Example No. 15
def test_actor_scheduling_not_block_with_placement_group(ray_start_cluster):
    """Tests the scheduling of lots of actors will not be blocked
       when using placement groups.

       For more detailed information please refer to:
       https://github.com/ray-project/ray/issues/15801.
    """

    cluster = ray_start_cluster
    cluster.add_node(num_cpus=1)
    ray.init(address=cluster.address)

    @ray.remote(num_cpus=1)
    class A:
        def ready(self):
            pass

    actor_num = 1000
    pgs = [ray.util.placement_group([{"CPU": 1}]) for _ in range(actor_num)]
    actors = [A.options(placement_group=pg).remote() for pg in pgs]
    refs = [actor.ready.remote() for actor in actors]

    expected_created_num = 1

    def is_actor_created_number_correct():
        ready, not_ready = ray.wait(refs, num_returns=len(refs), timeout=1)
        return len(ready) == expected_created_num

    def is_pg_created_number_correct():
        created_pgs = [
            pg for _, pg in ray.util.placement_group_table().items()
            if pg["state"] == "CREATED"
        ]
        return len(created_pgs) == expected_created_num

    wait_for_condition(is_pg_created_number_correct, timeout=3)
    wait_for_condition(
        is_actor_created_number_correct, timeout=30, retry_interval_ms=0)

    # NOTE: we don't need to check that all the actors are created successfully.
    for _ in range(20):
        expected_created_num += 1
        cluster.add_node(num_cpus=1)

        wait_for_condition(is_pg_created_number_correct, timeout=10)
        # Make sure the node add event will cause a waiting actor
        # to create successfully in time.
        wait_for_condition(
            is_actor_created_number_correct, timeout=30, retry_interval_ms=0)
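
Two details above are worth calling out: the predicates let ray.wait do the blocking (timeout=1), and retry_interval_ms=0 removes the sleep between retries so the helper re-checks as soon as the predicate returns. When a check is short enough, it can also be written inline as a lambda; an illustrative equivalent of the actor check, assuming the same refs and expected_created_num are in scope:

wait_for_condition(
    lambda: len(ray.wait(refs, num_returns=len(refs), timeout=1)[0]) == expected_created_num,
    timeout=30,
    retry_interval_ms=0,
)
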
Example No. 16
    def test_stop_job_in_pending(self, job_manager):
        """
        Kick off a job that is in the PENDING state, stop the job, and ensure:

        1) The job is stopped immediately with the correct JobStatus
        2) No dangling subprocess is left.
        """
        start_signal_actor = SignalActor.remote()

        with tempfile.TemporaryDirectory() as tmp_dir:
            pid_file, _, job_id = _run_hanging_command(
                job_manager, tmp_dir, start_signal_actor=start_signal_actor)
            assert not os.path.exists(pid_file), (
                "driver subprocess should NOT be running while job is "
                "still PENDING.")

            assert job_manager.stop_job(job_id) is True
            # Send run signal to unblock run function
            ray.get(start_signal_actor.send.remote())
            wait_for_condition(check_job_stopped,
                               job_manager=job_manager,
                               job_id=job_id)
Example No. 17
def test_scaledown_shared_objects(shutdown_only):
    cluster = AutoscalingCluster(
        head_resources={"CPU": 0},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 1,
                    "object_store_memory": 100 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 0,
                "max_workers": 5,
            },
        },
        idle_timeout_minutes=0.05,
    )

    try:
        cluster.start(
            _system_config={"scheduler_report_pinned_bytes_only": True})
        ray.init("auto")

        actors = [Actor.remote() for _ in range(5)]
        ray.get([a.f.remote() for a in actors])
        print("All five nodes launched")

        # Verify scale-up.
        wait_for_condition(lambda: ray.cluster_resources().get("CPU", 0) == 5)

        data = actors[0].create.remote(1024 * 1024 * 5)
        ray.get([a.recv.remote(data) for a in actors])
        print("Data broadcast successfully, deleting actors.")
        del actors

        # Verify scale-down.
        wait_for_condition(lambda: ray.cluster_resources().get("CPU", 0) == 1,
                           timeout=30)
    finally:
        cluster.shutdown()
Example No. 18
    def test_kill_job_actor_in_pending(self, job_manager):
        """
        Kick off a job that is in the PENDING state, kill the job actor, and ensure:

        1) The job terminates immediately with the correct JobStatus
        2) No dangling subprocess is left.
        """
        start_signal_actor = SignalActor.remote()

        with tempfile.TemporaryDirectory() as tmp_dir:
            pid_file, _, job_id = _run_hanging_command(
                job_manager, tmp_dir, start_signal_actor=start_signal_actor)

            assert not os.path.exists(pid_file), (
                "driver subprocess should NOT be running while job is "
                "still PENDING.")

            actor = job_manager._get_actor_for_job(job_id)
            ray.kill(actor, no_restart=True)
            wait_for_condition(check_job_failed,
                               job_manager=job_manager,
                               job_id=job_id)
Example No. 19
    def test_job_level_gc(self, runtime_env_disable_URI_cache, start_cluster,
                          field, spec_format, tmp_path):
        """Tests that job-level conda env is GC'd when the job exits."""
        # We must use a single-node cluster.  If we simulate a multi-node
        # cluster then the conda installs will proceed simultaneously, one on
        # each node, but since they're actually running on the same machine we
        # get errors.
        cluster, address = start_cluster

        ray.init(address,
                 runtime_env=generate_runtime_env_dict(field, spec_format,
                                                       tmp_path))

        @ray.remote
        def f():
            import pip_install_test  # noqa: F401

            return True

        # Ensure that the runtime env has been installed.
        assert ray.get(f.remote())
        # Sleep some seconds before checking that we didn't GC. Otherwise this
        # check may spuriously pass.
        time.sleep(2)
        assert not check_local_files_gced(cluster)

        ray.shutdown()

        wait_for_condition(lambda: check_local_files_gced(cluster), timeout=30)

        # Check that we can reconnect with the same env.  (In other words, ensure
        # the conda env was fully deleted and not left in some kind of corrupted
        # state that prevents reinstalling the same conda env.)

        ray.init(address,
                 runtime_env=generate_runtime_env_dict(field, spec_format,
                                                       tmp_path))

        assert ray.get(f.remote())
Example No. 20
    def test_stop_job_subprocess_cleanup_upon_stop(self, job_manager):
        """
        Ensure the driver script's subprocess is cleaned up properly when we
        stop a running job.

        SIGTERM first, SIGKILL after 3 seconds.
        """
        with tempfile.TemporaryDirectory() as tmp_dir:
            pid_file, _, job_id = _run_hanging_command(job_manager, tmp_dir)
            with open(pid_file, "r") as file:
                pid = int(file.read())
                assert psutil.pid_exists(pid), (
                    "driver subprocess should be running")

            assert job_manager.stop_job(job_id) is True
            wait_for_condition(check_job_stopped,
                               job_manager=job_manager,
                               job_id=job_id)

            # Ensure driver subprocess gets cleaned up after job reached
            # termination state
            wait_for_condition(check_subprocess_cleaned, pid=pid)
Example No. 21
def test_delete_actor(ray_start_regular):
    with ray_start_client_server_pair() as pair:
        ray, server = pair

        @ray.remote
        class Accumulator:
            def __init__(self):
                self.acc = 0

            def inc(self):
                self.acc += 1

        actor = Accumulator.remote()
        actor.inc.remote()
        actor2 = Accumulator.remote()
        actor2.inc.remote()

        assert server_actor_ref_count(server, 2)()

        del actor

        wait_for_condition(server_actor_ref_count(server, 1), timeout=5)
Example No. 22
    async def test_failed_job(self, job_manager):
        """Test tailing logs for a job that unexpectedly exits."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            pid_file, _, job_id = _run_hanging_command(job_manager, tmp_dir)

            await self._tail_and_assert_logs(job_id,
                                             job_manager,
                                             expected_log="Waiting...",
                                             num_iteration=5)

            # Kill the job unexpectedly.
            with open(pid_file, "r") as f:
                os.kill(int(f.read()), signal.SIGKILL)

            async for lines in job_manager.tail_job_logs(job_id):
                assert all(s == "Waiting..."
                           for s in lines.strip().split("\n"))
                print(lines, end="")

            wait_for_condition(check_job_failed,
                               job_manager=job_manager,
                               job_id=job_id)
Example No. 23
def test_node_physical_stats(enable_test_module, shutdown_only):
    addresses = ray.init(include_dashboard=True, num_cpus=6)

    @ray.remote(num_cpus=1)
    class Actor:
        def getpid(self):
            return os.getpid()

    actors = [Actor.remote() for _ in range(6)]
    actor_pids = ray.get([actor.getpid.remote() for actor in actors])
    actor_pids = set(actor_pids)

    webui_url = addresses["webui_url"]
    assert wait_until_server_available(webui_url) is True
    webui_url = format_web_url(webui_url)

    def _check_workers():
        try:
            resp = requests.get(webui_url + "/test/dump?key=node_physical_stats")
            resp.raise_for_status()
            result = resp.json()
            assert result["result"] is True
            node_physical_stats = result["data"]["nodePhysicalStats"]
            assert len(node_physical_stats) == 1
            current_stats = node_physical_stats[addresses["node_id"]]
            # Check Actor workers
            current_actor_pids = set()
            for worker in current_stats["workers"]:
                if "ray::Actor" in worker["cmdline"][0]:
                    current_actor_pids.add(worker["pid"])
            assert current_actor_pids == actor_pids
            # Check raylet cmdline
            assert "raylet" in current_stats["cmdline"][0]
            return True
        except Exception as ex:
            logger.info(ex)
            return False

    wait_for_condition(_check_workers, timeout=10)
Example No. 24
def test_pass_returned_object_ref(one_worker_100MiB, use_ray_put, failure):
    @ray.remote
    def return_an_id():
        return [
            put_object(np.zeros(20 * 1024 * 1024, dtype=np.uint8), use_ray_put)
        ]

    # TODO(edoakes): this fails with an ActorError with max_retries=1.
    @ray.remote(max_retries=0)
    def pending(ref, signal):
        ray.get(signal.wait.remote())
        ray.get(ref[0])
        if failure:
            os._exit(0)

    signal = SignalActor.remote()
    outer_oid = return_an_id.remote()
    inner_oid_binary = ray.get(outer_oid)[0].binary()
    pending_oid = pending.remote([outer_oid], signal)

    # Remove the local reference to the returned ID.
    del outer_oid

    # Check that the inner ID is pinned by the remote task ID and finishing
    # the task unpins the object.
    ray.get(signal.send.remote())
    try:
        # Should succeed because inner_oid is pinned if no failure.
        ray.get(pending_oid)
        assert not failure
    except ray.exceptions.WorkerCrashedError:
        assert failure

    def ref_not_exists():
        worker = ray.worker.global_worker
        inner_oid = ray.ObjectRef(inner_oid_binary)
        return not worker.core_worker.object_exists(inner_oid)

    wait_for_condition(ref_not_exists)
Example No. 25
def test_stop_long_running_job(job_sdk_client):
    """
    Submit a job that runs for a while and stop it in the middle.
    """
    client = job_sdk_client

    with tempfile.TemporaryDirectory() as tmp_dir:
        path = Path(tmp_dir)
        driver_script = """
print('Hello !')
import time
time.sleep(300) # This should never finish
raise RuntimeError('Intentionally failed.')
        """
        test_script_file = path / "test_script.py"
        with open(test_script_file, "w+") as file:
            file.write(driver_script)

        job_id = client.submit_job(entrypoint="python test_script.py",
                                   runtime_env={"working_dir": tmp_dir})
        assert client.stop_job(job_id) is True
        wait_for_condition(_check_job_stopped, client=client, job_id=job_id)
Example No. 26
def test_usage_file_error_message(monkeypatch, ray_start_cluster, reset_lib_usage):
    """
    Make sure the usage report file is generated with a proper
    error message when the report fails.
    """
    with monkeypatch.context() as m:
        m.setenv("RAY_USAGE_STATS_ENABLED", "1")
        m.setenv("RAY_USAGE_STATS_REPORT_URL", "http://127.0.0.1:8000")
        m.setenv("RAY_USAGE_STATS_REPORT_INTERVAL_S", "1")
        cluster = ray_start_cluster
        cluster.add_node(num_cpus=0)
        ray.init(address=cluster.address)

        global_node = ray._private.worker._global_node
        temp_dir = pathlib.Path(global_node.get_session_dir_path())
        try:
            wait_for_condition(lambda: file_exists(temp_dir), timeout=30)
        except Exception:
            print_dashboard_log()
            raise

        error_message = read_file(temp_dir, "error")
        failure_old = read_file(temp_dir, "usage_stats")["total_failed"]
        report_success = read_file(temp_dir, "success")
        # Test if the timestamp has been updated.
        assert (
            "HTTPConnectionPool(host='127.0.0.1', port=8000): "
            "Max retries exceeded with url:"
        ) in error_message
        assert not report_success
        try:
            wait_for_condition(
                lambda: failure_old < read_file(temp_dir, "usage_stats")["total_failed"]
            )
        except Exception:
            print_dashboard_log()
            read_file(temp_dir, "usage_stats")["total_failed"]
            raise
        assert read_file(temp_dir, "usage_stats")["total_success"] == 0
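
The try/except blocks above follow a common pattern in these tests: dump extra diagnostics when a wait times out, then re-raise so the test still fails with the original error (the helper itself raises on timeout). Reduced to its shape (both function names here are placeholders, not Ray APIs):

def wait_or_dump(condition, dump_diagnostics, **kwargs):
    # Illustrative wrapper: surface extra context if the wait times out.
    try:
        wait_for_condition(condition, **kwargs)
    except Exception:
        dump_diagnostics()
        raise
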
Example No. 27
def test_replica_spread(ray_cluster):
    cluster = ray_cluster

    cluster.add_node(num_cpus=2)

    # NOTE(edoakes): we need to start serve before adding the worker node to
    # guarantee that the controller is placed on the head node (we should be
    # able to tolerate being placed on workers, but there's currently a bug).
    # We should add an explicit test for that in the future when it's fixed.
    cluster.connect(namespace=SERVE_NAMESPACE)
    serve.start(detached=True)

    worker_node = cluster.add_node(num_cpus=2)

    @serve.deployment(num_replicas=2)
    def get_node_id():
        return os.getpid(), ray.get_runtime_context().node_id.hex()

    h = serve.run(get_node_id.bind())

    def get_num_nodes():
        pids = set()
        node_ids = set()
        while len(pids) < 2:
            pid, node = ray.get(h.remote())
            pids.add(pid)
            node_ids.add(node)

        return len(node_ids)

    # Check that the two replicas are spread across the two nodes.
    wait_for_condition(lambda: get_num_nodes() == 2)

    # Kill the worker node. The second replica should get rescheduled on
    # the head node.
    cluster.remove_node(worker_node)

    # Check that the replica on the dead node can be rescheduled.
    wait_for_condition(lambda: get_num_nodes() == 1)
Example No. 28
def test_delete_objects_delete_while_creating(object_spilling_config,
                                              shutdown_only):
    # Limit our object store to 75 MiB of memory.
    object_spilling_config, temp_folder = object_spilling_config

    address = ray.init(
        object_store_memory=75 * 1024 * 1024,
        _system_config={
            "max_io_workers": 4,
            "min_spilling_size": 0,
            "automatic_object_spilling_enabled": True,
            "object_store_full_delay_ms": 100,
            "object_spilling_config": object_spilling_config,
        },
    )
    arr = np.random.rand(1024 * 1024)  # 8 MB data
    replay_buffer = []

    for _ in range(80):
        ref = None
        while ref is None:
            ref = ray.put(arr)
            replay_buffer.append(ref)
        # Remove the replay buffer with 60% probability.
        if random.randint(0, 9) < 6:
            replay_buffer.pop()

    # Do random sampling.
    for _ in range(200):
        ref = random.choice(replay_buffer)
        sample = ray.get(ref, timeout=0)
        assert np.array_equal(sample, arr)

    # Finally, make sure all objects are deleted without a race condition.
    del replay_buffer
    del ref
    wait_for_condition(lambda: is_dir_empty(temp_folder))
    assert_no_thrashing(address["address"])
Example No. 29
def test_cached_object(ray_start_cluster):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_period_milliseconds": 100,
        "object_timeout_milliseconds": 200,
    }
    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
    cluster.add_node(
        num_cpus=1, resources={"node2": 1}, object_store_memory=10**8)
    cluster.wait_for_nodes()

    @ray.remote
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x):
        return

    obj = large_object.options(resources={"node1": 1}).remote()
    ray.get(dependent_task.options(resources={"node2": 1}).remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
    wait_for_condition(
        lambda: not all(node["Alive"] for node in ray.nodes()), timeout=10)

    for _ in range(20):
        large_object.options(resources={"node2": 1}).remote()

    ray.get(dependent_task.remote(obj))
Example No. 30
def test_idempotence_after_controller_death(ray_start_stop, use_command: bool):
    """Check that CLI is idempotent even if controller dies."""

    config_file_name = os.path.join(os.path.dirname(__file__),
                                    "test_config_files", "basic_graph.yaml")
    success_message_fragment = b"Sent deploy request successfully!"
    deploy_response = subprocess.check_output(
        ["serve", "deploy", config_file_name])
    assert success_message_fragment in deploy_response

    ray.init(address="auto", namespace=SERVE_NAMESPACE)
    serve.start(detached=True)
    wait_for_condition(
        lambda: len(ray.util.list_named_actors(all_namespaces=True)) == 4,
        timeout=15)

    # Kill controller
    if use_command:
        subprocess.check_output(["serve", "shutdown", "-y"])
    else:
        serve.shutdown()

    status_response = subprocess.check_output(["serve", "status"])
    status_info = yaml.safe_load(status_response)

    assert len(status_info["deployment_statuses"]) == 0

    deploy_response = subprocess.check_output(
        ["serve", "deploy", config_file_name])
    assert success_message_fragment in deploy_response

    # Restore testing controller
    serve.start(detached=True)
    wait_for_condition(
        lambda: len(ray.util.list_named_actors(all_namespaces=True)) == 4,
        timeout=15)
    serve.shutdown()
    ray.shutdown()