Exemplo n.º 1
0
def test_dedupe_cluster_failure(tmp_path):
    """Check that recovering a workflow does not upload its inputs again.

    Scenario:

    ======== driver 1 ===========
    1. Checkpoint the input args
        * Uploads
    2. Begin to run step
        * Crash

    ====== driver 2 ============
    1. Recover inputs
        * Creates a new object ref
    2. Finish running step
    3. Checkpoint step output
        * Should not trigger upload

    Args:
        tmp_path: Per-test temporary directory; holds the lock file and the
            workflow storage directory.
    """
    ray.shutdown()
    lock_file = tmp_path / "lock"
    workflow_dir = tmp_path / "workflow"

    driver_script = f"""
import time
import ray
from ray import workflow
from filelock import FileLock

@ray.remote
def foo(objrefs):
    with FileLock("{str(lock_file)}"):
        return objrefs

if __name__ == "__main__":
    ray.init(storage="{str(workflow_dir)}")
    workflow.init()
    arg = ray.put("hello world")

    workflow.create(foo.bind([arg, arg])).run()
    assert False
    """

    # Hold the lock so the step in driver 1 blocks inside `foo` until the
    # cluster has been torn down underneath it.
    lock = FileLock(lock_file)
    lock.acquire()
    try:
        run_string_as_driver_nonblocking(driver_script)

        # Give driver 1 time to start and reach the blocked step.
        time.sleep(10)

        # Simulate the cluster failure ("Crash" in the scenario above).
        subprocess.check_call(["ray", "stop", "--force"])
    finally:
        # Always release the lock, even if stopping the cluster failed, so
        # that nothing stays blocked on it.
        lock.release()

    # Driver 2: resume the interrupted workflow from the same storage.
    ray.init(storage=str(workflow_dir))
    workflow.init()
    resumed = workflow.resume_all()
    assert len(resumed) == 1
    objref = resumed.pop()[1]
    ray.get(objref)

    # Recovery creates a new object ref for the same input, but per the
    # scenario above this must not cause another upload: exactly one upload
    # is expected in total.
    assert get_num_uploads() == 1
    ray.shutdown()
Exemplo n.º 2
0
def test_spill_logs():
    """Check that spill messages are logged only when verbose spill logs are on.

    The same driver is run twice, once with ``RAY_verbose_spill_logs`` set to
    ``"1"`` and once set to ``"0"``; its combined stdout/stderr is inspected
    for the ``"Spilled "`` marker.
    """
    driver_code = """
import ray
import numpy as np

ray.init(object_store_memory=200e6)

x = []

for _ in range(10):
    x.append(ray.put(np.ones(100 * 1024 * 1024, dtype=np.uint8)))
"""

    def _run_and_collect(verbose_flag):
        """Run the driver with the given flag and return stdout + stderr."""
        driver = run_string_as_driver_nonblocking(
            driver_code, env={"RAY_verbose_spill_logs": verbose_flag})
        stdout_text = driver.stdout.read().decode("ascii")
        stderr_text = driver.stderr.read().decode("ascii")
        combined = stdout_text + stderr_text
        print(combined)
        return combined

    assert "Spilled " in _run_and_collect("1")
    assert "Spilled " not in _run_and_collect("0")
Exemplo n.º 3
0
def test_cluster_rllib_restore(start_connected_cluster, tmpdir):
    """Check that a "PG" Tune experiment can be resumed on a fresh cluster.

    A separate driver starts the experiment against the running cluster. Once
    a checkpoint with at least one training iteration exists, the cluster is
    shut down, a new one is started, and the experiment is resumed from the
    local checkpoint directory.

    Args:
        start_connected_cluster: Fixture providing a running cluster.
        tmpdir: Per-test temporary directory used as Tune's ``local_dir``.

    Raises:
        RuntimeError: If no checkpoint appears while polling.
    """
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
    # NOTE: all arguments after "PG" must be keyword arguments; a positional
    # argument after keyword arguments makes the driver a SyntaxError.
    script = """
import time
import ray
from ray import tune

ray.init(address="{address}")


tune.run(
    "PG",
    name="experiment",
    config=dict(env="CartPole-v1", framework="tf"),
    stop=dict(training_iteration=10),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1,
    raise_on_failed_trial=False)
""".format(
        address=cluster.address, checkpoint_dir=dirpath)
    run_string_as_driver_nonblocking(script)
    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    for _ in range(100):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner(
                resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            # Stop polling once the first trial has reported an iteration.
            if last_res and last_res.get("training_iteration"):
                break
        time.sleep(0.3)

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    # Tear everything down and bring up a brand new cluster.
    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    cluster.wait_for_nodes()

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": "PG",
                "checkpoint_freq": 1,
                "local_dir": dirpath,
            }
        },
        resume=True)
    assert all(t.status == Trial.TERMINATED for t in trials2)
    ray.shutdown()
    cluster.shutdown()
0
def test_actor_stdout():
    """Check the prefix that actor output gets in the driver's stdout.

    ``Actor1`` has no ``__repr__`` and is expected to be labelled by its class
    name; ``Actor2`` defines ``__repr__`` returning ``"ActorX"`` and is
    expected to be labelled with that string instead.
    """
    driver_code = """
import ray

ray.init(num_cpus=2)

@ray.remote
class Actor1:
    def f(self):
        print("hi")

@ray.remote
class Actor2:
    def f(self):
        print("bye")
    def __repr__(self):
        return "ActorX"

a = Actor1.remote()
ray.get(a.f.remote())
b = Actor2.remote()
ray.get(b.f.remote())
    """

    driver = run_string_as_driver_nonblocking(driver_code)
    captured = driver.stdout.read().decode("ascii")

    # Fragments that must show up, checked in the original order.
    for fragment in ("hi", "(Actor1 pid=", "bye"):
        assert fragment in captured, captured
    # The custom repr replaces the class name in the prefix.
    assert "(Actor2 pid=" not in captured, captured
    assert "(ActorX pid=" in captured, captured
Exemplo n.º 5
0
def test_recovery_cluster_failure(tmp_path):
    """Check that a workflow can be resumed after the cluster is stopped.

    A driver starts the ``cluster_failure`` workflow on a freshly started head
    node; the cluster is stopped and the driver killed while it runs, then the
    workflow is resumed from the same storage and must return 20.
    """
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    driver_code = f"""
import time
import ray
from ray import workflow

@ray.remote
def foo(x):
    print("Executing", x)
    time.sleep(1)
    if x < 20:
        return workflow.continuation(foo.bind(x + 1))
    else:
        return 20

if __name__ == "__main__":
    ray.init(storage="{tmp_path}")
    workflow.init()
    assert workflow.create(foo.bind(0)).run(workflow_id="cluster_failure") == 20
"""
    driver_proc = run_string_as_driver_nonblocking(driver_code)
    # Let the workflow make some progress before pulling the cluster away.
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    driver_proc.kill()
    time.sleep(1)

    # Resume from the storage the killed driver was writing to.
    ray.init(storage=str(tmp_path))
    workflow.init()
    result_ref = workflow.resume("cluster_failure")
    assert ray.get(result_ref) == 20
    ray.shutdown()
Exemplo n.º 6
0
def test_runtime_env_hook(skip_hook):
    """Check that the runtime env hook is applied unless explicitly skipped.

    Args:
        skip_hook: When truthy, the driver calls
            ``ray.init(_skip_env_hook=True)`` and ``HOOK_VALUE`` must not
            appear in its output; otherwise it must appear.
    """
    if skip_hook:
        init_line = "ray.init(_skip_env_hook=True)"
    else:
        init_line = ""

    driver_code = f"""
import ray
import os

{init_line}

@ray.remote
def f():
    return os.environ.get("HOOK_KEY")

print(ray.get(f.remote()))
"""

    driver = run_string_as_driver_nonblocking(
        driver_code,
        env={"RAY_RUNTIME_ENV_HOOK": "ray.tests.test_output._hook"})
    stdout_text = driver.stdout.read().decode("ascii")
    stderr_text = driver.stderr.read().decode("ascii")
    combined = stdout_text + stderr_text
    print(combined)
    if not skip_hook:
        assert "HOOK_VALUE" in combined
    else:
        assert "HOOK_VALUE" not in combined
Exemplo n.º 7
0
def test_multi_stdout():
    """Check that output from several tasks is prefixed with each task name.

    Three remote functions each print an empty line; the driver's stdout must
    contain a ``(<name> pid=`` prefix for every one of them.
    """
    driver_code = """
import ray
import sys

ray.init(num_cpus=1)

@ray.remote
def foo():
    print()

@ray.remote
def bar():
    print()

@ray.remote
def baz():
    print()

ray.get(foo.remote())
ray.get(bar.remote())
ray.get(baz.remote())
    """

    driver = run_string_as_driver_nonblocking(driver_code)
    captured = driver.stdout.read().decode("ascii")

    for prefix in ("(foo pid=", "(bar pid=", "(baz pid="):
        assert prefix in captured, captured
Exemplo n.º 8
0
def test_chained_workflow_logs(workflow_start_regular):
    """Check the log lines emitted when running a two-step chained workflow.

    The driver runs ``f2(f1())`` as workflow ``wid1``; its combined
    stdout/stderr must contain the job-created line and RUNNING/SUCCESSFUL
    status lines for both steps.
    """
    driver_code = """
import ray
from ray import workflow

ray.init(address='auto')

@workflow.step(name="f1")
def f1():
    return 10

@workflow.step(name="f2")
def f2(x):
    return x+1

f2.step(f1.step()).run("wid1")
    """
    driver = run_string_as_driver_nonblocking(driver_code)
    stdout_text = driver.stdout.read().decode("ascii")
    stderr_text = driver.stderr.read().decode("ascii")
    logs = stdout_text + stderr_text

    # on driver
    assert 'Workflow job created. [id="wid1"' in logs
    # # in WorkflowManagementActor's run_or_resume.remote()
    # assert "run_or_resume: wid1" in logs
    # assert "Workflow job [id=wid1] started." in logs
    # in _workflow_step_executor_remote
    step_status_lines = (
        "Step status [RUNNING]\t[wid1@f1",
        "Step status [SUCCESSFUL]\t[wid1@f1",
        "Step status [RUNNING]\t[wid1@f2",
        "Step status [SUCCESSFUL]\t[wid1@f2",
    )
    for expected in step_status_lines:
        assert expected in logs
Exemplo n.º 9
0
def test_driver_dead(shutdown_only):
    """Verify that every Ray worker process goes away once the driver is killed."""
    driver_code = """
import ray
ray.init(_system_config={"ping_gcs_rpc_server_max_retries": 1})
@ray.remote
def f():
    import time
    time.sleep(10)

num_cpus = int(ray.available_resources()["CPU"])
tasks = [f.remote() for _ in range(num_cpus)]
"""

    def _worker_count():
        return len(get_all_ray_worker_processes())

    driver_proc = run_string_as_driver_nonblocking(driver_code)
    # The driver must still be alive shortly after launch.
    time.sleep(1)
    assert driver_proc.poll() is None
    wait_for_condition(lambda: _worker_count() > 0)

    # Kill the driver and wait for it to be reaped.
    driver_proc.kill()
    driver_proc.wait()
    time.sleep(0.1)

    wait_for_condition(lambda: _worker_count() == 0)
Exemplo n.º 10
0
def test_virtual_actor_logs(workflow_start_regular):
    """Check the log lines emitted when calling a virtual actor method.

    The driver creates the virtual actor ``vid`` and runs its ``add`` method;
    the combined stdout/stderr must contain the job-created line and the
    RUNNING/SUCCESSFUL status lines for the ``add`` step.
    """
    driver_code = """
import ray
from ray import workflow

ray.init(address='auto')

@workflow.virtual_actor
class Counter:
    def __init__(self, x: int):
        self.x = x

    def add(self, y):
        self.x += y
        return self.x

couter = Counter.get_or_create("vid", 10)
couter.add.options(name="add").run(1)
    """
    driver = run_string_as_driver_nonblocking(driver_code)
    stdout_text = driver.stdout.read().decode("ascii")
    stderr_text = driver.stderr.read().decode("ascii")
    logs = stdout_text + stderr_text
    print(logs)

    # on driver
    assert 'Workflow job created. [id="vid"' in logs
    # # in WorkflowManagementActor's run_or_resume.remote()
    # assert "run_or_resume: vid" in logs
    # assert "Workflow job [id=vid] started." in logs
    # in _workflow_step_executor_remote
    for expected in (
            "Step status [RUNNING]\t[vid@add",
            "Step status [SUCCESSFUL]\t[vid@add",
    ):
        assert expected in logs
Exemplo n.º 11
0
def test_dynamic_workflow_logs(workflow_start_regular):
    """Check the log lines emitted when a step returns another step.

    The driver runs ``f4(10)`` as workflow ``wid2``, where ``f4`` returns
    ``f3.step(...)``; the combined stdout/stderr must contain the job-created
    line and RUNNING/SUCCESSFUL status lines for both ``f3`` and ``f4``.
    """
    driver_code = """
import ray
from ray import workflow

ray.init(address='auto')

@workflow.step(name="f3")
def f3(x):
    return x+1

@workflow.step(name="f4")
def f4(x):
    return f3.step(x*2)

f4.step(10).run("wid2")
    """
    driver = run_string_as_driver_nonblocking(driver_code)
    stdout_text = driver.stdout.read().decode("ascii")
    stderr_text = driver.stderr.read().decode("ascii")
    logs = stdout_text + stderr_text

    # on driver
    assert 'Workflow job created. [id="wid2"' in logs
    # # in WorkflowManagementActor's run_or_resume.remote()
    # assert "run_or_resume: wid2" in logs
    # assert "Workflow job [id=wid2] started." in logs
    # in _workflow_step_executor_remote
    step_status_lines = (
        "Step status [RUNNING]\t[wid2@f3",
        "Step status [SUCCESSFUL]\t[wid2@f3",
        "Step status [RUNNING]\t[wid2@f4",
        "Step status [SUCCESSFUL]\t[wid2@f4",
    )
    for expected in step_status_lines:
        assert expected in logs
Exemplo n.º 12
0
def test_gcs_server_crash_cluster(ray_start_cluster):
    """Check that killing the GCS server makes a connected driver exit.

    A driver connects to the cluster and sleeps. The GCS server process is
    then killed with SIGKILL, and the driver is expected to terminate within
    roughly the configured reconnect timeout instead of hanging.

    Args:
        ray_start_cluster: Fixture providing a cluster to add nodes to.
    """
    # Test the GCS server failures will crash the driver.
    cluster = ray_start_cluster
    GCS_RECONNECTION_TIMEOUT = 5
    node = cluster.add_node(
        num_cpus=0,
        _system_config={
            "gcs_rpc_server_reconnect_timeout_s": GCS_RECONNECTION_TIMEOUT
        },
    )

    script = """
import ray
import time

ray.init(address="auto")
time.sleep(60)
    """

    # Get gcs server pid to send a signal.
    all_processes = node.all_processes
    gcs_server_process = all_processes["gcs_server"][0].process
    gcs_server_pid = gcs_server_process.pid

    proc = run_string_as_driver_nonblocking(script)
    # Wait long enough to start the driver.
    time.sleep(5)
    start = time.time()
    print(gcs_server_pid)
    os.kill(gcs_server_pid, signal.SIGKILL)
    # `poll()` returns None while the process is still running, so wait until
    # it returns an exit code, i.e. until the driver has actually terminated.
    wait_for_condition(lambda: proc.poll() is not None, timeout=10)
    # Make sure the driver was exited within the timeout instead of hanging.
    # * 2 for avoiding flakiness.
    assert time.time() - start < GCS_RECONNECTION_TIMEOUT * 2
Exemplo n.º 13
0
def test_job_timestamps(ray_start_regular):
    """Check the StartTime / EndTime / Timestamp fields reported for jobs.

    Two extra drivers are launched against the running cluster: one that
    sleeps one second and exits, and one that sleeps for a minute and is later
    killed. The job table is inspected while the second driver is running and
    again after it has been killed.
    """
    driver_template = """
import ray
from time import sleep

ray.init(address="{}")

print("My job id: ", str(ray.get_runtime_context().job_id))

{}
ray.shutdown()
    """

    def _jobs_sorted_by_id():
        """Return the current job table ordered by job id."""
        return sorted(ray.state.jobs(), key=lambda job: job["JobID"])

    address = ray_start_regular["address"]
    short_driver = driver_template.format(address, "sleep(1)")
    long_driver = driver_template.format(address, "sleep(60)")

    out = run_string_as_driver(short_driver)
    hanging_proc = run_string_as_driver_nonblocking(long_driver)
    # The nonblocking process needs time to connect.
    time.sleep(1)

    jobs = _jobs_sorted_by_id()
    driver, finished, running = jobs[0], jobs[1], jobs[2]

    # The initial driver timestamp/start time go down a different code path.
    assert driver["Timestamp"] == driver["StartTime"]
    assert finished["Timestamp"] == finished["EndTime"]
    assert running["Timestamp"] == running["StartTime"]

    assert finished["EndTime"] > finished["StartTime"] > 0, out
    lapsed = finished["EndTime"] - finished["StartTime"]
    assert 0 < lapsed < 5000, f"Job should've taken ~1s, {finished}"

    assert running["StartTime"] > 0
    assert running["EndTime"] == 0

    hanging_proc.kill()
    # Give the second job time to clean itself up.
    time.sleep(1)

    jobs = _jobs_sorted_by_id()

    # jobs[0] is the test case driver.
    finished, prev_running = jobs[1], jobs[2]

    assert finished["EndTime"] > finished["StartTime"] > 0, f"{finished}"
    assert finished["EndTime"] == finished["Timestamp"]
    lapsed = finished["EndTime"] - finished["StartTime"]
    assert 0 < lapsed < 5000, f"Job should've taken ~1s {finished}"

    assert prev_running["EndTime"] > prev_running["StartTime"] > 0
Exemplo n.º 14
0
def test_workflow_lifetime_2(call_ray_start, reset_workflow):
    """Case 2: the workflow output is still available after its driver is killed."""
    driver_code = driver_script.format(100)
    driver_proc = run_string_as_driver_nonblocking(driver_code)
    # Let the driver run for a while, then terminate it.
    time.sleep(10)
    driver_proc.kill()
    time.sleep(1)
    workflow.init()
    assert ray.get(workflow.get_output("driver_terminated")) == 20
Exemplo n.º 15
0
def test_workflow_lifetime_2(workflow_start_cluster):
    """Case 2: the workflow output is still available after its driver is killed."""
    address, storage_uri = workflow_start_cluster
    env_override = {"RAY_ADDRESS": address}
    with patch.dict(os.environ, env_override):
        ray.init()
        driver_code = driver_script.format(100)
        driver_proc = run_string_as_driver_nonblocking(driver_code)
        # Let the driver run for a while, then terminate it.
        time.sleep(10)
        driver_proc.kill()
        time.sleep(1)
        output = workflow.get_output("driver_terminated")
        assert output == 20
Exemplo n.º 16
0
def test_workflow_lifetime_2(call_ray_start, reset_workflow):
    """Case 2: the workflow output is still available after its driver is killed."""
    env_override = {"RAY_ADDRESS": call_ray_start}
    with patch.dict(os.environ, env_override):
        driver_code = driver_script.format(100)
        driver_proc = run_string_as_driver_nonblocking(driver_code)
        # Let the driver run for a while, then terminate it.
        time.sleep(10)
        driver_proc.kill()
        time.sleep(1)
        workflow.init()
        assert ray.get(workflow.get_output("driver_terminated")) == 20
Exemplo n.º 17
0
def test_recovery_cluster_failure(reset_workflow, tmp_path):
    """Check that the ``cluster_failure`` workflow resumes after a cluster stop.

    A head node is started and a driver launched against it; the cluster is
    stopped and the driver killed, then the workflow is resumed from storage
    under ``tmp_path`` and must return 20.
    """
    storage_path = str(tmp_path)
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    driver_code = driver_script.format(tmp_path=storage_path)
    driver_proc = run_string_as_driver_nonblocking(driver_code)
    # Let the driver run for a while before stopping the cluster.
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    driver_proc.kill()
    time.sleep(1)
    workflow.init(storage_path)
    result_ref = workflow.resume("cluster_failure")
    assert ray.get(result_ref) == 20
    # Clear the global storage before shutting down.
    workflow.storage.set_global_storage(None)
    ray.shutdown()
Exemplo n.º 18
0
def test_drivers_named_actors(call_ray_start):
    """Check that several drivers can submit tasks to one named actor.

    One long-lived driver creates the named actor ``Counter``; three
    short-lived drivers then look it up in turn, call ``increment`` and expect
    the counter values 1, 2 and 3 respectively.
    """
    address = call_ray_start

    ray.init(address=address, namespace="test")

    # Driver that creates the named actor and then sleeps for a while.
    creator_script = """
import ray
import time
ray.init(address="{}", namespace="test")
@ray.remote
class Counter:
    def __init__(self):
        self.count = 0
    def increment(self):
        self.count += 1
        return self.count
counter = Counter.options(name="Counter").remote()
time.sleep(100)
""".format(
        address
    )

    # Driver that submits to the named actor and exits. The second
    # placeholder is kept as "{}" so the expected count can be filled in
    # later, once per submitting driver.
    submitter_template = """
import ray
import time
ray.init(address="{}", namespace="test")
while True:
    try:
        counter = ray.get_actor("Counter")
        break
    except ValueError:
        time.sleep(1)
assert ray.get(counter.increment.remote()) == {}
print("success")
""".format(
        address, "{}"
    )

    creator_proc = run_string_as_driver_nonblocking(creator_script)

    for expected_count in (1, 2, 3):
        submitter_script = submitter_template.format(expected_count)
        out = run_string_as_driver(submitter_script)
        assert "success" in out

    creator_proc.kill()
Exemplo n.º 19
0
def test_core_worker_error_message():
    """Check that an error pushed to the driver shows up on its stderr."""
    driver_code = """
import ray
import sys

ray.init(local_mode=True)

# In local mode this generates an ERROR level log.
ray._private.utils.push_error_to_driver(
    ray.worker.global_worker, "type", "Hello there")
    """

    driver = run_string_as_driver_nonblocking(driver_code)
    stderr_text = driver.stderr.read().decode("ascii")

    assert "Hello there" in stderr_text, stderr_text
Exemplo n.º 20
0
def test_actor_stdout(file):
    """Check the actor prefix on output written to stdout or stderr.

    ``Actor2`` sets the name returned by its ``__repr__`` at the end of
    ``__init__``; output printed before that is expected under ``Actor2``,
    and output printed afterwards under ``ActorX``.

    Args:
        file: ``"stdout"`` to exercise ``sys.stdout``; any other value
            exercises ``sys.stderr``.
    """
    use_stdout = file == "stdout"
    file_handle = "sys.stdout" if use_stdout else "sys.stderr"

    driver_code = f"""
import ray
import sys

ray.init(num_cpus=2)

@ray.remote
class Actor1:
    def f(self):
        print("hi", file={file_handle})

@ray.remote
class Actor2:
    def __init__(self):
        print("init", file={file_handle})
        self.name = "ActorX"
    def f(self):
        print("bye", file={file_handle})
    def __repr__(self):
        return self.name

a = Actor1.remote()
ray.get(a.f.remote())
b = Actor2.remote()
ray.get(b.f.remote())
    """

    driver = run_string_as_driver_nonblocking(driver_code)
    # Read only the stream the actors were told to print to.
    stream = driver.stdout if use_stdout else driver.stderr
    captured = stream.read().decode("ascii")
    print(captured)

    for fragment in ("hi", "(Actor1 pid=", "bye"):
        assert fragment in captured, captured
    assert re.search("Actor2 pid=.*init", captured), captured
    assert not re.search("ActorX pid=.*init", captured), captured
    assert re.search("ActorX pid=.*bye", captured), captured
    assert not re.search("Actor2 pid=.*bye", captured), captured
Exemplo n.º 21
0
def test_runtime_env_hook():
    """Check that the hook named in ``RAY_RUNTIME_ENV_HOOK`` takes effect.

    The driver prints the ``HOOK_KEY`` environment variable seen by a remote
    task; ``HOOK_VALUE`` must appear in the combined stdout/stderr.
    """
    driver_code = """
import ray
import os

@ray.remote
def f():
    return os.environ.get("HOOK_KEY")

print(ray.get(f.remote()))
"""

    hook_env = {"RAY_RUNTIME_ENV_HOOK": "ray.tests.test_output._hook"}
    driver = run_string_as_driver_nonblocking(driver_code, env=hook_env)
    stdout_text = driver.stdout.read().decode("ascii")
    stderr_text = driver.stderr.read().decode("ascii")
    combined = stdout_text + stderr_text
    print(combined)
    assert "HOOK_VALUE" in combined
Exemplo n.º 22
0
def test_no_verbose_output():
    """Check that racing actor creation does not produce noisy output.

    After dropping lines that mention "Ray dashboard" or "The object store",
    the driver's combined stdout/stderr must consist of just ``DONE``.
    """
    driver_code = """
import ray

@ray.remote
class Actor:
    def ping(self):
        return "ok"


@ray.remote
def getter(name):
    actor = Actor.options(
        name="foo", lifetime="detached", namespace="n", get_if_exists=True).remote()
    ray.get(actor.ping.remote())


def do_run(name):
    name = "actor_" + str(name)
    tasks = [getter.remote(name) for i in range(4)]
    ray.get(tasks)
    try:
        ray.kill(ray.get_actor(name, namespace="n"))  # Cleanup
    except:
        pass


for i in range(100):
    do_run(i)

print("DONE")
"""

    driver = run_string_as_driver_nonblocking(driver_code)
    stdout_text = driver.stdout.read().decode("ascii")
    stderr_text = driver.stderr.read().decode("ascii")
    out_str = stdout_text + stderr_text
    # Check there's no excessively verbose raylet error messages due to
    # actor creation races.
    kept_lines = [
        line for line in out_str.split("\n")
        if not ("Ray dashboard" in line or "The object store" in line)
    ]
    assert "".join(kept_lines).strip() == "DONE", out_str
Exemplo n.º 23
0
def test_fail_importing_actor(ray_start_regular, error_pubsub):
    """Check the errors reported when an actor class cannot be imported.

    The driver defines an actor that closes over a module loaded from a
    temporary file; the driver's stderr must contain both the
    ``ModuleNotFoundError`` and the "failed to import" ``RuntimeError`` text.
    """
    driver_code = """
import os
import sys
import tempfile
import ray

ray.init()
temporary_python_file = '''
def temporary_helper_function():
   return 1
'''

f = tempfile.NamedTemporaryFile("w+", suffix=".py", prefix="_", delete=True)
f_name = f.name
f.close()
f = open(f_name, "w+")
f.write(temporary_python_file)
f.flush()
directory = os.path.dirname(f_name)
# Get the module name and strip ".py" from the end.
module_name = os.path.basename(f_name)[:-3]
sys.path.append(directory)
module = __import__(module_name)

# Define an actor that closes over this temporary module. This should
# fail when it is unpickled.
@ray.remote
class Foo:
    def __init__(self):
        self.x = module.temporary_python_file()

a = Foo.remote()
import time
time.sleep(3)  # Wait for actor start.
"""
    driver = run_string_as_driver_nonblocking(driver_code)
    stdout_text = driver.stdout.read().decode("ascii")
    stderr_text = driver.stderr.read().decode("ascii")
    print(stdout_text)
    print(stderr_text)
    expected_errors = (
        "ModuleNotFoundError: No module named",
        "RuntimeError: The actor with name Foo failed to import",
    )
    for message in expected_errors:
        assert message in stderr_text
Exemplo n.º 24
0
def test_recovery_cluster_failure_resume_all(tmp_path, shutdown_only):
    """Check that ``workflow.resume_all`` recovers a workflow after a cluster stop.

    A driver starts the ``cluster_failure`` workflow, whose single step blocks
    on a file lock held by this test. The cluster is stopped and the driver
    killed while the step is blocked; after the lock is released, resuming all
    workflows from the same storage must yield exactly that workflow with the
    result 20.

    Args:
        tmp_path: Per-test temporary directory; holds the lock file and the
            workflow storage directory.
        shutdown_only: Fixture, not used directly in the body.
    """
    ray.shutdown()

    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    workflow_dir = tmp_path / "workflow"
    lock_file = tmp_path / "lock_file"

    # Hold the lock so the step in the driver stays blocked inside `foo`.
    lock = FileLock(lock_file)
    lock.acquire()
    try:
        proc = run_string_as_driver_nonblocking(
            f"""
import time
import ray
from ray import workflow
from filelock import FileLock

@ray.remote
def foo(x):
    with FileLock("{str(lock_file)}"):
        return 20

if __name__ == "__main__":
    ray.init(storage="{str(workflow_dir)}")
    workflow.init()
    assert workflow.create(foo.bind(0)).run(workflow_id="cluster_failure") == 20
"""
        )
        # Give the driver time to start and reach the blocked step.
        time.sleep(10)
        subprocess.check_call(["ray", "stop"])
        proc.kill()
        time.sleep(1)
    finally:
        # Always release the lock, even if stopping the cluster failed, so
        # that nothing stays blocked on it.
        lock.release()

    # Resume everything found in the storage the killed driver was using.
    ray.init(storage=str(workflow_dir))
    workflow.init()
    resumed = workflow.resume_all()
    assert len(resumed) == 1
    (wid, obj_ref) = resumed[0]
    assert wid == "cluster_failure"
    assert ray.get(obj_ref) == 20
Exemplo n.º 25
0
def test_autoscaler_no_spam():
    """Check that no "Tip:" message is printed when tasks merely queue up.

    Five one-CPU tasks are run on a one-CPU cluster; neither stdout nor stderr
    of the driver may contain ``"Tip:"``.
    """
    driver_code = """
import ray
import time

ray.init(num_cpus=1)

@ray.remote(num_cpus=1)
def f():
    time.sleep(1)

ray.get([f.remote() for _ in range(5)])
    """

    driver = run_string_as_driver_nonblocking(driver_code)
    stdout_text = driver.stdout.read().decode("ascii")
    stderr_text = driver.stderr.read().decode("ascii")

    print(stdout_text, stderr_text)
    for stream_text in (stdout_text, stderr_text):
        assert "Tip:" not in stream_text
Exemplo n.º 26
0
def test_worker_stdout():
    """Check that a task's stdout and stderr reach the matching driver stream.

    The task prints ``"abc"`` to stdout and ``"def"`` to stderr; the driver's
    stdout must end with ``"abc\\n"`` and the last complete stderr line must
    end with ``"def"``.
    """
    driver_code = """
import ray
import sys

ray.init(num_cpus=2)

@ray.remote
def foo(out_str, err_str):
    print(out_str)
    print(err_str, file=sys.stderr)

ray.get(foo.remote("abc", "def"))
    """

    driver = run_string_as_driver_nonblocking(driver_code)
    stdout_text = driver.stdout.read().decode("ascii")
    stderr_text = driver.stderr.read().decode("ascii")

    assert stdout_text.endswith("abc\n")
    # The element after the final newline is [-1], so [-2] is the last line.
    stderr_lines = stderr_text.split("\n")
    assert stderr_lines[-2].endswith("def")
Exemplo n.º 27
0
def test_head_node_down(short_gcs_publish_timeout, ray_start_cluster):
    """Verify that all Ray worker processes exit once the head node is removed."""
    cluster = ray_start_cluster
    # Head node.
    head_config = {"gcs_rpc_server_reconnect_timeout_s": 1}
    head = cluster.add_node(num_cpus=2, _system_config=head_config)

    # Two worker nodes.
    for _ in range(2):
        cluster.add_node(num_cpus=2)
    cluster.wait_for_nodes()

    # Driver that occupies every CPU and then keeps sleeping.
    driver_code = """
import ray
ray.init(address="{}")
@ray.remote
def f():
    import time
    time.sleep(10)

num_cpus = int(ray.available_resources()["CPU"])
tasks = [f.remote() for _ in range(num_cpus)]
import time
time.sleep(100)
""".format(
        cluster.address
    )

    def _worker_count():
        return len(get_all_ray_worker_processes())

    driver_proc = run_string_as_driver_nonblocking(driver_code)
    # Make sure the driver is running.
    time.sleep(1)
    wait_for_condition(lambda: driver_proc.poll() is None)
    wait_for_condition(lambda: _worker_count() > 0)

    cluster.remove_node(head)

    wait_for_condition(lambda: _worker_count() == 0)
Exemplo n.º 28
0
def test_autoscaler_infeasible():
    """Check the messages printed for a task whose resources cannot be met.

    A task requiring one GPU is submitted to a cluster started with
    ``num_cpus=1``; the driver's stdout must contain a "Tip:" and the
    "No available node types can fulfill" error.
    """
    driver_code = """
import ray
import time

ray.init(num_cpus=1)

@ray.remote(num_gpus=1)
def foo():
    pass

x = foo.remote()
time.sleep(15)
    """

    driver = run_string_as_driver_nonblocking(driver_code)
    stdout_text = driver.stdout.read().decode("ascii")
    stderr_text = driver.stderr.read().decode("ascii")

    print(stdout_text, stderr_text)
    for expected in ("Tip:", "Error: No available node types can fulfill"):
        assert expected in stdout_text
Exemplo n.º 29
0
def test_env_installation_nonblocking(shutdown_only):
    """Test fix for https://github.com/ray-project/ray/issues/16226.

    Tasks using an already available runtime env must keep finishing quickly
    while another env, or the same env for a different driver, is requested.
    """
    env1 = {"pip": ["pip-install-test==0.5"]}

    ray.init(runtime_env=env1)

    @ray.remote
    def f():
        return "hello"

    # Warm up a worker because it takes time to start.
    ray.get(f.remote())

    def assert_tasks_finish_quickly(total_sleep_s=0.1):
        """Call f every 0.01 seconds for total time total_sleep_s."""
        gap_s = 0.01
        num_calls = int(total_sleep_s / gap_s)
        for _ in range(num_calls):
            started_at = time.time()
            ray.get(f.remote())
            elapsed = time.time() - started_at
            # Env installation takes around 10 to 60 seconds.  If we fail the
            # below assert, we can be pretty sure an env installation blocked
            # the task.
            assert elapsed < 1.0
            time.sleep(gap_s)

    assert_tasks_finish_quickly()

    # Check that installing env2 does not block tasks using env1.
    env2 = {"pip": ["pip-install-test==0.5", "requests"]}
    f.options(runtime_env=env2).remote()
    assert_tasks_finish_quickly()

    # Check that installing env1 in a new worker in a separate driver does
    # not block other tasks that use env1.
    other_driver_code = install_env_script.format(env=env1)
    other_driver = run_string_as_driver_nonblocking(other_driver_code)
    assert_tasks_finish_quickly(total_sleep_s=5)
    other_driver.kill()
    other_driver.wait()
Exemplo n.º 30
0
def test_multi_stdout_err(file):
    """Check the per-task prefix on output written to stdout or stderr.

    Three remote functions each print an empty line to the selected stream;
    the captured stream, with line breaks removed, must contain a
    ``(<name> pid=`` prefix for every one of them.

    Args:
        file: ``"stdout"`` to exercise ``sys.stdout``; any other value
            exercises ``sys.stderr``.
    """
    use_stdout = file == "stdout"
    file_handle = "sys.stdout" if use_stdout else "sys.stderr"

    driver_code = f"""
import ray
import sys

ray.init(num_cpus=1)

@ray.remote
def foo():
    print(file={file_handle})

@ray.remote
def bar():
    print(file={file_handle})

@ray.remote
def baz():
    print(file={file_handle})

ray.get(foo.remote())
ray.get(bar.remote())
ray.get(baz.remote())
    """

    driver = run_string_as_driver_nonblocking(driver_code)
    # Read only the stream the tasks were told to print to.
    stream = driver.stdout if use_stdout else driver.stderr
    captured = stream.read().decode("ascii")

    # Join all lines so a prefix split across lines is still found.
    flattened = "".join(captured.splitlines())
    for prefix in ("(foo pid=", "(bar pid=", "(baz pid="):
        assert prefix in flattened, flattened