def test_warning_for_infeasible_tasks(ray_start_regular, error_pubsub):
    p = error_pubsub
    # Check that we get warning messages for infeasible tasks.

    @ray.remote(num_gpus=1)
    def f():
        pass

    @ray.remote(resources={"Custom": 1})
    class Foo:
        pass

    # This task is infeasible.
    f.remote()
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR

    # This actor placement task is infeasible.
    Foo.remote()
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR

    # Placement group cannot be made, but no warnings should occur.
    total_cpus = ray.cluster_resources()["CPU"]

    # Occupy one CPU with an actor.
    @ray.remote(num_cpus=1)
    class A:
        pass

    a = A.remote()
    print(a)

    @ray.remote(num_cpus=total_cpus)
    def g():
        pass

    pg = placement_group([{"CPU": total_cpus}], strategy="STRICT_PACK")
    g.options(placement_group=pg).remote()

    errors = get_error_message(
        p, 1, ray_constants.INFEASIBLE_TASK_ERROR, timeout=5)
    assert len(errors) == 0, errors


def test_ready_warning_suppressed(ray_start_regular, error_pubsub):
    p = error_pubsub
    # Create an infeasible pg.
    pg = ray.util.placement_group([{"CPU": 2}] * 2, strategy="STRICT_PACK")
    with pytest.raises(ray.exceptions.GetTimeoutError):
        ray.get(pg.ready(), timeout=0.5)

    errors = get_error_message(
        p, 1, ray.ray_constants.INFEASIBLE_TASK_ERROR, timeout=0.1)
    assert len(errors) == 0


def test_runtime_env_no_spurious_resource_deadlock_msg(
        runtime_env_local_dev_env_var, ray_start_regular, error_pubsub):
    p = error_pubsub

    @ray.remote(runtime_env={"pip": ["tensorflow", "torch"]})
    def f():
        pass

    # Check that no warning is printed.
    ray.get(f.remote())
    errors = get_error_message(p, 5, ray.ray_constants.RESOURCE_DEADLOCK_ERROR)
    assert len(errors) == 0


def test_push_error_to_driver_through_redis(ray_start_regular, error_pubsub):
    address_info = ray_start_regular
    address = address_info["redis_address"]
    redis_client = ray._private.services.create_redis_client(
        address, password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    error_message = "Test error message"
    ray._private.utils.push_error_to_driver_through_redis(
        redis_client, ray_constants.DASHBOARD_AGENT_DIED_ERROR, error_message)
    errors = get_error_message(error_pubsub, 1,
                               ray_constants.DASHBOARD_AGENT_DIED_ERROR)
    assert errors[0].type == ray_constants.DASHBOARD_AGENT_DIED_ERROR
    assert errors[0].error_message == error_message


def test_publish_error_to_driver(ray_start_regular, error_pubsub):
    address_info = ray_start_regular
    gcs_publisher = GcsPublisher(address=address_info["gcs_address"])
    error_message = "Test error message"
    ray._private.utils.publish_error_to_driver(
        ray_constants.DASHBOARD_AGENT_DIED_ERROR,
        error_message,
        gcs_publisher=gcs_publisher,
    )
    errors = get_error_message(error_pubsub, 1,
                               ray_constants.DASHBOARD_AGENT_DIED_ERROR)
    assert errors[0].type == ray_constants.DASHBOARD_AGENT_DIED_ERROR
    assert errors[0].error_message == error_message


def test_failed_function_to_run(ray_start_2_cpus, error_pubsub):
    p = error_pubsub

    def f(worker):
        if ray.worker.global_worker.mode == ray.WORKER_MODE:
            raise Exception("Function to run failed.")

    ray.worker.global_worker.run_function_on_all_workers(f)
    # Check that the error message is in the task info.
    errors = get_error_message(p, 2, ray_constants.FUNCTION_TO_RUN_PUSH_ERROR)
    assert len(errors) == 2
    assert errors[0].type == ray_constants.FUNCTION_TO_RUN_PUSH_ERROR
    assert "Function to run failed." in errors[0].error_message
    assert "Function to run failed." in errors[1].error_message


def test_version_mismatch(error_pubsub, shutdown_only):
    ray_version = ray.__version__
    ray.__version__ = "fake ray version"

    ray.init(num_cpus=1)
    p = error_pubsub

    errors = get_error_message(p, 1, ray_constants.VERSION_MISMATCH_PUSH_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.VERSION_MISMATCH_PUSH_ERROR

    # Reset the version.
    ray.__version__ = ray_version


def wait_for_errors(p, error_check):
    # Wait for errors from all the nondeterministic tasks.
    errors = []
    time_left = 100
    while time_left > 0:
        errors.extend(get_error_message(p, 1))
        if error_check(errors):
            break
        time_left -= 1
        time.sleep(1)

    # Make sure that enough errors came through.
    assert error_check(errors)
    return errors


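# A minimal usage sketch for wait_for_errors (a hypothetical call site, not a
# test from this file): poll the pubsub channel until at least two errors have
# arrived, failing once the 100-second budget above is exhausted.
#
#     errors = wait_for_errors(p, lambda errors: len(errors) >= 2)

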
def test_detached_warning(shutdown_only):
    ray.init()

    @ray.remote
    class DetachedActor:
        def ping(self):
            return "pong"

    error_pubsub = init_error_pubsub()
    actor = DetachedActor.options(  # noqa: F841
        name="Pinger", lifetime="detached").remote()
    errors = get_error_message(error_pubsub, 1, None)
    error = errors.pop()
    assert error.type == ray_constants.DETACHED_ACTOR_ANONYMOUS_NAMESPACE_ERROR


def test_warning_for_too_many_nested_tasks(shutdown_only):
    # Check that we receive a warning if we run a workload that requires too
    # many workers to be started.
    num_cpus = 2
    ray.init(num_cpus=num_cpus)
    p = init_error_pubsub()

    remote_wait = Semaphore.remote(value=0)
    nested_wait = Semaphore.remote(value=0)
    ray.get([
        remote_wait.locked.remote(),
        nested_wait.locked.remote(),
    ])

    @ray.remote(num_cpus=0.25)
    def f():
        time.sleep(1000)
        return 1

    @ray.remote(num_cpus=0.25)
    def h(nested_waits):
        nested_wait.release.remote()
        ray.get(nested_waits)
        ray.get(f.remote())

    @ray.remote(num_cpus=0.25)
    def g(remote_waits, nested_waits):
        # Block here so that the f tasks all get submitted to the scheduler
        # after the g tasks.
        remote_wait.release.remote()
        # Wait until every lock is released.
        ray.get(remote_waits)
        ray.get(h.remote(nested_waits))

    num_root_tasks = num_cpus * 4
    # Lock the remote tasks until everything is scheduled.
    remote_waits = []
    nested_waits = []
    for _ in range(num_root_tasks):
        remote_waits.append(remote_wait.acquire.remote())
        nested_waits.append(nested_wait.acquire.remote())

    [g.remote(remote_waits, nested_waits) for _ in range(num_root_tasks)]

    errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR
    p.close()


def test_warning_actor_waiting_on_actor(shutdown_only):
    ray.init(
        num_cpus=1, _system_config={"debug_dump_period_milliseconds": 500})
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class Actor:
        pass

    a = Actor.remote()  # noqa
    b = Actor.remote()  # noqa

    errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR


def test_warning_for_dead_autoscaler(ray_start_regular, error_pubsub):
    # Terminate the autoscaler process.
    from ray.worker import _global_node
    autoscaler_process = _global_node.all_processes[
        ray_constants.PROCESS_TYPE_MONITOR][0].process
    autoscaler_process.terminate()

    # Confirm that we receive an autoscaler failure error.
    errors = get_error_message(
        error_pubsub, 1, ray_constants.MONITOR_DIED_ERROR, timeout=5)
    assert len(errors) == 1

    # Confirm that the autoscaler failure error is stored.
    error = _internal_kv_get(DEBUG_AUTOSCALING_ERROR)
    assert error is not None


def test_dashboard_agent_restart(ray_start_cluster_head, error_pubsub):
    """Test that when the agent fails to start many times in a row,
    the error message is suppressed correctly without spamming the driver.
    """
    # Choose a duplicated port for the agent so that it will crash.
    p = error_pubsub
    errors = get_error_message(
        p, 1, ray_constants.DASHBOARD_AGENT_DIED_ERROR, timeout=10)
    for e in errors:
        assert ("There are 2 possible problems "
                "if you see this error." in e.error_message)
    # Make sure the agent process is not started anymore.
    cluster = ray_start_cluster_head
    wait_for_condition(lambda: search_agents(cluster) is None)


def test_worker_dying(ray_start_regular, error_pubsub):
    p = error_pubsub

    # Define a remote function that will kill the worker that runs it.
    @ray.remote(max_retries=0)
    def f():
        eval("exit()")

    with pytest.raises(ray.exceptions.WorkerCrashedError):
        ray.get(f.remote())

    errors = get_error_message(p, 1, ray_constants.WORKER_DIED_PUSH_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_DIED_PUSH_ERROR
    assert "died or was killed while executing" in errors[0].error_message


def test_worker_raising_exception(ray_start_regular, error_pubsub):
    p = error_pubsub

    @ray.remote(max_calls=2)
    def f():
        # This is the only reasonable variable we can set here that makes the
        # execute_task function fail after the task got executed.
        worker = ray.worker.global_worker
        worker.function_actor_manager.increase_task_counter = None

    # Running this task should cause the worker to raise an exception after
    # the task has successfully completed.
    f.remote()
    errors = get_error_message(p, 1, ray_constants.WORKER_CRASH_PUSH_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_CRASH_PUSH_ERROR


def test_warning_for_infeasible_zero_cpu_actor(shutdown_only):
    # Check that we cannot place an actor on a 0 CPU machine and that we get
    # an infeasibility warning (even though the actor creation task itself
    # requires no CPUs).
    ray.init(num_cpus=0)
    p = init_error_pubsub()

    @ray.remote
    class Foo:
        pass

    # The actor creation should be infeasible.
    Foo.remote()
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR
    p.close()


def test_put_error1(ray_start_object_store_memory, error_pubsub):
    p = error_pubsub
    num_objects = 3
    object_size = 4 * 10**5

    # Define a task with a single dependency, a numpy array, that returns
    # another array.
    @ray.remote
    def single_dependency(i, arg):
        arg = np.copy(arg)
        arg[0] = i
        return arg

    @ray.remote
    def put_arg_task():
        # Launch num_objects instances of the remote task, each dependent
        # on the one before it. The result of the first task should get
        # evicted.
        args = []
        arg = single_dependency.remote(0,
                                       np.zeros(object_size, dtype=np.uint8))
        for i in range(num_objects):
            arg = single_dependency.remote(i, arg)
            args.append(arg)

        # Get the last value to force all tasks to finish.
        value = ray.get(args[-1])
        assert value[0] == i

        # Get the first value (which should have been evicted) to force
        # reconstruction. Currently, since we're not able to reconstruct
        # `ray.put` objects that were evicted and whose originating tasks
        # are still running, this call should hang and push an error to
        # the driver.
        ray.get(args[0])

    put_arg_task.remote()

    # Make sure we receive the correct error message.
    errors = get_error_message(p, 1,
                               ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR


def test_warning_many_actor_tasks_queued(shutdown_only):
    ray.init(num_cpus=1)
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class Foo:
        def f(self):
            import time
            time.sleep(1)

    a = Foo.remote()
    [a.f.remote() for _ in range(50000)]
    errors = get_error_message(p, 4, ray_constants.EXCESS_QUEUEING_WARNING)
    msgs = [e.error_message for e in errors]
    assert ("Warning: More than 5000 tasks are pending submission to actor"
            in msgs[0])
    assert ("Warning: More than 10000 tasks are pending submission to actor"
            in msgs[1])
    assert ("Warning: More than 20000 tasks are pending submission to actor"
            in msgs[2])
    assert ("Warning: More than 40000 tasks are pending submission to actor"
            in msgs[3])


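# The thresholds asserted above double at each successive warning. A minimal
# sketch of the expected sequence, assuming the first warning fires at 5000
# pending tasks (values taken from the assertions, not from Ray internals):
#
#     thresholds = [5000 * 2**i for i in range(4)]
#     assert thresholds == [5000, 10000, 20000, 40000]

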
def test_publish_error_to_driver_redis(ray_start_regular, error_pubsub):
    address_info = ray_start_regular
    address = address_info["redis_address"]
    redis_client = ray._private.services.create_redis_client(
        address, password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    gcs_publisher = None
    if gcs_pubsub_enabled():
        gcs_publisher = GcsPublisher(
            address=gcs_utils.get_gcs_address_from_redis(redis_client))
    error_message = "Test error message"
    ray._private.utils.publish_error_to_driver(
        ray_constants.DASHBOARD_AGENT_DIED_ERROR,
        error_message,
        redis_client=redis_client,
        gcs_publisher=gcs_publisher)
    errors = get_error_message(error_pubsub, 1,
                               ray_constants.DASHBOARD_AGENT_DIED_ERROR)
    assert errors[0].type == ray_constants.DASHBOARD_AGENT_DIED_ERROR
    assert errors[0].error_message == error_message


def test_raylet_and_agent_share_fate(shutdown_only):
    """Test that the raylet and agent share fate."""
    ray.init(include_dashboard=True)
    p = init_error_pubsub()

    node = ray._private.worker._global_node
    all_processes = node.all_processes
    raylet_proc_info = all_processes[ray_constants.PROCESS_TYPE_RAYLET][0]
    raylet_proc = psutil.Process(raylet_proc_info.process.pid)

    wait_for_condition(lambda: search_agent(raylet_proc.children()))
    agent_proc = search_agent(raylet_proc.children())
    agent_pid = agent_proc.pid

    check_agent_register(raylet_proc, agent_pid)

    # The agent should be dead if the raylet exits.
    raylet_proc.terminate()
    raylet_proc.wait()
    agent_proc.wait(5)

    # No error should be reported for graceful termination.
    errors = get_error_message(p, 1, ray_constants.RAYLET_DIED_ERROR)
    assert len(errors) == 0, errors

    ray.shutdown()

    ray.init(include_dashboard=True)
    all_processes = ray._private.worker._global_node.all_processes
    raylet_proc_info = all_processes[ray_constants.PROCESS_TYPE_RAYLET][0]
    raylet_proc = psutil.Process(raylet_proc_info.process.pid)
    wait_for_condition(lambda: search_agent(raylet_proc.children()))
    agent_proc = search_agent(raylet_proc.children())
    agent_pid = agent_proc.pid

    check_agent_register(raylet_proc, agent_pid)

    # The raylet should be dead if the agent exits.
    agent_proc.kill()
    agent_proc.wait()
    raylet_proc.wait(5)


# `sync` is assumed to be supplied by pytest parametrization; without the
# decorator below, pytest would report a missing `sync` fixture.
@pytest.mark.parametrize("sync", [True, False])
def test_no_warning_many_actor_tasks_queued_when_sequential(
        shutdown_only, sync: bool):
    ray.init(num_cpus=1)
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class SyncFoo:
        def f(self):
            return 1

    @ray.remote(num_cpus=1)
    class AsyncFoo:
        async def f(self):
            return 1

    Foo = SyncFoo if sync else AsyncFoo
    a = Foo.remote()
    for _ in range(10000):
        assert ray.get(a.f.remote()) == 1

    errors = get_error_message(
        p, 1, ray_constants.EXCESS_QUEUEING_WARNING, timeout=1)
    assert len(errors) == 0


def test_failed_actor_method(ray_start_regular, error_pubsub):
    p = error_pubsub
    error_message2 = "actor method failed"

    @ray.remote
    class FailedActor:
        def __init__(self):
            pass

        def fail_method(self):
            raise Exception(error_message2)

    a = FailedActor.remote()

    # Make sure that we get errors from a failed method.
    a.fail_method.remote()
    errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.TASK_PUSH_ERROR
    assert error_message2 in errors[0].error_message


def test_actor_worker_dying(ray_start_regular, error_pubsub):
    p = error_pubsub

    @ray.remote
    class Actor:
        def kill(self):
            eval("exit()")

    @ray.remote
    def consume(x):
        pass

    a = Actor.remote()
    [obj], _ = ray.wait([a.kill.remote()], timeout=5)
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(obj)
    with pytest.raises(ray.exceptions.RayTaskError):
        ray.get(consume.remote(obj))
    errors = get_error_message(p, 1, ray_constants.WORKER_DIED_PUSH_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_DIED_PUSH_ERROR


def test_warning_all_tasks_blocked(shutdown_only):
    ray.init(
        num_cpus=1, _system_config={"debug_dump_period_milliseconds": 500})
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class Foo:
        def f(self):
            return 0

    @ray.remote
    def f():
        # Creating all of these actors is not possible with only 1 CPU.
        actors = [Foo.remote() for _ in range(3)]
        for a in actors:
            ray.get(a.f.remote())

    # Run in a task to check that we handle the blocked task case correctly.
    f.remote()
    errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR


def test_actor_scope_or_intentionally_killed_message(ray_start_regular,
                                                     error_pubsub):
    p = error_pubsub

    @ray.remote
    class Actor:
        def __init__(self):
            # This log is added to debug a flaky test issue.
            print(os.getpid())

        def ping(self):
            pass

    a = Actor.remote()
    # Without this wait, there seems to be a race condition happening
    # in the CI. This is not a fundamental fix for that, but it at least
    # makes the test less flaky.
    ray.get(a.ping.remote())
    a = Actor.remote()
    a.__ray_terminate__.remote()
    time.sleep(1)
    errors = get_error_message(p, 1)
    assert len(errors) == 0, "Should not have propagated an error - {}".format(
        errors)


def test_warning_for_dead_node(ray_start_cluster_2_nodes, error_pubsub):
    cluster = ray_start_cluster_2_nodes
    cluster.wait_for_nodes()
    p = error_pubsub

    node_ids = {item["NodeID"] for item in ray.nodes()}

    # Try to make sure that the monitor has received at least one heartbeat
    # from the node.
    time.sleep(0.5)

    # Kill both raylets.
    cluster.list_all_nodes()[1].kill_raylet()
    cluster.list_all_nodes()[0].kill_raylet()

    # Check that we get warning messages for both raylets.
    errors = get_error_message(p, 2, ray_constants.REMOVED_NODE_ERROR, 40)

    # Extract the client IDs from the error messages. This will need to be
    # changed if the error message changes.
    warning_node_ids = {error.error_message.split(" ")[5] for error in errors}

    assert node_ids == warning_node_ids


def test_warning_task_waiting_on_actor(shutdown_only):
    ray.init(
        num_cpus=1, _system_config={"debug_dump_period_milliseconds": 500})
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class Actor:
        def hello(self):
            pass

    a = Actor.remote()  # noqa
    ray.get(a.hello.remote())

    @ray.remote(num_cpus=1)
    def f():
        print("f running")
        time.sleep(999)

    ids = [f.remote()]  # noqa

    errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR


def test_actor_worker_dying_future_tasks(ray_start_regular, error_pubsub):
    p = error_pubsub

    @ray.remote(max_restarts=0)
    class Actor:
        def getpid(self):
            return os.getpid()

        def sleep(self):
            time.sleep(1)

    a = Actor.remote()
    pid = ray.get(a.getpid.remote())
    tasks1 = [a.sleep.remote() for _ in range(10)]
    os.kill(pid, 9)
    time.sleep(0.1)
    tasks2 = [a.sleep.remote() for _ in range(10)]
    for obj in tasks1 + tasks2:
        with pytest.raises(Exception):
            ray.get(obj)

    errors = get_error_message(p, 1, ray_constants.WORKER_DIED_PUSH_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_DIED_PUSH_ERROR


def test_error_isolation(call_ray_start):
    address = call_ray_start
    # Connect a driver to the Ray cluster.
    ray.init(address=address)

    # If a gRPC call exceeds its timeout, the call is cancelled on the client
    # side but the server may still reply to it, leading to a missed message.
    # Using a sequence number to ensure no message is dropped could be the
    # long-term solution, but its complexity and the fact that Ray subscribers
    # do not use deadlines in production make it less preferred.
    # Therefore, a simpler workaround is used instead: a different subscriber
    # is used for each get_error_message() call.
    subscribers = [init_error_pubsub() for _ in range(3)]

    # There shouldn't be any errors yet.
    errors = get_error_message(subscribers[0], 1, timeout=2)
    assert len(errors) == 0

    error_string1 = "error_string1"
    error_string2 = "error_string2"

    @ray.remote
    def f():
        raise Exception(error_string1)

    # Run a remote function that throws an error.
    with pytest.raises(Exception):
        ray.get(f.remote())

    # Wait for the error to appear in Redis.
    errors = get_error_message(subscribers[1], 1)

    # Make sure we got the error.
    assert len(errors) == 1
    assert error_string1 in errors[0].error_message

    # Start another driver and make sure that it does not receive this
    # error. Make the other driver throw an error, and make sure it
    # receives that error.
    driver_script = """
import ray
import time
from ray._private.test_utils import init_error_pubsub, get_error_message

ray.init(address="{}")
subscribers = [init_error_pubsub() for _ in range(2)]
time.sleep(1)
errors = get_error_message(subscribers[0], 1, timeout=2)
assert len(errors) == 0

@ray.remote
def f():
    raise Exception("{}")

try:
    ray.get(f.remote())
except Exception as e:
    pass

errors = get_error_message(subscribers[1], 1)
assert len(errors) == 1

assert "{}" in errors[0].error_message

print("success")
""".format(address, error_string2, error_string2)

    out = run_string_as_driver(driver_script)
    # Make sure the other driver succeeded.
    assert "success" in out

    # Make sure that the other error message doesn't show up for this
    # driver.
    errors = get_error_message(subscribers[2], 1)
    assert len(errors) == 1


def test_threaded_actor_integration_test_stress(ray_start_cluster_head,
                                                log_pubsub, error_pubsub):
    """This is a sanity test that checks that threaded actors work with
    the nightly stress test.
    """
    cluster = ray_start_cluster_head
    p = log_pubsub
    e = error_pubsub

    # Prepare the config.
    num_remote_nodes = 4
    num_parents = 6
    num_children = 6
    death_probability = 0.95
    max_concurrency = 10

    for _ in range(num_remote_nodes):
        cluster.add_node(num_cpus=2)

    @ray.remote
    class Child(object):
        def __init__(self, death_probability):
            self.death_probability = death_probability

        def ping(self):
            # Exit the process with some probability.
            exit_chance = np.random.rand()
            if exit_chance > self.death_probability:
                sys.exit(-1)

    @ray.remote
    class Parent(object):
        def __init__(self, num_children, death_probability=0.95):
            self.death_probability = death_probability
            self.children = [
                Child.options(
                    max_concurrency=max_concurrency).remote(death_probability)
                for _ in range(num_children)
            ]

        def ping(self, num_pings):
            children_outputs = []
            for _ in range(num_pings):
                children_outputs += [
                    child.ping.remote() for child in self.children
                ]
            try:
                ray.get(children_outputs)
            except Exception:
                # Replace the children if one of them died.
                self.__init__(len(self.children), self.death_probability)

        def kill(self):
            # Clean up the children.
            ray.get(
                [child.__ray_terminate__.remote() for child in self.children])

    parents = [
        Parent.options(max_concurrency=max_concurrency).remote(
            num_children, death_probability) for _ in range(num_parents)
    ]

    start = time.time()
    loop_times = []
    for _ in range(10):
        loop_start = time.time()
        ray.get([parent.ping.remote(10) for parent in parents])

        # Kill a parent actor with some probability.
        exit_chance = np.random.rand()
        if exit_chance > death_probability:
            parent_index = np.random.randint(len(parents))
            parents[parent_index].kill.remote()
            parents[parent_index] = Parent.options(
                max_concurrency=max_concurrency).remote(
                    num_children, death_probability)
        loop_times.append(time.time() - loop_start)

    result = {}
    print("Finished in: {}s".format(time.time() - start))
    print("Average iteration time: {}s".format(
        sum(loop_times) / len(loop_times)))
    print("Max iteration time: {}s".format(max(loop_times)))
    print("Min iteration time: {}s".format(min(loop_times)))
    result["total_time"] = time.time() - start
    result["avg_iteration_time"] = sum(loop_times) / len(loop_times)
    result["max_iteration_time"] = max(loop_times)
    result["min_iteration_time"] = min(loop_times)
    result["success"] = 1
    print(result)
    ensure_cpu_returned(10)
    del parents

    # Make sure parents are still schedulable.
    parents = [
        Parent.options(max_concurrency=max_concurrency).remote(
            num_children, death_probability) for _ in range(num_parents)
    ]
    ray.get([parent.ping.remote(10) for parent in parents])

    """
    Make sure there are no SIGSEGV, SIGABRT, or other odd check failures.
    """
    # Get all logs for 20 seconds.
    logs = test_utils.get_log_message(p, timeout=20)
    for log in logs:
        assert "SIG" not in log, "A segfault or SIGABRT was reported."
        assert "Check failed" not in log, "A check failure was reported."

    # Get error messages for 10 seconds.
    errors = test_utils.get_error_message(e, timeout=10)
    for error in errors:
        print(error)
        assert "You can ignore this message if" not in error.error_message, (
            "The resource deadlock warning shouldn't be printed, but it was.")