Example #1
class TrialRunnerTest(unittest.TestCase):
    def setUp(self):
        self.cluster = Cluster(
            initialize_head=True,
            connect=True,
            head_node_args={
                "num_cpus": 4,
                "num_gpus": 1,
            })
        self.cluster.add_node(num_cpus=2, num_gpus=1)
        self.cluster.wait_for_nodes()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()

    def testAvailableResources(self):
        assert len(nodes()) == 2
        assert default_device(refresh=True) == "GPU"

    def testOthersTakingResources(self):
        # Let someone occupy the head node
        pg = placement_group([{"CPU": 4, "GPU": 1}])
        ray.get(pg.ready())
        # We are left with the second node
        assert len(nodes()) == 1
        assert default_device(refresh=True) == "GPU"

        pg = placement_group([{"GPU": 1}])
        ray.get(pg.ready())
        # Default device should be CPU
        assert default_device(refresh=True) == "CPU"
        assert len(nodes()) == 1
Example #2
def test_namespace_client():
    cluster = Cluster()
    cluster.add_node(num_cpus=4, ray_client_server_port=8080)
    cluster.wait_for_nodes(1)

    template = """
import ray
ray.util.connect("{address}", namespace="{namespace}")

@ray.remote
class DetachedActor:
    def ping(self):
        return "pong from other job"

actor = DetachedActor.options(name="Pinger", lifetime="detached").remote()
ray.get(actor.ping.remote())
print("Done!!!")
    """

    print(
        run_string_as_driver(
            template.format(address="localhost:8080", namespace="test")))

    ray.util.connect("localhost:8080", namespace="test")

    pinger = ray.get_actor("Pinger")
    assert ray.get(pinger.ping.remote()) == "pong from other job"

    ray.util.disconnect()
    cluster.shutdown()
    # This piece of cleanup doesn't seem to happen automatically.
    ray._private.client_mode_hook._explicitly_disable_client_mode()
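Examples #2 and #10 call `run_string_as_driver`, a helper from Ray's test utilities. A rough sketch of its behavior, assuming only what these examples rely on (it runs the script as a separate driver process, returns its output, and fails on a non-zero exit code); the real implementation may differ:

import subprocess
import sys

def run_string_as_driver(script: str) -> str:
    # Sketch (assumption): run the script in a fresh Python subprocess,
    # return its stdout, and raise if the driver exits with a non-zero code.
    proc = subprocess.run([sys.executable, "-c", script],
                          capture_output=True, text=True, check=True)
    return proc.stdout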
Example #3
def build_cluster(num_nodes, num_cpus, object_store_memory):
    cluster = Cluster()
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_cpus,
                         object_store_memory=object_store_memory)
    cluster.wait_for_nodes()
    return cluster
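A minimal usage sketch for the helper above; the node sizes and the init/shutdown sequence are assumptions, not part of the original snippet:

cluster = build_cluster(num_nodes=3, num_cpus=2,
                        object_store_memory=100 * 1024 * 1024)
ray.init(address=cluster.address)
# ... run workloads against the 3-node local cluster ...
ray.shutdown()
cluster.shutdown()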
Example #4
def test_pull_request_retry(shutdown_only):
    cluster = Cluster()
    cluster.add_node(num_cpus=0, num_gpus=1, object_store_memory=100 * 2**20)
    cluster.add_node(num_cpus=1, num_gpus=0, object_store_memory=100 * 2**20)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote
    def put():
        return np.zeros(64 * 2**20, dtype=np.int8)

    @ray.remote(num_cpus=0, num_gpus=1)
    def driver():
        local_ref = ray.put(np.zeros(64 * 2**20, dtype=np.int8))

        remote_ref = put.remote()

        ready, _ = ray.wait([remote_ref], timeout=30)
        assert len(ready) == 1

        del local_ref

        # This should complete well within the 20-second timeout.
        ready, _ = ray.wait([remote_ref], timeout=20)
        assert len(ready) > 0

    # Pretend the GPU node is the driver. We do this to force the placement of
    # the driver and `put` task on different nodes.
    ray.get(driver.remote())
Example #5
def test_task_args(shutdown_only):
    cluster = Cluster()
    cluster.add_node(
        num_cpus=1,
        object_store_memory=80 * 1024 * 1024,
        _system_config={
            "local_fs_capacity_threshold": 0,
        },
        resources={"out_of_memory": 1},
    )
    cluster.add_node(
        num_cpus=1,
        object_store_memory=200 * 1024 * 1024,
        resources={"sufficient_memory": 1},
    )
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote
    def foo():
        return np.random.rand(20 * 1024 * 1024)  # 160 MB data

    @ray.remote
    def bar(obj):
        print(obj)

    ref = foo.options(resources={"sufficient_memory": 1}).remote()
    # Using pytest.raises ensures the test fails if no error is raised at all;
    # the root cause of the task error should be an OutOfDiskError.
    with pytest.raises(ray.exceptions.RayTaskError) as exc_info:
        ray.get(bar.options(resources={"out_of_memory": 1}).remote(ref))
    assert isinstance(exc_info.value.cause, ray.exceptions.OutOfDiskError)
Example #6
def test_pull_bundles_admission_control(shutdown_only):
    cluster = Cluster()
    object_size = int(6e6)
    num_objects = 10
    num_tasks = 10
    # Head node can fit all of the objects at once.
    cluster.add_node(num_cpus=0,
                     object_store_memory=2 * num_tasks * num_objects *
                     object_size)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Worker node can only fit 1 task at a time.
    cluster.add_node(num_cpus=1,
                     object_store_memory=1.5 * num_objects * object_size)
    cluster.wait_for_nodes()

    @ray.remote
    def foo(*args):
        return

    args = []
    for _ in range(num_tasks):
        task_args = [
            ray.put(np.zeros(object_size, dtype=np.uint8))
            for _ in range(num_objects)
        ]
        args.append(task_args)

    tasks = [foo.remote(*task_args) for task_args in args]
    ray.get(tasks)
Example #7
def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled):
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds": -1,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(num_cpus=1,
                                    resources={"node1": 1},
                                    object_store_memory=10**8,
                                    _internal_config=config)
    cluster.add_node(num_cpus=1,
                     resources={"node2": 1},
                     object_store_memory=10**8,
                     _internal_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, _internal_config=config)

    @ray.remote(max_retries=1 if reconstruction_enabled else 0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x):
        return x

    obj = ray.put(np.zeros(10**7, dtype=np.uint8))
    result = dependent_task.options(resources={"node1": 1}).remote(obj)
    ray.get(result)
    del obj

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(num_cpus=1,
                     resources={"node1": 1},
                     object_store_memory=10**8,
                     _internal_config=config)

    for _ in range(20):
        ray.put(np.zeros(10**7, dtype=np.uint8))

    if reconstruction_enabled:
        ray.get(result)
    else:
        with pytest.raises(ray.exceptions.UnreconstructableError):
            ray.get(result)
Example #8
def test_actor(shutdown_only):
    cluster = Cluster()
    cluster.add_node(
        num_cpus=1,
        object_store_memory=80 * 1024 * 1024,
        _system_config={
            "local_fs_capacity_threshold": 0,
        },
        resources={"out_of_memory": 1},
    )
    cluster.add_node(
        num_cpus=1,
        object_store_memory=200 * 1024 * 1024,
        resources={"sufficient_memory": 1},
    )
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote
    def foo():
        return np.random.rand(20 * 1024 * 1024)  # 160 MB data

    @ray.remote
    class Actor:
        def __init__(self, obj):
            self._obj = obj

        def foo(self):
            print(self._obj)

        def args_ood(self, obj):
            print(obj)

        def return_ood(self):
            return np.random.rand(20 * 1024 * 1024)

    ref = foo.options(resources={"sufficient_memory": 1}).remote()
    with pytest.raises(ray.exceptions.RayActorError):
        a = Actor.options(resources={"out_of_memory": 0.001}).remote(ref)
        ray.get(a.foo.remote())

    a = Actor.options(resources={"out_of_memory": 1}).remote(1)
    ray.get(a.foo.remote())
    # Passing an argument that must be written to the full local disk should
    # fail with an OutOfDiskError as the task error's cause.
    with pytest.raises(ray.exceptions.RayTaskError) as exc_info:
        ray.get(a.args_ood.remote(ref))
    assert isinstance(exc_info.value.cause, ray.exceptions.OutOfDiskError)

    ray.get(a.foo.remote())
    # Returning an oversized object should fail the same way.
    with pytest.raises(ray.exceptions.RayTaskError) as exc_info:
        ray.get(a.return_ood.remote())
    assert isinstance(exc_info.value.cause, ray.exceptions.OutOfDiskError)
Example #9
def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds": -1,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0,
                     _internal_config=config,
                     object_store_memory=10**8)
    node_to_kill = cluster.add_node(num_cpus=1,
                                    object_store_memory=10**8,
                                    _internal_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, _internal_config=config)

    @ray.remote(max_retries=1 if reconstruction_enabled else 0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def chain(x):
        return x

    @ray.remote
    def dependent_task(x):
        return x

    obj = large_object.remote()
    for _ in range(20):
        obj = chain.remote(obj)
    ray.get(dependent_task.remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(num_cpus=1,
                     object_store_memory=10**8,
                     _internal_config=config)

    if reconstruction_enabled:
        ray.get(dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            ray.get(dependent_task.remote(obj))
        # The cause check must run after the first block; nesting it inside
        # means it is skipped as soon as ray.get raises.
        with pytest.raises(ray.exceptions.UnreconstructableError):
            raise e.value.as_instanceof_cause()
Example #10
def test_namespace():
    """
    Most of the "checks" in this test case rely on the fact that
    `run_string_as_driver` will throw an exception if the driver script exits
    with a non-zero exit code (e.g. when the driver script throws an
    exception). Since all of these drivers start named, detached actors, the
    most likely failure case would be a collision of named actors if they're
    put in the same namespace.

    This test checks that:
    * When two drivers don't specify a namespace, they are placed in different
      anonymous namespaces.
    * When two drivers specify a namespace, they collide.
    * The namespace name (as provided by the runtime context) is correct.
    """
    cluster = Cluster()
    cluster.add_node(num_cpus=4, ray_client_server_port=50055)
    cluster.wait_for_nodes(1)

    template = """
import ray
ray.client("localhost:50055").namespace({namespace}).connect()

@ray.remote
class Foo:
    def ping(self):
        return "pong"

a = Foo.options(lifetime="detached", name="abc").remote()
ray.get(a.ping.remote())
print(ray.get_runtime_context().namespace)
    """

    anon_driver = template.format(namespace="None")
    run_string_as_driver(anon_driver)
    # This second run will fail if the actors don't run in separate anonymous
    # namespaces.
    run_string_as_driver(anon_driver)

    run_in_namespace = template.format(namespace="'namespace'")
    script_namespace = run_string_as_driver(run_in_namespace)
    # The second run fails because the actors are run in the same namespace.
    with pytest.raises(subprocess.CalledProcessError):
        run_string_as_driver(run_in_namespace)

    assert script_namespace.strip() == "namespace"
    subprocess.check_output("ray stop --force", shell=True)
Example #11
def test_pull_bundles_admission_control_dynamic(shutdown_only):
    # This test is the same as test_pull_bundles_admission_control, except that
    # the object store's capacity starts off higher and is later consumed
    # dynamically by concurrent workers.
    cluster = Cluster()
    object_size = int(6e6)
    num_objects = 20
    num_tasks = 20
    # Head node can fit all of the objects at once.
    cluster.add_node(num_cpus=0,
                     object_store_memory=2 * num_tasks * num_objects *
                     object_size)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Worker node can fit 2 tasks at a time.
    cluster.add_node(num_cpus=1,
                     object_store_memory=2.5 * num_objects * object_size)
    cluster.wait_for_nodes()

    @ray.remote
    def foo(i, *args):
        print("foo", i)
        return

    @ray.remote
    def allocate(i):
        print("allocate", i)
        return np.zeros(object_size, dtype=np.uint8)

    args = []
    for _ in range(num_tasks):
        task_args = [
            ray.put(np.zeros(object_size, dtype=np.uint8))
            for _ in range(num_objects)
        ]
        args.append(task_args)

    allocated = [allocate.remote(i) for i in range(num_objects)]
    ray.get(allocated)

    tasks = [foo.remote(i, *task_args) for i, task_args in enumerate(args)]
    ray.get(tasks)
    del allocated
Example #12
def test_system_config_when_connecting(ray_start_cluster):
    config = {"object_timeout_milliseconds": 200}
    cluster = Cluster()
    cluster.add_node(_system_config=config, object_store_memory=100 * 1024 * 1024)
    cluster.wait_for_nodes()

    # Specifying _system_config when connecting to a cluster is disallowed.
    with pytest.raises(ValueError):
        ray.init(address=cluster.address, _system_config=config)

    # Check that the config was picked up (object pinning is disabled).
    ray.init(address=cluster.address)
    obj_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))

    for _ in range(5):
        put_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))
    del put_ref

    ray.get(obj_ref)
Example #13
def test_cached_object(ray_start_cluster):
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(num_cpus=1,
                                    resources={"node1": 1},
                                    object_store_memory=10**8)
    cluster.add_node(num_cpus=1,
                     resources={"node2": 1},
                     object_store_memory=10**8)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x):
        return

    obj = large_object.options(resources={"node1": 1}).remote()
    ray.get(dependent_task.options(resources={"node2": 1}).remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(num_cpus=1,
                     resources={"node1": 1},
                     object_store_memory=10**8)
    # wait_for_condition raises on timeout, so there is no return value to
    # assert on.
    wait_for_condition(
        lambda: not all(node["Alive"] for node in ray.nodes()), timeout=10)

    for _ in range(20):
        large_object.options(resources={"node2": 1}).remote()

    ray.get(dependent_task.remote(obj))
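Examples #13 and #21-#23 depend on a `wait_for_condition` helper from Ray's test utilities. A rough sketch of the newer, exception-raising variant assumed in Example #13 (the parameter names here are assumptions):

import time

def wait_for_condition(condition_predictor, timeout=10, retry_interval_ms=100):
    # Sketch (assumption): poll the predicate until it returns True,
    # raising if the timeout expires first.
    start = time.time()
    while time.time() - start <= timeout:
        if condition_predictor():
            return
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("The condition wasn't met before the timeout expired.")

Example #21 instead asserts the helper's return value and passes `timeout_ms`, which matches an older boolean-returning variant of the same helper.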
Example #14
def run_multi_nodes():
    c = Cluster()
    c.add_node(num_cpus=4,
               object_store_memory=object_store_size,
               _system_config=system_config)
    ray.init(address=c.address)
    for _ in range(num_nodes - 1):  # subtract a head node.
        c.add_node(num_cpus=4, object_store_memory=object_store_size)
    c.wait_for_nodes()

    # Run shuffle.
    print(
        f"\n\nTest streaming shuffle with {num_nodes} nodes.\n"
        f"Shuffle size: {partition_size * num_partitions / 1024 / 1024 / 1024}"
        "GB")
    run_shuffle()
    time.sleep(5)
    display_spilling_info(c.address)
    ray.shutdown()
    c.shutdown()
    time.sleep(5)
Example #15
def test_pull_bundles_pinning(shutdown_only):
    cluster = Cluster()
    object_size = int(50e6)
    num_objects = 10
    # Head node can fit all of the objects at once.
    cluster.add_node(num_cpus=0, object_store_memory=1000e6)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Worker node cannot even fit a single task.
    cluster.add_node(num_cpus=1, object_store_memory=200e6)
    cluster.wait_for_nodes()

    @ray.remote(num_cpus=1)
    def foo(*args):
        return

    task_args = [
        ray.put(np.zeros(object_size, dtype=np.uint8))
        for _ in range(num_objects)
    ]
    ray.get(foo.remote(*task_args))
Example #16
def test_ray_get_task_args_deadlock(shutdown_only):
    cluster = Cluster()
    object_size = int(6e6)
    num_objects = 10
    # Head node can fit all of the objects at once.
    cluster.add_node(num_cpus=0,
                     object_store_memory=4 * num_objects * object_size)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Worker node can only fit 1 task at a time.
    cluster.add_node(num_cpus=1,
                     object_store_memory=1.5 * num_objects * object_size)
    cluster.wait_for_nodes()

    @ray.remote
    def foo(*args):
        return

    @ray.remote
    def test_deadlock(get_args, task_args):
        foo.remote(*task_args)
        ray.get(get_args)

    for i in range(5):
        start = time.time()
        get_args = [
            ray.put(np.zeros(object_size, dtype=np.uint8))
            for _ in range(num_objects)
        ]
        task_args = [
            ray.put(np.zeros(object_size, dtype=np.uint8))
            for _ in range(num_objects)
        ]
        ray.get(test_deadlock.remote(get_args, task_args))
        print(f"round {i} finished in {time.time() - start}")
Example #17
# This number should be divisible by 3.
resource_quantity = 999
num_nodes = 5
custom_resources = {"pg_custom": resource_quantity}
# Each placement group bundle uses 1 GPU and 1 unit of the custom resource.
num_pg = resource_quantity

# TODO(sang): Cluster setup. Remove when running in real clusters.
cluster = Cluster()
nodes = []
for _ in range(num_nodes):
    nodes.append(
        cluster.add_node(num_cpus=3,
                         num_gpus=resource_quantity,
                         resources=custom_resources))
cluster.wait_for_nodes()

ray.init(address=cluster.address)
while not ray.is_initialized():
    time.sleep(0.1)

# Scenario 1: Create bunch of placement groups and measure how long it takes.
total_creating_time = 0
total_removing_time = 0
repeat = 1
total_trial = repeat * num_pg
bundles = [{"GPU": 1, "pg_custom": 1}] * num_nodes

# Create and remove placement groups.
for _ in range(repeat):
    pgs = []
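    # Hedged sketch of how the create/remove loop might continue; this body is
    # an assumption based on the counters defined above, not the original code.
    for _ in range(num_pg):
        start = time.perf_counter()
        pg = ray.util.placement_group(bundles, strategy="PACK")
        ray.get(pg.ready())
        total_creating_time += time.perf_counter() - start
        pgs.append(pg)
    for pg in pgs:
        start = time.perf_counter()
        ray.util.remove_placement_group(pg)
        total_removing_time += time.perf_counter() - start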
Example #18
class RayExecutorQueueTest(unittest.TestCase):
    def setUp(self):
        self.cluster = Cluster(initialize_head=True,
                               connect=True,
                               head_node_args={
                                   "num_cpus": 1,
                                   "_system_config": {
                                       "num_heartbeats_timeout": 10
                                   }
                               })
        self.trial_executor = RayTrialExecutor(queue_trials=True,
                                               refresh_period=0)
        # Pytest doesn't play nicely with imports
        _register_all()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()
        _register_all()  # re-register the evicted objects

    def testQueueTrial(self):
        """Tests that reset handles NotImplemented properly."""
        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        cpu_only = create_trial(1, 0)
        self.assertTrue(self.trial_executor.has_resources_for_trial(cpu_only))
        self.trial_executor.start_trial(cpu_only)

        gpu_only = create_trial(0, 1)
        self.assertTrue(self.trial_executor.has_resources_for_trial(gpu_only))

    def testHeadBlocking(self):
        # Once resource requests are deprecated, remove this test
        os.environ["TUNE_PLACEMENT_GROUP_AUTO_DISABLED"] = "1"

        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        gpu_trial = create_trial(1, 1)
        self.assertTrue(self.trial_executor.has_resources_for_trial(gpu_trial))
        self.trial_executor.start_trial(gpu_trial)

        # TODO(rliaw): This behavior is probably undesirable, but right now
        #  trials with different resource requirements is not often used.
        cpu_only_trial = create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources_for_trial(cpu_only_trial))

        self.cluster.add_node(num_cpus=1, num_gpus=1)
        self.cluster.wait_for_nodes()

        self.assertTrue(
            self.trial_executor.has_resources_for_trial(cpu_only_trial))
        self.trial_executor.start_trial(cpu_only_trial)

        cpu_only_trial2 = create_trial(1, 0)
        self.assertTrue(
            self.trial_executor.has_resources_for_trial(cpu_only_trial2))
        self.trial_executor.start_trial(cpu_only_trial2)

        cpu_only_trial3 = create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources_for_trial(cpu_only_trial3))
Example #19
               2,
               "object_spilling_config":
               json.dumps(
                   {
                       "type": "filesystem",
                       "params": {
                           "directory_path": "/tmp/spill"
                       }
                   },
                   separators=(",", ":"))
           })
# Add fake 4 nodes cluster.
for _ in range(num_nodes - 1):  # subtract a head node.
    c.add_node(num_cpus=4, object_store_memory=object_store_size)

c.wait_for_nodes()
ray.init(address=c.address)


@ray.remote
class Counter:
    def __init__(self):
        self.num_map = 0
        self.num_reduce = 0

    def inc(self):
        self.num_map += 1
        print("Num map tasks finished", self.num_map)

    def inc2(self):
        self.num_reduce += 1
Example #20
def test_locality_aware_scheduling_for_dead_nodes(shutdown_only):
    """Test that locality-ware scheduling can handle dead nodes."""
    # Create a cluster with 4 nodes.
    config = {
        "num_heartbeats_timeout": 5,
        "raylet_heartbeat_period_milliseconds": 50,
    }
    cluster = Cluster()
    cluster.add_node(num_cpus=4, resources={"node1": 1}, _system_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    node2 = cluster.add_node(num_cpus=4, resources={"node2": 1})
    node3 = cluster.add_node(num_cpus=4, resources={"node3": 1})
    node4 = cluster.add_node(num_cpus=4, resources={"node4": 1})
    cluster.wait_for_nodes()

    # Create 2 objects on node 2.
    @ray.remote(resources={"node2": 0.1})
    def create_object():
        return np.zeros(10 * 1024 * 1024, dtype=np.uint8)

    obj1 = create_object.remote()
    obj2 = create_object.remote()

    # Push these 2 objects to other nodes.
    # node2 will have obj1 and obj2.
    # node3 will have obj1.
    # node4 will have obj2.
    @ray.remote
    class MyActor:
        def __init__(self, obj_refs):
            # Note, we need to keep obj_refs to prevent the objects from
            # being garbage collected.
            self.obj_refs = obj_refs
            self.obj = ray.get(obj_refs)

        def ready(self):
            return True

    actors = [
        MyActor.options(resources={
            "node2": 0.1
        }).remote([obj1, obj2]),
        MyActor.options(resources={
            "node3": 0.1
        }).remote([obj1]),
        MyActor.options(resources={
            "node4": 0.1
        }).remote([obj2]),
    ]

    assert all(ray.get(actor.ready.remote()) is True for actor in actors)

    # This function requires obj1 and obj2.
    @ray.remote
    def func(obj1, obj2):
        return ray.worker.global_worker.node.unique_id

    # This function should be scheduled to node2, since node2 has both objects.
    assert ray.get(func.remote(obj1, obj2)) == node2.unique_id

    # Kill node2, and re-schedule the function.
    # It should be scheduled to either node3 or node4.
    node2.kill_raylet()
    # Waits for the driver to receive the NodeRemoved notification.
    time.sleep(1)
    target_node = ray.get(func.remote(obj1, obj2))
    assert target_node == node3.unique_id or target_node == node4.unique_id
Example #21
def test_fate_sharing(ray_start_cluster):
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the parent actor.
    node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
    # Node to place the child actor.
    cluster.add_node(num_cpus=1, resources={"child": 1})
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote
    def sleep():
        time.sleep(1000)

    @ray.remote(resources={"child": 1})
    def probe():
        return

    @ray.remote
    class Actor(object):
        def __init__(self):
            return

        def start_child(self, use_actors):
            if use_actors:
                child = Actor.options(resources={"child": 1}).remote()
                ray.get(child.sleep.remote())
            else:
                ray.get(sleep.options(resources={"child": 1}).remote())

        def sleep(self):
            time.sleep(1000)

        def get_pid(self):
            return os.getpid()

    # Returns whether the "child" resource is available.
    def child_resource_available():
        p = probe.remote()
        ready, _ = ray.wait([p], timeout=1)
        return len(ready) > 0

    # Test fate sharing if the parent process dies.
    def test_process_failure(use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        pid = ray.get(a.get_pid.remote())
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled.
        assert wait_for_condition(
            lambda: not child_resource_available(), timeout_ms=10000)
        # Kill the parent process.
        os.kill(pid, 9)
        assert wait_for_condition(child_resource_available, timeout_ms=10000)

    # Test fate sharing if the parent node dies.
    def test_node_failure(node_to_kill, use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled.
        assert wait_for_condition(
            lambda: not child_resource_available(), timeout_ms=10000)
        # Kill the parent node.
        cluster.remove_node(node_to_kill, allow_graceful=False)
        node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
        assert wait_for_condition(child_resource_available, timeout_ms=10000)
        return node_to_kill

    test_process_failure(use_actors=True)
    test_process_failure(use_actors=False)
    node_to_kill = test_node_failure(node_to_kill, use_actors=True)
    node_to_kill = test_node_failure(node_to_kill, use_actors=False)
Example #22
def test_fate_sharing(ray_start_cluster, use_actors, node_failure):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_period_milliseconds": 100,
    }
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    # Node to place the parent actor.
    node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
    # Node to place the child actor.
    cluster.add_node(num_cpus=1, resources={"child": 1})
    cluster.wait_for_nodes()

    @ray.remote
    def sleep():
        time.sleep(1000)

    @ray.remote(resources={"child": 1})
    def probe():
        return

    # TODO(swang): This test does not pass if max_restarts > 0 for the
    # raylet codepath. Add this parameter once the GCS actor service is enabled
    # by default.
    @ray.remote
    class Actor(object):
        def __init__(self):
            return

        def start_child(self, use_actors):
            if use_actors:
                child = Actor.options(resources={"child": 1}).remote()
                ray.get(child.sleep.remote())
            else:
                ray.get(sleep.options(resources={"child": 1}).remote())

        def sleep(self):
            time.sleep(1000)

        def get_pid(self):
            return os.getpid()

    # Returns whether the "child" resource is available.
    def child_resource_available():
        p = probe.remote()
        ready, _ = ray.wait([p], timeout=1)
        return len(ready) > 0

    # Test fate sharing if the parent process dies.
    def test_process_failure(use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        pid = ray.get(a.get_pid.remote())
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled.
        wait_for_condition(lambda: not child_resource_available())
        # Kill the parent process.
        os.kill(pid, 9)
        wait_for_condition(child_resource_available)

    # Test fate sharing if the parent node dies.
    def test_node_failure(node_to_kill, use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled.
        wait_for_condition(lambda: not child_resource_available())
        # Kill the parent node.
        cluster.remove_node(node_to_kill, allow_graceful=False)
        node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
        wait_for_condition(child_resource_available)
        return node_to_kill

    if node_failure:
        test_node_failure(node_to_kill, use_actors)
    else:
        test_process_failure(use_actors)
Example #23
def test_reconstruction_cached_dependency(ray_start_cluster,
                                          reconstruction_enabled):
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds": -1,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(num_cpus=1,
                                    resources={"node1": 1},
                                    object_store_memory=10**8,
                                    _internal_config=config)
    cluster.add_node(num_cpus=1,
                     resources={"node2": 1},
                     object_store_memory=10**8,
                     _internal_config=config)
    cluster.wait_for_nodes()

    @ray.remote(max_retries=0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def chain(x):
        return x

    @ray.remote
    def dependent_task(x):
        return

    obj = large_object.options(resources={"node2": 1}).remote()
    obj = chain.options(resources={"node1": 1}).remote(obj)
    ray.get(dependent_task.options(resources={"node1": 1}).remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(num_cpus=1,
                     resources={"node1": 1},
                     object_store_memory=10**8,
                     _internal_config=config)
    # wait_for_condition raises on timeout, so there is no return value to
    # assert on.
    wait_for_condition(
        lambda: not all(node["Alive"] for node in ray.nodes()), timeout=10)

    for _ in range(20):
        large_object.options(resources={"node2": 1}).remote()

    if reconstruction_enabled:
        ray.get(dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            ray.get(dependent_task.remote(obj))
        # The cause check must run after the first block; nesting it inside
        # means it is skipped as soon as ray.get raises.
        with pytest.raises(ray.exceptions.UnreconstructableError):
            raise e.value.as_instanceof_cause()