Example #1
def test_basic_reconstruction_actor_constructor(ray_start_cluster,
                                                reconstruction_enabled):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_period_milliseconds": 100,
        "object_timeout_milliseconds": 200,
    }
    # Workaround to reset the config to the default value.
    if not reconstruction_enabled:
        config["lineage_pinning_enabled"] = 0

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(num_cpus=0,
                     _system_config=config,
                     enable_object_reconstruction=reconstruction_enabled)
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(num_cpus=1,
                                    resources={"node1": 1},
                                    object_store_memory=10**8)
    cluster.add_node(num_cpus=1,
                     resources={"node2": 1},
                     object_store_memory=10**8)
    cluster.wait_for_nodes()

    @ray.remote(max_retries=1 if reconstruction_enabled else 0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    # Both the constructor and a method depend on the large object.
    @ray.remote(max_restarts=-1)
    class Actor:
        def __init__(self, x):
            pass

        def dependent_task(self, x):
            return

        def pid(self):
            return os.getpid()

    obj = large_object.options(resources={"node1": 1}).remote()
    a = Actor.options(resources={"node1": 1}).remote(obj)
    ray.get(a.dependent_task.remote(obj))
    pid = ray.get(a.pid.remote())

    # Workaround to kill the actor process too since there is a bug where the
    # actor's plasma client hangs after the plasma store has exited.
    os.kill(pid, SIGKILL)

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(num_cpus=1,
                     resources={"node1": 1},
                     object_store_memory=10**8)

    wait_for_pid_to_exit(pid)

    # Wait for the actor to restart.
    def probe():
        try:
            ray.get(a.dependent_task.remote(obj))
            return True
        except ray.exceptions.RayActorError:
            return False
        except (ray.exceptions.RayTaskError, ray.exceptions.ObjectLostError):
            return True

    wait_for_condition(probe)

    if reconstruction_enabled:
        ray.get(a.dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            x = a.dependent_task.remote(obj)
            print(x)
            ray.get(x)
        with pytest.raises(ray.exceptions.ObjectLostError):
            raise e.as_instanceof_cause()
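
These tests rely on a couple of helpers imported from Ray's test utilities, notably wait_for_condition and wait_for_pid_to_exit, which are not shown in the snippets. As a rough, non-authoritative sketch (a simple polling loop; the real signatures and default timeouts in Ray's test utils may differ):

import time

import psutil  # assumption: psutil is available (it is also used in the dashboard test below)


def wait_for_condition(condition_predictor, timeout=10, retry_interval_ms=100):
    """Poll condition_predictor until it returns True; raise if the timeout expires."""
    start = time.time()
    while time.time() - start <= timeout:
        if condition_predictor():
            return True
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("The condition wasn't met before the timeout expired.")


def wait_for_pid_to_exit(pid, timeout=20):
    """Block until no process with the given pid exists."""
    wait_for_condition(lambda: not psutil.pid_exists(pid), timeout=timeout)
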
Example #2
def test_atomic_creation(ray_start_cluster):
    # Setup cluster.
    cluster = ray_start_cluster
    bundle_cpu_size = 2
    bundle_per_node = 2
    num_nodes = 2

    for _ in range(num_nodes):
        cluster.add_node(num_cpus=bundle_cpu_size * bundle_per_node)
    ray.init(address=cluster.address)

    @ray.remote(num_cpus=1)
    class NormalActor:
        def ping(self):
            pass

    @ray.remote(num_cpus=3)
    def bothering_task():
        import time
        time.sleep(1)
        return True

    # Schedule tasks to fail initial placement group creation.
    tasks = [bothering_task.remote() for _ in range(2)]
    # Create a placement group whose bundles cannot all be scheduled yet
    # because the bothering tasks are holding CPUs. The SPREAD strategy is
    # used to make the test less flaky.
    pg = ray.util.placement_group(name="name",
                                  strategy="SPREAD",
                                  bundles=[{
                                      "CPU": bundle_cpu_size
                                  } for _ in range(num_nodes * bundle_per_node)
                                           ])

    # Create a placement group actor.
    # This shouldn't be scheduled because atomic
    # placement group creation should've failed.
    pg_actor = NormalActor.options(
        placement_group=pg,
        placement_group_bundle_index=num_nodes * bundle_per_node - 1).remote()

    # Wait on the placement group now. It should be unready
    # because the bothering tasks are holding resources that are
    # required for bundle creation.
    ready, unready = ray.wait([pg.ready()], timeout=0)
    assert len(ready) == 0
    assert len(unready) == 1
    # Wait until all tasks are done.
    assert all(ray.get(tasks))

    # Wait on the placement group creation. Since resources are now available,
    # it should be ready soon.
    ready, unready = ray.wait([pg.ready()])
    assert len(ready) == 1
    assert len(unready) == 0

    # Confirm that the placement group actor is created. This will
    # raise an exception if the actor was scheduled before the placement
    # group was created, which checks atomicity.
    ray.get(pg_actor.ping.remote(), timeout=3.0)
    ray.kill(pg_actor)

    # Make sure atomic creation failure didn't impact resources.
    @ray.remote(num_cpus=bundle_cpu_size)
    def resource_check():
        return True

    # These should hang because all resources
    # are claimed by the placement group.
    check_without_pg = [
        resource_check.remote() for _ in range(bundle_per_node * num_nodes)
    ]

    # These should all be scheduled, one on each bundle.
    check_with_pg = [
        resource_check.options(placement_group=pg,
                               placement_group_bundle_index=i).remote()
        for i in range(bundle_per_node * num_nodes)
    ]

    # Make sure these are hanging.
    ready, unready = ray.wait(check_without_pg, timeout=0)
    assert len(ready) == 0
    assert len(unready) == bundle_per_node * num_nodes

    # Make sure these are all scheduled.
    assert all(ray.get(check_with_pg))

    ray.util.remove_placement_group(pg)

    def pg_removed():
        return ray.util.placement_group_table(pg)["state"] == "REMOVED"

    wait_for_condition(pg_removed)

    # Make sure the checks without a placement group are all
    # scheduled properly because resources are cleaned up.
    assert all(ray.get(check_without_pg))
Example #3
File: test_api.py Project: xqk/ray
def test_shadow_traffic(serve_instance):
    client = serve_instance

    @ray.remote
    class RequestCounter:
        def __init__(self):
            self.requests = defaultdict(int)

        def record(self, backend):
            self.requests[backend] += 1

        def get(self, backend):
            return self.requests[backend]

    counter = RequestCounter.remote()

    def f(_):
        ray.get(counter.record.remote("backend1"))
        return "hello"

    def f_shadow_1(_):
        ray.get(counter.record.remote("backend2"))
        return "oops"

    def f_shadow_2(_):
        ray.get(counter.record.remote("backend3"))
        return "oops"

    def f_shadow_3(_):
        ray.get(counter.record.remote("backend4"))
        return "oops"

    client.create_backend("backend1", f)
    client.create_backend("backend2", f_shadow_1)
    client.create_backend("backend3", f_shadow_2)
    client.create_backend("backend4", f_shadow_3)

    client.create_endpoint("endpoint", backend="backend1", route="/api")
    client.shadow_traffic("endpoint", "backend2", 1.0)
    client.shadow_traffic("endpoint", "backend3", 0.5)
    client.shadow_traffic("endpoint", "backend4", 0.1)

    start = time.time()
    num_requests = 100
    for _ in range(num_requests):
        assert requests.get("http://127.0.0.1:8000/api").text == "hello"
    print("Finished 100 requests in {}s.".format(time.time() - start))

    def requests_to_backend(backend):
        return ray.get(counter.get.remote(backend))

    def check_requests():
        return all([
            requests_to_backend("backend1") == num_requests,
            requests_to_backend("backend2") == requests_to_backend("backend1"),
            requests_to_backend("backend3") < requests_to_backend("backend2"),
            requests_to_backend("backend4") < requests_to_backend("backend3"),
            requests_to_backend("backend4") > 0,
        ])

    wait_for_condition(check_requests)
Example #4
def test_basic(ray_start_with_dashboard):
    """Dashboard test that starts a Ray cluster with a dashboard server running,
    then hits the dashboard API and asserts that it receives sensible data."""
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    address_info = ray_start_with_dashboard
    node_id = address_info["node_id"]
    address = address_info["redis_address"]
    address = address.split(":")
    assert len(address) == 2

    client = redis.StrictRedis(host=address[0],
                               port=int(address[1]),
                               password=ray_constants.REDIS_DEFAULT_PASSWORD)

    all_processes = ray.worker._global_node.all_processes
    assert ray_constants.PROCESS_TYPE_DASHBOARD in all_processes
    assert ray_constants.PROCESS_TYPE_REPORTER not in all_processes
    dashboard_proc_info = all_processes[
        ray_constants.PROCESS_TYPE_DASHBOARD][0]
    dashboard_proc = psutil.Process(dashboard_proc_info.process.pid)
    assert dashboard_proc.status() in [
        psutil.STATUS_RUNNING, psutil.STATUS_SLEEPING
    ]
    raylet_proc_info = all_processes[ray_constants.PROCESS_TYPE_RAYLET][0]
    raylet_proc = psutil.Process(raylet_proc_info.process.pid)

    def _search_agent(processes):
        for p in processes:
            try:
                for c in p.cmdline():
                    if "new_dashboard/agent.py" in c:
                        return p
            except Exception:
                pass

    # Test for bad imports, the agent should be restarted.
    logger.info("Test for bad imports.")
    agent_proc = _search_agent(raylet_proc.children())
    prepare_test_files()
    agent_pids = set()
    try:
        assert agent_proc is not None
        agent_proc.kill()
        agent_proc.wait()
        # The agent will be restarted due to the import failure.
        for x in range(50):
            agent_proc = _search_agent(raylet_proc.children())
            if agent_proc:
                agent_pids.add(agent_proc.pid)
            # The agent should be restarted,
            # so we can break if len(agent_pids) > 1.
            if len(agent_pids) > 1:
                break
            time.sleep(0.1)
    finally:
        cleanup_test_files()
    assert len(agent_pids) > 1, agent_pids

    agent_proc = _search_agent(raylet_proc.children())
    if agent_proc:
        agent_proc.kill()
        agent_proc.wait()

    logger.info("Test agent register is OK.")
    wait_for_condition(lambda: _search_agent(raylet_proc.children()))
    assert dashboard_proc.status() in [
        psutil.STATUS_RUNNING, psutil.STATUS_SLEEPING
    ]
    agent_proc = _search_agent(raylet_proc.children())
    agent_pid = agent_proc.pid

    # Check if agent register is OK.
    for x in range(5):
        logger.info("Check agent is alive.")
        agent_proc = _search_agent(raylet_proc.children())
        assert agent_proc.pid == agent_pid
        time.sleep(1)

    # Check redis keys are set.
    logger.info("Check redis keys are set.")
    dashboard_address = client.get(dashboard_consts.REDIS_KEY_DASHBOARD)
    assert dashboard_address is not None
    dashboard_rpc_address = client.get(
        dashboard_consts.REDIS_KEY_DASHBOARD_RPC)
    assert dashboard_rpc_address is not None
    key = f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{node_id}"
    agent_ports = client.get(key)
    assert agent_ports is not None
Example #5
def test_automatic_cleanup_job(ray_start_cluster):
    # Make sure the placement groups created by a
    # job, actor, and task are cleaned when the job is done.
    cluster = ray_start_cluster
    num_nodes = 3
    num_cpu_per_node = 4
    # Create 3 nodes cluster.
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_cpu_per_node)

    info = ray.init(address=cluster.address)
    available_cpus = ray.available_resources()["CPU"]
    assert available_cpus == num_nodes * num_cpu_per_node

    driver_code = f"""
import ray

ray.init(address="{info["redis_address"]}")

def create_pg():
    pg = ray.util.placement_group(
            [{{"CPU": 1}} for _ in range(3)],
            strategy="STRICT_SPREAD")
    ray.get(pg.ready())
    return pg

@ray.remote(num_cpus=0)
def f():
    create_pg()

@ray.remote(num_cpus=0)
class A:
    def create_pg(self):
        create_pg()

ray.get(f.remote())
a = A.remote()
ray.get(a.create_pg.remote())
# Create 2 pgs to make sure multiple placement groups that belong
# to a single job will be properly cleaned.
create_pg()
create_pg()

ray.shutdown()
    """

    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        jobs = ray.jobs()
        for job in jobs:
            if "StopTime" in job:
                return True
        return False

    def assert_num_cpus(expected_num_cpus):
        if expected_num_cpus == 0:
            return "CPU" not in ray.available_resources()
        return ray.available_resources()["CPU"] == expected_num_cpus

    wait_for_condition(is_job_done)
    available_cpus = ray.available_resources()["CPU"]
    wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
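
This test (like Examples #10, #11, #15, and #23) launches a separate driver through run_string_as_driver, another helper from Ray's test utilities that is not shown here. A minimal sketch of the idea, assuming it simply pipes the script into a fresh Python interpreter and raises if the driver exits with a non-zero code (the real helper may differ in details):

import subprocess
import sys


def run_string_as_driver(driver_script):
    """Run the script in a separate Python driver process and return its stdout."""
    proc = subprocess.run(
        [sys.executable, "-"],  # "-" makes Python read the program from stdin
        input=driver_script,
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        raise RuntimeError(
            f"Driver exited with code {proc.returncode}:\n{proc.stderr}")
    return proc.stdout
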
Example #6
def test_reconstruction_cached_dependency(ray_start_cluster,
                                          reconstruction_enabled):
    config = json.dumps({
        "num_heartbeats_timeout":
        10,
        "raylet_heartbeat_timeout_milliseconds":
        100,
        "lineage_pinning_enabled":
        1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds":
        -1,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(num_cpus=1,
                                    resources={"node1": 1},
                                    object_store_memory=10**8,
                                    _internal_config=config)
    cluster.add_node(num_cpus=1,
                     resources={"node2": 1},
                     object_store_memory=10**8,
                     _internal_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, _internal_config=config)

    @ray.remote(max_retries=0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def chain(x):
        return x

    @ray.remote
    def dependent_task(x):
        return

    obj = large_object.options(resources={"node2": 1}).remote()
    obj = chain.options(resources={"node1": 1}).remote(obj)
    ray.get(dependent_task.options(resources={"node1": 1}).remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(num_cpus=1,
                     resources={"node1": 1},
                     object_store_memory=10**8,
                     _internal_config=config)
    assert wait_for_condition(lambda: not all(node["Alive"]
                                              for node in ray.nodes()),
                              timeout=10)

    for _ in range(20):
        large_object.options(resources={"node2": 1}).remote()

    if reconstruction_enabled:
        ray.get(dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            ray.get(dependent_task.remote(obj))
        with pytest.raises(ray.exceptions.UnreconstructableError):
            raise e.as_instanceof_cause()
Example #7
def test_delete_objects_on_worker_failure(tmp_path, shutdown_only):
    # Limit our object store to 75 MiB of memory.
    temp_folder = tmp_path / "spill"
    temp_folder.mkdir()
    ray.init(
        object_store_memory=75 * 1024 * 1024,
        _system_config={
            "max_io_workers": 4,
            "automatic_object_spilling_enabled": True,
            "object_store_full_max_retries": 4,
            "object_store_full_initial_delay_ms": 100,
            "object_spilling_config": json.dumps({
                "type": "filesystem",
                "params": {
                    "directory_path": str(temp_folder)
                }
            }),
            "min_spilling_size": 0,
        })

    arr = np.random.rand(1024 * 1024)  # 8 MB data

    @ray.remote
    class Actor:
        def __init__(self):
            self.replay_buffer = []

        def get_pid(self):
            return os.getpid()

        def create_objects(self):
            for _ in range(80):
                ref = None
                while ref is None:
                    ref = ray.put(arr)
                    self.replay_buffer.append(ref)
                # Pop the last item from the replay buffer with 60% probability.
                if random.randint(0, 9) < 6:
                    self.replay_buffer.pop()

            # Do random sampling.
            for _ in range(200):
                ref = random.choice(self.replay_buffer)
                sample = ray.get(ref, timeout=0)
                assert np.array_equal(sample, arr)

    a = Actor.remote()
    actor_pid = ray.get(a.get_pid.remote())
    ray.get(a.create_objects.remote())
    os.kill(actor_pid, 9)

    def wait_until_actor_dead():
        try:
            ray.get(a.get_pid.remote())
        except ray.exceptions.RayActorError:
            return True
        return False

    wait_for_condition(wait_until_actor_dead)

    def is_dir_empty():
        num_files = 0
        for path in temp_folder.iterdir():
            num_files += 1
        return num_files == 0

    # Finally, make sure all objects are deleted upon worker failure.
    wait_for_condition(is_dir_empty, timeout=1000)
Example #8
def test_metrics_export_end_to_end(_setup_cluster_for_test):
    TEST_TIMEOUT_S = 20

    prom_addresses, autoscaler_export_addr = _setup_cluster_for_test

    def test_cases():
        components_dict, metric_names, metric_samples = fetch_prometheus(
            prom_addresses)

        # Raylet should be on every node
        assert all("raylet" in components
                   for components in components_dict.values())

        # GCS server should be on one node
        assert any("gcs_server" in components
                   for components in components_dict.values())

        # Core worker should be on at least one node
        assert any("core_worker" in components
                   for components in components_dict.values())

        # Make sure our user defined metrics exist
        for metric_name in [
                "test_counter", "test_histogram", "test_driver_counter"
        ]:
            assert any(metric_name in full_name for full_name in metric_names)

        # Make sure metrics are recorded.
        for metric in _METRICS:
            assert metric in metric_names, \
                f"metric {metric} not in {metric_names}"

        # Make sure the numeric values are correct
        test_counter_sample = [
            m for m in metric_samples if "test_counter" in m.name
        ][0]
        assert test_counter_sample.value == 4.0

        test_driver_counter_sample = [
            m for m in metric_samples if "test_driver_counter" in m.name
        ][0]
        assert test_driver_counter_sample.value == 1.0

        test_histogram_samples = [
            m for m in metric_samples if "test_histogram" in m.name
        ]
        buckets = {
            m.labels["le"]: m.value
            for m in test_histogram_samples if "_bucket" in m.name
        }
        # We recorded value 1.5 for the histogram. In the Prometheus data
        # model the histogram is cumulative, so we expect the count to appear
        # in the <=1.6 and +Inf buckets.
        assert buckets == {"0.1": 0.0, "1.6": 1.0, "+Inf": 1.0}
        hist_count = [m for m in test_histogram_samples
                      if "_count" in m.name][0].value
        hist_sum = [m for m in test_histogram_samples
                    if "_sum" in m.name][0].value
        assert hist_count == 1
        assert hist_sum == 1.5

        # Autoscaler metrics
        _, autoscaler_metric_names, _ = fetch_prometheus(
            [autoscaler_export_addr])
        for metric in _AUTOSCALER_METRICS:
            # Metric name should appear with some suffix (_count, _total,
            # etc...) in the list of all names
            assert any(name.startswith(metric) for name in
                       autoscaler_metric_names), \
                    f"{metric} not in {autoscaler_metric_names}"

    def wrap_test_case_for_retry():
        try:
            test_cases()
            return True
        except AssertionError:
            return False

    try:
        wait_for_condition(
            wrap_test_case_for_retry,
            timeout=TEST_TIMEOUT_S,
            retry_interval_ms=1000,  # Yield resource for other processes
        )
    except RuntimeError:
        print(
            f"The components are {pformat(fetch_prometheus(prom_addresses))}")
        test_cases()  # Should fail assert
Example #9
def test_two_custom_resources(ray_start_cluster):
    cluster = ray_start_cluster
    cluster.add_node(
        num_cpus=3, resources={
            "CustomResource1": 1,
            "CustomResource2": 2
        })
    custom_resource_node = cluster.add_node(
        num_cpus=3, resources={
            "CustomResource1": 3,
            "CustomResource2": 4
        })
    ray.init(address=cluster.address)

    @ray.remote
    def foo():
        # Sleep a while to emulate a slow operation. This is needed to make
        # sure tasks are scheduled to different nodes.
        time.sleep(0.1)
        return ray.worker.global_worker.node.unique_id

    # Make sure each node has at least one idle worker.
    wait_for_condition(
        lambda: len(set(ray.get([foo.remote() for _ in range(6)]))) == 2)

    @ray.remote(resources={"CustomResource1": 1})
    def f():
        time.sleep(0.001)
        return ray.worker.global_worker.node.unique_id

    @ray.remote(resources={"CustomResource2": 1})
    def g():
        time.sleep(0.001)
        return ray.worker.global_worker.node.unique_id

    @ray.remote(resources={"CustomResource1": 1, "CustomResource2": 3})
    def h():
        time.sleep(0.001)
        return ray.worker.global_worker.node.unique_id

    @ray.remote(resources={"CustomResource1": 4})
    def j():
        time.sleep(0.001)
        return ray.worker.global_worker.node.unique_id

    @ray.remote(resources={"CustomResource3": 1})
    def k():
        time.sleep(0.001)
        return ray.worker.global_worker.node.unique_id

    # The f and g tasks should be scheduled on both raylets.
    assert len(set(ray.get([f.remote() for _ in range(500)]))) == 2
    assert len(set(ray.get([g.remote() for _ in range(500)]))) == 2

    # The h tasks should be scheduled only on the second raylet.
    raylet_ids = set(ray.get([h.remote() for _ in range(50)]))
    assert len(raylet_ids) == 1
    assert list(raylet_ids)[0] == custom_resource_node.unique_id

    # Make sure that tasks with unsatisfied custom resource requirements do
    # not get scheduled.
    ready_ids, remaining_ids = ray.wait([j.remote(), k.remote()], timeout=0.5)
    assert ready_ids == []
Example #10
def test_detached_placement_group(ray_start_cluster):
    cluster = ray_start_cluster
    for _ in range(2):
        cluster.add_node(num_cpus=3)
    cluster.wait_for_nodes()
    info = ray.init(address=cluster.address)

    # Make sure the detached placement group stays alive after the job dies.
    driver_code = f"""
import ray

ray.init(address="{info["redis_address"]}")

pg = ray.util.placement_group(
        [{{"CPU": 1}} for _ in range(2)],
        strategy="STRICT_SPREAD", lifetime="detached")
ray.get(pg.ready())

@ray.remote(num_cpus=1)
class Actor:
    def ready(self):
        return True

for bundle_index in range(2):
    actor = Actor.options(lifetime="detached", placement_group=pg,
                placement_group_bundle_index=bundle_index).remote()
    ray.get(actor.ready.remote())

ray.shutdown()
    """

    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        jobs = ray.jobs()
        for job in jobs:
            if "StopTime" in job:
                return True
        return False

    def assert_alive_num_pg(expected_num_pg):
        alive_num_pg = 0
        for _, placement_group_info in ray.util.placement_group_table().items(
        ):
            if placement_group_info["state"] == "CREATED":
                alive_num_pg += 1
        return alive_num_pg == expected_num_pg

    def assert_alive_num_actor(expected_num_actor):
        alive_num_actor = 0
        for actor_info in ray.actors().values():
            if actor_info["State"] == ray.gcs_utils.ActorTableData.ALIVE:
                alive_num_actor += 1
        return alive_num_actor == expected_num_actor

    wait_for_condition(is_job_done)

    assert assert_alive_num_pg(1)
    assert assert_alive_num_actor(2)

    # Make sure the detached placement group stays alive after its creator,
    # a detached actor, dies.
    # Test actors first.
    @ray.remote(num_cpus=1)
    class NestedActor:
        def ready(self):
            return True

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            self.actors = []

        def ready(self):
            return True

        def schedule_nested_actor_with_detached_pg(self):
            # Create placement group which is detached.
            pg = ray.util.placement_group(
                [{
                    "CPU": 1
                } for _ in range(2)],
                strategy="STRICT_SPREAD",
                lifetime="detached",
                name="detached_pg")
            ray.get(pg.ready())
            # Schedule nested actor with the placement group.
            for bundle_index in range(2):
                actor = NestedActor.options(
                    placement_group=pg,
                    placement_group_bundle_index=bundle_index,
                    lifetime="detached").remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

    a = Actor.options(lifetime="detached").remote()
    ray.get(a.ready.remote())
    # 1 parent actor and 2 child actors.
    ray.get(a.schedule_nested_actor_with_detached_pg.remote())

    # Kill an actor and wait until it is killed.
    ray.kill(a)
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(a.ready.remote())

    # We should have 2 alive pgs and 4 alive actors.
    assert assert_alive_num_pg(2)
    assert assert_alive_num_actor(4)
Example #11
def test_named_placement_group(ray_start_cluster):
    cluster = ray_start_cluster
    for _ in range(2):
        cluster.add_node(num_cpus=3)
    cluster.wait_for_nodes()
    info = ray.init(address=cluster.address)
    global_placement_group_name = "named_placement_group"

    # Create a detached placement group with name.
    driver_code = f"""
import ray

ray.init(address="{info["redis_address"]}")

pg = ray.util.placement_group(
        [{{"CPU": 1}} for _ in range(2)],
        strategy="STRICT_SPREAD",
        name="{global_placement_group_name}",
        lifetime="detached")
ray.get(pg.ready())

ray.shutdown()
    """

    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        jobs = ray.jobs()
        for job in jobs:
            if "StopTime" in job:
                return True
        return False

    wait_for_condition(is_job_done)

    @ray.remote(num_cpus=1)
    class Actor:
        def ping(self):
            return "pong"

    # Get the named placement group and schedule an actor.
    placement_group = ray.util.get_placement_group(global_placement_group_name)
    assert placement_group is not None
    assert placement_group.wait(5)
    actor = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=0).remote()

    ray.get(actor.ping.remote())

    # Create another placement group and make sure its creation fails.
    same_name_pg = ray.util.placement_group(
        [{
            "CPU": 1
        } for _ in range(2)],
        strategy="STRICT_SPREAD",
        name=global_placement_group_name)
    assert not same_name_pg.wait(10)

    # Remove the named placement group and make sure the second creation
    # succeeds.
    ray.util.remove_placement_group(placement_group)
    same_name_pg = ray.util.placement_group(
        [{
            "CPU": 1
        } for _ in range(2)],
        strategy="STRICT_SPREAD",
        name=global_placement_group_name)
    assert same_name_pg.wait(10)

    # Get a named placement group with a name that doesn't exist
    # and make sure it will raise ValueError correctly.
    error_count = 0
    try:
        ray.util.get_placement_group("inexistent_pg")
    except ValueError:
        error_count = error_count + 1
    assert error_count == 1
Example #12
def test_multiple_directories(tmp_path, shutdown_only):
    num_dirs = 3
    temp_dirs = []
    for i in range(num_dirs):
        temp_folder = tmp_path / f"spill_{i}"
        temp_folder.mkdir()
        temp_dirs.append(temp_folder)

    # Limit our object store to 75 MiB of memory.
    min_spilling_size = 0
    object_spilling_config = json.dumps({
        "type": "filesystem",
        "params": {
            "directory_path": [str(directory) for directory in temp_dirs]
        }
    })
    address = ray.init(object_store_memory=75 * 1024 * 1024,
                       _system_config={
                           "max_io_workers": 5,
                           "object_store_full_delay_ms": 100,
                           "object_spilling_config": object_spilling_config,
                           "min_spilling_size": min_spilling_size,
                       })

    arr = np.ones(74 * 1024 * 1024, dtype=np.uint8)  # 74MB.
    object_refs = []
    # Now the storage is full.
    object_refs.append(ray.put(arr))

    num_object_spilled = 20
    for _ in range(num_object_spilled):
        object_refs.append(ray.put(arr))

    num_files = defaultdict(int)
    for temp_dir in temp_dirs:
        temp_folder = temp_dir / ray.ray_constants.DEFAULT_OBJECT_PREFIX
        for path in temp_folder.iterdir():
            num_files[str(temp_folder)] += 1

    for ref in object_refs:
        assert np.array_equal(ray.get(ref), arr)

    print("Check distribution...")
    min_count = 5
    is_distributed = [n_files >= min_count for n_files in num_files.values()]
    assert all(is_distributed)

    print("Check deletion...")
    # Empty object refs.
    object_refs = []
    # Add a new object so that the last entry is evicted.
    ref = ray.put(arr)
    for temp_dir in temp_dirs:
        temp_folder = temp_dir
        wait_for_condition(lambda: is_dir_empty(temp_folder))
    assert_no_thrashing(address["redis_address"])

    # Now kill ray and see all directories are deleted.
    print("Check directories are deleted...")
    ray.shutdown()
    for temp_dir in temp_dirs:
        wait_for_condition(lambda: is_dir_empty(temp_dir, append_path=""))
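
The is_dir_empty and assert_no_thrashing helpers used here (is_dir_empty also appears in Example #13) come from the surrounding test module and are not part of the snippet. A plausible sketch of is_dir_empty, assuming it counts entries under the spill directory (by default under the object prefix referenced earlier in this test) and treats a not-yet-created directory as empty:

import ray


def is_dir_empty(temp_folder,
                 append_path=ray.ray_constants.DEFAULT_OBJECT_PREFIX):
    """Return True if the spill directory has no files (or does not exist yet)."""
    test_dir = temp_folder / append_path
    if not test_dir.exists():
        return True
    return sum(1 for _ in test_dir.iterdir()) == 0
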
Example #13
def test_delete_objects_multi_node(multi_node_object_spilling_config,
                                   ray_start_cluster):
    # Limit our object store to 75 MiB of memory.
    object_spilling_config, temp_folder = multi_node_object_spilling_config

    cluster = ray_start_cluster
    # Head node.
    cluster.add_node(num_cpus=1,
                     object_store_memory=75 * 1024 * 1024,
                     _system_config={
                         "max_io_workers": 2,
                         "min_spilling_size": 20 * 1024 * 1024,
                         "automatic_object_spilling_enabled": True,
                         "object_store_full_delay_ms": 100,
                         "object_spilling_config": object_spilling_config,
                     })
    ray.init(address=cluster.address)
    # Add 2 worker nodes.
    for _ in range(2):
        cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)
    cluster.wait_for_nodes()

    arr = np.random.rand(1024 * 1024)  # 8 MB data

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            self.replay_buffer = []

        def ping(self):
            return

        def create_objects(self):
            for _ in range(80):
                ref = None
                while ref is None:
                    ref = ray.put(arr)
                    self.replay_buffer.append(ref)
                # Pop the last item from the replay buffer with 60% probability.
                if random.randint(0, 9) < 6:
                    self.replay_buffer.pop()

            # Do random sampling.
            for _ in range(50):
                ref = random.choice(self.replay_buffer)
                sample = ray.get(ref, timeout=10)
                assert np.array_equal(sample, arr)

    actors = [Actor.remote() for _ in range(3)]
    ray.get([actor.create_objects.remote() for actor in actors])

    def wait_until_actor_dead(actor):
        try:
            ray.get(actor.ping.remote())
        except ray.exceptions.RayActorError:
            return True
        return False

    # Kill actors to remove all references.
    for actor in actors:
        ray.kill(actor)
        wait_for_condition(lambda: wait_until_actor_dead(actor))
    # The multi node deletion should work.
    wait_for_condition(lambda: is_dir_empty(temp_folder))
    assert_no_thrashing(cluster.address)
Example #14
    assert not leaked_processes()

    run_experiments({
        "demo": {
            "run": "PG",
            "env": "subproc",
            "num_samples": 1,
            "config": {
                "num_workers": 1,
                "env_config": {
                    "tmp_file1": tmp1,
                    "tmp_file2": tmp2,
                    "tmp_file3": tmp3,
                    "tmp_file4": tmp4,
                },
                "framework": "tf",
            },
            "stop": {
                "training_iteration": 1
            },
        },
    })
    # Check that no processes were leaked and that the env cleaned up
    # the given tmp files.
    wait_for_condition(lambda: not leaked_processes(), timeout=30)
    wait_for_condition(lambda: not os.path.exists(tmp1), timeout=30)
    wait_for_condition(lambda: not os.path.exists(tmp2), timeout=30)
    wait_for_condition(lambda: not os.path.exists(tmp3), timeout=30)
    wait_for_condition(lambda: not os.path.exists(tmp4), timeout=30)
    print("OK")
Example #15
def test_local_clusters():
    """
    This tests the various behaviors of connecting to local clusters:

    * Using `ray.client("local").connect()` should always create a new
      cluster.
    * Using `ray.client().connect()` should create a new cluster if it
      doesn't connect to an existing one.
    * Using `ray.client().connect()` should only connect to a cluster if it
      was created with `ray start --head`, not from a Python program.

    It tests whether two calls are in the same cluster by trying to create an
    actor with the same name in the same namespace, which will error and cause
    the script to exit with a non-zero code, which raises an exception.
    """
    driver_template = """
import ray
info = ray.client({address}).namespace("").connect()

@ray.remote
class Foo:
    def ping(self):
        return "pong"

a = Foo.options(name="abc", lifetime="detached").remote()
ray.get(a.ping.remote())

import time
while True:
    time.sleep(30)

"""
    blocking_local_script = driver_template.format(address="'local'",
                                                   blocking=True)
    blocking_noaddr_script = driver_template.format(address="", blocking=True)

    # This should start a cluster.
    p1 = run_string_as_driver_nonblocking(blocking_local_script)
    # ray.client("local").connect() should start a second cluster.
    p2 = run_string_as_driver_nonblocking(blocking_local_script)
    # ray.client().connect() shouldn't connect to a cluster started by
    # ray.client("local").connect() so it should create a third one.
    p3 = run_string_as_driver_nonblocking(blocking_noaddr_script)
    # ray.client().connect() shouldn't connect to a cluster started by
    # ray.client().connect() so it should create a fourth one.
    p4 = run_string_as_driver_nonblocking(blocking_noaddr_script)

    wait_for_condition(
        lambda: len(ray._private.services.find_redis_address()) == 4,
        retry_interval_ms=1000)

    p1.kill()
    p2.kill()
    p3.kill()
    p4.kill()
    # Prevent flakiness since fatesharing takes some time.
    subprocess.check_output("ray stop --force", shell=True)

    # Since there's a cluster started with `ray start --head`
    # we should connect to it instead.
    subprocess.check_output("ray start --head", shell=True)
    # The assertion in the driver should cause the script to fail if we start
    # a new cluster instead of connecting.
    run_string_as_driver("""
import ray
ray.client().connect()
assert len(ray._private.services.find_redis_address()) == 1
    """)
    # ray.client("local").connect() should always create a new cluster even if
    # there's one running.
    p1 = run_string_as_driver_nonblocking(blocking_local_script)
    wait_for_condition(
        lambda: len(ray._private.services.find_redis_address()) == 2,
        retry_interval_ms=1000)
    p1.kill()
    subprocess.check_output("ray stop --force", shell=True)
Example #16
def test_dynamic_res_concurrent_res_delete(ray_start_cluster):
    # This test makes sure a resource gets deleted correctly when a task has
    # already acquired the resource.

    cluster = ray_start_cluster

    res_name = "test_res"
    res_capacity = 5
    num_nodes = 5
    TIMEOUT_DURATION = 1

    for i in range(num_nodes):
        cluster.add_node()

    ray.init(address=cluster.address)

    node_ids = [node["NodeID"] for node in ray.nodes()]
    target_node_id = node_ids[1]

    @ray.remote
    def set_res(resource_name, resource_capacity, res_node_id):
        ray.experimental.set_resource(resource_name,
                                      resource_capacity,
                                      node_id=res_node_id)

    @ray.remote
    def delete_res(resource_name, res_node_id):
        ray.experimental.set_resource(resource_name, 0, node_id=res_node_id)

    # Create the resource on node 1
    ray.get(set_res.remote(res_name, res_capacity, target_node_id))

    def check_resources():
        return ray.cluster_resources().get(res_name, None) == res_capacity

    wait_for_condition(check_resources)

    # Task to hold the resource till the driver signals to finish
    @ray.remote
    def wait_func(running_signal, finish_signal):
        # Signal that the task is running.
        ray.get(running_signal.send.remote())
        # Wait until signaled by driver.
        ray.get(finish_signal.wait.remote())

    @ray.remote
    def test_func():
        return 1

    @ray.remote(num_cpus=0)
    class Signal:
        def __init__(self):
            self.ready_event = asyncio.Event()

        def send(self):
            self.ready_event.set()

        async def wait(self):
            await self.ready_event.wait()

    running_signal = Signal.remote()
    finish_signal = Signal.remote()

    # Launch the task with a resource requirement of 4, so the new available
    # capacity becomes 1.
    task = wait_func._remote(args=[running_signal, finish_signal],
                             resources={res_name: 4})
    # Wait until wait_func is launched before updating resource
    ray.get(running_signal.wait.remote())

    # Delete the resource
    ray.get(delete_res.remote(res_name, target_node_id))

    # Signal task to complete
    ray.get(finish_signal.send.remote())
    ray.get(task)

    # Check if the scheduler state is consistent by launching a task requiring
    # the deleted resource. This should not execute.
    task_2 = test_func._remote(args=[],
                               resources={res_name:
                                          1})  # This should be infeasible
    successful, unsuccessful = ray.wait([task_2], timeout=TIMEOUT_DURATION)
    assert unsuccessful  # The task did not complete because it's infeasible
    assert res_name not in ray.available_resources()
Example #17
def test_global_gc_when_full(shutdown_only):
    cluster = ray.cluster_utils.Cluster()
    for _ in range(2):
        cluster.add_node(num_cpus=1,
                         num_gpus=0,
                         object_store_memory=100 * 1024 * 1024)
    ray.init(address=cluster.address)

    class LargeObjectWithCyclicRef:
        def __init__(self):
            self.loop = self
            self.large_object = ray.put(
                np.zeros(40 * 1024 * 1024, dtype=np.uint8))

    @ray.remote(num_cpus=1)
    class GarbageHolder:
        def __init__(self):
            gc.disable()
            x = LargeObjectWithCyclicRef()
            self.garbage = weakref.ref(x)

        def has_garbage(self):
            return self.garbage() is not None

        def return_large_array(self):
            return np.zeros(80 * 1024 * 1024, dtype=np.uint8)

    try:
        gc.disable()

        # Local driver.
        local_ref = weakref.ref(LargeObjectWithCyclicRef())

        # Remote workers.
        actors = [GarbageHolder.remote() for _ in range(2)]
        assert local_ref() is not None
        assert all(ray.get([a.has_garbage.remote() for a in actors]))

        # GC should be triggered for all workers, including the local driver,
        # when the driver tries to ray.put a value that doesn't fit in the
        # object store. This should cause the captured ObjectRefs' numpy arrays
        # to be evicted.
        ray.put(np.zeros(80 * 1024 * 1024, dtype=np.uint8))

        def check_refs_gced():
            return (local_ref() is None and
                    not any(ray.get([a.has_garbage.remote() for a in actors])))

        wait_for_condition(check_refs_gced)

        # Local driver.
        local_ref = weakref.ref(LargeObjectWithCyclicRef())

        # Remote workers.
        actors = [GarbageHolder.remote() for _ in range(2)]
        assert all(ray.get([a.has_garbage.remote() for a in actors]))

        # GC should be triggered for all workers, including the local driver,
        # when a remote task tries to put a return value that doesn't fit in
        # the object store. This should cause the captured ObjectRefs' numpy
        # arrays to be evicted.
        ray.get(actors[0].return_large_array.remote())

        def check_refs_gced():
            return (local_ref() is None and
                    not any(ray.get([a.has_garbage.remote() for a in actors])))

        wait_for_condition(check_refs_gced)
    finally:
        gc.enable()
Example #18
def test_multiple_routers():
    cluster = Cluster()
    head_node = cluster.add_node()
    cluster.add_node()

    ray.init(head_node.address)
    node_ids = ray.state.node_ids()
    assert len(node_ids) == 2
    serve.init(http_port=8005)

    def actor_name(index):
        return SERVE_PROXY_NAME + "-{}-{}".format(node_ids[0], index)

    # Two actors should be started.
    def get_first_two_actors():
        try:
            ray.get_actor(actor_name(0))
            ray.get_actor(actor_name(1))
            return True
        except ValueError:
            return False

    wait_for_condition(get_first_two_actors)

    # Wait for the actors to come up.
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Kill one of the servers, the HTTP server should still function.
    ray.kill(ray.get_actor(actor_name(0)), no_restart=True)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Add a new node to the cluster. This should trigger a new router to get
    # started.
    new_node = cluster.add_node()

    def get_third_actor():
        try:
            ray.get_actor(actor_name(2))
            return True
        except ValueError:
            return False

    wait_for_condition(get_third_actor)

    # Remove the newly-added node from the cluster. The corresponding actor
    # should be removed as well.
    cluster.remove_node(new_node)

    def third_actor_removed():
        try:
            ray.get_actor(actor_name(2))
            return False
        except ValueError:
            return True

    # Check that the actor is gone and the HTTP server still functions.
    wait_for_condition(third_actor_removed)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Clean up the nodes (otherwise Ray will segfault).
    ray.shutdown()
    cluster.shutdown()
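
The block_until_http_ready task used here (and in Example #19) is a Ray Serve test helper that is not defined in these snippets. A rough sketch under the assumption that it is a remote task polling the URL until the HTTP proxy responds or a timeout expires (the parameter names and defaults below are assumptions, not the actual implementation):

import time

import requests

import ray


@ray.remote
def block_until_http_ready(http_endpoint, timeout=30, backoff_s=1):
    """Poll the endpoint until it answers; raise if the timeout expires."""
    start = time.time()
    while time.time() - start < timeout:
        try:
            requests.get(http_endpoint)
            return
        except requests.exceptions.ConnectionError:
            time.sleep(backoff_s)
    raise TimeoutError(f"HTTP endpoint never became ready: {http_endpoint}")
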
Example #19
def test_multiple_routers(ray_cluster):
    cluster = ray_cluster
    head_node = cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)

    ray.init(head_node.address)
    node_ids = ray.state.node_ids()
    assert len(node_ids) == 2
    serve.start(http_options=dict(port=8005, location="EveryNode"))

    def get_proxy_names():
        proxy_names = []
        for node_id, _ in get_all_node_ids():
            proxy_names.append(
                format_actor_name(SERVE_PROXY_NAME,
                                  serve.api._global_client._controller_name,
                                  node_id))
        return proxy_names

    wait_for_condition(lambda: len(get_proxy_names()) == 2)
    proxy_names = get_proxy_names()

    # Two actors should be started.
    def get_first_two_actors():
        try:
            ray.get_actor(proxy_names[0])
            ray.get_actor(proxy_names[1])
            return True
        except ValueError:
            return False

    wait_for_condition(get_first_two_actors)

    # Wait for the actors to come up.
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Kill one of the servers, the HTTP server should still function.
    ray.kill(ray.get_actor(get_proxy_names()[0]), no_restart=True)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Add a new node to the cluster. This should trigger a new router to get
    # started.
    new_node = cluster.add_node()

    wait_for_condition(lambda: len(get_proxy_names()) == 3)
    third_proxy = get_proxy_names()[2]

    def get_third_actor():
        try:
            ray.get_actor(third_proxy)
            return True
        # IndexError covers the case where cluster resources aren't updated yet.
        except (IndexError, ValueError):
            return False

    wait_for_condition(get_third_actor)

    # Remove the newly-added node from the cluster. The corresponding actor
    # should be removed as well.
    cluster.remove_node(new_node)

    def third_actor_removed():
        try:
            ray.get_actor(third_proxy)
            return False
        except ValueError:
            return True

    # Check that the actor is gone and the HTTP server still functions.
    wait_for_condition(third_actor_removed)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))
Example #20
def test_metrics_export_end_to_end(_setup_cluster_for_test):
    TEST_TIMEOUT_S = 20

    prom_addresses = _setup_cluster_for_test

    def test_cases():
        components_dict, metric_names, metric_samples = fetch_prometheus(
            prom_addresses)

        # Raylet should be on every node
        assert all("raylet" in components
                   for components in components_dict.values())

        # GCS server should be on one node
        assert any("gcs_server" in components
                   for components in components_dict.values())

        # Core worker should be on at least one node
        assert any("core_worker" in components
                   for components in components_dict.values())

        # Make sure our user defined metrics exist
        for metric_name in ["test_counter", "test_histogram"]:
            assert any(metric_name in full_name for full_name in metric_names)

        # Make sure GCS server metrics are recorded.
        assert "ray_outbound_heartbeat_size_kb_sum" in metric_names

        # Make sure the numeric value is correct
        test_counter_sample = [
            m for m in metric_samples if "test_counter" in m.name
        ][0]
        assert test_counter_sample.value == 1.0

        # Make sure the numeric value is correct
        test_histogram_samples = [
            m for m in metric_samples if "test_histogram" in m.name
        ]
        buckets = {
            m.labels["le"]: m.value
            for m in test_histogram_samples if "_bucket" in m.name
        }
        # We recorded value 1.5 for the histogram. In the Prometheus data
        # model the histogram is cumulative, so we expect the count to appear
        # in the <=1.6 and +Inf buckets.
        assert buckets == {"0.1": 0.0, "1.6": 1.0, "+Inf": 1.0}
        hist_count = [m for m in test_histogram_samples
                      if "_count" in m.name][0].value
        hist_sum = [m for m in test_histogram_samples
                    if "_sum" in m.name][0].value
        assert hist_count == 1
        assert hist_sum == 1.5

    def wrap_test_case_for_retry():
        try:
            test_cases()
            return True
        except AssertionError:
            return False

    try:
        wait_for_condition(
            wrap_test_case_for_retry,
            timeout=TEST_TIMEOUT_S,
            retry_interval_ms=1000,  # Yield resource for other processes
        )
    except RuntimeError:
        print(
            f"The components are {pformat(fetch_prometheus(prom_addresses))}")
        test_cases()  # Should fail assert
Example #21
def test_delete_objects_multi_node(tmp_path, ray_start_cluster):
    # Limit our object store to 75 MiB of memory.
    temp_folder = tmp_path / "spill"
    temp_folder.mkdir()
    cluster = ray_start_cluster
    # Head node.
    cluster.add_node(
        num_cpus=1,
        object_store_memory=75 * 1024 * 1024,
        _system_config={
            "max_io_workers": 2,
            "automatic_object_spilling_enabled": True,
            "object_store_full_max_retries": 4,
            "object_store_full_initial_delay_ms": 100,
            "object_spilling_config": json.dumps({
                "type": "filesystem",
                "params": {
                    "directory_path": str(temp_folder)
                }
            }),
        })
    # Add 2 worker nodes.
    for _ in range(2):
        cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)
    ray.init(address=cluster.address)

    arr = np.random.rand(1024 * 1024)  # 8 MB data

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            self.replay_buffer = []

        def ping(self):
            return

        def create_objects(self):
            for _ in range(80):
                ref = None
                while ref is None:
                    ref = ray.put(arr)
                    self.replay_buffer.append(ref)
                # Pop the last item from the replay buffer with 60% probability.
                if random.randint(0, 9) < 6:
                    self.replay_buffer.pop()

            # Do random sampling.
            for _ in range(200):
                ref = random.choice(self.replay_buffer)
                sample = ray.get(ref, timeout=0)
                assert np.array_equal(sample, arr)

    actors = [Actor.remote() for _ in range(3)]
    ray.get([actor.create_objects.remote() for actor in actors])

    def wait_until_actor_dead(actor):
        try:
            ray.get(actor.ping.remote())
        except ray.exceptions.RayActorError:
            return True
        return False

    def is_dir_empty():
        num_files = 0
        for path in temp_folder.iterdir():
            num_files += 1
        return num_files == 0

    # Kill actors to remove all references.
    for actor in actors:
        ray.kill(actor)
        wait_for_condition(lambda: wait_until_actor_dead(actor))
    # The multi node deletion should work.
    wait_for_condition(is_dir_empty)
Example #22
def test_metrics_export_end_to_end(_setup_cluster_for_test):
    TEST_TIMEOUT_S = 20

    prom_addresses = _setup_cluster_for_test

    # Make sure we can ping Prometheus endpoints.
    def fetch_prometheus(prom_addresses):
        components_dict = {}
        metric_names = set()
        metric_samples = []
        for address in prom_addresses:
            if address not in components_dict:
                components_dict[address] = set()
            try:
                response = requests.get(f"http://{address}/metrics")
            except requests.exceptions.ConnectionError:
                continue

            for line in response.text.split("\n"):
                for family in text_string_to_metric_families(line):
                    for sample in family.samples:
                        metric_names.add(sample.name)
                        metric_samples.append(sample)
                        if "Component" in sample.labels:
                            components_dict[address].add(
                                sample.labels["Component"])
        return components_dict, metric_names, metric_samples

    def test_cases():
        components_dict, metric_names, metric_samples = fetch_prometheus(
            prom_addresses)

        # Raylet should be on every node
        assert all("raylet" in components
                   for components in components_dict.values())

        # GCS server should be on one node
        assert any("gcs_server" in components
                   for components in components_dict.values())

        # Core worker should be on at least one node
        assert any("core_worker" in components
                   for components in components_dict.values())

        # Make sure our user defined metrics exist
        for metric_name in ["test_counter", "test_histogram"]:
            assert any(metric_name in full_name for full_name in metric_names)

        # Make sure the numeric value is correct
        test_counter_sample = [
            m for m in metric_samples if "test_counter" in m.name
        ][0]
        assert test_counter_sample.value == 1.0

        # Make sure the numeric value is correct
        test_histogram_samples = [
            m for m in metric_samples if "test_histogram" in m.name
        ]
        buckets = {
            m.labels["le"]: m.value
            for m in test_histogram_samples if "_bucket" in m.name
        }
        # We recorded value 1.5 for the histogram. In the Prometheus data
        # model the histogram is cumulative, so we expect the count to appear
        # in the <=1.6 and +Inf buckets.
        assert buckets == {"0.1": 0.0, "1.6": 1.0, "+Inf": 1.0}
        hist_count = [m for m in test_histogram_samples
                      if "_count" in m.name][0].value
        hist_sum = [m for m in test_histogram_samples
                    if "_sum" in m.name][0].value
        assert hist_count == 1
        assert hist_sum == 1.5

    def wrap_test_case_for_retry():
        try:
            test_cases()
            return True
        except AssertionError:
            return False

    try:
        wait_for_condition(
            wrap_test_case_for_retry,
            timeout=TEST_TIMEOUT_S,
            retry_interval_ms=1000,  # Yield resource for other processes
        )
    except RuntimeError:
        print(
            f"The components are {pformat(fetch_prometheus(prom_addresses))}")
        test_cases()  # Should fail assert
Example #23
def test_automatic_cleanup_detached_actors(ray_start_cluster):
    # Make sure the placement groups created by
    # detached actors are cleaned up properly.
    cluster = ray_start_cluster
    num_nodes = 3
    num_cpu_per_node = 2
    # Create 3 nodes cluster.
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_cpu_per_node)

    info = ray.init(address=cluster.address)
    available_cpus = ray.available_resources()["CPU"]
    assert available_cpus == num_nodes * num_cpu_per_node

    driver_code = f"""
import ray

ray.init(address="{info["redis_address"]}")

def create_pg():
    pg = ray.util.placement_group(
            [{{"CPU": 1}} for _ in range(3)],
            strategy="STRICT_SPREAD")
    ray.get(pg.ready())
    return pg

# TODO(sang): Placement groups created by tasks launched by a detached actor
# are not cleaned up with the current protocol.
# @ray.remote(num_cpus=0)
# def f():
#     create_pg()

@ray.remote(num_cpus=0, max_restarts=1)
class A:
    def create_pg(self):
        create_pg()
    def create_child_pg(self):
        self.a = A.options(name="B").remote()
        ray.get(self.a.create_pg.remote())
    def kill_child_actor(self):
        ray.kill(self.a)
        try:
            ray.get(self.a.create_pg.remote())
        except Exception:
            pass

a = A.options(lifetime="detached", name="A").remote()
ray.get(a.create_pg.remote())
# TODO(sang): Currently, child tasks are cleaned when a detached actor
# is dead. We cannot test this scenario until it is fixed.
# ray.get(a.create_child_pg.remote())

ray.shutdown()
    """

    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        jobs = ray.jobs()
        for job in jobs:
            if "StopTime" in job:
                return True
        return False

    def assert_num_cpus(expected_num_cpus):
        if expected_num_cpus == 0:
            return "CPU" not in ray.available_resources()
        return ray.available_resources()["CPU"] == expected_num_cpus

    wait_for_condition(is_job_done)
    assert assert_num_cpus(num_nodes)
    # Make sure when a child actor spawned by a detached actor
    # is killed, the placement group is removed.
    a = ray.get_actor("A")
    # TODO(sang): Children of detached actors
    # seem to be killed when jobs are done. We should fix this before
    # testing this scenario.
    # ray.get(a.kill_child_actor.remote())
    # assert assert_num_cpus(num_nodes)

    # Make sure placement groups are cleaned when detached actors are killed.
    ray.kill(a, no_restart=False)
    wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
    # The detached actor a should've been restarted.
    # Recreate a placement group.
    ray.get(a.create_pg.remote())
    wait_for_condition(lambda: assert_num_cpus(num_nodes))
    # Kill it again and make sure the placement group
    # that is created is deleted again.
    ray.kill(a, no_restart=False)
    wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
Example #24
def test_actor_restart(ray_init_with_task_retry_delay):
    """Test actor restart when actor process is killed."""
    @ray.remote(max_restarts=1)
    class RestartableActor:
        """An actor that will be restarted at most once."""
        def __init__(self):
            self.value = 0

        def increase(self, exit=False):
            if exit:
                os._exit(-1)
            self.value += 1
            return self.value

        def get_pid(self):
            return os.getpid()

    actor = RestartableActor.remote()
    # Submit some tasks, killing the actor partway through (when i == 100).
    results = [actor.increase.remote(exit=(i == 100)) for i in range(200)]
    # Make sure that all tasks were executed in order before the actor's death.
    i = 1
    while results:
        res = results[0]
        try:
            r = ray.get(res)
            if r != i:
                # Actor restarted at this task without any failed tasks in
                # between.
                break
            results.pop(0)
            i += 1
        except ray.exceptions.RayActorError:
            break
    # Skip any tasks that errored while the actor was down.
    while results:
        try:
            ray.get(results[0])
            # This task executed successfully after the restart.
            break
        except ray.exceptions.RayActorError:
            results.pop(0)
    # Check all tasks that executed after the restart.
    if results:
        # The actor executed some tasks after the restart.
        i = 1
        while results:
            r = ray.get(results.pop(0))
            assert r == i
            i += 1

        # Check that we can still call the actor.
        result = actor.increase.remote()
        assert ray.get(result) == r + 1
    else:
        # Wait for the actor to restart.
        def ping():
            try:
                ray.get(actor.increase.remote())
                return True
            except ray.exceptions.RayActorError:
                return False

        wait_for_condition(ping)

    # The actor has restarted. Kill the actor process one more time.
    actor.increase.remote(exit=True)
    # The actor has exceeded max restarts. All tasks should fail.
    for _ in range(100):
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(actor.increase.remote())

    # Create another actor.
    actor = RestartableActor.remote()
    # Intentionally exit the actor.
    actor.__ray_terminate__.remote()
    # Check that the actor won't be restarted.
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(actor.increase.remote())
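
Condensed, this example exercises the following restart semantics. The sketch below is illustrative rather than part of the test suite; it assumes an initialized Ray session, the imports used throughout these examples (ray, os, pytest), and the same wait_for_condition helper, and the actor name is made up. With max_restarts=1, the first crash is recovered from and the second one leaves the actor permanently dead.

@ray.remote(max_restarts=1)
class Flaky:
    def crash(self):
        os._exit(1)  # abruptly kill the actor process

    def ping(self):
        return True

flaky = Flaky.remote()
flaky.crash.remote()  # First crash: Ray restarts the actor asynchronously.

def restarted():
    try:
        return ray.get(flaky.ping.remote())
    except ray.exceptions.RayActorError:
        return False

wait_for_condition(restarted)  # Wait until the restarted actor responds.
flaky.crash.remote()  # Second crash exceeds max_restarts=1.
with pytest.raises(ray.exceptions.RayActorError):
    ray.get(flaky.ping.remote())  # The actor is now permanently dead.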
Example #25
def test_remove_placement_group(ray_start_cluster):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)
    # First try to remove a placement group that doesn't
    # exist. This should not do anything.
    random_group_id = PlacementGroupID.from_random()
    random_placement_group = PlacementGroup(random_group_id, [{"CPU": 1}])
    for _ in range(3):
        ray.util.remove_placement_group(random_placement_group)

    # Removing a placement group as soon as it is
    # created should work.
    placement_group = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])
    ray.util.remove_placement_group(placement_group)

    def is_placement_group_removed():
        table = ray.util.placement_group_table(placement_group)
        if "state" not in table:
            return False
        return table["state"] == "REMOVED"

    wait_for_condition(is_placement_group_removed)

    # Now let's create a placement group.
    placement_group = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])

    # Create an actor that occupies resources.
    @ray.remote(num_cpus=2)
    class A:
        def f(self):
            return 3

    # Currently, there's no way to prevent
    # tasks from being retried for a removed placement group.
    # Set max_retries=0 for testing.
    # TODO(sang): Handle this edge case.
    @ray.remote(num_cpus=2, max_retries=0)
    def long_running_task():
        print(os.getpid())
        import time
        time.sleep(50)

    # Schedule a long running task and actor.
    task_ref = long_running_task.options(
        placement_group=placement_group).remote()
    a = A.options(placement_group=placement_group).remote()
    assert ray.get(a.f.remote()) == 3

    ray.util.remove_placement_group(placement_group)
    # Subsequent remove requests shouldn't do anything.
    for _ in range(3):
        ray.util.remove_placement_group(placement_group)

    # Make sure placement group resources are
    # released and we can schedule this task.
    @ray.remote(num_cpus=4)
    def f():
        return 3

    assert ray.get(f.remote()) == 3
    # Since the placement group is removed,
    # the actor should've been killed.
    # That means this request should fail.
    with pytest.raises(ray.exceptions.RayActorError, match="actor died"):
        ray.get(a.f.remote(), timeout=3.0)
    with pytest.raises(ray.exceptions.WorkerCrashedError):
        ray.get(task_ref)
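
For reference, the placement group calls used in the last two examples reduce to the lifecycle below. This is a hedged sketch, assuming an already-initialized Ray session with at least two free CPUs; the bundle sizes and the function name are illustrative.

# Reserve two 1-CPU bundles and block until they are actually placed.
pg = ray.util.placement_group([{"CPU": 1}, {"CPU": 1}], strategy="PACK")
ray.get(pg.ready())

# Schedule work inside the reserved bundles.
@ray.remote(num_cpus=1)
def in_group():
    return "scheduled inside the placement group"

print(ray.get(in_group.options(placement_group=pg).remote()))

# Release the reservation; the group's table entry transitions to "REMOVED"
# shortly afterwards (the previous example polls for this state).
ray.util.remove_placement_group(pg)
print(ray.util.placement_group_table(pg)["state"])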
Example #26
def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
    """Test actor owner node dies before local dependencies are resolved.
    This test verifies the scenario where owner node
    has failed before actor dependencies are resolved.
    Reference: https://github.com/ray-project/ray/pull/8045
    """
    @ray.remote
    class Actor:
        def __init__(self, dependency):
            print("actor: {}".format(os.getpid()))
            self.dependency = dependency

        def f(self):
            return self.dependency

    # Make sure it is scheduled on the second node.
    @ray.remote(resources={"node": 1}, num_cpus=1)
    class Owner:
        def get_pid(self):
            return os.getpid()

        def create_actor(self, caller_handle):
            s = SignalActor.remote()
            # Create an actor which depends on an object that can never be
            # resolved.
            actor_handle = Actor.remote(s.wait.remote())

            pid = os.getpid()
            signal_handle = SignalActor.remote()
            caller_handle.call.remote(pid, signal_handle, actor_handle)
            # Wait until the `Caller` starts executing the remote `call` method.
            ray.get(signal_handle.wait.remote())

    @ray.remote
    class Caller:
        def call(self, owner_pid, signal_handle, actor_handle):
            # Notify the `Owner` that the `Caller` is executing the remote
            # `call` method.
            ray.get(signal_handle.send.remote())
            # Wait for the `Owner` to exit.
            wait_for_pid_to_exit(owner_pid)
            oid = actor_handle.f.remote()
            # It will hang without the location resolution protocol.
            ray.get(oid)

        def hang(self):
            return True

    cluster = ray_start_cluster
    node_to_be_broken = cluster.add_node(num_cpus=1, resources={"node": 1})

    owner = Owner.remote()
    owner_pid = ray.get(owner.get_pid.remote())

    caller = Caller.remote()
    owner.create_actor.remote(caller)
    cluster.remove_node(node_to_be_broken)
    # Wait for the `Owner` to exit.
    wait_for_pid_to_exit(owner_pid)

    # It will hang here if location is not properly resolved.
    assert (wait_for_condition(lambda: ray.get(caller.hang.remote())))
Example #27
def test_submit_job_validation(ray_start_with_dashboard):
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)

    job_root_dir = os.path.join(
        os.path.dirname(ray_start_with_dashboard["session_dir"]), "job")
    shutil.rmtree(job_root_dir, ignore_errors=True)

    def _ensure_available_nodes():
        resp = requests.post(f"{webui_url}/jobs")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is False
        return "no nodes available" not in result["msg"]

    wait_for_condition(_ensure_available_nodes, timeout=5)

    # Invalid value.
    resp = requests.post(f"{webui_url}/jobs",
                         json={
                             "language": "Unsupported",
                             "runtime_env": {
                                 "working_dir": "http://xxx/yyy.zip"
                             },
                             "driver_entry": "python_file_name_without_ext",
                         })
    resp.raise_for_status()
    result = resp.json()
    assert result["result"] is False
    msg = result["msg"]
    assert "language" in msg and "value is not a valid" in msg, resp.text

    # Missing required field.
    resp = requests.post(f"{webui_url}/jobs",
                         json={
                             "language": job_consts.PYTHON,
                             "runtime_env": {
                                 "working_dir": "http://xxx/yyy.zip"
                             },
                         })
    resp.raise_for_status()
    result = resp.json()
    assert result["result"] is False
    msg = result["msg"]
    assert all(p in msg for p in ["missing", "driver_entry"]), resp.text

    # Incorrect value type.
    resp = requests.post(f"{webui_url}/jobs",
                         json={
                             "language": job_consts.PYTHON,
                             "runtime_env": {
                                 "working_dir": ["http://xxx/yyy.zip"]
                             },
                             "driver_entry": "python_file_name_without_ext",
                         })
    resp.raise_for_status()
    result = resp.json()
    assert result["result"] is False
    msg = result["msg"]
    assert all(p in msg for p in ["working_dir", "str"]), resp.text

    # Invalid key.
    resp = requests.post(f"{webui_url}/jobs",
                         json={
                             "language": job_consts.PYTHON,
                             "runtime_env": {
                                 "working_dir": "http://xxx/yyy.zip"
                             },
                             "driver_entry": "python_file_name_without_ext",
                             "invalid_key": 1,
                         })
    resp.raise_for_status()
    result = resp.json()
    assert result["result"] is False
    msg = result["msg"]
    assert all(p in msg for p in ["not permitted", "invalid_key"]), resp.text
Example #28
def test_initial_workers(shutdown_only):
    # `num_cpus` should be <=2 because a Travis CI machine only has 2 CPU cores
    ray.init(num_cpus=1, include_dashboard=True)
    wait_for_condition(lambda: len(get_workers()) == 1)
Example #29
def decorated_func(quantity):
    wait_for_condition(
        lambda: ray.available_resources()[resource_name] < quantity)
    return True
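
Nearly every example here polls with wait_for_condition. The helper below is a minimal sketch of what such a utility might look like; it is an assumption for illustration, not Ray's actual test utility, and the real signature and time units may differ.

import time

def wait_for_condition(condition_predictor, timeout=10, retry_interval_ms=100):
    """Re-evaluate condition_predictor until it is truthy or timeout (seconds) elapses."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if condition_predictor():
            return True
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("The condition wasn't met before the timeout expired.")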
Example #30
def test_memory_dashboard(shutdown_only):
    """Test Memory table.

    These tests verify examples in this document.
    https://docs.ray.io/en/latest/memory-management.html#debugging-using-ray-memory
    """
    addresses = ray.init(num_cpus=2)
    webui_url = addresses["webui_url"].replace("127.0.0.1", "http://127.0.0.1")
    assert (wait_until_server_available(addresses["webui_url"]) is True)

    def get_memory_table():
        memory_table = requests.get(webui_url + "/api/memory_table").json()
        return memory_table["result"]

    def memory_table_ready():
        """Wait until the new fresh memory table is ready."""
        global prev_memory_table
        memory_table = get_memory_table()
        is_ready = memory_table["group"] != prev_memory_table
        prev_memory_table = memory_table["group"]
        return is_ready

    def stop_memory_table():
        requests.get(webui_url + "/api/stop_memory_table").json()

    def test_local_reference():
        @ray.remote
        def f(arg):
            return arg

        # a and b are local references.
        a = ray.put(None)  # Noqa F841
        b = f.remote(None)  # Noqa F841

        wait_for_condition(memory_table_ready)
        memory_table = get_memory_table()
        summary = memory_table["summary"]
        group = memory_table["group"]
        assert summary["total_captured_in_objects"] == 0
        assert summary["total_pinned_in_memory"] == 0
        assert summary["total_used_by_pending_task"] == 0
        assert summary["total_local_ref_count"] == 2
        for table in group.values():
            for entry in table["entries"]:
                assert (
                    entry["reference_type"] == ReferenceType.LOCAL_REFERENCE)
        stop_memory_table()
        return True

    def test_object_pinned_in_memory():

        a = ray.put(np.zeros(200 * 1024, dtype=np.uint8))
        b = ray.get(a)  # Noqa F841
        del a

        wait_for_condition(memory_table_ready)
        memory_table = get_memory_table()
        summary = memory_table["summary"]
        group = memory_table["group"]
        assert summary["total_captured_in_objects"] == 0
        assert summary["total_pinned_in_memory"] == 1
        assert summary["total_used_by_pending_task"] == 0
        assert summary["total_local_ref_count"] == 0
        for table in group.values():
            for entry in table["entries"]:
                assert (
                    entry["reference_type"] == ReferenceType.PINNED_IN_MEMORY)
        stop_memory_table()
        return True

    def test_pending_task_references():
        @ray.remote
        def f(arg):
            time.sleep(1)

        a = ray.put(np.zeros(200 * 1024, dtype=np.uint8))
        b = f.remote(a)

        wait_for_condition(memory_table_ready)
        memory_table = get_memory_table()
        summary = memory_table["summary"]
        assert summary["total_captured_in_objects"] == 0
        assert summary["total_pinned_in_memory"] == 1
        assert summary["total_used_by_pending_task"] == 1
        assert summary["total_local_ref_count"] == 1
        # Make sure the task f is done before moving on to the next test.
        # Otherwise, its pending-task entry would still show up in the
        # memory table while the next test is running.
        ray.get(b)
        stop_memory_table()
        return True

    def test_serialized_object_ref_reference():
        @ray.remote
        def f(arg):
            time.sleep(1)

        a = ray.put(None)
        b = f.remote([a])  # Noqa F841

        wait_for_condition(memory_table_ready)
        memory_table = get_memory_table()
        summary = memory_table["summary"]
        assert summary["total_captured_in_objects"] == 0
        assert summary["total_pinned_in_memory"] == 0
        assert summary["total_used_by_pending_task"] == 1
        assert summary["total_local_ref_count"] == 2
        # Make sure the task f is done before moving on to the next test.
        # Otherwise, its pending-task entry would still show up in the
        # memory table while the next test is running.
        ray.get(b)
        stop_memory_table()
        return True

    def test_captured_object_ref_reference():
        a = ray.put(None)
        b = ray.put([a])  # Noqa F841
        del a

        wait_for_condition(memory_table_ready)
        memory_table = get_memory_table()
        summary = memory_table["summary"]
        assert summary["total_captured_in_objects"] == 1
        assert summary["total_pinned_in_memory"] == 0
        assert summary["total_used_by_pending_task"] == 0
        assert summary["total_local_ref_count"] == 1
        stop_memory_table()
        return True

    def test_actor_handle_reference():
        @ray.remote
        class Actor:
            pass

        a = Actor.remote()  # Noqa F841
        b = Actor.remote()  # Noqa F841
        c = Actor.remote()  # Noqa F841

        wait_for_condition(memory_table_ready)
        memory_table = get_memory_table()
        summary = memory_table["summary"]
        group = memory_table["group"]
        assert summary["total_captured_in_objects"] == 0
        assert summary["total_pinned_in_memory"] == 0
        assert summary["total_used_by_pending_task"] == 0
        assert summary["total_local_ref_count"] == 0
        assert summary["total_actor_handles"] == 3
        for table in group.values():
            for entry in table["entries"]:
                assert (entry["reference_type"] == ReferenceType.ACTOR_HANDLE)
        stop_memory_table()
        return True

    # These tests should be retried because it takes at least one second
    # to get a fresh memory table: the table is only refreshed when the
    # raylet and node info are renewed, which happens about once per second.
    wait_for_condition(test_local_reference,
                       timeout=30000,
                       retry_interval_ms=1000)

    wait_for_condition(test_object_pinned_in_memory,
                       timeout=30000,
                       retry_interval_ms=1000)

    wait_for_condition(test_pending_task_references,
                       timeout=30000,
                       retry_interval_ms=1000)

    wait_for_condition(test_serialized_object_ref_reference,
                       timeout=30000,
                       retry_interval_ms=1000)

    wait_for_condition(test_captured_object_ref_reference,
                       timeout=30000,
                       retry_interval_ms=1000)

    wait_for_condition(test_actor_handle_reference,
                       timeout=30000,
                       retry_interval_ms=1000)