Example #1
def start_connected_emptyhead_cluster():
    """Starts head with no resources."""

    cluster = Cluster(initialize_head=True,
                      connect=True,
                      head_node_args={
                          "num_cpus": 0,
                          "_internal_config": json.dumps(
                              {"num_heartbeats_timeout": 10})
                      })
    # Pytest doesn't play nicely with imports
    _register_all()
    yield cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
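The @pytest.fixture decorator appears to have been stripped when this generator was extracted. Below is a minimal sketch of how such a fixture would typically be declared and consumed; the decorator, the simplified head_node_args, and the test function are assumptions for illustration, not part of the original module.

import pytest
import ray
from ray.cluster_utils import Cluster


@pytest.fixture
def start_connected_emptyhead_cluster():
    """Starts a head node with no resources."""
    cluster = Cluster(initialize_head=True,
                      connect=True,
                      head_node_args={"num_cpus": 0})
    yield cluster
    # Everything after the yield runs as teardown once the test finishes.
    ray.shutdown()
    cluster.shutdown()


def test_uses_empty_head(start_connected_emptyhead_cluster):
    cluster = start_connected_emptyhead_cluster
    # The head has zero CPUs, so add a worker node before scheduling any work.
    cluster.add_node(num_cpus=2)
    cluster.wait_for_nodes()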
Example #2
def ray_start_combination(request):
    num_nodes = request.param[0]
    num_workers_per_scheduler = request.param[1]
    # Start the Ray processes.
    cluster = Cluster(initialize_head=True,
                      head_node_args={
                          "num_cpus": 10,
                          "redis_max_memory": 10**8
                      })
    for i in range(num_nodes - 1):
        cluster.add_node(num_cpus=10)
    ray.init(address=cluster.address)

    yield num_nodes, num_workers_per_scheduler, cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
Example #3
def test_raylet_socket_name(shutdown_only):
    sock1 = unix_socket_create_path("i_am_a_temp_socket_1")
    ray.init(raylet_socket_name=sock1)
    unix_socket_verify(sock1)
    ray.shutdown()
    try:
        unix_socket_delete(sock1)
    except OSError:
        pass  # It could have been removed by Ray.
    cluster = Cluster(True)
    sock2 = unix_socket_create_path("i_am_a_temp_socket_2")
    cluster.add_node(raylet_socket_name=sock2)
    unix_socket_verify(sock2)
    cluster.shutdown()
    try:
        unix_socket_delete(sock2)
    except OSError:
        pass  # It could have been removed by Ray.
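The unix_socket_create_path, unix_socket_verify, and unix_socket_delete helpers are defined elsewhere in the test module. A rough sketch of what they might look like, assumed for illustration rather than taken from the original implementations:

import os
import sys
import tempfile


def unix_socket_create_path(name):
    # Unix domain sockets are not supported on Windows.
    unix = sys.platform != "win32"
    return os.path.join(tempfile.gettempdir(), name) if unix else None


def unix_socket_verify(unix_socket):
    if sys.platform != "win32":
        assert os.path.exists(unix_socket), "Socket path not found."


def unix_socket_delete(unix_socket):
    if sys.platform != "win32":
        os.remove(unix_socket)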
Example #4
def start_connected_emptyhead_cluster():
    """Starts head with no resources."""
    cluster = Cluster(initialize_head=True,
                      connect=True,
                      head_node_args={
                          "num_cpus": 0,
                          "_system_config": {
                              "num_heartbeats_timeout": 10
                          }
                      })
    # Pytest doesn't play nicely with imports
    _register_all()
    register_trainable("__fake_remote", MockRemoteTrainer)
    register_trainable("__fake_durable", MockDurableTrainer)
    yield cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
Example #5
def test_ray_status_multinode():
    from ray.cluster_utils import Cluster
    cluster = Cluster()
    for _ in range(4):
        cluster.add_node(num_cpus=2)
    runner = CliRunner()

    def output_ready():
        result = runner.invoke(scripts.status)
        result.stdout
        return not result.exception and "memory" in result.output

    wait_for_condition(output_ready)

    result = runner.invoke(scripts.status, [])
    _check_output_via_pattern("test_ray_status_multinode.txt", result)
    ray.shutdown()
    cluster.shutdown()
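wait_for_condition here is the Ray test utility that repeatedly evaluates a predicate until it returns True or a timeout expires. A simplified sketch of the idea; the exact signature and defaults are assumptions:

import time


def wait_for_condition(condition_predictor, timeout=10, retry_interval_ms=100):
    """Poll condition_predictor until it returns True or the timeout expires."""
    start = time.time()
    while time.time() - start <= timeout:
        if condition_predictor():
            return
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("The condition was not met before the timeout expired.")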
Example #6
def test_temp_plasma_store_socket():
    ray.init(plasma_store_socket_name="/tmp/i_am_a_temp_socket")
    assert os.path.exists(
        "/tmp/i_am_a_temp_socket"), "Specified socket path not found."
    ray.shutdown()
    try:
        os.remove("/tmp/i_am_a_temp_socket")
    except OSError:
        pass  # It could have been removed by Ray.
    cluster = Cluster(True)
    cluster.add_node(plasma_store_socket_name="/tmp/i_am_a_temp_socket_2")
    assert os.path.exists(
        "/tmp/i_am_a_temp_socket_2"), "Specified socket path not found."
    cluster.shutdown()
    try:
        os.remove("/tmp/i_am_a_temp_socket_2")
    except OSError:
        pass  # It could have been removed by Ray.
Example #7
def ray_cluster():
    try:
        from ray.cluster_utils import Cluster
    except ModuleNotFoundError:
        from ray._private.cluster_utils import Cluster
    cluster = Cluster()
    remote_nodes = []
    num_nodes = 3
    for i in range(num_nodes):
        remote_nodes.append(cluster.add_node(num_cpus=10))
        if len(remote_nodes) == 1:
            ray.init(address=cluster.address)
    mo.setup_cluster(address_to_resources=TEST_ADDRESS_TO_RESOURCES)

    yield

    RayActorDriver.stop_cluster()
    ray.shutdown()
    cluster.shutdown()
Example #8
def test_ray_status_multinode():
    cluster = Cluster()
    for _ in range(4):
        cluster.add_node(num_cpus=2)
    runner = CliRunner()

    def output_ready():
        result = runner.invoke(scripts.status)
        result.stdout
        if not result.exception and "memory" in result.output:
            return True
        raise RuntimeError(f"result.exception={result.exception} "
                           f"result.output={result.output}")

    wait_for_condition(output_ready)

    result = runner.invoke(scripts.status, [])
    _check_output_via_pattern("test_ray_status_multinode.txt", result)
    ray.shutdown()
    cluster.shutdown()
Example #9
def run_multi_nodes():
    c = Cluster()
    c.add_node(num_cpus=4,
               object_store_memory=object_store_size,
               _system_config=system_config)
    ray.init(address=c.address)
    for _ in range(num_nodes - 1):  # subtract a head node.
        c.add_node(num_cpus=4, object_store_memory=object_store_size)
    c.wait_for_nodes()

    # Run shuffle.
    print(
        f"\n\nTest streaming shuffle with {num_nodes} nodes.\n"
        f"Shuffle size: {partition_size * num_partitions / 1024 / 1024 / 1024}"
        "GB")
    run_shuffle()
    time.sleep(5)
    display_spilling_info(c.address)
    ray.shutdown()
    c.shutdown()
    time.sleep(5)
Example #10
def test_spill_dir_cleanup_on_raylet_start(object_spilling_config):
    object_spilling_config, temp_folder = object_spilling_config
    cluster = Cluster()
    cluster.add_node(
        num_cpus=0,
        object_store_memory=75 * 1024 * 1024,
        _system_config={"object_spilling_config": object_spilling_config},
    )
    ray.init(address=cluster.address)
    node2 = cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)

    # This task will run on node 2 because node 1 has no CPU resource
    @ray.remote(num_cpus=1)
    def run_workload():
        ids = []
        for _ in range(2):
            arr = np.random.rand(5 * 1024 * 1024)  # 40 MB
            ids.append(ray.put(arr))
        return ids

    ids = ray.get(run_workload.remote())
    assert not is_dir_empty(temp_folder)

    # Kill node 2
    cluster.remove_node(node2)

    # Verify that the spill folder is not empty
    assert not is_dir_empty(temp_folder)

    # Start a new node
    cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)

    # Verify that the spill folder is now cleaned up
    assert is_dir_empty(temp_folder)

    # The refs were held until now to keep the objects alive; release them before shutdown.
    del ids
    ray.shutdown()
    cluster.shutdown()
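is_dir_empty is a helper defined in the spilling test module. A plausible sketch, where the signature and the spill subdirectory name are assumptions:

import os


def is_dir_empty(temp_folder, append_path="ray_spilled_objects"):
    # The spill directory may not exist yet if nothing has been spilled.
    spill_dir = os.path.join(str(temp_folder), append_path)
    if not os.path.exists(spill_dir):
        return True
    return len(os.listdir(spill_dir)) == 0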
Example #11
def main():
    cluster = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={"object_store_memory": 20 * 1024 * 1024 * 1024, "num_cpus": 16},
    )
    cluster.add_node(
        object_store_memory=20 * 1024 * 1024 * 1024, num_gpus=1, num_cpus=16
    )

    object_ref_list = []
    for i in range(0, 10):
        object_ref = ray.put(np.random.rand(1024 * 128, 1024))
        object_ref_list.append(object_ref)

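    # The head node was created without any GPUs, so the GPU requirement below
    # pins f to the worker node; each ray.get then pulls the 1 GiB object over
    # the network from the head node's object store.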
    @ray.remote(num_gpus=1)
    def f(object_ref_list):
        diffs = []
        for object_ref in object_ref_list:
            before = time.time()
            ray.get(object_ref)
            after = time.time()
            diffs.append(after - before)
            time.sleep(1)
        return np.mean(diffs), np.std(diffs)

    time_diff, time_diff_std = ray.get(f.remote(object_ref_list))

    print(
        "latency to get an 1G object over network",
        round(time_diff, 2),
        "+-",
        round(time_diff_std, 2),
    )

    ray.shutdown()
    cluster.shutdown()
Example #12
def test_cluster_handle_affinity():
    cluster = Cluster()
    # HACK: use two different IP addresses so that the placement constraint
    # for the resource check later will work.
    head_node = cluster.add_node(node_ip_address="127.0.0.1", num_cpus=4)
    cluster.add_node(node_ip_address="0.0.0.0", num_cpus=4)

    ray.init(head_node.address)

    # Make sure we have two nodes.
    node_ids = [n["NodeID"] for n in ray.nodes()]
    assert len(node_ids) == 2

    # Start the backend.
    client = serve.start(http_port=randint(10000, 30000), detached=True)
    client.create_backend("hi:v0", lambda _: "hi")
    client.create_endpoint("hi", backend="hi:v0")

    # Try to retrieve the handle from both head and worker node, check the
    # router's node id.
    @ray.remote
    def check_handle_router_id():
        client = serve.connect()
        handle = client.get_handle("hi")
        return get_node_id_for_actor(handle.router_handle)

    router_node_ids = ray.get([
        check_handle_router_id.options(resources={
            node_id: 0.01
        }).remote() for node_id in ray.state.node_ids()
    ])

    assert set(router_node_ids) == set(node_ids)

    # Clean up the nodes (otherwise Ray will segfault).
    ray.shutdown()
    cluster.shutdown()
Example #13
def ray_large_cluster():
    try:
        from ray.cluster_utils import Cluster
    except ModuleNotFoundError:
        from ray._private.cluster_utils import Cluster
    cluster = Cluster()
    remote_nodes = []
    num_nodes = 3
    for i in range(num_nodes):
        remote_nodes.append(cluster.add_node(num_cpus=10))
        if len(remote_nodes) == 1:
            ray.init(address=cluster.address)
    register_ray_serializers()
    try:
        yield
    finally:
        unregister_ray_serializers()
        Router.set_instance(None)
        RayServer.clear()
        ray.shutdown()
        cluster.shutdown()
        if 'COV_CORE_SOURCE' in os.environ:
            # Remove this when https://github.com/ray-project/ray/issues/16802 is fixed
            subprocess.check_call(["ray", "stop", "--force"])
Example #14
def _ray_start_cluster(**kwargs):
    init_kwargs = get_default_fixture_ray_kwargs()
    num_nodes = 0
    do_init = False
    # num_nodes & do_init are not arguments for ray.init, so delete them.
    if "num_nodes" in kwargs:
        num_nodes = kwargs["num_nodes"]
        del kwargs["num_nodes"]
    if "do_init" in kwargs:
        do_init = kwargs["do_init"]
        del kwargs["do_init"]
    elif num_nodes > 0:
        do_init = True
    init_kwargs.update(kwargs)
    cluster = Cluster()
    remote_nodes = []
    for _ in range(num_nodes):
        remote_nodes.append(cluster.add_node(**init_kwargs))
    if do_init:
        ray.init(address=cluster.address)
    yield cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
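In the Ray test suite, generators like _ray_start_cluster are usually exposed through thin pytest fixtures. A minimal sketch of such a wrapper; the fixture name and the use of indirect parametrization are assumptions:

import pytest


@pytest.fixture
def ray_start_cluster(request):
    # Optional indirect parametrization, e.g.
    # @pytest.mark.parametrize("ray_start_cluster", [{"num_nodes": 3}], indirect=True)
    param = getattr(request, "param", {})
    yield from _ray_start_cluster(**param)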
Example #15
class TrialRunnerPlacementGroupTest(unittest.TestCase):
    def setUp(self):
        os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "10000"
        self.head_cpus = 8
        self.head_gpus = 4
        self.head_custom = 16

        self.cluster = Cluster(initialize_head=True,
                               connect=True,
                               head_node_args={
                                   "include_dashboard": False,
                                   "num_cpus": self.head_cpus,
                                   "num_gpus": self.head_gpus,
                                   "resources": {
                                       "custom": self.head_custom
                                   },
                                   "_system_config": {
                                       "num_heartbeats_timeout": 10
                                   }
                               })
        # Pytest doesn't play nicely with imports
        _register_all()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()
        _register_all()  # re-register the evicted objects

    def _assertCleanup(self, trial_executor):
        # Assert proper cleanup
        pg_manager = trial_executor._pg_manager
        self.assertFalse(pg_manager._in_use_trials)
        self.assertFalse(pg_manager._in_use_pgs)
        self.assertFalse(pg_manager._staging_futures)
        for pgf in pg_manager._staging:
            self.assertFalse(pg_manager._staging[pgf])
        for pgf in pg_manager._ready:
            self.assertFalse(pg_manager._ready[pgf])
        self.assertTrue(pg_manager._latest_staging_start_time)

        num_non_removed_pgs = len([
            p for pid, p in placement_group_table().items()
            if p["state"] != "REMOVED"
        ])
        self.assertEqual(num_non_removed_pgs, 0)

    def testPlacementGroupRequests(self, reuse_actors=False, scheduled=10):
        """In this test we try to start 10 trials but only have resources
        for 2. Placement groups should still be created and PENDING.

        Eventually they should be scheduled sequentially (i.e. two at a
        time)."""
        def train(config):
            time.sleep(1)
            now = time.time()
            tune.report(end=now - config["start_time"])

        head_bundle = {"CPU": 4, "GPU": 0, "custom": 0}
        child_bundle = {"custom": 1}

        placement_group_factory = PlacementGroupFactory(
            [head_bundle, child_bundle, child_bundle])

        trial_executor = RayTrialExecutor(reuse_actors=reuse_actors)

        this = self

        class _TestCallback(Callback):
            def on_step_end(self, iteration, trials, **info):
                num_finished = len([
                    t for t in trials
                    if t.status == Trial.TERMINATED or t.status == Trial.ERROR
                ])

                num_staging = sum(
                    len(s)
                    for s in trial_executor._pg_manager._staging.values())
                num_ready = sum(
                    len(s) for s in trial_executor._pg_manager._ready.values())
                num_in_use = len(trial_executor._pg_manager._in_use_pgs)
                num_cached = len(trial_executor._pg_manager._cached_pgs)

                total_num_tracked = num_staging + num_ready + \
                    num_in_use + num_cached

                num_non_removed_pgs = len([
                    p for pid, p in placement_group_table().items()
                    if p["state"] != "REMOVED"
                ])
                num_removal_scheduled_pgs = len(
                    trial_executor._pg_manager._pgs_for_removal)

                # All trials should be scheduled
                this.assertEqual(scheduled,
                                 min(scheduled, len(trials)),
                                 msg=f"Num trials iter {iteration}")
                # The number of PGs should decrease when trials finish
                this.assertEqual(max(scheduled, len(trials)) - num_finished,
                                 total_num_tracked,
                                 msg=f"Num tracked iter {iteration}")
                # The number of actual placement groups should match this
                this.assertEqual(max(scheduled, len(trials)) - num_finished,
                                 num_non_removed_pgs -
                                 num_removal_scheduled_pgs,
                                 msg=f"Num actual iter {iteration}")

        start = time.time()
        out = tune.run(train,
                       config={"start_time": start},
                       resources_per_trial=placement_group_factory,
                       num_samples=10,
                       trial_executor=trial_executor,
                       callbacks=[_TestCallback()],
                       reuse_actors=reuse_actors,
                       verbose=2)

        trial_end_times = sorted(t.last_result["end"] for t in out.trials)
        print("Trial end times:", trial_end_times)
        max_diff = trial_end_times[-1] - trial_end_times[0]

        # Not all trials have been run in parallel
        self.assertGreater(max_diff, 3)

        # Some trials should have run in parallel
        # Todo: Re-enable when using buildkite
        # self.assertLess(max_diff, 10)

        self._assertCleanup(trial_executor)

    def testPlacementGroupRequestsWithActorReuse(self):
        """Assert that reuse actors doesn't leak placement groups"""
        self.testPlacementGroupRequests(reuse_actors=True)

    @patch("ray.tune.trial_runner.TUNE_MAX_PENDING_TRIALS_PG", 6)
    @patch("ray.tune.utils.placement_groups.TUNE_MAX_PENDING_TRIALS_PG", 6)
    def testPlacementGroupLimitedRequests(self):
        """Assert that maximum number of placement groups is enforced."""
        self.testPlacementGroupRequests(scheduled=6)

    @patch("ray.tune.trial_runner.TUNE_MAX_PENDING_TRIALS_PG", 6)
    @patch("ray.tune.utils.placement_groups.TUNE_MAX_PENDING_TRIALS_PG", 6)
    def testPlacementGroupLimitedRequestsWithActorReuse(self):
        self.testPlacementGroupRequests(reuse_actors=True, scheduled=6)

    def testPlacementGroupDistributedTraining(self, reuse_actors=False):
        """Run distributed training using placement groups.

        Each trial requests 4 CPUs and starts 4 remote training workers.
        """

        head_bundle = {"CPU": 1, "GPU": 0, "custom": 0}
        child_bundle = {"CPU": 1}

        placement_group_factory = PlacementGroupFactory(
            [head_bundle, child_bundle, child_bundle, child_bundle])

        @ray.remote
        class TrainingActor:
            def train(self, val):
                time.sleep(1)
                return val

        def train(config):
            base = config["base"]
            actors = [TrainingActor.remote() for _ in range(4)]
            futures = [
                actor.train.remote(base + 2 * i)
                for i, actor in enumerate(actors)
            ]
            results = ray.get(futures)

            end = time.time() - config["start_time"]
            tune.report(avg=np.mean(results), end=end)

        trial_executor = RayTrialExecutor(reuse_actors=reuse_actors)

        start = time.time()
        out = tune.run(train,
                       config={
                           "start_time": start,
                           "base": tune.grid_search(list(range(0, 100, 10)))
                       },
                       resources_per_trial=placement_group_factory,
                       num_samples=1,
                       trial_executor=trial_executor,
                       reuse_actors=reuse_actors,
                       verbose=2)

        avgs = sorted(t.last_result["avg"] for t in out.trials)
        self.assertSequenceEqual(avgs, list(range(3, 103, 10)))

        trial_end_times = sorted(t.last_result["end"] for t in out.trials)
        print("Trial end times:", trial_end_times)
        max_diff = trial_end_times[-1] - trial_end_times[0]

        # Not all trials have been run in parallel
        self.assertGreater(max_diff, 3)

        # Some trials should have run in parallel
        # Todo: Re-enable when using buildkite
        # self.assertLess(max_diff, 10)

        self._assertCleanup(trial_executor)

    def testPlacementGroupDistributedTrainingWithActorReuse(self):
        self.testPlacementGroupDistributedTraining(reuse_actors=True)
Example #16
def ray_cluster():
    cluster = Cluster()
    yield cluster
    serve.shutdown()
    ray.shutdown()
    cluster.shutdown()
Example #17
class RayExecutorQueueTest(unittest.TestCase):
    def setUp(self):
        self.cluster = Cluster(initialize_head=True,
                               connect=True,
                               head_node_args={
                                   "num_cpus": 1,
                                   "_system_config": {
                                       "num_heartbeats_timeout": 10
                                   }
                               })
        self.trial_executor = RayTrialExecutor(queue_trials=True,
                                               refresh_period=0)
        # Pytest doesn't play nicely with imports
        _register_all()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()
        _register_all()  # re-register the evicted objects

    def testQueueTrial(self):
        """Tests that reset handles NotImplemented properly."""
        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        cpu_only = create_trial(1, 0)
        self.assertTrue(self.trial_executor.has_resources_for_trial(cpu_only))
        self.trial_executor.start_trial(cpu_only)

        gpu_only = create_trial(0, 1)
        self.assertTrue(self.trial_executor.has_resources_for_trial(gpu_only))

    def testHeadBlocking(self):
        # Once resource requests are deprecated, remove this test
        os.environ["TUNE_PLACEMENT_GROUP_AUTO_DISABLED"] = "1"

        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        gpu_trial = create_trial(1, 1)
        self.assertTrue(self.trial_executor.has_resources_for_trial(gpu_trial))
        self.trial_executor.start_trial(gpu_trial)

        # TODO(rliaw): This behavior is probably undesirable, but right now
        #  trials with different resource requirements are not often used.
        cpu_only_trial = create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources_for_trial(cpu_only_trial))

        self.cluster.add_node(num_cpus=1, num_gpus=1)
        self.cluster.wait_for_nodes()

        self.assertTrue(
            self.trial_executor.has_resources_for_trial(cpu_only_trial))
        self.trial_executor.start_trial(cpu_only_trial)

        cpu_only_trial2 = create_trial(1, 0)
        self.assertTrue(
            self.trial_executor.has_resources_for_trial(cpu_only_trial2))
        self.trial_executor.start_trial(cpu_only_trial2)

        cpu_only_trial3 = create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources_for_trial(cpu_only_trial3))
Example #18
class RayExecutorPlacementGroupTest(unittest.TestCase):
    def setUp(self):
        self.head_cpus = 8
        self.head_gpus = 4
        self.head_custom = 16

        self.cluster = Cluster(
            initialize_head=True,
            connect=True,
            head_node_args={
                "num_cpus": self.head_cpus,
                "num_gpus": self.head_gpus,
                "resources": {"custom": self.head_custom},
                "_system_config": {"num_heartbeats_timeout": 10},
            },
        )
        # Pytest doesn't play nicely with imports
        _register_all()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()
        _register_all()  # re-register the evicted objects

    def testResourcesAvailableWithPlacementGroup(self):
        def train(config):
            tune.report(metric=0, resources=ray.available_resources())

        head_bundle = {"CPU": 1, "GPU": 0, "custom": 4}
        child_bundle = {"CPU": 2, "GPU": 1, "custom": 3}

        placement_group_factory = PlacementGroupFactory(
            [head_bundle, child_bundle, child_bundle]
        )

        out = tune.run(train, resources_per_trial=placement_group_factory)

        available = {
            key: val
            for key, val in out.trials[0].last_result["resources"].items()
            if key in ["CPU", "GPU", "custom"]
        }

        if not available:
            self.skipTest(
                "Warning: Ray reported no available resources, "
                "but this is an error on the Ray core side. "
                "Skipping this test for now."
            )

        self.assertDictEqual(
            available,
            {
                "CPU": self.head_cpus - 5.0,
                "GPU": self.head_gpus - 2.0,
                "custom": self.head_custom - 10.0,
            },
        )

    def testPlacementGroupFactoryEquality(self):
        """
        Test that two different placement group factory objects are considered
        equal and evaluate to the same hash.
        """
        from collections import Counter

        pgf_1 = PlacementGroupFactory(
            [{"CPU": 2, "GPU": 4, "custom": 7}, {"GPU": 2, "custom": 1, "CPU": 3}],
            "PACK",
            "no_name",
            None,
        )

        pgf_2 = PlacementGroupFactory(
            [
                {
                    "custom": 7,
                    "GPU": 4,
                    "CPU": 2,
                },
                {"custom": 1, "GPU": 2, "CPU": 3},
            ],
            strategy="PACK",
            name="no_name",
            lifetime=None,
        )

        pgf_3 = PlacementGroupFactory(
            [
                {"custom": 7, "GPU": 4, "CPU": 2.0, "custom2": 0},
                {"custom": 1.0, "GPU": 2, "CPU": 3, "custom2": 0},
            ],
            strategy="PACK",
            name="no_name",
            lifetime=None,
        )

        self.assertEqual(pgf_1, pgf_2)
        self.assertEqual(pgf_2, pgf_3)

        # Hash testing
        counter = Counter()
        counter[pgf_1] += 1
        counter[pgf_2] += 1
        counter[pgf_3] += 1

        self.assertEqual(counter[pgf_1], 3)
        self.assertEqual(counter[pgf_2], 3)
        self.assertEqual(counter[pgf_3], 3)

    def testHasResourcesForTrialWithCaching(self):
        pgm = _PlacementGroupManager()
        pgf1 = PlacementGroupFactory([{"CPU": self.head_cpus}])
        pgf2 = PlacementGroupFactory([{"CPU": self.head_cpus - 1}])

        executor = RayTrialExecutor(reuse_actors=True)
        executor._pg_manager = pgm
        executor.set_max_pending_trials(1)

        def train(config):
            yield 1
            yield 2
            yield 3
            yield 4

        register_trainable("resettable", train)

        trial1 = Trial("resettable", placement_group_factory=pgf1)
        trial2 = Trial("resettable", placement_group_factory=pgf1)
        trial3 = Trial("resettable", placement_group_factory=pgf2)

        assert executor.has_resources_for_trial(trial1)
        assert executor.has_resources_for_trial(trial2)
        assert executor.has_resources_for_trial(trial3)

        executor._stage_and_update_status([trial1, trial2, trial3])

        while not pgm.has_ready(trial1):
            time.sleep(1)
            executor._stage_and_update_status([trial1, trial2, trial3])

        # Fill staging
        executor._stage_and_update_status([trial1, trial2, trial3])

        assert executor.has_resources_for_trial(trial1)
        assert executor.has_resources_for_trial(trial2)
        assert not executor.has_resources_for_trial(trial3)

        executor._start_trial(trial1)
        executor._stage_and_update_status([trial1, trial2, trial3])
        executor.pause_trial(trial1)  # Caches the PG and removes a PG from staging

        assert len(pgm._staging_futures) == 0

        # This will re-schedule a placement group
        pgm.reconcile_placement_groups([trial1, trial2])

        assert len(pgm._staging_futures) == 1

        assert not pgm.can_stage()

        # We should still have resources for this trial as it has a cached PG
        assert executor.has_resources_for_trial(trial1)
        assert executor.has_resources_for_trial(trial2)
        assert not executor.has_resources_for_trial(trial3)
Example #19
def test_multiple_routers():
    cluster = Cluster()
    head_node = cluster.add_node()
    cluster.add_node()

    ray.init(head_node.address)
    node_ids = ray.state.node_ids()
    assert len(node_ids) == 2
    client = serve.start(http_port=8005)  # noqa: F841

    def get_proxy_names():
        proxy_names = []
        for node_id, _ in get_all_node_ids():
            proxy_names.append(
                format_actor_name(SERVE_PROXY_NAME, client._controller_name,
                                  node_id))
        return proxy_names

    wait_for_condition(lambda: len(get_proxy_names()) == 2)
    proxy_names = get_proxy_names()

    # Two actors should be started.
    def get_first_two_actors():
        try:
            ray.get_actor(proxy_names[0])
            ray.get_actor(proxy_names[1])
            return True
        except ValueError:
            return False

    wait_for_condition(get_first_two_actors)

    # Wait for the actors to come up.
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Kill one of the servers, the HTTP server should still function.
    ray.kill(ray.get_actor(get_proxy_names()[0]), no_restart=True)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Add a new node to the cluster. This should trigger a new router to get
    # started.
    new_node = cluster.add_node()

    wait_for_condition(lambda: len(get_proxy_names()) == 3)
    third_proxy = get_proxy_names()[2]

    def get_third_actor():
        try:
            ray.get_actor(third_proxy)
            return True
        # IndexError covers the case where cluster resources aren't updated yet.
        except (IndexError, ValueError):
            return False

    wait_for_condition(get_third_actor)

    # Remove the newly-added node from the cluster. The corresponding actor
    # should be removed as well.
    cluster.remove_node(new_node)

    def third_actor_removed():
        try:
            ray.get_actor(third_proxy)
            return False
        except ValueError:
            return True

    # Check that the actor is gone and the HTTP server still functions.
    wait_for_condition(third_actor_removed)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Clean up the nodes (otherwise Ray will segfault).
    ray.shutdown()
    cluster.shutdown()
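block_until_http_ready is a Serve test helper that polls an HTTP endpoint from a remote task until it responds. A simplified sketch of the idea; the signature and defaults are assumptions:

import time

import ray
import requests


@ray.remote
def block_until_http_ready(http_endpoint, backoff_time_s=1, timeout=60):
    start = time.time()
    while True:
        try:
            if requests.get(http_endpoint).status_code == 200:
                return
        except Exception:
            pass
        if time.time() - start > timeout:
            raise TimeoutError(f"{http_endpoint} not ready after {timeout}s.")
        time.sleep(backoff_time_s)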
Example #20
class TrialRunnerPlacementGroupTest(unittest.TestCase):
    def setUp(self):
        os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "10000"
        self.head_cpus = 8
        self.head_gpus = 4
        self.head_custom = 16

        self.cluster = Cluster(initialize_head=True,
                               connect=True,
                               head_node_args={
                                   "num_cpus": self.head_cpus,
                                   "num_gpus": self.head_gpus,
                                   "resources": {
                                       "custom": self.head_custom
                                   },
                                   "_system_config": {
                                       "num_heartbeats_timeout": 10
                                   }
                               })
        # Pytest doesn't play nicely with imports
        _register_all()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()
        _register_all()  # re-register the evicted objects

    def testPlacementGroupRequests(self, scheduled=10):
        """In this test we try to start 10 trials but only have resources
        for 2. Placement groups should still be created and PENDING.

        Eventually they should be scheduled sequentially (i.e. two at a
        time)."""
        def train(config):
            time.sleep(1)
            now = time.time()
            tune.report(end=now - config["start_time"])

        def placement_group_factory():
            head_bundle = {"CPU": 4, "GPU": 0, "custom": 0}
            child_bundle = {"custom": 1}

            return placement_group([head_bundle, child_bundle, child_bundle])

        trial_executor = RayTrialExecutor()

        this = self

        class _TestCallback(Callback):
            def on_step_end(self, iteration, trials, **info):
                if iteration == 1:
                    this.assertEqual(scheduled, len(trials))
                    this.assertEqual(
                        scheduled,
                        sum(
                            len(s) for s in
                            trial_executor._pg_manager._staging.values()) +
                        sum(
                            len(s) for s in
                            trial_executor._pg_manager._ready.values()) +
                        len(trial_executor._pg_manager._in_use_pgs))

        start = time.time()
        out = tune.run(train,
                       config={"start_time": start},
                       resources_per_trial=placement_group_factory,
                       num_samples=10,
                       trial_executor=trial_executor,
                       callbacks=[_TestCallback()])

        trial_end_times = sorted(t.last_result["end"] for t in out.trials)
        print("Trial end times:", trial_end_times)
        max_diff = trial_end_times[-1] - trial_end_times[0]

        # Not all trials have been run in parallel
        self.assertGreater(max_diff, 5)

        # Some trials should have run in parallel
        self.assertLess(max_diff, 10)

    @patch("ray.tune.trial_runner.TUNE_MAX_PENDING_TRIALS_PG", 6)
    @patch("ray.tune.utils.placement_groups.TUNE_MAX_PENDING_TRIALS_PG", 6)
    def testPlacementGroupLimitedRequests(self):
        """Assert that maximum number of placement groups is enforced."""
        self.testPlacementGroupRequests(scheduled=6)

    def testPlacementGroupDistributedTraining(self):
        """Run distributed training using placement groups.

        Each trial requests 4 CPUs and starts 4 remote training workers.
        """
        def placement_group_factory():
            head_bundle = {"CPU": 1, "GPU": 0, "custom": 0}
            child_bundle = {"CPU": 1}

            return placement_group(
                [head_bundle, child_bundle, child_bundle, child_bundle])

        @ray.remote
        class TrainingActor:
            def train(self, val):
                time.sleep(1)
                return val

        def train(config):
            base = config["base"]
            actors = [TrainingActor.remote() for _ in range(4)]
            futures = [
                actor.train.remote(base + 2 * i)
                for i, actor in enumerate(actors)
            ]
            results = ray.get(futures)

            end = time.time() - config["start_time"]
            tune.report(avg=np.mean(results), end=end)

        trial_executor = RayTrialExecutor()

        start = time.time()
        out = tune.run(train,
                       config={
                           "start_time": start,
                           "base": tune.grid_search(list(range(0, 100, 10)))
                       },
                       resources_per_trial=placement_group_factory,
                       num_samples=1,
                       trial_executor=trial_executor)

        avgs = sorted(t.last_result["avg"] for t in out.trials)
        self.assertSequenceEqual(avgs, list(range(3, 103, 10)))

        trial_end_times = sorted(t.last_result["end"] for t in out.trials)
        print("Trial end times:", trial_end_times)
        max_diff = trial_end_times[-1] - trial_end_times[0]

        # Not all trials have been run in parallel
        self.assertGreater(max_diff, 5)

        # Some trials should have run in parallel
        # Todo: Re-enable when using buildkite
        # self.assertLess(max_diff, 10)

        # Assert proper cleanup
        pg_manager = trial_executor._pg_manager
        self.assertFalse(pg_manager._in_use_trials)
        self.assertFalse(pg_manager._in_use_pgs)
        self.assertFalse(pg_manager._staging_futures)
        for pgf in pg_manager._staging:
            self.assertFalse(pg_manager._staging[pgf])
        for pgf in pg_manager._ready:
            self.assertFalse(pg_manager._ready[pgf])
        self.assertTrue(pg_manager._latest_staging_start_time)
Example #21
def test_shutdown():
    g = Cluster(initialize_head=False)
    node = g.add_node()
    node2 = g.add_node()
    g.shutdown()
    assert not any(n.any_processes_alive() for n in [node, node2])
Example #22
class RayExecutorPlacementGroupTest(unittest.TestCase):
    def setUp(self):
        self.head_cpus = 8
        self.head_gpus = 4
        self.head_custom = 16

        self.cluster = Cluster(initialize_head=True,
                               connect=True,
                               head_node_args={
                                   "num_cpus": self.head_cpus,
                                   "num_gpus": self.head_gpus,
                                   "resources": {
                                       "custom": self.head_custom
                                   },
                                   "_system_config": {
                                       "num_heartbeats_timeout": 10
                                   }
                               })
        # Pytest doesn't play nicely with imports
        _register_all()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()
        _register_all()  # re-register the evicted objects

    def testResourcesAvailableNoPlacementGroup(self):
        def train(config):
            tune.report(metric=0, resources=ray.available_resources())

        out = tune.run(train,
                       resources_per_trial={
                           "cpu": 1,
                           "gpu": 1,
                           "custom_resources": {
                               "custom": 3
                           },
                           "extra_cpu": 3,
                           "extra_gpu": 1,
                           "extra_custom_resources": {
                               "custom": 4
                           },
                       })

        # Only `cpu`, `gpu`, and `custom_resources` are actually reserved;
        # the extra_* entries are just reserved internally by Tune.
        self.assertDictEqual(
            {
                key: val
                for key, val in out.trials[0].last_result["resources"].items()
                if key in ["CPU", "GPU", "custom"]
            }, {
                "CPU": self.head_cpus - 1.0,
                "GPU": self.head_gpus - 1.0,
                "custom": self.head_custom - 3.0
            })

    def testResourcesAvailableWithPlacementGroup(self):
        def train(config):
            tune.report(metric=0, resources=ray.available_resources())

        def placement_group_factory():
            head_bundle = {"CPU": 1, "GPU": 0, "custom": 4}
            child_bundle = {"CPU": 2, "GPU": 1, "custom": 3}

            return placement_group([head_bundle, child_bundle, child_bundle])

        out = tune.run(train, resources_per_trial=placement_group_factory)

        self.assertDictEqual(
            {
                key: val
                for key, val in out.trials[0].last_result["resources"].items()
                if key in ["CPU", "GPU", "custom"]
            }, {
                "CPU": self.head_cpus - 5.0,
                "GPU": self.head_gpus - 2.0,
                "custom": self.head_custom - 10.0
            })
Example #23
class RayExecutorPlacementGroupTest(unittest.TestCase):
    def setUp(self):
        self.head_cpus = 8
        self.head_gpus = 4
        self.head_custom = 16

        self.cluster = Cluster(
            initialize_head=True,
            connect=True,
            head_node_args={
                "num_cpus": self.head_cpus,
                "num_gpus": self.head_gpus,
                "resources": {
                    "custom": self.head_custom
                },
                "_system_config": {
                    "num_heartbeats_timeout": 10
                },
            },
        )
        # Pytest doesn't play nicely with imports
        _register_all()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()
        _register_all()  # re-register the evicted objects

    def testResourcesAvailableWithPlacementGroup(self):
        def train(config):
            tune.report(metric=0, resources=ray.available_resources())

        head_bundle = {"CPU": 1, "GPU": 0, "custom": 4}
        child_bundle = {"CPU": 2, "GPU": 1, "custom": 3}

        placement_group_factory = PlacementGroupFactory(
            [head_bundle, child_bundle, child_bundle])

        out = tune.run(train, resources_per_trial=placement_group_factory)

        available = {
            key: val
            for key, val in out.trials[0].last_result["resources"].items()
            if key in ["CPU", "GPU", "custom"]
        }

        if not available:
            self.skipTest("Warning: Ray reported no available resources, "
                          "but this is an error on the Ray core side. "
                          "Skipping this test for now.")

        self.assertDictEqual(
            available,
            {
                "CPU": self.head_cpus - 5.0,
                "GPU": self.head_gpus - 2.0,
                "custom": self.head_custom - 10.0,
            },
        )

    def testPlacementGroupFactoryEquality(self):
        """
        Test that two different placement group factory objects are considered
        equal and evaluate to the same hash.
        """
        from collections import Counter

        pgf_1 = PlacementGroupFactory(
            [{
                "CPU": 2,
                "GPU": 4,
                "custom": 7
            }, {
                "GPU": 2,
                "custom": 1,
                "CPU": 3
            }],
            "PACK",
            "no_name",
            None,
        )

        pgf_2 = PlacementGroupFactory(
            [
                {
                    "custom": 7,
                    "GPU": 4,
                    "CPU": 2,
                },
                {
                    "custom": 1,
                    "GPU": 2,
                    "CPU": 3
                },
            ],
            strategy="PACK",
            name="no_name",
            lifetime=None,
        )

        pgf_3 = PlacementGroupFactory(
            [
                {
                    "custom": 7,
                    "GPU": 4,
                    "CPU": 2.0,
                    "custom2": 0
                },
                {
                    "custom": 1.0,
                    "GPU": 2,
                    "CPU": 3,
                    "custom2": 0
                },
            ],
            strategy="PACK",
            name="no_name",
            lifetime=None,
        )

        self.assertEqual(pgf_1, pgf_2)
        self.assertEqual(pgf_2, pgf_3)

        # Hash testing
        counter = Counter()
        counter[pgf_1] += 1
        counter[pgf_2] += 1
        counter[pgf_3] += 1

        self.assertEqual(counter[pgf_1], 3)
        self.assertEqual(counter[pgf_2], 3)
        self.assertEqual(counter[pgf_3], 3)
Example #24
def test_multiple_routers():
    cluster = Cluster()
    head_node = cluster.add_node()
    cluster.add_node()

    ray.init(head_node.address)
    node_ids = ray.state.node_ids()
    assert len(node_ids) == 2
    serve.init(http_port=8005)

    def actor_name(index):
        return SERVE_PROXY_NAME + "-{}-{}".format(node_ids[0], index)

    # Two actors should be started.
    def get_first_two_actors():
        try:
            ray.get_actor(actor_name(0))
            ray.get_actor(actor_name(1))
            return True
        except ValueError:
            return False

    wait_for_condition(get_first_two_actors)

    # Wait for the actors to come up.
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Kill one of the servers, the HTTP server should still function.
    ray.kill(ray.get_actor(actor_name(0)), no_restart=True)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Add a new node to the cluster. This should trigger a new router to get
    # started.
    new_node = cluster.add_node()

    def get_third_actor():
        try:
            ray.get_actor(actor_name(2))
            return True
        except ValueError:
            return False

    wait_for_condition(get_third_actor)

    # Remove the newly-added node from the cluster. The corresponding actor
    # should be removed as well.
    cluster.remove_node(new_node)

    def third_actor_removed():
        try:
            ray.get_actor(actor_name(2))
            return False
        except ValueError:
            return True

    # Check that the actor is gone and the HTTP server still functions.
    wait_for_condition(third_actor_removed)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Clean up the nodes (otherwise Ray will segfault).
    ray.shutdown()
    cluster.shutdown()