    def testPlacementGroupLoad(self):
        lm = LoadMetrics()
        pending_placement_groups = [
            PlacementGroupTableData(
                state=PlacementGroupTableData.RESCHEDULING,
                strategy=PlacementStrategy.PACK,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
            PlacementGroupTableData(
                state=PlacementGroupTableData.RESCHEDULING,
                strategy=PlacementStrategy.SPREAD,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
        ]
        lm.update("1.1.1.1", {}, {}, {},
                  pending_placement_groups=pending_placement_groups)
        assert lm.get_pending_placement_groups() == pending_placement_groups
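
Throughout these examples, LoadMetrics.update takes the node IP, its static resources, its currently available resources, and a resource-load dict positionally; bundles and pending placement groups are passed as keyword arguments. A minimal sketch of how pending groups flatten into ordinary per-bundle demand shapes (illustrative, not Ray's scheduler code):

    def bundle_demands(placement_groups):
        # Each bundle's unit_resources is a protobuf map; copying it into
        # a plain dict yields a demand shape such as {"GPU": 2}.
        return [
            dict(bundle.unit_resources)
            for pg in placement_groups
            for bundle in pg.bundles
        ]

Applied to the two groups above, this yields four {"GPU": 2} shapes.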
Example #2
    def test_strategies(self):
        provider = MockProvider()
        scheduler = ResourceDemandScheduler(provider, TYPES_A, 10)

        provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 2)
        # At this point our cluster has 2 p2.8xlarge instances (16 GPUs) and is
        # fully idle.
        nodes = provider.non_terminated_nodes({})

        resource_demands = [{"GPU": 4}] * 2
        pending_placement_groups = [
            # Requires a new node (only uses 2 GPUs on it though).
            PlacementGroupTableData(state=PlacementGroupTableData.PENDING,
                                    strategy=PlacementStrategy.STRICT_SPREAD,
                                    bundles=[
                                        Bundle(unit_resources={"GPU": 2}),
                                        Bundle(unit_resources={"GPU": 2}),
                                        Bundle(unit_resources={"GPU": 2})
                                    ]),
            # Requires a new node (uses the whole node).
            PlacementGroupTableData(
                state=PlacementGroupTableData.PENDING,
                strategy=PlacementStrategy.STRICT_PACK,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 4)),
            # Fits across the machines that the STRICT_SPREAD group runs on.
            PlacementGroupTableData(
                state=PlacementGroupTableData.PENDING,
                strategy=PlacementStrategy.PACK,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
            # Fits across the machines that the STRICT_SPREAD group runs on.
            PlacementGroupTableData(
                state=PlacementGroupTableData.PENDING,
                strategy=PlacementStrategy.SPREAD,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
        ]
        to_launch = scheduler.get_nodes_to_launch(nodes, {}, resource_demands,
                                                  {}, pending_placement_groups)
        assert to_launch == {"p2.8xlarge": 2}
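
Per the inline comments, the STRICT_SPREAD group forces one new node (its three bundles need three distinct machines and only two exist) and the STRICT_PACK group forces a second (its 4 x 2 = 8 GPUs must land on a single machine), while the PACK and SPREAD groups fit on capacity the strict spread already occupies. A rough sketch of that accounting (illustrative arithmetic, not the scheduler's algorithm):

    strict_spread_new = 3 - 2      # 3 distinct nodes needed, 2 exist
    strict_pack_new = 1            # 4 bundles x 2 GPUs fill one whole node
    assert strict_spread_new + strict_pack_new == 2   # {"p2.8xlarge": 2}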
Example #3
    def test_packing(self):
        provider = MockProvider()
        scheduler = ResourceDemandScheduler(provider, TYPES_A, 10)

        provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 1)
        # At this point our cluster has 1 p2.8xlarge instance (8 GPUs) and is
        # fully idle.
        nodes = provider.non_terminated_nodes({})

        resource_demands = [{"GPU": 1}] * 2
        pending_placement_groups = [
            PlacementGroupTableData(
                state=PlacementGroupTableData.PENDING,
                strategy=PlacementStrategy.STRICT_PACK,
                bundles=[Bundle(unit_resources={"GPU": 2})] * 3),
        ]
        # The 2 resource demand GPUs should still be packed onto the same node
        # as the 6 GPU placement group.
        to_launch = scheduler.get_nodes_to_launch(nodes, {}, resource_demands,
                                                  {}, pending_placement_groups)
        assert to_launch == {}
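
The empty result is plain capacity math: the STRICT_PACK group consumes 6 GPUs on the single idle node and the two 1-GPU demands fill it to exactly 8, so nothing new is needed. A sketch of the check (illustrative, not the scheduler's code):

    node_gpus = 8                  # one idle p2.8xlarge
    strict_pack_gpus = 2 * 3       # 3 bundles x 2 GPUs, all on one node
    demand_gpus = 1 * 2
    assert strict_pack_gpus + demand_gpus <= node_gpus   # launch nothing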
Example #4
    def run_autoscaler(self):
        waiting_bundles = []
        infeasible_bundles = []
        placement_groups = []
        for work in self.work_queue:
            if isinstance(work, Task):
                shape = work.resources
                if self._infeasible(shape):
                    infeasible_bundles.append(shape)
                else:
                    waiting_bundles.append(shape)
            if isinstance(work, PlacementGroup):
                placement_groups.append(
                    PlacementGroupTableData(
                        state=PlacementGroupTableData.PENDING,
                        strategy=work.strategy,
                        bundles=[
                            Bundle(unit_resources=bundle)
                            for bundle in work.bundles
                        ],
                    ))

        for ip, node in self.ip_to_nodes.items():
            if not node.in_cluster:
                continue
            self.load_metrics.update(
                ip=ip,
                static_resources=node.total_resources,
                dynamic_resources=node.available_resources,
                resource_load={},
                waiting_bundles=waiting_bundles,
                infeasible_bundles=infeasible_bundles,
                pending_placement_groups=placement_groups,
            )

        self.autoscaler.update()
        self._launch_nodes()
        self._update_cluster_state()
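
The loop above relies on an _infeasible predicate to split task shapes into waiting and infeasible bundles. A hypothetical implementation (the check below is an assumption for illustration; the harness's actual logic is not shown here) compares a shape against each known node's total resources:

    def _infeasible(self, shape):
        # Hypothetical check: a shape is feasible if at least one known
        # node could satisfy every resource it requests at full capacity.
        for node in self.ip_to_nodes.values():
            if all(node.total_resources.get(key, 0) >= amount
                   for key, amount in shape.items()):
                return False
        return True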
Example #5
    def test_many_strict_spreads(self):
        provider = MockProvider()
        scheduler = ResourceDemandScheduler(provider, TYPES_A, 10)

        provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 2)
        # At this point our cluster has 2 p2.8xlarge instances (16 GPUs) and is
        # fully idle.
        nodes = provider.non_terminated_nodes({})

        resource_demands = [{"GPU": 1}] * 6
        pending_placement_groups = [
            # Requires a new node (only uses 2 GPUs on it though).
            PlacementGroupTableData(
                state=PlacementGroupTableData.PENDING,
                strategy=PlacementStrategy.STRICT_SPREAD,
                bundles=[Bundle(unit_resources={"GPU": 2})] * 3),
        ]
        # Each placement group will take up 2 GPUs per node, but the distinct
        # placement groups should still reuse the same nodes.
        pending_placement_groups = pending_placement_groups * 3
        to_launch = scheduler.get_nodes_to_launch(nodes, {}, resource_demands,
                                                  {}, pending_placement_groups)
        assert to_launch == {"p2.8xlarge": 1}
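
A single extra node suffices because the three identical STRICT_SPREAD groups may share machines with each other; only the bundles within one group must land on distinct nodes, so three nodes satisfy every group's constraint and also cover the raw GPU totals. Sketch of the arithmetic (illustrative only):

    pg_gpus = 3 * 3 * 2          # 3 groups x 3 bundles x 2 GPUs = 18
    demand_gpus = 6 * 1
    existing_gpus = 2 * 8        # two idle p2.8xlarge
    shortfall = pg_gpus + demand_gpus - existing_gpus    # 8 GPUs short
    assert shortfall <= 8        # one more p2.8xlarge covers it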
Example #6
    def testPlacementGroup(self):
        # Note this is mostly an integration test. See
        # testPlacementGroupScaling for more comprehensive tests.
        config = copy.deepcopy(MULTI_WORKER_CLUSTER)
        config["min_workers"] = 0
        config["max_workers"] = 999
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        lm = LoadMetrics()
        autoscaler = StandardAutoscaler(config_path,
                                        lm,
                                        max_failures=0,
                                        process_runner=runner,
                                        update_interval_s=0)
        self.provider.create_node({}, {
            TAG_RAY_NODE_KIND: "head",
            TAG_RAY_USER_NODE_TYPE: "m4.4xlarge"
        }, 1)
        head_ip = self.provider.non_terminated_node_ips({})[0]
        assert len(self.provider.non_terminated_nodes({})) == 1
        autoscaler.update()
        self.waitForNodes(1)

        pending_placement_groups = [
            PlacementGroupTableData(
                state=PlacementGroupTableData.RESCHEDULING,
                strategy=PlacementStrategy.STRICT_SPREAD,
                bundles=[Bundle(unit_resources={"GPU": 2})] * 3),
            PlacementGroupTableData(
                state=PlacementGroupTableData.RESCHEDULING,
                strategy=PlacementStrategy.PACK,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 5)),
        ]
        # Since placement groups are implemented with custom resources, this is
        # an example of the accompanying resource demands. Note the resource
        # demand autoscaler will be unable to fulfill these demands, but we
        # should still handle the other infeasible/waiting bundles.
        placement_group_resource_demands = [{
            "GPU_group_0_6c2506ac733bc37496295b02c4fad446": 0.0101,
            "GPU_group_6c2506ac733bc37496295b02c4fad446": 0.0101
        }]
        lm.update(head_ip, {"CPU": 16}, {"CPU": 16}, {},
                  infeasible_bundles=placement_group_resource_demands,
                  waiting_bundles=[{"GPU": 8}],
                  pending_placement_groups=pending_placement_groups)
        autoscaler.update()
        self.waitForNodes(5)

        for i in range(1, 5):
            assert self.provider.mock_nodes[i].node_type == "p2.8xlarge"

        pending_placement_groups = [
            PlacementGroupTableData(
                state=PlacementGroupTableData.RESCHEDULING,
                strategy=PlacementStrategy.STRICT_PACK,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 4)),
            PlacementGroupTableData(
                state=PlacementGroupTableData.RESCHEDULING,
                strategy=PlacementStrategy.SPREAD,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
        ]
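
This second batch swaps the strategies: the STRICT_PACK group now needs an entire 8-GPU machine to itself, while the SPREAD group's 4 GPUs can sit across nodes launched earlier in the test. A sketch of what the groups imply (illustrative arithmetic, not the test's assertions):

    strict_pack_gpus = 2 * 4       # all 4 bundles must share one node
    spread_gpus = 2 * 2            # 2 bundles, preferably distinct nodes
    assert strict_pack_gpus <= 8   # exactly fills one p2.8xlarge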