def testPlacementGroupLoad(self):
    lm = LoadMetrics()
    pending_placement_groups = [
        PlacementGroupTableData(
            state=PlacementGroupTableData.RESCHEDULING,
            strategy=PlacementStrategy.PACK,
            bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
        PlacementGroupTableData(
            state=PlacementGroupTableData.RESCHEDULING,
            strategy=PlacementStrategy.SPREAD,
            bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
    ]
    lm.update(
        "1.1.1.1", {}, {}, {},
        pending_placement_groups=pending_placement_groups)
    assert lm.get_pending_placement_groups() == pending_placement_groups
def test_strategies(self):
    provider = MockProvider()
    scheduler = ResourceDemandScheduler(provider, TYPES_A, 10)
    provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 2)
    # At this point our cluster has 2 p2.8xlarge instances (16 GPUs) and is
    # fully idle.
    nodes = provider.non_terminated_nodes({})

    resource_demands = [{"GPU": 4}] * 2
    pending_placement_groups = [
        # Requires a new node (only uses 2 GPUs on it though).
        PlacementGroupTableData(
            state=PlacementGroupTableData.PENDING,
            strategy=PlacementStrategy.STRICT_SPREAD,
            bundles=[
                Bundle(unit_resources={"GPU": 2}),
                Bundle(unit_resources={"GPU": 2}),
                Bundle(unit_resources={"GPU": 2})
            ]),
        # Requires a new node (uses the whole node).
        PlacementGroupTableData(
            state=PlacementGroupTableData.PENDING,
            strategy=PlacementStrategy.STRICT_PACK,
            bundles=([Bundle(unit_resources={"GPU": 2})] * 4)),
        # Fits across the machines that strict spread runs on.
        PlacementGroupTableData(
            state=PlacementGroupTableData.PENDING,
            strategy=PlacementStrategy.PACK,
            bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
        # Fits across the machines that strict spread runs on.
        PlacementGroupTableData(
            state=PlacementGroupTableData.PENDING,
            strategy=PlacementStrategy.SPREAD,
            bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
    ]

    to_launch = scheduler.get_nodes_to_launch(nodes, {}, resource_demands, {},
                                              pending_placement_groups)
    assert to_launch == {"p2.8xlarge": 2}
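# The expected launch count above follows from how each strategy reduces to
# plain resource demands. A minimal sketch of that reduction, assuming the
# scheduler behaves roughly like this (placement_group_demands and the plain
# string strategies here are illustrative, not Ray internals):
def placement_group_demands(strategy, bundles):
    if strategy == "STRICT_PACK":
        # All bundles must land on one node, so sum them into a single shape.
        combined = {}
        for bundle in bundles:
            for resource, amount in bundle.items():
                combined[resource] = combined.get(resource, 0) + amount
        return [combined]
    # PACK and SPREAD are best-effort; STRICT_SPREAD additionally pins each
    # bundle to a distinct node. In all three cases each bundle is one demand.
    return [dict(bundle) for bundle in bundles]

# STRICT_PACK of four 2-GPU bundles demands a whole 8-GPU node...
assert placement_group_demands("STRICT_PACK", [{"GPU": 2}] * 4) == [{"GPU": 8}]
# ...while STRICT_SPREAD of three 2-GPU bundles demands 2 GPUs on 3 nodes.
assert placement_group_demands("STRICT_SPREAD",
                               [{"GPU": 2}] * 3) == [{"GPU": 2}] * 3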
def test_packing(self):
    provider = MockProvider()
    scheduler = ResourceDemandScheduler(provider, TYPES_A, 10)
    provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 1)
    # At this point our cluster has 1 p2.8xlarge instance (8 GPUs) and is
    # fully idle.
    nodes = provider.non_terminated_nodes({})

    resource_demands = [{"GPU": 1}] * 2
    pending_placement_groups = [
        PlacementGroupTableData(
            state=PlacementGroupTableData.PENDING,
            strategy=PlacementStrategy.STRICT_PACK,
            bundles=[Bundle(unit_resources={"GPU": 2})] * 3),
    ]
    # The 2 resource demand GPUs should still be packed onto the same node
    # as the 6-GPU placement group.
    to_launch = scheduler.get_nodes_to_launch(nodes, {}, resource_demands, {},
                                              pending_placement_groups)
    assert to_launch == {}
def run_autoscaler(self):
    # Convert queued work items into the demand shapes the autoscaler
    # consumes: task shapes become waiting/infeasible bundles, placement
    # groups become pending PlacementGroupTableData entries.
    waiting_bundles = []
    infeasible_bundles = []
    placement_groups = []
    for work in self.work_queue:
        if isinstance(work, Task):
            shape = work.resources
            if self._infeasible(shape):
                infeasible_bundles.append(shape)
            else:
                waiting_bundles.append(shape)
        if isinstance(work, PlacementGroup):
            placement_groups.append(
                PlacementGroupTableData(
                    state=PlacementGroupTableData.PENDING,
                    strategy=work.strategy,
                    bundles=[
                        Bundle(unit_resources=bundle)
                        for bundle in work.bundles
                    ],
                ))

    # Report the demands through each in-cluster node's load metrics, then
    # run one autoscaler round.
    for ip, node in self.ip_to_nodes.items():
        if not node.in_cluster:
            continue
        self.load_metrics.update(
            ip=ip,
            static_resources=node.total_resources,
            dynamic_resources=node.available_resources,
            resource_load={},
            waiting_bundles=waiting_bundles,
            infeasible_bundles=infeasible_bundles,
            pending_placement_groups=placement_groups,
        )
    self.autoscaler.update()
    self._launch_nodes()
    self._update_cluster_state()
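# run_autoscaler iterates over the simulator's own Task and PlacementGroup
# work items. A minimal sketch of what those could look like (these
# dataclasses are assumptions for illustration, not part of the autoscaler
# API):
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class Task:
    # Resource shape the task needs, e.g. {"GPU": 1}.
    resources: Dict[str, float]

@dataclass
class PlacementGroup:
    # One of the PlacementStrategy enum values, e.g. PlacementStrategy.PACK.
    strategy: int
    # Raw bundle shapes; run_autoscaler wraps each one in a Bundle proto.
    bundles: List[Dict[str, float]]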
def test_many_strict_spreads(self):
    provider = MockProvider()
    scheduler = ResourceDemandScheduler(provider, TYPES_A, 10)
    provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 2)
    # At this point our cluster has 2 p2.8xlarge instances (16 GPUs) and is
    # fully idle.
    nodes = provider.non_terminated_nodes({})

    resource_demands = [{"GPU": 1}] * 6
    pending_placement_groups = [
        # Requires a new node (only uses 2 GPUs on it though).
        PlacementGroupTableData(
            state=PlacementGroupTableData.PENDING,
            strategy=PlacementStrategy.STRICT_SPREAD,
            bundles=[Bundle(unit_resources={"GPU": 2})] * 3),
    ]
    # Each placement group will take up 2 GPUs per node, but the distinct
    # placement groups should still reuse the same nodes.
    pending_placement_groups = pending_placement_groups * 3

    to_launch = scheduler.get_nodes_to_launch(nodes, {}, resource_demands, {},
                                              pending_placement_groups)
    assert to_launch == {"p2.8xlarge": 1}
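# A quick sanity check of the expected launch (worked arithmetic under the
# test's assumptions, not scheduler output): each strict-spread group puts one
# 2-GPU bundle on each of 3 nodes, so 3 groups use 3 * 2 = 6 GPUs per node;
# adding the six 1-GPU tasks gives 3 * 6 + 6 = 24 GPUs, exactly three 8-GPU
# p2.8xlarge nodes. Two nodes already exist, hence the single launch.
assert 3 * (3 * 2) + 6 * 1 == 3 * 8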
def testPlacementGroup(self):
    # Note this is mostly an integration test. See
    # testPlacementGroupScaling for more comprehensive tests.
    config = copy.deepcopy(MULTI_WORKER_CLUSTER)
    config["min_workers"] = 0
    config["max_workers"] = 999
    config_path = self.write_config(config)
    self.provider = MockProvider()
    runner = MockProcessRunner()
    lm = LoadMetrics()
    autoscaler = StandardAutoscaler(
        config_path,
        lm,
        max_failures=0,
        process_runner=runner,
        update_interval_s=0)
    self.provider.create_node({}, {
        TAG_RAY_NODE_KIND: "head",
        TAG_RAY_USER_NODE_TYPE: "m4.4xlarge"
    }, 1)
    head_ip = self.provider.non_terminated_node_ips({})[0]
    assert len(self.provider.non_terminated_nodes({})) == 1
    autoscaler.update()
    self.waitForNodes(1)

    pending_placement_groups = [
        PlacementGroupTableData(
            state=PlacementGroupTableData.RESCHEDULING,
            strategy=PlacementStrategy.STRICT_SPREAD,
            bundles=[Bundle(unit_resources={"GPU": 2})] * 3),
        PlacementGroupTableData(
            state=PlacementGroupTableData.RESCHEDULING,
            strategy=PlacementStrategy.PACK,
            bundles=([Bundle(unit_resources={"GPU": 2})] * 5)),
    ]
    # Since placement groups are implemented with custom resources, this is
    # an example of the accompanying resource demands. Note the resource
    # demand autoscaler will be unable to fulfill these demands, but we
    # should still handle the other infeasible/waiting bundles.
    placement_group_resource_demands = [{
        "GPU_group_0_6c2506ac733bc37496295b02c4fad446": 0.0101,
        "GPU_group_6c2506ac733bc37496295b02c4fad446": 0.0101
    }]
    lm.update(
        head_ip, {"CPU": 16}, {"CPU": 16}, {},
        infeasible_bundles=placement_group_resource_demands,
        waiting_bundles=[{"GPU": 8}],
        pending_placement_groups=pending_placement_groups)
    autoscaler.update()
    self.waitForNodes(5)

    for i in range(1, 5):
        assert self.provider.mock_nodes[i].node_type == "p2.8xlarge"

    pending_placement_groups = [
        PlacementGroupTableData(
            state=PlacementGroupTableData.RESCHEDULING,
            strategy=PlacementStrategy.STRICT_PACK,
            bundles=([Bundle(unit_resources={"GPU": 2})] * 4)),
        PlacementGroupTableData(
            state=PlacementGroupTableData.RESCHEDULING,
            strategy=PlacementStrategy.SPREAD,
            bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
    ]