def test_get_nodes_to_launch_with_min_workers_and_bin_packing():
    provider = MockProvider()
    new_types = copy.deepcopy(TYPES_A)
    new_types["p2.8xlarge"]["min_workers"] = 2
    scheduler = ResourceDemandScheduler(provider, new_types, 10)
    provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 1)

    nodes = provider.non_terminated_nodes({})
    ips = provider.non_terminated_node_ips({})
    # 1 free p2.8xl
    utilizations = {ip: {"GPU": 8} for ip in ips}
    # 1 more on the way
    pending_nodes = {"p2.8xlarge": 1}
    # Requires 2 p2.8xls (only 2 are in cluster/pending) and 1 p2.xlarge.
    demands = [{"GPU": 8}] * (len(utilizations) + 1) + [{"GPU": 1}]
    to_launch = scheduler.get_nodes_to_launch(nodes, pending_nodes, demands,
                                              utilizations, [])
    assert to_launch == {"p2.xlarge": 1}

    # A min_workers of 3 for p2.8xlarge covers both the 2 p2.8xlarge demands
    # and the 1 p2.xlarge demand. 2 p2.8xlarge are already running/pending, so
    # only 1 more p2.8xlarge is needed to meet both the min_workers constraint
    # and the demand.
    new_types["p2.8xlarge"]["min_workers"] = 3
    scheduler = ResourceDemandScheduler(provider, new_types, 10)
    to_launch = scheduler.get_nodes_to_launch(nodes, pending_nodes, demands,
                                              utilizations, [])
    # Make sure it does not return [("p2.8xlarge", 1), ("p2.xlarge", 1)].
    assert to_launch == {"p2.8xlarge": 1}
def test_get_nodes_to_launch_max_launch_concurrency():
    provider = MockProvider()
    new_types = copy.deepcopy(TYPES_A)
    new_types["p2.8xlarge"]["min_workers"] = 4
    new_types["p2.8xlarge"]["max_workers"] = 40
    scheduler = ResourceDemandScheduler(provider, new_types, 30)

    to_launch = scheduler.get_nodes_to_launch([], {}, [], {}, [])
    # Respects min_workers despite the concurrency limitation.
    assert to_launch == {"p2.8xlarge": 4}

    provider.create_node({}, {
        TAG_RAY_USER_NODE_TYPE: "p2.8xlarge",
        TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED
    }, 1)
    nodes = provider.non_terminated_nodes({})
    # Trying to force here that the node shows in nodes but is not connected
    # yet and hence does not show up in LoadMetrics (or utilizations).
    ips = provider.non_terminated_node_ips(
        {TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE})
    utilizations = {ip: {"GPU": 8} for ip in ips}
    launching_nodes = {"p2.8xlarge": 1}
    # Requires 41 p2.8xls (currently 1 pending, 1 launching, 0 running).
    demands = [{"GPU": 8}] * (len(utilizations) + 40)
    to_launch = scheduler.get_nodes_to_launch(nodes, launching_nodes, demands,
                                              utilizations, [])
    # Enforces a max launch of 5 when < 5 are running. 2 are pending/launching.
    assert to_launch == {"p2.8xlarge": 3}

    provider.create_node({}, {
        TAG_RAY_USER_NODE_TYPE: "p2.8xlarge",
        TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE
    }, 8)
    nodes = provider.non_terminated_nodes({})
    ips = provider.non_terminated_node_ips(
        {TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE})
    utilizations = {ip: {"GPU": 8} for ip in ips}
    launching_nodes = {"p2.8xlarge": 1}
    # Requires additional 17 p2.8xls (now 1 pending, 1 launching, 8 running).
    demands = [{"GPU": 8}] * (len(utilizations) + 15)
    to_launch = scheduler.get_nodes_to_launch(nodes, launching_nodes, demands,
                                              utilizations, [])
    # We are allowed to launch up to 8 more since 8 are running.
    # We already have 2 pending/launching, so only 6 remain.
    assert to_launch == {"p2.8xlarge": 6}
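# A minimal sketch (an assumption for illustration, not part of the
# ResourceDemandScheduler API) of the launch-concurrency cap the asserts above
# appear to rely on: at most max(5, running) nodes of a type may be
# pending/launching at once, so the count allowed to launch now is that cap
# minus the nodes already pending or launching.
def _illustrate_launch_cap(running, pending_or_launching):
    """Hypothetical helper, for illustration only."""
    return max(0, max(5, running) - pending_or_launching)


# E.g. 0 running and 2 pending/launching -> 3 allowed (matches the first
# concurrency assert above); 8 running and 2 pending/launching -> 6 allowed
# (matches the last assert above).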
def test_get_nodes_to_launch_limits():
    provider = MockProvider()
    scheduler = ResourceDemandScheduler(provider, TYPES_A, 3)
    provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 2)

    nodes = provider.non_terminated_nodes({})
    ips = provider.non_terminated_node_ips({})
    utilizations = {ip: {"GPU": 8} for ip in ips}

    to_launch = scheduler.get_nodes_to_launch(nodes, {"p2.8xlarge": 1},
                                              [{"GPU": 8}] * 2, utilizations)
    assert to_launch == {}
def test_get_nodes_to_launch_with_min_workers():
    provider = MockProvider()
    new_types = copy.deepcopy(TYPES_A)
    new_types["p2.8xlarge"]["min_workers"] = 2
    scheduler = ResourceDemandScheduler(provider, new_types, 3)
    provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 1)

    nodes = provider.non_terminated_nodes({})
    ips = provider.non_terminated_node_ips({})
    utilizations = {ip: {"GPU": 8} for ip in ips}

    to_launch = scheduler.get_nodes_to_launch(nodes, {}, [{"GPU": 8}],
                                              utilizations, [])
    assert to_launch == {"p2.8xlarge": 1}
def test_calculate_node_resources():
    provider = MockProvider()
    scheduler = ResourceDemandScheduler(provider, TYPES_A, 10)
    provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 2)

    nodes = provider.non_terminated_nodes({})
    ips = provider.non_terminated_node_ips({})
    # 2 free p2.8xls
    utilizations = {ip: {"GPU": 8} for ip in ips}
    # 1 more on the way
    pending_nodes = {"p2.8xlarge": 1}
    # requires 4 p2.8xls (only 3 are in cluster/pending)
    demands = [{"GPU": 8}] * (len(utilizations) + 2)
    to_launch = scheduler.get_nodes_to_launch(nodes, pending_nodes, demands,
                                              utilizations, [])
    assert to_launch == {"p2.8xlarge": 1}
    def test_packing(self):
        provider = MockProvider()
        scheduler = ResourceDemandScheduler(provider, TYPES_A, 10)
        provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 1)
        # At this point our cluster has 1 p2.8xlarge instance (8 GPUs) and is
        # fully idle.
        nodes = provider.non_terminated_nodes({})

        resource_demands = [{"GPU": 1}] * 2
        pending_placement_groups = [
            PlacementGroupTableData(
                state=PlacementGroupTableData.PENDING,
                strategy=PlacementStrategy.STRICT_PACK,
                bundles=[Bundle(unit_resources={"GPU": 2})] * 3),
        ]
        # The 2 resource demand gpus should still be packed onto the same node
        # as the 6 GPU placement group.
        to_launch = scheduler.get_nodes_to_launch(nodes, {}, resource_demands,
                                                  {}, pending_placement_groups)
        assert to_launch == {}
    def test_strategies(self):
        provider = MockProvider()
        scheduler = ResourceDemandScheduler(provider, TYPES_A, 10)
        provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 2)
        # At this point our cluster has 2 p2.8xlarge instances (16 GPUs) and
        # is fully idle.
        nodes = provider.non_terminated_nodes({})

        resource_demands = [{"GPU": 4}] * 2
        pending_placement_groups = [
            # Requires a new node (only uses 2 GPUs on it though).
            PlacementGroupTableData(
                state=PlacementGroupTableData.PENDING,
                strategy=PlacementStrategy.STRICT_SPREAD,
                bundles=[
                    Bundle(unit_resources={"GPU": 2}),
                    Bundle(unit_resources={"GPU": 2}),
                    Bundle(unit_resources={"GPU": 2})
                ]),
            # Requires a new node (uses the whole node).
            PlacementGroupTableData(
                state=PlacementGroupTableData.PENDING,
                strategy=PlacementStrategy.STRICT_PACK,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 4)),
            # Fits across the machines that the strict spread runs on.
            PlacementGroupTableData(
                state=PlacementGroupTableData.PENDING,
                strategy=PlacementStrategy.PACK,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
            # Fits across the machines that the strict spread runs on.
            PlacementGroupTableData(
                state=PlacementGroupTableData.PENDING,
                strategy=PlacementStrategy.SPREAD,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
        ]
        to_launch = scheduler.get_nodes_to_launch(nodes, {}, resource_demands,
                                                  {}, pending_placement_groups)
        assert to_launch == {"p2.8xlarge": 2}
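        # Rough accounting behind the assert above (a reading of the test's
        # own comments, not an authoritative trace of the scheduler): the
        # cluster has 2 idle p2.8xlarge (16 GPUs). The STRICT_SPREAD group
        # needs 3 distinct nodes at 2 GPUs each, forcing 1 new node; the
        # STRICT_PACK group needs 8 GPUs on a single node, forcing a 2nd new
        # node; the PACK and SPREAD groups plus the two {"GPU": 4} demands fit
        # on the remaining capacity, so exactly 2 new p2.8xlarge are requested.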
    def test_many_strict_spreads(self):
        provider = MockProvider()
        scheduler = ResourceDemandScheduler(provider, TYPES_A, 10)
        provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 2)
        # At this point our cluster has 2 p2.8xlarge instances (16 GPUs) and
        # is fully idle.
        nodes = provider.non_terminated_nodes({})

        resource_demands = [{"GPU": 1}] * 6
        pending_placement_groups = [
            # Requires a new node (only uses 2 GPUs on it though).
            PlacementGroupTableData(
                state=PlacementGroupTableData.PENDING,
                strategy=PlacementStrategy.STRICT_SPREAD,
                bundles=[Bundle(unit_resources={"GPU": 2})] * 3),
        ]
        # Each placement group will take up 2 GPUs per node, but the distinct
        # placement groups should still reuse the same nodes.
        pending_placement_groups = pending_placement_groups * 3
        to_launch = scheduler.get_nodes_to_launch(nodes, {}, resource_demands,
                                                  {}, pending_placement_groups)
        assert to_launch == {"p2.8xlarge": 1}
class AutoscalingTest(unittest.TestCase):
    def setUp(self):
        NODE_PROVIDERS["mock"] = \
            lambda config: self.create_provider
        self.provider = None
        self.tmpdir = tempfile.mkdtemp()

    def tearDown(self):
        self.provider = None
        del NODE_PROVIDERS["mock"]
        shutil.rmtree(self.tmpdir)
        ray.shutdown()

    def waitForNodes(self, expected, comparison=None, tag_filters={}):
        MAX_ITER = 50
        for i in range(MAX_ITER):
            n = len(self.provider.non_terminated_nodes(tag_filters))
            if comparison is None:
                comparison = self.assertEqual
            try:
                comparison(n, expected)
                return
            except Exception:
                if i == MAX_ITER - 1:
                    raise
            time.sleep(.1)

    def create_provider(self, config, cluster_name):
        assert self.provider
        return self.provider

    def write_config(self, config):
        path = self.tmpdir + "/simple.yaml"
        with open(path, "w") as f:
            f.write(yaml.dump(config))
        return path

    def testGetOrCreateMultiNodeType(self):
        config_path = self.write_config(MULTI_WORKER_CLUSTER)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        get_or_create_head_node(
            MULTI_WORKER_CLUSTER, config_path, no_restart=False,
            restart_only=False, yes=True, override_cluster_name=None,
            _provider=self.provider, _runner=runner)
        self.waitForNodes(1)
        runner.assert_has_call("1.2.3.4", "init_cmd")
        runner.assert_has_call("1.2.3.4", "head_setup_cmd")
        runner.assert_has_call("1.2.3.4", "start_ray_head")
        self.assertEqual(self.provider.mock_nodes[0].node_type, "m4.large")
        self.assertEqual(
            self.provider.mock_nodes[0].node_config.get("FooProperty"), 42)
        self.assertEqual(
            self.provider.mock_nodes[0].node_config.get("TestProp"), 1)
        self.assertEqual(
            self.provider.mock_nodes[0].tags.get(TAG_RAY_USER_NODE_TYPE),
            "m4.large")

    def testScaleUpMinSanity(self):
        config_path = self.write_config(MULTI_WORKER_CLUSTER)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path, LoadMetrics(), max_failures=0,
            process_runner=runner, update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(2)
        autoscaler.update()
        self.waitForNodes(2)

    def testRequestBundles(self):
        config = MULTI_WORKER_CLUSTER.copy()
        config["min_workers"] = 0
        config["max_workers"] = 50
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path, LoadMetrics(), max_failures=0,
            process_runner=runner, update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(0)
        autoscaler.request_resources([{"CPU": 1}])
        autoscaler.update()
        self.waitForNodes(1)
        assert self.provider.mock_nodes[0].node_type == "m4.large"
        autoscaler.request_resources([{"GPU": 8}])
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
        autoscaler.request_resources([{"CPU": 32}] * 4)
        autoscaler.update()
        self.waitForNodes(4)
        assert self.provider.mock_nodes[2].node_type == "m4.16xlarge"
        assert self.provider.mock_nodes[3].node_type == "m4.16xlarge"

    def testResourcePassing(self):
        config = MULTI_WORKER_CLUSTER.copy()
        config["min_workers"] = 0
        config["max_workers"] = 50
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path, LoadMetrics(), max_failures=0,
            process_runner=runner, update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(0)
        autoscaler.request_resources([{"CPU": 1}])
        autoscaler.update()
        self.waitForNodes(1)
        assert self.provider.mock_nodes[0].node_type == "m4.large"
        autoscaler.request_resources([{"GPU": 8}])
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
        # TODO (Alex): The autoscaler creates the node during one update, then
        # starts the updater in the next update. The sleep is largely
        # unavoidable because the updater runs in its own thread and we have
        # no good way of ensuring that the commands are sent in time.
        autoscaler.update()
        sleep(0.1)
        # These checks are done separately because we have no guarantees on
        # the order the dict is serialized in.
        runner.assert_has_call("172.0.0.0", "RAY_OVERRIDE_RESOURCES=")
        runner.assert_has_call("172.0.0.0", "CPU: 2")
        runner.assert_has_call("172.0.0.1", "RAY_OVERRIDE_RESOURCES=")
        runner.assert_has_call("172.0.0.1", "CPU: 32")
        runner.assert_has_call("172.0.0.1", "GPU: 8")
class AutoscalingTest(unittest.TestCase):
    def setUp(self):
        _NODE_PROVIDERS["mock"] = \
            lambda config: self.create_provider
        self.provider = None
        self.tmpdir = tempfile.mkdtemp()

    def tearDown(self):
        self.provider = None
        del _NODE_PROVIDERS["mock"]
        _clear_provider_cache()
        shutil.rmtree(self.tmpdir)
        ray.shutdown()

    def waitForNodes(self, expected, comparison=None, tag_filters={}):
        MAX_ITER = 50
        for i in range(MAX_ITER):
            n = len(self.provider.non_terminated_nodes(tag_filters))
            if comparison is None:
                comparison = self.assertEqual
            try:
                comparison(n, expected)
                return
            except Exception:
                if i == MAX_ITER - 1:
                    raise
            time.sleep(.1)

    def create_provider(self, config, cluster_name):
        assert self.provider
        return self.provider

    def write_config(self, config):
        path = self.tmpdir + "/simple.yaml"
        with open(path, "w") as f:
            f.write(yaml.dump(config))
        return path

    def testGetOrCreateMultiNodeType(self):
        config_path = self.write_config(MULTI_WORKER_CLUSTER)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        get_or_create_head_node(
            MULTI_WORKER_CLUSTER, config_path, no_restart=False,
            restart_only=False, yes=True, override_cluster_name=None,
            _provider=self.provider, _runner=runner)
        self.waitForNodes(1)
        runner.assert_has_call("1.2.3.4", "init_cmd")
        runner.assert_has_call("1.2.3.4", "setup_cmd")
        runner.assert_has_call("1.2.3.4", "start_ray_head")
        self.assertEqual(self.provider.mock_nodes[0].node_type, "empty_node")
        self.assertEqual(
            self.provider.mock_nodes[0].node_config.get("FooProperty"), 42)
        self.assertEqual(
            self.provider.mock_nodes[0].node_config.get("TestProp"), 1)
        self.assertEqual(
            self.provider.mock_nodes[0].tags.get(TAG_RAY_USER_NODE_TYPE),
            "empty_node")

    def testScaleUpMinSanity(self):
        config_path = self.write_config(MULTI_WORKER_CLUSTER)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path, LoadMetrics(), max_failures=0,
            process_runner=runner, update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(2)
        autoscaler.update()
        self.waitForNodes(2)

    def testPlacementGroup(self):
        # Note this is mostly an integration test. See
        # testPlacementGroupScaling for more comprehensive tests.
        config = copy.deepcopy(MULTI_WORKER_CLUSTER)
        config["min_workers"] = 0
        config["max_workers"] = 999
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        lm = LoadMetrics()
        autoscaler = StandardAutoscaler(
            config_path, lm, max_failures=0,
            process_runner=runner, update_interval_s=0)
        self.provider.create_node({}, {
            TAG_RAY_NODE_KIND: "head",
            TAG_RAY_USER_NODE_TYPE: "m4.4xlarge"
        }, 1)
        head_ip = self.provider.non_terminated_node_ips({})[0]
        assert len(self.provider.non_terminated_nodes({})) == 1
        autoscaler.update()
        self.waitForNodes(1)

        pending_placement_groups = [
            PlacementGroupTableData(
                state=PlacementGroupTableData.RESCHEDULING,
                strategy=PlacementStrategy.STRICT_SPREAD,
                bundles=[Bundle(unit_resources={"GPU": 2})] * 3),
            PlacementGroupTableData(
                state=PlacementGroupTableData.RESCHEDULING,
                strategy=PlacementStrategy.PACK,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 5)),
        ]
        # Since placement groups are implemented with custom resources, this
        # is an example of the accompanying resource demands. Note the
        # resource demand autoscaler will be unable to fulfill these demands,
        # but we should still handle the other infeasible/waiting bundles.
        placement_group_resource_demands = [{
            "GPU_group_0_6c2506ac733bc37496295b02c4fad446": 0.0101,
            "GPU_group_6c2506ac733bc37496295b02c4fad446": 0.0101
        }]
        lm.update(head_ip, {"CPU": 16},
                  True, {"CPU": 16},
                  False, {},
                  infeasible_bundles=placement_group_resource_demands,
                  waiting_bundles=[{"GPU": 8}],
                  pending_placement_groups=pending_placement_groups)
        autoscaler.update()
        self.waitForNodes(5)
        for i in range(1, 5):
            assert self.provider.mock_nodes[i].node_type == "p2.8xlarge"

        pending_placement_groups = [
            PlacementGroupTableData(
                state=PlacementGroupTableData.RESCHEDULING,
                strategy=PlacementStrategy.STRICT_PACK,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 4)),
            PlacementGroupTableData(
                state=PlacementGroupTableData.RESCHEDULING,
                strategy=PlacementStrategy.SPREAD,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
        ]

    def testScaleUpMinWorkers(self):
        config = copy.deepcopy(MULTI_WORKER_CLUSTER)
        config["min_workers"] = 2
        config["max_workers"] = 50
        config["idle_timeout_minutes"] = 1
        # Since config["min_workers"] > 1, the remaining worker is started
        # with the default worker node type.
        config["available_node_types"]["p2.8xlarge"]["min_workers"] = 1
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        lm = LoadMetrics()
        autoscaler = StandardAutoscaler(
            config_path, lm, max_failures=0,
            process_runner=runner, update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(2)
        assert len(self.provider.mock_nodes) == 2
        assert {
            self.provider.mock_nodes[0].node_type,
            self.provider.mock_nodes[1].node_type
        } == {"p2.8xlarge", "m4.large"}
        self.provider.create_node({}, {
            TAG_RAY_USER_NODE_TYPE: "p2.8xlarge",
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER
        }, 2)
        self.provider.create_node({}, {
            TAG_RAY_USER_NODE_TYPE: "m4.16xlarge",
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER
        }, 2)
        assert len(self.provider.non_terminated_nodes({})) == 6
        # Make sure that after idle_timeout_minutes we don't kill idle
        # min workers.
        for node_id in self.provider.non_terminated_nodes({}):
            lm.last_used_time_by_ip[self.provider.internal_ip(node_id)] = -60
        autoscaler.update()
        self.waitForNodes(2)

        cnt = 0
        for id in self.provider.mock_nodes:
            if self.provider.mock_nodes[id].state == "running" or \
                    self.provider.mock_nodes[id].state == "pending":
                assert self.provider.mock_nodes[id].node_type in {
                    "p2.8xlarge", "m4.large"
                }
                cnt += 1
        assert cnt == 2

    def testScaleUpIgnoreUsed(self):
        config = MULTI_WORKER_CLUSTER.copy()
        # Commenting out this line causes the test case to fail?!?!
config["min_workers"] = 0 config["target_utilization_fraction"] = 1.0 config_path = self.write_config(config) self.provider = MockProvider() self.provider.create_node({}, { TAG_RAY_NODE_KIND: "head", TAG_RAY_USER_NODE_TYPE: "p2.xlarge" }, 1) head_ip = self.provider.non_terminated_node_ips({})[0] self.provider.finish_starting_nodes() runner = MockProcessRunner() lm = LoadMetrics(local_ip=head_ip) autoscaler = StandardAutoscaler(config_path, lm, max_failures=0, process_runner=runner, update_interval_s=0) autoscaler.update() self.waitForNodes(1) lm.update(head_ip, {"CPU": 4, "GPU": 1}, True, {}, True, {}) self.waitForNodes(1) lm.update(head_ip, { "CPU": 4, "GPU": 1 }, True, {"GPU": 0}, True, {}, waiting_bundles=[{ "GPU": 1 }]) autoscaler.update() self.waitForNodes(2) assert self.provider.mock_nodes[1].node_type == "p2.xlarge" def testRequestBundlesAccountsForHeadNode(self): config = MULTI_WORKER_CLUSTER.copy() config["head_node_type"] = "p2.8xlarge" config["min_workers"] = 0 config["max_workers"] = 50 config_path = self.write_config(config) self.provider = MockProvider() self.provider.create_node({}, { TAG_RAY_USER_NODE_TYPE: "p2.8xlarge", TAG_RAY_NODE_KIND: "head" }, 1) runner = MockProcessRunner() autoscaler = StandardAutoscaler(config_path, LoadMetrics(), max_failures=0, process_runner=runner, update_interval_s=0) assert len(self.provider.non_terminated_nodes({})) == 1 # These requests fit on the head node. autoscaler.update() self.waitForNodes(1) autoscaler.request_resources([{"CPU": 1}]) autoscaler.update() self.waitForNodes(1) assert len(self.provider.mock_nodes) == 1 autoscaler.request_resources([{"GPU": 8}]) autoscaler.update() self.waitForNodes(1) # This request requires an additional worker node. autoscaler.request_resources([{"GPU": 8}] * 2) autoscaler.update() self.waitForNodes(2) assert self.provider.mock_nodes[1].node_type == "p2.8xlarge" def testRequestBundles(self): config = MULTI_WORKER_CLUSTER.copy() config["min_workers"] = 0 config["max_workers"] = 50 config_path = self.write_config(config) self.provider = MockProvider() runner = MockProcessRunner() autoscaler = StandardAutoscaler(config_path, LoadMetrics(), max_failures=0, process_runner=runner, update_interval_s=0) assert len(self.provider.non_terminated_nodes({})) == 0 autoscaler.update() self.waitForNodes(0) autoscaler.request_resources([{"CPU": 1}]) autoscaler.update() self.waitForNodes(1) assert self.provider.mock_nodes[0].node_type == "m4.large" autoscaler.request_resources([{"GPU": 8}]) autoscaler.update() self.waitForNodes(2) assert self.provider.mock_nodes[1].node_type == "p2.8xlarge" autoscaler.request_resources([{"CPU": 32}] * 4) autoscaler.update() self.waitForNodes(4) assert self.provider.mock_nodes[2].node_type == "m4.16xlarge" assert self.provider.mock_nodes[3].node_type == "m4.16xlarge" def testResourcePassing(self): config = MULTI_WORKER_CLUSTER.copy() config["min_workers"] = 0 config["max_workers"] = 50 config_path = self.write_config(config) self.provider = MockProvider() runner = MockProcessRunner() autoscaler = StandardAutoscaler(config_path, LoadMetrics(), max_failures=0, process_runner=runner, update_interval_s=0) assert len(self.provider.non_terminated_nodes({})) == 0 autoscaler.update() self.waitForNodes(0) autoscaler.request_resources([{"CPU": 1}]) autoscaler.update() self.waitForNodes(1) assert self.provider.mock_nodes[0].node_type == "m4.large" autoscaler.request_resources([{"GPU": 8}]) autoscaler.update() self.waitForNodes(2) assert self.provider.mock_nodes[1].node_type == "p2.8xlarge" # TODO 
(Alex): Autoscaler creates the node during one update then # starts the updater in the enxt update. The sleep is largely # unavoidable because the updater runs in its own thread and we have no # good way of ensuring that the commands are sent in time. autoscaler.update() sleep(0.1) # These checks are done separately because we have no guarantees on the # order the dict is serialized in. runner.assert_has_call("172.0.0.0", "RAY_OVERRIDE_RESOURCES=") runner.assert_has_call("172.0.0.0", "\"CPU\":2") runner.assert_has_call("172.0.0.1", "RAY_OVERRIDE_RESOURCES=") runner.assert_has_call("172.0.0.1", "\"CPU\":32") runner.assert_has_call("172.0.0.1", "\"GPU\":8") def testScaleUpLoadMetrics(self): config = MULTI_WORKER_CLUSTER.copy() config["min_workers"] = 0 config["max_workers"] = 50 config_path = self.write_config(config) self.provider = MockProvider() runner = MockProcessRunner() lm = LoadMetrics() autoscaler = StandardAutoscaler(config_path, lm, max_failures=0, process_runner=runner, update_interval_s=0) assert len(self.provider.non_terminated_nodes({})) == 0 autoscaler.update() self.waitForNodes(0) autoscaler.update() lm.update("1.2.3.4", {}, True, {}, True, {}, waiting_bundles=[{ "GPU": 1 }], infeasible_bundles=[{ "CPU": 16 }]) autoscaler.update() self.waitForNodes(2) nodes = { self.provider.mock_nodes[0].node_type, self.provider.mock_nodes[1].node_type } assert nodes == {"p2.xlarge", "m4.4xlarge"} def testCommandPassing(self): t = "custom" config = MULTI_WORKER_CLUSTER.copy() config["available_node_types"]["p2.8xlarge"][ "worker_setup_commands"] = ["new_worker_setup_command"] config["available_node_types"]["p2.xlarge"][ "initialization_commands"] = ["new_worker_initialization_cmd"] config["available_node_types"]["p2.xlarge"]["resources"][t] = 1 # Commenting out this line causes the test case to fail?!?! 
config["min_workers"] = 0 config["max_workers"] = 10 config_path = self.write_config(config) self.provider = MockProvider() runner = MockProcessRunner() autoscaler = StandardAutoscaler(config_path, LoadMetrics(), max_failures=0, process_runner=runner, update_interval_s=0) assert len(self.provider.non_terminated_nodes({})) == 0 autoscaler.update() self.waitForNodes(0) autoscaler.request_resources([{"CPU": 1}]) autoscaler.update() self.waitForNodes(1) assert self.provider.mock_nodes[0].node_type == "m4.large" autoscaler.request_resources([{"GPU": 8}]) autoscaler.update() self.waitForNodes(2) assert self.provider.mock_nodes[1].node_type == "p2.8xlarge" autoscaler.request_resources([{"GPU": 1}] * 9) autoscaler.update() self.waitForNodes(3) assert self.provider.mock_nodes[2].node_type == "p2.xlarge" autoscaler.update() sleep(0.1) runner.assert_has_call(self.provider.mock_nodes[1].internal_ip, "new_worker_setup_command") runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip, "setup_cmd") runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip, "worker_setup_cmd") runner.assert_has_call(self.provider.mock_nodes[2].internal_ip, "new_worker_initialization_cmd") runner.assert_not_has_call(self.provider.mock_nodes[2].internal_ip, "init_cmd") def testDockerWorkers(self): config = MULTI_WORKER_CLUSTER.copy() config["available_node_types"]["p2.8xlarge"]["docker"] = { "worker_image": "p2.8x_image:latest", "worker_run_options": ["p2.8x-run-options"] } config["available_node_types"]["p2.xlarge"]["docker"] = { "worker_image": "p2x_image:nightly" } config["docker"]["worker_run_options"] = ["standard-run-options"] config["docker"]["image"] = "default-image:nightly" config["docker"]["worker_image"] = "default-image:nightly" # Commenting out this line causes the test case to fail?!?! 
config["min_workers"] = 0 config["max_workers"] = 10 config_path = self.write_config(config) self.provider = MockProvider() runner = MockProcessRunner() autoscaler = StandardAutoscaler(config_path, LoadMetrics(), max_failures=0, process_runner=runner, update_interval_s=0) assert len(self.provider.non_terminated_nodes({})) == 0 autoscaler.update() self.waitForNodes(0) autoscaler.request_resources([{"CPU": 1}]) autoscaler.update() self.waitForNodes(1) assert self.provider.mock_nodes[0].node_type == "m4.large" autoscaler.request_resources([{"GPU": 8}]) autoscaler.update() self.waitForNodes(2) assert self.provider.mock_nodes[1].node_type == "p2.8xlarge" autoscaler.request_resources([{"GPU": 1}] * 9) autoscaler.update() self.waitForNodes(3) assert self.provider.mock_nodes[2].node_type == "p2.xlarge" autoscaler.update() # Fill up m4, p2.8, p2 and request 2 more CPUs autoscaler.request_resources([{ "CPU": 2 }, { "CPU": 16 }, { "CPU": 32 }, { "CPU": 2 }]) autoscaler.update() self.waitForNodes(4) assert self.provider.mock_nodes[3].node_type == "m4.16xlarge" autoscaler.update() sleep(0.1) runner.assert_has_call(self.provider.mock_nodes[1].internal_ip, "p2.8x-run-options") runner.assert_has_call(self.provider.mock_nodes[1].internal_ip, "p2.8x_image:latest") runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip, "default-image:nightly") runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip, "standard-run-options") runner.assert_has_call(self.provider.mock_nodes[2].internal_ip, "p2x_image:nightly") runner.assert_has_call(self.provider.mock_nodes[2].internal_ip, "standard-run-options") runner.assert_not_has_call(self.provider.mock_nodes[2].internal_ip, "p2.8x-run-options") runner.assert_has_call(self.provider.mock_nodes[3].internal_ip, "default-image:nightly") runner.assert_has_call(self.provider.mock_nodes[3].internal_ip, "standard-run-options") runner.assert_not_has_call(self.provider.mock_nodes[3].internal_ip, "p2.8x-run-options") runner.assert_not_has_call(self.provider.mock_nodes[3].internal_ip, "p2x_image:nightly") def testUpdateConfig(self): config = MULTI_WORKER_CLUSTER.copy() config_path = self.write_config(config) self.provider = MockProvider() runner = MockProcessRunner() autoscaler = StandardAutoscaler(config_path, LoadMetrics(), max_failures=0, process_runner=runner, update_interval_s=0) assert len(self.provider.non_terminated_nodes({})) == 0 autoscaler.update() self.waitForNodes(2) config["min_workers"] = 0 config["available_node_types"]["m4.large"]["node_config"][ "field_changed"] = 1 config_path = self.write_config(config) autoscaler.update() self.waitForNodes(0) def testEmptyDocker(self): config = MULTI_WORKER_CLUSTER.copy() del config["docker"] config["min_workers"] = 0 config["max_workers"] = 10 config_path = self.write_config(config) self.provider = MockProvider() runner = MockProcessRunner() autoscaler = StandardAutoscaler(config_path, LoadMetrics(), max_failures=0, process_runner=runner, update_interval_s=0) assert len(self.provider.non_terminated_nodes({})) == 0 autoscaler.update() self.waitForNodes(0) autoscaler.request_resources([{"CPU": 1}]) autoscaler.update() self.waitForNodes(1) assert self.provider.mock_nodes[0].node_type == "m4.large" autoscaler.request_resources([{"GPU": 8}]) autoscaler.update() self.waitForNodes(2) assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
def test_get_concurrent_resource_demand_to_launch():
    node_types = copy.deepcopy(TYPES_A)
    node_types["p2.8xlarge"]["min_workers"] = 1
    node_types["p2.8xlarge"]["max_workers"] = 10
    node_types["m4.large"]["min_workers"] = 2
    node_types["m4.large"]["max_workers"] = 100
    provider = MockProvider()
    scheduler = ResourceDemandScheduler(provider, node_types, 200)
    # Sanity check.
    assert len(provider.non_terminated_nodes({})) == 0

    # Sanity check.
    updated_to_launch = \
        scheduler._get_concurrent_resource_demand_to_launch({}, [], {})
    assert updated_to_launch == {}

    provider.create_node({}, {
        TAG_RAY_USER_NODE_TYPE: "p2.8xlarge",
        TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
        TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED
    }, 1)
    provider.create_node({}, {
        TAG_RAY_USER_NODE_TYPE: "m4.large",
        TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
        TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED
    }, 2)

    # All nodes so far are pending/launching here.
    to_launch = {"p2.8xlarge": 4, "m4.large": 40}
    non_terminated_nodes = provider.non_terminated_nodes({})
    pending_launches_nodes = {"p2.8xlarge": 1, "m4.large": 1}
    updated_to_launch = \
        scheduler._get_concurrent_resource_demand_to_launch(
            to_launch, non_terminated_nodes, pending_launches_nodes)
    # Note: we have 2 pending/launching gpus, 3 pending/launching cpus,
    # 0 running gpu, and 0 running cpus.
    assert updated_to_launch == {"p2.8xlarge": 3, "m4.large": 2}

    # This starts the min workers only, so we have no more pending workers.
    # The workers here are either running or in pending_launches_nodes,
    # which is "launching".
    for node_id in non_terminated_nodes:
        provider.set_node_tags(node_id,
                               {TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE})
    updated_to_launch = \
        scheduler._get_concurrent_resource_demand_to_launch(
            to_launch, non_terminated_nodes, pending_launches_nodes)
    # Note that here we have 1 launching gpu, 1 launching cpu,
    # 1 running gpu, and 2 running cpus.
    assert updated_to_launch == {"p2.8xlarge": 4, "m4.large": 4}

    # Launch the nodes. Note, after create_node the node is pending.
    provider.create_node({}, {
        TAG_RAY_USER_NODE_TYPE: "p2.8xlarge",
        TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
        TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED
    }, 5)
    provider.create_node({}, {
        TAG_RAY_USER_NODE_TYPE: "m4.large",
        TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
        TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED
    }, 5)

    # Continue scaling.
    non_terminated_nodes = provider.non_terminated_nodes({})
    to_launch = {"m4.large": 36}  # No more gpus are necessary.
    pending_launches_nodes = {}  # No pending launches.
    updated_to_launch = \
        scheduler._get_concurrent_resource_demand_to_launch(
            to_launch, non_terminated_nodes, pending_launches_nodes)
    # Note: we have 5 pending cpus. So we are not allowed to start any.
    # Still only 2 running cpus.
    assert updated_to_launch == {}

    for node_id in non_terminated_nodes:
        provider.set_node_tags(node_id,
                               {TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE})
    updated_to_launch = \
        scheduler._get_concurrent_resource_demand_to_launch(
            to_launch, non_terminated_nodes, pending_launches_nodes)
    # Note that here we have 7 running cpus and nothing pending/launching.
    assert updated_to_launch == {"m4.large": 7}

    # Launch the nodes. Note, after create_node the node is pending.
    provider.create_node({}, {
        TAG_RAY_USER_NODE_TYPE: "m4.large",
        TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
        TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED
    }, 7)

    # Continue scaling.
    non_terminated_nodes = provider.non_terminated_nodes({})
    to_launch = {"m4.large": 29}
    pending_launches_nodes = {"m4.large": 1}
    updated_to_launch = \
        scheduler._get_concurrent_resource_demand_to_launch(
            to_launch, non_terminated_nodes, pending_launches_nodes)
    # Note: we have 8 pending/launching cpus and only 7 running.
    # So we should not launch anything (since 8 >= 7).
    assert updated_to_launch == {}

    for node_id in non_terminated_nodes:
        provider.set_node_tags(node_id,
                               {TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE})
    updated_to_launch = \
        scheduler._get_concurrent_resource_demand_to_launch(
            to_launch, non_terminated_nodes, pending_launches_nodes)
    # Note that here we have 14 running cpus and 1 launching.
    assert updated_to_launch == {"m4.large": 13}
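    # Sketch of the arithmetic behind the final assert, assuming the same
    # "max(5, running) - pending/launching" concurrency cap described earlier
    # (an assumption about the scheduler, stated here for illustration):
    # 14 m4.large are running and 1 is launching, so max(5, 14) - 1 = 13 may
    # be launched now, which also fits within the remaining request of 29.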
class AutoscalingTest(unittest.TestCase):
    def setUp(self):
        NODE_PROVIDERS["mock"] = \
            lambda config: self.create_provider
        self.provider = None
        self.tmpdir = tempfile.mkdtemp()

    def tearDown(self):
        self.provider = None
        del NODE_PROVIDERS["mock"]
        shutil.rmtree(self.tmpdir)
        ray.shutdown()

    def waitForNodes(self, expected, comparison=None, tag_filters={}):
        MAX_ITER = 50
        for i in range(MAX_ITER):
            n = len(self.provider.non_terminated_nodes(tag_filters))
            if comparison is None:
                comparison = self.assertEqual
            try:
                comparison(n, expected)
                return
            except Exception:
                if i == MAX_ITER - 1:
                    raise
            time.sleep(.1)

    def create_provider(self, config, cluster_name):
        assert self.provider
        return self.provider

    def write_config(self, config):
        path = self.tmpdir + "/simple.yaml"
        with open(path, "w") as f:
            f.write(yaml.dump(config))
        return path

    def testGetOrCreateMultiNodeType(self):
        config_path = self.write_config(MULTI_WORKER_CLUSTER)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        get_or_create_head_node(
            MULTI_WORKER_CLUSTER, config_path, no_restart=False,
            restart_only=False, yes=True, override_cluster_name=None,
            _provider=self.provider, _runner=runner)
        self.waitForNodes(1)
        runner.assert_has_call("1.2.3.4", "init_cmd")
        runner.assert_has_call("1.2.3.4", "setup_cmd")
        runner.assert_has_call("1.2.3.4", "start_ray_head")
        self.assertEqual(self.provider.mock_nodes[0].node_type, "empty_node")
        self.assertEqual(
            self.provider.mock_nodes[0].node_config.get("FooProperty"), 42)
        self.assertEqual(
            self.provider.mock_nodes[0].node_config.get("TestProp"), 1)
        self.assertEqual(
            self.provider.mock_nodes[0].tags.get(TAG_RAY_USER_NODE_TYPE),
            "empty_node")

    def testScaleUpMinSanity(self):
        config_path = self.write_config(MULTI_WORKER_CLUSTER)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path, LoadMetrics(), max_failures=0,
            process_runner=runner, update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(2)
        autoscaler.update()
        self.waitForNodes(2)

    def testScaleUpIgnoreUsed(self):
        config = MULTI_WORKER_CLUSTER.copy()
        # Commenting out this line causes the test case to fail?!?!
        config["min_workers"] = 0
        config["target_utilization_fraction"] = 1.0
        config_path = self.write_config(config)
        self.provider = MockProvider()
        self.provider.create_node({}, {
            TAG_RAY_NODE_KIND: "head",
            TAG_RAY_USER_NODE_TYPE: "p2.xlarge"
        }, 1)
        head_ip = self.provider.non_terminated_node_ips({})[0]
        self.provider.finish_starting_nodes()
        runner = MockProcessRunner()
        lm = LoadMetrics(local_ip=head_ip)
        autoscaler = StandardAutoscaler(
            config_path, lm, max_failures=0,
            process_runner=runner, update_interval_s=0)
        autoscaler.update()
        self.waitForNodes(1)
        lm.update(head_ip, {"CPU": 4, "GPU": 1}, {}, {})
        self.waitForNodes(1)
        lm.update(head_ip, {
            "CPU": 4,
            "GPU": 1
        }, {"GPU": 1}, {},
                  waiting_bundles=[{"GPU": 1}])
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].node_type == "p2.xlarge"

    def testRequestBundlesAccountsForHeadNode(self):
        config = MULTI_WORKER_CLUSTER.copy()
        config["head_node_type"] = "p2.8xlarge"
        config["min_workers"] = 0
        config["max_workers"] = 50
        config_path = self.write_config(config)
        self.provider = MockProvider()
        self.provider.create_node({}, {
            TAG_RAY_USER_NODE_TYPE: "p2.8xlarge",
            TAG_RAY_NODE_KIND: "head"
        }, 1)
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path, LoadMetrics(), max_failures=0,
            process_runner=runner, update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 1

        # These requests fit on the head node.
        autoscaler.update()
        self.waitForNodes(1)
        autoscaler.request_resources([{"CPU": 1}])
        autoscaler.update()
        self.waitForNodes(1)
        assert len(self.provider.mock_nodes) == 1
        autoscaler.request_resources([{"GPU": 8}])
        autoscaler.update()
        self.waitForNodes(1)

        # This request requires an additional worker node.
        autoscaler.request_resources([{"GPU": 8}] * 2)
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"

    def testRequestBundles(self):
        config = MULTI_WORKER_CLUSTER.copy()
        config["min_workers"] = 0
        config["max_workers"] = 50
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path, LoadMetrics(), max_failures=0,
            process_runner=runner, update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(0)
        autoscaler.request_resources([{"CPU": 1}])
        autoscaler.update()
        self.waitForNodes(1)
        assert self.provider.mock_nodes[0].node_type == "m4.large"
        autoscaler.request_resources([{"GPU": 8}])
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
        autoscaler.request_resources([{"CPU": 32}] * 4)
        autoscaler.update()
        self.waitForNodes(4)
        assert self.provider.mock_nodes[2].node_type == "m4.16xlarge"
        assert self.provider.mock_nodes[3].node_type == "m4.16xlarge"

    def testResourcePassing(self):
        config = MULTI_WORKER_CLUSTER.copy()
        config["min_workers"] = 0
        config["max_workers"] = 50
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path, LoadMetrics(), max_failures=0,
            process_runner=runner, update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(0)
        autoscaler.request_resources([{"CPU": 1}])
        autoscaler.update()
        self.waitForNodes(1)
        assert self.provider.mock_nodes[0].node_type == "m4.large"
        autoscaler.request_resources([{"GPU": 8}])
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
        # TODO (Alex): The autoscaler creates the node during one update,
        # then starts the updater in the next update. The sleep is largely
        # unavoidable because the updater runs in its own thread and we have
        # no good way of ensuring that the commands are sent in time.
        autoscaler.update()
        sleep(0.1)
        # These checks are done separately because we have no guarantees on
        # the order the dict is serialized in.
        runner.assert_has_call("172.0.0.0", "RAY_OVERRIDE_RESOURCES=")
        runner.assert_has_call("172.0.0.0", "\"CPU\":2")
        runner.assert_has_call("172.0.0.1", "RAY_OVERRIDE_RESOURCES=")
        runner.assert_has_call("172.0.0.1", "\"CPU\":32")
        runner.assert_has_call("172.0.0.1", "\"GPU\":8")

    def testScaleUpLoadMetrics(self):
        config = MULTI_WORKER_CLUSTER.copy()
        config["min_workers"] = 0
        config["max_workers"] = 50
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        lm = LoadMetrics()
        autoscaler = StandardAutoscaler(
            config_path, lm, max_failures=0,
            process_runner=runner, update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(0)
        autoscaler.update()
        lm.update("1.2.3.4", {}, {}, {},
                  waiting_bundles=[{"GPU": 1}],
                  infeasible_bundles=[{"CPU": 16}])
        autoscaler.update()
        self.waitForNodes(2)
        nodes = {
            self.provider.mock_nodes[0].node_type,
            self.provider.mock_nodes[1].node_type
        }
        assert nodes == {"p2.xlarge", "m4.4xlarge"}

    def testCommandPassing(self):
        t = "custom"
        config = MULTI_WORKER_CLUSTER.copy()
        config["available_node_types"]["p2.8xlarge"][
            "worker_setup_commands"] = ["new_worker_setup_command"]
        config["available_node_types"]["p2.xlarge"][
            "initialization_commands"] = ["new_worker_initialization_cmd"]
        config["available_node_types"]["p2.xlarge"]["resources"][t] = 1
        # Commenting out this line causes the test case to fail?!?!
        config["min_workers"] = 0
        config["max_workers"] = 10
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path, LoadMetrics(), max_failures=0,
            process_runner=runner, update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(0)
        autoscaler.request_resources([{"CPU": 1}])
        autoscaler.update()
        self.waitForNodes(1)
        assert self.provider.mock_nodes[0].node_type == "m4.large"
        autoscaler.request_resources([{"GPU": 8}])
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
        autoscaler.request_resources([{"GPU": 1}] * 9)
        autoscaler.update()
        self.waitForNodes(3)
        assert self.provider.mock_nodes[2].node_type == "p2.xlarge"
        autoscaler.update()
        sleep(0.1)
        runner.assert_has_call(self.provider.mock_nodes[1].internal_ip,
                               "new_worker_setup_command")
        runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                                   "setup_cmd")
        runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                                   "worker_setup_cmd")
        runner.assert_has_call(self.provider.mock_nodes[2].internal_ip,
                               "new_worker_initialization_cmd")
        runner.assert_not_has_call(self.provider.mock_nodes[2].internal_ip,
                                   "init_cmd")

    def testDockerWorkers(self):
        config = MULTI_WORKER_CLUSTER.copy()
        config["available_node_types"]["p2.8xlarge"]["docker"] = {
            "worker_image": "p2.8x_image:latest",
            "worker_run_options": ["p2.8x-run-options"]
        }
        config["available_node_types"]["p2.xlarge"]["docker"] = {
            "worker_image": "p2x_image:nightly"
        }
        config["docker"]["worker_run_options"] = ["standard-run-options"]
        config["docker"]["image"] = "default-image:nightly"
        config["docker"]["worker_image"] = "default-image:nightly"
        # Commenting out this line causes the test case to fail?!?!
        config["min_workers"] = 0
        config["max_workers"] = 10
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path, LoadMetrics(), max_failures=0,
            process_runner=runner, update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(0)
        autoscaler.request_resources([{"CPU": 1}])
        autoscaler.update()
        self.waitForNodes(1)
        assert self.provider.mock_nodes[0].node_type == "m4.large"
        autoscaler.request_resources([{"GPU": 8}])
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
        autoscaler.request_resources([{"GPU": 1}] * 9)
        autoscaler.update()
        self.waitForNodes(3)
        assert self.provider.mock_nodes[2].node_type == "p2.xlarge"
        autoscaler.update()
        # Fill up m4, p2.8, p2 and request 2 more CPUs.
        autoscaler.request_resources([{
            "CPU": 2
        }, {
            "CPU": 16
        }, {
            "CPU": 32
        }, {
            "CPU": 2
        }])
        autoscaler.update()
        self.waitForNodes(4)
        assert self.provider.mock_nodes[3].node_type == "m4.16xlarge"
        autoscaler.update()
        sleep(0.1)
        runner.assert_has_call(self.provider.mock_nodes[1].internal_ip,
                               "p2.8x-run-options")
        runner.assert_has_call(self.provider.mock_nodes[1].internal_ip,
                               "p2.8x_image:latest")
        runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                                   "default-image:nightly")
        runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                                   "standard-run-options")
        runner.assert_has_call(self.provider.mock_nodes[2].internal_ip,
                               "p2x_image:nightly")
        runner.assert_has_call(self.provider.mock_nodes[2].internal_ip,
                               "standard-run-options")
        runner.assert_not_has_call(self.provider.mock_nodes[2].internal_ip,
                                   "p2.8x-run-options")
        runner.assert_has_call(self.provider.mock_nodes[3].internal_ip,
                               "default-image:nightly")
        runner.assert_has_call(self.provider.mock_nodes[3].internal_ip,
                               "standard-run-options")
        runner.assert_not_has_call(self.provider.mock_nodes[3].internal_ip,
                                   "p2.8x-run-options")
        runner.assert_not_has_call(self.provider.mock_nodes[3].internal_ip,
                                   "p2x_image:nightly")

    def testUpdateConfig(self):
        config = MULTI_WORKER_CLUSTER.copy()
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path, LoadMetrics(), max_failures=0,
            process_runner=runner, update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(2)
        config["min_workers"] = 0
        config["available_node_types"]["m4.large"]["node_config"][
            "field_changed"] = 1
        config_path = self.write_config(config)
        autoscaler.update()
        self.waitForNodes(0)

    def testEmptyDocker(self):
        config = MULTI_WORKER_CLUSTER.copy()
        del config["docker"]
        config["min_workers"] = 0
        config["max_workers"] = 10
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path, LoadMetrics(), max_failures=0,
            process_runner=runner, update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(0)
        autoscaler.request_resources([{"CPU": 1}])
        autoscaler.update()
        self.waitForNodes(1)
        assert self.provider.mock_nodes[0].node_type == "m4.large"
        autoscaler.request_resources([{"GPU": 8}])
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
class AutoscalingTest(unittest.TestCase):
    def setUp(self):
        NODE_PROVIDERS["mock"] = \
            lambda: (None, self.create_provider)
        self.provider = None
        self.tmpdir = tempfile.mkdtemp()

    def tearDown(self):
        del NODE_PROVIDERS["mock"]
        shutil.rmtree(self.tmpdir)
        ray.shutdown()

    def waitForNodes(self, expected, comparison=None, tag_filters={}):
        MAX_ITER = 50
        for i in range(MAX_ITER):
            n = len(self.provider.non_terminated_nodes(tag_filters))
            if comparison is None:
                comparison = self.assertEqual
            try:
                comparison(n, expected)
                return
            except Exception:
                if i == MAX_ITER - 1:
                    raise
            time.sleep(.1)

    def create_provider(self, config, cluster_name):
        assert self.provider
        return self.provider

    def write_config(self, config):
        path = self.tmpdir + "/simple.yaml"
        with open(path, "w") as f:
            f.write(yaml.dump(config))
        return path

    def testScaleUpMinSanity(self):
        config_path = self.write_config(MULTI_WORKER_CLUSTER)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path, LoadMetrics(), max_failures=0,
            process_runner=runner, update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(2)
        autoscaler.update()
        self.waitForNodes(2)

    def testRequestBundles(self):
        config = MULTI_WORKER_CLUSTER.copy()
        config["min_workers"] = 0
        config["max_workers"] = 50
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path, LoadMetrics(), max_failures=0,
            process_runner=runner, update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(0)
        autoscaler.request_resources([{"CPU": 1}])
        autoscaler.update()
        self.waitForNodes(1)
        assert self.provider.mock_nodes[0].instance_type == "m4.large"
        autoscaler.request_resources([{"GPU": 8}])
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].instance_type == "p2.8xlarge"
        autoscaler.request_resources([{"CPU": 32}] * 4)
        autoscaler.update()
        self.waitForNodes(4)
        assert self.provider.mock_nodes[2].instance_type == "m4.16xlarge"
        assert self.provider.mock_nodes[3].instance_type == "m4.16xlarge"