def test_kubernetes_command_runner():
    """Check that KubernetesCommandRunner builds the exact ``kubectl exec``
    argv, including the shell escaping applied to environment variables.

    NOTE(review): other functions later in this file reuse this exact name;
    in Python only the last definition survives at import time, so this
    copy may never be collected by pytest — confirm which copy is intended.
    """
    process_runner = MockProcessRunner()
    provider = MockProvider()
    provider.create_node({}, {}, 1)
    args = {
        "log_prefix": "prefix",
        "namespace": "namespace",
        "node_id": 0,
        "auth_config": auth_config,
        "process_runner": process_runner,
    }
    cmd_runner = KubernetesCommandRunner(**args)

    # var1 contains an embedded double quote to exercise the escaping path.
    env_vars = {"var1": "quote between this \" and this", "var2": "123"}
    cmd_runner.run("echo helloo", environment_variables=env_vars)

    expected = [
        "kubectl",
        "-n",
        "namespace",
        "exec",
        "-it",
        "0",
        "--",
        "bash",
        "--login",
        "-c",
        "-i",
        """\'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export var1=\'"\'"\'"quote between this \\" and this"\'"\'"\';export var2=\'"\'"\'"123"\'"\'"\';echo helloo)\'"""  # noqa: E501
    ]

    # Much easier to debug this loop than the function call.
    for x, y in zip(process_runner.calls[0], expected):
        assert x == y
    process_runner.assert_has_call("1.2.3.4", exact=expected)
def test_rsync_without_exclude_and_filter():
    """An rsync-up of a docker mount must not inject the default
    ``--exclude``/``--filter`` arguments into the rsync invocation."""
    fake_runner = MockProcessRunner()
    node_provider = MockProvider()
    node_provider.create_node({}, {}, 1)

    cmd_runner = SSHCommandRunner(
        log_prefix="prefix",
        node_id=0,
        provider=node_provider,
        auth_config=auth_config,
        cluster_name="cluster",
        process_runner=fake_runner,
        use_internal_ip=False,
    )

    src_dir = "/home/ubuntu/base/mount/"
    dst_dir = "/root/protected_mount/"

    fake_runner.respond_to_call("docker inspect -f", ["true"])
    cmd_runner.run_rsync_up(
        src_dir, dst_dir, options={"docker_mount_if_possible": True})

    # Neither of the default filtering flags may appear in the rsync call.
    fake_runner.assert_not_has_call("1.2.3.4", pattern="--exclude test")
    fake_runner.assert_not_has_call(
        "1.2.3.4", pattern="--filter dir-merge,- .ignore")
def test_docker_shm_override(run_option_type):
    """A user-supplied ``--shm-size`` run option must be used verbatim and
    must bypass the automatic shared-memory size detection."""
    fake_runner = MockProcessRunner()
    node_provider = MockProvider()
    node_provider.create_node({}, {}, 1)

    docker_config = {
        "container_name": "container",
        "image": "rayproject/ray:latest",
        run_option_type: ["--shm-size=80g"],
    }
    cmd_runner = DockerCommandRunner(
        log_prefix="prefix",
        node_id=0,
        provider=node_provider,
        auth_config=auth_config,
        cluster_name="cluster",
        process_runner=fake_runner,
        use_internal_ip=False,
        docker_config=docker_config,
    )

    fake_runner.respond_to_call("json .Config.Env", ["[]"] * 2)
    cmd_runner.run_init(as_head=True, file_mounts={}, sync_run_yet=True)

    # Ensure the user-provided SHM size is used.
    fake_runner.assert_has_call("1.2.3.4", pattern="--shm-size=80g")
    # Ensure that SHM auto detection is bypassed.
    fake_runner.assert_not_has_call("1.2.3.4", pattern="/proc/meminfo")
def testGetOrCreateMultiNodeType(self): config = MULTI_WORKER_CLUSTER.copy() # Commenting out this line causes the test case to fail?!?! config["min_workers"] = 0 config_path = self.write_config(config) config_path = self.write_config(MULTI_WORKER_CLUSTER) self.provider = MockProvider() runner = MockProcessRunner() get_or_create_head_node(MULTI_WORKER_CLUSTER, config_path, no_restart=False, restart_only=False, yes=True, override_cluster_name=None, _provider=self.provider, _runner=runner) self.waitForNodes(1) runner.assert_has_call("1.2.3.4", "init_cmd") runner.assert_has_call("1.2.3.4", "setup_cmd") runner.assert_has_call("1.2.3.4", "start_ray_head") self.assertEqual(self.provider.mock_nodes[0].node_type, "empty_node") self.assertEqual( self.provider.mock_nodes[0].node_config.get("FooProperty"), 42) self.assertEqual( self.provider.mock_nodes[0].node_config.get("TestProp"), 1) self.assertEqual( self.provider.mock_nodes[0].tags.get(TAG_RAY_USER_NODE_TYPE), "empty_node")
def testRequestBundlesAccountsForHeadNode(self):
    """Resource requests that fit on the head node must not launch a
    worker; only demand exceeding the head's capacity adds a node."""
    config = MULTI_WORKER_CLUSTER.copy()
    config["head_node_type"] = "p2.8xlarge"
    config["min_workers"] = 0
    config["max_workers"] = 50
    config_path = self.write_config(config)
    self.provider = MockProvider()
    # Pre-create the head node so the autoscaler sees it as already up.
    self.provider.create_node({}, {
        TAG_RAY_USER_NODE_TYPE: "p2.8xlarge",
        TAG_RAY_NODE_KIND: "head"
    }, 1)
    runner = MockProcessRunner()
    autoscaler = StandardAutoscaler(
        config_path,
        LoadMetrics(),
        max_failures=0,
        process_runner=runner,
        update_interval_s=0)
    assert len(self.provider.non_terminated_nodes({})) == 1

    # These requests fit on the head node.
    autoscaler.update()
    self.waitForNodes(1)
    autoscaler.request_resources([{"CPU": 1}])
    autoscaler.update()
    self.waitForNodes(1)
    assert len(self.provider.mock_nodes) == 1
    autoscaler.request_resources([{"GPU": 8}])
    autoscaler.update()
    self.waitForNodes(1)

    # This request requires an additional worker node.
    autoscaler.request_resources([{"GPU": 8}] * 2)
    autoscaler.update()
    self.waitForNodes(2)
    assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
def testRequestBundles(self):
    """request_resources() launches, per bundle, a node type that can
    satisfy it: m4.large for CPU:1, p2.8xlarge for GPU:8, and
    m4.16xlarge for the CPU:32 bundles."""
    config = MULTI_WORKER_CLUSTER.copy()
    config["min_workers"] = 0
    config["max_workers"] = 50
    config_path = self.write_config(config)
    self.provider = MockProvider()
    runner = MockProcessRunner()
    autoscaler = StandardAutoscaler(
        config_path,
        LoadMetrics(),
        max_failures=0,
        process_runner=runner,
        update_interval_s=0)
    assert len(self.provider.non_terminated_nodes({})) == 0
    autoscaler.update()
    self.waitForNodes(0)
    autoscaler.request_resources([{"CPU": 1}])
    autoscaler.update()
    self.waitForNodes(1)
    assert self.provider.mock_nodes[0].node_type == "m4.large"
    autoscaler.request_resources([{"GPU": 8}])
    autoscaler.update()
    self.waitForNodes(2)
    assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
    autoscaler.request_resources([{"CPU": 32}] * 4)
    autoscaler.update()
    self.waitForNodes(4)
    assert self.provider.mock_nodes[2].node_type == "m4.16xlarge"
    assert self.provider.mock_nodes[3].node_type == "m4.16xlarge"
def test_kubernetes_command_runner():
    """Variant of the kubectl-exec test that compares against the
    space-joined command string recorded by MockProcessRunner.

    NOTE(review): this name is defined multiple times in this file; only
    the last definition is collected by pytest — confirm which is wanted.
    """
    fail_cmd = "fail command"
    # fail_cmd is registered as a failing command with the mock runner but
    # is not executed in this copy of the test.
    process_runner = MockProcessRunner([fail_cmd])
    provider = MockProvider()
    provider.create_node({}, {}, 1)
    args = {
        "log_prefix": "prefix",
        "namespace": "namespace",
        "node_id": 0,
        "auth_config": auth_config,
        "process_runner": process_runner,
    }
    cmd_runner = KubernetesCommandRunner(**args)

    # var1 contains an embedded double quote to exercise the escaping path.
    env_vars = {"var1": "quote between this \" and this", "var2": "123"}
    cmd_runner.run("echo helloo", environment_variables=env_vars)

    expected = [
        "kubectl",
        "-n",
        "namespace",
        "exec",
        "-it",
        "0",
        "--",
        "bash",
        "--login",
        "-c",
        "-i",
        """\'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export var1=\'"\'"\'"quote between this \\" and this"\'"\'"\';export var2=\'"\'"\'"123"\'"\'"\';echo helloo)\'"""  # noqa: E501
    ]
    assert process_runner.calls[0] == " ".join(expected)
def testScaleUpIgnoreUsed(self): config = MULTI_WORKER_CLUSTER.copy() # Commenting out this line causes the test case to fail?!?! config["min_workers"] = 0 config["target_utilization_fraction"] = 1.0 config_path = self.write_config(config) self.provider = MockProvider() self.provider.create_node({}, { TAG_RAY_NODE_KIND: "head", TAG_RAY_USER_NODE_TYPE: "p2.xlarge" }, 1) head_ip = self.provider.non_terminated_node_ips({})[0] self.provider.finish_starting_nodes() runner = MockProcessRunner() lm = LoadMetrics(local_ip=head_ip) autoscaler = StandardAutoscaler(config_path, lm, max_failures=0, process_runner=runner, update_interval_s=0) autoscaler.update() self.waitForNodes(1) lm.update(head_ip, {"CPU": 4, "GPU": 1}, {}, {}) self.waitForNodes(1) lm.update(head_ip, { "CPU": 4, "GPU": 1 }, {"GPU": 1}, {}, waiting_bundles=[{ "GPU": 1 }]) autoscaler.update() self.waitForNodes(2) assert self.provider.mock_nodes[1].node_type == "p2.xlarge"
def _setup_autoscaler(self):
    """Build the test fixture: a head node in the mock provider, a
    LoadMetrics bound to its IP, a StandardAutoscaler that performs no
    launches itself, and a manually driven NodeLauncher.

    Assumes self.provider and self.config_path are already set by the
    caller.
    """
    self.runner = MockProcessRunner()
    self.config = yaml.safe_load(open(self.config_path).read())
    # Create the head node ahead of time so the autoscaler sees it as up.
    self.provider.create_node(
        {},
        {
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
            TAG_RAY_USER_NODE_TYPE: self.config["head_node_type"],
        },
        1,
    )
    self.head_ip = self.provider.non_terminated_node_ips({})[0]
    self.load_metrics = LoadMetrics(local_ip=self.head_ip)
    self.autoscaler = StandardAutoscaler(
        self.config_path,
        self.load_metrics,
        # Don't let the autoscaler start any node launchers. Instead, we
        # will launch nodes ourself after every update call.
        max_concurrent_launches=0,
        max_failures=0,
        process_runner=self.runner,
        update_interval_s=0,
    )
    # Manually create a node launcher. Note that we won't start it as a
    # separate thread.
    self.node_launcher = NodeLauncher(
        provider=self.autoscaler.provider,
        queue=self.autoscaler.launch_queue,
        index=0,
        pending=self.autoscaler.pending_launches,
        node_types=self.autoscaler.available_node_types,
    )
def testScaleUpLoadMetrics(self):
    """Waiting and infeasible bundles reported through LoadMetrics drive
    scale-up: a GPU:1 and a CPU:16 demand yield one p2.xlarge and one
    m4.4xlarge."""
    config = MULTI_WORKER_CLUSTER.copy()
    config["min_workers"] = 0
    config["max_workers"] = 50
    config_path = self.write_config(config)
    self.provider = MockProvider()
    runner = MockProcessRunner()
    lm = LoadMetrics()
    autoscaler = StandardAutoscaler(
        config_path,
        lm,
        max_failures=0,
        process_runner=runner,
        update_interval_s=0)
    assert len(self.provider.non_terminated_nodes({})) == 0
    autoscaler.update()
    self.waitForNodes(0)
    autoscaler.update()
    # NOTE(review): the positional arguments here differ in arity/shape
    # from the other lm.update() call sites in this file — verify against
    # the current LoadMetrics.update signature.
    lm.update(
        "1.2.3.4", {},
        True, {},
        True, {},
        waiting_bundles=[{
            "GPU": 1
        }],
        infeasible_bundles=[{
            "CPU": 16
        }])
    autoscaler.update()
    self.waitForNodes(2)
    nodes = {
        self.provider.mock_nodes[0].node_type,
        self.provider.mock_nodes[1].node_type
    }
    assert nodes == {"p2.xlarge", "m4.4xlarge"}
def test_docker_command_runner():
    """DockerCommandRunner must wrap the command in a ``docker exec``
    carried over ssh, with environment variables escaped through every
    nesting level of quoting. The constructor itself must make no process
    calls.

    Fixes a typo in the debug output ("expeted" -> "expected").
    """
    process_runner = MockProcessRunner()
    provider = MockProvider()
    provider.create_node({}, {}, 1)
    cluster_name = "cluster"
    # The ssh ControlPath embeds truncated md5 digests of the user name
    # and the cluster name.
    ssh_control_hash = hashlib.md5(cluster_name.encode()).hexdigest()
    ssh_user_hash = hashlib.md5(getuser().encode()).hexdigest()
    ssh_control_path = "/tmp/ray_ssh_{}/{}".format(ssh_user_hash[:10],
                                                   ssh_control_hash[:10])
    docker_config = {"container_name": "container"}
    args = {
        "log_prefix": "prefix",
        "node_id": 0,
        "provider": provider,
        "auth_config": auth_config,
        "cluster_name": cluster_name,
        "process_runner": process_runner,
        "use_internal_ip": False,
        "docker_config": docker_config,
    }
    cmd_runner = DockerCommandRunner(**args)

    assert len(process_runner.calls) == 0, "No calls should be made in ctor"

    env_vars = {"var1": "quote between this \" and this", "var2": "123"}
    cmd_runner.run("echo hello", environment_variables=env_vars)

    # This string is insane because there are an absurd number of embedded
    # quotes. While this is a ridiculous string, the escape behavior is
    # important and somewhat difficult to get right for environment
    # variables.
    cmd = """'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (docker exec -it  container /bin/bash -c '"'"'bash --login -c -i '"'"'"'"'"'"'"'"'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export var1='"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"quote between this \\" and this"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"';export var2='"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"123"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"';echo hello)'"'"'"'"'"'"'"'"''"'"' )'"""  # noqa: E501

    expected = [
        "ssh",
        "-tt",
        "-i",
        "8265.pem",
        "-o",
        "StrictHostKeyChecking=no",
        "-o",
        "UserKnownHostsFile=/dev/null",
        "-o",
        "IdentitiesOnly=yes",
        "-o",
        "ExitOnForwardFailure=yes",
        "-o",
        "ServerAliveInterval=5",
        "-o",
        "ServerAliveCountMax=3",
        "-o",
        "ControlMaster=auto",
        "-o",
        "ControlPath={}/%C".format(ssh_control_path),
        "-o",
        "ControlPersist=10s",
        "-o",
        "ConnectTimeout=120s",
        "[email protected]",
        "bash",
        "--login",
        "-c",
        "-i",
        cmd,
    ]

    # Much easier to debug this loop than the function call.
    for x, y in zip(process_runner.calls[0], expected):
        print(f"expected:\t{y}")
        print(f"actual: \t{x}")
        assert x == y
    process_runner.assert_has_call("1.2.3.4", exact=expected)
def testScaleUpMinWorkers(self):
    """Cluster-level and per-node-type min_workers are both honored, and
    the idle timeout never terminates the nodes needed to satisfy those
    minimums."""
    config = copy.deepcopy(MULTI_WORKER_CLUSTER)
    config["min_workers"] = 2
    config["max_workers"] = 50
    config["idle_timeout_minutes"] = 1
    # Since config["min_workers"] > 1, the remaining worker is started
    # with the default worker node type.
    config["available_node_types"]["p2.8xlarge"]["min_workers"] = 1
    config_path = self.write_config(config)
    self.provider = MockProvider()
    runner = MockProcessRunner()
    lm = LoadMetrics()
    autoscaler = StandardAutoscaler(
        config_path,
        lm,
        max_failures=0,
        process_runner=runner,
        update_interval_s=0)
    assert len(self.provider.non_terminated_nodes({})) == 0
    autoscaler.update()
    self.waitForNodes(2)
    assert len(self.provider.mock_nodes) == 2
    assert {
        self.provider.mock_nodes[0].node_type,
        self.provider.mock_nodes[1].node_type
    } == {"p2.8xlarge", "m4.large"}
    # Add extra workers beyond the configured minimums; these are the
    # candidates for idle termination below.
    self.provider.create_node({}, {
        TAG_RAY_USER_NODE_TYPE: "p2.8xlarge",
        TAG_RAY_NODE_KIND: NODE_KIND_WORKER
    }, 2)
    self.provider.create_node({}, {
        TAG_RAY_USER_NODE_TYPE: "m4.16xlarge",
        TAG_RAY_NODE_KIND: NODE_KIND_WORKER
    }, 2)
    assert len(self.provider.non_terminated_nodes({})) == 6
    # Make sure that after idle_timeout_minutes we don't kill idle
    # min workers.
    for node_id in self.provider.non_terminated_nodes({}):
        lm.last_used_time_by_ip[self.provider.internal_ip(node_id)] = -60
    autoscaler.update()
    self.waitForNodes(2)

    # Only the two min workers should remain running/pending.
    cnt = 0
    for id in self.provider.mock_nodes:
        if self.provider.mock_nodes[id].state == "running" or \
                self.provider.mock_nodes[id].state == "pending":
            assert self.provider.mock_nodes[id].node_type in {
                "p2.8xlarge", "m4.large"
            }
            cnt += 1
    assert cnt == 2
def testScaleUpMinSanity(self):
    """The autoscaler brings the cluster to two nodes and stays there
    across repeated updates."""
    config_path = self.write_config(MULTI_WORKER_CLUSTER)
    self.provider = MockProvider()
    mock_runner = MockProcessRunner()
    autoscaler = StandardAutoscaler(
        config_path,
        LoadMetrics(),
        max_failures=0,
        process_runner=mock_runner,
        update_interval_s=0)
    assert not self.provider.non_terminated_nodes({})
    # Two update cycles: the first scales up, the second must be a no-op.
    for _ in range(2):
        autoscaler.update()
        self.waitForNodes(2)
def testUpdateConfig(self):
    """Re-writing the config file with min_workers=0 (and a changed
    node_config field) makes the autoscaler scale the cluster back down."""
    config = MULTI_WORKER_CLUSTER.copy()
    config_path = self.write_config(config)
    self.provider = MockProvider()
    mock_runner = MockProcessRunner()
    autoscaler = StandardAutoscaler(
        config_path,
        LoadMetrics(),
        max_failures=0,
        process_runner=mock_runner,
        update_interval_s=0)
    assert not self.provider.non_terminated_nodes({})
    autoscaler.update()
    self.waitForNodes(2)

    # Mutate the config and write it back so the next update reloads it.
    config["min_workers"] = 0
    m4_node_config = config["available_node_types"]["m4.large"][
        "node_config"]
    m4_node_config["field_changed"] = 1
    config_path = self.write_config(config)
    autoscaler.update()
    self.waitForNodes(0)
def test_kubernetes_command_runner():
    """Kubectl-exec command construction plus failure handling: a failing
    command run with exit_on_fail=True must log the failed command once
    and exit with code 1.

    NOTE(review): duplicate name — earlier definitions in this file are
    shadowed at import time; confirm which copy should survive.
    """
    fail_cmd = "fail command"
    # The mock runner is told to fail this specific command.
    process_runner = MockProcessRunner([fail_cmd])
    provider = MockProvider()
    provider.create_node({}, {}, 1)
    args = {
        "log_prefix": "prefix",
        "namespace": "namespace",
        "node_id": 0,
        "auth_config": auth_config,
        "process_runner": process_runner,
    }
    cmd_runner = KubernetesCommandRunner(**args)

    # var1 contains an embedded double quote to exercise the escaping path.
    env_vars = {"var1": "quote between this \" and this", "var2": "123"}
    cmd_runner.run("echo helloo", environment_variables=env_vars)

    expected = [
        "kubectl",
        "-n",
        "namespace",
        "exec",
        "-it",
        "0",
        "--",
        "bash",
        "--login",
        "-c",
        "-i",
        """\'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export var1=\'"\'"\'"quote between this \\" and this"\'"\'"\';export var2=\'"\'"\'"123"\'"\'"\';echo helloo)\'"""  # noqa: E501
    ]
    assert process_runner.calls[0] == " ".join(expected)

    # Run the failing command; the runner must log the full command via
    # logger.error and then exit the process.
    logger = logging.getLogger("ray.autoscaler._private.command_runner")
    with pytest.raises(SystemExit) as pytest_wrapped_e, patch.object(
            logger, "error") as mock_logger_error:
        cmd_runner.run(fail_cmd, exit_on_fail=True)
    failed_cmd_expected = f'prefixCommand failed: \n\n kubectl -n namespace exec -it 0 --\'bash --login -c -i \'"\'"\'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && ({fail_cmd})\'"\'"\'\'\n'  # noqa: E501
    mock_logger_error.assert_called_once_with(failed_cmd_expected)
    assert pytest_wrapped_e.type == SystemExit
    assert pytest_wrapped_e.value.code == 1
def testGetOrCreateMultiNodeType(self):
    """get_or_create_head_node brings up an m4.large head node and runs
    the init/head-setup/ray-start commands on it.

    NOTE(review): an earlier function in this file has this same name; the
    later definition shadows the earlier one — confirm intent.
    """
    config_path = self.write_config(MULTI_WORKER_CLUSTER)
    self.provider = MockProvider()
    runner = MockProcessRunner()
    get_or_create_head_node(
        MULTI_WORKER_CLUSTER,
        config_path,
        no_restart=False,
        restart_only=False,
        yes=True,
        override_cluster_name=None,
        _provider=self.provider,
        _runner=runner)
    self.waitForNodes(1)
    runner.assert_has_call("1.2.3.4", "init_cmd")
    runner.assert_has_call("1.2.3.4", "head_setup_cmd")
    runner.assert_has_call("1.2.3.4", "start_ray_head")
    self.assertEqual(self.provider.mock_nodes[0].node_type, "m4.large")
    self.assertEqual(
        self.provider.mock_nodes[0].node_config.get("FooProperty"), 42)
    self.assertEqual(
        self.provider.mock_nodes[0].node_config.get("TestProp"), 1)
    self.assertEqual(
        self.provider.mock_nodes[0].tags.get(TAG_RAY_USER_NODE_TYPE),
        "m4.large")
def test_ssh_command_runner():
    """SSHCommandRunner must produce the exact ssh argv, including port
    forwarding, a ControlPath derived from md5 hashes of the user and
    cluster name, and shell-escaped environment variables."""
    process_runner = MockProcessRunner()
    provider = MockProvider()
    provider.create_node({}, {}, 1)
    cluster_name = "cluster"
    # The ControlPath embeds truncated md5 digests of the user name and
    # the cluster name.
    ssh_control_hash = hashlib.md5(cluster_name.encode()).hexdigest()
    ssh_user_hash = hashlib.md5(getuser().encode()).hexdigest()
    ssh_control_path = "/tmp/ray_ssh_{}/{}".format(ssh_user_hash[:10],
                                                   ssh_control_hash[:10])
    args = {
        "log_prefix": "prefix",
        "node_id": 0,
        "provider": provider,
        "auth_config": auth_config,
        "cluster_name": cluster_name,
        "process_runner": process_runner,
        "use_internal_ip": False,
    }
    cmd_runner = SSHCommandRunner(**args)

    # var1 contains an embedded double quote to exercise the escaping path.
    env_vars = {"var1": "quote between this \" and this", "var2": "123"}
    cmd_runner.run(
        "echo helloo",
        port_forward=[(8265, 8265)],
        environment_variables=env_vars)

    expected = [
        "ssh",
        "-tt",
        "-L",
        "8265:localhost:8265",
        "-i",
        "8265.pem",
        "-o",
        "StrictHostKeyChecking=no",
        "-o",
        "UserKnownHostsFile=/dev/null",
        "-o",
        "IdentitiesOnly=yes",
        "-o",
        "ExitOnForwardFailure=yes",
        "-o",
        "ServerAliveInterval=5",
        "-o",
        "ServerAliveCountMax=3",
        "-o",
        "ControlMaster=auto",
        "-o",
        "ControlPath={}/%C".format(ssh_control_path),
        "-o",
        "ControlPersist=10s",
        "-o",
        "ConnectTimeout=120s",
        "[email protected]",
        "bash",
        "--login",
        "-c",
        "-i",
        """'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export var1='"'"'"quote between this \\" and this"'"'"';export var2='"'"'"123"'"'"';echo helloo)'"""  # noqa: E501
    ]

    # Much easier to debug this loop than the function call.
    for x, y in zip(process_runner.calls[0], expected):
        assert x == y
    process_runner.assert_has_call("1.2.3.4", exact=expected)
def testCommandPassing(self):
    """Per-node-type setup/initialization command overrides replace the
    cluster-wide defaults on workers of that type."""
    t = "custom"
    config = MULTI_WORKER_CLUSTER.copy()
    config["available_node_types"]["p2.8xlarge"][
        "worker_setup_commands"] = ["new_worker_setup_command"]
    config["available_node_types"]["p2.xlarge"][
        "initialization_commands"] = ["new_worker_initialization_cmd"]
    config["available_node_types"]["p2.xlarge"]["resources"][t] = 1
    # Commenting out this line causes the test case to fail?!?!
    config["min_workers"] = 0
    config["max_workers"] = 10
    config_path = self.write_config(config)
    self.provider = MockProvider()
    runner = MockProcessRunner()
    autoscaler = StandardAutoscaler(
        config_path,
        LoadMetrics(),
        max_failures=0,
        process_runner=runner,
        update_interval_s=0)
    assert len(self.provider.non_terminated_nodes({})) == 0
    autoscaler.update()
    self.waitForNodes(0)
    autoscaler.request_resources([{"CPU": 1}])
    autoscaler.update()
    self.waitForNodes(1)
    assert self.provider.mock_nodes[0].node_type == "m4.large"
    autoscaler.request_resources([{"GPU": 8}])
    autoscaler.update()
    self.waitForNodes(2)
    assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
    autoscaler.request_resources([{"GPU": 1}] * 9)
    autoscaler.update()
    self.waitForNodes(3)
    assert self.provider.mock_nodes[2].node_type == "p2.xlarge"
    autoscaler.update()
    # Give the updater thread time to issue its commands.
    sleep(0.1)
    # p2.8xlarge worker uses its overridden setup command, not defaults.
    runner.assert_has_call(self.provider.mock_nodes[1].internal_ip,
                           "new_worker_setup_command")
    runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                               "setup_cmd")
    runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                               "worker_setup_cmd")
    # p2.xlarge worker uses its overridden initialization command.
    runner.assert_has_call(self.provider.mock_nodes[2].internal_ip,
                           "new_worker_initialization_cmd")
    runner.assert_not_has_call(self.provider.mock_nodes[2].internal_ip,
                               "init_cmd")
def testDockerWorkers(self):
    """Per-node-type docker overrides (worker_image, worker_run_options)
    are applied to workers of the matching type, while other workers fall
    back to the top-level docker settings."""
    config = MULTI_WORKER_CLUSTER.copy()
    config["available_node_types"]["p2.8xlarge"]["docker"] = {
        "worker_image": "p2.8x_image:latest",
        "worker_run_options": ["p2.8x-run-options"]
    }
    config["available_node_types"]["p2.xlarge"]["docker"] = {
        "worker_image": "p2x_image:nightly"
    }
    config["docker"]["worker_run_options"] = ["standard-run-options"]
    config["docker"]["image"] = "default-image:nightly"
    config["docker"]["worker_image"] = "default-image:nightly"
    # Commenting out this line causes the test case to fail?!?!
    config["min_workers"] = 0
    config["max_workers"] = 10
    config_path = self.write_config(config)
    self.provider = MockProvider()
    runner = MockProcessRunner()
    autoscaler = StandardAutoscaler(
        config_path,
        LoadMetrics(),
        max_failures=0,
        process_runner=runner,
        update_interval_s=0)
    assert len(self.provider.non_terminated_nodes({})) == 0
    autoscaler.update()
    self.waitForNodes(0)
    autoscaler.request_resources([{"CPU": 1}])
    autoscaler.update()
    self.waitForNodes(1)
    assert self.provider.mock_nodes[0].node_type == "m4.large"
    autoscaler.request_resources([{"GPU": 8}])
    autoscaler.update()
    self.waitForNodes(2)
    assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
    autoscaler.request_resources([{"GPU": 1}] * 9)
    autoscaler.update()
    self.waitForNodes(3)
    assert self.provider.mock_nodes[2].node_type == "p2.xlarge"
    autoscaler.update()
    # Fill up m4, p2.8, p2 and request 2 more CPUs
    autoscaler.request_resources([{
        "CPU": 2
    }, {
        "CPU": 16
    }, {
        "CPU": 32
    }, {
        "CPU": 2
    }])
    autoscaler.update()
    self.waitForNodes(4)
    assert self.provider.mock_nodes[3].node_type == "m4.16xlarge"
    autoscaler.update()
    # Give the updater thread time to issue its commands.
    sleep(0.1)
    # p2.8xlarge worker: type-specific image and run options, no defaults.
    runner.assert_has_call(self.provider.mock_nodes[1].internal_ip,
                           "p2.8x-run-options")
    runner.assert_has_call(self.provider.mock_nodes[1].internal_ip,
                           "p2.8x_image:latest")
    runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                               "default-image:nightly")
    runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                               "standard-run-options")

    # p2.xlarge worker: type-specific image, default run options.
    runner.assert_has_call(self.provider.mock_nodes[2].internal_ip,
                           "p2x_image:nightly")
    runner.assert_has_call(self.provider.mock_nodes[2].internal_ip,
                           "standard-run-options")
    runner.assert_not_has_call(self.provider.mock_nodes[2].internal_ip,
                               "p2.8x-run-options")
    # m4.16xlarge worker: falls back to the top-level docker settings.
    runner.assert_has_call(self.provider.mock_nodes[3].internal_ip,
                           "default-image:nightly")
    runner.assert_has_call(self.provider.mock_nodes[3].internal_ip,
                           "standard-run-options")
    runner.assert_not_has_call(self.provider.mock_nodes[3].internal_ip,
                               "p2.8x-run-options")
    runner.assert_not_has_call(self.provider.mock_nodes[3].internal_ip,
                               "p2x_image:nightly")
def test_docker_rsync():
    """rsync through DockerCommandRunner: with docker_mount_if_possible the
    transfer targets the host-side mount location and uses no `docker cp`;
    without it, the file is rsynced to the host path and then moved with
    `docker cp` (and symmetrically for rsync-down)."""
    process_runner = MockProcessRunner()
    provider = MockProvider()
    provider.create_node({}, {}, 1)
    cluster_name = "cluster"
    docker_config = {"container_name": "container"}
    args = {
        "log_prefix": "prefix",
        "node_id": 0,
        "provider": provider,
        "auth_config": auth_config,
        "cluster_name": cluster_name,
        "process_runner": process_runner,
        "use_internal_ip": False,
        "docker_config": docker_config,
    }
    cmd_runner = DockerCommandRunner(**args)

    local_mount = "/home/ubuntu/base/mount/"
    remote_mount = "/root/protected_mount/"
    docker_mount_prefix = get_docker_host_mount_location(cluster_name)
    # Host-side path backing the in-container mount.
    remote_host_mount = f"{docker_mount_prefix}{remote_mount}"

    local_file = "/home/ubuntu/base-file"
    remote_file = "/root/protected-file"
    remote_host_file = f"{docker_mount_prefix}{remote_file}"

    process_runner.respond_to_call("docker inspect -f", ["true"])
    cmd_runner.run_rsync_up(
        local_mount, remote_mount, options={"docker_mount_if_possible": True})

    # Make sure we do not copy directly to raw destination
    process_runner.assert_not_has_call(
        "1.2.3.4", pattern=f"-avz {local_mount} [email protected]:{remote_mount}")
    process_runner.assert_not_has_call(
        "1.2.3.4", pattern=f"mkdir -p {remote_mount}")
    # No docker cp for file_mounts
    process_runner.assert_not_has_call("1.2.3.4", pattern="docker cp")
    process_runner.assert_has_call(
        "1.2.3.4",
        pattern=f"-avz {local_mount} [email protected]:{remote_host_mount}")
    process_runner.clear_history()
    ##############################
    process_runner.respond_to_call("docker inspect -f", ["true"])
    cmd_runner.run_rsync_up(
        local_file, remote_file, options={"docker_mount_if_possible": False})

    # Make sure we do not copy directly to raw destination
    process_runner.assert_not_has_call(
        "1.2.3.4", pattern=f"-avz {local_file} [email protected]:{remote_file}")
    process_runner.assert_not_has_call(
        "1.2.3.4", pattern=f"mkdir -p {remote_file}")
    process_runner.assert_has_call("1.2.3.4", pattern="docker cp")
    process_runner.assert_has_call(
        "1.2.3.4",
        pattern=f"-avz {local_file} [email protected]:{remote_host_file}")
    process_runner.clear_history()
    ##############################
    cmd_runner.run_rsync_down(
        remote_mount, local_mount, options={"docker_mount_if_possible": True})

    process_runner.assert_not_has_call("1.2.3.4", pattern="docker cp")
    process_runner.assert_not_has_call(
        "1.2.3.4", pattern=f"-avz [email protected]:{remote_mount} {local_mount}")
    process_runner.assert_has_call(
        "1.2.3.4",
        pattern=f"-avz [email protected]:{remote_host_mount} {local_mount}")
    process_runner.clear_history()
    ##############################
    cmd_runner.run_rsync_down(
        remote_file, local_file, options={"docker_mount_if_possible": False})

    process_runner.assert_has_call("1.2.3.4", pattern="docker cp")
    process_runner.assert_not_has_call(
        "1.2.3.4", pattern=f"-avz [email protected]:{remote_file} {local_file}")
    process_runner.assert_has_call(
        "1.2.3.4",
        pattern=f"-avz [email protected]:{remote_host_file} {local_file}")
def testResourcePassing(self):
    """Per-node-type resources from the config show up in the node setup
    commands via the RAY_OVERRIDE_RESOURCES environment variable."""
    config = MULTI_WORKER_CLUSTER.copy()
    config["min_workers"] = 0
    config["max_workers"] = 50
    config_path = self.write_config(config)
    self.provider = MockProvider()
    runner = MockProcessRunner()
    autoscaler = StandardAutoscaler(
        config_path,
        LoadMetrics(),
        max_failures=0,
        process_runner=runner,
        update_interval_s=0)
    assert len(self.provider.non_terminated_nodes({})) == 0
    autoscaler.update()
    self.waitForNodes(0)
    autoscaler.request_resources([{"CPU": 1}])
    autoscaler.update()
    self.waitForNodes(1)
    assert self.provider.mock_nodes[0].node_type == "m4.large"
    autoscaler.request_resources([{"GPU": 8}])
    autoscaler.update()
    self.waitForNodes(2)
    assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"

    # TODO (Alex): Autoscaler creates the node during one update then
    # starts the updater in the next update. The sleep is largely
    # unavoidable because the updater runs in its own thread and we have no
    # good way of ensuring that the commands are sent in time.
    autoscaler.update()
    sleep(0.1)

    # These checks are done separately because we have no guarantees on the
    # order the dict is serialized in.
    runner.assert_has_call("172.0.0.0", "RAY_OVERRIDE_RESOURCES=")
    runner.assert_has_call("172.0.0.0", "CPU: 2")
    runner.assert_has_call("172.0.0.1", "RAY_OVERRIDE_RESOURCES=")
    runner.assert_has_call("172.0.0.1", "CPU: 32")
    runner.assert_has_call("172.0.0.1", "GPU: 8")
def testPlacementGroup(self): # Note this is mostly an integration test. See # testPlacementGroupScaling for more comprehensive tests. config = copy.deepcopy(MULTI_WORKER_CLUSTER) config["min_workers"] = 0 config["max_workers"] = 999 config_path = self.write_config(config) self.provider = MockProvider() runner = MockProcessRunner() lm = LoadMetrics() autoscaler = StandardAutoscaler(config_path, lm, max_failures=0, process_runner=runner, update_interval_s=0) self.provider.create_node({}, { TAG_RAY_NODE_KIND: "head", TAG_RAY_USER_NODE_TYPE: "m4.4xlarge" }, 1) head_ip = self.provider.non_terminated_node_ips({})[0] assert len(self.provider.non_terminated_nodes({})) == 1 autoscaler.update() self.waitForNodes(1) pending_placement_groups = [ PlacementGroupTableData( state=PlacementGroupTableData.RESCHEDULING, strategy=PlacementStrategy.STRICT_SPREAD, bundles=[Bundle(unit_resources={"GPU": 2})] * 3), PlacementGroupTableData( state=PlacementGroupTableData.RESCHEDULING, strategy=PlacementStrategy.PACK, bundles=([Bundle(unit_resources={"GPU": 2})] * 5)), ] # Since placement groups are implemented with custom resources, this is # an example of the accompanying resource demands. Note the resource # demand autoscaler will be unable to fulfill these demands, but we # should still handle the other infeasible/waiting bundles. 
placement_group_resource_demands = [{ "GPU_group_0_6c2506ac733bc37496295b02c4fad446": 0.0101, "GPU_group_6c2506ac733bc37496295b02c4fad446": 0.0101 }] lm.update(head_ip, {"CPU": 16}, True, {"CPU": 16}, False, {}, infeasible_bundles=placement_group_resource_demands, waiting_bundles=[{ "GPU": 8 }], pending_placement_groups=pending_placement_groups) autoscaler.update() self.waitForNodes(5) for i in range(1, 5): assert self.provider.mock_nodes[i].node_type == "p2.8xlarge" pending_placement_groups = [ PlacementGroupTableData( state=PlacementGroupTableData.RESCHEDULING, strategy=PlacementStrategy.STRICT_PACK, bundles=([Bundle(unit_resources={"GPU": 2})] * 4)), PlacementGroupTableData( state=PlacementGroupTableData.RESCHEDULING, strategy=PlacementStrategy.SPREAD, bundles=([Bundle(unit_resources={"GPU": 2})] * 2)), ]