def test_docker_shm_override(run_option_type):
    """Check that a user-supplied ``--shm-size`` run option is honoured.

    Args:
        run_option_type: Docker config key under which the run-options list
            is stored; supplied by the caller.
    """
    fake_runner = MockProcessRunner()
    fake_provider = MockProvider()
    fake_provider.create_node({}, {}, 1)

    docker_runner = DockerCommandRunner(
        log_prefix="prefix",
        node_id=0,
        provider=fake_provider,
        auth_config=auth_config,
        cluster_name="cluster",
        process_runner=fake_runner,
        use_internal_ip=False,
        docker_config={
            "container_name": "container",
            "image": "rayproject/ray:latest",
            run_option_type: ["--shm-size=80g"],
        },
    )

    # Two canned "[]" replies are queued for the env inspection call.
    fake_runner.respond_to_call("json .Config.Env", ["[]", "[]"])
    docker_runner.run_init(as_head=True, file_mounts={}, sync_run_yet=True)

    # The SHM size given by the user must show up in the issued commands ...
    fake_runner.assert_has_call("1.2.3.4", pattern="--shm-size=80g")
    # ... and the auto-detection path (reading /proc/meminfo) must be skipped.
    fake_runner.assert_not_has_call("1.2.3.4", pattern="/proc/meminfo")
def test_kubernetes_command_runner():
    """Check the exact ``kubectl exec`` argv built for a command with env vars.

    The environment variable values contain an embedded double quote so the
    shell-escaping of the generated command line is exercised.
    """
    fake_runner = MockProcessRunner()
    fake_provider = MockProvider()
    fake_provider.create_node({}, {}, 1)

    k8s_runner = KubernetesCommandRunner(
        log_prefix="prefix",
        namespace="namespace",
        node_id=0,
        auth_config=auth_config,
        process_runner=fake_runner,
    )

    k8s_runner.run(
        "echo helloo",
        environment_variables={
            "var1": "quote between this \" and this",
            "var2": "123"
        })

    kubectl_prefix = ["kubectl", "-n", "namespace", "exec", "-it", "0", "--"]
    shell_prefix = ["bash", "--login", "-c", "-i"]
    shell_payload = """\'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export var1=\'"\'"\'"quote between this \\" and this"\'"\'"\';export var2=\'"\'"\'"123"\'"\'"\';echo helloo)\'"""  # noqa: E501
    expected = kubectl_prefix + shell_prefix + [shell_payload]

    # Compare element by element first: a single mismatching argument is much
    # easier to spot here than in the failure of the exact-match check below.
    for actual_arg, expected_arg in zip(fake_runner.calls[0], expected):
        assert actual_arg == expected_arg

    fake_runner.assert_has_call("1.2.3.4", exact=expected)
def test_rsync_exclude_and_filter():
    """Check that rsync exclude/filter options reach the rsync command line."""
    fake_runner = MockProcessRunner()
    fake_provider = MockProvider()
    fake_provider.create_node({}, {}, 1)

    ssh_runner = SSHCommandRunner(
        log_prefix="prefix",
        node_id=0,
        provider=fake_provider,
        auth_config=auth_config,
        cluster_name="cluster",
        process_runner=fake_runner,
        use_internal_ip=False,
    )

    source_dir = "/home/ubuntu/base/mount/"
    target_dir = "/root/protected_mount/"
    rsync_options = {
        "docker_mount_if_possible": True,
        "rsync_exclude": ["test"],
        "rsync_filter": [".ignore"],
    }

    fake_runner.respond_to_call("docker inspect -f", ["true"])
    ssh_runner.run_rsync_up(source_dir, target_dir, options=rsync_options)

    # Both the exclude pattern and the dir-merge filter must be present.
    fake_runner.assert_has_call(
        "1.2.3.4", pattern="--exclude test --filter dir-merge,- .ignore")
def testCommandPassing(self):
    """Check that per-node-type setup/initialization commands are used.

    ``p2.8xlarge`` gets its own ``worker_setup_commands`` and ``p2.xlarge``
    its own ``initialization_commands``; the test asserts that the overriding
    commands are sent to those nodes and the default ones are not.
    """
    # Local import so this block does not depend on the file's import header.
    import copy

    t = "custom"
    # Deep copy: the assignments below write into nested dictionaries. With a
    # shallow ``.copy()`` they would modify the shared MULTI_WORKER_CLUSTER
    # object and leak into every other test that reads it.
    config = copy.deepcopy(MULTI_WORKER_CLUSTER)
    config["available_node_types"]["p2.8xlarge"][
        "worker_setup_commands"] = ["new_worker_setup_command"]
    config["available_node_types"]["p2.xlarge"][
        "initialization_commands"] = ["new_worker_initialization_cmd"]
    config["available_node_types"]["p2.xlarge"]["resources"][t] = 1
    # Commenting out this line causes the test case to fail?!?!
    config["min_workers"] = 0
    config["max_workers"] = 10
    config_path = self.write_config(config)
    self.provider = MockProvider()
    runner = MockProcessRunner()
    autoscaler = StandardAutoscaler(
        config_path,
        LoadMetrics(),
        max_failures=0,
        process_runner=runner,
        update_interval_s=0)
    assert len(self.provider.non_terminated_nodes({})) == 0
    autoscaler.update()
    self.waitForNodes(0)

    # One CPU request -> first node.
    autoscaler.request_resources([{"CPU": 1}])
    autoscaler.update()
    self.waitForNodes(1)
    assert self.provider.mock_nodes[0].node_type == "m4.large"

    # An 8-GPU request -> second node.
    autoscaler.request_resources([{"GPU": 8}])
    autoscaler.update()
    self.waitForNodes(2)
    assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"

    # Nine single-GPU requests -> third node.
    autoscaler.request_resources([{"GPU": 1}] * 9)
    autoscaler.update()
    self.waitForNodes(3)
    assert self.provider.mock_nodes[2].node_type == "p2.xlarge"

    # Extra update plus a short sleep before inspecting the recorded calls.
    autoscaler.update()
    sleep(0.1)

    runner.assert_has_call(self.provider.mock_nodes[1].internal_ip,
                           "new_worker_setup_command")
    runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                               "setup_cmd")
    runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                               "worker_setup_cmd")
    runner.assert_has_call(self.provider.mock_nodes[2].internal_ip,
                           "new_worker_initialization_cmd")
    runner.assert_not_has_call(self.provider.mock_nodes[2].internal_ip,
                               "init_cmd")
def test_docker_command_runner():
    """Check the exact SSH argv produced when running a command in docker.

    The command is run with environment variables whose values contain an
    embedded double quote, so the nested shell quoting (ssh -> bash ->
    docker exec -> bash) is pinned by the expected string below.
    """
    process_runner = MockProcessRunner()
    provider = MockProvider()
    provider.create_node({}, {}, 1)
    cluster_name = "cluster"
    # The control path is derived from md5 prefixes of the user name and the
    # cluster name; recompute it here so the expected ControlPath matches.
    ssh_control_hash = hashlib.md5(cluster_name.encode()).hexdigest()
    ssh_user_hash = hashlib.md5(getuser().encode()).hexdigest()
    ssh_control_path = "/tmp/ray_ssh_{}/{}".format(ssh_user_hash[:10],
                                                   ssh_control_hash[:10])
    docker_config = {"container_name": "container"}
    args = {
        "log_prefix": "prefix",
        "node_id": 0,
        "provider": provider,
        "auth_config": auth_config,
        "cluster_name": cluster_name,
        "process_runner": process_runner,
        "use_internal_ip": False,
        "docker_config": docker_config,
    }
    cmd_runner = DockerCommandRunner(**args)
    assert len(process_runner.calls) == 0, "No calls should be made in ctor"

    env_vars = {"var1": "quote between this \" and this", "var2": "123"}
    cmd_runner.run("echo hello", environment_variables=env_vars)

    # This string is insane because there are an absurd number of embedded
    # quotes. While this is a ridiculous string, the escape behavior is
    # important and somewhat difficult to get right for environment variables.
    cmd = """'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (docker exec -it container /bin/bash -c '"'"'bash --login -c -i '"'"'"'"'"'"'"'"'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export var1='"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"quote between this \\" and this"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"';export var2='"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"123"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"';echo hello)'"'"'"'"'"'"'"'"''"'"' )'"""  # noqa: E501

    # NOTE(review): "[email protected]" below looks like an e-mail-obfuscation
    # artifact of a "<ssh_user>@1.2.3.4" target - confirm against auth_config.
    expected = [
        "ssh", "-tt", "-i", "8265.pem", "-o", "StrictHostKeyChecking=no", "-o",
        "UserKnownHostsFile=/dev/null", "-o", "IdentitiesOnly=yes", "-o",
        "ExitOnForwardFailure=yes", "-o", "ServerAliveInterval=5", "-o",
        "ServerAliveCountMax=3", "-o", "ControlMaster=auto", "-o",
        "ControlPath={}/%C".format(ssh_control_path), "-o",
        "ControlPersist=10s", "-o", "ConnectTimeout=120s", "[email protected]",
        "bash", "--login", "-c", "-i", cmd
    ]
    # Much easier to debug this loop than the function call.
    for x, y in zip(process_runner.calls[0], expected):
        # Label typo fixed ("expeted" -> "expected") so the debug output is
        # searchable and lines up with the "actual" row.
        print(f"expected:\t{y}")
        print(f"actual: \t{x}")
        assert x == y
    process_runner.assert_has_call("1.2.3.4", exact=expected)
def testGetOrCreateMultiNodeType(self):
    """Check head-node creation for the multi-node-type cluster config.

    Expects the head node to run the init, setup and start commands and to
    be created with node type ``empty_node``.
    """
    patched_config = MULTI_WORKER_CLUSTER.copy()
    # Commenting out this line causes the test case to fail?!?!
    patched_config["min_workers"] = 0
    # NOTE(review): the path returned by this first write is never used; only
    # the unmodified MULTI_WORKER_CLUSTER written next is. Both writes are
    # kept in their original order - confirm whether the first is needed.
    self.write_config(patched_config)
    config_path = self.write_config(MULTI_WORKER_CLUSTER)

    self.provider = MockProvider()
    runner = MockProcessRunner()
    get_or_create_head_node(
        MULTI_WORKER_CLUSTER,
        config_path,
        no_restart=False,
        restart_only=False,
        yes=True,
        override_cluster_name=None,
        _provider=self.provider,
        _runner=runner)
    self.waitForNodes(1)

    for expected_cmd in ("init_cmd", "setup_cmd", "start_ray_head"):
        runner.assert_has_call("1.2.3.4", expected_cmd)

    head_node = self.provider.mock_nodes[0]
    self.assertEqual(head_node.node_type, "empty_node")
    self.assertEqual(head_node.node_config.get("FooProperty"), 42)
    self.assertEqual(head_node.node_config.get("TestProp"), 1)
    self.assertEqual(
        head_node.tags.get(TAG_RAY_USER_NODE_TYPE), "empty_node")
def testResourcePassing(self):
    """Check that node resources are passed via ``RAY_OVERRIDE_RESOURCES``.

    Scales up to one ``m4.large`` and one ``p2.8xlarge`` node, then asserts
    that the commands sent to each node carry the expected resource entries.
    """
    cluster_config = MULTI_WORKER_CLUSTER.copy()
    cluster_config["min_workers"] = 0
    cluster_config["max_workers"] = 50
    config_path = self.write_config(cluster_config)

    self.provider = MockProvider()
    runner = MockProcessRunner()
    autoscaler = StandardAutoscaler(
        config_path,
        LoadMetrics(),
        max_failures=0,
        process_runner=runner,
        update_interval_s=0)

    assert len(self.provider.non_terminated_nodes({})) == 0
    autoscaler.update()
    self.waitForNodes(0)

    # (resource demand, expected node count, expected type of newest node)
    scale_up_steps = [
        ([{"CPU": 1}], 1, "m4.large"),
        ([{"GPU": 8}], 2, "p2.8xlarge"),
    ]
    for demand, node_count, newest_type in scale_up_steps:
        autoscaler.request_resources(demand)
        autoscaler.update()
        self.waitForNodes(node_count)
        newest = self.provider.mock_nodes[node_count - 1]
        assert newest.node_type == newest_type

    # TODO (Alex): Autoscaler creates the node during one update then
    # starts the updater in the next update. The sleep is largely
    # unavoidable because the updater runs in its own thread and we have no
    # good way of ensuring that the commands are sent in time.
    autoscaler.update()
    sleep(0.1)

    # Each fragment is checked on its own because the serialization order of
    # the resources dict is not guaranteed.
    expected_fragments = [
        ("172.0.0.0", "RAY_OVERRIDE_RESOURCES="),
        ("172.0.0.0", "\"CPU\":2"),
        ("172.0.0.1", "RAY_OVERRIDE_RESOURCES="),
        ("172.0.0.1", "\"CPU\":32"),
        ("172.0.0.1", "\"GPU\":8"),
    ]
    for node_ip, fragment in expected_fragments:
        runner.assert_has_call(node_ip, fragment)
def testGetOrCreateMultiNodeType(self):
    """Check head-node creation for the multi-node-type cluster config.

    Expects the head node to run the init, head-setup and start commands
    and to be created with node type ``m4.large``.
    """
    config_path = self.write_config(MULTI_WORKER_CLUSTER)
    self.provider = MockProvider()
    runner = MockProcessRunner()

    get_or_create_head_node(
        MULTI_WORKER_CLUSTER,
        config_path,
        no_restart=False,
        restart_only=False,
        yes=True,
        override_cluster_name=None,
        _provider=self.provider,
        _runner=runner)
    self.waitForNodes(1)

    for expected_cmd in ("init_cmd", "head_setup_cmd", "start_ray_head"):
        runner.assert_has_call("1.2.3.4", expected_cmd)

    head_node = self.provider.mock_nodes[0]
    self.assertEqual(head_node.node_type, "m4.large")
    self.assertEqual(head_node.node_config.get("FooProperty"), 42)
    self.assertEqual(head_node.node_config.get("TestProp"), 1)
    self.assertEqual(
        head_node.tags.get(TAG_RAY_USER_NODE_TYPE), "m4.large")
def test_ssh_command_runner():
    """Check the exact SSH argv built for a command with port forwarding.

    The environment variable values contain an embedded double quote so the
    shell-escaping of the remote command is pinned as well.
    """
    fake_runner = MockProcessRunner()
    fake_provider = MockProvider()
    fake_provider.create_node({}, {}, 1)

    cluster_name = "cluster"
    # Recompute the control path from md5 prefixes of the user and cluster
    # names so the expected ControlPath option can be spelled out.
    cluster_digest = hashlib.md5(cluster_name.encode()).hexdigest()
    user_digest = hashlib.md5(getuser().encode()).hexdigest()
    ssh_control_path = "/tmp/ray_ssh_{}/{}".format(user_digest[:10],
                                                   cluster_digest[:10])

    ssh_runner = SSHCommandRunner(
        log_prefix="prefix",
        node_id=0,
        provider=fake_provider,
        auth_config=auth_config,
        cluster_name=cluster_name,
        process_runner=fake_runner,
        use_internal_ip=False,
    )

    ssh_runner.run(
        "echo helloo",
        port_forward=[(8265, 8265)],
        environment_variables={
            "var1": "quote between this \" and this",
            "var2": "123"
        })

    ssh_options = [
        "StrictHostKeyChecking=no",
        "UserKnownHostsFile=/dev/null",
        "IdentitiesOnly=yes",
        "ExitOnForwardFailure=yes",
        "ServerAliveInterval=5",
        "ServerAliveCountMax=3",
        "ControlMaster=auto",
        "ControlPath={}/%C".format(ssh_control_path),
        "ControlPersist=10s",
        "ConnectTimeout=120s",
    ]
    remote_cmd = """'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export var1='"'"'"quote between this \\" and this"'"'"';export var2='"'"'"123"'"'"';echo helloo)'"""  # noqa: E501

    expected = ["ssh", "-tt", "-L", "8265:localhost:8265", "-i", "8265.pem"]
    for option in ssh_options:
        expected += ["-o", option]
    # NOTE(review): "[email protected]" looks like an e-mail-obfuscation
    # artifact of a "<ssh_user>@1.2.3.4" target - confirm against auth_config.
    expected += ["[email protected]", "bash", "--login", "-c", "-i", remote_cmd]

    # Compare element by element first: a single mismatching argument is much
    # easier to spot here than in the failure of the exact-match check below.
    for actual_arg, expected_arg in zip(fake_runner.calls[0], expected):
        assert actual_arg == expected_arg

    fake_runner.assert_has_call("1.2.3.4", exact=expected)
def test_docker_rsync():
    """Check rsync up/down through the docker runner, with and without mounts.

    Four scenarios are run in sequence (up/down x mount/no-mount). Each one
    asserts that rsync targets the host-side mount location instead of the
    raw container path, and whether ``docker cp`` is issued.

    NOTE(review): the "[email protected]" fragments in the patterns look like
    an e-mail-obfuscation artifact of a "<ssh_user>@1.2.3.4" target -
    confirm against auth_config.
    """
    runner = MockProcessRunner()
    node_provider = MockProvider()
    node_provider.create_node({}, {}, 1)
    cluster_name = "cluster"

    cmd_runner = DockerCommandRunner(
        log_prefix="prefix",
        node_id=0,
        provider=node_provider,
        auth_config=auth_config,
        cluster_name=cluster_name,
        process_runner=runner,
        use_internal_ip=False,
        docker_config={"container_name": "container"},
    )

    # Directory pair and file pair used by the scenarios below.
    local_mount = "/home/ubuntu/base/mount/"
    remote_mount = "/root/protected_mount/"
    local_file = "/home/ubuntu/base-file"
    remote_file = "/root/protected-file"

    # Host-side staging locations that correspond to the container paths.
    docker_mount_prefix = get_docker_host_mount_location(cluster_name)
    remote_host_mount = f"{docker_mount_prefix}{remote_mount}"
    remote_host_file = f"{docker_mount_prefix}{remote_file}"

    # --- rsync up, directory, mount allowed -------------------------------
    runner.respond_to_call("docker inspect -f", ["true"])
    cmd_runner.run_rsync_up(
        local_mount, remote_mount, options={"docker_mount_if_possible": True})

    # Make sure we do not copy directly to raw destination
    runner.assert_not_has_call(
        "1.2.3.4", pattern=f"-avz {local_mount} [email protected]:{remote_mount}")
    runner.assert_not_has_call("1.2.3.4", pattern=f"mkdir -p {remote_mount}")
    # No docker cp for file_mounts
    runner.assert_not_has_call("1.2.3.4", pattern="docker cp")
    runner.assert_has_call(
        "1.2.3.4",
        pattern=f"-avz {local_mount} [email protected]:{remote_host_mount}")
    runner.clear_history()

    # --- rsync up, single file, mount not allowed --------------------------
    runner.respond_to_call("docker inspect -f", ["true"])
    cmd_runner.run_rsync_up(
        local_file, remote_file, options={"docker_mount_if_possible": False})

    # Make sure we do not copy directly to raw destination
    runner.assert_not_has_call(
        "1.2.3.4", pattern=f"-avz {local_file} [email protected]:{remote_file}")
    runner.assert_not_has_call("1.2.3.4", pattern=f"mkdir -p {remote_file}")
    runner.assert_has_call("1.2.3.4", pattern="docker cp")
    runner.assert_has_call(
        "1.2.3.4",
        pattern=f"-avz {local_file} [email protected]:{remote_host_file}")
    runner.clear_history()

    # --- rsync down, directory, mount allowed ------------------------------
    cmd_runner.run_rsync_down(
        remote_mount, local_mount, options={"docker_mount_if_possible": True})

    runner.assert_not_has_call("1.2.3.4", pattern="docker cp")
    runner.assert_not_has_call(
        "1.2.3.4", pattern=f"-avz [email protected]:{remote_mount} {local_mount}")
    runner.assert_has_call(
        "1.2.3.4",
        pattern=f"-avz [email protected]:{remote_host_mount} {local_mount}")
    runner.clear_history()

    # --- rsync down, single file, mount not allowed ------------------------
    cmd_runner.run_rsync_down(
        remote_file, local_file, options={"docker_mount_if_possible": False})

    runner.assert_has_call("1.2.3.4", pattern="docker cp")
    runner.assert_not_has_call(
        "1.2.3.4", pattern=f"-avz [email protected]:{remote_file} {local_file}")
    runner.assert_has_call(
        "1.2.3.4",
        pattern=f"-avz [email protected]:{remote_host_file} {local_file}")
def testDockerWorkers(self):
    """Check that per-node-type docker image/run options override defaults.

    ``p2.8xlarge`` overrides both the worker image and the run options,
    ``p2.xlarge`` overrides only the image, and ``m4.16xlarge`` has no
    override; the test asserts which image and options each node receives.
    """
    # Local import so this block does not depend on the file's import header.
    import copy

    # Deep copy: the assignments below write into nested dictionaries
    # ("available_node_types" and "docker"). With a shallow ``.copy()`` they
    # would modify the shared MULTI_WORKER_CLUSTER object and leak into
    # every other test that reads it.
    config = copy.deepcopy(MULTI_WORKER_CLUSTER)
    config["available_node_types"]["p2.8xlarge"]["docker"] = {
        "worker_image": "p2.8x_image:latest",
        "worker_run_options": ["p2.8x-run-options"]
    }
    config["available_node_types"]["p2.xlarge"]["docker"] = {
        "worker_image": "p2x_image:nightly"
    }
    config["docker"]["worker_run_options"] = ["standard-run-options"]
    config["docker"]["image"] = "default-image:nightly"
    config["docker"]["worker_image"] = "default-image:nightly"
    # Commenting out this line causes the test case to fail?!?!
    config["min_workers"] = 0
    config["max_workers"] = 10
    config_path = self.write_config(config)
    self.provider = MockProvider()
    runner = MockProcessRunner()
    autoscaler = StandardAutoscaler(
        config_path,
        LoadMetrics(),
        max_failures=0,
        process_runner=runner,
        update_interval_s=0)
    assert len(self.provider.non_terminated_nodes({})) == 0
    autoscaler.update()
    self.waitForNodes(0)

    autoscaler.request_resources([{"CPU": 1}])
    autoscaler.update()
    self.waitForNodes(1)
    assert self.provider.mock_nodes[0].node_type == "m4.large"

    autoscaler.request_resources([{"GPU": 8}])
    autoscaler.update()
    self.waitForNodes(2)
    assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"

    autoscaler.request_resources([{"GPU": 1}] * 9)
    autoscaler.update()
    self.waitForNodes(3)
    assert self.provider.mock_nodes[2].node_type == "p2.xlarge"
    autoscaler.update()

    # Fill up m4, p2.8, p2 and request 2 more CPUs
    autoscaler.request_resources([{
        "CPU": 2
    }, {
        "CPU": 16
    }, {
        "CPU": 32
    }, {
        "CPU": 2
    }])
    autoscaler.update()
    self.waitForNodes(4)
    assert self.provider.mock_nodes[3].node_type == "m4.16xlarge"

    # Extra update plus a short sleep before inspecting the recorded calls.
    autoscaler.update()
    sleep(0.1)

    # p2.8xlarge: own image and own run options, none of the defaults.
    runner.assert_has_call(self.provider.mock_nodes[1].internal_ip,
                           "p2.8x-run-options")
    runner.assert_has_call(self.provider.mock_nodes[1].internal_ip,
                           "p2.8x_image:latest")
    runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                               "default-image:nightly")
    runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                               "standard-run-options")

    # p2.xlarge: own image, default run options.
    runner.assert_has_call(self.provider.mock_nodes[2].internal_ip,
                           "p2x_image:nightly")
    runner.assert_has_call(self.provider.mock_nodes[2].internal_ip,
                           "standard-run-options")
    runner.assert_not_has_call(self.provider.mock_nodes[2].internal_ip,
                               "p2.8x-run-options")

    # m4.16xlarge: default image and default run options only.
    runner.assert_has_call(self.provider.mock_nodes[3].internal_ip,
                           "default-image:nightly")
    runner.assert_has_call(self.provider.mock_nodes[3].internal_ip,
                           "standard-run-options")
    runner.assert_not_has_call(self.provider.mock_nodes[3].internal_ip,
                               "p2.8x-run-options")
    runner.assert_not_has_call(self.provider.mock_nodes[3].internal_ip,
                               "p2x_image:nightly")