def test_distributed_job_ssh(args):
    """SSH endpoints work for a distributed job, and the ps/worker roles
    can SSH into each other through the in-job aliases (ps-0, worker-0).

    Args:
        args: harness namespace with rest/config/email/uid/vc fields.
    """
    job_spec = utils.gen_default_job_description("distributed", args.email,
                                                 args.uid, args.vc)
    with utils.run_job(args.rest, job_spec) as job:
        endpoints = utils.create_endpoint(args.rest, args.email, job.jid,
                                          ["ssh"])
        endpoints_ids = list(endpoints.keys())
        # distributed job: one ssh endpoint per role (ps-0 and worker-0)
        assert len(endpoints_ids) == 2

        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        for endpoint_id in endpoints_ids:
            ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                     job.jid, endpoint_id)
            logger.debug("endpoint_id is %s, endpoints resp is %s",
                         endpoint_id, ssh_endpoint)

            ssh_host = "%s.%s" % (ssh_endpoint["nodeName"],
                                  ssh_endpoint["domain"])
            # str() for consistency with the other tests in this module:
            # the endpoint port may be an int, argv entries must be str
            ssh_port = str(ssh_endpoint["port"])

            # exec into jobmanager to execute ssh to avoid firewall
            job_manager_pod = utils.kube_get_pods(args.config, "default",
                                                  "app=jobmanager")[0]
            job_manager_pod_name = job_manager_pod.metadata.name

            alias = args.email.split("@")[0]

            cmd_prefix = [
                "ssh", "-i",
                "/dlwsdata/work/%s/.ssh/id_rsa" % alias, "-p", ssh_port,
                "-o", "StrictHostKeyChecking=no", "-o", "LogLevel=ERROR",
                "%s@%s" % (alias, ssh_host), "--",
            ]

            # check they can connect to each other
            for role in ["ps-0", "worker-0"]:
                cmd = copy.deepcopy(cmd_prefix)
                cmd.extend([
                    "ssh", role, "-o", "LogLevel=ERROR", "--", "echo",
                    "dummy"
                ])
                code, output = utils.kube_pod_exec(args.config, "default",
                                                   job_manager_pod_name,
                                                   "jobmanager", cmd)
                logger.debug("code %s, output '%s'", code, output)
                assert code == 0, "code is %s, output is %s" % (code, output)
                assert output == "dummy\n", "output is %s" % (output)
def test_ssh_do_not_expose_private_key(args):
    """The job's SSH private key must not leak into the SSH session.

    Runs ``printenv DLTS_SSH_PRIVATE_KEY`` between two echo markers inside
    the session; if the variable were exported, its value would appear
    between "a" and "b" and the containment check would fail.
    """
    job_spec = utils.gen_default_job_description("regular", args.email,
                                                 args.uid, args.vc)
    with utils.run_job(args.rest, job_spec) as job:
        endpoints = utils.create_endpoint(args.rest, args.email, job.jid,
                                          ["ssh"])
        endpoints_ids = list(endpoints.keys())
        assert len(endpoints_ids) == 1
        endpoint_id = endpoints_ids[0]

        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                 job.jid, endpoint_id)

        ssh_host = "%s.%s" % (ssh_endpoint["nodeName"],
                              ssh_endpoint["domain"])
        # str() for consistency with the other tests: port may be an int
        ssh_port = str(ssh_endpoint["port"])

        # exec into jobmanager to execute ssh to avoid firewall
        job_manager_pod = utils.kube_get_pods(args.config, "default",
                                              "app=jobmanager")[0]
        job_manager_pod_name = job_manager_pod.metadata.name

        alias = args.email.split("@")[0]

        ssh_cmd = [
            "ssh", "-i",
            "/dlwsdata/work/%s/.ssh/id_rsa" % alias, "-p", ssh_port, "-o",
            "StrictHostKeyChecking=no", "-o", "LogLevel=ERROR",
            "%s@%s" % (alias, ssh_host), "--",
            "echo a ; printenv DLTS_SSH_PRIVATE_KEY ; echo b",
        ]

        code, output = utils.kube_pod_exec(args.config, "default",
                                           job_manager_pod_name,
                                           "jobmanager", ssh_cmd)
        # exit code is the trailing echo's, so 0 even if printenv found nothing
        assert code == 0, "code is %s, output is %s" % (code, output)
        # "a\nb" adjacent means printenv printed nothing in between
        expected = "a\nb"
        assert expected in output, "could not find %s in output %s" % (
            expected, output)
def test_ssh_cuda_visible_devices(args, job_spec, expected):
    """CUDA_VISIBLE_DEVICES is visible in an SSH session into the job.

    Parametrized variant: ``job_spec`` and ``expected`` are supplied by
    fixtures/parametrization.

    NOTE(review): a later function in this module reuses the name
    test_ssh_cuda_visible_devices, which shadows this definition at import
    time — confirm both are meant to be collected by pytest.
    """
    with utils.run_job(args.rest, job_spec) as job:
        endpoints = utils.create_endpoint(args.rest, args.email, job.jid,
                                          ["ssh"])
        endpoints_ids = list(endpoints.keys())
        assert len(endpoints_ids) == 1
        endpoint_id = endpoints_ids[0]

        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                 job.jid, endpoint_id)

        ssh_host = "%s.%s" % (ssh_endpoint["nodeName"],
                              ssh_endpoint["domain"])
        # str() for consistency with the other tests: port may be an int
        ssh_port = str(ssh_endpoint["port"])

        # exec into jobmanager to execute ssh to avoid firewall
        job_manager_pod = utils.kube_get_pods(args.config, "default",
                                              "app=jobmanager")[0]
        job_manager_pod_name = job_manager_pod.metadata.name

        alias = args.email.split("@")[0]

        # check both the live environment and the persisted ~/.ssh/environment
        ssh_cmd = [
            "ssh", "-i",
            "/dlwsdata/work/%s/.ssh/id_rsa" % alias, "-p", ssh_port, "-o",
            "StrictHostKeyChecking=no", "-o", "LogLevel=ERROR",
            "%s@%s" % (alias, ssh_host), "--",
            "echo a; env | grep CUDA_VISIBLE_DEVICES;",
            "grep CUDA_VISIBLE_DEVICES ~/.ssh/environment; echo b",
        ]

        code, output = utils.kube_pod_exec(args.config, "default",
                                           job_manager_pod_name,
                                           "jobmanager", ssh_cmd)
        assert code == 0, "code is %s, output is %s" % (code, output)
        assert expected in output, "could not find %s in output %s" % (
            expected, output)
def test_regular_job_ssh(args):
    """SSH endpoint of a regular job accepts the platform-managed key
    and executes a command."""
    job_spec = utils.gen_default_job_description("regular", args.email,
                                                 args.uid, args.vc)
    with utils.run_job(args.rest, job_spec) as job:
        endpoints = utils.create_endpoint(args.rest, args.email, job.jid,
                                          ["ssh"])
        endpoints_ids = list(endpoints.keys())
        assert len(endpoints_ids) == 1
        endpoint_id = endpoints_ids[0]

        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                 job.jid, endpoint_id)
        logger.debug("endpoints resp is %s", ssh_endpoint)

        ssh_host = "%s.%s" % (ssh_endpoint["nodeName"],
                              ssh_endpoint["domain"])
        # str() for consistency with the other tests: port may be an int
        ssh_port = str(ssh_endpoint["port"])

        # exec into jobmanager to execute ssh to avoid firewall
        job_manager_pod = utils.kube_get_pods(args.config, "default",
                                              "app=jobmanager")[0]
        job_manager_pod_name = job_manager_pod.metadata.name

        alias = args.email.split("@")[0]

        cmd = [
            "ssh", "-i",
            "/dlwsdata/work/%s/.ssh/id_rsa" % alias, "-p", ssh_port, "-o",
            "StrictHostKeyChecking=no", "-o", "LogLevel=ERROR",
            "%s@%s" % (alias, ssh_host), "--", "echo", "dummy"
        ]

        code, output = utils.kube_pod_exec(args.config, "default",
                                           job_manager_pod_name,
                                           "jobmanager", cmd)
        assert code == 0, "code is %s, output is %s" % (code, output)
        assert output == "dummy\n", "output is %s" % (output)
def test_fault_tolerance(args):
    """Kill the job's pod and verify the SSH endpoint recovers.

    Waits for the endpoint to go back through "pending" to running after
    pod deletion, then confirms the (possibly relocated) endpoint accepts
    SSH again.
    """
    # Job is only retried when launcher is controller.
    if utils.get_launcher(args.config) == "python":
        return

    job_spec = utils.gen_default_job_description("regular", args.email,
                                                 args.uid, args.vc)
    with utils.run_job(args.rest, job_spec) as job:
        endpoints = utils.create_endpoint(args.rest, args.email, job.jid,
                                          ["ssh"])
        endpoints_ids = list(endpoints.keys())
        assert len(endpoints_ids) == 1
        endpoint_id = endpoints_ids[0]

        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                 job.jid, endpoint_id)
        ssh_host = "%s.%s" % (ssh_endpoint["nodeName"],
                              ssh_endpoint["domain"])
        ssh_port = str(ssh_endpoint["port"])
        logger.info("current ssh endpoint is %s:%s", ssh_host, ssh_port)

        # kill the job pod to trigger a retry by the controller
        pod = utils.kube_get_pods(args.config, "default",
                                  "jobId=%s" % (job.jid))[0]
        utils.kube_delete_pod(args.config, "default", pod.metadata.name)

        # the endpoint must transition back through pending to running
        ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                 job.jid, endpoint_id,
                                                 state="pending")
        ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                 job.jid, endpoint_id)

        ssh_host = "%s.%s" % (ssh_endpoint["nodeName"],
                              ssh_endpoint["domain"])
        # str() for consistency with the other tests: port may be an int
        ssh_port = str(ssh_endpoint["port"])
        logger.info("current ssh endpoint is %s:%s", ssh_host, ssh_port)

        # exec into jobmanager to execute ssh to avoid firewall
        job_manager_pod = utils.kube_get_pods(args.config, "default",
                                              "app=jobmanager")[0]
        job_manager_pod_name = job_manager_pod.metadata.name

        alias = args.email.split("@")[0]

        cmd = [
            "ssh", "-i",
            "/dlwsdata/work/%s/.ssh/id_rsa" % alias, "-p", ssh_port, "-o",
            "StrictHostKeyChecking=no", "-o", "LogLevel=ERROR",
            "%s@%s" % (alias, ssh_host), "--", "echo", "dummy"
        ]

        code, output = utils.kube_pod_exec(args.config, "default",
                                           job_manager_pod_name,
                                           "jobmanager", cmd)
        assert code == 0, "code is %s, output is %s" % (code, output)
        assert output == "dummy\n", "output is %s" % (output)
def test_regular_job_custom_ssh_key(args):
    """A user-supplied SSH public key grants access to the job.

    Submits a job with data/id_rsa.pub attached, copies the matching
    private key into the jobmanager pod, and connects with it.
    """
    job_spec = utils.gen_default_job_description("regular", args.email,
                                                 args.uid, args.vc)
    with open("data/id_rsa.pub") as f:
        job_spec["ssh_public_keys"] = [f.read()]

    with utils.run_job(args.rest, job_spec) as job:
        endpoints = utils.create_endpoint(args.rest, args.email, job.jid,
                                          ["ssh"])
        endpoints_ids = list(endpoints.keys())
        assert len(endpoints_ids) == 1
        endpoint_id = endpoints_ids[0]

        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                 job.jid, endpoint_id)
        logger.debug("endpoints resp is %s", ssh_endpoint)

        ssh_host = "%s.%s" % (ssh_endpoint["nodeName"],
                              ssh_endpoint["domain"])
        # str() for consistency with the other tests: port may be an int
        ssh_port = str(ssh_endpoint["port"])

        # exec into jobmanager to execute ssh to avoid firewall
        job_manager_pod = utils.kube_get_pods(args.config, "default",
                                              "app=jobmanager")[0]
        job_manager_pod_name = job_manager_pod.metadata.name

        alias = args.email.split("@")[0]

        # write the private key into the jobmanager pod line by line via
        # a generated shell script (kube exec has no file upload)
        dest = "/tmp/test_regular_job_customer_ssh_key"
        script_cmd = []
        with open("data/id_rsa") as f:
            script_cmd.append("rm %s ; " % dest)
            for line in f:
                script_cmd.append("echo")
                script_cmd.append(line.strip())
                script_cmd.append(">> %s ;" % dest)
            # ssh refuses keys with loose permissions
            script_cmd.append("chmod 400 %s ;" % dest)
        cmd = ["sh", "-c", " ".join(script_cmd)]
        code, output = utils.kube_pod_exec(args.config, "default",
                                           job_manager_pod_name,
                                           "jobmanager", cmd)
        assert code == 0, "code is %s, output is %s" % (code, output)

        cmd = [
            "ssh", "-i", dest, "-p", ssh_port, "-o",
            "StrictHostKeyChecking=no", "-o", "LogLevel=ERROR",
            "%s@%s" % (alias, ssh_host), "--", "echo", "dummy"
        ]
        code, output = utils.kube_pod_exec(args.config, "default",
                                           job_manager_pod_name,
                                           "jobmanager", cmd)
        assert code == 0, "code is %s, output is %s" % (code, output)
        assert output == "dummy\n", "output is %s" % (output)
def test_regular_job_env(args):
    """All DLWS_*/DLTS_* environment variables are set correctly inside
    an SSH session of a regular job."""
    envs = {
        "DLWS_HOST_NETWORK": "",
        "DLTS_HOST_NETWORK": "",
        "DLWS_NUM_PS": "0",
        "DLTS_NUM_PS": "0",
        "DLWS_NUM_WORKER": "1",
        "DLTS_NUM_WORKER": "1",
        "DLWS_NUM_GPU_PER_WORKER": "0",
        "DLTS_NUM_GPU_PER_WORKER": "0",
        "DLWS_VC_NAME": str(args.vc),
        "DLTS_VC_NAME": str(args.vc),
        "DLWS_UID": str(args.uid),
        "DLTS_UID": str(args.uid),
        "DLWS_USER_NAME": args.email.split("@")[0],
        "DLTS_USER_NAME": args.email.split("@")[0],
        "DLWS_USER_EMAIL": args.email,
        "DLTS_USER_EMAIL": args.email,
        "DLWS_ROLE_NAME": "master",
        "DLTS_ROLE_NAME": "master",
        # filled in with the real jid once the job exists
        "DLWS_JOB_ID": "unknown",
        "DLTS_JOB_ID": "unknown",
    }

    job_spec = utils.gen_default_job_description("regular", args.email,
                                                 args.uid, args.vc)
    with utils.run_job(args.rest, job_spec) as job:
        envs["DLWS_JOB_ID"] = job.jid
        envs["DLTS_JOB_ID"] = job.jid

        endpoints = utils.create_endpoint(args.rest, args.email, job.jid,
                                          ["ssh"])
        endpoints_ids = list(endpoints.keys())
        assert len(endpoints_ids) == 1
        endpoint_id = endpoints_ids[0]

        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                 job.jid, endpoint_id)
        logger.debug("endpoints resp is %s", ssh_endpoint)

        ssh_host = "%s.%s" % (ssh_endpoint["nodeName"],
                              ssh_endpoint["domain"])
        # str() for consistency with the other tests: port may be an int
        ssh_port = str(ssh_endpoint["port"])

        # exec into jobmanager to execute ssh to avoid firewall
        job_manager_pod = utils.kube_get_pods(args.config, "default",
                                              "app=jobmanager")[0]
        job_manager_pod_name = job_manager_pod.metadata.name

        alias = args.email.split("@")[0]

        # emit KEY=VALUE lines for every expected variable
        bash_cmd = ";".join(
            "printf '%s=' ; printenv %s" % (key, key) for key in envs)

        ssh_cmd = [
            "ssh", "-i",
            "/dlwsdata/work/%s/.ssh/id_rsa" % alias, "-p", ssh_port, "-o",
            "StrictHostKeyChecking=no", "-o", "LogLevel=ERROR",
            "%s@%s" % (alias, ssh_host), "--",
        ]
        ssh_cmd.append(bash_cmd)

        code, output = utils.kube_pod_exec(args.config, "default",
                                           job_manager_pod_name,
                                           "jobmanager", ssh_cmd)
        assert code == 0, "code is %s, output is %s" % (code, output)

        for key, val in envs.items():
            expected_output = "%s=%s" % (key, val)
            assert output.find(
                expected_output) != -1, "could not find %s in log %s" % (
                    expected_output, output)
def test_distributed_job_system_envs(args):
    """System-level env vars (from utils.load_system_envs) are present in
    every role's SSH session of a distributed job."""
    envs = utils.load_system_envs(args)

    job_spec = utils.gen_default_job_description("distributed",
                                                 args.email,
                                                 args.uid,
                                                 args.vc,
                                                 cmd="sleep infinity")
    with utils.run_job(args.rest, job_spec) as job:
        endpoints = utils.create_endpoint(args.rest, args.email, job.jid,
                                          ["ssh"])
        endpoints_ids = list(endpoints.keys())

        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        for endpoint_id in endpoints_ids:
            ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                     job.jid, endpoint_id)
            logger.debug("endpoints resp is %s", ssh_endpoint)

            ssh_host = "%s.%s" % (ssh_endpoint["nodeName"],
                                  ssh_endpoint["domain"])
            ssh_port = ssh_endpoint["port"]

            # emit KEY=VALUE lines for every expected variable
            bash_cmd = ";".join(
                "printf '%s=' ; printenv %s" % (key, key) for key in envs)

            # exec into jobmanager to execute ssh to avoid firewall
            job_manager_pod = utils.kube_get_pods(args.config, "default",
                                                  "app=jobmanager")[0]
            job_manager_pod_name = job_manager_pod.metadata.name

            alias = args.email.split("@")[0]

            ssh_cmd = [
                "ssh", "-i",
                "/dlwsdata/work/%s/.ssh/id_rsa" % alias, "-p",
                str(ssh_port), "-o", "StrictHostKeyChecking=no", "-o",
                "LogLevel=ERROR",
                "%s@%s" % (alias, ssh_host), "--",
            ]
            ssh_cmd.append(bash_cmd)

            code, output = utils.kube_pod_exec(args.config, "default",
                                               job_manager_pod_name,
                                               "jobmanager", ssh_cmd)
            # NOTE(review): code is logged but deliberately(?) not asserted
            # here, unlike test_regular_job_env — confirm intent
            logger.debug("cmd %s code is %s, output is %s",
                         " ".join(ssh_cmd), code, output)

            for key, val in envs.items():
                expected_output = "%s=%s" % (key, val)
                assert output.find(
                    expected_output) != -1, "could not find %s in log %s" % (
                        expected_output, output)
def test_ssh_cuda_visible_devices(args):
    # NOTE(review): this reuses the name of an earlier, parametrized
    # test_ssh_cuda_visible_devices in this module; being defined later,
    # it shadows that one at import time so pytest collects only this
    # definition — confirm whether both tests are meant to run.
    #
    # Verifies CUDA_VISIBLE_DEVICES for each role of a 4-GPU distributed
    # job: workers should see GPUs 0-3 in both the live env and
    # ~/.ssh/environment; the ps role should see no CUDA variable at all.
    job_spec = utils.gen_default_job_description("distributed",
                                                 args.email,
                                                 args.uid,
                                                 args.vc,
                                                 cmd="sleep infinity",
                                                 resourcegpu=4)
    with utils.run_job(args.rest, job_spec) as job:
        endpoints = utils.create_endpoint(args.rest, args.email, job.jid,
                                          ["ssh"])
        endpoints_ids = list(endpoints.keys())

        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        for endpoint_id in endpoints_ids:
            ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                     job.jid, endpoint_id)
            logger.debug("endpoints resp is %s", ssh_endpoint)

            ssh_host = "%s.%s" % (ssh_endpoint["nodeName"],
                                  ssh_endpoint["domain"])
            ssh_port = ssh_endpoint["port"]
            ssh_id = ssh_endpoint["id"]

            # endpoint id encodes the role as e.g. "...-worker0-...";
            # split it into role name and index
            role_idx = ssh_id.split("-")[-2]
            match = re.match("([a-z]+)([0-9]+)", role_idx)
            assert match is not None, "%s is not role index name" % role_idx
            role, idx = match.groups()

            # exec into jobmanager to execute ssh to avoid firewall
            job_manager_pod = utils.kube_get_pods(args.config, "default",
                                                  "app=jobmanager")[0]
            job_manager_pod_name = job_manager_pod.metadata.name

            alias = args.email.split("@")[0]

            ssh_cmd = [
                "ssh", "-i",
                "/dlwsdata/work/%s/.ssh/id_rsa" % alias, "-p",
                str(ssh_port), "-o", "StrictHostKeyChecking=no", "-o",
                "LogLevel=ERROR",
                "%s@%s" % (alias, ssh_host), "--",
                "echo a; env | grep CUDA_VISIBLE_DEVICES;",
                "grep CUDA_VISIBLE_DEVICES ~/.ssh/environment; echo b",
            ]

            code, output = utils.kube_pod_exec(args.config, "default",
                                               job_manager_pod_name,
                                               "jobmanager", ssh_cmd)
            logger.debug("cmd %s code is %s, output is %s",
                         " ".join(ssh_cmd), code, output)

            if role == "ps":
                # no GPU assigned to ps: nothing printed between markers
                expected = "a\nb"
            else:
                expected = "a\nCUDA_VISIBLE_DEVICES=0,1,2,3\nCUDA_VISIBLE_DEVICES=0,1,2,3\nb"
            assert expected in output, "could not find %s in output %s" % (
                expected, output)
def test_distributed_job_env(args):
    """All DLWS_*/DLTS_* env vars are correct in every role of a
    distributed job, including the per-role ROLE_NAME/ROLE_IDX values."""
    envs = {
        "DLWS_HOST_NETWORK": "enable",
        "DLTS_HOST_NETWORK": "enable",
        "DLWS_NUM_PS": "1",
        "DLTS_NUM_PS": "1",
        "DLWS_NUM_WORKER": "1",
        "DLTS_NUM_WORKER": "1",
        "DLWS_NUM_GPU_PER_WORKER": "0",
        "DLTS_NUM_GPU_PER_WORKER": "0",
        "DLWS_VC_NAME": str(args.vc),
        "DLTS_VC_NAME": str(args.vc),
        "DLWS_UID": str(args.uid),
        "DLTS_UID": str(args.uid),
        "DLWS_USER_NAME": args.email.split("@")[0],
        "DLTS_USER_NAME": args.email.split("@")[0],
        "DLWS_USER_EMAIL": args.email,
        "DLTS_USER_EMAIL": args.email,
        "DLWS_ROLE_NAME": "master",
        "DLTS_ROLE_NAME": "master",
        # filled in with the real jid once the job exists
        "DLWS_JOB_ID": "unknown",
        "DLTS_JOB_ID": "unknown",
    }

    job_spec = utils.gen_default_job_description("distributed",
                                                 args.email,
                                                 args.uid,
                                                 args.vc,
                                                 cmd="sleep infinity")
    with utils.run_job(args.rest, job_spec) as job:
        endpoints = utils.create_endpoint(args.rest, args.email, job.jid,
                                          ["ssh"])
        endpoints_ids = list(endpoints.keys())

        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        envs["DLWS_JOB_ID"] = job.jid
        envs["DLTS_JOB_ID"] = job.jid

        for endpoint_id in endpoints_ids:
            ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                     job.jid, endpoint_id)
            logger.debug("endpoints resp is %s", ssh_endpoint)

            ssh_host = "%s.%s" % (ssh_endpoint["nodeName"],
                                  ssh_endpoint["domain"])
            ssh_port = ssh_endpoint["port"]
            ssh_id = ssh_endpoint["id"]

            # endpoint id encodes the role, e.g. "...-worker0-..."
            role_idx = ssh_id.split("-")[-2]
            match = re.match("([a-z]+)([0-9]+)", role_idx)
            assert match is not None, "%s is not role index name" % (role_idx)
            role, idx = match.groups()

            # role-specific expectations for this endpoint
            envs["DLWS_ROLE_NAME"] = role
            envs["DLTS_ROLE_NAME"] = role
            envs["DLWS_ROLE_IDX"] = idx
            envs["DLTS_ROLE_IDX"] = idx

            # emit KEY=VALUE lines for every expected variable
            bash_cmd = ";".join(
                "printf '%s=' ; printenv %s" % (key, key) for key in envs)

            # exec into jobmanager to execute ssh to avoid firewall
            job_manager_pod = utils.kube_get_pods(args.config, "default",
                                                  "app=jobmanager")[0]
            job_manager_pod_name = job_manager_pod.metadata.name

            alias = args.email.split("@")[0]

            ssh_cmd = [
                "ssh", "-i",
                "/dlwsdata/work/%s/.ssh/id_rsa" % alias, "-p",
                str(ssh_port), "-o", "StrictHostKeyChecking=no", "-o",
                "LogLevel=ERROR",
                "%s@%s" % (alias, ssh_host), "--",
            ]
            ssh_cmd.append(bash_cmd)

            code, output = utils.kube_pod_exec(args.config, "default",
                                               job_manager_pod_name,
                                               "jobmanager", ssh_cmd)
            # NOTE(review): code is logged but not asserted here, unlike
            # test_regular_job_env — confirm intent
            logger.debug("cmd %s code is %s, output is %s",
                         " ".join(ssh_cmd), code, output)

            for key, val in envs.items():
                expected_output = "%s=%s" % (key, val)
                assert output.find(
                    expected_output) != -1, "could not find %s in log %s" % (
                        expected_output, output)
def test_distributed_with_default_cmd(args):
    # Verifies a distributed job launched with the platform's default
    # command still exposes working SSH endpoints, and that ps-0 and
    # worker-0 can SSH into each other.
    #
    # NOTE(review): `cmd` below is assigned but never passed to
    # gen_default_job_description — the job runs with the default command
    # (which matches the test name), so this string looks like dead code
    # kept as a sample of that default. Confirm it should not be wired in
    # via cmd=cmd. The original newlines inside the string were lost in a
    # formatting mangle; reconstructed on shell-statement boundaries.
    cmd = """
##################################################
# DeepScale 1.0 redirect (Start)
##################################################
set -e
hostname
whoami
# Create DeepScale 1.0 redirect on master node
sudo chmod -R 777 /root && mkdir -p /root/.ssh && rm -f /root/.ssh/config && ln -s ~/.ssh/config /root/.ssh/config && echo "/root/.ssh/config created" && ls -l /root/.ssh
mkdir -p /opt && sudo rm -f /opt/hostfile && sudo ln -s /job/hostfile /opt/hostfile && cat /opt/hostfile
# Create DeepScale 1.0 redirect for all workers
for i in $(seq 0 $(( ${DLWS_NUM_WORKER} - 1 ))); do
echo "Creating DeepScale 1.0 redirect for worker ${i}"
ssh worker-${i} "sudo chmod -R 777 /root && mkdir -p /root/.ssh && rm -f /root/.ssh/config && ln -s ~/.ssh/config /root/.ssh/config && echo "/root/.ssh/config created" && ls -l /root/.ssh"
ssh worker-${i} "mkdir -p /opt && sudo rm -f /opt/hostfile && sudo ln -s /job/hostfile /opt/hostfile && cat /opt/hostfile"
done
##################################################
# DeepScale 1.0 redirect (End)
##################################################
##################################################
# Unlimit memlock (Start)
##################################################
for i in $(seq 0 $(( ${DLWS_NUM_WORKER} - 1 ))); do
echo "Creating redirect for worker ${i}"
ssh worker-${i} "sudo bash -c 'echo -e \"* soft memlock unlimited\n* hard memlock unlimited\" | cat >> /etc/security/limits.conf'"
done
##################################################
# Unlimit memlock (End)
##################################################
##################################################
# User command starts here
##################################################
sleep infinity"""

    job_spec = utils.gen_default_job_description("distributed", args.email,
                                                 args.uid, args.vc)
    with utils.run_job(args.rest, job_spec) as job:
        endpoints = utils.create_endpoint(args.rest, args.email, job.jid,
                                          ["ssh"])
        endpoints_ids = list(endpoints.keys())
        # one ssh endpoint per role (ps-0 and worker-0)
        assert len(endpoints_ids) == 2

        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        for endpoint_id in endpoints_ids:
            ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                     job.jid, endpoint_id)
            logger.debug("endpoint_id is %s, endpoints resp is %s",
                         endpoint_id, ssh_endpoint)

            ssh_host = "%s.%s" % (ssh_endpoint["nodeName"],
                                  ssh_endpoint["domain"])
            ssh_port = ssh_endpoint["port"]

            # exec into jobmanager to execute ssh to avoid firewall
            job_manager_pod = utils.kube_get_pods(args.config, "default",
                                                  "app=jobmanager")[0]
            job_manager_pod_name = job_manager_pod.metadata.name

            alias = args.email.split("@")[0]

            cmd_prefix = [
                "ssh", "-i",
                "/dlwsdata/work/%s/.ssh/id_rsa" % alias, "-p", ssh_port,
                "-o", "StrictHostKeyChecking=no", "-o", "LogLevel=ERROR",
                "%s@%s" % (alias, ssh_host), "--",
            ]

            # check they can connect to each other
            for role in ["ps-0", "worker-0"]:
                cmd = copy.deepcopy(cmd_prefix)
                cmd.extend([
                    "ssh", role, "-o", "LogLevel=ERROR", "--", "echo",
                    "dummy"
                ])
                code, output = utils.kube_pod_exec(args.config, "default",
                                                   job_manager_pod_name,
                                                   "jobmanager", cmd)
                logger.debug("code %s, output '%s'", code, output)
                assert code == 0
                assert output == "dummy\n"