Exemplo n.º 1
0
def test_inference_job_scale(args):
    """Scale an inference job to 2 replicas and back to 1.

    Returns immediately when the configured launcher is "controller".
    After every scale request the test sleeps 30 seconds and then compares
    the replica count of the ``<job_id>-deployment`` deployment in the
    "default" namespace with the requested value.
    """
    if utils.get_launcher(args.config) == "controller":
        return

    spec = utils.gen_default_job_description("inference",
                                             args.email,
                                             args.uid,
                                             args.vc,
                                             cmd="sleep 600")

    def current_replicas(name):
        # Re-read the deployment each time so the latest spec is observed.
        found = utils.kube_get_deployment(args.config, "default", name)
        return found.spec.replicas

    with utils.run_job(args.rest, spec) as job:
        job_id = job.jid
        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        deployment_name = job_id + "-deployment"
        assert 1 == current_replicas(deployment_name)

        # Scale up first, then back down; both steps share the same checks.
        steps = (
            ("scale up job %s to %d", 2),
            ("scale down job %s to %d", 1),
        )
        for template, desired_replicas in steps:
            logger.info(template % (job_id, desired_replicas))
            resp = utils.scale_job(args.rest, args.email, job_id,
                                   desired_replicas)
            assert "Success" == resp

            time.sleep(30)
            assert desired_replicas == current_replicas(deployment_name)
Exemplo n.º 2
0
def test_op_job(args):
    """Run approve, pause, resume and kill against a single regular job.

    The "result" field of every REST reply is compared with the exact
    message expected for that operation, and the job must finally reach the
    "killed" state.
    """
    spec = utils.gen_default_job_description("regular", args.email, args.uid,
                                             args.vc)

    with utils.run_job(args.rest, spec) as job:
        job_id = job.jid
        utils.block_until_state_in(args.rest, job_id, {"running"})

        # Approve: expected to be refused because the job already runs.
        logger.info("approve job %s" % job_id)
        reply = utils.approve_job(args.rest, args.email, job_id)
        refusal = "Cannot approve the job. Job ID:%s" % job_id
        assert refusal == reply["result"]

        # Pause.
        logger.info("pause job %s" % job_id)
        reply = utils.pause_job(args.rest, args.email, job_id)
        assert "Success, the job is scheduled to be paused." == reply["result"]

        # Resume, once the pause has taken effect.
        utils.block_until_state_in(args.rest, job_id, {"paused"})
        logger.info("resume job %s" % job_id)
        reply = utils.resume_job(args.rest, args.email, job_id)
        assert "Success, the job is scheduled to be resumed." == reply["result"]

        # Kill, once the job is running again.
        utils.block_until_state_in(args.rest, job_id, {"running"})
        logger.info("kill job %s" % job_id)
        reply = utils.kill_job(args.rest, args.email, job_id)
        kill_ack = "Success, the job is scheduled to be terminated."
        assert kill_ack == reply["result"]

        final_state = job.block_until_state_not_in({"killing"})
        assert "killed" == final_state
Exemplo n.º 3
0
def test_inference_job_use_alias_to_run(args):
    """Check that ``whoami`` inside an inference job prints the user alias.

    The job echoes "dummy <whoami>"; the test polls the job log (up to 300
    times, 0.5s apart) until "dummy <alias>" has been seen twice, where the
    alias is the part of ``args.email`` before the "@".
    """
    spec = utils.gen_default_job_description(
        "inference",
        args.email,
        args.uid,
        args.vc,
        cmd="echo dummy `whoami` ; sleep 120")

    def seen_n_times(needle, count, text):
        """Return True when `needle` is found `count` times in `text`."""
        cursor = 0
        remaining = count
        while remaining > 0:
            hit = text.find(needle, cursor)
            if hit == -1:
                return False
            # Resume one character past the hit, so overlapping matches count.
            cursor = hit + 1
            remaining -= 1
        return True

    alias = args.email.split("@")[0]
    expected_word = "dummy %s" % (alias)

    with utils.run_job(args.rest, spec) as job:
        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        for _attempt in range(300):
            log = utils.get_job_log(args.rest, args.email, job.jid)
            if seen_n_times(expected_word, 2, log):
                break
            time.sleep(0.5)

        assert seen_n_times(expected_word, 2, log), 'log is %s' % (log)
Exemplo n.º 4
0
def test_inference_job_running(args):
    """Verify the DLWS_*/DLTS_* environment variables in inference job pods.

    Once the job is running, exactly two pods labelled with the job id are
    expected. In each pod's first container a shell command prints every
    variable as ``KEY=value`` and the output is searched for each expected
    pair; the role name expectation is taken from the pod's "jobRole" label.
    """
    alias = args.email.split("@")[0]
    vc_name = str(args.vc)
    uid = str(args.uid)

    envs = {
        "DLWS_HOST_NETWORK": "",
        "DLTS_HOST_NETWORK": "",
        "DLWS_NUM_GPU_PER_WORKER": "1",
        "DLTS_NUM_GPU_PER_WORKER": "1",
        "DLWS_VC_NAME": vc_name,
        "DLTS_VC_NAME": vc_name,
        "DLWS_UID": uid,
        "DLTS_UID": uid,
        "DLWS_USER_NAME": alias,
        "DLTS_USER_NAME": alias,
        "DLWS_USER_EMAIL": args.email,
        "DLTS_USER_EMAIL": args.email,
        "DLWS_ROLE_NAME": "master",
        "DLTS_ROLE_NAME": "master",
        "DLWS_JOB_ID": "unknown",
        "DLTS_JOB_ID": "unknown",
    }

    spec = utils.gen_default_job_description("inference", args.email,
                                             args.uid, args.vc)

    with utils.run_job(args.rest, spec) as job:
        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        # The job id is only known after submission.
        for id_key in ("DLWS_JOB_ID", "DLTS_JOB_ID"):
            envs[id_key] = job.jid

        pods = utils.kube_get_pods(args.config, "default", "jobId=" + job.jid)
        assert len(pods) == 2

        for pod in pods:
            envs["DLWS_ROLE_NAME"] = pod.metadata.labels["jobRole"]
            envs["DLTS_ROLE_NAME"] = pod.metadata.labels["jobRole"]
            pod_name = pod.metadata.name
            container_name = pod.spec.containers[0].name

            # One shell snippet per variable, all run in a single bash call.
            snippets = ["printf %s= ; printenv %s" % (key, key) for key in envs]
            cmd = ["bash", "-c", ";".join(snippets)]

            code, output = utils.kube_pod_exec(args.config, "default", pod_name,
                                               container_name, cmd)

            logger.debug("cmd %s output for %s.%s is %s", cmd, pod_name,
                         container_name, output)

            for key, val in envs.items():
                expected_output = "%s=%s" % (key, val)
                found_at = output.find(expected_output)
                assert found_at != -1, "could not find %s in log %s" % (
                    expected_output, output)
Exemplo n.º 5
0
def test_data_job_running(args):
    """Run a data-transfer job that copies a file to ADL and back.

    The job writes a marker word to a file, copies the directory out and
    back with ``copy_data.sh`` and prints the copied-back file. The job must
    finish and the marker must show up in the job log (polled up to 10
    times, 0.5s apart).
    """
    expected_state = "finished"
    expected_word = "wantThisInLog"

    # The pieces are joined by implicit literal concatenation before `%`.
    cmd_template = (
        "mkdir -p /tmp/dlts_test_dir; "
        "echo %s > /tmp/dlts_test_dir/testfile; "
        "cd /DataUtils; "
        "./copy_data.sh /tmp/dlts_test_dir adl://indexserveplatform-experiment-c09.azuredatalakestore.net/local/dlts_test_dir True 4194304 4 2 >/dev/null 2>&1;"
        "./copy_data.sh adl://indexserveplatform-experiment-c09.azuredatalakestore.net/local/dlts_test_dir /tmp/dlts_test_dir_copyback False 33554432 4 2 >/dev/null 2>&1;"
        "cat /tmp/dlts_test_dir_copyback/testfile; "
    )
    cmd = cmd_template % expected_word

    image = "indexserveregistry.azurecr.io/dlts-data-transfer-image:latest"

    spec = utils.gen_default_job_description("data",
                                             args.email,
                                             args.uid,
                                             args.vc,
                                             cmd=cmd,
                                             image=image)

    with utils.run_job(args.rest, spec) as job:
        final_state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling", "running"})
        assert expected_state == final_state

        for _attempt in range(10):
            log = utils.get_job_log(args.rest, args.email, job.jid)
            if expected_word in log:
                break
            time.sleep(0.5)

        failure_note = 'assert {} in {}'.format(expected_word, log)
        assert expected_word in log, failure_note
Exemplo n.º 6
0
def test_blobfuse(args):
    """Write, read back and delete a file under /tmp/blob in a regular job.

    The Azure blob plugin configuration is loaded for the "/tmp/blob" mount
    path. The job must finish, and "dummy" must appear in its log (polled up
    to 5 times, 0.5s apart).
    """
    blob_file = "/tmp/blob/${DLTS_JOB_ID}"
    cmd = "echo dummy > %s; cat %s ; rm %s ;" % (blob_file, blob_file,
                                                 blob_file)

    spec = utils.gen_default_job_description("regular",
                                             args.email,
                                             args.uid,
                                             args.vc,
                                             cmd=cmd)
    spec["plugins"] = utils.load_azure_blob_config(args.config, "/tmp/blob")

    with utils.run_job(args.rest, spec) as job:
        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling", "running"})
        assert state == "finished", "state is not finished, but %s" % state

        for _attempt in range(5):
            log = utils.get_job_log(args.rest, args.email, job.jid)
            if log.find("dummy") != -1:
                break
            time.sleep(0.5)

        marker_at = log.find("dummy")
        assert marker_at != -1, "could not find dummy in log %s" % (log)
Exemplo n.º 7
0
def test_no_resource_info(args):
    """Check that an oversized GPU request reports a scheduling message.

    A regular job asking for 5 GPUs is submitted and is expected to stay in
    "scheduling". The job detail is polled (up to 50 times, 0.5s apart)
    until the first "jobStatusDetail" entry's "message" contains
    "Insufficient nvidia.com/gpu".
    """
    expected = "Insufficient nvidia.com/gpu"

    # TODO hardcode 5 here, may need to change to `gpu_per_host + 1` manually
    # when testing other clusters
    job_spec = utils.gen_default_job_description("regular",
                                                 args.email,
                                                 args.uid,
                                                 args.vc,
                                                 resourcegpu=5)

    with utils.run_job(args.rest, job_spec) as job:
        state = job.block_until_state_not_in({"unapproved", "queued"})
        assert state == "scheduling"

        for _ in range(50):
            details = utils.get_job_detail(args.rest, args.email, job.jid)

            message = utils.walk_json_safe(details, "jobStatusDetail", 0,
                                           "message")
            # NOTE(review): walk_json_safe presumably yields None while the
            # status detail is not populated yet -- confirm. The truthiness
            # guard keeps the poll loop going instead of raising TypeError.
            if message and expected in message:
                break

            time.sleep(0.5)

        # `details` is a parsed JSON object, so it has to be formatted rather
        # than concatenated to a str (which would raise TypeError and hide
        # the real assertion failure).
        assert message and expected in message, \
            "unexpected detail %s" % (details,)
Exemplo n.º 8
0
def test_batch_kill_jobs(args):
    """Submit two regular jobs and kill them with one batch request.

    The batch reply's "result" must be a dict with one entry per job, each
    holding "successfully killed"; every job must then reach "killed".
    """
    expected_msg = "successfully killed"
    expected_state = "killed"
    num_jobs = 2

    spec = utils.gen_default_job_description("regular", args.email, args.uid,
                                             args.vc)

    job_ids = [utils.post_job(args.rest, spec) for _ in range(num_jobs)]

    # FIXME there is a race condition between rest and jobmanager
    # E.g. kill job request comes in when jobmanager is processing an unapproved
    # job. "killing" will be overriden by "queued".
    not_started = {"unapproved", "queued", "scheduling"}
    for job_id in job_ids:
        state = utils.block_until_state_not_in(args.rest, job_id, not_started)
        assert state == "running"

    # A fresh list is handed over, as in the original call.
    resp = utils.kill_jobs(args.rest, args.email, list(job_ids))

    outcome = resp["result"]
    assert isinstance(outcome, dict)
    assert len(outcome) == num_jobs
    for msg in outcome.values():
        assert expected_msg == msg

    for job_id in job_ids:
        state = utils.block_until_state_not_in(args.rest, job_id, {"killing"})
        assert expected_state == state
Exemplo n.º 9
0
def test_ssh_one_gpu_job_cuda_visible_devices(args):
    """Run the shared CUDA_VISIBLE_DEVICES ssh check for a 1-GPU regular job.

    The expected output passed to the shared check is just the two marker
    lines "a" and "b" with nothing in between.
    """
    one_gpu_spec = utils.gen_default_job_description("regular",
                                                     args.email,
                                                     args.uid,
                                                     args.vc,
                                                     resourcegpu=1)

    test_ssh_cuda_visible_devices(args, one_gpu_spec, "a\nb")
Exemplo n.º 10
0
def test_ssh_multi_gpu_job_cuda_visible_devices(args):
    """Run the shared CUDA_VISIBLE_DEVICES ssh check for a 2-GPU regular job.

    The expected output passed to the shared check has two
    "CUDA_VISIBLE_DEVICES=0,1" lines between the "a" and "b" marker lines.
    """
    two_gpu_spec = utils.gen_default_job_description("regular",
                                                     args.email,
                                                     args.uid,
                                                     args.vc,
                                                     resourcegpu=2)

    wanted = "a\nCUDA_VISIBLE_DEVICES=0,1\nCUDA_VISIBLE_DEVICES=0,1\nb"
    test_ssh_cuda_visible_devices(args, two_gpu_spec, wanted)
Exemplo n.º 11
0
def test_distributed_job_ssh(args):
    """Check ssh into both endpoints of a distributed job and between roles.

    Two ssh endpoints are expected. For each one the test execs into the
    jobmanager pod, sshes to the endpoint and from there sshes on to
    "ps-0" and "worker-0", expecting exit code 0 and the output "dummy\\n".
    """
    spec = utils.gen_default_job_description("distributed", args.email,
                                             args.uid, args.vc)

    with utils.run_job(args.rest, spec) as job:
        endpoints = utils.create_endpoint(args.rest, args.email, job.jid,
                                          ["ssh"])
        endpoint_ids = list(endpoints.keys())
        assert len(endpoint_ids) == 2

        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        for endpoint_id in endpoint_ids:
            ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                     job.jid, endpoint_id)
            logger.debug("endpoint_id is %s, endpoints resp is %s",
                         endpoint_id, ssh_endpoint)

            ssh_host = "%s.%s" % (ssh_endpoint["nodeName"],
                                  ssh_endpoint["domain"])
            ssh_port = ssh_endpoint["port"]

            # exec into jobmanager to execute ssh to avoid firewall
            jobmanager_pods = utils.kube_get_pods(args.config, "default",
                                                  "app=jobmanager")
            jobmanager_name = jobmanager_pods[0].metadata.name

            alias = args.email.split("@")[0]
            identity_file = "/dlwsdata/work/%s/.ssh/id_rsa" % alias
            target = "%s@%s" % (alias, ssh_host)

            outer_ssh = [
                "ssh", "-i", identity_file, "-p", ssh_port,
                "-o", "StrictHostKeyChecking=no",
                "-o", "LogLevel=ERROR",
                target, "--",
            ]

            # check they can connect to each other
            for role in ["ps-0", "worker-0"]:
                inner_ssh = [
                    "ssh", role, "-o", "LogLevel=ERROR", "--", "echo", "dummy"
                ]
                # Concatenation yields a new list, so outer_ssh stays intact.
                cmd = outer_ssh + inner_ssh
                code, output = utils.kube_pod_exec(args.config, "default",
                                                   jobmanager_name,
                                                   "jobmanager", cmd)
                logger.debug("code %s, output '%s'", code, output)
                assert code == 0
                assert output == "dummy\n"
Exemplo n.º 12
0
def test_job_fail(args):
    """Check that a regular job whose command is ``false`` ends as "failed"."""
    spec = utils.gen_default_job_description("regular",
                                             args.email,
                                             args.uid,
                                             args.vc,
                                             cmd="false")

    still_active = {"unapproved", "queued", "scheduling", "running"}
    with utils.run_job(args.rest, spec) as job:
        final_state = job.block_until_state_not_in(still_active)
        assert "failed" == final_state
Exemplo n.º 13
0
def test_ssh_do_not_expose_private_key(args):
    """Check that DLTS_SSH_PRIVATE_KEY is not set in an ssh session.

    Over ssh (run from the jobmanager pod) the test prints "a", the value of
    DLTS_SSH_PRIVATE_KEY and "b". The output must contain "a\\nb", i.e.
    ``printenv`` printed nothing between the two markers.
    """
    spec = utils.gen_default_job_description("regular", args.email, args.uid,
                                             args.vc)

    with utils.run_job(args.rest, spec) as job:
        endpoints = utils.create_endpoint(args.rest, args.email, job.jid,
                                          ["ssh"])
        endpoint_ids = list(endpoints.keys())
        assert len(endpoint_ids) == 1
        endpoint_id = endpoint_ids[0]

        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                 job.jid, endpoint_id)

        ssh_host = "%s.%s" % (ssh_endpoint["nodeName"], ssh_endpoint["domain"])
        ssh_port = ssh_endpoint["port"]

        # exec into jobmanager to execute ssh to avoid firewall
        jobmanager_pods = utils.kube_get_pods(args.config, "default",
                                              "app=jobmanager")
        jobmanager_name = jobmanager_pods[0].metadata.name

        alias = args.email.split("@")[0]
        identity_file = "/dlwsdata/work/%s/.ssh/id_rsa" % alias
        target = "%s@%s" % (alias, ssh_host)

        ssh_cmd = [
            "ssh", "-i", identity_file, "-p", ssh_port,
            "-o", "StrictHostKeyChecking=no",
            "-o", "LogLevel=ERROR",
            target, "--",
            "echo a ; printenv DLTS_SSH_PRIVATE_KEY ; echo b",
        ]
        code, output = utils.kube_pod_exec(args.config, "default",
                                           jobmanager_name, "jobmanager",
                                           ssh_cmd)
        assert code == 0, "code is %s, output is %s" % (code, output)

        expected = "a\nb"
        assert expected in output, "could not find %s in output %s" % (
            expected, output)
Exemplo n.º 14
0
def test_job_insight(args):
    """Store an insight payload on a running job and read the same one back."""
    spec = utils.gen_default_job_description("regular", args.email, args.uid,
                                             args.vc)

    with utils.run_job(args.rest, spec) as job:
        not_started = {"unapproved", "queued", "scheduling"}
        state = job.block_until_state_not_in(not_started)
        assert state == "running"

        payload = {"messages": ["dummy"]}

        # Setting the insight must succeed with HTTP 200 ...
        set_resp = utils.set_job_insight(args.rest, args.email, job.jid,
                                         payload)
        assert set_resp.status_code == 200

        # ... and reading it back must give the same payload.
        stored = utils.get_job_insight(args.rest, args.email, job.jid)
        assert payload == stored
Exemplo n.º 15
0
def test_gpu_type_override(args):
    """Check that a wrong "gpuType" in the job spec is overridden on the pod.

    The spec asks for "V100"; the first pod of the job is expected to carry
    the label ``gpuType=P40`` instead.
    """
    spec = utils.gen_default_job_description("regular", args.email, args.uid,
                                             args.vc)
    # wrong gpu type
    spec["gpuType"] = "V100"

    with utils.run_job(args.rest, spec) as job:
        state = job.block_until_state_not_in({"unapproved", "queued"})
        assert state in ["scheduling", "running"]

        job_pods = utils.kube_get_pods(args.config, "default",
                                       "jobId=%s" % job.jid)
        first_pod = job_pods[0]

        # gpu type should be overriden by the correct one
        pod_gpu_type = first_pod.metadata.labels.get("gpuType")
        assert pod_gpu_type == "P40"
Exemplo n.º 16
0
def test_batch_op_jobs(args):
    """Run batch approve, pause, resume and kill against two regular jobs.

    Every per-job message in each batch reply's "result" is compared with
    the exact message expected for that operation, and every job must end
    in the "killed" state.
    """
    num_jobs = 2
    spec = utils.gen_default_job_description("regular", args.email, args.uid,
                                             args.vc)

    job_ids = [utils.post_job(args.rest, spec) for _ in range(num_jobs)]

    def wait_all(states):
        # Block until every submitted job is in one of `states`.
        for jid in job_ids:
            utils.block_until_state_in(args.rest, jid, states)

    # FIXME there is a race condition between rest and jobmanager
    # E.g. kill job request comes in when jobmanager is processing an unapproved
    # job. "killing" will be overriden by "queued".
    wait_all({"running"})

    # Try to ApproveJobs
    logger.info("approve jobs %s" % job_ids)
    resp = utils.approve_jobs(args.rest, args.email, job_ids)
    for msg in resp["result"].values():
        assert "cannot approve a(n) \"running\" job" == msg

    # PauseJobs
    logger.info("pause jobs %s" % job_ids)
    resp = utils.pause_jobs(args.rest, args.email, job_ids)
    for msg in resp["result"].values():
        assert "successfully paused" == msg

    # ResumeJob
    wait_all({"paused"})
    logger.info("resume jobs %s" % job_ids)
    resp = utils.resume_jobs(args.rest, args.email, job_ids)
    for msg in resp["result"].values():
        assert "successfully resumed" == msg

    # KillJob
    wait_all({"running"})
    logger.info("kill jobs %s" % job_ids)
    resp = utils.kill_jobs(args.rest, args.email, job_ids)
    for msg in resp["result"].values():
        assert "successfully killed" == msg

    for jid in job_ids:
        final_state = utils.block_until_state_not_in(args.rest, jid,
                                                     {"killing"})
        assert "killed" == final_state
Exemplo n.º 17
0
def perf(args):
    """Submit 10 rounds of 5 distributed ``sleep 30`` jobs each.

    Within a round all five jobs are posted first; the function then waits
    for each to leave the unapproved/queued/scheduling/running states and
    logs the state it ended in.
    """
    rounds = 10
    jobs_per_round = 5
    active_states = {"unapproved", "queued", "scheduling", "running"}

    spec = utils.gen_default_job_description("distributed",
                                             args.email,
                                             args.uid,
                                             args.vc,
                                             cmd="sleep 30")

    for _round in range(rounds):
        jids = [
            utils.post_job(args.rest, spec) for _slot in range(jobs_per_round)
        ]

        for jid in jids:
            end_state = utils.block_until_state_not_in(args.rest, jid,
                                                       active_states)
            logger.info("%s is in state %s", jid, end_state)
Exemplo n.º 18
0
def test_distributed_job_mountpoints(args):
    """Check every pod of a distributed job has all expected mountpoints.

    The expected set is the cluster NFS mountpoints for the job, the system
    mountpoints and the infiniband mounts, as loaded through ``utils``.
    """
    spec = utils.gen_default_job_description("distributed", args.email,
                                             args.uid, args.vc)

    with utils.run_job(args.rest, spec) as job:
        state = job.block_until_state_not_in({"unapproved", "queued"})
        assert state in ["scheduling", "running"]

        job_pods = utils.kube_get_pods(args.config, "default",
                                       "jobId=%s" % job.jid)

        # Gather NFS, system and infiniband mountpoints into one list.
        expected_mounts = utils.load_cluster_nfs_mountpoints(args, job.jid)
        expected_mounts.extend(utils.load_system_mountpoints(args))
        expected_mounts.extend(utils.load_infiniband_mounts(args))

        for job_pod in job_pods:
            for mount in expected_mounts:
                present = utils.mountpoint_in_pod(mount, job_pod)
                assert present, \
                    "mountpoint %s not in distributed job %s" % (mount, job.jid)
Exemplo n.º 19
0
def test_sudo_installed(args):
    """Check that ``sudo ls`` succeeds in an image that ships without sudo.

    The job runs ``sudo ls`` in the "pytorch/pytorch:latest" image and must
    end in the "finished" state. The job log is fetched before the assertion
    so that it can be reported when the job ends in any other state.
    """
    cmd = "sudo ls"
    image = "pytorch/pytorch:latest"  # no sudo installed in this image

    job_spec = utils.gen_default_job_description(
        "regular",
        args.email,
        args.uid,
        args.vc,
        cmd=cmd,
        image=image,
    )

    with utils.run_job(args.rest, job_spec) as job:
        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling", "running"})
        log = utils.get_job_log(args.rest, args.email, job.jid)

        # Report the log on failure; it was previously fetched and discarded.
        assert state == "finished", "state is %s, log is %s" % (state, log)
Exemplo n.º 20
0
def test_blobfuse(args):
    """Check the ps and worker pods of a distributed job share /tmp/blob.

    The ps pod writes a message to ``/tmp/blob/${DLWS_JOB_ID}``; the worker
    pod must be able to read exactly that message back, and then removes
    the file.
    """
    spec = utils.gen_default_job_description("distributed", args.email,
                                             args.uid, args.vc)
    spec["plugins"] = utils.load_azure_blob_config(args.config, "/tmp/blob")

    def exec_in_single_pod(label, command):
        # Exactly one pod must match `label`; run `command` in its first
        # container and require a zero exit code.
        matching = utils.kube_get_pods(args.config, "default", label)
        assert len(matching) == 1
        pod_name = matching[0].metadata.name
        container_name = matching[0].spec.containers[0].name
        code, output = utils.kube_pod_exec(args.config, "default", pod_name,
                                           container_name, command)
        assert code == 0, "code is %d, output is %s" % (code, output)
        return code, output

    with utils.run_job(args.rest, spec) as job:
        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        msg = "this is dummy from ps"

        # Writer side: the parameter server pod.
        ps_cmd = ["bash", "-c", "echo %s > /tmp/blob/${DLWS_JOB_ID}" % (msg)]
        exec_in_single_pod("jobId=%s,jobRole=ps" % job.jid, ps_cmd)

        # Reader side: the worker pod reads the file and cleans it up.
        worker_cmd = [
            "bash", "-c",
            "cat /tmp/blob/${DLWS_JOB_ID} ; rm /tmp/blob/${DLWS_JOB_ID}"
        ]
        code, output = exec_in_single_pod(
            "jobId=%s,jobRole=worker" % job.jid, worker_cmd)
        assert msg + "\n" == output, "code is %d, output is %s" % (code,
                                                                   output)
Exemplo n.º 21
0
def test_list_all_jobs(args):
    """Check that listing "all" jobs includes a job that has finished.

    A regular job with an empty command is run to the "finished" state;
    its id must then appear in the "finishedJobs" list of the job list
    response.
    """
    spec = utils.gen_default_job_description("regular",
                                             args.email,
                                             args.uid,
                                             args.vc,
                                             cmd="")

    # All jobs should include finished jobs
    with utils.run_job(args.rest, spec) as job:
        job_id = job.jid
        final_state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling", "running"})
        assert final_state == "finished"

    listing = utils.get_job_list(args.rest, args.email, args.vc, "all", 10)
    finished_jobs = listing.get("finishedJobs")
    assert isinstance(finished_jobs, list)

    finished_ids = [entry["jobId"] for entry in finished_jobs]
    assert job_id in finished_ids
Exemplo n.º 22
0
def test_regular_job_ssh(args):
    """Check that the ssh endpoint of a regular job accepts a connection.

    The ssh command is executed from inside the jobmanager pod; it must exit
    with code 0 and print exactly "dummy\\n".
    """
    spec = utils.gen_default_job_description("regular", args.email, args.uid,
                                             args.vc)

    with utils.run_job(args.rest, spec) as job:
        endpoints = utils.create_endpoint(args.rest, args.email, job.jid,
                                          ["ssh"])
        endpoint_ids = list(endpoints.keys())
        assert len(endpoint_ids) == 1
        endpoint_id = endpoint_ids[0]

        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                 job.jid, endpoint_id)
        logger.debug("endpoints resp is %s", ssh_endpoint)

        ssh_host = "%s.%s" % (ssh_endpoint["nodeName"], ssh_endpoint["domain"])
        ssh_port = ssh_endpoint["port"]

        # exec into jobmanager to execute ssh to avoid firewall
        jobmanager_pods = utils.kube_get_pods(args.config, "default",
                                              "app=jobmanager")
        jobmanager_name = jobmanager_pods[0].metadata.name

        alias = args.email.split("@")[0]
        identity_file = "/dlwsdata/work/%s/.ssh/id_rsa" % alias
        target = "%s@%s" % (alias, ssh_host)

        cmd = [
            "ssh",
            "-i",
            identity_file,
            "-p",
            ssh_port,
            "-o",
            "StrictHostKeyChecking=no",
            "-o",
            "LogLevel=ERROR",
            target,
            "--",
            "echo",
            "dummy",
        ]
        code, output = utils.kube_pod_exec(args.config, "default",
                                           jobmanager_name, "jobmanager", cmd)
        assert code == 0, "code is %s, output is %s" % (code, output)
        assert output == "dummy\n", "output is %s" % (output)
Exemplo n.º 23
0
def test_image_pull_msg(args):
    """Check that a missing image is reported in the job status detail.

    A distributed job using the image "not_exist_image" is expected to stay
    in "scheduling". The job detail is polled (up to 50 times, 0.5s apart)
    until the first "jobStatusDetail" entry's "message" contains
    "ImagePullBackOff".
    """
    expected = "ImagePullBackOff"

    job_spec = utils.gen_default_job_description("distributed",
                                                 args.email,
                                                 args.uid,
                                                 args.vc,
                                                 image="not_exist_image")
    with utils.run_job(args.rest, job_spec) as job:
        state = job.block_until_state_not_in({"unapproved", "queued"})
        assert state == "scheduling"

        for _ in range(50):
            details = utils.get_job_detail(args.rest, args.email, job.jid)

            message = utils.walk_json_safe(details, "jobStatusDetail", 0,
                                           "message")
            # NOTE(review): walk_json_safe presumably yields None while the
            # status detail is not populated yet -- confirm. The truthiness
            # guard keeps the poll loop going instead of raising TypeError.
            if message and expected in message:
                break

            time.sleep(0.5)

        # `details` is a parsed JSON object, so it has to be formatted rather
        # than concatenated to a str (which would raise TypeError and hide
        # the real assertion failure).
        assert message and expected in message, \
            "unexpected detail %s" % (details,)
Exemplo n.º 24
0
def test_regular_job_mountpoints(args):
    """Check the mountpoints of a regular job's first pod.

    Cluster NFS and system mountpoints must all be present in the pod,
    while none of the infiniband mounts may be.
    """
    spec = utils.gen_default_job_description("regular", args.email, args.uid,
                                             args.vc)

    with utils.run_job(args.rest, spec) as job:
        state = job.block_until_state_not_in({"unapproved", "queued"})
        assert state in ["scheduling", "running"]

        job_pods = utils.kube_get_pods(args.config, "default",
                                       "jobId=%s" % job.jid)
        pod = job_pods[0]

        required_mounts = utils.load_cluster_nfs_mountpoints(args, job.jid)
        required_mounts.extend(utils.load_system_mountpoints(args))

        for mount in required_mounts:
            present = utils.mountpoint_in_pod(mount, pod)
            assert present, \
                "mountpoint %s not in regular job %s" % (mount, job.jid)

        # Regular job should not have IB mounted
        for mount in utils.load_infiniband_mounts(args):
            present = utils.mountpoint_in_pod(mount, pod)
            assert not present, \
                "infiniband mountpoint %s in regular job %s" % (mount, job.jid)
Exemplo n.º 25
0
def test_distributed_job_running(args, preemptable=False):
    """Check that a distributed job runs and its echoed word reaches the log.

    Args:
        args: test arguments providing rest, email, uid and vc.
        preemptable: passed through to the generated job description.

    The log is polled up to 50 times, 0.5s apart.
    """
    expected = "wantThisInLog"

    spec = utils.gen_default_job_description("distributed",
                                             args.email,
                                             args.uid,
                                             args.vc,
                                             preemptable=preemptable,
                                             cmd="echo %s ; sleep 120" %
                                             expected)

    with utils.run_job(args.rest, spec) as job:
        not_started = {"unapproved", "queued", "scheduling"}
        state = job.block_until_state_not_in(not_started)
        assert state == "running"

        for _attempt in range(50):
            log = utils.get_job_log(args.rest, args.email, job.jid)
            if expected in log:
                break
            time.sleep(0.5)

        failure_note = "assert {} in {}".format(expected, log)
        assert expected in log, failure_note
Exemplo n.º 26
0
def test_job_priority(args):
    """Check validation and authorization when setting a job's priority.

    A ``None`` payload must give HTTP 400, a request from
    "unauthorized_user" must give 403 and leave the priority at 100, and a
    request from the job owner must give 200 and set the priority to 101.
    """
    spec = utils.gen_default_job_description("regular", args.email, args.uid,
                                             args.vc)

    def current_priority(jid):
        return utils.get_job_priorities(args.rest)[jid]

    with utils.run_job(args.rest, spec) as job:
        # wait until running to avoid state change race
        state = job.block_until_state_not_in({"unapproved", "queued"})
        assert state in ["scheduling", "running"]

        # invalid payload
        reply = utils.set_job_priorities(args.rest, args.email, None)
        assert reply.status_code == 400

        # unauthorized user cannot change priority
        reply = utils.set_job_priorities(args.rest, "unauthorized_user",
                                         {job.jid: 101})
        assert reply.status_code == 403
        priority = current_priority(job.jid)
        assert priority == 100

        # job owner can change priority
        reply = utils.set_job_priorities(args.rest, args.email,
                                         {job.jid: 101})
        assert reply.status_code == 200
        priority = current_priority(job.jid)
        assert priority == 101
Exemplo n.º 27
0
def test_do_not_expose_private_key(args):
    """Check that DLTS_SSH_PRIVATE_KEY is not set in the job's environment.

    The job prints "a", the value of DLTS_SSH_PRIVATE_KEY and "b". The log
    (polled up to 50 times, 0.5s apart) must contain "a\\nb", i.e.
    ``printenv`` printed nothing between the two markers.
    """
    spec = utils.gen_default_job_description(
        "regular",
        args.email,
        args.uid,
        args.vc,
        cmd="echo a ; printenv DLTS_SSH_PRIVATE_KEY ; echo b")

    with utils.run_job(args.rest, spec) as job:
        final_state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling", "running"})
        assert final_state == "finished"

        expected = "a\nb"

        for _attempt in range(50):
            log = utils.get_job_log(args.rest, args.email, job.jid)
            if expected in log:
                break
            time.sleep(0.5)

        failure_note = 'assert {} in {}'.format(expected, log)
        assert expected in log, failure_note
Exemplo n.º 28
0
def test_regular_job_env(args):
    """Check the DLWS_*/DLTS_* environment variables seen over ssh in a job.

    A regular job is started with an ssh endpoint; the test then runs ssh
    from inside the jobmanager pod, prints every expected variable in the
    job's ssh session, and checks that each ``KEY=value`` pair shows up in
    the output.
    """
    alias = args.email.split("@")[0]
    vc_name = str(args.vc)
    uid = str(args.uid)

    # Expected values; the job id entries are filled in once the job exists.
    envs = {
        "DLWS_HOST_NETWORK": "",
        "DLTS_HOST_NETWORK": "",
        "DLWS_NUM_PS": "0",
        "DLTS_NUM_PS": "0",
        "DLWS_NUM_WORKER": "1",
        "DLTS_NUM_WORKER": "1",
        "DLWS_NUM_GPU_PER_WORKER": "0",
        "DLTS_NUM_GPU_PER_WORKER": "0",
        "DLWS_VC_NAME": vc_name,
        "DLTS_VC_NAME": vc_name,
        "DLWS_UID": uid,
        "DLTS_UID": uid,
        "DLWS_USER_NAME": alias,
        "DLTS_USER_NAME": alias,
        "DLWS_USER_EMAIL": args.email,
        "DLTS_USER_EMAIL": args.email,
        "DLWS_ROLE_NAME": "master",
        "DLTS_ROLE_NAME": "master",
        "DLWS_JOB_ID": "unknown",
        "DLTS_JOB_ID": "unknown",
    }

    spec = utils.gen_default_job_description("regular", args.email, args.uid,
                                             args.vc)

    with utils.run_job(args.rest, spec) as job:
        envs.update(DLWS_JOB_ID=job.jid, DLTS_JOB_ID=job.jid)

        # Request the ssh endpoint before waiting for the job to run.
        created = utils.create_endpoint(args.rest, args.email, job.jid,
                                        ["ssh"])
        created_ids = list(created.keys())
        assert len(created_ids) == 1
        endpoint_id = created_ids[0]

        job_state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert job_state == "running"

        endpoint = utils.wait_endpoint_state(args.rest, args.email, job.jid,
                                             endpoint_id)
        logger.debug("endpoints resp is %s", endpoint)

        host = "%s.%s" % (endpoint["nodeName"], endpoint["domain"])
        port = endpoint["port"]

        # ssh is launched from inside the jobmanager pod to avoid firewall
        # restrictions.
        manager_pods = utils.kube_get_pods(args.config, "default",
                                           "app=jobmanager")
        manager_pod_name = manager_pods[0].metadata.name

        # One "printf KEY= ; printenv KEY" pair per expected variable.
        remote_script = ";".join(
            "printf '%s=' ; printenv %s" % (name, name) for name in envs)

        ssh_argv = [
            "ssh",
            "-i", "/dlwsdata/work/%s/.ssh/id_rsa" % alias,
            "-p", port,
            "-o", "StrictHostKeyChecking=no",
            "-o", "LogLevel=ERROR",
            "%s@%s" % (alias, host),
            "--",
            remote_script,
        ]
        code, output = utils.kube_pod_exec(args.config, "default",
                                           manager_pod_name, "jobmanager",
                                           ssh_argv)
        assert code == 0, "code is %s, output is %s" % (code, output)

        for name, value in envs.items():
            expected_output = "%s=%s" % (name, value)
            assert expected_output in output, (
                "could not find %s in log %s" % (expected_output, output))
Exemplo n.º 29
0
def test_fault_tolerance(args):
    """Check that a job's ssh endpoint works again after its pod is deleted.

    The test starts a regular job with an ssh endpoint, deletes the job's
    pod, waits for the endpoint to pass through "pending" and become
    available again, and finally runs ``echo dummy`` over ssh from the
    jobmanager pod.
    """
    # Job is only retried when launcher is controller.
    launcher = utils.get_launcher(args.config)
    if launcher == "python":
        return

    def log_endpoint_address(endpoint):
        # Build host and port from an endpoint response and log them.
        host = "%s.%s" % (endpoint["nodeName"], endpoint["domain"])
        port = endpoint["port"]
        logger.info("current ssh endpoint is %s:%s", host, port)
        return host, port

    spec = utils.gen_default_job_description("regular", args.email, args.uid,
                                             args.vc)

    with utils.run_job(args.rest, spec) as job:
        created = utils.create_endpoint(args.rest, args.email, job.jid,
                                        ["ssh"])
        created_ids = list(created.keys())
        assert len(created_ids) == 1
        endpoint_id = created_ids[0]

        job_state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert job_state == "running"

        # Record where the endpoint lives before the pod is deleted.
        log_endpoint_address(
            utils.wait_endpoint_state(args.rest, args.email, job.jid,
                                      endpoint_id))

        # Delete the first pod labelled with this job's id.
        job_pods = utils.kube_get_pods(args.config, "default",
                                       "jobId=%s" % (job.jid))
        utils.kube_delete_pod(args.config, "default",
                              job_pods[0].metadata.name)

        # First wait for the endpoint to report "pending", then wait again
        # with the helper's default target state.
        utils.wait_endpoint_state(args.rest,
                                  args.email,
                                  job.jid,
                                  endpoint_id,
                                  state="pending")
        recovered = utils.wait_endpoint_state(args.rest, args.email, job.jid,
                                              endpoint_id)
        ssh_host, ssh_port = log_endpoint_address(recovered)

        # ssh is launched from inside the jobmanager pod to avoid firewall
        # restrictions.
        manager_pods = utils.kube_get_pods(args.config, "default",
                                           "app=jobmanager")
        manager_pod_name = manager_pods[0].metadata.name

        alias = args.email.split("@")[0]

        ssh_argv = [
            "ssh",
            "-i", "/dlwsdata/work/%s/.ssh/id_rsa" % alias,
            "-p", ssh_port,
            "-o", "StrictHostKeyChecking=no",
            "-o", "LogLevel=ERROR",
            "%s@%s" % (alias, ssh_host),
            "--",
            "echo",
            "dummy",
        ]
        code, output = utils.kube_pod_exec(args.config, "default",
                                           manager_pod_name, "jobmanager",
                                           ssh_argv)
        assert code == 0, "code is %s, output is %s" % (code, output)
        assert output == "dummy\n", "output is %s" % (output)
Exemplo n.º 30
0
def test_regular_job_custom_ssh_key(args):
    """Check that a job accepts ssh logins with a user-supplied key pair.

    The public key from ``data/id_rsa.pub`` is attached to the job spec. The
    matching private key from ``data/id_rsa`` is written into the jobmanager
    pod, and ssh is then run from that pod using the key.
    """
    spec = utils.gen_default_job_description("regular", args.email, args.uid,
                                             args.vc)
    with open("data/id_rsa.pub") as public_key_file:
        public_key = public_key_file.read()
    spec["ssh_public_keys"] = [public_key]

    with utils.run_job(args.rest, spec) as job:
        created = utils.create_endpoint(args.rest, args.email, job.jid,
                                        ["ssh"])
        created_ids = list(created.keys())
        assert len(created_ids) == 1
        endpoint_id = created_ids[0]

        job_state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert job_state == "running"

        endpoint = utils.wait_endpoint_state(args.rest, args.email, job.jid,
                                             endpoint_id)
        logger.debug("endpoints resp is %s", endpoint)

        host = "%s.%s" % (endpoint["nodeName"], endpoint["domain"])
        port = endpoint["port"]

        # ssh is launched from inside the jobmanager pod to avoid firewall
        # restrictions.
        manager_pods = utils.kube_get_pods(args.config, "default",
                                           "app=jobmanager")
        manager_pod_name = manager_pods[0].metadata.name

        alias = args.email.split("@")[0]

        dest = "/tmp/test_regular_job_customer_ssh_key"

        # Recreate the private key inside the pod one line at a time: remove
        # any previous copy, append each stripped line with echo, then make
        # the file read-only for its owner.
        with open("data/id_rsa") as private_key_file:
            key_lines = [raw.strip() for raw in private_key_file]

        script_parts = ["rm %s ; " % dest]
        for key_line in key_lines:
            script_parts += ["echo", key_line, ">> %s ;" % dest]
        script_parts.append("chmod 400 %s ;" % dest)

        install_key_argv = ["sh", "-c", " ".join(script_parts)]
        code, output = utils.kube_pod_exec(args.config, "default",
                                           manager_pod_name, "jobmanager",
                                           install_key_argv)
        assert code == 0, "code is %s, output is %s" % (code, output)

        ssh_argv = [
            "ssh",
            "-i", dest,
            "-p", port,
            "-o", "StrictHostKeyChecking=no",
            "-o", "LogLevel=ERROR",
            "%s@%s" % (alias, host),
            "--",
            "echo",
            "dummy",
        ]
        code, output = utils.kube_pod_exec(args.config, "default",
                                           manager_pod_name, "jobmanager",
                                           ssh_argv)
        assert code == 0, "code is %s, output is %s" % (code, output)
        assert output == "dummy\n", "output is %s" % (output)