Example #1
    def test_pod_template_with_custom_scheduler(self):
        enable_custom_scheduler = True
        pod_template = PodTemplate(job.get_template(), enable_custom_scheduler)

        gpu_num = 2
        pod = {
            "podName": "790a6b30-560f-44a4-a9f0-5d1458dcb0d1-pod-0",
            "gpuLimit": gpu_num,
        }
        data = pod_template.generate_pod(pod)

        # enabling the custom scheduler clears the resource limits: spec.containers[].resources.limits
        self.assertEqual(
            0, data["spec"]["containers"][0]["resources"]["limits"]
            ["nvidia.com/gpu"])

        # metadata.annotations["pod.alpha/DeviceInformation"] should be set
        device_annotation = json.loads(
            data["metadata"]["annotations"]["pod.alpha/DeviceInformation"])
        self.assertEqual(
            gpu_num, device_annotation["runningcontainer"][pod["podName"]]
            ["requests"]["alpha.gpu/numgpu"])
        # disabled topology
        self.assertEqual(
            0,
            device_annotation["requests"]["alpha.gpu/gpu-generate-topology"])
Example #2
    def test_generate_pods_missing_required_params(self):
        enable_custom_scheduler = True
        pod_template = PodTemplate(job.get_template(), enable_custom_scheduler)

        job.params = {}
        job_description, error = pod_template.generate_pods(job)

        self.assertIsNone(job_description)
        self.assertTrue(error)
        self.assertEqual("Missing required parameters!", error)
Example #3
    def test_generate_pod_with_labels(self):
        enable_custom_scheduler = False
        pod_template = PodTemplate(job.get_template(), enable_custom_scheduler)

        pod = {
            "gpuLimit": 2,
            "labels": [{
                "name": "my_label_name",
                "value": "my_label_value"
            }],
        }
        data = pod_template.generate_pod(pod)

        self.assertEqual("my_label_value",
                         data["metadata"]["labels"]["my_label_name"])
Example #4
    def test_pod_template_without_custom_scheduler(self):
        enable_custom_scheduler = False
        pod_template = PodTemplate(job.get_template(), enable_custom_scheduler)

        pod = {"gpuLimit": 2}
        data = pod_template.generate_pod(pod)

        # when the custom scheduler is not enabled, the GPU count is set in spec.containers[].resources.limits
        self.assertEqual(
            pod["gpuLimit"], data["spec"]["containers"][0]["resources"]
            ["limits"]["nvidia.com/gpu"])
        # metadata.annotations["pod.alpha/DeviceInformation"] should be empty
        self.assertTrue(("annotations" not in data["metadata"])
                        or ("pod.alpha/DeviceInformation"
                            not in data["metadata"]["annotations"]))
Example #5
    def test_generate_pod_with_envs(self):
        enable_custom_scheduler = False
        pod_template = PodTemplate(job.get_template(), enable_custom_scheduler)

        pod = {
            "gpuLimit": 2,
            "envs": [{
                "name": "my_env_name",
                "value": "my_env_value"
            }],
        }
        data = pod_template.generate_pod(pod)

        self.assertIn({
            "name": "my_env_name",
            "value": "my_env_value"
        }, data["spec"]["containers"][0]["env"])
Example #6
    def test_generate_launch_script(self):
        job_id = "ce7dca49-28df-450a-a03b-51b9c2ecc69c"
        path_to_save = "/tmp"
        user_id = "20000"
        gpu_num = 3
        user_script = "sleep infinity"

        script_file = PodTemplate.generate_launch_script(
            job_id, path_to_save, user_id, gpu_num, user_script)

        # return the container command
        self.assertListEqual(["bash", "/pod/scripts/bootstrap.sh"],
                             script_file)
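
A minimal usage sketch of how the returned command might be wired into a container spec; the container fields other than "command" are assumptions, and Example #7 below reads the same value back as pods[0].spec.containers[0].command.

    # Hypothetical usage sketch (container name/image are placeholders, not from the source).
    command = PodTemplate.generate_launch_script(
        "ce7dca49-28df-450a-a03b-51b9c2ecc69c", "/tmp", "20000", 3, "sleep infinity")
    container = {
        "name": "main",           # assumed container name
        "image": "ubuntu:18.04",  # assumed image
        "command": command,       # -> ["bash", "/pod/scripts/bootstrap.sh"]
    }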
Example #7
def SubmitJob(job):
    # check whether any pod with the label run=job_id still exists
    assert "jobId" in job
    job_id = job["jobId"]
    if not all_pods_not_existing(job_id):
        logging.warning(
            "Waiting until previous pods are cleaned up! Job {}".format(
                job_id))
        job_deployer = JobDeployer()
        errors = job_deployer.delete_job(job_id, force=True)
        if errors:
            logging.warning("Force delete job {}: {}".format(job_id, errors))
        return

    ret = {}
    dataHandler = DataHandler()

    try:
        # TODO refine later
        # before resubmitting the job, reset the endpoints
        # set all endpoints to status 'pending' so they restart when the job is ready
        endpoints = dataHandler.GetJobEndpoints(job_id)
        for endpoint_id, endpoint in endpoints.items():
            endpoint["status"] = "pending"
            logging.info(
                "Reset endpoint status to 'pending': {}".format(endpoint_id))
            dataHandler.UpdateEndpoint(endpoint)

        job["cluster"] = config
        job_object, errors = JobSchema().load(job)
        # TODO assert job_object is a Job
        assert (isinstance(job_object, Job))

        job_object.params = json.loads(base64.b64decode(job["jobParams"]))

        # inject gid, uid and user
        # TODO it should return only one entry
        user_info = dataHandler.GetIdentityInfo(
            job_object.params["userName"])[0]
        job_object.params["gid"] = user_info["gid"]
        job_object.params["uid"] = user_info["uid"]
        job_object.params["user"] = job_object.get_alias()

        enable_custom_scheduler = job_object.is_custom_scheduler_enabled()
        if job_object.params["jobtrainingtype"] == "RegularJob":
            pod_template = PodTemplate(job_object.get_template(),
                                       enable_custom_scheduler)
        elif job_object.params["jobtrainingtype"] == "PSDistJob":
            pod_template = DistPodTemplate(job_object.get_template())
        elif job_object.params["jobtrainingtype"] == "InferenceJob":
            pod_template = PodTemplate(job_object.get_template(),
                                       enable_custom_scheduler)
        else:
            dataHandler.SetJobError(
                job_object.job_id, "ERROR: invalid jobtrainingtype: %s" %
                job_object.params["jobtrainingtype"])
            dataHandler.Close()
            return False

        pods, error = pod_template.generate_pods(job_object)
        if error:
            dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error)
            dataHandler.Close()
            return False

        job_description = "\n---\n".join([yaml.dump(pod) for pod in pods])
        job_description_path = "jobfiles/" + time.strftime(
            "%y%m%d"
        ) + "/" + job_object.job_id + "/" + job_object.job_id + ".yaml"
        local_jobDescriptionPath = os.path.realpath(
            os.path.join(config["storage-mount-path"], job_description_path))
        if not os.path.exists(os.path.dirname(local_jobDescriptionPath)):
            os.makedirs(os.path.dirname(local_jobDescriptionPath))
        with open(local_jobDescriptionPath, 'w') as f:
            f.write(job_description)

        job_deployer = JobDeployer()
        try:
            pods = job_deployer.create_pods(pods)
            ret["output"] = "Created pods: {}".format(
                [pod.metadata.name for pod in pods])
        except Exception as e:
            ret["output"] = "Error: %s" % e.message
            logging.error(e, exc_info=True)

        ret["jobId"] = job_object.job_id

        dataHandler.UpdateJobTextField(job_object.job_id, "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(job_object.job_id, "jobDescriptionPath",
                                       job_description_path)
        dataHandler.UpdateJobTextField(job_object.job_id, "jobDescription",
                                       base64.b64encode(job_description))
        dataHandler.UpdateJobTextField(job_object.job_id, "lastUpdated",
                                       datetime.datetime.now().isoformat())

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = job_description_path
        jobMeta["jobPath"] = job_object.job_path
        jobMeta["workPath"] = job_object.work_path
        # the command of the first container
        jobMeta["LaunchCMD"] = pods[0].spec.containers[0].command

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(job_object.job_id, "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        logging.error("Submit job failed: %s" % job, exc_info=True)
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(job["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "error")
            dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))
    dataHandler.Close()
    return ret
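
To illustrate the input SubmitJob expects, a minimal sketch of the payload follows. Only "jobId" and the base64-encoded "jobParams" JSON are taken from the function above; the parameter values are placeholders, and actually running it still requires the surrounding config, DataHandler and JobDeployer environment.

    # Hypothetical payload sketch; values are placeholders.
    job_params = {
        "userName": "someuser",           # used to look up gid/uid via GetIdentityInfo
        "jobtrainingtype": "RegularJob",  # RegularJob / PSDistJob / InferenceJob
    }
    job = {
        "jobId": "140782a0-7f6d-4039-9801-fd6294c7c88a",
        "jobParams": base64.b64encode(json.dumps(job_params)),
    }
    ret = SubmitJob(job)  # assumes the DLWorkspace config/services above are available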
Example #8
    def test_generate_pods(self):
        enable_custom_scheduler = True
        pod_template = PodTemplate(job.get_template(), enable_custom_scheduler)

        job.params = {
            "gid": "20000",
            "uid": "20000",
            "user": "******",
            "mountpoints": [{
                "description": "NFS (remote file share)",
                "enabled": True,
                "containerPath": "/home/user",
                "hostPath": "/dlwsdata/work/user",
                "name": "homefolder"
            }],
            "image": "indexserveregistry.azurecr.io/deepscale:1.0",
            "userId": "20000",
            "dataPath": "",
            "jobId": "140782a0-7f6d-4039-9801-fd6294c7c88a",
            "isParent": 1,
            "jobType": "training",
            "jobPath": "user/jobs/190627/140782a0-7f6d-4039-9801-fd6294c7c88a",
            "containerUserId": "0",
            "resourcegpu": 1,
            "env": [],
            "enabledatapath": True,
            "runningasroot": True,
            "interactivePorts": [],
            "preemptionAllowed": False,
            "jobtrainingtype": "RegularJob",
            "do_log": False,
            "is_interactive": False,
            "familyToken": "72fc61265bcb4416b68b44c82d120b3b",
            "enableworkpath": True,
            "vcName": "vc1",
            "userName": "******",
            "workPath": "user",
            "cmd": "sleep infinity",
            "jobName": "test-job",
            "enablejobpath": True,
            "gpuType": "P40",
            "ssh": True
        }

        pods, error = pod_template.generate_pods(job)

        self.assertFalse(error)
        # generate_pods returns a list of pod specs
        self.assertIsInstance(pods, list)
        self.assertEqual(1, len(pods))
        self.assertIsNotNone(pods[0]["spec"]["containers"][0]["command"])
Example #9
    def submit_job_impl(self, job):
        # check whether any pod with the label run=job_id still exists
        assert "jobId" in job
        job_id = job["jobId"]
        if not self._all_pods_not_existing(job_id):
            logging.warning(
                "Waiting until previous pods are cleaned up! Job {}".format(
                    job_id))
            job_deployer = JobDeployer()
            errors = job_deployer.delete_job(job_id, force=True)
            if errors:
                logging.warning("Force delete job {}: {}".format(
                    job_id, errors))
            return

        ret = {}
        dataHandler = DataHandler()

        try:
            # TODO refine later
            # before resubmitting the job, reset the endpoints
            # set all endpoints to status 'pending' so they restart when the job is ready
            endpoints = dataHandler.GetJobEndpoints(job_id)
            for endpoint_id, endpoint in endpoints.items():
                endpoint["status"] = "pending"
                logging.info("Reset endpoint status to 'pending': {}".format(
                    endpoint_id))
                dataHandler.UpdateEndpoint(endpoint)

            job["cluster"] = config
            job_object, errors = JobSchema().load(job)
            # TODO assert job_object is a Job
            assert isinstance(
                job_object,
                Job), "job_object is not of Job, but " + str(type(job_object))

            job_object.params = json.loads(base64.b64decode(job["jobParams"]))

            # inject gid, uid and user
            # TODO it should return only one entry
            user_info = dataHandler.GetIdentityInfo(
                job_object.params["userName"])[0]
            job_object.params["gid"] = user_info["gid"]
            job_object.params["uid"] = user_info["uid"]
            job_object.params["user"] = job_object.get_alias()

            if "job_token" not in job_object.params:
                if "user_sign_token" in config and "userName" in job_object.params:
                    job_object.params["job_token"] = hashlib.md5(
                        job_object.params["userName"] + ":" +
                        config["user_sign_token"]).hexdigest()
                else:
                    job_object.params["job_token"] = "tryme2017"

            if "envs" not in job_object.params:
                job_object.params["envs"] = []
            job_object.params["envs"].append({
                "name":
                "DLTS_JOB_TOKEN",
                "value":
                job_object.params["job_token"]
            })

            enable_custom_scheduler = job_object.is_custom_scheduler_enabled()
            secret_template = job_object.get_blobfuse_secret_template()
            if job_object.params["jobtrainingtype"] == "RegularJob":
                pod_template = PodTemplate(
                    job_object.get_template(),
                    enable_custom_scheduler=enable_custom_scheduler,
                    secret_template=secret_template)
            elif job_object.params["jobtrainingtype"] == "PSDistJob":
                pod_template = DistPodTemplate(job_object.get_template(),
                                               secret_template=secret_template)
            elif job_object.params["jobtrainingtype"] == "InferenceJob":
                pod_template = PodTemplate(
                    job_object.get_template(),
                    deployment_template=job_object.get_deployment_template(),
                    enable_custom_scheduler=False,
                    secret_template=secret_template)
            else:
                dataHandler.SetJobError(
                    job_object.job_id, "ERROR: invalid jobtrainingtype: %s" %
                    job_object.params["jobtrainingtype"])
                dataHandler.Close()
                return False

            pods, error = pod_template.generate_pods(job_object)
            if error:
                dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error)
                dataHandler.Close()
                return False

            job_description = "\n---\n".join([yaml.dump(pod) for pod in pods])
            job_description_path = "jobfiles/" + time.strftime(
                "%y%m%d"
            ) + "/" + job_object.job_id + "/" + job_object.job_id + ".yaml"
            local_jobDescriptionPath = os.path.realpath(
                os.path.join(config["storage-mount-path"],
                             job_description_path))
            if not os.path.exists(os.path.dirname(local_jobDescriptionPath)):
                os.makedirs(os.path.dirname(local_jobDescriptionPath))
            with open(local_jobDescriptionPath, 'w') as f:
                f.write(job_description)

            secrets = pod_template.generate_secrets(job_object)

            job_deployer = JobDeployer()
            try:
                secrets = job_deployer.create_secrets(secrets)
                ret["output"] = "Created secrets: {}. ".format(
                    [secret.metadata.name for secret in secrets])
                pods = job_deployer.create_pods(pods)
                ret["output"] += "Created pods: {}".format(
                    [pod.metadata.name for pod in pods])
            except Exception as e:
                ret["output"] = "Error: %s" % e.message
                logging.error(e, exc_info=True)

            ret["jobId"] = job_object.job_id

            dataHandler.UpdateJobTextField(job_object.job_id, "jobStatus",
                                           "scheduling")
            dataHandler.UpdateJobTextField(job_object.job_id,
                                           "jobDescriptionPath",
                                           job_description_path)
            dataHandler.UpdateJobTextField(job_object.job_id, "jobDescription",
                                           base64.b64encode(job_description))
            dataHandler.UpdateJobTextField(job_object.job_id, "lastUpdated",
                                           datetime.datetime.now().isoformat())

            jobMeta = {}
            jobMeta["jobDescriptionPath"] = job_description_path
            jobMeta["jobPath"] = job_object.job_path
            jobMeta["workPath"] = job_object.work_path
            # the command of the first container
            jobMeta["LaunchCMD"] = pods[0].spec.containers[0].command

            jobMetaStr = base64.b64encode(json.dumps(jobMeta))
            dataHandler.UpdateJobTextField(job_object.job_id, "jobMeta",
                                           jobMetaStr)
        except Exception as e:
            logging.error("Submit job failed: %s" % job, exc_info=True)
            ret["error"] = str(e)
            retries = dataHandler.AddandGetJobRetries(job["jobId"])
            if retries >= 5:
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "error")
                dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                               "Cannot submit job!" + str(e))

                detail = get_job_status_detail(job)
                detail = job_status_detail_with_finished_time(
                    detail, "error", "Server error in job submission")
                dataHandler.UpdateJobTextField(
                    job["jobId"], "jobStatusDetail",
                    base64.b64encode(json.dumps(detail)))

                # Try to clean up the job
                try:
                    job_deployer = JobDeployer()
                    job_deployer.delete_job(job_id, force=True)
                    logging.info(
                        "Cleaning up job %s succeeded after %d retries of job submission"
                        % (job["jobId"], retries))
                except:
                    logging.warning(
                        "Cleaning up job %s failed after %d retries of job submission"
                        % (job["jobId"], retries))

        dataHandler.Close()
        return ret