Example #1
    def store_function(
        self, session, function, name, project="", tag="", versioned=False
    ):
        project = project or config.default_project
        self._create_project_if_not_exists(session, project)
        tag = tag or get_in(function, "metadata.tag") or "latest"
        hash_key = fill_function_hash(function, tag)

        # clear tag from object in case another function will "take" that tag
        update_in(function, "metadata.tag", "")

        # "versioned" controls whether this function object is versioned, i.e. queryable by its hash key.
        # To enable that we set the uid to the hash key so each version gets a unique record (the unique
        # constraint of Function is the set (project, name, uid)).
        # When versioning is disabled we keep a single function object per (project, name, tag) that is
        # reused on every store_function call (we don't want to create a new record for every version),
        # so the uid is set to unversioned-{tag}.
        if versioned:
            uid = hash_key
        else:
            uid = f"unversioned-{tag}"

        updated = datetime.now(timezone.utc)
        update_in(function, "metadata.updated", updated)
        fn = self._get_function(session, name, project, uid)
        if not fn:
            fn = Function(name=name, project=project, uid=uid)
        fn.updated = updated
        labels = get_in(function, "metadata.labels", {})
        update_labels(fn, labels)
        fn.struct = function
        self._upsert(session, fn)
        self.tag_objects_v2(session, [fn], project, tag)
        return hash_key
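
The uid scheme described in the comment above boils down to a small rule. The helper below is a hypothetical illustration (not part of mlrun): versioned records are keyed by the function's content hash, while unversioned storage collapses to a single unversioned-{tag} record per (project, name, tag) that is overwritten on every store.

def resolve_function_uid(hash_key: str, tag: str, versioned: bool) -> str:
    # versioned: one record per content hash; unversioned: one reusable record per tag
    return hash_key if versioned else f"unversioned-{tag}"


assert resolve_function_uid("abc123", "latest", versioned=True) == "abc123"
assert resolve_function_uid("abc123", "latest", versioned=False) == "unversioned-latest"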
Example #2
def build_status(name: str = "",
                 project: str = "",
                 tag: str = "",
                 offset: int = 0,
                 logs: str = "on",
                 db_session: Session = Depends(deps.get_db_session)):
    logs = strtobool(logs)
    fn = get_db().get_function(db_session, name, project, tag)
    if not fn:
        log_and_raise(HTTPStatus.NOT_FOUND,
                      name=name,
                      project=project,
                      tag=tag)

    state = get_in(fn, "status.state", "")
    pod = get_in(fn, "status.build_pod", "")
    image = get_in(fn, "spec.build.image", "")
    out = b""
    if not pod:
        if state == "ready":
            image = image or get_in(fn, "spec.image")
        return Response(content=out,
                        media_type="text/plain",
                        headers={
                            "function_status": state,
                            "function_image": image,
                            "builder_pod": pod
                        })

    logger.info("get pod {} status".format(pod))
    state = get_k8s().get_pod_status(pod)
    logger.info("pod state={}".format(state))

    if state == "succeeded":
        logger.info("build completed successfully")
        state = "ready"
    if state in ["failed", "error"]:
        logger.error("build {}, watch the build pod logs: {}".format(
            state, pod))

    if logs and state != "pending":
        resp = get_k8s().logs(pod)
        if resp:
            out = resp.encode()[offset:]

    update_in(fn, "status.state", state)
    if state == "ready":
        update_in(fn, "spec.image", image)

    get_db().store_function(db_session, fn, name, project, tag)

    return Response(content=out,
                    media_type="text/plain",
                    headers={
                        "function_status": state,
                        "function_image": image,
                        "builder_pod": pod
                    })
Example #3
def build_status():
    name = request.args.get('name', '')
    project = request.args.get('project', '')
    tag = request.args.get('tag', '')
    offset = int(request.args.get('offset', '0'))
    logs = strtobool(request.args.get('logs', 'on'))

    fn = _db.get_function(name, project, tag)
    if not fn:
        return json_error(HTTPStatus.NOT_FOUND,
                          name=name,
                          project=project,
                          tag=tag)

    state = get_in(fn, 'status.state', '')
    pod = get_in(fn, 'status.build_pod', '')
    image = get_in(fn, 'spec.build.image', '')
    out = b''
    if not pod:
        if state == 'ready':
            image = image or get_in(fn, 'spec.image')
        return Response(out,
                        mimetype='text/plain',
                        headers={
                            "function_status": state,
                            "function_image": image,
                            "builder_pod": pod
                        })

    logger.info('get pod {} status'.format(pod))
    state = _k8s.get_pod_status(pod)
    logger.info('pod state={}'.format(state))

    if state == 'succeeded':
        logger.info('build completed successfully')
        state = 'ready'
    if state in ['failed', 'error']:
        logger.error('build {}, watch the build pod logs: {}'.format(
            state, pod))

    if logs and state != 'pending':
        resp = _k8s.logs(pod)
        if resp:
            out = resp.encode()[offset:]

    update_in(fn, 'status.state', state)
    if state == 'ready':
        update_in(fn, 'spec.image', image)

    _db.store_function(fn, name, project, tag)

    return Response(out,
                    mimetype='text/plain',
                    headers={
                        "function_status": state,
                        "function_image": image,
                        "builder_pod": pod
                    })
Example #4
def _update_result_body(result_path, event_body, result):
    if result_path and event_body:
        if not hasattr(event_body, "__getitem__"):
            raise TypeError(
                "result_path parameter supports only dict-like event bodies")
        update_in(event_body, result_path, result)
    else:
        event_body = result
    return event_body
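
All of these examples revolve around update_in, which sets a value at a dotted path inside a nested dict. The stand-in below is a simplified sketch (the real mlrun.utils.update_in also supports appending to lists); it shows the behavior _update_result_body relies on.

def update_in(obj, key, value):
    # walk the dotted path, creating intermediate dicts, then set the last segment
    parts = key.split(".")
    for part in parts[:-1]:
        obj = obj.setdefault(part, {})
    obj[parts[-1]] = value


event_body = {"inputs": {"x": 5}}
update_in(event_body, "outputs.prediction", 0.97)
assert event_body == {"inputs": {"x": 5}, "outputs": {"prediction": 0.97}}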
Example #5
    def _generate_mpi_job(self, runobj: RunObject, execution: MLClientCtx,
                          meta: client.V1ObjectMeta) -> typing.Dict:
        job = deepcopy(self._mpijob_template)

        pod_labels = deepcopy(meta.labels)
        pod_labels['mlrun/job'] = meta.name
        update_in(job, 'metadata', meta.to_dict())
        update_in(job, 'spec.template.metadata.labels', pod_labels)
        update_in(job, 'spec.replicas', self.spec.replicas or 1)
        if self.spec.image:
            self._update_container(job, 'image', self.full_image_path())
        update_in(job, 'spec.template.spec.volumes', self.spec.volumes)
        self._update_container(job, 'volumeMounts', self.spec.volume_mounts)

        extra_env = {'MLRUN_EXEC_CONFIG': runobj.to_json()}
        if runobj.spec.verbose:
            extra_env['MLRUN_LOG_LEVEL'] = 'debug'
        extra_env = [{'name': k, 'value': v} for k, v in extra_env.items()]
        self._update_container(job, 'env', extra_env + self.spec.env)
        if self.spec.image_pull_policy:
            self._update_container(job, 'imagePullPolicy',
                                   self.spec.image_pull_policy)
        if self.spec.resources:
            self._update_container(job, 'resources', self.spec.resources)
        if self.spec.workdir:
            self._update_container(job, 'workingDir', self.spec.workdir)

        if self.spec.image_pull_secret:
            update_in(
                job,
                'spec.template.spec.imagePullSecrets',
                [{
                    'name': self.spec.image_pull_secret
                }],
            )

        if self.spec.command:
            self._update_container(job, 'command',
                                   ['mpirun', 'python', self.spec.command] +
                                   self.spec.args)

        return job
Example #6
    def _generate_mpi_job(self, runobj: RunObject, execution: MLClientCtx,
                          meta: client.V1ObjectMeta) -> typing.Dict:
        job = deepcopy(self._mpijob_template)

        pod_labels = deepcopy(meta.labels)
        pod_labels["mlrun/job"] = meta.name
        update_in(job, "metadata", meta.to_dict())
        update_in(job, "spec.template.metadata.labels", pod_labels)
        update_in(job, "spec.replicas", self.spec.replicas or 1)
        if self.spec.image:
            self._update_container(job, "image", self.full_image_path())
        update_in(job, "spec.template.spec.volumes", self.spec.volumes)
        self._update_container(job, "volumeMounts", self.spec.volume_mounts)

        extra_env = self._generate_runtime_env(runobj)
        extra_env = [{"name": k, "value": v} for k, v in extra_env.items()]
        self._update_container(job, "env", extra_env + self.spec.env)
        if self.spec.image_pull_policy:
            self._update_container(job, "imagePullPolicy",
                                   self.spec.image_pull_policy)
        if self.spec.resources:
            self._update_container(job, "resources", self.spec.resources)
        if self.spec.workdir:
            self._update_container(job, "workingDir", self.spec.workdir)

        if self.spec.image_pull_secret:
            update_in(
                job,
                "spec.template.spec.imagePullSecrets",
                [{
                    "name": self.spec.image_pull_secret
                }],
            )

        if self.spec.command:
            self._update_container(job, "command",
                                   ["mpirun", "python", self.spec.command] +
                                   self.spec.args)

        return job
Example #7
    def update_run(self, session, updates: dict, uid, project="", iter=0):
        project = project or config.default_project
        run = self._get_run(session, uid, project, iter)
        if not run:
            raise DBError(f"run {uid}:{project} not found")
        struct = run.struct
        for key, val in updates.items():
            update_in(struct, key, val)
        run.struct = struct
        new_state = run_state(struct)
        if new_state:
            run.state = new_state
        start_time = run_start_time(struct)
        if start_time:
            run.start_time = start_time
        run.labels.clear()
        for name, value in run_labels(struct).items():
            lbl = Run.Label(name=name, value=value, parent=run.id)
            run.labels.append(lbl)
        session.merge(run)
        session.commit()
        self._delete_empty_labels(session, Run.Label)
Example #8
def get_log(project, uid):
    size = int(request.args.get('size', '-1'))
    offset = int(request.args.get('offset', '0'))

    out = b''
    log_file = log_path(project, uid)
    if log_file.exists():
        with log_file.open('rb') as fp:
            fp.seek(offset)
            out = fp.read(size)
        status = ''
    else:
        data = _db.read_run(uid, project)
        if not data:
            return json_error(HTTPStatus.NOT_FOUND, project=project, uid=uid)

        status = get_in(data, 'status.state', '')
        if _k8s:
            pods = _k8s.get_logger_pods(uid)
            if pods:
                pod, new_status = list(pods.items())[0]
                new_status = new_status.lower()

                # TODO: handle in cron/tracking
                if new_status != 'pending':
                    resp = _k8s.logs(pod)
                    if resp:
                        out = resp.encode()[offset:]
                    if status == 'running':
                        now = now_date().isoformat()
                        update_in(data, 'status.last_update', now)
                        if new_status == 'failed':
                            update_in(data, 'status.state', 'error')
                            update_in(data, 'status.error',
                                      'error, check logs')
                            _db.store_run(data, uid, project)
                        if new_status == 'succeeded':
                            update_in(data, 'status.state', 'completed')
                            _db.store_run(data, uid, project)
                status = new_status
            elif status == 'running':
                update_in(data, 'status.state', 'error')
                update_in(data, 'status.error',
                          'pod not found, maybe terminated')
                _db.store_run(data, uid, project)
                status = 'failed'

    return Response(out, mimetype='text/plain', headers={"pod_status": status})
Example #9
def build_status(
        name: str = "",
        project: str = "",
        tag: str = "",
        offset: int = 0,
        logs: bool = True,
        last_log_timestamp: float = 0.0,
        verbose: bool = False,
        db_session: Session = Depends(deps.get_db_session),
):
    fn = get_db().get_function(db_session, name, project, tag)
    if not fn:
        log_and_raise(HTTPStatus.NOT_FOUND.value,
                      name=name,
                      project=project,
                      tag=tag)

    # nuclio deploy status
    if fn.get("kind") in RuntimeKinds.nuclio_runtimes():
        (
            state,
            address,
            nuclio_name,
            last_log_timestamp,
            text,
        ) = get_nuclio_deploy_status(name,
                                     project,
                                     tag,
                                     last_log_timestamp=last_log_timestamp,
                                     verbose=verbose)
        if state == "ready":
            logger.info("Nuclio function deployed successfully", name=name)
        if state == "error":
            logger.error(f"Nuclio deploy error, {text}", name=name)
        update_in(fn, "status.nuclio_name", nuclio_name)
        update_in(fn, "status.state", state)
        update_in(fn, "status.address", address)

        versioned = False
        if state == "ready":
            # Versioned means the version will be saved in the DB forever; we don't want to spam
            # the DB with intermediate or unusable versions, only successfully deployed ones
            versioned = True
        get_db().store_function(db_session,
                                fn,
                                name,
                                project,
                                tag,
                                versioned=versioned)
        return Response(
            content=text,
            media_type="text/plain",
            headers={
                "x-mlrun-function-status": state,
                "x-mlrun-last-timestamp": str(last_log_timestamp),
                "x-mlrun-address": address,
                "x-mlrun-name": nuclio_name,
            },
        )

    # job deploy status
    state = get_in(fn, "status.state", "")
    pod = get_in(fn, "status.build_pod", "")
    image = get_in(fn, "spec.build.image", "")
    out = b""
    if not pod:
        if state == "ready":
            image = image or get_in(fn, "spec.image")
        return Response(
            content=out,
            media_type="text/plain",
            headers={
                "function_status": state,
                "function_image": image,
                "builder_pod": pod,
            },
        )

    logger.info("get pod {} status".format(pod))
    state = get_k8s().get_pod_status(pod)
    logger.info("pod state={}".format(state))

    if state == "succeeded":
        logger.info("build completed successfully")
        state = "ready"
    if state in ["failed", "error"]:
        logger.error("build {}, watch the build pod logs: {}".format(
            state, pod))

    if logs and state != "pending":
        resp = get_k8s().logs(pod)
        if resp:
            out = resp.encode()[offset:]

    update_in(fn, "status.state", state)
    if state == "ready":
        update_in(fn, "spec.image", image)

    versioned = False
    if state == "ready":
        versioned = True
    get_db().store_function(db_session,
                            fn,
                            name,
                            project,
                            tag,
                            versioned=versioned)

    return Response(
        content=out,
        media_type="text/plain",
        headers={
            "x-mlrun-function-status": state,
            "function_status": state,
            "function_image": image,
            "builder_pod": pod,
        },
    )
Example #10
    def get_log(db_session: Session,
                project: str,
                uid: str,
                size: int = -1,
                offset: int = 0,
                source: LogSources = LogSources.AUTO):
        out = b""
        log_file = log_path(project, uid)
        status = None
        if log_file.exists() and source in [
                LogSources.AUTO, LogSources.PERSISTENCY
        ]:
            with log_file.open("rb") as fp:
                fp.seek(offset)
                out = fp.read(size)
            status = ""
        elif source in [LogSources.AUTO, LogSources.K8S]:
            data = get_db().read_run(db_session, uid, project)
            if not data:
                log_and_raise(HTTPStatus.NOT_FOUND, project=project, uid=uid)

            status = get_in(data, "status.state", "")
            if get_k8s():
                pods = get_k8s().get_logger_pods(uid)
                if pods:
                    pod, new_status = list(pods.items())[0]
                    new_status = new_status.lower()

                    # TODO: handle in cron/tracking
                    if new_status != "pending":
                        resp = get_k8s().logs(pod)
                        if resp:
                            out = resp.encode()[offset:]
                        if status == "running":
                            now = now_date().isoformat()
                            update_in(data, "status.last_update", now)
                            if new_status == "failed":
                                update_in(data, "status.state", "error")
                                update_in(data, "status.error",
                                          "error, check logs")
                                get_db().store_run(db_session, data, uid,
                                                   project)
                            if new_status == "succeeded":
                                update_in(data, "status.state", "completed")
                                get_db().store_run(db_session, data, uid,
                                                   project)
                    status = new_status
                elif status == "running":
                    update_in(data, "status.state", "error")
                    update_in(data, "status.error",
                              "pod not found, maybe terminated")
                    get_db().store_run(db_session, data, uid, project)
                    status = "failed"
        return out, status
Example #11
def build_status(
        name: str = "",
        project: str = "",
        tag: str = "",
        offset: int = 0,
        logs: bool = True,
        last_log_timestamp: float = 0.0,
        verbose: bool = False,
        auth_info: mlrun.api.schemas.AuthInfo = Depends(
            deps.authenticate_request),
        db_session: Session = Depends(deps.get_db_session),
):
    mlrun.api.utils.auth.verifier.AuthVerifier().query_project_resource_permissions(
        mlrun.api.schemas.AuthorizationResourceTypes.function,
        project or mlrun.mlconf.default_project,
        name,
        # store since with the current mechanism we update the status (and store the function)
        # in the DB when a client queries for the status
        mlrun.api.schemas.AuthorizationAction.store,
        auth_info,
    )
    fn = mlrun.api.crud.Functions().get_function(db_session, name, project,
                                                 tag)
    if not fn:
        log_and_raise(HTTPStatus.NOT_FOUND.value,
                      name=name,
                      project=project,
                      tag=tag)

    # nuclio deploy status
    if fn.get("kind") in RuntimeKinds.nuclio_runtimes():
        (
            state,
            address,
            nuclio_name,
            last_log_timestamp,
            text,
            status,
        ) = get_nuclio_deploy_status(
            name,
            project,
            tag,
            last_log_timestamp=last_log_timestamp,
            verbose=verbose,
            auth_info=auth_info,
        )
        if state == "ready":
            logger.info("Nuclio function deployed successfully", name=name)
        if state in ["error", "unhealthy"]:
            logger.error(f"Nuclio deploy error, {text}", name=name)

        internal_invocation_urls = status.get("internalInvocationUrls", [])
        external_invocation_urls = status.get("externalInvocationUrls", [])

        # on earlier versions of mlrun, address used to represent the nodePort external invocation url.
        # now that functions may not be exposed (when using service_type clusterIP) this is no longer
        # relevant, so for backwards compatibility it is filled with the first external invocation url,
        # or left completely empty.
        address = external_invocation_urls[0] if external_invocation_urls else ""

        update_in(fn, "status.nuclio_name", nuclio_name)
        update_in(fn, "status.internal_invocation_urls",
                  internal_invocation_urls)
        update_in(fn, "status.external_invocation_urls",
                  external_invocation_urls)
        update_in(fn, "status.state", state)
        update_in(fn, "status.address", address)

        versioned = False
        if state == "ready":
            # Versioned means the version will be saved in the DB forever; we don't want to spam
            # the DB with intermediate or unusable versions, only successfully deployed ones
            versioned = True
        mlrun.api.crud.Functions().store_function(
            db_session,
            fn,
            name,
            project,
            tag,
            versioned=versioned,
        )
        return Response(
            content=text,
            media_type="text/plain",
            headers={
                "x-mlrun-function-status": state,
                "x-mlrun-last-timestamp": str(last_log_timestamp),
                "x-mlrun-address": address,
                "x-mlrun-internal-invocation-urls": ",".join(
                    internal_invocation_urls),
                "x-mlrun-external-invocation-urls": ",".join(
                    external_invocation_urls),
                "x-mlrun-name": nuclio_name,
            },
        )

    # job deploy status
    state = get_in(fn, "status.state", "")
    pod = get_in(fn, "status.build_pod", "")
    image = get_in(fn, "spec.build.image", "")
    out = b""
    if not pod:
        if state == "ready":
            image = image or get_in(fn, "spec.image")
        return Response(
            content=out,
            media_type="text/plain",
            headers={
                "function_status": state,
                "function_image": image,
                "builder_pod": pod,
            },
        )

    logger.info(f"get pod {pod} status")
    state = get_k8s().get_pod_status(pod)
    logger.info(f"pod state={state}")

    if state == "succeeded":
        logger.info("build completed successfully")
        state = mlrun.api.schemas.FunctionState.ready
    if state in ["failed", "error"]:
        logger.error(f"build {state}, watch the build pod logs: {pod}")
        state = mlrun.api.schemas.FunctionState.error

    if logs and state != "pending":
        resp = get_k8s().logs(pod)
        if resp:
            out = resp.encode()[offset:]

    update_in(fn, "status.state", state)
    if state == mlrun.api.schemas.FunctionState.ready:
        update_in(fn, "spec.image", image)

    versioned = False
    if state == mlrun.api.schemas.FunctionState.ready:
        versioned = True
    mlrun.api.crud.Functions().store_function(
        db_session,
        fn,
        name,
        project,
        tag,
        versioned=versioned,
    )

    return Response(
        content=out,
        media_type="text/plain",
        headers={
            "x-mlrun-function-status": state,
            "function_status": state,
            "function_image": image,
            "builder_pod": pod,
        },
    )
Example #12
def build_status(
        name: str = "",
        project: str = "",
        tag: str = "",
        offset: int = 0,
        logs: bool = True,
        last_log_timestamp: float = 0.0,
        verbose: bool = False,
        auth_verifier: deps.AuthVerifier = Depends(deps.AuthVerifier),
        db_session: Session = Depends(deps.get_db_session),
):
    fn = get_db().get_function(db_session, name, project, tag)
    if not fn:
        log_and_raise(HTTPStatus.NOT_FOUND.value,
                      name=name,
                      project=project,
                      tag=tag)

    # nuclio deploy status
    if fn.get("kind") in RuntimeKinds.nuclio_runtimes():
        (
            state,
            address,
            nuclio_name,
            last_log_timestamp,
            text,
            status,
        ) = get_nuclio_deploy_status(name,
                                     project,
                                     tag,
                                     last_log_timestamp=last_log_timestamp,
                                     verbose=verbose)
        if state == "ready":
            logger.info("Nuclio function deployed successfully", name=name)
        if state in ["error", "unhealthy"]:
            logger.error(f"Nuclio deploy error, {text}", name=name)

        # internal / external invocation urls were added in nuclio 1.6.x,
        # so on older nuclio versions they might be empty;
        # to stay backward compatible we fall back to hard-coded default values
        internal_invocation_urls = status.get(
            "internalInvocationUrls",
            [resolve_function_internal_invocation_url(name)])
        external_invocation_urls = status.get("externalInvocationUrls",
                                              [address] if address else [])

        # on nuclio > 1.6.x we get the external invocation url on the status block
        if external_invocation_urls and not address:
            address = external_invocation_urls[0]

        update_in(fn, "status.nuclio_name", nuclio_name)
        update_in(fn, "status.internal_invocation_urls",
                  internal_invocation_urls)
        update_in(fn, "status.external_invocation_urls",
                  external_invocation_urls)
        update_in(fn, "status.state", state)
        update_in(fn, "status.address", address)

        versioned = False
        if state == "ready":
            # Versioned means the version will be saved in the DB forever; we don't want to spam
            # the DB with intermediate or unusable versions, only successfully deployed ones
            versioned = True
        get_db().store_function(
            db_session,
            fn,
            name,
            project,
            tag,
            versioned=versioned,
            leader_session=auth_verifier.auth_info.session,
        )
        return Response(
            content=text,
            media_type="text/plain",
            headers={
                "x-mlrun-function-status": state,
                "x-mlrun-last-timestamp": str(last_log_timestamp),
                "x-mlrun-address": address,
                "x-mlrun-internal-invocation-urls": ",".join(
                    internal_invocation_urls),
                "x-mlrun-external-invocation-urls": ",".join(
                    external_invocation_urls),
                "x-mlrun-name": nuclio_name,
            },
        )

    # job deploy status
    state = get_in(fn, "status.state", "")
    pod = get_in(fn, "status.build_pod", "")
    image = get_in(fn, "spec.build.image", "")
    out = b""
    if not pod:
        if state == "ready":
            image = image or get_in(fn, "spec.image")
        return Response(
            content=out,
            media_type="text/plain",
            headers={
                "function_status": state,
                "function_image": image,
                "builder_pod": pod,
            },
        )

    logger.info(f"get pod {pod} status")
    state = get_k8s().get_pod_status(pod)
    logger.info(f"pod state={state}")

    if state == "succeeded":
        logger.info("build completed successfully")
        state = mlrun.api.schemas.FunctionState.ready
    if state in ["failed", "error"]:
        logger.error(f"build {state}, watch the build pod logs: {pod}")
        state = mlrun.api.schemas.FunctionState.error

    if logs and state != "pending":
        resp = get_k8s().logs(pod)
        if resp:
            out = resp.encode()[offset:]

    update_in(fn, "status.state", state)
    if state == mlrun.api.schemas.FunctionState.ready:
        update_in(fn, "spec.image", image)

    versioned = False
    if state == mlrun.api.schemas.FunctionState.ready:
        versioned = True
    get_db().store_function(
        db_session,
        fn,
        name,
        project,
        tag,
        versioned=versioned,
        leader_session=auth_verifier.auth_info.session,
    )

    return Response(
        content=out,
        media_type="text/plain",
        headers={
            "x-mlrun-function-status": state,
            "function_status": state,
            "function_image": image,
            "builder_pod": pod,
        },
    )
Example #13
    def _generate_mpi_job(self, runobj: RunObject, execution: MLClientCtx, meta: client.V1ObjectMeta) -> dict:
        pod_labels = deepcopy(meta.labels)
        pod_labels['mlrun/job'] = meta.name

        # Populate mpijob object

        # start by populating pod templates
        launcher_pod_template = deepcopy(self._mpijob_pod_template)
        worker_pod_template = deepcopy(self._mpijob_pod_template)

        # configuration for both launcher and workers
        for pod_template in [launcher_pod_template, worker_pod_template]:
            if self.spec.image:
                self._update_container(pod_template, 'image', self.full_image_path())
            self._update_container(pod_template, 'volumeMounts', self.spec.volume_mounts)
            extra_env = {'MLRUN_EXEC_CONFIG': runobj.to_json()}
            # if self.spec.rundb:
            #     extra_env['MLRUN_DBPATH'] = self.spec.rundb
            extra_env = [{'name': k, 'value': v} for k, v in extra_env.items()]
            self._update_container(pod_template, 'env', extra_env + self.spec.env)
            if self.spec.image_pull_policy:
                self._update_container(
                    pod_template, 'imagePullPolicy', self.spec.image_pull_policy)
            if self.spec.workdir:
                self._update_container(pod_template, 'workingDir', self.spec.workdir)
            if self.spec.image_pull_secret:
                update_in(pod_template, 'spec.imagePullSecrets',
                          [{'name': self.spec.image_pull_secret}])
            update_in(pod_template, 'metadata.labels', pod_labels)
            update_in(pod_template, 'spec.volumes', self.spec.volumes)

        # configuration for workers only
        # update resources only for workers because the launcher doesn't require
        # special resources (like GPUs, Memory, etc..)
        self._enrich_worker_configurations(worker_pod_template)

        # configuration for launcher only
        self._enrich_launcher_configurations(launcher_pod_template)

        # generate mpi job using both pod templates
        job = self._generate_mpi_job_template(launcher_pod_template, worker_pod_template)

        # update the replicas only for workers
        update_in(job, 'spec.mpiReplicaSpecs.Worker.replicas', self.spec.replicas or 1)

        if execution.get_param('slots_per_worker'):
            update_in(job, 'spec.slotsPerWorker', execution.get_param('slots_per_worker'))

        update_in(job, 'metadata', meta.to_dict())

        return job
Example #14
def tag_test(spec, name):
    spec = deepcopy(spec)
    update_in(spec, 'metadata.name', name)
    update_in(spec, 'metadata.labels.test', name)
    return spec
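
A quick usage sketch for tag_test (assuming update_in semantics as above and a plain nested dict spec):

base_spec = {'metadata': {'name': 'base', 'labels': {}}}
tagged = tag_test(base_spec, 'my-test')
assert tagged['metadata']['name'] == 'my-test'
assert tagged['metadata']['labels']['test'] == 'my-test'
# the original spec is untouched because tag_test works on a deepcopy
assert base_spec['metadata']['name'] == 'base'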
Example #15
    def _generate_mpi_job(
        self,
        runobj: RunObject,
        execution: MLClientCtx,
        meta: client.V1ObjectMeta,
    ) -> dict:
        pod_labels = deepcopy(meta.labels)
        pod_labels["mlrun/job"] = meta.name

        # Populate mpijob object

        # start by populating pod templates
        launcher_pod_template = deepcopy(self._mpijob_pod_template)
        worker_pod_template = deepcopy(self._mpijob_pod_template)

        # configuration for both launcher and workers
        for pod_template in [launcher_pod_template, worker_pod_template]:
            if self.spec.image:
                self._update_container(pod_template, "image",
                                       self.full_image_path())
            self._update_container(pod_template, "volumeMounts",
                                   self.spec.volume_mounts)
            extra_env = self._generate_runtime_env(runobj)
            extra_env = [{"name": k, "value": v} for k, v in extra_env.items()]
            self._update_container(pod_template, "env",
                                   extra_env + self.spec.env)
            if self.spec.image_pull_policy:
                self._update_container(
                    pod_template,
                    "imagePullPolicy",
                    self.spec.image_pull_policy,
                )
            if self.spec.workdir:
                self._update_container(pod_template, "workingDir",
                                       self.spec.workdir)
            if self.spec.image_pull_secret:
                update_in(
                    pod_template,
                    "spec.imagePullSecrets",
                    [{
                        "name": self.spec.image_pull_secret
                    }],
                )
            update_in(pod_template, "metadata.labels", pod_labels)
            update_in(pod_template, "spec.volumes", self.spec.volumes)

        # configuration for workers only
        # update resources only for workers because the launcher
        # doesn't require special resources (like GPUs, Memory, etc..)
        self._enrich_worker_configurations(worker_pod_template)

        # configuration for launcher only
        self._enrich_launcher_configurations(launcher_pod_template)

        # generate mpi job using both pod templates
        job = self._generate_mpi_job_template(launcher_pod_template,
                                              worker_pod_template)

        # update the replicas only for workers
        update_in(
            job,
            "spec.mpiReplicaSpecs.Worker.replicas",
            self.spec.replicas or 1,
        )

        update_in(
            job,
            "spec.cleanPodPolicy",
            self.spec.clean_pod_policy,
        )

        if execution.get_param("slots_per_worker"):
            update_in(
                job,
                "spec.slotsPerWorker",
                execution.get_param("slots_per_worker"),
            )

        update_in(job, "metadata", meta.to_dict())

        return job
Example #16
    def _generate_mpi_job(
        self,
        runobj: RunObject,
        execution: MLClientCtx,
        meta: client.V1ObjectMeta,
    ) -> dict:
        pod_labels = deepcopy(meta.labels)
        pod_labels["mlrun/job"] = meta.name

        # Populate mpijob object

        # start by populating pod templates
        launcher_pod_template = deepcopy(self._mpijob_pod_template)
        worker_pod_template = deepcopy(self._mpijob_pod_template)
        command, args, extra_env = self._get_cmd_args(runobj)

        # configuration for both launcher and workers
        for pod_template in [launcher_pod_template, worker_pod_template]:
            if self.spec.image:
                self._update_container(pod_template, "image",
                                       self.full_image_path())
            self._update_container(pod_template, "volumeMounts",
                                   self.spec.volume_mounts)
            self._update_container(pod_template, "env",
                                   extra_env + self.spec.env)
            if self.spec.image_pull_policy:
                self._update_container(
                    pod_template,
                    "imagePullPolicy",
                    self.spec.image_pull_policy,
                )
            if self.spec.workdir:
                self._update_container(pod_template, "workingDir",
                                       self.spec.workdir)
            if self.spec.image_pull_secret:
                update_in(
                    pod_template,
                    "spec.imagePullSecrets",
                    [{
                        "name": self.spec.image_pull_secret
                    }],
                )
            update_in(pod_template, "metadata.labels", pod_labels)
            update_in(pod_template, "spec.volumes", self.spec.volumes)
            update_in(pod_template, "spec.nodeName", self.spec.node_name)
            update_in(pod_template, "spec.nodeSelector",
                      self.spec.node_selector)
            update_in(pod_template, "spec.affinity",
                      self.spec._get_sanitized_affinity())
            if self.spec.priority_class_name and len(
                    mlconf.get_valid_function_priority_class_names()):
                update_in(
                    pod_template,
                    "spec.priorityClassName",
                    self.spec.priority_class_name,
                )

        # configuration for workers only
        # update resources only for workers because the launcher
        # doesn't require special resources (like GPUs, Memory, etc..)
        self._enrich_worker_configurations(worker_pod_template)

        # configuration for launcher only
        self._enrich_launcher_configurations(launcher_pod_template,
                                             [command] + args)

        # generate mpi job using both pod templates
        job = self._generate_mpi_job_template(launcher_pod_template,
                                              worker_pod_template)

        # update the replicas only for workers
        update_in(
            job,
            "spec.mpiReplicaSpecs.Worker.replicas",
            self.spec.replicas or 1,
        )

        update_in(
            job,
            "spec.cleanPodPolicy",
            self.spec.clean_pod_policy,
        )

        if execution.get_param("slots_per_worker"):
            update_in(
                job,
                "spec.slotsPerWorker",
                execution.get_param("slots_per_worker"),
            )

        update_in(job, "metadata", meta.to_dict())

        return job