def list_runtimes(label_selector: str = None):
    """List the resources of every runtime kind that has a handler.

    Args:
        label_selector: Selector passed unchanged to each handler's
            ``list_resources``; ``None`` when omitted.

    Returns:
        A list with one ``{"kind": ..., "resources": ...}`` dict per kind, in
        the order ``RuntimeKinds.runtime_with_handlers()`` yields the kinds.
    """
    return [
        {
            "kind": runtime_kind,
            "resources": get_runtime_handler(runtime_kind).list_resources(
                label_selector
            ),
        }
        for runtime_kind in RuntimeKinds.runtime_with_handlers()
    ]
def _build_function(
    db_session,
    auth_info: mlrun.api.schemas.AuthInfo,
    function,
    with_mlrun,
    skip_deployed,
    mlrun_version_specifier,
):
    """Create a function object from ``function`` and deploy or build it.

    Args:
        db_session: DB session used to obtain the run DB instance.
        auth_info: Caller auth info; its ``session`` is passed to the run DB
            and it is used to set auth on nuclio functions.
        function: Runtime definition handed to ``new_function``.
        with_mlrun: Forwarded to ``build_runtime``.
        skip_deployed: Forwarded to ``build_runtime``.
        mlrun_version_specifier: Forwarded to ``build_runtime``.

    Returns:
        ``(fn, ready)``. ``ready`` is ``False`` for nuclio kinds (deployment
        is only started here) and the ``build_runtime`` result otherwise.

    Any exception is logged with its traceback and reported through
    ``log_and_raise`` with a 400 status.
    """
    fn = ready = None
    try:
        fn = new_function(runtime=function)
        fn.set_db_connection(get_run_db_instance(db_session, auth_info.session))
        fn.save(versioned=False)
        if fn.kind not in RuntimeKinds.nuclio_runtimes():
            ready = build_runtime(
                fn, with_mlrun, mlrun_version_specifier, skip_deployed
            )
        else:
            mlrun.api.api.utils.ensure_function_has_auth_set(fn, auth_info)
            deploy_nuclio_function(fn)
            # deploy only starts the process, the get status API is used to
            # check readiness
            ready = False
        # NOTE(review): the original indentation was lost; the final save/log
        # is taken to run for both branches - confirm.
        fn.save(versioned=True)
        logger.info("Fn:\n %s", fn.to_yaml())
    except Exception as exc:
        logger.error(traceback.format_exc())
        log_and_raise(HTTPStatus.BAD_REQUEST.value, reason=f"runtime error: {exc}")
    return fn, ready
def delete_runtimes(
    label_selector: str = None,
    force: bool = False,
    db_session: Session = Depends(deps.get_db_session),
):
    """Delete the resources of every runtime kind that has a handler.

    Args:
        label_selector: Selector forwarded to each handler's
            ``delete_resources``.
        force: Forwarded to ``delete_resources``.
        db_session: DB session supplied through the ``Depends`` default.

    Returns:
        An empty ``Response`` with status ``status.HTTP_204_NO_CONTENT``.
    """
    for runtime_kind in RuntimeKinds.runtime_with_handlers():
        get_runtime_handler(runtime_kind).delete_resources(
            get_db(), db_session, label_selector, force
        )
    return Response(status_code=status.HTTP_204_NO_CONTENT)
def _cleanup_runtimes():
    """Delete runtime resources for every runtime kind that has a handler.

    A fresh DB session is created for the sweep and always closed afterwards.
    A failure while handling one kind is logged and skipped, so a single
    broken handler does not stop the remaining kinds from being cleaned.
    """
    logger.debug('Cleaning runtimes')
    db_session = create_session()
    try:
        for kind in RuntimeKinds.runtime_with_handlers():
            try:
                runtime_handler = get_runtime_handler(kind)
                runtime_handler.delete_resources(get_db(), db_session)
            except Exception as exc:
                # Best effort: keep going with the other kinds. The details
                # are folded into the message so this works with any logger.
                logger.warning(
                    f"Failed deleting resources for runtime kind {kind}. "
                    f"Ignoring: {exc}"
                )
    finally:
        close_session(db_session)
def delete_runtimes(
    label_selector: str = None,
    force: bool = False,
    grace_period: int = config.runtime_resources_deletion_grace_period,
    db_session: Session = Depends(deps.get_db_session),
):
    """Delete the resources of every runtime kind that has a handler.

    Args:
        label_selector: Selector forwarded to each handler's
            ``delete_resources``.
        force: Forwarded to ``delete_resources``.
        grace_period: Forwarded to ``delete_resources``; defaults to
            ``config.runtime_resources_deletion_grace_period``.
        db_session: DB session supplied through the ``Depends`` default.

    Returns:
        An empty ``Response`` with status ``HTTPStatus.NO_CONTENT``.
    """
    for runtime_kind in RuntimeKinds.runtime_with_handlers():
        get_runtime_handler(runtime_kind).delete_resources(
            get_db(), db_session, label_selector, force, grace_period
        )
    return Response(status_code=HTTPStatus.NO_CONTENT.value)
def get_runtime(kind: str, label_selector: str = None):
    """Return the resources of a single runtime kind.

    Args:
        kind: Runtime kind; must be one of
            ``RuntimeKinds.runtime_with_handlers()``.
        label_selector: Selector forwarded to the handler's
            ``list_resources``.

    Returns:
        ``{"kind": kind, "resources": <handler resources>}``.

    An unknown ``kind`` is reported through ``log_and_raise`` with a 400
    status.
    """
    known_kinds = RuntimeKinds.runtime_with_handlers()
    if kind not in known_kinds:
        log_and_raise(
            HTTPStatus.BAD_REQUEST.value, kind=kind, err="Invalid runtime kind"
        )
    handler = get_runtime_handler(kind)
    return {"kind": kind, "resources": handler.list_resources(label_selector)}
def get_runtime(kind: str, label_selector: str = None):
    """Return the resources of a single runtime kind.

    Args:
        kind: Runtime kind; must be one of
            ``RuntimeKinds.runtime_with_handlers()``.
        label_selector: Selector forwarded to the handler's
            ``list_resources``.

    Returns:
        ``{'kind': kind, 'resources': <handler resources>}``.

    An unknown ``kind`` is reported through ``log_and_raise`` with
    ``status.HTTP_400_BAD_REQUEST``.
    """
    known_kinds = RuntimeKinds.runtime_with_handlers()
    if kind not in known_kinds:
        log_and_raise(
            status.HTTP_400_BAD_REQUEST, kind=kind, err='Invalid runtime kind'
        )
    handler = get_runtime_handler(kind)
    return {'kind': kind, 'resources': handler.list_resources(label_selector)}
def delete_runtime(
    kind: str,
    label_selector: str = None,
    force: bool = False,
    db_session: Session = Depends(deps.get_db_session),
):
    """Delete the resources of a single runtime kind.

    Args:
        kind: Runtime kind; must be one of
            ``RuntimeKinds.runtime_with_handlers()``.
        label_selector: Selector forwarded to ``delete_resources``.
        force: Forwarded to ``delete_resources``.
        db_session: DB session supplied through the ``Depends`` default.

    Returns:
        An empty ``Response`` with status ``status.HTTP_204_NO_CONTENT``.

    An unknown ``kind`` is reported through ``log_and_raise`` with
    ``status.HTTP_400_BAD_REQUEST``.
    """
    known_kinds = RuntimeKinds.runtime_with_handlers()
    if kind not in known_kinds:
        log_and_raise(
            status.HTTP_400_BAD_REQUEST, kind=kind, err='Invalid runtime kind'
        )
    get_runtime_handler(kind).delete_resources(
        get_db(), db_session, label_selector, force
    )
    return Response(status_code=status.HTTP_204_NO_CONTENT)
def _cleanup_runtimes():
    """Delete runtime resources for every runtime kind that has a handler.

    A fresh DB session is created for the sweep and always closed afterwards.
    A failure while handling one kind is logged as a warning and ignored so
    the remaining kinds are still processed.
    """
    session = create_session()
    try:
        for runtime_kind in RuntimeKinds.runtime_with_handlers():
            try:
                get_runtime_handler(runtime_kind).delete_resources(get_db(), session)
            except Exception as error:
                logger.warning(
                    "Failed deleting resources. Ignoring",
                    exc=str(error),
                    kind=runtime_kind,
                )
    finally:
        close_session(session)
def _monitor_runs():
    """Invoke ``monitor_runs`` on the handler of every runtime kind.

    A fresh DB session is created for the pass and always closed afterwards.
    A failure while handling one kind is logged as a warning and ignored so
    the remaining kinds are still processed.
    """
    session = create_session()
    try:
        for runtime_kind in RuntimeKinds.runtime_with_handlers():
            try:
                get_runtime_handler(runtime_kind).monitor_runs(get_db(), session)
            except Exception as error:
                logger.warning(
                    "Failed monitoring runs. Ignoring",
                    exc=str(error),
                    kind=runtime_kind,
                )
    finally:
        close_session(session)
def delete_runtime(
    kind: str,
    label_selector: str = None,
    force: bool = False,
    grace_period: int = config.runtime_resources_deletion_grace_period,
    db_session: Session = Depends(deps.get_db_session),
):
    """Delete the resources of a single runtime kind.

    Args:
        kind: Runtime kind; must be one of
            ``RuntimeKinds.runtime_with_handlers()``.
        label_selector: Selector forwarded to ``delete_resources``.
        force: Forwarded to ``delete_resources``.
        grace_period: Forwarded to ``delete_resources``; defaults to
            ``config.runtime_resources_deletion_grace_period``.
        db_session: DB session supplied through the ``Depends`` default.

    Returns:
        An empty ``Response`` with status ``HTTPStatus.NO_CONTENT``.

    An unknown ``kind`` is reported through ``log_and_raise`` with a 400
    status.
    """
    known_kinds = RuntimeKinds.runtime_with_handlers()
    if kind not in known_kinds:
        log_and_raise(
            HTTPStatus.BAD_REQUEST.value, kind=kind, err="Invalid runtime kind"
        )
    get_runtime_handler(kind).delete_resources(
        get_db(), db_session, label_selector, force, grace_period
    )
    return Response(status_code=HTTPStatus.NO_CONTENT.value)
def _build_function(db_session, function, with_mlrun):
    """Create a function object from ``function`` and deploy or build it.

    Args:
        db_session: DB session used to obtain the run DB instance.
        function: Runtime definition handed to ``new_function``.
        with_mlrun: Forwarded to ``build_runtime``.

    Returns:
        ``(fn, ready)``. ``ready`` is ``False`` for nuclio kinds (deployment
        is only started here) and the ``build_runtime`` result otherwise.

    Any exception is logged with its traceback and reported through
    ``log_and_raise`` with a 400 status.
    """
    fn = ready = None
    try:
        fn = new_function(runtime=function)
        fn.set_db_connection(get_run_db_instance(db_session))
        fn.save(versioned=False)
        if fn.kind not in RuntimeKinds.nuclio_runtimes():
            ready = build_runtime(fn, with_mlrun)
        else:
            deploy_nuclio_function(fn)
            # deploy only starts the process, the get status API is used to
            # check readiness
            ready = False
        # NOTE(review): the original indentation was lost; the final save/log
        # is taken to run for both branches - confirm.
        fn.save(versioned=True)
        logger.info("Fn:\n %s", fn.to_yaml())
    except Exception as exc:
        logger.error(traceback.format_exc())
        log_and_raise(HTTPStatus.BAD_REQUEST.value, reason=f"runtime error: {exc}")
    return fn, ready
def build_status(
    name: str = "",
    project: str = "",
    tag: str = "",
    offset: int = 0,
    logs: bool = True,
    last_log_timestamp: float = 0.0,
    verbose: bool = False,
    db_session: Session = Depends(deps.get_db_session),
):
    """Report, and persist, the deploy/build status of a function.

    For nuclio kinds the status comes from ``get_nuclio_deploy_status``; for
    other kinds it comes from the build pod recorded in ``status.build_pod``.
    In both cases the refreshed state is written back to the DB, and the
    stored record is marked versioned only when the state is ``"ready"``.

    Args:
        name: Function name.
        project: Project name.
        tag: Function tag.
        offset: Byte offset into the encoded build-pod logs to return from.
        logs: Whether build-pod logs are fetched (job path only).
        last_log_timestamp: Forwarded to ``get_nuclio_deploy_status``.
        verbose: Forwarded to ``get_nuclio_deploy_status``.
        db_session: DB session supplied through the ``Depends`` default.

    Returns:
        A ``text/plain`` ``Response`` whose body is the log text and whose
        headers carry the state plus the address/name (nuclio) or the
        image/builder pod (job).

    A missing function is reported through ``log_and_raise`` with a 404
    status.
    """
    fn = get_db().get_function(db_session, name, project, tag)
    if not fn:
        log_and_raise(HTTPStatus.NOT_FOUND.value, name=name, project=project, tag=tag)

    # nuclio deploy status
    if fn.get("kind") in RuntimeKinds.nuclio_runtimes():
        (
            state,
            address,
            nuclio_name,
            last_log_timestamp,
            text,
        ) = get_nuclio_deploy_status(
            name, project, tag, last_log_timestamp=last_log_timestamp, verbose=verbose
        )
        if state == "ready":
            logger.info("Nuclio function deployed successfully", name=name)
        if state == "error":
            logger.error(f"Nuclio deploy error, {text}", name=name)
        update_in(fn, "status.nuclio_name", nuclio_name)
        update_in(fn, "status.state", state)
        update_in(fn, "status.address", address)

        # Versioned means the version will be saved in the DB forever, we
        # don't want to spam the DB with intermediate or unusable versions,
        # only successfully deployed versions
        versioned = state == "ready"
        get_db().store_function(db_session, fn, name, project, tag, versioned=versioned)
        return Response(
            content=text,
            media_type="text/plain",
            headers={
                "x-mlrun-function-status": state,
                "x-mlrun-last-timestamp": str(last_log_timestamp),
                "x-mlrun-address": address,
                "x-mlrun-name": nuclio_name,
            },
        )

    # job deploy status
    state = get_in(fn, "status.state", "")
    pod = get_in(fn, "status.build_pod", "")
    image = get_in(fn, "spec.build.image", "")
    out = b""
    if not pod:
        if state == "ready":
            # The fallback lookup has no default, so it may not yield a
            # string (presumably None when the key is missing - confirm);
            # header values are expected to be strings, hence the final "".
            image = image or get_in(fn, "spec.image") or ""
        return Response(
            content=out,
            media_type="text/plain",
            headers={
                "function_status": state,
                "function_image": image,
                "builder_pod": pod,
            },
        )

    logger.info(f"get pod {pod} status")
    state = get_k8s().get_pod_status(pod)
    logger.info(f"pod state={state}")

    if state == "succeeded":
        logger.info("build completed successfully")
        state = "ready"
    if state in ["failed", "error"]:
        logger.error(f"build {state}, watch the build pod logs: {pod}")

    if logs and state != "pending":
        resp = get_k8s().logs(pod)
        if resp:
            out = resp.encode()[offset:]

    update_in(fn, "status.state", state)
    if state == "ready":
        update_in(fn, "spec.image", image)

    # Only a successfully built function is stored as a versioned record.
    versioned = state == "ready"
    get_db().store_function(db_session, fn, name, project, tag, versioned=versioned)

    return Response(
        content=out,
        media_type="text/plain",
        headers={
            "x-mlrun-function-status": state,
            "function_status": state,
            "function_image": image,
            "builder_pod": pod,
        },
    )
def _build_function(
    db_session,
    auth_info: mlrun.api.schemas.AuthInfo,
    function,
    with_mlrun=True,
    skip_deployed=False,
    mlrun_version_specifier=None,
    builder_env=None,
):
    """Create a function object from ``function`` and deploy or build it.

    Nuclio kinds get auth and service-account processing and are then
    deployed. For the serving kind with ``spec.track_models`` set, the model
    monitoring infrastructure is initialised first; a failure there is only
    logged as a warning and does not stop the deployment. Other kinds go
    through ``build_runtime``.

    Args:
        db_session: DB session used for the run DB and model monitoring.
        auth_info: Caller auth info forwarded to the deploy/build helpers.
        function: Runtime definition handed to ``new_function``.
        with_mlrun: Forwarded to ``build_runtime``.
        skip_deployed: Forwarded to ``build_runtime``.
        mlrun_version_specifier: Forwarded to ``build_runtime``.
        builder_env: Forwarded to ``build_runtime``.

    Returns:
        ``(fn, ready)``. ``ready`` is ``False`` for nuclio kinds (deployment
        is only started here) and the ``build_runtime`` result otherwise.

    Failures in function creation or in the deploy/build stage are logged
    with their traceback and reported through ``log_and_raise`` with a 400
    status.
    """
    fn = ready = None
    try:
        fn = new_function(runtime=function)
    except Exception as exc:
        logger.error(traceback.format_exc())
        log_and_raise(HTTPStatus.BAD_REQUEST.value, reason=f"runtime error: {exc}")

    try:
        fn.set_db_connection(get_run_db_instance(db_session))
        fn.save(versioned=False)
        if fn.kind not in RuntimeKinds.nuclio_runtimes():
            ready = build_runtime(
                auth_info,
                fn,
                with_mlrun,
                mlrun_version_specifier,
                skip_deployed,
                builder_env=builder_env,
            )
        else:
            mlrun.api.api.utils.ensure_function_has_auth_set(fn, auth_info)
            mlrun.api.api.utils.process_function_service_account(fn)

            if fn.kind == RuntimeKinds.serving:
                # Handle model monitoring; best effort, failures only warn.
                try:
                    if fn.spec.track_models:
                        logger.info("Tracking enabled, initializing model monitoring")
                        _init_serving_function_stream_args(fn=fn)
                        access_key = _process_model_monitoring_secret(
                            db_session,
                            fn.metadata.project,
                            "MODEL_MONITORING_ACCESS_KEY",
                        )
                        _create_model_monitoring_stream(project=fn.metadata.project)
                        mlrun.api.crud.ModelEndpoints().deploy_monitoring_functions(
                            project=fn.metadata.project,
                            model_monitoring_access_key=access_key,
                            db_session=db_session,
                            auth_info=auth_info,
                        )
                except Exception as monitoring_exc:
                    logger.warning(
                        "Failed deploying model monitoring infrastructure for the project",
                        project=fn.metadata.project,
                        exc=monitoring_exc,
                        traceback=traceback.format_exc(),
                    )

            deploy_nuclio_function(fn, auth_info=auth_info)
            # deploy only starts the process, the get status API is used to
            # check readiness
            ready = False
        # NOTE(review): the original indentation was lost; the final save/log
        # is taken to run for both branches - confirm.
        fn.save(versioned=True)
        logger.info("Fn:\n %s", fn.to_yaml())
    except Exception as exc:
        logger.error(traceback.format_exc())
        log_and_raise(HTTPStatus.BAD_REQUEST.value, reason=f"runtime error: {exc}")
    return fn, ready
def build_status(
    name: str = "",
    project: str = "",
    tag: str = "",
    offset: int = 0,
    logs: bool = True,
    last_log_timestamp: float = 0.0,
    verbose: bool = False,
    auth_info: mlrun.api.schemas.AuthInfo = Depends(deps.authenticate_request),
    db_session: Session = Depends(deps.get_db_session),
):
    """Report, and persist, the deploy/build status of a function.

    The caller needs ``store`` permission on the function, because querying
    the status also writes the refreshed status back to the DB. For nuclio
    kinds the status comes from ``get_nuclio_deploy_status``; for other
    kinds it comes from the build pod recorded in ``status.build_pod``. The
    stored record is marked versioned only when the state is ready.

    Args:
        name: Function name.
        project: Project name; ``mlrun.mlconf.default_project`` is used for
            the permission check when empty.
        tag: Function tag.
        offset: Byte offset into the encoded build-pod logs to return from.
        logs: Whether build-pod logs are fetched (job path only).
        last_log_timestamp: Forwarded to ``get_nuclio_deploy_status``.
        verbose: Forwarded to ``get_nuclio_deploy_status``.
        auth_info: Caller auth info supplied through the ``Depends`` default.
        db_session: DB session supplied through the ``Depends`` default.

    Returns:
        A ``text/plain`` ``Response`` whose body is the log text and whose
        headers carry the state plus the address/invocation URLs/name
        (nuclio) or the image/builder pod (job).

    A missing function is reported through ``log_and_raise`` with a 404
    status.
    """
    mlrun.api.utils.auth.verifier.AuthVerifier().query_project_resource_permissions(
        mlrun.api.schemas.AuthorizationResourceTypes.function,
        project or mlrun.mlconf.default_project,
        name,
        # store since with the current mechanism we update the status (and
        # store the function) in the DB when a client queries for the status
        mlrun.api.schemas.AuthorizationAction.store,
        auth_info,
    )
    fn = mlrun.api.crud.Functions().get_function(db_session, name, project, tag)
    if not fn:
        log_and_raise(HTTPStatus.NOT_FOUND.value, name=name, project=project, tag=tag)

    # nuclio deploy status
    if fn.get("kind") in RuntimeKinds.nuclio_runtimes():
        (
            state,
            address,
            nuclio_name,
            last_log_timestamp,
            text,
            status,
        ) = get_nuclio_deploy_status(
            name,
            project,
            tag,
            last_log_timestamp=last_log_timestamp,
            verbose=verbose,
            auth_info=auth_info,
        )
        if state == "ready":
            logger.info("Nuclio function deployed successfully", name=name)
        if state in ["error", "unhealthy"]:
            logger.error(f"Nuclio deploy error, {text}", name=name)

        internal_invocation_urls = status.get("internalInvocationUrls", [])
        external_invocation_urls = status.get("externalInvocationUrls", [])

        # on earlier versions of mlrun, address used to represent the
        # nodePort external invocation url. Now that functions can be not
        # exposed (using service_type clusterIP) this is no longer relevant,
        # and hence, for BC it is filled with the first external invocation
        # url, or left completely empty.
        address = external_invocation_urls[0] if external_invocation_urls else ""

        update_in(fn, "status.nuclio_name", nuclio_name)
        update_in(fn, "status.internal_invocation_urls", internal_invocation_urls)
        update_in(fn, "status.external_invocation_urls", external_invocation_urls)
        update_in(fn, "status.state", state)
        update_in(fn, "status.address", address)

        # Versioned means the version will be saved in the DB forever, we
        # don't want to spam the DB with intermediate or unusable versions,
        # only successfully deployed versions
        versioned = state == "ready"
        mlrun.api.crud.Functions().store_function(
            db_session,
            fn,
            name,
            project,
            tag,
            versioned=versioned,
        )
        return Response(
            content=text,
            media_type="text/plain",
            headers={
                "x-mlrun-function-status": state,
                "x-mlrun-last-timestamp": str(last_log_timestamp),
                "x-mlrun-address": address,
                "x-mlrun-internal-invocation-urls": ",".join(internal_invocation_urls),
                "x-mlrun-external-invocation-urls": ",".join(external_invocation_urls),
                "x-mlrun-name": nuclio_name,
            },
        )

    # job deploy status
    state = get_in(fn, "status.state", "")
    pod = get_in(fn, "status.build_pod", "")
    image = get_in(fn, "spec.build.image", "")
    out = b""
    if not pod:
        if state == "ready":
            # The fallback lookup has no default, so it may not yield a
            # string (presumably None when the key is missing - confirm);
            # header values are expected to be strings, hence the final "".
            image = image or get_in(fn, "spec.image") or ""
        return Response(
            content=out,
            media_type="text/plain",
            headers={
                "function_status": state,
                "function_image": image,
                "builder_pod": pod,
            },
        )

    logger.info(f"get pod {pod} status")
    state = get_k8s().get_pod_status(pod)
    logger.info(f"pod state={state}")

    if state == "succeeded":
        logger.info("build completed successfully")
        state = mlrun.api.schemas.FunctionState.ready
    if state in ["failed", "error"]:
        logger.error(f"build {state}, watch the build pod logs: {pod}")
        state = mlrun.api.schemas.FunctionState.error

    if logs and state != "pending":
        resp = get_k8s().logs(pod)
        if resp:
            out = resp.encode()[offset:]

    update_in(fn, "status.state", state)
    if state == mlrun.api.schemas.FunctionState.ready:
        update_in(fn, "spec.image", image)

    # Only a successfully built function is stored as a versioned record.
    versioned = state == mlrun.api.schemas.FunctionState.ready
    mlrun.api.crud.Functions().store_function(
        db_session,
        fn,
        name,
        project,
        tag,
        versioned=versioned,
    )

    return Response(
        content=out,
        media_type="text/plain",
        headers={
            "x-mlrun-function-status": state,
            "function_status": state,
            "function_image": image,
            "builder_pod": pod,
        },
    )
def build_status(
    name: str = "",
    project: str = "",
    tag: str = "",
    offset: int = 0,
    logs: bool = True,
    last_log_timestamp: float = 0.0,
    verbose: bool = False,
    auth_verifier: deps.AuthVerifier = Depends(deps.AuthVerifier),
    db_session: Session = Depends(deps.get_db_session),
):
    """Report, and persist, the deploy/build status of a function.

    For nuclio kinds the status comes from ``get_nuclio_deploy_status``; for
    other kinds it comes from the build pod recorded in ``status.build_pod``.
    In both cases the refreshed state is written back to the DB using the
    caller's session as ``leader_session``, and the stored record is marked
    versioned only when the state is ready.

    Args:
        name: Function name.
        project: Project name.
        tag: Function tag.
        offset: Byte offset into the encoded build-pod logs to return from.
        logs: Whether build-pod logs are fetched (job path only).
        last_log_timestamp: Forwarded to ``get_nuclio_deploy_status``.
        verbose: Forwarded to ``get_nuclio_deploy_status``.
        auth_verifier: Supplied through the ``Depends`` default; its
            ``auth_info.session`` is passed to ``store_function``.
        db_session: DB session supplied through the ``Depends`` default.

    Returns:
        A ``text/plain`` ``Response`` whose body is the log text and whose
        headers carry the state plus the address/invocation URLs/name
        (nuclio) or the image/builder pod (job).

    A missing function is reported through ``log_and_raise`` with a 404
    status.
    """
    fn = get_db().get_function(db_session, name, project, tag)
    if not fn:
        log_and_raise(HTTPStatus.NOT_FOUND.value, name=name, project=project, tag=tag)

    # nuclio deploy status
    if fn.get("kind") in RuntimeKinds.nuclio_runtimes():
        (
            state,
            address,
            nuclio_name,
            last_log_timestamp,
            text,
            status,
        ) = get_nuclio_deploy_status(
            name, project, tag, last_log_timestamp=last_log_timestamp, verbose=verbose
        )
        if state == "ready":
            logger.info("Nuclio function deployed successfully", name=name)
        if state in ["error", "unhealthy"]:
            logger.error(f"Nuclio deploy error, {text}", name=name)

        # internal / external invocation urls were added on nuclio 1.6.x
        # and hence might be missing; to stay backward compatible with older
        # nuclio versions, hard-coded default values are used
        internal_invocation_urls = status.get(
            "internalInvocationUrls",
            [resolve_function_internal_invocation_url(name)],
        )
        external_invocation_urls = status.get(
            "externalInvocationUrls", [address] if address else []
        )

        # on nuclio > 1.6.x we get the external invocation url on the status
        # block
        if external_invocation_urls and not address:
            address = external_invocation_urls[0]

        update_in(fn, "status.nuclio_name", nuclio_name)
        update_in(fn, "status.internal_invocation_urls", internal_invocation_urls)
        update_in(fn, "status.external_invocation_urls", external_invocation_urls)
        update_in(fn, "status.state", state)
        update_in(fn, "status.address", address)

        # Versioned means the version will be saved in the DB forever, we
        # don't want to spam the DB with intermediate or unusable versions,
        # only successfully deployed versions
        versioned = state == "ready"
        get_db().store_function(
            db_session,
            fn,
            name,
            project,
            tag,
            versioned=versioned,
            leader_session=auth_verifier.auth_info.session,
        )
        return Response(
            content=text,
            media_type="text/plain",
            headers={
                "x-mlrun-function-status": state,
                "x-mlrun-last-timestamp": str(last_log_timestamp),
                "x-mlrun-address": address,
                "x-mlrun-internal-invocation-urls": ",".join(internal_invocation_urls),
                "x-mlrun-external-invocation-urls": ",".join(external_invocation_urls),
                "x-mlrun-name": nuclio_name,
            },
        )

    # job deploy status
    state = get_in(fn, "status.state", "")
    pod = get_in(fn, "status.build_pod", "")
    image = get_in(fn, "spec.build.image", "")
    out = b""
    if not pod:
        if state == "ready":
            # The fallback lookup has no default, so it may not yield a
            # string (presumably None when the key is missing - confirm);
            # header values are expected to be strings, hence the final "".
            image = image or get_in(fn, "spec.image") or ""
        return Response(
            content=out,
            media_type="text/plain",
            headers={
                "function_status": state,
                "function_image": image,
                "builder_pod": pod,
            },
        )

    logger.info(f"get pod {pod} status")
    state = get_k8s().get_pod_status(pod)
    logger.info(f"pod state={state}")

    if state == "succeeded":
        logger.info("build completed successfully")
        state = mlrun.api.schemas.FunctionState.ready
    if state in ["failed", "error"]:
        logger.error(f"build {state}, watch the build pod logs: {pod}")
        state = mlrun.api.schemas.FunctionState.error

    if logs and state != "pending":
        resp = get_k8s().logs(pod)
        if resp:
            out = resp.encode()[offset:]

    update_in(fn, "status.state", state)
    if state == mlrun.api.schemas.FunctionState.ready:
        update_in(fn, "spec.image", image)

    # Only a successfully built function is stored as a versioned record.
    versioned = state == mlrun.api.schemas.FunctionState.ready
    get_db().store_function(
        db_session,
        fn,
        name,
        project,
        tag,
        versioned=versioned,
        leader_session=auth_verifier.auth_info.session,
    )

    return Response(
        content=out,
        media_type="text/plain",
        headers={
            "x-mlrun-function-status": state,
            "function_status": state,
            "function_image": image,
            "builder_pod": pod,
        },
    )