def initialize(self): logger.info("Initializing projects follower") self._projects: typing.Dict[str, mlrun.api.schemas.Project] = {} self._leader_name = mlrun.config.config.httpdb.projects.leader self._sync_session = None if self._leader_name == "iguazio": self._leader_client = mlrun.api.utils.clients.iguazio.Client() if not mlrun.config.config.httpdb.projects.iguazio_access_key: raise mlrun.errors.MLRunInvalidArgumentError( "Iguazio access key must be configured when the leader is Iguazio" ) self._sync_session = mlrun.config.config.httpdb.projects.iguazio_access_key elif self._leader_name == "nop": self._leader_client = mlrun.api.utils.projects.remotes.nop_leader.Member() else: raise NotImplementedError("Unsupported project leader") self._periodic_sync_interval_seconds = humanfriendly.parse_timespan( mlrun.config.config.httpdb.projects.periodic_sync_interval ) self._synced_until_datetime = None # run one sync to start off on the right foot and fill out the cache but don't fail initialization on it try: self._sync_projects() except Exception as exc: logger.warning("Initial projects sync failed", exc=str(exc)) self._start_periodic_sync()
def _send_request_to_api(self, method, path, session_cookie=None, **kwargs):
    url = f"{self._api_url}/api/{path}"
    if session_cookie:
        cookies = kwargs.get("cookies", {})
        # if a caller already set a "session" cookie manually through kwargs while also passing
        # session_cookie, and the two disagree - explode
        if "session" in cookies and cookies["session"] != session_cookie:
            raise mlrun.errors.MLRunInvalidArgumentError("Session cookie already set")
        cookies["session"] = session_cookie
        kwargs["cookies"] = cookies
    if kwargs.get("timeout") is None:
        kwargs["timeout"] = 20
    response = self._session.request(method, url, verify=False, **kwargs)
    if not response.ok:
        log_kwargs = copy.deepcopy(kwargs)
        log_kwargs.update({"method": method, "path": path})
        if response.content:
            try:
                data = response.json()
                ctx = data.get("meta", {}).get("ctx")
                errors = data.get("errors", [])
            except Exception:
                pass
            else:
                log_kwargs.update({"ctx": ctx, "errors": errors})
        logger.warning("Request to iguazio failed", **log_kwargs)
        mlrun.errors.raise_for_status(response)
    return response
def _run(self, runobj: RunObject, execution: MLClientCtx):
    if runobj.metadata.iteration:
        self.store_run(runobj)
    meta = self._get_meta(runobj, True)

    job = self._generate_mpi_job(runobj, execution, meta)
    resp = self._submit_mpijob(job, meta.namespace)

    state = None
    timeout = int(config.submit_timeout) or 120
    for _ in range(timeout):
        resp = self.get_job(meta.name, meta.namespace)
        state = self._get_job_launcher_status(resp)
        if resp and state:
            break
        time.sleep(1)

    if resp:
        logger.info("MpiJob {} state={}".format(meta.name, state or "unknown"))
        if state:
            state = state.lower()
            launcher, _ = self._get_launcher(meta.name, meta.namespace)
            execution.set_hostname(launcher)
            execution.set_state("running" if state == "active" else state)
            if self.kfp:
                writer = AsyncLogWriter(self._db_conn, runobj)
                status = self._get_k8s().watch(launcher, meta.namespace, writer=writer)
                logger.info(
                    "MpiJob {} finished with state {}".format(meta.name, status)
                )
                if status == "succeeded":
                    execution.set_state("completed")
                else:
                    execution.set_state(
                        "error",
                        "MpiJob {} finished with state {}".format(meta.name, status),
                    )
            else:
                txt = "MpiJob {} launcher pod {} state {}".format(
                    meta.name, launcher, state
                )
                logger.info(txt)
                runobj.status.status_text = txt
    else:
        txt = "MpiJob status unknown or failed, check pods: {}".format(
            self.get_pods(meta.name, meta.namespace)
        )
        logger.warning(txt)
        runobj.status.status_text = txt
        if self.kfp:
            execution.set_state("error", txt)

    return None
def initialize(self): logger.info("Initializing projects follower") self.projects_store_mode = ( mlrun.mlconf.httpdb.projects.follower_projects_store_mode) if self.projects_store_mode not in self.ProjectsStoreMode.all(): raise mlrun.errors.MLRunInvalidArgumentError( f"Provided projects store mode is not supported. mode={self.projects_store_mode}" ) self._projects: typing.Dict[str, mlrun.api.schemas.Project] = {} self._projects_store_for_deletion = self.ProjectsStore(self) self._leader_name = mlrun.mlconf.httpdb.projects.leader self._sync_session = None if self._leader_name == "iguazio": self._leader_client = mlrun.api.utils.clients.iguazio.Client() if not mlrun.mlconf.httpdb.projects.iguazio_access_key: raise mlrun.errors.MLRunInvalidArgumentError( "Iguazio access key must be configured when the leader is Iguazio" ) self._sync_session = mlrun.mlconf.httpdb.projects.iguazio_access_key elif self._leader_name == "nop": self._leader_client = mlrun.api.utils.projects.remotes.nop_leader.Member( ) else: raise NotImplementedError("Unsupported project leader") self._periodic_sync_interval_seconds = humanfriendly.parse_timespan( mlrun.mlconf.httpdb.projects.periodic_sync_interval) self._synced_until_datetime = None # Only if we're storing the projects in cache, we need to maintain this cache i.e. run the periodic sync if self.projects_store_mode == self.ProjectsStoreMode.cache: # run one sync to start off on the right foot and fill out the cache but don't fail initialization on it try: self._sync_projects() except Exception as exc: logger.warning("Initial projects sync failed", exc=str(exc)) self._start_periodic_sync()
def _find_last_updated_artifact(
    artifacts: typing.List[mlrun.api.db.sqldb.models.Artifact],
):
    # sanity
    if not artifacts:
        raise RuntimeError("No artifacts given")
    last_updated_artifact = None
    last_updated_artifact_time = datetime.datetime.min
    artifacts_with_same_update_time = []
    for artifact in artifacts:
        if artifact.updated > last_updated_artifact_time:
            last_updated_artifact = artifact
            last_updated_artifact_time = last_updated_artifact.updated
            artifacts_with_same_update_time = [last_updated_artifact]
        elif artifact.updated == last_updated_artifact_time:
            artifacts_with_same_update_time.append(artifact)
    if len(artifacts_with_same_update_time) > 1:
        logger.warning(
            "Found several artifacts with the same update time, heuristically choosing the first",
            artifacts=[
                artifact.to_dict() for artifact in artifacts_with_same_update_time
            ],
        )
        # nothing extra is needed to choose the first - the comparison above uses > and not >=,
        # so later artifacts with an equal time never replace it
    if not last_updated_artifact:
        logger.warning(
            "No artifact had an update time, heuristically choosing the first",
            artifacts=[artifact.to_dict() for artifact in artifacts],
        )
        last_updated_artifact = artifacts[0]
    return last_updated_artifact
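# A minimal tie-breaking sketch (assumed, not from the source): with distinct `updated`
# timestamps the later artifact wins; equal timestamps fall back to the first one seen.
import datetime

import mlrun.api.db.sqldb.models

older = mlrun.api.db.sqldb.models.Artifact(updated=datetime.datetime(2021, 1, 1))
newer = mlrun.api.db.sqldb.models.Artifact(updated=datetime.datetime(2021, 6, 1))
assert _find_last_updated_artifact([older, newer]) is newer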
def init_app():
    global _db, _logs_dir, _k8s, _scheduler

    logger.info('configuration dump\n%s', config.dump_yaml())
    if config.httpdb.db_type == 'sqldb':
        logger.info('using SQLDB')
        _db = SQLDB(config.httpdb.dsn)
    else:
        logger.info('using FileRunDB')
        _db = FileRunDB(config.httpdb.dirpath)
    _db.connect()
    _logs_dir = Path(config.httpdb.logs_path)

    try:
        _k8s = K8sHelper()
    except Exception:
        pass

    # @yaronha - Initialize here
    task = periodic.Task()
    periodic.schedule(task, 60)

    _scheduler = Scheduler()
    for data in _db.list_schedules():
        if 'schedule' not in data:
            logger.warning('bad scheduler data - %s', data)
            continue
        _submit(data)
def show(self, format=None):
    """show the data object content in Jupyter

    :param format: format to use (when there is no/wrong suffix), e.g. 'png'
    """
    if not is_ipython:
        logger.warning(
            "Jupyter/IPython was not detected, .show() will only display inside Jupyter"
        )
        return
    from IPython import display

    suffix = self.suffix.lower()
    if format:
        suffix = "." + format
    if suffix in [".jpg", ".png", ".gif"]:
        display.display(display.Image(self.get(), format=suffix[1:]))
    elif suffix in [".htm", ".html"]:
        display.display(display.HTML(self.get(encoding="utf-8")))
    elif suffix in [".csv", ".pq", ".parquet"]:
        display.display(self.as_df())
    elif suffix in [".yaml", ".txt", ".py"]:
        display.display(display.Pretty(self.get(encoding="utf-8")))
    elif suffix == ".json":
        display.display(display.JSON(orjson.loads(self.get())))
    elif suffix == ".md":
        display.display(display.Markdown(self.get(encoding="utf-8")))
    else:
        logger.error(f"unsupported show() format {suffix} for {self.url}")
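# A minimal usage sketch (assumed, not from the source): in a Jupyter cell, fetch a DataItem
# and render it by suffix; pass format= to override when the URL has no (or the wrong) suffix.
import mlrun

item = mlrun.get_dataitem("https://example.com/data/results.csv")  # hypothetical URL
item.show()              # .csv suffix -> rendered as a DataFrame
item.show(format="txt")  # force plain-text rendering instead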
def _validate_body_and_path_names_matches(
    name: str, project: mlrun.api.schemas.ProjectPatch
):
    # ProjectPatch allows extra fields, so although the schema has no name field, name might still be there
    if hasattr(project, "name") and name != getattr(project, "name"):
        message = "Conflict between name in body and name in path"
        logger.warning(message, path_name=name, body_name=getattr(project, "name"))
        raise mlrun.errors.MLRunConflictError(message)
async def handler_returning_http_exception(
    request: fastapi.Request, exc: HandledException2
):
    logger.warning(
        "Handler caught HandledException2 exception, returning HTTPException with 401"
    )
    return await http_exception_handler(
        request,
        fastapi.HTTPException(status_code=HTTPStatus.UNAUTHORIZED.value),
    )
def _upsert(session, obj, ignore=False):
    try:
        session.add(obj)
        session.commit()
    except SQLAlchemyError as err:
        session.rollback()
        cls = obj.__class__.__name__
        logger.warning(f"conflict adding {cls}, {err}")
        if not ignore:
            raise DBError(f"duplicate {cls} - {err}") from err
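# A minimal usage sketch (assumed, not from the source): a second insert of the same unique row
# is rolled back and logged; ignore=True swallows it, the default ignore=False raises DBError.
project = Project(name="my-project")  # hypothetical model instance with a unique name column
_upsert(session, project)
_upsert(session, Project(name="my-project"), ignore=True)  # duplicate logged, not raised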
def _reschedule_tasks():
    db_session = None
    try:
        db_session = create_session()
        for data in get_db().list_schedules(db_session):
            if "schedule" not in data:
                logger.warning("bad scheduler data - %s", data)
                continue
            submit(db_session, data)
    finally:
        close_session(db_session)
def _try_resolve_project_from_body(
    content_type: str, data: bytes
) -> typing.Optional[str]:
    if "/yaml" not in content_type:
        logger.warning(
            "Could not resolve project from body, unsupported content type",
            content_type=content_type,
        )
        return None
    workflow_manifest = yaml.load(data, Loader=yaml.FullLoader)
    return mlrun.api.crud.Pipelines().resolve_project_from_workflow_manifest(
        workflow_manifest
    )
async def http_status_error_handler(
    request: fastapi.Request, exc: mlrun.errors.MLRunHTTPStatusError
):
    status_code = exc.response.status_code
    error_message = repr(exc)
    logger.warning(
        "Request handling returned error status",
        error_message=error_message,
        status_code=status_code,
    )
    return await http_exception_handler(
        request,
        fastapi.HTTPException(status_code=status_code, detail=error_message),
    )
async def _periodic_function_wrapper(interval: int, function, *args, **kwargs):
    while True:
        try:
            if asyncio.iscoroutinefunction(function):
                await function(*args, **kwargs)
            else:
                await run_in_threadpool(function, *args, **kwargs)
        except Exception:
            logger.warning(
                f'Failed during periodic function execution: {function.__name__}, exc: {traceback.format_exc()}'
            )
        await asyncio.sleep(interval)
async def start(self, db_session: Session):
    logger.info("Starting scheduler")
    self._scheduler.start()
    # the scheduler shutdown and start operations are not fully async compatible yet -
    # https://github.com/agronholm/apscheduler/issues/360 - this sleep makes them work
    await asyncio.sleep(0)

    # don't fail the start on re-scheduling failure
    try:
        self._reload_schedules(db_session)
    except Exception as exc:
        logger.warning("Failed reloading schedules", exc=exc)
def create_pipeline(
    self,
    experiment_name: str,
    run_name: str,
    content_type: str,
    data: bytes,
    arguments: dict = None,
    namespace: str = mlrun.mlconf.namespace,
):
    if arguments is None:
        arguments = {}
    if "/yaml" in content_type:
        content_type = ".yaml"
    elif "/zip" in content_type:
        content_type = ".zip"
    else:
        mlrun.api.api.utils.log_and_raise(
            http.HTTPStatus.BAD_REQUEST.value,
            reason=f"unsupported pipeline type {content_type}",
        )
    logger.debug("Writing pipeline to temp file", content_type=content_type)

    pipeline_file = tempfile.NamedTemporaryFile(suffix=content_type)
    with open(pipeline_file.name, "wb") as fp:
        fp.write(data)

    logger.info(
        "Creating pipeline",
        experiment_name=experiment_name,
        run_name=run_name,
        arguments=arguments,
    )

    try:
        kfp_client = kfp.Client(namespace=namespace)
        experiment = kfp_client.create_experiment(name=experiment_name)
        run = kfp_client.run_pipeline(
            experiment.id, run_name, pipeline_file.name, params=arguments
        )
    except Exception as exc:
        logger.warning(
            "Failed creating pipeline",
            traceback=traceback.format_exc(),
            exc=str(exc),
        )
        raise mlrun.errors.MLRunBadRequestError(f"Failed creating pipeline: {exc}")
    finally:
        pipeline_file.close()

    return run
def _cleanup_runtimes():
    db_session = create_session()
    try:
        for kind in RuntimeKinds.runtime_with_handlers():
            try:
                runtime_handler = get_runtime_handler(kind)
                runtime_handler.delete_resources(get_db(), db_session)
            except Exception as exc:
                logger.warning(
                    "Failed deleting resources. Ignoring", exc=str(exc), kind=kind
                )
    finally:
        close_session(db_session)
def _monitor_runs():
    db_session = create_session()
    try:
        for kind in RuntimeKinds.runtime_with_handlers():
            try:
                runtime_handler = get_runtime_handler(kind)
                runtime_handler.monitor_runs(get_db(), db_session)
            except Exception as exc:
                logger.warning(
                    "Failed monitoring runs. Ignoring", exc=str(exc), kind=kind
                )
    finally:
        close_session(db_session)
def _fix_artifact_tags_duplications(
    db: mlrun.api.db.sqldb.db.SQLDB, db_session: sqlalchemy.orm.Session
):
    logger.info("Fixing artifact tags duplications")
    # get all artifacts
    artifacts = db._find_artifacts(db_session, None, "*")
    # get all artifact tags
    tags = db._query(db_session, mlrun.api.db.sqldb.models.Artifact.Tag).all()
    # artifact record id -> artifact
    artifact_record_id_map = {artifact.id: artifact for artifact in artifacts}
    tags_to_delete = []
    projects = {artifact.project for artifact in artifacts}
    for project in projects:
        artifact_keys = {
            artifact.key for artifact in artifacts if artifact.project == project
        }
        for artifact_key in artifact_keys:
            artifact_key_tags = []
            for tag in tags:
                # sanity - a tag pointing at a missing artifact record is deleted right away,
                # and skipped so the map lookup below can't fail
                if tag.obj_id not in artifact_record_id_map:
                    logger.warning("Found orphan tag, deleting", tag=tag.to_dict())
                    db_session.delete(tag)
                    continue
                if artifact_record_id_map[tag.obj_id].key == artifact_key:
                    artifact_key_tags.append(tag)
            tag_name_tags_map = collections.defaultdict(list)
            for tag in artifact_key_tags:
                tag_name_tags_map[tag.name].append(tag)
            for tag_name, _tags in tag_name_tags_map.items():
                if len(_tags) == 1:
                    continue
                tags_artifacts = [artifact_record_id_map[tag.obj_id] for tag in _tags]
                last_updated_artifact = _find_last_updated_artifact(tags_artifacts)
                for tag in _tags:
                    if tag.obj_id != last_updated_artifact.id:
                        tags_to_delete.append(tag)
    if tags_to_delete:
        logger.info(
            "Found duplicated artifact tags. Removing duplications",
            tags_to_delete=[
                tag_to_delete.to_dict() for tag_to_delete in tags_to_delete
            ],
            tags=[tag.to_dict() for tag in tags],
            artifacts=[artifact.to_dict() for artifact in artifacts],
        )
        for tag in tags_to_delete:
            db_session.delete(tag)
        db_session.commit()
def run_function_periodically(
    interval: int, name: str, replace: bool, function, *args, **kwargs
):
    global tasks
    logger.debug("Submitting function to run periodically", name=name)
    if name in tasks:
        if not replace:
            message = "Task with that name already exists"
            logger.warning(message, name=name)
            raise mlrun.errors.MLRunInvalidArgumentError(message)
        cancel_periodic_function(name)
    loop = asyncio.get_running_loop()
    task = loop.create_task(
        _periodic_function_wrapper(interval, function, *args, **kwargs)
    )
    tasks[name] = task
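# A minimal usage sketch (assumed, not from the source): from inside a running event loop,
# register a function to run every 60 seconds; replace=True cancels any previous "heartbeat" task.
def _log_heartbeat():  # hypothetical periodic function
    logger.debug("Heartbeat")

run_function_periodically(60, "heartbeat", True, _log_heartbeat)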
def _resolve_project_from_command(
    self,
    command: typing.List[str],
    hyphen_p_is_also_project: bool,
    has_func_url_flags: bool,
    has_runtime_flags: bool,
):
    # project has precedence over function url, so search for it first
    for index, argument in enumerate(command):
        if (
            (argument == "-p" and hyphen_p_is_also_project) or argument == "--project"
        ) and index + 1 < len(command):
            return command[index + 1]
    if has_func_url_flags:
        for index, argument in enumerate(command):
            if (argument == "-f" or argument == "--func-url") and index + 1 < len(
                command
            ):
                function_url = command[index + 1]
                if function_url.startswith("db://"):
                    (
                        project,
                        _,
                        _,
                        _,
                    ) = mlrun.utils.helpers.parse_versioned_object_uri(
                        function_url[len("db://"):]
                    )
                    if project:
                        return project
    if has_runtime_flags:
        for index, argument in enumerate(command):
            if (argument == "-r" or argument == "--runtime") and index + 1 < len(
                command
            ):
                runtime = command[index + 1]
                try:
                    parsed_runtime = ast.literal_eval(runtime)
                except Exception as exc:
                    logger.warning(
                        "Failed parsing runtime. Skipping", runtime=runtime, exc=exc
                    )
                else:
                    if isinstance(parsed_runtime, dict):
                        project = parsed_runtime.get("metadata", {}).get("project")
                        if project:
                            return project
    return None
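# A minimal sketch (assumed, not from the source) of the resolution precedence: an explicit
# --project flag wins over a db:// --func-url, which wins over a project inside --runtime.
# `resolver` stands in for whatever object exposes _resolve_project_from_command.
project = resolver._resolve_project_from_command(
    ["mlrun", "run", "--project", "proj-a", "-f", "db://proj-b/func"],
    hyphen_p_is_also_project=True,
    has_func_url_flags=True,
    has_runtime_flags=False,
)
assert project == "proj-a"  # the explicit --project flag takes precedence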
def get_workflow_engine(engine_kind, local=False):
    if local:
        if engine_kind == "kfp":
            logger.warning(
                "running kubeflow pipeline locally, note some ops may not run locally!"
            )
        return _LocalRunner
    if not engine_kind or engine_kind == "kfp":
        return _KFPRunner
    if engine_kind == "local":
        return _LocalRunner
    raise mlrun.errors.MLRunInvalidArgumentError(
        f"Provided workflow engine is not supported. engine_kind={engine_kind}"
    )
def _validate_body_and_path_names_matches(
    path_name: str, project: typing.Union[mlrun.api.schemas.Project, dict]
):
    if isinstance(project, mlrun.api.schemas.Project):
        body_name = project.metadata.name
    elif isinstance(project, dict):
        body_name = project.get("metadata", {}).get("name")
    else:
        raise NotImplementedError("Unsupported project instance type")

    if body_name and path_name != body_name:
        message = "Conflict between name in body and name in path"
        logger.warning(message, path_name=path_name, body_name=body_name)
        raise mlrun.errors.MLRunConflictError(message)
async def log_request_response(request: fastapi.Request, call_next):
    request_id = str(uuid.uuid4())
    silent_logging_paths = [
        "healthz",
    ]
    path_with_query_string = uvicorn.protocols.utils.get_path_with_query_string(
        request.scope
    )
    if not any(
        silent_logging_path in path_with_query_string
        for silent_logging_path in silent_logging_paths
    ):
        logger.debug(
            "Received request",
            method=request.method,
            client_address=get_client_address(request.scope),
            http_version=request.scope["http_version"],
            request_id=request_id,
            uri=path_with_query_string,
        )
    try:
        response = await call_next(request)
    except Exception as exc:
        logger.warning(
            "Request handling failed. Sending response",
            # user middleware (like this one) runs after the exception handling middleware; the only thing
            # running after it is Starlette's ServerErrorMiddleware, which catches any un-handled exception
            # and transforms it into a 500 response, therefore we can statically assign status code 500
            status_code=500,
            request_id=request_id,
            uri=path_with_query_string,
            method=request.method,
            exc=exc,
            traceback=traceback.format_exc(),
        )
        raise
    else:
        if not any(
            silent_logging_path in path_with_query_string
            for silent_logging_path in silent_logging_paths
        ):
            logger.debug(
                "Sending response",
                status_code=response.status_code,
                request_id=request_id,
                uri=path_with_query_string,
                method=request.method,
            )
        return response
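# A minimal wiring sketch (assumed, not from the source): register the function as an HTTP
# middleware on the FastAPI app so every request/response pair is logged with a request id.
import fastapi

app = fastapi.FastAPI()
app.middleware("http")(log_request_response)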
async def startup_event():
    logger.info("configuration dump\n%s", config.dump_yaml())

    initialize_singletons()

    # don't fail the app on re-scheduling failure
    try:
        task = periodic.Task()
        periodic.schedule(task, 60)
        _reschedule_tasks()
    except Exception as exc:
        logger.warning(f'Failed rescheduling tasks, err: {exc}')

    _start_periodic_cleanup()
def _resolve_nuclio_version(self):
    if not self._cached_nuclio_version:
        # config overrides everything
        nuclio_version = config.nuclio_version
        if not nuclio_version and config.nuclio_dashboard_url:
            try:
                nuclio_client = nuclio.Client()
                nuclio_version = nuclio_client.get_dashboard_version()
            except Exception as exc:
                logger.warning("Failed to resolve nuclio version", exc=str(exc))
        self._cached_nuclio_version = nuclio_version
    return self._cached_nuclio_version
def _send_request_to_api(
    self, method, path, error_message: str, session=None, **kwargs
):
    url = f"{self._api_url}/api/{path}"
    # support session being already a cookie
    session_cookie = session
    if (
        session_cookie
        and not session_cookie.startswith('j:{"sid"')
        and not session_cookie.startswith(urllib.parse.quote_plus('j:{"sid"'))
    ):
        session_cookie = f'j:{{"sid": "{session_cookie}"}}'
    if session_cookie:
        cookies = kwargs.get("cookies", {})
        # if a caller already set a "session" cookie manually through kwargs while also passing
        # a session, and the two disagree - explode
        if "session" in cookies and cookies["session"] != session_cookie:
            raise mlrun.errors.MLRunInvalidArgumentError("Session cookie already set")
        cookies["session"] = session_cookie
        kwargs["cookies"] = cookies
    if kwargs.get("timeout") is None:
        kwargs["timeout"] = 20
    if "projects" in path:
        if mlrun.api.schemas.HeaderNames.projects_role not in kwargs.get(
            "headers", {}
        ):
            kwargs.setdefault("headers", {})[
                mlrun.api.schemas.HeaderNames.projects_role
            ] = "mlrun"
    response = self._session.request(method, url, verify=False, **kwargs)
    if not response.ok:
        log_kwargs = copy.deepcopy(kwargs)
        log_kwargs.update({"method": method, "path": path})
        if response.content:
            try:
                data = response.json()
                ctx = data.get("meta", {}).get("ctx")
                errors = data.get("errors", [])
            except Exception:
                pass
            else:
                error_message = f"{error_message}: {str(errors)}"
                log_kwargs.update({"ctx": ctx, "errors": errors})
        logger.warning("Request to iguazio failed", **log_kwargs)
        mlrun.errors.raise_for_status(response, error_message)
    return response
async def background_task_wrapper(
    self, project: str, name: str, function, *args, **kwargs
):
    try:
        if asyncio.iscoroutinefunction(function):
            await function(*args, **kwargs)
        else:
            await fastapi.concurrency.run_in_threadpool(function, *args, **kwargs)
    except Exception:
        logger.warning(
            f"Failed during background task execution: {function.__name__}, exc: {traceback.format_exc()}"
        )
        self._update_background_task(
            project, name, mlrun.api.schemas.BackgroundTaskState.failed
        )
    else:
        self._update_background_task(
            project, name, mlrun.api.schemas.BackgroundTaskState.succeeded
        )
async def http_status_error_handler(
    request: fastapi.Request, exc: mlrun.errors.MLRunHTTPStatusError
):
    status_code = exc.response.status_code
    error_message = repr(exc)
    logger.warning(
        "Request handling returned error status",
        error_message=error_message,
        status_code=status_code,
        traceback=traceback.format_exc(),
    )
    # TODO: 0.6.6 is the last version expecting the error details to be under reason; once it's no longer a
    #  relevant version this can be changed to detail=error_message
    return await http_exception_handler(
        request,
        fastapi.HTTPException(
            status_code=status_code, detail={"reason": error_message}
        ),
    )
def _send_request_to_api(self, method, path, **kwargs):
    url = f"{self._api_url}{path}"
    if kwargs.get("timeout") is None:
        kwargs["timeout"] = self._request_timeout
    response = self._session.request(method, url, verify=False, **kwargs)
    if not response.ok:
        log_kwargs = copy.deepcopy(kwargs)
        log_kwargs.update({"method": method, "path": path})
        if response.content:
            try:
                data = response.json()
            except Exception:
                pass
            else:
                log_kwargs.update({"data": data})
        logger.warning("Request to opa failed", **log_kwargs)
        mlrun.errors.raise_for_status(response)
    return response