def _resolve_job_function(
    self,
    scheduled_kind: schemas.ScheduleKinds,
    scheduled_object: Any,
    project_name: str,
    schedule_name: str,
    schedule_concurrency_limit: int,
) -> Tuple[Callable, Optional[Union[List, Tuple]], Optional[Dict]]:
    """
    Map a schedule kind to the callable that APScheduler should invoke.

    :return: a tuple (function, args, kwargs) to be used with the APScheduler.add_job
    """
    if scheduled_kind == schemas.ScheduleKinds.job:
        # deep-copy so later mutations of the submitted body never leak back
        # into the stored scheduled object
        job_payload = copy.deepcopy(scheduled_object)
        job_args = [
            job_payload,
            project_name,
            schedule_name,
            schedule_concurrency_limit,
        ]
        return Scheduler.submit_run_wrapper, job_args, {}
    elif scheduled_kind == schemas.ScheduleKinds.local_function:
        # the scheduled object itself is the callable, invoked with no arguments
        return scheduled_object, [], {}
    else:
        # sanity - an unknown kind means a missing implementation, not bad user input
        message = "Scheduled object kind missing implementation"
        logger.warn(message, scheduled_object_kind=scheduled_kind)
        raise NotImplementedError(message)
def _reload_schedules(
    self,
    db_session: Session,
    auth_info: mlrun.api.schemas.AuthInfo,
):
    """Re-register every schedule stored in the DB with the in-memory scheduler.

    Typically called on process start so schedules survive restarts. All
    schedules are (re)created with the same supplied auth info.

    :param db_session: DB session used to list the persisted schedules
    :param auth_info:  auth info attached to every recreated schedule
    """
    logger.info("Reloading schedules")
    db_schedules = get_db().list_schedules(db_session)
    for db_schedule in db_schedules:
        # don't let one failure fail the rest
        try:
            self._create_schedule_in_scheduler(
                db_schedule.project,
                db_schedule.name,
                db_schedule.kind,
                db_schedule.scheduled_object,
                db_schedule.cron_trigger,
                db_schedule.concurrency_limit,
                auth_info,
            )
        except Exception as exc:
            # best-effort: log and continue with the remaining schedules
            logger.warn(
                "Failed rescheduling job. Continuing",
                exc=str(exc),
                db_schedule=db_schedule,
            )
async def submit_run_wrapper(
    scheduled_object,
    project_name,
    schedule_name,
    schedule_concurrency_limit,
    auth_info: mlrun.api.schemas.AuthInfo,
):
    """Entry point invoked by APScheduler for job-kind schedules.

    Submits the stored run body, enforcing the schedule's concurrency limit,
    and records the resulting run URI back on the schedule.

    :param scheduled_object: the run body persisted on the schedule (mutated in place)
    :param project_name: project owning the schedule
    :param schedule_name: schedule identifier, also stamped as a run label
    :param schedule_concurrency_limit: max number of non-terminal runs allowed
    :param auth_info: auth info used for submission and schedule update
    :return: the submit_run response, or None when the run was skipped
    """
    # import here to avoid circular imports
    from mlrun.api.api.utils import submit_run

    # removing the schedule from the body otherwise when the scheduler will submit this task it will go to an
    # endless scheduling loop
    scheduled_object.pop("schedule", None)

    # removing the uid from the task metadata so that a new uid will be generated for every run
    # otherwise all runs will have the same uid
    scheduled_object.get("task", {}).get("metadata", {}).pop("uid", None)

    if "task" in scheduled_object and "metadata" in scheduled_object["task"]:
        # label the run so active runs of this schedule can be counted below
        scheduled_object["task"]["metadata"].setdefault("labels", {})
        scheduled_object["task"]["metadata"]["labels"][
            schemas.constants.LabelNames.schedule_name
        ] = schedule_name

    db_session = create_session()
    # BUG FIX: the session used to leak on the concurrency-limit early return
    # and whenever submit_run raised - always close it
    try:
        active_runs = get_db().list_runs(
            db_session,
            state=RunStates.non_terminal_states(),
            project=project_name,
            labels=f"{schemas.constants.LabelNames.schedule_name}={schedule_name}",
        )
        if len(active_runs) >= schedule_concurrency_limit:
            logger.warn(
                "Schedule exceeded concurrency limit, skipping this run",
                project=project_name,
                schedule_name=schedule_name,
                schedule_concurrency_limit=schedule_concurrency_limit,
                active_runs=len(active_runs),
            )
            return

        response = await submit_run(db_session, auth_info, scheduled_object)

        run_metadata = response["data"]["metadata"]
        run_uri = RunObject.create_uri(
            run_metadata["project"], run_metadata["uid"], run_metadata["iteration"]
        )
        get_db().update_schedule(
            db_session,
            run_metadata["project"],
            schedule_name,
            last_run_uri=run_uri,
            leader_session=auth_info.session,
        )
        return response
    finally:
        close_session(db_session)
def _reload_schedules(self, db_session: Session):
    """Re-register persisted schedules, restoring per-schedule credentials.

    When jobs auth is required, each schedule's username/access key is read
    back from the schedule secrets before recreating it in the scheduler.

    :param db_session: DB session used to list the persisted schedules
    """
    logger.info("Reloading schedules")
    db_schedules = get_db().list_schedules(db_session)
    for db_schedule in db_schedules:
        # don't let one failure fail the rest
        try:
            # import here to avoid circular imports
            import mlrun.api.crud

            access_key = None
            username = None
            if mlrun.api.utils.auth.verifier.AuthVerifier().is_jobs_auth_required():
                # credentials were stored as project secrets when the schedule was created
                username, access_key = self._get_schedule_secrets(
                    db_schedule.project, db_schedule.name
                )
            self._create_schedule_in_scheduler(
                db_schedule.project,
                db_schedule.name,
                db_schedule.kind,
                db_schedule.scheduled_object,
                db_schedule.cron_trigger,
                db_schedule.concurrency_limit,
                mlrun.api.schemas.AuthInfo(username=username, access_key=access_key),
            )
        except Exception as exc:
            # best-effort: log (with traceback) and continue with the rest
            logger.warn(
                "Failed rescheduling job. Continuing",
                exc=str(exc),
                traceback=traceback.format_exc(),
                db_schedule=db_schedule,
            )
def _parse_query_parameters(request_body: Dict[str, Any]) -> Dict[str, str]:
    """
    This function searches for the target field in Grafana's SimpleJson json. Once located, the target string is
    parsed by splitting on semi-colons (;). Each part in the resulting list is then split by an equal sign (=) to
    be read as key-value pairs.

    :param request_body: the parsed json body of the Grafana SimpleJson request
    :raises MLRunBadRequestError: if no target query is present in the body
    :return: key-value parameters parsed from the first target's query string
    """

    # Try to get the target
    targets = request_body.get("targets", [])

    if len(targets) > 1:
        # Grafana allows multiple targets per panel; only the first is supported here
        # BUG FIX: message typo "more then" -> "more than"
        logger.warn(
            f"The 'targets' list contains more than one element ({len(targets)}), all targets except the first one are "
            f"ignored."
        )

    target_obj = targets[0] if targets else {}
    target_query = target_obj.get("target") if target_obj else ""

    if not target_query:
        raise MLRunBadRequestError(f"Target missing in request body:\n {request_body}")

    parameters = _parse_parameters(target_query)

    return parameters
def post_init(self, mode="sync"):
    """Create the model-endpoint record for this ensemble, if a server exists.

    The GraphServer is looked up on the context (private attribute first,
    then the public one); without it no endpoint record can be created.
    """
    graph_server = getattr(self.context, "_server", None)
    if not graph_server:
        graph_server = getattr(self.context, "server", None)
    if graph_server:
        _init_endpoint_record(graph_server, self)
    else:
        logger.warn("GraphServer not initialized for VotingEnsemble instance")
def grafana_incoming_features(
    body: Dict[str, Any], query_parameters: Dict[str, str], access_key: str
):
    """Build Grafana time-series targets of incoming feature values for an endpoint.

    Reads the endpoint's feature columns from the events TSDB table over the
    dashboard's time range and returns one GrafanaTimeSeriesTarget per feature.

    :param body: Grafana SimpleJson request body (time range read from "rangeRaw")
    :param query_parameters: must contain "endpoint_id" and "project"
    :param access_key: v3io access key used for both the endpoint lookup and frames
    :return: list of GrafanaTimeSeriesTarget (empty if feature names are unknown)
    """
    endpoint_id = query_parameters.get("endpoint_id")
    project = query_parameters.get("project")

    # default to the last hour when Grafana did not send a range
    start = body.get("rangeRaw", {}).get("from", "now-1h")
    end = body.get("rangeRaw", {}).get("to", "now")

    endpoint = ModelEndpoints.get_endpoint(
        access_key=access_key, project=project, endpoint_id=endpoint_id
    )

    time_series = []

    feature_names = endpoint.spec.feature_names

    if not feature_names:
        # without column names we cannot query the TSDB - return empty series
        logger.warn(
            "'feature_names' is either missing or not initialized in endpoint record",
            endpoint_id=endpoint.metadata.uid,
        )
        return time_series

    # resolve the events table path and its v3io container
    path = config.model_endpoint_monitoring.store_prefixes.default.format(
        project=project, kind=EVENTS
    )
    _, container, path = parse_model_endpoint_store_prefix(path)

    client = get_frames_client(
        token=access_key,
        address=config.v3io_framesd,
        container=container,
    )

    data: pd.DataFrame = client.read(
        backend="tsdb",
        table=path,
        columns=feature_names,
        filter=f"endpoint_id=='{endpoint_id}'",
        start=start,
        end=end,
    )

    # endpoint_id column is redundant (single endpoint queried)
    data.drop(["endpoint_id"], axis=1, inplace=True, errors="ignore")
    # convert the nanosecond datetime index to the millisecond epoch Grafana expects
    data.index = data.index.astype(np.int64) // 10**6

    # one target per feature column, one data point per timestamp
    for feature, indexed_values in data.to_dict().items():
        target = GrafanaTimeSeriesTarget(target=feature)
        for index, value in indexed_values.items():
            data_point = GrafanaDataPoint(value=float(value), timestamp=index)
            target.add_data_point(data_point)
        time_series.append(target)

    return time_series
def _validate_cron_trigger(
    self,
    cron_trigger: schemas.ScheduleCronTrigger,
    # accepting now from outside for testing purposes
    now: datetime = None,
):
    """
    Enforce no more than one job per min_allowed_interval

    Simulates consecutive fire times of the trigger and rejects it if any
    two of them are closer than the configured minimum interval.

    :param cron_trigger: the user-supplied cron trigger to validate
    :param now: base time for the simulation, injectable for tests
    :raises ValueError: if the trigger fires more often than allowed
    """
    logger.debug("Validating cron trigger")
    apscheduler_cron_trigger = self.transform_schemas_cron_trigger_to_apscheduler_cron_trigger(
        cron_trigger
    )
    now = now or datetime.now(apscheduler_cron_trigger.timezone)
    second_next_run_time = now

    # PERF FIX: the interval is loop-invariant - parse it once instead of on
    # every one of the 60 iterations
    min_allowed_interval_seconds = humanfriendly.parse_timespan(
        self._min_allowed_interval
    )

    # doing 60 checks to allow one minute precision, if the _min_allowed_interval is less than one minute validation
    # won't fail in certain scenarios that it should. See test_validate_cron_trigger_multi_checks for detailed
    # explanation
    for _ in range(60):
        next_run_time = apscheduler_cron_trigger.get_next_fire_time(
            None, second_next_run_time
        )
        # will be none if we got a schedule that has no next fire time - for example schedule with year=1999
        if next_run_time is None:
            return
        second_next_run_time = apscheduler_cron_trigger.get_next_fire_time(
            next_run_time, next_run_time
        )
        # will be none if we got a schedule that has no next fire time - for example schedule with year=2050
        if second_next_run_time is None:
            return
        if second_next_run_time < next_run_time + timedelta(
            seconds=min_allowed_interval_seconds
        ):
            logger.warn(
                "Cron trigger too frequent. Rejecting",
                cron_trigger=cron_trigger,
                next_run_time=next_run_time,
                second_next_run_time=second_next_run_time,
                delta=second_next_run_time - next_run_time,
            )
            raise ValueError(
                f"Cron trigger too frequent. no more than one job "
                f"per {self._min_allowed_interval} is allowed"
            )
def post_init(self, mode="sync"):
    """sync/async model loading, for internal use"""
    if not self.ready:
        if mode == "async":
            # load in a background thread so serving can start immediately
            loader_thread = threading.Thread(target=self._load_and_update_state)
            loader_thread.start()
            self.context.logger.info(f"started async model loading for {self.name}")
        else:
            # blocking load
            self._load_and_update_state()

    graph_server = getattr(self.context, "_server", None)
    if not graph_server:
        graph_server = getattr(self.context, "server", None)
    if graph_server:
        _init_endpoint_record(graph_server, self)
    else:
        logger.warn("GraphServer not initialized for VotingEnsemble instance")
def _reload_schedules(self, db_session: Session):
    """Re-register persisted schedules, restoring per-schedule sessions.

    When schedule credentials are stored in secrets, each schedule's session
    is read back (via the schedule secret key map) before the schedule is
    recreated in the in-memory scheduler.

    :param db_session: DB session used to list the persisted schedules
    """
    logger.info("Reloading schedules")
    db_schedules = get_db().list_schedules(db_session)
    for db_schedule in db_schedules:
        # don't let one failure fail the rest
        try:
            # import here to avoid circular imports
            import mlrun.api.crud

            session = None
            if self._store_schedule_credentials_in_secrets:
                # the session was stored as an internal project secret keyed by schedule name
                schedule_secret_key = mlrun.api.crud.Secrets().generate_schedule_secret_key(
                    db_schedule.name
                )
                secret_key_map = (
                    mlrun.api.crud.Secrets().generate_schedule_key_map_secret_key()
                )
                session = mlrun.api.crud.Secrets().get_secret(
                    db_schedule.project,
                    self._secrets_provider,
                    schedule_secret_key,
                    allow_secrets_from_k8s=True,
                    allow_internal_secrets=True,
                    key_map_secret_key=secret_key_map,
                )
            self._create_schedule_in_scheduler(
                db_schedule.project,
                db_schedule.name,
                db_schedule.kind,
                db_schedule.scheduled_object,
                db_schedule.cron_trigger,
                db_schedule.concurrency_limit,
                mlrun.api.schemas.AuthInfo(session=session),
            )
        except Exception as exc:
            # best-effort: log (with traceback) and continue with the rest
            logger.warn(
                "Failed rescheduling job. Continuing",
                exc=str(exc),
                traceback=traceback.format_exc(),
                db_schedule=db_schedule,
            )
def do(self, event: Dict):
    """Enrich a monitoring event with named feature and prediction mappings.

    On first sight of an endpoint, feature/label column names are loaded from
    its KV record; missing names are auto-generated (f0..fN / p0..pN) and
    written back to KV. Names are then cached on self for subsequent events.

    :param event: monitoring event dict; NAMED_FEATURES and NAMED_PREDICTIONS
                  keys are added in place
    :return: the enriched event
    """
    endpoint_id = event[ENDPOINT_ID]
    if endpoint_id not in self.feature_names:
        # cache miss - fetch column names from the endpoint's KV record
        endpoint_record = get_endpoint_record(
            kv_container=self.kv_container,
            kv_path=self.kv_path,
            endpoint_id=endpoint_id,
        )
        feature_names = endpoint_record.get(FEATURE_NAMES)
        feature_names = json.loads(feature_names) if feature_names else None

        label_columns = endpoint_record.get(LABEL_COLUMNS)
        label_columns = json.loads(label_columns) if label_columns else None

        if not feature_names:
            logger.warn(
                f"Feature names are not initialized, they will be automatically generated",
                endpoint_id=endpoint_id,
            )
            # generate one placeholder name per feature value in this event
            feature_names = [f"f{i}" for i, _ in enumerate(event[FEATURES])]
            # persist the generated names so later readers see the same columns
            get_v3io_client().kv.update(
                container=self.kv_container,
                table_path=self.kv_path,
                key=event[ENDPOINT_ID],
                attributes={FEATURE_NAMES: json.dumps(feature_names)},
            )

        if not label_columns:
            logger.warn(
                f"label column names are not initialized, they will be automatically generated",
                endpoint_id=endpoint_id,
            )
            # generate one placeholder name per prediction value in this event
            label_columns = [f"p{i}" for i, _ in enumerate(event[PREDICTION])]
            get_v3io_client().kv.update(
                container=self.kv_container,
                table_path=self.kv_path,
                key=event[ENDPOINT_ID],
                attributes={LABEL_COLUMNS: json.dumps(label_columns)},
            )

        self.label_columns[endpoint_id] = label_columns
        self.feature_names[endpoint_id] = feature_names

    # zip truncates to the shorter sequence if counts mismatch
    feature_names = self.feature_names[endpoint_id]
    features = event[FEATURES]
    event[NAMED_FEATURES] = {
        name: feature for name, feature in zip(feature_names, features)
    }

    label_columns = self.label_columns[endpoint_id]
    prediction = event[PREDICTION]
    event[NAMED_PREDICTIONS] = {
        name: prediction for name, prediction in zip(label_columns, prediction)
    }
    return event
def do(self, event: Dict):
    """Enrich a monitoring event with named feature and prediction mappings.

    On first sight of an endpoint, feature/label column names are loaded from
    its KV record (authenticated with self.access_key). Missing names are
    inferred from the event data when enabled, otherwise auto-generated
    (f0..fN / p0..pN), and written back to KV. Names are cached on self.

    :param event: monitoring event dict; NAMED_FEATURES and NAMED_PREDICTIONS
                  keys are added in place
    :return: the enriched event
    """
    endpoint_id = event[ENDPOINT_ID]
    if endpoint_id not in self.feature_names:
        # cache miss - fetch column names from the endpoint's KV record
        endpoint_record = get_endpoint_record(
            kv_container=self.kv_container,
            kv_path=self.kv_path,
            endpoint_id=endpoint_id,
            access_key=self.access_key,
        )
        feature_names = endpoint_record.get(FEATURE_NAMES)
        feature_names = json.loads(feature_names) if feature_names else None

        label_columns = endpoint_record.get(LABEL_COLUMNS)
        label_columns = json.loads(label_columns) if label_columns else None

        # optionally infer real names from the event payload before falling
        # back to generated placeholders
        if not feature_names and self._infer_columns_from_data:
            feature_names = self._infer_feature_names_from_data(event)

        if not feature_names:
            logger.warn(
                "Feature names are not initialized, they will be automatically generated",
                endpoint_id=endpoint_id,
            )
            feature_names = [f"f{i}" for i, _ in enumerate(event[FEATURES])]
            # persist the generated names; fail loudly if the KV write fails
            get_v3io_client().kv.update(
                container=self.kv_container,
                table_path=self.kv_path,
                access_key=self.access_key,
                key=event[ENDPOINT_ID],
                attributes={FEATURE_NAMES: json.dumps(feature_names)},
                raise_for_status=RaiseForStatus.always,
            )

        if not label_columns and self._infer_columns_from_data:
            label_columns = self._infer_label_columns_from_data(event)

        if not label_columns:
            logger.warn(
                "label column names are not initialized, they will be automatically generated",
                endpoint_id=endpoint_id,
            )
            label_columns = [f"p{i}" for i, _ in enumerate(event[PREDICTION])]
            get_v3io_client().kv.update(
                container=self.kv_container,
                table_path=self.kv_path,
                access_key=self.access_key,
                key=event[ENDPOINT_ID],
                attributes={LABEL_COLUMNS: json.dumps(label_columns)},
                raise_for_status=RaiseForStatus.always,
            )

        self.label_columns[endpoint_id] = label_columns
        self.feature_names[endpoint_id] = feature_names

        logger.info("Label columns", endpoint_id=endpoint_id, label_columns=label_columns)
        logger.info("Feature names", endpoint_id=endpoint_id, feature_names=feature_names)

    # zip truncates to the shorter sequence if counts mismatch
    feature_names = self.feature_names[endpoint_id]
    features = event[FEATURES]
    event[NAMED_FEATURES] = {
        name: feature for name, feature in zip(feature_names, features)
    }

    label_columns = self.label_columns[endpoint_id]
    prediction = event[PREDICTION]
    event[NAMED_PREDICTIONS] = {
        name: prediction for name, prediction in zip(label_columns, prediction)
    }
    logger.info("Mapped event", event=event)
    return event
async def submit_run_wrapper(
    scheduler,
    scheduled_object,
    project_name,
    schedule_name,
    schedule_concurrency_limit,
    auth_info: mlrun.api.schemas.AuthInfo,
):
    """Entry point invoked by APScheduler for job-kind schedules.

    Submits the stored run body, enforcing the schedule's concurrency limit.
    If credentials are required but missing on the auth info, they are
    enriched from the project owner and persisted back on the schedule.

    :param scheduler: the Scheduler instance (used for credential config/update)
    :param scheduled_object: the run body persisted on the schedule (mutated in place)
    :param project_name: project owning the schedule
    :param schedule_name: schedule identifier, also stamped as a run label
    :param schedule_concurrency_limit: max number of non-terminal runs allowed
    :param auth_info: auth info used for submission (session may be enriched)
    :return: the submit_run response, or None when the run was skipped
    """
    # import here to avoid circular imports
    import mlrun.api.crud
    from mlrun.api.api.utils import submit_run

    # removing the schedule from the body otherwise when the scheduler will submit this task it will go to an
    # endless scheduling loop
    scheduled_object.pop("schedule", None)

    # removing the uid from the task metadata so that a new uid will be generated for every run
    # otherwise all runs will have the same uid
    scheduled_object.get("task", {}).get("metadata", {}).pop("uid", None)

    if "task" in scheduled_object and "metadata" in scheduled_object["task"]:
        # label the run so active runs of this schedule can be counted below
        scheduled_object["task"]["metadata"].setdefault("labels", {})
        scheduled_object["task"]["metadata"]["labels"][
            schemas.constants.LabelNames.schedule_name
        ] = schedule_name

    db_session = create_session()
    # BUG FIX: the session used to leak on the concurrency-limit early return
    # and whenever submit_run raised - always close it
    try:
        active_runs = mlrun.api.crud.Runs().list_runs(
            db_session,
            state=RunStates.non_terminal_states(),
            project=project_name,
            labels=f"{schemas.constants.LabelNames.schedule_name}={schedule_name}",
        )
        if len(active_runs) >= schedule_concurrency_limit:
            logger.warn(
                "Schedule exceeded concurrency limit, skipping this run",
                project=project_name,
                schedule_name=schedule_name,
                schedule_concurrency_limit=schedule_concurrency_limit,
                active_runs=len(active_runs),
            )
            return

        # if credentials are needed but missing (will happen for schedules on upgrade from scheduler that didn't store
        # credentials to one that does store) enrich them
        # Note that here we're using the "knowledge" that submit_run only requires the session of the auth info
        if not auth_info.session and scheduler._store_schedule_credentials_in_secrets:
            # import here to avoid circular imports
            import mlrun.api.utils.auth
            import mlrun.api.utils.singletons.project_member

            logger.info(
                "Schedule missing auth info which is required. Trying to fill from project owner",
                project_name=project_name,
                schedule_name=schedule_name,
            )
            project_owner = mlrun.api.utils.singletons.project_member.get_project_member().get_project_owner(
                db_session, project_name
            )
            # BUG FIX: apply the fetched credentials to the current run as well,
            # not only to the persisted schedule - otherwise this submission
            # still goes out with an empty session
            auth_info.session = project_owner.session
            # Update the schedule with the new auth info so we won't need to do the above again in the next run
            scheduler.update_schedule(
                db_session,
                mlrun.api.schemas.AuthInfo(session=project_owner.session),
                project_name,
                schedule_name,
            )

        response = await submit_run(db_session, auth_info, scheduled_object)

        run_metadata = response["data"]["metadata"]
        run_uri = RunObject.create_uri(
            run_metadata["project"], run_metadata["uid"], run_metadata["iteration"]
        )
        get_db().update_schedule(
            db_session,
            run_metadata["project"],
            schedule_name,
            last_run_uri=run_uri,
        )
        return response
    finally:
        close_session(db_session)
def run(self):
    """Batch drift analysis over the latest parquet slice of each active endpoint.

    For every active model endpoint: locate the most recent parquet file
    (year/month/day/hour partition layout), compute feature statistics,
    compare them to the endpoint's baseline stats, and publish the drift
    results to the v3io stream (on drift), the KV record, and the TSDB.
    Per-endpoint failures are logged and skipped.
    """
    try:
        endpoints = self.db.list_model_endpoints(self.project)
    except Exception as e:
        logger.error("Failed to list endpoints", exc=e)
        return

    # only endpoints marked active are analyzed
    active_endpoints = set()
    for endpoint in endpoints.endpoints:
        if endpoint.spec.active:
            active_endpoints.add(endpoint.metadata.uid)

    store, sub = store_manager.get_or_create_store(self.parquet_path)
    prefix = self.parquet_path.replace(sub, "")
    fs = store.get_filesystem(silent=False)

    if not fs.exists(sub):
        # nothing was written yet - nothing to analyze
        logger.warn(f"{sub} does not exist")
        return

    for endpoint_dir in fs.ls(sub):
        # directories are named "<key>=<endpoint_id>" (hive-style partitioning)
        endpoint_id = endpoint_dir["name"].split("=")[-1]
        if endpoint_id not in active_endpoints:
            continue

        # don't let one endpoint's failure stop the whole batch
        try:
            # descend year -> month -> day -> hour to the newest partition
            last_year = self.get_last_created_dir(fs, endpoint_dir)
            last_month = self.get_last_created_dir(fs, last_year)
            last_day = self.get_last_created_dir(fs, last_month)
            last_hour = self.get_last_created_dir(fs, last_day)

            # pick the most recently modified parquet file in that partition
            parquet_files = fs.ls(last_hour["name"])
            last_parquet = sorted(parquet_files, key=lambda k: k["mtime"])[-1]
            parquet_name = last_parquet["name"]
            full_path = f"{prefix}{parquet_name}"

            logger.info(f"Now processing {full_path}")

            endpoint = self.db.get_model_endpoint(
                project=self.project, endpoint_id=endpoint_id
            )

            df = pd.read_parquet(full_path)
            timestamp = df["timestamp"].iloc[-1]

            # "named_features" holds per-row dicts; expand them into columns
            named_features_df = list(df["named_features"])
            named_features_df = pd.DataFrame(named_features_df)

            current_stats = DFDataInfer.get_stats(
                df=named_features_df, options=InferOptions.Histogram
            )

            # compare the current histograms to the endpoint's baseline stats
            drift_result = self.virtual_drift.compute_drift_from_histograms(
                feature_stats=endpoint.status.feature_stats,
                current_stats=current_stats,
            )
            logger.info("Drift result", drift_result=drift_result)

            drift_status, drift_measure = self.check_for_drift(
                drift_result=drift_result, endpoint=endpoint
            )
            logger.info(
                "Drift status",
                endpoint_id=endpoint_id,
                drift_status=drift_status,
                drift_measure=drift_measure,
            )

            # only (possible) drift is pushed to the notification stream
            if drift_status == "POSSIBLE_DRIFT" or drift_status == "DRIFT_DETECTED":
                self.v3io.stream.put_records(
                    container=self.stream_container,
                    stream_path=self.stream_path,
                    records=[
                        {
                            "data": json.dumps(
                                {
                                    "endpoint_id": endpoint_id,
                                    "drift_status": drift_status,
                                    "drift_measure": drift_measure,
                                    "drift_per_feature": {**drift_result},
                                }
                            )
                        }
                    ],
                )

            # always refresh the endpoint's KV record with the latest results
            self.v3io.kv.update(
                container=self.kv_container,
                table_path=self.kv_path,
                key=endpoint_id,
                attributes={
                    "current_stats": json.dumps(current_stats),
                    "drift_measures": json.dumps(drift_result),
                    "drift_status": drift_status,
                },
            )

            # and append the aggregate measures to the TSDB for dashboards
            tsdb_drift_measures = {
                "endpoint_id": endpoint_id,
                "timestamp": pd.to_datetime(timestamp, format=TIME_FORMAT),
                "record_type": "drift_measures",
                "tvd_mean": drift_result["tvd_mean"],
                "kld_mean": drift_result["kld_mean"],
                "hellinger_mean": drift_result["hellinger_mean"],
            }

            self.frames.write(
                backend="tsdb",
                table=self.tsdb_path,
                dfs=pd.DataFrame.from_dict([tsdb_drift_measures]),
                index_cols=["timestamp", "endpoint_id", "record_type"],
            )

            logger.info(f"Done updating drift measures {full_path}")

        except Exception as e:
            # best-effort batch: log and continue with the next endpoint
            logger.error(e)