def cleanup_endpoints(db: Session, client: TestClient):
    """
    Remove the model endpoint KV records and the events TSDB table of the test project.

    Nothing is done when `_is_env_params_dont_exist()` reports that the environment
    parameters are missing. Both cleanup steps are best effort: a `RuntimeError` raised
    by the KV calls and a `CreateError` raised by the frames call are ignored.

    :param db:     DB session, not used by the cleanup itself
    :param client: API test client, not used by the cleanup itself
    """
    if _is_env_params_dont_exist():
        return

    endpoints_prefix = config.model_endpoint_monitoring.store_prefixes.default.format(
        project=TEST_PROJECT,
        kind=mlrun.api.schemas.ModelMonitoringStoreKinds.ENDPOINTS,
    )
    _, kv_container, kv_path = parse_model_endpoint_store_prefix(endpoints_prefix)

    events_prefix = config.model_endpoint_monitoring.store_prefixes.default.format(
        project=TEST_PROJECT,
        kind=mlrun.api.schemas.ModelMonitoringStoreKinds.EVENTS,
    )
    _, tsdb_container, tsdb_path = parse_model_endpoint_store_prefix(events_prefix)

    v3io_client = get_v3io_client(
        endpoint=config.v3io_api, access_key=_get_access_key()
    )
    frames_client = get_frames_client(
        token=_get_access_key(),
        container=tsdb_container,
        address=config.v3io_framesd,
    )

    # Cleanup KV
    try:
        cursor = v3io_client.kv.new_cursor(
            container=kv_container,
            table_path=kv_path,
            raise_for_status=RaiseForStatus.never,
        )
        # Collect all record names first, then delete them one by one
        record_keys = [item["__name"] for item in cursor.all()]
        for record_key in record_keys:
            v3io_client.kv.delete(
                container=kv_container,
                table_path=kv_path,
                key=record_key,
                raise_for_status=RaiseForStatus.never,
            )
    except RuntimeError:
        pass

    # Cleanup TSDB
    try:
        frames_client.delete(
            backend="tsdb",
            table=tsdb_path,
            if_missing=fpb2.IGNORE,
        )
    except CreateError:
        pass
def _create_model_monitoring_stream(project: str):
    """
    Create the model monitoring stream of the given project.

    The stream location is derived from the default model endpoint store prefix with
    kind "stream". A 400 response whose body mentions "ResourceInUse" is tolerated;
    any other response is handed to `response.raise_for_status([409, 204])`.

    :param project: The name of the project the stream is created for
    """
    stream_prefix = config.model_endpoint_monitoring.store_prefixes.default.format(
        project=project, kind="stream"
    )
    _, container, stream_path = parse_model_endpoint_store_prefix(stream_prefix)

    # TODO: How should we configure sharding here?
    logger.info(
        "Creating model endpoint stream for project",
        project=project,
        stream_path=stream_path,
        container=container,
        endpoint=config.v3io_api,
    )

    v3io_client = v3io.dataplane.Client(
        endpoint=config.v3io_api,
        access_key=os.environ.get("V3IO_ACCESS_KEY"),
    )
    stream_args = config.model_endpoint_monitoring.serving_stream_args
    response = v3io_client.create_stream(
        container=container,
        path=stream_path,
        shard_count=stream_args.shard_count,
        retention_period_hours=stream_args.retention_period_hours,
        raise_for_status=v3io.dataplane.RaiseForStatus.never,
    )

    resource_in_use = response.status_code == 400 and "ResourceInUse" in str(
        response.body
    )
    if resource_in_use:
        return
    response.raise_for_status([409, 204])
def delete_endpoint_record(access_key: str, project: str, endpoint_id: str):
    """
    Delete the KV record of a single model endpoint.

    The record is located through the project's endpoints table and the endpoint id.

    :param access_key:  V3IO access key for managing user permissions
    :param project:     The name of the project
    :param endpoint_id: The id of the endpoint
    """
    logger.info("Clearing model endpoint table", endpoint_id=endpoint_id)

    kv_client = get_v3io_client(endpoint=config.v3io_api)
    endpoints_prefix = config.model_endpoint_monitoring.store_prefixes.default.format(
        project=project, kind=ENDPOINTS
    )
    _, container, table_path = parse_model_endpoint_store_prefix(endpoints_prefix)

    kv_client.kv.delete(
        container=container,
        table_path=table_path,
        key=endpoint_id,
        access_key=access_key,
    )

    logger.info("Model endpoint table cleared", endpoint_id=endpoint_id)
def delete_endpoint_record(
    self,
    auth_info: mlrun.api.schemas.AuthInfo,
    project: str,
    endpoint_id: str,
    access_key: str,
):
    """
    Delete the KV record of a single model endpoint.

    The record is located through the project's endpoints table and the endpoint id.

    :param auth_info:   The required auth information for doing the deletion
                        (not read by this method itself)
    :param project:     The name of the project
    :param endpoint_id: The id of the endpoint
    :param access_key:  Access key with permission to delete
    """
    logger.info("Clearing model endpoint table", endpoint_id=endpoint_id)

    kv_client = get_v3io_client(endpoint=config.v3io_api)
    endpoints_prefix = config.model_endpoint_monitoring.store_prefixes.default.format(
        project=project,
        kind=mlrun.api.schemas.ModelMonitoringStoreKinds.ENDPOINTS,
    )
    _, container, table_path = parse_model_endpoint_store_prefix(endpoints_prefix)

    kv_client.kv.delete(
        container=container,
        table_path=table_path,
        key=endpoint_id,
        access_key=access_key,
    )

    logger.info("Model endpoint table cleared", endpoint_id=endpoint_id)
def test_get_endpoint_metrics(db: Session, client: TestClient):
    """
    Write ten metric samples per endpoint to the events TSDB table and check that the
    model-endpoints API returns them: the sum of the returned values must equal the
    sum of the written ones.
    """
    events_prefix = config.model_endpoint_monitoring.store_prefixes.default.format(
        project=TEST_PROJECT, kind=EVENTS
    )
    _, container, path = parse_model_endpoint_store_prefix(events_prefix)

    frames = get_frames_client(
        token=_get_access_key(),
        container=container,
        address=config.v3io_framesd,
    )

    start = datetime.utcnow()

    for _ in range(5):
        endpoint = _mock_random_endpoint()
        write_endpoint_to_kv(_get_access_key(), endpoint)
        frames.create(backend="tsdb", table=path, rate="10/m", if_exists=1)

        # One random count per minute, for the ten minutes preceding `start`
        counts = [randint(1, 10) for _ in range(10)]
        total = sum(counts)
        dfs = [
            pd.DataFrame(
                data=[
                    {
                        "predictions_per_second_count_1s": count,
                        "endpoint_id": endpoint.metadata.uid,
                        "timestamp": start - timedelta(minutes=10 - minute),
                    }
                ]
            )
            for minute, count in enumerate(counts)
        ]

        frames.write(
            backend="tsdb",
            table=path,
            dfs=dfs,
            index_cols=["timestamp", "endpoint_id"],
        )

        response = client.get(
            url=f"/api/projects/{TEST_PROJECT}/model-endpoints/{endpoint.metadata.uid}?metric=predictions_per_second_count_1s",  # noqa
            headers={"X-V3io-Session-Key": _get_access_key()},
        )
        fetched = ModelEndpoint(**response.json())

        assert len(fetched.status.metrics) > 0

        predictions_per_second = fetched.status.metrics[
            "predictions_per_second_count_1s"
        ]
        assert predictions_per_second.name == "predictions_per_second_count_1s"

        response_total = sum(point[1] for point in predictions_per_second.values)
        assert total == response_total
def __init__(self, context: MLClientCtx, project: str):
    """
    Initialize the batch processor: resolve the project's parquet, KV, TSDB and stream
    locations from the default store prefix, read the default drift thresholds from
    the configuration and create the DB, V3IO and frames clients.

    :param context: MLRun client context, stored on the instance
    :param project: The name of the project this processor works on
    """
    self.context = context
    self.project = project

    self.virtual_drift = VirtualDrift(inf_capping=10)

    store_prefix = config.model_endpoint_monitoring.store_prefixes.default

    # The parquet location is kept as the full formatted prefix, it is not parsed
    self.parquet_path = store_prefix.format(project=self.project, kind="parquet")

    endpoints_prefix = store_prefix.format(project=self.project, kind="endpoints")
    _, self.kv_container, self.kv_path = parse_model_endpoint_store_prefix(
        endpoints_prefix
    )

    events_prefix = store_prefix.format(project=self.project, kind="events")
    _, self.tsdb_container, self.tsdb_path = parse_model_endpoint_store_prefix(
        events_prefix
    )

    log_stream_prefix = store_prefix.format(project=self.project, kind="log_stream")
    _, self.stream_container, self.stream_path = parse_model_endpoint_store_prefix(
        log_stream_prefix
    )

    logger.info(
        "Initializing BatchProcessor",
        parquet_path=self.parquet_path,
        kv_container=self.kv_container,
        kv_path=self.kv_path,
        tsdb_container=self.tsdb_container,
        tsdb_path=self.tsdb_path,
        stream_container=self.stream_container,
        stream_path=self.stream_path,
    )

    default_thresholds = config.model_endpoint_monitoring.drift_thresholds.default
    self.default_possible_drift_threshold = default_thresholds.possible_drift
    self.default_drift_detected_threshold = default_thresholds.drift_detected

    self.db = get_run_db()
    self.v3io = get_v3io_client()
    self.frames = get_frames_client(
        address=config.v3io_framesd,
        container=self.tsdb_container,
    )
def write_endpoint_to_kv(
    self, access_key: str, endpoint: ModelEndpoint, update: bool = True
):
    """
    Write the endpoint data to KV, a prerequisite for initializing the monitoring process.

    Missing optional values are stored as empty strings or empty JSON containers, and
    every label is additionally stored as its own attribute named "_<label key>".

    :param access_key: V3IO access key for managing user permissions
    :param endpoint:   ModelEndpoint object
    :param update:     When True, use client.kv.update, otherwise use client.kv.put

    :return: The endpoint object that was passed in
    """
    metadata, spec, status = endpoint.metadata, endpoint.spec, endpoint.status

    labels = metadata.labels or {}
    # An empty labels dict simply yields no extra attributes
    searchable_labels = {f"_{key}": value for key, value in labels.items()}

    feature_names = spec.feature_names or []
    label_names = spec.label_names or []
    feature_stats = status.feature_stats or {}
    current_stats = status.current_stats or {}
    children = status.children or []
    monitor_configuration = spec.monitor_configuration or {}

    client = get_v3io_client(endpoint=config.v3io_api)
    if update:
        kv_write = client.kv.update
    else:
        kv_write = client.kv.put

    endpoints_prefix = config.model_endpoint_monitoring.store_prefixes.default.format(
        project=metadata.project, kind=self.ENDPOINTS
    )
    _, container, table_path = parse_model_endpoint_store_prefix(endpoints_prefix)

    attributes = {
        "endpoint_id": metadata.uid,
        "project": metadata.project,
        "function_uri": spec.function_uri,
        "model": spec.model,
        "model_class": spec.model_class or "",
        "labels": json.dumps(labels),
        "model_uri": spec.model_uri or "",
        "stream_path": spec.stream_path or "",
        "active": spec.active or "",
        "state": status.state or "",
        "feature_stats": json.dumps(feature_stats),
        "current_stats": json.dumps(current_stats),
        "feature_names": json.dumps(feature_names),
        "children": json.dumps(children),
        "label_names": json.dumps(label_names),
        "monitor_configuration": json.dumps(monitor_configuration),
        **searchable_labels,
    }

    kv_write(
        container=container,
        table_path=table_path,
        key=metadata.uid,
        access_key=access_key,
        attributes=attributes,
    )

    return endpoint
def grafana_incoming_features(
    body: Dict[str, Any], query_parameters: Dict[str, str], access_key: str
):
    """
    Build one Grafana time series per feature of a model endpoint from the events TSDB table.

    :param body:             Request body; "rangeRaw.from" / "rangeRaw.to" give the time
                             range and default to "now-1h" / "now"
    :param query_parameters: Must provide "endpoint_id" and "project"
    :param access_key:       V3IO access key for managing user permissions

    :return: A list of GrafanaTimeSeriesTarget objects, empty when the endpoint record
             has no feature names
    """
    endpoint_id = query_parameters.get("endpoint_id")
    project = query_parameters.get("project")

    raw_range = body.get("rangeRaw", {})
    start = raw_range.get("from", "now-1h")
    end = raw_range.get("to", "now")

    endpoint = ModelEndpoints.get_endpoint(
        access_key=access_key, project=project, endpoint_id=endpoint_id
    )

    time_series = []

    feature_names = endpoint.spec.feature_names
    if not feature_names:
        logger.warn(
            "'feature_names' is either missing or not initialized in endpoint record",
            endpoint_id=endpoint.metadata.uid,
        )
        return time_series

    events_prefix = config.model_endpoint_monitoring.store_prefixes.default.format(
        project=project, kind=EVENTS
    )
    _, container, table = parse_model_endpoint_store_prefix(events_prefix)

    frames_client = get_frames_client(
        token=access_key,
        address=config.v3io_framesd,
        container=container,
    )

    data: pd.DataFrame = frames_client.read(
        backend="tsdb",
        table=table,
        columns=feature_names,
        filter=f"endpoint_id=='{endpoint_id}'",
        start=start,
        end=end,
    )

    data.drop(["endpoint_id"], axis=1, inplace=True, errors="ignore")
    # Integer-divide the int64 index values by 10**6 before using them as timestamps
    data.index = data.index.astype(np.int64) // 10**6

    for feature, values_by_timestamp in data.to_dict().items():
        target = GrafanaTimeSeriesTarget(target=feature)
        for timestamp, value in values_by_timestamp.items():
            target.add_data_point(
                GrafanaDataPoint(value=float(value), timestamp=timestamp)
            )
        time_series.append(target)

    return time_series
def cleanup_endpoints(self):
    """
    Remove all model endpoint leftovers of the test project.

    Three steps are performed:
    1. Every endpoint listed by the run DB is deleted through the run DB.
    2. Every record left in the endpoints KV table is deleted directly.
    3. The events TSDB table is deleted.

    Steps 2 and 3 are best effort: a `RuntimeError` raised by the KV calls and a
    `CreateError` raised by the frames call are ignored.
    """
    db = mlrun.get_run_db()
    endpoints = db.list_model_endpoints(self.project_name)
    for endpoint in endpoints.endpoints:
        db.delete_model_endpoint_record(
            endpoint.metadata.project, endpoint.metadata.uid
        )

    v3io = get_v3io_client(
        endpoint=config.v3io_api, access_key=self._get_auth_info().data_session
    )

    store_prefix = config.model_endpoint_monitoring.store_prefixes.default

    kv_prefix = store_prefix.format(
        project=self.project_name, kind=mlrun.api.crud.ModelEndpoints().ENDPOINTS
    )
    _, kv_container, kv_path = parse_model_endpoint_store_prefix(kv_prefix)

    # The TSDB table is stored under the EVENTS kind, not under the ENDPOINTS kind
    # of the KV table, so it needs its own container and path
    tsdb_prefix = store_prefix.format(
        project=self.project_name, kind=mlrun.api.crud.ModelEndpoints().EVENTS
    )
    _, tsdb_container, tsdb_path = parse_model_endpoint_store_prefix(tsdb_prefix)

    frames = get_frames_client(
        token=self._get_auth_info().data_session,
        container=tsdb_container,
        address=config.v3io_framesd,
    )

    # Cleanup KV
    try:
        all_records = v3io.kv.new_cursor(
            container=kv_container,
            table_path=kv_path,
            raise_for_status=RaiseForStatus.never,
        ).all()

        for record in [r["__name"] for r in all_records]:
            v3io.kv.delete(
                container=kv_container,
                table_path=kv_path,
                key=record,
                raise_for_status=RaiseForStatus.never,
            )
    except RuntimeError:
        # Best effort: the KV table may be empty or missing
        pass

    # Cleanup TSDB
    try:
        frames.delete(
            backend="tsdb",
            table=tsdb_path,
            if_missing=fpb2.IGNORE,
        )
    except CreateError:
        # Best effort: the TSDB table may not have been created
        pass
def test_get_endpoint_metric_function():
    """
    Write ten metric samples for one endpoint to the events TSDB table and check that
    `get_endpoint_metrics` returns all ten, with the same total as was written.
    """
    events_prefix = config.model_endpoint_monitoring.store_prefixes.default.format(
        project=TEST_PROJECT, kind=EVENTS
    )
    _, container, path = parse_model_endpoint_store_prefix(events_prefix)

    frames = get_frames_client(
        token=_get_access_key(),
        container=container,
        address=config.v3io_framesd,
    )

    start = datetime.utcnow()

    endpoint = _mock_random_endpoint()
    write_endpoint_to_kv(_get_access_key(), endpoint)

    frames.create(backend="tsdb", table=path, rate="10/m", if_exists=1)

    # One random count per minute, for the ten minutes preceding `start`
    counts = [randint(1, 10) for _ in range(10)]
    total = sum(counts)
    dfs = [
        pd.DataFrame(
            data=[
                {
                    "predictions_per_second_count_1s": count,
                    "endpoint_id": endpoint.metadata.uid,
                    "timestamp": start - timedelta(minutes=10 - minute),
                }
            ]
        )
        for minute, count in enumerate(counts)
    ]

    frames.write(
        backend="tsdb",
        table=path,
        dfs=dfs,
        index_cols=["timestamp", "endpoint_id"],
    )

    endpoint_metrics = get_endpoint_metrics(
        access_key=_get_access_key(),
        project=TEST_PROJECT,
        endpoint_id=endpoint.metadata.uid,
        metrics=["predictions_per_second_count_1s"],
    )

    assert "predictions_per_second_count_1s" in endpoint_metrics

    actual_values = endpoint_metrics["predictions_per_second_count_1s"].values
    assert len(actual_values) == 10
    assert sum(point[1] for point in actual_values) == total
def get_endpoint_metrics(
    self,
    access_key: str,
    project: str,
    endpoint_id: str,
    metrics: List[str],
    start: str = "now-1h",
    end: str = "now",
) -> Dict[str, Metric]:
    """
    Read the requested metrics of a model endpoint from the events TSDB table.

    :param access_key:  V3IO access key, passed as the frames client token
    :param project:     The name of the project
    :param endpoint_id: The id of the model endpoint
    :param metrics:     Names of the metrics to read, must not be empty
    :param start:       The start time of the metrics
    :param end:         The end time of the metrics

    :return: A mapping of metric name to Metric; metrics that are absent from the
             data that was read are left out

    :raise MLRunInvalidArgumentError: when no metric names are provided
    """
    if not metrics:
        raise MLRunInvalidArgumentError("Metric names must be provided")

    events_prefix = config.model_endpoint_monitoring.store_prefixes.default.format(
        project=project, kind=mlrun.api.schemas.ModelMonitoringStoreKinds.EVENTS
    )
    _, container, table = parse_model_endpoint_store_prefix(events_prefix)

    frames_client = get_frames_client(
        token=access_key,
        address=config.v3io_framesd,
        container=container,
    )

    data = frames_client.read(
        backend="tsdb",
        table=table,
        columns=["endpoint_id", *metrics],
        filter=f"endpoint_id=='{endpoint_id}'",
        start=start,
        end=end,
    )

    samples_by_metric = data.to_dict()

    metrics_mapping = {}
    for metric in metrics:
        samples = samples_by_metric.get(metric)
        if samples is not None:
            metrics_mapping[metric] = Metric(
                name=metric,
                values=[
                    (str(timestamp), value) for timestamp, value in samples.items()
                ],
            )
    return metrics_mapping
def cleanup_endpoints(db: Session, client: TestClient):
    """
    Remove the model endpoint KV records and the events TSDB table of the test project.

    Both cleanup steps are best effort: a `RuntimeError` raised by the KV calls and a
    `CreateError` raised by the frames call are ignored.

    :param db:     DB session, not used by the cleanup itself
    :param client: API test client, not used by the cleanup itself
    """
    # Do nothing unless its system test env
    if _is_env_params_dont_exist():
        return

    v3io = get_v3io_client(endpoint=config.v3io_api, access_key=_get_access_key())

    store_prefix = config.model_endpoint_monitoring.store_prefixes.default

    kv_prefix = store_prefix.format(project=TEST_PROJECT, kind=ENDPOINTS)
    _, kv_container, kv_path = parse_model_endpoint_store_prefix(kv_prefix)

    # The TSDB table is stored under the "events" kind, not under the endpoints kind
    # of the KV table, so it needs its own container and path
    tsdb_prefix = store_prefix.format(project=TEST_PROJECT, kind="events")
    _, tsdb_container, tsdb_path = parse_model_endpoint_store_prefix(tsdb_prefix)

    frames = get_frames_client(
        token=_get_access_key(),
        container=tsdb_container,
        address=config.v3io_framesd,
    )

    # Cleanup KV
    try:
        all_records = v3io.kv.new_cursor(
            container=kv_container,
            table_path=kv_path,
            raise_for_status=RaiseForStatus.never,
        ).all()

        for record in [r["__name"] for r in all_records]:
            v3io.kv.delete(
                container=kv_container,
                table_path=kv_path,
                key=record,
                raise_for_status=RaiseForStatus.never,
            )
    except RuntimeError:
        # Best effort: the KV table may be empty or missing
        pass

    # Cleanup TSDB
    try:
        frames.delete(
            backend="tsdb",
            table=tsdb_path,
            if_missing=fpb2.IGNORE,
        )
    except CreateError:
        # Best effort: the TSDB table may not have been created
        pass
def test_grafana_incoming_features(db: Session, client: TestClient):
    """
    Write ten samples of four features for each of five endpoints to the events TSDB
    table, then check that the grafana proxy "incoming_features" target returns the
    four features, each with ten data points, for every endpoint.
    """
    path = config.model_endpoint_monitoring.store_prefixes.default.format(
        project=TEST_PROJECT, kind=EVENTS
    )
    _, container, path = parse_model_endpoint_store_prefix(path)

    frames = get_frames_client(
        token=_get_access_key(),
        container=container,
        address=config.v3io_framesd,
    )

    frames.create(backend="tsdb", table=path, rate="10/m", if_exists=1)

    start = datetime.utcnow()
    endpoints = [_mock_random_endpoint() for _ in range(5)]
    for e in endpoints:
        e.spec.feature_names = ["f0", "f1", "f2", "f3"]

    for endpoint in endpoints:
        ModelEndpoints.create_or_patch(_get_access_key(), endpoint)

        # One sample per minute, for the ten minutes preceding `start`
        dfs = [
            pd.DataFrame(
                data=[
                    {
                        "f0": i,
                        "f1": i + 1,
                        "f2": i + 2,
                        "f3": i + 3,
                        "endpoint_id": endpoint.metadata.uid,
                        "timestamp": start - timedelta(minutes=10 - i),
                    }
                ]
            )
            for i in range(10)
        ]

        frames.write(
            backend="tsdb",
            table=path,
            dfs=dfs,
            index_cols=["timestamp", "endpoint_id"],
        )

    for endpoint in endpoints:
        response = client.post(
            url="/api/grafana-proxy/model-endpoints/query",
            headers={"X-V3io-Session-Key": _get_access_key()},
            json={
                "targets": [
                    {
                        "target": f"project={TEST_PROJECT};endpoint_id={endpoint.metadata.uid};target_endpoint=incoming_features"  # noqa
                    }
                ]
            },
        )
        response = response.json()

        targets = [t["target"] for t in response]
        assert targets == ["f0", "f1", "f2", "f3"]

        datapoints_per_target = [t["datapoints"] for t in response]
        assert all(len(datapoints) == 10 for datapoints in datapoints_per_target)
# NOTE(review): this is the tail of an initializer whose `def` line is outside this
# view; the names assigned from are presumably its parameters - confirm.

# Keep the batching settings exactly as given
self.tsdb_batching_max_events = tsdb_batching_max_events
self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs
self.parquet_batching_max_events = parquet_batching_max_events
self.parquet_batching_timeout_secs = parquet_batching_timeout_secs

# Aggregation windows fall back to ["5m", "1h"] when none (or an empty value) is given
self.aggregate_count_windows = aggregate_count_windows or ["5m", "1h"]
self.aggregate_count_period = aggregate_count_period
self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"]
self.aggregate_avg_period = aggregate_avg_period

# Fall back to the V3IO_ACCESS_KEY environment variable / the configured framesd address
self.v3io_access_key = v3io_access_key or environ.get("V3IO_ACCESS_KEY")
self.v3io_framesd = v3io_framesd or config.v3io_framesd

template = config.model_endpoint_monitoring.store_prefixes.default

kv_path = template.format(project=project, kind="endpoints")
_, self.kv_container, self.kv_path = parse_model_endpoint_store_prefix(kv_path)

tsdb_path = template.format(project=project, kind="events")
_, self.tsdb_container, self.tsdb_path = parse_model_endpoint_store_prefix(tsdb_path)
# Unlike kv_path, the stored TSDB path is prefixed with its container
self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}"

# The parquet path is kept as the full formatted prefix, it is not parsed
self.parquet_path = template.format(project=project, kind="parquet")

logger.info(
    "Writer paths",
    kv_path=self.kv_path,
    tsdb_path=self.tsdb_path,
    parquet_path=self.parquet_path,
)
def delete_model_endpoints_resources(self, project_name: str):
    """
    Delete all model endpoint resources of the given project.

    The following steps are performed, using the access key taken from the
    V3IO_ACCESS_KEY environment variable:
    1. Every endpoint returned by `list_endpoints` is deleted with `delete_endpoint_record`.
    2. Every record left in the endpoints KV table is deleted directly (best effort).
    3. A TSDB delete is issued through the frames client (best effort).
    4. The path returned by `parse_model_endpoint_project_prefix` is removed recursively
       through the store manager.

    Nothing is done when `config.igz_version` or `config.v3io_api` is not set.

    :param project_name: The name of the project whose model endpoint resources are deleted
    """
    auth_info = mlrun.api.schemas.AuthInfo(
        data_session=os.getenv("V3IO_ACCESS_KEY")
    )
    access_key = auth_info.data_session

    # we would ideally base on config.v3io_api but can't for backwards compatibility reasons,
    # we're using the igz version heuristic
    if not config.igz_version or not config.v3io_api:
        return

    endpoints = self.list_endpoints(auth_info, project_name)
    for endpoint in endpoints.endpoints:
        self.delete_endpoint_record(
            auth_info,
            endpoint.metadata.project,
            endpoint.metadata.uid,
            access_key,
        )

    v3io = get_v3io_client(endpoint=config.v3io_api, access_key=access_key)

    path = config.model_endpoint_monitoring.store_prefixes.default.format(
        project=project_name,
        kind=mlrun.api.schemas.ModelMonitoringStoreKinds.ENDPOINTS,
    )
    # Must be computed from the full prefix, before `path` is replaced by its parsed form
    tsdb_path = parse_model_endpoint_project_prefix(path, project_name)
    _, container, path = parse_model_endpoint_store_prefix(path)

    frames = get_frames_client(
        token=access_key,
        container=container,
        address=config.v3io_framesd,
    )

    # Cleanup KV
    try:
        all_records = v3io.kv.new_cursor(
            container=container,
            table_path=path,
            raise_for_status=RaiseForStatus.never,
            access_key=access_key,
        ).all()

        for record in [r["__name"] for r in all_records]:
            v3io.kv.delete(
                container=container,
                table_path=path,
                key=record,
                access_key=access_key,
                raise_for_status=RaiseForStatus.never,
            )
    except RuntimeError as exc:
        # KV might raise an exception even if it was set not to raise one. The exception is
        # raised if the path is empty or does not exist, therefore ignoring failures until
        # the bug is fixed.
        # TODO: remove try except after bug is fixed
        logger.debug(
            "Failed cleaning model endpoints KV. Ignoring",
            exc=str(exc),
            traceback=traceback.format_exc(),
        )

    # Cleanup TSDB
    # NOTE(review): `path` is the endpoints KV table path here, not an events TSDB path -
    # confirm this is the intended table. The recursive removal below is applied to
    # `tsdb_path`, which is derived from the project name.
    try:
        frames.delete(
            backend="tsdb",
            table=path,
            if_missing=frames_pb2.IGNORE,
        )
    except CreateError:
        # frames might raise an exception if schema file does not exist.
        pass

    # final cleanup of tsdb path
    # str.replace returns a new string, so the result has to be assigned back for the
    # "://u" -> ":///u" normalization to take effect
    tsdb_path = tsdb_path.replace("://u", ":///u")
    store, _ = mlrun.store_manager.get_or_create_store(tsdb_path)
    store.rm(tsdb_path, recursive=True)
def list_endpoints(
    self,
    auth_info: mlrun.api.schemas.AuthInfo,
    project: str,
    model: Optional[str] = None,
    function: Optional[str] = None,
    labels: Optional[List[str]] = None,
    metrics: Optional[List[str]] = None,
    start: str = "now-1h",
    end: str = "now",
    top_level: Optional[bool] = False,
    uids: Optional[List[str]] = None,
) -> ModelEndpointList:
    """
    List the model endpoints of a project, each one representing the current state of
    a model endpoint.

    Endpoints can be filtered by:
    1) model
    2) function
    3) labels
    4) top level
    5) uids

    Without any filter, all the available endpoints of the project are listed. When
    `uids` is given, the KV table is not queried and the other filters are not applied:
    exactly the endpoints with the given uids are fetched.

    Metrics can be attached to every listed endpoint. They are time based and depend on
    the 'start' and 'end' parameters. When `metrics` is None no metrics are added.

    :param auth_info: Auth information; its data_session is used as the access key of
                      the KV query and it is passed on to `get_endpoint`
    :param project:   The name of the project
    :param model:     The name of the model to filter by
    :param function:  The name of the function to filter by
    :param labels:    A list of labels to filter by. Label filters work by either
                      filtering a specific value of a label (i.e. list("key==value"))
                      or by looking for the existence of a given key (i.e. "key")
    :param metrics:   A list of metrics to return for each endpoint, read more in 'TimeMetric'
    :param start:     The start time of the metrics
    :param end:       The end time of the metrics
    :param top_level: if True will return only routers and endpoint that are NOT
                      children of any router
    :param uids:      will return ModelEndpointList of endpoints with uid in uids

    :return: A ModelEndpointList; it is empty when reading the KV cursor fails
    """
    logger.info(
        "Listing endpoints",
        project=project,
        model=model,
        function=function,
        labels=labels,
        metrics=metrics,
        start=start,
        end=end,
        top_level=top_level,
        uids=uids,
    )

    endpoint_list = ModelEndpointList(endpoints=[])

    if uids is None:
        # No explicit uids: resolve them from the endpoints KV table
        client = get_v3io_client(endpoint=config.v3io_api)

        endpoints_prefix = (
            config.model_endpoint_monitoring.store_prefixes.default.format(
                project=project,
                kind=mlrun.api.schemas.ModelMonitoringStoreKinds.ENDPOINTS,
            )
        )
        _, container, table_path = parse_model_endpoint_store_prefix(endpoints_prefix)

        filter_expression = self.build_kv_cursor_filter_expression(
            project,
            function,
            model,
            labels,
            top_level,
        )
        cursor = client.kv.new_cursor(
            container=container,
            table_path=table_path,
            access_key=auth_info.data_session,
            filter_expression=filter_expression,
            attribute_names=["endpoint_id"],
            raise_for_status=RaiseForStatus.never,
        )

        try:
            items = cursor.all()
        except Exception:
            return endpoint_list

        uids = [item["endpoint_id"] for item in items]

    endpoint_list.endpoints.extend(
        self.get_endpoint(
            auth_info=auth_info,
            project=project,
            endpoint_id=endpoint_id,
            metrics=metrics,
            start=start,
            end=end,
        )
        for endpoint_id in uids
    )
    return endpoint_list
def list_endpoints(
    access_key: str,
    project: str,
    model: Optional[str] = None,
    function: Optional[str] = None,
    labels: Optional[List[str]] = None,
    metrics: Optional[List[str]] = None,
    start: str = "now-1h",
    end: str = "now",
) -> ModelEndpointList:
    """
    List the model endpoints of a project, each one representing the current state of
    a model endpoint.

    Endpoints can be filtered by:
    1) model
    2) function
    3) labels

    Without any filter, all the available endpoints of the project are listed.

    Metrics can be attached to every listed endpoint. They are time based and depend on
    the 'start' and 'end' parameters. When `metrics` is None no metrics are added.

    :param access_key: V3IO access key for managing user permissions
    :param project:    The name of the project
    :param model:      The name of the model to filter by
    :param function:   The name of the function to filter by
    :param labels:     A list of labels to filter by. Label filters work by either
                       filtering a specific value of a label (i.e. list("key==value"))
                       or by looking for the existence of a given key (i.e. "key")
    :param metrics:    A list of metrics to return for each endpoint, read more in 'TimeMetric'
    :param start:      The start time of the metrics
    :param end:        The end time of the metrics

    :return: A ModelEndpointList with one entry per item returned by the KV cursor
    """
    logger.info(
        "Listing endpoints",
        project=project,
        model=model,
        function=function,
        labels=labels,
        metrics=metrics,
        start=start,
        end=end,
    )

    client = get_v3io_client(endpoint=config.v3io_api)

    endpoints_prefix = config.model_endpoint_monitoring.store_prefixes.default.format(
        project=project, kind=ENDPOINTS
    )
    _, container, table_path = parse_model_endpoint_store_prefix(endpoints_prefix)

    filter_expression = build_kv_cursor_filter_expression(
        project, function, model, labels
    )
    cursor = client.kv.new_cursor(
        container=container,
        table_path=table_path,
        access_key=access_key,
        filter_expression=filter_expression,
        attribute_names=["endpoint_id"],
    )

    endpoint_list = ModelEndpointList(endpoints=[])

    # Pull items one at a time until the cursor returns None
    for item in iter(cursor.next_item, None):
        endpoint_list.endpoints.append(
            ModelEndpoints.get_endpoint(
                access_key=access_key,
                project=project,
                endpoint_id=item["endpoint_id"],
                metrics=metrics,
                start=start,
                end=end,
            )
        )

    return endpoint_list
def get_endpoint(
    access_key: str,
    project: str,
    endpoint_id: str,
    metrics: Optional[List[str]] = None,
    start: str = "now-1h",
    end: str = "now",
    feature_analysis: bool = False,
) -> ModelEndpoint:
    """
    Read a model endpoint record from KV and return it as a ModelEndpoint object,
    optionally enriched with metrics and feature related data.

    :param access_key:       V3IO access key for managing user permissions
    :param project:          The name of the project
    :param endpoint_id:      The id of the model endpoint
    :param metrics:          A list of metrics to return for each endpoint, read more
                             in 'TimeMetric'
    :param start:            The start time of the metrics
    :param end:              The end time of the metrics
    :param feature_analysis: When True, the base feature statistics and current feature
                             statistics will be added to the output of the resulting object

    :return: The ModelEndpoint built from the KV record

    :raise MLRunNotFoundError: when no record exists for the given endpoint id
    """
    logger.info(
        "Getting model endpoint record from kv",
        endpoint_id=endpoint_id,
    )

    client = get_v3io_client(endpoint=config.v3io_api)

    endpoints_prefix = config.model_endpoint_monitoring.store_prefixes.default.format(
        project=project, kind=ENDPOINTS
    )
    _, container, table_path = parse_model_endpoint_store_prefix(endpoints_prefix)

    response = client.kv.get(
        container=container,
        table_path=table_path,
        key=endpoint_id,
        access_key=access_key,
        raise_for_status=RaiseForStatus.never,
    )
    record = response.output.item

    if not record:
        raise MLRunNotFoundError(f"Endpoint {endpoint_id} not found")

    # JSON encoded attributes of the record; labels are decoded further below
    labels = record.get("labels")
    feature_names = _json_loads_if_not_none(record.get("feature_names"))
    label_names = _json_loads_if_not_none(record.get("label_names"))
    feature_stats = _json_loads_if_not_none(record.get("feature_stats"))
    current_stats = _json_loads_if_not_none(record.get("current_stats"))
    drift_measures = _json_loads_if_not_none(record.get("drift_measures"))
    monitor_configuration = _json_loads_if_not_none(
        record.get("monitor_configuration")
    )

    # Falsy attribute values (e.g. empty strings) are normalized to None
    metadata = ModelEndpointMetadata(
        project=record.get("project"),
        labels=_json_loads_if_not_none(labels),
        uid=endpoint_id,
    )
    spec = ModelEndpointSpec(
        function_uri=record.get("function_uri"),
        model=record.get("model"),
        model_class=record.get("model_class") or None,
        model_uri=record.get("model_uri") or None,
        feature_names=feature_names or None,
        label_names=label_names or None,
        stream_path=record.get("stream_path") or None,
        algorithm=record.get("algorithm") or None,
        monitor_configuration=monitor_configuration or None,
        active=record.get("active") or None,
    )
    status = ModelEndpointStatus(
        state=record.get("state") or None,
        feature_stats=feature_stats or None,
        current_stats=current_stats or None,
        first_request=record.get("first_request") or None,
        last_request=record.get("last_request") or None,
        accuracy=record.get("accuracy") or None,
        error_count=record.get("error_count") or None,
        drift_status=record.get("drift_status") or None,
    )
    endpoint = ModelEndpoint(metadata=metadata, spec=spec, status=status)

    if feature_analysis and feature_names:
        endpoint_features = get_endpoint_features(
            feature_names=feature_names,
            feature_stats=feature_stats,
            current_stats=current_stats,
        )
        if endpoint_features:
            endpoint.status.features = endpoint_features
            endpoint.status.drift_measures = drift_measures

    if metrics:
        endpoint_metrics = get_endpoint_metrics(
            access_key=access_key,
            project=project,
            endpoint_id=endpoint_id,
            start=start,
            end=end,
            metrics=metrics,
        )
        if endpoint_metrics:
            endpoint.status.metrics = endpoint_metrics

    return endpoint
def test_get_endpoint_metrics(self):
    """
    Write ten metric samples per endpoint to the events TSDB table and check that the
    run DB returns them with the endpoint: the sum of the returned values must equal
    the sum of the written ones.
    """
    access_key = self._get_auth_info().data_session
    db = mlrun.get_run_db()

    events_prefix = config.model_endpoint_monitoring.store_prefixes.default.format(
        project=self.project_name, kind=mlrun.api.crud.ModelEndpoints().EVENTS
    )
    _, container, path = parse_model_endpoint_store_prefix(events_prefix)

    frames = get_frames_client(
        token=access_key,
        container=container,
        address=config.v3io_framesd,
    )

    start = datetime.utcnow()

    for _ in range(5):
        endpoint = self._mock_random_endpoint()
        db.create_or_patch_model_endpoint(
            endpoint.metadata.project, endpoint.metadata.uid, endpoint
        )
        frames.create(backend="tsdb", table=path, rate="10/m", if_exists=1)

        # One random count per minute, for the ten minutes preceding `start`
        counts = [randint(1, 10) for _ in range(10)]
        total = sum(counts)
        dfs = [
            pd.DataFrame(
                data=[
                    {
                        "predictions_per_second_count_1s": count,
                        "endpoint_id": endpoint.metadata.uid,
                        "timestamp": start - timedelta(minutes=10 - minute),
                    }
                ]
            )
            for minute, count in enumerate(counts)
        ]

        frames.write(
            backend="tsdb",
            table=path,
            dfs=dfs,
            index_cols=["timestamp", "endpoint_id"],
        )

        fetched = db.get_model_endpoint(
            self.project_name,
            endpoint.metadata.uid,
            metrics=["predictions_per_second_count_1s"],
        )
        assert len(fetched.status.metrics) > 0

        predictions_per_second = fetched.status.metrics[
            "predictions_per_second_count_1s"
        ]
        assert predictions_per_second.name == "predictions_per_second_count_1s"

        response_total = sum(point[1] for point in predictions_per_second.values)
        assert total == response_total