def get_table(self, uri):
    """get storey Table object by uri"""
    try:
        from storey import Driver, Table, V3ioDriver
    except ImportError:
        raise ImportError(
            "storey package is not installed, use pip install storey"
        )
    if uri in self._tabels:
        return self._tabels[uri]
    if uri in [".", ""] or uri.startswith("$"):  # $.. indicates in-mem table
        self._tabels[uri] = Table("", Driver())
        return self._tabels[uri]
    if uri.startswith("v3io://") or uri.startswith("v3ios://"):
        endpoint, uri = parse_v3io_path(uri)
        self._tabels[uri] = Table(uri, V3ioDriver(webapi=endpoint))
        return self._tabels[uri]
    if is_store_uri(uri):
        resource = get_store_resource(uri)
        if resource.kind in [
            mlrun.api.schemas.ObjectKind.feature_set.value,
            mlrun.api.schemas.ObjectKind.feature_vector.value,
        ]:
            target = get_online_target(resource)
            if not target:
                raise mlrun.errors.MLRunInvalidArgumentError(
                    f"resource {uri} does not have an online data source"
                )
            self._tabels[uri] = target.get_table_object()
            return self._tabels[uri]
    raise mlrun.errors.MLRunInvalidArgumentError(f"table {uri} not found in cache")
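
# Usage sketch (illustrative, not from the source): `cache` is any instance of
# the class that defines get_table() above and initializes self._tabels = {};
# the helper name and the example URIs below are assumptions.
def _get_table_usage_example(cache):
    # "." / "" / "$..." URIs resolve to a fresh in-memory storey Table
    mem_table = cache.get_table("$local-state")

    # v3io:// URIs are split into a webapi endpoint and a container path
    kv_table = cache.get_table("v3io:///projects/demo/model-state")

    # store:// URIs must resolve to a feature set / feature vector that has an
    # online target, otherwise MLRunInvalidArgumentError is raised
    fs_table = cache.get_table("store://feature-sets/demo/my-features")

    # repeated lookups for the same URI return the same cached Table object
    assert cache.get_table("$local-state") is mem_table
    return mem_table, kv_table, fs_table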
def get_table_object(self):
    from storey import Table, V3ioDriver  # TODO use options/cred

    endpoint, uri = parse_v3io_path(self._target_path)
    return Table(
        uri,
        V3ioDriver(webapi=endpoint),
        flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
    )
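
# Sketch (assumption, not from the source): flush_interval_secs above is read
# from MLRun's feature-store config, so the V3IO flush cadence can be tuned
# per deployment before targets are created; 120 is an arbitrary example value
# and `target` is assumed to be an instance of the class above.
def _tune_flush_interval_example(target):
    import mlrun

    mlrun.mlconf.feature_store.flush_interval = 120  # seconds between flushes
    return target.get_table_object()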
def get_table(self, uri):
    """get storey Table object by uri"""
    try:
        from storey import Table, Driver, V3ioDriver
    except ImportError:
        raise ImportError(
            "storey package is not installed, use pip install storey"
        )
    if uri in self._tabels:
        return self._tabels[uri]
    if uri in [".", ""]:
        self._tabels[uri] = Table("", Driver())
        return self._tabels[uri]
    if uri.startswith("v3io://") or uri.startswith("v3ios://"):
        endpoint, uri = parse_v3io_path(uri)
        self._tabels[uri] = Table(uri, V3ioDriver(webapi=endpoint))
        return self._tabels[uri]
    # TODO: map store:// URIs to Table objects
    raise ValueError(f"table {uri} not found in cache")
def get_table_object(self):
    from storey import Table, V3ioDriver  # TODO use options/cred

    endpoint, uri = parse_v3io_path(self._target_path)
    return Table(uri, V3ioDriver(webapi=endpoint))
                        SlidingWindows(
                            self.aggregate_count_windows,
                            self.aggregate_count_period,
                        ),
                    ),
                    FieldAggregator(
                        LATENCY,
                        LATENCY,
                        ["avg"],
                        SlidingWindows(
                            self.aggregate_avg_windows,
                            self.aggregate_avg_period,
                        ),
                    ),
                ],
                table=Table("notable", NoopDriver()),
            ),
            SampleWindow(
                self.sample_window
            ),  # Add required gap between events to apply sampling
            Map(self.compute_predictions_per_second),
            # Branch 1.1: Update KV
            [
                Map(self.process_before_kv),
                WriteToKV(container=self.kv_container, table=self.kv_path),
                InferSchema(
                    v3io_access_key=self.v3io_access_key,
                    v3io_framesd=self.v3io_framesd,
                    container=self.kv_container,
                    table=self.kv_path,
                ),
def __init__(self, parquet_path_template: str):
    self.parquet_path_template = parquet_path_template

    self._kv_keys = [
        "timestamp",
        "project",
        "model",
        "function",
        "tag",
        "model_class",
        "endpoint_id",
        "labels",
        "unpacked_labels",
        "latency_avg_1s",
        "predictions_per_second_count_1s",
        "first_request",
        "last_request",
        "error_count",
    ]

    self._events_tsdb_keys = [
        "timestamp",
        "project",
        "model",
        "function",
        "tag",
        "model_class",
        "endpoint_id",
        "predictions_per_second_count_1s",
        "latency_avg_1s",
    ]

    self._features_tsdb_keys = [
        "timestamp",
        "endpoint_id",
        "project",
        "named_features",
        "prediction",
    ]

    self._flow = build_flow([
        Source(),
        ProcessEndpointEvent(),
        FlatMap(self.unpack_predictions),
        # Branch 1: Aggregate events, count averages and update TSDB and KV
        [
            AggregateByKey(
                aggregates=[
                    FieldAggregator(
                        "predictions_per_second",
                        "endpoint_id",
                        ["count"],
                        SlidingWindows(["1s"], "1s"),
                    ),
                    FieldAggregator(
                        "latency",
                        "latency",
                        ["avg"],
                        SlidingWindows(["1s"], "1s"),
                    ),
                ],
                table=Table("notable", NoopDriver()),
            ),
            SampleWindow(10),  # Add required gap between events to apply sampling
            # Branch 1.1: Update KV
            [
                Map(self.process_before_kv),
                UpdateKV("{project}/model-endpoints"),
                InferSchema("{project}/model-endpoints"),
            ],
            # Branch 1.2: Update events TSDB
            [
                Map(self.process_before_events_tsdb),
                Batch(max_events=10, timeout_secs=60 * 5),
                UpdateTSDB(
                    path_builder=lambda e: f"{e[-1]['project']}/endpoint-events",
                    tsdb_columns=self._events_tsdb_keys,
                    rate="10/m",
                ),
            ],
            # Branch 1.3: Update features TSDB
            [
                Map(self.process_before_features_tsdb),
                Batch(
                    max_events=10,
                    timeout_secs=60 * 5,
                    key=lambda e: e.body["endpoint_id"],
                ),
                UpdateTSDB(
                    path_builder=lambda e: f"{e[-1]['project']}/endpoint-features",
                    rate="10/m",
                    infer_columns=True,
                    exclude_columns={"project"},
                ),
            ],
        ],
        # Branch 2: Batch events, write to parquet
        [
            Batch(
                max_events=10_000,  # Every 10,000 events or
                timeout_secs=60 * 60,  # every 1 hour
                key="endpoint_id",
            ),
            FlatMap(lambda batch: _process_before_parquet(batch)),
            UpdateParquet(
                path_template=self.parquet_path_template,
                partition_cols=["endpoint_id", "batch_timestamp"],
                infer_columns_from_data=True,
                # Settings for _Batching
                max_events=10_000,  # Every 10,000 events or
                timeout_secs=60 * 60,  # every 1 hour
                key="endpoint_id",
            ),
        ],
    ]).run()
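
# Driver sketch (illustrative; the class name `EventStreamProcessor`, the
# helper name, and the event payload are assumptions, not from the source).
# build_flow([...]).run() returns a storey controller, stored above in
# self._flow, so events are pushed via emit() and the flow is shut down with
# terminate() / await_termination().
def _event_stream_example():
    processor = EventStreamProcessor(
        parquet_path_template="projects/{project}/model-endpoints/parquet"
    )
    processor._flow.emit(
        {
            "endpoint_id": "demo.serving-v1",
            "project": "demo",
            "latency": 12,
            "timestamp": "2021-01-01T00:00:00",
        }
    )
    # flush all branches (KV, TSDB, parquet) and wait for completion
    processor._flow.terminate()
    processor._flow.await_termination()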