Example #1
    def get_table(self, uri):
        """Get a storey Table object by uri."""
        try:
            from storey import Driver, Table, V3ioDriver
        except ImportError:
            raise ImportError(
                "storey package is not installed, use pip install storey")

        if uri in self._tables:
            return self._tables[uri]
        if uri in (".", "") or uri.startswith("$"):  # "$..." indicates an in-memory table
            self._tables[uri] = Table("", Driver())
            return self._tables[uri]

        if uri.startswith("v3io://") or uri.startswith("v3ios://"):
            # cache under the original uri so repeat lookups hit the cache
            endpoint, path = parse_v3io_path(uri)
            self._tables[uri] = Table(path, V3ioDriver(webapi=endpoint))
            return self._tables[uri]

        if is_store_uri(uri):
            resource = get_store_resource(uri)
            if resource.kind in [
                    mlrun.api.schemas.ObjectKind.feature_set.value,
                    mlrun.api.schemas.ObjectKind.feature_vector.value,
            ]:
                target = get_online_target(resource)
                if not target:
                    raise mlrun.errors.MLRunInvalidArgumentError(
                        f"resource {uri} does not have an online data source")
                self._tables[uri] = target.get_table_object()
                return self._tables[uri]

        raise mlrun.errors.MLRunInvalidArgumentError(
            f"table {uri} not found in cache")
Example #2
    def get_table_object(self):
        """Get a storey Table object backed by this target's v3io path."""
        from storey import Table, V3ioDriver

        # TODO use options/cred
        endpoint, path = parse_v3io_path(self._target_path)
        return Table(
            path,
            V3ioDriver(webapi=endpoint),
            flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
        )
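The flush_interval_secs argument controls how often storey persists the table's cached state to the driver. A minimal sketch of the same construction with hard-coded placeholders (the endpoint and path here are assumptions; in the method above they come from parse_v3io_path):

    from storey import Table, V3ioDriver

    endpoint = "https://webapi.example.com"  # assumed v3io web-API endpoint
    path = "projects/demo/model-endpoints"   # assumed in-container table path
    table = Table(path, V3ioDriver(webapi=endpoint), flush_interval_secs=300)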
Example #3
    def get_table(self, uri):
        """Get a storey Table object by uri."""
        try:
            from storey import Table, Driver, V3ioDriver
        except ImportError:
            raise ImportError(
                "storey package is not installed, use pip install storey")

        if uri in self._tables:
            return self._tables[uri]
        if uri in (".", ""):
            self._tables[uri] = Table("", Driver())
            return self._tables[uri]

        if uri.startswith("v3io://") or uri.startswith("v3ios://"):
            # cache under the original uri so repeat lookups hit the cache
            endpoint, path = parse_v3io_path(uri)
            self._tables[uri] = Table(path, V3ioDriver(webapi=endpoint))
            return self._tables[uri]

        # TODO: map store:// URIs to Table objects

        raise ValueError(f"table {uri} not found in cache")
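Both get_table variants rely on parse_v3io_path to split a v3io URL into a web-API endpoint and an in-container path. A rough, illustrative stand-in for that contract (this is not MLRun's implementation, which also handles configured defaults):

    from urllib.parse import urlparse

    def parse_v3io_path_sketch(uri):
        # "v3io://host/container/key" -> ("http://host", "container/key");
        # the v3ios:// scheme maps to https.
        parsed = urlparse(uri)
        scheme = "https" if parsed.scheme == "v3ios" else "http"
        endpoint = f"{scheme}://{parsed.netloc}" if parsed.netloc else None
        return endpoint, parsed.path.lstrip("/")

    print(parse_v3io_path_sketch("v3io://webapi.example.com/projects/x"))
    # ('http://webapi.example.com', 'projects/x')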
Example #4
    def get_table_object(self):
        """Get a storey Table object backed by this target's v3io path."""
        from storey import Table, V3ioDriver

        # TODO use options/cred
        endpoint, path = parse_v3io_path(self._target_path)
        return Table(path, V3ioDriver(webapi=endpoint))
Example #5
    def __init__(self, parquet_path_template: str):
        self.parquet_path_template = parquet_path_template

        self._kv_keys = [
            "timestamp",
            "project",
            "model",
            "function",
            "tag",
            "model_class",
            "endpoint_id",
            "labels",
            "unpacked_labels",
            "latency_avg_1s",
            "predictions_per_second_count_1s",
            "first_request",
            "last_request",
            "error_count",
        ]

        self._events_tsdb_keys = [
            "timestamp",
            "project",
            "model",
            "function",
            "tag",
            "model_class",
            "endpoint_id",
            "predictions_per_second_count_1s",
            "latency_avg_1s",
        ]

        self._features_tsdb_keys = [
            "timestamp",
            "endpoint_id",
            "project",
            "named_features",
            "prediction",
        ]

        self._flow = build_flow([
            Source(),
            ProcessEndpointEvent(),
            FlatMap(self.unpack_predictions),
            # Branch 1: Aggregate events, count averages and update TSDB and KV
            [
                AggregateByKey(
                    aggregates=[
                        FieldAggregator(
                            "predictions_per_second",
                            "endpoint_id",
                            ["count"],
                            SlidingWindows(["1s"], "1s"),
                        ),
                        FieldAggregator(
                            "latency",
                            "latency",
                            ["avg"],
                            SlidingWindows(["1s"], "1s"),
                        ),
                    ],
                    table=Table("notable", NoopDriver()),
                ),
                SampleWindow(10),  # add a required gap between events to apply sampling
                # Branch 1.1: Update KV
                [
                    Map(self.process_before_kv),
                    UpdateKV("{project}/model-endpoints"),
                    InferSchema("{project}/model-endpoints"),
                ],
                # Branch 1.2: Update TSDB
                [
                    Map(self.process_before_events_tsdb),
                    Batch(max_events=10, timeout_secs=60 * 5),
                    UpdateTSDB(
                        path_builder=lambda e: f"{e[-1]['project']}/endpoint-events",
                        tsdb_columns=self._events_tsdb_keys,
                        rate="10/m",
                    ),
                ],
                # Branch 1.3: Update features TSDB
                [
                    Map(self.process_before_features_tsdb),
                    Batch(
                        max_events=10,
                        timeout_secs=60 * 5,
                        key=lambda e: e.body["endpoint_id"],
                    ),
                    UpdateTSDB(
                        path_builder=lambda e: f"{e[-1]['project']}/endpoint-features",
                        rate="10/m",
                        infer_columns=True,
                        exclude_columns={"project"},
                    ),
                ],
            ],
            # Branch 2: Batch events, write to parquet
            [
                Batch(
                    max_events=10_000,  # flush every 10,000 events or
                    timeout_secs=60 * 60,  # every 1 hour
                    key="endpoint_id",
                ),
                FlatMap(lambda batch: _process_before_parquet(batch)),
                UpdateParquet(
                    path_template=self.parquet_path_template,
                    partition_cols=["endpoint_id", "batch_timestamp"],
                    infer_columns_from_data=True,
                    # Settings for _Batching
                    max_events=10_000,  # flush every 10,000 events or
                    timeout_secs=60 * 60,  # every 1 hour
                    key="endpoint_id",
                ),
            ],
        ]).run()
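In build_flow, a nested list opens a branch: every event leaving the parent step is pushed into each sibling branch, which is how the flow above fans one stream out to the KV, TSDB, and parquet writers. A self-contained sketch of that branching semantics, assuming the same storey version as the snippet above (where the synchronous source is exported as Source):

    from storey import build_flow, Source, Map, Reduce

    controller = build_flow([
        Source(),
        Map(lambda x: x + 1),
        # Branch A: scale, then collect
        [Map(lambda x: x * 10), Reduce([], lambda acc, x: acc + [x])],
        # Branch B: collect as-is
        [Reduce([], lambda acc, x: acc + [x])],
    ]).run()

    for i in range(3):
        controller.emit(i)
    controller.terminate()
    results = controller.await_termination()  # collected results from both branches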